OSDN Git Service

autoit.cpp - Macros >> User 1 ..... Variable >> User 2 (#749) (2)
[winmerge-jp/winmerge-jp.git] / Src / codepage_detect.cpp
1 /** 
2  * @file  codepage_detect.cpp
3  *
4  * @brief Deducing codepage from file contents, when we can
5  *
6  */
7
8 #include "pch.h"
9 #include "codepage_detect.h"
10 #include <cstdio>
11 #include <cstring>
12 #include <algorithm>
13 #include <memory>
14 #include "unicoder.h"
15 #include "ExConverter.h"
16 #include "charsets.h"
17 #include "FileTextEncoding.h"
18 #include "paths.h"
19 #include "markdown.h"
20
/**
 * @brief Codepage-name prefixes that may precede a numeric codepage.
 *
 * Order matters: each hyphenated form is listed before its bare form, so
 * that e.g. "CP-1252" is reduced to "1252" rather than to "-1252".
 */
static const char *f_wincp_prefixes[] =
{
        "WINDOWS-",
        "WINDOWS",
        "CP-",
        "CP",
        "MSDOS-",
        "MSDOS"
};
29
/**
 * @brief Skip a case-insensitive prefix at the start of a string.
 * @param [in] text Text to examine.
 * @param [in] prefix Prefix to look for.
 * @return Pointer to the first character of @p text after the prefix, or a
 *  null pointer when @p prefix is empty or @p text does not start with it.
 */
static const char *EatPrefix(const char *text, const char *prefix)
{
        const size_t prefixLen = strlen(prefix);
        const bool startsWithPrefix =
                prefixLen != 0 && _strnicmp(text, prefix, prefixLen) == 0;
        return startsWithPrefix ? text + prefixLen : nullptr;
}
44
45 /**
46  * @brief Try to to match codepage name from codepages module, & watch for f_wincp_prefixes aliases
47  */
48 static int
49 FindEncodingIdFromNameOrAlias(const char *encodingName)
50 {
51         // Try name as given
52         unsigned encodingId = GetEncodingIdFromName(encodingName);
53         if (encodingId == 0)
54         {
55                 // Handle purely numeric values (codepages)
56                 char *ahead = nullptr;
57                 unsigned codepage = strtol(encodingName, &ahead, 10);
58                 int i = 0;
59                 while (*ahead != '\0' && i < sizeof(f_wincp_prefixes)/sizeof(f_wincp_prefixes[0]))
60                 {
61                         if (const char *remainder = EatPrefix(encodingName, f_wincp_prefixes[i]))
62                         {
63                                 codepage = strtol(remainder, &ahead, 10);
64                         }
65                         ++i;
66                 }
67                 if (*ahead == '\0')
68                 {
69                         encodingId = GetEncodingIdFromCodePage(codepage);
70                 }
71         }
72         return encodingId;
73 }
74
75 /**
76  * @brief Parser for HTML files to find encoding information
77  */
78 static unsigned demoGuessEncoding_html(const char *src, size_t len, int defcodepage)
79 {
80         CMarkdown markdown(src, src + len, CMarkdown::Html);
81         //As <html> and <head> are optional, there is nothing to pull...
82         //markdown.Move("html").Pop().Move("head").Pop();
83         while (markdown.Move("meta"))
84         {
85                 std::string charset(markdown.GetAttribute("charset"));
86                 if (charset.empty())
87                 {
88                         std::string http_equiv(markdown.GetAttribute("http-equiv"));
89                         if (!http_equiv.empty() && _stricmp(http_equiv.c_str(), "content-type") == 0)
90                         {
91                                 std::string content(markdown.GetAttribute("content"));
92                                 if (!content.empty())
93                                 {
94                                         char *pchKey = &content[0];
95                                         while (size_t cchKey = strcspn(pchKey += strspn(pchKey, "; \t\r\n"), ";="))
96                                         {
97                                                 char *pchValue = pchKey + cchKey;
98                                                 size_t cchValue = strcspn(pchValue += strspn(pchValue, "= \t\r\n"), "; \t\r\n");
99                                                 if (cchKey >= 7 && _strnicmp(pchKey, "charset", 7) == 0 && (cchKey == 7 || strchr(" \t\r\n", pchKey[7])))
100                                                 {
101                                                         pchValue[cchValue] = '\0';
102                                                         charset = pchValue;
103                                                         break;
104                                                 }
105                                                 pchKey = pchValue + cchValue;
106                                         }
107                                 }
108                         }
109                 }
110                 if (!charset.empty())
111                 {
112                         // Is it an encoding name known to charsets module ?
113                         int encodingId = FindEncodingIdFromNameOrAlias(charset.c_str());
114                         if (encodingId)
115                                 return GetEncodingCodePageFromId(encodingId);
116                         return defcodepage;
117                 }
118         }
119         return defcodepage;
120 }
121
122 /**
123  * @brief Parser for XML files to find encoding information
124  */
125 static unsigned demoGuessEncoding_xml(const char *src, size_t len, int defcodepage)
126 {
127         const char *psrc = src;
128         std::unique_ptr<char[]> buf;
129         if (len >= 2 && (src[0] == 0 || src[1] == 0))
130         {
131                 buf.reset(new char[len]);
132                 int i, j;
133                 for (i = 0, j = 0; i < (int)len; i++)
134                 {
135                         if (src[i])
136                                 buf[j++] = src[i];
137                 }
138                 len = j;
139                 psrc = buf.get();
140         }
141         CMarkdown xml(psrc, psrc + len);
142         if (xml.Move("?xml"))
143         {
144                 std::string encoding(xml.GetAttribute("encoding"));
145                 if (!encoding.empty())
146                 {
147                         // Is it an encoding name we can find in charsets module ?
148                         unsigned encodingId = FindEncodingIdFromNameOrAlias(encoding.c_str());
149                         if (encodingId)
150                         {
151                                 return GetEncodingCodePageFromId(encodingId);
152                         }
153                 }
154         }
155         return defcodepage;
156 }
157
/**
 * @brief Parser for rc files to find encoding information
 * @param [in] src File contents.
 * @param [in] len Number of bytes in @p src.
 * @param [in] defcodepage Codepage to return when no pragma is found.
 * @return Codepage given by the first line that starts with
 *  "#pragma code_page(N)", or @p defcodepage if there is no such line.
 * @note sscanf() requires first argument to be zero-terminated so we must
 * copy lines to temporary buffer.
 */
static unsigned demoGuessEncoding_rc(const char *src, size_t len, int defcodepage)
{
        // NB: Diffutils may replace line endings by '\0'
        const auto isLineEnd = [](char ch) { return ch == '\r' || ch == '\n' || ch == '\0'; };
        const char *const end = src + len;
        char line[80];
        while (src != end)
        {
                // Skip the line terminator(s) left over from the previous line
                while (src != end && isLineEnd(*src))
                        ++src;
                const char *const base = src;
                while (src != end && !isLineEnd(*src))
                        ++src;
                // Copy this line only (truncated to the buffer size); the pragma
                // is short, so truncating long lines cannot hide it.
                const size_t lineLen = static_cast<size_t>(src - base);
                const size_t n = lineLen < sizeof line - 1 ? lineLen : sizeof line - 1;
                memcpy(line, base, n);
                line[n] = '\0';
                // Parse every line, including a last line with no terminator
                unsigned cp = 0;
                if (sscanf_s(line, "#pragma code_page(%5u)", &cp) == 1)
                        return cp;
        }
        return defcodepage;
}
187
188 /**
189  * @brief Try to deduce encoding for this file.
190  * @param [in] ext File extension.
191  * @param [in] src File contents (as a string).
192  * @param [in] len Size of the file contents string.
193  * @return Codepage number.
194  */
195 static unsigned GuessEncoding_from_bytes(const String& ext, const char *src, size_t len, int guessEncodingType)
196 {
197         unsigned cp = ucr::getDefaultCodepage();
198         if (!ucr::CheckForInvalidUtf8(src, len))
199                 cp = ucr::CP_UTF_8;
200         else if (guessEncodingType & 2)
201         {
202                 IExconverter *pexconv = Exconverter::getInstance();
203                 if (pexconv != nullptr && src != nullptr)
204                 {
205                         int autodetectType = (unsigned)guessEncodingType >> 16;
206                         cp = pexconv->detectInputCodepage(autodetectType, cp, src, len);
207                 }
208         }
209         if (guessEncodingType & 1)
210         {
211                 String lower_ext = strutils::makelower(ext);
212                 if (lower_ext == _T(".rc"))
213                 {
214                         cp = demoGuessEncoding_rc(src, len, cp);
215                 }
216                 else if (lower_ext == _T(".htm") || lower_ext == _T(".html"))
217                 {
218                         cp = demoGuessEncoding_html(src, len, cp);
219                 }
220                 else if (lower_ext == _T(".xml") || lower_ext == _T(".xsl"))
221                 {
222                         cp = demoGuessEncoding_xml(src, len, cp);
223                 }
224         }
225         return cp;
226 }
227
228 /**
229  * @brief Try to deduce encoding for this file.
230  * @param [in] filepath Full path to the file.
231  * @param [in] bGuessEncoding Try to guess codepage (not just unicode encoding).
232  * @return Structure getting the encoding info.
233  */
234 FileTextEncoding GuessCodepageEncoding(const String& filepath, int guessEncodingType, ptrdiff_t mapmaxlen)
235 {
236         FileTextEncoding encoding;
237         CMarkdown::FileImage fi(filepath != _T("NUL") ? filepath.c_str() : nullptr, mapmaxlen);
238         encoding.SetCodepage(ucr::getDefaultCodepage());
239         encoding.m_bom = false;
240         switch (fi.nByteOrder)
241         {
242         case 8 + 2 + 0:
243                 encoding.SetUnicoding(ucr::UCS2LE);
244                 encoding.SetCodepage(ucr::CP_UCS2LE);
245                 encoding.m_bom = true;
246                 break;
247         case 8 + 2 + 1:
248                 encoding.SetUnicoding(ucr::UCS2BE);
249                 encoding.SetCodepage(ucr::CP_UCS2BE);
250                 encoding.m_bom = true;
251                 break;
252         case 8 + 1:
253                 encoding.SetUnicoding(ucr::UTF8);
254                 encoding.SetCodepage(ucr::CP_UTF_8);
255                 encoding.m_bom = true;
256                 break;
257         default:
258                 encoding.m_bom = false;
259                 break;
260         }
261         if (fi.nByteOrder < 4 && guessEncodingType != 0)
262         {
263                 String ext = paths::FindExtension(filepath);
264                 const char *src = (char *)fi.pImage;
265                 size_t len = fi.cbImage;
266                 if (len == static_cast<size_t>(mapmaxlen))
267                 {
268                         for (size_t i = len; i--; )
269                         {
270                                 if (isspace((unsigned char)src[i]))
271                                 {
272                                         // make len an even number for ucs-2 detection
273                                         if ((i % 2) == 0)
274                                                 len = i;
275                                         else
276                                                 len = i + 1;
277                                         break;
278                                 }
279                         }
280                 }
281                 if (unsigned cp = GuessEncoding_from_bytes(ext, src, len, guessEncodingType))
282                         encoding.SetCodepage(cp);
283                 else
284                         encoding.SetCodepage(ucr::getDefaultCodepage());
285         }
286         return encoding;
287 }