2 * @file codepage_detect.cpp
4 * @brief Deducing codepage from file contents, when we can
7 // ID line follows -- this is updated by SVN
8 // $Id: codepage_detect.cpp 7172 2010-05-19 12:57:18Z jtuc $
10 #include "codepage_detect.h"
15 #include <boost/scoped_array.hpp>
17 #include "ExConverter.h"
20 #include "FileTextEncoding.h"
// Map the strcasecmp()/strncasecmp() names used in this file onto stricmp()/strnicmp().
// NOTE(review): presumably wrapped in a compiler/platform conditional that is not
// visible here -- confirm before relying on these definitions unconditionally.
26 # define strcasecmp(a, b) stricmp((a), (b))
27 # define strncasecmp(a, b, n) strnicmp((a), (b), (n))
30 /** @brief Buffer size used in this file. */
// GuessCodepageEncoding() passes this value to CMarkdown::FileImage and special-cases
// an image whose size is exactly this many bytes.
31 static const int BufSize = 65536;
34 * @brief Prefixes to handle when searching for codepage names
35 * NB: prefixes ending in '-' must go first!
// Indexed by the prefix loop in FindEncodingIdFromNameOrAlias(). Each dashed form
// ("CP-") is listed before its dash-less form ("CP").
37 static const char *f_wincp_prefixes[] =
39 "WINDOWS-", "WINDOWS", "CP-", "CP", "MSDOS-", "MSDOS"
43 * @brief Remove prefix from the text.
44 * @param [in] text Text to process.
45 * @param [in] prefix Prefix to remove.
46 * @return Text without the prefix.
// Case-insensitive prefix test: compares the first strlen(prefix) characters of
// text against prefix. The caller treats a null result as "prefix not present"
// and a non-null result as the text that follows the prefix.
48 static const char *EatPrefix(const char *text, const char *prefix)
// Number of leading characters of text that have to match.
50 size_t len = strlen(prefix);
// strncasecmp() yields 0 when the first len characters are equal ignoring case.
52 if (strncasecmp(text, prefix, len) == 0)
58 * @brief Try to match codepage name from codepages module, & watch for f_wincp_prefixes aliases
// Resolve an encoding name (C string) to an encoding id.
// Steps visible here: look the name up directly, then try to read it as a decimal
// codepage number, then retry after stripping one of the f_wincp_prefixes.
// NOTE(review): presumably the numeric/prefix fallback only runs when the direct
// lookup fails -- the guarding condition is not visible; confirm.
61 FindEncodingIdFromNameOrAlias(const char *encodingName)
// Direct lookup by name.
64 unsigned encodingId = GetEncodingIdFromName(encodingName);
67 // Handle purely numeric values (codepages)
// strtol() stores in 'ahead' the address of the first character it did not
// consume, so *ahead == '\0' means the whole name was a number.
69 unsigned codepage = strtol(encodingName, &ahead, 10);
// Keep going only while unparsed text remains and there are prefixes left to try.
71 while (*ahead != '\0' && i < sizeof(f_wincp_prefixes)/sizeof(f_wincp_prefixes[0]))
// A non-null remainder means the name starts with this prefix.
73 if (const char *remainder = EatPrefix(encodingName, f_wincp_prefixes[i]))
// Parse what follows the prefix; 'ahead' is refreshed, which feeds the loop test.
75 codepage = strtol(remainder, &ahead, 10);
// Translate the codepage number into an encoding id.
81 encodingId = GetEncodingIdFromCodePage(codepage);
88 * @brief Parser for HTML files to find encoding information
// Look for <meta http-equiv="content-type" content="...charset=NAME..."> in the
// buffer [src, src + len) and return the codepage that NAME maps to.
// NOTE(review): presumably defcodepage is returned when no usable charset is
// found -- the final return is not visible; confirm.
90 static unsigned demoGuessEncoding_html(const char *src, size_t len, int defcodepage)
// Parse the buffer in CMarkdown's Html mode.
92 CMarkdown markdown(src, src + len, CMarkdown::Html);
93 //As <html> and <head> are optional, there is nothing to pull...
94 //markdown.Move("html").Pop().Move("head").Pop();
// Visit each "meta" element in turn.
95 while (markdown.Move("meta"))
97 std::string http_equiv(markdown.GetAttribute("http-equiv"));
// Only a meta element whose http-equiv is "content-type" (any case) is of interest.
98 if (!http_equiv.empty() && strcasecmp(http_equiv.c_str(), "content-type") == 0)
100 std::string content(markdown.GetAttribute("content"));
// Mutable cursor into 'content'; the value is NUL-terminated in place further down.
101 char *pchKey = &content[0];
102 if (!content.empty())
// Skip separators ("; \t\r\n"), then measure the key up to the next ';' or '='.
// A key length of 0 ends the loop.
104 while (size_t cchKey = strcspn(pchKey += strspn(pchKey, "; \t\r\n"), ";="))
// The value starts right after the key...
106 char *pchValue = pchKey + cchKey;
// ...once '=' and whitespace are skipped, and runs to the next ';' or whitespace.
107 size_t cchValue = strcspn(pchValue += strspn(pchValue, "= \t\r\n"), "; \t\r\n");
// Key must be "charset" (any case), either exactly or followed by whitespace.
108 if (cchKey >= 7 && strncasecmp(pchKey, "charset", 7) == 0 && (cchKey == 7 || strchr(" \t\r\n", pchKey[7])))
// Cut the value out of 'content' so it can be passed as a C string.
110 pchValue[cchValue] = '\0';
111 // Is it an encoding name known to charsets module ?
112 unsigned encodingId = FindEncodingIdFromNameOrAlias(pchValue);
// Convert the encoding id to a codepage number and return it.
115 return GetEncodingCodePageFromId(encodingId);
// Resume scanning after this value.
119 pchKey = pchValue + cchValue;
128 * @brief Parser for XML files to find encoding information
// Read the "encoding" attribute of the <?xml ...?> declaration in [src, src + len)
// and return the codepage it maps to.
// NOTE(review): presumably defcodepage is returned when no encoding is found --
// the final return is not visible; confirm.
130 static unsigned demoGuessEncoding_xml(const char *src, size_t len, int defcodepage)
// Buffer actually handed to the parser; starts out as the caller's buffer.
132 const char *psrc = src;
// Owns the temporary copy allocated below, if any.
133 boost::scoped_array<char> buf;
// A zero byte among the first two bytes triggers the copy path below.
// NOTE(review): looks like a check for 16-bit text without a BOM -- confirm.
134 if (len >= 2 && (src[0] == 0 || src[1] == 0))
136 buf.reset(new char[len]);
// NOTE(review): loop body not visible; presumably copies the non-zero bytes into
// buf and repoints psrc/len at the narrowed copy -- confirm.
138 for (i = 0, j = 0; i < (int)len; i++)
// Parse whichever buffer psrc now refers to.
146 CMarkdown xml(psrc, psrc + len);
// Position on the "?xml" declaration, if there is one.
147 if (xml.Move("?xml"))
149 std::string encoding(xml.GetAttribute("encoding"));
150 if (!encoding.empty())
152 // Is it an encoding name we can find in charsets module ?
153 unsigned encodingId = FindEncodingIdFromNameOrAlias(encoding.c_str());
// Convert the encoding id to a codepage number and return it.
156 return GetEncodingCodePageFromId(encodingId);
164 * @brief Parser for rc files to find encoding information
165 * @note sscanf() requires first argument to be zero-terminated so we must
166 * copy lines to temporary buffer.
// Scan the buffer line by line for "#pragma code_page(N)" and store N in cp,
// which starts out as defcodepage.
// NOTE(review): presumably cp is the return value -- the return is not visible; confirm.
168 static unsigned demoGuessEncoding_rc(const char *src, size_t len, int defcodepage)
170 // NB: Diffutils may replace line endings by '\0'
171 unsigned cp = defcodepage;
// Skip line terminators; '\0' counts as one (see the NB above).
175 while (len && (*src == '\r' || *src == '\n' || *src == '\0'))
// Remember where the current line starts.
180 const char *base = src;
// Advance to the end of the current line.
181 while (len && *src != '\r' && *src != '\n' && *src != '\0')
// Bound the copy by the remaining length and by the room in 'line', minus one
// for the terminator.
// NOTE(review): n is derived from 'len', not from the length of the line just
// scanned (src - base). If 'len' is the count of bytes left after the line,
// a pragma on the last line could be truncated or missed -- confirm intent.
186 size_t n = len < sizeof line - 1 ? len : sizeof line - 1;
187 memcpy(line, base, n);
// Repeat until input is exhausted or sscanf() has stored one value into cp.
189 } while (len && sscanf(line, "#pragma code_page(%u)", &cp) != 1);
194 * @brief Try to deduce encoding for this file.
195 * @param [in] ext File extension.
196 * @param [in] src File contents (as a string).
197 * @param [in] len Size of the file contents string.
198 * @return Codepage number.
// Pick a codepage for the buffer [src, src + len) of a file with extension ext.
// guessEncodingType is used as a bit field here: bit value 2 enables the
// Exconverter auto-detection, bit value 1 enables the extension-specific parsers,
// and the bits above the low 16 are passed on as the auto-detection type.
200 static unsigned GuessEncoding_from_bytes(const String& ext, const char *src, size_t len, int guessEncodingType)
// Start from the default codepage.
202 unsigned cp = ucr::getDefaultCodepage();
203 if (guessEncodingType & 2)
205 IExconverter *pexconv = Exconverter::getInstance();
// Auto-detection needs both a converter instance and a non-null buffer.
206 if (pexconv && src != NULL)
// Cast to unsigned before shifting to extract the upper bits.
208 int autodetectType = (unsigned)guessEncodingType >> 16;
209 cp = pexconv->detectInputCodepage(autodetectType, cp, src, len);
// NOTE(review): the statement controlled by this test is not visible; presumably
// it selects UTF-8 when no invalid UTF-8 sequence is found -- confirm.
214 if (!ucr::CheckForInvalidUtf8(src, len))
217 if (guessEncodingType & 1)
// Lower-case the extension so the comparisons below ignore case.
221 String lower_ext = string_makelower(ext);
222 if (lower_ext == _T(".rc"))
224 cp = demoGuessEncoding_rc(src, len, cp);
226 else if (lower_ext == _T(".htm") || lower_ext == _T(".html"))
228 cp = demoGuessEncoding_html(src, len, cp);
230 else if (lower_ext == _T(".xml") || lower_ext == _T(".xsl"))
232 cp = demoGuessEncoding_xml(src, len, cp);
239 * @brief Try to deduce encoding for this file.
240 * @param [in] filepath Full path to the file.
241 * @param [in] guessEncodingType Non-zero to try to guess the codepage (not just unicode encoding).
242 * @return Structure holding the encoding info.
// Build a FileTextEncoding for the file at filepath. The byte order reported by
// CMarkdown::FileImage is applied first; when guessEncodingType is non-zero the
// content is also passed to GuessEncoding_from_bytes().
244 FileTextEncoding GuessCodepageEncoding(const String& filepath, int guessEncodingType)
246 FileTextEncoding encoding;
// NOTE(review): presumably the maximum number of bytes FileImage loads -- confirm.
247 const int mapmaxlen = BufSize;
248 CMarkdown::FileImage fi(filepath.c_str(), mapmaxlen);
// Defaults, used unless one of the branches below overrides them.
249 encoding.SetCodepage(ucr::getDefaultCodepage());
250 encoding.m_bom = false;
// Case labels are not visible here; each of the first three groups sets the
// unicoding, the codepage and m_bom = true.
251 switch (fi.nByteOrder)
254 encoding.SetUnicoding(ucr::UCS2LE);
255 encoding.SetCodepage(CP_UCS2LE);
256 encoding.m_bom = true;
259 encoding.SetUnicoding(ucr::UCS2BE);
260 encoding.SetCodepage(CP_UCS2BE);
261 encoding.m_bom = true;
264 encoding.SetUnicoding(ucr::UTF8);
265 encoding.SetCodepage(CP_UTF8);
266 encoding.m_bom = true;
// Remaining branch: no BOM.
269 encoding.m_bom = false;
// NOTE(review): the meaning of nByteOrder values below 4 is not visible here;
// presumably they are the cases with no Unicode signature -- confirm.
272 if (fi.nByteOrder < 4 && guessEncodingType != 0)
274 String ext = paths_FindExtension(filepath);
275 const char *src = (char *)fi.pImage;
276 size_t len = fi.cbImage;
// The image is exactly as large as the limit passed to FileImage.
// NOTE(review): presumably this means the file was cut off at the limit -- confirm.
277 if (len == mapmaxlen)
// Walk backwards from the end of the buffer.
279 for (size_t i = len; i--; )
// Cast to unsigned char so isspace() never receives a negative value.
281 if (isspace((unsigned char)src[i]))
283 // make len an even number for ucs-2 detection
// A zero result from GuessEncoding_from_bytes() fails this test.
292 if (unsigned cp = GuessEncoding_from_bytes(ext, src, len, guessEncodingType))
293 encoding.SetCodepage(cp);
// NOTE(review): presumably the else branch of the test above -- confirm.
295 encoding.SetCodepage(ucr::getDefaultCodepage());