/**
 * @file  codepage_detect.cpp
 *
 * @brief Deducing codepage from file contents, when we can
 */
9 #include "codepage_detect.h"
15 #include "ExConverter.h"
17 #include "FileTextEncoding.h"
/**
 * @brief Prefixes to handle when searching for codepage names
 * NB: prefixes ending in '-' must go first!
 * Otherwise the bare prefix would match and leave the '-' in front of the
 * number, which would then be parsed as a negative codepage.
 */
static const char *f_wincp_prefixes[] =
{
	"WINDOWS-", "WINDOWS", "CP-", "CP", "MSDOS-", "MSDOS"
};
/**
 * @brief Remove prefix from the text.
 * @param [in] text Text to process.
 * @param [in] prefix Prefix to remove (compared case-insensitively).
 * @return Pointer to the text just past the prefix, or nullptr when the
 *  text does not start with the prefix (or the prefix is empty).
 */
static const char *EatPrefix(const char *text, const char *prefix)
{
	const size_t len = strlen(prefix);
	// An empty prefix is treated as "no match" rather than matching everything
	if (len != 0 && _strnicmp(text, prefix, len) == 0)
		return text + len;
	return nullptr;
}
46 * @brief Try to to match codepage name from codepages module, & watch for f_wincp_prefixes aliases
49 FindEncodingIdFromNameOrAlias(const char *encodingName)
52 unsigned encodingId = GetEncodingIdFromName(encodingName);
55 // Handle purely numeric values (codepages)
56 char *ahead = nullptr;
57 unsigned codepage = strtol(encodingName, &ahead, 10);
59 while (*ahead != '\0' && i < sizeof(f_wincp_prefixes)/sizeof(f_wincp_prefixes[0]))
61 if (const char *remainder = EatPrefix(encodingName, f_wincp_prefixes[i]))
63 codepage = strtol(remainder, &ahead, 10);
69 encodingId = GetEncodingIdFromCodePage(codepage);
76 * @brief Parser for HTML files to find encoding information
78 static unsigned demoGuessEncoding_html(const char *src, size_t len, int defcodepage)
80 CMarkdown markdown(src, src + len, CMarkdown::Html);
81 //As <html> and <head> are optional, there is nothing to pull...
82 //markdown.Move("html").Pop().Move("head").Pop();
83 while (markdown.Move("meta"))
85 std::string charset(markdown.GetAttribute("charset"));
88 std::string http_equiv(markdown.GetAttribute("http-equiv"));
89 if (!http_equiv.empty() && _stricmp(http_equiv.c_str(), "content-type") == 0)
91 std::string content(markdown.GetAttribute("content"));
94 char *pchKey = &content[0];
95 while (size_t cchKey = strcspn(pchKey += strspn(pchKey, "; \t\r\n"), ";="))
97 char *pchValue = pchKey + cchKey;
98 size_t cchValue = strcspn(pchValue += strspn(pchValue, "= \t\r\n"), "; \t\r\n");
99 if (cchKey >= 7 && _strnicmp(pchKey, "charset", 7) == 0 && (cchKey == 7 || strchr(" \t\r\n", pchKey[7])))
101 pchValue[cchValue] = '\0';
105 pchKey = pchValue + cchValue;
110 if (!charset.empty())
112 // Is it an encoding name known to charsets module ?
113 int encodingId = FindEncodingIdFromNameOrAlias(charset.c_str());
115 return GetEncodingCodePageFromId(encodingId);
123 * @brief Parser for XML files to find encoding information
125 static unsigned demoGuessEncoding_xml(const char *src, size_t len, int defcodepage)
127 const char *psrc = src;
128 std::unique_ptr<char[]> buf;
129 if (len >= 2 && (src[0] == 0 || src[1] == 0))
131 buf.reset(new char[len]);
133 for (i = 0, j = 0; i < (int)len; i++)
141 CMarkdown xml(psrc, psrc + len);
142 if (xml.Move("?xml"))
144 std::string encoding(xml.GetAttribute("encoding"));
145 if (!encoding.empty())
147 // Is it an encoding name we can find in charsets module ?
148 unsigned encodingId = FindEncodingIdFromNameOrAlias(encoding.c_str());
151 return GetEncodingCodePageFromId(encodingId);
/**
 * @brief Parser for rc files to find encoding information
 * @param [in] src File contents (may be nullptr when len is 0).
 * @param [in] len Size of the file contents in bytes.
 * @param [in] defcodepage Codepage to return when no pragma is found.
 * @return Codepage from the first line starting with
 *  "#pragma code_page(N)", or defcodepage when there is none.
 * @note sscanf() requires first argument to be zero-terminated so we must
 * copy lines to temporary buffer.
 */
static unsigned demoGuessEncoding_rc(const char *src, size_t len, int defcodepage)
{
	// NB: Diffutils may replace line endings by '\0'
	unsigned cp = defcodepage;
	char line[80];
	while (len != 0)
	{
		// Skip line terminators in front of the next line
		while (len != 0 && (*src == '\r' || *src == '\n' || *src == '\0'))
		{
			++src;
			--len;
		}
		// Find the end of the line
		const char *base = src;
		while (len != 0 && *src != '\r' && *src != '\n' && *src != '\0')
		{
			++src;
			--len;
		}
		// Copy the line itself (not the remaining byte count), truncated to
		// fit the buffer, so the last line of the file is parsed as well
		const size_t linelen = static_cast<size_t>(src - base);
		const size_t n = linelen < sizeof line - 1 ? linelen : sizeof line - 1;
		memcpy(line, base, n);
		line[n] = '\0';
		if (sscanf_s(line, "#pragma code_page(%5u)", &cp) == 1)
			break;
	}
	return cp;
}
188 namespace codepage_detect
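/**
 * @brief Try to deduce encoding for this file.
 * @param [in] ext File extension.
 * @param [in] src File contents (as a string).
 * @param [in] len Size of the file contents string.
 * @param [in] guessEncodingType Bit flags selecting which codepage guessing
 *  steps are tried; 0 disables codepage guessing.
 * @return Structure holding the detected encoding info.
 */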
// NOTE(review): only part of this function's statements are visible in this
// listing (for instance the condition guarding `encoding.m_bom = true` is not
// shown), so the comments below describe the visible lines only.
197 FileTextEncoding Guess(const String& ext, const void * src, size_t len, int guessEncodingType)
199 FileTextEncoding encoding;
// Unicode detection on the raw bytes; &encoding.m_bom is handed to DetermineEncoding
201 encoding.SetUnicoding(ucr::DetermineEncoding(reinterpret_cast<const unsigned char *>(src), len, &encoding.m_bom));
203 encoding.m_bom = true;
204 if (encoding.m_unicoding != ucr::NONE)
// Start from the default codepage; it is only refined when guessEncodingType is non-zero
206 unsigned cp = ucr::getDefaultCodepage();
207 if (guessEncodingType != 0)
209 if (!ucr::CheckForInvalidUtf8(reinterpret_cast<const char*>(src), len))
// Flag value 2: let the Exconverter instance (when there is one) detect the codepage
211 else if (guessEncodingType & 2)
213 IExconverter* pexconv = Exconverter::getInstance();
214 if (pexconv != nullptr && src != nullptr)
// The autodetect type is carried in the bits above the low 16 of guessEncodingType
216 int autodetectType = (unsigned)guessEncodingType >> 16;
217 cp = pexconv->detectInputCodepage(autodetectType, cp, reinterpret_cast<const char *>(src), len);
// Flag value 1: extension-specific parsers for .rc, .htm/.html and .xml/.xsl,
// each given the current cp as its default
220 if (guessEncodingType & 1)
222 String lower_ext = strutils::makelower(ext);
223 if (lower_ext == _T(".rc"))
225 cp = demoGuessEncoding_rc(reinterpret_cast<const char *>(src), len, cp);
227 else if (lower_ext == _T(".htm") || lower_ext == _T(".html"))
229 cp = demoGuessEncoding_html(reinterpret_cast<const char *>(src), len, cp);
231 else if (lower_ext == _T(".xml") || lower_ext == _T(".xsl"))
233 cp = demoGuessEncoding_xml(reinterpret_cast<const char *>(src), len, cp);
// Record the resulting codepage in `encoding`
237 encoding.SetCodepage(cp);
242 * @brief Try to deduce encoding for this file.
243 * @param [in] filepath Full path to the file.
244 * @param [in] bGuessEncoding Try to guess codepage (not just unicode encoding).
245 * @return Structure getting the encoding info.
247 FileTextEncoding Guess(const String& filepath, int guessEncodingType, ptrdiff_t mapmaxlen)
249 CMarkdown::FileImage fi(filepath != _T("NUL") ? filepath.c_str() : nullptr, mapmaxlen);
250 String ext = paths::FindExtension(filepath);
251 return Guess(ext, fi.pImage, fi.cbImage, guessEncodingType);