Src/codepage_detect.cpp

   1 /**
   2  * @file  codepage_detect.cpp
   3  *
   4  * @brief Deducing codepage from file contents, when we can
   5  *
   6  */
   7
   8 #include "pch.h"
   9 #include "codepage_detect.h"
  10 #include <cstdio>
  11 #include <cstring>
  12 #include <algorithm>
  13 #include <memory>
  14 #include "unicoder.h"
  15 #include "ExConverter.h"
  16 #include "charsets.h"
  17 #include "FileTextEncoding.h"
  18 #include "paths.h"
  19 #include "markdown.h"
  20
  21 /**
  22  * @brief Prefixes to handle when searching for codepage names
  23  * NB: prefixes ending in '-' must go first!
  24  */
  25 static const char *f_wincp_prefixes[] =
  26 {
  27         "WINDOWS-", "WINDOWS", "CP-", "CP", "MSDOS-", "MSDOS"
  28 };
  29
  30 /**
  31  * @brief Remove prefix from the text.
  32  * @param [in] text Text to process.
  33  * @param [in] prefix Prefix to remove.
  34  * @return Text without the prefix.
  35  */
  36 static const char *EatPrefix(const char *text, const char *prefix)
  37 {
  38         size_t len = strlen(prefix);
  39         if (len)
  40                 if (_strnicmp(text, prefix, len) == 0)
  41                         return text + len;
  42         return 0;
  43 }
  44
  45 /**
  46  * @brief Try to to match codepage name from codepages module, & watch for f_wincp_prefixes aliases
  47  */
  48 static int
  49 FindEncodingIdFromNameOrAlias(const char *encodingName)
  50 {
  51         // Try name as given
  52         unsigned encodingId = GetEncodingIdFromName(encodingName);
  53         if (encodingId == 0)
  54         {
  55                 // Handle purely numeric values (codepages)
  56                 char *ahead = nullptr;
  57                 unsigned codepage = strtol(encodingName, &ahead, 10);
  58                 int i = 0;
  59                 while (*ahead != '\0' && i < sizeof(f_wincp_prefixes)/sizeof(f_wincp_prefixes[0]))
  60                 {
  61                         if (const char *remainder = EatPrefix(encodingName, f_wincp_prefixes[i]))
  62                         {
  63                                 codepage = strtol(remainder, &ahead, 10);
  64                         }
  65                         ++i;
  66                 }
  67                 if (*ahead == '\0')
  68                 {
  69                         encodingId = GetEncodingIdFromCodePage(codepage);
  70                 }
  71         }
  72         return encodingId;
  73 }
  74
  75 /**
  76  * @brief Parser for HTML files to find encoding information
  77  */
  78 static unsigned demoGuessEncoding_html(const char *src, size_t len, int defcodepage)
  79 {
  80         CMarkdown markdown(src, src + len, CMarkdown::Html);
  81         //As <html> and <head> are optional, there is nothing to pull...
  82         //markdown.Move("html").Pop().Move("head").Pop();
  83         while (markdown.Move("meta"))
  84         {
  85                 std::string charset(markdown.GetAttribute("charset"));
  86                 if (charset.empty())
  87                 {
  88                         std::string http_equiv(markdown.GetAttribute("http-equiv"));
  89                         if (!http_equiv.empty() && _stricmp(http_equiv.c_str(), "content-type") == 0)
  90                         {
  91                                 std::string content(markdown.GetAttribute("content"));
  92                                 if (!content.empty())
  93                                 {
  94                                         char *pchKey = &content[0];
  95                                         while (size_t cchKey = strcspn(pchKey += strspn(pchKey, "; \t\r\n"), ";="))
  96                                         {
  97                                                 char *pchValue = pchKey + cchKey;
  98                                                 size_t cchValue = strcspn(pchValue += strspn(pchValue, "= \t\r\n"), "; \t\r\n");
  99                                                 if (cchKey >= 7 && _strnicmp(pchKey, "charset", 7) == 0 && (cchKey == 7 || strchr(" \t\r\n", pchKey[7])))
 100                                                 {
 101                                                         pchValue[cchValue] = '\0';
 102                                                         charset = pchValue;
 103                                                         break;
 104                                                 }
 105                                                 pchKey = pchValue + cchValue;
 106                                         }
 107                                 }
 108                         }
 109                 }
 110                 if (!charset.empty())
 111                 {
 112                         // Is it an encoding name known to charsets module ?
 113                         int encodingId = FindEncodingIdFromNameOrAlias(charset.c_str());
 114                         if (encodingId)
 115                                 return GetEncodingCodePageFromId(encodingId);
 116                         return defcodepage;
 117                 }
 118         }
 119         return defcodepage;
 120 }
 121
 122 /**
 123  * @brief Parser for XML files to find encoding information
 124  */
 125 static unsigned demoGuessEncoding_xml(const char *src, size_t len, int defcodepage)
 126 {
 127         const char *psrc = src;
 128         std::unique_ptr<char[]> buf;
 129         if (len >= 2 && (src[0] == 0 || src[1] == 0))
 130         {
 131                 buf.reset(new char[len]);
 132                 int i, j;
 133                 for (i = 0, j = 0; i < (int)len; i++)
 134                 {
 135                         if (src[i])
 136                                 buf[j++] = src[i];
 137                 }
 138                 len = j;
 139                 psrc = buf.get();
 140         }
 141         CMarkdown xml(psrc, psrc + len);
 142         if (xml.Move("?xml"))
 143         {
 144                 std::string encoding(xml.GetAttribute("encoding"));
 145                 if (!encoding.empty())
 146                 {
 147                         // Is it an encoding name we can find in charsets module ?
 148                         unsigned encodingId = FindEncodingIdFromNameOrAlias(encoding.c_str());
 149                         if (encodingId)
 150                         {
 151                                 return GetEncodingCodePageFromId(encodingId);
 152                         }
 153                 }
 154         }
 155         return defcodepage;
 156 }
 157
 158 /**
 159  * @brief Parser for rc files to find encoding information
 160  * @note sscanf() requires first argument to be zero-terminated so we must
 161  * copy lines to temporary buffer.
 162  */
 163 static unsigned demoGuessEncoding_rc(const char *src, size_t len, int defcodepage)
 164 {
 165         // NB: Diffutils may replace line endings by '\0'
 166         unsigned cp = defcodepage;
 167         char line[80];
 168         do
 169         {
 170                 while (len && (*src == '\r' || *src == '\n' || *src == '\0'))
 171                 {
 172                         ++src;
 173                         --len;
 174                 }
 175                 const char *base = src;
 176                 while (len && *src != '\r' && *src != '\n' && *src != '\0')
 177                 {
 178                         ++src;
 179                         --len;
 180                 }
 181                 size_t n = len < sizeof line - 1 ? len : sizeof line - 1;
 182                 memcpy(line, base, n);
 183                 line[n] = 0;
 184         } while (len && sscanf_s(line, "#pragma code_page(%5u)", &cp) != 1);
 185         return cp;
 186 }
 187
 188 /**
 189  * @brief Try to deduce encoding for this file.
 190  * @param [in] ext File extension.
 191  * @param [in] src File contents (as a string).
 192  * @param [in] len Size of the file contents string.
 193  * @return Codepage number.
 194  */
 195 static unsigned GuessEncoding_from_bytes(const String& ext, const char *src, size_t len, int guessEncodingType)
 196 {
 197         unsigned cp = ucr::getDefaultCodepage();
 198         if (!ucr::CheckForInvalidUtf8(src, len))
 199                 cp = ucr::CP_UTF_8;
 200         else if (guessEncodingType & 2)
 201         {
 202                 IExconverter *pexconv = Exconverter::getInstance();
 203                 if (pexconv != nullptr && src != nullptr)
 204                 {
 205                         int autodetectType = (unsigned)guessEncodingType >> 16;
 206                         cp = pexconv->detectInputCodepage(autodetectType, cp, src, len);
 207                 }
 208         }
 209         if (guessEncodingType & 1)
 210         {
 211                 String lower_ext = strutils::makelower(ext);
 212                 if (lower_ext == _T(".rc"))
 213                 {
 214                         cp = demoGuessEncoding_rc(src, len, cp);
 215                 }
 216                 else if (lower_ext == _T(".htm") || lower_ext == _T(".html"))
 217                 {
 218                         cp = demoGuessEncoding_html(src, len, cp);
 219                 }
 220                 else if (lower_ext == _T(".xml") || lower_ext == _T(".xsl"))
 221                 {
 222                         cp = demoGuessEncoding_xml(src, len, cp);
 223                 }
 224         }
 225         return cp;
 226 }
 227
 228 /**
 229  * @brief Try to deduce encoding for this file.
 230  * @param [in] filepath Full path to the file.
 231  * @param [in] bGuessEncoding Try to guess codepage (not just unicode encoding).
 232  * @return Structure getting the encoding info.
 233  */
 234 FileTextEncoding GuessCodepageEncoding(const String& filepath, int guessEncodingType, ptrdiff_t mapmaxlen)
 235 {
 236         FileTextEncoding encoding;
 237         CMarkdown::FileImage fi(filepath != _T("NUL") ? filepath.c_str() : nullptr, mapmaxlen);
 238         encoding.SetCodepage(ucr::getDefaultCodepage());
 239         encoding.m_bom = false;
 240         switch (fi.nByteOrder)
 241         {
 242         case 8 + 2 + 0:
 243                 encoding.SetUnicoding(ucr::UCS2LE);
 244                 encoding.SetCodepage(ucr::CP_UCS2LE);
 245                 encoding.m_bom = true;
 246                 break;
 247         case 8 + 2 + 1:
 248                 encoding.SetUnicoding(ucr::UCS2BE);
 249                 encoding.SetCodepage(ucr::CP_UCS2BE);
 250                 encoding.m_bom = true;
 251                 break;
 252         case 8 + 1:
 253                 encoding.SetUnicoding(ucr::UTF8);
 254                 encoding.SetCodepage(ucr::CP_UTF_8);
 255                 encoding.m_bom = true;
 256                 break;
 257         default:
 258                 encoding.m_bom = false;
 259                 break;
 260         }
 261         if (fi.nByteOrder < 4 && guessEncodingType != 0)
 262         {
 263                 String ext = paths::FindExtension(filepath);
 264                 const char *src = (char *)fi.pImage;
 265                 size_t len = fi.cbImage;
 266                 if (len == static_cast<size_t>(mapmaxlen))
 267                 {
 268                         for (size_t i = len; i--; )
 269                         {
 270                                 if (isspace((unsigned char)src[i]))
 271                                 {
 272                                         // make len an even number for ucs-2 detection
 273                                         if ((i % 2) == 0)
 274                                                 len = i;
 275                                         else
 276                                                 len = i + 1;
 277                                         break;
 278                                 }
 279                         }
 280                 }
 281                 if (unsigned cp = GuessEncoding_from_bytes(ext, src, len, guessEncodingType))
 282                         encoding.SetCodepage(cp);
 283                 else
 284                         encoding.SetCodepage(ucr::getDefaultCodepage());
 285         }
 286         return encoding;
 287 }