Src/codepage_detect.cpp

   1 /**
   2  * @file  codepage_detect.cpp
   3  *
   4  * @brief Deducing codepage from file contents, when we can
   5  *
   6  */
   7 // ID line follows -- this is updated by SVN
   8 // $Id: codepage_detect.cpp 7172 2010-05-19 12:57:18Z jtuc $
   9
  10 #include "codepage_detect.h"
  11 #include <cstdio>
  12 #include <cstring>
  13 #include <algorithm>
  14 #include <windows.h>
  15 #include <boost/scoped_array.hpp>
  16 #include "unicoder.h"
  17 #include "ExConverter.h"
  18 #include "codepage.h"
  19 #include "charsets.h"
  20 #include "FileTextEncoding.h"
  21 #include "paths.h"
  22 #include "markdown.h"
  23
  24
  25 #ifdef _WIN32
  26 #  define strcasecmp(a, b) stricmp((a), (b))
  27 #  define strncasecmp(a, b, n) strnicmp((a), (b), (n))
  28 #endif
  29
  30 /** @brief Buffer size used in this file. */
  31 static const int BufSize = 65536;
  32
  33 /**
  34  * @brief Prefixes to handle when searching for codepage names
  35  * NB: prefixes ending in '-' must go first!
  36  */
  37 static const char *f_wincp_prefixes[] =
  38 {
  39         "WINDOWS-", "WINDOWS", "CP-", "CP", "MSDOS-", "MSDOS"
  40 };
  41
  42 /**
  43  * @brief Remove prefix from the text.
  44  * @param [in] text Text to process.
  45  * @param [in] prefix Prefix to remove.
  46  * @return Text without the prefix.
  47  */
  48 static const char *EatPrefix(const char *text, const char *prefix)
  49 {
  50         size_t len = strlen(prefix);
  51         if (len)
  52                 if (strncasecmp(text, prefix, len) == 0)
  53                         return text + len;
  54         return 0;
  55 }
  56
  57 /**
  58  * @brief Try to to match codepage name from codepages module, & watch for f_wincp_prefixes aliases
  59  */
  60 static int
  61 FindEncodingIdFromNameOrAlias(const char *encodingName)
  62 {
  63         // Try name as given
  64         unsigned encodingId = GetEncodingIdFromName(encodingName);
  65         if (encodingId == 0)
  66         {
  67                 // Handle purely numeric values (codepages)
  68                 char *ahead = 0;
  69                 unsigned codepage = strtol(encodingName, &ahead, 10);
  70                 int i = 0;
  71                 while (*ahead != '\0' && i < sizeof(f_wincp_prefixes)/sizeof(f_wincp_prefixes[0]))
  72                 {
  73                         if (const char *remainder = EatPrefix(encodingName, f_wincp_prefixes[i]))
  74                         {
  75                                 codepage = strtol(remainder, &ahead, 10);
  76                         }
  77                         ++i;
  78                 }
  79                 if (*ahead == '\0')
  80                 {
  81                         encodingId = GetEncodingIdFromCodePage(codepage);
  82                 }
  83         }
  84         return encodingId;
  85 }
  86
  87 /**
  88  * @brief Parser for HTML files to find encoding information
  89  */
  90 static unsigned demoGuessEncoding_html(const char *src, size_t len, int defcodepage)
  91 {
  92         CMarkdown markdown(src, src + len, CMarkdown::Html);
  93         //As <html> and <head> are optional, there is nothing to pull...
  94         //markdown.Move("html").Pop().Move("head").Pop();
  95         while (markdown.Move("meta"))
  96         {
  97                 std::string http_equiv(markdown.GetAttribute("http-equiv"));
  98                 if (!http_equiv.empty() && strcasecmp(http_equiv.c_str(), "content-type") == 0)
  99                 {
 100                         std::string content(markdown.GetAttribute("content"));
 101                         char *pchKey = &content[0];
 102                         if (!content.empty())
 103                         {
 104                                 while (size_t cchKey = strcspn(pchKey += strspn(pchKey, "; \t\r\n"), ";="))
 105                                 {
 106                                         char *pchValue = pchKey + cchKey;
 107                                         size_t cchValue = strcspn(pchValue += strspn(pchValue, "= \t\r\n"), "; \t\r\n");
 108                                         if (cchKey >= 7 && strncasecmp(pchKey, "charset", 7) == 0 && (cchKey == 7 || strchr(" \t\r\n", pchKey[7])))
 109                                         {
 110                                                 pchValue[cchValue] = '\0';
 111                                                 // Is it an encoding name known to charsets module ?
 112                                                 unsigned encodingId = FindEncodingIdFromNameOrAlias(pchValue);
 113                                                 if (encodingId)
 114                                                 {
 115                                                         return GetEncodingCodePageFromId(encodingId);
 116                                                 }
 117                                                 return defcodepage;
 118                                         }
 119                                         pchKey = pchValue + cchValue;
 120                                 }
 121                         }
 122                 }
 123         }
 124         return defcodepage;
 125 }
 126
 127 /**
 128  * @brief Parser for XML files to find encoding information
 129  */
 130 static unsigned demoGuessEncoding_xml(const char *src, size_t len, int defcodepage)
 131 {
 132         const char *psrc = src;
 133         boost::scoped_array<char> buf;
 134         if (len >= 2 && (src[0] == 0 || src[1] == 0))
 135         {
 136                 buf.reset(new char[len]);
 137                 int i, j;
 138                 for (i = 0, j = 0; i < (int)len; i++)
 139                 {
 140                         if (src[i])
 141                                 buf[j++] = src[i];
 142                 }
 143                 len = j;
 144                 psrc = buf.get();
 145         }
 146         CMarkdown xml(psrc, psrc + len);
 147         if (xml.Move("?xml"))
 148         {
 149                 std::string encoding(xml.GetAttribute("encoding"));
 150                 if (!encoding.empty())
 151                 {
 152                         // Is it an encoding name we can find in charsets module ?
 153                         unsigned encodingId = FindEncodingIdFromNameOrAlias(encoding.c_str());
 154                         if (encodingId)
 155                         {
 156                                 return GetEncodingCodePageFromId(encodingId);
 157                         }
 158                 }
 159         }
 160         return defcodepage;
 161 }
 162
 163 /**
 164  * @brief Parser for rc files to find encoding information
 165  * @note sscanf() requires first argument to be zero-terminated so we must
 166  * copy lines to temporary buffer.
 167  */
 168 static unsigned demoGuessEncoding_rc(const char *src, size_t len, int defcodepage)
 169 {
 170         // NB: Diffutils may replace line endings by '\0'
 171         unsigned cp = defcodepage;
 172         char line[80];
 173         do
 174         {
 175                 while (len && (*src == '\r' || *src == '\n' || *src == '\0'))
 176                 {
 177                         ++src;
 178                         --len;
 179                 }
 180                 const char *base = src;
 181                 while (len && *src != '\r' && *src != '\n' && *src != '\0')
 182                 {
 183                         ++src;
 184                         --len;
 185                 }
 186                 size_t n = len < sizeof line - 1 ? len : sizeof line - 1;
 187                 memcpy(line, base, n);
 188                 line[n] = 0;
 189         } while (len && sscanf(line, "#pragma code_page(%u)", &cp) != 1);
 190         return cp;
 191 }
 192
 193 /**
 194  * @brief Try to deduce encoding for this file.
 195  * @param [in] ext File extension.
 196  * @param [in] src File contents (as a string).
 197  * @param [in] len Size of the file contents string.
 198  * @return Codepage number.
 199  */
 200 static unsigned GuessEncoding_from_bytes(const String& ext, const char *src, size_t len, int guessEncodingType)
 201 {
 202         unsigned cp = ucr::getDefaultCodepage();
 203         if (guessEncodingType & 2)
 204         {
 205                 IExconverter *pexconv = Exconverter::getInstance();
 206                 if (pexconv && src != NULL)
 207                 {
 208                         int autodetectType = (unsigned)guessEncodingType >> 16;
 209                         cp = pexconv->detectInputCodepage(autodetectType, cp, src, len);
 210                 }
 211         }
 212         else
 213         {
 214                 if (!ucr::CheckForInvalidUtf8(src, len))
 215                         cp = CP_UTF8;
 216         }
 217         if (guessEncodingType & 1)
 218         {
 219                 if (len > BufSize)
 220                         len = BufSize;
 221                 String lower_ext = string_makelower(ext);
 222                 if (lower_ext == _T(".rc"))
 223                 {
 224                         cp = demoGuessEncoding_rc(src, len, cp);
 225                 }
 226                 else if (lower_ext == _T(".htm") || lower_ext == _T(".html"))
 227                 {
 228                         cp = demoGuessEncoding_html(src, len, cp);
 229                 }
 230                 else if (lower_ext == _T(".xml") || lower_ext == _T(".xsl"))
 231                 {
 232                         cp = demoGuessEncoding_xml(src, len, cp);
 233                 }
 234         }
 235         return cp;
 236 }
 237
 238 /**
 239  * @brief Try to deduce encoding for this file.
 240  * @param [in] filepath Full path to the file.
 241  * @param [in] bGuessEncoding Try to guess codepage (not just unicode encoding).
 242  * @return Structure getting the encoding info.
 243  */
 244 FileTextEncoding GuessCodepageEncoding(const String& filepath, int guessEncodingType)
 245 {
 246         FileTextEncoding encoding;
 247         const int mapmaxlen = BufSize;
 248         CMarkdown::FileImage fi(filepath.c_str(), mapmaxlen);
 249         encoding.SetCodepage(ucr::getDefaultCodepage());
 250         encoding.m_bom = false;
 251         switch (fi.nByteOrder)
 252         {
 253         case 8 + 2 + 0:
 254                 encoding.SetUnicoding(ucr::UCS2LE);
 255                 encoding.SetCodepage(CP_UCS2LE);
 256                 encoding.m_bom = true;
 257                 break;
 258         case 8 + 2 + 1:
 259                 encoding.SetUnicoding(ucr::UCS2BE);
 260                 encoding.SetCodepage(CP_UCS2BE);
 261                 encoding.m_bom = true;
 262                 break;
 263         case 8 + 1:
 264                 encoding.SetUnicoding(ucr::UTF8);
 265                 encoding.SetCodepage(CP_UTF8);
 266                 encoding.m_bom = true;
 267                 break;
 268         default:
 269                 encoding.m_bom = false;
 270                 break;
 271         }
 272         if (fi.nByteOrder < 4 && guessEncodingType != 0)
 273         {
 274                 String ext = paths_FindExtension(filepath);
 275                 const char *src = (char *)fi.pImage;
 276                 size_t len = fi.cbImage;
 277                 if (len == mapmaxlen)
 278                 {
 279                         for (size_t i = len; i--; )
 280                         {
 281                                 if (isspace((unsigned char)src[i]))
 282                                 {
 283                                         // make len an even number for ucs-2 detection
 284                                         if ((i % 2) == 0)
 285                                                 len = i;
 286                                         else
 287                                                 len = i + 1;
 288                                         break;
 289                                 }
 290                         }
 291                 }
 292                 if (unsigned cp = GuessEncoding_from_bytes(ext, src, len, guessEncodingType))
 293                         encoding.SetCodepage(cp);
 294                 else
 295                         encoding.SetCodepage(ucr::getDefaultCodepage());
 296         }
 297         return encoding;
 298 }