Src/Common/unicoder.cpp

   1 /**
   2  *  @file   unicoder.cpp
   3  *  @author Perry Rapp, Creator, 2003-2006
   4  *  @date   Created: 2003-10
   5  *  @date   Edited:  2006-02-20 (Perry Rapp)
   6  *
   7  *  @brief  Implementation of utility unicode conversion routines
   8  */
   9
  10 /* The MIT License
  11 Copyright (c) 2003 Perry Rapp
  12 Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
  13 The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
  14 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
  15 */
  16
  17 #include "pch.h"
  18 #include "unicoder.h"
  19 #include <windows.h>
  20 #include <cassert>
  21 #include <memory>
  22 #include <Poco/UnicodeConverter.h>
  23 #include "UnicodeString.h"
  24 #include "ExConverter.h"
  25
  26 using Poco::UnicodeConverter;
  27
  28 namespace ucr
  29 {
  30
  31 // store the default codepage as specified by user in options
  32 static int f_nDefaultCodepage = GetACP();
  33
  34
  # pragma warning(push)          // Saves the current warning state.
  # pragma warning(disable:4244)  // Temporarily disables warning 4244: "conversion from 'int' to 'char', possible loss of data"
  /**
   * @brief Encode one codepoint as UTF-8 (1 to 6 bytes, original UTF-8 scheme)
   *
   * @param [in] unich Codepoint to encode.
   * @param [out] utf8 Receives the bytes; up to 6 bytes are written
   *  (the historical contract asks for a buffer of 7 bytes or more).
   * @return Number of bytes written. A codepoint with the high bit set
   *  cannot be encoded: a single '?' is written and 1 is returned.
   *
   * The output is not zero-terminated.
   */
  int Ucs4_to_Utf8(unsigned unich, unsigned char * utf8)
  {
      // Values above 31 bits have no encoding
      if (unich > 0x7fffffff)
      {
          // TODO: What do we do ?
          utf8[0] = '?';
          return 1;
      }

      // Plain ASCII is stored as is
      if (unich <= 0x7f)
      {
          utf8[0] = static_cast<unsigned char>(unich);
          return 1;
      }

      // Pick the sequence length and the marker bits of the lead byte
      int count;
      unsigned lead;
      if (unich <= 0x7ff)
      {
          count = 2;
          lead = 0xc0;
      }
      else if (unich <= 0xffff)
      {
          count = 3;
          lead = 0xe0;
      }
      else if (unich <= 0x1fffff)
      {
          count = 4;
          lead = 0xf0;
      }
      else if (unich <= 0x3ffffff)
      {
          count = 5;
          lead = 0xf8;
      }
      else
      {
          count = 6;
          lead = 0xfc;
      }

      // Fill the trailing bytes from last to first, six payload bits each
      unsigned rest = unich;
      for (int i = count - 1; i > 0; --i)
      {
          utf8[i] = static_cast<unsigned char>(0x80 + (rest & 0x3f));
          rest >>= 6;
      }

      // Whatever is left belongs in the lead byte
      utf8[0] = static_cast<unsigned char>(lead + rest);
      return count;
  }
  # pragma warning(pop)           // Restores the warning state.
 100
 /**
  * @brief Length in bytes of the UTF-8 sequence announced by a lead byte.
  * @param [in] ch First byte of the sequence.
  * @return 1 to 6 for a valid lead byte; -1 for a continuation byte
  *  (0x80..0xBF) and for 0xFE / 0xFF.
  */
 int Utf8len_fromLeadByte(unsigned char ch)
 {
     // Continuation bytes and 0xFE/0xFF never start a sequence
     const bool continuation = (ch >= 0x80 && ch < 0xC0);
     if (continuation || ch >= 0xFE)
         return -1;

     // The number of leading 1 bits is the sequence length (none means ASCII)
     int ones = 0;
     for (unsigned mask = 0x80; (ch & mask) != 0; mask >>= 1)
         ++ones;

     return (ones == 0) ? 1 : ones;
 }
 117
 /**
  * @brief Number of bytes needed to store a codepoint as UTF-8
  * @param [in] ch Codepoint to measure.
  * @return 1 to 6, or -1 when the codepoint is above 0x7FFFFFFF.
  */
 int Utf8len_fromCodepoint(unsigned ch)
 {
     // Largest codepoint that fits in a sequence of 1, 2, ... 6 bytes
     static const unsigned maxForLength[] =
     {
         0x7F, 0x7FF, 0xFFFF, 0x1FFFFF, 0x3FFFFFF, 0x7FFFFFFF
     };

     int length = 1;
     for (unsigned limit : maxForLength)
     {
         if (ch <= limit)
             return length;
         ++length;
     }
     return -1;
 }
 131
 132 /**
 133  * @brief How many chars in this UTF-8 string ?
 134  *
 135  * @param size size argument as filemapping are not 0 terminated
 136  *
 137  * @bug Fails for files larger than 2gigs
 138  */
 139 size_t stringlen_of_utf8(const char* text, size_t size)
 140 {
 141         size_t len = 0;
 142         for (size_t i = 0; i < size;)
 143         {
 144                 int chlen = Utf8len_fromLeadByte(text[i]);
 145                 if (chlen < 1) chlen = 1;
 146                 i += chlen;
 147                 len ++;
 148         }
 149         return len;
 150 }
 151
 /**
  * @brief Read one UTF-8 sequence and return it as a Unicode codepoint
  *
  * @param [in] str First byte of the sequence. The lead byte decides how
  *  many bytes are read (1 to 6); the caller must make sure that many bytes
  *  are readable.
  * @return The decoded codepoint. An ASCII byte or a stray continuation
  *  byte (0x80..0xBF) is returned unchanged.
  *
  * No validation is done: trailing bytes are not checked for the 10xxxxxx
  * pattern and overlong forms are accepted.
  */
 unsigned GetUtf8Char(unsigned char * str)
 {
     const unsigned lead = str[0];

     /* test short cases first, as probably much more common */
     if ((lead & 0xC0) != 0xC0)
         return lead;

     // The position of the first 0 bit in the lead byte gives the number of
     // trailing bytes; only the bits below that 0 carry payload.
     int trailing;
     unsigned ch;
     if (!(lead & 0x20))
     {
         trailing = 1;
         ch = lead & 0x1F;
     }
     else if (!(lead & 0x10))
     {
         trailing = 2;
         ch = lead & 0x0F;
     }
     else if (!(lead & 0x08))
     {
         trailing = 3;
         ch = lead & 0x07;
     }
     else if (!(lead & 0x04))
     {
         trailing = 4;
         ch = lead & 0x03; // 111110xx: masking with 0x0F would keep a marker bit
     }
     else
     {
         trailing = 5;
         ch = lead & 0x01; // 1111110x: a wider mask would overflow the shift
     }

     // Each trailing byte contributes its low six bits
     for (int i = 1; i <= trailing; ++i)
         ch = (ch << 6) | (str[i] & 0x3Fu);

     return ch;
 }
 203
 204
 # pragma warning(push)          // Saves the current warning state.
 # pragma warning(disable:4244)  // Temporarily disables warning 4244: "conversion from 'int' to 'char', possible loss of data"
 /**
  * @brief Append codepoint u as UTF-8 at lpd and move lpd past the bytes written
  *
  * @param [in] u Codepoint to encode.
  * @param [in,out] lpd Write position; advanced by the number of bytes written.
  * @return Number of bytes written (1 to 6). A codepoint with the high bit
  *  set cannot be encoded: '?' is written and 1 is returned.
  */
 int to_utf8_advance(unsigned u, unsigned char * &lpd)
 {
     // Not representable in the 31-bit UTF-8 scheme
     if (u >= 0x80000000)
     {
         *lpd++ = '?';
         return 1;
     }

     // Sequence length and the marker bits that go into the first byte
     int total;
     unsigned marker;
     if (u < 0x80)
     {
         total = 1;
         marker = 0x00;
     }
     else if (u < 0x800)
     {
         total = 2;
         marker = 0xC0;
     }
     else if (u < 0x10000)
     {
         total = 3;
         marker = 0xE0;
     }
     else if (u < 0x200000)
     {
         total = 4;
         marker = 0xF0;
     }
     else if (u < 0x4000000)
     {
         total = 5;
         marker = 0xF8;
     }
     else
     {
         total = 6;
         marker = 0xFC;
     }

     // First byte: marker plus the most significant payload bits
     int shift = 6 * (total - 1);
     *lpd++ = static_cast<unsigned char>(marker + (u >> shift));

     // Remaining bytes: six bits each, most significant group first
     while (shift > 0)
     {
         shift -= 6;
         *lpd++ = static_cast<unsigned char>(0x80 + ((u >> shift) & 0x3F));
     }
     return total;
 }
 # pragma warning(pop)           // Restores the warning state.
 266
 267 /**
 268  * @brief convert character passed (Unicode codepoint) to a TCHAR (set lossy flag if imperfect conversion)
 269  */
 270 void maketchar(String & ch, unsigned unich, bool & lossy)
 271 {
 272         static unsigned codepage = CP_ACP;
 273         // NB: Windows always draws in CP_ACP, not CP_THREAD_ACP, so we must use CP_ACP as an internal codepage
 274
 275         maketchar(ch, unich, lossy, codepage);
 276 }
 277
 278 /**
 279  * @brief convert character passed (Unicode codepoint) to a TCHAR (set lossy flag if imperfect conversion)
 280  */
 281 void maketchar(String & ch, unsigned unich, bool & lossy, unsigned codepage)
 282 {
 283 #ifdef _UNICODE
 284         if (unich < 0x10000)
 285         {
 286                 ch = static_cast<TCHAR>(unich);
 287                 return;
 288         }
 289         else if (unich < 0x110000)
 290         {
 291                 ch = static_cast<TCHAR>(((unich - 0x10000)/0x400 + 0xd800));
 292                 ch += static_cast<TCHAR>(((unich % 0x400) + 0xdc00));
 293                 return;
 294         }
 295         lossy = true;
 296         ch = '?';
 297         return;
 298 #else
 299         if (unich < 0x80)
 300         {
 301                 ch = (TCHAR)unich;
 302                 return;
 303         }
 304         wchar_t wch = (wchar_t)unich;
 305         if (!lossy)
 306         {
 307                 // So far it isn't lossy, so try for lossless conversion
 308                 char outch[3] = {0};
 309                 BOOL defaulted = FALSE;
 310                 DWORD flags = WC_NO_BEST_FIT_CHARS;
 311                 if (WideCharToMultiByte(codepage, flags, &wch, 1, outch, sizeof(outch), nullptr, &defaulted)
 312                                 && !defaulted)
 313                 {
 314                         ch = outch;
 315                         return;
 316                 }
 317                 lossy = true;
 318         }
 319         // already lossy, so make our best shot
 320         DWORD flags = WC_COMPOSITECHECK + WC_DISCARDNS + WC_SEPCHARS + WC_DEFAULTCHAR;
 321         TCHAR outbuff[16];
 322         int n = WideCharToMultiByte(codepage, flags, &wch, 1, outbuff, sizeof(outbuff) - 1, nullptr, nullptr);
 323         if (n > 0)
 324         {
 325                 outbuff[n] = 0;
 326                 ch = outbuff;
 327                 return;
 328         }
 329         ch = _T("?");
 330 #endif
 331 }
 332
 333 /**
 334  * @brief convert 8-bit character input to Unicode codepoint and return it
 335  */
 336 unsigned byteToUnicode(unsigned char ch)
 337 {
 338         static unsigned codepage = CP_ACP;
 339         // NB: Windows always draws in CP_ACP, not CP_THREAD_ACP, so we must use CP_ACP as an internal codepage
 340
 341         return byteToUnicode(ch, codepage);
 342 }
 343
 344 /**
 345  * @brief convert 8-bit character input to Unicode codepoint and return it
 346  */
 347 unsigned byteToUnicode(unsigned char ch, unsigned codepage)
 348 {
 349
 350         if (ch < 0x80)
 351                 return ch;
 352
 353         DWORD flags = 0;
 354         wchar_t wbuff;
 355         int n = MultiByteToWideChar(codepage, flags, (const char*) & ch, 1, &wbuff, 1);
 356         if (n > 0)
 357                 return wbuff;
 358         else
 359                 return '?';
 360 }
 361
 362 /**
 363  * @brief Return encoding used for TCHAR & String
 364  */
 365 void getInternalEncoding(UNICODESET * unicoding, int * codepage)
 366 {
 367 #ifdef _UNICODE
 368         *unicoding = UCS2LE;
 369         *codepage = CP_UCS2LE;
 370 #else
 371         // NB: Windows always draws in CP_ACP, not CP_THREAD_ACP, so we must use CP_ACP as an internal codepage
 372         *unicoding = NONE;
 373         *codepage = CP_ACP;
 374 #endif
 375 }
 376
 377 /**
 378  * @brief Write appropriate BOM (Unicode byte order marker)
 379  * returns #bytes written
 380  */
 381 int writeBom(void* dest, UNICODESET unicoding)
 382 {
 383         unsigned char * lpd = reinterpret_cast<unsigned char *>(dest);
 384         // write Unicode byte order marker (BOM)
 385         if (unicoding == UCS2LE)
 386         {
 387                 *lpd++ = 0xFF;
 388                 *lpd++ = 0xFE;
 389                 return 2;
 390         }
 391         else if (unicoding == UCS2BE)
 392         {
 393                 *lpd++ = 0xFE;
 394                 *lpd++ = 0xFF;
 395                 return 2;
 396         }
 397         else if (unicoding == UTF8)
 398         {
 399                 *lpd++ = 0xEF;
 400                 *lpd++ = 0xBB;
 401                 *lpd++ = 0xBF;
 402                 return 3;
 403         }
 404         return 0;
 405 }
 406
 407 int getBomSize(UNICODESET unicoding)
 408 {
 409         if (unicoding == UCS2LE)
 410                 return 2;
 411         else if (unicoding == UCS2BE)
 412                 return 2;
 413         else if (unicoding == UTF8)
 414                 return 3;
 415         return 0;
 416 }
 417
 418 /**
 419  * @brief Extract character from pointer, handling UCS-2 codesets
 420  *  This does not handle MBCS or UTF-8 codepages correctly!
 421  *  Client should not use this except for Unicode or SBCS codepages.
 422  */
 423 unsigned get_unicode_char(unsigned char * ptr, UNICODESET codeset, int codepage)
 424 {
 425         unsigned ch;
 426         switch (codeset)
 427         {
 428         case UCS2LE:
 429                 ch = *((WORD *)ptr);
 430                 break;
 431         case UCS2BE:
 432                 ch = (ptr[0] << 8) + ptr[1];
 433                 break;
 434         default:
 435                 // TODO: How do we recognize valid codepage ?
 436                 // if not, use byteToUnicode(*ptr)
 437                 ch = byteToUnicode(*ptr, codepage);
 438         }
 439         return ch;
 440 }
 441
 442 /**
 443  * @brief Convert series of bytes (8-bit chars) to TCHARs.
 444  *
 445  * @param [out] str String returned.
 446  * @param [in] lpd Original byte array to convert.
 447  * @param [in] len Length of the original byte array.
 448  * @param [in] codepage Codepage used.
 449  * @param [out] lossy Was conversion lossy?
 450  * @return true if conversion succeeds, false otherwise.
 451  * @todo This doesn't inform the caller whether translation was lossy
 452  *  In fact, this doesn't even know. Probably going to have to make
 453  *  two passes, the first with MB_ERR_INVALID_CHARS. Ugh. :(
 454  */
 455 bool maketstring(String & str, const char* lpd, size_t len, int codepage, bool * lossy)
 456 {
 457         if (!len)
 458         {
 459                 str.clear();
 460                 return true;
 461         }
 462
 463         int defcodepage = getDefaultCodepage();
 464
 465         // 0 is a valid value (CP_ACP)!
 466         if (codepage == -1)
 467                 codepage = defcodepage;
 468
 469 #ifdef UNICODE
 470         // Convert input to Unicode, using specified codepage
 471         // TCHAR is wchar_t, so convert into String (str)
 472         DWORD flags = MB_ERR_INVALID_CHARS;
 473         size_t wlen = len * 2 + 6;
 474         assert(wlen < INT_MAX);
 475
 476         try
 477         {
 478                 str.resize(wlen);
 479         }
 480         catch (std::bad_alloc&)
 481         {
 482                 // Not enough memory - exit
 483                 return false;
 484         }
 485
 486         LPWSTR wbuff = &*str.begin();
 487         if (codepage == CP_ACP || IsValidCodePage(codepage))
 488         {
 489                 int n = MultiByteToWideChar(codepage, flags, lpd, static_cast<int>(len), wbuff, static_cast<int>(wlen - 1));
 490                 if (n)
 491                 {
 492                         /*
 493                         NB: MultiByteToWideChar is documented as only zero-terminating
 494                         if input was zero-terminated, but it appears that it can
 495                         zero-terminate even if input wasn't.
 496                         So we check if it zero-terminated and adjust count accordingly.
 497                         */
 498                         //>2007-01-11 jtuc: We must preserve an embedded zero even if it is
 499                         // the last input character. As we don't expect MultiByteToWideChar to
 500                         // add a zero that does not originate from the input string, it is a
 501                         // good idea to ASSERT that the assumption holds.
 502                         if (wbuff[n-1] == 0 && lpd[len-1] != 0)
 503                         {
 504                                 //assert(false);
 505                                 *lossy = true;
 506                                 --n;
 507                         }
 508                         try
 509                         {
 510                                 str.resize(n);
 511                         }
 512                         catch (std::bad_alloc&)
 513                         {
 514                                 // Not enough memory - exit
 515                                 return false;
 516                         }
 517                         return true;
 518                 }
 519                 else
 520                 {
 521                         if (GetLastError() == ERROR_INVALID_FLAGS)
 522                         {
 523                                 n = MultiByteToWideChar(codepage, 0, lpd, static_cast<int>(len), wbuff, static_cast<int>(wlen-1));
 524                                 if (n)
 525                                 {
 526                                         /* NB: MultiByteToWideChar is documented as only zero-terminating
 527                                         if input was zero-terminated, but it appears that it can
 528                                         zero-terminate even if input wasn't.
 529                                         So we check if it zero-terminated and adjust count accordingly.
 530                                         */
 531                                         if (wbuff[n-1] == 0 && lpd[len-1] != 0)
 532                                         {
 533                                                 //assert(false);
 534                                                 *lossy = true;
 535                                                 --n;
 536                                         }
 537                                         try
 538                                         {
 539                                                 str.resize(n);
 540                                         }
 541                                         catch (std::bad_alloc&)
 542                                         {
 543                                                 // Not enough memory - exit
 544                                                 return false;
 545                                         }
 546                                         return true;
 547                                 }
 548                         }
 549                         if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
 550                         {
 551                                 *lossy = true;
 552                                 flags = 0;
 553                                 // wlen & wbuff are still fine
 554                                 n = MultiByteToWideChar(codepage, flags, lpd, static_cast<int>(len), wbuff, static_cast<int>(wlen-1));
 555                                 if (n)
 556                                 {
 557                                         try
 558                                         {
 559                                                 str.resize(n);
 560                                         }
 561                                         catch (std::bad_alloc&)
 562                                         {
 563                                                 // Not enough memory - exit
 564                                                 return false;
 565                                         }
 566                                         return true;
 567                                 }
 568                         }
 569                         str = _T("?");
 570                         return true;
 571                 }
 572         }
 573         else
 574         {
 575                 IExconverter *pexconv = Exconverter::getInstance();
 576                 if (pexconv != nullptr)
 577                 {
 578                         size_t n = wlen;
 579                         if (pexconv->convertToUnicode(codepage, lpd, &len, wbuff, &n))
 580                         {
 581                                 try
 582                                 {
 583                                         str.resize(n);
 584                                 }
 585                                 catch (std::bad_alloc&)
 586                                 {
 587                                         // Not enough memory - exit
 588                                         return false;
 589                                 }
 590                                 return true;
 591                         }
 592                         else
 593                         {
 594                                 *lossy = true;
 595                                 str = _T("?");
 596                         }
 597                         return true;
 598                 }
 599                 else
 600                 {
 601                         *lossy = true;
 602                         str = _T("?");
 603                 }
 604                 return true;
 605         }
 606
 607 #else
 608         int dstcodepage = IsValidCodePage(defcodepage) ? defcodepage : GetACP();
 609
 610         if (EqualCodepages(codepage, dstcodepage))
 611         {
 612                 // trivial case, they want the bytes in the file interpreted in our current codepage
 613                 // Only caveat is that input (lpd) is not zero-terminated
 614                 str = String(lpd, len);
 615                 return true;
 616         }
 617
 618         if (codepage == CP_ACP || IsValidCodePage(codepage))
 619         {
 620                 str = CrossConvertToStringA(lpd, len, codepage, dstcodepage, lossy);
 621                 if (*lossy)
 622                         str = _T("?");
 623                 return true;
 624         }
 625         else
 626         {
 627                 IExconverter *pexconv = Exconverter::getInstance();
 628                 if (pexconv != nullptr)
 629                 {
 630                         size_t n = len * 6 + 6;
 631                         try
 632                         {
 633                                 str.resize(n);
 634                         }
 635                         catch (std::bad_alloc&)
 636                         {
 637                                 // Not enough memory - exit
 638                                 return false;
 639                         }
 640                         char *buff = &*str.begin();
 641                         pexconv->convert(codepage, dstcodepage, (const unsigned char *)lpd, &len, (unsigned char *)buff, &n);
 642                         if (n)
 643                         {
 644                                 try
 645                                 {
 646                                         str.resize(n);
 647                                 }
 648                                 catch (std::bad_alloc&)
 649                                 {
 650                                         // Not enough memory - exit
 651                                         return false;
 652                                 }
 653                         }
 654                         else
 655                                 str = _T("?");
 656                 }
 657                 else
 658                         str = _T("?");
 659                 return true;
 660         }
 661 #endif
 662 }
 663
 664 /**
 665  * @brief (ANSI build only) Convert from one 8 bit codepage to another
 666  */
 667 #ifndef UNICODE
 668 String CrossConvertToStringA(const char* src, unsigned srclen, int cpin, int cpout, bool * lossy)
 669 {
 670         int wlen = srclen * 2 + 6;
 671         int clen = wlen * 2 + 6;
 672         String str;
 673         str.resize(clen);
 674         char* cbuff = &*str.begin();
 675         int nbytes = CrossConvert(src, srclen, cbuff, clen, cpin, cpout, lossy);
 676         str.resize(nbytes);
 677         return str;
 678 }
 679 #endif
 680
 681 /**
 682  * @brief Convert from one 8-bit codepage to another
 683  *
 684  * destsize must be at least 2
 685  */
 686 int CrossConvert(const char* src, unsigned srclen, char* dest, unsigned destsize, int cpin, int cpout, bool * lossy)
 687 {
 688         assert(destsize > 1);
 689
 690         // Convert input to Unicode, using specified codepage
 691         DWORD flags = 0;
 692         int wlen = srclen * 2 + 6;
 693         auto wbuff = std::make_unique<wchar_t[]>(wlen);
 694         int n;
 695         if (cpin == CP_UCS2LE)
 696         {
 697                 if (srclen == -1)
 698                         srclen = static_cast<unsigned>(wcslen((wchar_t *)src) * sizeof(wchar_t));
 699                 memcpy(wbuff.get(), src, srclen);
 700                 n = srclen / sizeof(wchar_t);
 701         }
 702         else if (cpin == CP_UCS2BE)
 703         {
 704                 if (srclen == -1)
 705                         srclen = static_cast<unsigned>(wcslen((wchar_t *)src) * sizeof(wchar_t));
 706                 _swab((char *)src, (char *)wbuff.get(), srclen);
 707                 n = srclen / sizeof(wchar_t);
 708         }
 709         else
 710         {
 711                 n = MultiByteToWideChar(cpin, flags, (const char*)src, srclen, wbuff.get(), wlen - 1);
 712                 if (!n)
 713                 {
 714                         int nsyserr = ::GetLastError();
 715                         dest[0] = '?';
 716                         return 1;
 717                 }
 718         }
 719         /*
 720         NB: MultiByteToWideChar is documented as only zero-terminating
 721         if input was zero-terminated, but it appears that it can
 722         zero-terminate even if input wasn't.
 723         So we check if it zero-terminated and adjust count accordingly.
 724         */
 725         if (wbuff[n-1] == 0)
 726                 --n;
 727         wbuff[n] = 0; // zero-terminate string
 728
 729         // Now convert to TCHAR (which means defcodepage)
 730         flags = WC_NO_BEST_FIT_CHARS; // TODO: Think about this
 731         BOOL defaulted = FALSE;
 732         BOOL * pdefaulted = &defaulted;
 733         if (cpout == CP_UTF8)
 734         {
 735                 flags = 0;
 736                 pdefaulted = nullptr;
 737         }
 738         if (cpout == CP_UCS2LE)
 739         {
 740                 memcpy(dest, wbuff.get(), n * sizeof(wchar_t));
 741                 n = n * sizeof(wchar_t);
 742                 dest[n] = 0;
 743                 dest[n + 1] = 0;
 744         }
 745         else if (cpout == CP_UCS2BE)
 746         {
 747                 _swab((char *)wbuff.get(), dest, n * sizeof(wchar_t));
 748                 n = n * sizeof(wchar_t);
 749                 dest[n] = 0;
 750                 dest[n + 1] = 0;
 751         }
 752         else
 753         {
 754                 n = WideCharToMultiByte(cpout, flags, wbuff.get(), n, dest, destsize - 1, nullptr, pdefaulted);
 755                 if (!n)
 756                 {
 757                         int nsyserr = ::GetLastError();
 758                 }
 759                 dest[n] = 0;
 760         }
 761         if (lossy)
 762                 *lossy = !!defaulted;
 763         return n;
 764 }
 765
 766 /**
 767  * @brief Buffer constructor.
 768  * The constructor creates buffer with given size.
 769  * @param [in] initialSize Buffer's size.
 770  */
 771 buffer::buffer(size_t initialSize)
 772 {
 773         size = 0;
 774         capacity = initialSize;
 775         ptr = (unsigned char *)calloc(capacity, 1);
 776 }
 777
 778 /**
 779  * @brief Buffer destructor.
 780  * Frees the reserved buffer.
 781  */
 782 buffer::~buffer()
 783 {
 784         free(ptr);
 785 }
 786
 787 /**
 788  * @brief Resize the buffer.
 789  * @param [in] newSize New size of the buffer.
 790  */
 791 void buffer::resize(size_t newSize)
 792 {
 793         if (capacity < newSize)
 794         {
 795                 capacity = newSize;
 796                 unsigned char *tmp = static_cast<unsigned char *>(realloc(ptr, capacity));
 797                 if (tmp == nullptr)
 798                         throw std::bad_alloc();
 799                 ptr = tmp;
 800         }
 801 }
 802
 803 unsigned char *convertTtoUTF8(buffer * buf, const TCHAR *src, int srcbytes/* = -1*/)
 804 {
 805         bool bSucceeded;
 806 #ifdef _UNICODE
 807         bSucceeded = convert(CP_UCS2LE,
 808                 (unsigned char *)src, (int)((srcbytes < 0) ? wcslen((const wchar_t *)src) * sizeof(wchar_t) : srcbytes),
 809                 CP_UTF8, buf);
 810 #else
 811         bSucceeded = convert(GetACP(),
 812                 (unsigned char *)src, (int)((srcbytes < 0) ? strlen((const char *)src) : srcbytes),
 813                 CP_UTF8, buf);
 814 #endif
 815         if (!bSucceeded)
 816                 *((unsigned char *)buf->ptr) = 0;
 817         return buf->ptr;
 818 }
 819
 820 unsigned char *convertTtoUTF8(const TCHAR *src, int srcbytes/* = -1*/)
 821 {
 822         buffer buf(256);
 823         convertTtoUTF8(&buf, src, srcbytes);
 824         return (unsigned char *)_strdup((const char *)buf.ptr);
 825 }
 826
 827 TCHAR *convertUTF8toT(buffer * buf, const char *src, int srcbytes/* = -1*/)
 828 {
 829         bool bSucceeded;
 830 #ifdef _UNICODE
 831         bSucceeded = convert(CP_UTF8,
 832                 (const unsigned char *)src, (int)((srcbytes < 0) ? strlen((const char *)src) : srcbytes),
 833                 CP_UCS2LE, buf);
 834 #else
 835         bSucceeded = convert(CP_UTF8,
 836                 (const unsigned char *)src, (int)((srcbytes < 0) ? strlen((const char *)src) : srcbytes),
 837                 GetACP(), buf);
 838 #endif
 839         if (!bSucceeded)
 840                 *((TCHAR *)buf->ptr) = 0;
 841         return (TCHAR *)buf->ptr;
 842 }
 843
 844 TCHAR *convertUTF8toT(const char *src, int srcbytes/* = -1*/)
 845 {
 846         buffer buf(256);
 847         convertUTF8toT(&buf, src, srcbytes);
 848         return (TCHAR *)_tcsdup((LPCTSTR)buf.ptr);
 849 }
 850
 851 void dealloc(void *ptr)
 852 {
 853         free(ptr);
 854 }
 855
 856 String toTString(const std::wstring& str)
 857 {
 858 #ifdef UNICODE
 859         return str;
 860 #else
 861         return toThreadCP(str);
 862 #endif
 863 }
 864
 865 String toTString(const std::string& str)
 866 {
 867 #ifdef UNICODE
 868         std::wstring wstr;
 869         UnicodeConverter::toUTF16(str, wstr);
 870         return wstr;
 871 #else
 872         const char *p = convertUTF8toT(str.c_str(), str.length());
 873         std::string astr = p;
 874         dealloc((void *)p);
 875         return astr;
 876 #endif
 877 }
 878
 879 void toUTF16(const String& tstr, std::wstring& wstr)
 880 {
 881 #ifdef UNICODE
 882         wstr = tstr;
 883 #else
 884         UnicodeConverter::toUTF16(tstr, wstr);
 885 #endif
 886 }
 887
 888 std::string toUTF8(const String& tstr)
 889 {
 890         std::string u8str;
 891         toUTF8(tstr, u8str);
 892         return u8str;
 893 }
 894
 895 void toUTF8(const String& tstr, std::string& u8str)
 896 {
 897 #ifdef _UNICODE
 898         u8str.clear();
 899         size_t len = tstr.length();
 900         if (len == 0)
 901                 return;
 902         u8str.resize(len * 3);
 903         char *p = &u8str[0];
 904         for (String::const_iterator it = tstr.begin(); it != tstr.end(); ++it)
 905         {
 906                 unsigned uc = *it;
 907                 if (uc >= 0xd800 && uc < 0xdc00)
 908                 {
 909                         ++it;
 910                         if (it != tstr.end())
 911                         {
 912                                 wchar_t uc2 = *it;
 913                                 uc = ((uc & 0x3ff) << 10) + (uc2 & 0x3ff) + 0x10000;
 914                         }
 915                 }
 916                 p += Ucs4_to_Utf8(uc, reinterpret_cast<unsigned char *>(p));
 917         }
 918         u8str.resize(p - &u8str[0]);
 919 #else
 920         const char *p = (const char *)convertTtoUTF8(tstr.c_str(), tstr.length());
 921         u8str = p;
 922         dealloc((void *)p);
 923 #endif
 924 }
 925
 926 bool convert(int codepage1, const unsigned char * src, int srcbytes, int codepage2, buffer * dest)
 927 {
 928         UNICODESET unicoding[2];
 929         int codepage[2] = {codepage1, codepage2};
 930
 931         int i;
 932         for (i = 0; i < 2; i++)
 933         {
 934                 switch (codepage[i])
 935                 {
 936                 case CP_UCS2LE:
 937                         unicoding[i] = UCS2LE; break;
 938                 case CP_UCS2BE:
 939                         unicoding[i] = UCS2BE; break;
 940                 case CP_UTF8:
 941                         unicoding[i] = UTF8; break;
 942                 default:
 943                         unicoding[i] = NONE; break;
 944                 }
 945         }
 946
 947         return convert(unicoding[0], codepage1, src, srcbytes, unicoding[1], codepage2, dest);
 948 }
 949
 950 /**
 951  * @brief Convert from one text encoding to another; return false if any lossing conversions
 952  */
 953 bool convert(UNICODESET unicoding1, int codepage1, const unsigned char * src, size_t srcbytes, UNICODESET unicoding2, int codepage2, buffer * dest)
 954 {
 955         if (unicoding1 == unicoding2 && (unicoding1 || EqualCodepages(codepage1, codepage2)))
 956         {
 957                 // simple byte copy
 958                 dest->resize(srcbytes + 2);
 959                 CopyMemory(dest->ptr, src, srcbytes);
 960                 dest->ptr[srcbytes] = 0;
 961                 dest->ptr[srcbytes+1] = 0;
 962                 dest->size = srcbytes;
 963                 return true;
 964         }
 965         if ((unicoding1 == UCS2LE && unicoding2 == UCS2BE)
 966                         || (unicoding1 == UCS2BE && unicoding2 == UCS2LE))
 967         {
 968                 // simple byte swap
 969                 dest->resize(srcbytes + 2);
 970                 for (size_t i = 0; i < srcbytes; i += 2)
 971                 {
 972                         // Byte-swap into destination
 973                         dest->ptr[i] = src[i+1];
 974                         dest->ptr[i+1] = src[i];
 975                 }
 976                 dest->ptr[srcbytes] = 0;
 977                 dest->ptr[srcbytes+1] = 0;
 978                 dest->size = srcbytes;
 979                 return true;
 980         }
 981         if (unicoding1 != UCS2LE && unicoding2 != UCS2LE)
 982         {
 983                 // Break problem into two simpler pieces by converting through UCS-2LE
 984                 buffer intermed(dest->capacity + 2);
 985                 bool step1 = convert(unicoding1, codepage1, src, srcbytes, UCS2LE, 0, &intermed);
 986                 bool step2 = convert(UCS2LE, 0, intermed.ptr, intermed.size, unicoding2, codepage2, dest);
 987                 return step1 && step2;
 988         }
 989         if (unicoding1 == UCS2LE)
 990         {
 991                 // From UCS-2LE to 8-bit (or UTF-8)
 992
 993                 // WideCharToMultiByte: lpDefaultChar & lpUsedDefaultChar must be `nullptr` when using UTF-8
 994
 995                 int destcp = (unicoding2 == UTF8 ? CP_UTF8 : codepage2);
 996                 if (destcp == CP_ACP || IsValidCodePage(destcp))
 997                 {
 998                         DWORD flags = 0;
 999                         int bytes = WideCharToMultiByte(destcp, flags, (LPCWSTR)src, static_cast<int>(srcbytes/2), 0, 0, nullptr, nullptr);
1000                         dest->resize(bytes + 2);
1001                         int losses = 0;
1002                         bytes = WideCharToMultiByte(destcp, flags, (LPCWSTR)src, static_cast<int>(srcbytes/2), (char *)dest->ptr, static_cast<int>(dest->capacity), nullptr, nullptr);
1003                         dest->ptr[bytes] = 0;
1004                         dest->ptr[bytes+1] = 0;
1005                         dest->size = bytes;
1006                         return losses==0;
1007                 }
1008                 else
1009                 {
1010                         size_t srcsize = srcbytes / 2;
1011                         size_t dstsize = srcbytes * 6;
1012                         dest->resize(dstsize + 2);
1013                         IExconverter *pexconv = Exconverter::getInstance();
1014                         if (pexconv != nullptr)
1015                         {
1016                                 bool result = pexconv->convertFromUnicode(destcp, (LPWSTR)src, &srcsize, (char *)dest->ptr, &dstsize);
1017                                 dest->ptr[dstsize] = 0;
1018                                 dest->ptr[dstsize+1] = 0;
1019                                 dest->size = dstsize;
1020                                 return result;
1021                         }
1022                         else
1023                                 return false;
1024                 }
1025         }
1026         else
1027         {
1028                 // From 8-bit (or UTF-8) to UCS-2LE
1029                 int srccp = (unicoding1 == UTF8 ? CP_UTF8 : codepage1);
1030                 if (srccp == CP_ACP || IsValidCodePage(srccp))
1031                 {
1032                         DWORD flags = 0;
1033                         int wchars = MultiByteToWideChar(srccp, flags, (LPCSTR)src, static_cast<int>(srcbytes), 0, 0);
1034                         dest->resize((wchars + 1) *2);
1035                         wchars = MultiByteToWideChar(srccp, flags, (LPCSTR)src, static_cast<int>(srcbytes), (LPWSTR)dest->ptr, static_cast<int>(dest->capacity/2));
1036                         dest->ptr[wchars * 2] = 0;
1037                         dest->ptr[wchars * 2 + 1] = 0;
1038                         dest->size = wchars * 2;
1039                         return true;
1040                 }
1041                 else
1042                 {
1043                         size_t srcsize = srcbytes;
1044                         size_t dstsize = srcbytes;
1045                         dest->resize((srcbytes + 1) * sizeof(wchar_t));
1046                         IExconverter *pexconv = Exconverter::getInstance();
1047                         if (pexconv != nullptr)
1048                         {
1049                                 bool result = pexconv->convertToUnicode(srccp, (LPCSTR)src, &srcsize, (LPWSTR)dest->ptr, &dstsize);
1050                                 dest->ptr[dstsize * sizeof(wchar_t)] = 0;
1051                                 dest->ptr[dstsize * sizeof(wchar_t) + 1] = 0;
1052                                 dest->size = dstsize * sizeof(wchar_t);
1053                                 return result;
1054                         }
1055                         else
1056                                 return false;
1057                 }
1058         }
1059 }
1060
1061 /**
1062  * @brief Convert from Unicode to Ansi using given codepage.
1063  * @param [in] from String to convert.
1064  * @param [in] codepage Codepage to use in conversion.
1065  * @param [out] to Ansi string.
1066  */
1067 static void convert(const std::wstring& from, unsigned codepage, std::string& to)
1068 {
1069         int len = WideCharToMultiByte(codepage, 0, from.c_str(), static_cast<int>(from.length()), 0, 0, 0, 0);
1070         if (len)
1071         {
1072                 to.resize(len);
1073                 WideCharToMultiByte(codepage, 0, from.c_str(), static_cast<int>(from.length()), &to[0], static_cast<int>(len), nullptr, nullptr);
1074         }
1075         else
1076         {
1077                 to.clear();
1078         }
1079 }
1080
/**
 * @brief Convert a string to Ansi using the system codepage.
 * Conversion to the system codepage should be used for strings containing
 * paths, as paths are handled by the system and are not file content.
 * This overload takes a narrow string and returns a copy of it without
 * doing any conversion.
 * @param [in] str String to convert.
 * @return Ansi string.
 */
std::string toSystemCP(const std::string& str)
{
        return std::string(str);
}
1093
1094 std::string toSystemCP(const std::wstring& str)
1095 {
1096         std::string to;
1097         convert(str, CP_ACP, to);
1098         return to;
1099 }
1100
/**
 * @brief Convert a string to Ansi using the thread codepage.
 * Thread codepage is practically the codepage WinMerge is using internally.
 * This overload takes a narrow string and returns a copy of it without
 * doing any conversion.
 * @param [in] str String to convert.
 * @return Ansi string.
 */
std::string toThreadCP(const std::string& str)
{
        return std::string(str);
}
1112
1113 std::string toThreadCP(const std::wstring& str)
1114 {
1115         std::string to;
1116         convert(str, CP_THREAD_ACP, to);
1117         return to;
1118 }
1119
// Algorithm originally from:
// TortoiseMerge - a Diff/Patch program
// Copyright (C) 2007 - TortoiseSVN
/**
 * @brief Check for invalid UTF-8 bytes in buffer.
 * This function checks if there are invalid UTF-8 bytes in the given buffer.
 * If such bytes are found, caller knows this buffer is not valid UTF-8 file.
 *
 * A sequence is rejected when its lead byte is 0xC0, 0xC1, 0xF5 or above,
 * or a stray continuation byte, when the buffer ends in the middle of the
 * sequence, or when one of its trailing bytes is not of the form 10xxxxxx.
 * The buffer is read byte by byte, so the check does not depend on pointer
 * alignment or on the byte order of the machine.
 *
 * @param [in] pBuffer Pointer to begin of the buffer.
 * @param [in] size Size of the buffer in bytes.
 * @return true if invalid bytes were found, or if the buffer contains no
 *  multi-byte sequence at all (pure ASCII or empty buffer); false if at
 *  least one multi-byte sequence was found and all of them passed the checks.
 */
bool CheckForInvalidUtf8(const char* pBuffer, size_t size)
{
        const unsigned char* pb = reinterpret_cast<const unsigned char*>(pBuffer);
        const unsigned char* const end = pb + size;
        bool bUTF8 = false;

        while (pb < end)
        {
                const unsigned c = *pb++;

                if (c < 0x80)
                        continue; // plain ASCII byte

                // Lead bytes that never appear in UTF-8
                if ((c >= 0xF5) || (c == 0xC0) || (c == 0xC1))
                        return true;

                // Number of continuation bytes announced by the lead byte
                size_t trail;
                if ((c & 0xE0) == 0xC0)
                        trail = 1;
                else if ((c & 0xF0) == 0xE0)
                        trail = 2;
                else if ((c & 0xF8) == 0xF0)
                        trail = 3;
                else
                        return true; // continuation byte (0x80..0xBF) without a lead byte

                // Sequence is cut off by the end of the buffer
                if (static_cast<size_t>(end - pb) < trail)
                        return true;

                for (size_t i = 0; i < trail; ++i)
                {
                        if ((pb[i] & 0xC0) != 0x80)
                                return true;
                }
                pb += trail;
                bUTF8 = true;
        }
        return !bUTF8;
}
1175
1176 /**
1177  * @brief Determine encoding from byte buffer.
1178  * @param [in] pBuffer Pointer to the begin of the buffer.
1179  * @param [in] size Size of the buffer.
1180  * @param [out] pBom Returns true if buffer had BOM bytes, false otherwise.
1181  * @return One of UNICODESET values as encoding.
1182  * EF BB BF UTF-8
1183  * FF FE UTF-16, little endian
1184  * FE FF UTF-16, big endian
1185  * FF FE 00 00 UTF-32, little endian
1186  * 00 00 FE FF UTF-32, big-endian
1187  */
1188 UNICODESET DetermineEncoding(const unsigned char *pBuffer, uint64_t size, bool * pBom)
1189 {
1190         UNICODESET unicoding = NONE;
1191         *pBom = false;
1192
1193         if (size >= 2)
1194         {
1195                 if (pBuffer[0] == 0xFF && pBuffer[1] == 0xFE)
1196                 {
1197                         unicoding = UCS2LE; //UNI little endian
1198                         *pBom = true;
1199                 }
1200                 else if (pBuffer[0] == 0xFE && pBuffer[1] == 0xFF)
1201                 {
1202                         unicoding = UCS2BE; //UNI big endian
1203                         *pBom = true;
1204                 }
1205         }
1206         if (size >= 3)
1207         {
1208                 if (pBuffer[0] == 0xEF && pBuffer[1] == 0xBB && pBuffer[2] == 0xBF)
1209                 {
1210                         unicoding = UTF8;
1211                         *pBom = true;
1212                 }
1213         }
1214         if (size >= 4)
1215         {
1216                 if (pBuffer[0] == 0xFF && pBuffer[1] == 0xFE &&
1217                                 pBuffer[2] == 0x00 && pBuffer[3] == 0x00)
1218                 {
1219                         unicoding = UCS4LE; //UTF-32, little endian
1220                         *pBom = true;
1221                 }
1222                 else if (pBuffer[0] == 0x00 && pBuffer[1] == 0x00 &&
1223                                 pBuffer[2] == 0xFE && pBuffer[3] == 0xFF)
1224                 {
1225                         unicoding = UCS4BE; //UTF-32, big endian
1226                         *pBom = true;
1227                 }
1228         }
1229
1230         return unicoding;
1231 }
1232
1233 /**
1234  * @brief Change any special codepage constants into real codepage numbers
1235  */
1236 static int NormalizeCodepage(int cp)
1237 {
1238         if (cp == CP_THREAD_ACP) // should only happen on Win2000+
1239         {
1240                 TCHAR buff[32];
1241                 if (GetLocaleInfo(GetThreadLocale(), LOCALE_IDEFAULTANSICODEPAGE, buff, sizeof(buff) / sizeof(buff[0])))
1242                         cp = _ttol(buff);
1243                 else
1244                         // a valid codepage is better than no codepage
1245                         cp = GetACP();
1246         }
1247         if (cp == CP_ACP) cp = GetACP();
1248         if (cp == CP_OEMCP) cp = GetOEMCP();
1249         return cp;
1250 }
1251
1252 /**
1253  * @brief Compare two codepages for equality
1254  */
1255 bool EqualCodepages(int cp1, int cp2)
1256 {
1257         return (cp1 == cp2)
1258                         || (NormalizeCodepage(cp1) == NormalizeCodepage(cp2));
1259 }
1260
1261 int getDefaultCodepage()
1262 {
1263         return f_nDefaultCodepage;
1264 }
1265
1266 void setDefaultCodepage(int cp)
1267 {
1268         f_nDefaultCodepage = cp;
1269 }
1270
1271 } // namespace ucr
1272