Src/markdown.cpp

   1 /* markdown.cpp: Pull-parse XML sources
   2  * Copyright (c) 2005 Jochen Tucht
   3  *
   4  * This library is free software; you can redistribute it and/or
   5  * modify it under the terms of the GNU Lesser General Public
   6  * License as published by the Free Software Foundation; either
   7  * version 2.1 of the License, or (at your option) any later version.
   8  *
   9  * This library is distributed in the hope that it will be useful,
  10  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  11  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  12  * Lesser General Public License for more details.
  13  *
  14  * You should have received a copy of the GNU Lesser General Public
  15  * License along with this library; if not, write to the Free Software
  16  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
  17  *
  18  * OS/Libs:     Win32/STL/shlwapi/iconv
  19  *                      iconv.dll is loaded on demand, and is not required as long as
  20  *                      program doesn't call iconv based methods.
  21  *
  22  * Remarks:     Pull-parsing is a very simple way to parse XML. It does not require
  23  *                      callback functions, and it does not build object trees in memory. It
  24  *                      just travels through plain source.
  25  *
  26  *                      This library reads source text from memory. It can safely operate
  27  *                      on memory mapped files, as it does not require text to be zero-
  28  *                      terminated. It will also read most of the usual meta stuff (<? ?>,
  29  *                      <!-- -->, <![ []]>, and DTD tags), but applying meta information is
  30  *                      left to the caller. Thus, the library does not exactly implement an
  31  *                      XML parser. It just helps reading XML.
  32  *
  33  *                      This library is not modeled after an existing pull parsing API,
  34  *                      so don't expect to find the same methods you've seen elsewhere.
  35  *                      In particular, this library does not follow XmlPull's event model,
  36  *                      but attempts to be somewhat closer to a tree-based API.
  37  *                      For simplicity, this library does not perform any validation, nor
  38  *                      provide error handling other than returning empty text in case it
  39  *                      fails to retrieve something.
  40  *
  41  *                      The name of the core class, CMarkdown, actually was going to be
  42  *                      CMarkup when I came across another XML tool with same name on
  43  *                      CodeProject. Like TinyXml and XMLite, and unlike CMarkdown, CMarkup
  44  *                      follows DOM-like approach, suffering from considerable memory
  45  *                      footprint. Anyway, class name CMarkdown somewhat reflects the nature
  46  *                      of pull-parsing, pulling down the leaves of an XML tree so programs
  47  *                      can reach them from a flat loop, rather than climb up the tree and
  48  *                      push the leaves to some callback function, or preprocess the entire
  49  *                      tree in some way before allowing programs to retrieve content.
  50  *
  51  *                      Recommended reading:
  52  *
  53  *                      www.sosnoski.com/articles/parsing1.html (SAX2 Basics)
  54  *                      www.sosnoski.com/articles/parsing2.html (SAX vs Pull)
  55  *                      www.sosnoski.com/articles/parsing3.html (Performance)
  56  *                      www.xml.com/pub/a/2002/08/14/xmlpull.html (XMLPULL API)
  57  *                      www.xml.com/pub/a/2002/09/25/xmlpull.html (response to above)
  58  *                      www.stylusstudio.com/xmldev/200205/post61120.html (discussion)
  59  *
  60  *                      There are lots of related articles on the web, though.
  61
  62 Please mind 2. b) of the GNU LGPL terms, and log your changes below.
  63
  64 DATE:           BY:                                     DESCRIPTION:
  65 ==========      ==================      ================================================
  66 2005-01-15      Jochen Tucht            Created
  67 2005-02-26      Jochen Tucht            Load iconv.dll through DLLPSTUB
  68 2005-03-20      Jochen Tucht            Add IgnoreCase option for ASCII-7 tag/attr names.
  69                                                                 Add HtmlUTags option to check for (potentially)
  70                                                                 unbalanced HTML tags. Html option is combination
  71                                                                 of the above. Using these options imposes
  72                                                                 performance penalty, so avoid it if you can.
  73                                                                 New flag CMarkdown::FileImage::Handle makes
  74                                                                 CMarkdown::FileImage::FileImage() accept a
  75                                                                 handle rather than a filename.
  76 2005-06-22      Jochen Tucht            New method CMarkdown::_HSTR::Entities().
  77 2005-07-29      Jochen Tucht            ByteOrder detection for 16/32 bit encodings
  78 2005-09-09      Jochen Tucht            Patch by Takashi Sawanaka fixes crash due to
  79                                                                 reading beyond end of text with HtmlUTags option
  80 2005-12-04      Jochen Tucht            Fix UTF-8 signature detection
  81                                                                 Strip bogus trailing slash in name of empty tag
  82 2008-08-27      Jochen Neubeck          Replace MFC CMap by STL std::map
  83 */
  84
  85 #include "pch.h"
  86 #include "markdown.h"
  87 #include <cstring>
  88 #include <cstdint>
  89 #include <Poco/ByteOrder.h>
  90 #include <Poco/NumberParser.h>
  91 #include <Poco/SharedMemory.h>
  92 #include "unicoder.h"
  93 #include "TFile.h"
  94
      // Fallback definitions of the word/byte packing helpers. The whole group is
      // defined only when MAKEWORD has not been defined already.
      //   MAKEWORD(a, b): 16-bit value with the low 8 bits of a as low byte and the low 8 bits of b as high byte.
      //   MAKELONG(a, b): 32-bit value with the low 16 bits of a as low word and the low 16 bits of b as high word.
      //   LOWORD(l) / HIWORD(l): bits 0..15 / bits 16..31 of l.
      //   LOBYTE(w) / HIBYTE(w): bits 0..7 / bits 8..15 of w.
   95 #ifndef MAKEWORD
   96 #define MAKEWORD(a, b)      ((unsigned short)(((unsigned char)((unsigned)(a) & 0xff)) | ((unsigned short)((unsigned char)((unsigned)(b) & 0xff))) << 8))
   97 #define MAKELONG(a, b)      ((unsigned)(((unsigned short)((unsigned)(a) & 0xffff)) | ((unsigned)((unsigned short)((unsigned)(b) & 0xffff))) << 16))
   98 #define LOWORD(l)           ((unsigned short)((unsigned)(l) & 0xffff))
   99 #define HIWORD(l)           ((unsigned short)((unsigned)(l) >> 16))
  100 #define LOBYTE(w)           ((unsigned char)((unsigned)(w) & 0xff))
  101 #define HIBYTE(w)           ((unsigned char)((unsigned)(w) >> 8))
  102 #endif
 103
 104 using Poco::ByteOrder;
 105 using Poco::NumberParser;
 106 using Poco::SharedMemory;
 107 using Poco::File;
 108
 109 void CMarkdown::Load(EntityMap &entityMap)
 110 {
 111         entityMap["amp"] = "&";
 112         entityMap["quot"] = "\"";
 113         entityMap["apos"] = "'";
 114         entityMap["lt"] = "<";
 115         entityMap["gt"] = ">";
 116 }
 117
 118 void CMarkdown::Load(EntityMap &entityMap, int dummy)
 119 {
 120         while (Move("!ENTITY"))
 121         {
 122                 std::string hstrValue;
 123                 std::string hstrKey = GetAttribute(0, &hstrValue);
 124                 if (!hstrKey.empty())
 125                 {
 126                         entityMap[hstrKey] = hstrValue;
 127                 }
 128         }
 129 }
 130
 131 std::string CMarkdown::Resolve(const EntityMap &map, const std::string& v)
 132 {
 133         std::string ret(v);
 134         char *p, *q = &ret[0];
 135         while ((p = strchr(q, '&')) != nullptr && (q = strchr(p, ';')) != nullptr)
 136         {
 137                 *q = '\0';
 138                 char *key = p + 1;
 139                 std::string value;
 140                 if (*key == '#')
 141                 {
 142                         unsigned ordinal = '?';
 143                         *key = '0';
 144                         if (NumberParser::tryParseHex(key, ordinal))
 145                                 value.assign(1, static_cast<std::string::value_type>(ordinal));
 146                         *key = '#';
 147                 }
 148                 else
 149                 {
 150                         EntityMap::const_iterator p1 = map.find(key);
 151                         if (p1 != map.end())
 152                                 value = p1->second;
 153                 }
 154                 *q = ';';
 155                 ++q;
 156                 size_t cchValue = value.length();
 157                 if (cchValue != 0)
 158                 {
 159                         size_t i = p - &ret[0];
 160                         size_t j = q - &ret[0];
 161                         size_t cchKey = q - p;
 162                         if (cchValue != cchKey)
 163                         {
 164                                 size_t b = ret.length();
 165                                 size_t cbMove = (b - j) * sizeof(char);
 166                                 if (cchKey > cchValue)
 167                                 {
 168                                         size_t cchGrow = cchKey - cchValue;
 169                                         memmove(q - cchGrow, q, cbMove);
 170                                         ret.resize(b - cchGrow);
 171                                 }
 172                                 p = &ret[0] + i;
 173                                 q = &ret[0] + j;
 174                                 if (cchValue > cchKey)
 175                                 {
 176                                         size_t cchGrow = cchValue - cchKey;
 177                                         ret.resize(b + cchGrow);
 178                                         memmove(q + cchGrow, q, cbMove);
 179                                 }
 180                         }
 181                         memcpy(p, value.c_str(), cchValue * sizeof(char));
 182                         q = p + cchValue;
 183                 }
 184         }
 185         return ret;
 186 }
 187
 188 std::string CMarkdown::Entities(const std::string& v)
 189 {
 190         std::string ret(v);
 191         char *p, *q = &ret[0];
 192         while (*(p = q))
 193         {
 194                 char *value = nullptr;
 195                 switch (*p)
 196                 {
 197                 case '&': value = "&amp;"; break;
 198                 case '"': value = "&quot;"; break;
 199                 case '\'': value = "&apos;"; break;
 200                 case '<' : value = "&lt;"; break;
 201                 case '>' : value = "&gt;"; break;
 202                 }
 203                 ++q;
 204                 if (value != nullptr)
 205                 {
 206                         size_t cchValue = strlen(value);
 207                         if (cchValue > 1)
 208                         {
 209                                 ptrdiff_t i = p - &ret[0];
 210                                 ptrdiff_t j = q - &ret[0];
 211                                 size_t b = v.length();
 212                                 ret.resize(b + cchValue - 1);
 213                                 p = &ret[0] + i;
 214                                 q = &ret[0] + j;
 215                                 memmove(q + cchValue - 1, q, (b - j) * sizeof(char));
 216                         }
 217                         memcpy(p, value, cchValue * sizeof(char));
 218                         q = p + cchValue;
 219                 }
 220         }
 221         return ret;
 222 }
 223
 224 //This is a hopefully complete list of the 36 (?) (potentially) unbalanced HTML
 225 //tags. It is based on tags.c from Tidy library,
 226 //"http://cvs.sourceforge.net/viewcvs.py/*checkout*/tidy/tidy/src/tags.c?rev=1.55".
 227 //It should include all tags from tag_defs[] array which are flagged either
 228 //CM_EMPTY (no closing tag) or CM_OPT (optional closing tag).
 229
 230 static const char htmlUTags[] =
 231 (
 232         "area\0"
 233         "base\0"
 234         "basefont\0"
 235         "body\0"
 236         "br\0"
 237         "col\0"
 238         "colgroup\0"
 239         "dd\0"
 240         "dt\0"
 241         "frame\0"
 242         "head\0"
 243         "hr\0"
 244         "html\0"
 245         "img\0"
 246         "input\0"
 247         "isindex\0"
 248         "li\0"
 249         "link\0"
 250         "meta\0"
 251         "optgroup\0"
 252         "option\0"
 253         "p\0"
 254         "param\0"
 255         "tbody\0"
 256         "td\0"
 257         "tfoot\0"
 258         "th\0"
 259         "thead\0"
 260         "tr\0"
 261         "nextid\0"
 262         /* proprietary elements */
 263         "bgsound\0"     //MICROSOFT
 264         "embed\0"       //NETSCAPE
 265         "keygen\0"      //NETSCAPE
 266         "marquee\0"     //MICROSOFT
 267         "spacer\0"      //NETSCAPE
 268         "wbr\0"         //PROPRIETARY
 269 );
 270
 271 CMarkdown::CMarkdown(const char *upper, const char *ahead, unsigned flags):
 272 first(nullptr), lower(nullptr), upper(upper), ahead(ahead),
 273 memcmp(flags & IgnoreCase ? ::_memicmp : ::memcmp),
 274 utags(flags & HtmlUTags ? htmlUTags : nullptr)
 275 {
 276         if (CMarkdown::ahead > CMarkdown::upper)
 277         {
 278                 --CMarkdown::ahead;
 279         }
 280 }
 281
 282 CMarkdown::operator bool()
 283 {
 284         return upper < ahead &&
 285         (
 286                 MAKEWORD(upper[0], upper[1]) != MAKEWORD('<', '/')
 287         &&      MAKEWORD(upper[0], upper[1]) != MAKEWORD(']', '>')
 288         );
 289 }
 290
 291 size_t CMarkdown::FindTag(const char *tags, const char *markup) const
 292 {
 293         while (ptrdiff_t len = strlen(tags))
 294         {
 295                 unsigned char c;
 296                 if
 297                 (
 298                         (ahead - markup) > len
 299                 &&      memcmp(markup, tags, len) == 0
 300                 &&      (isspace(c = markup[len]) || c == '[' || c == '>' || c == '"' || c == '\'' || c == '=')
 301                 )
 302                 {
 303                         return len;
 304                 }
 305                 tags += len + 1;
 306         }
 307         return 0;
 308 }
 309
      // Advance `upper` from the start of the current item to the point where
      // that item ends, by counting nesting depth character by character.
      // Nothing happens unless `upper` still equals `first` and *this is true.
      //
      // `upper` is post-incremented in the switch, so inside the cases upper[-1]
      // is the character being handled and upper[-2] the one before it.
      // For an element with a closing tag, depth returns to zero at the '/' of
      // that closing tag, so the loop leaves `upper` right behind its "</".
  310 void CMarkdown::Scan()
  311 {
  312         if (first == upper && *this)
  313         {
  314                 int depth = 0;
  315                 do
  316                 {
  317                         switch (*upper++)
  318                         {
  319                         case '/':
      // "</": take back the increment made for this '<' and leave one level.
  320                                 if (upper[-2] == '<')
  321                                         depth -= 2;
  322                                 break;
  323                         case '?':
      // "<?": skip up to and including "?>" (or to the end of the text), then
      // take back the increment made for the '<'.
  324                                 if (upper[-2] == '<')
  325                                 {
  326                                         do
  327                                         {
  328                                         } while (upper <= ahead && (*upper++ != '>' || upper[-2] != '?'));
  329                                         --depth;
  330                                 }
  331                                 break;
  332                         case '!':
  333                                 if (upper[-2] == '<' && upper <= ahead)
  334                                 {
  335                                         if (*upper == '-')
  336                                         {
      // "<!-": skip up to and including "-->".
  337                                                 do
  338                                                 {
  339                                                 } while (upper <= ahead && (*upper++ != '>' || upper[-2] != '-' || upper[-3] != '-'));
  340                                                 --depth;
  341                                         }
  342                                         else if (*upper == '[')
  343                                         {
      // "<![": skip up to and including "]]>".
  344                                                 do
  345                                                 {
  346                                                 } while (upper <= ahead && (*upper++ != '>' || upper[-2] != ']' || upper[-3] != ']'));
  347                                                 --depth;
  348                                         }
  349                                         else
  350                                         {
      // Any other "<!" tag: walk on until the brackets balance again. `quoting`
      // holds 2 while inside double quotes and 1 while inside single quotes;
      // brackets inside quotes are not counted.
  351                                                 int quoting = 0;
  352                                                 do
  353                                                 {
  354                                                         switch (*upper)
  355                                                         {
  356                                                         case '"':
  357                                                                 if (!(quoting & 1))
  358                                                                         quoting ^= 2;
  359                                                                 break;
  360                                                         case '\'':
  361                                                                 if (!(quoting & 2))
  362                                                                         quoting ^= 1;
  363                                                                 break;
  364                                                         case '<':
  365                                                         case '[':
  366                                                                 if (!quoting)
  367                                                                         ++depth;
  368                                                                 break;
  369                                                         case ']':
  370                                                         case '>':
  371                                                                 if (!quoting)
  372                                                                         --depth;
  373                                                                 break;
  374                                                         }
  375                                                 } while (++upper <= ahead && depth);
  376                                         }
  377                                 }
  378                                 break;
  379                         case '>':
      // A tag ending in "/>" closes itself. So does any tag when `utags` is set
      // and the name at first + 1 is listed in that table.
  380                                 if (upper[-2] == '/' || utags && FindTag(utags, first + 1))
  381                                         --depth;
  382                                 break;
  383                         case '<':
  384                                 ++depth;
  385                                 break;
  386                         }
  387                 } while (upper <= ahead && depth);
  388         }
  389 }
 390
 391 CMarkdown &CMarkdown::Move()
 392 {
 393         Scan();
 394         for (;;)
 395         {
 396                 while (*this && *upper != '<')
 397                 {
 398                         ++upper;
 399                 }
 400                 if (utags != nullptr && upper < ahead && *upper == '<')
 401                 {
 402                         size_t utlen = FindTag(utags, upper + 2);
 403                         if (utlen != 0)
 404                         {
 405                                 upper += 2 + utlen;
 406                                 continue;
 407                         }
 408                 }
 409                 break;
 410         }
 411         first = lower = upper;
 412         return *this;
 413 }
 414
 415 CMarkdown &CMarkdown::Move(const char *name)
 416 {
 417         while (Move())
 418         {
 419                 const char *q = lower;
 420                 const char *p = q + 1;
 421                 unsigned char c;
 422                 do
 423                 {
 424                         ++q;
 425                 } while (q <= ahead && !isspace(c = *q) && c != '[' && c != '>' && c != '"' && c != '\'' && c != '=');
 426                 size_t length = q - p;
 427                 if (memcmp(p, name, length) == 0 && name[length] == '\0')
 428                 {
 429                         break;
 430                 }
 431         }
 432         return *this;
 433 }
 434
 435 bool CMarkdown::Pull()
 436 {
 437         if (lower < ahead && (*lower != '<' || ++lower < ahead))
 438         {
 439                 if (first[1] == '!')
 440                 {
 441                         if (first[2] != '[' && first[2] != '-')
 442                         {
 443                                 // neither CDATA nor comment: assume DTD tag
 444                                 unsigned quoting = 0;
 445                                 while (lower < ahead && (quoting || *lower != '[' && *lower != '>'))
 446                                 {
 447                                         switch (*lower)
 448                                         {
 449                                         case '"':
 450                                                 if (!(quoting & 1))
 451                                                         quoting ^= 2;
 452                                                 break;
 453                                         case '\'':
 454                                                 if (!(quoting & 2))
 455                                                         quoting ^= 1;
 456                                                 break;
 457                                         }
 458                                         ++lower;
 459                                 }
 460                                 if (*lower == '[')
 461                                 {
 462                                         upper = lower;
 463                                         return true;
 464                                 }
 465                         }
 466                         return false;
 467                 }
 468                 while (lower < ahead && *lower != '>')
 469                 {
 470                         ++lower;
 471                 }
 472                 if (lower[-1] != '/' && lower[-1] != '?' && !(utags && FindTag(utags, first + 1)))
 473                 {
 474                         upper = lower;
 475                         return true;
 476                 }
 477         }
 478         return false;
 479 }
 480
 481 CMarkdown &CMarkdown::Pop()
 482 {
 483         if (!Pull())
 484         {
 485                 upper = ahead;
 486         }
 487         return *this;
 488 }
 489
 490 bool CMarkdown::Push()
 491 {
 492         if (upper < ahead)
 493         {
 494                 switch MAKEWORD(upper[0], upper[1])
 495                 {
 496                 case MAKEWORD('<', '/'):
 497                 case MAKEWORD(']', '>'):
 498                         upper += 2;
 499                         return true;
 500                 }
 501         }
 502         return false;
 503 }
 504
 505 std::string CMarkdown::GetTagName() const
 506 {
 507         const char *p = first;
 508         const char *q = first;
 509         if (q < ahead && (p = ++q) < ahead)
 510         {
 511                 if (*q == '!' && (*++q == '-' || *q == '['))
 512                 {
 513                         ++q;
 514                 }
 515                 else
 516                 {
 517                         unsigned char c;
 518                         while (q < ahead && !isspace(c = *q) && c != '[' && c != '>' && c != '"' && c != '\'' && c != '=' && c != '/')
 519                         {
 520                                 ++q;
 521                         }
 522                 }
 523         }
 524         return std::string(p, q - p);
 525 }
 526
 527 std::string CMarkdown::GetTagText() const
 528 {
 529         const char *p = first, *q = first;
 530         if (q < ahead && (p = ++q) < ahead && (*q != '!' || ++q < ahead))
 531         {
 532                 if (*q == '-' || *q == '[')
 533                 {
 534                         ++q;
 535                 }
 536                 else
 537                 {
 538                         unsigned quoting = 0;
 539                         while (q < ahead && (quoting || (*q != '[' && *q != '<' && *q != '>' && *q != '/')))
 540                         {
 541                                 switch (*q)
 542                                 {
 543                                 case '"':
 544                                         if (!(quoting & 1))
 545                                                 quoting ^= 2;
 546                                         break;
 547                                 case '\'':
 548                                         if (!(quoting & 2))
 549                                                 quoting ^= 1;
 550                                         break;
 551                                 }
 552                                 ++q;
 553                         }
 554                 }
 555         }
 556         return std::string(p, q - p);
 557 }
 558
 559 std::string CMarkdown::GetInnerText()
 560 {
 561         Scan();
 562         const char *p = first;
 563         const char *q = upper;
 564         char bracket = '>';
 565         if (p < upper && ++p < upper && *p == '!' && ++p < upper)
 566         {
 567                 bracket = *p;
 568                 if (bracket != '-')
 569                 {
 570                         bracket = '[';
 571                 }
 572         }
 573         p = lower;
 574         unsigned quoting = 0;
 575         while (p < upper && (quoting || *p != bracket))
 576         {
 577                 switch (*p)
 578                 {
 579                 case '"':
 580                         if (!(quoting & 1))
 581                                 quoting ^= 2;
 582                         break;
 583                 case '\'':
 584                         if (!(quoting & 2))
 585                                 quoting ^= 1;
 586                         break;
 587                 }
 588                 ++p;
 589         }
 590         if (p < q && p < --q && p < --q)
 591         {
 592                 ++p;
 593         }
 594         return std::string(p, q - p);
 595 }
 596
 597 std::string CMarkdown::GetOuterText()
 598 {
 599         Scan();
 600         const char *q = upper;
 601         if (q > first)
 602         {
 603                 while (q[-1] != '>' && q <= ahead)
 604                 {
 605                         ++q;
 606                 }
 607         }
 608         return std::string(lower, q - first);
 609 }
 610
 611 class CMarkdown::Token
 612 {
 613 public:
 614         const char *lower;
 615         const char *upper;
 616         int IsSpecial(const char *, const char *);
 617 };
 618
 619 int CMarkdown::Token::IsSpecial(const char *p, const char *ahead)
 620 {
 621         while (p <= ahead && isspace((unsigned char)*p))
 622         {
 623                 ++p;
 624         }
 625         lower = p;
 626         int special = 1;
 627         while (p <= ahead && !isspace((unsigned char)*p))
 628         {
 629                 switch (char c = *p)
 630                 {
 631                 case '"':
 632                 case '\'':
 633                         if (special && p < ahead)
 634                         {
 635                                 do
 636                                 {
 637                                         ++p;
 638                                 } while (p < ahead && *p != c);
 639                         }
 640                         [[fallthrough]];
 641                 case '/':
 642                 case '=':
 643                 case '<':
 644                 case '>':
 645                 case '[':
 646                 case ']':
 647                         upper = p + special;
 648                         return special;
 649                 }
 650                 ++p;
 651                 special = 0;
 652         }
 653         upper = p;
 654         return special;
 655 }
 656
 657 std::string CMarkdown::GetAttribute(const char *key, std::string *pv)
 658 {
 659         const char *name = 0;
 660         size_t cname = 0;
 661         const char *value = 0;
 662         size_t cvalue = 0;
 663         bool equals = false;
 664         const char *p = lower;
 665         Token token;
 666         do
 667         {
 668                 if (token.IsSpecial(p, ahead))
 669                 {
 670                         switch (*token.lower)
 671                         {
 672                         case '=':
 673                                 equals = true;
 674                                 break;
 675                         case '"':
 676                         case '\'':
 677                                 equals = false;
 678                                 cvalue = token.upper - (value = token.lower);
 679                                 if (cvalue >= 2)
 680                                 {
 681                                         ++value;
 682                                         cvalue -= 2;
 683                                 }
 684                                 break;
 685                         case '[':
 686                         case '>':
 687                                 token.upper = token.lower;
 688                                 break;
 689                         }
 690                 }
 691                 else if (token.upper != token.lower)
 692                 {
 693                         if (equals)
 694                         {
 695                                 equals = false;
 696                                 cvalue = token.upper - (value = token.lower);
 697                         }
 698                         else
 699                         {
 700                                 cname = token.upper - (name = token.lower);
 701                         }
 702                 }
 703                 p = token.upper;
 704                 if (name && value)
 705                 {
 706                         if (key == nullptr)
 707                         {
 708                                 lower = p;
 709                                 *pv = std::string(value, cvalue);
 710                                 return std::string(name, cname);
 711                         }
 712                         if (memcmp(name, key, cname) == 0 && key[cname] == '\0')
 713                         {
 714                                 return std::string(value, cvalue);
 715                         }
 716                         name = value = 0;
 717                 }
 718         } while (token.upper != token.lower);
 719         if (key == nullptr)
 720         {
 721                 lower = p;
 722                 return "";
 723         }
 724         return pv ? *pv : "";
 725 }
 726
 727 int CMarkdown::FileImage::GuessByteOrder(unsigned dwBOM)
 728 {
 729         int nByteOrder = 0;
 730         if (dwBOM)
 731         {
 732                 unsigned short wBOM = LOWORD(dwBOM);
 733                 unsigned short wBOMhigh = HIWORD(dwBOM);
 734                 nByteOrder = 2;
 735                 if (wBOM == 0 || wBOMhigh == 0)
 736                 {
 737                         wBOM |= wBOMhigh;
 738                         nByteOrder = 4;
 739                 }
 740                 if (wBOM == 0xFEFF || wBOM == 0xFFFE)
 741                 {
 742                         nByteOrder += 8 + static_cast<int>((char *)memchr(&dwBOM, 0xFF, 4) - (char *)&dwBOM);
 743                 }
 744                 else if (LOBYTE(wBOM) == 0 || HIBYTE(wBOM) == 0)
 745                 {
 746                         unsigned char cBOM = LOBYTE(wBOM) | HIBYTE(wBOM);
 747                         nByteOrder += static_cast<int>((char *)memchr(&dwBOM, cBOM, 4) - (char *)&dwBOM);
 748                 }
 749                 else if ((dwBOM & 0xFFFFFF) == 0xBFBBEF)
 750                 {
 751                         nByteOrder = 8 + 1;
 752                 }
 753                 else
 754                 {
 755                         nByteOrder = 1;
 756                 }
 757         }
 758         return nByteOrder;
 759 }
 760
 761 CMarkdown::FileImage::FileImage(const TCHAR *path, size_t trunc, unsigned flags)
 762 : pImage(nullptr), cbImage(0), nByteOrder(0), m_pSharedMemory(nullptr), pCopy(nullptr)
 763 {
 764         if (flags & Mapping)
 765         {
 766                 pImage = (void *)(path);
 767                 cbImage = trunc;
 768         }
 769         else if (path != nullptr)
 770         {
 771                 try
 772                 {
 773                         TFile file(path);
 774                         m_pSharedMemory = new SharedMemory(file, SharedMemory::AM_READ);
 775                         pImage = m_pSharedMemory->begin();
 776                         cbImage = m_pSharedMemory->end() - m_pSharedMemory->begin();
 777                 }
 778                 catch (...)
 779                 {
 780                 }
 781         }
 782         if (pImage == nullptr)
 783         {
 784                 cbImage = 0;
 785         }
 786         else if (cbImage >= 4 && (flags & Octets & (nByteOrder = GuessByteOrder(*(unsigned *)pImage))))
 787         {
 788                 switch (nByteOrder)
 789                 {
 790                 case 2 + 1:
 791                 case 2 + 1 + 8:
 792                         // big endian: swab first
 793                         cbImage &= ~1UL;
 794                         pCopy = new unsigned char[cbImage];
 795                         if (pCopy != nullptr)
 796                         {
 797                                 for (size_t i = 0; i < cbImage / 2; ++i)
 798                                         *((uint16_t *)pCopy + i) = Poco::ByteOrder::flipBytes(*((uint16_t *)pImage + i));
 799                         }
 800
 801                         delete m_pSharedMemory;
 802                         pImage = pCopy;
 803                         if (pImage != nullptr)
 804                         {
 805                                 [[fallthrough]];
 806                         case 2 + 0:
 807                         case 2 + 0 + 8:
 808                                 // little endian
 809                                 size_t cchImage = cbImage / 2;
 810                                 uint16_t *pchImage = (uint16_t *)pImage;
 811                                 if (nByteOrder & 8)
 812                                 {
 813                                         ++pchImage;
 814                                         --cchImage;
 815                                 }
 816                                 cbImage = ucr::Utf8len_of_string(pchImage, cchImage);
 817                                 pCopy = new unsigned char[cbImage];
 818                                 if (pCopy != nullptr)
 819                                 {
 820                                         uint16_t *pu16;
 821                                         unsigned char *pu8;
 822                                         for (pu16 = (uint16_t *)pchImage, pu8 = (unsigned char *)pCopy; pu16 < pchImage + cchImage; ++pu16)
 823                                                 pu8 += ucr::Ucs4_to_Utf8(*pu16, pu8);
 824                                 }
 825                                 delete m_pSharedMemory;
 826                                 m_pSharedMemory = nullptr;
 827                                 pImage = pCopy;
 828                         }
 829                         break;
 830                 case 4 + 1:
 831                 case 4 + 1 + 8:
 832                 case 4 + 2:
 833                 case 4 + 2 + 8:
 834                         // odd word endianness: swab first
 835                         cbImage &= ~3UL;
 836                         pCopy = new unsigned char[cbImage];
 837                         if (pCopy != nullptr)
 838                         {
 839                                 for (size_t i = 0; i < cbImage / 2; ++i)
 840                                         *((uint16_t *)pCopy + i) = Poco::ByteOrder::flipBytes(*((uint16_t *)pImage + i));
 841                         }
 842                         delete m_pSharedMemory;
 843                         m_pSharedMemory = nullptr;
 844                         pImage = pCopy;
 845                         if (pImage != nullptr)
 846                         {
 847                                 [[fallthrough]];
 848                         case 4 + 0:
 849                         case 4 + 0 + 8:
 850                         case 4 + 3:
 851                         case 4 + 3 + 8:
 852                                 size_t cchImage = cbImage;
 853                                 char *pchImage = (char *)pImage;
 854                                 if (nByteOrder & 8)
 855                                 {
 856                                         pchImage += 4;
 857                                         cchImage -= 4;
 858                                 }
 859                                 unsigned uch;
 860                                 cbImage = 0;
 861                                 for (size_t i = 0; i < cchImage; i += 4)
 862                                 {
 863                                         memcpy(&uch, pchImage + i, 4);
 864                                         if (nByteOrder & 2)
 865                                                 uch = ByteOrder::fromBigEndian(uch);
 866                                         else
 867                                                 uch = ByteOrder::fromLittleEndian(uch);
 868                                         cbImage += ucr::Utf8len_fromCodepoint(uch);
 869                                 }
 870                                 void *pCopy2 = new unsigned char[cbImage];
 871                                 if (pCopy2 != nullptr)
 872                                 {
 873                                         cbImage = 0;
 874                                         for (size_t i = 0; i < cchImage; i += 4)
 875                                         {
 876                                                 memcpy(&uch, pchImage + i, 4);
 877                                                 if (nByteOrder & 2)
 878                                                         uch = ByteOrder::fromBigEndian(uch);
 879                                                 else
 880                                                         uch = ByteOrder::fromLittleEndian(uch);
 881                                                 cbImage += ucr::Ucs4_to_Utf8(uch, (unsigned char *)pCopy2 + cbImage);
 882                                         }
 883                                 }
 884                                 delete m_pSharedMemory;
 885                                 m_pSharedMemory = nullptr;
 886                                 pImage = pCopy2;
 887                                 delete [] pCopy;
 888                                 pCopy = pCopy2;
 889                         }
 890                         break;
 891                 }
 892         }
 893 }
 894
 895 CMarkdown::FileImage::~FileImage()
 896 {
 897         delete m_pSharedMemory;
 898         delete [] pCopy;
 899 }