1 /* markdown.cpp: Pull-parse XML sources
2 * Copyright (c) 2005 Jochen Tucht
4 * This library is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU Lesser General Public
6 * License as published by the Free Software Foundation; either
7 * version 2.1 of the License, or (at your option) any later version.
9 * This library is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12 * Lesser General Public License for more details.
14 * You should have received a copy of the GNU Lesser General Public
15 * License along with this library; if not, write to the Free Software
16 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
18 * OS/Libs: Win32/STL/shlwapi/iconv
19 * iconv.dll is loaded on demand, and is not required as long as
20 * program doesn't call iconv based methods.
22 * Remarks: Pull-parsing is a very simple way to parse XML. It does not require
23 * callback functions, and it does not build object trees in memory. It
24 * just travels through plain source.
26 * This library reads source text from memory. It can safely operate
27 * on memory mapped files, as it does not require text to be zero-
28 * terminated. It will also read most of the usual meta stuff (<? ?>,
29 * <!-- -->, <![ []]>, and DTD tags), but applying meta information is
30 * left to the caller. Thus, the library does not exactly implement an
31 * XML parser. It just helps reading XML.
33 * This library is not modeled after an existing pull parsing API,
34 * so don't expect to find the same methods you've seen elsewhere.
35 * In particular, this library does not follow XmlPull's event model,
36 * but attempts to be somewhat closer to a tree-based API.
37 * For simplicity, this library does not perform any validation, nor
38 * provide error handling other than returning empty text in case it
39 * fails to retrieve something.
41 * The name of the core class, CMarkdown, actually was going to be
42 * CMarkup when I came across another XML tool with same name on
43 * CodeProject. Like TinyXml and XMLite, and unlike CMarkdown, CMarkup
44 * follows DOM-like approach, suffering from considerable memory
45 * footprint. Anyway, class name CMarkdown somewhat reflects the nature
46 * of pull-parsing, pulling down the leaves of an XML tree so programs
47 * can reach them from a flat loop, rather than climb up the tree and
48 * push the leaves to some callback function, or preprocess the entire
49 * tree in some way before allowing programs to retrieve content.
51 * Recommended reading:
53 * www.sosnoski.com/articles/parsing1.html (SAX2 Basics)
54 * www.sosnoski.com/articles/parsing2.html (SAX vs Pull)
55 * www.sosnoski.com/articles/parsing3.html (Performance)
56 * www.xml.com/pub/a/2002/08/14/xmlpull.html (XMLPULL API)
57 * www.xml.com/pub/a/2002/09/25/xmlpull.html (response to above)
58 * www.stylusstudio.com/xmldev/200205/post61120.html (discussion)
60 * There are lots of related articles on the web, though.
62 Please mind 2. b) of the GNU LGPL terms, and log your changes below.
64 DATE: BY: DESCRIPTION:
65 ========== ================== ================================================
66 2005-01-15 Jochen Tucht Created
67 2005-02-26 Jochen Tucht Load iconv.dll through DLLPSTUB
68 2005-03-20 Jochen Tucht Add IgnoreCase option for ASCII-7 tag/attr names.
69 Add HtmlUTags option to check for (potentially)
70 unbalanced HTML tags. Html option is combination
71 of the above. Using these options imposes
72 performance penalty, so avoid it if you can.
73 New flag CMarkdown::FileImage::Handle makes
74 CMarkdown::FileImage::FileImage() accept a
75 handle rather than a filename.
76 2005-06-22 Jochen Tucht New method CMarkdown::_HSTR::Entities().
77 2005-07-29 Jochen Tucht ByteOrder detection for 16/32 bit encodings
78 2005-09-09 Jochen Tucht Patch by Takashi Sawanaka fixes crash due to
79 reading beyond end of text with HtmlUTags option
80 2005-12-04 Jochen Tucht Fix UTF-8 signature detection
81 Strip bogus trailing slash in name of empty tag
82 2008-08-27 Jochen Neubeck Replace MFC CMap by STL std::map
89 #include <Poco/ByteOrder.h>
90 #include <Poco/NumberParser.h>
91 #include <Poco/SharedMemory.h>
// Word/byte packing helpers that carry the names of the familiar Win32 macros.
// NOTE(review): presumably fallbacks for builds without the Windows headers; any
// guard around them is not visible in this listing - confirm.
// Every expansion is fully parenthesized; Push() depends on that when it writes
// a switch directly on MAKEWORD(...) without parentheses of its own.
// MAKEWORD(a, b): 16-bit value, low byte of a in bits 0-7, low byte of b in bits 8-15.
96 #define MAKEWORD(a, b) ((unsigned short)(((unsigned char)((unsigned)(a) & 0xff)) | ((unsigned short)((unsigned char)((unsigned)(b) & 0xff))) << 8))
// MAKELONG(a, b): 32-bit value, low 16 bits of a in the low half, low 16 bits of b in the high half.
97 #define MAKELONG(a, b) ((unsigned)(((unsigned short)((unsigned)(a) & 0xffff)) | ((unsigned)((unsigned short)((unsigned)(b) & 0xffff))) << 16))
// LOWORD(l) / HIWORD(l): low and high 16 bits of l.
98 #define LOWORD(l) ((unsigned short)((unsigned)(l) & 0xffff))
99 #define HIWORD(l) ((unsigned short)((unsigned)(l) >> 16))
// LOBYTE(w) / HIBYTE(w): bits 0-7 of w, and w shifted right by 8 then truncated to a byte.
100 #define LOBYTE(w) ((unsigned char)((unsigned)(w) & 0xff))
101 #define HIBYTE(w) ((unsigned char)((unsigned)(w) >> 8))
104 using Poco::ByteOrder;
105 using Poco::NumberParser;
106 using Poco::SharedMemory;
// Seeds entityMap with the five predefined XML entity names (amp, quot, apos,
// lt, gt), each mapped to the single character it stands for.
// Assignment goes through operator[], so an entry already stored under one of
// these keys is replaced (assuming EntityMap behaves like std::map).
109 void CMarkdown::Load(EntityMap &entityMap)
111 entityMap["amp"] = "&";
112 entityMap["quot"] = "\"";
113 entityMap["apos"] = "'";
114 entityMap["lt"] = "<";
115 entityMap["gt"] = ">";
// Collects entity declarations from the markup being parsed: for as long as
// Move("!ENTITY") succeeds, GetAttribute(0, &value) is asked for a name/value
// pair and, when the returned name is non-empty, entityMap[name] is set to value.
// `dummy` is not referenced in the visible lines; presumably it exists only to
// distinguish this overload from Load(EntityMap &) - confirm.
118 void CMarkdown::Load(EntityMap &entityMap, int dummy)
120 while (Move("!ENTITY"))
122 std::string hstrValue;
// A null key makes GetAttribute hand back the attribute name and store its value in hstrValue.
123 std::string hstrKey = GetAttribute(0, &hstrValue);
124 if (!hstrKey.empty())
126 entityMap[hstrKey] = hstrValue;
// Expands entity references of the form &key; inside a working string `ret`
// (declared on a line not shown; presumably a copy of v that is returned).
//   map - entity name to replacement text, consulted via map.find(key)
//   v   - input text
// Each reference is located with strchr (first the ampersand, then the following
// semicolon). The replacement is either a single character built from a parsed
// hexadecimal ordinal or the text found in `map`; it is then spliced in place,
// shifting the tail of `ret` when key and value lengths differ.
// NOTE(review): only NumberParser::tryParseHex is visible here; how the key is
// extracted and whether decimal references are handled is on lines not shown.
131 std::string CMarkdown::Resolve(const EntityMap &map, const std::string& v)
134 char *p, *q = &ret[0];
135 while ((p = strchr(q, '&')) != nullptr && (q = strchr(p, ';')) != nullptr)
// Default ordinal is the question mark character.
142 unsigned ordinal = '?';
144 if (NumberParser::tryParseHex(key, ordinal))
// NOTE(review): the ordinal is narrowed to one char, so values that do not fit
// in a char lose information instead of being encoded as multi-byte text.
145 value.assign(1, static_cast<std::string::value_type>(ordinal));
150 EntityMap::const_iterator p1 = map.find(key);
156 size_t cchValue = value.length();
// Offsets of p and q inside ret, kept so the pointers can be rebuilt after ret changes size.
159 size_t i = p - &ret[0];
160 size_t j = q - &ret[0];
161 size_t cchKey = q - p;
162 if (cchValue != cchKey)
164 size_t b = ret.length();
// Number of bytes from q to the end of ret that have to be shifted.
165 size_t cbMove = (b - j) * sizeof(char);
166 if (cchKey > cchValue)
// Replacement is shorter: pull the tail left first, then shrink.
168 size_t cchGrow = cchKey - cchValue;
169 memmove(q - cchGrow, q, cbMove);
170 ret.resize(b - cchGrow);
174 if (cchValue > cchKey)
// Replacement is longer: grow first, then push the tail right.
// NOTE(review): resize() may reallocate the buffer, yet q is used by the memmove
// immediately afterwards (and p by the memcpy below). Confirm both pointers are
// re-derived from i/j AFTER this resize, not before it; otherwise they dangle.
176 size_t cchGrow = cchValue - cchKey;
177 ret.resize(b + cchGrow);
178 memmove(q + cchGrow, q, cbMove);
181 memcpy(p, value.c_str(), cchValue * sizeof(char));
// Escapes markup-significant characters: walks a working string `ret` (declared
// on a line not shown; presumably a copy of v that is returned) and, for each
// ampersand, double quote, apostrophe, less-than or greater-than sign, copies
// the replacement chosen in the switch over that character. When the
// replacement is longer than one character the string is grown and its tail
// shifted right with memmove to make room first.
// NOTE(review): the replacement literals shown in the switch look as if their
// entity escaping was decoded on the way into this listing (the double-quote
// case is not even a well-formed literal). Presumably the real file holds the
// named entities that Load() maps back - verify against the real source.
// NOTE(review): `value` is a non-const char pointer assigned from string
// literals, which standard C++ no longer allows; it should be const char *.
// NOTE(review): the grow path takes b from v.length(), the ORIGINAL input
// length, not from the current length of ret. Once an earlier replacement has
// lengthened ret, resizing to b + cchValue - 1 is too small and the memmove
// length (b - j) no longer matches the real tail - confirm and consider
// ret.length() instead.
// i and j are the offsets of p and q inside ret, saved across the resize.
188 std::string CMarkdown::Entities(const std::string& v)
191 char *p, *q = &ret[0];
194 char *value = nullptr;
197 case '&': value = "&"; break;
198 case '"': value = """; break;
199 case '\'': value = "'"; break;
200 case '<' : value = "<"; break;
201 case '>' : value = ">"; break;
204 if (value != nullptr)
206 size_t cchValue = strlen(value);
209 ptrdiff_t i = p - &ret[0];
210 ptrdiff_t j = q - &ret[0];
211 size_t b = v.length();
212 ret.resize(b + cchValue - 1);
215 memmove(q + cchValue - 1, q, (b - j) * sizeof(char));
217 memcpy(p, value, cchValue * sizeof(char));
224 //This is a hopefully complete list of the 36 (?) (potentially) unbalanced HTML
225 //tags. It is based on tags.c from Tidy library,
226 //"http://cvs.sourceforge.net/viewcvs.py/*checkout*/tidy/tidy/src/tags.c?rev=1.55".
227 //It should include all tags from tag_defs[] array which are flagged either
228 //CM_EMPTY (no closing tag) or CM_OPT (optional closing tag).
//Layout: one char array holding the tag names back to back, each name followed
//by an explicit \0. FindTag() measures each entry with strlen(), so an empty
//entry ends the list. Only the proprietary tail of the table is visible in this
//listing. The constructor installs the table as utags when HtmlUTags is set.
230 static const char htmlUTags[] =
262 /* proprietary elements */
263 "bgsound\0" //MICROSOFT
265 "keygen\0" //NETSCAPE
266 "marquee\0" //MICROSOFT
267 "spacer\0" //NETSCAPE
268 "wbr\0" //PROPRIETARY
// Sets up the parser state over a text range.
//   upper - stored in the member of the same name (presumably the current read
//           position - confirm)
//   ahead - stored in the member of the same name; the other methods use it as
//           the limit that scanning must not pass
//   flags - IgnoreCase selects ::_memicmp instead of ::memcmp as the name
//           comparator; HtmlUTags installs the htmlUTags table, otherwise utags
//           stays null
// first and lower start out null.
271 CMarkdown::CMarkdown(const char *upper, const char *ahead, unsigned flags):
272 first(nullptr), lower(nullptr), upper(upper), ahead(ahead),
273 memcmp(flags & IgnoreCase ? ::_memicmp : ::memcmp),
274 utags(flags & HtmlUTags ? htmlUTags : nullptr)
// The names are class-qualified because the parameters shadow the members.
// What happens when the range is non-empty is on lines not shown here.
276 if (CMarkdown::ahead > CMarkdown::upper)
// Truth test used by the scanning loops. Visible conditions: upper must lie
// before ahead, and the two characters at upper must be neither a closing-tag
// opener (less-than followed by slash) nor a bracket/greater-than pair.
// A line between the first and second condition is not shown in this listing.
// NOTE(review): upper[1] is read whenever upper < ahead; with upper == ahead - 1
// that touches the byte at ahead - confirm it is readable.
282 CMarkdown::operator bool()
284 return upper < ahead &&
286 MAKEWORD(upper[0], upper[1]) != MAKEWORD('<', '/')
287 && MAKEWORD(upper[0], upper[1]) != MAKEWORD(']', '>')
// Checks whether the name starting at `markup` is one of the entries in `tags`,
// a list of NUL-terminated names that ends at the first empty entry (strlen
// returning 0 stops the loop).
// An entry matches when
//   - more than len characters remain between markup and ahead,
//   - the first len characters compare equal under the configured comparator
//     (the memcmp member chosen in the constructor), and
//   - the character right after the name is whitespace or one of the
//     delimiters tested on the last visible line.
// The return statements are on lines not shown; callers use the result both as
// a found/not-found flag (Scan, Pull) and as a length (Move).
// NOTE(review): the declaration of c is not visible. If it is a plain char,
// passing it to isspace() is undefined for negative values; Token::IsSpecial
// casts to unsigned char first - confirm and align.
291 size_t CMarkdown::FindTag(const char *tags, const char *markup) const
293 while (ptrdiff_t len = strlen(tags))
298 (ahead - markup) > len
299 && memcmp(markup, tags, len) == 0
300 && (isspace(c = markup[len]) || c == '[' || c == '>' || c == '"' || c == '\'' || c == '=')
// Moves `upper` forward over the node that starts at `first`. It only does work
// when nothing has been scanned yet (first == upper) and operator bool holds.
// The branch selection (processing instruction, comment, marked section,
// ordinary element) is largely on lines not shown; the visible loops differ in
// the terminator they wait for, as annotated below.
// NOTE(review): several loops test upper <= ahead and then dereference *upper,
// so the byte AT ahead is read. Whether that is in range depends on what ahead
// denotes (last byte vs. one past the end) - confirm.
310 void CMarkdown::Scan()
312 if (first == upper && *this)
320 if (upper[-2] == '<')
324 if (upper[-2] == '<')
// Runs until the text just consumed ends in a question mark followed by a greater-than sign.
328 } while (upper <= ahead && (*upper++ != '>' || upper[-2] != '?'));
333 if (upper[-2] == '<' && upper <= ahead)
// Runs until the text just consumed ends in two hyphens followed by a greater-than sign.
339 } while (upper <= ahead && (*upper++ != '>' || upper[-2] != '-' || upper[-3] != '-'));
342 else if (*upper == '[')
// Runs until the text just consumed ends in two closing brackets followed by a greater-than sign.
346 } while (upper <= ahead && (*upper++ != '>' || upper[-2] != ']' || upper[-3] != ']'));
// depth is maintained on lines not shown; the loop ends once it reaches zero.
375 } while (++upper <= ahead && depth);
// A slash before the closing angle bracket, or a name found in the utags table,
// gets special treatment here (the body of this if is not shown).
380 if (upper[-2] == '/' || utags && FindTag(utags, first + 1))
387 } while (upper <= ahead && depth);
// Steps the cursor forward. Visible behaviour: advance while operator bool holds
// and the character at upper is not a less-than sign; when the utags table is
// active and upper rests on a less-than sign, look up the name at upper + 2 in
// that table (what is done with the resulting length is on lines not shown);
// finally anchor both first and lower at upper.
// Returns a CMarkdown reference (per the signature; the return statement is not shown).
391 CMarkdown &CMarkdown::Move()
396 while (*this && *upper != '<')
400 if (utags != nullptr && upper < ahead && *upper == '<')
// upper + 2 skips two characters - presumably the less-than sign and a slash of
// a closing tag; TODO confirm.
402 size_t utlen = FindTag(utags, upper + 2);
411 first = lower = upper;
// Name-filtered variant of Move(). Visible behaviour: the tag name is taken as
// the characters from lower + 1 up to the first delimiter (whitespace or one of
// the punctuation characters tested in the loop condition, bounded by ahead),
// then compared with `name` using the configured comparator. The name[length]
// terminator check rejects cases where the tag name is only a prefix of `name`.
// The surrounding iteration and the return statement are on lines not shown.
// Load(EntityMap &, int) uses the result as a loop condition.
// NOTE(review): as in FindTag, isspace() may receive a plain char here,
// depending on the unseen declaration of c - confirm.
415 CMarkdown &CMarkdown::Move(const char *name)
419 const char *q = lower;
420 const char *p = q + 1;
425 } while (q <= ahead && !isspace(c = *q) && c != '[' && c != '>' && c != '"' && c != '\'' && c != '=');
426 size_t length = q - p;
427 if (memcmp(p, name, length) == 0 && name[length] == '\0')
// Advances `lower` through the head of the current tag. Returns bool; the
// return statements are on lines not shown.
// Visible behaviour:
//   - proceeds only if lower is before ahead, stepping over a leading less-than
//     sign when there is one;
//   - for tags whose third character is neither an opening bracket nor a hyphen
//     it scans to the next opening bracket or greater-than sign that is outside
//     quotes (the quote tracking itself is not shown);
//   - otherwise it scans to the next greater-than sign;
//   - the final test singles out tags that do not end in a slash or question
//     mark and are not listed in the utags table.
435 bool CMarkdown::Pull()
437 if (lower < ahead && (*lower != '<' || ++lower < ahead))
441 if (first[2] != '[' && first[2] != '-')
443 // neither CDATA nor comment: assume DTD tag
444 unsigned quoting = 0;
445 while (lower < ahead && (quoting || *lower != '[' && *lower != '>'))
468 while (lower < ahead && *lower != '>')
472 if (lower[-1] != '/' && lower[-1] != '?' && !(utags && FindTag(utags, first + 1)))
// Pop(): returns a CMarkdown reference (per the signature). Its body is not part
// of this listing; presumably it is the counterpart of Push() - TODO confirm
// against the full file.
481 CMarkdown &CMarkdown::Pop()
// Push(): returns bool. The visible part switches on the two characters at
// upper, packed into one 16-bit value, and groups two cases together: a
// less-than sign followed by a slash, and a closing bracket followed by a
// greater-than sign. These are the same two pairs operator bool rejects.
// What each outcome returns is on lines not shown.
490 bool CMarkdown::Push()
// No parentheses are written after `switch`; this compiles only because
// MAKEWORD expands to a fully parenthesized expression.
494 switch MAKEWORD(upper[0], upper[1])
496 case MAKEWORD('<', '/'):
497 case MAKEWORD(']', '>'):
// Returns the name of the tag that starts at `first` as a std::string.
// p marks the start of the name (one past first), q its end. The name runs up
// to the first whitespace character or delimiter from the set tested in the
// loop (which here also includes the slash), never beyond ahead.
// A tag starting with an exclamation mark followed by a hyphen or an opening
// bracket takes a separate path whose body is not shown.
// If first is already at or past ahead, p == q and the result is empty.
// NOTE(review): the pre-increment dereference in the exclamation-mark test has
// no bounds check against ahead of its own - confirm it cannot run off the text.
// NOTE(review): isspace() may receive a plain char, depending on the unseen
// declaration of c.
505 std::string CMarkdown::GetTagName() const
507 const char *p = first;
508 const char *q = first;
509 if (q < ahead && (p = ++q) < ahead)
511 if (*q == '!' && (*++q == '-' || *q == '['))
518 while (q < ahead && !isspace(c = *q) && c != '[' && c != '>' && c != '"' && c != '\'' && c != '=' && c != '/')
524 return std::string(p, q - p);
// Returns the text of the tag head that starts at `first`: from one past first
// (p) up to q. A leading exclamation mark is stepped over before scanning. If
// the character at q is then a hyphen or an opening bracket, a separate path is
// taken (body not shown). Otherwise q advances until an opening bracket,
// less-than sign, greater-than sign or slash is met outside quotes, or ahead is
// reached. The quote tracking that updates `quoting` is on lines not shown.
527 std::string CMarkdown::GetTagText() const
529 const char *p = first, *q = first;
530 if (q < ahead && (p = ++q) < ahead && (*q != '!' || ++q < ahead))
532 if (*q == '-' || *q == '[')
538 unsigned quoting = 0;
539 while (q < ahead && (quoting || (*q != '[' && *q != '<' && *q != '>' && *q != '/')))
556 return std::string(p, q - p);
// Returns the text between p and q, where p starts at `first` and q at `upper`.
// Non-const: a call on a line not shown (presumably Scan(), which positions
// upper) precedes the visible code - confirm.
// For nodes whose second character is an exclamation mark, p is advanced to a
// `bracket` character found outside quotes; which character that is, and the
// quote tracking, are decided on lines not shown.
559 std::string CMarkdown::GetInnerText()
562 const char *p = first;
563 const char *q = upper;
565 if (p < upper && ++p < upper && *p == '!' && ++p < upper)
574 unsigned quoting = 0;
575 while (p < upper && (quoting || *p != bracket))
// Backs q up by two characters, but never past p; the short-circuit stops the
// decrements as soon as q would reach p.
590 if (p < q && p < --q && p < --q)
594 return std::string(p, q - p);
// Returns the node text up to and including its closing greater-than sign:
// q starts at upper and is advanced (on a line not shown) until the character
// before it is a greater-than sign or q passes ahead.
// NOTE(review): q[-1] is read before q <= ahead is tested; any guard against
// q == first would be on a line not shown - confirm.
// NOTE(review): the result starts at `lower` but its length is measured from
// `first`. The two agree only while lower == first; if lower has been advanced
// (e.g. by Pull()), the string extends past q - confirm which pointer is meant.
597 std::string CMarkdown::GetOuterText()
600 const char *q = upper;
603 while (q[-1] != '>' && q <= ahead)
608 return std::string(lower, q - first);
// Helper used by GetAttribute() to cut the tag head into tokens. Besides the
// method declared below, GetAttribute() reads and writes its lower/upper
// members, which are declared on lines not shown.
611 class CMarkdown::Token
// Takes a start pointer and a limit pointer; returns an int that GetAttribute()
// tests for non-zero.
616 int IsSpecial(const char *, const char *);
// Scans one token from p, never going past `ahead`.
// Visible steps: skip leading whitespace; run across non-whitespace characters
// (the loop body, which presumably also sets `special` and stops at delimiters,
// is not shown); if `special` is set and text remains, continue to the next
// occurrence of character c - presumably the matching quote, TODO confirm.
// How lower/upper are set and what value is returned is on lines not shown.
// Characters are cast to unsigned char before isspace(), which keeps the call
// defined for bytes >= 0x80.
// NOTE(review): the first two loops allow p == ahead and dereference it, while
// the last two tests use p < ahead - confirm which bound is intended.
619 int CMarkdown::Token::IsSpecial(const char *p, const char *ahead)
621 while (p <= ahead && isspace((unsigned char)*p))
627 while (p <= ahead && !isspace((unsigned char)*p))
633 if (special && p < ahead)
638 } while (p < ahead && *p != c);
// Looks through the attributes of the current tag head, starting at `lower`.
//   key - attribute name to look for. Load(EntityMap &, int) passes a null key
//         to enumerate instead; the branch that selects this mode is not shown.
//   pv  - optional in/out string.
// Visible outcomes:
//   - enumeration path: *pv receives the value text and the attribute NAME is
//     returned;
//   - lookup path: when the attribute name equals key (configured comparator
//     plus a terminator check so that a prefix does not match), the VALUE is
//     returned;
//   - nothing found: the scan ends when a token comes back empty
//     (upper == lower) and the function returns *pv if pv was supplied,
//     otherwise an empty string. A caller can thus pre-load *pv with a default.
// Tokens come from Token::IsSpecial(); name/cname and value/cvalue are
// pointer+length views into the source text, not copies.
657 std::string CMarkdown::GetAttribute(const char *key, std::string *pv)
659 const char *name = 0;
661 const char *value = 0;
664 const char *p = lower;
668 if (token.IsSpecial(p, ahead))
670 switch (*token.lower)
678 cvalue = token.upper - (value = token.lower);
// Collapsing the token to empty here makes the loop condition at the bottom fail.
687 token.upper = token.lower;
691 else if (token.upper != token.lower)
696 cvalue = token.upper - (value = token.lower);
700 cname = token.upper - (name = token.lower);
709 *pv = std::string(value, cvalue);
710 return std::string(name, cname);
712 if (memcmp(name, key, cname) == 0 && key[cname] == '\0')
714 return std::string(value, cvalue);
718 } while (token.upper != token.lower);
724 return pv ? *pv : "";
// Derives an encoding / byte-order code from the first four bytes of a file,
// passed in as one unsigned value (the constructor reads them straight from
// the image). The result accumulates in nByteOrder, whose initial value and
// final return are on lines not shown.
// Visible classification, in order:
//   - one 16-bit half is zero: handled on lines not shown;
//   - low word is 0xFEFF or 0xFFFE: adds 8 plus the byte offset of the first
//     0xFF byte inside dwBOM;
//   - exactly one byte of the low word is zero: adds the byte offset, inside
//     dwBOM, of the first byte equal to the non-zero one;
//   - low three bytes equal 0xBFBBEF: separate branch, body not shown.
//     NOTE(review): read on a little-endian host this is the byte sequence
//     EF BB BF, presumably the UTF-8 signature - confirm.
// The offsets come from memchr over the in-memory bytes of dwBOM.
727 int CMarkdown::FileImage::GuessByteOrder(unsigned dwBOM)
732 unsigned short wBOM = LOWORD(dwBOM);
733 unsigned short wBOMhigh = HIWORD(dwBOM);
735 if (wBOM == 0 || wBOMhigh == 0)
740 if (wBOM == 0xFEFF || wBOM == 0xFFFE)
742 nByteOrder += 8 + static_cast<int>((char *)memchr(&dwBOM, 0xFF, 4) - (char *)&dwBOM);
744 else if (LOBYTE(wBOM) == 0 || HIBYTE(wBOM) == 0)
// OR-ing the two bytes yields the non-zero one, since the other is zero.
746 unsigned char cBOM = LOBYTE(wBOM) | HIBYTE(wBOM);
747 nByteOrder += static_cast<int>((char *)memchr(&dwBOM, cBOM, 4) - (char *)&dwBOM);
749 else if ((dwBOM & 0xFFFFFF) == 0xBFBBEF)
// Builds an in-memory image of a document and, when requested through flags,
// converts 16- or 32-bit encoded text into a private buffer.
//   path  - file to map, or (on the first branch, whose condition is not shown)
//           a pointer that is used directly as the image
//   trunc - not referenced in the visible lines
//   flags - option bits; flags & Octets is combined with the GuessByteOrder()
//           result to decide whether a conversion runs
// Visible flow:
//   1. map the file read-only through Poco::SharedMemory and take pImage and
//      cbImage from its begin()/end();
//   2. when at least four bytes exist, guess the byte order from the first four;
//   3. depending on the result (the switch labels are not shown): byte-swap
//      16-bit units into pCopy, re-encode 16-bit units via
//      ucr::Utf8len_of_string / ucr::Ucs4_to_Utf8, or re-encode 32-bit units
//      with a sizing pass followed by a writing pass;
//   4. after each copy the shared-memory mapping is deleted.
// NOTE(review): plain new[] throws on failure rather than returning null, so
// the != nullptr checks after each allocation never take the false branch.
// NOTE(review): pCopy is assigned from new[] at several points; whether an
// earlier buffer is released before being replaced is on lines not shown.
761 CMarkdown::FileImage::FileImage(const TCHAR *path, size_t trunc, unsigned flags)
762 : pImage(nullptr), cbImage(0), nByteOrder(0), m_pSharedMemory(nullptr), pCopy(nullptr)
// The caller-supplied pointer itself becomes the image on this branch.
766 pImage = (void *)(path);
769 else if (path != nullptr)
// `file` is declared on a line not shown.
774 m_pSharedMemory = new SharedMemory(file, SharedMemory::AM_READ);
775 pImage = m_pSharedMemory->begin();
776 cbImage = m_pSharedMemory->end() - m_pSharedMemory->begin();
782 if (pImage == nullptr)
// NOTE(review): the first four bytes are read through a cast to unsigned *,
// which assumes suitable alignment; the 32-bit loops below use memcpy instead.
786 else if (cbImage >= 4 && (flags & Octets & (nByteOrder = GuessByteOrder(*(unsigned *)pImage))))
792 // big endian: swab first
794 pCopy = new unsigned char[cbImage];
795 if (pCopy != nullptr)
// Swaps the two bytes of every 16-bit unit; an odd trailing byte is not copied.
797 for (size_t i = 0; i < cbImage / 2; ++i)
798 *((uint16_t *)pCopy + i) = Poco::ByteOrder::flipBytes(*((uint16_t *)pImage + i));
// NOTE(review): the other deletes below are followed by a reset to nullptr.
// Confirm the same happens after this one; otherwise the later deletes and the
// destructor free the mapping a second time.
801 delete m_pSharedMemory;
803 if (pImage != nullptr)
809 size_t cchImage = cbImage / 2;
810 uint16_t *pchImage = (uint16_t *)pImage;
// cbImage now becomes the size of the re-encoded text.
816 cbImage = ucr::Utf8len_of_string(pchImage, cchImage);
817 pCopy = new unsigned char[cbImage];
818 if (pCopy != nullptr)
// NOTE(review): each 16-bit unit is converted on its own; whether surrogate
// pairs come out as one code point depends on ucr::Ucs4_to_Utf8 - verify.
822 for (pu16 = (uint16_t *)pchImage, pu8 = (unsigned char *)pCopy; pu16 < pchImage + cchImage; ++pu16)
823 pu8 += ucr::Ucs4_to_Utf8(*pu16, pu8);
825 delete m_pSharedMemory;
826 m_pSharedMemory = nullptr;
834 // odd word endianness: swab first
836 pCopy = new unsigned char[cbImage];
837 if (pCopy != nullptr)
839 for (size_t i = 0; i < cbImage / 2; ++i)
840 *((uint16_t *)pCopy + i) = Poco::ByteOrder::flipBytes(*((uint16_t *)pImage + i));
842 delete m_pSharedMemory;
843 m_pSharedMemory = nullptr;
845 if (pImage != nullptr)
852 size_t cchImage = cbImage;
853 char *pchImage = (char *)pImage;
// Pass 1: add up the encoded length of every 32-bit unit into cbImage (its
// reset before this loop is on a line not shown).
// NOTE(review): the memcpy reads four bytes at offset i for every i < cchImage;
// if cchImage is not a multiple of four the last read runs past the image.
861 for (size_t i = 0; i < cchImage; i += 4)
863 memcpy(&uch, pchImage + i, 4);
865 uch = ByteOrder::fromBigEndian(uch);
867 uch = ByteOrder::fromLittleEndian(uch);
868 cbImage += ucr::Utf8len_fromCodepoint(uch);
870 void *pCopy2 = new unsigned char[cbImage];
871 if (pCopy2 != nullptr)
// Pass 2: write the encoded bytes; cbImage is reused as the write offset (its
// reset before this loop is on a line not shown).
874 for (size_t i = 0; i < cchImage; i += 4)
876 memcpy(&uch, pchImage + i, 4);
878 uch = ByteOrder::fromBigEndian(uch);
880 uch = ByteOrder::fromLittleEndian(uch);
881 cbImage += ucr::Ucs4_to_Utf8(uch, (unsigned char *)pCopy2 + cbImage);
884 delete m_pSharedMemory;
885 m_pSharedMemory = nullptr;
// Releases the shared-memory mapping created by the constructor; deleting a
// null pointer is a no-op, so this is safe once the constructor has already
// released the mapping and reset the member.
// NOTE(review): the constructor also allocates pCopy with new[]; no release of
// it is visible in this listing - confirm it is freed with delete[].
895 CMarkdown::FileImage::~FileImage()
897 delete m_pSharedMemory;