1 /* markdown.cpp: Pull-parse XML sources
2 * Copyright (c) 2005 Jochen Tucht
4 * This library is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU Lesser General Public
6 * License as published by the Free Software Foundation; either
7 * version 2.1 of the License, or (at your option) any later version.
9 * This library is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12 * Lesser General Public License for more details.
14 * You should have received a copy of the GNU Lesser General Public
15 * License along with this library; if not, write to the Free Software
16 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
18 * OS/Libs: Win32/STL/shlwapi/iconv
19 * iconv.dll is loaded on demand, and is not required as long as
20 * program doesn't call iconv based methods.
22 * Remarks: Pull-parsing is a very simple way to parse XML. It does not require
23 * callback functions, and it does not build object trees in memory. It
24 * just travels through plain source.
26 * This library reads source text from memory. It can safely operate
27 * on memory mapped files, as it does not require text to be zero-
28 * terminated. It will also read most of the usual meta stuff (<? ?>,
29 * <!-- -->, <![ []]>, and DTD tags), but applying meta information is
30 * left to the caller. Thus, the library does not exactly implement an
31 * XML parser. It just helps reading XML.
33 * This library is not modeled after an existing pull parsing API,
34 * so don't expect to find the same methods you've seen elsewhere.
35 * In particular, this library does not follow XmlPull's event model,
36 * but attempts to be somewhat closer to a tree-based API.
37 * For simplicity, this library does not perform any validation, nor
38 * provide error handling other than returning empty text in case it
39 * fails to retrieve something.
41 * The name of the core class, CMarkdown, actually was going to be
42 * CMarkup when I came across another XML tool with same name on
43 * CodeProject. Like TinyXml and XMLite, and unlike CMarkdown, CMarkup
44 * follows DOM-like approach, suffering from considerable memory
45 * footprint. Anyway, class name CMarkdown somewhat reflects the nature
46 * of pull-parsing, pulling down the leaves of an XML tree so programs
47 * can reach them from a flat loop, rather than climb up the tree and
48 * push the leaves to some callback function, or preprocess the entire
49 * tree in some way before allowing programs to retrieve content.
51 * Recommended reading:
53 * www.sosnoski.com/articles/parsing1.html (SAX2 Basics)
54 * www.sosnoski.com/articles/parsing2.html (SAX vs Pull)
55 * www.sosnoski.com/articles/parsing3.html (Performance)
56 * www.xml.com/pub/a/2002/08/14/xmlpull.html (XMLPULL API)
57 * www.xml.com/pub/a/2002/09/25/xmlpull.html (response to above)
58 * www.stylusstudio.com/xmldev/200205/post61120.html (discussion)
60 * There are lots of related articles on the web, though.
62 Please mind 2. b) of the GNU LGPL terms, and log your changes below.
64 DATE: BY: DESCRIPTION:
65 ========== ================== ================================================
66 2005-01-15 Jochen Tucht Created
67 2005-02-26 Jochen Tucht Load iconv.dll through DLLPSTUB
68 2005-03-20 Jochen Tucht Add IgnoreCase option for ASCII-7 tag/attr names.
69 Add HtmlUTags option to check for (potentially)
70 unbalanced HTML tags. Html option is combination
71 of the above. Using these options imposes
72 performance penalty, so avoid it if you can.
73 New flag CMarkdown::FileImage::Handle makes
74 CMarkdown::FileImage::FileImage() accept a
75 handle rather than a filename.
76 2005-06-22 Jochen Tucht New method CMarkdown::_HSTR::Entities().
77 2005-07-29 Jochen Tucht ByteOrder detection for 16/32 bit encodings
78 2005-09-09 Jochen Tucht Patch by Takashi Sawanaka fixes crash due to
79 reading beyond end of text with HtmlUTags option
80 2005-12-04 Jochen Tucht Fix UTF-8 signature detection
81 Strip bogus trailing slash in name of empty tag
82 2008-08-27 Jochen Neubeck Replace MFC CMap by STL std::map
89 #include <Poco/ByteOrder.h>
90 #include <Poco/NumberParser.h>
91 #include <Poco/SharedMemory.h>
// Bit-packing helpers used by the parser and the byte-order detection.
// MAKEWORD(a, b): 16-bit value with (a & 0xff) in the low byte and (b & 0xff) in the high byte.
95 #define MAKEWORD(a, b) ((unsigned short)(((unsigned char)((unsigned)(a) & 0xff)) | ((unsigned short)((unsigned char)((unsigned)(b) & 0xff))) << 8))
// MAKELONG(a, b): unsigned value with (a & 0xffff) in the low 16 bits and (b & 0xffff) shifted into the next 16 bits.
96 #define MAKELONG(a, b) ((unsigned)(((unsigned short)((unsigned)(a) & 0xffff)) | ((unsigned)((unsigned short)((unsigned)(b) & 0xffff))) << 16))
// LOWORD(l): the low 16 bits of l.
97 #define LOWORD(l) ((unsigned short)((unsigned)(l) & 0xffff))
// HIWORD(l): l shifted right by 16, truncated to 16 bits.
98 #define HIWORD(l) ((unsigned short)((unsigned)(l) >> 16))
// LOBYTE(w): the low 8 bits of w.
99 #define LOBYTE(w) ((unsigned char)((unsigned)(w) & 0xff))
// HIBYTE(w): w shifted right by 8, truncated to 8 bits.
100 #define HIBYTE(w) ((unsigned char)((unsigned)(w) >> 8))
102 using Poco::ByteOrder;
103 using Poco::NumberParser;
104 using Poco::SharedMemory;
107 void CMarkdown::Load(EntityMap &entityMap)
109 entityMap["amp"] = "&";
110 entityMap["quot"] = "\"";
111 entityMap["apos"] = "'";
112 entityMap["lt"] = "<";
113 entityMap["gt"] = ">";
116 void CMarkdown::Load(EntityMap &entityMap, int dummy)
118 while (Move("!ENTITY"))
120 std::string hstrValue;
121 std::string hstrKey = GetAttribute(0, &hstrValue);
122 if (!hstrKey.empty())
124 entityMap[hstrKey] = hstrValue;
// Replaces "&...;" references in a working buffer (ret) with replacement text
// that comes either from a parsed number or from a lookup in map.
// Parameters:
//   map - entity name -> replacement text table consulted via find()
//   v   - input text (presumably the initial content of ret; the line that
//         creates ret is not visible here - TODO confirm)
// NOTE(review): several statements of this body (creation of ret, extraction
// of key, the return) are not visible in this excerpt.
129 std::string CMarkdown::Resolve(const EntityMap &map, const std::string& v)
132 char *p, *q = &ret[0];
// Each pass needs an '&' at or after q and a ';' at or after that '&';
// p is left on the '&', q on the ';'. Scanning stops at the first NUL.
133 while ((p = strchr(q, '&')) != nullptr && (q = strchr(p, ';')) != nullptr)
// '?' is the initial ordinal.
140 unsigned ordinal = '?';
142 if (NumberParser::tryParseHex(key, ordinal))
// The parsed ordinal is narrowed to one char, so values outside the char
// range are truncated.
143 value.assign(1, static_cast<std::string::value_type>(ordinal));
148 EntityMap::const_iterator p1 = map.find(key);
154 size_t cchValue = value.length();
// Offsets of p and q within ret, taken before ret may be resized
// (presumably used to re-base the pointers afterwards - TODO confirm).
157 size_t i = p - &ret[0];
158 size_t j = q - &ret[0];
159 size_t cchKey = q - p;
160 if (cchValue != cchKey)
162 size_t b = ret.length();
// Number of bytes from q to the end of the buffer.
163 size_t cbMove = (b - j) * sizeof(char);
164 if (cchKey > cchValue)
// Replacement is shorter: pull the tail towards the front, then shrink.
166 size_t cchGrow = cchKey - cchValue;
167 memmove(q - cchGrow, q, cbMove);
168 ret.resize(b - cchGrow);
172 if (cchValue > cchKey)
// Replacement is longer: grow first, then push the tail back.
// NOTE(review): resize() may reallocate, which would invalidate p and q
// unless they are re-based in a line not visible here.
174 size_t cchGrow = cchValue - cchKey;
175 ret.resize(b + cchGrow);
176 memmove(q + cchGrow, q, cbMove);
// Write the replacement text starting at the position of the '&'.
179 memcpy(p, value.c_str(), cchValue * sizeof(char));
186 std::string CMarkdown::Entities(const std::string& v)
189 char *p, *q = &ret[0];
192 char *value = nullptr;
195 case '&': value = "&"; break;
196 case '"': value = """; break;
197 case '\'': value = "'"; break;
198 case '<' : value = "<"; break;
199 case '>' : value = ">"; break;
202 if (value != nullptr)
204 size_t cchValue = strlen(value);
207 ptrdiff_t i = p - &ret[0];
208 ptrdiff_t j = q - &ret[0];
209 size_t b = v.length();
210 ret.resize(b + cchValue - 1);
213 memmove(q + cchValue - 1, q, (b - j) * sizeof(char));
215 memcpy(p, value, cchValue * sizeof(char));
222 //This is a hopefully complete list of the 36 (?) (potentially) unbalanced HTML
223 //tags. It is based on tags.c from Tidy library,
224 //"http://cvs.sourceforge.net/viewcvs.py/*checkout*/tidy/tidy/src/tags.c?rev=1.55".
225 //It should include all tags from tag_defs[] array which are flagged either
226 //CM_EMPTY (no closing tag) or CM_OPT (optional closing tag).
// Table of tag names stored back to back in one character array. Each entry
// is written with an explicit trailing \0; FindTag() reads entries with
// strlen() and stops at an entry of length zero. The constructor hands this
// table to the parser when the HtmlUTags flag is set.
228 static const char htmlUTags[] =
260 /* proprietary elements */
261 "bgsound\0" //MICROSOFT
263 "keygen\0" //NETSCAPE
264 "marquee\0" //MICROSOFT
265 "spacer\0" //NETSCAPE
266 "wbr\0" //PROPRIETARY
// Constructs a parser over the text between upper and ahead.
// Parameters:
//   upper - start position stored in the member of the same name
//   ahead - limit position stored in the member of the same name
//   flags - option bits; IgnoreCase and HtmlUTags are examined here
269 CMarkdown::CMarkdown(const char *upper, const char *ahead, unsigned flags):
// first and lower start out null.
270 first(nullptr), lower(nullptr), upper(upper), ahead(ahead),
// Name comparison function: ::_memicmp with IgnoreCase, otherwise ::memcmp.
271 memcmp(flags & IgnoreCase ? ::_memicmp : ::memcmp),
// Tag table for FindTag(): htmlUTags with HtmlUTags, otherwise none.
272 utags(flags & HtmlUTags ? htmlUTags : nullptr)
// Qualified names refer to the members, not the same-named parameters.
274 if (CMarkdown::ahead > CMarkdown::upper)
// Tests whether the parser can continue at the current level: upper must lie
// before ahead, and the two characters at upper must be neither "</" nor "]>".
280 CMarkdown::operator bool()
282 return upper < ahead &&
// MAKEWORD packs the two characters so each pair is compared in one step.
284 MAKEWORD(upper[0], upper[1]) != MAKEWORD('<', '/')
285 && MAKEWORD(upper[0], upper[1]) != MAKEWORD(']', '>')
// Checks whether the text at markup starts with one of the names in tags.
// Parameters:
//   tags   - list of NUL-terminated names; an empty name ends the list
//   markup - position in the source text where a name is expected
// Returns a size_t (presumably the length of the matching name, 0 if none -
// the return statements are not visible here; TODO confirm).
289 size_t CMarkdown::FindTag(const char *tags, const char *markup) const
// len is the length of the current entry; the loop ends on an empty entry.
291 while (ptrdiff_t len = strlen(tags))
// An entry matches when more than len characters remain before ahead, the
// text compares equal under the configured memcmp (possibly ignoring case),
// and the character following it is whitespace or one of [ > " ' =
296 (ahead - markup) > len
297 && memcmp(markup, tags, len) == 0
298 && (isspace(c = markup[len]) || c == '[' || c == '>' || c == '"' || c == '\'' || c == '=')
// Advances upper from the start of the current item (first) towards its end.
// NOTE(review): large parts of this body are not visible in this excerpt; the
// comments below describe only the visible conditions.
308 void CMarkdown::Scan()
// Work is done only if upper has not moved past first yet and the parser is
// still in a valid state (operator bool).
310 if (first == upper && *this)
318 if (upper[-2] == '<')
322 if (upper[-2] == '<')
// Consume characters until a '>' directly preceded by '?' has been consumed.
326 } while (upper <= ahead && (*upper++ != '>' || upper[-2] != '?'));
331 if (upper[-2] == '<' && upper <= ahead)
// Consume characters until a '>' directly preceded by "--" has been consumed.
337 } while (upper <= ahead && (*upper++ != '>' || upper[-2] != '-' || upper[-3] != '-'));
340 else if (*upper == '[')
// Consume characters until a '>' directly preceded by "]]" has been consumed.
344 } while (upper <= ahead && (*upper++ != '>' || upper[-2] != ']' || upper[-3] != ']'));
373 } while (++upper <= ahead && depth);
// True when the character before the consumed one is '/', or when a tag table
// is in use and the name after first is listed in it.
378 if (upper[-2] == '/' || utags && FindTag(utags, first + 1))
385 } while (upper <= ahead && depth);
// Moves to the next item at the current level and returns a CMarkdown
// reference (presumably *this - the return is not visible; TODO confirm).
389 CMarkdown &CMarkdown::Move()
// Loop while the parser state is valid and upper is not on a '<'.
394 while (*this && *upper != '<')
// With a tag table in use, the name two characters past the '<' is looked up
// (presumably to recognize a closing tag of a listed name - TODO confirm).
398 if (utags != nullptr && upper < ahead && *upper == '<')
400 size_t utlen = FindTag(utags, upper + 2);
// The new current item starts where scanning stopped.
409 first = lower = upper;
// Variant of Move() that looks for an item with a given name.
// Parameters:
//   name - NUL-terminated name to compare against
// NOTE(review): the enclosing loop and the return are not visible here.
413 CMarkdown &CMarkdown::Move(const char *name)
// p marks the character after lower, where the item's name starts.
417 const char *q = lower;
418 const char *p = q + 1;
// q advances until whitespace or one of [ > " ' = is found, or ahead is passed.
423 } while (q <= ahead && !isspace(c = *q) && c != '[' && c != '>' && c != '"' && c != '\'' && c != '=');
424 size_t length = q - p;
// Match requires equal text under the configured memcmp (possibly ignoring
// case) and that name ends exactly after length characters.
425 if (memcmp(p, name, length) == 0 && name[length] == '\0')
// Advances lower within the current item and reports a bool result.
// NOTE(review): the return statements and several branches are not visible in
// this excerpt; the meaning of the result cannot be confirmed from here.
433 bool CMarkdown::Pull()
// Requires lower before ahead; a leading '<' is stepped over, and lower must
// still be before ahead afterwards.
435 if (lower < ahead && (*lower != '<' || ++lower < ahead))
// Examines the third character of the item.
439 if (first[2] != '[' && first[2] != '-')
441 // neither CDATA nor comment: assume DTD tag
442 unsigned quoting = 0;
// Advance to the next '[' or '>'; neither ends the loop while quoting is non-zero.
443 while (lower < ahead && (quoting || *lower != '[' && *lower != '>'))
// Advance to the next '>'.
466 while (lower < ahead && *lower != '>')
// True when the character before lower is neither '/' nor '?' and the tag
// name is not found in the tag table (if one is in use).
470 if (lower[-1] != '/' && lower[-1] != '?' && !(utags && FindTag(utags, first + 1)))
// Returns a CMarkdown reference.
// NOTE(review): the body is not visible in this excerpt; presumably the
// counterpart of Push()/Pull() that leaves the current item - TODO confirm.
479 CMarkdown &CMarkdown::Pop()
// Inspects the two characters at upper and reports a bool result.
// NOTE(review): the actions taken in each case and the returns are not
// visible in this excerpt.
488 bool CMarkdown::Push()
// No parentheses are written here: the MAKEWORD expansion is itself a
// parenthesized expression, which supplies the switch condition.
492 switch MAKEWORD(upper[0], upper[1])
// "</" and "]>" share one branch.
494 case MAKEWORD('<', '/'):
495 case MAKEWORD(']', '>'):
// Returns the name of the current item as a string: the characters from p up
// to (not including) q.
503 std::string CMarkdown::GetTagName() const
505 const char *p = first;
506 const char *q = first;
// Step past the first character of the item; p and q both point behind it.
507 if (q < ahead && (p = ++q) < ahead)
// '!' followed by '-' or '[' is treated separately (handling not visible here).
509 if (*q == '!' && (*++q == '-' || *q == '['))
// The name ends at whitespace or at one of [ > " ' = /
516 while (q < ahead && !isspace(c = *q) && c != '[' && c != '>' && c != '"' && c != '\'' && c != '=' && c != '/')
522 return std::string(p, q - p);
// Returns text from the current item as a string: the characters from p up to
// (not including) q.
525 std::string CMarkdown::GetTagText() const
527 const char *p = first, *q = first;
// Step past the first character; a following '!' is stepped over by q as well.
528 if (q < ahead && (p = ++q) < ahead && (*q != '!' || ++q < ahead))
// '-' or '[' at q is treated separately (handling not visible here).
530 if (*q == '-' || *q == '[')
536 unsigned quoting = 0;
// Advance to the next [ < > or / ; none of them ends the loop while quoting
// is non-zero.
537 while (q < ahead && (quoting || (*q != '[' && *q != '<' && *q != '>' && *q != '/')))
554 return std::string(p, q - p);
// Returns text from inside the current item as a string: the characters from
// p up to (not including) q, after both have been moved inwards.
// NOTE(review): the assignment of bracket and other statements are not
// visible in this excerpt.
557 std::string CMarkdown::GetInnerText()
560 const char *p = first;
561 const char *q = upper;
// Applies when the second character of the item is '!'; p ends up behind it.
563 if (p < upper && ++p < upper && *p == '!' && ++p < upper)
572 unsigned quoting = 0;
// Advance p to the bracket character; it is ignored while quoting is non-zero.
573 while (p < upper && (quoting || *p != bracket))
// Move q back by two characters, but never onto or before p.
588 if (p < q && p < --q && p < --q)
592 return std::string(p, q - p);
// Returns the text of the current item including its markup, as a string.
595 std::string CMarkdown::GetOuterText()
598 const char *q = upper;
// Extend q until the character before it is '>' or q passes ahead.
601 while (q[-1] != '>' && q <= ahead)
// NOTE(review): the string starts at lower but its length is measured from
// first; this is only consistent while lower == first - confirm callers never
// reach here after lower has been advanced.
606 return std::string(lower, q - first);
// Helper used by GetAttribute() to step through the pieces of a tag.
// NOTE(review): data members are not visible in this excerpt; GetAttribute()
// reads and writes token.lower and token.upper.
609 class CMarkdown::Token
// Takes a start position and a limit position; returns an int that
// GetAttribute() tests for non-zero.
614 int IsSpecial(const char *, const char *);
// Scans one token starting at p without going beyond ahead.
// Parameters:
//   p     - position where scanning starts
//   ahead - limit position
// Returns an int (presumably non-zero for a "special" token - the return is
// not visible here; TODO confirm).
617 int CMarkdown::Token::IsSpecial(const char *p, const char *ahead)
// Skip whitespace; the cast keeps isspace() away from negative char values.
619 while (p <= ahead && isspace((unsigned char)*p))
// Then skip over non-whitespace characters.
625 while (p <= ahead && !isspace((unsigned char)*p))
631 if (special && p < ahead)
// Advance until the character c is found or ahead is reached.
636 } while (p < ahead && *p != c);
// Walks the tokens of the current item, tracking a name and a value, and
// returns attribute text.
// Parameters:
//   key - attribute name to look for; Load() passes a null key and uses the
//         returned string as a name and *pv as its value
//   pv  - optional; receives a value on one path and supplies the fallback
//         result when nothing else is returned
// NOTE(review): several branches of this body are not visible in this excerpt.
655 std::string CMarkdown::GetAttribute(const char *key, std::string *pv)
657 const char *name = 0;
659 const char *value = 0;
// Tokenizing starts at lower.
662 const char *p = lower;
666 if (token.IsSpecial(p, ahead))
// Dispatch on the first character of the special token.
668 switch (*token.lower)
// The token becomes the current value (start pointer plus length).
676 cvalue = token.upper - (value = token.lower);
// Make the token empty.
685 token.upper = token.lower;
689 else if (token.upper != token.lower)
// A non-empty ordinary token is recorded either as value ...
694 cvalue = token.upper - (value = token.lower);
// ... or as name.
698 cname = token.upper - (name = token.lower);
// This path stores the value through pv and returns the name (presumably the
// null-key case used by Load() - TODO confirm).
707 *pv = std::string(value, cvalue);
708 return std::string(name, cname);
// Name equals key under the configured memcmp, and key ends right after it:
// return the value.
710 if (memcmp(name, key, cname) == 0 && key[cname] == '\0')
712 return std::string(value, cvalue);
// Stop once an empty token is produced.
716 } while (token.upper != token.lower);
// Nothing returned above: hand back *pv if given, otherwise empty text.
722 return pv ? *pv : "";
// Derives a byte-order code from the first four bytes of an image.
// Parameters:
//   dwBOM - the first four bytes of the image, loaded as an unsigned value
// Returns an int; the constructor combines it with flags & Octets.
// NOTE(review): the initial value of nByteOrder and the returns are not
// visible in this excerpt.
725 int CMarkdown::FileImage::GuessByteOrder(unsigned dwBOM)
730 unsigned short wBOM = LOWORD(dwBOM);
731 unsigned short wBOMhigh = HIWORD(dwBOM);
// One of the two 16-bit halves is entirely zero.
733 if (wBOM == 0 || wBOMhigh == 0)
// Low half is the value 0xFEFF in either byte order.
738 if (wBOM == 0xFEFF || wBOM == 0xFFFE)
// Add 8 plus the index of the first 0xFF byte within dwBOM.
740 nByteOrder += 8 + static_cast<int>((char *)memchr(&dwBOM, 0xFF, 4) - (char *)&dwBOM);
// Exactly one byte of the low half is non-zero (both zero is excluded by the
// preceding branch only if wBOM != 0 - NOTE(review): confirm memchr cannot be
// asked for byte 0 in a way that changes the result).
742 else if (LOBYTE(wBOM) == 0 || HIBYTE(wBOM) == 0)
// OR-ing the two bytes yields the non-zero one; add its index within dwBOM.
744 unsigned char cBOM = LOBYTE(wBOM) | HIBYTE(wBOM);
745 nByteOrder += static_cast<int>((char *)memchr(&dwBOM, cBOM, 4) - (char *)&dwBOM);
// Low three bytes are 0xEF, 0xBB, 0xBF when dwBOM was loaded on a
// little-endian host (the UTF-8 signature).
747 else if ((dwBOM & 0xFFFFFF) == 0xBFBBEF)
// Obtains an image of the source text and, for 16/32-bit encodings, converts
// it to UTF-8 in a newly allocated buffer.
// Parameters:
//   path  - file name, or (on one path) a value stored directly as the image
//           pointer
//   trunc - not referenced in the visible lines (TODO confirm its use)
//   flags - option bits; the Octets bits are combined with the guessed byte order
// NOTE(review): the branch conditions selecting the individual conversions
// are not visible in this excerpt.
// NOTE(review): every `new unsigned char[...]` below is followed by a null
// check; a plain new-expression reports failure by throwing, so these checks
// never fail.
759 CMarkdown::FileImage::FileImage(const TCHAR *path, size_t trunc, unsigned flags)
760 : pImage(nullptr), cbImage(0), nByteOrder(0), m_pSharedMemory(nullptr), pCopy(nullptr)
// Here path is taken as the image pointer itself.
764 pImage = (void *)(path);
// Otherwise the file is mapped read-only and the image spans the mapping.
772 m_pSharedMemory = new SharedMemory(file, SharedMemory::AM_READ);
773 pImage = m_pSharedMemory->begin();
774 cbImage = m_pSharedMemory->end() - m_pSharedMemory->begin();
780 if (pImage == nullptr)
// Needs at least four bytes: the first four are handed to GuessByteOrder(),
// whose result is kept in nByteOrder and masked with flags & Octets.
784 else if (cbImage >= 4 && (flags & Octets & (nByteOrder = GuessByteOrder(*(unsigned *)pImage))))
790 // big endian: swab first
792 pCopy = new unsigned char[cbImage];
793 if (pCopy != nullptr)
// Copy the image into pCopy, swapping the two bytes of every 16-bit unit.
795 for (size_t i = 0; i < cbImage / 2; ++i)
796 *((uint16_t *)pCopy + i) = Poco::ByteOrder::flipBytes(*((uint16_t *)pImage + i));
799 delete m_pSharedMemory;
801 if (pImage != nullptr)
// 16-bit text: number of 16-bit units in the image.
806 size_t cchImage = cbImage / 2;
807 uint16_t *pchImage = (uint16_t *)pImage;
// cbImage becomes the UTF-8 length; the buffer is allocated with that size.
813 cbImage = ucr::Utf8len_of_string(pchImage, cchImage);
814 pCopy = new unsigned char[cbImage];
815 if (pCopy != nullptr)
// NOTE(review): each 16-bit unit is converted on its own; confirm that
// surrogate pairs are handled as intended.
819 for (pu16 = (uint16_t *)pchImage, pu8 = (unsigned char *)pCopy; pu16 < pchImage + cchImage; ++pu16)
820 pu8 += ucr::Ucs4_to_Utf8(*pu16, pu8);
// The mapping is released once the converted copy exists.
822 delete m_pSharedMemory;
823 m_pSharedMemory = nullptr;
831 // odd word endianness: swab first
833 pCopy = new unsigned char[cbImage];
834 if (pCopy != nullptr)
836 for (size_t i = 0; i < cbImage / 2; ++i)
837 *((uint16_t *)pCopy + i) = Poco::ByteOrder::flipBytes(*((uint16_t *)pImage + i));
839 delete m_pSharedMemory;
840 m_pSharedMemory = nullptr;
842 if (pImage != nullptr)
// 32-bit text: cchImage is the image size in bytes, walked four bytes at a time.
848 size_t cchImage = cbImage;
849 char *pchImage = (char *)pImage;
// First pass: add up the UTF-8 length of every code point into cbImage.
857 for (size_t i = 0; i < cchImage; i += 4)
// memcpy avoids reading a possibly unaligned 32-bit value directly.
859 memcpy(&uch, pchImage + i, 4);
861 uch = ByteOrder::fromBigEndian(uch);
863 uch = ByteOrder::fromLittleEndian(uch);
864 cbImage += ucr::Utf8len_fromCodepoint(uch);
866 void *pCopy2 = new unsigned char[cbImage];
867 if (pCopy2 != nullptr)
// Second pass: write the UTF-8 bytes; cbImage serves as the running write
// offset (presumably reset before this loop - TODO confirm).
870 for (size_t i = 0; i < cchImage; i += 4)
872 memcpy(&uch, pchImage + i, 4);
874 uch = ByteOrder::fromBigEndian(uch);
876 uch = ByteOrder::fromLittleEndian(uch);
877 cbImage += ucr::Ucs4_to_Utf8(uch, (unsigned char *)pCopy2 + cbImage);
880 delete m_pSharedMemory;
881 m_pSharedMemory = nullptr;
891 CMarkdown::FileImage::~FileImage()
893 delete m_pSharedMemory;