From 048bdb7a085147e2293db20222a70dc0f80db93a Mon Sep 17 00:00:00 2001
From: Jochen Tucht <jtuc@users.sourceforge.net>
Date: Fri, 15 Jul 2005 08:22:13 +0000
Subject: [PATCH] PATCH: [ 1225880 ] Project file parsing based on CMarkdown
 class

---
 Src/ProjectFile.cpp | 142 ++++++++++++++++++++-------------------------
 Src/markdown.cpp    | 164 +++++++++++++++++++++++++++++++++++++++++++++++++---
 Src/markdown.h      |  21 +++++--
 3 files changed, 233 insertions(+), 94 deletions(-)
diff --git a/Src/ProjectFile.cpp b/Src/ProjectFile.cpp
index 7715e7ad1..bb4926311 100755
--- a/Src/ProjectFile.cpp
+++ b/Src/ProjectFile.cpp
@@ -24,6 +24,7 @@
 
 #include "stdafx.h"
 #include "ProjectFile.h"
+#include "markdown.h"
 
 ProjectFile::ProjectFile()
 {
@@ -31,104 +32,85 @@ ProjectFile::ProjectFile()
 }
 
 /** 
+ * @brief Get message from exception into sError, or else throw it.
+ */
+static BOOL NTAPI False(CException *e, CString *sError)
+{
+	if (sError == NULL) // caller gave no string to receive the message
+		throw e; // so pass the exception object on unchanged
+	TCHAR szError[1024];
+	e->GetErrorMessage(szError, 1024);
+	*sError = szError;
+	e->Delete(); // text has been copied out; the exception object is finished with
+	return FALSE; // always FALSE, so handlers can write `return False(e, sError);`
+}
+
+/** 
  * @brief Open given path-file and read data from it to member variables.
  */
 BOOL ProjectFile::Read(LPCTSTR path, CString *sError)
 {
-	ASSERT(sError != NULL);
-	CFile file;
-	CFileException e;
-
-	if (!file.Open(path, CFile::modeRead, &e))
+	try
 	{
-		TCHAR szError[1024];
-		e.GetErrorMessage(szError, 1024);
-		*sError = szError;
-		return FALSE;
-	}
-
-	char buf[4096] = {0};
-	TCHAR buf2[4096] = {0};
-	TCHAR tmpPath[MAX_PATH] = {0};
-	UINT bytesRead = file.Read(buf, 4095);
-
-	USES_CONVERSION;
-	_tcsncpy(buf2, A2T(buf), 4096);
-
-	if (_tcsstr(buf2, _T("<?xml")) && _tcsstr(buf2, _T("?>")))
-	{
-		TCHAR *pProject = _tcsstr(buf2, _T("<project>"));
-		
-		if (pProject)
+		CMarkdown::EntityMap entities;
+		entities.Load(); // NOTE(review): Load() is defined outside this patch; presumably fills in the predefined entities - confirm
+		CMarkdown::File xmlfile = path;
+		if (xmlfile.pImage == NULL) // FileImage's constructor starts with pImage = NULL; still NULL here means no file image was obtained
 		{
-			TCHAR *pPaths = _tcsstr(buf2, _T("<paths>"));
-			TCHAR *pLeft = _tcsstr(buf2, _T("<left>"));
-			TCHAR *pRight = _tcsstr(buf2, _T("<right>"));
-			TCHAR *pFilter = _tcsstr(buf2, _T("<filter>"));
-			TCHAR *pSubs = _tcsstr(buf2, _T("<subfolders>"));
-
-			CString subs;
-			GetVal(pPaths, pLeft, &m_leftFile, _T("<left>"), _T("</left>"), buf2);
-			GetVal(pPaths, pRight, &m_rightFile, _T("<right>"), _T("</right>"), buf2);
-			GetVal(pPaths, pFilter, &m_filter, _T("<filter>"), _T("</filter>"), buf2);
-			if (GetVal(pPaths, pSubs, &subs, _T("<subfolders>"), _T("</subfolders>"), buf2))
-				m_subfolders = _ttoi(subs);
+			CFileException::ThrowOsError(GetLastError(), path); // turn the OS error into an exception for the catch block below
 		}
+		// If encoding is other than UTF-8, assume CP_ACP
+		CMarkdown::String encoding = CMarkdown(xmlfile).Move("?xml").GetAttribute("encoding");
+		UINT codepage = lstrcmpiA(encoding.A, "UTF-8") == 0 ? CP_UTF8 : CP_ACP; // case-insensitive match on the declared encoding
+
+		CMarkdown project = CMarkdown(xmlfile).Move("project").Pop(); // each lookup works on a copy, so xmlfile itself is not advanced
+		CMarkdown paths = CMarkdown(project).Move("paths").Pop();
+		m_leftFile = CMarkdown::String(CMarkdown(paths).Move("left").GetInnerText()->Unicode(codepage)->Resolve(entities)).W; // presumably: element text -> wide string via codepage -> entity references expanded (helpers defined outside this patch)
+		m_rightFile = CMarkdown::String(CMarkdown(paths).Move("right").GetInnerText()->Unicode(codepage)->Resolve(entities)).W;
+		m_filter = CMarkdown::String(CMarkdown(paths).Move("filter").GetInnerText()->Unicode(codepage)->Resolve(entities)).W;
+		sscanf(CMarkdown::String(CMarkdown(paths).Move("subfolders").GetInnerText()).A, "%d", &m_subfolders); // NOTE(review): sscanf's result is ignored, so a non-numeric value would leave m_subfolders as it was
+	}
+	catch (CException *e)
+	{
+		return False(e, sError); // stores the message in *sError and yields FALSE, or rethrows when sError is NULL
 	}
-
-	file.Close();
-
 	return TRUE;
 }
 
 /** 
  * @brief Save data from member variables to path-file.
- * @note paths are converted to ASCII
+ * @note paths are converted to UTF-8
  */
 BOOL ProjectFile::Save(LPCTSTR path, CString *sError)
 {
-	UINT flags = CFile::modeCreate | CFile::modeWrite;
-	CFile file;
-	CFileException e;
-
-	if (!file.Open(path, flags,&e))
+	try
 	{
-		TCHAR szError[1024];
-		e.GetErrorMessage(szError, 1024);
-		*sError = szError;
-		
-		return FALSE;
+		static const char szFormat[] // printf-style template for the complete project file
+		(
+			"<?xml version='1.0' encoding='UTF-8'?>\n"
+			"<project>\n"
+			"\t<paths>\n"
+			"\t\t<left>%s</left>\n"
+			"\t\t<right>%s</right>\n"
+			"\t\t<filter>%s</filter>\n"
+			"\t\t<subfolders>%d</subfolders>\n"
+			"\t</paths>\n"
+			"</project>\n"
+		);
+		fprintf // NOTE(review): the return value is not checked, so a failed write goes unreported
+		(
+			CStdioFile(path, CFile::modeCreate|CFile::modeWrite|CFile::typeText).m_pStream, // temporary file object; it lives until the fprintf call expression ends
+			szFormat,
+			CMarkdown::String(CMarkdown::HSTR(GetLeft().AllocSysString())->Entities()->Octets(CP_UTF8)).A, // escape XML special characters, then (presumably, via Octets) encode as UTF-8
+			CMarkdown::String(CMarkdown::HSTR(GetRight().AllocSysString())->Entities()->Octets(CP_UTF8)).A,
+			CMarkdown::String(CMarkdown::HSTR(GetFilter().AllocSysString())->Entities()->Octets(CP_UTF8)).A,
+			GetSubfolders() ? 1 : 0 // always written as 0 or 1
+		);
+	}
+	catch (CException *e)
+	{
+		return False(e, sError); // stores the message in *sError and yields FALSE, or rethrows when sError is NULL
 	}
-
-	TCHAR buf2[4096] = {0};
-	
-	_tcscpy(buf2,_T("<?xml version=\"1.0\"?>\n<project>\n\t<paths>\n\t\t"));
-	
-	_tcscat(buf2,_T("<left>"));
-	_tcscat(buf2,GetLeft());
-	_tcscat(buf2,_T("</left>\n\t\t"));
-	_tcscat(buf2,_T("<right>"));
-	_tcscat(buf2,GetRight());
-	_tcscat(buf2,_T("</right>\n\t\t"));
-	_tcscat(buf2,_T("<filter>"));
-	_tcscat(buf2,GetFilter());
-	_tcscat(buf2,_T("</filter>\n\t\t"));
-	_tcscat(buf2,_T("<subfolders>"));
-	_tcscat(buf2,GetSubfolders() ? _T("1") : _T("0"));
-	_tcscat(buf2,_T("</subfolders>\n"));
-	
-	_tcscat(buf2,_T("\t</paths>\n</project>"));
-
-	// convert the string from unicode to ascii, because Read is expecting ascii
-	char buf[4096] = {0};
-	
-	USES_CONVERSION;
-	strncpy(buf, T2A(buf2), 4096);
-
-
-	file.Write(buf,strlen(buf));
-	file.Close();
-
 	return TRUE;
 }
 
diff --git a/Src/markdown.cpp b/Src/markdown.cpp
index d612b2107..70284a441 100644
--- a/Src/markdown.cpp
+++ b/Src/markdown.cpp
@@ -65,6 +65,15 @@ DATE:		BY:					DESCRIPTION:
 ==========	==================	================================================
 2005/01/15	Jochen Tucht		Created
 2005/02/26	Jochen Tucht		Load iconv.dll through DLLPSTUB
+2005/03/20	Jochen Tucht		Add IgnoreCase option for ASCII-7 tag/attr names.
+								Add HtmlUTags option to check for (potentially)
+								unbalanced HTML tags. Html option is combination
+								of the above. Using these options imposes a
+								performance penalty, so avoid them if you can.
+								New flag CMarkdown::FileImage::Handle makes
+								CMarkdown::FileImage::FileImage() accept a
+								handle rather than a filename.
+2005/06/22	Jochen Tucht		New method CMarkdown::_HSTR::Entities().
 */
 
 #include "stdafx.h"
@@ -256,6 +265,55 @@ CMarkdown::HSTR CMarkdown::_HSTR::Resolve(const CMarkdown::EntityMap &map)
 	return H;
 }
 
+CMarkdown::HSTR CMarkdown::_HSTR::Entities() // replaces & " ' < > in the string with their entity references; returns the (possibly reallocated) string
+{
+	HSTR H = this;
+	BSTR p, q = H->B;
+	while (*(p = q)) // p = character under inspection; stop at the terminating zero
+	{
+		OLECHAR *value = 0; // stays 0 for characters that need no escaping
+		switch (*p)
+		{
+		case '&': value = L"&amp;"; break;
+		case '"': value = L"&quot;"; break;
+		case '\'': value = L"&apos;"; break;
+		case '<' : value = L"&lt;"; break;
+		case '>' : value = L"&gt;"; break;
+		}
+		++q; // q now points just past the current character
+		if (value)
+		{
+			int i = p - H->B; // remember positions as offsets: the buffer may move when reallocated
+			int j = q - H->B;
+			int cchValue = lstrlenW(value);
+			if (int cchGrow = cchValue - 1) // net length change, since the entity replaces exactly one character
+			{
+				BSTR B = H->B;
+				int b = SysStringLen(B);
+				size_t cbMove = (b - j) * sizeof(OLECHAR); // size in bytes of the tail that follows the current character
+				if (cchGrow < 0) // shrinking case: pull the tail left before reallocating (not reached for the entities above, which are all longer than one character)
+				{
+					memmove(q + cchGrow, q, cbMove);
+				}
+				if (!SysReAllocStringLen(&B, B, b + cchGrow))
+				{
+					continue; // reallocation failed: this character stays unescaped and scanning resumes at q
+				}
+				H = (HSTR)B;
+				p = H->B + i; // rebase both pointers into the reallocated buffer
+				q = H->B + j;
+				if (cchGrow > 0) // growing case: push the tail right to open a gap for the entity
+				{
+					memmove(q + cchGrow, q, cbMove);
+				}
+			}
+			memcpy(p, value, cchValue * sizeof(OLECHAR)); // overwrite the character (and the gap) with the entity text
+			q = p + cchValue; // continue after the inserted entity so its own '&' is not escaped again
+		}
+	}
+	return H;
+}
+
 CMarkdown::HSTR CMarkdown::_HSTR::Trim(const OLECHAR *pszTrimChars)
 {
 	HSTR H = this;
@@ -267,8 +325,57 @@ CMarkdown::HSTR CMarkdown::_HSTR::Trim(const OLECHAR *pszTrimChars)
 	return H;
 }
 
-CMarkdown::CMarkdown(const char *upper, const char *ahead):
-first(0), lower(0), upper(upper), ahead(ahead)
+//This is a hopefully complete list of the 36 (?) (potentially) unbalanced HTML
+//tags. It is based on tags.c from Tidy library,
+//"http://cvs.sourceforge.net/viewcvs.py/*checkout*/tidy/tidy/src/tags.c?rev=1.55".
+//It should include all tags from tag_defs[] array which are flagged either
+//CM_EMPTY (no closing tag) or CM_OPT (optional closing tag).
+
+static const char htmlUTags[] // names are packed back to back, each ending in '\0'; FindTag() stops at the first empty name, i.e. at the literal's own terminator
+(
+	"area\0"
+	"base\0"
+	"basefont\0"
+	"body\0"
+	"br\0"
+	"col\0"
+	"colgroup\0"
+	"dd\0"
+	"dt\0"
+	"frame\0"
+	"head\0"
+	"hr\0"
+	"html\0"
+	"img\0"
+	"input\0"
+	"isindex\0"
+	"li\0"
+	"link\0"
+	"meta\0"
+	"optgroup\0"
+	"option\0"
+	"p\0"
+	"param\0"
+	"tbody\0"
+	"td\0"
+	"tfoot\0"
+	"th\0"
+	"thead\0"
+	"tr\0"
+	"nextid\0"
+	/* proprietary elements */
+	"bgsound\0"	//MICROSOFT
+	"embed\0"	//NETSCAPE
+	"keygen\0"	//NETSCAPE
+	"marquee\0"	//MICROSOFT
+	"spacer\0"	//NETSCAPE
+	"wbr\0"		//PROPRIETARY
+);
+
+CMarkdown::CMarkdown(const char *upper, const char *ahead, unsigned flags):
+first(0), lower(0), upper(upper), ahead(ahead),
+memcmp(flags & IgnoreCase ? ::memicmp : ::memcmp),
+utags(flags & HtmlUTags ? htmlUTags : NULL)
 {
 	if (CMarkdown::ahead > CMarkdown::upper)
 	{
@@ -285,6 +392,25 @@ CMarkdown::operator bool()
 	);
 }
 
+int CMarkdown::FindTag(const char *tags, const char *markup) // returns the length of the first name in tags that markup starts with, or 0 if none does
+{
+	while (int len = lstrlenA(tags)) // an empty name marks the end of the '\0'-separated list
+	{
+		unsigned char c;
+		if
+		(
+			ahead - markup > len // enough input left for the name plus the character after it
+		&&	memcmp(markup, tags, len) == 0 // member function pointer: ::memicmp or ::memcmp, as selected by the constructor
+		&&	(isspace(c = markup[len]) || c == '[' || c == '>' || c == '"' || c == '\'' || c == '=') // the name must be followed by a delimiter, not by further name characters
+		)
+		{
+			return len;
+		}
+		tags += len + 1; // step over this name and its '\0'
+	}
+	return 0;
+}
+
 void CMarkdown::Scan()
 {
 	if (first == upper && *this)
@@ -355,7 +481,7 @@ void CMarkdown::Scan()
 				}
 				break;
 			case '>':
-				if (upper[-2] == '/')
+				if (upper[-2] == '/' || utags && FindTag(utags, first + 1))
 					--depth;
 				break;
 			case '<':
@@ -369,9 +495,21 @@ void CMarkdown::Scan()
 CMarkdown &CMarkdown::Move()
 {
 	Scan();
-	while (*this && *upper != '<')
+	for (;;)
 	{
-		++upper;
+		while (*this && *upper != '<')
+		{
+			++upper;
+		}
+		if (utags && MAKEWORD(upper[0], upper[1]) == MAKEWORD('<', '/'))
+		{
+			if (int utlen = FindTag(utags, upper + 2))
+			{
+				upper += 2 + utlen;
+				continue;
+			}
+		}
+		break;
 	}
 	first = lower = upper;
 	return *this;
@@ -434,7 +572,7 @@ bool CMarkdown::Pull()
 		{
 			++lower;
 		}
-		if (lower[-1] != '/' && lower[-1] != '?')
+		if (lower[-1] != '/' && lower[-1] != '?' && !(utags && FindTag(utags, first + 1)))
 		{
 			upper = lower;
 			return true;
@@ -480,7 +618,7 @@ CMarkdown::HSTR CMarkdown::GetTagName()
 		}
 		else
 		{
-			while (q < ahead && !isspace(c = *q) && c != '[' && c != '>' && c != '"' && c != '\'' && c != '=' )
+			while (q < ahead && !isspace(c = *q) && c != '[' && c != '>' && c != '"' && c != '\'' && c != '=')
 			{
 				++q;
 			}
@@ -712,7 +850,12 @@ LPVOID NTAPI CMarkdown::FileImage::MapFile(HANDLE hFile, DWORD dwSize)
 CMarkdown::FileImage::FileImage(LPCTSTR path, DWORD trunc, int flags):
 pImage(NULL)
 {
-	HANDLE hFile = CreateFile(path, GENERIC_READ, FILE_SHARE_READ, NULL, OPEN_EXISTING, 0, 0);
+	HANDLE hFile
+	(
+		flags & Handle
+	?	HANDLE(path)
+	:	CreateFile(path, GENERIC_READ, FILE_SHARE_READ, NULL, OPEN_EXISTING, 0, 0)
+	);
 	if (hFile != INVALID_HANDLE_VALUE)
 	{
 		cbImage = GetFileSize(hFile, 0);
@@ -756,7 +899,10 @@ pImage(NULL)
 				}
 			}
 		}
-		CloseHandle(hFile);
+		if (!(flags & Handle))
+		{
+			CloseHandle(hFile);
+		}
 	}
 	if (pImage == NULL)
 	{
diff --git a/Src/markdown.h b/Src/markdown.h
index dd1fe4c9b..fad9bb0a1 100644
--- a/Src/markdown.h
+++ b/Src/markdown.h
@@ -122,6 +122,7 @@ public:
 		// Convert(converter) converts string using an ICONV descriptor
 		_HSTR *Convert(const Converter &);
 		_HSTR *Resolve(const EntityMap &);
+		_HSTR *Entities();
 		_HSTR *Trim(const OLECHAR *);
 	} *HSTR;
 	union String
@@ -182,7 +183,13 @@ public:
 	const char *lower;	// beginning of enclosed text (valid after Move)
 	const char *upper;	// end of enclosed text (initially beginning of file)
 	const char *ahead;	// last char of file
-	CMarkdown(const char *upper, const char *ahead);
+	enum
+	{
+		IgnoreCase = 0x01,			// makes the constructor select ::memicmp rather than ::memcmp
+		HtmlUTags = 0x02,			// check for unbalanced tags
+		Html = IgnoreCase|HtmlUTags	// shortcut
+	};
+	CMarkdown(const char *upper, const char *ahead, unsigned flags = 0);
 	operator bool();				// is node ahead?
 	void Scan();					// find closing tag
 	CMarkdown &Move();				// move to next node
@@ -196,6 +203,9 @@ public:
 	HSTR GetOuterText();			// text including enclosing tags
 	HSTR GetAttribute(const char *, const void * = 0); // random or enumerate
 private:
+	int (__cdecl *const memcmp)(const void *, const void *, size_t);
+	const char *const utags;
+	int FindTag(const char *, const char *);
 	class Token;
 };
 
@@ -207,7 +217,8 @@ public:
 	LPVOID pImage;
 	enum
 	{
-		Octets = 1
+		Octets = 0x10, // presumably moved off 0x01 so it cannot collide with CMarkdown::IgnoreCase, since File passes one flags value to both base constructors
+		Handle = 0x20 // the LPCTSTR constructor argument carries an already open HANDLE, which the constructor then leaves open
 	};
 	FileImage(LPCTSTR, DWORD trunc = 0, int flags = 0);
 	~FileImage();
@@ -218,9 +229,9 @@ class CMarkdown::File : public CMarkdown::FileImage, public CMarkdown
 {
 //	Construct CMarkdown object from file.
 public:
-	File(LPCTSTR path, DWORD trunc = 0):
-	CMarkdown::FileImage(path, trunc, Octets),
-	CMarkdown((const char *)pImage, (const char *)pImage + cbImage)
+	File(LPCTSTR path, DWORD trunc = 0, unsigned flags = Octets): // the default reproduces the previously hard-coded Octets
+	CMarkdown::FileImage(path, trunc, flags), // the same flags value is handed to both base classes
+	CMarkdown((const char *)pImage, (const char *)pImage + cbImage, flags)
 	{
 	}
 };
-- 
2.11.0