From d622cc25bbe1d1dda36d5e88f8171b777a6ae31f Mon Sep 17 00:00:00 2001
From: Jochen Tucht <jtuc@users.sourceforge.net>
Date: Sat, 20 Aug 2005 06:50:18 +0000
Subject: [PATCH] PATCH: [ 1247875 ] codepage detection based on CMarkdown
 class

---
 Src/codepage_detect.cpp | 658 +++++++++---------------------------------------
 Src/codepage_detect.h   |   4 +-
 Src/markdown.cpp        | 182 ++++++++++----
 Src/markdown.h          |  11 +-
 4 files changed, 262 insertions(+), 593 deletions(-)

diff --git a/Src/codepage_detect.cpp b/Src/codepage_detect.cpp
index 6edcd8529..5f182d433 100644
--- a/Src/codepage_detect.cpp
+++ b/Src/codepage_detect.cpp
@@ -8,11 +8,12 @@
 // $Id$
 
 #include "StdAfx.h"
+#include <shlwapi.h>
 #include "codepage_detect.h"
-#include "UniFile.h"
 #include "unicoder.h"
 #include "codepage.h"
 #include "charsets.h"
+#include "markdown.h"
 
 #ifdef _DEBUG
 #define new DEBUG_NEW
@@ -20,384 +21,146 @@
 static char THIS_FILE[] = __FILE__;
 #endif
 
-
-static bool GuessEncoding_html_from_unifile(UniFile * pufile, int * encoding, int * codepage);
-static bool GuessEncoding_xml_from_unifile(UniFile * pufile, int * encoding, int * codepage);
-static bool GuessEncoding_rc_from_unifile(UniFile * pufile, int * encoding, int * codepage);
-static bool codepage_from_html_line(LPCSTR line, int * codepage);
-static bool codepage_from_html_line(LPCWSTR line, int * codepage);
-static bool codepage_from_xml_line(LPCSTR line, int * codepage);
-static bool codepage_from_xml_line(LPCWSTR line, int * codepage);
-static bool codepage_from_rc_line(LPCSTR line, int * codepage);
-static bool codepage_from_rc_line(LPCWSTR line, int * codepage);
-static bool isValidCodepage(int cp);
-static bool encoding_from_attrib_value(LPCSTR valstart, int * pEncodingId);
-static bool encoding_from_attrib_value(LPCWSTR valstart, int * pEncodingId);
-static const char *stristr(const char * szStringToBeSearched, const char * szSubstringToSearchFor);
-static const wchar_t *wcsistr(const wchar_t * szStringToBeSearched, const wchar_t * szSubstringToSearchFor);
-
-/** 
- * @brief Default constructor setting color to black.
- */
 /**
- * @brief Try to deduce encoding for this file
+ * @brief Is specified codepage number valid on this system?
  */
-void
-GuessCodepageEncoding(const CString & filepath, int * unicoding, int * codepage,
-                      BOOL bGuessEncoding)
+static unsigned ValidCodepage(unsigned cp)
 {
-	UniMemFile ufile;
-	UniFile * pufile = &ufile;
-
-	if (!pufile->OpenReadOnly(filepath))
-	{
-		*unicoding = ucr::NONE;
-		*codepage = getDefaultCodepage();
-		return;
-	}
-	bool hasbom = pufile->ReadBom();
-	*unicoding = pufile->GetUnicoding();
-	*codepage = pufile->GetCodepage();
-	if (!hasbom && bGuessEncoding)
-	{
-		if (!filepath.Right(4).CompareNoCase(_T(".htm"))
-			|| !filepath.Right(5).CompareNoCase(_T(".html")))
-		{
-			GuessEncoding_html_from_unifile(pufile, unicoding, codepage);
-		}
-		else if (!filepath.Right(4).CompareNoCase(_T(".xml"))
-			|| !filepath.Right(5).CompareNoCase(_T(".xsl")))
-		{
-			GuessEncoding_xml_from_unifile(pufile, unicoding, codepage);
-		}
-		else if (!filepath.Right(3).CompareNoCase(_T(".rc")))
-		{
-			GuessEncoding_rc_from_unifile(pufile, unicoding, codepage);
-		}
-	}
-	pufile->Close();
-}
-
-/*** Return true if text begins with prefix (case-insensitive), for char */
-static bool
-StartsWithInsensitive(LPCSTR text, LPCSTR prefix, int prefixlen)
-{
-	return 0 == strnicmp(text, prefix, prefixlen);
-}
-/*** Return true if text begins with prefix (case-insensitive), for wchar_t */
-static bool
-StartsWithInsensitive(LPCWSTR text, LPCWSTR prefix, int prefixlen)
-{
-	return 0 == wcsnicmp(text, prefix, prefixlen);
+	return cp && isCodepageSupported(cp) ? cp : 0;
 }
 
 /**
  * @brief Parser for HTML files to find encoding information
- *
- * To be removed when plugin event added for this
  */
-static bool
-GuessEncoding_html_from_unifile(UniFile * pufile, int * encoding, int * codepage)
+static unsigned demoGuessEncoding_html(const char *src, size_t len)
 {
-	CString line, eol;
-	while (1)
+	CMarkdown markdown(src, src + len, CMarkdown::Html);
+	//As <html> and <head> are optional, there is nothing to pull...
+	//markdown.Move("html").Pop().Move("head").Pop();
+	while (markdown.Move("meta"))
 	{
-		if (pufile->GetLineNumber() > 30)
-			break;
-		if (!pufile->ReadString(line, eol))
-			break;
-		if (codepage_from_html_line(line, codepage))
-			return true;
-
-	}
-	return false;
-}
-
-/**
- * @brief Parser for HTML files to find encoding information
- */
-static bool demoGuessEncoding_html(const char **data, int count, int * cp)
-{
-	if (count > 30)
-		count = 30;
-	while (count--)
-	{
-		const char *line = *data++;
-		if (codepage_from_html_line(line, cp))
-			return true;
-	}
-	return false;
-}
-
-
-/**
- * @brief Deduce codepage from this line of text from an HTML file, if we can
- *
- * char version
- *
- * @todo It is unfortunate to have both a char and a wchar_t version of this.
- */
-static bool
-codepage_from_html_line(LPCSTR line, int * codepage)
-{
-	/** @todo This is not a very complete matching algorithm */
-	static LPCSTR metapref = "<meta http-equiv=\"Content-Type\" content=\"text/html; charset=";
-	static int metalen = strlen(metapref);
-
-	LPCSTR cpcandidate = stristr(line, metapref);
-	if (!cpcandidate)
-		return false;
-	cpcandidate += metalen;
-
-	int encodingId = 0;
-	if (encoding_from_attrib_value(cpcandidate, &encodingId))
-	{
-		*codepage = GetEncodingCodePageFromId(encodingId);
-		if (*codepage)
-			return true;
-	}
-
-	return false;
-}
-
-/** 
- * @brief Deduce codepage from this line of text from an HTML file, if we can
- *
- * wchar_t version
- */
-static bool
-codepage_from_html_line(LPCWSTR line, int * codepage)
-{
-	/** @todo This is not a very complete matching algorithm */
-	static LPCWSTR metapref = L"<meta http-equiv=\"Content-Type\" content=\"text/html; charset=";
-	static int metalen = wcslen(metapref);
-
-	LPCWSTR cpcandidate = wcsistr(line, metapref);
-	if (!cpcandidate)
-		return false;
-	cpcandidate += metalen;
-
-	int encodingId = 0;
-	if (encoding_from_attrib_value(cpcandidate, &encodingId))
-	{
-		*codepage = GetEncodingCodePageFromId(encodingId);
-		if (*codepage)
-			return true;
+		CMarkdown::String http_equiv = markdown.GetAttribute("http-equiv");
+		if (http_equiv.A && lstrcmpiA(http_equiv.A, "content-type") == 0)
+		{
+			CMarkdown::String content = markdown.GetAttribute("content");
+			if (char *pchKey = content.A)
+			{
+				while (int cchKey = strcspn(pchKey += strspn(pchKey, "; \t\r\n"), ";="))
+				{
+					char *pchValue = pchKey + cchKey;
+					int cchValue = strcspn(pchValue += strspn(pchValue, "= \t\r\n"), "; \t\r\n");
+					if (cchKey >= 7 && memicmp(pchKey, "charset", 7) == 0 && (cchKey == 7 || strchr(" \t\r\n", pchKey[7])))
+					{
+						pchValue[cchValue] = '\0';
+						// Is it an encoding name known to charsets module ?
+						unsigned encodingId = GetEncodingIdFromName(pchValue);
+						if (encodingId == 0)
+						{
+							if (unsigned codepage = atoi(pchValue))
+							{
+								encodingId = GetEncodingIdFromCodePage(codepage);
+							}
+						}
+						if (encodingId)
+						{
+							return GetEncodingCodePageFromId(encodingId);
+						}
+						return 0;
+					}
+					pchKey = pchValue + cchValue;
+				}
+			}
+		}
 	}
-
-	return false;
+	return 0;
 }
 
 /**
  * @brief Parser for XML files to find encoding information
  */
-static bool
-GuessEncoding_xml_from_unifile(UniFile * pufile, int * encoding, int * codepage)
-{
-	CString line, eol;
-	while (1)
-	{
-		if (pufile->GetLineNumber() > 30)
-			break;
-		if (!pufile->ReadString(line, eol))
-			break;
-		if (codepage_from_xml_line(line, codepage))
-			return true;
-
-	}
-	return false;
-}
-
-/**
- * @brief Parser for HTML files to find encoding information
- *
- * To be removed when plugin event added for this
- */
-static bool demoGuessEncoding_xml(const char **data, int count, int * cp)
-{
-	if (count > 30)
-		count = 30;
-	while (count--)
-	{
-		const char *line = *data++;
-		if (codepage_from_xml_line(line, cp))
-			return true;
-	}
-	return false;
-}
-
-/** 
- * @brief Deduce codepage from this line of text from an XML file, if we can
- *
- * char version
- *
- * @todo It is unfortunate to have both a char and a wchar_t version of this.
- */
-static bool
-codepage_from_xml_line(LPCSTR line, int * codepage)
-{
-	/** @todo This is not a very complete matching algorithm */
-	static LPCSTR metapref = "<?xml version=\"1.0\" encoding=";
-	static int metalen = strlen(metapref);
-
-	LPCSTR cpcandidate = stristr(line, metapref);
-	if (!cpcandidate)
-		return false;
-	cpcandidate += metalen;
-
-	int encodingId = 0;
-	if (encoding_from_attrib_value(cpcandidate, &encodingId))
-	{
-		*codepage = GetEncodingCodePageFromId(encodingId);
-		if (*codepage)
-			return true;
-	}
-
-	return false;
-}
-
-/** 
- * @brief Deduce codepage from this line of text from an XML file, if we can
- *
- * wchar_t version
- */
-static bool
-codepage_from_xml_line(LPCWSTR line, int * codepage)
+static unsigned demoGuessEncoding_xml(const char *src, size_t len)
 {
-	/** @todo This is not a very complete matching algorithm */
-	static LPCWSTR metapref = L"<?xml version=\"1.0\" encoding=";
-	static int metalen = wcslen(metapref);
-
-	LPCWSTR cpcandidate = wcsistr(line, metapref);
-	if (!cpcandidate)
-		return false;
-	cpcandidate += metalen;
-	
-	int encodingId = 0;
-	if (encoding_from_attrib_value(cpcandidate, &encodingId))
+	CMarkdown xml(src, src + len);
+	if (xml.Move("?xml"))
 	{
-		*codepage = GetEncodingCodePageFromId(encodingId);
-		if (*codepage)
-			return true;
+		CMarkdown::String encoding = xml.GetAttribute("encoding");
+		if (encoding.A)
+		{
+			// Is it an encoding name known to charsets module ?
+			unsigned encodingId = GetEncodingIdFromName(encoding.A);
+			if (encodingId == 0)
+			{
+				if (unsigned codepage = atoi(encoding.A))
+				{
+					encodingId = GetEncodingIdFromCodePage(codepage);
+				}
+			}
+			if (encodingId)
+			{
+				return GetEncodingCodePageFromId(encodingId);
+			}
+		}
 	}
-
-	return false;
+	return 0;
 }
 
 /**
  * @brief Parser for rc files to find encoding information
  */
-static bool
-GuessEncoding_rc_from_unifile(UniFile * pufile, int * encoding, int * codepage)
+static unsigned demoGuessEncoding_rc(const char *src, size_t len)
 {
-	CString line, eol;
-	while (1)
+	unsigned cp = 0;
+	const char *line = 0;
+	do
 	{
-		if (pufile->GetLineNumber() > 30)
-			break;
-		if (!pufile->ReadString(line, eol))
-			break;
-		if (codepage_from_rc_line(line, codepage))
-			return true;
-	}
-	return false;
+		while (len && (*src == '\r' || *src == '\n'))
+		{
+			++src;
+			--len;
+		}
+		line = src;
+		while (len && *src != '\r' && *src != '\n')
+		{
+			++src;
+			--len;
+		}
+	} while (len && sscanf(line, "#pragma code_page(%d)", &cp) != 1);
+	return ValidCodepage(cp);
 }
 
-
 /**
- * @brief Parser for rc files to find encoding information
+ * @brief Try to deduce encoding for this file
  */
-static bool demoGuessEncoding_rc(const char **data, int count, int * cp)
+unsigned GuessEncoding_from_bytes(LPCTSTR ext, const char *src, size_t len)
 {
-	if (count > 30)
-		count = 30;
-	while (count--)
+	if (len > 4096)
+		len = 4096;
+	unsigned cp = 0;
+	if (lstrcmpi(ext, _T(".rc")) ==  0)
 	{
-		const char *line = *data++;
-		if (codepage_from_rc_line(line, cp))
-			return true;
+		cp = demoGuessEncoding_rc(src, len);
 	}
-	return false;
-}
-
-/**
- * @brief Deduce codepage from this line of text from an HTML file, if we can
- *
- * char version
- *
- * @todo It is unfortunate to have both a char and a wchar_t version of this.
- */
-static bool
-codepage_from_rc_line(LPCSTR line, int * codepage)
-{
-	int cp=0;
-	if (1 == sscanf(line, "#pragma code_page(%d)", &cp)
-		&& isValidCodepage(cp))
+	else if (lstrcmpi(ext, _T(".htm")) == 0 || lstrcmpi(ext, _T(".html")) == 0)
 	{
-		*codepage = cp;
-		return true;
+		cp = demoGuessEncoding_html(src, len);
 	}
-	return false;
-
-}
-
-/**
- * @brief Deduce codepage from this line of text from an HTML file, if we can
- *
- * wchar_t version
- *
- * @todo It is unfortunate to have both a char and a wchar_t version of this.
- */
-static bool
-codepage_from_rc_line(LPCWSTR line, int * codepage)
-{
-	int cp=0;
-	if (1 == swscanf(line, L"#pragma code_page(%d)", &cp)
-		&& isValidCodepage(cp))
+	else if (lstrcmpi(ext, _T(".xml")) == 0 || lstrcmpi(ext, _T(".xsl")) == 0)
 	{
-		*codepage = cp;
-		return true;
+		cp = demoGuessEncoding_xml(src, len);
 	}
-	return false;
-
-}
-
-/**
- * @brief Is specified codepage number valid on this system?
- */
-static bool isValidCodepage(int cp)
-{
-	return isCodepageSupported(cp);
+	return cp;
 }
 
 /**
  * @brief Try to deduce encoding for this file
  */
-bool
-GuessEncoding_from_bytes(const CString & sExt, const char **data, int count, int *codepage)
+bool GuessEncoding_from_bytes(LPCTSTR ext, const char **data, int count, int *codepage)
 {
-	if (lstrcmpi(sExt, _T(".rc")) ==  0)
-	{
-		int cp=0;
-		if (demoGuessEncoding_rc(data, count, &cp))
-		{
-			*codepage = cp;
-			return true;
-		}
-	}
-	else if (lstrcmpi(sExt, _T(".htm")) == 0 || lstrcmpi(sExt, _T(".html")) == 0)
-	{
-		int cp=0;
-		if (demoGuessEncoding_html(data, count, &cp))
-		{
-			*codepage = cp;
-			return true;
-		}
-	}
-	else if (lstrcmpi(sExt, _T(".xml")) == 0 || lstrcmpi(sExt, _T(".xsl")) == 0)
+	if (data)
 	{
-		int cp=0;
-		if (demoGuessEncoding_xml(data, count, &cp))
+		const char *src = data[0];
+		size_t len = data[count] - src;
+		if (unsigned cp = GuessEncoding_from_bytes(ext, src, len))
 		{
 			*codepage = cp;
 			return true;
@@ -407,219 +170,30 @@ GuessEncoding_from_bytes(const CString & sExt, const char **data, int count, int
 }
 
 /**
- * @brief Parse an xml or html attribute into an encoding id from charset.h
- */
-static bool
-encoding_from_attrib_value(LPCSTR valstart, int * pEncodingId)
-{
-	static char buffer[128];
-	LPCSTR end = valstart;
-	int offset = 0;
-
-	// copy candidate value into buffer (using appropriate delimiter)
-	if (*valstart == '\'')
-	{
-		++end;
-		// single quoted attribute
-		while (*end && (end - valstart < sizeof(buffer)-1)
-			&& *end != '\'')
-		{
-			buffer[offset] = *end;
-			++end;
-			++offset;
-		}
-	}
-	else if (*valstart == '"')
-	{
-		++end;
-		// double quoted attribute
-		while (*end && (end - valstart < sizeof(buffer)-1)
-			&& *end != '"')
-		{
-			buffer[offset] = *end;
-			++end;
-			++offset;
-		}
-	}
-	else
-	{
-		// unquoted attibute, so watch for space or end tag
-		while (*end && (end - valstart < sizeof(buffer)-1)
-			&& *end != ' ' && *end != '>')
-		{
-			buffer[offset] = *end;
-			++end;
-			++offset;
-		}
-	}
-	// must zero-terminate buffer
-	buffer[offset] = 0;
-
-	// Is it an encoding name known to charsets module ?
-	*pEncodingId = GetEncodingIdFromName(buffer);
-	// GetEncodingIdFromName returns non-zero if valid
-	if (*pEncodingId != 0)
-		return true;
-
-	// Is it a codepage known to charsets module ?
-	int cpnum = 0;
-	if (1 == sscanf(buffer, "%d", &cpnum))
-	{
-		*pEncodingId = GetEncodingIdFromCodePage(cpnum);
-		// GetEncodingIdFromName returns non-zero if valid
-		if (*pEncodingId != 0)
-			return true;
-	}
-
-	return false;
-}
-
-/**
- * @brief Parse an xml or html attribute into an encoding id from charset.h
+ * @brief Try to deduce encoding for this file
  */
-static bool
-encoding_from_attrib_value(LPCWSTR valstart, int * pEncodingId)
+void GuessCodepageEncoding(LPCTSTR filepath, int *unicoding, int *codepage, BOOL bGuessEncoding)
 {
-	static wchar_t buffer[128];
-	LPCWSTR end = valstart;
-	int offset = 0;
-
-	// copy candidate value into buffer (using appropriate delimiter)
-	if (*valstart == '\'')
-	{
-		++end;
-		// single quoted attribute
-		while (*end && (end - valstart < sizeof(buffer)-1)
-			&& *end != '\'')
-		{
-			buffer[offset] = *end;
-			++end;
-			++offset;
-		}
-	}
-	else if (*valstart == '"')
+	CMarkdown::FileImage fi(filepath, 4096);
+	*unicoding = ucr::NONE;
+	switch (fi.nByteOrder)
 	{
-		++end;
-		// double quoted attribute
-		while (*end && (end - valstart < sizeof(buffer)-1)
-			&& *end != '"')
-		{
-			buffer[offset] = *end;
-			++end;
-			++offset;
-		}
+	case 8 + 2 + 0:
+		*unicoding = ucr::UCS2LE;
+		break;
+	case 8 + 2 + 1:
+		*unicoding = ucr::UCS2BE;
+		break;
+	case 8:
+		*unicoding = ucr::UTF8;
+		break;
 	}
-	else
+	if (fi.nByteOrder == 0 && bGuessEncoding)
 	{
-		// unquoted attibute, so watch for space or end tag
-		while (*end && (end - valstart < sizeof(buffer)-1)
-			&& *end != ' ' && *end != '>')
+		LPCTSTR ext = PathFindExtension(filepath);
+		if (unsigned cp = GuessEncoding_from_bytes(ext, (char *)fi.pImage, fi.cbImage))
 		{
-			buffer[offset] = *end;
-			++end;
-			++offset;
+			*codepage = cp;
 		}
 	}
-	// must zero-terminate buffer
-	buffer[offset] = 0;
-
-	// Is it an encoding name known to charsets module ?
-	USES_CONVERSION;
-	*pEncodingId = GetEncodingIdFromName(W2A(buffer));
-	// GetEncodingIdFromName returns non-zero if valid
-	if (*pEncodingId != 0)
-		return true;
-
-	// Is it a codepage known to charsets module ?
-	int cpnum = 0;
-	if (1 == swscanf(buffer, L"%d", &cpnum))
-	{
-		*pEncodingId = GetEncodingIdFromCodePage(cpnum);
-		// GetEncodingIdFromName returns non-zero if valid
-		if (*pEncodingId != 0)
-			return true;
-	}
-
-	return false;
 }
-
-
-static const char *
-stristr(const char * szStringToBeSearched, const char * szSubstringToSearchFor)
-{
-	const char * pPos = NULL;
-	char * szCopy1 = NULL;
-	char * szCopy2 = NULL;
-
-	// verify parameters
-	if (szStringToBeSearched == NULL || szSubstringToSearchFor == NULL)
-	{
-		return szStringToBeSearched;
-	}
-
-	// empty substring - return input (consistent with strstr)
-	if (strlen(szSubstringToSearchFor) == 0)
-		return szStringToBeSearched;
-
-	szCopy1 = strlwr(strdup(szStringToBeSearched));
-	szCopy2 = strlwr(strdup(szSubstringToSearchFor));
-
-	if ( szCopy1 == NULL || szCopy2 == NULL  ) {
-		// another option is to raise an exception here
-		free((void*)szCopy1);
-		free((void*)szCopy2);
-		return NULL;
-	}
-
-	pPos = strstr(szCopy1, szCopy2);
-
-	if ( pPos != NULL ) {
-		// map to the original string
-		pPos = szStringToBeSearched + (pPos - szCopy1);
-	}
-
-	free((void*)szCopy1);
-	free((void*)szCopy2);
-
-	return pPos;
-} // stristr(...)
-
-static const wchar_t *
-wcsistr(const wchar_t * szStringToBeSearched, const wchar_t * szSubstringToSearchFor)
-{
-	const wchar_t * pPos = NULL;
-	wchar_t * szCopy1 = NULL;
-	wchar_t * szCopy2 = NULL;
-
-	// verify parameters
-	if (szStringToBeSearched == NULL || szSubstringToSearchFor == NULL)
-	{
-		return szStringToBeSearched;
-	}
-
-	// empty substring - return input (consistent with strstr)
-	if (wcslen(szSubstringToSearchFor) == 0)
-		return szStringToBeSearched;
-
-	szCopy1 = wcslwr(wcsdup(szStringToBeSearched));
-	szCopy2 = wcslwr(wcsdup(szSubstringToSearchFor));
-
-	if ( szCopy1 == NULL || szCopy2 == NULL  ) {
-		// another option is to raise an exception here
-		free((void*)szCopy1);
-		free((void*)szCopy2);
-		return NULL;
-	}
-
-	pPos = wcsstr(szCopy1, szCopy2);
-
-	if ( pPos != NULL ) {
-		// map to the original string
-		pPos = szStringToBeSearched + (pPos - szCopy1);
-	}
-
-	free((void*)szCopy1);
-	free((void*)szCopy2);
-
-	return pPos;
-} // wcsistr(...)
diff --git a/Src/codepage_detect.h b/Src/codepage_detect.h
index 06e47489a..29147dbf4 100644
--- a/Src/codepage_detect.h
+++ b/Src/codepage_detect.h
@@ -1,9 +1,9 @@
 #ifndef codepage_detect_h_included
 #define codepage_detect_h_included
 
-void GuessCodepageEncoding(const CString & filepath, int * unicoding, int * codepage, 
+void GuessCodepageEncoding(LPCTSTR filepath, int * unicoding, int * codepage, 
                            BOOL bGuessEncoding);
 
-bool GuessEncoding_from_bytes(const CString & sExt, const char **data, int count, int *codepage);
+bool GuessEncoding_from_bytes(LPCTSTR ext, const char **data, int count, int *codepage);
 
 #endif // codepage_detect_h_included
diff --git a/Src/markdown.cpp b/Src/markdown.cpp
index 70284a441..edf77132a 100644
--- a/Src/markdown.cpp
+++ b/Src/markdown.cpp
@@ -74,6 +74,7 @@ DATE:		BY:					DESCRIPTION:
 								CMarkdown::FileImage::FileImage() accept a
 								handle rather than a filename.
 2005/06/22	Jochen Tucht		New method CMarkdown::_HSTR::Entities().
+2005/07/29	Jochen Tucht		ByteOrder detection for 16/32 bit encodings
 */
 
 #include "stdafx.h"
@@ -116,6 +117,38 @@ size_t CMarkdown::Converter::iconv(const char **inbuf, size_t *inbytesleft, char
 	return handle != INVALID_HANDLE_VALUE ? ICONV->iconv(handle, inbuf, inbytesleft, outbuf, outbytesleft) : -1;
 }
 
+size_t CMarkdown::Converter::Convert(const char *S, size_t s, char *D, size_t d) const
+{
+	// reset iconv internal state and tell if converter is valid
+	if (iconv(0, 0, 0, 0) != -1)
+	{
+		if (D == NULL)
+		{
+			while (s)
+			{
+				char buffer[100];
+				char *C = buffer;
+				size_t c = sizeof buffer;
+				if (iconv(&S, &s, &C, &c) == -1 && c == sizeof buffer)
+				{
+					// some error other than 'outbuf exhausted': stop here
+					break;
+				}
+				d += sizeof buffer - c;
+			}
+		}
+		else
+		{
+			iconv(&S, &s, &D, &d); // convert entire string
+		}
+	}
+	else
+	{
+		d = 0;
+	}
+	return d;
+}
+
 template<> UINT AFXAPI HashKey(BSTR B)
 {
 	return MAKELONG(B[0], lstrlenW(B));
@@ -162,7 +195,7 @@ CMarkdown::HSTR CMarkdown::_HSTR::Octets(UINT codepage)
 	if (codepage != 1200) // 1200 means 'no conversion'
 	{
 		int w = SysStringLen(B);
-		int a = WideCharToMultiByte (codepage, 0, W, w, 0, 0, 0, 0);
+		int a = WideCharToMultiByte(codepage, 0, W, w, 0, 0, 0, 0);
 		H = (HSTR)SysAllocStringByteLen(0, a);
 		WideCharToMultiByte(codepage, 0, W, w, H->A, a, 0, 0);
 		SysFreeString(B);
@@ -173,31 +206,11 @@ CMarkdown::HSTR CMarkdown::_HSTR::Octets(UINT codepage)
 CMarkdown::HSTR CMarkdown::_HSTR::Convert(const CMarkdown::Converter &converter)
 {
 	HSTR H = this;
-	// reset iconv internal state and tell if converter is valid
-	if (converter.iconv(0, 0, 0, 0) != -1)
-	{
-		const char *R = A;
-		size_t r = SysStringByteLen(B);
-		const char *S = R;
-		size_t s = r;
-		size_t d = 0;
-		while (r)
-		{
-			char buffer[100];
-			char *C = buffer;
-			size_t c = sizeof buffer;
-			if (converter.iconv(&R, &r, &C, &c) == -1 && c == sizeof buffer)
-			{
-				// some error other than 'outbuf exhausted': stop here
-				break;
-			}
-			d += sizeof buffer - c;
-		}
+	size_t s = SysStringByteLen(B);
+	if (size_t d = converter.Convert(A, s, 0, 0))
+	{
 		H = (HSTR)SysAllocStringByteLen(0, d);
-		char *D = H->A;
-		// nothing should go wrong here as outbuf has now accurate size
-		converter.iconv(0, 0, 0, 0); // reset iconv internal state
-		converter.iconv(&S, &s, &D, &d); // convert entire string
+		converter.Convert(A, s, H->A, d);
 		SysFreeString(B);
 	}
 	return H;
@@ -847,8 +860,42 @@ LPVOID NTAPI CMarkdown::FileImage::MapFile(HANDLE hFile, DWORD dwSize)
 	return pMapping;
 }
 
-CMarkdown::FileImage::FileImage(LPCTSTR path, DWORD trunc, int flags):
-pImage(NULL)
+int CMarkdown::FileImage::GuessByteOrder(DWORD dwBOM)
+{
+	int nByteOrder = 0;
+	if (dwBOM)
+	{
+		WORD wBOM = LOWORD(dwBOM);
+		WORD wBOMhigh = HIWORD(dwBOM);
+		nByteOrder = 2;
+		if (wBOM == 0 || wBOMhigh == 0)
+		{
+			wBOM |= wBOMhigh;
+			nByteOrder = 4;
+		}
+		if (wBOM == 0xFEFF || wBOM == 0xFFFE)
+		{
+			nByteOrder += 8 + ((char *)memchr(&dwBOM, 0xFF, 4) - (char *)&dwBOM);
+		}
+		else if (LOBYTE(wBOM) == 0 || HIBYTE(wBOM) == 0)
+		{
+			BYTE cBOM = LOBYTE(wBOM) | HIBYTE(wBOM);
+			nByteOrder += ((char *)memchr(&dwBOM, cBOM, 4) - (char *)&dwBOM);
+		}
+		else if (dwBOM & 0xFFFFFF == 0xBFBBEF)
+		{
+			nByteOrder = 8 + 1;
+		}
+		else
+		{
+			nByteOrder = 1;
+		}
+	}
+	return nByteOrder;
+}
+
+CMarkdown::FileImage::FileImage(LPCTSTR path, DWORD trunc, int flags)
+: pImage(NULL), nByteOrder(0)
 {
 	HANDLE hFile
 	(
@@ -866,36 +913,81 @@ pImage(NULL)
 				cbImage = trunc;
 			}
 			pImage = MapFile(hFile, cbImage);
-			if (pImage)
+			if (pImage && cbImage >= 4 && (flags & Octets & (nByteOrder = GuessByteOrder(*(LPDWORD)pImage))))
 			{
-				if (flags & Octets && cbImage >= 2)
+				LPVOID pCopy;
+				switch (nByteOrder)
 				{
-					LPVOID pCopy;
-					switch (*(LPWCH)pImage) case 0xFFFE:
+				case 2 + 1:
+				case 2 + 1 + 8:
+					// big endian: swab first
+					cbImage &= ~1UL;
+					pCopy = MapFile(INVALID_HANDLE_VALUE, cbImage);
+					if (pCopy)
 					{
-						// big endian: swab first
-						cbImage &= ~1UL;
+						_swab((char *)pImage, (char *)pCopy, cbImage);
+					}
+					UnmapViewOfFile(pImage);
+					pImage = pCopy;
+					if (pImage)
+					{
+					case 2 + 0:
+					case 2 + 0 + 8:
+						// little endian
+						int cchImage = cbImage / 2;
+						LPWCH pchImage = (LPWCH)pImage;
+						if (nByteOrder & 8)
+						{
+							++pchImage;
+							--cchImage;
+						}
+						cbImage = WideCharToMultiByte(CP_UTF8, 0, pchImage, cchImage, 0, 0, 0, 0);
 						pCopy = MapFile(INVALID_HANDLE_VALUE, cbImage);
 						if (pCopy)
 						{
-							_swab((char *)pImage, (char *)pCopy, cbImage);
+							WideCharToMultiByte(CP_UTF8, 0, pchImage, cchImage, (LPCH)pCopy, cbImage, 0, 0);
 						}
 						UnmapViewOfFile(pImage);
 						pImage = pCopy;
-						if (pImage) case 0xFEFF:
+					}
+					break;
+				case 4 + 1:
+				case 4 + 1 + 8:
+				case 4 + 2:
+				case 4 + 2 + 8:
+					// odd word endianness: swab first
+					cbImage &= ~3UL;
+					pCopy = MapFile(INVALID_HANDLE_VALUE, cbImage);
+					if (pCopy)
+					{
+						_swab((char *)pImage, (char *)pCopy, cbImage);
+					}
+					UnmapViewOfFile(pImage);
+					pImage = pCopy;
+					if (pImage)
+					{
+					case 4 + 0:
+					case 4 + 0 + 8:
+					case 4 + 3:
+					case 4 + 3 + 8:
+						int cchImage = cbImage;
+						LPCH pchImage = (LPCH)pImage;
+						if (nByteOrder & 8)
 						{
-							// little endian
-							int cchImage = cbImage / 2 - 1;
-							cbImage = WideCharToMultiByte(CP_UTF8, 0, (LPWCH)pImage + 1, cchImage, 0, 0, 0, 0);
-							pCopy = MapFile(INVALID_HANDLE_VALUE, cbImage);
-							if (pCopy)
-							{
-								WideCharToMultiByte(CP_UTF8, 0, (LPWCH)pImage + 1, cchImage, (LPCH)pCopy, cbImage, 0, 0);
-							}
-							UnmapViewOfFile(pImage);
-							pImage = pCopy;
+							pchImage += 4;
+							cchImage -= 4;
 						}
+						CMarkdown::Converter converter("utf-8", nByteOrder & 2 ? "ucs-4be" : "ucs-4le");
+						cbImage = converter.Convert(pchImage, cchImage, 0, 0);
+						pCopy = MapFile(INVALID_HANDLE_VALUE, cbImage);
+						if (pCopy)
+						{
+							converter.Convert(pchImage, cchImage, (LPCH)pCopy, cbImage);
+						}
+						UnmapViewOfFile(pImage);
+						pImage = pCopy;
 					}
+					break;
 				}
 			}
 		}
diff --git a/Src/markdown.h b/Src/markdown.h
index fad9bb0a1..ee02de265 100644
--- a/Src/markdown.h
+++ b/Src/markdown.h
@@ -52,6 +52,7 @@ public:
 		Converter(const char *tocode, const char *fromcode);
 		~Converter();
 		size_t iconv(const char **inbuf, size_t *inbytesleft, char **outbuf, size_t *outbytesleft) const;
+		size_t Convert(const char *, size_t, char *, size_t) const;
 	};
 	class EntityString
 	{
@@ -185,8 +186,8 @@ public:
 	const char *ahead;	// last char of file
 	enum
 	{
-		IgnoreCase = 0x01,
-		HtmlUTags = 0x02,			// check for unbalanced tags
+		IgnoreCase = 0x10,
+		HtmlUTags = 0x20,			// check for unbalanced tags
 		Html = IgnoreCase|HtmlUTags	// shortcut
 	};
 	CMarkdown(const char *upper, const char *ahead, unsigned flags = 0);
@@ -217,12 +218,14 @@ public:
 	LPVOID pImage;
 	enum
 	{
-		Octets = 0x10,
-		Handle = 0x20
+		Handle = 1,
+		Octets = 2 + 4,
 	};
+	int nByteOrder;
 	FileImage(LPCTSTR, DWORD trunc = 0, int flags = 0);
 	~FileImage();
 	static LPVOID NTAPI MapFile(HANDLE hFile, DWORD dwSize);
+	static int NTAPI GuessByteOrder(DWORD);
 };
 
 class CMarkdown::File : public CMarkdown::FileImage, public CMarkdown
-- 
2.11.0