From d622cc25bbe1d1dda36d5e88f8171b777a6ae31f Mon Sep 17 00:00:00 2001 From: Jochen Tucht Date: Sat, 20 Aug 2005 06:50:18 +0000 Subject: [PATCH] PATCH: [ 1247875 ] codepage detection based on CMarkdown class --- Src/codepage_detect.cpp | 658 +++++++++--------------------------------------- Src/codepage_detect.h | 4 +- Src/markdown.cpp | 182 ++++++++++---- Src/markdown.h | 11 +- 4 files changed, 262 insertions(+), 593 deletions(-) diff --git a/Src/codepage_detect.cpp b/Src/codepage_detect.cpp index 6edcd8529..5f182d433 100644 --- a/Src/codepage_detect.cpp +++ b/Src/codepage_detect.cpp @@ -8,11 +8,12 @@ // $Id$ #include "StdAfx.h" +#include #include "codepage_detect.h" -#include "UniFile.h" #include "unicoder.h" #include "codepage.h" #include "charsets.h" +#include "markdown.h" #ifdef _DEBUG #define new DEBUG_NEW @@ -20,384 +21,146 @@ static char THIS_FILE[] = __FILE__; #endif - -static bool GuessEncoding_html_from_unifile(UniFile * pufile, int * encoding, int * codepage); -static bool GuessEncoding_xml_from_unifile(UniFile * pufile, int * encoding, int * codepage); -static bool GuessEncoding_rc_from_unifile(UniFile * pufile, int * encoding, int * codepage); -static bool codepage_from_html_line(LPCSTR line, int * codepage); -static bool codepage_from_html_line(LPCWSTR line, int * codepage); -static bool codepage_from_xml_line(LPCSTR line, int * codepage); -static bool codepage_from_xml_line(LPCWSTR line, int * codepage); -static bool codepage_from_rc_line(LPCSTR line, int * codepage); -static bool codepage_from_rc_line(LPCWSTR line, int * codepage); -static bool isValidCodepage(int cp); -static bool encoding_from_attrib_value(LPCSTR valstart, int * pEncodingId); -static bool encoding_from_attrib_value(LPCWSTR valstart, int * pEncodingId); -static const char *stristr(const char * szStringToBeSearched, const char * szSubstringToSearchFor); -static const wchar_t *wcsistr(const wchar_t * szStringToBeSearched, const wchar_t * szSubstringToSearchFor); - -/** - * @brief Default constructor setting color to black. - */ /** - * @brief Try to deduce encoding for this file + * @brief Is specified codepage number valid on this system? */ -void -GuessCodepageEncoding(const CString & filepath, int * unicoding, int * codepage, - BOOL bGuessEncoding) +static unsigned ValidCodepage(unsigned cp) { - UniMemFile ufile; - UniFile * pufile = &ufile; - - if (!pufile->OpenReadOnly(filepath)) - { - *unicoding = ucr::NONE; - *codepage = getDefaultCodepage(); - return; - } - bool hasbom = pufile->ReadBom(); - *unicoding = pufile->GetUnicoding(); - *codepage = pufile->GetCodepage(); - if (!hasbom && bGuessEncoding) - { - if (!filepath.Right(4).CompareNoCase(_T(".htm")) - || !filepath.Right(5).CompareNoCase(_T(".html"))) - { - GuessEncoding_html_from_unifile(pufile, unicoding, codepage); - } - else if (!filepath.Right(4).CompareNoCase(_T(".xml")) - || !filepath.Right(5).CompareNoCase(_T(".xsl"))) - { - GuessEncoding_xml_from_unifile(pufile, unicoding, codepage); - } - else if (!filepath.Right(3).CompareNoCase(_T(".rc"))) - { - GuessEncoding_rc_from_unifile(pufile, unicoding, codepage); - } - } - pufile->Close(); -} - -/*** Return true if text begins with prefix (case-insensitive), for char */ -static bool -StartsWithInsensitive(LPCSTR text, LPCSTR prefix, int prefixlen) -{ - return 0 == strnicmp(text, prefix, prefixlen); -} -/*** Return true if text begins with prefix (case-insensitive), for wchar_t */ -static bool -StartsWithInsensitive(LPCWSTR text, LPCWSTR prefix, int prefixlen) -{ - return 0 == wcsnicmp(text, prefix, prefixlen); + return cp && isCodepageSupported(cp) ? cp : 0; } /** * @brief Parser for HTML files to find encoding information - * - * To be removed when plugin event added for this */ -static bool -GuessEncoding_html_from_unifile(UniFile * pufile, int * encoding, int * codepage) +static unsigned demoGuessEncoding_html(const char *src, size_t len) { - CString line, eol; - while (1) + CMarkdown markdown(src, src + len, CMarkdown::Html); + //As and are optional, there is nothing to pull... + //markdown.Move("html").Pop().Move("head").Pop(); + while (markdown.Move("meta")) { - if (pufile->GetLineNumber() > 30) - break; - if (!pufile->ReadString(line, eol)) - break; - if (codepage_from_html_line(line, codepage)) - return true; - - } - return false; -} - -/** - * @brief Parser for HTML files to find encoding information - */ -static bool demoGuessEncoding_html(const char **data, int count, int * cp) -{ - if (count > 30) - count = 30; - while (count--) - { - const char *line = *data++; - if (codepage_from_html_line(line, cp)) - return true; - } - return false; -} - - -/** - * @brief Deduce codepage from this line of text from an HTML file, if we can - * - * char version - * - * @todo It is unfortunate to have both a char and a wchar_t version of this. - */ -static bool -codepage_from_html_line(LPCSTR line, int * codepage) -{ - /** @todo This is not a very complete matching algorithm */ - static LPCSTR metapref = "= 7 && memicmp(pchKey, "charset", 7) == 0 && (cchKey == 7 || strchr(" \t\r\n", pchKey[7]))) + { + pchValue[cchValue] = '\0'; + // Is it an encoding name known to charsets module ? + unsigned encodingId = GetEncodingIdFromName(pchValue); + if (encodingId == 0) + { + if (unsigned codepage = atoi(pchValue)) + { + encodingId = GetEncodingIdFromCodePage(codepage); + } + } + if (encodingId) + { + return GetEncodingCodePageFromId(encodingId); + } + return 0; + } + pchKey = pchValue + cchValue; + } + } + } } - - return false; + return 0; } /** * @brief Parser for XML files to find encoding information */ -static bool -GuessEncoding_xml_from_unifile(UniFile * pufile, int * encoding, int * codepage) -{ - CString line, eol; - while (1) - { - if (pufile->GetLineNumber() > 30) - break; - if (!pufile->ReadString(line, eol)) - break; - if (codepage_from_xml_line(line, codepage)) - return true; - - } - return false; -} - -/** - * @brief Parser for HTML files to find encoding information - * - * To be removed when plugin event added for this - */ -static bool demoGuessEncoding_xml(const char **data, int count, int * cp) -{ - if (count > 30) - count = 30; - while (count--) - { - const char *line = *data++; - if (codepage_from_xml_line(line, cp)) - return true; - } - return false; -} - -/** - * @brief Deduce codepage from this line of text from an XML file, if we can - * - * char version - * - * @todo It is unfortunate to have both a char and a wchar_t version of this. - */ -static bool -codepage_from_xml_line(LPCSTR line, int * codepage) -{ - /** @todo This is not a very complete matching algorithm */ - static LPCSTR metapref = "GetLineNumber() > 30) - break; - if (!pufile->ReadString(line, eol)) - break; - if (codepage_from_rc_line(line, codepage)) - return true; - } - return false; + while (len && (*src == '\r' || *src == '\n')) + { + ++src; + --len; + } + line = src; + while (len && *src != '\r' && *src != '\n') + { + ++src; + --len; + } + } while (len && sscanf(line, "#pragma code_page(%d)", &cp) != 1); + return ValidCodepage(cp); } - /** - * @brief Parser for rc files to find encoding information + * @brief Try to deduce encoding for this file */ -static bool demoGuessEncoding_rc(const char **data, int count, int * cp) +unsigned GuessEncoding_from_bytes(LPCTSTR ext, const char *src, size_t len) { - if (count > 30) - count = 30; - while (count--) + if (len > 4096) + len = 4096; + unsigned cp = 0; + if (lstrcmpi(ext, _T(".rc")) == 0) { - const char *line = *data++; - if (codepage_from_rc_line(line, cp)) - return true; + cp = demoGuessEncoding_rc(src, len); } - return false; -} - -/** - * @brief Deduce codepage from this line of text from an HTML file, if we can - * - * char version - * - * @todo It is unfortunate to have both a char and a wchar_t version of this. - */ -static bool -codepage_from_rc_line(LPCSTR line, int * codepage) -{ - int cp=0; - if (1 == sscanf(line, "#pragma code_page(%d)", &cp) - && isValidCodepage(cp)) + else if (lstrcmpi(ext, _T(".htm")) == 0 || lstrcmpi(ext, _T(".html")) == 0) { - *codepage = cp; - return true; + cp = demoGuessEncoding_html(src, len); } - return false; - -} - -/** - * @brief Deduce codepage from this line of text from an HTML file, if we can - * - * wchar_t version - * - * @todo It is unfortunate to have both a char and a wchar_t version of this. - */ -static bool -codepage_from_rc_line(LPCWSTR line, int * codepage) -{ - int cp=0; - if (1 == swscanf(line, L"#pragma code_page(%d)", &cp) - && isValidCodepage(cp)) + else if (lstrcmpi(ext, _T(".xml")) == 0 || lstrcmpi(ext, _T(".xsl")) == 0) { - *codepage = cp; - return true; + cp = demoGuessEncoding_xml(src, len); } - return false; - -} - -/** - * @brief Is specified codepage number valid on this system? - */ -static bool isValidCodepage(int cp) -{ - return isCodepageSupported(cp); + return cp; } /** * @brief Try to deduce encoding for this file */ -bool -GuessEncoding_from_bytes(const CString & sExt, const char **data, int count, int *codepage) +bool GuessEncoding_from_bytes(LPCTSTR ext, const char **data, int count, int *codepage) { - if (lstrcmpi(sExt, _T(".rc")) == 0) - { - int cp=0; - if (demoGuessEncoding_rc(data, count, &cp)) - { - *codepage = cp; - return true; - } - } - else if (lstrcmpi(sExt, _T(".htm")) == 0 || lstrcmpi(sExt, _T(".html")) == 0) - { - int cp=0; - if (demoGuessEncoding_html(data, count, &cp)) - { - *codepage = cp; - return true; - } - } - else if (lstrcmpi(sExt, _T(".xml")) == 0 || lstrcmpi(sExt, _T(".xsl")) == 0) + if (data) { - int cp=0; - if (demoGuessEncoding_xml(data, count, &cp)) + const char *src = data[0]; + size_t len = data[count] - src; + if (unsigned cp = GuessEncoding_from_bytes(ext, src, len)) { *codepage = cp; return true; @@ -407,219 +170,30 @@ GuessEncoding_from_bytes(const CString & sExt, const char **data, int count, int } /** - * @brief Parse an xml or html attribute into an encoding id from charset.h - */ -static bool -encoding_from_attrib_value(LPCSTR valstart, int * pEncodingId) -{ - static char buffer[128]; - LPCSTR end = valstart; - int offset = 0; - - // copy candidate value into buffer (using appropriate delimiter) - if (*valstart == '\'') - { - ++end; - // single quoted attribute - while (*end && (end - valstart < sizeof(buffer)-1) - && *end != '\'') - { - buffer[offset] = *end; - ++end; - ++offset; - } - } - else if (*valstart == '"') - { - ++end; - // double quoted attribute - while (*end && (end - valstart < sizeof(buffer)-1) - && *end != '"') - { - buffer[offset] = *end; - ++end; - ++offset; - } - } - else - { - // unquoted attibute, so watch for space or end tag - while (*end && (end - valstart < sizeof(buffer)-1) - && *end != ' ' && *end != '>') - { - buffer[offset] = *end; - ++end; - ++offset; - } - } - // must zero-terminate buffer - buffer[offset] = 0; - - // Is it an encoding name known to charsets module ? - *pEncodingId = GetEncodingIdFromName(buffer); - // GetEncodingIdFromName returns non-zero if valid - if (*pEncodingId != 0) - return true; - - // Is it a codepage known to charsets module ? - int cpnum = 0; - if (1 == sscanf(buffer, "%d", &cpnum)) - { - *pEncodingId = GetEncodingIdFromCodePage(cpnum); - // GetEncodingIdFromName returns non-zero if valid - if (*pEncodingId != 0) - return true; - } - - return false; -} - -/** - * @brief Parse an xml or html attribute into an encoding id from charset.h + * @brief Try to deduce encoding for this file */ -static bool -encoding_from_attrib_value(LPCWSTR valstart, int * pEncodingId) +void GuessCodepageEncoding(LPCTSTR filepath, int *unicoding, int *codepage, BOOL bGuessEncoding) { - static wchar_t buffer[128]; - LPCWSTR end = valstart; - int offset = 0; - - // copy candidate value into buffer (using appropriate delimiter) - if (*valstart == '\'') - { - ++end; - // single quoted attribute - while (*end && (end - valstart < sizeof(buffer)-1) - && *end != '\'') - { - buffer[offset] = *end; - ++end; - ++offset; - } - } - else if (*valstart == '"') + CMarkdown::FileImage fi(filepath, 4096); + *unicoding = ucr::NONE; + switch (fi.nByteOrder) { - ++end; - // double quoted attribute - while (*end && (end - valstart < sizeof(buffer)-1) - && *end != '"') - { - buffer[offset] = *end; - ++end; - ++offset; - } + case 8 + 2 + 0: + *unicoding = ucr::UCS2LE; + break; + case 8 + 2 + 1: + *unicoding = ucr::UCS2BE; + break; + case 8: + *unicoding = ucr::UTF8; + break; } - else + if (fi.nByteOrder == 0 && bGuessEncoding) { - // unquoted attibute, so watch for space or end tag - while (*end && (end - valstart < sizeof(buffer)-1) - && *end != ' ' && *end != '>') + LPCTSTR ext = PathFindExtension(filepath); + if (unsigned cp = GuessEncoding_from_bytes(ext, (char *)fi.pImage, fi.cbImage)) { - buffer[offset] = *end; - ++end; - ++offset; + *codepage = cp; } } - // must zero-terminate buffer - buffer[offset] = 0; - - // Is it an encoding name known to charsets module ? - USES_CONVERSION; - *pEncodingId = GetEncodingIdFromName(W2A(buffer)); - // GetEncodingIdFromName returns non-zero if valid - if (*pEncodingId != 0) - return true; - - // Is it a codepage known to charsets module ? - int cpnum = 0; - if (1 == swscanf(buffer, L"%d", &cpnum)) - { - *pEncodingId = GetEncodingIdFromCodePage(cpnum); - // GetEncodingIdFromName returns non-zero if valid - if (*pEncodingId != 0) - return true; - } - - return false; } - - -static const char * -stristr(const char * szStringToBeSearched, const char * szSubstringToSearchFor) -{ - const char * pPos = NULL; - char * szCopy1 = NULL; - char * szCopy2 = NULL; - - // verify parameters - if (szStringToBeSearched == NULL || szSubstringToSearchFor == NULL) - { - return szStringToBeSearched; - } - - // empty substring - return input (consistent with strstr) - if (strlen(szSubstringToSearchFor) == 0) - return szStringToBeSearched; - - szCopy1 = strlwr(strdup(szStringToBeSearched)); - szCopy2 = strlwr(strdup(szSubstringToSearchFor)); - - if ( szCopy1 == NULL || szCopy2 == NULL ) { - // another option is to raise an exception here - free((void*)szCopy1); - free((void*)szCopy2); - return NULL; - } - - pPos = strstr(szCopy1, szCopy2); - - if ( pPos != NULL ) { - // map to the original string - pPos = szStringToBeSearched + (pPos - szCopy1); - } - - free((void*)szCopy1); - free((void*)szCopy2); - - return pPos; -} // stristr(...) - -static const wchar_t * -wcsistr(const wchar_t * szStringToBeSearched, const wchar_t * szSubstringToSearchFor) -{ - const wchar_t * pPos = NULL; - wchar_t * szCopy1 = NULL; - wchar_t * szCopy2 = NULL; - - // verify parameters - if (szStringToBeSearched == NULL || szSubstringToSearchFor == NULL) - { - return szStringToBeSearched; - } - - // empty substring - return input (consistent with strstr) - if (wcslen(szSubstringToSearchFor) == 0) - return szStringToBeSearched; - - szCopy1 = wcslwr(wcsdup(szStringToBeSearched)); - szCopy2 = wcslwr(wcsdup(szSubstringToSearchFor)); - - if ( szCopy1 == NULL || szCopy2 == NULL ) { - // another option is to raise an exception here - free((void*)szCopy1); - free((void*)szCopy2); - return NULL; - } - - pPos = wcsstr(szCopy1, szCopy2); - - if ( pPos != NULL ) { - // map to the original string - pPos = szStringToBeSearched + (pPos - szCopy1); - } - - free((void*)szCopy1); - free((void*)szCopy2); - - return pPos; -} // wcsistr(...) diff --git a/Src/codepage_detect.h b/Src/codepage_detect.h index 06e47489a..29147dbf4 100644 --- a/Src/codepage_detect.h +++ b/Src/codepage_detect.h @@ -1,9 +1,9 @@ #ifndef codepage_detect_h_included #define codepage_detect_h_included -void GuessCodepageEncoding(const CString & filepath, int * unicoding, int * codepage, +void GuessCodepageEncoding(LPCTSTR filepath, int * unicoding, int * codepage, BOOL bGuessEncoding); -bool GuessEncoding_from_bytes(const CString & sExt, const char **data, int count, int *codepage); +bool GuessEncoding_from_bytes(LPCTSTR ext, const char **data, int count, int *codepage); #endif // codepage_detect_h_included diff --git a/Src/markdown.cpp b/Src/markdown.cpp index 70284a441..edf77132a 100644 --- a/Src/markdown.cpp +++ b/Src/markdown.cpp @@ -74,6 +74,7 @@ DATE: BY: DESCRIPTION: CMarkdown::FileImage::FileImage() accept a handle rather than a filename. 2005/06/22 Jochen Tucht New method CMarkdown::_HSTR::Entities(). +2005/07/29 Jochen Tucht ByteOrder detection for 16/32 bit encodings */ #include "stdafx.h" @@ -116,6 +117,38 @@ size_t CMarkdown::Converter::iconv(const char **inbuf, size_t *inbytesleft, char return handle != INVALID_HANDLE_VALUE ? ICONV->iconv(handle, inbuf, inbytesleft, outbuf, outbytesleft) : -1; } +size_t CMarkdown::Converter::Convert(const char *S, size_t s, char *D, size_t d) const +{ + // reset iconv internal state and tell if converter is valid + if (iconv(0, 0, 0, 0) != -1) + { + if (D == NULL) + { + while (s) + { + char buffer[100]; + char *C = buffer; + size_t c = sizeof buffer; + if (iconv(&S, &s, &C, &c) == -1 && c == sizeof buffer) + { + // some error other than 'outbuf exhausted': stop here + break; + } + d += sizeof buffer - c; + } + } + else + { + iconv(&S, &s, &D, &d); // convert entire string + } + } + else + { + d = 0; + } + return d; +} + template<> UINT AFXAPI HashKey(BSTR B) { return MAKELONG(B[0], lstrlenW(B)); @@ -162,7 +195,7 @@ CMarkdown::HSTR CMarkdown::_HSTR::Octets(UINT codepage) if (codepage != 1200) // 1200 means 'no conversion' { int w = SysStringLen(B); - int a = WideCharToMultiByte (codepage, 0, W, w, 0, 0, 0, 0); + int a = WideCharToMultiByte(codepage, 0, W, w, 0, 0, 0, 0); H = (HSTR)SysAllocStringByteLen(0, a); WideCharToMultiByte(codepage, 0, W, w, H->A, a, 0, 0); SysFreeString(B); @@ -173,31 +206,11 @@ CMarkdown::HSTR CMarkdown::_HSTR::Octets(UINT codepage) CMarkdown::HSTR CMarkdown::_HSTR::Convert(const CMarkdown::Converter &converter) { HSTR H = this; - // reset iconv internal state and tell if converter is valid - if (converter.iconv(0, 0, 0, 0) != -1) - { - const char *R = A; - size_t r = SysStringByteLen(B); - const char *S = R; - size_t s = r; - size_t d = 0; - while (r) - { - char buffer[100]; - char *C = buffer; - size_t c = sizeof buffer; - if (converter.iconv(&R, &r, &C, &c) == -1 && c == sizeof buffer) - { - // some error other than 'outbuf exhausted': stop here - break; - } - d += sizeof buffer - c; - } + size_t s = SysStringByteLen(B); + if (size_t d = converter.Convert(A, s, 0, 0)) + { H = (HSTR)SysAllocStringByteLen(0, d); - char *D = H->A; - // nothing should go wrong here as outbuf has now accurate size - converter.iconv(0, 0, 0, 0); // reset iconv internal state - converter.iconv(&S, &s, &D, &d); // convert entire string + converter.Convert(A, s, H->A, d); SysFreeString(B); } return H; @@ -847,8 +860,42 @@ LPVOID NTAPI CMarkdown::FileImage::MapFile(HANDLE hFile, DWORD dwSize) return pMapping; } -CMarkdown::FileImage::FileImage(LPCTSTR path, DWORD trunc, int flags): -pImage(NULL) +int CMarkdown::FileImage::GuessByteOrder(DWORD dwBOM) +{ + int nByteOrder = 0; + if (dwBOM) + { + WORD wBOM = LOWORD(dwBOM); + WORD wBOMhigh = HIWORD(dwBOM); + nByteOrder = 2; + if (wBOM == 0 || wBOMhigh == 0) + { + wBOM |= wBOMhigh; + nByteOrder = 4; + } + if (wBOM == 0xFEFF || wBOM == 0xFFFE) + { + nByteOrder += 8 + ((char *)memchr(&dwBOM, 0xFF, 4) - (char *)&dwBOM); + } + else if (LOBYTE(wBOM) == 0 || HIBYTE(wBOM) == 0) + { + BYTE cBOM = LOBYTE(wBOM) | HIBYTE(wBOM); + nByteOrder += ((char *)memchr(&dwBOM, cBOM, 4) - (char *)&dwBOM); + } + else if (dwBOM & 0xFFFFFF == 0xBFBBEF) + { + nByteOrder = 8 + 1; + } + else + { + nByteOrder = 1; + } + } + return nByteOrder; +} + +CMarkdown::FileImage::FileImage(LPCTSTR path, DWORD trunc, int flags) +: pImage(NULL), nByteOrder(0) { HANDLE hFile ( @@ -866,36 +913,81 @@ pImage(NULL) cbImage = trunc; } pImage = MapFile(hFile, cbImage); - if (pImage) + if (pImage && cbImage >= 4 && (flags & Octets & (nByteOrder = GuessByteOrder(*(LPDWORD)pImage)))) { - if (flags & Octets && cbImage >= 2) + LPVOID pCopy; + switch (nByteOrder) { - LPVOID pCopy; - switch (*(LPWCH)pImage) case 0xFFFE: + case 2 + 1: + case 2 + 1 + 8: + // big endian: swab first + cbImage &= ~1UL; + pCopy = MapFile(INVALID_HANDLE_VALUE, cbImage); + if (pCopy) { - // big endian: swab first - cbImage &= ~1UL; + _swab((char *)pImage, (char *)pCopy, cbImage); + } + UnmapViewOfFile(pImage); + pImage = pCopy; + if (pImage) + { + case 2 + 0: + case 2 + 0 + 8: + // little endian + int cchImage = cbImage / 2; + LPWCH pchImage = (LPWCH)pImage; + if (nByteOrder & 8) + { + ++pchImage; + --cchImage; + } + cbImage = WideCharToMultiByte(CP_UTF8, 0, pchImage, cchImage, 0, 0, 0, 0); pCopy = MapFile(INVALID_HANDLE_VALUE, cbImage); if (pCopy) { - _swab((char *)pImage, (char *)pCopy, cbImage); + WideCharToMultiByte(CP_UTF8, 0, pchImage, cchImage, (LPCH)pCopy, cbImage, 0, 0); } UnmapViewOfFile(pImage); pImage = pCopy; - if (pImage) case 0xFEFF: + } + break; + case 4 + 1: + case 4 + 1 + 8: + case 4 + 2: + case 4 + 2 + 8: + // odd word endianness: swab first + cbImage &= ~3UL; + pCopy = MapFile(INVALID_HANDLE_VALUE, cbImage); + if (pCopy) + { + _swab((char *)pImage, (char *)pCopy, cbImage); + } + UnmapViewOfFile(pImage); + pImage = pCopy; + if (pImage) + { + case 4 + 0: + case 4 + 0 + 8: + case 4 + 3: + case 4 + 3 + 8: + int cchImage = cbImage; + LPCH pchImage = (LPCH)pImage; + if (nByteOrder & 8) { - // little endian - int cchImage = cbImage / 2 - 1; - cbImage = WideCharToMultiByte(CP_UTF8, 0, (LPWCH)pImage + 1, cchImage, 0, 0, 0, 0); - pCopy = MapFile(INVALID_HANDLE_VALUE, cbImage); - if (pCopy) - { - WideCharToMultiByte(CP_UTF8, 0, (LPWCH)pImage + 1, cchImage, (LPCH)pCopy, cbImage, 0, 0); - } - UnmapViewOfFile(pImage); - pImage = pCopy; + pchImage += 4; + cchImage -= 4; } + CMarkdown::Converter converter("utf-8", nByteOrder & 2 ? "ucs-4be" : "ucs-4le"); + cbImage = converter.Convert(pchImage, cchImage, 0, 0); + pCopy = MapFile(INVALID_HANDLE_VALUE, cbImage); + if (pCopy) + { + converter.Convert(pchImage, cchImage, (LPCH)pCopy, cbImage); + } + UnmapViewOfFile(pImage); + pImage = pCopy; } + break; } } } diff --git a/Src/markdown.h b/Src/markdown.h index fad9bb0a1..ee02de265 100644 --- a/Src/markdown.h +++ b/Src/markdown.h @@ -52,6 +52,7 @@ public: Converter(const char *tocode, const char *fromcode); ~Converter(); size_t iconv(const char **inbuf, size_t *inbytesleft, char **outbuf, size_t *outbytesleft) const; + size_t Convert(const char *, size_t, char *, size_t) const; }; class EntityString { @@ -185,8 +186,8 @@ public: const char *ahead; // last char of file enum { - IgnoreCase = 0x01, - HtmlUTags = 0x02, // check for unbalanced tags + IgnoreCase = 0x10, + HtmlUTags = 0x20, // check for unbalanced tags Html = IgnoreCase|HtmlUTags // shortcut }; CMarkdown(const char *upper, const char *ahead, unsigned flags = 0); @@ -217,12 +218,14 @@ public: LPVOID pImage; enum { - Octets = 0x10, - Handle = 0x20 + Handle = 1, + Octets = 2 + 4, }; + int nByteOrder; FileImage(LPCTSTR, DWORD trunc = 0, int flags = 0); ~FileImage(); static LPVOID NTAPI MapFile(HANDLE hFile, DWORD dwSize); + static int NTAPI GuessByteOrder(DWORD); }; class CMarkdown::File : public CMarkdown::FileImage, public CMarkdown -- 2.11.0