From: Takashi Sawanaka Date: Wed, 5 May 2021 12:35:12 +0000 (+0900) Subject: GuessCodepageEncoding() -> codepage_detect::Guess() X-Git-Tag: v2.16.13~62 X-Git-Url: http://git.osdn.net/view?p=winmerge-jp%2Fwinmerge-jp.git;a=commitdiff_plain;h=c03eebe5b1297e77de8e42ef857019685809542a GuessCodepageEncoding() -> codepage_detect::Guess() --- diff --git a/Src/Common/multiformatText.cpp b/Src/Common/multiformatText.cpp index ce4befa28..322b8d494 100644 --- a/Src/Common/multiformatText.cpp +++ b/Src/Common/multiformatText.cpp @@ -85,7 +85,7 @@ void storageForPlugins::SetDataFileEncoding(const String& filename, FileTextEnco } void storageForPlugins::SetDataFileUnknown(const String& filename, bool bOverwrite /*= false*/) { - FileTextEncoding encoding = GuessCodepageEncoding(filename, 1); + FileTextEncoding encoding = codepage_detect::Guess(filename, 1); SetDataFileEncoding(filename, encoding, bOverwrite); } diff --git a/Src/ConflictFileParser.cpp b/Src/ConflictFileParser.cpp index f9a99a57d..a55c1bca6 100644 --- a/Src/ConflictFileParser.cpp +++ b/Src/ConflictFileParser.cpp @@ -102,7 +102,7 @@ bool ParseConflictFile(const String& conflictFileName, bool success4 = baseRevision.Open(baseRevisionFileName, _T("wb")); // detect codepage of conflict file - FileTextEncoding encoding = GuessCodepageEncoding(conflictFileName, iGuessEncodingType); + FileTextEncoding encoding = codepage_detect::Guess(conflictFileName, iGuessEncodingType); conflictFile.SetUnicoding(encoding.m_unicoding); conflictFile.SetBom(encoding.m_bom); diff --git a/Src/DiffContext.cpp b/Src/DiffContext.cpp index 992114996..a3218fb00 100644 --- a/Src/DiffContext.cpp +++ b/Src/DiffContext.cpp @@ -101,7 +101,7 @@ bool CDiffContext::UpdateInfoFromDiskHalf(DIFFITEM &di, int nIndex) if (!dfi.Update(filepath)) return false; UpdateVersion(di, nIndex); - dfi.encoding = GuessCodepageEncoding(filepath, m_iGuessEncodingType); + dfi.encoding = codepage_detect::Guess(filepath, m_iGuessEncodingType); return true; } diff --git a/Src/DiffTextBuffer.cpp b/Src/DiffTextBuffer.cpp index 028219470..7fb39158b 100644 --- a/Src/DiffTextBuffer.cpp +++ b/Src/DiffTextBuffer.cpp @@ -275,7 +275,7 @@ int CDiffTextBuffer::LoadFromFile(LPCTSTR pszFileNameInit, { // re-detect codepage int iGuessEncodingType = GetOptionsMgr()->GetInt(OPT_CP_DETECT); - FileTextEncoding encoding2 = GuessCodepageEncoding(pszFileName, iGuessEncodingType); + FileTextEncoding encoding2 = codepage_detect::Guess(pszFileName, iGuessEncodingType); pufile->SetUnicoding(encoding2.m_unicoding); pufile->SetCodepage(encoding2.m_codepage); pufile->SetBom(encoding2.m_bom); diff --git a/Src/FolderCmp.cpp b/Src/FolderCmp.cpp index e35447901..9d54ea9a0 100644 --- a/Src/FolderCmp.cpp +++ b/Src/FolderCmp.cpp @@ -145,7 +145,7 @@ int FolderCmp::prepAndCompareFiles(DIFFITEM &di) // Unpacked files will be deleted at end of this function. filepathTransformed[nIndex] = filepathUnpacked[nIndex]; - encoding[nIndex] = GuessCodepageEncoding(filepathTransformed[nIndex], m_pCtxt->m_iGuessEncodingType); + encoding[nIndex] = codepage_detect::Guess(filepathTransformed[nIndex], m_pCtxt->m_iGuessEncodingType); m_diffFileData.m_FileLocation[nIndex].encoding = encoding[nIndex]; } diff --git a/Src/MainFrm.cpp b/Src/MainFrm.cpp index 7616761bb..09f0bca4c 100644 --- a/Src/MainFrm.cpp +++ b/Src/MainFrm.cpp @@ -645,7 +645,7 @@ void CMainFrame::OnFileOpen() static void FileLocationGuessEncodings(FileLocation & fileloc, int iGuessEncoding) { - fileloc.encoding = GuessCodepageEncoding(fileloc.filepath, iGuessEncoding); + fileloc.encoding = codepage_detect::Guess(fileloc.filepath, iGuessEncoding); } bool CMainFrame::ShowAutoMergeDoc(CDirDoc * pDirDoc, diff --git a/Src/MergeDoc.cpp b/Src/MergeDoc.cpp index 8598cab43..1138fbc4e 100644 --- a/Src/MergeDoc.cpp +++ b/Src/MergeDoc.cpp @@ -2777,7 +2777,7 @@ DWORD CMergeDoc::LoadOneFile(int index, String filename, bool readOnly, const St { m_ptBuf[index]->FreeAll(); loadSuccess = LoadFile(filename.c_str(), index, readOnly, - GuessCodepageEncoding(filename, GetOptionsMgr()->GetInt(OPT_CP_DETECT), -1)); + codepage_detect::Guess(filename, GetOptionsMgr()->GetInt(OPT_CP_DETECT), -1)); } } else @@ -3172,7 +3172,7 @@ void CMergeDoc::ChangeFile(int nBuffer, const String& path, int nLineIndex) strDesc[nBuffer] = _T(""); fileloc[nBuffer].setPath(path); - fileloc[nBuffer].encoding = GuessCodepageEncoding(path, GetOptionsMgr()->GetInt(OPT_CP_DETECT)); + fileloc[nBuffer].encoding = codepage_detect::Guess(path, GetOptionsMgr()->GetInt(OPT_CP_DETECT)); if (OpenDocs(m_nBuffers, fileloc, bRO, strDesc)) MoveOnLoad(nBuffer, nLineIndex); diff --git a/Src/codepage_detect.cpp b/Src/codepage_detect.cpp index 03743ebe3..e9099251d 100644 --- a/Src/codepage_detect.cpp +++ b/Src/codepage_detect.cpp @@ -185,6 +185,8 @@ static unsigned demoGuessEncoding_rc(const char *src, size_t len, int defcodepag return cp; } +namespace codepage_detect +{ /** * @brief Try to deduce encoding for this file. * @param [in] ext File extension. @@ -192,37 +194,48 @@ static unsigned demoGuessEncoding_rc(const char *src, size_t len, int defcodepag * @param [in] len Size of the file contents string. * @return Codepage number. */ -static unsigned GuessEncoding_from_bytes(const String& ext, const char *src, size_t len, int guessEncodingType) +FileTextEncoding Guess(const String& ext, const void * src, size_t len, int guessEncodingType) { + FileTextEncoding encoding; + int bomsize = 0; + encoding.SetUnicoding(ucr::DetermineEncoding(reinterpret_cast(src), len, &encoding.m_bom)); + if (bomsize > 0) + encoding.m_bom = true; + if (encoding.m_unicoding != ucr::NONE) + return encoding; unsigned cp = ucr::getDefaultCodepage(); - if (!ucr::CheckForInvalidUtf8(src, len)) - cp = ucr::CP_UTF_8; - else if (guessEncodingType & 2) + if (guessEncodingType != 0) { - IExconverter *pexconv = Exconverter::getInstance(); - if (pexconv != nullptr && src != nullptr) - { - int autodetectType = (unsigned)guessEncodingType >> 16; - cp = pexconv->detectInputCodepage(autodetectType, cp, src, len); - } - } - if (guessEncodingType & 1) - { - String lower_ext = strutils::makelower(ext); - if (lower_ext == _T(".rc")) - { - cp = demoGuessEncoding_rc(src, len, cp); - } - else if (lower_ext == _T(".htm") || lower_ext == _T(".html")) + if (!ucr::CheckForInvalidUtf8(reinterpret_cast(src), len)) + cp = ucr::CP_UTF_8; + else if (guessEncodingType & 2) { - cp = demoGuessEncoding_html(src, len, cp); + IExconverter* pexconv = Exconverter::getInstance(); + if (pexconv != nullptr && src != nullptr) + { + int autodetectType = (unsigned)guessEncodingType >> 16; + cp = pexconv->detectInputCodepage(autodetectType, cp, reinterpret_cast(src), len); + } } - else if (lower_ext == _T(".xml") || lower_ext == _T(".xsl")) + if (guessEncodingType & 1) { - cp = demoGuessEncoding_xml(src, len, cp); + String lower_ext = strutils::makelower(ext); + if (lower_ext == _T(".rc")) + { + cp = demoGuessEncoding_rc(reinterpret_cast(src), len, cp); + } + else if (lower_ext == _T(".htm") || lower_ext == _T(".html")) + { + cp = demoGuessEncoding_html(reinterpret_cast(src), len, cp); + } + else if (lower_ext == _T(".xml") || lower_ext == _T(".xsl")) + { + cp = demoGuessEncoding_xml(reinterpret_cast(src), len, cp); + } } } - return cp; + encoding.SetCodepage(cp); + return encoding; } /** @@ -231,57 +244,11 @@ static unsigned GuessEncoding_from_bytes(const String& ext, const char *src, siz * @param [in] bGuessEncoding Try to guess codepage (not just unicode encoding). * @return Structure getting the encoding info. */ -FileTextEncoding GuessCodepageEncoding(const String& filepath, int guessEncodingType, ptrdiff_t mapmaxlen) +FileTextEncoding Guess(const String& filepath, int guessEncodingType, ptrdiff_t mapmaxlen) { - FileTextEncoding encoding; CMarkdown::FileImage fi(filepath != _T("NUL") ? filepath.c_str() : nullptr, mapmaxlen); - encoding.SetCodepage(ucr::getDefaultCodepage()); - encoding.m_bom = false; - switch (fi.nByteOrder) - { - case 8 + 2 + 0: - encoding.SetUnicoding(ucr::UCS2LE); - encoding.SetCodepage(ucr::CP_UCS2LE); - encoding.m_bom = true; - break; - case 8 + 2 + 1: - encoding.SetUnicoding(ucr::UCS2BE); - encoding.SetCodepage(ucr::CP_UCS2BE); - encoding.m_bom = true; - break; - case 8 + 1: - encoding.SetUnicoding(ucr::UTF8); - encoding.SetCodepage(ucr::CP_UTF_8); - encoding.m_bom = true; - break; - default: - encoding.m_bom = false; - break; - } - if (fi.nByteOrder < 4 && guessEncodingType != 0) - { - String ext = paths::FindExtension(filepath); - const char *src = (char *)fi.pImage; - size_t len = fi.cbImage; - if (len == static_cast(mapmaxlen)) - { - for (size_t i = len; i--; ) - { - if (isspace((unsigned char)src[i])) - { - // make len an even number for ucs-2 detection - if ((i % 2) == 0) - len = i; - else - len = i + 1; - break; - } - } - } - if (unsigned cp = GuessEncoding_from_bytes(ext, src, len, guessEncodingType)) - encoding.SetCodepage(cp); - else - encoding.SetCodepage(ucr::getDefaultCodepage()); - } - return encoding; + String ext = paths::FindExtension(filepath); + return Guess(ext, fi.pImage, fi.cbImage, guessEncodingType); +} + } diff --git a/Src/codepage_detect.h b/Src/codepage_detect.h index 44c1fa6a2..63c44abbe 100644 --- a/Src/codepage_detect.h +++ b/Src/codepage_detect.h @@ -8,7 +8,11 @@ #include "UnicodeString.h" #include "FileTextEncoding.h" +namespace codepage_detect +{ /** @brief Buffer size used in this file. */ -static const int BufSize = 65536; +constexpr int BufSize = 65536; -FileTextEncoding GuessCodepageEncoding(const String& filepath, int guessEncodingType, ptrdiff_t mapmaxlen = BufSize); +FileTextEncoding Guess(const String& filepath, int guessEncodingType, ptrdiff_t mapmaxlen = BufSize); +FileTextEncoding Guess(const String& ext, const void* src, size_t len, int guessEncodingType); +} diff --git a/Testing/GoogleTest/Encoding/codepage_detect_test.cpp b/Testing/GoogleTest/Encoding/codepage_detect_test.cpp index 026fcaf89..c4974e980 100644 --- a/Testing/GoogleTest/Encoding/codepage_detect_test.cpp +++ b/Testing/GoogleTest/Encoding/codepage_detect_test.cpp @@ -45,24 +45,24 @@ namespace TEST_F(CodepageDetectTest, GuessCodepageEncoding0) { FileTextEncoding enc; - enc = GuessCodepageEncoding(_T("../../Data/Unicode/UCS-2LE/DiffItem.h"), 0); + enc = codepage_detect::Guess(_T("../../Data/Unicode/UCS-2LE/DiffItem.h"), 0); EXPECT_EQ(1200, enc.m_codepage); EXPECT_EQ(true, enc.m_bom); EXPECT_EQ(ucr::UCS2LE, enc.m_unicoding); - enc = GuessCodepageEncoding(_T("../../Data/Unicode/UCS-2BE/DiffItem.h"), 0); + enc = codepage_detect::Guess(_T("../../Data/Unicode/UCS-2BE/DiffItem.h"), 0); EXPECT_EQ(1201, enc.m_codepage); EXPECT_EQ(true, enc.m_bom); EXPECT_EQ(ucr::UCS2BE, enc.m_unicoding); - enc = GuessCodepageEncoding(_T("../../Data/Unicode/UTF-8/DiffItem.h"), 0); + enc = codepage_detect::Guess(_T("../../Data/Unicode/UTF-8/DiffItem.h"), 0); EXPECT_EQ(65001, enc.m_codepage); EXPECT_EQ(true, enc.m_bom); EXPECT_EQ(ucr::UTF8, enc.m_unicoding); - enc = GuessCodepageEncoding(_T("../../Data/Unicode/UTF-8-NOBOM/DiffItem.h"), 0); + enc = codepage_detect::Guess(_T("../../Data/Unicode/UTF-8-NOBOM/DiffItem.h"), 0); EXPECT_EQ(ucr::getDefaultCodepage(), enc.m_codepage); EXPECT_EQ(false, enc.m_bom); EXPECT_EQ(ucr::NONE, enc.m_unicoding); - enc = GuessCodepageEncoding(_T("abcdefg12345"), 0); + enc = codepage_detect::Guess(_T("abcdefg12345"), 0); EXPECT_EQ(ucr::getDefaultCodepage(), enc.m_codepage); EXPECT_EQ(false, enc.m_bom); EXPECT_EQ(ucr::NONE, enc.m_unicoding); @@ -71,22 +71,22 @@ namespace TEST_F(CodepageDetectTest, GuessCodepageEncoding1) { FileTextEncoding enc; - enc = GuessCodepageEncoding(_T("../../Data/Unicode/UTF-8-NOBOM/DiffItem.h"), 1); + enc = codepage_detect::Guess(_T("../../Data/Unicode/UTF-8-NOBOM/DiffItem.h"), 1); EXPECT_EQ(65001, enc.m_codepage); EXPECT_EQ(false, enc.m_bom); EXPECT_EQ(ucr::UTF8, enc.m_unicoding); - enc = GuessCodepageEncoding(_T("../../../Docs/Manual/EN/About_Doc.xml"), 1); + enc = codepage_detect::Guess(_T("../../../Docs/Manual/EN/About_Doc.xml"), 1); EXPECT_EQ(65001, enc.m_codepage); EXPECT_EQ(false, enc.m_bom); EXPECT_EQ(ucr::UTF8, enc.m_unicoding); - enc = GuessCodepageEncoding(_T("../../../Docs/Developers/readme-developers.html"), 1); + enc = codepage_detect::Guess(_T("../../../Docs/Developers/readme-developers.html"), 1); EXPECT_EQ(28591, enc.m_codepage); EXPECT_EQ(false, enc.m_bom); EXPECT_EQ(ucr::NONE, enc.m_unicoding); - enc = GuessCodepageEncoding(_T("../../../ShellExtension/Languages/ShellExtensionRussian.rc"), 1); + enc = codepage_detect::Guess(_T("../../../ShellExtension/Languages/ShellExtensionRussian.rc"), 1); EXPECT_EQ(65001, enc.m_codepage); EXPECT_EQ(true, enc.m_bom); EXPECT_EQ(ucr::UTF8, enc.m_unicoding);