OSDN Git Service

GuessCodepageEncoding() -> codepage_detect::Guess()
author Takashi Sawanaka <sdottaka@users.sourceforge.net>
Wed, 5 May 2021 12:35:12 +0000 (21:35 +0900)
committer Takashi Sawanaka <sdottaka@users.sourceforge.net>
Wed, 5 May 2021 12:35:12 +0000 (21:35 +0900)
Src/Common/multiformatText.cpp
Src/ConflictFileParser.cpp
Src/DiffContext.cpp
Src/DiffTextBuffer.cpp
Src/FolderCmp.cpp
Src/MainFrm.cpp
Src/MergeDoc.cpp
Src/codepage_detect.cpp
Src/codepage_detect.h
Testing/GoogleTest/Encoding/codepage_detect_test.cpp

index ce4befa..322b8d4 100644 (file)
@@ -85,7 +85,7 @@ void storageForPlugins::SetDataFileEncoding(const String& filename, FileTextEnco
 }
 void storageForPlugins::SetDataFileUnknown(const String& filename, bool bOverwrite /*= false*/) 
 {
-       FileTextEncoding encoding = GuessCodepageEncoding(filename, 1);
+       FileTextEncoding encoding = codepage_detect::Guess(filename, 1);
        SetDataFileEncoding(filename, encoding, bOverwrite);
 }
 
index f9a99a5..a55c1bc 100644 (file)
@@ -102,7 +102,7 @@ bool ParseConflictFile(const String& conflictFileName,
        bool success4 = baseRevision.Open(baseRevisionFileName, _T("wb"));
 
        // detect codepage of conflict file
-       FileTextEncoding encoding = GuessCodepageEncoding(conflictFileName, iGuessEncodingType);
+       FileTextEncoding encoding = codepage_detect::Guess(conflictFileName, iGuessEncodingType);
 
        conflictFile.SetUnicoding(encoding.m_unicoding);
        conflictFile.SetBom(encoding.m_bom);
index 9921149..a3218fb 100644 (file)
@@ -101,7 +101,7 @@ bool CDiffContext::UpdateInfoFromDiskHalf(DIFFITEM &di, int nIndex)
        if (!dfi.Update(filepath))
                return false;
        UpdateVersion(di, nIndex);
-       dfi.encoding = GuessCodepageEncoding(filepath, m_iGuessEncodingType);
+       dfi.encoding = codepage_detect::Guess(filepath, m_iGuessEncodingType);
        return true;
 }
 
index 0282194..7fb3915 100644 (file)
@@ -275,7 +275,7 @@ int CDiffTextBuffer::LoadFromFile(LPCTSTR pszFileNameInit,
                {
                        // re-detect codepage
                        int iGuessEncodingType = GetOptionsMgr()->GetInt(OPT_CP_DETECT);
-                       FileTextEncoding encoding2 = GuessCodepageEncoding(pszFileName, iGuessEncodingType);
+                       FileTextEncoding encoding2 = codepage_detect::Guess(pszFileName, iGuessEncodingType);
                        pufile->SetUnicoding(encoding2.m_unicoding);
                        pufile->SetCodepage(encoding2.m_codepage);
                        pufile->SetBom(encoding2.m_bom);
index e354479..9d54ea9 100644 (file)
@@ -145,7 +145,7 @@ int FolderCmp::prepAndCompareFiles(DIFFITEM &di)
                        // Unpacked files will be deleted at end of this function.
                        filepathTransformed[nIndex] = filepathUnpacked[nIndex];
 
-                       encoding[nIndex] = GuessCodepageEncoding(filepathTransformed[nIndex], m_pCtxt->m_iGuessEncodingType);
+                       encoding[nIndex] = codepage_detect::Guess(filepathTransformed[nIndex], m_pCtxt->m_iGuessEncodingType);
                        m_diffFileData.m_FileLocation[nIndex].encoding = encoding[nIndex];
                }
 
index 7616761..09f0bca 100644 (file)
@@ -645,7 +645,7 @@ void CMainFrame::OnFileOpen()
 static void
 FileLocationGuessEncodings(FileLocation & fileloc, int iGuessEncoding)
 {
-       fileloc.encoding = GuessCodepageEncoding(fileloc.filepath, iGuessEncoding);
+       fileloc.encoding = codepage_detect::Guess(fileloc.filepath, iGuessEncoding);
 }
 
 bool CMainFrame::ShowAutoMergeDoc(CDirDoc * pDirDoc,
index 8598cab..1138fbc 100644 (file)
@@ -2777,7 +2777,7 @@ DWORD CMergeDoc::LoadOneFile(int index, String filename, bool readOnly, const St
                {
                        m_ptBuf[index]->FreeAll();
                        loadSuccess = LoadFile(filename.c_str(), index, readOnly,
-                               GuessCodepageEncoding(filename, GetOptionsMgr()->GetInt(OPT_CP_DETECT), -1));
+                               codepage_detect::Guess(filename, GetOptionsMgr()->GetInt(OPT_CP_DETECT), -1));
                }
        }
        else
@@ -3172,7 +3172,7 @@ void CMergeDoc::ChangeFile(int nBuffer, const String& path, int nLineIndex)
 
        strDesc[nBuffer] = _T("");
        fileloc[nBuffer].setPath(path);
-       fileloc[nBuffer].encoding = GuessCodepageEncoding(path, GetOptionsMgr()->GetInt(OPT_CP_DETECT));
+       fileloc[nBuffer].encoding = codepage_detect::Guess(path, GetOptionsMgr()->GetInt(OPT_CP_DETECT));
        
        if (OpenDocs(m_nBuffers, fileloc, bRO, strDesc))
                MoveOnLoad(nBuffer, nLineIndex);
index 03743eb..e909925 100644 (file)
@@ -185,6 +185,8 @@ static unsigned demoGuessEncoding_rc(const char *src, size_t len, int defcodepag
        return cp;
 }
 
+namespace codepage_detect
+{
 /**
  * @brief Try to deduce encoding for this file.
  * @param [in] ext File extension.
@@ -192,37 +194,48 @@ static unsigned demoGuessEncoding_rc(const char *src, size_t len, int defcodepag
  * @param [in] len Size of the file contents string.
  * @return Codepage number.
  */
-static unsigned GuessEncoding_from_bytes(const String& ext, const char *src, size_t len, int guessEncodingType)
+FileTextEncoding Guess(const String& ext, const void * src, size_t len, int guessEncodingType)
 {
+       FileTextEncoding encoding;
+       int bomsize = 0;
+       encoding.SetUnicoding(ucr::DetermineEncoding(reinterpret_cast<const unsigned char *>(src), len, &encoding.m_bom));
+       if (bomsize > 0)
+               encoding.m_bom = true;
+       if (encoding.m_unicoding != ucr::NONE)
+               return encoding;
        unsigned cp = ucr::getDefaultCodepage();
-       if (!ucr::CheckForInvalidUtf8(src, len))
-               cp = ucr::CP_UTF_8;
-       else if (guessEncodingType & 2)
+       if (guessEncodingType != 0)
        {
-               IExconverter *pexconv = Exconverter::getInstance();
-               if (pexconv != nullptr && src != nullptr)
-               {
-                       int autodetectType = (unsigned)guessEncodingType >> 16;
-                       cp = pexconv->detectInputCodepage(autodetectType, cp, src, len);
-               }
-       }
-       if (guessEncodingType & 1)
-       {
-               String lower_ext = strutils::makelower(ext);
-               if (lower_ext == _T(".rc"))
-               {
-                       cp = demoGuessEncoding_rc(src, len, cp);
-               }
-               else if (lower_ext == _T(".htm") || lower_ext == _T(".html"))
+               if (!ucr::CheckForInvalidUtf8(reinterpret_cast<const char*>(src), len))
+                       cp = ucr::CP_UTF_8;
+               else if (guessEncodingType & 2)
                {
-                       cp = demoGuessEncoding_html(src, len, cp);
+                       IExconverter* pexconv = Exconverter::getInstance();
+                       if (pexconv != nullptr && src != nullptr)
+                       {
+                               int autodetectType = (unsigned)guessEncodingType >> 16;
+                               cp = pexconv->detectInputCodepage(autodetectType, cp, reinterpret_cast<const char *>(src), len);
+                       }
                }
-               else if (lower_ext == _T(".xml") || lower_ext == _T(".xsl"))
+               if (guessEncodingType & 1)
                {
-                       cp = demoGuessEncoding_xml(src, len, cp);
+                       String lower_ext = strutils::makelower(ext);
+                       if (lower_ext == _T(".rc"))
+                       {
+                               cp = demoGuessEncoding_rc(reinterpret_cast<const char *>(src), len, cp);
+                       }
+                       else if (lower_ext == _T(".htm") || lower_ext == _T(".html"))
+                       {
+                               cp = demoGuessEncoding_html(reinterpret_cast<const char *>(src), len, cp);
+                       }
+                       else if (lower_ext == _T(".xml") || lower_ext == _T(".xsl"))
+                       {
+                               cp = demoGuessEncoding_xml(reinterpret_cast<const char *>(src), len, cp);
+                       }
                }
        }
-       return cp;
+       encoding.SetCodepage(cp);
+       return encoding;
 }
 
 /**
@@ -231,57 +244,11 @@ static unsigned GuessEncoding_from_bytes(const String& ext, const char *src, siz
  * @param [in] bGuessEncoding Try to guess codepage (not just unicode encoding).
  * @return Structure getting the encoding info.
  */
-FileTextEncoding GuessCodepageEncoding(const String& filepath, int guessEncodingType, ptrdiff_t mapmaxlen)
+FileTextEncoding Guess(const String& filepath, int guessEncodingType, ptrdiff_t mapmaxlen)
 {
-       FileTextEncoding encoding;
        CMarkdown::FileImage fi(filepath != _T("NUL") ? filepath.c_str() : nullptr, mapmaxlen);
-       encoding.SetCodepage(ucr::getDefaultCodepage());
-       encoding.m_bom = false;
-       switch (fi.nByteOrder)
-       {
-       case 8 + 2 + 0:
-               encoding.SetUnicoding(ucr::UCS2LE);
-               encoding.SetCodepage(ucr::CP_UCS2LE);
-               encoding.m_bom = true;
-               break;
-       case 8 + 2 + 1:
-               encoding.SetUnicoding(ucr::UCS2BE);
-               encoding.SetCodepage(ucr::CP_UCS2BE);
-               encoding.m_bom = true;
-               break;
-       case 8 + 1:
-               encoding.SetUnicoding(ucr::UTF8);
-               encoding.SetCodepage(ucr::CP_UTF_8);
-               encoding.m_bom = true;
-               break;
-       default:
-               encoding.m_bom = false;
-               break;
-       }
-       if (fi.nByteOrder < 4 && guessEncodingType != 0)
-       {
-               String ext = paths::FindExtension(filepath);
-               const char *src = (char *)fi.pImage;
-               size_t len = fi.cbImage;
-               if (len == static_cast<size_t>(mapmaxlen))
-               {
-                       for (size_t i = len; i--; )
-                       {
-                               if (isspace((unsigned char)src[i]))
-                               {
-                                       // make len an even number for ucs-2 detection
-                                       if ((i % 2) == 0)
-                                               len = i;
-                                       else
-                                               len = i + 1;
-                                       break;
-                               }
-                       }
-               }
-               if (unsigned cp = GuessEncoding_from_bytes(ext, src, len, guessEncodingType))
-                       encoding.SetCodepage(cp);
-               else
-                       encoding.SetCodepage(ucr::getDefaultCodepage());
-       }
-       return encoding;
+       String ext = paths::FindExtension(filepath);
+       return Guess(ext, fi.pImage, fi.cbImage, guessEncodingType);
+}
+
 }
index 44c1fa6..63c44ab 100644 (file)
@@ -8,7 +8,11 @@
 #include "UnicodeString.h"
 #include "FileTextEncoding.h"
 
+namespace codepage_detect
+{
 /** @brief Buffer size used in this file. */
-static const int BufSize = 65536;
+constexpr int BufSize = 65536;
 
-FileTextEncoding GuessCodepageEncoding(const String& filepath, int guessEncodingType, ptrdiff_t mapmaxlen = BufSize);
+FileTextEncoding Guess(const String& filepath, int guessEncodingType, ptrdiff_t mapmaxlen = BufSize);
+FileTextEncoding Guess(const String& ext, const void* src, size_t len, int guessEncodingType);
+}
index 026fcaf..c4974e9 100644 (file)
@@ -45,24 +45,24 @@ namespace
        TEST_F(CodepageDetectTest, GuessCodepageEncoding0)
        {
                FileTextEncoding enc;
-               enc = GuessCodepageEncoding(_T("../../Data/Unicode/UCS-2LE/DiffItem.h"), 0);
+               enc = codepage_detect::Guess(_T("../../Data/Unicode/UCS-2LE/DiffItem.h"), 0);
                EXPECT_EQ(1200, enc.m_codepage);
                EXPECT_EQ(true, enc.m_bom);
                EXPECT_EQ(ucr::UCS2LE, enc.m_unicoding);
-               enc = GuessCodepageEncoding(_T("../../Data/Unicode/UCS-2BE/DiffItem.h"), 0);
+               enc = codepage_detect::Guess(_T("../../Data/Unicode/UCS-2BE/DiffItem.h"), 0);
                EXPECT_EQ(1201, enc.m_codepage);
                EXPECT_EQ(true, enc.m_bom);
                EXPECT_EQ(ucr::UCS2BE, enc.m_unicoding);
-               enc = GuessCodepageEncoding(_T("../../Data/Unicode/UTF-8/DiffItem.h"), 0);
+               enc = codepage_detect::Guess(_T("../../Data/Unicode/UTF-8/DiffItem.h"), 0);
                EXPECT_EQ(65001, enc.m_codepage);
                EXPECT_EQ(true, enc.m_bom);
                EXPECT_EQ(ucr::UTF8, enc.m_unicoding);
-               enc = GuessCodepageEncoding(_T("../../Data/Unicode/UTF-8-NOBOM/DiffItem.h"), 0);
+               enc = codepage_detect::Guess(_T("../../Data/Unicode/UTF-8-NOBOM/DiffItem.h"), 0);
                EXPECT_EQ(ucr::getDefaultCodepage(), enc.m_codepage);
                EXPECT_EQ(false, enc.m_bom);
                EXPECT_EQ(ucr::NONE, enc.m_unicoding);
 
-               enc = GuessCodepageEncoding(_T("abcdefg12345"), 0);
+               enc = codepage_detect::Guess(_T("abcdefg12345"), 0);
                EXPECT_EQ(ucr::getDefaultCodepage(), enc.m_codepage);
                EXPECT_EQ(false, enc.m_bom);
                EXPECT_EQ(ucr::NONE, enc.m_unicoding);
@@ -71,22 +71,22 @@ namespace
        TEST_F(CodepageDetectTest, GuessCodepageEncoding1)
        {
                FileTextEncoding enc;
-               enc = GuessCodepageEncoding(_T("../../Data/Unicode/UTF-8-NOBOM/DiffItem.h"), 1);
+               enc = codepage_detect::Guess(_T("../../Data/Unicode/UTF-8-NOBOM/DiffItem.h"), 1);
                EXPECT_EQ(65001, enc.m_codepage);
                EXPECT_EQ(false, enc.m_bom);
                EXPECT_EQ(ucr::UTF8, enc.m_unicoding);
 
-               enc = GuessCodepageEncoding(_T("../../../Docs/Manual/EN/About_Doc.xml"), 1);
+               enc = codepage_detect::Guess(_T("../../../Docs/Manual/EN/About_Doc.xml"), 1);
                EXPECT_EQ(65001, enc.m_codepage);
                EXPECT_EQ(false, enc.m_bom);
                EXPECT_EQ(ucr::UTF8, enc.m_unicoding);
 
-               enc = GuessCodepageEncoding(_T("../../../Docs/Developers/readme-developers.html"), 1);
+               enc = codepage_detect::Guess(_T("../../../Docs/Developers/readme-developers.html"), 1);
                EXPECT_EQ(28591, enc.m_codepage);
                EXPECT_EQ(false, enc.m_bom);
                EXPECT_EQ(ucr::NONE, enc.m_unicoding);
 
-               enc = GuessCodepageEncoding(_T("../../../ShellExtension/Languages/ShellExtensionRussian.rc"), 1);
+               enc = codepage_detect::Guess(_T("../../../ShellExtension/Languages/ShellExtensionRussian.rc"), 1);
                EXPECT_EQ(65001, enc.m_codepage);
                EXPECT_EQ(true, enc.m_bom);
                EXPECT_EQ(ucr::UTF8, enc.m_unicoding);