}
/**
 * @brief Set the data file when its encoding is not known in advance.
 *
 * The encoding is guessed from the file with guess type 1 and then handed,
 * together with the file name, to SetDataFileEncoding().
 *
 * @param [in] filename Path of the data file.
 * @param [in] bOverwrite Forwarded unchanged to SetDataFileEncoding().
 */
void storageForPlugins::SetDataFileUnknown(const String& filename, bool bOverwrite /*= false*/)
{
- FileTextEncoding encoding = GuessCodepageEncoding(filename, 1);
+ FileTextEncoding encoding = codepage_detect::Guess(filename, 1);
SetDataFileEncoding(filename, encoding, bOverwrite);
}
bool success4 = baseRevision.Open(baseRevisionFileName, _T("wb"));
// detect codepage of conflict file
- FileTextEncoding encoding = GuessCodepageEncoding(conflictFileName, iGuessEncodingType);
+ FileTextEncoding encoding = codepage_detect::Guess(conflictFileName, iGuessEncodingType);
conflictFile.SetUnicoding(encoding.m_unicoding);
conflictFile.SetBom(encoding.m_bom);
if (!dfi.Update(filepath))
return false;
UpdateVersion(di, nIndex);
- dfi.encoding = GuessCodepageEncoding(filepath, m_iGuessEncodingType);
+ dfi.encoding = codepage_detect::Guess(filepath, m_iGuessEncodingType);
return true;
}
{
// re-detect codepage
int iGuessEncodingType = GetOptionsMgr()->GetInt(OPT_CP_DETECT);
- FileTextEncoding encoding2 = GuessCodepageEncoding(pszFileName, iGuessEncodingType);
+ FileTextEncoding encoding2 = codepage_detect::Guess(pszFileName, iGuessEncodingType);
pufile->SetUnicoding(encoding2.m_unicoding);
pufile->SetCodepage(encoding2.m_codepage);
pufile->SetBom(encoding2.m_bom);
// Unpacked files will be deleted at end of this function.
filepathTransformed[nIndex] = filepathUnpacked[nIndex];
- encoding[nIndex] = GuessCodepageEncoding(filepathTransformed[nIndex], m_pCtxt->m_iGuessEncodingType);
+ encoding[nIndex] = codepage_detect::Guess(filepathTransformed[nIndex], m_pCtxt->m_iGuessEncodingType);
m_diffFileData.m_FileLocation[nIndex].encoding = encoding[nIndex];
}
/**
 * @brief Guess the encoding of the file a FileLocation refers to.
 *
 * Runs encoding detection on fileloc.filepath and stores the result in
 * fileloc.encoding.
 *
 * @param [in,out] fileloc Location whose filepath is read and whose encoding is overwritten.
 * @param [in] iGuessEncoding Guess-type value forwarded unchanged to the detector.
 */
static void
FileLocationGuessEncodings(FileLocation & fileloc, int iGuessEncoding)
{
- fileloc.encoding = GuessCodepageEncoding(fileloc.filepath, iGuessEncoding);
+ fileloc.encoding = codepage_detect::Guess(fileloc.filepath, iGuessEncoding);
}
bool CMainFrame::ShowAutoMergeDoc(CDirDoc * pDirDoc,
{
m_ptBuf[index]->FreeAll();
loadSuccess = LoadFile(filename.c_str(), index, readOnly,
- GuessCodepageEncoding(filename, GetOptionsMgr()->GetInt(OPT_CP_DETECT), -1));
+ codepage_detect::Guess(filename, GetOptionsMgr()->GetInt(OPT_CP_DETECT), -1));
}
}
else
strDesc[nBuffer] = _T("");
fileloc[nBuffer].setPath(path);
- fileloc[nBuffer].encoding = GuessCodepageEncoding(path, GetOptionsMgr()->GetInt(OPT_CP_DETECT));
+ fileloc[nBuffer].encoding = codepage_detect::Guess(path, GetOptionsMgr()->GetInt(OPT_CP_DETECT));
if (OpenDocs(m_nBuffers, fileloc, bRO, strDesc))
MoveOnLoad(nBuffer, nLineIndex);
return cp;
}
+namespace codepage_detect
+{
/**
 * @brief Try to deduce the text encoding of an in-memory file image.
 *
 * Unicode detection (ucr::DetermineEncoding) runs first; if it reports
 * anything other than ucr::NONE, that result is returned immediately.
 * Otherwise the codepage starts at ucr::getDefaultCodepage() and, when
 * guessEncodingType is non-zero, is refined as follows:
 *  - if ucr::CheckForInvalidUtf8() returns false, the codepage becomes UTF-8;
 *  - else, if bit 1 (value 2) is set, the Exconverter detector is consulted,
 *    with the bits above 16 passed as its autodetect type;
 *  - if bit 0 (value 1) is set, the extension-specific parsers for
 *    .rc, .htm/.html and .xml/.xsl may override the codepage.
 *
 * @param [in] ext File extension including the leading dot (compared case-insensitively).
 * @param [in] src Pointer to the file contents.
 * @param [in] len Size of the file contents.
 * @param [in] guessEncodingType Detection flags as described above; 0 disables codepage guessing.
 * @return Detected encoding (unicoding, BOM flag and codepage).
 */
-static unsigned GuessEncoding_from_bytes(const String& ext, const char *src, size_t len, int guessEncodingType)
+FileTextEncoding Guess(const String& ext, const void * src, size_t len, int guessEncodingType)
{
+ FileTextEncoding encoding;
+ // encoding.m_bom is handed to DetermineEncoding as its BOM out-parameter.
+ encoding.SetUnicoding(ucr::DetermineEncoding(reinterpret_cast<const unsigned char *>(src), len, &encoding.m_bom));
+ if (encoding.m_unicoding != ucr::NONE)
+ return encoding;
unsigned cp = ucr::getDefaultCodepage();
- if (!ucr::CheckForInvalidUtf8(src, len))
- cp = ucr::CP_UTF_8;
- else if (guessEncodingType & 2)
+ if (guessEncodingType != 0)
{
- IExconverter *pexconv = Exconverter::getInstance();
- if (pexconv != nullptr && src != nullptr)
- {
- int autodetectType = (unsigned)guessEncodingType >> 16;
- cp = pexconv->detectInputCodepage(autodetectType, cp, src, len);
- }
- }
- if (guessEncodingType & 1)
- {
- String lower_ext = strutils::makelower(ext);
- if (lower_ext == _T(".rc"))
- {
- cp = demoGuessEncoding_rc(src, len, cp);
- }
- else if (lower_ext == _T(".htm") || lower_ext == _T(".html"))
+ if (!ucr::CheckForInvalidUtf8(reinterpret_cast<const char*>(src), len))
+ cp = ucr::CP_UTF_8;
+ else if (guessEncodingType & 2)
{
- cp = demoGuessEncoding_html(src, len, cp);
+ IExconverter* pexconv = Exconverter::getInstance();
+ if (pexconv != nullptr && src != nullptr)
+ {
+ int autodetectType = (unsigned)guessEncodingType >> 16;
+ cp = pexconv->detectInputCodepage(autodetectType, cp, reinterpret_cast<const char *>(src), len);
+ }
}
- else if (lower_ext == _T(".xml") || lower_ext == _T(".xsl"))
+ if (guessEncodingType & 1)
{
- cp = demoGuessEncoding_xml(src, len, cp);
+ String lower_ext = strutils::makelower(ext);
+ if (lower_ext == _T(".rc"))
+ {
+ cp = demoGuessEncoding_rc(reinterpret_cast<const char *>(src), len, cp);
+ }
+ else if (lower_ext == _T(".htm") || lower_ext == _T(".html"))
+ {
+ cp = demoGuessEncoding_html(reinterpret_cast<const char *>(src), len, cp);
+ }
+ else if (lower_ext == _T(".xml") || lower_ext == _T(".xsl"))
+ {
+ cp = demoGuessEncoding_xml(reinterpret_cast<const char *>(src), len, cp);
+ }
}
}
- return cp;
+ encoding.SetCodepage(cp);
+ return encoding;
}
/**
 * @brief Guess the text encoding of a file on disk.
 *
 * Opens a CMarkdown::FileImage for the file (no path is given to it when
 * filepath is "NUL"), extracts the extension with paths::FindExtension() and
 * delegates to the buffer-based Guess() overload.
 *
 * NOTE(review): the removed implementation trimmed the buffer length back to
 * the last whitespace when the image filled mapmaxlen; this version passes
 * fi.cbImage through unchanged. Confirm a multi-byte sequence cut at the end
 * of the buffer cannot make the UTF-8 check fail.
 *
 * @param [in] filepath Path of the file to examine.
 * @param [in] guessEncodingType Detection flags forwarded unchanged to the buffer-based overload.
 * @param [in] mapmaxlen Passed to CMarkdown::FileImage; presumably the maximum number of bytes to map.
 * @return Structure getting the encoding info.
 */
-FileTextEncoding GuessCodepageEncoding(const String& filepath, int guessEncodingType, ptrdiff_t mapmaxlen)
+FileTextEncoding Guess(const String& filepath, int guessEncodingType, ptrdiff_t mapmaxlen)
{
- FileTextEncoding encoding;
CMarkdown::FileImage fi(filepath != _T("NUL") ? filepath.c_str() : nullptr, mapmaxlen);
- encoding.SetCodepage(ucr::getDefaultCodepage());
- encoding.m_bom = false;
- switch (fi.nByteOrder)
- {
- case 8 + 2 + 0:
- encoding.SetUnicoding(ucr::UCS2LE);
- encoding.SetCodepage(ucr::CP_UCS2LE);
- encoding.m_bom = true;
- break;
- case 8 + 2 + 1:
- encoding.SetUnicoding(ucr::UCS2BE);
- encoding.SetCodepage(ucr::CP_UCS2BE);
- encoding.m_bom = true;
- break;
- case 8 + 1:
- encoding.SetUnicoding(ucr::UTF8);
- encoding.SetCodepage(ucr::CP_UTF_8);
- encoding.m_bom = true;
- break;
- default:
- encoding.m_bom = false;
- break;
- }
- if (fi.nByteOrder < 4 && guessEncodingType != 0)
- {
- String ext = paths::FindExtension(filepath);
- const char *src = (char *)fi.pImage;
- size_t len = fi.cbImage;
- if (len == static_cast<size_t>(mapmaxlen))
- {
- for (size_t i = len; i--; )
- {
- if (isspace((unsigned char)src[i]))
- {
- // make len an even number for ucs-2 detection
- if ((i % 2) == 0)
- len = i;
- else
- len = i + 1;
- break;
- }
- }
- }
- if (unsigned cp = GuessEncoding_from_bytes(ext, src, len, guessEncodingType))
- encoding.SetCodepage(cp);
- else
- encoding.SetCodepage(ucr::getDefaultCodepage());
- }
- return encoding;
+ String ext = paths::FindExtension(filepath);
+ return Guess(ext, fi.pImage, fi.cbImage, guessEncodingType);
+}
+
}
#include "UnicodeString.h"
#include "FileTextEncoding.h"
+namespace codepage_detect
+{
/** @brief Buffer size used in this file. */
-static const int BufSize = 65536;
+constexpr int BufSize = 65536;
/**
 * @brief Guess the encoding of the file at filepath.
 * @param [in] filepath Path of the file to examine.
 * @param [in] guessEncodingType Detection flags; 0 disables codepage guessing.
 * @param [in] mapmaxlen Size limit passed to the file mapping; defaults to BufSize.
 */
-FileTextEncoding GuessCodepageEncoding(const String& filepath, int guessEncodingType, ptrdiff_t mapmaxlen = BufSize);
+FileTextEncoding Guess(const String& filepath, int guessEncodingType, ptrdiff_t mapmaxlen = BufSize);
/**
 * @brief Guess the encoding of a buffer already in memory.
 * @param [in] ext File extension associated with the contents.
 * @param [in] src Pointer to the contents.
 * @param [in] len Size of the contents.
 * @param [in] guessEncodingType Detection flags; 0 disables codepage guessing.
 */
+FileTextEncoding Guess(const String& ext, const void* src, size_t len, int guessEncodingType);
+}
TEST_F(CodepageDetectTest, GuessCodepageEncoding0)
{
FileTextEncoding enc;
- enc = GuessCodepageEncoding(_T("../../Data/Unicode/UCS-2LE/DiffItem.h"), 0);
+ enc = codepage_detect::Guess(_T("../../Data/Unicode/UCS-2LE/DiffItem.h"), 0);
EXPECT_EQ(1200, enc.m_codepage);
EXPECT_EQ(true, enc.m_bom);
EXPECT_EQ(ucr::UCS2LE, enc.m_unicoding);
- enc = GuessCodepageEncoding(_T("../../Data/Unicode/UCS-2BE/DiffItem.h"), 0);
+ enc = codepage_detect::Guess(_T("../../Data/Unicode/UCS-2BE/DiffItem.h"), 0);
EXPECT_EQ(1201, enc.m_codepage);
EXPECT_EQ(true, enc.m_bom);
EXPECT_EQ(ucr::UCS2BE, enc.m_unicoding);
- enc = GuessCodepageEncoding(_T("../../Data/Unicode/UTF-8/DiffItem.h"), 0);
+ enc = codepage_detect::Guess(_T("../../Data/Unicode/UTF-8/DiffItem.h"), 0);
EXPECT_EQ(65001, enc.m_codepage);
EXPECT_EQ(true, enc.m_bom);
EXPECT_EQ(ucr::UTF8, enc.m_unicoding);
- enc = GuessCodepageEncoding(_T("../../Data/Unicode/UTF-8-NOBOM/DiffItem.h"), 0);
+ enc = codepage_detect::Guess(_T("../../Data/Unicode/UTF-8-NOBOM/DiffItem.h"), 0);
EXPECT_EQ(ucr::getDefaultCodepage(), enc.m_codepage);
EXPECT_EQ(false, enc.m_bom);
EXPECT_EQ(ucr::NONE, enc.m_unicoding);
- enc = GuessCodepageEncoding(_T("abcdefg12345"), 0);
+ enc = codepage_detect::Guess(_T("abcdefg12345"), 0);
EXPECT_EQ(ucr::getDefaultCodepage(), enc.m_codepage);
EXPECT_EQ(false, enc.m_bom);
EXPECT_EQ(ucr::NONE, enc.m_unicoding);
TEST_F(CodepageDetectTest, GuessCodepageEncoding1)
{
FileTextEncoding enc;
- enc = GuessCodepageEncoding(_T("../../Data/Unicode/UTF-8-NOBOM/DiffItem.h"), 1);
+ enc = codepage_detect::Guess(_T("../../Data/Unicode/UTF-8-NOBOM/DiffItem.h"), 1);
EXPECT_EQ(65001, enc.m_codepage);
EXPECT_EQ(false, enc.m_bom);
EXPECT_EQ(ucr::UTF8, enc.m_unicoding);
- enc = GuessCodepageEncoding(_T("../../../Docs/Manual/EN/About_Doc.xml"), 1);
+ enc = codepage_detect::Guess(_T("../../../Docs/Manual/EN/About_Doc.xml"), 1);
EXPECT_EQ(65001, enc.m_codepage);
EXPECT_EQ(false, enc.m_bom);
EXPECT_EQ(ucr::UTF8, enc.m_unicoding);
- enc = GuessCodepageEncoding(_T("../../../Docs/Developers/readme-developers.html"), 1);
+ enc = codepage_detect::Guess(_T("../../../Docs/Developers/readme-developers.html"), 1);
EXPECT_EQ(28591, enc.m_codepage);
EXPECT_EQ(false, enc.m_bom);
EXPECT_EQ(ucr::NONE, enc.m_unicoding);
- enc = GuessCodepageEncoding(_T("../../../ShellExtension/Languages/ShellExtensionRussian.rc"), 1);
+ enc = codepage_detect::Guess(_T("../../../ShellExtension/Languages/ShellExtensionRussian.rc"), 1);
EXPECT_EQ(65001, enc.m_codepage);
EXPECT_EQ(true, enc.m_bom);
EXPECT_EQ(ucr::UTF8, enc.m_unicoding);