OSDN Git Service

Optimize Unicode conversion
author sdottaka <none@none>
Sat, 12 Jan 2013 13:55:46 +0000 (22:55 +0900)
committer sdottaka <none@none>
Sat, 12 Jan 2013 13:55:46 +0000 (22:55 +0900)
Externals/poco/Foundation/src/UnicodeConverter.cpp
Src/Common/ExConverter.cpp
Src/Common/multiformatText.cpp
Src/Common/unicoder.cpp

index faabaf1..3ce2ec8 100644 (file)
@@ -51,6 +51,9 @@ namespace Poco {
 
 void UnicodeConverter::toUTF16(const std::string& utf8String, std::wstring& utf16String)
 {
+#if 1
+       toUTF16(utf8String.c_str(), utf8String.length(), utf16String);
+#else
        utf16String.clear();
        UTF8Encoding utf8Encoding;
        TextIterator it(utf8String, utf8Encoding);
@@ -69,6 +72,7 @@ void UnicodeConverter::toUTF16(const std::string& utf8String, std::wstring& utf1
                        utf16String += (wchar_t) (cc & 0x3ff) | 0xdc00;
                }
        }
+#endif
 }
 
 
index 74b68fe..0a64a0c 100644 (file)
@@ -87,17 +87,30 @@ public:
        bool convert(int srcCodepage, int dstCodepage, const unsigned char * src, size_t * srcbytes, unsigned char * dest, size_t * destbytes)
        {
                bool bsucceeded;
-               size_t wsize = *srcbytes * 2 + 6;
-               wchar_t *pbuf = new wchar_t[wsize];
-               bsucceeded = convertToUnicode(srcCodepage, (const char *)src, srcbytes, pbuf, &wsize);
-               if (!bsucceeded)
+#ifdef POCO_ARCH_BIG_ENDIAN
+               if (srcCodepage == CP_UCS2BE)
+#else
+               if (srcCodepage == CP_UCS2LE)
+#endif
+               {
+                       size_t srcwchars = *srcbytes / sizeof(wchar_t);
+                       bsucceeded = convertFromUnicode(dstCodepage, (const wchar_t *)src, &srcwchars, (char *)dest, destbytes);
+                       *srcbytes = srcwchars * sizeof(wchar_t);
+               }
+               else
                {
+                       size_t wsize = *srcbytes * 2 + 6;
+                       wchar_t *pbuf = new wchar_t[wsize];
+                       bsucceeded = convertToUnicode(srcCodepage, (const char *)src, srcbytes, pbuf, &wsize);
+                       if (!bsucceeded)
+                       {
+                               delete [] pbuf;
+                               *destbytes = 0;
+                               return false;
+                       }
+                       bsucceeded = convertFromUnicode(dstCodepage, pbuf, &wsize, (char *)dest, destbytes);
                        delete [] pbuf;
-                       destbytes = 0;
-                       return false;
                }
-               bsucceeded = convertFromUnicode(dstCodepage, pbuf, &wsize, (char *)dest, destbytes);
-               delete [] pbuf;
                return bsucceeded;
        }
 
index 059d72f..886c6ea 100644 (file)
@@ -37,6 +37,9 @@
 #include <boost/scoped_array.hpp>
 #include <boost/scoped_ptr.hpp>
 #include <Poco/SharedMemory.h>
+#include <Poco/FileStream.h>
+#include <Poco/ByteOrder.h>
+#include <Poco/Buffer.h>
 #include <Poco/Exception.h>
 #include "unicoder.h"
 #include "ExConverter.h"
 #include "MergeApp.h"
 
 using Poco::SharedMemory;
+using Poco::FileOutputStream;
+using Poco::ByteOrder;
 using Poco::Exception;
+using Poco::Buffer;
+using Poco::Int64;
 
 ////////////////////////////////////////////////////////////////////////////////
 
@@ -584,6 +591,39 @@ static size_t TransformUtf8ToUcs2(const char * pcsUtf, size_t nUtf, wchar_t * ps
        return (nUcs - nremains);
 }
 
+/**
+ * @brief Locate the position just past the first line terminator in a buffer.
+ *
+ * Scans the code units in [pstart, pend) for the first LF, CR LF or lone CR.
+ *
+ * @tparam T         Code unit type read from the buffer.
+ * @tparam flipbytes When true, every unit is passed through
+ *                   ByteOrder::flipBytes() before it is examined.
+ * @param pstart     First unit to examine.
+ * @param pend       One past the last unit to examine.
+ * @return Pointer to the unit following the first line terminator found
+ *         (CR LF counts as a single terminator), or pend when the range
+ *         contains no terminator (including when pstart >= pend).
+ */
+template<typename T, bool flipbytes>
+inline const T *findNextLine(const T *pstart, const T *pend)
+{
+       for (const T *p = pstart; p < pend; ++p)
+       {
+               int ch = flipbytes ? ByteOrder::flipBytes(*p) : *p;
+               if (ch == '\n')
+                       return p + 1;
+               else if (ch == '\r')
+               {
+                       // Decode the following unit exactly like the current one, so the
+                       // LF test is always made on a unit of type T in the same byte
+                       // order as `ch`, instead of comparing a raw unit against a
+                       // byte-swapped character literal.
+                       if (p + 1 < pend && (flipbytes ? ByteOrder::flipBytes(*(p + 1)) : *(p + 1)) == '\n')
+                               return p + 2;
+                       else
+                               return p + 1;
+               }
+       }
+       return pend;
+}
+
+/**
+ * @brief Encoding-aware front end for the findNextLine template.
+ *
+ * For ucr::UCS2LE and ucr::UCS2BE the byte range is reinterpreted as
+ * unsigned short units (UCS2BE with the byte-flipping variant, UCS2LE
+ * without); every other value of @p unicoding is scanned as plain chars.
+ *
+ * @param unicoding Encoding selector deciding which scanner is used.
+ * @param pstart    First byte of the range to scan.
+ * @param pend      One past the last byte of the range to scan.
+ * @return Whatever the selected scanner returns, converted back to a
+ *         const char pointer.
+ */
+static const char *findNextLine(ucr::UNICODESET unicoding, const char *pstart, const char *pend)
+{
+       if (unicoding == ucr::UCS2LE || unicoding == ucr::UCS2BE)
+       {
+               const unsigned short *wbegin = (const unsigned short *)pstart;
+               const unsigned short *wend = (const unsigned short *)pend;
+               const unsigned short *wnext = (unicoding == ucr::UCS2BE)
+                       ? findNextLine<unsigned short, true>(wbegin, wend)
+                       : findNextLine<unsigned short, false>(wbegin, wend);
+               return (const char *)wnext;
+       }
+       return findNextLine<char, false>(pstart, pend);
+}
+
 bool AnyCodepageToUTF8(int codepage, const String& filepath, const String& filepathDst, int & nFileChanged, bool bWriteBOM)
 {
        UniMemFile ufile;
@@ -620,24 +660,38 @@ bool AnyCodepageToUTF8(int codepage, const String& filepath, const String& filep
                size_t nSizeBOM = (bWriteBOM) ? 3 : 0;
                size_t nDstSize = nBufSize * 2;
 
+               const size_t minbufsize = 128 * 1024;
+
                // create the destination file
-               TFile fileOut(filepathDst);
-               fileOut.setSize(nDstSize + nSizeBOM);
-               SharedMemory shmOut(fileOut, SharedMemory::AM_WRITE);
+               FileOutputStream fout(ucr::toUTF8(filepathDst), std::ios::out|std::ios::binary|std::ios::trunc);
+               Buffer<char> obuf(minbufsize);
+               Int64 pos = nSizeOldBOM;
 
                // write BOM
                if (bWriteBOM)
-                       ucr::writeBom(shmOut.begin(), ucr::UTF8);
+               {
+                       char bom[4];
+                       fout.write(bom, ucr::writeBom(bom, ucr::UTF8));
+               }
 
                // write data
-               size_t srcbytes = nBufSize;
-               size_t destbytes = nDstSize;
-               if (pexconv)
-                       pexconv->convert(codepage, CP_UTF8, (const unsigned char *)pszBuf+nSizeOldBOM, &srcbytes, (unsigned char *)shmOut.begin()+nSizeBOM, &destbytes);
-               else
+               for (;;)
                {
-                       bool lossy = false;
-                       ucr::CrossConvert((const char *)pszBuf+nSizeOldBOM, srcbytes, shmOut.begin()+nSizeBOM, destbytes, codepage, CP_UTF8, &lossy);
+                       size_t srcbytes = findNextLine(unicoding, pszBuf + pos + minbufsize, pszBuf + nBufSize) - (pszBuf + pos);
+                       if (srcbytes == 0)
+                               break;
+                       if (srcbytes > obuf.size())
+                               obuf.resize(srcbytes * 2, false);
+                       size_t destbytes = obuf.size();
+                       if (pexconv)
+                               pexconv->convert(codepage, CP_UTF8, (const unsigned char *)pszBuf+pos, &srcbytes, (unsigned char *)obuf.begin(), &destbytes);
+                       else
+                       {
+                               bool lossy = false;
+                               destbytes = ucr::CrossConvert((const char *)pszBuf+pos, srcbytes, obuf.begin(), destbytes, codepage, CP_UTF8, &lossy);
+                       }
+                       fout.write(obuf.begin(), destbytes);
+                       pos += srcbytes;
                }
 
                nFileChanged ++;
index 52dd292..6857c2e 100644 (file)
@@ -878,8 +878,25 @@ std::string toUTF8(const String& tstr)
 
 void toUTF8(const String& tstr, std::string& u8str)
 {
-#ifdef UNICODE
-       UnicodeConverter::toUTF8(tstr, u8str);
+#ifdef _UNICODE
+       u8str.clear();
+       size_t len = tstr.length();
+       if (len == 0)
+               return;
+       u8str.resize(len * 3);
+       char *p = &u8str[0];
+       for (String::const_iterator it = tstr.begin(); it != tstr.end(); ++it)
+       {
+               unsigned uc = *it;
+               if (uc >= 0xd800 && uc < 0xdc00)
+               {
+                       ++it;
+                       wchar_t uc2 = *it;
+                       uc = ((uc & 0x3ff) << 10) + (uc2 & 0x3ff) + 0x10000;
+               }
+               p += Ucs4_to_Utf8(uc, reinterpret_cast<unsigned char *>(p));
+       }
+       u8str.resize(p - &u8str[0]);
 #else
        const char *p = (const char *)convertTtoUTF8(tstr.c_str(), tstr.length());
        u8str = p;