Optimize Unicode conversion

author sdottaka <none@none>

Sat, 12 Jan 2013 13:55:46 +0000 (22:55 +0900)

committer sdottaka <none@none>

Sat, 12 Jan 2013 13:55:46 +0000 (22:55 +0900)
author sdottaka <none@none>
Sat, 12 Jan 2013 13:55:46 +0000 (22:55 +0900)
committer sdottaka <none@none>
Sat, 12 Jan 2013 13:55:46 +0000 (22:55 +0900)
diff --git a/Externals/poco/Foundation/src/UnicodeConverter.cpp b/Externals/poco/Foundation/src/UnicodeConverter.cpp

index faabaf1..3ce2ec8 100644 (file)
--- a/Externals/poco/Foundation/src/UnicodeConverter.cpp
+++ b/Externals/poco/Foundation/src/UnicodeConverter.cpp
@@ -51,6 +51,9 @@ namespace Poco {
  
  void UnicodeConverter::toUTF16(const std::string& utf8String, std::wstring& utf16String)
  {
+#if 1
+       toUTF16(utf8String.c_str(), utf8String.length(), utf16String);
+#else
         utf16String.clear();
         UTF8Encoding utf8Encoding;
         TextIterator it(utf8String, utf8Encoding);
@@ -69,6 +72,7 @@ void UnicodeConverter::toUTF16(const std::string& utf8String, std::wstring& utf1
                         utf16String += (wchar_t) (cc & 0x3ff) | 0xdc00;
                 }
         }
+#endif
  }
  
  
diff --git a/Src/Common/ExConverter.cpp b/Src/Common/ExConverter.cpp

index 74b68fe..0a64a0c 100644 (file)
--- a/Src/Common/ExConverter.cpp
+++ b/Src/Common/ExConverter.cpp
@@ -87,17 +87,30 @@ public:
         bool convert(int srcCodepage, int dstCodepage, const unsigned char * src, size_t * srcbytes, unsigned char * dest, size_t * destbytes)
         {
                 bool bsucceeded;
-               size_t wsize = *srcbytes * 2 + 6;
-               wchar_t *pbuf = new wchar_t[wsize];
-               bsucceeded = convertToUnicode(srcCodepage, (const char *)src, srcbytes, pbuf, &wsize);
-               if (!bsucceeded)
+#ifdef POCO_ARCH_BIG_ENDIAN
+               if (srcCodepage == CP_UCS2BE)
+#else
+               if (srcCodepage == CP_UCS2LE)
+#endif
+               {
+                       size_t srcwchars = *srcbytes / sizeof(wchar_t);
+                       bsucceeded = convertFromUnicode(dstCodepage, (const wchar_t *)src, &srcwchars, (char *)dest, destbytes);
+                       *srcbytes = srcwchars * sizeof(wchar_t);
+               }
+               else
                 {
+                       size_t wsize = *srcbytes * 2 + 6;
+                       wchar_t *pbuf = new wchar_t[wsize];
+                       bsucceeded = convertToUnicode(srcCodepage, (const char *)src, srcbytes, pbuf, &wsize);
+                       if (!bsucceeded)
+                       {
+                               delete [] pbuf;
+                               *destbytes = 0;
+                               return false;
+                       }
+                       bsucceeded = convertFromUnicode(dstCodepage, pbuf, &wsize, (char *)dest, destbytes);
                         delete [] pbuf;
-                       destbytes = 0;
-                       return false;
                 }
-               bsucceeded = convertFromUnicode(dstCodepage, pbuf, &wsize, (char *)dest, destbytes);
-               delete [] pbuf;
                 return bsucceeded;
         }
  
diff --git a/Src/Common/multiformatText.cpp b/Src/Common/multiformatText.cpp

index 059d72f..886c6ea 100644 (file)
--- a/Src/Common/multiformatText.cpp
+++ b/Src/Common/multiformatText.cpp
@@ -37,6 +37,9 @@
  #include <boost/scoped_array.hpp>
  #include <boost/scoped_ptr.hpp>
  #include <Poco/SharedMemory.h>
+#include <Poco/FileStream.h>
+#include <Poco/ByteOrder.h>
+#include <Poco/Buffer.h>
  #include <Poco/Exception.h>
  #include "unicoder.h"
  #include "ExConverter.h"
@@ -48,7 +51,11 @@
  #include "MergeApp.h"
  
  using Poco::SharedMemory;
+using Poco::FileOutputStream;
+using Poco::ByteOrder;
  using Poco::Exception;
+using Poco::Buffer;
+using Poco::Int64;
  
  ////////////////////////////////////////////////////////////////////////////////
  
@@ -584,6 +591,39 @@ static size_t TransformUtf8ToUcs2(const char * pcsUtf, size_t nUtf, wchar_t * ps
         return (nUcs - nremains);
  }
  
+// Returns a pointer just past the first line terminator ("\n", "\r" or "\r\n") found in
+// [pstart, pend), or pend when there is none. With flipbytes, units are byte-swapped before testing.
+template<typename T, bool flipbytes>
+inline const T *findNextLine(const T *pstart, const T *pend)
+{
+       for (const T *p = pstart; p < pend; ++p)
+       {
+               int ch = flipbytes ? ByteOrder::flipBytes(*p) : *p;
+               if (ch == '\n')
+                       return p + 1;
+               if (ch != '\r')
+                       continue;
+               // Swap the following unit exactly like ch; flipBytes('\n') would pick the 32-bit overload.
+               bool lf = p + 1 < pend && (flipbytes ? ByteOrder::flipBytes(*(p + 1)) : *(p + 1)) == '\n';
+               return lf ? p + 2 : p + 1;
+       }
+       return pend;
+}
+
+// Byte-pointer front end for the typed findNextLine<> scanner.
+// UCS2LE and UCS2BE buffers are scanned as 16-bit units (byte-swapped for UCS2BE);
+// every other value of 'unicoding' is scanned byte by byte.
+// NOTE(review): scanning UCS2LE without a swap presumes a little-endian host - confirm.
+static const char *findNextLine(ucr::UNICODESET unicoding, const char *pstart, const char *pend)
+{
+       typedef unsigned short unit16;
+       if (unicoding == ucr::UCS2LE)
+               return (const char *)findNextLine<unit16, false>((const unit16 *)pstart, (const unit16 *)pend);
+       if (unicoding == ucr::UCS2BE)
+               return (const char *)findNextLine<unit16, true>((const unit16 *)pstart, (const unit16 *)pend);
+       return findNextLine<char, false>(pstart, pend);
+}
+
  bool AnyCodepageToUTF8(int codepage, const String& filepath, const String& filepathDst, int & nFileChanged, bool bWriteBOM)
  {
         UniMemFile ufile;
@@ -620,24 +660,38 @@ bool AnyCodepageToUTF8(int codepage, const String& filepath, const String& filep
                 size_t nSizeBOM = (bWriteBOM) ? 3 : 0;
                 size_t nDstSize = nBufSize * 2;
  
+               const size_t minbufsize = 128 * 1024;
+
                 // create the destination file
-               TFile fileOut(filepathDst);
-               fileOut.setSize(nDstSize + nSizeBOM);
-               SharedMemory shmOut(fileOut, SharedMemory::AM_WRITE);
+               FileOutputStream fout(ucr::toUTF8(filepathDst), std::ios::out|std::ios::binary|std::ios::trunc);
+               Buffer<char> obuf(minbufsize);
+               Int64 pos = nSizeOldBOM;
  
                 // write BOM
                 if (bWriteBOM)
-                       ucr::writeBom(shmOut.begin(), ucr::UTF8);
+               {
+                       char bom[4];
+                       fout.write(bom, ucr::writeBom(bom, ucr::UTF8));
+               }
  
                 // write data
-               size_t srcbytes = nBufSize;
-               size_t destbytes = nDstSize;
-               if (pexconv)
-                       pexconv->convert(codepage, CP_UTF8, (const unsigned char *)pszBuf+nSizeOldBOM, &srcbytes, (unsigned char *)shmOut.begin()+nSizeBOM, &destbytes);
-               else
+               for (;;)
                 {
-                       bool lossy = false;
-                       ucr::CrossConvert((const char *)pszBuf+nSizeOldBOM, srcbytes, shmOut.begin()+nSizeBOM, destbytes, codepage, CP_UTF8, &lossy);
+                       size_t srcbytes = findNextLine(unicoding, pszBuf + pos + minbufsize, pszBuf + nBufSize) - (pszBuf + pos);
+                       if (srcbytes == 0)
+                               break;
+                       if (srcbytes > obuf.size())
+                               obuf.resize(srcbytes * 2, false);
+                       size_t destbytes = obuf.size();
+                       if (pexconv)
+                               pexconv->convert(codepage, CP_UTF8, (const unsigned char *)pszBuf+pos, &srcbytes, (unsigned char *)obuf.begin(), &destbytes);
+                       else
+                       {
+                               bool lossy = false;
+                               destbytes = ucr::CrossConvert((const char *)pszBuf+pos, srcbytes, obuf.begin(), destbytes, codepage, CP_UTF8, &lossy);
+                       }
+                       fout.write(obuf.begin(), destbytes);
+                       pos += srcbytes;
                 }
  
                 nFileChanged ++;
diff --git a/Src/Common/unicoder.cpp b/Src/Common/unicoder.cpp

index 52dd292..6857c2e 100644 (file)
--- a/Src/Common/unicoder.cpp
+++ b/Src/Common/unicoder.cpp
@@ -878,8 +878,25 @@ std::string toUTF8(const String& tstr)
  
  void toUTF8(const String& tstr, std::string& u8str)
  {
-#ifdef UNICODE
-       UnicodeConverter::toUTF8(tstr, u8str);
+#ifdef _UNICODE
+       u8str.clear();
+       size_t len = tstr.length();
+       if (len == 0)
+               return;
+       u8str.resize(len * 3);
+       char *p = &u8str[0];
+       for (String::const_iterator it = tstr.begin(); it != tstr.end(); ++it)
+       {
+               unsigned uc = *it;
+               if (uc >= 0xd800 && uc < 0xdc00)
+               {
+                       ++it;
+                       wchar_t uc2 = *it;
+                       uc = ((uc & 0x3ff) << 10) + (uc2 & 0x3ff) + 0x10000;
+               }
+               p += Ucs4_to_Utf8(uc, reinterpret_cast<unsigned char *>(p));
+       }
+       u8str.resize(p - &u8str[0]);
  #else
         const char *p = (const char *)convertTtoUTF8(tstr.c_str(), tstr.length());
         u8str = p;
author	sdottaka <none@none>
	Sat, 12 Jan 2013 13:55:46 +0000 (22:55 +0900)
committer	sdottaka <none@none>
	Sat, 12 Jan 2013 13:55:46 +0000 (22:55 +0900)
Externals/poco/Foundation/src/UnicodeConverter.cpp		patch \| blob \| history
Src/Common/ExConverter.cpp		patch \| blob \| history
Src/Common/multiformatText.cpp		patch \| blob \| history
Src/Common/unicoder.cpp		patch \| blob \| history