1 /////////////////////////////////////////////////////////////////////////////
2 // WinMerge: an interactive diff/merge utility
3 // Copyright (C) 1997-2000 Thingamahoochie Software
5 // SPDX-License-Identifier: GPL-2.0-or-later
6 /////////////////////////////////////////////////////////////////////////////
8 * @file multiformatText.cpp
10 * @brief Implementation of class storageForPlugins
12 * @date Created: 2003-11-24
17 #include "multiformatText.h"
23 #include <Poco/SharedMemory.h>
24 #include <Poco/FileStream.h>
25 #include <Poco/ByteOrder.h>
26 #include <Poco/Buffer.h>
27 #include <Poco/Exception.h>
29 #include "ExConverter.h"
32 #include "codepage_detect.h"
33 #include "Environment.h"
37 using Poco::SharedMemory;
38 using Poco::FileOutputStream;
39 using Poco::ByteOrder;
40 using Poco::Exception;
43 ////////////////////////////////////////////////////////////////////////////////
45 static void *GetVariantArrayData(VARIANT& array, unsigned& size)
48 SafeArrayAccessData(array.parray, (void**)&parrayData);
50 SafeArrayGetLBound(array.parray, 1, &lbound);
51 SafeArrayGetUBound(array.parray, 1, &ubound);
52 size = ubound - lbound;
/**
 * @brief Release the in-memory buffers and forget the temporary destination name.
 *
 * Frees the BSTR held in m_bstr, clears the VARIANT m_array and empties
 * m_tempFilenameDst.
 */
56 void storageForPlugins::Initialize()
58 SysFreeString(m_bstr);
60 VariantClear(&m_array);
61 m_tempFilenameDst.clear();
/**
 * @brief Register a file as the data source, described as non-Unicode text.
 *
 * Builds a FileTextEncoding with unicoding ucr::NONE and the codepage returned
 * by ucr::getDefaultCodepage(), then forwards to SetDataFileEncoding().
 *
 * @param filename   File passed on to SetDataFileEncoding().
 * @param bOverwrite Passed on unchanged to SetDataFileEncoding().
 */
64 void storageForPlugins::SetDataFileAnsi(const String& filename, bool bOverwrite /*= false*/)
66 FileTextEncoding encoding;
67 encoding.SetUnicoding(ucr::NONE);
68 encoding.SetCodepage(ucr::getDefaultCodepage());
69 SetDataFileEncoding(filename, encoding, bOverwrite);
/**
 * @brief Register a file with a known encoding as the data source.
 *
 * Stores the file name, marks the current data as being a file, and records
 * the overwrite flag, the codepage and the BOM size taken from @p encoding.
 *
 * @param filename   File that holds the data.
 * @param encoding   Encoding description; m_unicoding, m_codepage and m_bom are read.
 * @param bOverwrite Stored in m_bOverwriteSourceFile.
 */
71 void storageForPlugins::SetDataFileEncoding(const String& filename, FileTextEncoding encoding, bool bOverwrite /*= false*/)
73 m_filename = filename;
// Only unicodings other than NONE and UTF8 count as "Unicode" for the
// original/current flags.
76 if (encoding.m_unicoding != ucr::NONE && encoding.m_unicoding != ucr::UTF8)
77 m_bOriginalIsUnicode = m_bCurrentIsUnicode = true;
// NOTE(review): presumably the alternative branch of the test above
// (NONE or UTF8 => not Unicode) - confirm.
79 m_bOriginalIsUnicode = m_bCurrentIsUnicode = false;
80 m_bCurrentIsFile = true;
81 m_bOverwriteSourceFile = bOverwrite;
82 m_codepage = encoding.m_codepage;
// BOM size is 0 unless the encoding says the file carries a BOM.
83 m_nBomSize = encoding.m_bom ? ucr::getBomSize(encoding.m_unicoding) : 0;
/**
 * @brief Register a file as the data source when its encoding is not known.
 *
 * The encoding is obtained from GuessCodepageEncoding(filename, 1) and the
 * call is then forwarded to SetDataFileEncoding().
 *
 * @param filename   File that holds the data.
 * @param bOverwrite Passed on unchanged to SetDataFileEncoding().
 */
86 void storageForPlugins::SetDataFileUnknown(const String& filename, bool bOverwrite /*= false*/)
88 FileTextEncoding encoding = GuessCodepageEncoding(filename, 1);
89 SetDataFileEncoding(filename, encoding, bOverwrite);
/**
 * @brief Return the temporary destination file name, choosing one on first use.
 *
 * While m_tempFilenameDst is empty, a name is requested from
 * env::GetTemporaryFileName() for the directory env::GetTemporaryPath() with
 * the prefix "_WM". If m_tempFileExtensionDst is set, the file is renamed so
 * that this extension is appended to the name.
 *
 * @return m_tempFilenameDst.c_str(); the pointer refers to the member's storage.
 */
92 const TCHAR *storageForPlugins::GetDestFileName()
94 if (m_tempFilenameDst.empty())
96 m_tempFilenameDst = env::GetTemporaryFileName(env::GetTemporaryPath(), _T ("_WM"));
97 if (!m_tempFileExtensionDst.empty())
99 String tempFilenameDstNew = m_tempFilenameDst + m_tempFileExtensionDst;
// Rename on disk first; the member takes the new name on the next line.
102 TFile(m_tempFilenameDst).renameTo(tempFilenameDstNew);
103 m_tempFilenameDst = tempFilenameDstNew;
// Failures are reported through the log (presumably e is the caught
// Poco exception - confirm).
107 LogErrorStringUTF8(e.displayText());
111 return m_tempFilenameDst.c_str();
/**
 * @brief Accept or discard the file a plugin produced at the temporary destination.
 *
 * Compares the change counter m_nChanged with the last validated value
 * m_nChangedValid. If they are equal the temporary file is deleted. In the
 * other case the counter is recorded and the temporary file becomes the
 * current file: it either replaces m_filename on disk (m_bOverwriteSourceFile
 * set) or m_filename is pointed at it. Errors from the file operations are
 * written to the log with LogErrorStringUTF8().
 */
115 void storageForPlugins::ValidateNewFile()
117 // changed data are : file, nChanged
118 // nChanged passed as pointer so already upToDate
120 if (m_nChangedValid == m_nChanged)
122 // plugin succeeded, but nothing changed, just delete the new file
125 TFile(m_tempFilenameDst).remove();
129 LogErrorStringUTF8(e.displayText());
131 // we may reuse the temp filename
132 // tempFilenameDst.Empty();
// NOTE(review): the statements from here on presumably form the "changed"
// branch of the comparison above - confirm.
136 m_nChangedValid = m_nChanged;
137 if (m_bOverwriteSourceFile)
// Replace the source: delete it, then move the new file onto its name.
141 TFile(m_filename).remove();
142 TFile(m_tempFilenameDst).renameTo(m_filename);
146 LogErrorStringUTF8(e.displayText());
151 // do not delete the original file name
152 m_filename = m_tempFilenameDst;
153 // for next transformation, we may overwrite/delete the source file
154 m_bOverwriteSourceFile = true;
// The temporary name is forgotten here, so the next GetDestFileName() call
// sees an empty name.
156 m_tempFilenameDst.erase();
/**
 * @brief Accept the buffer contents a plugin produced.
 *
 * Only the validated change counter is brought up to date by copying
 * m_nChanged into m_nChangedValid.
 */
159 void storageForPlugins::ValidateNewBuffer()
161 // changed data are : buffer, nChanged
162 // passed as pointers so already upToDate
163 m_nChangedValid = m_nChanged;
166 ////////////////////////////////////////////////////////////////////////////////
/**
 * @brief Switch the object to a new storage form after an internal conversion.
 *
 * Adopts the freshly written temporary file when the new form is a file,
 * frees the previous in-memory buffer when it is no longer the current data,
 * and then records the new file/Unicode flags and the matching codepage.
 *
 * @param bNewIsFile    New value for m_bCurrentIsFile.
 * @param bNewIsUnicode New value for m_bCurrentIsUnicode.
 */
168 void storageForPlugins::ValidateInternal(bool bNewIsFile, bool bNewIsUnicode)
// A call that changes neither property is a programming error.
170 assert (m_bCurrentIsFile != bNewIsFile || m_bCurrentIsUnicode != bNewIsUnicode);
172 // if we create a file, we remove the remaining previous file
175 if (m_bOverwriteSourceFile)
// Replace the source: delete it, then move the new file onto its name.
179 TFile(m_filename).remove();
180 TFile(m_tempFilenameDst).renameTo(m_filename);
188 // do not delete the original file name
189 m_filename = m_tempFilenameDst;
190 // for next transformation, we may overwrite/delete the source file
191 m_bOverwriteSourceFile = true;
193 m_tempFilenameDst.erase();
196 // old memory structures are freed
197 if (!m_bCurrentIsFile)
198 // except if the old data have been in situ replaced by new ones
199 if (bNewIsFile || m_bCurrentIsUnicode != bNewIsUnicode)
// The old buffer is the BSTR when the old data was Unicode.
201 if (m_bCurrentIsUnicode)
203 SysFreeString(m_bstr);
// NOTE(review): presumably the non-Unicode alternative, which frees the
// byte array instead - confirm.
207 VariantClear(&m_array);
210 m_bCurrentIsUnicode = bNewIsUnicode;
211 m_bCurrentIsFile = bNewIsFile;
// NOTE(review): the two codepage assignments below are presumably selected
// by bNewIsUnicode (UCS-2 LE for Unicode, default codepage otherwise) - confirm.
214 m_codepage = ucr::CP_UCS2LE;
219 m_codepage = ucr::getDefaultCodepage();
/**
 * @brief Provide the current data as a UCS-2 LE file and return its path.
 *
 * If the data already is a Unicode file, m_filename is returned as is.
 * Otherwise the source bytes come from the memory-mapped file (after its BOM),
 * from m_bstr, or from the byte array in m_array. They are converted with
 * ucr::CrossConvert() from m_codepage to ucr::CP_UCS2LE into the temporary
 * destination file, after a UCS-2 LE BOM. The object state is then switched
 * with ValidateInternal(true, true).
 *
 * @return m_filename.c_str() once the data is a Unicode file.
 *         TODO confirm the value returned on the failure paths.
 */
224 const TCHAR *storageForPlugins::GetDataFileUnicode()
226 if (m_bCurrentIsFile && m_bCurrentIsUnicode)
227 return m_filename.c_str();
230 char * pchar = nullptr;
235 std::unique_ptr<SharedMemory> pshmIn;
237 if (m_bCurrentIsFile)
239 // Init filedata struct and open file as memory mapped (in file)
240 TFile fileIn(m_filename);
243 pshmIn.reset(new SharedMemory(fileIn, SharedMemory::AM_READ));
// Source text starts after the BOM and runs to the end of the mapping.
244 pchar = pshmIn->begin() + m_nBomSize;
245 nchars = static_cast<unsigned>(pshmIn->end() - pchar);
// NOTE(review): presumably decides whether a mapping failure is an error;
// an empty file or a device is apparently tolerated - confirm.
249 if (!fileIn.isDevice() && fileIn.getSize() > 0)
257 if (m_bCurrentIsUnicode)
// Unicode buffer: byte length is the BSTR length times sizeof(wchar_t).
259 pchar = (char *)m_bstr;
260 nchars = SysStringLen(m_bstr) * sizeof(wchar_t);
// Byte-array buffer: locks m_array; it is unlocked again further down.
264 pchar = (char *)GetVariantArrayData(m_array, nchars);
268 // Compute the dest size (in bytes)
269 int textForeseenSize = nchars * sizeof(wchar_t) + 6; // from unicoder.cpp maketstring
270 int textRealSize = textForeseenSize;
272 // Init filedata struct and open file as memory mapped (out file)
275 TFile fileOut(m_tempFilenameDst);
// The 2 extra bytes presumably leave room for the BOM written below.
276 fileOut.setSize(textForeseenSize + 2);
279 SharedMemory shmOut(fileOut, SharedMemory::AM_WRITE);
280 bom_bytes = ucr::writeBom(shmOut.begin(), ucr::UCS2LE);
281 // to UCS-2 conversion, from unicoder.cpp maketstring
283 textRealSize = ucr::CrossConvert(pchar, nchars, (char *)shmOut.begin()+bom_bytes, textForeseenSize-1, m_codepage, ucr::CP_UCS2LE, &lossy);
285 // size may have changed
286 fileOut.setSize(textRealSize + bom_bytes);
288 // Release pointers to source data
289 if (!m_bCurrentIsFile && !m_bCurrentIsUnicode)
290 SafeArrayUnaccessData(m_array.parray);
// An empty result from a non-empty size estimate presumably means the
// conversion failed; the output file is deleted and deletion errors ignored.
292 if ((textRealSize == 0) && (textForeseenSize > 0))
295 try { TFile(m_tempFilenameDst).remove(); } catch (...) {}
299 ValidateInternal(true, true);
300 return m_filename.c_str();
/**
 * @brief Provide the current data as a Unicode BSTR buffer (m_bstr).
 *
 * Nothing is converted when the data already is a Unicode buffer. Otherwise
 * the source bytes come from the memory-mapped file (after its BOM) or from
 * the byte array in m_array. They are converted with ucr::CrossConvert() from
 * m_codepage to ucr::CP_UCS2LE into a scratch buffer, and m_bstr is replaced
 * by a BSTR allocated from that buffer. The object state is then switched
 * with ValidateInternal(false, true).
 *
 * @return TODO confirm - presumably the address of m_bstr on success.
 */
309 BSTR * storageForPlugins::GetDataBufferUnicode()
311 if (!m_bCurrentIsFile && m_bCurrentIsUnicode)
320 std::unique_ptr<SharedMemory> pshmIn;
322 if (m_bCurrentIsFile)
324 // Init filedata struct and open file as memory mapped (in file)
325 TFile fileIn(m_filename);
328 pshmIn.reset(new SharedMemory(fileIn, SharedMemory::AM_READ));
// Source text starts after the BOM and runs to the end of the mapping.
330 pchar = pshmIn->begin() + m_nBomSize;
331 nchars = static_cast<unsigned>(pshmIn->end() - pchar);
// NOTE(review): presumably decides whether a mapping failure is an error;
// an empty file or a device is apparently tolerated - confirm.
335 if (!fileIn.isDevice() && fileIn.getSize() > 0)
// Byte-array buffer: locks m_array; it is unlocked again further down.
343 pchar = (char *)GetVariantArrayData(m_array, nchars);
346 // Compute the dest size (in bytes)
347 int textForeseenSize = nchars * sizeof(wchar_t) + 6; // from unicoder.cpp maketstring
348 int textRealSize = textForeseenSize;
350 // allocate the memory
// NOTE(review): textForeseenSize is a byte count but is used here as an
// element count, so the scratch buffer is sizeof(wchar_t) times larger than
// the estimate.
351 std::unique_ptr<wchar_t[]> tempBSTR(new wchar_t[textForeseenSize]);
354 wchar_t * pbstrBuffer = tempBSTR.get();
355 bool bAllocSuccess = (pbstrBuffer != nullptr);
358 // to UCS-2 conversion, from unicoder.cpp maketstring
360 textRealSize = ucr::CrossConvert(pchar, nchars, (char *)pbstrBuffer, textForeseenSize-1, m_codepage, ucr::CP_UCS2LE, &lossy);
// Replace the old BSTR; textRealSize is in bytes, SysAllocStringLen takes
// a character count.
361 SysFreeString(m_bstr);
362 m_bstr = SysAllocStringLen(tempBSTR.get(), textRealSize / sizeof(wchar_t));
363 if (m_bstr == nullptr)
364 bAllocSuccess = false;
367 // Release pointers to source data
368 if (!m_bCurrentIsFile && !m_bCurrentIsUnicode)
369 SafeArrayUnaccessData(m_array.parray);
374 ValidateInternal(false, true);
/**
 * @brief Provide the current data as a non-Unicode file and return its path.
 *
 * If the data already is a non-Unicode file, m_filename is returned as is.
 * Otherwise the source bytes come from the memory-mapped file (after its BOM),
 * from m_bstr, or from the byte array in m_array, and are written to the
 * temporary destination file. Unicode data is converted with
 * ucr::CrossConvert() from m_codepage to the default codepage; non-Unicode
 * data is copied unchanged. The object state is then switched with
 * ValidateInternal(true, false).
 *
 * @return m_filename.c_str() once the data is a non-Unicode file.
 *         TODO confirm the value returned on the failure paths.
 */
383 const TCHAR *storageForPlugins::GetDataFileAnsi()
385 if (m_bCurrentIsFile && !m_bCurrentIsUnicode)
386 return m_filename.c_str();
389 char * pchar = nullptr;
394 std::unique_ptr<SharedMemory> pshmIn;
396 if (m_bCurrentIsFile)
398 // Init filedata struct and open file as memory mapped (in file)
399 TFile fileIn(m_filename);
402 pshmIn.reset(new SharedMemory(fileIn, SharedMemory::AM_READ));
404 pchar = pshmIn->begin()+m_nBomSize; // pass the BOM
405 nchars = static_cast<unsigned>(pshmIn->end() - pchar);
// NOTE(review): presumably decides whether a mapping failure is an error;
// an empty file or a device is apparently tolerated - confirm.
409 if (!fileIn.isDevice() && fileIn.getSize() > 0)
417 if (m_bCurrentIsUnicode)
// Unicode buffer: byte length is the BSTR length times sizeof(wchar_t).
419 pchar = (char *)m_bstr;
420 nchars = SysStringLen(m_bstr) * sizeof(wchar_t);
// Byte-array buffer: locks m_array; it is unlocked again further down.
424 pchar = (char *)GetVariantArrayData(m_array, nchars);
428 // Compute the dest size (in bytes)
429 int textForeseenSize = nchars;
430 if (m_bCurrentIsUnicode)
431 textForeseenSize = nchars * 3; // from unicoder.cpp convertToBuffer
432 int textRealSize = textForeseenSize;
434 // Init filedata struct and open file as memory mapped (out file)
436 TFile fileOut(m_tempFilenameDst);
437 fileOut.setSize(textForeseenSize);
439 SharedMemory shmOut(fileOut, SharedMemory::AM_WRITE);
441 if (m_bCurrentIsUnicode)
443 // UCS-2 to Ansi conversion, from unicoder.cpp convertToBuffer
445 textRealSize = ucr::CrossConvert(pchar, nchars, (char *)shmOut.begin(), textForeseenSize, m_codepage, ucr::getDefaultCodepage(), &lossy);
// Non-Unicode source: the bytes are copied without conversion.
449 std::memcpy(shmOut.begin(), pchar, nchars);
452 // size may have changed
453 fileOut.setSize(textRealSize);
455 // Release pointers to source data
456 if (!m_bCurrentIsFile && !m_bCurrentIsUnicode)
457 SafeArrayUnaccessData(m_array.parray);
// An empty result from a non-empty size estimate presumably means the
// conversion failed; the output file is deleted and deletion errors ignored.
459 if ((textRealSize == 0) && (textForeseenSize > 0))
462 try { TFile(m_tempFilenameDst).remove(); } catch (...) {}
466 ValidateInternal(true, false);
467 return m_filename.c_str();
/**
 * @brief Provide the current data as a byte-array VARIANT buffer (m_array).
 *
 * Nothing is converted when the data already is a non-Unicode buffer.
 * Otherwise the source bytes come from the memory-mapped file (after its BOM)
 * or from m_bstr. A VT_UI1 SAFEARRAY of the estimated size is created in
 * m_array and filled. Unicode data is converted with ucr::CrossConvert() from
 * m_codepage to the default codepage; non-Unicode data is copied unchanged.
 * The array is then resized to the number of bytes actually produced and the
 * object state is switched with ValidateInternal(false, false).
 *
 * @return TODO confirm - presumably the address of m_array on success.
 */
476 VARIANT * storageForPlugins::GetDataBufferAnsi()
478 if (!m_bCurrentIsFile && !m_bCurrentIsUnicode)
487 std::unique_ptr<SharedMemory> pshmIn;
489 if (m_bCurrentIsFile)
491 // Init filedata struct and open file as memory mapped (in file)
492 TFile fileIn(m_filename);
493 pshmIn.reset(new SharedMemory(fileIn, SharedMemory::AM_READ));
// Source text starts after the BOM and runs to the end of the mapping.
495 pchar = pshmIn->begin() + m_nBomSize;
496 nchars = static_cast<unsigned>(pshmIn->end() - pchar);
// Unicode buffer: byte length is the BSTR length times sizeof(wchar_t).
500 pchar = (char *)m_bstr;
501 nchars = SysStringLen(m_bstr) * sizeof(wchar_t);
504 // Compute the dest size (in bytes)
505 int textForeseenSize = nchars;
506 if (m_bCurrentIsUnicode)
507 textForeseenSize = nchars * 3; // from unicoder.cpp convertToBuffer
508 int textRealSize = textForeseenSize;
510 // allocate the memory
// One dimension with textForeseenSize elements and lower bound 0.
511 SAFEARRAYBOUND rgsabound = {static_cast<ULONG>(textForeseenSize), 0};
512 m_array.vt = VT_UI1 | VT_ARRAY;
513 m_array.parray = SafeArrayCreate(VT_UI1, 1, &rgsabound);
515 SafeArrayAccessData(m_array.parray, (void**)&parrayData);
518 if (m_bCurrentIsUnicode)
520 // to Ansi conversion, from unicoder.cpp convertToBuffer
522 textRealSize = ucr::CrossConvert(pchar, nchars, (char *)parrayData, textForeseenSize, m_codepage, ucr::getDefaultCodepage(), &lossy);
// Non-Unicode source: the bytes are copied without conversion.
526 std::memcpy(parrayData, pchar, nchars);
528 // size may have changed
// The array is unlocked before SafeArrayRedim() is called on it.
529 SafeArrayUnaccessData(m_array.parray);
530 SAFEARRAYBOUND rgsaboundnew = {static_cast<ULONG>(textRealSize), 0};
531 SafeArrayRedim(m_array.parray, &rgsaboundnew);
533 ValidateInternal(false, false);
542 template<typename T, bool flipbytes>
543 inline const T *findNextLine(const T *pstart, const T *pend)
545 for (const T *p = pstart; p < pend; ++p)
547 int ch = flipbytes ? ByteOrder::flipBytes(*p) : *p;
552 if (p + 1 < pend && *(p + 1) == (flipbytes ? ByteOrder::flipBytes('\n') : '\n'))
/**
 * @brief Byte-pointer front end for the findNextLine template.
 *
 * Runs the template on 16-bit units (as stored or byte-swapped) or on single
 * bytes, and casts the result back to a byte pointer.
 * NOTE(review): presumably selected by @p unicoding as UCS-2 LE, UCS-2 BE and
 * everything else respectively - confirm.
 *
 * @param unicoding Encoding family of the buffer.
 * @param pstart    First byte to examine.
 * @param pend      One past the last byte.
 * @return The template's result as a const char pointer.
 */
561 static const char *findNextLine(ucr::UNICODESET unicoding, const char *pstart, const char *pend)
566 return (const char *)findNextLine<unsigned short, false>((const unsigned short *)pstart, (const unsigned short *)pend);
568 return (const char *)findNextLine<unsigned short, true>((const unsigned short *)pstart, (const unsigned short *)pend);
570 return findNextLine<char, false>(pstart, pend);
574 bool AnyCodepageToUTF8(int codepage, const String& filepath, const String& filepathDst, int & nFileChanged, bool bWriteBOM)
577 if (!ufile.OpenReadOnly(filepath))
580 ucr::UNICODESET unicoding = ufile.GetUnicoding();
581 // Finished with examining file contents
584 TFile fileIn(filepath);
587 // Init filedataIn struct and open file as memory mapped (input)
588 SharedMemory shmIn(fileIn, SharedMemory::AM_READ);
590 IExconverter *pexconv = Exconverter::getInstance();
592 char * pszBuf = shmIn.begin();
593 size_t nBufSize = shmIn.end() - shmIn.begin();
594 size_t nSizeOldBOM = 0;
606 const size_t minbufsize = 128 * 1024;
608 // create the destination file
609 FileOutputStream fout(ucr::toUTF8(filepathDst), std::ios::out|std::ios::binary|std::ios::trunc);
610 Buffer<char> obuf(minbufsize);
611 int64_t pos = nSizeOldBOM;
617 fout.write(bom, ucr::writeBom(bom, ucr::UTF8));
623 size_t srcbytes = findNextLine(unicoding, pszBuf + pos + minbufsize, pszBuf + nBufSize) - (pszBuf + pos);
626 if (srcbytes * 3 > obuf.size())
627 obuf.resize(srcbytes * 3 * 2, false);
628 size_t destbytes = obuf.size();
629 if (pexconv != nullptr)
631 size_t srcbytes2 = srcbytes;
632 if (!pexconv->convert(codepage, ucr::CP_UTF_8, (const unsigned char *)pszBuf+pos, &srcbytes2, (unsigned char *)obuf.begin(), &destbytes))
633 throw "failed to convert file contents to utf-8";
638 destbytes = ucr::CrossConvert((const char *)pszBuf+pos, static_cast<unsigned>(srcbytes), obuf.begin(), static_cast<unsigned>(destbytes), codepage, ucr::CP_UTF_8, &lossy);
640 fout.write(obuf.begin(), destbytes);
649 if (fileIn.getSize() == 0)