OSDN Git Service

ce4befa281080e313041f3adbe166c4f2430c199
[winmerge-jp/winmerge-jp.git] / Src / Common / multiformatText.cpp
1 /////////////////////////////////////////////////////////////////////////////
2 //    WinMerge:  an interactive diff/merge utility
3 //    Copyright (C) 1997-2000  Thingamahoochie Software
4 //    Author: Dean Grimm
5 //    SPDX-License-Identifier: GPL-2.0-or-later
6 /////////////////////////////////////////////////////////////////////////////
7 /**
8  * @file multiformatText.cpp
9  *
10  * @brief Implementation of class storageForPlugins
11  *
12  * @date  Created: 2003-11-24
13  */ 
14
15 #include "pch.h"
16 #define NOMINMAX
17 #include "multiformatText.h"
18 #include <vector>
19 #include <algorithm>
20 #include <cstring>
21 #include <cassert>
22 #include <memory>
23 #include <Poco/SharedMemory.h>
24 #include <Poco/FileStream.h>
25 #include <Poco/ByteOrder.h>
26 #include <Poco/Buffer.h>
27 #include <Poco/Exception.h>
28 #include "unicoder.h"
29 #include "ExConverter.h"
30 #include "paths.h"
31 #include "UniFile.h"
32 #include "codepage_detect.h"
33 #include "Environment.h"
34 #include "TFile.h"
35 #include "MergeApp.h"
36
37 using Poco::SharedMemory;
38 using Poco::FileOutputStream;
39 using Poco::ByteOrder;
40 using Poco::Exception;
41 using Poco::Buffer;
42
43 ////////////////////////////////////////////////////////////////////////////////
44
45 static void *GetVariantArrayData(VARIANT& array, unsigned& size)
46 {
47         char * parrayData;
48         SafeArrayAccessData(array.parray, (void**)&parrayData);
49         LONG ubound, lbound;
50         SafeArrayGetLBound(array.parray, 1, &lbound);
51         SafeArrayGetUBound(array.parray, 1, &ubound);
52         size = ubound - lbound;
53         return parrayData;
54 }
55
56 void storageForPlugins::Initialize()
57 {
58         SysFreeString(m_bstr);
59         m_bstr = nullptr;
60         VariantClear(&m_array);
61         m_tempFilenameDst.clear();
62 }
63
64 void storageForPlugins::SetDataFileAnsi(const String& filename, bool bOverwrite /*= false*/) 
65 {
66         FileTextEncoding encoding;
67         encoding.SetUnicoding(ucr::NONE);
68         encoding.SetCodepage(ucr::getDefaultCodepage());
69         SetDataFileEncoding(filename, encoding, bOverwrite); 
70 }
71 void storageForPlugins::SetDataFileEncoding(const String& filename, FileTextEncoding encoding, bool bOverwrite /*= false*/)
72 {
73         m_filename = filename;
74         m_nChangedValid = 0;
75         m_nChanged = 0;
76         if (encoding.m_unicoding != ucr::NONE && encoding.m_unicoding != ucr::UTF8)
77                 m_bOriginalIsUnicode = m_bCurrentIsUnicode = true;
78         else
79                 m_bOriginalIsUnicode = m_bCurrentIsUnicode = false;
80         m_bCurrentIsFile = true;
81         m_bOverwriteSourceFile = bOverwrite;
82         m_codepage = encoding.m_codepage;
83         m_nBomSize = encoding.m_bom ? ucr::getBomSize(encoding.m_unicoding) : 0;
84         Initialize();
85 }
86 void storageForPlugins::SetDataFileUnknown(const String& filename, bool bOverwrite /*= false*/) 
87 {
88         FileTextEncoding encoding = GuessCodepageEncoding(filename, 1);
89         SetDataFileEncoding(filename, encoding, bOverwrite);
90 }
91
92 const TCHAR *storageForPlugins::GetDestFileName()
93 {
94         if (m_tempFilenameDst.empty())
95         {
96                 m_tempFilenameDst = env::GetTemporaryFileName(env::GetTemporaryPath(), _T ("_WM"));
97                 if (!m_tempFileExtensionDst.empty())
98                 {
99                         String tempFilenameDstNew = m_tempFilenameDst + m_tempFileExtensionDst;
100                         try
101                         {
102                                 TFile(m_tempFilenameDst).renameTo(tempFilenameDstNew);
103                                 m_tempFilenameDst = tempFilenameDstNew;
104                         }
105                         catch (Exception& e)
106                         {
107                                 LogErrorStringUTF8(e.displayText());
108                         }
109                 }
110         }
111         return m_tempFilenameDst.c_str();
112 }
113
114
115 void storageForPlugins::ValidateNewFile()
116 {
117         // changed data are : file, nChanged
118         // nChanged passed as pointer so already upToDate
119         // now update file
120         if (m_nChangedValid == m_nChanged)
121         {
122                 // plugin succeeded, but nothing changed, just delete the new file
123                 try
124                 {
125                         TFile(m_tempFilenameDst).remove();
126                 }
127                 catch (Exception& e)
128                 {
129                         LogErrorStringUTF8(e.displayText());
130                 }
131                 // we may reuse the temp filename
132                 // tempFilenameDst.Empty();
133         }
134         else
135         {
136                 m_nChangedValid = m_nChanged;
137                 if (m_bOverwriteSourceFile)
138                 {
139                         try
140                         {
141                                 TFile(m_filename).remove();
142                                 TFile(m_tempFilenameDst).renameTo(m_filename);
143                         }
144                         catch (Exception& e)
145                         {
146                                 LogErrorStringUTF8(e.displayText());
147                         }
148                 }
149                 else
150                 {
151                         // do not delete the original file name
152                         m_filename = m_tempFilenameDst;
153                         // for next transformation, we may overwrite/delete the source file
154                         m_bOverwriteSourceFile = true;
155                 }
156                 m_tempFilenameDst.erase();
157         }
158 }
159 void storageForPlugins::ValidateNewBuffer()
160 {
161         // changed data are : buffer, nChanged
162         // passed as pointers so already upToDate
163         m_nChangedValid = m_nChanged;
164 }
165
166 ////////////////////////////////////////////////////////////////////////////////
167
168 void storageForPlugins::ValidateInternal(bool bNewIsFile, bool bNewIsUnicode)
169 {
170         assert (m_bCurrentIsFile != bNewIsFile || m_bCurrentIsUnicode != bNewIsUnicode);
171
172         // if we create a file, we remove the remaining previous file 
173         if (bNewIsFile)
174         {
175                 if (m_bOverwriteSourceFile)
176                 {
177                         try
178                         {
179                                 TFile(m_filename).remove();
180                                 TFile(m_tempFilenameDst).renameTo(m_filename);
181                         }
182                         catch (...)
183                         {
184                         }
185                 }
186                 else
187                 {
188                         // do not delete the original file name
189                         m_filename = m_tempFilenameDst;
190                         // for next transformation, we may overwrite/delete the source file
191                         m_bOverwriteSourceFile = true;
192                 }
193                 m_tempFilenameDst.erase();
194         }
195
196         // old memory structures are freed
197         if (!m_bCurrentIsFile)
198                 // except if the old data have been in situ replaced by new ones
199                 if (bNewIsFile || m_bCurrentIsUnicode != bNewIsUnicode)
200                 {
201                         if (m_bCurrentIsUnicode)
202                         {
203                                 SysFreeString(m_bstr);
204                                 m_bstr = nullptr;
205                         }
206                         else
207                                 VariantClear(&m_array);
208                 }
209
210         m_bCurrentIsUnicode = bNewIsUnicode;
211         m_bCurrentIsFile = bNewIsFile;
212         if (bNewIsUnicode)
213         {
214                 m_codepage = ucr::CP_UCS2LE;
215                 m_nBomSize = 2; 
216         }
217         else
218         {
219                 m_codepage = ucr::getDefaultCodepage();
220                 m_nBomSize = 0;
221         }
222 }
223
224 const TCHAR *storageForPlugins::GetDataFileUnicode()
225 {
226         if (m_bCurrentIsFile && m_bCurrentIsUnicode)
227                 return m_filename.c_str();
228
229         unsigned nchars;
230         char * pchar = nullptr;
231
232         try
233         {
234                 {
235                         std::unique_ptr<SharedMemory> pshmIn;
236                         // Get source data
237                         if (m_bCurrentIsFile)
238                         {
239                                 // Init filedata struct and open file as memory mapped (in file)
240                                 TFile fileIn(m_filename);
241                                 try
242                                 {
243                                         pshmIn.reset(new SharedMemory(fileIn, SharedMemory::AM_READ));
244                                         pchar = pshmIn->begin() + m_nBomSize;
245                                         nchars = static_cast<unsigned>(pshmIn->end() - pchar);
246                                 }
247                                 catch (...)
248                                 {
249                                         if (!fileIn.isDevice() && fileIn.getSize() > 0)
250                                                 return nullptr;
251                                         pchar = "";
252                                         nchars = 0;
253                                 }                       
254                         }
255                         else
256                         {
257                                 if (m_bCurrentIsUnicode)
258                                 {
259                                         pchar = (char *)m_bstr;
260                                         nchars = SysStringLen(m_bstr) * sizeof(wchar_t);
261                                 }
262                                 else
263                                 {
264                                         pchar = (char *)GetVariantArrayData(m_array, nchars);
265                                 }
266                         }
267
268                         // Compute the dest size (in bytes)
269                         int textForeseenSize = nchars * sizeof(wchar_t) + 6; // from unicoder.cpp maketstring
270                         int textRealSize = textForeseenSize;
271
272                         // Init filedata struct and open file as memory mapped (out file)
273                         GetDestFileName();
274
275                         TFile fileOut(m_tempFilenameDst);
276                         fileOut.setSize(textForeseenSize + 2);
277                         int bom_bytes = 0;
278                         {
279                                 SharedMemory shmOut(fileOut, SharedMemory::AM_WRITE);
280                                 bom_bytes = ucr::writeBom(shmOut.begin(), ucr::UCS2LE);
281                                 // to UCS-2 conversion, from unicoder.cpp maketstring
282                                 bool lossy;
283                                 textRealSize = ucr::CrossConvert(pchar, nchars, (char *)shmOut.begin()+bom_bytes, textForeseenSize-1, m_codepage, ucr::CP_UCS2LE, &lossy);
284                         }
285                         // size may have changed
286                         fileOut.setSize(textRealSize + bom_bytes);
287
288                         // Release pointers to source data
289                         if (!m_bCurrentIsFile && !m_bCurrentIsUnicode)
290                                 SafeArrayUnaccessData(m_array.parray);
291
292                         if ((textRealSize == 0) && (textForeseenSize > 0))
293                         {
294                                 // conversion error
295                                 try { TFile(m_tempFilenameDst).remove(); } catch (...) {}
296                                 return nullptr;
297                         }
298                 }
299                 ValidateInternal(true, true);
300                 return m_filename.c_str();
301         }
302         catch (...)
303         {
304                 return nullptr;
305         }
306 }
307
308
309 BSTR * storageForPlugins::GetDataBufferUnicode()
310 {
311         if (!m_bCurrentIsFile && m_bCurrentIsUnicode)
312                 return &m_bstr;
313
314         unsigned nchars;
315         char * pchar;
316
317         try
318         {
319                 {
320                         std::unique_ptr<SharedMemory> pshmIn;
321                         // Get source data
322                         if (m_bCurrentIsFile) 
323                         {
324                                 // Init filedata struct and open file as memory mapped (in file)
325                                 TFile fileIn(m_filename);
326                                 try
327                                 {
328                                         pshmIn.reset(new SharedMemory(fileIn, SharedMemory::AM_READ));
329
330                                         pchar = pshmIn->begin() + m_nBomSize;
331                                         nchars = static_cast<unsigned>(pshmIn->end() - pchar);
332                                 }
333                                 catch (...)
334                                 {
335                                         if (!fileIn.isDevice() && fileIn.getSize() > 0)
336                                                 return nullptr;
337                                         pchar = "";
338                                         nchars = 0;
339                                 }                       
340                         }
341                         else
342                         {
343                                 pchar = (char *)GetVariantArrayData(m_array, nchars);
344                         }
345
346                         // Compute the dest size (in bytes)
347                         int textForeseenSize = nchars * sizeof(wchar_t) + 6; // from unicoder.cpp maketstring
348                         int textRealSize = textForeseenSize;
349
350                         // allocate the memory
351                         std::unique_ptr<wchar_t[]> tempBSTR(new wchar_t[textForeseenSize]);
352
353                         // fill in the data
354                         wchar_t * pbstrBuffer = tempBSTR.get();
355                         bool bAllocSuccess = (pbstrBuffer != nullptr);
356                         if (bAllocSuccess)
357                         {
358                                 // to UCS-2 conversion, from unicoder.cpp maketstring
359                                 bool lossy;
360                                 textRealSize = ucr::CrossConvert(pchar, nchars, (char *)pbstrBuffer, textForeseenSize-1, m_codepage, ucr::CP_UCS2LE, &lossy);
361                                 SysFreeString(m_bstr);
362                                 m_bstr = SysAllocStringLen(tempBSTR.get(), textRealSize / sizeof(wchar_t));
363                                 if (m_bstr == nullptr)
364                                         bAllocSuccess = false;
365                         }
366
367                         // Release pointers to source data
368                         if (!m_bCurrentIsFile && !m_bCurrentIsUnicode)
369                                 SafeArrayUnaccessData(m_array.parray);
370
371                         if (!bAllocSuccess)
372                                 return nullptr;
373                 }
374                 ValidateInternal(false, true);
375                 return &m_bstr;
376         }
377         catch (...)
378         {
379                 return nullptr;
380         }
381 }
382
383 const TCHAR *storageForPlugins::GetDataFileAnsi()
384 {
385         if (m_bCurrentIsFile && !m_bCurrentIsUnicode)
386                 return m_filename.c_str();
387
388         unsigned nchars;
389         char * pchar = nullptr;
390
391         try
392         {
393                 {
394                         std::unique_ptr<SharedMemory> pshmIn;
395                         // Get source data
396                         if (m_bCurrentIsFile)
397                         {
398                                 // Init filedata struct and open file as memory mapped (in file)
399                                 TFile fileIn(m_filename);
400                                 try
401                                 {
402                                         pshmIn.reset(new SharedMemory(fileIn, SharedMemory::AM_READ));
403
404                                         pchar = pshmIn->begin()+m_nBomSize; // pass the BOM
405                                         nchars = static_cast<unsigned>(pshmIn->end() - pchar);
406                                 }
407                                 catch (...)
408                                 {
409                                         if (!fileIn.isDevice() && fileIn.getSize() > 0)
410                                                 return nullptr;
411                                         pchar = "";
412                                         nchars = 0;
413                                 }
414                         }
415                         else 
416                         {
417                                 if (m_bCurrentIsUnicode)
418                                 {
419                                         pchar  = (char *)m_bstr;
420                                         nchars = SysStringLen(m_bstr) * sizeof(wchar_t);
421                                 }
422                                 else
423                                 {
424                                         pchar = (char *)GetVariantArrayData(m_array, nchars);
425                                 }
426                         }
427
428                         // Compute the dest size (in bytes)
429                         int textForeseenSize = nchars; 
430                         if (m_bCurrentIsUnicode)
431                                 textForeseenSize = nchars * 3; // from unicoder.cpp convertToBuffer
432                         int textRealSize = textForeseenSize;
433
434                         // Init filedata struct and open file as memory mapped (out file)
435                         GetDestFileName();
436                         TFile fileOut(m_tempFilenameDst);
437                         fileOut.setSize(textForeseenSize);
438                         {
439                                 SharedMemory shmOut(fileOut, SharedMemory::AM_WRITE);
440
441                                 if (m_bCurrentIsUnicode)
442                                 {
443                                         // UCS-2 to Ansi conversion, from unicoder.cpp convertToBuffer
444                                         bool lossy;
445                                         textRealSize = ucr::CrossConvert(pchar, nchars, (char *)shmOut.begin(), textForeseenSize, m_codepage, ucr::getDefaultCodepage(), &lossy);
446                                 }
447                                 else
448                                 {
449                                         std::memcpy(shmOut.begin(), pchar, nchars);
450                                 }
451                         }
452                         // size may have changed
453                         fileOut.setSize(textRealSize);
454
455                         // Release pointers to source data
456                         if (!m_bCurrentIsFile && !m_bCurrentIsUnicode)
457                                 SafeArrayUnaccessData(m_array.parray);
458
459                         if ((textRealSize == 0) && (textForeseenSize > 0))
460                         {
461                                 // conversion error
462                                 try { TFile(m_tempFilenameDst).remove(); } catch (...) {}
463                                 return nullptr;
464                         }
465                 }
466                 ValidateInternal(true, false);
467                 return m_filename.c_str();
468         }
469         catch (...)
470         {
471                 return nullptr;
472         }
473 }
474
475
476 VARIANT * storageForPlugins::GetDataBufferAnsi()
477 {
478         if (!m_bCurrentIsFile && !m_bCurrentIsUnicode)
479                 return &m_array;
480
481         unsigned nchars;
482         char * pchar;
483
484         try
485         {
486                 {
487                         std::unique_ptr<SharedMemory> pshmIn;
488                         // Get source data
489                         if (m_bCurrentIsFile) 
490                         {
491                                 // Init filedata struct and open file as memory mapped (in file)
492                                 TFile fileIn(m_filename);
493                                 pshmIn.reset(new SharedMemory(fileIn, SharedMemory::AM_READ));
494
495                                 pchar = pshmIn->begin() + m_nBomSize;
496                                 nchars = static_cast<unsigned>(pshmIn->end() - pchar);
497                         }
498                         else
499                         {
500                                 pchar  = (char *)m_bstr;
501                                 nchars = SysStringLen(m_bstr) * sizeof(wchar_t);
502                         }
503
504                         // Compute the dest size (in bytes)
505                         int textForeseenSize = nchars; 
506                         if (m_bCurrentIsUnicode)
507                                 textForeseenSize = nchars * 3; // from unicoder.cpp convertToBuffer
508                         int textRealSize = textForeseenSize;
509
510                         // allocate the memory
511                         SAFEARRAYBOUND rgsabound = {static_cast<ULONG>(textForeseenSize), 0};
512                         m_array.vt = VT_UI1 | VT_ARRAY;
513                         m_array.parray = SafeArrayCreate(VT_UI1, 1, &rgsabound);
514                         char * parrayData;
515                         SafeArrayAccessData(m_array.parray, (void**)&parrayData);
516
517                         // fill in the data
518                         if (m_bCurrentIsUnicode)
519                         {
520                                 // to Ansi conversion, from unicoder.cpp convertToBuffer
521                                 bool lossy;
522                                 textRealSize = ucr::CrossConvert(pchar, nchars, (char *)parrayData, textForeseenSize, m_codepage, ucr::getDefaultCodepage(), &lossy);
523                         }
524                         else
525                         {
526                                 std::memcpy(parrayData, pchar, nchars);
527                         }
528                         // size may have changed
529                         SafeArrayUnaccessData(m_array.parray);
530                         SAFEARRAYBOUND rgsaboundnew = {static_cast<ULONG>(textRealSize), 0};
531                         SafeArrayRedim(m_array.parray, &rgsaboundnew);
532                 }
533                 ValidateInternal(false, false);
534                 return &m_array;
535         }
536         catch (...)
537         {
538                 return nullptr;
539         }
540 }
541
/**
 * @brief Find the start of the line following the first line break in a range.
 *
 * LF, CR and CR LF are recognised as line breaks; CR LF counts as a single
 * break.
 *
 * @tparam T Code unit type of the text.
 * @tparam flipbytes true when the bytes of every code unit have to be
 *  reversed before the unit is compared.
 * @param [in] pstart First code unit to examine.
 * @param [in] pend One past the last code unit to examine.
 * @return Pointer just past the first line break, or pend if there is none
 *  (also when pstart is not below pend).
 */
template<typename T, bool flipbytes>
inline const T *findNextLine(const T *pstart, const T *pend)
{
        // Decode one code unit. The swap works on exactly sizeof(T) bytes, so the
        // current unit and the look-ahead unit are always decoded the same way.
        const auto decode = [](T unit) -> int
        {
                if (flipbytes)
                {
                        unsigned char bytes[sizeof(T)];
                        std::memcpy(bytes, &unit, sizeof(T));
                        std::reverse(bytes, bytes + sizeof(T));
                        std::memcpy(&unit, bytes, sizeof(T));
                }
                return unit;
        };

        for (const T *p = pstart; p < pend; ++p)
        {
                const int ch = decode(*p);
                if (ch == '\n')
                        return p + 1;
                if (ch == '\r')
                {
                        // Swallow the LF of a CR LF pair so the pair is never split
                        if (p + 1 < pend && decode(*(p + 1)) == '\n')
                                return p + 2;
                        return p + 1;
                }
        }
        return pend;
}
560
561 static const char *findNextLine(ucr::UNICODESET unicoding, const char *pstart, const char *pend)
562 {
563         switch (unicoding)
564         {
565         case ucr::UCS2LE:
566                 return (const char *)findNextLine<unsigned short, false>((const unsigned short *)pstart, (const unsigned short *)pend);
567         case ucr::UCS2BE:
568                 return (const char *)findNextLine<unsigned short, true>((const unsigned short *)pstart, (const unsigned short *)pend);
569         default:
570                 return findNextLine<char, false>(pstart, pend);
571         }
572 }
573
574 bool AnyCodepageToUTF8(int codepage, const String& filepath, const String& filepathDst, int & nFileChanged, bool bWriteBOM)
575 {
576         UniMemFile ufile;
577         if (!ufile.OpenReadOnly(filepath))
578                 return true;
579         ufile.ReadBom();
580         ucr::UNICODESET unicoding = ufile.GetUnicoding();
581         // Finished with examing file contents
582         ufile.Close();
583
584         TFile fileIn(filepath);
585         try
586         {
587                 // Init filedataIn struct and open file as memory mapped (input)
588                 SharedMemory shmIn(fileIn, SharedMemory::AM_READ);
589
590                 IExconverter *pexconv = Exconverter::getInstance();
591
592                 char * pszBuf = shmIn.begin();
593                 size_t nBufSize = shmIn.end() - shmIn.begin();
594                 size_t nSizeOldBOM = 0;
595                 switch (unicoding)
596                 {
597                 case ucr::UTF8:
598                         nSizeOldBOM = 3;
599                         break;
600                 case ucr::UCS2LE:
601                 case ucr::UCS2BE:
602                         nSizeOldBOM = 2;
603                         break;
604                 }
605
606                 const size_t minbufsize = 128 * 1024;
607
608                 // create the destination file
609                 FileOutputStream fout(ucr::toUTF8(filepathDst), std::ios::out|std::ios::binary|std::ios::trunc);
610                 Buffer<char> obuf(minbufsize);
611                 int64_t pos = nSizeOldBOM;
612
613                 // write BOM
614                 if (bWriteBOM)
615                 {
616                         char bom[4];
617                         fout.write(bom, ucr::writeBom(bom, ucr::UTF8));
618                 }
619
620                 // write data
621                 for (;;)
622                 {
623                         size_t srcbytes = findNextLine(unicoding, pszBuf + pos + minbufsize, pszBuf + nBufSize) - (pszBuf + pos);
624                         if (srcbytes == 0)
625                                 break;
626                         if (srcbytes * 3 > obuf.size())
627                                 obuf.resize(srcbytes * 3 * 2, false);
628                         size_t destbytes = obuf.size();
629                         if (pexconv != nullptr)
630                         {
631                                 size_t srcbytes2 = srcbytes;
632                                 if (!pexconv->convert(codepage, ucr::CP_UTF_8, (const unsigned char *)pszBuf+pos, &srcbytes2, (unsigned char *)obuf.begin(), &destbytes))
633                                         throw "failed to convert file contents to utf-8";
634                         }
635                         else
636                         {
637                                 bool lossy = false;
638                                 destbytes = ucr::CrossConvert((const char *)pszBuf+pos, static_cast<unsigned>(srcbytes), obuf.begin(), static_cast<unsigned>(destbytes), codepage, ucr::CP_UTF_8, &lossy);
639                         }
640                         fout.write(obuf.begin(), destbytes);
641                         pos += srcbytes;
642                 }
643
644                 nFileChanged ++;
645                 return true;
646         }
647         catch (...)
648         {
649                 if (fileIn.getSize() == 0)
650                         return true;
651                 return false;
652         }
653 }
654