OSDN Git Service

stringdiffs.cpp: If the word-by-word diff process takes too long, interrupt it.
[winmerge-jp/winmerge-jp.git] / Src / stringdiffs.cpp
1 /** 
2  * @file  stringdiffs.cpp
3  *
4  * @brief Implementation file for ComputeWordDiffs (q.v.)
5  *
6  */
7
8 #include "pch.h"
9 #include "stringdiffs.h"
10 #define NOMINMAX
11 #include <cassert>
12 #include <chrono>
13 #include "CompareOptions.h"
14 #include "stringdiffsi.h"
15 #include "Diff3.h"
16
17 using std::vector;
18
19 namespace strdiff
20 {
21
22 static bool Initialized;
23 static bool CustomChars;
24 static TCHAR *BreakChars;
25 static TCHAR BreakCharDefaults[] = _T(",.;:");
26 static int TimeoutMilliSeconds = 500;
27
28 static bool isSafeWhitespace(TCHAR ch);
29 static bool isWordBreak(int breakType, const TCHAR *str, int index);
30
31 void Init()
32 {
33         BreakChars = &BreakCharDefaults[0];
34         Initialized = true;
35 }
36
37 void Close()
38 {
39         if (CustomChars)
40         {
41                 free(BreakChars);
42                 BreakChars = nullptr;
43                 CustomChars = false;
44         }
45         Initialized = false;
46 }
47
48 void SetBreakChars(const TCHAR *breakChars)
49 {
50         assert(Initialized);
51
52         if (CustomChars)
53                 free(BreakChars);
54
55         CustomChars = true;
56         BreakChars = _tcsdup(breakChars);
57 }
58
59 std::vector<wdiff>
60 ComputeWordDiffs(const String& str1, const String& str2,
61         bool case_sensitive, bool eol_sensitive, int whitespace, int breakType, bool byte_level)
62 {
63         String strs[3] = {str1, str2, _T("")};
64         return ComputeWordDiffs(2, strs, case_sensitive, eol_sensitive, whitespace, breakType, byte_level);
65 }
66
67 struct Comp02Functor
68 {
69         Comp02Functor(const String *strs, bool case_sensitive) : 
70                 strs_(strs), case_sensitive_(case_sensitive)
71         {
72         }
73         bool operator()(const wdiff &wd3)
74         {
75                 size_t wlen0 = wd3.end[0] - wd3.begin[0] + 1;
76                 size_t wlen2 = wd3.end[2] - wd3.begin[2] + 1;
77                 if (wlen0 != wlen2)
78                         return false;
79                 if (case_sensitive_)
80                 {
81                         if (memcmp(&strs_[0][wd3.begin[0]], &strs_[2][wd3.begin[2]], wlen0 * sizeof(TCHAR)) != 0)
82                                 return false;
83                 }
84                 else
85                 {
86                         if (_tcsnicmp(&strs_[0][wd3.begin[0]], &strs_[2][wd3.begin[2]], wlen0) != 0)
87                                 return false;
88                 }
89                 return true;
90         }
91         const String *strs_;
92         bool case_sensitive_;
93 };
94
95 /**
96  * @brief Construct our worker object and tell it to do the work
97  */
98 std::vector<wdiff>
99 ComputeWordDiffs(int nFiles, const String str[3],
100         bool case_sensitive, bool eol_sensitive, int whitespace, int breakType, bool byte_level)
101 {
102         std::vector<wdiff> diffs;
103         if (nFiles == 2)
104         {
105                 stringdiffs sdiffs(str[0], str[1], case_sensitive, eol_sensitive, whitespace, breakType, &diffs);
106                 // Hash all words in both lines and then compare them word by word
107                 // storing differences into m_wdiffs
108                 sdiffs.BuildWordDiffList();
109
110                 if (byte_level)
111                         sdiffs.wordLevelToByteLevel();
112
113                 // Now copy m_wdiffs into caller-supplied m_pDiffs (coalescing adjacents if possible)
114                 sdiffs.PopulateDiffs();
115
116         }
117         else
118         {
119                 if (str[0].empty())
120                 {
121                         stringdiffs sdiffs(str[1], str[2], case_sensitive, eol_sensitive, whitespace, breakType, &diffs);
122                         sdiffs.BuildWordDiffList();
123                         if (byte_level)
124                                 sdiffs.wordLevelToByteLevel();
125                         sdiffs.PopulateDiffs();
126                         for (size_t i = 0; i < diffs.size(); i++)
127                         {
128                                 wdiff& diff = diffs[i];
129                                 diff.begin[2] = diff.begin[1];
130                                 diff.begin[1] = diff.begin[0];
131                                 diff.begin[0] = 0;
132                                 diff.end[2] = diff.end[1];
133                                 diff.end[1] = diff.end[0];
134                                 diff.end[0] = -1;
135                         }
136                 }
137                 else if (str[1].empty())
138                 {
139                         stringdiffs sdiffs(str[0], str[2], case_sensitive, eol_sensitive, whitespace, breakType, &diffs);
140                         sdiffs.BuildWordDiffList();
141                         if (byte_level)
142                                 sdiffs.wordLevelToByteLevel();
143                         sdiffs.PopulateDiffs();
144                         for (size_t i = 0; i < diffs.size(); i++)
145                         {
146                                 wdiff& diff = diffs[i];
147                                 diff.begin[2] = diff.begin[1];
148                                 //diff.begin[0] = diff.begin[0];
149                                 diff.begin[1] = 0;
150                                 diff.end[2] = diff.end[1];
151                                 //diff.end[0] = diff.end[0];
152                                 diff.end[1] = -1;
153                         }
154                 }
155                 else if (str[2].empty())
156                 {
157                         stringdiffs sdiffs(str[0], str[1], case_sensitive, eol_sensitive, whitespace, breakType, &diffs);
158                         sdiffs.BuildWordDiffList();
159                         if (byte_level)
160                                 sdiffs.wordLevelToByteLevel();
161                         sdiffs.PopulateDiffs();
162                         for (size_t i = 0; i < diffs.size(); i++)
163                         {
164                                 wdiff& diff = diffs[i];
165                                 //diff.begin[1] = diff.begin[1];
166                                 //diff.begin[0] = diff.begin[0];
167                                 diff.begin[2] = 0;
168                                 //diff.end[1] = diff.end[1];
169                                 //diff.end[0] = diff.end[0];
170                                 diff.end[2] = -1;
171                         }
172                 }
173                 else
174                 {
175                         std::vector<wdiff> diffs10, diffs12;
176                         stringdiffs sdiffs10(str[1], str[0], case_sensitive, eol_sensitive, 0, breakType, &diffs10);
177                         stringdiffs sdiffs12(str[1], str[2], case_sensitive, eol_sensitive, 0, breakType, &diffs12);
178                         // Hash all words in both lines and then compare them word by word
179                         // storing differences into m_wdiffs
180                         sdiffs10.BuildWordDiffList();
181                         sdiffs12.BuildWordDiffList();
182                         if (byte_level)
183                         {
184                                 sdiffs10.wordLevelToByteLevel();
185                                 sdiffs12.wordLevelToByteLevel();
186                         }
187                         // Now copy m_wdiffs into caller-supplied m_pDiffs (coalescing adjacents if possible)
188                         sdiffs10.PopulateDiffs();
189                         sdiffs12.PopulateDiffs();
190
191                         Make3wayDiff(diffs, diffs10, diffs12, 
192                                 Comp02Functor(str, case_sensitive), false);
193                 }
194         }
195         return diffs;
196 }
197
198 /**
199  * @brief stringdiffs constructor simply loads all members from arguments
200  */
201 stringdiffs::stringdiffs(const String & str1, const String & str2,
202         bool case_sensitive, bool eol_sensitive, int whitespace, int breakType,
203         std::vector<wdiff> * pDiffs)
204 : m_str1(str1)
205 , m_str2(str2)
206 , m_case_sensitive(case_sensitive)
207 , m_eol_sensitive(eol_sensitive)
208 , m_whitespace(whitespace)
209 , m_breakType(breakType)
210 , m_pDiffs(pDiffs)
211 , m_matchblock(true) // Change to false to get word to word compare
212 {
213 }
214
215 /**
216  * @brief Destructor.
217  * The destructor frees all diffs added to the vectors.
218  */
219 stringdiffs::~stringdiffs()
220 {
221 }
222
223 #ifdef STRINGDIFF_LOGGING
224 void
225 stringdiffs::debugoutput()
226 {
227         for (size_t i = 0; i < m_wdiffs.size(); i++)
228         {
229                 String str1;
230                 String str2;
231                 TCHAR buf[256];
232                 int s1 = m_wdiffs[i].begin[0];
233                 int e1 = m_wdiffs[i].end[0];
234                 int s2 = m_wdiffs[i].begin[1];
235                 int e2 = m_wdiffs[i].end[1];
236
237                 int len1 = e1 - s1 + 1;
238                 int len2 = e2 - s2 + 1;
239
240                 if (len1 < 50)
241                         str1 = m_str1.substr(s1 ,e1 - s1 + 1);
242                 else
243                         str1 = m_str1.substr(s1, 50);
244
245                 if (len2 < 50)
246                         str2 = m_str2.substr(s2, e2- s2 + 1);
247                 else
248                         str2 = m_str2.substr(s2, 50);
249
250                 wsprintf(buf, _T("left=  %s,   %d,%d,\nright= %s,   %d,%d \n"),
251                         str1.c_str(), s1, e1, str2.c_str(), s2, e2);
252                 OutputDebugString(buf);
253         }
254 }
255 #endif
256
257 bool
258 stringdiffs::BuildWordDiffList_DP()
259 {
260         std::vector<char> edscript;
261
262         //if (dp(edscript) <= 0)
263         //      return false;
264         if (onp(edscript) < 0)
265                 return false;
266
267         int i = 1, j = 1;
268         for (size_t k = 0; k < edscript.size(); k++)
269         {
270                 int s1, e1, s2, e2;
271                 if (edscript[k] == '-')
272                 {
273                         if (m_whitespace == WHITESPACE_IGNORE_ALL)
274                         {
275                                 if (IsSpace(m_words1[i]))
276                                 {
277                                         i++;
278                                         continue;
279                                 }
280                         }
281                                 
282                         s1 = m_words1[i].start;
283                         e1 = m_words1[i].end;
284                         s2 = m_words2[j-1].end+1;
285                         e2 = s2-1;
286                         m_wdiffs.push_back(wdiff(s1, e1, s2, e2));
287                         i++;
288                 }
289                 else if (edscript[k] == '+')
290                 {
291                         if (m_whitespace == WHITESPACE_IGNORE_ALL)
292                         {
293                                 if (IsSpace(m_words2[j]))
294                                 {
295                                         j++;
296                                         continue;
297                                 }
298                         }
299
300                         s1 = m_words1[i-1].end+1;
301                         e1 = s1-1;
302                         s2 = m_words2[j].start;
303                         e2 = m_words2[j].end;
304                         m_wdiffs.push_back(wdiff(s1, e1, s2, e2));
305                         j++;
306                 }
307                 else if (edscript[k] == '!')
308                 {
309                         if (m_whitespace == WHITESPACE_IGNORE_CHANGE || m_whitespace == WHITESPACE_IGNORE_ALL)
310                         {
311                                 if (IsSpace(m_words1[i]) && IsSpace(m_words2[j]))
312                                 {
313                                         i++; j++;
314                                         continue;
315                                 }
316                         }
317                                 
318                         s1 =  m_words1[i].start;
319                         e1 =  m_words1[i].end;
320                         s2 =  m_words2[j].start;
321                         e2 =  m_words2[j].end ;
322                         m_wdiffs.push_back(wdiff(s1, e1, s2, e2));
323                         i++; j++;
324                 }
325                 else
326                 {
327                         i++; j++;
328                 }
329         }
330 #ifdef STRINGDIFF_LOGGING
331         debugoutput();
332 #endif
333         return true;
334 }
335
336 /**
337  * @brief Add all different elements between lines to the wdiff list
338  */
339 void
340 stringdiffs::BuildWordDiffList()
341 {
342         m_words1 = BuildWordsArray(m_str1);
343         m_words2 = BuildWordsArray(m_str2);
344
345         bool succeeded = false;
346 #ifdef _WIN64
347         if (m_words1.size() < 20480 && m_words2.size() < 20480)
348 #else
349         if (m_words1.size() < 2048 && m_words2.size() < 2048)
350 #endif
351         {
352                 succeeded = BuildWordDiffList_DP();
353         }
354         if (!succeeded)
355         {
356                 int s1 = m_words1[0].start;
357                 int e1 = m_words1[m_words1.size() - 1].end;
358                 int s2 = m_words2[0].start;
359                 int e2 = m_words2[m_words2.size() - 1].end;
360                 m_wdiffs.push_back(wdiff(s1, e1, s2, e2));              
361                 return;
362         }
363
364 }
365
366 /**
367  * @brief Break line into constituent words
368  */
369 std::vector<stringdiffs::word>
370 stringdiffs::BuildWordsArray(const String & str)
371 {
372         std::vector<word> words;
373         int i = 0, begin = 0;
374         ICUBreakIterator *pIterChar = ICUBreakIterator::getCharacterBreakIterator(reinterpret_cast<const UChar *>(str.c_str()), static_cast<int32_t>(str.length()));
375
376         size_t sLen = str.length();
377         assert(sLen < INT_MAX);
378         int iLen = static_cast<int>(sLen);
379
380         // dummy;
381         words.push_back(word(0, -1, 0, 0));
382
383         // state when we are looking for next word
384 inspace:
385         if (isSafeWhitespace(str[i])) 
386         {
387                 i = pIterChar->next();
388                 goto inspace;
389         }
390         if (begin < i)
391         {
392                 // just finished a word
393                 // e is first word character (space or at end)
394                 int e = i - 1;
395
396                 words.push_back(word(begin, e, dlspace, Hash(str, begin, e, 0)));
397         }
398         if (i == iLen)
399                 return words;
400         begin = i;
401         goto inword;
402
403         // state when we are inside a word
404 inword:
405         bool atspace=false;
406         if (i == iLen || ((atspace = isSafeWhitespace(str[i])) != 0) || isWordBreak(m_breakType, str.c_str(), i))
407         {
408                 if (begin<i)
409                 {
410                         // just finished a word
411                         // e is first non-word character (space or at end)
412                         int e = i-1;
413                         
414                         words.push_back(word(begin, e, dlword, Hash(str, begin, e, 0)));
415                 }
416                 if (i == iLen)
417                 {
418                         return words;
419                 }
420                 else if (atspace)
421                 {
422                         begin = i;
423                         goto inspace;
424                 }
425                 else
426                 {
427                         // start a new word because we hit a non-whitespace word break (eg, a comma)
428                         // but, we have to put each word break character into its own word
429                         int inext = pIterChar->next();
430                         words.push_back(word(i, inext - 1, dlbreak, Hash(str, i, inext - 1, 0)));
431                         i = inext;
432                         begin = i;
433                         goto inword;
434                 }
435         }
436         i = pIterChar->next();
437         goto inword; // safe even if we're at the end or no longer in a word
438 }
439
440 /**
441  * @brief Populate m_pDiffs from m_wdiffs (combining adjacent diffs)
442  *
443  * Doing the combining of adjacent diffs here keeps some complexity out of BuildWordsArray.
444  */
445 void
446 stringdiffs::PopulateDiffs()
447 {
448         auto IsEOLorEmpty = [](const String& text, size_t begin, size_t end) -> bool {
449                 if (end - begin + 1 > 2)
450                         return false;
451                 String str = text.substr(begin, end - begin + 1);
452                 return (str.empty() || str == _T("\r\n") || str == _T("\n") || str == _T("\r"));
453         };
454         
455         for (int i=0; i< (int)m_wdiffs.size(); ++i)
456         {
457                 bool skipIt = false;
458                 // combine it with next ?
459                 if (i+1< (int)m_wdiffs.size())
460                 {
461                         if (m_wdiffs[i].end[0] + 1 == m_wdiffs[i+1].begin[0]
462                                 && m_wdiffs[i].end[1] + 1 == m_wdiffs[i+1].begin[1])
463                         {
464                                 // diff[i] and diff[i+1] are contiguous
465                                 // so combine them into diff[i+1] and ignore diff[i]
466                                 m_wdiffs[i+1].begin[0] = m_wdiffs[i].begin[0];
467                                 m_wdiffs[i+1].begin[1] = m_wdiffs[i].begin[1];
468                                 skipIt = true;
469                         }
470                 }
471                 else
472                 {
473                         if (!m_eol_sensitive &&
474                                 IsEOLorEmpty(m_str1, m_wdiffs[i].begin[0], m_wdiffs[i].end[0]) &&
475                                 IsEOLorEmpty(m_str2, m_wdiffs[i].begin[1], m_wdiffs[i].end[1]))
476                                 skipIt = true;
477                 }
478                 if (!skipIt)
479                 {
480                         // Should never have a pair where both are missing
481                         assert(m_wdiffs[i].begin[0]>=0 || m_wdiffs[i].begin[1]>=0);
482
483                         // Store the diff[i] in the caller list (m_pDiffs)
484                         m_pDiffs->push_back(wdiff(m_wdiffs[i]));
485                 }
486         }
487 }
488
489 // diffutils hash
490
491 /* Rotate a value n bits to the left. */
492 #define UINT_BIT (sizeof (unsigned) * CHAR_BIT)
493 #define ROL(v, n) ((v) << (n) | (v) >> (UINT_BIT - (n)))
494 /* Given a hash value and a new character, return a new hash value. */
495 #define HASH(h, c) ((c) + ROL (h, 7))
496
497 unsigned
498 stringdiffs::Hash(const String & str, int begin, int end, unsigned h) const
499 {
500         for (int i = begin; i <= end; ++i)
501         {
502                 TCHAR ch = static_cast<unsigned>(str[i]);
503                 if (m_case_sensitive)
504                 {
505                         h += HASH(h, ch);
506                 }
507                 else
508                 {
509                         ch = static_cast<unsigned>(_totupper(ch));
510                         h += HASH(h, ch);
511                 }
512         }
513
514         return h;
515 }
516
517 /**
518  * @brief Compare two words (by reference to original strings)
519  */
520 bool
521 stringdiffs::AreWordsSame(const word & word1, const word & word2) const
522 {
523         if (this->m_whitespace != WHITESPACE_COMPARE_ALL)
524         {
525                 if (IsSpace(word1) && IsSpace(word2))
526                         return true;
527         }
528         if (word1.hash != word2.hash)
529                 return false;
530         if (word1.length() != word2.length())
531                 return false;
532         for (int i=0; i<word1.length(); ++i)
533         {
534                 if (!caseMatch(m_str1[word1.start+i], m_str2[word2.start+i]))
535                         return false;
536         }
537         return true;
538 }
539
540 /**
541  * @brief Return true if characters match
542  */
543 bool
544 stringdiffs::caseMatch(TCHAR ch1, TCHAR ch2) const
545 {
546         if (m_case_sensitive) 
547                 return ch1==ch2;
548         else 
549                 return _totupper(ch1)==_totupper(ch2);
550 }
551
552 /**
553  * @ brief An O(NP) Sequence Comparison Algorithm. Sun Wu, Udi Manber, Gene Myers
554  */
555 int
556 stringdiffs::onp(std::vector<char> &edscript)
557 {
558         auto start = std::chrono::system_clock::now();
559
560         int M = static_cast<int>(m_words1.size() - 1);
561         int N = static_cast<int>(m_words2.size() - 1);
562         bool exchanged = false;
563         if (M > N)
564         {
565                 M = static_cast<int>(m_words2.size() - 1);
566                 N = static_cast<int>(m_words1.size() - 1);
567                 exchanged = true;
568         }
569         int *fp = (new int[(M+1) + 1 + (N+1)]) + (M+1);
570         struct EditScriptElem { int op; int neq; int pk; int pi; };
571         std::vector<EditScriptElem> *es = (new std::vector<EditScriptElem>[(M+1) + 1 + (N+1)]) + (M+1);
572         int DELTA = N - M;
573         
574         auto addEditScriptElem = [&es, &fp](int k) {
575                 EditScriptElem ese;
576                 if (fp[k - 1] + 1 > fp[k + 1])
577                 {
578                         ese.op = '+';
579                         ese.neq = fp[k] - (fp[k - 1] + 1);
580                         ese.pk = k - 1;
581                 }
582                 else
583                 {
584                         ese.op = '-';
585                         ese.neq = fp[k] - fp[k + 1];
586                         ese.pk = k + 1;
587                 }
588                 ese.pi = static_cast<int>(es[ese.pk].size() - 1);
589                 es[k].push_back(ese);
590         };
591
592         const int COUNTMAX = 100000;
593         int count = 0;
594         int k;
595         for (k = -(M+1); k <= (N+1); k++)
596                 fp[k] = -1; 
597         int p = -1;
598         do
599         {
600                 p = p + 1;
601                 for (k = -p; k <= DELTA-1; k++)
602                 {
603                         fp[k] = snake(k, std::max(fp[k-1] + 1, fp[k+1]), exchanged);
604                         addEditScriptElem(k);
605                         count++;
606                 }
607                 for (k = DELTA + p; k >= DELTA+1; k--)
608                 {
609                         fp[k] = snake(k, std::max(fp[k-1] + 1, fp[k+1]), exchanged);
610                         addEditScriptElem(k);
611                         count++;
612                 }
613                 k = DELTA;
614                 fp[k] = snake(k, std::max(fp[k-1] + 1, fp[k+1]), exchanged);
615                 addEditScriptElem(k);
616                 count++;
617
618                 if (count > COUNTMAX)
619                 {
620                         count = 0;
621                         auto end = std::chrono::system_clock::now();
622                         auto msec = std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count();
623                         if (msec > TimeoutMilliSeconds)
624                         {
625                                 delete [] (es - (M+1));
626                                 delete [] (fp - (M+1));
627                                 return -1;
628                         }
629                 }
630         } while (fp[k] != N);
631
632         edscript.clear();
633
634         std::vector<char> ses;
635         int i;
636         for (k = DELTA, i = static_cast<int>(es[DELTA].size() - 1); i >= 0;)
637         {
638                 EditScriptElem& esi = es[k][i];
639                 for (int j = 0; j < esi.neq; ++j)
640                         ses.push_back('=');
641                 ses.push_back(static_cast<char>(esi.op));
642                 i = esi.pi;
643                 k = esi.pk;
644         }
645         std::reverse(ses.begin(), ses.end());
646
647         int D = 0;
648         for (i = 1; i < static_cast<int>(ses.size()); i++)
649         {
650                 switch (ses[i])
651                 {
652                 case '+':
653                         if (i + 1 < ses.size() && ses[i + 1] == '-')
654                         {
655                                 edscript.push_back('!');
656                                 i++;
657                                 D++;
658                         }
659                         else
660                         {
661                                 edscript.push_back(exchanged ? '-' : '+');
662                                 D++;
663                         }
664                         break;
665                 case '-':
666                         if (i + 1 < ses.size() && ses[i + 1] == '+')
667                         {
668                                 edscript.push_back('!');
669                                 i++;
670                                 D++;
671                         }
672                         else
673                         {
674                                 edscript.push_back(exchanged ? '+' : '-');
675                                 D++;
676                         }
677                         break;
678                 default:
679                         edscript.push_back('=');
680                 }
681         }
682                 
683         delete [] (es - (M+1));
684         delete [] (fp - (M+1));
685
686         return D;
687 }
688
689 int
690 stringdiffs::snake(int k, int y, bool exchanged)
691 {
692         int M = static_cast<int>(exchanged ? m_words2.size() - 1 : m_words1.size() - 1);
693         int N = static_cast<int>(exchanged ? m_words1.size() - 1 : m_words2.size() - 1);
694         int x = y - k;
695         while (x < M && y < N && (exchanged ? AreWordsSame(m_words1[y + 1], m_words2[x + 1]) : AreWordsSame(m_words1[x + 1], m_words2[y + 1]))) {
696                 x = x + 1; y = y + 1;
697         }
698         return y;
699 }
700
701 /**
702  * @brief Return true if chars match
703  *
704  * Caller must not call this for lead bytes
705  */
706 static inline bool
707 matchchar(const TCHAR *ch1, const TCHAR *ch2, size_t len, bool casitive)
708 {
709         if (casitive)
710                 return memcmp(ch1, ch2, len * sizeof(TCHAR)) == 0;
711         for (size_t i = 0; i < len; ++i)
712         {
713                 if (_totupper(ch1[i]) != _totupper(ch2[i]))
714                         return false;
715         }
716         return true;
717 }
718
719
720 /** Does character introduce a multicharacter character? */
721 static inline bool IsLeadByte(TCHAR ch)
722 {
723 #ifdef UNICODE
724         return false;
725 #else
726         return _getmbcp() && IsDBCSLeadByte(ch);
727 #endif
728 }
729
730 /**
731  * @brief Is it whitespace (excludes all lead & trail bytes)?
732  */
733 static inline bool
734 isSafeWhitespace(TCHAR ch)
735 {
736         return _istspace((unsigned)ch) && !IsLeadByte(ch);
737 }
738
739 /**
740  * @brief Is it a non-whitespace wordbreak character (ie, punctuation)?
741  */
742 static bool
743 isWordBreak(int breakType, const TCHAR *str, int index)
744 {
745         TCHAR ch = str[index];
746         // breakType==1 means break also on punctuation
747         if ((ch & 0xff00) == 0)
748         {
749 //              TCHAR nextCh = str[index + 1];
750                 // breakType==0 means whitespace only
751                 if (breakType==0)
752                         return false;
753                 return _tcschr(BreakChars, ch) != nullptr;
754         }
755         else 
756         {
757 //              if (
758 //                      ch==0xff0c/* Fullwidth Full Stop */ || 
759 //                      ch==0xff0e/* Fullwidth Comma */ ||
760 //                      ch==0xff1b/* Fullwidth Semicolon */ ||
761 //                      ch==0xff1a/* Fullwidth Colon */ ||
762 //                      ch==0x3002/* Ideographic Full Stop */ || 
763 //                      ch==0x3001/* Ideographic Comma */
764 //                      )
765 //                      return true;
766 //              WORD wCharType, wCharTypeNext;
767 //              GetStringTypeW(CT_CTYPE3, &ch, 1, &wCharType);
768 //              TCHAR nextCh = str[index + 1];
769 //              GetStringTypeW(CT_CTYPE3, &nextCh, 1, &wCharTypeNext);
770 //              return (wCharType != wCharTypeNext);
771 //              
772                 return true;
773         }
774 }
775
776
777 /**
778  * @brief advance current pointer over whitespace, until not whitespace or beyond end
779  * @param pcurrent [in,out] current location (to be advanced)
780  * @param end [in] last valid position (only go one beyond this)
781  */
782 static void
783 AdvanceOverWhitespace(const TCHAR **pcurrent, const TCHAR *end)
784 {
785         // advance over whitespace
786         while (*pcurrent <= end && isSafeWhitespace(**pcurrent))
787                 ++(*pcurrent); // DBCS safe because of isSafeWhitespace above
788 }
789
790 /**
791  * @brief Compute begin1,begin2,end1,end2 to display byte difference between strings str1 & str2
792  * @param casitive [in] true for case-sensitive, false for case-insensitive
793  * @param xwhite [in] This governs whether we handle whitespace specially (see WHITESPACE_COMPARE_ALL, WHITESPACE_IGNORE_CHANGE, WHITESPACE_IGNORE_ALL)
794  * @param [out] begin return -1 if not found or pos of equal
795  * @param [out] end return -1 if not found or pos of equal valid if begin1 >=0
796  * @param [in] equal false surch for a diff, true surch for equal
797  *
798  *
799  * Assumes whitespace is never leadbyte or trailbyte!
800  */
801 void
802 stringdiffs::ComputeByteDiff(const String & str1, const String & str2, 
803                    bool casitive, int xwhite, 
804                    int begin[2], int end[2], bool equal)
805 {
806         // Set to sane values
807         // Also this way can distinguish if we set begin[0] to -1 for no diff in line
808         begin[0] = end[0] = begin[1] = end[1] = 0;
809
810         int len1 = static_cast<int>(str1.length());
811         int len2 = static_cast<int>(str2.length());
812
813         const TCHAR *pbeg1 = str1.c_str();
814         const TCHAR *pbeg2 = str2.c_str();
815
816         ICUBreakIterator *pIterCharBegin1 = ICUBreakIterator::getCharacterBreakIterator(reinterpret_cast<const UChar *>(pbeg1), static_cast<int32_t>(len1));
817         ICUBreakIterator *pIterCharBegin2 = ICUBreakIterator::getCharacterBreakIterator<2>(reinterpret_cast<const UChar *>(pbeg2), static_cast<int32_t>(len2));
818         ICUBreakIterator *pIterCharEnd1 = ICUBreakIterator::getCharacterBreakIterator<3>(reinterpret_cast<const UChar *>(pbeg1), static_cast<int32_t>(len1));
819         ICUBreakIterator *pIterCharEnd2 = ICUBreakIterator::getCharacterBreakIterator<4>(reinterpret_cast<const UChar *>(pbeg2), static_cast<int32_t>(len2));
820         
821         if (len1 == 0 || len2 == 0)
822         {
823                 if (len1 == len2)
824                 {
825                         begin[0] = -1;
826                         begin[1] = -1;
827                 }
828                 end[0] = len1 - 1;
829                 end[1] = len2 - 1;
830                 return;
831         }
832
833         // cursors from front, which we advance to beginning of difference
834         const TCHAR *py1 = pbeg1;
835         const TCHAR *py2 = pbeg2;
836
837         // pen1,pen2 point to the last valid character (broken multibyte lead chars don't count)
838         const TCHAR *pen1 = pbeg1 + (len1 > 0 ? pIterCharEnd1->preceding(len1) : 0);
839         const TCHAR *pen2 = pbeg2 + (len2 > 0 ? pIterCharEnd2->preceding(len2) : 0);
840         size_t glyphlenz1 = pbeg1 + len1 - pen1;
841         size_t glyphlenz2 = pbeg2 + len2 - pen2;
842
843         if (xwhite != WHITESPACE_COMPARE_ALL)
844         {
845                 // Ignore leading and trailing whitespace
846                 // by advancing py1 and py2
847                 // and retreating pen1 and pen2
848                 while (py1 < pen1 && isSafeWhitespace(*py1))
849                         py1 = pbeg1 + pIterCharBegin1->next();
850                 while (py2 < pen2 && isSafeWhitespace(*py2))
851                         py2 = pbeg2 + pIterCharBegin2->next();
852                 if ((pen1 < pbeg1 + len1 - 1 || pen2 < pbeg2 + len2 -1)
853                         && (!len1 || !len2 || pbeg1[len1] != pbeg2[len2]))
854                 {
855                         // mismatched broken multibyte ends
856                 }
857                 else
858                 {
859                         while (pen1 > py1 && isSafeWhitespace(*pen1))
860                                 pen1 = pbeg1 + pIterCharEnd1->previous();
861                         while (pen2 > py2 && isSafeWhitespace(*pen2))
862                                 pen2 = pbeg2 + pIterCharEnd2->previous();
863                 }
864         }
865         //check for excaption of empty string on one side
866         //In that case display all as a diff
867         if (!equal && (((py1 == pen1) && isSafeWhitespace(*pen1)) ||
868                 ((py2 == pen2) && isSafeWhitespace(*pen2))))
869         {
870                 begin[0] = 0;
871                 begin[1] = 0;
872                 end[0] = len1 - 1;
873                 end[1] = len2 - 1;
874                 return;
875         }
876         // Advance over matching beginnings of lines
877         // Advance py1 & py2 from beginning until find difference or end
878         while (true)
879         {
880                 // Potential difference extends from py1 to pen1 and py2 to pen2
881
882                 // Check if either side finished
883                 if (py1 > pen1 && py2 > pen2)
884                 {
885                         begin[0] = end[0] = begin[1] = end[1] = -1;
886                         break;
887                 }
888                 if (py1 > pen1 || py2 > pen2)
889                 {
890                         break;
891                 }
892
893                 // handle all the whitespace logic (due to WinMerge whitespace settings)
894                 if (xwhite && py1 < pen1 && isSafeWhitespace(*py1))
895                 {
896                         if (xwhite==WHITESPACE_IGNORE_CHANGE && !isSafeWhitespace(*py2))
897                         {
898                                 // py1 is white but py2 is not
899                                 // in WHITESPACE_IGNORE_CHANGE mode,
900                                 // this doesn't qualify as skippable whitespace
901                                 break; // done with forward search
902                         }
903                         // gobble up all whitespace in current area
904                         AdvanceOverWhitespace(&py1, pen1); // will go beyond end
905                         AdvanceOverWhitespace(&py2, pen2); // will go beyond end
906                         continue;
907
908                 }
909                 if (xwhite && py2 < pen2 && isSafeWhitespace(*py2))
910                 {
911                         if (xwhite==WHITESPACE_IGNORE_CHANGE && !isSafeWhitespace(*py1))
912                         {
913                                 // py2 is white but py1 is not
914                                 // in WHITESPACE_IGNORE_CHANGE mode,
915                                 // this doesn't qualify as skippable whitespace
916                                 break; // done with forward search
917                         }
918                         // gobble up all whitespace in current area
919                         AdvanceOverWhitespace(&py1, pen1); // will go beyond end
920                         AdvanceOverWhitespace(&py2, pen2); // will go beyond end
921                         continue;
922                 }
923
924                 const TCHAR* py1next = pbeg1 + pIterCharBegin1->next();
925                 const TCHAR* py2next = pbeg2 + pIterCharBegin2->next();
926                 size_t glyphleny1 = py1next - py1;
927                 size_t glyphleny2 = py2next - py2;
928                 if (glyphleny1 != glyphleny2 || !matchchar(py1, py2, glyphleny1, casitive))
929                         break; // done with forward search
930                 py1 = py1next;
931                 py2 = py2next;
932         }
933
934         // Potential difference extends from py1 to pen1 and py2 to pen2
935
936         // Store results of advance into return variables (begin[0] & begin[1])
937         // -1 in a begin variable means no visible diff area
938         begin[0] = static_cast<int>(py1 - pbeg1);
939         begin[1] = static_cast<int>(py2 - pbeg2);
940
941         const TCHAR *pz1 = pen1;
942         const TCHAR *pz2 = pen2;
943
944         // Retreat over matching ends of lines
945         // Retreat pz1 & pz2 from end until find difference or beginning
946         while (true)
947         {
948                 // Check if either side finished
949                 if (pz1 < py1 && pz2 < py2)
950                 {
951                         begin[0] = end[0] = begin[1] = end[1] = -1;
952                         break;
953                 }
954                 if (pz1 < py1 || pz2 < py2)
955                 {
956                         break;
957                 }
958
959                 // handle all the whitespace logic (due to WinMerge whitespace settings)
960                 if (xwhite && pz1 > py1 && isSafeWhitespace(*pz1))
961                 {
962                         if (xwhite==1 && !isSafeWhitespace(*pz2))
963                                 break; // done with reverse search
964                         // gobble up all whitespace in current area
965                         while (pz1 > py1 && isSafeWhitespace(*pz1))
966                                 pz1 = pbeg1 + pIterCharEnd1->previous();
967                         while (pz2 > py2 && isSafeWhitespace(*pz2))
968                                 pz2 = pbeg2 + pIterCharEnd2->previous();
969                         continue;
970
971                 }
972                 if (xwhite && pz2 > py2 && isSafeWhitespace(*pz2))
973                 {
974                         if (xwhite==1)
975                                 break; // done with reverse search
976                         while (pz2 > py2 && isSafeWhitespace(*pz2))
977                                 pz2 = pbeg2 + pIterCharEnd2->previous();
978                         continue;
979                 }
980
981                 if (glyphlenz1 != glyphlenz2 || !matchchar(pz1, pz2, glyphlenz1, casitive))
982                         break; // done with forward search
983                 const TCHAR* pz1next = pz1;
984                 const TCHAR* pz2next = pz2;
985                 pz1 = (pz1 > pbeg1) ? pbeg1 + pIterCharEnd1->preceding(static_cast<int32_t>(pz1 - pbeg1)) : pz1 - 1;
986                 pz2 = (pz2 > pbeg2) ? pbeg2 + pIterCharEnd2->preceding(static_cast<int32_t>(pz2 - pbeg2)) : pz2 - 1;
987                 glyphlenz1 = pz1next - pz1;
988                 glyphlenz2 = pz2next - pz2;
989                 // Now do real character match
990     }
991
992 /*      if (*pz1 == '\r' && *(pz1+1) == '\n')
993         {
994                 pz1++;
995                 pz2++;
996         }
997         else if (*pz2 == '\r' && *(pz2+1) == '\n')
998         {
999                 pz2++;
1000                 pz1++;
1001         }
1002         if (*(pbeg1-1) == '\r' && *pbeg1 == '\n')
1003         {
1004                 pbeg1--;
1005                 pbeg2--;
1006         }
1007         else if (*(pbeg2-1) == '\r' && *pbeg2 == '\n')
1008         {
1009                 pbeg2--;
1010                 pbeg1--;
1011         }*/
1012
1013         // Store results of advance into return variables (end[0] & end[1])
1014         end[0] = static_cast<int>(pz1 - pbeg1 + glyphlenz1 - 1);
1015         end[1] = static_cast<int>(pz2 - pbeg2 + glyphlenz2 - 1);
1016
1017         // Check if difference region was empty
1018         if (begin[0] == end[0] + 1 && begin[1] == end[1] + 1)
1019                 begin[0] = -1; // no diff
1020 }
1021
1022 /**
1023  * @brief adjust the range of the specified word diffs down to byte(char) level.
1024  * @param str1, str2 [in] line to be compared
1025  * @param casitive [in] true for case-sensitive, false for case-insensitive
1026  * @param xwhite [in] This governs whether we handle whitespace specially
1027  *  (see WHITESPACE_COMPARE_ALL, WHITESPACE_IGNORE_CHANGE, WHITESPACE_IGNORE_ALL)
1028  */
1029 void stringdiffs::wordLevelToByteLevel()
1030 {
1031         for (size_t i = 0; i < m_wdiffs.size(); i++)
1032         {
1033                 int begin[3], end[3];
1034                 wdiff& diff = m_wdiffs[i];
1035                 String str1_2, str2_2;
1036                 str1_2 = m_str1.substr(diff.begin[0], diff.end[0] - diff.begin[0] + 1);
1037                 str2_2 = m_str2.substr(diff.begin[1], diff.end[1] - diff.begin[1] + 1);
1038                 ComputeByteDiff(str1_2, str2_2, m_case_sensitive, m_whitespace, begin, end, false);
1039                 if (begin[0] == -1)
1040                 {
1041                         // no visible diff on side1
1042                         diff.end[0] = diff.begin[0] - 1;
1043                 }
1044                 else
1045                 {
1046                         diff.end[0] = diff.begin[0] + end[0];
1047                         diff.begin[0] += begin[0];
1048                 }
1049                 if (begin[1] == -1)
1050                 {
1051                         // no visible diff on side2
1052                         diff.end[1] = diff.begin[1] - 1;
1053                 }
1054                 else
1055                 {
1056                         diff.end[1] = diff.begin[1] + end[1];
1057                         diff.begin[1] += begin[1];
1058                 }
1059         }
1060 }
1061
1062 }