OSDN Git Service

Merge branch 'master' of https://github.com/winmerge/winmerge into jp
[winmerge-jp/winmerge-jp.git] / Src / stringdiffs.cpp
1 /** 
2  * @file  stringdiffs.cpp
3  *
4  * @brief Implementation file for ComputeWordDiffs (q.v.)
5  *
6  */
7
8 #include "pch.h"
9 #include "stringdiffs.h"
10 #define NOMINMAX
11 #include <cassert>
12 #include <chrono>
13 #include "CompareOptions.h"
14 #include "stringdiffsi.h"
15 #include "Diff3.h"
16
17 using std::vector;
18
19 namespace strdiff
20 {
21
22 static bool Initialized;
23 static bool CustomChars;
24 static TCHAR *BreakChars;
25 static TCHAR BreakCharDefaults[] = _T(",.;:");
26 static int TimeoutMilliSeconds = 500;
27
28 static bool isSafeWhitespace(TCHAR ch);
29 static bool isWordBreak(int breakType, const TCHAR *str, int index, bool ignore_numbers);
30
31 void Init()
32 {
33         BreakChars = &BreakCharDefaults[0];
34         Initialized = true;
35 }
36
37 void Close()
38 {
39         if (CustomChars)
40         {
41                 free(BreakChars);
42                 BreakChars = nullptr;
43                 CustomChars = false;
44         }
45         Initialized = false;
46 }
47
48 void SetBreakChars(const TCHAR *breakChars)
49 {
50         assert(Initialized);
51
52         if (CustomChars)
53                 free(BreakChars);
54
55         CustomChars = true;
56         BreakChars = _tcsdup(breakChars);
57 }
58
59 std::vector<wdiff>
60 ComputeWordDiffs(const String& str1, const String& str2,
61         bool case_sensitive, bool eol_sensitive, int whitespace, bool ignore_numbers, int breakType, bool byte_level)
62 {
63         String strs[3] = {str1, str2, _T("")};
64         return ComputeWordDiffs(2, strs, case_sensitive, eol_sensitive, whitespace, ignore_numbers, breakType, byte_level);
65 }
66
67 struct Comp02Functor
68 {
69         Comp02Functor(const String *strs, bool case_sensitive) : 
70                 strs_(strs), case_sensitive_(case_sensitive)
71         {
72         }
73         bool operator()(const wdiff &wd3)
74         {
75                 size_t wlen0 = wd3.end[0] - wd3.begin[0] + 1;
76                 size_t wlen2 = wd3.end[2] - wd3.begin[2] + 1;
77                 if (wlen0 != wlen2)
78                         return false;
79                 if (case_sensitive_)
80                 {
81                         if (memcmp(&strs_[0][wd3.begin[0]], &strs_[2][wd3.begin[2]], wlen0 * sizeof(TCHAR)) != 0)
82                                 return false;
83                 }
84                 else
85                 {
86                         if (_tcsnicmp(&strs_[0][wd3.begin[0]], &strs_[2][wd3.begin[2]], wlen0) != 0)
87                                 return false;
88                 }
89                 return true;
90         }
91         const String *strs_;
92         bool case_sensitive_;
93 };
94
95 /**
96  * @brief Construct our worker object and tell it to do the work
97  */
98 std::vector<wdiff>
99 ComputeWordDiffs(int nFiles, const String *str,
100         bool case_sensitive, bool eol_sensitive, int whitespace, bool ignore_numbers, int breakType, bool byte_level)
101 {
102         std::vector<wdiff> diffs;
103         if (nFiles == 2)
104         {
105                 stringdiffs sdiffs(str[0], str[1], case_sensitive, eol_sensitive, whitespace, ignore_numbers, breakType, &diffs);
106                 // Hash all words in both lines and then compare them word by word
107                 // storing differences into m_wdiffs
108                 sdiffs.BuildWordDiffList();
109
110                 if (byte_level)
111                         sdiffs.wordLevelToByteLevel();
112
113                 // Now copy m_wdiffs into caller-supplied m_pDiffs (coalescing adjacents if possible)
114                 sdiffs.PopulateDiffs();
115
116         }
117         else
118         {
119                 if (str[0].empty())
120                 {
121                         stringdiffs sdiffs(str[1], str[2], case_sensitive, eol_sensitive, whitespace, ignore_numbers, breakType, &diffs);
122                         sdiffs.BuildWordDiffList();
123                         if (byte_level)
124                                 sdiffs.wordLevelToByteLevel();
125                         sdiffs.PopulateDiffs();
126                         for (size_t i = 0; i < diffs.size(); i++)
127                         {
128                                 wdiff& diff = diffs[i];
129                                 diff.begin[2] = diff.begin[1];
130                                 diff.begin[1] = diff.begin[0];
131                                 diff.begin[0] = 0;
132                                 diff.end[2] = diff.end[1];
133                                 diff.end[1] = diff.end[0];
134                                 diff.end[0] = -1;
135                         }
136                 }
137                 else if (str[1].empty())
138                 {
139                         stringdiffs sdiffs(str[0], str[2], case_sensitive, eol_sensitive, whitespace, ignore_numbers, breakType, &diffs);
140                         sdiffs.BuildWordDiffList();
141                         if (byte_level)
142                                 sdiffs.wordLevelToByteLevel();
143                         sdiffs.PopulateDiffs();
144                         for (size_t i = 0; i < diffs.size(); i++)
145                         {
146                                 wdiff& diff = diffs[i];
147                                 diff.begin[2] = diff.begin[1];
148                                 //diff.begin[0] = diff.begin[0];
149                                 diff.begin[1] = 0;
150                                 diff.end[2] = diff.end[1];
151                                 //diff.end[0] = diff.end[0];
152                                 diff.end[1] = -1;
153                         }
154                 }
155                 else if (str[2].empty())
156                 {
157                         stringdiffs sdiffs(str[0], str[1], case_sensitive, eol_sensitive, whitespace, ignore_numbers, breakType, &diffs);
158                         sdiffs.BuildWordDiffList();
159                         if (byte_level)
160                                 sdiffs.wordLevelToByteLevel();
161                         sdiffs.PopulateDiffs();
162                         for (size_t i = 0; i < diffs.size(); i++)
163                         {
164                                 wdiff& diff = diffs[i];
165                                 //diff.begin[1] = diff.begin[1];
166                                 //diff.begin[0] = diff.begin[0];
167                                 diff.begin[2] = 0;
168                                 //diff.end[1] = diff.end[1];
169                                 //diff.end[0] = diff.end[0];
170                                 diff.end[2] = -1;
171                         }
172                 }
173                 else
174                 {
175                         std::vector<wdiff> diffs10, diffs12;
176                         stringdiffs sdiffs10(str[1], str[0], case_sensitive, eol_sensitive, 0, ignore_numbers, breakType, &diffs10);
177                         stringdiffs sdiffs12(str[1], str[2], case_sensitive, eol_sensitive, 0, ignore_numbers, breakType, &diffs12);
178                         // Hash all words in both lines and then compare them word by word
179                         // storing differences into m_wdiffs
180                         sdiffs10.BuildWordDiffList();
181                         sdiffs12.BuildWordDiffList();
182                         if (byte_level)
183                         {
184                                 sdiffs10.wordLevelToByteLevel();
185                                 sdiffs12.wordLevelToByteLevel();
186                         }
187                         // Now copy m_wdiffs into caller-supplied m_pDiffs (coalescing adjacents if possible)
188                         sdiffs10.PopulateDiffs();
189                         sdiffs12.PopulateDiffs();
190
191                         Make3wayDiff(diffs, diffs10, diffs12, 
192                                 Comp02Functor(str, case_sensitive), false);
193                 }
194         }
195         return diffs;
196 }
197
198 int Compare(const String& str1, const String& str2,
199         bool case_sensitive, bool eol_sensitive, int whitespace, bool ignore_numbers)
200 {
201         if (case_sensitive && eol_sensitive && whitespace == WHITESPACE_COMPARE_ALL && !ignore_numbers)
202                 return str2.compare(str1);
203         String s1 = str1, s2 = str2;
204         if (!case_sensitive)
205         {
206                 s1 = strutils::makelower(s1);
207                 s2 = strutils::makelower(s2);
208         }
209         if (whitespace == WHITESPACE_IGNORE_CHANGE)
210         {
211                 strutils::replace_chars(s1, _T(" \t"), _T(" "));
212                 strutils::replace_chars(s2, _T(" \t"), _T(" "));
213         }
214         else if (whitespace == WHITESPACE_IGNORE_ALL)
215         {
216                 strutils::replace_chars(s1, _T(" \t"), _T(""));
217                 strutils::replace_chars(s2, _T(" \t"), _T(""));
218         }
219         if (!eol_sensitive)
220         {
221                 strutils::replace_chars(s1, _T("\r\n"), _T("\n"));
222                 strutils::replace_chars(s2, _T("\r\n"), _T("\n"));
223         }
224         if (ignore_numbers)
225         {
226                 strutils::replace_chars(s1, _T("0123456789"), _T(""));
227                 strutils::replace_chars(s2, _T("0123456789"), _T(""));
228         }
229         return s2.compare(s1);
230 }
231
232 /**
233  * @brief stringdiffs constructor simply loads all members from arguments
234  */
235 stringdiffs::stringdiffs(const String & str1, const String & str2,
236         bool case_sensitive, bool eol_sensitive, int whitespace, bool ignore_numbers, int breakType,
237         std::vector<wdiff> * pDiffs)
238 : m_str1(str1)
239 , m_str2(str2)
240 , m_case_sensitive(case_sensitive)
241 , m_eol_sensitive(eol_sensitive)
242 , m_whitespace(whitespace)
243 , m_ignore_numbers(ignore_numbers)
244 , m_breakType(breakType)
245 , m_pDiffs(pDiffs)
246 , m_matchblock(true) // Change to false to get word to word compare
247 {
248 }
249
250 /**
251  * @brief Destructor.
252  * The destructor frees all diffs added to the vectors.
253  */
254 stringdiffs::~stringdiffs() = default;
255
256 #ifdef STRINGDIFF_LOGGING
257 void
258 stringdiffs::debugoutput()
259 {
260         for (size_t i = 0; i < m_wdiffs.size(); i++)
261         {
262                 String str1;
263                 String str2;
264                 TCHAR buf[256];
265                 int s1 = m_wdiffs[i].begin[0];
266                 int e1 = m_wdiffs[i].end[0];
267                 int s2 = m_wdiffs[i].begin[1];
268                 int e2 = m_wdiffs[i].end[1];
269
270                 int len1 = e1 - s1 + 1;
271                 int len2 = e2 - s2 + 1;
272
273                 if (len1 < 50)
274                         str1 = m_str1.substr(s1 ,e1 - s1 + 1);
275                 else
276                         str1 = m_str1.substr(s1, 50);
277
278                 if (len2 < 50)
279                         str2 = m_str2.substr(s2, e2- s2 + 1);
280                 else
281                         str2 = m_str2.substr(s2, 50);
282
283                 wsprintf(buf, _T("left=  %s,   %d,%d,\nright= %s,   %d,%d \n"),
284                         str1.c_str(), s1, e1, str2.c_str(), s2, e2);
285                 OutputDebugString(buf);
286         }
287 }
288 #endif
289
290 bool
291 stringdiffs::BuildWordDiffList_DP()
292 {
293         std::vector<char> edscript;
294
295         //if (dp(edscript) <= 0)
296         //      return false;
297         if (onp(edscript) < 0)
298                 return false;
299
300         int i = 1, j = 1;
301         for (size_t k = 0; k < edscript.size(); k++)
302         {
303                 int s1, e1, s2, e2;
304                 if (edscript[k] == '-')
305                 {
306                         if (m_whitespace == WHITESPACE_IGNORE_ALL)
307                         {
308                                 if (IsSpace(m_words1[i]))
309                                 {
310                                         i++;
311                                         continue;
312                                 }
313                         }
314                         if (m_ignore_numbers && IsNumber(m_words1[i]))
315                         {
316                                 i++;
317                                 continue;
318                         }
319
320                         s1 = m_words1[i].start;
321                         e1 = m_words1[i].end;
322                         s2 = m_words2[j-1].end+1;
323                         e2 = s2-1;
324                         m_wdiffs.emplace_back(s1, e1, s2, e2);
325                         i++;
326                 }
327                 else if (edscript[k] == '+')
328                 {
329                         if (m_whitespace == WHITESPACE_IGNORE_ALL)
330                         {
331                                 if (IsSpace(m_words2[j]))
332                                 {
333                                         j++;
334                                         continue;
335                                 }
336                         }
337
338                         if (m_ignore_numbers && IsNumber(m_words2[j]))
339                         {
340                                 j++;
341                                 continue;
342                         }
343
344                         s1 = m_words1[i-1].end+1;
345                         e1 = s1-1;
346                         s2 = m_words2[j].start;
347                         e2 = m_words2[j].end;
348                         m_wdiffs.emplace_back(s1, e1, s2, e2);
349                         j++;
350                 }
351                 else if (edscript[k] == '!')
352                 {
353                         if (m_whitespace == WHITESPACE_IGNORE_CHANGE || m_whitespace == WHITESPACE_IGNORE_ALL)
354                         {
355                                 if (IsSpace(m_words1[i]) && IsSpace(m_words2[j]))
356                                 {
357                                         i++; j++;
358                                         continue;
359                                 }
360                         }
361                         if (m_ignore_numbers && IsNumber(m_words1[i]) && IsNumber(m_words2[j]))
362                         {
363                                 i++; j++;
364                                 continue;
365                         }
366
367                         s1 =  m_words1[i].start;
368                         e1 =  m_words1[i].end;
369                         s2 =  m_words2[j].start;
370                         e2 =  m_words2[j].end ;
371                         m_wdiffs.emplace_back(s1, e1, s2, e2);
372                         i++; j++;
373                 }
374                 else
375                 {
376                         i++; j++;
377                 }
378         }
379 #ifdef STRINGDIFF_LOGGING
380         debugoutput();
381 #endif
382         return true;
383 }
384
385 /**
386  * @brief Add all different elements between lines to the wdiff list
387  */
388 void
389 stringdiffs::BuildWordDiffList()
390 {
391         m_words1 = BuildWordsArray(m_str1);
392         m_words2 = BuildWordsArray(m_str2);
393
394         bool succeeded = false;
395 #ifdef _WIN64
396         if (m_words1.size() < 20480 && m_words2.size() < 20480)
397 #else
398         if (m_words1.size() < 2048 && m_words2.size() < 2048)
399 #endif
400         {
401                 succeeded = BuildWordDiffList_DP();
402         }
403         if (!succeeded)
404         {
405                 int s1 = m_words1[0].start;
406                 int e1 = m_words1[m_words1.size() - 1].end;
407                 int s2 = m_words2[0].start;
408                 int e2 = m_words2[m_words2.size() - 1].end;
409                 m_wdiffs.emplace_back(s1, e1, s2, e2);
410                 return;
411         }
412
413 }
414
415 /**
416  * @brief Break line into constituent words
417  */
418 std::vector<stringdiffs::word>
419 stringdiffs::BuildWordsArray(const String & str)
420 {
421         std::vector<word> words;
422         int i = 0, begin = 0;
423         ICUBreakIterator *pIterChar = ICUBreakIterator::getCharacterBreakIterator(reinterpret_cast<const UChar *>(str.c_str()), static_cast<int32_t>(str.length()));
424
425         size_t sLen = str.length();
426         assert(sLen < INT_MAX);
427         int iLen = static_cast<int>(sLen);
428
429         // dummy;
430         words.push_back(word(0, -1, 0, 0));
431
432         // state when we are looking for next word
433 inspace:
434         if (isSafeWhitespace(str[i])) 
435         {
436                 i = pIterChar->next();
437                 goto inspace;
438         }
439         if (begin < i)
440         {
441                 // just finished a word
442                 // e is first word character (space or at end)
443                 int e = i - 1;
444
445                 words.push_back(word(begin, e, dlspace, Hash(str, begin, e, 0)));
446         }
447         if (i == iLen)
448                 return words;
449         begin = i;
450         goto inword;
451
452         // state when we are inside a word
453 inword:
454         bool atspace=false;
455         if (i == iLen || ((atspace = isSafeWhitespace(str[i])) != 0) || isWordBreak(m_breakType, str.c_str(), i, m_ignore_numbers))
456         {
457                 if (begin<i)
458                 {
459                         // just finished a word
460                         // e is first non-word character (space or at end)
461                         int e = i-1;
462                         
463                         words.push_back(word(begin, e, dlword, Hash(str, begin, e, 0)));
464                 }
465                 if (i == iLen)
466                 {
467                         return words;
468                 }
469                 else if (atspace)
470                 {
471                         begin = i;
472                         goto inspace;
473                 }
474                 else
475                 {
476                         // start a new word because we hit a non-whitespace word break (eg, a comma)
477                         // but, we have to put each word break character into its own word
478                         int break_type = dlbreak;
479                         if (m_ignore_numbers && _istdigit(str[i]))
480                         {
481                                 break_type = dlnumber;
482                         }
483                         int inext = pIterChar->next();
484                         words.push_back(word(i, inext - 1, break_type, Hash(str, i, inext - 1, 0)));
485                         i = inext;
486                         begin = i;
487                         goto inword;
488                 }
489         }
490         i = pIterChar->next();
491         goto inword; // safe even if we're at the end or no longer in a word
492 }
493
494 /**
495  * @brief Populate m_pDiffs from m_wdiffs (combining adjacent diffs)
496  *
497  * Doing the combining of adjacent diffs here keeps some complexity out of BuildWordsArray.
498  */
499 void
500 stringdiffs::PopulateDiffs()
501 {
502         auto IsEOLorEmpty = [](const String& text, size_t begin, size_t end) -> bool {
503                 if (end - begin + 1 > 2)
504                         return false;
505                 String str = text.substr(begin, end - begin + 1);
506                 return (str.empty() || str == _T("\r\n") || str == _T("\n") || str == _T("\r"));
507         };
508         
509         for (int i=0; i< (int)m_wdiffs.size(); ++i)
510         {
511                 bool skipIt = false;
512                 // combine it with next ?
513                 if (i+1< (int)m_wdiffs.size())
514                 {
515                         if (m_wdiffs[i].end[0] + 1 == m_wdiffs[i+1].begin[0]
516                                 && m_wdiffs[i].end[1] + 1 == m_wdiffs[i+1].begin[1])
517                         {
518                                 // diff[i] and diff[i+1] are contiguous
519                                 // so combine them into diff[i+1] and ignore diff[i]
520                                 m_wdiffs[i+1].begin[0] = m_wdiffs[i].begin[0];
521                                 m_wdiffs[i+1].begin[1] = m_wdiffs[i].begin[1];
522                                 skipIt = true;
523                         }
524                 }
525                 else
526                 {
527                         if (!m_eol_sensitive &&
528                                 IsEOLorEmpty(m_str1, m_wdiffs[i].begin[0], m_wdiffs[i].end[0]) &&
529                                 IsEOLorEmpty(m_str2, m_wdiffs[i].begin[1], m_wdiffs[i].end[1]))
530                                 skipIt = true;
531                 }
532                 if (!skipIt)
533                 {
534                         // Should never have a pair where both are missing
535                         assert(m_wdiffs[i].begin[0]>=0 || m_wdiffs[i].begin[1]>=0);
536
537                         // Store the diff[i] in the caller list (m_pDiffs)
538                         m_pDiffs->emplace_back(m_wdiffs[i]);
539                 }
540         }
541 }
542
543 // diffutils hash
544
545 /* Rotate a value n bits to the left. */
546 #define UINT_BIT (sizeof (unsigned) * CHAR_BIT)
547 #define ROL(v, n) ((v) << (n) | (v) >> (UINT_BIT - (n)))
548 /* Given a hash value and a new character, return a new hash value. */
549 #define HASH(h, c) ((c) + ROL (h, 7))
550
551 unsigned
552 stringdiffs::Hash(const String & str, int begin, int end, unsigned h) const
553 {
554         for (int i = begin; i <= end; ++i)
555         {
556                 TCHAR ch = static_cast<unsigned>(str[i]);
557                 if (m_case_sensitive)
558                 {
559                         h += HASH(h, ch);
560                 }
561                 else
562                 {
563                         ch = static_cast<unsigned>(_totlower(ch));
564                         h += HASH(h, ch);
565                 }
566         }
567
568         return h;
569 }
570
571 /**
572  * @brief Compare two words (by reference to original strings)
573  */
574 bool
575 stringdiffs::AreWordsSame(const word & word1, const word & word2) const
576 {
577         if (this->m_whitespace != WHITESPACE_COMPARE_ALL)
578         {
579                 if (IsSpace(word1) && IsSpace(word2))
580                         return true;
581         }
582         if (m_ignore_numbers)
583         {
584                 auto a = m_str1[word1.start];
585                 auto b = m_str2[word2.start];
586                 if (_istdigit(a) && _istdigit(b))
587                         return true;
588
589         }
590         
591         if (word1.hash != word2.hash)
592                 return false;
593         if (word1.length() != word2.length())
594                 return false;
595         for (int i=0; i<word1.length(); ++i)
596         {
597                 if (!caseMatch(m_str1[word1.start+i], m_str2[word2.start+i]))
598                         return false;
599         }
600         return true;
601 }
602
603 /**
604  * @brief Return true if characters match
605  */
606 bool
607 stringdiffs::caseMatch(TCHAR ch1, TCHAR ch2) const
608 {
609         if (m_case_sensitive) 
610                 return ch1==ch2;
611         else 
612                 return _totlower(ch1)==_totlower(ch2);
613 }
614
615 /**
616  * @ brief An O(NP) Sequence Comparison Algorithm. Sun Wu, Udi Manber, Gene Myers
617  */
618 int
619 stringdiffs::onp(std::vector<char> &edscript)
620 {
621         auto start = std::chrono::system_clock::now();
622
623         int M = static_cast<int>(m_words1.size() - 1);
624         int N = static_cast<int>(m_words2.size() - 1);
625         bool exchanged = false;
626         if (M > N)
627         {
628                 M = static_cast<int>(m_words2.size() - 1);
629                 N = static_cast<int>(m_words1.size() - 1);
630                 exchanged = true;
631         }
632         int *fp = (new int[(M+1) + 1 + (N+1)]) + (M+1);
633         struct EditScriptElem { int op; int neq; int pk; int pi; };
634         std::vector<EditScriptElem> *es = (new std::vector<EditScriptElem>[(M+1) + 1 + (N+1)]) + (M+1);
635         int DELTA = N - M;
636         
637         auto addEditScriptElem = [&es, &fp](int k) {
638                 EditScriptElem ese;
639                 if (fp[k - 1] + 1 > fp[k + 1])
640                 {
641                         ese.op = '+';
642                         ese.neq = fp[k] - (fp[k - 1] + 1);
643                         ese.pk = k - 1;
644                 }
645                 else
646                 {
647                         ese.op = '-';
648                         ese.neq = fp[k] - fp[k + 1];
649                         ese.pk = k + 1;
650                 }
651                 ese.pi = static_cast<int>(es[ese.pk].size() - 1);
652                 es[k].push_back(ese);
653         };
654
655         const int COUNTMAX = 100000;
656         int count = 0;
657         int k;
658         for (k = -(M+1); k <= (N+1); k++)
659                 fp[k] = -1; 
660         int p = -1;
661         do
662         {
663                 p = p + 1;
664                 for (k = -p; k <= DELTA-1; k++)
665                 {
666                         fp[k] = snake(k, std::max(fp[k-1] + 1, fp[k+1]), M, N, exchanged);
667                         addEditScriptElem(k);
668                         count++;
669                 }
670                 for (k = DELTA + p; k >= DELTA+1; k--)
671                 {
672                         fp[k] = snake(k, std::max(fp[k-1] + 1, fp[k+1]), M, N, exchanged);
673                         addEditScriptElem(k);
674                         count++;
675                 }
676                 k = DELTA;
677                 fp[k] = snake(k, std::max(fp[k-1] + 1, fp[k+1]), M, N, exchanged);
678                 addEditScriptElem(k);
679                 count++;
680
681                 if (count > COUNTMAX)
682                 {
683                         count = 0;
684                         auto end = std::chrono::system_clock::now();
685                         auto msec = std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count();
686                         if (msec > TimeoutMilliSeconds)
687                         {
688                                 delete [] (es - (M+1));
689                                 delete [] (fp - (M+1));
690                                 return -1;
691                         }
692                 }
693         } while (fp[k] != N);
694
695         edscript.clear();
696
697         std::vector<char> ses;
698         int i;
699         for (k = DELTA, i = static_cast<int>(es[DELTA].size() - 1); i >= 0;)
700         {
701                 EditScriptElem& esi = es[k][i];
702                 for (int j = 0; j < esi.neq; ++j)
703                         ses.push_back('=');
704                 ses.push_back(static_cast<char>(esi.op));
705                 i = esi.pi;
706                 k = esi.pk;
707         }
708         std::reverse(ses.begin(), ses.end());
709
710         int D = 0;
711         for (i = 1; i < static_cast<int>(ses.size()); i++)
712         {
713                 switch (ses[i])
714                 {
715                 case '+':
716                         if (static_cast<size_t>(i + 1) < ses.size() && ses[i + 1] == '-')
717                         {
718                                 edscript.push_back('!');
719                                 i++;
720                                 D++;
721                         }
722                         else
723                         {
724                                 edscript.push_back(exchanged ? '-' : '+');
725                                 D++;
726                         }
727                         break;
728                 case '-':
729                         if (static_cast<size_t>(i + 1) < ses.size() && ses[i + 1] == '+')
730                         {
731                                 edscript.push_back('!');
732                                 i++;
733                                 D++;
734                         }
735                         else
736                         {
737                                 edscript.push_back(exchanged ? '+' : '-');
738                                 D++;
739                         }
740                         break;
741                 default:
742                         edscript.push_back('=');
743                 }
744         }
745                 
746         delete [] (es - (M+1));
747         delete [] (fp - (M+1));
748
749         return D;
750 }
751
752 int
753 stringdiffs::snake(int k, int y, int M, int N, bool exchanged)
754 {
755         int x = y - k;
756         if (exchanged)
757         {
758                 while (x < M && y < N && AreWordsSame(m_words1[y + 1], m_words2[x + 1])) {
759                         x++; y++;
760                 }
761         }
762         else
763         {
764                 while (x < M && y < N && AreWordsSame(m_words1[x + 1], m_words2[y + 1])) {
765                         x++; y++;
766                 }
767         }
768         return y;
769 }
770
771 /**
772  * @brief Return true if chars match
773  *
774  * Caller must not call this for lead bytes
775  */
776 static inline bool
777 matchchar(const TCHAR *ch1, const TCHAR *ch2, size_t len, bool casitive)
778 {
779         if (casitive)
780                 return memcmp(ch1, ch2, len * sizeof(TCHAR)) == 0;
781         for (size_t i = 0; i < len; ++i)
782         {
783                 if (_totlower(ch1[i]) != _totlower(ch2[i]))
784                         return false;
785         }
786         return true;
787 }
788
789
790 /** Does character introduce a multicharacter character? */
791 static inline bool IsLeadByte(TCHAR ch)
792 {
793 #ifdef UNICODE
794         return false;
795 #else
796         return _getmbcp() && IsDBCSLeadByte(ch);
797 #endif
798 }
799
800 /**
801  * @brief Is it whitespace (excludes all lead & trail bytes)?
802  */
803 static inline bool
804 isSafeWhitespace(TCHAR ch)
805 {
806         return _istspace((unsigned)ch) && !IsLeadByte(ch);
807 }
808
809 /**
810  * @brief Is it a non-whitespace wordbreak character (ie, punctuation)?
811  */
812 static bool
813 isWordBreak(int breakType, const TCHAR *str, int index, bool ignore_numbers)
814 {
815         TCHAR ch = str[index];
816         if (ignore_numbers && _istdigit(ch))
817                 return true;
818         // breakType==1 means break also on punctuation
819         if ((ch & 0xff00) == 0)
820         {
821 //              TCHAR nextCh = str[index + 1];
822                 // breakType==0 means whitespace only
823                 if (breakType==0)
824                         return false;
825                 return _tcschr(BreakChars, ch) != nullptr;
826         }
827         else 
828         {
829 //              if (
830 //                      ch==0xff0c/* Fullwidth Full Stop */ || 
831 //                      ch==0xff0e/* Fullwidth Comma */ ||
832 //                      ch==0xff1b/* Fullwidth Semicolon */ ||
833 //                      ch==0xff1a/* Fullwidth Colon */ ||
834 //                      ch==0x3002/* Ideographic Full Stop */ || 
835 //                      ch==0x3001/* Ideographic Comma */
836 //                      )
837 //                      return true;
838 //              WORD wCharType, wCharTypeNext;
839 //              GetStringTypeW(CT_CTYPE3, &ch, 1, &wCharType);
840 //              TCHAR nextCh = str[index + 1];
841 //              GetStringTypeW(CT_CTYPE3, &nextCh, 1, &wCharTypeNext);
842 //              return (wCharType != wCharTypeNext);
843 //              
844                 WORD wCharType = 0;
845                 GetStringTypeW(CT_CTYPE1, &ch, 1, &wCharType);
846                 if ((wCharType & (C1_UPPER | C1_LOWER | C1_DIGIT)) != 0)
847                         return false;
848                 return true;
849         }
850 }
851
852
853 /**
854  * @brief advance current pointer over whitespace, until not whitespace or beyond end
855  * @param pcurrent [in,out] current location (to be advanced)
856  * @param end [in] last valid position (only go one beyond this)
857  */
858 static void
859 AdvanceOverWhitespace(const TCHAR **pcurrent, const TCHAR *end)
860 {
861         // advance over whitespace
862         while (*pcurrent <= end && isSafeWhitespace(**pcurrent))
863                 ++(*pcurrent); // DBCS safe because of isSafeWhitespace above
864 }
865
866 /**
867  * @brief Compute begin1,begin2,end1,end2 to display byte difference between strings str1 & str2
868  * @param casitive [in] true for case-sensitive, false for case-insensitive
869  * @param xwhite [in] This governs whether we handle whitespace specially (see WHITESPACE_COMPARE_ALL, WHITESPACE_IGNORE_CHANGE, WHITESPACE_IGNORE_ALL)
870  * @param [out] begin return -1 if not found or pos of equal
871  * @param [out] end return -1 if not found or pos of equal valid if begin1 >=0
872  * @param [in] equal false surch for a diff, true surch for equal
873  *
874  *
875  * Assumes whitespace is never leadbyte or trailbyte!
876  */
877 void
878 stringdiffs::ComputeByteDiff(const String & str1, const String & str2, 
879                    bool casitive, int xwhite, 
880                    int begin[2], int end[2], bool equal)
881 {
882         // Set to sane values
883         // Also this way can distinguish if we set begin[0] to -1 for no diff in line
884         begin[0] = end[0] = begin[1] = end[1] = 0;
885
886         int len1 = static_cast<int>(str1.length());
887         int len2 = static_cast<int>(str2.length());
888
889         const TCHAR *pbeg1 = str1.c_str();
890         const TCHAR *pbeg2 = str2.c_str();
891
892         ICUBreakIterator *pIterCharBegin1 = ICUBreakIterator::getCharacterBreakIterator(reinterpret_cast<const UChar *>(pbeg1), static_cast<int32_t>(len1));
893         ICUBreakIterator *pIterCharBegin2 = ICUBreakIterator::getCharacterBreakIterator<2>(reinterpret_cast<const UChar *>(pbeg2), static_cast<int32_t>(len2));
894         ICUBreakIterator *pIterCharEnd1 = ICUBreakIterator::getCharacterBreakIterator<3>(reinterpret_cast<const UChar *>(pbeg1), static_cast<int32_t>(len1));
895         ICUBreakIterator *pIterCharEnd2 = ICUBreakIterator::getCharacterBreakIterator<4>(reinterpret_cast<const UChar *>(pbeg2), static_cast<int32_t>(len2));
896         
897         if (len1 == 0 || len2 == 0)
898         {
899                 if (len1 == len2)
900                 {
901                         begin[0] = -1;
902                         begin[1] = -1;
903                 }
904                 end[0] = len1 - 1;
905                 end[1] = len2 - 1;
906                 return;
907         }
908
909         // cursors from front, which we advance to beginning of difference
910         const TCHAR *py1 = pbeg1;
911         const TCHAR *py2 = pbeg2;
912
913         // pen1,pen2 point to the last valid character (broken multibyte lead chars don't count)
914         const TCHAR *pen1 = pbeg1 + (len1 > 0 ? pIterCharEnd1->preceding(len1) : 0);
915         const TCHAR *pen2 = pbeg2 + (len2 > 0 ? pIterCharEnd2->preceding(len2) : 0);
916         size_t glyphlenz1 = pbeg1 + len1 - pen1;
917         size_t glyphlenz2 = pbeg2 + len2 - pen2;
918
919         if (xwhite != WHITESPACE_COMPARE_ALL)
920         {
921                 // Ignore leading and trailing whitespace
922                 // by advancing py1 and py2
923                 // and retreating pen1 and pen2
924                 while (py1 < pen1 && isSafeWhitespace(*py1))
925                         py1 = pbeg1 + pIterCharBegin1->next();
926                 while (py2 < pen2 && isSafeWhitespace(*py2))
927                         py2 = pbeg2 + pIterCharBegin2->next();
928                 if ((pen1 < pbeg1 + len1 - 1 || pen2 < pbeg2 + len2 -1)
929                         && (pbeg1[len1] != pbeg2[len2]))
930                 {
931                         // mismatched broken multibyte ends
932                 }
933                 else
934                 {
935                         while (pen1 > py1 && isSafeWhitespace(*pen1))
936                                 pen1 = pbeg1 + pIterCharEnd1->previous();
937                         while (pen2 > py2 && isSafeWhitespace(*pen2))
938                                 pen2 = pbeg2 + pIterCharEnd2->previous();
939                 }
940         }
941         //check for excaption of empty string on one side
942         //In that case display all as a diff
943         if (!equal && (((py1 == pen1) && isSafeWhitespace(*pen1)) ||
944                 ((py2 == pen2) && isSafeWhitespace(*pen2))))
945         {
946                 begin[0] = 0;
947                 begin[1] = 0;
948                 end[0] = len1 - 1;
949                 end[1] = len2 - 1;
950                 return;
951         }
952         // Advance over matching beginnings of lines
953         // Advance py1 & py2 from beginning until find difference or end
954         while (true)
955         {
956                 // Potential difference extends from py1 to pen1 and py2 to pen2
957
958                 // Check if either side finished
959                 if (py1 > pen1 && py2 > pen2)
960                 {
961                         begin[0] = end[0] = begin[1] = end[1] = -1;
962                         break;
963                 }
964                 if (py1 > pen1 || py2 > pen2)
965                 {
966                         break;
967                 }
968
969                 // handle all the whitespace logic (due to WinMerge whitespace settings)
970                 if (xwhite && py1 < pen1 && isSafeWhitespace(*py1))
971                 {
972                         if (xwhite==WHITESPACE_IGNORE_CHANGE && !isSafeWhitespace(*py2))
973                         {
974                                 // py1 is white but py2 is not
975                                 // in WHITESPACE_IGNORE_CHANGE mode,
976                                 // this doesn't qualify as skippable whitespace
977                                 break; // done with forward search
978                         }
979                         // gobble up all whitespace in current area
980                         AdvanceOverWhitespace(&py1, pen1); // will go beyond end
981                         AdvanceOverWhitespace(&py2, pen2); // will go beyond end
982                         continue;
983
984                 }
985                 if (xwhite && py2 < pen2 && isSafeWhitespace(*py2))
986                 {
987                         if (xwhite==WHITESPACE_IGNORE_CHANGE && !isSafeWhitespace(*py1))
988                         {
989                                 // py2 is white but py1 is not
990                                 // in WHITESPACE_IGNORE_CHANGE mode,
991                                 // this doesn't qualify as skippable whitespace
992                                 break; // done with forward search
993                         }
994                         // gobble up all whitespace in current area
995                         AdvanceOverWhitespace(&py1, pen1); // will go beyond end
996                         AdvanceOverWhitespace(&py2, pen2); // will go beyond end
997                         continue;
998                 }
999
1000                 const TCHAR* py1next = pbeg1 + pIterCharBegin1->next();
1001                 const TCHAR* py2next = pbeg2 + pIterCharBegin2->next();
1002                 size_t glyphleny1 = py1next - py1;
1003                 size_t glyphleny2 = py2next - py2;
1004                 if (glyphleny1 != glyphleny2 || !matchchar(py1, py2, glyphleny1, casitive))
1005                         break; // done with forward search
1006                 py1 = py1next;
1007                 py2 = py2next;
1008         }
1009
1010         // Potential difference extends from py1 to pen1 and py2 to pen2
1011
1012         // Store results of advance into return variables (begin[0] & begin[1])
1013         // -1 in a begin variable means no visible diff area
1014         begin[0] = static_cast<int>(py1 - pbeg1);
1015         begin[1] = static_cast<int>(py2 - pbeg2);
1016
1017         const TCHAR *pz1 = pen1;
1018         const TCHAR *pz2 = pen2;
1019
1020         // Retreat over matching ends of lines
1021         // Retreat pz1 & pz2 from end until find difference or beginning
1022         while (true)
1023         {
1024                 // Check if either side finished
1025                 if (pz1 < py1 && pz2 < py2)
1026                 {
1027                         begin[0] = end[0] = begin[1] = end[1] = -1;
1028                         break;
1029                 }
1030                 if (pz1 < py1 || pz2 < py2)
1031                 {
1032                         break;
1033                 }
1034
1035                 // handle all the whitespace logic (due to WinMerge whitespace settings)
1036                 if (xwhite && pz1 > py1 && isSafeWhitespace(*pz1))
1037                 {
1038                         if (xwhite==1 && !isSafeWhitespace(*pz2))
1039                                 break; // done with reverse search
1040                         // gobble up all whitespace in current area
1041                         while (pz1 > py1 && isSafeWhitespace(*pz1))
1042                                 pz1 = pbeg1 + pIterCharEnd1->previous();
1043                         while (pz2 > py2 && isSafeWhitespace(*pz2))
1044                                 pz2 = pbeg2 + pIterCharEnd2->previous();
1045                         continue;
1046
1047                 }
1048                 if (xwhite && pz2 > py2 && isSafeWhitespace(*pz2))
1049                 {
1050                         if (xwhite==1)
1051                                 break; // done with reverse search
1052                         while (pz2 > py2 && isSafeWhitespace(*pz2))
1053                                 pz2 = pbeg2 + pIterCharEnd2->previous();
1054                         continue;
1055                 }
1056
1057                 if (glyphlenz1 != glyphlenz2 || !matchchar(pz1, pz2, glyphlenz1, casitive))
1058                         break; // done with forward search
1059                 const TCHAR* pz1next = pz1;
1060                 const TCHAR* pz2next = pz2;
1061                 pz1 = (pz1 > pbeg1) ? pbeg1 + pIterCharEnd1->preceding(static_cast<int32_t>(pz1 - pbeg1)) : pz1 - 1;
1062                 pz2 = (pz2 > pbeg2) ? pbeg2 + pIterCharEnd2->preceding(static_cast<int32_t>(pz2 - pbeg2)) : pz2 - 1;
1063                 glyphlenz1 = pz1next - pz1;
1064                 glyphlenz2 = pz2next - pz2;
1065                 // Now do real character match
1066     }
1067
1068 /*      if (*pz1 == '\r' && *(pz1+1) == '\n')
1069         {
1070                 pz1++;
1071                 pz2++;
1072         }
1073         else if (*pz2 == '\r' && *(pz2+1) == '\n')
1074         {
1075                 pz2++;
1076                 pz1++;
1077         }
1078         if (*(pbeg1-1) == '\r' && *pbeg1 == '\n')
1079         {
1080                 pbeg1--;
1081                 pbeg2--;
1082         }
1083         else if (*(pbeg2-1) == '\r' && *pbeg2 == '\n')
1084         {
1085                 pbeg2--;
1086                 pbeg1--;
1087         }*/
1088
1089         // Store results of advance into return variables (end[0] & end[1])
1090         end[0] = static_cast<int>(pz1 - pbeg1 + glyphlenz1 - 1);
1091         end[1] = static_cast<int>(pz2 - pbeg2 + glyphlenz2 - 1);
1092
1093         // Check if difference region was empty
1094         if (begin[0] == end[0] + 1 && begin[1] == end[1] + 1)
1095                 begin[0] = -1; // no diff
1096 }
1097
1098 /**
1099  * @brief adjust the range of the specified word diffs down to byte(char) level.
1100  * @param str1, str2 [in] line to be compared
1101  * @param casitive [in] true for case-sensitive, false for case-insensitive
1102  * @param xwhite [in] This governs whether we handle whitespace specially
1103  *  (see WHITESPACE_COMPARE_ALL, WHITESPACE_IGNORE_CHANGE, WHITESPACE_IGNORE_ALL)
1104  */
1105 void stringdiffs::wordLevelToByteLevel()
1106 {
1107         for (size_t i = 0; i < m_wdiffs.size(); i++)
1108         {
1109                 int begin[3], end[3];
1110                 wdiff& diff = m_wdiffs[i];
1111                 String str1_2, str2_2;
1112                 str1_2 = m_str1.substr(diff.begin[0], diff.end[0] - diff.begin[0] + 1);
1113                 str2_2 = m_str2.substr(diff.begin[1], diff.end[1] - diff.begin[1] + 1);
1114                 ComputeByteDiff(str1_2, str2_2, m_case_sensitive, m_whitespace, begin, end, false);
1115                 if (begin[0] == -1)
1116                 {
1117                         // no visible diff on side1
1118                         diff.end[0] = diff.begin[0] - 1;
1119                 }
1120                 else
1121                 {
1122                         diff.end[0] = diff.begin[0] + end[0];
1123                         diff.begin[0] += begin[0];
1124                 }
1125                 if (begin[1] == -1)
1126                 {
1127                         // no visible diff on side2
1128                         diff.end[1] = diff.begin[1] - 1;
1129                 }
1130                 else
1131                 {
1132                         diff.end[1] = diff.begin[1] + end[1];
1133                         diff.begin[1] += begin[1];
1134                 }
1135         }
1136 }
1137
1138 }