OSDN Git Service

FolderCompare.exe: make FolderCompare.exe buildable with MinGW again
[winmerge-jp/winmerge-jp.git] / Src / stringdiffs.cpp
1 /** 
2  * @file  stringdiffs.cpp
3  *
4  * @brief Implementation file for sd_ComputeWordDiffs (q.v.)
5  *
6  */
7
8 #include "stringdiffs.h"
9 #define NOMINMAX
10 #include <windows.h>
11 #include <tchar.h>
12 #include <cstring>
13 #include <cassert>
14 #include <climits>
15 #include <mbctype.h>
16 #include "CompareOptions.h"
17 #include "stringdiffsi.h"
18 #include "Diff3.h"
19
20 using std::vector;
21
22 static bool Initialized;
23 static bool CustomChars;
24 static TCHAR *BreakChars;
25 static TCHAR BreakCharDefaults[] = _T(",.;:");
26
27 static bool isSafeWhitespace(TCHAR ch);
28 static bool isWordBreak(int breakType, const TCHAR *str, int index);
29
30 void sd_Init()
31 {
32         BreakChars = &BreakCharDefaults[0];
33         Initialized = true;
34 }
35
36 void sd_Close()
37 {
38         if (CustomChars)
39         {
40                 free(BreakChars);
41                 BreakChars = NULL;
42                 CustomChars = false;
43         }
44         Initialized = false;
45 }
46
47 void sd_SetBreakChars(const TCHAR *breakChars)
48 {
49         assert(Initialized);
50
51         if (CustomChars)
52                 free(BreakChars);
53
54         CustomChars = true;
55         BreakChars = _tcsdup(breakChars);
56 }
57
58 void
59 sd_ComputeWordDiffs(const String& str1, const String& str2,
60         bool case_sensitive, int whitespace, int breakType, bool byte_level,
61         std::vector<wdiff> * pDiffs)
62 {
63         String strs[3] = {str1, str2, _T("")};
64         sd_ComputeWordDiffs(2, strs, case_sensitive, whitespace, breakType, byte_level, pDiffs);
65 }
66
67 struct Comp02Functor
68 {
69         Comp02Functor(const String *strs, bool case_sensitive) : 
70                 strs_(strs), case_sensitive_(case_sensitive)
71         {
72         }
73         bool operator()(const wdiff &wd3)
74         {
75                 size_t wlen0 = wd3.end[0] - wd3.begin[0] + 1;
76                 size_t wlen2 = wd3.end[2] - wd3.begin[2] + 1;
77                 if (wlen0 != wlen2)
78                         return false;
79                 if (case_sensitive_)
80                 {
81                         if (memcmp(&strs_[0][wd3.begin[0]], &strs_[2][wd3.begin[2]], wlen0 * sizeof(TCHAR)) != 0)
82                                 return false;
83                 }
84                 else
85                 {
86                         if (_tcsnicmp(&strs_[0][wd3.begin[0]], &strs_[2][wd3.begin[2]], wlen0) != 0)
87                                 return false;
88                 }
89                 return true;
90         }
91         const String *strs_;
92         bool case_sensitive_;
93 };
94
95 /**
96  * @brief Construct our worker object and tell it to do the work
97  */
98 void
99 sd_ComputeWordDiffs(int nFiles, const String str[3],
100         bool case_sensitive, int whitespace, int breakType, bool byte_level,
101         std::vector<wdiff> * pDiffs)
102 {
103         if (nFiles == 2)
104         {
105                 stringdiffs sdiffs(str[0], str[1], case_sensitive, whitespace, breakType, pDiffs);
106                 // Hash all words in both lines and then compare them word by word
107                 // storing differences into m_wdiffs
108                 sdiffs.BuildWordDiffList();
109
110                 if (byte_level)
111                         sdiffs.wordLevelToByteLevel();
112
113                 // Now copy m_wdiffs into caller-supplied m_pDiffs (coalescing adjacents if possible)
114                 sdiffs.PopulateDiffs();
115
116         }
117         else
118         {
119                 if (str[0].empty())
120                 {
121                         stringdiffs sdiffs(str[1], str[2], case_sensitive, whitespace, breakType, pDiffs);
122                         sdiffs.BuildWordDiffList();
123                         if (byte_level)
124                                 sdiffs.wordLevelToByteLevel();
125                         sdiffs.PopulateDiffs();
126                         for (size_t i = 0; i < pDiffs->size(); i++)
127                         {
128                                 wdiff& diff = (*pDiffs)[i];
129                                 diff.begin[2] = diff.begin[1];
130                                 diff.begin[1] = diff.begin[0];
131                                 diff.begin[0] = 0;
132                                 diff.end[2] = diff.end[1];
133                                 diff.end[1] = diff.end[0];
134                                 diff.end[0] = -1;
135                         }
136                 }
137                 else if (str[1].empty())
138                 {
139                         stringdiffs sdiffs(str[0], str[2], case_sensitive, whitespace, breakType, pDiffs);
140                         sdiffs.BuildWordDiffList();
141                         if (byte_level)
142                                 sdiffs.wordLevelToByteLevel();
143                         sdiffs.PopulateDiffs();
144                         for (size_t i = 0; i < pDiffs->size(); i++)
145                         {
146                                 wdiff& diff = (*pDiffs)[i];
147                                 diff.begin[2] = diff.begin[1];
148                                 //diff.begin[0] = diff.begin[0];
149                                 diff.begin[1] = 0;
150                                 diff.end[2] = diff.end[1];
151                                 //diff.end[0] = diff.end[0];
152                                 diff.end[1] = -1;
153                         }
154                 }
155                 else if (str[2].empty())
156                 {
157                         stringdiffs sdiffs(str[0], str[1], case_sensitive, whitespace, breakType, pDiffs);
158                         sdiffs.BuildWordDiffList();
159                         if (byte_level)
160                                 sdiffs.wordLevelToByteLevel();
161                         sdiffs.PopulateDiffs();
162                         for (size_t i = 0; i < pDiffs->size(); i++)
163                         {
164                                 wdiff& diff = (*pDiffs)[i];
165                                 //diff.begin[1] = diff.begin[1];
166                                 //diff.begin[0] = diff.begin[0];
167                                 diff.begin[2] = 0;
168                                 //diff.end[1] = diff.end[1];
169                                 //diff.end[0] = diff.end[0];
170                                 diff.end[2] = -1;
171                         }
172                 }
173                 else
174                 {
175                         std::vector<wdiff> diffs10, diffs12;
176                         stringdiffs sdiffs10(str[1], str[0], case_sensitive, whitespace, breakType, &diffs10);
177                         stringdiffs sdiffs12(str[1], str[2], case_sensitive, whitespace, breakType, &diffs12);
178                         // Hash all words in both lines and then compare them word by word
179                         // storing differences into m_wdiffs
180                         sdiffs10.BuildWordDiffList();
181                         sdiffs12.BuildWordDiffList();
182                         if (byte_level)
183                         {
184                                 sdiffs10.wordLevelToByteLevel();
185                                 sdiffs12.wordLevelToByteLevel();
186                         }
187                         // Now copy m_wdiffs into caller-supplied m_pDiffs (coalescing adjacents if possible)
188                         sdiffs10.PopulateDiffs();
189                         sdiffs12.PopulateDiffs();
190
191                         Make3wayDiff(*pDiffs, diffs10, diffs12, 
192                                 Comp02Functor(str, case_sensitive), false);
193                 }
194         }
195 }
196
197 /**
198  * @brief stringdiffs constructor simply loads all members from arguments
199  */
200 stringdiffs::stringdiffs(const String & str1, const String & str2,
201         bool case_sensitive, int whitespace, int breakType,
202         std::vector<wdiff> * pDiffs)
203 : m_str1(str1)
204 , m_str2(str2)
205 , m_case_sensitive(case_sensitive)
206 , m_whitespace(whitespace)
207 , m_breakType(breakType)
208 , m_pDiffs(pDiffs)
209 , m_matchblock(true) // Change to false to get word to word compare
210 {
211 }
212
213 /**
214  * @brief Destructor.
215  * The destructor frees all diffs added to the vectors.
216  */
217 stringdiffs::~stringdiffs()
218 {
219 }
220
221 #ifdef STRINGDIFF_LOGGING
222 void
223 stringdiffs::debugoutput()
224 {
225         for (size_t i = 0; i < m_wdiffs.size(); i++)
226         {
227                 String str1;
228                 String str2;
229                 TCHAR buf[256];
230                 int s1 = m_wdiffs[i].begin[0];
231                 int e1 = m_wdiffs[i].end[0];
232                 int s2 = m_wdiffs[i].begin[1];
233                 int e2 = m_wdiffs[i].end[1];
234
235                 int len1 = e1 - s1 + 1;
236                 int len2 = e2 - s2 + 1;
237
238                 if (len1 < 50)
239                         str1 = m_str1.substr(s1 ,e1 - s1 + 1);
240                 else
241                         str1 = m_str1.substr(s1, 50);
242
243                 if (len2 < 50)
244                         str2 = m_str2.substr(s2, e2- s2 + 1);
245                 else
246                         str2 = m_str2.substr(s2, 50);
247
248                 wsprintf(buf, _T("left=  %s,   %d,%d,\nright= %s,   %d,%d \n"),
249                         str1.c_str(), s1, e1, str2.c_str(), s2, e2);
250                 OutputDebugString(buf);
251         }
252 }
253 #endif
254
255 bool
256 stringdiffs::BuildWordDiffList_DP()
257 {
258         std::vector<char> edscript;
259
260         //if (dp(edscript) <= 0)
261         //      return false;
262         onp(edscript);
263
264         int i = 1, j = 1;
265         for (size_t k = 0; k < edscript.size(); k++)
266         {
267                 int s1, e1, s2, e2;
268                 if (edscript[k] == '-')
269                 {
270                         if (m_whitespace == WHITESPACE_IGNORE_ALL)
271                         {
272                                 if (IsSpace(m_words1[i]))
273                                 {
274                                         i++;
275                                         continue;
276                                 }
277                         }
278                                 
279                         s1 = m_words1[i].start;
280                         e1 = m_words1[i].end;
281                         s2 = m_words2[j-1].end+1;
282                         e2 = s2-1;
283                         m_wdiffs.push_back(wdiff(s1, e1, s2, e2));
284                         i++;
285                 }
286                 else if (edscript[k] == '+')
287                 {
288                         if (m_whitespace == WHITESPACE_IGNORE_ALL)
289                         {
290                                 if (IsSpace(m_words2[j]))
291                                 {
292                                         j++;
293                                         continue;
294                                 }
295                         }
296
297                         s1 = m_words1[i-1].end+1;
298                         e1 = s1-1;
299                         s2 = m_words2[j].start;
300                         e2 = m_words2[j].end;
301                         m_wdiffs.push_back(wdiff(s1, e1, s2, e2));
302                         j++;
303                 }
304                 else if (edscript[k] == '!')
305                 {
306                         if (m_whitespace == WHITESPACE_IGNORE_CHANGE || m_whitespace == WHITESPACE_IGNORE_ALL)
307                         {
308                                 if (IsSpace(m_words1[i]) && IsSpace(m_words2[j]))
309                                 {
310                                         i++; j++;
311                                         continue;
312                                 }
313                         }
314                                 
315                         s1 =  m_words1[i].start;
316                         e1 =  m_words1[i].end;
317                         s2 =  m_words2[j].start;
318                         e2 =  m_words2[j].end ;
319                         m_wdiffs.push_back(wdiff(s1, e1, s2, e2));
320                         i++; j++;
321                 }
322                 else
323                 {
324                         i++; j++;
325                 }
326         }
327 #ifdef STRINGDIFF_LOGGING
328         debugoutput();
329 #endif
330         return true;
331 }
332
333 /**
334  * @brief Add all different elements between lines to the wdiff list
335  */
336 void
337 stringdiffs::BuildWordDiffList()
338 {
339         BuildWordsArray(m_str1, m_words1);
340         BuildWordsArray(m_str2, m_words2);
341
342 #ifdef _WIN64
343         if (m_words1.size() > 20480 || m_words2.size() > 20480)
344 #else
345         if (m_words1.size() > 2048 || m_words2.size() > 2048)
346 #endif
347         {
348                 int s1 = m_words1[0].start;
349                 int e1 = m_words1[m_words1.size() - 1].end;
350                 int s2 = m_words2[0].start;
351                 int e2 = m_words2[m_words2.size() - 1].end;
352                 m_wdiffs.push_back(wdiff(s1, e1, s2, e2));              
353                 return;
354         }
355
356         BuildWordDiffList_DP();
357 }
358
359 /**
360  * @brief Break line into constituent words
361  */
362 void
363 stringdiffs::BuildWordsArray(const String & str, std::vector<word>& words)
364 {
365         int i=0, begin=0;
366
367         // dummy;
368         words.push_back(word(0, -1, 0, 0));
369
370         // state when we are looking for next word
371 inspace:
372         if (isSafeWhitespace(str[i])) 
373         {
374                 ++i;
375                 goto inspace;
376         }
377         if (begin < i)
378         {
379                 // just finished a word
380                 // e is first word character (space or at end)
381                 int e = i - 1;
382
383                 words.push_back(word(begin, e, dlspace, Hash(str, begin, e, 0)));
384         }
385         if (i == str.length())
386                 return;
387         begin = i;
388         goto inword;
389
390         // state when we are inside a word
391 inword:
392         bool atspace=false;
393         if (i == str.length() || ((atspace = isSafeWhitespace(str[i])) != 0) || isWordBreak(m_breakType, str.c_str(), i))
394         {
395                 if (begin<i)
396                 {
397                         // just finished a word
398                         // e is first non-word character (space or at end)
399                         int e = i-1;
400                         
401                         words.push_back(word(begin, e, dlword, Hash(str, begin, e, 0)));
402                 }
403                 if (i == str.length())
404                 {
405                         return;
406                 }
407                 else if (atspace)
408                 {
409                         begin = i;
410                         goto inspace;
411                 }
412                 else
413                 {
414                         // start a new word because we hit a non-whitespace word break (eg, a comma)
415                         // but, we have to put each word break character into its own word
416                         words.push_back(word(i, i, dlbreak, Hash(str, i, i, 0)));
417                         ++i;
418                         begin = i;
419                         goto inword;
420                 }
421         }
422         ++i;
423         goto inword; // safe even if we're at the end or no longer in a word
424 }
425
426 /**
427  * @brief Populate m_pDiffs from m_wdiffs (combining adjacent diffs)
428  *
429  * Doing the combining of adjacent diffs here keeps some complexity out of BuildWordsArray.
430  */
431 void
432 stringdiffs::PopulateDiffs()
433 {
434         for (int i=0; i< (int)m_wdiffs.size(); ++i)
435         {
436                 bool skipIt = false;
437                 // combine it with next ?
438                 if (i+1< (int)m_wdiffs.size())
439                 {
440                         if (m_wdiffs[i].end[0] + 1 == m_wdiffs[i+1].begin[0]
441                                 && m_wdiffs[i].end[1] + 1 == m_wdiffs[i+1].begin[1])
442                         {
443                                 // diff[i] and diff[i+1] are contiguous
444                                 // so combine them into diff[i+1] and ignore diff[i]
445                                 m_wdiffs[i+1].begin[0] = m_wdiffs[i].begin[0];
446                                 m_wdiffs[i+1].begin[1] = m_wdiffs[i].begin[1];
447                                 skipIt = true;
448                         }
449                 }
450                 if (!skipIt)
451                 {
452                         // Should never have a pair where both are missing
453                         assert(m_wdiffs[i].begin[0]>=0 || m_wdiffs[i].begin[1]>=0);
454
455                         // Store the diff[i] in the caller list (m_pDiffs)
456                         m_pDiffs->push_back(wdiff(m_wdiffs[i]));
457                 }
458         }
459 }
460
461 // diffutils hash
462
463 /* Rotate a value n bits to the left. */
464 #define UINT_BIT (sizeof (unsigned) * CHAR_BIT)
465 #define ROL(v, n) ((v) << (n) | (v) >> (UINT_BIT - (n)))
466 /* Given a hash value and a new character, return a new hash value. */
467 #define HASH(h, c) ((c) + ROL (h, 7))
468
469 unsigned
470 stringdiffs::Hash(const String & str, int begin, int end, unsigned h) const
471 {
472         for (int i = begin; i <= end; ++i)
473         {
474                 unsigned ch = static_cast<unsigned>(str[i]);
475                 if (m_case_sensitive)
476                 {
477                         h += HASH(h, ch);
478                 }
479                 else
480                 {
481                         ch = static_cast<unsigned>(_totupper(ch));
482                         h += HASH(h, ch);
483                 }
484         }
485
486         return h;
487 }
488
489 /**
490  * @brief Compare two words (by reference to original strings)
491  */
492 bool
493 stringdiffs::AreWordsSame(const word & word1, const word & word2) const
494 {
495         if (this->m_whitespace != WHITESPACE_COMPARE_ALL)
496         {
497                 if (IsSpace(word1) && IsSpace(word2))
498                         return true;
499         }
500         if (word1.hash != word2.hash)
501                 return false;
502         if (word1.length() != word2.length())
503                 return false;
504         for (int i=0; i<word1.length(); ++i)
505         {
506                 if (!caseMatch(m_str1[word1.start+i], m_str2[word2.start+i]))
507                         return false;
508         }
509         return true;
510 }
511
512 /**
513  * @brief Return true if characters match
514  */
515 bool
516 stringdiffs::caseMatch(TCHAR ch1, TCHAR ch2) const
517 {
518         if (m_case_sensitive) 
519                 return ch1==ch2;
520         else 
521                 return _totupper(ch1)==_totupper(ch2);
522 }
523
524 /**
525  * @ brief An O(NP) Sequence Comparison Algorithm. Sun Wu, Udi Manber, Gene Myers
526  */
527 int
528 stringdiffs::onp(std::vector<char> &edscript)
529 {
530         int M = m_words1.size() - 1;
531         int N = m_words2.size() - 1;
532         bool exchanged = false;
533         if (M > N)
534         {
535                 M = m_words2.size() - 1;
536                 N = m_words1.size() - 1;
537                 exchanged = true;
538         }
539         int *fp = (new int[(M+1) + 1 + (N+1)]) + (M+1);
540         std::vector<char> *es = (new std::vector<char>[(M+1) + 1 + (N+1)]) + (M+1);
541         int DELTA = N - M;
542         
543         int k;
544         for (k = -(M+1); k <= (N+1); k++)
545                 fp[k] = -1; 
546         int p = -1;
547         do
548         {
549                 p = p + 1;
550                 for (k = -p; k <= DELTA-1; k++)
551                 {
552                         fp[k] = snake(k, std::max(fp[k-1] + 1, fp[k+1]), exchanged);
553         
554                         es[k] = fp[k-1] + 1 > fp[k+1] ? es[k-1] : es[k+1];
555                         es[k].push_back(fp[k-1] + 1 > fp[k+1] ? '+' : '-');
556                         es[k].resize(es[k].size() + fp[k] - std::max(fp[k-1] + 1, fp[k+1]), '=');
557                 }
558                 for (k = DELTA + p; k >= DELTA+1; k--)
559                 {
560                         fp[k] = snake(k, std::max(fp[k-1] + 1, fp[k+1]), exchanged);
561         
562                         es[k] = fp[k-1] + 1 > fp[k+1] ? es[k-1] : es[k+1];
563                         es[k].push_back(fp[k-1] + 1 > fp[k+1] ? '+' : '-');
564                         es[k].resize(es[k].size() + fp[k] - std::max(fp[k-1] + 1, fp[k+1]), '=');
565                 }
566                 k = DELTA;
567                 fp[k] = snake(k, std::max(fp[k-1] + 1, fp[k+1]), exchanged);
568         
569                 es[k] = fp[k-1] + 1 > fp[k+1] ? es[k-1] : es[k+1];
570                 es[k].push_back(fp[k-1] + 1 > fp[k+1] ? '+' : '-');
571                 es[k].resize(es[k].size() + fp[k] - std::max(fp[k-1] + 1, fp[k+1]), '=');
572         } while (fp[k] != N);
573
574         std::vector<char> &ses = es[DELTA]; // Shortest edit script
575         edscript.clear();
576
577         int D = 0;
578         for (size_t i = 1; i < ses.size(); i++)
579         {
580                 switch (ses[i])
581                 {
582                 case '+':
583                         if (i + 1 < ses.size() && ses[i + 1] == '-')
584                         {
585                                 edscript.push_back('!');
586                                 i++;
587                                 D++;
588                         }
589                         else
590                         {
591                                 edscript.push_back(exchanged ? '-' : '+');
592                                 D++;
593                         }
594                         break;
595                 case '-':
596                         if (i + 1 < ses.size() && ses[i + 1] == '+')
597                         {
598                                 edscript.push_back('!');
599                                 i++;
600                                 D++;
601                         }
602                         else
603                         {
604                                 edscript.push_back(exchanged ? '+' : '-');
605                                 D++;
606                         }
607                         break;
608                 default:
609                         edscript.push_back('=');
610                 }
611         }
612                 
613         delete [] (es - (M+1));
614         delete [] (fp - (M+1));
615
616         return D;
617 }
618
619 int
620 stringdiffs::snake(int k, int y, bool exchanged)
621 {
622
623         int M = exchanged ? m_words2.size() - 1 : m_words1.size() - 1;
624         int N = exchanged ? m_words1.size() - 1 : m_words2.size() - 1;
625         int x = y - k;
626         while (x < M && y < N && (exchanged ? AreWordsSame(m_words1[y + 1], m_words2[x + 1]) : AreWordsSame(m_words1[x + 1], m_words2[y + 1]))) {
627                 x = x + 1; y = y + 1;
628         }
629         return y;
630 }
631
632 /**
633  * @brief Return true if chars match
634  *
635  * Caller must not call this for lead bytes
636  */
637 static inline bool
638 matchchar(TCHAR ch1, TCHAR ch2, bool casitive)
639 {
640         if (casitive)
641                 return ch1==ch2;
642         else 
643                 return _totupper(ch1)==_totupper(ch2);
644 }
645
646
647 /** Does character introduce a multicharacter character? */
648 static inline bool IsLeadByte(TCHAR ch)
649 {
650 #ifdef UNICODE
651         return false;
652 #else
653         return _getmbcp() && IsDBCSLeadByte(ch);
654 #endif
655 }
656
657 /**
658  * @brief Is it whitespace (excludes all lead & trail bytes)?
659  */
660 static inline bool
661 isSafeWhitespace(TCHAR ch)
662 {
663         return _istspace((unsigned)ch) && !IsLeadByte(ch);
664 }
665
666 /**
667  * @brief Is it a non-whitespace wordbreak character (ie, punctuation)?
668  */
669 static bool
670 isWordBreak(int breakType, const TCHAR *str, int index)
671 {
672         TCHAR ch = str[index];
673         // breakType==1 means break also on punctuation
674 #ifdef _UNICODE
675         if ((ch & 0xff00) == 0)
676         {
677 //              TCHAR nextCh = str[index + 1];
678                 // breakType==0 means whitespace only
679                 if (!breakType)
680                         return false;
681                 return _tcschr(BreakChars, ch) != 0;
682         }
683         else 
684         {
685 //              if (
686 //                      ch==0xff0c/* Fullwidth Full Stop */ || 
687 //                      ch==0xff0e/* Fullwidth Comma */ ||
688 //                      ch==0xff1b/* Fullwidth Semicolon */ ||
689 //                      ch==0xff1a/* Fullwidth Colon */ ||
690 //                      ch==0x3002/* Ideographic Full Stop */ || 
691 //                      ch==0x3001/* Ideographic Comma */
692 //                      )
693 //                      return true;
694 //              WORD wCharType, wCharTypeNext;
695 //              GetStringTypeW(CT_CTYPE3, &ch, 1, &wCharType);
696 //              TCHAR nextCh = str[index + 1];
697 //              GetStringTypeW(CT_CTYPE3, &nextCh, 1, &wCharTypeNext);
698 //              return (wCharType != wCharTypeNext);
699 //              
700                 return true;
701         }
702 #else
703         // breakType==0 means whitespace only
704         if (!breakType)
705                 return false;
706         return _tcschr(BreakChars, ch) != 0;
707 #endif
708 }
709
710
711 /**
712  * @brief Return pointer to last character of specified string (handle MBCS)
713  *
714  * If the last byte is a broken multibyte (ie, a solo lead byte), this returns previous char
715  */
716 static LPCTSTR
717 LastChar(LPCTSTR psz, int len)
718 {
719         if (!len) return psz;
720
721         if (!_getmbcp()) return psz+len-1;
722
723         LPCTSTR lastValid = psz+len-1;
724
725         LPCTSTR prev=psz;
726         while (psz<lastValid)
727         {
728                 prev = psz;
729                 psz = CharNext(psz);
730                 if (prev == psz)
731                         psz++;
732         }
733         if (psz==lastValid && !IsLeadByte(*psz))
734                 return psz;
735         else // last character was multibyte or broken multibyte
736                 return prev;
737 }
738
739 /**
740  * @brief advance current pointer over whitespace, until not whitespace or beyond end
741  * @param pcurrent [in,out] current location (to be advanced)
742  * @param end [in] last valid position (only go one beyond this)
743  */
744 static void
745 AdvanceOverWhitespace(LPCTSTR * pcurrent, LPCTSTR end)
746 {
747         // advance over whitespace
748         while (*pcurrent <= end && isSafeWhitespace(**pcurrent))
749                 ++(*pcurrent); // DBCS safe because of isSafeWhitespace above
750 }
751
752 /**
753  * @brief Compute begin1,begin2,end1,end2 to display byte difference between strings str1 & str2
754  * @param casitive [in] true for case-sensitive, false for case-insensitive
755  * @param xwhite [in] This governs whether we handle whitespace specially (see WHITESPACE_COMPARE_ALL, WHITESPACE_IGNORE_CHANGE, WHITESPACE_IGNORE_ALL)
756  * @param [out] begin return -1 if not found or pos of equal
757  * @param [out] end return -1 if not found or pos of equal valid if begin1 >=0
758  * @param [in] equal false surch for a diff, true surch for equal
759  *
760  *
761  * Assumes whitespace is never leadbyte or trailbyte!
762  */
763 void
764 sd_ComputeByteDiff(String & str1, String & str2, 
765                    bool casitive, int xwhite, 
766                    int begin[2], int end[2], bool equal)
767 {
768         // Set to sane values
769         // Also this way can distinguish if we set begin[0] to -1 for no diff in line
770         begin[0] = end[0] = begin[1] = end[1] = 0;
771
772         int len1 = str1.length();
773         int len2 = str2.length();
774
775         LPCTSTR pbeg1 = str1.c_str();
776         LPCTSTR pbeg2 = str2.c_str();
777
778         if (len1 == 0 || len2 == 0)
779         {
780                 if (len1 == len2)
781                 {
782                         begin[0] = -1;
783                         begin[1] = -1;
784                 }
785                 end[0] = len1 - 1;
786                 end[1] = len2 - 1;
787                 return;
788         }
789
790         // cursors from front, which we advance to beginning of difference
791         LPCTSTR py1 = pbeg1;
792         LPCTSTR py2 = pbeg2;
793
794         // pen1,pen2 point to the last valid character (broken multibyte lead chars don't count)
795         LPCTSTR pen1 = LastChar(py1, len1);
796         LPCTSTR pen2 = LastChar(py2, len2);
797
798         if (xwhite != WHITESPACE_COMPARE_ALL)
799         {
800                 // Ignore leading and trailing whitespace
801                 // by advancing py1 and py2
802                 // and retreating pen1 and pen2
803                 while (py1 < pen1 && isSafeWhitespace(*py1))
804                         ++py1; // DBCS safe because of isSafeWhitespace above
805                 while (py2 < pen2 && isSafeWhitespace(*py2))
806                         ++py2; // DBCS safe because of isSafeWhitespace above
807                 if ((pen1 < pbeg1 + len1 - 1 || pen2 < pbeg2 + len2 -1)
808                         && (!len1 || !len2 || pbeg1[len1] != pbeg2[len2]))
809                 {
810                         // mismatched broken multibyte ends
811                 }
812                 else
813                 {
814                         while (pen1 > py1 && isSafeWhitespace(*pen1))
815                                 pen1 = CharPrev(py1, pen1);
816                         while (pen2 > py2 && isSafeWhitespace(*pen2))
817                                 pen2 = CharPrev(py2, pen2);
818                 }
819         }
820         //check for excaption of empty string on one side
821         //In that case display all as a diff
822         if (!equal && (((py1 == pen1) && isSafeWhitespace(*pen1)) ||
823                 ((py2 == pen2) && isSafeWhitespace(*pen2))))
824         {
825                 begin[0] = 0;
826                 begin[1] = 0;
827                 end[0] = len1 - 1;
828                 end[1] = len2 - 1;
829                 return;
830         }
831         // Advance over matching beginnings of lines
832         // Advance py1 & py2 from beginning until find difference or end
833         while (1)
834         {
835                 // Potential difference extends from py1 to pen1 and py2 to pen2
836
837                 // Check if either side finished
838                 if (py1 > pen1 && py2 > pen2)
839                 {
840                         begin[0] = end[0] = begin[1] = end[1] = -1;
841                         break;
842                 }
843                 if (py1 > pen1 || py2 > pen2)
844                 {
845                         break;
846                 }
847
848                 // handle all the whitespace logic (due to WinMerge whitespace settings)
849                 if (xwhite && py1 < pen1 && isSafeWhitespace(*py1))
850                 {
851                         if (xwhite==WHITESPACE_IGNORE_CHANGE && !isSafeWhitespace(*py2))
852                         {
853                                 // py1 is white but py2 is not
854                                 // in WHITESPACE_IGNORE_CHANGE mode,
855                                 // this doesn't qualify as skippable whitespace
856                                 break; // done with forward search
857                         }
858                         // gobble up all whitespace in current area
859                         AdvanceOverWhitespace(&py1, pen1); // will go beyond end
860                         AdvanceOverWhitespace(&py2, pen2); // will go beyond end
861                         continue;
862
863                 }
864                 if (xwhite && py2 < pen2 && isSafeWhitespace(*py2))
865                 {
866                         if (xwhite==WHITESPACE_IGNORE_CHANGE && !isSafeWhitespace(*py1))
867                         {
868                                 // py2 is white but py1 is not
869                                 // in WHITESPACE_IGNORE_CHANGE mode,
870                                 // this doesn't qualify as skippable whitespace
871                                 break; // done with forward search
872                         }
873                         // gobble up all whitespace in current area
874                         AdvanceOverWhitespace(&py1, pen1); // will go beyond end
875                         AdvanceOverWhitespace(&py2, pen2); // will go beyond end
876                         continue;
877                 }
878
879                 // Now do real character match
880                 if (IsLeadByte(*py1))
881                 {
882                         if (!IsLeadByte(*py2))
883                                 break; // done with forward search
884                         // DBCS (we assume if a lead byte, then character is 2-byte)
885                         if (!(py1[0] == py2[0] && py1[1] == py2[1]))
886                                 break; // done with forward search
887                         py1 += 2; // DBCS specific
888                         py2 += 2; // DBCS specific
889                 }
890                 else
891                 {
892                         if (IsLeadByte(*py2))
893                                 break; // done with forward search
894                         if (!matchchar(py1[0], py2[0], casitive))
895                                 break; // done with forward search
896                         ++py1; // DBCS safe b/c we checked above
897                         ++py2; // DBCS safe b/c we checked above
898                 }
899         }
900
901         // Potential difference extends from py1 to pen1 and py2 to pen2
902
903         // Store results of advance into return variables (begin[0] & begin[1])
904         // -1 in a begin variable means no visible diff area
905         begin[0] = py1 - pbeg1;
906         begin[1] = py2 - pbeg2;
907
908         LPCTSTR pz1 = pen1;
909         LPCTSTR pz2 = pen2;
910
911         // Retreat over matching ends of lines
912         // Retreat pz1 & pz2 from end until find difference or beginning
913         while (1)
914         {
915                 // Check if either side finished
916                 if (pz1 < py1 && pz2 < py2)
917                 {
918                         begin[0] = end[0] = begin[1] = end[1] = -1;
919                         break;
920                 }
921                 if (pz1 < py1 || pz2 < py2)
922                 {
923                         break;
924                 }
925
926                 // handle all the whitespace logic (due to WinMerge whitespace settings)
927                 if (xwhite && pz1 > py1 && isSafeWhitespace(*pz1))
928                 {
929                         if (xwhite==1 && !isSafeWhitespace(*pz2))
930                                 break; // done with reverse search
931                         // gobble up all whitespace in current area
932                         while (pz1 > py1 && isSafeWhitespace(*pz1))
933                                 pz1 = CharPrev(py1, pz1);
934                         while (pz2 > py2 && isSafeWhitespace(*pz2))
935                                 pz2 = CharPrev(py2, pz2);
936                         continue;
937
938                 }
939                 if (xwhite && pz2 > py2 && isSafeWhitespace(*pz2))
940                 {
941                         if (xwhite==1)
942                                 break; // done with reverse search
943                         while (pz2 > py2 && isSafeWhitespace(*pz2))
944                                 pz2 = CharPrev(py2, pz2);
945                         continue;
946                 }
947
948                 // Now do real character match
949                 if (IsLeadByte(*pz1))
950                 {
951                         if (!IsLeadByte(*pz2))
952                                 break; // done with forward search
953                         // DBCS (we assume if a lead byte, then character is 2-byte)
954                         if (!(pz1[0] == pz2[0] && pz1[1] == pz2[1]))
955                                 break; // done with forward search
956                 }
957                 else
958                 {
959                         if (IsLeadByte(*pz2))
960                                 break; // done with forward search
961                         if (!matchchar(pz1[0], pz2[0], casitive))
962                                 break; // done with forward search
963                 }
964                 pz1 = (pz1 > pbeg1) ? CharPrev(pbeg1, pz1) : pz1 - 1;
965                 pz2 = (pz2 > pbeg2) ? CharPrev(pbeg2, pz2) : pz2 - 1;
966     }
967
968 /*      if (*pz1 == '\r' && *(pz1+1) == '\n')
969         {
970                 pz1++;
971                 pz2++;
972         }
973         else if (*pz2 == '\r' && *(pz2+1) == '\n')
974         {
975                 pz2++;
976                 pz1++;
977         }
978         if (*(pbeg1-1) == '\r' && *pbeg1 == '\n')
979         {
980                 pbeg1--;
981                 pbeg2--;
982         }
983         else if (*(pbeg2-1) == '\r' && *pbeg2 == '\n')
984         {
985                 pbeg2--;
986                 pbeg1--;
987         }*/
988
989         // Store results of advance into return variables (end[0] & end[1])
990         end[0] = pz1 - pbeg1;
991         end[1] = pz2 - pbeg2;
992
993         // Check if difference region was empty
994         if (begin[0] == end[0] + 1 && begin[1] == end[1] + 1)
995                 begin[0] = -1; // no diff
996 }
997
998 /**
999  * @brief adjust the range of the specified word diffs down to byte(char) level.
1000  * @param str1, str2 [in] line to be compared
1001  * @param casitive [in] true for case-sensitive, false for case-insensitive
1002  * @param xwhite [in] This governs whether we handle whitespace specially
1003  *  (see WHITESPACE_COMPARE_ALL, WHITESPACE_IGNORE_CHANGE, WHITESPACE_IGNORE_ALL)
1004  */
1005 void stringdiffs::wordLevelToByteLevel()
1006 {
1007         for (size_t i = 0; i < m_wdiffs.size(); i++)
1008         {
1009                 int begin[3], end[3];
1010                 wdiff& diff = m_wdiffs[i];
1011                 String str1_2, str2_2;
1012                 str1_2 = m_str1.substr(diff.begin[0], diff.end[0] - diff.begin[0] + 1);
1013                 str2_2 = m_str2.substr(diff.begin[1], diff.end[1] - diff.begin[1] + 1);
1014                 sd_ComputeByteDiff(str1_2, str2_2, m_case_sensitive, m_whitespace, begin, end, false);
1015                 if (begin[0] == -1)
1016                 {
1017                         // no visible diff on side1
1018                         diff.end[0] = diff.begin[0] - 1;
1019                 }
1020                 else
1021                 {
1022                         diff.end[0] = diff.begin[0] + end[0];
1023                         diff.begin[0] += begin[0];
1024                 }
1025                 if (begin[1] == -1)
1026                 {
1027                         // no visible diff on side2
1028                         diff.end[1] = diff.begin[1] - 1;
1029                 }
1030                 else
1031                 {
1032                         diff.end[1] = diff.begin[1] + end[1];
1033                         diff.begin[1] += begin[1];
1034                 }
1035         }
1036 }
1037