OSDN Git Service

Cppcheck: Redundant assignment of '...' to itself.
[winmerge-jp/winmerge-jp.git] / Src / stringdiffs.cpp
1 /** 
2  * @file  stringdiffs.cpp
3  *
4  * @brief Implementation file for sd_ComputeWordDiffs (q.v.)
5  *
6  */
7
8 #include "stringdiffs.h"
9 #define NOMINMAX
10 #include <windows.h>
11 #include <tchar.h>
12 #include <cassert>
13 #include <climits>
14 #include <mbctype.h>
15 #include "CompareOptions.h"
16 #include "stringdiffsi.h"
17 #include "Diff3.h"
18
19 using std::vector;
20
21 static bool Initialized;
22 static bool CustomChars;
23 static TCHAR *BreakChars;
24 static TCHAR BreakCharDefaults[] = _T(",.;:");
25
26 static bool isSafeWhitespace(TCHAR ch);
27 static bool isWordBreak(int breakType, const TCHAR *str, int index);
28
29 void sd_Init()
30 {
31         BreakChars = &BreakCharDefaults[0];
32         Initialized = true;
33 }
34
35 void sd_Close()
36 {
37         if (CustomChars)
38         {
39                 free(BreakChars);
40                 BreakChars = NULL;
41                 CustomChars = false;
42         }
43         Initialized = false;
44 }
45
46 void sd_SetBreakChars(const TCHAR *breakChars)
47 {
48         assert(Initialized);
49
50         if (CustomChars)
51                 free(BreakChars);
52
53         CustomChars = true;
54         BreakChars = _tcsdup(breakChars);
55 }
56
57 void
58 sd_ComputeWordDiffs(const String& str1, const String& str2,
59         bool case_sensitive, int whitespace, int breakType, bool byte_level,
60         std::vector<wdiff> * pDiffs)
61 {
62         String strs[3] = {str1, str2, _T("")};
63         sd_ComputeWordDiffs(2, strs, case_sensitive, whitespace, breakType, byte_level, pDiffs);
64 }
65
66 struct Comp02Functor
67 {
68         Comp02Functor(const String *strs, bool case_sensitive) : 
69                 strs_(strs), case_sensitive_(case_sensitive)
70         {
71         }
72         bool operator()(const wdiff &wd3)
73         {
74                 size_t wlen0 = wd3.end[0] - wd3.begin[0] + 1;
75                 size_t wlen2 = wd3.end[2] - wd3.begin[2] + 1;
76                 if (wlen0 != wlen2)
77                         return false;
78                 if (case_sensitive_)
79                 {
80                         if (memcmp(&strs_[0][wd3.begin[0]], &strs_[2][wd3.begin[2]], wlen0 * sizeof(TCHAR)) != 0)
81                                 return false;
82                 }
83                 else
84                 {
85                         if (_tcsnicmp(&strs_[0][wd3.begin[0]], &strs_[2][wd3.begin[2]], wlen0) != 0)
86                                 return false;
87                 }
88                 return true;
89         }
90         const String *strs_;
91         bool case_sensitive_;
92 };
93
94 /**
95  * @brief Construct our worker object and tell it to do the work
96  */
97 void
98 sd_ComputeWordDiffs(int nFiles, const String str[3],
99         bool case_sensitive, int whitespace, int breakType, bool byte_level,
100         std::vector<wdiff> * pDiffs)
101 {
102         if (nFiles == 2)
103         {
104                 stringdiffs sdiffs(str[0], str[1], case_sensitive, whitespace, breakType, pDiffs);
105                 // Hash all words in both lines and then compare them word by word
106                 // storing differences into m_wdiffs
107                 sdiffs.BuildWordDiffList();
108
109                 if (byte_level)
110                         sdiffs.wordLevelToByteLevel();
111
112                 // Now copy m_wdiffs into caller-supplied m_pDiffs (coalescing adjacents if possible)
113                 sdiffs.PopulateDiffs();
114
115         }
116         else
117         {
118                 if (str[0].empty())
119                 {
120                         stringdiffs sdiffs(str[1], str[2], case_sensitive, whitespace, breakType, pDiffs);
121                         sdiffs.BuildWordDiffList();
122                         if (byte_level)
123                                 sdiffs.wordLevelToByteLevel();
124                         sdiffs.PopulateDiffs();
125                         for (size_t i = 0; i < pDiffs->size(); i++)
126                         {
127                                 wdiff& diff = (*pDiffs)[i];
128                                 diff.begin[2] = diff.begin[1];
129                                 diff.begin[1] = diff.begin[0];
130                                 diff.begin[0] = 0;
131                                 diff.end[2] = diff.end[1];
132                                 diff.end[1] = diff.end[0];
133                                 diff.end[0] = -1;
134                         }
135                 }
136                 else if (str[1].empty())
137                 {
138                         stringdiffs sdiffs(str[0], str[2], case_sensitive, whitespace, breakType, pDiffs);
139                         sdiffs.BuildWordDiffList();
140                         if (byte_level)
141                                 sdiffs.wordLevelToByteLevel();
142                         sdiffs.PopulateDiffs();
143                         for (size_t i = 0; i < pDiffs->size(); i++)
144                         {
145                                 wdiff& diff = (*pDiffs)[i];
146                                 diff.begin[2] = diff.begin[1];
147                                 //diff.begin[0] = diff.begin[0];
148                                 diff.begin[1] = 0;
149                                 diff.end[2] = diff.end[1];
150                                 //diff.end[0] = diff.end[0];
151                                 diff.end[1] = -1;
152                         }
153                 }
154                 else if (str[2].empty())
155                 {
156                         stringdiffs sdiffs(str[0], str[1], case_sensitive, whitespace, breakType, pDiffs);
157                         sdiffs.BuildWordDiffList();
158                         if (byte_level)
159                                 sdiffs.wordLevelToByteLevel();
160                         sdiffs.PopulateDiffs();
161                         for (size_t i = 0; i < pDiffs->size(); i++)
162                         {
163                                 wdiff& diff = (*pDiffs)[i];
164                                 //diff.begin[1] = diff.begin[1];
165                                 //diff.begin[0] = diff.begin[0];
166                                 diff.begin[2] = 0;
167                                 //diff.end[1] = diff.end[1];
168                                 //diff.end[0] = diff.end[0];
169                                 diff.end[2] = -1;
170                         }
171                 }
172                 else
173                 {
174                         std::vector<wdiff> diffs10, diffs12;
175                         stringdiffs sdiffs10(str[1], str[0], case_sensitive, whitespace, breakType, &diffs10);
176                         stringdiffs sdiffs12(str[1], str[2], case_sensitive, whitespace, breakType, &diffs12);
177                         // Hash all words in both lines and then compare them word by word
178                         // storing differences into m_wdiffs
179                         sdiffs10.BuildWordDiffList();
180                         sdiffs12.BuildWordDiffList();
181                         if (byte_level)
182                         {
183                                 sdiffs10.wordLevelToByteLevel();
184                                 sdiffs12.wordLevelToByteLevel();
185                         }
186                         // Now copy m_wdiffs into caller-supplied m_pDiffs (coalescing adjacents if possible)
187                         sdiffs10.PopulateDiffs();
188                         sdiffs12.PopulateDiffs();
189
190                         Make3wayDiff(*pDiffs, diffs10, diffs12, 
191                                 Comp02Functor(str, case_sensitive), false);
192                 }
193         }
194 }
195
196 /**
197  * @brief stringdiffs constructor simply loads all members from arguments
198  */
199 stringdiffs::stringdiffs(const String & str1, const String & str2,
200         bool case_sensitive, int whitespace, int breakType,
201         std::vector<wdiff> * pDiffs)
202 : m_str1(str1)
203 , m_str2(str2)
204 , m_case_sensitive(case_sensitive)
205 , m_whitespace(whitespace)
206 , m_breakType(breakType)
207 , m_pDiffs(pDiffs)
208 , m_matchblock(true) // Change to false to get word to word compare
209 {
210 }
211
212 /**
213  * @brief Destructor.
214  * The destructor frees all diffs added to the vectors.
215  */
216 stringdiffs::~stringdiffs()
217 {
218 }
219
220 #ifdef STRINGDIFF_LOGGING
221 void
222 stringdiffs::debugoutput()
223 {
224         for (size_t i = 0; i < m_wdiffs.size(); i++)
225         {
226                 String str1;
227                 String str2;
228                 TCHAR buf[256];
229                 int s1 = m_wdiffs[i].begin[0];
230                 int e1 = m_wdiffs[i].end[0];
231                 int s2 = m_wdiffs[i].begin[1];
232                 int e2 = m_wdiffs[i].end[1];
233
234                 int len1 = e1 - s1 + 1;
235                 int len2 = e2 - s2 + 1;
236
237                 if (len1 < 50)
238                         str1 = m_str1.substr(s1 ,e1 - s1 + 1);
239                 else
240                         str1 = m_str1.substr(s1, 50);
241
242                 if (len2 < 50)
243                         str2 = m_str2.substr(s2, e2- s2 + 1);
244                 else
245                         str2 = m_str2.substr(s2, 50);
246
247                 wsprintf(buf, _T("left=  %s,   %d,%d,\nright= %s,   %d,%d \n"),
248                         str1.c_str(), s1, e1, str2.c_str(), s2, e2);
249                 OutputDebugString(buf);
250         }
251 }
252 #endif
253
254 bool
255 stringdiffs::BuildWordDiffList_DP()
256 {
257         std::vector<char> edscript;
258
259         //if (dp(edscript) <= 0)
260         //      return false;
261         onp(edscript);
262
263         int i = 1, j = 1;
264         for (size_t k = 0; k < edscript.size(); k++)
265         {
266                 int s1, e1, s2, e2;
267                 if (edscript[k] == '-')
268                 {
269                         if (m_whitespace == WHITESPACE_IGNORE_ALL)
270                         {
271                                 if (IsSpace(m_words1[i]))
272                                 {
273                                         i++;
274                                         continue;
275                                 }
276                         }
277                                 
278                         s1 = m_words1[i].start;
279                         e1 = m_words1[i].end;
280                         s2 = m_words2[j-1].end+1;
281                         e2 = s2-1;
282                         m_wdiffs.push_back(wdiff(s1, e1, s2, e2));
283                         i++;
284                 }
285                 else if (edscript[k] == '+')
286                 {
287                         if (m_whitespace == WHITESPACE_IGNORE_ALL)
288                         {
289                                 if (IsSpace(m_words2[j]))
290                                 {
291                                         j++;
292                                         continue;
293                                 }
294                         }
295
296                         s1 = m_words1[i-1].end+1;
297                         e1 = s1-1;
298                         s2 = m_words2[j].start;
299                         e2 = m_words2[j].end;
300                         m_wdiffs.push_back(wdiff(s1, e1, s2, e2));
301                         j++;
302                 }
303                 else if (edscript[k] == '!')
304                 {
305                         if (m_whitespace == WHITESPACE_IGNORE_CHANGE || m_whitespace == WHITESPACE_IGNORE_ALL)
306                         {
307                                 if (IsSpace(m_words1[i]) && IsSpace(m_words2[j]))
308                                 {
309                                         i++; j++;
310                                         continue;
311                                 }
312                         }
313                                 
314                         s1 =  m_words1[i].start;
315                         e1 =  m_words1[i].end;
316                         s2 =  m_words2[j].start;
317                         e2 =  m_words2[j].end ;
318                         m_wdiffs.push_back(wdiff(s1, e1, s2, e2));
319                         i++; j++;
320                 }
321                 else
322                 {
323                         i++; j++;
324                 }
325         }
326 #ifdef STRINGDIFF_LOGGING
327         debugoutput();
328 #endif
329         return true;
330 }
331
332 /**
333  * @brief Add all different elements between lines to the wdiff list
334  */
335 void
336 stringdiffs::BuildWordDiffList()
337 {
338         BuildWordsArray(m_str1, m_words1);
339         BuildWordsArray(m_str2, m_words2);
340
341 #ifdef _WIN64
342         if (m_words1.size() > 20480 || m_words2.size() > 20480)
343 #else
344         if (m_words1.size() > 2048 || m_words2.size() > 2048)
345 #endif
346         {
347                 int s1 = m_words1[0].start;
348                 int e1 = m_words1[m_words1.size() - 1].end;
349                 int s2 = m_words2[0].start;
350                 int e2 = m_words2[m_words2.size() - 1].end;
351                 m_wdiffs.push_back(wdiff(s1, e1, s2, e2));              
352                 return;
353         }
354
355         BuildWordDiffList_DP();
356 }
357
358 /**
359  * @brief Break line into constituent words
360  */
361 void
362 stringdiffs::BuildWordsArray(const String & str, std::vector<word>& words)
363 {
364         int i=0, begin=0;
365
366         // dummy;
367         words.push_back(word(0, -1, 0, 0));
368
369         // state when we are looking for next word
370 inspace:
371         if (isSafeWhitespace(str[i])) 
372         {
373                 ++i;
374                 goto inspace;
375         }
376         if (begin < i)
377         {
378                 // just finished a word
379                 // e is first word character (space or at end)
380                 int e = i - 1;
381
382                 words.push_back(word(begin, e, dlspace, Hash(str, begin, e, 0)));
383         }
384         if (i == str.length())
385                 return;
386         begin = i;
387         goto inword;
388
389         // state when we are inside a word
390 inword:
391         bool atspace=false;
392         if (i == str.length() || ((atspace = isSafeWhitespace(str[i])) != 0) || isWordBreak(m_breakType, str.c_str(), i))
393         {
394                 if (begin<i)
395                 {
396                         // just finished a word
397                         // e is first non-word character (space or at end)
398                         int e = i-1;
399                         
400                         words.push_back(word(begin, e, dlword, Hash(str, begin, e, 0)));
401                 }
402                 if (i == str.length())
403                 {
404                         return;
405                 }
406                 else if (atspace)
407                 {
408                         begin = i;
409                         goto inspace;
410                 }
411                 else
412                 {
413                         // start a new word because we hit a non-whitespace word break (eg, a comma)
414                         // but, we have to put each word break character into its own word
415                         words.push_back(word(i, i, dlbreak, Hash(str, i, i, 0)));
416                         ++i;
417                         begin = i;
418                         goto inword;
419                 }
420         }
421         ++i;
422         goto inword; // safe even if we're at the end or no longer in a word
423 }
424
425 /**
426  * @brief Populate m_pDiffs from m_wdiffs (combining adjacent diffs)
427  *
428  * Doing the combining of adjacent diffs here keeps some complexity out of BuildWordsArray.
429  */
430 void
431 stringdiffs::PopulateDiffs()
432 {
433         for (int i=0; i< (int)m_wdiffs.size(); ++i)
434         {
435                 bool skipIt = false;
436                 // combine it with next ?
437                 if (i+1< (int)m_wdiffs.size())
438                 {
439                         if (m_wdiffs[i].end[0] + 1 == m_wdiffs[i+1].begin[0]
440                                 && m_wdiffs[i].end[1] + 1 == m_wdiffs[i+1].begin[1])
441                         {
442                                 // diff[i] and diff[i+1] are contiguous
443                                 // so combine them into diff[i+1] and ignore diff[i]
444                                 m_wdiffs[i+1].begin[0] = m_wdiffs[i].begin[0];
445                                 m_wdiffs[i+1].begin[1] = m_wdiffs[i].begin[1];
446                                 skipIt = true;
447                         }
448                 }
449                 if (!skipIt)
450                 {
451                         // Should never have a pair where both are missing
452                         assert(m_wdiffs[i].begin[0]>=0 || m_wdiffs[i].begin[1]>=0);
453
454                         // Store the diff[i] in the caller list (m_pDiffs)
455                         m_pDiffs->push_back(wdiff(m_wdiffs[i]));
456                 }
457         }
458 }
459
460 // diffutils hash
461
462 /* Rotate a value n bits to the left. */
463 #define UINT_BIT (sizeof (unsigned) * CHAR_BIT)
464 #define ROL(v, n) ((v) << (n) | (v) >> (UINT_BIT - (n)))
465 /* Given a hash value and a new character, return a new hash value. */
466 #define HASH(h, c) ((c) + ROL (h, 7))
467
468 unsigned
469 stringdiffs::Hash(const String & str, int begin, int end, unsigned h) const
470 {
471         for (int i = begin; i <= end; ++i)
472         {
473                 unsigned ch = static_cast<unsigned>(str[i]);
474                 if (m_case_sensitive)
475                 {
476                         h += HASH(h, ch);
477                 }
478                 else
479                 {
480                         ch = static_cast<unsigned>(_totupper(ch));
481                         h += HASH(h, ch);
482                 }
483         }
484
485         return h;
486 }
487
488 /**
489  * @brief Compare two words (by reference to original strings)
490  */
491 bool
492 stringdiffs::AreWordsSame(const word & word1, const word & word2) const
493 {
494         if (this->m_whitespace != WHITESPACE_COMPARE_ALL)
495         {
496                 if (IsSpace(word1) && IsSpace(word2))
497                         return true;
498         }
499         if (word1.hash != word2.hash)
500                 return false;
501         if (word1.length() != word2.length())
502                 return false;
503         for (int i=0; i<word1.length(); ++i)
504         {
505                 if (!caseMatch(m_str1[word1.start+i], m_str2[word2.start+i]))
506                         return false;
507         }
508         return true;
509 }
510
511 /**
512  * @brief Return true if characters match
513  */
514 bool
515 stringdiffs::caseMatch(TCHAR ch1, TCHAR ch2) const
516 {
517         if (m_case_sensitive) 
518                 return ch1==ch2;
519         else 
520                 return _totupper(ch1)==_totupper(ch2);
521 }
522
523 /**
524  * @ brief An O(NP) Sequence Comparison Algorithm. Sun Wu, Udi Manber, Gene Myers
525  */
526 int
527 stringdiffs::onp(std::vector<char> &edscript)
528 {
529         int M = m_words1.size() - 1;
530         int N = m_words2.size() - 1;
531         bool exchanged = false;
532         if (M > N)
533         {
534                 M = m_words2.size() - 1;
535                 N = m_words1.size() - 1;
536                 exchanged = true;
537         }
538         int *fp = (new int[(M+1) + 1 + (N+1)]) + (M+1);
539         std::vector<char> *es = (new std::vector<char>[(M+1) + 1 + (N+1)]) + (M+1);
540         int DELTA = N - M;
541         
542         int k;
543         for (k = -(M+1); k <= (N+1); k++)
544                 fp[k] = -1; 
545         int p = -1;
546         do
547         {
548                 p = p + 1;
549                 for (k = -p; k <= DELTA-1; k++)
550                 {
551                         fp[k] = snake(k, std::max(fp[k-1] + 1, fp[k+1]), exchanged);
552         
553                         es[k] = fp[k-1] + 1 > fp[k+1] ? es[k-1] : es[k+1];
554                         es[k].push_back(fp[k-1] + 1 > fp[k+1] ? '+' : '-');
555                         es[k].resize(es[k].size() + fp[k] - std::max(fp[k-1] + 1, fp[k+1]), '=');
556                 }
557                 for (k = DELTA + p; k >= DELTA+1; k--)
558                 {
559                         fp[k] = snake(k, std::max(fp[k-1] + 1, fp[k+1]), exchanged);
560         
561                         es[k] = fp[k-1] + 1 > fp[k+1] ? es[k-1] : es[k+1];
562                         es[k].push_back(fp[k-1] + 1 > fp[k+1] ? '+' : '-');
563                         es[k].resize(es[k].size() + fp[k] - std::max(fp[k-1] + 1, fp[k+1]), '=');
564                 }
565                 k = DELTA;
566                 fp[k] = snake(k, std::max(fp[k-1] + 1, fp[k+1]), exchanged);
567         
568                 es[k] = fp[k-1] + 1 > fp[k+1] ? es[k-1] : es[k+1];
569                 es[k].push_back(fp[k-1] + 1 > fp[k+1] ? '+' : '-');
570                 es[k].resize(es[k].size() + fp[k] - std::max(fp[k-1] + 1, fp[k+1]), '=');
571         } while (fp[k] != N);
572
573         std::vector<char> &ses = es[DELTA]; // Shortest edit script
574         edscript.clear();
575
576         int D = 0;
577         for (size_t i = 1; i < ses.size(); i++)
578         {
579                 switch (ses[i])
580                 {
581                 case '+':
582                         if (i + 1 < ses.size() && ses[i + 1] == '-')
583                         {
584                                 edscript.push_back('!');
585                                 i++;
586                                 D++;
587                         }
588                         else
589                         {
590                                 edscript.push_back(exchanged ? '-' : '+');
591                                 D++;
592                         }
593                         break;
594                 case '-':
595                         if (i + 1 < ses.size() && ses[i + 1] == '+')
596                         {
597                                 edscript.push_back('!');
598                                 i++;
599                                 D++;
600                         }
601                         else
602                         {
603                                 edscript.push_back(exchanged ? '+' : '-');
604                                 D++;
605                         }
606                         break;
607                 default:
608                         edscript.push_back('=');
609                 }
610         }
611                 
612         delete [] (es - (M+1));
613         delete [] (fp - (M+1));
614
615         return D;
616 }
617
618 int
619 stringdiffs::snake(int k, int y, bool exchanged)
620 {
621
622         int M = exchanged ? m_words2.size() - 1 : m_words1.size() - 1;
623         int N = exchanged ? m_words1.size() - 1 : m_words2.size() - 1;
624         int x = y - k;
625         while (x < M && y < N && (exchanged ? AreWordsSame(m_words1[y + 1], m_words2[x + 1]) : AreWordsSame(m_words1[x + 1], m_words2[y + 1]))) {
626                 x = x + 1; y = y + 1;
627         }
628         return y;
629 }
630
631 /**
632  * @brief Return true if chars match
633  *
634  * Caller must not call this for lead bytes
635  */
636 static inline bool
637 matchchar(TCHAR ch1, TCHAR ch2, bool casitive)
638 {
639         if (casitive)
640                 return ch1==ch2;
641         else 
642                 return _totupper(ch1)==_totupper(ch2);
643 }
644
645
646 /** Does character introduce a multicharacter character? */
647 static inline bool IsLeadByte(TCHAR ch)
648 {
649 #ifdef UNICODE
650         return false;
651 #else
652         return _getmbcp() && IsDBCSLeadByte(ch);
653 #endif
654 }
655
656 /**
657  * @brief Is it whitespace (excludes all lead & trail bytes)?
658  */
659 static inline bool
660 isSafeWhitespace(TCHAR ch)
661 {
662         return _istspace((unsigned)ch) && !IsLeadByte(ch);
663 }
664
665 /**
666  * @brief Is it a non-whitespace wordbreak character (ie, punctuation)?
667  */
668 static bool
669 isWordBreak(int breakType, const TCHAR *str, int index)
670 {
671         TCHAR ch = str[index];
672         // breakType==1 means break also on punctuation
673 #ifdef _UNICODE
674         if ((ch & 0xff00) == 0)
675         {
676                 TCHAR nextCh = str[index + 1];
677                 // breakType==0 means whitespace only
678                 if (!breakType)
679                         return false;
680                 return _tcschr(BreakChars, ch) != 0;
681         }
682         else 
683         {
684 //              if (
685 //                      ch==0xff0c/* Fullwidth Full Stop */ || 
686 //                      ch==0xff0e/* Fullwidth Comma */ ||
687 //                      ch==0xff1b/* Fullwidth Semicolon */ ||
688 //                      ch==0xff1a/* Fullwidth Colon */ ||
689 //                      ch==0x3002/* Ideographic Full Stop */ || 
690 //                      ch==0x3001/* Ideographic Comma */
691 //                      )
692 //                      return true;
693 //              WORD wCharType, wCharTypeNext;
694 //              GetStringTypeW(CT_CTYPE3, &ch, 1, &wCharType);
695 //              TCHAR nextCh = str[index + 1];
696 //              GetStringTypeW(CT_CTYPE3, &nextCh, 1, &wCharTypeNext);
697 //              return (wCharType != wCharTypeNext);
698 //              
699                 return true;
700         }
701 #else
702         // breakType==0 means whitespace only
703         if (!breakType)
704                 return false;
705         return _tcschr(BreakChars, ch) != 0;
706 #endif
707 }
708
709
710 /**
711  * @brief Return pointer to last character of specified string (handle MBCS)
712  *
713  * If the last byte is a broken multibyte (ie, a solo lead byte), this returns previous char
714  */
715 static LPCTSTR
716 LastChar(LPCTSTR psz, int len)
717 {
718         if (!len) return psz;
719
720         if (!_getmbcp()) return psz+len-1;
721
722         LPCTSTR lastValid = psz+len-1;
723
724         LPCTSTR prev=psz;
725         while (psz<lastValid)
726         {
727                 prev = psz;
728                 psz = CharNext(psz);
729                 if (prev == psz)
730                         psz++;
731         }
732         if (psz==lastValid && !IsLeadByte(*psz))
733                 return psz;
734         else // last character was multibyte or broken multibyte
735                 return prev;
736 }
737
738 /**
739  * @brief advance current pointer over whitespace, until not whitespace or beyond end
740  * @param pcurrent [in,out] current location (to be advanced)
741  * @param end [in] last valid position (only go one beyond this)
742  */
743 static void
744 AdvanceOverWhitespace(LPCTSTR * pcurrent, LPCTSTR end)
745 {
746         // advance over whitespace
747         while (*pcurrent <= end && isSafeWhitespace(**pcurrent))
748                 ++(*pcurrent); // DBCS safe because of isSafeWhitespace above
749 }
750
751 /**
752  * @brief Compute begin1,begin2,end1,end2 to display byte difference between strings str1 & str2
753  * @param casitive [in] true for case-sensitive, false for case-insensitive
754  * @param xwhite [in] This governs whether we handle whitespace specially (see WHITESPACE_COMPARE_ALL, WHITESPACE_IGNORE_CHANGE, WHITESPACE_IGNORE_ALL)
755  * @param [out] begin return -1 if not found or pos of equal
756  * @param [out] end return -1 if not found or pos of equal valid if begin1 >=0
757  * @param [in] equal false surch for a diff, true surch for equal
758  *
759  *
760  * Assumes whitespace is never leadbyte or trailbyte!
761  */
762 void
763 sd_ComputeByteDiff(String & str1, String & str2, 
764                    bool casitive, int xwhite, 
765                    int begin[2], int end[2], bool equal)
766 {
767         // Set to sane values
768         // Also this way can distinguish if we set begin[0] to -1 for no diff in line
769         begin[0] = end[0] = begin[1] = end[1] = 0;
770
771         int len1 = str1.length();
772         int len2 = str2.length();
773
774         LPCTSTR pbeg1 = str1.c_str();
775         LPCTSTR pbeg2 = str2.c_str();
776
777         if (len1 == 0 || len2 == 0)
778         {
779                 if (len1 == len2)
780                 {
781                         begin[0] = -1;
782                         begin[1] = -1;
783                 }
784                 end[0] = len1 - 1;
785                 end[1] = len2 - 1;
786                 return;
787         }
788
789         // cursors from front, which we advance to beginning of difference
790         LPCTSTR py1 = pbeg1;
791         LPCTSTR py2 = pbeg2;
792
793         // pen1,pen2 point to the last valid character (broken multibyte lead chars don't count)
794         LPCTSTR pen1 = LastChar(py1, len1);
795         LPCTSTR pen2 = LastChar(py2, len2);
796
797         if (xwhite != WHITESPACE_COMPARE_ALL)
798         {
799                 // Ignore leading and trailing whitespace
800                 // by advancing py1 and py2
801                 // and retreating pen1 and pen2
802                 while (py1 < pen1 && isSafeWhitespace(*py1))
803                         ++py1; // DBCS safe because of isSafeWhitespace above
804                 while (py2 < pen2 && isSafeWhitespace(*py2))
805                         ++py2; // DBCS safe because of isSafeWhitespace above
806                 if ((pen1 < pbeg1 + len1 - 1 || pen2 < pbeg2 + len2 -1)
807                         && (!len1 || !len2 || pbeg1[len1] != pbeg2[len2]))
808                 {
809                         // mismatched broken multibyte ends
810                 }
811                 else
812                 {
813                         while (pen1 > py1 && isSafeWhitespace(*pen1))
814                                 pen1 = CharPrev(py1, pen1);
815                         while (pen2 > py2 && isSafeWhitespace(*pen2))
816                                 pen2 = CharPrev(py2, pen2);
817                 }
818         }
819         //check for excaption of empty string on one side
820         //In that case display all as a diff
821         if (!equal && (((py1 == pen1) && isSafeWhitespace(*pen1)) ||
822                 ((py2 == pen2) && isSafeWhitespace(*pen2))))
823         {
824                 begin[0] = 0;
825                 begin[1] = 0;
826                 end[0] = len1 - 1;
827                 end[1] = len2 - 1;
828                 return;
829         }
830         // Advance over matching beginnings of lines
831         // Advance py1 & py2 from beginning until find difference or end
832         while (1)
833         {
834                 // Potential difference extends from py1 to pen1 and py2 to pen2
835
836                 // Check if either side finished
837                 if (py1 > pen1 && py2 > pen2)
838                 {
839                         begin[0] = end[0] = begin[1] = end[1] = -1;
840                         break;
841                 }
842                 if (py1 > pen1 || py2 > pen2)
843                 {
844                         break;
845                 }
846
847                 // handle all the whitespace logic (due to WinMerge whitespace settings)
848                 if (xwhite && py1 < pen1 && isSafeWhitespace(*py1))
849                 {
850                         if (xwhite==WHITESPACE_IGNORE_CHANGE && !isSafeWhitespace(*py2))
851                         {
852                                 // py1 is white but py2 is not
853                                 // in WHITESPACE_IGNORE_CHANGE mode,
854                                 // this doesn't qualify as skippable whitespace
855                                 break; // done with forward search
856                         }
857                         // gobble up all whitespace in current area
858                         AdvanceOverWhitespace(&py1, pen1); // will go beyond end
859                         AdvanceOverWhitespace(&py2, pen2); // will go beyond end
860                         continue;
861
862                 }
863                 if (xwhite && py2 < pen2 && isSafeWhitespace(*py2))
864                 {
865                         if (xwhite==WHITESPACE_IGNORE_CHANGE && !isSafeWhitespace(*py1))
866                         {
867                                 // py2 is white but py1 is not
868                                 // in WHITESPACE_IGNORE_CHANGE mode,
869                                 // this doesn't qualify as skippable whitespace
870                                 break; // done with forward search
871                         }
872                         // gobble up all whitespace in current area
873                         AdvanceOverWhitespace(&py1, pen1); // will go beyond end
874                         AdvanceOverWhitespace(&py2, pen2); // will go beyond end
875                         continue;
876                 }
877
878                 // Now do real character match
879                 if (IsLeadByte(*py1))
880                 {
881                         if (!IsLeadByte(*py2))
882                                 break; // done with forward search
883                         // DBCS (we assume if a lead byte, then character is 2-byte)
884                         if (!(py1[0] == py2[0] && py1[1] == py2[1]))
885                                 break; // done with forward search
886                         py1 += 2; // DBCS specific
887                         py2 += 2; // DBCS specific
888                 }
889                 else
890                 {
891                         if (IsLeadByte(*py2))
892                                 break; // done with forward search
893                         if (!matchchar(py1[0], py2[0], casitive))
894                                 break; // done with forward search
895                         ++py1; // DBCS safe b/c we checked above
896                         ++py2; // DBCS safe b/c we checked above
897                 }
898         }
899
900         // Potential difference extends from py1 to pen1 and py2 to pen2
901
902         // Store results of advance into return variables (begin[0] & begin[1])
903         // -1 in a begin variable means no visible diff area
904         begin[0] = py1 - pbeg1;
905         begin[1] = py2 - pbeg2;
906
907         LPCTSTR pz1 = pen1;
908         LPCTSTR pz2 = pen2;
909
910         // Retreat over matching ends of lines
911         // Retreat pz1 & pz2 from end until find difference or beginning
912         while (1)
913         {
914                 // Check if either side finished
915                 if (pz1 < py1 && pz2 < py2)
916                 {
917                         begin[0] = end[0] = begin[1] = end[1] = -1;
918                         break;
919                 }
920                 if (pz1 < py1 || pz2 < py2)
921                 {
922                         break;
923                 }
924
925                 // handle all the whitespace logic (due to WinMerge whitespace settings)
926                 if (xwhite && pz1 > py1 && isSafeWhitespace(*pz1))
927                 {
928                         if (xwhite==1 && !isSafeWhitespace(*pz2))
929                                 break; // done with reverse search
930                         // gobble up all whitespace in current area
931                         while (pz1 > py1 && isSafeWhitespace(*pz1))
932                                 pz1 = CharPrev(py1, pz1);
933                         while (pz2 > py2 && isSafeWhitespace(*pz2))
934                                 pz2 = CharPrev(py2, pz2);
935                         continue;
936
937                 }
938                 if (xwhite && pz2 > py2 && isSafeWhitespace(*pz2))
939                 {
940                         if (xwhite==1)
941                                 break; // done with reverse search
942                         while (pz2 > py2 && isSafeWhitespace(*pz2))
943                                 pz2 = CharPrev(py2, pz2);
944                         continue;
945                 }
946
947                 // Now do real character match
948                 if (IsLeadByte(*pz1))
949                 {
950                         if (!IsLeadByte(*pz2))
951                                 break; // done with forward search
952                         // DBCS (we assume if a lead byte, then character is 2-byte)
953                         if (!(pz1[0] == pz2[0] && pz1[1] == pz2[1]))
954                                 break; // done with forward search
955                 }
956                 else
957                 {
958                         if (IsLeadByte(*pz2))
959                                 break; // done with forward search
960                         if (!matchchar(pz1[0], pz2[0], casitive))
961                                 break; // done with forward search
962                 }
963                 pz1 = (pz1 > pbeg1) ? CharPrev(pbeg1, pz1) : pz1 - 1;
964                 pz2 = (pz2 > pbeg2) ? CharPrev(pbeg2, pz2) : pz2 - 1;
965     }
966
967 /*      if (*pz1 == '\r' && *(pz1+1) == '\n')
968         {
969                 pz1++;
970                 pz2++;
971         }
972         else if (*pz2 == '\r' && *(pz2+1) == '\n')
973         {
974                 pz2++;
975                 pz1++;
976         }
977         if (*(pbeg1-1) == '\r' && *pbeg1 == '\n')
978         {
979                 pbeg1--;
980                 pbeg2--;
981         }
982         else if (*(pbeg2-1) == '\r' && *pbeg2 == '\n')
983         {
984                 pbeg2--;
985                 pbeg1--;
986         }*/
987
988         // Store results of advance into return variables (end[0] & end[1])
989         end[0] = pz1 - pbeg1;
990         end[1] = pz2 - pbeg2;
991
992         // Check if difference region was empty
993         if (begin[0] == end[0] + 1 && begin[1] == end[1] + 1)
994                 begin[0] = -1; // no diff
995 }
996
997 /**
998  * @brief adjust the range of the specified word diffs down to byte(char) level.
999  * @param str1, str2 [in] line to be compared
1000  * @param casitive [in] true for case-sensitive, false for case-insensitive
1001  * @param xwhite [in] This governs whether we handle whitespace specially
1002  *  (see WHITESPACE_COMPARE_ALL, WHITESPACE_IGNORE_CHANGE, WHITESPACE_IGNORE_ALL)
1003  */
1004 void stringdiffs::wordLevelToByteLevel()
1005 {
1006         for (size_t i = 0; i < m_wdiffs.size(); i++)
1007         {
1008                 int begin[3], end[3];
1009                 wdiff& diff = m_wdiffs[i];
1010                 String str1_2, str2_2;
1011                 str1_2 = m_str1.substr(diff.begin[0], diff.end[0] - diff.begin[0] + 1);
1012                 str2_2 = m_str2.substr(diff.begin[1], diff.end[1] - diff.begin[1] + 1);
1013                 sd_ComputeByteDiff(str1_2, str2_2, m_case_sensitive, m_whitespace, begin, end, false);
1014                 if (begin[0] == -1)
1015                 {
1016                         // no visible diff on side1
1017                         diff.end[0] = diff.begin[0] - 1;
1018                 }
1019                 else
1020                 {
1021                         diff.end[0] = diff.begin[0] + end[0];
1022                         diff.begin[0] += begin[0];
1023                 }
1024                 if (begin[1] == -1)
1025                 {
1026                         // no visible diff on side2
1027                         diff.end[1] = diff.begin[1] - 1;
1028                 }
1029                 else
1030                 {
1031                         diff.end[1] = diff.begin[1] + end[1];
1032                         diff.begin[1] += begin[1];
1033                 }
1034         }
1035 }
1036