Src/stringdiffs.cpp

   1 /**
   2  * @file  stringdiffs.cpp
   3  *
   4  * @brief Implementation file for ComputeWordDiffs (q.v.)
   5  *
   6  */
   7
   8 #include "pch.h"
   9 #include "stringdiffs.h"
  10 #define NOMINMAX
  11 #include <cassert>
  12 #include <chrono>
  13 #include "CompareOptions.h"
  14 #include "stringdiffsi.h"
  15 #include "Diff3.h"
  16
  17 using std::vector;
  18
  19 namespace strdiff
  20 {
  21
  22 static bool Initialized;
  23 static bool CustomChars;
  24 static TCHAR *BreakChars;
  25 static TCHAR BreakCharDefaults[] = _T(",.;:");
  26 static int TimeoutMilliSeconds = 500;
  27
  28 static bool isSafeWhitespace(TCHAR ch);
  29 static bool isWordBreak(int breakType, const TCHAR *str, int index);
  30
  31 void Init()
  32 {
  33         BreakChars = &BreakCharDefaults[0];
  34         Initialized = true;
  35 }
  36
  37 void Close()
  38 {
  39         if (CustomChars)
  40         {
  41                 free(BreakChars);
  42                 BreakChars = nullptr;
  43                 CustomChars = false;
  44         }
  45         Initialized = false;
  46 }
  47
  48 void SetBreakChars(const TCHAR *breakChars)
  49 {
  50         assert(Initialized);
  51
  52         if (CustomChars)
  53                 free(BreakChars);
  54
  55         CustomChars = true;
  56         BreakChars = _tcsdup(breakChars);
  57 }
  58
  59 std::vector<wdiff>
  60 ComputeWordDiffs(const String& str1, const String& str2,
  61         bool case_sensitive, bool eol_sensitive, int whitespace, int breakType, bool byte_level)
  62 {
  63         String strs[3] = {str1, str2, _T("")};
  64         return ComputeWordDiffs(2, strs, case_sensitive, eol_sensitive, whitespace, breakType, byte_level);
  65 }
  66
  67 struct Comp02Functor
  68 {
  69         Comp02Functor(const String *strs, bool case_sensitive) :
  70                 strs_(strs), case_sensitive_(case_sensitive)
  71         {
  72         }
  73         bool operator()(const wdiff &wd3)
  74         {
  75                 size_t wlen0 = wd3.end[0] - wd3.begin[0] + 1;
  76                 size_t wlen2 = wd3.end[2] - wd3.begin[2] + 1;
  77                 if (wlen0 != wlen2)
  78                         return false;
  79                 if (case_sensitive_)
  80                 {
  81                         if (memcmp(&strs_[0][wd3.begin[0]], &strs_[2][wd3.begin[2]], wlen0 * sizeof(TCHAR)) != 0)
  82                                 return false;
  83                 }
  84                 else
  85                 {
  86                         if (_tcsnicmp(&strs_[0][wd3.begin[0]], &strs_[2][wd3.begin[2]], wlen0) != 0)
  87                                 return false;
  88                 }
  89                 return true;
  90         }
  91         const String *strs_;
  92         bool case_sensitive_;
  93 };
  94
  95 /**
  96  * @brief Construct our worker object and tell it to do the work
  97  */
  98 std::vector<wdiff>
  99 ComputeWordDiffs(int nFiles, const String str[3],
 100         bool case_sensitive, bool eol_sensitive, int whitespace, int breakType, bool byte_level)
 101 {
 102         std::vector<wdiff> diffs;
 103         if (nFiles == 2)
 104         {
 105                 stringdiffs sdiffs(str[0], str[1], case_sensitive, eol_sensitive, whitespace, breakType, &diffs);
 106                 // Hash all words in both lines and then compare them word by word
 107                 // storing differences into m_wdiffs
 108                 sdiffs.BuildWordDiffList();
 109
 110                 if (byte_level)
 111                         sdiffs.wordLevelToByteLevel();
 112
 113                 // Now copy m_wdiffs into caller-supplied m_pDiffs (coalescing adjacents if possible)
 114                 sdiffs.PopulateDiffs();
 115
 116         }
 117         else
 118         {
 119                 if (str[0].empty())
 120                 {
 121                         stringdiffs sdiffs(str[1], str[2], case_sensitive, eol_sensitive, whitespace, breakType, &diffs);
 122                         sdiffs.BuildWordDiffList();
 123                         if (byte_level)
 124                                 sdiffs.wordLevelToByteLevel();
 125                         sdiffs.PopulateDiffs();
 126                         for (size_t i = 0; i < diffs.size(); i++)
 127                         {
 128                                 wdiff& diff = diffs[i];
 129                                 diff.begin[2] = diff.begin[1];
 130                                 diff.begin[1] = diff.begin[0];
 131                                 diff.begin[0] = 0;
 132                                 diff.end[2] = diff.end[1];
 133                                 diff.end[1] = diff.end[0];
 134                                 diff.end[0] = -1;
 135                         }
 136                 }
 137                 else if (str[1].empty())
 138                 {
 139                         stringdiffs sdiffs(str[0], str[2], case_sensitive, eol_sensitive, whitespace, breakType, &diffs);
 140                         sdiffs.BuildWordDiffList();
 141                         if (byte_level)
 142                                 sdiffs.wordLevelToByteLevel();
 143                         sdiffs.PopulateDiffs();
 144                         for (size_t i = 0; i < diffs.size(); i++)
 145                         {
 146                                 wdiff& diff = diffs[i];
 147                                 diff.begin[2] = diff.begin[1];
 148                                 //diff.begin[0] = diff.begin[0];
 149                                 diff.begin[1] = 0;
 150                                 diff.end[2] = diff.end[1];
 151                                 //diff.end[0] = diff.end[0];
 152                                 diff.end[1] = -1;
 153                         }
 154                 }
 155                 else if (str[2].empty())
 156                 {
 157                         stringdiffs sdiffs(str[0], str[1], case_sensitive, eol_sensitive, whitespace, breakType, &diffs);
 158                         sdiffs.BuildWordDiffList();
 159                         if (byte_level)
 160                                 sdiffs.wordLevelToByteLevel();
 161                         sdiffs.PopulateDiffs();
 162                         for (size_t i = 0; i < diffs.size(); i++)
 163                         {
 164                                 wdiff& diff = diffs[i];
 165                                 //diff.begin[1] = diff.begin[1];
 166                                 //diff.begin[0] = diff.begin[0];
 167                                 diff.begin[2] = 0;
 168                                 //diff.end[1] = diff.end[1];
 169                                 //diff.end[0] = diff.end[0];
 170                                 diff.end[2] = -1;
 171                         }
 172                 }
 173                 else
 174                 {
 175                         std::vector<wdiff> diffs10, diffs12;
 176                         stringdiffs sdiffs10(str[1], str[0], case_sensitive, eol_sensitive, 0, breakType, &diffs10);
 177                         stringdiffs sdiffs12(str[1], str[2], case_sensitive, eol_sensitive, 0, breakType, &diffs12);
 178                         // Hash all words in both lines and then compare them word by word
 179                         // storing differences into m_wdiffs
 180                         sdiffs10.BuildWordDiffList();
 181                         sdiffs12.BuildWordDiffList();
 182                         if (byte_level)
 183                         {
 184                                 sdiffs10.wordLevelToByteLevel();
 185                                 sdiffs12.wordLevelToByteLevel();
 186                         }
 187                         // Now copy m_wdiffs into caller-supplied m_pDiffs (coalescing adjacents if possible)
 188                         sdiffs10.PopulateDiffs();
 189                         sdiffs12.PopulateDiffs();
 190
 191                         Make3wayDiff(diffs, diffs10, diffs12,
 192                                 Comp02Functor(str, case_sensitive), false);
 193                 }
 194         }
 195         return diffs;
 196 }
 197
 198 /**
 199  * @brief stringdiffs constructor simply loads all members from arguments
 200  */
 201 stringdiffs::stringdiffs(const String & str1, const String & str2,
 202         bool case_sensitive, bool eol_sensitive, int whitespace, int breakType,
 203         std::vector<wdiff> * pDiffs)
 204 : m_str1(str1)
 205 , m_str2(str2)
 206 , m_case_sensitive(case_sensitive)
 207 , m_eol_sensitive(eol_sensitive)
 208 , m_whitespace(whitespace)
 209 , m_breakType(breakType)
 210 , m_pDiffs(pDiffs)
 211 , m_matchblock(true) // Change to false to get word to word compare
 212 {
 213 }
 214
 215 /**
 216  * @brief Destructor.
 217  * The destructor frees all diffs added to the vectors.
 218  */
 219 stringdiffs::~stringdiffs()
 220 {
 221 }
 222
 223 #ifdef STRINGDIFF_LOGGING
 224 void
 225 stringdiffs::debugoutput()
 226 {
 227         for (size_t i = 0; i < m_wdiffs.size(); i++)
 228         {
 229                 String str1;
 230                 String str2;
 231                 TCHAR buf[256];
 232                 int s1 = m_wdiffs[i].begin[0];
 233                 int e1 = m_wdiffs[i].end[0];
 234                 int s2 = m_wdiffs[i].begin[1];
 235                 int e2 = m_wdiffs[i].end[1];
 236
 237                 int len1 = e1 - s1 + 1;
 238                 int len2 = e2 - s2 + 1;
 239
 240                 if (len1 < 50)
 241                         str1 = m_str1.substr(s1 ,e1 - s1 + 1);
 242                 else
 243                         str1 = m_str1.substr(s1, 50);
 244
 245                 if (len2 < 50)
 246                         str2 = m_str2.substr(s2, e2- s2 + 1);
 247                 else
 248                         str2 = m_str2.substr(s2, 50);
 249
 250                 wsprintf(buf, _T("left=  %s,   %d,%d,\nright= %s,   %d,%d \n"),
 251                         str1.c_str(), s1, e1, str2.c_str(), s2, e2);
 252                 OutputDebugString(buf);
 253         }
 254 }
 255 #endif
 256
 257 bool
 258 stringdiffs::BuildWordDiffList_DP()
 259 {
 260         std::vector<char> edscript;
 261
 262         //if (dp(edscript) <= 0)
 263         //      return false;
 264         if (onp(edscript) < 0)
 265                 return false;
 266
 267         int i = 1, j = 1;
 268         for (size_t k = 0; k < edscript.size(); k++)
 269         {
 270                 int s1, e1, s2, e2;
 271                 if (edscript[k] == '-')
 272                 {
 273                         if (m_whitespace == WHITESPACE_IGNORE_ALL)
 274                         {
 275                                 if (IsSpace(m_words1[i]))
 276                                 {
 277                                         i++;
 278                                         continue;
 279                                 }
 280                         }
 281
 282                         s1 = m_words1[i].start;
 283                         e1 = m_words1[i].end;
 284                         s2 = m_words2[j-1].end+1;
 285                         e2 = s2-1;
 286                         m_wdiffs.push_back(wdiff(s1, e1, s2, e2));
 287                         i++;
 288                 }
 289                 else if (edscript[k] == '+')
 290                 {
 291                         if (m_whitespace == WHITESPACE_IGNORE_ALL)
 292                         {
 293                                 if (IsSpace(m_words2[j]))
 294                                 {
 295                                         j++;
 296                                         continue;
 297                                 }
 298                         }
 299
 300                         s1 = m_words1[i-1].end+1;
 301                         e1 = s1-1;
 302                         s2 = m_words2[j].start;
 303                         e2 = m_words2[j].end;
 304                         m_wdiffs.push_back(wdiff(s1, e1, s2, e2));
 305                         j++;
 306                 }
 307                 else if (edscript[k] == '!')
 308                 {
 309                         if (m_whitespace == WHITESPACE_IGNORE_CHANGE || m_whitespace == WHITESPACE_IGNORE_ALL)
 310                         {
 311                                 if (IsSpace(m_words1[i]) && IsSpace(m_words2[j]))
 312                                 {
 313                                         i++; j++;
 314                                         continue;
 315                                 }
 316                         }
 317
 318                         s1 =  m_words1[i].start;
 319                         e1 =  m_words1[i].end;
 320                         s2 =  m_words2[j].start;
 321                         e2 =  m_words2[j].end ;
 322                         m_wdiffs.push_back(wdiff(s1, e1, s2, e2));
 323                         i++; j++;
 324                 }
 325                 else
 326                 {
 327                         i++; j++;
 328                 }
 329         }
 330 #ifdef STRINGDIFF_LOGGING
 331         debugoutput();
 332 #endif
 333         return true;
 334 }
 335
 336 /**
 337  * @brief Add all different elements between lines to the wdiff list
 338  */
 339 void
 340 stringdiffs::BuildWordDiffList()
 341 {
 342         m_words1 = BuildWordsArray(m_str1);
 343         m_words2 = BuildWordsArray(m_str2);
 344
 345         bool succeeded = false;
 346 #ifdef _WIN64
 347         if (m_words1.size() < 20480 && m_words2.size() < 20480)
 348 #else
 349         if (m_words1.size() < 2048 && m_words2.size() < 2048)
 350 #endif
 351         {
 352                 succeeded = BuildWordDiffList_DP();
 353         }
 354         if (!succeeded)
 355         {
 356                 int s1 = m_words1[0].start;
 357                 int e1 = m_words1[m_words1.size() - 1].end;
 358                 int s2 = m_words2[0].start;
 359                 int e2 = m_words2[m_words2.size() - 1].end;
 360                 m_wdiffs.push_back(wdiff(s1, e1, s2, e2));
 361                 return;
 362         }
 363
 364 }
 365
 366 /**
 367  * @brief Break line into constituent words
 368  */
 369 std::vector<stringdiffs::word>
 370 stringdiffs::BuildWordsArray(const String & str)
 371 {
 372         std::vector<word> words;
 373         int i = 0, begin = 0;
 374         ICUBreakIterator *pIterChar = ICUBreakIterator::getCharacterBreakIterator(reinterpret_cast<const UChar *>(str.c_str()), static_cast<int32_t>(str.length()));
 375
 376         size_t sLen = str.length();
 377         assert(sLen < INT_MAX);
 378         int iLen = static_cast<int>(sLen);
 379
 380         // dummy;
 381         words.push_back(word(0, -1, 0, 0));
 382
 383         // state when we are looking for next word
 384 inspace:
 385         if (isSafeWhitespace(str[i]))
 386         {
 387                 i = pIterChar->next();
 388                 goto inspace;
 389         }
 390         if (begin < i)
 391         {
 392                 // just finished a word
 393                 // e is first word character (space or at end)
 394                 int e = i - 1;
 395
 396                 words.push_back(word(begin, e, dlspace, Hash(str, begin, e, 0)));
 397         }
 398         if (i == iLen)
 399                 return words;
 400         begin = i;
 401         goto inword;
 402
 403         // state when we are inside a word
 404 inword:
 405         bool atspace=false;
 406         if (i == iLen || ((atspace = isSafeWhitespace(str[i])) != 0) || isWordBreak(m_breakType, str.c_str(), i))
 407         {
 408                 if (begin<i)
 409                 {
 410                         // just finished a word
 411                         // e is first non-word character (space or at end)
 412                         int e = i-1;
 413
 414                         words.push_back(word(begin, e, dlword, Hash(str, begin, e, 0)));
 415                 }
 416                 if (i == iLen)
 417                 {
 418                         return words;
 419                 }
 420                 else if (atspace)
 421                 {
 422                         begin = i;
 423                         goto inspace;
 424                 }
 425                 else
 426                 {
 427                         // start a new word because we hit a non-whitespace word break (eg, a comma)
 428                         // but, we have to put each word break character into its own word
 429                         int inext = pIterChar->next();
 430                         words.push_back(word(i, inext - 1, dlbreak, Hash(str, i, inext - 1, 0)));
 431                         i = inext;
 432                         begin = i;
 433                         goto inword;
 434                 }
 435         }
 436         i = pIterChar->next();
 437         goto inword; // safe even if we're at the end or no longer in a word
 438 }
 439
 440 /**
 441  * @brief Populate m_pDiffs from m_wdiffs (combining adjacent diffs)
 442  *
 443  * Doing the combining of adjacent diffs here keeps some complexity out of BuildWordsArray.
 444  */
 445 void
 446 stringdiffs::PopulateDiffs()
 447 {
 448         auto IsEOLorEmpty = [](const String& text, size_t begin, size_t end) -> bool {
 449                 if (end - begin + 1 > 2)
 450                         return false;
 451                 String str = text.substr(begin, end - begin + 1);
 452                 return (str.empty() || str == _T("\r\n") || str == _T("\n") || str == _T("\r"));
 453         };
 454
 455         for (int i=0; i< (int)m_wdiffs.size(); ++i)
 456         {
 457                 bool skipIt = false;
 458                 // combine it with next ?
 459                 if (i+1< (int)m_wdiffs.size())
 460                 {
 461                         if (m_wdiffs[i].end[0] + 1 == m_wdiffs[i+1].begin[0]
 462                                 && m_wdiffs[i].end[1] + 1 == m_wdiffs[i+1].begin[1])
 463                         {
 464                                 // diff[i] and diff[i+1] are contiguous
 465                                 // so combine them into diff[i+1] and ignore diff[i]
 466                                 m_wdiffs[i+1].begin[0] = m_wdiffs[i].begin[0];
 467                                 m_wdiffs[i+1].begin[1] = m_wdiffs[i].begin[1];
 468                                 skipIt = true;
 469                         }
 470                 }
 471                 else
 472                 {
 473                         if (!m_eol_sensitive &&
 474                                 IsEOLorEmpty(m_str1, m_wdiffs[i].begin[0], m_wdiffs[i].end[0]) &&
 475                                 IsEOLorEmpty(m_str2, m_wdiffs[i].begin[1], m_wdiffs[i].end[1]))
 476                                 skipIt = true;
 477                 }
 478                 if (!skipIt)
 479                 {
 480                         // Should never have a pair where both are missing
 481                         assert(m_wdiffs[i].begin[0]>=0 || m_wdiffs[i].begin[1]>=0);
 482
 483                         // Store the diff[i] in the caller list (m_pDiffs)
 484                         m_pDiffs->push_back(wdiff(m_wdiffs[i]));
 485                 }
 486         }
 487 }
 488
 489 // diffutils hash
 490
 491 /* Rotate a value n bits to the left. */
 492 #define UINT_BIT (sizeof (unsigned) * CHAR_BIT)
 493 #define ROL(v, n) ((v) << (n) | (v) >> (UINT_BIT - (n)))
 494 /* Given a hash value and a new character, return a new hash value. */
 495 #define HASH(h, c) ((c) + ROL (h, 7))
 496
 497 unsigned
 498 stringdiffs::Hash(const String & str, int begin, int end, unsigned h) const
 499 {
 500         for (int i = begin; i <= end; ++i)
 501         {
 502                 TCHAR ch = static_cast<unsigned>(str[i]);
 503                 if (m_case_sensitive)
 504                 {
 505                         h += HASH(h, ch);
 506                 }
 507                 else
 508                 {
 509                         ch = static_cast<unsigned>(_totupper(ch));
 510                         h += HASH(h, ch);
 511                 }
 512         }
 513
 514         return h;
 515 }
 516
 517 /**
 518  * @brief Compare two words (by reference to original strings)
 519  */
 520 bool
 521 stringdiffs::AreWordsSame(const word & word1, const word & word2) const
 522 {
 523         if (this->m_whitespace != WHITESPACE_COMPARE_ALL)
 524         {
 525                 if (IsSpace(word1) && IsSpace(word2))
 526                         return true;
 527         }
 528         if (word1.hash != word2.hash)
 529                 return false;
 530         if (word1.length() != word2.length())
 531                 return false;
 532         for (int i=0; i<word1.length(); ++i)
 533         {
 534                 if (!caseMatch(m_str1[word1.start+i], m_str2[word2.start+i]))
 535                         return false;
 536         }
 537         return true;
 538 }
 539
 540 /**
 541  * @brief Return true if characters match
 542  */
 543 bool
 544 stringdiffs::caseMatch(TCHAR ch1, TCHAR ch2) const
 545 {
 546         if (m_case_sensitive)
 547                 return ch1==ch2;
 548         else
 549                 return _totupper(ch1)==_totupper(ch2);
 550 }
 551
 552 /**
 553  * @ brief An O(NP) Sequence Comparison Algorithm. Sun Wu, Udi Manber, Gene Myers
 554  */
 555 int
 556 stringdiffs::onp(std::vector<char> &edscript)
 557 {
 558         auto start = std::chrono::system_clock::now();
 559
 560         int M = static_cast<int>(m_words1.size() - 1);
 561         int N = static_cast<int>(m_words2.size() - 1);
 562         bool exchanged = false;
 563         if (M > N)
 564         {
 565                 M = static_cast<int>(m_words2.size() - 1);
 566                 N = static_cast<int>(m_words1.size() - 1);
 567                 exchanged = true;
 568         }
 569         int *fp = (new int[(M+1) + 1 + (N+1)]) + (M+1);
 570         struct EditScriptElem { int op; int neq; int pk; int pi; };
 571         std::vector<EditScriptElem> *es = (new std::vector<EditScriptElem>[(M+1) + 1 + (N+1)]) + (M+1);
 572         int DELTA = N - M;
 573
 574         auto addEditScriptElem = [&es, &fp](int k) {
 575                 EditScriptElem ese;
 576                 if (fp[k - 1] + 1 > fp[k + 1])
 577                 {
 578                         ese.op = '+';
 579                         ese.neq = fp[k] - (fp[k - 1] + 1);
 580                         ese.pk = k - 1;
 581                 }
 582                 else
 583                 {
 584                         ese.op = '-';
 585                         ese.neq = fp[k] - fp[k + 1];
 586                         ese.pk = k + 1;
 587                 }
 588                 ese.pi = static_cast<int>(es[ese.pk].size() - 1);
 589                 es[k].push_back(ese);
 590         };
 591
 592         const int COUNTMAX = 100000;
 593         int count = 0;
 594         int k;
 595         for (k = -(M+1); k <= (N+1); k++)
 596                 fp[k] = -1;
 597         int p = -1;
 598         do
 599         {
 600                 p = p + 1;
 601                 for (k = -p; k <= DELTA-1; k++)
 602                 {
 603                         fp[k] = snake(k, std::max(fp[k-1] + 1, fp[k+1]), exchanged);
 604                         addEditScriptElem(k);
 605                         count++;
 606                 }
 607                 for (k = DELTA + p; k >= DELTA+1; k--)
 608                 {
 609                         fp[k] = snake(k, std::max(fp[k-1] + 1, fp[k+1]), exchanged);
 610                         addEditScriptElem(k);
 611                         count++;
 612                 }
 613                 k = DELTA;
 614                 fp[k] = snake(k, std::max(fp[k-1] + 1, fp[k+1]), exchanged);
 615                 addEditScriptElem(k);
 616                 count++;
 617
 618                 if (count > COUNTMAX)
 619                 {
 620                         count = 0;
 621                         auto end = std::chrono::system_clock::now();
 622                         auto msec = std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count();
 623                         if (msec > TimeoutMilliSeconds)
 624                         {
 625                                 delete [] (es - (M+1));
 626                                 delete [] (fp - (M+1));
 627                                 return -1;
 628                         }
 629                 }
 630         } while (fp[k] != N);
 631
 632         edscript.clear();
 633
 634         std::vector<char> ses;
 635         int i;
 636         for (k = DELTA, i = static_cast<int>(es[DELTA].size() - 1); i >= 0;)
 637         {
 638                 EditScriptElem& esi = es[k][i];
 639                 for (int j = 0; j < esi.neq; ++j)
 640                         ses.push_back('=');
 641                 ses.push_back(static_cast<char>(esi.op));
 642                 i = esi.pi;
 643                 k = esi.pk;
 644         }
 645         std::reverse(ses.begin(), ses.end());
 646
 647         int D = 0;
 648         for (i = 1; i < static_cast<int>(ses.size()); i++)
 649         {
 650                 switch (ses[i])
 651                 {
 652                 case '+':
 653                         if (i + 1 < ses.size() && ses[i + 1] == '-')
 654                         {
 655                                 edscript.push_back('!');
 656                                 i++;
 657                                 D++;
 658                         }
 659                         else
 660                         {
 661                                 edscript.push_back(exchanged ? '-' : '+');
 662                                 D++;
 663                         }
 664                         break;
 665                 case '-':
 666                         if (i + 1 < ses.size() && ses[i + 1] == '+')
 667                         {
 668                                 edscript.push_back('!');
 669                                 i++;
 670                                 D++;
 671                         }
 672                         else
 673                         {
 674                                 edscript.push_back(exchanged ? '+' : '-');
 675                                 D++;
 676                         }
 677                         break;
 678                 default:
 679                         edscript.push_back('=');
 680                 }
 681         }
 682
 683         delete [] (es - (M+1));
 684         delete [] (fp - (M+1));
 685
 686         return D;
 687 }
 688
 689 int
 690 stringdiffs::snake(int k, int y, bool exchanged)
 691 {
 692         int M = static_cast<int>(exchanged ? m_words2.size() - 1 : m_words1.size() - 1);
 693         int N = static_cast<int>(exchanged ? m_words1.size() - 1 : m_words2.size() - 1);
 694         int x = y - k;
 695         while (x < M && y < N && (exchanged ? AreWordsSame(m_words1[y + 1], m_words2[x + 1]) : AreWordsSame(m_words1[x + 1], m_words2[y + 1]))) {
 696                 x = x + 1; y = y + 1;
 697         }
 698         return y;
 699 }
 700
 701 /**
 702  * @brief Return true if chars match
 703  *
 704  * Caller must not call this for lead bytes
 705  */
 706 static inline bool
 707 matchchar(const TCHAR *ch1, const TCHAR *ch2, size_t len, bool casitive)
 708 {
 709         if (casitive)
 710                 return memcmp(ch1, ch2, len * sizeof(TCHAR)) == 0;
 711         for (size_t i = 0; i < len; ++i)
 712         {
 713                 if (_totupper(ch1[i]) != _totupper(ch2[i]))
 714                         return false;
 715         }
 716         return true;
 717 }
 718
 719
 720 /** Does character introduce a multicharacter character? */
 721 static inline bool IsLeadByte(TCHAR ch)
 722 {
 723 #ifdef UNICODE
 724         return false;
 725 #else
 726         return _getmbcp() && IsDBCSLeadByte(ch);
 727 #endif
 728 }
 729
 730 /**
 731  * @brief Is it whitespace (excludes all lead & trail bytes)?
 732  */
 733 static inline bool
 734 isSafeWhitespace(TCHAR ch)
 735 {
 736         return _istspace((unsigned)ch) && !IsLeadByte(ch);
 737 }
 738
 739 /**
 740  * @brief Is it a non-whitespace wordbreak character (ie, punctuation)?
 741  */
 742 static bool
 743 isWordBreak(int breakType, const TCHAR *str, int index)
 744 {
 745         TCHAR ch = str[index];
 746         // breakType==1 means break also on punctuation
 747         if ((ch & 0xff00) == 0)
 748         {
 749 //              TCHAR nextCh = str[index + 1];
 750                 // breakType==0 means whitespace only
 751                 if (breakType==0)
 752                         return false;
 753                 return _tcschr(BreakChars, ch) != nullptr;
 754         }
 755         else
 756         {
 757 //              if (
 758 //                      ch==0xff0c/* Fullwidth Full Stop */ ||
 759 //                      ch==0xff0e/* Fullwidth Comma */ ||
 760 //                      ch==0xff1b/* Fullwidth Semicolon */ ||
 761 //                      ch==0xff1a/* Fullwidth Colon */ ||
 762 //                      ch==0x3002/* Ideographic Full Stop */ ||
 763 //                      ch==0x3001/* Ideographic Comma */
 764 //                      )
 765 //                      return true;
 766 //              WORD wCharType, wCharTypeNext;
 767 //              GetStringTypeW(CT_CTYPE3, &ch, 1, &wCharType);
 768 //              TCHAR nextCh = str[index + 1];
 769 //              GetStringTypeW(CT_CTYPE3, &nextCh, 1, &wCharTypeNext);
 770 //              return (wCharType != wCharTypeNext);
 771 //
 772                 return true;
 773         }
 774 }
 775
 776
 777 /**
 778  * @brief advance current pointer over whitespace, until not whitespace or beyond end
 779  * @param pcurrent [in,out] current location (to be advanced)
 780  * @param end [in] last valid position (only go one beyond this)
 781  */
 782 static void
 783 AdvanceOverWhitespace(const TCHAR **pcurrent, const TCHAR *end)
 784 {
 785         // advance over whitespace
 786         while (*pcurrent <= end && isSafeWhitespace(**pcurrent))
 787                 ++(*pcurrent); // DBCS safe because of isSafeWhitespace above
 788 }
 789
 790 /**
 791  * @brief Compute begin1,begin2,end1,end2 to display byte difference between strings str1 & str2
 792  * @param casitive [in] true for case-sensitive, false for case-insensitive
 793  * @param xwhite [in] This governs whether we handle whitespace specially (see WHITESPACE_COMPARE_ALL, WHITESPACE_IGNORE_CHANGE, WHITESPACE_IGNORE_ALL)
 794  * @param [out] begin return -1 if not found or pos of equal
 795  * @param [out] end return -1 if not found or pos of equal valid if begin1 >=0
 796  * @param [in] equal false surch for a diff, true surch for equal
 797  *
 798  *
 799  * Assumes whitespace is never leadbyte or trailbyte!
 800  */
 801 void
 802 stringdiffs::ComputeByteDiff(const String & str1, const String & str2,
 803                    bool casitive, int xwhite,
 804                    int begin[2], int end[2], bool equal)
 805 {
 806         // Set to sane values
 807         // Also this way can distinguish if we set begin[0] to -1 for no diff in line
 808         begin[0] = end[0] = begin[1] = end[1] = 0;
 809
 810         int len1 = static_cast<int>(str1.length());
 811         int len2 = static_cast<int>(str2.length());
 812
 813         const TCHAR *pbeg1 = str1.c_str();
 814         const TCHAR *pbeg2 = str2.c_str();
 815
 816         ICUBreakIterator *pIterCharBegin1 = ICUBreakIterator::getCharacterBreakIterator(reinterpret_cast<const UChar *>(pbeg1), static_cast<int32_t>(len1));
 817         ICUBreakIterator *pIterCharBegin2 = ICUBreakIterator::getCharacterBreakIterator<2>(reinterpret_cast<const UChar *>(pbeg2), static_cast<int32_t>(len2));
 818         ICUBreakIterator *pIterCharEnd1 = ICUBreakIterator::getCharacterBreakIterator<3>(reinterpret_cast<const UChar *>(pbeg1), static_cast<int32_t>(len1));
 819         ICUBreakIterator *pIterCharEnd2 = ICUBreakIterator::getCharacterBreakIterator<4>(reinterpret_cast<const UChar *>(pbeg2), static_cast<int32_t>(len2));
 820
 821         if (len1 == 0 || len2 == 0)
 822         {
 823                 if (len1 == len2)
 824                 {
 825                         begin[0] = -1;
 826                         begin[1] = -1;
 827                 }
 828                 end[0] = len1 - 1;
 829                 end[1] = len2 - 1;
 830                 return;
 831         }
 832
 833         // cursors from front, which we advance to beginning of difference
 834         const TCHAR *py1 = pbeg1;
 835         const TCHAR *py2 = pbeg2;
 836
 837         // pen1,pen2 point to the last valid character (broken multibyte lead chars don't count)
 838         const TCHAR *pen1 = pbeg1 + (len1 > 0 ? pIterCharEnd1->preceding(len1) : 0);
 839         const TCHAR *pen2 = pbeg2 + (len2 > 0 ? pIterCharEnd2->preceding(len2) : 0);
 840         size_t glyphlenz1 = pbeg1 + len1 - pen1;
 841         size_t glyphlenz2 = pbeg2 + len2 - pen2;
 842
 843         if (xwhite != WHITESPACE_COMPARE_ALL)
 844         {
 845                 // Ignore leading and trailing whitespace
 846                 // by advancing py1 and py2
 847                 // and retreating pen1 and pen2
 848                 while (py1 < pen1 && isSafeWhitespace(*py1))
 849                         py1 = pbeg1 + pIterCharBegin1->next();
 850                 while (py2 < pen2 && isSafeWhitespace(*py2))
 851                         py2 = pbeg2 + pIterCharBegin2->next();
 852                 if ((pen1 < pbeg1 + len1 - 1 || pen2 < pbeg2 + len2 -1)
 853                         && (!len1 || !len2 || pbeg1[len1] != pbeg2[len2]))
 854                 {
 855                         // mismatched broken multibyte ends
 856                 }
 857                 else
 858                 {
 859                         while (pen1 > py1 && isSafeWhitespace(*pen1))
 860                                 pen1 = pbeg1 + pIterCharEnd1->previous();
 861                         while (pen2 > py2 && isSafeWhitespace(*pen2))
 862                                 pen2 = pbeg2 + pIterCharEnd2->previous();
 863                 }
 864         }
 865         //check for excaption of empty string on one side
 866         //In that case display all as a diff
 867         if (!equal && (((py1 == pen1) && isSafeWhitespace(*pen1)) ||
 868                 ((py2 == pen2) && isSafeWhitespace(*pen2))))
 869         {
 870                 begin[0] = 0;
 871                 begin[1] = 0;
 872                 end[0] = len1 - 1;
 873                 end[1] = len2 - 1;
 874                 return;
 875         }
 876         // Advance over matching beginnings of lines
 877         // Advance py1 & py2 from beginning until find difference or end
 878         while (true)
 879         {
 880                 // Potential difference extends from py1 to pen1 and py2 to pen2
 881
 882                 // Check if either side finished
 883                 if (py1 > pen1 && py2 > pen2)
 884                 {
 885                         begin[0] = end[0] = begin[1] = end[1] = -1;
 886                         break;
 887                 }
 888                 if (py1 > pen1 || py2 > pen2)
 889                 {
 890                         break;
 891                 }
 892
 893                 // handle all the whitespace logic (due to WinMerge whitespace settings)
 894                 if (xwhite && py1 < pen1 && isSafeWhitespace(*py1))
 895                 {
 896                         if (xwhite==WHITESPACE_IGNORE_CHANGE && !isSafeWhitespace(*py2))
 897                         {
 898                                 // py1 is white but py2 is not
 899                                 // in WHITESPACE_IGNORE_CHANGE mode,
 900                                 // this doesn't qualify as skippable whitespace
 901                                 break; // done with forward search
 902                         }
 903                         // gobble up all whitespace in current area
 904                         AdvanceOverWhitespace(&py1, pen1); // will go beyond end
 905                         AdvanceOverWhitespace(&py2, pen2); // will go beyond end
 906                         continue;
 907
 908                 }
 909                 if (xwhite && py2 < pen2 && isSafeWhitespace(*py2))
 910                 {
 911                         if (xwhite==WHITESPACE_IGNORE_CHANGE && !isSafeWhitespace(*py1))
 912                         {
 913                                 // py2 is white but py1 is not
 914                                 // in WHITESPACE_IGNORE_CHANGE mode,
 915                                 // this doesn't qualify as skippable whitespace
 916                                 break; // done with forward search
 917                         }
 918                         // gobble up all whitespace in current area
 919                         AdvanceOverWhitespace(&py1, pen1); // will go beyond end
 920                         AdvanceOverWhitespace(&py2, pen2); // will go beyond end
 921                         continue;
 922                 }
 923
 924                 const TCHAR* py1next = pbeg1 + pIterCharBegin1->next();
 925                 const TCHAR* py2next = pbeg2 + pIterCharBegin2->next();
 926                 size_t glyphleny1 = py1next - py1;
 927                 size_t glyphleny2 = py2next - py2;
 928                 if (glyphleny1 != glyphleny2 || !matchchar(py1, py2, glyphleny1, casitive))
 929                         break; // done with forward search
 930                 py1 = py1next;
 931                 py2 = py2next;
 932         }
 933
 934         // Potential difference extends from py1 to pen1 and py2 to pen2
 935
 936         // Store results of advance into return variables (begin[0] & begin[1])
 937         // -1 in a begin variable means no visible diff area
 938         begin[0] = static_cast<int>(py1 - pbeg1);
 939         begin[1] = static_cast<int>(py2 - pbeg2);
 940
 941         const TCHAR *pz1 = pen1;
 942         const TCHAR *pz2 = pen2;
 943
 944         // Retreat over matching ends of lines
 945         // Retreat pz1 & pz2 from end until find difference or beginning
 946         while (true)
 947         {
 948                 // Check if either side finished
 949                 if (pz1 < py1 && pz2 < py2)
 950                 {
 951                         begin[0] = end[0] = begin[1] = end[1] = -1;
 952                         break;
 953                 }
 954                 if (pz1 < py1 || pz2 < py2)
 955                 {
 956                         break;
 957                 }
 958
 959                 // handle all the whitespace logic (due to WinMerge whitespace settings)
 960                 if (xwhite && pz1 > py1 && isSafeWhitespace(*pz1))
 961                 {
 962                         if (xwhite==1 && !isSafeWhitespace(*pz2))
 963                                 break; // done with reverse search
 964                         // gobble up all whitespace in current area
 965                         while (pz1 > py1 && isSafeWhitespace(*pz1))
 966                                 pz1 = pbeg1 + pIterCharEnd1->previous();
 967                         while (pz2 > py2 && isSafeWhitespace(*pz2))
 968                                 pz2 = pbeg2 + pIterCharEnd2->previous();
 969                         continue;
 970
 971                 }
 972                 if (xwhite && pz2 > py2 && isSafeWhitespace(*pz2))
 973                 {
 974                         if (xwhite==1)
 975                                 break; // done with reverse search
 976                         while (pz2 > py2 && isSafeWhitespace(*pz2))
 977                                 pz2 = pbeg2 + pIterCharEnd2->previous();
 978                         continue;
 979                 }
 980
 981                 if (glyphlenz1 != glyphlenz2 || !matchchar(pz1, pz2, glyphlenz1, casitive))
 982                         break; // done with forward search
 983                 const TCHAR* pz1next = pz1;
 984                 const TCHAR* pz2next = pz2;
 985                 pz1 = (pz1 > pbeg1) ? pbeg1 + pIterCharEnd1->preceding(static_cast<int32_t>(pz1 - pbeg1)) : pz1 - 1;
 986                 pz2 = (pz2 > pbeg2) ? pbeg2 + pIterCharEnd2->preceding(static_cast<int32_t>(pz2 - pbeg2)) : pz2 - 1;
 987                 glyphlenz1 = pz1next - pz1;
 988                 glyphlenz2 = pz2next - pz2;
 989                 // Now do real character match
 990     }
 991
 992 /*      if (*pz1 == '\r' && *(pz1+1) == '\n')
 993         {
 994                 pz1++;
 995                 pz2++;
 996         }
 997         else if (*pz2 == '\r' && *(pz2+1) == '\n')
 998         {
 999                 pz2++;
1000                 pz1++;
1001         }
1002         if (*(pbeg1-1) == '\r' && *pbeg1 == '\n')
1003         {
1004                 pbeg1--;
1005                 pbeg2--;
1006         }
1007         else if (*(pbeg2-1) == '\r' && *pbeg2 == '\n')
1008         {
1009                 pbeg2--;
1010                 pbeg1--;
1011         }*/
1012
1013         // Store results of advance into return variables (end[0] & end[1])
1014         end[0] = static_cast<int>(pz1 - pbeg1 + glyphlenz1 - 1);
1015         end[1] = static_cast<int>(pz2 - pbeg2 + glyphlenz2 - 1);
1016
1017         // Check if difference region was empty
1018         if (begin[0] == end[0] + 1 && begin[1] == end[1] + 1)
1019                 begin[0] = -1; // no diff
1020 }
1021
1022 /**
1023  * @brief adjust the range of the specified word diffs down to byte(char) level.
1024  * @param str1, str2 [in] line to be compared
1025  * @param casitive [in] true for case-sensitive, false for case-insensitive
1026  * @param xwhite [in] This governs whether we handle whitespace specially
1027  *  (see WHITESPACE_COMPARE_ALL, WHITESPACE_IGNORE_CHANGE, WHITESPACE_IGNORE_ALL)
1028  */
1029 void stringdiffs::wordLevelToByteLevel()
1030 {
1031         for (size_t i = 0; i < m_wdiffs.size(); i++)
1032         {
1033                 int begin[3], end[3];
1034                 wdiff& diff = m_wdiffs[i];
1035                 String str1_2, str2_2;
1036                 str1_2 = m_str1.substr(diff.begin[0], diff.end[0] - diff.begin[0] + 1);
1037                 str2_2 = m_str2.substr(diff.begin[1], diff.end[1] - diff.begin[1] + 1);
1038                 ComputeByteDiff(str1_2, str2_2, m_case_sensitive, m_whitespace, begin, end, false);
1039                 if (begin[0] == -1)
1040                 {
1041                         // no visible diff on side1
1042                         diff.end[0] = diff.begin[0] - 1;
1043                 }
1044                 else
1045                 {
1046                         diff.end[0] = diff.begin[0] + end[0];
1047                         diff.begin[0] += begin[0];
1048                 }
1049                 if (begin[1] == -1)
1050                 {
1051                         // no visible diff on side2
1052                         diff.end[1] = diff.begin[1] - 1;
1053                 }
1054                 else
1055                 {
1056                         diff.end[1] = diff.begin[1] + end[1];
1057                         diff.begin[1] += begin[1];
1058                 }
1059         }
1060 }
1061
1062 }