Src/stringdiffs.cpp

   1 /**
   2  * @file  stringdiffs.cpp
   3  *
   4  * @brief Implementation file for ComputeWordDiffs (q.v.)
   5  *
   6  */
   7
   8 #include "pch.h"
   9 #include "stringdiffs.h"
  10 #define NOMINMAX
  11 #include <cassert>
  12 #include <chrono>
  13 #include "CompareOptions.h"
  14 #include "stringdiffsi.h"
  15 #include "Diff3.h"
  16
  17 using std::vector;
  18
  19 namespace strdiff
  20 {
  21
  22 static bool Initialized;
  23 static bool CustomChars;
  24 static TCHAR *BreakChars;
  25 static TCHAR BreakCharDefaults[] = _T(",.;:");
  26 static int TimeoutMilliSeconds = 500;
  27
  28 static bool isSafeWhitespace(TCHAR ch);
  29 static bool isWordBreak(int breakType, const TCHAR *str, int index, bool ignore_numbers);
  30
  31 void Init()
  32 {
  33         BreakChars = &BreakCharDefaults[0];
  34         Initialized = true;
  35 }
  36
  37 void Close()
  38 {
  39         if (CustomChars)
  40         {
  41                 free(BreakChars);
  42                 BreakChars = nullptr;
  43                 CustomChars = false;
  44         }
  45         Initialized = false;
  46 }
  47
  48 void SetBreakChars(const TCHAR *breakChars)
  49 {
  50         assert(Initialized);
  51
  52         if (CustomChars)
  53                 free(BreakChars);
  54
  55         CustomChars = true;
  56         BreakChars = _tcsdup(breakChars);
  57 }
  58
  59 std::vector<wdiff>
  60 ComputeWordDiffs(const String& str1, const String& str2,
  61         bool case_sensitive, bool eol_sensitive, int whitespace, bool ignore_numbers, int breakType, bool byte_level)
  62 {
  63         String strs[3] = {str1, str2, _T("")};
  64         return ComputeWordDiffs(2, strs, case_sensitive, eol_sensitive, whitespace, ignore_numbers, breakType, byte_level);
  65 }
  66
  67 struct Comp02Functor
  68 {
  69         Comp02Functor(const String *strs, bool case_sensitive) :
  70                 strs_(strs), case_sensitive_(case_sensitive)
  71         {
  72         }
  73         bool operator()(const wdiff &wd3)
  74         {
  75                 size_t wlen0 = wd3.end[0] - wd3.begin[0] + 1;
  76                 size_t wlen2 = wd3.end[2] - wd3.begin[2] + 1;
  77                 if (wlen0 != wlen2)
  78                         return false;
  79                 if (case_sensitive_)
  80                 {
  81                         if (memcmp(&strs_[0][wd3.begin[0]], &strs_[2][wd3.begin[2]], wlen0 * sizeof(TCHAR)) != 0)
  82                                 return false;
  83                 }
  84                 else
  85                 {
  86                         if (_tcsnicmp(&strs_[0][wd3.begin[0]], &strs_[2][wd3.begin[2]], wlen0) != 0)
  87                                 return false;
  88                 }
  89                 return true;
  90         }
  91         const String *strs_;
  92         bool case_sensitive_;
  93 };
  94
  95 /**
  96  * @brief Construct our worker object and tell it to do the work
  97  */
  98 std::vector<wdiff>
  99 ComputeWordDiffs(int nFiles, const String *str,
 100         bool case_sensitive, bool eol_sensitive, int whitespace, bool ignore_numbers, int breakType, bool byte_level)
 101 {
 102         std::vector<wdiff> diffs;
 103         if (nFiles == 2)
 104         {
 105                 stringdiffs sdiffs(str[0], str[1], case_sensitive, eol_sensitive, whitespace, ignore_numbers, breakType, &diffs);
 106                 // Hash all words in both lines and then compare them word by word
 107                 // storing differences into m_wdiffs
 108                 sdiffs.BuildWordDiffList();
 109
 110                 if (byte_level)
 111                         sdiffs.wordLevelToByteLevel();
 112
 113                 // Now copy m_wdiffs into caller-supplied m_pDiffs (coalescing adjacents if possible)
 114                 sdiffs.PopulateDiffs();
 115
 116         }
 117         else
 118         {
 119                 if (str[0].empty())
 120                 {
 121                         stringdiffs sdiffs(str[1], str[2], case_sensitive, eol_sensitive, whitespace, ignore_numbers, breakType, &diffs);
 122                         sdiffs.BuildWordDiffList();
 123                         if (byte_level)
 124                                 sdiffs.wordLevelToByteLevel();
 125                         sdiffs.PopulateDiffs();
 126                         for (size_t i = 0; i < diffs.size(); i++)
 127                         {
 128                                 wdiff& diff = diffs[i];
 129                                 diff.begin[2] = diff.begin[1];
 130                                 diff.begin[1] = diff.begin[0];
 131                                 diff.begin[0] = 0;
 132                                 diff.end[2] = diff.end[1];
 133                                 diff.end[1] = diff.end[0];
 134                                 diff.end[0] = -1;
 135                         }
 136                 }
 137                 else if (str[1].empty())
 138                 {
 139                         stringdiffs sdiffs(str[0], str[2], case_sensitive, eol_sensitive, whitespace, ignore_numbers, breakType, &diffs);
 140                         sdiffs.BuildWordDiffList();
 141                         if (byte_level)
 142                                 sdiffs.wordLevelToByteLevel();
 143                         sdiffs.PopulateDiffs();
 144                         for (size_t i = 0; i < diffs.size(); i++)
 145                         {
 146                                 wdiff& diff = diffs[i];
 147                                 diff.begin[2] = diff.begin[1];
 148                                 //diff.begin[0] = diff.begin[0];
 149                                 diff.begin[1] = 0;
 150                                 diff.end[2] = diff.end[1];
 151                                 //diff.end[0] = diff.end[0];
 152                                 diff.end[1] = -1;
 153                         }
 154                 }
 155                 else if (str[2].empty())
 156                 {
 157                         stringdiffs sdiffs(str[0], str[1], case_sensitive, eol_sensitive, whitespace, ignore_numbers, breakType, &diffs);
 158                         sdiffs.BuildWordDiffList();
 159                         if (byte_level)
 160                                 sdiffs.wordLevelToByteLevel();
 161                         sdiffs.PopulateDiffs();
 162                         for (size_t i = 0; i < diffs.size(); i++)
 163                         {
 164                                 wdiff& diff = diffs[i];
 165                                 //diff.begin[1] = diff.begin[1];
 166                                 //diff.begin[0] = diff.begin[0];
 167                                 diff.begin[2] = 0;
 168                                 //diff.end[1] = diff.end[1];
 169                                 //diff.end[0] = diff.end[0];
 170                                 diff.end[2] = -1;
 171                         }
 172                 }
 173                 else
 174                 {
 175                         std::vector<wdiff> diffs10, diffs12;
 176                         stringdiffs sdiffs10(str[1], str[0], case_sensitive, eol_sensitive, 0, ignore_numbers, breakType, &diffs10);
 177                         stringdiffs sdiffs12(str[1], str[2], case_sensitive, eol_sensitive, 0, ignore_numbers, breakType, &diffs12);
 178                         // Hash all words in both lines and then compare them word by word
 179                         // storing differences into m_wdiffs
 180                         sdiffs10.BuildWordDiffList();
 181                         sdiffs12.BuildWordDiffList();
 182                         if (byte_level)
 183                         {
 184                                 sdiffs10.wordLevelToByteLevel();
 185                                 sdiffs12.wordLevelToByteLevel();
 186                         }
 187                         // Now copy m_wdiffs into caller-supplied m_pDiffs (coalescing adjacents if possible)
 188                         sdiffs10.PopulateDiffs();
 189                         sdiffs12.PopulateDiffs();
 190
 191                         Make3wayDiff(diffs, diffs10, diffs12,
 192                                 Comp02Functor(str, case_sensitive), false);
 193                 }
 194         }
 195         return diffs;
 196 }
 197
 198 int Compare(const String& str1, const String& str2,
 199         bool case_sensitive, bool eol_sensitive, int whitespace, bool ignore_numbers)
 200 {
 201         if (case_sensitive && eol_sensitive && whitespace == WHITESPACE_COMPARE_ALL && !ignore_numbers)
 202                 return str2.compare(str1);
 203         String s1 = str1, s2 = str2;
 204         if (!case_sensitive)
 205         {
 206                 s1 = strutils::makelower(s1);
 207                 s2 = strutils::makelower(s2);
 208         }
 209         if (whitespace == WHITESPACE_IGNORE_CHANGE)
 210         {
 211                 strutils::replace_chars(s1, _T(" \t"), _T(" "));
 212                 strutils::replace_chars(s2, _T(" \t"), _T(" "));
 213         }
 214         else if (whitespace == WHITESPACE_IGNORE_ALL)
 215         {
 216                 strutils::replace_chars(s1, _T(" \t"), _T(""));
 217                 strutils::replace_chars(s2, _T(" \t"), _T(""));
 218         }
 219         if (!eol_sensitive)
 220         {
 221                 strutils::replace_chars(s1, _T("\r\n"), _T("\n"));
 222                 strutils::replace_chars(s2, _T("\r\n"), _T("\n"));
 223         }
 224         if (ignore_numbers)
 225         {
 226                 strutils::replace_chars(s1, _T("0123456789"), _T(""));
 227                 strutils::replace_chars(s2, _T("0123456789"), _T(""));
 228         }
 229         return s2.compare(s1);
 230 }
 231
 232 /**
 233  * @brief stringdiffs constructor simply loads all members from arguments
 234  */
 235 stringdiffs::stringdiffs(const String & str1, const String & str2,
 236         bool case_sensitive, bool eol_sensitive, int whitespace, bool ignore_numbers, int breakType,
 237         std::vector<wdiff> * pDiffs)
 238 : m_str1(str1)
 239 , m_str2(str2)
 240 , m_case_sensitive(case_sensitive)
 241 , m_eol_sensitive(eol_sensitive)
 242 , m_whitespace(whitespace)
 243 , m_ignore_numbers(ignore_numbers)
 244 , m_breakType(breakType)
 245 , m_pDiffs(pDiffs)
 246 , m_matchblock(true) // Change to false to get word to word compare
 247 {
 248 }
 249
 250 /**
 251  * @brief Destructor.
 252  * The destructor frees all diffs added to the vectors.
 253  */
 254 stringdiffs::~stringdiffs() = default;
 255
 256 #ifdef STRINGDIFF_LOGGING
 257 void
 258 stringdiffs::debugoutput()
 259 {
 260         for (size_t i = 0; i < m_wdiffs.size(); i++)
 261         {
 262                 String str1;
 263                 String str2;
 264                 TCHAR buf[256];
 265                 int s1 = m_wdiffs[i].begin[0];
 266                 int e1 = m_wdiffs[i].end[0];
 267                 int s2 = m_wdiffs[i].begin[1];
 268                 int e2 = m_wdiffs[i].end[1];
 269
 270                 int len1 = e1 - s1 + 1;
 271                 int len2 = e2 - s2 + 1;
 272
 273                 if (len1 < 50)
 274                         str1 = m_str1.substr(s1 ,e1 - s1 + 1);
 275                 else
 276                         str1 = m_str1.substr(s1, 50);
 277
 278                 if (len2 < 50)
 279                         str2 = m_str2.substr(s2, e2- s2 + 1);
 280                 else
 281                         str2 = m_str2.substr(s2, 50);
 282
 283                 wsprintf(buf, _T("left=  %s,   %d,%d,\nright= %s,   %d,%d \n"),
 284                         str1.c_str(), s1, e1, str2.c_str(), s2, e2);
 285                 OutputDebugString(buf);
 286         }
 287 }
 288 #endif
 289
 290 bool
 291 stringdiffs::BuildWordDiffList_DP()
 292 {
 293         std::vector<char> edscript;
 294
 295         //if (dp(edscript) <= 0)
 296         //      return false;
 297         if (onp(edscript) < 0)
 298                 return false;
 299
 300         int i = 1, j = 1;
 301         for (size_t k = 0; k < edscript.size(); k++)
 302         {
 303                 int s1, e1, s2, e2;
 304                 if (edscript[k] == '-')
 305                 {
 306                         if (m_whitespace == WHITESPACE_IGNORE_ALL)
 307                         {
 308                                 if (IsSpace(m_words1[i]))
 309                                 {
 310                                         i++;
 311                                         continue;
 312                                 }
 313                         }
 314                         if (m_ignore_numbers && IsNumber(m_words1[i]))
 315                         {
 316                                 i++;
 317                                 continue;
 318                         }
 319
 320                         s1 = m_words1[i].start;
 321                         e1 = m_words1[i].end;
 322                         s2 = m_words2[j-1].end+1;
 323                         e2 = s2-1;
 324                         m_wdiffs.emplace_back(s1, e1, s2, e2);
 325                         i++;
 326                 }
 327                 else if (edscript[k] == '+')
 328                 {
 329                         if (m_whitespace == WHITESPACE_IGNORE_ALL)
 330                         {
 331                                 if (IsSpace(m_words2[j]))
 332                                 {
 333                                         j++;
 334                                         continue;
 335                                 }
 336                         }
 337
 338                         if (m_ignore_numbers && IsNumber(m_words2[j]))
 339                         {
 340                                 j++;
 341                                 continue;
 342                         }
 343
 344                         s1 = m_words1[i-1].end+1;
 345                         e1 = s1-1;
 346                         s2 = m_words2[j].start;
 347                         e2 = m_words2[j].end;
 348                         m_wdiffs.emplace_back(s1, e1, s2, e2);
 349                         j++;
 350                 }
 351                 else if (edscript[k] == '!')
 352                 {
 353                         if (m_whitespace == WHITESPACE_IGNORE_CHANGE || m_whitespace == WHITESPACE_IGNORE_ALL)
 354                         {
 355                                 if (IsSpace(m_words1[i]) && IsSpace(m_words2[j]))
 356                                 {
 357                                         i++; j++;
 358                                         continue;
 359                                 }
 360                         }
 361                         if (m_ignore_numbers && IsNumber(m_words1[i]) && IsNumber(m_words2[j]))
 362                         {
 363                                 i++; j++;
 364                                 continue;
 365                         }
 366
 367                         s1 =  m_words1[i].start;
 368                         e1 =  m_words1[i].end;
 369                         s2 =  m_words2[j].start;
 370                         e2 =  m_words2[j].end ;
 371                         m_wdiffs.emplace_back(s1, e1, s2, e2);
 372                         i++; j++;
 373                 }
 374                 else
 375                 {
 376                         i++; j++;
 377                 }
 378         }
 379 #ifdef STRINGDIFF_LOGGING
 380         debugoutput();
 381 #endif
 382         return true;
 383 }
 384
 385 /**
 386  * @brief Add all different elements between lines to the wdiff list
 387  */
 388 void
 389 stringdiffs::BuildWordDiffList()
 390 {
 391         m_words1 = BuildWordsArray(m_str1);
 392         m_words2 = BuildWordsArray(m_str2);
 393
 394         bool succeeded = false;
 395 #ifdef _WIN64
 396         if (m_words1.size() < 20480 && m_words2.size() < 20480)
 397 #else
 398         if (m_words1.size() < 2048 && m_words2.size() < 2048)
 399 #endif
 400         {
 401                 succeeded = BuildWordDiffList_DP();
 402         }
 403         if (!succeeded)
 404         {
 405                 int s1 = m_words1[0].start;
 406                 int e1 = m_words1[m_words1.size() - 1].end;
 407                 int s2 = m_words2[0].start;
 408                 int e2 = m_words2[m_words2.size() - 1].end;
 409                 m_wdiffs.emplace_back(s1, e1, s2, e2);
 410                 return;
 411         }
 412
 413 }
 414
 415 /**
 416  * @brief Break line into constituent words
 417  */
 418 std::vector<stringdiffs::word>
 419 stringdiffs::BuildWordsArray(const String & str)
 420 {
 421         std::vector<word> words;
 422         int i = 0, begin = 0;
 423         ICUBreakIterator *pIterChar = ICUBreakIterator::getCharacterBreakIterator(reinterpret_cast<const UChar *>(str.c_str()), static_cast<int32_t>(str.length()));
 424
 425         size_t sLen = str.length();
 426         assert(sLen < INT_MAX);
 427         int iLen = static_cast<int>(sLen);
 428
 429         // dummy;
 430         words.push_back(word(0, -1, 0, 0));
 431
 432         // state when we are looking for next word
 433 inspace:
 434         if (isSafeWhitespace(str[i]))
 435         {
 436                 i = pIterChar->next();
 437                 goto inspace;
 438         }
 439         if (begin < i)
 440         {
 441                 // just finished a word
 442                 // e is first word character (space or at end)
 443                 int e = i - 1;
 444
 445                 words.push_back(word(begin, e, dlspace, Hash(str, begin, e, 0)));
 446         }
 447         if (i == iLen)
 448                 return words;
 449         begin = i;
 450         goto inword;
 451
 452         // state when we are inside a word
 453 inword:
 454         bool atspace=false;
 455         if (i == iLen || ((atspace = isSafeWhitespace(str[i])) != 0) || isWordBreak(m_breakType, str.c_str(), i, m_ignore_numbers))
 456         {
 457                 if (begin<i)
 458                 {
 459                         // just finished a word
 460                         // e is first non-word character (space or at end)
 461                         int e = i-1;
 462
 463                         words.push_back(word(begin, e, dlword, Hash(str, begin, e, 0)));
 464                 }
 465                 if (i == iLen)
 466                 {
 467                         return words;
 468                 }
 469                 else if (atspace)
 470                 {
 471                         begin = i;
 472                         goto inspace;
 473                 }
 474                 else
 475                 {
 476                         // start a new word because we hit a non-whitespace word break (eg, a comma)
 477                         // but, we have to put each word break character into its own word
 478                         int break_type = dlbreak;
 479                         if (m_ignore_numbers && _istdigit(str[i]))
 480                         {
 481                                 break_type = dlnumber;
 482                         }
 483                         int inext = pIterChar->next();
 484                         words.push_back(word(i, inext - 1, break_type, Hash(str, i, inext - 1, 0)));
 485                         i = inext;
 486                         begin = i;
 487                         goto inword;
 488                 }
 489         }
 490         i = pIterChar->next();
 491         goto inword; // safe even if we're at the end or no longer in a word
 492 }
 493
 494 /**
 495  * @brief Populate m_pDiffs from m_wdiffs (combining adjacent diffs)
 496  *
 497  * Doing the combining of adjacent diffs here keeps some complexity out of BuildWordsArray.
 498  */
 499 void
 500 stringdiffs::PopulateDiffs()
 501 {
 502         auto IsEOLorEmpty = [](const String& text, size_t begin, size_t end) -> bool {
 503                 if (end - begin + 1 > 2)
 504                         return false;
 505                 String str = text.substr(begin, end - begin + 1);
 506                 return (str.empty() || str == _T("\r\n") || str == _T("\n") || str == _T("\r"));
 507         };
 508
 509         for (int i=0; i< (int)m_wdiffs.size(); ++i)
 510         {
 511                 bool skipIt = false;
 512                 // combine it with next ?
 513                 if (i+1< (int)m_wdiffs.size())
 514                 {
 515                         if (m_wdiffs[i].end[0] + 1 == m_wdiffs[i+1].begin[0]
 516                                 && m_wdiffs[i].end[1] + 1 == m_wdiffs[i+1].begin[1])
 517                         {
 518                                 // diff[i] and diff[i+1] are contiguous
 519                                 // so combine them into diff[i+1] and ignore diff[i]
 520                                 m_wdiffs[i+1].begin[0] = m_wdiffs[i].begin[0];
 521                                 m_wdiffs[i+1].begin[1] = m_wdiffs[i].begin[1];
 522                                 skipIt = true;
 523                         }
 524                 }
 525                 else
 526                 {
 527                         if (!m_eol_sensitive &&
 528                                 IsEOLorEmpty(m_str1, m_wdiffs[i].begin[0], m_wdiffs[i].end[0]) &&
 529                                 IsEOLorEmpty(m_str2, m_wdiffs[i].begin[1], m_wdiffs[i].end[1]))
 530                                 skipIt = true;
 531                 }
 532                 if (!skipIt)
 533                 {
 534                         // Should never have a pair where both are missing
 535                         assert(m_wdiffs[i].begin[0]>=0 || m_wdiffs[i].begin[1]>=0);
 536
 537                         // Store the diff[i] in the caller list (m_pDiffs)
 538                         m_pDiffs->emplace_back(m_wdiffs[i]);
 539                 }
 540         }
 541 }
 542
 543 // diffutils hash
 544
 545 /* Rotate a value n bits to the left. */
 546 #define UINT_BIT (sizeof (unsigned) * CHAR_BIT)
 547 #define ROL(v, n) ((v) << (n) | (v) >> (UINT_BIT - (n)))
 548 /* Given a hash value and a new character, return a new hash value. */
 549 #define HASH(h, c) ((c) + ROL (h, 7))
 550
 551 unsigned
 552 stringdiffs::Hash(const String & str, int begin, int end, unsigned h) const
 553 {
 554         for (int i = begin; i <= end; ++i)
 555         {
 556                 TCHAR ch = static_cast<unsigned>(str[i]);
 557                 if (m_case_sensitive)
 558                 {
 559                         h += HASH(h, ch);
 560                 }
 561                 else
 562                 {
 563                         ch = static_cast<unsigned>(_totlower(ch));
 564                         h += HASH(h, ch);
 565                 }
 566         }
 567
 568         return h;
 569 }
 570
 571 /**
 572  * @brief Compare two words (by reference to original strings)
 573  */
 574 bool
 575 stringdiffs::AreWordsSame(const word & word1, const word & word2) const
 576 {
 577         if (this->m_whitespace != WHITESPACE_COMPARE_ALL)
 578         {
 579                 if (IsSpace(word1) && IsSpace(word2))
 580                         return true;
 581         }
 582         if (m_ignore_numbers)
 583         {
 584                 auto a = m_str1[word1.start];
 585                 auto b = m_str2[word2.start];
 586                 if (_istdigit(a) && _istdigit(b))
 587                         return true;
 588
 589         }
 590
 591         if (word1.hash != word2.hash)
 592                 return false;
 593         if (word1.length() != word2.length())
 594                 return false;
 595         for (int i=0; i<word1.length(); ++i)
 596         {
 597                 if (!caseMatch(m_str1[word1.start+i], m_str2[word2.start+i]))
 598                         return false;
 599         }
 600         return true;
 601 }
 602
 603 /**
 604  * @brief Return true if characters match
 605  */
 606 bool
 607 stringdiffs::caseMatch(TCHAR ch1, TCHAR ch2) const
 608 {
 609         if (m_case_sensitive)
 610                 return ch1==ch2;
 611         else
 612                 return _totlower(ch1)==_totlower(ch2);
 613 }
 614
 615 /**
 616  * @ brief An O(NP) Sequence Comparison Algorithm. Sun Wu, Udi Manber, Gene Myers
 617  */
 618 int
 619 stringdiffs::onp(std::vector<char> &edscript)
 620 {
 621         auto start = std::chrono::system_clock::now();
 622
 623         int M = static_cast<int>(m_words1.size() - 1);
 624         int N = static_cast<int>(m_words2.size() - 1);
 625         bool exchanged = false;
 626         if (M > N)
 627         {
 628                 M = static_cast<int>(m_words2.size() - 1);
 629                 N = static_cast<int>(m_words1.size() - 1);
 630                 exchanged = true;
 631         }
 632         int *fp = (new int[(M+1) + 1 + (N+1)]) + (M+1);
 633         struct EditScriptElem { int op; int neq; int pk; int pi; };
 634         std::vector<EditScriptElem> *es = (new std::vector<EditScriptElem>[(M+1) + 1 + (N+1)]) + (M+1);
 635         int DELTA = N - M;
 636
 637         auto addEditScriptElem = [&es, &fp](int k) {
 638                 EditScriptElem ese;
 639                 if (fp[k - 1] + 1 > fp[k + 1])
 640                 {
 641                         ese.op = '+';
 642                         ese.neq = fp[k] - (fp[k - 1] + 1);
 643                         ese.pk = k - 1;
 644                 }
 645                 else
 646                 {
 647                         ese.op = '-';
 648                         ese.neq = fp[k] - fp[k + 1];
 649                         ese.pk = k + 1;
 650                 }
 651                 ese.pi = static_cast<int>(es[ese.pk].size() - 1);
 652                 es[k].push_back(ese);
 653         };
 654
 655         const int COUNTMAX = 100000;
 656         int count = 0;
 657         int k;
 658         for (k = -(M+1); k <= (N+1); k++)
 659                 fp[k] = -1;
 660         int p = -1;
 661         do
 662         {
 663                 p = p + 1;
 664                 for (k = -p; k <= DELTA-1; k++)
 665                 {
 666                         fp[k] = snake(k, std::max(fp[k-1] + 1, fp[k+1]), M, N, exchanged);
 667                         addEditScriptElem(k);
 668                         count++;
 669                 }
 670                 for (k = DELTA + p; k >= DELTA+1; k--)
 671                 {
 672                         fp[k] = snake(k, std::max(fp[k-1] + 1, fp[k+1]), M, N, exchanged);
 673                         addEditScriptElem(k);
 674                         count++;
 675                 }
 676                 k = DELTA;
 677                 fp[k] = snake(k, std::max(fp[k-1] + 1, fp[k+1]), M, N, exchanged);
 678                 addEditScriptElem(k);
 679                 count++;
 680
 681                 if (count > COUNTMAX)
 682                 {
 683                         count = 0;
 684                         auto end = std::chrono::system_clock::now();
 685                         auto msec = std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count();
 686                         if (msec > TimeoutMilliSeconds)
 687                         {
 688                                 delete [] (es - (M+1));
 689                                 delete [] (fp - (M+1));
 690                                 return -1;
 691                         }
 692                 }
 693         } while (fp[k] != N);
 694
 695         edscript.clear();
 696
 697         std::vector<char> ses;
 698         int i;
 699         for (k = DELTA, i = static_cast<int>(es[DELTA].size() - 1); i >= 0;)
 700         {
 701                 EditScriptElem& esi = es[k][i];
 702                 for (int j = 0; j < esi.neq; ++j)
 703                         ses.push_back('=');
 704                 ses.push_back(static_cast<char>(esi.op));
 705                 i = esi.pi;
 706                 k = esi.pk;
 707         }
 708         std::reverse(ses.begin(), ses.end());
 709
 710         int D = 0;
 711         for (i = 1; i < static_cast<int>(ses.size()); i++)
 712         {
 713                 switch (ses[i])
 714                 {
 715                 case '+':
 716                         if (static_cast<size_t>(i + 1) < ses.size() && ses[i + 1] == '-')
 717                         {
 718                                 edscript.push_back('!');
 719                                 i++;
 720                                 D++;
 721                         }
 722                         else
 723                         {
 724                                 edscript.push_back(exchanged ? '-' : '+');
 725                                 D++;
 726                         }
 727                         break;
 728                 case '-':
 729                         if (static_cast<size_t>(i + 1) < ses.size() && ses[i + 1] == '+')
 730                         {
 731                                 edscript.push_back('!');
 732                                 i++;
 733                                 D++;
 734                         }
 735                         else
 736                         {
 737                                 edscript.push_back(exchanged ? '+' : '-');
 738                                 D++;
 739                         }
 740                         break;
 741                 default:
 742                         edscript.push_back('=');
 743                 }
 744         }
 745
 746         delete [] (es - (M+1));
 747         delete [] (fp - (M+1));
 748
 749         return D;
 750 }
 751
 752 int
 753 stringdiffs::snake(int k, int y, int M, int N, bool exchanged)
 754 {
 755         int x = y - k;
 756         if (exchanged)
 757         {
 758                 while (x < M && y < N && AreWordsSame(m_words1[y + 1], m_words2[x + 1])) {
 759                         x++; y++;
 760                 }
 761         }
 762         else
 763         {
 764                 while (x < M && y < N && AreWordsSame(m_words1[x + 1], m_words2[y + 1])) {
 765                         x++; y++;
 766                 }
 767         }
 768         return y;
 769 }
 770
 771 /**
 772  * @brief Return true if chars match
 773  *
 774  * Caller must not call this for lead bytes
 775  */
 776 static inline bool
 777 matchchar(const TCHAR *ch1, const TCHAR *ch2, size_t len, bool casitive)
 778 {
 779         if (casitive)
 780                 return memcmp(ch1, ch2, len * sizeof(TCHAR)) == 0;
 781         for (size_t i = 0; i < len; ++i)
 782         {
 783                 if (_totlower(ch1[i]) != _totlower(ch2[i]))
 784                         return false;
 785         }
 786         return true;
 787 }
 788
 789
 790 /** Does character introduce a multicharacter character? */
 791 static inline bool IsLeadByte(TCHAR ch)
 792 {
 793 #ifdef UNICODE
 794         return false;
 795 #else
 796         return _getmbcp() && IsDBCSLeadByte(ch);
 797 #endif
 798 }
 799
 800 /**
 801  * @brief Is it whitespace (excludes all lead & trail bytes)?
 802  */
 803 static inline bool
 804 isSafeWhitespace(TCHAR ch)
 805 {
 806         return _istspace((unsigned)ch) && !IsLeadByte(ch);
 807 }
 808
 809 /**
 810  * @brief Is it a non-whitespace wordbreak character (ie, punctuation)?
 811  */
 812 static bool
 813 isWordBreak(int breakType, const TCHAR *str, int index, bool ignore_numbers)
 814 {
 815         TCHAR ch = str[index];
 816         if (ignore_numbers && _istdigit(ch))
 817                 return true;
 818         // breakType==1 means break also on punctuation
 819         if ((ch & 0xff00) == 0)
 820         {
 821 //              TCHAR nextCh = str[index + 1];
 822                 // breakType==0 means whitespace only
 823                 if (breakType==0)
 824                         return false;
 825                 return _tcschr(BreakChars, ch) != nullptr;
 826         }
 827         else
 828         {
 829 //              if (
 830 //                      ch==0xff0c/* Fullwidth Full Stop */ ||
 831 //                      ch==0xff0e/* Fullwidth Comma */ ||
 832 //                      ch==0xff1b/* Fullwidth Semicolon */ ||
 833 //                      ch==0xff1a/* Fullwidth Colon */ ||
 834 //                      ch==0x3002/* Ideographic Full Stop */ ||
 835 //                      ch==0x3001/* Ideographic Comma */
 836 //                      )
 837 //                      return true;
 838 //              WORD wCharType, wCharTypeNext;
 839 //              GetStringTypeW(CT_CTYPE3, &ch, 1, &wCharType);
 840 //              TCHAR nextCh = str[index + 1];
 841 //              GetStringTypeW(CT_CTYPE3, &nextCh, 1, &wCharTypeNext);
 842 //              return (wCharType != wCharTypeNext);
 843 //
 844                 WORD wCharType = 0;
 845                 GetStringTypeW(CT_CTYPE1, &ch, 1, &wCharType);
 846                 if ((wCharType & (C1_UPPER | C1_LOWER | C1_DIGIT)) != 0)
 847                         return false;
 848                 return true;
 849         }
 850 }
 851
 852
 853 /**
 854  * @brief advance current pointer over whitespace, until not whitespace or beyond end
 855  * @param pcurrent [in,out] current location (to be advanced)
 856  * @param end [in] last valid position (only go one beyond this)
 857  */
 858 static void
 859 AdvanceOverWhitespace(const TCHAR **pcurrent, const TCHAR *end)
 860 {
 861         // advance over whitespace
 862         while (*pcurrent <= end && isSafeWhitespace(**pcurrent))
 863                 ++(*pcurrent); // DBCS safe because of isSafeWhitespace above
 864 }
 865
 866 /**
 867  * @brief Compute begin1,begin2,end1,end2 to display byte difference between strings str1 & str2
 868  * @param casitive [in] true for case-sensitive, false for case-insensitive
 869  * @param xwhite [in] This governs whether we handle whitespace specially (see WHITESPACE_COMPARE_ALL, WHITESPACE_IGNORE_CHANGE, WHITESPACE_IGNORE_ALL)
 870  * @param [out] begin return -1 if not found or pos of equal
 871  * @param [out] end return -1 if not found or pos of equal valid if begin1 >=0
 872  * @param [in] equal false surch for a diff, true surch for equal
 873  *
 874  *
 875  * Assumes whitespace is never leadbyte or trailbyte!
 876  */
 877 void
 878 stringdiffs::ComputeByteDiff(const String & str1, const String & str2,
 879                    bool casitive, int xwhite,
 880                    int begin[2], int end[2], bool equal)
 881 {
 882         // Set to sane values
 883         // Also this way can distinguish if we set begin[0] to -1 for no diff in line
 884         begin[0] = end[0] = begin[1] = end[1] = 0;
 885
 886         int len1 = static_cast<int>(str1.length());
 887         int len2 = static_cast<int>(str2.length());
 888
 889         const TCHAR *pbeg1 = str1.c_str();
 890         const TCHAR *pbeg2 = str2.c_str();
 891
 892         ICUBreakIterator *pIterCharBegin1 = ICUBreakIterator::getCharacterBreakIterator(reinterpret_cast<const UChar *>(pbeg1), static_cast<int32_t>(len1));
 893         ICUBreakIterator *pIterCharBegin2 = ICUBreakIterator::getCharacterBreakIterator<2>(reinterpret_cast<const UChar *>(pbeg2), static_cast<int32_t>(len2));
 894         ICUBreakIterator *pIterCharEnd1 = ICUBreakIterator::getCharacterBreakIterator<3>(reinterpret_cast<const UChar *>(pbeg1), static_cast<int32_t>(len1));
 895         ICUBreakIterator *pIterCharEnd2 = ICUBreakIterator::getCharacterBreakIterator<4>(reinterpret_cast<const UChar *>(pbeg2), static_cast<int32_t>(len2));
 896
 897         if (len1 == 0 || len2 == 0)
 898         {
 899                 if (len1 == len2)
 900                 {
 901                         begin[0] = -1;
 902                         begin[1] = -1;
 903                 }
 904                 end[0] = len1 - 1;
 905                 end[1] = len2 - 1;
 906                 return;
 907         }
 908
 909         // cursors from front, which we advance to beginning of difference
 910         const TCHAR *py1 = pbeg1;
 911         const TCHAR *py2 = pbeg2;
 912
 913         // pen1,pen2 point to the last valid character (broken multibyte lead chars don't count)
 914         const TCHAR *pen1 = pbeg1 + (len1 > 0 ? pIterCharEnd1->preceding(len1) : 0);
 915         const TCHAR *pen2 = pbeg2 + (len2 > 0 ? pIterCharEnd2->preceding(len2) : 0);
 916         size_t glyphlenz1 = pbeg1 + len1 - pen1;
 917         size_t glyphlenz2 = pbeg2 + len2 - pen2;
 918
 919         if (xwhite != WHITESPACE_COMPARE_ALL)
 920         {
 921                 // Ignore leading and trailing whitespace
 922                 // by advancing py1 and py2
 923                 // and retreating pen1 and pen2
 924                 while (py1 < pen1 && isSafeWhitespace(*py1))
 925                         py1 = pbeg1 + pIterCharBegin1->next();
 926                 while (py2 < pen2 && isSafeWhitespace(*py2))
 927                         py2 = pbeg2 + pIterCharBegin2->next();
 928                 if ((pen1 < pbeg1 + len1 - 1 || pen2 < pbeg2 + len2 -1)
 929                         && (pbeg1[len1] != pbeg2[len2]))
 930                 {
 931                         // mismatched broken multibyte ends
 932                 }
 933                 else
 934                 {
 935                         while (pen1 > py1 && isSafeWhitespace(*pen1))
 936                                 pen1 = pbeg1 + pIterCharEnd1->previous();
 937                         while (pen2 > py2 && isSafeWhitespace(*pen2))
 938                                 pen2 = pbeg2 + pIterCharEnd2->previous();
 939                 }
 940         }
 941         //check for excaption of empty string on one side
 942         //In that case display all as a diff
 943         if (!equal && (((py1 == pen1) && isSafeWhitespace(*pen1)) ||
 944                 ((py2 == pen2) && isSafeWhitespace(*pen2))))
 945         {
 946                 begin[0] = 0;
 947                 begin[1] = 0;
 948                 end[0] = len1 - 1;
 949                 end[1] = len2 - 1;
 950                 return;
 951         }
 952         // Advance over matching beginnings of lines
 953         // Advance py1 & py2 from beginning until find difference or end
 954         while (true)
 955         {
 956                 // Potential difference extends from py1 to pen1 and py2 to pen2
 957
 958                 // Check if either side finished
 959                 if (py1 > pen1 && py2 > pen2)
 960                 {
 961                         begin[0] = end[0] = begin[1] = end[1] = -1;
 962                         break;
 963                 }
 964                 if (py1 > pen1 || py2 > pen2)
 965                 {
 966                         break;
 967                 }
 968
 969                 // handle all the whitespace logic (due to WinMerge whitespace settings)
 970                 if (xwhite && py1 < pen1 && isSafeWhitespace(*py1))
 971                 {
 972                         if (xwhite==WHITESPACE_IGNORE_CHANGE && !isSafeWhitespace(*py2))
 973                         {
 974                                 // py1 is white but py2 is not
 975                                 // in WHITESPACE_IGNORE_CHANGE mode,
 976                                 // this doesn't qualify as skippable whitespace
 977                                 break; // done with forward search
 978                         }
 979                         // gobble up all whitespace in current area
 980                         AdvanceOverWhitespace(&py1, pen1); // will go beyond end
 981                         AdvanceOverWhitespace(&py2, pen2); // will go beyond end
 982                         continue;
 983
 984                 }
 985                 if (xwhite && py2 < pen2 && isSafeWhitespace(*py2))
 986                 {
 987                         if (xwhite==WHITESPACE_IGNORE_CHANGE && !isSafeWhitespace(*py1))
 988                         {
 989                                 // py2 is white but py1 is not
 990                                 // in WHITESPACE_IGNORE_CHANGE mode,
 991                                 // this doesn't qualify as skippable whitespace
 992                                 break; // done with forward search
 993                         }
 994                         // gobble up all whitespace in current area
 995                         AdvanceOverWhitespace(&py1, pen1); // will go beyond end
 996                         AdvanceOverWhitespace(&py2, pen2); // will go beyond end
 997                         continue;
 998                 }
 999
1000                 const TCHAR* py1next = pbeg1 + pIterCharBegin1->next();
1001                 const TCHAR* py2next = pbeg2 + pIterCharBegin2->next();
1002                 size_t glyphleny1 = py1next - py1;
1003                 size_t glyphleny2 = py2next - py2;
1004                 if (glyphleny1 != glyphleny2 || !matchchar(py1, py2, glyphleny1, casitive))
1005                         break; // done with forward search
1006                 py1 = py1next;
1007                 py2 = py2next;
1008         }
1009
1010         // Potential difference extends from py1 to pen1 and py2 to pen2
1011
1012         // Store results of advance into return variables (begin[0] & begin[1])
1013         // -1 in a begin variable means no visible diff area
1014         begin[0] = static_cast<int>(py1 - pbeg1);
1015         begin[1] = static_cast<int>(py2 - pbeg2);
1016
1017         const TCHAR *pz1 = pen1;
1018         const TCHAR *pz2 = pen2;
1019
1020         // Retreat over matching ends of lines
1021         // Retreat pz1 & pz2 from end until find difference or beginning
1022         while (true)
1023         {
1024                 // Check if either side finished
1025                 if (pz1 < py1 && pz2 < py2)
1026                 {
1027                         begin[0] = end[0] = begin[1] = end[1] = -1;
1028                         break;
1029                 }
1030                 if (pz1 < py1 || pz2 < py2)
1031                 {
1032                         break;
1033                 }
1034
1035                 // handle all the whitespace logic (due to WinMerge whitespace settings)
1036                 if (xwhite && pz1 > py1 && isSafeWhitespace(*pz1))
1037                 {
1038                         if (xwhite==1 && !isSafeWhitespace(*pz2))
1039                                 break; // done with reverse search
1040                         // gobble up all whitespace in current area
1041                         while (pz1 > py1 && isSafeWhitespace(*pz1))
1042                                 pz1 = pbeg1 + pIterCharEnd1->previous();
1043                         while (pz2 > py2 && isSafeWhitespace(*pz2))
1044                                 pz2 = pbeg2 + pIterCharEnd2->previous();
1045                         continue;
1046
1047                 }
1048                 if (xwhite && pz2 > py2 && isSafeWhitespace(*pz2))
1049                 {
1050                         if (xwhite==1)
1051                                 break; // done with reverse search
1052                         while (pz2 > py2 && isSafeWhitespace(*pz2))
1053                                 pz2 = pbeg2 + pIterCharEnd2->previous();
1054                         continue;
1055                 }
1056
1057                 if (glyphlenz1 != glyphlenz2 || !matchchar(pz1, pz2, glyphlenz1, casitive))
1058                         break; // done with forward search
1059                 const TCHAR* pz1next = pz1;
1060                 const TCHAR* pz2next = pz2;
1061                 pz1 = (pz1 > pbeg1) ? pbeg1 + pIterCharEnd1->preceding(static_cast<int32_t>(pz1 - pbeg1)) : pz1 - 1;
1062                 pz2 = (pz2 > pbeg2) ? pbeg2 + pIterCharEnd2->preceding(static_cast<int32_t>(pz2 - pbeg2)) : pz2 - 1;
1063                 glyphlenz1 = pz1next - pz1;
1064                 glyphlenz2 = pz2next - pz2;
1065                 // Now do real character match
1066     }
1067
1068 /*      if (*pz1 == '\r' && *(pz1+1) == '\n')
1069         {
1070                 pz1++;
1071                 pz2++;
1072         }
1073         else if (*pz2 == '\r' && *(pz2+1) == '\n')
1074         {
1075                 pz2++;
1076                 pz1++;
1077         }
1078         if (*(pbeg1-1) == '\r' && *pbeg1 == '\n')
1079         {
1080                 pbeg1--;
1081                 pbeg2--;
1082         }
1083         else if (*(pbeg2-1) == '\r' && *pbeg2 == '\n')
1084         {
1085                 pbeg2--;
1086                 pbeg1--;
1087         }*/
1088
1089         // Store results of advance into return variables (end[0] & end[1])
1090         end[0] = static_cast<int>(pz1 - pbeg1 + glyphlenz1 - 1);
1091         end[1] = static_cast<int>(pz2 - pbeg2 + glyphlenz2 - 1);
1092
1093         // Check if difference region was empty
1094         if (begin[0] == end[0] + 1 && begin[1] == end[1] + 1)
1095                 begin[0] = -1; // no diff
1096 }
1097
1098 /**
1099  * @brief adjust the range of the specified word diffs down to byte(char) level.
1100  * @param str1, str2 [in] line to be compared
1101  * @param casitive [in] true for case-sensitive, false for case-insensitive
1102  * @param xwhite [in] This governs whether we handle whitespace specially
1103  *  (see WHITESPACE_COMPARE_ALL, WHITESPACE_IGNORE_CHANGE, WHITESPACE_IGNORE_ALL)
1104  */
1105 void stringdiffs::wordLevelToByteLevel()
1106 {
1107         for (size_t i = 0; i < m_wdiffs.size(); i++)
1108         {
1109                 int begin[3], end[3];
1110                 wdiff& diff = m_wdiffs[i];
1111                 String str1_2, str2_2;
1112                 str1_2 = m_str1.substr(diff.begin[0], diff.end[0] - diff.begin[0] + 1);
1113                 str2_2 = m_str2.substr(diff.begin[1], diff.end[1] - diff.begin[1] + 1);
1114                 ComputeByteDiff(str1_2, str2_2, m_case_sensitive, m_whitespace, begin, end, false);
1115                 if (begin[0] == -1)
1116                 {
1117                         // no visible diff on side1
1118                         diff.end[0] = diff.begin[0] - 1;
1119                 }
1120                 else
1121                 {
1122                         diff.end[0] = diff.begin[0] + end[0];
1123                         diff.begin[0] += begin[0];
1124                 }
1125                 if (begin[1] == -1)
1126                 {
1127                         // no visible diff on side2
1128                         diff.end[1] = diff.begin[1] - 1;
1129                 }
1130                 else
1131                 {
1132                         diff.end[1] = diff.begin[1] + end[1];
1133                         diff.begin[1] += begin[1];
1134                 }
1135         }
1136 }
1137
1138 }