Src/stringdiffs.cpp

   1 /**
   2  * @file  stringdiffs.cpp
   3  *
   4  * @brief Implementation file for sd_ComputeWordDiffs (q.v.)
   5  *
   6  */
   7
   8 #include "stringdiffs.h"
   9 #define NOMINMAX
  10 #include <windows.h>
  11 #include <tchar.h>
  12 #include <cstring>
  13 #include <cassert>
  14 #include <climits>
  15 #include <mbctype.h>
  16 #include "CompareOptions.h"
  17 #include "stringdiffsi.h"
  18 #include "Diff3.h"
  19
  20 using std::vector;
  21
  22 static bool Initialized;
  23 static bool CustomChars;
  24 static TCHAR *BreakChars;
  25 static TCHAR BreakCharDefaults[] = _T(",.;:");
  26
  27 static bool isSafeWhitespace(TCHAR ch);
  28 static bool isWordBreak(int breakType, const TCHAR *str, int index);
  29
  30 void sd_Init()
  31 {
  32         BreakChars = &BreakCharDefaults[0];
  33         Initialized = true;
  34 }
  35
  36 void sd_Close()
  37 {
  38         if (CustomChars)
  39         {
  40                 free(BreakChars);
  41                 BreakChars = NULL;
  42                 CustomChars = false;
  43         }
  44         Initialized = false;
  45 }
  46
  47 void sd_SetBreakChars(const TCHAR *breakChars)
  48 {
  49         assert(Initialized);
  50
  51         if (CustomChars)
  52                 free(BreakChars);
  53
  54         CustomChars = true;
  55         BreakChars = _tcsdup(breakChars);
  56 }
  57
  58 void
  59 sd_ComputeWordDiffs(const String& str1, const String& str2,
  60         bool case_sensitive, int whitespace, int breakType, bool byte_level,
  61         std::vector<wdiff> * pDiffs)
  62 {
  63         String strs[3] = {str1, str2, _T("")};
  64         sd_ComputeWordDiffs(2, strs, case_sensitive, whitespace, breakType, byte_level, pDiffs);
  65 }
  66
  67 struct Comp02Functor
  68 {
  69         Comp02Functor(const String *strs, bool case_sensitive) :
  70                 strs_(strs), case_sensitive_(case_sensitive)
  71         {
  72         }
  73         bool operator()(const wdiff &wd3)
  74         {
  75                 size_t wlen0 = wd3.end[0] - wd3.begin[0] + 1;
  76                 size_t wlen2 = wd3.end[2] - wd3.begin[2] + 1;
  77                 if (wlen0 != wlen2)
  78                         return false;
  79                 if (case_sensitive_)
  80                 {
  81                         if (memcmp(&strs_[0][wd3.begin[0]], &strs_[2][wd3.begin[2]], wlen0 * sizeof(TCHAR)) != 0)
  82                                 return false;
  83                 }
  84                 else
  85                 {
  86                         if (_tcsnicmp(&strs_[0][wd3.begin[0]], &strs_[2][wd3.begin[2]], wlen0) != 0)
  87                                 return false;
  88                 }
  89                 return true;
  90         }
  91         const String *strs_;
  92         bool case_sensitive_;
  93 };
  94
  95 /**
  96  * @brief Construct our worker object and tell it to do the work
  97  */
  98 void
  99 sd_ComputeWordDiffs(int nFiles, const String str[3],
 100         bool case_sensitive, int whitespace, int breakType, bool byte_level,
 101         std::vector<wdiff> * pDiffs)
 102 {
 103         if (nFiles == 2)
 104         {
 105                 stringdiffs sdiffs(str[0], str[1], case_sensitive, whitespace, breakType, pDiffs);
 106                 // Hash all words in both lines and then compare them word by word
 107                 // storing differences into m_wdiffs
 108                 sdiffs.BuildWordDiffList();
 109
 110                 if (byte_level)
 111                         sdiffs.wordLevelToByteLevel();
 112
 113                 // Now copy m_wdiffs into caller-supplied m_pDiffs (coalescing adjacents if possible)
 114                 sdiffs.PopulateDiffs();
 115
 116         }
 117         else
 118         {
 119                 if (str[0].empty())
 120                 {
 121                         stringdiffs sdiffs(str[1], str[2], case_sensitive, whitespace, breakType, pDiffs);
 122                         sdiffs.BuildWordDiffList();
 123                         if (byte_level)
 124                                 sdiffs.wordLevelToByteLevel();
 125                         sdiffs.PopulateDiffs();
 126                         for (size_t i = 0; i < pDiffs->size(); i++)
 127                         {
 128                                 wdiff& diff = (*pDiffs)[i];
 129                                 diff.begin[2] = diff.begin[1];
 130                                 diff.begin[1] = diff.begin[0];
 131                                 diff.begin[0] = 0;
 132                                 diff.end[2] = diff.end[1];
 133                                 diff.end[1] = diff.end[0];
 134                                 diff.end[0] = -1;
 135                         }
 136                 }
 137                 else if (str[1].empty())
 138                 {
 139                         stringdiffs sdiffs(str[0], str[2], case_sensitive, whitespace, breakType, pDiffs);
 140                         sdiffs.BuildWordDiffList();
 141                         if (byte_level)
 142                                 sdiffs.wordLevelToByteLevel();
 143                         sdiffs.PopulateDiffs();
 144                         for (size_t i = 0; i < pDiffs->size(); i++)
 145                         {
 146                                 wdiff& diff = (*pDiffs)[i];
 147                                 diff.begin[2] = diff.begin[1];
 148                                 //diff.begin[0] = diff.begin[0];
 149                                 diff.begin[1] = 0;
 150                                 diff.end[2] = diff.end[1];
 151                                 //diff.end[0] = diff.end[0];
 152                                 diff.end[1] = -1;
 153                         }
 154                 }
 155                 else if (str[2].empty())
 156                 {
 157                         stringdiffs sdiffs(str[0], str[1], case_sensitive, whitespace, breakType, pDiffs);
 158                         sdiffs.BuildWordDiffList();
 159                         if (byte_level)
 160                                 sdiffs.wordLevelToByteLevel();
 161                         sdiffs.PopulateDiffs();
 162                         for (size_t i = 0; i < pDiffs->size(); i++)
 163                         {
 164                                 wdiff& diff = (*pDiffs)[i];
 165                                 //diff.begin[1] = diff.begin[1];
 166                                 //diff.begin[0] = diff.begin[0];
 167                                 diff.begin[2] = 0;
 168                                 //diff.end[1] = diff.end[1];
 169                                 //diff.end[0] = diff.end[0];
 170                                 diff.end[2] = -1;
 171                         }
 172                 }
 173                 else
 174                 {
 175                         std::vector<wdiff> diffs10, diffs12;
 176                         stringdiffs sdiffs10(str[1], str[0], case_sensitive, whitespace, breakType, &diffs10);
 177                         stringdiffs sdiffs12(str[1], str[2], case_sensitive, whitespace, breakType, &diffs12);
 178                         // Hash all words in both lines and then compare them word by word
 179                         // storing differences into m_wdiffs
 180                         sdiffs10.BuildWordDiffList();
 181                         sdiffs12.BuildWordDiffList();
 182                         if (byte_level)
 183                         {
 184                                 sdiffs10.wordLevelToByteLevel();
 185                                 sdiffs12.wordLevelToByteLevel();
 186                         }
 187                         // Now copy m_wdiffs into caller-supplied m_pDiffs (coalescing adjacents if possible)
 188                         sdiffs10.PopulateDiffs();
 189                         sdiffs12.PopulateDiffs();
 190
 191                         Make3wayDiff(*pDiffs, diffs10, diffs12,
 192                                 Comp02Functor(str, case_sensitive), false);
 193                 }
 194         }
 195 }
 196
 197 /**
 198  * @brief stringdiffs constructor simply loads all members from arguments
 199  */
 200 stringdiffs::stringdiffs(const String & str1, const String & str2,
 201         bool case_sensitive, int whitespace, int breakType,
 202         std::vector<wdiff> * pDiffs)
 203 : m_str1(str1)
 204 , m_str2(str2)
 205 , m_case_sensitive(case_sensitive)
 206 , m_whitespace(whitespace)
 207 , m_breakType(breakType)
 208 , m_pDiffs(pDiffs)
 209 , m_matchblock(true) // Change to false to get word to word compare
 210 {
 211 }
 212
 213 /**
 214  * @brief Destructor.
 215  * The destructor frees all diffs added to the vectors.
 216  */
 217 stringdiffs::~stringdiffs()
 218 {
 219 }
 220
 221 #ifdef STRINGDIFF_LOGGING
 222 void
 223 stringdiffs::debugoutput()
 224 {
 225         for (size_t i = 0; i < m_wdiffs.size(); i++)
 226         {
 227                 String str1;
 228                 String str2;
 229                 TCHAR buf[256];
 230                 int s1 = m_wdiffs[i].begin[0];
 231                 int e1 = m_wdiffs[i].end[0];
 232                 int s2 = m_wdiffs[i].begin[1];
 233                 int e2 = m_wdiffs[i].end[1];
 234
 235                 int len1 = e1 - s1 + 1;
 236                 int len2 = e2 - s2 + 1;
 237
 238                 if (len1 < 50)
 239                         str1 = m_str1.substr(s1 ,e1 - s1 + 1);
 240                 else
 241                         str1 = m_str1.substr(s1, 50);
 242
 243                 if (len2 < 50)
 244                         str2 = m_str2.substr(s2, e2- s2 + 1);
 245                 else
 246                         str2 = m_str2.substr(s2, 50);
 247
 248                 wsprintf(buf, _T("left=  %s,   %d,%d,\nright= %s,   %d,%d \n"),
 249                         str1.c_str(), s1, e1, str2.c_str(), s2, e2);
 250                 OutputDebugString(buf);
 251         }
 252 }
 253 #endif
 254
 255 bool
 256 stringdiffs::BuildWordDiffList_DP()
 257 {
 258         std::vector<char> edscript;
 259
 260         //if (dp(edscript) <= 0)
 261         //      return false;
 262         onp(edscript);
 263
 264         int i = 1, j = 1;
 265         for (size_t k = 0; k < edscript.size(); k++)
 266         {
 267                 int s1, e1, s2, e2;
 268                 if (edscript[k] == '-')
 269                 {
 270                         if (m_whitespace == WHITESPACE_IGNORE_ALL)
 271                         {
 272                                 if (IsSpace(m_words1[i]))
 273                                 {
 274                                         i++;
 275                                         continue;
 276                                 }
 277                         }
 278
 279                         s1 = m_words1[i].start;
 280                         e1 = m_words1[i].end;
 281                         s2 = m_words2[j-1].end+1;
 282                         e2 = s2-1;
 283                         m_wdiffs.push_back(wdiff(s1, e1, s2, e2));
 284                         i++;
 285                 }
 286                 else if (edscript[k] == '+')
 287                 {
 288                         if (m_whitespace == WHITESPACE_IGNORE_ALL)
 289                         {
 290                                 if (IsSpace(m_words2[j]))
 291                                 {
 292                                         j++;
 293                                         continue;
 294                                 }
 295                         }
 296
 297                         s1 = m_words1[i-1].end+1;
 298                         e1 = s1-1;
 299                         s2 = m_words2[j].start;
 300                         e2 = m_words2[j].end;
 301                         m_wdiffs.push_back(wdiff(s1, e1, s2, e2));
 302                         j++;
 303                 }
 304                 else if (edscript[k] == '!')
 305                 {
 306                         if (m_whitespace == WHITESPACE_IGNORE_CHANGE || m_whitespace == WHITESPACE_IGNORE_ALL)
 307                         {
 308                                 if (IsSpace(m_words1[i]) && IsSpace(m_words2[j]))
 309                                 {
 310                                         i++; j++;
 311                                         continue;
 312                                 }
 313                         }
 314
 315                         s1 =  m_words1[i].start;
 316                         e1 =  m_words1[i].end;
 317                         s2 =  m_words2[j].start;
 318                         e2 =  m_words2[j].end ;
 319                         m_wdiffs.push_back(wdiff(s1, e1, s2, e2));
 320                         i++; j++;
 321                 }
 322                 else
 323                 {
 324                         i++; j++;
 325                 }
 326         }
 327 #ifdef STRINGDIFF_LOGGING
 328         debugoutput();
 329 #endif
 330         return true;
 331 }
 332
 333 /**
 334  * @brief Add all different elements between lines to the wdiff list
 335  */
 336 void
 337 stringdiffs::BuildWordDiffList()
 338 {
 339         BuildWordsArray(m_str1, m_words1);
 340         BuildWordsArray(m_str2, m_words2);
 341
 342 #ifdef _WIN64
 343         if (m_words1.size() > 20480 || m_words2.size() > 20480)
 344 #else
 345         if (m_words1.size() > 2048 || m_words2.size() > 2048)
 346 #endif
 347         {
 348                 int s1 = m_words1[0].start;
 349                 int e1 = m_words1[m_words1.size() - 1].end;
 350                 int s2 = m_words2[0].start;
 351                 int e2 = m_words2[m_words2.size() - 1].end;
 352                 m_wdiffs.push_back(wdiff(s1, e1, s2, e2));
 353                 return;
 354         }
 355
 356         BuildWordDiffList_DP();
 357 }
 358
 359 /**
 360  * @brief Break line into constituent words
 361  */
 362 void
 363 stringdiffs::BuildWordsArray(const String & str, std::vector<word>& words)
 364 {
 365         int i=0, begin=0;
 366
 367         // dummy;
 368         words.push_back(word(0, -1, 0, 0));
 369
 370         // state when we are looking for next word
 371 inspace:
 372         if (isSafeWhitespace(str[i]))
 373         {
 374                 ++i;
 375                 goto inspace;
 376         }
 377         if (begin < i)
 378         {
 379                 // just finished a word
 380                 // e is first word character (space or at end)
 381                 int e = i - 1;
 382
 383                 words.push_back(word(begin, e, dlspace, Hash(str, begin, e, 0)));
 384         }
 385         if (i == str.length())
 386                 return;
 387         begin = i;
 388         goto inword;
 389
 390         // state when we are inside a word
 391 inword:
 392         bool atspace=false;
 393         if (i == str.length() || ((atspace = isSafeWhitespace(str[i])) != 0) || isWordBreak(m_breakType, str.c_str(), i))
 394         {
 395                 if (begin<i)
 396                 {
 397                         // just finished a word
 398                         // e is first non-word character (space or at end)
 399                         int e = i-1;
 400
 401                         words.push_back(word(begin, e, dlword, Hash(str, begin, e, 0)));
 402                 }
 403                 if (i == str.length())
 404                 {
 405                         return;
 406                 }
 407                 else if (atspace)
 408                 {
 409                         begin = i;
 410                         goto inspace;
 411                 }
 412                 else
 413                 {
 414                         // start a new word because we hit a non-whitespace word break (eg, a comma)
 415                         // but, we have to put each word break character into its own word
 416                         words.push_back(word(i, i, dlbreak, Hash(str, i, i, 0)));
 417                         ++i;
 418                         begin = i;
 419                         goto inword;
 420                 }
 421         }
 422         ++i;
 423         goto inword; // safe even if we're at the end or no longer in a word
 424 }
 425
 426 /**
 427  * @brief Populate m_pDiffs from m_wdiffs (combining adjacent diffs)
 428  *
 429  * Doing the combining of adjacent diffs here keeps some complexity out of BuildWordsArray.
 430  */
 431 void
 432 stringdiffs::PopulateDiffs()
 433 {
 434         for (int i=0; i< (int)m_wdiffs.size(); ++i)
 435         {
 436                 bool skipIt = false;
 437                 // combine it with next ?
 438                 if (i+1< (int)m_wdiffs.size())
 439                 {
 440                         if (m_wdiffs[i].end[0] + 1 == m_wdiffs[i+1].begin[0]
 441                                 && m_wdiffs[i].end[1] + 1 == m_wdiffs[i+1].begin[1])
 442                         {
 443                                 // diff[i] and diff[i+1] are contiguous
 444                                 // so combine them into diff[i+1] and ignore diff[i]
 445                                 m_wdiffs[i+1].begin[0] = m_wdiffs[i].begin[0];
 446                                 m_wdiffs[i+1].begin[1] = m_wdiffs[i].begin[1];
 447                                 skipIt = true;
 448                         }
 449                 }
 450                 if (!skipIt)
 451                 {
 452                         // Should never have a pair where both are missing
 453                         assert(m_wdiffs[i].begin[0]>=0 || m_wdiffs[i].begin[1]>=0);
 454
 455                         // Store the diff[i] in the caller list (m_pDiffs)
 456                         m_pDiffs->push_back(wdiff(m_wdiffs[i]));
 457                 }
 458         }
 459 }
 460
 461 // diffutils hash
 462
 463 /* Rotate a value n bits to the left. */
 464 #define UINT_BIT (sizeof (unsigned) * CHAR_BIT)
 465 #define ROL(v, n) ((v) << (n) | (v) >> (UINT_BIT - (n)))
 466 /* Given a hash value and a new character, return a new hash value. */
 467 #define HASH(h, c) ((c) + ROL (h, 7))
 468
 469 unsigned
 470 stringdiffs::Hash(const String & str, int begin, int end, unsigned h) const
 471 {
 472         for (int i = begin; i <= end; ++i)
 473         {
 474                 unsigned ch = static_cast<unsigned>(str[i]);
 475                 if (m_case_sensitive)
 476                 {
 477                         h += HASH(h, ch);
 478                 }
 479                 else
 480                 {
 481                         ch = static_cast<unsigned>(_totupper(ch));
 482                         h += HASH(h, ch);
 483                 }
 484         }
 485
 486         return h;
 487 }
 488
 489 /**
 490  * @brief Compare two words (by reference to original strings)
 491  */
 492 bool
 493 stringdiffs::AreWordsSame(const word & word1, const word & word2) const
 494 {
 495         if (this->m_whitespace != WHITESPACE_COMPARE_ALL)
 496         {
 497                 if (IsSpace(word1) && IsSpace(word2))
 498                         return true;
 499         }
 500         if (word1.hash != word2.hash)
 501                 return false;
 502         if (word1.length() != word2.length())
 503                 return false;
 504         for (int i=0; i<word1.length(); ++i)
 505         {
 506                 if (!caseMatch(m_str1[word1.start+i], m_str2[word2.start+i]))
 507                         return false;
 508         }
 509         return true;
 510 }
 511
 512 /**
 513  * @brief Return true if characters match
 514  */
 515 bool
 516 stringdiffs::caseMatch(TCHAR ch1, TCHAR ch2) const
 517 {
 518         if (m_case_sensitive)
 519                 return ch1==ch2;
 520         else
 521                 return _totupper(ch1)==_totupper(ch2);
 522 }
 523
 524 /**
 525  * @ brief An O(NP) Sequence Comparison Algorithm. Sun Wu, Udi Manber, Gene Myers
 526  */
 527 int
 528 stringdiffs::onp(std::vector<char> &edscript)
 529 {
 530         int M = m_words1.size() - 1;
 531         int N = m_words2.size() - 1;
 532         bool exchanged = false;
 533         if (M > N)
 534         {
 535                 M = m_words2.size() - 1;
 536                 N = m_words1.size() - 1;
 537                 exchanged = true;
 538         }
 539         int *fp = (new int[(M+1) + 1 + (N+1)]) + (M+1);
 540         std::vector<char> *es = (new std::vector<char>[(M+1) + 1 + (N+1)]) + (M+1);
 541         int DELTA = N - M;
 542
 543         int k;
 544         for (k = -(M+1); k <= (N+1); k++)
 545                 fp[k] = -1;
 546         int p = -1;
 547         do
 548         {
 549                 p = p + 1;
 550                 for (k = -p; k <= DELTA-1; k++)
 551                 {
 552                         fp[k] = snake(k, std::max(fp[k-1] + 1, fp[k+1]), exchanged);
 553
 554                         es[k] = fp[k-1] + 1 > fp[k+1] ? es[k-1] : es[k+1];
 555                         es[k].push_back(fp[k-1] + 1 > fp[k+1] ? '+' : '-');
 556                         es[k].resize(es[k].size() + fp[k] - std::max(fp[k-1] + 1, fp[k+1]), '=');
 557                 }
 558                 for (k = DELTA + p; k >= DELTA+1; k--)
 559                 {
 560                         fp[k] = snake(k, std::max(fp[k-1] + 1, fp[k+1]), exchanged);
 561
 562                         es[k] = fp[k-1] + 1 > fp[k+1] ? es[k-1] : es[k+1];
 563                         es[k].push_back(fp[k-1] + 1 > fp[k+1] ? '+' : '-');
 564                         es[k].resize(es[k].size() + fp[k] - std::max(fp[k-1] + 1, fp[k+1]), '=');
 565                 }
 566                 k = DELTA;
 567                 fp[k] = snake(k, std::max(fp[k-1] + 1, fp[k+1]), exchanged);
 568
 569                 es[k] = fp[k-1] + 1 > fp[k+1] ? es[k-1] : es[k+1];
 570                 es[k].push_back(fp[k-1] + 1 > fp[k+1] ? '+' : '-');
 571                 es[k].resize(es[k].size() + fp[k] - std::max(fp[k-1] + 1, fp[k+1]), '=');
 572         } while (fp[k] != N);
 573
 574         std::vector<char> &ses = es[DELTA]; // Shortest edit script
 575         edscript.clear();
 576
 577         int D = 0;
 578         for (size_t i = 1; i < ses.size(); i++)
 579         {
 580                 switch (ses[i])
 581                 {
 582                 case '+':
 583                         if (i + 1 < ses.size() && ses[i + 1] == '-')
 584                         {
 585                                 edscript.push_back('!');
 586                                 i++;
 587                                 D++;
 588                         }
 589                         else
 590                         {
 591                                 edscript.push_back(exchanged ? '-' : '+');
 592                                 D++;
 593                         }
 594                         break;
 595                 case '-':
 596                         if (i + 1 < ses.size() && ses[i + 1] == '+')
 597                         {
 598                                 edscript.push_back('!');
 599                                 i++;
 600                                 D++;
 601                         }
 602                         else
 603                         {
 604                                 edscript.push_back(exchanged ? '+' : '-');
 605                                 D++;
 606                         }
 607                         break;
 608                 default:
 609                         edscript.push_back('=');
 610                 }
 611         }
 612
 613         delete [] (es - (M+1));
 614         delete [] (fp - (M+1));
 615
 616         return D;
 617 }
 618
 619 int
 620 stringdiffs::snake(int k, int y, bool exchanged)
 621 {
 622
 623         int M = exchanged ? m_words2.size() - 1 : m_words1.size() - 1;
 624         int N = exchanged ? m_words1.size() - 1 : m_words2.size() - 1;
 625         int x = y - k;
 626         while (x < M && y < N && (exchanged ? AreWordsSame(m_words1[y + 1], m_words2[x + 1]) : AreWordsSame(m_words1[x + 1], m_words2[y + 1]))) {
 627                 x = x + 1; y = y + 1;
 628         }
 629         return y;
 630 }
 631
 632 /**
 633  * @brief Return true if chars match
 634  *
 635  * Caller must not call this for lead bytes
 636  */
 637 static inline bool
 638 matchchar(TCHAR ch1, TCHAR ch2, bool casitive)
 639 {
 640         if (casitive)
 641                 return ch1==ch2;
 642         else
 643                 return _totupper(ch1)==_totupper(ch2);
 644 }
 645
 646
 647 /** Does character introduce a multicharacter character? */
 648 static inline bool IsLeadByte(TCHAR ch)
 649 {
 650 #ifdef UNICODE
 651         return false;
 652 #else
 653         return _getmbcp() && IsDBCSLeadByte(ch);
 654 #endif
 655 }
 656
 657 /**
 658  * @brief Is it whitespace (excludes all lead & trail bytes)?
 659  */
 660 static inline bool
 661 isSafeWhitespace(TCHAR ch)
 662 {
 663         return _istspace((unsigned)ch) && !IsLeadByte(ch);
 664 }
 665
 666 /**
 667  * @brief Is it a non-whitespace wordbreak character (ie, punctuation)?
 668  */
 669 static bool
 670 isWordBreak(int breakType, const TCHAR *str, int index)
 671 {
 672         TCHAR ch = str[index];
 673         // breakType==1 means break also on punctuation
 674 #ifdef _UNICODE
 675         if ((ch & 0xff00) == 0)
 676         {
 677 //              TCHAR nextCh = str[index + 1];
 678                 // breakType==0 means whitespace only
 679                 if (!breakType)
 680                         return false;
 681                 return _tcschr(BreakChars, ch) != 0;
 682         }
 683         else
 684         {
 685 //              if (
 686 //                      ch==0xff0c/* Fullwidth Full Stop */ ||
 687 //                      ch==0xff0e/* Fullwidth Comma */ ||
 688 //                      ch==0xff1b/* Fullwidth Semicolon */ ||
 689 //                      ch==0xff1a/* Fullwidth Colon */ ||
 690 //                      ch==0x3002/* Ideographic Full Stop */ ||
 691 //                      ch==0x3001/* Ideographic Comma */
 692 //                      )
 693 //                      return true;
 694 //              WORD wCharType, wCharTypeNext;
 695 //              GetStringTypeW(CT_CTYPE3, &ch, 1, &wCharType);
 696 //              TCHAR nextCh = str[index + 1];
 697 //              GetStringTypeW(CT_CTYPE3, &nextCh, 1, &wCharTypeNext);
 698 //              return (wCharType != wCharTypeNext);
 699 //
 700                 return true;
 701         }
 702 #else
 703         // breakType==0 means whitespace only
 704         if (!breakType)
 705                 return false;
 706         return _tcschr(BreakChars, ch) != 0;
 707 #endif
 708 }
 709
 710
 711 /**
 712  * @brief Return pointer to last character of specified string (handle MBCS)
 713  *
 714  * If the last byte is a broken multibyte (ie, a solo lead byte), this returns previous char
 715  */
 716 static LPCTSTR
 717 LastChar(LPCTSTR psz, int len)
 718 {
 719         if (!len) return psz;
 720
 721         if (!_getmbcp()) return psz+len-1;
 722
 723         LPCTSTR lastValid = psz+len-1;
 724
 725         LPCTSTR prev=psz;
 726         while (psz<lastValid)
 727         {
 728                 prev = psz;
 729                 psz = CharNext(psz);
 730                 if (prev == psz)
 731                         psz++;
 732         }
 733         if (psz==lastValid && !IsLeadByte(*psz))
 734                 return psz;
 735         else // last character was multibyte or broken multibyte
 736                 return prev;
 737 }
 738
 739 /**
 740  * @brief advance current pointer over whitespace, until not whitespace or beyond end
 741  * @param pcurrent [in,out] current location (to be advanced)
 742  * @param end [in] last valid position (only go one beyond this)
 743  */
 744 static void
 745 AdvanceOverWhitespace(LPCTSTR * pcurrent, LPCTSTR end)
 746 {
 747         // advance over whitespace
 748         while (*pcurrent <= end && isSafeWhitespace(**pcurrent))
 749                 ++(*pcurrent); // DBCS safe because of isSafeWhitespace above
 750 }
 751
 752 /**
 753  * @brief Compute begin1,begin2,end1,end2 to display byte difference between strings str1 & str2
 754  * @param casitive [in] true for case-sensitive, false for case-insensitive
 755  * @param xwhite [in] This governs whether we handle whitespace specially (see WHITESPACE_COMPARE_ALL, WHITESPACE_IGNORE_CHANGE, WHITESPACE_IGNORE_ALL)
 756  * @param [out] begin return -1 if not found or pos of equal
 757  * @param [out] end return -1 if not found or pos of equal valid if begin1 >=0
 758  * @param [in] equal false surch for a diff, true surch for equal
 759  *
 760  *
 761  * Assumes whitespace is never leadbyte or trailbyte!
 762  */
 763 void
 764 sd_ComputeByteDiff(String & str1, String & str2,
 765                    bool casitive, int xwhite,
 766                    int begin[2], int end[2], bool equal)
 767 {
 768         // Set to sane values
 769         // Also this way can distinguish if we set begin[0] to -1 for no diff in line
 770         begin[0] = end[0] = begin[1] = end[1] = 0;
 771
 772         int len1 = str1.length();
 773         int len2 = str2.length();
 774
 775         LPCTSTR pbeg1 = str1.c_str();
 776         LPCTSTR pbeg2 = str2.c_str();
 777
 778         if (len1 == 0 || len2 == 0)
 779         {
 780                 if (len1 == len2)
 781                 {
 782                         begin[0] = -1;
 783                         begin[1] = -1;
 784                 }
 785                 end[0] = len1 - 1;
 786                 end[1] = len2 - 1;
 787                 return;
 788         }
 789
 790         // cursors from front, which we advance to beginning of difference
 791         LPCTSTR py1 = pbeg1;
 792         LPCTSTR py2 = pbeg2;
 793
 794         // pen1,pen2 point to the last valid character (broken multibyte lead chars don't count)
 795         LPCTSTR pen1 = LastChar(py1, len1);
 796         LPCTSTR pen2 = LastChar(py2, len2);
 797
 798         if (xwhite != WHITESPACE_COMPARE_ALL)
 799         {
 800                 // Ignore leading and trailing whitespace
 801                 // by advancing py1 and py2
 802                 // and retreating pen1 and pen2
 803                 while (py1 < pen1 && isSafeWhitespace(*py1))
 804                         ++py1; // DBCS safe because of isSafeWhitespace above
 805                 while (py2 < pen2 && isSafeWhitespace(*py2))
 806                         ++py2; // DBCS safe because of isSafeWhitespace above
 807                 if ((pen1 < pbeg1 + len1 - 1 || pen2 < pbeg2 + len2 -1)
 808                         && (!len1 || !len2 || pbeg1[len1] != pbeg2[len2]))
 809                 {
 810                         // mismatched broken multibyte ends
 811                 }
 812                 else
 813                 {
 814                         while (pen1 > py1 && isSafeWhitespace(*pen1))
 815                                 pen1 = CharPrev(py1, pen1);
 816                         while (pen2 > py2 && isSafeWhitespace(*pen2))
 817                                 pen2 = CharPrev(py2, pen2);
 818                 }
 819         }
 820         //check for excaption of empty string on one side
 821         //In that case display all as a diff
 822         if (!equal && (((py1 == pen1) && isSafeWhitespace(*pen1)) ||
 823                 ((py2 == pen2) && isSafeWhitespace(*pen2))))
 824         {
 825                 begin[0] = 0;
 826                 begin[1] = 0;
 827                 end[0] = len1 - 1;
 828                 end[1] = len2 - 1;
 829                 return;
 830         }
 831         // Advance over matching beginnings of lines
 832         // Advance py1 & py2 from beginning until find difference or end
 833         while (1)
 834         {
 835                 // Potential difference extends from py1 to pen1 and py2 to pen2
 836
 837                 // Check if either side finished
 838                 if (py1 > pen1 && py2 > pen2)
 839                 {
 840                         begin[0] = end[0] = begin[1] = end[1] = -1;
 841                         break;
 842                 }
 843                 if (py1 > pen1 || py2 > pen2)
 844                 {
 845                         break;
 846                 }
 847
 848                 // handle all the whitespace logic (due to WinMerge whitespace settings)
 849                 if (xwhite && py1 < pen1 && isSafeWhitespace(*py1))
 850                 {
 851                         if (xwhite==WHITESPACE_IGNORE_CHANGE && !isSafeWhitespace(*py2))
 852                         {
 853                                 // py1 is white but py2 is not
 854                                 // in WHITESPACE_IGNORE_CHANGE mode,
 855                                 // this doesn't qualify as skippable whitespace
 856                                 break; // done with forward search
 857                         }
 858                         // gobble up all whitespace in current area
 859                         AdvanceOverWhitespace(&py1, pen1); // will go beyond end
 860                         AdvanceOverWhitespace(&py2, pen2); // will go beyond end
 861                         continue;
 862
 863                 }
 864                 if (xwhite && py2 < pen2 && isSafeWhitespace(*py2))
 865                 {
 866                         if (xwhite==WHITESPACE_IGNORE_CHANGE && !isSafeWhitespace(*py1))
 867                         {
 868                                 // py2 is white but py1 is not
 869                                 // in WHITESPACE_IGNORE_CHANGE mode,
 870                                 // this doesn't qualify as skippable whitespace
 871                                 break; // done with forward search
 872                         }
 873                         // gobble up all whitespace in current area
 874                         AdvanceOverWhitespace(&py1, pen1); // will go beyond end
 875                         AdvanceOverWhitespace(&py2, pen2); // will go beyond end
 876                         continue;
 877                 }
 878
 879                 // Now do real character match
 880                 if (IsLeadByte(*py1))
 881                 {
 882                         if (!IsLeadByte(*py2))
 883                                 break; // done with forward search
 884                         // DBCS (we assume if a lead byte, then character is 2-byte)
 885                         if (!(py1[0] == py2[0] && py1[1] == py2[1]))
 886                                 break; // done with forward search
 887                         py1 += 2; // DBCS specific
 888                         py2 += 2; // DBCS specific
 889                 }
 890                 else
 891                 {
 892                         if (IsLeadByte(*py2))
 893                                 break; // done with forward search
 894                         if (!matchchar(py1[0], py2[0], casitive))
 895                                 break; // done with forward search
 896                         ++py1; // DBCS safe b/c we checked above
 897                         ++py2; // DBCS safe b/c we checked above
 898                 }
 899         }
 900
 901         // Potential difference extends from py1 to pen1 and py2 to pen2
 902
 903         // Store results of advance into return variables (begin[0] & begin[1])
 904         // -1 in a begin variable means no visible diff area
 905         begin[0] = py1 - pbeg1;
 906         begin[1] = py2 - pbeg2;
 907
 908         LPCTSTR pz1 = pen1;
 909         LPCTSTR pz2 = pen2;
 910
 911         // Retreat over matching ends of lines
 912         // Retreat pz1 & pz2 from end until find difference or beginning
 913         while (1)
 914         {
 915                 // Check if either side finished
 916                 if (pz1 < py1 && pz2 < py2)
 917                 {
 918                         begin[0] = end[0] = begin[1] = end[1] = -1;
 919                         break;
 920                 }
 921                 if (pz1 < py1 || pz2 < py2)
 922                 {
 923                         break;
 924                 }
 925
 926                 // handle all the whitespace logic (due to WinMerge whitespace settings)
 927                 if (xwhite && pz1 > py1 && isSafeWhitespace(*pz1))
 928                 {
 929                         if (xwhite==1 && !isSafeWhitespace(*pz2))
 930                                 break; // done with reverse search
 931                         // gobble up all whitespace in current area
 932                         while (pz1 > py1 && isSafeWhitespace(*pz1))
 933                                 pz1 = CharPrev(py1, pz1);
 934                         while (pz2 > py2 && isSafeWhitespace(*pz2))
 935                                 pz2 = CharPrev(py2, pz2);
 936                         continue;
 937
 938                 }
 939                 if (xwhite && pz2 > py2 && isSafeWhitespace(*pz2))
 940                 {
 941                         if (xwhite==1)
 942                                 break; // done with reverse search
 943                         while (pz2 > py2 && isSafeWhitespace(*pz2))
 944                                 pz2 = CharPrev(py2, pz2);
 945                         continue;
 946                 }
 947
 948                 // Now do real character match
 949                 if (IsLeadByte(*pz1))
 950                 {
 951                         if (!IsLeadByte(*pz2))
 952                                 break; // done with forward search
 953                         // DBCS (we assume if a lead byte, then character is 2-byte)
 954                         if (!(pz1[0] == pz2[0] && pz1[1] == pz2[1]))
 955                                 break; // done with forward search
 956                 }
 957                 else
 958                 {
 959                         if (IsLeadByte(*pz2))
 960                                 break; // done with forward search
 961                         if (!matchchar(pz1[0], pz2[0], casitive))
 962                                 break; // done with forward search
 963                 }
 964                 pz1 = (pz1 > pbeg1) ? CharPrev(pbeg1, pz1) : pz1 - 1;
 965                 pz2 = (pz2 > pbeg2) ? CharPrev(pbeg2, pz2) : pz2 - 1;
 966     }
 967
 968 /*      if (*pz1 == '\r' && *(pz1+1) == '\n')
 969         {
 970                 pz1++;
 971                 pz2++;
 972         }
 973         else if (*pz2 == '\r' && *(pz2+1) == '\n')
 974         {
 975                 pz2++;
 976                 pz1++;
 977         }
 978         if (*(pbeg1-1) == '\r' && *pbeg1 == '\n')
 979         {
 980                 pbeg1--;
 981                 pbeg2--;
 982         }
 983         else if (*(pbeg2-1) == '\r' && *pbeg2 == '\n')
 984         {
 985                 pbeg2--;
 986                 pbeg1--;
 987         }*/
 988
 989         // Store results of advance into return variables (end[0] & end[1])
 990         end[0] = pz1 - pbeg1;
 991         end[1] = pz2 - pbeg2;
 992
 993         // Check if difference region was empty
 994         if (begin[0] == end[0] + 1 && begin[1] == end[1] + 1)
 995                 begin[0] = -1; // no diff
 996 }
 997
 998 /**
 999  * @brief adjust the range of the specified word diffs down to byte(char) level.
1000  * @param str1, str2 [in] line to be compared
1001  * @param casitive [in] true for case-sensitive, false for case-insensitive
1002  * @param xwhite [in] This governs whether we handle whitespace specially
1003  *  (see WHITESPACE_COMPARE_ALL, WHITESPACE_IGNORE_CHANGE, WHITESPACE_IGNORE_ALL)
1004  */
1005 void stringdiffs::wordLevelToByteLevel()
1006 {
1007         for (size_t i = 0; i < m_wdiffs.size(); i++)
1008         {
1009                 int begin[3], end[3];
1010                 wdiff& diff = m_wdiffs[i];
1011                 String str1_2, str2_2;
1012                 str1_2 = m_str1.substr(diff.begin[0], diff.end[0] - diff.begin[0] + 1);
1013                 str2_2 = m_str2.substr(diff.begin[1], diff.end[1] - diff.begin[1] + 1);
1014                 sd_ComputeByteDiff(str1_2, str2_2, m_case_sensitive, m_whitespace, begin, end, false);
1015                 if (begin[0] == -1)
1016                 {
1017                         // no visible diff on side1
1018                         diff.end[0] = diff.begin[0] - 1;
1019                 }
1020                 else
1021                 {
1022                         diff.end[0] = diff.begin[0] + end[0];
1023                         diff.begin[0] += begin[0];
1024                 }
1025                 if (begin[1] == -1)
1026                 {
1027                         // no visible diff on side2
1028                         diff.end[1] = diff.begin[1] - 1;
1029                 }
1030                 else
1031                 {
1032                         diff.end[1] = diff.begin[1] + end[1];
1033                         diff.begin[1] += begin[1];
1034                 }
1035         }
1036 }
1037