Src/stringdiffs.cpp

   1 /**
   2  * @file  stringdiffs.cpp
   3  *
   4  * @brief Implementation file for sd_ComputeWordDiffs (q.v.)
   5  *
   6  */
   7
   8 #include "stringdiffs.h"
   9 #define NOMINMAX
  10 #include <windows.h>
  11 #include <tchar.h>
  12 #include <cassert>
  13 #include <climits>
  14 #include <mbctype.h>
  15 #include "CompareOptions.h"
  16 #include "stringdiffsi.h"
  17 #include "Diff3.h"
  18
  19 using std::vector;
  20
  21 static bool Initialized;
  22 static bool CustomChars;
  23 static TCHAR *BreakChars;
  24 static TCHAR BreakCharDefaults[] = _T(",.;:");
  25
  26 static bool isSafeWhitespace(TCHAR ch);
  27 static bool isWordBreak(int breakType, const TCHAR *str, int index);
  28
  29 void sd_Init()
  30 {
  31         BreakChars = &BreakCharDefaults[0];
  32         Initialized = true;
  33 }
  34
  35 void sd_Close()
  36 {
  37         if (CustomChars)
  38         {
  39                 free(BreakChars);
  40                 BreakChars = NULL;
  41                 CustomChars = false;
  42         }
  43         Initialized = false;
  44 }
  45
  46 void sd_SetBreakChars(const TCHAR *breakChars)
  47 {
  48         assert(Initialized);
  49
  50         if (CustomChars)
  51                 free(BreakChars);
  52
  53         CustomChars = true;
  54         BreakChars = _tcsdup(breakChars);
  55 }
  56
  57 void
  58 sd_ComputeWordDiffs(const String& str1, const String& str2,
  59         bool case_sensitive, int whitespace, int breakType, bool byte_level,
  60         std::vector<wdiff> * pDiffs)
  61 {
  62         String strs[3] = {str1, str2, _T("")};
  63         sd_ComputeWordDiffs(2, strs, case_sensitive, whitespace, breakType, byte_level, pDiffs);
  64 }
  65
  66 struct Comp02Functor
  67 {
  68         Comp02Functor(const String *strs, bool case_sensitive) :
  69                 strs_(strs), case_sensitive_(case_sensitive)
  70         {
  71         }
  72         bool operator()(const wdiff &wd3)
  73         {
  74                 size_t wlen0 = wd3.end[0] - wd3.begin[0] + 1;
  75                 size_t wlen2 = wd3.end[2] - wd3.begin[2] + 1;
  76                 if (wlen0 != wlen2)
  77                         return false;
  78                 if (case_sensitive_)
  79                 {
  80                         if (memcmp(&strs_[0][wd3.begin[0]], &strs_[2][wd3.begin[2]], wlen0 * sizeof(TCHAR)) != 0)
  81                                 return false;
  82                 }
  83                 else
  84                 {
  85                         if (_tcsnicmp(&strs_[0][wd3.begin[0]], &strs_[2][wd3.begin[2]], wlen0) != 0)
  86                                 return false;
  87                 }
  88                 return true;
  89         }
  90         const String *strs_;
  91         bool case_sensitive_;
  92 };
  93
  94 /**
  95  * @brief Construct our worker object and tell it to do the work
  96  */
  97 void
  98 sd_ComputeWordDiffs(int nFiles, const String str[3],
  99         bool case_sensitive, int whitespace, int breakType, bool byte_level,
 100         std::vector<wdiff> * pDiffs)
 101 {
 102         if (nFiles == 2)
 103         {
 104                 stringdiffs sdiffs(str[0], str[1], case_sensitive, whitespace, breakType, pDiffs);
 105                 // Hash all words in both lines and then compare them word by word
 106                 // storing differences into m_wdiffs
 107                 sdiffs.BuildWordDiffList();
 108
 109                 if (byte_level)
 110                         sdiffs.wordLevelToByteLevel();
 111
 112                 // Now copy m_wdiffs into caller-supplied m_pDiffs (coalescing adjacents if possible)
 113                 sdiffs.PopulateDiffs();
 114
 115         }
 116         else
 117         {
 118                 if (str[0].empty())
 119                 {
 120                         stringdiffs sdiffs(str[1], str[2], case_sensitive, whitespace, breakType, pDiffs);
 121                         sdiffs.BuildWordDiffList();
 122                         if (byte_level)
 123                                 sdiffs.wordLevelToByteLevel();
 124                         sdiffs.PopulateDiffs();
 125                         for (size_t i = 0; i < pDiffs->size(); i++)
 126                         {
 127                                 wdiff& diff = (*pDiffs)[i];
 128                                 diff.begin[2] = diff.begin[1];
 129                                 diff.begin[1] = diff.begin[0];
 130                                 diff.begin[0] = 0;
 131                                 diff.end[2] = diff.end[1];
 132                                 diff.end[1] = diff.end[0];
 133                                 diff.end[0] = -1;
 134                         }
 135                 }
 136                 else if (str[1].empty())
 137                 {
 138                         stringdiffs sdiffs(str[0], str[2], case_sensitive, whitespace, breakType, pDiffs);
 139                         sdiffs.BuildWordDiffList();
 140                         if (byte_level)
 141                                 sdiffs.wordLevelToByteLevel();
 142                         sdiffs.PopulateDiffs();
 143                         for (size_t i = 0; i < pDiffs->size(); i++)
 144                         {
 145                                 wdiff& diff = (*pDiffs)[i];
 146                                 diff.begin[2] = diff.begin[1];
 147                                 //diff.begin[0] = diff.begin[0];
 148                                 diff.begin[1] = 0;
 149                                 diff.end[2] = diff.end[1];
 150                                 //diff.end[0] = diff.end[0];
 151                                 diff.end[1] = -1;
 152                         }
 153                 }
 154                 else if (str[2].empty())
 155                 {
 156                         stringdiffs sdiffs(str[0], str[1], case_sensitive, whitespace, breakType, pDiffs);
 157                         sdiffs.BuildWordDiffList();
 158                         if (byte_level)
 159                                 sdiffs.wordLevelToByteLevel();
 160                         sdiffs.PopulateDiffs();
 161                         for (size_t i = 0; i < pDiffs->size(); i++)
 162                         {
 163                                 wdiff& diff = (*pDiffs)[i];
 164                                 //diff.begin[1] = diff.begin[1];
 165                                 //diff.begin[0] = diff.begin[0];
 166                                 diff.begin[2] = 0;
 167                                 //diff.end[1] = diff.end[1];
 168                                 //diff.end[0] = diff.end[0];
 169                                 diff.end[2] = -1;
 170                         }
 171                 }
 172                 else
 173                 {
 174                         std::vector<wdiff> diffs10, diffs12;
 175                         stringdiffs sdiffs10(str[1], str[0], case_sensitive, whitespace, breakType, &diffs10);
 176                         stringdiffs sdiffs12(str[1], str[2], case_sensitive, whitespace, breakType, &diffs12);
 177                         // Hash all words in both lines and then compare them word by word
 178                         // storing differences into m_wdiffs
 179                         sdiffs10.BuildWordDiffList();
 180                         sdiffs12.BuildWordDiffList();
 181                         if (byte_level)
 182                         {
 183                                 sdiffs10.wordLevelToByteLevel();
 184                                 sdiffs12.wordLevelToByteLevel();
 185                         }
 186                         // Now copy m_wdiffs into caller-supplied m_pDiffs (coalescing adjacents if possible)
 187                         sdiffs10.PopulateDiffs();
 188                         sdiffs12.PopulateDiffs();
 189
 190                         Make3wayDiff(*pDiffs, diffs10, diffs12,
 191                                 Comp02Functor(str, case_sensitive), false);
 192                 }
 193         }
 194 }
 195
 196 /**
 197  * @brief stringdiffs constructor simply loads all members from arguments
 198  */
 199 stringdiffs::stringdiffs(const String & str1, const String & str2,
 200         bool case_sensitive, int whitespace, int breakType,
 201         std::vector<wdiff> * pDiffs)
 202 : m_str1(str1)
 203 , m_str2(str2)
 204 , m_case_sensitive(case_sensitive)
 205 , m_whitespace(whitespace)
 206 , m_breakType(breakType)
 207 , m_pDiffs(pDiffs)
 208 , m_matchblock(true) // Change to false to get word to word compare
 209 {
 210 }
 211
 212 /**
 213  * @brief Destructor.
 214  * The destructor frees all diffs added to the vectors.
 215  */
 216 stringdiffs::~stringdiffs()
 217 {
 218 }
 219
 220 #ifdef STRINGDIFF_LOGGING
 221 void
 222 stringdiffs::debugoutput()
 223 {
 224         for (size_t i = 0; i < m_wdiffs.size(); i++)
 225         {
 226                 String str1;
 227                 String str2;
 228                 TCHAR buf[256];
 229                 int s1 = m_wdiffs[i].begin[0];
 230                 int e1 = m_wdiffs[i].end[0];
 231                 int s2 = m_wdiffs[i].begin[1];
 232                 int e2 = m_wdiffs[i].end[1];
 233
 234                 int len1 = e1 - s1 + 1;
 235                 int len2 = e2 - s2 + 1;
 236
 237                 if (len1 < 50)
 238                         str1 = m_str1.substr(s1 ,e1 - s1 + 1);
 239                 else
 240                         str1 = m_str1.substr(s1, 50);
 241
 242                 if (len2 < 50)
 243                         str2 = m_str2.substr(s2, e2- s2 + 1);
 244                 else
 245                         str2 = m_str2.substr(s2, 50);
 246
 247                 wsprintf(buf, _T("left=  %s,   %d,%d,\nright= %s,   %d,%d \n"),
 248                         str1.c_str(), s1, e1, str2.c_str(), s2, e2);
 249                 OutputDebugString(buf);
 250         }
 251 }
 252 #endif
 253
 254 bool
 255 stringdiffs::BuildWordDiffList_DP()
 256 {
 257         std::vector<char> edscript;
 258
 259         //if (dp(edscript) <= 0)
 260         //      return false;
 261         onp(edscript);
 262
 263         int i = 1, j = 1;
 264         for (size_t k = 0; k < edscript.size(); k++)
 265         {
 266                 int s1, e1, s2, e2;
 267                 if (edscript[k] == '-')
 268                 {
 269                         if (m_whitespace == WHITESPACE_IGNORE_ALL)
 270                         {
 271                                 if (IsSpace(m_words1[i]))
 272                                 {
 273                                         i++;
 274                                         continue;
 275                                 }
 276                         }
 277
 278                         s1 = m_words1[i].start;
 279                         e1 = m_words1[i].end;
 280                         s2 = m_words2[j-1].end+1;
 281                         e2 = s2-1;
 282                         m_wdiffs.push_back(wdiff(s1, e1, s2, e2));
 283                         i++;
 284                 }
 285                 else if (edscript[k] == '+')
 286                 {
 287                         if (m_whitespace == WHITESPACE_IGNORE_ALL)
 288                         {
 289                                 if (IsSpace(m_words2[j]))
 290                                 {
 291                                         j++;
 292                                         continue;
 293                                 }
 294                         }
 295
 296                         s1 = m_words1[i-1].end+1;
 297                         e1 = s1-1;
 298                         s2 = m_words2[j].start;
 299                         e2 = m_words2[j].end;
 300                         m_wdiffs.push_back(wdiff(s1, e1, s2, e2));
 301                         j++;
 302                 }
 303                 else if (edscript[k] == '!')
 304                 {
 305                         if (m_whitespace == WHITESPACE_IGNORE_CHANGE || m_whitespace == WHITESPACE_IGNORE_ALL)
 306                         {
 307                                 if (IsSpace(m_words1[i]) && IsSpace(m_words2[j]))
 308                                 {
 309                                         i++; j++;
 310                                         continue;
 311                                 }
 312                         }
 313
 314                         s1 =  m_words1[i].start;
 315                         e1 =  m_words1[i].end;
 316                         s2 =  m_words2[j].start;
 317                         e2 =  m_words2[j].end ;
 318                         m_wdiffs.push_back(wdiff(s1, e1, s2, e2));
 319                         i++; j++;
 320                 }
 321                 else
 322                 {
 323                         i++; j++;
 324                 }
 325         }
 326 #ifdef STRINGDIFF_LOGGING
 327         debugoutput();
 328 #endif
 329         return true;
 330 }
 331
 332 /**
 333  * @brief Add all different elements between lines to the wdiff list
 334  */
 335 void
 336 stringdiffs::BuildWordDiffList()
 337 {
 338         BuildWordsArray(m_str1, m_words1);
 339         BuildWordsArray(m_str2, m_words2);
 340
 341 #ifdef _WIN64
 342         if (m_words1.size() > 20480 || m_words2.size() > 20480)
 343 #else
 344         if (m_words1.size() > 2048 || m_words2.size() > 2048)
 345 #endif
 346         {
 347                 int s1 = m_words1[0].start;
 348                 int e1 = m_words1[m_words1.size() - 1].end;
 349                 int s2 = m_words2[0].start;
 350                 int e2 = m_words2[m_words2.size() - 1].end;
 351                 m_wdiffs.push_back(wdiff(s1, e1, s2, e2));
 352                 return;
 353         }
 354
 355         BuildWordDiffList_DP();
 356 }
 357
 358 /**
 359  * @brief Break line into constituent words
 360  */
 361 void
 362 stringdiffs::BuildWordsArray(const String & str, std::vector<word>& words)
 363 {
 364         int i=0, begin=0;
 365
 366         // dummy;
 367         words.push_back(word(0, -1, 0, 0));
 368
 369         // state when we are looking for next word
 370 inspace:
 371         if (isSafeWhitespace(str[i]))
 372         {
 373                 ++i;
 374                 goto inspace;
 375         }
 376         if (begin < i)
 377         {
 378                 // just finished a word
 379                 // e is first word character (space or at end)
 380                 int e = i - 1;
 381
 382                 words.push_back(word(begin, e, dlspace, Hash(str, begin, e, 0)));
 383         }
 384         if (i == str.length())
 385                 return;
 386         begin = i;
 387         goto inword;
 388
 389         // state when we are inside a word
 390 inword:
 391         bool atspace=false;
 392         if (i == str.length() || ((atspace = isSafeWhitespace(str[i])) != 0) || isWordBreak(m_breakType, str.c_str(), i))
 393         {
 394                 if (begin<i)
 395                 {
 396                         // just finished a word
 397                         // e is first non-word character (space or at end)
 398                         int e = i-1;
 399
 400                         words.push_back(word(begin, e, dlword, Hash(str, begin, e, 0)));
 401                 }
 402                 if (i == str.length())
 403                 {
 404                         return;
 405                 }
 406                 else if (atspace)
 407                 {
 408                         begin = i;
 409                         goto inspace;
 410                 }
 411                 else
 412                 {
 413                         // start a new word because we hit a non-whitespace word break (eg, a comma)
 414                         // but, we have to put each word break character into its own word
 415                         words.push_back(word(i, i, dlbreak, Hash(str, i, i, 0)));
 416                         ++i;
 417                         begin = i;
 418                         goto inword;
 419                 }
 420         }
 421         ++i;
 422         goto inword; // safe even if we're at the end or no longer in a word
 423 }
 424
 425 /**
 426  * @brief Populate m_pDiffs from m_wdiffs (combining adjacent diffs)
 427  *
 428  * Doing the combining of adjacent diffs here keeps some complexity out of BuildWordsArray.
 429  */
 430 void
 431 stringdiffs::PopulateDiffs()
 432 {
 433         for (int i=0; i< (int)m_wdiffs.size(); ++i)
 434         {
 435                 bool skipIt = false;
 436                 // combine it with next ?
 437                 if (i+1< (int)m_wdiffs.size())
 438                 {
 439                         if (m_wdiffs[i].end[0] + 1 == m_wdiffs[i+1].begin[0]
 440                                 && m_wdiffs[i].end[1] + 1 == m_wdiffs[i+1].begin[1])
 441                         {
 442                                 // diff[i] and diff[i+1] are contiguous
 443                                 // so combine them into diff[i+1] and ignore diff[i]
 444                                 m_wdiffs[i+1].begin[0] = m_wdiffs[i].begin[0];
 445                                 m_wdiffs[i+1].begin[1] = m_wdiffs[i].begin[1];
 446                                 skipIt = true;
 447                         }
 448                 }
 449                 if (!skipIt)
 450                 {
 451                         // Should never have a pair where both are missing
 452                         assert(m_wdiffs[i].begin[0]>=0 || m_wdiffs[i].begin[1]>=0);
 453
 454                         // Store the diff[i] in the caller list (m_pDiffs)
 455                         m_pDiffs->push_back(wdiff(m_wdiffs[i]));
 456                 }
 457         }
 458 }
 459
 460 // diffutils hash
 461
 462 /* Rotate a value n bits to the left. */
 463 #define UINT_BIT (sizeof (unsigned) * CHAR_BIT)
 464 #define ROL(v, n) ((v) << (n) | (v) >> (UINT_BIT - (n)))
 465 /* Given a hash value and a new character, return a new hash value. */
 466 #define HASH(h, c) ((c) + ROL (h, 7))
 467
 468 unsigned
 469 stringdiffs::Hash(const String & str, int begin, int end, unsigned h) const
 470 {
 471         for (int i = begin; i <= end; ++i)
 472         {
 473                 unsigned ch = static_cast<unsigned>(str[i]);
 474                 if (m_case_sensitive)
 475                 {
 476                         h += HASH(h, ch);
 477                 }
 478                 else
 479                 {
 480                         ch = static_cast<unsigned>(_totupper(ch));
 481                         h += HASH(h, ch);
 482                 }
 483         }
 484
 485         return h;
 486 }
 487
 488 /**
 489  * @brief Compare two words (by reference to original strings)
 490  */
 491 bool
 492 stringdiffs::AreWordsSame(const word & word1, const word & word2) const
 493 {
 494         if (this->m_whitespace != WHITESPACE_COMPARE_ALL)
 495         {
 496                 if (IsSpace(word1) && IsSpace(word2))
 497                         return true;
 498         }
 499         if (word1.hash != word2.hash)
 500                 return false;
 501         if (word1.length() != word2.length())
 502                 return false;
 503         for (int i=0; i<word1.length(); ++i)
 504         {
 505                 if (!caseMatch(m_str1[word1.start+i], m_str2[word2.start+i]))
 506                         return false;
 507         }
 508         return true;
 509 }
 510
 511 /**
 512  * @brief Return true if characters match
 513  */
 514 bool
 515 stringdiffs::caseMatch(TCHAR ch1, TCHAR ch2) const
 516 {
 517         if (m_case_sensitive)
 518                 return ch1==ch2;
 519         else
 520                 return _totupper(ch1)==_totupper(ch2);
 521 }
 522
 523 /**
 524  * @ brief An O(NP) Sequence Comparison Algorithm. Sun Wu, Udi Manber, Gene Myers
 525  */
 526 int
 527 stringdiffs::onp(std::vector<char> &edscript)
 528 {
 529         int M = m_words1.size() - 1;
 530         int N = m_words2.size() - 1;
 531         bool exchanged = false;
 532         if (M > N)
 533         {
 534                 M = m_words2.size() - 1;
 535                 N = m_words1.size() - 1;
 536                 exchanged = true;
 537         }
 538         int *fp = (new int[(M+1) + 1 + (N+1)]) + (M+1);
 539         std::vector<char> *es = (new std::vector<char>[(M+1) + 1 + (N+1)]) + (M+1);
 540         int DELTA = N - M;
 541
 542         int k;
 543         for (k = -(M+1); k <= (N+1); k++)
 544                 fp[k] = -1;
 545         int p = -1;
 546         do
 547         {
 548                 p = p + 1;
 549                 for (k = -p; k <= DELTA-1; k++)
 550                 {
 551                         fp[k] = snake(k, std::max(fp[k-1] + 1, fp[k+1]), exchanged);
 552
 553                         es[k] = fp[k-1] + 1 > fp[k+1] ? es[k-1] : es[k+1];
 554                         es[k].push_back(fp[k-1] + 1 > fp[k+1] ? '+' : '-');
 555                         es[k].resize(es[k].size() + fp[k] - std::max(fp[k-1] + 1, fp[k+1]), '=');
 556                 }
 557                 for (k = DELTA + p; k >= DELTA+1; k--)
 558                 {
 559                         fp[k] = snake(k, std::max(fp[k-1] + 1, fp[k+1]), exchanged);
 560
 561                         es[k] = fp[k-1] + 1 > fp[k+1] ? es[k-1] : es[k+1];
 562                         es[k].push_back(fp[k-1] + 1 > fp[k+1] ? '+' : '-');
 563                         es[k].resize(es[k].size() + fp[k] - std::max(fp[k-1] + 1, fp[k+1]), '=');
 564                 }
 565                 k = DELTA;
 566                 fp[k] = snake(k, std::max(fp[k-1] + 1, fp[k+1]), exchanged);
 567
 568                 es[k] = fp[k-1] + 1 > fp[k+1] ? es[k-1] : es[k+1];
 569                 es[k].push_back(fp[k-1] + 1 > fp[k+1] ? '+' : '-');
 570                 es[k].resize(es[k].size() + fp[k] - std::max(fp[k-1] + 1, fp[k+1]), '=');
 571         } while (fp[k] != N);
 572
 573         std::vector<char> &ses = es[DELTA]; // Shortest edit script
 574         edscript.clear();
 575
 576         int D = 0;
 577         for (size_t i = 1; i < ses.size(); i++)
 578         {
 579                 switch (ses[i])
 580                 {
 581                 case '+':
 582                         if (i + 1 < ses.size() && ses[i + 1] == '-')
 583                         {
 584                                 edscript.push_back('!');
 585                                 i++;
 586                                 D++;
 587                         }
 588                         else
 589                         {
 590                                 edscript.push_back(exchanged ? '-' : '+');
 591                                 D++;
 592                         }
 593                         break;
 594                 case '-':
 595                         if (i + 1 < ses.size() && ses[i + 1] == '+')
 596                         {
 597                                 edscript.push_back('!');
 598                                 i++;
 599                                 D++;
 600                         }
 601                         else
 602                         {
 603                                 edscript.push_back(exchanged ? '+' : '-');
 604                                 D++;
 605                         }
 606                         break;
 607                 default:
 608                         edscript.push_back('=');
 609                 }
 610         }
 611
 612         delete [] (es - (M+1));
 613         delete [] (fp - (M+1));
 614
 615         return D;
 616 }
 617
 618 int
 619 stringdiffs::snake(int k, int y, bool exchanged)
 620 {
 621
 622         int M = exchanged ? m_words2.size() - 1 : m_words1.size() - 1;
 623         int N = exchanged ? m_words1.size() - 1 : m_words2.size() - 1;
 624         int x = y - k;
 625         while (x < M && y < N && (exchanged ? AreWordsSame(m_words1[y + 1], m_words2[x + 1]) : AreWordsSame(m_words1[x + 1], m_words2[y + 1]))) {
 626                 x = x + 1; y = y + 1;
 627         }
 628         return y;
 629 }
 630
 631 /**
 632  * @brief Return true if chars match
 633  *
 634  * Caller must not call this for lead bytes
 635  */
 636 static inline bool
 637 matchchar(TCHAR ch1, TCHAR ch2, bool casitive)
 638 {
 639         if (casitive)
 640                 return ch1==ch2;
 641         else
 642                 return _totupper(ch1)==_totupper(ch2);
 643 }
 644
 645
 646 /** Does character introduce a multicharacter character? */
 647 static inline bool IsLeadByte(TCHAR ch)
 648 {
 649 #ifdef UNICODE
 650         return false;
 651 #else
 652         return _getmbcp() && IsDBCSLeadByte(ch);
 653 #endif
 654 }
 655
 656 /**
 657  * @brief Is it whitespace (excludes all lead & trail bytes)?
 658  */
 659 static inline bool
 660 isSafeWhitespace(TCHAR ch)
 661 {
 662         return _istspace((unsigned)ch) && !IsLeadByte(ch);
 663 }
 664
 665 /**
 666  * @brief Is it a non-whitespace wordbreak character (ie, punctuation)?
 667  */
 668 static bool
 669 isWordBreak(int breakType, const TCHAR *str, int index)
 670 {
 671         TCHAR ch = str[index];
 672         // breakType==1 means break also on punctuation
 673 #ifdef _UNICODE
 674         if ((ch & 0xff00) == 0)
 675         {
 676                 TCHAR nextCh = str[index + 1];
 677                 // breakType==0 means whitespace only
 678                 if (!breakType)
 679                         return false;
 680                 return _tcschr(BreakChars, ch) != 0;
 681         }
 682         else
 683         {
 684 //              if (
 685 //                      ch==0xff0c/* Fullwidth Full Stop */ ||
 686 //                      ch==0xff0e/* Fullwidth Comma */ ||
 687 //                      ch==0xff1b/* Fullwidth Semicolon */ ||
 688 //                      ch==0xff1a/* Fullwidth Colon */ ||
 689 //                      ch==0x3002/* Ideographic Full Stop */ ||
 690 //                      ch==0x3001/* Ideographic Comma */
 691 //                      )
 692 //                      return true;
 693 //              WORD wCharType, wCharTypeNext;
 694 //              GetStringTypeW(CT_CTYPE3, &ch, 1, &wCharType);
 695 //              TCHAR nextCh = str[index + 1];
 696 //              GetStringTypeW(CT_CTYPE3, &nextCh, 1, &wCharTypeNext);
 697 //              return (wCharType != wCharTypeNext);
 698 //
 699                 return true;
 700         }
 701 #else
 702         // breakType==0 means whitespace only
 703         if (!breakType)
 704                 return false;
 705         return _tcschr(BreakChars, ch) != 0;
 706 #endif
 707 }
 708
 709
 710 /**
 711  * @brief Return pointer to last character of specified string (handle MBCS)
 712  *
 713  * If the last byte is a broken multibyte (ie, a solo lead byte), this returns previous char
 714  */
 715 static LPCTSTR
 716 LastChar(LPCTSTR psz, int len)
 717 {
 718         if (!len) return psz;
 719
 720         if (!_getmbcp()) return psz+len-1;
 721
 722         LPCTSTR lastValid = psz+len-1;
 723
 724         LPCTSTR prev=psz;
 725         while (psz<lastValid)
 726         {
 727                 prev = psz;
 728                 psz = CharNext(psz);
 729                 if (prev == psz)
 730                         psz++;
 731         }
 732         if (psz==lastValid && !IsLeadByte(*psz))
 733                 return psz;
 734         else // last character was multibyte or broken multibyte
 735                 return prev;
 736 }
 737
 738 /**
 739  * @brief advance current pointer over whitespace, until not whitespace or beyond end
 740  * @param pcurrent [in,out] current location (to be advanced)
 741  * @param end [in] last valid position (only go one beyond this)
 742  */
 743 static void
 744 AdvanceOverWhitespace(LPCTSTR * pcurrent, LPCTSTR end)
 745 {
 746         // advance over whitespace
 747         while (*pcurrent <= end && isSafeWhitespace(**pcurrent))
 748                 ++(*pcurrent); // DBCS safe because of isSafeWhitespace above
 749 }
 750
 751 /**
 752  * @brief Compute begin1,begin2,end1,end2 to display byte difference between strings str1 & str2
 753  * @param casitive [in] true for case-sensitive, false for case-insensitive
 754  * @param xwhite [in] This governs whether we handle whitespace specially (see WHITESPACE_COMPARE_ALL, WHITESPACE_IGNORE_CHANGE, WHITESPACE_IGNORE_ALL)
 755  * @param [out] begin return -1 if not found or pos of equal
 756  * @param [out] end return -1 if not found or pos of equal valid if begin1 >=0
 757  * @param [in] equal false surch for a diff, true surch for equal
 758  *
 759  *
 760  * Assumes whitespace is never leadbyte or trailbyte!
 761  */
 762 void
 763 sd_ComputeByteDiff(String & str1, String & str2,
 764                    bool casitive, int xwhite,
 765                    int begin[2], int end[2], bool equal)
 766 {
 767         // Set to sane values
 768         // Also this way can distinguish if we set begin[0] to -1 for no diff in line
 769         begin[0] = end[0] = begin[1] = end[1] = 0;
 770
 771         int len1 = str1.length();
 772         int len2 = str2.length();
 773
 774         LPCTSTR pbeg1 = str1.c_str();
 775         LPCTSTR pbeg2 = str2.c_str();
 776
 777         if (len1 == 0 || len2 == 0)
 778         {
 779                 if (len1 == len2)
 780                 {
 781                         begin[0] = -1;
 782                         begin[1] = -1;
 783                 }
 784                 end[0] = len1 - 1;
 785                 end[1] = len2 - 1;
 786                 return;
 787         }
 788
 789         // cursors from front, which we advance to beginning of difference
 790         LPCTSTR py1 = pbeg1;
 791         LPCTSTR py2 = pbeg2;
 792
 793         // pen1,pen2 point to the last valid character (broken multibyte lead chars don't count)
 794         LPCTSTR pen1 = LastChar(py1, len1);
 795         LPCTSTR pen2 = LastChar(py2, len2);
 796
 797         if (xwhite != WHITESPACE_COMPARE_ALL)
 798         {
 799                 // Ignore leading and trailing whitespace
 800                 // by advancing py1 and py2
 801                 // and retreating pen1 and pen2
 802                 while (py1 < pen1 && isSafeWhitespace(*py1))
 803                         ++py1; // DBCS safe because of isSafeWhitespace above
 804                 while (py2 < pen2 && isSafeWhitespace(*py2))
 805                         ++py2; // DBCS safe because of isSafeWhitespace above
 806                 if ((pen1 < pbeg1 + len1 - 1 || pen2 < pbeg2 + len2 -1)
 807                         && (!len1 || !len2 || pbeg1[len1] != pbeg2[len2]))
 808                 {
 809                         // mismatched broken multibyte ends
 810                 }
 811                 else
 812                 {
 813                         while (pen1 > py1 && isSafeWhitespace(*pen1))
 814                                 pen1 = CharPrev(py1, pen1);
 815                         while (pen2 > py2 && isSafeWhitespace(*pen2))
 816                                 pen2 = CharPrev(py2, pen2);
 817                 }
 818         }
 819         //check for excaption of empty string on one side
 820         //In that case display all as a diff
 821         if (!equal && (((py1 == pen1) && isSafeWhitespace(*pen1)) ||
 822                 ((py2 == pen2) && isSafeWhitespace(*pen2))))
 823         {
 824                 begin[0] = 0;
 825                 begin[1] = 0;
 826                 end[0] = len1 - 1;
 827                 end[1] = len2 - 1;
 828                 return;
 829         }
 830         // Advance over matching beginnings of lines
 831         // Advance py1 & py2 from beginning until find difference or end
 832         while (1)
 833         {
 834                 // Potential difference extends from py1 to pen1 and py2 to pen2
 835
 836                 // Check if either side finished
 837                 if (py1 > pen1 && py2 > pen2)
 838                 {
 839                         begin[0] = end[0] = begin[1] = end[1] = -1;
 840                         break;
 841                 }
 842                 if (py1 > pen1 || py2 > pen2)
 843                 {
 844                         break;
 845                 }
 846
 847                 // handle all the whitespace logic (due to WinMerge whitespace settings)
 848                 if (xwhite && py1 < pen1 && isSafeWhitespace(*py1))
 849                 {
 850                         if (xwhite==WHITESPACE_IGNORE_CHANGE && !isSafeWhitespace(*py2))
 851                         {
 852                                 // py1 is white but py2 is not
 853                                 // in WHITESPACE_IGNORE_CHANGE mode,
 854                                 // this doesn't qualify as skippable whitespace
 855                                 break; // done with forward search
 856                         }
 857                         // gobble up all whitespace in current area
 858                         AdvanceOverWhitespace(&py1, pen1); // will go beyond end
 859                         AdvanceOverWhitespace(&py2, pen2); // will go beyond end
 860                         continue;
 861
 862                 }
 863                 if (xwhite && py2 < pen2 && isSafeWhitespace(*py2))
 864                 {
 865                         if (xwhite==WHITESPACE_IGNORE_CHANGE && !isSafeWhitespace(*py1))
 866                         {
 867                                 // py2 is white but py1 is not
 868                                 // in WHITESPACE_IGNORE_CHANGE mode,
 869                                 // this doesn't qualify as skippable whitespace
 870                                 break; // done with forward search
 871                         }
 872                         // gobble up all whitespace in current area
 873                         AdvanceOverWhitespace(&py1, pen1); // will go beyond end
 874                         AdvanceOverWhitespace(&py2, pen2); // will go beyond end
 875                         continue;
 876                 }
 877
 878                 // Now do real character match
 879                 if (IsLeadByte(*py1))
 880                 {
 881                         if (!IsLeadByte(*py2))
 882                                 break; // done with forward search
 883                         // DBCS (we assume if a lead byte, then character is 2-byte)
 884                         if (!(py1[0] == py2[0] && py1[1] == py2[1]))
 885                                 break; // done with forward search
 886                         py1 += 2; // DBCS specific
 887                         py2 += 2; // DBCS specific
 888                 }
 889                 else
 890                 {
 891                         if (IsLeadByte(*py2))
 892                                 break; // done with forward search
 893                         if (!matchchar(py1[0], py2[0], casitive))
 894                                 break; // done with forward search
 895                         ++py1; // DBCS safe b/c we checked above
 896                         ++py2; // DBCS safe b/c we checked above
 897                 }
 898         }
 899
 900         // Potential difference extends from py1 to pen1 and py2 to pen2
 901
 902         // Store results of advance into return variables (begin[0] & begin[1])
 903         // -1 in a begin variable means no visible diff area
 904         begin[0] = py1 - pbeg1;
 905         begin[1] = py2 - pbeg2;
 906
 907         LPCTSTR pz1 = pen1;
 908         LPCTSTR pz2 = pen2;
 909
 910         // Retreat over matching ends of lines
 911         // Retreat pz1 & pz2 from end until find difference or beginning
 912         while (1)
 913         {
 914                 // Check if either side finished
 915                 if (pz1 < py1 && pz2 < py2)
 916                 {
 917                         begin[0] = end[0] = begin[1] = end[1] = -1;
 918                         break;
 919                 }
 920                 if (pz1 < py1 || pz2 < py2)
 921                 {
 922                         break;
 923                 }
 924
 925                 // handle all the whitespace logic (due to WinMerge whitespace settings)
 926                 if (xwhite && pz1 > py1 && isSafeWhitespace(*pz1))
 927                 {
 928                         if (xwhite==1 && !isSafeWhitespace(*pz2))
 929                                 break; // done with reverse search
 930                         // gobble up all whitespace in current area
 931                         while (pz1 > py1 && isSafeWhitespace(*pz1))
 932                                 pz1 = CharPrev(py1, pz1);
 933                         while (pz2 > py2 && isSafeWhitespace(*pz2))
 934                                 pz2 = CharPrev(py2, pz2);
 935                         continue;
 936
 937                 }
 938                 if (xwhite && pz2 > py2 && isSafeWhitespace(*pz2))
 939                 {
 940                         if (xwhite==1)
 941                                 break; // done with reverse search
 942                         while (pz2 > py2 && isSafeWhitespace(*pz2))
 943                                 pz2 = CharPrev(py2, pz2);
 944                         continue;
 945                 }
 946
 947                 // Now do real character match
 948                 if (IsLeadByte(*pz1))
 949                 {
 950                         if (!IsLeadByte(*pz2))
 951                                 break; // done with forward search
 952                         // DBCS (we assume if a lead byte, then character is 2-byte)
 953                         if (!(pz1[0] == pz2[0] && pz1[1] == pz2[1]))
 954                                 break; // done with forward search
 955                 }
 956                 else
 957                 {
 958                         if (IsLeadByte(*pz2))
 959                                 break; // done with forward search
 960                         if (!matchchar(pz1[0], pz2[0], casitive))
 961                                 break; // done with forward search
 962                 }
 963                 pz1 = (pz1 > pbeg1) ? CharPrev(pbeg1, pz1) : pz1 - 1;
 964                 pz2 = (pz2 > pbeg2) ? CharPrev(pbeg2, pz2) : pz2 - 1;
 965     }
 966
 967 /*      if (*pz1 == '\r' && *(pz1+1) == '\n')
 968         {
 969                 pz1++;
 970                 pz2++;
 971         }
 972         else if (*pz2 == '\r' && *(pz2+1) == '\n')
 973         {
 974                 pz2++;
 975                 pz1++;
 976         }
 977         if (*(pbeg1-1) == '\r' && *pbeg1 == '\n')
 978         {
 979                 pbeg1--;
 980                 pbeg2--;
 981         }
 982         else if (*(pbeg2-1) == '\r' && *pbeg2 == '\n')
 983         {
 984                 pbeg2--;
 985                 pbeg1--;
 986         }*/
 987
 988         // Store results of advance into return variables (end[0] & end[1])
 989         end[0] = pz1 - pbeg1;
 990         end[1] = pz2 - pbeg2;
 991
 992         // Check if difference region was empty
 993         if (begin[0] == end[0] + 1 && begin[1] == end[1] + 1)
 994                 begin[0] = -1; // no diff
 995 }
 996
 997 /**
 998  * @brief adjust the range of the specified word diffs down to byte(char) level.
 999  * @param str1, str2 [in] line to be compared
1000  * @param casitive [in] true for case-sensitive, false for case-insensitive
1001  * @param xwhite [in] This governs whether we handle whitespace specially
1002  *  (see WHITESPACE_COMPARE_ALL, WHITESPACE_IGNORE_CHANGE, WHITESPACE_IGNORE_ALL)
1003  */
1004 void stringdiffs::wordLevelToByteLevel()
1005 {
1006         for (size_t i = 0; i < m_wdiffs.size(); i++)
1007         {
1008                 int begin[3], end[3];
1009                 wdiff& diff = m_wdiffs[i];
1010                 String str1_2, str2_2;
1011                 str1_2 = m_str1.substr(diff.begin[0], diff.end[0] - diff.begin[0] + 1);
1012                 str2_2 = m_str2.substr(diff.begin[1], diff.end[1] - diff.begin[1] + 1);
1013                 sd_ComputeByteDiff(str1_2, str2_2, m_case_sensitive, m_whitespace, begin, end, false);
1014                 if (begin[0] == -1)
1015                 {
1016                         // no visible diff on side1
1017                         diff.end[0] = diff.begin[0] - 1;
1018                 }
1019                 else
1020                 {
1021                         diff.end[0] = diff.begin[0] + end[0];
1022                         diff.begin[0] += begin[0];
1023                 }
1024                 if (begin[1] == -1)
1025                 {
1026                         // no visible diff on side2
1027                         diff.end[1] = diff.begin[1] - 1;
1028                 }
1029                 else
1030                 {
1031                         diff.end[1] = diff.begin[1] + end[1];
1032                         diff.begin[1] += begin[1];
1033                 }
1034         }
1035 }
1036