From ce7ceb240fd8d559c1a9c4a7d3bee46bacb52c99 Mon Sep 17 00:00:00 2001 From: Takashi Sawanaka Date: Sat, 23 Nov 2019 22:11:49 +0900 Subject: [PATCH] Improve UNICODE character support using icu.dll usable from Windows 10 Creators Update (10) Fixed an issue where Halfwidth Katakana Voiced Sound Mark(U+FF9E) and Halfwidth Katakana Semi-Voiced Sound Mark(U+FF9F) were not treated as one character --- Externals/crystaledit/editlib/icu.cpp | 39 ++++++++++++++++++++++++++- Externals/crystaledit/editlib/icu.hpp | 47 ++++++++++++++++++++++++++++++-- Src/stringdiffs.cpp | 51 ++++++++++++++++------------------- Src/stringdiffsi.h | 4 --- 4 files changed, 106 insertions(+), 35 deletions(-) diff --git a/Externals/crystaledit/editlib/icu.cpp b/Externals/crystaledit/editlib/icu.cpp index 386e81299..36c131530 100644 --- a/Externals/crystaledit/editlib/icu.cpp +++ b/Externals/crystaledit/editlib/icu.cpp @@ -2,5 +2,42 @@ #define ICU_EXTERN #include "icu.hpp" +static ICULoader m_ICULoader; HMODULE ICULoader::m_hLibrary = nullptr; -static ICULoader m_ICULoader; \ No newline at end of file +template <> thread_local std::unique_ptr m_pCharaterBreakIterator<1>; +template <> thread_local std::unique_ptr m_pCharaterBreakIterator<2>; +template <> thread_local std::unique_ptr m_pCharaterBreakIterator<3>; +template <> thread_local std::unique_ptr m_pCharaterBreakIterator<4>; + +// This rule set is based on character-break iterator rules of ICU 63.1 + // . +const UChar* ICUBreakIterator::kCustomRules = +u"$CR = [\\p{Grapheme_Cluster_Break = CR}];" +u"$LF = [\\p{Grapheme_Cluster_Break = LF}];" +u"$Control = [[\\p{Grapheme_Cluster_Break = Control}]];" +u"$VoiceMarks = [\\uFF9E\\uFF9F];" +u"$Extend = [[\\p{Grapheme_Cluster_Break = Extend}] - $VoiceMarks];" +u"$ZWJ = [\\p{Grapheme_Cluster_Break = ZWJ}];" +u"$Regional_Indicator = [\\p{Grapheme_Cluster_Break = Regional_Indicator}];" +u"$Prepend = [\\p{Grapheme_Cluster_Break = Prepend}];" +u"$SpacingMark = [\\p{Grapheme_Cluster_Break = SpacingMark}];" +u"$L = [\\p{Grapheme_Cluster_Break = L}];" +u"$V = [\\p{Grapheme_Cluster_Break = V}];" +u"$T = [\\p{Grapheme_Cluster_Break = T}];" +u"$LV = [\\p{Grapheme_Cluster_Break = LV}];" +u"$LVT = [\\p{Grapheme_Cluster_Break = LVT}];" +u"$Extended_Pict = [:ExtPict:];" +u"!!chain;" +u"!!lookAheadHardBreak;" +u"$VoiceMarks;" +u"$CR $LF;" +u"$L ($L | $V | $LV | $LVT);" +u"($LV | $V) ($V | $T);" +u"($LVT | $T) $T;" +u"[^$Control $CR $LF] ($Extend | $ZWJ);" +u"[^$Control $CR $LF] $SpacingMark;" +u"$Prepend [^$Control $CR $LF];" +u"$Extended_Pict $Extend* $ZWJ $Extended_Pict;" +u"^$Prepend* $Regional_Indicator $Regional_Indicator / $Regional_Indicator;" +u"^$Prepend* $Regional_Indicator $Regional_Indicator;" +u".;"; diff --git a/Externals/crystaledit/editlib/icu.hpp b/Externals/crystaledit/editlib/icu.hpp index e5aabe1bb..7fe520201 100644 --- a/Externals/crystaledit/editlib/icu.hpp +++ b/Externals/crystaledit/editlib/icu.hpp @@ -20,6 +20,18 @@ typedef int32_t UChar32; typedef char16_t UChar; typedef struct UBreakIterator UBreakIterator; typedef enum UErrorCode { U_ZERO_ERROR = 0 } UErrorCode; +enum { U_PARSE_CONTEXT_LEN = 16 }; +typedef struct UParseError { + int32_t line; + int32_t offset; + UChar preContext[U_PARSE_CONTEXT_LEN]; + UChar postContext[U_PARSE_CONTEXT_LEN]; +} UParseError; + +class ICUBreakIterator; + +template +extern thread_local std::unique_ptr m_pCharaterBreakIterator; typedef UBreakIterator* (*ubrk_open_type)(UBreakIteratorType type, const char* locale, const UChar* text, int32_t textLength, UErrorCode* status); ICU_EXTERN UBreakIterator* (*g_pubrk_open)(UBreakIteratorType type, const char* locale, const UChar* text, int32_t textLength, UErrorCode* status); @@ -28,6 +40,13 @@ inline UBreakIterator* ubrk_open(UBreakIteratorType type, const char* locale, co return g_pubrk_open(type, locale, text, textLength, status); } +typedef UBreakIterator* (*ubrk_openRules_type)(const UChar *rules, int32_t rulesLength, const UChar *text, int32_t textLength, UParseError *parseErr, UErrorCode *status); +ICU_EXTERN UBreakIterator* (*g_pubrk_openRules)(const UChar *rules, int32_t rulesLength, const UChar *text, int32_t textLength, UParseError *parseErr, UErrorCode *status); +inline UBreakIterator* ubrk_openRules(const UChar *rules, int32_t rulesLength, const UChar *text, int32_t textLength, UParseError *parseErr, UErrorCode *status) +{ + return g_pubrk_openRules(rules, rulesLength, text, textLength, parseErr, status); +} + typedef void (*ubrk_setText_type)(UBreakIterator *bi, const UChar* text, int32_t textLength, UErrorCode* status); ICU_EXTERN void (*g_pubrk_setText)(UBreakIterator *bi, const UChar* text, int32_t textLength, UErrorCode* status); inline void ubrk_setText(UBreakIterator *bi, const UChar* text, int32_t textLength, UErrorCode* status) @@ -85,6 +104,7 @@ public: if (!m_hLibrary) return; g_pubrk_open = reinterpret_cast(GetProcAddress(m_hLibrary, "ubrk_open")); + g_pubrk_openRules = reinterpret_cast(GetProcAddress(m_hLibrary, "ubrk_openRules")); g_pubrk_setText = reinterpret_cast(GetProcAddress(m_hLibrary, "ubrk_setText")); g_pubrk_close = reinterpret_cast(GetProcAddress(m_hLibrary, "ubrk_close")); g_pubrk_first = reinterpret_cast(GetProcAddress(m_hLibrary, "ubrk_first")); @@ -114,7 +134,15 @@ public: if (ICULoader::IsLoaded()) { UErrorCode status = U_ZERO_ERROR; - m_iter = ubrk_open(type, locale, reinterpret_cast(text), textLength, &status); + if (type == UBRK_CHARACTER) + { + UParseError parseError; + m_iter = ubrk_openRules(kCustomRules, static_cast(wcslen(reinterpret_cast(kCustomRules))), text, textLength, &parseError, &status); + } + else + { + m_iter = ubrk_open(type, locale, reinterpret_cast(text), textLength, &status); + } if (m_iter) ubrk_first(m_iter); } @@ -182,6 +210,21 @@ public: return myfollowing(offset); } + static ICUBreakIterator *getCharacterBreakIterator(const UChar * text, int32_t textLength) + { + return getCharacterBreakIterator<1>(text, textLength); + } + + template + static ICUBreakIterator *getCharacterBreakIterator(const UChar * text, int32_t textLength) + { + if (!m_pCharaterBreakIterator) + m_pCharaterBreakIterator.reset(new ICUBreakIterator(UBRK_CHARACTER, "en", text, textLength)); + else + m_pCharaterBreakIterator->setText(text, textLength); + return m_pCharaterBreakIterator.get(); + } + private: int mynext() { @@ -266,11 +309,11 @@ private: } return m_i; } - UBreakIterator* m_iter; UBreakIteratorType m_type; const UChar *m_text; int m_i; int m_textLength; + static const UChar *kCustomRules; }; diff --git a/Src/stringdiffs.cpp b/Src/stringdiffs.cpp index 15a971bab..508bdf7b9 100644 --- a/Src/stringdiffs.cpp +++ b/Src/stringdiffs.cpp @@ -208,10 +208,6 @@ stringdiffs::stringdiffs(const String & str1, const String & str2, , m_breakType(breakType) , m_pDiffs(pDiffs) , m_matchblock(true) // Change to false to get word to word compare -, m_iterCharBegin1(UBRK_CHARACTER, "en", nullptr, 0) -, m_iterCharBegin2(UBRK_CHARACTER, "en", nullptr, 0) -, m_iterCharEnd1(UBRK_CHARACTER, "en", nullptr, 0) -, m_iterCharEnd2(UBRK_CHARACTER, "en", nullptr, 0) { } @@ -369,8 +365,7 @@ stringdiffs::BuildWordsArray(const String & str) { std::vector words; int i = 0, begin = 0; - - m_iterCharBegin1.setText(reinterpret_cast(str.c_str()), static_cast(str.length())); + ICUBreakIterator *pIterChar = ICUBreakIterator::getCharacterBreakIterator(reinterpret_cast(str.c_str()), static_cast(str.length())); size_t sLen = str.length(); assert(sLen < INT_MAX); @@ -383,7 +378,7 @@ stringdiffs::BuildWordsArray(const String & str) inspace: if (isSafeWhitespace(str[i])) { - i = m_iterCharBegin1.next(); + i = pIterChar->next(); goto inspace; } if (begin < i) @@ -425,14 +420,14 @@ inword: { // start a new word because we hit a non-whitespace word break (eg, a comma) // but, we have to put each word break character into its own word - int inext = m_iterCharBegin1.next(); + int inext = pIterChar->next(); words.push_back(word(i, inext - 1, dlbreak, Hash(str, i, inext - 1, 0))); i = inext; begin = i; goto inword; } } - i = m_iterCharBegin1.next(); + i = pIterChar->next(); goto inword; // safe even if we're at the end or no longer in a word } @@ -756,10 +751,10 @@ stringdiffs::ComputeByteDiff(const String & str1, const String & str2, const TCHAR *pbeg1 = str1.c_str(); const TCHAR *pbeg2 = str2.c_str(); - m_iterCharBegin1.setText(reinterpret_cast(pbeg1), static_cast(len1)); - m_iterCharBegin2.setText(reinterpret_cast(pbeg2), static_cast(len2)); - m_iterCharEnd1.setText(reinterpret_cast(pbeg1), static_cast(len1)); - m_iterCharEnd2.setText(reinterpret_cast(pbeg2), static_cast(len2)); + ICUBreakIterator *pIterCharBegin1 = ICUBreakIterator::getCharacterBreakIterator(reinterpret_cast(pbeg1), static_cast(len1)); + ICUBreakIterator *pIterCharBegin2 = ICUBreakIterator::getCharacterBreakIterator<2>(reinterpret_cast(pbeg2), static_cast(len2)); + ICUBreakIterator *pIterCharEnd1 = ICUBreakIterator::getCharacterBreakIterator<3>(reinterpret_cast(pbeg1), static_cast(len1)); + ICUBreakIterator *pIterCharEnd2 = ICUBreakIterator::getCharacterBreakIterator<4>(reinterpret_cast(pbeg2), static_cast(len2)); if (len1 == 0 || len2 == 0) { @@ -778,8 +773,8 @@ stringdiffs::ComputeByteDiff(const String & str1, const String & str2, const TCHAR *py2 = pbeg2; // pen1,pen2 point to the last valid character (broken multibyte lead chars don't count) - const TCHAR *pen1 = pbeg1 + (len1 > 0 ? m_iterCharEnd1.preceding(len1) : 0); - const TCHAR *pen2 = pbeg2 + (len2 > 0 ? m_iterCharEnd2.preceding(len2) : 0); + const TCHAR *pen1 = pbeg1 + (len1 > 0 ? pIterCharEnd1->preceding(len1) : 0); + const TCHAR *pen2 = pbeg2 + (len2 > 0 ? pIterCharEnd2->preceding(len2) : 0); size_t glyphlenz1 = pbeg1 + len1 - pen1; size_t glyphlenz2 = pbeg2 + len2 - pen2; @@ -789,9 +784,9 @@ stringdiffs::ComputeByteDiff(const String & str1, const String & str2, // by advancing py1 and py2 // and retreating pen1 and pen2 while (py1 < pen1 && isSafeWhitespace(*py1)) - py1 = pbeg1 + m_iterCharBegin1.next(); + py1 = pbeg1 + pIterCharBegin1->next(); while (py2 < pen2 && isSafeWhitespace(*py2)) - py2 = pbeg2 + m_iterCharBegin2.next(); + py2 = pbeg2 + pIterCharBegin2->next(); if ((pen1 < pbeg1 + len1 - 1 || pen2 < pbeg2 + len2 -1) && (!len1 || !len2 || pbeg1[len1] != pbeg2[len2])) { @@ -800,9 +795,9 @@ stringdiffs::ComputeByteDiff(const String & str1, const String & str2, else { while (pen1 > py1 && isSafeWhitespace(*pen1)) - pen1 = pbeg1 + m_iterCharEnd1.previous(); + pen1 = pbeg1 + pIterCharEnd1->previous(); while (pen2 > py2 && isSafeWhitespace(*pen2)) - pen2 = pbeg2 + m_iterCharEnd2.previous(); + pen2 = pbeg2 + pIterCharEnd2->previous(); } } //check for excaption of empty string on one side @@ -864,8 +859,8 @@ stringdiffs::ComputeByteDiff(const String & str1, const String & str2, continue; } - const TCHAR* py1next = pbeg1 + m_iterCharBegin1.next(); - const TCHAR* py2next = pbeg2 + m_iterCharBegin2.next(); + const TCHAR* py1next = pbeg1 + pIterCharBegin1->next(); + const TCHAR* py2next = pbeg2 + pIterCharBegin2->next(); size_t glyphleny1 = py1next - py1; size_t glyphleny2 = py2next - py2; if (glyphleny1 != glyphleny2 || !matchchar(py1, py2, glyphleny1, casitive)) @@ -906,9 +901,9 @@ stringdiffs::ComputeByteDiff(const String & str1, const String & str2, break; // done with reverse search // gobble up all whitespace in current area while (pz1 > py1 && isSafeWhitespace(*pz1)) - pz1 = pbeg1 + m_iterCharEnd1.previous(); + pz1 = pbeg1 + pIterCharEnd1->previous(); while (pz2 > py2 && isSafeWhitespace(*pz2)) - pz2 = pbeg2 + m_iterCharEnd2.previous(); + pz2 = pbeg2 + pIterCharEnd2->previous(); continue; } @@ -917,7 +912,7 @@ stringdiffs::ComputeByteDiff(const String & str1, const String & str2, if (xwhite==1) break; // done with reverse search while (pz2 > py2 && isSafeWhitespace(*pz2)) - pz2 = pbeg2 + m_iterCharEnd2.previous(); + pz2 = pbeg2 + pIterCharEnd2->previous(); continue; } @@ -925,8 +920,8 @@ stringdiffs::ComputeByteDiff(const String & str1, const String & str2, break; // done with forward search const TCHAR* pz1next = pz1; const TCHAR* pz2next = pz2; - pz1 = (pz1 > pbeg1) ? pbeg1 + m_iterCharEnd1.preceding(static_cast(pz1 - pbeg1)) : pz1 - 1; - pz2 = (pz2 > pbeg2) ? pbeg2 + m_iterCharEnd2.preceding(static_cast(pz2 - pbeg2)) : pz2 - 1; + pz1 = (pz1 > pbeg1) ? pbeg1 + pIterCharEnd1->preceding(static_cast(pz1 - pbeg1)) : pz1 - 1; + pz2 = (pz2 > pbeg2) ? pbeg2 + pIterCharEnd2->preceding(static_cast(pz2 - pbeg2)) : pz2 - 1; glyphlenz1 = pz1next - pz1; glyphlenz2 = pz2next - pz2; // Now do real character match @@ -954,8 +949,8 @@ stringdiffs::ComputeByteDiff(const String & str1, const String & str2, }*/ // Store results of advance into return variables (end[0] & end[1]) - end[0] = static_cast(pz1 - pbeg1) + glyphlenz1 - 1; - end[1] = static_cast(pz2 - pbeg2) + glyphlenz2 - 1; + end[0] = static_cast(pz1 - pbeg1 + glyphlenz1 - 1); + end[1] = static_cast(pz2 - pbeg2 + glyphlenz2 - 1); // Check if difference region was empty if (begin[0] == end[0] + 1 && begin[1] == end[1] + 1) diff --git a/Src/stringdiffsi.h b/Src/stringdiffsi.h index 7f5080630..ff555167b 100644 --- a/Src/stringdiffsi.h +++ b/Src/stringdiffsi.h @@ -118,10 +118,6 @@ private: std::vector m_words1; std::vector m_words2; std::vector m_wdiffs; - ICUBreakIterator m_iterCharBegin1; - ICUBreakIterator m_iterCharBegin2; - ICUBreakIterator m_iterCharEnd1; - ICUBreakIterator m_iterCharEnd2; }; } -- 2.11.0