OSDN Git Service

Improve Unicode character support using icu.dll, usable from Windows 10 Creators Update...
author Takashi Sawanaka <sdottaka@users.sourceforge.net>
Sat, 23 Nov 2019 13:11:49 +0000 (22:11 +0900)
committer Takashi Sawanaka <sdottaka@users.sourceforge.net>
Sat, 23 Nov 2019 13:11:49 +0000 (22:11 +0900)
Fixed an issue where Halfwidth Katakana Voiced Sound Mark (U+FF9E) and Halfwidth Katakana Semi-Voiced Sound Mark (U+FF9F) were not treated as one character.

Externals/crystaledit/editlib/icu.cpp
Externals/crystaledit/editlib/icu.hpp
Src/stringdiffs.cpp
Src/stringdiffsi.h

index 386e812..36c1315 100644 (file)
@@ -2,5 +2,42 @@
 #define ICU_EXTERN\r
 #include "icu.hpp"\r
 \r
+static ICULoader m_ICULoader; \r
 HMODULE ICULoader::m_hLibrary = nullptr;\r
-static ICULoader m_ICULoader; 
\ No newline at end of file
+template <> thread_local std::unique_ptr<ICUBreakIterator> m_pCharaterBreakIterator<1>;\r
+template <> thread_local std::unique_ptr<ICUBreakIterator> m_pCharaterBreakIterator<2>;\r
+template <> thread_local std::unique_ptr<ICUBreakIterator> m_pCharaterBreakIterator<3>;\r
+template <> thread_local std::unique_ptr<ICUBreakIterator> m_pCharaterBreakIterator<4>;\r
+\r
+// This rule set is based on character-break iterator rules of ICU 63.1\r
+  // <https://github.com/unicode-org/icu/blob/release-63-1/icu4c/source/data/brkitr/rules/char.txt>.\r
+const UChar* ICUBreakIterator::kCustomRules =\r
+u"$CR          = [\\p{Grapheme_Cluster_Break = CR}];"\r
+u"$LF          = [\\p{Grapheme_Cluster_Break = LF}];"\r
+u"$Control     = [[\\p{Grapheme_Cluster_Break = Control}]];"\r
+u"$VoiceMarks  = [\\uFF9E\\uFF9F];"\r
+u"$Extend      = [[\\p{Grapheme_Cluster_Break = Extend}] - $VoiceMarks];"\r
+u"$ZWJ         = [\\p{Grapheme_Cluster_Break = ZWJ}];"\r
+u"$Regional_Indicator = [\\p{Grapheme_Cluster_Break = Regional_Indicator}];"\r
+u"$Prepend     = [\\p{Grapheme_Cluster_Break = Prepend}];"\r
+u"$SpacingMark = [\\p{Grapheme_Cluster_Break = SpacingMark}];"\r
+u"$L           = [\\p{Grapheme_Cluster_Break = L}];"\r
+u"$V           = [\\p{Grapheme_Cluster_Break = V}];"\r
+u"$T           = [\\p{Grapheme_Cluster_Break = T}];"\r
+u"$LV          = [\\p{Grapheme_Cluster_Break = LV}];"\r
+u"$LVT         = [\\p{Grapheme_Cluster_Break = LVT}];"\r
+u"$Extended_Pict = [:ExtPict:];"\r
+u"!!chain;"\r
+u"!!lookAheadHardBreak;"\r
+u"$VoiceMarks;"\r
+u"$CR $LF;"\r
+u"$L ($L | $V | $LV | $LVT);"\r
+u"($LV | $V) ($V | $T);"\r
+u"($LVT | $T) $T;"\r
+u"[^$Control $CR $LF] ($Extend | $ZWJ);"\r
+u"[^$Control $CR $LF] $SpacingMark;"\r
+u"$Prepend [^$Control $CR $LF];"\r
+u"$Extended_Pict $Extend* $ZWJ $Extended_Pict;"\r
+u"^$Prepend* $Regional_Indicator $Regional_Indicator / $Regional_Indicator;"\r
+u"^$Prepend* $Regional_Indicator $Regional_Indicator;"\r
+u".;";\r
index e5aabe1..7fe5202 100644 (file)
@@ -20,6 +20,18 @@ typedef int32_t UChar32;
 typedef char16_t UChar;\r
 typedef struct UBreakIterator UBreakIterator;\r
 typedef enum UErrorCode { U_ZERO_ERROR = 0 } UErrorCode;\r
+enum { U_PARSE_CONTEXT_LEN = 16 };\r
+typedef struct UParseError {\r
+       int32_t line;\r
+       int32_t offset;\r
+       UChar   preContext[U_PARSE_CONTEXT_LEN];\r
+       UChar   postContext[U_PARSE_CONTEXT_LEN];\r
+} UParseError;\r
+\r
+class ICUBreakIterator;\r
+\r
+template<int N>\r
+extern thread_local std::unique_ptr<ICUBreakIterator> m_pCharaterBreakIterator;\r
 \r
 typedef UBreakIterator* (*ubrk_open_type)(UBreakIteratorType type, const char* locale, const UChar* text, int32_t textLength, UErrorCode* status);\r
 ICU_EXTERN UBreakIterator* (*g_pubrk_open)(UBreakIteratorType type, const char* locale, const UChar* text, int32_t textLength, UErrorCode* status);\r
@@ -28,6 +40,13 @@ inline UBreakIterator* ubrk_open(UBreakIteratorType type, const char* locale, co
        return g_pubrk_open(type, locale, text, textLength, status);\r
 }\r
 \r
+typedef UBreakIterator* (*ubrk_openRules_type)(const UChar *rules, int32_t rulesLength, const UChar *text, int32_t textLength, UParseError *parseErr, UErrorCode *status);\r
+ICU_EXTERN UBreakIterator* (*g_pubrk_openRules)(const UChar *rules, int32_t rulesLength, const UChar *text, int32_t textLength, UParseError *parseErr, UErrorCode *status);\r
+inline UBreakIterator* ubrk_openRules(const UChar *rules, int32_t rulesLength, const UChar *text, int32_t textLength, UParseError *parseErr, UErrorCode *status)\r
+{\r
+       return g_pubrk_openRules(rules, rulesLength, text, textLength, parseErr, status);\r
+}\r
+\r
 typedef void (*ubrk_setText_type)(UBreakIterator *bi, const UChar* text, int32_t textLength, UErrorCode* status);\r
 ICU_EXTERN void (*g_pubrk_setText)(UBreakIterator *bi, const UChar* text, int32_t textLength, UErrorCode* status);\r
 inline void ubrk_setText(UBreakIterator *bi, const UChar* text, int32_t textLength, UErrorCode* status)\r
@@ -85,6 +104,7 @@ public:
                if (!m_hLibrary)\r
                        return;\r
                g_pubrk_open = reinterpret_cast<ubrk_open_type>(GetProcAddress(m_hLibrary, "ubrk_open"));\r
+               g_pubrk_openRules = reinterpret_cast<ubrk_openRules_type>(GetProcAddress(m_hLibrary, "ubrk_openRules"));\r
                g_pubrk_setText = reinterpret_cast<ubrk_setText_type>(GetProcAddress(m_hLibrary, "ubrk_setText"));\r
                g_pubrk_close = reinterpret_cast<ubrk_close_type>(GetProcAddress(m_hLibrary, "ubrk_close"));\r
                g_pubrk_first = reinterpret_cast<ubrk_first_type>(GetProcAddress(m_hLibrary, "ubrk_first"));\r
@@ -114,7 +134,15 @@ public:
                if (ICULoader::IsLoaded())\r
                {\r
                        UErrorCode status = U_ZERO_ERROR;\r
-                       m_iter = ubrk_open(type, locale, reinterpret_cast<const UChar *>(text), textLength, &status);\r
+                       if (type == UBRK_CHARACTER)\r
+                       {\r
+                               UParseError parseError;\r
+                               m_iter = ubrk_openRules(kCustomRules, static_cast<int32_t>(wcslen(reinterpret_cast<const wchar_t *>(kCustomRules))), text, textLength, &parseError, &status);\r
+                       }\r
+                       else\r
+                       {\r
+                               m_iter = ubrk_open(type, locale, reinterpret_cast<const UChar *>(text), textLength, &status);\r
+                       }\r
                        if (m_iter)\r
                                ubrk_first(m_iter);\r
                }\r
@@ -182,6 +210,21 @@ public:
                return myfollowing(offset);\r
        }\r
 \r
+       static ICUBreakIterator *getCharacterBreakIterator(const UChar * text, int32_t textLength)\r
+       {\r
+               return getCharacterBreakIterator<1>(text, textLength);\r
+       }\r
+\r
+       template<int N>\r
+       static ICUBreakIterator *getCharacterBreakIterator(const UChar * text, int32_t textLength)\r
+       {\r
+               if (!m_pCharaterBreakIterator<N>)\r
+                       m_pCharaterBreakIterator<N>.reset(new ICUBreakIterator(UBRK_CHARACTER, "en", text, textLength));\r
+               else\r
+                       m_pCharaterBreakIterator<N>->setText(text, textLength);\r
+               return m_pCharaterBreakIterator<N>.get();\r
+       }\r
+\r
 private:\r
        int mynext()\r
        {\r
@@ -266,11 +309,11 @@ private:
                }\r
                return m_i;\r
        }\r
-\r
        UBreakIterator* m_iter;\r
        UBreakIteratorType m_type;\r
        const UChar *m_text;\r
        int m_i;\r
        int m_textLength;\r
+       static const UChar *kCustomRules;\r
 };\r
 \r
index 15a971b..508bdf7 100644 (file)
@@ -208,10 +208,6 @@ stringdiffs::stringdiffs(const String & str1, const String & str2,
 , m_breakType(breakType)
 , m_pDiffs(pDiffs)
 , m_matchblock(true) // Change to false to get word to word compare
-, m_iterCharBegin1(UBRK_CHARACTER, "en", nullptr, 0)
-, m_iterCharBegin2(UBRK_CHARACTER, "en", nullptr, 0)
-, m_iterCharEnd1(UBRK_CHARACTER, "en", nullptr, 0)
-, m_iterCharEnd2(UBRK_CHARACTER, "en", nullptr, 0)
 {
 }
 
@@ -369,8 +365,7 @@ stringdiffs::BuildWordsArray(const String & str)
 {
        std::vector<word> words;
        int i = 0, begin = 0;
-
-       m_iterCharBegin1.setText(reinterpret_cast<const UChar *>(str.c_str()), static_cast<int32_t>(str.length()));
+       ICUBreakIterator *pIterChar = ICUBreakIterator::getCharacterBreakIterator(reinterpret_cast<const UChar *>(str.c_str()), static_cast<int32_t>(str.length()));
 
        size_t sLen = str.length();
        assert(sLen < INT_MAX);
@@ -383,7 +378,7 @@ stringdiffs::BuildWordsArray(const String & str)
 inspace:
        if (isSafeWhitespace(str[i])) 
        {
-               i = m_iterCharBegin1.next();
+               i = pIterChar->next();
                goto inspace;
        }
        if (begin < i)
@@ -425,14 +420,14 @@ inword:
                {
                        // start a new word because we hit a non-whitespace word break (eg, a comma)
                        // but, we have to put each word break character into its own word
-                       int inext = m_iterCharBegin1.next();
+                       int inext = pIterChar->next();
                        words.push_back(word(i, inext - 1, dlbreak, Hash(str, i, inext - 1, 0)));
                        i = inext;
                        begin = i;
                        goto inword;
                }
        }
-       i = m_iterCharBegin1.next();
+       i = pIterChar->next();
        goto inword; // safe even if we're at the end or no longer in a word
 }
 
@@ -756,10 +751,10 @@ stringdiffs::ComputeByteDiff(const String & str1, const String & str2,
        const TCHAR *pbeg1 = str1.c_str();
        const TCHAR *pbeg2 = str2.c_str();
 
-       m_iterCharBegin1.setText(reinterpret_cast<const UChar *>(pbeg1), static_cast<int32_t>(len1));
-       m_iterCharBegin2.setText(reinterpret_cast<const UChar *>(pbeg2), static_cast<int32_t>(len2));
-       m_iterCharEnd1.setText(reinterpret_cast<const UChar *>(pbeg1), static_cast<int32_t>(len1));
-       m_iterCharEnd2.setText(reinterpret_cast<const UChar *>(pbeg2), static_cast<int32_t>(len2));
+       ICUBreakIterator *pIterCharBegin1 = ICUBreakIterator::getCharacterBreakIterator(reinterpret_cast<const UChar *>(pbeg1), static_cast<int32_t>(len1));
+       ICUBreakIterator *pIterCharBegin2 = ICUBreakIterator::getCharacterBreakIterator<2>(reinterpret_cast<const UChar *>(pbeg2), static_cast<int32_t>(len2));
+       ICUBreakIterator *pIterCharEnd1 = ICUBreakIterator::getCharacterBreakIterator<3>(reinterpret_cast<const UChar *>(pbeg1), static_cast<int32_t>(len1));
+       ICUBreakIterator *pIterCharEnd2 = ICUBreakIterator::getCharacterBreakIterator<4>(reinterpret_cast<const UChar *>(pbeg2), static_cast<int32_t>(len2));
        
        if (len1 == 0 || len2 == 0)
        {
@@ -778,8 +773,8 @@ stringdiffs::ComputeByteDiff(const String & str1, const String & str2,
        const TCHAR *py2 = pbeg2;
 
        // pen1,pen2 point to the last valid character (broken multibyte lead chars don't count)
-       const TCHAR *pen1 = pbeg1 + (len1 > 0 ? m_iterCharEnd1.preceding(len1) : 0);
-       const TCHAR *pen2 = pbeg2 + (len2 > 0 ? m_iterCharEnd2.preceding(len2) : 0);
+       const TCHAR *pen1 = pbeg1 + (len1 > 0 ? pIterCharEnd1->preceding(len1) : 0);
+       const TCHAR *pen2 = pbeg2 + (len2 > 0 ? pIterCharEnd2->preceding(len2) : 0);
        size_t glyphlenz1 = pbeg1 + len1 - pen1;
        size_t glyphlenz2 = pbeg2 + len2 - pen2;
 
@@ -789,9 +784,9 @@ stringdiffs::ComputeByteDiff(const String & str1, const String & str2,
                // by advancing py1 and py2
                // and retreating pen1 and pen2
                while (py1 < pen1 && isSafeWhitespace(*py1))
-                       py1 = pbeg1 + m_iterCharBegin1.next();
+                       py1 = pbeg1 + pIterCharBegin1->next();
                while (py2 < pen2 && isSafeWhitespace(*py2))
-                       py2 = pbeg2 + m_iterCharBegin2.next();
+                       py2 = pbeg2 + pIterCharBegin2->next();
                if ((pen1 < pbeg1 + len1 - 1 || pen2 < pbeg2 + len2 -1)
                        && (!len1 || !len2 || pbeg1[len1] != pbeg2[len2]))
                {
@@ -800,9 +795,9 @@ stringdiffs::ComputeByteDiff(const String & str1, const String & str2,
                else
                {
                        while (pen1 > py1 && isSafeWhitespace(*pen1))
-                               pen1 = pbeg1 + m_iterCharEnd1.previous();
+                               pen1 = pbeg1 + pIterCharEnd1->previous();
                        while (pen2 > py2 && isSafeWhitespace(*pen2))
-                               pen2 = pbeg2 + m_iterCharEnd2.previous();
+                               pen2 = pbeg2 + pIterCharEnd2->previous();
                }
        }
        //check for excaption of empty string on one side
@@ -864,8 +859,8 @@ stringdiffs::ComputeByteDiff(const String & str1, const String & str2,
                        continue;
                }
 
-               const TCHAR* py1next = pbeg1 + m_iterCharBegin1.next();
-               const TCHAR* py2next = pbeg2 + m_iterCharBegin2.next();
+               const TCHAR* py1next = pbeg1 + pIterCharBegin1->next();
+               const TCHAR* py2next = pbeg2 + pIterCharBegin2->next();
                size_t glyphleny1 = py1next - py1;
                size_t glyphleny2 = py2next - py2;
                if (glyphleny1 != glyphleny2 || !matchchar(py1, py2, glyphleny1, casitive))
@@ -906,9 +901,9 @@ stringdiffs::ComputeByteDiff(const String & str1, const String & str2,
                                break; // done with reverse search
                        // gobble up all whitespace in current area
                        while (pz1 > py1 && isSafeWhitespace(*pz1))
-                               pz1 = pbeg1 + m_iterCharEnd1.previous();
+                               pz1 = pbeg1 + pIterCharEnd1->previous();
                        while (pz2 > py2 && isSafeWhitespace(*pz2))
-                               pz2 = pbeg2 + m_iterCharEnd2.previous();
+                               pz2 = pbeg2 + pIterCharEnd2->previous();
                        continue;
 
                }
@@ -917,7 +912,7 @@ stringdiffs::ComputeByteDiff(const String & str1, const String & str2,
                        if (xwhite==1)
                                break; // done with reverse search
                        while (pz2 > py2 && isSafeWhitespace(*pz2))
-                               pz2 = pbeg2 + m_iterCharEnd2.previous();
+                               pz2 = pbeg2 + pIterCharEnd2->previous();
                        continue;
                }
 
@@ -925,8 +920,8 @@ stringdiffs::ComputeByteDiff(const String & str1, const String & str2,
                        break; // done with forward search
                const TCHAR* pz1next = pz1;
                const TCHAR* pz2next = pz2;
-               pz1 = (pz1 > pbeg1) ? pbeg1 + m_iterCharEnd1.preceding(static_cast<int32_t>(pz1 - pbeg1)) : pz1 - 1;
-               pz2 = (pz2 > pbeg2) ? pbeg2 + m_iterCharEnd2.preceding(static_cast<int32_t>(pz2 - pbeg2)) : pz2 - 1;
+               pz1 = (pz1 > pbeg1) ? pbeg1 + pIterCharEnd1->preceding(static_cast<int32_t>(pz1 - pbeg1)) : pz1 - 1;
+               pz2 = (pz2 > pbeg2) ? pbeg2 + pIterCharEnd2->preceding(static_cast<int32_t>(pz2 - pbeg2)) : pz2 - 1;
                glyphlenz1 = pz1next - pz1;
                glyphlenz2 = pz2next - pz2;
                // Now do real character match
@@ -954,8 +949,8 @@ stringdiffs::ComputeByteDiff(const String & str1, const String & str2,
        }*/
 
        // Store results of advance into return variables (end[0] & end[1])
-       end[0] = static_cast<int>(pz1 - pbeg1) + glyphlenz1 - 1;
-       end[1] = static_cast<int>(pz2 - pbeg2) + glyphlenz2 - 1;
+       end[0] = static_cast<int>(pz1 - pbeg1 + glyphlenz1 - 1);
+       end[1] = static_cast<int>(pz2 - pbeg2 + glyphlenz2 - 1);
 
        // Check if difference region was empty
        if (begin[0] == end[0] + 1 && begin[1] == end[1] + 1)
index 7f50806..ff55516 100644 (file)
@@ -118,10 +118,6 @@ private:
        std::vector<word> m_words1;
        std::vector<word> m_words2;
        std::vector<wdiff> m_wdiffs;
-       ICUBreakIterator m_iterCharBegin1;
-       ICUBreakIterator m_iterCharBegin2;
-       ICUBreakIterator m_iterCharEnd1;
-       ICUBreakIterator m_iterCharEnd2;
 };
 
 }