From ce7ceb240fd8d559c1a9c4a7d3bee46bacb52c99 Mon Sep 17 00:00:00 2001
From: Takashi Sawanaka <sdottaka@users.sourceforge.net>
Date: Sat, 23 Nov 2019 22:11:49 +0900
Subject: [PATCH] Improve UNICODE character support using icu.dll usable from
 Windows 10 Creators Update (10): Fixed an issue where Halfwidth Katakana
 Voiced Sound Mark (U+FF9E) and Halfwidth Katakana Semi-Voiced Sound
 Mark (U+FF9F) were not treated as one character

---
 Externals/crystaledit/editlib/icu.cpp | 39 ++++++++++++++++++++++++++-
 Externals/crystaledit/editlib/icu.hpp | 47 ++++++++++++++++++++++++++++++--
 Src/stringdiffs.cpp                   | 51 ++++++++++++++++-------------------
 Src/stringdiffsi.h                    |  4 ---
 4 files changed, 106 insertions(+), 35 deletions(-)
diff --git a/Externals/crystaledit/editlib/icu.cpp b/Externals/crystaledit/editlib/icu.cpp
index 386e81299..36c131530 100644
--- a/Externals/crystaledit/editlib/icu.cpp
+++ b/Externals/crystaledit/editlib/icu.cpp
@@ -2,5 +2,42 @@
 #define ICU_EXTERN
 #include "icu.hpp"
 
+static ICULoader m_ICULoader; 
 HMODULE ICULoader::m_hLibrary = nullptr;
-static ICULoader m_ICULoader; 
\ No newline at end of file
+template <> thread_local std::unique_ptr<ICUBreakIterator> m_pCharaterBreakIterator<1>; // slot 1: the slot used by the non-template getCharacterBreakIterator(text, len)
+template <> thread_local std::unique_ptr<ICUBreakIterator> m_pCharaterBreakIterator<2>; // slots 2-4: further per-thread iterators, reached through getCharacterBreakIterator<N>
+template <> thread_local std::unique_ptr<ICUBreakIterator> m_pCharaterBreakIterator<3>; // each slot starts empty and is filled on first request
+template <> thread_local std::unique_ptr<ICUBreakIterator> m_pCharaterBreakIterator<4>; // stringdiffs::ComputeByteDiff asks for all four slots at once
+
+// Rule source handed to ubrk_openRules for UBRK_CHARACTER iterators. Based on the ICU 63.1 character-break rules, with U+FF9E/U+FF9F ($VoiceMarks) taken out of $Extend:
+// <https://github.com/unicode-org/icu/blob/release-63-1/icu4c/source/data/brkitr/rules/char.txt>.
+const UChar* ICUBreakIterator::kCustomRules =
+u"$CR          = [\\p{Grapheme_Cluster_Break = CR}];"
+u"$LF          = [\\p{Grapheme_Cluster_Break = LF}];"
+u"$Control     = [[\\p{Grapheme_Cluster_Break = Control}]];"
+u"$VoiceMarks  = [\\uFF9E\\uFF9F];" // Halfwidth Katakana Voiced / Semi-Voiced Sound Marks
+u"$Extend      = [[\\p{Grapheme_Cluster_Break = Extend}] - $VoiceMarks];" // set difference: the two voice marks are not treated as Extend
+u"$ZWJ         = [\\p{Grapheme_Cluster_Break = ZWJ}];"
+u"$Regional_Indicator = [\\p{Grapheme_Cluster_Break = Regional_Indicator}];"
+u"$Prepend     = [\\p{Grapheme_Cluster_Break = Prepend}];"
+u"$SpacingMark = [\\p{Grapheme_Cluster_Break = SpacingMark}];"
+u"$L           = [\\p{Grapheme_Cluster_Break = L}];"
+u"$V           = [\\p{Grapheme_Cluster_Break = V}];"
+u"$T           = [\\p{Grapheme_Cluster_Break = T}];"
+u"$LV          = [\\p{Grapheme_Cluster_Break = LV}];"
+u"$LVT         = [\\p{Grapheme_Cluster_Break = LVT}];"
+u"$Extended_Pict = [:ExtPict:];"
+u"!!chain;"
+u"!!lookAheadHardBreak;"
+u"$VoiceMarks;" // extra rule (not in the upstream file) that matches a single voice mark
+u"$CR $LF;"
+u"$L ($L | $V | $LV | $LVT);"
+u"($LV | $V) ($V | $T);"
+u"($LVT | $T) $T;"
+u"[^$Control $CR $LF] ($Extend | $ZWJ);"
+u"[^$Control $CR $LF] $SpacingMark;"
+u"$Prepend [^$Control $CR $LF];"
+u"$Extended_Pict $Extend* $ZWJ $Extended_Pict;"
+u"^$Prepend* $Regional_Indicator $Regional_Indicator / $Regional_Indicator;"
+u"^$Prepend* $Regional_Indicator $Regional_Indicator;"
+u".;";
diff --git a/Externals/crystaledit/editlib/icu.hpp b/Externals/crystaledit/editlib/icu.hpp
index e5aabe1bb..7fe520201 100644
--- a/Externals/crystaledit/editlib/icu.hpp
+++ b/Externals/crystaledit/editlib/icu.hpp
@@ -20,6 +20,18 @@ typedef int32_t UChar32;
 typedef char16_t UChar;
 typedef struct UBreakIterator UBreakIterator;
 typedef enum UErrorCode { U_ZERO_ERROR = 0 } UErrorCode;
+enum { U_PARSE_CONTEXT_LEN = 16 }; // element count of each context buffer in UParseError
+typedef struct UParseError { // local declaration standing in for ICU's UParseError; passed to ubrk_openRules as parseErr
+	int32_t line; // NOTE(review): presumably the line of the reported rule error, as in ICU's parseerr.h -- confirm
+	int32_t offset; // NOTE(review): presumably the character offset of the error -- confirm against parseerr.h
+	UChar   preContext[U_PARSE_CONTEXT_LEN]; // NOTE(review): presumably rule text preceding the error -- confirm
+	UChar   postContext[U_PARSE_CONTEXT_LEN]; // NOTE(review): presumably rule text following the error -- confirm
+} UParseError; // field order and types must stay in step with the ICU library that fills this in
+
+class ICUBreakIterator; // forward declaration so the variable template below can name the type
+
+template<int N> // N picks one of several independent iterator slots
+extern thread_local std::unique_ptr<ICUBreakIterator> m_pCharaterBreakIterator; // per-thread iterator owned by slot N; used by ICUBreakIterator::getCharacterBreakIterator<N> (NOTE: "Charater" is misspelled in the identifier)
 
 typedef UBreakIterator* (*ubrk_open_type)(UBreakIteratorType type, const char* locale, const UChar* text, int32_t textLength, UErrorCode* status);
 ICU_EXTERN UBreakIterator* (*g_pubrk_open)(UBreakIteratorType type, const char* locale, const UChar* text, int32_t textLength, UErrorCode* status);
@@ -28,6 +40,13 @@ inline UBreakIterator* ubrk_open(UBreakIteratorType type, const char* locale, co
 	return g_pubrk_open(type, locale, text, textLength, status);
 }
 
+typedef UBreakIterator* (*ubrk_openRules_type)(const UChar *rules, int32_t rulesLength, const UChar *text, int32_t textLength, UParseError *parseErr, UErrorCode *status); // pointer type used when casting the GetProcAddress result
+ICU_EXTERN UBreakIterator* (*g_pubrk_openRules)(const UChar *rules, int32_t rulesLength, const UChar *text, int32_t textLength, UParseError *parseErr, UErrorCode *status); // assigned from GetProcAddress(m_hLibrary, "ubrk_openRules") when the library is loaded
+inline UBreakIterator* ubrk_openRules(const UChar *rules, int32_t rulesLength, const UChar *text, int32_t textLength, UParseError *parseErr, UErrorCode *status) // forwards to the dynamically resolved entry point
+{
+	return g_pubrk_openRules(rules, rulesLength, text, textLength, parseErr, status); // no null check here: the pointer is called as-is
+}
+
 typedef void (*ubrk_setText_type)(UBreakIterator *bi, const UChar* text, int32_t textLength, UErrorCode* status);
 ICU_EXTERN void (*g_pubrk_setText)(UBreakIterator *bi, const UChar* text, int32_t textLength, UErrorCode* status);
 inline void ubrk_setText(UBreakIterator *bi, const UChar* text, int32_t textLength, UErrorCode* status)
@@ -85,6 +104,7 @@ public:
 		if (!m_hLibrary)
 			return;
 		g_pubrk_open = reinterpret_cast<ubrk_open_type>(GetProcAddress(m_hLibrary, "ubrk_open"));
+		g_pubrk_openRules = reinterpret_cast<ubrk_openRules_type>(GetProcAddress(m_hLibrary, "ubrk_openRules"));
 		g_pubrk_setText = reinterpret_cast<ubrk_setText_type>(GetProcAddress(m_hLibrary, "ubrk_setText"));
 		g_pubrk_close = reinterpret_cast<ubrk_close_type>(GetProcAddress(m_hLibrary, "ubrk_close"));
 		g_pubrk_first = reinterpret_cast<ubrk_first_type>(GetProcAddress(m_hLibrary, "ubrk_first"));
@@ -114,7 +134,15 @@ public:
 		if (ICULoader::IsLoaded())
 		{
 			UErrorCode status = U_ZERO_ERROR;
-			m_iter = ubrk_open(type, locale, reinterpret_cast<const UChar *>(text), textLength, &status);
+			if (type == UBRK_CHARACTER)
+			{
+				UParseError parseError;
+				m_iter = ubrk_openRules(kCustomRules, static_cast<int32_t>(wcslen(reinterpret_cast<const wchar_t *>(kCustomRules))), text, textLength, &parseError, &status);
+			}
+			else
+			{
+				m_iter = ubrk_open(type, locale, reinterpret_cast<const UChar *>(text), textLength, &status);
+			}
 			if (m_iter)
 				ubrk_first(m_iter);
 		}
@@ -182,6 +210,21 @@ public:
 		return myfollowing(offset);
 	}
 
+	// Shorthand for slot 1: hands back this thread's character-break iterator positioned on `text`.
+	static ICUBreakIterator *getCharacterBreakIterator(const UChar * text, int32_t textLength)
+	{ return getCharacterBreakIterator<1>(text, textLength); }
+	// Returns this thread's character-break iterator for slot N, creating it on first use and otherwise re-targeting it at `text`.
+	template<int N>
+	static ICUBreakIterator *getCharacterBreakIterator(const UChar * text, int32_t textLength)
+	{
+		auto &cached = m_pCharaterBreakIterator<N>;
+		if (cached)
+			cached->setText(text, textLength);
+		else
+			cached.reset(new ICUBreakIterator(UBRK_CHARACTER, "en", text, textLength));
+		return cached.get();
+	}
+
 private:
 	int mynext()
 	{
@@ -266,11 +309,11 @@ private:
 		}
 		return m_i;
 	}
-
 	UBreakIterator* m_iter;
 	UBreakIteratorType m_type;
 	const UChar *m_text;
 	int m_i;
 	int m_textLength;
+	static const UChar *kCustomRules;
 };
 
diff --git a/Src/stringdiffs.cpp b/Src/stringdiffs.cpp
index 15a971bab..508bdf7b9 100644
--- a/Src/stringdiffs.cpp
+++ b/Src/stringdiffs.cpp
@@ -208,10 +208,6 @@ stringdiffs::stringdiffs(const String & str1, const String & str2,
 , m_breakType(breakType)
 , m_pDiffs(pDiffs)
 , m_matchblock(true) // Change to false to get word to word compare
-, m_iterCharBegin1(UBRK_CHARACTER, "en", nullptr, 0)
-, m_iterCharBegin2(UBRK_CHARACTER, "en", nullptr, 0)
-, m_iterCharEnd1(UBRK_CHARACTER, "en", nullptr, 0)
-, m_iterCharEnd2(UBRK_CHARACTER, "en", nullptr, 0)
 {
 }
 
@@ -369,8 +365,7 @@ stringdiffs::BuildWordsArray(const String & str)
 {
 	std::vector<word> words;
 	int i = 0, begin = 0;
-
-	m_iterCharBegin1.setText(reinterpret_cast<const UChar *>(str.c_str()), static_cast<int32_t>(str.length()));
+	ICUBreakIterator *pIterChar = ICUBreakIterator::getCharacterBreakIterator(reinterpret_cast<const UChar *>(str.c_str()), static_cast<int32_t>(str.length()));
 
 	size_t sLen = str.length();
 	assert(sLen < INT_MAX);
@@ -383,7 +378,7 @@ stringdiffs::BuildWordsArray(const String & str)
 inspace:
 	if (isSafeWhitespace(str[i])) 
 	{
-		i = m_iterCharBegin1.next();
+		i = pIterChar->next();
 		goto inspace;
 	}
 	if (begin < i)
@@ -425,14 +420,14 @@ inword:
 		{
 			// start a new word because we hit a non-whitespace word break (eg, a comma)
 			// but, we have to put each word break character into its own word
-			int inext = m_iterCharBegin1.next();
+			int inext = pIterChar->next();
 			words.push_back(word(i, inext - 1, dlbreak, Hash(str, i, inext - 1, 0)));
 			i = inext;
 			begin = i;
 			goto inword;
 		}
 	}
-	i = m_iterCharBegin1.next();
+	i = pIterChar->next();
 	goto inword; // safe even if we're at the end or no longer in a word
 }
 
@@ -756,10 +751,10 @@ stringdiffs::ComputeByteDiff(const String & str1, const String & str2,
 	const TCHAR *pbeg1 = str1.c_str();
 	const TCHAR *pbeg2 = str2.c_str();
 
-	m_iterCharBegin1.setText(reinterpret_cast<const UChar *>(pbeg1), static_cast<int32_t>(len1));
-	m_iterCharBegin2.setText(reinterpret_cast<const UChar *>(pbeg2), static_cast<int32_t>(len2));
-	m_iterCharEnd1.setText(reinterpret_cast<const UChar *>(pbeg1), static_cast<int32_t>(len1));
-	m_iterCharEnd2.setText(reinterpret_cast<const UChar *>(pbeg2), static_cast<int32_t>(len2));
+	ICUBreakIterator *pIterCharBegin1 = ICUBreakIterator::getCharacterBreakIterator(reinterpret_cast<const UChar *>(pbeg1), static_cast<int32_t>(len1));
+	ICUBreakIterator *pIterCharBegin2 = ICUBreakIterator::getCharacterBreakIterator<2>(reinterpret_cast<const UChar *>(pbeg2), static_cast<int32_t>(len2));
+	ICUBreakIterator *pIterCharEnd1 = ICUBreakIterator::getCharacterBreakIterator<3>(reinterpret_cast<const UChar *>(pbeg1), static_cast<int32_t>(len1));
+	ICUBreakIterator *pIterCharEnd2 = ICUBreakIterator::getCharacterBreakIterator<4>(reinterpret_cast<const UChar *>(pbeg2), static_cast<int32_t>(len2));
 	
 	if (len1 == 0 || len2 == 0)
 	{
@@ -778,8 +773,8 @@ stringdiffs::ComputeByteDiff(const String & str1, const String & str2,
 	const TCHAR *py2 = pbeg2;
 
 	// pen1,pen2 point to the last valid character (broken multibyte lead chars don't count)
-	const TCHAR *pen1 = pbeg1 + (len1 > 0 ? m_iterCharEnd1.preceding(len1) : 0);
-	const TCHAR *pen2 = pbeg2 + (len2 > 0 ? m_iterCharEnd2.preceding(len2) : 0);
+	const TCHAR *pen1 = pbeg1 + (len1 > 0 ? pIterCharEnd1->preceding(len1) : 0);
+	const TCHAR *pen2 = pbeg2 + (len2 > 0 ? pIterCharEnd2->preceding(len2) : 0);
 	size_t glyphlenz1 = pbeg1 + len1 - pen1;
 	size_t glyphlenz2 = pbeg2 + len2 - pen2;
 
@@ -789,9 +784,9 @@ stringdiffs::ComputeByteDiff(const String & str1, const String & str2,
 		// by advancing py1 and py2
 		// and retreating pen1 and pen2
 		while (py1 < pen1 && isSafeWhitespace(*py1))
-			py1 = pbeg1 + m_iterCharBegin1.next();
+			py1 = pbeg1 + pIterCharBegin1->next();
 		while (py2 < pen2 && isSafeWhitespace(*py2))
-			py2 = pbeg2 + m_iterCharBegin2.next();
+			py2 = pbeg2 + pIterCharBegin2->next();
 		if ((pen1 < pbeg1 + len1 - 1 || pen2 < pbeg2 + len2 -1)
 			&& (!len1 || !len2 || pbeg1[len1] != pbeg2[len2]))
 		{
@@ -800,9 +795,9 @@ stringdiffs::ComputeByteDiff(const String & str1, const String & str2,
 		else
 		{
 			while (pen1 > py1 && isSafeWhitespace(*pen1))
-				pen1 = pbeg1 + m_iterCharEnd1.previous();
+				pen1 = pbeg1 + pIterCharEnd1->previous();
 			while (pen2 > py2 && isSafeWhitespace(*pen2))
-				pen2 = pbeg2 + m_iterCharEnd2.previous();
+				pen2 = pbeg2 + pIterCharEnd2->previous();
 		}
 	}
 	//check for excaption of empty string on one side
@@ -864,8 +859,8 @@ stringdiffs::ComputeByteDiff(const String & str1, const String & str2,
 			continue;
 		}
 
-		const TCHAR* py1next = pbeg1 + m_iterCharBegin1.next();
-		const TCHAR* py2next = pbeg2 + m_iterCharBegin2.next();
+		const TCHAR* py1next = pbeg1 + pIterCharBegin1->next();
+		const TCHAR* py2next = pbeg2 + pIterCharBegin2->next();
 		size_t glyphleny1 = py1next - py1;
 		size_t glyphleny2 = py2next - py2;
 		if (glyphleny1 != glyphleny2 || !matchchar(py1, py2, glyphleny1, casitive))
@@ -906,9 +901,9 @@ stringdiffs::ComputeByteDiff(const String & str1, const String & str2,
 				break; // done with reverse search
 			// gobble up all whitespace in current area
 			while (pz1 > py1 && isSafeWhitespace(*pz1))
-				pz1 = pbeg1 + m_iterCharEnd1.previous();
+				pz1 = pbeg1 + pIterCharEnd1->previous();
 			while (pz2 > py2 && isSafeWhitespace(*pz2))
-				pz2 = pbeg2 + m_iterCharEnd2.previous();
+				pz2 = pbeg2 + pIterCharEnd2->previous();
 			continue;
 
 		}
@@ -917,7 +912,7 @@ stringdiffs::ComputeByteDiff(const String & str1, const String & str2,
 			if (xwhite==1)
 				break; // done with reverse search
 			while (pz2 > py2 && isSafeWhitespace(*pz2))
-				pz2 = pbeg2 + m_iterCharEnd2.previous();
+				pz2 = pbeg2 + pIterCharEnd2->previous();
 			continue;
 		}
 
@@ -925,8 +920,8 @@ stringdiffs::ComputeByteDiff(const String & str1, const String & str2,
 			break; // done with forward search
 		const TCHAR* pz1next = pz1;
 		const TCHAR* pz2next = pz2;
-		pz1 = (pz1 > pbeg1) ? pbeg1 + m_iterCharEnd1.preceding(static_cast<int32_t>(pz1 - pbeg1)) : pz1 - 1;
-		pz2 = (pz2 > pbeg2) ? pbeg2 + m_iterCharEnd2.preceding(static_cast<int32_t>(pz2 - pbeg2)) : pz2 - 1;
+		pz1 = (pz1 > pbeg1) ? pbeg1 + pIterCharEnd1->preceding(static_cast<int32_t>(pz1 - pbeg1)) : pz1 - 1;
+		pz2 = (pz2 > pbeg2) ? pbeg2 + pIterCharEnd2->preceding(static_cast<int32_t>(pz2 - pbeg2)) : pz2 - 1;
 		glyphlenz1 = pz1next - pz1;
 		glyphlenz2 = pz2next - pz2;
 		// Now do real character match
@@ -954,8 +949,8 @@ stringdiffs::ComputeByteDiff(const String & str1, const String & str2,
 	}*/
 
 	// Store results of advance into return variables (end[0] & end[1])
-	end[0] = static_cast<int>(pz1 - pbeg1) + glyphlenz1 - 1;
-	end[1] = static_cast<int>(pz2 - pbeg2) + glyphlenz2 - 1;
+	end[0] = static_cast<int>(pz1 - pbeg1 + glyphlenz1 - 1);
+	end[1] = static_cast<int>(pz2 - pbeg2 + glyphlenz2 - 1);
 
 	// Check if difference region was empty
 	if (begin[0] == end[0] + 1 && begin[1] == end[1] + 1)
diff --git a/Src/stringdiffsi.h b/Src/stringdiffsi.h
index 7f5080630..ff555167b 100644
--- a/Src/stringdiffsi.h
+++ b/Src/stringdiffsi.h
@@ -118,10 +118,6 @@ private:
 	std::vector<word> m_words1;
 	std::vector<word> m_words2;
 	std::vector<wdiff> m_wdiffs;
-	ICUBreakIterator m_iterCharBegin1;
-	ICUBreakIterator m_iterCharBegin2;
-	ICUBreakIterator m_iterCharEnd1;
-	ICUBreakIterator m_iterCharEnd2;
 };
 
 }
-- 
2.11.0