From 28c5b4d50889a52fa769652704c899b5e4e570b9 Mon Sep 17 00:00:00 2001
From: Roozbeh Pournader <roozbeh@google.com>
Date: Tue, 24 Oct 2017 16:40:21 -0700
Subject: [PATCH] Fallback hyphenation for minority Indic languages

Minority Indic languages now fall back to available patterns for an
existing major language written in the same script, since the Indic
patterns are script-based anyway.

Change-Id: Ie04b97904e2b7d1b4c1fcd2f3cfc41f76ed8c7d9
Fixes: 67751731
Test: mmm -j frameworks/base/core/jni
---
 core/jni/android_text_Hyphenator.cpp | 113 +++++++++++++++++++----------------
 1 file changed, 62 insertions(+), 51 deletions(-)

diff --git a/core/jni/android_text_Hyphenator.cpp b/core/jni/android_text_Hyphenator.cpp
index 05bec28a5d39..6f9cc22fb3ab 100644
--- a/core/jni/android_text_Hyphenator.cpp
+++ b/core/jni/android_text_Hyphenator.cpp
@@ -82,45 +82,45 @@ static void init() {
     constexpr int INDIC_MIN_PREFIX = 2;
     constexpr int INDIC_MIN_SUFFIX = 2;
 
-    addHyphenator("as", INDIC_MIN_PREFIX, INDIC_MIN_SUFFIX); // Assamese
-    addHyphenator("be", 2, 2); // Belarusian
-    addHyphenator("bg", 2, 2); // Bulgarian
-    addHyphenator("bn", INDIC_MIN_PREFIX, INDIC_MIN_SUFFIX); // Bengali
-    addHyphenator("cu", 1, 2); // Church Slavonic
-    addHyphenator("cy", 2, 3); // Welsh
-    addHyphenator("da", 2, 2); // Danish
-    addHyphenator("de-1901", 2, 2); // German 1901 orthography
-    addHyphenator("de-1996", 2, 2); // German 1996 orthography
-    addHyphenator("de-CH-1901", 2, 2); // Swiss High German 1901 orthography
-    addHyphenator("en-GB", 2, 3); // British English
-    addHyphenator("en-US", 2, 3); // American English
-    addHyphenator("es", 2, 2); // Spanish
-    addHyphenator("et", 2, 3); // Estonian
-    addHyphenator("eu", 2, 2); // Basque
-    addHyphenator("fr", 2, 3); // French
-    addHyphenator("ga", 2, 3); // Irish
-    addHyphenator("gu", INDIC_MIN_PREFIX, INDIC_MIN_SUFFIX); // Gujarati
-    addHyphenator("hi", INDIC_MIN_PREFIX, INDIC_MIN_SUFFIX); // Hindi
-    addHyphenator("hr", 2, 2); // Croatian
-    addHyphenator("hu", 2, 2); // Hungarian
+    addHyphenator("as", INDIC_MIN_PREFIX, INDIC_MIN_SUFFIX);  // Assamese
+    addHyphenator("be", 2, 2);  // Belarusian
+    addHyphenator("bg", 2, 2);  // Bulgarian
+    addHyphenator("bn", INDIC_MIN_PREFIX, INDIC_MIN_SUFFIX);  // Bengali
+    addHyphenator("cu", 1, 2);  // Church Slavonic
+    addHyphenator("cy", 2, 3);  // Welsh
+    addHyphenator("da", 2, 2);  // Danish
+    addHyphenator("de-1901", 2, 2);  // German 1901 orthography
+    addHyphenator("de-1996", 2, 2);  // German 1996 orthography
+    addHyphenator("de-CH-1901", 2, 2);  // Swiss High German 1901 orthography
+    addHyphenator("en-GB", 2, 3);  // British English
+    addHyphenator("en-US", 2, 3);  // American English
+    addHyphenator("es", 2, 2);  // Spanish
+    addHyphenator("et", 2, 3);  // Estonian
+    addHyphenator("eu", 2, 2);  // Basque
+    addHyphenator("fr", 2, 3);  // French
+    addHyphenator("ga", 2, 3);  // Irish
+    addHyphenator("gu", INDIC_MIN_PREFIX, INDIC_MIN_SUFFIX);  // Gujarati
+    addHyphenator("hi", INDIC_MIN_PREFIX, INDIC_MIN_SUFFIX);  // Hindi
+    addHyphenator("hr", 2, 2);  // Croatian
+    addHyphenator("hu", 2, 2);  // Hungarian
     // texhyphen sources say Armenian may be (1, 2); but that it needs confirmation.
     // Going with a more conservative value of (2, 2) for now.
-    addHyphenator("hy", 2, 2); // Armenian
-    addHyphenator("kn", INDIC_MIN_PREFIX, INDIC_MIN_SUFFIX); // Kannada
-    addHyphenator("la", 2, 2); // Latin
-    addHyphenator("ml", INDIC_MIN_PREFIX, INDIC_MIN_SUFFIX); // Malayalam
-    addHyphenator("mn-Cyrl", 2, 2); // Mongolian in Cyrillic script
-    addHyphenator("mr", INDIC_MIN_PREFIX, INDIC_MIN_SUFFIX); // Marathi
-    addHyphenator("nb", 2, 2); // Norwegian Bokmål
-    addHyphenator("nn", 2, 2); // Norwegian Nynorsk
-    addHyphenator("or", INDIC_MIN_PREFIX, INDIC_MIN_SUFFIX); // Oriya
-    addHyphenator("pa", INDIC_MIN_PREFIX, INDIC_MIN_SUFFIX); // Punjabi
-    addHyphenator("pt", 2, 3); // Portuguese
-    addHyphenator("sl", 2, 2); // Slovenian
-    addHyphenator("ta", INDIC_MIN_PREFIX, INDIC_MIN_SUFFIX); // Tamil
-    addHyphenator("te", INDIC_MIN_PREFIX, INDIC_MIN_SUFFIX); // Telugu
-    addHyphenator("tk", 2, 2); // Turkmen
-    addHyphenator("und-Ethi", 1, 1); // Any language in Ethiopic script
+    addHyphenator("hy", 2, 2);  // Armenian
+    addHyphenator("kn", INDIC_MIN_PREFIX, INDIC_MIN_SUFFIX);  // Kannada
+    addHyphenator("la", 2, 2);  // Latin
+    addHyphenator("ml", INDIC_MIN_PREFIX, INDIC_MIN_SUFFIX);  // Malayalam
+    addHyphenator("mn-Cyrl", 2, 2);  // Mongolian in Cyrillic script
+    addHyphenator("mr", INDIC_MIN_PREFIX, INDIC_MIN_SUFFIX);  // Marathi
+    addHyphenator("nb", 2, 2);  // Norwegian Bokmål
+    addHyphenator("nn", 2, 2);  // Norwegian Nynorsk
+    addHyphenator("or", INDIC_MIN_PREFIX, INDIC_MIN_SUFFIX);  // Oriya
+    addHyphenator("pa", INDIC_MIN_PREFIX, INDIC_MIN_SUFFIX);  // Punjabi
+    addHyphenator("pt", 2, 3);  // Portuguese
+    addHyphenator("sl", 2, 2);  // Slovenian
+    addHyphenator("ta", INDIC_MIN_PREFIX, INDIC_MIN_SUFFIX);  // Tamil
+    addHyphenator("te", INDIC_MIN_PREFIX, INDIC_MIN_SUFFIX);  // Telugu
+    addHyphenator("tk", 2, 2);  // Turkmen
+    addHyphenator("und-Ethi", 1, 1);  // Any language in Ethiopic script
 
     // Following two hyphenators do not have pattern files but there is some special logic based on
     // language.
@@ -130,13 +130,13 @@ static void init() {
     // English locales that fall back to en-US. The data is from CLDR. It's all English locales,
     // minus the locales whose parent is en-001 (from supplementalData.xml, under <parentLocales>).
     // TODO: Figure out how to get this from ICU.
-    addHyphenatorAlias("en-AS", "en-US"); // English (American Samoa)
-    addHyphenatorAlias("en-GU", "en-US"); // English (Guam)
-    addHyphenatorAlias("en-MH", "en-US"); // English (Marshall Islands)
-    addHyphenatorAlias("en-MP", "en-US"); // English (Northern Mariana Islands)
-    addHyphenatorAlias("en-PR", "en-US"); // English (Puerto Rico)
-    addHyphenatorAlias("en-UM", "en-US"); // English (United States Minor Outlying Islands)
-    addHyphenatorAlias("en-VI", "en-US"); // English (Virgin Islands)
+    addHyphenatorAlias("en-AS", "en-US");  // English (American Samoa)
+    addHyphenatorAlias("en-GU", "en-US");  // English (Guam)
+    addHyphenatorAlias("en-MH", "en-US");  // English (Marshall Islands)
+    addHyphenatorAlias("en-MP", "en-US");  // English (Northern Mariana Islands)
+    addHyphenatorAlias("en-PR", "en-US");  // English (Puerto Rico)
+    addHyphenatorAlias("en-UM", "en-US");  // English (United States Minor Outlying Islands)
+    addHyphenatorAlias("en-VI", "en-US");  // English (Virgin Islands)
 
     // All English locales other than those falling back to en-US are mapped to en-GB.
     addHyphenatorAlias("en", "en-GB");
@@ -150,17 +150,28 @@ static void init() {
     addHyphenatorAlias("no", "nb");
 
     // Use mn-Cyrl. According to CLDR's likelySubtags.xml, mn is most likely to be mn-Cyrl.
-    addHyphenatorAlias("mn", "mn-Cyrl"); // Mongolian
+    addHyphenatorAlias("mn", "mn-Cyrl");  // Mongolian
 
     // Fall back to Ethiopic script for languages likely to be written in Ethiopic.
     // Data is from CLDR's likelySubtags.xml.
     // TODO: Convert this to a mechanism using ICU4J's ULocale#addLikelySubtags().
-    addHyphenatorAlias("am", "und-Ethi"); // Amharic
-    addHyphenatorAlias("byn", "und-Ethi"); // Blin
-    addHyphenatorAlias("gez", "und-Ethi"); // Geʻez
-    addHyphenatorAlias("ti", "und-Ethi"); // Tigrinya
-    addHyphenatorAlias("wal", "und-Ethi"); // Wolaytta
-
+    addHyphenatorAlias("am", "und-Ethi");  // Amharic
+    addHyphenatorAlias("byn", "und-Ethi");  // Blin
+    addHyphenatorAlias("gez", "und-Ethi");  // Geʻez
+    addHyphenatorAlias("ti", "und-Ethi");  // Tigrinya
+    addHyphenatorAlias("wal", "und-Ethi");  // Wolaytta
+
+    // Use Hindi as a fallback hyphenator for all languages written in Devanagari, etc. This makes
+    // sense because our Indic patterns are not really linguistic, but script-based.
+    addHyphenatorAlias("und-Beng", "bn");  // Bengali
+    addHyphenatorAlias("und-Deva", "hi");  // Devanagari -> Hindi
+    addHyphenatorAlias("und-Gujr", "gu");  // Gujarati
+    addHyphenatorAlias("und-Guru", "pa");  // Gurmukhi -> Punjabi
+    addHyphenatorAlias("und-Knda", "kn");  // Kannada
+    addHyphenatorAlias("und-Mlym", "ml");  // Malayalam
+    addHyphenatorAlias("und-Orya", "or");  // Oriya
+    addHyphenatorAlias("und-Taml", "ta");  // Tamil
+    addHyphenatorAlias("und-Telu", "te");  // Telugu
 }
 
 static const JNINativeMethod gMethods[] = {
-- 
2.11.0