tools/aapt/pseudolocalize.cpp

   1 #include "pseudolocalize.h"
   2
   3 using namespace std;
   4
   5 // String basis to generate expansion
   6 static const String16 k_expansion_string = String16("one two three "
   7     "four five six seven eight nine ten eleven twelve thirteen "
   8     "fourteen fiveteen sixteen seventeen nineteen twenty");
   9
  10 // Special unicode characters to override directionality of the words
  11 static const String16 k_rlm = String16("\xe2\x80\x8f");
  12 static const String16 k_rlo = String16("\xE2\x80\xae");
  13 static const String16 k_pdf = String16("\xE2\x80\xac");
  14
  15 // Placeholder marks
  16 static const String16 k_placeholder_open = String16("\xc2\xbb");
  17 static const String16 k_placeholder_close = String16("\xc2\xab");
  18
  19 static const char16_t k_arg_start = '{';
  20 static const char16_t k_arg_end = '}';
  21
  22 Pseudolocalizer::Pseudolocalizer(PseudolocalizationMethod m)
  23     : mImpl(nullptr), mLastDepth(0) {
  24   setMethod(m);
  25 }
  26
  27 void Pseudolocalizer::setMethod(PseudolocalizationMethod m) {
  28   if (mImpl) {
  29     delete mImpl;
  30   }
  31   if (m == PSEUDO_ACCENTED) {
  32     mImpl = new PseudoMethodAccent();
  33   } else if (m == PSEUDO_BIDI) {
  34     mImpl = new PseudoMethodBidi();
  35   } else {
  36     mImpl = new PseudoMethodNone();
  37   }
  38 }
  39
  40 String16 Pseudolocalizer::text(const String16& text) {
  41   String16 out;
  42   size_t depth = mLastDepth;
  43   size_t lastpos, pos;
  44   const size_t length= text.size();
  45   const char16_t* str = text.string();
  46   bool escaped = false;
  47   for (lastpos = pos = 0; pos < length; pos++) {
  48     char16_t c = str[pos];
  49     if (escaped) {
  50       escaped = false;
  51       continue;
  52     }
  53     if (c == '\'') {
  54       escaped = true;
  55       continue;
  56     }
  57
  58     if (c == k_arg_start) {
  59       depth++;
  60     } else if (c == k_arg_end && depth) {
  61       depth--;
  62     }
  63
  64     if (mLastDepth != depth || pos == length - 1) {
  65       bool pseudo = ((mLastDepth % 2) == 0);
  66       size_t nextpos = pos;
  67       if (!pseudo || depth == mLastDepth) {
  68         nextpos++;
  69       }
  70       size_t size = nextpos - lastpos;
  71       if (size) {
  72         String16 chunk = String16(text, size, lastpos);
  73         if (pseudo) {
  74           chunk = mImpl->text(chunk);
  75         } else if (str[lastpos] == k_arg_start &&
  76                    str[nextpos - 1] == k_arg_end) {
  77           chunk = mImpl->placeholder(chunk);
  78         }
  79         out.append(chunk);
  80       }
  81       if (pseudo && depth < mLastDepth) { // End of message
  82         out.append(mImpl->end());
  83       } else if (!pseudo && depth > mLastDepth) { // Start of message
  84         out.append(mImpl->start());
  85       }
  86       lastpos = nextpos;
  87       mLastDepth = depth;
  88     }
  89   }
  90   return out;
  91 }
  92
  93 static const char*
  94 pseudolocalize_char(const char16_t c)
  95 {
  96     switch (c) {
  97         case 'a':   return "\xc3\xa5";
  98         case 'b':   return "\xc9\x93";
  99         case 'c':   return "\xc3\xa7";
 100         case 'd':   return "\xc3\xb0";
 101         case 'e':   return "\xc3\xa9";
 102         case 'f':   return "\xc6\x92";
 103         case 'g':   return "\xc4\x9d";
 104         case 'h':   return "\xc4\xa5";
 105         case 'i':   return "\xc3\xae";
 106         case 'j':   return "\xc4\xb5";
 107         case 'k':   return "\xc4\xb7";
 108         case 'l':   return "\xc4\xbc";
 109         case 'm':   return "\xe1\xb8\xbf";
 110         case 'n':   return "\xc3\xb1";
 111         case 'o':   return "\xc3\xb6";
 112         case 'p':   return "\xc3\xbe";
 113         case 'q':   return "\x51";
 114         case 'r':   return "\xc5\x95";
 115         case 's':   return "\xc5\xa1";
 116         case 't':   return "\xc5\xa3";
 117         case 'u':   return "\xc3\xbb";
 118         case 'v':   return "\x56";
 119         case 'w':   return "\xc5\xb5";
 120         case 'x':   return "\xd1\x85";
 121         case 'y':   return "\xc3\xbd";
 122         case 'z':   return "\xc5\xbe";
 123         case 'A':   return "\xc3\x85";
 124         case 'B':   return "\xce\xb2";
 125         case 'C':   return "\xc3\x87";
 126         case 'D':   return "\xc3\x90";
 127         case 'E':   return "\xc3\x89";
 128         case 'G':   return "\xc4\x9c";
 129         case 'H':   return "\xc4\xa4";
 130         case 'I':   return "\xc3\x8e";
 131         case 'J':   return "\xc4\xb4";
 132         case 'K':   return "\xc4\xb6";
 133         case 'L':   return "\xc4\xbb";
 134         case 'M':   return "\xe1\xb8\xbe";
 135         case 'N':   return "\xc3\x91";
 136         case 'O':   return "\xc3\x96";
 137         case 'P':   return "\xc3\x9e";
 138         case 'Q':   return "\x71";
 139         case 'R':   return "\xc5\x94";
 140         case 'S':   return "\xc5\xa0";
 141         case 'T':   return "\xc5\xa2";
 142         case 'U':   return "\xc3\x9b";
 143         case 'V':   return "\xce\xbd";
 144         case 'W':   return "\xc5\xb4";
 145         case 'X':   return "\xc3\x97";
 146         case 'Y':   return "\xc3\x9d";
 147         case 'Z':   return "\xc5\xbd";
 148         case '!':   return "\xc2\xa1";
 149         case '?':   return "\xc2\xbf";
 150         case '$':   return "\xe2\x82\xac";
 151         default:    return NULL;
 152     }
 153 }
 154
 155 static bool is_possible_normal_placeholder_end(const char16_t c) {
 156     switch (c) {
 157         case 's': return true;
 158         case 'S': return true;
 159         case 'c': return true;
 160         case 'C': return true;
 161         case 'd': return true;
 162         case 'o': return true;
 163         case 'x': return true;
 164         case 'X': return true;
 165         case 'f': return true;
 166         case 'e': return true;
 167         case 'E': return true;
 168         case 'g': return true;
 169         case 'G': return true;
 170         case 'a': return true;
 171         case 'A': return true;
 172         case 'b': return true;
 173         case 'B': return true;
 174         case 'h': return true;
 175         case 'H': return true;
 176         case '%': return true;
 177         case 'n': return true;
 178         default:  return false;
 179     }
 180 }
 181
 182 static String16 pseudo_generate_expansion(const unsigned int length) {
 183     String16 result = k_expansion_string;
 184     const char16_t* s = result.string();
 185     if (result.size() < length) {
 186         result += String16(" ");
 187         result += pseudo_generate_expansion(length - result.size());
 188     } else {
 189         int ext = 0;
 190         // Should contain only whole words, so looking for a space
 191         for (unsigned int i = length + 1; i < result.size(); ++i) {
 192           ++ext;
 193           if (s[i] == ' ') {
 194             break;
 195           }
 196         }
 197         result.remove(length + ext, 0);
 198     }
 199     return result;
 200 }
 201
 202 static bool is_space(const char16_t c) {
 203   return (c == ' ' || c == '\t' || c == '\n');
 204 }
 205
 206 String16 PseudoMethodAccent::start() {
 207   String16 result;
 208   if (mDepth == 0) {
 209     result = String16(String8("["));
 210   }
 211   mWordCount = mLength = 0;
 212   mDepth++;
 213   return result;
 214 }
 215
 216 String16 PseudoMethodAccent::end() {
 217   String16 result;
 218   if (mLength) {
 219     result.append(String16(String8(" ")));
 220     result.append(pseudo_generate_expansion(
 221         mWordCount > 3 ? mLength : mLength / 2));
 222   }
 223   mWordCount = mLength = 0;
 224   mDepth--;
 225   if (mDepth == 0) {
 226     result.append(String16(String8("]")));
 227   }
 228   return result;
 229 }
 230
 231 /**
 232  * Converts characters so they look like they've been localized.
 233  *
 234  * Note: This leaves escape sequences untouched so they can later be
 235  * processed by ResTable::collectString in the normal way.
 236  */
 237 String16 PseudoMethodAccent::text(const String16& source)
 238 {
 239     const char16_t* s = source.string();
 240     String16 result;
 241     const size_t I = source.size();
 242     bool lastspace = true;
 243     for (size_t i=0; i<I; i++) {
 244         char16_t c = s[i];
 245         if (c == '\\') {
 246             // Escape syntax, no need to pseudolocalize
 247             if (i<I-1) {
 248                 result += String16("\\");
 249                 i++;
 250                 c = s[i];
 251                 switch (c) {
 252                     case 'u':
 253                         // this one takes up 5 chars
 254                         result += String16(s+i, 5);
 255                         i += 4;
 256                         break;
 257                     case 't':
 258                     case 'n':
 259                     case '#':
 260                     case '@':
 261                     case '?':
 262                     case '"':
 263                     case '\'':
 264                     case '\\':
 265                     default:
 266                         result.append(&c, 1);
 267                         break;
 268                 }
 269             } else {
 270                 result.append(&c, 1);
 271             }
 272         } else if (c == '%') {
 273             // Placeholder syntax, no need to pseudolocalize
 274             String16 chunk;
 275             bool end = false;
 276             chunk.append(&c, 1);
 277             while (!end && i < I) {
 278                 ++i;
 279                 c = s[i];
 280                 chunk.append(&c, 1);
 281                 if (is_possible_normal_placeholder_end(c)) {
 282                     end = true;
 283                 } else if (c == 't') {
 284                     ++i;
 285                     c = s[i];
 286                     chunk.append(&c, 1);
 287                     end = true;
 288                 }
 289             }
 290             // Treat chunk as a placeholder unless it ends with %.
 291             result += ((c == '%') ? chunk : placeholder(chunk));
 292         } else if (c == '<' || c == '&') {
 293             // html syntax, no need to pseudolocalize
 294             bool tag_closed = false;
 295             while (!tag_closed && i < I) {
 296                 if (c == '&') {
 297                     String16 escape_text;
 298                     escape_text.append(&c, 1);
 299                     bool end = false;
 300                     size_t htmlCodePos = i;
 301                     while (!end && htmlCodePos < I) {
 302                         ++htmlCodePos;
 303                         c = s[htmlCodePos];
 304                         escape_text.append(&c, 1);
 305                         // Valid html code
 306                         if (c == ';') {
 307                             end = true;
 308                             i = htmlCodePos;
 309                         }
 310                         // Wrong html code
 311                         else if (!((c == '#' ||
 312                                  (c >= 'a' && c <= 'z') ||
 313                                  (c >= 'A' && c <= 'Z') ||
 314                                  (c >= '0' && c <= '9')))) {
 315                             end = true;
 316                         }
 317                     }
 318                     result += escape_text;
 319                     if (escape_text != String16("&lt;")) {
 320                         tag_closed = true;
 321                     }
 322                     continue;
 323                 }
 324                 if (c == '>') {
 325                     tag_closed = true;
 326                     result.append(&c, 1);
 327                     continue;
 328                 }
 329                 result.append(&c, 1);
 330                 i++;
 331                 c = s[i];
 332             }
 333         } else {
 334             // This is a pure text that should be pseudolocalized
 335             const char* p = pseudolocalize_char(c);
 336             if (p != NULL) {
 337                 result += String16(p);
 338             } else {
 339                 bool space = is_space(c);
 340                 if (lastspace && !space) {
 341                   mWordCount++;
 342                 }
 343                 lastspace = space;
 344                 result.append(&c, 1);
 345             }
 346             // Count only pseudolocalizable chars and delimiters
 347             mLength++;
 348         }
 349     }
 350     return result;
 351 }
 352 String16 PseudoMethodAccent::placeholder(const String16& source) {
 353   // Surround a placeholder with brackets
 354   return k_placeholder_open + source + k_placeholder_close;
 355 }
 356
 357 String16 PseudoMethodBidi::text(const String16& source)
 358 {
 359     const char16_t* s = source.string();
 360     String16 result;
 361     bool lastspace = true;
 362     bool space = true;
 363     for (size_t i=0; i<source.size(); i++) {
 364         char16_t c = s[i];
 365         space = is_space(c);
 366         if (lastspace && !space) {
 367           // Word start
 368           result += k_rlm + k_rlo;
 369         } else if (!lastspace && space) {
 370           // Word end
 371           result += k_pdf + k_rlm;
 372         }
 373         lastspace = space;
 374         result.append(&c, 1);
 375     }
 376     if (!lastspace) {
 377       // End of last word
 378       result += k_pdf + k_rlm;
 379     }
 380     return result;
 381 }
 382
 383 String16 PseudoMethodBidi::placeholder(const String16& source) {
 384   // Surround a placeholder with directionality change sequence
 385   return k_rlm + k_rlo + source + k_pdf + k_rlm;
 386 }
 387