9 from xml.etree import ElementTree
11 from fontTools import ttLib
def lang_to_script(lang_code):
    """Return the script for a hyphen-separated language code.

    The code is lower-cased, then subtags are stripped from the right until
    the remaining prefix is a key of LANG_TO_SCRIPT. If a stripped subtag is
    exactly four letters long it is taken to be the script itself and is
    returned in title case.

    Raises AssertionError when the code runs out of subtags without a match.
    """
    lang = lang_code.lower()
    while lang not in LANG_TO_SCRIPT:
        hyphen_idx = lang.rfind('-')
        # Report the code as the caller passed it; `lang` has already been
        # lower-cased and possibly truncated by earlier iterations.
        assert hyphen_idx != -1, (
            'We do not know what script the "%s" language is written in.'
            % lang_code)
        assumed_script = lang[hyphen_idx+1:]
        if len(assumed_script) == 4 and assumed_script.isalpha():
            # This is actually the script
            return assumed_script.title()
        lang = lang[:hyphen_idx]
    return LANG_TO_SCRIPT[lang]
# Fragment of a formatter that calls printable() recursively: a set is
# rendered as '{...}' and a tuple as '<...>', with members joined by ', '.
# NOTE(review): the def line and the body of the final else branch are not
# part of this listing (the embedded numbering skips 64 and 70) - confirm
# against the full file before editing.
65 if type(inp) is set: # set of character sequences
66 return '{' + ', '.join([printable(seq) for seq in inp]) + '}'
67 if type(inp) is tuple: # character sequence
68 return '<' + (', '.join([printable(ch) for ch in inp])) + '>'
69 else: # single character
# Fragment that opens a font with fontTools: `font` is unpacked into a file
# name and an index, the file name is joined onto _fonts_dir, and a
# ttLib.TTFont is returned, with fontNumber=index on one of the two paths.
# NOTE(review): the def line and the condition that selects between the two
# returns are not visible (the embedded numbering skips 73, 76 and 78).
74 font_file, index = font
75 font_path = path.join(_fonts_dir, font_file)
77 return ttLib.TTFont(font_path, fontNumber=index)
79 return ttLib.TTFont(font_path)
def get_best_cmap(font):
    """Return the `cmap` mapping of the preferred Unicode cmap subtable.

    `font` is passed unchanged to open_font(). A subtable whose
    (format, platformID, platEncID) is (12, 3, 10) is preferred; otherwise
    the (4, 3, 1) subtable is used.

    Raises AssertionError if either kind of subtable occurs more than once,
    or if the font has neither.
    """
    ttfont = open_font(font)
    all_unicode_cmap = None
    bmp_cmap = None
    for cmap in ttfont['cmap'].tables:
        specifier = (cmap.format, cmap.platformID, cmap.platEncID)
        if specifier == (4, 3, 1):
            assert bmp_cmap is None, 'More than one BMP cmap in %s' % (font, )
            bmp_cmap = cmap
        elif specifier == (12, 3, 10):
            assert all_unicode_cmap is None, (
                'More than one UCS-4 cmap in %s' % (font, ))
            all_unicode_cmap = cmap

    best_cmap = all_unicode_cmap if all_unicode_cmap else bmp_cmap
    # Fail with a readable message instead of an AttributeError on None.
    assert best_cmap is not None, 'No Unicode cmap found in %s' % (font, )
    return best_cmap.cmap
def get_variation_sequences_cmap(font):
    """Return the cmap subtable that holds variation sequences.

    `font` is passed unchanged to open_font(). The subtable is the one whose
    (format, platformID, platEncID) is (14, 0, 5). Returns None when the
    font has no such subtable.

    Raises AssertionError if more than one matching subtable is present.
    """
    ttfont = open_font(font)
    vs_cmap = None
    for cmap in ttfont['cmap'].tables:
        specifier = (cmap.format, cmap.platformID, cmap.platEncID)
        if specifier == (14, 0, 5):
            assert vs_cmap is None, 'More than one VS cmap in %s' % (font, )
            vs_cmap = cmap
    return vs_cmap
# Builds a dictionary for `font` whose keys are code points or tuples of code
# points and whose values are glyphs. It starts from a copy of
# get_best_cmap(font), adds (base, vs) keys taken from the variation-sequence
# cmap's uvsDict, and adds one tuple key per GSUB ligature after translating
# the ligature's glyphs back to code points through reverse_cmap.
# NOTE(review): the embedded numbering skips 117, 119, 121, 123-124, 136 and
# 141-142, so the loop that binds `vs`, the condition choosing between the two
# (base, vs) assignments, and the final return are not visible here.
110 def get_emoji_map(font):
111 # Add normal characters
112 emoji_map = copy.copy(get_best_cmap(font))
# Inverse of the cmap; if several code points share a glyph, only one of them
# survives in this dictionary.
113 reverse_cmap = {glyph: code for code, glyph in emoji_map.items()}
115 # Add variation sequences
116 vs_dict = get_variation_sequences_cmap(font).uvsDict
118 for base, glyph in vs_dict[vs]:
120 emoji_map[(base, vs)] = emoji_map[base]
122 emoji_map[(base, vs)] = glyph
125 ttfont = open_font(font)
126 for lookup in ttfont['GSUB'].table.LookupList.Lookup:
127 assert lookup.LookupType == 4, 'We only understand type 4 lookups'
128 for subtable in lookup.SubTable:
129 ligatures = subtable.ligatures
130 for first_glyph in ligatures:
131 for ligature in ligatures[first_glyph]:
# The full glyph sequence is the first glyph followed by ligature.Component.
132 sequence = [first_glyph] + ligature.Component
133 sequence = [reverse_cmap[glyph] for glyph in sequence]
134 sequence = tuple(sequence)
135 # Make sure no starting subsequence of 'sequence' has been
137 for sub_len in range(2, len(sequence)+1):
138 subsequence = sequence[:sub_len]
139 assert subsequence not in emoji_map
140 emoji_map[sequence] = ligature.LigGlyph
def assert_font_supports_any_of_chars(font, chars):
    """Exit the program unless `font` maps at least one of `chars`.

    Membership is tested against get_best_cmap(font). When no character is
    found, sys.exit() is called with a message naming the characters and the
    font.
    """
    best_cmap = get_best_cmap(font)
    if any(char in best_cmap for char in chars):
        return
    sys.exit('None of characters in %s were found in %s' % (chars, font))
def assert_font_supports_all_of_chars(font, chars):
    """Assert that every character in `chars` is in get_best_cmap(font).

    Raises AssertionError naming the first character that is not found.
    """
    best_cmap = get_best_cmap(font)
    for char in chars:
        assert char in best_cmap, (
            'U+%04X was not found in %s' % (char, font))
def assert_font_supports_none_of_chars(font, chars):
    """Assert that no character in `chars` is in get_best_cmap(font).

    Raises AssertionError naming the first character that is found.
    """
    best_cmap = get_best_cmap(font)
    for char in chars:
        assert char not in best_cmap, (
            'U+%04X was found in %s' % (char, font))
def assert_font_supports_all_sequences(font, sequences):
    """Assert that each (base, vs) pair is listed in the font's uvsDict.

    A pair passes when `vs` is a key of the variation-sequence cmap's
    uvsDict and (base, None) is among the entries stored under that key.
    Pairs are checked in sorted order, so the first failure reported is the
    smallest missing pair.
    """
    entries_by_selector = get_variation_sequences_cmap(font).uvsDict
    for base, vs in sorted(sequences):
        is_listed = (
            vs in entries_by_selector
            and (base, None) in entries_by_selector[vs])
        assert is_listed, (
            '<U+%04X, U+%04X> was not found in %s' % (base, vs, font))
def check_hyphens(hyphens_dir):
    """Check that fonts for every hyphenated script contain a hyphen.

    Each '*.hyb' file in `hyphens_dir` must be named 'hyph-<lang>.<ext>';
    the language code between the first '-' and the first '.' is turned into
    a script with lang_to_script(). Every font registered for such a script
    in _script_to_font_map must then support U+002D or U+2010.

    Raises AssertionError for an unexpected file name or a script without
    fonts; a font lacking both hyphens ends the program through
    assert_font_supports_any_of_chars().
    """
    # Find all the scripts that need automatic hyphenation
    scripts = set()
    for hyb_file in glob.iglob(path.join(hyphens_dir, '*.hyb')):
        hyb_file = path.basename(hyb_file)
        assert hyb_file.startswith('hyph-'), (
            'Unknown hyphenation file %s' % hyb_file)
        lang_code = hyb_file[hyb_file.index('-')+1:hyb_file.index('.')]
        scripts.add(lang_to_script(lang_code))

    HYPHENS = {0x002D, 0x2010}
    for script in scripts:
        fonts = _script_to_font_map[script]
        assert fonts, 'No fonts found for the "%s" script' % script
        for font in fonts:
            assert_font_supports_any_of_chars(font, HYPHENS)
class FontRecord(object):
    """Plain record of the values describing one font in the fallback chain.

    Every constructor argument is stored unchanged under an attribute of the
    same name; other code in this file reads `scripts` and `font`.
    """

    def __init__(self, name, scripts, variant, weight, style, font):
        self.name = name
        self.scripts = scripts
        self.variant = variant
        self.weight = weight
        self.style = style
        self.font = font
# Parses the fonts XML file at fonts_xml_path and fills two module globals:
# _script_to_font_map (script -> set of (font_file, index) pairs) and
# _fallback_chain (FontRecord objects are appended to it).
# For each <family> element it reads the 'name', 'variant' and 'lang'
# attributes, validates them with assertions, and then handles child elements,
# which must be <font> tags carrying 'weight', 'style' and optional 'index'.
# NOTE(review): the embedded numbering skips many lines (205, 211, 216,
# 218-220, 223-226, 233, 237, 239-249, 252), so the initialisation of
# _fallback_chain, the conditions guarding the assertions, the loop over the
# children, the handling of `index` and the FontRecord arguments are not
# visible in this listing.
202 def parse_fonts_xml(fonts_xml_path):
203 global _script_to_font_map, _fallback_chain
204 _script_to_font_map = collections.defaultdict(set)
206 tree = ElementTree.parse(fonts_xml_path)
207 for family in tree.findall('family'):
208 name = family.get('name')
209 variant = family.get('variant')
210 langs = family.get('lang')
212 assert variant is None, (
213 'No variant expected for LGC font %s.' % name)
214 assert langs is None, (
215 'No language expected for LGC fonts %s.' % name)
217 assert variant in {None, 'elegant', 'compact'}, (
218 'Unexpected value for variant: %s' % variant)
# The 'lang' attribute is a whitespace-separated list of language codes.
221 langs = langs.split()
222 scripts = {lang_to_script(lang) for lang in langs}
227 assert child.tag == 'font', (
228 'Unknown tag <%s>' % child.tag)
229 font_file = child.text
230 weight = int(child.get('weight'))
231 assert weight % 100 == 0, (
232 'Font weight "%d" is not a multiple of 100.' % weight)
234 style = child.get('style')
235 assert style in {'normal', 'italic'}, (
236 'Unknown style "%s"' % style)
238 index = child.get('index')
242 _fallback_chain.append(FontRecord(
250 if name: # non-empty names are used for default LGC fonts
251 map_scripts = {'Latn', 'Grek', 'Cyrl'}
253 map_scripts = scripts
254 for script in map_scripts:
255 _script_to_font_map[script].add((font_file, index))
def check_emoji_coverage(all_emoji, equivalent_emoji):
    """Run check_emoji_font_coverage() on the font from get_emoji_font()."""
    check_emoji_font_coverage(get_emoji_font(), all_emoji, equivalent_emoji)
def get_emoji_font():
    """Return the `font` of the only fallback-chain record with script 'Zsye'.

    Raises AssertionError unless exactly one record in _fallback_chain lists
    'Zsye' among its scripts.
    """
    emoji_fonts = [
        record.font for record in _fallback_chain
        if 'Zsye' in record.scripts]
    assert len(emoji_fonts) == 1, 'There are %d emoji fonts.' % len(emoji_fonts)
    return emoji_fonts[0]
# Checks the emoji font's coverage (from get_emoji_map) against expectations:
#   1. every sequence in all_emoji is covered;
#   2. every covered sequence is in all_emoji, with 0x0000, 0x000D and 0x0020
#      singled out by the membership test on original line 278;
#   3. each key/value pair of equivalent_emoji maps to the same glyph;
#   4. when several sequences share a glyph, following equivalent_emoji from
#      each of them must end at one common sequence.
# NOTE(review): the embedded numbering skips 276, 280, 283, 287-289, 297 and
# 304, so the statement executed for the three extra characters, the initial
# value of `equivalent_seq` and the tails of two assertion messages are not
# visible here.
271 def check_emoji_font_coverage(emoji_font, all_emoji, equivalent_emoji):
272 coverage = get_emoji_map(emoji_font)
273 for sequence in all_emoji:
274 assert sequence in coverage, (
275 '%s is not supported in the emoji font.' % printable(sequence))
277 for sequence in coverage:
278 if sequence in {0x0000, 0x000D, 0x0020}:
279 # The font needs to support a few extra characters, which is OK
281 assert sequence in all_emoji, (
282 'Emoji font should not support %s.' % printable(sequence))
284 for first, second in sorted(equivalent_emoji.items()):
285 assert coverage[first] == coverage[second], (
286 '%s and %s should map to the same glyph.' % (
290 for glyph in set(coverage.values()):
# Scans the whole coverage map once per distinct glyph.
291 maps_to_glyph = [seq for seq in coverage if coverage[seq] == glyph]
292 if len(maps_to_glyph) > 1:
293 # There are more than one sequences mapping to the same glyph. We
294 # need to make sure they were expected to be equivalent.
295 equivalent_seqs = set()
296 for seq in maps_to_glyph:
# Follow the chain of equivalences until a sequence with no further entry.
298 while equivalent_seq in equivalent_emoji:
299 equivalent_seq = equivalent_emoji[equivalent_seq]
300 equivalent_seqs.add(equivalent_seq)
301 assert len(equivalent_seqs) == 1, (
302 'The sequences %s should not result in the same glyph %s' % (
303 printable(equivalent_seqs),
# Walks _fallback_chain and checks how default_emoji characters are covered.
# Records are inspected up to and around the emoji font (script 'Zsye'):
# checked fonts must contain none of the default_emoji characters, and the
# characters in _emoji_properties['Emoji'] that are not default emoji are
# expected to appear in some font ahead of the emoji font. Characters in
# _chars_by_age['7.0'] are exempted before the final assertion.
# NOTE(review): the embedded numbering skips 314, 319-320, 323, 325, 328 and
# 330, so the statements executed inside the two `if` branches (presumably
# `continue`) and parts of two comments are not visible - confirm.
307 def check_emoji_defaults(default_emoji):
308 missing_text_chars = _emoji_properties['Emoji'] - default_emoji
309 emoji_font_seen = False
310 for record in _fallback_chain:
311 if 'Zsye' in record.scripts:
312 emoji_font_seen = True
313 # No need to check the emoji font
315 # For later fonts, we only check them if they have a script
316 # defined, since the defined script may get them to a higher
317 # score even if they appear after the emoji font.
318 if emoji_font_seen and not record.scripts:
321 # Check default emoji-style characters
322 assert_font_supports_none_of_chars(record.font, sorted(default_emoji))
324 # Mark default text-style characters appearing in fonts above the emoji
326 if not emoji_font_seen:
327 missing_text_chars -= set(get_best_cmap(record.font))
329 # Noto does not have monochrome glyphs for Unicode 7.0 wingdings and
331 missing_text_chars -= _chars_by_age['7.0']
332 assert missing_text_chars == set(), (
333 'Text style version of some emoji characters are missing: ' +
334 repr(missing_text_chars))
337 # Setting reverse to true returns a dictionary that maps the values to sets of
338 # characters, useful for some binary properties. Otherwise, we get a
339 # dictionary that maps characters to the property values, assuming there's only
340 # one property in the file.
# Each data line is cut at the first '#', split on ';', and its first field is
# parsed as a space-separated sequence, a 'start..end' range, or a single
# hexadecimal code point.
# NOTE(review): the embedded numbering skips 342, 344-345, 348, 350-353,
# 356-357, 368, 370 and 374, so the branch on `reverse`, the non-reverse
# initialisation of output_dict, the handling of blank lines and of `prop`, and
# the return statement are not visible here.
# NOTE(review): `xrange` exists only in Python 2.
341 def parse_unicode_datafile(file_path, reverse=False):
342 output_dict = collections.defaultdict(set)
346 with open(file_path) as datafile:
347 for line in datafile:
349 line = line[:line.index('#')]
354 chars, prop = line.split(';')[:2]
355 chars = chars.strip()
358 if ' ' in chars: # character sequence
359 sequence = [int(ch, 16) for ch in chars.split(' ')]
360 additions = [tuple(sequence)]
361 elif '..' in chars: # character range
362 char_start, char_end = chars.split('..')
363 char_start = int(char_start, 16)
364 char_end = int(char_end, 16)
365 additions = xrange(char_start, char_end+1)
366 else: # single character
367 additions = [int(chars, 16)]
369 output_dict[prop].update(additions)
371 for addition in additions:
372 assert addition not in output_dict
373 output_dict[addition] = prop
# Reads a ';'-separated data file of variation sequences and returns two sets
# of (base, vs) pairs: those described as 'text style' and those described as
# 'emoji style'. Lines with any other description add nothing.
# NOTE(review): the embedded numbering skips 378-379, 382 and 384-386, so the
# creation of text_set and emoji_set and the handling of comment-only or blank
# lines are not visible here.
377 def parse_standardized_variants(file_path):
380 with open(file_path) as datafile:
381 for line in datafile:
383 line = line[:line.index('#')]
# Exactly three ';'-separated fields are expected; the third is ignored.
387 sequence, description, _ = line.split(';')
388 sequence = sequence.strip().split(' ')
389 base = int(sequence[0], 16)
390 vs = int(sequence[1], 16)
391 description = description.strip()
392 if description == 'text style':
393 text_set.add((base, vs))
394 elif description == 'emoji style':
395 emoji_set.add((base, vs))
396 return text_set, emoji_set
def parse_ucd(ucd_path):
    """Load the Unicode data files under `ucd_path` into module globals.

    Files are read in this order: emoji-data.txt and DerivedAge.txt (both
    with reverse=True), StandardizedVariants.txt, emoji-sequences.txt and
    emoji-zwj-sequences.txt.
    """
    global _emoji_properties, _chars_by_age
    global _text_variation_sequences, _emoji_variation_sequences
    global _emoji_sequences, _emoji_zwj_sequences

    def data_file(file_name):
        return path.join(ucd_path, file_name)

    _emoji_properties = parse_unicode_datafile(
        data_file('emoji-data.txt'), reverse=True)
    _chars_by_age = parse_unicode_datafile(
        data_file('DerivedAge.txt'), reverse=True)

    text_style, emoji_style = parse_standardized_variants(
        data_file('StandardizedVariants.txt'))
    _text_variation_sequences = text_style
    _emoji_variation_sequences = emoji_style

    _emoji_sequences = parse_unicode_datafile(
        data_file('emoji-sequences.txt'))
    _emoji_zwj_sequences = parse_unicode_datafile(
        data_file('emoji-zwj-sequences.txt'))
def flag_sequence(territory_code):
    """Return the tuple of code points that spells a territory's flag.

    Each letter is mapped to the code point at the same offset from U+1F1E6
    as the letter is from 'A'. `territory_code` may be a string or any
    iterable of single letters. Letters are upper-cased first, so 'us' gives
    the same result as 'US' instead of code points outside the intended
    block.
    """
    first_indicator = 0x1F1E6
    return tuple(
        first_indicator + ord(ch.upper()) - ord('A')
        for ch in territory_code)
# Flag sequences, built with flag_sequence(), for a fixed list of territory
# codes. compute_expected_emoji() folds this set into its `tofu_flags`.
# NOTE(review): the closing of this frozenset and the opening line of the
# mapping that follows are not visible (the embedded numbering skips 427-430).
420 UNSUPPORTED_FLAGS = frozenset({
421 flag_sequence('BL'), flag_sequence('BQ'), flag_sequence('DG'),
422 flag_sequence('EA'), flag_sequence('EH'), flag_sequence('FK'),
423 flag_sequence('GF'), flag_sequence('GP'), flag_sequence('GS'),
424 flag_sequence('MF'), flag_sequence('MQ'), flag_sequence('NC'),
425 flag_sequence('PM'), flag_sequence('RE'), flag_sequence('TF'),
426 flag_sequence('UN'), flag_sequence('WF'), flag_sequence('XK'),
# Entries mapping one territory's flag sequence to another's.
# NOTE(review): presumably the body of EQUIVALENT_FLAGS, which
# compute_expected_emoji() merges into equivalent_emoji - confirm, since the
# assignment line is not part of this listing.
431 flag_sequence('BV'): flag_sequence('NO'),
432 flag_sequence('CP'): flag_sequence('FR'),
433 flag_sequence('HM'): flag_sequence('AU'),
434 flag_sequence('SJ'): flag_sequence('NO'),
435 flag_sequence('UM'): flag_sequence('US'),
# U+20E3; used in this file as the second element of the keycap sequences in
# LEGACY_ANDROID_EMOJI.
438 COMBINING_KEYCAP = 0x20E3
440 # Characters that Android defaults to emoji style, different from the recommendations in UTR #51
# NOTE(review): the embedded numbering skips 443, 452 and 454, so two entries
# and the closing of the frozenset are not visible in this listing.
441 ANDROID_DEFAULT_EMOJI = frozenset({
442 0x2600, # BLACK SUN WITH RAYS
444 0x260E, # BLACK TELEPHONE
445 0x261D, # WHITE UP POINTING INDEX
446 0x263A, # WHITE SMILING FACE
447 0x2660, # BLACK SPADE SUIT
448 0x2663, # BLACK CLUB SUIT
449 0x2665, # BLACK HEART SUIT
450 0x2666, # BLACK DIAMOND SUIT
451 0x270C, # VICTORY HAND
453 0x2764, # HEAVY BLACK HEART
# Maps single code points in the U+FE4E5..U+FE837 range to the flag or keycap
# sequence they stand for. compute_expected_emoji() merges this mapping into
# equivalent_emoji and adds its keys to the expected emoji sets.
# NOTE(review): the closing brace is not visible (the embedded numbering skips
# 478).
456 LEGACY_ANDROID_EMOJI = {
457 0xFE4E5: flag_sequence('JP'),
458 0xFE4E6: flag_sequence('US'),
459 0xFE4E7: flag_sequence('FR'),
460 0xFE4E8: flag_sequence('DE'),
461 0xFE4E9: flag_sequence('IT'),
462 0xFE4EA: flag_sequence('GB'),
463 0xFE4EB: flag_sequence('ES'),
464 0xFE4EC: flag_sequence('RU'),
465 0xFE4ED: flag_sequence('CN'),
466 0xFE4EE: flag_sequence('KR'),
467 0xFE82C: (ord('#'), COMBINING_KEYCAP),
468 0xFE82E: (ord('1'), COMBINING_KEYCAP),
469 0xFE82F: (ord('2'), COMBINING_KEYCAP),
470 0xFE830: (ord('3'), COMBINING_KEYCAP),
471 0xFE831: (ord('4'), COMBINING_KEYCAP),
472 0xFE832: (ord('5'), COMBINING_KEYCAP),
473 0xFE833: (ord('6'), COMBINING_KEYCAP),
474 0xFE834: (ord('7'), COMBINING_KEYCAP),
475 0xFE835: (ord('8'), COMBINING_KEYCAP),
476 0xFE836: (ord('9'), COMBINING_KEYCAP),
477 0xFE837: (ord('0'), COMBINING_KEYCAP),
# Entries mapping a sequence joined with U+200D to a single code point.
# NOTE(review): presumably the body of ZWJ_IDENTICALS, which
# compute_expected_emoji() merges into equivalent_emoji - confirm, since the
# assignment line and the per-entry comments (480-481, 483, 485, 487) are not
# part of this listing.
482 (0x1F469, 0x200D, 0x2764, 0x200D, 0x1F48B, 0x200D, 0x1F468): 0x1F48F,
484 (0x1F469, 0x200D, 0x2764, 0x200D, 0x1F468): 0x1F491,
486 (0x1F468, 0x200D, 0x1F469, 0x200D, 0x1F466): 0x1F46A,
def is_fitzpatrick_modifier(cp):
    """Return True if `cp` lies in the range U+1F3FB..U+1F3FF, inclusive."""
    return 0x1F3FB <= cp <= 0x1F3FF


def reverse_emoji(seq):
    """Return `seq` reversed, as a tuple, with modifiers kept after their base.

    A plain reversal leaves each fitzpatrick modifier in front of the code
    point it followed in `seq`; every such pair is swapped back.
    """
    rev = list(reversed(seq))
    # if there are fitzpatrick modifiers in the sequence, keep them after
    # the emoji they modify
    i = 0
    while i + 1 < len(rev):
        if is_fitzpatrick_modifier(rev[i]):
            rev[i], rev[i+1] = rev[i+1], rev[i]
            # Step over the pair just fixed; re-examining the moved modifier
            # would push it one position further on every iteration.
            i += 2
        else:
            i += 1
    return tuple(rev)
# Computes what the emoji font is expected to contain and returns the triple
# (all_emoji, default_emoji, equivalent_emoji). Inputs are the module globals
# filled by parse_ucd() plus the constants defined in this file. The function
# collects variation sequences, emoji sequences and ZWJ sequences (with EMOJI_VS
# removed), adds a reversed copy of every ZWJ sequence, adds a flag sequence for
# every two-letter code, and records in equivalent_emoji which sequences should
# share a glyph.
# NOTE(review): the embedded numbering skips 509, 525, 528-533, 543, 548, 558,
# 566-567, 569-570, 572, 575, 577 and 587, so the contents of
# modified_extensions and the full expressions that build all_emoji and
# default_emoji are not visible in this listing.
504 def compute_expected_emoji():
505 equivalent_emoji = {}
506 sequence_pieces = set()
507 all_sequences = set()
508 all_sequences.update(_emoji_variation_sequences)
510 # add zwj sequences not in the current emoji-zwj-sequences.txt
511 adjusted_emoji_zwj_sequences = dict(_emoji_zwj_sequences)
# NOTE(review): this update re-adds the entries the dict() call above already
# copied, so it appears to have no effect - confirm before removing.
512 adjusted_emoji_zwj_sequences.update(_emoji_zwj_sequences)
513 # single parent families
514 additional_emoji_zwj = (
515 (0x1F468, 0x200D, 0x1F466),
516 (0x1F468, 0x200D, 0x1F467),
517 (0x1F468, 0x200D, 0x1F466, 0x200D, 0x1F466),
518 (0x1F468, 0x200D, 0x1F467, 0x200D, 0x1F466),
519 (0x1F468, 0x200D, 0x1F467, 0x200D, 0x1F467),
520 (0x1F469, 0x200D, 0x1F466),
521 (0x1F469, 0x200D, 0x1F467),
522 (0x1F469, 0x200D, 0x1F466, 0x200D, 0x1F466),
523 (0x1F469, 0x200D, 0x1F467, 0x200D, 0x1F466),
524 (0x1F469, 0x200D, 0x1F467, 0x200D, 0x1F467),
526 # sequences formed from man and woman and optional fitzpatrick modifier
527 modified_extensions = (
534 for seq in additional_emoji_zwj:
535 adjusted_emoji_zwj_sequences[seq] = 'Emoji_ZWJ_Sequence'
536 for ext in modified_extensions:
537 for base in (0x1F468, 0x1F469):
538 seq = (base, 0x200D, ext)
539 adjusted_emoji_zwj_sequences[seq] = 'Emoji_ZWJ_Sequence'
# range(0x1F3FB, 0x1F400) covers the same code points that
# is_fitzpatrick_modifier() accepts.
540 for modifier in range(0x1F3FB, 0x1F400):
541 seq = (base, modifier, 0x200D, ext)
542 adjusted_emoji_zwj_sequences[seq] = 'Emoji_ZWJ_Sequence'
544 for sequence in _emoji_sequences.keys():
545 sequence = tuple(ch for ch in sequence if ch != EMOJI_VS)
546 all_sequences.add(sequence)
547 sequence_pieces.update(sequence)
549 for sequence in adjusted_emoji_zwj_sequences.keys():
550 sequence = tuple(ch for ch in sequence if ch != EMOJI_VS)
551 all_sequences.add(sequence)
552 sequence_pieces.update(sequence)
553 # Add reverse of all emoji ZWJ sequences, which are added to the fonts
554 # as a workaround to get the sequences work in RTL text.
555 reversed_seq = reverse_emoji(sequence)
556 all_sequences.add(reversed_seq)
557 equivalent_emoji[reversed_seq] = sequence
559 # Add all two-letter flag sequences, as even the unsupported ones should
560 # resolve to a flag tofu.
561 all_letters = [chr(code) for code in range(ord('A'), ord('Z')+1)]
# itertools.product yields tuples of letters, which flag_sequence() iterates.
562 all_two_letter_codes = itertools.product(all_letters, repeat=2)
563 all_flags = {flag_sequence(code) for code in all_two_letter_codes}
564 all_sequences.update(all_flags)
565 tofu_flags = UNSUPPORTED_FLAGS | (all_flags - set(_emoji_sequences.keys()))
568 _emoji_properties['Emoji'] |
571 set(LEGACY_ANDROID_EMOJI.keys()))
573 _emoji_properties['Emoji_Presentation'] |
574 ANDROID_DEFAULT_EMOJI |
576 set(LEGACY_ANDROID_EMOJI.keys()))
# Every tofu flag except the smallest is declared equivalent to the smallest.
578 first_tofu_flag = sorted(tofu_flags)[0]
579 for flag in tofu_flags:
580 if flag != first_tofu_flag:
581 equivalent_emoji[flag] = first_tofu_flag
582 equivalent_emoji.update(EQUIVALENT_FLAGS)
583 equivalent_emoji.update(LEGACY_ANDROID_EMOJI)
584 equivalent_emoji.update(ZWJ_IDENTICALS)
585 for seq in _emoji_variation_sequences:
586 equivalent_emoji[seq] = seq[0]
588 return all_emoji, default_emoji, equivalent_emoji
# Script entry sequence. Command-line arguments as read here:
#   sys.argv[1]  output directory; fonts are under 'fonts', the font XML at
#                'etc/fonts.xml', hyphenation data under 'usr/hyphen-data'
#   sys.argv[2]  the emoji checks run only when this is the string 'true'
#   sys.argv[3]  directory of Unicode data files, read only for emoji checks
# NOTE(review): the enclosing def line, the `global` statement presumably
# needed for _fonts_dir, the call that loads the Unicode data (original line
# 605) and the body of the __main__ guard are not part of this listing.
593 target_out = sys.argv[1]
594 _fonts_dir = path.join(target_out, 'fonts')
596 fonts_xml_path = path.join(target_out, 'etc', 'fonts.xml')
597 parse_fonts_xml(fonts_xml_path)
599 hyphens_dir = path.join(target_out, 'usr', 'hyphen-data')
600 check_hyphens(hyphens_dir)
602 check_emoji = sys.argv[2]
603 if check_emoji == 'true':
604 ucd_path = sys.argv[3]
606 all_emoji, default_emoji, equivalent_emoji = compute_expected_emoji()
607 check_emoji_coverage(all_emoji, equivalent_emoji)
608 check_emoji_defaults(default_emoji)
611 if __name__ == '__main__':