3 # This program parses the UnicodeData file and generates the
4 # corresponding tclUniData.c file with compressed character
5 # data tables. The input to this program should be the latest
6 # UnicodeData file from:
7 # ftp://ftp.unicode.org/Public/UNIDATA/UnicodeData.txt
9 # Copyright (c) 1998-1999 Scriptics Corporation.
10 # All rights reserved.
14 set shift 5; # number of bits of data within a page
15 # This value can be adjusted to find the
16 # best split to minimize table size
18 variable pMap; # map from page to page index, each entry is
19 # an index into the pages table, indexed by
21 variable pages; # map from page index to page info, each
22 # entry is a list of indices into the groups
23 # table, the list is indexed by the offset
24 variable groups; # list of character info values, indexed by
25 # group number, initialized with the
26 # unassigned character group
29 Cn Lu Ll Lt Lm Lo Mn Me Mc Nd Nl No Zs Zl Zp
30 Cc Cf Co Cs Pc Pd Ps Pe Pi Pf Po Sm Sc Sk So
31 }; # Ordered list of character categories, must
32 # match the enumeration in the header file.
35 proc uni::getValue {items index} {
38 # Extract character info
40 set category [lindex $items 2]
41 if {[scan [lindex $items 12] %x toupper] == 1} {
42 set toupper [expr {$index - $toupper}]
46 if {[scan [lindex $items 13] %x tolower] == 1} {
47 set tolower [expr {$tolower - $index}]
51 if {[scan [lindex $items 14] %x totitle] == 1} {
52 set totitle [expr {$index - $totitle}]
59 set categoryIndex [lsearch -exact $categories $category]
60 if {$categoryIndex < 0} {
61 error "Unexpected character category: $index($category)"
64 return [list $categoryIndex $toupper $tolower $totitle]
67 proc uni::getGroup {value} {
70 set gIndex [lsearch -exact $groups $value]
72 set gIndex [llength $groups]
78 proc uni::addPage {info} {
83 set pIndex [lsearch -exact $pages $info]
85 set pIndex [llength $pages]
88 lappend pMap [expr {$pIndex << $shift}]
92 proc uni::buildTables {data} {
97 variable groups {{0 0 0 0}}
99 set info {} ;# temporary page info
101 set mask [expr {(1 << $shift) - 1}]
103 foreach line [split $data \n] {
105 if {!($next & $mask)} {
106 # next character is already on page boundary
109 # fill remaining page
110 set line [format %X [expr {($next-1)|$mask}]]
111 append line ";;Cn;0;ON;;;;;N;;;;;\n"
114 set items [split $line \;]
116 scan [lindex $items 0] %x index
117 if {$index > 0x3FFFF} then {
118 # Ignore characters > plane 3
121 set index [format %d $index]
123 set gIndex [getGroup [getValue $items $index]]
125 # Since the input table omits unassigned characters, these will
126 # show up as gaps in the index sequence. There are a few special cases
127 # where the gaps correspond to a uniform block of assigned characters.
128 # These are indicated as such in the character name.
130 # Enter all unassigned characters up to the current character.
131 if {($index > $next) \
132 && ![regexp "Last>$" [lindex $items 1]]} {
133 for {} {$next < $index} {incr next} {
135 if {($next & $mask) == $mask} {
142 # Enter all assigned characters up to the current character
143 for {set i $next} {$i <= $index} {incr i} {
144 # Add the group index to the info for the current page
147 # If this is the last entry in the page, add the page
148 if {($i & $mask) == $mask} {
153 set next [expr {$index + 1}]
159 global argc argv0 argv
167 puts stderr "\nusage: $argv0 <datafile> <outdir>\n"
170 set f [open [lindex $argv 0] r]
175 puts "X = [llength $pMap] Y= [llength $pages] A= [llength $groups]"
176 set size [expr {[llength $pMap]*2 + ([llength $pages]<<$shift)}]
177 puts "shift = $shift, space = $size"
179 set f [open [file join [lindex $argv 1] tclUniData.c] w]
180 fconfigure $f -translation lf -encoding utf-8
184 * Declarations of Unicode character information tables. This file is
185 * automatically generated by the tools/uniParse.tcl script. Do not
186 * modify this file by hand.
188 * Copyright (c) 1998 Scriptics Corporation.
189 * All rights reserved.
193 * A 16-bit Unicode character is split into two parts in order to index
194 * into the following tables. The lower OFFSET_BITS comprise an offset
195 * into a page of characters. The upper bits comprise the page number.
198 #define OFFSET_BITS $shift
201 * The pageMap is indexed by page number and returns an alternate page number
202 * that identifies a unique page of characters. Many Unicode characters map
203 * to the same alternate page number.
206 static const unsigned short pageMap\[\] = {"
208 set last [expr {[llength $pMap] - 1}]
209 for {set i 0} {$i <= $last} {incr i} {
210 if {$i == [expr {0x10000 >> $shift}]} {
211 set line [string trimright $line " \t,"]
213 set lastpage [expr {[lindex $line end] >> $shift}]
214 puts stdout "lastpage: $lastpage"
215 puts $f "#if TCL_UTF_MAX > 3 || TCL_MAJOR_VERSION > 8 || TCL_MINOR_VERSION > 6"
218 append line [lindex $pMap $i]
222 if {[string length $line] > 70} {
223 puts $f [string trimright $line]
228 puts $f "#endif /* TCL_UTF_MAX > 3 */"
232 * The groupMap is indexed by combining the alternate page number with
233 * the page offset and returns a group number that identifies a unique
234 * set of character attributes.
237 static const unsigned char groupMap\[\] = {"
239 set lasti [expr {[llength $pages] - 1}]
240 for {set i 0} {$i <= $lasti} {incr i} {
241 set page [lindex $pages $i]
242 set lastj [expr {[llength $page] - 1}]
243 if {$i == ($lastpage + 1)} {
244 puts $f [string trimright $line " \t,"]
245 puts $f "#if TCL_UTF_MAX > 3 || TCL_MAJOR_VERSION > 8 || TCL_MINOR_VERSION > 6"
248 for {set j 0} {$j <= $lastj} {incr j} {
249 append line [lindex $page $j]
250 if {$j != $lastj || $i != $lasti} {
253 if {[string length $line] > 70} {
254 puts $f [string trimright $line]
260 puts $f "#endif /* TCL_UTF_MAX > 3 */"
264 * Each group represents a unique set of character attributes. The attributes
265 * are encoded into a 32-bit value as follows:
267 * Bits 0-4 Character category: see the constants listed below.
269 * Bits 5-7 Case delta type: 000 = identity
270 * 010 = add delta for lower
271 * 011 = add delta for lower, add 1 for title
272 * 100 = subtract delta for title/upper
273 * 101 = sub delta for upper, sub 1 for title
274 * 110 = sub delta for upper, add delta for lower
275 * 111 = subtract delta for upper
277 * Bits 8-31 Case delta: delta for case conversions. This should be the
278 * highest field so we can easily sign extend.
281 static const int groups\[\] = {"
283 set last [expr {[llength $groups] - 1}]
284 for {set i 0} {$i <= $last} {incr i} {
285 foreach {type toupper tolower totitle} [lindex $groups $i] {}
287 # Compute the case conversion type and delta
290 if {$totitle == $toupper} {
291 # subtract delta for title or upper
295 error "New case conversion type needed: $toupper $tolower $totitle"
297 } elseif {$toupper} {
298 # subtract delta for upper, subtract 1 for title
301 if {($totitle != 1) || $tolower} {
302 error "New case conversion type needed: $toupper $tolower $totitle"
305 # add delta for lower, add 1 for title
308 if {$totitle != -1} {
309 error "New case conversion type needed: $toupper $tolower $totitle"
312 } elseif {$toupper} {
314 if {$tolower == $toupper} {
315 # subtract delta for upper, add delta for lower
317 } elseif {!$tolower} {
318 # subtract delta for upper
321 error "New case conversion type needed: $toupper $tolower $totitle"
323 } elseif {$tolower} {
324 # add delta for lower
333 append line [expr {($delta << 8) | ($case << 5) | $type}]
337 if {[string length $line] > 65} {
338 puts $f [string trimright $line]
343 puts -nonewline $f "};
345 #if TCL_UTF_MAX > 3 || TCL_MAJOR_VERSION > 8 || TCL_MINOR_VERSION > 6
346 # define UNICODE_OUT_OF_RANGE(ch) (((ch) & 0x1FFFFF) >= [format 0x%X $next])
348 # define UNICODE_OUT_OF_RANGE(ch) (((ch) & 0x1F0000) != 0)
352 * The following constants are used to determine the category of a
365 COMBINING_SPACING_MARK,
366 DECIMAL_DIGIT_NUMBER,
376 CONNECTOR_PUNCTUATION,
380 INITIAL_QUOTE_PUNCTUATION,
381 FINAL_QUOTE_PUNCTUATION,
390 * The following macros extract the fields of the character info. The
391 * GetDelta() macro is complicated because we can't rely on the C compiler
392 * to do sign extension on right shifts.
395 #define GetCaseType(info) (((info) & 0xE0) >> 5)
396 #define GetCategory(ch) (GetUniCharInfo(ch) & 0x1F)
397 #define GetDelta(info) ((info) >> 8)
400 * This macro extracts the information about a character from the
401 * Unicode character tables.
404 #if TCL_UTF_MAX > 3 || TCL_MAJOR_VERSION > 8 || TCL_MINOR_VERSION > 6
405 # define GetUniCharInfo(ch) (groups\[groupMap\[pageMap\[((ch) & 0x1FFFFF) >> OFFSET_BITS\] | ((ch) & ((1 << OFFSET_BITS)-1))\]\])
407 # define GetUniCharInfo(ch) (groups\[groupMap\[pageMap\[((ch) & 0xFFFF) >> OFFSET_BITS\] | ((ch) & ((1 << OFFSET_BITS)-1))\]\])