# Element count passed to words_code.insert_at_last when a lookup node entry
# is emitted (see generate_lookup_node).
5 NUM_ELEM_BYTELOOKUP = 2
# Octal escapes ("\NNN") for the control bytes 0x00-0x1f; `||=` leaves any
# entry that is already present in C_ESC untouched.
13 0x00.upto(0x1f) {|ch| C_ESC[[ch].pack("C")] ||= "\\%03o" % ch }
# Bytes 0x7f-0xff are unconditionally assigned an octal escape (plain `=`).
14 0x7f.upto(0xff) {|ch| C_ESC[[ch].pack("C")] = "\\%03o" % ch }
# A single regexp that matches any key of C_ESC.
15 C_ESC_PAT = Regexp.union(*C_ESC.keys)
# Wrap str in double quotes, replacing every C_ESC_PAT match by its escape.
18 '"' + str.gsub(C_ESC_PAT) { C_ESC[$&] } + '"'
# Parse a pattern string made of whitespace-separated sequences. A sequence is
# built from two-hex-digit bytes ("a1") and brace sets ("{00-7f,a1}") whose
# members are single bytes or "lo-hi" byte ranges.
# Raises ArgumentError when the pattern fails the validation regexp below.
22 def self.parse(pattern)
23 if /\A\s*(([0-9a-f][0-9a-f]|\{([0-9a-f][0-9a-f]|[0-9a-f][0-9a-f]-[0-9a-f][0-9a-f])(,([0-9a-f][0-9a-f]|[0-9a-f][0-9a-f]-[0-9a-f][0-9a-f]))*\})+(\s+|\z))*\z/i !~ pattern
24 raise ArgumentError, "invalid pattern: #{pattern.inspect}"
# Handle each whitespace-delimited sequence in turn.
27 pattern.scan(/\S+/) {|seq|
# seq starts with a plain hex byte: stored as a one-element set holding a
# single-value range.
30 if /\A([0-9a-f][0-9a-f])/i =~ seq
32 seq_result << [byte..byte]
# seq starts with a {...} group: its content is split on commas.
34 elsif /\A\{([^\}]+)\}/ =~ seq
38 set.scan(/[^,]+/) {|range|
# "xx-yy" member: a byte range.
39 if /\A([0-9a-f][0-9a-f])-([0-9a-f][0-9a-f])\z/i =~ range
# "xx" member: a single byte, stored as a degenerate range.
43 elsif /\A([0-9a-f][0-9a-f])\z/i =~ range
45 set_result << (byte..byte)
# NOTE(review): presumably unreachable for input that passed the validation
# regexp above; looks like a defensive check - confirm.
47 raise "invalid range: #{range.inspect}"
50 seq_result << set_result
52 raise "invalid sequence: #{seq.inspect}"
# NOTE(review): the lines below are bodies of several instance methods whose
# `def` lines are not visible in this view; presumably they belong to StrSet.
# Return the cached value when @hash has already been set.
65 return @hash if defined? @hash
# Equality: same class and equal @pat (read from the other object through
# instance_eval, so no public reader is needed).
70 self.class == other.class &&
71 @pat == other.instance_eval { @pat }
# A set holding exactly one single-value range prints as a bare two-digit hex byte.
85 if byteset.length == 1 && byteset[0].begin == byteset[0].end
86 "%02x" % byteset[0].begin
90 if range.begin == range.end
# A proper range prints as "lo-hi" in hex.
93 "%02x-%02x" % [range.begin, range.end]
105 "\#<#{self.class}: #{self.to_s}>"
# Length of the shortest sequence in @pat.
112 @pat.map {|seq| seq.length }.min
# Length of the longest sequence in @pat.
120 @pat.map {|seq| seq.length }.max
134 seq.first.each {|range|
147 seq.first.each {|range|
# Collect, per byte, the remainder of the sequence (everything after its first element).
149 (h[byte] ||= []) << seq[1..-1]
# Yield the bytes in ascending order, each with a StrSet built from its remainders.
153 h.keys.sort.each {|byte|
154 yield byte, StrSet.new(h[byte])
# Constructor taking a type and a name; its body is not visible in this view.
# NOTE(review): presumably ArrayCode#initialize, matching the
# ArrayCode.new(c_type, array_name) calls elsewhere in this file - confirm.
160 def initialize(type, name)
# Takes an element count `num` and a string `str`; the visible code computes
# the new length as the current length plus num.
# NOTE(review): the two `static const ...` lines further down look like
# template text (C array declaration built from @type, @name and @len) that
# belongs to a different method whose header is not visible - confirm.
171 def insert_at_last(num, str)
172 newnum = self.length + num
179 static const #{@type}
180 #{@name}[#{@len}] = {
# NOTE(review): bodies of several methods whose `def` lines are not visible
# in this view; presumably they belong to ActionMap.
# Convert every pattern key into a parsed StrSet, keeping its action.
189 hash.each {|pat, action|
190 h[StrSet.parse(pat)] = action
# Return the cached value when @hash has already been set.
200 return @hash if defined? @hash
# XOR the hashes of each key and value into the accumulator.
203 hash ^= k.hash ^ v.hash
# Equality: same class and equal @map.
209 self.class == other.class &&
210 @map == other.instance_eval { @map }
# Human-readable form listing a " [key]=>action" item per entry.
216 "\#<#{self.class}:" +
217 @map.map {|k, v| " [" + k.to_s + "]=>" + v.inspect }.join('') +
# Largest max_length among the keys of @map.
222 @map.keys.map {|k| k.max_length }.max
# Return the action of the first entry whose key reports emptyable?.
226 @map.each {|ss, action|
227 return action if ss.emptyable?
# Group the entries of @map by first byte and build an ActionMap per byte.
# With valid_encoding given, iteration is driven by
# valid_encoding.each_firstbyte; otherwise by the collected bytes in sorted order.
# NOTE(review): several branch conditions and the yield are not visible here.
232 def each_firstbyte(valid_encoding=nil)
234 @map.each {|ss, action|
236 raise "emptyable pattern"
238 ss.each_firstbyte {|byte, rest|
# Reported when the same (byte, rest) pair would map to two actions.
241 raise "ambiguous %s or %s (%02X/%s)" % [h[byte][rest], action, byte, rest]
243 h[byte][rest] = action
248 valid_encoding.each_firstbyte {|byte, rest|
# ActionMap built from the entries collected for this byte ...
250 am = ActionMap.new(h[byte])
# ... or one mapping the whole remainder to :undef (presumably when nothing
# was collected for the byte - confirm).
253 am = ActionMap.new(rest => :undef)
258 h.keys.sort.each {|byte|
259 am = ActionMap.new(h[byte])
# Format offsets[min..max] as C initializer text: "min, max," on the first
# line, then the offsets in rows of 16, each row handled as two groups of 8.
# Every offset is rendered with "%3d,".
# NOTE(review): `off.to_s` hands a String to %d, which format converts back
# to an integer; the to_s looks redundant - confirm before changing.
268 def format_offsets(min, max, offsets)
269 offsets = offsets[min..max]
270 code = "%d, %d,\n" % [min, max]
271 0.step(offsets.length-1,16) {|i|
273 code << offsets[i,8].map {|off| "%3d," % off.to_s }.join('')
# Second group of 8 only when the row has more than 8 entries.
274 if i+8 < offsets.length
276 code << offsets[i+8,8].map {|off| "%3d," % off.to_s }.join('')
# NOTE(review): fragments of the code that names and emits byte-string
# constants; the enclosing `def` lines are not visible in this view.
288 size = @bytes_code.length
# `bytes` is a hex string; decode it to raw bytes.
289 rawbytes = [bytes].pack("H*")
# Name candidates, tried only while no name has been chosen yet:
#   1. "str1_" + the identifier characters of the raw bytes (if any, and unused),
#   2. "str1_" + the hex string itself (if unused),
#   3. fallback "str1s_<current byte-array length>".
292 if !n && !(suf = rawbytes.gsub(/[^A-Za-z0-9_]/, '')).empty? && !UsedName[nn = "str1_" + suf] then n = nn end
293 if !n && !UsedName[nn = "str1_" + bytes] then n = nn end
294 n ||= "str1s_#{size}"
# A name already recorded in StrMemo for these bytes is picked up here.
302 if n = StrMemo[bytes]
306 size = @bytes_code.length
# Emit a #define for the name followed by a length marker and the bytes as
# 0xNN literals; the element count is 1 (length marker) + len.
308 @bytes_code.insert_at_last(1 + len,
309 "\#define #{n} makeSTR1(#{size})\n" +
310 " makeSTR1LEN(#{len})," + bytes.gsub(/../, ' 0x\&,') + "\n\n")
# Translate an action `info` into the C expression text for its table entry.
# Raises RuntimeError ("unexpected action: ...") for an unrecognized info.
# NOTE(review): the branches before line 331 and several results are not
# visible in this view.
315 def generate_info(info)
# Hex strings are dispatched on their byte length (1, 2, 3 bytes).
331 when /\A([0-9a-f][0-9a-f])\z/i
333 when /\A([0-9a-f][0-9a-f])([0-9a-f][0-9a-f])\z/i
335 when /\A([0-9a-f][0-9a-f])([0-9a-f][0-9a-f])([0-9a-f][0-9a-f])\z/i
336 "o3(0x#$1,0x#$2,0x#$3)"
# A 4-byte string takes the o4(...) form only when its first byte is f0-f7.
337 when /\A(f[0-7])([0-9a-f][0-9a-f])([0-9a-f][0-9a-f])([0-9a-f][0-9a-f])\z/i
338 "o4(0x#$1,0x#$2,0x#$3,0x#$4)"
# Any other hex string of 4 to 259 bytes.
339 when /\A([0-9a-f][0-9a-f]){4,259}\z/i
341 when /\A\/\*BYTE_LOOKUP\*\// # pointer to BYTE_LOOKUP structure
344 raise "unexpected action: #{info.inspect}"
# Render a list of infos as rows of C initializer text. Each info is first
# converted with generate_info, then right-aligned to the longest entry.
348 def format_infos(infos)
349 infos = infos.map {|info| generate_info(info) }
350 maxlen = infos.map {|info| info.length }.max
# 4 entries per row when the longest entry is at most 16 characters, else 2.
351 columns = maxlen <= 16 ? 4 : 2
353 0.step(infos.length-1, columns) {|i|
355 is = infos[i,columns]
357 code << sprintf(" %#{maxlen}s,", info)
# Emit the C data for one lookup node called `name`: an offsets table appended
# to bytes_code, and an infos table plus the node entry appended to words_code.
# Offsets and infos tables already seen are looked up in OffsetsMemo / InfosMemo.
364 def generate_lookup_node(bytes_code, words_code, name, table)
# Visit the action of every byte value; the array index is the byte.
369 table.each_with_index {|action, byte|
371 if action != :invalid
# A distinct action gets the next index into infos the first time it is seen.
375 unless o = infomap[action]
376 infomap[action] = o = infos.length
# Memo key: the byte range together with the offsets inside it.
385 offsets_key = [min, max, offsets[min..max]]
386 if n = OffsetsMemo[offsets_key]
389 offsets_name = "#{name}_offsets"
390 OffsetsMemo[offsets_key] = offsets_name
391 size = bytes_code.length
# Element count: min and max themselves plus one offset per byte in min..max.
392 bytes_code.insert_at_last(2+max-min+1,
393 "\#define #{offsets_name} #{size}\n" +
394 format_offsets(min,max,offsets) + "\n")
397 if n = InfosMemo[infos]
400 infos_name = "#{name}_infos"
401 InfosMemo[infos] = infos_name
403 size = words_code.length
404 words_code.insert_at_last(infos.length,
405 "\#define #{infos_name} WORDINDEX2INFO(#{size})\n" +
406 format_infos(infos) + "\n")
409 size = words_code.length
# The node entry itself accounts for NUM_ELEM_BYTELOOKUP words.
410 words_code.insert_at_last(NUM_ELEM_BYTELOOKUP,
411 "\#define #{name} WORDINDEX2INFO(#{size})\n" +
# Build (or reuse) the lookup node for this map; the result is used as the
# node's name by callers.
# PreMemo is keyed by [self, valid_encoding]; PostMemo is keyed by the finished
# 256-entry table, so maps that produce the same table share one name.
422 def generate_node(bytes_code, words_code, name_hint=nil, valid_encoding=nil)
423 if n = PreMemo[[self,valid_encoding]]
# Every byte starts out as :invalid.
427 table = Array.new(0x100, :invalid)
428 each_firstbyte(valid_encoding) {|byte, rest, rest_valid_encoding|
429 if a = rest.empty_action
# Recurse into the remainder; the child's name is tagged with the
# "/*BYTE_LOOKUP*/" marker that generate_info recognizes.
433 name_hint2 = "#{name_hint}_#{'%02X' % byte}" if name_hint
434 table[byte] = "/*BYTE_LOOKUP*/" + rest.generate_node(bytes_code, words_code, name_hint2, rest_valid_encoding)
438 if n = PostMemo[table]
# Generated name (NOTE(review): the condition guarding this is not visible).
443 name_hint = "fun_" + NextName.dup
447 PreMemo[[self,valid_encoding]] = PostMemo[table] = name_hint
449 generate_lookup_node(bytes_code, words_code, name_hint, table)
# Entry point for node generation: remember both code buffers on the instance
# (as @bytes_code / @words_code), then generate the node for this map.
453 def gennode(bytes_code, words_code, name_hint=nil, valid_encoding=nil)
454 @bytes_code = bytes_code
455 @words_code = words_code
456 name = generate_node(bytes_code, words_code, name_hint, valid_encoding)
# MS-Kanji conversion of a (csid, index) pair; out-of-range rows and columns
# are rejected with "invalid byte sequence".
# NOTE(review): how row/col are derived from index, the csid branches and the
# return value are not visible in this view.
463 def citrus_mskanji_cstomb(csid, index)
471 raise "invalid byte sequence" if row < 0x21
474 offset = (row == 0x22 || row >= 0x26) ? 0xED : 0xF0
475 elsif row >= 0x4D && row <= 0x7E
478 raise "invalid byte sequence"
481 raise "invalid byte sequence" if row > 0x97
# Base value added to the halved row: 0x81 for rows below 0x5F, 0xC1 otherwise.
482 offset = (row < 0x5F) ? 0x81 : 0xC1
# Columns are restricted to 0x21..0x7E.
485 raise "invalid byte sequence" if (col < 0x21 || col > 0x7E)
# Step over 0x7F (presumably because it is not used as a second byte - confirm).
491 col += 1 if (col >= 0x7F)
# Integer division: two consecutive rows map to the same value.
495 row = row / 2 + offset
# EUC counterpart of citrus_mskanji_cstomb, called from citrus_cstomb.
# NOTE(review): the body is not visible in this view.
500 def citrus_euc_cstomb(csid, index)
# Dispatch the conversion of (csid, index) to the MS-Kanji or the EUC variant.
# NOTE(review): the conditions on `ces` that select a branch are not visible here.
513 def citrus_cstomb(ces, csid, index)
516 citrus_mskanji_cstomb(csid, index)
518 citrus_euc_cstomb(csid, index)
# Name prefixes that citrus_decode_mapsrc looks for at the start of an encoding
# name (SUBDIR.find with rindex(x, 0)) to choose a path component.
522 SUBDIR = %w/APPLE AST BIG5 CNS CP EBCDIC GB GEORGIAN ISO646 ISO-8859 JIS KAZAKH KOI KS MISC TCVN/
# Read the comma-separated mapsrc files named in `mapsrcs` and collect pairs
# into `table`. Depending on the branch, either the right-hand or the left-hand
# code of each "0x.. = 0x.." line is passed through citrus_cstomb.
525 def citrus_decode_mapsrc(ces, csid, mapsrcs)
527 mapsrcs.split(',').each do |mapsrc|
# rindex(x, 0) can only match at position 0, so this is a "starts with" test.
530 if mapsrc.rindex('UCS', 0)
533 path << SUBDIR.find{|x| from.rindex(x, 0) }
536 path << SUBDIR.find{|x| mapsrc.rindex(x, 0) }
# ':' in the mapsrc name becomes '@' in the path component.
538 path << mapsrc.gsub(':', '@')
539 path = File.join(*path)
# Replace the last '/' of the joined path with '%'.
541 path[path.rindex('/')] = '%'
542 STDERR.puts 'load mapsrc %s' % path if VERBOSE_MODE
# Stop the first scan at the BEGIN_MAP line.
545 break if /^BEGIN_MAP/ =~ l
# Blank lines and '#' comment lines are skipped; END_MAP ends the scan.
548 next if /^\s*(?:#|$)/ =~ l
549 break if /^END_MAP/ =~ l
553 when /0x(\w+)\s*-\s*0x(\w+)\s*=\s*INVALID/
555 when /(0x\w+)\s*=\s*(0x\w+)/
# NOTE(review): `push` without arguments returns the array itself, so
# `table.push << x` behaves like `table << x`.
556 table.push << [$1.hex, citrus_cstomb(ces, csid, $2.hex)]
558 raise "unknown notation '%s'"% l
562 when /(0x\w+)\s*=\s*(0x\w+)/
563 table.push << [citrus_cstomb(ces, csid, $1.hex), $2.hex]
565 raise "unknown notation '%s'"% l
# pack("U") produces the UTF-8 bytes of the code point; unpack("H*") turns
# them into a hex string, which is then upper-cased. Non-Integer keys and
# values are left as they are.
577 # integer means UTF-8 encoded sequence.
578 k = [k].pack("U").unpack("H*")[0].upcase if Integer === k
579 v = [v].pack("U").unpack("H*")[0].upcase if Integer === v
# Compile `map` into lookup tables under the name hint `name`.
# Returns two values: the name reported by gennode and the maximum input length.
# When ValidEncoding has an entry for `from`, its pattern is parsed and handed
# to gennode as valid_encoding.
585 def transcode_compile_tree(name, from, map)
586 map = encode_utf8(map)
589 h[k] = v unless h[k] # use first mapping
591 am = ActionMap.parse(h)
593 max_input = am.max_input_length
595 if ValidEncoding[from]
596 valid_encoding = StrSet.parse(ValidEncoding[from])
601 defined_name = am.gennode(TRANSCODE_GENERATED_BYTES_CODE, TRANSCODE_GENERATED_WORDS_CODE, name, valid_encoding)
602 return defined_name, max_input
# Accumulates the C source of the transcoder definitions; transcode_tblgen
# appends to it and transcode_generated_code reads it.
606 TRANSCODE_GENERATED_TRANSCODER_CODE = ''
# Generate the lookup tree and the C `rb_transcoder` definition for converting
# `from` to `to` according to `map`. The transcoder name is added to
# TRANSCODERS and the definition is appended to
# TRANSCODE_GENERATED_TRANSCODER_CODE. Progress is reported on STDERR.
608 def transcode_tblgen(from, to, map)
610 if from.empty? || to.empty?
611 STDERR.puts "converter for #{from.empty? ? to : from}"
613 STDERR.puts "converter from #{from} to #{to}"
# Identifier-safe names: every character outside 0-9A-Za-z becomes '_'.
616 id_from = from.tr('^0-9A-Za-z', '_')
617 id_to = to.tr('^0-9A-Za-z', '_')
619 tree_name = "to_#{id_to}"
621 tree_name = "from_#{id_from}"
623 tree_name = "from_#{id_from}_to_#{id_to}"
# NOTE(review): transcode_compile_tree applies encode_utf8 to its map again.
625 map = encode_utf8(map)
626 real_tree_name, max_input = transcode_compile_tree(tree_name, from, map)
627 transcoder_name = "rb_#{tree_name}"
628 TRANSCODERS << transcoder_name
629 input_unit_length = UnitLength[from]
# Output size: half the hex-string length for String values, 1 for any other value.
630 max_output = map.map {|k,v| String === v ? v.length/2 : 1 }.max
631 transcoder_code = <<"End"
632 static const rb_transcoder
633 #{transcoder_name} = {
634 #{c_esc from}, #{c_esc to}, #{real_tree_name},
635 TRANSCODE_TABLE_INFO,
636 #{input_unit_length}, /* input_unit_length */
637 #{max_input}, /* max_input */
638 #{max_output}, /* max_output */
639 asciicompat_converter, /* asciicompat_type */
640 0, NULL, NULL, /* state_size, state_init, state_fini */
641 NULL, NULL, NULL, NULL,
645 TRANSCODE_GENERATED_TRANSCODER_CODE << transcoder_code
# Generate the node tree for `am` (an object responding to gennode) into the
# shared byte/word code buffers, using `name_hint` for naming.
# Prints a progress line on STDERR in verbose mode.
649 def transcode_generate_node(am, name_hint=nil)
650 STDERR.puts "converter for #{name_hint}" if VERBOSE_MODE
651 name = am.gennode(TRANSCODE_GENERATED_BYTES_CODE, TRANSCODE_GENERATED_WORDS_CODE, name_hint)
# Concatenate everything generated so far: the byte array code, the word array
# code, the TRANSCODE_TABLE_INFO #define (array names and lengths, prefixed
# with OUTPUT_PREFIX) and the accumulated transcoder definitions.
655 def transcode_generated_code
656 TRANSCODE_GENERATED_BYTES_CODE.to_s +
657 TRANSCODE_GENERATED_WORDS_CODE.to_s +
658 "\#define TRANSCODE_TABLE_INFO " +
659 "#{OUTPUT_PREFIX}byte_array, #{TRANSCODE_GENERATED_BYTES_CODE.length}, " +
660 "#{OUTPUT_PREFIX}word_array, #{TRANSCODE_GENERATED_WORDS_CODE.length}, " +
661 "sizeof(unsigned int)\n" +
662 TRANSCODE_GENERATED_TRANSCODER_CODE
# Build one `rb_register_transcoder(&name);` C statement per entry of TRANSCODERS.
665 def transcode_register_code
667 TRANSCODERS.each {|transcoder_name|
668 code << " rb_register_transcoder(&#{transcoder_name});\n"
# Encodings without an explicit entry in UnitLength get a unit length of 1.
679 UnitLength.default = 1
# Valid byte patterns per encoding or label, written in the pattern syntax
# that StrSet.parse accepts (presumably the entries of ValidEncoding - the
# opening line of the literal is not visible here). Several patterns span
# multiple lines inside one string literal.
682 '1byte' => '{00-ff}',
683 '2byte' => '{00-ff}{00-ff}',
684 '4byte' => '{00-ff}{00-ff}{00-ff}{00-ff}',
685 'US-ASCII' => '{00-7f}',
689 {e1-ec}{80-bf}{80-bf}
691 {ee-ef}{80-bf}{80-bf}
692 f0{90-bf}{80-bf}{80-bf}
693 {f1-f3}{80-bf}{80-bf}{80-bf}
694 f4{80-8f}{80-bf}{80-bf}',
695 'UTF-16BE' => '{00-d7,e0-ff}{00-ff}
696 {d8-db}{00-ff}{dc-df}{00-ff}',
697 'UTF-16LE' => '{00-ff}{00-d7,e0-ff}
698 {00-ff}{d8-db}{00-ff}{dc-df}',
699 'UTF-32BE' => '0000{00-d7,e0-ff}{00-ff}
700 00{01-10}{00-ff}{00-ff}',
701 'UTF-32LE' => '{00-ff}{00-d7,e0-ff}0000
702 {00-ff}{00-ff}{01-10}00',
707 'CP51932' => '{00-7f}
710 'Shift_JIS' => '{00-7f}
711 {81-9f,e0-fc}{40-7e,80-fc}
716 {81-fe}{41-5a,61-7a,81-fe}',
718 {81-fe}{40-7e,a1-fe}',
721 8e{a1-b0}{a1-fe}{a1-fe}',
723 {81-fe}{40-7e,80-fe}',
724 'GB18030' => '{00-7f}
726 {81-fe}{30-39}{81-fe}{30-39}',
# Register the valid byte pattern for `encoding` in ValidEncoding.
# `pattern_or_label` is either a key already present in ValidEncoding (its
# pattern is looked up) or, presumably, a pattern string itself.
# Raises ArgumentError when `encoding` already has a different pattern.
729 def set_valid_byte_pattern (encoding, pattern_or_label)
731 if ValidEncoding[pattern_or_label]
732 ValidEncoding[pattern_or_label]
736 if ValidEncoding[encoding] and ValidEncoding[encoding]!=pattern
737 raise ArgumentError, "trying to change valid byte pattern for encoding #{encoding} from #{ValidEncoding[encoding]} to #{pattern}"
739 ValidEncoding[encoding] = pattern
742 # the following may be used in different places, so keep them here for the moment
# Each call registers the first encoding with the pattern or label given as
# the second argument.
743 set_valid_byte_pattern 'ASCII-8BIT', '1byte'
744 set_valid_byte_pattern 'Windows-31J', 'Shift_JIS'
745 set_valid_byte_pattern 'eucJP-ms', 'EUC-JP'
# Signature text for a source file: its name (String#dump form), the length
# of its content and the String#sum checksum of its content.
747 def make_signature(filename, src)
748 "src=#{filename.dump}, len=#{src.length}, checksum=#{src.sum}"
# Script driver: parse options, decide whether the output is up to date, run
# the ERB template and write the result.
# NOTE(review): many lines of this section are not visible in this view.
751 output_filename = nil
# Command-line options.
755 op = OptionParser.new
756 op.def_option("--help", "show help message") { puts op; exit 0 }
757 op.def_option("--verbose", "verbose mode") { verbose_mode = true }
758 op.def_option("--force", "force table generation") { force_mode = true }
759 op.def_option("--output=FILE", "specify output file") {|arg| output_filename = arg }
762 VERBOSE_MODE = verbose_mode
764 OUTPUT_FILENAME = output_filename
# Prefix for generated C identifiers: the leading [A-Za-z0-9_]* run of the
# output file's basename, without leading underscores and ending in a single '_'.
765 OUTPUT_PREFIX = output_filename ? File.basename(output_filename)[/\A[A-Za-z0-9_]*/] : ""
766 OUTPUT_PREFIX.sub!(/\A_+/, '')
767 OUTPUT_PREFIX.sub!(/_*\z/, '_')
769 TRANSCODE_GENERATED_BYTES_CODE = ArrayCode.new("unsigned char", "#{OUTPUT_PREFIX}byte_array")
770 TRANSCODE_GENERATED_WORDS_CODE = ArrayCode.new("unsigned int", "#{OUTPUT_PREFIX}word_array")
# Put the directory of the input file on the load path.
773 $srcdir = File.dirname(arg)
774 $:.unshift $srcdir unless $:.include? $srcdir
776 src.force_encoding("ascii-8bit") if src.respond_to? :force_encoding
777 this_script = File.read(__FILE__)
778 this_script.force_encoding("ascii-8bit") if this_script.respond_to? :force_encoding
# Signature header: one line for this script and one for the input file.
780 base_signature = "/* autogenerated. */\n"
781 base_signature << "/* #{make_signature(File.basename(__FILE__), this_script)} */\n"
782 base_signature << "/* #{make_signature(File.basename(arg), src)} */\n"
# Up-to-date check: read the first paragraph of the existing output (gets("")
# reads up to a blank line) and recompute the signatures of the files it names.
784 if !force_mode && output_filename && File.readable?(output_filename)
785 old_signature = File.open(output_filename) {|f| f.gets("").chomp }
786 chk_signature = base_signature.dup
787 old_signature.each_line {|line|
788 if %r{/\* src="([0-9a-z_.-]+)",} =~ line
# The input file and this script are already part of base_signature.
790 next if name == File.basename(arg) || name == File.basename(__FILE__)
791 path = File.join($srcdir, name)
792 if File.readable? path
793 chk_signature << "/* #{make_signature(name, File.read(path))} */\n"
# Signatures match: only refresh the output file's timestamps.
797 if old_signature == chk_signature
799 File.utime(now, now, output_filename)
800 STDERR.puts "already up-to-date: #{output_filename}" if VERBOSE_MODE
807 STDERR.puts "generating #{output_filename} ..."
# NOTE(review): the positional safe_level/trim_mode arguments of ERB.new are
# deprecated since Ruby 2.6 and are not accepted by recent ERB releases; the
# keyword form is ERB.new(src, trim_mode: '%') - confirm the minimum Ruby
# version this script must support before changing.
812 erb = ERB.new(src, nil, '%')
814 erb_result = erb.result(binding)
820 lib = File.basename(lib)
821 path = File.join($srcdir, lib)
822 if File.readable? path
823 lib_sigs << "/* #{make_signature(lib, File.read(path))} */\n"
828 result << base_signature
# Write to "<output>.new" first and rename it over the final name afterwards.
835 new_filename = output_filename + ".new"
836 FileUtils.mkdir_p(File.dirname(output_filename))
837 File.open(new_filename, "wb") {|f| f << result }
838 File.rename(new_filename, output_filename)
839 STDERR.puts "done." if VERBOSE_MODE