1 // Copyright 2014 The Go Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
5 // Package ucd provides a parser for Unicode Character Database files, the
6 // format of which is defined in http://www.unicode.org/reports/tr44/. See
7 // http://www.unicode.org/Public/UCD/latest/ucd/ for example files.
9 // It currently does not support substitutions of missing fields.
10 package ucd // import "golang.org/x/text/internal/ucd"
23 // UnicodeData.txt fields.
28 CanonicalCombiningClass
37 SimpleUppercaseMapping
38 SimpleLowercaseMapping
39 SimpleTitlecaseMapping
42 // Parse calls f for each entry in the given reader of a UCD file. It will close
43 // the reader upon return. It will call log.Fatal if any error occurred.
45 // This implements the most common usage pattern of Parser.
46 func Parse(r io.ReadCloser, f func(p *Parser)) {
53 if err := p.Err(); err != nil {
54 r.Close() // os.Exit will cause defers not to be called.
59 // An Option is used to configure a Parser.
60 type Option func(p *Parser)
62 func keepRanges(p *Parser) {
67 // KeepRanges prevents the expansion of ranges. The raw ranges can be
68 // obtained by calling Range(0) on the parser.
69 KeepRanges Option = keepRanges
72 // The Part option registers a handler for lines starting with a '@'. The text
73 // after a '@' is available as the first field. Comments are handled as usual.
74 func Part(f func(p *Parser)) Option {
75 return func(p *Parser) {
80 // The CommentHandler option passes comments that are on a line by themselves to
82 func CommentHandler(f func(s string)) Option {
83 return func(p *Parser) {
88 // A Parser parses Unicode Character Database (UCD) files.
90 scanner *bufio.Scanner
92 keepRanges bool // Don't expand rune ranges in field 0.
97 // parsedRange is needed in case Range(0) is called more than once for one
98 // field. In some cases this requires scanning ahead.
100 rangeStart, rangeEnd rune
102 partHandler func(p *Parser)
103 commentHandler func(s string)
106 func (p *Parser) setError(err error) {
112 func (p *Parser) getField(i int) []byte {
113 if i >= len(p.field) {
119 // Err returns a non-nil error if any error occurred during parsing.
120 func (p *Parser) Err() error {
124 // New returns a Parser for the given Reader.
125 func New(r io.Reader, o ...Option) *Parser {
127 scanner: bufio.NewScanner(r),
129 for _, f := range o {
135 // Next parses the next line in the file. It returns true if a line was parsed
136 // and false if it reached the end of the file.
137 func (p *Parser) Next() bool {
138 if !p.keepRanges && p.rangeStart < p.rangeEnd {
143 p.field = p.field[:0]
144 p.parsedRange = false
146 for p.scanner.Scan() {
147 b := p.scanner.Bytes()
152 if p.commentHandler != nil {
153 p.commentHandler(strings.TrimSpace(string(b[1:])))
159 if i := bytes.IndexByte(b, '#'); i != -1 {
160 p.comment = bytes.TrimSpace(b[i+1:])
164 if p.partHandler != nil {
165 p.field = append(p.field, bytes.TrimSpace(b[1:]))
167 p.field = p.field[:0]
173 i := bytes.IndexByte(b, ';')
175 p.field = append(p.field, bytes.TrimSpace(b))
178 p.field = append(p.field, bytes.TrimSpace(b[:i]))
182 p.rangeStart, p.rangeEnd = p.getRange(0)
186 p.setError(p.scanner.Err())
190 func parseRune(b []byte) (rune, error) {
191 if len(b) > 2 && b[0] == 'U' && b[1] == '+' {
194 x, err := strconv.ParseUint(string(b), 16, 32)
198 func (p *Parser) parseRune(b []byte) rune {
199 x, err := parseRune(b)
204 // Rune parses and returns field i as a rune.
205 func (p *Parser) Rune(i int) rune {
206 if i > 0 || p.keepRanges {
207 return p.parseRune(p.getField(i))
212 // Runes interprets and returns field i as a sequence of runes.
213 func (p *Parser) Runes(i int) (runes []rune) {
214 add := func(b []byte) {
215 if b = bytes.TrimSpace(b); len(b) > 0 {
216 runes = append(runes, p.parseRune(b))
219 for b := p.getField(i); ; {
220 i := bytes.IndexByte(b, ' ')
232 errIncorrectLegacyRange = errors.New("ucd: unmatched <* First>")
234 // reRange matches one line of a legacy rune range.
235 reRange = regexp.MustCompile("^([0-9A-F]*);<([^,]*), ([^>]*)>(.*)$")
238 // Range parses and returns field i as a rune range. A range is inclusive at
239 // both ends. If the field only has one rune, first and last will be identical.
240 // It supports the legacy format for ranges used in UnicodeData.txt.
241 func (p *Parser) Range(i int) (first, last rune) {
243 return p.rangeStart, p.rangeStart
248 func (p *Parser) getRange(i int) (first, last rune) {
250 if k := bytes.Index(b, []byte("..")); k != -1 {
251 return p.parseRune(b[:k]), p.parseRune(b[k+2:])
253 // The first field may not be a rune, in which case we may ignore any error
254 // and set the range as 0..0.
255 x, err := parseRune(b)
257 // Disable range parsing henceforth. This ensures that an error will be
258 // returned if the user subsequently tries to parse this field as
262 // Special case for UnicodeData that was retained for backwards compatibility.
263 if i == 0 && len(p.field) > 1 && bytes.HasSuffix(p.field[1], []byte("First>")) {
265 return p.rangeStart, p.rangeEnd
267 mf := reRange.FindStringSubmatch(p.scanner.Text())
268 if mf == nil || !p.scanner.Scan() {
269 p.setError(errIncorrectLegacyRange)
272 // Using Bytes would be more efficient here, but Text is a lot easier
273 // and this is not a frequent case.
274 ml := reRange.FindStringSubmatch(p.scanner.Text())
275 if ml == nil || mf[2] != ml[2] || ml[3] != "Last" || mf[4] != ml[4] {
276 p.setError(errIncorrectLegacyRange)
279 p.rangeStart, p.rangeEnd = x, p.parseRune(p.scanner.Bytes()[:len(ml[1])])
281 return p.rangeStart, p.rangeEnd
286 // bools recognizes all valid UCD boolean values.
287 var bools = map[string]bool{
299 // Bool parses and returns field i as a boolean value.
300 func (p *Parser) Bool(i int) bool {
302 for s, v := range bools {
307 p.setError(strconv.ErrSyntax)
311 // Int parses and returns field i as an integer value.
312 func (p *Parser) Int(i int) int {
313 x, err := strconv.ParseInt(string(p.getField(i)), 10, 64)
318 // Uint parses and returns field i as an unsigned integer value.
319 func (p *Parser) Uint(i int) uint {
320 x, err := strconv.ParseUint(string(p.getField(i)), 10, 64)
325 // Float parses and returns field i as a decimal value.
326 func (p *Parser) Float(i int) float64 {
327 x, err := strconv.ParseFloat(string(p.getField(i)), 64)
332 // String parses and returns field i as a string value.
333 func (p *Parser) String(i int) string {
334 return string(p.getField(i))
337 // Strings parses and returns field i as a space-separated list of strings.
338 func (p *Parser) Strings(i int) []string {
339 ss := strings.Split(string(p.getField(i)), " ")
340 for i, s := range ss {
341 ss[i] = strings.TrimSpace(s)
346 // Comment returns the comments for the current line.
347 func (p *Parser) Comment() string {
348 return string(p.comment)
351 var errUndefinedEnum = errors.New("ucd: undefined enum value")
353 // Enum interprets and returns field i as a value that must be one of the values
355 func (p *Parser) Enum(i int, enum ...string) string {
357 for _, s := range enum {
362 p.setError(errUndefinedEnum)
366 func bstrEq(b []byte, s string) bool {
367 if len(b) != len(s) {
370 for i, c := range b {