vendor/golang.org/x/text/internal/ucd/ucd.go

   1 // Copyright 2014 The Go Authors. All rights reserved.
   2 // Use of this source code is governed by a BSD-style
   3 // license that can be found in the LICENSE file.
   4
   5 // Package ucd provides a parser for Unicode Character Database files, the
   6 // format of which is defined in http://www.unicode.org/reports/tr44/. See
   7 // http://www.unicode.org/Public/UCD/latest/ucd/ for example files.
   8 //
   9 // It currently does not support substitutions of missing fields.
  10 package ucd // import "golang.org/x/text/internal/ucd"
  11
  12 import (
  13         "bufio"
  14         "bytes"
  15         "errors"
  16         "io"
  17         "log"
  18         "regexp"
  19         "strconv"
  20         "strings"
  21 )
  22
  23 // UnicodeData.txt fields.
  24 const (
  25         CodePoint = iota
  26         Name
  27         GeneralCategory
  28         CanonicalCombiningClass
  29         BidiClass
  30         DecompMapping
  31         DecimalValue
  32         DigitValue
  33         NumericValue
  34         BidiMirrored
  35         Unicode1Name
  36         ISOComment
  37         SimpleUppercaseMapping
  38         SimpleLowercaseMapping
  39         SimpleTitlecaseMapping
  40 )
  41
  42 // Parse calls f for each entry in the given reader of a UCD file. It will close
  43 // the reader upon return. It will call log.Fatal if any error occurred.
  44 //
  45 // This implements the most common usage pattern of using Parser.
  46 func Parse(r io.ReadCloser, f func(p *Parser)) {
  47         defer r.Close()
  48
  49         p := New(r)
  50         for p.Next() {
  51                 f(p)
  52         }
  53         if err := p.Err(); err != nil {
  54                 r.Close() // os.Exit will cause defers not to be called.
  55                 log.Fatal(err)
  56         }
  57 }
  58
  59 // An Option is used to configure a Parser.
  60 type Option func(p *Parser)
  61
  62 func keepRanges(p *Parser) {
  63         p.keepRanges = true
  64 }
  65
  66 var (
  67         // KeepRanges prevents the expansion of ranges. The raw ranges can be
  68         // obtained by calling Range(0) on the parser.
  69         KeepRanges Option = keepRanges
  70 )
  71
  72 // The Part option register a handler for lines starting with a '@'. The text
  73 // after a '@' is available as the first field. Comments are handled as usual.
  74 func Part(f func(p *Parser)) Option {
  75         return func(p *Parser) {
  76                 p.partHandler = f
  77         }
  78 }
  79
  80 // The CommentHandler option passes comments that are on a line by itself to
  81 // a given handler.
  82 func CommentHandler(f func(s string)) Option {
  83         return func(p *Parser) {
  84                 p.commentHandler = f
  85         }
  86 }
  87
  88 // A Parser parses Unicode Character Database (UCD) files.
  89 type Parser struct {
  90         scanner *bufio.Scanner
  91
  92         keepRanges bool // Don't expand rune ranges in field 0.
  93
  94         err     error
  95         comment []byte
  96         field   [][]byte
  97         // parsedRange is needed in case Range(0) is called more than once for one
  98         // field. In some cases this requires scanning ahead.
  99         parsedRange          bool
 100         rangeStart, rangeEnd rune
 101
 102         partHandler    func(p *Parser)
 103         commentHandler func(s string)
 104 }
 105
 106 func (p *Parser) setError(err error) {
 107         if p.err == nil {
 108                 p.err = err
 109         }
 110 }
 111
 112 func (p *Parser) getField(i int) []byte {
 113         if i >= len(p.field) {
 114                 return nil
 115         }
 116         return p.field[i]
 117 }
 118
// Err returns a non-nil error if any error occurred during parsing. The value
// is whatever the parser has stored in its err field, or nil if nothing has
// been recorded so far.
func (p *Parser) Err() error {
	return p.err
}
 123
 124 // New returns a Parser for the given Reader.
 125 func New(r io.Reader, o ...Option) *Parser {
 126         p := &Parser{
 127                 scanner: bufio.NewScanner(r),
 128         }
 129         for _, f := range o {
 130                 f(p)
 131         }
 132         return p
 133 }
 134
 135 // Next parses the next line in the file. It returns true if a line was parsed
 136 // and false if it reached the end of the file.
 137 func (p *Parser) Next() bool {
 138         if !p.keepRanges && p.rangeStart < p.rangeEnd {
 139                 p.rangeStart++
 140                 return true
 141         }
 142         p.comment = nil
 143         p.field = p.field[:0]
 144         p.parsedRange = false
 145
 146         for p.scanner.Scan() {
 147                 b := p.scanner.Bytes()
 148                 if len(b) == 0 {
 149                         continue
 150                 }
 151                 if b[0] == '#' {
 152                         if p.commentHandler != nil {
 153                                 p.commentHandler(strings.TrimSpace(string(b[1:])))
 154                         }
 155                         continue
 156                 }
 157
 158                 // Parse line
 159                 if i := bytes.IndexByte(b, '#'); i != -1 {
 160                         p.comment = bytes.TrimSpace(b[i+1:])
 161                         b = b[:i]
 162                 }
 163                 if b[0] == '@' {
 164                         if p.partHandler != nil {
 165                                 p.field = append(p.field, bytes.TrimSpace(b[1:]))
 166                                 p.partHandler(p)
 167                                 p.field = p.field[:0]
 168                         }
 169                         p.comment = nil
 170                         continue
 171                 }
 172                 for {
 173                         i := bytes.IndexByte(b, ';')
 174                         if i == -1 {
 175                                 p.field = append(p.field, bytes.TrimSpace(b))
 176                                 break
 177                         }
 178                         p.field = append(p.field, bytes.TrimSpace(b[:i]))
 179                         b = b[i+1:]
 180                 }
 181                 if !p.keepRanges {
 182                         p.rangeStart, p.rangeEnd = p.getRange(0)
 183                 }
 184                 return true
 185         }
 186         p.setError(p.scanner.Err())
 187         return false
 188 }
 189
 190 func parseRune(b []byte) (rune, error) {
 191         if len(b) > 2 && b[0] == 'U' && b[1] == '+' {
 192                 b = b[2:]
 193         }
 194         x, err := strconv.ParseUint(string(b), 16, 32)
 195         return rune(x), err
 196 }
 197
 198 func (p *Parser) parseRune(b []byte) rune {
 199         x, err := parseRune(b)
 200         p.setError(err)
 201         return x
 202 }
 203
 204 // Rune parses and returns field i as a rune.
 205 func (p *Parser) Rune(i int) rune {
 206         if i > 0 || p.keepRanges {
 207                 return p.parseRune(p.getField(i))
 208         }
 209         return p.rangeStart
 210 }
 211
 212 // Runes interprets and returns field i as a sequence of runes.
 213 func (p *Parser) Runes(i int) (runes []rune) {
 214         add := func(b []byte) {
 215                 if b = bytes.TrimSpace(b); len(b) > 0 {
 216                         runes = append(runes, p.parseRune(b))
 217                 }
 218         }
 219         for b := p.getField(i); ; {
 220                 i := bytes.IndexByte(b, ' ')
 221                 if i == -1 {
 222                         add(b)
 223                         break
 224                 }
 225                 add(b[:i])
 226                 b = b[i+1:]
 227         }
 228         return
 229 }
 230
 231 var (
 232         errIncorrectLegacyRange = errors.New("ucd: unmatched <* First>")
 233
 234         // reRange matches one line of a legacy rune range.
 235         reRange = regexp.MustCompile("^([0-9A-F]*);<([^,]*), ([^>]*)>(.*)$")
 236 )
 237
 238 // Range parses and returns field i as a rune range. A range is inclusive at
 239 // both ends. If the field only has one rune, first and last will be identical.
 240 // It supports the legacy format for ranges used in UnicodeData.txt.
 241 func (p *Parser) Range(i int) (first, last rune) {
 242         if !p.keepRanges {
 243                 return p.rangeStart, p.rangeStart
 244         }
 245         return p.getRange(i)
 246 }
 247
 248 func (p *Parser) getRange(i int) (first, last rune) {
 249         b := p.getField(i)
 250         if k := bytes.Index(b, []byte("..")); k != -1 {
 251                 return p.parseRune(b[:k]), p.parseRune(b[k+2:])
 252         }
 253         // The first field may not be a rune, in which case we may ignore any error
 254         // and set the range as 0..0.
 255         x, err := parseRune(b)
 256         if err != nil {
 257                 // Disable range parsing henceforth. This ensures that an error will be
 258                 // returned if the user subsequently will try to parse this field as
 259                 // a Rune.
 260                 p.keepRanges = true
 261         }
 262         // Special case for UnicodeData that was retained for backwards compatibility.
 263         if i == 0 && len(p.field) > 1 && bytes.HasSuffix(p.field[1], []byte("First>")) {
 264                 if p.parsedRange {
 265                         return p.rangeStart, p.rangeEnd
 266                 }
 267                 mf := reRange.FindStringSubmatch(p.scanner.Text())
 268                 if mf == nil || !p.scanner.Scan() {
 269                         p.setError(errIncorrectLegacyRange)
 270                         return x, x
 271                 }
 272                 // Using Bytes would be more efficient here, but Text is a lot easier
 273                 // and this is not a frequent case.
 274                 ml := reRange.FindStringSubmatch(p.scanner.Text())
 275                 if ml == nil || mf[2] != ml[2] || ml[3] != "Last" || mf[4] != ml[4] {
 276                         p.setError(errIncorrectLegacyRange)
 277                         return x, x
 278                 }
 279                 p.rangeStart, p.rangeEnd = x, p.parseRune(p.scanner.Bytes()[:len(ml[1])])
 280                 p.parsedRange = true
 281                 return p.rangeStart, p.rangeEnd
 282         }
 283         return x, x
 284 }
 285
 286 // bools recognizes all valid UCD boolean values.
 287 var bools = map[string]bool{
 288         "":      false,
 289         "N":     false,
 290         "No":    false,
 291         "F":     false,
 292         "False": false,
 293         "Y":     true,
 294         "Yes":   true,
 295         "T":     true,
 296         "True":  true,
 297 }
 298
 299 // Bool parses and returns field i as a boolean value.
 300 func (p *Parser) Bool(i int) bool {
 301         b := p.getField(i)
 302         for s, v := range bools {
 303                 if bstrEq(b, s) {
 304                         return v
 305                 }
 306         }
 307         p.setError(strconv.ErrSyntax)
 308         return false
 309 }
 310
 311 // Int parses and returns field i as an integer value.
 312 func (p *Parser) Int(i int) int {
 313         x, err := strconv.ParseInt(string(p.getField(i)), 10, 64)
 314         p.setError(err)
 315         return int(x)
 316 }
 317
 318 // Uint parses and returns field i as an unsigned integer value.
 319 func (p *Parser) Uint(i int) uint {
 320         x, err := strconv.ParseUint(string(p.getField(i)), 10, 64)
 321         p.setError(err)
 322         return uint(x)
 323 }
 324
 325 // Float parses and returns field i as a decimal value.
 326 func (p *Parser) Float(i int) float64 {
 327         x, err := strconv.ParseFloat(string(p.getField(i)), 64)
 328         p.setError(err)
 329         return x
 330 }
 331
 332 // String parses and returns field i as a string value.
 333 func (p *Parser) String(i int) string {
 334         return string(p.getField(i))
 335 }
 336
 337 // Strings parses and returns field i as a space-separated list of strings.
 338 func (p *Parser) Strings(i int) []string {
 339         ss := strings.Split(string(p.getField(i)), " ")
 340         for i, s := range ss {
 341                 ss[i] = strings.TrimSpace(s)
 342         }
 343         return ss
 344 }
 345
 346 // Comment returns the comments for the current line.
 347 func (p *Parser) Comment() string {
 348         return string(p.comment)
 349 }
 350
 351 var errUndefinedEnum = errors.New("ucd: undefined enum value")
 352
 353 // Enum interprets and returns field i as a value that must be one of the values
 354 // in enum.
 355 func (p *Parser) Enum(i int, enum ...string) string {
 356         b := p.getField(i)
 357         for _, s := range enum {
 358                 if bstrEq(b, s) {
 359                         return s
 360                 }
 361         }
 362         p.setError(errUndefinedEnum)
 363         return ""
 364 }
 365
 366 func bstrEq(b []byte, s string) bool {
 367         if len(b) != len(s) {
 368                 return false
 369         }
 370         for i, c := range b {
 371                 if c != s[i] {
 372                         return false
 373                 }
 374         }
 375         return true
 376 }