3 // Written using the principles developed by Rob Pike in
4 // http://www.youtube.com/watch?v=HxaD_trXwRE
// dateRegexp matches an RFC 3339 datetime at the start of the remaining
// input; it is compiled once at startup and used by lexRvalue's date probe.
17 var dateRegexp *regexp.Regexp
19 // Define state functions
// tomlLexStateFn is one state of the lexer: it consumes some input, emits
// tokens as a side effect, and returns the next state to run — or nil to
// stop the machine (see run()). Classic Pike-style state-function lexing.
20 type tomlLexStateFn func() tomlLexStateFn
23 type tomlLexer struct {
// input holds the entire document decoded to runes (see lexToml), so
// multi-byte UTF-8 is handled uniformly by index arithmetic.
// NOTE(review): the remaining fields (inputIdx, currentTokenStart/Stop,
// line/col, endbufferLine/Col, tokens) are elided from this view.
25 input []rune // Textual source
36 // Basic read operations on input
// read returns the rune at the cursor; next presumably also advances the
// cursor and position bookkeeping — bodies elided from this view, TODO confirm.
38 func (l *tomlLexer) read() rune {
50 func (l *tomlLexer) next() rune {
59 func (l *tomlLexer) ignore() {
// Discard the pending (not-yet-emitted) input: collapse the token window
// and move the recorded token start position up to the current read position.
60 l.currentTokenStart = l.currentTokenStop
61 l.line = l.endbufferLine
62 l.col = l.endbufferCol
// skip consumes and discards a single rune — presumably next() followed by
// ignore(); body elided from this view, TODO confirm.
65 func (l *tomlLexer) skip() {
// fastForward advances the cursor by n runes (used after regexp/lookahead
// matches such as the date probe in lexRvalue).
70 func (l *tomlLexer) fastForward(n int) {
71 for i := 0; i < n; i++ {
// emitWithValue appends a token of type t carrying an explicit value,
// stamped with the position where the current token started.
76 func (l *tomlLexer) emitWithValue(t tokenType, value string) {
77 l.tokens = append(l.tokens, token{
78 Position: Position{l.line, l.col},
// emit is emitWithValue with the token text taken verbatim from the current
// input window [currentTokenStart, currentTokenStop).
85 func (l *tomlLexer) emit(t tokenType) {
86 l.emitWithValue(t, string(l.input[l.currentTokenStart:l.currentTokenStop]))
// peek returns the rune at the cursor without advancing.
89 func (l *tomlLexer) peek() rune {
90 if l.inputIdx >= len(l.input) {
// (elided branch presumably returns eof at end of input — TODO confirm)
93 return l.input[l.inputIdx]
// peekString returns up to size upcoming runes as a string, clamped at end
// of input, without advancing the cursor.
96 func (l *tomlLexer) peekString(size int) string {
97 maxIdx := len(l.input)
98 upperIdx := l.inputIdx + size // FIXME: potential overflow
99 if upperIdx > maxIdx {
// (elided branch presumably clamps upperIdx to maxIdx)
102 return string(l.input[l.inputIdx:upperIdx])
// follow reports whether the upcoming input begins with next, without
// consuming anything.
105 func (l *tomlLexer) follow(next string) bool {
106 return next == l.peekString(len(next))
// errorf records an error token at the current position. Returning a state
// fn lets callers write `return l.errorf(...)`; presumably it returns nil so
// run() halts — elided here, TODO confirm.
111 func (l *tomlLexer) errorf(format string, args ...interface{}) tomlLexStateFn {
112 l.tokens = append(l.tokens, token{
113 Position: Position{l.line, l.col},
115 val: fmt.Sprintf(format, args...),
// lexVoid is the top-level state between expressions: it dispatches on the
// next rune to comments, table headers, keys, etc. Most branches are elided
// from this view.
122 func (l *tomlLexer) lexVoid() tomlLexStateFn {
129 return l.lexComment(l.lexVoid)
147 if isKeyStartChar(next) {
// lexRvalue lexes the right-hand side of `key = ...`: it dispatches on the
// next rune to the specific value lexer (strings, arrays, inline tables,
// booleans, dates, numbers). Several branches are elided from this view.
161 func (l *tomlLexer) lexRvalue() tomlLexStateFn {
166 return l.errorf("cannot start float with a dot")
171 return l.lexLeftBracket
174 return l.lexRightBracket
176 return l.lexLeftCurlyBrace
178 return l.lexRightCurlyBrace
180 return l.lexComment(l.lexRvalue)
184 return l.lexLiteralString
196 return l.errorf("cannot start number with underscore")
199 if l.follow("true") {
203 if l.follow("false") {
// Date detection: probe a bounded lookahead with the anchored dateRegexp and
// consume exactly the matched prefix. The 35-rune window is presumably sized
// to the longest matchable RFC 3339 form — TODO confirm.
217 possibleDate := l.peekString(35)
218 dateMatch := dateRegexp.FindString(possibleDate)
220 l.fastForward(len(dateMatch))
// Not a date: signs and digits start numbers; other alphanumerics are errors
// caught below.
224 if next == '+' || next == '-' || isDigit(next) {
228 if isAlphanumeric(next) {
232 return l.errorf("no value can start with %c", next)
// lexLeftCurlyBrace emits the '{' opening an inline table.
239 func (l *tomlLexer) lexLeftCurlyBrace() tomlLexStateFn {
241 l.emit(tokenLeftCurlyBrace)
// lexRightCurlyBrace emits the '}' closing an inline table.
245 func (l *tomlLexer) lexRightCurlyBrace() tomlLexStateFn {
247 l.emit(tokenRightCurlyBrace)
// Single-lexeme states (bodies elided from this view): each presumably
// consumes its lexeme, emits the corresponding token, and returns to the
// appropriate parent state — TODO confirm.
251 func (l *tomlLexer) lexDate() tomlLexStateFn {
256 func (l *tomlLexer) lexTrue() tomlLexStateFn {
262 func (l *tomlLexer) lexFalse() tomlLexStateFn {
268 func (l *tomlLexer) lexEqual() tomlLexStateFn {
274 func (l *tomlLexer) lexComma() tomlLexStateFn {
// lexKey accumulates a key (bare or quoted segments) into growingString and
// emits it as a single tokenKey. Quoted segments are re-wrapped in `"` so a
// later consumer can distinguish them from bare characters.
280 func (l *tomlLexer) lexKey() tomlLexStateFn {
// '\n'/'\r' are admitted by the loop condition only so the error branch
// below can report them explicitly instead of silently stopping.
283 for r := l.peek(); isKeyChar(r) || r == '\n' || r == '\r'; r = l.peek() {
286 str, err := l.lexStringAsString(`"`, false, true)
288 return l.errorf(err.Error())
290 growingString += `"` + str + `"`
293 } else if r == '\n' {
294 return l.errorf("keys cannot contain new lines")
295 } else if isSpace(r) {
297 } else if !isValidBareChar(r) {
298 return l.errorf("keys cannot contain %c character", r)
300 growingString += string(r)
303 l.emitWithValue(tokenKey, growingString)
// lexComment returns a state fn (closure) that consumes input up to the next
// newline or EOF, then resumes previousState. CRLF is recognized as a line
// terminator as well.
307 func (l *tomlLexer) lexComment(previousState tomlLexStateFn) tomlLexStateFn {
308 return func() tomlLexStateFn {
309 for next := l.peek(); next != '\n' && next != eof; next = l.peek() {
310 if next == '\r' && l.follow("\r\n") {
// lexLeftBracket emits the '[' opening an array value.
320 func (l *tomlLexer) lexLeftBracket() tomlLexStateFn {
322 l.emit(tokenLeftBracket)
// lexLiteralStringAsString scans a literal (single-quoted) string up to
// terminator and returns its contents. No escape processing is performed.
// Per TOML, a newline immediately after an opening ''' is dropped when
// discardLeadingNewLine is set.
326 func (l *tomlLexer) lexLiteralStringAsString(terminator string, discardLeadingNewLine bool) (string, error) {
329 if discardLeadingNewLine {
330 if l.follow("\r\n") {
333 } else if l.peek() == '\n' {
338 // find end of string
340 if l.follow(terminator) {
341 return growingString, nil
348 growingString += string(l.next())
// Reaching EOF without seeing the terminator is an error.
351 return "", errors.New("unclosed string")
// lexLiteralString lexes a ' or ''' delimited string, emits it as a
// tokenString, and skips past the closing delimiter.
354 func (l *tomlLexer) lexLiteralString() tomlLexStateFn {
357 // handle special case for triple-quote
359 discardLeadingNewLine := false
364 discardLeadingNewLine = true
367 str, err := l.lexLiteralStringAsString(terminator, discardLeadingNewLine)
369 return l.errorf(err.Error())
372 l.emitWithValue(tokenString, str)
373 l.fastForward(len(terminator))
378 // Lex a string and return the results as a string.
379 // Terminator is the substring indicating the end of the token.
380 // The resulting string does not include the terminator.
// Unlike lexLiteralStringAsString, this handles backslash escapes; newlines
// in the body are only legal when acceptNewLines is set (multi-line strings).
381 func (l *tomlLexer) lexStringAsString(terminator string, discardLeadingNewLine, acceptNewLines bool) (string, error) {
384 if discardLeadingNewLine {
385 if l.follow("\r\n") {
388 } else if l.peek() == '\n' {
394 if l.follow(terminator) {
395 return growingString, nil
// Escape handling (dispatch on the character after '\', elided here):
408 // skip all whitespace chars following backslash
// Line-ending backslash: per TOML, trim all following whitespace/newlines.
409 for strings.ContainsRune("\r\n\t ", l.peek()) {
413 growingString += "\""
416 growingString += "\n"
419 growingString += "\b"
422 growingString += "\f"
428 growingString += "\t"
431 growingString += "\r"
434 growingString += "\\"
// \uXXXX: exactly four hex digits, decoded to a rune.
439 for i := 0; i < 4; i++ {
442 return "", errors.New("unfinished unicode escape")
445 code = code + string(c)
447 intcode, err := strconv.ParseInt(code, 16, 32)
449 return "", errors.New("invalid unicode escape: \\u" + code)
451 growingString += string(rune(intcode))
// \UXXXXXXXX: exactly eight hex digits, decoded to a rune.
455 for i := 0; i < 8; i++ {
458 return "", errors.New("unfinished unicode escape")
461 code = code + string(c)
463 intcode, err := strconv.ParseInt(code, 16, 64)
465 return "", errors.New("invalid unicode escape: \\U" + code)
467 growingString += string(rune(intcode))
469 return "", errors.New("invalid escape sequence: \\" + string(l.peek()))
// Non-escaped path: reject raw control characters except newlines in
// multi-line strings.
474 if 0x00 <= r && r <= 0x1F && !(acceptNewLines && (r == '\n' || r == '\r')) {
475 return "", fmt.Errorf("unescaped control character %U", r)
478 growingString += string(r)
486 return "", errors.New("unclosed string")
// lexString lexes a " or """ delimited (escaped) string, emits it as a
// tokenString, and skips past the closing delimiter. Triple quotes enable
// both leading-newline trimming and embedded newlines.
489 func (l *tomlLexer) lexString() tomlLexStateFn {
492 // handle special case for triple-quote
494 discardLeadingNewLine := false
495 acceptNewLines := false
500 discardLeadingNewLine = true
501 acceptNewLines = true
504 str, err := l.lexStringAsString(terminator, discardLeadingNewLine, acceptNewLines)
507 return l.errorf(err.Error())
510 l.emitWithValue(tokenString, str)
511 l.fastForward(len(terminator))
// lexTableKey distinguishes a table header '[' from an array-of-tables
// header '[[' and routes to the matching inside-key state.
516 func (l *tomlLexer) lexTableKey() tomlLexStateFn {
520 // token '[[' signifies an array of tables
522 l.emit(tokenDoubleLeftBracket)
523 return l.lexInsideTableArrayKey
526 l.emit(tokenLeftBracket)
527 return l.lexInsideTableKey
// lexInsideTableArrayKey scans the key inside '[[...]]', emits it as a
// tokenKeyGroupArray (if non-empty) followed by the closing ']]'. A lone ']'
// is rejected. Some branches are elided from this view.
530 func (l *tomlLexer) lexInsideTableArrayKey() tomlLexStateFn {
531 for r := l.peek(); r != eof; r = l.peek() {
534 if l.currentTokenStop > l.currentTokenStart {
535 l.emit(tokenKeyGroupArray)
542 l.emit(tokenDoubleRightBracket)
545 return l.errorf("table array key cannot contain ']'")
550 return l.errorf("unclosed table array key")
// lexInsideTableKey scans the key inside '[...]', emits it as a
// tokenKeyGroup (if non-empty) followed by the closing ']'. Mirrors
// lexInsideTableArrayKey; some branches are elided from this view.
553 func (l *tomlLexer) lexInsideTableKey() tomlLexStateFn {
554 for r := l.peek(); r != eof; r = l.peek() {
557 if l.currentTokenStop > l.currentTokenStart {
558 l.emit(tokenKeyGroup)
561 l.emit(tokenRightBracket)
564 return l.errorf("table key cannot contain ']'")
569 return l.errorf("unclosed table key")
// lexRightBracket emits the ']' closing an array value.
572 func (l *tomlLexer) lexRightBracket() tomlLexStateFn {
574 l.emit(tokenRightBracket)
// lexNumber scans an integer or float (optional sign, digits, at most one
// '.', optional exponent, '_' separators) and emits tokenFloat or
// tokenInteger accordingly. Several branches are elided from this view.
578 func (l *tomlLexer) lexNumber() tomlLexStateFn {
580 if r == '+' || r == '-' {
// A second '.' in the same number is malformed.
590 return l.errorf("cannot have two dots in one float")
593 if !isDigit(l.peek()) {
594 return l.errorf("float cannot end with a dot")
597 } else if next == 'e' || next == 'E' {
// Exponent may carry its own sign.
601 if r == '+' || r == '-' {
604 } else if isDigit(next) {
607 } else if next == '_' {
// A '.' with no digit before it is rejected after the scan loop.
612 if pointSeen && !digitSeen {
613 return l.errorf("cannot start float with a dot")
618 return l.errorf("no digit in that number")
// Floats are anything with a decimal point or an exponent.
620 if pointSeen || expSeen {
// run drives the state machine: start in lexVoid and follow each returned
// state until one returns nil.
628 func (l *tomlLexer) run() {
629 for state := l.lexVoid; state != nil; {
// Compiled once at startup (the enclosing init() is elided from this view).
// Anchored RFC 3339 datetime: date, 'T', time, optional fractional seconds
// (1-9 digits), and a mandatory 'Z' or numeric offset.
635 dateRegexp = regexp.MustCompile(`^\d{1,4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}(\.\d{1,9})?(Z|[+-]\d{2}:\d{2})`)
// lexToml is the package entry point to the lexer: it decodes the input
// bytes to runes, runs the state machine, and returns the token stream.
639 func lexToml(inputBytes []byte) []token {
640 runes := bytes.Runes(inputBytes)
// Pre-size the token slice to avoid repeated early growth on typical inputs.
643 tokens: make([]token, 0, 256),