vendor/github.com/hashicorp/hcl/hcl/scanner/scanner.go

   1 // Package scanner implements a scanner for HCL (HashiCorp Configuration
   2 // Language) source text.
   3 package scanner
   4
   5 import (
   6         "bytes"
   7         "fmt"
   8         "os"
   9         "regexp"
  10         "unicode"
  11         "unicode/utf8"
  12
  13         "github.com/hashicorp/hcl/hcl/token"
  14 )
  15
// eof is the sentinel rune that next and peek return when no further rune
// can be read from the source. Its value is rune(0), i.e. the same as a NUL
// character.
const eof = rune(0)
  18
// Scanner defines a lexical scanner. Create one with New, then call Scan
// repeatedly to obtain tokens.
type Scanner struct {
	buf *bytes.Buffer // Source buffer for advancing and scanning
	src []byte        // Source buffer for immutable access

	// Source Position
	srcPos  token.Pos // current position
	prevPos token.Pos // previous position, restored by unread()

	lastCharLen int // length of last character in bytes
	lastLineLen int // length of last line in characters (for correct column reporting)

	tokStart int // token text start position
	tokEnd   int // token text end  position

	// Error is called for each error encountered. If no Error
	// function is set, the error is reported to os.Stderr.
	Error func(pos token.Pos, msg string)

	// ErrorCount is incremented by one for each error encountered.
	ErrorCount int

	// tokPos is the start position of most recently scanned token; set by
	// Scan. The Filename field is always left untouched by the Scanner.  If
	// an error is reported (via Error) and Position is invalid, the scanner is
	// not inside a token.
	tokPos token.Pos
}
  47
  48 // New creates and initializes a new instance of Scanner using src as
  49 // its source content.
  50 func New(src []byte) *Scanner {
  51         // even though we accept a src, we read from a io.Reader compatible type
  52         // (*bytes.Buffer). So in the future we might easily change it to streaming
  53         // read.
  54         b := bytes.NewBuffer(src)
  55         s := &Scanner{
  56                 buf: b,
  57                 src: src,
  58         }
  59
  60         // srcPosition always starts with 1
  61         s.srcPos.Line = 1
  62         return s
  63 }
  64
  65 // next reads the next rune from the bufferred reader. Returns the rune(0) if
  66 // an error occurs (or io.EOF is returned).
  67 func (s *Scanner) next() rune {
  68         ch, size, err := s.buf.ReadRune()
  69         if err != nil {
  70                 // advance for error reporting
  71                 s.srcPos.Column++
  72                 s.srcPos.Offset += size
  73                 s.lastCharLen = size
  74                 return eof
  75         }
  76
  77         if ch == utf8.RuneError && size == 1 {
  78                 s.srcPos.Column++
  79                 s.srcPos.Offset += size
  80                 s.lastCharLen = size
  81                 s.err("illegal UTF-8 encoding")
  82                 return ch
  83         }
  84
  85         // remember last position
  86         s.prevPos = s.srcPos
  87
  88         s.srcPos.Column++
  89         s.lastCharLen = size
  90         s.srcPos.Offset += size
  91
  92         if ch == '\n' {
  93                 s.srcPos.Line++
  94                 s.lastLineLen = s.srcPos.Column
  95                 s.srcPos.Column = 0
  96         }
  97
  98         // If we see a null character with data left, then that is an error
  99         if ch == '\x00' && s.buf.Len() > 0 {
 100                 s.err("unexpected null character (0x00)")
 101                 return eof
 102         }
 103
 104         // debug
 105         // fmt.Printf("ch: %q, offset:column: %d:%d\n", ch, s.srcPos.Offset, s.srcPos.Column)
 106         return ch
 107 }
 108
 109 // unread unreads the previous read Rune and updates the source position
 110 func (s *Scanner) unread() {
 111         if err := s.buf.UnreadRune(); err != nil {
 112                 panic(err) // this is user fault, we should catch it
 113         }
 114         s.srcPos = s.prevPos // put back last position
 115 }
 116
 117 // peek returns the next rune without advancing the reader.
 118 func (s *Scanner) peek() rune {
 119         peek, _, err := s.buf.ReadRune()
 120         if err != nil {
 121                 return eof
 122         }
 123
 124         s.buf.UnreadRune()
 125         return peek
 126 }
 127
 128 // Scan scans the next token and returns the token.
 129 func (s *Scanner) Scan() token.Token {
 130         ch := s.next()
 131
 132         // skip white space
 133         for isWhitespace(ch) {
 134                 ch = s.next()
 135         }
 136
 137         var tok token.Type
 138
 139         // token text markings
 140         s.tokStart = s.srcPos.Offset - s.lastCharLen
 141
 142         // token position, initial next() is moving the offset by one(size of rune
 143         // actually), though we are interested with the starting point
 144         s.tokPos.Offset = s.srcPos.Offset - s.lastCharLen
 145         if s.srcPos.Column > 0 {
 146                 // common case: last character was not a '\n'
 147                 s.tokPos.Line = s.srcPos.Line
 148                 s.tokPos.Column = s.srcPos.Column
 149         } else {
 150                 // last character was a '\n'
 151                 // (we cannot be at the beginning of the source
 152                 // since we have called next() at least once)
 153                 s.tokPos.Line = s.srcPos.Line - 1
 154                 s.tokPos.Column = s.lastLineLen
 155         }
 156
 157         switch {
 158         case isLetter(ch):
 159                 tok = token.IDENT
 160                 lit := s.scanIdentifier()
 161                 if lit == "true" || lit == "false" {
 162                         tok = token.BOOL
 163                 }
 164         case isDecimal(ch):
 165                 tok = s.scanNumber(ch)
 166         default:
 167                 switch ch {
 168                 case eof:
 169                         tok = token.EOF
 170                 case '"':
 171                         tok = token.STRING
 172                         s.scanString()
 173                 case '#', '/':
 174                         tok = token.COMMENT
 175                         s.scanComment(ch)
 176                 case '.':
 177                         tok = token.PERIOD
 178                         ch = s.peek()
 179                         if isDecimal(ch) {
 180                                 tok = token.FLOAT
 181                                 ch = s.scanMantissa(ch)
 182                                 ch = s.scanExponent(ch)
 183                         }
 184                 case '<':
 185                         tok = token.HEREDOC
 186                         s.scanHeredoc()
 187                 case '[':
 188                         tok = token.LBRACK
 189                 case ']':
 190                         tok = token.RBRACK
 191                 case '{':
 192                         tok = token.LBRACE
 193                 case '}':
 194                         tok = token.RBRACE
 195                 case ',':
 196                         tok = token.COMMA
 197                 case '=':
 198                         tok = token.ASSIGN
 199                 case '+':
 200                         tok = token.ADD
 201                 case '-':
 202                         if isDecimal(s.peek()) {
 203                                 ch := s.next()
 204                                 tok = s.scanNumber(ch)
 205                         } else {
 206                                 tok = token.SUB
 207                         }
 208                 default:
 209                         s.err("illegal char")
 210                 }
 211         }
 212
 213         // finish token ending
 214         s.tokEnd = s.srcPos.Offset
 215
 216         // create token literal
 217         var tokenText string
 218         if s.tokStart >= 0 {
 219                 tokenText = string(s.src[s.tokStart:s.tokEnd])
 220         }
 221         s.tokStart = s.tokEnd // ensure idempotency of tokenText() call
 222
 223         return token.Token{
 224                 Type: tok,
 225                 Pos:  s.tokPos,
 226                 Text: tokenText,
 227         }
 228 }
 229
 230 func (s *Scanner) scanComment(ch rune) {
 231         // single line comments
 232         if ch == '#' || (ch == '/' && s.peek() != '*') {
 233                 if ch == '/' && s.peek() != '/' {
 234                         s.err("expected '/' for comment")
 235                         return
 236                 }
 237
 238                 ch = s.next()
 239                 for ch != '\n' && ch >= 0 && ch != eof {
 240                         ch = s.next()
 241                 }
 242                 if ch != eof && ch >= 0 {
 243                         s.unread()
 244                 }
 245                 return
 246         }
 247
 248         // be sure we get the character after /* This allows us to find comment's
 249         // that are not erminated
 250         if ch == '/' {
 251                 s.next()
 252                 ch = s.next() // read character after "/*"
 253         }
 254
 255         // look for /* - style comments
 256         for {
 257                 if ch < 0 || ch == eof {
 258                         s.err("comment not terminated")
 259                         break
 260                 }
 261
 262                 ch0 := ch
 263                 ch = s.next()
 264                 if ch0 == '*' && ch == '/' {
 265                         break
 266                 }
 267         }
 268 }
 269
 270 // scanNumber scans a HCL number definition starting with the given rune
 271 func (s *Scanner) scanNumber(ch rune) token.Type {
 272         if ch == '0' {
 273                 // check for hexadecimal, octal or float
 274                 ch = s.next()
 275                 if ch == 'x' || ch == 'X' {
 276                         // hexadecimal
 277                         ch = s.next()
 278                         found := false
 279                         for isHexadecimal(ch) {
 280                                 ch = s.next()
 281                                 found = true
 282                         }
 283
 284                         if !found {
 285                                 s.err("illegal hexadecimal number")
 286                         }
 287
 288                         if ch != eof {
 289                                 s.unread()
 290                         }
 291
 292                         return token.NUMBER
 293                 }
 294
 295                 // now it's either something like: 0421(octal) or 0.1231(float)
 296                 illegalOctal := false
 297                 for isDecimal(ch) {
 298                         ch = s.next()
 299                         if ch == '8' || ch == '9' {
 300                                 // this is just a possibility. For example 0159 is illegal, but
 301                                 // 0159.23 is valid. So we mark a possible illegal octal. If
 302                                 // the next character is not a period, we'll print the error.
 303                                 illegalOctal = true
 304                         }
 305                 }
 306
 307                 if ch == 'e' || ch == 'E' {
 308                         ch = s.scanExponent(ch)
 309                         return token.FLOAT
 310                 }
 311
 312                 if ch == '.' {
 313                         ch = s.scanFraction(ch)
 314
 315                         if ch == 'e' || ch == 'E' {
 316                                 ch = s.next()
 317                                 ch = s.scanExponent(ch)
 318                         }
 319                         return token.FLOAT
 320                 }
 321
 322                 if illegalOctal {
 323                         s.err("illegal octal number")
 324                 }
 325
 326                 if ch != eof {
 327                         s.unread()
 328                 }
 329                 return token.NUMBER
 330         }
 331
 332         s.scanMantissa(ch)
 333         ch = s.next() // seek forward
 334         if ch == 'e' || ch == 'E' {
 335                 ch = s.scanExponent(ch)
 336                 return token.FLOAT
 337         }
 338
 339         if ch == '.' {
 340                 ch = s.scanFraction(ch)
 341                 if ch == 'e' || ch == 'E' {
 342                         ch = s.next()
 343                         ch = s.scanExponent(ch)
 344                 }
 345                 return token.FLOAT
 346         }
 347
 348         if ch != eof {
 349                 s.unread()
 350         }
 351         return token.NUMBER
 352 }
 353
 354 // scanMantissa scans the mantissa beginning from the rune. It returns the next
 355 // non decimal rune. It's used to determine wheter it's a fraction or exponent.
 356 func (s *Scanner) scanMantissa(ch rune) rune {
 357         scanned := false
 358         for isDecimal(ch) {
 359                 ch = s.next()
 360                 scanned = true
 361         }
 362
 363         if scanned && ch != eof {
 364                 s.unread()
 365         }
 366         return ch
 367 }
 368
 369 // scanFraction scans the fraction after the '.' rune
 370 func (s *Scanner) scanFraction(ch rune) rune {
 371         if ch == '.' {
 372                 ch = s.peek() // we peek just to see if we can move forward
 373                 ch = s.scanMantissa(ch)
 374         }
 375         return ch
 376 }
 377
 378 // scanExponent scans the remaining parts of an exponent after the 'e' or 'E'
 379 // rune.
 380 func (s *Scanner) scanExponent(ch rune) rune {
 381         if ch == 'e' || ch == 'E' {
 382                 ch = s.next()
 383                 if ch == '-' || ch == '+' {
 384                         ch = s.next()
 385                 }
 386                 ch = s.scanMantissa(ch)
 387         }
 388         return ch
 389 }
 390
 391 // scanHeredoc scans a heredoc string
 392 func (s *Scanner) scanHeredoc() {
 393         // Scan the second '<' in example: '<<EOF'
 394         if s.next() != '<' {
 395                 s.err("heredoc expected second '<', didn't see it")
 396                 return
 397         }
 398
 399         // Get the original offset so we can read just the heredoc ident
 400         offs := s.srcPos.Offset
 401
 402         // Scan the identifier
 403         ch := s.next()
 404
 405         // Indented heredoc syntax
 406         if ch == '-' {
 407                 ch = s.next()
 408         }
 409
 410         for isLetter(ch) || isDigit(ch) {
 411                 ch = s.next()
 412         }
 413
 414         // If we reached an EOF then that is not good
 415         if ch == eof {
 416                 s.err("heredoc not terminated")
 417                 return
 418         }
 419
 420         // Ignore the '\r' in Windows line endings
 421         if ch == '\r' {
 422                 if s.peek() == '\n' {
 423                         ch = s.next()
 424                 }
 425         }
 426
 427         // If we didn't reach a newline then that is also not good
 428         if ch != '\n' {
 429                 s.err("invalid characters in heredoc anchor")
 430                 return
 431         }
 432
 433         // Read the identifier
 434         identBytes := s.src[offs : s.srcPos.Offset-s.lastCharLen]
 435         if len(identBytes) == 0 {
 436                 s.err("zero-length heredoc anchor")
 437                 return
 438         }
 439
 440         var identRegexp *regexp.Regexp
 441         if identBytes[0] == '-' {
 442                 identRegexp = regexp.MustCompile(fmt.Sprintf(`[[:space:]]*%s\z`, identBytes[1:]))
 443         } else {
 444                 identRegexp = regexp.MustCompile(fmt.Sprintf(`[[:space:]]*%s\z`, identBytes))
 445         }
 446
 447         // Read the actual string value
 448         lineStart := s.srcPos.Offset
 449         for {
 450                 ch := s.next()
 451
 452                 // Special newline handling.
 453                 if ch == '\n' {
 454                         // Math is fast, so we first compare the byte counts to see if we have a chance
 455                         // of seeing the same identifier - if the length is less than the number of bytes
 456                         // in the identifier, this cannot be a valid terminator.
 457                         lineBytesLen := s.srcPos.Offset - s.lastCharLen - lineStart
 458                         if lineBytesLen >= len(identBytes) && identRegexp.Match(s.src[lineStart:s.srcPos.Offset-s.lastCharLen]) {
 459                                 break
 460                         }
 461
 462                         // Not an anchor match, record the start of a new line
 463                         lineStart = s.srcPos.Offset
 464                 }
 465
 466                 if ch == eof {
 467                         s.err("heredoc not terminated")
 468                         return
 469                 }
 470         }
 471
 472         return
 473 }
 474
 475 // scanString scans a quoted string
 476 func (s *Scanner) scanString() {
 477         braces := 0
 478         for {
 479                 // '"' opening already consumed
 480                 // read character after quote
 481                 ch := s.next()
 482
 483                 if (ch == '\n' && braces == 0) || ch < 0 || ch == eof {
 484                         s.err("literal not terminated")
 485                         return
 486                 }
 487
 488                 if ch == '"' && braces == 0 {
 489                         break
 490                 }
 491
 492                 // If we're going into a ${} then we can ignore quotes for awhile
 493                 if braces == 0 && ch == '$' && s.peek() == '{' {
 494                         braces++
 495                         s.next()
 496                 } else if braces > 0 && ch == '{' {
 497                         braces++
 498                 }
 499                 if braces > 0 && ch == '}' {
 500                         braces--
 501                 }
 502
 503                 if ch == '\\' {
 504                         s.scanEscape()
 505                 }
 506         }
 507
 508         return
 509 }
 510
 511 // scanEscape scans an escape sequence
 512 func (s *Scanner) scanEscape() rune {
 513         // http://en.cppreference.com/w/cpp/language/escape
 514         ch := s.next() // read character after '/'
 515         switch ch {
 516         case 'a', 'b', 'f', 'n', 'r', 't', 'v', '\\', '"':
 517                 // nothing to do
 518         case '0', '1', '2', '3', '4', '5', '6', '7':
 519                 // octal notation
 520                 ch = s.scanDigits(ch, 8, 3)
 521         case 'x':
 522                 // hexademical notation
 523                 ch = s.scanDigits(s.next(), 16, 2)
 524         case 'u':
 525                 // universal character name
 526                 ch = s.scanDigits(s.next(), 16, 4)
 527         case 'U':
 528                 // universal character name
 529                 ch = s.scanDigits(s.next(), 16, 8)
 530         default:
 531                 s.err("illegal char escape")
 532         }
 533         return ch
 534 }
 535
 536 // scanDigits scans a rune with the given base for n times. For example an
 537 // octal notation \184 would yield in scanDigits(ch, 8, 3)
 538 func (s *Scanner) scanDigits(ch rune, base, n int) rune {
 539         start := n
 540         for n > 0 && digitVal(ch) < base {
 541                 ch = s.next()
 542                 if ch == eof {
 543                         // If we see an EOF, we halt any more scanning of digits
 544                         // immediately.
 545                         break
 546                 }
 547
 548                 n--
 549         }
 550         if n > 0 {
 551                 s.err("illegal char escape")
 552         }
 553
 554         if n != start {
 555                 // we scanned all digits, put the last non digit char back,
 556                 // only if we read anything at all
 557                 s.unread()
 558         }
 559
 560         return ch
 561 }
 562
 563 // scanIdentifier scans an identifier and returns the literal string
 564 func (s *Scanner) scanIdentifier() string {
 565         offs := s.srcPos.Offset - s.lastCharLen
 566         ch := s.next()
 567         for isLetter(ch) || isDigit(ch) || ch == '-' || ch == '.' {
 568                 ch = s.next()
 569         }
 570
 571         if ch != eof {
 572                 s.unread() // we got identifier, put back latest char
 573         }
 574
 575         return string(s.src[offs:s.srcPos.Offset])
 576 }
 577
 578 // recentPosition returns the position of the character immediately after the
 579 // character or token returned by the last call to Scan.
 580 func (s *Scanner) recentPosition() (pos token.Pos) {
 581         pos.Offset = s.srcPos.Offset - s.lastCharLen
 582         switch {
 583         case s.srcPos.Column > 0:
 584                 // common case: last character was not a '\n'
 585                 pos.Line = s.srcPos.Line
 586                 pos.Column = s.srcPos.Column
 587         case s.lastLineLen > 0:
 588                 // last character was a '\n'
 589                 // (we cannot be at the beginning of the source
 590                 // since we have called next() at least once)
 591                 pos.Line = s.srcPos.Line - 1
 592                 pos.Column = s.lastLineLen
 593         default:
 594                 // at the beginning of the source
 595                 pos.Line = 1
 596                 pos.Column = 1
 597         }
 598         return
 599 }
 600
 601 // err prints the error of any scanning to s.Error function. If the function is
 602 // not defined, by default it prints them to os.Stderr
 603 func (s *Scanner) err(msg string) {
 604         s.ErrorCount++
 605         pos := s.recentPosition()
 606
 607         if s.Error != nil {
 608                 s.Error(pos, msg)
 609                 return
 610         }
 611
 612         fmt.Fprintf(os.Stderr, "%s: %s\n", pos, msg)
 613 }
 614
 615 // isHexadecimal returns true if the given rune is a letter
 616 func isLetter(ch rune) bool {
 617         return 'a' <= ch && ch <= 'z' || 'A' <= ch && ch <= 'Z' || ch == '_' || ch >= 0x80 && unicode.IsLetter(ch)
 618 }
 619
 620 // isDigit returns true if the given rune is a decimal digit
 621 func isDigit(ch rune) bool {
 622         return '0' <= ch && ch <= '9' || ch >= 0x80 && unicode.IsDigit(ch)
 623 }
 624
 625 // isDecimal returns true if the given rune is a decimal number
 626 func isDecimal(ch rune) bool {
 627         return '0' <= ch && ch <= '9'
 628 }
 629
 630 // isHexadecimal returns true if the given rune is an hexadecimal number
 631 func isHexadecimal(ch rune) bool {
 632         return '0' <= ch && ch <= '9' || 'a' <= ch && ch <= 'f' || 'A' <= ch && ch <= 'F'
 633 }
 634
 635 // isWhitespace returns true if the rune is a space, tab, newline or carriage return
 636 func isWhitespace(ch rune) bool {
 637         return ch == ' ' || ch == '\t' || ch == '\n' || ch == '\r'
 638 }
 639
 640 // digitVal returns the integer value of a given octal,decimal or hexadecimal rune
 641 func digitVal(ch rune) int {
 642         switch {
 643         case '0' <= ch && ch <= '9':
 644                 return int(ch - '0')
 645         case 'a' <= ch && ch <= 'f':
 646                 return int(ch - 'a' + 10)
 647         case 'A' <= ch && ch <= 'F':
 648                 return int(ch - 'A' + 10)
 649         }
 650         return 16 // larger than any legal digit val
 651 }