1 // Package scanner implements a scanner for HCL (HashiCorp Configuration
2 // Language) source text.
13 "github.com/hashicorp/hcl/hcl/token"
16 // eof represents a marker rune for the end of the reader.
19 // Scanner defines a lexical scanner
21 buf *bytes.Buffer // Source buffer for advancing and scanning
22 src []byte // Source buffer for immutable access
25 srcPos token.Pos // current position
26 prevPos token.Pos // previous position, used for peek() method
28 lastCharLen int // length of last character in bytes
29 lastLineLen int // length of last line in characters (for correct column reporting)
31 tokStart int // token text start position
32 tokEnd int // token text end position
34 // Error is called for each error encountered. If no Error
35 // function is set, the error is reported to os.Stderr.
36 Error func(pos token.Pos, msg string)
38 // ErrorCount is incremented by one for each error encountered.
41 // tokPos is the start position of most recently scanned token; set by
42 // Scan. The Filename field is always left untouched by the Scanner. If
43 // an error is reported (via Error) and Position is invalid, the scanner is
44 // not inside a token.
48 // New creates and initializes a new instance of Scanner using src as
49 // its source content.
50 func New(src []byte) *Scanner {
51 // even though we accept a src, we read from an io.Reader compatible type
52 // (*bytes.Buffer). So in the future we might easily change it to streaming
54 b := bytes.NewBuffer(src)
60 // srcPosition always starts with 1
65 // next reads the next rune from the buffered reader. Returns the rune(0) if
66 // an error occurs (or io.EOF is returned).
67 func (s *Scanner) next() rune {
68 ch, size, err := s.buf.ReadRune()
70 // advance for error reporting
72 s.srcPos.Offset += size
77 if ch == utf8.RuneError && size == 1 {
79 s.srcPos.Offset += size
81 s.err("illegal UTF-8 encoding")
85 // remember last position
90 s.srcPos.Offset += size
94 s.lastLineLen = s.srcPos.Column
98 // If we see a null character with data left, then that is an error
99 if ch == '\x00' && s.buf.Len() > 0 {
100 s.err("unexpected null character (0x00)")
105 // fmt.Printf("ch: %q, offset:column: %d:%d\n", ch, s.srcPos.Offset, s.srcPos.Column)
109 // unread unreads the previously read rune and updates the source position
110 func (s *Scanner) unread() {
111 if err := s.buf.UnreadRune(); err != nil {
112 panic(err) // this is user fault, we should catch it
114 s.srcPos = s.prevPos // put back last position
117 // peek returns the next rune without advancing the reader.
118 func (s *Scanner) peek() rune {
119 peek, _, err := s.buf.ReadRune()
128 // Scan scans the next token and returns the token.
129 func (s *Scanner) Scan() token.Token {
133 for isWhitespace(ch) {
139 // token text markings
140 s.tokStart = s.srcPos.Offset - s.lastCharLen
142 // token position: the initial next() has already advanced the offset by the
143 // size of the rune, but we are interested in the starting point
144 s.tokPos.Offset = s.srcPos.Offset - s.lastCharLen
145 if s.srcPos.Column > 0 {
146 // common case: last character was not a '\n'
147 s.tokPos.Line = s.srcPos.Line
148 s.tokPos.Column = s.srcPos.Column
150 // last character was a '\n'
151 // (we cannot be at the beginning of the source
152 // since we have called next() at least once)
153 s.tokPos.Line = s.srcPos.Line - 1
154 s.tokPos.Column = s.lastLineLen
160 lit := s.scanIdentifier()
161 if lit == "true" || lit == "false" {
165 tok = s.scanNumber(ch)
181 ch = s.scanMantissa(ch)
182 ch = s.scanExponent(ch)
202 if isDecimal(s.peek()) {
204 tok = s.scanNumber(ch)
209 s.err("illegal char")
213 // finish token ending
214 s.tokEnd = s.srcPos.Offset
216 // create token literal
219 tokenText = string(s.src[s.tokStart:s.tokEnd])
221 s.tokStart = s.tokEnd // ensure idempotency of tokenText() call
230 func (s *Scanner) scanComment(ch rune) {
231 // single line comments
232 if ch == '#' || (ch == '/' && s.peek() != '*') {
233 if ch == '/' && s.peek() != '/' {
234 s.err("expected '/' for comment")
239 for ch != '\n' && ch >= 0 && ch != eof {
242 if ch != eof && ch >= 0 {
248 // be sure we get the character after /*. This allows us to find comments
249 // that are not terminated
252 ch = s.next() // read character after "/*"
255 // look for /* - style comments
257 if ch < 0 || ch == eof {
258 s.err("comment not terminated")
264 if ch0 == '*' && ch == '/' {
270 // scanNumber scans a HCL number definition starting with the given rune
271 func (s *Scanner) scanNumber(ch rune) token.Type {
273 // check for hexadecimal, octal or float
275 if ch == 'x' || ch == 'X' {
279 for isHexadecimal(ch) {
285 s.err("illegal hexadecimal number")
295 // now it's either something like: 0421(octal) or 0.1231(float)
296 illegalOctal := false
299 if ch == '8' || ch == '9' {
300 // this is just a possibility. For example 0159 is illegal, but
301 // 0159.23 is valid. So we mark a possible illegal octal. If
302 // the next character is not a period, we'll print the error.
307 if ch == 'e' || ch == 'E' {
308 ch = s.scanExponent(ch)
313 ch = s.scanFraction(ch)
315 if ch == 'e' || ch == 'E' {
317 ch = s.scanExponent(ch)
323 s.err("illegal octal number")
333 ch = s.next() // seek forward
334 if ch == 'e' || ch == 'E' {
335 ch = s.scanExponent(ch)
340 ch = s.scanFraction(ch)
341 if ch == 'e' || ch == 'E' {
343 ch = s.scanExponent(ch)
354 // scanMantissa scans the mantissa beginning from the rune. It returns the next
355 // non decimal rune. It's used to determine whether it's a fraction or exponent.
356 func (s *Scanner) scanMantissa(ch rune) rune {
363 if scanned && ch != eof {
369 // scanFraction scans the fraction after the '.' rune
370 func (s *Scanner) scanFraction(ch rune) rune {
372 ch = s.peek() // we peek just to see if we can move forward
373 ch = s.scanMantissa(ch)
378 // scanExponent scans the remaining parts of an exponent after the 'e' or 'E'
380 func (s *Scanner) scanExponent(ch rune) rune {
381 if ch == 'e' || ch == 'E' {
383 if ch == '-' || ch == '+' {
386 ch = s.scanMantissa(ch)
391 // scanHeredoc scans a heredoc string
392 func (s *Scanner) scanHeredoc() {
393 // Scan the second '<' in example: '<<EOF'
395 s.err("heredoc expected second '<', didn't see it")
399 // Get the original offset so we can read just the heredoc ident
400 offs := s.srcPos.Offset
402 // Scan the identifier
405 // Indented heredoc syntax
410 for isLetter(ch) || isDigit(ch) {
414 // If we reached an EOF then that is not good
416 s.err("heredoc not terminated")
420 // Ignore the '\r' in Windows line endings
422 if s.peek() == '\n' {
427 // If we didn't reach a newline then that is also not good
429 s.err("invalid characters in heredoc anchor")
433 // Read the identifier
434 identBytes := s.src[offs : s.srcPos.Offset-s.lastCharLen]
435 if len(identBytes) == 0 {
436 s.err("zero-length heredoc anchor")
440 var identRegexp *regexp.Regexp
441 if identBytes[0] == '-' {
442 identRegexp = regexp.MustCompile(fmt.Sprintf(`[[:space:]]*%s\z`, identBytes[1:]))
444 identRegexp = regexp.MustCompile(fmt.Sprintf(`[[:space:]]*%s\z`, identBytes))
447 // Read the actual string value
448 lineStart := s.srcPos.Offset
452 // Special newline handling.
454 // Math is fast, so we first compare the byte counts to see if we have a chance
455 // of seeing the same identifier - if the length is less than the number of bytes
456 // in the identifier, this cannot be a valid terminator.
457 lineBytesLen := s.srcPos.Offset - s.lastCharLen - lineStart
458 if lineBytesLen >= len(identBytes) && identRegexp.Match(s.src[lineStart:s.srcPos.Offset-s.lastCharLen]) {
462 // Not an anchor match, record the start of a new line
463 lineStart = s.srcPos.Offset
467 s.err("heredoc not terminated")
475 // scanString scans a quoted string
476 func (s *Scanner) scanString() {
479 // '"' opening already consumed
480 // read character after quote
483 if (ch == '\n' && braces == 0) || ch < 0 || ch == eof {
484 s.err("literal not terminated")
488 if ch == '"' && braces == 0 {
492 // If we're going into a ${} then we can ignore quotes for a while
493 if braces == 0 && ch == '$' && s.peek() == '{' {
496 } else if braces > 0 && ch == '{' {
499 if braces > 0 && ch == '}' {
511 // scanEscape scans an escape sequence
512 func (s *Scanner) scanEscape() rune {
513 // http://en.cppreference.com/w/cpp/language/escape
514 ch := s.next() // read character after '/'
516 case 'a', 'b', 'f', 'n', 'r', 't', 'v', '\\', '"':
518 case '0', '1', '2', '3', '4', '5', '6', '7':
520 ch = s.scanDigits(ch, 8, 3)
522 // hexadecimal notation
523 ch = s.scanDigits(s.next(), 16, 2)
525 // universal character name
526 ch = s.scanDigits(s.next(), 16, 4)
528 // universal character name
529 ch = s.scanDigits(s.next(), 16, 8)
531 s.err("illegal char escape")
536 // scanDigits scans a rune with the given base for n times. For example an
537 // octal notation \184 would result in scanDigits(ch, 8, 3)
538 func (s *Scanner) scanDigits(ch rune, base, n int) rune {
540 for n > 0 && digitVal(ch) < base {
543 // If we see an EOF, we halt any more scanning of digits
551 s.err("illegal char escape")
555 // we scanned all digits, put the last non digit char back,
556 // only if we read anything at all
563 // scanIdentifier scans an identifier and returns the literal string
564 func (s *Scanner) scanIdentifier() string {
565 offs := s.srcPos.Offset - s.lastCharLen
567 for isLetter(ch) || isDigit(ch) || ch == '-' || ch == '.' {
572 s.unread() // we got identifier, put back latest char
575 return string(s.src[offs:s.srcPos.Offset])
578 // recentPosition returns the position of the character immediately after the
579 // character or token returned by the last call to Scan.
580 func (s *Scanner) recentPosition() (pos token.Pos) {
581 pos.Offset = s.srcPos.Offset - s.lastCharLen
583 case s.srcPos.Column > 0:
584 // common case: last character was not a '\n'
585 pos.Line = s.srcPos.Line
586 pos.Column = s.srcPos.Column
587 case s.lastLineLen > 0:
588 // last character was a '\n'
589 // (we cannot be at the beginning of the source
590 // since we have called next() at least once)
591 pos.Line = s.srcPos.Line - 1
592 pos.Column = s.lastLineLen
594 // at the beginning of the source
601 // err reports a scanning error to the s.Error function. If the function is
602 // not defined, it prints the error to os.Stderr by default
603 func (s *Scanner) err(msg string) {
605 pos := s.recentPosition()
612 fmt.Fprintf(os.Stderr, "%s: %s\n", pos, msg)
615 // isLetter returns true if the given rune is a letter
616 func isLetter(ch rune) bool {
617 return 'a' <= ch && ch <= 'z' || 'A' <= ch && ch <= 'Z' || ch == '_' || ch >= 0x80 && unicode.IsLetter(ch)
620 // isDigit returns true if the given rune is a decimal digit
621 func isDigit(ch rune) bool {
622 return '0' <= ch && ch <= '9' || ch >= 0x80 && unicode.IsDigit(ch)
625 // isDecimal returns true if the given rune is a decimal digit
626 func isDecimal(ch rune) bool {
627 return '0' <= ch && ch <= '9'
630 // isHexadecimal returns true if the given rune is a hexadecimal digit
631 func isHexadecimal(ch rune) bool {
632 return '0' <= ch && ch <= '9' || 'a' <= ch && ch <= 'f' || 'A' <= ch && ch <= 'F'
635 // isWhitespace returns true if the rune is a space, tab, newline or carriage return
636 func isWhitespace(ch rune) bool {
637 return ch == ' ' || ch == '\t' || ch == '\n' || ch == '\r'
640 // digitVal returns the integer value of a given octal, decimal or hexadecimal rune
641 func digitVal(ch rune) int {
643 case '0' <= ch && ch <= '9':
645 case 'a' <= ch && ch <= 'f':
646 return int(ch - 'a' + 10)
647 case 'A' <= ch && ch <= 'F':
648 return int(ch - 'A' + 10)
650 return 16 // larger than any legal digit val