3 // Written using the principles developed by Rob Pike in
4 // http://www.youtube.com/watch?v=HxaD_trXwRE
// dateRegexp matches an RFC 3339 datetime at the start of the remaining
// input; it is compiled once at startup and used by lexRvalue's date probe.
17 var dateRegexp *regexp.Regexp
19 // Define state functions
// tomlLexStateFn is one state of the lexer: it consumes some input, emits
// tokens as a side effect, and returns the next state to run — or nil to
// stop the machine (see run()). Classic Pike-style state-function lexing.
20 type tomlLexStateFn func() tomlLexStateFn
23 type tomlLexer struct {
// input holds the entire document decoded to runes (see lexToml), so
// multi-byte UTF-8 is handled uniformly by index arithmetic.
// NOTE(review): the remaining fields (inputIdx, currentTokenStart/Stop,
// line/col, endbufferLine/Col, tokens) are elided from this view.
25 input []rune // Textual source
36 // Basic read operations on input
// read returns the rune at the cursor; next presumably also advances the
// cursor and position bookkeeping — bodies elided from this view, TODO confirm.
38 func (l *tomlLexer) read() rune {
50 func (l *tomlLexer) next() rune {
59 func (l *tomlLexer) ignore() {
// Discard the pending (not-yet-emitted) input: collapse the token window
// and move the recorded token start position up to the current read position.
60 l.currentTokenStart = l.currentTokenStop
61 l.line = l.endbufferLine
62 l.col = l.endbufferCol
// skip consumes and discards a single rune — presumably next() followed by
// ignore(); body elided from this view, TODO confirm.
65 func (l *tomlLexer) skip() {
// fastForward advances the cursor by n runes (used after regexp/lookahead
// matches such as the date probe in lexRvalue).
70 func (l *tomlLexer) fastForward(n int) {
71 for i := 0; i < n; i++ {
// emitWithValue appends a token of type t carrying an explicit value,
// stamped with the position where the current token started.
76 func (l *tomlLexer) emitWithValue(t tokenType, value string) {
77 l.tokens = append(l.tokens, token{
78 Position: Position{l.line, l.col},
// emit is emitWithValue with the token text taken verbatim from the current
// input window [currentTokenStart, currentTokenStop).
85 func (l *tomlLexer) emit(t tokenType) {
86 l.emitWithValue(t, string(l.input[l.currentTokenStart:l.currentTokenStop]))
// peek returns the rune at the cursor without advancing.
89 func (l *tomlLexer) peek() rune {
90 if l.inputIdx >= len(l.input) {
// (elided branch presumably returns eof at end of input — TODO confirm)
93 return l.input[l.inputIdx]
// peekString returns up to size upcoming runes as a string, clamped at end
// of input, without advancing the cursor.
96 func (l *tomlLexer) peekString(size int) string {
97 maxIdx := len(l.input)
98 upperIdx := l.inputIdx + size // FIXME: potential overflow
99 if upperIdx > maxIdx {
// (elided branch presumably clamps upperIdx to maxIdx)
102 return string(l.input[l.inputIdx:upperIdx])
// follow reports whether the upcoming input begins with next, without
// consuming anything.
105 func (l *tomlLexer) follow(next string) bool {
106 return next == l.peekString(len(next))
// errorf records an error token at the current position. Returning a state
// fn lets callers write `return l.errorf(...)`; presumably it returns nil so
// run() halts — elided here, TODO confirm.
111 func (l *tomlLexer) errorf(format string, args ...interface{}) tomlLexStateFn {
112 l.tokens = append(l.tokens, token{
113 Position: Position{l.line, l.col},
115 val: fmt.Sprintf(format, args...),
// lexVoid is the top-level state between expressions: it dispatches on the
// next rune to comments, table headers, keys, etc. Most branches are elided
// from this view.
122 func (l *tomlLexer) lexVoid() tomlLexStateFn {
129 return l.lexComment(l.lexVoid)
147 if isKeyStartChar(next) {
// lexRvalue lexes the right-hand side of `key = ...`: it dispatches on the
// next rune to the specific value lexer (strings, arrays, inline tables,
// booleans, dates, numbers). Several branches are elided from this view.
161 func (l *tomlLexer) lexRvalue() tomlLexStateFn {
166 return l.errorf("cannot start float with a dot")
171 return l.lexLeftBracket
174 return l.lexRightBracket
176 return l.lexLeftCurlyBrace
178 return l.lexRightCurlyBrace
180 return l.lexComment(l.lexRvalue)
184 return l.lexLiteralString
196 return l.errorf("cannot start number with underscore")
199 if l.follow("true") {
203 if l.follow("false") {
// Date detection: probe a bounded lookahead with the anchored dateRegexp and
// consume exactly the matched prefix. The 35-rune window is presumably sized
// to the longest matchable RFC 3339 form — TODO confirm.
217 possibleDate := l.peekString(35)
218 dateMatch := dateRegexp.FindString(possibleDate)
220 l.fastForward(len(dateMatch))
// Not a date: signs and digits start numbers; other alphanumerics are errors
// caught below.
224 if next == '+' || next == '-' || isDigit(next) {
228 if isAlphanumeric(next) {
232 return l.errorf("no value can start with %c", next)
// lexLeftCurlyBrace emits the '{' opening an inline table.
239 func (l *tomlLexer) lexLeftCurlyBrace() tomlLexStateFn {
241 l.emit(tokenLeftCurlyBrace)
// lexRightCurlyBrace emits the '}' closing an inline table.
245 func (l *tomlLexer) lexRightCurlyBrace() tomlLexStateFn {
247 l.emit(tokenRightCurlyBrace)
// Single-lexeme states (bodies elided from this view): each presumably
// consumes its lexeme, emits the corresponding token, and returns to the
// appropriate parent state — TODO confirm.
251 func (l *tomlLexer) lexDate() tomlLexStateFn {
256 func (l *tomlLexer) lexTrue() tomlLexStateFn {
262 func (l *tomlLexer) lexFalse() tomlLexStateFn {
268 func (l *tomlLexer) lexEqual() tomlLexStateFn {
274 func (l *tomlLexer) lexComma() tomlLexStateFn {
// lexKey accumulates a key (bare or quoted segments) into growingString and
// emits it as a single tokenKey. Quoted segments are re-wrapped in `"` so a
// later consumer can distinguish them from bare characters.
280 func (l *tomlLexer) lexKey() tomlLexStateFn {
// '\n'/'\r' are admitted by the loop condition only so the error branch
// below can report them explicitly instead of silently stopping.
283 for r := l.peek(); isKeyChar(r) || r == '\n' || r == '\r'; r = l.peek() {
286 str, err := l.lexStringAsString(`"`, false, true)
288 return l.errorf(err.Error())
290 growingString += `"` + str + `"`
293 } else if r == '\n' {
294 return l.errorf("keys cannot contain new lines")
295 } else if isSpace(r) {
297 } else if !isValidBareChar(r) {
298 return l.errorf("keys cannot contain %c character", r)
300 growingString += string(r)
303 l.emitWithValue(tokenKey, growingString)
// lexComment returns a state fn (closure) that consumes input up to the next
// newline or EOF, then resumes previousState. CRLF is recognized as a line
// terminator as well.
307 func (l *tomlLexer) lexComment(previousState tomlLexStateFn) tomlLexStateFn {
308 return func() tomlLexStateFn {
309 for next := l.peek(); next != '\n' && next != eof; next = l.peek() {
310 if next == '\r' && l.follow("\r\n") {
// lexLeftBracket emits the '[' opening an array value.
320 func (l *tomlLexer) lexLeftBracket() tomlLexStateFn {
322 l.emit(tokenLeftBracket)
// lexLiteralStringAsString scans a literal (single-quoted) string up to
// terminator and returns its contents. No escape processing is performed.
// Per TOML, a newline immediately after an opening ''' is dropped when
// discardLeadingNewLine is set.
326 func (l *tomlLexer) lexLiteralStringAsString(terminator string, discardLeadingNewLine bool) (string, error) {
329 if discardLeadingNewLine {
330 if l.follow("\r\n") {
333 } else if l.peek() == '\n' {
338 // find end of string
340 if l.follow(terminator) {
341 return growingString, nil
348 growingString += string(l.next())
// Reaching EOF without seeing the terminator is an error.
351 return "", errors.New("unclosed string")
// lexLiteralString lexes a ' or ''' delimited string, emits it as a
// tokenString, and skips past the closing delimiter.
354 func (l *tomlLexer) lexLiteralString() tomlLexStateFn {
357 // handle special case for triple-quote
359 discardLeadingNewLine := false
364 discardLeadingNewLine = true
367 str, err := l.lexLiteralStringAsString(terminator, discardLeadingNewLine)
369 return l.errorf(err.Error())
372 l.emitWithValue(tokenString, str)
373 l.fastForward(len(terminator))
378 // Lex a string and return the results as a string.
379 // Terminator is the substring indicating the end of the token.
380 // The resulting string does not include the terminator.
// Unlike lexLiteralStringAsString, this handles backslash escapes; newlines
// in the body are only legal when acceptNewLines is set (multi-line strings).
381 func (l *tomlLexer) lexStringAsString(terminator string, discardLeadingNewLine, acceptNewLines bool) (string, error) {
384 if discardLeadingNewLine {
385 if l.follow("\r\n") {
388 } else if l.peek() == '\n' {
394 if l.follow(terminator) {
395 return growingString, nil
// Escape handling (dispatch on the character after '\', elided here):
408 // skip all whitespace chars following backslash
// Line-ending backslash: per TOML, trim all following whitespace/newlines.
409 for strings.ContainsRune("\r\n\t ", l.peek()) {
413 growingString += "\""
416 growingString += "\n"
419 growingString += "\b"
422 growingString += "\f"
428 growingString += "\t"
431 growingString += "\r"
434 growingString += "\\"
// \uXXXX: exactly four hex digits, decoded to a rune.
439 for i := 0; i < 4; i++ {
442 return "", errors.New("unfinished unicode escape")
445 code = code + string(c)
447 intcode, err := strconv.ParseInt(code, 16, 32)
449 return "", errors.New("invalid unicode escape: \\u" + code)
451 growingString += string(rune(intcode))
// \UXXXXXXXX: exactly eight hex digits, decoded to a rune.
455 for i := 0; i < 8; i++ {
458 return "", errors.New("unfinished unicode escape")
461 code = code + string(c)
463 intcode, err := strconv.ParseInt(code, 16, 64)
465 return "", errors.New("invalid unicode escape: \\U" + code)
467 growingString += string(rune(intcode))
469 return "", errors.New("invalid escape sequence: \\" + string(l.peek()))
// Non-escaped path: reject raw control characters except newlines in
// multi-line strings.
474 if 0x00 <= r && r <= 0x1F && !(acceptNewLines && (r == '\n' || r == '\r')) {
475 return "", fmt.Errorf("unescaped control character %U", r)
478 growingString += string(r)
486 return "", errors.New("unclosed string")
// lexString lexes a " or """ delimited (escaped) string, emits it as a
// tokenString, and skips past the closing delimiter. Triple quotes enable
// both leading-newline trimming and embedded newlines.
489 func (l *tomlLexer) lexString() tomlLexStateFn {
492 // handle special case for triple-quote
494 discardLeadingNewLine := false
495 acceptNewLines := false
500 discardLeadingNewLine = true
501 acceptNewLines = true
504 str, err := l.lexStringAsString(terminator, discardLeadingNewLine, acceptNewLines)
507 return l.errorf(err.Error())
510 l.emitWithValue(tokenString, str)
511 l.fastForward(len(terminator))
// lexTableKey distinguishes a table header '[' from an array-of-tables
// header '[[' and routes to the matching inside-key state.
516 func (l *tomlLexer) lexTableKey() tomlLexStateFn {
520 // token '[[' signifies an array of tables
522 l.emit(tokenDoubleLeftBracket)
523 return l.lexInsideTableArrayKey
526 l.emit(tokenLeftBracket)
527 return l.lexInsideTableKey
// lexInsideTableArrayKey scans the key inside '[[...]]', emits it as a
// tokenKeyGroupArray (if non-empty) followed by the closing ']]'. A lone ']'
// is rejected. Some branches are elided from this view.
530 func (l *tomlLexer) lexInsideTableArrayKey() tomlLexStateFn {
531 for r := l.peek(); r != eof; r = l.peek() {
534 if l.currentTokenStop > l.currentTokenStart {
535 l.emit(tokenKeyGroupArray)
542 l.emit(tokenDoubleRightBracket)
545 return l.errorf("table array key cannot contain ']'")
550 return l.errorf("unclosed table array key")
// lexInsideTableKey scans the key inside '[...]', emits it as a
// tokenKeyGroup (if non-empty) followed by the closing ']'. Mirrors
// lexInsideTableArrayKey; some branches are elided from this view.
553 func (l *tomlLexer) lexInsideTableKey() tomlLexStateFn {
554 for r := l.peek(); r != eof; r = l.peek() {
557 if l.currentTokenStop > l.currentTokenStart {
558 l.emit(tokenKeyGroup)
561 l.emit(tokenRightBracket)
564 return l.errorf("table key cannot contain ']'")
569 return l.errorf("unclosed table key")
// lexRightBracket emits the ']' closing an array value.
572 func (l *tomlLexer) lexRightBracket() tomlLexStateFn {
574 l.emit(tokenRightBracket)
// lexNumber scans an integer or float (optional sign, digits, at most one
// '.', optional exponent, '_' separators) and emits tokenFloat or
// tokenInteger accordingly. Several branches are elided from this view.
578 func (l *tomlLexer) lexNumber() tomlLexStateFn {
580 if r == '+' || r == '-' {
// A second '.' in the same number is malformed.
590 return l.errorf("cannot have two dots in one float")
593 if !isDigit(l.peek()) {
594 return l.errorf("float cannot end with a dot")
597 } else if next == 'e' || next == 'E' {
// Exponent may carry its own sign.
601 if r == '+' || r == '-' {
604 } else if isDigit(next) {
607 } else if next == '_' {
// A '.' with no digit before it is rejected after the scan loop.
612 if pointSeen && !digitSeen {
613 return l.errorf("cannot start float with a dot")
618 return l.errorf("no digit in that number")
// Floats are anything with a decimal point or an exponent.
620 if pointSeen || expSeen {
// run drives the state machine: start in lexVoid and follow each returned
// state until one returns nil.
628 func (l *tomlLexer) run() {
629 for state := l.lexVoid; state != nil; {
// Compiled once at startup (the enclosing init() is elided from this view).
// Anchored RFC 3339 datetime: date, 'T', time, optional fractional seconds
// (1-9 digits), and a mandatory 'Z' or numeric offset.
635 dateRegexp = regexp.MustCompile(`^\d{1,4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}(\.\d{1,9})?(Z|[+-]\d{2}:\d{2})`)
// lexToml is the package entry point to the lexer: it decodes the input
// bytes to runes, runs the state machine, and returns the token stream.
639 func lexToml(inputBytes []byte) []token {
640 runes := bytes.Runes(inputBytes)
// Pre-size the token slice to avoid repeated early growth on typical inputs.
643 tokens: make([]token, 0, 256),