package filter import ( "fmt" "unicode" "unicode/utf8" ) type token int const ( tokInvalid token = iota tokEOF tokKeyword tokIdent tokString tokInteger tokPunct tokPlaceholder ) func (t token) String() string { switch t { case tokInvalid: return "invalid" case tokEOF: return "EOF" case tokKeyword: return "keyword" case tokIdent: return "identifier" case tokString: return "string" case tokInteger: return "integer" case tokPunct: return "punctuation" case tokPlaceholder: return "placeholder" } return "unknown token" } // A scanner holds the scanner's internal state while processing // a given text. type scanner struct { // immutable state src []byte // source // scanning state ch rune // current character offset int // character offset rdOffset int // reading offset (position after current character) } func (s *scanner) init(src []byte) { s.rdOffset = 0 s.offset = -1 s.src = src s.next() // advance onto the first input rune } const bom = 0xFEFF // byte order mark, always prohibited // next reads the next Unicode char into s.ch. // s.ch < 0 means end-of-file. func (s *scanner) next() { if s.rdOffset < len(s.src) { s.offset = s.rdOffset r, w := rune(s.src[s.rdOffset]), 1 switch { case r == 0: s.error(s.offset+1, "illegal character NUL") case r >= utf8.RuneSelf: // not ASCII r, w = utf8.DecodeRune(s.src[s.rdOffset:]) if r == utf8.RuneError && w == 1 { s.error(s.offset, "illegal UTF-8 encoding") } else if r == bom { s.error(s.offset, "illegal byte order mark") } } s.rdOffset += w s.ch = r } else { s.offset = len(s.src) s.ch = -1 // eof } } func (s *scanner) error(offs int, msg string) { panic(parseError{pos: offs, msg: msg}) } func isLetter(ch rune) bool { return 'a' <= ch && ch <= 'z' || 'A' <= ch && ch <= 'Z' || ch == '_' || ch >= utf8.RuneSelf && unicode.IsLetter(ch) } func isDigit(ch rune) bool { return '0' <= ch && ch <= '9' || ch >= utf8.RuneSelf && unicode.IsDigit(ch) } func (s *scanner) scanIdentifier() string { offs := s.offset for isLetter(s.ch) || isDigit(s.ch) { s.next() } return string(s.src[offs:s.offset]) } func digitVal(ch rune) int { switch { case '0' <= ch && ch <= '9': return int(ch - '0') case 'a' <= ch && ch <= 'f': return int(ch - 'a' + 10) case 'A' <= ch && ch <= 'F': return int(ch - 'A' + 10) } return 16 // larger than any legal digit val } func (s *scanner) scanMantissa(base int) { for digitVal(s.ch) < base { s.next() } } func (s *scanner) scanNumber() { // digitVal(s.ch) < 10 if s.ch == '0' { // int offs := s.offset s.next() if s.ch == 'x' || s.ch == 'X' { // hexadecimal int s.next() s.scanMantissa(16) if s.offset-offs <= 2 { // only scanned "0x" or "0X" s.error(offs, "illegal hexadecimal number") } } else if digitVal(s.ch) < 10 { s.error(offs, "illegal leading 0 in number") } } else { // decimal int s.scanMantissa(10) } } func (s *scanner) scanString() { // "'" opening already consumed offs := s.offset - 1 for { ch := s.ch if ch < 0 { s.error(offs, "string literal not terminated") break } s.next() if ch == '\'' { break } if ch == '\\' { s.error(offs, "illegal backslash in string literal") } } } func (s *scanner) skipWhitespace() { for s.ch == ' ' || s.ch == '\t' { s.next() } } func (s *scanner) Scan() (pos int, tok token, lit string) { s.skipWhitespace() // current token start pos = s.offset // determine token value switch ch := s.ch; { case isLetter(ch): lit = s.scanIdentifier() switch lit { case "AND", "OR": tok = tokKeyword default: tok = tokIdent } return pos, tok, lit case '0' <= ch && ch <= '9': s.scanNumber() tok = tokInteger default: s.next() // always make progress switch ch { case -1: return pos, tokEOF, "" case '\'': tok = tokString s.scanString() case '.', '(', ')', '=': tok = tokPunct case '$': s.scanMantissa(10) if s.offset-pos <= 1 { s.error(pos, "illegal $ character") } tok = tokPlaceholder default: s.error(pos, fmt.Sprintf("illegal character %q", ch)) } } lit = string(s.src[pos:s.offset]) return }