--- /dev/null
+package filter
+
+import (
+ "fmt"
+ "unicode"
+ "unicode/utf8"
+)
+
+// token identifies the kind of lexical element produced by scanner.Scan.
+type token int
+
+// Token kinds. tokInvalid is the zero value of token.
+const (
+ tokInvalid token = iota
+ tokEOF // end of input
+ tokKeyword // the identifiers AND and OR
+ tokIdent // any other identifier
+ tokString // single-quoted string literal
+ tokInteger // decimal or 0x-prefixed hexadecimal integer
+ tokPunct // one of . ( ) =
+ tokPlaceholder // '$' followed by one or more decimal digits
+)
+
+// String returns a human-readable name for the token kind. Any value
+// outside the declared kinds yields "unknown token".
+func (t token) String() string {
+	names := [...]string{
+		tokInvalid:     "invalid",
+		tokEOF:         "EOF",
+		tokKeyword:     "keyword",
+		tokIdent:       "identifier",
+		tokString:      "string",
+		tokInteger:     "integer",
+		tokPunct:       "punctuation",
+		tokPlaceholder: "placeholder",
+	}
+	if t < 0 || int(t) >= len(names) {
+		return "unknown token"
+	}
+	return names[t]
+}
+
+// A scanner holds the lexer's internal state while it walks over a
+// single input text. It must be set up with init before Scan is called.
+type scanner struct {
+ // immutable state
+ src []byte // source text being scanned
+
+ // scanning state
+ ch rune // current character; -1 once the end of src is reached
+ offset int // byte offset in src of the current character
+ rdOffset int // reading offset (byte position just after the current character)
+}
+
+// init prepares s to scan src from its first byte and loads the first
+// character into s.ch.
+func (s *scanner) init(src []byte) {
+	s.src = src
+	s.offset, s.rdOffset = -1, 0
+	s.next() // position on the first rune (or on EOF for empty input)
+}
+
+const bom = 0xFEFF // byte order mark, always prohibited
+
+// next decodes the rune starting at s.rdOffset into s.ch, records the byte
+// offset where it starts in s.offset, and advances s.rdOffset past it.
+// At end of input it sets s.ch to -1 and s.offset to len(s.src).
+//
+// A NUL byte, an invalid UTF-8 sequence, or a byte order mark is reported
+// through s.error at the byte offset of the offending character.
+func (s *scanner) next() {
+	if s.rdOffset >= len(s.src) {
+		s.offset = len(s.src)
+		s.ch = -1 // eof
+		return
+	}
+
+	s.offset = s.rdOffset
+	r, w := rune(s.src[s.rdOffset]), 1
+	switch {
+	case r == 0:
+		// Report the NUL at its own offset, like every other error here.
+		s.error(s.offset, "illegal character NUL")
+	case r >= utf8.RuneSelf:
+		// Not ASCII: decode the full multi-byte sequence.
+		r, w = utf8.DecodeRune(s.src[s.rdOffset:])
+		if r == utf8.RuneError && w == 1 {
+			// Width 1 distinguishes a decoding failure from a literal U+FFFD.
+			s.error(s.offset, "illegal UTF-8 encoding")
+		} else if r == bom {
+			s.error(s.offset, "illegal byte order mark")
+		}
+	}
+	s.rdOffset += w
+	s.ch = r
+}
+
+// error aborts scanning by panicking with a parseError that carries the
+// byte offset offs and the message msg.
+func (s *scanner) error(offs int, msg string) {
+	failure := parseError{pos: offs, msg: msg}
+	panic(failure)
+}
+
+// isLetter reports whether ch may appear in an identifier as a letter:
+// an ASCII letter, an underscore, or a non-ASCII Unicode letter.
+func isLetter(ch rune) bool {
+	switch {
+	case ch == '_':
+		return true
+	case ch < utf8.RuneSelf:
+		// ASCII range (this also covers the negative EOF marker).
+		return ('a' <= ch && ch <= 'z') || ('A' <= ch && ch <= 'Z')
+	default:
+		return unicode.IsLetter(ch)
+	}
+}
+
+// isDigit reports whether ch is an ASCII decimal digit or a non-ASCII
+// Unicode decimal digit.
+func isDigit(ch rune) bool {
+	if ch < utf8.RuneSelf {
+		// ASCII range (this also covers the negative EOF marker).
+		return '0' <= ch && ch <= '9'
+	}
+	return unicode.IsDigit(ch)
+}
+
+// scanIdentifier consumes the run of letters and digits starting at the
+// current character and returns it as a string.
+func (s *scanner) scanIdentifier() string {
+	start := s.offset
+	for {
+		if !isLetter(s.ch) && !isDigit(s.ch) {
+			break
+		}
+		s.next()
+	}
+	return string(s.src[start:s.offset])
+}
+
+// digitVal returns the numeric value of ch read as a hexadecimal digit
+// (0-15). Any other character yields 16, which is larger than every legal
+// digit value in the bases used by this scanner.
+func digitVal(ch rune) int {
+	if ch >= '0' && ch <= '9' {
+		return int(ch - '0')
+	}
+	// Setting bit 0x20 maps 'A'-'F' onto 'a'-'f' and moves no other
+	// character into that range.
+	if folded := ch | 0x20; folded >= 'a' && folded <= 'f' {
+		return int(folded-'a') + 10
+	}
+	return 16
+}
+
+// scanMantissa consumes consecutive characters that are valid digits in the
+// given base, stopping at the first character that is not.
+func (s *scanner) scanMantissa(base int) {
+	for {
+		if digitVal(s.ch) >= base {
+			return
+		}
+		s.next()
+	}
+}
+
+// scanNumber consumes an integer literal. The caller guarantees that the
+// current character is a decimal digit. Accepted forms are a decimal number
+// without a leading zero, a lone "0", and "0x"/"0X" followed by at least
+// one hexadecimal digit; anything else is reported through s.error.
+func (s *scanner) scanNumber() {
+	if s.ch != '0' {
+		// decimal int
+		s.scanMantissa(10)
+		return
+	}
+
+	start := s.offset
+	s.next()
+	switch {
+	case s.ch == 'x' || s.ch == 'X':
+		// hexadecimal int
+		s.next()
+		s.scanMantissa(16)
+		if s.offset-start <= 2 {
+			// nothing followed the "0x" / "0X" prefix
+			s.error(start, "illegal hexadecimal number")
+		}
+	case digitVal(s.ch) < 10:
+		s.error(start, "illegal leading 0 in number")
+	}
+}
+
+// scanString consumes the remainder of a single-quoted string literal up to
+// and including the closing quote. The opening quote has already been
+// consumed by the caller. A backslash inside the literal or a missing
+// closing quote is reported through s.error at the offset of the opening
+// quote.
+func (s *scanner) scanString() {
+	// The opening "'" is the single byte just before the current character.
+	start := s.offset - 1
+
+	for s.ch >= 0 {
+		cur := s.ch
+		s.next()
+		switch cur {
+		case '\'':
+			return
+		case '\\':
+			s.error(start, "illegal backslash in string literal")
+		}
+	}
+	s.error(start, "string literal not terminated")
+}
+
+// skipWhitespace advances past any run of spaces and tabs.
+func (s *scanner) skipWhitespace() {
+	for {
+		switch s.ch {
+		case ' ', '\t':
+			s.next()
+		default:
+			return
+		}
+	}
+}
+
+// Scan skips leading spaces and tabs and returns the next token: pos is the
+// byte offset of its first character, tok is its kind, and lit is its
+// source text. At end of input it returns tokEOF with an empty literal.
+// Malformed input is reported through s.error.
+func (s *scanner) Scan() (pos int, tok token, lit string) {
+	s.skipWhitespace()
+
+	// The token starts at the current character.
+	pos = s.offset
+	first := s.ch
+
+	if isLetter(first) {
+		lit = s.scanIdentifier()
+		if lit == "AND" || lit == "OR" {
+			return pos, tokKeyword, lit
+		}
+		return pos, tokIdent, lit
+	}
+
+	if '0' <= first && first <= '9' {
+		s.scanNumber()
+		return pos, tokInteger, string(s.src[pos:s.offset])
+	}
+
+	s.next() // always make progress
+	switch first {
+	case -1:
+		return pos, tokEOF, ""
+	case '\'':
+		s.scanString()
+		tok = tokString
+	case '.', '(', ')', '=':
+		tok = tokPunct
+	case '$':
+		s.scanMantissa(10)
+		if s.offset-pos <= 1 {
+			// no digits followed the '$'
+			s.error(pos, "illegal $ character")
+		}
+		tok = tokPlaceholder
+	default:
+		s.error(pos, fmt.Sprintf("illegal character %q", first))
+	}
+	return pos, tok, string(s.src[pos:s.offset])
+}