blockchain/query/filter/scanner.go

   1 package filter
   2
   3 import (
   4         "fmt"
   5         "unicode"
   6         "unicode/utf8"
   7 )
   8
   9 type token int
  10
  11 const (
  12         tokInvalid token = iota
  13         tokEOF
  14         tokKeyword
  15         tokIdent
  16         tokString
  17         tokInteger
  18         tokPunct
  19         tokPlaceholder
  20 )
  21
  22 func (t token) String() string {
  23         switch t {
  24         case tokInvalid:
  25                 return "invalid"
  26         case tokEOF:
  27                 return "EOF"
  28         case tokKeyword:
  29                 return "keyword"
  30         case tokIdent:
  31                 return "identifier"
  32         case tokString:
  33                 return "string"
  34         case tokInteger:
  35                 return "integer"
  36         case tokPunct:
  37                 return "punctuation"
  38         case tokPlaceholder:
  39                 return "placeholder"
  40         }
  41         return "unknown token"
  42 }
  43
  44 // A scanner holds the scanner's internal state while processing
  45 // a given text.
  46 type scanner struct {
  47         // immutable state
  48         src []byte // source
  49
  50         // scanning state
  51         ch       rune // current character
  52         offset   int  // character offset
  53         rdOffset int  // reading offset (position after current character)
  54 }
  55
  56 func (s *scanner) init(src []byte) {
  57         s.rdOffset = 0
  58         s.offset = -1
  59         s.src = src
  60         s.next() // advance onto the first input rune
  61 }
  62
  63 const bom = 0xFEFF // byte order mark, always prohibited
  64
  65 // next reads the next Unicode char into s.ch.
  66 // s.ch < 0 means end-of-file.
  67 func (s *scanner) next() {
  68         if s.rdOffset < len(s.src) {
  69                 s.offset = s.rdOffset
  70                 r, w := rune(s.src[s.rdOffset]), 1
  71                 switch {
  72                 case r == 0:
  73                         s.error(s.offset+1, "illegal character NUL")
  74                 case r >= utf8.RuneSelf:
  75                         // not ASCII
  76                         r, w = utf8.DecodeRune(s.src[s.rdOffset:])
  77                         if r == utf8.RuneError && w == 1 {
  78                                 s.error(s.offset, "illegal UTF-8 encoding")
  79                         } else if r == bom {
  80                                 s.error(s.offset, "illegal byte order mark")
  81                         }
  82                 }
  83                 s.rdOffset += w
  84                 s.ch = r
  85         } else {
  86                 s.offset = len(s.src)
  87                 s.ch = -1 // eof
  88         }
  89 }
  90
  91 func (s *scanner) error(offs int, msg string) {
  92         panic(parseError{pos: offs, msg: msg})
  93 }
  94
  95 func isLetter(ch rune) bool {
  96         return 'a' <= ch && ch <= 'z' || 'A' <= ch && ch <= 'Z' || ch == '_' || ch >= utf8.RuneSelf && unicode.IsLetter(ch)
  97 }
  98
  99 func isDigit(ch rune) bool {
 100         return '0' <= ch && ch <= '9' || ch >= utf8.RuneSelf && unicode.IsDigit(ch)
 101 }
 102
 103 func (s *scanner) scanIdentifier() string {
 104         offs := s.offset
 105         for isLetter(s.ch) || isDigit(s.ch) {
 106                 s.next()
 107         }
 108         return string(s.src[offs:s.offset])
 109 }
 110
 111 func digitVal(ch rune) int {
 112         switch {
 113         case '0' <= ch && ch <= '9':
 114                 return int(ch - '0')
 115         case 'a' <= ch && ch <= 'f':
 116                 return int(ch - 'a' + 10)
 117         case 'A' <= ch && ch <= 'F':
 118                 return int(ch - 'A' + 10)
 119         }
 120         return 16 // larger than any legal digit val
 121 }
 122
 123 func (s *scanner) scanMantissa(base int) {
 124         for digitVal(s.ch) < base {
 125                 s.next()
 126         }
 127 }
 128
 129 func (s *scanner) scanNumber() {
 130         // digitVal(s.ch) < 10
 131         if s.ch == '0' {
 132                 // int
 133                 offs := s.offset
 134                 s.next()
 135                 if s.ch == 'x' || s.ch == 'X' {
 136                         // hexadecimal int
 137                         s.next()
 138                         s.scanMantissa(16)
 139                         if s.offset-offs <= 2 {
 140                                 // only scanned "0x" or "0X"
 141                                 s.error(offs, "illegal hexadecimal number")
 142                         }
 143                 } else if digitVal(s.ch) < 10 {
 144                         s.error(offs, "illegal leading 0 in number")
 145                 }
 146         } else {
 147                 // decimal int
 148                 s.scanMantissa(10)
 149         }
 150 }
 151
 152 func (s *scanner) scanString() {
 153         // "'" opening already consumed
 154         offs := s.offset - 1
 155
 156         for {
 157                 ch := s.ch
 158                 if ch < 0 {
 159                         s.error(offs, "string literal not terminated")
 160                         break
 161                 }
 162                 s.next()
 163                 if ch == '\'' {
 164                         break
 165                 }
 166                 if ch == '\\' {
 167                         s.error(offs, "illegal backslash in string literal")
 168                 }
 169         }
 170 }
 171
 172 func (s *scanner) skipWhitespace() {
 173         for s.ch == ' ' || s.ch == '\t' {
 174                 s.next()
 175         }
 176 }
 177
 178 func (s *scanner) Scan() (pos int, tok token, lit string) {
 179         s.skipWhitespace()
 180
 181         // current token start
 182         pos = s.offset
 183
 184         // determine token value
 185         switch ch := s.ch; {
 186         case isLetter(ch):
 187                 lit = s.scanIdentifier()
 188                 switch lit {
 189                 case "AND", "OR":
 190                         tok = tokKeyword
 191                 default:
 192                         tok = tokIdent
 193                 }
 194                 return pos, tok, lit
 195         case '0' <= ch && ch <= '9':
 196                 s.scanNumber()
 197                 tok = tokInteger
 198         default:
 199                 s.next() // always make progress
 200                 switch ch {
 201                 case -1:
 202                         return pos, tokEOF, ""
 203                 case '\'':
 204                         tok = tokString
 205                         s.scanString()
 206                 case '.', '(', ')', '=':
 207                         tok = tokPunct
 208                 case '$':
 209                         s.scanMantissa(10)
 210                         if s.offset-pos <= 1 {
 211                                 s.error(pos, "illegal $ character")
 212                         }
 213                         tok = tokPlaceholder
 214                 default:
 215                         s.error(pos, fmt.Sprintf("illegal character %q", ch))
 216                 }
 217         }
 218         lit = string(s.src[pos:s.offset])
 219         return
 220 }