scanner.go 6.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315
  1. package mar
  2. import (
  3. "bytes"
  4. "strconv"
  5. "strings"
  6. "unicode/utf8"
  7. )
  8. // Scanner is a marionette DSL tokenizer.
  9. type Scanner struct {
  10. i int
  11. data []byte
  12. pos Pos
  13. }
  14. // NewScanner returns a new instance of Scanner.
  15. func NewScanner(data []byte) *Scanner {
  16. data = bytes.Replace(data, []byte{0}, []byte("\uFFFD"), -1)
  17. data = bytes.Replace(data, []byte{'\f'}, []byte{'\n'}, -1)
  18. data = bytes.Replace(data, []byte{'\r', '\n'}, []byte{'\n'}, -1)
  19. return &Scanner{data: data}
  20. }
  21. // Scan returns the next token from the reader.
  22. func (s *Scanner) Scan() (tok Token, lit string, pos Pos) {
  23. for {
  24. // Special handling for whitespace, numbers, strings, & names.
  25. ch := s.peek()
  26. switch {
  27. case isWhitespace(ch):
  28. return s.scanWhitespace()
  29. case isDigit(ch) || ch == '-':
  30. return s.scanNumber()
  31. case ch == '"' || ch == '\'':
  32. return s.scanString()
  33. case isNameStart(ch):
  34. return s.scanIdent()
  35. }
  36. // Check against individual code points next.
  37. pos = s.pos
  38. switch ch := s.read(); ch {
  39. case eof:
  40. return EOF, "", pos
  41. case ',':
  42. return COMMA, string(ch), pos
  43. case ':':
  44. return COLON, string(ch), pos
  45. case '(':
  46. return LPAREN, string(ch), pos
  47. case ')':
  48. return RPAREN, string(ch), pos
  49. case '.':
  50. return DOT, string(ch), pos
  51. case '#':
  52. return HASH, string(ch), pos
  53. default:
  54. return ILLEGAL, string(ch), pos
  55. }
  56. }
  57. }
  58. // ScanIgnoreWhitespace returns the next non-whitespace, non-comment token.
  59. func (s *Scanner) ScanIgnoreWhitespace() (tok Token, lit string, pos Pos) {
  60. for {
  61. if tok, lit, pos = s.Scan(); tok == HASH {
  62. s.scanUntilNewline()
  63. } else if tok != WS {
  64. return tok, lit, pos
  65. }
  66. }
  67. }
  68. // Peek returns the next token without moving the scanner forward.
  69. func (s *Scanner) Peek() (tok Token, lit string, pos Pos) {
  70. i, prev := s.i, s.pos
  71. tok, lit, pos = s.Scan()
  72. s.i, s.pos = i, prev
  73. return tok, lit, pos
  74. }
  75. // PeekIgnoreWhitespace returns the next non-whitespace, non-comment token without moving the scanner forward.
  76. func (s *Scanner) PeekIgnoreWhitespace() (tok Token, lit string, pos Pos) {
  77. i, prev := s.i, s.pos
  78. for {
  79. if tok, lit, pos = s.Scan(); tok == HASH {
  80. s.scanUntilNewline()
  81. } else if tok != WS {
  82. s.i, s.pos = i, prev
  83. return tok, lit, pos
  84. }
  85. }
  86. }
  87. // scanWhitespace consumes the current code point and all subsequent whitespace.
  88. func (s *Scanner) scanWhitespace() (tok Token, lit string, pos Pos) {
  89. pos = s.pos
  90. var buf bytes.Buffer
  91. for ch := s.peek(); isWhitespace(ch); ch = s.peek() {
  92. buf.WriteRune(s.read())
  93. }
  94. return WS, buf.String(), pos
  95. }
  96. // scanUntilNewline consumes all code points up to and including the next newline or EOF.
  97. func (s *Scanner) scanUntilNewline() {
  98. for ch := s.read(); ch != '\n' && ch != eof; ch = s.read() {
  99. }
  100. }
  101. // scanString consumes a quoted string.
  102. func (s *Scanner) scanString() (tok Token, lit string, pos Pos) {
  103. pos = s.pos
  104. ending := s.read()
  105. var buf bytes.Buffer
  106. for {
  107. if ch := s.peek(); ch == eof {
  108. return ILLEGAL, "", pos
  109. }
  110. switch ch := s.read(); ch {
  111. case ending:
  112. return STRING, buf.String(), pos
  113. case '\\':
  114. switch next := s.peek(); next {
  115. case '\\':
  116. buf.WriteRune(s.read())
  117. case '\'':
  118. buf.WriteRune(s.read())
  119. case '"':
  120. buf.WriteRune(s.read())
  121. case 'a':
  122. s.read()
  123. buf.WriteRune('\a')
  124. case 'b':
  125. s.read()
  126. buf.WriteRune('\b')
  127. case 'f':
  128. s.read()
  129. buf.WriteRune('\f')
  130. case 'n':
  131. s.read()
  132. buf.WriteRune('\n')
  133. case 'r':
  134. s.read()
  135. buf.WriteRune('\r')
  136. case 't':
  137. s.read()
  138. buf.WriteRune('\t')
  139. case 'v':
  140. s.read()
  141. buf.WriteRune('\v')
  142. case 'o':
  143. s.read()
  144. buf.WriteRune(rune(s.readOctal()))
  145. case 'x':
  146. s.read()
  147. buf.WriteRune(rune(s.readHex()))
  148. default:
  149. buf.WriteRune('\\')
  150. }
  151. default:
  152. buf.WriteRune(ch)
  153. }
  154. }
  155. }
  156. // scanNumber consumes a number.
  157. func (s *Scanner) scanNumber() (tok Token, lit string, pos Pos) {
  158. pos = s.pos
  159. // If initial code point is + or - then store it.
  160. var buf bytes.Buffer
  161. switch ch := s.peek(); ch {
  162. case '+', '-':
  163. buf.WriteRune(s.read())
  164. }
  165. // Read as many digits as possible.
  166. s.scanDigits(&buf)
  167. // If next code points are a full stop and digit then consume them.
  168. if next := s.peek(); next == '.' {
  169. buf.WriteRune(s.read())
  170. s.scanDigits(&buf)
  171. return FLOAT, buf.String(), pos
  172. }
  173. return INTEGER, buf.String(), pos
  174. }
  175. // scanDigits consume a contiguous series of digits.
  176. func (s *Scanner) scanDigits(buf *bytes.Buffer) {
  177. for ch := s.peek(); isDigit(ch); ch = s.peek() {
  178. buf.WriteRune(s.read())
  179. }
  180. }
  181. // readOctal reads and parses a stream of octal digits.
  182. func (s *Scanner) readOctal() int {
  183. var buf bytes.Buffer
  184. for ch := s.peek(); isOctal(ch); ch = s.peek() {
  185. buf.WriteRune(s.read())
  186. }
  187. i, _ := strconv.ParseInt(buf.String(), 8, 64)
  188. return int(i)
  189. }
  190. // readHex reads and parses a stream of hex digits.
  191. func (s *Scanner) readHex() int {
  192. var buf bytes.Buffer
  193. for ch := s.peek(); isHex(ch); ch = s.peek() {
  194. buf.WriteRune(s.read())
  195. }
  196. i, _ := strconv.ParseInt(buf.String(), 16, 64)
  197. return int(i)
  198. }
  199. // scanIdent consumes an identifier token.
  200. func (s *Scanner) scanIdent() (tok Token, lit string, pos Pos) {
  201. pos = s.pos
  202. var buf bytes.Buffer
  203. for ch := s.peek(); isName(ch); ch = s.peek() {
  204. buf.WriteRune(s.read())
  205. }
  206. lit = buf.String()
  207. switch strings.ToLower(lit) {
  208. case "action":
  209. return ACTION, lit, pos
  210. case "client":
  211. return CLIENT, lit, pos
  212. case "if":
  213. return IF, lit, pos
  214. case "end":
  215. return END, lit, pos
  216. case "null":
  217. return NULL, lit, pos
  218. case "regex_match_incoming":
  219. return REGEX_MATCH_INCOMING, lit, pos
  220. case "server":
  221. return SERVER, lit, pos
  222. case "start":
  223. return START, lit, pos
  224. default:
  225. return IDENT, buf.String(), pos
  226. }
  227. }
  228. func (s *Scanner) read() rune {
  229. if s.i >= len(s.data) {
  230. return eof
  231. }
  232. ch, sz := utf8.DecodeRune(s.data[s.i:])
  233. s.i += sz
  234. // Track scanner position.
  235. if ch == '\n' {
  236. s.pos.Line++
  237. s.pos.Char = 0
  238. } else {
  239. s.pos.Char++
  240. }
  241. return ch
  242. }
  243. func (s *Scanner) peek() rune {
  244. if s.i >= len(s.data) {
  245. return eof
  246. }
  247. ch, _ := utf8.DecodeRune(s.data[s.i:])
  248. return ch
  249. }
  // isWhitespace reports whether ch is a space, a tab, or a newline.
  func isWhitespace(ch rune) bool {
  	switch ch {
  	case ' ', '\t', '\n':
  		return true
  	}
  	return false
  }
  // isLetter reports whether ch is an ASCII letter, in either case.
  func isLetter(ch rune) bool {
  	switch {
  	case 'a' <= ch && ch <= 'z':
  		return true
  	case 'A' <= ch && ch <= 'Z':
  		return true
  	default:
  		return false
  	}
  }
  // isDigit reports whether ch is one of the ASCII decimal digits '0' to '9'.
  func isDigit(ch rune) bool {
  	return '0' <= ch && ch <= '9'
  }
  // isOctal reports whether ch is one of the octal digits '0' to '7'.
  func isOctal(ch rune) bool {
  	return '0' <= ch && ch <= '7'
  }
  // isHex reports whether ch is a hexadecimal digit: '0' to '9', or 'a' to 'f'
  // in either case.
  func isHex(ch rune) bool {
  	switch {
  	case '0' <= ch && ch <= '9':
  		return true
  	case 'a' <= ch && ch <= 'f':
  		return true
  	case 'A' <= ch && ch <= 'F':
  		return true
  	default:
  		return false
  	}
  }
  270. // isNameStart returns true if the rune can start a name.
  271. func isNameStart(ch rune) bool {
  272. return isLetter(ch) || ch == '_'
  273. }
  274. // isName returns true if the character is a name code point.
  275. func isName(ch rune) bool {
  276. return isNameStart(ch) || isDigit(ch) || ch == '-'
  277. }
  // eof is the sentinel rune that read and peek return once the input is
  // exhausted. It is negative, so it cannot collide with any rune decoded from
  // the input. It is a constant so that it cannot be reassigned by accident.
  const eof rune = -1