mirror of
https://github.com/chaisql/chai.git
synced 2025-10-05 23:57:01 +08:00

SQL standard can be interpreted differently: either NULL values are all unique (SQLite, PostgreSQL, ... and now Genji) or they are considered equal (SQL Server, ...).
647 lines
15 KiB
Go
647 lines
15 KiB
Go
package scanner
|
|
|
|
import (
|
|
"bufio"
|
|
"bytes"
|
|
"io"
|
|
"strings"
|
|
|
|
"github.com/genjidb/genji/internal/errors"
|
|
"github.com/genjidb/genji/internal/stringutil"
|
|
)
|
|
|
|
// Code heavily inspired by the influxdata/influxql repository
|
|
// https://github.com/influxdata/influxql/blob/57f403b00b124eb900835c0c944e9b60d848db5e/scanner.go#L12
|
|
|
|
func init() {
|
|
keywords = make(map[string]Token)
|
|
for tok := keywordBeg + 1; tok < keywordEnd; tok++ {
|
|
keywords[strings.ToLower(tokens[tok])] = tok
|
|
}
|
|
for _, tok := range []Token{AND, OR, TRUE, FALSE, NULL, IN, IS, LIKE, BETWEEN} {
|
|
keywords[strings.ToLower(tokens[tok])] = tok
|
|
}
|
|
}
|
|
|
|
// scanner represents a lexical scanner for Genji.
// It turns the runes supplied by its reader into tokens, one per call to Scan.
type scanner struct {
	r *reader // rune source; supports pushing runes back via unread
}
|
|
|
|
// newScanner returns a new instance of Scanner.
|
|
func newScanner(r io.Reader) *scanner {
|
|
return &scanner{r: &reader{r: bufio.NewReaderSize(r, 128)}}
|
|
}
|
|
|
|
// Scan returns the next token and position from the underlying reader.
|
|
// Also returns the literal text read for strings, and number tokens
|
|
// since these token types can have different literal representations.
|
|
func (s *scanner) Scan() (tok Token, pos Pos, lit string) {
|
|
// Read next code point.
|
|
ch0, pos := s.r.read()
|
|
|
|
// If we see whitespace then consume all contiguous whitespace.
|
|
// If we see a letter, or certain acceptable special characters, then consume
|
|
// as an ident or reserved word.
|
|
if isWhitespace(ch0) {
|
|
return s.scanWhitespace()
|
|
} else if isLetter(ch0) || ch0 == '_' {
|
|
s.r.unread()
|
|
return s.scanIdent(true)
|
|
} else if isDigit(ch0) {
|
|
return s.scanNumber()
|
|
}
|
|
|
|
// Otherwise parse individual characters.
|
|
switch ch0 {
|
|
case eof:
|
|
return EOF, pos, ""
|
|
case '`':
|
|
s.r.unread()
|
|
return s.scanIdent(true)
|
|
case '"':
|
|
return s.scanString()
|
|
case '\'':
|
|
return s.scanString()
|
|
case '.':
|
|
ch1, _ := s.r.read()
|
|
s.r.unread()
|
|
if isDigit(ch1) {
|
|
return s.scanNumber()
|
|
}
|
|
return DOT, pos, ""
|
|
case '$':
|
|
tok, _, lit := s.scanIdent(false)
|
|
|
|
if tok != IDENT {
|
|
return tok, pos, "$" + lit
|
|
}
|
|
return NAMEDPARAM, pos, "$" + lit
|
|
case '?':
|
|
return POSITIONALPARAM, pos, ""
|
|
case '+':
|
|
return ADD, pos, ""
|
|
case '-':
|
|
ch1, _ := s.r.read()
|
|
if ch1 == '-' {
|
|
s.skipUntilNewline()
|
|
return COMMENT, pos, ""
|
|
}
|
|
s.r.unread()
|
|
return SUB, pos, ""
|
|
case '*':
|
|
return MUL, pos, ""
|
|
case '/':
|
|
ch1, _ := s.r.read()
|
|
if ch1 == '*' {
|
|
if err := s.skipUntilEndComment(); err != nil {
|
|
return ILLEGAL, pos, ""
|
|
}
|
|
return COMMENT, pos, ""
|
|
}
|
|
s.r.unread()
|
|
return DIV, pos, ""
|
|
case '%':
|
|
return MOD, pos, ""
|
|
case '&':
|
|
return BITWISEAND, pos, ""
|
|
case '|':
|
|
ch1, _ := s.r.read()
|
|
if ch1 == '|' {
|
|
return CONCAT, pos, ""
|
|
}
|
|
s.r.unread()
|
|
return BITWISEOR, pos, ""
|
|
case '^':
|
|
return BITWISEXOR, pos, ""
|
|
case '=':
|
|
ch1, _ := s.r.read()
|
|
if ch1 == '~' {
|
|
return EQREGEX, pos, ""
|
|
}
|
|
if ch1 == '=' {
|
|
return EQ, pos, ""
|
|
}
|
|
s.r.unread()
|
|
return EQ, pos, ""
|
|
case '!':
|
|
if ch1, _ := s.r.read(); ch1 == '=' {
|
|
return NEQ, pos, ""
|
|
} else if ch1 == '~' {
|
|
return NEQREGEX, pos, ""
|
|
}
|
|
s.r.unread()
|
|
case '>':
|
|
if ch1, _ := s.r.read(); ch1 == '=' {
|
|
return GTE, pos, ""
|
|
}
|
|
s.r.unread()
|
|
return GT, pos, ""
|
|
case '<':
|
|
if ch1, _ := s.r.read(); ch1 == '=' {
|
|
return LTE, pos, ""
|
|
} else if ch1 == '>' {
|
|
return NEQ, pos, ""
|
|
}
|
|
s.r.unread()
|
|
return LT, pos, ""
|
|
case '(':
|
|
return LPAREN, pos, ""
|
|
case ')':
|
|
return RPAREN, pos, ""
|
|
case '{':
|
|
return LBRACKET, pos, ""
|
|
case '}':
|
|
return RBRACKET, pos, ""
|
|
case '[':
|
|
return LSBRACKET, pos, ""
|
|
case ']':
|
|
return RSBRACKET, pos, ""
|
|
case ',':
|
|
return COMMA, pos, ""
|
|
case ';':
|
|
return SEMICOLON, pos, ""
|
|
case ':':
|
|
if ch1, _ := s.r.read(); ch1 == ':' {
|
|
return DOUBLECOLON, pos, ""
|
|
}
|
|
s.r.unread()
|
|
return COLON, pos, ""
|
|
}
|
|
|
|
return ILLEGAL, pos, string(ch0)
|
|
}
|
|
|
|
// scanWhitespace consumes the current rune and all contiguous whitespace.
|
|
func (s *scanner) scanWhitespace() (tok Token, pos Pos, lit string) {
|
|
// Create a buffer and read the current character into it.
|
|
var buf bytes.Buffer
|
|
ch, pos := s.r.curr()
|
|
_, _ = buf.WriteRune(ch)
|
|
|
|
// Read every subsequent whitespace character into the buffer.
|
|
// Non-whitespace characters and EOF will cause the loop to exit.
|
|
for {
|
|
ch, _ = s.r.read()
|
|
if ch == eof {
|
|
break
|
|
} else if !isWhitespace(ch) {
|
|
s.r.unread()
|
|
break
|
|
} else {
|
|
_, _ = buf.WriteRune(ch)
|
|
}
|
|
}
|
|
|
|
return WS, pos, buf.String()
|
|
}
|
|
|
|
// skipUntilNewline skips characters until it reaches a newline.
|
|
func (s *scanner) skipUntilNewline() {
|
|
for {
|
|
if ch, _ := s.r.read(); ch == '\n' || ch == eof {
|
|
return
|
|
}
|
|
}
|
|
}
|
|
|
|
// skipUntilEndComment skips characters until it reaches a '*/' symbol.
|
|
func (s *scanner) skipUntilEndComment() error {
|
|
for {
|
|
if ch1, _ := s.r.read(); ch1 == '*' {
|
|
// We might be at the end.
|
|
star:
|
|
ch2, _ := s.r.read()
|
|
if ch2 == '/' {
|
|
return nil
|
|
} else if ch2 == '*' {
|
|
// We are back in the state machine since we see a star.
|
|
goto star
|
|
} else if ch2 == eof {
|
|
return io.EOF
|
|
}
|
|
} else if ch1 == eof {
|
|
return io.EOF
|
|
}
|
|
}
|
|
}
|
|
|
|
func (s *scanner) scanIdent(doLookup bool) (tok Token, pos Pos, lit string) {
|
|
// Save the starting position of the identifier.
|
|
_, pos = s.r.read()
|
|
s.r.unread()
|
|
|
|
var buf bytes.Buffer
|
|
for {
|
|
if ch, _ := s.r.read(); ch == eof {
|
|
break
|
|
} else if ch == '`' {
|
|
tok0, pos0, lit0 := s.scanString()
|
|
if tok0 == BADSTRING || tok0 == BADESCAPE {
|
|
return tok0, pos0, lit0
|
|
}
|
|
return IDENT, pos, lit0
|
|
} else if isIdentChar(ch) {
|
|
s.r.unread()
|
|
buf.WriteString(scanBareIdent(s.r))
|
|
} else {
|
|
s.r.unread()
|
|
break
|
|
}
|
|
}
|
|
lit = buf.String()
|
|
|
|
// If the literal matches a keyword then return that keyword.
|
|
if doLookup {
|
|
if tok := lookup(lit); tok != IDENT {
|
|
return tok, pos, ""
|
|
}
|
|
}
|
|
return IDENT, pos, lit
|
|
}
|
|
|
|
// scanString consumes a contiguous string of non-quote characters.
|
|
// Quote characters can be consumed if they're first escaped with a backslash.
|
|
func (s *scanner) scanString() (tok Token, pos Pos, lit string) {
|
|
s.r.unread()
|
|
_, pos = s.r.curr()
|
|
|
|
lit, err := scanString(s.r)
|
|
|
|
if errors.Is(err, errBadString) {
|
|
return BADSTRING, pos, lit
|
|
} else if errors.Is(err, errBadEscape) {
|
|
_, pos = s.r.curr()
|
|
return BADESCAPE, pos, lit
|
|
}
|
|
return STRING, pos, lit
|
|
}
|
|
|
|
// ScanRegex consumes a token to find escapes
|
|
func (s *scanner) ScanRegex() (tok Token, pos Pos, lit string) {
|
|
_, pos = s.r.curr()
|
|
|
|
// Start & end sentinels.
|
|
start, end := '/', '/'
|
|
// Valid escape chars.
|
|
escapes := map[rune]rune{'/': '/'}
|
|
|
|
b, err := scanDelimited(s.r, start, end, escapes, true)
|
|
|
|
if errors.Is(err, errBadEscape) {
|
|
_, pos = s.r.curr()
|
|
return BADESCAPE, pos, ""
|
|
} else if err != nil {
|
|
return BADREGEX, pos, ""
|
|
}
|
|
return REGEX, pos, string(b)
|
|
}
|
|
|
|
// scanNumber consumes anything that looks like the start of a number.
|
|
func (s *scanner) scanNumber() (tok Token, pos Pos, lit string) {
|
|
var buf bytes.Buffer
|
|
|
|
// Check if the initial rune is a ".".
|
|
ch, pos := s.r.curr()
|
|
if ch == '.' {
|
|
// Peek and see if the next rune is a digit.
|
|
ch1, _ := s.r.read()
|
|
s.r.unread()
|
|
if !isDigit(ch1) {
|
|
return ILLEGAL, pos, "."
|
|
}
|
|
|
|
// Unread the full stop so we can read it later.
|
|
s.r.unread()
|
|
} else {
|
|
s.r.unread()
|
|
}
|
|
|
|
// Read as many digits as possible.
|
|
_, _ = buf.WriteString(s.scanDigits())
|
|
|
|
// If next code points are a full stop and digit then consume them.
|
|
isDecimal := false
|
|
if ch0, _ := s.r.read(); ch0 == '.' {
|
|
isDecimal = true
|
|
if ch1, _ := s.r.read(); isDigit(ch1) {
|
|
_, _ = buf.WriteRune(ch0)
|
|
_, _ = buf.WriteRune(ch1)
|
|
_, _ = buf.WriteString(s.scanDigits())
|
|
} else {
|
|
s.r.unread()
|
|
}
|
|
} else {
|
|
s.r.unread()
|
|
}
|
|
|
|
// If next code points are e or E, optional sign and digits
|
|
if ch0, _ := s.r.read(); ch0 == 'e' || ch0 == 'E' {
|
|
isDecimal = true
|
|
if ch1, _ := s.r.read(); ch1 == '+' || ch1 == '-' {
|
|
if ch2, _ := s.r.read(); isDigit(ch2) {
|
|
_, _ = buf.WriteRune(ch0)
|
|
_, _ = buf.WriteRune(ch1)
|
|
_, _ = buf.WriteRune(ch2)
|
|
_, _ = buf.WriteString(s.scanDigits())
|
|
} else {
|
|
s.r.unread()
|
|
}
|
|
} else if isDigit(ch1) {
|
|
_, _ = buf.WriteRune(ch0)
|
|
_, _ = buf.WriteRune(ch1)
|
|
_, _ = buf.WriteString(s.scanDigits())
|
|
} else {
|
|
s.r.unread()
|
|
}
|
|
} else {
|
|
s.r.unread()
|
|
}
|
|
|
|
if !isDecimal {
|
|
return INTEGER, pos, buf.String()
|
|
}
|
|
return NUMBER, pos, buf.String()
|
|
}
|
|
|
|
// scanDigits consumes a contiguous series of digits.
|
|
func (s *scanner) scanDigits() string {
|
|
var buf bytes.Buffer
|
|
for {
|
|
ch, _ := s.r.read()
|
|
if !isDigit(ch) {
|
|
s.r.unread()
|
|
break
|
|
}
|
|
_, _ = buf.WriteRune(ch)
|
|
}
|
|
return buf.String()
|
|
}
|
|
|
|
// isWhitespace reports whether ch is a space, a tab or a newline.
func isWhitespace(ch rune) bool {
	switch ch {
	case ' ', '\t', '\n':
		return true
	}
	return false
}
|
|
|
|
// isLetter reports whether ch is an ASCII letter (a-z or A-Z).
func isLetter(ch rune) bool {
	lower := 'a' <= ch && ch <= 'z'
	upper := 'A' <= ch && ch <= 'Z'
	return lower || upper
}
|
|
|
|
// isDigit reports whether ch is an ASCII decimal digit (0-9).
func isDigit(ch rune) bool {
	return '0' <= ch && ch <= '9'
}
|
|
|
|
// isIdentChar returns true if the rune can be used in an unquoted identifier.
|
|
func isIdentChar(ch rune) bool { return isLetter(ch) || isDigit(ch) || ch == '_' }
|
|
|
|
// Scanner represents a buffered scanner.
|
|
// It provides a fixed-length circular buffer that can be unread.
|
|
type Scanner struct {
|
|
s *scanner
|
|
i int // buffer index
|
|
n int // buffer size
|
|
buf [4]struct {
|
|
tok Token
|
|
pos Pos
|
|
lit string
|
|
}
|
|
}
|
|
|
|
// NewScanner returns a new buffered scanner for a reader.
|
|
func NewScanner(r io.Reader) *Scanner {
|
|
return &Scanner{s: newScanner(r)}
|
|
}
|
|
|
|
// Scan reads the next token from the scanner.
|
|
func (s *Scanner) Scan() (tok Token, pos Pos, lit string) {
|
|
return s.scanFunc(s.s.Scan)
|
|
}
|
|
|
|
// ScanRegex reads a regex token from the scanner.
|
|
func (s *Scanner) ScanRegex() (tok Token, pos Pos, lit string) {
|
|
return s.scanFunc(s.s.ScanRegex)
|
|
}
|
|
|
|
// scanFunc uses the provided function to scan the next token.
|
|
func (s *Scanner) scanFunc(scan func() (Token, Pos, string)) (tok Token, pos Pos, lit string) {
|
|
// If we have unread tokens then read them off the buffer first.
|
|
if s.n > 0 {
|
|
s.n--
|
|
return s.Curr()
|
|
}
|
|
|
|
// Move buffer position forward and save the token.
|
|
s.i = (s.i + 1) % len(s.buf)
|
|
buf := &s.buf[s.i]
|
|
buf.tok, buf.pos, buf.lit = scan()
|
|
|
|
return s.Curr()
|
|
}
|
|
|
|
// Unscan pushes the previously read token back onto the buffer, so that the
// next call to Scan or ScanRegex returns it again instead of scanning.
func (s *Scanner) Unscan() { s.n++ }
|
|
|
|
// Curr returns the last read token.
|
|
func (s *Scanner) Curr() (tok Token, pos Pos, lit string) {
|
|
buf := &s.buf[(s.i-s.n+len(s.buf))%len(s.buf)]
|
|
return buf.tok, buf.pos, buf.lit
|
|
}
|
|
|
|
// reader represents a buffered rune reader used by the scanner.
|
|
// It provides a fixed-length circular buffer that can be unread.
|
|
type reader struct {
|
|
r io.RuneScanner
|
|
i int // buffer index
|
|
n int // buffer char count
|
|
pos Pos // last read rune position
|
|
buf [3]struct {
|
|
ch rune
|
|
pos Pos
|
|
}
|
|
eof bool // true if reader has ever seen eof.
|
|
}
|
|
|
|
// ReadRune reads the next rune from the reader.
|
|
// This is a wrapper function to implement the io.RuneReader interface.
|
|
// Note that this function does not return size.
|
|
func (r *reader) ReadRune() (ch rune, size int, err error) {
|
|
ch, _ = r.read()
|
|
if ch == eof {
|
|
err = io.EOF
|
|
}
|
|
return
|
|
}
|
|
|
|
// UnreadRune pushes the previously read rune back onto the buffer.
// It adapts unread to the io.RuneScanner method signature and always
// returns a nil error.
func (r *reader) UnreadRune() error {
	r.unread()
	return nil
}
|
|
|
|
// read reads the next rune from the reader.
|
|
func (r *reader) read() (ch rune, pos Pos) {
|
|
// If we have unread characters then read them off the buffer first.
|
|
if r.n > 0 {
|
|
r.n--
|
|
return r.curr()
|
|
}
|
|
|
|
// Read next rune from underlying reader.
|
|
// Any error (including io.EOF) should return as EOF.
|
|
ch, _, err := r.r.ReadRune()
|
|
if err != nil {
|
|
ch = eof
|
|
} else if ch == '\r' {
|
|
if ch, _, err := r.r.ReadRune(); err != nil {
|
|
// nop
|
|
} else if ch != '\n' {
|
|
_ = r.r.UnreadRune()
|
|
}
|
|
ch = '\n'
|
|
}
|
|
|
|
// Save character and position to the buffer.
|
|
r.i = (r.i + 1) % len(r.buf)
|
|
buf := &r.buf[r.i]
|
|
buf.ch, buf.pos = ch, r.pos
|
|
|
|
// Update position.
|
|
// Only count EOF once.
|
|
if ch == '\n' {
|
|
r.pos.Line++
|
|
r.pos.Char = 0
|
|
} else if !r.eof {
|
|
r.pos.Char++
|
|
}
|
|
|
|
// Mark the reader as EOF.
|
|
// This is used so we don't double count EOF characters.
|
|
if ch == eof {
|
|
r.eof = true
|
|
}
|
|
|
|
return r.curr()
|
|
}
|
|
|
|
// unread pushes the previously read rune back onto the buffer by
// incrementing the count of runes that read replays before it touches the
// underlying reader.
func (r *reader) unread() {
	r.n++
}
|
|
|
|
// curr returns the last read character and position.
|
|
func (r *reader) curr() (ch rune, pos Pos) {
|
|
i := (r.i - r.n + len(r.buf)) % len(r.buf)
|
|
buf := &r.buf[i]
|
|
return buf.ch, buf.pos
|
|
}
|
|
|
|
// eof is the marker rune returned by the reader when no more input can be
// read. It is the NUL code point.
const eof rune = 0
|
|
|
|
// scanDelimited reads a delimited set of runes
|
|
func scanDelimited(r io.RuneScanner, start, end rune, escapes map[rune]rune, escapesPassThru bool) ([]byte, error) {
|
|
// Scan start delimiter.
|
|
if ch, _, err := r.ReadRune(); err != nil {
|
|
return nil, err
|
|
} else if ch != start {
|
|
return nil, stringutil.Errorf("expected %s; found %s", string(start), string(ch))
|
|
}
|
|
|
|
var buf bytes.Buffer
|
|
for {
|
|
ch0, _, err := r.ReadRune()
|
|
if ch0 == end {
|
|
return buf.Bytes(), nil
|
|
} else if err != nil {
|
|
return buf.Bytes(), err
|
|
} else if ch0 == '\n' {
|
|
return nil, errors.New("delimited text contains new line")
|
|
} else if ch0 == '\\' {
|
|
// If the next character is an escape then write the escaped char.
|
|
// If it's not a valid escape then return an error.
|
|
ch1, _, err := r.ReadRune()
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
c, ok := escapes[ch1]
|
|
if !ok {
|
|
if escapesPassThru {
|
|
// Unread ch1 (char after the \)
|
|
_ = r.UnreadRune()
|
|
// Write ch0 (\) to the output buffer.
|
|
_, _ = buf.WriteRune(ch0)
|
|
continue
|
|
} else {
|
|
buf.Reset()
|
|
_, _ = buf.WriteRune(ch0)
|
|
_, _ = buf.WriteRune(ch1)
|
|
return buf.Bytes(), errBadEscape
|
|
}
|
|
}
|
|
|
|
_, _ = buf.WriteRune(c)
|
|
} else {
|
|
_, _ = buf.WriteRune(ch0)
|
|
}
|
|
}
|
|
}
|
|
|
|
// scanString reads a quoted string from a rune reader.
|
|
func scanString(r io.RuneReader) (string, error) {
|
|
ending, _, err := r.ReadRune()
|
|
if err != nil {
|
|
return "", errBadString
|
|
}
|
|
|
|
var buf bytes.Buffer
|
|
for i := 0; ; i++ {
|
|
ch0, _, err := r.ReadRune()
|
|
if ch0 == ending {
|
|
return buf.String(), nil
|
|
} else if err != nil || ch0 == '\n' {
|
|
return buf.String(), errBadString
|
|
} else if ch0 == '\\' {
|
|
// If the next character is an escape then write the escaped char.
|
|
// If it's not a valid escape then return an error.
|
|
ch1, _, _ := r.ReadRune()
|
|
if ch1 == 'n' {
|
|
_, _ = buf.WriteRune('\n')
|
|
} else if ch1 == '\\' {
|
|
_, _ = buf.WriteRune('\\')
|
|
} else if ch1 == '"' {
|
|
_, _ = buf.WriteRune('"')
|
|
} else if ch1 == '`' {
|
|
_, _ = buf.WriteRune('`')
|
|
} else if ch1 == '\'' {
|
|
_, _ = buf.WriteRune('\'')
|
|
} else if ch1 == 'x' && i == 0 {
|
|
_, _ = buf.WriteString(`\x`)
|
|
} else {
|
|
return string(ch0) + string(ch1), errBadEscape
|
|
}
|
|
} else {
|
|
_, _ = buf.WriteRune(ch0)
|
|
}
|
|
}
|
|
}
|
|
|
|
// Sentinel errors returned by the string and delimited-text scanners.
var (
	// errBadString is returned by scanString when a read error or a raw
	// newline occurs before the closing quote.
	errBadString = errors.New("bad string")
	// errBadEscape reports an unsupported backslash escape sequence.
	errBadEscape = errors.New("bad escape")
)
|
|
|
|
// scanBareIdent reads bare identifier from a rune reader.
|
|
func scanBareIdent(r io.RuneScanner) string {
|
|
// Read every ident character into the buffer.
|
|
// Non-ident characters and EOF will cause the loop to exit.
|
|
var buf bytes.Buffer
|
|
for {
|
|
ch, _, err := r.ReadRune()
|
|
if err != nil {
|
|
break
|
|
} else if !isIdentChar(ch) {
|
|
_ = r.UnreadRune()
|
|
break
|
|
} else {
|
|
_, _ = buf.WriteRune(ch)
|
|
}
|
|
}
|
|
return buf.String()
|
|
}
|