| package scanner |
| |
| import ( |
| "unicode" |
| "unicode/utf8" |
| |
| "github.com/hashicorp/hil/ast" |
| ) |
| |
| // Scan returns a channel that recieves Tokens from the given input string. |
| // |
| // The scanner's job is just to partition the string into meaningful parts. |
| // It doesn't do any transformation of the raw input string, so the caller |
| // must deal with any further interpretation required, such as parsing INTEGER |
| // tokens into real ints, or dealing with escape sequences in LITERAL or |
| // STRING tokens. |
| // |
| // Strings in the returned tokens are slices from the original string. |
| // |
| // startPos should be set to ast.InitPos unless the caller knows that |
| // this interpolation string is part of a larger file and knows the position |
| // of the first character in that larger file. |
| func Scan(s string, startPos ast.Pos) <-chan *Token { |
| ch := make(chan *Token) |
| go scan(s, ch, startPos) |
| return ch |
| } |
| |
| func scan(s string, ch chan<- *Token, pos ast.Pos) { |
| // 'remain' starts off as the whole string but we gradually |
| // slice of the front of it as we work our way through. |
| remain := s |
| |
| // nesting keeps track of how many ${ .. } sequences we are |
| // inside, so we can recognize the minor differences in syntax |
| // between outer string literals (LITERAL tokens) and quoted |
| // string literals (STRING tokens). |
| nesting := 0 |
| |
| // We're going to flip back and forth between parsing literals/strings |
| // and parsing interpolation sequences ${ .. } until we reach EOF or |
| // some INVALID token. |
| All: |
| for { |
| startPos := pos |
| // Literal string processing first, since the beginning of |
| // a string is always outside of an interpolation sequence. |
| literalVal, terminator := scanLiteral(remain, pos, nesting > 0) |
| |
| if len(literalVal) > 0 { |
| litType := LITERAL |
| if nesting > 0 { |
| litType = STRING |
| } |
| ch <- &Token{ |
| Type: litType, |
| Content: literalVal, |
| Pos: startPos, |
| } |
| remain = remain[len(literalVal):] |
| } |
| |
| ch <- terminator |
| remain = remain[len(terminator.Content):] |
| pos = terminator.Pos |
| // Safe to use len() here because none of the terminator tokens |
| // can contain UTF-8 sequences. |
| pos.Column = pos.Column + len(terminator.Content) |
| |
| switch terminator.Type { |
| case INVALID: |
| // Synthetic EOF after invalid token, since further scanning |
| // is likely to just produce more garbage. |
| ch <- &Token{ |
| Type: EOF, |
| Content: "", |
| Pos: pos, |
| } |
| break All |
| case EOF: |
| // All done! |
| break All |
| case BEGIN: |
| nesting++ |
| case CQUOTE: |
| // nothing special to do |
| default: |
| // Should never happen |
| panic("invalid string/literal terminator") |
| } |
| |
| // Now we do the processing of the insides of ${ .. } sequences. |
| // This loop terminates when we encounter either a closing } or |
| // an opening ", which will cause us to return to literal processing. |
| Interpolation: |
| for { |
| |
| token, size, newPos := scanInterpolationToken(remain, pos) |
| ch <- token |
| remain = remain[size:] |
| pos = newPos |
| |
| switch token.Type { |
| case INVALID: |
| // Synthetic EOF after invalid token, since further scanning |
| // is likely to just produce more garbage. |
| ch <- &Token{ |
| Type: EOF, |
| Content: "", |
| Pos: pos, |
| } |
| break All |
| case EOF: |
| // All done |
| // (though a syntax error that we'll catch in the parser) |
| break All |
| case END: |
| nesting-- |
| if nesting < 0 { |
| // Can happen if there are unbalanced ${ and } sequences |
| // in the input, which we'll catch in the parser. |
| nesting = 0 |
| } |
| break Interpolation |
| case OQUOTE: |
| // Beginning of nested quoted string |
| break Interpolation |
| } |
| } |
| } |
| |
| close(ch) |
| } |
| |
| // Returns the token found at the start of the given string, followed by |
| // the number of bytes that were consumed from the string and the adjusted |
| // source position. |
| // |
| // Note that the number of bytes consumed can be more than the length of |
| // the returned token contents if the string begins with whitespace, since |
| // it will be silently consumed before reading the token. |
| func scanInterpolationToken(s string, startPos ast.Pos) (*Token, int, ast.Pos) { |
| pos := startPos |
| size := 0 |
| |
| // Consume whitespace, if any |
| for len(s) > 0 && byteIsSpace(s[0]) { |
| if s[0] == '\n' { |
| pos.Column = 1 |
| pos.Line++ |
| } else { |
| pos.Column++ |
| } |
| size++ |
| s = s[1:] |
| } |
| |
| // Unexpected EOF during sequence |
| if len(s) == 0 { |
| return &Token{ |
| Type: EOF, |
| Content: "", |
| Pos: pos, |
| }, size, pos |
| } |
| |
| next := s[0] |
| var token *Token |
| |
| switch next { |
| case '(', ')', '[', ']', ',', '.', '+', '-', '*', '/', '%', '?', ':': |
| // Easy punctuation symbols that don't have any special meaning |
| // during scanning, and that stand for themselves in the |
| // TokenType enumeration. |
| token = &Token{ |
| Type: TokenType(next), |
| Content: s[:1], |
| Pos: pos, |
| } |
| case '}': |
| token = &Token{ |
| Type: END, |
| Content: s[:1], |
| Pos: pos, |
| } |
| case '"': |
| token = &Token{ |
| Type: OQUOTE, |
| Content: s[:1], |
| Pos: pos, |
| } |
| case '!': |
| if len(s) >= 2 && s[:2] == "!=" { |
| token = &Token{ |
| Type: NOTEQUAL, |
| Content: s[:2], |
| Pos: pos, |
| } |
| } else { |
| token = &Token{ |
| Type: BANG, |
| Content: s[:1], |
| Pos: pos, |
| } |
| } |
| case '<': |
| if len(s) >= 2 && s[:2] == "<=" { |
| token = &Token{ |
| Type: LTE, |
| Content: s[:2], |
| Pos: pos, |
| } |
| } else { |
| token = &Token{ |
| Type: LT, |
| Content: s[:1], |
| Pos: pos, |
| } |
| } |
| case '>': |
| if len(s) >= 2 && s[:2] == ">=" { |
| token = &Token{ |
| Type: GTE, |
| Content: s[:2], |
| Pos: pos, |
| } |
| } else { |
| token = &Token{ |
| Type: GT, |
| Content: s[:1], |
| Pos: pos, |
| } |
| } |
| case '=': |
| if len(s) >= 2 && s[:2] == "==" { |
| token = &Token{ |
| Type: EQUAL, |
| Content: s[:2], |
| Pos: pos, |
| } |
| } else { |
| // A single equals is not a valid operator |
| token = &Token{ |
| Type: INVALID, |
| Content: s[:1], |
| Pos: pos, |
| } |
| } |
| case '&': |
| if len(s) >= 2 && s[:2] == "&&" { |
| token = &Token{ |
| Type: AND, |
| Content: s[:2], |
| Pos: pos, |
| } |
| } else { |
| token = &Token{ |
| Type: INVALID, |
| Content: s[:1], |
| Pos: pos, |
| } |
| } |
| case '|': |
| if len(s) >= 2 && s[:2] == "||" { |
| token = &Token{ |
| Type: OR, |
| Content: s[:2], |
| Pos: pos, |
| } |
| } else { |
| token = &Token{ |
| Type: INVALID, |
| Content: s[:1], |
| Pos: pos, |
| } |
| } |
| default: |
| if next >= '0' && next <= '9' { |
| num, numType := scanNumber(s) |
| token = &Token{ |
| Type: numType, |
| Content: num, |
| Pos: pos, |
| } |
| } else if stringStartsWithIdentifier(s) { |
| ident, runeLen := scanIdentifier(s) |
| tokenType := IDENTIFIER |
| if ident == "true" || ident == "false" { |
| tokenType = BOOL |
| } |
| token = &Token{ |
| Type: tokenType, |
| Content: ident, |
| Pos: pos, |
| } |
| // Skip usual token handling because it doesn't |
| // know how to deal with UTF-8 sequences. |
| pos.Column = pos.Column + runeLen |
| return token, size + len(ident), pos |
| } else { |
| _, byteLen := utf8.DecodeRuneInString(s) |
| token = &Token{ |
| Type: INVALID, |
| Content: s[:byteLen], |
| Pos: pos, |
| } |
| // Skip usual token handling because it doesn't |
| // know how to deal with UTF-8 sequences. |
| pos.Column = pos.Column + 1 |
| return token, size + byteLen, pos |
| } |
| } |
| |
| // Here we assume that the token content contains no UTF-8 sequences, |
| // because we dealt with UTF-8 characters as a special case where |
| // necessary above. |
| size = size + len(token.Content) |
| pos.Column = pos.Column + len(token.Content) |
| |
| return token, size, pos |
| } |
| |
| // Returns the (possibly-empty) prefix of the given string that represents |
| // a literal, followed by the token that marks the end of the literal. |
| func scanLiteral(s string, startPos ast.Pos, nested bool) (string, *Token) { |
| litLen := 0 |
| pos := startPos |
| var terminator *Token |
| for { |
| |
| if litLen >= len(s) { |
| if nested { |
| // We've ended in the middle of a quoted string, |
| // which means this token is actually invalid. |
| return "", &Token{ |
| Type: INVALID, |
| Content: s, |
| Pos: startPos, |
| } |
| } |
| terminator = &Token{ |
| Type: EOF, |
| Content: "", |
| Pos: pos, |
| } |
| break |
| } |
| |
| next := s[litLen] |
| |
| if next == '$' && len(s) > litLen+1 { |
| follow := s[litLen+1] |
| |
| if follow == '{' { |
| terminator = &Token{ |
| Type: BEGIN, |
| Content: s[litLen : litLen+2], |
| Pos: pos, |
| } |
| pos.Column = pos.Column + 2 |
| break |
| } else if follow == '$' { |
| // Double-$ escapes the special processing of $, |
| // so we will consume both characters here. |
| pos.Column = pos.Column + 2 |
| litLen = litLen + 2 |
| continue |
| } |
| } |
| |
| // special handling that applies only to quoted strings |
| if nested { |
| if next == '"' { |
| terminator = &Token{ |
| Type: CQUOTE, |
| Content: s[litLen : litLen+1], |
| Pos: pos, |
| } |
| pos.Column = pos.Column + 1 |
| break |
| } |
| |
| // Escaped quote marks do not terminate the string. |
| // |
| // All we do here in the scanner is avoid terminating a string |
| // due to an escaped quote. The parser is responsible for the |
| // full handling of escape sequences, since it's able to produce |
| // better error messages than we can produce in here. |
| if next == '\\' && len(s) > litLen+1 { |
| follow := s[litLen+1] |
| |
| if follow == '"' { |
| // \" escapes the special processing of ", |
| // so we will consume both characters here. |
| pos.Column = pos.Column + 2 |
| litLen = litLen + 2 |
| continue |
| } else if follow == '\\' { |
| // \\ escapes \ |
| // so we will consume both characters here. |
| pos.Column = pos.Column + 2 |
| litLen = litLen + 2 |
| continue |
| } |
| } |
| } |
| |
| if next == '\n' { |
| pos.Column = 1 |
| pos.Line++ |
| litLen++ |
| } else { |
| pos.Column++ |
| |
| // "Column" measures runes, so we need to actually consume |
| // a valid UTF-8 character here. |
| _, size := utf8.DecodeRuneInString(s[litLen:]) |
| litLen = litLen + size |
| } |
| |
| } |
| |
| return s[:litLen], terminator |
| } |
| |
| // scanNumber returns the extent of the prefix of the string that represents |
| // a valid number, along with what type of number it represents: INT or FLOAT. |
| // |
| // scanNumber does only basic character analysis: numbers consist of digits |
| // and periods, with at least one period signalling a FLOAT. It's the parser's |
| // responsibility to validate the form and range of the number, such as ensuring |
| // that a FLOAT actually contains only one period, etc. |
| func scanNumber(s string) (string, TokenType) { |
| period := -1 |
| byteLen := 0 |
| numType := INTEGER |
| for { |
| if byteLen >= len(s) { |
| break |
| } |
| |
| next := s[byteLen] |
| if next != '.' && (next < '0' || next > '9') { |
| // If our last value was a period, then we're not a float, |
| // we're just an integer that ends in a period. |
| if period == byteLen-1 { |
| byteLen-- |
| numType = INTEGER |
| } |
| |
| break |
| } |
| |
| if next == '.' { |
| // If we've already seen a period, break out |
| if period >= 0 { |
| break |
| } |
| |
| period = byteLen |
| numType = FLOAT |
| } |
| |
| byteLen++ |
| } |
| |
| return s[:byteLen], numType |
| } |
| |
// scanIdentifier returns the prefix of the string that represents a valid
// identifier, along with the length of that prefix in runes.
//
// Identifier characters are letters, numbers and combining marks (as
// defined by Unicode), plus '_', '-', '.' and '*'. A '*' is only accepted
// directly after a '.', and must in turn be followed by a '.' (or by a
// non-identifier character or the end of the input); otherwise the ".*" is
// not part of the identifier.
//
// Identifiers may contain UTF-8 encoded non-Latin letters, in which case
// the returned rune length is shorter than the byte length of the returned
// string. If s does not start with an identifier character the result is
// the empty string and zero.
func scanIdentifier(s string) (string, int) {
	byteLen := 0
	runeLen := 0

	for byteLen < len(s) {
		nextRune, size := utf8.DecodeRuneInString(s[byteLen:])
		if !(nextRune == '_' ||
			nextRune == '-' ||
			nextRune == '.' ||
			nextRune == '*' ||
			unicode.IsNumber(nextRune) ||
			unicode.IsLetter(nextRune) ||
			unicode.IsMark(nextRune)) {
			break
		}

		// A star must come right after a period to be part of the same
		// identifier. A star at the very start has no preceding byte at
		// all, so it can never qualify.
		if nextRune == '*' && (byteLen == 0 || s[byteLen-1] != '.') {
			break
		}

		// If the previous character was a star then the current one must
		// be a period. Otherwise drop the star and the period before it.
		// Both are single-byte runes, so the byte and rune counts are
		// rewound together to keep the reported rune length accurate.
		if byteLen > 0 && s[byteLen-1] == '*' && nextRune != '.' {
			byteLen--
			runeLen--
			if byteLen > 0 && s[byteLen-1] == '.' {
				byteLen--
				runeLen--
			}
			break
		}

		byteLen += size
		runeLen++
	}

	return s[:byteLen], runeLen
}
| |
// byteIsSpace reports whether b is whitespace under the restrictive
// definition used inside interpolation sequences: a space, a tab, a
// carriage return or a newline. Every other byte is not a space.
func byteIsSpace(b byte) bool {
	return b == ' ' || b == '\t' || b == '\r' || b == '\n'
}
| |
// stringStartsWithIdentifier reports whether the given string begins with
// a character that may legally start an identifier: an underscore or any
// character that Unicode classifies as a letter. It returns false for the
// empty string.
func stringStartsWithIdentifier(s string) bool {
	if s == "" {
		return false
	}

	switch c := s[0]; {
	case c == '_', 'a' <= c && c <= 'z', 'A' <= c && c <= 'Z':
		// Cheap ASCII cases, no decoding needed.
		return true
	case !utf8.RuneStart(c):
		// A stray continuation byte cannot begin a character at all.
		return false
	}

	// The first byte begins a UTF-8 sequence, which may encode a letter.
	r, _ := utf8.DecodeRuneInString(s)
	return unicode.IsLetter(r)
}