blob: 2d7ceec8e13615658787affe0f91b79788a5169c [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
#include "core/data_render/tokenizer.h"
#include "core/data_render/token.h"
#include "core/data_render/scanner.h"
#include "core/data_render/class_string.h"
#include <cstring>
#include <cassert>
#include <cctype>
#include <iostream>
#include <unordered_map>
namespace weex {
namespace core {
namespace data_render {
// TokenizerState implementation
// -------------------------------
#ifdef __ANDROID__
// On Android <stdio.h> defines EOF as a macro; replace it with a typed char
// constant so the char-vs-EOF comparisons throughout this file are between
// plain char values rather than an int macro expansion.
#undef EOF
static const char EOF = -1;
#endif
// TokenizerState owns all mutable lexer state: the character source, the
// absolute read offset (seek), the row/column position, and the current and
// previous tokens. Tokenizer (a friend) drives it through ReadChar/PutBack.
class TokenizerState {
 public:
  // Keyword spelling -> Token::Type table, populated from TOKEN_TYPE_LIST.
  static std::unordered_map<std::string, Token::Type> keywords;
  friend class Tokenizer;
  using size_type = size_t;
  using seek_type = size_t;
  using char_type = char;
  // last_col_length_ was previously left uninitialized here (it was only set
  // by Reset), so a PutBack('\n') before the first Reset restored a garbage
  // column; initialize it like Reset does.
  TokenizerState(CharacterStream* scanner)
      : seek_{0}, last_col_length_{0}, scanner_{scanner} {}
  // Rebind to a fresh stream and rewind all positional bookkeeping.
  // Note: token_/last_token_ are deliberately left as-is, matching the
  // original behavior.
  inline void Reset(CharacterStream* scanner) {
    scanner_ = scanner;
    seek_ = 0;
    last_col_length_ = 0;
    position_ = Position();
  }
  inline size_type& seek() {
    return seek_;
  }
  inline Position& position() {
    return position_;
  }
  inline Token& token() {
    return token_;
  }
  inline Token& last_token() {
    return last_token_;
  }
  // Install a freshly scanned token, demoting the current one to last_token.
  inline void set_token(Token token) {
    last_token_ = token_;
    token_ = token;
  }
  inline void set_position(int col, int row) {
    position_.col() = col;
    position_.row() = row;
  }
  inline const Position& position() const {
    return position_;
  }
  // Read one character, advancing seek and the row/column position.
  // At end of input, EOF is returned and only seek advances, so repeated
  // reads at EOF keep returning EOF.
  inline char_type ReadChar() {
    char ch = scanner_->ReadChar();
    if (ch == '\n') {
      position_.row()++;
      // remember the width of the line we just left so that PutBack('\n')
      // can restore the column
      last_col_length_ = position_.col();
      position_.col() = 0;
    } else if (ch == EOF) {
      // in case of EOF we don't want to go outside the limit
      // of our source code
      seek_++;
      return EOF;
    } else {
      position_.col()++;
    }
    seek_++;
    return ch;
  }
  // Undo the most recent ReadChar: rewinds seek and the position, and
  // (except for EOF, which was never taken from the stream) pushes the
  // character back onto the stream.
  inline void PutBack(char_type ch) {
    seek_--;
    if (ch == EOF)
      return;
    if (ch == '\n') {
      assert(position_.row() != 0);
      position_.col() = last_col_length_;
      position_.row()--;
    } else {
      assert(position_.col() != 0);
      position_.col()--;
    }
    scanner_->PutBack(ch);
  }
 private:
  seek_type seek_;
  Token token_;
  Token last_token_;
  Position position_;
  size_t last_col_length_;
  CharacterStream* scanner_;
};
// Keyword lookup table: maps each keyword spelling to its Token::Type.
// TOKEN_TYPE_LIST is an X-macro taking two functors; only the K (keyword)
// entries expand to initializers here, T (plain token) entries expand to
// nothing.
std::unordered_map<std::string, Token::Type> TokenizerState::keywords = {
#define K(t, k, p) { k, Token::t },
#define T(t, k, p)
TOKEN_TYPE_LIST(T, K)
#undef K
#undef T
};
// small utility functions
// True when ch may begin an identifier: an ASCII letter, '_' or '$'.
// The cast avoids undefined behavior: std::isalpha requires an argument
// representable as unsigned char (or EOF), so a raw negative char (e.g. a
// UTF-8 byte on a signed-char platform) must not be passed directly.
bool IsValidIdentifierStart(char ch) {
  return std::isalpha(static_cast<unsigned char>(ch)) || ch == '_' || ch == '$';
}
// True when ch may continue an identifier: an ASCII letter or digit, '_' or
// '$'. Cast for the same std::isalnum precondition as above: the argument
// must be representable as unsigned char (or EOF), never a raw negative char.
bool IsValidIdentifierChar(char ch) {
  return std::isalnum(static_cast<unsigned char>(ch)) || ch == '_' || ch == '$';
}
// Classifies an identifier spelling: returns the keyword's token type when
// str is a reserved word, otherwise Token::IDENTIFIER.
Token::Type IsKeyword(const std::string& str) {
  const auto entry = TokenizerState::keywords.find(str);
  return entry == TokenizerState::keywords.end() ? Token::IDENTIFIER
                                                 : entry->second;
}
// Maps a single punctuation character to its token type, or Token::INVALID
// when the character starts no one-character symbol.
Token::Type IsOneCharacterSymbol(char ch) {
  switch (ch) {
    case '(': return Token::LPAREN;
    case ')': return Token::RPAREN;
    case '{': return Token::LBRACE;
    case '}': return Token::RBRACE;
    case '[': return Token::LBRACK;
    case ']': return Token::RBRACK;
    case ':': return Token::COLON;
    case ';': return Token::SEMICOLON;
    case '.': return Token::PERIOD;
    case ',': return Token::COMMA;
    case '?': return Token::CONDITIONAL;
    case '+': return Token::ADD;
    case '-': return Token::SUB;
    case '/': return Token::DIV;
    case '*': return Token::MUL;
    case '%': return Token::MOD;
    case '>': return Token::GT;
    case '<': return Token::LT;
    case '=': return Token::ASSIGN;
    case '^': return Token::BIT_XOR;
    case '|': return Token::BIT_OR;
    case '&': return Token::BIT_AND;
    case '!': return Token::NOT;
    case '~': return Token::BIT_NOT;
    default:  return Token::INVALID;
  }
}
// Maps a two-character operator to its token type, or Token::INVALID when
// ch1 followed by ch2 forms no two-character symbol.
Token::Type IsTwoCharacterSymbol(char ch1, char ch2) {
  // "=>" is the only two-character symbol keyed on its first character.
  if (ch1 == '=' && ch2 == '>')
    return Token::ARROW_FUNCTION;
  switch (ch2) {
    case '=':
      // compound assignment and comparison operators: "+=", ">=", "==", ...
      switch (ch1) {
        case '+': return Token::ASSIGN_ADD;
        case '-': return Token::ASSIGN_SUB;
        case '*': return Token::ASSIGN_MUL;
        case '/': return Token::ASSIGN_DIV;
        case '%': return Token::ASSIGN_MOD;
        case '^': return Token::ASSIGN_BIT_XOR;
        case '&': return Token::ASSIGN_BIT_AND;
        case '|': return Token::ASSIGN_BIT_OR;
        case '>': return Token::GTE;
        case '<': return Token::LTE;
        case '=': return Token::EQ;
        case '!': return Token::NE;
      }
      return Token::INVALID;
    case '|':
      return ch1 == '|' ? Token::OR : Token::INVALID;
    case '&':
      return ch1 == '&' ? Token::AND : Token::INVALID;
    case '+':
      return ch1 == '+' ? Token::INC : Token::INVALID;
    case '-':
      return ch1 == '-' ? Token::DEC : Token::INVALID;
    case '<':
      return ch1 == '<' ? Token::SHL : Token::INVALID;
    case '>':
      // the original relied on an implicit fall-through into default for the
      // miss case here; make the INVALID result explicit
      return ch1 == '>' ? Token::SAR : Token::INVALID;
    default:
      return Token::INVALID;
  }
}
// Maps a three-character operator to its token type, or Token::INVALID when
// the characters form no three-character symbol.
Token::Type IsThreeCharacterSymbol(char ch1, char ch2, char ch3) {
  struct Entry {
    const char* text;
    Token::Type type;
  };
  static const Entry kSymbols[] = {
      {"===", Token::EQ_STRICT},  {"!==", Token::NE_STRICT},
      {">>>", Token::SHR},        {">>=", Token::ASSIGN_SAR},
      {"<<=", Token::ASSIGN_SHL}, {"...", Token::UNFOLD},
  };
  const char candidate[3] = {ch1, ch2, ch3};
  for (const auto& entry : kSymbols) {
    if (std::memcmp(candidate, entry.text, 3) == 0)
      return entry.type;
  }
  return Token::INVALID;
}
// True for the whitespace characters the lexer skips: space, tab, newline
// and carriage return (deliberately narrower than std::isspace).
bool IsSpace(char ch) {
  switch (ch) {
    case ' ':
    case '\t':
    case '\n':
    case '\r':
      return true;
    default:
      return false;
  }
}
// Tokenizer implementation
// --------------------------
// Builds a tokenizer over |stream|, keeping |context| for parser use.
// The TokenizerState is heap-allocated and owned; released in ~Tokenizer.
Tokenizer::Tokenizer(CharacterStream *stream, ParserContext *context)
: state_{new TokenizerState(stream)}, context_{ context } {}
// Builds a tokenizer over |stream| without a parser context.
// NOTE(review): context_ is not set here — presumably it has an in-class
// default in the header; confirm it is not read uninitialized.
Tokenizer::Tokenizer(CharacterStream *stream)
: state_{new TokenizerState(stream)} {}
// Releases the TokenizerState allocated by the constructors.
Tokenizer::~Tokenizer() {
delete state_;
}
#define _ state_->
// Re-targets the tokenizer at a new character stream, rewinding the seek
// offset and position bookkeeping (see TokenizerState::Reset).
void Tokenizer::Reset(CharacterStream* stream) {
state_->Reset(stream);
}
// Scans the next token and installs it as the current token (the previous
// current token becomes last_token). |divide_expected| tells the scanner
// that a '/' here must not be interpreted as the start of a regex literal.
void Tokenizer::Advance(bool divide_expected) {
_ set_token(AdvanceInternal(divide_expected));
}
// Returns the type of the current token, lazily scanning the first token
// when nothing has been consumed from the stream yet (seek() == 0).
Token::Type Tokenizer::Peek() {
if (_ seek() == 0)
Advance();
return _ token().type();
}
// Returns the most recently scanned token (mutable reference).
Token& Tokenizer::CurrentToken() {
return _ token();
}
// The heart of the lexer
// -----------------------
// Scans and returns the next token from the stream.
// |not_regex| suppresses regex-literal interpretation of '/', for contexts
// where '/' must mean division. Order of attempts: skip whitespace and
// comments, then identifier/keyword, then longest-match operator symbols
// (3 chars, then 2, then 1), then string/template literals, then numbers.
Token Tokenizer::AdvanceInternal(bool not_regex) {
  char ch = _ ReadChar();
  do {
    // skip all non printable characters
    while (ch != EOF && IsSpace(ch))
      ch = _ ReadChar();
    if (ch == '/') {
      char next = _ ReadChar();
      if (next == EOF) {
        return Token(std::string(1, ch), Token::DIV, _ position(),
                     _ seek());
      } else if (next == '/') {
        // single line comment: skip the rest of the line. Also stop on EOF —
        // an unterminated "//" at end of input previously spun forever here,
        // because ReadChar keeps returning EOF which is never '\n'.
        while (ch != '\n' && ch != EOF) {
          ch = _ ReadChar();
        }
      } else if (next == '*') {
        // block comment: consume until the closing "*/"
        ch = _ ReadChar();
        char last = ch;
        while (ch != '/' || last != '*') {
          if (ch == EOF) {
            // unterminated block comment
            return Token(std::string("ERROR"), Token::ERROR,
                         _ position(), _ seek());
          }
          last = ch;
          ch = _ ReadChar();
        }
        // Read the character after the comment. A properly closed comment
        // right before EOF no longer reports ERROR; the EOS path below
        // handles end of input.
        ch = _ ReadChar();
      } else {
        bool ok = true;
        _ PutBack(next);
        // do not parse this as regular expression
        if (not_regex) break;
        Token t = ParseRegex(&ok);
        if (!ok) {
          // not a regex after all: ParseRegex restored the stream, fall
          // through to symbol scanning with ch still '/'
          break;
        } else {
          return t;
        }
      }
    } else if (!IsSpace(ch)) {
      // non-space, non-'/': a real token start. EOF also lands here
      // (IsSpace(EOF) is false) and is handled right after the loop.
      break;
    }
  } while (true);
  auto seek = _ seek();
  auto position = _ position();
  // now check the various possibilities of tokens
  // Possibility 1: token is a valid identifier, or keyword
  if (IsValidIdentifierStart(ch)) {
    std::string identifier;
    identifier += ch;
    ch = _ ReadChar();
    while (IsValidIdentifierChar(ch)) {
      identifier += ch;
      ch = _ ReadChar();
    }
    _ PutBack(ch);
    return Token(identifier, IsKeyword(identifier), position, seek);
  }
  if (ch == EOF || ch == '\0') {
    return Token(std::string("EOF"), Token::EOS, position, seek);
  }
  // Possibility 2: an operator symbol, longest match first
  char first = ch;
  char second = _ ReadChar();
  char third = _ ReadChar();
  Token::Type type;
  std::string view;
  if (first != EOF && second != EOF) {
    type = IsThreeCharacterSymbol(first, second, third);
    if (type == Token::SHR) {
      // ">>>" may extend to ">>>="
      char fourth = _ ReadChar();
      if (fourth == '=') {
        view = view + first + second + third + fourth;
        return Token(view, Token::ASSIGN_SHR, _ position(), _ seek());
      } else {
        _ PutBack(fourth);
        view = view + first + second + third;
        return Token(view, type, position, seek);
      }
    } else if (type != Token::INVALID) {
      view = view + first + second + third;
      return Token(view, type, position, seek);
    }
    _ PutBack(third);
  }
  if (second != EOF) {
    type = IsTwoCharacterSymbol(first, second);
    if (type != Token::INVALID) {
      view = view + first + second;
      return Token(view, type, position, seek);
    }
    _ PutBack(second);
  }
  type = IsOneCharacterSymbol(first);
  if (type == Token::PERIOD) {
    // ".5"-style number literal; the digit was already put back above
    if (isdigit(static_cast<unsigned char>(second))) {
      return ParseNumber(first);
    }
  }
  if (type != Token::INVALID) {
    view += first;
    return Token(view, type, position, seek);
  }
  // Possibility 3: string or template literal
  if (ch == '"' || ch == '\'' || ch == '`') {
    return ParseString(ch);
  }
  // Possibility 4: numeric literal
  if (isdigit(static_cast<unsigned char>(ch))) {
    return ParseNumber(ch);
  }
  return Token(std::string("ILLEGAL"), Token::INVALID, position, seek);
}
// Attempts to scan a regular expression literal; the stream is positioned
// just after the opening '/'. On success returns REGEXP_LITERAL whose text
// is the body followed by '$' and the flags ('n' when no flag is present).
// On failure sets *ok to false, pushes every consumed character back onto
// the stream so the caller can rescan the input as ordinary operators, and
// returns an ERROR token.
Token Tokenizer::ParseRegex(bool* ok) {
  std::string buffer;
  auto seek = _ seek();
  auto position = _ position();
  char ch = _ ReadChar();
  while (ch != '/') {
    if (ch == EOF || ch == '\n') {
      // a regex literal cannot span lines or hit EOF: undo everything
      *ok = false;
      if (ch == '\n') {
        _ PutBack('\n');
      }
      for (long i = buffer.length() - 1; i >= 0; i--) {
        _ PutBack(buffer[i]);
      }
      return Token(std::string("ERROR"), Token::ERROR, position, seek);
    }
    if (ch == '[') {
      // character class: a '/' inside it does not terminate the literal
      buffer.push_back(ch);
      ch = _ ReadChar();
      while (ch != ']') {
        if (ch == '\\') {
          buffer.push_back(ch);
          ch = _ ReadChar();
        }
        if (ch == EOF) {
          // Previously this bailed out WITHOUT setting *ok or restoring the
          // stream, so the caller returned the ERROR token as if it were a
          // scanned regex. Fail the same way as the top-level EOF case.
          *ok = false;
          for (long i = buffer.length() - 1; i >= 0; i--) {
            _ PutBack(buffer[i]);
          }
          return Token(std::string("ERROR"), Token::ERROR, position, seek);
        }
        buffer.push_back(ch);
        ch = _ ReadChar();
      }
      if (ch == ']')
        buffer.push_back(ch);
      // NOTE(review): ']' is appended again by the unconditional push below,
      // so the class terminator appears twice in the buffer — confirm the
      // regex consumer expects this before changing it.
    }
    if (ch == '\\') {
      buffer.push_back(ch);
      ch = _ ReadChar();
    }
    buffer.push_back(ch);
    ch = _ ReadChar();
  }
  // adding end marker
  buffer.push_back('$');
  // parse regex flags g, i, m, u, y
  // Including special case n when there is no flag present
  ch = _ ReadChar();
  bool flag_present = false;
  while (ch == 'g' || ch == 'i' || ch == 'm' || ch == 'u' || ch == 'y') {
    flag_present = true;
    buffer.push_back(ch);
    ch = _ ReadChar();
  }
  _ PutBack(ch);
  if (!flag_present) {
    buffer.push_back('n');
  }
  return Token(buffer, Token::REGEXP_LITERAL, position, seek);
}
// Scans a string or template literal; the stream is positioned just after
// the opening delimiter |delim|. Returns TEMPLATE for '`' literals, STRING
// otherwise, or an ERROR token when input ends before the closing delimiter.
Token Tokenizer::ParseString(char delim) {
  std::string buffer;
  auto seek = _ seek();
  auto position = _ position();
  char ch = _ ReadChar();
  bool utf8 = false;
  while (ch != EOF && ch != delim) {
    // escape sequences: keep the backslash and escaped char verbatim
    if (ch == '\\') {
      buffer.push_back(ch);
      ch = _ ReadChar();
      // a \u (or \U) escape marks the literal for utf8 decoding below.
      // Cast: tolower on a raw negative char (UTF-8 byte on a signed-char
      // platform) is undefined behavior.
      if (tolower(static_cast<unsigned char>(ch)) == 'u') {
        utf8 = true;
      }
      if (ch == EOF) {
        break;
      }
    }
    buffer.push_back(ch);
    ch = _ ReadChar();
  }
  if (ch == EOF) {
    // unterminated literal
    return Token(std::string("EOF"), Token::ERROR, position, seek);
  }
  Token::Type type = delim == '`' ? Token::TEMPLATE : Token::STRING;
  if (utf8) {
    // translate the \u escapes into their utf8 byte sequences
    buffer = utf8_decode(buffer);
  }
  return Token(buffer, type, position, seek);
}
// Scans a numeric literal whose first character |start| has already been
// consumed. Handles hex (0x..), binary (0b..), octal (0o..) and decimal
// forms with an optional '.' and exponent. Returns INTEGER for integral
// forms and NUMBER when a '.' or exponent made the literal a double.
// All <cctype> calls cast to unsigned char: passing a raw negative char
// (other than EOF) is undefined behavior.
Token Tokenizer::ParseNumber(char start) {
  std::string buffer;
  auto seek = _ seek();
  bool had_exp = false;
  bool seen_dot = false;
  bool isdouble = false;
  auto position = _ position();
  char ch = _ ReadChar();
  buffer.push_back(start);
  if (tolower(static_cast<unsigned char>(ch)) == 'x') {
    // parsing hex number
    buffer.push_back(ch);
    ch = _ ReadChar();
    while (ch != EOF && isxdigit(static_cast<unsigned char>(ch))) {
      buffer.push_back(ch);
      ch = _ ReadChar();
    }
  } else if (tolower(static_cast<unsigned char>(ch)) == 'b') {
    // parsing a bin number
    buffer.push_back(ch);
    ch = _ ReadChar();
    while (ch != EOF && (ch == '0' || ch == '1')) {
      buffer.push_back(ch);
      ch = _ ReadChar();
    }
  } else if (tolower(static_cast<unsigned char>(ch)) == 'o') {
    // parsing oct number. Consume the 'o' marker first (mirroring the hex
    // and bin branches): previously the marker was never consumed, so the
    // digit loop matched nothing and "0o17" lexed as "0" + identifier "o17".
    buffer.push_back(ch);
    ch = _ ReadChar();
    while (ch != EOF && ch >= '0' && ch < '8') {
      buffer.push_back(ch);
      ch = _ ReadChar();
    }
  } else {
    if (start == '.' || ch == '.') {
      seen_dot = true;
      isdouble = true;
    }
    // Keep ch only when it can actually belong to the literal. Appending it
    // unconditionally was the "2-a lexed as 2-" bug noted in the old TODO:
    // the stray character ended up inside the number's lexeme even though
    // it was put back onto the stream below.
    if (ch != EOF && (isdigit(static_cast<unsigned char>(ch)) ||
                      tolower(static_cast<unsigned char>(ch)) == 'e' ||
                      ch == '.')) {
      buffer.push_back(ch);
    }
    while (isdigit(static_cast<unsigned char>(ch)) ||
           tolower(static_cast<unsigned char>(ch)) == 'e' || ch == '.') {
      ch = _ ReadChar();
      if (ch == '.') {
        if (!seen_dot) {
          seen_dot = true;
          isdouble = true;
          buffer.push_back(ch);
        } else {
          // a second dot is not part of this number
          break;
        }
      } else if (tolower(static_cast<unsigned char>(ch)) == 'e') {
        if (!had_exp) {
          had_exp = true;
          buffer.push_back(ch);
          isdouble = true;
        } else {
          break;
        }
      } else if (isdigit(static_cast<unsigned char>(ch))) {
        buffer.push_back(ch);
      } else if (ch == EOF) {
        if (start == '.' && buffer.length() == 1) {
          // a lone '.' at end of input is the member-access operator
          return Token(std::string("."), Token::PERIOD, position,
                       seek);
        }
        break;
      } else {
        break;
      }
    }
  }
  _ PutBack(ch);
  if (!isdouble) {
    return Token(buffer, Token::INTEGER, position, seek);
  } else {
    return Token(buffer, Token::NUMBER, position, seek);
  }
}
}
}
}