| /* |
| * |
| * Licensed to the Apache Software Foundation (ASF) under one |
| * or more contributor license agreements. See the NOTICE file |
| * distributed with this work for additional information |
| * regarding copyright ownership. The ASF licenses this file |
| * to you under the Apache License, Version 2.0 (the |
| * "License"); you may not use this file except in compliance |
| * with the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, |
| * software distributed under the License is distributed on an |
| * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| * KIND, either express or implied. See the License for the |
| * specific language governing permissions and limitations |
| * under the License. |
| * |
| */ |
| |
| #include "qpid/broker/SelectorToken.h" |
| |
| #include <string> |
| #include <algorithm> |
| #include <iostream> |
| #include <cassert> |
| #include <cctype> |
| |
| namespace qpid { |
| namespace broker { |
| |
// Tokenisers always take a pair of string const_iterators that mark the beginning and the end of
// the string being tokenised. If tokenising succeeds then the start iterator is advanced; if
// tokenising fails then the start iterator is unchanged.
| |
| std::ostream& operator<<(std::ostream& os, const Token& t) |
| { |
| os << "T<" << t.type << ", " << t.val << ">"; |
| return os; |
| } |
| |
| TokenException::TokenException(const std::string& msg) : |
| range_error(msg) |
| {} |
| |
| // Lexically, reserved words are a subset of identifiers |
| // so we parse an identifier first then check if it is a reserved word and |
| // convert it if it is a reserved word |
| namespace { |
| |
| struct RWEntry { |
| const char* word; |
| TokenType type; |
| }; |
| |
// Case-insensitive "less than" for two NUL terminated strings.
//
// Returns true if s1 orders strictly before s2 when both are compared
// character by character after lower-casing, false if s1 is equal to or
// orders after s2. A string that is a proper prefix of another orders
// before it.
//
// Each character is converted through unsigned char before being handed to
// std::tolower: passing a negative plain char value is undefined behaviour.
// The lower-cased values are compared as ints so the resulting order does not
// depend on whether plain char is signed on the target platform.
inline bool caseless(const char* s1, const char* s2)
{
    for (;; ++s1, ++s2) {
        const int c1 = std::tolower(static_cast<unsigned char>(*s1));
        const int c2 = std::tolower(static_cast<unsigned char>(*s2));
        if (c1 != c2)
            return c1 < c2;
        // Same character on both sides: if a string has ended they are equal
        if (*s1 == '\0' || *s2 == '\0')
            return false;
    }
}
| |
| inline bool operator<(const RWEntry& lhs, const RWEntry& rhs) { |
| return caseless(lhs.word, rhs.word); |
| } |
| |
| } |
| |
| bool tokeniseReservedWord(Token& tok) |
| { |
| // This must be sorted!! |
| static const RWEntry reserved[] = { |
| {"and", T_AND}, |
| {"between", T_BETWEEN}, |
| {"escape", T_ESCAPE}, |
| {"false", T_FALSE}, |
| {"in", T_IN}, |
| {"is", T_IS}, |
| {"like", T_LIKE}, |
| {"not", T_NOT}, |
| {"null", T_NULL}, |
| {"or", T_OR}, |
| {"true", T_TRUE} |
| }; |
| |
| const int reserved_size = sizeof(reserved)/sizeof(RWEntry); |
| |
| if ( tok.type != T_IDENTIFIER ) return false; |
| |
| RWEntry rw; |
| rw.word = tok.val.c_str(); |
| std::pair<const RWEntry*, const RWEntry*> entry = std::equal_range(&reserved[0], &reserved[reserved_size], rw); |
| |
| if ( entry.first==entry.second ) return false; |
| |
| tok.type = entry.first->type; |
| return true; |
| } |
| |
| // parsing strings is complicated by the need to allow embedded quotes by doubling the quote character |
| bool processString(std::string::const_iterator& s, std::string::const_iterator& e, char quoteChar, TokenType type, Token& tok) |
| { |
| // We only get here once the tokeniser recognises the initial quote for a string |
| // so we don't need to check for it again. |
| std::string::const_iterator q = std::find(s+1, e, quoteChar); |
| if ( q==e ) return false; |
| |
| std::string content(s+1, q); |
| ++q; |
| |
| while ( q!=e && *q==quoteChar ) { |
| std::string::const_iterator p = q; |
| q = std::find(p+1, e, quoteChar); |
| if ( q==e ) return false; |
| content += std::string(p, q); |
| ++q; |
| } |
| |
| tok = Token(type, s, content); |
| s = q; |
| return true; |
| } |
| |
// True if c may begin an identifier: a letter, '_' or '$'.
//
// The character is converted through unsigned char before classification
// because calling std::isalpha with a negative plain char value is
// undefined behaviour.
inline bool isIdentifierStart(char c)
{
    return std::isalpha(static_cast<unsigned char>(c)) || c=='_' || c=='$';
}
| |
// True if c may appear after the first character of an identifier:
// a letter, a digit, '_', '$' or '.'.
//
// The character is converted through unsigned char before classification
// because calling std::isalnum with a negative plain char value is
// undefined behaviour.
inline bool isIdentifierPart(char c)
{
    return std::isalnum(static_cast<unsigned char>(c)) || c=='_' || c=='$' || c=='.';
}
| |
| static const std::string END("<END>"); |
| bool tokenise(std::string::const_iterator& s, std::string::const_iterator& e, Token& tok) |
| { |
| std::string::const_iterator t = s; |
| |
| // Hand constructed state machine recogniser |
| enum { |
| START, |
| REJECT, |
| IDENTIFIER, |
| ZERO, |
| DIGIT, |
| HEXDIGIT_START, |
| HEXDIGIT, |
| OCTDIGIT, |
| BINDIGIT_START, |
| BINDIGIT, |
| DECIMAL_START, |
| DECIMAL, |
| EXPONENT_SIGN, |
| EXPONENT_START, |
| EXPONENT, |
| ACCEPT_IDENTIFIER, |
| ACCEPT_INC, |
| ACCEPT_NOINC |
| } state = START; |
| |
| TokenType tokType = T_EOS; |
| while (true) |
| switch (state) { |
| case START: |
| if (t==e) {tok = Token(T_EOS, s, END); return true;} |
| else if (std::isspace(*t)) {++t; ++s; continue;} |
| else switch (*t) { |
| case '(': tokType = T_LPAREN; state = ACCEPT_INC; continue; |
| case ')': tokType = T_RPAREN; state = ACCEPT_INC; continue; |
| case ',': tokType = T_COMMA; state = ACCEPT_INC; continue; |
| case '+': tokType = T_PLUS; state = ACCEPT_INC; continue; |
| case '-': tokType = T_MINUS; state = ACCEPT_INC; continue; |
| case '*': tokType = T_MULT; state = ACCEPT_INC; continue; |
| case '/': tokType = T_DIV; state = ACCEPT_INC; continue; |
| case '=': tokType = T_EQUAL; state = ACCEPT_INC; continue; |
| case '<': |
| ++t; |
| if (t==e || (*t!='>' && *t!='=')) |
| {tokType = T_LESS; state = ACCEPT_NOINC; continue; } |
| else |
| {tokType = (*t=='>') ? T_NEQ : T_LSEQ; state = ACCEPT_INC; continue; } |
| case '>': |
| ++t; |
| if (t==e || *t!='=') |
| {tokType = T_GRT; state = ACCEPT_NOINC; continue;} |
| else |
| {tokType = T_GREQ; state = ACCEPT_INC; continue;} |
| default: |
| break; |
| } |
| if (isIdentifierStart(*t)) {++t; state = IDENTIFIER;} |
| else if (*t=='\'') {return processString(s, e, '\'', T_STRING, tok);} |
| else if (*t=='\"') {return processString(s, e, '\"', T_IDENTIFIER, tok);} |
| else if (*t=='0') {++t; state = ZERO;} |
| else if (std::isdigit(*t)) {++t; state = DIGIT;} |
| else if (*t=='.') {++t; state = DECIMAL_START;} |
| else state = REJECT; |
| continue; |
| case IDENTIFIER: |
| if (t==e) {state = ACCEPT_IDENTIFIER;} |
| else if (isIdentifierPart(*t)) {++t; state = IDENTIFIER;} |
| else state = ACCEPT_IDENTIFIER; |
| continue; |
| case DECIMAL_START: |
| if (t==e) {state = REJECT;} |
| else if (std::isdigit(*t)) {++t; state = DECIMAL;} |
| else state = REJECT; |
| continue; |
| case EXPONENT_SIGN: |
| if (t==e) {state = REJECT;} |
| else if (*t=='-' || *t=='+') {++t; state = EXPONENT_START;} |
| else if (std::isdigit(*t)) {++t; state = EXPONENT;} |
| else state = REJECT; |
| continue; |
| case EXPONENT_START: |
| if (t==e) {state = REJECT;} |
| else if (std::isdigit(*t)) {++t; state = EXPONENT;} |
| else state = REJECT; |
| continue; |
| case ZERO: |
| if (t==e) {tokType = T_NUMERIC_EXACT; state = ACCEPT_NOINC;} |
| else if (*t=='.') {++t; state = DECIMAL;} |
| else if (*t=='x' || *t=='X') {++t; state = HEXDIGIT_START;} |
| else if (*t=='b' || *t=='B') {++t; state = BINDIGIT_START;} |
| else state = OCTDIGIT; |
| continue; |
| case HEXDIGIT_START: |
| if (t==e) {state = REJECT;} |
| else if (std::isxdigit(*t)) {++t; state = HEXDIGIT;} |
| else state = REJECT; |
| continue; |
| case HEXDIGIT: |
| if (t==e) {tokType = T_NUMERIC_EXACT; state = ACCEPT_NOINC;} |
| else if (*t=='l' || *t=='L') {tokType = T_NUMERIC_EXACT; state = ACCEPT_INC;} |
| else if (std::isxdigit(*t) || *t=='_') {++t; state = HEXDIGIT;} |
| else if (*t=='p' || *t=='P') {++t; state = EXPONENT_SIGN;} |
| else {tokType = T_NUMERIC_EXACT; state = ACCEPT_NOINC;} |
| continue; |
| case BINDIGIT_START: |
| if (t==e) {state = REJECT;} |
| else if (*t=='0' || *t=='1') {++t; state = BINDIGIT;} |
| else state = REJECT; |
| continue; |
| case BINDIGIT: |
| if (t==e) {tokType = T_NUMERIC_EXACT; state = ACCEPT_NOINC;} |
| else if (*t=='l' || *t=='L') {tokType = T_NUMERIC_EXACT; state = ACCEPT_INC;} |
| else if (*t=='0' || *t=='1' || *t=='_') {++t; state = BINDIGIT;} |
| else {tokType = T_NUMERIC_EXACT; state = ACCEPT_NOINC;} |
| continue; |
| case OCTDIGIT: |
| if (t==e) {tokType = T_NUMERIC_EXACT; state = ACCEPT_NOINC;} |
| else if (*t=='l' || *t=='L') {tokType = T_NUMERIC_EXACT; state = ACCEPT_INC;} |
| else if ((std::isdigit(*t) && *t<'8') || *t=='_') {++t; state = OCTDIGIT;} |
| else {tokType = T_NUMERIC_EXACT; state = ACCEPT_NOINC;} |
| continue; |
| case DIGIT: |
| if (t==e) {tokType = T_NUMERIC_EXACT; state = ACCEPT_NOINC;} |
| else if (*t=='l' || *t=='L') {tokType = T_NUMERIC_EXACT; state = ACCEPT_INC;} |
| else if (*t=='f' || *t=='F' || *t=='d' || *t=='D') {tokType = T_NUMERIC_APPROX; state = ACCEPT_INC;} |
| else if (std::isdigit(*t) || *t=='_') {++t; state = DIGIT;} |
| else if (*t=='.') {++t; state = DECIMAL;} |
| else if (*t=='e' || *t=='E') {++t; state = EXPONENT_SIGN;} |
| else {tokType = T_NUMERIC_EXACT; state = ACCEPT_NOINC;} |
| continue; |
| case DECIMAL: |
| if (t==e) {tokType = T_NUMERIC_APPROX; state = ACCEPT_NOINC;} |
| else if (std::isdigit(*t) || *t=='_') {++t; state = DECIMAL;} |
| else if (*t=='e' || *t=='E') {++t; state = EXPONENT_SIGN;} |
| else if (*t=='f' || *t=='F' || *t=='d' || *t=='D') {tokType = T_NUMERIC_APPROX; state = ACCEPT_INC;} |
| else {tokType = T_NUMERIC_APPROX; state = ACCEPT_NOINC;} |
| continue; |
| case EXPONENT: |
| if (t==e) {tokType = T_NUMERIC_APPROX; state = ACCEPT_NOINC;} |
| else if (std::isdigit(*t)) {++t; state = EXPONENT;} |
| else if (*t=='f' || *t=='F' || *t=='d' || *t=='D') {tokType = T_NUMERIC_APPROX; state = ACCEPT_INC;} |
| else {tokType = T_NUMERIC_APPROX; state = ACCEPT_NOINC;} |
| continue; |
| case ACCEPT_INC: |
| ++t; |
| case ACCEPT_NOINC: |
| tok = Token(tokType, s, t); |
| s = t; |
| return true; |
| case ACCEPT_IDENTIFIER: |
| tok = Token(T_IDENTIFIER, s, t); |
| s = t; |
| tokeniseReservedWord(tok); |
| return true; |
| case REJECT: |
| return false; |
| }; |
| } |
| |
| Tokeniser::Tokeniser(const std::string::const_iterator& s, const std::string::const_iterator& e) : |
| tokp(0), |
| inStart(s), |
| inp(s), |
| inEnd(e) |
| { |
| } |
| |
| /** |
| * Skip any whitespace then look for a token, throwing an exception if no valid token |
| * is found. |
| * |
| * Advance the string iterator past the parsed token on success. On failure the string iterator is |
| * in an undefined location. |
| */ |
| const Token& Tokeniser::nextToken() |
| { |
| if ( tokens.size()>tokp ) return tokens[tokp++]; |
| |
| // Don't extend stream of tokens further than the end of stream; |
| if ( tokp>0 && tokens[tokp-1].type==T_EOS ) return tokens[tokp-1]; |
| |
| tokens.push_back(Token()); |
| Token& tok = tokens[tokp++]; |
| |
| if (tokenise(inp, inEnd, tok)) return tok; |
| |
| throw TokenException("Found illegal character"); |
| } |
| |
| void Tokeniser::returnTokens(unsigned int n) |
| { |
| assert( n<=tokp ); |
| tokp-=n; |
| } |
| |
| std::string Tokeniser::remaining() |
| { |
| Token& currentTok = tokens[tokp]; |
| return std::string(currentTok.tokenStart, inEnd); |
| } |
| |
| |
| }} |