// Source: asterixdb/asterix-external-data/src/test/java/org/apache/asterix/external/classad/Lexer.java (asterixdb, Git at Google)

 /*
  * Licensed to the Apache Software Foundation (ASF) under one
  * or more contributor license agreements.  See the NOTICE file
  * distributed with this work for additional information
  * regarding copyright ownership.  The ASF licenses this file
  * to you under the Apache License, Version 2.0 (the
  * "License"); you may not use this file except in compliance
  * with the License.  You may obtain a copy of the License at
  *
  *   http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing,
  * software distributed under the License is distributed on an
  * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
  * KIND, either express or implied.  See the License for the
  * specific language governing permissions and limitations
  * under the License.
  */
 package org.apache.asterix.external.classad;

 import java.io.IOException;

 import org.apache.asterix.external.classad.Value.NumberFactor;

 public class Lexer {

     public static final char[] TRUE_CHAR_ARRAY = "true".toCharArray();
     public static final char[] FALSE_CHAR_ARRAY = "false".toCharArray();
     public static final char[] UNDEFINED_CHAR_ARRAY = "undefined".toCharArray();
     public static final char[] ERROR_CHAR_ARRAY = "error".toCharArray();
     public static final char[] IS_CHAR_ARRAY = "is".toCharArray();
     public static final char[] ISNT_CHAR_ARRAY = "isnt".toCharArray();

     public static final char EOF = (char) -1;
     // internal state of lexical analyzer
     protected boolean initialized;
     private TokenType tokenType; // the integer id of the token
     private LexerSource lexSource;
     private char ch; // the current character
     private boolean accumulating; // are we in a token?
     private final boolean debug = false; // debug flag
     // internal buffer for token accumulation
     private AMutableCharArrayString lexBuffer;

     // cached last token
     private TokenValue yylval; // the token itself
     private boolean tokenConsumed; // has the token been consumed?

     public enum TokenType {
         LEX_TOKEN_ERROR,
         LEX_END_OF_INPUT,
         LEX_TOKEN_TOO_LONG,
         LEX_INTEGER_VALUE,
         LEX_REAL_VALUE,
         LEX_BOOLEAN_VALUE,
         LEX_STRING_VALUE,
         LEX_UNDEFINED_VALUE,
         LEX_ERROR_VALUE,
         LEX_IDENTIFIER,
         LEX_SELECTION,
         LEX_MULTIPLY,
         LEX_DIVIDE,
         LEX_MODULUS,
         LEX_PLUS,
         LEX_MINUS,
         LEX_BITWISE_AND,
         LEX_BITWISE_OR,
         LEX_BITWISE_NOT,
         LEX_BITWISE_XOR,
         LEX_LEFT_SHIFT,
         LEX_RIGHT_SHIFT,
         LEX_URIGHT_SHIFT,
         LEX_LOGICAL_AND,
         LEX_LOGICAL_OR,
         LEX_LOGICAL_NOT,
         LEX_LESS_THAN,
         LEX_LESS_OR_EQUAL,
         LEX_GREATER_THAN,
         LEX_GREATER_OR_EQUAL,
         LEX_EQUAL,
         LEX_NOT_EQUAL,
         LEX_META_EQUAL,
         LEX_META_NOT_EQUAL,
         LEX_BOUND_TO,
         LEX_QMARK,
         LEX_COLON,
         LEX_COMMA,
         LEX_SEMICOLON,
         LEX_OPEN_BOX,
         LEX_CLOSE_BOX,
         LEX_OPEN_PAREN,
         LEX_CLOSE_PAREN,
         LEX_OPEN_BRACE,
         LEX_CLOSE_BRACE,
         LEX_BACKSLASH,
         LEX_ABSOLUTE_TIME_VALUE,
         LEX_RELATIVE_TIME_VALUE
     };

     public Lexer() {
         // initialize lexer state (token, etc.) variables
         tokenType = TokenType.LEX_END_OF_INPUT;
         ch = 0;
         tokenConsumed = true;
         accumulating = false;
         initialized = false;
         yylval = new TokenValue();
         return;
     }

     // Initialization method:  Initialize with immutable string
     //   +  Token will be accumulated in the lexBuffer
     public boolean initialize(LexerSource source) throws IOException {
         lexSource = source;
         ch = lexSource.readCharacter();
         // token state initialization
         if (lexBuffer != null) {
             lexBuffer.reset();
         } else {
             lexBuffer = new AMutableCharArrayString();
         }
         lexBuffer.setChar(0, ch);
         lexBuffer.setLength(0);
         //lexBufferCount = 0;
         tokenConsumed = true;
         accumulating = false;
         initialized = true;
         return true;
     }

     public boolean reinitialize() throws IOException {
         ch = lexSource.readCharacter();
         // token state initialization
         lexBuffer.setChar(0, ch);
         lexBuffer.setLength(0);
         tokenConsumed = true;
         accumulating = false;
         return true;
     }

     public boolean wasInitialized() {
         return initialized;
     }

     // FinishedParse:  This function implements the cleanup phase of a parse.
     //   String valued tokens are entered into a string space, and maintained
     //   with reference counting.  When a parse is finished, this space is flushed
     //   out.
     public void finishedParse() {
         accumulating = false;
         return;
     }

     // Mark:  This function is called when the beginning of a token is detected
     public void mark() {
         lexBuffer.setChar(0, ch);
         lexBuffer.setLength(1);
         accumulating = true;
         return;
     }

     // Cut:  This function is called when the end of a token is detected
     public void cut() {
         lexBuffer.decrementLength();
         accumulating = false;
         return;
     }

     // Wind:  This function is called when an additional character must be read
     //            from the input source; the conceptual action is to move the cursor
     public void wind() throws IOException {
         if (ch == (char) -1) {
             if (accumulating) {
                 lexBuffer.incrementLength();
             }
             return;
         }
         ch = lexSource.readCharacter();
         if (ch == (char) -1) {
             if (accumulating) {
                 lexBuffer.incrementLength();
             }
             return;
         }
         if (accumulating) {
             lexBuffer.appendChar(ch);
         }
     }

     public TokenType consumeToken() throws IOException {
         return consumeToken(null);
     }

     public TokenType consumeToken(TokenValue lvalp) throws IOException {
         if (lvalp != null) {
             lvalp.copyFrom(yylval);
         }
         // if a token has already been consumed, get another token
         if (tokenConsumed) {
             peekToken(lvalp);
         }
         if (debug) {
             System.out.printf("Consume: %s\n", strLexToken(tokenType));
         }

         tokenConsumed = true;
         return tokenType;
     }

     private boolean isxdigit(char ch) {
         return Character.isDigit(ch) || isLowerCaseHexaAlpha(ch) || isUppserCaseHexaAlpha(ch);
     }

     private boolean isUppserCaseHexaAlpha(char ch) {
         return ch >= 'a' && ch <= 'f';
     }

     private boolean isLowerCaseHexaAlpha(char ch2) {
         return ch >= 'A' && ch <= 'F';
     }

     public TokenType peekToken() throws IOException {
         return peekToken(null);
     }

     // peekToken() returns the same token till consumeToken() is called
     public TokenType peekToken(TokenValue lvalp) throws IOException {
         /*if (lvalp == null) {
             System.err.println("Null value passed to peekToken");
             return null;
         }*/
         if (!tokenConsumed) {

             if (lvalp != null) {
                 lvalp.copyFrom(yylval);
             }
             return tokenType;
         }

         // Set the token to unconsumed
         tokenConsumed = false;

         // consume white space
         while (true) {
             if (Character.isWhitespace(ch)) {
                 wind();
                 continue;
             } else if (ch == '/') {
                 mark();
                 wind();
                 if (ch == '/') {
                     // a c++ style comment
                     while (ch > 0 && ch != '\n') {
                         wind();
                     }
                 } else if (ch == '*') {
                     // a c style comment
                     int oldCh;
                     ch = '\n';
                     do {
                         oldCh = ch;
                         wind();
                     } while ((oldCh != '*' || ch != '/') && (ch > 0));
                     if (ch == EOF) {
                         tokenType = TokenType.LEX_TOKEN_ERROR;
                         return (tokenType);
                     }
                     wind();
                 } else {
                     // just a division operator
                     cut();
                     tokenType = TokenType.LEX_DIVIDE;
                     yylval.setTokenType(tokenType);
                     return (tokenType);
                 }
             } else {
                 break; // out of while( true ) loop
             }
         }

         // check if this is the end of the input
         if (ch == EOF) {
             tokenType = TokenType.LEX_END_OF_INPUT;
             yylval.setTokenType(tokenType);
             return tokenType;
         }

         // check the first character of the token
         if (ch == '-') {
             // Depending on the last token we saw, a minus may be the start
             // of an integer or real token. tokenizeNumber() does the right
             // thing if there is no subsequent integer or real.
             switch (tokenType) {
                 case LEX_INTEGER_VALUE:
                 case LEX_REAL_VALUE:
                 case LEX_BOOLEAN_VALUE:
                 case LEX_STRING_VALUE:
                 case LEX_UNDEFINED_VALUE:
                 case LEX_ERROR_VALUE:
                 case LEX_IDENTIFIER:
                 case LEX_SELECTION:
                 case LEX_CLOSE_BOX:
                 case LEX_CLOSE_PAREN:
                 case LEX_CLOSE_BRACE:
                 case LEX_BACKSLASH:
                 case LEX_ABSOLUTE_TIME_VALUE:
                 case LEX_RELATIVE_TIME_VALUE:
                     tokenizePunctOperator();
                     break;
                 default:
                     tokenizeNumber();
                     break;
             }
         } else if (Character.isDigit(ch) || ch == '.') {
             // tokenizeNumber() also takes care of the selection operator
             tokenizeNumber();

         } else if (Character.isAlphabetic(ch) || ch == '_') {
             tokenizeAlphaHead();
         } else if (ch == '\"') {
             tokenizeString('\"'); // its a string literal
         } else if (ch == '\'') {
             tokenizeString('\''); // its a quoted attribute
         }

         else {
             tokenizePunctOperator();
         }

         if (debug) {
             System.out.printf("Peek: %s\n", strLexToken(tokenType));
             if (tokenType == TokenType.LEX_ERROR_VALUE) {
                 System.out.println("Lexer problem");
             }
         }
         if (lvalp != null) {
             lvalp.copyFrom(yylval);
         }
         yylval.setTokenType(tokenType);
         return tokenType;
     }

     // Tokenize number constants:
     //   1.  Integers:  [-] 0[0-7]+ | 0[xX][0-9a-fA-F]+ | [0-9]+
     //   2.  Reals   :  [-] [0-9]*\.[0-9]* (e|E) [+-]? [0-9]+
     enum NumberType {
         NONE,
         INTEGER,
         REAL
     };

     public TokenType tokenizeNumber() throws IOException {
         NumberType numberType = NumberType.NONE;
         NumberFactor f;
         long integer = 0;
         double real = 0;
         int och;

         och = ch;
         mark();
         wind();

         if (och == '-') {
             // This may be a negative number or the unary minus operator
             // The subsequent two characters will tell us which.
             if (Character.isDigit(ch)) {
                 // It looks like a negative number, keep reading.
                 och = ch;
                 wind();
             } else if (ch == '.') {
                 // This could be a real number or an attribute reference
                 // starting with dot. Look at the second character.
                 int ch2 = lexSource.readCharacter();
                 if (ch2 >= 0) {
                     lexSource.unreadCharacter();
                 }
                 if (!Character.isDigit(ch2)) {
                     // It's not a real number, return a minus token.
                     cut();
                     tokenType = TokenType.LEX_MINUS;
                     return tokenType;
                 }
                 // It looks like a negative real, keep reading.
             } else {
                 // It's not a number, return a minus token.
                 cut();
                 tokenType = TokenType.LEX_MINUS;
                 return tokenType;
             }
         }

         if (och == '0') {
             // number is octal, hex or real
             if (Character.toLowerCase(ch) == 'x') {
                 // get hex digits only; parse hex-digit+
                 numberType = NumberType.INTEGER;
                 wind();
                 if (!isxdigit(ch)) {
                     cut();
                     tokenType = TokenType.LEX_TOKEN_ERROR;
                     return (tokenType);
                 }
                 while (isxdigit(ch)) {
                     wind();
                 }
             } else {
                 // get octal or real
                 numberType = NumberType.INTEGER;
                 while (Character.isDigit(ch)) {
                     wind();
                     if (!isodigit(ch)) {
                         // not an octal number
                         numberType = NumberType.REAL;
                     }
                 }
                 if (ch == '.' || Character.toLowerCase(ch) == 'e') {
                     numberType = NumberType.REAL;
                 } else if (numberType == NumberType.REAL) {
                     // non-octal digits, but not a real (no '.' or 'e')
                     // so, illegal octal constant
                     cut();
                     tokenType = TokenType.LEX_TOKEN_ERROR;
                     return (tokenType);
                 }
             }
         } else if (Character.isDigit(och)) {
             // decimal or real; get digits
             while (Character.isDigit(ch)) {
                 wind();
             }
             numberType = (ch == '.' || Character.toLowerCase(ch) == 'e') ? NumberType.REAL : NumberType.INTEGER;
         }

         if (och == '.' || ch == '.') {
             // fraction part of real or selection operator
             if (ch == '.') {
                 wind();
             }
             if (Character.isDigit(ch)) {
                 // real; get digits after decimal point
                 numberType = NumberType.REAL;
                 while (Character.isDigit(ch)) {
                     wind();
                 }
             } else {
                 if (numberType != NumberType.NONE) {
                     // initially like a number, but no digit following the '.'
                     cut();
                     tokenType = TokenType.LEX_TOKEN_ERROR;
                     return (tokenType);
                 }
                 // selection operator
                 cut();
                 tokenType = TokenType.LEX_SELECTION;
                 return (tokenType);
             }
         }

         // if we are tokenizing a real, the (optional) exponent part is left
         //   i.e., [eE][+-]?[0-9]+
         if (numberType == NumberType.REAL && Character.toLowerCase(ch) == 'e') {
             wind();
             if (ch == '+' || ch == '-') {
                 wind();
             }
             if (!Character.isDigit(ch)) {
                 cut();
                 tokenType = TokenType.LEX_TOKEN_ERROR;
                 return (tokenType);
             }
             while (Character.isDigit(ch)) {
                 wind();
             }
         }

         if (numberType == NumberType.INTEGER) {
             cut();
             integer = Long.parseLong(lexBuffer.toString());
         } else if (numberType == NumberType.REAL) {
             cut();
             real = Double.parseDouble(lexBuffer.toString());
         } else {
             /* If we've reached this point, we have a serious programming
              * error: tokenizeNumber should only be called if we are
              * lexing a number or a selection, and we didn't find a number
              * or a selection. This should really never happen, so we
              * bomb if it does. It should be reported as a bug.
              */
             throw new IOException("Should not reach here");
         }

         switch (Character.toUpperCase(ch)) {
             case 'B':
                 f = NumberFactor.B_FACTOR;
                 wind();
                 break;
             case 'K':
                 f = NumberFactor.K_FACTOR;
                 wind();
                 break;
             case 'M':
                 f = NumberFactor.M_FACTOR;
                 wind();
                 break;
             case 'G':
                 f = NumberFactor.G_FACTOR;
                 wind();
                 break;
             case 'T':
                 f = NumberFactor.T_FACTOR;
                 wind();
                 break;
             default:
                 f = NumberFactor.NO_FACTOR;
         }

         if (numberType == NumberType.INTEGER) {
             yylval.setIntValue(integer, f);
             yylval.setTokenType(TokenType.LEX_INTEGER_VALUE);
             tokenType = TokenType.LEX_INTEGER_VALUE;
         } else {
             yylval.setRealValue(real, f);
             yylval.setTokenType(TokenType.LEX_REAL_VALUE);
             tokenType = TokenType.LEX_REAL_VALUE;
         }

         return (tokenType);
     }

     public static boolean isodigit(char ch) {
         return ch >= '0' && ch <= '7';
     }

     // Tokenize alpha head: (character sequences beggining with an alphabet)
     //   1.  Reserved character sequences:  true, false, error, undefined
     //   2.  Identifier                  :  [a-zA-Z_][a-zA-Z0-9_]*
     public TokenType tokenizeAlphaHead() throws IOException {
         mark();
         while (Character.isAlphabetic(ch)) {
             wind();
         }
         if (Character.isDigit(ch) || ch == '_') {
             // The token is an identifier; consume the rest of the token
             wind();
             while (Character.isAlphabetic(ch) || Character.isDigit(ch) || ch == '_') {
                 wind();
             }
             cut();
             tokenType = TokenType.LEX_IDENTIFIER;
             yylval.setStringValue(lexBuffer);
             return tokenType;
         }

         // check if the string is one of the reserved words; Case insensitive
         cut();
         if (isEqualIgnoreCase(TRUE_CHAR_ARRAY)) {
             tokenType = TokenType.LEX_BOOLEAN_VALUE;
             yylval.setBoolValue(true);
         } else if (isEqualIgnoreCase(FALSE_CHAR_ARRAY)) {
             tokenType = TokenType.LEX_BOOLEAN_VALUE;
             yylval.setBoolValue(false);
         } else if (isEqualIgnoreCase(UNDEFINED_CHAR_ARRAY)) {
             tokenType = TokenType.LEX_UNDEFINED_VALUE;
         } else if (isEqualIgnoreCase(ERROR_CHAR_ARRAY)) {
             tokenType = TokenType.LEX_ERROR_VALUE;
         } else if (isEqualIgnoreCase(IS_CHAR_ARRAY)) {
             tokenType = TokenType.LEX_META_EQUAL;
         } else if (isEqualIgnoreCase(ISNT_CHAR_ARRAY)) {
             tokenType = TokenType.LEX_META_NOT_EQUAL;
         } else {
             // token is a character only identifier
             tokenType = TokenType.LEX_IDENTIFIER;
             yylval.setStringValue(lexBuffer);
         }
         return tokenType;
     }

     private boolean isEqualIgnoreCase(char[] compareTo) {
         return lexBuffer.isEqualsIgnoreCaseLower(compareTo);
     }

     // tokenizeStringLiteral:  Scans strings of the form " ... " or '...'
     // based on whether the argument passed was '\"' or '\''
     public TokenType tokenizeString(char delim) throws IOException {
         boolean stringComplete = false;

         // need to mark() after the quote
         wind();
         mark();

         while (!stringComplete) {
             boolean oddBackWhacks = false;
             char oldCh = 0;
             // consume the string literal; read upto " ignoring \"
             while ((ch > 0) && (ch != delim || (ch == delim && oldCh == '\\' && oddBackWhacks))) {
                 if (!oddBackWhacks && ch == '\\') {
                     oddBackWhacks = true;
                 } else {
                     oddBackWhacks = false;
                 }
                 oldCh = ch;
                 wind();
             }

             if (ch == delim) {
                 char tempch = ' ';
                 // read past the whitespace characters
                 while (Character.isWhitespace(tempch)) {
                     tempch = lexSource.readCharacter();
                 }
                 if (tempch != delim) { // a new token exists after the string
                     if (tempch != EOF) {
                         lexSource.unreadCharacter();
                     }
                     stringComplete = true;
                 } else { // the adjacent string is to be concatenated to the existing string
                     lexBuffer.erase(lexBuffer.getLength());// erase the lagging '\"'
                     wind();
                 }
             } else {
                 // loop quit due to ch == 0 or ch == EOF
                 tokenType = TokenType.LEX_TOKEN_ERROR;
                 return tokenType;
             }
         }
         cut();
         wind(); // skip over the close quote
         boolean validStr = true; // to check if string is valid after converting escape
         validStr = Util.convertEscapes(lexBuffer);
         yylval.setStringValue(lexBuffer);
         if (validStr) {
             if (delim == '\"') {
                 tokenType = TokenType.LEX_STRING_VALUE;
             } else {
                 tokenType = TokenType.LEX_IDENTIFIER;
             }
         } else {
             tokenType = TokenType.LEX_TOKEN_ERROR; // string conatins a '\0' character inbetween
         }

         return tokenType;
     }

     // tokenizePunctOperator:  Tokenize puncutation and operators
     public TokenType tokenizePunctOperator() throws IOException {
         // save character; may need to lookahead
         char oldch = ch;
         char extra_lookahead;

         mark();
         wind();
         switch (oldch) {
             // these cases don't need lookaheads
             case '.':
                 tokenType = TokenType.LEX_SELECTION;
                 break;

             case '*':
                 tokenType = TokenType.LEX_MULTIPLY;
                 break;

             case '/':
                 tokenType = TokenType.LEX_DIVIDE;
                 break;

             case '%':
                 tokenType = TokenType.LEX_MODULUS;
                 break;

             case '+':
                 tokenType = TokenType.LEX_PLUS;
                 break;

             case '-':
                 tokenType = TokenType.LEX_MINUS;
                 break;

             case '~':
                 tokenType = TokenType.LEX_BITWISE_NOT;
                 break;

             case '^':
                 tokenType = TokenType.LEX_BITWISE_XOR;
                 break;

             case '?':
                 tokenType = TokenType.LEX_QMARK;
                 break;

             case ':':
                 tokenType = TokenType.LEX_COLON;
                 break;

             case ';':
                 tokenType = TokenType.LEX_SEMICOLON;
                 break;

             case ',':
                 tokenType = TokenType.LEX_COMMA;
                 break;

             case '[':
                 tokenType = TokenType.LEX_OPEN_BOX;
                 break;

             case ']':
                 tokenType = TokenType.LEX_CLOSE_BOX;
                 break;

             case '(':
                 tokenType = TokenType.LEX_OPEN_PAREN;
                 break;

             case ')':
                 tokenType = TokenType.LEX_CLOSE_PAREN;
                 break;

             case '{':
                 tokenType = TokenType.LEX_OPEN_BRACE;
                 break;

             case '}':
                 tokenType = TokenType.LEX_CLOSE_BRACE;
                 break;

             // the following cases need lookaheads

             case '&':
                 tokenType = TokenType.LEX_BITWISE_AND;
                 if (ch == '&') {
                     tokenType = TokenType.LEX_LOGICAL_AND;
                     wind();
                 }
                 break;

             case '|':
                 tokenType = TokenType.LEX_BITWISE_OR;
                 if (ch == '|') {
                     tokenType = TokenType.LEX_LOGICAL_OR;
                     wind();
                 }
                 break;

             case '<':
                 tokenType = TokenType.LEX_LESS_THAN;
                 switch (ch) {
                     case '=':
                         tokenType = TokenType.LEX_LESS_OR_EQUAL;
                         wind();
                         break;

                     case '<':
                         tokenType = TokenType.LEX_LEFT_SHIFT;
                         wind();
                         break;

                     default:
                         // just the '<' --- no need to do anything
                         break;
                 }
                 break;

             case '>':
                 tokenType = TokenType.LEX_GREATER_THAN;
                 switch (ch) {
                     case '=':
                         tokenType = TokenType.LEX_GREATER_OR_EQUAL;
                         wind();
                         break;

                     case '>':
                         tokenType = TokenType.LEX_RIGHT_SHIFT;
                         wind();
                         if (ch == '>') {
                             tokenType = TokenType.LEX_URIGHT_SHIFT;
                             wind();
                         }
                         break;

                     default:
                         // just the '>' --- no need to do anything
                         break;
                 }
                 break;

             case '=':
                 tokenType = TokenType.LEX_BOUND_TO;
                 switch (ch) {
                     case '=':
                         tokenType = TokenType.LEX_EQUAL;
                         wind();
                         break;

                     case '?':
                         tokenType = TokenType.LEX_META_EQUAL;
                         wind();

                         // ensure the trailing '=' of the '=?=' combination
                         if (ch != '=') {
                             tokenType = TokenType.LEX_TOKEN_ERROR;
                             return tokenType;
                         }

                         wind();
                         break;

                     case '!':
                         extra_lookahead = lexSource.readCharacter();
                         lexSource.unreadCharacter();
                         if (extra_lookahead == '=') {
                             tokenType = TokenType.LEX_META_NOT_EQUAL;
                             wind();
                             wind();
                         }
                         break;

                     default:
                         // just the '=' --- no need to do anything
                         break;
                 }
                 break;

             case '!':
                 tokenType = TokenType.LEX_LOGICAL_NOT;
                 switch (ch) {
                     case '=':
                         tokenType = TokenType.LEX_NOT_EQUAL;
                         wind();
                         break;

                     default:
                         // just the '!' --- no need to do anything
                         break;
                 }
                 break;

             default:
                 tokenType = TokenType.LEX_TOKEN_ERROR;
                 return tokenType;
         }

         // cut the token and return
         cut();
         return tokenType;
     }

     // strLexToken:  Return string representation of token type
     public static String strLexToken(TokenType tokenType) {
         switch (tokenType) {
             case LEX_END_OF_INPUT:
                 return "LEX_END_OF_INPUT";
             case LEX_TOKEN_ERROR:
                 return "LEX_TOKEN_ERROR";
             case LEX_TOKEN_TOO_LONG:
                 return "LEX_TOKEN_TOO_LONG";

             case LEX_INTEGER_VALUE:
                 return "LEX_INTEGER_VALUE";
             case LEX_REAL_VALUE:
                 return "LEX_REAL_VALUE";
             case LEX_BOOLEAN_VALUE:
                 return "LEX_BOOLEAN_VALUE";
             case LEX_STRING_VALUE:
                 return "LEX_STRING_VALUE";
             case LEX_UNDEFINED_VALUE:
                 return "LEX_UNDEFINED_VALUE";
             case LEX_ERROR_VALUE:
                 return "LEX_ERROR_VALUE";

             case LEX_IDENTIFIER:
                 return "LEX_IDENTIFIER";
             case LEX_SELECTION:
                 return "LEX_SELECTION";

             case LEX_MULTIPLY:
                 return "LEX_MULTIPLY";
             case LEX_DIVIDE:
                 return "LEX_DIVIDE";
             case LEX_MODULUS:
                 return "LEX_MODULUS";
             case LEX_PLUS:
                 return "LEX_PLUS";
             case LEX_MINUS:
                 return "LEX_MINUS";

             case LEX_BITWISE_AND:
                 return "LEX_BITWISE_AND";
             case LEX_BITWISE_OR:
                 return "LEX_BITWISE_OR";
             case LEX_BITWISE_NOT:
                 return "LEX_BITWISE_NOT";
             case LEX_BITWISE_XOR:
                 return "LEX_BITWISE_XOR";

             case LEX_LEFT_SHIFT:
                 return "LEX_LEFT_SHIFT";
             case LEX_RIGHT_SHIFT:
                 return "LEX_RIGHT_SHIFT";
             case LEX_URIGHT_SHIFT:
                 return "LEX_URIGHT_SHIFT";

             case LEX_LOGICAL_AND:
                 return "LEX_LOGICAL_AND";
             case LEX_LOGICAL_OR:
                 return "LEX_LOGICAL_OR";
             case LEX_LOGICAL_NOT:
                 return "LEX_LOGICAL_NOT";

             case LEX_LESS_THAN:
                 return "LEX_LESS_THAN";
             case LEX_LESS_OR_EQUAL:
                 return "LEX_LESS_OR_EQUAL";
             case LEX_GREATER_THAN:
                 return "LEX_GREATER_THAN";
             case LEX_GREATER_OR_EQUAL:
                 return "LEX_GREATER_OR_EQUAL";
             case LEX_EQUAL:
                 return "LEX_EQUAL";
             case LEX_NOT_EQUAL:
                 return "LEX_NOT_EQUAL";
             case LEX_META_EQUAL:
                 return "LEX_META_EQUAL";
             case LEX_META_NOT_EQUAL:
                 return "LEX_META_NOT_EQUAL";

             case LEX_BOUND_TO:
                 return "LEX_BOUND_TO";

             case LEX_QMARK:
                 return "LEX_QMARK";
             case LEX_COLON:
                 return "LEX_COLON";
             case LEX_SEMICOLON:
                 return "LEX_SEMICOLON";
             case LEX_COMMA:
                 return "LEX_COMMA";
             case LEX_OPEN_BOX:
                 return "LEX_OPEN_BOX";
             case LEX_CLOSE_BOX:
                 return "LEX_CLOSE_BOX";
             case LEX_OPEN_PAREN:
                 return "LEX_OPEN_PAREN";
             case LEX_CLOSE_PAREN:
                 return "LEX_CLOSE_PAREN";
             case LEX_OPEN_BRACE:
                 return "LEX_OPEN_BRACE";
             case LEX_CLOSE_BRACE:
                 return "LEX_CLOSE_BRACE";
             case LEX_BACKSLASH:
                 return "LEX_BACKSLASH";
             case LEX_ABSOLUTE_TIME_VALUE:
                 return "LEX_ABSOLUTE_TIME_VALUE";
             case LEX_RELATIVE_TIME_VALUE:
                 return "LEX_RELATIVE_TIME_VALUE";

             default:
                 return "** Unknown **";
         }
     }

     /**
      * Returns the character source this lexer is currently reading from.
      *
      * @return the value of the {@code lexSource} field
      */
     public LexerSource getLexSource() {
         return this.lexSource;
     }
 }
	/*
	* Licensed to the Apache Software Foundation (ASF) under one
	* or more contributor license agreements. See the NOTICE file
	* distributed with this work for additional information
	* regarding copyright ownership. The ASF licenses this file
	* to you under the Apache License, Version 2.0 (the
	* "License"); you may not use this file except in compliance
	* with the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing,
	* software distributed under the License is distributed on an
	* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
	* KIND, either express or implied. See the License for the
	* specific language governing permissions and limitations
	* under the License.
	*/
	package org.apache.asterix.external.classad;

	import java.io.IOException;

	import org.apache.asterix.external.classad.Value.NumberFactor;

	public class Lexer {

	// Reserved words, stored in lower case; tokenizeAlphaHead() compares the
	// accumulated token against them ignoring case.
	public static final char[] TRUE_CHAR_ARRAY = "true".toCharArray();
	public static final char[] FALSE_CHAR_ARRAY = "false".toCharArray();
	public static final char[] UNDEFINED_CHAR_ARRAY = "undefined".toCharArray();
	public static final char[] ERROR_CHAR_ARRAY = "error".toCharArray();
	public static final char[] IS_CHAR_ARRAY = "is".toCharArray();
	public static final char[] ISNT_CHAR_ARRAY = "isnt".toCharArray();

	// End-of-input sentinel. Because char is unsigned, (char) -1 is 0xFFFF,
	// so it compares greater than zero.
	public static final char EOF = (char) -1;
	// internal state of lexical analyzer
	// set to true by initialize(); reported by wasInitialized()
	protected boolean initialized;
	private TokenType tokenType; // type of the most recently scanned token
	private LexerSource lexSource; // where characters are read from
	private char ch; // the current (lookahead) character
	private boolean accumulating; // are we in a token? (wind() appends to lexBuffer while true)
	private final boolean debug = false; // debug flag; enables the Peek/Consume trace output
	// internal buffer for token accumulation
	private AMutableCharArrayString lexBuffer;

	// cached last token
	private TokenValue yylval; // the token itself
	private boolean tokenConsumed; // has the token been consumed?

	/**
	 * Kinds of token the lexer can produce. The declaration order is kept
	 * exactly as it was, so ordinals are unchanged.
	 */
	public enum TokenType {
	    // lexer status
	    LEX_TOKEN_ERROR, LEX_END_OF_INPUT, LEX_TOKEN_TOO_LONG,
	    // literals
	    LEX_INTEGER_VALUE, LEX_REAL_VALUE, LEX_BOOLEAN_VALUE, LEX_STRING_VALUE,
	    LEX_UNDEFINED_VALUE, LEX_ERROR_VALUE,
	    // names and attribute selection
	    LEX_IDENTIFIER, LEX_SELECTION,
	    // arithmetic operators
	    LEX_MULTIPLY, LEX_DIVIDE, LEX_MODULUS, LEX_PLUS, LEX_MINUS,
	    // bitwise operators
	    LEX_BITWISE_AND, LEX_BITWISE_OR, LEX_BITWISE_NOT, LEX_BITWISE_XOR,
	    // shift operators
	    LEX_LEFT_SHIFT, LEX_RIGHT_SHIFT, LEX_URIGHT_SHIFT,
	    // logical operators
	    LEX_LOGICAL_AND, LEX_LOGICAL_OR, LEX_LOGICAL_NOT,
	    // comparison operators
	    LEX_LESS_THAN, LEX_LESS_OR_EQUAL, LEX_GREATER_THAN, LEX_GREATER_OR_EQUAL,
	    LEX_EQUAL, LEX_NOT_EQUAL, LEX_META_EQUAL, LEX_META_NOT_EQUAL,
	    // binding
	    LEX_BOUND_TO,
	    // punctuation
	    LEX_QMARK, LEX_COLON, LEX_COMMA, LEX_SEMICOLON,
	    LEX_OPEN_BOX, LEX_CLOSE_BOX, LEX_OPEN_PAREN, LEX_CLOSE_PAREN,
	    LEX_OPEN_BRACE, LEX_CLOSE_BRACE, LEX_BACKSLASH,
	    // time literals
	    LEX_ABSOLUTE_TIME_VALUE, LEX_RELATIVE_TIME_VALUE
	}

	/**
	 * Creates a lexer with no input source attached. All scanner state is
	 * reset; {@code initialized} stays false until
	 * {@link #initialize(LexerSource)} is called.
	 */
	public Lexer() {
	    this.yylval = new TokenValue();
	    this.tokenType = TokenType.LEX_END_OF_INPUT;
	    this.ch = 0;
	    this.initialized = false;
	    this.accumulating = false;
	    this.tokenConsumed = true;
	}

	// Initialization method: Initialize with immutable string
	// + Token will be accumulated in the lexBuffer
	/**
	 * Attaches {@code source} to this lexer, reads its first character and
	 * resets the token buffer and scanner flags.
	 *
	 * @param source the character source to scan
	 * @return always {@code true}
	 * @throws IOException if reading the first character fails
	 */
	public boolean initialize(LexerSource source) throws IOException {
	    this.lexSource = source;
	    this.ch = source.readCharacter();

	    // Reuse the token buffer across initializations when one already exists.
	    if (lexBuffer == null) {
	        lexBuffer = new AMutableCharArrayString();
	    } else {
	        lexBuffer.reset();
	    }
	    lexBuffer.setChar(0, ch);
	    lexBuffer.setLength(0);

	    // No token is cached yet and nothing is being accumulated.
	    tokenConsumed = true;
	    accumulating = false;
	    initialized = true;
	    return true;
	}

	/**
	 * Restarts scanning on the current source: reads a fresh character into
	 * {@code ch} and clears the token buffer and the token flags.
	 *
	 * @return always {@code true}
	 * @throws IOException if reading the character fails
	 */
	public boolean reinitialize() throws IOException {
	    this.ch = this.lexSource.readCharacter();

	    // Seed slot 0 with the lookahead character, then mark the buffer empty.
	    this.lexBuffer.setChar(0, this.ch);
	    this.lexBuffer.setLength(0);

	    this.tokenConsumed = true;
	    this.accumulating = false;
	    return true;
	}

	/**
	 * Reports the {@code initialized} flag, which
	 * {@link #initialize(LexerSource)} sets to {@code true}.
	 *
	 * @return the current value of the {@code initialized} flag
	 */
	public boolean wasInitialized() {
	    return this.initialized;
	}

	// FinishedParse: This function implements the cleanup phase of a parse.
	// String valued tokens are entered into a string space, and maintained
	// with reference counting. When a parse is finished, this space is flushed
	// out.
	/**
	 * Cleanup hook for the end of a parse. In this implementation it only
	 * stops token accumulation.
	 */
	public void finishedParse() {
	    this.accumulating = false;
	}

	// Mark: This function is called when the beginning of a token is detected
	/**
	 * Starts a new token at the current character: the buffer is reset to
	 * hold just {@code ch} and accumulation is switched on, so later calls to
	 * {@link #wind()} append to it.
	 */
	public void mark() {
	    this.lexBuffer.setChar(0, this.ch);
	    this.lexBuffer.setLength(1);
	    this.accumulating = true;
	}

	// Cut: This function is called when the end of a token is detected
	/**
	 * Ends the current token: shortens the buffer by one position (the
	 * lookahead character that follows the token) and switches accumulation
	 * off.
	 */
	public void cut() {
	    this.lexBuffer.decrementLength();
	    this.accumulating = false;
	}

	// Wind: This function is called when an additional character must be read
	// from the input source; the conceptual action is to move the cursor
	/**
	 * Advances the cursor by one character. Once end of input has been
	 * reached no further reads are issued. While a token is being
	 * accumulated, every call grows the buffer by one position: the new
	 * character is appended, or at end of input the length is simply bumped
	 * so that a later {@link #cut()} stays balanced.
	 *
	 * @throws IOException if the source fails to deliver a character
	 */
	public void wind() throws IOException {
	    if (ch != EOF) {
	        ch = lexSource.readCharacter();
	        if (ch != EOF) {
	            // Ordinary character: record it when inside a token.
	            if (accumulating) {
	                lexBuffer.appendChar(ch);
	            }
	            return;
	        }
	    }
	    // Already at, or just arrived at, end of input.
	    if (accumulating) {
	        lexBuffer.incrementLength();
	    }
	}

	/**
	 * Consumes the next token without copying its value anywhere.
	 *
	 * @return the type of the consumed token
	 * @throws IOException if scanning a new token fails
	 */
	public TokenType consumeToken() throws IOException {
	    final TokenValue noValue = null;
	    return consumeToken(noValue);
	}

	/**
	 * Consumes the current token, scanning a new one first when the cached
	 * token has already been consumed.
	 *
	 * @param lvalp receives a copy of the token value; may be {@code null}
	 * @return the type of the consumed token
	 * @throws IOException if scanning a new token fails
	 */
	public TokenType consumeToken(TokenValue lvalp) throws IOException {
	    final boolean wantsValue = lvalp != null;
	    if (wantsValue) {
	        lvalp.copyFrom(yylval);
	    }

	    // Nothing cached: scan the next token now.
	    if (tokenConsumed) {
	        peekToken(lvalp);
	    }

	    if (debug) {
	        System.out.printf("Consume: %s\n", strLexToken(tokenType));
	    }

	    tokenConsumed = true;
	    return tokenType;
	}

	private boolean isxdigit(char ch) {
	return Character.isDigit(ch) \|\| isLowerCaseHexaAlpha(ch) \|\| isUppserCaseHexaAlpha(ch);
	}

	private boolean isUppserCaseHexaAlpha(char ch) {
	return ch >= 'a' && ch <= 'f';
	}

	private boolean isLowerCaseHexaAlpha(char ch2) {
	return ch >= 'A' && ch <= 'F';
	}

	/**
	 * Peeks at the next token without copying its value anywhere.
	 *
	 * @return the type of the next token
	 * @throws IOException if scanning fails
	 */
	public TokenType peekToken() throws IOException {
	    final TokenValue noValue = null;
	    return peekToken(noValue);
	}

	// peekToken() returns the same token till consumeToken() is called
	public TokenType peekToken(TokenValue lvalp) throws IOException {
	/*if (lvalp == null) {
	System.err.println("Null value passed to peekToken");
	return null;
	}*/
	if (!tokenConsumed) {

	if (lvalp != null) {
	lvalp.copyFrom(yylval);
	}
	return tokenType;
	}

	// Set the token to unconsumed
	tokenConsumed = false;

	// consume white space
	while (true) {
	if (Character.isWhitespace(ch)) {
	wind();
	continue;
	} else if (ch == '/') {
	mark();
	wind();
	if (ch == '/') {
	// a c++ style comment
	while (ch > 0 && ch != '\n') {
	wind();
	}
	} else if (ch == '*') {
	// a c style comment
	int oldCh;
	ch = '\n';
	do {
	oldCh = ch;
	wind();
	} while ((oldCh != '*' \|\| ch != '/') && (ch > 0));
	if (ch == EOF) {
	tokenType = TokenType.LEX_TOKEN_ERROR;
	return (tokenType);
	}
	wind();
	} else {
	// just a division operator
	cut();
	tokenType = TokenType.LEX_DIVIDE;
	yylval.setTokenType(tokenType);
	return (tokenType);
	}
	} else {
	break; // out of while( true ) loop
	}
	}

	// check if this is the end of the input
	if (ch == EOF) {
	tokenType = TokenType.LEX_END_OF_INPUT;
	yylval.setTokenType(tokenType);
	return tokenType;
	}

	// check the first character of the token
	if (ch == '-') {
	// Depending on the last token we saw, a minus may be the start
	// of an integer or real token. tokenizeNumber() does the right
	// thing if there is no subsequent integer or real.
	switch (tokenType) {
	case LEX_INTEGER_VALUE:
	case LEX_REAL_VALUE:
	case LEX_BOOLEAN_VALUE:
	case LEX_STRING_VALUE:
	case LEX_UNDEFINED_VALUE:
	case LEX_ERROR_VALUE:
	case LEX_IDENTIFIER:
	case LEX_SELECTION:
	case LEX_CLOSE_BOX:
	case LEX_CLOSE_PAREN:
	case LEX_CLOSE_BRACE:
	case LEX_BACKSLASH:
	case LEX_ABSOLUTE_TIME_VALUE:
	case LEX_RELATIVE_TIME_VALUE:
	tokenizePunctOperator();
	break;
	default:
	tokenizeNumber();
	break;
	}
	} else if (Character.isDigit(ch) \|\| ch == '.') {
	// tokenizeNumber() also takes care of the selection operator
	tokenizeNumber();

	} else if (Character.isAlphabetic(ch) \|\| ch == '_') {
	tokenizeAlphaHead();
	} else if (ch == '\"') {
	tokenizeString('\"'); // its a string literal
	} else if (ch == '\'') {
	tokenizeString('\''); // its a quoted attribute
	}

	else {
	tokenizePunctOperator();
	}

	if (debug) {
	System.out.printf("Peek: %s\n", strLexToken(tokenType));
	if (tokenType == TokenType.LEX_ERROR_VALUE) {
	System.out.println("Lexer problem");
	}
	}
	if (lvalp != null) {
	lvalp.copyFrom(yylval);
	}
	yylval.setTokenType(tokenType);
	return tokenType;
	}

	// Tokenize number constants:
	// 1. Integers: [-] 0[0-7]+ | 0[xX][0-9a-fA-F]+ | [0-9]+
	// 2. Reals : [-] [0-9]*\.[0-9]* (e|E) [+-]? [0-9]+
	/** Classification of the numeric literal currently being scanned. */
	enum NumberType { NONE, INTEGER, REAL }

	public TokenType tokenizeNumber() throws IOException {
	NumberType numberType = NumberType.NONE;
	NumberFactor f;
	long integer = 0;
	double real = 0;
	int och;

	och = ch;
	mark();
	wind();

	if (och == '-') {
	// This may be a negative number or the unary minus operator
	// The subsequent two characters will tell us which.
	if (Character.isDigit(ch)) {
	// It looks like a negative number, keep reading.
	och = ch;
	wind();
	} else if (ch == '.') {
	// This could be a real number or an attribute reference
	// starting with dot. Look at the second character.
	int ch2 = lexSource.readCharacter();
	if (ch2 >= 0) {
	lexSource.unreadCharacter();
	}
	if (!Character.isDigit(ch2)) {
	// It's not a real number, return a minus token.
	cut();
	tokenType = TokenType.LEX_MINUS;
	return tokenType;
	}
	// It looks like a negative real, keep reading.
	} else {
	// It's not a number, return a minus token.
	cut();
	tokenType = TokenType.LEX_MINUS;
	return tokenType;
	}
	}

	if (och == '0') {
	// number is octal, hex or real
	if (Character.toLowerCase(ch) == 'x') {
	// get hex digits only; parse hex-digit+
	numberType = NumberType.INTEGER;
	wind();
	if (!isxdigit(ch)) {
	cut();
	tokenType = TokenType.LEX_TOKEN_ERROR;
	return (tokenType);
	}
	while (isxdigit(ch)) {
	wind();
	}
	} else {
	// get octal or real
	numberType = NumberType.INTEGER;
	while (Character.isDigit(ch)) {
	wind();
	if (!isodigit(ch)) {
	// not an octal number
	numberType = NumberType.REAL;
	}
	}
	if (ch == '.' \|\| Character.toLowerCase(ch) == 'e') {
	numberType = NumberType.REAL;
	} else if (numberType == NumberType.REAL) {
	// non-octal digits, but not a real (no '.' or 'e')
	// so, illegal octal constant
	cut();
	tokenType = TokenType.LEX_TOKEN_ERROR;
	return (tokenType);
	}
	}
	} else if (Character.isDigit(och)) {
	// decimal or real; get digits
	while (Character.isDigit(ch)) {
	wind();
	}
	numberType = (ch == '.' \|\| Character.toLowerCase(ch) == 'e') ? NumberType.REAL : NumberType.INTEGER;
	}

	if (och == '.' \|\| ch == '.') {
	// fraction part of real or selection operator
	if (ch == '.') {
	wind();
	}
	if (Character.isDigit(ch)) {
	// real; get digits after decimal point
	numberType = NumberType.REAL;
	while (Character.isDigit(ch)) {
	wind();
	}
	} else {
	if (numberType != NumberType.NONE) {
	// initially like a number, but no digit following the '.'
	cut();
	tokenType = TokenType.LEX_TOKEN_ERROR;
	return (tokenType);
	}
	// selection operator
	cut();
	tokenType = TokenType.LEX_SELECTION;
	return (tokenType);
	}
	}

	// if we are tokenizing a real, the (optional) exponent part is left
	// i.e., [eE][+-]?[0-9]+
	if (numberType == NumberType.REAL && Character.toLowerCase(ch) == 'e') {
	wind();
	if (ch == '+' \|\| ch == '-') {
	wind();
	}
	if (!Character.isDigit(ch)) {
	cut();
	tokenType = TokenType.LEX_TOKEN_ERROR;
	return (tokenType);
	}
	while (Character.isDigit(ch)) {
	wind();
	}
	}

	if (numberType == NumberType.INTEGER) {
	cut();
	integer = Long.parseLong(lexBuffer.toString());
	} else if (numberType == NumberType.REAL) {
	cut();
	real = Double.parseDouble(lexBuffer.toString());
	} else {
	/* If we've reached this point, we have a serious programming
	* error: tokenizeNumber should only be called if we are
	* lexing a number or a selection, and we didn't find a number
	* or a selection. This should really never happen, so we
	* bomb if it does. It should be reported as a bug.
	*/
	throw new IOException("Should not reach here");
	}

	switch (Character.toUpperCase(ch)) {
	case 'B':
	f = NumberFactor.B_FACTOR;
	wind();
	break;
	case 'K':
	f = NumberFactor.K_FACTOR;
	wind();
	break;
	case 'M':
	f = NumberFactor.M_FACTOR;
	wind();
	break;
	case 'G':
	f = NumberFactor.G_FACTOR;
	wind();
	break;
	case 'T':
	f = NumberFactor.T_FACTOR;
	wind();
	break;
	default:
	f = NumberFactor.NO_FACTOR;
	}

	if (numberType == NumberType.INTEGER) {
	yylval.setIntValue(integer, f);
	yylval.setTokenType(TokenType.LEX_INTEGER_VALUE);
	tokenType = TokenType.LEX_INTEGER_VALUE;
	} else {
	yylval.setRealValue(real, f);
	yylval.setTokenType(TokenType.LEX_REAL_VALUE);
	tokenType = TokenType.LEX_REAL_VALUE;
	}

	return (tokenType);
	}

	/**
	 * Tells whether {@code ch} is an octal digit.
	 *
	 * @param ch the character to classify
	 * @return {@code true} for '0' through '7', {@code false} otherwise
	 */
	public static boolean isodigit(char ch) {
	    if (ch < '0') {
	        return false;
	    }
	    return ch < '8';
	}

	// Tokenize alpha head: (character sequences beginning with a letter or underscore)
	// 1. Reserved character sequences: true, false, error, undefined
	// 2. Identifier : [a-zA-Z_][a-zA-Z0-9_]*
	public TokenType tokenizeAlphaHead() throws IOException {
	mark();
	while (Character.isAlphabetic(ch)) {
	wind();
	}
	if (Character.isDigit(ch) \|\| ch == '_') {
	// The token is an identifier; consume the rest of the token
	wind();
	while (Character.isAlphabetic(ch) \|\| Character.isDigit(ch) \|\| ch == '_') {
	wind();
	}
	cut();
	tokenType = TokenType.LEX_IDENTIFIER;
	yylval.setStringValue(lexBuffer);
	return tokenType;
	}

	// check if the string is one of the reserved words; Case insensitive
	cut();
	if (isEqualIgnoreCase(TRUE_CHAR_ARRAY)) {
	tokenType = TokenType.LEX_BOOLEAN_VALUE;
	yylval.setBoolValue(true);
	} else if (isEqualIgnoreCase(FALSE_CHAR_ARRAY)) {
	tokenType = TokenType.LEX_BOOLEAN_VALUE;
	yylval.setBoolValue(false);
	} else if (isEqualIgnoreCase(UNDEFINED_CHAR_ARRAY)) {
	tokenType = TokenType.LEX_UNDEFINED_VALUE;
	} else if (isEqualIgnoreCase(ERROR_CHAR_ARRAY)) {
	tokenType = TokenType.LEX_ERROR_VALUE;
	} else if (isEqualIgnoreCase(IS_CHAR_ARRAY)) {
	tokenType = TokenType.LEX_META_EQUAL;
	} else if (isEqualIgnoreCase(ISNT_CHAR_ARRAY)) {
	tokenType = TokenType.LEX_META_NOT_EQUAL;
	} else {
	// token is a character only identifier
	tokenType = TokenType.LEX_IDENTIFIER;
	yylval.setStringValue(lexBuffer);
	}
	return tokenType;
	}

	/**
	 * Compares the accumulated token text with {@code compareTo} by
	 * delegating to {@code lexBuffer.isEqualsIgnoreCaseLower}.
	 *
	 * @param compareTo the word to compare against
	 * @return the result reported by the buffer
	 */
	private boolean isEqualIgnoreCase(char[] compareTo) {
	    final boolean matches = this.lexBuffer.isEqualsIgnoreCaseLower(compareTo);
	    return matches;
	}

	// tokenizeStringLiteral: Scans strings of the form " ... " or '...'
	// based on whether the argument passed was '\"' or '\''
	public TokenType tokenizeString(char delim) throws IOException {
	boolean stringComplete = false;

	// need to mark() after the quote
	wind();
	mark();

	while (!stringComplete) {
	boolean oddBackWhacks = false;
	char oldCh = 0;
	// consume the string literal; read upto " ignoring \"
	while ((ch > 0) && (ch != delim \|\| (ch == delim && oldCh == '\\' && oddBackWhacks))) {
	if (!oddBackWhacks && ch == '\\') {
	oddBackWhacks = true;
	} else {
	oddBackWhacks = false;
	}
	oldCh = ch;
	wind();
	}

	if (ch == delim) {
	char tempch = ' ';
	// read past the whitespace characters
	while (Character.isWhitespace(tempch)) {
	tempch = lexSource.readCharacter();
	}
	if (tempch != delim) { // a new token exists after the string
	if (tempch != EOF) {
	lexSource.unreadCharacter();
	}
	stringComplete = true;
	} else { // the adjacent string is to be concatenated to the existing string
	lexBuffer.erase(lexBuffer.getLength());// erase the lagging '\"'
	wind();
	}
	} else {
	// loop quit due to ch == 0 or ch == EOF
	tokenType = TokenType.LEX_TOKEN_ERROR;
	return tokenType;
	}
	}
	cut();
	wind(); // skip over the close quote
	boolean validStr = true; // to check if string is valid after converting escape
	validStr = Util.convertEscapes(lexBuffer);
	yylval.setStringValue(lexBuffer);
	if (validStr) {
	if (delim == '\"') {
	tokenType = TokenType.LEX_STRING_VALUE;
	} else {
	tokenType = TokenType.LEX_IDENTIFIER;
	}
	} else {
	tokenType = TokenType.LEX_TOKEN_ERROR; // string conatins a '\0' character inbetween
	}

	return tokenType;
	}

	// tokenizePunctOperator: Tokenize punctuation and operators
	public TokenType tokenizePunctOperator() throws IOException {
	// save character; may need to lookahead
	char oldch = ch;
	char extra_lookahead;

	mark();
	wind();
	switch (oldch) {
	// these cases don't need lookaheads
	case '.':
	tokenType = TokenType.LEX_SELECTION;
	break;

	case '*':
	tokenType = TokenType.LEX_MULTIPLY;
	break;

	case '/':
	tokenType = TokenType.LEX_DIVIDE;
	break;

	case '%':
	tokenType = TokenType.LEX_MODULUS;
	break;

	case '+':
	tokenType = TokenType.LEX_PLUS;
	break;

	case '-':
	tokenType = TokenType.LEX_MINUS;
	break;

	case '~':
	tokenType = TokenType.LEX_BITWISE_NOT;
	break;

	case '^':
	tokenType = TokenType.LEX_BITWISE_XOR;
	break;

	case '?':
	tokenType = TokenType.LEX_QMARK;
	break;

	case ':':
	tokenType = TokenType.LEX_COLON;
	break;

	case ';':
	tokenType = TokenType.LEX_SEMICOLON;
	break;

	case ',':
	tokenType = TokenType.LEX_COMMA;
	break;

	case '[':
	tokenType = TokenType.LEX_OPEN_BOX;
	break;

	case ']':
	tokenType = TokenType.LEX_CLOSE_BOX;
	break;

	case '(':
	tokenType = TokenType.LEX_OPEN_PAREN;
	break;

	case ')':
	tokenType = TokenType.LEX_CLOSE_PAREN;
	break;

	case '{':
	tokenType = TokenType.LEX_OPEN_BRACE;
	break;

	case '}':
	tokenType = TokenType.LEX_CLOSE_BRACE;
	break;

	// the following cases need lookaheads

	case '&':
	tokenType = TokenType.LEX_BITWISE_AND;
	if (ch == '&') {
	tokenType = TokenType.LEX_LOGICAL_AND;
	wind();
	}
	break;

	case '\|':
	tokenType = TokenType.LEX_BITWISE_OR;
	if (ch == '\|') {
	tokenType = TokenType.LEX_LOGICAL_OR;
	wind();
	}
	break;

	case '<':
	tokenType = TokenType.LEX_LESS_THAN;
	switch (ch) {
	case '=':
	tokenType = TokenType.LEX_LESS_OR_EQUAL;
	wind();
	break;

	case '<':
	tokenType = TokenType.LEX_LEFT_SHIFT;
	wind();
	break;

	default:
	// just the '<' --- no need to do anything
	break;
	}
	break;

	case '>':
	tokenType = TokenType.LEX_GREATER_THAN;
	switch (ch) {
	case '=':
	tokenType = TokenType.LEX_GREATER_OR_EQUAL;
	wind();
	break;

	case '>':
	tokenType = TokenType.LEX_RIGHT_SHIFT;
	wind();
	if (ch == '>') {
	tokenType = TokenType.LEX_URIGHT_SHIFT;
	wind();
	}
	break;

	default:
	// just the '>' --- no need to do anything
	break;
	}
	break;

	case '=':
	tokenType = TokenType.LEX_BOUND_TO;
	switch (ch) {
	case '=':
	tokenType = TokenType.LEX_EQUAL;
	wind();
	break;

	case '?':
	tokenType = TokenType.LEX_META_EQUAL;
	wind();

	// ensure the trailing '=' of the '=?=' combination
	if (ch != '=') {
	tokenType = TokenType.LEX_TOKEN_ERROR;
	return tokenType;
	}

	wind();
	break;

	case '!':
	extra_lookahead = lexSource.readCharacter();
	lexSource.unreadCharacter();
	if (extra_lookahead == '=') {
	tokenType = TokenType.LEX_META_NOT_EQUAL;
	wind();
	wind();
	}
	break;

	default:
	// just the '=' --- no need to do anything
	break;
	}
	break;

	case '!':
	tokenType = TokenType.LEX_LOGICAL_NOT;
	switch (ch) {
	case '=':
	tokenType = TokenType.LEX_NOT_EQUAL;
	wind();
	break;

	default:
	// just the '!' --- no need to do anything
	break;
	}
	break;

	default:
	tokenType = TokenType.LEX_TOKEN_ERROR;
	return tokenType;
	}

	// cut the token and return
	cut();
	return tokenType;
	}

	// strLexToken: Return string representation of token type
	/**
	 * Returns the printable name of a token type. Every constant of
	 * {@link TokenType} maps to its own identifier, so the name is taken
	 * directly from the enum rather than from a hand-maintained table.
	 *
	 * @param tokenType the token type to describe; may be {@code null}
	 * @return the constant's name, or {@code "** Unknown **"} when
	 *         {@code tokenType} is {@code null}
	 */
	public static String strLexToken(TokenType tokenType) {
	    if (tokenType == null) {
	        return "** Unknown **";
	    }
	    return tokenType.name();
	}

	/**
	 * Returns the character source this lexer is currently reading from.
	 *
	 * @return the value of the {@code lexSource} field
	 */
	public LexerSource getLexSource() {
	    return this.lexSource;
	}
	}