jena-arq/src/main/java/org/apache/jena/riot/tokens/TokenizerText.java - jena - Git at Google

 /*
  * Licensed to the Apache Software Foundation (ASF) under one
  * or more contributor license agreements.  See the NOTICE file
  * distributed with this work for additional information
  * regarding copyright ownership.  The ASF licenses this file
  * to you under the Apache License, Version 2.0 (the
  * "License"); you may not use this file except in compliance
  * with the License.  You may obtain a copy of the License at
  *
  *     http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */

 package org.apache.jena.riot.tokens;

 import static org.apache.jena.atlas.lib.Chars.*;
 import static org.apache.jena.riot.system.RiotChars.*;

 import java.util.NoSuchElementException;
 import java.util.Objects;

 import org.apache.jena.atlas.AtlasException;
 import org.apache.jena.atlas.io.IO;
 import org.apache.jena.atlas.io.PeekReader;
 import org.apache.jena.atlas.lib.Chars;
 import org.apache.jena.riot.RiotParseException;
 import org.apache.jena.riot.system.ErrorHandler;
 import org.apache.jena.riot.system.RiotChars;
 import org.apache.jena.sparql.ARQInternalErrorException;

 /** Tokenizer for all sorts of things RDF-ish */
 public final class TokenizerText implements Tokenizer
 {
     // Drop through to final general symbol/keyword reader, including <=, !=
     // Care with <=
     // Policy driven for CURIES?

     // Same value as CH_STAR.
     // NOTE(review): presumably the lead character of control tokens - confirm against users of this constant.
     public static final int CTRL_CHAR = CH_STAR;

     // The code has the call points for checking tokens but it is generally better to
     // do the check later in the parsing process. In case a need arises, the code
     // remains, all compiled away by "if ( false )" (javac does not generate any
     // bytecodes and, even if it did, the JIT will remove dead branches).
     private static final boolean Checking = false;

     // One-token lookahead buffer: filled by hasNext(), handed out and cleared by next().
     private Token token = null;
     // Scratch buffer shared by the read* methods; each of them resets it with setLength(0) before use.
     private final StringBuilder stringBuilder = new StringBuilder(200);
     // Character source with peek and pushback.
     private final PeekReader reader;
     // Whether whitespace between tokens includes newlines (in various forms).
     private final boolean lineMode;
     // Set by hasNext() once the end of input has been reached; hasNext() then always returns false.
     private boolean finished = false;
     private TokenChecker checker = null;

     // The code assumes that errors throw exception and so stop parsing.
     private final ErrorHandler errorHandler;

     /** Return a newly constructed {@code TokenizeTextBuilder}. */
     public static TokenizeTextBuilder create() {
         return new TokenizeTextBuilder();
     }

     /**
      * Package-visible factory that passes its arguments straight to the private constructor.
      *
      * @param reader       character source; must not be null
      * @param lineMode     whether newlines are kept out of inter-token whitespace
      * @param errorHandler receiver of warnings and errors; must not be null
      * @return a new tokenizer over {@code reader}
      */
     /*package*/ static TokenizerText internal(PeekReader reader, boolean lineMode, ErrorHandler errorHandler) {
         TokenizerText tokenizer = new TokenizerText(reader, lineMode, errorHandler);
         return tokenizer;
     }
     /**
      * Build a tokenizer. Both object arguments are null-checked, reader first.
      *
      * @throws NullPointerException if {@code reader} or {@code errorHandler} is null
      */
     private TokenizerText(PeekReader reader, boolean lineMode, ErrorHandler errorHandler) {
         Objects.requireNonNull(reader, "PeekReader");
         Objects.requireNonNull(errorHandler, "ErrorHandler");
         this.reader = reader;
         this.lineMode = lineMode;
         this.errorHandler = errorHandler;
     }

     /**
      * Test whether another token is available, reading one into the lookahead
      * buffer if necessary.
      * <p>
      * Once the end of input has been seen this always returns {@code false}.
      *
      * @return {@code true} if a token is buffered and ready for {@link #next()}
      * @throws RiotParseException if reading the input raised an {@code AtlasException};
      *         the message distinguishes a character-encoding failure from other causes
      */
     @Override
     public final boolean hasNext() {
         if ( finished )
             return false;
         if ( token != null )
             return true;

         try {
             skip();
             boolean exhausted = reader.eof();
             if ( !exhausted ) {
                 token = parseToken();
                 exhausted = (token == null);
             }
             if ( exhausted ) {
                 // close();
                 finished = true;
             }
             return !exhausted;
         } catch (AtlasException ex) {
             // Choose the message from the underlying cause, then report at the reader's current position.
             Throwable cause = ex.getCause();
             String message;
             if ( cause == null )
                 message = "Bad input stream";
             else if ( cause.getClass() == java.nio.charset.MalformedInputException.class )
                 message = "Bad character encoding";
             else
                 message = "Bad input stream [" + cause + "]";
             throw new RiotParseException(message, reader.getLineNum(), reader.getColNum());
         }
     }

     /** True when no further token is available; the exact negation of {@link #hasNext()}. */
     @Override
     public final boolean eof() {
         boolean moreTokens = hasNext();
         return !moreTokens;
     }

     /**
      * Return the buffered token and empty the lookahead buffer.
      *
      * @throws NoSuchElementException if there are no more tokens
      */
     @Override
     public final Token next() {
         if ( hasNext() ) {
             Token result = token;
             token = null;
             return result;
         }
         throw new NoSuchElementException();
     }

     /** Return the next token without consuming it, or {@code null} when there are no more tokens. */
     @Override
     public final Token peek() {
         return hasNext() ? token : null;
     }

     /** Not supported: always throws {@link UnsupportedOperationException}. */
     @Override
     public void remove() {
         throw new UnsupportedOperationException();
     }

 //    private TokenChecker getChecker() {
 //        return checker;
 //    }
 //
 //    private void setChecker(TokenChecker checker) {
 //        this.checker = checker;
 //    }
 //
 //    private ErrorHandler getErrorHandler() {
 //        return errorHandler;
 //    }
 //
 //    private void setErrorHandler(ErrorHandler handler) {
 //        this.errorHandler = handler;
 //    }

     /** Close the underlying reader via {@code IO.close}. */
     @Override
     public void close() {
         IO.close(reader);
     }

     // ---- Machinery

     /**
      * Consume whitespace and '#' comments up to the start of the next token.
      * <p>
      * A comment runs from '#' up to, but not including, the next newline character
      * (or end of input). In line mode only horizontal whitespace is consumed, so the
      * newline that ends a comment is left in the reader; otherwise newlines are
      * consumed as whitespace.
      */
     private void skip() {
         while ( !reader.eof() ) {
             int c = reader.peekChar();
             if ( c == CH_HASH ) {
                 // Comment: drop the '#' and everything up to the newline or EOF.
                 reader.readChar();
                 c = reader.peekChar();
                 while ( c != EOF && !isNewlineChar(c) ) {
                     reader.readChar();
                     c = reader.peekChar();
                 }
             }

             // After a comment, c is the newline (or EOF) that ended it.
             boolean skippable = lineMode ? isHorizontalWhitespace(c) : isWhitespace(c);
             if ( !skippable )
                 return;
             reader.readChar();
         }
     }

     /**
      * Read one token, starting at the next unread character, and return it.
      * <p>
      * A fresh {@code Token} stamped with the current line and column is stored in the
      * {@code token} field and then filled in. The first character decides the kind of
      * token, tested in this order: '&lt;' (IRI or "&lt;&lt;"), quote (string, optionally
      * followed by a language tag or "^^" datatype), '_' (blank node label or a lone
      * underscore), '@' (directive), '?' (variable), single/double character symbols,
      * signed or unsigned numbers, newline runs, and finally a keyword or prefixed name.
      * <p>
      * For a "^^" datatype this method calls itself to read the datatype token; the
      * recursive call replaces the {@code token} field, so the string token is copied
      * into a separate main token first.
      *
      * @return the token read
      */
     private Token parseToken() {
         token = new Token(getLine(), getColumn());

         int ch = reader.peekChar();

         // ---- IRI, unless it's <<.
         // [spc] check is for LT.
         if ( ch == CH_LT ) {
             // Look ahead on char
             reader.readChar();
             int chPeek = reader.peekChar();
             if ( chPeek != '<' ) {
                 token.setImage(readIRI());
                 token.setType(TokenType.IRI);
                 if ( Checking )
                     checkURI(token.getImage());
                 return token;
             }
             if ( chPeek == '<' ) {
                 reader.readChar();
                 token.setType(TokenType.LT2);
                 //token.setImage("<<");
                 return token;
             }
             // Not reachable: the two tests on chPeek above are exhaustive and both return.
             fatal("Internal error - parsed '"+chPeek+"' after '<'");
         }

         // ---- Literal
         if ( ch == CH_QUOTE1 || ch == CH_QUOTE2 ) {
             // The token type is STRING.
             // We incorporate this into a token for LITERAL_LANG or LITERAL_DT.
             token.setType(TokenType.STRING);

             reader.readChar();
             int ch2 = reader.peekChar();
             if ( ch2 == ch ) {
                 reader.readChar(); // Read potential second quote.
                 int ch3 = reader.peekChar();
                 if ( ch3 == ch ) {
                     reader.readChar();     // Read potential third quote.
                     token.setImage(readLongString(ch, false));
                     StringType st = (ch == CH_QUOTE1) ? StringType.LONG_STRING1 : StringType.LONG_STRING2;
                     token.setStringType(st);
                 } else {
                     // Two quotes then a non-quote.
                     // Must be '' or ""
                     // No need to pushback characters as we know the lexical
                     // form is the empty string.
                     // if ( ch2 != EOF ) reader.pushbackChar(ch2);
                     // if ( ch1 != EOF ) reader.pushbackChar(ch1); // Must be
                     // '' or ""
                     token.setImage("");
                     StringType st = (ch == CH_QUOTE1) ? StringType.STRING1 : StringType.STRING2;
                     token.setStringType(st);
                 }
             } else {
                 // One quote character.
                 token.setImage(readString(ch, ch));
                 // Record exactly what form of STRING was seen.
                 StringType st = (ch == CH_QUOTE1) ? StringType.STRING1 : StringType.STRING2;
                 token.setStringType(st);
             }

             // White space after lexical part of a literal.
             // (skip() also consumes comments and, outside line mode, newlines.)
             skip();

             // Literal. Is it @ or ^^
             if ( reader.peekChar() == CH_AT ) {
                 reader.readChar();
                 // White space is not legal here.
                 // The Turtle spec terminal is "LANGTAG" which includes the '@'.
                 Token mainToken = new Token(token);
                 mainToken.setType(TokenType.LITERAL_LANG);
                 mainToken.setSubToken1(token);
                 mainToken.setImage2(langTag());
                 token = mainToken;
                 if ( Checking )
                     checkLiteralLang(token.getImage(), token.getImage2());
             } else if ( reader.peekChar() == '^' ) {
                 expect("^^");
                 // White space is legal after a ^^.
                 // It's not a good idea, but it is legal.
 //                // Check no whitespace.
 //                int nextCh = reader.peekChar();
 //                if ( isWhitespace(nextCh) )
 //                    exception("No whitespace after ^^ in literal with datatype");
                 skip();

                 // Stash current token.
                 Token mainToken = new Token(token);
                 mainToken.setSubToken1(token);
                 mainToken.setImage(token.getImage());

                 // Recursive call: this overwrites the 'token' field, hence the stash above.
                 Token subToken = parseToken();
                 if ( !subToken.isIRI() )
                     fatal("Datatype URI required after ^^ - URI or prefixed name expected");

                 mainToken.setSubToken2(subToken);
                 mainToken.setType(TokenType.LITERAL_DT);

                 token = mainToken;
                 if ( Checking )
                     checkLiteralDT(token.getImage(), subToken);
             } else {
                 // Was a simple string.
                 if ( Checking )
                     checkString(token.getImage());
             }
             return token;
         }

         // ---- Blank node label "_:label", or a lone underscore.
         if ( ch == CH_UNDERSCORE ) {
             reader.readChar();
             int ch2 = reader.peekChar();
             if ( ch2 == CH_COLON ) {
                 reader.readChar();
                 // Blank node :label must be at least one char
                 token.setImage(readBlankNodeLabel());
                 token.setType(TokenType.BNODE);
                 if ( Checking ) checkBlankNode(token.getImage());
                 return token;
             }
             token.setType(TokenType.UNDERSCORE);
             /*token.setImage(CH_UNDERSCORE);*/
             return token;
         }

         // A directive (not part of a literal as lang tag)
         if ( ch == CH_AT ) {
             reader.readChar();
             token.setType(TokenType.DIRECTIVE);
             token.setImage(readWord(false));
             if ( Checking )
                 checkDirective(token.cntrlCode);
             return token;
         }

         // Variable
         if ( ch == CH_QMARK ) {
             reader.readChar();
             token.setType(TokenType.VAR);
             // Character set?
             token.setImage(readVarName());
             if ( Checking )
                 checkVariable(token.getImage());
             return token;
         }

         // ---- Symbols. Every case returns; characters with no case fall out of the switch.
         switch(ch)
         {
             // DOT can start a decimal.  Check for digit.
             case CH_DOT:
                 reader.readChar();
                 ch = reader.peekChar();
                 if ( range(ch, '0', '9') ) {
                     // Not a DOT after all.
                     reader.pushbackChar(CH_DOT);
                     readNumber();
                     if ( Checking )
                         checkNumber(token.getImage(), token.getImage2());
                     return token;
                 }
                 token.setType(TokenType.DOT);
                 return token;

             case CH_GT: {
                 reader.readChar();
                 int chPeek = reader.peekChar();
                 if ( chPeek == CH_GT ) {
                     // ">>"
                     reader.readChar();
                     token.setType(TokenType.GT2);
                     return token;
                 }
                 token.setType(TokenType.GT);
                 //token.setImage(">>");
                 return token;
             }

             case CH_SEMICOLON:  reader.readChar(); token.setType(TokenType.SEMICOLON); /*token.setImage(CH_SEMICOLON);*/ return token;
             case CH_COMMA:      reader.readChar(); token.setType(TokenType.COMMA);     /*token.setImage(CH_COMMA);*/ return token;
             case CH_LBRACE:     reader.readChar(); token.setType(TokenType.LBRACE);    /*token.setImage(CH_LBRACE);*/ return token;
             case CH_RBRACE:     reader.readChar(); token.setType(TokenType.RBRACE);    /*token.setImage(CH_RBRACE);*/ return token;
             case CH_LPAREN:     reader.readChar(); token.setType(TokenType.LPAREN);    /*token.setImage(CH_LPAREN);*/ return token;
             case CH_RPAREN:     reader.readChar(); token.setType(TokenType.RPAREN);    /*token.setImage(CH_RPAREN);*/ return token;
             case CH_LBRACKET:   reader.readChar(); token.setType(TokenType.LBRACKET);  /*token.setImage(CH_LBRACKET);*/ return token;
             case CH_RBRACKET:   reader.readChar(); token.setType(TokenType.RBRACKET);  /*token.setImage(CH_RBRACKET);*/ return token;
             case CH_EQUALS:     reader.readChar(); token.setType(TokenType.EQUALS);    /*token.setImage(CH_EQUALS);*/ return token;
             case CH_SLASH:      reader.readChar(); token.setType(TokenType.SLASH);     /*token.setImage(CH_SLASH);*/ return token;
             case CH_RSLASH:     reader.readChar(); token.setType(TokenType.RSLASH);    /*token.setImage(CH_RSLASH);*/ return token;
             case CH_VBAR:       reader.readChar(); token.setType(TokenType.VBAR);      /*token.setImage(CH_VBAR);*/ return token;
             case CH_AMPHERSAND: reader.readChar(); token.setType(TokenType.AMPHERSAND);/*token.setImage(CH_AMPHERSAND);*/ return token;
             // Specials (if blank node processing off)
             //case CH_COLON:      reader.readChar(); token.setType(TokenType.COLON); /*token.setImage(COLON);*/return token;

             // Done above with blank nodes.
             //case CH_UNDERSCORE: reader.readChar(); token.setType(TokenType.UNDERSCORE);/*token.setImage(CH_UNDERSCORE);*/ return token;
             // The CH_LT case is not reached: '<' is fully handled, and returned from, at the top of this method.
             case CH_LT:         reader.readChar(); token.setType(TokenType.LT);        /*token.setImage(CH_LT);*/ return token;
             case CH_STAR:       reader.readChar(); token.setType(TokenType.STAR);      /*token.setImage(CH_STAR);*/ return token;

             // XXX Multi-character symbols
             // Two character tokens && || GE >= , LE <=
             //TokenType.LE
             //TokenType.GE
             //TokenType.LOGICAL_AND
             //TokenType.LOGICAL_OR
         }

         // ---- Numbers.
         // A plain "+" and "-", not followed by a digit, are symbols.

         /*
         [16]    integer         ::=     ('-' | '+') ? [0-9]+
         [17]    double          ::=     ('-' | '+') ? ( [0-9]+ '.' [0-9]* exponent | '.' ([0-9])+ exponent | ([0-9])+ exponent )
                                         0.e0, .0e0, 0e0
         [18]    decimal         ::=     ('-' | '+')? ( [0-9]+ '.' [0-9]* | '.' ([0-9])+ | ([0-9])+ )
                                         0.0 .0 0.
         [19]    exponent        ::=     [eE] ('-' | '+')? [0-9]+
         []      hex             ::=     0x0123456789ABCDEFG

         */

         // TODO readNumberNoSign

         // NOTE: signCh is assigned below but is not read anywhere in this method.
         int signCh = 0;

         if ( ch == CH_PLUS || ch == CH_MINUS ) {
             reader.readChar();
             int ch2 = reader.peekChar();

             if ( !range(ch2, '0', '9') ) {
                 // ch was end of symbol.
                 // reader.readChar();
                 if ( ch == CH_PLUS )
                     token.setType(TokenType.PLUS);
                 else
                     token.setType(TokenType.MINUS);
                 return token;
             }

             // Already got a + or - ...
             // readNumberNoSign
             // Because next, old code processes signs.
             reader.pushbackChar(ch);
             signCh = ch;
             // Drop to next "if"
         }

         // A sign reaching this point is known to be followed by a digit and has been pushed back.
         if ( ch == CH_PLUS || ch == CH_MINUS || range(ch, '0', '9') ) {
             // readNumberNoSign
             readNumber();
             if ( Checking )
                 checkNumber(token.getImage(), token.getImage2());
             return token;
         }

         if ( isNewlineChar(ch) ) {
             //** - If collecting token image.
             //** stringBuilder.setLength(0);
             // Any number of NL and CR become one "NL" token.
             do {
                 // ch2 is only needed by the commented-out image collection below.
                 int ch2 = reader.readChar();
                 //** stringBuilder.append((char)ch2);
             } while (isNewlineChar(reader.peekChar()));
             token.setType(TokenType.NL);
             //** token.setImage(stringBuilder.toString());
             return token;
         }

         // Plain words and prefixes.
         //   Can't start with a number due to numeric test above.
         //   Can't start with a '_' due to blank node test above.
         // If we see a :, the first time it means a prefixed name else it's a token break.

         readPrefixedNameOrKeyword(token);

         if ( Checking ) checkKeyword(token.getImage());
         return token;
     }

     // When true, readIRI processes '\' with readLiteralEscape (so \n etc. as well as unicode
     // escapes) and does not warn about illegal or non-ucschar characters.
     private static final boolean VeryVeryLaxIRI = false;
     // Spaces in IRI are illegal.
     // When true, a space in an IRI is reported as a warning rather than an error.
     private static final boolean AllowSpacesInIRI = false;

     // [8]  IRIREF  ::= '<' ([^#x00-#x20<>"{}|^`\] | UCHAR)* '>'
     /**
      * Read the body of an IRI up to and including the closing '&gt;'.
      * The opening '&lt;' must already have been consumed.
      * <p>
      * End of file, newline, carriage return and a nested '&lt;' are fatal. A tab or a
      * space is reported as an error (a space is only a warning if
      * {@code AllowSpacesInIRI} is set). Other characters excluded by the IRIREF
      * grammar, including every control character below 0x20, and characters at or
      * above 0xA0 that are not RFC 3987 "ucschar", are reported as warnings and kept
      * in the result. A backslash starts a unicode escape whose decoded value is
      * inserted without a legality check.
      *
      * @return the IRI text without the angle brackets; {@code null} on the paths
      *         that follow a fatal or tab error report, should the report return
      */
     private String readIRI() {
         stringBuilder.setLength(0);
         for (;;) {
             int ch = reader.readChar();
             switch(ch) {
                 case EOF:
                     fatal("Broken IRI (End of file)"); return null;
                 case NL:
                     fatal("Broken IRI (newline): %s", stringBuilder.toString()); return null;
                 case CR:
                     fatal("Broken IRI (CR): %s", stringBuilder.toString()); return null;
                 case CH_GT:
                     // Done!
                     return stringBuilder.toString();
                 case CH_RSLASH:
                     if ( VeryVeryLaxIRI )
                         // Includes unicode escapes and also \n etc
                         ch = readLiteralEscape();
                     else
                         // NORMAL
                         ch = readUnicodeEscape();
                     // Don't check legality of ch (strict syntax at this point).
                     // That does not mean it is a good idea to bypass checking.
                     // Bad characters will lead to trouble elsewhere.
                     break;
                 case CH_LT:
                     // Probably a corrupt file so treat as fatal.
                     fatal("Bad character in IRI (bad character: '<'): <%s[<]...>", stringBuilder.toString()); return null;
                 case TAB:
                     error("Bad character in IRI (Tab character): <%s[tab]...>", stringBuilder.toString()); return null;
                 case '{': case '}': case '"': case '|': case '^': case '`' :
                     if ( ! VeryVeryLaxIRI )
                         warning("Illegal character in IRI (codepoint 0x%02X, '%c'): <%s[%c]...>", ch, (char)ch, stringBuilder.toString(), (char)ch);
                     break;
                 case SPC:
                     if ( ! AllowSpacesInIRI )
                         error("Bad character in IRI (space): <%s[space]...>", stringBuilder.toString());
                     else
                         warning("Bad character in IRI (space): <%s[space]...>", stringBuilder.toString());
                     break;
                 default:
                     // Remaining control characters. IRIREF excludes #x00-#x20 and space (0x20)
                     // has its own case, so the upper bound is 0x1F (it was 0x19, which let
                     // 0x1A-0x1F through without any report).
                     if ( ch <= 0x1F )
                         warning("Illegal character in IRI (control char 0x%02X): <%s[0x%02X]...>", ch, stringBuilder.toString(), ch);

             }
             // JENA-1924: jena-iri does not catch this.
             if ( ! VeryVeryLaxIRI && ch >= 0xA0 && ! isUcsChar(ch) )
                 warning("Illegal character in IRI (Not a ucschar: 0x%04X): <%s[U+%04X]...>", ch, stringBuilder.toString(), ch);
             insertCodepoint(stringBuilder, ch);
         }
     }

     /**
      * Test whether a code point is in the RFC 3987 {@code ucschar} set.
      *
      * @param ch the code point to test
      * @return {@code true} if {@code ch} falls in one of the ucschar ranges listed below
      */
     private static boolean isUcsChar(int ch) {
         // RFC 3987
         // ucschar    = %xA0-D7FF / %xF900-FDCF / %xFDF0-FFEF
         //            / %x10000-1FFFD / %x20000-2FFFD / %x30000-3FFFD
         //            / %x40000-4FFFD / %x50000-5FFFD / %x60000-6FFFD
         //            / %x70000-7FFFD / %x80000-8FFFD / %x90000-9FFFD
         //            / %xA0000-AFFFD / %xB0000-BFFFD / %xC0000-CFFFD
         //            / %xD0000-DFFFD / %xE1000-EFFFD
         boolean b = range(ch, 0xA0, 0xD7FF)  || range(ch, 0xF900, 0xFDCF)  || range(ch, 0xFDF0, 0xFFEF);
         if ( b )
             return true;
         // Every remaining range starts at 0x10000 or above, so anything still in the
         // 16 bit range cannot match. (The bound was 0x1000, which gave the same answer
         // but sent most 16 bit misses through all the checks below.)
         if ( ch < 0x10000 )
             return false;
         // 32 bit checks.
         return
             range(ch, 0x10000, 0x1FFFD) || range(ch, 0x20000, 0x2FFFD) || range(ch, 0x30000, 0x3FFFD) ||
             range(ch, 0x40000, 0x4FFFD) || range(ch, 0x50000, 0x5FFFD) || range(ch, 0x60000, 0x6FFFD) ||
             range(ch, 0x70000, 0x7FFFD) || range(ch, 0x80000, 0x8FFFD) || range(ch, 0x90000, 0x9FFFD) ||
             range(ch, 0xA0000, 0xAFFFD) || range(ch, 0xB0000, 0xBFFFD) || range(ch, 0xC0000, 0xCFFFD) ||
             range(ch, 0xD0000, 0xDFFFD) || range(ch, 0xE1000, 0xEFFFD);
     }

     /**
      * Read a unicode escape; the leading backslash must already have been consumed.
      * Only {@code u} (4 hex digit form) and {@code U} (8 hex digit form) are accepted,
      * so there is no "\\" bypass.
      *
      * @return the value produced by the 4- or 8-digit escape reader; 0 on the path
      *         after the fatal report for any other escape letter
      */
     private final int readUnicodeEscape() {
         final int marker = reader.readChar();
         if ( marker == EOF )
             fatal("Broken escape sequence");

         if ( marker == 'u' )
             return readUnicode4Escape();
         if ( marker == 'U' )
             return readUnicode8Escape();

         fatal("Illegal unicode escape sequence value: \\%c (0x%02X)", marker, marker);
         return 0;
     }

     /**
      * Read either a keyword or a prefixed name into {@code token}.
      * <p>
      * The prefix part (possibly empty) is read first and stored as the image with
      * type KEYWORD. If a ':' follows, the type becomes PREFIXED_NAME and the local
      * part is read into the second image.
      * <p>
      * If no character was consumed at all the input cannot start a token here and a
      * fatal error is raised.
      *
      * @param token the token to fill in
      */
     private void readPrefixedNameOrKeyword(Token token) {
         long posn = reader.getPosition();
         String prefixPart = readPrefixPart(); // Prefix part or keyword
         token.setImage(prefixPart);
         token.setType(TokenType.KEYWORD);
         int ch = reader.peekChar();
         if ( ch == CH_COLON ) {
             reader.readChar();
             token.setType(TokenType.PREFIXED_NAME);
             String ln = readLocalPart(); // Local part
             token.setImage2(ln);
             if ( Checking )
                 checkPrefixedName(token.getImage(), token.getImage2());
         }

         // If we made no progress, nothing found, not even a keyword -- it's an
         // error.
         if ( posn == reader.getPosition() ) {
             // ch can be EOF here (e.g. input ends where a datatype is expected).
             // EOF is not a valid code point for the %c conversion, so report it
             // with a plain message instead of formatting it.
             if ( ch == EOF )
                 fatal("Failed to find a prefix name or keyword: end of file");
             else
                 fatal("Failed to find a prefix name or keyword: %c(%d;0x%04X)", ch, ch, ch);
         }

         if ( Checking )
             checkKeyword(token.getImage());

     }

     /*
     The token rules from SPARQL and Turtle.
     PNAME_NS       ::=  PN_PREFIX? ':'
     PNAME_LN       ::=  PNAME_NS PN_LOCAL

     PN_CHARS_BASE  ::=  [A-Z] | [a-z] | [#x00C0-#x00D6] | [#x00D8-#x00F6] | [#x00F8-#x02FF] | [#x0370-#x037D] | [#x037F-#x1FFF] | [#x200C-#x200D] | [#x2070-#x218F] | [#x2C00-#x2FEF] | [#x3001-#xD7FF] | [#xF900-#xFDCF] | [#xFDF0-#xFFFD] | [#x10000-#xEFFFF]
     PN_CHARS_U  ::=  PN_CHARS_BASE | '_'
     PN_CHARS  ::=  PN_CHARS_U | '-' | [0-9] | #x00B7 | [#x0300-#x036F] | [#x203F-#x2040]

     PN_PREFIX  ::=  PN_CHARS_BASE ((PN_CHARS|'.')* PN_CHARS)?
     PN_LOCAL  ::=  (PN_CHARS_U | ':' | [0-9] | PLX ) ((PN_CHARS | '.' | ':' | PLX)* (PN_CHARS | ':' | PLX) )?
     PLX  ::=  PERCENT | PN_LOCAL_ESC
     PERCENT  ::=  '%' HEX HEX
     HEX  ::=  [0-9] | [A-F] | [a-f]
     PN_LOCAL_ESC  ::=  '\' ( '_' | '~' | '.' | '-' | '!' | '$' | '&' | "'" | '(' | ')' | '*' | '+' | ',' | ';' | '=' | '/' | '?' | '#' | '@' | '%' )
     */

     /** Read the prefix part of a prefixed name (or a keyword): {@code readSegment} in prefix mode. */
     private String readPrefixPart() {
         return readSegment(false);
     }

     /** Read the local part of a prefixed name: {@code readSegment} in local-part mode. */
     private String readLocalPart() {
         return readSegment(true);
     }

     /**
      * Read the prefix or the local name part of a prefixed name.
      * <pre>
      * Prefix: PN_CHARS_BASE                       ((PN_CHARS|'.')* PN_CHARS)?
      * Local: ( PN_CHARS_U | ':' | [0-9] | PLX )   ((PN_CHARS | '.' | ':' | PLX)* (PN_CHARS | ':' | PLX) )?
      *    PN_CHARS_U is PN_CHARS_BASE and '_'
      * </pre>
      * A '.' may not end the segment: each dot is held back until another valid
      * character follows it, and a dot still pending at the end is pushed back to
      * the reader.
      *
      * @param isLocalPart {@code true} for the local part (also allows ':' and the
      *        '%' / '\' PLX forms), {@code false} for the prefix part
      * @return the segment text; "" when there are no valid characters, e.g. the
      *         prefix of ":foo" or the local name of "ex:"
      */
     private String readSegment(boolean isLocalPart) {
         stringBuilder.setLength(0);

         // -- First character. RiotChars.isPNChars_U_N covers ( PN_CHARS_U | [0-9] ).
         int first = reader.peekChar();
         if ( first == EOF )
             return "";
         if ( !isLocalPart ) {
             if ( !RiotChars.isPNCharsBase(first) )
                 return "";
             stringBuilder.append((char)first);
             reader.readChar();
         } else if ( first == CH_COLON ) {
             reader.readChar();
             stringBuilder.append((char)first);
         } else if ( first == CH_PERCENT || first == CH_RSLASH ) {
             // Consume the % or \ and let processPLX deal with what follows.
             reader.readChar();
             processPLX(first);
         } else if ( RiotChars.isPNChars_U_N(first) ) {
             stringBuilder.append((char)first);
             reader.readChar();
         } else {
             return "";
         }

         // -- Remaining characters.
         // True while a '.' has been consumed but not yet appended.
         boolean pendingDot = false;

         for (;;) {
             int c = reader.peekChar();

             if ( isLocalPart && (c == CH_PERCENT || c == CH_RSLASH) ) {
                 reader.readChar();
                 if ( pendingDot ) {
                     stringBuilder.append((char)CH_DOT);
                     pendingDot = false;
                 }
                 processPLX(c);
                 continue;
             }

             boolean accepted = (isLocalPart && c == CH_COLON) || isPNChars(c) || c == CH_DOT;
             if ( !accepted )
                 break;

             reader.readChar();
             // A dot held back from the previous round is now known not to be last.
             if ( pendingDot ) {
                 stringBuilder.append((char)CH_DOT);
                 pendingDot = false;
             }

             if ( c == CH_DOT )
                 // Hold the dot until the next round.
                 pendingDot = true;
             else
                 stringBuilder.append((char)c);
         }

         // A dot that was never followed by a valid character is not part of the segment.
         if ( pendingDot )
             reader.pushbackChar(CH_DOT);
         return stringBuilder.toString();
     }

     /**
      * Process PLX (percent or character escape for a prefixed name) and append the
      * result to the shared string builder.
      * <p>
      * For '%' the percent sign and the two following hex digits are appended as
      * written. For '\' the character returned by {@code readCharEscape()} is appended.
      *
      * @param ch the introducing character, already consumed by the caller:
      *        CH_PERCENT or CH_RSLASH
      * @throws ARQInternalErrorException if {@code ch} is neither of those
      */
     private void processPLX(int ch)
     {
         if ( ch == CH_PERCENT )
         {
             stringBuilder.append((char)ch);

             // Exactly two hex digits must follow the '%'.
             for ( int i = 0 ; i < 2 ; i++ ) {
                 int hex = reader.peekChar();
                 // EOF is not a valid code point for the %c conversion, so it gets
                 // its own message rather than being formatted.
                 if ( hex == EOF )
                     fatal("Not a hex character: end of file");
                 else if ( ! isHexChar(hex) )
                     fatal("Not a hex character: '%c'", hex);
                 stringBuilder.append((char)hex);
                 reader.readChar();
             }
         }
         else if ( ch == CH_RSLASH )
         {
             ch = readCharEscape();
             stringBuilder.append((char)ch);
         }
         else
             throw new ARQInternalErrorException("Not a '\\' or a '%' character");
     }

     /**
      * Read a single-line quoted string. The opening delimiter must already have been
      * consumed; the closing delimiter is consumed but not included in the result.
      * A backslash introduces a literal escape, which is decoded. End of file or a
      * newline before the closing delimiter is fatal.
      *
      * @param startCh the opening delimiter (not used by this method)
      * @param endCh   the closing delimiter
      * @return the string contents with escapes decoded
      */
     private String readString(int startCh, int endCh) {
         // Position at entry; passed along with the fatal reports below.
         final long startLine = getLine();
         final long startCol = getColumn();
         stringBuilder.setLength(0);

         while ( true ) {
             int c = reader.readChar();
             if ( c == EOF )
                 fatal("Broken token: " + stringBuilder.toString(), startLine, startCol);
             if ( c == NL )
                 fatal("Broken token (newline): " + stringBuilder.toString(), startLine, startCol);
             if ( c == endCh )
                 return stringBuilder.toString();

             int codepoint = (c == CH_RSLASH) ? readLiteralEscape() : c;
             insertCodepoint(stringBuilder, codepoint);
         }
     }

     /**
      * Read a long (triple-quoted) string. The three opening quotes must already have
      * been consumed. Reading stops when a quote character is seen for which
      * {@code threeQuotes(quoteChar)} returns true. A backslash introduces a literal
      * escape, which is decoded.
      *
      * @param quoteChar the quote character in use
      * @param endNL     when true, end of file ends the string normally instead of being fatal
      * @return the string contents with escapes decoded
      */
     private String readLongString(int quoteChar, boolean endNL) {
         stringBuilder.setLength(0);
         while ( true ) {
             int c = reader.readChar();
             if ( c == EOF ) {
                 if ( endNL )
                     return stringBuilder.toString();
                 fatal("Broken long string");
             }

             if ( c == quoteChar && threeQuotes(quoteChar) )
                 return stringBuilder.toString();

             int codepoint = (c == CH_RSLASH) ? readLiteralEscape() : c;
             insertCodepoint(stringBuilder, codepoint);
         }
     }

     /** Read a word that may not start with a sign: {@code readWordSub} with no leading sign allowed. */
     private String readWord(boolean leadingDigitAllowed) {
         return readWordSub(leadingDigitAllowed, false);
     }

     // A 'word' is used in several places:
     //   keyword
     //   prefix part of prefix name
     //   local part of prefix name (allows digits)

     // Characters allowed in a word in addition to alphanumerics.
     // Declared final: this is a fixed lookup table and the reference is never meant to be reassigned.
     private static final char[] extraCharsWord = new char[] {'_', '.' , '-'};

     /**
      * Read a word: alphanumerics plus the characters in {@code extraCharsWord}.
      * Trailing dots are not part of the word.
      */
     private String readWordSub(boolean leadingDigitAllowed, boolean leadingSignAllowed) {
         final boolean allowFinalDot = false;
         return readCharsAnd(leadingDigitAllowed, leadingSignAllowed, extraCharsWord, allowFinalDot);
     }

     // This array adds the other characters that can occur in an internal variable name.
     // Variables can be created with SPARQL-illegal syntax to ensure they do not clash with
     // variables in the query from the application.
     // See ARQConstants.
     //   allocVarAnonMarker, allocVarMarker, globalVar, allocVarBNodeToVar, allocVarScopeHiding
     // but this set is wider and matches anywhere in the name after the first '?'.
     // Declared final: this is a fixed lookup table and the reference is never meant to be reassigned.
     private static final char[] extraCharsVar = new char[]{'_', '.', '-', '?', '@', '+', '/', '~'};

     /**
      * Read a variable name (the leading '?' has already been consumed by the caller).
      * Leading digits and signs are allowed, as is a trailing dot; the characters in
      * {@code extraCharsVar} are accepted in addition to alphanumerics.
      */
     private String readVarName() {
         final boolean leadingDigitAllowed = true;
         final boolean leadingSignAllowed = true;
         final boolean allowFinalDot = true;
         return readCharsAnd(leadingDigitAllowed, leadingSignAllowed, extraCharsVar, allowFinalDot);
     }

     // See also readBlankNodeLabel

     /**
      * Read a run of alphanumeric characters plus any of {@code extraChars}.
      *
      * @param leadingDigitAllowed when false, a digit as the first character yields ""
      * @param leadingSignAllowed  when false, '-' or '+' as the first character yields ""
      * @param extraChars          additional characters accepted anywhere in the run
      * @param allowFinalDot       when false, trailing dots are removed from the result
      *                            and pushed back to the reader
      * @return the characters read; possibly ""
      */
     private String readCharsAnd(boolean leadingDigitAllowed, boolean leadingSignAllowed, char[] extraChars, boolean allowFinalDot) {
         stringBuilder.setLength(0);

         if ( !leadingDigitAllowed && Character.isDigit(reader.peekChar()) )
             return "";

         // Used for local part of prefix names =>
         if ( !leadingSignAllowed ) {
             int lead = reader.peekChar();
             if ( lead == '-' || lead == '+' )
                 return "";
         }

         // Consume while the next character is acceptable.
         int c = reader.peekChar();
         while ( isAlphaNumeric(c) || Chars.charInArray(c, extraChars) ) {
             reader.readChar();
             stringBuilder.append((char)c);
             c = reader.peekChar();
         }

         if ( !allowFinalDot ) {
             // BAD : assumes pushbackChar is infinite.
             // Strip every trailing "." and hand it back to the reader.
             int len = stringBuilder.length();
             while ( len > 0 && stringBuilder.charAt(len - 1) == CH_DOT ) {
                 reader.pushbackChar(CH_DOT);
                 len--;
                 stringBuilder.setLength(len);
             }
         }
         return stringBuilder.toString();
     }

     // BLANK_NODE_LABEL    ::=     '_:' (PN_CHARS_U | [0-9]) ((PN_CHARS | '.')* PN_CHARS)?

     /**
      * Read a blank node label. Uses (and resets) the shared {@code stringBuilder}.
      * <p>
      * The first character must satisfy {@code RiotChars.isPNChars_U_N}. The remainder is
      * any run of {@code RiotChars.isPNChars} characters and dots, except that the label
      * never ends in a dot: every trailing dot is pushed back onto the reader.
      *
      * @return the label text read
      * @throws RiotParseException (via {@code fatal}) if the label is missing or starts
      *         with an illegal character
      */
     private String readBlankNodeLabel() {
         stringBuilder.setLength(0);

         // First character.
         int first = reader.peekChar();
         if ( first == EOF )
             fatal("Blank node label missing (EOF found)");
         if ( isWhitespace(first) )
             fatal("Blank node label missing");
         // Not strict: digits are accepted as the first character as well.
         if ( !RiotChars.isPNChars_U_N(first) )
             // The character is a format argument, not part of the format string:
             // concatenating it makes String.format fail when it is '%'.
             fatal("Blank node label does not start with alphabetic or _ :%c", first);
         reader.readChar();
         stringBuilder.append((char)first);

         // Remainder. DOT can't be last, so dots are held back until a later
         // non-dot label character shows they are interior to the label.
         int pendingDots = 0;

         for (;;) {
             int ch = reader.peekChar();
             if ( ch == EOF )
                 break;
             if ( !(RiotChars.isPNChars(ch) || ch == CH_DOT) )
                 break;
             reader.readChar();

             if ( ch == CH_DOT ) {
                 pendingDots++;
                 continue;
             }

             // A label character: the held-back dots are interior after all.
             for ( ; pendingDots > 0 ; pendingDots-- )
                 stringBuilder.append(CH_DOT);
             stringBuilder.append((char)ch);
         }

         // Unread all trailing dots (not just the last one) so the label cannot end in '.'.
         for ( ; pendingDots > 0 ; pendingDots-- )
             reader.pushbackChar(CH_DOT);

         return stringBuilder.toString();
     }

     /*
      * [146]  INTEGER  ::=  [0-9]+
      * [147]  DECIMAL  ::=  [0-9]* '.' [0-9]+
      * [148]  DOUBLE  ::=  [0-9]+ '.' [0-9]* EXPONENT | '.' ([0-9])+ EXPONENT | ([0-9])+ EXPONENT
      * []     hex      ::=  '0' [xX] followed by one or more hex digits (as accepted by isHexChar)
      */
     /**
      * Read a numeric token from the reader and store its image and type in {@code token}.
      * Uses (and resets) the shared {@code stringBuilder}.
      * <p>
      * The resulting token type is one of:
      * <ul>
      * <li>{@code HEX} - an unsigned leading '0' followed by 'x' or 'X' and hex digits;</li>
      * <li>{@code DOUBLE} - an exponent part was read;</li>
      * <li>{@code DECIMAL} - a dot was read and it is not the last character;</li>
      * <li>{@code INTEGER} - otherwise.</li>
      * </ul>
      * A dot that ends up as the last character is not part of the number: it is removed
      * from the image and pushed back onto the reader.
      *
      * @throws RiotParseException (via {@code fatal}) if there are neither digits before a
      *         dot nor a dot, or if a hex or exponent part has no digits
      */
     private void readNumber() {
         // One entry, definitely a number.
         // Beware of '.' as a (non) decimal.
         /*
         maybeSign()
         digits()
         if dot ==> decimal, digits
         if e   ==> double, maybeSign, digits
         else
             check not "." for decimal.
         */
         boolean isDouble = false;
         boolean isDecimal = false;
         stringBuilder.setLength(0);

         /*
         readPossibleSign(stringBuilder);
         readDigits may be hex
         readDot
         readDigits
         readExponent.
         */

         int x = 0; // Digits before a dot.
         int ch = reader.peekChar();
         if ( ch == '0' ) {
             // A leading zero counts as a digit and may introduce a hex number.
             x++;
             reader.readChar();
             stringBuilder.append((char)ch);
             ch = reader.peekChar();
             if ( ch == 'x' || ch == 'X' ) {
                 // Hex: consume the marker and the hex digits, then finish immediately.
                 reader.readChar();
                 stringBuilder.append((char)ch);
                 readHex(reader, stringBuilder);
                 token.setImage(stringBuilder.toString());
                 token.setType(TokenType.HEX);
                 return;
             }
         } else if ( ch == '-' || ch == '+' ) {
             // Signed number; hex is not considered on this path.
             readPossibleSign(stringBuilder);
         }

         // Integer part (any digits after a leading zero are included here).
         x += readDigits(stringBuilder);
 //        if ( x == 0 ) {}
         ch = reader.peekChar();
         if ( ch == CH_DOT ) {
             // Fractional part; the dot is tentatively taken as part of the number.
             reader.readChar();
             stringBuilder.append(CH_DOT);
             isDecimal = true; // Includes things that will be doubles.
             readDigits(stringBuilder);
         }

         if ( x == 0 && !isDecimal )
             // Possible a tokenizer error - should not have entered readNumber
             // in the first place.
             fatal("Unrecognized as number");

         // An exponent turns the number into a double, whether or not a dot was seen.
         if ( exponent(stringBuilder) ) {
             isDouble = true;
             isDecimal = false;
         }

         // Final part - "decimal" 123. is an integer 123 and a DOT.
         if ( isDecimal ) {
             int len = stringBuilder.length();
             if ( stringBuilder.charAt(len - 1) == CH_DOT ) {
                 // Give the dot back to the reader so it is tokenized separately.
                 stringBuilder.setLength(len - 1);
                 reader.pushbackChar(CH_DOT);
                 isDecimal = false;
             }
         }

         token.setImage(stringBuilder.toString());
         if ( isDouble )
             token.setType(TokenType.DOUBLE);
         else if ( isDecimal )
             token.setType(TokenType.DECIMAL);
         else
             token.setType(TokenType.INTEGER);
     }

     /**
      * Read hex digits from {@code reader} and append them to {@code sb}.
      * Called just after the "0x" prefix, which is already in {@code sb}.
      *
      * @param reader the source of characters
      * @param sb the buffer to append the hex digits to
      * @throws RiotParseException (via {@code fatal}) if no hex digit follows
      */
     private void readHex(PeekReader reader, StringBuilder sb) {
         int digits = 0;
         int ch = reader.peekChar();
         while ( isHexChar(ch) ) {
             reader.readChar();
             sb.append((char)ch);
             digits++;
             ch = reader.peekChar();
         }
         if ( digits == 0 )
             fatal("No hex characters after " + sb.toString());
     }

     /**
      * Read a (possibly empty) run of ASCII digits '0'-'9' and append it to {@code buffer}.
      *
      * @param buffer the buffer to append the digits to
      * @return the number of digits read
      */
     private int readDigits(StringBuilder buffer) {
         final int lengthBefore = buffer.length();
         for ( int ch = reader.peekChar() ; range(ch, '0', '9') ; ch = reader.peekChar() ) {
             reader.readChar();
             buffer.append((char)ch);
         }
         // Exactly one character is appended per digit consumed.
         return buffer.length() - lengthBefore;
     }

     /**
      * If the next character is '+' or '-', consume it and append it to {@code sb};
      * otherwise leave both the reader and the buffer untouched.
      *
      * @param sb the buffer to append the sign to
      */
     private void readPossibleSign(StringBuilder sb) {
         final int ch = reader.peekChar();
         if ( ch != '-' && ch != '+' )
             return;
         reader.readChar();
         sb.append((char)ch);
     }

     // Assume have read the first quote char.
     // On return:
     //   If false, have moved over no more characters (due to pushbacks)
     //   If true, at end of 3 quotes
     /**
      * Test whether the next two characters equal {@code ch}.
      *
      * @param ch the quote character
      * @return true if both were found (both are consumed); false otherwise, in which
      *         case the reader is left where it was on entry
      */
     private boolean threeQuotes(int ch) {
         // Is there a second quote?
         if ( reader.peekChar() != ch )
             return false;
         reader.readChar();

         // Is there a third quote?
         if ( reader.peekChar() == ch ) {
             reader.readChar();
             return true;
         }

         // Only two in a row: put the second quote back.
         reader.pushbackChar(ch);
         return false;
     }

     /**
      * Read an optional exponent part: 'e' or 'E', an optional sign, then digits.
      *
      * @param sb the buffer to append the exponent text to
      * @return false if the next character is not 'e' or 'E' (nothing is consumed);
      *         true if an exponent was read
      * @throws RiotParseException (via {@code fatal}) if the exponent has no digits
      */
     private boolean exponent(StringBuilder sb) {
         final int marker = reader.peekChar();
         final boolean startsExponent = (marker == 'e' || marker == 'E');
         if ( !startsExponent )
             return false;

         reader.readChar();
         sb.append((char)marker);
         readPossibleSign(sb);
         if ( readDigits(sb) == 0 )
             fatal("Malformed double: " + sb);
         return true;
     }

     /**
      * Read a language tag: a non-empty run accepted by {@code a2z}, followed by any
      * number of '-' separated, non-empty runs accepted by {@code a2zN}.
      * Uses (and resets) the shared {@code stringBuilder}.
      *
      * @return the language tag, interned
      * @throws RiotParseException (via {@code fatal}) if the first part or any subtag is empty
      */
     private String langTag() {
         stringBuilder.setLength(0);
         a2z(stringBuilder);
         if ( stringBuilder.length() == 0 )
             fatal("Bad language tag");

         // Each '-' must be followed by at least one subtag character.
         while ( reader.peekChar() == '-' ) {
             reader.readChar();
             stringBuilder.append('-');
             final int lengthBeforeSubtag = stringBuilder.length();
             a2zN(stringBuilder);
             if ( stringBuilder.length() == lengthBeforeSubtag )
                 fatal("Bad language tag");
         }
         return stringBuilder.toString().intern();
     }

     // ASCII-only e.g. in lang tags.
     /**
      * Read a (possibly empty) run of characters accepted by {@code isA2Z} and append
      * it to the given buffer.
      *
      * @param sb2 the buffer to append to
      */
     private void a2z(StringBuilder sb2) {
         for (;;) {
             int ch = reader.peekChar();
             if ( !isA2Z(ch) )
                 return;
             reader.readChar();
             // Append to the buffer the caller passed in, not to the shared field.
             sb2.append((char)ch);
         }
     }

     /**
      * Read a (possibly empty) run of characters accepted by {@code isA2ZN} and append
      * it to the given buffer.
      *
      * @param sb2 the buffer to append to
      */
     private void a2zN(StringBuilder sb2) {
         for (;;) {
             int ch = reader.peekChar();
             if ( !isA2ZN(ch) )
                 return;
             reader.readChar();
             // Append to the buffer the caller passed in, not to the shared field.
             sb2.append((char)ch);
         }
     }

     /**
      * Append a codepoint to {@code buffer}, as one char when {@code Character.charCount}
      * reports 1 and otherwise as the UTF-16 chars from {@code Character.toChars}.
      * Note that the rest of any system this is used in must also respect codepoints
      * and surrogate pairs.
      *
      * @param buffer the buffer to append to
      * @param ch the codepoint
      */
     private void insertCodepoint(StringBuilder buffer, int ch) {
         if ( Character.charCount(ch) == 1 ) {
             buffer.append((char)ch);
             return;
         }

         // Two-char case: convert to UTF-16.
         final boolean acceptable = Character.isDefined(ch) || Character.isSupplementaryCodePoint(ch);
         if ( !acceptable )
             fatal("Illegal codepoint: 0x%04X", ch);
         buffer.append(Character.toChars(ch));
     }

     /** @return the column number currently reported by the underlying reader */
     @Override
     public long getColumn() {
         final long column = reader.getColNum();
         return column;
     }

     /** @return the line number currently reported by the underlying reader */
     @Override
     public long getLine() {
         final long line = reader.getLineNum();
         return line;
     }

     // ---- Routines to check tokens

     /** Pass a blank node label to the token checker; does nothing when {@code checker} is null. */
     private void checkBlankNode(String blankNodeLabel) {
         if ( checker == null )
             return;
         checker.checkBlankNode(blankNodeLabel);
     }

     /** Pass a language-tagged literal to the token checker; does nothing when {@code checker} is null. */
     private void checkLiteralLang(String lexicalForm, String langTag) {
         if ( checker == null )
             return;
         checker.checkLiteralLang(lexicalForm, langTag);
     }

     /** Pass a datatyped literal to the token checker; does nothing when {@code checker} is null. */
     private void checkLiteralDT(String lexicalForm, Token datatype) {
         if ( checker == null )
             return;
         checker.checkLiteralDT(lexicalForm, datatype);
     }

     /** Pass a string to the token checker; does nothing when {@code checker} is null. */
     private void checkString(String string) {
         if ( checker == null )
             return;
         checker.checkString(string);
     }

     /** Pass a URI string to the token checker; does nothing when {@code checker} is null. */
     private void checkURI(String uriStr) {
         if ( checker == null )
             return;
         checker.checkURI(uriStr);
     }

     /** Pass a number image and datatype to the token checker; does nothing when {@code checker} is null. */
     private void checkNumber(String image, String datatype) {
         if ( checker == null )
             return;
         checker.checkNumber(image, datatype);
     }

     /** Pass a variable token image to the token checker; does nothing when {@code checker} is null. */
     private void checkVariable(String tokenImage) {
         if ( checker == null )
             return;
         checker.checkVariable(tokenImage);
     }

     /** Pass a directive code to the token checker; does nothing when {@code checker} is null. */
     private void checkDirective(int cntrlCode) {
         if ( checker == null )
             return;
         checker.checkDirective(cntrlCode);
     }

     /** Pass a keyword token image to the token checker; does nothing when {@code checker} is null. */
     private void checkKeyword(String tokenImage) {
         if ( checker == null )
             return;
         checker.checkKeyword(tokenImage);
     }

     /** Pass the two images of a prefixed name to the token checker; does nothing when {@code checker} is null. */
     private void checkPrefixedName(String tokenImage, String tokenImage2) {
         if ( checker == null )
             return;
         checker.checkPrefixedName(tokenImage, tokenImage2);
     }

     /** Pass a control code to the token checker; does nothing when {@code checker} is null. */
     private void checkControl(int code) {
         if ( checker == null )
             return;
         checker.checkControl(code);
     }

     // ---- Escape sequences

     /**
      * Read the remainder of an escape sequence in a literal; the backslash has already
      * been consumed by the caller.
      *
      * @return the character or codepoint the escape stands for
      * @throws RiotParseException (via {@code fatal}) at end of input or for an unknown escape
      */
     private final int readLiteralEscape() {
         final int c = reader.readChar();
         if ( c == EOF )
             fatal("Escape sequence not completed");

         final int result;
         switch (c) {
             case 'n':   result = NL; break;
             case 'r':   result = CR; break;
             case 't':   result = '\t'; break;
             case 'f':   result = '\f'; break;
             case 'b':   result = BSPACE; break;
             // These three escape to themselves.
             case '"':
             case '\'':
             case '\\':
                 result = c;
                 break;
             case 'u':   result = readUnicode4Escape(); break;
             case 'U':   result = readUnicode8Escape(); break;
             default:
                 fatal("Illegal escape sequence value: %c (0x%02X)", c, c);
                 result = 0;
                 break;
         }
         return result;
     }

     /**
      * Read the character following a backslash in a character escape and check that it
      * is one of the escapable characters:
      * <pre>
      * PN_LOCAL_ESC ::= '\' ( '_' | '~' | '.' | '-' | '!' | '$' | '&amp;' | "'"
      *                  | '(' | ')' | '*' | '+' | ',' | ';' | '=' | '/' | '?' | '#' | '@' | '%' )
      * </pre>
      *
      * @return the escaped character itself
      * @throws RiotParseException (via {@code fatal}) at end of input or for any other character
      */
     private final int readCharEscape() {
         final int c = reader.readChar();
         if ( c == EOF )
             fatal("Escape sequence not completed");

         // The complete set of characters that may follow the backslash.
         final String escapable = "_~.-!$&'()*+,;=/?#@%";
         if ( escapable.indexOf(c) >= 0 )
             return c;

         fatal("illegal character escape value: \\%c", c);
         return 0;
     }

     /** Read the four hex digits of a backslash-u escape and return their value. */
     private final int readUnicode4Escape() {
         final int value = readHexSequence(4);
         return value;
     }

     /**
      * Read the eight hex digits of a backslash-U escape and return their value.
      *
      * @return a code point in the range 0 to {@code Character.MAX_CODE_POINT}
      * @throws RiotParseException (via {@code fatal}) if the value is outside that range
      */
     private final int readUnicode8Escape() {
         int ch8 = readHexSequence(8);
         // Eight hex digits fill all 32 bits of an int, so values from 0x80000000
         // upwards come back negative; they are out of range as well.
         if ( ch8 < 0 || ch8 > Character.MAX_CODE_POINT )
             fatal("Illegal code point in \\U sequence value: 0x%08X", ch8);
         return ch8;
     }

     /**
      * Read {@code N} hex digits and combine them, most significant first, into an int.
      *
      * @param N the number of hex digits to read
      * @return the accumulated value, or -1 if {@code readHexChar} returns a negative value
      */
     private final int readHexSequence(int N) {
         int value = 0;
         int remaining = N;
         while ( remaining-- > 0 ) {
             final int digit = readHexChar();
             if ( digit < 0 )
                 return -1;
             // Shift in the next 4 bits.
             value = (value << 4) + digit;
         }
         return value;
     }

     /**
      * Read one character and return its value as a hex digit.
      *
      * @return the value given by {@code valHexChar} for the character read
      * @throws RiotParseException (via {@code fatal}) at end of input or if the character
      *         is not a hex digit
      */
     private final int readHexChar() {
         int ch = reader.readChar();
         if ( ch == EOF )
             fatal("Not a hexadecimal character (end of file)");

         int x = valHexChar(ch);
         if ( x != -1 )
             return x;
         // The character is a format argument, not part of the format string:
         // concatenating it makes String.format fail when it is '%'.
         fatal("Not a hexadecimal character: %c", ch);
         return -1;
     }

     /**
      * Consume the characters of {@code str} from the reader, one at a time.
      *
      * @param str the exact text expected next in the input
      * @return true when all of {@code str} has been read
      * @throws RiotParseException (via {@code fatal}) at end of input or at the first
      *         character that does not match; characters matched before that point
      *         have already been consumed
      */
     private boolean expect(String str) {
         for (int i = 0; i < str.length(); i++) {
             char want = str.charAt(i);
             if ( reader.eof() ) {
                 // str is passed as a format argument so that a '%' in it is shown
                 // literally rather than being parsed as a format specifier.
                 fatal("End of input during expected string: %s", str);
                 return false;
             }
             int inChar = reader.peekChar();
             if ( inChar != want ) {
                 fatal("expected \"%s\"", str);
                 return false;
             }
             reader.readChar();
         }
         return true;
     }

     /**
      * Warning - can continue. The message is formatted with {@code String.format} and
      * sent to the error handler with the reader's current line and column.
      *
      * @param message a format string
      * @param args the arguments for the format string
      */
     private void warning(String message, Object... args) {
         errorHandler.warning(String.format(message, args), reader.getLineNum(), reader.getColNum());
     }

     /**
      * Error - at the tokenizer level, it can continue (with some junk) but it is a
      * serious error and the caller probably should treat as an error and stop.
      * The message is formatted with {@code String.format} and sent to the error handler
      * with the reader's current line and column.
      *
      * @param message a format string
      * @param args the arguments for the format string
      */
     private void error(String message, Object... args) {
         errorHandler.error(String.format(message, args), reader.getLineNum(), reader.getColNum());
     }

     /**
      * Structural error - unrecoverable - but reported as ERROR (FATAL can imply system fault).
      * The formatted message is sent to the error handler with the reader's current line
      * and column; this method then always throws.
      *
      * @param message a format string
      * @param args the arguments for the format string
      * @throws RiotParseException always, unless the error handler throws first
      */
     private void fatal(String message, Object... args) {
         final String msg = String.format(message, args);
         final long line = reader.getLineNum();
         final long col = reader.getColNum();
         errorHandler.fatal(msg, line, col);
         // We require that errors cause the tokenizer to stop so in case the
         // provided error handler does not, we throw an exception.
         // Carry the formatted text, not the raw format template.
         throw new RiotParseException(msg, line, col);
     }
 }