| /* |
| * Licensed to the Apache Software Foundation (ASF) under one |
| * or more contributor license agreements. See the NOTICE file |
| * distributed with this work for additional information |
| * regarding copyright ownership. The ASF licenses this file |
| * to you under the Apache License, Version 2.0 (the |
| * "License"); you may not use this file except in compliance |
| * with the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| |
| package org.apache.jena.atlas.json.io.parser; |
| |
| import static org.apache.jena.atlas.lib.Chars.CH_COLON; |
| import static org.apache.jena.atlas.lib.Chars.CH_COMMA; |
| import static org.apache.jena.atlas.lib.Chars.CH_DOT; |
| import static org.apache.jena.atlas.lib.Chars.CH_GT; |
| import static org.apache.jena.atlas.lib.Chars.CH_HASH; |
| import static org.apache.jena.atlas.lib.Chars.CH_LBRACE; |
| import static org.apache.jena.atlas.lib.Chars.CH_LBRACKET; |
| import static org.apache.jena.atlas.lib.Chars.CH_LPAREN; |
| import static org.apache.jena.atlas.lib.Chars.CH_LT; |
| import static org.apache.jena.atlas.lib.Chars.CH_MINUS; |
| import static org.apache.jena.atlas.lib.Chars.CH_PLUS; |
| import static org.apache.jena.atlas.lib.Chars.CH_QUOTE1; |
| import static org.apache.jena.atlas.lib.Chars.CH_QUOTE2; |
| import static org.apache.jena.atlas.lib.Chars.CH_RBRACE; |
| import static org.apache.jena.atlas.lib.Chars.CH_RBRACKET; |
| import static org.apache.jena.atlas.lib.Chars.CH_RPAREN; |
| import static org.apache.jena.atlas.lib.Chars.CH_SEMICOLON; |
| import static org.apache.jena.atlas.lib.Chars.CR; |
| import static org.apache.jena.atlas.lib.Chars.EOF; |
| import static org.apache.jena.atlas.lib.Chars.NL; |
| |
| import java.io.IOException ; |
| import java.util.NoSuchElementException ; |
| |
| import org.apache.jena.atlas.io.IO ; |
| import org.apache.jena.atlas.io.PeekReader ; |
| import org.apache.jena.atlas.json.JsonParseException ; |
| import org.apache.jena.riot.tokens.StringType; |
| import org.apache.jena.riot.tokens.Token ; |
| import org.apache.jena.riot.tokens.TokenType ; |
| import org.apache.jena.riot.tokens.Tokenizer ; |
| |
| |
| |
| /** Tokenizer for all sorts of things JSON-ish */ |
| |
| public class TokenizerJSON implements Tokenizer |
| { |
| private Token token = null ; |
| private final StringBuilder sb = new StringBuilder() ; |
| private final PeekReader reader ; |
| private boolean finished = false ; |
| |
| public TokenizerJSON(PeekReader reader) |
| { |
| this.reader = reader ; |
| } |
| |
| @Override |
| public final boolean hasNext() { |
| if ( finished ) |
| return false ; |
| if ( token != null ) |
| return true ; |
| skip() ; |
| if ( reader.eof() ) |
| return false ; |
| token = parseToken() ; |
| return token != null ; |
| } |
| |
| @Override |
| public final boolean eof() { |
| return hasNext() ; |
| } |
| |
| /** Move to next token */ |
| @Override |
| public final Token next() { |
| if ( !hasNext() ) |
| throw new NoSuchElementException() ; |
| Token t = token ; |
| token = null ; |
| return t ; |
| } |
| |
| @Override |
| public final Token peek() { |
| if ( !hasNext() ) |
| return null ; |
| return token ; |
| } |
| |
| @Override |
| public void remove() |
| { throw new UnsupportedOperationException() ; } |
| |
| // ---- Machinery |
| |
| // ""-string, ''-string, *X, |
| // various single characters . , : ; |
| // (), [], {}, <> |
| // Numbers (integer, decimal, double) |
| // Keys (restricted strings, used as keys in maps) |
| // ALPHA (ALPHA,NUMERIC,_,...) |
| |
| private Token parseToken() { |
| token = new Token(getLine(), getColumn()) ; |
| |
| int ch = reader.peekChar() ; |
| |
| // ---- String |
| // Support both "" and '' strings (only "" is legal JSON) |
| if ( ch == CH_QUOTE1 || ch == CH_QUOTE2 ) { |
| token.setType(TokenType.STRING); |
| reader.readChar() ; |
| int ch2 = reader.peekChar() ; |
| if ( ch2 == ch ) { |
| // Maybe """-strings/'''-strings |
| reader.readChar() ; // Read potential second quote. |
| int ch3 = reader.peekChar() ; |
| if ( ch3 == ch ) { |
| // """-strings/'''-strings |
| reader.readChar() ; |
| token.setImage(readLong(ch, false)) ; |
| StringType st = (ch == CH_QUOTE1) ? StringType.LONG_STRING1 : StringType.LONG_STRING2 ; |
| token.setStringType(st) ; |
| return token ; |
| } |
| // Two quotes then a non-quote. |
| // Must be '' or "" |
| token.setImage("") ; |
| } else |
| // Single quote character. |
| token.setImage(allBetween(ch, ch, true, false)) ; |
| // Single quoted string. |
| StringType st = (ch == CH_QUOTE1) ? StringType.STRING1 : StringType.STRING2 ; |
| token.setStringType(st) ; |
| return token ; |
| } |
| |
| switch (ch) { |
| // DOT can't start a decimal in JSON. Check for digit. |
| case CH_DOT: |
| // reader.readChar() ; |
| // ch = reader.peekChar() ; |
| // if ( range(ch, '0', '9') ) |
| // { |
| // // Not a DOT after all. |
| // reader.pushbackChar(CH_DOT) ; |
| // // Drop through to number code. |
| // break ; |
| // } |
| token.setType(TokenType.DOT) ; |
| return token ; |
| |
| case CH_SEMICOLON : |
| reader.readChar() ; |
| token.setType(TokenType.SEMICOLON) ; |
| return token ; |
| case CH_COMMA : |
| reader.readChar() ; |
| token.setType(TokenType.COMMA) ; |
| return token ; |
| case CH_LBRACE : |
| reader.readChar() ; |
| token.setType(TokenType.LBRACE) ; |
| return token ; |
| case CH_RBRACE : |
| reader.readChar() ; |
| token.setType(TokenType.RBRACE) ; |
| return token ; |
| case CH_LPAREN : |
| reader.readChar() ; |
| token.setType(TokenType.LPAREN) ; |
| return token ; |
| case CH_RPAREN : |
| reader.readChar() ; |
| token.setType(TokenType.RPAREN) ; |
| return token ; |
| case CH_LBRACKET : |
| reader.readChar() ; |
| token.setType(TokenType.LBRACKET) ; |
| return token ; |
| case CH_RBRACKET : |
| reader.readChar() ; |
| token.setType(TokenType.RBRACKET) ; |
| return token ; |
| |
| // Some interesting characters |
| case CH_COLON : |
| reader.readChar() ; |
| token.setType(TokenType.COLON) ; |
| return token ; |
| // case CH_UNDERSCORE: reader.readChar() ; |
| // token.setType(TokenType.UNDERSCORE) ; return token ; |
| case CH_LT : |
| reader.readChar() ; |
| token.setType(TokenType.LT) ; |
| return token ; |
| case CH_GT : |
| reader.readChar() ; |
| token.setType(TokenType.GT) ; |
| return token ; |
| // GE, LE |
| } |
| |
| if ( ch == CH_PLUS || ch == CH_MINUS || range(ch, '0', '9') ) { |
| readNumber() ; |
| return token ; |
| } |
| |
| // Plain words and prefixes. |
| // Can't start with a number due to numeric test above. |
| // Can start with a '_' (no blank node test above) |
| |
| readKeyWord(token) ; |
| return token ; |
| } |
| |
| private void skip() { |
| int ch = EOF ; |
| for ( ; ; ) { |
| if ( reader.eof() ) |
| return ; |
| |
| ch = reader.peekChar() ; |
| if ( ch == CH_HASH ) { |
| reader.readChar() ; |
| // Comment. Skip to NL |
| for ( ; ; ) { |
| ch = reader.peekChar() ; |
| if ( ch == EOF || isNewlineChar(ch) ) |
| break ; |
| reader.readChar() ; |
| } |
| } |
| |
| // Including excess newline chars from comment. |
| if ( !isWhitespace(ch) ) |
| break ; |
| reader.readChar() ; |
| } |
| } |
| |
| private void readKeyWord(Token token2) { |
| long posn = reader.getPosition() ; |
| token2.setImage(readWord(false)) ; |
| token2.setType(TokenType.KEYWORD) ; |
| int ch = reader.peekChar() ; |
| |
| // If we made no progress, nothing found, not even a keyword -- it's an |
| // error. |
| if ( posn == reader.getPosition() ) |
| exception(String.format("Unknown char: %c(%d)", ch, ch)) ; |
| } |
| |
| private String readLong(int quoteChar, boolean endNL) { |
| sb.setLength(0) ; |
| for ( ; ; ) { |
| int ch = reader.readChar() ; |
| if ( ch == EOF ) { |
| if ( endNL ) |
| return sb.toString() ; |
| exception("Broken long string") ; |
| } |
| |
| if ( ch == quoteChar ) { |
| if ( threeQuotes(quoteChar) ) |
| return sb.toString() ; |
| } |
| |
| if ( ch == '\\' ) |
| ch = readLiteralEscape() ; |
| insertLiteralChar(sb, ch) ; |
| } |
| } |
| |
| // Need "readCharOrEscape" |
| |
| // Assume have read the first quote char. |
| // On return: |
| // If false, have moved over no more characters (due to pushbacks) |
| // If true, at end of 3 quotes |
| private boolean threeQuotes(int ch) { |
| //reader.readChar() ; // Read first quote. |
| int ch2 = reader.peekChar() ; |
| if ( ch2 != ch ) { |
| // reader.pushbackChar(ch2) ; |
| return false ; |
| } |
| |
| reader.readChar() ; // Read second quote. |
| int ch3 = reader.peekChar() ; |
| if ( ch3 != ch ) { |
| // reader.pushbackChar(ch3) ; |
| reader.pushbackChar(ch2) ; |
| return false ; |
| } |
| |
| // Three quotes. |
| reader.readChar() ; // Read third quote. |
| return true ; |
| } |
| |
| // Read a "word": alphanumerics, "_", ".", "-" |
| private String readWord(boolean leadingDigitAllowed) { |
| sb.setLength(0) ; |
| int idx = 0 ; |
| if ( !leadingDigitAllowed ) { |
| int ch = reader.peekChar() ; |
| if ( Character.isDigit(ch) ) |
| return "" ; |
| } |
| |
| for ( ; ; idx++ ) { |
| int ch = reader.peekChar() ; |
| |
| if ( Character.isLetterOrDigit(ch) || ch == '_' || ch == '.' || ch == '-' ) { |
| reader.readChar() ; |
| sb.append((char)ch) ; |
| continue ; |
| } else |
| break ; |
| |
| } |
| |
| // // Trailing DOT? |
| // // BAD : assumes pushbackChar is infinite. |
| // // Check is ends in "." |
| // while ( idx > 0 && sb.charAt(idx-1) == CH_DOT ) |
| // { |
| // // Push back the dot. |
| // reader.pushbackChar(CH_DOT) ; |
| // sb.setLength(idx-1) ; |
| // idx -- ; |
| // } |
| return sb.toString() ; |
| } |
| |
| // Make better! |
| /* |
| [16] integer ::= ('-' | '+') ? [0-9]+ |
| [17] double ::= ('-' | '+') ? ( [0-9]+ '.' [0-9]* exponent | '.' ([0-9])+ exponent | ([0-9])+ exponent ) |
| 0.e0, .0e0, 0e0 |
| [18] decimal ::= ('-' | '+')? ( [0-9]+ '.' [0-9]* | '.' ([0-9])+ | ([0-9])+ ) |
| 0.0 .0 |
| [19] exponent ::= [eE] ('-' | '+')? [0-9]+ |
| [] hex ::= 0x0123456789ABCDEFG |
| |
| */ |
| private void readNumber() { |
| // One entry, definitely a number. |
| // Beware of '.' as a (non) decimal. |
| /* |
| * maybeSign() digits() if dot ==> decimal, digits if e ==> double, |
| * maybeSign, digits else check not "." for decimal. |
| */ |
| boolean isDouble = false ; |
| boolean isDecimal = false ; |
| sb.setLength(0) ; |
| |
| int x = 0 ; // Digits before a dot. |
| int ch = reader.peekChar() ; |
| if ( ch == '0' ) { |
| x++ ; |
| reader.readChar() ; |
| sb.append((char)ch) ; |
| ch = reader.peekChar() ; |
| if ( ch == 'x' || ch == 'X' ) { |
| reader.readChar() ; |
| sb.append((char)ch) ; |
| readHex(reader, sb) ; |
| token.setImage(sb.toString()) ; |
| token.setType(TokenType.HEX) ; |
| return ; |
| } |
| } else if ( ch == '-' || ch == '+' ) { |
| readPossibleSign(sb) ; |
| } |
| |
| x += readDigits(sb) ; |
| // if ( x == 0 ) |
| // { |
| // |
| // } |
| ch = reader.peekChar() ; |
| if ( ch == CH_DOT ) { |
| reader.readChar() ; |
| sb.append(CH_DOT) ; |
| isDecimal = true ; // Includes things that will be doubles. |
| readDigits(sb) ; |
| } |
| |
| if ( x == 0 && !isDecimal ) |
| // Possible a tokenizer error - should not have entered readNumber |
| // in the first place. |
| exception("Unrecognized as number") ; |
| |
| if ( exponent(sb) ) { |
| isDouble = true ; |
| isDecimal = false ; |
| |
| } |
| |
| token.setImage(sb.toString()) ; |
| if ( isDouble ) |
| token.setType(TokenType.DOUBLE) ; |
| else if ( isDecimal ) |
| token.setType(TokenType.DECIMAL) ; |
| else |
| token.setType(TokenType.INTEGER) ; |
| } |
| |
| private static void readHex(PeekReader reader, StringBuilder sb) { |
| // Just after the 0x, which are in sb |
| int x = 0 ; |
| for ( ; ; ) { |
| int ch = reader.peekChar() ; |
| |
| if ( !range(ch, '0', '9') && !range(ch, 'a', 'f') && !range(ch, 'A', 'F') ) |
| break ; |
| reader.readChar() ; |
| sb.append((char)ch) ; |
| x++ ; |
| } |
| if ( x == 0 ) |
| exception(reader, "No hex characters after " + sb.toString()) ; |
| } |
| |
| private boolean exponent(StringBuilder sb) { |
| int ch = reader.peekChar() ; |
| if ( ch != 'e' && ch != 'E' ) |
| return false ; |
| reader.readChar() ; |
| sb.append((char)ch) ; |
| readPossibleSign(sb) ; |
| int x = readDigits(sb) ; |
| if ( x == 0 ) |
| exception("Malformed double: " + sb) ; |
| return true ; |
| } |
| |
| private void readPossibleSign(StringBuilder sb) { |
| int ch = reader.peekChar() ; |
| if ( ch == '-' || ch == '+' ) { |
| reader.readChar() ; |
| sb.append((char)ch) ; |
| } |
| } |
| |
| private int readDigits(StringBuilder buffer) { |
| int count = 0 ; |
| for ( ; ; ) { |
| int ch = reader.peekChar() ; |
| if ( !range(ch, '0', '9') ) |
| break ; |
| reader.readChar() ; |
| buffer.append((char)ch) ; |
| count++ ; |
| } |
| return count ; |
| } |
| |
| private String langTag() { |
| sb.setLength(0) ; |
| a2z(sb) ; |
| if ( sb.length() == 0 ) |
| exception("Bad language tag") ; |
| for ( ; ; ) { |
| int ch = reader.peekChar() ; |
| if ( ch == '-' ) { |
| reader.readChar() ; |
| sb.append('-') ; |
| int x = sb.length() ; |
| a2zN(sb) ; |
| if ( sb.length() == x ) |
| exception("Bad language tag") ; |
| } else |
| break ; |
| } |
| return sb.toString() ; |
| } |
| |
| private void a2z(StringBuilder sb2) { |
| for ( ; ; ) { |
| int ch = reader.peekChar() ; |
| if ( isA2Z(ch) ) { |
| reader.readChar() ; |
| sb.append((char)ch) ; |
| } else |
| return ; |
| } |
| } |
| |
| private void a2zN(StringBuilder sb2) { |
| for ( ; ; ) { |
| int ch = reader.peekChar() ; |
| if ( isA2ZN(ch) ) { |
| reader.readChar() ; |
| sb.append((char)ch) ; |
| } else |
| return ; |
| } |
| } |
| |
| // Blank node label: A-Z,a-z0-9 and '-' |
| private String blankNodeLabel() { |
| sb.setLength(0) ; |
| boolean seen = false ; |
| for ( ; ; ) { |
| int ch = reader.readChar() ; |
| if ( ch == EOF ) |
| break ; |
| if ( !isA2ZN(ch) && ch != '-' ) |
| break ; |
| sb.append((char)ch) ; |
| seen = true ; |
| } |
| if ( !seen ) |
| exception("Blank node label missing") ; |
| return sb.toString() ; |
| } |
| |
| // Get characters between two markers. |
| // strEscapes may be processed |
| // endNL end of line as an ending is OK |
| private String allBetween(int startCh, int endCh, boolean strEscapes, boolean endNL) { |
| long y = getLine() ; |
| long x = getColumn() ; |
| sb.setLength(0) ; |
| |
| // Assumes first char read already. |
| // int ch0 = reader.readChar() ; |
| // if ( ch0 != startCh ) |
| // exception("Broken parser", y, x) ; |
| |
| for ( ; ; ) { |
| int ch = reader.readChar() ; |
| if ( ch == EOF ) { |
| if ( endNL ) |
| return sb.toString() ; |
| exception("Broken token: " + sb.toString(), y, x) ; |
| } |
| |
| if ( ch == '\n' ) |
| exception("Broken token (newline): " + sb.toString(), y, x) ; |
| |
| if ( ch == endCh ) { |
| // sb.append(((char)ch)) ; |
| return sb.toString() ; |
| } |
| |
| if ( ch == '\\' ) { |
| if ( strEscapes ) |
| ch = readLiteralEscape() ; |
| else { |
| ch = reader.readChar() ; |
| if ( ch == EOF ) { |
| if ( endNL ) |
| return sb.toString() ; |
| exception("Broken token: " + sb.toString(), y, x) ; |
| } |
| |
| switch (ch) { |
| case 'u' : |
| ch = readUnicode4Escape() ; |
| break ; |
| case 'U' : |
| ch = readUnicode4Escape() ; |
| break ; |
| default : |
| exception(String.format("illegal escape sequence value: %c (0x%02X)", ch, ch)) ; |
| break ; |
| } |
| } |
| } |
| insertLiteralChar(sb, ch) ; |
| } |
| } |
| |
| private void insertLiteralChar(StringBuilder buffer, int ch) { |
| if ( Character.charCount(ch) == 1 ) |
| buffer.append((char)ch) ; |
| else { |
| // Convert to UTF-16. Note that the rest of any systemn this is used |
| // in must also respect codepoints and surrogate pairs. |
| if ( !Character.isDefined(ch) && !Character.isSupplementaryCodePoint(ch) ) |
| exception(String.format("Illegal codepoint: 0x%04X", ch)) ; |
| char[] chars = Character.toChars(ch) ; |
| buffer.append(chars) ; |
| } |
| } |
| |
| @Override |
| public long getColumn() { |
| return reader.getColNum() ; |
| } |
| |
| @Override |
| public long getLine() { |
| return reader.getLineNum() ; |
| } |
| |
| // ---- Character classes |
| |
| @Override |
| public void close() { |
| try { |
| reader.close() ; |
| } |
| catch (IOException ex) { |
| IO.exception(ex) ; |
| } |
| } |
| |
| private boolean isA2Z(int ch) { |
| return range(ch, 'a', 'z') || range(ch, 'A', 'Z') ; |
| } |
| |
| private boolean isA2ZN(int ch) { |
| return range(ch, 'a', 'z') || range(ch, 'A', 'Z') || range(ch, '0', '9') ; |
| } |
| |
| private boolean isNumeric(int ch) { |
| return range(ch, '0', '9') ; |
| } |
| |
| private static boolean isWhitespace(int ch) { |
| return ch == ' ' || ch == '\t' || ch == '\r' || ch == '\n' || ch == '\f' ; |
| } |
| |
| private static boolean isNewlineChar(int ch) { |
| return ch == '\r' || ch == '\n' ; |
| } |
| |
| // ---- Escape sequences |
| |
| private final int readLiteralEscape() { |
| int c = reader.readChar() ; |
| if ( c == EOF ) |
| exception("Escape sequence not completed") ; |
| |
| switch (c) { |
| case 'n' : |
| return NL ; |
| case 'r' : |
| return CR ; |
| case 't' : |
| return '\t' ; |
| case 'b' : |
| return '\b' ; |
| case '"' : |
| return '"' ; |
| case '/' : |
| return '/' ; // JSON requires / escapes. |
| case '\'' : |
| return '\'' ; |
| case '\\' : |
| return '\\' ; |
| case 'u' : |
| return readUnicode4Escape() ; |
| case 'U' : |
| return readUnicode8Escape() ; |
| default : |
| exception(String.format("illegal escape sequence value: %c (0x%02X)", c, c)) ; |
| return 0 ; |
| } |
| } |
| |
| private final int readUnicodeEscape() { |
| int ch = reader.readChar() ; |
| if ( ch == EOF ) |
| exception("Broken escape sequence") ; |
| |
| switch (ch) { |
| case 'u' : |
| return readUnicode4Escape() ; |
| case 'U' : |
| return readUnicode8Escape() ; |
| default : |
| exception(String.format("illegal escape sequence value: %c (0x%02X)", ch, ch)) ; |
| } |
| return 0 ; |
| } |
| |
| private final int readUnicode4Escape() { |
| return readUnicodeEscape(4) ; |
| } |
| |
| private final int readUnicode8Escape() { |
| int ch8 = readUnicodeEscape(8) ; |
| if ( ch8 > Character.MAX_CODE_POINT ) |
| exception(String.format("illegal code point in \\U sequence value: 0x%08X", ch8)) ; |
| return ch8 ; |
| } |
| |
| private final int readUnicodeEscape(int N) { |
| int x = 0 ; |
| for ( int i = 0 ; i < N ; i++ ) { |
| int d = readHexChar() ; |
| if ( d < 0 ) |
| return -1 ; |
| x = (x << 4) + d ; |
| } |
| return x ; |
| } |
| |
| private final int readHexChar() { |
| int ch = reader.readChar() ; |
| if ( ch == EOF ) |
| exception("Not a hexadecimal character (end of file)") ; |
| |
| if ( range(ch, '0', '9') ) |
| return ch - '0' ; |
| if ( range(ch, 'a', 'f') ) |
| return ch - 'a' + 10 ; |
| if ( range(ch, 'A', 'F') ) |
| return ch - 'A' + 10 ; |
| |
| exception("Not a hexadecimal character: " + (char)ch) ; |
| return -1 ; |
| } |
| |
| private static boolean range(int ch, char a, char b) { |
| return (ch >= a && ch <= b) ; |
| } |
| |
| private boolean expect(String str) { |
| for ( int i = 0 ; i < str.length() ; i++ ) { |
| char want = str.charAt(i) ; |
| if ( reader.eof() ) { |
| exception("End of input during expected string: " + str) ; |
| return false ; |
| } |
| int inChar = reader.readChar() ; |
| if ( inChar != want ) { |
| exception("expected \"" + str + "\"") ; |
| return false ; |
| } |
| } |
| return true ; |
| } |
| |
| private void exception(String message) { |
| exception(message, reader.getLineNum(), reader.getColNum()) ; |
| } |
| |
| private static void exception(PeekReader reader, String message) { |
| exception(message, reader.getLineNum(), reader.getColNum()) ; |
| } |
| |
| private static void exception(String message, long line, long col) { |
| throw new JsonParseException(message, (int)line, (int)col) ; |
| } |
| } |