blob: 1cff2d02e774b8b9c1a228e7c36e9f1ea27f0bd6 [file] [log] [blame]
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */
package org.apache.olingo.odata2.core.uri.expression;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.olingo.odata2.api.edm.EdmLiteral;
import org.apache.olingo.odata2.api.edm.EdmLiteralException;
import org.apache.olingo.odata2.api.edm.EdmSimpleTypeFacade;
import org.apache.olingo.odata2.api.edm.EdmSimpleTypeKind;
import org.apache.olingo.odata2.api.uri.expression.ExpressionParserException;
import org.apache.olingo.odata2.core.edm.EdmSimpleTypeFacadeImpl;
* Expression tokenizer
public class Tokenizer {
private static final Pattern OTHER_LIT = Pattern.compile("(?:\\p{L}|\\p{Digit}|[-._~%!$&*+;:@])+");
private static final Pattern FUNK =
.compile("^(startswith|endswith|substring|substring|substringof|indexof|replace|tolower|toupper" +
"|trim|concat|length|year|mounth|day|hour|minute|second|round|ceiling|floor)( *)\\(");
private static final Pattern AND_SUB1 = Pattern.compile("^(add|sub|mul|div|mod|not) ");
private static final Pattern AND_SUB = Pattern.compile("^(and|or|eq|ne|lt|gt|le|ge) ");
private static final Pattern prefix = Pattern.compile("^(X|binary|guid|datetime|datetimeoffset|time)'");
private boolean flagIncludeWhitespace = false;
private EdmSimpleTypeFacade typeDectector = null;
int curPosition;
final String expression;
final int expressionLength;
TokenList tokens;
public Tokenizer(final String expression) {
typeDectector = new EdmSimpleTypeFacadeImpl();
this.expression = expression;
expressionLength = expression.length();
tokens = new TokenList();
* Inform the Tokenizer whether extra tokens for whitespace characters should be added to the token list or not.
* @param flagIncludeWhitespace True -> Whitespace token will be added to token list; False otherwise
* @return this
public Tokenizer setFlagWhiteSpace(final Boolean flagIncludeWhitespace) {
this.flagIncludeWhitespace = flagIncludeWhitespace;
return this;
* Tokenizes an expression as defined per OData specification
* @return Token list
public TokenList tokenize() throws TokenizerException, ExpressionParserException {
curPosition = 0;
int oldPosition;
char curCharacter;
String token = "";
while (curPosition < expressionLength) {
oldPosition = curPosition;
curCharacter = expression.charAt(curPosition);
switch (curCharacter) {
case ' ':
// count whitespace and move pointer to next non-whitespace char
eatWhiteSpaces(curPosition, curCharacter);
case '(':
tokens.appendToken(curPosition, TokenKind.OPENPAREN, curCharacter);
curPosition = curPosition + 1;
case ')':
tokens.appendToken(curPosition, TokenKind.CLOSEPAREN, curCharacter);
curPosition = curPosition + 1;
case '\'':
token = "";
case ',':
tokens.appendToken(oldPosition, TokenKind.COMMA, curCharacter);
curPosition = curPosition + 1;
case '=':
case '/':
case '?':
case '.':
case '*':
curPosition = curPosition + 1;
tokens.appendToken(oldPosition, TokenKind.SYMBOL, curCharacter);
String rem_expr = expression.substring(curPosition); // remaining expression
boolean isBinary = checkForBinary(oldPosition, rem_expr);
if (isBinary) {
// check for prefixes like X, binary, guid, datetime
boolean isPrefix = checkForPrefix(rem_expr);
if (isPrefix) {
// check for math
boolean isMath = checkForMath(oldPosition, rem_expr);
if (isMath) {
// check for function
boolean isFunction = checkForMethod(oldPosition, rem_expr);
if (isFunction) {
boolean isBoolean = checkForBoolean(oldPosition, rem_expr);
if (isBoolean) {
boolean isLiteral = checkForLiteral(oldPosition, curCharacter, rem_expr);
if (isLiteral) {
token = new Character(curCharacter).toString();
throw TokenizerException.createUNKNOWN_CHARACTER(oldPosition, token, expression);
return tokens;
private boolean checkForLiteral(final int oldPosition, final char curCharacter, final String rem_expr) {
final Matcher matcher = OTHER_LIT.matcher(rem_expr);
boolean isLiteral = false;
if (matcher.lookingAt()) {
String token =;
try {
EdmLiteral edmLiteral = typeDectector.parseUriLiteral(token);
curPosition = curPosition + token.length();
// It is a simple type.
tokens.appendEdmTypedToken(oldPosition, TokenKind.SIMPLE_TYPE, token, edmLiteral);
isLiteral = true;
} catch (EdmLiteralException e) {
// We treat it as normal untyped literal.
// The '-' is checked here (and not in the switch statement) because it may be
// part of a negative number.
if (curCharacter == '-') {
curPosition = curPosition + 1;
tokens.appendToken(oldPosition, TokenKind.SYMBOL, curCharacter);
isLiteral = true;
} else {
curPosition = curPosition + token.length();
tokens.appendToken(oldPosition, TokenKind.LITERAL, token);
isLiteral = true;
return isLiteral;
private boolean checkForBoolean(final int oldPosition, final String rem_expr) {
boolean isBoolean = false;
if ("true".equals(rem_expr) || "false".equals(rem_expr)) {
curPosition = curPosition + rem_expr.length();
tokens.appendEdmTypedToken(oldPosition, TokenKind.SIMPLE_TYPE, rem_expr, new EdmLiteral(EdmSimpleTypeFacadeImpl
.getEdmSimpleType(EdmSimpleTypeKind.Boolean), rem_expr));
isBoolean = true;
return isBoolean;
private void eatWhiteSpaces(final int oldPosition, char curCharacter) {
int lv_token_len;
String expression_sub;
while ((curCharacter == ' ') && (curPosition < expressionLength)) {
curPosition = curPosition + 1;
if (curPosition < expressionLength) {
curCharacter = expression.charAt(curPosition);
lv_token_len = curPosition - oldPosition;
if (flagIncludeWhitespace == true) {
expression_sub = expression.substring(oldPosition, oldPosition + lv_token_len);
tokens.appendEdmTypedToken(oldPosition, TokenKind.WHITESPACE, expression_sub, null);
private boolean checkForMethod(final int oldPosition, final String rem_expr) {
boolean isMethod = false;
Matcher matcher = FUNK.matcher(rem_expr);
if (matcher.find()) {
String token =;
curPosition = curPosition + token.length();
tokens.appendToken(oldPosition, TokenKind.LITERAL, token);
isMethod = true;
return isMethod;
private boolean checkForMath(final int oldPosition, final String rem_expr) {
boolean isMath = false;
Matcher matcher1 = AND_SUB1.matcher(rem_expr);
if (matcher1.find()) {
String token =;
curPosition = curPosition + token.length();
tokens.appendToken(oldPosition, TokenKind.LITERAL, token);
isMath = true;
return isMath;
private boolean checkForBinary(final int oldPosition, final String rem_expr) {
boolean isBinary = false;
Matcher matcher1 = AND_SUB.matcher(rem_expr);
if (matcher1.find()) {
String token =;
curPosition = curPosition + token.length();
tokens.appendToken(oldPosition, TokenKind.LITERAL, token);
isBinary = true;
return isBinary;
private boolean checkForPrefix(final String rem_expr) throws ExpressionParserException, TokenizerException {
boolean isPrefix = false;
Matcher matcher = prefix.matcher(rem_expr);
String token = "";
char curCharacter;
if (matcher.find()) {
token =;
curPosition = curPosition + token.length();
curCharacter = expression.charAt(curPosition); // "should be '
readLiteral(curCharacter, token);
isPrefix = true;
return isPrefix;
private void readLiteral(final char curCharacter) throws ExpressionParserException, TokenizerException {
readLiteral(curCharacter, "");
* Read up to single ' and move pointer to the following char and tries a type detection
* @param curCharacter
* @param token
* @throws ExpressionParserException
* @throws TokenizerException
private void readLiteral(char curCharacter, String token) throws ExpressionParserException, TokenizerException {
int offsetPos = -token.length();
int oldPosition = curPosition;
token = token + Character.toString(curCharacter);
curPosition = curPosition + 1;
boolean wasApostroph = false; // leading ' does not count
while (curPosition < expressionLength) {
curCharacter = expression.charAt(curPosition);
if (curCharacter != '\'') {
if (wasApostroph == true) {
token = token + curCharacter;
wasApostroph = false;
} else {
if (wasApostroph) {
wasApostroph = false; // a double ' is a normal character '
} else {
wasApostroph = true;
token = token + curCharacter;
curPosition = curPosition + 1;
if (!wasApostroph) {
// Exception tested within TestPMparseFilterString
throw FilterParserExceptionImpl.createTOKEN_UNDETERMINATED_STRING(oldPosition, expression);
try {
EdmLiteral edmLiteral = typeDectector.parseUriLiteral(token);
tokens.appendEdmTypedToken(oldPosition + offsetPos, TokenKind.SIMPLE_TYPE, token, edmLiteral);
} catch (EdmLiteralException ex) {
throw TokenizerException.createTYPEDECTECTION_FAILED_ON_STRING(ex, oldPosition, token);