blob: 3cea1db7a6e172115f7aca16a64b2c1063bdb5c8 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.tinkerpop.gremlin.jsr223;
import java.util.Arrays;
import java.util.HashSet;
import java.util.Optional;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
* Processes Gremlin strings using regex to try to detect certain properties from the script without actual
* having to execute a {@code eval()} on it.
*/
public class GremlinScriptChecker {
/**
* An empty result whose properties return as empty.
*/
public static final Result EMPTY_RESULT = new Result(null, null);
/**
* At least one of these tokens should be present somewhere in the Gremlin string for {@link #parse(String)} to
* take any action at all.
*/
private static final Set<String> tokens = new HashSet<>(Arrays.asList("evaluationTimeout", "scriptEvaluationTimeout",
"ARGS_EVAL_TIMEOUT", "ARGS_SCRIPT_EVAL_TIMEOUT",
"requestId", "REQUEST_ID"));
/**
* Matches single line comments, multi-line comments and space characters.
* <pre>
* From https://regex101.com/
*
* OR: match either of the followings
* Sequence: match all of the followings in order
* / /
* Repeat
* AnyCharacterExcept\n
* zero or more times
* EndOfLine
* Sequence: match all of the followings in order
* /
* *
* Repeat
* CapturingGroup
* GroupNumber:1
* OR: match either of the followings
* AnyCharacterExcept\n
* AnyCharIn[ CarriageReturn NewLine]
* zero or more times (ungreedy)
* *
* /
* WhiteSpaceCharacter
* </pre>
*/
private static final Pattern patternClean = Pattern.compile("//.*$|/\\*(.|[\\r\\n])*?\\*/|\\s", Pattern.MULTILINE);
/**
* Regex fragment for the timeout tokens to look for. There are basically four:
* <ul>
* <li>{@code evaluationTimeout} which is a string value and thus single or double quoted</li>
* <li>{@code scriptEvaluationTimeout} which is a string value and thus single or double quoted</li>
* <li>{@code ARGS_EVAL_TIMEOUT} which is a enum type of value which can be referenced with or without a {@code Tokens} qualifier</li>
* <li>{@code ARGS_SCRIPT_EVAL_TIMEOUT} which is a enum type of value which can be referenced with or without a {@code Tokens} qualifier</li>
* </ul>
* See {@link #patternWithOptions} for explain as this regex is embedded in there.
*/
private static final String timeoutTokens = "[\"']evaluationTimeout[\"']|[\"']scriptEvaluationTimeout[\"']|(?:Tokens\\.)?ARGS_EVAL_TIMEOUT|(?:Tokens\\.)?ARGS_SCRIPT_EVAL_TIMEOUT";
/**
* Regex fragment for the timeout tokens to look for. There are basically four:
* <ul>
* <li>{@code requestId} which is a string value and thus single or double quoted</li>
* <li>{@code REQUEST_ID} which is a enum type of value which can be referenced with or without a {@code Tokens} qualifier</li>
* </ul>
* See {@link #patternWithOptions} for a full explain as this regex is embedded in there.
*/
private static final String requestIdTokens = "[\"']requestId[\"']|(?:Tokens\\.)?REQUEST_ID";
/**
* Matches {@code .with({timeout-token},{timeout})} with a matching group on the {@code timeout}.
*
* <pre>
* From https://regex101.com/
*
* \.with\((?:(?:["']evaluationTimeout["']|["']scriptEvaluationTimeout["']|(?:Tokens\.)?ARGS_EVAL_TIMEOUT|(?:Tokens\.)?ARGS_SCRIPT_EVAL_TIMEOUT),(?<to>\d*)(:?L|l)?)|(?:(?:["']requestId["']|(?:Tokens\.)?REQUEST_ID),["'](?<rid>.*?))["']\)
*
* gm
* 1st Alternative \.with\((?:(?:["']evaluationTimeout["']|["']scriptEvaluationTimeout["']|(?:Tokens\.)?ARGS_EVAL_TIMEOUT|(?:Tokens\.)?ARGS_SCRIPT_EVAL_TIMEOUT),(?<to>\d*)(:?L|l)?)
* \. matches the character . with index 4610 (2E16 or 568) literally (case sensitive)
* with matches the characters with literally (case sensitive)
* \( matches the character ( with index 4010 (2816 or 508) literally (case sensitive)
* Non-capturing group (?:(?:["']evaluationTimeout["']|["']scriptEvaluationTimeout["']|(?:Tokens\.)?ARGS_EVAL_TIMEOUT|(?:Tokens\.)?ARGS_SCRIPT_EVAL_TIMEOUT),(?<to>\d*)(:?L|l)?)
* Non-capturing group (?:["']evaluationTimeout["']|["']scriptEvaluationTimeout["']|(?:Tokens\.)?ARGS_EVAL_TIMEOUT|(?:Tokens\.)?ARGS_SCRIPT_EVAL_TIMEOUT)
* 1st Alternative ["']evaluationTimeout["']
* Match a single character present in the list below ["']
* "' matches a single character in the list "' (case sensitive)
* evaluationTimeout matches the characters evaluationTimeout literally (case sensitive)
* Match a single character present in the list below ["']
* "' matches a single character in the list "' (case sensitive)
* 2nd Alternative ["']scriptEvaluationTimeout["']
* Match a single character present in the list below ["']
* "' matches a single character in the list "' (case sensitive)
* scriptEvaluationTimeout matches the characters scriptEvaluationTimeout literally (case sensitive)
* Match a single character present in the list below ["']
* "' matches a single character in the list "' (case sensitive)
* 3rd Alternative (?:Tokens\.)?ARGS_EVAL_TIMEOUT
* Non-capturing group (?:Tokens\.)?
* ? matches the previous token between zero and one times, as many times as possible, giving back as needed (greedy)
* Tokens matches the characters Tokens literally (case sensitive)
* \. matches the character . with index 4610 (2E16 or 568) literally (case sensitive)
* ARGS_EVAL_TIMEOUT matches the characters ARGS_EVAL_TIMEOUT literally (case sensitive)
* 4th Alternative (?:Tokens\.)?ARGS_SCRIPT_EVAL_TIMEOUT
* Non-capturing group (?:Tokens\.)?
* ? matches the previous token between zero and one times, as many times as possible, giving back as needed (greedy)
* Tokens matches the characters Tokens literally (case sensitive)
* \. matches the character . with index 4610 (2E16 or 568) literally (case sensitive)
* ARGS_SCRIPT_EVAL_TIMEOUT matches the characters ARGS_SCRIPT_EVAL_TIMEOUT literally (case sensitive)
* , matches the character , with index 4410 (2C16 or 548) literally (case sensitive)
* Named Capture Group to (?<to>\d*)
* \d matches a digit (equivalent to [0-9])
* * matches the previous token between zero and unlimited times, as many times as possible, giving back as needed (greedy)
* 2nd Capturing Group (:?L|l)?
* ? matches the previous token between zero and one times, as many times as possible, giving back as needed (greedy)
* 1st Alternative :?L
* : matches the character : with index 5810 (3A16 or 728) literally (case sensitive)
* ? matches the previous token between zero and one times, as many times as possible, giving back as needed (greedy)
* L matches the character L with index 7610 (4C16 or 1148) literally (case sensitive)
* 2nd Alternative l
* l matches the character l with index 10810 (6C16 or 1548) literally (case sensitive)
* 2nd Alternative (?:(?:["']requestId["']|(?:Tokens\.)?REQUEST_ID),["'](?<rid>.*?))["']\)
* Non-capturing group (?:(?:["']requestId["']|(?:Tokens\.)?REQUEST_ID),["'](?<rid>.*?))
* Non-capturing group (?:["']requestId["']|(?:Tokens\.)?REQUEST_ID)
* 1st Alternative ["']requestId["']
* 2nd Alternative (?:Tokens\.)?REQUEST_ID
* , matches the character , with index 4410 (2C16 or 548) literally (case sensitive)
* Match a single character present in the list below ["']
* "' matches a single character in the list "' (case sensitive)
* Named Capture Group rid (?<rid>.*?)
* . matches any character (except for line terminators)
* *? matches the previous token between zero and unlimited times, as few times as possible, expanding as needed (lazy)
* Match a single character present in the list below ["']
* "' matches a single character in the list "' (case sensitive)
* \) matches the character ) with index 4110 (2916 or 518) literally (case sensitive)
* </pre>
*/
private static final Pattern patternWithOptions =
Pattern.compile("\\.with\\(((?:" + timeoutTokens + "),(?<to>\\d*)(:?L|l)?)|((?:" + requestIdTokens + "),[\"'](?<rid>.*?))[\"']\\)");
/**
* Parses a Gremlin script and extracts a {@code Result} containing properties that are relevant to the checker.
*/
public static Result parse(final String gremlin) {
if (gremlin.isEmpty()) return EMPTY_RESULT;
// do a cheap check for tokens we care about - no need to parse unless one of these tokens is present in
// the string.
if (tokens.stream().noneMatch(gremlin::contains)) return EMPTY_RESULT;
// kill out comments/whitespace. for whitespace, ignoring the need to keep string literals together as that
// isn't currently a requirement
final String cleanGremlin = patternClean.matcher(gremlin).replaceAll("");
final Matcher m = patternWithOptions.matcher(cleanGremlin);
if (!m.find()) return EMPTY_RESULT;
// arguments given to Result class as null mean they weren't assigned (or the parser didn't find them somehow - eek!)
Long timeout = null;
String requestId = null;
do {
// timeout is added up across all scripts
final String to = m.group("to");
if (to != null) {
if (null == timeout) timeout = 0L;
timeout += Long.parseLong(to);
}
// request id just uses the last one found
final String rid = m.group("rid");
if (rid != null) requestId = rid;
} while (m.find());
return new Result(timeout, requestId);
}
/**
* A result returned from a {@link #parse(String)} of a Gremlin string.
*/
public static class Result {
private final Long timeout;
private final String requestId;
private Result(final Long timeout, final String requestId) {
this.timeout = timeout;
this.requestId = requestId;
}
/**
* Gets the value of the timeouts that were set using the {@code with()} source step. If there are multiple
* commands using this step, the timeouts are summed together.
*/
public final Optional<Long> getTimeout() {
return null == timeout ? Optional.empty() : Optional.of(timeout);
}
/**
* Gets the value of the request identifier supplied using the {@code with()} source step. If there are
* multiple commands using this step, the last usage should represent the id returned here.
*/
public Optional<String> getRequestId() {
return null == requestId ? Optional.empty() : Optional.of(requestId);
}
@Override
public String toString() {
return "GremlinScriptChecker.Result{" +
"timeout=" + timeout +
", requestId='" + requestId + '\'' +
'}';
}
}
}