blob: 466fd1a24d09d97a58341269744de18874e7979d [file] [log] [blame]
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
* http://www.apache.org/licenses/LICENSE-2.0
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
package org.apache.phoenix.expression.function;
import java.util.List;
import org.apache.phoenix.expression.Determinism;
import org.apache.phoenix.expression.Expression;
import org.apache.phoenix.expression.util.regex.AbstractBasePattern;
import org.apache.phoenix.parse.FunctionParseNode;
import org.apache.phoenix.parse.FunctionParseNode.Argument;
import org.apache.phoenix.parse.FunctionParseNode.BuiltInFunction;
import org.apache.phoenix.parse.RegexpSubstrParseNode;
import org.apache.phoenix.schema.SortOrder;
import org.apache.phoenix.schema.tuple.Tuple;
import org.apache.phoenix.schema.types.PDataType;
import org.apache.phoenix.schema.types.PInteger;
import org.apache.phoenix.schema.types.PLong;
import org.apache.phoenix.schema.types.PVarchar;
* Implementation of REGEXP_SUBSTR(<source>, <pattern>, <offset>) built-in function,
* where <offset> is the offset from the start of <source>. Positive offset is treated as 1-based,
* a zero offset is treated as 0-based, and a negative offset starts from the end of the string
* working backwards. The <pattern> is the pattern we would like to search for in the <source> string.
* The function returns the first occurrence of any substring in the <source> string that matches
* the <pattern> input as a VARCHAR.
* @since 0.1
nodeClass = RegexpSubstrParseNode.class, args={
// Presumably the <offset> argument: a PLong that defaults to 1 when omitted — confirm against the full annotation.
@Argument(allowedTypes={PLong.class}, defaultValue="1")},
classType = FunctionParseNode.FunctionClassType.ABSTRACT,
derivedFunctions = {ByteBasedRegexpSubstrFunction.class, StringBasedRegexpSubstrFunction.class})
public abstract class RegexpSubstrFunction extends PrefixFunction {
// Function name reported by getName().
public static final String NAME = "REGEXP_SUBSTR";
// Pattern pre-compiled by init() when the pattern argument can be evaluated without a row;
// null otherwise, in which case evaluate() compiles the pattern on each call.
private AbstractBasePattern pattern;
// Offset pre-computed by init() when the offset argument can be evaluated without a row;
// null otherwise, in which case evaluate() decodes it on each call.
private Integer offset;
// Value returned by getMaxLength(); init() only assigns it when the source expression's
// type is fixed width, so it stays null for variable-width sources.
private Integer maxLength;
// Result type returned by getDataType() and used as the coercion target in evaluate().
private static final PDataType TYPE = PVarchar.INSTANCE;
// No-argument constructor; leaves pattern, offset and maxLength unset.
// NOTE(review): presumably used for deserialization followed by readFields() — confirm.
public RegexpSubstrFunction() { }
// Constructor taking the argument expressions (source, pattern, offset — see the child accessors).
// NOTE(review): the body is not visible in this view; presumably it forwards children to the
// superclass and calls init() — confirm.
public RegexpSubstrFunction(List<Expression> children) {
/**
 * Compiles a pattern string into the pattern object used for matching.
 * Called by init() for a constant pattern and by evaluate() when the pattern
 * has to be computed per row. Implemented by the concrete subclasses.
 *
 * @param value the pattern text
 * @return the compiled pattern
 */
protected abstract AbstractBasePattern compilePatternSpec(String value);
/**
 * Pre-computes whatever can be derived from the arguments without a row:
 * the compiled pattern, the offset, and (for fixed-width sources) the max length.
 * Anything that cannot be computed here is left null and handled in evaluate().
 */
private void init() {
ImmutableBytesWritable ptr = new ImmutableBytesWritable();
Expression patternExpr = getPatternExpression();
// A stateless, always-deterministic pattern is evaluated once here with a null tuple
// so that it does not have to be recompiled for every row.
if (patternExpr.isStateless() && patternExpr.getDeterminism() == Determinism.ALWAYS && patternExpr.evaluate(null, ptr)) {
String patternStr = (String) patternExpr.getDataType().toObject(ptr, patternExpr.getSortOrder());
if (patternStr != null) {
pattern = compilePatternSpec(patternStr);
// If the source string has a fixed width, then the max length would be the length
// of the source string minus the offset, or the absolute value of the offset if
// it's negative. Offset number is a required argument. However, if the source string
// is not fixed width, the maxLength would be null.
Expression offsetExpr = getOffsetExpression();
// Same constant-folding condition as for the pattern above.
if (offsetExpr.isStateless() && offsetExpr.getDeterminism() == Determinism.ALWAYS && offsetExpr.evaluate(null, ptr)) {
// Convert from the offset expression's own type to an Integer.
offset = (Integer)PInteger.INSTANCE.toObject(ptr, offsetExpr.getDataType(), offsetExpr.getSortOrder());
if (offset != null) {
PDataType type = getSourceStrExpression().getDataType();
if (type.isFixedWidth()) {
if (offset >= 0) {
// Local variable shadows the maxLength field; the field is assigned on the next line.
// NOTE(review): unboxing throws NullPointerException if the source reports no max length — confirm
// that a fixed-width type always has one.
Integer maxLength = getSourceStrExpression().getMaxLength();
// NOTE(review): for a positive (1-based) offset this subtracts offset + 1, e.g. width 10 with
// offset 1 gives 8 although a match may span all 10 characters; "maxLength - offset + 1"
// looks like the intended value — confirm before changing.
this.maxLength = maxLength - offset - (offset == 0 ? 0 : 1);
} else {
// A negative offset counts back from the end, so at most -offset characters remain.
this.maxLength = -offset;
public boolean evaluate(Tuple tuple, ImmutableBytesWritable ptr) {
AbstractBasePattern pattern = this.pattern;
if (pattern == null) {
Expression patternExpr = getPatternExpression();
if (!patternExpr.evaluate(tuple, ptr)) {
return false;
if (ptr.getLength() == 0) {
return true;
pattern = compilePatternSpec((String) patternExpr.getDataType().toObject(ptr, patternExpr.getSortOrder()));
int offset;
if (this.offset == null) {
Expression offsetExpression = getOffsetExpression();
if (!offsetExpression.evaluate(tuple, ptr)) {
return false;
if (ptr.getLength() == 0) {
return true;
offset = offsetExpression.getDataType().getCodec().decodeInt(ptr, offsetExpression.getSortOrder());
} else {
offset = this.offset;
Expression strExpression = getSourceStrExpression();
if (!strExpression.evaluate(tuple, ptr)) {
return false;
if (ptr.get().length == 0) {
return true;
TYPE.coerceBytes(ptr, strExpression.getDataType(), strExpression.getSortOrder(), SortOrder.ASC);
// Account for 1 versus 0-based offset
offset = offset - (offset <= 0 ? 0 : 1);
pattern.substr(ptr, offset);
return true;
/**
 * Returns the max length computed by init().
 *
 * @return the max length, or null when init() did not assign one
 *         (for example when the source type is not fixed width)
 */
public Integer getMaxLength() {
return maxLength;
/**
 * Reports whether the function result preserves the ordering of its source.
 *
 * @return YES_IF_LAST when the pre-computed offset is 0 or 1 (both of which
 *         evaluate() maps to the start of the source string); NO when the
 *         offset is any other value or is not known ahead of time
 */
public OrderPreserving preservesOrder() {
if (offset != null) {
if (offset == 0 || offset == 1) {
return OrderPreserving.YES_IF_LAST;
return OrderPreserving.NO;
// Reads this expression's state from the given input.
// NOTE(review): the body is not visible in this view; presumably it delegates to the
// superclass and then calls init() to rebuild pattern/offset/maxLength — confirm.
public void readFields(DataInput input) throws IOException {
/**
 * Returns NO_TRAVERSAL when preservesOrder() reports NO, and 0 otherwise.
 * NOTE(review): 0 presumably refers to the first child (the source string) — confirm.
 */
public int getKeyFormationTraversalIndex() {
return preservesOrder() == OrderPreserving.NO ? NO_TRAVERSAL : 0;
// Returns the third child expression, used as the <offset> argument.
private Expression getOffsetExpression() {
return children.get(2);
// Returns the second child expression, used as the <pattern> argument.
private Expression getPatternExpression() {
return children.get(1);
// Returns the first child expression, used as the <source> string argument.
private Expression getSourceStrExpression() {
return children.get(0);
// Returns the result type of this function.
public PDataType getDataType() {
// Always VARCHAR since we do not know in advance how long the
// matched string will be.
return TYPE;
// Returns the function name, "REGEXP_SUBSTR".
public String getName() {
return NAME;