blob: 9e736ae57c966fc3b07cf79c61392283566a0074 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.eagle.log.entity.filter;
import org.apache.eagle.common.config.EagleConfigFactory;
import org.apache.eagle.log.entity.EntityQualifierUtils;
import org.apache.eagle.log.entity.meta.EntityDefinition;
import org.apache.eagle.log.entity.meta.Qualifier;
import org.apache.eagle.common.ByteUtil;
import org.apache.eagle.query.parser.*;
import org.apache.hadoop.hbase.filter.*;
import org.apache.hadoop.hbase.filter.CompareFilter.CompareOp;
import org.apache.hadoop.hbase.filter.FilterList.Operator;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.nio.charset.Charset;
import java.util.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * Builds HBase filters from a parsed query. The steps are:
 * <ol>
 * <li>Receive the ORExpression from eagle-antlr.</li>
 * <li>Iterate all ANDExpressions in the ORExpression:
 * <ol>
 * <li>Put each ANDExpression into a new filter list; these lists are combined with the MUST_PASS_ONE
 * option.</li>
 * <li>Iterate all AtomicExpressions in the ANDExpression:
 * <ol>
 * <li>Split the AtomicExpressions into 2 groups by looking up metadata: one group for tag filters, the
 * other for column filters.</li>
 * <li>Put the above 2 filters into a filter list with the MUST_PASS_ALL option.</li>
 * </ol>
 * </li>
 * </ol>
 * </li>
 * </ol>
 */
public class HBaseFilterBuilder {
private static final Logger LOG = LoggerFactory.getLogger(HBaseFilterBuilder.class);
/*
* syntax is @<fieldname>
*/
// private static final String fnRegex = "^@(.*)$";
private static final Pattern _fnPattern = TokenConstant.ID_PATTERN;// Pattern.compile(fnRegex);
private static final Charset _defaultCharset = Charset.forName("ISO-8859-1");
private ORExpression orExpr;
private EntityDefinition ed;
private boolean filterIfMissing;
private Charset charset = _defaultCharset;
/**
 * Returns the fields recorded for expression based filters.
 *
 * <p>The backing field has no initializer and is only assigned while an expression based filter is
 * built, so this returns {@code null} until such a filter has been created by this builder.
 *
 * <p>TODO: Verify performance impact
 *
 * @return the recorded filter fields, or {@code null} if no expression based filter was built yet
 */
public Set<String> getFilterFields() {
    return this.filterFields;
}
/**
* Just add filter fields for expression filter
*/
private Set<String> filterFields;
/**
 * Creates a builder with {@code filterIfMissing} set to {@code false}.
 *
 * @param ed entity definition used to resolve tags, qualifiers and partitions
 * @param orExpr the parsed query expression to translate into filters
 */
public HBaseFilterBuilder(EntityDefinition ed, ORExpression orExpr) {
    // Delegate to the full constructor with the default filterIfMissing flag
    this(ed, orExpr, false);
}
/**
 * Creates a builder for the given entity definition and query expression.
 *
 * @param ed entity definition used to resolve tags, qualifiers and partitions
 * @param orExpr the parsed query expression to translate into filters
 * @param filterIfMissing value handed to {@code setFilterIfMissing} on every
 *        {@code SingleColumnValueFilter} this builder creates
 */
public HBaseFilterBuilder(EntityDefinition ed, ORExpression orExpr, boolean filterIfMissing) {
    this.filterIfMissing = filterIfMissing;
    this.orExpr = orExpr;
    this.ed = ed;
}
/**
 * Replaces the charset that is set on the regex comparators created by this builder.
 *
 * @param charsetName name of the charset, resolved through {@link Charset#forName(String)}
 */
public void setCharset(String charsetName) {
    this.charset = Charset.forName(charsetName);
}
/**
 * Returns the charset currently set on the regex comparators created by this builder.
 *
 * @return the configured charset (ISO-8859-1 unless {@code setCharset} was called)
 */
public Charset getCharset() {
    return this.charset;
}
/**
 * Tells whether the entity definition regards the field as a tag.
 *
 * <p>Because we don't have metadata for tags, any non-qualifier field is regarded as a tag. So a field
 * is possibly not a real tag even when this method returns true; this happens when a user inputs a
 * wrong field name which is neither a tag nor a qualifier.
 *
 * @param field the field name to check
 * @return the result of {@code ed.isTag(field)}
 */
private boolean isTag(String field) {
    return this.ed.isTag(field);
}
/**
 * Extracts the entity attribute name from a query key (documented syntax: {@code @<fieldname>}).
 *
 * @param fieldName the key as written in the query
 * @return capture group 1 of the first match of {@code TokenConstant.ID_PATTERN} in
 *         {@code fieldName}, or {@code null} when the pattern is not found
 */
private String parseEntityAttribute(String fieldName) {
    final Matcher attribute = _fnPattern.matcher(fieldName);
    return attribute.find() ? attribute.group(1) : null;
}
/**
 * Return the partition values for each OR expression. The size of the returned list should be equal to
 * the size of the FilterList that {@link #buildFilters()} returns.
 *
 * <p>For every AND expression, one array is produced with one slot per partition field (in the order
 * of {@code ed.getPartitions()}). A slot holds the value of an {@code EQUAL} condition on that
 * partition tag, or {@code null} when the AND expression has no such condition. If a partition tag is
 * compared with {@code EQUAL} more than once, the last value wins.
 *
 * <p>TODO: For now we don't support one query to query multiple partitions. In future if partition is
 * defined for the entity, internally we need to spawn multiple queries and send one query for each
 * partition.
 *
 * @return the partition values for each OR expression, or {@code null} if the entity doesn't support
 *         partitions
 */
public List<String[]> getPartitionValues() {
    final String[] partitions = ed.getPartitions();
    if (partitions == null || partitions.length == 0) {
        return null;
    }
    final List<String[]> result = new ArrayList<String[]>();
    final Map<String, String> partitionKeyValueMap = new HashMap<String, String>();
    for (ANDExpression andExpr : orExpr.getANDExprList()) {
        partitionKeyValueMap.clear();
        for (AtomicExpression ae : andExpr.getAtomicExprList()) {
            // TODO temporarily ignore those fields which are not for attributes
            if (ae.getKeyType() != TokenType.ID) {
                continue;
            }
            final String fieldName = parseEntityAttribute(ae.getKey());
            if (fieldName == null) {
                // Report the raw key: the parsed name is null here and would only print "null"
                LOG.warn(ae.getKey() + " field does not have format @<FieldName>, ignored");
                continue;
            }
            if (ed.isPartitionTag(fieldName) && ComparisonOperator.EQUAL.equals(ae.getOp())) {
                partitionKeyValueMap.put(fieldName, ae.getValue());
            }
        }
        // One slot per partition field; slots without an EQUAL condition stay null
        final String[] values = new String[partitions.length];
        for (int i = 0; i < partitions.length; ++i) {
            values[i] = partitionKeyValueMap.get(partitions[i]);
        }
        result.add(values);
    }
    return result;
}
/**
 * Builds the HBase filter list for the whole query.
 *
 * <p>Each AND expression becomes one {@code MUST_PASS_ALL} list holding a row filter for the tag
 * conditions plus, when it is non-empty, the list of qualifier filters. The per-AND lists are combined
 * in one {@code MUST_PASS_ONE} list. Keys of type {@code ID} that do not parse as an entity attribute
 * are logged and skipped.
 *
 * @see org.apache.eagle.query.parser.TokenType
 * @return the filter list for the query
 * @throws IllegalArgumentException if an unsupported operator is used on a tag field
 */
public FilterList buildFilters() {
    // TODO: Optimize to select between row filter or column filter for better performance
    // Use row key filter priority by default
    boolean rowFilterPriority = true;
    FilterList fltList = new FilterList(Operator.MUST_PASS_ONE);
    for (ANDExpression andExpr : orExpr.getANDExprList()) {
        FilterList list = new FilterList(Operator.MUST_PASS_ALL);
        Map<String, List<String>> tagFilters = new HashMap<String, List<String>>();
        List<QualifierFilterEntity> qualifierFilters = new ArrayList<QualifierFilterEntity>();
        // TODO refactor not to use too much if/else
        for (AtomicExpression ae : andExpr.getAtomicExprList()) {
            // TODO temporarily ignore those fields which are not for attributes
            String fieldName = ae.getKey();
            if (ae.getKeyType() == TokenType.ID) {
                fieldName = parseEntityAttribute(ae.getKey());
                if (fieldName == null) {
                    // Report the raw key: fieldName was just overwritten with null
                    LOG.warn(ae.getKey() + " field does not have format @<FieldName>, ignored");
                    continue;
                }
            }
            String value = ae.getValue();
            ComparisonOperator op = ae.getOp();
            TokenType keyType = ae.getKeyType();
            TokenType valueType = ae.getValueType();
            QualifierFilterEntity entry = new QualifierFilterEntity(fieldName, value, op, keyType,
                                                                    valueType);
            // TODO Exact match, need to add escape for those special characters here, including:
            // "-", "[", "]", "/", "{", "}", "(", ")", "*", "+", "?", ".", "\\", "^", "$", "|"
            if (keyType == TokenType.ID && isTag(fieldName)) {
                if ((ComparisonOperator.EQUAL.equals(op) || ComparisonOperator.IS.equals(op))
                    && !TokenType.NULL.equals(valueType)) {
                    // Use RowFilter for equal TAG
                    List<String> tagValues = tagFilters.get(fieldName);
                    if (tagValues == null) {
                        tagValues = new ArrayList<String>();
                        tagFilters.put(fieldName, tagValues);
                    }
                    tagValues.add(value);
                } else if (rowFilterPriority && ComparisonOperator.IN.equals(op)) {
                    // Use RowFilter here by default
                    List<String> tagValues = tagFilters.get(fieldName);
                    if (tagValues == null) {
                        tagValues = new ArrayList<String>();
                        tagFilters.put(fieldName, tagValues);
                    }
                    tagValues.addAll(EntityQualifierUtils.parseList(value));
                } else if (ComparisonOperator.LIKE.equals(op) || ComparisonOperator.NOT_LIKE.equals(op)
                           || ComparisonOperator.CONTAINS.equals(op)
                           || ComparisonOperator.NOT_CONTAINS.equals(op)
                           || ComparisonOperator.IN.equals(op) || ComparisonOperator.IS.equals(op)
                           || ComparisonOperator.IS_NOT.equals(op)
                           || ComparisonOperator.NOT_EQUAL.equals(op)
                           || ComparisonOperator.EQUAL.equals(op)
                           || ComparisonOperator.NOT_IN.equals(op)) {
                    // Remaining tag comparisons are evaluated on the column value instead of the row key
                    qualifierFilters.add(entry);
                } else {
                    LOG.warn("Don't support operation: \"" + op + "\" on tag field: " + fieldName
                             + " yet, going to ignore");
                    throw new IllegalArgumentException("Don't support operation: " + op
                                                       + " on tag field: " + fieldName
                                                       + ", avaliable options: =, =!, =~, !=~, in, not in, contains, not contains");
                }
            } else {
                qualifierFilters.add(entry);
            }
        }
        // Build RowFilter for equal tags
        list.addFilter(buildTagFilter(tagFilters));
        // Build SingleColumnValueFilter
        FilterList qualifierFilterList = buildQualifierFilter(qualifierFilters);
        if (qualifierFilterList != null && qualifierFilterList.getFilters().size() > 0) {
            list.addFilter(qualifierFilterList);
        } else {
            if (LOG.isDebugEnabled()) {
                LOG.debug("Ignore empty qualifier filter from " + qualifierFilters.toString());
            }
        }
        fltList.addFilter(list);
    }
    LOG.info("Query: " + orExpr.toString() + " => Filter: " + fltList.toString());
    return fltList;
}
/**
 * Builds the row filter that matches the given tag conditions against the row key.
 *
 * <p>The charset is used to decode the byte array: in the HBase server, RegexStringComparator uses the
 * same charset to decode the bytes for the tag filter regex. It's always ISO-8859-1 as the bytes only
 * come from String's hashcode (Integer). Note: regex comparison compares Strings.
 *
 * @param tagFilters tag name to accepted values, as understood by {@code buildTagFilterRegex}
 * @return a {@code RowFilter} comparing with {@code CompareOp.EQUAL} against the tag regex
 */
protected Filter buildTagFilter(Map<String, List<String>> tagFilters) {
    final String rowKeyRegex = buildTagFilterRegex(tagFilters);
    final RegexStringComparator rowKeyComparator = new RegexStringComparator(rowKeyRegex);
    rowKeyComparator.setCharset(charset);
    return new RowFilter(CompareOp.EQUAL, rowKeyComparator);
}
/**
 * Builds one {@code MUST_PASS_ALL} list in which every qualifier condition must be satisfied.
 * <H1>Use RegexStringComparator for:</H1> LIKE NOT_LIKE
 * <H1>Use SubstringComparator for:</H1> CONTAINS NOT_CONTAINS
 * <H1>Use NullComparator for:</H1> [=, !=, is, is not] NULL
 * <H1>Use BinaryComparator / TypedByteArrayComparator for:</H1> the remaining operators
 * <H1>Delegated:</H1> IN and NOT_IN go to the list qualifier filter; conditions whose key or value is
 * an expression, or whose key is not an ID, go to the expression based filter
 * <H2>TODO: Compare performance of RegexStringComparator, SubstringComparator, EntityQualifierHelper</H2>
 *
 * @param qualifierFilters the conditions to translate
 * @return the filter list, empty when {@code qualifierFilters} is empty
 * @throws IllegalArgumentException if NULL is compared with an operator other than =, !=, is, is not
 */
protected FilterList buildQualifierFilter(List<QualifierFilterEntity> qualifierFilters) {
    final FilterList allConditions = new FilterList(Operator.MUST_PASS_ALL);
    for (QualifierFilterEntity condition : qualifierFilters) {
        // Expression on either side, or a key that is not a plain ID: evaluate as an expression
        final boolean expressionBased = condition.getKeyType() == TokenType.EXP
            || condition.getValueType() == TokenType.EXP
            || condition.getKeyType() != TokenType.ID;
        if (expressionBased) {
            if (!EagleConfigFactory.load().isCoprocessorEnabled()) {
                LOG.warn("Expression in filter may not support, because custom filter and coprocessor is disabled: "
                         + condition.toString());
            }
            allConditions.addFilter(buildExpressionBasedFilter(condition));
            continue;
        }

        // Tags are addressed by their own name, qualifiers by the name from the display-name map
        final String fieldName = condition.getKey();
        final String qualifierName = isTag(fieldName)
            ? fieldName
            : ed.getDisplayNameMap().get(fieldName).getQualifierName();

        final ComparisonOperator op = condition.getOp();
        if (ComparisonOperator.IN.equals(op) || ComparisonOperator.NOT_IN.equals(op)) {
            final Filter setFilter = buildListQualifierFilter(condition);
            if (setFilter != null) {
                allConditions.addFilter(setFilter);
            }
            continue;
        }

        // Pick the comparator for the single column value filter
        final ByteArrayComparable comparable;
        if (TokenType.NULL.equals(condition.getValueType())) {
            final boolean nullComparisonAllowed = ComparisonOperator.EQUAL.equals(op)
                || ComparisonOperator.NOT_EQUAL.equals(op)
                || ComparisonOperator.IS.equals(op)
                || ComparisonOperator.IS_NOT.equals(op);
            if (!nullComparisonAllowed) {
                throw new IllegalArgumentException("Operation: " + condition.getOp()
                                                   + " with NULL is not supported yet: "
                                                   + condition.toString()
                                                   + ", avaliable options: [=, !=, is, is not] null|NULL");
            }
            comparable = new NullComparator();
        } else if (ComparisonOperator.CONTAINS.equals(op) || ComparisonOperator.NOT_CONTAINS.equals(op)) {
            comparable = new SubstringComparator(condition.getValue());
        } else if (ComparisonOperator.LIKE.equals(op) || ComparisonOperator.NOT_LIKE.equals(op)) {
            final RegexStringComparator regexComparator =
                new RegexStringComparator(buildQualifierRegex(condition.getValue()));
            regexComparator.setCharset(charset);
            comparable = regexComparator;
        } else {
            final Class type = EntityQualifierUtils.getType(ed, fieldName);
            // Fall back to plain bytes when the coprocessor is disabled, the type is unknown
            // (tag or not found), or no typed comparator is defined for the type
            if (!EagleConfigFactory.load().isCoprocessorEnabled() || type == null
                || TypedByteArrayComparator.get(type) == null) {
                comparable = new BinaryComparator(
                    EntityQualifierUtils.toBytes(ed, fieldName, condition.getValue()));
            } else {
                comparable = new TypedByteArrayComparator(
                    EntityQualifierUtils.toBytes(ed, fieldName, condition.getValue()), type);
            }
        }

        final SingleColumnValueFilter columnFilter = new SingleColumnValueFilter(
            ed.getColumnFamily().getBytes(), qualifierName.getBytes(),
            convertToHBaseCompareOp(op), comparable);
        columnFilter.setFilterIfMissing(filterIfMissing);
        allConditions.addFilter(columnFilter);
    }
    return allConditions;
}
/**
 * Builds a {@code RowValueFilter} that evaluates the given condition as a boolean expression, and
 * records the fields the expression requires so that {@code getFilterFields()} can report them.
 *
 * <p>The required fields are accumulated across calls. Overwriting them would keep only the fields
 * of the last expression when a query contains several expression based conditions.
 *
 * @param entry the expression based condition
 * @return the row value filter wrapping the expression comparator
 */
private Filter buildExpressionBasedFilter(QualifierFilterEntity entry) {
    BooleanExpressionComparator expressionComparator = new BooleanExpressionComparator(entry, ed);
    Set<String> requiredFields = expressionComparator.getRequiredFields();
    if (requiredFields != null) {
        if (filterFields == null) {
            filterFields = new HashSet<String>();
        }
        // Merge into our own set so earlier expressions keep their fields
        filterFields.addAll(requiredFields);
    }
    return new RowValueFilter(expressionComparator);
}
/**
 * Builds the filter for an IN / NOT IN condition: one {@code SingleColumnValueFilter} with a
 * {@code BinaryComparator} per list value, combined with {@code MUST_PASS_ONE} for IN and
 * {@code MUST_PASS_ALL} for NOT IN.
 * <h2>TODO:</h2> Possibility to tune performance by using a single RegexStringComparator
 * ({@code ^(v1|v2|...)$}) for String / untyped qualifiers instead of OR[BinaryComparator,...], because
 * one SingleColumnValueFilter is much faster than several SingleColumnValueFilters in an OR list. <br/>
 * <br/>
 * ! The caller must check that the op is IN or NOT_IN
 *
 * @param entry the IN / NOT IN condition; its value is parsed as a list
 * @return the filter list holding one column filter per list value
 * @throws IllegalArgumentException if the operator is neither IN nor NOT_IN
 */
private Filter buildListQualifierFilter(QualifierFilterEntity entry) {
    final List<String> candidates = EntityQualifierUtils.parseList(entry.getValue());
    final String fieldName = entry.getKey();
    // Tags are addressed by their own name, qualifiers by the name from the display-name map
    final String qualifierName = ed.isTag(fieldName)
        ? fieldName
        : ed.getDisplayNameMap().get(fieldName).getQualifierName();

    final Operator combineWith;
    if (ComparisonOperator.IN.equals(entry.getOp())) {
        combineWith = Operator.MUST_PASS_ONE;
    } else if (ComparisonOperator.NOT_IN.equals(entry.getOp())) {
        combineWith = Operator.MUST_PASS_ALL;
    } else {
        throw new IllegalArgumentException(String
            .format("Don't support operation: %s on LIST type of value yet: %s, valid options: IN/NOT IN [LIST]",
                    entry.getOp(), entry.toString()));
    }

    final FilterList setFilterList = new FilterList(combineWith);
    for (String candidate : candidates) {
        final BinaryComparator valueComparator =
            new BinaryComparator(EntityQualifierUtils.toBytes(ed, fieldName, candidate));
        final SingleColumnValueFilter valueFilter = new SingleColumnValueFilter(
            ed.getColumnFamily().getBytes(), qualifierName.getBytes(),
            convertToHBaseCompareOp(entry.getOp()), valueComparator);
        valueFilter.setFilterIfMissing(filterIfMissing);
        setFilterList.addFilter(valueFilter);
    }
    return setFilterList;
}
/**
 * Anchors the given value so the regex has to match the whole qualifier value. Just used for LIKE
 * and NOT_LIKE.
 *
 * @param qualifierValue the regex body taken from the query; it is not escaped
 * @return {@code "^" + qualifierValue + "$"}
 */
protected String buildQualifierRegex(String qualifierValue) {
    return "^" + qualifierValue + "$";
}
/**
 * Appends the given ID to the given buffer as a quoted regex literal: "\\Q", the bytes of the ID (each
 * byte mapped to the char with the same unsigned value), then "\\E". [steal it from opentsdb, thanks
 * opentsdb :) https://github.com/OpenTSDB/opentsdb/blob/master/src/core/TsdbQuery.java]
 *
 * <p>If the ID itself contains the byte sequence {@code \E}, that sequence terminates the quoted
 * section, so a literal {@code \E} is emitted and quoting is restarted.
 *
 * @param buf the regex under construction
 * @param id the raw bytes to match literally
 */
private static void addId(final StringBuilder buf, final byte[] id) {
    buf.append("\\Q");
    boolean backslash = false;
    for (final byte b : id) {
        buf.append((char)(b & 0xFF));
        if (b == 'E' && backslash) { // If we saw a `\' and now we have a `E'.
            // So we just terminated the quoted section because we just added \E
            // to `buf'. So let's put a literal \E now and start quoting again.
            buf.append("\\\\E\\Q");
            // The backslash has been consumed; without this reset a directly following 'E'
            // (bytes `\', `E', `E') would be escaped a second time and corrupt the regex.
            backslash = false;
        } else {
            backslash = b == '\\';
        }
    }
    buf.append("\\E");
}
/**
 * Appends the given ID to the given buffer as a quoted regex literal: "\\Q", every character of the
 * ID, then "\\E".
 *
 * <p>If the ID itself contains {@code \E}, that sequence terminates the quoted section, so a literal
 * {@code \E} is emitted and quoting is restarted.
 *
 * @param buf the regex under construction
 * @param id the text to match literally
 */
@SuppressWarnings("unused")
private static void addId(final StringBuilder buf, final String id) {
    buf.append("\\Q");
    boolean backslash = false;
    // Walk the whole string; stopping at length() - 1 would silently drop the last character
    for (int i = 0; i < id.length(); i++) {
        char c = id.charAt(i);
        buf.append(c);
        if (c == 'E' && backslash) { // If we saw a `\' and now we have a `E'.
            // So we just terminated the quoted section because we just added \E
            // to `buf'. So let's put a literal \E now and start quoting again.
            buf.append("\\\\E\\Q");
            // The backslash has been consumed; a directly following 'E' is an ordinary character
            backslash = false;
        } else {
            backslash = c == '\\';
        }
    }
    buf.append("\\E");
}
/**
 * Builds the row key regex for the tag conditions. One search tag may have multiple values which have
 * an OR relationship, and the relationship between different search tags is AND; the query is like
 * "(TAG1=value11 OR TAG1=value12) AND TAG2=value2".
 *
 * <p>Tag names and values are matched through their {@code String.hashCode()}, converted to bytes.
 * Tags that are partition fields are left out. The regex skips a fixed-size header, then expects the
 * tags in ascending order of their name hash, with any number of 8-byte tag/value pairs allowed
 * before, between and after them.
 *
 * @param tags tag name to the list of accepted values
 * @return the regex to apply to the row key
 */
protected String buildTagFilterRegex(Map<String, List<String>> tags) {
    // TODO need consider that \E could be part of tag, refer to
    // https://github.com/OpenTSDB/opentsdb/blob/master/src/core/TsdbQuery.java
    final SortedMap<Integer, List<Integer>> hashedTags = new TreeMap<Integer, List<Integer>>();
    final String[] partitionFields = ed.getPartitions();
    final int partitionCount = (partitionFields == null) ? 0 : partitionFields.length;

    for (Map.Entry<String, List<String>> tag : tags.entrySet()) {
        final String tagName = tag.getKey();
        // Only tags that are not partition fields take part in the regex
        if (!ed.isPartitionTag(tagName)) {
            final List<String> rawValues = tag.getValue();
            final List<Integer> valueHashes = new ArrayList<Integer>(rawValues.size());
            for (String rawValue : rawValues) {
                valueHashes.add(rawValue.hashCode());
            }
            hashedTags.put(tagName.hashCode(), valueHashes);
        }
    }

    // header = prefix(4 bytes) + partition_hashes(4*N bytes) + timestamp (8 bytes)
    final int headerLength = 4 + partitionCount * 4 + 8;
    // <tag:4><value:4> repeated any number of times
    final String anyTagPairs = "(?:.{" + 8 + "})*";

    final StringBuilder regex = new StringBuilder("(?s)");
    regex.append("^(?:.{").append(headerLength).append("})");
    regex.append(anyTagPairs);
    for (Map.Entry<Integer, List<Integer>> hashedTag : hashedTags.entrySet()) {
        try {
            addId(regex, ByteUtil.intToBytes(hashedTag.getKey()));
            final List<Integer> valueHashes = hashedTag.getValue();
            regex.append("(?:");
            for (int i = 0; i < valueHashes.size(); i++) {
                if (i > 0) {
                    regex.append('|');
                }
                addId(regex, ByteUtil.intToBytes(valueHashes.get(i)));
            }
            regex.append(")");
            regex.append(anyTagPairs);
        } catch (Exception ex) {
            LOG.error("constructing regex error", ex);
        }
    }
    regex.append("$");

    if (LOG.isDebugEnabled()) {
        LOG.debug("Tag filter pattern is " + regex.toString());
    }
    return regex.toString();
}
/**
 * Converts a ComparisonOperator to the native HBase CompareOp. Support: =, =~, CONTAINS, IN, IS, &lt;,
 * &lt;=, &gt;, &gt;=, !=, !=~, NOT CONTAINS, IS NOT, NOT IN.
 *
 * @param comp the query operator
 * @return {@code EQUAL} for the positive match operators, {@code NOT_EQUAL} for their negations, and
 *         the matching ordering operator otherwise
 * @throws IllegalArgumentException if the operator (including {@code null}) has no HBase counterpart
 */
protected static CompareOp convertToHBaseCompareOp(ComparisonOperator comp) {
    // Ordering operators map one to one
    if (comp == ComparisonOperator.LESS) {
        return CompareOp.LESS;
    }
    if (comp == ComparisonOperator.LESS_OR_EQUAL) {
        return CompareOp.LESS_OR_EQUAL;
    }
    if (comp == ComparisonOperator.GREATER) {
        return CompareOp.GREATER;
    }
    if (comp == ComparisonOperator.GREATER_OR_EQUAL) {
        return CompareOp.GREATER_OR_EQUAL;
    }

    // Positive matches: the comparator decides how to match, the op only says "must match"
    final boolean positiveMatch = comp == ComparisonOperator.EQUAL
        || comp == ComparisonOperator.LIKE
        || comp == ComparisonOperator.CONTAINS
        || comp == ComparisonOperator.IN
        || comp == ComparisonOperator.IS;
    if (positiveMatch) {
        return CompareOp.EQUAL;
    }

    // Negated matches
    final boolean negatedMatch = comp == ComparisonOperator.NOT_EQUAL
        || comp == ComparisonOperator.NOT_LIKE
        || comp == ComparisonOperator.NOT_CONTAINS
        || comp == ComparisonOperator.IS_NOT
        || comp == ComparisonOperator.NOT_IN;
    if (negatedMatch) {
        return CompareOp.NOT_EQUAL;
    }

    LOG.error("{} operation is not supported now\n", comp);
    throw new IllegalArgumentException("Illegal operation: " + comp + ", avaliable options: "
                                       + Arrays.toString(ComparisonOperator.values()));
}
/**
 * Resolves the operator from its textual form via {@code ComparisonOperator.locateOperator} and
 * converts it to the native HBase CompareOp.
 *
 * @param comp the operator as text
 * @return the HBase compare op returned by {@code convertToHBaseCompareOp}
 */
protected static CompareOp getHBaseCompareOp(String comp) {
    final ComparisonOperator operator = ComparisonOperator.locateOperator(comp);
    return convertToHBaseCompareOp(operator);
}
}