blob: 589ad84fee32ccc18d6e01e3bc83ecdaad3dc66c [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.druid.segment;
import com.google.common.base.Predicate;
import com.google.common.base.Predicates;
import com.google.common.primitives.Ints;
import it.unimi.dsi.fastutil.ints.IntArrays;
import org.apache.druid.collections.bitmap.BitmapFactory;
import org.apache.druid.collections.bitmap.MutableBitmap;
import org.apache.druid.common.config.NullHandling;
import org.apache.druid.data.input.impl.DimensionSchema.MultiValueHandling;
import org.apache.druid.java.util.common.ISE;
import org.apache.druid.java.util.common.guava.Comparators;
import org.apache.druid.query.dimension.DimensionSpec;
import org.apache.druid.query.extraction.ExtractionFn;
import org.apache.druid.query.filter.ValueMatcher;
import org.apache.druid.query.monomorphicprocessing.RuntimeShapeInspector;
import org.apache.druid.segment.column.ColumnCapabilities;
import org.apache.druid.segment.column.ColumnCapabilitiesImpl;
import org.apache.druid.segment.column.ColumnType;
import org.apache.druid.segment.data.ArrayBasedIndexedInts;
import org.apache.druid.segment.data.IndexedInts;
import org.apache.druid.segment.filter.BooleanValueMatcher;
import org.apache.druid.segment.incremental.IncrementalIndex;
import org.apache.druid.segment.incremental.IncrementalIndexRow;
import org.apache.druid.segment.incremental.IncrementalIndexRowHolder;
import org.checkerframework.checker.nullness.qual.MonotonicNonNull;
import javax.annotation.Nullable;
import java.util.Arrays;
import java.util.BitSet;
import java.util.List;
public class StringDimensionIndexer extends DictionaryEncodedColumnIndexer<int[], String>
{
@Nullable
private static String emptyToNullIfNeeded(@Nullable Object o)
{
return o != null ? NullHandling.emptyToNullIfNeeded(o.toString()) : null;
}
private final MultiValueHandling multiValueHandling;
private final boolean hasBitmapIndexes;
private final boolean hasSpatialIndexes;
private final boolean useMaxMemoryEstimates;
private volatile boolean hasMultipleValues = false;
public StringDimensionIndexer(
MultiValueHandling multiValueHandling,
boolean hasBitmapIndexes,
boolean hasSpatialIndexes,
boolean useMaxMemoryEstimates
)
{
super(new StringDimensionDictionary(!useMaxMemoryEstimates));
this.multiValueHandling = multiValueHandling == null ? MultiValueHandling.ofDefault() : multiValueHandling;
this.hasBitmapIndexes = hasBitmapIndexes;
this.hasSpatialIndexes = hasSpatialIndexes;
this.useMaxMemoryEstimates = useMaxMemoryEstimates;
}
@Override
public EncodedKeyComponent<int[]> processRowValsToUnsortedEncodedKeyComponent(@Nullable Object dimValues, boolean reportParseExceptions)
{
final int[] encodedDimensionValues;
final int oldDictSize = dimLookup.size();
final long oldDictSizeInBytes = useMaxMemoryEstimates ? 0 : dimLookup.sizeInBytes();
if (dimValues == null) {
final int nullId = dimLookup.getId(null);
encodedDimensionValues = nullId == DimensionDictionary.ABSENT_VALUE_ID ? new int[]{dimLookup.add(null)} : new int[]{nullId};
} else if (dimValues instanceof List) {
List<Object> dimValuesList = (List<Object>) dimValues;
if (dimValuesList.isEmpty()) {
dimLookup.add(null);
encodedDimensionValues = IntArrays.EMPTY_ARRAY;
} else if (dimValuesList.size() == 1) {
encodedDimensionValues = new int[]{dimLookup.add(emptyToNullIfNeeded(dimValuesList.get(0)))};
} else {
hasMultipleValues = true;
final String[] dimensionValues = new String[dimValuesList.size()];
for (int i = 0; i < dimValuesList.size(); i++) {
dimensionValues[i] = emptyToNullIfNeeded(dimValuesList.get(i));
}
if (multiValueHandling.needSorting()) {
// Sort multival row by their unencoded values first.
Arrays.sort(dimensionValues, Comparators.naturalNullsFirst());
}
final int[] retVal = new int[dimensionValues.length];
int prevId = -1;
int pos = 0;
for (String dimensionValue : dimensionValues) {
if (multiValueHandling != MultiValueHandling.SORTED_SET) {
retVal[pos++] = dimLookup.add(dimensionValue);
continue;
}
int index = dimLookup.add(dimensionValue);
if (index != prevId) {
prevId = retVal[pos++] = index;
}
}
encodedDimensionValues = pos == retVal.length ? retVal : Arrays.copyOf(retVal, pos);
}
} else {
encodedDimensionValues = new int[]{dimLookup.add(emptyToNullIfNeeded(dimValues))};
}
// If dictionary size has changed, the sorted lookup is no longer valid.
if (oldDictSize != dimLookup.size()) {
sortedLookup = null;
}
long effectiveSizeBytes;
if (useMaxMemoryEstimates) {
effectiveSizeBytes = estimateEncodedKeyComponentSize(encodedDimensionValues);
} else {
// size of encoded array + dictionary size change
effectiveSizeBytes = 16L + (long) encodedDimensionValues.length * Integer.BYTES
+ (dimLookup.sizeInBytes() - oldDictSizeInBytes);
}
return new EncodedKeyComponent<>(encodedDimensionValues, effectiveSizeBytes);
}
/**
* Estimates size of the given key component.
* <p>
* Deprecated method. Use {@link #processRowValsToUnsortedEncodedKeyComponent(Object, boolean)}
* and {@link EncodedKeyComponent#getEffectiveSizeBytes()}.
*/
public long estimateEncodedKeyComponentSize(int[] keys)
{
// string length is being accounted for each time they are referenced, based on dimension handler interface,
// even though they are stored just once. It may overestimate the size by a bit, but we wanted to leave
// more buffer to be safe
long estimatedSize = keys.length * Integer.BYTES;
String[] vals = dimLookup.getValues(keys);
for (String val : vals) {
if (val != null) {
// According to https://www.ibm.com/developerworks/java/library/j-codetoheap/index.html
// String has the following memory usuage...
// 28 bytes of data for String metadata (class pointer, flags, locks, hash, count, offset, reference to char array)
// 16 bytes of data for the char array metadata (class pointer, flags, locks, size)
// 2 bytes for every letter of the string
int sizeOfString = 28 + 16 + (2 * val.length());
estimatedSize += sizeOfString;
}
}
return estimatedSize;
}
@Override
public int compareUnsortedEncodedKeyComponents(int[] lhs, int[] rhs)
{
int lhsLen = lhs.length;
int rhsLen = rhs.length;
int lenCompareResult = Ints.compare(lhsLen, rhsLen);
if (lenCompareResult != 0) {
// if the values don't have the same length, check if we're comparing [] and [null], which are equivalent
if (lhsLen + rhsLen == 1) {
int[] longerVal = rhsLen > lhsLen ? rhs : lhs;
if (longerVal[0] == dimLookup.getIdForNull()) {
return 0;
} else {
//noinspection ArrayEquality -- longerVal is explicitly set to only lhs or rhs
return longerVal == lhs ? 1 : -1;
}
}
}
int valsIndex = 0;
int lenToCompare = Math.min(lhsLen, rhsLen);
while (valsIndex < lenToCompare) {
int lhsVal = lhs[valsIndex];
int rhsVal = rhs[valsIndex];
if (lhsVal != rhsVal) {
final String lhsValActual = getActualValue(lhsVal, false);
final String rhsValActual = getActualValue(rhsVal, false);
int valueCompareResult = 0;
if (lhsValActual != null && rhsValActual != null) {
valueCompareResult = lhsValActual.compareTo(rhsValActual);
} else if (lhsValActual == null ^ rhsValActual == null) {
valueCompareResult = lhsValActual == null ? -1 : 1;
}
if (valueCompareResult != 0) {
return valueCompareResult;
}
}
++valsIndex;
}
return lenCompareResult;
}
@Override
public boolean checkUnsortedEncodedKeyComponentsEqual(int[] lhs, int[] rhs)
{
return Arrays.equals(lhs, rhs);
}
@Override
public int getUnsortedEncodedKeyComponentHashCode(int[] key)
{
return Arrays.hashCode(key);
}
@Override
public ColumnCapabilities getColumnCapabilities()
{
ColumnCapabilitiesImpl capabilites = new ColumnCapabilitiesImpl().setType(ColumnType.STRING)
.setHasBitmapIndexes(hasBitmapIndexes)
.setHasSpatialIndexes(hasSpatialIndexes)
.setDictionaryValuesUnique(true)
.setDictionaryValuesSorted(false);
// Strings are opportunistically multi-valued, but the capabilities are initialized as 'unknown', since a
// multi-valued row might be processed at any point during ingestion.
// We only explicitly set multiple values if we are certain that there are multiple values, otherwise, a race
// condition might occur where this indexer might process a multi-valued row in the period between obtaining the
// capabilities, and actually processing the rows with a selector. Leaving as unknown allows the caller to decide
// how to handle this.
if (hasMultipleValues) {
capabilites.setHasMultipleValues(true);
}
// Likewise, only set dictionaryEncoded if explicitly if true for a similar reason as multi-valued handling. The
// dictionary is populated as rows are processed, but there might be implicit default values not accounted for in
// the dictionary yet. We can be certain that the dictionary has an entry for every value if either of
// a) we have already processed an explitic default (null) valued row for this column
// b) the processing was not 'sparse', meaning that this indexer has processed an explict value for every row
// is true.
final boolean allValuesEncoded = dictionaryEncodesAllValues();
if (allValuesEncoded) {
capabilites.setDictionaryEncoded(true);
}
if (isSparse || dimLookup.getIdForNull() != DimensionDictionary.ABSENT_VALUE_ID) {
capabilites.setHasNulls(true);
}
return capabilites;
}
@Override
public DimensionSelector makeDimensionSelector(
final DimensionSpec spec,
final IncrementalIndexRowHolder currEntry,
final IncrementalIndex.DimensionDesc desc
)
{
final ExtractionFn extractionFn = spec.getExtractionFn();
final int dimIndex = desc.getIndex();
// maxId is used in concert with getLastRowIndex() in IncrementalIndex to ensure that callers do not encounter
// rows that contain IDs over the initially-reported cardinality. The main idea is that IncrementalIndex establishes
// a watermark at the time a cursor is created, and doesn't allow the cursor to walk past that watermark.
//
// Additionally, this selector explicitly blocks knowledge of IDs past maxId that may occur from other causes
// (for example: nulls getting generated for empty arrays, or calls to lookupId).
final int maxId = getCardinality();
class IndexerDimensionSelector implements DimensionSelector, IdLookup
{
private final ArrayBasedIndexedInts indexedInts = new ArrayBasedIndexedInts();
@Nullable
@MonotonicNonNull
private int[] nullIdIntArray;
@Override
public IndexedInts getRow()
{
final Object[] dims = currEntry.get().getDims();
int[] indices;
if (dimIndex < dims.length) {
indices = (int[]) dims[dimIndex];
} else {
indices = null;
}
int[] row = null;
int rowSize = 0;
// usually due to currEntry's rowIndex is smaller than the row's rowIndex in which this dim first appears
if (indices == null || indices.length == 0) {
if (hasMultipleValues) {
row = IntArrays.EMPTY_ARRAY;
rowSize = 0;
} else {
final int nullId = getEncodedValue(null, false);
if (nullId >= 0 && nullId < maxId) {
// null was added to the dictionary before this selector was created; return its ID.
if (nullIdIntArray == null) {
nullIdIntArray = new int[]{nullId};
}
row = nullIdIntArray;
rowSize = 1;
} else {
// null doesn't exist in the dictionary; return an empty array.
// Choose to use ArrayBasedIndexedInts later, instead of special "empty" IndexedInts, for monomorphism
row = IntArrays.EMPTY_ARRAY;
rowSize = 0;
}
}
}
if (row == null && indices != null && indices.length > 0) {
row = indices;
rowSize = indices.length;
}
indexedInts.setValues(row, rowSize);
return indexedInts;
}
@Override
public ValueMatcher makeValueMatcher(final String value)
{
if (extractionFn == null) {
final int valueId = lookupId(value);
if (valueId >= 0 || value == null) {
return new ValueMatcher()
{
@Override
public boolean matches()
{
Object[] dims = currEntry.get().getDims();
if (dimIndex >= dims.length) {
return value == null;
}
int[] dimsInt = (int[]) dims[dimIndex];
if (dimsInt == null || dimsInt.length == 0) {
return value == null;
}
for (int id : dimsInt) {
if (id == valueId) {
return true;
}
}
return false;
}
@Override
public void inspectRuntimeShape(RuntimeShapeInspector inspector)
{
// nothing to inspect
}
};
} else {
return BooleanValueMatcher.of(false);
}
} else {
// Employ caching BitSet optimization
return makeValueMatcher(Predicates.equalTo(value));
}
}
@Override
public ValueMatcher makeValueMatcher(final Predicate<String> predicate)
{
final BitSet checkedIds = new BitSet(maxId);
final BitSet matchingIds = new BitSet(maxId);
final boolean matchNull = predicate.apply(null);
// Lazy matcher; only check an id if matches() is called.
return new ValueMatcher()
{
@Override
public boolean matches()
{
Object[] dims = currEntry.get().getDims();
if (dimIndex >= dims.length) {
return matchNull;
}
int[] dimsInt = (int[]) dims[dimIndex];
if (dimsInt == null || dimsInt.length == 0) {
return matchNull;
}
for (int id : dimsInt) {
if (checkedIds.get(id)) {
if (matchingIds.get(id)) {
return true;
}
} else {
final boolean matches = predicate.apply(lookupName(id));
checkedIds.set(id);
if (matches) {
matchingIds.set(id);
return true;
}
}
}
return false;
}
@Override
public void inspectRuntimeShape(RuntimeShapeInspector inspector)
{
// nothing to inspect
}
};
}
@Override
public int getValueCardinality()
{
return maxId;
}
@Override
public String lookupName(int id)
{
if (id >= maxId) {
// Sanity check; IDs beyond maxId should not be known to callers. (See comment above.)
throw new ISE("id[%d] >= maxId[%d]", id, maxId);
}
final String strValue = getActualValue(id, false);
return extractionFn == null ? strValue : extractionFn.apply(strValue);
}
@Override
public boolean nameLookupPossibleInAdvance()
{
return dictionaryEncodesAllValues();
}
@Nullable
@Override
public IdLookup idLookup()
{
return extractionFn == null ? this : null;
}
@Override
public int lookupId(String name)
{
if (extractionFn != null) {
throw new UnsupportedOperationException(
"cannot perform lookup when applying an extraction function"
);
}
final int id = getEncodedValue(name, false);
if (id < maxId) {
return id;
} else {
// Can happen if a value was added to our dimLookup after this selector was created. Act like it
// doesn't exist.
return DimensionDictionary.ABSENT_VALUE_ID;
}
}
@SuppressWarnings("deprecation")
@Nullable
@Override
public Object getObject()
{
IncrementalIndexRow key = currEntry.get();
if (key == null) {
return null;
}
Object[] dims = key.getDims();
if (dimIndex >= dims.length) {
return null;
}
return convertUnsortedEncodedKeyComponentToActualList((int[]) dims[dimIndex]);
}
@SuppressWarnings("deprecation")
@Override
public Class classOfObject()
{
return Object.class;
}
@Override
public void inspectRuntimeShape(RuntimeShapeInspector inspector)
{
// nothing to inspect
}
}
return new IndexerDimensionSelector();
}
@Nullable
@Override
public Object convertUnsortedEncodedKeyComponentToActualList(int[] key)
{
if (key == null || key.length == 0) {
return null;
}
if (key.length == 1) {
return getActualValue(key[0], false);
} else {
String[] rowArray = new String[key.length];
for (int i = 0; i < key.length; i++) {
String val = getActualValue(key[i], false);
rowArray[i] = NullHandling.nullToEmptyIfNeeded(val);
}
return Arrays.asList(rowArray);
}
}
@Override
public void fillBitmapsFromUnsortedEncodedKeyComponent(
int[] key,
int rowNum,
MutableBitmap[] bitmapIndexes,
BitmapFactory factory
)
{
if (!hasBitmapIndexes) {
throw new UnsupportedOperationException("This column does not include bitmap indexes");
}
for (int dimValIdx : key) {
if (bitmapIndexes[dimValIdx] == null) {
bitmapIndexes[dimValIdx] = factory.makeEmptyMutableBitmap();
}
bitmapIndexes[dimValIdx].add(rowNum);
}
}
}