blob: cf033eaa65d6beca4f7cb942ee5d5aa90b990090 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.druid.query.groupby.epinephelinae.column;
import com.google.common.collect.BiMap;
import com.google.common.collect.HashBiMap;
import it.unimi.dsi.fastutil.objects.Object2IntMap;
import org.apache.druid.error.DruidException;
import org.apache.druid.query.groupby.epinephelinae.DictionaryBuildingUtils;
import org.apache.druid.segment.column.ColumnType;
import org.apache.druid.segment.column.NullableTypeStrategy;
import javax.annotation.concurrent.NotThreadSafe;
import java.util.ArrayList;
import java.util.List;
/**
* Strategy for grouping dimensions which can have variable-width objects, and aren't backed by prebuilt dictionaries. It
* encapsulates the dictionary building logic, along with providing the implementations for dimension to dictionary id
* encoding-decoding.
* <p>
* This strategy can handle any dimension that can be addressed on a reverse-dictionary. Reverse dictionary uses
* a sorted map, rather than a hashmap.
* <p>
* This is the most expensive of all the strategies, and hence must be used only when other strategies aren't valid.
*/
@NotThreadSafe
public class DictionaryBuildingGroupByColumnSelectorStrategy<DimensionType>
extends KeyMappingGroupByColumnSelectorStrategy<DimensionType>
{
private DictionaryBuildingGroupByColumnSelectorStrategy(
DimensionIdCodec<DimensionType> dimensionIdCodec,
ColumnType columnType,
NullableTypeStrategy<DimensionType> nullableTypeStrategy,
DimensionType defaultValue
)
{
super(dimensionIdCodec, columnType, nullableTypeStrategy, defaultValue);
}
/**
* Creates an implementation of the strategy for the given type
*/
public static GroupByColumnSelectorStrategy forType(final ColumnType columnType)
{
if (columnType.equals(ColumnType.STRING)) {
// String types are handled specially because they can have multi-value dimensions
throw DruidException.defensive("Should use special variant which handles multi-value dimensions");
} else if (
// Defensive check, primitives should be using a faster fixed-width strategy
columnType.equals(ColumnType.DOUBLE)
|| columnType.equals(ColumnType.FLOAT)
|| columnType.equals(ColumnType.LONG)) {
throw DruidException.defensive("Could used a fixed width strategy");
}
if (ColumnType.STRING_ARRAY.equals(columnType)) {
forStringArrays();
}
// Catch-all for all other types, that can only have single-valued dimensions
return forArrayAndComplexTypes(columnType);
}
/**
* Implemenatation of dictionary building strategy for types other than strings (since they can be multi-valued and need
* to be handled separately) and numeric primitives (since they can be handled by fixed-width strategy).
* This also means that we handle array and complex types here, which simplifies the generics a lot, as everything can be
* treated as Object in this class.
* <p>
* Also, there isn't any concept of multi-values here, therefore Dimension == DimensionHolderType == Object. We still
* homogenize rogue selectors which can return non-standard implementation of arrays (like Long[] for long arrays instead of
* Object[]) to what the callers would expect (i.e. Object[] in this case).
*/
private static GroupByColumnSelectorStrategy forArrayAndComplexTypes(final ColumnType columnType)
{
return new DictionaryBuildingGroupByColumnSelectorStrategy<>(
new UniValueDimensionIdCodec(columnType.getNullableStrategy()),
columnType,
columnType.getNullableStrategy(),
null
);
}
private static GroupByColumnSelectorStrategy forStringArrays()
{
return new DictionaryBuildingGroupByColumnSelectorStrategy<>(
new StringArrayDimensionIdCodec(),
ColumnType.STRING_ARRAY,
ColumnType.STRING_ARRAY.getNullableStrategy(),
null
);
}
private static class UniValueDimensionIdCodec implements DimensionIdCodec<Object>
{
/**
* Dictionary for mapping the dimension value to an index. i-th position in the dictionary holds the value represented
* by the dictionaryId "i".
* Therefore, if a value has a dictionary id "i", dictionary.get(i) = value
* If a -> 0, b -> 1, c -> 2 (value -> dictionaryId), then the dictionary would be laid out like: [a, b, c]
*/
private final List<Object> dictionary;
/**
* Reverse dictionary for faster lookup into the dictionary, and reusing pre-existing dictionary ids.
* <p>
* An entry of form (value, i) in the reverse dictionary represents that "value" is present at the i-th location in the
* {@link #dictionary}.
* Absence of mapping of a "value" (denoted by returning {@link GroupByColumnSelectorStrategy#GROUP_BY_MISSING_VALUE})
* represents that the value is absent in the dictionary
* If a -> 0, b -> 1, c -> 2 (value -> dictionaryId), then the reverse dictionary would have the entries (a, 0), (b, 1),
* (c, 2)
*/
private final Object2IntMap<Object> reverseDictionary;
@SuppressWarnings("rawtypes")
private final NullableTypeStrategy nullableTypeStrategy;
public UniValueDimensionIdCodec(final NullableTypeStrategy nullableTypeStrategy)
{
this.dictionary = DictionaryBuildingUtils.createDictionary();
this.reverseDictionary = DictionaryBuildingUtils.createReverseDictionary(nullableTypeStrategy);
this.nullableTypeStrategy = nullableTypeStrategy;
}
@Override
public MemoryFootprint<Integer> lookupId(Object dimension)
{
int dictId = reverseDictionary.getInt(dimension);
int footprintIncrease = 0;
// Even if called again, then this is no-op
if (dictId < 0) {
final int size = dictionary.size();
dictionary.add(dimension);
reverseDictionary.put(dimension, size);
dictId = size;
// MultiValueHOlder is always expected to handle the type, once the coercion is complete
//noinspection unchecked
footprintIncrease = DictionaryBuildingUtils.estimateEntryFootprint(
nullableTypeStrategy.estimateSizeBytes(dimension)
);
}
return new MemoryFootprint<>(dictId, footprintIncrease);
}
@Override
public Object idToKey(int id)
{
if (id >= dictionary.size()) {
throw DruidException.defensive("Unknown dictionary id [%d]", id);
}
// No need to handle GROUP_BY_MISSING_VALUE, by contract
return dictionary.get(id);
}
@Override
public boolean canCompareIds()
{
// Dictionaries are built on the fly, and ids are assigned in the order in which the value is added to the
// dictionary.
return false;
}
@Override
public void reset()
{
dictionary.clear();
reverseDictionary.clear();
}
}
/**
* {@link DimensionIdCodec} for string arrays. Dictionary building for string arrays is optimised to have a dual
* dictionary - one that maps the string values to an id, and another which maps an array of these ids, to the returned
* dictionary id. This reduces the amount of heap memory required to build the dictionaries
*/
private static class StringArrayDimensionIdCodec implements DimensionIdCodec<Object>
{
// contains string <-> id for each element of the multi value grouping column
// for eg : [a,b,c] is the col value. dictionaryToInt will contain { a <-> 1, b <-> 2, c <-> 3}
private final BiMap<String, Integer> elementBiDictionary = HashBiMap.create();
// stores each row as an integer array where the int represents the value in dictionaryToInt
// for eg : [a,b,c] would be converted to [1,2,3] and assigned a integer value 1.
// [1,2,3] <-> 1
private final BiMap<ArrayList<Integer>, Integer> arrayBiDictionary = HashBiMap.create();
@Override
public MemoryFootprint<Integer> lookupId(Object dimension)
{
// dimension IS non-null, by contract of this method
Object[] stringArray = (Object[]) dimension;
ArrayList<Integer> dictionaryEncodedStringArray = new ArrayList<>();
int estimatedFootprint = 0;
for (Object element : stringArray) {
String elementCasted = (String) element;
Integer elementDictId = elementBiDictionary.get(elementCasted);
if (elementDictId == null) {
elementDictId = elementBiDictionary.size();
elementBiDictionary.put(elementCasted, elementDictId);
// We're not using the dictionary and reverseDictionary from DictionaryBuilding, but the BiMap is close enough
// that we expect this footprint calculation to still be useful.
estimatedFootprint +=
DictionaryBuildingUtils.estimateEntryFootprint(elementCasted == null
? 0
: elementCasted.length() * Character.BYTES);
}
dictionaryEncodedStringArray.add(elementDictId);
}
Integer arrayDictId = arrayBiDictionary.get(dictionaryEncodedStringArray);
if (arrayDictId == null) {
arrayDictId = arrayBiDictionary.size();
arrayBiDictionary.put(dictionaryEncodedStringArray, arrayDictId);
estimatedFootprint +=
DictionaryBuildingUtils.estimateEntryFootprint(dictionaryEncodedStringArray.size() * Integer.BYTES);
}
return new MemoryFootprint<>(arrayDictId, estimatedFootprint);
}
@Override
public Object idToKey(int id)
{
ArrayList<Integer> dictionaryEncodedStringArray = arrayBiDictionary.inverse().get(id);
final Object[] stringRepresentation = new Object[dictionaryEncodedStringArray.size()];
for (int i = 0; i < dictionaryEncodedStringArray.size(); ++i) {
stringRepresentation[i] = elementBiDictionary.inverse().get(dictionaryEncodedStringArray.get(i));
}
return stringRepresentation;
}
@Override
public boolean canCompareIds()
{
return false;
}
@Override
public void reset()
{
arrayBiDictionary.clear();
elementBiDictionary.clear();
}
}
}