processing/src/main/java/org/apache/druid/query/groupby/epinephelinae/column/DictionaryBuildingGroupByColumnSelectorStrategy.java - druid - Git at Google

 /*
  * Licensed to the Apache Software Foundation (ASF) under one
  * or more contributor license agreements.  See the NOTICE file
  * distributed with this work for additional information
  * regarding copyright ownership.  The ASF licenses this file
  * to you under the Apache License, Version 2.0 (the
  * "License"); you may not use this file except in compliance
  * with the License.  You may obtain a copy of the License at
  *
  *   http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing,
  * software distributed under the License is distributed on an
  * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
  * KIND, either express or implied.  See the License for the
  * specific language governing permissions and limitations
  * under the License.
  */

 package org.apache.druid.query.groupby.epinephelinae.column;

 import com.google.common.collect.BiMap;
 import com.google.common.collect.HashBiMap;
 import it.unimi.dsi.fastutil.objects.Object2IntMap;
 import org.apache.druid.error.DruidException;
 import org.apache.druid.query.groupby.epinephelinae.DictionaryBuildingUtils;
 import org.apache.druid.segment.column.ColumnType;
 import org.apache.druid.segment.column.NullableTypeStrategy;

 import javax.annotation.concurrent.NotThreadSafe;
 import java.util.ArrayList;
 import java.util.List;

 /**
  * Strategy for grouping dimensions which can have variable-width objects, and aren't backed by prebuilt dictionaries. It
  * encapsulates the dictionary building logic, along with providing the implementations for dimension to dictionary id
  * encoding-decoding.
  * <p>
  * This strategy can handle any dimension that can be addressed on a reverse-dictionary. Reverse dictionary uses
  * a sorted map, rather than a hashmap.
  * <p>
  * This is the most expensive of all the strategies, and hence must be used only when other strategies aren't valid.
  */
 @NotThreadSafe
 public class DictionaryBuildingGroupByColumnSelectorStrategy<DimensionType>
     extends KeyMappingGroupByColumnSelectorStrategy<DimensionType>
 {

   private DictionaryBuildingGroupByColumnSelectorStrategy(
       DimensionIdCodec<DimensionType> dimensionIdCodec,
       ColumnType columnType,
       NullableTypeStrategy<DimensionType> nullableTypeStrategy,
       DimensionType defaultValue
   )
   {
     super(dimensionIdCodec, columnType, nullableTypeStrategy, defaultValue);
   }

   /**
    * Creates an implementation of the strategy for the given type
    */
   public static GroupByColumnSelectorStrategy forType(final ColumnType columnType)
   {
     if (columnType.equals(ColumnType.STRING)) {
       // String types are handled specially because they can have multi-value dimensions
       throw DruidException.defensive("Should use special variant which handles multi-value dimensions");
     } else if (
       // Defensive check, primitives should be using a faster fixed-width strategy
         columnType.equals(ColumnType.DOUBLE)
         || columnType.equals(ColumnType.FLOAT)
         || columnType.equals(ColumnType.LONG)) {
       throw DruidException.defensive("Could used a fixed width strategy");
     }

     if (ColumnType.STRING_ARRAY.equals(columnType)) {
       forStringArrays();
     }

     // Catch-all for all other types, that can only have single-valued dimensions
     return forArrayAndComplexTypes(columnType);
   }

   /**
    * Implemenatation of dictionary building strategy for types other than strings (since they can be multi-valued and need
    * to be handled separately) and numeric primitives (since they can be handled by fixed-width strategy).
    * This also means that we handle array and complex types here, which simplifies the generics a lot, as everything can be
    * treated as Object in this class.
    * <p>
    * Also, there isn't any concept of multi-values here, therefore Dimension == DimensionHolderType == Object. We still
    * homogenize rogue selectors which can return non-standard implementation of arrays (like Long[] for long arrays instead of
    * Object[]) to what the callers would expect (i.e. Object[] in this case).
    */
   private static GroupByColumnSelectorStrategy forArrayAndComplexTypes(final ColumnType columnType)
   {
     return new DictionaryBuildingGroupByColumnSelectorStrategy<>(
         new UniValueDimensionIdCodec(columnType.getNullableStrategy()),
         columnType,
         columnType.getNullableStrategy(),
         null
     );
   }

   private static GroupByColumnSelectorStrategy forStringArrays()
   {
     return new DictionaryBuildingGroupByColumnSelectorStrategy<>(
         new StringArrayDimensionIdCodec(),
         ColumnType.STRING_ARRAY,
         ColumnType.STRING_ARRAY.getNullableStrategy(),
         null
     );
   }

   private static class UniValueDimensionIdCodec implements DimensionIdCodec<Object>
   {
     /**
      * Dictionary for mapping the dimension value to an index. i-th position in the dictionary holds the value represented
      * by the dictionaryId "i".
      * Therefore, if a value has a dictionary id "i", dictionary.get(i) = value
      * If a -> 0, b -> 1, c -> 2 (value -> dictionaryId), then the dictionary would be laid out like: [a, b, c]
      */
     private final List<Object> dictionary;

     /**
      * Reverse dictionary for faster lookup into the dictionary, and reusing pre-existing dictionary ids.
      * <p>
      * An entry of form (value, i) in the reverse dictionary represents that "value" is present at the i-th location in the
      * {@link #dictionary}.
      * Absence of mapping of a "value" (denoted by returning {@link GroupByColumnSelectorStrategy#GROUP_BY_MISSING_VALUE})
      * represents that the value is absent in the dictionary
      * If a -> 0, b -> 1, c -> 2 (value -> dictionaryId), then the reverse dictionary would have the entries (a, 0), (b, 1),
      * (c, 2)
      */
     private final Object2IntMap<Object> reverseDictionary;

     @SuppressWarnings("rawtypes")
     private final NullableTypeStrategy nullableTypeStrategy;

     public UniValueDimensionIdCodec(final NullableTypeStrategy nullableTypeStrategy)
     {
       this.dictionary = DictionaryBuildingUtils.createDictionary();
       this.reverseDictionary = DictionaryBuildingUtils.createReverseDictionary(nullableTypeStrategy);
       this.nullableTypeStrategy = nullableTypeStrategy;
     }

     @Override
     public MemoryFootprint<Integer> lookupId(Object dimension)
     {
       int dictId = reverseDictionary.getInt(dimension);
       int footprintIncrease = 0;
       // Even if called again, then this is no-op
       if (dictId < 0) {
         final int size = dictionary.size();
         dictionary.add(dimension);
         reverseDictionary.put(dimension, size);
         dictId = size;
         // MultiValueHOlder is always expected to handle the type, once the coercion is complete
         //noinspection unchecked
         footprintIncrease = DictionaryBuildingUtils.estimateEntryFootprint(
             nullableTypeStrategy.estimateSizeBytes(dimension)
         );
       }
       return new MemoryFootprint<>(dictId, footprintIncrease);
     }

     @Override
     public Object idToKey(int id)
     {
       if (id >= dictionary.size()) {
         throw DruidException.defensive("Unknown dictionary id [%d]", id);
       }
       // No need to handle GROUP_BY_MISSING_VALUE, by contract
       return dictionary.get(id);
     }

     @Override
     public boolean canCompareIds()
     {
       // Dictionaries are built on the fly, and ids are assigned in the order in which the value is added to the
       // dictionary.
       return false;
     }

     @Override
     public void reset()
     {
       dictionary.clear();
       reverseDictionary.clear();
     }
   }

   /**
    * {@link DimensionIdCodec} for string arrays. Dictionary building for string arrays is optimised to have a dual
    * dictionary - one that maps the string values to an id, and another which maps an array of these ids, to the returned
    * dictionary id. This reduces the amount of heap memory required to build the dictionaries
    */
   private static class StringArrayDimensionIdCodec implements DimensionIdCodec<Object>
   {
     // contains string <-> id for each element of the multi value grouping column
     // for eg : [a,b,c] is the col value. dictionaryToInt will contain { a <-> 1, b <-> 2, c <-> 3}
     private final BiMap<String, Integer> elementBiDictionary = HashBiMap.create();

     // stores each row as an integer array where the int represents the value in dictionaryToInt
     // for eg : [a,b,c] would be converted to [1,2,3] and assigned a integer value 1.
     // [1,2,3] <-> 1
     private final BiMap<ArrayList<Integer>, Integer> arrayBiDictionary = HashBiMap.create();

     @Override
     public MemoryFootprint<Integer> lookupId(Object dimension)
     {
       // dimension IS non-null, by contract of this method
       Object[] stringArray = (Object[]) dimension;
       ArrayList<Integer> dictionaryEncodedStringArray = new ArrayList<>();
       int estimatedFootprint = 0;
       for (Object element : stringArray) {
         String elementCasted = (String) element;
         Integer elementDictId = elementBiDictionary.get(elementCasted);
         if (elementDictId == null) {
           elementDictId = elementBiDictionary.size();
           elementBiDictionary.put(elementCasted, elementDictId);
           // We're not using the dictionary and reverseDictionary from DictionaryBuilding, but the BiMap is close enough
           // that we expect this footprint calculation to still be useful.
           estimatedFootprint +=
               DictionaryBuildingUtils.estimateEntryFootprint(elementCasted == null
                                                              ? 0
                                                              : elementCasted.length() * Character.BYTES);
         }
         dictionaryEncodedStringArray.add(elementDictId);
       }

       Integer arrayDictId = arrayBiDictionary.get(dictionaryEncodedStringArray);
       if (arrayDictId == null) {
         arrayDictId = arrayBiDictionary.size();
         arrayBiDictionary.put(dictionaryEncodedStringArray, arrayDictId);
         estimatedFootprint +=
             DictionaryBuildingUtils.estimateEntryFootprint(dictionaryEncodedStringArray.size() * Integer.BYTES);
       }
       return new MemoryFootprint<>(arrayDictId, estimatedFootprint);
     }

     @Override
     public Object idToKey(int id)
     {
       ArrayList<Integer> dictionaryEncodedStringArray = arrayBiDictionary.inverse().get(id);
       final Object[] stringRepresentation = new Object[dictionaryEncodedStringArray.size()];
       for (int i = 0; i < dictionaryEncodedStringArray.size(); ++i) {
         stringRepresentation[i] = elementBiDictionary.inverse().get(dictionaryEncodedStringArray.get(i));
       }
       return stringRepresentation;
     }

     @Override
     public boolean canCompareIds()
     {
       return false;
     }

     @Override
     public void reset()
     {
       arrayBiDictionary.clear();
       elementBiDictionary.clear();
     }
   }
 }
	/*
	* Licensed to the Apache Software Foundation (ASF) under one
	* or more contributor license agreements. See the NOTICE file
	* distributed with this work for additional information
	* regarding copyright ownership. The ASF licenses this file
	* to you under the Apache License, Version 2.0 (the
	* "License"); you may not use this file except in compliance
	* with the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing,
	* software distributed under the License is distributed on an
	* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
	* KIND, either express or implied. See the License for the
	* specific language governing permissions and limitations
	* under the License.
	*/

	package org.apache.druid.query.groupby.epinephelinae.column;

	import com.google.common.collect.BiMap;
	import com.google.common.collect.HashBiMap;
	import it.unimi.dsi.fastutil.objects.Object2IntMap;
	import org.apache.druid.error.DruidException;
	import org.apache.druid.query.groupby.epinephelinae.DictionaryBuildingUtils;
	import org.apache.druid.segment.column.ColumnType;
	import org.apache.druid.segment.column.NullableTypeStrategy;

	import javax.annotation.concurrent.NotThreadSafe;
	import java.util.ArrayList;
	import java.util.List;

	/**
	* Strategy for grouping dimensions which can have variable-width objects, and aren't backed by prebuilt dictionaries. It
	* encapsulates the dictionary building logic, along with providing the implementations for dimension to dictionary id
	* encoding-decoding.
	* <p>
	* This strategy can handle any dimension that can be addressed on a reverse-dictionary. Reverse dictionary uses
	* a sorted map, rather than a hashmap.
	* <p>
	* This is the most expensive of all the strategies, and hence must be used only when other strategies aren't valid.
	*/
	@NotThreadSafe
	public class DictionaryBuildingGroupByColumnSelectorStrategy<DimensionType>
	extends KeyMappingGroupByColumnSelectorStrategy<DimensionType>
	{

	private DictionaryBuildingGroupByColumnSelectorStrategy(
	DimensionIdCodec<DimensionType> dimensionIdCodec,
	ColumnType columnType,
	NullableTypeStrategy<DimensionType> nullableTypeStrategy,
	DimensionType defaultValue
	)
	{
	super(dimensionIdCodec, columnType, nullableTypeStrategy, defaultValue);
	}

	/**
	* Creates an implementation of the strategy for the given type
	*/
	public static GroupByColumnSelectorStrategy forType(final ColumnType columnType)
	{
	if (columnType.equals(ColumnType.STRING)) {
	// String types are handled specially because they can have multi-value dimensions
	throw DruidException.defensive("Should use special variant which handles multi-value dimensions");
	} else if (
	// Defensive check, primitives should be using a faster fixed-width strategy
	columnType.equals(ColumnType.DOUBLE)
	\|\| columnType.equals(ColumnType.FLOAT)
	\|\| columnType.equals(ColumnType.LONG)) {
	throw DruidException.defensive("Could used a fixed width strategy");
	}

	if (ColumnType.STRING_ARRAY.equals(columnType)) {
	forStringArrays();
	}

	// Catch-all for all other types, that can only have single-valued dimensions
	return forArrayAndComplexTypes(columnType);
	}

	/**
	* Implemenatation of dictionary building strategy for types other than strings (since they can be multi-valued and need
	* to be handled separately) and numeric primitives (since they can be handled by fixed-width strategy).
	* This also means that we handle array and complex types here, which simplifies the generics a lot, as everything can be
	* treated as Object in this class.
	* <p>
	* Also, there isn't any concept of multi-values here, therefore Dimension == DimensionHolderType == Object. We still
	* homogenize rogue selectors which can return non-standard implementation of arrays (like Long[] for long arrays instead of
	* Object[]) to what the callers would expect (i.e. Object[] in this case).
	*/
	private static GroupByColumnSelectorStrategy forArrayAndComplexTypes(final ColumnType columnType)
	{
	return new DictionaryBuildingGroupByColumnSelectorStrategy<>(
	new UniValueDimensionIdCodec(columnType.getNullableStrategy()),
	columnType,
	columnType.getNullableStrategy(),
	null
	);
	}

	private static GroupByColumnSelectorStrategy forStringArrays()
	{
	return new DictionaryBuildingGroupByColumnSelectorStrategy<>(
	new StringArrayDimensionIdCodec(),
	ColumnType.STRING_ARRAY,
	ColumnType.STRING_ARRAY.getNullableStrategy(),
	null
	);
	}

	private static class UniValueDimensionIdCodec implements DimensionIdCodec<Object>
	{
	/**
	* Dictionary for mapping the dimension value to an index. i-th position in the dictionary holds the value represented
	* by the dictionaryId "i".
	* Therefore, if a value has a dictionary id "i", dictionary.get(i) = value
	* If a -> 0, b -> 1, c -> 2 (value -> dictionaryId), then the dictionary would be laid out like: [a, b, c]
	*/
	private final List<Object> dictionary;

	/**
	* Reverse dictionary for faster lookup into the dictionary, and reusing pre-existing dictionary ids.
	* <p>
	* An entry of form (value, i) in the reverse dictionary represents that "value" is present at the i-th location in the
	* {@link #dictionary}.
	* Absence of mapping of a "value" (denoted by returning {@link GroupByColumnSelectorStrategy#GROUP_BY_MISSING_VALUE})
	* represents that the value is absent in the dictionary
	* If a -> 0, b -> 1, c -> 2 (value -> dictionaryId), then the reverse dictionary would have the entries (a, 0), (b, 1),
	* (c, 2)
	*/
	private final Object2IntMap<Object> reverseDictionary;

	@SuppressWarnings("rawtypes")
	private final NullableTypeStrategy nullableTypeStrategy;

	public UniValueDimensionIdCodec(final NullableTypeStrategy nullableTypeStrategy)
	{
	this.dictionary = DictionaryBuildingUtils.createDictionary();
	this.reverseDictionary = DictionaryBuildingUtils.createReverseDictionary(nullableTypeStrategy);
	this.nullableTypeStrategy = nullableTypeStrategy;
	}

	@Override
	public MemoryFootprint<Integer> lookupId(Object dimension)
	{
	int dictId = reverseDictionary.getInt(dimension);
	int footprintIncrease = 0;
	// Even if called again, then this is no-op
	if (dictId < 0) {
	final int size = dictionary.size();
	dictionary.add(dimension);
	reverseDictionary.put(dimension, size);
	dictId = size;
	// MultiValueHOlder is always expected to handle the type, once the coercion is complete
	//noinspection unchecked
	footprintIncrease = DictionaryBuildingUtils.estimateEntryFootprint(
	nullableTypeStrategy.estimateSizeBytes(dimension)
	);
	}
	return new MemoryFootprint<>(dictId, footprintIncrease);
	}

	@Override
	public Object idToKey(int id)
	{
	if (id >= dictionary.size()) {
	throw DruidException.defensive("Unknown dictionary id [%d]", id);
	}
	// No need to handle GROUP_BY_MISSING_VALUE, by contract
	return dictionary.get(id);
	}

	@Override
	public boolean canCompareIds()
	{
	// Dictionaries are built on the fly, and ids are assigned in the order in which the value is added to the
	// dictionary.
	return false;
	}

	@Override
	public void reset()
	{
	dictionary.clear();
	reverseDictionary.clear();
	}
	}

	/**
	* {@link DimensionIdCodec} for string arrays. Dictionary building for string arrays is optimised to have a dual
	* dictionary - one that maps the string values to an id, and another which maps an array of these ids, to the returned
	* dictionary id. This reduces the amount of heap memory required to build the dictionaries
	*/
	private static class StringArrayDimensionIdCodec implements DimensionIdCodec<Object>
	{
	// contains string <-> id for each element of the multi value grouping column
	// for eg : [a,b,c] is the col value. dictionaryToInt will contain { a <-> 1, b <-> 2, c <-> 3}
	private final BiMap<String, Integer> elementBiDictionary = HashBiMap.create();

	// stores each row as an integer array where the int represents the value in dictionaryToInt
	// for eg : [a,b,c] would be converted to [1,2,3] and assigned a integer value 1.
	// [1,2,3] <-> 1
	private final BiMap<ArrayList<Integer>, Integer> arrayBiDictionary = HashBiMap.create();

	@Override
	public MemoryFootprint<Integer> lookupId(Object dimension)
	{
	// dimension IS non-null, by contract of this method
	Object[] stringArray = (Object[]) dimension;
	ArrayList<Integer> dictionaryEncodedStringArray = new ArrayList<>();
	int estimatedFootprint = 0;
	for (Object element : stringArray) {
	String elementCasted = (String) element;
	Integer elementDictId = elementBiDictionary.get(elementCasted);
	if (elementDictId == null) {
	elementDictId = elementBiDictionary.size();
	elementBiDictionary.put(elementCasted, elementDictId);
	// We're not using the dictionary and reverseDictionary from DictionaryBuilding, but the BiMap is close enough
	// that we expect this footprint calculation to still be useful.
	estimatedFootprint +=
	DictionaryBuildingUtils.estimateEntryFootprint(elementCasted == null
	? 0
	: elementCasted.length() * Character.BYTES);
	}
	dictionaryEncodedStringArray.add(elementDictId);
	}

	Integer arrayDictId = arrayBiDictionary.get(dictionaryEncodedStringArray);
	if (arrayDictId == null) {
	arrayDictId = arrayBiDictionary.size();
	arrayBiDictionary.put(dictionaryEncodedStringArray, arrayDictId);
	estimatedFootprint +=
	DictionaryBuildingUtils.estimateEntryFootprint(dictionaryEncodedStringArray.size() * Integer.BYTES);
	}
	return new MemoryFootprint<>(arrayDictId, estimatedFootprint);
	}

	@Override
	public Object idToKey(int id)
	{
	ArrayList<Integer> dictionaryEncodedStringArray = arrayBiDictionary.inverse().get(id);
	final Object[] stringRepresentation = new Object[dictionaryEncodedStringArray.size()];
	for (int i = 0; i < dictionaryEncodedStringArray.size(); ++i) {
	stringRepresentation[i] = elementBiDictionary.inverse().get(dictionaryEncodedStringArray.get(i));
	}
	return stringRepresentation;
	}

	@Override
	public boolean canCompareIds()
	{
	return false;
	}

	@Override
	public void reset()
	{
	arrayBiDictionary.clear();
	elementBiDictionary.clear();
	}
	}
	}