blob: 68ef6da2f514d3dc6bbe96e4422777cd6d2ce72e [file] [log] [blame]
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.pinot.segment.local.segment.index.readers;
import it.unimi.dsi.fastutil.objects.Object2IntOpenHashMap;
import java.math.BigDecimal;
import java.util.Arrays;
import org.apache.pinot.segment.spi.index.reader.Dictionary;
import org.apache.pinot.segment.spi.memory.PinotDataBuffer;
import org.apache.pinot.spi.data.FieldSpec.DataType;
import org.apache.pinot.spi.utils.BytesUtils;
/**
* Implementation of String dictionary that cache all values on-heap.
* <p>This is useful for String columns that:
* <ul>
* <li>Has low cardinality string dictionary where memory footprint on-heap is acceptably small</li>
* <li>Is heavily queried</li>
* </ul>
* <p>This helps avoid creation of String from byte[], which is expensive as well as creates garbage.
*/
public class OnHeapStringDictionary extends BaseImmutableDictionary {
private final byte _paddingByte;
private final String[] _unpaddedStrings;
private final Object2IntOpenHashMap<String> _unPaddedStringToIdMap;
private final String[] _paddedStrings;
public OnHeapStringDictionary(PinotDataBuffer dataBuffer, int length, int numBytesPerValue, byte paddingByte) {
super(dataBuffer, length, numBytesPerValue, paddingByte);
_paddingByte = paddingByte;
byte[] buffer = new byte[numBytesPerValue];
_unpaddedStrings = new String[length];
_unPaddedStringToIdMap = new Object2IntOpenHashMap<>(length);
_unPaddedStringToIdMap.defaultReturnValue(Dictionary.NULL_VALUE_INDEX);
for (int i = 0; i < length; i++) {
String unpaddedString = getUnpaddedString(i, buffer);
_unpaddedStrings[i] = unpaddedString;
_unPaddedStringToIdMap.put(unpaddedString, i);
}
if (paddingByte == 0) {
_paddedStrings = null;
} else {
_paddedStrings = new String[length];
for (int i = 0; i < length; i++) {
_paddedStrings[i] = getPaddedString(i, buffer);
}
}
}
@Override
public DataType getValueType() {
return DataType.STRING;
}
@Override
public int indexOf(String stringValue) {
return _unPaddedStringToIdMap.getInt(stringValue);
}
/**
* WARNING: With non-zero padding byte, binary search result might not reflect the real insertion index for the value.
* E.g. with padding byte 'b', if unpadded value "aa" is in the dictionary, and stored as "aab", then unpadded value
* "a" will be mis-positioned after value "aa"; unpadded value "aab" will return positive value even if value "aab" is
* not in the dictionary.
* TODO: Clean up the segments with legacy non-zero padding byte, and remove the support for non-zero padding byte
*/
@Override
public int insertionIndexOf(String stringValue) {
int index = _unPaddedStringToIdMap.getInt(stringValue);
if (index != Dictionary.NULL_VALUE_INDEX) {
return index;
} else {
return _paddingByte == 0 ? Arrays.binarySearch(_unpaddedStrings, stringValue)
: Arrays.binarySearch(_paddedStrings, padString(stringValue));
}
}
@Override
public String get(int dictId) {
return _unpaddedStrings[dictId];
}
@Override
public int getIntValue(int dictId) {
return Integer.parseInt(_unpaddedStrings[dictId]);
}
@Override
public long getLongValue(int dictId) {
return Long.parseLong(_unpaddedStrings[dictId]);
}
@Override
public float getFloatValue(int dictId) {
return Float.parseFloat(_unpaddedStrings[dictId]);
}
@Override
public double getDoubleValue(int dictId) {
return Double.parseDouble(_unpaddedStrings[dictId]);
}
@Override
public BigDecimal getBigDecimalValue(int dictId) {
return new BigDecimal(_unpaddedStrings[dictId]);
}
@Override
public String getStringValue(int dictId) {
return _unpaddedStrings[dictId];
}
@Override
public byte[] getBytesValue(int dictId) {
return BytesUtils.toBytes(_unpaddedStrings[dictId]);
}
}