// blob: 0645b2b533c8e17336dfcfdc59b887473963b42b
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.parquet.column;
import static org.apache.parquet.bytes.BytesUtils.getWidthFromMaxInt;
import java.util.HashMap;
import java.util.Map;
import java.util.Objects;
import java.util.OptionalDouble;
import java.util.OptionalLong;
import org.apache.parquet.Preconditions;
import org.apache.parquet.bytes.ByteBufferAllocator;
import org.apache.parquet.bytes.CapacityByteArrayOutputStream;
import org.apache.parquet.bytes.HeapByteBufferAllocator;
import org.apache.parquet.column.impl.ColumnWriteStoreV1;
import org.apache.parquet.column.impl.ColumnWriteStoreV2;
import org.apache.parquet.column.page.PageWriteStore;
import org.apache.parquet.column.values.ValuesWriter;
import org.apache.parquet.column.values.bitpacking.DevNullValuesWriter;
import org.apache.parquet.column.values.bloomfilter.BloomFilterWriteStore;
import org.apache.parquet.column.values.factory.DefaultValuesWriterFactory;
import org.apache.parquet.column.values.factory.ValuesWriterFactory;
import org.apache.parquet.column.values.rle.RunLengthBitPackingHybridEncoder;
import org.apache.parquet.column.values.rle.RunLengthBitPackingHybridValuesWriter;
import org.apache.parquet.schema.MessageType;
/**
* This class represents all the configurable Parquet properties.
*/
public class ParquetProperties {
// Defaults applied by a new Builder unless overridden through its with*/estimate* methods.
// Page size default; Builder#withPageSize documents the unit as bytes.
public static final int DEFAULT_PAGE_SIZE = 1024 * 1024;
// The dictionary page size default mirrors the data page size default.
public static final int DEFAULT_DICTIONARY_PAGE_SIZE = DEFAULT_PAGE_SIZE;
public static final boolean DEFAULT_IS_DICTIONARY_ENABLED = true;
// While false, a new Builder starts with the byte-stream-split mode NONE.
public static final boolean DEFAULT_IS_BYTE_STREAM_SPLIT_ENABLED = false;
public static final WriterVersion DEFAULT_WRITER_VERSION = WriterVersion.PARQUET_1_0;
public static final boolean DEFAULT_ESTIMATE_ROW_COUNT_FOR_PAGE_SIZE_CHECK = true;
// Default minimum and maximum row counts for the page size check.
public static final int DEFAULT_MINIMUM_RECORD_COUNT_FOR_CHECK = 100;
public static final int DEFAULT_MAXIMUM_RECORD_COUNT_FOR_CHECK = 10000;
public static final int DEFAULT_PAGE_VALUE_COUNT_THRESHOLD = Integer.MAX_VALUE / 2;
// Default truncate lengths for column index and statistics min/max values.
public static final int DEFAULT_COLUMN_INDEX_TRUNCATE_LENGTH = 64;
public static final int DEFAULT_STATISTICS_TRUNCATE_LENGTH = Integer.MAX_VALUE;
public static final int DEFAULT_PAGE_ROW_COUNT_LIMIT = 20_000;
// Bloom filter defaults.
public static final int DEFAULT_MAX_BLOOM_FILTER_BYTES = 1024 * 1024;
public static final boolean DEFAULT_BLOOM_FILTER_ENABLED = false;
public static final double DEFAULT_BLOOM_FILTER_FPP = 0.01;
public static final boolean DEFAULT_ADAPTIVE_BLOOM_FILTER_ENABLED = false;
public static final int DEFAULT_BLOOM_FILTER_CANDIDATES_NUMBER = 5;
public static final boolean DEFAULT_PAGE_WRITE_CHECKSUM_ENABLED = true;
// Single shared factory instance used by default. Builder#build() calls initialize(...) on
// whichever factory is configured, so this instance is re-initialized on every default build.
public static final ValuesWriterFactory DEFAULT_VALUES_WRITER_FACTORY = new DefaultValuesWriterFactory();
// Slab size handed to the level writers/encoders and to the initial slab size heuristic.
private static final int MIN_SLAB_SIZE = 64;
// Internal BYTE_STREAM_SPLIT setting; interpreted by isByteStreamSplitEnabled(ColumnDescriptor).
private enum ByteStreamSplitMode {
// Encoding is not used for any column type.
NONE,
// Encoding is used for FLOAT and DOUBLE columns only.
FLOATING_POINT,
// Encoding is used for FLOAT, DOUBLE, INT32, INT64 and FIXED_LEN_BYTE_ARRAY columns.
EXTENDED
}
/**
 * Parquet writer versions, each paired with a short alias ("v1", "v2").
 */
public enum WriterVersion {
  PARQUET_1_0("v1"),
  PARQUET_2_0("v2");

  private final String shortName;

  WriterVersion(String alias) {
    shortName = alias;
  }

  /**
   * Resolves a writer version from its short alias or from its exact enum constant name.
   *
   * @param name the short alias (e.g. "v1") or the constant name (e.g. "PARQUET_1_0")
   * @return the matching writer version
   * @throws IllegalArgumentException if {@code name} is neither an alias nor a constant name
   */
  public static WriterVersion fromString(String name) {
    final WriterVersion[] versions = values();
    for (int i = 0; i < versions.length; i++) {
      if (versions[i].shortName.equals(name)) {
        return versions[i];
      }
    }
    // No alias matched: fall back to the exact constant name, which throws on a mismatch.
    return valueOf(name);
  }
}
// Derived in the constructor from MIN_SLAB_SIZE and the page size threshold.
private final int initialSlabSize;
// Page and dictionary page thresholds copied from the builder.
private final int pageSizeThreshold;
private final int pageValueCountThreshold;
private final int dictionaryPageSizeThreshold;
private final WriterVersion writerVersion;
// Dictionary encoding switch, resolvable per column.
private final ColumnProperty<Boolean> dictionaryEnabled;
// Settings for the page size check.
private final int minRowCountForPageSizeCheck;
private final int maxRowCountForPageSizeCheck;
private final boolean estimateNextSizeCheck;
private final ByteBufferAllocator allocator;
private final ValuesWriterFactory valuesWriterFactory;
// Truncate lengths for column index and statistics min/max values.
private final int columnIndexTruncateLength;
private final int statisticsTruncateLength;
// The expected NDV (number of distinct values) for each column; may resolve to null.
private final ColumnProperty<Long> bloomFilterNDVs;
// The bloom filter false positive probability for each column.
private final ColumnProperty<Double> bloomFilterFPPs;
private final int maxBloomFilterBytes;
private final ColumnProperty<Boolean> bloomFilterEnabled;
private final ColumnProperty<Boolean> adaptiveBloomFilterEnabled;
private final ColumnProperty<Integer> numBloomFilterCandidates;
private final int pageRowCountLimit;
private final boolean pageWriteChecksumEnabled;
// Holds a ByteStreamSplitMode per column despite the "Enabled" name.
private final ColumnProperty<ByteStreamSplitMode> byteStreamSplitEnabled;
// Stored by reference: this is the same map instance the builder held.
private final Map<String, String> extraMetaData;
/**
 * Copies every setting out of the given builder; per-column settings are obtained by building
 * the builder's {@link ColumnProperty} builders.
 *
 * @param builder the builder holding the configured values
 */
private ParquetProperties(Builder builder) {
  // Page and dictionary sizing.
  this.pageSizeThreshold = builder.pageSize;
  this.pageValueCountThreshold = builder.pageValueCountThreshold;
  this.pageRowCountLimit = builder.pageRowCountLimit;
  this.dictionaryPageSizeThreshold = builder.dictPageSize;
  // The initial slab size is derived from the page size that was just stored.
  this.initialSlabSize = CapacityByteArrayOutputStream.initialSlabSizeHeuristic(
      MIN_SLAB_SIZE, this.pageSizeThreshold, 10);

  // Writer behaviour.
  this.writerVersion = builder.writerVersion;
  this.allocator = builder.allocator;
  this.valuesWriterFactory = builder.valuesWriterFactory;
  this.pageWriteChecksumEnabled = builder.pageWriteChecksumEnabled;
  this.extraMetaData = builder.extraMetaData;

  // Page size check tuning.
  this.minRowCountForPageSizeCheck = builder.minRowCountForPageSizeCheck;
  this.maxRowCountForPageSizeCheck = builder.maxRowCountForPageSizeCheck;
  this.estimateNextSizeCheck = builder.estimateNextSizeCheck;

  // Min/max truncation.
  this.columnIndexTruncateLength = builder.columnIndexTruncateLength;
  this.statisticsTruncateLength = builder.statisticsTruncateLength;

  // Encoding switches.
  this.dictionaryEnabled = builder.enableDict.build();
  this.byteStreamSplitEnabled = builder.byteStreamSplitEnabled.build();

  // Bloom filter settings.
  this.bloomFilterNDVs = builder.bloomFilterNDVs.build();
  this.bloomFilterFPPs = builder.bloomFilterFPPs.build();
  this.bloomFilterEnabled = builder.bloomFilterEnabled.build();
  this.maxBloomFilterBytes = builder.maxBloomFilterBytes;
  this.adaptiveBloomFilterEnabled = builder.adaptiveBloomFilterEnabled.build();
  this.numBloomFilterCandidates = builder.numBloomFilterCandidates.build();
}
/**
 * Creates a builder initialized with the default settings.
 *
 * @return a new builder
 */
public static Builder builder() {
  final Builder fresh = new Builder();
  return fresh;
}
/**
 * Creates a builder pre-populated from an existing properties object.
 *
 * @param toCopy the properties to start from
 * @return a new builder seeded from {@code toCopy}
 */
public static Builder copy(ParquetProperties toCopy) {
  final Builder seeded = new Builder(toCopy);
  return seeded;
}
/**
 * Creates a values writer for the repetition levels of the given column.
 *
 * @param path the column whose maximum repetition level determines the writer
 * @return a writer for repetition levels
 */
public ValuesWriter newRepetitionLevelWriter(ColumnDescriptor path) {
  final int maxRepetitionLevel = path.getMaxRepetitionLevel();
  return newColumnDescriptorValuesWriter(maxRepetitionLevel);
}
/**
 * Creates a values writer for the definition levels of the given column.
 *
 * @param path the column whose maximum definition level determines the writer
 * @return a writer for definition levels
 */
public ValuesWriter newDefinitionLevelWriter(ColumnDescriptor path) {
  final int maxDefinitionLevel = path.getMaxDefinitionLevel();
  return newColumnDescriptorValuesWriter(maxDefinitionLevel);
}
/**
 * Picks the level writer for a maximum level: a {@link DevNullValuesWriter} when the maximum
 * level is 0, otherwise a {@link RunLengthBitPackingHybridValuesWriter} whose bit width is
 * computed from the maximum level.
 *
 * @param maxLevel the maximum repetition or definition level
 * @return the level writer
 */
private ValuesWriter newColumnDescriptorValuesWriter(int maxLevel) {
  if (maxLevel != 0) {
    final int bitWidth = getWidthFromMaxInt(maxLevel);
    return new RunLengthBitPackingHybridValuesWriter(bitWidth, MIN_SLAB_SIZE, pageSizeThreshold, allocator);
  }
  return new DevNullValuesWriter();
}
/**
 * Creates a run-length/bit-packing hybrid encoder for the repetition levels of the given column.
 *
 * @param path the column whose maximum repetition level determines the bit width
 * @return a new encoder
 */
public RunLengthBitPackingHybridEncoder newRepetitionLevelEncoder(ColumnDescriptor path) {
  final int maxRepetitionLevel = path.getMaxRepetitionLevel();
  return newLevelEncoder(maxRepetitionLevel);
}
/**
 * Creates a run-length/bit-packing hybrid encoder for the definition levels of the given column.
 *
 * @param path the column whose maximum definition level determines the bit width
 * @return a new encoder
 */
public RunLengthBitPackingHybridEncoder newDefinitionLevelEncoder(ColumnDescriptor path) {
  final int maxDefinitionLevel = path.getMaxDefinitionLevel();
  return newLevelEncoder(maxDefinitionLevel);
}
/**
 * Builds a level encoder whose bit width is computed from the maximum level, using
 * {@code MIN_SLAB_SIZE}, the page size threshold and the configured allocator.
 *
 * @param maxLevel the maximum repetition or definition level
 * @return a new encoder
 */
private RunLengthBitPackingHybridEncoder newLevelEncoder(int maxLevel) {
  final int bitWidth = getWidthFromMaxInt(maxLevel);
  return new RunLengthBitPackingHybridEncoder(bitWidth, MIN_SLAB_SIZE, pageSizeThreshold, allocator);
}
/**
 * Delegates to the configured {@link ValuesWriterFactory} to create the values writer of a column.
 *
 * @param path the column to create a writer for
 * @return the writer produced by the factory
 */
public ValuesWriter newValuesWriter(ColumnDescriptor path) {
  return this.valuesWriterFactory.newValuesWriter(path);
}
/**
 * @return the page size threshold set through {@link Builder#withPageSize(int)}
 */
public int getPageSizeThreshold() {
  return this.pageSizeThreshold;
}
/**
 * @return the page value count threshold set through {@link Builder#withPageValueCountThreshold(int)}
 */
public int getPageValueCountThreshold() {
  return this.pageValueCountThreshold;
}
/**
 * @return the initial slab size computed at construction time from the page size threshold
 */
public int getInitialSlabSize() {
  return this.initialSlabSize;
}
/**
 * @return the dictionary page size set through {@link Builder#withDictionaryPageSize(int)}
 */
public int getDictionaryPageSizeThreshold() {
  return this.dictionaryPageSizeThreshold;
}
/**
 * @return the configured writer version
 */
public WriterVersion getWriterVersion() {
  return this.writerVersion;
}
/**
 * Returns the default dictionary encoding setting, ignoring any per-column value.
 *
 * @return the default value of the dictionary encoding property
 * @deprecated prefer {@link #isDictionaryEnabled(ColumnDescriptor)}, which resolves the value per column
 */
@Deprecated
public boolean isEnableDictionary() {
  return this.dictionaryEnabled.getDefaultValue();
}
/**
 * Resolves the dictionary encoding setting for the given column.
 *
 * @param column the column to look up
 * @return whether dictionary encoding is enabled for {@code column}
 */
public boolean isDictionaryEnabled(ColumnDescriptor column) {
  return this.dictionaryEnabled.getValue(column);
}
/**
 * Reports whether the default BYTE_STREAM_SPLIT mode is anything other than {@code NONE},
 * ignoring per-column values and the column type.
 *
 * @return {@code true} if the default mode is not {@code NONE}
 * @deprecated prefer {@link #isByteStreamSplitEnabled(ColumnDescriptor)}
 */
@Deprecated()
public boolean isByteStreamSplitEnabled() {
  final ByteStreamSplitMode defaultMode = byteStreamSplitEnabled.getDefaultValue();
  return defaultMode != ByteStreamSplitMode.NONE;
}
/**
 * Decides whether BYTE_STREAM_SPLIT encoding applies to the given column, based on its
 * primitive type and the mode resolved for it.
 * <ul>
 * <li>FLOAT, DOUBLE: enabled unless the mode is {@code NONE}</li>
 * <li>INT32, INT64, FIXED_LEN_BYTE_ARRAY: enabled only when the mode is {@code EXTENDED}</li>
 * <li>any other type: never enabled</li>
 * </ul>
 *
 * @param column the column to look up
 * @return whether BYTE_STREAM_SPLIT encoding is enabled for {@code column}
 */
public boolean isByteStreamSplitEnabled(ColumnDescriptor column) {
  final ByteStreamSplitMode resolvedMode;
  switch (column.getPrimitiveType().getPrimitiveTypeName()) {
    case FLOAT:
    case DOUBLE:
      resolvedMode = byteStreamSplitEnabled.getValue(column);
      return resolvedMode != ByteStreamSplitMode.NONE;
    case INT32:
    case INT64:
    case FIXED_LEN_BYTE_ARRAY:
      resolvedMode = byteStreamSplitEnabled.getValue(column);
      return resolvedMode == ByteStreamSplitMode.EXTENDED;
    default:
      // The mode is deliberately not looked up for unsupported types.
      return false;
  }
}
/**
 * @return the configured byte buffer allocator
 */
public ByteBufferAllocator getAllocator() {
  return this.allocator;
}
/**
 * Creates the column write store implementation matching the configured writer version.
 *
 * @param schema the schema of the data to write
 * @param pageStore the page write store to write pages to
 * @return a {@link ColumnWriteStoreV1} or {@link ColumnWriteStoreV2}
 * @throws IllegalArgumentException if the writer version is not one of the known constants
 */
public ColumnWriteStore newColumnWriteStore(MessageType schema, PageWriteStore pageStore) {
  switch (writerVersion) {
    case PARQUET_1_0:
      return new ColumnWriteStoreV1(schema, pageStore, this);
    case PARQUET_2_0:
      return new ColumnWriteStoreV2(schema, pageStore, this);
    default:
      break;
  }
  throw new IllegalArgumentException("unknown version " + writerVersion);
}
/**
 * Creates the column write store implementation matching the configured writer version,
 * passing along a bloom filter write store.
 *
 * @param schema the schema of the data to write
 * @param pageStore the page write store to write pages to
 * @param bloomFilterWriteStore the store handed to the column write store for bloom filters
 * @return a {@link ColumnWriteStoreV1} or {@link ColumnWriteStoreV2}
 * @throws IllegalArgumentException if the writer version is not one of the known constants
 */
public ColumnWriteStore newColumnWriteStore(
    MessageType schema, PageWriteStore pageStore, BloomFilterWriteStore bloomFilterWriteStore) {
  switch (writerVersion) {
    case PARQUET_1_0:
      return new ColumnWriteStoreV1(schema, pageStore, bloomFilterWriteStore, this);
    case PARQUET_2_0:
      return new ColumnWriteStoreV2(schema, pageStore, bloomFilterWriteStore, this);
    default:
      break;
  }
  throw new IllegalArgumentException("unknown version " + writerVersion);
}
/**
 * @return the minimum row count for the page size check
 */
public int getMinRowCountForPageSizeCheck() {
  return this.minRowCountForPageSizeCheck;
}
/**
 * @return the maximum row count for the page size check
 */
public int getMaxRowCountForPageSizeCheck() {
  return this.maxRowCountForPageSizeCheck;
}
/**
 * @return the factory used by {@link #newValuesWriter(ColumnDescriptor)}
 */
public ValuesWriterFactory getValuesWriterFactory() {
  return this.valuesWriterFactory;
}
/**
 * @return the truncate length for column indexes
 */
public int getColumnIndexTruncateLength() {
  return this.columnIndexTruncateLength;
}
/**
 * @return the truncate length for statistics min/max values
 */
public int getStatisticsTruncateLength() {
  return this.statisticsTruncateLength;
}
/**
 * @return the flag set through {@link Builder#estimateRowCountForPageSizeCheck(boolean)};
 *     {@link #toString()} reports it as "estimated" when true and "constant" when false
 */
public boolean estimateNextSizeCheck() {
  return this.estimateNextSizeCheck;
}
/**
 * @return the row count limit for pages
 */
public int getPageRowCountLimit() {
  return this.pageRowCountLimit;
}
/**
 * @return whether writing page checksums is switched on
 */
public boolean getPageWriteChecksumEnabled() {
  return this.pageWriteChecksumEnabled;
}
/**
 * Looks up the expected number of distinct values (NDV) for the given column.
 *
 * @param column the column to look up
 * @return the NDV, or an empty optional when the lookup yields {@code null}
 */
public OptionalLong getBloomFilterNDV(ColumnDescriptor column) {
  final Long expectedNdv = bloomFilterNDVs.getValue(column);
  if (expectedNdv != null) {
    return OptionalLong.of(expectedNdv);
  }
  return OptionalLong.empty();
}
/**
 * Looks up the bloom filter false positive probability (FPP) for the given column.
 *
 * @param column the column to look up
 * @return the FPP, or an empty optional when the lookup yields {@code null}
 */
public OptionalDouble getBloomFilterFPP(ColumnDescriptor column) {
  final Double probability = bloomFilterFPPs.getValue(column);
  if (probability != null) {
    return OptionalDouble.of(probability);
  }
  return OptionalDouble.empty();
}
/**
 * Resolves whether a bloom filter is enabled for the given column.
 *
 * @param column the column to look up
 * @return whether the bloom filter is enabled for {@code column}
 */
public boolean isBloomFilterEnabled(ColumnDescriptor column) {
  return this.bloomFilterEnabled.getValue(column);
}
/**
 * @return the maximum bloom filter size for a column, as set through
 *     {@link Builder#withMaxBloomFilterBytes(int)}
 */
public int getMaxBloomFilterBytes() {
  return this.maxBloomFilterBytes;
}
/**
 * Resolves whether the adaptive bloom filter is enabled for the given column.
 *
 * @param column the column to look up
 * @return whether the adaptive bloom filter is enabled for {@code column}
 */
public boolean getAdaptiveBloomFilterEnabled(ColumnDescriptor column) {
  return this.adaptiveBloomFilterEnabled.getValue(column);
}
/**
 * Resolves the number of bloom filter candidates for the given column.
 *
 * @param column the column to look up
 * @return the number of bloom filter candidates for {@code column}
 */
public int getBloomFilterCandidatesCount(ColumnDescriptor column) {
  return this.numBloomFilterCandidates.getValue(column);
}
/**
 * @return the extra metadata map exactly as stored (no copy is made)
 */
public Map<String, String> getExtraMetaData() {
  return this.extraMetaData;
}
/**
 * Renders a multi-line, human-readable summary of a subset of the settings, one per line and
 * without a trailing newline.
 */
@Override
public String toString() {
  return String.join(
      "\n",
      "Parquet page size to " + getPageSizeThreshold(),
      "Parquet dictionary page size to " + getDictionaryPageSizeThreshold(),
      "Dictionary is " + dictionaryEnabled,
      "Writer version is: " + getWriterVersion(),
      "Page size checking is: " + (estimateNextSizeCheck() ? "estimated" : "constant"),
      "Min row count for page size check is: " + getMinRowCountForPageSizeCheck(),
      "Max row count for page size check is: " + getMaxRowCountForPageSizeCheck(),
      "Truncate length for column indexes is: " + getColumnIndexTruncateLength(),
      "Truncate length for statistics min/max is: " + getStatisticsTruncateLength(),
      "Bloom filter enabled: " + bloomFilterEnabled,
      "Max Bloom filter size for a column is " + getMaxBloomFilterBytes(),
      "Bloom filter expected number of distinct values are: " + bloomFilterNDVs,
      "Bloom filter false positive probabilities are: " + bloomFilterFPPs,
      "Page row count limit to " + getPageRowCountLimit(),
      "Writing page checksums is: " + (getPageWriteChecksumEnabled() ? "on" : "off"));
}
public static class Builder {
// Scalar settings start at their DEFAULT_* values; the final ColumnProperty builders are
// created in the constructors.
private int pageSize = DEFAULT_PAGE_SIZE;
private int dictPageSize = DEFAULT_DICTIONARY_PAGE_SIZE;
// Dictionary encoding switch (default value plus per-column values).
private final ColumnProperty.Builder<Boolean> enableDict;
private WriterVersion writerVersion = DEFAULT_WRITER_VERSION;
private int minRowCountForPageSizeCheck = DEFAULT_MINIMUM_RECORD_COUNT_FOR_CHECK;
private int maxRowCountForPageSizeCheck = DEFAULT_MAXIMUM_RECORD_COUNT_FOR_CHECK;
private int pageValueCountThreshold = DEFAULT_PAGE_VALUE_COUNT_THRESHOLD;
private boolean estimateNextSizeCheck = DEFAULT_ESTIMATE_ROW_COUNT_FOR_PAGE_SIZE_CHECK;
private ByteBufferAllocator allocator = new HeapByteBufferAllocator();
private ValuesWriterFactory valuesWriterFactory = DEFAULT_VALUES_WRITER_FACTORY;
private int columnIndexTruncateLength = DEFAULT_COLUMN_INDEX_TRUNCATE_LENGTH;
private int statisticsTruncateLength = DEFAULT_STATISTICS_TRUNCATE_LENGTH;
// Bloom filter settings.
private final ColumnProperty.Builder<Long> bloomFilterNDVs;
private final ColumnProperty.Builder<Double> bloomFilterFPPs;
private int maxBloomFilterBytes = DEFAULT_MAX_BLOOM_FILTER_BYTES;
private final ColumnProperty.Builder<Boolean> adaptiveBloomFilterEnabled;
private final ColumnProperty.Builder<Integer> numBloomFilterCandidates;
private final ColumnProperty.Builder<Boolean> bloomFilterEnabled;
private int pageRowCountLimit = DEFAULT_PAGE_ROW_COUNT_LIMIT;
private boolean pageWriteChecksumEnabled = DEFAULT_PAGE_WRITE_CHECKSUM_ENABLED;
// Holds a ByteStreamSplitMode per column despite the "Enabled" name.
private final ColumnProperty.Builder<ByteStreamSplitMode> byteStreamSplitEnabled;
// Replaced wholesale by withExtraMetaData(...); starts as an empty map.
private Map<String, String> extraMetaData = new HashMap<>();
private Builder() {
enableDict = ColumnProperty.<Boolean>builder().withDefaultValue(DEFAULT_IS_DICTIONARY_ENABLED);
byteStreamSplitEnabled = ColumnProperty.<ByteStreamSplitMode>builder()
.withDefaultValue(
DEFAULT_IS_BYTE_STREAM_SPLIT_ENABLED
? ByteStreamSplitMode.FLOATING_POINT
: ByteStreamSplitMode.NONE);
bloomFilterEnabled = ColumnProperty.<Boolean>builder().withDefaultValue(DEFAULT_BLOOM_FILTER_ENABLED);
bloomFilterNDVs = ColumnProperty.<Long>builder().withDefaultValue(null);
bloomFilterFPPs = ColumnProperty.<Double>builder().withDefaultValue(DEFAULT_BLOOM_FILTER_FPP);
adaptiveBloomFilterEnabled =
ColumnProperty.<Boolean>builder().withDefaultValue(DEFAULT_ADAPTIVE_BLOOM_FILTER_ENABLED);
numBloomFilterCandidates =
ColumnProperty.<Integer>builder().withDefaultValue(DEFAULT_BLOOM_FILTER_CANDIDATES_NUMBER);
}
/**
 * Creates a builder pre-populated with every setting of an existing {@link ParquetProperties},
 * so that {@code copy(p).build()} yields properties equivalent to {@code p}.
 *
 * <p>The page value count threshold and the column index / statistics truncate lengths are
 * copied as well; previously they were skipped and silently fell back to their defaults.
 *
 * @param toCopy the properties to copy the settings from
 */
private Builder(ParquetProperties toCopy) {
  this.pageSize = toCopy.pageSizeThreshold;
  this.pageValueCountThreshold = toCopy.pageValueCountThreshold;
  this.enableDict = ColumnProperty.builder(toCopy.dictionaryEnabled);
  this.dictPageSize = toCopy.dictionaryPageSizeThreshold;
  this.writerVersion = toCopy.writerVersion;
  this.minRowCountForPageSizeCheck = toCopy.minRowCountForPageSizeCheck;
  this.maxRowCountForPageSizeCheck = toCopy.maxRowCountForPageSizeCheck;
  this.estimateNextSizeCheck = toCopy.estimateNextSizeCheck;
  this.valuesWriterFactory = toCopy.valuesWriterFactory;
  this.allocator = toCopy.allocator;
  this.columnIndexTruncateLength = toCopy.columnIndexTruncateLength;
  this.statisticsTruncateLength = toCopy.statisticsTruncateLength;
  this.pageRowCountLimit = toCopy.pageRowCountLimit;
  this.pageWriteChecksumEnabled = toCopy.pageWriteChecksumEnabled;
  this.bloomFilterNDVs = ColumnProperty.builder(toCopy.bloomFilterNDVs);
  this.bloomFilterFPPs = ColumnProperty.builder(toCopy.bloomFilterFPPs);
  this.bloomFilterEnabled = ColumnProperty.builder(toCopy.bloomFilterEnabled);
  this.adaptiveBloomFilterEnabled = ColumnProperty.builder(toCopy.adaptiveBloomFilterEnabled);
  this.numBloomFilterCandidates = ColumnProperty.builder(toCopy.numBloomFilterCandidates);
  this.maxBloomFilterBytes = toCopy.maxBloomFilterBytes;
  this.byteStreamSplitEnabled = ColumnProperty.builder(toCopy.byteStreamSplitEnabled);
  // The map is shared with the source properties (unchanged behaviour); this builder only
  // ever replaces the reference, it does not mutate the map.
  this.extraMetaData = toCopy.extraMetaData;
}
/**
 * Sets the Parquet format page size.
 *
 * @param pageSize the page size in bytes; must be greater than zero
 * @return this builder for method chaining.
 * @throws IllegalArgumentException if {@code pageSize} is not positive
 */
public Builder withPageSize(int pageSize) {
  final boolean positive = pageSize > 0;
  Preconditions.checkArgument(positive, "Invalid page size (negative): %s", pageSize);
  this.pageSize = pageSize;
  return this;
}
/**
 * Sets the default dictionary encoding setting.
 *
 * @param enableDictionary whether dictionary encoding should be enabled by default
 * @return this builder for method chaining.
 */
public Builder withDictionaryEncoding(boolean enableDictionary) {
  enableDict.withDefaultValue(enableDictionary);
  return this;
}
/**
 * Sets the dictionary encoding setting of a single column.
 *
 * @param columnPath the path of the column (dot-string)
 * @param enableDictionary whether dictionary encoding should be enabled for that column
 * @return this builder for method chaining.
 */
public Builder withDictionaryEncoding(String columnPath, boolean enableDictionary) {
  enableDict.withValue(columnPath, enableDictionary);
  return this;
}
/**
 * Sets the default BYTE_STREAM_SPLIT mode for FLOAT and DOUBLE columns: {@code FLOATING_POINT}
 * when enabled, {@code NONE} otherwise.
 *
 * @param enable whether BYTE_STREAM_SPLIT encoding should be enabled
 * @return this builder for method chaining.
 */
public Builder withByteStreamSplitEncoding(boolean enable) {
  final ByteStreamSplitMode defaultMode;
  if (enable) {
    defaultMode = ByteStreamSplitMode.FLOATING_POINT;
  } else {
    defaultMode = ByteStreamSplitMode.NONE;
  }
  byteStreamSplitEnabled.withDefaultValue(defaultMode);
  return this;
}
/**
 * Sets the BYTE_STREAM_SPLIT mode of a single column: {@code EXTENDED} when enabled,
 * {@code NONE} otherwise.
 *
 * @param columnPath the path of the column (dot-string)
 * @param enable whether BYTE_STREAM_SPLIT encoding should be enabled
 * @return this builder for method chaining.
 */
public Builder withByteStreamSplitEncoding(String columnPath, boolean enable) {
  final ByteStreamSplitMode columnMode;
  if (enable) {
    columnMode = ByteStreamSplitMode.EXTENDED;
  } else {
    columnMode = ByteStreamSplitMode.NONE;
  }
  byteStreamSplitEnabled.withValue(columnPath, columnMode);
  return this;
}
/**
 * Sets the default BYTE_STREAM_SPLIT mode for FLOAT, DOUBLE, INT32, INT64 and
 * FIXED_LEN_BYTE_ARRAY columns: {@code EXTENDED} when enabled, {@code NONE} otherwise.
 *
 * @param enable whether BYTE_STREAM_SPLIT encoding should be enabled
 * @return this builder for method chaining.
 */
public Builder withExtendedByteStreamSplitEncoding(boolean enable) {
  final ByteStreamSplitMode defaultMode;
  if (enable) {
    defaultMode = ByteStreamSplitMode.EXTENDED;
  } else {
    defaultMode = ByteStreamSplitMode.NONE;
  }
  byteStreamSplitEnabled.withDefaultValue(defaultMode);
  return this;
}
/**
 * Sets the Parquet format dictionary page size.
 *
 * @param dictionaryPageSize the dictionary page size in bytes; must be greater than zero
 * @return this builder for method chaining.
 * @throws IllegalArgumentException if {@code dictionaryPageSize} is not positive
 */
public Builder withDictionaryPageSize(int dictionaryPageSize) {
  final boolean positive = dictionaryPageSize > 0;
  Preconditions.checkArgument(positive, "Invalid dictionary page size (negative): %s", dictionaryPageSize);
  dictPageSize = dictionaryPageSize;
  return this;
}
/**
 * Sets the {@link WriterVersion format version}. The value is stored without validation.
 *
 * @param version a {@code WriterVersion}
 * @return this builder for method chaining.
 */
public Builder withWriterVersion(WriterVersion version) {
  writerVersion = version;
  return this;
}
/**
 * Sets the minimum row count for the page size check.
 *
 * @param min the minimum row count; must be greater than zero
 * @return this builder for method chaining.
 * @throws IllegalArgumentException if {@code min} is not positive
 */
public Builder withMinRowCountForPageSizeCheck(int min) {
  final boolean positive = min > 0;
  Preconditions.checkArgument(positive, "Invalid row count for page size check (negative): %s", min);
  minRowCountForPageSizeCheck = min;
  return this;
}
/**
 * Sets the maximum row count for the page size check.
 *
 * @param max the maximum row count; must be greater than zero
 * @return this builder for method chaining.
 * @throws IllegalArgumentException if {@code max} is not positive
 */
public Builder withMaxRowCountForPageSizeCheck(int max) {
  final boolean positive = max > 0;
  Preconditions.checkArgument(positive, "Invalid row count for page size check (negative): %s", max);
  maxRowCountForPageSizeCheck = max;
  return this;
}
/**
 * Sets the page value count threshold.
 *
 * @param value the threshold; must be greater than zero
 * @return this builder for method chaining.
 * @throws IllegalArgumentException if {@code value} is not positive
 */
public Builder withPageValueCountThreshold(int value) {
  final boolean positive = value > 0;
  Preconditions.checkArgument(positive, "Invalid page value count threshold (negative): %s", value);
  pageValueCountThreshold = value;
  return this;
}
/**
 * Chooses whether the next page size check is estimated. Passing {@code false} means no
 * attempt is made to predict the next size check, which prevents issues with rows that vary
 * significantly in size.
 *
 * @param estimateNextSizeCheck whether to estimate the next size check
 * @return this builder for method chaining.
 */
public Builder estimateRowCountForPageSizeCheck(boolean estimateNextSizeCheck) {
  final boolean estimate = estimateNextSizeCheck;
  this.estimateNextSizeCheck = estimate;
  return this;
}
/**
 * Sets the byte buffer allocator.
 *
 * @param allocator the allocator to use; must not be null
 * @return this builder for method chaining.
 * @throws NullPointerException if {@code allocator} is null
 */
public Builder withAllocator(ByteBufferAllocator allocator) {
  Objects.requireNonNull(allocator, "ByteBufferAllocator cannot be null");
  this.allocator = allocator;
  return this;
}
/**
 * Sets the factory that creates the values writers.
 *
 * @param factory the factory to use; must not be null
 * @return this builder for method chaining.
 * @throws NullPointerException if {@code factory} is null
 */
public Builder withValuesWriterFactory(ValuesWriterFactory factory) {
  Objects.requireNonNull(factory, "ValuesWriterFactory cannot be null");
  valuesWriterFactory = factory;
  return this;
}
/**
 * Sets the truncate length for column index min/max values.
 *
 * @param length the truncate length; must be greater than zero
 * @return this builder for method chaining.
 * @throws IllegalArgumentException if {@code length} is not positive
 */
public Builder withColumnIndexTruncateLength(int length) {
  final boolean positive = length > 0;
  Preconditions.checkArgument(
      positive, "Invalid column index min/max truncate length (negative or zero) : %s", length);
  columnIndexTruncateLength = length;
  return this;
}
/**
 * Sets the truncate length for statistics min/max values.
 *
 * @param length the truncate length; must be greater than zero
 * @return this builder for method chaining.
 * @throws IllegalArgumentException if {@code length} is not positive
 */
public Builder withStatisticsTruncateLength(int length) {
  final boolean positive = length > 0;
  Preconditions.checkArgument(
      positive, "Invalid statistics min/max truncate length (negative or zero) : %s", length);
  statisticsTruncateLength = length;
  return this;
}
/**
 * Sets the maximum Bloom filter size in bytes for related columns. The value is stored
 * without validation.
 *
 * @param maxBloomFilterBytes the max bytes of a Bloom filter bitset for a column.
 * @return this builder for method chaining
 */
public Builder withMaxBloomFilterBytes(int maxBloomFilterBytes) {
  final int limit = maxBloomFilterBytes;
  this.maxBloomFilterBytes = limit;
  return this;
}
/**
 * Sets the Bloom filter NDV (number of distinct values) of the specified column.
 * Doing so also enables writing the bloom filter for that column (see
 * {@link #withBloomFilterEnabled(String, boolean)}).
 *
 * @param columnPath the path of the column (dot-string)
 * @param ndv the NDV of the column; must be greater than zero
 * @return this builder for method chaining
 * @throws IllegalArgumentException if {@code ndv} is not positive
 */
public Builder withBloomFilterNDV(String columnPath, long ndv) {
  final boolean positive = ndv > 0;
  Preconditions.checkArgument(positive, "Invalid NDV for column \"%s\": %s", columnPath, ndv);
  bloomFilterNDVs.withValue(columnPath, ndv);
  // An explicit NDV implies that the column gets a bloom filter.
  bloomFilterEnabled.withValue(columnPath, true);
  return this;
}
/**
 * Sets the Bloom filter false positive probability (FPP) of the specified column.
 *
 * @param columnPath the path of the column (dot-string)
 * @param fpp the FPP; must be strictly between 0.0 and 1.0
 * @return this builder for method chaining
 * @throws IllegalArgumentException if {@code fpp} is outside the open interval (0.0, 1.0)
 */
public Builder withBloomFilterFPP(String columnPath, double fpp) {
  final boolean insideOpenUnitInterval = 0.0 < fpp && fpp < 1.0;
  Preconditions.checkArgument(insideOpenUnitInterval, "Invalid FPP for column \"%s\": %s", columnPath, fpp);
  bloomFilterFPPs.withValue(columnPath, fpp);
  return this;
}
/**
 * Sets the default bloom filter switch, which applies to the columns not specified by
 * {@link #withBloomFilterEnabled(String, boolean)}.
 *
 * @param enabled whether bloom filter shall be enabled for all columns
 * @return this builder for method chaining
 */
public Builder withBloomFilterEnabled(boolean enabled) {
  bloomFilterEnabled.withDefaultValue(enabled);
  return this;
}
/**
 * Sets the default for using the adaptive bloom filter, which automatically adjusts the bloom
 * filter size according to `parquet.bloom.filter.max.bytes`.
 * If NDV (number of distinct values) for a specified column is set, it will be ignored.
 *
 * @param enabled whether to use adaptive bloom filter
 * @return this builder for method chaining
 */
public Builder withAdaptiveBloomFilterEnabled(boolean enabled) {
  adaptiveBloomFilterEnabled.withDefaultValue(enabled);
  return this;
}
/**
 * When `AdaptiveBloomFilter` is enabled, set how many bloom filter candidates to use for the
 * specified column.
 *
 * <p>The value is stored for {@code columnPath} only. Previously it was written as the
 * default, so it applied to every column and a later call for another column overwrote it.
 *
 * @param columnPath the path of the column (dot-string)
 * @param number the number of candidates; must be greater than zero
 * @return this builder for method chaining
 * @throws IllegalArgumentException if {@code number} is not positive
 */
public Builder withBloomFilterCandidatesNumber(String columnPath, int number) {
  Preconditions.checkArgument(
      number > 0, "Invalid candidates number for column \"%s\": %d", columnPath, number);
  // Per-column value, consistent with withBloomFilterNDV / withBloomFilterFPP.
  this.numBloomFilterCandidates.withValue(columnPath, number);
  return this;
}
/**
 * Sets the bloom filter switch of the specified column.
 * Bloom filters can be disabled for all columns with {@link #withBloomFilterEnabled(boolean)}
 * and a {@code false} value, then enabled for selected columns one-by-one through this
 * method, or vice versa.
 *
 * @param columnPath the path of the column (dot-string)
 * @param enabled whether bloom filter shall be enabled
 * @return this builder for method chaining
 */
public Builder withBloomFilterEnabled(String columnPath, boolean enabled) {
  bloomFilterEnabled.withValue(columnPath, enabled);
  return this;
}
/**
 * Sets the row count limit for pages.
 *
 * @param rowCount the row count limit; must be greater than zero
 * @return this builder for method chaining.
 * @throws IllegalArgumentException if {@code rowCount} is not positive
 */
public Builder withPageRowCountLimit(int rowCount) {
  final boolean positive = rowCount > 0;
  Preconditions.checkArgument(positive, "Invalid row count limit for pages: %s", rowCount);
  this.pageRowCountLimit = rowCount;
  return this;
}
/**
 * Switches writing of page checksums on or off.
 *
 * @param val whether page checksums should be written
 * @return this builder for method chaining.
 */
public Builder withPageWriteChecksumEnabled(boolean val) {
  pageWriteChecksumEnabled = val;
  return this;
}
/**
 * Replaces the extra metadata map. The given map is stored by reference, without copying
 * or validation.
 *
 * @param extraMetaData the extra metadata entries
 * @return this builder for method chaining.
 */
public Builder withExtraMetaData(Map<String, String> extraMetaData) {
  final Map<String, String> replacement = extraMetaData;
  this.extraMetaData = replacement;
  return this;
}
/**
 * Builds the properties and then initializes the configured values writer factory with them.
 *
 * @return the newly created properties
 */
public ParquetProperties build() {
  final ParquetProperties built = new ParquetProperties(this);
  // The factory is handed to ParquetProperties before it has been initialized, because
  // ValuesWriters are currently created from within ParquetProperties. Once that is decoupled
  // there will be no need to pass the factory to the properties and then the properties back
  // to the factory.
  valuesWriterFactory.initialize(built);
  return built;
}
}
}