| /** |
| * Licensed to the Apache Software Foundation (ASF) under one |
| * or more contributor license agreements. See the NOTICE file |
| * distributed with this work for additional information |
| * regarding copyright ownership. The ASF licenses this file |
| * to you under the Apache License, Version 2.0 (the |
| * "License"); you may not use this file except in compliance |
| * with the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| |
| package org.apache.orc; |
| |
| import org.apache.hadoop.conf.Configuration; |
| |
| import java.util.Properties; |
| |
| /** |
| * Define the configuration properties that ORC understands. |
| */ |
| public enum OrcConf { |
| STRIPE_SIZE("orc.stripe.size", "hive.exec.orc.default.stripe.size", |
| 64L * 1024 * 1024, |
| "Define the default ORC stripe size, in bytes."), |
| BLOCK_SIZE("orc.block.size", "hive.exec.orc.default.block.size", |
| 256L * 1024 * 1024, |
| "Define the default file system block size for ORC files."), |
| ENABLE_INDEXES("orc.create.index", "orc.create.index", true, |
| "Should the ORC writer create indexes as part of the file."), |
| ROW_INDEX_STRIDE("orc.row.index.stride", |
| "hive.exec.orc.default.row.index.stride", 10000, |
| "Define the default ORC index stride in number of rows. (Stride is the\n"+ |
| " number of rows n index entry represents.)"), |
| BUFFER_SIZE("orc.compress.size", "hive.exec.orc.default.buffer.size", |
| 256 * 1024, "Define the default ORC buffer size, in bytes."), |
| BASE_DELTA_RATIO("orc.base.delta.ratio", "hive.exec.orc.base.delta.ratio", 8, |
| "The ratio of base writer and delta writer in terms of STRIPE_SIZE and BUFFER_SIZE."), |
| BLOCK_PADDING("orc.block.padding", "hive.exec.orc.default.block.padding", |
| true, |
| "Define whether stripes should be padded to the HDFS block boundaries."), |
| COMPRESS("orc.compress", "hive.exec.orc.default.compress", "ZLIB", |
| "Define the default compression codec for ORC file"), |
| WRITE_FORMAT("orc.write.format", "hive.exec.orc.write.format", "0.12", |
| "Define the version of the file to write. Possible values are 0.11 and\n"+ |
| " 0.12. If this parameter is not defined, ORC will use the run\n" + |
| " length encoding (RLE) introduced in Hive 0.12."), |
| ENFORCE_COMPRESSION_BUFFER_SIZE("orc.buffer.size.enforce", "hive.exec.orc.buffer.size.enforce", false, |
| "Defines whether to enforce ORC compression buffer size."), |
| ENCODING_STRATEGY("orc.encoding.strategy", "hive.exec.orc.encoding.strategy", |
| "SPEED", |
| "Define the encoding strategy to use while writing data. Changing this\n"+ |
| "will only affect the light weight encoding for integers. This\n" + |
| "flag will not change the compression level of higher level\n" + |
| "compression codec (like ZLIB)."), |
| COMPRESSION_STRATEGY("orc.compression.strategy", |
| "hive.exec.orc.compression.strategy", "SPEED", |
| "Define the compression strategy to use while writing data.\n" + |
| "This changes the compression level of higher level compression\n" + |
| "codec (like ZLIB)."), |
| BLOCK_PADDING_TOLERANCE("orc.block.padding.tolerance", |
| "hive.exec.orc.block.padding.tolerance", 0.05, |
| "Define the tolerance for block padding as a decimal fraction of\n" + |
| "stripe size (for example, the default value 0.05 is 5% of the\n" + |
| "stripe size). For the defaults of 64Mb ORC stripe and 256Mb HDFS\n" + |
| "blocks, the default block padding tolerance of 5% will\n" + |
| "reserve a maximum of 3.2Mb for padding within the 256Mb block.\n" + |
| "In that case, if the available size within the block is more than\n"+ |
| "3.2Mb, a new smaller stripe will be inserted to fit within that\n" + |
| "space. This will make sure that no stripe written will block\n" + |
| " boundaries and cause remote reads within a node local task."), |
| BLOOM_FILTER_FPP("orc.bloom.filter.fpp", "orc.default.bloom.fpp", 0.05, |
| "Define the default false positive probability for bloom filters."), |
| USE_ZEROCOPY("orc.use.zerocopy", "hive.exec.orc.zerocopy", false, |
| "Use zerocopy reads with ORC. (This requires Hadoop 2.3 or later.)"), |
| SKIP_CORRUPT_DATA("orc.skip.corrupt.data", "hive.exec.orc.skip.corrupt.data", |
| false, |
| "If ORC reader encounters corrupt data, this value will be used to\n" + |
| "determine whether to skip the corrupt data or throw exception.\n" + |
| "The default behavior is to throw exception."), |
| TOLERATE_MISSING_SCHEMA("orc.tolerate.missing.schema", |
| "hive.exec.orc.tolerate.missing.schema", |
| true, |
| "Writers earlier than HIVE-4243 may have inaccurate schema metadata.\n" |
| + "This setting will enable best effort schema evolution rather\n" |
| + "than rejecting mismatched schemas"), |
| MEMORY_POOL("orc.memory.pool", "hive.exec.orc.memory.pool", 0.5, |
| "Maximum fraction of heap that can be used by ORC file writers"), |
| DICTIONARY_KEY_SIZE_THRESHOLD("orc.dictionary.key.threshold", |
| "hive.exec.orc.dictionary.key.size.threshold", |
| 0.8, |
| "If the number of distinct keys in a dictionary is greater than this\n" + |
| "fraction of the total number of non-null rows, turn off \n" + |
| "dictionary encoding. Use 1 to always use dictionary encoding."), |
| ROW_INDEX_STRIDE_DICTIONARY_CHECK("orc.dictionary.early.check", |
| "hive.orc.row.index.stride.dictionary.check", |
| true, |
| "If enabled dictionary check will happen after first row index stride\n" + |
| "(default 10000 rows) else dictionary check will happen before\n" + |
| "writing first stripe. In both cases, the decision to use\n" + |
| "dictionary or not will be retained thereafter."), |
| BLOOM_FILTER_COLUMNS("orc.bloom.filter.columns", "orc.bloom.filter.columns", |
| "", "List of columns to create bloom filters for when writing."), |
| BLOOM_FILTER_WRITE_VERSION("orc.bloom.filter.write.version", |
| "orc.bloom.filter.write.version", OrcFile.BloomFilterVersion.UTF8.toString(), |
| "Which version of the bloom filters should we write.\n" + |
| "The choices are:\n" + |
| " original - writes two versions of the bloom filters for use by\n" + |
| " both old and new readers.\n" + |
| " utf8 - writes just the new bloom filters."), |
| IGNORE_NON_UTF8_BLOOM_FILTERS("orc.bloom.filter.ignore.non-utf8", |
| "orc.bloom.filter.ignore.non-utf8", false, |
| "Should the reader ignore the obsolete non-UTF8 bloom filters."), |
| MAX_FILE_LENGTH("orc.max.file.length", "orc.max.file.length", Long.MAX_VALUE, |
| "The maximum size of the file to read for finding the file tail. This\n" + |
| "is primarily used for streaming ingest to read intermediate\n" + |
| "footers while the file is still open"), |
| MAPRED_INPUT_SCHEMA("orc.mapred.input.schema", null, null, |
| "The schema that the user desires to read. The values are\n" + |
| "interpreted using TypeDescription.fromString."), |
| MAPRED_SHUFFLE_KEY_SCHEMA("orc.mapred.map.output.key.schema", null, null, |
| "The schema of the MapReduce shuffle key. The values are\n" + |
| "interpreted using TypeDescription.fromString."), |
| MAPRED_SHUFFLE_VALUE_SCHEMA("orc.mapred.map.output.value.schema", null, null, |
| "The schema of the MapReduce shuffle value. The values are\n" + |
| "interpreted using TypeDescription.fromString."), |
| MAPRED_OUTPUT_SCHEMA("orc.mapred.output.schema", null, null, |
| "The schema that the user desires to write. The values are\n" + |
| "interpreted using TypeDescription.fromString."), |
| INCLUDE_COLUMNS("orc.include.columns", "hive.io.file.readcolumn.ids", null, |
| "The list of comma separated column ids that should be read with 0\n" + |
| "being the first column, 1 being the next, and so on. ."), |
| KRYO_SARG("orc.kryo.sarg", "orc.kryo.sarg", null, |
| "The kryo and base64 encoded SearchArgument for predicate pushdown."), |
| KRYO_SARG_BUFFER("orc.kryo.sarg.buffer", null, 8192, |
| "The kryo buffer size for SearchArgument for predicate pushdown."), |
| SARG_COLUMNS("orc.sarg.column.names", "org.sarg.column.names", null, |
| "The list of column names for the SearchArgument."), |
| FORCE_POSITIONAL_EVOLUTION("orc.force.positional.evolution", |
| "orc.force.positional.evolution", false, |
| "Require schema evolution to match the top level columns using position\n" + |
| "rather than column names. This provides backwards compatibility with\n" + |
| "Hive 2.1."), |
| ROWS_BETWEEN_CHECKS("orc.rows.between.memory.checks", "orc.rows.between.memory.checks", 5000, |
| "How often should MemoryManager check the memory sizes? Measured in rows\n" + |
| "added to all of the writers. Valid range is [1,10000] and is primarily meant for" + |
| "n\testing. Setting this too low may negatively affect performance."), |
| OVERWRITE_OUTPUT_FILE("orc.overwrite.output.file", "orc.overwrite.output.file", false, |
| "A boolean flag to enable overwriting of the output file if it already exists.\n"), |
| IS_SCHEMA_EVOLUTION_CASE_SENSITIVE("orc.schema.evolution.case.sensitive", "orc.schema.evolution.case.sensitive", true, |
| "A boolean flag to determine if the comparision of field names in schema evolution is case sensitive .\n"), |
| WRITE_VARIABLE_LENGTH_BLOCKS("orc.write.variable.length.blocks", null, false, |
| "A boolean flag as to whether the ORC writer should write variable length\n" |
| + "HDFS blocks."), |
| DIRECT_ENCODING_COLUMNS("orc.column.encoding.direct", "orc.column.encoding.direct", "", |
| "Comma-separated list of columns for which dictionary encoding is to be skipped."), |
| // some JVMs don't allow array creation of size Integer.MAX_VALUE, so the chunk size is slightly less than max int |
| ORC_MAX_DISK_RANGE_CHUNK_LIMIT("orc.max.disk.range.chunk.limit", "hive.exec.orc.max.disk.range.chunk.limit", |
| Integer.MAX_VALUE - 1024, "When reading stripes >2GB, specify max limit for the chunk size."), |
| ENCRYPTION("orc.encrypt", "orc.encrypt", null, "The list of keys and columns to encrypt with."), |
| DATA_MASK("orc.mask", "orc.mask", null, "The masks to apply to the encrypted columns."), |
| KEY_PROVIDER("orc.key.provider", "orc.key.provider", "hadoop", "The kind of KeyProvider to use for encryption."), |
| PROLEPTIC_GREGORIAN("orc.proleptic.gregorian", "orc.proleptic.gregorian", false, |
| "Should we read and write dates & times using the proleptic Gregorian calendar\n" + |
| "instead of the hybrid Julian Gregorian? Hive before 3.1 and Spark before 3.0\n" + |
| "used hybrid."), |
| PROLEPTIC_GREGORIAN_DEFAULT("orc.proleptic.gregorian.default", |
| "orc.proleptic.gregorian.default", false, |
| "This value controls whether pre-ORC 27 files are using the hybrid or proleptic\n" + |
| "calendar. Only Hive 3.1 and the C++ library wrote using the proleptic, so hybrid\n" + |
| "is the default.") |
| ; |
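| |
| // Typical usage (a minimal sketch; the Configuration instance and the |
| // "SNAPPY" value are illustrative, not defaults mandated by this class): |
| // |
| //   Configuration conf = new Configuration(); |
| //   OrcConf.COMPRESS.setString(conf, "SNAPPY"); |
| //   long stripeBytes = OrcConf.STRIPE_SIZE.getLong(conf);  // 64MB default |
| //   boolean usePadding = OrcConf.BLOCK_PADDING.getBoolean(conf); |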
| |
| private final String attribute; |
| private final String hiveConfName; |
| private final Object defaultValue; |
| private final String description; |
| |
| OrcConf(String attribute, |
| String hiveConfName, |
| Object defaultValue, |
| String description) { |
| this.attribute = attribute; |
| this.hiveConfName = hiveConfName; |
| this.defaultValue = defaultValue; |
| this.description = description; |
| } |
| |
| public String getAttribute() { |
| return attribute; |
| } |
| |
| public String getHiveConfName() { |
| return hiveConfName; |
| } |
| |
| public Object getDefaultValue() { |
| return defaultValue; |
| } |
| |
| public String getDescription() { |
| return description; |
| } |
| |
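| /** |
| * Look up the raw string value for this property. The table properties are |
| * checked first, then the ORC attribute name in the Configuration, and |
| * finally the Hive fallback name, when one exists. For example, STRIPE_SIZE |
| * resolves tbl.getProperty("orc.stripe.size"), then |
| * conf.get("orc.stripe.size"), then |
| * conf.get("hive.exec.orc.default.stripe.size"). |
| */ |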
| private String lookupValue(Properties tbl, Configuration conf) { |
| String result = null; |
| if (tbl != null) { |
| result = tbl.getProperty(attribute); |
| } |
| if (result == null && conf != null) { |
| result = conf.get(attribute); |
| if (result == null && hiveConfName != null) { |
| result = conf.get(hiveConfName); |
| } |
| } |
| return result; |
| } |
| |
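| /** |
| * Get this property as an int, parsing the configured value and falling |
| * back to the enum's default when it is set in neither the table |
| * properties nor the configuration. |
| */ |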
| public int getInt(Properties tbl, Configuration conf) { |
| String value = lookupValue(tbl, conf); |
| if (value != null) { |
| return Integer.parseInt(value); |
| } |
| return ((Number) defaultValue).intValue(); |
| } |
| |
| public int getInt(Configuration conf) { |
| return getInt(null, conf); |
| } |
| |
| public void setInt(Configuration conf, int value) { |
| conf.setInt(attribute, value); |
| } |
| |
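| /** |
| * Get this property as a long, falling back to the default when unset. |
| */ |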
| public long getLong(Properties tbl, Configuration conf) { |
| String value = lookupValue(tbl, conf); |
| if (value != null) { |
| return Long.parseLong(value); |
| } |
| return ((Number) defaultValue).longValue(); |
| } |
| |
| public long getLong(Configuration conf) { |
| return getLong(null, conf); |
| } |
| |
| public void setLong(Configuration conf, long value) { |
| conf.setLong(attribute, value); |
| } |
| |
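| /** |
| * Get this property as a String. Unlike the numeric getters, the default |
| * may be null (for example, the orc.mapred.* schema properties). |
| */ |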
| public String getString(Properties tbl, Configuration conf) { |
| String value = lookupValue(tbl, conf); |
| return value == null ? (String) defaultValue : value; |
| } |
| |
| public String getString(Configuration conf) { |
| return getString(null, conf); |
| } |
| |
| public void setString(Configuration conf, String value) { |
| conf.set(attribute, value); |
| } |
| |
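| /** |
| * Get this property as a boolean; any configured value other than "true" |
| * (ignoring case) parses as false. |
| */ |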
| public boolean getBoolean(Properties tbl, Configuration conf) { |
| String value = lookupValue(tbl, conf); |
| if (value != null) { |
| return Boolean.parseBoolean(value); |
| } |
| return (Boolean) defaultValue; |
| } |
| |
| public boolean getBoolean(Configuration conf) { |
| return getBoolean(null, conf); |
| } |
| |
| public void setBoolean(Configuration conf, boolean value) { |
| conf.setBoolean(attribute, value); |
| } |
| |
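| /** |
| * Get this property as a double, falling back to the default when unset. |
| */ |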
| public double getDouble(Properties tbl, Configuration conf) { |
| String value = lookupValue(tbl, conf); |
| if (value != null) { |
| return Double.parseDouble(value); |
| } |
| return ((Number) defaultValue).doubleValue(); |
| } |
| |
| public double getDouble(Configuration conf) { |
| return getDouble(null, conf); |
| } |
| |
| public void setDouble(Configuration conf, double value) { |
| conf.setDouble(attribute, value); |
| } |
| } |