| /* |
| * Licensed to the Apache Software Foundation (ASF) under one |
| * or more contributor license agreements. See the NOTICE file |
| * distributed with this work for additional information |
| * regarding copyright ownership. The ASF licenses this file |
| * to you under the Apache License, Version 2.0 (the |
| * "License"); you may not use this file except in compliance |
| * with the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| package org.apache.drill.exec.store.easy.text.reader; |
| |
| import org.apache.drill.exec.record.metadata.TupleMetadata; |
| import org.apache.drill.exec.store.easy.text.TextFormatPlugin; |
| import org.apache.drill.exec.store.easy.text.TextFormatConfig; |
| import org.apache.drill.shaded.guava.com.google.common.base.Charsets; |
| |
| import java.nio.charset.StandardCharsets; |
| |
| public class TextParsingSettings { |
| |
| private final byte quote; |
| private final byte quoteEscape; |
| private final byte delimiter; |
| private final byte comment; |
| |
| private final long maxCharsPerColumn = TextFormatPlugin.MAX_CHARS_PER_COLUMN; |
| private final byte normalizedNewLine = b('\n'); |
| private final byte[] newLineDelimiter; |
| private final String lineSeparatorString; |
| private boolean skipFirstLine; |
| private final boolean headerExtractionEnabled; |
| |
| // Available only via table properties |
| private final boolean parseUnescapedQuotes; |
| private final boolean ignoreLeadingWhitespace; |
| private final boolean ignoreTrailingWhitespace; |
| /** |
| * Configure the properties for this one scan based on: |
| * <p> |
| * <ul> |
| * <li>The defaults in the plugin config (if properties not defined |
| * in the config JSON.</li> |
| * <li>The config values from the config JSON as stored in the |
| * plugin config.</li> |
| * <li>Table function settings expressed in the query (and passed |
| * in as part of the plugin config.</li> |
| * <li>Table properties.</li> |
| * </ul> |
| * <p> |
| * The result is that the user can customize the behavior of a table just |
| * via the table properties; the user need not define a new storage |
| * config just to change a property. For example, by default, the |
| * <tt>`csv`</tt> config has no headers. But, if the user has a ".csv" |
| * file with headers, the user can just customize the table properties. |
| */ |
| public TextParsingSettings(TextFormatConfig config, |
| TupleMetadata providedSchema) { |
| boolean extractHeaders = config.isHeaderExtractionEnabled(); |
| boolean skipFirst = config.isSkipFirstLine(); |
| boolean ignoreLeadingWhitespace = false; |
| boolean ignoreTrailingWhitespace = false; |
| boolean parseUnescapedQuotes = true; |
| byte delimChar = bSafe(config.getFieldDelimiter(), "fieldDelimiter"); |
| byte commentChar = bSafe(config.getComment(), "comment"); |
| byte quoteChar = bSafe(config.getQuote(), "quote"); |
| byte quoteEscapeChar = bSafe(config.getEscape(), "escape"); |
| byte[] newlineDelim = config.getLineDelimiter().getBytes(Charsets.UTF_8); |
| if (providedSchema != null) { |
| extractHeaders = providedSchema.booleanProperty( |
| TextFormatPlugin.HAS_HEADERS_PROP, extractHeaders); |
| skipFirst = ! extractHeaders & providedSchema.booleanProperty( |
| TextFormatPlugin.SKIP_FIRST_LINE_PROP, skipFirstLine); |
| skipFirst = ! extractHeaders & providedSchema.booleanProperty( |
| TextFormatPlugin.SKIP_FIRST_LINE_PROP, skipFirstLine); |
| ignoreLeadingWhitespace = providedSchema.booleanProperty( |
| TextFormatPlugin.TRIM_WHITESPACE_PROP, ignoreLeadingWhitespace); |
| ignoreTrailingWhitespace = providedSchema.booleanProperty( |
| TextFormatPlugin.TRIM_WHITESPACE_PROP, ignoreTrailingWhitespace); |
| parseUnescapedQuotes = providedSchema.booleanProperty( |
| TextFormatPlugin.PARSE_UNESCAPED_QUOTES_PROP, parseUnescapedQuotes); |
| delimChar = overrideChar(providedSchema, TextFormatPlugin.DELIMITER_PROP, delimChar); |
| quoteChar = overrideChar(providedSchema, TextFormatPlugin.QUOTE_PROP, quoteChar); |
| quoteEscapeChar = overrideChar(providedSchema, TextFormatPlugin.QUOTE_ESCAPE_PROP, quoteEscapeChar); |
| newlineDelim = newlineDelimBytes(providedSchema, newlineDelim); |
| commentChar = commentChar(providedSchema, commentChar); |
| } |
| this.skipFirstLine = !extractHeaders && skipFirst; |
| this.headerExtractionEnabled = extractHeaders; |
| |
| this.quote = quoteChar; |
| this.quoteEscape = quoteEscapeChar; |
| this.newLineDelimiter = newlineDelim; |
| this.lineSeparatorString = new String(newLineDelimiter); |
| this.delimiter = delimChar; |
| this.comment = commentChar; |
| this.ignoreLeadingWhitespace = ignoreLeadingWhitespace; |
| this.ignoreTrailingWhitespace = ignoreTrailingWhitespace; |
| this.parseUnescapedQuotes = parseUnescapedQuotes; |
| } |
| |
| /** |
| * Parse a delimiter from table properties. If the property is unset, |
| * or is a blank string, then uses the delimiter from the plugin config. |
| * Else, if non-blank, uses the first character of the property value. |
| */ |
| private static byte overrideChar(TupleMetadata providedSchema, String propName, byte configValue) { |
| String value = providedSchema.property(propName); |
| if (value == null || value.isEmpty()) { |
| return configValue; |
| } |
| // Text reader supports only ASCII text and characters. |
| return (byte) value.charAt(0); |
| } |
| |
| /** |
| * Parse a comment character from table properties. If the property is unset, |
| * then uses the delimiter from the plugin config. If the properry value is |
| * blank, then uses ASCII NUL (0) as the comment. This value should never |
| * match anything, and effectively disables the comment feature. |
| * Else, if non-blank, uses the first character of the property value. |
| */ |
| private static byte commentChar(TupleMetadata providedSchema, byte configValue) { |
| String value = providedSchema.property(TextFormatPlugin.COMMENT_CHAR_PROP); |
| if (value == null) { |
| return configValue; |
| } |
| if (value.isEmpty()) { |
| return 0; |
| } |
| // Text reader supports only ASCII text and characters. |
| return (byte) value.charAt(0); |
| } |
| |
| /** |
| * Return either line delimiter from table properties, or the one |
| * provided as a parameter from the plugin config. The line delimiter |
| * can contain multiple bytes. |
| */ |
| private static byte[] newlineDelimBytes(TupleMetadata providedSchema, byte[] configValue) { |
| String value = providedSchema.property(TextFormatPlugin.LINE_DELIM_PROP); |
| if (value == null || value.isEmpty()) { |
| return configValue; |
| } |
| return value.getBytes(StandardCharsets.UTF_8); |
| } |
| |
| public byte getComment() { |
| return comment; |
| } |
| |
| public boolean isSkipFirstLine() { |
| return skipFirstLine; |
| } |
| |
| public void setSkipFirstLine(boolean skipFirstLine) { |
| this.skipFirstLine = skipFirstLine; |
| } |
| |
| private static byte bSafe(char c, String name) { |
| if (c > Byte.MAX_VALUE) { |
| throw new IllegalArgumentException(String.format("Failure validating configuration option %s. Expected a " |
| + "character between 0 and 127 but value was actually %d.", name, (int) c)); |
| } |
| return (byte) c; |
| } |
| |
| private static byte b(char c) { |
| return (byte) c; |
| } |
| |
| public byte[] getNewLineDelimiter() { |
| return newLineDelimiter; |
| } |
| |
| /** |
| * Returns the character used for escaping values where the field delimiter is |
| * part of the value. Defaults to '"' |
| * |
| * @return the quote character |
| */ |
| public byte getQuote() { |
| return quote; |
| } |
| |
| public String getLineSeparatorString() { |
| return lineSeparatorString; |
| } |
| |
| /** |
| * Returns the character used for escaping quotes inside an already quoted value. Defaults to '"' |
| * @return the quote escape character |
| */ |
| public byte getQuoteEscape() { |
| return quoteEscape; |
| } |
| |
| /** |
| * Returns the field delimiter character. Defaults to ',' |
| * @return the field delimiter character |
| */ |
| public byte getDelimiter() { |
| return delimiter; |
| } |
| |
| /** |
| * Indicates whether the CSV parser should accept unescaped quotes inside |
| * quoted values and parse them normally. Defaults to {@code true}. |
| * |
| * @return a flag indicating whether or not the CSV parser should accept |
| * unescaped quotes inside quoted values. |
| */ |
| public boolean parseUnescapedQuotes() { |
| return parseUnescapedQuotes; |
| } |
| |
| /** |
| * Indicates whether or not the first valid record parsed from the input |
| * should be considered as the row containing the names of each column |
| * |
| * @return true if the first valid record parsed from the input should be |
| * considered as the row containing the names of each column, false |
| * otherwise |
| */ |
| public boolean isHeaderExtractionEnabled() { |
| return headerExtractionEnabled; |
| } |
| |
| public long getMaxCharsPerColumn() { |
| return maxCharsPerColumn; |
| } |
| |
| public byte getNormalizedNewLine() { |
| return normalizedNewLine; |
| } |
| |
| public boolean ignoreLeadingWhitespace() { |
| return ignoreLeadingWhitespace; |
| } |
| |
| public boolean ignoreTrailingWhitespace() { |
| return ignoreTrailingWhitespace; |
| } |
| } |