nifi-nar-bundles/nifi-standard-services/nifi-record-serialization-services-bundle/nifi-record-serialization-services/src/main/java/org/apache/nifi/csv/CSVReader.java - nifi - Git at Google

 /*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
  * The ASF licenses this file to You under the Apache License, Version 2.0
  * (the "License"); you may not use this file except in compliance with
  * the License.  You may obtain a copy of the License at
  *
  *     http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */

 package org.apache.nifi.csv;

 import org.apache.commons.csv.CSVFormat;
 import org.apache.nifi.annotation.documentation.CapabilityDescription;
 import org.apache.nifi.annotation.documentation.Tags;
 import org.apache.nifi.annotation.lifecycle.OnEnabled;
 import org.apache.nifi.components.AllowableValue;
 import org.apache.nifi.components.PropertyDescriptor;
 import org.apache.nifi.context.PropertyContext;
 import org.apache.nifi.controller.ConfigurationContext;
 import org.apache.nifi.expression.ExpressionLanguageScope;
 import org.apache.nifi.logging.ComponentLog;
 import org.apache.nifi.schema.access.SchemaAccessStrategy;
 import org.apache.nifi.schema.access.SchemaAccessUtils;
 import org.apache.nifi.schema.access.SchemaNotFoundException;
 import org.apache.nifi.schema.inference.InferSchemaAccessStrategy;
 import org.apache.nifi.schema.inference.RecordSourceFactory;
 import org.apache.nifi.schema.inference.SchemaInferenceEngine;
 import org.apache.nifi.schema.inference.SchemaInferenceUtil;
 import org.apache.nifi.schema.inference.TimeValueInference;
 import org.apache.nifi.schemaregistry.services.SchemaRegistry;
 import org.apache.nifi.serialization.DateTimeUtils;
 import org.apache.nifi.serialization.RecordReader;
 import org.apache.nifi.serialization.RecordReaderFactory;
 import org.apache.nifi.serialization.SchemaRegistryService;
 import org.apache.nifi.serialization.record.RecordSchema;
 import org.apache.nifi.stream.io.NonCloseableInputStream;

 import java.io.IOException;
 import java.io.InputStream;
 import java.util.ArrayList;
 import java.util.List;
 import java.util.Map;

 /**
  * Controller service that creates {@link RecordReader}s for CSV content.
  *
  * <p>The parser implementation (Apache Commons CSV or Jackson CSV) is chosen through the
  * {@link #CSV_PARSER} property. In addition to the schema access strategies offered by the
  * superclass, this reader offers a strategy that derives an all-String schema from the header
  * line and a strategy that infers the schema from the data; the latter is the default.</p>
  *
  * <p>Configuration is captured into volatile fields by {@link #storeCsvFormat(ConfigurationContext)}
  * and read back by {@link #createRecordReader(Map, InputStream, ComponentLog)}.</p>
  */
 @Tags({"csv", "parse", "record", "row", "reader", "delimited", "comma", "separated", "values"})
 @CapabilityDescription("Parses CSV-formatted data, returning each row in the CSV file as a separate record. "
     + "This reader assumes that the first line in the content is the column names and all subsequent lines are "
     + "the values. See Controller Service's Usage for further documentation.")
 public class CSVReader extends SchemaRegistryService implements RecordReaderFactory {

     // Schema access strategy that builds the schema from the CSV header line.
     // Immutable, so a single shared instance is sufficient.
     private static final AllowableValue HEADER_DERIVED = new AllowableValue("csv-header-derived", "Use String Fields From Header",
         "The first non-comment line of the CSV file is a header line that contains the names of the columns. The schema will be derived by using the "
             + "column names in the header and assuming that all columns are of type String.");

     // Read limit handed to InputStream.mark() before the schema is resolved from the content.
     private static final int SCHEMA_MARK_LIMIT_BYTES = 1024 * 1024;

     // CSV parsers
     public static final AllowableValue APACHE_COMMONS_CSV = new AllowableValue("commons-csv", "Apache Commons CSV",
             "The CSV parser implementation from the Apache Commons CSV library.");

     public static final AllowableValue JACKSON_CSV = new AllowableValue("jackson-csv", "Jackson CSV",
             "The CSV parser implementation from the Jackson Dataformats library.");


     public static final PropertyDescriptor CSV_PARSER = new PropertyDescriptor.Builder()
             .name("csv-reader-csv-parser")
             .displayName("CSV Parser")
             .description("Specifies which parser to use to read CSV records. NOTE: Different parsers may support different subsets of functionality "
                     + "and may also exhibit different levels of performance.")
             .expressionLanguageSupported(ExpressionLanguageScope.NONE)
             .allowableValues(APACHE_COMMONS_CSV, JACKSON_CSV)
             .defaultValue(APACHE_COMMONS_CSV.getValue())
             .required(true)
             .build();

     // Configuration snapshot, written by storeCsvFormat() and read when readers are created.
     private volatile String csvParser;
     private volatile CSVFormat csvFormat;
     private volatile String dateFormat;
     private volatile String timeFormat;
     private volatile String timestampFormat;
     private volatile boolean firstLineIsHeader;
     private volatile boolean ignoreHeader;
     private volatile String charSet;

     /**
      * Returns the superclass's property descriptors followed by the CSV-specific ones.
      *
      * @return a new, mutable list of the supported property descriptors
      */
     @Override
     protected List<PropertyDescriptor> getSupportedPropertyDescriptors() {
         final List<PropertyDescriptor> descriptors = new ArrayList<>(super.getSupportedPropertyDescriptors());
         descriptors.add(CSV_PARSER);
         descriptors.add(DateTimeUtils.DATE_FORMAT);
         descriptors.add(DateTimeUtils.TIME_FORMAT);
         descriptors.add(DateTimeUtils.TIMESTAMP_FORMAT);
         descriptors.add(CSVUtils.CSV_FORMAT);
         descriptors.add(CSVUtils.VALUE_SEPARATOR);
         descriptors.add(CSVUtils.FIRST_LINE_IS_HEADER);
         descriptors.add(CSVUtils.IGNORE_CSV_HEADER);
         descriptors.add(CSVUtils.QUOTE_CHAR);
         descriptors.add(CSVUtils.ESCAPE_CHAR);
         descriptors.add(CSVUtils.COMMENT_MARKER);
         descriptors.add(CSVUtils.NULL_STRING);
         descriptors.add(CSVUtils.TRIM_FIELDS);
         descriptors.add(CSVUtils.CHARSET);
         return descriptors;
     }

     /**
      * Captures the configured property values into this service's fields.
      *
      * <p>When the schema access strategy is header-derived or inferred, the first line is always
      * treated as a header, regardless of the 'First Line is Header' property.</p>
      *
      * @param context the configuration to read property values from
      */
     @OnEnabled
     public void storeCsvFormat(final ConfigurationContext context) {
         final String accessStrategy = context.getProperty(SchemaAccessUtils.SCHEMA_ACCESS_STRATEGY).getValue();
         final boolean headerRequired = HEADER_DERIVED.getValue().equals(accessStrategy)
             || SchemaInferenceUtil.INFER_SCHEMA.getValue().equals(accessStrategy);

         final CSVFormat configuredFormat = CSVUtils.createCSVFormat(context);
         final boolean headerConfigured = context.getProperty(CSVUtils.FIRST_LINE_IS_HEADER).asBoolean();

         // Work out the final format and header flag first so each volatile field is written
         // exactly once and never exposes an intermediate value.
         this.csvParser = context.getProperty(CSV_PARSER).getValue();
         this.csvFormat = headerRequired ? configuredFormat.withFirstRecordAsHeader() : configuredFormat;
         this.dateFormat = context.getProperty(DateTimeUtils.DATE_FORMAT).getValue();
         this.timeFormat = context.getProperty(DateTimeUtils.TIME_FORMAT).getValue();
         this.timestampFormat = context.getProperty(DateTimeUtils.TIMESTAMP_FORMAT).getValue();
         this.firstLineIsHeader = headerRequired || headerConfigured;
         this.ignoreHeader = context.getProperty(CSVUtils.IGNORE_CSV_HEADER).asBoolean();
         this.charSet = context.getProperty(CSVUtils.CHARSET).getValue();
     }

     /**
      * Resolves the schema for the given content and creates a reader backed by the configured parser.
      *
      * <p>The stream is marked before the schema is resolved and reset afterwards, so the returned
      * reader starts from the same position the stream had on entry.</p>
      *
      * @param variables variables passed through to schema resolution
      * @param in the CSV content; NOTE(review): it must support mark/reset, otherwise the
      *           {@code reset()} call is expected to fail - confirm against callers
      * @param logger the logger handed to the created reader
      * @return a record reader for the configured parser
      * @throws IOException if the stream cannot be reset or the configured parser is not recognised
      * @throws SchemaNotFoundException declared for schema resolution via {@code getSchema}
      */
     @Override
     public RecordReader createRecordReader(final Map<String, String> variables, final InputStream in, final ComponentLog logger) throws IOException, SchemaNotFoundException {
         // Use Mark/Reset in case we read from the Input Stream for the header.
         in.mark(SCHEMA_MARK_LIMIT_BYTES);
         final RecordSchema schema = getSchema(variables, new NonCloseableInputStream(in), null);
         in.reset();

         // Read the volatile field once so both comparisons see the same value.
         final String parser = csvParser;
         if (APACHE_COMMONS_CSV.getValue().equals(parser)) {
             return new CSVRecordReader(in, logger, schema, csvFormat, firstLineIsHeader, ignoreHeader, dateFormat, timeFormat, timestampFormat, charSet);
         }
         if (JACKSON_CSV.getValue().equals(parser)) {
             return new JacksonCSVRecordReader(in, logger, schema, csvFormat, firstLineIsHeader, ignoreHeader, dateFormat, timeFormat, timestampFormat, charSet);
         }
         throw new IOException("Parser not supported");
     }

     /**
      * Returns the strategy for the given access-strategy value, handling the two CSV-specific
      * strategies here and delegating every other value to the superclass.
      *
      * @param allowableValue the selected schema access strategy value (compared case-insensitively)
      * @param schemaRegistry the schema registry, passed through to the superclass
      * @param context the property context used to configure the strategy
      * @return the schema access strategy for {@code allowableValue}
      */
     @Override
     protected SchemaAccessStrategy getSchemaAccessStrategy(final String allowableValue, final SchemaRegistry schemaRegistry, final PropertyContext context) {
         if (allowableValue.equalsIgnoreCase(HEADER_DERIVED.getValue())) {
             return new CSVHeaderSchemaStrategy(context);
         }

         if (allowableValue.equalsIgnoreCase(SchemaInferenceUtil.INFER_SCHEMA.getValue())) {
             // Take the formats from the supplied context rather than the cached fields, so the
             // result does not depend on storeCsvFormat() having already run.
             final String inferenceDateFormat = context.getProperty(DateTimeUtils.DATE_FORMAT).getValue();
             final String inferenceTimeFormat = context.getProperty(DateTimeUtils.TIME_FORMAT).getValue();
             final String inferenceTimestampFormat = context.getProperty(DateTimeUtils.TIMESTAMP_FORMAT).getValue();

             final RecordSourceFactory<CSVRecordAndFieldNames> sourceFactory = (variables, in) -> new CSVRecordSource(in, context);
             final SchemaInferenceEngine<CSVRecordAndFieldNames> inference =
                 new CSVSchemaInference(new TimeValueInference(inferenceDateFormat, inferenceTimeFormat, inferenceTimestampFormat));
             return new InferSchemaAccessStrategy<>(sourceFactory, inference, getLogger());
         }

         return super.getSchemaAccessStrategy(allowableValue, schemaRegistry, context);
     }

     /**
      * Returns the superclass's schema access strategies plus the header-derived and
      * infer-schema strategies.
      *
      * @return a new, mutable list of the selectable schema access strategies
      */
     @Override
     protected List<AllowableValue> getSchemaAccessStrategyValues() {
         final List<AllowableValue> strategies = new ArrayList<>(super.getSchemaAccessStrategyValues());
         strategies.add(HEADER_DERIVED);
         strategies.add(SchemaInferenceUtil.INFER_SCHEMA);
         return strategies;
     }

     /**
      * Returns schema inference as the default schema access strategy.
      *
      * @return {@code SchemaInferenceUtil.INFER_SCHEMA}
      */
     @Override
     protected AllowableValue getDefaultSchemaAccessStrategy() {
         return SchemaInferenceUtil.INFER_SCHEMA;
     }
 }
	/*
	* Licensed to the Apache Software Foundation (ASF) under one or more
	* contributor license agreements. See the NOTICE file distributed with
	* this work for additional information regarding copyright ownership.
	* The ASF licenses this file to You under the Apache License, Version 2.0
	* (the "License"); you may not use this file except in compliance with
	* the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/

	package org.apache.nifi.csv;

	import org.apache.commons.csv.CSVFormat;
	import org.apache.nifi.annotation.documentation.CapabilityDescription;
	import org.apache.nifi.annotation.documentation.Tags;
	import org.apache.nifi.annotation.lifecycle.OnEnabled;
	import org.apache.nifi.components.AllowableValue;
	import org.apache.nifi.components.PropertyDescriptor;
	import org.apache.nifi.context.PropertyContext;
	import org.apache.nifi.controller.ConfigurationContext;
	import org.apache.nifi.expression.ExpressionLanguageScope;
	import org.apache.nifi.logging.ComponentLog;
	import org.apache.nifi.schema.access.SchemaAccessStrategy;
	import org.apache.nifi.schema.access.SchemaAccessUtils;
	import org.apache.nifi.schema.access.SchemaNotFoundException;
	import org.apache.nifi.schema.inference.InferSchemaAccessStrategy;
	import org.apache.nifi.schema.inference.RecordSourceFactory;
	import org.apache.nifi.schema.inference.SchemaInferenceEngine;
	import org.apache.nifi.schema.inference.SchemaInferenceUtil;
	import org.apache.nifi.schema.inference.TimeValueInference;
	import org.apache.nifi.schemaregistry.services.SchemaRegistry;
	import org.apache.nifi.serialization.DateTimeUtils;
	import org.apache.nifi.serialization.RecordReader;
	import org.apache.nifi.serialization.RecordReaderFactory;
	import org.apache.nifi.serialization.SchemaRegistryService;
	import org.apache.nifi.serialization.record.RecordSchema;
	import org.apache.nifi.stream.io.NonCloseableInputStream;

	import java.io.IOException;
	import java.io.InputStream;
	import java.util.ArrayList;
	import java.util.List;
	import java.util.Map;

	@Tags({"csv", "parse", "record", "row", "reader", "delimited", "comma", "separated", "values"})
	@CapabilityDescription("Parses CSV-formatted data, returning each row in the CSV file as a separate record. "
	+ "This reader assumes that the first line in the content is the column names and all subsequent lines are "
	+ "the values. See Controller Service's Usage for further documentation.")
	public class CSVReader extends SchemaRegistryService implements RecordReaderFactory {

	private final AllowableValue headerDerivedAllowableValue = new AllowableValue("csv-header-derived", "Use String Fields From Header",
	"The first non-comment line of the CSV file is a header line that contains the names of the columns. The schema will be derived by using the "
	+ "column names in the header and assuming that all columns are of type String.");

	// CSV parsers
	public static final AllowableValue APACHE_COMMONS_CSV = new AllowableValue("commons-csv", "Apache Commons CSV",
	"The CSV parser implementation from the Apache Commons CSV library.");

	public static final AllowableValue JACKSON_CSV = new AllowableValue("jackson-csv", "Jackson CSV",
	"The CSV parser implementation from the Jackson Dataformats library.");


	public static final PropertyDescriptor CSV_PARSER = new PropertyDescriptor.Builder()
	.name("csv-reader-csv-parser")
	.displayName("CSV Parser")
	.description("Specifies which parser to use to read CSV records. NOTE: Different parsers may support different subsets of functionality "
	+ "and may also exhibit different levels of performance.")
	.expressionLanguageSupported(ExpressionLanguageScope.NONE)
	.allowableValues(APACHE_COMMONS_CSV, JACKSON_CSV)
	.defaultValue(APACHE_COMMONS_CSV.getValue())
	.required(true)
	.build();

	private volatile String csvParser;
	private volatile CSVFormat csvFormat;
	private volatile String dateFormat;
	private volatile String timeFormat;
	private volatile String timestampFormat;
	private volatile boolean firstLineIsHeader;
	private volatile boolean ignoreHeader;
	private volatile String charSet;

	@Override
	protected List<PropertyDescriptor> getSupportedPropertyDescriptors() {
	final List<PropertyDescriptor> properties = new ArrayList<>(super.getSupportedPropertyDescriptors());
	properties.add(CSV_PARSER);
	properties.add(DateTimeUtils.DATE_FORMAT);
	properties.add(DateTimeUtils.TIME_FORMAT);
	properties.add(DateTimeUtils.TIMESTAMP_FORMAT);
	properties.add(CSVUtils.CSV_FORMAT);
	properties.add(CSVUtils.VALUE_SEPARATOR);
	properties.add(CSVUtils.FIRST_LINE_IS_HEADER);
	properties.add(CSVUtils.IGNORE_CSV_HEADER);
	properties.add(CSVUtils.QUOTE_CHAR);
	properties.add(CSVUtils.ESCAPE_CHAR);
	properties.add(CSVUtils.COMMENT_MARKER);
	properties.add(CSVUtils.NULL_STRING);
	properties.add(CSVUtils.TRIM_FIELDS);
	properties.add(CSVUtils.CHARSET);
	return properties;
	}

	@OnEnabled
	public void storeCsvFormat(final ConfigurationContext context) {
	this.csvParser = context.getProperty(CSV_PARSER).getValue();
	this.csvFormat = CSVUtils.createCSVFormat(context);
	this.dateFormat = context.getProperty(DateTimeUtils.DATE_FORMAT).getValue();
	this.timeFormat = context.getProperty(DateTimeUtils.TIME_FORMAT).getValue();
	this.timestampFormat = context.getProperty(DateTimeUtils.TIMESTAMP_FORMAT).getValue();
	this.firstLineIsHeader = context.getProperty(CSVUtils.FIRST_LINE_IS_HEADER).asBoolean();
	this.ignoreHeader = context.getProperty(CSVUtils.IGNORE_CSV_HEADER).asBoolean();
	this.charSet = context.getProperty(CSVUtils.CHARSET).getValue();

	// Ensure that if we are deriving schema from header that we always treat the first line as a header,
	// regardless of the 'First Line is Header' property
	final String accessStrategy = context.getProperty(SchemaAccessUtils.SCHEMA_ACCESS_STRATEGY).getValue();
	if (headerDerivedAllowableValue.getValue().equals(accessStrategy) \|\| SchemaInferenceUtil.INFER_SCHEMA.getValue().equals(accessStrategy)) {
	this.csvFormat = this.csvFormat.withFirstRecordAsHeader();
	this.firstLineIsHeader = true;
	}
	}

	@Override
	public RecordReader createRecordReader(final Map<String, String> variables, final InputStream in, final ComponentLog logger) throws IOException, SchemaNotFoundException {
	// Use Mark/Reset of a BufferedInputStream in case we read from the Input Stream for the header.
	in.mark(1024 * 1024);
	final RecordSchema schema = getSchema(variables, new NonCloseableInputStream(in), null);
	in.reset();

	if(APACHE_COMMONS_CSV.getValue().equals(csvParser)) {
	return new CSVRecordReader(in, logger, schema, csvFormat, firstLineIsHeader, ignoreHeader, dateFormat, timeFormat, timestampFormat, charSet);
	} else if(JACKSON_CSV.getValue().equals(csvParser)) {
	return new JacksonCSVRecordReader(in, logger, schema, csvFormat, firstLineIsHeader, ignoreHeader, dateFormat, timeFormat, timestampFormat, charSet);
	} else {
	throw new IOException("Parser not supported");
	}
	}

	@Override
	protected SchemaAccessStrategy getSchemaAccessStrategy(final String allowableValue, final SchemaRegistry schemaRegistry, final PropertyContext context) {
	if (allowableValue.equalsIgnoreCase(headerDerivedAllowableValue.getValue())) {
	return new CSVHeaderSchemaStrategy(context);
	} else if (allowableValue.equalsIgnoreCase(SchemaInferenceUtil.INFER_SCHEMA.getValue())) {
	final RecordSourceFactory<CSVRecordAndFieldNames> sourceFactory = (var, in) -> new CSVRecordSource(in, context);
	final SchemaInferenceEngine<CSVRecordAndFieldNames> inference = new CSVSchemaInference(new TimeValueInference(dateFormat, timeFormat, timestampFormat));
	return new InferSchemaAccessStrategy<>(sourceFactory, inference, getLogger());
	}

	return super.getSchemaAccessStrategy(allowableValue, schemaRegistry, context);
	}

	@Override
	protected List<AllowableValue> getSchemaAccessStrategyValues() {
	final List<AllowableValue> allowableValues = new ArrayList<>(super.getSchemaAccessStrategyValues());
	allowableValues.add(headerDerivedAllowableValue);
	allowableValues.add(SchemaInferenceUtil.INFER_SCHEMA);
	return allowableValues;
	}

	@Override
	protected AllowableValue getDefaultSchemaAccessStrategy() {
	return SchemaInferenceUtil.INFER_SCHEMA;
	}
	}