/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.carbondata.processing.loading.csvinput;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.nio.charset.Charset;
import org.apache.carbondata.common.logging.LogServiceFactory;
import org.apache.carbondata.core.constants.CarbonCommonConstants;
import org.apache.carbondata.core.util.CarbonProperties;
import com.univocity.parsers.csv.CsvParser;
import com.univocity.parsers.csv.CsvParserSettings;
import org.apache.commons.io.input.BOMInputStream;
import org.apache.commons.lang.BooleanUtils;
import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.Seekable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.compress.CodecPool;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.CompressionCodecFactory;
import org.apache.hadoop.io.compress.CompressionInputStream;
import org.apache.hadoop.io.compress.Decompressor;
import org.apache.hadoop.io.compress.SplitCompressionInputStream;
import org.apache.hadoop.io.compress.SplittableCompressionCodec;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.util.LineReader;
import org.apache.log4j.Logger;
/**
* An {@link org.apache.hadoop.mapreduce.InputFormat} for CSV files. Files are broken into lines;
* each value is one parsed line of a CSV file, and the key is always null.
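* <p>
* A minimal usage sketch (hypothetical job wiring; the setter calls are this
* class's API, the rest is standard Hadoop MapReduce and illustrative only):
* <pre>
*   Configuration conf = new Configuration();
*   CSVInputFormat.setCSVDelimiter(conf, ",");
*   CSVInputFormat.setHeaderExtractionEnabled(conf, true);
*   Job job = Job.getInstance(conf, "csv-load");
*   job.setInputFormatClass(CSVInputFormat.class);
*   FileInputFormat.addInputPath(job, new Path("/path/to/input"));
* </pre>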
*/
public class CSVInputFormat extends FileInputFormat<NullWritable, StringArrayWritable> {
public static final String DELIMITER = "carbon.csvinputformat.delimiter";
public static final String DELIMITER_DEFAULT = ",";
public static final String COMMENT = "carbon.csvinputformat.comment";
public static final String COMMENT_DEFAULT = "#";
public static final String QUOTE = "carbon.csvinputformat.quote";
public static final String QUOTE_DEFAULT = "\"";
public static final String ESCAPE = "carbon.csvinputformat.escape";
public static final String ESCAPE_DEFAULT = "\\";
public static final String HEADER_PRESENT = "carbon.csvinputformat.header.present";
public static final boolean HEADER_PRESENT_DEFAULT = false;
public static final String SKIP_EMPTY_LINE = "carbon.csvinputformat.skip.empty.line";
public static final String READ_BUFFER_SIZE = "carbon.csvinputformat.read.buffer.size";
public static final String READ_BUFFER_SIZE_DEFAULT = "65536";
public static final String MAX_COLUMNS = "carbon.csvinputformat.max.columns";
public static final String NUMBER_OF_COLUMNS = "carbon.csvinputformat.number.of.columns";
/**
* Supports only a single column index.
*/
public static final String SELECT_COLUMN_INDEX = "carbon.csvinputformat.select.column.index";
public static final int DEFAULT_MAX_NUMBER_OF_COLUMNS_FOR_PARSING = 2000;
public static final int THRESHOLD_MAX_NUMBER_OF_COLUMNS_FOR_PARSING = 20000;
private static final Logger LOGGER =
LogServiceFactory.getLogService(CSVInputFormat.class.toString());
@Override
public RecordReader<NullWritable, StringArrayWritable> createRecordReader(InputSplit inputSplit,
TaskAttemptContext context) throws IOException, InterruptedException {
return new CSVRecordReader();
}
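// A plain file is always splittable; a compressed file is splittable only if
// its codec supports splitting (e.g. BZip2), otherwise one reader consumes it whole.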
@Override
protected boolean isSplitable(JobContext context, Path file) {
final CompressionCodec codec = new CompressionCodecFactory(context.getConfiguration())
.getCodec(file);
if (null == codec) {
return true;
}
return codec instanceof SplittableCompressionCodec;
}
/**
* Sets the comment character in the configuration. By default it is '#'.
* @param configuration job configuration to update
* @param commentChar comment character; ignored if null or empty
*/
public static void setCommentCharacter(Configuration configuration, String commentChar) {
if (commentChar != null && !commentChar.isEmpty()) {
configuration.set(COMMENT, commentChar);
}
}
/**
* Sets the delimiter in the configuration. By default it is ','.
* @param configuration job configuration to update
* @param delimiter field delimiter; ignored if null or empty
*/
public static void setCSVDelimiter(Configuration configuration, String delimiter) {
if (delimiter != null && !delimiter.isEmpty()) {
configuration.set(DELIMITER, delimiter);
}
}
/**
* Sets whether empty lines are skipped in the configuration. By default it is false.
*
* @param configuration job configuration to update
* @param skipEmptyLine "true" or "false"; if null or empty, the system-level
*                      carbon property is consulted instead
*/
public static void setSkipEmptyLine(Configuration configuration, String skipEmptyLine) {
if (skipEmptyLine != null && !skipEmptyLine.isEmpty()) {
configuration.set(SKIP_EMPTY_LINE, skipEmptyLine);
} else {
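// No value passed in: fall back to the system-level carbon property. The
// toBoolean call only validates that the property is "true" or "false";
// if it is missing or malformed, the hard-coded default is used instead.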
try {
BooleanUtils.toBoolean(CarbonProperties.getInstance()
.getProperty(CarbonCommonConstants.CARBON_SKIP_EMPTY_LINE), "true", "false");
configuration.set(SKIP_EMPTY_LINE, CarbonProperties.getInstance()
.getProperty(CarbonCommonConstants.CARBON_SKIP_EMPTY_LINE));
} catch (Exception e) {
configuration.set(SKIP_EMPTY_LINE, CarbonCommonConstants.CARBON_SKIP_EMPTY_LINE_DEFAULT);
}
}
}
/**
* Sets the escape character in the configuration. By default it is '\'.
* @param configuration job configuration to update
* @param escapeCharacter escape character; ignored if null or empty
*/
public static void setEscapeCharacter(Configuration configuration, String escapeCharacter) {
if (escapeCharacter != null && !escapeCharacter.isEmpty()) {
configuration.set(ESCAPE, escapeCharacter);
}
}
/**
* Sets whether the header line needs to be read from the CSV file. By default it is false.
* @param configuration job configuration to update
* @param headerExtractEnable true if the first line of the file is a header to be extracted
*/
public static void setHeaderExtractionEnabled(Configuration configuration,
boolean headerExtractEnable) {
configuration.set(HEADER_PRESENT, String.valueOf(headerExtractEnable));
}
/**
* Sets the quote character in the configuration. By default it is '"'.
* @param configuration job configuration to update
* @param quoteCharacter quote character; ignored if null or empty
*/
public static void setQuoteCharacter(Configuration configuration, String quoteCharacter) {
if (quoteCharacter != null && !quoteCharacter.isEmpty()) {
configuration.set(QUOTE, quoteCharacter);
}
}
/**
* Sets the read buffer size in the configuration.
* @param configuration job configuration to update
* @param bufferSize buffer size in bytes; ignored if null or empty
*/
public static void setReadBufferSize(Configuration configuration, String bufferSize) {
if (bufferSize != null && !bufferSize.isEmpty()) {
configuration.set(READ_BUFFER_SIZE, bufferSize);
}
}
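/**
* Sets the maximum number of columns the parser will accept per row.
*/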
public static void setMaxColumns(Configuration configuration, String maxColumns) {
if (maxColumns != null) {
configuration.set(MAX_COLUMNS, maxColumns);
}
}
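/**
* Sets the expected number of columns in the input.
*/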
public static void setNumberOfColumns(Configuration configuration, String numberOfColumns) {
configuration.set(NUMBER_OF_COLUMNS, numberOfColumns);
}
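/**
* Builds the univocity {@link CsvParserSettings} from the job configuration,
* applying the defaults declared above for any option that is not set.
*/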
public static CsvParserSettings extractCsvParserSettings(Configuration job) {
CsvParserSettings parserSettings = new CsvParserSettings();
parserSettings.getFormat().setDelimiter(job.get(DELIMITER, DELIMITER_DEFAULT).charAt(0));
parserSettings.getFormat().setComment(job.get(COMMENT, COMMENT_DEFAULT).charAt(0));
parserSettings.setLineSeparatorDetectionEnabled(true);
parserSettings.setNullValue("");
parserSettings.setEmptyValue("");
parserSettings.setIgnoreLeadingWhitespaces(false);
parserSettings.setIgnoreTrailingWhitespaces(false);
parserSettings.setSkipEmptyLines(
Boolean.valueOf(job.get(SKIP_EMPTY_LINE,
CarbonCommonConstants.CARBON_SKIP_EMPTY_LINE_DEFAULT)));
// TODO: verify whether using -1 (unlimited) here causes any performance degradation
// parserSettings.setMaxCharsPerColumn(CarbonCommonConstants.MAX_CHARS_PER_COLUMN_DEFAULT);
parserSettings.setMaxCharsPerColumn(CarbonCommonConstants.MAX_CHARS_PER_COLUMN_INFINITY);
String maxColumns = job.get(MAX_COLUMNS, "" + DEFAULT_MAX_NUMBER_OF_COLUMNS_FOR_PARSING);
parserSettings.setMaxColumns(Integer.parseInt(maxColumns));
parserSettings.getFormat().setQuote(job.get(QUOTE, QUOTE_DEFAULT).charAt(0));
parserSettings.getFormat().setQuoteEscape(job.get(ESCAPE, ESCAPE_DEFAULT).charAt(0));
// Limit the length of the parsed/written content displayed in the exception
// message when an error occurs.
parserSettings.setErrorContentLength(CarbonCommonConstants.CARBON_ERROR_CONTENT_LENGTH);
String selectColumnIndex = job.get(SELECT_COLUMN_INDEX, null);
if (!StringUtils.isBlank(selectColumnIndex)) {
parserSettings.selectIndexes(Integer.parseInt(selectColumnIndex));
}
return parserSettings;
}
/**
* Treats each value as one line of the file. The key is always null.
*/
public static class CSVRecordReader extends RecordReader<NullWritable, StringArrayWritable> {
private long start;
private long end;
private BoundedInputStream boundedInputStream;
private Reader reader;
private CsvParser csvParser;
private StringArrayWritable value;
private String[] columns;
private Seekable filePosition;
private boolean isCompressedInput;
private Decompressor decompressor;
@Override
public void initialize(InputSplit inputSplit, TaskAttemptContext context)
throws IOException, InterruptedException {
FileSplit split = (FileSplit) inputSplit;
start = split.getStart();
end = start + split.getLength();
Path file = split.getPath();
Configuration job = context.getConfiguration();
CompressionCodec codec = (new CompressionCodecFactory(job)).getCodec(file);
FileSystem fs = file.getFileSystem(job);
int bufferSize = Integer.parseInt(job.get(READ_BUFFER_SIZE, READ_BUFFER_SIZE_DEFAULT));
FSDataInputStream fileIn = fs.open(file, bufferSize);
InputStream inputStream;
if (codec != null) {
isCompressedInput = true;
decompressor = CodecPool.getDecompressor(codec);
if (codec instanceof SplittableCompressionCodec) {
SplitCompressionInputStream scIn = ((SplittableCompressionCodec) codec)
.createInputStream(fileIn, decompressor, start, end, SplittableCompressionCodec
.READ_MODE.BYBLOCK);
start = scIn.getAdjustedStart();
end = scIn.getAdjustedEnd();
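// For any split other than the first, skip the (possibly partial) first line;
// the reader of the previous split consumes that line instead.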
if (start != 0) {
LineReader lineReader = new LineReader(scIn, 1);
start += lineReader.readLine(new Text(), 0);
}
filePosition = scIn;
inputStream = scIn;
} else {
CompressionInputStream cIn = codec.createInputStream(fileIn, decompressor);
filePosition = cIn;
inputStream = cIn;
}
} else {
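// Uncompressed input: seek directly to the split start and, for any split
// other than the first, skip the partial first line (the previous split's
// reader consumes it). Bound the stream so reading stops at the split end.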
fileIn.seek(start);
if (start != 0) {
LineReader lineReader = new LineReader(fileIn, 1);
start += lineReader.readLine(new Text(), 0);
}
boundedInputStream = new BoundedInputStream(fileIn, end - start);
filePosition = fileIn;
inputStream = boundedInputStream;
}
// Wrap the input stream with BOMInputStream to skip a UTF-8 BOM if present
reader = new InputStreamReader(new BOMInputStream(inputStream),
Charset.forName(CarbonCommonConstants.DEFAULT_CHARSET));
CsvParserSettings settings = extractCsvParserSettings(job);
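// Only the reader of the first split extracts the header; later splits
// never start at offset 0, so they leave header extraction disabled.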
if (start == 0) {
settings.setHeaderExtractionEnabled(job.getBoolean(HEADER_PRESENT,
HEADER_PRESENT_DEFAULT));
}
csvParser = new CsvParser(settings);
csvParser.beginParsing(reader);
}
@Override
public boolean nextKeyValue() throws IOException, InterruptedException {
if (csvParser == null) {
return false;
}
columns = csvParser.parseNext();
if (columns == null) {
value = null;
return false;
}
if (value == null) {
value = new StringArrayWritable();
}
value.set(columns);
return true;
}
@Override
public NullWritable getCurrentKey() throws IOException, InterruptedException {
return NullWritable.get();
}
@Override
public StringArrayWritable getCurrentValue() throws IOException, InterruptedException {
return value;
}
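// Current position within the split: for uncompressed input it is derived
// from the bytes remaining in the bounded stream; for compressed input it is
// the position reported by the compression stream.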
private long getPos() throws IOException {
long retVal = start;
if (null != boundedInputStream) {
retVal = end - boundedInputStream.getRemaining();
} else if (isCompressedInput && null != filePosition) {
retVal = filePosition.getPos();
}
return retVal;
}
@Override
public float getProgress() throws IOException, InterruptedException {
return start == end ? 0.0F : Math.min(1.0F, (float) (getPos() -
start) / (float) (end - start));
}
@Override
public void close() throws IOException {
try {
if (reader != null) {
reader.close();
}
if (boundedInputStream != null) {
boundedInputStream.close();
}
if (null != csvParser) {
csvParser.stopParsing();
}
} finally {
reader = null;
boundedInputStream = null;
csvParser = null;
filePosition = null;
value = null;
if (decompressor != null) {
CodecPool.returnDecompressor(decompressor);
decompressor = null;
}
}
}
}
}