tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/csv/TextAndCSVParser.java - tika - Git at Google

 /*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
  * The ASF licenses this file to You under the Apache License, Version 2.0
  * (the "License"); you may not use this file except in compliance with
  * the License.  You may obtain a copy of the License at
  *
  *      http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
 package org.apache.tika.parser.csv;

 import java.io.BufferedReader;
 import java.io.IOException;
 import java.io.InputStream;
 import java.io.InputStreamReader;
 import java.io.Reader;
 import java.nio.charset.Charset;
 import java.nio.charset.UnsupportedCharsetException;
 import java.util.Arrays;
 import java.util.Collections;
 import java.util.HashMap;
 import java.util.HashSet;
 import java.util.Map;
 import java.util.Set;

 import org.apache.commons.csv.CSVFormat;
 import org.apache.commons.csv.CSVRecord;
 import org.apache.commons.io.input.CloseShieldInputStream;
 import org.apache.tika.config.Field;
 import org.apache.tika.detect.AutoDetectReader;
 import org.apache.tika.detect.EncodingDetector;
 import org.apache.tika.exception.TikaException;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.metadata.Property;
 import org.apache.tika.metadata.TikaCoreProperties;
 import org.apache.tika.mime.MediaType;
 import org.apache.tika.parser.AbstractEncodingDetectorParser;
 import org.apache.tika.parser.ParseContext;
 import org.apache.tika.sax.XHTMLContentHandler;
 import org.xml.sax.ContentHandler;
 import org.xml.sax.SAXException;

 /**
  * Unless the {@link TikaCoreProperties#CONTENT_TYPE_USER_OVERRIDE} is set,
  * this parser tries to assess whether the file is a text file, csv or tsv.
  * If the detector detects regularity in column numbers and/or encapsulated cells,
  * this parser will apply the {@link org.apache.commons.csv.CSVParser};
  * otherwise, it will treat the contents as text.
  * <p>
  *     If there is a csv parse exception during detection, the parser sets
  * the {@link Metadata#CONTENT_TYPE} to {@link MediaType#TEXT_PLAIN}
  * and treats the file as {@link MediaType#TEXT_PLAIN}.
  * </p>
  * <p>
  *     If there is a csv parse exception during the parse, the parser
  *     writes what's left of the stream as if it were text and then throws
  *     an exception.  As of this writing, the content that was buffered by the underlying
  *     {@link org.apache.commons.csv.CSVParser} is lost.
  * </p>
  */
 public class TextAndCSVParser extends AbstractEncodingDetectorParser {

     // Namespace prefix and parameter names used for metadata keys and media type attributes.
     private static final String CSV_PREFIX = "csv";
     private static final String CHARSET = "charset";
     private static final String DELIMITER = "delimiter";

     /** Metadata property keyed as the csv prefix, the namespace delimiter and "delimiter". */
     public static final Property DELIMITER_PROPERTY = Property.externalText(
             CSV_PREFIX + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + DELIMITER);

     // XHTML element names emitted for tabular output.
     private static final String TABLE = "table";
     private static final String TR = "tr";
     private static final String TD = "td";

     static final MediaType CSV = MediaType.text("csv");
     static final MediaType TSV = MediaType.text("tsv");

     private static final int DEFAULT_MARK_LIMIT = 20000;

     private static final char[] DEFAULT_DELIMITERS = new char[]{',', '\t'};

     // Two-way lookup between a delimiter character and its human-readable name.
     private static final Map<Character, String> CHAR_TO_STRING_DELIMITER_MAP = new HashMap<>();
     private static final Map<String, Character> STRING_TO_CHAR_DELIMITER_MAP = new HashMap<>();

     static {
         // Parallel arrays: symbols[i] is named names[i]; both directions are filled in one pass.
         final char[] symbols = {',', '\t', '|', ';', ':'};
         final String[] names = {"comma", "tab", "pipe", "semicolon", "colon"};
         for (int i = 0; i < symbols.length; i++) {
             CHAR_TO_STRING_DELIMITER_MAP.put(symbols[i], names[i]);
             STRING_TO_CHAR_DELIMITER_MAP.put(names[i], symbols[i]);
         }
     }

     private static final Set<MediaType> SUPPORTED_TYPES =
             Collections.unmodifiableSet(
                     new HashSet<>(Arrays.asList(CSV, TSV, MediaType.TEXT_PLAIN)));

     // Candidate delimiters handed to the sniffer during detection.
     private char[] delimiters = DEFAULT_DELIMITERS;

     /**
      * Mark limit, counted in characters rather than bytes, that is
      * read from the stream while classifying it as csv, tsv or txt.
      */
     @Field
     private int markLimit = DEFAULT_MARK_LIMIT;


     /**
      * Minimum confidence score required before the evidence is
      * considered sufficient to decide csv/tsv vs. txt.
      */
     @Field
     private double minConfidence = 0.50;

     /**
      * Creates a parser through the no-argument superclass constructor.
      */
     public TextAndCSVParser() {
         // the implicit super() call is all that is needed here
     }

     /**
      * Creates a parser and hands the given encoding detector to the superclass.
      *
      * @param encodingDetector detector passed through to the superclass constructor
      */
     public TextAndCSVParser(EncodingDetector encodingDetector) {
         super(encodingDetector);
     }

     /**
      * Returns the fixed, unmodifiable set of media types handled by this parser:
      * csv, tsv and plain text.
      *
      * @param context parse context; not consulted by this implementation
      * @return the supported media types
      */
     @Override
     public Set<MediaType> getSupportedTypes(ParseContext context) {
         return SUPPORTED_TYPES;
     }

     /**
      * Parses the stream either as a delimited table or as plain text.
      * <p>
      * Parameters come from the user override in the metadata when present; anything
      * missing is filled in by {@code detect}. When the resolved type is csv/tsv and a
      * delimiter is known, each record becomes a {@code tr} of {@code td} cells inside a
      * {@code table}; otherwise the content is emitted as text.
      * </p>
      * <p>
      * The caller's stream is never closed by this method: every reader created here or
      * in {@code detect} wraps the stream in a {@link CloseShieldInputStream}.
      * </p>
      *
      * @param stream   the content to parse
      * @param handler  receives the XHTML SAX events
      * @param metadata read for overrides/content type and updated with the results
      * @param context  parse context forwarded to encoding detection
      * @throws IOException   if reading the stream fails
      * @throws SAXException  if the handler rejects an event
      * @throws TikaException if the csv parser fails part-way through; the remaining
      *                       readable content is emitted as text first
      */
     @Override
     public void parse(InputStream stream, ContentHandler handler,
                       Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException {

         CSVParams params = getOverride(metadata);
         Reader reader;
         Charset charset;
         if (!params.isComplete()) {
             reader = detect(params, stream, metadata, context);
             if (params.getCharset() != null) {
                 charset = params.getCharset();
             } else {
                 // detect() returns an AutoDetectReader whenever it leaves the charset unset
                 charset = ((AutoDetectReader) reader).getCharset();
             }
         } else {
             charset = params.getCharset();
             // Shield the caller's stream: the csv parser below closes its reader,
             // and that must not propagate to a stream this method does not own.
             reader = new BufferedReader(new InputStreamReader(
                     new CloseShieldInputStream(stream), charset));
         }

         updateMetadata(params, metadata);

         //if text or a non-csv/tsv category of text
         //treat this as text and be done
         //TODO -- if it was detected as a non-csv subtype of text
         //A csv/tsv type without a delimiter cannot be tokenized (and would fail on
         //unboxing below), so it is handled as text as well.
         if (!isCSVOrTSV(params.getMediaType()) || params.getDelimiter() == null) {
             handleText(reader, charset, handler, metadata);
             return;
         }

         CSVFormat csvFormat = CSVFormat.EXCEL.withDelimiter(params.getDelimiter());
         //delimiters without an entry in the name map yield a null value here
         metadata.set(DELIMITER_PROPERTY, CHAR_TO_STRING_DELIMITER_MAP.get(csvFormat.getDelimiter()));

         XHTMLContentHandler xhtmlContentHandler = new XHTMLContentHandler(handler, metadata);
         try (org.apache.commons.csv.CSVParser commonsParser =
                      new org.apache.commons.csv.CSVParser(reader, csvFormat)) {
             xhtmlContentHandler.startDocument();
             xhtmlContentHandler.startElement(TABLE);
             try {
                 for (CSVRecord row : commonsParser) {
                     xhtmlContentHandler.startElement(TR);
                     for (String cell : row) {
                         xhtmlContentHandler.startElement(TD);
                         xhtmlContentHandler.characters(cell);
                         xhtmlContentHandler.endElement(TD);
                     }
                     xhtmlContentHandler.endElement(TR);
                 }
             } catch (IllegalStateException e) {
                 //if there's a parse exception
                 //try to get the rest of the content...treat it as text for now
                 //There will be some content lost because of buffering.
                 //TODO -- figure out how to improve this
                 xhtmlContentHandler.endElement(TABLE);
                 xhtmlContentHandler.startElement("div", "name", "after exception");
                 handleText(reader, xhtmlContentHandler);
                 xhtmlContentHandler.endElement("div");
                 xhtmlContentHandler.endDocument();
                 //TODO -- consider dumping what's left in the reader as text
                 throw new TikaException("exception parsing the csv", e);
             }

             xhtmlContentHandler.endElement(TABLE);
             xhtmlContentHandler.endDocument();
         }
     }

     /**
      * Emits the reader's content as a complete XHTML document holding one paragraph,
      * after recording the content type (with charset) and encoding in the metadata.
      *
      * @param reader   source of the characters to emit
      * @param charset  charset attached to the content type and stored as the encoding
      * @param handler  receives the XHTML SAX events
      * @param metadata read for an existing content type, then updated
      */
     private void handleText(Reader reader, Charset charset,
                             ContentHandler handler, Metadata metadata)
             throws SAXException, IOException, TikaException {
         // Keep an already-recorded content type when it parses; it could be a subclass
         // of text/plain such as vcal, etc. Otherwise fall back to text/plain.
         final String recorded = metadata.get(Metadata.CONTENT_TYPE);
         final MediaType parsed = (recorded == null) ? null : MediaType.parse(recorded);
         final MediaType base = (parsed == null) ? MediaType.TEXT_PLAIN : parsed;

         metadata.set(Metadata.CONTENT_TYPE, new MediaType(base, charset).toString());
         // deprecated, see TIKA-431
         metadata.set(Metadata.CONTENT_ENCODING, charset.name());

         final XHTMLContentHandler document = new XHTMLContentHandler(handler, metadata);
         document.startDocument();
         handleText(reader, document);
         document.endDocument();
     }

     /**
      * Drains the reader into a single {@code p} element, forwarding the characters
      * in chunks of up to 4096 until the reader reports end of input.
      *
      * @param reader source of the characters; read to exhaustion but not closed here
      * @param xhtml  handler that receives the paragraph
      */
     private static void handleText(Reader reader, XHTMLContentHandler xhtml)
             throws SAXException, IOException {
         xhtml.startElement("p");
         final char[] chunk = new char[4096];
         for (int count = reader.read(chunk); count != -1; count = reader.read(chunk)) {
             xhtml.characters(chunk, 0, count);
         }
         xhtml.endElement("p");

     }

     /**
      * Fills in whichever of charset, media type and delimiter are still missing from
      * {@code params} and returns a reader over the stream.
      * <p>
      * If the metadata already names a content type outside csv/tsv/plain text, that type
      * is recorded and no sniffing happens. Otherwise the charset is auto-detected when
      * absent, and the sniffer is run when no delimiter is known and the media type is
      * either unknown or csv/tsv.
      * </p>
      *
      * @param params   parameters gathered so far; updated in place
      * @param stream   the content; wrapped in a close shield by every reader built here
      * @param metadata read for the content type and passed to detector and sniffer
      * @param context  used to look up the encoding detector
      * @return an {@link AutoDetectReader} when the charset had to be detected or the
      *         content type is unsupported, otherwise a {@link BufferedReader}
      */
     private Reader detect(CSVParams params, InputStream stream,
                         Metadata metadata, ParseContext context) throws IOException, TikaException {
         //if the file was already identified as not .txt, .csv or .tsv
         //don't even try to csv or not
         String mediaString = metadata.get(Metadata.CONTENT_TYPE);
         if (mediaString != null) {
             MediaType mediaType = MediaType.parse(mediaString);
             //an unparseable content type yields null; treat it like an absent one
             //and fall through to normal detection instead of failing
             if (mediaType != null && !SUPPORTED_TYPES.contains(mediaType.getBaseType())) {
                 params.setMediaType(mediaType);
                 return new AutoDetectReader(
                         new CloseShieldInputStream(stream),
                         metadata, getEncodingDetector(context));
             }
         }
         Reader reader;
         if (params.getCharset() == null) {
             AutoDetectReader autoDetectReader = new AutoDetectReader(
                     new CloseShieldInputStream(stream),
                     metadata, getEncodingDetector(context));
             params.setCharset(autoDetectReader.getCharset());
             reader = autoDetectReader;
             if (params.isComplete()) {
                 return reader;
             }
         } else {
             reader = new BufferedReader(new InputStreamReader(
                     new CloseShieldInputStream(stream), params.getCharset()));
         }

         if (params.getDelimiter() == null &&
                 (params.getMediaType() == null ||
                         isCSVOrTSV(params.getMediaType()))) {

             CSVSniffer sniffer = new CSVSniffer(markLimit, delimiters, minConfidence);
             CSVResult result = sniffer.getBest(reader, metadata);
             params.setMediaType(result.getMediaType());
             params.setDelimiter(result.getDelimiter());
         }
         return reader;
     }

     /**
      * Builds the initial parameters from the user-supplied content type override.
      * <p>
      * The override is parsed as a media type; its {@code charset} parameter and, for
      * csv/tsv types only, its {@code delimiter} parameter are extracted. A delimiter may
      * be given by name (comma, tab, pipe, semicolon, colon) or as a single literal
      * character. Anything missing or unusable is left unset for detection to fill in.
      * </p>
      *
      * @param metadata metadata that may carry the override
      * @return parameters populated from the override; empty when there is no override
      *         or it cannot be parsed
      */
     private CSVParams getOverride(Metadata metadata) {
         String override = metadata.get(TikaCoreProperties.CONTENT_TYPE_USER_OVERRIDE);
         if (override == null) {
             return new CSVParams();
         }
         MediaType mediaType = MediaType.parse(override);
         if (mediaType == null) {
             return new CSVParams();
         }
         String charsetString = mediaType.getParameters().get(CHARSET);
         Charset charset = null;
         if (charsetString != null) {
             try {
                 charset = Charset.forName(charsetString);
             } catch (IllegalArgumentException ignored) {
                 // Deliberately ignored: covers both UnsupportedCharsetException and
                 // IllegalCharsetNameException. A bad charset in the override leaves
                 // the charset unset so that detection chooses one instead.
             }
         }
         if (! isCSVOrTSV(mediaType)) {
             return new CSVParams(mediaType, charset);
         }

         String delimiterString = mediaType.getParameters().get(DELIMITER);
         if (delimiterString == null) {
             return new CSVParams(mediaType, charset);
         }
         // the name map holds no null values, so a null result means "not a known name"
         Character namedDelimiter = STRING_TO_CHAR_DELIMITER_MAP.get(delimiterString);
         if (namedDelimiter != null) {
             return new CSVParams(mediaType, charset, namedDelimiter.charValue());
         }
         if (delimiterString.length() == 1) {
             return new CSVParams(mediaType, charset, delimiterString.charAt(0));
         }
         //TODO: log bad/unrecognized delimiter string
         return new CSVParams(mediaType, charset);
     }

     /**
      * Tells whether the base type of the given media type is csv or tsv.
      *
      * @param mediaType type to test; may be {@code null}
      * @return {@code true} for csv or tsv, {@code false} otherwise, including for {@code null}
      */
     static boolean isCSVOrTSV(MediaType mediaType) {
         if (mediaType == null) {
             return false;
         }
         final MediaType base = mediaType.getBaseType();
         return base.equals(TSV) || base.equals(CSV);
     }

     /**
      * Writes the resolved content type, with charset and delimiter attributes, and the
      * content encoding into the metadata.
      * <p>
      * The type is text/plain when the params say so; otherwise tsv or csv depending on
      * the delimiter; otherwise whatever content type the metadata already holds. If none
      * of these produces a type, the base type from the params is used, and text/plain
      * as a last resort, so that this method never fails on a missing type.
      * </p>
      *
      * @param params   resolved media type, charset and delimiter
      * @param metadata metadata to update
      */
     private void updateMetadata(CSVParams params, Metadata metadata) {
         MediaType declared = params.getMediaType();
         Character delimiter = params.getDelimiter();

         MediaType mediaType = null;
         if (declared != null && declared.getBaseType().equals(MediaType.TEXT_PLAIN)) {
             mediaType = MediaType.TEXT_PLAIN;
         } else if (delimiter != null) {
             mediaType = (delimiter == '\t') ? TSV : CSV;
         } else {
             String existing = metadata.get(Metadata.CONTENT_TYPE);
             if (existing != null) {
                 mediaType = MediaType.parse(existing);
             }
         }
         if (mediaType == null) {
             // no usable content type in the metadata: fall back rather than hit a
             // NullPointerException below
             mediaType = (declared != null) ? declared.getBaseType() : MediaType.TEXT_PLAIN;
         }

         Map<String, String> attrs = new HashMap<>();
         if (params.getCharset() != null) {
             attrs.put(CHARSET, params.getCharset().name());
             // deprecated, see TIKA-431
             metadata.set(Metadata.CONTENT_ENCODING, params.getCharset().name());
         }
         if (!mediaType.equals(MediaType.TEXT_PLAIN) && delimiter != null) {
             String delimiterName = CHAR_TO_STRING_DELIMITER_MAP.get(delimiter);
             if (delimiterName != null) {
                 attrs.put(DELIMITER, delimiterName);
             } else {
                 // delimiters without a name are recorded as their numeric character code
                 attrs.put(DELIMITER, Integer.toString((int) delimiter.charValue()));
             }
         }
         MediaType type = new MediaType(mediaType, attrs);
         metadata.set(Metadata.CONTENT_TYPE, type.toString());
     }

 }
	/*
	* Licensed to the Apache Software Foundation (ASF) under one or more
	* contributor license agreements. See the NOTICE file distributed with
	* this work for additional information regarding copyright ownership.
	* The ASF licenses this file to You under the Apache License, Version 2.0
	* (the "License"); you may not use this file except in compliance with
	* the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/
	package org.apache.tika.parser.csv;

	import java.io.BufferedReader;
	import java.io.IOException;
	import java.io.InputStream;
	import java.io.InputStreamReader;
	import java.io.Reader;
	import java.nio.charset.Charset;
	import java.nio.charset.UnsupportedCharsetException;
	import java.util.Arrays;
	import java.util.Collections;
	import java.util.HashMap;
	import java.util.HashSet;
	import java.util.Map;
	import java.util.Set;

	import org.apache.commons.csv.CSVFormat;
	import org.apache.commons.csv.CSVRecord;
	import org.apache.commons.io.input.CloseShieldInputStream;
	import org.apache.tika.config.Field;
	import org.apache.tika.detect.AutoDetectReader;
	import org.apache.tika.detect.EncodingDetector;
	import org.apache.tika.exception.TikaException;
	import org.apache.tika.metadata.Metadata;
	import org.apache.tika.metadata.Property;
	import org.apache.tika.metadata.TikaCoreProperties;
	import org.apache.tika.mime.MediaType;
	import org.apache.tika.parser.AbstractEncodingDetectorParser;
	import org.apache.tika.parser.ParseContext;
	import org.apache.tika.sax.XHTMLContentHandler;
	import org.xml.sax.ContentHandler;
	import org.xml.sax.SAXException;

	/**
	* Unless the {@link TikaCoreProperties#CONTENT_TYPE_USER_OVERRIDE} is set,
	* this parser tries to assess whether the file is a text file, csv or tsv.
	* If the detector detects regularity in column numbers and/or encapsulated cells,
	* this parser will apply the {@link org.apache.commons.csv.CSVParser};
	* otherwise, it will treat the contents as text.
	* <p>
	* If there is a csv parse exception during detection, the parser sets
	* the {@link Metadata#CONTENT_TYPE} to {@link MediaType#TEXT_PLAIN}
	* and treats the file as {@link MediaType#TEXT_PLAIN}.
	* </p>
	* <p>
	* If there is a csv parse exception during the parse, the parser
	* writes what's left of the stream as if it were text and then throws
	* an exception. As of this writing, the content that was buffered by the underlying
	* {@link org.apache.commons.csv.CSVParser} is lost.
	* </p>
	*/
	public class TextAndCSVParser extends AbstractEncodingDetectorParser {

	private static final String CSV_PREFIX = "csv";
	private static final String CHARSET = "charset";
	private static final String DELIMITER = "delimiter";
	public static final Property DELIMITER_PROPERTY = Property.externalText(
	CSV_PREFIX + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER+DELIMITER);

	private static final String TD = "td";
	private static final String TR = "tr";
	private static final String TABLE = "table";

	static final MediaType CSV = MediaType.text("csv");
	static final MediaType TSV = MediaType.text("tsv");

	private static final int DEFAULT_MARK_LIMIT = 20000;

	private static final char[] DEFAULT_DELIMITERS = new char[]{',', '\t'};

	private static final Map<Character, String> CHAR_TO_STRING_DELIMITER_MAP = new HashMap<>();
	private static final Map<String, Character> STRING_TO_CHAR_DELIMITER_MAP = new HashMap<>();

	static {
	CHAR_TO_STRING_DELIMITER_MAP.put(',', "comma");
	CHAR_TO_STRING_DELIMITER_MAP.put('\t', "tab");
	CHAR_TO_STRING_DELIMITER_MAP.put('\|', "pipe");
	CHAR_TO_STRING_DELIMITER_MAP.put(';', "semicolon");
	CHAR_TO_STRING_DELIMITER_MAP.put(':', "colon");
	}

	static {
	for (Map.Entry<Character, String> e : CHAR_TO_STRING_DELIMITER_MAP.entrySet()) {
	STRING_TO_CHAR_DELIMITER_MAP.put(e.getValue(), e.getKey());
	}
	}

	private static final Set<MediaType> SUPPORTED_TYPES =
	Collections.unmodifiableSet(new HashSet<MediaType>(Arrays.asList(
	CSV, TSV, MediaType.TEXT_PLAIN)));

	private char[] delimiters = DEFAULT_DELIMITERS;

	/**
	* This is the mark limit in characters (not bytes) to
	* read from the stream when classifying the stream as
	* csv, tsv or txt.
	*/
	@Field
	private int markLimit = DEFAULT_MARK_LIMIT;


	/**
	* minimum confidence score that there's enough
	* evidence to determine csv/tsv vs. txt
	*/
	@Field
	private double minConfidence = 0.50;

	public TextAndCSVParser() {
	super();
	}

	public TextAndCSVParser(EncodingDetector encodingDetector) {
	super(encodingDetector);
	}

	@Override
	public Set<MediaType> getSupportedTypes(ParseContext context) {
	return SUPPORTED_TYPES;
	}

	@Override
	public void parse(InputStream stream, ContentHandler handler,
	Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException {

	CSVParams params = getOverride(metadata);
	Reader reader = null;
	Charset charset = null;
	if (! params.isComplete()) {
	reader = detect(params, stream, metadata, context);
	if (params.getCharset() != null) {
	charset = params.getCharset();
	} else {
	charset = ((AutoDetectReader) reader).getCharset();
	}
	} else {
	reader = new BufferedReader(new InputStreamReader(stream, params.getCharset()));
	charset = params.getCharset();
	}

	updateMetadata(params, metadata);

	//if text or a non-csv/tsv category of text
	//treat this as text and be done
	//TODO -- if it was detected as a non-csv subtype of text
	if (! params.getMediaType().getBaseType().equals(CSV) &&
	! params.getMediaType().getBaseType().equals(TSV)) {
	handleText(reader, charset, handler, metadata);
	return;
	}

	CSVFormat csvFormat = CSVFormat.EXCEL.withDelimiter(params.getDelimiter());
	metadata.set(DELIMITER_PROPERTY, CHAR_TO_STRING_DELIMITER_MAP.get(csvFormat.getDelimiter()));

	XHTMLContentHandler xhtmlContentHandler = new XHTMLContentHandler(handler, metadata);
	try (org.apache.commons.csv.CSVParser commonsParser =
	new org.apache.commons.csv.CSVParser(reader, csvFormat)) {
	xhtmlContentHandler.startDocument();
	xhtmlContentHandler.startElement(TABLE);
	try {
	for (CSVRecord row : commonsParser) {
	xhtmlContentHandler.startElement(TR);
	for (String cell : row) {
	xhtmlContentHandler.startElement(TD);
	xhtmlContentHandler.characters(cell);
	xhtmlContentHandler.endElement(TD);
	}
	xhtmlContentHandler.endElement(TR);
	}
	} catch (IllegalStateException e) {
	//if there's a parse exception
	//try to get the rest of the content...treat it as text for now
	//There will be some content lost because of buffering.
	//TODO -- figure out how to improve this
	xhtmlContentHandler.endElement(TABLE);
	xhtmlContentHandler.startElement("div", "name", "after exception");
	handleText(reader, xhtmlContentHandler);
	xhtmlContentHandler.endElement("div");
	xhtmlContentHandler.endDocument();
	//TODO -- consider dumping what's left in the reader as text
	throw new TikaException("exception parsing the csv", e);
	}

	xhtmlContentHandler.endElement(TABLE);
	xhtmlContentHandler.endDocument();
	}
	}

	private void handleText(Reader reader, Charset charset,
	ContentHandler handler, Metadata metadata)
	throws SAXException, IOException, TikaException {
	// Automatically detect the character encoding
	//try to get detected content type; could be a subclass of text/plain
	//such as vcal, etc.
	String incomingMime = metadata.get(Metadata.CONTENT_TYPE);
	MediaType mediaType = MediaType.TEXT_PLAIN;
	if (incomingMime != null) {
	MediaType tmpMediaType = MediaType.parse(incomingMime);
	if (tmpMediaType != null) {
	mediaType = tmpMediaType;
	}
	}
	MediaType type = new MediaType(mediaType, charset);
	metadata.set(Metadata.CONTENT_TYPE, type.toString());
	// deprecated, see TIKA-431
	metadata.set(Metadata.CONTENT_ENCODING, charset.name());

	XHTMLContentHandler xhtml =
	new XHTMLContentHandler(handler, metadata);
	xhtml.startDocument();
	handleText(reader, xhtml);
	xhtml.endDocument();
	}

	private static void handleText(Reader reader, XHTMLContentHandler xhtml)
	throws SAXException, IOException {
	xhtml.startElement("p");
	char[] buffer = new char[4096];
	int n = reader.read(buffer);
	while (n != -1) {
	xhtml.characters(buffer, 0, n);
	n = reader.read(buffer);
	}
	xhtml.endElement("p");

	}

	private Reader detect(CSVParams params, InputStream stream,
	Metadata metadata, ParseContext context) throws IOException, TikaException {
	//if the file was already identified as not .txt, .csv or .tsv
	//don't even try to csv or not
	String mediaString = metadata.get(Metadata.CONTENT_TYPE);
	if (mediaString != null) {
	MediaType mediaType = MediaType.parse(mediaString);
	if (! SUPPORTED_TYPES.contains(mediaType.getBaseType())) {
	params.setMediaType(mediaType);
	return new AutoDetectReader(
	new CloseShieldInputStream(stream),
	metadata, getEncodingDetector(context));
	}
	}
	Reader reader = null;
	if (params.getCharset() == null) {
	reader = new AutoDetectReader(
	new CloseShieldInputStream(stream),
	metadata, getEncodingDetector(context));
	params.setCharset(((AutoDetectReader)reader).getCharset());
	if (params.isComplete()) {
	return reader;
	}
	} else {
	reader = new BufferedReader(new InputStreamReader(
	new CloseShieldInputStream(stream), params.getCharset()));
	}

	if (params.getDelimiter() == null &&
	(params.getMediaType() == null \|\|
	isCSVOrTSV(params.getMediaType()))) {

	CSVSniffer sniffer = new CSVSniffer(markLimit, delimiters, minConfidence);
	CSVResult result = sniffer.getBest(reader, metadata);
	params.setMediaType(result.getMediaType());
	params.setDelimiter(result.getDelimiter());
	}
	return reader;
	}

	private CSVParams getOverride(Metadata metadata) {
	String override = metadata.get(TikaCoreProperties.CONTENT_TYPE_USER_OVERRIDE);
	if (override == null) {
	return new CSVParams();
	}
	MediaType mediaType = MediaType.parse(override);
	if (mediaType == null) {
	return new CSVParams();
	}
	String charsetString = mediaType.getParameters().get(CHARSET);
	Charset charset = null;
	if (charsetString != null) {
	try {
	charset = Charset.forName(charsetString);
	} catch (UnsupportedCharsetException e) {

	}
	}
	if (! isCSVOrTSV(mediaType)) {
	return new CSVParams(mediaType, charset);
	}

	String delimiterString = mediaType.getParameters().get(DELIMITER);
	if (delimiterString == null) {
	return new CSVParams(mediaType, charset);
	}
	if (STRING_TO_CHAR_DELIMITER_MAP.containsKey(delimiterString)) {
	return new CSVParams(mediaType, charset, (char) STRING_TO_CHAR_DELIMITER_MAP.get(delimiterString));
	}
	if (delimiterString.length() == 1) {
	return new CSVParams(mediaType, charset, delimiterString.charAt(0));
	}
	//TODO: log bad/unrecognized delimiter string
	return new CSVParams(mediaType, charset);
	}

	static boolean isCSVOrTSV(MediaType mediaType) {
	if (mediaType == null) {
	return false;
	}
	if (mediaType.getBaseType().equals(TSV) \|\|
	mediaType.getBaseType().equals(CSV)) {
	return true;
	}
	return false;
	}

	private void updateMetadata(CSVParams params, Metadata metadata) {
	MediaType mediaType = null;
	if (params.getMediaType().getBaseType().equals(MediaType.TEXT_PLAIN)) {
	mediaType = MediaType.TEXT_PLAIN;
	} else if (params.getDelimiter() != null) {
	if (params.getDelimiter() == '\t') {
	mediaType = TSV;
	} else {
	mediaType = CSV;
	}
	} else {
	if (metadata.get(Metadata.CONTENT_TYPE) != null) {
	mediaType = MediaType.parse(
	metadata.get(Metadata.CONTENT_TYPE));
	}
	}
	Map<String, String> attrs = new HashMap<>();
	if (params.getCharset() != null) {
	attrs.put(CHARSET, params.getCharset().name());
	// deprecated, see TIKA-431
	metadata.set(Metadata.CONTENT_ENCODING, params.getCharset().name());
	}
	if (!mediaType.equals(MediaType.TEXT_PLAIN) && params.getDelimiter() != null) {
	if (CHAR_TO_STRING_DELIMITER_MAP.containsKey(params.getDelimiter())) {
	attrs.put(DELIMITER, CHAR_TO_STRING_DELIMITER_MAP.get(params.getDelimiter()));
	} else {
	attrs.put(DELIMITER, Integer.toString((int)params.getDelimiter().charValue()));
	}
	}
	MediaType type = new MediaType(mediaType, attrs);
	metadata.set(Metadata.CONTENT_TYPE, type.toString());
	}

	}