/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.tika.parser.csv;

import java.io.BufferedReader;
import java.io.EOFException;
import java.io.IOException;
import java.io.PushbackReader;
import java.io.Reader;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import org.apache.commons.io.input.ProxyReader;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;
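
/**
 * Sniffs a character stream to estimate whether it is delimited text
 * (CSV/TSV) and, if so, which delimiter is most likely. Each candidate
 * delimiter is tried against the same marked span of the reader by a
 * Snifflet, which parses up to markLimit characters under that hypothesis
 * and scores the result.
 * <p>
 * A minimal usage sketch (assuming ',' and '\t' as the candidate delimiters):
 * <pre>
 * CSVSniffer sniffer = new CSVSniffer(new char[]{',', '\t'});
 * CSVResult best = sniffer.getBest(reader, new Metadata());
 * </pre>
 */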
class CSVSniffer {
static final int EOF = -1;
static final int NEW_LINE = '\n';
static final int CARRIAGE_RETURN = '\r';
private static final int DEFAULT_MARK_LIMIT = 10000;
private static final double DEFAULT_MIN_CONFIDENCE = 0.50;
private static final int PUSH_BACK = 2;
private static final int SPACE = ' ';
private final char[] delimiters;
private final int markLimit;
private final double minConfidence;
CSVSniffer(char[] delimiters) {
this(DEFAULT_MARK_LIMIT, delimiters, DEFAULT_MIN_CONFIDENCE);
}
CSVSniffer(int markLimit, char[] delimiters, double minConfidence) {
this.markLimit = markLimit;
this.delimiters = delimiters;
this.minConfidence = minConfidence;
}
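/**
 * Runs one detection pass per candidate delimiter against the same content
 * (the reader is marked and reset around each pass) and returns the results
 * sorted best-first.
 *
 * @param reader reader to sniff; wrapped in a BufferedReader if it does not
 * support mark/reset
 * @return one result per candidate delimiter, sorted best-first
 * @throws IOException if there is an underlying IOException while reading
 */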
List<CSVResult> sniff(Reader reader) throws IOException {
if (!reader.markSupported()) {
reader = new BufferedReader(reader);
}
List<CSVResult> ret = new ArrayList<>();
for (char delimiter : delimiters) {
reader.mark(markLimit);
try {
CSVResult result = new Snifflet(delimiter).sniff(reader);
ret.add(result);
} finally {
reader.reset();
}
}
Collections.sort(ret);
return ret;
}
/**
 * Runs {@link #sniff(Reader)} and returns the most confident result.
 *
 * @param reader reader to sniff
 * @param metadata metadata for the stream (not currently taken into account)
 * @return the best result given the detection results or {@link CSVResult#TEXT}
 * if the confidence is not above the minimum confidence threshold.
 * @throws IOException if there is an underlying IOException while reading
 */
CSVResult getBest(Reader reader, Metadata metadata) throws IOException {
//TODO: take into consideration the filename. Perhaps require
//a higher confidence if detection contradicts filename?
List<CSVResult> results = sniff(reader);
if (results == null || results.size() == 0) {
return CSVResult.TEXT;
}
CSVResult bestResult = results.get(0);
if (bestResult.getConfidence() < minConfidence) {
return CSVResult.TEXT;
}
return bestResult;
}
private static class UnsurprisingEOF extends EOFException {
}
private static class HitMarkLimitException extends EOFException {
}
private static class MutableInt {
int i;
MutableInt(int i) {
this.i = i;
}
void increment() {
i++;
}
int intValue() {
return i;
}
}
//inner class that tests a single hypothesis/combination
//of parameters for delimiter and quote character
//this will throw an EOF before reading beyond the
//markLimit number of characters (not bytes!)
private class Snifflet {
private final char delimiter;
//hardcode this for now
private final char quoteCharacter = '"';
Map<Integer, MutableInt> rowLengthCounts = new HashMap<>();
int charsRead = 0;
int colCount = 0;
int encapsulated = 0; //number of cells that are encapsulated in dquotes (for now)
boolean parseException = false;
public Snifflet(char delimiter) {
this.delimiter = delimiter;
}
CSVResult sniff(Reader r) throws IOException {
boolean eof = false;
boolean hitMarkLimit = false;
int lastC = -1;
StringBuilder unquoted = new StringBuilder();
try (PushbackReader reader = new PushbackReader(new CloseShieldReader(r), PUSH_BACK)) {
int c = read(reader);
while (c != EOF) {
if (c == quoteCharacter) {
handleUnquoted(unquoted);
//test to make sure there isn't an unencapsulated quote character
// in the middle of a cell
if (lastC > -1 && lastC != delimiter && lastC != NEW_LINE &&
lastC != CARRIAGE_RETURN) {
parseException = true;
return calcResult();
}
//TODO: test to make sure cell doesn't start with escaped
// ""the quick brown cat"
boolean correctlyEncapsulated = consumeQuoted(reader, quoteCharacter);
if (!correctlyEncapsulated) {
parseException = true;
return calcResult();
}
} else if (c == delimiter) {
handleUnquoted(unquoted);
endColumn();
consumeSpaceCharacters(reader);
} else if (c == NEW_LINE || c == CARRIAGE_RETURN) {
if (unquoted.length() > 0) {
endColumn();
}
handleUnquoted(unquoted);
endRow();
consumeNewLines(reader);
} else {
unquoted.append((char) c);
}
lastC = c;
c = read(reader);
}
} catch (HitMarkLimitException e) {
hitMarkLimit = true;
} catch (UnsurprisingEOF e) {
//totally ignore
} catch (EOFException e) {
//the consume* throw this to avoid
//having to check -1 every time and
//having to rely on potentially wonky
//inputstreams not consistently returning -1
//after hitting EOF and returning the first -1.
//Yes. That's a thing.
eof = true;
} finally {
r.reset();
}
//if you've hit the marklimit or an eof on a truncated file
//don't add the last row's info
if (!hitMarkLimit && !eof && lastC != NEW_LINE && lastC != CARRIAGE_RETURN) {
handleUnquoted(unquoted);
endColumn();
endRow();
}
return calcResult();
}
private CSVResult calcResult() {
double confidence = getConfidence();
MediaType mediaType = TextAndCSVParser.CSV;
if (delimiter == '\t') {
mediaType = TextAndCSVParser.TSV;
}
return new CSVResult(confidence, mediaType, delimiter);
}
private void handleUnquoted(StringBuilder unquoted) {
if (unquoted.length() > 0) {
unquoted(unquoted.toString());
unquoted.setLength(0);
}
}
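//skip any run of spaces (e.g. after a delimiter) and push back the
//first non-space character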
void consumeSpaceCharacters(PushbackReader reader) throws IOException {
int c = read(reader);
while (c == SPACE) {
c = read(reader);
}
if (c == EOF) {
throw new UnsurprisingEOF();
}
unread(reader, c);
}
/**
 * Consumes an encapsulated cell, starting immediately after the opening
 * quote character.
 *
 * @param reader reader to consume
 * @param quoteCharacter the quote character in use
 * @return whether or not this was a correctly encapsulated cell
 * @throws UnsurprisingEOF if the file ended immediately after the close quote
 * @throws EOFException if the file ended in the middle of the encapsulated section
 * @throws IOException on other IOExceptions
 */
boolean consumeQuoted(PushbackReader reader, int quoteCharacter) throws IOException {
//this currently assumes excel "escaping" of double quotes:
//'the " quick' -> "the "" quick"
//we can make this more interesting later with other
//escaping options
int c = read(reader);
while (c != EOF) {
if (c == quoteCharacter) {
int nextC = read(reader);
if (nextC == EOF) {
encapsulated++;
endColumn();
throw new UnsurprisingEOF();
} else if (nextC != quoteCharacter) {
encapsulated++;
endColumn();
unread(reader, nextC);
consumeSpaceCharacters(reader);
//now make sure that the next character is eof, \r\n
//or a delimiter
nextC = read(reader);
if (nextC == EOF) {
throw new UnsurprisingEOF();
} else if (nextC == NEW_LINE || nextC == CARRIAGE_RETURN) {
unread(reader, nextC);
return true;
} else if (nextC != delimiter) {
unread(reader, nextC);
return false;
}
unread(reader, nextC);
return true;
}
}
c = read(reader);
}
throw new EOFException();
}
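//single point of reading: throws HitMarkLimitException once markLimit
//characters have been consumed so that the later reset() calls still succeed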
private int read(PushbackReader reader) throws IOException {
if (charsRead >= markLimit - 1) {
throw new HitMarkLimitException();
}
int c = reader.read();
if (c == EOF) {
return EOF;
}
charsRead++;
return c;
}
private void unread(PushbackReader reader, int c) throws IOException {
if (c != EOF) {
reader.unread(c);
charsRead--;
}
}
//consume all consecutive '\r\n' in any order
void consumeNewLines(PushbackReader reader) throws IOException {
int c = read(reader);
while (c == NEW_LINE || c == CARRIAGE_RETURN) {
c = read(reader);
}
if (c == EOF) {
throw new EOFException();
}
unread(reader, c);
}
void endColumn() {
colCount++;
}
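//tally how many rows ended with this column count; the distribution
//feeds calculateColumnCountConsistency()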
void endRow() {
MutableInt cnt = rowLengthCounts.get(colCount);
if (cnt == null) {
cnt = new MutableInt(1);
rowLengthCounts.put(colCount, cnt);
} else {
cnt.increment();
}
colCount = 0;
}
void unquoted(String string) {
//TODO -- do some analysis to make sure you don't have
//large tokens like 2,3,2,3,2,3,
}
double getConfidence() {
double confidence = 0.0;
if (parseException) {
return -1.0;
}
//TODO -- add tests for long tokens containing
//other delimiters, e.g. the,quick,brown,fox as a token
//when testing '\t'
double colCountConsistencyConf = calculateColumnCountConsistency();
if (colCountConsistencyConf > -1.0) {
confidence = colCountConsistencyConf;
}
//the idea is that if there are a bunch of encapsulated
//cells, then that should outweigh column length inconsistency
//this particular formula offers a small initial increase
//that eventually approaches 1.0
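//e.g. encapsulated = 1 -> 0.0, 10 -> ~0.37, 100 -> ~0.60, 1000 -> ~0.75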
double encapsulatedBonus = 0;
if (encapsulated > 0) {
encapsulatedBonus = 1.0 - (1.0d / Math.pow(encapsulated, 0.2));
}
return Math.min(confidence + encapsulatedBonus, 1.0);
}
private double calculateColumnCountConsistency() {
int max = -1;
int totalRows = 0;
//find the most common row
for (Map.Entry<Integer, MutableInt> e : rowLengthCounts.entrySet()) {
int numCols = e.getKey();
int count = e.getValue().intValue();
//require that numCols > 1 so that you had at least
//one delimiter in that row
if (numCols > 1 && count > max) {
max = count;
}
totalRows += count;
}
//if there's not enough info
if (max < 0 || totalRows < 3) {
return 0.0;
}
//TODO: convert this to continuous vs vague heuristic step function
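//with perfect consistency (max == totalRows), the damping factor alone
//yields ~0.28 for 3 rows, ~0.50 for 10 rows and ~0.75 for 100 rows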
double consistency = (double) max / (double) totalRows;
return ((1d - (1d / Math.pow(totalRows, 0.3))) * consistency);
}
}
private static class CloseShieldReader extends ProxyReader {
public CloseShieldReader(Reader r) {
super(r);
}
@Override
public void close() throws IOException {
//do nothing
}
}
}