blob: 192681de27e7640fb05e56391eaa88752ca61c45 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.tika.parser.csv;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertNull;
import static org.junit.Assert.fail;
import java.io.ByteArrayInputStream;
import java.io.InputStream;
import java.nio.charset.StandardCharsets;
import java.util.HashMap;
import java.util.Map;
import org.apache.commons.io.ByteOrderMark;
import org.apache.tika.TikaTest;
import org.apache.tika.config.TikaConfig;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.Parser;
import org.junit.BeforeClass;
import org.junit.Test;
public class TextAndCSVParserTest extends TikaTest {
private static byte[] CSV_UTF8 =
("the,quick,brown\tfox\n" +
"jumped \tover,the\tlazy,\tdog\n"+
"and then,ran,down\tthe\tstreet").getBytes(StandardCharsets.UTF_8);
private static byte[] CSV_UTF_16LE =
("the,quick,brown\tfox\n" +
"jumped \tover,the\tlazy,\tdog\n"+
"and then,ran,down\tthe\tstreet").getBytes(StandardCharsets.UTF_16LE);
private static byte[] TSV_UTF8 =
("the\tquick\tbrown,fox\n" +
"jumped ,over\tthe,lazy\t,dog\n"+
"and then\tran\tdown,the,street").getBytes(StandardCharsets.UTF_8);
private static byte[] TSV_UTF_16LE =
("the\tquick\tbrown,fox\n" +
"jumped ,over\tthe,lazy\t,dog\n"+
"and then\tran\tdown,the,street").getBytes(StandardCharsets.UTF_16LE);
private static String EXPECTED_TSV = ("<table><tr> <td>the</td> <td>quick</td> <td>brown,fox</td></tr>\n" +
"<tr> <td>jumped ,over</td> <td>the,lazy</td> <td>,dog</td></tr>\n" +
"<tr> <td>and then</td> <td>ran</td> <td>down,the,street</td></tr>\n" +
"</table>").replaceAll("[\r\n\t ]+", " ");
private static String EXPECTED_CSV = EXPECTED_TSV.replaceAll(",+", " ");
private static Parser PARSER;
@BeforeClass
public static void setUp() throws Exception {
try (InputStream is = Thread.currentThread().getContextClassLoader()
.getResourceAsStream("org/apache/tika/parser/csv/tika-config.xml")) {
PARSER = new AutoDetectParser(new TikaConfig(is));
}
}
@Test
public void testCSV_UTF8() throws Exception {
Metadata metadata = new Metadata();
metadata.set(TikaCoreProperties.RESOURCE_NAME_KEY, "test.csv");
XMLResult xmlResult = getXML(new ByteArrayInputStream(CSV_UTF8), PARSER, metadata);
assertEquals("comma", xmlResult.metadata.get(TextAndCSVParser.DELIMITER_PROPERTY));
assertMediaTypeEquals("csv", "ISO-8859-1","comma",
xmlResult.metadata.get(Metadata.CONTENT_TYPE));
assertContainsIgnoreWhiteSpaceDiffs(EXPECTED_CSV, xmlResult.xml);
}
@Test
public void testCSV_UTF8_TypeOverride() throws Exception {
Metadata metadata = new Metadata();
metadata.set(TikaCoreProperties.CONTENT_TYPE_USER_OVERRIDE, "text/csv; charset=UTF-8");
XMLResult xmlResult = getXML(new ByteArrayInputStream(CSV_UTF8), PARSER, metadata);
assertEquals("comma", xmlResult.metadata.get(TextAndCSVParser.DELIMITER_PROPERTY));
assertMediaTypeEquals("csv", "UTF-8","comma",
xmlResult.metadata.get(Metadata.CONTENT_TYPE));
assertContainsIgnoreWhiteSpaceDiffs(EXPECTED_CSV, xmlResult.xml);
}
@Test
public void testCSV_UTF8_Type() throws Exception {
Metadata metadata = new Metadata();
metadata.set(Metadata.CONTENT_TYPE, "text/csv");
XMLResult xmlResult = getXML(new ByteArrayInputStream(CSV_UTF8), PARSER, metadata);
assertEquals("comma", xmlResult.metadata.get(TextAndCSVParser.DELIMITER_PROPERTY));
assertMediaTypeEquals("csv", "ISO-8859-1","comma",
xmlResult.metadata.get(Metadata.CONTENT_TYPE));
assertContainsIgnoreWhiteSpaceDiffs(EXPECTED_CSV, xmlResult.xml);
}
@Test
public void testCSV_UTF16LE() throws Exception {
Metadata metadata = new Metadata();
metadata.set(TikaCoreProperties.RESOURCE_NAME_KEY, "test.csv");
XMLResult xmlResult = getXML(new ByteArrayInputStream(CSV_UTF_16LE), PARSER, metadata);
assertEquals("comma", xmlResult.metadata.get(TextAndCSVParser.DELIMITER_PROPERTY));
assertMediaTypeEquals("csv", "UTF-16LE","comma",
xmlResult.metadata.get(Metadata.CONTENT_TYPE));
assertContainsIgnoreWhiteSpaceDiffs(EXPECTED_CSV, xmlResult.xml);
}
@Test
public void testCSV_UTF16LE_BOM() throws Exception {
Metadata metadata = new Metadata();
metadata.set(TikaCoreProperties.RESOURCE_NAME_KEY, "test.csv");
XMLResult xmlResult = getXML(new ByteArrayInputStream(
concat(ByteOrderMark.UTF_16LE.getBytes(), CSV_UTF_16LE)), PARSER, metadata);
assertEquals("comma", xmlResult.metadata.get(TextAndCSVParser.DELIMITER_PROPERTY));
assertMediaTypeEquals("csv", "UTF-16LE","comma",
xmlResult.metadata.get(Metadata.CONTENT_TYPE));
assertContainsIgnoreWhiteSpaceDiffs(EXPECTED_CSV, xmlResult.xml);
}
@Test
public void testTSV_UTF8() throws Exception {
Metadata metadata = new Metadata();
metadata.set(TikaCoreProperties.RESOURCE_NAME_KEY, "test.csv");
XMLResult xmlResult = getXML(new ByteArrayInputStream(TSV_UTF8), PARSER, metadata);
assertEquals("tab", xmlResult.metadata.get(TextAndCSVParser.DELIMITER_PROPERTY));
assertMediaTypeEquals("tsv", "ISO-8859-1","tab",
xmlResult.metadata.get(Metadata.CONTENT_TYPE));
assertContainsIgnoreWhiteSpaceDiffs(EXPECTED_TSV, xmlResult.xml);
}
@Test
public void testTSV_UTF16LE() throws Exception {
Metadata metadata = new Metadata();
metadata.set(TikaCoreProperties.RESOURCE_NAME_KEY, "test.csv");
XMLResult xmlResult = getXML(new ByteArrayInputStream(TSV_UTF_16LE), PARSER, metadata);
assertEquals("tab", xmlResult.metadata.get(TextAndCSVParser.DELIMITER_PROPERTY));
assertMediaTypeEquals("tsv", "UTF-16LE","tab",
xmlResult.metadata.get(Metadata.CONTENT_TYPE));
assertContainsIgnoreWhiteSpaceDiffs(EXPECTED_TSV, xmlResult.xml);
}
@Test
public void testBadCsv() throws Exception {
//this causes an IllegalStateException during delimiter detection
//when trying to parse with ','; therefore, the parser backs off to
//treating this as straight text.
//This isn't necessarily the best outcome, but we want to make sure
//that an IllegalStateException during delimiter guessing doesn't
//make the parse fail.
byte[] csv = ("the,quick\n" +
"brown,\"la\"zy\"\n" +
"brown,\"dog\n").getBytes(StandardCharsets.UTF_8);
Metadata metadata = new Metadata();
metadata.set(TikaCoreProperties.RESOURCE_NAME_KEY, "test.csv");
XMLResult xmlResult = getXML(new ByteArrayInputStream(csv), PARSER, metadata);
assertNull(xmlResult.metadata.get(TextAndCSVParser.DELIMITER_PROPERTY));
assertEquals("text/plain; charset=ISO-8859-1", xmlResult.metadata.get(Metadata.CONTENT_TYPE));
assertContains("the,quick", xmlResult.xml);
}
@Test //TIKA-2836
public void testNonCSV() throws Exception {
byte[] bytes = ("testcsv\n" +
"testcsv testcsv;;; testcsv").getBytes(StandardCharsets.UTF_8);
Metadata metadata = new Metadata();
metadata.set(TikaCoreProperties.RESOURCE_NAME_KEY, "test.csv");
XMLResult xmlResult = getXML(new ByteArrayInputStream(bytes), PARSER, metadata);
assertContains("text/plain", xmlResult.metadata.get(Metadata.CONTENT_TYPE));
metadata.set(TikaCoreProperties.RESOURCE_NAME_KEY, "test.txt");
xmlResult = getXML(new ByteArrayInputStream(bytes), PARSER, metadata);
assertContains("text/plain", xmlResult.metadata.get(Metadata.CONTENT_TYPE));
}
@Test
public void testLong() throws Exception {
//test mark/reset worked on the sniffers
StringBuilder sb = new StringBuilder();
for (int rows = 0; rows < 1000; rows++) {
for (int cols = 0; cols < 10; cols++) {
sb.append("2").append(",");
}
sb.append("\n");
}
Metadata metadata = new Metadata();
XMLResult xmlResult = getXML(new ByteArrayInputStream(sb.toString().getBytes(StandardCharsets.UTF_8)), PARSER, metadata);
assertMediaTypeEquals("csv", "ISO-8859-1","comma",
xmlResult.metadata.get(Metadata.CONTENT_TYPE));
}
//TIKA-2047
@Test
public void testSubclassingMimeTypesRemain() throws Exception {
XMLResult r = getXML("testVCalendar.vcs");
assertEquals("text/x-vcalendar; charset=ISO-8859-1", r.metadata.get(Metadata.CONTENT_TYPE));
}
private void assertContainsIgnoreWhiteSpaceDiffs(String expected, String xml) {
assertContains(expected, xml.replaceAll("[\r\n\t ]", " "));
}
private static void assertMediaTypeEquals(String csv, String charset, String delimiter, String mediaTypeString) {
if (mediaTypeString == null) {
fail("media type string must not be null");
}
MediaType expected = mediaType(csv, charset, delimiter);
MediaType observed = MediaType.parse(mediaTypeString);
assertEquals(expected, observed);
}
private static MediaType mediaType(String csv, String charset, String delimiter) {
Map<String, String> attrs = new HashMap<>();
attrs.put("charset", charset);
attrs.put("delimiter", delimiter);
return new MediaType(MediaType.text(csv), attrs);
}
private static byte[] concat(byte[] bytesA, byte[] bytesB) {
byte[] ret = new byte[bytesA.length+bytesB.length];
System.arraycopy(bytesA, 0, ret, 0, bytesA.length);
System.arraycopy(bytesB, 0, ret, bytesA.length, bytesB.length);
return ret;
}
}