blob: 5ed7666143ae1c19134e5a5087b56c35ebba0263 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.tika.parser.txt;
import static java.nio.charset.StandardCharsets.ISO_8859_1;
import static java.nio.charset.StandardCharsets.UTF_8;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertNull;

import java.io.ByteArrayInputStream;
import java.io.InputStream;
import java.io.StringWriter;

import org.junit.Test;
import org.xml.sax.ContentHandler;
import org.xml.sax.helpers.DefaultHandler;

import org.apache.tika.TikaTest;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.sax.BodyContentHandler;
import org.apache.tika.sax.WriteOutContentHandler;

/**
 * Tests for {@link TXTParser}. The cases below check the content type and
 * charset the parser reports in the metadata, how caller-supplied metadata
 * (content type, language) is treated, and that byte order marks do not end
 * up in the extracted text.
 */
public class TXTParserTest extends TikaTest {

    /** Parser under test; a single instance is shared by all test methods. */
    private final Parser parser = new TXTParser();

    /**
     * Plain English ASCII text is reported as ISO-8859-1, no language is
     * written to the metadata, and the text is passed through to the handler.
     */
    @Test
    public void testEnglishText() throws Exception {
        String text = "Hello, World! This is simple UTF-8 text content written" +
                " in English to test autodetection of both the character" +
                " encoding and the language of the input stream.";

        Metadata metadata = new Metadata();
        StringWriter writer = new StringWriter();
        parser.parse(new ByteArrayInputStream(text.getBytes(ISO_8859_1)),
                new WriteOutContentHandler(writer), metadata, new ParseContext());
        String content = writer.toString();

        assertEquals("text/plain; charset=ISO-8859-1", metadata.get(Metadata.CONTENT_TYPE));

        // TIKA-501: Remove language detection from TXTParser
        assertNull(metadata.get(Metadata.CONTENT_LANGUAGE));
        assertNull(metadata.get(TikaCoreProperties.LANGUAGE));

        TikaTest.assertContains("Hello", content);
        TikaTest.assertContains("World", content);
        TikaTest.assertContains("autodetection", content);
        TikaTest.assertContains("stream", content);
    }

    /**
     * Text with non-ASCII characters encoded as UTF-8 is reported as UTF-8
     * and round-trips unchanged into the handler output.
     */
    @Test
    public void testUTF8Text() throws Exception {
        String text = "I\u00F1t\u00EBrn\u00E2ti\u00F4n\u00E0liz\u00E6ti\u00F8n";

        ContentHandler handler = new BodyContentHandler();
        Metadata metadata = new Metadata();
        parser.parse(new ByteArrayInputStream(text.getBytes(UTF_8)), handler, metadata,
                new ParseContext());

        assertEquals("text/plain; charset=UTF-8", metadata.get(Metadata.CONTENT_TYPE));
        assertEquals("UTF-8", metadata.get(Metadata.CONTENT_ENCODING)); // deprecated
        TikaTest.assertContains(text, handler.toString());
    }

    /**
     * An empty input is reported as UTF-8 text and yields a single newline
     * from the body handler.
     */
    @Test
    public void testEmptyText() throws Exception {
        ContentHandler handler = new BodyContentHandler();
        Metadata metadata = new Metadata();
        parser.parse(new ByteArrayInputStream(new byte[0]), handler, metadata, new ParseContext());

        assertEquals("text/plain; charset=UTF-8", metadata.get(Metadata.CONTENT_TYPE));
        assertEquals("\n", handler.toString());
    }

    /**
     * Test for the heuristics that we use to assign an eight-bit character
     * encoding to mostly ASCII sequences. If a more specific match can not
     * be made, a string with a CR(LF) in it is most probably windows-1252,
     * otherwise ISO-8859-1, except if it contains the currency/euro symbol
     * (byte 0xa4) in which case it's more likely to be ISO-8859-15.
     */
    @Test
    public void testLatinDetectionHeuristics() throws Exception {
        String windows = "test\r\n";
        String unix = "test\n";
        String euro = "test \u20ac\n";

        Metadata metadata;

        // CRLF line ending
        metadata = new Metadata();
        parser.parse(new ByteArrayInputStream(windows.getBytes("ISO-8859-15")),
                new DefaultHandler(), metadata, new ParseContext());
        assertEquals("text/plain; charset=windows-1252", metadata.get(Metadata.CONTENT_TYPE));

        // LF-only line ending
        metadata = new Metadata();
        parser.parse(new ByteArrayInputStream(unix.getBytes("ISO-8859-15")), new DefaultHandler(),
                metadata, new ParseContext());
        assertEquals("text/plain; charset=ISO-8859-1", metadata.get(Metadata.CONTENT_TYPE));

        // Euro sign present
        metadata = new Metadata();
        parser.parse(new ByteArrayInputStream(euro.getBytes("ISO-8859-15")), new DefaultHandler(),
                metadata, new ParseContext());
        assertEquals("text/plain; charset=ISO-8859-15", metadata.get(Metadata.CONTENT_TYPE));
    }

    /**
     * Test case for TIKA-240: Drop the BOM when extracting plain text
     *
     * @see <a href="https://issues.apache.org/jira/browse/TIKA-240">TIKA-240</a>
     */
    @Test
    public void testDropByteOrderMark() throws Exception {
        assertExtractText("UTF-8 BOM", "test",
                new byte[]{(byte) 0xEF, (byte) 0xBB, (byte) 0xBF, 't', 'e', 's', 't'});
        assertExtractText("UTF-16 BE BOM", "test",
                new byte[]{(byte) 0xFE, (byte) 0xFF, 0, 't', 0, 'e', 0, 's', 0, 't'});
        assertExtractText("UTF-16 LE BOM", "test",
                new byte[]{(byte) 0xFF, (byte) 0xFE, 't', 0, 'e', 0, 's', 0, 't', 0});
    }

    /**
     * Test case for TIKA-335: using incoming charset
     *
     * @see <a href="https://issues.apache.org/jira/browse/TIKA-335">TIKA-335</a>
     */
    @Test
    public void testUseIncomingCharsetAsHint() throws Exception {
        // Could be ISO 8859-1 or ISO 8859-15 or ...
        // u00e1 is latin small letter a with acute
        final String test2 = "the name is \u00e1ndre";

        // Without a hint the parser settles on ISO-8859-1.
        Metadata metadata = new Metadata();
        parser.parse(new ByteArrayInputStream(test2.getBytes(ISO_8859_1)), new BodyContentHandler(),
                metadata, new ParseContext());
        assertEquals("text/plain; charset=ISO-8859-1", metadata.get(Metadata.CONTENT_TYPE));
        assertEquals("ISO-8859-1", metadata.get(Metadata.CONTENT_ENCODING)); // deprecated

        // Same metadata object, now carrying an ISO-8859-15 charset hint.
        metadata.set(Metadata.CONTENT_TYPE, "text/plain; charset=ISO-8859-15");
        parser.parse(new ByteArrayInputStream(test2.getBytes(ISO_8859_1)), new BodyContentHandler(),
                metadata, new ParseContext());
        assertEquals("text/plain; charset=ISO-8859-15", metadata.get(Metadata.CONTENT_TYPE));
        assertEquals("ISO-8859-15", metadata.get(Metadata.CONTENT_ENCODING)); // deprecated
    }

    /**
     * Test case for TIKA-341: using charset in content-type
     *
     * @see <a href="https://issues.apache.org/jira/browse/TIKA-341">TIKA-341</a>
     */
    @Test
    public void testUsingCharsetInContentTypeHeader() throws Exception {
        // Could be ISO 8859-1 or ISO 8859-15 or ...
        // u00e1 is latin small letter a with acute
        final String test2 = "the name is \u00e1ndre";

        // Without a hint the parser settles on ISO-8859-1.
        Metadata metadata = new Metadata();
        parser.parse(new ByteArrayInputStream(test2.getBytes(ISO_8859_1)), new BodyContentHandler(),
                metadata, new ParseContext());
        assertEquals("text/plain; charset=ISO-8859-1", metadata.get(Metadata.CONTENT_TYPE));
        assertEquals("ISO-8859-1", metadata.get(Metadata.CONTENT_ENCODING)); // deprecated

        // Fresh metadata with a non-plain-text media type and a charset parameter.
        metadata = new Metadata();
        metadata.set(Metadata.CONTENT_TYPE, "text/html; charset=ISO-8859-15");
        parser.parse(new ByteArrayInputStream(test2.getBytes(ISO_8859_1)), new BodyContentHandler(),
                metadata, new ParseContext());
        assertEquals("text/html; charset=ISO-8859-15", metadata.get(Metadata.CONTENT_TYPE));
        assertEquals("ISO-8859-15", metadata.get(Metadata.CONTENT_ENCODING)); // deprecated
    }

    /**
     * Test case for TIKA-339: don't override incoming language
     *
     * @see <a href="https://issues.apache.org/jira/browse/TIKA-339">TIKA-339</a>
     */
    @Test
    public void testRetainIncomingLanguage() throws Exception {
        final String test = "Simple Content";

        Metadata metadata = new Metadata();
        metadata.set(TikaCoreProperties.LANGUAGE, "en");

        parser.parse(new ByteArrayInputStream(test.getBytes(UTF_8)), new BodyContentHandler(),
                metadata, new ParseContext());

        assertEquals("en", metadata.get(TikaCoreProperties.LANGUAGE));
    }

    /**
     * The CP866 sample document is reported with charset IBM866.
     */
    @Test
    public void testCP866() throws Exception {
        Metadata metadata = new Metadata();
        StringWriter writer = new StringWriter();
        // The test opened the resource stream, so the test closes it.
        try (InputStream stream = getResourceAsStream("/test-documents/russian.cp866.txt")) {
            parser.parse(stream, new WriteOutContentHandler(writer), metadata,
                    new ParseContext());
        }

        assertEquals("text/plain; charset=IBM866", metadata.get(Metadata.CONTENT_TYPE));
    }

    /**
     * The CP500 sample document is reported with charset IBM500, while a
     * short ASCII snippet is still reported as ISO-8859-1.
     */
    @Test
    public void testEBCDIC_CP500() throws Exception {
        Metadata metadata = new Metadata();
        StringWriter writer = new StringWriter();
        // The test opened the resource stream, so the test closes it.
        try (InputStream stream = getResourceAsStream("/test-documents/english.cp500.txt")) {
            parser.parse(stream, new WriteOutContentHandler(writer), metadata,
                    new ParseContext());
        }

        assertEquals("text/plain; charset=IBM500", metadata.get(Metadata.CONTENT_TYPE));

        // Additional check that it isn't too eager on short blocks of text
        metadata = new Metadata();
        writer = new StringWriter();
        parser.parse(new ByteArrayInputStream(
                        "<html><body>hello world</body></html>".getBytes(ISO_8859_1)),
                new WriteOutContentHandler(writer), metadata, new ParseContext());

        assertEquals("text/plain; charset=ISO-8859-1", metadata.get(Metadata.CONTENT_TYPE));
    }

    /**
     * Test case for TIKA-771: "Hello, World!" in UTF-8/ASCII gets detected as IBM500
     *
     * @see <a href="https://issues.apache.org/jira/browse/TIKA-771">TIKA-771</a>
     */
    @Test
    public void testCharsetDetectionWithShortSnipet() throws Exception {
        final String text = "Hello, World!";

        Metadata metadata = new Metadata();
        parser.parse(new ByteArrayInputStream(text.getBytes(UTF_8)), new BodyContentHandler(),
                metadata, new ParseContext());
        assertEquals("text/plain; charset=ISO-8859-1", metadata.get(Metadata.CONTENT_TYPE));

        // Now verify that if we tell the parser the encoding is UTF-8, that's what
        // we get back (see TIKA-868)
        metadata.set(Metadata.CONTENT_TYPE, "application/binary; charset=UTF-8");
        parser.parse(new ByteArrayInputStream(text.getBytes(UTF_8)), new BodyContentHandler(),
                metadata, new ParseContext());
        assertEquals("application/binary; charset=UTF-8", metadata.get(Metadata.CONTENT_TYPE));
    }

    //TIKA-2047
    /**
     * A more specific text media type (here vCalendar) is kept in the
     * reported content type.
     */
    @Test
    public void testSubclassingMimeTypesRemain() throws Exception {
        XMLResult r = getXML("testVCalendar.vcs");
        assertEquals("text/x-vcalendar; charset=ISO-8859-1", r.metadata.get(Metadata.CONTENT_TYPE));
    }

    /**
     * Parses {@code input} with the shared parser and asserts that the text
     * collected by the body handler equals {@code expected} exactly.
     *
     * @param msg      assertion message identifying the case being checked
     * @param expected exact text the handler is expected to contain
     * @param input    raw bytes handed to the parser
     * @throws Exception if parsing fails
     */
    private void assertExtractText(String msg, String expected, byte[] input) throws Exception {
        ContentHandler handler = new BodyContentHandler() {
            @Override
            public void ignorableWhitespace(char[] ch, int off, int len) {
                // Ignore the whitespace added by XHTMLContentHandler
            }
        };
        Metadata metadata = new Metadata();
        parser.parse(new ByteArrayInputStream(input), handler, metadata, new ParseContext());
        assertEquals(msg, expected, handler.toString());
    }
}