tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-text-module/src/test/java/org/apache/tika/parser/txt/TXTParserTest.java - tika - Git at Google

 /*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
  * The ASF licenses this file to You under the Apache License, Version 2.0
  * (the "License"); you may not use this file except in compliance with
  * the License.  You may obtain a copy of the License at
  *
  *     http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
 package org.apache.tika.parser.txt;

 import static java.nio.charset.StandardCharsets.ISO_8859_1;
 import static java.nio.charset.StandardCharsets.UTF_8;
 import static org.junit.Assert.assertEquals;
 import static org.junit.Assert.assertNull;

 import java.io.ByteArrayInputStream;
 import java.io.StringWriter;

 import org.junit.Test;
 import org.xml.sax.ContentHandler;
 import org.xml.sax.helpers.DefaultHandler;

 import org.apache.tika.TikaTest;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.metadata.TikaCoreProperties;
 import org.apache.tika.parser.ParseContext;
 import org.apache.tika.parser.Parser;
 import org.apache.tika.sax.BodyContentHandler;
 import org.apache.tika.sax.WriteOutContentHandler;

 /**
  * Unit tests for {@link TXTParser}.
  * <p>
  * The tests feed small in-memory byte streams (and a few bundled test documents) to the
  * parser and check the character encoding it reports through the
  * {@code Metadata.CONTENT_TYPE} and {@code Metadata.CONTENT_ENCODING} entries, the text it
  * emits to the content handler, and how it treats metadata supplied by the caller.
  */
 public class TXTParserTest extends TikaTest {

     /** Parser under test; created once per test instance and never reassigned. */
     private final Parser parser = new TXTParser();

     /**
      * Plain English text must be reported as {@code text/plain} with the ISO-8859-1 charset,
      * must not receive any language metadata, and must reach the content handler.
      */
     @Test
     public void testEnglishText() throws Exception {
         String text = "Hello, World! This is simple UTF-8 text content written" +
                 " in English to test autodetection of both the character" +
                 " encoding and the language of the input stream.";

         Metadata metadata = new Metadata();
         StringWriter writer = new StringWriter();
         // The sample contains ASCII characters only, so its ISO-8859-1 and UTF-8 byte
         // sequences are identical despite what the sample sentence says.
         parser.parse(new ByteArrayInputStream(text.getBytes(ISO_8859_1)),
                 new WriteOutContentHandler(writer), metadata, new ParseContext());
         String content = writer.toString();

         assertEquals("text/plain; charset=ISO-8859-1", metadata.get(Metadata.CONTENT_TYPE));

         // TIKA-501: Remove language detection from TXTParser
         assertNull(metadata.get(Metadata.CONTENT_LANGUAGE));
         assertNull(metadata.get(TikaCoreProperties.LANGUAGE));

         TikaTest.assertContains("Hello", content);
         TikaTest.assertContains("World", content);
         TikaTest.assertContains("autodetection", content);
         TikaTest.assertContains("stream", content);
     }

     /**
      * Text with accented Latin characters encoded as UTF-8 must be reported as UTF-8 and
      * must appear unchanged in the extracted body text.
      */
     @Test
     public void testUTF8Text() throws Exception {
         String text = "I\u00F1t\u00EBrn\u00E2ti\u00F4n\u00E0liz\u00E6ti\u00F8n";

         ContentHandler handler = new BodyContentHandler();
         Metadata metadata = new Metadata();
         parser.parse(new ByteArrayInputStream(text.getBytes(UTF_8)), handler, metadata,
                 new ParseContext());
         assertEquals("text/plain; charset=UTF-8", metadata.get(Metadata.CONTENT_TYPE));
         assertEquals("UTF-8", metadata.get(Metadata.CONTENT_ENCODING)); // deprecated

         TikaTest.assertContains(text, handler.toString());
     }

     /**
      * A zero-length stream must be reported as {@code text/plain} in UTF-8, and the body
      * handler must end up holding exactly one newline.
      */
     @Test
     public void testEmptyText() throws Exception {
         ContentHandler handler = new BodyContentHandler();
         Metadata metadata = new Metadata();
         parser.parse(new ByteArrayInputStream(new byte[0]), handler, metadata, new ParseContext());
         assertEquals("text/plain; charset=UTF-8", metadata.get(Metadata.CONTENT_TYPE));
         assertEquals("\n", handler.toString());
     }

     /**
      * Test for the heuristics that we use to assign an eight-bit character
      * encoding to mostly ASCII sequences. If a more specific match can not
      * be made, a string with a CR(LF) in it is most probably windows-1252,
      * otherwise ISO-8859-1, except if it contains the currency/euro symbol
      * (byte 0xa4) in which case it's more likely to be ISO-8859-15.
      */
     @Test
     public void testLatinDetectionHeuristics() throws Exception {
         String windows = "test\r\n";
         String unix = "test\n";
         String euro = "test \u20ac\n";

         Metadata metadata;

         // CRLF line ending -> windows-1252
         metadata = new Metadata();
         parser.parse(new ByteArrayInputStream(windows.getBytes("ISO-8859-15")),
                 new DefaultHandler(), metadata, new ParseContext());
         assertEquals("text/plain; charset=windows-1252", metadata.get(Metadata.CONTENT_TYPE));

         // LF-only line ending -> ISO-8859-1
         metadata = new Metadata();
         parser.parse(new ByteArrayInputStream(unix.getBytes("ISO-8859-15")), new DefaultHandler(),
                 metadata, new ParseContext());
         assertEquals("text/plain; charset=ISO-8859-1", metadata.get(Metadata.CONTENT_TYPE));

         // Euro sign present -> ISO-8859-15
         metadata = new Metadata();
         parser.parse(new ByteArrayInputStream(euro.getBytes("ISO-8859-15")), new DefaultHandler(),
                 metadata, new ParseContext());
         assertEquals("text/plain; charset=ISO-8859-15", metadata.get(Metadata.CONTENT_TYPE));
     }

     /**
      * Test case for TIKA-240: Drop the BOM when extracting plain text
      *
      * @see <a href="https://issues.apache.org/jira/browse/TIKA-240">TIKA-240</a>
      */
     @Test
     public void testDropByteOrderMark() throws Exception {
         assertExtractText("UTF-8 BOM", "test",
                 new byte[]{(byte) 0xEF, (byte) 0xBB, (byte) 0xBF, 't', 'e', 's', 't'});
         assertExtractText("UTF-16 BE BOM", "test",
                 new byte[]{(byte) 0xFE, (byte) 0xFF, 0, 't', 0, 'e', 0, 's', 0, 't'});
         assertExtractText("UTF-16 LE BOM", "test",
                 new byte[]{(byte) 0xFF, (byte) 0xFE, 't', 0, 'e', 0, 's', 0, 't', 0});
     }

     /**
      * Test case for TIKA-335: using incoming charset
      *
      * @see <a href="https://issues.apache.org/jira/browse/TIKA-335">TIKA-335</a>
      */
     @Test
     public void testUseIncomingCharsetAsHint() throws Exception {
         // Could be ISO 8859-1 or ISO 8859-15 or ...
         // u00e1 is latin small letter a with acute
         final String test2 = "the name is \u00e1ndre";

         // Without a hint the parser reports ISO-8859-1.
         Metadata metadata = new Metadata();
         parser.parse(new ByteArrayInputStream(test2.getBytes(ISO_8859_1)), new BodyContentHandler(),
                 metadata, new ParseContext());
         assertEquals("text/plain; charset=ISO-8859-1", metadata.get(Metadata.CONTENT_TYPE));
         assertEquals("ISO-8859-1", metadata.get(Metadata.CONTENT_ENCODING)); // deprecated

         // Same Metadata instance, now carrying a caller-supplied charset: it must be kept.
         metadata.set(Metadata.CONTENT_TYPE, "text/plain; charset=ISO-8859-15");
         parser.parse(new ByteArrayInputStream(test2.getBytes(ISO_8859_1)), new BodyContentHandler(),
                 metadata, new ParseContext());
         assertEquals("text/plain; charset=ISO-8859-15", metadata.get(Metadata.CONTENT_TYPE));
         assertEquals("ISO-8859-15", metadata.get(Metadata.CONTENT_ENCODING)); // deprecated
     }

     /**
      * Test case for TIKA-341: using charset in content-type
      *
      * @see <a href="https://issues.apache.org/jira/browse/TIKA-341">TIKA-341</a>
      */
     @Test
     public void testUsingCharsetInContentTypeHeader() throws Exception {
         // Could be ISO 8859-1 or ISO 8859-15 or ...
         // u00e1 is latin small letter a with acute
         final String test2 = "the name is \u00e1ndre";

         // Without a hint the parser reports ISO-8859-1.
         Metadata metadata = new Metadata();
         parser.parse(new ByteArrayInputStream(test2.getBytes(ISO_8859_1)), new BodyContentHandler(),
                 metadata, new ParseContext());
         assertEquals("text/plain; charset=ISO-8859-1", metadata.get(Metadata.CONTENT_TYPE));
         assertEquals("ISO-8859-1", metadata.get(Metadata.CONTENT_ENCODING)); // deprecated

         // Fresh Metadata with a text/html content type: both the media type and the
         // charset given by the caller must come back unchanged.
         metadata = new Metadata();
         metadata.set(Metadata.CONTENT_TYPE, "text/html; charset=ISO-8859-15");
         parser.parse(new ByteArrayInputStream(test2.getBytes(ISO_8859_1)), new BodyContentHandler(),
                 metadata, new ParseContext());
         assertEquals("text/html; charset=ISO-8859-15", metadata.get(Metadata.CONTENT_TYPE));
         assertEquals("ISO-8859-15", metadata.get(Metadata.CONTENT_ENCODING)); // deprecated
     }

     /**
      * Parses {@code input} and asserts that the body text collected by the handler,
      * with ignorable whitespace events discarded, equals {@code expected}.
      *
      * @param msg      assertion message identifying the case being checked
      * @param expected exact text the handler is expected to hold after parsing
      * @param input    raw bytes handed to the parser
      * @throws Exception if parsing fails
      */
     private void assertExtractText(String msg, String expected, byte[] input) throws Exception {
         ContentHandler handler = new BodyContentHandler() {
             @Override
             public void ignorableWhitespace(char[] ch, int off, int len) {
                 // Ignore the whitespace added by XHTMLContentHandler
             }
         };
         Metadata metadata = new Metadata();
         parser.parse(new ByteArrayInputStream(input), handler, metadata, new ParseContext());
         assertEquals(msg, expected, handler.toString());
     }

     /**
      * Test case for TIKA-339: don't override incoming language
      *
      * @see <a href="https://issues.apache.org/jira/browse/TIKA-339">TIKA-339</a>
      */
     @Test
     public void testRetainIncomingLanguage() throws Exception {
         final String test = "Simple Content";

         Metadata metadata = new Metadata();
         metadata.set(TikaCoreProperties.LANGUAGE, "en");

         parser.parse(new ByteArrayInputStream(test.getBytes(UTF_8)), new BodyContentHandler(),
                 metadata, new ParseContext());

         assertEquals("en", metadata.get(TikaCoreProperties.LANGUAGE));
     }

     /**
      * The bundled {@code russian.cp866.txt} test document must be reported with the
      * IBM866 charset.
      */
     @Test
     public void testCP866() throws Exception {
         Metadata metadata = new Metadata();
         StringWriter writer = new StringWriter();
         parser.parse(getResourceAsStream("/test-documents/russian.cp866.txt"),
                 new WriteOutContentHandler(writer), metadata, new ParseContext());

         assertEquals("text/plain; charset=IBM866", metadata.get(Metadata.CONTENT_TYPE));
     }

     /**
      * The bundled {@code english.cp500.txt} test document must be reported with the
      * IBM500 charset, while a short ASCII snippet must still come out as ISO-8859-1.
      */
     @Test
     public void testEBCDIC_CP500() throws Exception {
         Metadata metadata = new Metadata();
         StringWriter writer = new StringWriter();
         parser.parse(getResourceAsStream("/test-documents/english.cp500.txt"),
                 new WriteOutContentHandler(writer), metadata, new ParseContext());

         assertEquals("text/plain; charset=IBM500", metadata.get(Metadata.CONTENT_TYPE));

         // Additional check that it isn't too eager on short blocks of text
         metadata = new Metadata();
         writer = new StringWriter();
         parser.parse(new ByteArrayInputStream(
                         "<html><body>hello world</body></html>".getBytes(ISO_8859_1)),
                 new WriteOutContentHandler(writer), metadata, new ParseContext());

         assertEquals("text/plain; charset=ISO-8859-1", metadata.get(Metadata.CONTENT_TYPE));
     }

     /**
      * Test case for TIKA-771: "Hello, World!" in UTF-8/ASCII gets detected as IBM500
      *
      * @see <a href="https://issues.apache.org/jira/browse/TIKA-771">TIKA-771</a>
      */
     @Test
     public void testCharsetDetectionWithShortSnipet() throws Exception {
         final String text = "Hello, World!";

         Metadata metadata = new Metadata();
         parser.parse(new ByteArrayInputStream(text.getBytes(UTF_8)), new BodyContentHandler(),
                 metadata, new ParseContext());
         assertEquals("text/plain; charset=ISO-8859-1", metadata.get(Metadata.CONTENT_TYPE));

         // Now verify that if we tell the parser the encoding is UTF-8, that's what
         // we get back (see TIKA-868)
         metadata.set(Metadata.CONTENT_TYPE, "application/binary; charset=UTF-8");
         parser.parse(new ByteArrayInputStream(text.getBytes(UTF_8)), new BodyContentHandler(),
                 metadata, new ParseContext());
         assertEquals("application/binary; charset=UTF-8", metadata.get(Metadata.CONTENT_TYPE));
     }

     /**
      * Test case for TIKA-2047: the metadata obtained for the {@code testVCalendar.vcs}
      * sample must carry the {@code text/x-vcalendar} media type together with the charset.
      */
     @Test
     public void testSubclassingMimeTypesRemain() throws Exception {
         XMLResult r = getXML("testVCalendar.vcs");
         assertEquals("text/x-vcalendar; charset=ISO-8859-1", r.metadata.get(Metadata.CONTENT_TYPE));
     }

 }
	/*
	* Licensed to the Apache Software Foundation (ASF) under one or more
	* contributor license agreements. See the NOTICE file distributed with
	* this work for additional information regarding copyright ownership.
	* The ASF licenses this file to You under the Apache License, Version 2.0
	* (the "License"); you may not use this file except in compliance with
	* the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/
	package org.apache.tika.parser.txt;

	import static java.nio.charset.StandardCharsets.ISO_8859_1;
	import static java.nio.charset.StandardCharsets.UTF_8;
	import static org.junit.Assert.assertEquals;
	import static org.junit.Assert.assertNull;

	import java.io.ByteArrayInputStream;
	import java.io.StringWriter;

	import org.junit.Test;
	import org.xml.sax.ContentHandler;
	import org.xml.sax.helpers.DefaultHandler;

	import org.apache.tika.TikaTest;
	import org.apache.tika.metadata.Metadata;
	import org.apache.tika.metadata.TikaCoreProperties;
	import org.apache.tika.parser.ParseContext;
	import org.apache.tika.parser.Parser;
	import org.apache.tika.sax.BodyContentHandler;
	import org.apache.tika.sax.WriteOutContentHandler;

	/**
	 * Unit tests for {@link TXTParser}.
	 * <p>
	 * The tests feed small in-memory byte streams (and a few bundled test documents) to the
	 * parser and check the character encoding it reports through the
	 * {@code Metadata.CONTENT_TYPE} and {@code Metadata.CONTENT_ENCODING} entries, the text it
	 * emits to the content handler, and how it treats metadata supplied by the caller.
	 */
	public class TXTParserTest extends TikaTest {

	    /** Parser under test; created once per test instance and never reassigned. */
	    private final Parser parser = new TXTParser();

	    /**
	     * Plain English text must be reported as {@code text/plain} with the ISO-8859-1 charset,
	     * must not receive any language metadata, and must reach the content handler.
	     */
	    @Test
	    public void testEnglishText() throws Exception {
	        String text = "Hello, World! This is simple UTF-8 text content written" +
	                " in English to test autodetection of both the character" +
	                " encoding and the language of the input stream.";

	        Metadata metadata = new Metadata();
	        StringWriter writer = new StringWriter();
	        // The sample contains ASCII characters only, so its ISO-8859-1 and UTF-8 byte
	        // sequences are identical despite what the sample sentence says.
	        parser.parse(new ByteArrayInputStream(text.getBytes(ISO_8859_1)),
	                new WriteOutContentHandler(writer), metadata, new ParseContext());
	        String content = writer.toString();

	        assertEquals("text/plain; charset=ISO-8859-1", metadata.get(Metadata.CONTENT_TYPE));

	        // TIKA-501: Remove language detection from TXTParser
	        assertNull(metadata.get(Metadata.CONTENT_LANGUAGE));
	        assertNull(metadata.get(TikaCoreProperties.LANGUAGE));

	        TikaTest.assertContains("Hello", content);
	        TikaTest.assertContains("World", content);
	        TikaTest.assertContains("autodetection", content);
	        TikaTest.assertContains("stream", content);
	    }

	    /**
	     * Text with accented Latin characters encoded as UTF-8 must be reported as UTF-8 and
	     * must appear unchanged in the extracted body text.
	     */
	    @Test
	    public void testUTF8Text() throws Exception {
	        String text = "I\u00F1t\u00EBrn\u00E2ti\u00F4n\u00E0liz\u00E6ti\u00F8n";

	        ContentHandler handler = new BodyContentHandler();
	        Metadata metadata = new Metadata();
	        parser.parse(new ByteArrayInputStream(text.getBytes(UTF_8)), handler, metadata,
	                new ParseContext());
	        assertEquals("text/plain; charset=UTF-8", metadata.get(Metadata.CONTENT_TYPE));
	        assertEquals("UTF-8", metadata.get(Metadata.CONTENT_ENCODING)); // deprecated

	        TikaTest.assertContains(text, handler.toString());
	    }

	    /**
	     * A zero-length stream must be reported as {@code text/plain} in UTF-8, and the body
	     * handler must end up holding exactly one newline.
	     */
	    @Test
	    public void testEmptyText() throws Exception {
	        ContentHandler handler = new BodyContentHandler();
	        Metadata metadata = new Metadata();
	        parser.parse(new ByteArrayInputStream(new byte[0]), handler, metadata, new ParseContext());
	        assertEquals("text/plain; charset=UTF-8", metadata.get(Metadata.CONTENT_TYPE));
	        assertEquals("\n", handler.toString());
	    }

	    /**
	     * Test for the heuristics that we use to assign an eight-bit character
	     * encoding to mostly ASCII sequences. If a more specific match can not
	     * be made, a string with a CR(LF) in it is most probably windows-1252,
	     * otherwise ISO-8859-1, except if it contains the currency/euro symbol
	     * (byte 0xa4) in which case it's more likely to be ISO-8859-15.
	     */
	    @Test
	    public void testLatinDetectionHeuristics() throws Exception {
	        String windows = "test\r\n";
	        String unix = "test\n";
	        String euro = "test \u20ac\n";

	        Metadata metadata;

	        // CRLF line ending -> windows-1252
	        metadata = new Metadata();
	        parser.parse(new ByteArrayInputStream(windows.getBytes("ISO-8859-15")),
	                new DefaultHandler(), metadata, new ParseContext());
	        assertEquals("text/plain; charset=windows-1252", metadata.get(Metadata.CONTENT_TYPE));

	        // LF-only line ending -> ISO-8859-1
	        metadata = new Metadata();
	        parser.parse(new ByteArrayInputStream(unix.getBytes("ISO-8859-15")), new DefaultHandler(),
	                metadata, new ParseContext());
	        assertEquals("text/plain; charset=ISO-8859-1", metadata.get(Metadata.CONTENT_TYPE));

	        // Euro sign present -> ISO-8859-15
	        metadata = new Metadata();
	        parser.parse(new ByteArrayInputStream(euro.getBytes("ISO-8859-15")), new DefaultHandler(),
	                metadata, new ParseContext());
	        assertEquals("text/plain; charset=ISO-8859-15", metadata.get(Metadata.CONTENT_TYPE));
	    }

	    /**
	     * Test case for TIKA-240: Drop the BOM when extracting plain text
	     *
	     * @see <a href="https://issues.apache.org/jira/browse/TIKA-240">TIKA-240</a>
	     */
	    @Test
	    public void testDropByteOrderMark() throws Exception {
	        assertExtractText("UTF-8 BOM", "test",
	                new byte[]{(byte) 0xEF, (byte) 0xBB, (byte) 0xBF, 't', 'e', 's', 't'});
	        assertExtractText("UTF-16 BE BOM", "test",
	                new byte[]{(byte) 0xFE, (byte) 0xFF, 0, 't', 0, 'e', 0, 's', 0, 't'});
	        assertExtractText("UTF-16 LE BOM", "test",
	                new byte[]{(byte) 0xFF, (byte) 0xFE, 't', 0, 'e', 0, 's', 0, 't', 0});
	    }

	    /**
	     * Test case for TIKA-335: using incoming charset
	     *
	     * @see <a href="https://issues.apache.org/jira/browse/TIKA-335">TIKA-335</a>
	     */
	    @Test
	    public void testUseIncomingCharsetAsHint() throws Exception {
	        // Could be ISO 8859-1 or ISO 8859-15 or ...
	        // u00e1 is latin small letter a with acute
	        final String test2 = "the name is \u00e1ndre";

	        // Without a hint the parser reports ISO-8859-1.
	        Metadata metadata = new Metadata();
	        parser.parse(new ByteArrayInputStream(test2.getBytes(ISO_8859_1)), new BodyContentHandler(),
	                metadata, new ParseContext());
	        assertEquals("text/plain; charset=ISO-8859-1", metadata.get(Metadata.CONTENT_TYPE));
	        assertEquals("ISO-8859-1", metadata.get(Metadata.CONTENT_ENCODING)); // deprecated

	        // Same Metadata instance, now carrying a caller-supplied charset: it must be kept.
	        metadata.set(Metadata.CONTENT_TYPE, "text/plain; charset=ISO-8859-15");
	        parser.parse(new ByteArrayInputStream(test2.getBytes(ISO_8859_1)), new BodyContentHandler(),
	                metadata, new ParseContext());
	        assertEquals("text/plain; charset=ISO-8859-15", metadata.get(Metadata.CONTENT_TYPE));
	        assertEquals("ISO-8859-15", metadata.get(Metadata.CONTENT_ENCODING)); // deprecated
	    }

	    /**
	     * Test case for TIKA-341: using charset in content-type
	     *
	     * @see <a href="https://issues.apache.org/jira/browse/TIKA-341">TIKA-341</a>
	     */
	    @Test
	    public void testUsingCharsetInContentTypeHeader() throws Exception {
	        // Could be ISO 8859-1 or ISO 8859-15 or ...
	        // u00e1 is latin small letter a with acute
	        final String test2 = "the name is \u00e1ndre";

	        // Without a hint the parser reports ISO-8859-1.
	        Metadata metadata = new Metadata();
	        parser.parse(new ByteArrayInputStream(test2.getBytes(ISO_8859_1)), new BodyContentHandler(),
	                metadata, new ParseContext());
	        assertEquals("text/plain; charset=ISO-8859-1", metadata.get(Metadata.CONTENT_TYPE));
	        assertEquals("ISO-8859-1", metadata.get(Metadata.CONTENT_ENCODING)); // deprecated

	        // Fresh Metadata with a text/html content type: both the media type and the
	        // charset given by the caller must come back unchanged.
	        metadata = new Metadata();
	        metadata.set(Metadata.CONTENT_TYPE, "text/html; charset=ISO-8859-15");
	        parser.parse(new ByteArrayInputStream(test2.getBytes(ISO_8859_1)), new BodyContentHandler(),
	                metadata, new ParseContext());
	        assertEquals("text/html; charset=ISO-8859-15", metadata.get(Metadata.CONTENT_TYPE));
	        assertEquals("ISO-8859-15", metadata.get(Metadata.CONTENT_ENCODING)); // deprecated
	    }

	    /**
	     * Parses {@code input} and asserts that the body text collected by the handler,
	     * with ignorable whitespace events discarded, equals {@code expected}.
	     *
	     * @param msg      assertion message identifying the case being checked
	     * @param expected exact text the handler is expected to hold after parsing
	     * @param input    raw bytes handed to the parser
	     * @throws Exception if parsing fails
	     */
	    private void assertExtractText(String msg, String expected, byte[] input) throws Exception {
	        ContentHandler handler = new BodyContentHandler() {
	            @Override
	            public void ignorableWhitespace(char[] ch, int off, int len) {
	                // Ignore the whitespace added by XHTMLContentHandler
	            }
	        };
	        Metadata metadata = new Metadata();
	        parser.parse(new ByteArrayInputStream(input), handler, metadata, new ParseContext());
	        assertEquals(msg, expected, handler.toString());
	    }

	    /**
	     * Test case for TIKA-339: don't override incoming language
	     *
	     * @see <a href="https://issues.apache.org/jira/browse/TIKA-339">TIKA-339</a>
	     */
	    @Test
	    public void testRetainIncomingLanguage() throws Exception {
	        final String test = "Simple Content";

	        Metadata metadata = new Metadata();
	        metadata.set(TikaCoreProperties.LANGUAGE, "en");

	        parser.parse(new ByteArrayInputStream(test.getBytes(UTF_8)), new BodyContentHandler(),
	                metadata, new ParseContext());

	        assertEquals("en", metadata.get(TikaCoreProperties.LANGUAGE));
	    }

	    /**
	     * The bundled {@code russian.cp866.txt} test document must be reported with the
	     * IBM866 charset.
	     */
	    @Test
	    public void testCP866() throws Exception {
	        Metadata metadata = new Metadata();
	        StringWriter writer = new StringWriter();
	        parser.parse(getResourceAsStream("/test-documents/russian.cp866.txt"),
	                new WriteOutContentHandler(writer), metadata, new ParseContext());

	        assertEquals("text/plain; charset=IBM866", metadata.get(Metadata.CONTENT_TYPE));
	    }

	    /**
	     * The bundled {@code english.cp500.txt} test document must be reported with the
	     * IBM500 charset, while a short ASCII snippet must still come out as ISO-8859-1.
	     */
	    @Test
	    public void testEBCDIC_CP500() throws Exception {
	        Metadata metadata = new Metadata();
	        StringWriter writer = new StringWriter();
	        parser.parse(getResourceAsStream("/test-documents/english.cp500.txt"),
	                new WriteOutContentHandler(writer), metadata, new ParseContext());

	        assertEquals("text/plain; charset=IBM500", metadata.get(Metadata.CONTENT_TYPE));

	        // Additional check that it isn't too eager on short blocks of text
	        metadata = new Metadata();
	        writer = new StringWriter();
	        parser.parse(new ByteArrayInputStream(
	                        "<html><body>hello world</body></html>".getBytes(ISO_8859_1)),
	                new WriteOutContentHandler(writer), metadata, new ParseContext());

	        assertEquals("text/plain; charset=ISO-8859-1", metadata.get(Metadata.CONTENT_TYPE));
	    }

	    /**
	     * Test case for TIKA-771: "Hello, World!" in UTF-8/ASCII gets detected as IBM500
	     *
	     * @see <a href="https://issues.apache.org/jira/browse/TIKA-771">TIKA-771</a>
	     */
	    @Test
	    public void testCharsetDetectionWithShortSnipet() throws Exception {
	        final String text = "Hello, World!";

	        Metadata metadata = new Metadata();
	        parser.parse(new ByteArrayInputStream(text.getBytes(UTF_8)), new BodyContentHandler(),
	                metadata, new ParseContext());
	        assertEquals("text/plain; charset=ISO-8859-1", metadata.get(Metadata.CONTENT_TYPE));

	        // Now verify that if we tell the parser the encoding is UTF-8, that's what
	        // we get back (see TIKA-868)
	        metadata.set(Metadata.CONTENT_TYPE, "application/binary; charset=UTF-8");
	        parser.parse(new ByteArrayInputStream(text.getBytes(UTF_8)), new BodyContentHandler(),
	                metadata, new ParseContext());
	        assertEquals("application/binary; charset=UTF-8", metadata.get(Metadata.CONTENT_TYPE));
	    }

	    /**
	     * Test case for TIKA-2047: the metadata obtained for the {@code testVCalendar.vcs}
	     * sample must carry the {@code text/x-vcalendar} media type together with the charset.
	     */
	    @Test
	    public void testSubclassingMimeTypesRemain() throws Exception {
	        XMLResult r = getXML("testVCalendar.vcs");
	        assertEquals("text/x-vcalendar; charset=ISO-8859-1", r.metadata.get(Metadata.CONTENT_TYPE));
	    }

	}