trunk/tika-parsers/src/test/java/org/apache/tika/parser/txt/TXTParserTest.java - tika - Git at Google

 /**
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
  * The ASF licenses this file to You under the Apache License, Version 2.0
  * (the "License"); you may not use this file except in compliance with
  * the License.  You may obtain a copy of the License at
  *
  *     http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
 package org.apache.tika.parser.txt;

 import java.io.ByteArrayInputStream;
 import java.io.StringWriter;

 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.parser.ParseContext;
 import org.apache.tika.parser.Parser;
 import org.apache.tika.sax.BodyContentHandler;
 import org.apache.tika.sax.WriteOutContentHandler;
 import org.xml.sax.ContentHandler;

 import junit.framework.TestCase;

 public class TXTParserTest extends TestCase {

     private Parser parser = new TXTParser();

     public void testEnglishText() throws Exception {
         String text =
             "Hello, World! This is simple UTF-8 text content written"
             + " in English to test autodetection of both the character"
             + " encoding and the language of the input stream.";

         Metadata metadata = new Metadata();
         StringWriter writer = new StringWriter();
         parser.parse(
                 new ByteArrayInputStream(text.getBytes("UTF-8")),
                 new WriteOutContentHandler(writer),
                 metadata,
                 new ParseContext());
         String content = writer.toString();

         assertEquals("text/plain", metadata.get(Metadata.CONTENT_TYPE));

         // TIKA-501: Remove language detection from TXTParser
         assertNull(metadata.get(Metadata.CONTENT_LANGUAGE));
         assertNull(metadata.get(Metadata.LANGUAGE));

         assertTrue(content.contains("Hello"));
         assertTrue(content.contains("World"));
         assertTrue(content.contains("autodetection"));
         assertTrue(content.contains("stream"));
     }

     public void testUTF8Text() throws Exception {
         String text = "I\u00F1t\u00EBrn\u00E2ti\u00F4n\u00E0liz\u00E6ti\u00F8n";

         ContentHandler handler = new BodyContentHandler();
         Metadata metadata = new Metadata();
         parser.parse(
                 new ByteArrayInputStream(text.getBytes("UTF-8")),
                 handler, metadata, new ParseContext());
         assertEquals("text/plain", metadata.get(Metadata.CONTENT_TYPE));
         assertEquals("UTF-8", metadata.get(Metadata.CONTENT_ENCODING));

         assertTrue(handler.toString().contains(text));
     }

     public void testEmptyText() throws Exception {
         ContentHandler handler = new BodyContentHandler();
         Metadata metadata = new Metadata();
         parser.parse(
                 new ByteArrayInputStream(new byte[0]), handler, metadata, new ParseContext());
         assertEquals("text/plain", metadata.get(Metadata.CONTENT_TYPE));
         assertEquals("\n", handler.toString());
     }

     /**
      * Test case for TIKA-240: Drop the BOM when extracting plain text
      *
      * @see <a href="https://issues.apache.org/jira/browse/TIKA-240">TIKA-240</a>
      */
     public void testDropByteOrderMark() throws Exception {
         assertExtractText("UTF-8 BOM", "test", new byte[] {
                 (byte) 0xEF, (byte) 0xBB, (byte) 0xBF, 't', 'e', 's', 't' });
         assertExtractText("UTF-16 BE BOM", "test", new byte[] {
                 (byte) 0xFE, (byte) 0xFF, 0, 't', 0, 'e', 0, 's', 0, 't'});
         assertExtractText("UTF-16 LE BOM", "test", new byte[] {
                 (byte) 0xFF, (byte) 0xFE, 't', 0, 'e', 0, 's', 0, 't', 0});
     }

     /**
      * Test case for TIKA-335: using incoming charset
      *
      * @see <a href="https://issues.apache.org/jira/browse/TIKA-335">TIKA-335</a>
      */
     public void testUseIncomingCharsetAsHint() throws Exception {
         // Could be UTF-8 or ISO 8859-1 or ...
         // u00e1 is latin small letter a with acute
         final String test2 = "the name is \u00e1ndre";

         Metadata metadata = new Metadata();
         parser.parse(
                 new ByteArrayInputStream(test2.getBytes("UTF-8")),
                 new BodyContentHandler(),  metadata, new ParseContext());

         assertEquals("UTF-8", metadata.get(Metadata.CONTENT_ENCODING));

         metadata.set(Metadata.CONTENT_ENCODING, "ISO-8859-1");
         parser.parse(
                 new ByteArrayInputStream(test2.getBytes("UTF-8")),
                 new BodyContentHandler(),  metadata, new ParseContext());

         assertEquals("ISO-8859-1", metadata.get(Metadata.CONTENT_ENCODING));
     }

     /**
      * Test case for TIKA-341: using charset in content-type
      *
      * @see <a href="https://issues.apache.org/jira/browse/TIKA-341">TIKA-341</a>
      */
     public void testUsingCharsetInContentTypeHeader() throws Exception {
         // Could be UTF-8 or ISO 8859-1 or ...
         // u00e1 is latin small letter a with acute
         final String test2 = "the name is \u00e1ndre";

         Metadata metadata = new Metadata();
         parser.parse(
                 new ByteArrayInputStream(test2.getBytes("UTF-8")),
                 new BodyContentHandler(),  metadata, new ParseContext());

         assertEquals("UTF-8", metadata.get(Metadata.CONTENT_ENCODING));

         metadata = new Metadata();
         metadata.set(Metadata.CONTENT_TYPE, "text/html; charset=ISO-8859-1");
         parser.parse(
                 new ByteArrayInputStream(test2.getBytes("UTF-8")),
                 new BodyContentHandler(),  metadata, new ParseContext());

         assertEquals("ISO-8859-1", metadata.get(Metadata.CONTENT_ENCODING));
     }

     private void assertExtractText(String msg, String expected, byte[] input)
             throws Exception {
         ContentHandler handler = new BodyContentHandler() {
             public void ignorableWhitespace(char[] ch, int off, int len) {
                 // Ignore the whitespace added by XHTMLContentHandler
             }
         };
         Metadata metadata = new Metadata();
         parser.parse(new ByteArrayInputStream(input), handler, metadata, new ParseContext());
         assertEquals("text/plain", metadata.get(Metadata.CONTENT_TYPE));
         assertEquals(msg, expected, handler.toString());
     }

     /**
      * Test case for TIKA-339: don't override incoming language
      *
      * @see <a href="https://issues.apache.org/jira/browse/TIKA-335">TIKA-335</a>
      */
     public void testRetainIncomingLanguage() throws Exception {
         final String test = "Simple Content";

         Metadata metadata = new Metadata();
         metadata.set(Metadata.LANGUAGE, "en");

         parser.parse(
                 new ByteArrayInputStream(test.getBytes("UTF-8")),
                 new BodyContentHandler(),  metadata, new ParseContext());

         assertEquals("en", metadata.get(Metadata.LANGUAGE));
     }

     public void testCP866() throws Exception {
         Metadata metadata = new Metadata();
         StringWriter writer = new StringWriter();
         parser.parse(
                 TXTParserTest.class.getResourceAsStream("/test-documents/russian.cp866.txt"),
                 new WriteOutContentHandler(writer),
                 metadata,
                 new ParseContext());

         assertEquals("text/plain", metadata.get(Metadata.CONTENT_TYPE));
         assertEquals("IBM866", metadata.get(Metadata.CONTENT_ENCODING));
     }

 }
	/**
	* Licensed to the Apache Software Foundation (ASF) under one or more
	* contributor license agreements. See the NOTICE file distributed with
	* this work for additional information regarding copyright ownership.
	* The ASF licenses this file to You under the Apache License, Version 2.0
	* (the "License"); you may not use this file except in compliance with
	* the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/
	package org.apache.tika.parser.txt;

	import java.io.ByteArrayInputStream;
	import java.io.StringWriter;

	import org.apache.tika.metadata.Metadata;
	import org.apache.tika.parser.ParseContext;
	import org.apache.tika.parser.Parser;
	import org.apache.tika.sax.BodyContentHandler;
	import org.apache.tika.sax.WriteOutContentHandler;
	import org.xml.sax.ContentHandler;

	import junit.framework.TestCase;

	public class TXTParserTest extends TestCase {

	private Parser parser = new TXTParser();

	public void testEnglishText() throws Exception {
	String text =
	"Hello, World! This is simple UTF-8 text content written"
	+ " in English to test autodetection of both the character"
	+ " encoding and the language of the input stream.";

	Metadata metadata = new Metadata();
	StringWriter writer = new StringWriter();
	parser.parse(
	new ByteArrayInputStream(text.getBytes("UTF-8")),
	new WriteOutContentHandler(writer),
	metadata,
	new ParseContext());
	String content = writer.toString();

	assertEquals("text/plain", metadata.get(Metadata.CONTENT_TYPE));

	// TIKA-501: Remove language detection from TXTParser
	assertNull(metadata.get(Metadata.CONTENT_LANGUAGE));
	assertNull(metadata.get(Metadata.LANGUAGE));

	assertTrue(content.contains("Hello"));
	assertTrue(content.contains("World"));
	assertTrue(content.contains("autodetection"));
	assertTrue(content.contains("stream"));
	}

	public void testUTF8Text() throws Exception {
	String text = "I\u00F1t\u00EBrn\u00E2ti\u00F4n\u00E0liz\u00E6ti\u00F8n";

	ContentHandler handler = new BodyContentHandler();
	Metadata metadata = new Metadata();
	parser.parse(
	new ByteArrayInputStream(text.getBytes("UTF-8")),
	handler, metadata, new ParseContext());
	assertEquals("text/plain", metadata.get(Metadata.CONTENT_TYPE));
	assertEquals("UTF-8", metadata.get(Metadata.CONTENT_ENCODING));

	assertTrue(handler.toString().contains(text));
	}

	public void testEmptyText() throws Exception {
	ContentHandler handler = new BodyContentHandler();
	Metadata metadata = new Metadata();
	parser.parse(
	new ByteArrayInputStream(new byte[0]), handler, metadata, new ParseContext());
	assertEquals("text/plain", metadata.get(Metadata.CONTENT_TYPE));
	assertEquals("\n", handler.toString());
	}

	/**
	* Test case for TIKA-240: Drop the BOM when extracting plain text
	*
	* @see <a href="https://issues.apache.org/jira/browse/TIKA-240">TIKA-240</a>
	*/
	public void testDropByteOrderMark() throws Exception {
	assertExtractText("UTF-8 BOM", "test", new byte[] {
	(byte) 0xEF, (byte) 0xBB, (byte) 0xBF, 't', 'e', 's', 't' });
	assertExtractText("UTF-16 BE BOM", "test", new byte[] {
	(byte) 0xFE, (byte) 0xFF, 0, 't', 0, 'e', 0, 's', 0, 't'});
	assertExtractText("UTF-16 LE BOM", "test", new byte[] {
	(byte) 0xFF, (byte) 0xFE, 't', 0, 'e', 0, 's', 0, 't', 0});
	}

	/**
	* Test case for TIKA-335: using incoming charset
	*
	* @see <a href="https://issues.apache.org/jira/browse/TIKA-335">TIKA-335</a>
	*/
	public void testUseIncomingCharsetAsHint() throws Exception {
	// Could be UTF-8 or ISO 8859-1 or ...
	// u00e1 is latin small letter a with acute
	final String test2 = "the name is \u00e1ndre";

	Metadata metadata = new Metadata();
	parser.parse(
	new ByteArrayInputStream(test2.getBytes("UTF-8")),
	new BodyContentHandler(), metadata, new ParseContext());

	assertEquals("UTF-8", metadata.get(Metadata.CONTENT_ENCODING));

	metadata.set(Metadata.CONTENT_ENCODING, "ISO-8859-1");
	parser.parse(
	new ByteArrayInputStream(test2.getBytes("UTF-8")),
	new BodyContentHandler(), metadata, new ParseContext());

	assertEquals("ISO-8859-1", metadata.get(Metadata.CONTENT_ENCODING));
	}

	/**
	* Test case for TIKA-341: using charset in content-type
	*
	* @see <a href="https://issues.apache.org/jira/browse/TIKA-341">TIKA-341</a>
	*/
	public void testUsingCharsetInContentTypeHeader() throws Exception {
	// Could be UTF-8 or ISO 8859-1 or ...
	// u00e1 is latin small letter a with acute
	final String test2 = "the name is \u00e1ndre";

	Metadata metadata = new Metadata();
	parser.parse(
	new ByteArrayInputStream(test2.getBytes("UTF-8")),
	new BodyContentHandler(), metadata, new ParseContext());

	assertEquals("UTF-8", metadata.get(Metadata.CONTENT_ENCODING));

	metadata = new Metadata();
	metadata.set(Metadata.CONTENT_TYPE, "text/html; charset=ISO-8859-1");
	parser.parse(
	new ByteArrayInputStream(test2.getBytes("UTF-8")),
	new BodyContentHandler(), metadata, new ParseContext());

	assertEquals("ISO-8859-1", metadata.get(Metadata.CONTENT_ENCODING));
	}

	private void assertExtractText(String msg, String expected, byte[] input)
	throws Exception {
	ContentHandler handler = new BodyContentHandler() {
	public void ignorableWhitespace(char[] ch, int off, int len) {
	// Ignore the whitespace added by XHTMLContentHandler
	}
	};
	Metadata metadata = new Metadata();
	parser.parse(new ByteArrayInputStream(input), handler, metadata, new ParseContext());
	assertEquals("text/plain", metadata.get(Metadata.CONTENT_TYPE));
	assertEquals(msg, expected, handler.toString());
	}

	/**
	* Test case for TIKA-339: don't override incoming language
	*
	* @see <a href="https://issues.apache.org/jira/browse/TIKA-335">TIKA-335</a>
	*/
	public void testRetainIncomingLanguage() throws Exception {
	final String test = "Simple Content";

	Metadata metadata = new Metadata();
	metadata.set(Metadata.LANGUAGE, "en");

	parser.parse(
	new ByteArrayInputStream(test.getBytes("UTF-8")),
	new BodyContentHandler(), metadata, new ParseContext());

	assertEquals("en", metadata.get(Metadata.LANGUAGE));
	}

	public void testCP866() throws Exception {
	Metadata metadata = new Metadata();
	StringWriter writer = new StringWriter();
	parser.parse(
	TXTParserTest.class.getResourceAsStream("/test-documents/russian.cp866.txt"),
	new WriteOutContentHandler(writer),
	metadata,
	new ParseContext());

	assertEquals("text/plain", metadata.get(Metadata.CONTENT_TYPE));
	assertEquals("IBM866", metadata.get(Metadata.CONTENT_ENCODING));
	}

	}