| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| package org.apache.tika.parser.microsoft.rtf; |
| |
| import static org.junit.Assert.assertEquals; |
| import static org.junit.Assert.assertFalse; |
| import static org.junit.Assert.assertNotNull; |
| import static org.junit.Assert.assertTrue; |
| |
| import java.io.File; |
| import java.io.InputStream; |
| import java.util.Arrays; |
| import java.util.HashSet; |
| import java.util.List; |
| import java.util.Set; |
| |
| import org.junit.Test; |
| |
| import org.apache.tika.Tika; |
| import org.apache.tika.TikaTest; |
| import org.apache.tika.config.TikaConfig; |
| import org.apache.tika.extractor.ContainerExtractor; |
| import org.apache.tika.extractor.ParserContainerExtractor; |
| import org.apache.tika.io.TikaInputStream; |
| import org.apache.tika.metadata.Metadata; |
| import org.apache.tika.metadata.Office; |
| import org.apache.tika.metadata.OfficeOpenXMLCore; |
| import org.apache.tika.metadata.TikaCoreProperties; |
| import org.apache.tika.mime.MediaType; |
| import org.apache.tika.parser.AutoDetectParser; |
| import org.apache.tika.parser.Parser; |
| |
/**
 * JUnit test class for the Tika {@link RTFParser}.
 */
| public class RTFParserTest extends TikaTest { |
| |
| @Test |
| public void testBasicExtraction() throws Exception { |
| |
| Metadata metadata = new Metadata(); |
| String content = getText("testRTF.rtf", metadata); |
| |
| assertEquals("application/rtf", metadata.get(Metadata.CONTENT_TYPE)); |
| assertEquals(1, metadata.getValues(Metadata.CONTENT_TYPE).length); |
| assertContains("Test", content); |
| assertContains("indexation Word", content); |
| } |
| |
| @Test |
| public void testUmlautSpacesExtraction2() throws Exception { |
| String content = getText("testRTFUmlautSpaces2.rtf"); |
| content = content.replaceAll("\\s+", ""); |
| assertEquals("\u00DCbersicht", content); |
| } |
| |
| @Test |
| public void testUnicodeUCNControlWordCharacterDoublingExtraction() throws Exception { |
| String content = getText("testRTFUnicodeUCNControlWordCharacterDoubling.rtf"); |
| |
| assertContains("\u5E74", content); |
| assertContains("\u5ff5", content); |
| assertContains("0 ", content); |
| assertContains("abc", content); |
| assertFalse("Doubled character \u5E74", content.contains("\u5E74\u5E74")); |
| } |
| |
| @Test |
| public void testHexEscapeInsideWord() throws Exception { |
| String content = getText("testRTFHexEscapeInsideWord.rtf"); |
| assertContains("ESP\u00cdRITO", content); |
| } |
| |
| @Test |
| public void testWindowsCodepage1250() throws Exception { |
| String content = getText("testRTFWindowsCodepage1250.rtf"); |
| assertContains("za\u017c\u00f3\u0142\u0107 g\u0119\u015bl\u0105 ja\u017a\u0144", content); |
| assertContains("ZA\u017b\u00d3\u0141\u0106 G\u0118\u015aL\u0104 JA\u0179\u0143", content); |
| } |
| |
| @Test |
| public void testTableCellSeparation() throws Exception { |
| String content = getText("testRTFTableCellSeparation.rtf"); |
| content = content.replaceAll("\\s+", " "); |
| assertContains("a b c d \u00E4 \u00EB \u00F6 \u00FC", content); |
| assertContains("a b c d \u00E4 \u00EB \u00F6 \u00FC", content); |
| } |
| |
| @Test |
| public void testTableCellSeparation2() throws Exception { |
| String content = getText("testRTFTableCellSeparation2.rtf"); |
| // TODO: why do we insert extra whitespace...? |
| content = content.replaceAll("\\s+", " "); |
| assertContains("Station Fax", content); |
| } |
| |
| @Test |
| public void testWordPadCzechCharactersExtraction() throws Exception { |
| String s1 = getText("testRTFWordPadCzechCharacters.rtf"); |
| assertTrue(s1.contains("\u010Cl\u00E1nek t\u00FDdne")); |
| assertTrue(s1.contains( |
| "starov\u011Bk\u00E9 \u017Eidovsk\u00E9 n\u00E1bo\u017Eensk\u00E9 texty")); |
| } |
| |
| @Test |
| public void testWord2010CzechCharactersExtraction() throws Exception { |
| String s1 = getText("testRTFWord2010CzechCharacters.rtf"); |
| assertTrue(s1.contains("\u010Cl\u00E1nek t\u00FDdne")); |
| assertTrue(s1.contains( |
| "starov\u011Bk\u00E9 \u017Eidovsk\u00E9 n\u00E1bo\u017Eensk\u00E9 texty")); |
| } |
| |
| @Test |
| public void testMS932Extraction() throws Exception { |
| XMLResult xmlResult = getXML("testRTF-ms932.rtf"); |
| // Hello in Japanese |
| assertTrue(xmlResult.xml.contains("\u3053\u3093\u306b\u3061\u306f")); |
| |
| // Verify title, since it was also encoded with MS932: |
| assertEquals("\u30bf\u30a4\u30c8\u30eb", xmlResult.metadata.get(TikaCoreProperties.TITLE)); |
| } |
| |
| @Test |
| public void testUmlautSpacesExtraction() throws Exception { |
| assertContains("\u00DCbersicht", getText("testRTFUmlautSpaces.rtf")); |
| } |
| |
| @Test |
| public void testGothic() throws Exception { |
| String content = getText("testRTFUnicodeGothic.rtf"); |
| assertContains("\uD800\uDF32\uD800\uDF3f\uD800\uDF44\uD800\uDF39\uD800\uDF43\uD800\uDF3A", |
| content); |
| } |
| |
| @Test |
| public void testJapaneseText() throws Exception { |
| XMLResult r = getXML("testRTFJapanese.rtf"); |
| String content = r.xml; |
| |
| // Verify title -- this title uses upr escape inside |
| // title info field: |
| assertEquals( |
| "\u30be\u30eb\u30b2\u3068\u5c3e\u5d0e\u3001\u6de1\u3005\u3068\u6700\u671f\u3000", |
| r.metadata.get(TikaCoreProperties.TITLE)); |
| assertEquals("VMazel", r.metadata.get(TikaCoreProperties.CREATOR)); |
| assertEquals("StarWriter", r.metadata.get(TikaCoreProperties.COMMENTS)); |
| |
| // Special version of (GHQ) |
| assertContains("\uff08\uff27\uff28\uff31\uff09", content); |
| |
| // 6 other characters |
| assertContains("\u6771\u4eac\u90fd\u4e09\u9df9\u5e02", content); |
| } |
| |
| @Test |
| public void testMaxLength() throws Exception { |
| File file = getResourceAsFile("/test-documents/testRTFJapanese.rtf"); |
| Metadata metadata = new Metadata(); |
| InputStream stream = TikaInputStream.get(file, metadata); |
| |
| // Test w/ default limit: |
| Tika localTika = new Tika(); |
| String content = localTika.parseToString(stream, metadata); |
| // parseToString closes for convenience: |
| //stream.close(); |
| assertTrue(content.length() > 500); |
| |
| // Test setting max length on the instance: |
| localTika.setMaxStringLength(200); |
| stream = TikaInputStream.get(file, metadata); |
| content = localTika.parseToString(stream, metadata); |
| |
| // parseToString closes for convenience: |
| //stream.close(); |
| assertTrue(content.length() <= 200); |
| |
| // Test setting max length per-call: |
| stream = TikaInputStream.get(file, metadata); |
| content = localTika.parseToString(stream, metadata, 100); |
| // parseToString closes for convenience: |
| //stream.close(); |
| assertTrue(content.length() <= 100); |
| } |
| |
| @Test |
| public void testTextWithCurlyBraces() throws Exception { |
| String content = getText("testRTFWithCurlyBraces.rtf"); |
| assertContains("{ some text inside curly brackets }", content); |
| } |
| |
| @Test |
| public void testControls() throws Exception { |
| String content = getText("testRTFControls.rtf"); |
| assertContains("Thiswordhasanem\u2014dash", content); |
| assertContains("Thiswordhasanen\u2013dash", content); |
| assertContains("Thiswordhasanon\u2011breakinghyphen", content); |
| assertContains("Thiswordhasanonbreaking\u00a0space", content); |
| assertContains("Thiswordhasanoptional\u00adhyphen", content); |
| assertContains("\u2018Single quoted text\u2019", content); |
| assertContains("\u201cDouble quoted text\u201d", content); |
| assertContains("\u201cDouble quoted text again\u201d", content); |
| } |
| |
| @Test |
| public void testInvalidUnicode() throws Exception { |
| String content = getText("testRTFInvalidUnicode.rtf"); |
| assertContains("Unpaired hi \ufffd here", content); |
| assertContains("Unpaired lo \ufffd here", content); |
| assertContains("Mismatched pair \ufffd\ufffd here", content); |
| } |
| |
| @Test |
| public void testVarious() throws Exception { |
| Metadata metadata = new Metadata(); |
| String content = getText("testRTFVarious.rtf", metadata); |
| assertContains("Footnote appears here", content); |
| assertContains("This is a footnote.", content); |
| assertContains("This is the header text.", content); |
| assertContains("This is the footer text.", content); |
| assertContains("Here is a text box", content); |
| assertContains("Bold", content); |
| assertContains("italic", content); |
| assertContains("underline", content); |
| assertContains("superscript", content); |
| assertContains("subscript", content); |
| assertContains("Here is a citation:", content); |
| assertContains("Figure 1 This is a caption for Figure 1", content); |
| assertContains("(Kramer)", content); |
| |
| // Table |
| assertContains("Row 1 Col 1 Row 1 Col 2 Row 1 Col 3 Row 2 Col 1 Row 2 Col 2 Row 2 Col 3", |
| content.replaceAll("\\s+", " ")); |
| |
| // 2-columns |
| assertContains("Row 1 column 1 Row 2 column 1 Row 1 column 2 Row 2 column 2", |
| content.replaceAll("\\s+", " ")); |
| assertContains("This is a hyperlink", content); |
| assertContains("Here is a list:", content); |
| for (int row = 1; row <= 3; row++) { |
| assertContains("Bullet " + row, content); |
| } |
| assertContains("Here is a numbered list:", content); |
| for (int row = 1; row <= 3; row++) { |
| assertContains("Number bullet " + row, content); |
| } |
| |
| for (int row = 1; row <= 2; row++) { |
| for (int col = 1; col <= 3; col++) { |
| assertContains("Row " + row + " Col " + col, content); |
| } |
| } |
| |
| assertContains("Keyword1 Keyword2", content); |
| assertContains("Keyword1 Keyword2", Arrays.asList(metadata.getValues(Office.KEYWORDS))); |
| |
| assertContains("Subject is here", content); |
| assertEquals("Subject is here", metadata.get(OfficeOpenXMLCore.SUBJECT)); |
| |
| assertContains("Suddenly some Japanese text:", content); |
| // Special version of (GHQ) |
| assertContains("\uff08\uff27\uff28\uff31\uff09", content); |
| // 6 other characters |
| assertContains("\u30be\u30eb\u30b2\u3068\u5c3e\u5d0e\u3001\u6de1\u3005\u3068\u6700\u671f", |
| content); |
| |
| assertContains("And then some Gothic text:", content); |
| assertContains("\uD800\uDF32\uD800\uDF3f\uD800\uDF44\uD800\uDF39\uD800\uDF43\uD800\uDF3A", |
| content); |
| } |
| |
| @Test |
| public void testVariousStyle() throws Exception { |
| String content = getXML("testRTFVarious.rtf").xml; |
| assertContains("<b>Bold</b>", content); |
| assertContains("<i>italic</i>", content); |
| } |
| |
| @Test |
| public void testBoldItalic() throws Exception { |
| String content = getXML("testRTFBoldItalic.rtf").xml; |
| assertContains("<b>bold</b>", content); |
| assertContains("<b>bold </b><b><i>italic</i></b>", content); |
| assertContains("<b><i>italic </i></b><b>bold</b>", content); |
| assertContains("<i>italic</i>", content); |
| assertContains("<b>bold then </b><b><i>italic then</i></b><i> not bold</i>", content); |
| assertContains("<i>italic then </i><b><i>bold then</i></b><b> not italic</b>", content); |
| } |
| |
| @Test |
| public void testHyperlink() throws Exception { |
| String content = getXML("testRTFHyperlink.rtf").xml; |
| assertContains( |
| "our most <a href=\"" + |
| "http://r.office.microsoft.com/r/rlidwelcomeFAQ?clid=1033\">" + |
| "frequently asked questions</a>", |
| content); |
| assertEquals(-1, content.indexOf("<p>\t\t</p>")); |
| } |
| |
| @Test |
| public void testIgnoredControlWord() throws Exception { |
| assertContains("<p>The quick brown fox jumps over the lazy dog</p>", |
| getXML("testRTFIgnoredControlWord.rtf").xml); |
| } |
| |
| @Test |
| public void testFontAfterBufferedText() throws Exception { |
| assertContains( |
| "\u0423\u0432\u0430\u0436\u0430\u0435\u043c\u044b\u0439" + |
| " \u043a\u043b\u0438\u0435\u043d\u0442!", |
| getXML("testFontAfterBufferedText.rtf").xml); |
| } |
| |
| @Test |
| public void testListMicrosoftWord() throws Exception { |
| String content = getXML("testRTFListMicrosoftWord.rtf").xml; |
| assertContains("<ol>\t<li>one</li>", content); |
| assertContains("</ol>", content); |
| assertContains("<ul>\t<li>first</li>", content); |
| assertContains("</ul>", content); |
| } |
| |
| @Test |
| public void testTurningOffList() throws Exception { |
| try (InputStream is = getResourceAsStream( |
| "/org/apache/tika/parser/microsoft/rtf/ignoreListMarkup-tika-config.xml")) { |
| assertNotNull(is); |
| TikaConfig tikaConfig = new TikaConfig(is); |
| Parser p = new AutoDetectParser(tikaConfig); |
| String content = getXML("testRTFListMicrosoftWord.rtf", p).xml; |
| assertNotContained("<ol>", content); |
| assertNotContained("<ul>", content); |
| assertNotContained("<li>", content); |
| } |
| } |
| |
| @Test |
| public void testListLibreOffice() throws Exception { |
| String content = getXML("testRTFListLibreOffice.rtf").xml; |
| assertContains("<ol>\t<li>one</li>", content); |
| assertContains("</ol>", content); |
| assertContains("<ul>\t<li>first</li>", content); |
| assertContains("</ul>", content); |
| } |
| |
| // TIKA-782 |
| @Test |
| public void testBinControlWord() throws Exception { |
| ByteCopyingHandler embHandler = new ByteCopyingHandler(); |
| try (TikaInputStream tis = TikaInputStream |
| .get(getResourceAsStream("/test-documents/testBinControlWord.rtf"))) { |
| ContainerExtractor ex = new ParserContainerExtractor(); |
| assertEquals(true, ex.isSupported(tis)); |
| ex.extract(tis, ex, embHandler); |
| } |
| assertEquals(1, embHandler.bytes.size()); |
| |
| byte[] bytes = embHandler.bytes.get(0); |
| assertEquals(10, bytes.length); |
| //} |
| assertEquals(125, (int) bytes[4]); |
| //make sure that at least the last value is correct |
| assertEquals(-1, (int) bytes[9]); |
| } |
| |
| // TIKA-999 |
| @Test |
| public void testMetaDataCounts() throws Exception { |
| XMLResult xml = getXML("testRTFWord2010CzechCharacters.rtf"); |
| assertEquals("1", xml.metadata.get(Office.PAGE_COUNT)); |
| assertEquals("70", xml.metadata.get(Office.WORD_COUNT)); |
| assertEquals("401", xml.metadata.get(Office.CHARACTER_COUNT)); |
| assertTrue(xml.metadata.get(TikaCoreProperties.CREATED).startsWith("2010-10-13T")); |
| } |
| |
| // TIKA-1192 |
| @Test |
| public void testListOverride() throws Exception { |
| assertContains("Body", getText("testRTFListOverride.rtf")); |
| } |
| |
| // TIKA-1305 |
| @Test |
| public void testCorruptListOverride() throws Exception { |
| assertContains("apple", getText("testRTFCorruptListOverride.rtf")); |
| } |
| |
| |
| @Test |
| public void testMultipleNewlines() throws Exception { |
| String content = getXML("testRTFNewlines.rtf").xml; |
| content = content.replaceAll("[\r\n]+", " "); |
| assertContains("<body><p>one</p> " + "<p /> " + "<p>two</p> " + "<p /> " + "<p /> " + |
| "<p>three</p> " + "<p /> " + "<p /> " + "<p /> " + "<p>four</p>", content); |
| } |
| |
| //TIKA-1010 test linked embedded doc |
| @Test |
| public void testEmbeddedLinkedDocument() throws Exception { |
| Set<MediaType> skipTypes = new HashSet<>(); |
| skipTypes.add(MediaType.parse("image/emf")); |
| skipTypes.add(MediaType.parse("image/wmf")); |
| |
| TrackingHandler tracker = new TrackingHandler(skipTypes); |
| try (TikaInputStream tis = TikaInputStream |
| .get(getResourceAsStream("/test-documents/testRTFEmbeddedLink.rtf"))) { |
| ContainerExtractor ex = new ParserContainerExtractor(); |
| assertEquals(true, ex.isSupported(tis)); |
| ex.extract(tis, ex, tracker); |
| } |
| //should gracefully skip link and not throw NPE, IOEx, etc |
| assertEquals(0, tracker.filenames.size()); |
| |
| tracker = new TrackingHandler(); |
| try (TikaInputStream tis = TikaInputStream |
| .get(getResourceAsStream("/test-documents/testRTFEmbeddedLink.rtf"))) { |
| ContainerExtractor ex = new ParserContainerExtractor(); |
| assertEquals(true, ex.isSupported(tis)); |
| ex.extract(tis, ex, tracker); |
| } |
| //should gracefully skip link and not throw NPE, IOEx, etc |
| assertEquals(2, tracker.filenames.size()); |
| } |
| |
| @Test |
| public void testConfig() throws Exception { |
| //test that memory allocation of the bin element is limited |
| //via the config file. Unfortunately, this test file's bin embedding contains 10 bytes |
| //so we had to set the config to 0. |
| try (InputStream is = getResourceAsStream( |
| "/org/apache/tika/parser/microsoft/rtf/tika-config.xml")) { |
| assertNotNull(is); |
| TikaConfig tikaConfig = new TikaConfig(is); |
| Parser p = new AutoDetectParser(tikaConfig); |
| List<Metadata> metadataList = getRecursiveMetadata("testBinControlWord.rtf", p); |
| assertEquals(1, metadataList.size()); |
| assertContains("TikaMemoryLimitException", metadataList.get(0) |
| .get(TikaCoreProperties.TIKA_META_EXCEPTION_EMBEDDED_STREAM)); |
| } |
| } |
| |
| @Test |
| public void testBoldPlain() throws Exception { |
| //TIKA-2410 -- bold should be turned off by "plain" |
| XMLResult r = getXML("testRTFBoldPlain.rtf"); |
| assertContains("<b>Hank</b>", r.xml); |
| assertNotContained("<b>Anna Smith", r.xml); |
| } |
| |
| @Test |
| public void testSpacingInAnnotations() throws Exception { |
| //TIKA-2838 |
| assertContains("supercali ATB Allison, Timothy B. This is a comment fragilistic", |
| getXML("testRTF_annotation_spacing.rtf").xml); |
| } |
| |
| @Test |
| public void testTIKA1713() throws Exception { |
| assertContains("For discussion", getXML("testRTFTIKA_1713.rtf").xml); |
| } |
| |
| @Test |
| public void testTIKA2150() throws Exception { |
| assertContains("TO\tFROM", getXML("testRTFTIKA_2150.rtf").xml); |
| } |
| |
| @Test |
| public void testTIKA2500() throws Exception { |
| assertContains("Level1", getXML("testRTFTIKA_2500.rtf").xml); |
| } |
| |
| @Test |
| public void testTIKA2883() throws Exception { |
| assertContains("This message has been archived.", getXML("testRTFTIKA_2883.rtf").xml); |
| } |
| |
| @Test |
| public void testTIKA2899() throws Exception { |
| assertContains("this Agreement on today", getXML("testRTFTIKA_2899.rtf").xml); |
| } |
| |
| } |