blob: a439b05d43a43ddb63ea42b65912d43880fce36a [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.tika.parser.microsoft.rtf;
import org.apache.commons.io.FilenameUtils;
import org.apache.tika.TikaTest;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.RTFMetadata;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.RecursiveParserWrapper;
import org.apache.tika.sax.BasicContentHandlerFactory;
import org.apache.tika.sax.RecursiveParserWrapperHandler;
import org.junit.Assert;
import org.junit.Test;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertFalse;
import static org.junit.Assert.assertNotNull;
import static org.junit.Assert.assertTrue;
/**
 * Tests of RTF parsing that inspect the metadata reported for the files and
 * images found inside RTF test documents.
 */
public class RTFParserTest extends TikaTest {

    /**
     * TIKA-1010: recursively parses {@code testRTFEmbeddedFiles.rtf} and checks
     * the file name and content type recorded at selected positions of the
     * resulting metadata list.
     */
    @Test
    public void testEmbeddedMonster() throws Exception {
        // key: position in the recursive metadata list,
        // value: file name and content type expected at that position
        Map<Integer, Pair> expected = new HashMap<>();
        expected.put(3, new Pair("Hw.txt", "text/plain; charset=ISO-8859-1"));
        expected.put(4, new Pair("file_0.doc", "application/msword"));
        expected.put(7, new Pair("file_1.xlsx",
                "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"));
        expected.put(10, new Pair("text.html", "text/html; charset=windows-1252"));
        expected.put(11, new Pair("html-within-zip.zip", "application/zip"));
        expected.put(12, new Pair("test-zip-of-zip_\u666E\u6797\u65AF\u987F.zip",
                "application/zip"));
        expected.put(15, new Pair("testHTML_utf8_\u666E\u6797\u65AF\u987F.html",
                "text/html; charset=UTF-8"));
        expected.put(18, new Pair("testJPEG_\u666E\u6797\u65AF\u987F.jpg", "image/jpeg"));
        expected.put(21, new Pair("file_2.xls", "application/vnd.ms-excel"));
        expected.put(24, new Pair("testMSG_\u666E\u6797\u65AF\u987F.msg",
                "application/vnd.ms-outlook"));
        expected.put(27, new Pair("file_3.pdf", "application/pdf"));
        expected.put(30, new Pair("file_4.ppt", "application/vnd.ms-powerpoint"));
        expected.put(33, new Pair("thumbnail.jpeg", "image/jpeg"));
        expected.put(34, new Pair("file_5.pptx",
                "application/vnd.openxmlformats-officedocument.presentationml.presentation"));
        expected.put(37, new Pair("file_6.doc", "application/msword"));
        expected.put(40, new Pair("file_7.doc", "application/msword"));
        expected.put(43, new Pair("file_8.docx",
                "application/vnd.openxmlformats-officedocument.wordprocessingml.document"));
        expected.put(46, new Pair("testJPEG_\u666E\u6797\u65AF\u987F.jpg", "image/jpeg"));

        List<Metadata> metadataList = getRecursiveMetadata("testRTFEmbeddedFiles.rtf");
        assertEquals(49, metadataList.size());

        for (Map.Entry<Integer, Pair> e : expected.entrySet()) {
            // HashMap iteration order is unspecified, so every assertion names
            // the index it is checking to make a failure easy to locate.
            int index = e.getKey();
            Metadata metadata = metadataList.get(index);
            Pair p = e.getValue();
            assertNotNull("resource name missing at index " + index,
                    metadata.get(TikaCoreProperties.RESOURCE_NAME_KEY));
            //necessary to getName() because MSOffice extractor includes
            //directory: _1457338524/HW.txt
            Assert.assertEquals("filename at index " + index,
                    p.fileName, FilenameUtils.getName(
                            metadata.get(TikaCoreProperties.EMBEDDED_RESOURCE_PATH)));
            assertEquals("content type at index " + index,
                    p.mimeType, metadata.get(Metadata.CONTENT_TYPE));
        }

        // Written with Unicode escapes, like the expected names above, so the
        // literal does not depend on the encoding used to compile this file.
        assertEquals(
                "C:\\Users\\tallison\\AppData\\Local\\Temp\\testJPEG_\u666E\u6797\u65AF\u987F.jpg",
                metadataList.get(46).get(TikaCoreProperties.ORIGINAL_RESOURCE_NAME));
    }

    /**
     * TIKA-1010: test regular (not "embedded") images/picts. Parses
     * {@code testRTFRegularImages.rtf} through a {@link RecursiveParserWrapper}
     * and checks the metadata collected at positions 1 and 3 of the handler's
     * metadata list.
     */
    @Test
    public void testRegularImages() throws Exception {
        ParseContext ctx = new ParseContext();
        RecursiveParserWrapper parser = new RecursiveParserWrapper(AUTO_DETECT_PARSER);
        // IGNORE handler type: only the collected metadata is inspected below
        RecursiveParserWrapperHandler handler = new RecursiveParserWrapperHandler(
                new BasicContentHandlerFactory(
                        BasicContentHandlerFactory.HANDLER_TYPE.IGNORE, -1), -1);
        Metadata rootMetadata = new Metadata();
        rootMetadata.add(TikaCoreProperties.RESOURCE_NAME_KEY, "testRTFRegularImages.rtf");
        try (TikaInputStream tis = TikaInputStream.get(
                getResourceAsStream("/test-documents/testRTFRegularImages.rtf"))) {
            parser.parse(tis, handler, rootMetadata, ctx);
        }

        List<Metadata> metadatas = handler.getMetadataList();
        // per the original notes: index 1 is the JPEG carrying EXIF data
        // ("testJPEG_EXIF_..." image), index 3 is the plain "testJPEG_..." image
        Metadata metaJpgExif = metadatas.get(1);
        Metadata metaJpg = metadatas.get(3);
        assertNotNull("metadata at index 1", metaJpgExif);
        assertNotNull("metadata at index 3", metaJpg);

        assertTrue(Arrays.asList(metaJpgExif.getValues(TikaCoreProperties.SUBJECT))
                .contains("serbor"));
        assertTrue(metaJpg.get(TikaCoreProperties.COMMENTS).contains("Licensed to the Apache"));
        //make sure old metadata doesn't linger between objects
        assertFalse(Arrays.asList(metaJpg.getValues(TikaCoreProperties.SUBJECT))
                .contains("serbor"));

        assertEquals("false", metaJpg.get(RTFMetadata.THUMBNAIL));
        assertEquals("false", metaJpgExif.get(RTFMetadata.THUMBNAIL));

        // total number of metadata keys reported for each image
        assertEquals(51, metaJpg.names().length);
        assertEquals(110, metaJpgExif.names().length);
    }

    /** Expected file name and content type of one entry in the metadata list. */
    private static class Pair {
        final String fileName;
        final String mimeType;

        Pair(String fileName, String mimeType) {
            this.fileName = fileName;
            this.mimeType = mimeType;
        }
    }
}