blob: 485b9780df7f297ee798e528792c1dcb00e8a8d7 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.tika.mime;
import static java.nio.charset.StandardCharsets.US_ASCII;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertNotNull;
import static org.junit.Assert.assertTrue;
import static org.junit.Assert.fail;

import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.lang.reflect.Field;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.List;
import java.util.Set;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.Future;
import java.util.concurrent.TimeUnit;

import org.junit.After;
import org.junit.Before;
import org.junit.Test;

import org.apache.tika.config.TikaConfig;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaCoreProperties;
/**
* These tests try to ensure that the MimeTypesReader
* has correctly processed the mime-types.xml file.
* To do this, it tests that various aspects of the
* mime-types.xml file have ended up correctly as
* globs, matches, magics etc.
* <p>
* If you make updates to mime-types.xml, then the
* checks in this test may no longer hold true.
* As such, if tests here start failing after your
* changes, please review the test details, and
* update it to match the new state of the file!
*/
public class MimeTypesReaderTest {

    /** Directory (relative to the module root) holding the custom mime-type XML fixtures. */
    private static final String TEST_MIME_DIR = "src/test/resources/org/apache/tika/mime/";

    /**
     * Cooperative halt flag polled by both loops of {@link #testMultiThreaded()}.
     * Declared {@code volatile} because it is read from two threads.
     */
    static volatile boolean stop = false;

    /** Default mime repository, re-fetched before every test. */
    private MimeTypes mimeTypes;

    /** The repository's private {@code magics} list, obtained reflectively in {@link #setUp()}. */
    private List<Magic> magics;

    /** Value of the custom-mimes system property before the test ran; may be {@code null}. */
    private String customMimeTypes;

    /**
     * Runs detection over the UTF-8 bytes of {@code text} and returns the detected type's
     * string form.
     *
     * @param mimeTypes repository to detect with
     * @param text      content to feed to the detector
     * @param metadata  metadata passed through to the detector
     * @return {@code toString()} of the detected type
     * @throws IOException if detection fails while reading the stream
     */
    private static String getTypeAsString(MimeTypes mimeTypes, String text, Metadata metadata)
            throws IOException {
        return mimeTypes
                .detect(new ByteArrayInputStream(text.getBytes(StandardCharsets.UTF_8)), metadata)
                .toString();
    }

    /**
     * Points the custom-mimes system property at a fixture below {@link #TEST_MIME_DIR} and
     * loads a repository through a brand-new class loader.
     * <p>
     * NOTE(review): the fresh {@link CustomClassLoader} is presumably what forces the
     * repository to be rebuilt instead of being served from a cache - confirm against
     * {@code MimeTypes.getDefaultMimeTypes(ClassLoader)}.
     *
     * @param fixtureName file name of the XML fixture inside {@link #TEST_MIME_DIR}
     * @return the repository loaded with the fixture in effect
     * @throws Exception whatever loading the (possibly invalid) fixture throws
     */
    private static MimeTypes loadWithCustomMimes(String fixtureName) throws Exception {
        System.setProperty(MimeTypesFactory.CUSTOM_MIMES_SYS_PROP, TEST_MIME_DIR + fixtureName);
        return MimeTypes.getDefaultMimeTypes(new CustomClassLoader());
    }

    /**
     * Fetches the default repository, exposes its private {@code magics} list via reflection
     * and remembers the current custom-mimes system property so that {@link #tearDown()} can
     * put it back.
     */
    @Before
    public void setUp() throws NoSuchFieldException, SecurityException, IllegalArgumentException,
            IllegalAccessException {
        this.mimeTypes = TikaConfig.getDefaultConfig().getMimeRepository();

        Field magicsField = mimeTypes.getClass().getDeclaredField("magics");
        magicsField.setAccessible(true);
        // The field is read reflectively, so the generic type cannot be checked by the compiler.
        @SuppressWarnings("unchecked")
        List<Magic> registeredMagics = (List<Magic>) magicsField.get(mimeTypes);
        this.magics = registeredMagics;

        //ensure reset of custom mimes path
        customMimeTypes = System.getProperty(MimeTypesFactory.CUSTOM_MIMES_SYS_PROP);
    }

    /**
     * Restores the custom-mimes system property to the value captured in {@link #setUp()},
     * clearing it when it was originally unset.
     */
    @After
    public void tearDown() {
        if (customMimeTypes == null) {
            System.clearProperty(MimeTypesFactory.CUSTOM_MIMES_SYS_PROP);
        } else {
            System.setProperty(MimeTypesFactory.CUSTOM_MIMES_SYS_PROP, customMimeTypes);
        }
    }

    /**
     * Asserts that a type carries at least {@code minMatches} magics, both on the
     * {@link MimeType} itself and in the repository-wide magics list.
     *
     * @param typeName   full media type name, e.g. {@code text/html}
     * @param label      human-readable label used in the failure message
     * @param minMatches minimum number of magics expected
     * @throws MimeTypeException if {@code typeName} is rejected by the repository
     */
    private void assertMinMagics(String typeName, String label, int minMatches)
            throws MimeTypeException {
        // Check on the type
        MimeType type = mimeTypes.forName(typeName);
        assertTrue(type.hasMagic());
        assertTrue("There should be at least " + minMatches + " " + label + " matches, found " +
                type.getMagics().size(), type.getMagics().size() >= minMatches);

        // Check on the overall magics
        List<Magic> matching = new ArrayList<>();
        for (Magic magic : magics) {
            if (magic.getType().toString().equals(typeName)) {
                matching.add(magic);
            }
        }
        assertTrue("There should be at least " + minMatches + " " + label + " matches, found " +
                matching.size(), matching.size() >= minMatches);
    }

    @Test
    public void testHtmlMatches() throws Exception {
        assertMinMagics("text/html", "HTML", 10);
    }

    @Test
    public void testExcelMatches() throws Exception {
        assertMinMagics("application/vnd.ms-excel", "Excel", 4);
    }

    /**
     * @since TIKA-515
     */
    @Test
    public void testReadComment() throws Exception {
        // Let any exception propagate so JUnit reports it with its full stack trace.
        assertNotNull(this.mimeTypes.forName("application/msword").getDescription());
    }

    /**
     * @since TIKA-1012
     */
    @Test
    public void testReadExtendedMetadata() throws Exception {
        MimeType mime = this.mimeTypes.forName("image/bmp");
        assertEquals("BMP", mime.getAcronym());
        assertEquals("com.microsoft.bmp", mime.getUniformTypeIdentifier());
        assertEquals("http://en.wikipedia.org/wiki/BMP_file_format",
                mime.getLinks().get(0).toString());

        mime = this.mimeTypes.forName("application/xml");
        assertEquals("XML", mime.getAcronym());
        assertEquals("public.xml", mime.getUniformTypeIdentifier());
        assertEquals("http://en.wikipedia.org/wiki/Xml", mime.getLinks().get(0).toString());
    }

    @Test
    public void testReadParameterHierarchy() throws Exception {
        MimeType mimeBTree4 =
                this.mimeTypes.forName("application/x-berkeley-db;format=btree;version=4");
        MediaType mtBTree4 = mimeBTree4.getType();

        // Canonicalised with spaces
        assertEquals("application/x-berkeley-db; format=btree; version=4", mimeBTree4.toString());
        assertEquals("application/x-berkeley-db; format=btree; version=4", mtBTree4.toString());

        // Parent has one parameter
        MediaType mtBTree = this.mimeTypes.getMediaTypeRegistry().getSupertype(mtBTree4);
        assertEquals("application/x-berkeley-db; format=btree", mtBTree.toString());

        // Parent has several children, for versions 2 through 4
        Set<MediaType> mtBTreeChildren =
                this.mimeTypes.getMediaTypeRegistry().getChildTypes(mtBTree);
        assertTrue(mtBTreeChildren.toString(), mtBTreeChildren.size() >= 3);
        assertTrue(mtBTreeChildren.toString(), mtBTreeChildren.contains(mtBTree4));

        // Parent of that has none
        MediaType mtBD = this.mimeTypes.getMediaTypeRegistry().getSupertype(mtBTree);
        assertEquals("application/x-berkeley-db", mtBD.toString());

        // If we use one with parameters not known in the media registry,
        // getting the parent will return the non-parameter version
        MediaType mtAlt = MediaType.application("x-berkeley-db; format=unknown; version=42");
        MediaType mtAltP = this.mimeTypes.getMediaTypeRegistry().getSupertype(mtAlt);
        assertEquals("application/x-berkeley-db", mtAltP.toString());
    }

    /**
     * TIKA-746 Ensures that the custom mimetype maps were also
     * loaded and used
     */
    @Test
    public void testCustomMimeTypes() throws Exception {
        // Check that it knows about our three special ones
        MimeType hw = this.mimeTypes.forName("hello/world");
        MimeType hwf = this.mimeTypes.forName("hello/world-file");
        MimeType hxw = this.mimeTypes.forName("hello/x-world-hello");
        assertNotNull(hw);
        assertNotNull(hwf);
        assertNotNull(hxw);

        // Check that the details come through as expected
        // The parent has no comments, globs, magic etc
        assertEquals("", hw.getDescription());
        assertEquals("", hw.getExtension());
        assertEquals(0, hw.getExtensions().size());
        assertEquals(0, hw.getMagics().size());

        // The file one does
        assertEquals("A \"Hello World\" file", hwf.getDescription());
        assertEquals(".hello.world", hwf.getExtension());
        assertEquals(1, hwf.getMagics().size());

        // The alternate one has most
        assertEquals("", hxw.getDescription());
        assertEquals(".x-hello-world", hxw.getExtension());
        assertEquals(1, hxw.getMagics().size());

        // Check that we can correctly detect with the file one:
        // By name
        Metadata m = new Metadata();
        m.add(TikaCoreProperties.RESOURCE_NAME_KEY, "test.hello.world");
        assertEquals(hwf.toString(), this.mimeTypes.detect(null, m).toString());

        m = new Metadata();
        m.add(TikaCoreProperties.RESOURCE_NAME_KEY, "test.x-hello-world");
        assertEquals(hxw.toString(), this.mimeTypes.detect(null, m).toString());

        // By contents - picks the x one as that sorts later
        m = new Metadata();
        ByteArrayInputStream s = new ByteArrayInputStream("Hello, World!".getBytes(US_ASCII));
        assertEquals(hxw.toString(), this.mimeTypes.detect(s, m).toString());
    }

    /**
     * TIKA-2460 Test loading of custom-mimetypes.xml from sys prop.
     */
    @Test
    public void testExternalMimeTypes() throws Exception {
        MimeTypes external = loadWithCustomMimes("external-mimetypes.xml");

        Metadata m = new Metadata();
        m.add(TikaCoreProperties.RESOURCE_NAME_KEY, "test.external.mime.type");
        assertEquals("external/mime-type", external.detect(null, m).toString());
    }

    @Test
    public void testGetExtensionForPowerPoint() throws Exception {
        MimeType mt = this.mimeTypes.forName("application/vnd.ms-powerpoint");
        String ext = mt.getExtension();
        assertEquals(".ppt", ext);
        assertEquals(".ppt", mt.getExtensions().get(0));
    }

    @Test
    public void testGetRegisteredMimesWithParameters() throws Exception {
        //TIKA-1692
        // Media Type always keeps details / parameters
        String name = "application/xml; charset=UTF-8";
        MediaType mt = MediaType.parse(name);
        assertEquals(name, mt.toString());

        // Mime type loses details not in the file
        MimeType mimeType = this.mimeTypes.getRegisteredMimeType(name);
        assertEquals("application/xml", mimeType.toString());
        assertEquals(".xml", mimeType.getExtension());

        // But on well-known parameters stays
        name = "application/dita+xml; format=map";
        mt = MediaType.parse(name);
        assertEquals(name, mt.toString());
        mimeType = this.mimeTypes.getRegisteredMimeType(name);
        assertEquals(name, mimeType.toString());
        assertEquals(".ditamap", mimeType.getExtension());
    }

    /**
     * Calls {@code forName} with previously unseen names on a background thread while the
     * test thread repeatedly reads aliases from the media type registry.
     * <p>
     * The background task is submitted as a {@code Callable} so that anything it throws is
     * surfaced through {@link Future#get()} and fails the test, and the executor is always
     * shut down so no worker thread outlives the test.
     */
    @Test
    public void testMultiThreaded() throws Exception {
        MimeTypes defaultTypes = MimeTypes.getDefaultMimeTypes();
        stop = false;
        ExecutorService executor = Executors.newSingleThreadExecutor();
        try {
            // Returning a value makes this lambda a Callable, which may throw checked exceptions.
            Future<?> worker = executor.submit(() -> {
                for (int i = 0; i < 500 && !stop; i++) {
                    defaultTypes.forName("abc" + i + "/abc");
                }
                return null;
            });

            for (int i = 0; i < 500 && !stop; i++) {
                defaultTypes.getMediaTypeRegistry().getAliases(MediaType.APPLICATION_ZIP);
            }

            executor.shutdown();
            if (!executor.awaitTermination(30, TimeUnit.SECONDS)) {
                fail("Background forName() worker did not finish within 30 seconds");
            }
            // Rethrows (wrapped in ExecutionException) whatever the worker threw.
            worker.get();
        } finally {
            // If the test thread failed early, tell the worker to stop and release the thread.
            stop = true;
            executor.shutdownNow();
        }
    }

    @Test
    public void testMinShouldMatch() throws Exception {
        MimeTypes custom = loadWithCustomMimes("custom-mimetypes-minShouldMatch.xml");

        //matches one
        assertEquals("hello/world-min-file",
                getTypeAsString(custom, "Hello World!", new Metadata()));
        //matches two
        assertEquals("hello/world-min-file",
                getTypeAsString(custom, "Hello Welt!", new Metadata()));
        //matches two
        assertEquals("hello/world-min-file",
                getTypeAsString(custom, "Hallo Welt!", new Metadata()));
        //missing !
        assertEquals("text/plain", getTypeAsString(custom, "Hello World", new Metadata()));
        //Hello requires world, welt or hallo; monde requires bonjour le
        assertEquals("text/plain", getTypeAsString(custom, "Hello Monde", new Metadata()));
        //this matcher is treated as "or" with minshouldmatch clause
        assertEquals("hello/world-min-file",
                getTypeAsString(custom, "Bonjour le Monde!", new Metadata()));
    }

    @Test(expected = IllegalArgumentException.class)
    public void testBadMinShouldMatch1() throws Exception {
        loadWithCustomMimes("custom-mimetypes-badMinShouldMatch1.xml");
    }

    @Test(expected = IllegalArgumentException.class)
    public void testBadMinShouldMatch2() throws Exception {
        loadWithCustomMimes("custom-mimetypes-badMinShouldMatch2.xml");
    }

    @Test(expected = NumberFormatException.class)
    public void testBadMinShouldMatch3() throws Exception {
        loadWithCustomMimes("custom-mimetypes-badMinShouldMatch3.xml");
    }

    @Test(expected = IllegalArgumentException.class)
    public void testBadMinShouldMatch4() throws Exception {
        loadWithCustomMimes("custom-mimetypes-badMinShouldMatch4.xml");
    }

    /** Empty class loader subclass; each test that needs one creates a new instance. */
    private static class CustomClassLoader extends ClassLoader {
    }
}