blob: a21269952bdc28fef1dcb878c2426f102a8f3623 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.any23.mime;
import org.junit.Assert;
import org.apache.any23.mime.purifier.WhiteSpacesPurifier;
import org.junit.After;
import org.junit.Before;
import org.junit.Test;
import org.eclipse.rdf4j.rio.RDFFormat;
import java.io.BufferedInputStream;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.nio.charset.StandardCharsets;
import java.util.Arrays;
import java.util.Collection;
import java.util.List;
/**
 * Test case for {@link TikaMIMETypeDetector} class.
 *
 * <p>The tests are organised in four groups: direct checks of the N3 / NQuads
 * line recognisers, detection from stream content only, detection from a
 * MIME type hint only, and detection from stream content plus resource name.
 *
 * @author juergen
 * @author Michele Mostarda (michele.mostarda@gmail.com)
 */
public class TikaMIMETypeDetectorTest {

    /* Expected MIME type strings used by the assertions below. */
    private static final String PLAIN = "text/plain";
    private static final String HTML = "text/html";
    private static final String XML = "application/xml";
    private static final String TRIX = RDFFormat.TRIX.getDefaultMIMEType();
    private static final String XHTML = "application/xhtml+xml";
    private static final String RDFXML = RDFFormat.RDFXML.getDefaultMIMEType();
    private static final String TURTLE = RDFFormat.TURTLE.getDefaultMIMEType();
    private static final String N3 = RDFFormat.N3.getDefaultMIMEType();
    private static final String NQUADS = RDFFormat.NQUADS.getDefaultMIMEType();
    private static final String CSV = "text/csv";
    private static final String RSS = "application/rss+xml";
    private static final String ATOM = "application/atom+xml";
    private static final String YAML = "text/x-yaml";

    /** Detector under test; re-created before every test method. */
    private TikaMIMETypeDetector detector;

    @Before
    public void setUp() throws Exception {
        detector = new TikaMIMETypeDetector(new WhiteSpacesPurifier());
    }

    @After
    public void tearDown() throws Exception {
        detector = null;
    }

    @Test
    public void testN3Detection() throws IOException {
        assertN3Detection("<http://example.org/path> <http://foo.com> <http://example.org/Document/foo#> .");
        assertN3Detection("_:bnode1 <http://foo.com> _:bnode2 .");
        assertN3Detection("<http://www.example.com> <http://purl.org/dc/elements/1.1/title> \"x\" .");
        assertN3Detection("<http://www.example.com> <http://purl.org/dc/elements/1.1/title> \"x\"@it .");
        assertN3Detection("<http://www.example.com> <http://purl.org/dc/elements/1.1/title> \"x\"^^<http://xxx.net> .");
        assertN3Detection("<http://www.example.com> <http://purl.org/dc/elements/1.1/title> \"x\"^^xsd:integer .");

        // Wrong N3 line: the terminating '.' is misplaced.
        assertN3DetectionFail(""
                + "<http://wrong.example.org/path> <http://wrong.foo.com> . <http://wrong.org/Document/foo#>"
        );

        // An NQuads line must not be mistaken for N3.
        assertN3DetectionFail(
                "<http://example.org/path> <http://foo.com> <http://dom.org/Document/foo#> <http://path/to/graph> ."
        );
    }

    @Test
    public void testNQuadsDetection() throws IOException {
        assertNQuadsDetection(
                "<http://www.ex.eu> <http://foo.com> <http://example.org/Document/foo#> <http://path.to.graph> ."
        );
        assertNQuadsDetection(
                "_:bnode1 <http://foo.com> _:bnode2 <http://path.to.graph> ."
        );
        assertNQuadsDetection(
                "<http://www.ex.eu> <http://purl.org/dc/elements/1.1/title> \"x\" <http://path.to.graph> ."
        );
        assertNQuadsDetection(
                "<http://www.ex.eu> <http://purl.org/dc/elements/1.1/title> \"x\"@it <http://path.to.graph> ."
        );
        assertNQuadsDetection(
                "<http://www.ex.eu> <http://dd.cc.org/1.1/p> \"xxx\"^^<http://www.sp.net/a#tt> <http://path.to.graph> ."
        );
        assertNQuadsDetection(
                "<http://www.ex.eu> <http://purlo.org/1.1/title> \"yyy\"^^xsd:datetime <http://path.to.graph> ."
        );

        // Wrong NQuads line: the terminating '.' is misplaced.
        assertNQuadsDetectionFail(
                "<http://www.wrong.com> <http://wrong.com/1.1/tt> \"x\"^^<http://xxx.net/int> . <http://path.to.graph>"
        );

        // An N3 line must not be mistaken for NQuads.
        assertNQuadsDetectionFail(
                "<http://example.org/path> <http://foo.com> <http://example.org/Document/foo#> ."
        );
    }

    /* BEGIN: by content. */

    @Test
    public void testDetectRSS1ByContent() throws Exception {
        detectMIMEtypeByContent(RDFXML, manifestRss1());
    }

    private List<String> manifestRss1() {
        return Arrays.asList("/application/rss1/test1");
    }

    @Test
    public void testDetectRSS2ByContent() throws Exception {
        detectMIMEtypeByContent(RSS, manifestRss2());
    }

    private List<String> manifestRss2() {
        return Arrays.asList("/application/rss2/index.html", "/application/rss2/rss2sample.xml", "/application/rss2/test1");
    }

    @Test
    public void testDetectRDFN3ByContent() throws Exception {
        detectMIMEtypeByContent(N3, manifestN3());
    }

    private List<String> manifestN3() {
        return Arrays.asList("/application/rdfn3/test1", "/application/rdfn3/test2", "/application/rdfn3/test3");
    }

    @Test
    public void testDetectRDFNQuadsByContent() throws Exception {
        detectMIMEtypeByContent(NQUADS, manifestNQuads());
    }

    private List<String> manifestNQuads() {
        return Arrays.asList("/application/nquads/test1.nq", "/application/nquads/test2.nq");
    }

    @Test
    public void testDetectRDFXMLByContent() throws Exception {
        detectMIMEtypeByContent(RDFXML, manifestRdfXml());
    }

    private List<String> manifestRdfXml() {
        return Arrays.asList("/application/rdfxml/error.rdf", "/application/rdfxml/foaf", "/application/rdfxml/physics.owl", "/application/rdfxml/test1", "/application/rdfxml/test2", "/application/rdfxml/test3");
    }

    @Test
    public void testDetectTriXByContent() throws Exception {
        detectMIMEtypeByContent(TRIX, manifestTrix());
    }

    private List<String> manifestTrix() {
        return Arrays.asList("/application/trix/test1.trx");
    }

    @Test
    public void testDetectAtomByContent() throws Exception {
        detectMIMEtypeByContent(ATOM, manifestAtom());
    }

    private List<String> manifestAtom() {
        return Arrays.asList("/application/atom/atom.xml");
    }

    @Test
    public void testDetectHTMLByContent() throws Exception {
        detectMIMEtypeByContent(HTML, manifestHtml());
    }

    private List<String> manifestHtml() {
        return Arrays.asList("/text/html/test1");
    }

    @Test
    public void testDetectRDFaByContent() throws Exception {
        detectMIMEtypeByContent(XHTML, manifestRdfa());
    }

    private List<String> manifestRdfa() {
        return Arrays.asList("/application/rdfa/false.test", "/application/rdfa/london-gazette.html", "/application/rdfa/mic.xhtml", "/application/rdfa/test1.html");
    }

    @Test
    public void testDetectXHTMLByContent() throws Exception {
        detectMIMEtypeByContent(XHTML, manifestXHtml());
    }

    private List<String> manifestXHtml() {
        return Arrays.asList("/application/xhtml/blank-file-header.xhtml", "/application/xhtml/index.html", "/application/xhtml/test1");
    }

    @Test
    public void testDetectWSDLByContent() throws Exception {
        detectMIMEtypeByContent("application/x-wsdl", manifestWsdl());
    }

    private List<String> manifestWsdl() {
        return Arrays.asList("/application/wsdl/error.wsdl", "/application/wsdl/test1");
    }

    @Test
    public void testDetectZIPByContent() throws Exception {
        detectMIMEtypeByContent("application/zip", manifestZip());
    }

    private List<String> manifestZip() {
        return Arrays.asList("/application/zip/4_entries.zip", "/application/zip/test1.zip", "/application/zip/test2");
    }

    @Test
    public void testDetectCSVByContent() throws Exception {
        detectMIMEtypeByContent(CSV, manifestCsv());
    }

    private List<String> manifestCsv() {
        return Arrays.asList("/org/apache/any23/extractor/csv/test-comma.csv", "/org/apache/any23/extractor/csv/test-semicolon.csv", "/org/apache/any23/extractor/csv/test-tab.csv", "/org/apache/any23/extractor/csv/test-type.csv");
    }

    /* END: by content. */

    /* BEGIN: by content metadata. */

    @Test
    public void testDetectContentPlainByMeta() throws IOException {
        detectMIMETypeByMimeTypeHint(PLAIN, "text/plain");
    }

    @Test
    public void testDetectTextRDFByMeta() throws IOException {
        detectMIMETypeByMimeTypeHint(RDFXML, "text/rdf");
    }

    @Test
    public void testDetectTextN3ByMeta() throws IOException {
        detectMIMETypeByMimeTypeHint(N3, "text/rdf+n3");
    }

    @Test
    public void testDetectTextNQuadsByMeta() throws IOException {
        detectMIMETypeByMimeTypeHint(NQUADS, "application/n-quads");
    }

    @Test
    public void testDetectTextTurtleByMeta() throws IOException {
        detectMIMETypeByMimeTypeHint(TURTLE, "text/turtle");
    }

    @Test
    public void testDetectRDFXMLByMeta() throws IOException {
        detectMIMETypeByMimeTypeHint(RDFXML, "application/rdf+xml");
    }

    @Test
    public void testDetectXMLByMeta() throws IOException {
        detectMIMETypeByMimeTypeHint(XML, "application/xml");
    }

    @Test
    public void testDetectTriXByMeta() throws IOException {
        detectMIMETypeByMimeTypeHint(TRIX, "application/trix");
    }

    // NOTE(review): despite its name this test checks the "text/plain" hint and is
    // identical to testDetectContentPlainByMeta / testDetectTextPlainByMeta -
    // confirm whether an N3-specific hint was intended.
    @Test
    public void testDetectExtensionN3ByMeta() throws IOException {
        detectMIMETypeByMimeTypeHint(PLAIN, "text/plain");
    }

    @Test
    public void testDetectXHTMLByMeta() throws IOException {
        detectMIMETypeByMimeTypeHint(XHTML, "application/xhtml+xml");
    }

    @Test
    public void testDetectTextHTMLByMeta() throws IOException {
        detectMIMETypeByMimeTypeHint(HTML, "text/html");
    }

    @Test
    public void testDetectTextPlainByMeta() throws IOException {
        detectMIMETypeByMimeTypeHint(PLAIN, "text/plain");
    }

    @Test
    public void testDetectApplicationXMLByMeta() throws IOException {
        detectMIMETypeByMimeTypeHint(XML, "application/xml");
    }

    @Test
    public void testDetectApplicationCSVByMeta() throws IOException {
        detectMIMETypeByMimeTypeHint(CSV, "text/csv");
    }

    @Test
    public void testDetectApplicationYAMLByMeta() throws IOException {
        detectMIMETypeByMimeTypeHint(YAML, "text/x-yaml");
    }

    /* END: by content metadata. */

    /* BEGIN: by content and name. */

    @Test
    public void testRDFXMLByContentAndName() throws Exception {
        detectMIMETypeByContentAndName(RDFXML, manifestRdfXml());
    }

    @Test
    public void testTriXByContentAndName() throws Exception {
        detectMIMETypeByContentAndName(TRIX, manifestTrix());
    }

    @Test
    public void testRSS1ByContentAndName() throws Exception {
        detectMIMETypeByContentAndName(RDFXML, manifestRss1());
    }

    @Test
    public void testRSS2ByContentAndName() throws Exception {
        detectMIMETypeByContentAndName(RSS, manifestRss2());
    }

    @Test
    public void testDetectRDFN3ByContentAndName() throws Exception {
        detectMIMETypeByContentAndName(N3, manifestN3());
    }

    @Test
    public void testDetectRDFNQuadsByContentAndName() throws Exception {
        detectMIMETypeByContentAndName(NQUADS, manifestNQuads());
    }

    @Test
    public void testAtomByContentAndName() throws Exception {
        detectMIMETypeByContentAndName(ATOM, manifestAtom());
    }

    @Test
    public void testHTMLByContentAndName() throws Exception {
        detectMIMETypeByContentAndName(HTML, manifestHtml());
    }

    @Test
    public void testXHTMLByContentAndName() throws Exception {
        detectMIMETypeByContentAndName(XHTML, manifestXHtml());
    }

    @Test
    public void testWSDLByContentAndName() throws Exception {
        detectMIMETypeByContentAndName("application/x-wsdl", manifestWsdl());
    }

    @Test
    public void testZipByContentAndName() throws Exception {
        detectMIMETypeByContentAndName("application/zip", manifestZip());
    }

    @Test
    public void testRDFaByContentAndName() throws Exception {
        detectMIMETypeByContentAndName(XHTML, manifestRdfa());
    }

    @Test
    public void testCSVByContentAndName() throws Exception {
        detectMIMETypeByContentAndName(CSV, manifestCsv());
    }

    /**
     * YAML is only tested with content plus name: a content-only test would
     * fail because the standard does not require a "%YAML" header.
     *
     * @throws Exception if there is an error detecting the mime type from the content and name
     */
    @Test
    public void testYAMLByContentAndName() throws Exception {
        detectMIMETypeByContentAndName(YAML, manifestYAML());
    }

    private List<String> manifestYAML() {
        return Arrays.asList("/org/apache/any23/extractor/yaml/simple-load.yml",
                "/org/apache/any23/extractor/yaml/simple-load_no_head.yml",
                "/org/apache/any23/extractor/yaml/simple-load_yaml.yaml"
        );
    }

    /* END: by content and name. */

    /**
     * Asserts that {@link TikaMIMETypeDetector#checkN3Format} accepts the given text.
     *
     * @param n3Exp the text to check, encoded as UTF-8.
     * @throws IOException if the check fails while reading the stream.
     */
    private void assertN3Detection(String n3Exp) throws IOException {
        ByteArrayInputStream bais = new ByteArrayInputStream(n3Exp.getBytes(StandardCharsets.UTF_8));
        Assert.assertTrue(TikaMIMETypeDetector.checkN3Format(bais));
    }

    /**
     * Asserts that {@link TikaMIMETypeDetector#checkN3Format} rejects the given text.
     *
     * @param n3Exp the text to check, encoded as UTF-8.
     * @throws IOException if the check fails while reading the stream.
     */
    private void assertN3DetectionFail(String n3Exp) throws IOException {
        ByteArrayInputStream bais = new ByteArrayInputStream(n3Exp.getBytes(StandardCharsets.UTF_8));
        Assert.assertFalse(TikaMIMETypeDetector.checkN3Format(bais));
    }

    /**
     * Asserts that {@link TikaMIMETypeDetector#checkNQuadsFormat} accepts the given text.
     *
     * @param n4Exp the text to check, encoded as UTF-8.
     * @throws IOException if the check fails while reading the stream.
     */
    private void assertNQuadsDetection(String n4Exp) throws IOException {
        ByteArrayInputStream bais = new ByteArrayInputStream(n4Exp.getBytes(StandardCharsets.UTF_8));
        Assert.assertTrue(TikaMIMETypeDetector.checkNQuadsFormat(bais));
    }

    /**
     * Asserts that {@link TikaMIMETypeDetector#checkNQuadsFormat} rejects the given text.
     *
     * @param n4Exp the text to check, encoded as UTF-8.
     * @throws IOException if the check fails while reading the stream.
     */
    private void assertNQuadsDetectionFail(String n4Exp) throws IOException {
        ByteArrayInputStream bais = new ByteArrayInputStream(n4Exp.getBytes(StandardCharsets.UTF_8));
        Assert.assertFalse(TikaMIMETypeDetector.checkNQuadsFormat(bais));
    }

    /**
     * Opens a classpath test resource as a buffered stream, failing the test
     * with an explicit message when the resource cannot be found.
     *
     * @param resourcePath classpath path of the resource, resolved against this class.
     * @return a buffered stream over the resource; the caller must close it.
     */
    private InputStream openTestResource(String resourcePath) {
        InputStream raw = this.getClass().getResourceAsStream(resourcePath);
        // getResourceAsStream returns null for a missing resource; fail here with the
        // path instead of failing later with an unrelated stream error.
        Assert.assertNotNull("Missing test resource " + resourcePath, raw);
        return new BufferedInputStream(raw);
    }

    /**
     * Checks the detection of a specific MIME type when only the stream
     * content is handed to the detector (name and MIME hint are {@code null}).
     *
     * <p>Resources whose path contains "error" take the negative branch
     * instead of the equality check.
     *
     * @param expectedMimeType the expected mime type.
     * @param manifest classpath paths of the resources to check.
     * @throws IOException if a resource cannot be read or closed.
     */
    private void detectMIMEtypeByContent(String expectedMimeType, Collection<String> manifest)
            throws IOException {
        for (String test : manifest) {
            // try-with-resources: the stream is closed even when an assertion fails.
            try (InputStream is = openTestResource(test)) {
                String detectedMimeType = detector.guessMIMEType(
                        null,
                        is,
                        null
                ).toString();
                if (test.contains("error")) {
                    // NOTE(review): assertNotSame compares object references, not string
                    // contents, so it cannot detect an equal-valued result. Consider
                    // assertNotEquals once the expected outcome for the "error"
                    // fixtures has been confirmed.
                    Assert.assertNotSame(expectedMimeType, detectedMimeType);
                } else {
                    Assert.assertEquals(
                            String.format(java.util.Locale.ROOT,
                                    "Error in mimetype detection for file %s", test),
                            expectedMimeType,
                            detectedMimeType
                    );
                }
            }
        }
    }

    /**
     * Verifies the detection of a specific MIME type when only a parsed MIME
     * type hint is handed to the detector (name and stream are {@code null}).
     *
     * @param expectedMimeType the expected mime type.
     * @param contentTypeHeader the MIME type string parsed and passed as the hint.
     * @throws IOException if the detector fails.
     */
    private void detectMIMETypeByMimeTypeHint(String expectedMimeType, String contentTypeHeader)
            throws IOException {
        String detectedMimeType = detector.guessMIMEType(
                null,
                null,
                MIMEType.parse(contentTypeHeader)
        ).toString();
        Assert.assertEquals(expectedMimeType, detectedMimeType);
    }

    /**
     * Verifies the detection of a specific MIME type when both the resource
     * path (as the name) and the stream content are handed to the detector.
     *
     * <p>Resources whose path contains "error" take the negative branch
     * instead of the equality check.
     *
     * @param expectedMimeType the expected mime type.
     * @param manifest classpath paths of the resources to check.
     * @throws IOException if a resource cannot be read or closed.
     */
    private void detectMIMETypeByContentAndName(String expectedMimeType, Collection<String> manifest) throws IOException {
        for (String test : manifest) {
            // try-with-resources: the stream is closed even when an assertion fails.
            try (InputStream is = openTestResource(test)) {
                String detectedMimeType = detector.guessMIMEType(test, is, null).toString();
                if (test.contains("error")) {
                    // NOTE(review): assertNotSame compares object references, not string
                    // contents, so it cannot detect an equal-valued result. Consider
                    // assertNotEquals once the expected outcome for the "error"
                    // fixtures has been confirmed.
                    Assert.assertNotSame(expectedMimeType, detectedMimeType);
                } else {
                    Assert.assertEquals(
                            String.format(java.util.Locale.ROOT,
                                    "Error while detecting mimetype in file %s", test),
                            expectedMimeType,
                            detectedMimeType
                    );
                }
            }
        }
    }
}