// Source: enhancement-engines/tika/src/test/java/org/apache/stanbol/enhancer/engines/tika/TikaEngineTest.java (stanbol, Git at Google)

 /*
  * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 package org.apache.stanbol.enhancer.engines.tika;

 import static java.util.Collections.singleton;
 import static org.apache.commons.io.IOUtils.closeQuietly;
 import static org.apache.stanbol.enhancer.engines.tika.TikaEngine.XHTML;
 import static org.apache.stanbol.enhancer.servicesapi.EnhancementEngine.CANNOT_ENHANCE;
 import static org.apache.tika.mime.MediaType.OCTET_STREAM;
 import static org.junit.Assert.assertEquals;
 import static org.junit.Assert.assertFalse;
 import static org.junit.Assert.assertNotNull;
 import static org.junit.Assert.assertNull;
 import static org.junit.Assert.assertTrue;
 import static org.junit.Assert.fail;

 import java.io.IOException;
 import java.io.InputStream;
 import java.io.InputStreamReader;
 import java.io.Reader;
 import java.nio.charset.Charset;
 import java.text.DateFormatSymbols;
 import java.text.ParseException;
 import java.text.SimpleDateFormat;
 import java.util.Arrays;
 import java.util.Date;
 import java.util.HashSet;
 import java.util.Iterator;
 import java.util.Locale;
 import java.util.Map.Entry;
 import java.util.Set;
 import java.util.regex.Pattern;

 import org.apache.clerezza.rdf.core.Literal;
 import org.apache.clerezza.rdf.core.LiteralFactory;
 import org.apache.clerezza.rdf.core.MGraph;
 import org.apache.clerezza.rdf.core.NonLiteral;
 import org.apache.clerezza.rdf.core.PlainLiteral;
 import org.apache.clerezza.rdf.core.Resource;
 import org.apache.clerezza.rdf.core.Triple;
 import org.apache.clerezza.rdf.core.TypedLiteral;
 import org.apache.clerezza.rdf.core.UriRef;
 import org.apache.clerezza.rdf.ontologies.RDF;
 import org.apache.clerezza.rdf.ontologies.XSD;
 import org.apache.commons.io.IOUtils;
 import org.apache.commons.io.LineIterator;
 import org.apache.stanbol.enhancer.contentitem.inmemory.InMemoryContentItemFactory;
 import org.apache.stanbol.enhancer.servicesapi.Blob;
 import org.apache.stanbol.enhancer.servicesapi.ContentItem;
 import org.apache.stanbol.enhancer.servicesapi.ContentItemFactory;
 import org.apache.stanbol.enhancer.servicesapi.EngineException;
 import org.apache.stanbol.enhancer.servicesapi.helper.ContentItemHelper;
 import org.apache.stanbol.enhancer.servicesapi.impl.StreamSource;
 import org.apache.stanbol.enhancer.servicesapi.impl.StringSource;
 import org.apache.stanbol.enhancer.servicesapi.rdf.NamespaceEnum;
 import org.junit.After;
 import org.junit.AfterClass;
 import org.junit.Assert;
 import org.junit.Before;
 import org.junit.BeforeClass;
 import org.junit.Test;
 import org.osgi.service.cm.ConfigurationException;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;

 /**
  * Tests for the {@link TikaEngine}.
  * <p>
  * Each test loads a sample document from the test classpath, hands it to a
  * single engine instance shared by all tests and then validates the
  * "text/plain" and "application/xhtml+xml" content parts and/or the RDF
  * metadata the engine added to the {@link ContentItem}.
  */
 public class TikaEngineTest {

     private static final Logger log = LoggerFactory.getLogger(TikaEngineTest.class);
     private static final ContentItemFactory ciFactory = InMemoryContentItemFactory.getInstance();
     /** Media type of the plain text content part expected from the engine */
     private static final String PLAIN_TEXT_TYPE = "text/plain";
     /** Media type of the XHTML content part expected from the engine */
     private static final String XHTML_TYPE = "application/xhtml+xml";
     /** Engine under test; lazily created by {@link #bindServices()} and shared by all tests */
     private static TikaEngine engine;
     private static MockComponentContext context;
     private static final LiteralFactory lf = LiteralFactory.getInstance();
     /**
      * Required to make this test independent of the timeZone of the local host.
      * No time zone is set on this format, so expected date values without a
      * trailing 'Z' are parsed in the default time zone of the JVM running the
      * test (see {@link #verifyValue(ContentItem, NonLiteral, UriRef, UriRef, String)}).
      */
     private static SimpleDateFormat dateDefaultTimezone =
             new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss", new DateFormatSymbols(Locale.US));

     /**
      * Creates the mocked component context holding the engine configuration
      * (engine name and activation of unmapped properties).
      */
     @BeforeClass
     public static void setUpServices() throws IOException {
         context = new MockComponentContext();
         context.properties.put(TikaEngine.PROPERTY_NAME, "tika");
         //to test unmapped properties
         context.properties.put(TikaEngine.UNMAPPED_PROPERTIES, "true");
     }

     /**
      * Creates and activates the shared engine on first use. Later calls are
      * no-ops so that all tests run against the same instance.
      */
     @Before
     public void bindServices() throws ConfigurationException {
         if(engine == null){
             engine = new TikaEngine(ciFactory);
             engine.activate(context);
         }
     }

     @Test
     public void testHtml() throws EngineException, IOException {
         log.info(">>> testHtml <<<");
         ContentItem ci = createContentItem("test.html", "text/html; charset=UTF-8");
         assertFalse(engine.canEnhance(ci) == CANNOT_ENHANCE);
         engine.computeEnhancements(ci);
         Blob plainTextBlob = getRequiredBlob(ci, PLAIN_TEXT_TYPE);
         assertContentRegexp(plainTextBlob,
             "The Apache Stanbol Enhancer",
             "The Stanbol enhancer can detect famous cities such as Paris and people such as Bob Marley.");
         //validate XHTML results
         Blob xhtmlBlob = getRequiredBlob(ci, XHTML_TYPE);
         assertContentRegexp(xhtmlBlob,
             "<html xmlns=\"http://www.w3.org/1999/xhtml\">",
             "<head>",
             "<meta name=",
             "<title>The Apache Stanbol Enhancer</title>",
             "The Apache Stanbol Enhancer",
             "The Stanbol enhancer can detect famous cities",
             "</body></html>");
     }

     @Test
     public void testPdf() throws EngineException, IOException {
         log.info(">>> testPdf <<<");
         //PDF created by Apple Pages
         ContentItem ci = createContentItem("test.pdf", "application/pdf");
         assertFalse(engine.canEnhance(ci) == CANNOT_ENHANCE);
         engine.computeEnhancements(ci);
         Blob plainTextBlob = getRequiredBlob(ci, PLAIN_TEXT_TYPE);
         assertContentRegexp(plainTextBlob,
             "The Apache Stanbol Enhancer",
             "The Stanbol enhancer can detect famous cities ");
         //validate XHTML results
         Blob xhtmlBlob = getRequiredBlob(ci, XHTML_TYPE);
         assertContentRegexp(xhtmlBlob,
             "<html xmlns=\"http://www.w3.org/1999/xhtml\">",
             "<head>",
             "<meta name=",
             "<div class=\"page\">",
             "The Apache Stanbol Enhancer",
             "The Stanbol enhancer can detect famous cities",
             "</body></html>");

         //PDF created by OpenOffice
         ci = createContentItem("test2.pdf", "application/pdf");
         assertFalse(engine.canEnhance(ci) == CANNOT_ENHANCE);
         engine.computeEnhancements(ci);
         //validate plain text results
         plainTextBlob = getRequiredBlob(ci, PLAIN_TEXT_TYPE);
         assertContentRegexp(plainTextBlob,
             "The Apache Stanbol Enhancer",
             "The Stanbol enhancer can detect famous cities");
         //validate XHTML results
         xhtmlBlob = getRequiredBlob(ci, XHTML_TYPE);
         assertContentRegexp(xhtmlBlob,
             "<html xmlns=\"http://www.w3.org/1999/xhtml\">",
             "<head>",
             "<meta name=",
             "<div class=\"page\">",
             "The Apache Stanbol Enhancer",
             "The Stanbol enhancer can detect famous cities",
             "</body></html>");
     }

     @Test
     public void testMsWord() throws EngineException, IOException {
         log.info(">>> testMsWord <<<");
         ContentItem ci = createContentItem("test.doc", "application/msword");
         assertFalse(engine.canEnhance(ci) == CANNOT_ENHANCE);
         engine.computeEnhancements(ci);
         Blob plainTextBlob = getRequiredBlob(ci, PLAIN_TEXT_TYPE);
         assertContentRegexp(plainTextBlob,
             "The Apache Stanbol Enhancer",
             "The Stanbol enhancer can detect famous cities such as Paris and people such as Bob Marley.");
         //validate XHTML results
         Blob xhtmlBlob = getRequiredBlob(ci, XHTML_TYPE);
         assertContentRegexp(xhtmlBlob,
             "<html xmlns=\"http://www.w3.org/1999/xhtml\">",
             "<head>",
             "<meta name=",
             "<title>",
             "The Apache Stanbol Enhancer",
             "The Stanbol enhancer can detect famous cities",
             "</body></html>");
     }

     @Test
     public void testRtf() throws EngineException, IOException {
         log.info(">>> testRtf <<<");
         ContentItem ci = createContentItem("test.rtf", "application/rtf");
         assertFalse(engine.canEnhance(ci) == CANNOT_ENHANCE);
         engine.computeEnhancements(ci);
         Blob plainTextBlob = getRequiredBlob(ci, PLAIN_TEXT_TYPE);
         assertContentRegexp(plainTextBlob,
             "The Apache Stanbol Enhancer",
             "The Stanbol enhancer can detect famous cities such as Paris and people such as Bob Marley.");
         //validate XHTML results
         Blob xhtmlBlob = getRequiredBlob(ci, XHTML_TYPE);
         assertContentRegexp(xhtmlBlob,
             "<html xmlns=\"http://www.w3.org/1999/xhtml\">",
             "<head>",
             "<meta name=",
             "<title>",
             "The Apache Stanbol Enhancer",
             "The Stanbol enhancer can detect famous cities",
             "</body></html>");
     }

     @Test
     public void testOdt() throws EngineException, IOException {
         log.info(">>> testOdt <<<");
         ContentItem ci = createContentItem("test.odt", "application/vnd.oasis.opendocument.text");
         assertFalse(engine.canEnhance(ci) == CANNOT_ENHANCE);
         engine.computeEnhancements(ci);
         Blob plainTextBlob = getRequiredBlob(ci, PLAIN_TEXT_TYPE);
         assertContentRegexp(plainTextBlob,
             "The Apache Stanbol Enhancer",
             "The Stanbol enhancer can detect famous cities such as Paris and people such as Bob Marley.");
         //validate XHTML results
         Blob xhtmlBlob = getRequiredBlob(ci, XHTML_TYPE);
         assertContentRegexp(xhtmlBlob,
             "<html xmlns=\"http://www.w3.org/1999/xhtml\">",
             "<head>",
             "<meta name=",
             "<title>",
             "The Apache Stanbol Enhancer",
             "The Stanbol enhancer can detect famous cities",
             "</body></html>");
     }

     @Test
     public void testEMail() throws EngineException, IOException, ParseException {
         log.info(">>> testEMail <<<");
         ContentItem ci = createContentItem("test.email.txt", "message/rfc822");
         assertFalse(engine.canEnhance(ci) == CANNOT_ENHANCE);
         engine.computeEnhancements(ci);
         Blob plainTextBlob = getRequiredBlob(ci, PLAIN_TEXT_TYPE);
         assertContentRegexp(plainTextBlob,
             "Julien Nioche commented on TIKA-461:",
             "I'll have a look at mime4j and try to use it in Tika",
             "> RFC822 messages not parsed",
             "Key: TIKA-461",
             "URL: https://issues.apache.org/jira/browse/TIKA-461");
         //validate XHTML results
         Blob xhtmlBlob = getRequiredBlob(ci, XHTML_TYPE);
         assertContentRegexp(xhtmlBlob,
             "<html xmlns=\"http://www.w3.org/1999/xhtml\">",
             "<title>\\[jira\\] Commented: \\(TIKA-461\\) RFC822 messages not parsed</title>",
             "<body><p>",
             "Julien Nioche commented on TIKA-461:",
             "I'll have a look at mime4j and try to use it in Tika",
             "&gt; RFC822 messages not parsed",
             "Key: TIKA-461",
             "URL: https://issues.apache.org/jira/browse/TIKA-461");
         //now check the extracted metadata!
         //DC
         //STANBOL-757: dc:date no longer added by Tika 1.2 (dc:created is still present)
         //verifyValue(ci, new UriRef(NamespaceEnum.dc+"date"), XSD.dateTime,"2010-09-06T09:25:34Z");
         verifyValue(ci, new UriRef(NamespaceEnum.dc+"format"), null,"message/rfc822");
         //STANBOL-757: dc:subject no longer added by Tika1.2 (dc:title is used instead)
         //verifyValue(ci, new UriRef(NamespaceEnum.dc+"subject"), null,"[jira] Commented: (TIKA-461) RFC822 messages not parsed");
         verifyValue(ci, new UriRef(NamespaceEnum.dc+"title"), null,"[jira] Commented: (TIKA-461) RFC822 messages not parsed");
         verifyValue(ci, new UriRef(NamespaceEnum.dc+"creator"), null,"Julien Nioche (JIRA) <jira@apache.org>");
         verifyValue(ci, new UriRef(NamespaceEnum.dc+"created"), XSD.dateTime,"2010-09-06T09:25:34Z");

         //Media Ontology
         verifyValue(ci, new UriRef(NamespaceEnum.media+"creationDate"),XSD.dateTime,"2010-09-06T09:25:34Z");
         verifyValue(ci, new UriRef(NamespaceEnum.media+"hasFormat"),null,"message/rfc822");
         verifyValue(ci, new UriRef(NamespaceEnum.media+"hasCreator"),null,"Julien Nioche (JIRA) <jira@apache.org>");
         verifyValue(ci, new UriRef(NamespaceEnum.media+"hasContributor"),null,"Julien Nioche (JIRA) <jira@apache.org>");
         //STANBOL-757: This was present with Tika 1.1 because its mapping from dc:subject
         //verifyValue(ci, new UriRef(NamespaceEnum.media+"hasKeyword"),null,"[jira] Commented: (TIKA-461) RFC822 messages not parsed");

         //Nepomuk Message
         String message = "http://www.semanticdesktop.org/ontologies/2007/03/22/nmo#";
         verifyValue(ci, new UriRef(message+"from"),null,"Julien Nioche (JIRA) <jira@apache.org>");
         verifyValue(ci, new UriRef(message+"to"),null,"dev@tika.apache.org");
     }

     @Test
     public void testMp3() throws EngineException, IOException, ParseException {
         log.info(">>> testMp3 <<<");
         ContentItem ci = createContentItem("testMP3id3v24.mp3", "audio/mpeg");
         assertFalse(engine.canEnhance(ci) == CANNOT_ENHANCE);
         engine.computeEnhancements(ci);
         Blob plainTextBlob = getRequiredBlob(ci, PLAIN_TEXT_TYPE);
         assertContentRegexp(plainTextBlob,
             "Test Title",
             "Test Artist",
             "Test Album");
         //validate XHTML results (only the presence of the content part is checked)
         getRequiredBlob(ci, XHTML_TYPE);
         //Test AudioTrack metadata
         NonLiteral audioTrack = verifyNonLiteral(ci, new UriRef(NamespaceEnum.media+"hasTrack"));
         //types
         verifyValues(ci, audioTrack, RDF.type,
             new UriRef(NamespaceEnum.media+"MediaFragment"),
             new UriRef(NamespaceEnum.media+"Track"),
             new UriRef(NamespaceEnum.media+"AudioTrack"));
         //properties
         verifyValue(ci, audioTrack, new UriRef(NamespaceEnum.media+"hasFormat"), XSD.string, "Stereo");
         verifyValue(ci, audioTrack, new UriRef(NamespaceEnum.media+"samplingRate"), XSD.int_, "44100");
         verifyValue(ci, audioTrack, new UriRef(NamespaceEnum.media+"hasCompression"), XSD.string, "MP3");
     }

     /**
      * Tests mappings for the Mp4 metadata extraction capabilities added to
      * Tika 1.1 (STANBOL-627)
      * @throws EngineException
      * @throws IOException
      * @throws ParseException
      */
     @Test
     public void testMp4() throws EngineException, IOException, ParseException {
         log.info(">>> testMp4 <<<");
         ContentItem ci = createContentItem("testMP4.m4a", "audio/mp4");
         assertFalse(engine.canEnhance(ci) == CANNOT_ENHANCE);
         engine.computeEnhancements(ci);
         Blob plainTextBlob = getRequiredBlob(ci, PLAIN_TEXT_TYPE);
         assertContentRegexp(plainTextBlob,
             "Test Title",
             "Test Artist",
             "Test Album");
         //validate XHTML results (only the presence of the content part is checked)
         getRequiredBlob(ci, XHTML_TYPE);
         //Test AudioTrack metadata
         NonLiteral audioTrack = verifyNonLiteral(ci, new UriRef(NamespaceEnum.media+"hasTrack"));
         //types
         verifyValues(ci, audioTrack, RDF.type,
             new UriRef(NamespaceEnum.media+"MediaFragment"),
             new UriRef(NamespaceEnum.media+"Track"),
             new UriRef(NamespaceEnum.media+"AudioTrack"));
         //properties
         verifyValue(ci, audioTrack, new UriRef(NamespaceEnum.media+"hasFormat"), XSD.string, "Stereo");
         verifyValue(ci, audioTrack, new UriRef(NamespaceEnum.media+"samplingRate"), XSD.int_, "44100");
         verifyValue(ci, audioTrack, new UriRef(NamespaceEnum.media+"hasCompression"), XSD.string, "M4A");
     }

     @Test
     public void testGEOMetadata() throws EngineException, IOException, ParseException{
         log.info(">>> testGEOMetadata <<<");
         //first validate Media Resource Ontology
         UriRef hasLocation = new UriRef(NamespaceEnum.media+"hasLocation");
         UriRef locationLatitude = new UriRef(NamespaceEnum.media+"locationLatitude");
         UriRef locationLongitude = new UriRef(NamespaceEnum.media+"locationLongitude");
         //UriRef locationAltitude = new UriRef(NamespaceEnum.media+"locationAltitude");
         //the generic octet-stream type is parsed on purpose instead of a specific one
         ContentItem ci = createContentItem("testJPEG_GEO.jpg", OCTET_STREAM.toString());
         assertFalse(engine.canEnhance(ci) == CANNOT_ENHANCE);
         engine.computeEnhancements(ci);
         //asserts that exactly one location exists and that it is a NonLiteral
         NonLiteral location = verifyNonLiteral(ci, hasLocation);
         //lat
         verifyValue(ci, location, locationLatitude, XSD.double_, "12.54321");
         //long
         verifyValue(ci, location, locationLongitude, XSD.double_, "-54.1234");

         //second the GEO ont
         UriRef lat = new UriRef(NamespaceEnum.geo+"lat");
         UriRef lon = new UriRef(NamespaceEnum.geo+"long");
         //lat
         verifyValue(ci, lat, XSD.double_, "12.54321");
         //long
         verifyValue(ci, lon, XSD.double_, "-54.1234");
     }

     /**
      * Validates DC and Media Ontology metadata extracted from an MP3 file.
      * <p>
      * NOTE(review): this method carries no <code>@Test</code> annotation and
      * is therefore not executed by JUnit 4. Confirm whether it was disabled
      * on purpose (and whether the expected values are still valid) before
      * enabling it.
      */
     public void testMetadata() throws EngineException, ParseException, IOException{
         log.info(">>> testMetadata <<<");
         ContentItem ci = createContentItem("testMP3id3v24.mp3", "audio/mpeg");
         assertFalse(engine.canEnhance(ci) == CANNOT_ENHANCE);
         engine.computeEnhancements(ci);
         verifyValue(ci,new UriRef(NamespaceEnum.dc+"creator"),null,"Test Artist");
         verifyValue(ci, new UriRef(NamespaceEnum.dc+"title"),null,"Test Album");
         verifyValue(ci, new UriRef(NamespaceEnum.dc+"format"),null,"audio/mpeg");
         verifyValue(ci, new UriRef(NamespaceEnum.media+"hasFormat"),null,"audio/mpeg");
         verifyValue(ci, new UriRef(NamespaceEnum.media+"mainOriginalTitle"),null,"Test Album");
         verifyValue(ci, new UriRef(NamespaceEnum.media+"hasContributor"),null,"Test Artist");
         verifyValue(ci, new UriRef(NamespaceEnum.media+"releaseDate"),XSD.string,"2008");
         verifyValue(ci, new UriRef(NamespaceEnum.media+"hasGenre"),null,"Rock");
         verifyValue(ci, new UriRef(NamespaceEnum.media+"hasCreator"),null,"Test Artist");
     }

     @Test
     public void testExifMetadata() throws EngineException, ParseException, IOException {
         log.info(">>> testExifMetadata <<<");
         String exif = "http://www.semanticdesktop.org/ontologies/2007/05/10/nexif#";
         ContentItem ci = createContentItem("testJPEG_EXIF.jpg", "image/jpeg");
         assertFalse(engine.canEnhance(ci) == CANNOT_ENHANCE);
         engine.computeEnhancements(ci);
         verifyValue(ci, new UriRef(exif+"make"),null,"Canon");
         verifyValue(ci, new UriRef(exif+"software"),null,"Adobe Photoshop CS3 Macintosh");
         verifyValue(ci, new UriRef(exif+"dateTimeOriginal"),XSD.dateTime,"2009-08-11T09:09:45");
         verifyValue(ci, new UriRef(exif+"relatedImageWidth"),XSD.int_,"100");
         verifyValue(ci, new UriRef(exif+"fNumber"),XSD.double_,"5.6");
         verifyValue(ci, new UriRef(exif+"model"),null,"Canon EOS 40D");
         verifyValue(ci, new UriRef(exif+"isoSpeedRatings"),XSD.int_,"400");
         verifyValue(ci, new UriRef(exif+"xResolution"),XSD.double_,"240.0");
         verifyValue(ci, new UriRef(exif+"flash"),XSD.boolean_,"false");
         verifyValue(ci, new UriRef(exif+"exposureTime"),XSD.double_,"6.25E-4");
         verifyValue(ci, new UriRef(exif+"yResolution"),XSD.double_,"240.0");
         verifyValue(ci, new UriRef(exif+"resolutionUnit"),XSD.string,"Inch");
         verifyValue(ci, new UriRef(exif+"focalLength"),XSD.double_,"194.0");
         verifyValue(ci, new UriRef(exif+"relatedImageLength"),XSD.int_,"68");
         verifyValue(ci, new UriRef(exif+"bitsPerSample"),XSD.int_,"8");
         //also Media Ontology mappings for Exif
         verifyValue(ci, new UriRef(NamespaceEnum.media+"frameHeight"),XSD.int_,"68");
         verifyValue(ci, new UriRef(NamespaceEnum.media+"frameWidth"),XSD.int_,"100");
         verifyValue(ci, new UriRef(NamespaceEnum.media+"hasFormat"),null,"image/jpeg");
         verifyValue(ci, new UriRef(NamespaceEnum.media+"creationDate"),XSD.dateTime,"2009-08-11T09:09:45");
         verifyValues(ci, new UriRef(NamespaceEnum.media+"hasKeyword"),null,"serbor","moscow-birds","canon-55-250");
         //and finally the mapped DC properties
         verifyValue(ci, new UriRef(NamespaceEnum.dc+"format"),null,"image/jpeg");
         verifyValue(ci, new UriRef(NamespaceEnum.dc+"created"),XSD.dateTime,"2009-08-11T09:09:45");
         verifyValue(ci, new UriRef(NamespaceEnum.dc+"modified"),XSD.dateTime,"2009-10-02T23:02:49");
         verifyValues(ci, new UriRef(NamespaceEnum.dc+"subject"), null, "serbor","moscow-birds","canon-55-250");
     }

     /**
      * Tests unmapped properties as added by <a href="https://issues.apache.org/jira/browse/STANBOL-947">
      * STANBOL-947</a>
      * @throws EngineException
      * @throws IOException
      * @throws ParseException
      */
     @Test
     public void testUnmappedProperties() throws EngineException, IOException, ParseException {
         log.info(">>> testUnmappedProperties <<<");
         //reuses the MP4 audio file also used by testMp4
         ContentItem ci = createContentItem("testMP4.m4a", "audio/mp4");
         assertFalse(engine.canEnhance(ci) == CANNOT_ENHANCE);
         engine.computeEnhancements(ci);
         //test that the "xmpDM:logComment" is present
         verifyValue(ci, new UriRef("urn:tika.apache.org:tika:xmpDM:logComment"), null,"Test Comments");
     }

     /**
      * Tests that a PDF parsed with the generic octet-stream media type is
      * still converted to plain text and XHTML.
      */
     @Test
     public void testContentTypeDetection() throws EngineException, IOException {
         log.info(">>> testContentTypeDetection <<<");
         ContentItem ci = createContentItem("test.pdf", OCTET_STREAM.toString());
         assertFalse(engine.canEnhance(ci) == CANNOT_ENHANCE);
         engine.computeEnhancements(ci);
         Blob plainTextBlob = getRequiredBlob(ci, PLAIN_TEXT_TYPE);
         assertContentRegexp(plainTextBlob,
             "The Apache Stanbol Enhancer",
             "The Stanbol enhancer can detect famous cities");
         //validate XHTML results
         Blob xhtmlBlob = getRequiredBlob(ci, XHTML_TYPE);
         assertContentRegexp(xhtmlBlob,
             "<html xmlns=\"http://www.w3.org/1999/xhtml\">",
             "<head>",
             "<meta name=",
             "<div class=\"page\">",
             "The Apache Stanbol Enhancer",
             "The Stanbol enhancer can detect famous cities",
             "</body></html>");
     }

     /**
      * Tests that text is not processed
      */
     @Test
     public void testText() throws EngineException, IOException {
         log.info(">>> testText <<<");
         String text = "The Stanbol enhancer can detect famous cities such as " +
                 "Paris and people such as Bob Marley.";
         ContentItem ci = ciFactory.createContentItem(new StringSource(text));
         //the engine needs to be called, otherwise this test can never fail
         engine.computeEnhancements(ci);
         //only the original content
         Assert.assertEquals(1, ContentItemHelper.getContentParts(ci, Blob.class).size());
     }

     @Test
     public void testUnsupported() throws EngineException, IOException {
         log.info(">>> testUnsupported <<<");
         ContentItem ci = createContentItem("test.pages", "application/x-iwork-pages-sffpages");
         assertFalse(engine.canEnhance(ci) == CANNOT_ENHANCE);
         engine.computeEnhancements(ci);
         Entry<UriRef,Blob> contentPart = ContentItemHelper.getBlob(ci,
             singleton(PLAIN_TEXT_TYPE));
         //it MUST NOT give an error but also not add a content part
         assertNull(contentPart);
         //only the original content
         assertEquals(1, ContentItemHelper.getContentParts(ci, Blob.class).size());
     }

     @Test
     public void testXhtml() throws EngineException, IOException {
         log.info(">>> testXhtml <<<");
         ContentItem ci = createContentItem("test.xhtml", XHTML.toString()+"; charset=UTF-8");
         assertFalse(engine.canEnhance(ci) == CANNOT_ENHANCE);
         engine.computeEnhancements(ci);
         Blob plainTextBlob = getRequiredBlob(ci, PLAIN_TEXT_TYPE);
         assertContentRegexp(plainTextBlob,
             "The Apache Stanbol Enhancer",
             "The Stanbol enhancer can detect famous cities");
         //only the original and the plain text
         // this asserts that no xhtml is parsed from the parsed xhtml content
         assertEquals(2, ContentItemHelper.getContentParts(ci, Blob.class).size());
     }

     /**
      * Creates a {@link ContentItem} for a resource loaded via the class loader
      * of this test.
      * @param resourceName the name of the classpath resource
      * @param contentType the media type parsed to the {@link StreamSource}
      * @return the created content item
      * @throws IOException on any error while creating the content item
      */
     private ContentItem createContentItem(String resourceName, String contentType) throws IOException {
         InputStream in = TikaEngineTest.class.getClassLoader().getResourceAsStream(resourceName);
         assertNotNull("Test resource '" + resourceName + "' not found on the classpath", in);
         return ciFactory.createContentItem(new StreamSource(in,contentType));
     }

     /**
      * Looks up the content part with the parsed media type and asserts that
      * both the content part and its {@link Blob} are present.
      * @param ci the content item
      * @param mimeType the media type of the requested content part
      * @return the blob (never <code>null</code>)
      */
     private static Blob getRequiredBlob(ContentItem ci, String mimeType) {
         Entry<UriRef,Blob> contentPart = ContentItemHelper.getBlob(ci, singleton(mimeType));
         assertNotNull(contentPart);
         Blob blob = contentPart.getValue();
         assertNotNull(blob);
         return blob;
     }

     /**
      * Tests if each of the parsed regex patterns is contained in at least one
      * line of the parsed text. The content of the blob is read anew for every
      * pattern using the "charset" parameter of the blob (or the platform
      * default if the parameter is missing).
      * @param blob the blob providing the text
      * @param regexp the patterns; each is wrapped with a leading and trailing
      * <code>.*</code> and matched against whole lines
      * @throws IOException on any error while reading the content for the
      * failure message
      */
     public void assertContentRegexp(Blob blob, String... regexp) throws IOException {
         Charset charset;
         if(blob.getParameter().containsKey("charset")){
             charset = Charset.forName(blob.getParameter().get("charset"));
         } else {
             charset = Charset.defaultCharset();
         }
         for (String expr : regexp) {
             final Pattern p = Pattern.compile(".*" + expr + ".*");
             if (!hasMatchingLine(blob, charset, p)) {
                 fail(this + ": no match for regexp '" + expr + "', content=\n" +
                         readContent(blob, charset));
             }
         }
     }

     /**
      * Checks if any line of the blob fully matches the parsed pattern. The
      * reader is closed in any case before this method returns.
      */
     private static boolean hasMatchingLine(Blob blob, Charset charset, Pattern pattern) {
         Reader reader = new InputStreamReader(blob.getStream(), charset);
         try {
             final LineIterator it = new LineIterator(reader);
             while (it.hasNext()) {
                 if (pattern.matcher(it.nextLine()).matches()) {
                     return true;
                 }
             }
             return false;
         } finally {
             closeQuietly(reader);
         }
     }

     /**
      * Reads the whole content of the blob as String (used for failure
      * messages) and closes the stream afterwards.
      */
     private static String readContent(Blob blob, Charset charset) throws IOException {
         InputStream in = blob.getStream();
         try {
             return IOUtils.toString(in, charset.name());
         } finally {
             closeQuietly(in);
         }
     }

     @After
     public void unbindServices() {/*nothing to do */}

     /**
      * Deactivates the shared engine. Guards against the engine never having
      * been created (e.g. if no test was executed).
      */
     @AfterClass
     public static void shutdownServices() {
         if(engine != null){
             engine.deactivate(context);
             engine = null;
         }
     }

     /*
      * Internal helper methods
      */

     /**
      * Same as {@link #verifyNonLiteral(ContentItem, UriRef, UriRef)} using the
      * URI of the content item as subject.
      */
     private NonLiteral verifyNonLiteral(ContentItem ci, UriRef property){
         return verifyNonLiteral(ci, ci.getUri(), property);
     }

     /**
      * Asserts that the metadata contain exactly one value for the subject and
      * property and that this value is a {@link NonLiteral}.
      * @return the value
      */
     private static NonLiteral verifyNonLiteral(ContentItem ci, UriRef subject, UriRef property){
         Iterator<Triple> it = ci.getMetadata().filter(subject,property, null);
         assertTrue(it.hasNext());
         Resource r = it.next().getObject();
         assertFalse(it.hasNext());
         assertTrue(r instanceof NonLiteral);
         return (NonLiteral)r;
     }

     /**
      * Same as {@link #verifyValue(ContentItem, NonLiteral, UriRef, UriRef)}
      * using the URI of the content item as subject.
      */
     private static UriRef verifyValue(ContentItem ci, UriRef property, UriRef value){
         return verifyValue(ci, ci.getUri(), property, value);
     }

     /**
      * Asserts that the metadata contain exactly one value for the subject and
      * property and that this value is the expected {@link UriRef}.
      * @return the value
      */
     private static UriRef verifyValue(ContentItem ci, NonLiteral subject, UriRef property, UriRef value){
         Iterator<Triple> it = ci.getMetadata().filter(subject,property, null);
         assertTrue(it.hasNext());
         Resource r = it.next().getObject();
         assertFalse(it.hasNext());
         assertTrue(r instanceof UriRef);
         assertEquals(value,r);
         return (UriRef)r;
     }

     /**
      * Same as {@link #verifyValue(ContentItem, NonLiteral, UriRef, UriRef, String)}
      * using the URI of the content item as subject.
      */
     private static Literal verifyValue(ContentItem ci, UriRef property, UriRef dataType, String lexValue) throws ParseException{
         return verifyValue(ci, ci.getUri(), property, dataType, lexValue);
     }

     /**
      * Asserts that the metadata contain exactly one literal value for the
      * subject and property.
      * @param dataType the expected data type or <code>null</code> if a
      * {@link PlainLiteral} is expected
      * @param lexValue the expected lexical form. For <code>xsd:dateTime</code>
      * values that do not end with 'Z' the parsed {@link Date} is compared
      * instead of the lexical form.
      * @return the value
      * @throws ParseException if the expected date value can not be parsed
      */
     private static Literal verifyValue(ContentItem ci, NonLiteral subject, UriRef property, UriRef dataType, String lexValue) throws ParseException{
         Iterator<Triple> it = ci.getMetadata().filter(subject,property, null);
         assertTrue(it.hasNext());
         Resource r = it.next().getObject();
         assertFalse(it.hasNext());
         if(dataType == null){
             assertTrue(r instanceof PlainLiteral);
         } else {
             assertTrue(r instanceof TypedLiteral);
             assertEquals(dataType, ((TypedLiteral)r).getDataType());
         }
         //if we check dates and the lexical value is not UTC than we need to
         //consider the time zone of the host running this test
         if(XSD.dateTime.equals(dataType) && lexValue.charAt(lexValue.length()-1) != 'Z'){
             Date expectedDate = dateDefaultTimezone.parse(lexValue);
             assertEquals(expectedDate, lf.createObject(Date.class, ((TypedLiteral)r)));
         } else {
             assertEquals(lexValue,((Literal)r).getLexicalForm());
         }
         return (Literal)r;
     }

     /**
      * Same as {@link #verifyValues(ContentItem, NonLiteral, UriRef, UriRef, String...)}
      * using the URI of the content item as subject.
      */
     private static Set<Literal> verifyValues(ContentItem ci, UriRef property, UriRef dataType, String...lexValues){
         return verifyValues(ci, ci.getUri(), property, dataType, lexValues);
     }

     /**
      * Asserts that at least one value exists for the subject and property and
      * that every value found is a literal of the expected type with one of
      * the expected lexical forms (each expected form may be used only once).
      * <p>
      * NOTE(review): it is not asserted that all expected values were found.
      * Confirm if this is intended.
      * @param dataType the expected data type or <code>null</code> if
      * {@link PlainLiteral}s are expected
      * @return the literals found
      */
     private static Set<Literal> verifyValues(ContentItem ci, NonLiteral subject, UriRef property, UriRef dataType, String...lexValues){
         Iterator<Triple> it = ci.getMetadata().filter(subject,property, null);
         assertTrue(it.hasNext());
         Set<String> expected = new HashSet<String>(Arrays.asList(lexValues));
         Set<Literal> found = new HashSet<Literal>(expected.size());
         while(it.hasNext()){
             Resource r = it.next().getObject();
             if(dataType == null){
                 assertTrue(r instanceof PlainLiteral);
             } else {
                 assertTrue(r instanceof TypedLiteral);
                 assertEquals(dataType, ((TypedLiteral)r).getDataType());
             }
             assertTrue(expected.remove(((Literal)r).getLexicalForm()));
             found.add((Literal)r);
         }
         return found;
     }

     /**
      * Asserts that at least one value exists for the subject and property and
      * that every value found is one of the expected {@link NonLiteral}s.
      * <p>
      * NOTE(review): it is not asserted that all expected references were
      * found. Confirm if this is intended.
      * @return the references found
      */
     private static Set<NonLiteral> verifyValues(ContentItem ci, NonLiteral subject, UriRef property, NonLiteral...references){
         Iterator<Triple> it = ci.getMetadata().filter(subject,property, null);
         assertTrue(it.hasNext());
         Set<NonLiteral> expected = new HashSet<NonLiteral>(Arrays.asList(references));
         Set<NonLiteral> found = new HashSet<NonLiteral>(expected.size());
         while(it.hasNext()){
             Resource r = it.next().getObject();
             assertTrue(r instanceof NonLiteral);
             assertTrue(expected.remove(r));
             found.add((NonLiteral)r);
         }
         return found;
     }

 }