blob: 4bba96289f31ea65230784e36362832a9eb1d57d [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.oodt.cas.metadata.extractors;
//JDK imports
import java.io.File;
import java.io.FileInputStream;
import java.io.InputStream;
import java.util.Enumeration;
import java.util.logging.Logger;
//OODT imports
import org.apache.oodt.cas.metadata.Metadata;
import org.apache.oodt.cas.metadata.exceptions.MetExtractionException;
import org.apache.tika.Tika;
/**
* @author rverma
* @author arni
* @author mattmann
* @version $Revision$
*
* <p>
* A Met Extractor that invokes Apache Tika to automatically detect
* relevant metadata for a given product.
* </p>
* .
* <p>
* To use this extractor, a met extractor config file must be referenced.
* This can take the form of a Java properties file that includes,
* at a minimum, the 'ProductType=...' metadata key specified.
* </p>
*/
public class TikaCmdLineMetExtractor extends CmdLineMetExtractor {
private static final Logger LOG = Logger
.getLogger(TikaCmdLineMetExtractor.class.getName());
protected static MetReaderConfigReader reader =
new MetReaderConfigReader();
public TikaCmdLineMetExtractor() {
super(reader);
}
/*
* (non-Javadoc)
*
* @see
* org.apache.oodt.cas.metadata.AbstractMetExtractor#extractMetadata(java
* .io.File)
*/
@Override
public Metadata extrMetadata(File file) throws MetExtractionException {
try {
org.apache.tika.metadata.Metadata tikaMet =
new org.apache.tika.metadata.Metadata();
Metadata met = new Metadata();
InputStream is = new FileInputStream(file);
// extract met from prod using tika
LOG.fine("Invoking tika extractor on file ["
+ file.getAbsolutePath() + "]");
Tika tika = new Tika();
tika.parse(is, tikaMet); // extract metadata
tikaMet.add("content", tika.parseToString(file)); // extract content
LOG.fine("Number of captured tika metadata keys: ["
+ tikaMet.names().length + "]");
// copy tika met into oodt met
for (String key : tikaMet.names()) {
met.addMetadata(key, tikaMet.get(key));
LOG.fine("Added tika met key [" + key + "] with value ["
+ met.getMetadata(key) + "]");
}
MetReaderConfig myConfig = (MetReaderConfig) this.config;
// add config file met
Enumeration<Object> configMetKeys = myConfig.keys();
while (configMetKeys.hasMoreElements()) {
String configMetKey = (String) configMetKeys.nextElement();
String configMetKeyVal = (String) myConfig.get(configMetKey);
met.addMetadata(configMetKey, configMetKeyVal);
LOG.fine("Added config file met key [" + configMetKey +
"] with value [" + met.getMetadata(configMetKeyVal) + "]");
}
return met;
} catch (Exception e) {
e.printStackTrace();
LOG.severe(e.getMessage());
throw new MetExtractionException(e.getMessage());
}
}
public static void main(String[] args) throws Exception {
processMain(args, new TikaCmdLineMetExtractor());
}
}