blob: c2e87e1bb8d53aa370d22bc18ed2c77e80d3a908 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.tika.parser.xmp;
import static java.nio.charset.StandardCharsets.UTF_8;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.Calendar;
import java.util.List;
import org.apache.jempbox.xmp.ResourceEvent;
import org.apache.jempbox.xmp.ResourceRef;
import org.apache.jempbox.xmp.XMPMetadata;
import org.apache.jempbox.xmp.XMPSchemaDublinCore;
import org.apache.jempbox.xmp.XMPSchemaMediaManagement;
import org.w3c.dom.Document;
import org.xml.sax.SAXException;
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.Property;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.metadata.XMPMM;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.utils.DateUtils;
import org.apache.tika.utils.XMLReaderUtils;
public class JempboxExtractor {
//TODO: change signature to require parsecontext from parse
/** Empty context passed to the XML reader, because {@code parse} is not given one. */
private static final ParseContext EMPTY_PARSE_CONTEXT = new ParseContext();
// The XMP spec says it must be unicode, but for most file formats it specifies
// "must be encoded in UTF-8"
private static final String DEFAULT_XMP_CHARSET = UTF_8.name();
/**
 * Cap on the number of XMPMM history events copied into the metadata.
 * Static and mutable at runtime through {@code setMaxXMPMMHistory}; starts at 1024.
 */
private static volatile int MAX_EVENT_HISTORY_IN_XMPMM = 1024;
/** Locates the raw XMP packet inside the stream handed to {@code parse}. */
private final XMPPacketScanner scanner = new XMPPacketScanner();
/** Destination for every value extracted by this instance; fixed at construction. */
private final Metadata metadata;

/**
 * Creates an extractor that writes what it finds into the given metadata object.
 *
 * @param metadata Tika metadata that {@code parse} will populate
 */
public JempboxExtractor(Metadata metadata) {
    this.metadata = metadata;
}
/**
 * Copies Dublin Core values from an XMP packet into Tika metadata.
 * <p>
 * The title, description and creators are written with {@code set}; each
 * subject is appended with {@code add}. Nothing is written, and no exception
 * is thrown, when {@code xmpMetadata} is null, when reading the Dublin Core
 * schema raises an {@link IOException}, or when the schema is null.
 *
 * @param xmpMetadata XMPMetadata to process; may be null
 * @param metadata    Tika's metadata to write to
 */
public static void extractDublinCore(XMPMetadata xmpMetadata, Metadata metadata) {
    if (xmpMetadata == null) {
        return;
    }
    XMPSchemaDublinCore dublinCore;
    try {
        dublinCore = xmpMetadata.getDublinCoreSchema();
    } catch (IOException ignored) {
        // best effort: an unreadable schema is treated like a missing one
        return;
    }
    if (dublinCore == null) {
        return;
    }

    String title = dublinCore.getTitle();
    if (title != null) {
        metadata.set(TikaCoreProperties.TITLE, title);
    }

    String description = dublinCore.getDescription();
    if (description != null) {
        metadata.set(TikaCoreProperties.DESCRIPTION, description);
    }

    List<String> creators = dublinCore.getCreators();
    if (creators != null && !creators.isEmpty()) {
        // several creators are collapsed into one comma-separated value
        metadata.set(TikaCoreProperties.CREATOR, joinCreators(creators));
    }

    List<String> subjects = dublinCore.getSubjects();
    if (subjects != null) {
        // dc:subject entries are the keywords; all tested photo managers set the
        // same values in Iptc.Application2.Keywords and Xmp.dc.subject
        for (String subject : subjects) {
            metadata.add(TikaCoreProperties.SUBJECT, subject);
        }
    }
}
/**
 * Joins creator names into one comma-separated string.
 *
 * @param creators names to join; may be null or empty
 * @return an empty string for a null or empty list, the sole element itself
 *         for a one-element list, otherwise all elements separated by
 *         {@code ", "}
 */
protected static String joinCreators(List<String> creators) {
    if (creators == null || creators.isEmpty()) {
        return "";
    }
    if (creators.size() == 1) {
        // hand back the single entry untouched (including a null entry)
        return creators.get(0);
    }
    return String.join(", ", creators);
}
/**
 * Extracts Media Management (XMPMM) metadata from XMP: the document id, the
 * document and instance ids of the "derived from" reference, and the event
 * history (instance id, action, date and software agent of each event).
 * <p>
 * Silently swallows exceptions: an {@link IOException} while reading the
 * schema ends the extraction, and failures on individual values only cause
 * those values to be skipped.
 * <p>
 * History events without a non-blank instance id are skipped. For the events
 * that are kept, absent values are recorded as empty strings so that the four
 * history properties stay aligned index by index. At most
 * {@code getMaxXMPMMHistory()} events are recorded.
 *
 * @param xmp      XMP packet to read; a null packet is ignored
 * @param metadata Tika metadata that receives the XMPMM values
 */
public static void extractXMPMM(XMPMetadata xmp, Metadata metadata) {
    if (xmp == null) {
        return;
    }
    XMPSchemaMediaManagement mmSchema;
    try {
        mmSchema = xmp.getMediaManagementSchema();
    } catch (IOException ignored) {
        // best effort: an unreadable schema is treated like a missing one
        return;
    }
    if (mmSchema == null) {
        return;
    }

    addMetadata(metadata, XMPMM.DOCUMENTID, mmSchema.getDocumentID());
    //XMPMM.INSTANCEID is not extracted: not currently supported by JempBox...

    ResourceRef derivedFrom = mmSchema.getDerivedFrom();
    if (derivedFrom != null) {
        // each id is read on its own so that a failure on one does not lose the other
        try {
            addMetadata(metadata, XMPMM.DERIVED_FROM_DOCUMENTID,
                    derivedFrom.getDocumentID());
        } catch (NullPointerException ignored) {
            //swallow
        }
        try {
            addMetadata(metadata, XMPMM.DERIVED_FROM_INSTANCEID,
                    derivedFrom.getInstanceID());
        } catch (NullPointerException ignored) {
            //swallow
        }
        //TODO: not yet supported by XMPBox...extract OriginalDocumentID
        //in DerivedFrom section
    }

    List<ResourceEvent> history = mmSchema.getHistory();
    if (history == null) {
        return;
    }
    // read the volatile limit once so one extraction uses one consistent cap
    final int maxEvents = MAX_EVENT_HISTORY_IN_XMPMM;
    int eventsAdded = 0;
    for (ResourceEvent stevt : history) {
        if (eventsAdded >= maxEvents) {
            break;
        }
        String instanceId = null;
        String action = null;
        String softwareAgent = null;
        Calendar when = null;
        try {
            //instanceid can throw npe; getWhen can throw IOException
            instanceId = stevt.getInstanceID();
            action = stevt.getAction();
            softwareAgent = stevt.getSoftwareAgent();
            // read the date last: if it cannot be parsed, the values
            // already read above are still recorded
            when = stevt.getWhen();
        } catch (NullPointerException | IOException ignored) {
            // keep whatever was read before the failure
        }
        if (instanceId == null || instanceId.trim().isEmpty()) {
            continue;
        }
        //for absent data elements, pass in empty strings so
        //that parallel arrays will have matching offsets
        String actionValue = (action == null) ? "" : action;
        String dateValue = (when == null) ? "" : DateUtils.formatDate(when);
        String agentValue = (softwareAgent == null) ? "" : softwareAgent;
        metadata.add(XMPMM.HISTORY_EVENT_INSTANCEID, instanceId);
        metadata.add(XMPMM.HISTORY_ACTION, actionValue);
        metadata.add(XMPMM.HISTORY_WHEN, dateValue);
        metadata.add(XMPMM.HISTORY_SOFTWARE_AGENT, agentValue);
        eventsAdded++;
    }
}
/**
 * Adds {@code value} to {@code m} under property {@code p}.
 * <p>
 * Null values are ignored. When the property does not permit multiple
 * values, the value is added only if the property has no value yet, so an
 * existing value is never overwritten or duplicated.
 *
 * @param m     metadata to update
 * @param p     property to add the value under
 * @param value value to add; ignored when null
 */
private static void addMetadata(Metadata m, Property p, String value) {
    if (value == null) {
        return;
    }
    // the existing value is only looked up for single-valued properties
    boolean alreadyPopulated = !p.isMultiValuePermitted() && m.get(p) != null;
    if (!alreadyPopulated) {
        m.add(p, value);
    }
}
/**
* Returns the current cap on the number of XMPMM history events that are
* extracted. The cap is a static setting, so it is shared by every caller
* of this class rather than being per instance.
*
* @return maximum number of events to extract from the XMPMM history.
*/
public static int getMaxXMPMMHistory() {
return MAX_EVENT_HISTORY_IN_XMPMM;
}
/**
* Maximum number of events to extract from the
* event history in the XMP Media Management (XMPMM) section.
* The extractor will silently stop adding events after it
* has reached this threshold.
* <p>
* The default is 1024.
* <p>
* The value is stored as given, without validation. Because the extractor
* stops as soon as the number of events added is greater than or equal to
* this value, zero or a negative value means no history events are extracted.
* The setting is static and therefore applies to all users of this class.
*
* @param maxEvents new maximum number of history events to extract
*/
public static void setMaxXMPMMHistory(int maxEvents) {
MAX_EVENT_HISTORY_IN_XMPMM = maxEvents;
}
/**
 * Scans the stream for an XMP packet and, if one is found, copies its
 * Dublin Core and Media Management values into the metadata object given
 * to the constructor.
 * <p>
 * If the scanner reports that no packet was found, nothing is written. An
 * {@link IOException} or {@link SAXException} raised while turning the
 * packet into a DOM is swallowed, in which case nothing is extracted.
 *
 * @param file stream to scan for an XMP packet
 * @throws IOException   if thrown while scanning the stream for the packet
 * @throws TikaException if thrown while scanning or while building the DOM
 */
public void parse(InputStream file) throws IOException, TikaException {
    ByteArrayOutputStream packetBytes = new ByteArrayOutputStream();
    boolean packetFound = scanner.parse(file, packetBytes);
    if (!packetFound) {
        return;
    }

    XMPMetadata packet = null;
    try (InputStream packetStream = new ByteArrayInputStream(packetBytes.toByteArray())) {
        Document packetDom = XMLReaderUtils.buildDOM(packetStream, EMPTY_PARSE_CONTEXT);
        packet = (packetDom == null) ? null : new XMPMetadata(packetDom);
    } catch (IOException | SAXException ignored) {
        // best effort: a packet that cannot be parsed is treated as absent;
        // both extract methods below ignore a null packet
    }

    extractDublinCore(packet, metadata);
    extractXMPMM(packet, metadata);
}
}