tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-xmp-commons/src/main/java/org/apache/tika/parser/xmp/JempboxExtractor.java - tika - Git at Google

 /*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
  * The ASF licenses this file to You under the Apache License, Version 2.0
  * (the "License"); you may not use this file except in compliance with
  * the License.  You may obtain a copy of the License at
  *
  *     http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
 package org.apache.tika.parser.xmp;

 import static java.nio.charset.StandardCharsets.UTF_8;

 import java.io.ByteArrayInputStream;
 import java.io.ByteArrayOutputStream;
 import java.io.IOException;
 import java.io.InputStream;
 import java.util.Calendar;
 import java.util.List;

 import org.apache.jempbox.xmp.ResourceEvent;
 import org.apache.jempbox.xmp.ResourceRef;
 import org.apache.jempbox.xmp.XMPMetadata;
 import org.apache.jempbox.xmp.XMPSchemaDublinCore;
 import org.apache.jempbox.xmp.XMPSchemaMediaManagement;
 import org.w3c.dom.Document;
 import org.xml.sax.SAXException;

 import org.apache.tika.exception.TikaException;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.metadata.Property;
 import org.apache.tika.metadata.TikaCoreProperties;
 import org.apache.tika.metadata.XMPMM;
 import org.apache.tika.parser.ParseContext;
 import org.apache.tika.utils.DateUtils;
 import org.apache.tika.utils.XMLReaderUtils;

 /**
  * Extracts XMP metadata (Dublin Core and Media Management schemas) with JempBox
  * and copies it into a Tika {@link Metadata} object.
  * <p>
  * The static {@code extract*} methods are best effort: failures while reading
  * individual XMP properties are swallowed so that one bad property does not
  * prevent the rest of the metadata from being extracted.
  */
 public class JempboxExtractor {

     //TODO: change signature to require parsecontext from parse
     private static final ParseContext EMPTY_PARSE_CONTEXT = new ParseContext();
     // The XMP spec says it must be unicode, but for most file formats it specifies
     // "must be encoded in UTF-8"
     private static final String DEFAULT_XMP_CHARSET = UTF_8.name();
     // Upper bound on the number of history events copied by extractXMPMM.
     // Adjustable at runtime through setMaxXMPMMHistory, hence volatile and not final.
     private static volatile int MAX_EVENT_HISTORY_IN_XMPMM = 1024;
     private final XMPPacketScanner scanner = new XMPPacketScanner();
     private final Metadata metadata;

     /**
      * @param metadata Tika metadata object that {@link #parse(InputStream)} writes to
      */
     public JempboxExtractor(Metadata metadata) {
         this.metadata = metadata;
     }

     /**
      * Tries to extract Dublin Core schema from XMP.  If XMPMetadata is null
      * or if the DC schema is null, this will return without throwing an exception.
      * <p>
      * Title, description and creator overwrite any existing value; every
      * subject is added as an additional value.
      *
      * @param xmpMetadata XMPMetadata to process
      * @param metadata    Tika's metadata to write to
      */
     public static void extractDublinCore(XMPMetadata xmpMetadata, Metadata metadata) {
         if (xmpMetadata == null) {
             return;
         }
         XMPSchemaDublinCore dc = null;
         try {
             dc = xmpMetadata.getDublinCoreSchema();
         } catch (IOException e) {
             //swallow: an unreadable schema is handled like a missing one below
         }
         if (dc == null) {
             return;
         }
         // Fetch every property exactly once instead of once for the check
         // and a second time for the value.
         String title = dc.getTitle();
         if (title != null) {
             metadata.set(TikaCoreProperties.TITLE, title);
         }
         String description = dc.getDescription();
         if (description != null) {
             metadata.set(TikaCoreProperties.DESCRIPTION, description);
         }
         List<String> creators = dc.getCreators();
         if (creators != null && !creators.isEmpty()) {
             metadata.set(TikaCoreProperties.CREATOR, joinCreators(creators));
         }
         List<String> subjects = dc.getSubjects();
         if (subjects != null) {
             // Each dc:subject entry becomes its own SUBJECT value.
             // All tested photo managers set the same in Iptc.Application2.Keywords
             // and Xmp.dc.subject
             for (String keyword : subjects) {
                 metadata.add(TikaCoreProperties.SUBJECT, keyword);
             }
         }
     }

     /**
      * Joins creator names into a single comma-separated string.
      *
      * @param creators creator names; may be null or empty
      * @return the empty string for a null or empty list, the sole element for a
      * single-element list, otherwise all elements separated by {@code ", "}
      */
     protected static String joinCreators(List<String> creators) {
         if (creators == null || creators.isEmpty()) {
             return "";
         }
         if (creators.size() == 1) {
             // returned as is (possibly null), without going through String.join
             return creators.get(0);
         }
         return String.join(", ", creators);
     }

     /**
      * Extracts Media Management metadata from XMP.
      * <p>
      * Silently swallows exceptions.
      *
      * @param xmp      XMP metadata to read from; if null, nothing is extracted
      * @param metadata Tika's metadata to write to
      */
     public static void extractXMPMM(XMPMetadata xmp, Metadata metadata) {
         if (xmp == null) {
             return;
         }
         XMPSchemaMediaManagement mmSchema = null;
         try {
             mmSchema = xmp.getMediaManagementSchema();
         } catch (IOException e) {
             //swallow
             return;
         }
         if (mmSchema == null) {
             return;
         }
         addMetadata(metadata, XMPMM.DOCUMENTID, mmSchema.getDocumentID());
         //XMPMM.INSTANCEID (mmSchema.getInstanceID()) is not currently supported by JempBox...

         ResourceRef derivedFrom = mmSchema.getDerivedFrom();
         if (derivedFrom != null) {
             try {
                 addMetadata(metadata, XMPMM.DERIVED_FROM_DOCUMENTID,
                         derivedFrom.getDocumentID());
             } catch (NullPointerException e) {
                 //swallow
             }

             try {
                 addMetadata(metadata, XMPMM.DERIVED_FROM_INSTANCEID,
                         derivedFrom.getInstanceID());
             } catch (NullPointerException e) {
                 //swallow
             }

             //TODO: not yet supported by XMPBox...extract OriginalDocumentID
             //in DerivedFrom section
         }

         List<ResourceEvent> history = mmSchema.getHistory();
         if (history == null) {
             return;
         }
         // read the volatile limit once so that it stays stable for this run
         int maxEvents = MAX_EVENT_HISTORY_IN_XMPMM;
         int eventsAdded = 0;
         for (ResourceEvent stevt : history) {
             if (eventsAdded >= maxEvents) {
                 break;
             }
             if (addHistoryEvent(stevt, metadata)) {
                 eventsAdded++;
             }
         }
     }

     /**
      * Adds one history event to the four parallel XMPMM history properties.
      * <p>
      * Every field is read in its own try block so that a failure on one field
      * (for example a date that cannot be parsed) does not discard the fields
      * that would have been read after it.
      *
      * @param event    history event to read
      * @param metadata Tika's metadata to write to
      * @return true if the event had a non-blank instance id and was added
      */
     private static boolean addHistoryEvent(ResourceEvent event, Metadata metadata) {
         String instanceId;
         try {
             instanceId = event.getInstanceID();
         } catch (NullPointerException e) {
             //instanceid can throw npe; without an id the event is skipped
             return false;
         }
         if (instanceId == null || instanceId.trim().isEmpty()) {
             return false;
         }

         String action = null;
         try {
             action = event.getAction();
         } catch (NullPointerException e) {
             //swallow
         }

         Calendar when = null;
         try {
             when = event.getWhen();
         } catch (NullPointerException | IOException e) {
             //getWhen can throw IOException; treat the date as absent
         }

         String softwareAgent = null;
         try {
             softwareAgent = event.getSoftwareAgent();
         } catch (NullPointerException e) {
             //swallow
         }

         //for absent data elements, pass in empty strings so
         //that parallel arrays will have matching offsets
         metadata.add(XMPMM.HISTORY_EVENT_INSTANCEID, instanceId);
         metadata.add(XMPMM.HISTORY_ACTION, (action == null) ? "" : action);
         metadata.add(XMPMM.HISTORY_WHEN, (when == null) ? "" : DateUtils.formatDate(when));
         metadata.add(XMPMM.HISTORY_SOFTWARE_AGENT,
                 (softwareAgent == null) ? "" : softwareAgent);
         return true;
     }

     /**
      * Adds a value unless it is null or the property is single valued and
      * already has a value.
      */
     private static void addMetadata(Metadata m, Property p, String value) {
         if (value == null) {
             return;
         }
         if (p.isMultiValuePermitted() || m.get(p) == null) {
             m.add(p, value);
         }
     }

     /**
      * @return maximum number of events to extract from the XMPMM history.
      */
     public static int getMaxXMPMMHistory() {
         return MAX_EVENT_HISTORY_IN_XMPMM;
     }

     /**
      * Maximum number of events to extract from the
      * event history in the XMP Media Management (XMPMM) section.
      * The extractor will silently stop adding events after it
      * has reached this threshold.
      * <p>
      * The default is 1024.  A value of zero or less means that no
      * history events are extracted.
      *
      * @param maxEvents new maximum number of history events
      */
     public static void setMaxXMPMMHistory(int maxEvents) {
         MAX_EVENT_HISTORY_IN_XMPMM = maxEvents;
     }

     /**
      * Runs the XMP packet scanner over the stream and, if it reports a packet,
      * extracts Dublin Core and Media Management metadata from it into the
      * metadata object passed to the constructor.
      * <p>
      * If the scanner reports no packet, or the packet cannot be turned into a
      * DOM because of an IOException or SAXException, nothing is extracted.
      *
      * @param file stream to scan for an XMP packet
      * @throws IOException   if scanning the stream fails
      * @throws TikaException declared by the DOM building step
      */
     public void parse(InputStream file) throws IOException, TikaException {
         ByteArrayOutputStream xmpraw = new ByteArrayOutputStream();
         if (!scanner.parse(file, xmpraw)) {
             return;
         }

         XMPMetadata xmp = null;
         try (InputStream decoded = new ByteArrayInputStream(xmpraw.toByteArray())) {
             Document dom = XMLReaderUtils.buildDOM(decoded, EMPTY_PARSE_CONTEXT);
             if (dom != null) {
                 xmp = new XMPMetadata(dom);
             }
         } catch (IOException | SAXException e) {
             //swallow: xmp stays null and the extract calls below become no-ops
         }
         extractDublinCore(xmp, metadata);
         extractXMPMM(xmp, metadata);
     }
 }
	/*
	* Licensed to the Apache Software Foundation (ASF) under one or more
	* contributor license agreements. See the NOTICE file distributed with
	* this work for additional information regarding copyright ownership.
	* The ASF licenses this file to You under the Apache License, Version 2.0
	* (the "License"); you may not use this file except in compliance with
	* the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/
	package org.apache.tika.parser.xmp;

	import static java.nio.charset.StandardCharsets.UTF_8;

	import java.io.ByteArrayInputStream;
	import java.io.ByteArrayOutputStream;
	import java.io.IOException;
	import java.io.InputStream;
	import java.util.Calendar;
	import java.util.List;

	import org.apache.jempbox.xmp.ResourceEvent;
	import org.apache.jempbox.xmp.ResourceRef;
	import org.apache.jempbox.xmp.XMPMetadata;
	import org.apache.jempbox.xmp.XMPSchemaDublinCore;
	import org.apache.jempbox.xmp.XMPSchemaMediaManagement;
	import org.w3c.dom.Document;
	import org.xml.sax.SAXException;

	import org.apache.tika.exception.TikaException;
	import org.apache.tika.metadata.Metadata;
	import org.apache.tika.metadata.Property;
	import org.apache.tika.metadata.TikaCoreProperties;
	import org.apache.tika.metadata.XMPMM;
	import org.apache.tika.parser.ParseContext;
	import org.apache.tika.utils.DateUtils;
	import org.apache.tika.utils.XMLReaderUtils;

	public class JempboxExtractor {

	//TODO: change signature to require parsecontext from parse
	private static final ParseContext EMPTY_PARSE_CONTEXT = new ParseContext();
	// The XMP spec says it must be unicode, but for most file formats it specifies
	// "must be encoded in UTF-8"
	private static final String DEFAULT_XMP_CHARSET = UTF_8.name();
	private static volatile int MAX_EVENT_HISTORY_IN_XMPMM = 1024;
	private XMPPacketScanner scanner = new XMPPacketScanner();
	private Metadata metadata;

	public JempboxExtractor(Metadata metadata) {
	this.metadata = metadata;
	}

	/**
	* Tries to extract Dublin Core schema from XMP. If XMPMetadata is null
	* or if the DC schema is null, this will return without throwing an exception.
	*
	* @param xmpMetadata XMPMetadata to process
	* @param metadata Tika's metadata to write to
	*/
	public static void extractDublinCore(XMPMetadata xmpMetadata, Metadata metadata) {
	if (xmpMetadata == null) {
	return;
	}
	XMPSchemaDublinCore dc = null;
	try {
	dc = xmpMetadata.getDublinCoreSchema();
	} catch (IOException e) {
	//swallow
	}
	if (dc == null) {
	return;
	}
	if (dc.getTitle() != null) {
	metadata.set(TikaCoreProperties.TITLE, dc.getTitle());
	}
	if (dc.getDescription() != null) {
	metadata.set(TikaCoreProperties.DESCRIPTION, dc.getDescription());
	}
	if (dc.getCreators() != null && dc.getCreators().size() > 0) {
	metadata.set(TikaCoreProperties.CREATOR, joinCreators(dc.getCreators()));
	}
	if (dc.getSubjects() != null && dc.getSubjects().size() > 0) {
	for (String keyword : dc.getSubjects()) {
	metadata.add(TikaCoreProperties.SUBJECT, keyword);
	}
	// TODO should we set SUBJECT too?
	// All tested photo managers set the same in Iptc.Application2.Keywords
	// and Xmp.dc.subject
	}
	}

	protected static String joinCreators(List<String> creators) {
	if (creators == null \|\| creators.size() == 0) {
	return "";
	}
	if (creators.size() == 1) {
	return creators.get(0);
	}
	StringBuffer c = new StringBuffer();
	for (String s : creators) {
	c.append(", ").append(s);
	}
	return c.substring(2);
	}

	/**
	* Extracts Media Management metadata from XMP.
	* <p>
	* Silently swallows exceptions.
	*
	* @param xmp
	* @param metadata
	*/
	public static void extractXMPMM(XMPMetadata xmp, Metadata metadata) {
	if (xmp == null) {
	return;
	}
	XMPSchemaMediaManagement mmSchema = null;
	try {
	mmSchema = xmp.getMediaManagementSchema();
	} catch (IOException e) {
	//swallow
	return;
	}
	if (mmSchema != null) {
	addMetadata(metadata, XMPMM.DOCUMENTID, mmSchema.getDocumentID());
	//not currently supported by JempBox...
	// metadata.set(XMPMM.INSTANCEID, mmSchema.getInstanceID());

	ResourceRef derivedFrom = mmSchema.getDerivedFrom();
	if (derivedFrom != null) {
	try {
	addMetadata(metadata, XMPMM.DERIVED_FROM_DOCUMENTID,
	derivedFrom.getDocumentID());
	} catch (NullPointerException e) {
	//swallow
	}

	try {
	addMetadata(metadata, XMPMM.DERIVED_FROM_INSTANCEID,
	derivedFrom.getInstanceID());
	} catch (NullPointerException e) {
	//swallow
	}

	//TODO: not yet supported by XMPBox...extract OriginalDocumentID
	//in DerivedFrom section
	}
	if (mmSchema.getHistory() != null) {
	int eventsAdded = 0;
	for (ResourceEvent stevt : mmSchema.getHistory()) {
	if (eventsAdded >= MAX_EVENT_HISTORY_IN_XMPMM) {
	break;
	}
	String instanceId = null;
	String action = null;
	Calendar when = null;
	String softwareAgent = null;
	try {
	instanceId = stevt.getInstanceID();
	action = stevt.getAction();
	when = stevt.getWhen();
	softwareAgent = stevt.getSoftwareAgent();

	//instanceid can throw npe; getWhen can throw IOException
	} catch (NullPointerException \| IOException e) {
	//swallow
	}
	if (instanceId != null && instanceId.trim().length() > 0) {
	//for absent data elements, pass in empty strings so
	//that parallel arrays will have matching offsets
	//for absent data

	action = (action == null) ? "" : action;
	String dateString = (when == null) ? "" : DateUtils.formatDate(when);
	softwareAgent = (softwareAgent == null) ? "" : softwareAgent;

	metadata.add(XMPMM.HISTORY_EVENT_INSTANCEID, instanceId);
	metadata.add(XMPMM.HISTORY_ACTION, action);
	metadata.add(XMPMM.HISTORY_WHEN, dateString);
	metadata.add(XMPMM.HISTORY_SOFTWARE_AGENT, softwareAgent);
	eventsAdded++;
	}
	}
	}
	}
	}

	private static void addMetadata(Metadata m, Property p, String value) {
	if (value != null) {
	if (p.isMultiValuePermitted() \|\| m.get(p) == null) {
	m.add(p, value);
	}
	}
	}

	/**
	* @return maximum number of events to extract from the XMPMM history.
	*/
	public static int getMaxXMPMMHistory() {
	return MAX_EVENT_HISTORY_IN_XMPMM;
	}

	/**
	* Maximum number of events to extract from the
	* event history in the XMP Media Management (XMPMM) section.
	* The extractor will silently stop adding events after it
	* has reached this threshold.
	* <p>
	* The default is 1024.
	*/
	public static void setMaxXMPMMHistory(int maxEvents) {
	MAX_EVENT_HISTORY_IN_XMPMM = maxEvents;
	}

	public void parse(InputStream file) throws IOException, TikaException {
	ByteArrayOutputStream xmpraw = new ByteArrayOutputStream();
	if (!scanner.parse(file, xmpraw)) {
	return;
	}

	XMPMetadata xmp = null;
	try (InputStream decoded = new ByteArrayInputStream(xmpraw.toByteArray())) {
	Document dom = XMLReaderUtils.buildDOM(decoded, EMPTY_PARSE_CONTEXT);
	if (dom != null) {
	xmp = new XMPMetadata(dom);
	}
	} catch (IOException \| SAXException e) {
	//
	}
	extractDublinCore(xmp, metadata);
	extractXMPMM(xmp, metadata);
	}
	}