blob: 204b99f3c86a2a412d82215f098f2621fd70900c [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.tika.parser.microsoft;
import java.io.IOException;
import java.io.InputStream;
import java.util.Collections;
import java.util.Set;
import org.apache.poi.hemf.record.emf.HemfComment;
import org.apache.poi.hemf.record.emf.HemfRecord;
import org.apache.poi.hemf.record.emf.HemfRecordType;
import org.apache.poi.hemf.record.emf.HemfText;
import org.apache.poi.hemf.usermodel.HemfPicture;
import org.apache.poi.util.RecordFormatException;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
import org.apache.tika.exception.TikaException;
import org.apache.tika.extractor.EmbeddedDocumentExtractor;
import org.apache.tika.extractor.EmbeddedDocumentUtil;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.AbstractParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.sax.EmbeddedContentHandler;
import org.apache.tika.sax.XHTMLContentHandler;
/**
* Extracts files embedded in EMF and offers a
* very rough capability to extract text if there
* is text stored in the EMF.
* <p>
* To improve text extraction, we'd have to implement
* quite a bit more at the POI level. We'd want to track changes
* in font and use that information for identifying character sets,
* inserting spaces and new lines.
*/
public class EMFParser extends AbstractParser {

    private static final MediaType MEDIA_TYPE = MediaType.image("emf");
    private static final MediaType WMF_MEDIA_TYPE = MediaType.image("wmf");

    private static final Set<MediaType> SUPPORTED_TYPES = Collections.singleton(MEDIA_TYPE);

    /**
     * Horizontal distance between the reference points of two consecutive text
     * records on the same line above which a space is inserted between them.
     * TODO: derive this from the font or frame/bounds information.
     */
    private static final long FUDGE_FACTOR_X = 1000;

    /**
     * Hands one raw embedded payload to the embedded document extractor.
     * <p>
     * This is best effort: an {@link IOException} raised while handling the
     * embedded bytes does not abort the parse of the containing EMF; it is
     * recorded on the parent metadata instead of being silently dropped.
     *
     * @param data                      raw bytes of the embedded item
     * @param embeddedDocumentExtractor extractor that decides whether to parse and does the parsing
     * @param handler                   handler of the containing document
     * @param parentMetadata            metadata of the containing EMF; receives a record of any
     *                                  IOException
     * @throws TikaException declared for callers; propagated unchanged if raised
     * @throws SAXException  if the extractor reports a SAX failure
     */
    private static void handleEmbedded(byte[] data,
                                       EmbeddedDocumentExtractor embeddedDocumentExtractor,
                                       ContentHandler handler, Metadata parentMetadata)
            throws TikaException, SAXException {
        try (InputStream is = TikaInputStream.get(data)) {
            Metadata embeddedMetadata = new Metadata();
            if (embeddedDocumentExtractor.shouldParseEmbedded(embeddedMetadata)) {
                embeddedDocumentExtractor
                        .parseEmbedded(is, new EmbeddedContentHandler(handler), embeddedMetadata,
                                false);
            }
        } catch (IOException e) {
            //keep going with the rest of the EMF, but leave a trace of the failure
            EmbeddedDocumentUtil.recordEmbeddedStreamException(e, parentMetadata);
        }
    }

    /**
     * Writes the buffered text as a single {@code <p>} element and empties the buffer.
     *
     * @param xhtml  handler to write to
     * @param buffer text collected for the current line; cleared on return
     * @throws SAXException if the handler fails
     */
    private static void writeParagraph(XHTMLContentHandler xhtml, StringBuilder buffer)
            throws SAXException {
        xhtml.startElement("p");
        xhtml.characters(buffer.toString());
        xhtml.endElement("p");
        buffer.setLength(0);
    }

    @Override
    public Set<MediaType> getSupportedTypes(ParseContext context) {
        return SUPPORTED_TYPES;
    }

    /**
     * Iterates over the EMF records, passing embedded content found in comment
     * records (multi-format data and WMF data) to the embedded document extractor
     * and writing the text of {@code extTextOutW} records as paragraphs.
     * <p>
     * Consecutive text records that share the same reference Y are concatenated
     * into one paragraph; a change in Y starts a new paragraph. A space is inserted
     * when the X distance to the previous record on the line exceeds
     * {@link #FUDGE_FACTOR_X}.
     *
     * @throws TikaException if POI throws any runtime exception while reading the EMF
     */
    @Override
    public void parse(InputStream stream, ContentHandler handler, Metadata metadata,
                      ParseContext context) throws IOException, SAXException, TikaException {

        //created lazily: only needed if the EMF actually carries embedded content
        EmbeddedDocumentExtractor embeddedDocumentExtractor = null;
        XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
        xhtml.startDocument();
        try {
            HemfPicture picture = new HemfPicture(stream);
            //NaN means "no previous text record (on this line)". A numeric sentinel
            //such as -1 would misfire for text placed at negative coordinates.
            double lastY = Double.NaN;
            double lastX = Double.NaN;
            StringBuilder buffer = new StringBuilder();
            for (HemfRecord record : picture) {
                HemfRecordType type = record.getEmfRecordType();
                if (type == HemfRecordType.comment) {
                    HemfComment.EmfCommentData commentData =
                            ((HemfComment.EmfComment) record).getCommentData();
                    if (commentData instanceof HemfComment.EmfCommentDataMultiformats) {
                        if (embeddedDocumentExtractor == null) {
                            embeddedDocumentExtractor =
                                    EmbeddedDocumentUtil.getEmbeddedDocumentExtractor(context);
                        }
                        handleMultiFormats((HemfComment.EmfCommentDataMultiformats) commentData,
                                xhtml, embeddedDocumentExtractor, metadata);
                    } else if (commentData instanceof HemfComment.EmfCommentDataWMF) {
                        if (embeddedDocumentExtractor == null) {
                            embeddedDocumentExtractor =
                                    EmbeddedDocumentUtil.getEmbeddedDocumentExtractor(context);
                        }
                        handleWMF(((HemfComment.EmfCommentDataWMF) commentData).getWMFData(),
                                xhtml, embeddedDocumentExtractor);
                    }
                } else if (type == HemfRecordType.extTextOutW) {
                    HemfText.EmfExtTextOutW extTextOutW = (HemfText.EmfExtTextOutW) record;
                    double x = extTextOutW.getReference().getX();
                    double y = extTextOutW.getReference().getY();

                    //TODO: change equality to delta diff
                    if (!Double.isNaN(lastY) && lastY != y) {
                        //the line changed: flush what was collected for the previous line
                        writeParagraph(xhtml, buffer);
                        lastX = Double.NaN;
                    }
                    if (!Double.isNaN(lastX) && x - lastX > FUDGE_FACTOR_X) {
                        buffer.append(" ");
                    }
                    buffer.append(extTextOutW.getText());
                    lastY = y;
                    lastX = x;
                }
            }
            if (buffer.length() > 0) {
                writeParagraph(xhtml, buffer);
            }
        } catch (RecordFormatException e) {
            //POI's hemf parser can throw these for "parse exceptions"
            throw new TikaException(e.getMessage(), e);
        } catch (RuntimeException e) {
            //treat any other runtime exception from POI as a parse failure as well
            throw new TikaException(e.getMessage(), e);
        }
        xhtml.endDocument();
    }

    /**
     * Passes WMF bytes found in an EMF comment record to the embedded document
     * extractor, with the content type preset to {@code image/wmf}.
     *
     * @param bytes                     raw WMF data
     * @param contentHandler            handler of the containing document
     * @param embeddedDocumentExtractor extractor that decides whether to parse and does the parsing
     * @throws IOException   if reading or closing the embedded stream fails
     * @throws SAXException  if the extractor reports a SAX failure
     * @throws TikaException declared for callers; propagated unchanged if raised
     */
    private void handleWMF(byte[] bytes, ContentHandler contentHandler,
                           EmbeddedDocumentExtractor embeddedDocumentExtractor)
            throws IOException, SAXException, TikaException {
        Metadata embeddedMetadata = new Metadata();
        embeddedMetadata.set(Metadata.CONTENT_TYPE, WMF_MEDIA_TYPE.toString());
        if (embeddedDocumentExtractor.shouldParseEmbedded(embeddedMetadata)) {
            try (InputStream is = TikaInputStream.get(bytes)) {
                embeddedDocumentExtractor
                        .parseEmbedded(is, new EmbeddedContentHandler(contentHandler),
                                embeddedMetadata, false);
            }
        }
    }

    /**
     * Passes the raw data of every format in a multi-format comment record to
     * the embedded document extractor.
     *
     * @param commentData               the multi-format comment data
     * @param handler                   handler of the containing document
     * @param embeddedDocumentExtractor extractor that decides whether to parse and does the parsing
     * @param parentMetadata            metadata of the containing EMF
     * @throws IOException   declared for callers; propagated unchanged if raised
     * @throws TikaException declared for callers; propagated unchanged if raised
     * @throws SAXException  if the extractor reports a SAX failure
     */
    private void handleMultiFormats(HemfComment.EmfCommentDataMultiformats commentData,
                                    ContentHandler handler,
                                    EmbeddedDocumentExtractor embeddedDocumentExtractor,
                                    Metadata parentMetadata)
            throws IOException, TikaException, SAXException {
        for (HemfComment.EmfCommentDataFormat dataFormat : commentData.getFormats()) {
            //is this right?!
            handleEmbedded(dataFormat.getRawData(), embeddedDocumentExtractor, handler,
                    parentMetadata);
        }
    }
}