blob: 5ed1399a28b9da74568202dee6059b86f39a888d [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.tika.parser.microsoft.xml;
import java.io.IOException;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Locale;
import java.util.Map;
import java.util.Set;
import javax.xml.namespace.QName;
import org.apache.commons.codec.binary.Base64;
import org.xml.sax.Attributes;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.AttributesImpl;
import org.xml.sax.helpers.DefaultHandler;
import org.apache.tika.extractor.EmbeddedDocumentExtractor;
import org.apache.tika.extractor.EmbeddedDocumentUtil;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.sax.TeeContentHandler;
import org.apache.tika.sax.XHTMLContentHandler;
/**
* Parses wordml 2003 format word files. These are single xml files
* that predate ooxml.
* <p>
* See <a href="https://en.wikipedia.org/wiki/Microsoft_Office_XML_formats">https://en.wikipedia.org/wiki/Microsoft_Office_XML_formats</a>
*/
public class WordMLParser extends AbstractXML2003Parser {
//map between wordml and xhtml entities
private static final Map<String, String> WORDML_TO_XHTML;
//ignore all characters within these elements
private static final Set<QName> IGNORE_CHARACTERS = Collections.unmodifiableSet(new HashSet<>(
Arrays.asList(new QName(WORD_ML_URL, HLINK), new QName(WORD_ML_URL, PICT),
new QName(WORD_ML_URL, BIN_DATA),
new QName(MS_OFFICE_PROPERTIES_URN, DOCUMENT_PROPERTIES))));
private static final MediaType MEDIA_TYPE = MediaType.application("vnd.ms-wordml");
private static final Set<MediaType> SUPPORTED_TYPES = Collections.singleton(MEDIA_TYPE);
static {
Map<String, String> m = new HashMap<>();
m.put(P, P);
m.put("tbl", TABLE);
m.put(TR, TR);
m.put("tc", TD);//not a typo -- table cell -> tc
WORDML_TO_XHTML = Collections.unmodifiableMap(m);
}
//immutable
@Override
public Set<MediaType> getSupportedTypes(ParseContext context) {
return SUPPORTED_TYPES;
}
@Override
protected ContentHandler getContentHandler(ContentHandler ch, Metadata metadata,
ParseContext context) {
return new TeeContentHandler(super.getContentHandler(ch, metadata, context),
new WordMLHandler(ch), new HyperlinkHandler(ch, WORD_ML_URL),
new PictHandler(ch, metadata,
EmbeddedDocumentUtil.getEmbeddedDocumentExtractor(context)));
}
@Override
public void setContentType(Metadata metadata) {
metadata.set(Metadata.CONTENT_TYPE, MEDIA_TYPE.toString());
}
private static class WordMLHandler extends DefaultHandler {
private final ContentHandler handler;
private boolean ignoreCharacters;
private boolean inBody = false;
//use inP to keep track of whether the handler is
//in a paragraph or not. <p><p></p></p> was allowed
//in wordml. Use this boolean to prevent <p> within <p>
private boolean inP;
public WordMLHandler(ContentHandler handler) {
this.handler = handler;
}
@Override
public void startElement(String uri, String localName, String qName, Attributes attrs)
throws SAXException {
localName = localName.toLowerCase(Locale.US);
if (WORD_ML_URL.equals(uri)) {
if (BODY.equals(localName)) {
inBody = true;
return;
}
String html = WORDML_TO_XHTML.get(localName);
if (html != null) {
if (P.equals(localName)) {
//close p if already in a p to prevent nested <p>
if (inP) {
handler.endElement(XHTMLContentHandler.XHTML, P, P);
}
inP = true;
}
handler.startElement(XHTMLContentHandler.XHTML, html, html, EMPTY_ATTRS);
if (html.equals(TABLE)) {
handler.startElement(XHTMLContentHandler.XHTML, TBODY, TBODY, EMPTY_ATTRS);
}
}
if (BR.equals(localName)) {
handler.characters(NEWLINE, 0, 1);
}
}
if (IGNORE_CHARACTERS.contains(new QName(uri, localName))) {
ignoreCharacters = true;
}
}
@Override
public void characters(char[] str, int offset, int len) throws SAXException {
if (!ignoreCharacters && inBody) {
handler.characters(str, offset, len);
}
}
@Override
public void endElement(String uri, String localName, String qName) throws SAXException {
if (WORD_ML_URL.equals(uri)) {
//for now, don't bother checking for end of body...if there's any text
//after the close of body, we should extract it
localName = localName.toLowerCase(Locale.US);
String html = WORDML_TO_XHTML.get(localName);
if (html != null) {
if (html.equals(TABLE)) {
handler.endElement(XHTMLContentHandler.XHTML, TBODY, TBODY);
}
if (P.equals(html) && !inP) {
//start p if not already in one to prevent non-matching <p>
handler.startElement(XHTMLContentHandler.XHTML, P, P, EMPTY_ATTRS);
}
handler.endElement(XHTMLContentHandler.XHTML, html, html);
if (P.equals(html)) {
inP = false;
}
}
}
if (IGNORE_CHARACTERS.contains(new QName(uri, localName))) {
ignoreCharacters = false;
}
}
}
private static class PictHandler extends DefaultHandler {
final StringBuilder buffer = new StringBuilder();
final Metadata parentMetadata;
final ContentHandler handler;
final Base64 base64 = new Base64();
byte[] rawBytes = null;
EmbeddedDocumentExtractor embeddedDocumentExtractor;
boolean inPict = false;
boolean inBin = false;
String pictName = null;
String pictSource = null;
public PictHandler(ContentHandler handler, Metadata metadata,
EmbeddedDocumentExtractor embeddedDocumentExtractor) {
this.handler = handler;
this.parentMetadata = metadata;
this.embeddedDocumentExtractor = embeddedDocumentExtractor;
}
@Override
public void startElement(String uri, String localName, String qName, Attributes attrs)
throws SAXException {
if (WORD_ML_URL.equals(uri)) {
if (PICT.equals(localName)) {
inPict = true;
} else if (BIN_DATA.equals(localName)) {
inBin = true;
pictName = attrs.getValue(WORD_ML_URL, NAME_ATTR);
if (pictName != null) {
pictName = pictName.replaceFirst("wordml://", "");
}
}
} else if (MS_VML_URN.equals(uri)) {
if (localName.equals("imagedata")) {
//src is an internal designator with an extension
String src = attrs.getValue("", "src");
//title appears to be the original file name
String title = attrs.getValue(MS_OFFICE_PROPERTIES_URN, "title");
if (title != null && !title.equals("")) {
if (src != null) {
//take the extention from the src and append it to the title
int i = src.lastIndexOf(".");
if (i > -1 && i + 1 < src.length()) {
String ext = src.substring(i);
title += ext;
}
}
pictSource = title;
}
}
}
}
@Override
public void characters(char[] str, int offset, int len) throws SAXException {
if (inBin) {
buffer.append(str, offset, len);
} else if (inPict) {
handler.characters(str, offset, len);
}
}
@Override
public void endElement(String uri, String localName, String qName) throws SAXException {
if (!WORD_ML_URL.equals(uri)) {
return;
}
//somewhat tricky...
//can't just dump bin_data at the end of the
//bin_data element because there may be metadata
//after it, if it is within a pict element
//<pict><binData></binData><imagedata/></pict>.
//However, if you aren't in a pict (say docOLEdata), then do dump binary
//data at the end of the bin data.
if (PICT.equals(localName)) {
inPict = false;
AttributesImpl attrs = new AttributesImpl();
if (pictName != null) {
attrs.addAttribute(XHTMLContentHandler.XHTML, HREF, HREF, CDATA, pictName);
}
handler.startElement(XHTMLContentHandler.XHTML, IMG, IMG, attrs);
handler.endElement(XHTMLContentHandler.XHTML, IMG, IMG);
handleEmbedded();
} else if (BIN_DATA.equals(localName)) {
inBin = false;
boolean success = false;
try {
rawBytes = base64.decode(buffer.toString());
success = true;
} catch (IllegalArgumentException e) {
EmbeddedDocumentUtil.recordEmbeddedStreamException(e, parentMetadata);
} finally {
//reset
buffer.setLength(0);
}
if (success && !inPict) {
handleEmbedded();
}
}
}
private void handleEmbedded() throws SAXException {
if (rawBytes != null) {
try (TikaInputStream is = TikaInputStream.get(rawBytes)) {
Metadata metadata = new Metadata();
if (pictName != null) {
metadata.set(TikaCoreProperties.RESOURCE_NAME_KEY, pictName);
}
if (pictSource != null) {
metadata.set(TikaCoreProperties.ORIGINAL_RESOURCE_NAME, pictSource);
}
if (embeddedDocumentExtractor.shouldParseEmbedded(metadata)) {
embeddedDocumentExtractor.parseEmbedded(is, handler, metadata, false);
}
} catch (IOException e) {
//log
}
}
//reset
pictName = null;
pictSource = null;
rawBytes = null;
}
}
}