/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.tika.parser.mif;
import java.io.IOException;
import java.util.ArrayList;
import java.util.EmptyStackException;
import java.util.List;
import java.util.Stack;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.commons.lang3.StringEscapeUtils;
import org.apache.commons.lang3.StringUtils;
import org.xml.sax.Attributes;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.AttributesImpl;
import org.apache.tika.detect.AutoDetectReader;
/**
 * Helper class to parse and extract Adobe MIF files.
 * <p>
 * The input is read line by line and translated into SAX events on the supplied
 * {@link ContentHandler}: every recognised statement becomes an element named after the
 * statement, and the data of a statement that opens and closes on a single line becomes the
 * text content of that element.
 */
public class MIFExtractor {

    /** Matches a line that opens a statement; group 1 is the statement name. */
    private static final Pattern openTagPattern = Pattern.compile("<(\\S*).*");

    /**
     * Matches a statement that opens and closes on the same line. Group 2 is the statement
     * name and group 4 is the text between the first whitespace after the name and the last
     * {@code >} on the line.
     */
    private static final Pattern selfTagPattern = Pattern.compile("(<(\\S*))(\\s)(.*)(\\>).*");

    private static final String OPEN_TAG_MARKER = "<";
    private static final String CLOSE_TAG_MARKER = ">";

    /** Leading delimiter stripped from a statement value before it is emitted. */
    private static final String START_TAG_VALUE = "`";

    /** Trailing delimiter stripped from a statement value before it is emitted. */
    private static final String END_TAG_VALUE = "'";

    /**
     * Parses the file supplied through the reader and emits events to the supplied content
     * handler.
     * <p>
     * Each line falls into one of three cases:
     * <ul>
     * <li>it contains {@code <} but no {@code >}: a statement is opened and a
     * {@code startElement} event is emitted;</li>
     * <li>its trimmed form starts with {@code >}: the innermost open statement is closed and
     * an {@code endElement} event is emitted;</li>
     * <li>anything else is tried as a single-line statement and, when it matches, emitted as
     * a start/characters/end triple. Non-matching lines are ignored.</li>
     * </ul>
     * Statements still open when the input ends are closed before {@code endDocument} so that
     * every {@code startElement} has a matching {@code endElement}.
     *
     * @param reader  the reader to use.
     * @param handler the content handler to use.
     * @throws IOException  on any IO error.
     * @throws SAXException on any SAX error.
     */
    static void parse(AutoDetectReader reader, ContentHandler handler)
            throws IOException, SAXException {
        handler.startDocument();
        Stack<Tag> parents = new Stack<>();
        String line;
        while ((line = reader.readLine()) != null) {
            String trimmed = line.trim();
            if (line.contains(OPEN_TAG_MARKER) && !line.contains(CLOSE_TAG_MARKER)) {
                openTag(handler, parents, trimmed);
            } else if (trimmed.startsWith(CLOSE_TAG_MARKER)) {
                closeTag(handler, parents);
            } else {
                emitSelfContainedTag(handler, parents, trimmed);
            }
        }
        // Input ended with statements still open (e.g. a truncated file): unwind them so the
        // handler receives a balanced event stream.
        while (!parents.empty()) {
            closeTag(handler, parents);
        }
        handler.endDocument();
    }

    /**
     * Opens a new statement: pushes it on the stack of open statements and emits the
     * matching {@code startElement} event. Lines that do not match the open-tag pattern are
     * ignored.
     *
     * @param handler the content handler.
     * @param parents the stack of currently open statements, innermost on top.
     * @param trimmed the trimmed input line.
     * @throws SAXException on any SAX error.
     */
    private static void openTag(ContentHandler handler, Stack<Tag> parents, String trimmed)
            throws SAXException {
        Matcher matcher = openTagPattern.matcher(trimmed);
        if (!matcher.matches()) {
            return;
        }
        String name = matcher.group(1);
        // Always start from a fresh Tag so that a previously closed statement is never
        // renamed or re-pushed.
        Tag tag = new Tag();
        tag.setName(name);
        if (!parents.empty()) {
            tag.setParent(parents.peek());
        }
        parents.push(tag);
        Attributes attrs = new AttributesImpl();
        handler.startElement(StringUtils.EMPTY, name, name, attrs);
    }

    /**
     * Closes the innermost open statement and emits the matching {@code endElement} event.
     * <p>
     * The statement closed is always the one on top of the stack; any text following the
     * close marker on the line (such as an {@code # end of X} comment) is not consulted, so
     * bare or mis-annotated close markers still keep the event stream balanced.
     *
     * @param handler the content handler.
     * @param parents the stack of currently open statements, innermost on top.
     * @throws SAXException on any SAX error.
     */
    private static void closeTag(ContentHandler handler, Stack<Tag> parents)
            throws SAXException {
        Tag closed;
        try {
            closed = parents.pop();
        } catch (EmptyStackException ignored) {
            // A close marker without any open statement means the input is unbalanced.
            // There is nothing to end, so skip the marker and keep parsing (best effort).
            return;
        }
        if (!parents.empty()) {
            parents.peek().addChild(closed);
        }
        handler.endElement(StringUtils.EMPTY, closed.getName(), closed.getName());
    }

    /**
     * Handles a statement that opens and closes on one line: records it under the innermost
     * open statement (if any) and emits it through {@link #processTag}. Lines that do not
     * match the single-line pattern are ignored.
     *
     * @param handler the content handler.
     * @param parents the stack of currently open statements, innermost on top.
     * @param trimmed the trimmed input line.
     * @throws SAXException on any SAX error.
     */
    private static void emitSelfContainedTag(ContentHandler handler, Stack<Tag> parents,
                                             String trimmed) throws SAXException {
        Matcher matcher = selfTagPattern.matcher(trimmed);
        if (!matcher.matches()) {
            return;
        }
        Tag tag = new Tag();
        tag.setName(matcher.group(2));
        tag.setValue(matcher.group(4));
        if (!parents.empty()) {
            // Attach to the statement that is actually open, not to whichever statement
            // happened to be opened most recently.
            Tag parent = parents.peek();
            tag.setParent(parent);
            parent.addChild(tag);
        }
        processTag(handler, tag);
    }

    /**
     * Process a tag and emit events to Content Handler.
     * <p>
     * Emits {@code startElement}, the tag value as character data, then {@code endElement}.
     * A leading backtick and a trailing single quote are stripped from the value first.
     *
     * @param handler the content handler.
     * @param tag     the tag to process; its value must not be {@code null}.
     * @throws SAXException on any SAX error.
     */
    private static void processTag(ContentHandler handler, Tag tag) throws SAXException {
        Attributes attrs = new AttributesImpl();
        handler.startElement(StringUtils.EMPTY, tag.getName(), tag.getName(), attrs);
        String value = StringUtils.removeStart(tag.getValue(), START_TAG_VALUE);
        value = StringUtils.removeEnd(value, END_TAG_VALUE);
        // NOTE(review): the value is XML-escaped before being handed to characters().
        // Serialising handlers usually escape character data themselves, so this may
        // double-escape - confirm against the downstream handler before changing it.
        String content = StringEscapeUtils.escapeXml(value);
        handler.characters(content.toCharArray(), 0, content.length());
        handler.endElement(StringUtils.EMPTY, tag.getName(), tag.getName());
    }

    /**
     * Helper model class for a MIF Tag to support parsing.
     */
    private static final class Tag {

        /** Tags nested directly inside this one, in the order they were recorded. */
        private List<Tag> children = new ArrayList<>();

        /** The enclosing tag, or {@code null} when none was open. */
        private Tag parent;

        /** The statement name. */
        private String name;

        /** The raw data of a single-line statement; not set for multi-line statements. */
        private String value;

        public void addChild(Tag child) {
            children.add(child);
        }

        public List<Tag> getChildren() {
            return children;
        }

        public void setChildren(List<Tag> children) {
            this.children = children;
        }

        public String getName() {
            return name;
        }

        public void setName(String name) {
            this.name = name;
        }

        public String getValue() {
            return value;
        }

        public void setValue(String value) {
            this.value = value;
        }

        @Override
        public String toString() {
            return name + ":" + value;
        }

        public Tag getParent() {
            return parent;
        }

        public void setParent(Tag parent) {
            this.parent = parent;
        }
    }
}