/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
| package org.apache.tika.parser.pdf; |
| |
| import java.io.InputStream; |
| import java.util.HashMap; |
| import java.util.LinkedHashMap; |
| import java.util.Map; |
| import java.util.regex.Matcher; |
| import java.util.regex.Pattern; |
| import javax.xml.namespace.QName; |
| import javax.xml.stream.XMLStreamConstants; |
| import javax.xml.stream.XMLStreamException; |
| import javax.xml.stream.XMLStreamReader; |
| |
| import org.xml.sax.SAXException; |
| import org.xml.sax.helpers.AttributesImpl; |
| |
| import org.apache.tika.metadata.Metadata; |
| import org.apache.tika.parser.ParseContext; |
| import org.apache.tika.sax.XHTMLContentHandler; |
| |
/**
 * This class offers an initial capability to
 * scrape text-containing elements out of XFA, and
 * it tries to link fields with values.
 * <p>
 * Some areas for improvement:
 * <ol>
 * <li>convert this to 2 lines of XPath</li>
 * <li>handle metadata stored in the {@code <desc>} section
 * (govdocs1: 754282.pdf, 982106.pdf)</li>
 * <li>handle PDF metadata (access permissions, etc.) in the {@code <pdf>} element</li>
 * <li>extract different types of URIs as metadata</li>
 * <li>add extraction of {@code <image>} data (govdocs1: 754282.pdf)</li>
 * <li>add computation of traversal order for fields</li>
 * <li>figure out when text extracted from XFA fields is duplicative of that
 * extracted from the rest of the PDF...and do this efficiently and quickly</li>
 * <li>avoid duplication with {@code <speak>} and {@code <toolTip>} elements</li>
 * </ol>
 */
| class XFAExtractor { |
| |
| private static final Pattern XFA_TEMPLATE_ANY_VERSION = |
| Pattern.compile("^http://www.xfa.org/schema/xfa-template"); |
| private static final Pattern TEXT_PATTERN = |
| Pattern.compile("^(speak|text|contents-richtext|toolTip|exData)$"); |
| |
| private static final String XFA_DATA_NS = "http://www.xfa.org/schema/xfa-data/1.0/"; |
| |
| private static final String FIELD_LN = "field"; |
| private static final QName XFA_DATA = new QName(XFA_DATA_NS, "data"); |
| |
| private final Matcher xfaTemplateMatcher;//namespace any version |
| private final Matcher textMatcher; |
| |
| XFAExtractor() { |
| xfaTemplateMatcher = XFA_TEMPLATE_ANY_VERSION.matcher(""); |
| textMatcher = TEXT_PATTERN.matcher(""); |
| } |
| |
| void extract(InputStream xfaIs, XHTMLContentHandler xhtml, Metadata m, ParseContext context) |
| throws XMLStreamException, SAXException { |
| xhtml.startElement("div", "class", "xfa_content"); |
| |
| Map<String, String> pdfObjRToValues = new HashMap<>(); |
| |
| //for now, store and dump the fields in insertion order |
| Map<String, XFAField> namedFields = new LinkedHashMap<>(); |
| |
| //The strategy is to cache the fields in fields |
| //and cache the values in pdfObjRToValues while |
| //handling the text etc along the way. |
| // |
| //As a final step, dump the merged fields and the values. |
| |
| XMLStreamReader reader = context.getXMLInputFactory().createXMLStreamReader(xfaIs); |
| while (reader.hasNext()) { |
| switch (reader.next()) { |
| case XMLStreamConstants.START_ELEMENT: |
| QName name = reader.getName(); |
| String localName = name.getLocalPart(); |
| if (xfaTemplateMatcher.reset(name.getNamespaceURI()).find() && |
| FIELD_LN.equals(name.getLocalPart())) { |
| handleField(reader, namedFields); |
| } else if (XFA_DATA.equals(name)) { //full qname match is important! |
| loadData(reader, pdfObjRToValues); |
| } else if (textMatcher.reset(localName).find()) { |
| scrapeTextUntil(reader, xhtml, name); |
| } |
| break; |
| case XMLStreamConstants.END_ELEMENT: |
| break; |
| } |
| } |
| |
| if (namedFields.size() == 0) { |
| xhtml.endElement("div"); |
| return; |
| } |
| //now dump fields and values |
| xhtml.startElement("div", "class", "xfa_form"); |
| xhtml.startElement("ol"); |
| StringBuilder sb = new StringBuilder(); |
| for (Map.Entry<String, XFAField> e : namedFields.entrySet()) { |
| String fieldName = e.getKey(); |
| XFAField field = e.getValue(); |
| String fieldValue = pdfObjRToValues.get(fieldName); |
| AttributesImpl attrs = new AttributesImpl(); |
| attrs.addAttribute("", "fieldName", "fieldName", "CDATA", fieldName); |
| |
| String displayFieldName = |
| (field.toolTip == null || field.toolTip.trim().length() == 0) ? fieldName : |
| field.toolTip; |
| |
| sb.append(displayFieldName).append(": "); |
| if (fieldValue != null) { |
| sb.append(fieldValue); |
| } |
| |
| xhtml.startElement("li", attrs); |
| xhtml.characters(sb.toString()); |
| xhtml.endElement("li"); |
| sb.setLength(0); |
| } |
| xhtml.endElement("ol"); |
| xhtml.endElement("div"); |
| xhtml.endElement("div"); |
| } |
| |
| //try to scrape the text until the endElement |
| private void scrapeTextUntil(XMLStreamReader reader, XHTMLContentHandler xhtml, |
| QName endElement) throws XMLStreamException, SAXException { |
| StringBuilder buffer = new StringBuilder(); |
| boolean keepGoing = true; |
| while (reader.hasNext() && keepGoing) { |
| switch (reader.next()) { |
| case XMLStreamConstants.START_ELEMENT: |
| break; |
| case XMLStreamConstants.CHARACTERS: |
| int start = reader.getTextStart(); |
| int length = reader.getTextLength(); |
| buffer.append(reader.getTextCharacters(), start, length); |
| break; |
| |
| case XMLStreamConstants.CDATA: |
| start = reader.getTextStart(); |
| length = reader.getTextLength(); |
| buffer.append(reader.getTextCharacters(), start, length); |
| break; |
| |
| case (XMLStreamConstants.END_ELEMENT): |
| if (reader.getName().equals(endElement)) { |
| keepGoing = false; |
| } else if ("p".equals(reader.getName().getLocalPart())) { |
| xhtml.element("p", buffer.toString()); |
| buffer.setLength(0); |
| } |
| break; |
| } |
| } |
| String remainder = buffer.toString(); |
| if (remainder.trim().length() > 0) { |
| xhtml.element("p", remainder); |
| } |
| } |
| |
| |
| private String scrapeTextUntil(XMLStreamReader reader, QName endElement) |
| throws XMLStreamException { |
| StringBuilder buffer = new StringBuilder(); |
| boolean keepGoing = true; |
| while (reader.hasNext() && keepGoing) { |
| switch (reader.next()) { |
| case XMLStreamConstants.START_ELEMENT: |
| break; |
| case XMLStreamConstants.CHARACTERS: |
| int start = reader.getTextStart(); |
| int length = reader.getTextLength(); |
| buffer.append(reader.getTextCharacters(), start, length); |
| break; |
| |
| case XMLStreamConstants.CDATA: |
| start = reader.getTextStart(); |
| length = reader.getTextLength(); |
| buffer.append(reader.getTextCharacters(), start, length); |
| break; |
| |
| case (XMLStreamConstants.END_ELEMENT): |
| if (reader.getName().equals(endElement)) { |
| keepGoing = false; |
| } else if ("p".equals(reader.getName().getLocalPart())) { |
| buffer.append("\n"); |
| } |
| break; |
| } |
| } |
| return buffer.toString(); |
| } |
| |
| private void loadData(XMLStreamReader reader, Map<String, String> pdfObjRToValues) |
| throws XMLStreamException { |
| //reader is at the "xfa:data" element |
| //scrape the contents from the text containing nodes |
| StringBuilder buffer = new StringBuilder(); |
| while (reader.hasNext()) { |
| switch (reader.next()) { |
| case (XMLStreamConstants.START_ELEMENT): |
| break; |
| case XMLStreamConstants.CHARACTERS: |
| int start = reader.getTextStart(); |
| int length = reader.getTextLength(); |
| buffer.append(reader.getTextCharacters(), start, length); |
| break; |
| |
| case XMLStreamConstants.CDATA: |
| start = reader.getTextStart(); |
| length = reader.getTextLength(); |
| buffer.append(reader.getTextCharacters(), start, length); |
| break; |
| |
| case (XMLStreamConstants.END_ELEMENT): |
| if (buffer.length() > 0) { |
| String localName = reader.getLocalName(); |
| pdfObjRToValues.put(localName, buffer.toString()); |
| buffer.setLength(0); |
| } |
| if (XFA_DATA.equals(reader.getName())) { |
| return; |
| } |
| break; |
| |
| } |
| } |
| } |
| |
| private void handleField(XMLStreamReader reader, Map<String, XFAField> fields) |
| throws XMLStreamException { |
| //reader is set to the field element |
| String fieldName = findFirstAttributeValue(reader, "name"); |
| String pdfObjRef = ""; |
| String toolTip = ""; |
| while (reader.hasNext()) { |
| switch (reader.next()) { |
| case XMLStreamConstants.START_ELEMENT: |
| if ("toolTip".equals(reader.getName().getLocalPart())) { |
| toolTip = scrapeTextUntil(reader, reader.getName()); |
| } |
| // add checkbutton, etcif (reader.getName().equals()) |
| break; |
| case XMLStreamConstants.END_ELEMENT: |
| if (xfaTemplateMatcher.reset(reader.getName().getNamespaceURI()).find() && |
| FIELD_LN.equals(reader.getName().getLocalPart())) { |
| if (fieldName != null) { |
| fields.put(fieldName, new XFAField(fieldName, toolTip, pdfObjRef)); |
| } |
| return; |
| } |
| break; |
| case XMLStreamConstants.PROCESSING_INSTRUCTION: |
| if ("PDF_OBJR".equals(reader.getPITarget())) { |
| pdfObjRef = reader.getPIData(); |
| } |
| break; |
| |
| } |
| } |
| } |
| |
| private String findFirstAttributeValue(XMLStreamReader reader, String name) { |
| for (int i = 0; i < reader.getAttributeCount(); i++) { |
| String n = reader.getAttributeLocalName(i); |
| if (name.equals(n)) { |
| return reader.getAttributeValue(i); |
| } |
| } |
| return ""; |
| } |
| |
| static class XFAField { |
| String fieldName; |
| String toolTip; |
| String pdfObjRef; |
| String value; |
| |
| public XFAField(String fieldName, String toolTip, String pdfObjRef) { |
| this.fieldName = fieldName; |
| this.toolTip = toolTip; |
| this.pdfObjRef = pdfObjRef; |
| } |
| |
| @Override |
| public String toString() { |
| return "XFAField{" + "fieldName='" + fieldName + '\'' + ", toolTip='" + toolTip + '\'' + |
| ", pdfObjRef='" + pdfObjRef + '\'' + ", value='" + value + '\'' + '}'; |
| } |
| } |
| } |