| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| package org.apache.tika.sax.xpath; |
| |
| import java.util.LinkedList; |
| |
| import org.apache.tika.sax.ContentHandlerDecorator; |
| import org.xml.sax.Attributes; |
| import org.xml.sax.ContentHandler; |
| import org.xml.sax.SAXException; |
| import org.xml.sax.helpers.AttributesImpl; |
| |
| /** |
| * Content handler decorator that only passes the elements, attributes, |
| * and text nodes that match the given XPath expression. |
| */ |
| public class MatchingContentHandler extends ContentHandlerDecorator { |
| |
| private final LinkedList<Matcher> matchers = new LinkedList<Matcher>(); |
| |
| private Matcher matcher; |
| |
| public MatchingContentHandler(ContentHandler delegate, Matcher matcher) { |
| super(delegate); |
| this.matcher = matcher; |
| } |
| |
| public void startElement( |
| String uri, String localName, String name, Attributes attributes) |
| throws SAXException { |
| matchers.addFirst(matcher); |
| matcher = matcher.descend(uri, localName); |
| |
| AttributesImpl matches = new AttributesImpl(); |
| for (int i = 0; i < attributes.getLength(); i++) { |
| String attributeURI = attributes.getURI(i); |
| String attributeName = attributes.getLocalName(i); |
| if (matcher.matchesAttribute(attributeURI, attributeName)) { |
| matches.addAttribute( |
| attributeURI, attributeName, attributes.getQName(i), |
| attributes.getType(i), attributes.getValue(i)); |
| } |
| } |
| |
| if (matcher.matchesElement() || matches.getLength() > 0) { |
| super.startElement(uri, localName, name, matches); |
| if (!matcher.matchesElement()) { |
| // Force the matcher to match the current element, so the |
| // endElement method knows to emit the correct event |
| matcher = |
| new CompositeMatcher(matcher, ElementMatcher.INSTANCE); |
| } |
| } |
| } |
| |
| public void endElement(String uri, String localName, String name) |
| throws SAXException { |
| if (matcher.matchesElement()) { |
| super.endElement(uri, localName, name); |
| } |
| // Sometimes tagsoup returns double end tags, so the stack might |
| // be empty! TODO: Remove this when the tagsoup problem is fixed. |
| if (!matchers.isEmpty()) { |
| matcher = matchers.removeFirst(); |
| } |
| } |
| |
| public void characters(char[] ch, int start, int length) |
| throws SAXException { |
| if (matcher.matchesText()) { |
| super.characters(ch, start, length); |
| } |
| } |
| |
| public void ignorableWhitespace(char[] ch, int start, int length) |
| throws SAXException { |
| if (matcher.matchesText()) { |
| super.ignorableWhitespace(ch, start, length); |
| } |
| } |
| |
| public void processingInstruction(String target, String data) { |
| // TODO: Support for matching processing instructions |
| } |
| |
| public void skippedEntity(String name) throws SAXException { |
| // TODO: Can skipped entities refer to more than text? |
| if (matcher.matchesText()) { |
| super.skippedEntity(name); |
| } |
| } |
| |
| } |