blob: c506e2c68e6126b6e56b16862f2073d033aa6502 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.tika.sax.boilerpipe;
import java.io.Writer;
import java.util.ArrayList;
import java.util.BitSet;
import java.util.HashSet;
import java.util.List;
import java.util.Locale;
import java.util.Set;
import de.l3s.boilerpipe.BoilerpipeExtractor;
import de.l3s.boilerpipe.BoilerpipeProcessingException;
import de.l3s.boilerpipe.document.TextBlock;
import de.l3s.boilerpipe.document.TextDocument;
import de.l3s.boilerpipe.extractors.ArticleExtractor;
import de.l3s.boilerpipe.extractors.DefaultExtractor;
import de.l3s.boilerpipe.sax.BoilerpipeHTMLContentHandler;
import org.xml.sax.Attributes;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.AttributesImpl;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.sax.WriteOutContentHandler;
import org.apache.tika.sax.XHTMLContentHandler;
/**
 * Uses the <a href="http://code.google.com/p/boilerpipe/">boilerpipe</a>
 * library to automatically extract the main content from a web page.
 * <p>
 * Use this as a {@link ContentHandler} object passed to
 * {@link HtmlParser#parse(java.io.InputStream, ContentHandler, Metadata,
 * org.apache.tika.parser.ParseContext)}
 * <p>
 * Every SAX event is first handed to the boilerpipe superclass. Events that arrive
 * up to and including the end of the {@code head} element are additionally forwarded
 * to the delegate as they arrive. Body content is held back until
 * {@link #endDocument()}, where the extractor is run and only the text that it
 * classified as content is sent to the delegate.
 */
public class BoilerpipeContentHandler extends BoilerpipeHTMLContentHandler {

    /**
     * The newline character that gets inserted after block elements.
     */
    private static final char[] NL = new char[]{'\n'};

    /**
     * Single-character runs that are always replayed in markup mode, even when
     * boilerpipe did not flag them as content (see TIKA-2683).
     */
    private static final Set<Character> ALLOWABLE_CHARS = new HashSet<>();

    static {
        ALLOWABLE_CHARS.add(' ');
        ALLOWABLE_CHARS.add('\n');
        ALLOWABLE_CHARS.add('\r');
    }

    private final ContentHandler delegate;
    private final BoilerpipeExtractor extractor;

    private boolean includeMarkup;
    /** True from startDocument() until the end of the head element has been seen. */
    private boolean inHeader;
    /** True once the end of the body element has been seen. */
    private boolean inFooter;
    /** Number of characters() calls received while {@link #inHeader} was true. */
    private int headerCharOffset;
    /** Body events recorded for replay; only allocated when markup is included. */
    private List<RecordedElement> elements;
    private TextDocument td;

    /**
     * Creates a new boilerpipe-based content extractor, using the
     * {@link DefaultExtractor} extraction rules and "delegate" as the content handler.
     *
     * @param delegate The {@link ContentHandler} object
     */
    public BoilerpipeContentHandler(ContentHandler delegate) {
        this(delegate, DefaultExtractor.INSTANCE);
    }

    /**
     * Creates a content handler that writes XHTML body character events to
     * the given writer.
     *
     * @param writer writer
     */
    public BoilerpipeContentHandler(Writer writer) {
        this(new WriteOutContentHandler(writer));
    }

    /**
     * Creates a new boilerpipe-based content extractor, using the given
     * extraction rules. The extracted main content will be passed to the
     * {@code delegate} content handler.
     *
     * @param delegate  The {@link ContentHandler} object
     * @param extractor Extraction rules to use, e.g. {@link ArticleExtractor}
     */
    public BoilerpipeContentHandler(ContentHandler delegate, BoilerpipeExtractor extractor) {
        this.td = null;
        this.delegate = delegate;
        this.extractor = extractor;
    }

    /**
     * @return whether recorded body markup is replayed to the delegate instead of
     * plain paragraphs
     */
    public boolean isIncludeMarkup() {
        return includeMarkup;
    }

    /**
     * Selects between replaying the recorded body markup ({@code true}) and emitting
     * one {@code p} element per content block ({@code false}).
     * <p>
     * The recording buffer is allocated in {@link #startDocument()} only when this
     * flag is already set, so it has to be chosen before parsing starts.
     *
     * @param includeMarkup true to replay markup around the extracted content
     */
    public void setIncludeMarkup(boolean includeMarkup) {
        this.includeMarkup = includeMarkup;
    }

    /**
     * Retrieves the built TextDocument
     *
     * @return TextDocument, or {@code null} if {@link #endDocument()} has not run yet
     */
    public TextDocument getTextDocument() {
        return td;
    }

    /**
     * Resets the per-document state and forwards the event to the delegate.
     */
    @Override
    public void startDocument() throws SAXException {
        super.startDocument();
        delegate.startDocument();
        inHeader = true;
        inFooter = false;
        headerCharOffset = 0;
        if (includeMarkup) {
            elements = new ArrayList<>();
        }
    }

    @Override
    public void startPrefixMapping(String prefix, String uri) throws SAXException {
        super.startPrefixMapping(prefix, uri);
        delegate.startPrefixMapping(prefix, uri);
    }

    /**
     * Forwards header elements immediately, drops anything after the body has
     * ended, and otherwise either records the element (markup mode) or forwards it.
     */
    @Override
    public void startElement(String uri, String localName, String qName, Attributes atts)
            throws SAXException {
        super.startElement(uri, localName, qName, atts);
        if (inHeader) {
            delegate.startElement(uri, localName, qName, atts);
        } else if (inFooter) {
            // Do nothing
        } else if (includeMarkup) {
            elements.add(new RecordedElement(uri, localName, qName, atts));
        } else {
            // This happens for the <body> element, if we're not doing markup.
            delegate.startElement(uri, localName, qName, atts);
        }
    }

    /**
     * Forwards header text immediately (counting each call), and in markup mode
     * stores a private copy of body text on the most recently recorded element.
     */
    @Override
    public void characters(char[] chars, int offset, int length) throws SAXException {
        super.characters(chars, offset, length);
        if (inHeader) {
            delegate.characters(chars, offset, length);
            // Counted per call, not per char: endDocument() uses this as the index
            // of the first body character run.
            headerCharOffset++;
        } else if (inFooter) {
            // Do nothing
        } else if (includeMarkup) {
            if (elements.isEmpty()) {
                // Text arrived before any body element was recorded. Give it a
                // holder so the run is still counted during replay instead of
                // failing on elements.get(-1).
                elements.add(new RecordedElement());
            }
            RecordedElement element = elements.get(elements.size() - 1);
            // The caller owns the buffer, so keep our own copy of the run.
            char[] characters = new char[length];
            System.arraycopy(chars, offset, characters, 0, length);
            element.getCharacters().add(characters);
        }
    }

    /**
     * Forwards header end tags (leaving header mode on {@code head}), switches to
     * footer mode on {@code body}, and in markup mode records the end tag followed
     * by a continuation holder for text that follows it.
     */
    @Override
    public void endElement(String uri, String localName, String qName) throws SAXException {
        super.endElement(uri, localName, qName);
        if (inHeader) {
            delegate.endElement(uri, localName, qName);
            inHeader = !localName.equals("head");
        } else if (inFooter) {
            // Do nothing
        } else if (localName.equals("body")) {
            inFooter = true;
        } else if (includeMarkup) {
            // Add the end element, and the continuation from the previous element
            elements.add(new RecordedElement(uri, localName, qName));
            elements.add(new RecordedElement());
        }
    }

    /**
     * Builds the {@link TextDocument}, runs the extractor over it, emits the
     * extracted content to the delegate and then closes the body, html element,
     * default prefix mapping and document on the delegate.
     *
     * @throws SAXException if the delegate fails or the extractor throws a
     *                      {@link BoilerpipeProcessingException} (kept as the cause)
     */
    @Override
    public void endDocument() throws SAXException {
        super.endDocument();
        td = toTextDocument();
        try {
            extractor.process(td);
        } catch (BoilerpipeProcessingException e) {
            throw new SAXException(e);
        }

        // At this point we have all the information we need to either emit N paragraphs
        // of plain text (if not including markup), or we have to replay our recorded
        // elements and only emit character runs that passed the boilerpipe filters.
        if (includeMarkup) {
            replayRecordedElements();
        } else {
            emitContentParagraphs();
        }

        delegate.endElement(XHTMLContentHandler.XHTML, "body", "body");
        delegate.endElement(XHTMLContentHandler.XHTML, "html", "html");
        // We defer ending any prefix mapping until here, which is why we don't pass this
        // through to the delegate in an overridden method.
        delegate.endPrefixMapping("");
        delegate.endDocument();
    }

    /**
     * Emits each content block as a {@code p} element followed by a newline.
     */
    private void emitContentParagraphs() throws SAXException {
        Attributes emptyAttrs = new AttributesImpl();
        for (TextBlock block : td.getTextBlocks()) {
            if (block.isContent()) {
                delegate.startElement(XHTMLContentHandler.XHTML, "p", "p", emptyAttrs);
                char[] chars = block.getText().toCharArray();
                delegate.characters(chars, 0, chars.length);
                delegate.endElement(XHTMLContentHandler.XHTML, "p", "p");
                delegate.ignorableWhitespace(NL, 0, NL.length);
            }
        }
    }

    /**
     * Replays the recorded body events, emitting only character runs that belong
     * to a content block.
     */
    private void replayRecordedElements() throws SAXException {
        BitSet validCharacterRuns = collectValidCharacterRuns();
        // Now have bits set for all valid character runs. Replay our recorded elements,
        // but only emit character runs flagged as valid.
        int curCharsIndex = headerCharOffset;
        for (RecordedElement element : elements) {
            switch (element.getElementType()) {
                case START:
                    delegate.startElement(element.getUri(), element.getLocalName(),
                            element.getQName(), element.getAttrs());
                    // Fall through
                case CONTINUE:
                    curCharsIndex = replayCharacters(element, validCharacterRuns, curCharsIndex);
                    break;
                case END:
                    delegate.endElement(element.getUri(), element.getLocalName(),
                            element.getQName());
                    break;
                default:
                    throw new IllegalStateException(
                            "Unhandled element type: " + element.getElementType());
            }
        }
    }

    /**
     * Unions the contained-text-element bits of every content block.
     */
    private BitSet collectValidCharacterRuns() {
        BitSet validCharacterRuns = new BitSet();
        for (TextBlock block : td.getTextBlocks()) {
            if (block.isContent()) {
                BitSet bs = block.getContainedTextElements();
                if (bs != null) {
                    validCharacterRuns.or(bs);
                }
            }
        }
        return validCharacterRuns;
    }

    /**
     * Emits the valid character runs held by one recorded element.
     *
     * @param element            recorded START or CONTINUE element
     * @param validCharacterRuns bits of the runs that belong to content blocks
     * @param curCharsIndex      index of the last run handled before this element
     * @return index of the last run handled after this element
     */
    private int replayCharacters(RecordedElement element, BitSet validCharacterRuns,
                                 int curCharsIndex) throws SAXException {
        List<char[]> runs = element.getCharacters();
        int lastRun = runs.size() - 1;
        // Now emit characters that are valid. Note that boilerpipe pre-increments
        // the character index, so we have to follow suit.
        for (int i = 0; i <= lastRun; i++) {
            char[] chars = runs.get(i);
            curCharsIndex++;
            boolean isValidCharacterRun = validCharacterRuns.get(curCharsIndex);
            // https://issues.apache.org/jira/projects/TIKA/issues/TIKA-2683
            // Allow exempted characters to be written
            if (isValidCharacterRun ||
                    (chars.length == 1 && ALLOWABLE_CHARS.contains(chars[0]))) {
                delegate.characters(chars, 0, chars.length);
            }
            // https://issues.apache.org/jira/browse/TIKA-961
            // The length check keeps a zero-length run (which SAX permits) from
            // indexing chars[-1].
            if (isValidCharacterRun && i == lastRun && chars.length > 0 &&
                    !Character.isWhitespace(chars[chars.length - 1])) {
                // Only add whitespace for certain elements
                if (XHTMLContentHandler.ENDLINE.contains(element.getLocalName())) {
                    delegate.ignorableWhitespace(NL, 0, NL.length);
                }
            }
        }
        return curCharsIndex;
    }

    /**
     * One recorded body event: a start tag, an end tag, or a continuation holder
     * for text that follows an end tag. START and CONTINUE entries also carry the
     * character runs received while they were the most recent entry.
     */
    private static class RecordedElement {
        private final String uri;
        private final String localName;
        private final String qName;
        private final Attributes attrs;
        private final List<char[]> characters;
        private final ElementType elementType;

        /** Records a start tag. */
        public RecordedElement(String uri, String localName, String qName, Attributes attrs) {
            this(uri, localName, qName, attrs, ElementType.START);
        }

        /** Records an end tag. */
        public RecordedElement(String uri, String localName, String qName) {
            this(uri, localName, qName, null, ElementType.END);
        }

        /** Records a continuation holder with no tag of its own. */
        public RecordedElement() {
            this(null, null, null, null, ElementType.CONTINUE);
        }

        private RecordedElement(String uri, String localName, String qName, Attributes attrs,
                                RecordedElement.ElementType elementType) {
            this.uri = uri;
            this.localName = localName;
            this.qName = qName;
            // A SAX Attributes object is only guaranteed valid during the
            // startElement call that supplied it, and this element is replayed
            // much later, so keep a snapshot rather than the caller's instance.
            this.attrs = attrs == null ? null : new AttributesImpl(attrs);
            this.elementType = elementType;
            this.characters = new ArrayList<>();
        }

        @Override
        public String toString() {
            return String.format(Locale.ROOT, "<%s> of type %s", localName, elementType);
        }

        public String getUri() {
            return uri;
        }

        public String getLocalName() {
            return localName;
        }

        public String getQName() {
            return qName;
        }

        public Attributes getAttrs() {
            return attrs;
        }

        public List<char[]> getCharacters() {
            return characters;
        }

        public RecordedElement.ElementType getElementType() {
            return elementType;
        }

        public enum ElementType {
            START, END, CONTINUE
        }
    }
}