| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| package org.apache.tika.sax.boilerpipe; |
| |
| import java.io.Writer; |
| import java.util.ArrayList; |
| import java.util.BitSet; |
| import java.util.HashSet; |
| import java.util.List; |
| import java.util.Locale; |
| import java.util.Set; |
| |
| import de.l3s.boilerpipe.BoilerpipeExtractor; |
| import de.l3s.boilerpipe.BoilerpipeProcessingException; |
| import de.l3s.boilerpipe.document.TextBlock; |
| import de.l3s.boilerpipe.document.TextDocument; |
| import de.l3s.boilerpipe.extractors.ArticleExtractor; |
| import de.l3s.boilerpipe.extractors.DefaultExtractor; |
| import de.l3s.boilerpipe.sax.BoilerpipeHTMLContentHandler; |
| import org.xml.sax.Attributes; |
| import org.xml.sax.ContentHandler; |
| import org.xml.sax.SAXException; |
| import org.xml.sax.helpers.AttributesImpl; |
| |
| import org.apache.tika.metadata.Metadata; |
| import org.apache.tika.sax.WriteOutContentHandler; |
| import org.apache.tika.sax.XHTMLContentHandler; |
| |
| /** |
| * Uses the <a href="http://code.google.com/p/boilerpipe/">boilerpipe</a> |
| * library to automatically extract the main content from a web page. |
| * <p/> |
| * Use this as a {@link ContentHandler} object passed to |
| * {@link HtmlParser#parse(java.io.InputStream, ContentHandler, Metadata, |
| * org.apache.tika.parser.ParseContext)} |
| */ |
| public class BoilerpipeContentHandler extends BoilerpipeHTMLContentHandler { |
| /** |
| * The newline character that gets inserted after block elements. |
| */ |
| private static final char[] NL = new char[]{'\n'}; |
| private static Set<Character> ALLOWABLE_CHARS; |
| |
| static { |
| ALLOWABLE_CHARS = new HashSet<>(); |
| ALLOWABLE_CHARS.add(' '); |
| ALLOWABLE_CHARS.add('\n'); |
| ALLOWABLE_CHARS.add('\r'); |
| } |
| |
| private ContentHandler delegate; |
| private BoilerpipeExtractor extractor; |
| private boolean includeMarkup; |
| private boolean inHeader; |
| private boolean inFooter; |
| private int headerCharOffset; |
| private List<RecordedElement> elements; |
| private TextDocument td; |
| |
| /** |
| * Creates a new boilerpipe-based content extractor, using the |
| * {@link DefaultExtractor} extraction rules and "delegate" as the content handler. |
| * |
| * @param delegate The {@link ContentHandler} object |
| */ |
| public BoilerpipeContentHandler(ContentHandler delegate) { |
| this(delegate, DefaultExtractor.INSTANCE); |
| } |
| |
| /** |
| * Creates a content handler that writes XHTML body character events to |
| * the given writer. |
| * |
| * @param writer writer |
| */ |
| public BoilerpipeContentHandler(Writer writer) { |
| this(new WriteOutContentHandler(writer)); |
| } |
| |
| /** |
| * Creates a new boilerpipe-based content extractor, using the given |
| * extraction rules. The extracted main content will be passed to the |
| * <delegate> content handler. |
| * |
| * @param delegate The {@link ContentHandler} object |
| * @param extractor Extraction rules to use, e.g. {@link ArticleExtractor} |
| */ |
| public BoilerpipeContentHandler(ContentHandler delegate, BoilerpipeExtractor extractor) { |
| this.td = null; |
| this.delegate = delegate; |
| this.extractor = extractor; |
| } |
| |
| public boolean isIncludeMarkup() { |
| return includeMarkup; |
| } |
| |
| public void setIncludeMarkup(boolean includeMarkup) { |
| this.includeMarkup = includeMarkup; |
| } |
| |
| /** |
| * Retrieves the built TextDocument |
| * |
| * @return TextDocument |
| */ |
| public TextDocument getTextDocument() { |
| return td; |
| } |
| |
| @Override |
| public void startDocument() throws SAXException { |
| super.startDocument(); |
| |
| delegate.startDocument(); |
| |
| inHeader = true; |
| inFooter = false; |
| headerCharOffset = 0; |
| |
| if (includeMarkup) { |
| elements = new ArrayList<>(); |
| } |
| } |
| |
| @Override |
| public void startPrefixMapping(String prefix, String uri) throws SAXException { |
| super.startPrefixMapping(prefix, uri); |
| delegate.startPrefixMapping(prefix, uri); |
| } |
| |
| ; |
| |
| @Override |
| public void startElement(String uri, String localName, String qName, Attributes atts) |
| throws SAXException { |
| super.startElement(uri, localName, qName, atts); |
| |
| if (inHeader) { |
| delegate.startElement(uri, localName, qName, atts); |
| } else if (inFooter) { |
| // Do nothing |
| } else if (includeMarkup) { |
| elements.add(new RecordedElement(uri, localName, qName, atts)); |
| } else { |
| // This happens for the <body> element, if we're not doing markup. |
| delegate.startElement(uri, localName, qName, atts); |
| } |
| } |
| |
| ; |
| |
| @Override |
| public void characters(char[] chars, int offset, int length) throws SAXException { |
| super.characters(chars, offset, length); |
| |
| if (inHeader) { |
| delegate.characters(chars, offset, length); |
| headerCharOffset++; |
| } else if (inFooter) { |
| // Do nothing |
| } else if (includeMarkup) { |
| RecordedElement element = elements.get(elements.size() - 1); |
| |
| char[] characters = new char[length]; |
| System.arraycopy(chars, offset, characters, 0, length); |
| element.getCharacters().add(characters); |
| } |
| } |
| |
| ; |
| |
| @Override |
| public void endElement(String uri, String localName, String qName) throws SAXException { |
| super.endElement(uri, localName, qName); |
| |
| if (inHeader) { |
| delegate.endElement(uri, localName, qName); |
| inHeader = !localName.equals("head"); |
| } else if (inFooter) { |
| // Do nothing |
| } else if (localName.equals("body")) { |
| inFooter = true; |
| } else if (includeMarkup) { |
| // Add the end element, and the continuation from the previous element |
| elements.add(new RecordedElement(uri, localName, qName)); |
| elements.add(new RecordedElement()); |
| } |
| } |
| |
| ; |
| |
| @Override |
| public void endDocument() throws SAXException { |
| super.endDocument(); |
| |
| td = toTextDocument(); |
| try { |
| extractor.process(td); |
| } catch (BoilerpipeProcessingException e) { |
| throw new SAXException(e); |
| } |
| |
| Attributes emptyAttrs = new AttributesImpl(); |
| |
| // At this point we have all the information we need to either emit N paragraphs |
| // of plain text (if not including markup), or we have to replay our recorded elements |
| // and only emit character runs that passed the boilerpipe filters. |
| if (includeMarkup) { |
| BitSet validCharacterRuns = new BitSet(); |
| for (TextBlock block : td.getTextBlocks()) { |
| if (block.isContent()) { |
| BitSet bs = block.getContainedTextElements(); |
| if (bs != null) { |
| validCharacterRuns.or(bs); |
| } |
| } |
| } |
| |
| // Now have bits set for all valid character runs. Replay our recorded elements, |
| // but only emit character runs flagged as valid. |
| int curCharsIndex = headerCharOffset; |
| |
| for (RecordedElement element : elements) { |
| switch (element.getElementType()) { |
| case START: |
| delegate.startElement(element.getUri(), element.getLocalName(), |
| element.getQName(), element.getAttrs()); |
| // Fall through |
| |
| case CONTINUE: |
| // Now emit characters that are valid. Note that boilerpipe |
| // pre-increments the character index, so |
| // we have to follow suit. |
| for (int i = 0; i < element.getCharacters().size(); i++) { |
| char[] chars = element.getCharacters().get(i); |
| curCharsIndex++; |
| boolean isValidCharacterRun = validCharacterRuns.get(curCharsIndex); |
| |
| // https://issues.apache.org/jira/projects/TIKA/issues/TIKA-2683 |
| // Allow exempted characters to be written |
| if (isValidCharacterRun || |
| (chars.length == 1 && ALLOWABLE_CHARS.contains(chars[0]))) { |
| delegate.characters(chars, 0, chars.length); |
| } |
| |
| // https://issues.apache.org/jira/browse/TIKA-961 |
| if (isValidCharacterRun && i == element.getCharacters().size() - 1 && |
| !Character.isWhitespace(chars[chars.length - 1])) { |
| // Only add whitespace for certain elements |
| if (XHTMLContentHandler.ENDLINE.contains(element.getLocalName())) { |
| delegate.ignorableWhitespace(NL, 0, NL.length); |
| } |
| } |
| } |
| break; |
| |
| case END: |
| delegate.endElement(element.getUri(), element.getLocalName(), |
| element.getQName()); |
| break; |
| |
| default: |
| throw new RuntimeException( |
| "Unhandled element type: " + element.getElementType()); |
| } |
| |
| |
| } |
| } else { |
| for (TextBlock block : td.getTextBlocks()) { |
| if (block.isContent()) { |
| delegate.startElement(XHTMLContentHandler.XHTML, "p", "p", emptyAttrs); |
| char[] chars = block.getText().toCharArray(); |
| delegate.characters(chars, 0, chars.length); |
| delegate.endElement(XHTMLContentHandler.XHTML, "p", "p"); |
| delegate.ignorableWhitespace(NL, 0, NL.length); |
| } |
| } |
| } |
| |
| delegate.endElement(XHTMLContentHandler.XHTML, "body", "body"); |
| delegate.endElement(XHTMLContentHandler.XHTML, "html", "html"); |
| |
| // We defer ending any prefix mapping until here, which is why we don't pass this |
| // through to the delegate in an overridden method. |
| delegate.endPrefixMapping(""); |
| |
| delegate.endDocument(); |
| } |
| |
| ; |
| |
| private static class RecordedElement { |
| private String uri; |
| private String localName; |
| private String qName; |
| private Attributes attrs; |
| private List<char[]> characters; |
| private ElementType elementType; |
| |
| public RecordedElement(String uri, String localName, String qName, Attributes attrs) { |
| this(uri, localName, qName, attrs, ElementType.START); |
| } |
| |
| public RecordedElement(String uri, String localName, String qName) { |
| this(uri, localName, qName, null, ElementType.END); |
| } |
| |
| public RecordedElement() { |
| this(null, null, null, null, ElementType.CONTINUE); |
| } |
| |
| protected RecordedElement(String uri, String localName, String qName, Attributes attrs, |
| RecordedElement.ElementType elementType) { |
| this.uri = uri; |
| this.localName = localName; |
| this.qName = qName; |
| this.attrs = attrs; |
| this.elementType = elementType; |
| this.characters = new ArrayList<>(); |
| } |
| |
| @Override |
| public String toString() { |
| return String.format(Locale.ROOT, "<%s> of type %s", localName, elementType); |
| } |
| |
| public String getUri() { |
| return uri; |
| } |
| |
| public String getLocalName() { |
| return localName; |
| } |
| |
| public String getQName() { |
| return qName; |
| } |
| |
| public Attributes getAttrs() { |
| return attrs; |
| } |
| |
| public List<char[]> getCharacters() { |
| return characters; |
| } |
| |
| public RecordedElement.ElementType getElementType() { |
| return elementType; |
| } |
| |
| public enum ElementType { |
| START, END, CONTINUE |
| } |
| } |
| } |