tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-commons/src/main/java/org/apache/tika/sax/boilerpipe/BoilerpipeContentHandler.java - tika - Git at Google

 /*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
  * The ASF licenses this file to You under the Apache License, Version 2.0
  * (the "License"); you may not use this file except in compliance with
  * the License.  You may obtain a copy of the License at
  *
  *     http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
 package org.apache.tika.sax.boilerpipe;

 import java.io.Writer;
 import java.util.ArrayList;
 import java.util.BitSet;
 import java.util.HashSet;
 import java.util.List;
 import java.util.Locale;
 import java.util.Set;

 import de.l3s.boilerpipe.BoilerpipeExtractor;
 import de.l3s.boilerpipe.BoilerpipeProcessingException;
 import de.l3s.boilerpipe.document.TextBlock;
 import de.l3s.boilerpipe.document.TextDocument;
 import de.l3s.boilerpipe.extractors.ArticleExtractor;
 import de.l3s.boilerpipe.extractors.DefaultExtractor;
 import de.l3s.boilerpipe.sax.BoilerpipeHTMLContentHandler;
 import org.xml.sax.Attributes;
 import org.xml.sax.ContentHandler;
 import org.xml.sax.SAXException;
 import org.xml.sax.helpers.AttributesImpl;

 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.sax.WriteOutContentHandler;
 import org.apache.tika.sax.XHTMLContentHandler;

 /**
  * Uses the <a href="http://code.google.com/p/boilerpipe/">boilerpipe</a>
  * library to automatically extract the main content from a web page.
  * <p>
  * Use this as a {@link ContentHandler} object passed to
  * {@link HtmlParser#parse(java.io.InputStream, ContentHandler, Metadata,
  * org.apache.tika.parser.ParseContext)}
  * <p>
  * Events up to and including the end tag of {@code head} are forwarded to the delegate
  * as they arrive. Body content is held back until {@link #endDocument()}, where it is
  * emitted either as plain {@code p} paragraphs or, when
  * {@link #setIncludeMarkup(boolean)} is enabled, by replaying the recorded body
  * elements together with the character runs the extractor flagged as content.
  */
 public class BoilerpipeContentHandler extends BoilerpipeHTMLContentHandler {
     /**
      * The newline character that gets inserted after block elements.
      */
     private static final char[] NL = new char[]{'\n'};

     /**
      * Single-character runs that are always written in markup mode, even when the
      * extractor did not flag them as content (TIKA-2683).
      */
     private static final Set<Character> ALLOWABLE_CHARS = new HashSet<>();

     static {
         ALLOWABLE_CHARS.add(' ');
         ALLOWABLE_CHARS.add('\n');
         ALLOWABLE_CHARS.add('\r');
     }

     private final ContentHandler delegate;
     private final BoilerpipeExtractor extractor;
     private boolean includeMarkup;
     // True from startDocument() until the end tag of <head> has been forwarded.
     private boolean inHeader;
     // True once the end tag of <body> has been seen; later events are neither
     // forwarded nor recorded.
     private boolean inFooter;
     // Number of characters() events received while in the header; the replay in
     // endDocument() starts counting character runs from this offset.
     private int headerCharOffset;
     // Body events recorded for replay; only allocated when includeMarkup is set.
     private List<RecordedElement> elements;
     private TextDocument td;

     /**
      * Creates a new boilerpipe-based content extractor, using the
      * {@link DefaultExtractor} extraction rules and "delegate" as the content handler.
      *
      * @param delegate The {@link ContentHandler} object
      */
     public BoilerpipeContentHandler(ContentHandler delegate) {
         this(delegate, DefaultExtractor.INSTANCE);
     }

     /**
      * Creates a content handler that writes XHTML body character events to
      * the given writer.
      *
      * @param writer writer
      */
     public BoilerpipeContentHandler(Writer writer) {
         this(new WriteOutContentHandler(writer));
     }

     /**
      * Creates a new boilerpipe-based content extractor, using the given
      * extraction rules. The extracted main content will be passed to the
      * {@code delegate} content handler.
      *
      * @param delegate  The {@link ContentHandler} object
      * @param extractor Extraction rules to use, e.g. {@link ArticleExtractor}
      */
     public BoilerpipeContentHandler(ContentHandler delegate, BoilerpipeExtractor extractor) {
         this.td = null;
         this.delegate = delegate;
         this.extractor = extractor;
     }

     /**
      * @return whether recorded body markup is replayed to the delegate
      */
     public boolean isIncludeMarkup() {
         return includeMarkup;
     }

     /**
      * Selects whether the body markup is replayed to the delegate ({@code true}) or
      * the content is emitted as plain paragraphs ({@code false}). The recording buffer
      * is only allocated in {@link #startDocument()}, so set this before parsing starts.
      *
      * @param includeMarkup true to replay recorded body elements
      */
     public void setIncludeMarkup(boolean includeMarkup) {
         this.includeMarkup = includeMarkup;
     }

     /**
      * Retrieves the built TextDocument
      *
      * @return TextDocument, or {@code null} if {@link #endDocument()} has not run yet
      */
     public TextDocument getTextDocument() {
         return td;
     }

     /**
      * Forwards the event to the delegate and resets the per-document state.
      */
     @Override
     public void startDocument() throws SAXException {
         super.startDocument();

         delegate.startDocument();

         inHeader = true;
         inFooter = false;
         headerCharOffset = 0;

         if (includeMarkup) {
             elements = new ArrayList<>();
         }
     }

     @Override
     public void startPrefixMapping(String prefix, String uri) throws SAXException {
         super.startPrefixMapping(prefix, uri);
         delegate.startPrefixMapping(prefix, uri);
     }

     /**
      * Forwards header elements immediately; body elements are recorded for replay in
      * markup mode and forwarded directly otherwise.
      */
     @Override
     public void startElement(String uri, String localName, String qName, Attributes atts)
             throws SAXException {
         super.startElement(uri, localName, qName, atts);

         if (inHeader) {
             delegate.startElement(uri, localName, qName, atts);
         } else if (inFooter) {
             // Do nothing
         } else if (includeMarkup) {
             // RecordedElement snapshots the attributes, so the replay does not depend
             // on the caller keeping this Attributes instance unchanged.
             elements.add(new RecordedElement(uri, localName, qName, atts));
         } else {
             // This happens for the <body> element, if we're not doing markup.
             // NOTE(review): this branch is also taken for every element inside the
             // body, while endElement() forwards none of their end tags in this mode;
             // confirm whether delegates are meant to see those unbalanced start events.
             delegate.startElement(uri, localName, qName, atts);
         }
     }

     /**
      * Forwards header text immediately and, in markup mode, stores a copy of each body
      * character run on the most recently recorded element.
      */
     @Override
     public void characters(char[] chars, int offset, int length) throws SAXException {
         super.characters(chars, offset, length);

         if (inHeader) {
             delegate.characters(chars, offset, length);
             headerCharOffset++;
         } else if (inFooter) {
             // Do nothing
         } else if (includeMarkup) {
             if (elements.isEmpty()) {
                 // Text arrived before any body element was recorded; give it a
                 // holder instead of failing on an empty list.
                 elements.add(new RecordedElement());
             }
             RecordedElement element = elements.get(elements.size() - 1);

             // Copy the run: the caller's buffer is only borrowed for this call.
             char[] characters = new char[length];
             System.arraycopy(chars, offset, characters, 0, length);
             element.getCharacters().add(characters);
         }
     }

     /**
      * Forwards header end tags, marks the footer once {@code body} ends and, in markup
      * mode, records the end tag followed by a holder for any text that comes after it.
      */
     @Override
     public void endElement(String uri, String localName, String qName) throws SAXException {
         super.endElement(uri, localName, qName);

         if (inHeader) {
             delegate.endElement(uri, localName, qName);
             inHeader = !localName.equals("head");
         } else if (inFooter) {
             // Do nothing
         } else if (localName.equals("body")) {
             inFooter = true;
         } else if (includeMarkup) {
             // Add the end element, and the continuation from the previous element
             elements.add(new RecordedElement(uri, localName, qName));
             elements.add(new RecordedElement());
         }
     }

     /**
      * Runs the extractor over the collected document and emits the surviving content
      * to the delegate, then closes {@code body}, {@code html} and the document.
      *
      * @throws SAXException if the extractor fails or the delegate rejects an event
      */
     @Override
     public void endDocument() throws SAXException {
         super.endDocument();

         td = toTextDocument();
         try {
             extractor.process(td);
         } catch (BoilerpipeProcessingException e) {
             throw new SAXException(e);
         }

         // At this point we have all the information we need to either emit N paragraphs
         // of plain text (if not including markup), or we have to replay our recorded
         // elements and only emit character runs that passed the boilerpipe filters.
         if (includeMarkup) {
             replayRecordedElements(collectValidCharacterRuns());
         } else {
             emitContentParagraphs();
         }

         delegate.endElement(XHTMLContentHandler.XHTML, "body", "body");
         delegate.endElement(XHTMLContentHandler.XHTML, "html", "html");

         // We defer ending any prefix mapping until here, which is why we don't pass this
         // through to the delegate in an overridden method.
         delegate.endPrefixMapping("");

         delegate.endDocument();
     }

     /**
      * Builds the union of the text element indices of all blocks flagged as content.
      */
     private BitSet collectValidCharacterRuns() {
         BitSet validCharacterRuns = new BitSet();
         for (TextBlock block : td.getTextBlocks()) {
             if (block.isContent()) {
                 BitSet bs = block.getContainedTextElements();
                 if (bs != null) {
                     validCharacterRuns.or(bs);
                 }
             }
         }
         return validCharacterRuns;
     }

     /**
      * Replays the recorded body events, emitting only character runs flagged as valid.
      */
     private void replayRecordedElements(BitSet validCharacterRuns) throws SAXException {
         int curCharsIndex = headerCharOffset;

         for (RecordedElement element : elements) {
             switch (element.getElementType()) {
                 case START:
                     delegate.startElement(element.getUri(), element.getLocalName(),
                             element.getQName(), element.getAttrs());
                     // Fall through

                 case CONTINUE:
                     curCharsIndex =
                             emitCharacterRuns(element, validCharacterRuns, curCharsIndex);
                     break;

                 case END:
                     delegate.endElement(element.getUri(), element.getLocalName(),
                             element.getQName());
                     break;

                 default:
                     throw new IllegalStateException(
                             "Unhandled element type: " + element.getElementType());
             }
         }
     }

     /**
      * Emits the character runs of one recorded element and returns the advanced
      * run index. Note that boilerpipe pre-increments the character index, so the
      * index is incremented before each lookup.
      */
     private int emitCharacterRuns(RecordedElement element, BitSet validCharacterRuns,
                                   int startIndex) throws SAXException {
         int curCharsIndex = startIndex;
         List<char[]> runs = element.getCharacters();
         int lastRun = runs.size() - 1;

         for (int i = 0; i <= lastRun; i++) {
             char[] chars = runs.get(i);
             curCharsIndex++;
             boolean isValidCharacterRun = validCharacterRuns.get(curCharsIndex);

             // https://issues.apache.org/jira/projects/TIKA/issues/TIKA-2683
             // Allow exempted characters to be written
             if (isValidCharacterRun ||
                     (chars.length == 1 && ALLOWABLE_CHARS.contains(chars[0]))) {
                 delegate.characters(chars, 0, chars.length);
             }

             // https://issues.apache.org/jira/browse/TIKA-961
             // The length check keeps an empty run from indexing chars[-1].
             if (isValidCharacterRun && i == lastRun && chars.length > 0 &&
                     !Character.isWhitespace(chars[chars.length - 1])) {
                 // Only add whitespace for certain elements
                 if (XHTMLContentHandler.ENDLINE.contains(element.getLocalName())) {
                     delegate.ignorableWhitespace(NL, 0, NL.length);
                 }
             }
         }
         return curCharsIndex;
     }

     /**
      * Emits every content block as a {@code p} element followed by a newline.
      */
     private void emitContentParagraphs() throws SAXException {
         Attributes emptyAttrs = new AttributesImpl();
         for (TextBlock block : td.getTextBlocks()) {
             if (block.isContent()) {
                 delegate.startElement(XHTMLContentHandler.XHTML, "p", "p", emptyAttrs);
                 char[] chars = block.getText().toCharArray();
                 delegate.characters(chars, 0, chars.length);
                 delegate.endElement(XHTMLContentHandler.XHTML, "p", "p");
                 delegate.ignorableWhitespace(NL, 0, NL.length);
             }
         }
     }

     /**
      * One recorded body event: a start tag, an end tag, or a holder (CONTINUE) for
      * text that follows an end tag. Start and CONTINUE entries carry the character
      * runs received after them.
      */
     private static class RecordedElement {
         private final String uri;
         private final String localName;
         private final String qName;
         private final Attributes attrs;
         private final List<char[]> characters;
         private final ElementType elementType;

         /**
          * Records a start tag. The attributes are copied because the recorded event
          * is replayed after the original startElement call has returned.
          */
         public RecordedElement(String uri, String localName, String qName, Attributes attrs) {
             this(uri, localName, qName, attrs == null ? null : new AttributesImpl(attrs),
                     ElementType.START);
         }

         /**
          * Records an end tag.
          */
         public RecordedElement(String uri, String localName, String qName) {
             this(uri, localName, qName, null, ElementType.END);
         }

         /**
          * Records a nameless holder for text that follows an end tag.
          */
         public RecordedElement() {
             this(null, null, null, null, ElementType.CONTINUE);
         }

         private RecordedElement(String uri, String localName, String qName, Attributes attrs,
                                 RecordedElement.ElementType elementType) {
             this.uri = uri;
             this.localName = localName;
             this.qName = qName;
             this.attrs = attrs;
             this.elementType = elementType;
             this.characters = new ArrayList<>();
         }

         @Override
         public String toString() {
             return String.format(Locale.ROOT, "<%s> of type %s", localName, elementType);
         }

         public String getUri() {
             return uri;
         }

         public String getLocalName() {
             return localName;
         }

         public String getQName() {
             return qName;
         }

         public Attributes getAttrs() {
             return attrs;
         }

         public List<char[]> getCharacters() {
             return characters;
         }

         public RecordedElement.ElementType getElementType() {
             return elementType;
         }

         public enum ElementType {
             START, END, CONTINUE
         }
     }
 }
	/*
	* Licensed to the Apache Software Foundation (ASF) under one or more
	* contributor license agreements. See the NOTICE file distributed with
	* this work for additional information regarding copyright ownership.
	* The ASF licenses this file to You under the Apache License, Version 2.0
	* (the "License"); you may not use this file except in compliance with
	* the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/
	package org.apache.tika.sax.boilerpipe;

	import java.io.Writer;
	import java.util.ArrayList;
	import java.util.BitSet;
	import java.util.HashSet;
	import java.util.List;
	import java.util.Locale;
	import java.util.Set;

	import de.l3s.boilerpipe.BoilerpipeExtractor;
	import de.l3s.boilerpipe.BoilerpipeProcessingException;
	import de.l3s.boilerpipe.document.TextBlock;
	import de.l3s.boilerpipe.document.TextDocument;
	import de.l3s.boilerpipe.extractors.ArticleExtractor;
	import de.l3s.boilerpipe.extractors.DefaultExtractor;
	import de.l3s.boilerpipe.sax.BoilerpipeHTMLContentHandler;
	import org.xml.sax.Attributes;
	import org.xml.sax.ContentHandler;
	import org.xml.sax.SAXException;
	import org.xml.sax.helpers.AttributesImpl;

	import org.apache.tika.metadata.Metadata;
	import org.apache.tika.sax.WriteOutContentHandler;
	import org.apache.tika.sax.XHTMLContentHandler;

	/**
	* Uses the <a href="http://code.google.com/p/boilerpipe/">boilerpipe</a>
	* library to automatically extract the main content from a web page.
	* <p/>
	* Use this as a {@link ContentHandler} object passed to
	* {@link HtmlParser#parse(java.io.InputStream, ContentHandler, Metadata,
	* org.apache.tika.parser.ParseContext)}
	*/
	public class BoilerpipeContentHandler extends BoilerpipeHTMLContentHandler {
	/**
	* The newline character that gets inserted after block elements.
	*/
	private static final char[] NL = new char[]{'\n'};
	private static Set<Character> ALLOWABLE_CHARS;

	static {
	ALLOWABLE_CHARS = new HashSet<>();
	ALLOWABLE_CHARS.add(' ');
	ALLOWABLE_CHARS.add('\n');
	ALLOWABLE_CHARS.add('\r');
	}

	private ContentHandler delegate;
	private BoilerpipeExtractor extractor;
	private boolean includeMarkup;
	private boolean inHeader;
	private boolean inFooter;
	private int headerCharOffset;
	private List<RecordedElement> elements;
	private TextDocument td;

	/**
	* Creates a new boilerpipe-based content extractor, using the
	* {@link DefaultExtractor} extraction rules and "delegate" as the content handler.
	*
	* @param delegate The {@link ContentHandler} object
	*/
	public BoilerpipeContentHandler(ContentHandler delegate) {
	this(delegate, DefaultExtractor.INSTANCE);
	}

	/**
	* Creates a content handler that writes XHTML body character events to
	* the given writer.
	*
	* @param writer writer
	*/
	public BoilerpipeContentHandler(Writer writer) {
	this(new WriteOutContentHandler(writer));
	}

	/**
	* Creates a new boilerpipe-based content extractor, using the given
	* extraction rules. The extracted main content will be passed to the
	* <delegate> content handler.
	*
	* @param delegate The {@link ContentHandler} object
	* @param extractor Extraction rules to use, e.g. {@link ArticleExtractor}
	*/
	public BoilerpipeContentHandler(ContentHandler delegate, BoilerpipeExtractor extractor) {
	this.td = null;
	this.delegate = delegate;
	this.extractor = extractor;
	}

	public boolean isIncludeMarkup() {
	return includeMarkup;
	}

	public void setIncludeMarkup(boolean includeMarkup) {
	this.includeMarkup = includeMarkup;
	}

	/**
	* Retrieves the built TextDocument
	*
	* @return TextDocument
	*/
	public TextDocument getTextDocument() {
	return td;
	}

	@Override
	public void startDocument() throws SAXException {
	super.startDocument();

	delegate.startDocument();

	inHeader = true;
	inFooter = false;
	headerCharOffset = 0;

	if (includeMarkup) {
	elements = new ArrayList<>();
	}
	}

	@Override
	public void startPrefixMapping(String prefix, String uri) throws SAXException {
	super.startPrefixMapping(prefix, uri);
	delegate.startPrefixMapping(prefix, uri);
	}

	;

	@Override
	public void startElement(String uri, String localName, String qName, Attributes atts)
	throws SAXException {
	super.startElement(uri, localName, qName, atts);

	if (inHeader) {
	delegate.startElement(uri, localName, qName, atts);
	} else if (inFooter) {
	// Do nothing
	} else if (includeMarkup) {
	elements.add(new RecordedElement(uri, localName, qName, atts));
	} else {
	// This happens for the <body> element, if we're not doing markup.
	delegate.startElement(uri, localName, qName, atts);
	}
	}

	;

	@Override
	public void characters(char[] chars, int offset, int length) throws SAXException {
	super.characters(chars, offset, length);

	if (inHeader) {
	delegate.characters(chars, offset, length);
	headerCharOffset++;
	} else if (inFooter) {
	// Do nothing
	} else if (includeMarkup) {
	RecordedElement element = elements.get(elements.size() - 1);

	char[] characters = new char[length];
	System.arraycopy(chars, offset, characters, 0, length);
	element.getCharacters().add(characters);
	}
	}

	;

	@Override
	public void endElement(String uri, String localName, String qName) throws SAXException {
	super.endElement(uri, localName, qName);

	if (inHeader) {
	delegate.endElement(uri, localName, qName);
	inHeader = !localName.equals("head");
	} else if (inFooter) {
	// Do nothing
	} else if (localName.equals("body")) {
	inFooter = true;
	} else if (includeMarkup) {
	// Add the end element, and the continuation from the previous element
	elements.add(new RecordedElement(uri, localName, qName));
	elements.add(new RecordedElement());
	}
	}

	;

	@Override
	public void endDocument() throws SAXException {
	super.endDocument();

	td = toTextDocument();
	try {
	extractor.process(td);
	} catch (BoilerpipeProcessingException e) {
	throw new SAXException(e);
	}

	Attributes emptyAttrs = new AttributesImpl();

	// At this point we have all the information we need to either emit N paragraphs
	// of plain text (if not including markup), or we have to replay our recorded elements
	// and only emit character runs that passed the boilerpipe filters.
	if (includeMarkup) {
	BitSet validCharacterRuns = new BitSet();
	for (TextBlock block : td.getTextBlocks()) {
	if (block.isContent()) {
	BitSet bs = block.getContainedTextElements();
	if (bs != null) {
	validCharacterRuns.or(bs);
	}
	}
	}

	// Now have bits set for all valid character runs. Replay our recorded elements,
	// but only emit character runs flagged as valid.
	int curCharsIndex = headerCharOffset;

	for (RecordedElement element : elements) {
	switch (element.getElementType()) {
	case START:
	delegate.startElement(element.getUri(), element.getLocalName(),
	element.getQName(), element.getAttrs());
	// Fall through

	case CONTINUE:
	// Now emit characters that are valid. Note that boilerpipe
	// pre-increments the character index, so
	// we have to follow suit.
	for (int i = 0; i < element.getCharacters().size(); i++) {
	char[] chars = element.getCharacters().get(i);
	curCharsIndex++;
	boolean isValidCharacterRun = validCharacterRuns.get(curCharsIndex);

	// https://issues.apache.org/jira/projects/TIKA/issues/TIKA-2683
	// Allow exempted characters to be written
	if (isValidCharacterRun \|\|
	(chars.length == 1 && ALLOWABLE_CHARS.contains(chars[0]))) {
	delegate.characters(chars, 0, chars.length);
	}

	// https://issues.apache.org/jira/browse/TIKA-961
	if (isValidCharacterRun && i == element.getCharacters().size() - 1 &&
	!Character.isWhitespace(chars[chars.length - 1])) {
	// Only add whitespace for certain elements
	if (XHTMLContentHandler.ENDLINE.contains(element.getLocalName())) {
	delegate.ignorableWhitespace(NL, 0, NL.length);
	}
	}
	}
	break;

	case END:
	delegate.endElement(element.getUri(), element.getLocalName(),
	element.getQName());
	break;

	default:
	throw new RuntimeException(
	"Unhandled element type: " + element.getElementType());
	}


	}
	} else {
	for (TextBlock block : td.getTextBlocks()) {
	if (block.isContent()) {
	delegate.startElement(XHTMLContentHandler.XHTML, "p", "p", emptyAttrs);
	char[] chars = block.getText().toCharArray();
	delegate.characters(chars, 0, chars.length);
	delegate.endElement(XHTMLContentHandler.XHTML, "p", "p");
	delegate.ignorableWhitespace(NL, 0, NL.length);
	}
	}
	}

	delegate.endElement(XHTMLContentHandler.XHTML, "body", "body");
	delegate.endElement(XHTMLContentHandler.XHTML, "html", "html");

	// We defer ending any prefix mapping until here, which is why we don't pass this
	// through to the delegate in an overridden method.
	delegate.endPrefixMapping("");

	delegate.endDocument();
	}

	;

	private static class RecordedElement {
	private String uri;
	private String localName;
	private String qName;
	private Attributes attrs;
	private List<char[]> characters;
	private ElementType elementType;

	public RecordedElement(String uri, String localName, String qName, Attributes attrs) {
	this(uri, localName, qName, attrs, ElementType.START);
	}

	public RecordedElement(String uri, String localName, String qName) {
	this(uri, localName, qName, null, ElementType.END);
	}

	public RecordedElement() {
	this(null, null, null, null, ElementType.CONTINUE);
	}

	protected RecordedElement(String uri, String localName, String qName, Attributes attrs,
	RecordedElement.ElementType elementType) {
	this.uri = uri;
	this.localName = localName;
	this.qName = qName;
	this.attrs = attrs;
	this.elementType = elementType;
	this.characters = new ArrayList<>();
	}

	@Override
	public String toString() {
	return String.format(Locale.ROOT, "<%s> of type %s", localName, elementType);
	}

	public String getUri() {
	return uri;
	}

	public String getLocalName() {
	return localName;
	}

	public String getQName() {
	return qName;
	}

	public Attributes getAttrs() {
	return attrs;
	}

	public List<char[]> getCharacters() {
	return characters;
	}

	public RecordedElement.ElementType getElementType() {
	return elementType;
	}

	public enum ElementType {
	START, END, CONTINUE
	}
	}
	}