core/src/main/java/org/apache/any23/extractor/html/TagSoupParser.java - any23 - Git at Google

 /*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
  * The ASF licenses this file to You under the Apache License, Version 2.0
  * (the "License"); you may not use this file except in compliance with
  * the License.  You may obtain a copy of the License at
  *
  *  http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */

 package org.apache.any23.extractor.html;

 import org.apache.any23.validator.DefaultValidator;
 import org.apache.any23.validator.Validator;
 import org.apache.any23.validator.ValidatorException;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 import org.w3c.dom.Document;

 import java.io.IOException;
 import java.io.InputStream;
 import java.net.URI;
 import java.net.URISyntaxException;
 import java.nio.charset.Charset;
 import java.nio.charset.UnsupportedCharsetException;
 import java.util.Locale;

 /**
  * <p>Parses an {@link java.io.InputStream}
  * into an <i>HTML DOM</i> tree.
  * </p>
  * <p><strong>Note:</strong> The resulting <i>DOM</i> tree will not be namespace
  * aware, and all element names will be upper case, while attributes
  * will be lower case. This is because the HTML parser
  * uses the <a href="http://xerces.apache.org/xerces2-j/dom.html">Xerces HTML DOM</a>
  * implementation, which doesn't support namespaces and forces uppercase element names. This works
  * with the <i>RDFa XSLT Converter</i> and with <i>XPath</i>, so we left it this way.</p>
  *
  * @author Richard Cyganiak (richard at cyganiak dot de)
  * @author Michele Mostarda (mostarda@fbk.eu)
  * @author Davide Palmisano (palmisano@fbk.eu)
  */

 public class TagSoupParser {

     /**
      * Name constant for element location information.
      * NOTE(review): presumably the key under which an {@link ElementLocation} is associated
      * with DOM nodes; nothing in this class reads it - confirm against the callers.
      */
     public static final String ELEMENT_LOCATION = "Element-Location";

     private static final Logger logger = LoggerFactory.getLogger(TagSoupParser.class);

     /** Stream the document is read from; handed to the parsing configuration on first use. */
     private final InputStream input;

     /** IRI of the document; passed to the parser and set as the document URI of the DOM. */
     private final String documentIRI;

     /** Charset name supplied by the caller, or <code>null</code> if none was given. */
     private final String encoding;

     /** Parsing strategy; always the default configuration. */
     private final TagSoupParsingConfiguration config;

     /** DOM produced by the first successful {@link #getDOM()} call, <code>null</code> until then. */
     private Document result = null;


     /**
      * Creates a parser without an explicit encoding.
      *
      * @param input the stream to parse
      * @param documentIRI the IRI of the document being parsed
      */
     public TagSoupParser(InputStream input, String documentIRI) {
         // A null encoding skips the charset check in the delegate constructor.
         this(input, documentIRI, null);
     }

     /**
      * Creates a parser that uses the given encoding.
      *
      * @param input the stream to parse
      * @param documentIRI the IRI of the document being parsed
      * @param encoding the charset name to use, or <code>null</code> to specify none
      * @throws UnsupportedCharsetException if <i>encoding</i> is not <code>null</code> and
      *         {@link Charset#isSupported(String)} reports it as unsupported
      * @throws java.nio.charset.IllegalCharsetNameException if <i>encoding</i> is not a legal
      *         charset name (raised by {@link Charset#isSupported(String)})
      */
     public TagSoupParser(InputStream input, String documentIRI, String encoding) {
         // Fail fast, before any field is assigned, when the charset cannot be used.
         if (encoding != null && !Charset.isSupported(encoding)) {
             throw new UnsupportedCharsetException(
                     String.format(Locale.ROOT, "Charset %s is not supported", encoding));
         }

         this.input = input;
         this.documentIRI = documentIRI;
         this.encoding = encoding;
         this.config = TagSoupParsingConfiguration.getDefault();
     }


     /**
      * Returns the DOM of the given document IRI.
      * <p>The input is parsed on the first call only; the resulting document is kept and
      * returned by later calls. The document URI is (re)set to the document IRI on every call.
      * This method performs no synchronization.</p>
      *
      * @return the <i>HTML</i> DOM.
      * @throws IOException if there is an error whilst accessing the DOM
      */
     public Document getDOM() throws IOException {
         if (result == null) {
             final long startTime = System.currentTimeMillis();
             try {
                 result = config.parse(input, documentIRI, encoding);
             } finally {
                 // Logged from finally, so the timing is reported even when parsing throws.
                 final long elapsed = System.currentTimeMillis() - startTime;
                 // Parameterized form: the message is only assembled when debug is enabled.
                 logger.debug("Parsed {} with {}, {}ms", documentIRI, config.name(), elapsed);
             }
         }
         result.setDocumentURI(documentIRI);
         return result;
     }

     /**
      * Returns the validated DOM and applies fixes on it if <i>applyFix</i>
      * is set to <code>true</code>.
      * <p>The document IRI is converted to a {@link URI} before the DOM is requested, so an
      * invalid IRI is reported without parsing the input.</p>
      *
      * @param applyFix whether to apply fixes to the DOM
      * @return a report containing the <i>HTML</i> DOM that has been validated, and fixed if
      *         <i>applyFix</i> is <code>true</code>. The report also contains the outcome
      *         returned by the validator.
      * @throws IOException if there is an error accessing the DOM
      * @throws org.apache.any23.validator.ValidatorException if the document IRI is not a valid
      *         URI or there is an error validating the DOM
      */
     public DocumentReport getValidatedDOM(boolean applyFix) throws IOException, ValidatorException {
         final URI dIRI;
         try {
             dIRI = new URI(documentIRI);
         } catch (IllegalArgumentException | URISyntaxException urise) {
             throw new ValidatorException("Error while performing validation, invalid document IRI.", urise);
         }
         final Validator validator = new DefaultValidator();
         final Document document = getDOM();
         return new DocumentReport(validator.validate(dIRI, document, applyFix), document);
     }

     /**
      * Describes a <i>DOM Element</i> location.
      * <p>Immutable holder of a begin and an end position, each expressed as a line number and
      * a column number. The constructor is private, so instances can only be created from
      * within the enclosing {@link TagSoupParser} class.</p>
      */
     public static class ElementLocation {

         private final int beginLineNumber;
         private final int beginColumnNumber;
         private final int endLineNumber;
         private final int endColumnNumber;

         private ElementLocation(
                 int beginLineNumber, int beginColumnNumber, int endLineNumber, int endColumnNumber
         ) {
             this.beginLineNumber = beginLineNumber;
             this.beginColumnNumber = beginColumnNumber;
             this.endLineNumber = endLineNumber;
             this.endColumnNumber = endColumnNumber;
         }

         /** @return the line number at which the element begins */
         public int getBeginLineNumber() {
             return beginLineNumber;
         }

         /** @return the column number at which the element begins */
         public int getBeginColumnNumber() {
             return beginColumnNumber;
         }

         /** @return the line number at which the element ends */
         public int getEndLineNumber() {
             return endLineNumber;
         }

         /** @return the column number at which the element ends */
         public int getEndColumnNumber() {
             return endColumnNumber;
         }
     }

 }
	/*
	* Licensed to the Apache Software Foundation (ASF) under one or more
	* contributor license agreements. See the NOTICE file distributed with
	* this work for additional information regarding copyright ownership.
	* The ASF licenses this file to You under the Apache License, Version 2.0
	* (the "License"); you may not use this file except in compliance with
	* the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/

	package org.apache.any23.extractor.html;

	import org.apache.any23.validator.DefaultValidator;
	import org.apache.any23.validator.Validator;
	import org.apache.any23.validator.ValidatorException;
	import org.slf4j.Logger;
	import org.slf4j.LoggerFactory;
	import org.w3c.dom.Document;

	import java.io.IOException;
	import java.io.InputStream;
	import java.net.URI;
	import java.net.URISyntaxException;
	import java.nio.charset.Charset;
	import java.nio.charset.UnsupportedCharsetException;
	import java.util.Locale;

	/**
	* <p>Parses an {@link java.io.InputStream}
	* into an <i>HTML DOM</i> tree.
	* </p>
	* <p><strong>Note:</strong> The resulting <i>DOM</i> tree will not be namespace
	* aware, and all element names will be upper case, while attributes
	* will be lower case. This is because the HTML parser
	* uses the <a href="http://xerces.apache.org/xerces2-j/dom.html">Xerces HTML DOM</a>
	* implementation, which doesn't support namespaces and forces uppercase element names. This works
	* with the <i>RDFa XSLT Converter</i> and with <i>XPath</i>, so we left it this way.</p>
	*
	* @author Richard Cyganiak (richard at cyganiak dot de)
	* @author Michele Mostarda (mostarda@fbk.eu)
	* @author Davide Palmisano (palmisano@fbk.eu)
	*/

	public class TagSoupParser {

	public static final String ELEMENT_LOCATION = "Element-Location";

	private final static Logger logger = LoggerFactory.getLogger(TagSoupParser.class);

	private final InputStream input;

	private final String documentIRI;

	private final String encoding;

	private final TagSoupParsingConfiguration config;

	private Document result = null;


	public TagSoupParser(InputStream input, String documentIRI) {
	this.input = input;
	this.documentIRI = documentIRI;
	this.encoding = null;

	config = TagSoupParsingConfiguration.getDefault();
	}

	public TagSoupParser(InputStream input, String documentIRI, String encoding) {
	if (encoding != null && !Charset.isSupported(encoding))
	throw new UnsupportedCharsetException(String.format(Locale.ROOT, "Charset %s is not supported", encoding));

	this.input = input;
	this.documentIRI = documentIRI;
	this.encoding = encoding;

	config = TagSoupParsingConfiguration.getDefault();
	}


	/**
	* Returns the DOM of the given document IRI.
	*
	* @return the <i>HTML</i> DOM.
	* @throws IOException if there is an error whilst accessing the DOM
	*/
	public Document getDOM() throws IOException {
	if (result == null) {
	long startTime = System.currentTimeMillis();
	try {
	result = config.parse(input, documentIRI, encoding);
	} finally {
	long elapsed = System.currentTimeMillis() - startTime;
	logger.debug("Parsed " + documentIRI + " with " + config.name() + ", " + elapsed + "ms");
	}
	}
	result.setDocumentURI(documentIRI);
	return result;
	}

	/**
	* Returns the validated DOM and applies fixes on it if <i>applyFix</i>
	* is set to <code>true</code>.
	*
	* @param applyFix whether to apply fixes to the DOM
	* @return a report containing the <i>HTML</i> DOM that has been validated and fixed if <i>applyFix</i>
	* if <code>true</code>. The reports contains also information about the activated rules and the
	* the detected issues.
	* @throws IOException if there is an error accessing the DOM
	* @throws org.apache.any23.validator.ValidatorException if there is an error validating the DOM
	*/
	public DocumentReport getValidatedDOM(boolean applyFix) throws IOException, ValidatorException {
	final URI dIRI;
	try {
	dIRI = new URI(documentIRI);
	} catch (IllegalArgumentException \| URISyntaxException urise) {
	throw new ValidatorException("Error while performing validation, invalid document IRI.", urise);
	}
	Validator validator = new DefaultValidator();
	Document document = getDOM();
	return new DocumentReport( validator.validate(dIRI, document, applyFix), document );
	}

	/**
	* Describes a <i>DOM Element</i> location.
	*/
	public static class ElementLocation {

	private int beginLineNumber;
	private int beginColumnNumber;
	private int endLineNumber;
	private int endColumnNumber;

	private ElementLocation(
	int beginLineNumber, int beginColumnNumber, int endLineNumber, int endColumnNumber
	) {
	this.beginLineNumber = beginLineNumber;
	this.beginColumnNumber = beginColumnNumber;
	this.endLineNumber = endLineNumber;
	this.endColumnNumber = endColumnNumber;
	}

	public int getBeginLineNumber() {
	return beginLineNumber;
	}

	public int getBeginColumnNumber() {
	return beginColumnNumber;
	}

	public int getEndLineNumber() {
	return endLineNumber;
	}

	public int getEndColumnNumber() {
	return endColumnNumber;
	}
	}

	}