blob: 37a8f18d6b3b51111459b975de23ccf73ac6fab9 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.any23.extractor.html;
import org.apache.any23.validator.DefaultValidator;
import org.apache.any23.validator.Validator;
import org.apache.any23.validator.ValidatorException;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.w3c.dom.Document;
import java.io.IOException;
import java.io.InputStream;
import java.net.URI;
import java.net.URISyntaxException;
import java.nio.charset.Charset;
import java.nio.charset.UnsupportedCharsetException;
import java.util.Locale;
/**
* <p>Parses an {@link java.io.InputStream}
* into an <i>HTML DOM</i> tree.
* </p>
* <p><strong>Note:</strong> The resulting <i>DOM</i> tree will not be namespace
* aware, and all element names will be upper case, while attributes
* will be lower case. This is because the HTML parser
* uses the <a href="http://xerces.apache.org/xerces2-j/dom.html">Xerces HTML DOM</a>
* implementation, which doesn't support namespaces and forces uppercase element names. This works
* with the <i>RDFa XSLT Converter</i> and with <i>XPath</i>, so we left it this way.</p>
*
* @author Richard Cyganiak (richard at cyganiak dot de)
* @author Michele Mostarda (mostarda@fbk.eu)
* @author Davide Palmisano (palmisano@fbk.eu)
*/
public class TagSoupParser {

    /**
     * Identifier for element location data.
     * NOTE(review): presumably the key under which an {@link ElementLocation}
     * is attached to DOM nodes by the parsing configuration; it is not used
     * inside this class, so confirm against callers.
     */
    public static final String ELEMENT_LOCATION = "Element-Location";

    private static final Logger logger = LoggerFactory.getLogger(TagSoupParser.class);

    /** Stream the document is read from; consumed by the first {@link #getDOM()} call. */
    private final InputStream input;

    /** IRI of the document being parsed; also set as the document URI of the DOM. */
    private final String documentIRI;

    /** Charset name handed to the parsing configuration, or <code>null</code> if none was given. */
    private final String encoding;

    /** Parsing strategy that actually builds the DOM. */
    private final TagSoupParsingConfiguration config;

    /** Cached parse result; <code>null</code> until {@link #getDOM()} has parsed successfully. */
    private Document result = null;

    /**
     * Creates a parser for the given input without an explicit encoding.
     *
     * @param input the stream to read the <i>HTML</i> document from
     * @param documentIRI the IRI of the document
     */
    public TagSoupParser(InputStream input, String documentIRI) {
        // A null encoding skips the charset check in the delegate constructor.
        this(input, documentIRI, null);
    }

    /**
     * Creates a parser for the given input using the given encoding.
     *
     * @param input the stream to read the <i>HTML</i> document from
     * @param documentIRI the IRI of the document
     * @param encoding the charset name to use, or <code>null</code> for no explicit encoding
     * @throws UnsupportedCharsetException if <i>encoding</i> is not <code>null</code> and
     *         names a charset that is not supported by this JVM
     * @throws java.nio.charset.IllegalCharsetNameException if <i>encoding</i> is not a
     *         legal charset name (raised by {@link Charset#isSupported(String)})
     */
    public TagSoupParser(InputStream input, String documentIRI, String encoding) {
        if (encoding != null && !Charset.isSupported(encoding)) {
            throw new UnsupportedCharsetException(
                    String.format(Locale.ROOT, "Charset %s is not supported", encoding));
        }
        this.input = input;
        this.documentIRI = documentIRI;
        this.encoding = encoding;
        this.config = TagSoupParsingConfiguration.getDefault();
    }

    /**
     * Returns the DOM of the given document IRI.
     *
     * <p>The input is parsed on the first call and the resulting document is cached;
     * later calls return the cached document. The document URI is set to the
     * document IRI on every call. The lazy initialization is not synchronized.</p>
     *
     * @return the <i>HTML</i> DOM.
     * @throws IOException if there is an error whilst accessing the DOM
     */
    public Document getDOM() throws IOException {
        if (result == null) {
            long startTime = System.currentTimeMillis();
            try {
                result = config.parse(input, documentIRI, encoding);
            } finally {
                // Logged in finally so the elapsed time is reported even if parsing fails.
                long elapsed = System.currentTimeMillis() - startTime;
                logger.debug("Parsed {} with {}, {}ms", documentIRI, config.name(), elapsed);
            }
        }
        result.setDocumentURI(documentIRI);
        return result;
    }

    /**
     * Returns the validated DOM and applies fixes on it if <i>applyFix</i>
     * is set to <code>true</code>.
     *
     * @param applyFix whether to apply fixes to the DOM
     * @return a report containing the <i>HTML</i> DOM that has been validated, and fixed if
     *         <i>applyFix</i> is <code>true</code>. The report also contains information about
     *         the activated rules and the detected issues.
     * @throws IOException if there is an error accessing the DOM
     * @throws org.apache.any23.validator.ValidatorException if the document IRI is
     *         <code>null</code> or not a valid URI, or if there is an error validating the DOM
     */
    public DocumentReport getValidatedDOM(boolean applyFix) throws IOException, ValidatorException {
        if (documentIRI == null) {
            // new URI(null) would escape as a raw NullPointerException; report it like
            // every other invalid IRI instead.
            throw new ValidatorException(
                    "Error while performing validation, invalid document IRI.",
                    new NullPointerException("documentIRI is null"));
        }
        final URI dIRI;
        try {
            dIRI = new URI(documentIRI);
        } catch (IllegalArgumentException | URISyntaxException urise) {
            throw new ValidatorException("Error while performing validation, invalid document IRI.", urise);
        }
        // The IRI is checked before parsing, so an invalid IRI fails without reading the input.
        Validator validator = new DefaultValidator();
        Document document = getDOM();
        return new DocumentReport(validator.validate(dIRI, document, applyFix), document);
    }

    /**
     * Describes a <i>DOM Element</i> location as a begin and an end
     * line/column pair. Instances are immutable.
     */
    public static class ElementLocation {

        private final int beginLineNumber;
        private final int beginColumnNumber;
        private final int endLineNumber;
        private final int endColumnNumber;

        // Private: instances can only be created from within TagSoupParser.
        private ElementLocation(
                int beginLineNumber, int beginColumnNumber, int endLineNumber, int endColumnNumber
        ) {
            this.beginLineNumber = beginLineNumber;
            this.beginColumnNumber = beginColumnNumber;
            this.endLineNumber = endLineNumber;
            this.endColumnNumber = endColumnNumber;
        }

        /**
         * @return the line number at which the element begins
         */
        public int getBeginLineNumber() {
            return beginLineNumber;
        }

        /**
         * @return the column number at which the element begins
         */
        public int getBeginColumnNumber() {
            return beginColumnNumber;
        }

        /**
         * @return the line number at which the element ends
         */
        public int getEndLineNumber() {
            return endLineNumber;
        }

        /**
         * @return the column number at which the element ends
         */
        public int getEndColumnNumber() {
            return endColumnNumber;
        }
    }
}