blob: 0d8d1ecd6ec3ff5317ce398d5cfae5da888fc953 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.any23.extractor.html;
import org.apache.any23.extractor.ExtractionContext;
import org.apache.any23.extractor.ExtractionException;
import org.apache.any23.extractor.ExtractionParameters;
import org.apache.any23.extractor.ExtractionResult;
import org.apache.any23.extractor.Extractor;
import org.apache.any23.extractor.ExtractorDescription;
import org.apache.any23.rdf.RDFUtils;
import org.apache.any23.vocab.SINDICE;
import org.eclipse.rdf4j.model.IRI;
import org.eclipse.rdf4j.model.impl.SimpleValueFactory;
import org.w3c.dom.Document;
import org.w3c.dom.NamedNodeMap;
import org.w3c.dom.Node;
import java.io.IOException;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
/**
* This extractor represents the <i>HTML META</i> tag values according the
* <a href="http://www.w3.org/TR/html401/struct/global.html#h-7.4.4">HTML4 specification</a>.
*
* @author Davide Palmisano ( dpalmisano@gmail.com )
*/
public class HTMLMetaExtractor implements Extractor.TagSoupDOMExtractor {
private static final SINDICE vSINDICE = SINDICE.getInstance();
private IRI profile;
private Map<String, IRI> prefixes = new HashMap<>();
private String documentLang;
/**
* {@inheritDoc}
*/
@Override
public void run(ExtractionParameters extractionParameters, ExtractionContext extractionContext, Document in,
ExtractionResult out) throws IOException, ExtractionException {
profile = extractProfile(in);
documentLang = getDocumentLanguage(in);
extractLinkDefinedPrefixes(in);
String baseProfile = vSINDICE.NS;
if (profile != null) {
baseProfile = profile.toString();
}
final IRI documentIRI = extractionContext.getDocumentIRI();
Set<Meta> metas = extractMetaElement(in, baseProfile);
for (Meta meta : metas) {
String lang = documentLang;
if (meta.getLang() != null) {
lang = meta.getLang();
}
if (meta.isPragmaDirective) {
if (lang != null) {
out.writeTriple(documentIRI, meta.getHttpEquiv(),
SimpleValueFactory.getInstance().createLiteral(meta.getContent(), lang));
} else {
out.writeTriple(documentIRI, meta.getHttpEquiv(),
SimpleValueFactory.getInstance().createLiteral(meta.getContent()));
}
} else {
if (lang != null) {
out.writeTriple(documentIRI, meta.getName(),
SimpleValueFactory.getInstance().createLiteral(meta.getContent(), lang));
} else {
out.writeTriple(documentIRI, meta.getName(),
SimpleValueFactory.getInstance().createLiteral(meta.getContent()));
}
}
}
}
/**
* Returns the {@link Document} language if declared, <code>null</code> otherwise.
*
* @param in
* a instance of {@link Document}.
*
* @return the language declared, could be <code>null</code>.
*/
private String getDocumentLanguage(Document in) {
String lang = DomUtils.find(in, "string(/HTML/@lang)");
if ("".equals(lang)) {
return null;
}
return lang;
}
private IRI extractProfile(Document in) {
String profile = DomUtils.find(in, "string(/HTML/@profile)");
if ("".equals(profile)) {
return null;
}
return SimpleValueFactory.getInstance().createIRI(profile);
}
/**
* It extracts prefixes defined in the <i>LINK</i> meta tags.
*
* @param in
*/
private void extractLinkDefinedPrefixes(Document in) {
List<Node> linkNodes = DomUtils.findAll(in, "/HTML/HEAD/LINK");
for (Node linkNode : linkNodes) {
NamedNodeMap attributes = linkNode.getAttributes();
Node relNode = attributes.getNamedItem("rel");
String rel = relNode == null ? null : relNode.getTextContent();
Node hrefNode = attributes.getNamedItem("href");
String href = hrefNode == null ? null : hrefNode.getTextContent();
if (rel != null && href != null && RDFUtils.isAbsoluteIRI(href)) {
prefixes.put(rel, SimpleValueFactory.getInstance().createIRI(href));
}
}
}
private Set<Meta> extractMetaElement(Document in, String baseProfile) {
List<Node> metaNodes = DomUtils.findAll(in, "/HTML/HEAD/META");
Set<Meta> result = new HashSet<>();
for (Node metaNode : metaNodes) {
NamedNodeMap attributes = metaNode.getAttributes();
Node nameAttribute = attributes.getNamedItem("name");
Node httpEquivAttribute = attributes.getNamedItem("http-equiv");
Node contentAttribute = attributes.getNamedItem("content");
if (nameAttribute == null && httpEquivAttribute == null)
continue; // support HTML5 meta element nodes that do not have both name and http-equiv
if (nameAttribute != null || httpEquivAttribute != null) {
if (contentAttribute == null) {
continue;
}
}
boolean isPragmaDirective = (httpEquivAttribute != null) ? true : false;
if (isPragmaDirective) {
String httpEquiv = httpEquivAttribute.getTextContent();
String content = contentAttribute.getTextContent();
String xpath = DomUtils.getXPathForNode(metaNode);
IRI httpEquivAsIRI = getPrefixIfExists(httpEquiv);
if (httpEquivAsIRI == null) {
httpEquivAsIRI = SimpleValueFactory.getInstance().createIRI(baseProfile + httpEquiv);
}
Meta meta = new Meta(xpath, content, httpEquivAsIRI);
result.add(meta);
} else {
String name = nameAttribute.getTextContent();
String content = contentAttribute.getTextContent();
String xpath = DomUtils.getXPathForNode(metaNode);
IRI nameAsIRI = getPrefixIfExists(name);
if (nameAsIRI == null) {
nameAsIRI = SimpleValueFactory.getInstance().createIRI(baseProfile + name);
}
Meta meta = new Meta(xpath, nameAsIRI, content);
result.add(meta);
}
}
return result;
}
private IRI getPrefixIfExists(String name) {
String[] split = name.split("\\.");
if (split.length == 2 && prefixes.containsKey(split[0])) {
return SimpleValueFactory.getInstance().createIRI(prefixes.get(split[0]) + split[1]);
}
return null;
}
@Override
public ExtractorDescription getDescription() {
return HTMLMetaExtractorFactory.getDescriptionInstance();
}
private static class Meta {
private String xpath;
private IRI name;
private IRI httpEquiv;
private String lang;
private String content;
private boolean isPragmaDirective;
public Meta(String xpath, String content, IRI httpEquiv) {
this.xpath = xpath;
this.content = content;
this.httpEquiv = httpEquiv;
this.setPragmaDirective(true);
}
@SuppressWarnings("unused")
public Meta(String xpath, String content, IRI httpEquiv, String lang) {
this(xpath, content, httpEquiv);
this.lang = lang;
}
public Meta(String xpath, IRI name, String content) {
this.xpath = xpath;
this.name = name;
this.content = content;
}
@SuppressWarnings("unused")
public Meta(String xpath, IRI name, String content, String lang) {
this(xpath, name, content);
this.lang = lang;
}
private void setPragmaDirective(boolean value) {
this.isPragmaDirective = value;
}
public IRI getHttpEquiv() {
return httpEquiv;
}
public IRI getName() {
return name;
}
public String getLang() {
return lang;
}
public String getContent() {
return content;
}
@Override
public boolean equals(Object o) {
if (this == o)
return true;
if (o == null || getClass() != o.getClass())
return false;
Meta meta = (Meta) o;
if (xpath != null ? !xpath.equals(meta.xpath) : meta.xpath != null)
return false;
return true;
}
@Override
public int hashCode() {
return xpath != null ? xpath.hashCode() : 0;
}
}
}