blob: f4171388de6f7437eaaf92099ec07fe220cfc6af [file] [log] [blame]
/*
* Copyright 2008-2010 Digital Enterprise Research Institute (DERI)
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.deri.any23.extractor.html;
import org.apache.commons.lang.StringUtils;
import org.deri.any23.extractor.ExtractionException;
import org.deri.any23.extractor.ExtractionResult;
import org.deri.any23.extractor.ExtractorDescription;
import org.deri.any23.extractor.ExtractorFactory;
import org.deri.any23.extractor.SimpleExtractorFactory;
import org.deri.any23.extractor.TagSoupExtractionResult;
import org.deri.any23.rdf.PopularPrefixes;
import org.deri.any23.vocab.VCARD;
import org.openrdf.model.BNode;
import org.openrdf.model.Resource;
import org.openrdf.model.URI;
import org.openrdf.model.vocabulary.RDF;
import org.w3c.dom.NamedNodeMap;
import org.w3c.dom.Node;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.List;
import static org.deri.any23.extractor.html.HTMLDocument.TextField;
/**
 * Extractor for the <a href="http://microformats.org/wiki/hcard">hCard</a>
 * microformat.
 *
 * <p>For every DOM node handed to {@link #extractEntity(Node, ExtractionResult)}
 * the extractor reads the hCard properties found below that node and attaches
 * them to a blank node typed as <code>vcard:VCard</code>. The extractor is
 * registered under the name <code>html-mf-hcard</code> and declares
 * <code>vcard</code> as its base class name.</p>
 *
 * @author Gabriele Renzi
 */
public class HCardExtractor extends EntityBasedMicroformatExtractor {

    /** Holder for the name related fields (fn, n parts, organization) of the card being processed. */
    private final HCardName name = new HCardName();

    /** Document wrapper built around the node of the card currently being extracted. */
    private HTMLDocument fragment;

    public final static ExtractorFactory<HCardExtractor> factory =
            SimpleExtractorFactory.create(
                    "html-mf-hcard",
                    PopularPrefixes.createSubset("rdf", "vcard"),
                    Arrays.asList("text/html;q=0.1", "application/xhtml+xml;q=0.1"),
                    null,
                    HCardExtractor.class
            );

    /**
     * @return the description of this extractor, which is its {@link #factory}.
     */
    public ExtractorDescription getDescription() {
        return factory;
    }

    /**
     * @return the class name marking the root element of an hCard (<code>vcard</code>).
     */
    @Override
    protected String getBaseClassName() {
        return "vcard";
    }

    /**
     * Clears the name data collected for the previous card.
     */
    @Override
    protected void resetExtractor() {
        name.reset(); // Cleanup of the HCardName content.
    }

    /**
     * Resolves the hCard inclusion mechanisms by copying the referenced nodes
     * into the DOM, so that the following property lookups can find them.
     *
     * <ul>
     *   <li>table header case: when <code>node</code> is a <code>TD</code> with a
     *       <code>headers</code> attribute, a deep clone of the element with that id
     *       is appended to <code>node</code> and the attribute is removed;</li>
     *   <li>include pattern: for every element of the document having the
     *       <code>include</code> class, a deep clone of the element referenced
     *       by the fragment identifier of its URL field is appended to it.</li>
     * </ul>
     *
     * @param document the whole document, used to look up the referenced nodes.
     * @param node the root node of the card being processed.
     */
    private void fixIncludes(HTMLDocument document, Node node) {
        final NamedNodeMap attributes = node.getAttributes();
        // header case test 32
        if (attributes != null && "TD".equals(node.getNodeName())) {
            final Node headersAttribute = attributes.getNamedItem("headers");
            if (headersAttribute != null) {
                final Node header = document.findNodeById(headersAttribute.getNodeValue());
                if (header != null) {
                    node.appendChild(header.cloneNode(true));
                    attributes.removeNamedItem("headers");
                }
            }
        }
        // include pattern, test 31
        for (Node current : document.findAll("//*[@class]")) {
            if (!DomUtils.hasClassName(current, "include")) {
                continue;
            }
            // we have to remove the field soon to avoid infinite loops
            // no null check, we know it's there or we won't be in the loop
            current.getAttributes().removeNamedItem("class");
            ArrayList<TextField> res = new ArrayList<TextField>();
            HTMLDocument.readUrlField(res, current);
            // Guard against an empty result: get(0) would throw before any null check could help.
            if (res.isEmpty()) {
                continue;
            }
            final TextField reference = res.get(0);
            if (reference == null || reference.value() == null) {
                continue;
            }
            // Only the fragment identifier (the part after '#') names the included element.
            final String includedId = StringUtils.substringAfter(reference.value(), "#");
            if (includedId.length() == 0) {
                continue; // no fragment identifier, nothing to include.
            }
            final Node included = document.findNodeById(includedId);
            if (included == null) {
                continue;
            }
            current.appendChild(included.cloneNode(true));
        }
    }

    /**
     * Extracts a single hCard rooted at the given node.
     *
     * @param node the root node of the card.
     * @param out the result collector; it is cast to {@link TagSoupExtractionResult}
     *            to register the resource root of the card.
     * @return <code>true</code> if at least one property has been found and the card
     *         has been written, <code>false</code> otherwise.
     * @throws ExtractionException if an error occurs while extracting the card properties.
     */
    @Override
    protected boolean extractEntity(Node node, ExtractionResult out) throws ExtractionException {
        this.fragment = new HTMLDocument(node);
        fixIncludes(getHTMLDocument(), node);
        final BNode card = getBlankNodeFor(node);
        boolean foundSomething = false;

        // The name related data must be read before the corresponding add* calls.
        readFn();
        readNames();
        readOrganization();

        foundSomething |= addFn(card);
        foundSomething |= addNames(card);
        foundSomething |= addOrganizationName(card);
        foundSomething |= addStringProperty("sort-string", card, VCARD.sort_string);
        foundSomething |= addUrl(card);
        foundSomething |= addEmail(card);
        foundSomething |= addPhoto(card);
        foundSomething |= addLogo(card);
        foundSomething |= addUid(card);
        foundSomething |= addClass(card);
        foundSomething |= addStringProperty("bday", card, VCARD.bday);
        foundSomething |= addStringProperty("rev", card, VCARD.rev);
        foundSomething |= addStringProperty("tz", card, VCARD.tz);
        foundSomething |= addCategory(card);
        // NOTE(review): the "card" class name is mapped to vcard:class here while
        // addClass() already handles "class" - confirm this mapping is intended.
        foundSomething |= addStringProperty("card", card, VCARD.class_);
        foundSomething |= addSubMicroformat("adr", card, VCARD.adr);
        foundSomething |= addTelephones(card);
        foundSomething |= addStringProperty("title", card, VCARD.title);
        foundSomething |= addStringProperty("role", card, VCARD.role);
        foundSomething |= addStringMultiProperty("note", card, VCARD.note);
        foundSomething |= addSubMicroformat("geo", card, VCARD.geo);

        // NOTE(review): addNames() always returns true, so this guard never fires at present.
        if (!foundSomething) {
            return false;
        }
        out.writeTriple(card, RDF.TYPE, VCARD.VCard);
        final TagSoupExtractionResult tser = (TagSoupExtractionResult) out;
        tser.addResourceRoot( DomUtils.getXPathListForNode(node), card, getDescription().getExtractorName() );
        return true;
    }

    /**
     * Adds the telephone numbers found in the card.
     *
     * <p>A telephone element either carries its number directly (optionally prefixed
     * by a scheme such as <code>tel:</code>), or splits it into <code>value</code>
     * sub elements, optionally qualified by one or more <code>type</code> sub elements.</p>
     *
     * @param card the card resource.
     * @return <code>true</code> if at least one telephone has been added.
     */
    private boolean addTelephones(Resource card) {
        boolean found = false;
        for (Node node : fragment.findAll(".//*[contains(@class,'tel')]")) {
            // The XPath above is a substring match: discard classes such as "hotel"
            // that merely contain "tel".
            if (!DomUtils.hasClassName(node, "tel")) {
                continue;
            }
            HTMLDocument telFragment = new HTMLDocument(node);
            TextField[] values = telFragment.getPluralUrlField("value");
            if (values.length == 0) {
                // no sub values
                // modem:goo fax:foo tel:bar
                final String number = extractTelValue(telFragment.getSingularUrlField("tel").value());
                if (number != null) {
                    found |= addTel(card, "tel", number);
                }
            } else {
                final String[] valuesStr = new String[values.length];
                for (int i = 0; i < values.length; i++) {
                    valuesStr[i] = values[i].value();
                }
                // The value parts are concatenated without any separator.
                final String joinedValue = StringUtils.join(valuesStr);
                HTMLDocument.TextField[] types = telFragment.getPluralTextField("type");
                if (types.length == 0) {
                    found |= addTel(card, "tel", joinedValue);
                }
                for (HTMLDocument.TextField type : types) {
                    found |= addTel(card, type.value(), joinedValue);
                }
            }
        }
        return found;
    }

    /**
     * Strips the optional scheme prefix (<code>tel:</code>, <code>fax:</code>, ...)
     * from a raw telephone value.
     *
     * @param raw the raw field value, can be <code>null</code>.
     * @return the value following the first colon, the value itself when it contains
     *         no colon, or <code>null</code> when <code>raw</code> is <code>null</code>
     *         or holds a scheme separator without any number (e.g. <code>"tel:"</code>).
     */
    private static String extractTelValue(String raw) {
        if (raw == null) {
            return null;
        }
        if (raw.indexOf(':') < 0) {
            return raw;
        }
        // String.split drops trailing empty strings: "tel:" yields one element and ":"
        // yields none, so the array length must be checked before indexing.
        final String[] typeAndValue = raw.split(":");
        return typeAndValue.length > 1 ? typeAndValue[1] : null;
    }

    /**
     * Adds a single telephone to the card, choosing the most specific property available:
     * first <code>&lt;type&gt;Tel</code>, then <code>&lt;type&gt;</code>, finally the
     * generic <code>vcard:tel</code>.
     *
     * @param card the card resource.
     * @param type the telephone type used to look up the property.
     * @param value the telephone number.
     * @return <code>true</code> if the property has been added.
     */
    private boolean addTel(Resource card, String type, String value) {
        URI tel = super.fixLink(value, "tel");
        URI composed = VCARD.getProperty(type + "Tel");
        if (composed != null) {
            return conditionallyAddResourceProperty(card, composed, tel);
        }
        URI simple = VCARD.getProperty(type);
        if (simple != null) {
            return conditionallyAddResourceProperty(card, simple, tel);
        }
        return conditionallyAddResourceProperty(card, VCARD.tel, tel);
    }

    /**
     * Links the resource to the blank node of every nested element having the given class name.
     *
     * @param className the class name of the nested microformat.
     * @param resource the subject resource.
     * @param property the property linking the resource to the nested blank nodes.
     * @return <code>true</code> if at least one nested element has been found.
     */
    private boolean addSubMicroformat(String className, Resource resource, URI property) {
        List<Node> nodes = fragment.findAllByClassName(className);
        if (nodes.isEmpty()) {
            return false;
        }
        final String extractorName = getDescription().getExtractorName();
        for (Node node : nodes) {
            addBNodeProperty(
                    extractorName,
                    node,
                    resource, property, getBlankNodeFor(node)
            );
        }
        return true;
    }

    /**
     * Adds a property expected to have a single textual value.
     *
     * @param className the class name of the field to read.
     * @param resource the subject resource.
     * @param property the property to add.
     * @return <code>true</code> if the property has been added.
     */
    private boolean addStringProperty(String className, Resource resource, URI property) {
        final HTMLDocument.TextField textField = fragment.getSingularTextField(className);
        return conditionallyAddStringProperty(
                getDescription().getExtractorName(),
                textField.source(),
                resource, property, textField.value()
        );
    }

    /**
     * Adds a property that can be associated to multiple values.
     *
     * @param className the class name of the fields to read.
     * @param resource the subject resource.
     * @param property the property to add, once per value.
     * @return <code>true</code> if the multi property has been added, <code>false</code> otherwise.
     */
    private boolean addStringMultiProperty(String className, Resource resource, URI property) {
        HTMLDocument.TextField[] fields = fragment.getPluralTextField(className);
        boolean found = false;
        final String extractorName = getDescription().getExtractorName();
        for (HTMLDocument.TextField field : fields) {
            found |= conditionallyAddStringProperty(
                    extractorName,
                    field.source(),
                    resource, property, field.value()
            );
        }
        return found;
    }

    /**
     * Adds all the <code>category</code> values of the card.
     *
     * @param card the card resource.
     * @return <code>true</code> if at least one category has been added.
     */
    private boolean addCategory(Resource card) {
        return addStringMultiProperty("category", card, VCARD.category);
    }

    /**
     * Adds the <code>uid</code> of the card, read as an URL field.
     *
     * @param card the card resource.
     * @return <code>true</code> if the uid has been added.
     */
    private boolean addUid(Resource card) {
        TextField uid = fragment.getSingularUrlField("uid");
        return conditionallyAddStringProperty(
                getDescription().getExtractorName(),
                fragment.getDocument(),
                card, VCARD.uid, uid.value()
        );
    }

    /**
     * Adds the <code>class</code> of the card, read as an URL field.
     *
     * @param card the card resource.
     * @return <code>true</code> if the class has been added.
     */
    private boolean addClass(Resource card) {
        TextField class_ = fragment.getSingularUrlField("class");
        return conditionallyAddStringProperty(
                getDescription().getExtractorName(),
                fragment.getDocument(),
                card, VCARD.class_, class_.value()
        );
    }

    /**
     * Adds every URL field having the given class name, resolved through the
     * whole HTML document, as a resource property of the card.
     *
     * @param className the class name of the URL fields to read.
     * @param card the card resource.
     * @param property the property to add, once per link.
     * @return <code>true</code> if at least one link has been added.
     * @throws ExtractionException if a link cannot be resolved.
     */
    private boolean addResolvedUrlProperties(String className, Resource card, URI property)
    throws ExtractionException {
        TextField[] links = fragment.getPluralUrlField(className);
        boolean found = false;
        for (TextField link : links) {
            found |= conditionallyAddResourceProperty(
                    card, property, getHTMLDocument().resolveURI(link.value())
            );
        }
        return found;
    }

    /**
     * Adds the <code>logo</code> links of the card.
     *
     * @param card the card resource.
     * @return <code>true</code> if at least one logo has been added.
     * @throws ExtractionException if a link cannot be resolved.
     */
    private boolean addLogo(Resource card) throws ExtractionException {
        return addResolvedUrlProperties("logo", card, VCARD.logo);
    }

    /**
     * Adds the <code>photo</code> links of the card.
     *
     * @param card the card resource.
     * @return <code>true</code> if at least one photo has been added.
     * @throws ExtractionException if a link cannot be resolved.
     */
    private boolean addPhoto(Resource card) throws ExtractionException {
        return addResolvedUrlProperties("photo", card, VCARD.photo);
    }

    /**
     * Adds the <code>email</code> of the card as a <code>mailto</code> link,
     * dropping any query part (e.g. <code>?subject=...</code>).
     *
     * @param card the card resource.
     * @return <code>true</code> if the email has been added.
     */
    private boolean addEmail(Resource card) {
        String email = dropSubject(fragment.getSingularUrlField("email").value());
        return conditionallyAddResourceProperty(
                card,
                VCARD.email,
                fixLink(email, "mailto")
        );
    }

    /**
     * Removes the query part of a mail address.
     *
     * @param mail the mail address, can be <code>null</code>.
     * @return the text preceding the first <code>?</code>, the whole input when it
     *         contains no <code>?</code>, or <code>null</code> for a <code>null</code> input.
     */
    private String dropSubject(String mail) {
        if (mail == null) {
            return null;
        }
        // indexOf is used instead of split("\\?")[0]: split returns an empty array
        // for an input made only of '?' and indexing it would throw.
        final int queryStart = mail.indexOf('?');
        return queryStart < 0 ? mail : mail.substring(0, queryStart);
    }

    /**
     * Collects the non empty values of every {@link HCardName#FIELDS name field}
     * found in the card into {@link #name}.
     */
    private void readNames() {
        for (String field : HCardName.FIELDS) {
            HTMLDocument.TextField[] values = fragment.getPluralTextField(field);
            for (HTMLDocument.TextField text : values) {
                if ("".equals(text.value())) {
                    continue;
                }
                name.setField(field, text);
            }
        }
    }

    /**
     * Adds a name field as a literal property of the given name blank node.
     *
     * @param extractor the name of this extractor.
     * @param n the DOM node the value originates from.
     * @param bn the blank node representing the name.
     * @param fieldName the field name, used to look up the property.
     * @param fieldValue the field value.
     */
    private void addFieldTriple(String extractor, Node n, BNode bn, String fieldName, String fieldValue) {
        conditionallyAddLiteralProperty(
                extractor, n, bn, VCARD.getProperty(fieldName), valueFactory.createLiteral(fieldValue)
        );
    }

    /**
     * Attaches a <code>vcard:Name</code> blank node to the card and fills it
     * with the name fields previously collected by {@link #readNames()}.
     *
     * @param card the card resource.
     * @return always <code>true</code>: the name node is added even when no name field is available.
     */
    private boolean addNames(Resource card) {
        BNode n = valueFactory.createBNode();
        final String extractorName = getDescription().getExtractorName();
        addBNodeProperty(
                extractorName,
                this.fragment.getDocument(),
                card, VCARD.n, n
        );
        addURIProperty(n, RDF.TYPE, VCARD.Name);

        for (String fieldName : HCardName.FIELDS) {
            if (!name.containsField(fieldName)) {
                continue;
            }
            if (name.isMultiField(fieldName)) {
                Collection<HTMLDocument.TextField> values = name.getFields(fieldName);
                for (TextField value : values) {
                    addFieldTriple(
                            extractorName,
                            value.source(),
                            n, fieldName, value.value()
                    );
                }
            } else {
                TextField value = name.getField(fieldName);
                if (value == null) {
                    continue;
                }
                addFieldTriple(
                        extractorName,
                        value.source(),
                        n, fieldName, value.value()
                );
            }
        }
        return true;
    }

    /**
     * Reads the <code>fn</code> (formatted name) field of the card into {@link #name}.
     */
    private void readFn() {
        name.setFullName(fragment.getSingularTextField("fn"));
    }

    /**
     * Adds the formatted name previously read by {@link #readFn()}.
     *
     * @param card the card resource.
     * @return <code>true</code> if the formatted name has been added.
     */
    private boolean addFn(Resource card) {
        final TextField fullNameTextField = name.getFullName();
        return conditionallyAddStringProperty(
                getDescription().getExtractorName(),
                fullNameTextField.source(),
                card, VCARD.fn, fullNameTextField.value()
        );
    }

    /**
     * Reads the organization data of the card into {@link #name}.
     *
     * <p>When the <code>org</code> node provides a text, that text is the organization name,
     * otherwise the <code>organization-name</code> sub field is used. The
     * <code>organization-unit</code> sub field is read in both cases.
     * Nothing is set when the card has no <code>org</code> node.</p>
     */
    private void readOrganization() {
        Node node = fragment.findMicroformattedObjectNode("*", "org");
        if (node == null) {
            return;
        }
        HTMLDocument doc = new HTMLDocument(node);
        String nodeText = doc.getText();
        if (nodeText != null) {
            name.setOrganization( new HTMLDocument.TextField(nodeText, node) );
        } else {
            name.setOrganization(doc.getSingularTextField("organization-name"));
        }
        name.setOrganizationUnit(doc.getSingularTextField("organization-unit"));
    }

    /**
     * Attaches a <code>vcard:Organization</code> blank node to the card, carrying the
     * organization name and, when available, the organization unit.
     *
     * @param card the card resource.
     * @return <code>true</code> if an organization was available and its node has been added.
     */
    private boolean addOrganizationName(Resource card) {
        final TextField organizationTextField = name.getOrganization();
        if (organizationTextField == null) {
            return false;
        }
        BNode org = valueFactory.createBNode();
        final String extractorName = getDescription().getExtractorName();
        addBNodeProperty(
                extractorName,
                this.fragment.getDocument(),
                card, VCARD.org, org
        );
        addURIProperty(org, RDF.TYPE, VCARD.Organization);
        conditionallyAddLiteralProperty(
                extractorName,
                organizationTextField.source(),
                org, VCARD.organization_name, valueFactory.createLiteral( organizationTextField.value() )
        );
        final TextField organizationUnitTextField = name.getOrganizationUnit();
        if (organizationUnitTextField != null) {
            conditionallyAddStringProperty(
                    extractorName,
                    organizationUnitTextField.source(),
                    org, VCARD.organization_unit, organizationUnitTextField.value()
            );
        }
        return true;
    }

    /**
     * Adds the <code>url</code> links of the card.
     *
     * @param card the card resource.
     * @return <code>true</code> if at least one url has been added.
     * @throws ExtractionException if a link cannot be resolved.
     */
    private boolean addUrl(Resource card) throws ExtractionException {
        return addResolvedUrlProperties("url", card, VCARD.url);
    }

}