blob: 0feb7e2926c6a631cff0afbc007cd2ae16c7ba7a [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.any23.extractor.html;
import org.apache.any23.extractor.ExtractionException;
import org.apache.any23.extractor.ExtractorDescription;
import org.apache.any23.extractor.TagSoupExtractionResult;
import org.apache.any23.rdf.RDFUtils;
import org.apache.any23.vocab.ICAL;
import org.eclipse.rdf4j.model.BNode;
import org.eclipse.rdf4j.model.Resource;
import org.eclipse.rdf4j.model.IRI;
import org.eclipse.rdf4j.model.vocabulary.RDF;
import org.w3c.dom.Node;
import javax.xml.datatype.DatatypeConfigurationException;
import java.text.ParseException;
import java.util.List;
import static org.apache.any23.extractor.html.HTMLDocument.TextField;
/**
* Extractor for the <a href="http://microformats.org/wiki/hcalendar">hCalendar</a> microformat.
*
* @author Gabriele Renzi
*/
public class HCalendarExtractor extends MicroformatExtractor {
private static final ICAL vICAL = ICAL.getInstance();
private static final String[] Components = { "Vevent", "Vtodo", "Vjournal", "Vfreebusy" };
private static final String DATE_FORMAT = "yyyyMMdd'T'HHmm'Z'";
private String[] textSingularProps = { "summary", "class", "transp", "description", "status", "location" };
private String[] textDateProps = { "dtstart", "dtstamp", "dtend", };
@Override
public ExtractorDescription getDescription() {
return HCalendarExtractorFactory.getDescriptionInstance();
}
@Override
protected boolean extract() throws ExtractionException {
final HTMLDocument document = getHTMLDocument();
List<Node> calendars = document.findAllByClassName("vcalendar");
if (calendars.size() == 0)
// vcal allows to avoid top name, in which case whole document is
// the calendar, let's try
if (document.findAllByClassName("vevent").size() > 0)
calendars.add(document.getDocument());
boolean foundAny = false;
for (Node node : calendars)
foundAny |= extractCalendar(node);
return foundAny;
}
private boolean extractCalendar(Node node) throws ExtractionException {
IRI cal = getDocumentIRI();
addIRIProperty(cal, RDF.TYPE, vICAL.Vcalendar);
return addComponents(node, cal);
}
private boolean addComponents(Node node, Resource cal) throws ExtractionException {
boolean foundAny = false;
for (String component : Components) {
List<Node> events = DomUtils.findAllByClassName(node, component);
if (events.size() == 0)
continue;
for (Node evtNode : events)
foundAny |= extractComponent(evtNode, cal, component);
}
return foundAny;
}
private boolean extractComponent(Node node, Resource cal, String component) throws ExtractionException {
HTMLDocument compoNode = new HTMLDocument(node);
BNode evt = valueFactory.createBNode();
addIRIProperty(evt, RDF.TYPE, vICAL.getClass(component));
addTextProps(compoNode, evt);
addUrl(compoNode, evt);
addRRule(compoNode, evt);
addOrganizer(compoNode, evt);
addUid(compoNode, evt);
addBNodeProperty(cal, vICAL.component, evt);
final TagSoupExtractionResult tser = (TagSoupExtractionResult) getCurrentExtractionResult();
tser.addResourceRoot(compoNode.getPathToLocalRoot(), evt, this.getClass());
return true;
}
private void addUid(HTMLDocument compoNode, Resource evt) {
TextField url = compoNode.getSingularUrlField("uid");
conditionallyAddStringProperty(compoNode.getDocument(), evt, vICAL.uid, url.value());
}
private void addUrl(HTMLDocument compoNode, Resource evt) throws ExtractionException {
TextField url = compoNode.getSingularUrlField("url");
if ("".equals(url.value()))
return;
addIRIProperty(evt, vICAL.url, getHTMLDocument().resolveIRI(url.value()));
}
private void addRRule(HTMLDocument compoNode, Resource evt) {
for (Node rule : compoNode.findAllByClassName("rrule")) {
BNode rrule = valueFactory.createBNode();
addIRIProperty(rrule, RDF.TYPE, vICAL.DomainOf_rrule);
TextField freq = new HTMLDocument(rule).getSingularTextField("freq");
conditionallyAddStringProperty(freq.source(), rrule, vICAL.freq, freq.value());
addBNodeProperty(rule, evt, vICAL.rrule, rrule);
}
}
private void addOrganizer(HTMLDocument compoNode, Resource evt) {
for (Node organizer : compoNode.findAllByClassName("organizer")) {
// untyped
BNode blank = valueFactory.createBNode();
TextField mail = new HTMLDocument(organizer).getSingularUrlField("organizer");
conditionallyAddStringProperty(compoNode.getDocument(), blank, vICAL.calAddress, mail.value());
addBNodeProperty(organizer, evt, vICAL.organizer, blank);
}
}
private void addTextProps(HTMLDocument node, Resource evt) {
for (String date : textSingularProps) {
HTMLDocument.TextField val = node.getSingularTextField(date);
conditionallyAddStringProperty(val.source(), evt, vICAL.getProperty(date), val.value());
}
for (String date : textDateProps) {
HTMLDocument.TextField val = node.getSingularTextField(date);
try {
conditionallyAddStringProperty(val.source(), evt, vICAL.getProperty(date),
RDFUtils.getXSDDate(val.value(), DATE_FORMAT));
} catch (ParseException e) {
// Unparsable date format just leave it as it is.
conditionallyAddStringProperty(val.source(), evt, vICAL.getProperty(date), val.value());
} catch (DatatypeConfigurationException e) {
// Unparsable date format just leave it as it is
conditionallyAddStringProperty(val.source(), evt, vICAL.getProperty(date), val.value());
}
}
HTMLDocument.TextField[] values = node.getPluralTextField("category");
for (TextField val : values) {
conditionallyAddStringProperty(val.source(), evt, vICAL.categories, val.value());
}
}
}