blob: 97f4686e5151d8db90ce27e8154b1a87a4cdc6ca [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.any23.extractor.html;
import java.util.Locale;
import org.apache.any23.extractor.ExtractionException;
import org.apache.any23.extractor.ExtractionResult;
import org.apache.any23.extractor.ExtractorDescription;
import org.apache.any23.extractor.TagSoupExtractionResult;
import org.apache.any23.vocab.WO;
import org.eclipse.rdf4j.model.BNode;
import org.eclipse.rdf4j.model.Resource;
import org.eclipse.rdf4j.model.IRI;
import org.eclipse.rdf4j.model.vocabulary.RDF;
import org.w3c.dom.Node;
/**
* Extractor able to extract the <a href="http://microformats.org/wiki/species">Species Microformat</a>.
* The data are represented using the
* <a href="http://www.bbc.co.uk/ontologies/wildlife/2010-02-22.shtml">BBC Wildlife Ontology</a>.
*
* @see org.apache.any23.vocab.WO
* @author Davide Palmisano (dpalmisano@gmail.com)
*/
public class SpeciesExtractor extends EntityBasedMicroformatExtractor {
private static final WO vWO = WO.getInstance();
private static final String[] classes = {
"kingdom",
"phylum",
"order",
"family",
"genus",
"species",
"class",
};
/**
* Returns the description of this extractor.
*
* @return a human readable description.
*/
@Override
public ExtractorDescription getDescription() {
return SpeciesExtractorFactory.getDescriptionInstance();
}
/**
* Returns the base class name for the extractor.
*
* @return a string containing the base of the extractor.
*/
@Override
protected String getBaseClassName() {
return "biota";
}
/**
* Resets the internal status of the extractor to prepare it to a new extraction section.
*/
@Override
protected void resetExtractor() {
// empty
}
/**
* Extracts an entity from a <i>DOM</i> node.
*
* @param node the DOM node.
* @param out the extraction result collector.
* @return <code>true</code> if the extraction has produces something, <code>false</code> otherwise.
* @throws org.apache.any23.extractor.ExtractionException if there is an error during extraction
*
*/
@Override
protected boolean extractEntity(Node node, ExtractionResult out) throws ExtractionException {
BNode biota = getBlankNodeFor(node);
conditionallyAddResourceProperty(biota, RDF.TYPE, vWO.species);
final HTMLDocument fragment = new HTMLDocument(node);
addNames(fragment, biota);
addClasses(fragment, biota);
final TagSoupExtractionResult tser = (TagSoupExtractionResult) out;
tser.addResourceRoot(
DomUtils.getXPathListForNode(node),
biota,
this.getClass()
);
return true;
}
private void addNames(HTMLDocument doc, Resource biota) throws ExtractionException {
HTMLDocument.TextField binomial = doc.getSingularTextField("binomial");
conditionallyAddStringProperty(
binomial.source(), biota, vWO.scientificName, binomial.value()
);
HTMLDocument.TextField vernacular = doc.getSingularTextField("vernacular");
conditionallyAddStringProperty(
vernacular.source(), biota, vWO.speciesName, vernacular.value()
);
}
private void addClassesName(HTMLDocument doc, Resource biota) throws ExtractionException {
for (String clazz : classes) {
HTMLDocument.TextField classTextField = doc.getSingularTextField(clazz);
conditionallyAddStringProperty(
classTextField.source(), biota, resolvePropertyName(clazz), classTextField.value());
}
}
private void addClasses(HTMLDocument doc, Resource biota) throws ExtractionException {
for(String clazz : classes) {
HTMLDocument.TextField classTextField = doc.getSingularUrlField(clazz);
if(classTextField.source() != null) {
BNode classBNode = getBlankNodeFor(classTextField.source());
addBNodeProperty(biota, vWO.getProperty(clazz), classBNode);
conditionallyAddResourceProperty(classBNode, RDF.TYPE, resolveClassName(clazz));
HTMLDocument fragment = new HTMLDocument(classTextField.source());
addClassesName(fragment, classBNode);
}
}
}
private IRI resolvePropertyName(String clazz) {
return vWO.getProperty(
String.format(Locale.ROOT,
"%sName",
clazz
)
);
}
private IRI resolveClassName(String clazz) {
String upperCaseClass = clazz.substring(0, 1);
return vWO.getClass(
String.format(Locale.ROOT, "%s%s",
upperCaseClass.toUpperCase(Locale.ROOT),
clazz.substring(1)
)
);
}
}