blob: fc8d1c7205236d45a3e04142f4a81f3cb277eef0 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.any23.extractor.html;
import org.apache.any23.extractor.ExtractionException;
import org.apache.any23.extractor.ExtractionResult;
import org.apache.any23.extractor.ExtractorDescription;
import org.apache.any23.vocab.HRecipe;
import org.eclipse.rdf4j.model.BNode;
import org.eclipse.rdf4j.model.IRI;
import org.eclipse.rdf4j.model.vocabulary.RDF;
import org.w3c.dom.Node;
/**
* Extractor for the <a href="http://microformats.org/wiki/hrecipe">hRecipe</a> microformat.
*
* @author Michele Mostarda (mostarda@fbk.eu)
*/
public class HRecipeExtractor extends EntityBasedMicroformatExtractor {
private static final HRecipe vHRECIPE = HRecipe.getInstance();
@Override
public ExtractorDescription getDescription() {
return HRecipeExtractorFactory.getDescriptionInstance();
}
@Override
protected String getBaseClassName() {
return "hrecipe";
}
@Override
protected void resetExtractor() {
// Empty.
}
@Override
protected boolean extractEntity(Node node, ExtractionResult out) throws ExtractionException {
final BNode recipe = getBlankNodeFor(node);
conditionallyAddResourceProperty(recipe, RDF.TYPE, vHRECIPE.Recipe);
final HTMLDocument fragment = new HTMLDocument(node);
addFN(fragment, recipe);
addIngredients(fragment, recipe);
addYield(fragment, recipe);
addInstructions(fragment, recipe);
addDurations(fragment, recipe);
addPhoto(fragment, recipe);
addSummary(fragment, recipe);
addAuthors(fragment, recipe);
addPublished(fragment, recipe);
addNutritions(fragment, recipe);
addTags(fragment, recipe);
return true;
}
/**
* Maps a field text with a property.
*
* @param fragment
* @param recipe
* @param fieldClass
* @param property
*/
private void mapFieldWithProperty(HTMLDocument fragment, BNode recipe, String fieldClass, IRI property) {
HTMLDocument.TextField title = fragment.getSingularTextField(fieldClass);
conditionallyAddStringProperty(title.source(), recipe, property, title.value());
}
/**
* Adds the <code>fn</code> triple.
*
* @param fragment
* @param recipe
*/
private void addFN(HTMLDocument fragment, BNode recipe) {
mapFieldWithProperty(fragment, recipe, "fn", vHRECIPE.fn);
}
/**
* Adds the <code>ingredient</code> triples.
*
* @param fragment
* @param ingredient
*
* @return
*/
private BNode addIngredient(HTMLDocument fragment, HTMLDocument.TextField ingredient) {
final BNode ingredientBnode = getBlankNodeFor(ingredient.source());
addIRIProperty(ingredientBnode, RDF.TYPE, vHRECIPE.Ingredient);
conditionallyAddStringProperty(ingredient.source(), ingredientBnode, vHRECIPE.ingredientName,
HTMLDocument.readNodeContent(ingredient.source(), true));
mapFieldWithProperty(fragment, ingredientBnode, "value", vHRECIPE.ingredientQuantity);
mapFieldWithProperty(fragment, ingredientBnode, "type", vHRECIPE.ingredientQuantityType);
return ingredientBnode;
}
/**
* Adds the <code>ingredients</code>list triples.
*
* @param fragment
* @param recipe
*
* @return
*/
private void addIngredients(HTMLDocument fragment, BNode recipe) {
final HTMLDocument.TextField[] ingredients = fragment.getPluralTextField("ingredient");
for (HTMLDocument.TextField ingredient : ingredients) {
addBNodeProperty(recipe, vHRECIPE.ingredient, addIngredient(fragment, ingredient));
}
}
/**
* Adds the <code>instruction</code> triples.
*
* @param fragment
* @param recipe
*/
private void addInstructions(HTMLDocument fragment, BNode recipe) {
mapFieldWithProperty(fragment, recipe, "instructions", vHRECIPE.instructions);
}
/**
* Adds the <code>yield</code> triples.
*
* @param fragment
* @param recipe
*/
private void addYield(HTMLDocument fragment, BNode recipe) {
mapFieldWithProperty(fragment, recipe, "yield", vHRECIPE.yield);
}
/**
* Adds the <code>duration</code> triples.
*
* @param fragment
* @param duration
*
* @return
*/
// TODO: USE http://microformats.org/wiki/value-class-pattern to read correct date format.
private BNode addDuration(HTMLDocument fragment, HTMLDocument.TextField duration) {
final BNode durationBnode = getBlankNodeFor(duration.source());
addIRIProperty(durationBnode, RDF.TYPE, vHRECIPE.Duration);
conditionallyAddStringProperty(duration.source(), durationBnode, vHRECIPE.durationTime, duration.value());
mapFieldWithProperty(fragment, durationBnode, "value-title", vHRECIPE.durationTitle);
return durationBnode;
}
/**
* Adds the <code>yield</code> triples.
*
* @param fragment
* @param recipe
*/
private void addDurations(HTMLDocument fragment, BNode recipe) {
final HTMLDocument.TextField[] durations = fragment.getPluralTextField("duration");
for (HTMLDocument.TextField duration : durations) {
addBNodeProperty(recipe, vHRECIPE.duration, addDuration(fragment, duration));
}
}
/**
* Adds the <code>photo</code> triples.
*
* @param fragment
* @param recipe
*
* @throws ExtractionException
*/
private void addPhoto(HTMLDocument fragment, BNode recipe) throws ExtractionException {
final HTMLDocument.TextField[] photos = fragment.getPluralUrlField("photo");
for (HTMLDocument.TextField photo : photos) {
addIRIProperty(recipe, vHRECIPE.photo, fragment.resolveIRI(photo.value()));
}
}
/**
* Adds the <code>summary</code> triples.
*
* @param fragment
* @param recipe
*/
private void addSummary(HTMLDocument fragment, BNode recipe) {
mapFieldWithProperty(fragment, recipe, "summary", vHRECIPE.summary);
}
/**
* Adds the <code>authors</code> triples.
*
* @param fragment
* @param recipe
*/
private void addAuthors(HTMLDocument fragment, BNode recipe) {
final HTMLDocument.TextField[] authors = fragment.getPluralTextField("author");
for (HTMLDocument.TextField author : authors) {
conditionallyAddStringProperty(author.source(), recipe, vHRECIPE.author, author.value());
}
}
/**
* Adds the <code>published</code> triples.
*
* @param fragment
* @param recipe
*/
// TODO: USE http://microformats.org/wiki/value-class-pattern to read correct date format.
private void addPublished(HTMLDocument fragment, BNode recipe) {
mapFieldWithProperty(fragment, recipe, "published", vHRECIPE.published);
}
/**
* Adds the <code>nutrition</code> triples.
*
* @param fragment
* @param nutrition
*
* @return
*/
private BNode addNutrition(HTMLDocument fragment, HTMLDocument.TextField nutrition) {
final BNode nutritionBnode = getBlankNodeFor(nutrition.source());
addIRIProperty(nutritionBnode, RDF.TYPE, vHRECIPE.Nutrition);
conditionallyAddStringProperty(nutrition.source(), nutritionBnode, vHRECIPE.nutritionValue, nutrition.value());
mapFieldWithProperty(fragment, nutritionBnode, "value", vHRECIPE.nutritionValue);
mapFieldWithProperty(fragment, nutritionBnode, "type", vHRECIPE.nutritionValueType);
return nutritionBnode;
}
/**
* Adds the <code>nutritions</code> triples.
*
* @param fragment
* @param recipe
*/
private void addNutritions(HTMLDocument fragment, BNode recipe) {
HTMLDocument.TextField[] nutritions = fragment.getPluralTextField("nutrition");
for (HTMLDocument.TextField nutrition : nutritions) {
addBNodeProperty(recipe, vHRECIPE.nutrition, addNutrition(fragment, nutrition));
}
}
/**
* Adds the <code>tags</code> triples.
*
* @param fragment
* @param recipe
*/
private void addTags(HTMLDocument fragment, BNode recipe) {
HTMLDocument.TextField[] tags = fragment.extractRelTagNodes();
for (HTMLDocument.TextField tag : tags) {
conditionallyAddStringProperty(tag.source(), recipe, vHRECIPE.tag, tag.value());
}
}
}