core/src/main/java/org/apache/any23/extractor/html/SpeciesExtractor.java - any23 - Git at Google

 /*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
  * The ASF licenses this file to You under the Apache License, Version 2.0
  * (the "License"); you may not use this file except in compliance with
  * the License.  You may obtain a copy of the License at
  *
  *  http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */

 package org.apache.any23.extractor.html;

 import java.util.Locale;

 import org.apache.any23.extractor.ExtractionException;
 import org.apache.any23.extractor.ExtractionResult;
 import org.apache.any23.extractor.ExtractorDescription;
 import org.apache.any23.extractor.TagSoupExtractionResult;
 import org.apache.any23.vocab.WO;
 import org.eclipse.rdf4j.model.BNode;
 import org.eclipse.rdf4j.model.Resource;
 import org.eclipse.rdf4j.model.IRI;
 import org.eclipse.rdf4j.model.vocabulary.RDF;
 import org.w3c.dom.Node;

 /**
  * Microformat extractor for the <a href="http://microformats.org/wiki/species">Species Microformat</a>.
  * Each extracted entity is represented by a blank node described with terms of the
  * <a href="http://www.bbc.co.uk/ontologies/wildlife/2010-02-22.shtml">BBC Wildlife Ontology</a>.
  *
  * @see org.apache.any23.vocab.WO
  * @author Davide Palmisano (dpalmisano@gmail.com)
  */
 public class SpeciesExtractor extends EntityBasedMicroformatExtractor {

     /** Vocabulary instance used to resolve the ontology classes and properties. */
     private static final WO vWO = WO.getInstance();

     /** Microformat class names looked up inside an entity, in the order they are processed. */
     private static final String[] CLASS_NAMES = {
             "kingdom", "phylum", "order", "family", "genus", "species", "class"
     };

     /**
      * Provides the description of this extractor.
      *
      * @return the description instance supplied by {@link SpeciesExtractorFactory}.
      */
     @Override
     public ExtractorDescription getDescription() {
         return SpeciesExtractorFactory.getDescriptionInstance();
     }

     /**
      * Provides the base class name for the extractor.
      *
      * @return the constant string <code>"biota"</code>.
      */
     @Override
     protected String getBaseClassName() {
         return "biota";
     }

     /**
      * Resets the internal status of the extractor before a new extraction section.
      * This extractor has nothing to reset, so the method does nothing.
      */
     @Override
     protected void resetExtractor() {
         // intentionally empty
     }

     /**
      * Extracts an entity from a <i>DOM</i> node: the node is mapped to a blank node typed as
      * species, its names and classes are added, and the blank node is finally registered
      * as a resource root.
      *
      * @param node the DOM node.
      * @param out  the extraction result collector; it is cast to {@link TagSoupExtractionResult}.
      * @return always <code>true</code>.
      * @throws org.apache.any23.extractor.ExtractionException if there is an error during extraction
      */
     @Override
     protected boolean extractEntity(Node node, ExtractionResult out) throws ExtractionException {
         final BNode entity = getBlankNodeFor(node);
         conditionallyAddResourceProperty(entity, RDF.TYPE, vWO.species);

         final HTMLDocument entityDoc = new HTMLDocument(node);
         addNames(entityDoc, entity);
         addClasses(entityDoc, entity);

         // The cast deliberately happens only after names and classes have been processed.
         ((TagSoupExtractionResult) out).addResourceRoot(
                 DomUtils.getXPathListForNode(node), entity, this.getClass());
         return true;
     }

     /**
      * Adds the scientific name (field <code>binomial</code>) and the species name
      * (field <code>vernacular</code>) to the given resource, in that order.
      */
     private void addNames(HTMLDocument doc, Resource biota) throws ExtractionException {
         addTextFieldProperty(doc, "binomial", biota, vWO.scientificName);
         addTextFieldProperty(doc, "vernacular", biota, vWO.speciesName);
     }

     /**
      * Reads the singular text field with the given class name and conditionally adds
      * its value to the subject as a string property.
      */
     private void addTextFieldProperty(
             HTMLDocument doc, String fieldClass, Resource subject, IRI property
     ) throws ExtractionException {
         final HTMLDocument.TextField field = doc.getSingularTextField(fieldClass);
         conditionallyAddStringProperty(field.source(), subject, property, field.value());
     }

     /**
      * For every known class name, reads the matching singular text field and conditionally
      * adds its value to the given resource through the corresponding <code>*Name</code> property.
      */
     private void addClassesName(HTMLDocument doc, Resource biota) throws ExtractionException {
         for (final String className : CLASS_NAMES) {
             final HTMLDocument.TextField nameField = doc.getSingularTextField(className);
             final Node origin = nameField.source();
             final IRI nameProperty = resolvePropertyName(className);
             conditionallyAddStringProperty(origin, biota, nameProperty, nameField.value());
         }
     }

     /**
      * For every known class name whose singular URL field has a source node, links the given
      * resource to a blank node for that source, types the blank node, and adds its names.
      */
     private void addClasses(HTMLDocument doc, Resource biota) throws ExtractionException {
         for (final String className : CLASS_NAMES) {
             final Node classNode = doc.getSingularUrlField(className).source();
             if (classNode == null) {
                 continue;
             }
             final BNode classBNode = getBlankNodeFor(classNode);
             addBNodeProperty(biota, vWO.getProperty(className), classBNode);
             conditionallyAddResourceProperty(classBNode, RDF.TYPE, resolveClassName(className));
             addClassesName(new HTMLDocument(classNode), classBNode);
         }
     }

     /**
      * Resolves the vocabulary property named after the given class name followed by
      * <code>Name</code>.
      */
     private IRI resolvePropertyName(String clazz) {
         final String propertyName = String.format(Locale.ROOT, "%sName", clazz);
         return vWO.getProperty(propertyName);
     }

     /**
      * Resolves the vocabulary class named after the given class name with its first
      * character converted to upper case.
      */
     private IRI resolveClassName(String clazz) {
         final String initial = clazz.substring(0, 1).toUpperCase(Locale.ROOT);
         final String remainder = clazz.substring(1);
         return vWO.getClass(String.format(Locale.ROOT, "%s%s", initial, remainder));
     }
 }
	/*
	* Licensed to the Apache Software Foundation (ASF) under one or more
	* contributor license agreements. See the NOTICE file distributed with
	* this work for additional information regarding copyright ownership.
	* The ASF licenses this file to You under the Apache License, Version 2.0
	* (the "License"); you may not use this file except in compliance with
	* the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/

	package org.apache.any23.extractor.html;

	import java.util.Locale;

	import org.apache.any23.extractor.ExtractionException;
	import org.apache.any23.extractor.ExtractionResult;
	import org.apache.any23.extractor.ExtractorDescription;
	import org.apache.any23.extractor.TagSoupExtractionResult;
	import org.apache.any23.vocab.WO;
	import org.eclipse.rdf4j.model.BNode;
	import org.eclipse.rdf4j.model.Resource;
	import org.eclipse.rdf4j.model.IRI;
	import org.eclipse.rdf4j.model.vocabulary.RDF;
	import org.w3c.dom.Node;

	/**
	 * Microformat extractor for the <a href="http://microformats.org/wiki/species">Species Microformat</a>.
	 * Each extracted entity is represented by a blank node described with terms of the
	 * <a href="http://www.bbc.co.uk/ontologies/wildlife/2010-02-22.shtml">BBC Wildlife Ontology</a>.
	 *
	 * @see org.apache.any23.vocab.WO
	 * @author Davide Palmisano (dpalmisano@gmail.com)
	 */
	public class SpeciesExtractor extends EntityBasedMicroformatExtractor {

	    /** Vocabulary instance used to resolve the ontology classes and properties. */
	    private static final WO vWO = WO.getInstance();

	    /** Microformat class names looked up inside an entity, in the order they are processed. */
	    private static final String[] CLASS_NAMES = {
	            "kingdom", "phylum", "order", "family", "genus", "species", "class"
	    };

	    /**
	     * Provides the description of this extractor.
	     *
	     * @return the description instance supplied by {@link SpeciesExtractorFactory}.
	     */
	    @Override
	    public ExtractorDescription getDescription() {
	        return SpeciesExtractorFactory.getDescriptionInstance();
	    }

	    /**
	     * Provides the base class name for the extractor.
	     *
	     * @return the constant string <code>"biota"</code>.
	     */
	    @Override
	    protected String getBaseClassName() {
	        return "biota";
	    }

	    /**
	     * Resets the internal status of the extractor before a new extraction section.
	     * This extractor has nothing to reset, so the method does nothing.
	     */
	    @Override
	    protected void resetExtractor() {
	        // intentionally empty
	    }

	    /**
	     * Extracts an entity from a <i>DOM</i> node: the node is mapped to a blank node typed as
	     * species, its names and classes are added, and the blank node is finally registered
	     * as a resource root.
	     *
	     * @param node the DOM node.
	     * @param out the extraction result collector; it is cast to {@link TagSoupExtractionResult}.
	     * @return always <code>true</code>.
	     * @throws org.apache.any23.extractor.ExtractionException if there is an error during extraction
	     */
	    @Override
	    protected boolean extractEntity(Node node, ExtractionResult out) throws ExtractionException {
	        final BNode entity = getBlankNodeFor(node);
	        conditionallyAddResourceProperty(entity, RDF.TYPE, vWO.species);

	        final HTMLDocument entityDoc = new HTMLDocument(node);
	        addNames(entityDoc, entity);
	        addClasses(entityDoc, entity);

	        // The cast deliberately happens only after names and classes have been processed.
	        ((TagSoupExtractionResult) out).addResourceRoot(
	                DomUtils.getXPathListForNode(node), entity, this.getClass());
	        return true;
	    }

	    /**
	     * Adds the scientific name (field <code>binomial</code>) and the species name
	     * (field <code>vernacular</code>) to the given resource, in that order.
	     */
	    private void addNames(HTMLDocument doc, Resource biota) throws ExtractionException {
	        addTextFieldProperty(doc, "binomial", biota, vWO.scientificName);
	        addTextFieldProperty(doc, "vernacular", biota, vWO.speciesName);
	    }

	    /**
	     * Reads the singular text field with the given class name and conditionally adds
	     * its value to the subject as a string property.
	     */
	    private void addTextFieldProperty(
	            HTMLDocument doc, String fieldClass, Resource subject, IRI property
	    ) throws ExtractionException {
	        final HTMLDocument.TextField field = doc.getSingularTextField(fieldClass);
	        conditionallyAddStringProperty(field.source(), subject, property, field.value());
	    }

	    /**
	     * For every known class name, reads the matching singular text field and conditionally
	     * adds its value to the given resource through the corresponding <code>*Name</code> property.
	     */
	    private void addClassesName(HTMLDocument doc, Resource biota) throws ExtractionException {
	        for (final String className : CLASS_NAMES) {
	            final HTMLDocument.TextField nameField = doc.getSingularTextField(className);
	            final Node origin = nameField.source();
	            final IRI nameProperty = resolvePropertyName(className);
	            conditionallyAddStringProperty(origin, biota, nameProperty, nameField.value());
	        }
	    }

	    /**
	     * For every known class name whose singular URL field has a source node, links the given
	     * resource to a blank node for that source, types the blank node, and adds its names.
	     */
	    private void addClasses(HTMLDocument doc, Resource biota) throws ExtractionException {
	        for (final String className : CLASS_NAMES) {
	            final Node classNode = doc.getSingularUrlField(className).source();
	            if (classNode == null) {
	                continue;
	            }
	            final BNode classBNode = getBlankNodeFor(classNode);
	            addBNodeProperty(biota, vWO.getProperty(className), classBNode);
	            conditionallyAddResourceProperty(classBNode, RDF.TYPE, resolveClassName(className));
	            addClassesName(new HTMLDocument(classNode), classBNode);
	        }
	    }

	    /**
	     * Resolves the vocabulary property named after the given class name followed by
	     * <code>Name</code>.
	     */
	    private IRI resolvePropertyName(String clazz) {
	        final String propertyName = String.format(Locale.ROOT, "%sName", clazz);
	        return vWO.getProperty(propertyName);
	    }

	    /**
	     * Resolves the vocabulary class named after the given class name with its first
	     * character converted to upper case.
	     */
	    private IRI resolveClassName(String clazz) {
	        final String initial = clazz.substring(0, 1).toUpperCase(Locale.ROOT);
	        final String remainder = clazz.substring(1);
	        return vWO.getClass(String.format(Locale.ROOT, "%s%s", initial, remainder));
	    }
	}