| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| |
| package org.apache.any23.extractor.html; |
| |
| import org.apache.any23.extractor.ExtractionResult; |
| import org.apache.any23.extractor.ExtractorDescription; |
| import org.apache.any23.extractor.TagSoupExtractionResult; |
| import org.apache.any23.vocab.DOAC; |
| import org.apache.any23.vocab.FOAF; |
| import org.eclipse.rdf4j.model.BNode; |
| import org.eclipse.rdf4j.model.Resource; |
| import org.eclipse.rdf4j.model.vocabulary.RDF; |
| import org.w3c.dom.Node; |
| |
| import java.util.List; |
| |
| /** |
| * Extractor for the <a href="http://microformats.org/wiki/hresume">hResume</a> microformat. |
| * |
| * @author Gabriele Renzi |
| */ |
| public class HResumeExtractor extends EntityBasedMicroformatExtractor { |
| |
| private static final FOAF vFOAF = FOAF.getInstance(); |
| private static final DOAC vDOAC = DOAC.getInstance(); |
| |
| @Override |
| public ExtractorDescription getDescription() { |
| return HResumeExtractorFactory.getDescriptionInstance(); |
| } |
| |
| @Override |
| public String getBaseClassName() { |
| return "hresume"; |
| } |
| |
| @Override |
| protected void resetExtractor() { |
| // Empty. |
| } |
| |
| @Override |
| protected boolean extractEntity(Node node, ExtractionResult out) { |
| if (null == node) |
| return false; |
| BNode person = getBlankNodeFor(node); |
| // we have a person, at least |
| out.writeTriple(person, RDF.TYPE, vFOAF.Person); |
| final HTMLDocument fragment = new HTMLDocument(node); |
| addSummary(fragment, person); |
| addContact(fragment, person); |
| addExperiences(fragment, person); |
| addEducations(fragment, person); |
| addAffiliations(fragment, person); |
| addSkills(fragment, person); |
| |
| final TagSoupExtractionResult tser = (TagSoupExtractionResult) out; |
| tser.addResourceRoot(DomUtils.getXPathListForNode(node), person, this.getClass()); |
| |
| return true; |
| } |
| |
| private void addSummary(HTMLDocument doc, Resource person) { |
| HTMLDocument.TextField summary = doc.getSingularTextField("summary"); |
| conditionallyAddStringProperty(summary.source(), person, vDOAC.summary, summary.value()); |
| } |
| |
| private void addContact(HTMLDocument doc, Resource person) { |
| List<Node> nodes = doc.findAllByClassName("contact"); |
| if (nodes.size() > 0) |
| addBNodeProperty(nodes.get(0), person, vFOAF.isPrimaryTopicOf, getBlankNodeFor(nodes.get(0))); |
| } |
| |
| private void addExperiences(HTMLDocument doc, Resource person) { |
| List<Node> nodes = doc.findAllByClassName("experience"); |
| for (Node node : nodes) { |
| BNode exp = valueFactory.createBNode(); |
| if (addExperience(exp, new HTMLDocument(node))) |
| addBNodeProperty(node, person, vDOAC.experience, exp); |
| } |
| } |
| |
| private boolean addExperience(Resource exp, HTMLDocument document) { |
| final Node documentNode = document.getDocument(); |
| String check = ""; |
| |
| HTMLDocument.TextField value = document.getSingularTextField("title"); |
| check += value; |
| conditionallyAddStringProperty(value.source(), exp, vDOAC.title, value.value().trim()); |
| |
| value = document.getSingularTextField("dtstart"); |
| check += value; |
| conditionallyAddStringProperty(documentNode, exp, vDOAC.start_date, value.value().trim()); |
| |
| value = document.getSingularTextField("dtend"); |
| check += value; |
| conditionallyAddStringProperty(documentNode, exp, vDOAC.end_date, value.value().trim()); |
| |
| value = document.getSingularTextField("summary"); |
| check += value; |
| conditionallyAddStringProperty(documentNode, exp, vDOAC.organization, value.value().trim()); |
| |
| return !"".equals(check); |
| } |
| |
| private void addEducations(HTMLDocument doc, Resource person) { |
| List<Node> nodes = doc.findAllByClassName("education"); |
| for (Node node : nodes) { |
| BNode exp = valueFactory.createBNode(); |
| if (addExperience(exp, new HTMLDocument(node))) |
| addBNodeProperty(node, person, vDOAC.education, exp); |
| } |
| } |
| |
| private void addAffiliations(HTMLDocument doc, Resource person) { |
| List<Node> nodes = doc.findAllByClassName("affiliation"); |
| for (Node node : nodes) { |
| addBNodeProperty(node, person, vDOAC.affiliation, getBlankNodeFor(node)); |
| } |
| } |
| |
| private void addSkills(HTMLDocument doc, Resource person) { |
| List<Node> nodes; |
| |
| // Extracting data from single node. |
| nodes = doc.findAllByClassName("skill"); |
| for (Node node : nodes) { |
| conditionallyAddStringProperty(node, person, vDOAC.skill, extractSkillValue(node)); |
| } |
| // Extracting from enlisting node. |
| nodes = doc.findAllByClassName("skills"); |
| for (Node node : nodes) { |
| String nodeText = node.getTextContent(); |
| String[] skills = nodeText.split(","); |
| for (String skill : skills) { |
| conditionallyAddStringProperty(node, person, vDOAC.skill, skill.trim()); |
| } |
| } |
| } |
| |
| private String extractSkillValue(Node n) { |
| String name = n.getNodeName(); |
| String skill = null; |
| if ("A".equals(name) && DomUtils.hasAttribute(n, "rel", "tag")) { |
| skill = n.getAttributes().getNamedItem("href").getTextContent(); |
| } else { |
| skill = n.getTextContent(); |
| } |
| return skill; |
| } |
| |
| } |