blob: 5bb901ae29d8a2c006124f20e48bc254c05139f1 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.stanbol.enhancer.engines.dbpspotlight.model;
import static org.apache.stanbol.enhancer.engines.dbpspotlight.utils.XMLParser.getElementsByTagName;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import org.apache.clerezza.rdf.core.Resource;
import org.apache.clerezza.rdf.core.UriRef;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.w3c.dom.NodeList;
/**
* Contains a result given by DBpedia Spotlight.
*
*
* @author <a href="mailto:iavor.jelev@babelmonkeys.com">Iavor Jelev</a>
*/
public class Annotation {

    /*
     * TODO (Note by rwesten 2012-08-22)
     *
     * Added here functionality to extract DBpedia
     * Ontology types for Annotations. This is mainly to
     * choose the best dc:type for fise:TextAnnotations
     * created for Annotation.
     *
     * This is based on the assumption that the most generic
     * dbpedia type is always the last one in the returned list.
     *
     * In addition "DBpedia:TopicalConcept" is ignored first
     * as it seems not to be used by dbpedia.org and second
     * because it is always parsed last (even after schema
     * and freebase types) and would therefore be considered
     * as the most generic dbpedia type.
     *
     * I do not like this solution and would like to find
     * a better solution for that
     */

    /** Prefix used in the raw {@link #types} string for DBpedia Ontology types. */
    private static final String DBPEDIA_PREFIX = "DBpedia:";

    /** Namespace the {@link #DBPEDIA_PREFIX} is expanded to. */
    private static final String DBPEDIA_ONTOLOGY_NS = "http://dbpedia.org/ontology/";

    /**
     * Allows to add DBpedia Ontology types that should be
     * ignored by {@link #getDbpediaTypeNames()}.<p>
     * Introduced this to ignore the "TopicalConcept"
     * type. The set is unmodifiable.
     */
    public static final Set<String> IGNORED_DBP_TYPES;
    static {
        Set<String> ignored = new HashSet<String>();
        ignored.add(DBPEDIA_PREFIX + "TopicalConcept");
        IGNORED_DBP_TYPES = Collections.unmodifiableSet(ignored);
    }

    /** The resource this annotation refers to (the "URI" attribute of the parsed element). */
    public Resource uri;

    //TODO: change this to a list with the parsed types
    //      Processing of XML results should be done during parsing
    /** The raw, comma separated "types" attribute as parsed from the XML result. */
    public String types;

    /** The "support" attribute of the parsed element. */
    public Integer support;

    //NOTE rwesten: changed this to embed a SurfaceForm so that i
    //     can reuse code for creating fise:TextAnnotations
    /** The surface form (name, offset and type) this annotation was created for. */
    public SurfaceForm surfaceForm;

    /** The "similarityScore" attribute of the parsed element. */
    public Double similarityScore;

    /** The "percentageOfSecondRank" attribute of the parsed element. */
    public Double percentageOfSecondRank;

    /**
     * Getter for all types of this annotation with the "DBpedia:",
     * "Freebase:" and "Schema:" prefixes replaced by their namespaces,
     * so that the returned values are referenceable.
     *
     * @return the types in the order of the {@link #types} string, or an
     *         empty list if {@link #types} is <code>null</code>. Empty
     *         entries are skipped.
     */
    public List<String> getTypeNames() {
        if (types == null) {
            return Collections.emptyList();
        }
        List<String> typeNames = new ArrayList<String>();
        for (String type : types.split(",")) {
            // make the returned types referenceable
            String deref = type.replace(DBPEDIA_PREFIX, DBPEDIA_ONTOLOGY_NS)
                    .replace("Freebase:", "http://www.freebase.com/schema")
                    .replace("Schema:", "http://www.schema.org/");
            if (!deref.isEmpty()) {
                typeNames.add(deref);
            }
        }
        return typeNames;
    }

    /**
     * Getter for the dbpedia ontology types excluding {@link #IGNORED_DBP_TYPES}
     *
     * @return the types, with the "DBpedia:" prefix replaced by the DBpedia
     *         ontology namespace, in the order of the {@link #types} string,
     *         or an empty list if none
     */
    public List<String> getDbpediaTypeNames() {
        if (types == null) {
            return Collections.emptyList();
        }
        List<String> typeNames = new ArrayList<String>();
        for (String type : types.split(",")) {
            if (type.startsWith(DBPEDIA_PREFIX) && !IGNORED_DBP_TYPES.contains(type)) {
                typeNames.add(type.replace(DBPEDIA_PREFIX, DBPEDIA_ONTOLOGY_NS));
            }
        }
        return typeNames;
    }

    /**
     * Builds a single line representation listing all fields of this
     * annotation.
     *
     * @return the string representation; fields that are not set are
     *         printed as "null"
     */
    @Override
    public String toString() {
        // NOTE: "%s" is used for all values. The previously used "%i" is not
        // a valid conversion and "%d" can not format Double values, so the
        // old format string caused an exception on every call.
        return String.format(
                "[uri=%s, support=%s, types=%s, surfaceForm=\"%s\", similarityScore=%s, percentageOfSecondRank=%s]",
                uri, support, types, surfaceForm,
                similarityScore, percentageOfSecondRank);
    }

    /**
     * This method parses all Annotations from the parsed XML {@link Document}.
     * One Annotation is created for each "Resource" element.
     *
     * @param xmlDoc
     *            A XML document containing annotations.
     * @return a {@code Collection<Annotation>} with all annotations. The
     *         collection is a {@link HashSet}, so the iteration order is
     *         not specified.
     * @throws NumberFormatException
     *             if the "support", "offset", "similarityScore" or
     *             "percentageOfSecondRank" attribute of an element can not
     *             be parsed as a number
     */
    public static Collection<Annotation> parseAnnotations(Document xmlDoc) {
        NodeList resources = getElementsByTagName(xmlDoc, "Resource");
        Collection<Annotation> annotations = new HashSet<Annotation>();
        for (int i = 0; i < resources.getLength(); i++) {
            Element node = (Element) resources.item(i);
            Annotation annotation = new Annotation();
            annotation.uri = new UriRef(node.getAttribute("URI"));
            annotation.support = Integer.valueOf(node.getAttribute("support"));
            annotation.types = node.getAttribute("types");
            annotation.surfaceForm = new SurfaceForm();
            annotation.surfaceForm.name = node.getAttribute("surfaceForm");
            annotation.surfaceForm.offset = Integer.valueOf(node.getAttribute("offset"));
            //set the type of the surface form
            List<String> dbpediaTypes = annotation.getDbpediaTypeNames();
            if (!dbpediaTypes.isEmpty()) {
                //set the last type in the list - the most general one - as type
                //for the surface form
                annotation.surfaceForm.type = dbpediaTypes.get(dbpediaTypes.size() - 1);
            }
            annotation.similarityScore = Double.valueOf(node.getAttribute("similarityScore"));
            annotation.percentageOfSecondRank = Double.valueOf(
                    node.getAttribute("percentageOfSecondRank"));
            annotations.add(annotation);
        }
        return annotations;
    }
}