created branch for STANBOL-1279
git-svn-id: https://svn.apache.org/repos/asf/stanbol/branches/STANBOL-1279@1643760 13f79535-47bb-0310-9956-ffa450edef68
diff --git a/entitycoreference/README.md b/entitycoreference/README.md
new file mode 100644
index 0000000..b25da45
--- /dev/null
+++ b/entitycoreference/README.md
@@ -0,0 +1,84 @@
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+
+# Entity Co-reference Engine
+
+The Entity co-reference Engine performs co-reference resolution of Named Entities in a given text.
+The co-references will be noun phrases which refer to those Named Entities by having a minimal set of attributes which match
+contextual information (rdf:type of the entity and spatial and object function giving info) from entity repositories
+such as DBpedia and YAGO for that Named Entity.
+
+Consider the following text as an example: "Microsoft has posted its 2013 earnings. The software company did better than expected. ... The Redmond-based company will hire 500 new developers this year."
+The enhancement engine will link "Microsoft" with "The software company" and "The Redmond-based company".
+
+## Building the DBpedia 3.9 index with YAGO types
+
+This index will contain the yago rdf:types and several spatial/org membership and functional properties from the DBpedia index.
+NOTE: At the moment the index is available only for English.
+
+### (1) Follow the instructions at entityhub/indexing/dbpedia/README.md and build the dbpedia index with the following configuration:
+
+#### (1) Use the RDF dumps (in N-Triple format) from :
+ http://downloads.dbpedia.org/3.9/dbpedia_3.9.owl
+ http://downloads.dbpedia.org/3.9/en/labels_en.nt.bz2
+ http://downloads.dbpedia.org/3.9/en/instance_types_en.nt.bz2
+ http://downloads.dbpedia.org/3.9/en/mappingbased_properties_en.nt.bz2
+ http://downloads.dbpedia.org/3.9/links/yago_types.nt.bz2
+
+#### (2) The mappings.txt file must contain the following entries:
+
+ rdfs:label | d=entityhub:text
+ rdf:type | d=entityhub:ref
+ dbp-ont:birthPlace | d=entityhub:ref
+ dbp-ont:region | d=entityhub:ref
+ dbp-ont:foundationPlace | d=entityhub:ref
+ dbp-ont:locationCity | d=entityhub:ref
+ dbp-ont:location | d=entityhub:ref
+ dbp-ont:hometown | d=entityhub:ref
+ dbp-ont:country | d=entityhub:ref
+
+### (2) Run the script dbpedia_yago_classes/build_yago_dbpedia_labels.sh which will create the dbpedia_yago_classes_labels.nt.bz2 archive
+which contains the labels of the yago types.
+
+### (3) Follow the instructions at entityhub/indexing/genericrdf/README.md and rebuild the dbpedia index in order to include the
+aforementioned yago types labels. After you init the indexer go through the following steps:
+
+#### (1) Copy the dbpedia_yago_classes_labels.nt.bz2 to the indexing/resources/rdfdata folder.
+
+#### (2) Change the indexing/config/indexing.properties to include the following attributes:
+
+ name=dbpedia
+ description=DBpedia.org
+
+#### (3) The indexing/config/mappings.txt file must only contain the rdfs:label attribute
+
+#### (4) Copy the contents of the indexing/destination folder from the results of point ### (1) to the /indexing/destination folder
+of the generic rdf indexing at point ### (3).
+
+The results of all these steps will be the dbpedia.solrindex.zip archive which should be used as described in entityhub/indexing/dbpedia/README.md.
+
+## Configuring the Engine
+TODO
+
+## Running the Entity co-reference engine in Stanbol
+
+In order to run the engine you need to add it to a chain that also contains the following engine types:
+- a language detection engine
+- a sentence detection engine (like opennlp-sentence)
+- a token detection engine (like opennlp-token)
+- a NER detection engine (like opennlp-ner)
+- a noun phrase detection engine (like pos-chunker)
diff --git a/entitycoreference/dbpedia_yago_classes/build_yago_dbpedia_labels.sh b/entitycoreference/dbpedia_yago_classes/build_yago_dbpedia_labels.sh
new file mode 100755
index 0000000..34274c0
--- /dev/null
+++ b/entitycoreference/dbpedia_yago_classes/build_yago_dbpedia_labels.sh
@@ -0,0 +1,58 @@
+#!/usr/bin/env bash
+# Builds dbpedia_yago_classes_labels.nt.bz2: N-Triples with the rdfs:label of
+# every DBpedia yago class, derived from the YAGO label and class-mapping dumps.
+# Needs wget, 7za, awk and bzip2; all files are created in the current directory.
+set -euo pipefail
+
+# No trailing slash: download URLs are assembled as "${YAGO}/<file>".
+YAGO=http://resources.mpi-inf.mpg.de/yago-naga/yago/download/yago
+
+files=(yagoLabels.ttl.7z yagoDBpediaClasses.ttl.7z)
+YAGO_LABELS=yagoLabels.ttl
+YAGO_DBPEDIA_CLASSES=yagoDBpediaClasses.ttl
+YAGO_WORDNET_LABELS=yago_wordnet_labels
+YAGO_WORDNET_DBPEDIA_CLASSES=yago_wordnet_dbpedia_classes
+DBPEDIA_YAGO_CLASS_LABELS_NT=dbpedia_yago_classes_labels.nt
+RDFS_LABEL='<http://www.w3.org/2000/01/rdf-schema#label>'
+
+# First, download and decompress the necessary yago files.
+for i in "${files[@]}"; do
+  wget -c "${YAGO}/${i}"
+  7za e "${i}"
+  rm -- "${i}"
+done
+
+# Second, create a file with <wordnet_class> rdfs:label "label" format.
+# (awk instead of grep so that "no match" is not treated as a failure by set -e.)
+awk '/^<wordnet_/ && /rdfs:label/' "${YAGO_LABELS}" > "${YAGO_WORDNET_LABELS}"
+
+# Third, create a file with wordnet to dbpedia yago class mappings.
+awk '/^<wordnet_/' "${YAGO_DBPEDIA_CLASSES}" > "${YAGO_WORDNET_DBPEDIA_CLASSES}"
+
+# Last, create the nt file which will contain the dbpedia yago class and its labels.
+# Start from an empty file so that a re-run never appends to stale output.
+: > "${DBPEDIA_YAGO_CLASS_LABELS_NT}"
+# One awk pass joins both files in memory instead of running grep once per label;
+# lines are copied verbatim, so backslash escapes and '*' in labels stay intact.
+awk -v label_iri="${RDFS_LABEL}" -v out="${DBPEDIA_YAGO_CLASS_LABELS_NT}" '
+  # First file (mappings): remember wordnet class -> dbpedia yago class.
+  NR == FNR { if ($3 != "") dbpedia[$1] = $3; next }
+  # Second file (labels): wordnet classes without a mapping are skipped.
+  !($1 in dbpedia) { next }
+  {
+    rest = substr($0, length($1) + 1)
+    at = index(rest, "rdfs:label"); if (at == 0) next
+    print "Mapping " $1 " to " dbpedia[$1]
+    rest = substr(rest, 1, at - 1) label_iri substr(rest, at + length("rdfs:label"))
+    sub(/@eng[ \t]*\.[ \t]*$/, "@en .", rest)  # yago "eng" tag -> 2-letter "en"
+    print dbpedia[$1] rest >> out
+  }
+' "${YAGO_WORDNET_DBPEDIA_CLASSES}" "${YAGO_WORDNET_LABELS}"
+
+bzip2 "${DBPEDIA_YAGO_CLASS_LABELS_NT}"
+
+# Cleanup
+rm -- "${YAGO_LABELS}"
+rm -- "${YAGO_DBPEDIA_CLASSES}"
+rm -- "${YAGO_WORDNET_LABELS}"
+rm -- "${YAGO_WORDNET_DBPEDIA_CLASSES}"
\ No newline at end of file
diff --git a/entitycoreference/pom.xml b/entitycoreference/pom.xml
new file mode 100644
index 0000000..e54db7a
--- /dev/null
+++ b/entitycoreference/pom.xml
@@ -0,0 +1,129 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
+
+ <modelVersion>4.0.0</modelVersion>
+
+ <parent>
+ <groupId>org.apache.stanbol</groupId>
+ <artifactId>apache-stanbol-enhancement-engines</artifactId>
+ <version>1.0.0-SNAPSHOT</version>
+ <relativePath>..</relativePath>
+ </parent>
+
+ <groupId>org.apache.stanbol</groupId>
+ <artifactId>org.apache.stanbol.enhancer.engines.entitycoreference</artifactId>
+ <version>1.0.0-SNAPSHOT</version>
+ <packaging>bundle</packaging>
+
+ <name>Apache Stanbol Enhancement Engine : Entity Co-Reference</name>
+ <description>
+ An Engine that finds co-references of Named Entities based on
+ dbpedia/yago concepts.
+ </description>
+
+ <inceptionYear>2014</inceptionYear>
+
+ <scm>
+ <connection>
+ scm:svn:http://svn.apache.org/repos/asf/stanbol/trunk/enhancement-engines/entitycoreference/
+ </connection>
+ <developerConnection>
+ scm:svn:https://svn.apache.org/repos/asf/stanbol/trunk/enhancement-engines/entitycoreference/
+ </developerConnection>
+ <url>http://stanbol.apache.org/</url>
+ </scm>
+
+ <build>
+ <plugins>
+ <plugin>
+ <groupId>org.apache.felix</groupId>
+ <artifactId>maven-bundle-plugin</artifactId>
+ <extensions>true</extensions>
+ <configuration>
+ <instructions>
+ <Import-Package>
+ org.apache.stanbol.enhancer.servicesapi; provide:=true; version="[0.11,1.1)",
+ org.apache.stanbol.enhancer.engines.entitylinking;version=${project.version}; provide:=true,
+ *
+ </Import-Package>
+ <Private-Package>
+ org.apache.stanbol.enhancer.engines.entitycoreference.*;version=${project.version}
+ </Private-Package>
+ </instructions>
+ </configuration>
+ </plugin>
+ <plugin>
+ <groupId>org.apache.felix</groupId>
+ <artifactId>maven-scr-plugin</artifactId>
+ </plugin>
+ <plugin>
+ <groupId>org.apache.maven.plugins</groupId>
+ <artifactId>maven-compiler-plugin</artifactId>
+ <version>3.1</version>
+ <configuration>
+ <source>1.7</source>
+ <target>1.7</target>
+ </configuration>
+ </plugin>
+ </plugins>
+ </build>
+
+ <dependencies>
+ <dependency>
+ <groupId>org.apache.stanbol</groupId>
+ <artifactId>org.apache.stanbol.commons.namespaceprefix.service</artifactId>
+ <version>1.0.0-SNAPSHOT</version>
+ </dependency>
+
+ <dependency>
+ <groupId>org.apache.stanbol</groupId>
+ <artifactId>org.apache.stanbol.enhancer.engines.entitylinking.engine</artifactId>
+ <version>1.0.0-SNAPSHOT</version>
+ </dependency>
+
+ <dependency>
+ <groupId>org.apache.stanbol</groupId>
+ <artifactId>org.apache.stanbol.entityhub.servicesapi</artifactId>
+ <version>1.0.0-SNAPSHOT</version>
+ </dependency>
+
+ <dependency>
+ <groupId>org.apache.felix</groupId>
+ <artifactId>org.apache.felix.scr.annotations</artifactId>
+ </dependency>
+ <dependency>
+ <groupId>org.slf4j</groupId>
+ <artifactId>slf4j-api</artifactId>
+ </dependency>
+
+ <!-- Test dependencies -->
+ <dependency>
+ <groupId>junit</groupId>
+ <artifactId>junit</artifactId>
+ <scope>test</scope>
+ </dependency>
+ <dependency> <!-- used for debug level logging during tests -->
+ <groupId>org.slf4j</groupId>
+ <artifactId>slf4j-log4j12</artifactId>
+ <scope>test</scope>
+ </dependency>
+
+ </dependencies>
+
+</project>
diff --git a/entitycoreference/src/license/THIRD-PARTY.properties b/entitycoreference/src/license/THIRD-PARTY.properties
new file mode 100644
index 0000000..21c81d0
--- /dev/null
+++ b/entitycoreference/src/license/THIRD-PARTY.properties
@@ -0,0 +1,24 @@
+# Generated by org.codehaus.mojo.license.AddThirdPartyMojo
+#-------------------------------------------------------------------------------
+# Already used licenses in project :
+# - Apache Software License
+# - Apache Software License, Version 2.0
+# - BSD License
+# - Common Development And Distribution License (CDDL), Version 1.0
+# - Common Development And Distribution License (CDDL), Version 1.1
+# - Common Public License, Version 1.0
+# - Eclipse Public License, Version 1.0
+# - GNU General Public License (GPL), Version 2 with classpath exception
+# - GNU Lesser General Public License (LGPL)
+# - GNU Lesser General Public License (LGPL), Version 2.1
+# - ICU License
+# - MIT License
+# - New BSD License
+# - Public Domain License
+#-------------------------------------------------------------------------------
+# Please fill the missing licenses for dependencies :
+#
+#
+#Tue Jul 23 16:41:35 CEST 2013
+org.osgi--org.osgi.compendium--4.1.0=The Apache Software License, Version 2.0
+org.osgi--org.osgi.core--4.1.0=The Apache Software License, Version 2.0
diff --git a/entitycoreference/src/main/java/org/apache/stanbol/enhancer/engines/entitycoreference/Constants.java b/entitycoreference/src/main/java/org/apache/stanbol/enhancer/engines/entitycoreference/Constants.java
new file mode 100644
index 0000000..3e2c0a0
--- /dev/null
+++ b/entitycoreference/src/main/java/org/apache/stanbol/enhancer/engines/entitycoreference/Constants.java
@@ -0,0 +1,47 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.stanbol.enhancer.engines.entitycoreference;
+
+/**
+ * Folder path constants shared by the classes of the entity co-reference engine.
+ *
+ * @author Cristian Petroaca
+ *
+ */
+public final class Constants {
+ /**
+ * The main config folder of the engine: {@code "/config"}.
+ */
+ public final static String CONFIG_FOLDER = "/config";
+
+ /**
+ * The main data folder of the engine: {@code "/data"}.
+ */
+ public final static String DATA_FOLDER = "/data";
+
+ /**
+ * The path to the pos config folder, a sub folder of {@link #CONFIG_FOLDER}: {@code "/config/pos"}.
+ */
+ public final static String POS_CONFIG_FOLDER = CONFIG_FOLDER + "/pos";
+
+ /**
+ * The path to the place adjectivals folder, a sub folder of {@link #DATA_FOLDER}.
+ */
+ public final static String PLACE_ADJECTIVALS_FOLDER = DATA_FOLDER + "/place_adjectivals";
+
+ private Constants() {} // constants holder: the private constructor prevents instantiation from outside
+}
diff --git a/entitycoreference/src/main/java/org/apache/stanbol/enhancer/engines/entitycoreference/EntityCoReferenceEngine.java b/entitycoreference/src/main/java/org/apache/stanbol/enhancer/engines/entitycoreference/EntityCoReferenceEngine.java
new file mode 100644
index 0000000..87e27a7
--- /dev/null
+++ b/entitycoreference/src/main/java/org/apache/stanbol/enhancer/engines/entitycoreference/EntityCoReferenceEngine.java
@@ -0,0 +1,336 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.stanbol.enhancer.engines.entitycoreference;
+
+import static org.apache.stanbol.enhancer.nlp.utils.NlpEngineHelper.getLanguage;
+
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.Dictionary;
+import java.util.EnumSet;
+import java.util.HashMap;
+import java.util.Iterator;
+import java.util.List;
+import java.util.Map;
+
+import org.apache.felix.scr.annotations.Activate;
+import org.apache.felix.scr.annotations.Component;
+import org.apache.felix.scr.annotations.Deactivate;
+import org.apache.felix.scr.annotations.Properties;
+import org.apache.felix.scr.annotations.Property;
+import org.apache.felix.scr.annotations.Reference;
+import org.apache.felix.scr.annotations.Service;
+import org.apache.stanbol.enhancer.engines.entitycoreference.datamodel.NounPhrase;
+import org.apache.stanbol.enhancer.engines.entitycoreference.impl.CoreferenceFinder;
+import org.apache.stanbol.enhancer.engines.entitycoreference.impl.NounPhraseFilterer;
+import org.apache.stanbol.enhancer.nlp.NlpAnnotations;
+import org.apache.stanbol.enhancer.nlp.model.AnalysedText;
+import org.apache.stanbol.enhancer.nlp.model.Section;
+import org.apache.stanbol.enhancer.nlp.model.Span;
+import org.apache.stanbol.enhancer.nlp.model.SpanTypeEnum;
+import org.apache.stanbol.enhancer.nlp.model.annotation.Value;
+import org.apache.stanbol.enhancer.nlp.ner.NerTag;
+import org.apache.stanbol.enhancer.nlp.phrase.PhraseTag;
+import org.apache.stanbol.enhancer.nlp.pos.LexicalCategory;
+import org.apache.stanbol.enhancer.nlp.utils.NlpEngineHelper;
+import org.apache.stanbol.enhancer.servicesapi.ContentItem;
+import org.apache.stanbol.enhancer.servicesapi.EngineException;
+import org.apache.stanbol.enhancer.servicesapi.EnhancementEngine;
+import org.apache.stanbol.enhancer.servicesapi.ServiceProperties;
+import org.apache.stanbol.enhancer.servicesapi.impl.AbstractEnhancementEngine;
+import org.apache.stanbol.entityhub.servicesapi.Entityhub;
+import org.apache.stanbol.entityhub.servicesapi.site.SiteManager;
+import org.osgi.service.cm.ConfigurationException;
+import org.osgi.service.component.ComponentContext;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * This engine extracts references in the given text of noun phrases which point to NERs. The coreference is
+ * performed based on matching several of the named entity's dbpedia/yago properties to the noun phrase
+ * tokens.
+ *
+ * TODO - Be able to detect possessive coreferences such as Germany's prime minister. TODO - Be able to detect
+ * products and their developer such as Iphone 7 and Apple's new device. TODO - Provide the ability via config
+ * for the user to also allow coreferencing of 1 word noun phrases based solely on comparison with entity class
+ * type?
+ *
+ * @author Cristian Petroaca
+ *
+ */
+@Component(immediate = true, metatype = true)
+@Service(value = EnhancementEngine.class)
+@Properties(value = {
+ @Property(name = EnhancementEngine.PROPERTY_NAME, value = "entity-coreference"),
+ @Property(name = EntityCoReferenceEngine.CONFIG_LANGUAGES, value = "en"),
+ @Property(name = EntityCoReferenceEngine.REFERENCED_SITE_ID, value = "dbpedia"),
+ @Property(name = EntityCoReferenceEngine.MAX_DISTANCE, intValue = EntityCoReferenceEngine.MAX_DISTANCE_DEFAULT_VALUE)})
+public class EntityCoReferenceEngine extends AbstractEnhancementEngine<RuntimeException,RuntimeException>
+ implements EnhancementEngine, ServiceProperties {
+
+ private static final Integer ENGINE_ORDERING = ServiceProperties.ORDERING_POST_PROCESSING + 91;
+
+ /**
+ * Language configuration. Takes a list of ISO language codes of supported languages. Currently supported
+ * are the languages given as default value.
+ */
+ protected static final String CONFIG_LANGUAGES = "enhancer.engine.entitycoreference.languages";
+
+ /**
+ * Referenced site configuration. Defaults to dbpedia.
+ */
+ protected static final String REFERENCED_SITE_ID = "enhancer.engine.entitycoreference.referencedSiteId";
+
+ /**
+ * Maximum sentence distance between the ner and the noun phrase which mentions it. -1 means no distance
+ * constraint.
+ */
+ protected static final String MAX_DISTANCE = "enhancer.engine.entitycoreference.maxDistance";
+
+ protected static final int MAX_DISTANCE_DEFAULT_VALUE = 1;
+ public static final int MAX_DISTANCE_NO_CONSTRAINT = -1;
+
+ private final Logger log = LoggerFactory.getLogger(EntityCoReferenceEngine.class);
+
+ /**
+ * Service of the Entityhub that manages all the active referenced Site. This Service is used to lookup
+ * the configured Referenced Site when we need to enhance a content item.
+ */
+ @Reference
+ protected SiteManager siteManager;
+
+ /**
+ * Used to lookup Entities if the {@link #REFERENCED_SITE_ID} property is set to "entityhub" or "local"
+ */
+ @Reference
+ protected Entityhub entityhub;
+
+ /**
+ * Specialized class which filters out bad noun phrases based on the language.
+ */
+ private NounPhraseFilterer nounPhraseFilterer;
+
+ /**
+ * Performs the logic needed to find corefs based on the NERs and noun phrases in the text.
+ */
+ private CoreferenceFinder corefFinder;
+
+ /**
+  * Reads the engine configuration and creates the {@link NounPhraseFilterer} and {@link CoreferenceFinder}.
+  *
+  * @param ctx component context providing the configuration properties
+  * @throws ConfigurationException if the languages or the referenced site id are missing or empty, or if
+  *         the max distance is not a number or is smaller than -1
+  */
+ @SuppressWarnings("unchecked")
+ @Activate
+ protected void activate(ComponentContext ctx) throws ConfigurationException {
+     super.activate(ctx);
+     Dictionary<String,Object> config = ctx.getProperties();
+
+     /* Step 1 - initialize the {@link NounPhraseFilterer} with the language config */
+     String languages = (String) config.get(CONFIG_LANGUAGES);
+     if (languages == null || languages.trim().isEmpty()) {
+         throw new ConfigurationException(CONFIG_LANGUAGES,
+                 "The Languages Config is a required Parameter and MUST NOT be NULL or an empty String!");
+     }
+     // Accept whitespace around the comma separated codes ("en, de") instead of passing on " de".
+     String[] languageCodes = languages.trim().split("\\s*,\\s*");
+     nounPhraseFilterer = new NounPhraseFilterer(languageCodes);
+
+     /* Step 2 - initialize the {@link CoreferenceFinder} */
+     Object referencedSiteIDfromConfig = config.get(REFERENCED_SITE_ID);
+     if (referencedSiteIDfromConfig == null) {
+         throw new ConfigurationException(REFERENCED_SITE_ID,
+                 "The ID of the Referenced Site is a required Parameter and MUST NOT be NULL!");
+     }
+     String referencedSiteID = referencedSiteIDfromConfig.toString();
+     if (referencedSiteID.isEmpty()) {
+         throw new ConfigurationException(REFERENCED_SITE_ID,
+                 "The ID of the Referenced Site is a required Parameter and MUST NOT be an empty String!");
+     }
+
+     if (Entityhub.ENTITYHUB_IDS.contains(referencedSiteID.toLowerCase())) {
+         log.debug("Init {} instance for the Entityhub", getClass().getSimpleName());
+         referencedSiteID = null;
+     }
+
+     int maxDistance;
+     Object maxDistanceFromConfig = config.get(MAX_DISTANCE);
+     if (maxDistanceFromConfig == null) {
+         maxDistance = MAX_DISTANCE_DEFAULT_VALUE;
+     } else if (maxDistanceFromConfig instanceof Number) {
+         maxDistance = ((Number) maxDistanceFromConfig).intValue();
+     } else {
+         try {
+             maxDistance = Integer.parseInt(maxDistanceFromConfig.toString().trim());
+         } catch (NumberFormatException nfe) {
+             throw new ConfigurationException(MAX_DISTANCE, "The Max Distance parameter must be a number");
+         }
+     }
+     if (maxDistance < MAX_DISTANCE_NO_CONSTRAINT) {
+         throw new ConfigurationException(MAX_DISTANCE,
+                 "The Max Distance parameter must not be smaller than -1");
+     }
+
+     corefFinder = new CoreferenceFinder(languageCodes.clone(), siteManager, entityhub, referencedSiteID,
+             maxDistance);
+     log.info("activate {}[name:{}]", getClass().getSimpleName(), getName());
+ }
+
+ @Override
+ public Map<String,Object> getServiceProperties() {
+ return Collections.unmodifiableMap(Collections.singletonMap(ENHANCEMENT_ENGINE_ORDERING,
+ (Object) ENGINE_ORDERING));
+ }
+
+ @Override
+ public int canEnhance(ContentItem ci) throws EngineException {
+     String language = getLanguage(this, ci, false);
+     if (language == null) {
+         // The language is null in this branch, so it is not passed as a log argument.
+         log.debug("Engine {} ignores ContentItem {} because its language is not detected.",
+             getName(), ci.getUri());
+         return CANNOT_ENHANCE;
+     }
+
+     if (!nounPhraseFilterer.supportsLanguage(language)) {
+         log.debug("Engine {} does not support language {}.", getName(), language);
+         return CANNOT_ENHANCE;
+     }
+     return ENHANCE_SYNCHRONOUS;
+ }
+
+ @Override
+ public void computeEnhancements(ContentItem ci) throws EngineException {
+     /*
+      * Step 1 - Collect the NERs (keyed by sentence number) and the noun phrases of the text.
+      *
+      * TODO - the noun phrases need to be lemmatized.
+      */
+     final Map<Integer,List<Span>> nersBySentence = new HashMap<Integer,List<Span>>();
+     final List<NounPhrase> candidatePhrases = new ArrayList<NounPhrase>();
+     extractNersAndNounPhrases(ci, nersBySentence, candidatePhrases);
+
+     /*
+      * Without NERs there is nothing a noun phrase could refer to.
+      */
+     if (nersBySentence.isEmpty()) {
+         log.info("Did not find any NERs for which to do the coreferencing");
+         return;
+     }
+
+     /*
+      * Step 2 - Drop the noun phrases rejected by the language specific filter.
+      */
+     final String language = getLanguage(this, ci, false);
+     if (language == null) {
+         log.info("Could not detect the language of the text");
+         return;
+     }
+
+     nounPhraseFilterer.filter(candidatePhrases, language);
+
+     /*
+      * Without remaining noun phrases there is nothing left to link.
+      */
+     if (candidatePhrases.isEmpty()) {
+         log.info("Did not find any noun phrases with which to do the coreferencing");
+         return;
+     }
+
+     /*
+      * Step 3 - Extract the corefs and write them as {@link NlpAnnotations.COREF_ANNOTATION}s in the
+      * {@link Span}s.
+      */
+     corefFinder.extractCorefs(nersBySentence, candidatePhrases, language);
+ }
+
+ @Deactivate
+ protected void deactivate(ComponentContext ctx) {
+     // Logged first, i.e. before the helpers are released and the superclass is deactivated.
+     log.info("deactivate {}[name:{}]", getClass().getSimpleName(), getName());
+
+     corefFinder = null;
+     nounPhraseFilterer = null;
+     super.deactivate(ctx);
+ }
+
+ /**
+ * Extracts the NERs and the noun phrases from the given text and puts them in the given collections.
+ *
+ * @param ci the content item whose {@link AnalysedText} is processed
+ * @param ners output map filled with the NER chunks of each sentence, keyed by the 1-based sentence number
+ * @param nounPhrases output list to which the noun phrases of all sentences are added
+ */
+ private void extractNersAndNounPhrases(ContentItem ci,
+ Map<Integer,List<Span>> ners,
+ List<NounPhrase> nounPhrases) {
+ AnalysedText at = NlpEngineHelper.getAnalysedText(this, ci, true);
+ Iterator<? extends Section> sections = at.getSentences();
+ if (!sections.hasNext()) { // process as single sentence
+ sections = Collections.singleton(at).iterator();
+ }
+ // Sentence numbers are 1-based: the counter is incremented before it is used.
+ int sentenceCnt = 0;
+ while (sections.hasNext()) {
+ sentenceCnt++;
+ Section section = sections.next();
+ List<NounPhrase> sectionNounPhrases = new ArrayList<NounPhrase>();
+ List<Span> sectionNers = new ArrayList<Span>();
+ // Pass 1: collect the NER chunks and the noun phrase chunks of this sentence.
+ Iterator<Span> chunks = section.getEnclosed(EnumSet.of(SpanTypeEnum.Chunk));
+ while (chunks.hasNext()) {
+ Span chunk = chunks.next();
+
+ Value<NerTag> ner = chunk.getAnnotation(NlpAnnotations.NER_ANNOTATION);
+ if (ner != null) {
+ sectionNers.add(chunk);
+ }
+
+ Value<PhraseTag> phrase = chunk.getAnnotation(NlpAnnotations.PHRASE_ANNOTATION);
+ if (phrase != null && phrase.value().getCategory() == LexicalCategory.Noun) {
+ sectionNounPhrases.add(new NounPhrase(chunk, sentenceCnt));
+ }
+ }
+ // Pass 2: attach to each noun phrase the tokens and NER chunks lying within its span.
+ for (NounPhrase nounPhrase : sectionNounPhrases) {
+ Iterator<Span> tokens = section.getEnclosed(EnumSet.of(SpanTypeEnum.Token));
+
+ while (tokens.hasNext()) {
+ Span token = tokens.next();
+
+ if (nounPhrase.containsSpan(token)) {
+ nounPhrase.addToken(token);
+ }
+ }
+
+ for (Span sectionNer : sectionNers) {
+ if (nounPhrase.containsSpan(sectionNer)) {
+ nounPhrase.addNerChunk(sectionNer);
+ }
+ }
+ }
+
+ nounPhrases.addAll(sectionNounPhrases);
+ // Only sentences which contain at least one NER get an entry in the map.
+ if (!sectionNers.isEmpty()) {
+ ners.put(sentenceCnt, sectionNers);
+ }
+ }
+ }
+}
diff --git a/entitycoreference/src/main/java/org/apache/stanbol/enhancer/engines/entitycoreference/datamodel/NounPhrase.java b/entitycoreference/src/main/java/org/apache/stanbol/enhancer/engines/entitycoreference/datamodel/NounPhrase.java
new file mode 100644
index 0000000..9b916b6
--- /dev/null
+++ b/entitycoreference/src/main/java/org/apache/stanbol/enhancer/engines/entitycoreference/datamodel/NounPhrase.java
@@ -0,0 +1,164 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.stanbol.enhancer.engines.entitycoreference.datamodel;
+
+import java.util.ArrayList;
+import java.util.List;
+
+import org.apache.stanbol.enhancer.nlp.model.Span;
+
+/**
+ * Encapsulates span and sentence information about a noun phrase.
+ *
+ * @author Cristian Petroaca
+ *
+ */
+public class NounPhrase {
+ /**
+ * The {@link Span} which represents this noun phrase.
+ */
+ private Span chunk;
+
+ /*
+ * TODO - should use Set instead?
+ */
+ /**
+ * The {@link Span}s - tokens - which make up this noun phrase.
+ */
+ private List<Span> tokens;
+
+ /**
+ * The {@link Span}s contained in this noun phrase which represent Ners.
+ */
+ private List<Span> nerChunks;
+
+ /**
+ * The sentence index in which this noun phrase is found.
+ */
+ private int sentenceNo;
+ /** Creates a noun phrase without tokens and NERs; throws IllegalArgumentException if chunk is null. */
+ public NounPhrase(Span chunk, int sentenceNo) {
+ if (chunk == null) {
+ throw new IllegalArgumentException("Chunk cannot be null");
+ }
+
+ this.chunk = chunk;
+ this.tokens = new ArrayList<Span>();
+ this.nerChunks = new ArrayList<Span>();
+ this.sentenceNo = sentenceNo;
+ }
+
+ /**
+ * Gets the chunk representing this noun phrase.
+ *
+ * @return the chunk {@link Span} passed to the constructor
+ */
+ public Span getChunk() {
+ return chunk;
+ }
+
+ /**
+ * Adds a new token which is found in this noun phrase.
+ *
+ * @param token the token {@link Span} to append; it is not checked against the chunk boundaries
+ */
+ public void addToken(Span token) {
+ /*
+ * TODO - validate token boundaries within this noun phrase.
+ */
+ tokens.add(token);
+ }
+
+ /**
+ * Gets the list of tokens which make up this noun phrase.
+ *
+ * @return the internal, modifiable list of tokens in the order in which they were added
+ */
+ public List<Span> getTokens() {
+ return tokens;
+ }
+
+ /**
+ * Adds a new NER chunk which is found within this noun phrase.
+ *
+ * @param chunk the NER {@link Span} to append; it is not checked against the chunk boundaries
+ */
+ public void addNerChunk(Span chunk) {
+ /*
+ * TODO - validate NER boundaries within this noun phrase.
+ */
+ nerChunks.add(chunk);
+ }
+
+ /**
+ * Gets the list of NERs within this noun phrase.
+ *
+ * @return the internal, modifiable list of NER chunks in the order in which they were added
+ */
+ public List<Span> getNerChunks() {
+ return nerChunks;
+ }
+
+ /**
+ * Determines whether this noun phrase's {@link Span} contains the given {@link Span}.
+ *
+ * @param span the span to test
+ * @return true if the span starts at or after the chunk start and ends at or before the chunk end
+ */
+ public boolean containsSpan(Span span) {
+ return (span.getStart() >= chunk.getStart() && span.getEnd() <= chunk.getEnd());
+ }
+
+ /**
+ * Determines whether this noun phrase has NERs.
+ *
+ * @return true if at least one NER chunk was added
+ */
+ public boolean hasNers() {
+ return nerChunks.size() > 0;
+ }
+
+ /**
+ * Returns the sentence index in which this noun phrase is found.
+ *
+ * @return the sentence number given to the constructor
+ */
+ public int getSentenceNo() {
+ return this.sentenceNo;
+ }
+
+ public int hashCode() {
+ final int prime = 31;
+ int result = 1;
+ result = prime * result + chunk.hashCode();
+ result = prime * result + tokens.hashCode();
+ result = prime * result + nerChunks.hashCode();
+ // sentenceNo is not hashed although equals() compares it; equal phrases still get equal hashes.
+ return result;
+ }
+
+ public boolean equals(Object obj) {
+ if (this == obj) return true;
+ if (obj == null) return false;
+ if (getClass() != obj.getClass()) return false;
+
+ NounPhrase other = (NounPhrase) obj;
+
+ return chunk.equals(other.chunk) && tokens.equals(other.tokens) && nerChunks.equals(other.nerChunks)
+ && sentenceNo == other.sentenceNo;
+ }
+}
diff --git a/entitycoreference/src/main/java/org/apache/stanbol/enhancer/engines/entitycoreference/datamodel/PlaceAdjectival.java b/entitycoreference/src/main/java/org/apache/stanbol/enhancer/engines/entitycoreference/datamodel/PlaceAdjectival.java
new file mode 100644
index 0000000..2b59330
--- /dev/null
+++ b/entitycoreference/src/main/java/org/apache/stanbol/enhancer/engines/entitycoreference/datamodel/PlaceAdjectival.java
@@ -0,0 +1,82 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.stanbol.enhancer.engines.entitycoreference.datamodel;
+
+import org.apache.clerezza.rdf.core.UriRef;
+
+/**
+ * Value object for a place adjectival found in a text: its start and end index plus the
+ * {@link UriRef} of the place it points to.
+ *
+ * @author Cristian Petroaca
+ */
+public class PlaceAdjectival {
+    /**
+     * The start index of the adjectival in the span.
+     */
+    private int startIdx;
+
+    /**
+     * The end index of the adjectival in the span.
+     */
+    private int endIdx;
+
+    /**
+     * The {@link UriRef} of the place this adjectival points to.
+     */
+    private UriRef placeUri;
+
+    /**
+     * Creates a place adjectival from its boundaries and the place it refers to.
+     *
+     * @param startIdx the start index
+     * @param endIdx the end index
+     * @param placeUri the URI of the referenced place
+     */
+    public PlaceAdjectival(int startIdx, int endIdx, UriRef placeUri) {
+        this.placeUri = placeUri;
+        this.endIdx = endIdx;
+        this.startIdx = startIdx;
+    }
+
+    public int getStart() {
+        return startIdx;
+    }
+
+    public int getEnd() {
+        return endIdx;
+    }
+
+    public UriRef getPlaceUri() {
+        return placeUri;
+    }
+
+    public int hashCode() {
+        int hash = 31 + startIdx;
+        hash = 31 * hash + endIdx;
+        return 31 * hash + placeUri.hashCode();
+    }
+
+    public boolean equals(Object obj) {
+        if (obj == this) return true;
+        if (obj == null || obj.getClass() != getClass()) return false;
+
+        PlaceAdjectival that = (PlaceAdjectival) obj;
+        if (startIdx != that.startIdx || endIdx != that.endIdx) return false;
+        return placeUri.equals(that.placeUri);
+    }
+}
diff --git a/entitycoreference/src/main/java/org/apache/stanbol/enhancer/engines/entitycoreference/impl/CoreferenceFinder.java b/entitycoreference/src/main/java/org/apache/stanbol/enhancer/engines/entitycoreference/impl/CoreferenceFinder.java
new file mode 100644
index 0000000..48590c2
--- /dev/null
+++ b/entitycoreference/src/main/java/org/apache/stanbol/enhancer/engines/entitycoreference/impl/CoreferenceFinder.java
@@ -0,0 +1,402 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.stanbol.enhancer.engines.entitycoreference.impl;
+
+import static org.apache.stanbol.enhancer.nlp.NlpAnnotations.COREF_ANNOTATION;
+import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.RDFS_LABEL;
+import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.RDF_TYPE;
+
+import java.util.HashSet;
+import java.util.Iterator;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+
+import org.apache.clerezza.rdf.core.UriRef;
+import org.apache.stanbol.enhancer.engines.entitycoreference.EntityCoReferenceEngine;
+import org.apache.stanbol.enhancer.engines.entitycoreference.datamodel.NounPhrase;
+import org.apache.stanbol.enhancer.engines.entitycoreference.datamodel.PlaceAdjectival;
+import org.apache.stanbol.enhancer.nlp.NlpAnnotations;
+import org.apache.stanbol.enhancer.nlp.coref.CorefFeature;
+import org.apache.stanbol.enhancer.nlp.model.Span;
+import org.apache.stanbol.enhancer.nlp.model.annotation.Value;
+import org.apache.stanbol.enhancer.servicesapi.EngineException;
+import org.apache.stanbol.enhancer.servicesapi.rdf.OntologicalClasses;
+import org.apache.stanbol.entityhub.servicesapi.Entityhub;
+import org.apache.stanbol.entityhub.servicesapi.model.Entity;
+import org.apache.stanbol.entityhub.servicesapi.model.Text;
+import org.apache.stanbol.entityhub.servicesapi.query.Constraint;
+import org.apache.stanbol.entityhub.servicesapi.query.FieldQuery;
+import org.apache.stanbol.entityhub.servicesapi.query.FieldQueryFactory;
+import org.apache.stanbol.entityhub.servicesapi.query.QueryResultList;
+import org.apache.stanbol.entityhub.servicesapi.query.ReferenceConstraint;
+import org.apache.stanbol.entityhub.servicesapi.query.TextConstraint;
+import org.apache.stanbol.entityhub.servicesapi.site.Site;
+import org.apache.stanbol.entityhub.servicesapi.site.SiteManager;
+import org.osgi.service.cm.ConfigurationException;
+
+/**
+ * Uses the list of NERs and the list of {@link NounPhrase}s found in the analyzed text to find possible
+ * co-references.
+ *
+ * @author Cristian Petroaca
+ *
+ */
+public class CoreferenceFinder {
+    /** The configured {@link SiteManager} for {@link Entity} storage. */
+    private SiteManager siteManager;
+
+    /** The default {@link Entity} storage, used when no referenced site is configured. */
+    private Entityhub entityHub;
+
+    /** The name of the configured site for the {@link SiteManager}; may be null. */
+    private String referencedSiteID;
+
+    /** In memory cache storing {@link Entity} types which are often used. */
+    private InMemoryEntityTypeIndex entityTypeIndex;
+
+    /** Class holding configuration params. */
+    private CoreferenceFinderConfig config;
+
+    /** Holds vocabulary.dictionary info such as the list of place adjectivals by language. */
+    private Dictionaries dictionaries;
+
+    /**
+     * Creates a finder working on the given site (or on the Entityhub if referencedSiteID is null).
+     * The configuration and the place adjectival dictionaries of the given languages are loaded here.
+     */
+    public CoreferenceFinder(String[] languages, SiteManager siteManager, Entityhub entityHub,
+            String referencedSiteID, int maxDistance) throws ConfigurationException {
+        this.siteManager = siteManager;
+        this.entityHub = entityHub;
+        this.referencedSiteID = referencedSiteID;
+        this.entityTypeIndex = new InMemoryEntityTypeIndex();
+        this.config = new CoreferenceFinderConfig(maxDistance);
+        this.dictionaries = new Dictionaries(languages);
+    }
+
+    /**
+     * Performs the actual coreference resolution by iterating through all the NERs and all the
+     * {@link NounPhrase}s which start after the given NER and are located in a later sentence. Every
+     * match is written as a {@link NlpAnnotations#COREF_ANNOTATION} on both the NER and the noun phrase
+     * {@link Span}s.
+     *
+     * @param ners the NER spans grouped by the index of the sentence they were found in
+     * @param nounPhrases the candidate noun phrases
+     * @param language the language of the analyzed text
+     * @throws EngineException if the configured referenced site is not available
+     */
+    public void extractCorefs(Map<Integer,List<Span>> ners, List<NounPhrase> nounPhrases, String language) throws EngineException {
+        int maxDistance = this.config.getMaxDistance();
+        /*
+         * Going by its name, MAX_DISTANCE_NO_CONSTRAINT switches the sentence distance limit off. The
+         * distance comparison is therefore skipped for it instead of rejecting every noun phrase.
+         */
+        boolean unlimitedDistance = maxDistance == EntityCoReferenceEngine.MAX_DISTANCE_NO_CONSTRAINT;
+
+        for (Map.Entry<Integer,List<Span>> entry : ners.entrySet()) {
+            int nerSentenceNo = entry.getKey();
+
+            for (Span ner : entry.getValue()) {
+                Entity entity = null;
+                Set<String> typeLabels = null;
+                Set<Span> corefs = new HashSet<Span>();
+
+                for (NounPhrase nounPhrase : nounPhrases) {
+                    int sentenceDistance = nounPhrase.getSentenceNo() - nerSentenceNo;
+
+                    // Only noun phrases starting after the ner, in a later sentence within reach.
+                    if (nounPhrase.getChunk().getStart() <= ner.getStart() || sentenceDistance <= 0
+                        || (!unlimitedDistance && sentenceDistance > maxDistance)) {
+                        continue;
+                    }
+
+                    if (entity == null) {
+                        // Lazy lookup: only query for the entity once a candidate noun phrase exists.
+                        entity = lookupEntity(ner, language);
+
+                        // Without an entity there is nothing to compare with, go to the next ner.
+                        if (entity == null) break;
+
+                        typeLabels = buildEntityTypeLabels(entity, language);
+                    }
+
+                    if (isCoreferent(typeLabels, entity, ner, nounPhrase, language)) {
+                        Set<Span> coreferencedNer = new HashSet<Span>();
+                        coreferencedNer.add(ner);
+                        Span chunk = nounPhrase.getChunk();
+
+                        chunk.addAnnotation(COREF_ANNOTATION,
+                            Value.value(new CorefFeature(false, coreferencedNer)));
+                        corefs.add(chunk);
+                    }
+                }
+
+                if (!corefs.isEmpty()) {
+                    ner.addAnnotation(COREF_ANNOTATION, Value.value(new CorefFeature(true, corefs)));
+                }
+            }
+        }
+    }
+
+    /**
+     * Gets an Entity from the configured {@link Site} (or from the {@link Entityhub} if no site is
+     * configured) whose label matches the NER text and whose type matches the NER type.
+     *
+     * @param ner
+     *            the NER span; it is expected to carry a {@link NlpAnnotations#NER_ANNOTATION}
+     * @param language
+     *            the language used for the label constraint
+     * @return the first matching entity or null if nothing was found
+     * @throws EngineException
+     *             if the configured referenced site is not available
+     */
+    private Entity lookupEntity(Span ner, String language) throws EngineException {
+        Site site = getReferencedSite();
+        FieldQueryFactory queryFactory = site == null ? entityHub.getQueryFactory() : site.getQueryFactory();
+        FieldQuery query = queryFactory.createFieldQuery();
+
+        Constraint labelConstraint = new TextConstraint(ner.getSpan(), false, language, null);
+        query.setConstraint(RDFS_LABEL.getUnicodeString(), labelConstraint);
+        query.setConstraint(RDF_TYPE.getUnicodeString(),
+            new ReferenceConstraint(ner.getAnnotation(NlpAnnotations.NER_ANNOTATION).value().getType()
+                    .getUnicodeString()));
+        query.setLimit(1);
+
+        // Fall back to the Entityhub when no referenced site is configured.
+        QueryResultList<Entity> results = site == null ? entityHub.findEntities(query) : site
+                .findEntities(query);
+
+        // The limit is 1 so a non-empty result holds just the entry we are after.
+        return results.isEmpty() ? null : results.iterator().next();
+    }
+
+    /**
+     * Performs the coreference matching rules: 1. Match the entity type. 2. If the {@link NounPhrase}
+     * contains any NERs match the NER to any spatial/org membership/functional Entity properties from the
+     * {@link Site}. 3. If {@link NounPhrase} contains any place adjectivals perform spatial co-reference
+     * based on the entity spatial properties. 4. Otherwise accept the noun phrase only if the matched
+     * type label has more than one word.
+     *
+     * @param typeLabels
+     *            - a list of types (classes) that the given entity has.
+     * @param entity
+     *            - the entity for which we want to do the coref.
+     * @param ner
+     *            - the ner in the text for which we want to do the coref.
+     * @param nounPhrase
+     *            - the {@link NounPhrase} which we want to test for coref.
+     * @param language
+     *            - the language of the text.
+     * @return true if the noun phrase is considered a co-reference of the ner, false otherwise.
+     * @throws EngineException if the configured referenced site is not available.
+     */
+    private boolean isCoreferent(Set<String> typeLabels,
+                                 Entity entity,
+                                 Span ner,
+                                 NounPhrase nounPhrase,
+                                 String language) throws EngineException {
+        /*
+         * 1. Try to match the entity class to the noun phrase. The label with the most words wins.
+         */
+        String matchedClass = null;
+        String nounPhraseText = nounPhrase.getChunk().getSpan().toLowerCase();
+        int classStart = 0;
+        int classEnd = 0;
+
+        for (String typeLabel : typeLabels) {
+            /*
+             * The noun phrase text is lower cased, so the label has to be as well or labels with upper
+             * case letters could never match. The label is plain text, hence it is regex-quoted.
+             */
+            String label = typeLabel.toLowerCase();
+            String labelRegex = ".*\\b" + java.util.regex.Pattern.quote(label) + "\\b.*";
+
+            if (nounPhraseText.matches(labelRegex)
+                && (matchedClass == null || label.split("\\s").length > matchedClass.split("\\s").length)) {
+                matchedClass = label;
+                classStart = nounPhrase.getChunk().getStart() + nounPhraseText.indexOf(label);
+                classEnd = classStart + label.length();
+            }
+        }
+
+        if (matchedClass == null) return false;
+
+        /*
+         * 2. See if there are any NERs in the noun phrase to further identify the coref. Any NERs found
+         * should be separate words from the class matches from point 1.
+         */
+        /*
+         * TODO - add dbpprop: attributes to the rules ontology, such as dbpprop:nationality and
+         * dbpprop:industry and dbpprop:locationCountry
+         */
+        /*
+         * TODO - devise a coref confidence scheme?
+         */
+        if (nounPhrase.hasNers()) {
+            List<Span> npNers = nounPhrase.getNerChunks();
+            UriRef nerType = ner.getAnnotation(NlpAnnotations.NER_ANNOTATION).value().getType();
+
+            for (Span npNer : npNers) {
+                /*
+                 * Don't go any further if for some reason it turns out that the ner text is the same as the
+                 * entity class text.
+                 */
+                if ((npNer.getStart() >= classStart && npNer.getStart() <= classEnd)
+                    || (npNer.getEnd() >= classStart && npNer.getEnd() <= classEnd)) continue;
+
+                Entity npEntity = lookupEntity(npNer, language);
+
+                if (npEntity != null) {
+                    UriRef npNerType = npNer.getAnnotation(NlpAnnotations.NER_ANNOTATION).value().getType();
+                    Set<String> rulesOntologyAttr = new HashSet<String>();
+
+                    if (OntologicalClasses.DBPEDIA_PLACE.equals(npNerType)) {
+                        rulesOntologyAttr = this.config.getSpatialOntology(nerType);
+                    } else if (OntologicalClasses.DBPEDIA_ORGANISATION.equals(npNerType)) {
+                        /*
+                         * TODO - add organisation rules ontology in config and in dbpedia index
+                         */
+                    }
+
+                    if (valueExistsInEntityAttributes(rulesOntologyAttr, entity, npEntity.getId())) {
+                        return true;
+                    }
+                }
+            }
+        }
+
+        /*
+         * 3. Detect any place adjectivals in noun phrases and use them for spatial coreference. Any place
+         * adjectivals found should be separate words from the class matches from point 1.
+         */
+        PlaceAdjectival placeAdjectival = this.dictionaries.findPlaceAdjectival(language, nounPhrase);
+
+        if (placeAdjectival != null
+            && (placeAdjectival.getEnd() < classStart || placeAdjectival.getStart() > classEnd)) {
+            /*
+             * We use the same spatial rules ontology attributes as before.
+             */
+            Set<String> rulesOntologyAttr = this.config.getSpatialOntology(ner
+                    .getAnnotation(NlpAnnotations.NER_ANNOTATION).value().getType());
+
+            if (valueExistsInEntityAttributes(rulesOntologyAttr, entity, placeAdjectival.getPlaceUri()
+                    .getUnicodeString())) {
+                return true;
+            }
+        }
+
+        /*
+         * If there was no additional info to do the coref and if the entity class matched and has more than 1
+         * word then we consider this a good enough coreference.
+         */
+        return matchedClass.split("\\s").length > 1;
+    }
+
+    /**
+     * Builds a Set of Entity Type labels given the Entity type uris. Types excluded by the configuration
+     * are skipped and labels are served from the in memory index when it already holds them.
+     *
+     * @param entity the entity whose type values are resolved to labels
+     * @param language the language of the labels
+     * @return the labels of all types which are not excluded, possibly empty
+     * @throws EngineException if the configured referenced site is not available
+     */
+    private Set<String> buildEntityTypeLabels(Entity entity, String language) throws EngineException {
+        Iterator<Object> typeUris = entity.getRepresentation().get(RDF_TYPE.getUnicodeString());
+        Set<String> allTypeLabels = new HashSet<String>();
+
+        while (typeUris.hasNext()) {
+            String typeUri = typeUris.next().toString();
+
+            if (this.config.shouldExcludeClass(typeUri)) continue;
+
+            // First try the in memory index
+            UriRef typeRef = new UriRef(typeUri);
+            Set<String> labels = this.entityTypeIndex.lookupEntityType(typeRef, language);
+
+            if (labels == null) {
+                Site site = getReferencedSite();
+                Entity entityType = (site == null) ? this.entityHub.getEntity(typeUri) : site
+                        .getEntity(typeUri);
+
+                if (entityType != null) {
+                    labels = new HashSet<String>();
+                    Iterator<Text> labelIterator = entityType.getRepresentation().get(
+                        RDFS_LABEL.getUnicodeString(), language);
+
+                    while (labelIterator.hasNext()) {
+                        labels.add(labelIterator.next().getText());
+                    }
+
+                    this.entityTypeIndex.addEntityType(typeRef, language, labels);
+                }
+            }
+            if (labels != null) allTypeLabels.addAll(labels);
+        }
+
+        return allTypeLabels;
+    }
+
+    /**
+     * Checks whether any of the attributes in rulesOntologyAttr from the given Entity contain the given
+     * value.
+     *
+     * @param rulesOntologyAttr the attributes to inspect; null or empty yields false
+     * @param entity the entity whose attribute values are inspected
+     * @param value the value to look for, compared with the string form of each attribute value
+     * @return true if at least one attribute value equals the given value
+     */
+    private boolean valueExistsInEntityAttributes(Set<String> rulesOntologyAttr, Entity entity, String value) {
+        // The config lookup is a plain map get, so it answers null for types without rules.
+        if (rulesOntologyAttr == null) return false;
+
+        for (String attribute : rulesOntologyAttr) {
+            Iterator<Object> entityAttributes = entity.getRepresentation().get(attribute);
+
+            while (entityAttributes.hasNext()) {
+                if (entityAttributes.next().toString().equals(value)) {
+                    return true;
+                }
+            }
+        }
+
+        return false;
+    }
+
+    /**
+     * Retrieves the configured {@link Site} which holds the NER properties.
+     *
+     * @return the referenced site, or null if no referenced site id is configured
+     * @throws EngineException
+     *             if a site id is configured but the {@link SiteManager} does not return the site
+     */
+    private Site getReferencedSite() throws EngineException {
+        if (referencedSiteID == null) return null;
+
+        // lookup the referenced site and ensure that it is present
+        Site site = siteManager.getSite(referencedSiteID);
+
+        if (site == null) {
+            String msg = String.format(
+                "Unable to enhance because Referenced Site %s is currently not active!", referencedSiteID);
+            throw new EngineException(msg);
+        }
+
+        return site;
+    }
+}
diff --git a/entitycoreference/src/main/java/org/apache/stanbol/enhancer/engines/entitycoreference/impl/CoreferenceFinderConfig.java b/entitycoreference/src/main/java/org/apache/stanbol/enhancer/engines/entitycoreference/impl/CoreferenceFinderConfig.java
new file mode 100644
index 0000000..cc2c270
--- /dev/null
+++ b/entitycoreference/src/main/java/org/apache/stanbol/enhancer/engines/entitycoreference/impl/CoreferenceFinderConfig.java
@@ -0,0 +1,156 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.stanbol.enhancer.engines.entitycoreference.impl;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.Map;
+import java.util.Properties;
+import java.util.Set;
+
+import org.apache.clerezza.rdf.core.UriRef;
+import org.apache.stanbol.enhancer.engines.entitycoreference.Constants;
+import org.apache.stanbol.enhancer.servicesapi.rdf.OntologicalClasses;
+import org.osgi.service.cm.ConfigurationException;
+
+/**
+ * Contains configuration parameters for the {@link CoreferenceFinder}.
+ *
+ * @author Cristian Petroaca
+ *
+ */
+public class CoreferenceFinderConfig {
+ private static final String NAMED_ENTITY_CONFIG = "named_entity.properties";
+ private static final String SPATIAL_PLACE_ATTRIBUTES_PROP = "spatial.ont.place.attributes";
+ private static final String SPATIAL_ORG_ATTRIBUTES_PROP = "spatial.ont.organisation.attributes";
+ private static final String SPATIAL_PERSON_ATTRIBUTES_PROP = "spatial.ont.person.attributes";
+ private static final String ENTITY_CLASSES_TO_EXCLUDE_PROP = "entity.classes.to.exclude";
+
+ /**
+ * The maximum distance (in sentence numbers) between a NER and a {@link NounPhrase} for which we look for
+ * a coreference.
+ */
+ private int maxDistance;
+
+ /**
+ * The Uris for spatial properties for the NER to be inspected when doing the coref spatial match.
+ */
+ private Map<UriRef,Set<String>> spatialRulesOntology;
+
+ /**
+ * Entity classes which will not be used for coreference because they are too general.
+ */
+ private Set<String> entityClassesToExclude;
+
+ public CoreferenceFinderConfig(int maxDistance) throws ConfigurationException {
+ // First read the ontology from config used for entity properties matching
+ Properties props = new Properties();
+ InputStream in = null;
+
+ try {
+ in = CoreferenceFinderConfig.class.getResourceAsStream(Constants.CONFIG_FOLDER + "/"
+ + NAMED_ENTITY_CONFIG);
+ props.load(in);
+ } catch (IOException e) {
+ throw new ConfigurationException("", "Could not read " + NAMED_ENTITY_CONFIG);
+ } finally {
+ if (in != null) {
+ try {
+ in.close();
+ } catch (IOException e) {}
+ }
+ }
+
+ this.spatialRulesOntology = new HashMap<UriRef,Set<String>>();
+ Set<String> attributes = new HashSet<String>();
+
+ String placeAttributes = props.getProperty(SPATIAL_PLACE_ATTRIBUTES_PROP);
+ if (placeAttributes == null || placeAttributes.isEmpty()) {
+ throw new ConfigurationException(SPATIAL_PLACE_ATTRIBUTES_PROP, "Missing property in "
+ + NAMED_ENTITY_CONFIG);
+ }
+ for (String attribute : placeAttributes.split(",")) {
+ attributes.add(attribute);
+ }
+ this.spatialRulesOntology.put(OntologicalClasses.DBPEDIA_PLACE, attributes);
+
+ String orgAttributes = props.getProperty(SPATIAL_ORG_ATTRIBUTES_PROP);
+ if (orgAttributes == null || placeAttributes.isEmpty()) {
+ throw new ConfigurationException(SPATIAL_ORG_ATTRIBUTES_PROP, "Missing property in "
+ + NAMED_ENTITY_CONFIG);
+ }
+ attributes.clear();
+ for (String attribute : orgAttributes.split(",")) {
+ attributes.add(attribute);
+ }
+ this.spatialRulesOntology.put(OntologicalClasses.DBPEDIA_ORGANISATION, attributes);
+
+ String personAttributes = props.getProperty(SPATIAL_PERSON_ATTRIBUTES_PROP);
+ if (personAttributes == null || placeAttributes.isEmpty()) {
+ throw new ConfigurationException(SPATIAL_PERSON_ATTRIBUTES_PROP, "Missing property in "
+ + NAMED_ENTITY_CONFIG);
+ }
+ attributes.clear();
+ for (String attribute : personAttributes.split(",")) {
+ attributes.add(attribute);
+ }
+ this.spatialRulesOntology.put(OntologicalClasses.DBPEDIA_PERSON, attributes);
+
+ this.maxDistance = maxDistance;
+
+ String entityClassesToExcludeString = props.getProperty(ENTITY_CLASSES_TO_EXCLUDE_PROP);
+ if (entityClassesToExcludeString != null && !entityClassesToExcludeString.isEmpty()) {
+ this.entityClassesToExclude = new HashSet<String>();
+
+ for (String clazz : entityClassesToExcludeString.split(",")) {
+ this.entityClassesToExclude.add(clazz);
+ }
+ }
+ }
+
+ /**
+ * Gets the max distance parameter.
+ *
+ * @return
+ */
+ public int getMaxDistance() {
+ return maxDistance;
+ }
+
+ /**
+ * Gets the URIs for the spatial properties for a given Entity Type.
+ *
+ * @param uri
+ * of the Entity type for which we want to get the ontology.
+ * @return
+ */
+ public Set<String> getSpatialOntology(UriRef uri) {
+ return this.spatialRulesOntology.get(uri);
+ }
+
+ /**
+ * Checks whether we should exclude the given class based on our config.
+ *
+ * @param clazz
+ * @return
+ */
+ public boolean shouldExcludeClass(String clazz) {
+ return this.entityClassesToExclude.contains(clazz);
+ }
+}
diff --git a/entitycoreference/src/main/java/org/apache/stanbol/enhancer/engines/entitycoreference/impl/Dictionaries.java b/entitycoreference/src/main/java/org/apache/stanbol/enhancer/engines/entitycoreference/impl/Dictionaries.java
new file mode 100644
index 0000000..20021ce
--- /dev/null
+++ b/entitycoreference/src/main/java/org/apache/stanbol/enhancer/engines/entitycoreference/impl/Dictionaries.java
@@ -0,0 +1,183 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.stanbol.enhancer.engines.entitycoreference.impl;
+
+import java.io.BufferedReader;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.Properties;
+
+import org.apache.clerezza.rdf.core.UriRef;
+import org.apache.stanbol.enhancer.engines.entitycoreference.Constants;
+import org.apache.stanbol.enhancer.engines.entitycoreference.datamodel.NounPhrase;
+import org.apache.stanbol.enhancer.engines.entitycoreference.datamodel.PlaceAdjectival;
+import org.apache.stanbol.enhancer.nlp.model.Span;
+import org.osgi.service.cm.ConfigurationException;
+
+/**
+ * Contains information about several terms and properties of words we use in the {@link CoreferenceFinder}.
+ *
+ * @author Cristian Petroaca
+ *
+ */
+class Dictionaries {
+    private static final String PLACE_ADJECTIVALS_CONFIG = "config.properties";
+    private static final String ENTITY_BASE_URI = "entity.uri.base";
+
+    /**
+     * The place adjectivals in the form: language -> (lower cased adjectival -> UriRef of the place).
+     * Places can have several adjectivals, so several keys may point to the same UriRef.
+     */
+    private Map<String,Map<String,UriRef>> placeAdjectivalsMap;
+
+    /**
+     * Loads the place adjectival dictionaries of the given languages from the classpath.
+     *
+     * @param languages the languages to load; each one needs a resource named after it
+     * @throws ConfigurationException if config.properties or a language resource is missing or
+     *             unreadable, or if the entity base uri is not configured
+     */
+    public Dictionaries(String[] languages) throws ConfigurationException {
+        String entityBaseUri = readEntityBaseUri();
+        placeAdjectivalsMap = new HashMap<>();
+        for (String language : languages) {
+            placeAdjectivalsMap.put(language, readPlaceAdjectivals(language, entityBaseUri));
+        }
+    }
+
+    /** Reads the base uri which is prepended to the place names from config.properties. */
+    private static String readEntityBaseUri() throws ConfigurationException {
+        String resource = Constants.PLACE_ADJECTIVALS_FOLDER + "/" + PLACE_ADJECTIVALS_CONFIG;
+        InputStream in = Dictionaries.class.getResourceAsStream(resource);
+
+        // A missing resource yields a null stream which Properties.load() cannot handle.
+        if (in == null) throw new ConfigurationException("", "Could not find " + resource);
+
+        Properties props = new Properties();
+        try {
+            props.load(in);
+        } catch (IOException e) {
+            throw new ConfigurationException("", "Could not read " + PLACE_ADJECTIVALS_CONFIG, e);
+        } finally {
+            closeQuietly(in);
+        }
+
+        String entityBaseUri = props.getProperty(ENTITY_BASE_URI);
+        if (entityBaseUri == null || entityBaseUri.equals("")) {
+            throw new ConfigurationException(ENTITY_BASE_URI, "Missing property in " + PLACE_ADJECTIVALS_CONFIG);
+        }
+
+        return entityBaseUri;
+    }
+
+    /** Reads the "place TAB adjectival[,adjectival...]" lines of one language resource. */
+    private static Map<String,UriRef> readPlaceAdjectivals(String language, String entityBaseUri)
+            throws ConfigurationException {
+        String resource = Constants.PLACE_ADJECTIVALS_FOLDER + "/" + language;
+        InputStream langIn = Dictionaries.class.getResourceAsStream(resource);
+
+        if (langIn == null) throw new ConfigurationException("", "Could not find " + resource);
+
+        Map<String,UriRef> languagePlaceAdjMap = new HashMap<>();
+        BufferedReader reader = null;
+
+        try {
+            // NOTE(review): the resources are assumed to be UTF-8 encoded - confirm. Relying on the
+            // platform default charset would make the loaded adjectivals depend on the host setup.
+            reader = new BufferedReader(new InputStreamReader(langIn, "UTF-8"));
+            String line;
+
+            while ((line = reader.readLine()) != null) {
+                String[] splittedLine = line.split("\t");
+
+                // Skip blank or malformed lines instead of failing with an array index error.
+                if (splittedLine.length < 2) continue;
+
+                UriRef ref = new UriRef(entityBaseUri + splittedLine[0].trim());
+
+                for (String adjectival : splittedLine[1].split(",")) {
+                    languagePlaceAdjMap.put(adjectival.trim().toLowerCase(), ref);
+                }
+            }
+        } catch (IOException ioe) {
+            throw new ConfigurationException("", "Could not read " + resource, ioe);
+        } finally {
+            closeQuietly(reader);
+            closeQuietly(langIn);
+        }
+
+        return languagePlaceAdjMap;
+    }
+
+    /** Closes the resource, ignoring null and any failure raised by the close itself. */
+    private static void closeQuietly(java.io.Closeable resource) {
+        try {
+            if (resource != null) resource.close();
+        } catch (IOException e) {
+            // best effort: a failed close must not fail the configuration
+        }
+    }
+
+    /**
+     * Checks whether a {@link NounPhrase} contains a place adjectival and returns it.
+     *
+     * @param language the language whose dictionary is searched
+     * @param nounPhrase the noun phrase whose tokens are inspected
+     * @return the first {@link PlaceAdjectival} found in the {@link NounPhrase}, or null if there is none
+     *         or if no dictionary was loaded for the given language.
+     */
+    public PlaceAdjectival findPlaceAdjectival(String language, NounPhrase nounPhrase) {
+        Map<String,UriRef> langPlaceAdjectivalsMap = placeAdjectivalsMap.get(language);
+
+        // Languages which were not passed to the constructor have no dictionary, hence no match.
+        if (langPlaceAdjectivalsMap == null) return null;
+
+        List<Span> tokens = nounPhrase.getTokens();
+
+        // Try every 1-gram and 2-gram of the noun phrase; longer n-grams are not looked up.
+        for (int i = 0; i < tokens.size(); i++) {
+            Span currentToken = tokens.get(i);
+            String currentTokenString = currentToken.getSpan().toLowerCase();
+
+            // First the current 1-gram
+            UriRef placeUri = langPlaceAdjectivalsMap.get(currentTokenString);
+            if (placeUri != null) {
+                return new PlaceAdjectival(currentToken.getStart(), currentToken.getEnd(), placeUri);
+            }
+
+            /*
+             * Then the 2-gram with the token after it. The 2-gram with the token before it needs no
+             * lookup of its own: it was the "token after" 2-gram of the previous iteration.
+             */
+            if (i < tokens.size() - 1) {
+                Span nextToken = tokens.get(i + 1);
+                String twoGram = currentTokenString + " " + nextToken.getSpan().toLowerCase();
+
+                placeUri = langPlaceAdjectivalsMap.get(twoGram);
+                if (placeUri != null) {
+                    return new PlaceAdjectival(currentToken.getStart(), nextToken.getEnd(), placeUri);
+                }
+            }
+        }
+
+        return null;
+    }
+}
diff --git a/entitycoreference/src/main/java/org/apache/stanbol/enhancer/engines/entitycoreference/impl/InMemoryEntityTypeIndex.java b/entitycoreference/src/main/java/org/apache/stanbol/enhancer/engines/entitycoreference/impl/InMemoryEntityTypeIndex.java
new file mode 100644
index 0000000..3142291
--- /dev/null
+++ b/entitycoreference/src/main/java/org/apache/stanbol/enhancer/engines/entitycoreference/impl/InMemoryEntityTypeIndex.java
@@ -0,0 +1,75 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.stanbol.enhancer.engines.entitycoreference.impl;
+
+import java.util.HashMap;
+import java.util.Map;
+import java.util.Set;
+
+import org.apache.clerezza.rdf.core.UriRef;
+
+/**
+ * In-memory cache of Entity Type (Class) labels, keyed by class URI and then by language.
+ *
+ * @author Cristian Petroaca
+ *
+ */
+public class InMemoryEntityTypeIndex {
+    /**
+     * Class URI -> (language -> labels of the class in that language).
+     */
+    private Map<UriRef,Map<String,Set<String>>> index;
+
+    public InMemoryEntityTypeIndex() {
+        this.index = new HashMap<UriRef,Map<String,Set<String>>>();
+    }
+
+    /**
+     * Looks up the labels stored for a class URI in the given language.
+     *
+     * @param uri
+     *            the URI of the class
+     * @param language
+     *            the language of the labels
+     * @return the stored labels, or null if nothing was stored for the URI or for the language
+     */
+    public Set<String> lookupEntityType(UriRef uri, String language) {
+        Map<String,Set<String>> labelsByLanguage = index.get(uri);
+        return labelsByLanguage == null ? null : labelsByLanguage.get(language);
+    }
+
+    /**
+     * Stores the labels of a class URI for the given language, replacing any earlier entry.
+     *
+     * @param uri
+     *            the URI of the class
+     * @param language
+     *            the language of the labels
+     * @param labels
+     *            the labels to store
+     */
+    public void addEntityType(UriRef uri, String language, Set<String> labels) {
+        Map<String,Set<String>> labelsByLanguage = index.get(uri);
+
+        if (labelsByLanguage == null) {
+            labelsByLanguage = new HashMap<String,Set<String>>();
+            index.put(uri, labelsByLanguage);
+        }
+
+        labelsByLanguage.put(language, labels);
+    }
+}
diff --git a/entitycoreference/src/main/java/org/apache/stanbol/enhancer/engines/entitycoreference/impl/NounPhraseFilterer.java b/entitycoreference/src/main/java/org/apache/stanbol/enhancer/engines/entitycoreference/impl/NounPhraseFilterer.java
new file mode 100644
index 0000000..da72dc8
--- /dev/null
+++ b/entitycoreference/src/main/java/org/apache/stanbol/enhancer/engines/entitycoreference/impl/NounPhraseFilterer.java
@@ -0,0 +1,139 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.stanbol.enhancer.engines.entitycoreference.impl;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.Iterator;
+import java.util.List;
+import java.util.Map;
+import java.util.Properties;
+import java.util.Set;
+
+import org.apache.stanbol.enhancer.engines.entitycoreference.Constants;
+import org.apache.stanbol.enhancer.engines.entitycoreference.datamodel.NounPhrase;
+import org.apache.stanbol.enhancer.nlp.NlpAnnotations;
+import org.apache.stanbol.enhancer.nlp.model.Span;
+import org.apache.stanbol.enhancer.nlp.model.annotation.Value;
+import org.apache.stanbol.enhancer.nlp.pos.LexicalCategory;
+import org.apache.stanbol.enhancer.nlp.pos.Pos;
+import org.apache.stanbol.enhancer.nlp.pos.PosTag;
+import org.osgi.service.cm.ConfigurationException;
+
+/**
+ * Filters out bad {@link NounPhrase}s based on pos information.
+ *
+ * @author Cristian Petroaca
+ *
+ */
+/*
+ * TODO - create a NounPhraseFilterer interface with multiple implementations to separate languages with
+ * appositional definite article from the others.
+ */
+public class NounPhraseFilterer {
+    private final static String WITHIN_TEXT_DET_PROP = "within.text.referencing.determiners";
+    private final static short MIN_POS_NUMBER = 2;
+
+    /** Per language, the determiners which make a {@link NounPhrase} a valid coref mention. */
+    private Map<String,Set<String>> withinTextRefDeterminers;
+
+    /**
+     * Loads the determiners of every given language from that language's pos properties resource.
+     *
+     * @param languages the languages to load the determiners for
+     * @throws ConfigurationException if a resource is missing, cannot be read or lacks the property
+     */
+    public NounPhraseFilterer(String[] languages) throws ConfigurationException {
+        withinTextRefDeterminers = new HashMap<String,Set<String>>();
+
+        for (String language : languages) {
+            String propertiesFile = Constants.POS_CONFIG_FOLDER + "/" + language + ".properties";
+            InputStream in = NounPhraseFilterer.class.getResourceAsStream(propertiesFile);
+            if (in == null) { // getResourceAsStream reports a missing resource by returning null
+                throw new ConfigurationException("", "Could not find " + propertiesFile);
+            }
+            Properties props = new Properties();
+            try {
+                props.load(in);
+            } catch (IOException e) {
+                throw new ConfigurationException("", "Could not read " + propertiesFile, e);
+            } finally {
+                try {
+                    in.close();
+                } catch (IOException e) { /* best effort: a failed close leaves nothing to recover */ }
+            }
+
+            String determinersProperty = props.getProperty(WITHIN_TEXT_DET_PROP);
+            if (determinersProperty == null) {
+                throw new ConfigurationException(WITHIN_TEXT_DET_PROP, "Missing property in "
+                                                                       + propertiesFile);
+            }
+
+            Set<String> langDeterminerSet = new HashSet<String>();
+            for (String determiner : determinersProperty.split(",")) {
+                langDeterminerSet.add(determiner.trim()); // tolerates lists written as "the, this"
+            }
+            withinTextRefDeterminers.put(language, langDeterminerSet);
+        }
+    }
+
+    /**
+     * Removes, in place, the noun phrases which do not contain a determiner from the given config or have
+     * fewer than MIN_POS_NUMBER noun/adjective tokens. For a language without config every phrase is
+     * removed. TODO : should the minimum be configurable to be able to also include 1 word noun phrases?
+     *
+     * @param nounPhrases the phrases to filter; removal happens through the list's iterator
+     * @param language the language whose configured determiners are used
+     */
+    public void filter(List<NounPhrase> nounPhrases, String language) {
+        Set<String> langDeterminerSet = withinTextRefDeterminers.get(language);
+        Iterator<NounPhrase> it = nounPhrases.iterator();
+
+        while (it.hasNext()) {
+            boolean hasGoodDeterminer = false;
+            short nounNo = 0;
+
+            for (Span token : it.next().getTokens()) {
+                Value<PosTag> pos = token.getAnnotation(NlpAnnotations.POS_ANNOTATION);
+                if (pos == null) {
+                    continue;
+                }
+
+                PosTag posTag = pos.value();
+                if (posTag.hasCategory(LexicalCategory.Noun)
+                    || posTag.hasCategory(LexicalCategory.Adjective)) {
+                    nounNo++;
+                }
+                // The set is null for a language without config; no determiner can be a good one then.
+                if (!hasGoodDeterminer && posTag.hasPos(Pos.Determiner) && langDeterminerSet != null
+                    && langDeterminerSet.contains(token.getSpan().toLowerCase())) {
+                    hasGoodDeterminer = true;
+                }
+            }
+
+            if (!hasGoodDeterminer || nounNo < MIN_POS_NUMBER) {
+                it.remove();
+            }
+        }
+    }
+
+    public boolean supportsLanguage(String language) {
+        return withinTextRefDeterminers.containsKey(language);
+    }
+}
diff --git a/entitycoreference/src/main/resources/OSGI-INF/metatype/metatype.properties b/entitycoreference/src/main/resources/OSGI-INF/metatype/metatype.properties
new file mode 100644
index 0000000..b37924b
--- /dev/null
+++ b/entitycoreference/src/main/resources/OSGI-INF/metatype/metatype.properties
@@ -0,0 +1,41 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+stanbol.enhancer.engine.name.name=Name
+stanbol.enhancer.engine.name.description=The name of the enhancement engine as \
+used in the RESTful interface '/engine/<name>'
+
+service.ranking.name=Ranking
+service.ranking.description=If two enhancement engines with the same name are active the \
+one with the higher ranking will be used to process parsed content items.
+
+
+#===============================================================================
+#Properties and options used to configure the Entity Co-Reference engine
+#===============================================================================
+org.apache.stanbol.enhancer.engines.entitycoreference.EntityCoReferenceEngine.name=Apache \
+Stanbol Enhancer Engine: Entity Co-Reference
+org.apache.stanbol.enhancer.engines.entitycoreference.EntityCoReferenceEngine.description=An Engine that finds \
+co-references of Named Entities based on dbpedia/yago concepts.
+
+enhancer.engine.entitycoreference.languages.name=Language configuration
+enhancer.engine.entitycoreference.languages.description=Takes a list of ISO \
+ language codes. '*' is the Wildcard; '!{lang}' to exclude a language
+enhancer.engine.entitycoreference.referencedSiteId.name=Referenced Site
+enhancer.engine.entitycoreference.referencedSiteId.description=The ID of the \
+Entityhub Referenced Site holding the Entity Index.
+enhancer.engine.entitycoreference.maxDistance.name=Max sentence distance
+enhancer.engine.entitycoreference.maxDistance.description=The maximum sentence distance between the Ner \
+and the noun phrase which mentions it. -1 means no distance constraint.
diff --git a/entitycoreference/src/main/resources/config/named_entity.properties b/entitycoreference/src/main/resources/config/named_entity.properties
new file mode 100644
index 0000000..9071bb1
--- /dev/null
+++ b/entitycoreference/src/main/resources/config/named_entity.properties
@@ -0,0 +1,7 @@
+# Properties of an Entity that will be matched against when doing the coreference
+spatial.ont.person.attributes=http://dbpedia.org/ontology/birthPlace,http://dbpedia.org/ontology/region,http://dbpedia.org/ontology/nationality,http://dbpedia.org/ontology/country
+spatial.ont.organisation.attributes=http://dbpedia.org/ontology/foundationPlace,http://dbpedia.org/ontology/locationCity,http://dbpedia.org/ontology/location,http://dbpedia.org/ontology/hometown
+spatial.ont.place.attributes=http://dbpedia.org/ontology/country,http://dbpedia.org/ontology/subdivisionName,http://dbpedia.org/ontology/location
+
+# Entity classes that will be excluded when doing the coreference because they are too generic.
+entity.classes.to.exclude=http://dbpedia.org/ontology/Person,http://dbpedia.org/class/yago/LivingThing100004258,http://dbpedia.org/class/yago/PhysicalEntity100001930,http://dbpedia.org/class/yago/Abstraction100002137,http://dbpedia.org/class/yago/Organism100004475,http://dbpedia.org/class/yago/Location100027167,http://schema.org/Place,http://dbpedia.org/class/yago/Object100002684,http://dbpedia.org/class/yago/YagoGeoEntity,http://www.w3.org/2002/07/owl#Thing,http://dbpedia.org/class/yago/YagoPermanentlyLocatedEntity
\ No newline at end of file
diff --git a/entitycoreference/src/main/resources/config/pos/en.properties b/entitycoreference/src/main/resources/config/pos/en.properties
new file mode 100644
index 0000000..f754216
--- /dev/null
+++ b/entitycoreference/src/main/resources/config/pos/en.properties
@@ -0,0 +1,2 @@
+# Determiners of a noun phrase which determine that the noun phrase is a good candidate for coref.
+within.text.referencing.determiners=the,this,these
\ No newline at end of file
diff --git a/entitycoreference/src/main/resources/data/place_adjectivals/config.properties b/entitycoreference/src/main/resources/data/place_adjectivals/config.properties
new file mode 100644
index 0000000..2ca1f35
--- /dev/null
+++ b/entitycoreference/src/main/resources/data/place_adjectivals/config.properties
@@ -0,0 +1,2 @@
+# The base URL for place adjectival entities
+entity.uri.base=http://dbpedia.org/resource/
\ No newline at end of file
diff --git a/entitycoreference/src/main/resources/data/place_adjectivals/en b/entitycoreference/src/main/resources/data/place_adjectivals/en
new file mode 100644
index 0000000..effc95b
--- /dev/null
+++ b/entitycoreference/src/main/resources/data/place_adjectivals/en
@@ -0,0 +1,236 @@
+Abkhazia Abkhaz, Abkhazian
+Afghanistan Afghan
+Albania Albanian
+Algeria Algerian
+American_Samoa American Samoan
+Andorra Andorran
+Angola Angolan
+Anguilla Anguillan
+Antigua_and_Barbuda Antiguan, Barbudan
+Argentina Argentine, Argentinean, Argentinian
+Armenia Armenian
+Aruba Aruban
+Australia Australian
+Austria Austrian
+Azerbaijan Azerbaijani, Azeri
+Bahamas Bahamian
+Bahrain Bahraini
+Bangladesh Bangladeshi
+Barbados Barbadian
+Belarus Belarusian
+Belgium Belgian
+Belize Belizean
+Benin Beninese, Beninois
+Bermuda Bermudian, Bermudan
+Bhutan Bhutanese
+Bolivia Bolivian
+Bosnia_and_Herzegovina Bosnian, Bosniak, Herzegovinian
+Botswana Motswana, Botswanan
+Brazil Brazilian
+British_Virgin_Islands British Virgin Island
+Brunei Bruneian
+Bulgaria Bulgarian
+Burkina_Faso Burkinabè
+Burma Burmese
+Burundi Burundian
+Cambodia Cambodian
+Cameroon Cameroonian
+Canada Canadian
+Cape_Verde Cape Verdean
+Cayman_Islands Caymanian
+Central_African_Republic Central African
+Chad Chadian
+Chile Chilean
+China Chinese
+Christmas_Island Christmas Island
+Cocos_Islands Cocos Island
+Colombia Colombian
+Comoros Comorian
+Congo Congolese, Congo
+Cook_Islands Cook Island, Cook Islands
+Costa_Rica Costa Rican
+Côte_d'Ivoire Ivorian
+Croatia Croatian
+Cuba Cuban
+Cyprus Cypriot
+Czech_Republic Czech
+Denmark Danish
+Djibouti Djiboutian
+Dominica Dominican
+Dominican_Republic Dominican
+East_Timor Timorese
+Ecuador Ecuadorian
+Egypt Egyptian
+El_Salvador Salvadoran
+England English
+Equatorial_Guinea Equatorial Guinean, Equatoguinean
+Eritrea Eritrean
+Estonia Estonian
+Ethiopia Ethiopian
+Falkland_Islands Falkland Island
+Faroe_Islands Faroese
+Fiji Fijian
+Finland Finnish
+France French
+French_Guiana French Guianese
+French_Polynesia French Polynesian
+Gabon Gabonese
+Gambia Gambian
+Georgia Georgian
+Germany German
+Ghana Ghanaian
+Gibraltar Gibraltar
+Great_Britain British
+Greece Greek, Grecian, Hellenic
+Greenland Greenlandic
+Grenada Grenadian
+Guadeloupe Guadeloupe
+Guam Guamanian, Guambat
+Guatemala Guatemalan
+Guinea Guinean
+Guyana Guyanese
+Haiti Haitian
+Honduras Honduran
+Hong_Kong Hong Kong, Hongkongese
+Hungary Hungarian, Magyar
+Iceland Icelandic
+India Indian
+Indonesia Indonesian
+Iran Iranian, Persian
+Iraq Iraqi
+Ireland Irish
+Isle_of_Man Manx
+Israel Israeli
+Italy Italian, Italic
+Jamaica Jamaican
+Japan Japanese
+Jordan Jordanian
+Kazakhstan Kazakh, Kazakhstani
+Kenya Kenyan
+Kiribati I-Kiribati
+North_Korea North Korean
+South_Korea South Korean
+Kosovo Kosovar, Kosovan
+Kuwait Kuwaiti
+Kyrgyzstan Kyrgyzstani, Kyrgyz, Kirgiz, Kirghiz
+Laos Laotian, Lao
+Latvia Latvian
+Lebanon Lebanese
+Lesotho Basotho
+Liberia Liberian
+Libya Libyan
+Liechtenstein Liechtenstein
+Lithuania Lithuanian
+Luxembourg Luxembourg, Luxembourgish
+Macau Macanese, Chinese
+Macedonia Macedonian
+Madagascar Malagasy
+Malawi Malawian
+Malaysia Malaysian
+Maldives Maldivian
+Mali Malian
+Malta Maltese
+Marshall_Islands Marshallese
+Martinique Martiniquais, Martinican
+Mauritania Mauritanian
+Mauritius Mauritian
+Mayotte Mahoran
+Mexico Mexican
+Micronesia Micronesian
+Moldova Moldovan
+Monaco Monégasque, Monacan
+Mongolia Mongolian
+Montenegro Montenegrin
+Montserrat Montserratian
+Morocco Moroccan
+Mozambique Mozambican
+Namibia Namibian
+Nauru Nauruan
+Nepal Nepalese, Nepali
+Netherlands Dutch, Netherlandic
+New_Caledonia New Caledonian
+New_Zealand New Zealand, NZ
+Nicaragua Nicaraguan
+Niue Niuean
+Niger Nigerien
+Nigeria Nigerian
+Norway Norwegian
+Northern_Ireland Northern Irish, Irish
+Northern_Marianas Northern Marianan
+Oman Omani
+Pakistan Pakistani
+Palestine Palestinian
+Palau Palauan
+Panama Panamanian
+Papua_New_Guinea Papua New Guinean, Papuan
+Paraguay Paraguayan
+Peru Peruvian
+Philippines Philippine, Filipino
+Pitcairn_Island Pitcairn Island
+Poland Polish
+Portugal Portuguese
+Puerto_Rico Puerto Rican
+Qatar Qatari
+Ireland Irish
+Réunion Réunionese, Réunionnais
+Romania Romanian
+Russia Russian
+Rwanda Rwandan
+St._Helena St. Helenian
+St._Kitts_and_Nevis Kittitian, Nevisian
+St._Lucia St. Lucian
+Saint-Pierre_and_Miquelon Saint-Pierrais, Miquelonnais
+St._Vincent_and_the_Grenadines St. Vincentian, Vincentian
+Samoa Samoan
+San_Marino Sammarinese
+São_Tomé_and_Príncipe São Toméan
+Saudi_Arabia Saudi, Saudi Arabian
+Scotland Scots, Scottish, Scotch
+Senegal Senegalese
+Serbia Serbian
+Seychelles Seychellois
+Sierra_Leone Sierra Leonean
+Singapore Singaporean
+Slovakia Slovak
+Slovenia Slovenian, Slovene
+Solomon_Islands Solomon Island
+Somalia Somali, Somalian
+South_Africa South African
+South_Ossetia South Ossetian
+South_Sudan South Sudanese
+Spain Spanish
+Sri_Lanka Sri Lankan
+Sudan Sudanese
+Surinam Surinamese
+Swaziland Swazi
+Sweden Swedish
+Switzerland Swiss
+Syria Syrian
+Taiwan Taiwanese
+Tajikistan Tajikistani
+Tanzania Tanzanian
+Thailand Thai
+Togo Togolese
+Tonga Tongan
+Trinidad_and_Tobago Trinidadian, Tobagonian
+Tunisia Tunisian
+Turkey Turkish
+Turkmenistan Turkmen
+Tuvalu Tuvaluan
+Uganda Ugandan
+Ukraine Ukrainian
+United_Arab_Emirates Emirati, Emirian
+United_Kingdom British, UK
+United_States American, US
+Uruguay Uruguayan
+Uzbekistan Uzbekistani, Uzbek
+Vanuatu Ni-Vanuatu, Vanuatuan
+Venezuela Venezuelan
+Vietnam Vietnamese
+Virgin_Islands Virgin Island
+Wales Welsh
+Wallis_and_Futuna Wallisian, Futunan
+Western_Sahara Sahrawi, Sahrawian, Sahraouian
+Yemen Yemeni
+Zambia Zambian
+Zimbabwe Zimbabwean
\ No newline at end of file
diff --git a/entitycoreference/src/test/resources/log4j.properties b/entitycoreference/src/test/resources/log4j.properties
new file mode 100644
index 0000000..a7d5b65
--- /dev/null
+++ b/entitycoreference/src/test/resources/log4j.properties
@@ -0,0 +1,24 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Root logger option
+log4j.rootLogger=INFO, stdout
+
+# Direct log messages to stdout
+log4j.appender.stdout=org.apache.log4j.ConsoleAppender
+log4j.appender.stdout.Target=System.out
+log4j.appender.stdout.layout=org.apache.log4j.PatternLayout
+log4j.appender.stdout.layout.ConversionPattern=%d{ABSOLUTE} %5p %c{1}:%L - %m%n
+log4j.logger.org.apache.stanbol.enhancer.engines.entitycoreference=DEBUG
\ No newline at end of file
diff --git a/pom.xml b/pom.xml
new file mode 100644
index 0000000..ec5069f
--- /dev/null
+++ b/pom.xml
@@ -0,0 +1,31 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!-- Licensed to the Apache Software Foundation (ASF) under one or more contributor
+ license agreements. See the NOTICE file distributed with this work for additional
+ information regarding copyright ownership. The ASF licenses this file to
+ You under the Apache License, Version 2.0 (the "License"); you may not use
+ this file except in compliance with the License. You may obtain a copy of
+ the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required
+ by applicable law or agreed to in writing, software distributed under the
+ License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS
+ OF ANY KIND, either express or implied. See the License for the specific
+ language governing permissions and limitations under the License. -->
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+ xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
+
+ <modelVersion>4.0.0</modelVersion>
+
+ <groupId>org.apache.stanbol</groupId>
+ <artifactId>org.apache.stanbol.stanbol1279.reactor</artifactId>
+ <version>1.0.0-SNAPSHOT</version>
+ <packaging>pom</packaging>
+
+ <name>Apache Stanbol Entity Co-Mention Engine Reactor</name>
+ <description>
+ Reactor pom for the STANBOL-1279 reactor.
+ </description>
+
+ <modules>
+ <module>entitycoreference</module>
+ </modules>
+
+</project>