KeywordLinkingEngine: unit tests now work again; improved the limit used by the EntitySearcher. Default configuration: also corrected some bugs in the configuration; the KeywordLinkingEngine now uses 20 suggestions and a minimum of 1 found token (good for testing disambiguation, as it results in a lot of suggestions)

git-svn-id: https://svn.apache.org/repos/asf/incubator/stanbol/branches/disambiguation-engine@1379463 13f79535-47bb-0310-9956-ffa450edef68
diff --git a/defaults/src/main/resources/config/org.apache.stanbol.enhancer.chain.weighted.impl.WeightedChain-default.config b/defaults/src/main/resources/config/org.apache.stanbol.enhancer.chain.weighted.impl.WeightedChain-default.config
deleted file mode 100644
index 93851be..0000000
--- a/defaults/src/main/resources/config/org.apache.stanbol.enhancer.chain.weighted.impl.WeightedChain-default.config
+++ /dev/null
@@ -1,3 +0,0 @@
-stanbol.enhancer.chain.name="dbpedia-keyword-disambiguation"

-stanbol.enhancer.chain.weighted.chain=["tika;optional","metaxa;optional","langdetect","dbpediaKeyword","disambiguation-mlt"]

-service.ranking=I"0"
\ No newline at end of file
diff --git a/defaults/src/main/resources/config/org.apache.stanbol.enhancer.chain.weighted.impl.WeightedChain-defaultdisambiguation.config b/defaults/src/main/resources/config/org.apache.stanbol.enhancer.chain.weighted.impl.WeightedChain-defaultdisambiguation.config
new file mode 100644
index 0000000..54b70ad
--- /dev/null
+++ b/defaults/src/main/resources/config/org.apache.stanbol.enhancer.chain.weighted.impl.WeightedChain-defaultdisambiguation.config
@@ -0,0 +1,3 @@
+stanbol.enhancer.chain.name="default"

+stanbol.enhancer.chain.weighted.chain=["tika;optional","metaxa;optional","langdetect","ner","dbpediaLinking","entityhubExtraction","disambiguation-mlt"]

+service.ranking=I"-100"
\ No newline at end of file
diff --git a/defaults/src/main/resources/config/org.apache.stanbol.enhancer.chain.weighted.impl.WeightedChain-keyworddisambiguation.config b/defaults/src/main/resources/config/org.apache.stanbol.enhancer.chain.weighted.impl.WeightedChain-keyworddisambiguation.config
index 54b70ad..56e952e 100644
--- a/defaults/src/main/resources/config/org.apache.stanbol.enhancer.chain.weighted.impl.WeightedChain-keyworddisambiguation.config
+++ b/defaults/src/main/resources/config/org.apache.stanbol.enhancer.chain.weighted.impl.WeightedChain-keyworddisambiguation.config
@@ -1,3 +1,3 @@
-stanbol.enhancer.chain.name="default"

-stanbol.enhancer.chain.weighted.chain=["tika;optional","metaxa;optional","langdetect","ner","dbpediaLinking","entityhubExtraction","disambiguation-mlt"]

-service.ranking=I"-100"
\ No newline at end of file
+stanbol.enhancer.chain.name="dbpedia-keyword-disambiguation"
+stanbol.enhancer.chain.weighted.chain=["tika;optional","metaxa;optional","langdetect","dbpediaKeyword","disambiguation-mlt"]
+service.ranking=I"0"
\ No newline at end of file
diff --git a/defaults/src/main/resources/config/org.apache.stanbol.enhancer.engines.keywordextraction.engine.KeywordLinkingEngine-dbpediakeyword.config b/defaults/src/main/resources/config/org.apache.stanbol.enhancer.engines.keywordextraction.engine.KeywordLinkingEngine-dbpediakeyword.config
index b058dc9..ed2ca61 100644
--- a/defaults/src/main/resources/config/org.apache.stanbol.enhancer.engines.keywordextraction.engine.KeywordLinkingEngine-dbpediakeyword.config
+++ b/defaults/src/main/resources/config/org.apache.stanbol.enhancer.engines.keywordextraction.engine.KeywordLinkingEngine-dbpediakeyword.config
@@ -16,4 +16,5 @@
 org.apache.stanbol.enhancer.engines.keywordextraction.redirectField="rdfs:seeAlso"

 stanbol.enhancer.engine.name="dbpediaKeyword"

 org.apache.stanbol.enhancer.engines.keywordextraction.defaultMatchingLanguage="en"

-org.apache.stanbol.enhancer.engines.keywordextraction.keywordTokenizer=B"false"

+org.apache.stanbol.enhancer.engines.keywordextraction.keywordTokenizer=B"false"
+org.apache.stanbol.enhancer.engines.keywordextraction.minFoundTokens=I"1"

diff --git a/engines/disambiguation-mlt/src/main/java/org/apache/stanbol/enhancer/engine/disambiguation/mlt/DisambiguatorEngine.java b/engines/disambiguation-mlt/src/main/java/org/apache/stanbol/enhancer/engine/disambiguation/mlt/DisambiguatorEngine.java
index 00b2bd6..bd97154 100755
--- a/engines/disambiguation-mlt/src/main/java/org/apache/stanbol/enhancer/engine/disambiguation/mlt/DisambiguatorEngine.java
+++ b/engines/disambiguation-mlt/src/main/java/org/apache/stanbol/enhancer/engine/disambiguation/mlt/DisambiguatorEngine.java
@@ -262,9 +262,7 @@
                 disData.allSelectedTexts, 
                 window);
                 //savedEntity.getContext()); 
-            disambiguationContext = unionString(false,
-                Collections.singleton(savedEntity.getName()),
-                contextSelections);
+          disambiguationContext = unionString(false, contextSelections);
             
             //(2) I do not understand this variant (see comment for the 
             //    EntitiesInRange(..) method
@@ -278,6 +276,11 @@
 //                Collections.singleton(context), //the context
 //                contextSelections); //other selected parsed in the context
             
+            //or just the name of the entity AND the context
+//            disambiguationContext = unionString(false,
+//                Collections.singleton(savedEntity.getName()),
+//                contextSelections);
+            
             //(4) TODO: I would also like to have the possibility to disambiguate
             //    using URIs of Entities suggested for other TextAnnotations
             //    within the context.
diff --git a/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/engine/KeywordLinkingEngine.java b/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/engine/KeywordLinkingEngine.java
index 38f21e2..98cf079 100644
--- a/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/engine/KeywordLinkingEngine.java
+++ b/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/engine/KeywordLinkingEngine.java
@@ -205,7 +205,12 @@
      * The literal representing the LangIDEngine as creator.
      */
     public static final Literal LANG_ID_ENGINE_NAME = LiteralFactory.getInstance().createTypedLiteral("org.apache.stanbol.enhancer.engines.langid.LangIdEnhancementEngine");
-    
+
+    /**
+     * The default value for the LIMIT of the {@link EntitySearcher}
+     */
+    private static final int DEFAULT_ENTITY_SEARCHER_LIMIT = 10;
+
     private EntitySearcher entitySearcher;
     private EntityLinkerConfig linkerConfig;
     private TextAnalyzerConfig nlpConfig;
@@ -873,9 +878,9 @@
         }
         //TODO: make limit configurable!
         if(Entityhub.ENTITYHUB_IDS.contains(referencedSiteName.toLowerCase())){
-            entitySearcher = new EntityhubSearcher(context.getBundleContext(),10);
+            entitySearcher = new EntityhubSearcher(context.getBundleContext(),DEFAULT_ENTITY_SEARCHER_LIMIT);
         } else {
-            entitySearcher = new ReferencedSiteSearcher(context.getBundleContext(),referencedSiteName,10);
+            entitySearcher = new ReferencedSiteSearcher(context.getBundleContext(),referencedSiteName,DEFAULT_ENTITY_SEARCHER_LIMIT);
         }
     }
     /**
diff --git a/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/linking/EntityLinker.java b/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/linking/EntityLinker.java
index 13c4158..fb69c75 100644
--- a/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/linking/EntityLinker.java
+++ b/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/linking/EntityLinker.java
@@ -56,6 +56,8 @@
      * The map holding the results of the linking process
      */
     private final Map<String,LinkedEntity> linkedEntities = new HashMap<String,LinkedEntity>();
+
+    private Integer lookupLimit;
     
     /**
      * After {@link #process()}ing this returns the entities linked for the
@@ -79,6 +81,7 @@
         this.entitySearcher = taxonomy;
         this.config = config;
         this.state = new ProcessingState(content.getAnalysedText());
+        this.lookupLimit  = Math.max(10,config.getMaxSuggestions()*2);
     }
     /**
      * Steps over the sentences, chunks, tokens of the {@link #sentences}
@@ -289,8 +292,11 @@
     private List<Suggestion> lookupEntities(List<String> searchStrings) throws EngineException {
         Collection<? extends Representation> results;
         try {
-            results = entitySearcher.lookup(config.getNameField(),config.getSelectedFields(),
-            searchStrings, state.getSentence().getLanguage(),config.getDefaultLanguage());
+            results = entitySearcher.lookup(config.getNameField(),
+                config.getSelectedFields(),
+                searchStrings, 
+                new String[]{state.getSentence().getLanguage(),config.getDefaultLanguage()},
+                lookupLimit);
         } catch (RuntimeException e) {
             throw new EngineException(e.getMessage(),e);
         }
@@ -555,6 +561,7 @@
                 //processable tokens are counted, but Exact also checks
                 //of non-processable!
                 foundTokens = coveredTokens;
+                foundProcessableTokens = coveredProcessableTokens;
             } else if((foundProcessableTokens >= config.getMinFoundTokens() ||
                     //NOTE (rwesten, 2012-05-21): Do not check if all covered
                     //  Tokens are found, but if all Tokens of the Label are
diff --git a/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/linking/EntitySearcher.java b/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/linking/EntitySearcher.java
index 96564a4..53aaa05 100644
--- a/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/linking/EntitySearcher.java
+++ b/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/linking/EntitySearcher.java
@@ -44,10 +44,11 @@
      * to be included. Other fields MAY also be included.
      * @param search the tokens to search for. MUST NOT be <code>null</code>
      * @param languages the languages to include in the search 
+     * @param limit The maximum number of results or <code>null</code> to use the default
      * @return the Representations found for the specified query
      * @throws T An exception while searching for concepts
      */
-    Collection<? extends Representation> lookup(String field,Set<String> includeFields,List<String> search,String...languages) throws IllegalStateException;
+    Collection<? extends Representation> lookup(String field, Set<String> includeFields, List<String> search, String[] languages,Integer limit) throws IllegalStateException;
     /**
      * Lookup a concept of the taxonomy by the id.
      * @param id the id
diff --git a/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/linking/impl/EntityhubSearcher.java b/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/linking/impl/EntityhubSearcher.java
index 44d902a..37872a5 100644
--- a/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/linking/impl/EntityhubSearcher.java
+++ b/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/linking/impl/EntityhubSearcher.java
@@ -62,13 +62,19 @@
     public Collection<? extends Representation> lookup(String field,
                                            Set<String> includeFields,
                                            List<String> search,
-                                           String... languages) throws IllegalStateException {
+                                           String[] languages,
+                                           Integer limit) throws IllegalStateException {
         Entityhub entityhub = getSearchService();
         if(entityhub == null){
             throw new IllegalStateException("The Entityhub is currently not active");
         }
         FieldQuery query = EntitySearcherUtils.createFieldQuery(entityhub.getQueryFactory(),
             field, includeFields, search, languages);
+        if(limit != null && limit > 0){
+            query.setLimit(limit);
+        } else if(this.limit != null){
+            query.setLimit(this.limit);
+        }
         QueryResultList<Representation> results;
         try {
             results = entityhub.find(query);
diff --git a/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/linking/impl/ReferencedSiteSearcher.java b/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/linking/impl/ReferencedSiteSearcher.java
index 6dbcef4..c6a0339 100644
--- a/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/linking/impl/ReferencedSiteSearcher.java
+++ b/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/linking/impl/ReferencedSiteSearcher.java
@@ -69,7 +69,8 @@
     public Collection<? extends Representation> lookup(String field,
                                            Set<String> includeFields,
                                            List<String> search,
-                                           String... languages) throws IllegalStateException {
+                                           String[] languages,
+                                           Integer limit) throws IllegalStateException {
         //build the query and than return the result
         Site site = getSearchService();
         if(site == null){
@@ -77,8 +78,10 @@
         }
         FieldQuery query = EntitySearcherUtils.createFieldQuery(site.getQueryFactory(), 
             field, includeFields, search, languages);
-        if(limit != null){
+        if(limit != null && limit > 0){
             query.setLimit(limit);
+        } else if(this.limit != null){
+            query.setLimit(this.limit);
         }
         QueryResultList<Representation> results;
         try {
diff --git a/engines/keywordextraction/src/test/java/org/apache/stanbol/enhancer/engines/keywordextraction/impl/TestSearcherImpl.java b/engines/keywordextraction/src/test/java/org/apache/stanbol/enhancer/engines/keywordextraction/impl/TestSearcherImpl.java
index 4616dbb..07622bf 100644
--- a/engines/keywordextraction/src/test/java/org/apache/stanbol/enhancer/engines/keywordextraction/impl/TestSearcherImpl.java
+++ b/engines/keywordextraction/src/test/java/org/apache/stanbol/enhancer/engines/keywordextraction/impl/TestSearcherImpl.java
@@ -74,7 +74,8 @@
     public Collection<? extends Representation> lookup(String field,
                                            Set<String> includeFields,
                                            List<String> search,
-                                           String... languages) throws IllegalStateException {
+                                           String[] languages,
+                                           Integer limit) throws IllegalStateException {
         if(field.equals(nameField)){
             //we do not need sorting
             //Representation needs to implement equals, therefore results filters multiple matches