/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.stanbol.enhancer.engines.lucenefstlinking;
import static org.apache.stanbol.enhancer.engines.entitylinking.config.TextProcessingConfig.PROCESSED_LANGUAGES;
import static org.apache.stanbol.enhancer.engines.entitylinking.config.TextProcessingConfig.PROCESS_ONLY_PROPER_NOUNS_STATE;
import static org.apache.stanbol.enhancer.engines.lucenefstlinking.FstLinkingEngine.FISE_ORIGIN;
import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.DC_CREATOR;
import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.DC_LANGUAGE;
import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.ENHANCER_EXTRACTED_FROM;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertNotNull;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Dictionary;
import java.util.HashMap;
import java.util.Hashtable;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.TreeSet;
import java.util.concurrent.Executors;
import java.util.concurrent.Future;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.TimeoutException;
import org.apache.clerezza.rdf.core.Literal;
import org.apache.clerezza.rdf.core.LiteralFactory;
import org.apache.clerezza.rdf.core.Resource;
import org.apache.clerezza.rdf.core.Triple;
import org.apache.clerezza.rdf.core.UriRef;
import org.apache.clerezza.rdf.core.impl.PlainLiteralImpl;
import org.apache.clerezza.rdf.core.impl.TripleImpl;
import org.apache.solr.client.solrj.SolrServer;
import org.apache.solr.client.solrj.embedded.EmbeddedSolrServer;
import org.apache.solr.core.SolrCore;
import org.apache.stanbol.commons.solr.IndexReference;
import org.apache.stanbol.commons.solr.managed.ManagedSolrServer;
import org.apache.stanbol.commons.solr.managed.standalone.StandaloneEmbeddedSolrServerProvider;
import org.apache.stanbol.enhancer.contentitem.inmemory.InMemoryContentItemFactory;
import org.apache.stanbol.enhancer.engines.entitylinking.config.EntityLinkerConfig;
import org.apache.stanbol.enhancer.engines.entitylinking.config.EntityLinkerConfig.RedirectProcessingMode;
import org.apache.stanbol.enhancer.engines.entitylinking.config.TextProcessingConfig;
import org.apache.stanbol.enhancer.engines.lucenefstlinking.cache.FastLRUCacheManager;
import org.apache.stanbol.enhancer.nlp.json.AnalyzedTextParser;
import org.apache.stanbol.enhancer.nlp.model.AnalysedText;
import org.apache.stanbol.enhancer.nlp.model.AnalysedTextFactory;
import org.apache.stanbol.enhancer.nlp.utils.LanguageConfiguration;
import org.apache.stanbol.enhancer.servicesapi.Blob;
import org.apache.stanbol.enhancer.servicesapi.ContentItem;
import org.apache.stanbol.enhancer.servicesapi.ContentItemFactory;
import org.apache.stanbol.enhancer.servicesapi.EngineException;
import org.apache.stanbol.enhancer.servicesapi.EnhancementEngine;
import org.apache.stanbol.enhancer.servicesapi.helper.EnhancementEngineHelper;
import org.apache.stanbol.enhancer.servicesapi.impl.StreamSource;
import org.apache.stanbol.enhancer.servicesapi.rdf.Properties;
import org.apache.stanbol.enhancer.servicesapi.rdf.TechnicalClasses;
import org.apache.stanbol.enhancer.test.helper.EnhancementStructureHelper;
import org.apache.stanbol.entityhub.servicesapi.model.Representation;
import org.apache.stanbol.entityhub.servicesapi.util.ModelUtils;
import org.apache.stanbol.entityhub.yard.solr.impl.SolrYard;
import org.apache.stanbol.entityhub.yard.solr.impl.SolrYardConfig;
import org.junit.After;
import org.junit.AfterClass;
import org.junit.Assert;
import org.junit.Before;
import org.junit.BeforeClass;
import org.junit.Test;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
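/**
* Unit test for the {@link FstLinkingEngine}. It initialises an
* {@link EmbeddedSolrServer} based on the {@value #TEST_SOLR_CORE_CONFIGURATION}
* index, creates the FST corpora and validates the fise:TextAnnotations and
* fise:EntityAnnotations created for the text of {@value #TEST_TEXT_FILE}.
*/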
public class FstLinkingEngineTest {
private final static Logger log = LoggerFactory.getLogger(FstLinkingEngineTest.class);
/**
* The SolrYard used by {@link #validateTestIndex()} to check that all
* {@link #REQUIRED_ENTITIES} are present in the data.<p>
* NOTE that the {@link FstLinkingEngine} does NOT require a SolrYard, but
* operates directly on the {@link #core}.
*/
protected static SolrYard yard;
protected static SolrCore core;
private static IndexConfiguration fstConfig;
/**
* The id of the {@link SolrYard} used to validate the test data
*/
public static final String TEST_YARD_ID = "dbpedia";
public static final String TEST_SOLR_CORE_NAME = "dbpedia";
public static final String TEST_ORIGIN = "test.origin";
public static final String TEST_SOLR_CORE_CONFIGURATION = "dbpedia_26k.solrindex.bz2";
protected static final String TEST_INDEX_REL_PATH = File.separatorChar + "target" + File.separatorChar
+ ManagedSolrServer.DEFAULT_SOLR_DATA_DIR;
/**
* The maximum time (in seconds) this test waits for the creation of an
* FST model
*/
public static final long FST_CREATION_WAIT_TIME = 2000; //seconds
public static final String TEST_TEXT_FILE = "merkel.txt";
public static final String TEST_TEXT_NLP_FILE = "merkel_nlp.json";
private static final Literal EN_LANGUAGE = LiteralFactory.getInstance().createTypedLiteral("en");
protected static final String DBPEDIA = "http://dbpedia.org/resource/";
/**
* List used in {@link #validateTestIndex()} to validate that all expected
* entities are contained in the SolrYard initialised based on the
* {@link #TEST_SOLR_CORE_CONFIGURATION}.
*/
private static final List<String> REQUIRED_ENTITIES = Arrays.asList(
DBPEDIA+"Christian_Democratic_Union_(Germany)", DBPEDIA+"Angela_Merkel",
DBPEDIA+"Germany", DBPEDIA+"Social_Democratic_Party_of_Germany",
DBPEDIA+"Greece");
private ContentItemFactory cif = InMemoryContentItemFactory.getInstance();
private AnalysedTextFactory atf = AnalysedTextFactory.getDefaultInstance();
private ContentItem ci;
private String content;
/**
* Used with the {@link EnhancementStructureHelper} to validate Enhancement
* results
*/
private static Map<UriRef,Resource> EXPECTED_ENHANCEMENT_VALUES;
static{
EXPECTED_ENHANCEMENT_VALUES = new HashMap<UriRef,Resource>();
EXPECTED_ENHANCEMENT_VALUES.put(DC_CREATOR, LiteralFactory.getInstance().createTypedLiteral(
FstLinkingEngine.class.getName()));
//adding null as expected for confidence makes it a required property
EXPECTED_ENHANCEMENT_VALUES.put(Properties.ENHANCER_CONFIDENCE, null);
}
@BeforeClass
public static void setup() throws Exception {
// get the working directory
// use property substitution to test this feature!
String prefix = System.getProperty("basedir") == null ? "." : "${basedir}";
String solrServerDir = prefix + TEST_INDEX_REL_PATH;
log.info("Test Solr Server Directory: {}", solrServerDir);
System.setProperty(ManagedSolrServer.MANAGED_SOLR_DIR_PROPERTY, solrServerDir);
SolrYardConfig config = new SolrYardConfig(TEST_YARD_ID, TEST_SOLR_CORE_NAME);
config.setIndexConfigurationName(TEST_SOLR_CORE_CONFIGURATION); //the dbpedia default data
config.setAllowInitialisation(true); //init from the datafile provider
config.setName("DBpedia.org default data");
config.setDescription("Data used for FstLinkingEngine tests");
// create the Yard used for the tests
IndexReference solrIndexRef = IndexReference.parse(config.getSolrServerLocation());
SolrServer server = StandaloneEmbeddedSolrServerProvider.getInstance().getSolrServer(
solrIndexRef, config.getIndexConfigurationName());
Assert.assertNotNull("Unable to initialise SolrServer for testing",server);
core = ((EmbeddedSolrServer)server).getCoreContainer().getCore(
solrIndexRef.getIndex());
Assert.assertNotNull("Unable to get SolrCore '" + config.getIndexConfigurationName()
+ "' from SolrServer "+server, core);
yard = new SolrYard(server,config,null);
//setup the index configuration
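//the language configuration uses the syntax '<lang>(;<param>=<value>)*':
//'field' selects the Solr field used to build the FST corpus and
//'generate=true' allows missing FST models to be created at runtime
//(see the corpus.allowCreation assertion below)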
LanguageConfiguration langConf = new LanguageConfiguration("not.used",
new String[]{"en;field=dbpedia-ont:surfaceForm;generate=true"});
fstConfig = new IndexConfiguration(langConf, core, FieldEncodingEnum.SolrYard,"");
fstConfig.setExecutorService(Executors.newFixedThreadPool(1));
fstConfig.setTypeField("rdf:type");
fstConfig.setRankingField("entityhub:entityRank");
//fstConfig.setEntityCacheManager(new FastLRUCacheManager(2048));
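//the configured origin is expected to be added as fise:origin to all
//created fise:EntityAnnotations (asserted in validateEnhancements(..))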
fstConfig.setOrigin(new PlainLiteralImpl(TEST_ORIGIN));
//activate the FST config
fstConfig.activate(); //activate this configuration
//validate that the index contains the expected entities
validateTestIndex();
//now create the FST models
List<Future<?>> creationTasks = new ArrayList<Future<?>>();
for(CorpusInfo corpus : fstConfig.getCorpora()){
Assert.assertTrue("Failure in UnitTest - all FST models need to be generate=true",
corpus.allowCreation);
if(!corpus.isFstFile()){
//create a task on the FST corpus creation service
creationTasks.add(fstConfig.getExecutorService().submit(
new CorpusCreationTask(fstConfig, corpus)));
}
}
//and wait until all models are built (this should only take a few
//seconds on typical hardware)
for(Future<?> future : creationTasks){
try {
future.get(FST_CREATION_WAIT_TIME,TimeUnit.SECONDS);
} catch (TimeoutException e) {
// we assert on future.isDone instead
}
Assert.assertTrue("FST Model creation not finished after "
+ FST_CREATION_WAIT_TIME +"seconds", future.isDone());
}
}
private static void validateTestIndex() throws Exception {
log.info("check availability of {} entities", REQUIRED_ENTITIES.size());
for(String context : REQUIRED_ENTITIES){
log.debug(" > check Entity {}",context);
Representation rep = yard.getRepresentation(context);
assertNotNull(rep);
assertEquals(context, rep.getId());
if(log.isDebugEnabled()){
log.debug("Data for Entity {}: \n {}",rep.getId(),
ModelUtils.getRepresentationInfo(rep));
}
}
log.info(" ... all Entities present");
}
@AfterClass
public static void cleanup() throws Exception {
if(yard != null){
yard.close();
}
yard = null;
}
/**
* Initialises the {@link #ci} and {@link #content} fields for the tests.
* It creates a ContentItem containing a <code>text/plain</code>
* {@link Blob} for the {@value #TEST_TEXT_FILE} and an {@link AnalysedText}
* filled with the NLP analysis results stored in
* {@link #TEST_TEXT_NLP_FILE}.
* @throws IOException on any IO related error while reading the test files
*/
@Before
public void setupTest() throws IOException {
//create a contentItem for the plain text used for testing
InputStream is = FstLinkingEngineTest.class.getClassLoader().getResourceAsStream(TEST_TEXT_FILE);
Assert.assertNotNull("Unable to load '"+TEST_TEXT_FILE+"' via classpath",is);
ContentItem ci = cif.createContentItem(new StreamSource(is,"text/plain"));
AnalysedText at = atf.createAnalysedText(ci, ci.getBlob());
is.close();
//parse the prepared NLP results and add it to the ContentItem
is = FstLinkingEngineTest.class.getClassLoader().getResourceAsStream(TEST_TEXT_NLP_FILE);
Assert.assertNotNull("Unable to load '"+TEST_TEXT_NLP_FILE+"' via classpath",is);
AnalyzedTextParser.getDefaultInstance().parse(is, Charset.forName("UTF-8"), at);
is.close();
//set the language of the contentItem
ci.getMetadata().add(new TripleImpl(ci.getUri(), DC_LANGUAGE,
EN_LANGUAGE));
//set the contentItem and also the content
this.ci = ci;
this.content = at.getText().toString();
}
@After
public void cleanupTest() {
ci = null;
content = null;
}
@Test
public void testFstLinkingWithProperNouns() throws Exception {
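//configure the text processing so that only proper nouns are linked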
Dictionary<String,Object> dict = new Hashtable<String,Object>();
dict.put(PROCESSED_LANGUAGES, Arrays.asList("en;lmmtip;uc=LINK;prob=0.75;pprob=0.75"));
dict.put(PROCESS_ONLY_PROPER_NOUNS_STATE, true);
TextProcessingConfig tpc = TextProcessingConfig.createInstance(dict);
EntityLinkerConfig elc = new EntityLinkerConfig();
elc.setMinFoundTokens(2);//this is assumed by this test
elc.setRedirectProcessingMode(RedirectProcessingMode.FOLLOW);
FstLinkingEngine engine = new FstLinkingEngine("proper-noun-linking",
LinkingModeEnum.LINKABLE_TOKEN, fstConfig, tpc, elc, null);
processContentItem(engine);
validateEnhancements(
Arrays.asList(
"Chancellor", "Angela Merkel", "Greece", "Greeks", "Germany", "SPD"),
Arrays.asList(
DBPEDIA+"Christian_Democratic_Union_(Germany)",
DBPEDIA+"Angela_Merkel", DBPEDIA+"Greece", DBPEDIA+"Germany",
DBPEDIA+"Social_Democratic_Party_of_Germany"));
}
@Test
public void testFstLinkingWithNouns() throws Exception {
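//same configuration as above, but with linking of common nouns enabled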
Dictionary<String,Object> dict = new Hashtable<String,Object>();
dict.put(PROCESSED_LANGUAGES, Arrays.asList("en;lmmtip;uc=LINK;prob=0.75;pprob=0.75"));
dict.put(PROCESS_ONLY_PROPER_NOUNS_STATE, false);
TextProcessingConfig tpc = TextProcessingConfig.createInstance(dict);
EntityLinkerConfig elc = new EntityLinkerConfig();
elc.setMinFoundTokens(2);//this is assumed by this test
elc.setRedirectProcessingMode(RedirectProcessingMode.FOLLOW);
FstLinkingEngine engine = new FstLinkingEngine("noun-linking",
LinkingModeEnum.LINKABLE_TOKEN, fstConfig, tpc, elc, null);
processContentItem(engine);
validateEnhancements(
Arrays.asList(
"Chancellor", "Angela Merkel", "Greece", "Greeks", "Germany", "SPD",
"change","election", "party", "policy"),
Arrays.asList(
DBPEDIA+"Christian_Democratic_Union_(Germany)",
DBPEDIA+"Angela_Merkel", DBPEDIA+"Greece", DBPEDIA+"Germany",
DBPEDIA+"Social_Democratic_Party_of_Germany", DBPEDIA+"Chancellor",
DBPEDIA+"Election", DBPEDIA+"Party", DBPEDIA+"Policy"));
}
/**
* Validates the fise:TextAnnotations and fise:EntityAnnotations created for
* the {@link #ci} against the expected values.
* @param expectedSelectedTexts the expected fise:selected-text values
* @param expectedEntities the expected fise:entity-reference URIs
* @return the number of validated fise:TextAnnotations ([0]) and
* fise:EntityAnnotations ([1])
*/
private int[] validateEnhancements(Collection<String> expectedSelectedTexts,
Collection<String> expectedEntities) {
//create clones from the parsed sets so that we can remove values
Set<String> selectedTexts = new TreeSet<String>(expectedSelectedTexts);
Set<String> suggestedEntities = new TreeSet<String>(expectedEntities);
//iterate over all fise:TextAnnotations
//NOTE this assumes all textAnnotations are from the FST linking engine
log.info(" ... validated fise:TextAnnotations:");
Map<UriRef,Resource> expected = new HashMap<UriRef,Resource>(EXPECTED_ENHANCEMENT_VALUES);
expected.put(ENHANCER_EXTRACTED_FROM, ci.getUri());
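//num[0] counts the validated fise:TextAnnotations, num[1] the
//validated fise:EntityAnnotations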
int[] num = new int[]{0,0};
Iterator<Triple> textAnnotations = ci.getMetadata().filter(
null, Properties.RDF_TYPE, TechnicalClasses.ENHANCER_TEXTANNOTATION);
while(textAnnotations.hasNext()){
UriRef textAnnotation = (UriRef)textAnnotations.next().getSubject();
//validate this test annotation against the Stanbol EnhancementStructure
EnhancementStructureHelper.validateTextAnnotation(
ci.getMetadata(), textAnnotation, content, expected);
String selectedText = EnhancementEngineHelper.getString(
ci.getMetadata(), textAnnotation, Properties.ENHANCER_SELECTED_TEXT);
log.info(" {}. {}",num[0]+1,selectedText);
Assert.assertNotNull(selectedText);
//NOTE also check containment in the parsed set so that the test does
// not fail if the same selected text occurs multiple times
Assert.assertTrue("fise:selected-text '" + selectedText +
"' not expected (expected: "+expectedSelectedTexts+")",
selectedTexts.remove(selectedText) || expectedSelectedTexts.contains(selectedText));
num[0]++; //count the number of fise:TextAnnotations
}
Assert.assertTrue("Results do miss following expected fise:TextAnnotations: "
+ selectedTexts, selectedTexts.isEmpty());
log.info(" ... validated fise:EntityAnnotations:");
Iterator<Triple> entityAnnotations = ci.getMetadata().filter(
null, Properties.RDF_TYPE, TechnicalClasses.ENHANCER_ENTITYANNOTATION);
while(entityAnnotations.hasNext()){
UriRef entityAnnotation = (UriRef)entityAnnotations.next().getSubject();
//validate this test annotation against the Stanbol EnhancementStructure
EnhancementStructureHelper.validateEntityAnnotation(
ci.getMetadata(), entityAnnotation, expected);
UriRef entityUri = EnhancementEngineHelper.getReference(
ci.getMetadata(), entityAnnotation, Properties.ENHANCER_ENTITY_REFERENCE);
log.info(" {}. {}",num[1]+1,entityUri);
Assert.assertNotNull(entityUri);
//NOTE entities may be suggested for multiple mentions, so do not fail
// if the entity was already removed from the set
if(suggestedEntities.remove(entityUri.getUnicodeString())){
log.info(" ... found");
}
//assert origin
assertEquals(TEST_ORIGIN, EnhancementEngineHelper.getString(
ci.getMetadata(),entityAnnotation, FISE_ORIGIN));
// Assert.assertTrue("fise:referenced-entity " + entityUri +
// " not expected (expected: "+expectedEntities+")",
// suggestedEntities.remove(entityUri.getUnicodeString()) ||
// expectedEntities.contains(entityUri.getUnicodeString()));
num[1]++; //count the number of fise:EntityAnnotations
}
Assert.assertTrue("Results do miss following expected fise:EntityAnnotations: "
+ suggestedEntities, suggestedEntities.isEmpty());
return num;
}
/**
* Processes the {@link #ci} with the parsed engine.
* @param engine the {@link FstLinkingEngine} used to process the ContentItem
* @return {@link #ci} as convenience
* @throws EngineException if the enhancement of the ContentItem fails
*/
private ContentItem processContentItem(FstLinkingEngine engine) throws EngineException {
Assert.assertEquals("The FST Linking engine is expected to enhance the "
+ "test ContentItem EnhancementEngine.ENHANCE_ASYNC",
EnhancementEngine.ENHANCE_ASYNC, engine.canEnhance(ci));
engine.computeEnhancements(ci);
return ci;
}
}