/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.stanbol.enhancer.engines.celi.lemmatizer.impl;

import static org.apache.stanbol.enhancer.engines.celi.lemmatizer.impl.CeliLemmatizerEnhancementEngine.SERVICE_URL;
import static org.apache.stanbol.enhancer.nlp.NlpAnnotations.POS_ANNOTATION;
import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.DC_LANGUAGE;

import java.io.IOException;
import java.util.Dictionary;
import java.util.Hashtable;
import java.util.Iterator;
import java.util.List;

import org.apache.clerezza.rdf.core.impl.PlainLiteralImpl;
import org.apache.clerezza.rdf.core.impl.TripleImpl;
import org.apache.stanbol.enhancer.contentitem.inmemory.InMemoryContentItemFactory;
import org.apache.stanbol.enhancer.engines.celi.CeliConstants;
import org.apache.stanbol.enhancer.engines.celi.testutils.MockComponentContext;
import org.apache.stanbol.enhancer.nlp.NlpAnnotations;
import org.apache.stanbol.enhancer.nlp.model.AnalysedText;
import org.apache.stanbol.enhancer.nlp.model.AnalysedTextFactory;
import org.apache.stanbol.enhancer.nlp.model.Token;
import org.apache.stanbol.enhancer.nlp.model.annotation.Value;
import org.apache.stanbol.enhancer.nlp.morpho.MorphoFeatures;
import org.apache.stanbol.enhancer.nlp.pos.LexicalCategory;
import org.apache.stanbol.enhancer.nlp.pos.PosTag;
import org.apache.stanbol.enhancer.servicesapi.ContentItem;
import org.apache.stanbol.enhancer.servicesapi.ContentItemFactory;
import org.apache.stanbol.enhancer.servicesapi.EngineException;
import org.apache.stanbol.enhancer.servicesapi.EnhancementEngine;
import org.apache.stanbol.enhancer.servicesapi.helper.EnhancementEngineHelper;
import org.apache.stanbol.enhancer.servicesapi.impl.StringSource;
import org.apache.stanbol.enhancer.test.helper.RemoteServiceHelper;
import org.junit.AfterClass;
import org.junit.Assert;
import org.junit.BeforeClass;
import org.junit.Test;
import org.osgi.service.cm.ConfigurationException;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
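
/**
 * Unit test for {@link CeliAnalyzedTextLemmatizerEngine}: the variant of the CELI
 * lemmatizer that writes its results to the {@link AnalysedText} content part of a
 * {@link ContentItem}. The test pre-annotates tokens with POS tags to verify that
 * existing POS annotations are consumed by the lemmatizer. It calls the remote CELI
 * service and returns early (skipping validation) if that service is unavailable.
 */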
public class CeliAnalyzedTextLemmatizerEngineTest {

    private static final Logger log = LoggerFactory.getLogger(CeliAnalyzedTextLemmatizerEngineTest.class);

    private static final ContentItemFactory ciFactory = InMemoryContentItemFactory.getInstance();
    private static final AnalysedTextFactory atFactory = AnalysedTextFactory.getDefaultInstance();

    /*
     * Data for the German test
     */
    public static final String de_verb = "verbrachten";
    public static final String de_adjective = "kaiserlichen"; // alternatives: "sensationellen", "schönen"
    public static final String de_noun = "Urlaub";
    public static final String de_text = String.format(
        "Wir %s einen %s %s in der Schweiz",
        de_verb, de_adjective, de_noun);
    public static final int de_verbStart = de_text.indexOf(de_verb);
    public static final double de_verbProb = 0.98765d;
    public static final int de_adjectiveStart = de_text.indexOf(de_adjective);
    public static final double de_adjectiveProb = 0.87654d;
    public static final int de_nounStart = de_text.indexOf(de_noun);
    public static final double de_nounProb = 0.998877d;

    public static CeliAnalyzedTextLemmatizerEngine engine;
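
    /**
     * Creates and activates the engine using a {@link MockComponentContext}.
     * The configuration enables the CELI test account (presumably so the test
     * can run without explicitly configured credentials) and sets a short
     * connection timeout for the remote service.
     */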
    @BeforeClass
    public static void initEngine() throws IOException, ConfigurationException {
        Dictionary<String, Object> properties = new Hashtable<String, Object>();
        properties.put(EnhancementEngine.PROPERTY_NAME, "celiLemmatizer");
        properties.put(CeliConstants.CELI_TEST_ACCOUNT, "true");
        properties.put(CeliConstants.CELI_CONNECTION_TIMEOUT, "5");
        properties.put(SERVICE_URL, "http://linguagrid.org/LSGrid/ws/morpho-analyser");
        MockComponentContext context = new MockComponentContext(properties);
        engine = new CeliAnalyzedTextLemmatizerEngine();
        engine.activate(context);
    }

    @AfterClass
    public static void deactivate() {
        engine.deactivate(null);
        engine = null;
    }
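
    /**
     * Parses a German sentence, adds POS-annotated {@link Token}s for the verb,
     * adjective and noun, runs the engine and validates that each of those tokens
     * receives a {@link MorphoFeatures} annotation with a non-null lemma.
     */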
    @Test
    public void testEngineDe() throws IOException, EngineException {
        ContentItem ci = ciFactory.createContentItem(new StringSource(de_text));
        Assert.assertNotNull(ci);
        AnalysedText at = atFactory.createAnalysedText(ci, ci.getBlob());
        Assert.assertNotNull(at);
        ci.getMetadata().add(new TripleImpl(ci.getUri(), DC_LANGUAGE, new PlainLiteralImpl("de")));
        Assert.assertEquals("de", EnhancementEngineHelper.getLanguage(ci));
        //add some Tokens with POS annotations to test the usage of
        //existing POS annotations by the lemmatizer
        Token verbrachten = at.addToken(de_verbStart, de_verbStart + de_verb.length());
        verbrachten.addAnnotation(POS_ANNOTATION, Value.value(
            new PosTag("V", LexicalCategory.Verb), de_verbProb));
        Token kaiserlichen = at.addToken(de_adjectiveStart, de_adjectiveStart + de_adjective.length());
        kaiserlichen.addAnnotation(POS_ANNOTATION, Value.value(
            new PosTag("ADJ", LexicalCategory.Adjective), de_adjectiveProb));
        Token urlaub = at.addToken(de_nounStart, de_nounStart + de_noun.length());
        urlaub.addAnnotation(POS_ANNOTATION, Value.value(
            new PosTag("NC", LexicalCategory.Noun), de_nounProb));
Assert.assertEquals("Can not enhance Test ContentItem",
EnhancementEngine.ENHANCE_ASYNC,engine.canEnhance(ci));
//compute the enhancements
try {
engine.computeEnhancements(ci);
} catch (EngineException e) {
RemoteServiceHelper.checkServiceUnavailable(e);
return; //deactivate test
}
        //now validate the enhancements
        boolean foundVerb = false;
        boolean foundAdjective = false;
        boolean foundNoun = false;
        for(Iterator<Token> tokens = at.getTokens(); tokens.hasNext();){
            Token token = tokens.next();
            log.info("Token: {}", token);
            List<Value<MorphoFeatures>> mfs = token.getAnnotations(NlpAnnotations.MORPHO_ANNOTATION);
            if(de_verb.equals(token.getSpan())){
                foundVerb = !mfs.isEmpty();
                validateMorphFeatureProbability(mfs, LexicalCategory.Verb, de_verbProb);
            } else if(de_adjective.equals(token.getSpan())){
                foundAdjective = !mfs.isEmpty();
                validateMorphFeatureProbability(mfs, LexicalCategory.Adjective, de_adjectiveProb);
            } else if(de_noun.equals(token.getSpan())){
                foundNoun = !mfs.isEmpty();
                validateMorphFeatureProbability(mfs, LexicalCategory.Noun, de_nounProb);
            }
            for(Value<MorphoFeatures> mf : mfs){
                log.info(" - {}", mf);
                Assert.assertNotNull(mf.value().getLemma());
            }
        }
        Assert.assertTrue("No MorphoFeatures found for '" + de_verb + "'!", foundVerb);
        Assert.assertTrue("No MorphoFeatures found for '" + de_adjective + "'!", foundAdjective);
        Assert.assertTrue("No MorphoFeatures found for '" + de_noun + "'!", foundNoun);
    }
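
    /**
     * Checks that every {@link PosTag} of the parsed MorphoFeatures that has the
     * expected {@link LexicalCategory} keeps the probability of the POS annotation
     * originally set on the token. The comparison uses a delta of 0.0, as the
     * probability is expected to be copied unchanged.
     */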
    private void validateMorphFeatureProbability(List<Value<MorphoFeatures>> mfs, LexicalCategory lc, double prob) {
        for(Value<MorphoFeatures> mf : mfs){
            for(PosTag pos : mf.value().getPosList()){
                if(pos.hasCategory(lc)){
                    Assert.assertEquals(prob, mf.probability(), 0.0);
                }
            }
        }
    }
}