enhancement-engines/paoding-token/src/test/java/org/apache/stanbol/enhancer/engines/paoding/token/TestPaodingTokenizerEngine.java - stanbol - Git at Google

 /*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 package org.apache.stanbol.enhancer.engines.paoding.token;

 import java.io.File;
 import java.io.IOException;
 import java.util.Dictionary;
 import java.util.Hashtable;

 import org.apache.clerezza.rdf.core.UriRef;
 import org.apache.clerezza.rdf.core.impl.PlainLiteralImpl;
 import org.apache.clerezza.rdf.core.impl.TripleImpl;
 import org.apache.stanbol.commons.solr.extras.paoding.Activator;
 import org.apache.stanbol.enhancer.contentitem.inmemory.InMemoryContentItemFactory;
 import org.apache.stanbol.enhancer.nlp.model.AnalysedText;
 import org.apache.stanbol.enhancer.nlp.model.AnalysedTextFactory;
 import org.apache.stanbol.enhancer.nlp.model.AnalysedTextUtils;
 import org.apache.stanbol.enhancer.servicesapi.ContentItem;
 import org.apache.stanbol.enhancer.servicesapi.ContentItemFactory;
 import org.apache.stanbol.enhancer.servicesapi.EngineException;
 import org.apache.stanbol.enhancer.servicesapi.EnhancementEngine;
 import org.apache.stanbol.enhancer.servicesapi.impl.StringSource;
 import org.apache.stanbol.enhancer.servicesapi.rdf.Properties;
 import org.junit.After;
 import org.junit.Assert;
 import org.junit.Before;
 import org.junit.BeforeClass;
 import org.junit.Test;
 import org.osgi.service.cm.ConfigurationException;

 /**
  * Checks that the {@link PaodingTokenizerEngine} accepts a Chinese content item
  * and adds tokens to its {@link AnalysedText}.
  */
 public class TestPaodingTokenizerEngine {
     public static final String FAKE_BUNDLE_SYMBOLIC_NAME = "FAKE_BUNDLE_SYMBOLIC_NAME";


     /** Factory used to create the content item of each test; assigned once before all tests. */
     private static ContentItemFactory contentItemFactory;

     /** URI used for the content item created in {@link #setUpServices()}. */
     private static final UriRef id = new UriRef("http://www.example.org/contentItem1");
     /**
      * Test text taken from the <a href="http://zh.wikipedia.org/wiki/Barack_Obama">
      * Chinese Wikipedia page for Barack Obama</a>.
      */
     private static final String text = "巴拉克·侯赛因·奥巴马二世，美国民主黨籍政治家，第44任美国总统，"
             + "為第一位非裔美国总统，同時擁有黑（盧歐族）白（英德爱混血）血统，於2008年初次當選美國總統，"
             + "並於2012年成功連任，。\n 奥巴马1961年出生於美国夏威夷州檀香山，童年和青少年時期分别在印尼和"
             + "夏威夷度过。1991年，奥巴马以优等生荣誉从哈佛法学院毕业。1996年，当选伊利诺州参议员。2000年，"
             + "競選美国众议院席位失败，后一直从事州参议员工作，且於2002年获得连任。2004年，"
             + "在美国民主党全国代表大会上发表主题演讲，因此成为全美知名的政界人物。同年11月，"
             + "以70%的选票当选代表伊利诺州的美国联邦参议员，是美國歷史上第五位有非裔血统的联邦参议员。";

     /** Engine under test; created and activated before each test. */
     private PaodingTokenizerEngine engine;

     /** Content item holding {@link #text}, annotated with the language "zh". */
     private ContentItem contentItem;

     /** Location of the Paoding dictionary directory, relative to the base directory. */
     protected static final String TEST_PAODING_DIC_PATH = File.separatorChar + "target"
             + File.separatorChar + "paoding-dict";
     private static File paodingDict;

     /**
      * Prepares the Paoding dictionary directory and the content item factory
      * shared by all tests.
      * <p>
      * The dictionary is only initialised from the archive when the target
      * directory does not exist yet.
      *
      * @throws IOException if the dictionary archive cannot be found on the
      *         classpath, or if initialising the dictionary fails
      */
     @BeforeClass
     public static void initDataFileProvicer() throws IOException{
         // falls back to the current directory when "basedir" is not set
         String baseDir = System.getProperty("basedir", ".");
         paodingDict = new File(baseDir,TEST_PAODING_DIC_PATH);
         if(!paodingDict.isDirectory()){
             java.io.InputStream dictArchive = TestPaodingTokenizerEngine.class
                     .getClassLoader().getResourceAsStream(Activator.DICT_ARCHIVE);
             // getResourceAsStream returns null for a missing resource; fail with a clear message
             if(dictArchive == null){
                 throw new IOException("Unable to load the Paoding dictionary archive '"
                         + Activator.DICT_ARCHIVE + "' via the test classloader");
             }
             try {
                 Activator.initPaodingDictionary(paodingDict, dictArchive);
             } finally {
                 // make sure the stream opened by this test is released
                 dictArchive.close();
             }
         }
         Activator.initPaodingDictHomeProperty(paodingDict);
         contentItemFactory = InMemoryContentItemFactory.getInstance();
     }

     /**
      * Creates and activates a fresh engine named "paoding-token" and builds the
      * content item used by the test.
      *
      * @throws IOException if the content item cannot be created
      * @throws ConfigurationException if the engine rejects the configuration
      */
     @Before
     public void setUpServices() throws IOException , ConfigurationException {
         engine = new PaodingTokenizerEngine();
         engine.analysedTextFactory = AnalysedTextFactory.getDefaultInstance();
         Dictionary<String,Object> config = new Hashtable<String,Object>();
         config.put(EnhancementEngine.PROPERTY_NAME, "paoding-token");
         engine.activate(new MockComponentContext(config));
         contentItem = contentItemFactory.createContentItem(id, new StringSource(text));
         //annotate the language of the content item as Chinese ("zh")
         contentItem.getMetadata().add(new TripleImpl(id, Properties.DC_LANGUAGE,
             new PlainLiteralImpl("zh")));
     }

     /**
      * Expects the engine to report {@link EnhancementEngine#ENHANCE_ASYNC} for
      * the content item and to produce at least one token when enhancing it.
      *
      * @throws EngineException if the engine fails while processing the content item
      */
     @Test
     public void testEngine() throws EngineException {
         Assert.assertEquals(EnhancementEngine.ENHANCE_ASYNC, engine.canEnhance(contentItem));
         engine.computeEnhancements(contentItem);
         //assert the results
         AnalysedText at = AnalysedTextUtils.getAnalysedText(contentItem);
         Assert.assertNotNull(at);
         Assert.assertTrue(at.getTokens().hasNext()); //assert that tokens are present
     }


     /**
      * Deactivates the engine (if one was created) and drops the reference to it.
      */
     @After
     public void cleanUpServices(){
         if(engine != null){
             engine.deactivate(null);
         }
         engine = null;
     }

 }
	/*
	* Licensed to the Apache Software Foundation (ASF) under one or more
	* contributor license agreements. See the NOTICE file distributed with
	* this work for additional information regarding copyright ownership.
	* The ASF licenses this file to You under the Apache License, Version 2.0
	* (the "License"); you may not use this file except in compliance with
	* the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/
	package org.apache.stanbol.enhancer.engines.paoding.token;

	import java.io.File;
	import java.io.IOException;
	import java.util.Dictionary;
	import java.util.Hashtable;

	import org.apache.clerezza.rdf.core.UriRef;
	import org.apache.clerezza.rdf.core.impl.PlainLiteralImpl;
	import org.apache.clerezza.rdf.core.impl.TripleImpl;
	import org.apache.stanbol.commons.solr.extras.paoding.Activator;
	import org.apache.stanbol.enhancer.contentitem.inmemory.InMemoryContentItemFactory;
	import org.apache.stanbol.enhancer.nlp.model.AnalysedText;
	import org.apache.stanbol.enhancer.nlp.model.AnalysedTextFactory;
	import org.apache.stanbol.enhancer.nlp.model.AnalysedTextUtils;
	import org.apache.stanbol.enhancer.servicesapi.ContentItem;
	import org.apache.stanbol.enhancer.servicesapi.ContentItemFactory;
	import org.apache.stanbol.enhancer.servicesapi.EngineException;
	import org.apache.stanbol.enhancer.servicesapi.EnhancementEngine;
	import org.apache.stanbol.enhancer.servicesapi.impl.StringSource;
	import org.apache.stanbol.enhancer.servicesapi.rdf.Properties;
	import org.junit.After;
	import org.junit.Assert;
	import org.junit.Before;
	import org.junit.BeforeClass;
	import org.junit.Test;
	import org.osgi.service.cm.ConfigurationException;

	/**
	 * Checks that the {@link PaodingTokenizerEngine} accepts a Chinese content item
	 * and adds tokens to its {@link AnalysedText}.
	 */
	public class TestPaodingTokenizerEngine {
	    public static final String FAKE_BUNDLE_SYMBOLIC_NAME = "FAKE_BUNDLE_SYMBOLIC_NAME";


	    /** Factory used to create the content item of each test; assigned once before all tests. */
	    private static ContentItemFactory contentItemFactory;

	    /** URI used for the content item created in {@link #setUpServices()}. */
	    private static final UriRef id = new UriRef("http://www.example.org/contentItem1");
	    /**
	     * Test text taken from the <a href="http://zh.wikipedia.org/wiki/Barack_Obama">
	     * Chinese Wikipedia page for Barack Obama</a>.
	     */
	    private static final String text = "巴拉克·侯赛因·奥巴马二世，美国民主黨籍政治家，第44任美国总统，"
	            + "為第一位非裔美国总统，同時擁有黑（盧歐族）白（英德爱混血）血统，於2008年初次當選美國總統，"
	            + "並於2012年成功連任，。\n 奥巴马1961年出生於美国夏威夷州檀香山，童年和青少年時期分别在印尼和"
	            + "夏威夷度过。1991年，奥巴马以优等生荣誉从哈佛法学院毕业。1996年，当选伊利诺州参议员。2000年，"
	            + "競選美国众议院席位失败，后一直从事州参议员工作，且於2002年获得连任。2004年，"
	            + "在美国民主党全国代表大会上发表主题演讲，因此成为全美知名的政界人物。同年11月，"
	            + "以70%的选票当选代表伊利诺州的美国联邦参议员，是美國歷史上第五位有非裔血统的联邦参议员。";

	    /** Engine under test; created and activated before each test. */
	    private PaodingTokenizerEngine engine;

	    /** Content item holding {@link #text}, annotated with the language "zh". */
	    private ContentItem contentItem;

	    /** Location of the Paoding dictionary directory, relative to the base directory. */
	    protected static final String TEST_PAODING_DIC_PATH = File.separatorChar + "target"
	            + File.separatorChar + "paoding-dict";
	    private static File paodingDict;

	    /**
	     * Prepares the Paoding dictionary directory and the content item factory
	     * shared by all tests.
	     * <p>
	     * The dictionary is only initialised from the archive when the target
	     * directory does not exist yet.
	     *
	     * @throws IOException if the dictionary archive cannot be found on the
	     *         classpath, or if initialising the dictionary fails
	     */
	    @BeforeClass
	    public static void initDataFileProvicer() throws IOException{
	        // falls back to the current directory when "basedir" is not set
	        String baseDir = System.getProperty("basedir", ".");
	        paodingDict = new File(baseDir,TEST_PAODING_DIC_PATH);
	        if(!paodingDict.isDirectory()){
	            java.io.InputStream dictArchive = TestPaodingTokenizerEngine.class
	                    .getClassLoader().getResourceAsStream(Activator.DICT_ARCHIVE);
	            // getResourceAsStream returns null for a missing resource; fail with a clear message
	            if(dictArchive == null){
	                throw new IOException("Unable to load the Paoding dictionary archive '"
	                        + Activator.DICT_ARCHIVE + "' via the test classloader");
	            }
	            try {
	                Activator.initPaodingDictionary(paodingDict, dictArchive);
	            } finally {
	                // make sure the stream opened by this test is released
	                dictArchive.close();
	            }
	        }
	        Activator.initPaodingDictHomeProperty(paodingDict);
	        contentItemFactory = InMemoryContentItemFactory.getInstance();
	    }

	    /**
	     * Creates and activates a fresh engine named "paoding-token" and builds the
	     * content item used by the test.
	     *
	     * @throws IOException if the content item cannot be created
	     * @throws ConfigurationException if the engine rejects the configuration
	     */
	    @Before
	    public void setUpServices() throws IOException , ConfigurationException {
	        engine = new PaodingTokenizerEngine();
	        engine.analysedTextFactory = AnalysedTextFactory.getDefaultInstance();
	        Dictionary<String,Object> config = new Hashtable<String,Object>();
	        config.put(EnhancementEngine.PROPERTY_NAME, "paoding-token");
	        engine.activate(new MockComponentContext(config));
	        contentItem = contentItemFactory.createContentItem(id, new StringSource(text));
	        //annotate the language of the content item as Chinese ("zh")
	        contentItem.getMetadata().add(new TripleImpl(id, Properties.DC_LANGUAGE,
	            new PlainLiteralImpl("zh")));
	    }

	    /**
	     * Expects the engine to report {@link EnhancementEngine#ENHANCE_ASYNC} for
	     * the content item and to produce at least one token when enhancing it.
	     *
	     * @throws EngineException if the engine fails while processing the content item
	     */
	    @Test
	    public void testEngine() throws EngineException {
	        Assert.assertEquals(EnhancementEngine.ENHANCE_ASYNC, engine.canEnhance(contentItem));
	        engine.computeEnhancements(contentItem);
	        //assert the results
	        AnalysedText at = AnalysedTextUtils.getAnalysedText(contentItem);
	        Assert.assertNotNull(at);
	        Assert.assertTrue(at.getTokens().hasNext()); //assert that tokens are present
	    }


	    /**
	     * Deactivates the engine (if one was created) and drops the reference to it.
	     */
	    @After
	    public void cleanUpServices(){
	        if(engine != null){
	            engine.deactivate(null);
	        }
	        engine = null;
	    }

	}