// hugegraph-test/src/main/java/org/apache/hugegraph/unit/core/AnalyzerTest.java - incubator-hugegraph - Git at Google

 /*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements. See the NOTICE file distributed with this
  * work for additional information regarding copyright ownership. The ASF
  * licenses this file to You under the Apache License, Version 2.0 (the
  * "License"); you may not use this file except in compliance with the License.
  * You may obtain a copy of the License at
  *
  *     http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
  * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
  * License for the specific language governing permissions and limitations
  * under the License.
  */

 package org.apache.hugegraph.unit.core;

 import java.util.Arrays;
 import java.util.HashSet;
 import java.util.Set;

 import org.junit.After;
 import org.junit.Before;
 import org.junit.Test;

 import org.apache.hugegraph.analyzer.Analyzer;
 import org.apache.hugegraph.analyzer.AnalyzerFactory;
 import org.apache.hugegraph.testutil.Assert;

 /**
  * Checks the token sets returned by the analyzers obtained from
  * {@link AnalyzerFactory} against fixed expectations, using one English and
  * one Chinese sample sentence.
  */
 public class AnalyzerTest {

     /** English sample sentence. */
     private static final String TEXT_1 = "England wins World Cup";
     /** Chinese sample sentence (split into two literals for line length). */
     private static final String TEXT_2 = "英格兰世界杯夺冠，中华人民共和国国歌，" +
                                          "百度科技园位于北京市海淀区西北旺东路10号院";

     @Before
     public void setup() {
         // nothing to prepare
     }

     @After
     public void teardown() {
         // nothing to clean up
     }

     /** ansj analyzer in "BaseAnalysis" and "IndexAnalysis" modes. */
     @Test
     public void testAnsjAnalyzer() {
         Analyzer base = AnalyzerFactory.analyzer("ansj", "BaseAnalysis");
         assertTokens(base, TEXT_1, "england", " ", "wins", "world", "cup");
         assertTokens(base, TEXT_2,
                      "英格兰", "世界杯", "夺冠", "，", "中华人民共和国",
                      "国歌", "百度", "科技", "园", "位于", "北京市",
                      "海淀区", "西北", "旺", "东路", "10", "号", "院");

         Analyzer index = AnalyzerFactory.analyzer("ansj", "IndexAnalysis");
         assertTokens(index, TEXT_1, "england", " ", "wins", "world", "cup");
         assertTokens(index, TEXT_2,
                      "英格兰", "世界杯", "夺冠", "，", "中华人民共和国",
                      "国歌", "百度", "科技", "园", "位于", "北京市",
                      "海淀区", "西北", "旺", "东路", "10号", "院");
     }

     /** hanlp analyzer in "standard" and "index" modes. */
     @Test
     public void testHanlpAnalyzer() {
         Analyzer standard = AnalyzerFactory.analyzer("hanlp", "standard");
         assertTokens(standard, TEXT_1, "England", " ", "wins", "World", "Cup");
         assertTokens(standard, TEXT_2,
                      "英格兰", "世界杯", "夺冠", "，", "中华人民共和国",
                      "国歌", "百度", "科技园", "位于", "北京市",
                      "海淀区", "西北旺", "东路", "10", "号", "院");

         /*
          * The latest hanlp portable version does not contain model data, see
          * https://github.com/hankcs/HanLP/tree/portable#%E6%96%B9%E5%BC%8F%E4%B8%80maven
          * so the IndexTokenizer is tested instead.
          */
         Analyzer index = AnalyzerFactory.analyzer("hanlp", "index");
         assertTokens(index, TEXT_1, "England", " ", "wins", "World", "Cup");
         assertTokens(index, TEXT_2,
                      "英格兰", "英格", "格兰", "世界杯", "世界", "夺冠", "，",
                      "中华人民共和国", "中华", "华人", "人民", "共和国",
                      "共和", "国歌", "百度", "科技园", "科技", "位于",
                      "北京市", "北京", "海淀区", "海淀", "淀区", "西北旺",
                      "西北", "东路", "10", "号", "院");
     }

     /** smartcn analyzer, requested with an empty mode string. */
     @Test
     public void testSmartCNAnalyzer() {
         Analyzer smartcn = AnalyzerFactory.analyzer("smartcn", "");
         assertTokens(smartcn, TEXT_1, "england", "win", "world", "cup");
         assertTokens(smartcn, TEXT_2,
                      "英格兰", "世界杯", "夺冠", "中华人民共和国",
                      "国歌", "百", "度", "科技", "园", "位于",
                      "北京市", "海淀区", "西北", "旺", "东", "路",
                      "10", "号", "院");
     }

     /** jieba analyzer in "SEARCH" and "INDEX" modes. */
     @Test
     public void testJiebaAnalyzer() {
         Analyzer search = AnalyzerFactory.analyzer("jieba", "SEARCH");
         assertTokens(search, TEXT_1, "england", " ", "wins", "world", "cup");
         assertTokens(search, TEXT_2,
                      "英格兰", "世界杯", "夺冠", "，", "中华人民共和国",
                      "国歌", "百度", "科技园", "位于", "北京市",
                      "海淀区", "西北", "旺", "东路", "10", "号院");

         Analyzer index = AnalyzerFactory.analyzer("jieba", "INDEX");
         assertTokens(index, TEXT_1, "england", " ", "wins", "world", "cup");
         assertTokens(index, TEXT_2,
                      "英格兰", "世界", "世界杯", "夺冠", "，", "中华",
                      "华人", "人民", "共和", "共和国", "中华人民共和国",
                      "国歌", "百度", "科技", "科技园", "位于", "北京",
                      "京市", "北京市", "海淀", "淀区", "海淀区", "西北",
                      "旺", "东路", "10", "号院");
     }

     /** jcseg analyzer; "Simple" and "Complex" modes share the same expectations. */
     @Test
     public void testJcsegAnalyzer() {
         for (String mode : new String[]{"Simple", "Complex"}) {
             Analyzer jcseg = AnalyzerFactory.analyzer("jcseg", mode);
             assertTokens(jcseg, TEXT_1, "england", "wins", "world", "cup");
             assertTokens(jcseg, TEXT_2,
                          "英格兰", "世界杯", "夺冠", "，", "中华",
                          "人民共和国", "国歌", "百度", "科技", "园", "位于",
                          "北京市", "海淀区", "西北", "旺", "东路", "1", "0",
                          "号", "院");
         }
     }

     /** mmseg4j analyzer; "Simple" and "Complex" modes share the same expectations. */
     @Test
     public void testMMSeg4JAnalyzer() {
         for (String mode : new String[]{"Simple", "Complex"}) {
             Analyzer mmseg4j = AnalyzerFactory.analyzer("mmseg4j", mode);
             assertTokens(mmseg4j, TEXT_1, "england", "wins", "world", "cup");
             assertTokens(mmseg4j, TEXT_2,
                          "英格兰", "世界杯", "夺冠", "中华人民共和国",
                          "国歌", "百度", "科技园", "位于", "北京市",
                          "海淀区", "西北", "旺", "东路", "10", "号",
                          "院");
         }
     }

     /** ikanalyzer in "smart" and "max_word" modes. */
     @Test
     public void testIKAnalyzer() {
         Analyzer smart = AnalyzerFactory.analyzer("ikanalyzer", "smart");
         assertTokens(smart, TEXT_1, "england", "wins", "world", "cup");
         assertTokens(smart, TEXT_2,
                      "英格兰", "世界杯", "夺冠", "中华人民共和国",
                      "国歌", "百度", "科技园", "位于", "北京市",
                      "海淀区", "西北", "旺", "东路", "10号", "院");

         Analyzer maxWord = AnalyzerFactory.analyzer("ikanalyzer", "max_word");
         assertTokens(maxWord, TEXT_1, "england", "wins", "world", "cup");
         assertTokens(maxWord, TEXT_2,
                      "英格兰", "世界杯", "世界", "杯", "夺冠",
                      "中华人民共和国", "中华人民", "中华", "华人",
                      "人民共和国", "人民", "共和国", "共和", "国",
                      "国歌", "百度", "百", "度", "科技园", "科技",
                      "园", "位于", "北京市", "北京", "市", "海淀区",
                      "海淀", "淀区", "西北", "旺", "东路", "10",
                      "号", "院");
     }

     /**
      * Asserts that segmenting {@code text} with {@code analyzer} yields
      * exactly the given tokens, compared as a set.
      */
     private static void assertTokens(Analyzer analyzer, String text,
                                      String... expected) {
         Set<String> wanted = setOf(expected);
         Assert.assertEquals(wanted, analyzer.segment(text));
     }

     /** Collects the given elements into a new {@link HashSet}. */
     private static Set<String> setOf(String... elems) {
         Set<String> unique = new HashSet<>();
         unique.addAll(Arrays.asList(elems));
         return unique;
     }

 }
	/*
	* Licensed to the Apache Software Foundation (ASF) under one or more
	* contributor license agreements. See the NOTICE file distributed with this
	* work for additional information regarding copyright ownership. The ASF
	* licenses this file to You under the Apache License, Version 2.0 (the
	* "License"); you may not use this file except in compliance with the License.
	* You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
	* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
	* License for the specific language governing permissions and limitations
	* under the License.
	*/

	package org.apache.hugegraph.unit.core;

	import java.util.Arrays;
	import java.util.HashSet;
	import java.util.Set;

	import org.junit.After;
	import org.junit.Before;
	import org.junit.Test;

	import org.apache.hugegraph.analyzer.Analyzer;
	import org.apache.hugegraph.analyzer.AnalyzerFactory;
	import org.apache.hugegraph.testutil.Assert;

	/**
	 * Verifies, for each analyzer requested from {@link AnalyzerFactory}, that
	 * segmenting an English and a Chinese sample sentence yields the expected
	 * set of tokens.
	 */
	public class AnalyzerTest {

	    // English sample sentence
	    private static final String TEXT_1 = "England wins World Cup";
	    // Chinese sample sentence, concatenated from two literals
	    private static final String TEXT_2 =
	            "英格兰世界杯夺冠，中华人民共和国国歌，" +
	            "百度科技园位于北京市海淀区西北旺东路10号院";

	    @Before
	    public void setup() {
	        // no fixture required
	    }

	    @After
	    public void teardown() {
	        // no fixture to release
	    }

	    @Test
	    public void testAnsjAnalyzer() {
	        Set<String> english = setOf("england", " ", "wins", "world", "cup");

	        // BaseAnalysis mode
	        Analyzer analyzer = AnalyzerFactory.analyzer("ansj", "BaseAnalysis");
	        Set<String> chinese = setOf(
	                "英格兰", "世界杯", "夺冠", "，", "中华人民共和国",
	                "国歌", "百度", "科技", "园", "位于", "北京市",
	                "海淀区", "西北", "旺", "东路", "10", "号", "院");
	        Assert.assertEquals(english, analyzer.segment(TEXT_1));
	        Assert.assertEquals(chinese, analyzer.segment(TEXT_2));

	        // IndexAnalysis mode
	        analyzer = AnalyzerFactory.analyzer("ansj", "IndexAnalysis");
	        chinese = setOf(
	                "英格兰", "世界杯", "夺冠", "，", "中华人民共和国",
	                "国歌", "百度", "科技", "园", "位于", "北京市",
	                "海淀区", "西北", "旺", "东路", "10号", "院");
	        Assert.assertEquals(english, analyzer.segment(TEXT_1));
	        Assert.assertEquals(chinese, analyzer.segment(TEXT_2));
	    }

	    @Test
	    public void testHanlpAnalyzer() {
	        Set<String> english = setOf("England", " ", "wins", "World", "Cup");

	        // standard mode
	        Analyzer analyzer = AnalyzerFactory.analyzer("hanlp", "standard");
	        Set<String> chinese = setOf(
	                "英格兰", "世界杯", "夺冠", "，", "中华人民共和国",
	                "国歌", "百度", "科技园", "位于", "北京市",
	                "海淀区", "西北旺", "东路", "10", "号", "院");
	        Assert.assertEquals(english, analyzer.segment(TEXT_1));
	        Assert.assertEquals(chinese, analyzer.segment(TEXT_2));

	        // Note: the latest hanlp portable version does not contain model data
	        // https://github.com/hankcs/HanLP/tree/portable#%E6%96%B9%E5%BC%8F%E4%B8%80maven
	        // so the IndexTokenizer is tested instead
	        analyzer = AnalyzerFactory.analyzer("hanlp", "index");
	        chinese = setOf(
	                "英格兰", "英格", "格兰", "世界杯", "世界", "夺冠", "，",
	                "中华人民共和国", "中华", "华人", "人民", "共和国",
	                "共和", "国歌", "百度", "科技园", "科技", "位于",
	                "北京市", "北京", "海淀区", "海淀", "淀区", "西北旺",
	                "西北", "东路", "10", "号", "院");
	        Assert.assertEquals(english, analyzer.segment(TEXT_1));
	        Assert.assertEquals(chinese, analyzer.segment(TEXT_2));
	    }

	    @Test
	    public void testSmartCNAnalyzer() {
	        Analyzer analyzer = AnalyzerFactory.analyzer("smartcn", "");

	        Set<String> english = setOf("england", "win", "world", "cup");
	        Assert.assertEquals(english, analyzer.segment(TEXT_1));

	        Set<String> chinese = setOf(
	                "英格兰", "世界杯", "夺冠", "中华人民共和国",
	                "国歌", "百", "度", "科技", "园", "位于",
	                "北京市", "海淀区", "西北", "旺", "东", "路",
	                "10", "号", "院");
	        Assert.assertEquals(chinese, analyzer.segment(TEXT_2));
	    }

	    @Test
	    public void testJiebaAnalyzer() {
	        Set<String> english = setOf("england", " ", "wins", "world", "cup");

	        // SEARCH mode
	        Analyzer analyzer = AnalyzerFactory.analyzer("jieba", "SEARCH");
	        Set<String> chinese = setOf(
	                "英格兰", "世界杯", "夺冠", "，", "中华人民共和国",
	                "国歌", "百度", "科技园", "位于", "北京市",
	                "海淀区", "西北", "旺", "东路", "10", "号院");
	        Assert.assertEquals(english, analyzer.segment(TEXT_1));
	        Assert.assertEquals(chinese, analyzer.segment(TEXT_2));

	        // INDEX mode
	        analyzer = AnalyzerFactory.analyzer("jieba", "INDEX");
	        chinese = setOf(
	                "英格兰", "世界", "世界杯", "夺冠", "，", "中华",
	                "华人", "人民", "共和", "共和国", "中华人民共和国",
	                "国歌", "百度", "科技", "科技园", "位于", "北京",
	                "京市", "北京市", "海淀", "淀区", "海淀区", "西北",
	                "旺", "东路", "10", "号院");
	        Assert.assertEquals(english, analyzer.segment(TEXT_1));
	        Assert.assertEquals(chinese, analyzer.segment(TEXT_2));
	    }

	    @Test
	    public void testJcsegAnalyzer() {
	        // Both modes are checked against the same expected token sets
	        Set<String> english = setOf("england", "wins", "world", "cup");
	        Set<String> chinese = setOf(
	                "英格兰", "世界杯", "夺冠", "，", "中华",
	                "人民共和国", "国歌", "百度", "科技", "园", "位于",
	                "北京市", "海淀区", "西北", "旺", "东路", "1", "0",
	                "号", "院");

	        // Simple mode
	        Analyzer analyzer = AnalyzerFactory.analyzer("jcseg", "Simple");
	        Assert.assertEquals(english, analyzer.segment(TEXT_1));
	        Assert.assertEquals(chinese, analyzer.segment(TEXT_2));

	        // Complex mode
	        analyzer = AnalyzerFactory.analyzer("jcseg", "Complex");
	        Assert.assertEquals(english, analyzer.segment(TEXT_1));
	        Assert.assertEquals(chinese, analyzer.segment(TEXT_2));
	    }

	    @Test
	    public void testMMSeg4JAnalyzer() {
	        // Both modes are checked against the same expected token sets
	        Set<String> english = setOf("england", "wins", "world", "cup");
	        Set<String> chinese = setOf(
	                "英格兰", "世界杯", "夺冠", "中华人民共和国",
	                "国歌", "百度", "科技园", "位于", "北京市",
	                "海淀区", "西北", "旺", "东路", "10", "号",
	                "院");

	        // Simple mode
	        Analyzer analyzer = AnalyzerFactory.analyzer("mmseg4j", "Simple");
	        Assert.assertEquals(english, analyzer.segment(TEXT_1));
	        Assert.assertEquals(chinese, analyzer.segment(TEXT_2));

	        // Complex mode
	        analyzer = AnalyzerFactory.analyzer("mmseg4j", "Complex");
	        Assert.assertEquals(english, analyzer.segment(TEXT_1));
	        Assert.assertEquals(chinese, analyzer.segment(TEXT_2));
	    }

	    @Test
	    public void testIKAnalyzer() {
	        Set<String> english = setOf("england", "wins", "world", "cup");

	        // Smart mode
	        Analyzer analyzer = AnalyzerFactory.analyzer("ikanalyzer", "smart");
	        Set<String> chinese = setOf(
	                "英格兰", "世界杯", "夺冠", "中华人民共和国",
	                "国歌", "百度", "科技园", "位于", "北京市",
	                "海淀区", "西北", "旺", "东路", "10号", "院");
	        Assert.assertEquals(english, analyzer.segment(TEXT_1));
	        Assert.assertEquals(chinese, analyzer.segment(TEXT_2));

	        // Max_word mode
	        analyzer = AnalyzerFactory.analyzer("ikanalyzer", "max_word");
	        chinese = setOf(
	                "英格兰", "世界杯", "世界", "杯", "夺冠",
	                "中华人民共和国", "中华人民", "中华", "华人",
	                "人民共和国", "人民", "共和国", "共和", "国",
	                "国歌", "百度", "百", "度", "科技园", "科技",
	                "园", "位于", "北京市", "北京", "市", "海淀区",
	                "海淀", "淀区", "西北", "旺", "东路", "10",
	                "号", "院");
	        Assert.assertEquals(english, analyzer.segment(TEXT_1));
	        Assert.assertEquals(chinese, analyzer.segment(TEXT_2));
	    }

	    /** Builds a {@link HashSet} holding the given elements. */
	    private static Set<String> setOf(String... elems) {
	        Set<String> result = new HashSet<>();
	        for (String elem : elems) {
	            result.add(elem);
	        }
	        return result;
	    }

	}