lucene/analysis/icu/src/test/org/apache/lucene/analysis/icu/TestICUTransformFilter.java - lucene-solr - Git at Google

 /*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
  * The ASF licenses this file to You under the Apache License, Version 2.0
  * (the "License"); you may not use this file except in compliance with
  * the License.  You may obtain a copy of the License at
  *
  *     http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
 package org.apache.lucene.analysis.icu;


 import java.io.IOException;
 import java.io.StringReader;

 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
 import org.apache.lucene.analysis.MockTokenizer;
 import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.analysis.core.KeywordTokenizer;
 import org.apache.lucene.analysis.TokenStream;

 import com.ibm.icu.text.Transliterator;
 import com.ibm.icu.text.UnicodeSet;


 /**
  * Test the ICUTransformFilter with some basic examples.
  */
 public class TestICUTransformFilter extends BaseTokenStreamTestCase {

   public void testBasicFunctionality() throws Exception {
     checkToken(Transliterator.getInstance("Traditional-Simplified"),
         "簡化字", "简化字");
     checkToken(Transliterator.getInstance("Katakana-Hiragana"),
         "ヒラガナ", "ひらがな");
     checkToken(Transliterator.getInstance("Fullwidth-Halfwidth"),
         "アルアノリウ", "ｱﾙｱﾉﾘｳ");
     checkToken(Transliterator.getInstance("Any-Latin"),
         "Αλφαβητικός Κατάλογος", "Alphabētikós Katálogos");
     checkToken(Transliterator.getInstance("NFD; [:Nonspacing Mark:] Remove"),
         "Alphabētikós Katálogos", "Alphabetikos Katalogos");
     checkToken(Transliterator.getInstance("Han-Latin"),
         "中国", "zhōng guó");
   }

   public void testCustomFunctionality() throws Exception {
     String rules = "a > b; b > c;"; // convert a's to b's and b's to c's
     checkToken(Transliterator.createFromRules("test", rules, Transliterator.FORWARD), "abacadaba", "bcbcbdbcb");
   }

   public void testCustomFunctionality2() throws Exception {
     String rules = "c { a > b; a > d;"; // convert a's to b's and b's to c's
     checkToken(Transliterator.createFromRules("test", rules, Transliterator.FORWARD), "caa", "cbd");
   }

   public void testOptimizer() throws Exception {
     String rules = "a > b; b > c;"; // convert a's to b's and b's to c's
     Transliterator custom = Transliterator.createFromRules("test", rules, Transliterator.FORWARD);
     assertTrue(custom.getFilter() == null);
     final KeywordTokenizer input = new KeywordTokenizer();
     input.setReader(new StringReader(""));
     new ICUTransformFilter(input, custom);
     assertTrue(custom.getFilter().equals(new UnicodeSet("[ab]")));
   }

   public void testOptimizer2() throws Exception {
     checkToken(Transliterator.getInstance("Traditional-Simplified; CaseFold"),
         "ABCDE", "abcde");
   }

   public void testOptimizerSurrogate() throws Exception {
     String rules = "\\U00020087 > x;"; // convert CJK UNIFIED IDEOGRAPH-20087 to an x
     Transliterator custom = Transliterator.createFromRules("test", rules, Transliterator.FORWARD);
     assertTrue(custom.getFilter() == null);
     final KeywordTokenizer input = new KeywordTokenizer();
     input.setReader(new StringReader(""));
     new ICUTransformFilter(input, custom);
     assertTrue(custom.getFilter().equals(new UnicodeSet("[\\U00020087]")));
   }

   private void checkToken(Transliterator transform, String input, String expected) throws IOException {
     final KeywordTokenizer input1 = new KeywordTokenizer();
     input1.setReader(new StringReader(input));
     TokenStream ts = new ICUTransformFilter(input1, transform);
     assertTokenStreamContents(ts, new String[] { expected });
   }

   /** blast some random strings through the analyzer */
   public void testRandomStrings() throws Exception {
     final Transliterator transform = Transliterator.getInstance("Any-Latin");
     Analyzer a = new Analyzer() {
       @Override
       protected TokenStreamComponents createComponents(String fieldName) {
         Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
         return new TokenStreamComponents(tokenizer, new ICUTransformFilter(tokenizer, transform));
       }
     };
     checkRandomData(random(), a, 1000*RANDOM_MULTIPLIER);
     a.close();
   }

   public void testEmptyTerm() throws IOException {
     Analyzer a = new Analyzer() {
       @Override
       protected TokenStreamComponents createComponents(String fieldName) {
         Tokenizer tokenizer = new KeywordTokenizer();
         return new TokenStreamComponents(tokenizer, new ICUTransformFilter(tokenizer, Transliterator.getInstance("Any-Latin")));
       }
     };
     checkOneTerm(a, "", "");
     a.close();
   }
 }
	/*
	* Licensed to the Apache Software Foundation (ASF) under one or more
	* contributor license agreements. See the NOTICE file distributed with
	* this work for additional information regarding copyright ownership.
	* The ASF licenses this file to You under the Apache License, Version 2.0
	* (the "License"); you may not use this file except in compliance with
	* the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/
	package org.apache.lucene.analysis.icu;


	import java.io.IOException;
	import java.io.StringReader;

	import org.apache.lucene.analysis.Analyzer;
	import org.apache.lucene.analysis.BaseTokenStreamTestCase;
	import org.apache.lucene.analysis.MockTokenizer;
	import org.apache.lucene.analysis.Tokenizer;
	import org.apache.lucene.analysis.core.KeywordTokenizer;
	import org.apache.lucene.analysis.TokenStream;

	import com.ibm.icu.text.Transliterator;
	import com.ibm.icu.text.UnicodeSet;


	/**
	* Test the ICUTransformFilter with some basic examples.
	*/
	public class TestICUTransformFilter extends BaseTokenStreamTestCase {

	public void testBasicFunctionality() throws Exception {
	checkToken(Transliterator.getInstance("Traditional-Simplified"),
	"簡化字", "简化字");
	checkToken(Transliterator.getInstance("Katakana-Hiragana"),
	"ヒラガナ", "ひらがな");
	checkToken(Transliterator.getInstance("Fullwidth-Halfwidth"),
	"アルアノリウ", "ｱﾙｱﾉﾘｳ");
	checkToken(Transliterator.getInstance("Any-Latin"),
	"Αλφαβητικός Κατάλογος", "Alphabētikós Katálogos");
	checkToken(Transliterator.getInstance("NFD; [:Nonspacing Mark:] Remove"),
	"Alphabētikós Katálogos", "Alphabetikos Katalogos");
	checkToken(Transliterator.getInstance("Han-Latin"),
	"中国", "zhōng guó");
	}

	public void testCustomFunctionality() throws Exception {
	String rules = "a > b; b > c;"; // convert a's to b's and b's to c's
	checkToken(Transliterator.createFromRules("test", rules, Transliterator.FORWARD), "abacadaba", "bcbcbdbcb");
	}

	public void testCustomFunctionality2() throws Exception {
	String rules = "c { a > b; a > d;"; // convert a's to b's and b's to c's
	checkToken(Transliterator.createFromRules("test", rules, Transliterator.FORWARD), "caa", "cbd");
	}

	public void testOptimizer() throws Exception {
	String rules = "a > b; b > c;"; // convert a's to b's and b's to c's
	Transliterator custom = Transliterator.createFromRules("test", rules, Transliterator.FORWARD);
	assertTrue(custom.getFilter() == null);
	final KeywordTokenizer input = new KeywordTokenizer();
	input.setReader(new StringReader(""));
	new ICUTransformFilter(input, custom);
	assertTrue(custom.getFilter().equals(new UnicodeSet("[ab]")));
	}

	public void testOptimizer2() throws Exception {
	checkToken(Transliterator.getInstance("Traditional-Simplified; CaseFold"),
	"ABCDE", "abcde");
	}

	public void testOptimizerSurrogate() throws Exception {
	String rules = "\\U00020087 > x;"; // convert CJK UNIFIED IDEOGRAPH-20087 to an x
	Transliterator custom = Transliterator.createFromRules("test", rules, Transliterator.FORWARD);
	assertTrue(custom.getFilter() == null);
	final KeywordTokenizer input = new KeywordTokenizer();
	input.setReader(new StringReader(""));
	new ICUTransformFilter(input, custom);
	assertTrue(custom.getFilter().equals(new UnicodeSet("[\\U00020087]")));
	}

	private void checkToken(Transliterator transform, String input, String expected) throws IOException {
	final KeywordTokenizer input1 = new KeywordTokenizer();
	input1.setReader(new StringReader(input));
	TokenStream ts = new ICUTransformFilter(input1, transform);
	assertTokenStreamContents(ts, new String[] { expected });
	}

	/** blast some random strings through the analyzer */
	public void testRandomStrings() throws Exception {
	final Transliterator transform = Transliterator.getInstance("Any-Latin");
	Analyzer a = new Analyzer() {
	@Override
	protected TokenStreamComponents createComponents(String fieldName) {
	Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
	return new TokenStreamComponents(tokenizer, new ICUTransformFilter(tokenizer, transform));
	}
	};
	checkRandomData(random(), a, 1000*RANDOM_MULTIPLIER);
	a.close();
	}

	public void testEmptyTerm() throws IOException {
	Analyzer a = new Analyzer() {
	@Override
	protected TokenStreamComponents createComponents(String fieldName) {
	Tokenizer tokenizer = new KeywordTokenizer();
	return new TokenStreamComponents(tokenizer, new ICUTransformFilter(tokenizer, Transliterator.getInstance("Any-Latin")));
	}
	};
	checkOneTerm(a, "", "");
	a.close();
	}
	}