/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.analysis.icu;

import com.ibm.icu.text.Transliterator;
import com.ibm.icu.text.UnicodeSet;
import java.io.IOException;
import java.io.StringReader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.KeywordTokenizer;

/** Test the ICUTransformFilter with some basic examples. */
public class TestICUTransformFilter extends BaseTokenStreamTestCase {
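  // Exercise a few well-known system transliterators end to end on single-token input.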
  public void testBasicFunctionality() throws Exception {
    checkToken(Transliterator.getInstance("Traditional-Simplified"), "簡化字", "简化字");
    checkToken(Transliterator.getInstance("Katakana-Hiragana"), "ヒラガナ", "ひらがな");
    checkToken(Transliterator.getInstance("Fullwidth-Halfwidth"), "アルアノリウ", "ｱﾙｱﾉﾘｳ");
    checkToken(
        Transliterator.getInstance("Any-Latin"), "Αλφαβητικός Κατάλογος", "Alphabētikós Katálogos");
    checkToken(
        Transliterator.getInstance("NFD; [:Nonspacing Mark:] Remove"),
        "Alphabētikós Katálogos",
        "Alphabetikos Katalogos");
    checkToken(Transliterator.getInstance("Han-Latin"), "中国", "zhōng guó");
  }

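  // A transliterator built from custom rules is applied the same way as a system one.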
  public void testCustomFunctionality() throws Exception {
    String rules = "a > b; b > c;"; // convert a's to b's and b's to c's
    checkToken(
        Transliterator.createFromRules("test", rules, Transliterator.FORWARD),
        "abacadaba",
        "bcbcbdbcb");
  }

  public void testCustomFunctionality2() throws Exception {
    String rules = "c { a > b; a > d;"; // convert a's to b's when preceded by c, otherwise to d's
    checkToken(Transliterator.createFromRules("test", rules, Transliterator.FORWARD), "caa", "cbd");
  }

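  // Constructing the filter is expected to install the rules' source set as the
  // transliterator's filter, so input outside that set can be skipped.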
  public void testOptimizer() throws Exception {
    String rules = "a > b; b > c;"; // convert a's to b's and b's to c's
    Transliterator custom = Transliterator.createFromRules("test", rules, Transliterator.FORWARD);
    assertNull(custom.getFilter());
    final KeywordTokenizer input = new KeywordTokenizer();
    input.setReader(new StringReader(""));
    new ICUTransformFilter(input, custom);
    assertEquals(new UnicodeSet("[ab]"), custom.getFilter());
  }

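  // Sanity check that a compound transform ("Traditional-Simplified; CaseFold") still works.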
  public void testOptimizer2() throws Exception {
    checkToken(Transliterator.getInstance("Traditional-Simplified; CaseFold"), "ABCDE", "abcde");
  }

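  // Same optimization with a supplementary code point: the computed filter should
  // contain the full code point, not its surrogate halves.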
  public void testOptimizerSurrogate() throws Exception {
    String rules = "\\U00020087 > x;"; // convert CJK UNIFIED IDEOGRAPH-20087 to an x
    Transliterator custom = Transliterator.createFromRules("test", rules, Transliterator.FORWARD);
    assertNull(custom.getFilter());
    final KeywordTokenizer input = new KeywordTokenizer();
    input.setReader(new StringReader(""));
    new ICUTransformFilter(input, custom);
    assertEquals(new UnicodeSet("[\\U00020087]"), custom.getFilter());
  }

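  /** Applies {@code transform} to a single keyword token and checks the resulting term. */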
  private void checkToken(Transliterator transform, String input, String expected)
      throws IOException {
    final KeywordTokenizer input1 = new KeywordTokenizer();
    input1.setReader(new StringReader(input));
    TokenStream ts = new ICUTransformFilter(input1, transform);
    assertTokenStreamContents(ts, new String[] {expected});
  }

  /** blast some random strings through the analyzer */
  public void testRandomStrings() throws Exception {
    final Transliterator transform = Transliterator.getInstance("Any-Latin");
    Analyzer a =
        new Analyzer() {
          @Override
          protected TokenStreamComponents createComponents(String fieldName) {
            Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
            return new TokenStreamComponents(
                tokenizer, new ICUTransformFilter(tokenizer, transform));
          }
        };
    checkRandomData(random(), a, 200 * RANDOM_MULTIPLIER);
    a.close();
  }

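  // Zero-length terms should pass through the filter unchanged.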
  public void testEmptyTerm() throws IOException {
    Analyzer a =
        new Analyzer() {
          @Override
          protected TokenStreamComponents createComponents(String fieldName) {
            Tokenizer tokenizer = new KeywordTokenizer();
            return new TokenStreamComponents(
                tokenizer,
                new ICUTransformFilter(tokenizer, Transliterator.getInstance("Any-Latin")));
          }
        };
    checkOneTerm(a, "", "");
    a.close();
  }
}