| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| |
| package org.apache.lucene.analysis.cjk; |
| |
| import java.io.IOException; |
| import java.io.Reader; |
| import java.io.StringReader; |
| import org.apache.lucene.analysis.Analyzer; |
| import org.apache.lucene.analysis.BaseTokenStreamTestCase; |
| import org.apache.lucene.analysis.CharFilter; |
| import org.apache.lucene.analysis.MockTokenizer; |
| import org.apache.lucene.analysis.TokenStream; |
| import org.apache.lucene.analysis.Tokenizer; |
| |
| public class TestCJKWidthCharFilter extends BaseTokenStreamTestCase { |
| /** Full-width ASCII forms normalized to half-width (basic latin) */ |
| public void testFullWidthASCII() throws IOException { |
| CharFilter reader = new CJKWidthCharFilter(new StringReader("Test 1234")); |
| TokenStream ts = whitespaceMockTokenizer(reader); |
| assertTokenStreamContents( |
| ts, new String[] {"Test", "1234"}, new int[] {0, 5}, new int[] {4, 9}, 9); |
| } |
| |
| /** |
| * Half-width katakana forms normalized to standard katakana. A bit trickier in some cases, since |
| * half-width forms are decomposed and voice marks need to be recombined with a preceding base |
| * form. |
| */ |
| public void testHalfWidthKana() throws IOException { |
| CharFilter reader = new CJKWidthCharFilter(new StringReader("カタカナ")); |
| TokenStream ts = whitespaceMockTokenizer(reader); |
| assertTokenStreamContents(ts, new String[] {"カタカナ"}, new int[] {0}, new int[] {4}, 4); |
| |
| reader = new CJKWidthCharFilter(new StringReader("ヴィッツ")); |
| ts = whitespaceMockTokenizer(reader); |
| assertTokenStreamContents(ts, new String[] {"ヴィッツ"}, new int[] {0}, new int[] {5}, 5); |
| |
| reader = new CJKWidthCharFilter(new StringReader("パナソニック")); |
| ts = whitespaceMockTokenizer(reader); |
| assertTokenStreamContents(ts, new String[] {"パナソニック"}, new int[] {0}, new int[] {7}, 7); |
| |
| reader = new CJKWidthCharFilter(new StringReader("ヴィッツ パナソニック")); |
| ts = whitespaceMockTokenizer(reader); |
| assertTokenStreamContents( |
| ts, new String[] {"ヴィッツ", "パナソニック"}, new int[] {0, 6}, new int[] {5, 13}, 13); |
| } |
| |
| /** Input may contain orphan voiced marks that cannot be combined with the previous character. */ |
| public void testOrphanVoiceMark() throws Exception { |
| CharFilter reader = new CJKWidthCharFilter(new StringReader("ア゙ィッツ")); |
| TokenStream ts = whitespaceMockTokenizer(reader); |
| assertTokenStreamContents(ts, new String[] {"ア\u3099ィッツ"}, new int[] {0}, new int[] {5}, 5); |
| |
| reader = new CJKWidthCharFilter(new StringReader("゙ィッツ")); |
| ts = whitespaceMockTokenizer(reader); |
| assertTokenStreamContents(ts, new String[] {"\u3099ィッツ"}, new int[] {0}, new int[] {4}, 4); |
| |
| reader = new CJKWidthCharFilter(new StringReader("ア゚ナソニック")); |
| ts = whitespaceMockTokenizer(reader); |
| assertTokenStreamContents(ts, new String[] {"ア\u309Aナソニック"}, new int[] {0}, new int[] {7}, 7); |
| |
| reader = new CJKWidthCharFilter(new StringReader("゚ナソニック")); |
| ts = whitespaceMockTokenizer(reader); |
| assertTokenStreamContents(ts, new String[] {"\u309Aナソニック"}, new int[] {0}, new int[] {6}, 6); |
| } |
| |
| public void testComplexInput() throws Exception { |
| CharFilter reader = new CJKWidthCharFilter(new StringReader("Test 1234")); |
| TokenStream ts = whitespaceMockTokenizer(reader); |
| assertTokenStreamContents( |
| ts, new String[] {"Test", "1234"}, new int[] {0, 5}, new int[] {4, 9}, 9); |
| |
| reader = new CJKWidthCharFilter(new StringReader("カタカナ ヴィッツ パナソニック")); |
| ts = whitespaceMockTokenizer(reader); |
| assertTokenStreamContents( |
| ts, |
| new String[] {"カタカナ", "ヴィッツ", "パナソニック"}, |
| new int[] {0, 5, 11}, |
| new int[] {4, 10, 18}, |
| 18); |
| } |
| |
| public void testEmptyInput() throws Exception { |
| CharFilter reader = new CJKWidthCharFilter(new StringReader("")); |
| TokenStream ts = whitespaceMockTokenizer(reader); |
| assertTokenStreamContents(ts, new String[] {}); |
| } |
| |
| public void testRandom() throws Exception { |
| Analyzer analyzer = |
| new Analyzer() { |
| @Override |
| protected TokenStreamComponents createComponents(String fieldName) { |
| Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false); |
| return new TokenStreamComponents(tokenizer, tokenizer); |
| } |
| |
| @Override |
| protected Reader initReader(String fieldName, Reader reader) { |
| return new CJKWidthCharFilter(reader); |
| } |
| }; |
| int numRounds = RANDOM_MULTIPLIER * 1000; |
| checkRandomData(random(), analyzer, numRounds); |
| analyzer.close(); |
| } |
| } |