blob: fa77b4000790897dbad9ffc070a325a780426712 [file] [log] [blame]
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.analysis.charfilter;
import java.io.Reader;
import java.io.StringReader;
import java.util.HashSet;
import java.util.Set;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.CharReader;
import org.apache.lucene.analysis.CharStream;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.util._TestUtil;
import org.junit.Ignore;
public class TestMappingCharFilter extends BaseTokenStreamTestCase {
NormalizeCharMap normMap;
@Override
public void setUp() throws Exception {
super.setUp();
normMap = new NormalizeCharMap();
normMap.add( "aa", "a" );
normMap.add( "bbb", "b" );
normMap.add( "cccc", "cc" );
normMap.add( "h", "i" );
normMap.add( "j", "jj" );
normMap.add( "k", "kkk" );
normMap.add( "ll", "llll" );
normMap.add( "empty", "" );
}
public void testReaderReset() throws Exception {
CharStream cs = new MappingCharFilter( normMap, new StringReader( "x" ) );
char[] buf = new char[10];
int len = cs.read(buf, 0, 10);
assertEquals( 1, len );
assertEquals( 'x', buf[0]) ;
len = cs.read(buf, 0, 10);
assertEquals( -1, len );
// rewind
cs.reset();
len = cs.read(buf, 0, 10);
assertEquals( 1, len );
assertEquals( 'x', buf[0]) ;
}
public void testNothingChange() throws Exception {
CharStream cs = new MappingCharFilter( normMap, new StringReader( "x" ) );
TokenStream ts = new MockTokenizer(cs, MockTokenizer.WHITESPACE, false);
assertTokenStreamContents(ts, new String[]{"x"}, new int[]{0}, new int[]{1}, 1);
}
public void test1to1() throws Exception {
CharStream cs = new MappingCharFilter( normMap, new StringReader( "h" ) );
TokenStream ts = new MockTokenizer(cs, MockTokenizer.WHITESPACE, false);
assertTokenStreamContents(ts, new String[]{"i"}, new int[]{0}, new int[]{1}, 1);
}
public void test1to2() throws Exception {
CharStream cs = new MappingCharFilter( normMap, new StringReader( "j" ) );
TokenStream ts = new MockTokenizer(cs, MockTokenizer.WHITESPACE, false);
assertTokenStreamContents(ts, new String[]{"jj"}, new int[]{0}, new int[]{1}, 1);
}
public void test1to3() throws Exception {
CharStream cs = new MappingCharFilter( normMap, new StringReader( "k" ) );
TokenStream ts = new MockTokenizer(cs, MockTokenizer.WHITESPACE, false);
assertTokenStreamContents(ts, new String[]{"kkk"}, new int[]{0}, new int[]{1}, 1);
}
public void test2to4() throws Exception {
CharStream cs = new MappingCharFilter( normMap, new StringReader( "ll" ) );
TokenStream ts = new MockTokenizer(cs, MockTokenizer.WHITESPACE, false);
assertTokenStreamContents(ts, new String[]{"llll"}, new int[]{0}, new int[]{2}, 2);
}
public void test2to1() throws Exception {
CharStream cs = new MappingCharFilter( normMap, new StringReader( "aa" ) );
TokenStream ts = new MockTokenizer(cs, MockTokenizer.WHITESPACE, false);
assertTokenStreamContents(ts, new String[]{"a"}, new int[]{0}, new int[]{2}, 2);
}
public void test3to1() throws Exception {
CharStream cs = new MappingCharFilter( normMap, new StringReader( "bbb" ) );
TokenStream ts = new MockTokenizer(cs, MockTokenizer.WHITESPACE, false);
assertTokenStreamContents(ts, new String[]{"b"}, new int[]{0}, new int[]{3}, 3);
}
public void test4to2() throws Exception {
CharStream cs = new MappingCharFilter( normMap, new StringReader( "cccc" ) );
TokenStream ts = new MockTokenizer(cs, MockTokenizer.WHITESPACE, false);
assertTokenStreamContents(ts, new String[]{"cc"}, new int[]{0}, new int[]{4}, 4);
}
public void test5to0() throws Exception {
CharStream cs = new MappingCharFilter( normMap, new StringReader( "empty" ) );
TokenStream ts = new MockTokenizer(cs, MockTokenizer.WHITESPACE, false);
assertTokenStreamContents(ts, new String[0], new int[]{}, new int[]{}, 5);
}
//
// 1111111111222
// 01234567890123456789012
//(in) h i j k ll cccc bbb aa
//
// 1111111111222
// 01234567890123456789012
//(out) i i jj kkk llll cc b a
//
// h, 0, 1 => i, 0, 1
// i, 2, 3 => i, 2, 3
// j, 4, 5 => jj, 4, 5
// k, 6, 7 => kkk, 6, 7
// ll, 8,10 => llll, 8,10
// cccc,11,15 => cc,11,15
// bbb,16,19 => b,16,19
// aa,20,22 => a,20,22
//
public void testTokenStream() throws Exception {
String testString = "h i j k ll cccc bbb aa";
CharStream cs = new MappingCharFilter( normMap, CharReader.get( new StringReader( testString ) ) );
TokenStream ts = new MockTokenizer(cs, MockTokenizer.WHITESPACE, false);
assertTokenStreamContents(ts,
new String[]{"i","i","jj","kkk","llll","cc","b","a"},
new int[]{0,2,4,6,8,11,16,20},
new int[]{1,3,5,7,10,15,19,22},
testString.length()
);
}
//
//
// 0123456789
//(in) aaaa ll h
//(out-1) aa llll i
//(out-2) a llllllll i
//
// aaaa,0,4 => a,0,4
// ll,5,7 => llllllll,5,7
// h,8,9 => i,8,9
public void testChained() throws Exception {
String testString = "aaaa ll h";
CharStream cs = new MappingCharFilter( normMap,
new MappingCharFilter( normMap, CharReader.get( new StringReader( testString ) ) ) );
TokenStream ts = new MockTokenizer(cs, MockTokenizer.WHITESPACE, false);
assertTokenStreamContents(ts,
new String[]{"a","llllllll","i"},
new int[]{0,5,8},
new int[]{4,7,9},
testString.length()
);
}
public void testRandom() throws Exception {
Analyzer analyzer = new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
return new TokenStreamComponents(tokenizer, tokenizer);
}
@Override
protected Reader initReader(Reader reader) {
return new MappingCharFilter(normMap, CharReader.get(reader));
}
};
int numRounds = RANDOM_MULTIPLIER * 10000;
checkRandomData(random, analyzer, numRounds);
}
@Ignore("wrong finalOffset: https://issues.apache.org/jira/browse/LUCENE-3971")
public void testFinalOffsetSpecialCase() throws Exception {
final NormalizeCharMap map = new NormalizeCharMap();
map.add("t", "");
// even though this below rule has no effect, the test passes if you remove it!!
map.add("tmakdbl", "c");
Analyzer analyzer = new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
return new TokenStreamComponents(tokenizer, tokenizer);
}
@Override
protected Reader initReader(Reader reader) {
return new MappingCharFilter(map, CharReader.get(reader));
}
};
String text = "gzw f quaxot";
checkAnalysisConsistency(random, analyzer, false, text);
}
@Ignore("wrong finalOffset: https://issues.apache.org/jira/browse/LUCENE-3971")
public void testRandomMaps() throws Exception {
for (int i = 0; i < 100; i++) {
final NormalizeCharMap map = randomMap();
Analyzer analyzer = new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
return new TokenStreamComponents(tokenizer, tokenizer);
}
@Override
protected Reader initReader(Reader reader) {
return new MappingCharFilter(map, CharReader.get(reader));
}
};
int numRounds = RANDOM_MULTIPLIER * 100;
checkRandomData(random, analyzer, numRounds);
}
}
private NormalizeCharMap randomMap() {
NormalizeCharMap map = new NormalizeCharMap();
// we can't add duplicate keys, or NormalizeCharMap gets angry
Set<String> keys = new HashSet<String>();
int num = random.nextInt(5);
//System.out.println("NormalizeCharMap=");
for (int i = 0; i < num; i++) {
String key = _TestUtil.randomSimpleString(random);
if (!keys.contains(key)) {
String value = _TestUtil.randomSimpleString(random);
map.add(key, value);
keys.add(key);
//System.out.println("mapping: '" + key + "' => '" + value + "'");
}
}
return map;
}
}