/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.analysis.charfilter;
import java.io.Reader;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Random;
import java.util.Set;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.CharFilter;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.util.TestUtil;
import org.apache.lucene.util.UnicodeUtil;
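/** Tests {@link MappingCharFilter} and the {@link NormalizeCharMap} it is built from. */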
public class TestMappingCharFilter extends BaseTokenStreamTestCase {
NormalizeCharMap normMap;
@Override
public void setUp() throws Exception {
super.setUp();
NormalizeCharMap.Builder builder = new NormalizeCharMap.Builder();
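// The rules below cover 1:1, 1:n, n:1, n:m, and n:0 (deletion) replacements;
// MappingCharFilter greedily applies the longest rule matching at each position
// (the brute-force check in testRandomMaps2 below relies on the same greedy rule).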
builder.add( "aa", "a" );
builder.add( "bbb", "b" );
builder.add( "cccc", "cc" );
builder.add( "h", "i" );
builder.add( "j", "jj" );
builder.add( "k", "kkk" );
builder.add( "ll", "llll" );
builder.add( "empty", "" );
// non-BMP (requires a surrogate pair):
builder.add(UnicodeUtil.newString(new int[] {0x1D122}, 0, 1), "fclef");
builder.add("\uff01", "full-width-exclamation");
normMap = builder.build();
}
public void testReaderReset() throws Exception {
CharFilter cs = new MappingCharFilter( normMap, new StringReader( "x" ) );
char[] buf = new char[10];
int len = cs.read(buf, 0, 10);
assertEquals( 1, len );
assertEquals( 'x', buf[0] );
len = cs.read(buf, 0, 10);
assertEquals( -1, len );
// rewind
cs.reset();
len = cs.read(buf, 0, 10);
assertEquals( 1, len );
assertEquals( 'x', buf[0] );
}
public void testNothingChange() throws Exception {
CharFilter cs = new MappingCharFilter( normMap, new StringReader( "x" ) );
TokenStream ts = whitespaceMockTokenizer(cs);
assertTokenStreamContents(ts, new String[]{"x"}, new int[]{0}, new int[]{1}, 1);
}
public void test1to1() throws Exception {
CharFilter cs = new MappingCharFilter( normMap, new StringReader( "h" ) );
TokenStream ts = whitespaceMockTokenizer(cs);
assertTokenStreamContents(ts, new String[]{"i"}, new int[]{0}, new int[]{1}, 1);
}
public void test1to2() throws Exception {
CharFilter cs = new MappingCharFilter( normMap, new StringReader( "j" ) );
TokenStream ts = whitespaceMockTokenizer(cs);
assertTokenStreamContents(ts, new String[]{"jj"}, new int[]{0}, new int[]{1}, 1);
}
public void test1to3() throws Exception {
CharFilter cs = new MappingCharFilter( normMap, new StringReader( "k" ) );
TokenStream ts = whitespaceMockTokenizer(cs);
assertTokenStreamContents(ts, new String[]{"kkk"}, new int[]{0}, new int[]{1}, 1);
}
public void test2to4() throws Exception {
CharFilter cs = new MappingCharFilter( normMap, new StringReader( "ll" ) );
TokenStream ts = whitespaceMockTokenizer(cs);
assertTokenStreamContents(ts, new String[]{"llll"}, new int[]{0}, new int[]{2}, 2);
}
public void test2to1() throws Exception {
CharFilter cs = new MappingCharFilter( normMap, new StringReader( "aa" ) );
TokenStream ts = whitespaceMockTokenizer(cs);
assertTokenStreamContents(ts, new String[]{"a"}, new int[]{0}, new int[]{2}, 2);
}
public void test3to1() throws Exception {
CharFilter cs = new MappingCharFilter( normMap, new StringReader( "bbb" ) );
TokenStream ts = whitespaceMockTokenizer(cs);
assertTokenStreamContents(ts, new String[]{"b"}, new int[]{0}, new int[]{3}, 3);
}
public void test4to2() throws Exception {
CharFilter cs = new MappingCharFilter( normMap, new StringReader( "cccc" ) );
TokenStream ts = whitespaceMockTokenizer(cs);
assertTokenStreamContents(ts, new String[]{"cc"}, new int[]{0}, new int[]{4}, 4);
}
public void test5to0() throws Exception {
CharFilter cs = new MappingCharFilter( normMap, new StringReader( "empty" ) );
TokenStream ts = whitespaceMockTokenizer(cs);
assertTokenStreamContents(ts, new String[0], new int[]{}, new int[]{}, 5);
}
public void testNonBMPChar() throws Exception {
CharFilter cs = new MappingCharFilter( normMap, new StringReader( UnicodeUtil.newString(new int[] {0x1D122}, 0, 1) ) );
TokenStream ts = whitespaceMockTokenizer(cs);
assertTokenStreamContents(ts, new String[]{"fclef"}, new int[]{0}, new int[]{2}, 2);
}
public void testFullWidthChar() throws Exception {
CharFilter cs = new MappingCharFilter( normMap, new StringReader( "\uff01") );
TokenStream ts = whitespaceMockTokenizer(cs);
assertTokenStreamContents(ts, new String[]{"full-width-exclamation"}, new int[]{0}, new int[]{1}, 1);
}
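// A minimal illustrative sketch (not part of the original suite) of the
// CharFilter.correctOffset() API that the offset diagrams below rely on:
// it maps an offset in the filtered output back to the corresponding offset
// in the original input. The method name here is hypothetical.
public void testCorrectOffsetSketch() throws Exception {
CharFilter cs = new MappingCharFilter( normMap, new StringReader( "aa" ) );
char[] buf = new char[10];
assertEquals( 1, cs.read(buf, 0, 10) ); // "aa" collapses to "a"
// Output offset 1 (end of "a") corrects back to input offset 2 (end of
// "aa"), matching the endOffset asserted in test2to1:
assertEquals( 2, cs.correctOffset(1) );
assertEquals( 0, cs.correctOffset(0) );
}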
//
//                1111111111222
//      01234567890123456789012
//(in)  h i j k ll cccc bbb aa
//
//                1111111111222
//      01234567890123456789012
//(out) i i jj kkk llll cc b a
//
//    h, 0, 1 =>    i, 0, 1
//    i, 2, 3 =>    i, 2, 3
//    j, 4, 5 =>   jj, 4, 5
//    k, 6, 7 =>  kkk, 6, 7
//   ll, 8,10 => llll, 8,10
// cccc,11,15 =>   cc,11,15
//  bbb,16,19 =>    b,16,19
//   aa,20,22 =>    a,20,22
//
public void testTokenStream() throws Exception {
String testString = "h i j k ll cccc bbb aa";
CharFilter cs = new MappingCharFilter( normMap, new StringReader( testString ) );
TokenStream ts = whitespaceMockTokenizer(cs);
assertTokenStreamContents(ts,
new String[]{"i","i","jj","kkk","llll","cc","b","a"},
new int[]{0,2,4,6,8,11,16,20},
new int[]{1,3,5,7,10,15,19,22},
testString.length()
);
}
//
//        0123456789
//(in)    aaaa ll h
//(out-1) aa llll i
//(out-2) a llllllll i
//
// aaaa,0,4 => a,0,4
//   ll,5,7 => llllllll,5,7
//    h,8,9 => i,8,9
public void testChained() throws Exception {
String testString = "aaaa ll h";
CharFilter cs = new MappingCharFilter( normMap,
new MappingCharFilter( normMap, new StringReader( testString ) ) );
TokenStream ts = whitespaceMockTokenizer(cs);
assertTokenStreamContents(ts,
new String[]{"a","llllllll","i"},
new int[]{0,5,8},
new int[]{4,7,9},
testString.length()
);
}
public void testRandom() throws Exception {
Analyzer analyzer = new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName) {
Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
return new TokenStreamComponents(tokenizer, tokenizer);
}
@Override
protected Reader initReader(String fieldName, Reader reader) {
return new MappingCharFilter(normMap, reader);
}
};
int numRounds = RANDOM_MULTIPLIER * 1000;
checkRandomData(random(), analyzer, numRounds);
analyzer.close();
}
//@Ignore("wrong finalOffset: https://issues.apache.org/jira/browse/LUCENE-3971")
public void testFinalOffsetSpecialCase() throws Exception {
final NormalizeCharMap.Builder builder = new NormalizeCharMap.Builder();
builder.add("t", "");
// even though the rule below has no effect, the test passes if you remove it!!
builder.add("tmakdbl", "c");
final NormalizeCharMap map = builder.build();
Analyzer analyzer = new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName) {
Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
return new TokenStreamComponents(tokenizer, tokenizer);
}
@Override
protected Reader initReader(String fieldName, Reader reader) {
return new MappingCharFilter(map, reader);
}
};
String text = "gzw f quaxot";
checkAnalysisConsistency(random(), analyzer, false, text);
analyzer.close();
}
//@Ignore("wrong finalOffset: https://issues.apache.org/jira/browse/LUCENE-3971")
public void testRandomMaps() throws Exception {
int numIterations = atLeast(3);
for (int i = 0; i < numIterations; i++) {
final NormalizeCharMap map = randomMap();
Analyzer analyzer = new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName) {
Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
return new TokenStreamComponents(tokenizer, tokenizer);
}
@Override
protected Reader initReader(String fieldName, Reader reader) {
return new MappingCharFilter(map, reader);
}
};
int numRounds = 100;
checkRandomData(random(), analyzer, numRounds);
analyzer.close();
}
}
private NormalizeCharMap randomMap() {
Random random = random();
NormalizeCharMap.Builder builder = new NormalizeCharMap.Builder();
// we can't add duplicate keys, or NormalizeCharMap gets angry
Set<String> keys = new HashSet<>();
int num = random.nextInt(5);
//System.out.println("NormalizeCharMap=");
for (int i = 0; i < num; i++) {
String key = TestUtil.randomSimpleString(random);
if (!keys.contains(key) && key.length() != 0) {
String value = TestUtil.randomSimpleString(random);
builder.add(key, value);
keys.add(key);
//System.out.println("mapping: '" + key + "' => '" + value + "'");
}
}
return builder.build();
}
public void testRandomMaps2() throws Exception {
final Random random = random();
final int numIterations = atLeast(3);
for(int iter=0;iter<numIterations;iter++) {
if (VERBOSE) {
System.out.println("\nTEST iter=" + iter);
}
final char endLetter = (char) TestUtil.nextInt(random, 'b', 'z');
final Map<String,String> map = new HashMap<>();
final NormalizeCharMap.Builder builder = new NormalizeCharMap.Builder();
final int numMappings = atLeast(5);
if (VERBOSE) {
System.out.println(" mappings:");
}
while (map.size() < numMappings) {
final String key = TestUtil.randomSimpleStringRange(random, 'a', endLetter, 7);
if (key.length() != 0 && !map.containsKey(key)) {
final String value = TestUtil.randomSimpleString(random);
map.put(key, value);
builder.add(key, value);
if (VERBOSE) {
System.out.println(" " + key + " -> " + value);
}
}
}
final NormalizeCharMap charMap = builder.build();
if (VERBOSE) {
System.out.println(" test random documents...");
}
for(int iter2=0;iter2<100;iter2++) {
final String content = TestUtil.randomSimpleStringRange(random, 'a', endLetter, atLeast(1000));
if (VERBOSE) {
System.out.println(" content=" + content);
}
// Do stupid dog-slow mapping:
// Output string:
final StringBuilder output = new StringBuilder();
// Maps output offset to input offset:
final List<Integer> inputOffsets = new ArrayList<>();
// Net (input length - output length) consumed so far; adding cumDiff to an
// output offset yields the corresponding input offset:
int cumDiff = 0;
int charIdx = 0;
while(charIdx < content.length()) {
int matchLen = -1;
String matchRepl = null;
for(Map.Entry<String,String> ent : map.entrySet()) {
final String match = ent.getKey();
if (charIdx + match.length() <= content.length()) {
final int limit = charIdx+match.length();
boolean matches = true;
for(int charIdx2=charIdx;charIdx2<limit;charIdx2++) {
if (match.charAt(charIdx2-charIdx) != content.charAt(charIdx2)) {
matches = false;
break;
}
}
if (matches) {
final String repl = ent.getValue();
if (match.length() > matchLen) {
// Greedy: longer match wins
matchLen = match.length();
matchRepl = repl;
}
}
}
}
if (matchLen != -1) {
// We found a match here!
if (VERBOSE) {
System.out.println(" match=" + content.substring(charIdx, charIdx+matchLen) + " @ off=" + charIdx + " repl=" + matchRepl);
}
output.append(matchRepl);
final int minLen = Math.min(matchLen, matchRepl.length());
// Common part, directly maps back to input
// offset:
for(int outIdx=0;outIdx<minLen;outIdx++) {
inputOffsets.add(output.length() - matchRepl.length() + outIdx + cumDiff);
}
cumDiff += matchLen - matchRepl.length();
charIdx += matchLen;
if (matchRepl.length() < matchLen) {
// Replacement string is shorter than matched
// input: nothing to do
} else if (matchRepl.length() > matchLen) {
// Replacement string is longer than matched
// input: for all the "extra" chars we map
// back to a single input offset:
for(int outIdx=matchLen;outIdx<matchRepl.length();outIdx++) {
inputOffsets.add(output.length() + cumDiff - 1);
}
} else {
// Same length: no change to offset
}
assert inputOffsets.size() == output.length(): "inputOffsets.size()=" + inputOffsets.size() + " vs output.length()=" + output.length();
} else {
inputOffsets.add(output.length() + cumDiff);
output.append(content.charAt(charIdx));
charIdx++;
}
}
final String expected = output.toString();
if (VERBOSE) {
System.out.print(" expected:");
for(int charIdx2=0;charIdx2<expected.length();charIdx2++) {
System.out.print(" " + expected.charAt(charIdx2) + "/" + inputOffsets.get(charIdx2));
}
System.out.println();
}
final MappingCharFilter mapFilter = new MappingCharFilter(charMap, new StringReader(content));
final StringBuilder actualBuilder = new StringBuilder();
final List<Integer> actualInputOffsets = new ArrayList<>();
// Now consume the actual mapFilter, somewhat randomly:
while (true) {
if (random.nextBoolean()) {
final int ch = mapFilter.read();
if (ch == -1) {
break;
}
actualBuilder.append((char) ch);
} else {
final char[] buffer = new char[TestUtil.nextInt(random, 1, 100)];
final int off = buffer.length == 1 ? 0 : random.nextInt(buffer.length-1);
final int count = mapFilter.read(buffer, off, buffer.length-off);
if (count == -1) {
break;
} else {
actualBuilder.append(buffer, off, count);
}
}
if (random.nextInt(10) == 7) {
// Map offsets
while(actualInputOffsets.size() < actualBuilder.length()) {
actualInputOffsets.add(mapFilter.correctOffset(actualInputOffsets.size()));
}
}
}
// Finish mapping offsets
while(actualInputOffsets.size() < actualBuilder.length()) {
actualInputOffsets.add(mapFilter.correctOffset(actualInputOffsets.size()));
}
final String actual = actualBuilder.toString();
// Verify:
assertEquals(expected, actual);
assertEquals(inputOffsets, actualInputOffsets);
}
}
}
}