/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.analysis.charfilter;
import java.io.Reader;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Random;
import java.util.Set;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.CharFilter;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.util.TestUtil;
import org.apache.lucene.util.UnicodeUtil;
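/** Tests {@link MappingCharFilter} and the {@link NormalizeCharMap} it is built from. */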
public class TestMappingCharFilter extends BaseTokenStreamTestCase {
NormalizeCharMap normMap;
@Override
public void setUp() throws Exception {
super.setUp();
NormalizeCharMap.Builder builder = new NormalizeCharMap.Builder();
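// The rules below cover 1:1, 1:n, n:1, n:m, and n:0 (deletion) replacements;
// MappingCharFilter greedily applies the longest rule matching at each position
// (the brute-force check in testRandomMaps2 below relies on the same greedy rule).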
builder.add( "aa", "a" );
builder.add( "bbb", "b" );
builder.add( "cccc", "cc" );
builder.add( "h", "i" );
builder.add( "j", "jj" );
builder.add( "k", "kkk" );
builder.add( "ll", "llll" );
builder.add( "empty", "" );
// non-BMP (requires a surrogate pair):
builder.add(UnicodeUtil.newString(new int[] {0x1D122}, 0, 1), "fclef");
builder.add("\uff01", "full-width-exclamation");
normMap = builder.build();
}
public void testReaderReset() throws Exception {
CharFilter cs = new MappingCharFilter( normMap, new StringReader( "x" ) );
char[] buf = new char[10];
int len = cs.read(buf, 0, 10);
assertEquals( 1, len );
assertEquals( 'x', buf[0] );
len = cs.read(buf, 0, 10);
assertEquals( -1, len );
// rewind
cs.reset();
len = cs.read(buf, 0, 10);
assertEquals( 1, len );
assertEquals( 'x', buf[0] );
}
public void testNothingChange() throws Exception {
CharFilter cs = new MappingCharFilter( normMap, new StringReader( "x" ) );
TokenStream ts = whitespaceMockTokenizer(cs);
assertTokenStreamContents(ts, new String[]{"x"}, new int[]{0}, new int[]{1}, 1);
}
public void test1to1() throws Exception {
CharFilter cs = new MappingCharFilter( normMap, new StringReader( "h" ) );
TokenStream ts = whitespaceMockTokenizer(cs);
assertTokenStreamContents(ts, new String[]{"i"}, new int[]{0}, new int[]{1}, 1);
}
public void test1to2() throws Exception {
CharFilter cs = new MappingCharFilter( normMap, new StringReader( "j" ) );
TokenStream ts = whitespaceMockTokenizer(cs);
assertTokenStreamContents(ts, new String[]{"jj"}, new int[]{0}, new int[]{1}, 1);
}
public void test1to3() throws Exception {
CharFilter cs = new MappingCharFilter( normMap, new StringReader( "k" ) );
TokenStream ts = whitespaceMockTokenizer(cs);
assertTokenStreamContents(ts, new String[]{"kkk"}, new int[]{0}, new int[]{1}, 1);
}
public void test2to4() throws Exception {
CharFilter cs = new MappingCharFilter( normMap, new StringReader( "ll" ) );
TokenStream ts = whitespaceMockTokenizer(cs);
assertTokenStreamContents(ts, new String[]{"llll"}, new int[]{0}, new int[]{2}, 2);
}
public void test2to1() throws Exception {
CharFilter cs = new MappingCharFilter( normMap, new StringReader( "aa" ) );
TokenStream ts = whitespaceMockTokenizer(cs);
assertTokenStreamContents(ts, new String[]{"a"}, new int[]{0}, new int[]{2}, 2);
}
public void test3to1() throws Exception {
CharFilter cs = new MappingCharFilter( normMap, new StringReader( "bbb" ) );
TokenStream ts = whitespaceMockTokenizer(cs);
assertTokenStreamContents(ts, new String[]{"b"}, new int[]{0}, new int[]{3}, 3);
}
public void test4to2() throws Exception {
CharFilter cs = new MappingCharFilter( normMap, new StringReader( "cccc" ) );
TokenStream ts = whitespaceMockTokenizer(cs);
assertTokenStreamContents(ts, new String[]{"cc"}, new int[]{0}, new int[]{4}, 4);
}
public void test5to0() throws Exception {
CharFilter cs = new MappingCharFilter( normMap, new StringReader( "empty" ) );
TokenStream ts = whitespaceMockTokenizer(cs);
assertTokenStreamContents(ts, new String[0], new int[]{}, new int[]{}, 5);
}
public void testNonBMPChar() throws Exception {
CharFilter cs = new MappingCharFilter( normMap, new StringReader( UnicodeUtil.newString(new int[] {0x1D122}, 0, 1) ) );
TokenStream ts = whitespaceMockTokenizer(cs);
assertTokenStreamContents(ts, new String[]{"fclef"}, new int[]{0}, new int[]{2}, 2);
}
public void testFullWidthChar() throws Exception {
CharFilter cs = new MappingCharFilter( normMap, new StringReader( "\uff01") );
TokenStream ts = whitespaceMockTokenizer(cs);
assertTokenStreamContents(ts, new String[]{"full-width-exclamation"}, new int[]{0}, new int[]{1}, 1);
}
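// A minimal illustrative sketch (not part of the original suite) of the
// CharFilter.correctOffset() API that the offset diagrams below rely on:
// it maps an offset in the filtered output back to the corresponding offset
// in the original input. The method name here is hypothetical.
public void testCorrectOffsetSketch() throws Exception {
CharFilter cs = new MappingCharFilter( normMap, new StringReader( "aa" ) );
char[] buf = new char[10];
assertEquals( 1, cs.read(buf, 0, 10) ); // "aa" collapses to "a"
// Output offset 1 (end of "a") corrects back to input offset 2 (end of
// "aa"), matching the endOffset asserted in test2to1:
assertEquals( 2, cs.correctOffset(1) );
assertEquals( 0, cs.correctOffset(0) );
}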
//
//                1111111111222
//      01234567890123456789012
//(in)  h i j k ll cccc bbb aa
//
//                1111111111222
//      01234567890123456789012
//(out) i i jj kkk llll cc b a
//
//    h, 0, 1 =>    i, 0, 1
//    i, 2, 3 =>    i, 2, 3
//    j, 4, 5 =>   jj, 4, 5
//    k, 6, 7 =>  kkk, 6, 7
//   ll, 8,10 => llll, 8,10
// cccc,11,15 =>   cc,11,15
//  bbb,16,19 =>    b,16,19
//   aa,20,22 =>    a,20,22
//
public void testTokenStream() throws Exception {
String testString = "h i j k ll cccc bbb aa";
CharFilter cs = new MappingCharFilter( normMap, new StringReader( testString ) );
TokenStream ts = whitespaceMockTokenizer(cs);
assertTokenStreamContents(ts,
new String[]{"i","i","jj","kkk","llll","cc","b","a"},
new int[]{0,2,4,6,8,11,16,20},
new int[]{1,3,5,7,10,15,19,22},
testString.length()
);
}
//
//        0123456789
//(in)    aaaa ll h
//(out-1) aa llll i
//(out-2) a llllllll i
//
// aaaa,0,4 => a,0,4
//   ll,5,7 => llllllll,5,7
//    h,8,9 => i,8,9
public void testChained() throws Exception {
String testString = "aaaa ll h";
CharFilter cs = new MappingCharFilter( normMap,
new MappingCharFilter( normMap, new StringReader( testString ) ) );
TokenStream ts = whitespaceMockTokenizer(cs);
assertTokenStreamContents(ts,
new String[]{"a","llllllll","i"},
new int[]{0,5,8},
new int[]{4,7,9},
testString.length()
);
}
public void testRandom() throws Exception {
Analyzer analyzer = new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName) {
Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
return new TokenStreamComponents(tokenizer, tokenizer);
}
@Override
protected Reader initReader(String fieldName, Reader reader) {
return new MappingCharFilter(normMap, reader);
}
};
int numRounds = RANDOM_MULTIPLIER * 1000;
checkRandomData(random(), analyzer, numRounds);
analyzer.close();
}
//@Ignore("wrong finalOffset: https://issues.apache.org/jira/browse/LUCENE-3971")
public void testFinalOffsetSpecialCase() throws Exception {
final NormalizeCharMap.Builder builder = new NormalizeCharMap.Builder();
builder.add("t", "");
// even though the rule below has no effect, the test passes if you remove it!!
builder.add("tmakdbl", "c");
final NormalizeCharMap map = builder.build();
Analyzer analyzer = new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName) {
Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
return new TokenStreamComponents(tokenizer, tokenizer);
}
@Override
protected Reader initReader(String fieldName, Reader reader) {
return new MappingCharFilter(map, reader);
}
};
String text = "gzw f quaxot";
checkAnalysisConsistency(random(), analyzer, false, text);
analyzer.close();
}
//@Ignore("wrong finalOffset: https://issues.apache.org/jira/browse/LUCENE-3971")
public void testRandomMaps() throws Exception {
int numIterations = atLeast(3);
for (int i = 0; i < numIterations; i++) {
final NormalizeCharMap map = randomMap();
Analyzer analyzer = new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName) {
Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
return new TokenStreamComponents(tokenizer, tokenizer);
}
@Override
protected Reader initReader(String fieldName, Reader reader) {
return new MappingCharFilter(map, reader);
}
};
int numRounds = 100;
checkRandomData(random(), analyzer, numRounds);
analyzer.close();
}
}
private NormalizeCharMap randomMap() {
Random random = random();
NormalizeCharMap.Builder builder = new NormalizeCharMap.Builder();
// we can't add duplicate keys, or NormalizeCharMap gets angry
Set<String> keys = new HashSet<>();
int num = random.nextInt(5);
//System.out.println("NormalizeCharMap=");
for (int i = 0; i < num; i++) {
String key = TestUtil.randomSimpleString(random);
if (!keys.contains(key) && key.length() != 0) {
String value = TestUtil.randomSimpleString(random);
builder.add(key, value);
keys.add(key);
//System.out.println("mapping: '" + key + "' => '" + value + "'");
}
}
return builder.build();
}
public void testRandomMaps2() throws Exception {
final Random random = random();
final int numIterations = atLeast(3);
for(int iter=0;iter<numIterations;iter++) {
if (VERBOSE) {
System.out.println("\nTEST iter=" + iter);
}
final char endLetter = (char) TestUtil.nextInt(random, 'b', 'z');
final Map<String,String> map = new HashMap<>();
final NormalizeCharMap.Builder builder = new NormalizeCharMap.Builder();
final int numMappings = atLeast(5);
if (VERBOSE) {
System.out.println(" mappings:");
}
while (map.size() < numMappings) {
final String key = TestUtil.randomSimpleStringRange(random, 'a', endLetter, 7);
if (key.length() != 0 && !map.containsKey(key)) {
final String value = TestUtil.randomSimpleString(random);
map.put(key, value);
builder.add(key, value);
if (VERBOSE) {
System.out.println(" " + key + " -> " + value);
}
}
}
final NormalizeCharMap charMap = builder.build();
if (VERBOSE) {
System.out.println(" test random documents...");
}
for(int iter2=0;iter2<100;iter2++) {
final String content = TestUtil.randomSimpleStringRange(random, 'a', endLetter, atLeast(1000));
if (VERBOSE) {
System.out.println(" content=" + content);
}
// Do stupid dog-slow mapping:
// Output string:
final StringBuilder output = new StringBuilder();
// Maps output offset to input offset:
final List<Integer> inputOffsets = new ArrayList<>();
// Net (input length - output length) consumed so far; adding cumDiff to an
// output offset yields the corresponding input offset:
int cumDiff = 0;
int charIdx = 0;
while(charIdx < content.length()) {
int matchLen = -1;
String matchRepl = null;
for(Map.Entry<String,String> ent : map.entrySet()) {
final String match = ent.getKey();
if (charIdx + match.length() <= content.length()) {
final int limit = charIdx+match.length();
boolean matches = true;
for(int charIdx2=charIdx;charIdx2<limit;charIdx2++) {
if (match.charAt(charIdx2-charIdx) != content.charAt(charIdx2)) {
matches = false;
break;
}
}
if (matches) {
final String repl = ent.getValue();
if (match.length() > matchLen) {
// Greedy: longer match wins
matchLen = match.length();
matchRepl = repl;
}
}
}
}
if (matchLen != -1) {
// We found a match here!
if (VERBOSE) {
System.out.println(" match=" + content.substring(charIdx, charIdx+matchLen) + " @ off=" + charIdx + " repl=" + matchRepl);
}
output.append(matchRepl);
final int minLen = Math.min(matchLen, matchRepl.length());
// Common part, directly maps back to input
// offset:
for(int outIdx=0;outIdx<minLen;outIdx++) {
inputOffsets.add(output.length() - matchRepl.length() + outIdx + cumDiff);
}
cumDiff += matchLen - matchRepl.length();
charIdx += matchLen;
if (matchRepl.length() < matchLen) {
// Replacement string is shorter than matched
// input: nothing to do
} else if (matchRepl.length() > matchLen) {
// Replacement string is longer than matched
// input: for all the "extra" chars we map
// back to a single input offset:
for(int outIdx=matchLen;outIdx<matchRepl.length();outIdx++) {
inputOffsets.add(output.length() + cumDiff - 1);
}
} else {
// Same length: no change to offset
}
assert inputOffsets.size() == output.length(): "inputOffsets.size()=" + inputOffsets.size() + " vs output.length()=" + output.length();
} else {
inputOffsets.add(output.length() + cumDiff);
output.append(content.charAt(charIdx));
charIdx++;
}
}
final String expected = output.toString();
if (VERBOSE) {
System.out.print(" expected:");
for(int charIdx2=0;charIdx2<expected.length();charIdx2++) {
System.out.print(" " + expected.charAt(charIdx2) + "/" + inputOffsets.get(charIdx2));
}
System.out.println();
}
final MappingCharFilter mapFilter = new MappingCharFilter(charMap, new StringReader(content));
final StringBuilder actualBuilder = new StringBuilder();
final List<Integer> actualInputOffsets = new ArrayList<>();
// Now consume the actual mapFilter, somewhat randomly:
while (true) {
if (random.nextBoolean()) {
final int ch = mapFilter.read();
if (ch == -1) {
break;
}
actualBuilder.append((char) ch);
} else {
final char[] buffer = new char[TestUtil.nextInt(random, 1, 100)];
final int off = buffer.length == 1 ? 0 : random.nextInt(buffer.length-1);
final int count = mapFilter.read(buffer, off, buffer.length-off);
if (count == -1) {
break;
} else {
actualBuilder.append(buffer, off, count);
}
}
if (random.nextInt(10) == 7) {
// Map offsets
while(actualInputOffsets.size() < actualBuilder.length()) {
actualInputOffsets.add(mapFilter.correctOffset(actualInputOffsets.size()));
}
}
}
// Finish mapping offsets
while(actualInputOffsets.size() < actualBuilder.length()) {
actualInputOffsets.add(mapFilter.correctOffset(actualInputOffsets.size()));
}
final String actual = actualBuilder.toString();
// Verify:
assertEquals(expected, actual);
assertEquals(inputOffsets, actualInputOffsets);
}
}
}
}