| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| package org.apache.lucene.analysis.core; |
| |
| import java.io.IOException; |
| import java.io.Reader; |
| import java.io.StringReader; |
| import java.nio.CharBuffer; |
| import java.util.Arrays; |
| import java.util.HashSet; |
| |
| import org.apache.lucene.analysis.Analyzer; |
| import org.apache.lucene.analysis.BaseTokenStreamTestCase; |
| import org.apache.lucene.analysis.CharArraySet; |
| import org.apache.lucene.analysis.CharFilter; |
| import org.apache.lucene.analysis.MockCharFilter; |
| import org.apache.lucene.analysis.MockTokenFilter; |
| import org.apache.lucene.analysis.MockTokenizer; |
| import org.apache.lucene.analysis.TokenFilter; |
| import org.apache.lucene.analysis.TokenStream; |
| import org.apache.lucene.analysis.Tokenizer; |
| import org.apache.lucene.analysis.charfilter.MappingCharFilter; |
| import org.apache.lucene.analysis.charfilter.NormalizeCharMap; |
| import org.apache.lucene.analysis.commongrams.CommonGramsFilter; |
| import org.apache.lucene.analysis.miscellaneous.WordDelimiterFilter; |
| import org.apache.lucene.analysis.ngram.EdgeNGramTokenizer; |
| import org.apache.lucene.analysis.ngram.NGramTokenFilter; |
| import org.apache.lucene.analysis.shingle.ShingleFilter; |
| import org.apache.lucene.analysis.wikipedia.WikipediaTokenizer; |
| import org.apache.lucene.util.LuceneTestCase.SuppressCodecs; |
| |
| |
/**
 * Regression tests capturing specific analysis-chain bugs originally found by
 * randomized testing. Each test pins down one previously-failing combination
 * of char filters, tokenizers and token filters; the strange-looking magic
 * constants and input strings come from the original failing random seeds and
 * must not be "cleaned up", or the regression coverage is lost.
 */
@SuppressCodecs("Direct")
public class TestBugInSomething extends BaseTokenStreamTestCase {
  /**
   * Reproduces a failure in the chain
   * MockCharFilter -&gt; MappingCharFilter -&gt; CheckThatYouDidntReadAnythingReaderWrapper
   * feeding a MockTokenizer followed by a CommonGramsFilter. The common-grams
   * set, char mappings, tokenizer arguments and input string are taken
   * verbatim from the failing seed.
   */
  public void test() throws Exception {
    // Words the CommonGramsFilter treats as "common".
    final CharArraySet cas = new CharArraySet(3, false);
    cas.add("jjp");
    cas.add("wlmwoknt");
    cas.add("tcgyreo");

    // Char-level rewrites applied by the MappingCharFilter before tokenizing.
    final NormalizeCharMap.Builder builder = new NormalizeCharMap.Builder();
    builder.add("mtqlpi", "");
    builder.add("mwoknt", "jjp");
    builder.add("tcgyreo", "zpfpajyws");
    final NormalizeCharMap map = builder.build();

    Analyzer a = new Analyzer() {
      @Override
      protected TokenStreamComponents createComponents(String fieldName) {
        // NOTE(review): the -65 argument looks nonsensical but is the value
        // from the failing random seed (presumably a max token length —
        // confirm against MockTokenizer's constructor). Intentional.
        Tokenizer t = new MockTokenizer(MockTokenFilter.ENGLISH_STOPSET, false, -65);
        TokenFilter f = new CommonGramsFilter(t, cas);
        return new TokenStreamComponents(t, f);
      }

      @Override
      protected Reader initReader(String fieldName, Reader reader) {
        // Stack the char filters in the same order as the failing seed; the
        // outermost wrapper asserts nothing is read before reset().
        reader = new MockCharFilter(reader, 0);
        reader = new MappingCharFilter(map, reader);
        reader = new TestRandomChains.CheckThatYouDidntReadAnythingReaderWrapper(reader);
        return reader;
      }
    };
    checkAnalysisConsistency(random(), a, false, "wmgddzunizdomqyj");
    a.close();
  }

  /**
   * A CharFilter whose every method throws UnsupportedOperationException
   * naming the method that was invoked. {@link #testWrapping} uses it to
   * prove that the wrapper under test delegates each call straight through
   * to the wrapped stream (the exception message identifies which underlying
   * method actually ran).
   */
  CharFilter wrappedStream = new CharFilter(new StringReader("bogus")) {

    @Override
    public void mark(int readAheadLimit) {
      throw new UnsupportedOperationException("mark(int)");
    }

    @Override
    public boolean markSupported() {
      throw new UnsupportedOperationException("markSupported()");
    }

    @Override
    public int read() {
      throw new UnsupportedOperationException("read()");
    }

    @Override
    public int read(char[] cbuf) {
      throw new UnsupportedOperationException("read(char[])");
    }

    @Override
    public int read(CharBuffer target) {
      throw new UnsupportedOperationException("read(CharBuffer)");
    }

    @Override
    public boolean ready() {
      throw new UnsupportedOperationException("ready()");
    }

    @Override
    public void reset() {
      throw new UnsupportedOperationException("reset()");
    }

    @Override
    public long skip(long n) {
      throw new UnsupportedOperationException("skip(long)");
    }

    @Override
    public int correct(int currentOff) {
      throw new UnsupportedOperationException("correct(int)");
    }

    @Override
    public void close() {
      throw new UnsupportedOperationException("close()");
    }

    @Override
    public int read(char[] arg0, int arg1, int arg2) {
      throw new UnsupportedOperationException("read(char[], int, int)");
    }
  };

  /**
   * Verifies that CheckThatYouDidntReadAnythingReaderWrapper delegates every
   * Reader/CharFilter method to the wrapped stream: each call must surface
   * the wrapped stream's exception for exactly that method.
   */
  public void testWrapping() throws Exception {
    CharFilter cs = new TestRandomChains.CheckThatYouDidntReadAnythingReaderWrapper(wrappedStream);
    Exception expected = expectThrows(Exception.class, () -> {
      cs.mark(1);
    });
    assertEquals("mark(int)", expected.getMessage());

    expected = expectThrows(Exception.class, () -> {
      cs.markSupported();
    });
    assertEquals("markSupported()", expected.getMessage());

    expected = expectThrows(Exception.class, () -> {
      cs.read();
    });
    assertEquals("read()", expected.getMessage());

    expected = expectThrows(Exception.class, () -> {
      cs.read(new char[0]);
    });
    assertEquals("read(char[])", expected.getMessage());

    expected = expectThrows(Exception.class, () -> {
      cs.read(CharBuffer.wrap(new char[0]));
    });
    assertEquals("read(CharBuffer)", expected.getMessage());

    expected = expectThrows(Exception.class, () -> {
      cs.reset();
    });
    assertEquals("reset()", expected.getMessage());

    expected = expectThrows(Exception.class, () -> {
      cs.skip(1);
    });
    assertEquals("skip(long)", expected.getMessage());

    // correctOffset must bottom out in the wrapped stream's correct(int).
    expected = expectThrows(Exception.class, () -> {
      cs.correctOffset(1);
    });
    assertEquals("correct(int)", expected.getMessage());

    expected = expectThrows(Exception.class, () -> {
      cs.close();
    });
    assertEquals("close()", expected.getMessage());

    expected = expectThrows(Exception.class, () -> {
      cs.read(new char[0], 0, 0);
    });
    assertEquals("read(char[], int, int)", expected.getMessage());
  }

  // TODO: promote this debugging filter into the shared test framework?

  /**
   * Pass-through token filter that, when VERBOSE, prints each token (and the
   * end/close/reset lifecycle calls) of its upstream to stdout. Purely a
   * debugging aid; it never alters the token stream.
   */
  static final class SopTokenFilter extends TokenFilter {

    SopTokenFilter(TokenStream input) {
      super(input);
    }

    @Override
    public boolean incrementToken() throws IOException {
      if (input.incrementToken()) {
        if (VERBOSE) System.out.println(input.getClass().getSimpleName() + "->" + this.reflectAsString(false));
        return true;
      } else {
        return false;
      }
    }

    @Override
    public void end() throws IOException {
      super.end();
      if (VERBOSE) System.out.println(input.getClass().getSimpleName() + ".end()");
    }

    @Override
    public void close() throws IOException {
      super.close();
      if (VERBOSE) System.out.println(input.getClass().getSimpleName() + ".close()");
    }

    @Override
    public void reset() throws IOException {
      super.reset();
      if (VERBOSE) System.out.println(input.getClass().getSimpleName() + ".reset()");
    }
  }

  // LUCENE-5269: regression test for EdgeNGramTokenizer -> ShingleFilter ->
  // NGramTokenFilter on random (including non-BMP) unicode input.
  @Nightly
  public void testUnicodeShinglesAndNgrams() throws Exception {
    Analyzer analyzer = new Analyzer() {
      @Override
      protected TokenStreamComponents createComponents(String fieldName) {
        Tokenizer tokenizer = new EdgeNGramTokenizer(2, 94);
        //TokenStream stream = new SopTokenFilter(tokenizer);
        TokenStream stream = new ShingleFilter(tokenizer, 5);
        //stream = new SopTokenFilter(stream);
        stream = new NGramTokenFilter(stream, 55, 83, false);
        //stream = new SopTokenFilter(stream);
        return new TokenStreamComponents(tokenizer, stream);
      }
    };
    checkRandomData(random(), analyzer, 2000);
    analyzer.close();
  }

  /**
   * Reproduces a failure of WikipediaTokenizer + WordDelimiterFilter on an
   * input mixing non-BMP characters with wiki markup. The byte table and
   * protected-word set are taken verbatim from the failing random seed.
   */
  public void testCuriousWikipediaString() throws Exception {
    final CharArraySet protWords = new CharArraySet(new HashSet<>(
        Arrays.asList("rrdpafa", "pupmmlu", "xlq", "dyy", "zqrxrrck", "o", "hsrlfvcha")), false);
    // NOTE(review): randomly-generated per-character type table passed to
    // WordDelimiterFilter — the values are arbitrary by design.
    final byte table[] = new byte[] {
        -57, 26, 1, 48, 63, -23, 55, -84, 18, 120, -97, 103, 58, 13, 84, 89, 57, -13, -63,
        5, 28, 97, -54, -94, 102, -108, -5, 5, 46, 40, 43, 78, 43, -72, 36, 29, 124, -106,
        -22, -51, 65, 5, 31, -42, 6, -99, 97, 14, 81, -128, 74, 100, 54, -55, -25, 53, -71,
        -98, 44, 33, 86, 106, -42, 47, 115, -89, -18, -26, 22, -95, -43, 83, -125, 105, -104,
        -24, 106, -16, 126, 115, -105, 97, 65, -33, 57, 44, -1, 123, -68, 100, 13, -41, -64,
        -119, 0, 92, 94, -36, 53, -9, -102, -18, 90, 94, -26, 31, 71, -20
    };
    Analyzer a = new Analyzer() {
      @Override
      protected TokenStreamComponents createComponents(String fieldName) {
        Tokenizer tokenizer = new WikipediaTokenizer();
        TokenStream stream = new SopTokenFilter(tokenizer);
        // NOTE(review): -50 is the configuration-flags int from the failing
        // seed — presumably a random flag combination; confirm against
        // WordDelimiterFilter's constructor. Intentional.
        stream = new WordDelimiterFilter(stream, table, -50, protWords);
        stream = new SopTokenFilter(stream);
        return new TokenStreamComponents(tokenizer, stream);
      }
    };
    checkAnalysisConsistency(random(), a, false, "B\u28c3\ue0f8[ \ud800\udfc2 </p> jb");
    a.close();
  }
}