/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.analysis.core;

import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;
import java.nio.CharBuffer;
import java.util.Arrays;
import java.util.HashSet;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.CharFilter;
import org.apache.lucene.analysis.MockCharFilter;
import org.apache.lucene.analysis.MockTokenFilter;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.charfilter.MappingCharFilter;
import org.apache.lucene.analysis.charfilter.NormalizeCharMap;
import org.apache.lucene.analysis.commongrams.CommonGramsFilter;
import org.apache.lucene.analysis.miscellaneous.WordDelimiterFilter;
import org.apache.lucene.analysis.ngram.EdgeNGramTokenizer;
import org.apache.lucene.analysis.ngram.NGramTokenFilter;
import org.apache.lucene.analysis.shingle.ShingleFilter;
import org.apache.lucene.analysis.wikipedia.WikipediaTokenizer;
import org.apache.lucene.util.LuceneTestCase.SuppressCodecs;
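
/**
 * Reproductions of analysis-chain bugs, mostly pinned down from randomized runs
 * (see {@link TestRandomChains}); each test replays one problematic chain and input.
 */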
@SuppressCodecs("Direct")
public class TestBugInSomething extends BaseTokenStreamTestCase {
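  // Likely minimized from a failing random chain: MockCharFilter + MappingCharFilter
  // under a MockTokenizer + CommonGramsFilter, verified with checkAnalysisConsistency.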
  public void test() throws Exception {
    final CharArraySet cas = new CharArraySet(3, false);
    cas.add("jjp");
    cas.add("wlmwoknt");
    cas.add("tcgyreo");

    final NormalizeCharMap.Builder builder = new NormalizeCharMap.Builder();
    builder.add("mtqlpi", "");
    builder.add("mwoknt", "jjp");
    builder.add("tcgyreo", "zpfpajyws");
    final NormalizeCharMap map = builder.build();

    Analyzer a =
        new Analyzer() {
          @Override
          protected TokenStreamComponents createComponents(String fieldName) {
            Tokenizer t = new MockTokenizer(MockTokenFilter.ENGLISH_STOPSET, false, -65);
            TokenFilter f = new CommonGramsFilter(t, cas);
            return new TokenStreamComponents(t, f);
          }

          @Override
          protected Reader initReader(String fieldName, Reader reader) {
            reader = new MockCharFilter(reader, 0);
            reader = new MappingCharFilter(map, reader);
            reader = new TestRandomChains.CheckThatYouDidntReadAnythingReaderWrapper(reader);
            return reader;
          }
        };
    checkAnalysisConsistency(random(), a, false, "wmgddzunizdomqyj");
    a.close();
  }
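
  // A Reader whose every method throws, so that delegation (or any accidental read)
  // by a wrapping CharFilter is immediately visible in testWrapping below.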
  CharFilter wrappedStream =
      new CharFilter(new StringReader("bogus")) {
        @Override
        public void mark(int readAheadLimit) {
          throw new UnsupportedOperationException("mark(int)");
        }

        @Override
        public boolean markSupported() {
          throw new UnsupportedOperationException("markSupported()");
        }

        @Override
        public int read() {
          throw new UnsupportedOperationException("read()");
        }

        @Override
        public int read(char[] cbuf) {
          throw new UnsupportedOperationException("read(char[])");
        }

        @Override
        public int read(CharBuffer target) {
          throw new UnsupportedOperationException("read(CharBuffer)");
        }

        @Override
        public boolean ready() {
          throw new UnsupportedOperationException("ready()");
        }

        @Override
        public void reset() {
          throw new UnsupportedOperationException("reset()");
        }

        @Override
        public long skip(long n) {
          throw new UnsupportedOperationException("skip(long)");
        }

        @Override
        public int correct(int currentOff) {
          throw new UnsupportedOperationException("correct(int)");
        }

        @Override
        public void close() {
          throw new UnsupportedOperationException("close()");
        }

        @Override
        public int read(char[] cbuf, int off, int len) {
          throw new UnsupportedOperationException("read(char[], int, int)");
        }
      };
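
  // Every call on the wrapper must reach the delegate above; the delegate's
  // UnsupportedOperationException message names the method that was invoked.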
  public void testWrapping() throws Exception {
    CharFilter cs =
        new TestRandomChains.CheckThatYouDidntReadAnythingReaderWrapper(wrappedStream);
    Exception expected = expectThrows(Exception.class, () -> cs.mark(1));
    assertEquals("mark(int)", expected.getMessage());

    expected = expectThrows(Exception.class, () -> cs.markSupported());
    assertEquals("markSupported()", expected.getMessage());

    expected = expectThrows(Exception.class, () -> cs.read());
    assertEquals("read()", expected.getMessage());

    expected = expectThrows(Exception.class, () -> cs.read(new char[0]));
    assertEquals("read(char[])", expected.getMessage());

    expected = expectThrows(Exception.class, () -> cs.read(CharBuffer.wrap(new char[0])));
    assertEquals("read(CharBuffer)", expected.getMessage());

    expected = expectThrows(Exception.class, () -> cs.reset());
    assertEquals("reset()", expected.getMessage());

    expected = expectThrows(Exception.class, () -> cs.skip(1));
    assertEquals("skip(long)", expected.getMessage());

    // correctOffset(int) funnels through to the delegate's correct(int)
    expected = expectThrows(Exception.class, () -> cs.correctOffset(1));
    assertEquals("correct(int)", expected.getMessage());

    expected = expectThrows(Exception.class, () -> cs.close());
    assertEquals("close()", expected.getMessage());

    expected = expectThrows(Exception.class, () -> cs.read(new char[0], 0, 0));
    assertEquals("read(char[], int, int)", expected.getMessage());
  }

  // TODO: move this to the test framework?
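  /** "Sop" = System.out.println: passes tokens through, printing each one when VERBOSE is set. */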
  static final class SopTokenFilter extends TokenFilter {

    SopTokenFilter(TokenStream input) {
      super(input);
    }

    @Override
    public boolean incrementToken() throws IOException {
      if (input.incrementToken()) {
        if (VERBOSE) {
          System.out.println(input.getClass().getSimpleName() + "->" + this.reflectAsString(false));
        }
        return true;
      } else {
        return false;
      }
    }

    @Override
    public void end() throws IOException {
      super.end();
      if (VERBOSE) System.out.println(input.getClass().getSimpleName() + ".end()");
    }

    @Override
    public void close() throws IOException {
      super.close();
      if (VERBOSE) System.out.println(input.getClass().getSimpleName() + ".close()");
    }

    @Override
    public void reset() throws IOException {
      super.reset();
      if (VERBOSE) System.out.println(input.getClass().getSimpleName() + ".reset()");
    }
  }

  // LUCENE-5269: shingles of edge ngrams, re-ngrammed, over random unicode text
  @Nightly
  public void testUnicodeShinglesAndNgrams() throws Exception {
    Analyzer analyzer =
        new Analyzer() {
          @Override
          protected TokenStreamComponents createComponents(String fieldName) {
            Tokenizer tokenizer = new EdgeNGramTokenizer(2, 94);
            // TokenStream stream = new SopTokenFilter(tokenizer);
            TokenStream stream = new ShingleFilter(tokenizer, 5);
            // stream = new SopTokenFilter(stream);
            stream = new NGramTokenFilter(stream, 55, 83, false);
            // stream = new SopTokenFilter(stream);
            return new TokenStreamComponents(tokenizer, stream);
          }
        };
    checkRandomData(random(), analyzer, 2000);
    analyzer.close();
  }
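
  // Runs a curious string (supplementary character, stray markup, odd spacing) through
  // WikipediaTokenizer + WordDelimiterFilter configured with a fixed byte type table.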
  public void testCuriousWikipediaString() throws Exception {
    final CharArraySet protWords =
        new CharArraySet(
            new HashSet<>(
                Arrays.asList("rrdpafa", "pupmmlu", "xlq", "dyy", "zqrxrrck", "o", "hsrlfvcha")),
            false);
    final byte[] table =
        new byte[] {
          -57, 26, 1, 48, 63, -23, 55, -84, 18, 120, -97, 103, 58, 13, 84, 89, 57, -13, -63,
          5, 28, 97, -54, -94, 102, -108, -5, 5, 46, 40, 43, 78, 43, -72, 36, 29, 124, -106,
          -22, -51, 65, 5, 31, -42, 6, -99, 97, 14, 81, -128, 74, 100, 54, -55, -25, 53, -71,
          -98, 44, 33, 86, 106, -42, 47, 115, -89, -18, -26, 22, -95, -43, 83, -125, 105, -104,
          -24, 106, -16, 126, 115, -105, 97, 65, -33, 57, 44, -1, 123, -68, 100, 13, -41, -64,
          -119, 0, 92, 94, -36, 53, -9, -102, -18, 90, 94, -26, 31, 71, -20
        };
    Analyzer a =
        new Analyzer() {
          @Override
          protected TokenStreamComponents createComponents(String fieldName) {
            Tokenizer tokenizer = new WikipediaTokenizer();
            TokenStream stream = new SopTokenFilter(tokenizer);
            stream = new WordDelimiterFilter(stream, table, -50, protWords);
            stream = new SopTokenFilter(stream);
            return new TokenStreamComponents(tokenizer, stream);
          }
        };
    checkAnalysisConsistency(random(), a, false, "B\u28c3\ue0f8[ \ud800\udfc2 </p> jb");
    a.close();
  }
}