/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.analysis.core;
import java.io.IOException;
import java.io.StringReader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.LowerCaseFilter;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.en.EnglishAnalyzer;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IOUtils;
public class TestAnalyzers extends BaseTokenStreamTestCase {
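/** SimpleAnalyzer splits on non-letter characters and lowercases, so digits and punctuation are dropped. */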
public void testSimple() throws Exception {
Analyzer a = new SimpleAnalyzer();
assertAnalyzesTo(a, "foo bar FOO BAR", new String[] {"foo", "bar", "foo", "bar"});
assertAnalyzesTo(a, "foo bar . FOO <> BAR", new String[] {"foo", "bar", "foo", "bar"});
assertAnalyzesTo(a, "foo.bar.FOO.BAR", new String[] {"foo", "bar", "foo", "bar"});
assertAnalyzesTo(a, "U.S.A.", new String[] {"u", "s", "a"});
assertAnalyzesTo(a, "C++", new String[] {"c"});
assertAnalyzesTo(a, "B2B", new String[] {"b", "b"});
assertAnalyzesTo(a, "2B", new String[] {"b"});
assertAnalyzesTo(a, "\"QUOTED\" word", new String[] {"quoted", "word"});
assertEquals(new BytesRef("\"\\à3[]()! cz@"), a.normalize("dummy", "\"\\À3[]()! Cz@"));
a.close();
}
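/** WhitespaceAnalyzer splits only on whitespace, preserving case, digits, and punctuation. */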
public void testNull() throws Exception {
Analyzer a = new WhitespaceAnalyzer();
assertAnalyzesTo(a, "foo bar FOO BAR", new String[] {"foo", "bar", "FOO", "BAR"});
assertAnalyzesTo(
a, "foo bar . FOO <> BAR", new String[] {"foo", "bar", ".", "FOO", "<>", "BAR"});
assertAnalyzesTo(a, "foo.bar.FOO.BAR", new String[] {"foo.bar.FOO.BAR"});
assertAnalyzesTo(a, "U.S.A.", new String[] {"U.S.A."});
assertAnalyzesTo(a, "C++", new String[] {"C++"});
assertAnalyzesTo(a, "B2B", new String[] {"B2B"});
assertAnalyzesTo(a, "2B", new String[] {"2B"});
assertAnalyzesTo(a, "\"QUOTED\" word", new String[] {"\"QUOTED\"", "word"});
assertEquals(new BytesRef("\"\\À3[]()! Cz@"), a.normalize("dummy", "\"\\À3[]()! Cz@"));
a.close();
}
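/** StopAnalyzer removes English stop words while tokenizing, but normalize() leaves them intact. */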
public void testStop() throws Exception {
Analyzer a = new StopAnalyzer(EnglishAnalyzer.ENGLISH_STOP_WORDS_SET);
assertAnalyzesTo(a, "foo bar FOO BAR", new String[] {"foo", "bar", "foo", "bar"});
assertAnalyzesTo(a, "foo a bar such FOO THESE BAR", new String[] {"foo", "bar", "foo", "bar"});
assertEquals(new BytesRef("\"\\à3[]()! cz@"), a.normalize("dummy", "\"\\À3[]()! Cz@"));
assertEquals(new BytesRef("the"), a.normalize("dummy", "the"));
a.close();
}
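/** Consumes the stream and asserts that the Nth token carries the single-byte payload value N. */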
void verifyPayload(TokenStream ts) throws IOException {
PayloadAttribute payloadAtt = ts.getAttribute(PayloadAttribute.class);
ts.reset();
for (byte b = 1; ; b++) {
boolean hasNext = ts.incrementToken();
if (!hasNext) break;
// System.out.println("id="+System.identityHashCode(nextToken) + " " + t);
// System.out.println("payload=" + (int)nextToken.getPayload().toByteArray()[0]);
assertEquals(b, payloadAtt.getPayload().bytes[0]);
}
}
// Make sure payloads set by a filter are observed by downstream consumers; run twice with fresh streams
public void testPayloadCopy() throws IOException {
String s = "how now brown cow";
Tokenizer tokenizer = new WhitespaceTokenizer();
tokenizer.setReader(new StringReader(s));
verifyPayload(new PayloadSetter(tokenizer));
tokenizer = new WhitespaceTokenizer();
tokenizer.setReader(new StringReader(s));
verifyPayload(new PayloadSetter(tokenizer));
}
// LUCENE-1150: just a compile-time test, to ensure the
// StandardTokenizer constants remain publicly accessible
@SuppressWarnings("unused")
public void _testStandardConstants() {
int x = StandardTokenizer.ALPHANUM;
x = StandardTokenizer.NUM;
x = StandardTokenizer.SOUTHEAST_ASIAN;
x = StandardTokenizer.IDEOGRAPHIC;
x = StandardTokenizer.HIRAGANA;
x = StandardTokenizer.KATAKANA;
x = StandardTokenizer.HANGUL;
String[] y = StandardTokenizer.TOKEN_TYPES;
}
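/** Whitespace tokenization (randomly classic or Unicode) followed by LowerCaseFilter. */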
private static class LowerCaseWhitespaceAnalyzer extends Analyzer {
@Override
public TokenStreamComponents createComponents(String fieldName) {
Tokenizer tokenizer =
random().nextBoolean() ? new WhitespaceTokenizer() : new UnicodeWhitespaceTokenizer();
return new TokenStreamComponents(tokenizer, new LowerCaseFilter(tokenizer));
}
}
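/** Whitespace tokenization (randomly classic or Unicode) followed by UpperCaseFilter. */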
private static class UpperCaseWhitespaceAnalyzer extends Analyzer {
@Override
public TokenStreamComponents createComponents(String fieldName) {
Tokenizer tokenizer =
random().nextBoolean() ? new WhitespaceTokenizer() : new UnicodeWhitespaceTokenizer();
return new TokenStreamComponents(tokenizer, new UpperCaseFilter(tokenizer));
}
}
/** Test that LowerCaseFilter handles the entire Unicode range correctly. */
public void testLowerCaseFilter() throws IOException {
Analyzer a = new LowerCaseWhitespaceAnalyzer();
// BMP
assertAnalyzesTo(a, "AbaCaDabA", new String[] {"abacadaba"});
// supplementary
assertAnalyzesTo(
a,
"\ud801\udc16\ud801\udc16\ud801\udc16\ud801\udc16",
new String[] {"\ud801\udc3e\ud801\udc3e\ud801\udc3e\ud801\udc3e"});
assertAnalyzesTo(a, "AbaCa\ud801\udc16DabA", new String[] {"abaca\ud801\udc3edaba"});
// unpaired lead surrogate
assertAnalyzesTo(a, "AbaC\uD801AdaBa", new String[] {"abac\uD801adaba"});
// unpaired trail surrogate
assertAnalyzesTo(a, "AbaC\uDC16AdaBa", new String[] {"abac\uDC16adaba"});
a.close();
}
/** Test that UpperCaseFilter handles the entire Unicode range correctly. */
public void testUpperCaseFilter() throws IOException {
Analyzer a = new UpperCaseWhitespaceAnalyzer();
// BMP
assertAnalyzesTo(a, "AbaCaDabA", new String[] {"ABACADABA"});
// supplementary
assertAnalyzesTo(
a,
"\ud801\udc3e\ud801\udc3e\ud801\udc3e\ud801\udc3e",
new String[] {"\ud801\udc16\ud801\udc16\ud801\udc16\ud801\udc16"});
assertAnalyzesTo(a, "AbaCa\ud801\udc3eDabA", new String[] {"ABACA\ud801\udc16DABA"});
// unpaired lead surrogate
assertAnalyzesTo(a, "AbaC\uD801AdaBa", new String[] {"ABAC\uD801ADABA"});
// unpaired trail surrogate
assertAnalyzesTo(a, "AbaC\uDC16AdaBa", new String[] {"ABAC\uDC16ADABA"});
a.close();
}
/**
* Test that LowerCaseFilter lowercases correctly when the term buffer still holds a trailing
* surrogate left over from a previous, longer term and the current term ends with an unpaired
* lead surrogate.
*/
public void testLowerCaseFilterLowSurrogateLeftover() throws IOException {
// test that the limit of the term buffer is used correctly with supplementary chars
WhitespaceTokenizer tokenizer = new WhitespaceTokenizer();
tokenizer.setReader(new StringReader("BogustermBogusterm\udc16"));
LowerCaseFilter filter = new LowerCaseFilter(tokenizer);
assertTokenStreamContents(filter, new String[] {"bogustermbogusterm\udc16"});
filter.reset();
String highSurEndingUpper = "BogustermBoguster\ud801";
String highSurEndingLower = "bogustermboguster\ud801";
tokenizer.setReader(new StringReader(highSurEndingUpper));
assertTokenStreamContents(filter, new String[] {highSurEndingLower});
assertTrue(filter.hasAttribute(CharTermAttribute.class));
char[] termBuffer = filter.getAttribute(CharTermAttribute.class).buffer();
int length = highSurEndingLower.length();
assertEquals('\ud801', termBuffer[length - 1]);
}
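/** WhitespaceTokenizer must keep supplementary characters (surrogate pairs) intact. */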
public void testWhitespaceTokenizer() throws IOException {
StringReader reader = new StringReader("Tokenizer \ud801\udc1ctest");
WhitespaceTokenizer tokenizer = new WhitespaceTokenizer();
tokenizer.setReader(reader);
assertTokenStreamContents(tokenizer, new String[] {"Tokenizer", "\ud801\udc1ctest"});
}
/** blast some random strings through the analyzer */
public void testRandomStrings() throws Exception {
Analyzer[] analyzers =
new Analyzer[] {
new WhitespaceAnalyzer(),
new SimpleAnalyzer(),
new StopAnalyzer(EnglishAnalyzer.ENGLISH_STOP_WORDS_SET),
new UnicodeWhitespaceAnalyzer()
};
for (Analyzer analyzer : analyzers) {
checkRandomData(random(), analyzer, 100 * RANDOM_MULTIPLIER);
}
IOUtils.close(analyzers);
}
/** blast some random large strings through the analyzer */
public void testRandomHugeStrings() throws Exception {
Analyzer[] analyzers =
new Analyzer[] {
new WhitespaceAnalyzer(),
new SimpleAnalyzer(),
new StopAnalyzer(EnglishAnalyzer.ENGLISH_STOP_WORDS_SET),
new UnicodeWhitespaceAnalyzer()
};
for (Analyzer analyzer : analyzers) {
checkRandomData(random(), analyzer, 10 * RANDOM_MULTIPLIER, 8192);
}
IOUtils.close(analyzers);
}
}
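/** TokenFilter that stamps every token with a one-byte payload; the shared byte is incremented after each token, so the Nth token observes the value N. */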
final class PayloadSetter extends TokenFilter {
PayloadAttribute payloadAtt;
public PayloadSetter(TokenStream input) {
super(input);
payloadAtt = addAttribute(PayloadAttribute.class);
}
// single reused byte[]/BytesRef; bumped after each token so the Nth token carries the value N
byte[] data = new byte[1];
BytesRef p = new BytesRef(data, 0, 1);
@Override
public boolean incrementToken() throws IOException {
boolean hasNext = input.incrementToken();
if (!hasNext) return false;
payloadAtt.setPayload(p); // reuse the payload / byte[]
data[0]++;
return true;
}
}