/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.cassandra.index.sasi.analyzer;

import java.io.InputStream;
import java.nio.ByteBuffer;
import java.util.ArrayList;
import java.util.List;
import java.util.Locale;

import org.junit.Test;

import org.apache.cassandra.serializers.UTF8Serializer;

import static org.junit.Assert.assertEquals;
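
/**
 * Tests tokenization behaviour of {@link StandardAnalyzer} across different
 * languages, scripts, and analyzer option combinations.
 */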
public class StandardAnalyzerTest
{
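    // Tokenizes the plain-ASCII Apache license header with token length capped at 5 characters.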
    @Test
    public void testTokenizationAscii() throws Exception
    {
        InputStream is = StandardAnalyzerTest.class.getClassLoader()
                .getResourceAsStream("tokenization/apache_license_header.txt");

        StandardTokenizerOptions options = new StandardTokenizerOptions.OptionsBuilder()
                .maxTokenLength(5).build();

        StandardAnalyzer tokenizer = new StandardAnalyzer();
        tokenizer.init(options);

        List<ByteBuffer> tokens = new ArrayList<>();
        tokenizer.reset(is);
        while (tokenizer.hasNext())
            tokens.add(tokenizer.next());

        assertEquals(67, tokens.size());
    }
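
    // Tokenizes a lorem ipsum sample with the default tokenizer options.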
    @Test
    public void testTokenizationLoremIpsum() throws Exception
    {
        InputStream is = StandardAnalyzerTest.class.getClassLoader()
                .getResourceAsStream("tokenization/lorem_ipsum.txt");

        StandardAnalyzer tokenizer = new StandardAnalyzer();
        tokenizer.init(StandardTokenizerOptions.getDefaultOptions());

        List<ByteBuffer> tokens = new ArrayList<>();
        tokenizer.reset(is);
        while (tokenizer.hasNext())
            tokens.add(tokenizer.next());

        assertEquals(62, tokens.size());
    }
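
    // Tokenizes a Japanese sample with the default tokenizer options.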
    @Test
    public void testTokenizationJaJp1() throws Exception
    {
        InputStream is = StandardAnalyzerTest.class.getClassLoader()
                .getResourceAsStream("tokenization/ja_jp_1.txt");

        StandardAnalyzer tokenizer = new StandardAnalyzer();
        tokenizer.init(StandardTokenizerOptions.getDefaultOptions());
        tokenizer.reset(is);

        List<ByteBuffer> tokens = new ArrayList<>();
        while (tokenizer.hasNext())
            tokens.add(tokenizer.next());

        assertEquals(210, tokens.size());
    }
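
    // Tokenizes a second Japanese sample with stemming, stop-term removal, and lower-casing enabled.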
    @Test
    public void testTokenizationJaJp2() throws Exception
    {
        InputStream is = StandardAnalyzerTest.class.getClassLoader()
                .getResourceAsStream("tokenization/ja_jp_2.txt");

        StandardTokenizerOptions options = new StandardTokenizerOptions.OptionsBuilder().stemTerms(true)
                .ignoreStopTerms(true).alwaysLowerCaseTerms(true).build();

        StandardAnalyzer tokenizer = new StandardAnalyzer();
        tokenizer.init(options);
        tokenizer.reset(is);

        List<ByteBuffer> tokens = new ArrayList<>();
        while (tokenizer.hasNext())
            tokens.add(tokenizer.next());

        assertEquals(57, tokens.size());
    }
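
    // Tokenizes a Russian sample with the default tokenizer options.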
    @Test
    public void testTokenizationRuRu1() throws Exception
    {
        InputStream is = StandardAnalyzerTest.class.getClassLoader()
                .getResourceAsStream("tokenization/ru_ru_1.txt");

        StandardAnalyzer tokenizer = new StandardAnalyzer();
        tokenizer.init(StandardTokenizerOptions.getDefaultOptions());

        List<ByteBuffer> tokens = new ArrayList<>();
        tokenizer.reset(is);
        while (tokenizer.hasNext())
            tokens.add(tokenizer.next());

        assertEquals(456, tokens.size());
    }
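
    // Tokenizes a Chinese sample (the "zn_tw" resource) with the default tokenizer options.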
    @Test
    public void testTokenizationZnTw1() throws Exception
    {
        InputStream is = StandardAnalyzerTest.class.getClassLoader()
                .getResourceAsStream("tokenization/zn_tw_1.txt");

        StandardAnalyzer tokenizer = new StandardAnalyzer();
        tokenizer.init(StandardTokenizerOptions.getDefaultOptions());

        List<ByteBuffer> tokens = new ArrayList<>();
        tokenizer.reset(is);
        while (tokenizer.hasNext())
            tokens.add(tokenizer.next());

        assertEquals(963, tokens.size());
    }
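
    // Tokenizes the full text of "Adventures of Huckleberry Finn" with English
    // stemming, stop-word removal, and lower-casing enabled.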
    @Test
    public void testTokenizationAdventuresOfHuckFinn() throws Exception
    {
        InputStream is = StandardAnalyzerTest.class.getClassLoader()
                .getResourceAsStream("tokenization/adventures_of_huckleberry_finn_mark_twain.txt");

        StandardTokenizerOptions options = new StandardTokenizerOptions.OptionsBuilder().stemTerms(true)
                .ignoreStopTerms(true).useLocale(Locale.ENGLISH)
                .alwaysLowerCaseTerms(true).build();

        StandardAnalyzer tokenizer = new StandardAnalyzer();
        tokenizer.init(options);

        List<ByteBuffer> tokens = new ArrayList<>();
        tokenizer.reset(is);
        while (tokenizer.hasNext())
            tokens.add(tokenizer.next());

        assertEquals(37739, tokens.size());
    }
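
    // Verifies that stop words are removed before stemming is applied, and
    // asserts the exact stemmed tokens produced for the French sample.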
    @Test
    public void testSkipStopWordBeforeStemmingFrench() throws Exception
    {
        InputStream is = StandardAnalyzerTest.class.getClassLoader()
                .getResourceAsStream("tokenization/french_skip_stop_words_before_stemming.txt");

        StandardTokenizerOptions options = new StandardTokenizerOptions.OptionsBuilder().stemTerms(true)
                .ignoreStopTerms(true).useLocale(Locale.FRENCH)
                .alwaysLowerCaseTerms(true).build();

        StandardAnalyzer tokenizer = new StandardAnalyzer();
        tokenizer.init(options);

        List<ByteBuffer> tokens = new ArrayList<>();
        List<String> words = new ArrayList<>();
        tokenizer.reset(is);
        while (tokenizer.hasNext())
        {
            final ByteBuffer nextToken = tokenizer.next();
            tokens.add(nextToken);
            words.add(UTF8Serializer.instance.deserialize(nextToken.duplicate()));
        }

        assertEquals(4, tokens.size());
        assertEquals("dans", words.get(0));
        assertEquals("plui", words.get(1));
        assertEquals("chanson", words.get(2));
        assertEquals("connu", words.get(3));
    }
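
    // Tokenizes a list of top visited domain names and URLs with the default options.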
    @Test
    public void tokenizeDomainNamesAndUrls() throws Exception
    {
        InputStream is = StandardAnalyzerTest.class.getClassLoader()
                .getResourceAsStream("tokenization/top_visited_domains.txt");

        StandardAnalyzer tokenizer = new StandardAnalyzer();
        tokenizer.init(StandardTokenizerOptions.getDefaultOptions());
        tokenizer.reset(is);

        List<ByteBuffer> tokens = new ArrayList<>();
        while (tokenizer.hasNext())
            tokens.add(tokenizer.next());

        assertEquals(15, tokens.size());
    }
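
    // Verifies that a single analyzer instance can be reset and reused across
    // multiple ByteBuffer inputs.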
    @Test
    public void testReuseAndResetTokenizerInstance() throws Exception
    {
        List<ByteBuffer> bbToTokenize = new ArrayList<>();
        bbToTokenize.add(ByteBuffer.wrap("Nip it in the bud".getBytes()));
        bbToTokenize.add(ByteBuffer.wrap("I couldn’t care less".getBytes()));
        bbToTokenize.add(ByteBuffer.wrap("One and the same".getBytes()));
        bbToTokenize.add(ByteBuffer.wrap("The squeaky wheel gets the grease.".getBytes()));
        bbToTokenize.add(ByteBuffer.wrap("The pen is mightier than the sword.".getBytes()));

        StandardAnalyzer tokenizer = new StandardAnalyzer();
        tokenizer.init(StandardTokenizerOptions.getDefaultOptions());

        List<ByteBuffer> tokens = new ArrayList<>();
        for (ByteBuffer bb : bbToTokenize)
        {
            tokenizer.reset(bb);
            while (tokenizer.hasNext())
                tokens.add(tokenizer.next());
        }

        assertEquals(10, tokens.size());
    }
}