blob: 16f9f0658adb06200d5c0ee1056ac0ec9b4d5240 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.cassandra.index.sasi.analyzer;
import java.nio.ByteBuffer;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import org.apache.cassandra.db.marshal.Int32Type;
import org.apache.cassandra.db.marshal.UTF8Type;
import org.apache.cassandra.utils.ByteBufferUtil;

import org.apache.commons.io.IOUtils;

import org.junit.Assert;
import org.junit.Test;

import static org.junit.Assert.assertEquals;
/**
 * Tests for {@code DelimiterAnalyzer}: tokenization on a configurable single
 * delimiter, handling of empty tokens, validator-type checks, multi-language
 * corpora, and instance reuse via {@code reset(ByteBuffer)}.
 */
public class DelimiterAnalyzerTest
{
    /**
     * Tokenizing on a space must preserve the original case of every token;
     * the analyzer is expected to perform no normalization of its own.
     */
    @Test
    public void caseSensitiveAnalyzer() throws Exception
    {
        DelimiterAnalyzer analyzer = newAnalyzer(" ");

        String testString = "Nip it in the bud";
        analyzer.reset(utf8(testString));

        StringBuilder output = new StringBuilder();
        while (analyzer.hasNext())
        {
            output.append(ByteBufferUtil.string(analyzer.next()));
            if (analyzer.hasNext())
                output.append(' ');
        }

        // Round-trips exactly, and differs from the lower-cased form,
        // proving case was not folded.
        Assert.assertEquals(testString, output.toString());
        Assert.assertFalse(testString.toLowerCase().equals(output.toString()));
    }

    /**
     * Leading, trailing, and repeated delimiters must not yield empty tokens.
     */
    @Test
    public void testBlankEntries() throws Exception
    {
        DelimiterAnalyzer analyzer = newAnalyzer(",");

        String testString = ",Nip,,,,it,,,in,,the,bud,,,";
        analyzer.reset(utf8(testString));

        StringBuilder output = new StringBuilder();
        while (analyzer.hasNext())
        {
            output.append(ByteBufferUtil.string(analyzer.next()));
            if (analyzer.hasNext())
                output.append(',');
        }

        Assert.assertEquals("Nip,it,in,the,bud", output.toString());
        Assert.assertFalse(testString.toLowerCase().equals(output.toString()));
    }

    /**
     * Initializing with a non-text validator ({@code Int32Type}) is rejected.
     */
    @Test(expected = IllegalArgumentException.class)
    public void ensureIncompatibleInputSkipped() throws Exception
    {
        new DelimiterAnalyzer().init(new HashMap<String, String>(), Int32Type.instance);
    }

    @Test
    public void testTokenizationLoremIpsum() throws Exception
    {
        assertEquals(69, tokenize(newAnalyzer(" "), resource("tokenization/lorem_ipsum.txt")).size());
    }

    @Test
    public void testTokenizationJaJp1() throws Exception
    {
        // Japanese text split on the ideographic full stop.
        assertEquals(4, tokenize(newAnalyzer("。"), resource("tokenization/ja_jp_1.txt")).size());
    }

    @Test
    public void testTokenizationJaJp2() throws Exception
    {
        assertEquals(2, tokenize(newAnalyzer("。"), resource("tokenization/ja_jp_2.txt")).size());
    }

    @Test
    public void testTokenizationRuRu1() throws Exception
    {
        assertEquals(447, tokenize(newAnalyzer(" "), resource("tokenization/ru_ru_1.txt")).size());
    }

    @Test
    public void testTokenizationZnTw1() throws Exception
    {
        assertEquals(403, tokenize(newAnalyzer(" "), resource("tokenization/zn_tw_1.txt")).size());
    }

    @Test
    public void testTokenizationAdventuresOfHuckFinn() throws Exception
    {
        assertEquals(104594,
                     tokenize(newAnalyzer(" "),
                              resource("tokenization/adventures_of_huckleberry_finn_mark_twain.txt")).size());
    }

    @Test
    public void testWorldCities() throws Exception
    {
        assertEquals(122265, tokenize(newAnalyzer(","), resource("tokenization/world_cities_a.csv")).size());
    }

    @Test
    public void tokenizeDomainNamesAndUrls() throws Exception
    {
        assertEquals(12, tokenize(newAnalyzer(" "), resource("tokenization/top_visited_domains.txt")).size());
    }

    /**
     * A single analyzer instance must be reusable across inputs via
     * {@code reset(...)} without leaking state between them.
     */
    @Test
    public void testReuseAndResetTokenizerInstance() throws Exception
    {
        List<ByteBuffer> bbToTokenize = new ArrayList<>();
        bbToTokenize.add(utf8("Nip it in the bud"));
        bbToTokenize.add(utf8("I couldn’t care less"));
        bbToTokenize.add(utf8("One and the same"));
        bbToTokenize.add(utf8("The squeaky wheel gets the grease."));
        bbToTokenize.add(utf8("The pen is mightier than the sword."));

        DelimiterAnalyzer analyzer = newAnalyzer(" ");

        List<ByteBuffer> tokens = new ArrayList<>();
        for (ByteBuffer bb : bbToTokenize)
            tokens.addAll(tokenize(analyzer, bb));

        assertEquals(26, tokens.size());
    }

    /**
     * Builds an analyzer configured with the given single-delimiter option
     * and a UTF-8 validator. Replaces the repeated double-brace
     * {@code new HashMap() {{ ... }}} initialization, which creates an
     * anonymous inner class per call site.
     */
    private static DelimiterAnalyzer newAnalyzer(String delimiter)
    {
        Map<String, String> options = new HashMap<>();
        options.put(DelimiterTokenizingOptions.DELIMITER, delimiter);

        DelimiterAnalyzer analyzer = new DelimiterAnalyzer();
        analyzer.init(options, UTF8Type.instance);
        return analyzer;
    }

    /** Resets the analyzer on {@code input} and drains all produced tokens. */
    private static List<ByteBuffer> tokenize(DelimiterAnalyzer analyzer, ByteBuffer input)
    {
        analyzer.reset(input);
        List<ByteBuffer> tokens = new ArrayList<>();
        while (analyzer.hasNext())
            tokens.add(analyzer.next());
        return tokens;
    }

    /** Loads a classpath test fixture fully into a {@code ByteBuffer}. */
    private static ByteBuffer resource(String path) throws Exception
    {
        return ByteBuffer.wrap(IOUtils.toByteArray(
            DelimiterAnalyzerTest.class.getClassLoader().getResourceAsStream(path)));
    }

    /**
     * Encodes a fixture string explicitly as UTF-8; bare {@code getBytes()}
     * would depend on the platform default charset.
     */
    private static ByteBuffer utf8(String s)
    {
        return ByteBuffer.wrap(s.getBytes(StandardCharsets.UTF_8));
    }
}