blob: 16f9f0658adb06200d5c0ee1056ac0ec9b4d5240 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.cassandra.index.sasi.analyzer;
import java.nio.ByteBuffer;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import org.apache.cassandra.db.marshal.Int32Type;
import org.apache.cassandra.db.marshal.UTF8Type;
import org.apache.cassandra.utils.ByteBufferUtil;

import org.apache.commons.io.IOUtils;

import org.junit.Assert;
import org.junit.Test;

import static org.junit.Assert.assertEquals;
/**
 * Tests for {@code DelimiterAnalyzer}: tokenization on a configurable single
 * delimiter, handling of empty tokens, validator-type checks, multi-language
 * corpora, and instance reuse via {@code reset(ByteBuffer)}.
 */
public class DelimiterAnalyzerTest
{
    /**
     * Tokenizing on a space must preserve the original case of every token;
     * the analyzer is expected to perform no normalization of its own.
     */
    @Test
    public void caseSensitiveAnalyzer() throws Exception
    {
        DelimiterAnalyzer analyzer = newAnalyzer(" ");

        String testString = "Nip it in the bud";
        analyzer.reset(utf8(testString));

        StringBuilder output = new StringBuilder();
        while (analyzer.hasNext())
        {
            output.append(ByteBufferUtil.string(analyzer.next()));
            if (analyzer.hasNext())
                output.append(' ');
        }

        // Round-trips exactly, and differs from the lower-cased form,
        // proving case was not folded.
        Assert.assertEquals(testString, output.toString());
        Assert.assertFalse(testString.toLowerCase().equals(output.toString()));
    }

    /**
     * Leading, trailing, and repeated delimiters must not yield empty tokens.
     */
    @Test
    public void testBlankEntries() throws Exception
    {
        DelimiterAnalyzer analyzer = newAnalyzer(",");

        String testString = ",Nip,,,,it,,,in,,the,bud,,,";
        analyzer.reset(utf8(testString));

        StringBuilder output = new StringBuilder();
        while (analyzer.hasNext())
        {
            output.append(ByteBufferUtil.string(analyzer.next()));
            if (analyzer.hasNext())
                output.append(',');
        }

        Assert.assertEquals("Nip,it,in,the,bud", output.toString());
        Assert.assertFalse(testString.toLowerCase().equals(output.toString()));
    }

    /**
     * Initializing with a non-text validator ({@code Int32Type}) is rejected.
     */
    @Test(expected = IllegalArgumentException.class)
    public void ensureIncompatibleInputSkipped() throws Exception
    {
        new DelimiterAnalyzer().init(new HashMap<String, String>(), Int32Type.instance);
    }

    @Test
    public void testTokenizationLoremIpsum() throws Exception
    {
        assertEquals(69, tokenize(newAnalyzer(" "), resource("tokenization/lorem_ipsum.txt")).size());
    }

    @Test
    public void testTokenizationJaJp1() throws Exception
    {
        // Japanese text split on the ideographic full stop.
        assertEquals(4, tokenize(newAnalyzer("。"), resource("tokenization/ja_jp_1.txt")).size());
    }

    @Test
    public void testTokenizationJaJp2() throws Exception
    {
        assertEquals(2, tokenize(newAnalyzer("。"), resource("tokenization/ja_jp_2.txt")).size());
    }

    @Test
    public void testTokenizationRuRu1() throws Exception
    {
        assertEquals(447, tokenize(newAnalyzer(" "), resource("tokenization/ru_ru_1.txt")).size());
    }

    @Test
    public void testTokenizationZnTw1() throws Exception
    {
        assertEquals(403, tokenize(newAnalyzer(" "), resource("tokenization/zn_tw_1.txt")).size());
    }

    @Test
    public void testTokenizationAdventuresOfHuckFinn() throws Exception
    {
        assertEquals(104594,
                     tokenize(newAnalyzer(" "),
                              resource("tokenization/adventures_of_huckleberry_finn_mark_twain.txt")).size());
    }

    @Test
    public void testWorldCities() throws Exception
    {
        assertEquals(122265, tokenize(newAnalyzer(","), resource("tokenization/world_cities_a.csv")).size());
    }

    @Test
    public void tokenizeDomainNamesAndUrls() throws Exception
    {
        assertEquals(12, tokenize(newAnalyzer(" "), resource("tokenization/top_visited_domains.txt")).size());
    }

    /**
     * A single analyzer instance must be reusable across inputs via
     * {@code reset(...)} without leaking state between them.
     */
    @Test
    public void testReuseAndResetTokenizerInstance() throws Exception
    {
        List<ByteBuffer> bbToTokenize = new ArrayList<>();
        bbToTokenize.add(utf8("Nip it in the bud"));
        bbToTokenize.add(utf8("I couldn’t care less"));
        bbToTokenize.add(utf8("One and the same"));
        bbToTokenize.add(utf8("The squeaky wheel gets the grease."));
        bbToTokenize.add(utf8("The pen is mightier than the sword."));

        DelimiterAnalyzer analyzer = newAnalyzer(" ");

        List<ByteBuffer> tokens = new ArrayList<>();
        for (ByteBuffer bb : bbToTokenize)
            tokens.addAll(tokenize(analyzer, bb));

        assertEquals(26, tokens.size());
    }

    /**
     * Builds an analyzer configured with the given single-delimiter option
     * and a UTF-8 validator. Replaces the repeated double-brace
     * {@code new HashMap() {{ ... }}} initialization, which creates an
     * anonymous inner class per call site.
     */
    private static DelimiterAnalyzer newAnalyzer(String delimiter)
    {
        Map<String, String> options = new HashMap<>();
        options.put(DelimiterTokenizingOptions.DELIMITER, delimiter);

        DelimiterAnalyzer analyzer = new DelimiterAnalyzer();
        analyzer.init(options, UTF8Type.instance);
        return analyzer;
    }

    /** Resets the analyzer on {@code input} and drains all produced tokens. */
    private static List<ByteBuffer> tokenize(DelimiterAnalyzer analyzer, ByteBuffer input)
    {
        analyzer.reset(input);
        List<ByteBuffer> tokens = new ArrayList<>();
        while (analyzer.hasNext())
            tokens.add(analyzer.next());
        return tokens;
    }

    /** Loads a classpath test fixture fully into a {@code ByteBuffer}. */
    private static ByteBuffer resource(String path) throws Exception
    {
        return ByteBuffer.wrap(IOUtils.toByteArray(
            DelimiterAnalyzerTest.class.getClassLoader().getResourceAsStream(path)));
    }

    /**
     * Encodes a fixture string explicitly as UTF-8; bare {@code getBytes()}
     * would depend on the platform default charset.
     */
    private static ByteBuffer utf8(String s)
    {
        return ByteBuffer.wrap(s.getBytes(StandardCharsets.UTF_8));
    }
}