lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestRemoveDuplicatesTokenFilter.java - lucene-solr - Git at Google

 /*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
  * The ASF licenses this file to You under the Apache License, Version 2.0
  * (the "License"); you may not use this file except in compliance with
  * the License.  You may obtain a copy of the License at
  *
  *     http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
 package org.apache.lucene.analysis.miscellaneous;

 import java.io.IOException;
 import java.util.Arrays;
 import java.util.Iterator;
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
 import org.apache.lucene.analysis.MockTokenizer;
 import org.apache.lucene.analysis.Token;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.analysis.core.KeywordTokenizer;
 import org.apache.lucene.analysis.synonym.SynonymGraphFilter;
 import org.apache.lucene.analysis.synonym.SynonymMap;
 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
 import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
 import org.apache.lucene.util.CharsRef;
 import org.apache.lucene.util.TestUtil;

 /**
  * Tests for {@link RemoveDuplicatesTokenFilter}.
  *
  * <p>The fixed-input tests feed hand-built tokens through the filter and compare the surviving
  * terms against an expected list; the remaining tests run the filter inside an {@link Analyzer}
  * against random and empty input.
  */
 public class TestRemoveDuplicatesTokenFilter extends BaseTokenStreamTestCase {

   /**
    * Builds a token carrying the given term, offsets and position increment.
    *
    * @param pos position increment to set on the token
    * @param t term text
    * @param start start offset
    * @param end end offset
    * @return the newly created token
    */
   public static Token tok(int pos, String t, int start, int end) {
     final Token token = new Token(t, start, end);
     token.setPositionIncrement(pos);
     return token;
   }

   /**
    * Builds a token with the given term and position increment, with both offsets set to 0.
    *
    * @param pos position increment to set on the token
    * @param t term text
    * @return the newly created token
    */
   public static Token tok(int pos, String t) {
     return tok(pos, t, 0, 0);
   }

   /**
    * Replays {@code tokens} through a {@link RemoveDuplicatesTokenFilter} and checks the result
    * against {@code expected}.
    *
    * @param expected the terms expected from the filter, separated by single whitespace characters
    * @param tokens the tokens to feed into the filter, in order
    * @throws Exception if the token stream check fails
    */
   public void testDups(final String expected, final Token... tokens) throws Exception {
     final Iterator<Token> source = Arrays.asList(tokens).iterator();

     // Minimal stream that copies term, offsets and position increment from each supplied token.
     final TokenStream replay =
         new TokenStream() {
           private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
           private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
           private final PositionIncrementAttribute posIncAtt =
               addAttribute(PositionIncrementAttribute.class);

           @Override
           public boolean incrementToken() {
             if (!source.hasNext()) {
               return false;
             }
             clearAttributes();
             final Token next = source.next();
             termAtt.setEmpty().append(next);
             offsetAtt.setOffset(next.startOffset(), next.endOffset());
             posIncAtt.setPositionIncrement(next.getPositionIncrement());
             return true;
           }
         };

     final TokenStream ts = new RemoveDuplicatesTokenFilter(replay);
     assertTokenStreamContents(ts, expected.split("\\s"));
   }

   /**
    * No token is expected to be dropped: the two "B" tokens each have a position increment of 1,
    * and the only token with increment 0 ("D") has a different term than "C" before it.
    */
   public void testNoDups() throws Exception {
     testDups(
         "A B B C D E",
         tok(1, "A", 0, 4),
         tok(1, "B", 5, 10),
         tok(1, "B", 11, 15),
         tok(1, "C", 16, 20),
         tok(0, "D", 16, 20),
         tok(1, "E", 21, 25));
   }

   /** The second "B" has a position increment of 0 and is expected to be removed. */
   public void testSimpleDups() throws Exception {
     testDups(
         "A B C D E",
         tok(1, "A", 0, 4),
         tok(1, "B", 5, 10),
         tok(0, "B", 11, 15),
         tok(1, "C", 16, 20),
         tok(0, "D", 16, 20),
         tok(1, "E", 21, 25));
   }

   /**
    * Several runs of increment-0 tokens, including a repeated term ("J") that is separated from
    * its first occurrence by a different term ("K"); each term is expected exactly once.
    */
   public void testComplexDups() throws Exception {
     testDups(
         "A B C D E F G H I J K",
         tok(1, "A"),
         tok(1, "B"),
         tok(0, "B"),
         tok(1, "C"),
         tok(1, "D"),
         tok(0, "D"),
         tok(0, "D"),
         tok(1, "E"),
         tok(1, "F"),
         tok(0, "F"),
         tok(1, "G"),
         tok(0, "H"),
         tok(0, "H"),
         tok(1, "I"),
         tok(1, "J"),
         tok(0, "K"),
         tok(0, "J"));
   }

   // some helper methods for the below test with synonyms

   /**
    * Draws random unicode strings until one is non-empty after trimming and contains no U+0000
    * character (U+0000 is what {@link #add} substitutes for runs of spaces).
    */
   private String randomNonEmptyString() {
     String candidate;
     do {
       candidate = TestUtil.randomUnicodeString(random()).trim();
     } while (candidate.isEmpty() || candidate.indexOf('\u0000') != -1);
     return candidate;
   }

   /**
    * Adds one synonym rule to {@code b}, replacing each run of spaces in {@code input} and {@code
    * output} with a single U+0000 character.
    */
   private void add(SynonymMap.Builder b, String input, String output, boolean keepOrig) {
     final CharsRef from = new CharsRef(input.replaceAll(" +", "\u0000"));
     final CharsRef to = new CharsRef(output.replaceAll(" +", "\u0000"));
     b.add(from, to, keepOrig);
   }

   /** blast some random strings through the analyzer */
   public void testRandomStrings() throws Exception {
     final int numIters = atLeast(3);
     for (int i = 0; i < numIters; i++) {
       final SynonymMap.Builder b = new SynonymMap.Builder(random().nextBoolean());
       final int numEntries = atLeast(10);
       for (int j = 0; j < numEntries; j++) {
         add(b, randomNonEmptyString(), randomNonEmptyString(), random().nextBoolean());
       }
       final SynonymMap map = b.build();
       final boolean ignoreCase = random().nextBoolean();

       // try-with-resources: the analyzer is closed even when checkRandomData throws.
       try (Analyzer analyzer =
           new Analyzer() {
             @Override
             protected TokenStreamComponents createComponents(String fieldName) {
               Tokenizer tokenizer = new MockTokenizer(MockTokenizer.SIMPLE, true);
               TokenStream stream = new SynonymGraphFilter(tokenizer, map, ignoreCase);
               return new TokenStreamComponents(tokenizer, new RemoveDuplicatesTokenFilter(stream));
             }
           }) {
         checkRandomData(random(), analyzer, 200);
       }
     }
   }

   /** Runs the filter behind a {@link KeywordTokenizer} on empty input, expecting an empty term. */
   public void testEmptyTerm() throws IOException {
     // try-with-resources: the analyzer is closed even when checkOneTerm throws.
     try (Analyzer a =
         new Analyzer() {
           @Override
           protected TokenStreamComponents createComponents(String fieldName) {
             Tokenizer tokenizer = new KeywordTokenizer();
             return new TokenStreamComponents(tokenizer, new RemoveDuplicatesTokenFilter(tokenizer));
           }
         }) {
       checkOneTerm(a, "", "");
     }
   }
 }
	/*
	* Licensed to the Apache Software Foundation (ASF) under one or more
	* contributor license agreements. See the NOTICE file distributed with
	* this work for additional information regarding copyright ownership.
	* The ASF licenses this file to You under the Apache License, Version 2.0
	* (the "License"); you may not use this file except in compliance with
	* the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/
	package org.apache.lucene.analysis.miscellaneous;

	import java.io.IOException;
	import java.util.Arrays;
	import java.util.Iterator;
	import org.apache.lucene.analysis.Analyzer;
	import org.apache.lucene.analysis.BaseTokenStreamTestCase;
	import org.apache.lucene.analysis.MockTokenizer;
	import org.apache.lucene.analysis.Token;
	import org.apache.lucene.analysis.TokenStream;
	import org.apache.lucene.analysis.Tokenizer;
	import org.apache.lucene.analysis.core.KeywordTokenizer;
	import org.apache.lucene.analysis.synonym.SynonymGraphFilter;
	import org.apache.lucene.analysis.synonym.SynonymMap;
	import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
	import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
	import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
	import org.apache.lucene.util.CharsRef;
	import org.apache.lucene.util.TestUtil;

	/**
	 * Tests for {@link RemoveDuplicatesTokenFilter}.
	 *
	 * <p>Hand-built token sequences are pushed through the filter and the surviving terms are
	 * compared with an expected list; two further tests exercise the filter inside an {@link
	 * Analyzer} with random and empty input.
	 */
	public class TestRemoveDuplicatesTokenFilter extends BaseTokenStreamTestCase {

	  /**
	   * Creates a token with the supplied term, offsets and position increment.
	   *
	   * @param pos position increment to set on the token
	   * @param t term text
	   * @param start start offset
	   * @param end end offset
	   * @return the newly created token
	   */
	  public static Token tok(int pos, String t, int start, int end) {
	    final Token result = new Token(t, start, end);
	    result.setPositionIncrement(pos);
	    return result;
	  }

	  /**
	   * Creates a token with the supplied term and position increment; both offsets are 0.
	   *
	   * @param pos position increment to set on the token
	   * @param t term text
	   * @return the newly created token
	   */
	  public static Token tok(int pos, String t) {
	    return tok(pos, t, 0, 0);
	  }

	  /**
	   * Feeds {@code tokens} into a {@link RemoveDuplicatesTokenFilter} and checks the filter's
	   * output against {@code expected}.
	   *
	   * @param expected the expected output terms, separated by single whitespace characters
	   * @param tokens the input tokens, in the order they are emitted
	   * @throws Exception if the token stream check fails
	   */
	  public void testDups(final String expected, final Token... tokens) throws Exception {
	    final Iterator<Token> pending = Arrays.asList(tokens).iterator();

	    // Bare-bones stream: copies term, offsets and position increment from each input token.
	    final TokenStream input =
	        new TokenStream() {
	          private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
	          private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
	          private final PositionIncrementAttribute posIncAtt =
	              addAttribute(PositionIncrementAttribute.class);

	          @Override
	          public boolean incrementToken() {
	            if (!pending.hasNext()) {
	              return false;
	            }
	            clearAttributes();
	            final Token current = pending.next();
	            termAtt.setEmpty().append(current);
	            offsetAtt.setOffset(current.startOffset(), current.endOffset());
	            posIncAtt.setPositionIncrement(current.getPositionIncrement());
	            return true;
	          }
	        };

	    final TokenStream ts = new RemoveDuplicatesTokenFilter(input);
	    assertTokenStreamContents(ts, expected.split("\\s"));
	  }

	  /**
	   * Every input token is expected in the output: both "B" tokens carry a position increment of
	   * 1, and the single increment-0 token ("D") differs in term from the preceding "C".
	   */
	  public void testNoDups() throws Exception {
	    testDups(
	        "A B B C D E",
	        tok(1, "A", 0, 4),
	        tok(1, "B", 5, 10),
	        tok(1, "B", 11, 15),
	        tok(1, "C", 16, 20),
	        tok(0, "D", 16, 20),
	        tok(1, "E", 21, 25));
	  }

	  /** The "B" token with position increment 0 is expected to be dropped. */
	  public void testSimpleDups() throws Exception {
	    testDups(
	        "A B C D E",
	        tok(1, "A", 0, 4),
	        tok(1, "B", 5, 10),
	        tok(0, "B", 11, 15),
	        tok(1, "C", 16, 20),
	        tok(0, "D", 16, 20),
	        tok(1, "E", 21, 25));
	  }

	  /**
	   * Multiple runs of increment-0 tokens, including a repeat of "J" that follows a different
	   * increment-0 term ("K"); every term is expected once in the output.
	   */
	  public void testComplexDups() throws Exception {
	    testDups(
	        "A B C D E F G H I J K",
	        tok(1, "A"),
	        tok(1, "B"),
	        tok(0, "B"),
	        tok(1, "C"),
	        tok(1, "D"),
	        tok(0, "D"),
	        tok(0, "D"),
	        tok(1, "E"),
	        tok(1, "F"),
	        tok(0, "F"),
	        tok(1, "G"),
	        tok(0, "H"),
	        tok(0, "H"),
	        tok(1, "I"),
	        tok(1, "J"),
	        tok(0, "K"),
	        tok(0, "J"));
	  }

	  // some helper methods for the below test with synonyms

	  /**
	   * Keeps generating random unicode strings until one is non-empty after trimming and free of
	   * the U+0000 character (the character {@link #add} inserts in place of runs of spaces).
	   */
	  private String randomNonEmptyString() {
	    String value;
	    do {
	      value = TestUtil.randomUnicodeString(random()).trim();
	    } while (value.isEmpty() || value.indexOf('\u0000') != -1);
	    return value;
	  }

	  /**
	   * Registers one synonym rule on {@code b}; each run of spaces in {@code input} and {@code
	   * output} is first replaced by a single U+0000 character.
	   */
	  private void add(SynonymMap.Builder b, String input, String output, boolean keepOrig) {
	    final CharsRef lhs = new CharsRef(input.replaceAll(" +", "\u0000"));
	    final CharsRef rhs = new CharsRef(output.replaceAll(" +", "\u0000"));
	    b.add(lhs, rhs, keepOrig);
	  }

	  /** blast some random strings through the analyzer */
	  public void testRandomStrings() throws Exception {
	    final int numIters = atLeast(3);
	    for (int i = 0; i < numIters; i++) {
	      final SynonymMap.Builder b = new SynonymMap.Builder(random().nextBoolean());
	      final int numEntries = atLeast(10);
	      for (int j = 0; j < numEntries; j++) {
	        add(b, randomNonEmptyString(), randomNonEmptyString(), random().nextBoolean());
	      }
	      final SynonymMap map = b.build();
	      final boolean ignoreCase = random().nextBoolean();

	      // Closed by try-with-resources, so a failing check cannot leave the analyzer open.
	      try (Analyzer analyzer =
	          new Analyzer() {
	            @Override
	            protected TokenStreamComponents createComponents(String fieldName) {
	              Tokenizer tokenizer = new MockTokenizer(MockTokenizer.SIMPLE, true);
	              TokenStream stream = new SynonymGraphFilter(tokenizer, map, ignoreCase);
	              return new TokenStreamComponents(tokenizer, new RemoveDuplicatesTokenFilter(stream));
	            }
	          }) {
	        checkRandomData(random(), analyzer, 200);
	      }
	    }
	  }

	  /** Applies the filter after a {@link KeywordTokenizer} to empty input, expecting an empty term. */
	  public void testEmptyTerm() throws IOException {
	    // Closed by try-with-resources, so a failing check cannot leave the analyzer open.
	    try (Analyzer a =
	        new Analyzer() {
	          @Override
	          protected TokenStreamComponents createComponents(String fieldName) {
	            Tokenizer tokenizer = new KeywordTokenizer();
	            return new TokenStreamComponents(tokenizer, new RemoveDuplicatesTokenFilter(tokenizer));
	          }
	        }) {
	      checkOneTerm(a, "", "");
	    }
	  }
	}