/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.analysis.shingle;
import java.io.IOException;
import java.io.StringReader;
import java.util.Random;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.CannedTokenStream;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.KeywordTokenizer;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
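/**
 * Tests {@link ShingleFilter}, which combines adjacent tokens into "shingles" (word n-grams).
 * Each fixture below pairs an input token array with the expected output tokens, position
 * increments, and types ("word" for unigrams, "shingle" for combined tokens).
 */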
public class TestShingleFilter extends BaseTokenStreamTestCase {
public static final Token[] TEST_TOKEN =
new Token[] {
createToken("please", 0, 6),
createToken("divide", 7, 13),
createToken("this", 14, 18),
createToken("sentence", 19, 27),
createToken("into", 28, 32),
createToken("shingles", 33, 39),
};
public static final int[] UNIGRAM_ONLY_POSITION_INCREMENTS = new int[] {1, 1, 1, 1, 1, 1};
public static final String[] UNIGRAM_ONLY_TYPES =
new String[] {"word", "word", "word", "word", "word", "word"};
public static Token[] testTokenWithHoles;
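// Expected bigram output with unigrams enabled: each input token is followed by the shingle
// starting at the same position, so every shingle carries a position increment of 0.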
public static final Token[] BI_GRAM_TOKENS =
new Token[] {
createToken("please", 0, 6),
createToken("please divide", 0, 13),
createToken("divide", 7, 13),
createToken("divide this", 7, 18),
createToken("this", 14, 18),
createToken("this sentence", 14, 27),
createToken("sentence", 19, 27),
createToken("sentence into", 19, 32),
createToken("into", 28, 32),
createToken("into shingles", 28, 39),
createToken("shingles", 33, 39),
};
public static final int[] BI_GRAM_POSITION_INCREMENTS =
new int[] {1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1};
public static final String[] BI_GRAM_TYPES =
new String[] {
"word", "shingle", "word", "shingle", "word", "shingle", "word", "shingle", "word",
"shingle", "word"
};
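// When the input stream has holes (positions skipped, e.g. by a stopword filter),
// ShingleFilter substitutes its filler token, "_" by default, for the missing tokens.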
public static final Token[] BI_GRAM_TOKENS_WITH_HOLES =
new Token[] {
createToken("please", 0, 6),
createToken("please divide", 0, 13),
createToken("divide", 7, 13),
createToken("divide _", 7, 19),
createToken("_ sentence", 19, 27),
createToken("sentence", 19, 27),
createToken("sentence _", 19, 33),
createToken("_ shingles", 33, 39),
createToken("shingles", 33, 39),
};
public static final int[] BI_GRAM_POSITION_INCREMENTS_WITH_HOLES =
new int[] {1, 0, 1, 0, 1, 1, 0, 1, 1};
private static final String[] BI_GRAM_TYPES_WITH_HOLES = {
"word", "shingle", "word", "shingle", "shingle", "word", "shingle", "shingle", "word"
};
public static final Token[] BI_GRAM_TOKENS_WITHOUT_UNIGRAMS =
new Token[] {
createToken("please divide", 0, 13),
createToken("divide this", 7, 18),
createToken("this sentence", 14, 27),
createToken("sentence into", 19, 32),
createToken("into shingles", 28, 39),
};
public static final int[] BI_GRAM_POSITION_INCREMENTS_WITHOUT_UNIGRAMS =
new int[] {1, 1, 1, 1, 1};
public static final String[] BI_GRAM_TYPES_WITHOUT_UNIGRAMS =
new String[] {"shingle", "shingle", "shingle", "shingle", "shingle"};
public static final Token[] BI_GRAM_TOKENS_WITH_HOLES_WITHOUT_UNIGRAMS =
new Token[] {
createToken("please divide", 0, 13),
createToken("divide _", 7, 19),
createToken("_ sentence", 19, 27),
createToken("sentence _", 19, 33),
createToken("_ shingles", 33, 39),
};
public static final int[] BI_GRAM_POSITION_INCREMENTS_WITH_HOLES_WITHOUT_UNIGRAMS =
new int[] {1, 1, 1, 1, 1};
public static final Token[] TEST_SINGLE_TOKEN = new Token[] {createToken("please", 0, 6)};
public static final Token[] SINGLE_TOKEN = new Token[] {createToken("please", 0, 6)};
public static final int[] SINGLE_TOKEN_INCREMENTS = new int[] {1};
public static final String[] SINGLE_TOKEN_TYPES = new String[] {"word"};
public static final Token[] EMPTY_TOKEN_ARRAY = new Token[] {};
public static final int[] EMPTY_TOKEN_INCREMENTS_ARRAY = new int[] {};
public static final String[] EMPTY_TOKEN_TYPES_ARRAY = new String[] {};
public static final Token[] TRI_GRAM_TOKENS =
new Token[] {
createToken("please", 0, 6),
createToken("please divide", 0, 13),
createToken("please divide this", 0, 18),
createToken("divide", 7, 13),
createToken("divide this", 7, 18),
createToken("divide this sentence", 7, 27),
createToken("this", 14, 18),
createToken("this sentence", 14, 27),
createToken("this sentence into", 14, 32),
createToken("sentence", 19, 27),
createToken("sentence into", 19, 32),
createToken("sentence into shingles", 19, 39),
createToken("into", 28, 32),
createToken("into shingles", 28, 39),
createToken("shingles", 33, 39)
};
public static final int[] TRI_GRAM_POSITION_INCREMENTS =
new int[] {1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1};
public static final String[] TRI_GRAM_TYPES =
new String[] {
"word", "shingle", "shingle", "word", "shingle", "shingle", "word", "shingle", "shingle",
"word", "shingle", "shingle", "word", "shingle", "word"
};
public static final Token[] TRI_GRAM_TOKENS_WITHOUT_UNIGRAMS =
new Token[] {
createToken("please divide", 0, 13),
createToken("please divide this", 0, 18),
createToken("divide this", 7, 18),
createToken("divide this sentence", 7, 27),
createToken("this sentence", 14, 27),
createToken("this sentence into", 14, 32),
createToken("sentence into", 19, 32),
createToken("sentence into shingles", 19, 39),
createToken("into shingles", 28, 39),
};
public static final int[] TRI_GRAM_POSITION_INCREMENTS_WITHOUT_UNIGRAMS =
new int[] {1, 0, 1, 0, 1, 0, 1, 0, 1};
public static final String[] TRI_GRAM_TYPES_WITHOUT_UNIGRAMS =
new String[] {
"shingle", "shingle",
"shingle", "shingle",
"shingle", "shingle",
"shingle", "shingle",
"shingle",
};
public static final Token[] FOUR_GRAM_TOKENS =
new Token[] {
createToken("please", 0, 6),
createToken("please divide", 0, 13),
createToken("please divide this", 0, 18),
createToken("please divide this sentence", 0, 27),
createToken("divide", 7, 13),
createToken("divide this", 7, 18),
createToken("divide this sentence", 7, 27),
createToken("divide this sentence into", 7, 32),
createToken("this", 14, 18),
createToken("this sentence", 14, 27),
createToken("this sentence into", 14, 32),
createToken("this sentence into shingles", 14, 39),
createToken("sentence", 19, 27),
createToken("sentence into", 19, 32),
createToken("sentence into shingles", 19, 39),
createToken("into", 28, 32),
createToken("into shingles", 28, 39),
createToken("shingles", 33, 39)
};
public static final int[] FOUR_GRAM_POSITION_INCREMENTS =
new int[] {1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1};
public static final String[] FOUR_GRAM_TYPES =
new String[] {
"word", "shingle", "shingle", "shingle", "word", "shingle", "shingle", "shingle", "word",
"shingle", "shingle", "shingle", "word", "shingle", "shingle", "word", "shingle", "word"
};
public static final Token[] FOUR_GRAM_TOKENS_WITHOUT_UNIGRAMS =
new Token[] {
createToken("please divide", 0, 13),
createToken("please divide this", 0, 18),
createToken("please divide this sentence", 0, 27),
createToken("divide this", 7, 18),
createToken("divide this sentence", 7, 27),
createToken("divide this sentence into", 7, 32),
createToken("this sentence", 14, 27),
createToken("this sentence into", 14, 32),
createToken("this sentence into shingles", 14, 39),
createToken("sentence into", 19, 32),
createToken("sentence into shingles", 19, 39),
createToken("into shingles", 28, 39),
};
public static final int[] FOUR_GRAM_POSITION_INCREMENTS_WITHOUT_UNIGRAMS =
new int[] {1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1};
public static final String[] FOUR_GRAM_TYPES_WITHOUT_UNIGRAMS =
new String[] {
"shingle", "shingle",
"shingle", "shingle",
"shingle", "shingle",
"shingle", "shingle",
"shingle", "shingle",
"shingle", "shingle",
};
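// Fixtures for a minimum shingle size greater than 2: shingles shorter than the minimum are
// skipped, so with min=3 only unigrams and trigrams (and larger) appear in the output.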
public static final Token[] TRI_GRAM_TOKENS_MIN_TRI_GRAM =
new Token[] {
createToken("please", 0, 6),
createToken("please divide this", 0, 18),
createToken("divide", 7, 13),
createToken("divide this sentence", 7, 27),
createToken("this", 14, 18),
createToken("this sentence into", 14, 32),
createToken("sentence", 19, 27),
createToken("sentence into shingles", 19, 39),
createToken("into", 28, 32),
createToken("shingles", 33, 39)
};
public static final int[] TRI_GRAM_POSITION_INCREMENTS_MIN_TRI_GRAM =
new int[] {1, 0, 1, 0, 1, 0, 1, 0, 1, 1};
public static final String[] TRI_GRAM_TYPES_MIN_TRI_GRAM =
new String[] {
"word", "shingle", "word", "shingle", "word", "shingle", "word", "shingle", "word", "word"
};
public static final Token[] TRI_GRAM_TOKENS_WITHOUT_UNIGRAMS_MIN_TRI_GRAM =
new Token[] {
createToken("please divide this", 0, 18),
createToken("divide this sentence", 7, 27),
createToken("this sentence into", 14, 32),
createToken("sentence into shingles", 19, 39)
};
public static final int[] TRI_GRAM_POSITION_INCREMENTS_WITHOUT_UNIGRAMS_MIN_TRI_GRAM =
new int[] {1, 1, 1, 1};
public static final String[] TRI_GRAM_TYPES_WITHOUT_UNIGRAMS_MIN_TRI_GRAM =
new String[] {"shingle", "shingle", "shingle", "shingle"};
public static final Token[] FOUR_GRAM_TOKENS_MIN_TRI_GRAM =
new Token[] {
createToken("please", 0, 6),
createToken("please divide this", 0, 18),
createToken("please divide this sentence", 0, 27),
createToken("divide", 7, 13),
createToken("divide this sentence", 7, 27),
createToken("divide this sentence into", 7, 32),
createToken("this", 14, 18),
createToken("this sentence into", 14, 32),
createToken("this sentence into shingles", 14, 39),
createToken("sentence", 19, 27),
createToken("sentence into shingles", 19, 39),
createToken("into", 28, 32),
createToken("shingles", 33, 39)
};
public static final int[] FOUR_GRAM_POSITION_INCREMENTS_MIN_TRI_GRAM =
new int[] {1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1};
public static final String[] FOUR_GRAM_TYPES_MIN_TRI_GRAM =
new String[] {
"word", "shingle", "shingle", "word", "shingle", "shingle", "word", "shingle", "shingle",
"word", "shingle", "word", "word"
};
public static final Token[] FOUR_GRAM_TOKENS_WITHOUT_UNIGRAMS_MIN_TRI_GRAM =
new Token[] {
createToken("please divide this", 0, 18),
createToken("please divide this sentence", 0, 27),
createToken("divide this sentence", 7, 27),
createToken("divide this sentence into", 7, 32),
createToken("this sentence into", 14, 32),
createToken("this sentence into shingles", 14, 39),
createToken("sentence into shingles", 19, 39),
};
public static final int[] FOUR_GRAM_POSITION_INCREMENTS_WITHOUT_UNIGRAMS_MIN_TRI_GRAM =
new int[] {1, 0, 1, 0, 1, 0, 1};
public static final String[] FOUR_GRAM_TYPES_WITHOUT_UNIGRAMS_MIN_TRI_GRAM =
new String[] {
"shingle", "shingle",
"shingle", "shingle",
"shingle", "shingle",
"shingle"
};
public static final Token[] FOUR_GRAM_TOKENS_MIN_FOUR_GRAM =
new Token[] {
createToken("please", 0, 6),
createToken("please divide this sentence", 0, 27),
createToken("divide", 7, 13),
createToken("divide this sentence into", 7, 32),
createToken("this", 14, 18),
createToken("this sentence into shingles", 14, 39),
createToken("sentence", 19, 27),
createToken("into", 28, 32),
createToken("shingles", 33, 39)
};
public static final int[] FOUR_GRAM_POSITION_INCREMENTS_MIN_FOUR_GRAM =
new int[] {1, 0, 1, 0, 1, 0, 1, 1, 1};
public static final String[] FOUR_GRAM_TYPES_MIN_FOUR_GRAM =
new String[] {
"word", "shingle", "word", "shingle", "word", "shingle", "word", "word", "word"
};
public static final Token[] FOUR_GRAM_TOKENS_WITHOUT_UNIGRAMS_MIN_FOUR_GRAM =
new Token[] {
createToken("please divide this sentence", 0, 27),
createToken("divide this sentence into", 7, 32),
createToken("this sentence into shingles", 14, 39),
};
public static final int[] FOUR_GRAM_POSITION_INCREMENTS_WITHOUT_UNIGRAMS_MIN_FOUR_GRAM =
new int[] {1, 1, 1};
public static final String[] FOUR_GRAM_TYPES_WITHOUT_UNIGRAMS_MIN_FOUR_GRAM =
new String[] {"shingle", "shingle", "shingle"};
public static final Token[] BI_GRAM_TOKENS_NO_SEPARATOR =
new Token[] {
createToken("please", 0, 6),
createToken("pleasedivide", 0, 13),
createToken("divide", 7, 13),
createToken("dividethis", 7, 18),
createToken("this", 14, 18),
createToken("thissentence", 14, 27),
createToken("sentence", 19, 27),
createToken("sentenceinto", 19, 32),
createToken("into", 28, 32),
createToken("intoshingles", 28, 39),
createToken("shingles", 33, 39),
};
public static final int[] BI_GRAM_POSITION_INCREMENTS_NO_SEPARATOR =
new int[] {1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1};
public static final String[] BI_GRAM_TYPES_NO_SEPARATOR =
new String[] {
"word", "shingle", "word", "shingle", "word", "shingle", "word", "shingle", "word",
"shingle", "word"
};
public static final Token[] BI_GRAM_TOKENS_WITHOUT_UNIGRAMS_NO_SEPARATOR =
new Token[] {
createToken("pleasedivide", 0, 13),
createToken("dividethis", 7, 18),
createToken("thissentence", 14, 27),
createToken("sentenceinto", 19, 32),
createToken("intoshingles", 28, 39),
};
public static final int[] BI_GRAM_POSITION_INCREMENTS_WITHOUT_UNIGRAMS_NO_SEPARATOR =
new int[] {1, 1, 1, 1, 1};
public static final String[] BI_GRAM_TYPES_WITHOUT_UNIGRAMS_NO_SEPARATOR =
new String[] {"shingle", "shingle", "shingle", "shingle", "shingle"};
public static final Token[] TRI_GRAM_TOKENS_NO_SEPARATOR =
new Token[] {
createToken("please", 0, 6),
createToken("pleasedivide", 0, 13),
createToken("pleasedividethis", 0, 18),
createToken("divide", 7, 13),
createToken("dividethis", 7, 18),
createToken("dividethissentence", 7, 27),
createToken("this", 14, 18),
createToken("thissentence", 14, 27),
createToken("thissentenceinto", 14, 32),
createToken("sentence", 19, 27),
createToken("sentenceinto", 19, 32),
createToken("sentenceintoshingles", 19, 39),
createToken("into", 28, 32),
createToken("intoshingles", 28, 39),
createToken("shingles", 33, 39)
};
public static final int[] TRI_GRAM_POSITION_INCREMENTS_NO_SEPARATOR =
new int[] {1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1};
public static final String[] TRI_GRAM_TYPES_NO_SEPARATOR =
new String[] {
"word", "shingle", "shingle", "word", "shingle", "shingle", "word", "shingle", "shingle",
"word", "shingle", "shingle", "word", "shingle", "word"
};
public static final Token[] TRI_GRAM_TOKENS_WITHOUT_UNIGRAMS_NO_SEPARATOR =
new Token[] {
createToken("pleasedivide", 0, 13),
createToken("pleasedividethis", 0, 18),
createToken("dividethis", 7, 18),
createToken("dividethissentence", 7, 27),
createToken("thissentence", 14, 27),
createToken("thissentenceinto", 14, 32),
createToken("sentenceinto", 19, 32),
createToken("sentenceintoshingles", 19, 39),
createToken("intoshingles", 28, 39),
};
public static final int[] TRI_GRAM_POSITION_INCREMENTS_WITHOUT_UNIGRAMS_NO_SEPARATOR =
new int[] {1, 0, 1, 0, 1, 0, 1, 0, 1};
public static final String[] TRI_GRAM_TYPES_WITHOUT_UNIGRAMS_NO_SEPARATOR =
new String[] {
"shingle", "shingle",
"shingle", "shingle",
"shingle", "shingle",
"shingle", "shingle",
"shingle",
};
public static final Token[] BI_GRAM_TOKENS_ALT_SEPARATOR =
new Token[] {
createToken("please", 0, 6),
createToken("please<SEP>divide", 0, 13),
createToken("divide", 7, 13),
createToken("divide<SEP>this", 7, 18),
createToken("this", 14, 18),
createToken("this<SEP>sentence", 14, 27),
createToken("sentence", 19, 27),
createToken("sentence<SEP>into", 19, 32),
createToken("into", 28, 32),
createToken("into<SEP>shingles", 28, 39),
createToken("shingles", 33, 39),
};
public static final int[] BI_GRAM_POSITION_INCREMENTS_ALT_SEPARATOR =
new int[] {1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1};
public static final String[] BI_GRAM_TYPES_ALT_SEPARATOR =
new String[] {
"word", "shingle", "word", "shingle", "word", "shingle", "word", "shingle", "word",
"shingle", "word"
};
public static final Token[] BI_GRAM_TOKENS_WITHOUT_UNIGRAMS_ALT_SEPARATOR =
new Token[] {
createToken("please<SEP>divide", 0, 13),
createToken("divide<SEP>this", 7, 18),
createToken("this<SEP>sentence", 14, 27),
createToken("sentence<SEP>into", 19, 32),
createToken("into<SEP>shingles", 28, 39),
};
public static final int[] BI_GRAM_POSITION_INCREMENTS_WITHOUT_UNIGRAMS_ALT_SEPARATOR =
new int[] {1, 1, 1, 1, 1};
public static final String[] BI_GRAM_TYPES_WITHOUT_UNIGRAMS_ALT_SEPARATOR =
new String[] {"shingle", "shingle", "shingle", "shingle", "shingle"};
public static final Token[] TRI_GRAM_TOKENS_ALT_SEPARATOR =
new Token[] {
createToken("please", 0, 6),
createToken("please<SEP>divide", 0, 13),
createToken("please<SEP>divide<SEP>this", 0, 18),
createToken("divide", 7, 13),
createToken("divide<SEP>this", 7, 18),
createToken("divide<SEP>this<SEP>sentence", 7, 27),
createToken("this", 14, 18),
createToken("this<SEP>sentence", 14, 27),
createToken("this<SEP>sentence<SEP>into", 14, 32),
createToken("sentence", 19, 27),
createToken("sentence<SEP>into", 19, 32),
createToken("sentence<SEP>into<SEP>shingles", 19, 39),
createToken("into", 28, 32),
createToken("into<SEP>shingles", 28, 39),
createToken("shingles", 33, 39)
};
public static final int[] TRI_GRAM_POSITION_INCREMENTS_ALT_SEPARATOR =
new int[] {1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1};
public static final String[] TRI_GRAM_TYPES_ALT_SEPARATOR =
new String[] {
"word", "shingle", "shingle", "word", "shingle", "shingle", "word", "shingle", "shingle",
"word", "shingle", "shingle", "word", "shingle", "word"
};
public static final Token[] TRI_GRAM_TOKENS_WITHOUT_UNIGRAMS_ALT_SEPARATOR =
new Token[] {
createToken("please<SEP>divide", 0, 13),
createToken("please<SEP>divide<SEP>this", 0, 18),
createToken("divide<SEP>this", 7, 18),
createToken("divide<SEP>this<SEP>sentence", 7, 27),
createToken("this<SEP>sentence", 14, 27),
createToken("this<SEP>sentence<SEP>into", 14, 32),
createToken("sentence<SEP>into", 19, 32),
createToken("sentence<SEP>into<SEP>shingles", 19, 39),
createToken("into<SEP>shingles", 28, 39),
};
public static final int[] TRI_GRAM_POSITION_INCREMENTS_WITHOUT_UNIGRAMS_ALT_SEPARATOR =
new int[] {1, 0, 1, 0, 1, 0, 1, 0, 1};
public static final String[] TRI_GRAM_TYPES_WITHOUT_UNIGRAMS_ALT_SEPARATOR =
new String[] {
"shingle", "shingle",
"shingle", "shingle",
"shingle", "shingle",
"shingle", "shingle",
"shingle",
};
public static final Token[] TRI_GRAM_TOKENS_NULL_SEPARATOR =
new Token[] {
createToken("please", 0, 6),
createToken("pleasedivide", 0, 13),
createToken("pleasedividethis", 0, 18),
createToken("divide", 7, 13),
createToken("dividethis", 7, 18),
createToken("dividethissentence", 7, 27),
createToken("this", 14, 18),
createToken("thissentence", 14, 27),
createToken("thissentenceinto", 14, 32),
createToken("sentence", 19, 27),
createToken("sentenceinto", 19, 32),
createToken("sentenceintoshingles", 19, 39),
createToken("into", 28, 32),
createToken("intoshingles", 28, 39),
createToken("shingles", 33, 39)
};
public static final int[] TRI_GRAM_POSITION_INCREMENTS_NULL_SEPARATOR =
new int[] {1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1};
public static final String[] TRI_GRAM_TYPES_NULL_SEPARATOR =
new String[] {
"word", "shingle", "shingle", "word", "shingle", "shingle", "word", "shingle", "shingle",
"word", "shingle", "shingle", "word", "shingle", "word"
};
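// Fixtures for inputs whose position increment is equal to or greater than the shingle size;
// the four-argument createToken overload sets the increment, and the skipped positions appear
// as filler tokens in the expected shingles.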
public static final Token[] TEST_TOKEN_POS_INCR_EQUAL_TO_N =
new Token[] {
createToken("please", 0, 6),
createToken("divide", 7, 13),
createToken("this", 14, 18),
createToken("sentence", 29, 37, 3),
createToken("into", 38, 42),
createToken("shingles", 43, 49),
};
public static final Token[] TRI_GRAM_TOKENS_POS_INCR_EQUAL_TO_N =
new Token[] {
createToken("please", 0, 6),
createToken("please divide", 0, 13),
createToken("please divide this", 0, 18),
createToken("divide", 7, 13),
createToken("divide this", 7, 18),
createToken("divide this _", 7, 29),
createToken("this", 14, 18),
createToken("this _", 14, 29),
createToken("this _ _", 14, 29),
createToken("_ _ sentence", 29, 37),
createToken("_ sentence", 29, 37),
createToken("_ sentence into", 29, 42),
createToken("sentence", 29, 37),
createToken("sentence into", 29, 42),
createToken("sentence into shingles", 29, 49),
createToken("into", 38, 42),
createToken("into shingles", 38, 49),
createToken("shingles", 43, 49)
};
public static final int[] TRI_GRAM_POSITION_INCREMENTS_POS_INCR_EQUAL_TO_N =
new int[] {1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1};
public static final String[] TRI_GRAM_TYPES_POS_INCR_EQUAL_TO_N =
new String[] {
"word", "shingle", "shingle", "word", "shingle", "shingle", "word", "shingle", "shingle",
"shingle", "shingle", "shingle", "word", "shingle", "shingle", "word", "shingle", "word"
};
public static final Token[] TRI_GRAM_TOKENS_POS_INCR_EQUAL_TO_N_WITHOUT_UNIGRAMS =
new Token[] {
createToken("please divide", 0, 13),
createToken("please divide this", 0, 18),
createToken("divide this", 7, 18),
createToken("divide this _", 7, 29),
createToken("this _", 14, 29),
createToken("this _ _", 14, 29),
createToken("_ _ sentence", 29, 37),
createToken("_ sentence", 29, 37),
createToken("_ sentence into", 29, 42),
createToken("sentence into", 29, 42),
createToken("sentence into shingles", 29, 49),
createToken("into shingles", 38, 49),
};
public static final int[] TRI_GRAM_POSITION_INCREMENTS_POS_INCR_EQUAL_TO_N_WITHOUT_UNIGRAMS =
new int[] {1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1};
public static final String[] TRI_GRAM_TYPES_POS_INCR_EQUAL_TO_N_WITHOUT_UNIGRAMS =
new String[] {
"shingle", "shingle", "shingle", "shingle", "shingle", "shingle", "shingle", "shingle",
"shingle", "shingle", "shingle", "shingle",
};
public static final Token[] TEST_TOKEN_POS_INCR_GREATER_THAN_N =
new Token[] {
createToken("please", 0, 6),
createToken("divide", 57, 63, 8),
createToken("this", 64, 68),
createToken("sentence", 69, 77),
createToken("into", 78, 82),
createToken("shingles", 83, 89),
};
public static final Token[] TRI_GRAM_TOKENS_POS_INCR_GREATER_THAN_N =
new Token[] {
createToken("please", 0, 6),
createToken("please _", 0, 57),
createToken("please _ _", 0, 57),
createToken("_ _ divide", 57, 63),
createToken("_ divide", 57, 63),
createToken("_ divide this", 57, 68),
createToken("divide", 57, 63),
createToken("divide this", 57, 68),
createToken("divide this sentence", 57, 77),
createToken("this", 64, 68),
createToken("this sentence", 64, 77),
createToken("this sentence into", 64, 82),
createToken("sentence", 69, 77),
createToken("sentence into", 69, 82),
createToken("sentence into shingles", 69, 89),
createToken("into", 78, 82),
createToken("into shingles", 78, 89),
createToken("shingles", 83, 89)
};
public static final int[] TRI_GRAM_POSITION_INCREMENTS_POS_INCR_GREATER_THAN_N =
new int[] {1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1};
public static final String[] TRI_GRAM_TYPES_POS_INCR_GREATER_THAN_N =
new String[] {
"word", "shingle", "shingle", "shingle", "shingle", "shingle", "word", "shingle", "shingle",
"word", "shingle", "shingle", "word", "shingle", "shingle", "word", "shingle", "word"
};
public static final Token[] TRI_GRAM_TOKENS_POS_INCR_GREATER_THAN_N_WITHOUT_UNIGRAMS =
new Token[] {
createToken("please _", 0, 57),
createToken("please _ _", 0, 57),
createToken("_ _ divide", 57, 63),
createToken("_ divide", 57, 63),
createToken("_ divide this", 57, 68),
createToken("divide this", 57, 68),
createToken("divide this sentence", 57, 77),
createToken("this sentence", 64, 77),
createToken("this sentence into", 64, 82),
createToken("sentence into", 69, 82),
createToken("sentence into shingles", 69, 89),
createToken("into shingles", 78, 89),
};
public static final int[] TRI_GRAM_POSITION_INCREMENTS_POS_INCR_GREATER_THAN_N_WITHOUT_UNIGRAMS =
new int[] {1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1};
public static final String[] TRI_GRAM_TYPES_POS_INCR_GREATER_THAN_N_WITHOUT_UNIGRAMS =
new String[] {
"shingle", "shingle", "shingle", "shingle", "shingle", "shingle", "shingle", "shingle",
"shingle", "shingle", "shingle", "shingle",
};
@Override
public void setUp() throws Exception {
super.setUp();
testTokenWithHoles =
new Token[] {
createToken("please", 0, 6),
createToken("divide", 7, 13),
createToken("sentence", 19, 27, 2),
createToken("shingles", 33, 39, 2),
};
}
/*
* Class under test for void ShingleFilter(TokenStream, int)
*/
public void testBiGramFilter() throws IOException {
this.shingleFilterTest(
2, TEST_TOKEN, BI_GRAM_TOKENS, BI_GRAM_POSITION_INCREMENTS, BI_GRAM_TYPES, true);
}
public void testBiGramFilterWithHoles() throws IOException {
this.shingleFilterTest(
2,
testTokenWithHoles,
BI_GRAM_TOKENS_WITH_HOLES,
BI_GRAM_POSITION_INCREMENTS_WITH_HOLES,
BI_GRAM_TYPES_WITH_HOLES,
true);
}
public void testBiGramFilterWithoutUnigrams() throws IOException {
this.shingleFilterTest(
2,
TEST_TOKEN,
BI_GRAM_TOKENS_WITHOUT_UNIGRAMS,
BI_GRAM_POSITION_INCREMENTS_WITHOUT_UNIGRAMS,
BI_GRAM_TYPES_WITHOUT_UNIGRAMS,
false);
}
public void testBiGramFilterWithHolesWithoutUnigrams() throws IOException {
this.shingleFilterTest(
2,
testTokenWithHoles,
BI_GRAM_TOKENS_WITH_HOLES_WITHOUT_UNIGRAMS,
BI_GRAM_POSITION_INCREMENTS_WITH_HOLES_WITHOUT_UNIGRAMS,
BI_GRAM_TYPES_WITHOUT_UNIGRAMS,
false);
}
public void testBiGramFilterWithSingleToken() throws IOException {
this.shingleFilterTest(
2, TEST_SINGLE_TOKEN, SINGLE_TOKEN, SINGLE_TOKEN_INCREMENTS, SINGLE_TOKEN_TYPES, true);
}
public void testBiGramFilterWithSingleTokenWithoutUnigrams() throws IOException {
this.shingleFilterTest(
2,
TEST_SINGLE_TOKEN,
EMPTY_TOKEN_ARRAY,
EMPTY_TOKEN_INCREMENTS_ARRAY,
EMPTY_TOKEN_TYPES_ARRAY,
false);
}
public void testBiGramFilterWithEmptyTokenStream() throws IOException {
this.shingleFilterTest(
2,
EMPTY_TOKEN_ARRAY,
EMPTY_TOKEN_ARRAY,
EMPTY_TOKEN_INCREMENTS_ARRAY,
EMPTY_TOKEN_TYPES_ARRAY,
true);
}
public void testBiGramFilterWithEmptyTokenStreamWithoutUnigrams() throws IOException {
this.shingleFilterTest(
2,
EMPTY_TOKEN_ARRAY,
EMPTY_TOKEN_ARRAY,
EMPTY_TOKEN_INCREMENTS_ARRAY,
EMPTY_TOKEN_TYPES_ARRAY,
false);
}
public void testTriGramFilter() throws IOException {
this.shingleFilterTest(
3, TEST_TOKEN, TRI_GRAM_TOKENS, TRI_GRAM_POSITION_INCREMENTS, TRI_GRAM_TYPES, true);
}
public void testTriGramFilterWithoutUnigrams() throws IOException {
this.shingleFilterTest(
3,
TEST_TOKEN,
TRI_GRAM_TOKENS_WITHOUT_UNIGRAMS,
TRI_GRAM_POSITION_INCREMENTS_WITHOUT_UNIGRAMS,
TRI_GRAM_TYPES_WITHOUT_UNIGRAMS,
false);
}
public void testFourGramFilter() throws IOException {
this.shingleFilterTest(
4, TEST_TOKEN, FOUR_GRAM_TOKENS, FOUR_GRAM_POSITION_INCREMENTS, FOUR_GRAM_TYPES, true);
}
public void testFourGramFilterWithoutUnigrams() throws IOException {
this.shingleFilterTest(
4,
TEST_TOKEN,
FOUR_GRAM_TOKENS_WITHOUT_UNIGRAMS,
FOUR_GRAM_POSITION_INCREMENTS_WITHOUT_UNIGRAMS,
FOUR_GRAM_TYPES_WITHOUT_UNIGRAMS,
false);
}
public void testTriGramFilterMinTriGram() throws IOException {
this.shingleFilterTest(
3,
3,
TEST_TOKEN,
TRI_GRAM_TOKENS_MIN_TRI_GRAM,
TRI_GRAM_POSITION_INCREMENTS_MIN_TRI_GRAM,
TRI_GRAM_TYPES_MIN_TRI_GRAM,
true);
}
public void testTriGramFilterWithoutUnigramsMinTriGram() throws IOException {
this.shingleFilterTest(
3,
3,
TEST_TOKEN,
TRI_GRAM_TOKENS_WITHOUT_UNIGRAMS_MIN_TRI_GRAM,
TRI_GRAM_POSITION_INCREMENTS_WITHOUT_UNIGRAMS_MIN_TRI_GRAM,
TRI_GRAM_TYPES_WITHOUT_UNIGRAMS_MIN_TRI_GRAM,
false);
}
public void testFourGramFilterMinTriGram() throws IOException {
this.shingleFilterTest(
3,
4,
TEST_TOKEN,
FOUR_GRAM_TOKENS_MIN_TRI_GRAM,
FOUR_GRAM_POSITION_INCREMENTS_MIN_TRI_GRAM,
FOUR_GRAM_TYPES_MIN_TRI_GRAM,
true);
}
public void testFourGramFilterWithoutUnigramsMinTriGram() throws IOException {
this.shingleFilterTest(
3,
4,
TEST_TOKEN,
FOUR_GRAM_TOKENS_WITHOUT_UNIGRAMS_MIN_TRI_GRAM,
FOUR_GRAM_POSITION_INCREMENTS_WITHOUT_UNIGRAMS_MIN_TRI_GRAM,
FOUR_GRAM_TYPES_WITHOUT_UNIGRAMS_MIN_TRI_GRAM,
false);
}
public void testFourGramFilterMinFourGram() throws IOException {
this.shingleFilterTest(
4,
4,
TEST_TOKEN,
FOUR_GRAM_TOKENS_MIN_FOUR_GRAM,
FOUR_GRAM_POSITION_INCREMENTS_MIN_FOUR_GRAM,
FOUR_GRAM_TYPES_MIN_FOUR_GRAM,
true);
}
public void testFourGramFilterWithoutUnigramsMinFourGram() throws IOException {
this.shingleFilterTest(
4,
4,
TEST_TOKEN,
FOUR_GRAM_TOKENS_WITHOUT_UNIGRAMS_MIN_FOUR_GRAM,
FOUR_GRAM_POSITION_INCREMENTS_WITHOUT_UNIGRAMS_MIN_FOUR_GRAM,
FOUR_GRAM_TYPES_WITHOUT_UNIGRAMS_MIN_FOUR_GRAM,
false);
}
public void testBiGramFilterNoSeparator() throws IOException {
this.shingleFilterTest(
"",
2,
2,
TEST_TOKEN,
BI_GRAM_TOKENS_NO_SEPARATOR,
BI_GRAM_POSITION_INCREMENTS_NO_SEPARATOR,
BI_GRAM_TYPES_NO_SEPARATOR,
true);
}
public void testBiGramFilterWithoutUnigramsNoSeparator() throws IOException {
this.shingleFilterTest(
"",
2,
2,
TEST_TOKEN,
BI_GRAM_TOKENS_WITHOUT_UNIGRAMS_NO_SEPARATOR,
BI_GRAM_POSITION_INCREMENTS_WITHOUT_UNIGRAMS_NO_SEPARATOR,
BI_GRAM_TYPES_WITHOUT_UNIGRAMS_NO_SEPARATOR,
false);
}
public void testTriGramFilterNoSeparator() throws IOException {
this.shingleFilterTest(
"",
2,
3,
TEST_TOKEN,
TRI_GRAM_TOKENS_NO_SEPARATOR,
TRI_GRAM_POSITION_INCREMENTS_NO_SEPARATOR,
TRI_GRAM_TYPES_NO_SEPARATOR,
true);
}
public void testTriGramFilterWithoutUnigramsNoSeparator() throws IOException {
this.shingleFilterTest(
"",
2,
3,
TEST_TOKEN,
TRI_GRAM_TOKENS_WITHOUT_UNIGRAMS_NO_SEPARATOR,
TRI_GRAM_POSITION_INCREMENTS_WITHOUT_UNIGRAMS_NO_SEPARATOR,
TRI_GRAM_TYPES_WITHOUT_UNIGRAMS_NO_SEPARATOR,
false);
}
public void testBiGramFilterAltSeparator() throws IOException {
this.shingleFilterTest(
"<SEP>",
2,
2,
TEST_TOKEN,
BI_GRAM_TOKENS_ALT_SEPARATOR,
BI_GRAM_POSITION_INCREMENTS_ALT_SEPARATOR,
BI_GRAM_TYPES_ALT_SEPARATOR,
true);
}
public void testBiGramFilterWithoutUnigramsAltSeparator() throws IOException {
this.shingleFilterTest(
"<SEP>",
2,
2,
TEST_TOKEN,
BI_GRAM_TOKENS_WITHOUT_UNIGRAMS_ALT_SEPARATOR,
BI_GRAM_POSITION_INCREMENTS_WITHOUT_UNIGRAMS_ALT_SEPARATOR,
BI_GRAM_TYPES_WITHOUT_UNIGRAMS_ALT_SEPARATOR,
false);
}
public void testTriGramFilterAltSeparator() throws IOException {
this.shingleFilterTest(
"<SEP>",
2,
3,
TEST_TOKEN,
TRI_GRAM_TOKENS_ALT_SEPARATOR,
TRI_GRAM_POSITION_INCREMENTS_ALT_SEPARATOR,
TRI_GRAM_TYPES_ALT_SEPARATOR,
true);
}
public void testTriGramFilterWithoutUnigramsAltSeparator() throws IOException {
this.shingleFilterTest(
"<SEP>",
2,
3,
TEST_TOKEN,
TRI_GRAM_TOKENS_WITHOUT_UNIGRAMS_ALT_SEPARATOR,
TRI_GRAM_POSITION_INCREMENTS_WITHOUT_UNIGRAMS_ALT_SEPARATOR,
TRI_GRAM_TYPES_WITHOUT_UNIGRAMS_ALT_SEPARATOR,
false);
}
public void testTriGramFilterNullSeparator() throws IOException {
this.shingleFilterTest(
null,
2,
3,
TEST_TOKEN,
TRI_GRAM_TOKENS_NULL_SEPARATOR,
TRI_GRAM_POSITION_INCREMENTS_NULL_SEPARATOR,
TRI_GRAM_TYPES_NULL_SEPARATOR,
true);
}
public void testPositionIncrementEqualToN() throws IOException {
this.shingleFilterTest(
2,
3,
TEST_TOKEN_POS_INCR_EQUAL_TO_N,
TRI_GRAM_TOKENS_POS_INCR_EQUAL_TO_N,
TRI_GRAM_POSITION_INCREMENTS_POS_INCR_EQUAL_TO_N,
TRI_GRAM_TYPES_POS_INCR_EQUAL_TO_N,
true);
}
public void testPositionIncrementEqualToNWithoutUnigrams() throws IOException {
this.shingleFilterTest(
2,
3,
TEST_TOKEN_POS_INCR_EQUAL_TO_N,
TRI_GRAM_TOKENS_POS_INCR_EQUAL_TO_N_WITHOUT_UNIGRAMS,
TRI_GRAM_POSITION_INCREMENTS_POS_INCR_EQUAL_TO_N_WITHOUT_UNIGRAMS,
TRI_GRAM_TYPES_POS_INCR_EQUAL_TO_N_WITHOUT_UNIGRAMS,
false);
}
public void testPositionIncrementGreaterThanN() throws IOException {
this.shingleFilterTest(
2,
3,
TEST_TOKEN_POS_INCR_GREATER_THAN_N,
TRI_GRAM_TOKENS_POS_INCR_GREATER_THAN_N,
TRI_GRAM_POSITION_INCREMENTS_POS_INCR_GREATER_THAN_N,
TRI_GRAM_TYPES_POS_INCR_GREATER_THAN_N,
true);
}
public void testPositionIncrementGreaterThanNWithoutUnigrams() throws IOException {
this.shingleFilterTest(
2,
3,
TEST_TOKEN_POS_INCR_GREATER_THAN_N,
TRI_GRAM_TOKENS_POS_INCR_GREATER_THAN_N_WITHOUT_UNIGRAMS,
TRI_GRAM_POSITION_INCREMENTS_POS_INCR_GREATER_THAN_N_WITHOUT_UNIGRAMS,
TRI_GRAM_TYPES_POS_INCR_GREATER_THAN_N_WITHOUT_UNIGRAMS,
false);
}
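// Consuming the same filter chain twice over a fresh reader verifies that reset() clears all
// internal state.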
public void testReset() throws Exception {
Tokenizer wsTokenizer = new WhitespaceTokenizer();
wsTokenizer.setReader(new StringReader("please divide this sentence"));
TokenStream filter = new ShingleFilter(wsTokenizer, 2);
assertTokenStreamContents(
filter,
new String[] {
"please", "please divide", "divide", "divide this", "this", "this sentence", "sentence"
},
new int[] {0, 0, 7, 7, 14, 14, 19},
new int[] {6, 13, 13, 18, 18, 27, 27},
new String[] {
TypeAttribute.DEFAULT_TYPE,
"shingle",
TypeAttribute.DEFAULT_TYPE,
"shingle",
TypeAttribute.DEFAULT_TYPE,
"shingle",
TypeAttribute.DEFAULT_TYPE
},
new int[] {1, 0, 1, 0, 1, 0, 1});
wsTokenizer.setReader(new StringReader("please divide this sentence"));
assertTokenStreamContents(
filter,
new String[] {
"please", "please divide", "divide", "divide this", "this", "this sentence", "sentence"
},
new int[] {0, 0, 7, 7, 14, 14, 19},
new int[] {6, 13, 13, 18, 18, 27, 27},
new String[] {
TypeAttribute.DEFAULT_TYPE,
"shingle",
TypeAttribute.DEFAULT_TYPE,
"shingle",
TypeAttribute.DEFAULT_TYPE,
"shingle",
TypeAttribute.DEFAULT_TYPE
},
new int[] {1, 0, 1, 0, 1, 0, 1});
}
public void testOutputUnigramsIfNoShinglesSingleTokenCase() throws IOException {
// Single token input with outputUnigrams==false is the primary case where
// enabling this option should alter program behavior.
this.shingleFilterTest(
2,
2,
TEST_SINGLE_TOKEN,
SINGLE_TOKEN,
SINGLE_TOKEN_INCREMENTS,
SINGLE_TOKEN_TYPES,
false,
true);
}
public void testOutputUnigramsIfNoShinglesWithSimpleBigram() throws IOException {
// Here we expect the same result as with testBiGramFilter().
this.shingleFilterTest(
2, 2, TEST_TOKEN, BI_GRAM_TOKENS, BI_GRAM_POSITION_INCREMENTS, BI_GRAM_TYPES, true, true);
}
public void testOutputUnigramsIfNoShinglesWithSimpleUnigramlessBigram() throws IOException {
// Here we expect the same result as with testBiGramFilterWithoutUnigrams().
this.shingleFilterTest(
2,
2,
TEST_TOKEN,
BI_GRAM_TOKENS_WITHOUT_UNIGRAMS,
BI_GRAM_POSITION_INCREMENTS_WITHOUT_UNIGRAMS,
BI_GRAM_TYPES_WITHOUT_UNIGRAMS,
false,
true);
}
public void testOutputUnigramsIfNoShinglesWithMultipleInputTokens() throws IOException {
// Test when the minimum shingle size is greater than the number of input tokens
this.shingleFilterTest(
7,
7,
TEST_TOKEN,
TEST_TOKEN,
UNIGRAM_ONLY_POSITION_INCREMENTS,
UNIGRAM_ONLY_TYPES,
false,
true);
}
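/**
 * Runs a ShingleFilter over a canned token stream and checks the produced terms, offsets,
 * position increments, and types against the given expectations.
 */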
protected void shingleFilterTest(
int maxSize,
Token[] tokensToShingle,
Token[] tokensToCompare,
int[] positionIncrements,
String[] types,
boolean outputUnigrams)
throws IOException {
ShingleFilter filter = new ShingleFilter(new CannedTokenStream(tokensToShingle), maxSize);
filter.setOutputUnigrams(outputUnigrams);
shingleFilterTestCommon(filter, tokensToCompare, positionIncrements, types);
}
protected void shingleFilterTest(
int minSize,
int maxSize,
Token[] tokensToShingle,
Token[] tokensToCompare,
int[] positionIncrements,
String[] types,
boolean outputUnigrams)
throws IOException {
ShingleFilter filter =
new ShingleFilter(new CannedTokenStream(tokensToShingle), minSize, maxSize);
filter.setOutputUnigrams(outputUnigrams);
shingleFilterTestCommon(filter, tokensToCompare, positionIncrements, types);
}
protected void shingleFilterTest(
int minSize,
int maxSize,
Token[] tokensToShingle,
Token[] tokensToCompare,
int[] positionIncrements,
String[] types,
boolean outputUnigrams,
boolean outputUnigramsIfNoShingles)
throws IOException {
ShingleFilter filter =
new ShingleFilter(new CannedTokenStream(tokensToShingle), minSize, maxSize);
filter.setOutputUnigrams(outputUnigrams);
filter.setOutputUnigramsIfNoShingles(outputUnigramsIfNoShingles);
shingleFilterTestCommon(filter, tokensToCompare, positionIncrements, types);
}
protected void shingleFilterTest(
String tokenSeparator,
int minSize,
int maxSize,
Token[] tokensToShingle,
Token[] tokensToCompare,
int[] positionIncrements,
String[] types,
boolean outputUnigrams)
throws IOException {
ShingleFilter filter =
new ShingleFilter(new CannedTokenStream(tokensToShingle), minSize, maxSize);
filter.setTokenSeparator(tokenSeparator);
filter.setOutputUnigrams(outputUnigrams);
shingleFilterTestCommon(filter, tokensToCompare, positionIncrements, types);
}
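/**
 * Extracts the expected terms and offsets from {@code tokensToCompare} and asserts that the
 * filter's output matches them along with the given types and position increments.
 */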
protected void shingleFilterTestCommon(
ShingleFilter filter, Token[] tokensToCompare, int[] positionIncrements, String[] types)
throws IOException {
String[] text = new String[tokensToCompare.length];
int[] startOffsets = new int[tokensToCompare.length];
int[] endOffsets = new int[tokensToCompare.length];
for (int i = 0; i < tokensToCompare.length; i++) {
text[i] = new String(tokensToCompare[i].buffer(), 0, tokensToCompare[i].length());
startOffsets[i] = tokensToCompare[i].startOffset();
endOffsets[i] = tokensToCompare[i].endOffset();
}
assertTokenStreamContents(filter, text, startOffsets, endOffsets, types, positionIncrements);
}
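/** Creates a token with the given term text and start/end offsets, and a position increment of 1. */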
private static Token createToken(String term, int start, int end) {
return createToken(term, start, end, 1);
}
private static Token createToken(String term, int start, int end, int positionIncrement) {
Token token = new Token();
token.setOffset(start, end);
token.copyBuffer(term.toCharArray(), 0, term.length());
token.setPositionIncrement(positionIncrement);
return token;
}
/** blast some random strings through the analyzer */
public void testRandomStrings() throws Exception {
Analyzer a =
new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName) {
Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
return new TokenStreamComponents(tokenizer, new ShingleFilter(tokenizer));
}
};
checkRandomData(random(), a, 200 * RANDOM_MULTIPLIER);
a.close();
}
/** blast some random large strings through the analyzer */
public void testRandomHugeStrings() throws Exception {
Random random = random();
Analyzer a =
new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName) {
Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
return new TokenStreamComponents(tokenizer, new ShingleFilter(tokenizer));
}
};
checkRandomData(random, a, 3 * RANDOM_MULTIPLIER, 8192);
a.close();
}
public void testEmptyTerm() throws IOException {
Analyzer a =
new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName) {
Tokenizer tokenizer = new KeywordTokenizer();
return new TokenStreamComponents(tokenizer, new ShingleFilter(tokenizer));
}
};
checkOneTerm(a, "", "");
a.close();
}
public void testTrailingHole1() throws IOException {
// Analyzing "wizard of", where of is removed as a
// stopword leaving a trailing hole:
Token[] inputTokens = new Token[] {createToken("wizard", 0, 6)};
ShingleFilter filter = new ShingleFilter(new CannedTokenStream(1, 9, inputTokens), 2, 2);
assertTokenStreamContents(
filter,
new String[] {"wizard", "wizard _"},
new int[] {0, 0},
new int[] {6, 9},
new int[] {1, 0},
9);
}
public void testTrailingHole2() throws IOException {
// Analyzing "purple wizard of", where of is removed as a
// stopword leaving a trailing hole:
Token[] inputTokens = new Token[] {createToken("purple", 0, 6), createToken("wizard", 7, 13)};
ShingleFilter filter = new ShingleFilter(new CannedTokenStream(1, 16, inputTokens), 2, 2);
assertTokenStreamContents(
filter,
new String[] {"purple", "purple wizard", "wizard", "wizard _"},
new int[] {0, 0, 7, 7},
new int[] {6, 13, 13, 16},
new int[] {1, 0, 1, 0},
16);
}
public void testTwoTrailingHoles() throws IOException {
// Analyzing "purple wizard of the", where of and the are removed as a
// stopwords, leaving two trailing holes:
Token[] inputTokens = new Token[] {createToken("purple", 0, 6), createToken("wizard", 7, 13)};
ShingleFilter filter = new ShingleFilter(new CannedTokenStream(2, 20, inputTokens), 2, 2);
assertTokenStreamContents(
filter,
new String[] {"purple", "purple wizard", "wizard", "wizard _"},
new int[] {0, 0, 7, 7},
new int[] {6, 13, 13, 20},
new int[] {1, 0, 1, 0},
20);
}
public void testTwoTrailingHolesTriShingle() throws IOException {
// Analyzing "purple wizard of the", where of and the are removed as a
// stopwords, leaving two trailing holes:
Token[] inputTokens = new Token[] {createToken("purple", 0, 6), createToken("wizard", 7, 13)};
ShingleFilter filter = new ShingleFilter(new CannedTokenStream(2, 20, inputTokens), 2, 3);
assertTokenStreamContents(
filter,
new String[] {
"purple", "purple wizard", "purple wizard _", "wizard", "wizard _", "wizard _ _"
},
new int[] {0, 0, 0, 7, 7, 7},
new int[] {6, 13, 20, 13, 20, 20},
new int[] {1, 0, 0, 1, 0, 0},
20);
}
public void testTwoTrailingHolesTriShingleWithTokenFiller() throws IOException {
// Analyzing "purple wizard of the", where of and the are removed as a
// stopwords, leaving two trailing holes:
Token[] inputTokens = new Token[] {createToken("purple", 0, 6), createToken("wizard", 7, 13)};
ShingleFilter filter = new ShingleFilter(new CannedTokenStream(2, 20, inputTokens), 2, 3);
filter.setFillerToken("--");
assertTokenStreamContents(
filter,
new String[] {
"purple", "purple wizard", "purple wizard --", "wizard", "wizard --", "wizard -- --"
},
new int[] {0, 0, 0, 7, 7, 7},
new int[] {6, 13, 20, 13, 20, 20},
new int[] {1, 0, 0, 1, 0, 0},
20);
filter = new ShingleFilter(new CannedTokenStream(2, 20, inputTokens), 2, 3);
filter.setFillerToken("");
assertTokenStreamContents(
filter,
new String[] {"purple", "purple wizard", "purple wizard ", "wizard", "wizard ", "wizard "},
new int[] {0, 0, 0, 7, 7, 7},
new int[] {6, 13, 20, 13, 20, 20},
new int[] {1, 0, 0, 1, 0, 0},
20);
filter = new ShingleFilter(new CannedTokenStream(2, 20, inputTokens), 2, 3);
filter.setFillerToken(null);
assertTokenStreamContents(
filter,
new String[] {"purple", "purple wizard", "purple wizard ", "wizard", "wizard ", "wizard "},
new int[] {0, 0, 0, 7, 7, 7},
new int[] {6, 13, 20, 13, 20, 20},
new int[] {1, 0, 0, 1, 0, 0},
20);
filter = new ShingleFilter(new CannedTokenStream(2, 20, inputTokens), 2, 3);
filter.setFillerToken(null);
filter.setTokenSeparator(null);
assertTokenStreamContents(
filter,
new String[] {"purple", "purplewizard", "purplewizard", "wizard", "wizard", "wizard"},
new int[] {0, 0, 0, 7, 7, 7},
new int[] {6, 13, 20, 13, 20, 20},
new int[] {1, 0, 0, 1, 0, 0},
20);
}
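// With outputUnigrams disabled, the smallest shingle at each position is emitted with a
// position length of 1 and each larger shingle at that position adds 1, as the expected
// posLength arrays below show.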
public void testPositionLength() throws Exception {
Analyzer a =
new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName) {
Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
ShingleFilter filter = new ShingleFilter(tokenizer, 4, 4);
filter.setOutputUnigrams(false);
return new TokenStreamComponents(tokenizer, filter);
}
};
assertTokenStreamContents(
a.tokenStream("", "to be or not to be"),
new String[] {"to be or not", "be or not to", "or not to be"},
new int[] {0, 3, 6},
new int[] {12, 15, 18},
null,
new int[] {1, 1, 1},
new int[] {1, 1, 1},
18,
// offsets are correct, but assertTokenStreamContents does not handle multiple terms with
// different offsets finishing at the same position
false);
a =
new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName) {
Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
ShingleFilter filter = new ShingleFilter(tokenizer, 2, 4);
filter.setOutputUnigrams(false);
return new TokenStreamComponents(tokenizer, filter);
}
};
assertTokenStreamContents(
a.tokenStream("", "to be or not to be"),
new String[] {
"to be",
"to be or",
"to be or not",
"be or",
"be or not",
"be or not to",
"or not",
"or not to",
"or not to be",
"not to",
"not to be",
"to be"
},
new int[] {0, 0, 0, 3, 3, 3, 6, 6, 6, 9, 9, 13},
new int[] {5, 8, 12, 8, 12, 15, 12, 15, 18, 15, 18, 18},
null,
new int[] {1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1},
new int[] {1, 2, 3, 1, 2, 3, 1, 2, 3, 1, 2, 1},
18,
// offsets are correct, but assertTokenStreamContents does not handle multiple terms with
// different offsets finishing at the same position
false);
a =
new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName) {
Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
ShingleFilter filter = new ShingleFilter(tokenizer, 3, 4);
filter.setOutputUnigrams(false);
return new TokenStreamComponents(tokenizer, filter);
}
};
assertTokenStreamContents(
a.tokenStream("", "to be or not to be"),
new String[] {
"to be or",
"to be or not",
"be or not",
"be or not to",
"or not to",
"or not to be",
"not to be"
},
new int[] {0, 0, 3, 3, 6, 6, 9},
new int[] {8, 12, 12, 15, 15, 18, 18},
null,
new int[] {1, 0, 1, 0, 1, 0, 1},
new int[] {1, 2, 1, 2, 1, 2, 1},
18,
// offsets are correct, but assertTokenStreamContents does not handle multiple terms with
// different offsets finishing at the same position
false);
a =
new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName) {
Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
ShingleFilter filter = new ShingleFilter(tokenizer, 3, 5);
filter.setOutputUnigrams(false);
return new TokenStreamComponents(tokenizer, filter);
}
};
assertTokenStreamContents(
a.tokenStream("", "to be or not to be"),
new String[] {
"to be or",
"to be or not",
"to be or not to",
"be or not",
"be or not to",
"be or not to be",
"or not to",
"or not to be",
"not to be"
},
new int[] {0, 0, 0, 3, 3, 3, 6, 6, 9},
new int[] {8, 12, 15, 12, 15, 18, 15, 18, 18},
null,
new int[] {1, 0, 0, 1, 0, 0, 1, 0, 1},
new int[] {1, 2, 3, 1, 2, 3, 1, 2, 1},
18,
// offsets are correct, but assertTokenStreamContents does not handle multiple terms with
// different offsets finishing at the same position
false);
}
}