| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| package org.apache.lucene.analysis.miscellaneous; |
| |
| import java.util.Collections; |
| import java.util.Set; |
| import org.apache.lucene.analysis.BaseTokenStreamTestCase; |
| import org.apache.lucene.analysis.CannedTokenStream; |
| import org.apache.lucene.analysis.Token; |
| import org.apache.lucene.analysis.TokenStream; |
| |
| /** |
| * Test that this filter moves the value in type to a synonym token with the same offsets. This is |
| * rarely useful by itself, but in combination with another filter that updates the type value with |
| * an appropriate synonym can be used to identify synonyms before tokens are modified by further |
| * analysis, and then add them at the end, ensuring that the synonym value has not been subjected to |
| * the intervening analysis. This typically applies when the analysis would remove characters that |
| * should remain in the synonym. |
| */ |
| public class TestTypeAsSynonymFilter extends BaseTokenStreamTestCase { |
| |
| /** |
| * Test the straight forward case with the simplest constructor. Simply converts every type to a |
| * synonym. Typically one wants to also set an ignore list containing "word" unless that default |
| * value is removed by prior analysis. |
| */ |
| public void testSimple() throws Exception { |
| |
| Token token = new Token("foo", 0, 2); |
| token.setType("bar"); |
| Token token2 = new Token("foo", 4, 6); |
| token2.setFlags(5); |
| TokenStream ts = new CannedTokenStream(token, token2); |
| ts = new TypeAsSynonymFilter(ts); |
| |
| // "word" is the default type! |
| assertTokenStreamContents( |
| ts, |
| new String[] {"foo", "bar", "foo", "word"}, |
| new int[] {0, 0, 4, 4}, |
| new int[] {2, 2, 6, 6}, |
| null, // not testing types |
| new int[] {1, 0, 1, 0}, |
| null, // positions |
| // final values, keywords, graph, payloads not tested here |
| null, |
| null, |
| null, |
| false, |
| null, |
| // ensure basic case continues to copy flags to synonym by default for back compat |
| new int[] {0, 0, 5, 5}); |
| } |
| |
| /** |
| * Tests that we can add a prefix to the synonym (for example, to keep it from ever matching user |
| * input directly), and test that we can ignore a list of type values we don't wish to turn into |
| * synonyms. |
| */ |
| public void testWithPrefixAndIgnore() throws Exception { |
| Token[] tokens = |
| new Token[] { |
| new Token("foo", 1, 3), new Token("foo", 5, 7), new Token("foo", 9, 11), |
| }; |
| tokens[0].setType("bar"); |
| tokens[2].setType("ignoreme"); |
| TokenStream ts = new CannedTokenStream(tokens); |
| ts = new TypeAsSynonymFilter(ts, "pfx_", Set.of("word", "ignoreme"), 0); |
| |
| assertTokenStreamContents( |
| ts, |
| new String[] {"foo", "pfx_bar", "foo", "foo"}, |
| new int[] {1, 1, 5, 9}, |
| new int[] {3, 3, 7, 11}, |
| new int[] {1, 0, 1, 1}); |
| } |
| |
| /** |
| * Analysis chains that make use of flags may or may not want flags transferred to the synonym to |
| * be created. This tests the mask that can be used to control which flag bits are transferred. |
| */ |
| public void testFlagMask() throws Exception { |
| |
| Token token = new Token("foo", 0, 2); |
| token.setType("bar"); |
| token.setFlags(7); |
| Token token2 = new Token("foo", 4, 6); |
| TokenStream ts = new CannedTokenStream(token, token2); |
| |
| ts = new TypeAsSynonymFilter(ts, "", Collections.emptySet(), 5); |
| |
| // "word" is the default type! |
| assertTokenStreamContents( |
| ts, |
| new String[] {"foo", "bar", "foo", "word"}, |
| new int[] {0, 0, 4, 4}, |
| new int[] {2, 2, 6, 6}, |
| null, // not testing types |
| null, |
| null, // positions tested above |
| // final values, keywords, graph, payloads not tested here |
| null, |
| null, |
| null, |
| false, |
| null, |
| new int[] {7, 5, 0, 0}); |
| } |
| } |