| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| package org.apache.lucene.analysis.compound; |
| |
| |
| import java.io.IOException; |
| import java.io.Reader; |
| import java.io.StringReader; |
| import java.util.Arrays; |
| |
| import org.apache.lucene.analysis.Analyzer; |
| import org.apache.lucene.analysis.BaseTokenStreamTestCase; |
| import org.apache.lucene.analysis.CharArraySet; |
| import org.apache.lucene.analysis.MockTokenizer; |
| import org.apache.lucene.analysis.TokenFilter; |
| import org.apache.lucene.analysis.TokenStream; |
| import org.apache.lucene.analysis.Tokenizer; |
| import org.apache.lucene.analysis.charfilter.MappingCharFilter; |
| import org.apache.lucene.analysis.charfilter.NormalizeCharMap; |
| import org.apache.lucene.analysis.compound.hyphenation.HyphenationTree; |
| import org.apache.lucene.analysis.core.KeywordTokenizer; |
| import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; |
| import org.apache.lucene.util.Attribute; |
| import org.apache.lucene.util.AttributeImpl; |
| import org.apache.lucene.util.AttributeReflector; |
| import org.xml.sax.InputSource; |
| |
| public class TestCompoundWordTokenFilter extends BaseTokenStreamTestCase { |
| |
| private static CharArraySet makeDictionary(String... dictionary) { |
| return new CharArraySet(Arrays.asList(dictionary), true); |
| } |
| |
| public void testHyphenationCompoundWordsDA() throws Exception { |
| CharArraySet dict = makeDictionary("læse", "hest"); |
| |
| InputSource is = new InputSource(getClass().getResource("da_UTF8.xml").toExternalForm()); |
| HyphenationTree hyphenator = HyphenationCompoundWordTokenFilter |
| .getHyphenationTree(is); |
| |
| HyphenationCompoundWordTokenFilter tf = new HyphenationCompoundWordTokenFilter( |
| whitespaceMockTokenizer("min veninde som er lidt af en læsehest"), |
| hyphenator, |
| dict, CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE, |
| CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE, |
| CompoundWordTokenFilterBase.DEFAULT_MAX_SUBWORD_SIZE, false); |
| assertTokenStreamContents(tf, |
| new String[] { "min", "veninde", "som", "er", "lidt", "af", "en", "læsehest", "læse", "hest" }, |
| new int[] { 1, 1, 1, 1, 1, 1, 1, 1, 0, 0 } |
| ); |
| } |
| |
| public void testHyphenationCompoundWordsDELongestMatch() throws Exception { |
| CharArraySet dict = makeDictionary("basketball", "basket", "ball", "kurv"); |
| |
| InputSource is = new InputSource(getClass().getResource("da_UTF8.xml").toExternalForm()); |
| HyphenationTree hyphenator = HyphenationCompoundWordTokenFilter |
| .getHyphenationTree(is); |
| |
| // the word basket will not be added due to the longest match option |
| HyphenationCompoundWordTokenFilter tf = new HyphenationCompoundWordTokenFilter( |
| whitespaceMockTokenizer("basketballkurv"), |
| hyphenator, dict, |
| CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE, |
| CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE, 40, true); |
| assertTokenStreamContents(tf, |
| new String[] { "basketballkurv", "basketball", "ball", "kurv" }, |
| new int[] { 1, 0, 0, 0 } |
| ); |
| |
| } |
| |
| /** |
| * With hyphenation-only, you can get a lot of nonsense tokens. |
| * This can be controlled with the min/max subword size. |
| */ |
| public void testHyphenationOnly() throws Exception { |
| InputSource is = new InputSource(getClass().getResource("da_UTF8.xml").toExternalForm()); |
| HyphenationTree hyphenator = HyphenationCompoundWordTokenFilter |
| .getHyphenationTree(is); |
| |
| HyphenationCompoundWordTokenFilter tf = new HyphenationCompoundWordTokenFilter( |
| |
| whitespaceMockTokenizer("basketballkurv"), |
| hyphenator, |
| CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE, |
| 2, 4); |
| |
| // min=2, max=4 |
| assertTokenStreamContents(tf, |
| new String[] { "basketballkurv", "ba", "sket", "bal", "ball", "kurv" } |
| ); |
| |
| tf = new HyphenationCompoundWordTokenFilter( |
| |
| whitespaceMockTokenizer("basketballkurv"), |
| hyphenator, |
| CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE, |
| 4, 6); |
| |
| // min=4, max=6 |
| assertTokenStreamContents(tf, |
| new String[] { "basketballkurv", "basket", "sket", "ball", "lkurv", "kurv" } |
| ); |
| |
| tf = new HyphenationCompoundWordTokenFilter( |
| |
| whitespaceMockTokenizer("basketballkurv"), |
| hyphenator, |
| CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE, |
| 4, 10); |
| |
| // min=4, max=10 |
| assertTokenStreamContents(tf, |
| new String[] { "basketballkurv", "basket", "basketbal", "basketball", "sket", |
| "sketbal", "sketball", "ball", "ballkurv", "lkurv", "kurv" } |
| ); |
| |
| } |
| |
| public void testDumbCompoundWordsSE() throws Exception { |
| CharArraySet dict = makeDictionary("Bil", "Dörr", "Motor", "Tak", "Borr", "Slag", "Hammar", |
| "Pelar", "Glas", "Ögon", "Fodral", "Bas", "Fiol", "Makare", "Gesäll", |
| "Sko", "Vind", "Rute", "Torkare", "Blad"); |
| |
| DictionaryCompoundWordTokenFilter tf = new DictionaryCompoundWordTokenFilter( |
| whitespaceMockTokenizer( |
| "Bildörr Bilmotor Biltak Slagborr Hammarborr Pelarborr Glasögonfodral Basfiolsfodral Basfiolsfodralmakaregesäll Skomakare Vindrutetorkare Vindrutetorkarblad abba"), |
| dict); |
| |
| assertTokenStreamContents(tf, new String[] { "Bildörr", "Bil", "dörr", "Bilmotor", |
| "Bil", "motor", "Biltak", "Bil", "tak", "Slagborr", "Slag", "borr", |
| "Hammarborr", "Hammar", "borr", "Pelarborr", "Pelar", "borr", |
| "Glasögonfodral", "Glas", "ögon", "fodral", "Basfiolsfodral", "Bas", |
| "fiol", "fodral", "Basfiolsfodralmakaregesäll", "Bas", "fiol", |
| "fodral", "makare", "gesäll", "Skomakare", "Sko", "makare", |
| "Vindrutetorkare", "Vind", "rute", "torkare", "Vindrutetorkarblad", |
| "Vind", "rute", "blad", "abba" }, new int[] { 0, 0, 0, 8, 8, 8, 17, |
| 17, 17, 24, 24, 24, 33, 33, 33, 44, 44, 44, 54, 54, 54, 54, 69, 69, 69, |
| 69, 84, 84, 84, 84, 84, 84, 111, 111, 111, 121, 121, 121, 121, 137, |
| 137, 137, 137, 156 }, new int[] { 7, 7, 7, 16, 16, 16, 23, 23, 23, 32, |
| 32, 32, 43, 43, 43, 53, 53, 53, 68, 68, 68, 68, 83, 83, 83, 83, 110, |
| 110, 110, 110, 110, 110, 120, 120, 120, 136, 136, 136, 136, 155, 155, 155, |
| 155, 160 }, new int[] { 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, |
| 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, |
| 0, 0, 0, 1 }); |
| } |
| |
| public void testDumbCompoundWordsSELongestMatch() throws Exception { |
| CharArraySet dict = makeDictionary("Bil", "Dörr", "Motor", "Tak", "Borr", "Slag", "Hammar", |
| "Pelar", "Glas", "Ögon", "Fodral", "Bas", "Fiols", "Makare", "Gesäll", |
| "Sko", "Vind", "Rute", "Torkare", "Blad", "Fiolsfodral"); |
| |
| DictionaryCompoundWordTokenFilter tf = new DictionaryCompoundWordTokenFilter( |
| whitespaceMockTokenizer("Basfiolsfodralmakaregesäll"), |
| dict, CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE, |
| CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE, |
| CompoundWordTokenFilterBase.DEFAULT_MAX_SUBWORD_SIZE, true); |
| |
| assertTokenStreamContents(tf, new String[] { "Basfiolsfodralmakaregesäll", "Bas", |
| "fiolsfodral", "fodral", "makare", "gesäll" }, new int[] { 0, 0, 0, 0, |
| 0, 0 }, new int[] { 26, 26, 26, 26, 26, 26 }, new int[] { 1, 0, 0, 0, |
| 0, 0 }); |
| } |
| |
| public void testTokenEndingWithWordComponentOfMinimumLength() throws Exception { |
| CharArraySet dict = makeDictionary("ab", "cd", "ef"); |
| |
| Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false); |
| tokenizer.setReader(new StringReader("abcdef")); |
| DictionaryCompoundWordTokenFilter tf = new DictionaryCompoundWordTokenFilter( |
| tokenizer, |
| dict, |
| CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE, |
| CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE, |
| CompoundWordTokenFilterBase.DEFAULT_MAX_SUBWORD_SIZE, false); |
| |
| assertTokenStreamContents(tf, |
| new String[] { "abcdef", "ab", "cd", "ef" }, |
| new int[] { 0, 0, 0, 0}, |
| new int[] { 6, 6, 6, 6}, |
| new int[] { 1, 0, 0, 0} |
| ); |
| } |
| |
| public void testWordComponentWithLessThanMinimumLength() throws Exception { |
| CharArraySet dict = makeDictionary("abc", "d", "efg"); |
| |
| Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false); |
| tokenizer.setReader(new StringReader("abcdefg")); |
| DictionaryCompoundWordTokenFilter tf = new DictionaryCompoundWordTokenFilter( |
| tokenizer, |
| dict, |
| CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE, |
| CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE, |
| CompoundWordTokenFilterBase.DEFAULT_MAX_SUBWORD_SIZE, false); |
| |
| // since "d" is shorter than the minimum subword size, it should not be added to the token stream |
| assertTokenStreamContents(tf, |
| new String[] { "abcdefg", "abc", "efg" }, |
| new int[] { 0, 0, 0}, |
| new int[] { 7, 7, 7}, |
| new int[] { 1, 0, 0} |
| ); |
| } |
| |
| public void testReset() throws Exception { |
| CharArraySet dict = makeDictionary("Rind", "Fleisch", "Draht", "Schere", "Gesetz", |
| "Aufgabe", "Überwachung"); |
| |
| MockTokenizer wsTokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false); |
| wsTokenizer.setEnableChecks(false); // we will reset in a strange place |
| wsTokenizer.setReader(new StringReader("Rindfleischüberwachungsgesetz")); |
| DictionaryCompoundWordTokenFilter tf = new DictionaryCompoundWordTokenFilter( |
| wsTokenizer, dict, |
| CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE, |
| CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE, |
| CompoundWordTokenFilterBase.DEFAULT_MAX_SUBWORD_SIZE, false); |
| |
| CharTermAttribute termAtt = tf.getAttribute(CharTermAttribute.class); |
| tf.reset(); |
| assertTrue(tf.incrementToken()); |
| assertEquals("Rindfleischüberwachungsgesetz", termAtt.toString()); |
| assertTrue(tf.incrementToken()); |
| assertEquals("Rind", termAtt.toString()); |
| tf.end(); |
| tf.close(); |
| wsTokenizer.setReader(new StringReader("Rindfleischüberwachungsgesetz")); |
| tf.reset(); |
| assertTrue(tf.incrementToken()); |
| assertEquals("Rindfleischüberwachungsgesetz", termAtt.toString()); |
| } |
| |
| public void testRetainMockAttribute() throws Exception { |
| CharArraySet dict = makeDictionary("abc", "d", "efg"); |
| Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false); |
| tokenizer.setReader(new StringReader("abcdefg")); |
| TokenStream stream = new MockRetainAttributeFilter(tokenizer); |
| stream = new DictionaryCompoundWordTokenFilter( |
| stream, dict, |
| CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE, |
| CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE, |
| CompoundWordTokenFilterBase.DEFAULT_MAX_SUBWORD_SIZE, false); |
| MockRetainAttribute retAtt = stream.addAttribute(MockRetainAttribute.class); |
| stream.reset(); |
| while (stream.incrementToken()) { |
| assertTrue("Custom attribute value was lost", retAtt.getRetain()); |
| } |
| |
| } |
| |
| public void testLucene8124() throws Exception { |
| InputSource is = new InputSource(getClass().getResource("hyphenation-LUCENE-8124.xml").toExternalForm()); |
| HyphenationTree hyphenator = HyphenationCompoundWordTokenFilter |
| .getHyphenationTree(is); |
| |
| HyphenationCompoundWordTokenFilter tf = new HyphenationCompoundWordTokenFilter( |
| whitespaceMockTokenizer( |
| "Rindfleisch"), |
| hyphenator); |
| |
| // TODO Rindfleisch returned twice is another issue of the HyphenationCompoundTokenFilter |
| assertTokenStreamContents(tf, new String[] { "Rindfleisch", "Rind", "Rindfleisch", "fleisch"}); |
| } |
| |
| |
| public static interface MockRetainAttribute extends Attribute { |
| void setRetain(boolean attr); |
| boolean getRetain(); |
| } |
| |
| public static final class MockRetainAttributeImpl extends AttributeImpl implements MockRetainAttribute { |
| private boolean retain = false; |
| |
| @Override |
| public void clear() { |
| retain = false; |
| } |
| |
| @Override |
| public boolean getRetain() { |
| return retain; |
| } |
| |
| @Override |
| public void setRetain(boolean retain) { |
| this.retain = retain; |
| } |
| |
| @Override |
| public void copyTo(AttributeImpl target) { |
| MockRetainAttribute t = (MockRetainAttribute) target; |
| t.setRetain(retain); |
| } |
| |
| @Override |
| public void reflectWith(AttributeReflector reflector) { |
| reflector.reflect(MockRetainAttribute.class, "retain", retain); |
| } |
| } |
| |
| private static class MockRetainAttributeFilter extends TokenFilter { |
| |
| MockRetainAttribute retainAtt = addAttribute(MockRetainAttribute.class); |
| |
| MockRetainAttributeFilter(TokenStream input) { |
| super(input); |
| } |
| |
| @Override |
| public boolean incrementToken() throws IOException { |
| if (input.incrementToken()){ |
| retainAtt.setRetain(true); |
| return true; |
| } else { |
| return false; |
| } |
| } |
| } |
| |
| // SOLR-2891 |
| // *CompoundWordTokenFilter blindly adds term length to offset, but this can take things out of bounds |
| // wrt original text if a previous filter increases the length of the word (in this case ü -> ue) |
| // so in this case we behave like WDF, and preserve any modified offsets |
| public void testInvalidOffsets() throws Exception { |
| final CharArraySet dict = makeDictionary("fall"); |
| final NormalizeCharMap.Builder builder = new NormalizeCharMap.Builder(); |
| builder.add("ü", "ue"); |
| final NormalizeCharMap normMap = builder.build(); |
| |
| Analyzer analyzer = new Analyzer() { |
| |
| @Override |
| protected TokenStreamComponents createComponents(String fieldName) { |
| Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false); |
| TokenFilter filter = new DictionaryCompoundWordTokenFilter(tokenizer, dict); |
| return new TokenStreamComponents(tokenizer, filter); |
| } |
| |
| @Override |
| protected Reader initReader(String fieldName, Reader reader) { |
| return new MappingCharFilter(normMap, reader); |
| } |
| }; |
| |
| assertAnalyzesTo(analyzer, "banküberfall", |
| new String[] { "bankueberfall", "fall" }, |
| new int[] { 0, 0 }, |
| new int[] { 12, 12 }); |
| analyzer.close(); |
| } |
| |
| /** blast some random strings through the analyzer */ |
| public void testRandomStrings() throws Exception { |
| final CharArraySet dict = makeDictionary("a", "e", "i", "o", "u", "y", "bc", "def"); |
| Analyzer a = new Analyzer() { |
| |
| @Override |
| protected TokenStreamComponents createComponents(String fieldName) { |
| Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false); |
| return new TokenStreamComponents(tokenizer, new DictionaryCompoundWordTokenFilter(tokenizer, dict)); |
| } |
| }; |
| checkRandomData(random(), a, 200 * RANDOM_MULTIPLIER); |
| a.close(); |
| |
| InputSource is = new InputSource(getClass().getResource("da_UTF8.xml").toExternalForm()); |
| final HyphenationTree hyphenator = HyphenationCompoundWordTokenFilter.getHyphenationTree(is); |
| Analyzer b = new Analyzer() { |
| |
| @Override |
| protected TokenStreamComponents createComponents(String fieldName) { |
| Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false); |
| TokenFilter filter = new HyphenationCompoundWordTokenFilter(tokenizer, hyphenator); |
| return new TokenStreamComponents(tokenizer, filter); |
| } |
| }; |
| checkRandomData(random(), b, 200 * RANDOM_MULTIPLIER); |
| b.close(); |
| } |
| |
| public void testEmptyTerm() throws Exception { |
| final CharArraySet dict = makeDictionary("a", "e", "i", "o", "u", "y", "bc", "def"); |
| Analyzer a = new Analyzer() { |
| |
| @Override |
| protected TokenStreamComponents createComponents(String fieldName) { |
| Tokenizer tokenizer = new KeywordTokenizer(); |
| return new TokenStreamComponents(tokenizer, new DictionaryCompoundWordTokenFilter(tokenizer, dict)); |
| } |
| }; |
| checkOneTerm(a, "", ""); |
| a.close(); |
| |
| InputSource is = new InputSource(getClass().getResource("da_UTF8.xml").toExternalForm()); |
| final HyphenationTree hyphenator = HyphenationCompoundWordTokenFilter.getHyphenationTree(is); |
| Analyzer b = new Analyzer() { |
| |
| @Override |
| protected TokenStreamComponents createComponents(String fieldName) { |
| Tokenizer tokenizer = new KeywordTokenizer(); |
| TokenFilter filter = new HyphenationCompoundWordTokenFilter(tokenizer, hyphenator); |
| return new TokenStreamComponents(tokenizer, filter); |
| } |
| }; |
| checkOneTerm(b, "", ""); |
| b.close(); |
| } |
| } |