| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| package org.apache.lucene.analysis.compound; |
| |
| import java.io.IOException; |
| import java.io.Reader; |
| import java.io.StringReader; |
| import java.util.Arrays; |
| import org.apache.lucene.analysis.Analyzer; |
| import org.apache.lucene.analysis.BaseTokenStreamTestCase; |
| import org.apache.lucene.analysis.CharArraySet; |
| import org.apache.lucene.analysis.MockTokenizer; |
| import org.apache.lucene.analysis.TokenFilter; |
| import org.apache.lucene.analysis.TokenStream; |
| import org.apache.lucene.analysis.Tokenizer; |
| import org.apache.lucene.analysis.charfilter.MappingCharFilter; |
| import org.apache.lucene.analysis.charfilter.NormalizeCharMap; |
| import org.apache.lucene.analysis.compound.hyphenation.HyphenationTree; |
| import org.apache.lucene.analysis.core.KeywordTokenizer; |
| import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; |
| import org.apache.lucene.util.Attribute; |
| import org.apache.lucene.util.AttributeImpl; |
| import org.apache.lucene.util.AttributeReflector; |
| import org.xml.sax.InputSource; |
| |
| public class TestCompoundWordTokenFilter extends BaseTokenStreamTestCase { |
| |
| private static CharArraySet makeDictionary(String... dictionary) { |
| return new CharArraySet(Arrays.asList(dictionary), true); |
| } |
| |
| public void testHyphenationCompoundWordsDA() throws Exception { |
| CharArraySet dict = makeDictionary("læse", "hest"); |
| |
| InputSource is = new InputSource(getClass().getResource("da_UTF8.xml").toExternalForm()); |
| HyphenationTree hyphenator = HyphenationCompoundWordTokenFilter.getHyphenationTree(is); |
| |
| HyphenationCompoundWordTokenFilter tf = |
| new HyphenationCompoundWordTokenFilter( |
| whitespaceMockTokenizer("min veninde som er lidt af en læsehest"), |
| hyphenator, |
| dict, |
| CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE, |
| CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE, |
| CompoundWordTokenFilterBase.DEFAULT_MAX_SUBWORD_SIZE, |
| false); |
| assertTokenStreamContents( |
| tf, |
| new String[] { |
| "min", "veninde", "som", "er", "lidt", "af", "en", "læsehest", "læse", "hest" |
| }, |
| new int[] {1, 1, 1, 1, 1, 1, 1, 1, 0, 0}); |
| } |
| |
| public void testHyphenationCompoundWordsDELongestMatch() throws Exception { |
| CharArraySet dict = makeDictionary("basketball", "basket", "ball", "kurv"); |
| |
| InputSource is = new InputSource(getClass().getResource("da_UTF8.xml").toExternalForm()); |
| HyphenationTree hyphenator = HyphenationCompoundWordTokenFilter.getHyphenationTree(is); |
| |
| // the word basket will not be added due to the longest match option |
| HyphenationCompoundWordTokenFilter tf = |
| new HyphenationCompoundWordTokenFilter( |
| whitespaceMockTokenizer("basketballkurv"), |
| hyphenator, |
| dict, |
| CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE, |
| CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE, |
| 40, |
| true); |
| assertTokenStreamContents( |
| tf, new String[] {"basketballkurv", "basketball", "ball", "kurv"}, new int[] {1, 0, 0, 0}); |
| } |
| |
| /** |
| * With hyphenation-only, you can get a lot of nonsense tokens. This can be controlled with the |
| * min/max subword size. |
| */ |
| public void testHyphenationOnly() throws Exception { |
| InputSource is = new InputSource(getClass().getResource("da_UTF8.xml").toExternalForm()); |
| HyphenationTree hyphenator = HyphenationCompoundWordTokenFilter.getHyphenationTree(is); |
| |
| HyphenationCompoundWordTokenFilter tf = |
| new HyphenationCompoundWordTokenFilter( |
| whitespaceMockTokenizer("basketballkurv"), |
| hyphenator, |
| CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE, |
| 2, |
| 4); |
| |
| // min=2, max=4 |
| assertTokenStreamContents( |
| tf, new String[] {"basketballkurv", "ba", "sket", "bal", "ball", "kurv"}); |
| |
| tf = |
| new HyphenationCompoundWordTokenFilter( |
| whitespaceMockTokenizer("basketballkurv"), |
| hyphenator, |
| CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE, |
| 4, |
| 6); |
| |
| // min=4, max=6 |
| assertTokenStreamContents( |
| tf, new String[] {"basketballkurv", "basket", "sket", "ball", "lkurv", "kurv"}); |
| |
| tf = |
| new HyphenationCompoundWordTokenFilter( |
| whitespaceMockTokenizer("basketballkurv"), |
| hyphenator, |
| CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE, |
| 4, |
| 10); |
| |
| // min=4, max=10 |
| assertTokenStreamContents( |
| tf, |
| new String[] { |
| "basketballkurv", |
| "basket", |
| "basketbal", |
| "basketball", |
| "sket", |
| "sketbal", |
| "sketball", |
| "ball", |
| "ballkurv", |
| "lkurv", |
| "kurv" |
| }); |
| } |
| |
| public void testDumbCompoundWordsSE() throws Exception { |
| CharArraySet dict = |
| makeDictionary( |
| "Bil", "Dörr", "Motor", "Tak", "Borr", "Slag", "Hammar", "Pelar", "Glas", "Ögon", |
| "Fodral", "Bas", "Fiol", "Makare", "Gesäll", "Sko", "Vind", "Rute", "Torkare", "Blad"); |
| |
| DictionaryCompoundWordTokenFilter tf = |
| new DictionaryCompoundWordTokenFilter( |
| whitespaceMockTokenizer( |
| "Bildörr Bilmotor Biltak Slagborr Hammarborr Pelarborr Glasögonfodral Basfiolsfodral Basfiolsfodralmakaregesäll Skomakare Vindrutetorkare Vindrutetorkarblad abba"), |
| dict); |
| |
| assertTokenStreamContents( |
| tf, |
| new String[] { |
| "Bildörr", |
| "Bil", |
| "dörr", |
| "Bilmotor", |
| "Bil", |
| "motor", |
| "Biltak", |
| "Bil", |
| "tak", |
| "Slagborr", |
| "Slag", |
| "borr", |
| "Hammarborr", |
| "Hammar", |
| "borr", |
| "Pelarborr", |
| "Pelar", |
| "borr", |
| "Glasögonfodral", |
| "Glas", |
| "ögon", |
| "fodral", |
| "Basfiolsfodral", |
| "Bas", |
| "fiol", |
| "fodral", |
| "Basfiolsfodralmakaregesäll", |
| "Bas", |
| "fiol", |
| "fodral", |
| "makare", |
| "gesäll", |
| "Skomakare", |
| "Sko", |
| "makare", |
| "Vindrutetorkare", |
| "Vind", |
| "rute", |
| "torkare", |
| "Vindrutetorkarblad", |
| "Vind", |
| "rute", |
| "blad", |
| "abba" |
| }, |
| new int[] { |
| 0, 0, 0, 8, 8, 8, 17, 17, 17, 24, 24, 24, 33, 33, 33, 44, 44, 44, 54, 54, 54, 54, 69, 69, |
| 69, 69, 84, 84, 84, 84, 84, 84, 111, 111, 111, 121, 121, 121, 121, 137, 137, 137, 137, 156 |
| }, |
| new int[] { |
| 7, 7, 7, 16, 16, 16, 23, 23, 23, 32, 32, 32, 43, 43, 43, 53, 53, 53, 68, 68, 68, 68, 83, |
| 83, 83, 83, 110, 110, 110, 110, 110, 110, 120, 120, 120, 136, 136, 136, 136, 155, 155, |
| 155, 155, 160 |
| }, |
| new int[] { |
| 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, |
| 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1 |
| }); |
| } |
| |
| public void testDumbCompoundWordsSELongestMatch() throws Exception { |
| CharArraySet dict = |
| makeDictionary( |
| "Bil", |
| "Dörr", |
| "Motor", |
| "Tak", |
| "Borr", |
| "Slag", |
| "Hammar", |
| "Pelar", |
| "Glas", |
| "Ögon", |
| "Fodral", |
| "Bas", |
| "Fiols", |
| "Makare", |
| "Gesäll", |
| "Sko", |
| "Vind", |
| "Rute", |
| "Torkare", |
| "Blad", |
| "Fiolsfodral"); |
| |
| DictionaryCompoundWordTokenFilter tf = |
| new DictionaryCompoundWordTokenFilter( |
| whitespaceMockTokenizer("Basfiolsfodralmakaregesäll"), |
| dict, |
| CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE, |
| CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE, |
| CompoundWordTokenFilterBase.DEFAULT_MAX_SUBWORD_SIZE, |
| true); |
| |
| assertTokenStreamContents( |
| tf, |
| new String[] { |
| "Basfiolsfodralmakaregesäll", "Bas", "fiolsfodral", "fodral", "makare", "gesäll" |
| }, |
| new int[] {0, 0, 0, 0, 0, 0}, |
| new int[] {26, 26, 26, 26, 26, 26}, |
| new int[] {1, 0, 0, 0, 0, 0}); |
| } |
| |
| public void testTokenEndingWithWordComponentOfMinimumLength() throws Exception { |
| CharArraySet dict = makeDictionary("ab", "cd", "ef"); |
| |
| Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false); |
| tokenizer.setReader(new StringReader("abcdef")); |
| DictionaryCompoundWordTokenFilter tf = |
| new DictionaryCompoundWordTokenFilter( |
| tokenizer, |
| dict, |
| CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE, |
| CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE, |
| CompoundWordTokenFilterBase.DEFAULT_MAX_SUBWORD_SIZE, |
| false); |
| |
| assertTokenStreamContents( |
| tf, |
| new String[] {"abcdef", "ab", "cd", "ef"}, |
| new int[] {0, 0, 0, 0}, |
| new int[] {6, 6, 6, 6}, |
| new int[] {1, 0, 0, 0}); |
| } |
| |
| public void testWordComponentWithLessThanMinimumLength() throws Exception { |
| CharArraySet dict = makeDictionary("abc", "d", "efg"); |
| |
| Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false); |
| tokenizer.setReader(new StringReader("abcdefg")); |
| DictionaryCompoundWordTokenFilter tf = |
| new DictionaryCompoundWordTokenFilter( |
| tokenizer, |
| dict, |
| CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE, |
| CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE, |
| CompoundWordTokenFilterBase.DEFAULT_MAX_SUBWORD_SIZE, |
| false); |
| |
| // since "d" is shorter than the minimum subword size, it should not be added to the token |
| // stream |
| assertTokenStreamContents( |
| tf, |
| new String[] {"abcdefg", "abc", "efg"}, |
| new int[] {0, 0, 0}, |
| new int[] {7, 7, 7}, |
| new int[] {1, 0, 0}); |
| } |
| |
| public void testReset() throws Exception { |
| CharArraySet dict = |
| makeDictionary("Rind", "Fleisch", "Draht", "Schere", "Gesetz", "Aufgabe", "Überwachung"); |
| |
| MockTokenizer wsTokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false); |
| wsTokenizer.setEnableChecks(false); // we will reset in a strange place |
| wsTokenizer.setReader(new StringReader("Rindfleischüberwachungsgesetz")); |
| DictionaryCompoundWordTokenFilter tf = |
| new DictionaryCompoundWordTokenFilter( |
| wsTokenizer, |
| dict, |
| CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE, |
| CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE, |
| CompoundWordTokenFilterBase.DEFAULT_MAX_SUBWORD_SIZE, |
| false); |
| |
| CharTermAttribute termAtt = tf.getAttribute(CharTermAttribute.class); |
| tf.reset(); |
| assertTrue(tf.incrementToken()); |
| assertEquals("Rindfleischüberwachungsgesetz", termAtt.toString()); |
| assertTrue(tf.incrementToken()); |
| assertEquals("Rind", termAtt.toString()); |
| tf.end(); |
| tf.close(); |
| wsTokenizer.setReader(new StringReader("Rindfleischüberwachungsgesetz")); |
| tf.reset(); |
| assertTrue(tf.incrementToken()); |
| assertEquals("Rindfleischüberwachungsgesetz", termAtt.toString()); |
| } |
| |
| public void testRetainMockAttribute() throws Exception { |
| CharArraySet dict = makeDictionary("abc", "d", "efg"); |
| Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false); |
| tokenizer.setReader(new StringReader("abcdefg")); |
| TokenStream stream = new MockRetainAttributeFilter(tokenizer); |
| stream = |
| new DictionaryCompoundWordTokenFilter( |
| stream, |
| dict, |
| CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE, |
| CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE, |
| CompoundWordTokenFilterBase.DEFAULT_MAX_SUBWORD_SIZE, |
| false); |
| MockRetainAttribute retAtt = stream.addAttribute(MockRetainAttribute.class); |
| stream.reset(); |
| while (stream.incrementToken()) { |
| assertTrue("Custom attribute value was lost", retAtt.getRetain()); |
| } |
| } |
| |
| public void testLucene8124() throws Exception { |
| InputSource is = |
| new InputSource(getClass().getResource("hyphenation-LUCENE-8124.xml").toExternalForm()); |
| HyphenationTree hyphenator = HyphenationCompoundWordTokenFilter.getHyphenationTree(is); |
| |
| HyphenationCompoundWordTokenFilter tf = |
| new HyphenationCompoundWordTokenFilter(whitespaceMockTokenizer("Rindfleisch"), hyphenator); |
| |
| // TODO Rindfleisch returned twice is another issue of the HyphenationCompoundTokenFilter |
| assertTokenStreamContents(tf, new String[] {"Rindfleisch", "Rind", "Rindfleisch", "fleisch"}); |
| } |
| |
| public static interface MockRetainAttribute extends Attribute { |
| void setRetain(boolean attr); |
| |
| boolean getRetain(); |
| } |
| |
| public static final class MockRetainAttributeImpl extends AttributeImpl |
| implements MockRetainAttribute { |
| private boolean retain = false; |
| |
| @Override |
| public void clear() { |
| retain = false; |
| } |
| |
| @Override |
| public boolean getRetain() { |
| return retain; |
| } |
| |
| @Override |
| public void setRetain(boolean retain) { |
| this.retain = retain; |
| } |
| |
| @Override |
| public void copyTo(AttributeImpl target) { |
| MockRetainAttribute t = (MockRetainAttribute) target; |
| t.setRetain(retain); |
| } |
| |
| @Override |
| public void reflectWith(AttributeReflector reflector) { |
| reflector.reflect(MockRetainAttribute.class, "retain", retain); |
| } |
| } |
| |
| private static class MockRetainAttributeFilter extends TokenFilter { |
| |
| MockRetainAttribute retainAtt = addAttribute(MockRetainAttribute.class); |
| |
| MockRetainAttributeFilter(TokenStream input) { |
| super(input); |
| } |
| |
| @Override |
| public boolean incrementToken() throws IOException { |
| if (input.incrementToken()) { |
| retainAtt.setRetain(true); |
| return true; |
| } else { |
| return false; |
| } |
| } |
| } |
| |
| // SOLR-2891 |
| // *CompoundWordTokenFilter blindly adds term length to offset, but this can take things out of |
| // bounds |
| // wrt original text if a previous filter increases the length of the word (in this case ü -> ue) |
| // so in this case we behave like WDF, and preserve any modified offsets |
| public void testInvalidOffsets() throws Exception { |
| final CharArraySet dict = makeDictionary("fall"); |
| final NormalizeCharMap.Builder builder = new NormalizeCharMap.Builder(); |
| builder.add("ü", "ue"); |
| final NormalizeCharMap normMap = builder.build(); |
| |
| Analyzer analyzer = |
| new Analyzer() { |
| |
| @Override |
| protected TokenStreamComponents createComponents(String fieldName) { |
| Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false); |
| TokenFilter filter = new DictionaryCompoundWordTokenFilter(tokenizer, dict); |
| return new TokenStreamComponents(tokenizer, filter); |
| } |
| |
| @Override |
| protected Reader initReader(String fieldName, Reader reader) { |
| return new MappingCharFilter(normMap, reader); |
| } |
| }; |
| |
| assertAnalyzesTo( |
| analyzer, |
| "banküberfall", |
| new String[] {"bankueberfall", "fall"}, |
| new int[] {0, 0}, |
| new int[] {12, 12}); |
| analyzer.close(); |
| } |
| |
| /** blast some random strings through the analyzer */ |
| public void testRandomStrings() throws Exception { |
| final CharArraySet dict = makeDictionary("a", "e", "i", "o", "u", "y", "bc", "def"); |
| Analyzer a = |
| new Analyzer() { |
| |
| @Override |
| protected TokenStreamComponents createComponents(String fieldName) { |
| Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false); |
| return new TokenStreamComponents( |
| tokenizer, new DictionaryCompoundWordTokenFilter(tokenizer, dict)); |
| } |
| }; |
| checkRandomData(random(), a, 200 * RANDOM_MULTIPLIER); |
| a.close(); |
| |
| InputSource is = new InputSource(getClass().getResource("da_UTF8.xml").toExternalForm()); |
| final HyphenationTree hyphenator = HyphenationCompoundWordTokenFilter.getHyphenationTree(is); |
| Analyzer b = |
| new Analyzer() { |
| |
| @Override |
| protected TokenStreamComponents createComponents(String fieldName) { |
| Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false); |
| TokenFilter filter = new HyphenationCompoundWordTokenFilter(tokenizer, hyphenator); |
| return new TokenStreamComponents(tokenizer, filter); |
| } |
| }; |
| checkRandomData(random(), b, 200 * RANDOM_MULTIPLIER); |
| b.close(); |
| } |
| |
| public void testEmptyTerm() throws Exception { |
| final CharArraySet dict = makeDictionary("a", "e", "i", "o", "u", "y", "bc", "def"); |
| Analyzer a = |
| new Analyzer() { |
| |
| @Override |
| protected TokenStreamComponents createComponents(String fieldName) { |
| Tokenizer tokenizer = new KeywordTokenizer(); |
| return new TokenStreamComponents( |
| tokenizer, new DictionaryCompoundWordTokenFilter(tokenizer, dict)); |
| } |
| }; |
| checkOneTerm(a, "", ""); |
| a.close(); |
| |
| InputSource is = new InputSource(getClass().getResource("da_UTF8.xml").toExternalForm()); |
| final HyphenationTree hyphenator = HyphenationCompoundWordTokenFilter.getHyphenationTree(is); |
| Analyzer b = |
| new Analyzer() { |
| |
| @Override |
| protected TokenStreamComponents createComponents(String fieldName) { |
| Tokenizer tokenizer = new KeywordTokenizer(); |
| TokenFilter filter = new HyphenationCompoundWordTokenFilter(tokenizer, hyphenator); |
| return new TokenStreamComponents(tokenizer, filter); |
| } |
| }; |
| checkOneTerm(b, "", ""); |
| b.close(); |
| } |
| } |