/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.lucene.analysis.compound;

import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;
import java.util.Arrays;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.charfilter.MappingCharFilter;
import org.apache.lucene.analysis.charfilter.NormalizeCharMap;
import org.apache.lucene.analysis.compound.hyphenation.HyphenationTree;
import org.apache.lucene.analysis.core.KeywordTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.util.Attribute;
import org.apache.lucene.util.AttributeImpl;
import org.apache.lucene.util.AttributeReflector;
import org.xml.sax.InputSource;

public class TestCompoundWordTokenFilter extends BaseTokenStreamTestCase {

private static CharArraySet makeDictionary(String... dictionary) {
return new CharArraySet(Arrays.asList(dictionary), true);
}
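
  /**
   * Hyphenation-based decompounding with a dictionary: "læsehest" is split into "læse" and "hest",
   * and both subwords are emitted at the same position as the original token (position increment
   * 0).
   */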
public void testHyphenationCompoundWordsDA() throws Exception {
CharArraySet dict = makeDictionary("læse", "hest");
InputSource is = new InputSource(getClass().getResource("da_UTF8.xml").toExternalForm());
HyphenationTree hyphenator = HyphenationCompoundWordTokenFilter.getHyphenationTree(is);
HyphenationCompoundWordTokenFilter tf =
new HyphenationCompoundWordTokenFilter(
whitespaceMockTokenizer("min veninde som er lidt af en læsehest"),
hyphenator,
dict,
CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE,
CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE,
CompoundWordTokenFilterBase.DEFAULT_MAX_SUBWORD_SIZE,
false);
assertTokenStreamContents(
tf,
new String[] {
"min", "veninde", "som", "er", "lidt", "af", "en", "læsehest", "læse", "hest"
},
new int[] {1, 1, 1, 1, 1, 1, 1, 1, 0, 0});
}
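
  /**
   * With onlyLongestMatch=true, a dictionary subword contained in a longer dictionary match is
   * suppressed: "basket" is dropped in favor of "basketball".
   */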
public void testHyphenationCompoundWordsDELongestMatch() throws Exception {
CharArraySet dict = makeDictionary("basketball", "basket", "ball", "kurv");
InputSource is = new InputSource(getClass().getResource("da_UTF8.xml").toExternalForm());
HyphenationTree hyphenator = HyphenationCompoundWordTokenFilter.getHyphenationTree(is);
// the word basket will not be added due to the longest match option
HyphenationCompoundWordTokenFilter tf =
new HyphenationCompoundWordTokenFilter(
whitespaceMockTokenizer("basketballkurv"),
hyphenator,
dict,
CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE,
CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE,
40,
true);
assertTokenStreamContents(
tf, new String[] {"basketballkurv", "basketball", "ball", "kurv"}, new int[] {1, 0, 0, 0});
  }

/**
* With hyphenation-only, you can get a lot of nonsense tokens. This can be controlled with the
* min/max subword size.
*/
public void testHyphenationOnly() throws Exception {
InputSource is = new InputSource(getClass().getResource("da_UTF8.xml").toExternalForm());
HyphenationTree hyphenator = HyphenationCompoundWordTokenFilter.getHyphenationTree(is);
HyphenationCompoundWordTokenFilter tf =
new HyphenationCompoundWordTokenFilter(
whitespaceMockTokenizer("basketballkurv"),
hyphenator,
CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE,
2,
4);
// min=2, max=4
assertTokenStreamContents(
tf, new String[] {"basketballkurv", "ba", "sket", "bal", "ball", "kurv"});
tf =
new HyphenationCompoundWordTokenFilter(
whitespaceMockTokenizer("basketballkurv"),
hyphenator,
CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE,
4,
6);
// min=4, max=6
assertTokenStreamContents(
tf, new String[] {"basketballkurv", "basket", "sket", "ball", "lkurv", "kurv"});
tf =
new HyphenationCompoundWordTokenFilter(
whitespaceMockTokenizer("basketballkurv"),
hyphenator,
CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE,
4,
10);
// min=4, max=10
assertTokenStreamContents(
tf,
new String[] {
"basketballkurv",
"basket",
"basketbal",
"basketball",
"sket",
"sketbal",
"sketball",
"ball",
"ballkurv",
"lkurv",
"kurv"
});
}
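
  /**
   * Dictionary-only decompounding of Swedish compounds. Each subword keeps the start/end offsets
   * of the original token and is emitted with position increment 0.
   */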
public void testDumbCompoundWordsSE() throws Exception {
CharArraySet dict =
makeDictionary(
"Bil", "Dörr", "Motor", "Tak", "Borr", "Slag", "Hammar", "Pelar", "Glas", "Ögon",
"Fodral", "Bas", "Fiol", "Makare", "Gesäll", "Sko", "Vind", "Rute", "Torkare", "Blad");
DictionaryCompoundWordTokenFilter tf =
new DictionaryCompoundWordTokenFilter(
whitespaceMockTokenizer(
"Bildörr Bilmotor Biltak Slagborr Hammarborr Pelarborr Glasögonfodral Basfiolsfodral Basfiolsfodralmakaregesäll Skomakare Vindrutetorkare Vindrutetorkarblad abba"),
dict);
assertTokenStreamContents(
tf,
new String[] {
"Bildörr",
"Bil",
"dörr",
"Bilmotor",
"Bil",
"motor",
"Biltak",
"Bil",
"tak",
"Slagborr",
"Slag",
"borr",
"Hammarborr",
"Hammar",
"borr",
"Pelarborr",
"Pelar",
"borr",
"Glasögonfodral",
"Glas",
"ögon",
"fodral",
"Basfiolsfodral",
"Bas",
"fiol",
"fodral",
"Basfiolsfodralmakaregesäll",
"Bas",
"fiol",
"fodral",
"makare",
"gesäll",
"Skomakare",
"Sko",
"makare",
"Vindrutetorkare",
"Vind",
"rute",
"torkare",
"Vindrutetorkarblad",
"Vind",
"rute",
"blad",
"abba"
},
new int[] {
0, 0, 0, 8, 8, 8, 17, 17, 17, 24, 24, 24, 33, 33, 33, 44, 44, 44, 54, 54, 54, 54, 69, 69,
69, 69, 84, 84, 84, 84, 84, 84, 111, 111, 111, 121, 121, 121, 121, 137, 137, 137, 137, 156
},
new int[] {
7, 7, 7, 16, 16, 16, 23, 23, 23, 32, 32, 32, 43, 43, 43, 53, 53, 53, 68, 68, 68, 68, 83,
83, 83, 83, 110, 110, 110, 110, 110, 110, 120, 120, 120, 136, 136, 136, 136, 155, 155,
155, 155, 160
},
new int[] {
1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0,
0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1
});
}
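
  /**
   * Dictionary-only decompounding with onlyLongestMatch=true: the longer dictionary entry
   * "Fiolsfodral" is preferred over the shorter "Fiols".
   */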
public void testDumbCompoundWordsSELongestMatch() throws Exception {
CharArraySet dict =
makeDictionary(
"Bil",
"Dörr",
"Motor",
"Tak",
"Borr",
"Slag",
"Hammar",
"Pelar",
"Glas",
"Ögon",
"Fodral",
"Bas",
"Fiols",
"Makare",
"Gesäll",
"Sko",
"Vind",
"Rute",
"Torkare",
"Blad",
"Fiolsfodral");
DictionaryCompoundWordTokenFilter tf =
new DictionaryCompoundWordTokenFilter(
whitespaceMockTokenizer("Basfiolsfodralmakaregesäll"),
dict,
CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE,
CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE,
CompoundWordTokenFilterBase.DEFAULT_MAX_SUBWORD_SIZE,
true);
assertTokenStreamContents(
tf,
new String[] {
"Basfiolsfodralmakaregesäll", "Bas", "fiolsfodral", "fodral", "makare", "gesäll"
},
new int[] {0, 0, 0, 0, 0, 0},
new int[] {26, 26, 26, 26, 26, 26},
new int[] {1, 0, 0, 0, 0, 0});
}
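
  /**
   * A token that decomposes exactly into minimum-length dictionary entries ("ab" + "cd" + "ef")
   * yields all three subwords in addition to the original token.
   */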
public void testTokenEndingWithWordComponentOfMinimumLength() throws Exception {
CharArraySet dict = makeDictionary("ab", "cd", "ef");
Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
tokenizer.setReader(new StringReader("abcdef"));
DictionaryCompoundWordTokenFilter tf =
new DictionaryCompoundWordTokenFilter(
tokenizer,
dict,
CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE,
CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE,
CompoundWordTokenFilterBase.DEFAULT_MAX_SUBWORD_SIZE,
false);
assertTokenStreamContents(
tf,
new String[] {"abcdef", "ab", "cd", "ef"},
new int[] {0, 0, 0, 0},
new int[] {6, 6, 6, 6},
new int[] {1, 0, 0, 0});
}
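
  /**
   * Subwords shorter than the minimum subword size are skipped: "d" is in the dictionary but must
   * never be emitted.
   */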
public void testWordComponentWithLessThanMinimumLength() throws Exception {
CharArraySet dict = makeDictionary("abc", "d", "efg");
Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
tokenizer.setReader(new StringReader("abcdefg"));
DictionaryCompoundWordTokenFilter tf =
new DictionaryCompoundWordTokenFilter(
tokenizer,
dict,
CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE,
CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE,
CompoundWordTokenFilterBase.DEFAULT_MAX_SUBWORD_SIZE,
false);
// since "d" is shorter than the minimum subword size, it should not be added to the token
// stream
assertTokenStreamContents(
tf,
new String[] {"abcdefg", "abc", "efg"},
new int[] {0, 0, 0},
new int[] {7, 7, 7},
new int[] {1, 0, 0});
}
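
  /**
   * The filter must be reusable: after consuming part of the stream, end() and close() are called,
   * the reader is replaced, and reset() must start a fresh pass over the input.
   */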
public void testReset() throws Exception {
CharArraySet dict =
makeDictionary("Rind", "Fleisch", "Draht", "Schere", "Gesetz", "Aufgabe", "Überwachung");
MockTokenizer wsTokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
wsTokenizer.setEnableChecks(false); // we will reset in a strange place
wsTokenizer.setReader(new StringReader("Rindfleischüberwachungsgesetz"));
DictionaryCompoundWordTokenFilter tf =
new DictionaryCompoundWordTokenFilter(
wsTokenizer,
dict,
CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE,
CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE,
CompoundWordTokenFilterBase.DEFAULT_MAX_SUBWORD_SIZE,
false);
CharTermAttribute termAtt = tf.getAttribute(CharTermAttribute.class);
tf.reset();
assertTrue(tf.incrementToken());
assertEquals("Rindfleischüberwachungsgesetz", termAtt.toString());
assertTrue(tf.incrementToken());
assertEquals("Rind", termAtt.toString());
tf.end();
tf.close();
wsTokenizer.setReader(new StringReader("Rindfleischüberwachungsgesetz"));
tf.reset();
assertTrue(tf.incrementToken());
assertEquals("Rindfleischüberwachungsgesetz", termAtt.toString());
}
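
  /**
   * A custom attribute set by an upstream filter must be preserved on every token the compound
   * filter emits, including the decompounded subwords.
   */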
public void testRetainMockAttribute() throws Exception {
CharArraySet dict = makeDictionary("abc", "d", "efg");
Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
tokenizer.setReader(new StringReader("abcdefg"));
TokenStream stream = new MockRetainAttributeFilter(tokenizer);
stream =
new DictionaryCompoundWordTokenFilter(
stream,
dict,
CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE,
CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE,
CompoundWordTokenFilterBase.DEFAULT_MAX_SUBWORD_SIZE,
false);
MockRetainAttribute retAtt = stream.addAttribute(MockRetainAttribute.class);
stream.reset();
while (stream.incrementToken()) {
assertTrue("Custom attribute value was lost", retAtt.getRetain());
}
}
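
  /** Regression test for LUCENE-8124, using a dedicated hyphenation grammar and no dictionary. */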
public void testLucene8124() throws Exception {
InputSource is =
new InputSource(getClass().getResource("hyphenation-LUCENE-8124.xml").toExternalForm());
HyphenationTree hyphenator = HyphenationCompoundWordTokenFilter.getHyphenationTree(is);
HyphenationCompoundWordTokenFilter tf =
new HyphenationCompoundWordTokenFilter(whitespaceMockTokenizer("Rindfleisch"), hyphenator);
    // TODO: "Rindfleisch" being returned twice is a separate issue in
    // HyphenationCompoundWordTokenFilter
assertTokenStreamContents(tf, new String[] {"Rindfleisch", "Rind", "Rindfleisch", "fleisch"});
}
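
  /** Test-only attribute used to verify that attribute values survive the compound filter. */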
  public interface MockRetainAttribute extends Attribute {
    void setRetain(boolean attr);

    boolean getRetain();
  }

  public static final class MockRetainAttributeImpl extends AttributeImpl
      implements MockRetainAttribute {
    private boolean retain = false;

    @Override
    public void clear() {
      retain = false;
    }

    @Override
    public boolean getRetain() {
      return retain;
    }

    @Override
    public void setRetain(boolean retain) {
      this.retain = retain;
    }

    @Override
    public void copyTo(AttributeImpl target) {
      MockRetainAttribute t = (MockRetainAttribute) target;
      t.setRetain(retain);
    }

    @Override
    public void reflectWith(AttributeReflector reflector) {
      reflector.reflect(MockRetainAttribute.class, "retain", retain);
    }
  }

  private static class MockRetainAttributeFilter extends TokenFilter {
    MockRetainAttribute retainAtt = addAttribute(MockRetainAttribute.class);

    MockRetainAttributeFilter(TokenStream input) {
      super(input);
    }

    @Override
    public boolean incrementToken() throws IOException {
      if (input.incrementToken()) {
        retainAtt.setRetain(true);
        return true;
      } else {
        return false;
      }
    }
  }

  // SOLR-2891
  // *CompoundWordTokenFilter blindly adds the term length to the offset, which can go out of
  // bounds with respect to the original text if a previous filter increased the length of the
  // word (in this case ü -> ue). So in this case we behave like WordDelimiterFilter and preserve
  // any modified offsets.
public void testInvalidOffsets() throws Exception {
final CharArraySet dict = makeDictionary("fall");
final NormalizeCharMap.Builder builder = new NormalizeCharMap.Builder();
builder.add("ü", "ue");
final NormalizeCharMap normMap = builder.build();
Analyzer analyzer =
new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName) {
Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
TokenFilter filter = new DictionaryCompoundWordTokenFilter(tokenizer, dict);
return new TokenStreamComponents(tokenizer, filter);
}
@Override
protected Reader initReader(String fieldName, Reader reader) {
return new MappingCharFilter(normMap, reader);
}
};
assertAnalyzesTo(
analyzer,
"banküberfall",
new String[] {"bankueberfall", "fall"},
new int[] {0, 0},
new int[] {12, 12});
analyzer.close();
  }

/** blast some random strings through the analyzer */
public void testRandomStrings() throws Exception {
final CharArraySet dict = makeDictionary("a", "e", "i", "o", "u", "y", "bc", "def");
Analyzer a =
new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName) {
Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
return new TokenStreamComponents(
tokenizer, new DictionaryCompoundWordTokenFilter(tokenizer, dict));
}
};
checkRandomData(random(), a, 200 * RANDOM_MULTIPLIER);
a.close();
InputSource is = new InputSource(getClass().getResource("da_UTF8.xml").toExternalForm());
final HyphenationTree hyphenator = HyphenationCompoundWordTokenFilter.getHyphenationTree(is);
Analyzer b =
new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName) {
Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
TokenFilter filter = new HyphenationCompoundWordTokenFilter(tokenizer, hyphenator);
return new TokenStreamComponents(tokenizer, filter);
}
};
checkRandomData(random(), b, 200 * RANDOM_MULTIPLIER);
b.close();
}
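
  /** An empty term must pass through both compound filters unchanged. */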
public void testEmptyTerm() throws Exception {
final CharArraySet dict = makeDictionary("a", "e", "i", "o", "u", "y", "bc", "def");
Analyzer a =
new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName) {
Tokenizer tokenizer = new KeywordTokenizer();
return new TokenStreamComponents(
tokenizer, new DictionaryCompoundWordTokenFilter(tokenizer, dict));
}
};
checkOneTerm(a, "", "");
a.close();
InputSource is = new InputSource(getClass().getResource("da_UTF8.xml").toExternalForm());
final HyphenationTree hyphenator = HyphenationCompoundWordTokenFilter.getHyphenationTree(is);
Analyzer b =
new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName) {
Tokenizer tokenizer = new KeywordTokenizer();
TokenFilter filter = new HyphenationCompoundWordTokenFilter(tokenizer, hyphenator);
return new TokenStreamComponents(tokenizer, filter);
}
};
checkOneTerm(b, "", "");
b.close();
}
}