/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.analysis.compound;

import java.io.IOException;

import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.compound.hyphenation.Hyphenation;
import org.apache.lucene.analysis.compound.hyphenation.HyphenationTree;
import org.xml.sax.InputSource;

/**
 * A {@link org.apache.lucene.analysis.TokenFilter} that decomposes compound words
 * found in many Germanic languages.
 * <p>
 * For example, "Donaudampfschiff" is decomposed into "Donau", "dampf", and "schiff",
 * so that a search for "schiff" also matches "Donaudampfschiff". The filter uses a
 * hyphenation grammar and, optionally, a word dictionary to find the subwords.
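 * <p>
 * A minimal usage sketch (the grammar file name {@code de_DR.xml}, the incoming
 * {@code tokenizer}, and the {@code dictionary} are assumptions for illustration,
 * not requirements of this class):
 * <pre class="prettyprint">
 * HyphenationTree hyphenator =
 *     HyphenationCompoundWordTokenFilter.getHyphenationTree("de_DR.xml");
 * TokenStream result =
 *     new HyphenationCompoundWordTokenFilter(tokenizer, hyphenator, dictionary);
 * </pre>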
 */
public class HyphenationCompoundWordTokenFilter extends
    CompoundWordTokenFilterBase {
  private HyphenationTree hyphenator;

  /**
   * Creates a new {@link HyphenationCompoundWordTokenFilter} instance.
   *
   * @param input
   *          the {@link org.apache.lucene.analysis.TokenStream} to process
   * @param hyphenator
   *          the hyphenation pattern tree to use for hyphenation
   * @param dictionary
   *          the word dictionary to match against
   */
  public HyphenationCompoundWordTokenFilter(TokenStream input,
      HyphenationTree hyphenator, CharArraySet dictionary) {
    this(input, hyphenator, dictionary, DEFAULT_MIN_WORD_SIZE,
        DEFAULT_MIN_SUBWORD_SIZE, DEFAULT_MAX_SUBWORD_SIZE, false);
  }

  /**
   * Creates a new {@link HyphenationCompoundWordTokenFilter} instance.
   *
   * @param input
   *          the {@link org.apache.lucene.analysis.TokenStream} to process
   * @param hyphenator
   *          the hyphenation pattern tree to use for hyphenation
   * @param dictionary
   *          the word dictionary to match against
   * @param minWordSize
   *          only words longer than this are processed
   * @param minSubwordSize
   *          only subwords at least this long are added to the output stream
   * @param maxSubwordSize
   *          only subwords no longer than this are added to the output stream
   * @param onlyLongestMatch
   *          add only the longest matching subword to the stream
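   * <p>
   * A sketch with explicit limits (the values and the {@code tokenStream},
   * {@code hyphenator}, and {@code dictionary} variables are illustrative assumptions):
   * <pre class="prettyprint">
   * TokenStream filtered = new HyphenationCompoundWordTokenFilter(
   *     tokenStream, hyphenator, dictionary, 5, 2, 15, true);
   * </pre>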
   */
  public HyphenationCompoundWordTokenFilter(TokenStream input,
      HyphenationTree hyphenator, CharArraySet dictionary, int minWordSize,
      int minSubwordSize, int maxSubwordSize, boolean onlyLongestMatch) {
    super(input, dictionary, minWordSize, minSubwordSize, maxSubwordSize,
        onlyLongestMatch);

    this.hyphenator = hyphenator;
  }

  /**
   * Create a HyphenationCompoundWordTokenFilter with no dictionary.
   * <p>
   * Calls {@link #HyphenationCompoundWordTokenFilter(org.apache.lucene.analysis.TokenStream, org.apache.lucene.analysis.compound.hyphenation.HyphenationTree, org.apache.lucene.analysis.CharArraySet, int, int, int, boolean)
   * HyphenationCompoundWordTokenFilter(input, hyphenator, null, minWordSize,
   * minSubwordSize, maxSubwordSize, false)}
   */
  public HyphenationCompoundWordTokenFilter(TokenStream input,
      HyphenationTree hyphenator, int minWordSize, int minSubwordSize,
      int maxSubwordSize) {
    this(input, hyphenator, null, minWordSize, minSubwordSize,
        maxSubwordSize, false);
  }

  /**
   * Create a HyphenationCompoundWordTokenFilter with no dictionary.
   * <p>
   * Calls {@link #HyphenationCompoundWordTokenFilter(org.apache.lucene.analysis.TokenStream, org.apache.lucene.analysis.compound.hyphenation.HyphenationTree, int, int, int)
   * HyphenationCompoundWordTokenFilter(input, hyphenator, DEFAULT_MIN_WORD_SIZE,
   * DEFAULT_MIN_SUBWORD_SIZE, DEFAULT_MAX_SUBWORD_SIZE)}
   */
  public HyphenationCompoundWordTokenFilter(TokenStream input,
      HyphenationTree hyphenator) {
    this(input, hyphenator, DEFAULT_MIN_WORD_SIZE, DEFAULT_MIN_SUBWORD_SIZE,
        DEFAULT_MAX_SUBWORD_SIZE);
  }

  /**
   * Create a hyphenation tree from an XML hyphenation grammar file.
   *
   * @param hyphenationFilename the filename of the XML grammar to load
   * @return An object representing the hyphenation patterns
   * @throws java.io.IOException If there is a low-level I/O error.
   */
  public static HyphenationTree getHyphenationTree(String hyphenationFilename)
      throws IOException {
    return getHyphenationTree(new InputSource(hyphenationFilename));
  }

  /**
   * Create a hyphenation tree from an XML hyphenation grammar.
   *
   * @param hyphenationSource the InputSource pointing to the XML grammar
   * @return An object representing the hyphenation patterns
   * @throws java.io.IOException If there is a low-level I/O error.
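   * <p>
   * A sketch of loading the grammar from a classpath resource (the resource path and
   * {@code SomeClass} are placeholders, assumed for illustration):
   * <pre class="prettyprint">
   * InputStream in = SomeClass.class.getResourceAsStream("/hyphenation/de_DR.xml");
   * HyphenationTree tree =
   *     HyphenationCompoundWordTokenFilter.getHyphenationTree(new InputSource(in));
   * </pre>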
   */
  public static HyphenationTree getHyphenationTree(InputSource hyphenationSource)
      throws IOException {
    HyphenationTree tree = new HyphenationTree();
    tree.loadPatterns(hyphenationSource);
    return tree;
  }

  @Override
  protected void decompose() {
    // get the hyphenation points
    Hyphenation hyphens = hyphenator.hyphenate(termAtt.buffer(), 0, termAtt.length(), 1, 1);
    // No hyphenation points found -> exit
    if (hyphens == null) {
      return;
    }

    final int[] hyp = hyphens.getHyphenationPoints();

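    // Each entry in hyp is a character offset into the term that is a potential
    // subword boundary. The outer loop fixes a start boundary; the inner loop extends
    // the candidate subword to each later boundary and checks it against the subword
    // size limits and, if present, the dictionary.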
    for (int i = 0; i < hyp.length; ++i) {
      int remaining = hyp.length - i;
      int start = hyp[i];
      CompoundToken longestMatchToken = null;
      for (int j = 1; j < remaining; j++) {
        int partLength = hyp[i + j] - start;

        // if the part is longer than maxSubwordSize we are done with this round
        if (partLength > this.maxSubwordSize) {
          break;
        }

        // only add subwords that are at least minSubwordSize long
        if (partLength < this.minSubwordSize) {
          // Note: the calculation above can occasionally yield a non-positive part
          // length; we rely on minSubwordSize being >= 0 to filter such parts out here.
          continue;
        }

        // check the dictionary
        if (dictionary == null || dictionary.contains(termAtt.buffer(), start, partLength)) {
          if (this.onlyLongestMatch) {
            if (longestMatchToken != null) {
              if (longestMatchToken.txt.length() < partLength) {
                longestMatchToken = new CompoundToken(start, partLength);
              }
            } else {
              longestMatchToken = new CompoundToken(start, partLength);
            }
          } else {
            tokens.add(new CompoundToken(start, partLength));
          }
        } else if (dictionary.contains(termAtt.buffer(), start, partLength - 1)) {
          // check the dictionary again with a word that is one character shorter, to
          // avoid problems with genitive 's characters and other binding characters
          if (this.onlyLongestMatch) {
            if (longestMatchToken != null) {
              if (longestMatchToken.txt.length() < partLength - 1) {
                longestMatchToken = new CompoundToken(start, partLength - 1);
              }
            } else {
              longestMatchToken = new CompoundToken(start, partLength - 1);
            }
          } else {
            tokens.add(new CompoundToken(start, partLength - 1));
          }
        }
      }
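      // when onlyLongestMatch is set, only the longest dictionary match found for
      // this start position (if any) is emitted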
      if (this.onlyLongestMatch && longestMatchToken != null) {
        tokens.add(longestMatchToken);
      }
    }
  }
}