/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.analysis.compound;

import java.io.IOException;

import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.compound.hyphenation.Hyphenation;
import org.apache.lucene.analysis.compound.hyphenation.HyphenationTree;
import org.xml.sax.InputSource;

/**
 * A {@link org.apache.lucene.analysis.TokenFilter} that decomposes compound words
 * found in many Germanic languages.
 * <p>
 * For example, "Donaudampfschiff" is decomposed into "Donau", "dampf", and "schiff",
 * so that a search for "schiff" also matches "Donaudampfschiff". The filter uses a
 * hyphenation grammar and, optionally, a word dictionary to find the subwords.
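 * <p>
 * A minimal usage sketch (the grammar file name {@code de_DR.xml}, the incoming
 * {@code tokenizer}, and the {@code dictionary} are assumptions for illustration,
 * not requirements of this class):
 * <pre class="prettyprint">
 * HyphenationTree hyphenator =
 *     HyphenationCompoundWordTokenFilter.getHyphenationTree("de_DR.xml");
 * TokenStream result =
 *     new HyphenationCompoundWordTokenFilter(tokenizer, hyphenator, dictionary);
 * </pre>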
 */
public class HyphenationCompoundWordTokenFilter extends
    CompoundWordTokenFilterBase {
  private HyphenationTree hyphenator;

  /**
   * Creates a new {@link HyphenationCompoundWordTokenFilter} instance.
   *
   * @param input
   *          the {@link org.apache.lucene.analysis.TokenStream} to process
   * @param hyphenator
   *          the hyphenation pattern tree to use for hyphenation
   * @param dictionary
   *          the word dictionary to match against
   */
  public HyphenationCompoundWordTokenFilter(TokenStream input,
      HyphenationTree hyphenator, CharArraySet dictionary) {
    this(input, hyphenator, dictionary, DEFAULT_MIN_WORD_SIZE,
        DEFAULT_MIN_SUBWORD_SIZE, DEFAULT_MAX_SUBWORD_SIZE, false);
  }

  /**
   * Creates a new {@link HyphenationCompoundWordTokenFilter} instance.
   *
   * @param input
   *          the {@link org.apache.lucene.analysis.TokenStream} to process
   * @param hyphenator
   *          the hyphenation pattern tree to use for hyphenation
   * @param dictionary
   *          the word dictionary to match against
   * @param minWordSize
   *          only words longer than this are processed
   * @param minSubwordSize
   *          only subwords at least this long are added to the output stream
   * @param maxSubwordSize
   *          only subwords no longer than this are added to the output stream
   * @param onlyLongestMatch
   *          add only the longest matching subword to the stream
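   * <p>
   * A sketch with explicit limits (the values and the {@code tokenStream},
   * {@code hyphenator}, and {@code dictionary} variables are illustrative assumptions):
   * <pre class="prettyprint">
   * TokenStream filtered = new HyphenationCompoundWordTokenFilter(
   *     tokenStream, hyphenator, dictionary, 5, 2, 15, true);
   * </pre>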
   */
  public HyphenationCompoundWordTokenFilter(TokenStream input,
      HyphenationTree hyphenator, CharArraySet dictionary, int minWordSize,
      int minSubwordSize, int maxSubwordSize, boolean onlyLongestMatch) {
    super(input, dictionary, minWordSize, minSubwordSize, maxSubwordSize,
        onlyLongestMatch);

    this.hyphenator = hyphenator;
  }

  /**
   * Create a HyphenationCompoundWordTokenFilter with no dictionary.
   * <p>
   * Calls {@link #HyphenationCompoundWordTokenFilter(org.apache.lucene.analysis.TokenStream, org.apache.lucene.analysis.compound.hyphenation.HyphenationTree, org.apache.lucene.analysis.CharArraySet, int, int, int, boolean)
   * HyphenationCompoundWordTokenFilter(input, hyphenator, null, minWordSize,
   * minSubwordSize, maxSubwordSize, false)}
   */
  public HyphenationCompoundWordTokenFilter(TokenStream input,
      HyphenationTree hyphenator, int minWordSize, int minSubwordSize,
      int maxSubwordSize) {
    this(input, hyphenator, null, minWordSize, minSubwordSize,
        maxSubwordSize, false);
  }

  /**
   * Create a HyphenationCompoundWordTokenFilter with no dictionary.
   * <p>
   * Calls {@link #HyphenationCompoundWordTokenFilter(org.apache.lucene.analysis.TokenStream, org.apache.lucene.analysis.compound.hyphenation.HyphenationTree, int, int, int)
   * HyphenationCompoundWordTokenFilter(input, hyphenator, DEFAULT_MIN_WORD_SIZE,
   * DEFAULT_MIN_SUBWORD_SIZE, DEFAULT_MAX_SUBWORD_SIZE)}
   */
  public HyphenationCompoundWordTokenFilter(TokenStream input,
      HyphenationTree hyphenator) {
    this(input, hyphenator, DEFAULT_MIN_WORD_SIZE, DEFAULT_MIN_SUBWORD_SIZE,
        DEFAULT_MAX_SUBWORD_SIZE);
  }

  /**
   * Create a hyphenation tree from an XML hyphenation grammar file.
   *
   * @param hyphenationFilename the filename of the XML grammar to load
   * @return An object representing the hyphenation patterns
   * @throws java.io.IOException If there is a low-level I/O error.
   */
  public static HyphenationTree getHyphenationTree(String hyphenationFilename)
      throws IOException {
    return getHyphenationTree(new InputSource(hyphenationFilename));
  }

  /**
   * Create a hyphenation tree from an XML hyphenation grammar.
   *
   * @param hyphenationSource the InputSource pointing to the XML grammar
   * @return An object representing the hyphenation patterns
   * @throws java.io.IOException If there is a low-level I/O error.
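   * <p>
   * A sketch of loading the grammar from a classpath resource (the resource path and
   * {@code SomeClass} are placeholders, assumed for illustration):
   * <pre class="prettyprint">
   * InputStream in = SomeClass.class.getResourceAsStream("/hyphenation/de_DR.xml");
   * HyphenationTree tree =
   *     HyphenationCompoundWordTokenFilter.getHyphenationTree(new InputSource(in));
   * </pre>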
   */
  public static HyphenationTree getHyphenationTree(InputSource hyphenationSource)
      throws IOException {
    HyphenationTree tree = new HyphenationTree();
    tree.loadPatterns(hyphenationSource);
    return tree;
  }

  @Override
  protected void decompose() {
    // get the hyphenation points
    Hyphenation hyphens = hyphenator.hyphenate(termAtt.buffer(), 0, termAtt.length(), 1, 1);
    // No hyphenation points found -> exit
    if (hyphens == null) {
      return;
    }

    final int[] hyp = hyphens.getHyphenationPoints();

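    // Each entry in hyp is a character offset into the term that is a potential
    // subword boundary. The outer loop fixes a start boundary; the inner loop extends
    // the candidate subword to each later boundary and checks it against the subword
    // size limits and, if present, the dictionary.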
    for (int i = 0; i < hyp.length; ++i) {
      int remaining = hyp.length - i;
      int start = hyp[i];
      CompoundToken longestMatchToken = null;
      for (int j = 1; j < remaining; j++) {
        int partLength = hyp[i + j] - start;

        // if the part is longer than maxSubwordSize we are done with this round
        if (partLength > this.maxSubwordSize) {
          break;
        }

        // only add subwords that are at least minSubwordSize long
        if (partLength < this.minSubwordSize) {
          // Note: the calculation above can occasionally yield a non-positive part
          // length; we rely on minSubwordSize being >= 0 to filter such parts out here.
          continue;
        }

        // check the dictionary
        if (dictionary == null || dictionary.contains(termAtt.buffer(), start, partLength)) {
          if (this.onlyLongestMatch) {
            if (longestMatchToken != null) {
              if (longestMatchToken.txt.length() < partLength) {
                longestMatchToken = new CompoundToken(start, partLength);
              }
            } else {
              longestMatchToken = new CompoundToken(start, partLength);
            }
          } else {
            tokens.add(new CompoundToken(start, partLength));
          }
        } else if (dictionary.contains(termAtt.buffer(), start, partLength - 1)) {
          // check the dictionary again with a word that is one character shorter, to
          // avoid problems with genitive 's characters and other binding characters
          if (this.onlyLongestMatch) {
            if (longestMatchToken != null) {
              if (longestMatchToken.txt.length() < partLength - 1) {
                longestMatchToken = new CompoundToken(start, partLength - 1);
              }
            } else {
              longestMatchToken = new CompoundToken(start, partLength - 1);
            }
          } else {
            tokens.add(new CompoundToken(start, partLength - 1));
          }
        }
      }
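      // when onlyLongestMatch is set, only the longest dictionary match found for
      // this start position (if any) is emitted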
      if (this.onlyLongestMatch && longestMatchToken != null) {
        tokens.add(longestMatchToken);
      }
    }
  }
}