| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| package org.apache.lucene.analysis.morfologik; |
| |
| |
| import java.io.IOException; |
| import java.util.ArrayList; |
| import java.util.Collections; |
| import java.util.List; |
| import java.util.regex.Pattern; |
| |
| import org.apache.lucene.analysis.TokenFilter; |
| import org.apache.lucene.analysis.TokenStream; |
| import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; |
| import org.apache.lucene.analysis.tokenattributes.KeywordAttribute; |
| import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; |
| import org.apache.lucene.util.CharsRefBuilder; |
| |
| import morfologik.stemming.Dictionary; |
| import morfologik.stemming.DictionaryLookup; |
| import morfologik.stemming.IStemmer; |
| import morfologik.stemming.WordData; |
| import morfologik.stemming.polish.PolishStemmer; |
| |
| /** |
| * {@link TokenFilter} using Morfologik library to transform input tokens into lemma and |
| * morphosyntactic (POS) tokens. Applies to Polish only. |
| * |
| * <p>MorfologikFilter contains a {@link MorphosyntacticTagsAttribute}, which provides morphosyntactic |
| * annotations for produced lemmas. See the Morfologik documentation for details.</p> |
| * |
| * @see <a href="http://morfologik.blogspot.com/">Morfologik project page</a> |
| */ |
| public class MorfologikFilter extends TokenFilter { |
| |
| private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class); |
| private final MorphosyntacticTagsAttribute tagsAtt = addAttribute(MorphosyntacticTagsAttribute.class); |
| private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class); |
| private final KeywordAttribute keywordAttr = addAttribute(KeywordAttribute.class); |
| |
| private final CharsRefBuilder scratch = new CharsRefBuilder(); |
| |
| private State current; |
| private final TokenStream input; |
| private final IStemmer stemmer; |
| |
| private List<WordData> lemmaList; |
| private final ArrayList<StringBuilder> tagsList = new ArrayList<>(); |
| |
| private int lemmaListIndex; |
| |
| /** |
| * Creates a filter with the default (Polish) dictionary. |
| */ |
| public MorfologikFilter(final TokenStream in) { |
| this(in, new PolishStemmer().getDictionary()); |
| } |
| |
| /** |
| * Creates a filter with a given dictionary. |
| * |
| * @param in input token stream. |
| * @param dict Dictionary to use for stemming. |
| */ |
| public MorfologikFilter(final TokenStream in, final Dictionary dict) { |
| super(in); |
| this.input = in; |
| this.stemmer = new DictionaryLookup(dict); |
| this.lemmaList = Collections.emptyList(); |
| } |
| |
| /** |
| * A pattern used to split lemma forms. |
| */ |
| private final static Pattern lemmaSplitter = Pattern.compile("\\+|\\|"); |
| |
| private void popNextLemma() { |
| // One tag (concatenated) per lemma. |
| final WordData lemma = lemmaList.get(lemmaListIndex++); |
| termAtt.setEmpty().append(lemma.getStem()); |
| CharSequence tag = lemma.getTag(); |
| if (tag != null) { |
| String[] tags = lemmaSplitter.split(tag.toString()); |
| for (int i = 0; i < tags.length; i++) { |
| if (tagsList.size() <= i) { |
| tagsList.add(new StringBuilder()); |
| } |
| StringBuilder buffer = tagsList.get(i); |
| buffer.setLength(0); |
| buffer.append(tags[i]); |
| } |
| tagsAtt.setTags(tagsList.subList(0, tags.length)); |
| } else { |
| tagsAtt.setTags(Collections.<StringBuilder> emptyList()); |
| } |
| } |
| |
| /** |
| * Lookup a given surface form of a token and update |
| * {@link #lemmaList} and {@link #lemmaListIndex} accordingly. |
| */ |
| private boolean lookupSurfaceForm(CharSequence token) { |
| lemmaList = this.stemmer.lookup(token); |
| lemmaListIndex = 0; |
| return lemmaList.size() > 0; |
| } |
| |
| /** Retrieves the next token (possibly from the list of lemmas). */ |
| @Override |
| public final boolean incrementToken() throws IOException { |
| if (lemmaListIndex < lemmaList.size()) { |
| restoreState(current); |
| posIncrAtt.setPositionIncrement(0); |
| popNextLemma(); |
| return true; |
| } else if (this.input.incrementToken()) { |
| if (!keywordAttr.isKeyword() && |
| (lookupSurfaceForm(termAtt) || lookupSurfaceForm(toLowercase(termAtt)))) { |
| current = captureState(); |
| popNextLemma(); |
| } else { |
| tagsAtt.clear(); |
| } |
| return true; |
| } else { |
| return false; |
| } |
| } |
| |
| /** |
| * Convert to lowercase in-place. |
| */ |
| private CharSequence toLowercase(CharSequence chs) { |
| final int length = chs.length(); |
| scratch.setLength(length); |
| scratch.grow(length); |
| |
| char buffer[] = scratch.chars(); |
| for (int i = 0; i < length;) { |
| i += Character.toChars( |
| Character.toLowerCase(Character.codePointAt(chs, i)), buffer, i); |
| } |
| |
| return scratch.get(); |
| } |
| |
| /** Resets stems accumulator and hands over to superclass. */ |
| @Override |
| public void reset() throws IOException { |
| lemmaListIndex = 0; |
| lemmaList = Collections.emptyList(); |
| tagsList.clear(); |
| super.reset(); |
| } |
| } |