| // -*- c-basic-offset: 2 -*- |
| package org.apache.lucene.analysis.morfologik; |
| |
| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| |
| import java.io.IOException; |
| import java.util.*; |
| |
| import morfologik.stemming.*; |
| |
| import org.apache.lucene.analysis.TokenFilter; |
| import org.apache.lucene.analysis.TokenStream; |
| import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; |
| import org.apache.lucene.analysis.tokenattributes.KeywordAttribute; |
| import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; |
| import org.apache.lucene.analysis.util.CharacterUtils; |
| import org.apache.lucene.util.*; |
| |
| /** |
| * {@link TokenFilter} using Morfologik library to transform input tokens into lemma and |
| * morphosyntactic (POS) tokens. Applies to Polish only. |
| * |
| * <p>MorfologikFilter contains a {@link MorphosyntacticTagsAttribute}, which provides morphosyntactic |
| * annotations for produced lemmas. See the Morfologik documentation for details.</p> |
| * |
| * @see <a href="http://morfologik.blogspot.com/">Morfologik project page</a> |
| */ |
| public class MorfologikFilter extends TokenFilter { |
| |
| private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class); |
| private final MorphosyntacticTagsAttribute tagsAtt = addAttribute(MorphosyntacticTagsAttribute.class); |
| private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class); |
| private final KeywordAttribute keywordAttr = addAttribute(KeywordAttribute.class); |
| |
| private final CharsRef scratch = new CharsRef(0); |
| private final CharacterUtils charUtils; |
| |
| private State current; |
| private final TokenStream input; |
| private final IStemmer stemmer; |
| |
| private List<WordData> lemmaList; |
| private final ArrayList<StringBuilder> tagsList = new ArrayList<StringBuilder>(); |
| |
| private int lemmaListIndex; |
| |
| /** |
| * Creates MorfologikFilter |
| * @param in input token stream |
| * @param version Lucene version compatibility for lowercasing. |
| */ |
| public MorfologikFilter(final TokenStream in, final Version version) { |
| super(in); |
| this.input = in; |
| |
| // SOLR-4007: temporarily substitute context class loader to allow finding dictionary resources. |
| Thread me = Thread.currentThread(); |
| ClassLoader cl = me.getContextClassLoader(); |
| try { |
| me.setContextClassLoader(PolishStemmer.class.getClassLoader()); |
| this.stemmer = new PolishStemmer(); |
| this.charUtils = CharacterUtils.getInstance(version); |
| this.lemmaList = Collections.emptyList(); |
| } finally { |
| me.setContextClassLoader(cl); |
| } |
| } |
| |
| /** |
| * The tag encoding format has been changing in Morfologik from version |
| * to version. Let's keep both variants and determine which one to run |
| * based on this flag. |
| */ |
| private final static boolean multipleTagsPerLemma = true; |
| |
| private void popNextLemma() { |
| if (multipleTagsPerLemma) { |
| // One tag (concatenated) per lemma. |
| final WordData lemma = lemmaList.get(lemmaListIndex++); |
| termAtt.setEmpty().append(lemma.getStem()); |
| CharSequence tag = lemma.getTag(); |
| if (tag != null) { |
| String[] tags = tag.toString().split("\\+|\\|"); |
| for (int i = 0; i < tags.length; i++) { |
| if (tagsList.size() <= i) { |
| tagsList.add(new StringBuilder()); |
| } |
| StringBuilder buffer = tagsList.get(i); |
| buffer.setLength(0); |
| buffer.append(tags[i]); |
| } |
| tagsAtt.setTags(tagsList.subList(0, tags.length)); |
| } else { |
| tagsAtt.setTags(Collections.<StringBuilder> emptyList()); |
| } |
| } else { |
| // One tag (concatenated) per stem (lemma repeated). |
| CharSequence currentStem; |
| int tags = 0; |
| do { |
| final WordData lemma = lemmaList.get(lemmaListIndex++); |
| currentStem = lemma.getStem(); |
| final CharSequence tag = lemma.getTag(); |
| if (tag != null) { |
| if (tagsList.size() <= tags) { |
| tagsList.add(new StringBuilder()); |
| } |
| |
| final StringBuilder buffer = tagsList.get(tags++); |
| buffer.setLength(0); |
| buffer.append(lemma.getTag()); |
| } |
| } while (lemmaListIndex < lemmaList.size() && |
| equalCharSequences(lemmaList.get(lemmaListIndex).getStem(), currentStem)); |
| |
| // Set the lemma's base form and tags as attributes. |
| termAtt.setEmpty().append(currentStem); |
| tagsAtt.setTags(tagsList.subList(0, tags)); |
| } |
| } |
| |
| /** |
| * Compare two char sequences for equality. Assumes non-null arguments. |
| */ |
| private static final boolean equalCharSequences(CharSequence s1, CharSequence s2) { |
| int len1 = s1.length(); |
| int len2 = s2.length(); |
| if (len1 != len2) return false; |
| for (int i = len1; --i >= 0;) { |
| if (s1.charAt(i) != s2.charAt(i)) { |
| return false; |
| } |
| } |
| return true; |
| } |
| |
| /** |
| * Lookup a given surface form of a token and update |
| * {@link #lemmaList} and {@link #lemmaListIndex} accordingly. |
| */ |
| private boolean lookupSurfaceForm(CharSequence token) { |
| lemmaList = this.stemmer.lookup(token); |
| lemmaListIndex = 0; |
| return lemmaList.size() > 0; |
| } |
| |
| /** Retrieves the next token (possibly from the list of lemmas). */ |
| @Override |
| public final boolean incrementToken() throws IOException { |
| if (lemmaListIndex < lemmaList.size()) { |
| restoreState(current); |
| posIncrAtt.setPositionIncrement(0); |
| popNextLemma(); |
| return true; |
| } else if (this.input.incrementToken()) { |
| if (!keywordAttr.isKeyword() && |
| (lookupSurfaceForm(termAtt) || lookupSurfaceForm(toLowercase(termAtt)))) { |
| current = captureState(); |
| popNextLemma(); |
| } else { |
| tagsAtt.clear(); |
| } |
| return true; |
| } else { |
| return false; |
| } |
| } |
| |
| /** |
| * Convert to lowercase in-place. |
| */ |
| private CharSequence toLowercase(CharSequence chs) { |
| final int length = scratch.length = chs.length(); |
| scratch.grow(length); |
| |
| char buffer[] = scratch.chars; |
| for (int i = 0; i < length;) { |
| i += Character.toChars( |
| Character.toLowerCase(charUtils.codePointAt(chs, i)), buffer, i); |
| } |
| |
| return scratch; |
| } |
| |
| /** Resets stems accumulator and hands over to superclass. */ |
| @Override |
| public void reset() throws IOException { |
| lemmaListIndex = 0; |
| lemmaList = Collections.emptyList(); |
| tagsList.clear(); |
| super.reset(); |
| } |
| } |