blob: 049dad1570e6dd41cbe39d1948ab0942c1cecd77 [file] [log] [blame]
// -*- c-basic-offset: 2 -*-
package org.apache.lucene.analysis.morfologik;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import java.util.*;
import java.util.regex.Pattern;
import morfologik.stemming.*;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.util.CharacterUtils;
import org.apache.lucene.util.*;
/**
* {@link TokenFilter} using Morfologik library to transform input tokens into lemma and
* morphosyntactic (POS) tokens. Applies to Polish only.
*
* <p>MorfologikFilter contains a {@link MorphosyntacticTagsAttribute}, which provides morphosyntactic
* annotations for produced lemmas. See the Morfologik documentation for details.</p>
*
* @see <a href="http://morfologik.blogspot.com/">Morfologik project page</a>
*/
public class MorfologikFilter extends TokenFilter {

  /**
   * Matches the separators (<code>+</code> or <code>|</code>) between individual tags
   * inside one tag string returned for a lemma. Compiled once so that the regular
   * expression is not recompiled for every lemma.
   */
  private static final Pattern TAG_SEPARATOR = Pattern.compile("\\+|\\|");

  /**
   * The tag encoding format has been changing in Morfologik from version
   * to version. Let's keep both variants and determine which one to run
   * based on this flag.
   */
  private static final boolean multipleTagsPerLemma = true;

  /** Term text of the current token; overwritten with each emitted lemma. */
  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);

  /** Receives the tags that belong to the lemma currently held in {@link #termAtt}. */
  private final MorphosyntacticTagsAttribute tagsAtt = addAttribute(MorphosyntacticTagsAttribute.class);

  /** Set to zero for every lemma after the first one emitted for an input token. */
  private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class);

  /** Tokens flagged as keywords are passed through without a dictionary lookup. */
  private final KeywordAttribute keywordAttr = addAttribute(KeywordAttribute.class);

  /** Reusable buffer that {@link #toLowercase(CharSequence)} writes into. */
  private final CharsRef scratch = new CharsRef(0);

  /** Code point access helper obtained for the Lucene version given to the constructor. */
  private final CharacterUtils charUtils;

  /** Attribute state captured at the first lemma of a token; restored before each further lemma. */
  private State current;

  /** The wrapped stream; the same instance that is handed to the superclass constructor. */
  private final TokenStream input;

  /** Dictionary lookup used to resolve surface forms. */
  private final IStemmer stemmer;

  /** Result of the most recent lookup; empty when there is nothing left to emit. */
  private List<WordData> lemmaList;

  /** Pool of reusable tag buffers; a prefix view of it is handed to {@link #tagsAtt}. */
  private final ArrayList<StringBuilder> tagsList = new ArrayList<StringBuilder>();

  /** Index of the next entry of {@link #lemmaList} to emit. */
  private int lemmaListIndex;

  /**
   * Creates MorfologikFilter
   * @param in input token stream
   * @param version Lucene version compatibility for lowercasing.
   */
  public MorfologikFilter(final TokenStream in, final Version version) {
    super(in);
    this.input = in;

    // SOLR-4007: temporarily substitute context class loader to allow finding dictionary resources.
    final Thread me = Thread.currentThread();
    final ClassLoader cl = me.getContextClassLoader();
    try {
      me.setContextClassLoader(PolishStemmer.class.getClassLoader());
      this.stemmer = new PolishStemmer();
      this.charUtils = CharacterUtils.getInstance(version);
      this.lemmaList = Collections.emptyList();
    } finally {
      // Always put the caller's context class loader back, even if construction failed.
      me.setContextClassLoader(cl);
    }
  }

  /**
   * Emits the next pending entry of {@link #lemmaList}: writes its stem into
   * {@link #termAtt}, its tags into {@link #tagsAtt}, and advances
   * {@link #lemmaListIndex}. Callers must ensure an entry is available.
   */
  private void popNextLemma() {
    if (multipleTagsPerLemma) {
      // One tag (concatenated) per lemma.
      final WordData lemma = lemmaList.get(lemmaListIndex++);
      termAtt.setEmpty().append(lemma.getStem());
      final CharSequence tag = lemma.getTag();
      if (tag != null) {
        final String[] tags = TAG_SEPARATOR.split(tag.toString());
        for (int i = 0; i < tags.length; i++) {
          // Grow the buffer pool lazily; existing buffers are recycled.
          if (tagsList.size() <= i) {
            tagsList.add(new StringBuilder());
          }
          final StringBuilder buffer = tagsList.get(i);
          buffer.setLength(0);
          buffer.append(tags[i]);
        }
        // Expose only the buffers filled for this lemma.
        tagsAtt.setTags(tagsList.subList(0, tags.length));
      } else {
        tagsAtt.setTags(Collections.<StringBuilder> emptyList());
      }
    } else {
      // One tag (concatenated) per stem (lemma repeated).
      CharSequence currentStem;
      int tags = 0;
      do {
        final WordData lemma = lemmaList.get(lemmaListIndex++);
        currentStem = lemma.getStem();
        final CharSequence tag = lemma.getTag();
        if (tag != null) {
          if (tagsList.size() <= tags) {
            tagsList.add(new StringBuilder());
          }
          final StringBuilder buffer = tagsList.get(tags++);
          buffer.setLength(0);
          buffer.append(tag);
        }
        // Keep consuming entries for as long as the following one has the same stem.
      } while (lemmaListIndex < lemmaList.size() &&
               equalCharSequences(lemmaList.get(lemmaListIndex).getStem(), currentStem));

      // Set the lemma's base form and tags as attributes.
      termAtt.setEmpty().append(currentStem);
      tagsAtt.setTags(tagsList.subList(0, tags));
    }
  }

  /**
   * Compare two char sequences for equality. Assumes non-null arguments.
   *
   * @return <code>true</code> if both sequences have the same length and the same
   *         character at every index
   */
  private static boolean equalCharSequences(CharSequence s1, CharSequence s2) {
    final int len1 = s1.length();
    final int len2 = s2.length();
    if (len1 != len2) {
      return false;
    }
    for (int i = len1; --i >= 0;) {
      if (s1.charAt(i) != s2.charAt(i)) {
        return false;
      }
    }
    return true;
  }

  /**
   * Lookup a given surface form of a token and update
   * {@link #lemmaList} and {@link #lemmaListIndex} accordingly.
   *
   * @return <code>true</code> if the lookup produced at least one entry
   */
  private boolean lookupSurfaceForm(CharSequence token) {
    lemmaList = this.stemmer.lookup(token);
    lemmaListIndex = 0;
    return lemmaList.size() > 0;
  }

  /**
   * Retrieves the next token (possibly from the list of lemmas).
   *
   * <p>While lemmas of the previous input token are pending, each one is emitted on top
   * of the captured token state with a position increment of zero. Otherwise the next
   * input token is read; unless it is marked as a keyword, it is looked up as-is and,
   * failing that, lowercased. A token with no lookup result is passed through with its
   * tags cleared.</p>
   *
   * @return <code>false</code> once the wrapped stream is exhausted and no lemma is pending
   */
  @Override
  public final boolean incrementToken() throws IOException {
    if (lemmaListIndex < lemmaList.size()) {
      restoreState(current);
      posIncrAtt.setPositionIncrement(0);
      popNextLemma();
      return true;
    } else if (this.input.incrementToken()) {
      if (!keywordAttr.isKeyword() &&
          (lookupSurfaceForm(termAtt) || lookupSurfaceForm(toLowercase(termAtt)))) {
        // Capture before the term is overwritten so later lemmas start from the original token.
        current = captureState();
        popNextLemma();
      } else {
        tagsAtt.clear();
      }
      return true;
    } else {
      return false;
    }
  }

  /**
   * Writes a lowercased copy of the given sequence into the reusable {@link #scratch}
   * buffer, one code point at a time, and returns that buffer. The argument itself is
   * not modified; the returned object is overwritten by the next call.
   */
  private CharSequence toLowercase(CharSequence chs) {
    final int length = scratch.length = chs.length();
    scratch.grow(length);

    // NOTE(review): the same index is used for reading and writing, which presumes that
    // lowercasing a code point never changes its number of chars -- confirm.
    final char[] buffer = scratch.chars;
    for (int i = 0; i < length;) {
      i += Character.toChars(
          Character.toLowerCase(charUtils.codePointAt(chs, i)), buffer, i);
    }

    return scratch;
  }

  /** Resets stems accumulator and hands over to superclass. */
  @Override
  public void reset() throws IOException {
    lemmaListIndex = 0;
    lemmaList = Collections.emptyList();
    tagsList.clear();
    super.reset();
  }
}