blob: a615226cd7362654c9e3724283756538fc8f030b [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.analysis.icu;
import com.ibm.icu.text.Replaceable;
import com.ibm.icu.text.Transliterator;
import com.ibm.icu.text.UTF16;
import com.ibm.icu.text.UnicodeSet;
import java.io.IOException;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
/**
* A {@link TokenFilter} that transforms text with ICU.
*
* <p>ICU provides text-transformation functionality via its Transliteration API. Although script
* conversion is its most common use, a Transliterator can actually perform a more general class of
* tasks. In fact, Transliterator defines a very general API which specifies only that a segment of
* the input text is replaced by new text. The particulars of this conversion are determined
* entirely by subclasses of Transliterator.
*
* <p>Some useful transformations for search are built-in:
*
* <ul>
* <li>Conversion from Traditional to Simplified Chinese characters
* <li>Conversion from Hiragana to Katakana
* <li>Conversion from Fullwidth to Halfwidth forms.
* <li>Script conversions, for example Serbian Cyrillic to Latin
* </ul>
*
* <p>Example usage:
*
* <blockquote>
*
* stream = new ICUTransformFilter(stream, Transliterator.getInstance("Traditional-Simplified"));
*
* </blockquote>
*
* <br>
* For more details, see the <a href="http://userguide.icu-project.org/transforms/general">ICU User
* Guide</a>.
*/
public final class ICUTransformFilter extends TokenFilter {
// Transliterator to transform the text
private final Transliterator transform;
// Reusable position object
private final Transliterator.Position position = new Transliterator.Position();
// term attribute, will be updated with transformed text.
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
// Wraps a termAttribute around the replaceable interface.
private final ReplaceableTermAttribute replaceableAttribute = new ReplaceableTermAttribute();
/**
* Create a new ICUTransformFilter that transforms text on the given stream.
*
* @param input {@link TokenStream} to filter.
* @param transform Transliterator to transform the text.
*/
@SuppressWarnings("deprecation")
public ICUTransformFilter(TokenStream input, Transliterator transform) {
super(input);
this.transform = transform;
/*
* This is cheating, but speeds things up a lot.
* If we wanted to use pkg-private APIs we could probably do better.
*/
if (transform.getFilter() == null
&& transform instanceof com.ibm.icu.text.RuleBasedTransliterator) {
final UnicodeSet sourceSet = transform.getSourceSet();
if (sourceSet != null && !sourceSet.isEmpty()) transform.setFilter(sourceSet);
}
}
@Override
public boolean incrementToken() throws IOException {
/*
* Wrap around replaceable. clear the positions, and transliterate.
*/
if (input.incrementToken()) {
replaceableAttribute.setText(termAtt);
final int length = termAtt.length();
position.start = 0;
position.limit = length;
position.contextStart = 0;
position.contextLimit = length;
transform.filteredTransliterate(replaceableAttribute, position, false);
return true;
} else {
return false;
}
}
/** Wrap a {@link CharTermAttribute} with the Replaceable API. */
static final class ReplaceableTermAttribute implements Replaceable {
private char buffer[];
private int length;
private CharTermAttribute token;
void setText(final CharTermAttribute token) {
this.token = token;
this.buffer = token.buffer();
this.length = token.length();
}
@Override
public int char32At(int pos) {
return UTF16.charAt(buffer, 0, length, pos);
}
@Override
public char charAt(int pos) {
return buffer[pos];
}
@Override
public void copy(int start, int limit, int dest) {
char text[] = new char[limit - start];
getChars(start, limit, text, 0);
replace(dest, dest, text, 0, limit - start);
}
@Override
public void getChars(int srcStart, int srcLimit, char[] dst, int dstStart) {
System.arraycopy(buffer, srcStart, dst, dstStart, srcLimit - srcStart);
}
@Override
public boolean hasMetaData() {
return false;
}
@Override
public int length() {
return length;
}
@Override
public void replace(int start, int limit, String text) {
final int charsLen = text.length();
final int newLength = shiftForReplace(start, limit, charsLen);
// insert the replacement text
text.getChars(0, charsLen, buffer, start);
token.setLength(length = newLength);
}
@Override
public void replace(int start, int limit, char[] text, int charsStart, int charsLen) {
// shift text if necessary for the replacement
final int newLength = shiftForReplace(start, limit, charsLen);
// insert the replacement text
System.arraycopy(text, charsStart, buffer, start, charsLen);
token.setLength(length = newLength);
}
/** shift text (if necessary) for a replacement operation */
private int shiftForReplace(int start, int limit, int charsLen) {
final int replacementLength = limit - start;
final int newLength = length - replacementLength + charsLen;
// resize if necessary
if (newLength > length) buffer = token.resizeBuffer(newLength);
// if the substring being replaced is longer or shorter than the
// replacement, need to shift things around
if (replacementLength != charsLen && limit < length)
System.arraycopy(buffer, limit, buffer, start + charsLen, length - limit);
return newLength;
}
}
}