blob: 7d79b8427cc5212300496bb799a036124f58248d [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.analysis.cjk;
import java.io.IOException;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.apache.lucene.util.ArrayUtil;
/**
* Forms bigrams of CJK terms that are generated from StandardTokenizer
* or ICUTokenizer.
* <p>
* CJK types are set by these tokenizers, but you can also use
* {@link #CJKBigramFilter(TokenStream, int)} to explicitly control which
* of the CJK scripts are turned into bigrams.
* <p>
* By default, when a CJK character has no adjacent characters to form
* a bigram, it is output in unigram form. If you want to always output
* both unigrams and bigrams, set the <code>outputUnigrams</code>
* flag in {@link CJKBigramFilter#CJKBigramFilter(TokenStream, int, boolean)}.
* This can be used for a combined unigram+bigram approach.
* <p>
* Unlike ICUTokenizer, StandardTokenizer does not split at script boundaries.
* Korean Hangul characters are treated the same as many other scripts'
* letters, and as a result, StandardTokenizer can produce tokens that mix
* Hangul and non-Hangul characters, e.g. "한국abc". Such mixed-script tokens
* are typed as <code>&lt;ALPHANUM&gt;</code> rather than
* <code>&lt;HANGUL&gt;</code>, and as a result, will not be converted to
* bigrams by CJKBigramFilter.
*
* In all cases, all non-CJK input is passed through unmodified.
*/
public final class CJKBigramFilter extends TokenFilter {
  // configuration
  /** bigram flag for Han Ideographs */
  public static final int HAN = 1;
  /** bigram flag for Hiragana */
  public static final int HIRAGANA = 2;
  /** bigram flag for Katakana */
  public static final int KATAKANA = 4;
  /** bigram flag for Hangul */
  public static final int HANGUL = 8;

  /** when we emit a bigram, it's then marked as this type */
  public static final String DOUBLE_TYPE = "<DOUBLE>";
  /** when we emit a unigram, it's then marked as this type */
  public static final String SINGLE_TYPE = "<SINGLE>";

  // the types from standardtokenizer
  private static final String HAN_TYPE = StandardTokenizer.TOKEN_TYPES[StandardTokenizer.IDEOGRAPHIC];
  private static final String HIRAGANA_TYPE = StandardTokenizer.TOKEN_TYPES[StandardTokenizer.HIRAGANA];
  private static final String KATAKANA_TYPE = StandardTokenizer.TOKEN_TYPES[StandardTokenizer.KATAKANA];
  private static final String HANGUL_TYPE = StandardTokenizer.TOKEN_TYPES[StandardTokenizer.HANGUL];

  // sentinel value for ignoring a script
  private static final Object NO = new Object();

  // these are set to either their type or NO if we want to pass them thru
  // (compared by reference identity against typeAtt.type() in incrementToken;
  // this works because the TOKEN_TYPES entries are the interned String instances
  // the tokenizers themselves set)
  private final Object doHan;
  private final Object doHiragana;
  private final Object doKatakana;
  private final Object doHangul;

  // true if we should output unigram tokens always
  private final boolean outputUnigrams;
  private boolean ngramState; // false = output unigram, true = output bigram

  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
  private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class);
  private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
  private final PositionIncrementAttribute posIncAtt = addAttribute(PositionIncrementAttribute.class);
  private final PositionLengthAttribute posLengthAtt = addAttribute(PositionLengthAttribute.class);

  // buffers containing codepoint and offsets in parallel
  int buffer[] = new int[8];
  int startOffset[] = new int[8];
  int endOffset[] = new int[8];
  // length of valid buffer
  int bufferLen;
  // current buffer index
  int index;

  // the last end offset, to determine if we should bigram across tokens
  int lastEndOffset;

  // true once the wrapped input has returned false; prevents calling
  // input.incrementToken() again after end-of-stream
  private boolean exhausted;

  /**
   * Calls {@link CJKBigramFilter#CJKBigramFilter(TokenStream, int)
   * CJKBigramFilter(in, HAN | HIRAGANA | KATAKANA | HANGUL)}
   */
  public CJKBigramFilter(TokenStream in) {
    this(in, HAN | HIRAGANA | KATAKANA | HANGUL);
  }

  /**
   * Calls {@link CJKBigramFilter#CJKBigramFilter(TokenStream, int, boolean)
   * CJKBigramFilter(in, flags, false)}
   */
  public CJKBigramFilter(TokenStream in, int flags) {
    this(in, flags, false);
  }

  /**
   * Create a new CJKBigramFilter, specifying which writing systems should be bigrammed,
   * and whether or not unigrams should also be output.
   * @param flags OR'ed set from {@link CJKBigramFilter#HAN}, {@link CJKBigramFilter#HIRAGANA},
   * {@link CJKBigramFilter#KATAKANA}, {@link CJKBigramFilter#HANGUL}
   * @param outputUnigrams true if unigrams for the selected writing systems should also be output.
   * when this is false, this is only done when there are no adjacent characters to form
   * a bigram.
   */
  public CJKBigramFilter(TokenStream in, int flags, boolean outputUnigrams) {
    super(in);
    doHan = (flags & HAN) == 0 ? NO : HAN_TYPE;
    doHiragana = (flags & HIRAGANA) == 0 ? NO : HIRAGANA_TYPE;
    doKatakana = (flags & KATAKANA) == 0 ? NO : KATAKANA_TYPE;
    doHangul = (flags & HANGUL) == 0 ? NO : HANGUL_TYPE;
    this.outputUnigrams = outputUnigrams;
  }

  /*
   * much of this complexity revolves around handling the special case of a
   * "lone cjk character" where cjktokenizer would output a unigram. this
   * is also the only time we ever have to captureState.
   */
  @Override
  public boolean incrementToken() throws IOException {
    while (true) {
      if (hasBufferedBigram()) {

        // case 1: we have multiple remaining codepoints buffered,
        // so we can emit a bigram here.

        if (outputUnigrams) {

          // when also outputting unigrams, we output the unigram first,
          // then rewind back to revisit the bigram.
          // so an input of ABC is A + (rewind)AB + B + (rewind)BC + C
          // the logic in hasBufferedUnigram ensures we output the C,
          // even though it did actually have adjacent CJK characters.

          if (ngramState) {
            flushBigram();
          } else {
            flushUnigram();
            // flushUnigram() advanced index; step back so the bigram
            // starting at this same codepoint is emitted on the next call
            index--;
          }
          // alternate between unigram and bigram output
          ngramState = !ngramState;
        } else {
          flushBigram();
        }
        return true;
      } else if (doNext()) {

        // case 2: look at the token type. should we form any n-grams?

        String type = typeAtt.type();
        if (type == doHan || type == doHiragana || type == doKatakana || type == doHangul) {

          // acceptable CJK type: we form n-grams from these.
          // as long as the offsets are aligned, we just add these to our current buffer.
          // otherwise, we clear the buffer and start over.

          if (offsetAtt.startOffset() != lastEndOffset) { // unaligned, clear queue
            if (hasBufferedUnigram()) {

              // we have a buffered unigram, and we peeked ahead to see if we could form
              // a bigram, but we can't, because the offsets are unaligned. capture the state
              // of this peeked data to be revisited next time thru the loop, and dump our unigram.

              loneState = captureState();
              flushUnigram();
              return true;
            }
            index = 0;
            bufferLen = 0;
          }
          refill();
        } else {

          // not a CJK type: we just return these as-is.

          if (hasBufferedUnigram()) {

            // we have a buffered unigram, and we peeked ahead to see if we could form
            // a bigram, but we can't, because it's not a CJK type. capture the state
            // of this peeked data to be revisited next time thru the loop, and dump our unigram.

            loneState = captureState();
            flushUnigram();
            return true;
          }
          return true;
        }
      } else {

        // case 3: we have only zero or 1 codepoints buffered,
        // so not enough to form a bigram. But, we also have no
        // more input. So if we have a buffered codepoint, emit
        // a unigram, otherwise, it's end of stream.

        if (hasBufferedUnigram()) {
          flushUnigram(); // flush our remaining unigram
          return true;
        }
        return false;
      }
    }
  }

  private State loneState; // rarely used: only for "lone cjk characters", where we emit unigrams

  /**
   * looks at next input token, returning false if none is available
   */
  private boolean doNext() throws IOException {
    if (loneState != null) {
      // replay the token we peeked at (and captured) on a previous call
      restoreState(loneState);
      loneState = null;
      return true;
    } else {
      if (exhausted) {
        return false;
      } else if (input.incrementToken()) {
        return true;
      } else {
        exhausted = true;
        return false;
      }
    }
  }

  /**
   * refills buffers with new data from the current token.
   */
  private void refill() {

    // compact buffers to keep them smallish if they become large
    // just a safety check, but technically we only need the last codepoint

    if (bufferLen > 64) {
      int last = bufferLen - 1;
      buffer[0] = buffer[last];
      startOffset[0] = startOffset[last];
      endOffset[0] = endOffset[last];
      bufferLen = 1;
      // keep index pointing at the same logical codepoint after compaction
      index -= last;
    }

    char termBuffer[] = termAtt.buffer();
    int len = termAtt.length();
    int start = offsetAtt.startOffset();
    int end = offsetAtt.endOffset();

    int newSize = bufferLen + len;
    buffer = ArrayUtil.grow(buffer, newSize);
    startOffset = ArrayUtil.grow(startOffset, newSize);
    endOffset = ArrayUtil.grow(endOffset, newSize);
    lastEndOffset = end;

    if (end - start != len) {

      // crazy offsets (modified by synonym or charfilter): just preserve
      // the token's original start/end for every codepoint, since per-codepoint
      // offsets can't be reconstructed

      for (int i = 0, cp = 0; i < len; i += Character.charCount(cp)) {
        cp = buffer[bufferLen] = Character.codePointAt(termBuffer, i, len);
        startOffset[bufferLen] = start;
        endOffset[bufferLen] = end;
        bufferLen++;
      }
    } else {

      // normal offsets: each codepoint's offsets advance by its char count
      // (so supplementary characters span two code units)

      for (int i = 0, cp = 0, cpLen = 0; i < len; i += cpLen) {
        cp = buffer[bufferLen] = Character.codePointAt(termBuffer, i, len);
        cpLen = Character.charCount(cp);
        startOffset[bufferLen] = start;
        start = endOffset[bufferLen] = start + cpLen;
        bufferLen++;
      }
    }
  }

  /**
   * Flushes a bigram token to output from our buffer
   * This is the normal case, e.g. ABC -&gt; AB BC
   */
  private void flushBigram() {
    clearAttributes();
    char termBuffer[] = termAtt.resizeBuffer(4); // maximum bigram length in code units (2 supplementaries)
    int len1 = Character.toChars(buffer[index], termBuffer, 0);
    int len2 = len1 + Character.toChars(buffer[index+1], termBuffer, len1);
    termAtt.setLength(len2);
    offsetAtt.setOffset(startOffset[index], endOffset[index+1]);
    typeAtt.setType(DOUBLE_TYPE);
    // when outputting unigrams, all bigrams are synonyms that span two unigrams
    if (outputUnigrams) {
      posIncAtt.setPositionIncrement(0);
      posLengthAtt.setPositionLength(2);
    }
    // advance by one codepoint only, so consecutive bigrams overlap (AB, BC, ...)
    index++;
  }

  /**
   * Flushes a unigram token to output from our buffer.
   * This happens when we encounter isolated CJK characters, either the whole
   * CJK string is a single character, or we encounter a CJK character surrounded
   * by space, punctuation, english, etc, but not beside any other CJK.
   */
  private void flushUnigram() {
    clearAttributes();
    char termBuffer[] = termAtt.resizeBuffer(2); // maximum unigram length (2 surrogates)
    int len = Character.toChars(buffer[index], termBuffer, 0);
    termAtt.setLength(len);
    offsetAtt.setOffset(startOffset[index], endOffset[index]);
    typeAtt.setType(SINGLE_TYPE);
    index++;
  }

  /**
   * True if we have multiple codepoints sitting in our buffer
   */
  private boolean hasBufferedBigram() {
    return bufferLen - index > 1;
  }

  /**
   * True if we have a single codepoint sitting in our buffer, where its future
   * (whether it is emitted as unigram or forms a bigram) depends upon not-yet-seen
   * inputs.
   */
  private boolean hasBufferedUnigram() {
    if (outputUnigrams) {
      // when outputting unigrams always
      return bufferLen - index == 1;
    } else {
      // otherwise it's only when we have a lone CJK character
      return bufferLen == 1 && index == 0;
    }
  }

  @Override
  public void reset() throws IOException {
    super.reset();
    // reset all per-stream state so the filter can be reused on a new input
    bufferLen = 0;
    index = 0;
    lastEndOffset = 0;
    loneState = null;
    exhausted = false;
    ngramState = false;
  }
}