// blob: 522e4c0bb13c84ec401d2b8ae6857bf01dc39681 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.analysis.miscellaneous;
import java.io.IOException;
import java.util.Arrays;
import java.util.Comparator;
import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.apache.lucene.util.AttributeSource;
/**
 * Filter outputs a single token which is a concatenation of the sorted and de-duplicated set of
 * input tokens. This can be useful for clustering/linking use cases.
 *
 * <p>The whole input stream is consumed by the first call to {@link #incrementToken()}. At most
 * one token, of type {@code "fingerprint"}, is emitted; nothing is emitted when the input has no
 * tokens or when the combined length of the unique terms and their separators exceeds the
 * configured maximum.
 */
public class FingerprintFilter extends TokenFilter {
  /** Default limit on the length of the emitted token. */
  public static final int DEFAULT_MAX_OUTPUT_TOKEN_SIZE = 1024;

  /** Default character placed between the combined terms. */
  public static final char DEFAULT_SEPARATOR = ' ';

  /**
   * Orders {@code char[]} terms by char value at the first differing index; when one term is a
   * prefix of the other, the shorter term sorts first. Shared so that no comparator is allocated
   * per call.
   */
  private static final Comparator<Object> TERM_ORDER =
      new Comparator<Object>() {
        @Override
        public int compare(Object o1, Object o2) {
          final char[] left = (char[]) o1;
          final char[] right = (char[]) o2;
          final int limit = Math.min(left.length, right.length);
          for (int i = 0; i < limit; i++) {
            if (left[i] != right[i]) {
              // char values are at most 16 bits wide, so the subtraction cannot overflow.
              return left[i] - right[i];
            }
          }
          return left.length - right.length;
        }
      };

  private final CharTermAttribute termAttribute = addAttribute(CharTermAttribute.class);
  private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
  private final PositionIncrementAttribute posIncrAtt =
      addAttribute(PositionIncrementAttribute.class);
  private final PositionLengthAttribute posLenAtt = addAttribute(PositionLengthAttribute.class);
  private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class);

  /** Terms seen so far while gathering the input; only populated during gathering. */
  private CharArraySet uniqueTerms = null;

  private final int maxOutputTokenSize;

  /** Attribute state captured after the gathering pass; restored by {@link #end()}. */
  private AttributeSource.State finalState;

  private final char separator;

  /** True once {@code input.end()} has been called for the current stream. */
  private boolean inputEnded = false;

  /** Create a new FingerprintFilter with default settings */
  public FingerprintFilter(TokenStream input) {
    this(input, DEFAULT_MAX_OUTPUT_TOKEN_SIZE, DEFAULT_SEPARATOR);
  }

  /**
   * Create a new FingerprintFilter with control over all settings
   *
   * @param input the source of tokens to be summarized into a single token
   * @param maxOutputTokenSize the maximum length of the summarized output token. If exceeded, no
   *     output token is emitted
   * @param separator the character used to separate tokens combined into the single output token
   */
  public FingerprintFilter(TokenStream input, int maxOutputTokenSize, char separator) {
    super(input);
    this.maxOutputTokenSize = maxOutputTokenSize;
    this.separator = separator;
  }

  /**
   * Consumes the whole input on the first call and emits the fingerprint token, if any. Every
   * later call (until {@link #reset()}) returns {@code false}.
   *
   * @return {@code true} if the fingerprint token was produced; {@code false} otherwise
   * @throws IOException if reading from or ending the input fails
   */
  @Override
  public final boolean incrementToken() throws IOException {
    if (inputEnded) {
      return false;
    }
    boolean result = buildSingleOutputToken();
    // Captured after the output attributes are set; end() restores exactly this state.
    finalState = captureState();
    return result;
  }

  /**
   * Gathers all tokens from input, de-duplicates, sorts then concatenates.
   *
   * @return false for end of stream; true otherwise
   * @throws IOException if reading from or ending the input fails
   */
  private boolean buildSingleOutputToken() throws IOException {
    char[] clonedLastTerm = null;
    uniqueTerms = new CharArraySet(8, false);
    int outputTokenSize = 0;
    while (input.incrementToken()) {
      if (outputTokenSize > maxOutputTokenSize) {
        // Already over the limit: keep draining the input but stop collecting terms.
        continue;
      }
      final char[] term = termAttribute.buffer();
      final int length = termAttribute.length();
      if (!uniqueTerms.contains(term, 0, length)) {
        // Copy exactly the first `length` chars of the term and record it as seen.
        clonedLastTerm = Arrays.copyOf(term, length);
        if (uniqueTerms.size() > 0) {
          outputTokenSize++; // Add 1 for the separator char we will output
        }
        uniqueTerms.add(clonedLastTerm);
        outputTokenSize += length;
      }
    }
    // Force end-of-stream operations to get the final state.
    input.end();
    inputEnded = true;

    // Gathering complete - now output exactly zero or one token.
    // Set the attributes for the single output token.
    offsetAtt.setOffset(0, offsetAtt.endOffset());
    posLenAtt.setPositionLength(1);
    posIncrAtt.setPositionIncrement(1);
    typeAtt.setType("fingerprint");

    // No tokens gathered - no output
    if (uniqueTerms.size() < 1) {
      termAttribute.setEmpty();
      return false;
    }

    // Tokens gathered are too large - no output
    if (outputTokenSize > maxOutputTokenSize) {
      termAttribute.setEmpty();
      uniqueTerms.clear();
      return false;
    }

    // Special case - a single term needs no sorting; copy it without an intermediate String.
    if (uniqueTerms.size() == 1) {
      termAttribute.copyBuffer(clonedLastTerm, 0, clonedLastTerm.length);
      uniqueTerms.clear();
      return true;
    }

    // Sort the set of deduplicated tokens and combine
    final Object[] items = uniqueTerms.toArray();
    Arrays.sort(items, TERM_ORDER);

    // The limit was never exceeded, so every unique term was collected and outputTokenSize is
    // the sum of the term lengths plus one separator between each adjacent pair. Write straight
    // into the term buffer instead of going through a StringBuilder.
    final char[] out = termAttribute.resizeBuffer(outputTokenSize);
    int pos = 0;
    for (int i = 0; i < items.length; i++) {
      // Separate by item index rather than by output length so that an empty term (which sorts
      // first) is still followed by its separator, matching the size accounted for above.
      if (i > 0) {
        out[pos++] = separator;
      }
      final char[] item = (char[]) items[i];
      System.arraycopy(item, 0, out, pos, item.length);
      pos += item.length;
    }
    termAttribute.setLength(pos);
    uniqueTerms.clear();
    return true;
  }

  /**
   * Ends the input if that has not happened yet, then restores the attribute state captured by
   * the last {@link #incrementToken()} call on the current stream, if there is one.
   *
   * @throws IOException if ending the input fails
   */
  @Override
  public final void end() throws IOException {
    if (!inputEnded) {
      // Rare case - If an IOException occurs while performing buildSingleOutputToken
      // we may not have called input.end() already
      input.end();
      inputEnded = true;
    }
    if (finalState != null) {
      restoreState(finalState);
    }
  }

  /**
   * Prepares the filter for a new stream.
   *
   * @throws IOException if resetting the input fails
   */
  @Override
  public void reset() throws IOException {
    super.reset();
    inputEnded = false;
    uniqueTerms = null;
    // Drop the state captured for the previous stream; otherwise end() would restore it if this
    // filter is reused and end() is reached before incrementToken() captures a new state.
    finalState = null;
  }
}