blob: ea6d36255af2cb3eb75e76a7c1f33eb9811fdc05 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.analysis.phonetic;
import java.io.IOException;
import java.util.LinkedList;
import org.apache.commons.codec.language.DoubleMetaphone;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
/**
* Filter for DoubleMetaphone (supporting secondary codes)
*/
public final class DoubleMetaphoneFilter extends TokenFilter {
private static final String TOKEN_TYPE = "DoubleMetaphone";
private final LinkedList<State> remainingTokens = new LinkedList<>();
private final DoubleMetaphone encoder = new DoubleMetaphone();
private final boolean inject;
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
private final PositionIncrementAttribute posAtt = addAttribute(PositionIncrementAttribute.class);
/** Creates a DoubleMetaphoneFilter with the specified maximum code length,
* and either adding encoded forms as synonyms (<code>inject=true</code>) or
* replacing them.
*/
public DoubleMetaphoneFilter(TokenStream input, int maxCodeLength, boolean inject) {
super(input);
this.encoder.setMaxCodeLen(maxCodeLength);
this.inject = inject;
}
@Override
public boolean incrementToken() throws IOException {
for(;;) {
if (!remainingTokens.isEmpty()) {
// clearAttributes(); // not currently necessary
restoreState(remainingTokens.removeFirst());
return true;
}
if (!input.incrementToken()) return false;
int len = termAtt.length();
if (len==0) return true; // pass through zero length terms
int firstAlternativeIncrement = inject ? 0 : posAtt.getPositionIncrement();
String v = termAtt.toString();
String primaryPhoneticValue = encoder.doubleMetaphone(v);
String alternatePhoneticValue = encoder.doubleMetaphone(v, true);
// a flag to lazily save state if needed... this avoids a save/restore when only
// one token will be generated.
boolean saveState=inject;
if (primaryPhoneticValue!=null && primaryPhoneticValue.length() > 0 && !primaryPhoneticValue.equals(v)) {
if (saveState) {
remainingTokens.addLast(captureState());
}
posAtt.setPositionIncrement( firstAlternativeIncrement );
firstAlternativeIncrement = 0;
termAtt.setEmpty().append(primaryPhoneticValue);
saveState = true;
}
if (alternatePhoneticValue!=null && alternatePhoneticValue.length() > 0
&& !alternatePhoneticValue.equals(primaryPhoneticValue)
&& !primaryPhoneticValue.equals(v)) {
if (saveState) {
remainingTokens.addLast(captureState());
saveState = false;
}
posAtt.setPositionIncrement( firstAlternativeIncrement );
termAtt.setEmpty().append(alternatePhoneticValue);
saveState = true;
}
// Just one token to return, so no need to capture/restore
// any state, simply return it.
if (remainingTokens.isEmpty()) {
return true;
}
if (saveState) {
remainingTokens.addLast(captureState());
}
}
}
@Override
public void reset() throws IOException {
input.reset();
remainingTokens.clear();
}
}