blob: d4a3f8c53003fff0224192c09fd3e886a5b640b3 [file] [log] [blame]
package org.apache.lucene.analysis.stages;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.lucene.analysis.CharFilter;
import org.apache.lucene.analysis.stages.attributes.ArcAttribute;
import org.apache.lucene.analysis.stages.attributes.DeletedAttribute;
import org.apache.lucene.analysis.stages.attributes.OffsetAttribute;
import org.apache.lucene.analysis.stages.attributes.TermAttribute;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.FixedBitSet;
import java.io.IOException;
import java.io.Reader;
/**
 * Uses a CharFilter to detect when punctuation occurs in the
 * input in between two tokens, and then as a Stage it will
 * re-insert [deleted] tokens when it notices the tokenizer
 * had deleted the punctuation. E.g. this can be used to
 * prevent synonyms/phrases from matching across punctuation.
 *
 * <p>For every token pulled from the previous stage, the characters between
 * the end offset of the prior token and the start offset of the current one
 * are checked. If any of them was punctuation, a synthetic token carrying
 * {@code punctToken} and marked deleted is emitted first, and the real token
 * is emitted on the following call to {@link #next()}. Arc node ids of all
 * later tokens are shifted by the number of synthetic tokens inserted so far.
 *
 * <p>Punctuation positions are counted in characters read through the
 * wrapping char filter, while the lookup uses the offsets reported by the
 * previous stage; the two are only comparable when no other component
 * changes offsets (see the nocommit note in {@link #reset(Reader)}).
 */
public class InsertDeletedPunctuationStage extends Stage {

  // Attributes read from the previous stage.
  private final DeletedAttribute delAttIn;
  private final ArcAttribute arcAttIn;
  private final TermAttribute termAttIn;
  private final OffsetAttribute offsetAttIn;

  // Attributes this stage publishes.
  private final ArcAttribute arcAttOut;
  private final DeletedAttribute delAttOut;
  private final TermAttribute termAttOut;
  private final OffsetAttribute offsetAttOut;

  /** Term text used for every inserted punctuation token. */
  private final String punctToken;

  /** Char filter wrapped around the current input; replaced on every reset. */
  private FindPunctuationCharFilter charFilter;

  /**
   * True when a punctuation token was just returned and the real input token
   * (still held in the input attributes) has yet to be returned.
   */
  private boolean lastPunct;

  /** End offset of the most recent token pulled from the previous stage. */
  private int lastEndOffset;

  /** Number of nodes inserted so far; added to every incoming arc node id. */
  private int nodeOffset;

  /**
   * Creates the stage.
   *
   * @param prevStage stage to pull tokens from
   * @param punctToken term text to use for each inserted punctuation token
   */
  public InsertDeletedPunctuationStage(Stage prevStage, String punctToken) {
    super(prevStage);
    this.punctToken = punctToken;

    delAttIn = prevStage.get(DeletedAttribute.class);
    offsetAttIn = prevStage.get(OffsetAttribute.class);
    arcAttIn = prevStage.get(ArcAttribute.class);
    termAttIn = prevStage.get(TermAttribute.class);

    delAttOut = create(DeletedAttribute.class);
    offsetAttOut = create(OffsetAttribute.class);
    arcAttOut = create(ArcAttribute.class);
    termAttOut = create(TermAttribute.class);
  }

  /**
   * Pass-through char filter that records, per character position read,
   * whether that character was punctuation. The characters themselves are
   * not modified, so offsets need no correction.
   */
  private static class FindPunctuationCharFilter extends CharFilter {

    /**
     * Bit {@code i} is set when the {@code i}-th character read was
     * punctuation. The set only grows when a punctuation character lands at
     * or beyond its current length, so it may be shorter than the number of
     * characters read; use {@link #hasPunctBetween(int, int)} for lookups.
     */
    FixedBitSet wasPunct = new FixedBitSet(128);

    /** Number of characters read so far. */
    private int pos;

    public FindPunctuationCharFilter(Reader input) {
      super(input);
    }

    @Override
    protected int correct(int offset) {
      // Nothing is inserted or removed, so offsets map to themselves.
      return offset;
    }

    @Override
    public int read(char[] buffer, int offset, int length) throws IOException {
      int count = input.read(buffer, offset, length);
      // count is -1 at end of input, in which case the loop does not run.
      for (int i = 0; i < count; i++) {
        if (isPunct(buffer[offset + i])) {
          if (wasPunct.length() <= pos) {
            // Grow, preserving the bits recorded so far.
            int nextSize = ArrayUtil.oversize(pos + 1, 1);
            FixedBitSet nextBits = new FixedBitSet(nextSize);
            nextBits.or(wasPunct);
            wasPunct = nextBits;
          }
          wasPunct.set(pos);
        }
        pos++;
      }
      return count;
    }

    /**
     * Returns true if any recorded position in {@code [start, end)} was
     * punctuation. Positions beyond the recorded range are treated as
     * non-punctuation rather than being looked up, because the bit set is
     * not grown for non-punctuation characters.
     */
    boolean hasPunctBetween(int start, int end) {
      int limit = Math.min(end, wasPunct.length());
      for (int i = Math.max(start, 0); i < limit; i++) {
        if (wasPunct.get(i)) {
          return true;
        }
      }
      return false;
    }

    /** Returns true for the characters this filter treats as punctuation. */
    protected boolean isPunct(char ch) {
      // TODO: use proper Character.isXXX apis:
      return ch == '.' || ch == ',' || ch == ':' || ch == ';';
    }
  }

  /**
   * Wraps {@code input} in a fresh punctuation-recording char filter, resets
   * the upstream stages with the wrapped reader and clears per-stream state.
   */
  @Override
  public void reset(Reader input) {
    // nocommit this is iffy? if an earlier stage also
    // wraps, then, we are different offsets
    charFilter = new FindPunctuationCharFilter(input);
    super.reset(charFilter);
    lastEndOffset = 0;
    lastPunct = false;
    nodeOffset = 0;
  }

  /**
   * Advances to the next output token, which is either a token from the
   * previous stage or an inserted punctuation token preceding it.
   *
   * @return true if a token was produced, false once the previous stage is
   *     exhausted
   */
  @Override
  public boolean next() throws IOException {
    if (lastPunct) {
      // Return previously buffered token:
      copyToken();
      lastPunct = false;
      return true;
    }

    if (!prevStage.next()) {
      return false;
    }

    int startOffset = offsetAttIn.startOffset();

    // Did the gap between the end of the last token and the start of this
    // one contain punctuation? The lookup is clamped to the recorded range,
    // so offsets past the end of the bit set cannot index out of bounds.
    lastPunct = charFilter.hasPunctBetween(lastEndOffset, startOffset);

    if (lastPunct) {
      // We insert a new node and token here:
      // nocommit this (single int nodeOffset) is too simplistic?
      arcAttOut.set(arcAttIn.from() + nodeOffset, arcAttIn.from() + nodeOffset + 1);
      delAttOut.set(true);
      offsetAttOut.setOffset(lastEndOffset, startOffset);
      // nocommit: should we copy over the actual punct chars...?
      termAttOut.set(punctToken);
      // Later arcs, including the buffered real token's, shift by one node.
      nodeOffset++;
    } else {
      copyToken();
    }

    lastEndOffset = offsetAttIn.endOffset();
    return true;
  }

  /**
   * Copies the current input token to the output attributes, shifting its
   * arc node ids by {@code nodeOffset}. When the previous stage provides no
   * DeletedAttribute the token is reported as not deleted.
   */
  private void copyToken() {
    if (delAttIn != null) {
      delAttOut.set(delAttIn.deleted());
    } else {
      delAttOut.set(false);
    }
    termAttOut.set(termAttIn.get());
    offsetAttOut.setOffset(offsetAttIn.startOffset(), offsetAttIn.endOffset());
    arcAttOut.set(arcAttIn.from() + nodeOffset, arcAttIn.to() + nodeOffset);
  }
}