blob: 888eeccfa055f6f0959a65db5ef132e92e2f2375 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.search.vectorhighlight;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import org.apache.lucene.search.vectorhighlight.FieldPhraseList.WeightedPhraseInfo;
/**
* A abstract implementation of {@link FragListBuilder}.
*/
public abstract class BaseFragListBuilder implements FragListBuilder {
public static final int MARGIN_DEFAULT = 6;
public static final int MIN_FRAG_CHAR_SIZE_FACTOR = 3;
final int margin;
final int minFragCharSize;
public BaseFragListBuilder( int margin ){
if( margin < 0 )
throw new IllegalArgumentException( "margin(" + margin + ") is too small. It must be 0 or higher." );
this.margin = margin;
this.minFragCharSize = Math.max( 1, margin * MIN_FRAG_CHAR_SIZE_FACTOR );
}
public BaseFragListBuilder(){
this( MARGIN_DEFAULT );
}
protected FieldFragList createFieldFragList( FieldPhraseList fieldPhraseList, FieldFragList fieldFragList, int fragCharSize ){
if( fragCharSize < minFragCharSize )
throw new IllegalArgumentException( "fragCharSize(" + fragCharSize + ") is too small. It must be " + minFragCharSize + " or higher." );
List<WeightedPhraseInfo> wpil = new ArrayList<>();
IteratorQueue<WeightedPhraseInfo> queue = new IteratorQueue<>(fieldPhraseList.getPhraseList().iterator());
WeightedPhraseInfo phraseInfo = null;
int startOffset = 0;
while((phraseInfo = queue.top()) != null){
// if the phrase violates the border of previous fragment, discard it and try next phrase
if( phraseInfo.getStartOffset() < startOffset ) {
queue.removeTop();
continue;
}
wpil.clear();
final int currentPhraseStartOffset = phraseInfo.getStartOffset();
int currentPhraseEndOffset = phraseInfo.getEndOffset();
int spanStart = Math.max(currentPhraseStartOffset - margin, startOffset);
int spanEnd = Math.max(currentPhraseEndOffset, spanStart + fragCharSize);
if (acceptPhrase(queue.removeTop(), currentPhraseEndOffset - currentPhraseStartOffset, fragCharSize)) {
wpil.add(phraseInfo);
}
while((phraseInfo = queue.top()) != null) { // pull until we crossed the current spanEnd
if (phraseInfo.getEndOffset() <= spanEnd) {
currentPhraseEndOffset = phraseInfo.getEndOffset();
if (acceptPhrase(queue.removeTop(), currentPhraseEndOffset - currentPhraseStartOffset, fragCharSize)) {
wpil.add(phraseInfo);
}
} else {
break;
}
}
if (wpil.isEmpty()) {
continue;
}
final int matchLen = currentPhraseEndOffset - currentPhraseStartOffset;
// now recalculate the start and end position to "center" the result
final int newMargin = Math.max(0, (fragCharSize-matchLen)/2); // matchLen can be > fragCharSize prevent IAOOB here
spanStart = currentPhraseStartOffset - newMargin;
if (spanStart < startOffset) {
spanStart = startOffset;
}
// whatever is bigger here we grow this out
spanEnd = spanStart + Math.max(matchLen, fragCharSize);
startOffset = spanEnd;
fieldFragList.add(spanStart, spanEnd, wpil);
}
return fieldFragList;
}
/**
* A predicate to decide if the given {@link WeightedPhraseInfo} should be
* accepted as a highlighted phrase or if it should be discarded.
* <p>
* The default implementation discards phrases that are composed of more than one term
* and where the matchLength exceeds the fragment character size.
*
* @param info the phrase info to accept
* @param matchLength the match length of the current phrase
* @param fragCharSize the configured fragment character size
* @return <code>true</code> if this phrase info should be accepted as a highligh phrase
*/
protected boolean acceptPhrase(WeightedPhraseInfo info, int matchLength, int fragCharSize) {
return info.getTermsOffsets().size() <= 1 || matchLength <= fragCharSize;
}
private static final class IteratorQueue<T> {
private final Iterator<T> iter;
private T top;
public IteratorQueue(Iterator<T> iter) {
this.iter = iter;
T removeTop = removeTop();
assert removeTop == null;
}
public T top() {
return top;
}
public T removeTop() {
T currentTop = top;
if (iter.hasNext()) {
top = iter.next();
} else {
top = null;
}
return currentTop;
}
}
}