blob: 2d9fb2f86d17d9877518566a98467305bf6bc6ab [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.search.vectorhighlight;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import org.apache.lucene.search.vectorhighlight.FieldQuery.QueryPhraseMap;
import org.apache.lucene.search.vectorhighlight.FieldTermStack.TermInfo;
import org.apache.lucene.util.MergedIterator;
/**
* FieldPhraseList has a list of WeightedPhraseInfo that is used by FragListBuilder
* to create a FieldFragList object.
*/
public class FieldPhraseList {
/**
* List of non-overlapping WeightedPhraseInfo objects.
*/
LinkedList<WeightedPhraseInfo> phraseList = new LinkedList<>();
/**
* create a FieldPhraseList that has no limit on the number of phrases to analyze
*
* @param fieldTermStack FieldTermStack object
* @param fieldQuery FieldQuery object
*/
public FieldPhraseList( FieldTermStack fieldTermStack, FieldQuery fieldQuery){
this (fieldTermStack, fieldQuery, Integer.MAX_VALUE);
}
/**
* return the list of WeightedPhraseInfo.
*
* @return phraseList.
*/
public List<WeightedPhraseInfo> getPhraseList() {
return phraseList;
}
/**
* a constructor.
*
* @param fieldTermStack FieldTermStack object
* @param fieldQuery FieldQuery object
* @param phraseLimit maximum size of phraseList
*/
public FieldPhraseList( FieldTermStack fieldTermStack, FieldQuery fieldQuery, int phraseLimit ){
final String field = fieldTermStack.getFieldName();
LinkedList<TermInfo> phraseCandidate = new LinkedList<>();
QueryPhraseMap currMap = null;
QueryPhraseMap nextMap = null;
while( !fieldTermStack.isEmpty() && (phraseList.size() < phraseLimit) )
{
phraseCandidate.clear();
TermInfo ti = null;
TermInfo first = null;
first = ti = fieldTermStack.pop();
currMap = fieldQuery.getFieldTermMap( field, ti.getText() );
while (currMap == null && ti.getNext() != first) {
ti = ti.getNext();
currMap = fieldQuery.getFieldTermMap( field, ti.getText() );
}
// if not found, discard top TermInfo from stack, then try next element
if( currMap == null ) continue;
// if found, search the longest phrase
phraseCandidate.add( ti );
while( true ){
first = ti = fieldTermStack.pop();
nextMap = null;
if( ti != null ) {
nextMap = currMap.getTermMap( ti.getText() );
while (nextMap == null && ti.getNext() != first) {
ti = ti.getNext();
nextMap = currMap.getTermMap( ti.getText() );
}
}
if( ti == null || nextMap == null ){
if( ti != null )
fieldTermStack.push( ti );
if( currMap.isValidTermOrPhrase( phraseCandidate ) ){
addIfNoOverlap( new WeightedPhraseInfo( phraseCandidate, currMap.getBoost(), currMap.getTermOrPhraseNumber() ) );
}
else{
while( phraseCandidate.size() > 1 ){
fieldTermStack.push( phraseCandidate.removeLast() );
currMap = fieldQuery.searchPhrase( field, phraseCandidate );
if( currMap != null ){
addIfNoOverlap( new WeightedPhraseInfo( phraseCandidate, currMap.getBoost(), currMap.getTermOrPhraseNumber() ) );
break;
}
}
}
break;
}
else{
phraseCandidate.add( ti );
currMap = nextMap;
}
}
}
}
/**
* Merging constructor.
*
* @param toMerge FieldPhraseLists to merge to build this one
*/
public FieldPhraseList( FieldPhraseList[] toMerge ) {
// Merge all overlapping WeightedPhraseInfos
// Step 1. Sort by startOffset, endOffset, and boost, in that order.
@SuppressWarnings( { "rawtypes", "unchecked" } )
Iterator< WeightedPhraseInfo >[] allInfos = new Iterator[ toMerge.length ];
int index = 0;
for ( FieldPhraseList fplToMerge : toMerge ) {
allInfos[ index++ ] = fplToMerge.phraseList.iterator();
}
MergedIterator< WeightedPhraseInfo > itr = new MergedIterator<>( false, allInfos );
// Step 2. Walk the sorted list merging infos that overlap
phraseList = new LinkedList<>();
if ( !itr.hasNext() ) {
return;
}
List< WeightedPhraseInfo > work = new ArrayList<>();
WeightedPhraseInfo first = itr.next();
work.add( first );
int workEndOffset = first.getEndOffset();
while ( itr.hasNext() ) {
WeightedPhraseInfo current = itr.next();
if ( current.getStartOffset() <= workEndOffset ) {
workEndOffset = Math.max( workEndOffset, current.getEndOffset() );
work.add( current );
} else {
if ( work.size() == 1 ) {
phraseList.add( work.get( 0 ) );
work.set( 0, current );
} else {
phraseList.add( new WeightedPhraseInfo( work ) );
work.clear();
work.add( current );
}
workEndOffset = current.getEndOffset();
}
}
if ( work.size() == 1 ) {
phraseList.add( work.get( 0 ) );
} else {
phraseList.add( new WeightedPhraseInfo( work ) );
work.clear();
}
}
public void addIfNoOverlap( WeightedPhraseInfo wpi ){
for( WeightedPhraseInfo existWpi : getPhraseList() ){
if( existWpi.isOffsetOverlap( wpi ) ) {
// WeightedPhraseInfo.addIfNoOverlap() dumps the second part of, for example, hyphenated words (social-economics).
// The result is that all informations in TermInfo are lost and not available for further operations.
existWpi.getTermsInfos().addAll( wpi.getTermsInfos() );
return;
}
}
getPhraseList().add( wpi );
}
/**
* Represents the list of term offsets and boost for some text
*/
public static class WeightedPhraseInfo implements Comparable< WeightedPhraseInfo > {
private List<Toffs> termsOffsets; // usually termsOffsets.size() == 1,
// but if position-gap > 1 and slop > 0 then size() could be greater than 1
private float boost; // query boost
private int seqnum;
private ArrayList<TermInfo> termsInfos;
/**
* Text of the match, calculated on the fly. Use for debugging only.
* @return the text
*/
public String getText() {
StringBuilder text = new StringBuilder();
for ( TermInfo ti: termsInfos ) {
text.append( ti.getText() );
}
return text.toString();
}
/**
* @return the termsOffsets
*/
public List<Toffs> getTermsOffsets() {
return termsOffsets;
}
/**
* @return the boost
*/
public float getBoost() {
return boost;
}
/**
* @return the termInfos
*/
public List<TermInfo> getTermsInfos() {
return termsInfos;
}
public WeightedPhraseInfo( LinkedList<TermInfo> terms, float boost ){
this( terms, boost, 0 );
}
public WeightedPhraseInfo( LinkedList<TermInfo> terms, float boost, int seqnum ){
this.boost = boost;
this.seqnum = seqnum;
// We keep TermInfos for further operations
termsInfos = new ArrayList<>( terms );
termsOffsets = new ArrayList<>( terms.size() );
TermInfo ti = terms.get( 0 );
termsOffsets.add( new Toffs( ti.getStartOffset(), ti.getEndOffset() ) );
if( terms.size() == 1 ){
return;
}
int pos = ti.getPosition();
for( int i = 1; i < terms.size(); i++ ){
ti = terms.get( i );
if( ti.getPosition() - pos == 1 ){
Toffs to = termsOffsets.get( termsOffsets.size() - 1 );
to.setEndOffset( ti.getEndOffset() );
}
else{
termsOffsets.add( new Toffs( ti.getStartOffset(), ti.getEndOffset() ) );
}
pos = ti.getPosition();
}
}
/**
* Merging constructor. Note that this just grabs seqnum from the first info.
*/
public WeightedPhraseInfo( Collection< WeightedPhraseInfo > toMerge ) {
// Pretty much the same idea as merging FieldPhraseLists:
// Step 1. Sort by startOffset, endOffset
// While we are here merge the boosts and termInfos
Iterator< WeightedPhraseInfo > toMergeItr = toMerge.iterator();
if ( !toMergeItr.hasNext() ) {
throw new IllegalArgumentException( "toMerge must contain at least one WeightedPhraseInfo." );
}
WeightedPhraseInfo first = toMergeItr.next();
@SuppressWarnings( { "rawtypes", "unchecked" } )
Iterator< Toffs >[] allToffs = new Iterator[ toMerge.size() ];
termsInfos = new ArrayList<>();
seqnum = first.seqnum;
boost = first.boost;
allToffs[ 0 ] = first.termsOffsets.iterator();
int index = 1;
while ( toMergeItr.hasNext() ) {
WeightedPhraseInfo info = toMergeItr.next();
boost += info.boost;
termsInfos.addAll( info.termsInfos );
allToffs[ index++ ] = info.termsOffsets.iterator();
}
// Step 2. Walk the sorted list merging overlaps
MergedIterator< Toffs > itr = new MergedIterator<>( false, allToffs );
termsOffsets = new ArrayList<>();
if ( !itr.hasNext() ) {
return;
}
Toffs work = itr.next();
while ( itr.hasNext() ) {
Toffs current = itr.next();
if ( current.startOffset <= work.endOffset ) {
work.endOffset = Math.max( work.endOffset, current.endOffset );
} else {
termsOffsets.add( work );
work = current;
}
}
termsOffsets.add( work );
}
public int getStartOffset(){
return termsOffsets.get( 0 ).startOffset;
}
public int getEndOffset(){
return termsOffsets.get( termsOffsets.size() - 1 ).endOffset;
}
public boolean isOffsetOverlap( WeightedPhraseInfo other ){
int so = getStartOffset();
int eo = getEndOffset();
int oso = other.getStartOffset();
int oeo = other.getEndOffset();
if( so <= oso && oso < eo ) return true;
if( so < oeo && oeo <= eo ) return true;
if( oso <= so && so < oeo ) return true;
if( oso < eo && eo <= oeo ) return true;
return false;
}
@Override
public String toString(){
StringBuilder sb = new StringBuilder();
sb.append( getText() ).append( '(' ).append( boost ).append( ")(" );
for( Toffs to : termsOffsets ){
sb.append( to );
}
sb.append( ')' );
return sb.toString();
}
/**
* @return the seqnum
*/
public int getSeqnum() {
return seqnum;
}
@Override
public int compareTo( WeightedPhraseInfo other ) {
int diff = getStartOffset() - other.getStartOffset();
if ( diff != 0 ) {
return diff;
}
diff = getEndOffset() - other.getEndOffset();
if ( diff != 0 ) {
return diff;
}
return (int) Math.signum( getBoost() - other.getBoost() );
}
@Override
public int hashCode() {
final int prime = 31;
int result = 1;
result = prime * result + getStartOffset();
result = prime * result + getEndOffset();
long b = Double.doubleToLongBits( getBoost() );
result = prime * result + ( int )( b ^ ( b >>> 32 ) );
return result;
}
@Override
public boolean equals(Object obj) {
if (this == obj) {
return true;
}
if (obj == null) {
return false;
}
if (getClass() != obj.getClass()) {
return false;
}
WeightedPhraseInfo other = (WeightedPhraseInfo) obj;
if (getStartOffset() != other.getStartOffset()) {
return false;
}
if (getEndOffset() != other.getEndOffset()) {
return false;
}
if (getBoost() != other.getBoost()) {
return false;
}
return true;
}
/**
* Term offsets (start + end)
*/
public static class Toffs implements Comparable< Toffs > {
private int startOffset;
private int endOffset;
public Toffs( int startOffset, int endOffset ){
this.startOffset = startOffset;
this.endOffset = endOffset;
}
public void setEndOffset( int endOffset ){
this.endOffset = endOffset;
}
public int getStartOffset(){
return startOffset;
}
public int getEndOffset(){
return endOffset;
}
@Override
public int compareTo( Toffs other ) {
int diff = getStartOffset() - other.getStartOffset();
if ( diff != 0 ) {
return diff;
}
return getEndOffset() - other.getEndOffset();
}
@Override
public int hashCode() {
final int prime = 31;
int result = 1;
result = prime * result + getStartOffset();
result = prime * result + getEndOffset();
return result;
}
@Override
public boolean equals(Object obj) {
if (this == obj) {
return true;
}
if (obj == null) {
return false;
}
if (getClass() != obj.getClass()) {
return false;
}
Toffs other = (Toffs) obj;
if (getStartOffset() != other.getStartOffset()) {
return false;
}
if (getEndOffset() != other.getEndOffset()) {
return false;
}
return true;
}
@Override
public String toString(){
StringBuilder sb = new StringBuilder();
sb.append( '(' ).append( startOffset ).append( ',' ).append( endOffset ).append( ')' );
return sb.toString();
}
}
}
}