blob: 2b6c1d521f02c893cb7424da17f4a3ad5b53eca8 [file] [log] [blame]
Index: lucene/highlighter/src/test/org/apache/lucene/search/vectorhighlight/FastVectorHighlighterTest.java
===================================================================
--- lucene/highlighter/src/test/org/apache/lucene/search/vectorhighlight/FastVectorHighlighterTest.java (revision 1504378)
+++ lucene/highlighter/src/test/org/apache/lucene/search/vectorhighlight/FastVectorHighlighterTest.java (working copy)
@@ -16,10 +16,18 @@
* limitations under the License.
*/
import java.io.IOException;
+import java.io.Reader;
+import java.util.Arrays;
+import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.analysis.MockTokenFilter;
import org.apache.lucene.analysis.MockTokenizer;
+import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.FieldType;
@@ -27,12 +35,14 @@
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
+import org.apache.lucene.index.RandomIndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.queries.CommonTermsQuery;
import org.apache.lucene.search.BooleanClause.Occur;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.PhraseQuery;
+import org.apache.lucene.search.Query;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
@@ -40,8 +50,9 @@
public class FastVectorHighlighterTest extends LuceneTestCase {
+
+ private static final String FIELD = "text";
-
public void testSimpleHighlightTest() throws IOException {
Directory dir = newDirectory();
IndexWriter writer = new IndexWriter(dir, newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random())));
@@ -287,4 +298,128 @@
writer.close();
dir.close();
}
+
+ public void testOverlappingPhrases() throws IOException { // phrases whose terms share a position must highlight as one span
+ final Analyzer analyzer = new Analyzer() {
+
+ @Override
+ protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+ final Tokenizer source = new MockTokenizer(reader);
+ TokenStream sink = source;
+ sink = new SynonymFilter(sink); // emits 'B' stacked on the position of every 'b'
+ return new TokenStreamComponents(source, sink);
+ }
+
+ };
+ final Directory directory = newDirectory();
+ RandomIndexWriter iw = new RandomIndexWriter(random(), directory, analyzer);
+ Document doc = new Document();
+ FieldType withVectors = new FieldType(TextField.TYPE_STORED);
+ withVectors.setStoreTermVectors(true);
+ withVectors.setStoreTermVectorPositions(true);
+ withVectors.setStoreTermVectorOffsets(true);
+ doc.add(new Field(FIELD, "a b c", withVectors));
+ iw.addDocument(doc);
+ DirectoryReader ir = iw.getReader();
+
+ // Disjunction of two overlapping phrase queries
+ final PhraseQuery pq1 = new PhraseQuery();
+ pq1.add(new Term(FIELD, "a"), 0);
+ pq1.add(new Term(FIELD, "b"), 1);
+ pq1.add(new Term(FIELD, "c"), 2);
+
+ final PhraseQuery pq2 = new PhraseQuery();
+ pq2.add(new Term(FIELD, "a"), 0);
+ pq2.add(new Term(FIELD, "B"), 1);
+ pq2.add(new Term(FIELD, "c"), 2);
+
+ final BooleanQuery bq = new BooleanQuery();
+ bq.add(pq1, Occur.SHOULD);
+ bq.add(pq2, Occur.SHOULD);
+
+ // Single phrase query with two terms at the same position
+ final PhraseQuery pq = new PhraseQuery();
+ pq.add(new Term(FIELD, "a"), 0);
+ pq.add(new Term(FIELD, "b"), 1);
+ pq.add(new Term(FIELD, "B"), 1);
+ pq.add(new Term(FIELD, "c"), 2);
+
+ for (Query query : Arrays.asList(pq1, pq2, bq, pq)) {
+ assertEquals(1, new IndexSearcher(ir).search(query, 1).totalHits); // each query under test must match the single document
+
+ FastVectorHighlighter highlighter = new FastVectorHighlighter();
+ FieldQuery fieldQuery = highlighter.getFieldQuery(query, ir);
+ String[] bestFragments = highlighter.getBestFragments(fieldQuery, ir, 0, FIELD, 1000, 1);
+ assertEquals("<b>a b c</b>", bestFragments[0]); // the whole phrase is wrapped in one tag pair
+ }
+
+ ir.close();
+ iw.close();
+ directory.close();
+ }
+
+ public void testPhraseWithGap() throws IOException { // a phrase with an unconstrained middle position highlights its two terms separately
+ final Directory directory = newDirectory();
+ RandomIndexWriter iw = new RandomIndexWriter(random(), directory, new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false));
+ Document doc = new Document();
+ FieldType withVectors = new FieldType(TextField.TYPE_STORED);
+ withVectors.setStoreTermVectors(true);
+ withVectors.setStoreTermVectorPositions(true);
+ withVectors.setStoreTermVectorOffsets(true);
+ doc.add(new Field(FIELD, "a b c", withVectors));
+ iw.addDocument(doc);
+ DirectoryReader ir = iw.getReader();
+
+ final PhraseQuery pq = new PhraseQuery();
+ pq.add(new Term(FIELD, "c"), 2); // terms are added out of position order on purpose
+ pq.add(new Term(FIELD, "a"), 0); // no term is given for position 1
+
+ assertEquals(1, new IndexSearcher(ir).search(pq, 1).totalHits);
+
+ FastVectorHighlighter highlighter = new FastVectorHighlighter();
+ FieldQuery fieldQuery = highlighter.getFieldQuery(pq, ir);
+ String[] bestFragments = highlighter.getBestFragments(fieldQuery, ir, 0, FIELD, 1000, 1);
+ assertEquals("<b>a</b> b <b>c</b>", bestFragments[0]); // 'b' sits in the gap and stays untagged
+
+ ir.close();
+ iw.close();
+ directory.close();
+ }
+
+ // Simple token filter that adds 'B' as a synonym of 'b' (emitted right after 'b' with a position increment of 0)
+ private static class SynonymFilter extends TokenFilter {
+
+ final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
+ final PositionIncrementAttribute posIncAtt = addAttribute(PositionIncrementAttribute.class);
+
+ State pending; // state captured from the last 'b' token; non-null means a 'B' still has to be emitted
+
+ protected SynonymFilter(TokenStream input) {
+ super(input);
+ }
+
+ @Override
+ public boolean incrementToken() throws IOException {
+ if (pending != null) { // a 'b' was returned by the previous call: emit its synonym now
+ restoreState(pending); // start from the attributes captured for 'b'
+ termAtt.setEmpty().append('B'); // replace only the term text
+ posIncAtt.setPositionIncrement(0); // increment 0: the synonym does not advance the position
+ pending = null;
+ return true;
+ }
+ if (!input.incrementToken()) {
+ return false;
+ }
+ if (termAtt.toString().equals("b")) {
+ pending = captureState(); // remember this token so the next call can emit 'B'
+ }
+ return true;
+ }
+
+ @Override
+ public void reset() throws IOException {
+ super.reset();
+ pending = null; // discard any synonym left over from a previous use of the stream
+ }
+ }
}
Index: lucene/highlighter/src/test/org/apache/lucene/search/vectorhighlight/FieldQueryTest.java
===================================================================
--- lucene/highlighter/src/test/org/apache/lucene/search/vectorhighlight/FieldQueryTest.java (revision 1504378)
+++ lucene/highlighter/src/test/org/apache/lucene/search/vectorhighlight/FieldQueryTest.java (working copy)
@@ -863,8 +863,8 @@
phraseCandidate.add( new TermInfo( "c", 4, 5, 4, 1 ) );
assertNull( fq.searchPhrase( F, phraseCandidate ) );
- // "a b c"~1
- query = pqF( 1F, 1, "a", "b", "c" );
+ // "a b c"~2
+ query = pqF( 1F, 2, "a", "b", "c" );
// phraseHighlight = true, fieldMatch = true
fq = new FieldQuery( query, true, true );
Index: lucene/highlighter/src/test/org/apache/lucene/search/vectorhighlight/FieldPhraseListTest.java
===================================================================
--- lucene/highlighter/src/test/org/apache/lucene/search/vectorhighlight/FieldPhraseListTest.java (revision 1504378)
+++ lucene/highlighter/src/test/org/apache/lucene/search/vectorhighlight/FieldPhraseListTest.java (working copy)
@@ -120,7 +120,31 @@
assertEquals( 4, fpl.phraseList.get( 0 ).getStartOffset() );
assertEquals( 9, fpl.phraseList.get( 0 ).getEndOffset() );
}
-
+
+ public void testProximityPhraseReverse() throws Exception { // sloppy phrase whose terms appear in the document in the opposite order
+ make1d1fIndex( "z a a b c" );
+
+ FieldQuery fq = new FieldQuery( pqF( 2F, 3, "c", "a" ), true, true ); // presumably pqF(boost, slop, terms...) — confirm against the helper
+ FieldTermStack stack = new FieldTermStack( reader, 0, F, fq );
+ FieldPhraseList fpl = new FieldPhraseList( stack, fq );
+ assertEquals( 1, fpl.phraseList.size() );
+ assertEquals( "ac(2.0)((4,5)(8,9))", fpl.phraseList.get( 0 ).toString() ); // offsets (4,5) are the second 'a', (8,9) the 'c'
+ assertEquals( 4, fpl.phraseList.get( 0 ).getStartOffset() );
+ assertEquals( 9, fpl.phraseList.get( 0 ).getEndOffset() );
+ }
+
+ public void testProximityPhraseWithRepeatedTerms() throws Exception { // sloppy phrase over text where 'a' and 'b' each occur twice
+ make1d1fIndex( "z a a b b z d" );
+
+ FieldQuery fq = new FieldQuery( pqF( 2F, 2, "a", "b", "d" ), true, true ); // presumably pqF(boost, slop, terms...) — confirm against the helper
+ FieldTermStack stack = new FieldTermStack( reader, 0, F, fq );
+ FieldPhraseList fpl = new FieldPhraseList( stack, fq );
+ assertEquals( 1, fpl.phraseList.size() );
+ assertEquals( "abd(2.0)((4,7)(12,13))", fpl.phraseList.get( 0 ).toString() ); // (4,7) spans the second 'a' and first 'b'; (12,13) is 'd'
+ assertEquals( 4, fpl.phraseList.get( 0 ).getStartOffset() );
+ assertEquals( 13, fpl.phraseList.get( 0 ).getEndOffset() );
+ }
+
public void test2PhrasesOverlap() throws Exception {
make1d1fIndex( "d a b c d" );
Index: lucene/highlighter/src/java/org/apache/lucene/search/vectorhighlight/FieldTermStack.java
===================================================================
--- lucene/highlighter/src/java/org/apache/lucene/search/vectorhighlight/FieldTermStack.java (revision 1504378)
+++ lucene/highlighter/src/java/org/apache/lucene/search/vectorhighlight/FieldTermStack.java (working copy)
@@ -145,6 +145,13 @@
}
/**
+ * Returns the TermInfo object on top of the stack without removing it (the result of {@code termList.peek()}).
+ */
+ public TermInfo peek() {
+ return termList.peek();
+ }
+
+ /**
* @param termInfo the TermInfo object to be put on the top of the stack
*/
public void push( TermInfo termInfo ){
Index: lucene/highlighter/src/java/org/apache/lucene/search/vectorhighlight/FieldQuery.java
===================================================================
--- lucene/highlighter/src/java/org/apache/lucene/search/vectorhighlight/FieldQuery.java (revision 1504378)
+++ lucene/highlighter/src/java/org/apache/lucene/search/vectorhighlight/FieldQuery.java (working copy)
@@ -17,6 +17,8 @@
*/
import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Arrays;
import java.util.Collection;
import java.util.HashMap;
import java.util.HashSet;
@@ -39,6 +41,7 @@
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.vectorhighlight.FieldTermStack.TermInfo;
+import org.apache.lucene.util.InPlaceMergeSorter;
/**
* FieldQuery breaks down query object into terms/phrases and keeps
@@ -347,6 +350,7 @@
boolean terminal;
int slop; // valid if terminal == true and phraseHighlight == true
float boost; // valid if terminal == true
+ int[] positions; // valid if terminal == true
int termOrPhraseNumber; // valid if terminal == true
FieldQuery fieldQuery;
Map<String, QueryPhraseMap> subMap = new HashMap<String, QueryPhraseMap>();
@@ -369,38 +373,107 @@
return map;
}
- void add( Query query, IndexReader reader ) {
+ void add( Query query, IndexReader reader ) {
if( query instanceof TermQuery ){
addTerm( ((TermQuery)query).getTerm(), query.getBoost() );
}
else if( query instanceof PhraseQuery ){
PhraseQuery pq = (PhraseQuery)query;
- Term[] terms = pq.getTerms();
- Map<String, QueryPhraseMap> map = subMap;
- QueryPhraseMap qpm = null;
- for( Term term : terms ){
- qpm = getOrNewMap( map, term.text() );
- map = qpm.subMap;
- }
- qpm.markTerminal( pq.getSlop(), pq.getBoost() );
+ final Term[] terms = pq.getTerms();
+ final int[] positions = pq.getPositions();
+ new InPlaceMergeSorter() { // sorts terms and positions together, ordered by position
+
+ @Override
+ protected void swap(int i, int j) {
+ Term tmpTerm = terms[i];
+ terms[i] = terms[j];
+ terms[j] = tmpTerm;
+
+ int tmpPos = positions[i];
+ positions[i] = positions[j];
+ positions[j] = tmpPos;
+ }
+
+ @Override
+ protected int compare(int i, int j) {
+ return Integer.compare(positions[i], positions[j]); // overflow-safe, unlike subtracting the two positions
+ }
+ }.sort(0, terms.length);
+
+ addToMap(pq, terms, positions, 0, subMap, pq.getSlop()); // the full slop is the initial budget for reorderings
}
else
throw new RuntimeException( "query \"" + query.toString() + "\" must be flatten first." );
}
-
+
+ private int numTermsAtSamePosition(int[] positions, int i) { // counts index i plus every later index holding the same position value
+ int numTermsAtSamePosition = 1; // index i itself
+ for (int j = i + 1; j < positions.length; ++j) {
+ if (positions[j] == positions[i]) {
+ ++numTermsAtSamePosition;
+ }
+ }
+ return numTermsAtSamePosition;
+ }
+
+ private void addToMap(PhraseQuery pq, Term[] terms, int[] positions, int i, Map<String, QueryPhraseMap> map, int slop) { // registers terms[i..] under map, recursing one position group at a time
+ int numTermsAtSamePosition = numTermsAtSamePosition(positions, i); // size of the group of terms sharing positions[i]
+ for (int j = 0; j < numTermsAtSamePosition; ++j) { // every term of the group becomes an alternative key at this level
+ QueryPhraseMap qpm = getOrNewMap(map, terms[i + j].text());
+ if (i + numTermsAtSamePosition == terms.length) { // this group is the last one: the node ends the phrase
+ qpm.markTerminal(pq.getSlop(), pq.getBoost(), uniquePositions(positions)); // note: uses the query's slop, not the remaining 'slop' budget
+ } else {
+ addToMap(pq, terms, positions, i + numTermsAtSamePosition, qpm.subMap, slop); // continue with the next group below this node
+ }
+ }
+ if (slop > 2 && i + numTermsAtSamePosition < terms.length) { // budget left and a next group exists: also register the swapped order
+ Term[] otherTerms = Arrays.copyOf(terms, terms.length);
+ int[] otherPositions = Arrays.copyOf(positions, positions.length);
+ final int nextTermAtSamePosition = numTermsAtSamePosition(positions, i + numTermsAtSamePosition); // size of the next group
+ System.arraycopy(terms, i + numTermsAtSamePosition, otherTerms, i, nextTermAtSamePosition); // next group moves to the front
+ System.arraycopy(positions, i + numTermsAtSamePosition, otherPositions, i, nextTermAtSamePosition);
+ System.arraycopy(terms, i, otherTerms, i + nextTermAtSamePosition, numTermsAtSamePosition); // current group follows it
+ System.arraycopy(positions, i, otherPositions, i + nextTermAtSamePosition, numTermsAtSamePosition);
+ addToMap(pq, otherTerms, otherPositions, i, map, slop - 2); // each swap takes 2 off the budget
+ }
+ }
+
+ private int[] uniquePositions(int[] positions) { // collapses runs of equal neighbouring values; presumably expects the sorted positions from add() — confirm
+ int uniqueCount = 1; // the first element always counts
+ for (int i = 1; i < positions.length; ++i) {
+ if (positions[i] != positions[i - 1]) {
+ ++uniqueCount;
+ }
+ }
+ if (uniqueCount == positions.length) {
+ return positions; // nothing to collapse: the input array itself is returned, not a copy
+ }
+ int[] result = new int[uniqueCount];
+ result[0] = positions[0];
+ for (int i = 1, j = 1; i < positions.length; ++i) {
+ if (positions[i] != positions[i - 1]) {
+ result[j++] = positions[i]; // keep the first value of each run
+ }
+ }
+ return result;
+ }
+
public QueryPhraseMap getTermMap( String term ){
return subMap.get( term );
}
private void markTerminal( float boost ){
- markTerminal( 0, boost );
+ markTerminal( 0, boost, null ); // this overload passes slop 0 and no position array
}
- private void markTerminal( int slop, float boost ){
- this.terminal = true;
- this.slop = slop;
- this.boost = boost;
- this.termOrPhraseNumber = fieldQuery.nextTermOrPhraseNumber();
+ private void markTerminal( int slop, float boost, int[] positions ){
+ if (slop > this.slop || (slop == this.slop && boost > this.boost)) { // overwrite only with a larger slop, or a larger boost at equal slop
+ this.terminal = true;
+ this.slop = slop;
+ this.boost = boost;
+ this.termOrPhraseNumber = fieldQuery.nextTermOrPhraseNumber(); // a new number is drawn on every accepted (re)marking
+ this.positions = positions;
+ } // NOTE(review): on a fresh node (slop 0, boost 0) a call with slop 0 and boost <= 0 is ignored, so the node never becomes terminal — confirm intended
}
public boolean isTerminal(){
@@ -435,15 +508,20 @@
// if the candidate is a term, it is valid
if( phraseCandidate.size() == 1 ) return true;
+
+ assert phraseCandidate.size() == positions.length;
// else check whether the candidate is valid phrase
// compare position-gaps between terms to slop
int pos = phraseCandidate.get( 0 ).getPosition();
+ int totalDistance = 0;
for( int i = 1; i < phraseCandidate.size(); i++ ){
int nextPos = phraseCandidate.get( i ).getPosition();
- if( Math.abs( nextPos - pos - 1 ) > slop ) return false;
+ final int expectedDelta = positions[i] - positions[i - 1];
+ final int actualDelta = nextPos - pos;
+ totalDistance += Math.abs(expectedDelta - actualDelta);
pos = nextPos;
}
- return true;
+ return totalDistance <= slop;
}
}
}
Index: lucene/highlighter/src/java/org/apache/lucene/search/vectorhighlight/FieldPhraseList.java
===================================================================
--- lucene/highlighter/src/java/org/apache/lucene/search/vectorhighlight/FieldPhraseList.java (revision 1504378)
+++ lucene/highlighter/src/java/org/apache/lucene/search/vectorhighlight/FieldPhraseList.java (working copy)
@@ -16,7 +16,9 @@
* limitations under the License.
*/
+import java.util.ArrayDeque;
import java.util.ArrayList;
+import java.util.Deque;
import java.util.LinkedList;
import java.util.List;
@@ -60,49 +62,73 @@
public FieldPhraseList( FieldTermStack fieldTermStack, FieldQuery fieldQuery, int phraseLimit ){
final String field = fieldTermStack.getFieldName();
+ @SuppressWarnings("unchecked")
+ Deque<TermInfo>[] termStacks = new Deque[] {new ArrayDeque<TermInfo>()}; // start with one empty term sequence
+ for (TermInfo ti = fieldTermStack.pop(); ti != null; ti = fieldTermStack.pop()) { // the loop relies on pop() yielding null once the stack is drained
+ // If there are tokens at the same position, compute all combinations
+ if (!fieldTermStack.isEmpty() && fieldTermStack.peek().getPosition() == ti.getPosition()) {
+ List<TermInfo> samePositionTermInfos = new ArrayList<>(2);
+ samePositionTermInfos.add(ti);
+ samePositionTermInfos.add(fieldTermStack.pop());
+ while (!fieldTermStack.isEmpty() && fieldTermStack.peek().getPosition() == ti.getPosition()) { // gather any further tokens at that position
+ samePositionTermInfos.add(fieldTermStack.pop());
+ }
+ final int numTokensAtSamePosition = samePositionTermInfos.size();
+ @SuppressWarnings("unchecked")
+ Deque<TermInfo>[] newTermStacks = new Deque[termStacks.length * numTokensAtSamePosition]; // each sequence forks once per same-position token
+ for (int i = 0, k = 0; i < termStacks.length; ++i) {
+ for (int j = 0; j < numTokensAtSamePosition; ++j) {
+ if (j == numTokensAtSamePosition - 1) {
+ newTermStacks[k] = termStacks[i]; // the last alternative reuses the existing deque instead of copying it
+ } else {
+ newTermStacks[k] = new ArrayDeque<>(termStacks[i]);
+ }
+ newTermStacks[k++].offer(samePositionTermInfos.get(j)); // append this alternative's token
+ }
+ }
+ termStacks = newTermStacks;
+ } else {
+ for (Deque<TermInfo> d : termStacks) { // a lone token at its position is appended to every sequence
+ d.offer(ti);
+ }
+ }
+ }
+
+ for (Deque<TermInfo> d : termStacks) { // phrases are extracted from each sequence independently
+ extractPhrases(field, d, fieldQuery, phraseLimit);
+ }
+ }
+
+ void extractPhrases(String field, Deque<TermInfo> fieldTermStack, FieldQuery fieldQuery, int phraseLimit) { // consumes the deque head by head, adding the longest valid phrase starting at each head
LinkedList<TermInfo> phraseCandidate = new LinkedList<TermInfo>();
- QueryPhraseMap currMap = null;
- QueryPhraseMap nextMap = null;
- while( !fieldTermStack.isEmpty() && (phraseList.size() < phraseLimit) )
- {
+ while( !fieldTermStack.isEmpty() && (phraseList.size() < phraseLimit) ) {
+ QueryPhraseMap longestMap = null; // map node reached when 'longest' was last recorded
+ int longest = 0;
phraseCandidate.clear();
-
- TermInfo ti = fieldTermStack.pop();
- currMap = fieldQuery.getFieldTermMap( field, ti.getText() );
-
- // if not found, discard top TermInfo from stack, then try next element
- if( currMap == null ) continue;
-
- // if found, search the longest phrase
- phraseCandidate.add( ti );
- while( true ){
- ti = fieldTermStack.pop();
- nextMap = null;
- if( ti != null )
- nextMap = currMap.getTermMap( ti.getText() );
- if( ti == null || nextMap == null ){
- if( ti != null )
- fieldTermStack.push( ti );
- if( currMap.isValidTermOrPhrase( phraseCandidate ) ){
- addIfNoOverlap( new WeightedPhraseInfo( phraseCandidate, currMap.getBoost(), currMap.getTermOrPhraseNumber() ) );
+ QueryPhraseMap currMap = null;
+ for (TermInfo ti : fieldTermStack) {
+ QueryPhraseMap nextMap = null;
+ if (currMap == null) {
+ nextMap = fieldQuery.getFieldTermMap(field, ti.getText());
+ if (nextMap == null) {
+ break; // the head is not the start of any query term or phrase
}
- else{
- while( phraseCandidate.size() > 1 ){
- fieldTermStack.push( phraseCandidate.removeLast() );
- currMap = fieldQuery.searchPhrase( field, phraseCandidate );
- if( currMap != null ){
- addIfNoOverlap( new WeightedPhraseInfo( phraseCandidate, currMap.getBoost(), currMap.getTermOrPhraseNumber() ) );
- break;
- }
- }
- }
- break;
+ } else {
+ nextMap = currMap.getTermMap(ti.getText()); // a term with no continuation is skipped; the scan goes on
}
- else{
- phraseCandidate.add( ti );
+ if (nextMap != null) {
currMap = nextMap;
+ phraseCandidate.add(ti);
+ if( currMap.isValidTermOrPhrase( phraseCandidate ) ){
+ longest = phraseCandidate.size();
+ longestMap = currMap; // currMap may move deeper later without validating again
+ }
}
}
+ if (longest > 0) {
+ addIfNoOverlap( new WeightedPhraseInfo( phraseCandidate.subList(0, longest), longestMap.getBoost(), longestMap.getTermOrPhraseNumber() ) ); // boost and number come from the node that validated the phrase
+ }
+ fieldTermStack.pop(); // drop the head so the next pass starts one term later
}
}
@@ -159,11 +185,11 @@
return termsInfos;
}
- public WeightedPhraseInfo( LinkedList<TermInfo> terms, float boost ){
+ public WeightedPhraseInfo( List<TermInfo> terms, float boost ){ // convenience overload: delegates with seqnum 0
this( terms, boost, 0 );
}
- public WeightedPhraseInfo( LinkedList<TermInfo> terms, float boost, int seqnum ){
+ public WeightedPhraseInfo( List<TermInfo> terms, float boost, int seqnum ){
this.boost = boost;
this.seqnum = seqnum;