docs/attachments/LUCENE-5182/LUCENE-5182.patch - lucene-jira-archive - Git at Google

 Index: lucene/highlighter/src/test/org/apache/lucene/search/vectorhighlight/FastVectorHighlighterTest.java
 ===================================================================
 --- lucene/highlighter/src/test/org/apache/lucene/search/vectorhighlight/FastVectorHighlighterTest.java	(revision 1515825)
 +++ lucene/highlighter/src/test/org/apache/lucene/search/vectorhighlight/FastVectorHighlighterTest.java	(working copy)
 @@ -47,6 +47,7 @@
  import org.apache.lucene.search.TopDocs;
  import org.apache.lucene.store.Directory;
  import org.apache.lucene.util.LuceneTestCase;
 +import org.apache.lucene.util._TestUtil;


  public class FastVectorHighlighterTest extends LuceneTestCase {
 @@ -298,6 +299,49 @@
      writer.close();
      dir.close();
    }
 +
 +  public void testLotsOfPhrases() throws IOException {
 +    // Index one long document of randomly repeated terms, then check that every exact phrase occurrence is highlighted.
 +    Directory dir = newDirectory();
 +    IndexWriter writer = new IndexWriter(dir, newIndexWriterConfig(TEST_VERSION_CURRENT,  new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, MockTokenFilter.ENGLISH_STOPSET)));
 +    FieldType type = new FieldType(TextField.TYPE_STORED);
 +    type.setStoreTermVectorOffsets(true);
 +    type.setStoreTermVectorPositions(true);
 +    type.setStoreTermVectors(true);
 +    type.freeze();
 +    String[] terms = { "org", "apache", "lucene"};
 +    int iters = atLeast(1000);
 +    StringBuilder builder = new StringBuilder();
 +    for (int i = 0; i < iters; i++) {
 +      builder.append(terms[random().nextInt(terms.length)]).append(" ");
 +      if (random().nextInt(6) == 3) { // roughly one slot in six: interleave a non-query term
 +        builder.append("solr").append(" ");
 +      }
 +    }
 +    Document doc = new Document();
 +    doc.add(new Field("field", builder.toString(), type));
 +    writer.addDocument(doc);
 +    PhraseQuery query = new PhraseQuery();
 +    query.add(new Term("field", "org"));
 +    query.add(new Term("field", "apache"));
 +    query.add(new Term("field", "lucene"));
 +
 +    FastVectorHighlighter highlighter = new FastVectorHighlighter();
 +    IndexReader reader = DirectoryReader.open(writer, true);
 +    IndexSearcher searcher = newSearcher(reader);
 +    TopDocs hits = searcher.search(query, 10);
 +    assertEquals(1, hits.totalHits);
 +    FieldQuery fieldQuery  = highlighter.getFieldQuery(query, reader);
 +    String[] bestFragments = highlighter.getBestFragments(fieldQuery, reader, hits.scoreDocs[0].doc, "field", 1000, 1);
 +    for (String fragment : bestFragments) {
 +      // Strip every highlighted match literally (no regex needed); any phrase text left over was missed by the highlighter.
 +      String unhighlighted = fragment.replace("<b>org apache lucene</b>", "FOOBAR");
 +      assertFalse(unhighlighted.contains("org apache lucene"));
 +    }
 +    reader.close();
 +    writer.close();
 +    dir.close();
 +  }

    public void testOverlappingPhrases() throws IOException {
      final Analyzer analyzer = new Analyzer() {
 Index: lucene/highlighter/src/java/org/apache/lucene/search/vectorhighlight/FieldQuery.java
 ===================================================================
 --- lucene/highlighter/src/java/org/apache/lucene/search/vectorhighlight/FieldQuery.java	(revision 1515825)
 +++ lucene/highlighter/src/java/org/apache/lucene/search/vectorhighlight/FieldQuery.java	(working copy)
 @@ -30,7 +30,6 @@

  import org.apache.lucene.index.IndexReader;
  import org.apache.lucene.index.Term;
 -import org.apache.lucene.queries.CommonTermsQuery;
  import org.apache.lucene.search.BooleanClause;
  import org.apache.lucene.search.BooleanQuery;
  import org.apache.lucene.search.ConstantScoreQuery;
 @@ -63,6 +62,8 @@

    // The maximum number of different matching terms accumulated from any one MultiTermQuery
    private static final int MAX_MTQ_TERMS = 1024;
 +
 +  private int maxPhraseWindow = 1;

    FieldQuery( Query query, IndexReader reader, boolean phraseHighlight, boolean fieldMatch ) throws IOException {
      this.fieldMatch = fieldMatch;
 @@ -400,7 +401,7 @@
              return positions[i] - positions[j];
            }
          }.sort(0, terms.length);
 -
 +
          addToMap(pq, terms, positions, 0, subMap, pq.getSlop());
        }
        else
 @@ -474,9 +475,16 @@
          this.boost = boost;
          this.termOrPhraseNumber = fieldQuery.nextTermOrPhraseNumber();
          this.positions = positions;
 +        if (positions != null) {
 +          fieldQuery.maxPhraseWindow = Math.max(fieldQuery.maxPhraseWindow, slop + positions[positions.length-1] - positions[0]);
 +        }
        }
      }

 +    public int getMaxPhraseWindow() { // widest phrase span (slop + last position - first position) recorded so far on the associated FieldQuery; starts at 1
 +      return fieldQuery.maxPhraseWindow;
 +    }
 +
      public boolean isTerminal(){
        return terminal;
      }
 Index: lucene/highlighter/src/java/org/apache/lucene/search/vectorhighlight/FieldPhraseList.java
 ===================================================================
 --- lucene/highlighter/src/java/org/apache/lucene/search/vectorhighlight/FieldPhraseList.java	(revision 1515825)
 +++ lucene/highlighter/src/java/org/apache/lucene/search/vectorhighlight/FieldPhraseList.java	(working copy)
 @@ -69,6 +69,9 @@
    }

    void extractPhrases(LinkedList<TermInfo> terms, QueryPhraseMap currMap, LinkedList<TermInfo> phraseCandidate, int longest) {
 +    if (phraseCandidate.size() > 1 && phraseCandidate.getLast().getPosition() - phraseCandidate.getFirst().getPosition() > currMap.getMaxPhraseWindow()) {
 +      return;
 +    }
      if (terms.isEmpty()) {
        if (longest > 0) {
          addIfNoOverlap( new WeightedPhraseInfo( phraseCandidate.subList(0, longest), currMap.getBoost(), currMap.getTermOrPhraseNumber() ) );
	Index: lucene/highlighter/src/test/org/apache/lucene/search/vectorhighlight/FastVectorHighlighterTest.java
	===================================================================
	--- lucene/highlighter/src/test/org/apache/lucene/search/vectorhighlight/FastVectorHighlighterTest.java (revision 1515825)
	+++ lucene/highlighter/src/test/org/apache/lucene/search/vectorhighlight/FastVectorHighlighterTest.java (working copy)
	@@ -47,6 +47,7 @@
	import org.apache.lucene.search.TopDocs;
	import org.apache.lucene.store.Directory;
	import org.apache.lucene.util.LuceneTestCase;
	+import org.apache.lucene.util._TestUtil;


	public class FastVectorHighlighterTest extends LuceneTestCase {
	@@ -298,6 +299,49 @@
	writer.close();
	dir.close();
	}
	+
	+ public void testLotsOfPhrases() throws IOException {
	+ Directory dir = newDirectory();
	+ IndexWriter writer = new IndexWriter(dir, newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, MockTokenFilter.ENGLISH_STOPSET)));
	+ FieldType type = new FieldType(TextField.TYPE_STORED);
	+ type.setStoreTermVectorOffsets(true);
	+ type.setStoreTermVectorPositions(true);
	+ type.setStoreTermVectors(true);
	+ type.freeze();
	+ String[] terms = { "org", "apache", "lucene"};
	+ int iters = atLeast(1000);
	+ StringBuilder builder = new StringBuilder();
	+ for (int i = 0; i < iters; i++) {
	+ builder.append(terms[random().nextInt(terms.length)]).append(" ");
	+ if (random().nextInt(6) == 3) {
	+ builder.append("solr").append(" ");
	+ }
	+ }
	+ Document doc = new Document();
	+ Field field = new Field("field", builder.toString(), type);
	+ doc.add(field);
	+ writer.addDocument(doc);
	+ PhraseQuery query = new PhraseQuery();
	+ query.add(new Term("field", "org"));
	+ query.add(new Term("field", "apache"));
	+ query.add(new Term("field", "lucene"));
	+
	+
	+ FastVectorHighlighter highlighter = new FastVectorHighlighter();
	+ IndexReader reader = DirectoryReader.open(writer, true);
	+ IndexSearcher searcher = newSearcher(reader);
	+ TopDocs hits = searcher.search(query, 10);
	+ assertEquals(1, hits.totalHits);
	+ FieldQuery fieldQuery = highlighter.getFieldQuery(query, reader);
	+ String[] bestFragments = highlighter.getBestFragments(fieldQuery, reader, hits.scoreDocs[0].doc, "field", 1000, 1);
	+ for (int i = 0; i < bestFragments.length; i++) {
	+ String result = bestFragments[i].replaceAll("<b>org apache lucene</b>", "FOOBAR");
	+ assertFalse(result.contains("org apache lucene"));
	+ }
	+ reader.close();
	+ writer.close();
	+ dir.close();
	+ }

	public void testOverlappingPhrases() throws IOException {
	final Analyzer analyzer = new Analyzer() {
	Index: lucene/highlighter/src/java/org/apache/lucene/search/vectorhighlight/FieldQuery.java
	===================================================================
	--- lucene/highlighter/src/java/org/apache/lucene/search/vectorhighlight/FieldQuery.java (revision 1515825)
	+++ lucene/highlighter/src/java/org/apache/lucene/search/vectorhighlight/FieldQuery.java (working copy)
	@@ -30,7 +30,6 @@

	import org.apache.lucene.index.IndexReader;
	import org.apache.lucene.index.Term;
	-import org.apache.lucene.queries.CommonTermsQuery;
	import org.apache.lucene.search.BooleanClause;
	import org.apache.lucene.search.BooleanQuery;
	import org.apache.lucene.search.ConstantScoreQuery;
	@@ -63,6 +62,8 @@

	// The maximum number of different matching terms accumulated from any one MultiTermQuery
	private static final int MAX_MTQ_TERMS = 1024;
	+
	+ private int maxPhraseWindow = 1;

	FieldQuery( Query query, IndexReader reader, boolean phraseHighlight, boolean fieldMatch ) throws IOException {
	this.fieldMatch = fieldMatch;
	@@ -400,7 +401,7 @@
	return positions[i] - positions[j];
	}
	}.sort(0, terms.length);
	-
	+
	addToMap(pq, terms, positions, 0, subMap, pq.getSlop());
	}
	else
	@@ -474,9 +475,16 @@
	this.boost = boost;
	this.termOrPhraseNumber = fieldQuery.nextTermOrPhraseNumber();
	this.positions = positions;
	+ if (positions != null) {
	+ fieldQuery.maxPhraseWindow = Math.max(fieldQuery.maxPhraseWindow, slop + positions[positions.length-1] - positions[0]);
	+ }
	}
	}

	+ public int getMaxPhraseWindow() {
	+ return fieldQuery.maxPhraseWindow;
	+ }
	+
	public boolean isTerminal(){
	return terminal;
	}
	Index: lucene/highlighter/src/java/org/apache/lucene/search/vectorhighlight/FieldPhraseList.java
	===================================================================
	--- lucene/highlighter/src/java/org/apache/lucene/search/vectorhighlight/FieldPhraseList.java (revision 1515825)
	+++ lucene/highlighter/src/java/org/apache/lucene/search/vectorhighlight/FieldPhraseList.java (working copy)
	@@ -69,6 +69,9 @@
	}

	void extractPhrases(LinkedList<TermInfo> terms, QueryPhraseMap currMap, LinkedList<TermInfo> phraseCandidate, int longest) {
	+ if (phraseCandidate.size() > 1 && phraseCandidate.getLast().getPosition() - phraseCandidate.getFirst().getPosition() > currMap.getMaxPhraseWindow()) {
	+ return;
	+ }
	if (terms.isEmpty()) {
	if (longest > 0) {
	addIfNoOverlap( new WeightedPhraseInfo( phraseCandidate.subList(0, longest), currMap.getBoost(), currMap.getTermOrPhraseNumber() ) );