blob: 98e796e53f4b5fe30e4b4a8fd981d3cca8204730 [file] [log] [blame]
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestWordDelimiterGraphFilter.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestWordDelimiterGraphFilter.java
index 7516a23..04d6d2b 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestWordDelimiterGraphFilter.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestWordDelimiterGraphFilter.java
@@ -17,16 +17,33 @@
package org.apache.lucene.analysis.miscellaneous;
import java.io.IOException;
+import java.io.PrintWriter;
import java.util.*;
import org.apache.lucene.analysis.*;
import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.StopFilter;
+import org.apache.lucene.analysis.core.FlattenGraphFilter;
+import org.apache.lucene.analysis.core.FlattenGraphFilterFactory;
import org.apache.lucene.analysis.core.KeywordTokenizer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Field;
+import org.apache.lucene.document.TextField;
+import org.apache.lucene.index.DirectoryReader;
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.IndexWriter;
+import org.apache.lucene.index.IndexWriterConfig;
+import org.apache.lucene.search.BooleanClause;
+import org.apache.lucene.search.IndexSearcher;
+import org.apache.lucene.search.Query;
+import org.apache.lucene.search.TopDocs;
+import org.apache.lucene.store.Directory;
+import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.util.IOUtils;
+import org.apache.lucene.util.QueryBuilder;
import org.apache.lucene.util.TestUtil;
import static org.apache.lucene.analysis.miscellaneous.WordDelimiterGraphFilter.*;
@@ -308,6 +325,46 @@ public class TestWordDelimiterGraphFilter extends BaseTokenStreamTestCase {
IOUtils.close(a, a2, a3);
}
+
+ public void testLucene7848() throws Exception { // LUCENE-7848 scenario: a phrase query built through a WordDelimiterGraphFilter analyzer must match a document indexed from the same text
+ CharArraySet protWords = new CharArraySet(Collections.emptySet(), false); // empty set handed to both WordDelimiterGraphFilter instances below
+
+ Analyzer indexingAnalyzer = new Analyzer() { // index side: whitespace tokenizer followed directly by WordDelimiterGraphFilter
+ @Override
+ public TokenStreamComponents createComponents(String field) {
+ Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
+ return new TokenStreamComponents(tokenizer,
+ new WordDelimiterGraphFilter(tokenizer, GENERATE_WORD_PARTS | PRESERVE_ORIGINAL | GENERATE_NUMBER_PARTS | STEM_ENGLISH_POSSESSIVE, protWords));
+ }
+ };
+
+ Analyzer searchAnalyzer = new Analyzer() { // search side: same tokenizer, filter and flags as above, plus a FlattenGraphFilter on top
+ @Override
+ public TokenStreamComponents createComponents(String field) {
+ Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
+ TokenStream filter = new WordDelimiterGraphFilter(tokenizer, GENERATE_WORD_PARTS | PRESERVE_ORIGINAL | GENERATE_NUMBER_PARTS | STEM_ENGLISH_POSSESSIVE, protWords);
+ filter = new FlattenGraphFilter(filter); // NOTE(review): flattening is applied at search time only, not at index time - confirm this is intended
+ return new TokenStreamComponents(tokenizer, filter);
+ }
+ };
+
+ String input = "SPECIAL PROJECTS - xxx,SPECIAL PROJECTS - yyy"; // used both as the indexed field value and as the phrase query text
+ String field = "field";
+ IndexWriterConfig config = new IndexWriterConfig(indexingAnalyzer);
+ try (IndexWriter w = new IndexWriter(new RAMDirectory(), config)) { // NOTE(review): the RAMDirectory is created inline and nothing visible here closes it; the two analyzers are not closed either - confirm
+ Document doc = new Document();
+ doc.add(new TextField(field, input, Field.Store.YES));
+ w.addDocument(doc);
+ w.commit();
+
+ try (DirectoryReader reader = DirectoryReader.open(w, true, true)) { // reader opened from the writer itself
+ IndexSearcher searcher = new IndexSearcher(reader);
+ Query q = new QueryBuilder(searchAnalyzer).createPhraseQuery(field, input); // phrase query produced by the search-side analyzer
+ TopDocs topDocs = searcher.search(q, 10);
+ assertEquals(1, topDocs.totalHits); // the single indexed document must be found
+ }
+ }
+ }
/** concat numbers + words + all */
public void testLotsOfConcatenating() throws Exception {
diff --git a/lucene/core/src/java/org/apache/lucene/util/QueryBuilder.java b/lucene/core/src/java/org/apache/lucene/util/QueryBuilder.java
index f077bfd..20aace7 100644
--- a/lucene/core/src/java/org/apache/lucene/util/QueryBuilder.java
+++ b/lucene/core/src/java/org/apache/lucene/util/QueryBuilder.java
@@ -354,17 +354,25 @@ public class QueryBuilder {
return null;
}
- List<SpanTermQuery> terms = new ArrayList<>();
+ SpanNearQuery.Builder builder = SpanNearQuery.newOrderedNearQuery(field);
+ PositionIncrementAttribute posIncrAtt = in.getAttribute(PositionIncrementAttribute.class);
+ int numClause = 0;
+ in.reset();
while (in.incrementToken()) {
- terms.add(new SpanTermQuery(new Term(field, termAtt.getBytesRef())));
+ if (posIncrAtt.getPositionIncrement() > 1) {
+ builder.addGap(posIncrAtt.getPositionIncrement() - 1);
+ ++ numClause;
+ }
+ ++ numClause;
+ builder.addClause(new SpanTermQuery(new Term(field, termAtt.getBytesRef())));
}
- if (terms.isEmpty()) {
+ if (numClause == 0) {
return null;
- } else if (terms.size() == 1) {
- return terms.get(0);
+ } else if (numClause == 1) {
+ return builder.build().getClauses()[0];
} else {
- return new SpanNearQuery(terms.toArray(new SpanTermQuery[0]), 0, true);
+ return builder.build();
}
}
@@ -549,9 +557,10 @@ public class QueryBuilder {
throws IOException {
source.reset();
GraphTokenStreamFiniteStrings graph = new GraphTokenStreamFiniteStrings(source);
- List<SpanQuery> clauses = new ArrayList<>();
+ SpanNearQuery.Builder builder = SpanNearQuery.newOrderedNearQuery(field).setSlop(phraseSlop);
int[] articulationPoints = graph.articulationPoints();
int lastState = 0;
+ int numClauses = 0;
for (int i = 0; i <= articulationPoints.length; i++) {
int start = lastState;
int end = -1;
@@ -560,6 +569,7 @@ public class QueryBuilder {
}
lastState = end;
final SpanQuery queryPos;
+ int posInc = 1;
if (graph.hasSidePath(start)) {
List<SpanQuery> queries = new ArrayList<>();
Iterator<TokenStream> it = graph.getFiniteStrings(start, end);
@@ -578,6 +588,7 @@ public class QueryBuilder {
} else {
Term[] terms = graph.getTerms(field, start);
assert terms.length > 0;
+ posInc = graph.getPosIncr(start);
if (terms.length == 1) {
queryPos = new SpanTermQuery(terms[0]);
} else {
@@ -591,16 +602,21 @@ public class QueryBuilder {
}
if (queryPos != null) {
- clauses.add(queryPos);
+ if (posInc > 1) {
+ builder.addGap(posInc - 1);
+ ++ numClauses;
+ }
+ builder.addClause(queryPos);
+ ++ numClauses;
}
}
- if (clauses.isEmpty()) {
+ if (numClauses == 0) {
return null;
- } else if (clauses.size() == 1) {
- return clauses.get(0);
+ } else if (numClauses == 1) {
+ return builder.build().getClauses()[0];
} else {
- return new SpanNearQuery(clauses.toArray(new SpanQuery[0]), phraseSlop, true);
+ return builder.build();
}
}
diff --git a/lucene/core/src/java/org/apache/lucene/util/graph/GraphTokenStreamFiniteStrings.java b/lucene/core/src/java/org/apache/lucene/util/graph/GraphTokenStreamFiniteStrings.java
index a700501..e41d326 100644
--- a/lucene/core/src/java/org/apache/lucene/util/graph/GraphTokenStreamFiniteStrings.java
+++ b/lucene/core/src/java/org/apache/lucene/util/graph/GraphTokenStreamFiniteStrings.java
@@ -128,6 +128,20 @@ public final class GraphTokenStreamFiniteStrings {
}
/**
+ * Returns the increment stored in idToInc for the first transition id found leaving {@code state} (1 if that id has no entry), or -1 if no id is found.
+ */
+ public int getPosIncr(int state) {
+ int numT = det.initTransition(state, transition); // numT bounds the loop below; presumably the number of transitions leaving state - confirm against the automaton API
+ for (int i = 0; i < numT; i++) {
+ det.getNextTransition(transition); // loads the next transition into the shared transition object (declared outside this view)
+ for (int id = transition.min; id <= transition.max; id++) {
+ return idToInc.getOrDefault(id, 1); // NOTE(review): unconditional return, so only the first id (transition.min of the first non-empty range) is ever consulted
+ }
+ }
+ return -1; // reached only when no transition yielded an id
+ }
+
+ /**
* Get all finite strings from the automaton.
*/
public Iterator<TokenStream> getFiniteStrings() throws IOException {
diff --git a/lucene/queryparser/src/test/org/apache/lucene/queryparser/classic/TestQueryParser.java b/lucene/queryparser/src/test/org/apache/lucene/queryparser/classic/TestQueryParser.java
index 17107fc..3260e84 100644
--- a/lucene/queryparser/src/test/org/apache/lucene/queryparser/classic/TestQueryParser.java
+++ b/lucene/queryparser/src/test/org/apache/lucene/queryparser/classic/TestQueryParser.java
@@ -19,11 +19,14 @@ package org.apache.lucene.queryparser.classic;
import java.io.IOException;
import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.analysis.MockBytesAnalyzer;
import org.apache.lucene.analysis.MockLowerCaseFilter;
import org.apache.lucene.analysis.MockSynonymAnalyzer;
+import org.apache.lucene.analysis.MockSynonymFilter;
import org.apache.lucene.analysis.MockTokenizer;
+import org.apache.lucene.analysis.StopFilter;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
@@ -571,6 +574,55 @@ public class TestQueryParser extends QueryParserTestBase {
assertEquals(graphAndQuery, dumb.parse("guinea pig cavy"));
}
+ /** Test analyzer: whitespace MockTokenizer (second argument true, presumably lowercasing), then MockSynonymFilter, then a StopFilter removing "the" and "is". */
+ private static class Analyzer3 extends Analyzer {
+ Analyzer3(){
+ super();
+ }
+
+ @Override
+ public TokenStreamComponents createComponents(String fieldName) {
+ Tokenizer tokenizer = new MockTokenizer( MockTokenizer.WHITESPACE, true);
+ CharArraySet stop = StopFilter.makeStopSet("the", "is"); // the two stop words the gap tests rely on
+ return new TokenStreamComponents(tokenizer, new StopFilter(new MockSynonymFilter(tokenizer), stop)); // synonyms are injected first, stop words are removed afterwards
+ }
+ }
+
+ public void testMultiWordSynonymsWithGaps() throws Exception { // checks that stop words removed by Analyzer3 show up as gaps in the parsed span queries
+ QueryParser dumb = new QueryParser("field", new Analyzer3());
+ dumb.setSplitOnWhitespace(false);
+
+ SpanQuery spanGuineaPig = SpanNearQuery.newOrderedNearQuery("field") // case 1, expected side path "guinea pig" preceded by a gap
+ .addGap(1) // one position for the removed stop word "the"
+ .addClause(new SpanTermQuery(new Term("field", "guinea")))
+ .addClause(new SpanTermQuery(new Term("field", "pig")))
+ .setSlop(0)
+ .build();
+ SpanQuery spanCavy = SpanNearQuery.newOrderedNearQuery("field") // case 1, expected synonym path "cavy" with the same leading gap
+ .addGap(1)
+ .addClause(new SpanTermQuery(new Term("field", "cavy")))
+ .setSlop(0)
+ .build();
+ SpanQuery spanOr = new SpanOrQuery(new SpanQuery[]{spanGuineaPig, spanCavy});
+ assertEquals(spanOr, dumb.parse("\"the guinea pig\"")); // a leading stop word yields a gap inside each alternative
+
+ spanGuineaPig = SpanNearQuery.newOrderedNearQuery("field") // case 2, no gap inside the synonym alternatives this time
+ .addClause(new SpanTermQuery(new Term("field", "guinea")))
+ .addClause(new SpanTermQuery(new Term("field", "pig")))
+ .setSlop(0)
+ .build();
+ spanCavy =new SpanTermQuery(new Term("field", "cavy")); // single-term alternative is expected as a bare SpanTermQuery
+
+ spanOr = new SpanOrQuery(new SpanQuery[]{spanGuineaPig, spanCavy});
+ SpanQuery spanPhrase = SpanNearQuery.newOrderedNearQuery("field") // expected outer phrase: this, gap, great, (guinea pig | cavy)
+ .addClause(new SpanTermQuery(new Term("field", "this")))
+ .addGap(2) // two positions for the removed stop words "is" and "the"
+ .addClause(new SpanTermQuery(new Term("field", "great")))
+ .addClause(spanOr)
+ .build();
+ assertEquals(spanPhrase, dumb.parse("\"this is the great guinea pig\"")); // stop words in the middle yield a gap in the outer query
+ }
+
public void testEnableGraphQueries() throws Exception {
QueryParser dumb = new QueryParser("field", new Analyzer1());
dumb.setSplitOnWhitespace(false);