| diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestWordDelimiterGraphFilter.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestWordDelimiterGraphFilter.java |
| index 7516a23..04d6d2b 100644 |
| --- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestWordDelimiterGraphFilter.java |
| +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestWordDelimiterGraphFilter.java |
| @@ -17,16 +17,33 @@ |
| package org.apache.lucene.analysis.miscellaneous; |
| |
| import java.io.IOException; |
| +import java.io.PrintWriter; |
| import java.util.*; |
| |
| import org.apache.lucene.analysis.*; |
| import org.apache.lucene.analysis.CharArraySet; |
| import org.apache.lucene.analysis.StopFilter; |
| +import org.apache.lucene.analysis.core.FlattenGraphFilter; |
| +import org.apache.lucene.analysis.core.FlattenGraphFilterFactory; |
| import org.apache.lucene.analysis.core.KeywordTokenizer; |
| import org.apache.lucene.analysis.standard.StandardAnalyzer; |
| import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; |
| import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; |
| +import org.apache.lucene.document.Document; |
| +import org.apache.lucene.document.Field; |
| +import org.apache.lucene.document.TextField; |
| +import org.apache.lucene.index.DirectoryReader; |
| +import org.apache.lucene.index.IndexReader; |
| +import org.apache.lucene.index.IndexWriter; |
| +import org.apache.lucene.index.IndexWriterConfig; |
| +import org.apache.lucene.search.BooleanClause; |
| +import org.apache.lucene.search.IndexSearcher; |
| +import org.apache.lucene.search.Query; |
| +import org.apache.lucene.search.TopDocs; |
| +import org.apache.lucene.store.Directory; |
| +import org.apache.lucene.store.RAMDirectory; |
| import org.apache.lucene.util.IOUtils; |
| +import org.apache.lucene.util.QueryBuilder; |
| import org.apache.lucene.util.TestUtil; |
| |
| import static org.apache.lucene.analysis.miscellaneous.WordDelimiterGraphFilter.*; |
| @@ -308,6 +325,46 @@ public class TestWordDelimiterGraphFilter extends BaseTokenStreamTestCase { |
| |
| IOUtils.close(a, a2, a3); |
| } |
| + |
| + public void testLucene7848() throws Exception { |
| + CharArraySet protWords = new CharArraySet(Collections.emptySet(), false); |
| + |
| + Analyzer indexingAnalyzer = new Analyzer() { |
| + @Override |
| + public TokenStreamComponents createComponents(String field) { |
| + Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false); |
| + return new TokenStreamComponents(tokenizer, |
| + new WordDelimiterGraphFilter(tokenizer, GENERATE_WORD_PARTS | PRESERVE_ORIGINAL | GENERATE_NUMBER_PARTS | STEM_ENGLISH_POSSESSIVE, protWords)); |
| + } |
| + }; |
| + |
| + Analyzer searchAnalyzer = new Analyzer() { |
| + @Override |
| + public TokenStreamComponents createComponents(String field) { |
| + Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false); |
| + TokenStream filter = new WordDelimiterGraphFilter(tokenizer, GENERATE_WORD_PARTS | PRESERVE_ORIGINAL | GENERATE_NUMBER_PARTS | STEM_ENGLISH_POSSESSIVE, protWords); |
| + filter = new FlattenGraphFilter(filter); |
| + return new TokenStreamComponents(tokenizer, filter); |
| + } |
| + }; |
| + |
| + String input = "SPECIAL PROJECTS - xxx,SPECIAL PROJECTS - yyy"; |
| + String field = "field"; |
| + IndexWriterConfig config = new IndexWriterConfig(indexingAnalyzer); |
| + try (IndexWriter w = new IndexWriter(new RAMDirectory(), config)) { |
| + Document doc = new Document(); |
| + doc.add(new TextField(field, input, Field.Store.YES)); |
| + w.addDocument(doc); |
| + w.commit(); |
| + |
| + try (DirectoryReader reader = DirectoryReader.open(w, true, true)) { |
| + IndexSearcher searcher = new IndexSearcher(reader); |
| + Query q = new QueryBuilder(searchAnalyzer).createPhraseQuery(field, input); |
| + TopDocs topDocs = searcher.search(q, 10); |
| + assertEquals(1, topDocs.totalHits); |
| + } |
| + } |
| + } |
| |
| /** concat numbers + words + all */ |
| public void testLotsOfConcatenating() throws Exception { |
| diff --git a/lucene/core/src/java/org/apache/lucene/util/QueryBuilder.java b/lucene/core/src/java/org/apache/lucene/util/QueryBuilder.java |
| index f077bfd..20aace7 100644 |
| --- a/lucene/core/src/java/org/apache/lucene/util/QueryBuilder.java |
| +++ b/lucene/core/src/java/org/apache/lucene/util/QueryBuilder.java |
| @@ -354,17 +354,25 @@ public class QueryBuilder { |
| return null; |
| } |
| |
| - List<SpanTermQuery> terms = new ArrayList<>(); |
| + SpanNearQuery.Builder builder = SpanNearQuery.newOrderedNearQuery(field); |
| + PositionIncrementAttribute posIncrAtt = in.getAttribute(PositionIncrementAttribute.class); |
| + int numClause = 0; |
| + in.reset(); |
| while (in.incrementToken()) { |
| - terms.add(new SpanTermQuery(new Term(field, termAtt.getBytesRef()))); |
| + if (posIncrAtt.getPositionIncrement() > 1) { |
| + builder.addGap(posIncrAtt.getPositionIncrement() - 1); |
| + ++numClause; |
| + } |
| + ++numClause; |
| + builder.addClause(new SpanTermQuery(new Term(field, termAtt.getBytesRef()))); |
| } |
| |
| - if (terms.isEmpty()) { |
| + if (numClause == 0) { |
| return null; |
| - } else if (terms.size() == 1) { |
| - return terms.get(0); |
| + } else if (numClause == 1) { |
| + return builder.build().getClauses()[0]; |
| } else { |
| - return new SpanNearQuery(terms.toArray(new SpanTermQuery[0]), 0, true); |
| + return builder.build(); |
| } |
| } |
| |
| @@ -549,9 +557,10 @@ public class QueryBuilder { |
| throws IOException { |
| source.reset(); |
| GraphTokenStreamFiniteStrings graph = new GraphTokenStreamFiniteStrings(source); |
| - List<SpanQuery> clauses = new ArrayList<>(); |
| + SpanNearQuery.Builder builder = SpanNearQuery.newOrderedNearQuery(field).setSlop(phraseSlop); |
| int[] articulationPoints = graph.articulationPoints(); |
| int lastState = 0; |
| + int numClauses = 0; |
| for (int i = 0; i <= articulationPoints.length; i++) { |
| int start = lastState; |
| int end = -1; |
| @@ -560,6 +569,7 @@ public class QueryBuilder { |
| } |
| lastState = end; |
| final SpanQuery queryPos; |
| + int posInc = 1; |
| if (graph.hasSidePath(start)) { |
| List<SpanQuery> queries = new ArrayList<>(); |
| Iterator<TokenStream> it = graph.getFiniteStrings(start, end); |
| @@ -578,6 +588,7 @@ public class QueryBuilder { |
| } else { |
| Term[] terms = graph.getTerms(field, start); |
| assert terms.length > 0; |
| + posInc = graph.getPosIncr(start); |
| if (terms.length == 1) { |
| queryPos = new SpanTermQuery(terms[0]); |
| } else { |
| @@ -591,16 +602,21 @@ public class QueryBuilder { |
| } |
| |
| if (queryPos != null) { |
| - clauses.add(queryPos); |
| + if (posInc > 1) { |
| + builder.addGap(posInc - 1); |
| + ++numClauses; |
| + } |
| + builder.addClause(queryPos); |
| + ++numClauses; |
| } |
| } |
| |
| - if (clauses.isEmpty()) { |
| + if (numClauses == 0) { |
| return null; |
| - } else if (clauses.size() == 1) { |
| - return clauses.get(0); |
| + } else if (numClauses == 1) { |
| + return builder.build().getClauses()[0]; |
| } else { |
| - return new SpanNearQuery(clauses.toArray(new SpanQuery[0]), phraseSlop, true); |
| + return builder.build(); |
| } |
| } |
| |
| diff --git a/lucene/core/src/java/org/apache/lucene/util/graph/GraphTokenStreamFiniteStrings.java b/lucene/core/src/java/org/apache/lucene/util/graph/GraphTokenStreamFiniteStrings.java |
| index a700501..e41d326 100644 |
| --- a/lucene/core/src/java/org/apache/lucene/util/graph/GraphTokenStreamFiniteStrings.java |
| +++ b/lucene/core/src/java/org/apache/lucene/util/graph/GraphTokenStreamFiniteStrings.java |
| @@ -128,6 +128,20 @@ public final class GraphTokenStreamFiniteStrings { |
| } |
| |
| /** |
| + * Returns the position increment of the first transition leaving {@code state}, |
| + * or -1 if the state has no outgoing transitions. |
| + */ |
| + public int getPosIncr(int state) { |
| + int numT = det.initTransition(state, transition); |
| + if (numT > 0) { |
| + // only the first transition's first label is ever consulted |
| + det.getNextTransition(transition); |
| + return idToInc.getOrDefault(transition.min, 1); |
| + } |
| + return -1; |
| + } |
| + |
| + /** |
| * Get all finite strings from the automaton. |
| */ |
| public Iterator<TokenStream> getFiniteStrings() throws IOException { |
| diff --git a/lucene/queryparser/src/test/org/apache/lucene/queryparser/classic/TestQueryParser.java b/lucene/queryparser/src/test/org/apache/lucene/queryparser/classic/TestQueryParser.java |
| index 17107fc..3260e84 100644 |
| --- a/lucene/queryparser/src/test/org/apache/lucene/queryparser/classic/TestQueryParser.java |
| +++ b/lucene/queryparser/src/test/org/apache/lucene/queryparser/classic/TestQueryParser.java |
| @@ -19,11 +19,14 @@ package org.apache.lucene.queryparser.classic; |
| import java.io.IOException; |
| |
| import org.apache.lucene.analysis.Analyzer; |
| +import org.apache.lucene.analysis.CharArraySet; |
| import org.apache.lucene.analysis.MockAnalyzer; |
| import org.apache.lucene.analysis.MockBytesAnalyzer; |
| import org.apache.lucene.analysis.MockLowerCaseFilter; |
| import org.apache.lucene.analysis.MockSynonymAnalyzer; |
| +import org.apache.lucene.analysis.MockSynonymFilter; |
| import org.apache.lucene.analysis.MockTokenizer; |
| +import org.apache.lucene.analysis.StopFilter; |
| import org.apache.lucene.analysis.TokenFilter; |
| import org.apache.lucene.analysis.TokenStream; |
| import org.apache.lucene.analysis.Tokenizer; |
| @@ -571,6 +574,55 @@ public class TestQueryParser extends QueryParserTestBase { |
| assertEquals(graphAndQuery, dumb.parse("guinea pig cavy")); |
| } |
| |
| + /** whitespace+lowercase+stopword analyzer with synonyms */ |
| + private static class Analyzer3 extends Analyzer { |
| + Analyzer3() { |
| + super(); |
| + } |
| + |
| + @Override |
| + public TokenStreamComponents createComponents(String fieldName) { |
| + Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, true); |
| + CharArraySet stop = StopFilter.makeStopSet("the", "is"); |
| + return new TokenStreamComponents(tokenizer, new StopFilter(new MockSynonymFilter(tokenizer), stop)); |
| + } |
| + } |
| + |
| + public void testMultiWordSynonymsWithGaps() throws Exception { |
| + QueryParser dumb = new QueryParser("field", new Analyzer3()); |
| + dumb.setSplitOnWhitespace(false); |
| + |
| + SpanQuery spanGuineaPig = SpanNearQuery.newOrderedNearQuery("field") |
| + .addGap(1) |
| + .addClause(new SpanTermQuery(new Term("field", "guinea"))) |
| + .addClause(new SpanTermQuery(new Term("field", "pig"))) |
| + .setSlop(0) |
| + .build(); |
| + SpanQuery spanCavy = SpanNearQuery.newOrderedNearQuery("field") |
| + .addGap(1) |
| + .addClause(new SpanTermQuery(new Term("field", "cavy"))) |
| + .setSlop(0) |
| + .build(); |
| + SpanQuery spanOr = new SpanOrQuery(new SpanQuery[]{spanGuineaPig, spanCavy}); |
| + assertEquals(spanOr, dumb.parse("\"the guinea pig\"")); |
| + |
| + spanGuineaPig = SpanNearQuery.newOrderedNearQuery("field") |
| + .addClause(new SpanTermQuery(new Term("field", "guinea"))) |
| + .addClause(new SpanTermQuery(new Term("field", "pig"))) |
| + .setSlop(0) |
| + .build(); |
| + spanCavy = new SpanTermQuery(new Term("field", "cavy")); |
| + |
| + spanOr = new SpanOrQuery(new SpanQuery[]{spanGuineaPig, spanCavy}); |
| + SpanQuery spanPhrase = SpanNearQuery.newOrderedNearQuery("field") |
| + .addClause(new SpanTermQuery(new Term("field", "this"))) |
| + .addGap(2) |
| + .addClause(new SpanTermQuery(new Term("field", "great"))) |
| + .addClause(spanOr) |
| + .build(); |
| + assertEquals(spanPhrase, dumb.parse("\"this is the great guinea pig\"")); |
| + } |
| + |
| public void testEnableGraphQueries() throws Exception { |
| QueryParser dumb = new QueryParser("field", new Analyzer1()); |
| dumb.setSplitOnWhitespace(false); |