blob: 80bb1e7488757565206a0d04faa82e0e439edefe [file] [log] [blame]
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestWordDelimiterFilter.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestWordDelimiterFilter.java
index a22d9c9..a682348 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestWordDelimiterFilter.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestWordDelimiterFilter.java
@@ -131,6 +131,29 @@ public class TestWordDelimiterFilter extends BaseTokenStreamTestCase {
@Test
public void testSplits() throws Exception {
+ int flags = GENERATE_WORD_PARTS;
+ WordDelimiterFilter wdf = new WordDelimiterFilter(new CannedTokenStream(new Token("basic-split", 0, 11)), DEFAULT_WORD_DELIM_TABLE, flags, null);
+
+ assertTokenStreamContents(wdf,
+ new String[] { "basic", "split"},
+ new int[] { 0, 6 },
+ new int[] { 5, 11 },
+ null,
+ new int[] { 1, 1 },
+ new int[] { 1, 1 }, null, false);
+
+ flags = GENERATE_WORD_PARTS | PRESERVE_ORIGINAL;
+ wdf = new WordDelimiterFilter(new CannedTokenStream(new Token("basic-split", 0, 11)), DEFAULT_WORD_DELIM_TABLE, flags, null);
+
+ assertTokenStreamContents(wdf,
+ new String[] { "basic-split", "basic", "split"},
+ new int[] { 0, 0, 6 },
+ new int[] { 11, 5, 11 },
+ null,
+ new int[] { 1, 0, 1},
+ new int[] { 1, 1, 1}, null, false);
+
+
doSplit("basic-split","basic","split");
doSplit("camelCase","camel","Case");
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/synonym/TestSynonymMapFilter.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/synonym/TestSynonymMapFilter.java
index 0fbbd2e..c75c1ae 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/synonym/TestSynonymMapFilter.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/synonym/TestSynonymMapFilter.java
@@ -943,6 +943,52 @@ public class TestSynonymMapFilter extends BaseTokenStreamTestCase {
a.close();
}
+
+ /**
+ * Checks the tokens {@link SynonymFilter} emits when the single input token "wifi"
+ * is mapped to multi-token synonyms while the original token is kept: first with one
+ * rule ("wi fi"), then with three rules and a trailing input token ("connection").
+ */
+ public void testSplit() throws Exception {
+ // Scenario 1: a single rule, wifi -> "wi fi"; b and add(...) are members declared outside this view
+ b = new SynonymMap.Builder(true);
+ final boolean keepOrig = true;
+ add("wifi", "wi fi", keepOrig);
+ final SynonymMap map = b.build();
+ Analyzer a = new Analyzer() {
+ @Override
+ protected TokenStreamComponents createComponents(String fieldName) {
+ Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
+ return new TokenStreamComponents(tokenizer, new SynonymFilter(tokenizer, map, true));
+ }
+ };
+
+ assertAnalyzesTo(a, "wifi",
+ new String[] { "wifi", "wi", "fi" }, // expected terms
+ new int[] { 0, 0, 0 }, // start offset
+ new int[] { 4, 4, 4 }, // end offset
+ null, // types (not checked)
+ new int[] { 1, 0, 1 }, // position increment: "wi" is stacked on "wifi", "fi" follows
+ new int[] { 1, 1, 1 }); // position length
+ a.close();
+
+ // Scenario 2: three rules for the same input token, followed by a second input token
+ b = new SynonymMap.Builder(true);
+ add("wifi", "wi fi", keepOrig);
+ add("wifi", "hotspot", keepOrig);
+ add("wifi", "wide fidelity", keepOrig); // NOTE(review): original comment was just "not" (truncated); presumably "not a split of the original token" - confirm
+ final SynonymMap map2 = b.build();
+ a = new Analyzer() {
+ @Override
+ protected TokenStreamComponents createComponents(String fieldName) {
+ Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
+ return new TokenStreamComponents(tokenizer, new SynonymFilter(tokenizer, map2, true));
+ }
+ };
+
+ assertAnalyzesTo(a, "wifi connection",
+ new String[] { "wifi", "wi", "hotspot", "wide", "connection", "fi", "fidelity" }, // expected terms
+ new int[] { 0, 0, 0, 0, 5, 5, 5 }, // start offset
+ new int[] { 4, 4, 4, 4, 15, 15, 15 }, // end offset
+ null, // types (not checked)
+ new int[] { 1, 0, 0, 0, 1, 0, 0 }, // position increment: "fi" and "fidelity" are stacked on "connection"
+ new int[] { 1, 1, 1, 1, 1, 1, 1 }); // position length
+ a.close();
+ }
+
public void testEmpty() throws Exception {
Tokenizer tokenizer = new MockTokenizer();
tokenizer.setReader(new StringReader("aa bb"));
diff --git a/lucene/core/src/java/org/apache/lucene/util/QueryBuilder.java b/lucene/core/src/java/org/apache/lucene/util/QueryBuilder.java
index 309150e..d2f44d7 100644
--- a/lucene/core/src/java/org/apache/lucene/util/QueryBuilder.java
+++ b/lucene/core/src/java/org/apache/lucene/util/QueryBuilder.java
@@ -18,12 +18,16 @@ package org.apache.lucene.util;
import java.io.IOException;
+import java.io.PrintWriter;
import java.util.ArrayList;
import java.util.List;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.CachingTokenFilter;
import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute;
import org.apache.lucene.index.Term;
@@ -85,7 +89,7 @@ public class QueryBuilder {
if (operator != BooleanClause.Occur.SHOULD && operator != BooleanClause.Occur.MUST) {
throw new IllegalArgumentException("invalid operator: only SHOULD or MUST are allowed");
}
- return createFieldQuery(analyzer, operator, field, queryText, false, 0);
+ return createFieldQuery(analyzer, operator, field, queryText, false, false, 0);
}
/**
@@ -111,7 +115,7 @@ public class QueryBuilder {
* {@code MultiPhraseQuery}, based on the analysis of {@code queryText}
*/
public Query createPhraseQuery(String field, String queryText, int phraseSlop) {
- return createFieldQuery(analyzer, BooleanClause.Occur.MUST, field, queryText, true, phraseSlop);
+ // quoted=true, autoGeneratePhraseQueries=false: an explicit phrase request does not use the auto-phrase branch
+ return createFieldQuery(analyzer, BooleanClause.Occur.MUST, field, queryText, true, false, phraseSlop);
}
/**
@@ -133,7 +137,7 @@ public class QueryBuilder {
return createBooleanQuery(field, queryText, BooleanClause.Occur.MUST);
}
- Query query = createFieldQuery(analyzer, BooleanClause.Occur.SHOULD, field, queryText, false, 0);
+ Query query = createFieldQuery(analyzer, BooleanClause.Occur.SHOULD, field, queryText, false, false, 0);
if (query instanceof BooleanQuery) {
BooleanQuery bq = (BooleanQuery) query;
BooleanQuery.Builder builder = new BooleanQuery.Builder();
@@ -196,7 +200,8 @@ public class QueryBuilder {
* @param quoted true if phrases should be generated when terms occur at more than one position
* @param phraseSlop slop factor for phrase/multiphrase queries
*/
- protected final Query createFieldQuery(Analyzer analyzer, BooleanClause.Occur operator, String field, String queryText, boolean quoted, int phraseSlop) {
+ protected final Query createFieldQuery(Analyzer analyzer, BooleanClause.Occur operator, String field, String queryText,
+ boolean quoted, boolean autoGeneratePhraseQueries, int phraseSlop) {
assert operator == BooleanClause.Occur.SHOULD || operator == BooleanClause.Occur.MUST;
// Use the analyzer to get all the tokens, and then build an appropriate
@@ -252,12 +257,16 @@ public class QueryBuilder {
if (positionCount == 1) {
// only one position, with synonyms
return analyzeBoolean(field, stream);
+ } else if (autoGeneratePhraseQueries) {
+ // complex case: multiple positions
+ return analyzeAutoPhraseMultiBoolean(field, queryText, stream, operator);
} else {
// complex case: multiple positions
return analyzeMultiBoolean(field, stream, operator);
}
}
- } catch (IOException e) {
+ } catch (Exception e) {
+ e.printStackTrace();
throw new RuntimeException("Error analyzing query text", e);
}
}
@@ -325,6 +334,140 @@ public class QueryBuilder {
return q.build();
}
+ /** Matches one whitespace-separated token; the whitespace set comes from classic QueryParser.jj. */
+ private static final Pattern NON_WHITESPACE_PATTERN = Pattern.compile("[^ \t\n\r\u3000]+");
+
+ /**
+ * Creates complex boolean query from the cached tokenstream contents
+ * when autoGeneratePhraseQueries=true, generating phrase queries for
+ * non-overlapping tokens split from a single whitespace-separated original token.
+ * <p>
+ * The raw query text is first cut into whitespace-separated tokens. Each analyzed token is
+ * then assigned to one of them by its offsets. Analyzed tokens belonging to the same
+ * whitespace-separated token are buffered and turned into one clause by
+ * {@code addSplitTerms}; a new clause starts whenever the owning token changes or the
+ * position increment is greater than one.
+ *
+ * @param field field the generated terms are created for
+ * @param queryText raw query text; analyzed-token offsets are resolved against it
+ * @param stream analyzed token stream; it is reset and consumed here
+ * @param operator occurrence used to combine the generated clauses
+ * @return the single generated clause when exactly one was produced, otherwise a boolean
+ * query holding every generated clause with {@code operator}
+ * @throws Exception if {@code queryText} has no whitespace-separated token, if an analyzed
+ * token offset lies outside every whitespace-separated token, or if the
+ * token stream fails
+ */
+ private Query analyzeAutoPhraseMultiBoolean
+ (String field, String queryText, TokenStream stream, BooleanClause.Occur operator) throws Exception {
+ // Record the start/end offsets of every whitespace-separated token in the raw query text
+ Matcher whitespaceSeparatedToken = NON_WHITESPACE_PATTERN.matcher(queryText);
+ List<Integer> wsTokStarts = new ArrayList<>();
+ List<Integer> wsTokEnds = new ArrayList<>();
+ while (whitespaceSeparatedToken.find()) {
+ wsTokStarts.add(whitespaceSeparatedToken.start());
+ wsTokEnds.add(whitespaceSeparatedToken.end());
+ }
+ if (wsTokStarts.isEmpty()) {
+ throw new Exception("Zero whitespace-separated tokens in query text '" + queryText + "'");
+ }
+ int prevWsTokNumStart = 0;
+ int prevAnalyzedTokenPosIncr = 1;
+ List<Query> q = new ArrayList<>();
+ List<Term> splitTerms = new ArrayList<>();
+ List<Term> overlappingTerms = new ArrayList<>();
+ TermToBytesRefAttribute analyzedTokenTermAtt = stream.getAttribute(TermToBytesRefAttribute.class);
+ PositionIncrementAttribute analyzedTokenPosIncrAtt = stream.getAttribute(PositionIncrementAttribute.class);
+ OffsetAttribute analyzedTokenOffsetAtt = stream.getAttribute(OffsetAttribute.class);
+
+ stream.reset();
+ while (stream.incrementToken()) {
+ int wsTokNumStart = getWsTokNum(analyzedTokenOffsetAtt.startOffset(), wsTokStarts, wsTokEnds);
+ int wsTokNumEnd = getWsTokNum(analyzedTokenOffsetAtt.endOffset(), wsTokStarts, wsTokEnds);
+ if (wsTokNumStart != wsTokNumEnd || wsTokNumStart != prevWsTokNumStart) {
+ // Current analyzed token is not from the same whitespace-separated token as the previous analyzed token
+ // First, create queries from buffered split & overlapping tokens
+ addSplitTerms(q, splitTerms, overlappingTerms);
+ }
+ // Read the increment once; it drives both the switch and the state carried to the next token
+ int posIncr = analyzedTokenPosIncrAtt.getPositionIncrement();
+ switch (posIncr) {
+ case 0: {
+ if ( ! splitTerms.isEmpty() && prevAnalyzedTokenPosIncr > 0) {
+ // assumption: first overlapping token is the original token from which following tokens were split
+ overlappingTerms.add(splitTerms.remove(splitTerms.size() - 1)); // trailing -> overlapping
+ }
+ splitTerms.add(new Term(field, analyzedTokenTermAtt.getBytesRef()));
+ break;
+ }
+ case 1: {
+ // adjacent position: extends the current run of split terms
+ splitTerms.add(new Term(field, analyzedTokenTermAtt.getBytesRef()));
+ break;
+ }
+ default: {
+ // any other increment: close the buffered run before starting a new one
+ addSplitTerms(q, splitTerms, overlappingTerms);
+ splitTerms.add(new Term(field, analyzedTokenTermAtt.getBytesRef()));
+ break;
+ }
+ }
+ prevWsTokNumStart = wsTokNumStart;
+ prevAnalyzedTokenPosIncr = posIncr;
+ }
+ // Flush whatever is still buffered after the last analyzed token
+ addSplitTerms(q, splitTerms, overlappingTerms);
+
+ if (q.size() == 1) { // Don't wrap a lone query
+ return q.get(0);
+ } else {
+ BooleanQuery.Builder builder = newBooleanQuery();
+ q.forEach(query -> builder.add(query, operator));
+ return builder.build();
+ }
+ }
+
+ /**
+ * Turns the buffered terms into one query clause appended to {@code q}, then empties both
+ * buffers. Exactly one clause is added unless both buffers are empty, in which case nothing
+ * happens.
+ * <ul>
+ * <li>no split terms: a term query for a single overlapping term, a synonym query for several</li>
+ * <li>one split term: a term query, or a synonym query over the overlapping terms plus that term</li>
+ * <li>several split terms: a phrase query over them, combined as SHOULD clauses with the
+ * overlapping term(s) when there are any</li>
+ * </ul>
+ *
+ * @param q list receiving the generated clause
+ * @param splitTerms buffered terms forming a run; cleared on return
+ * @param overlappingTerms buffered terms that overlap the run; cleared on return
+ */
+ private void addSplitTerms(List<Query> q, List<Term> splitTerms, List<Term> overlappingTerms) {
+ if (splitTerms.isEmpty()) {
+ if (overlappingTerms.isEmpty()) {
+ return;
+ } else if (overlappingTerms.size() == 1) {
+ q.add(newTermQuery(overlappingTerms.get(0)));
+ } else {
+ q.add(newSynonymQuery(overlappingTerms.toArray(new Term[overlappingTerms.size()])));
+ }
+ // "else if" below: without it this case fell through and also added an empty phrase query
+ } else if (splitTerms.size() == 1) {
+ if (overlappingTerms.isEmpty()) {
+ q.add(newTermQuery(splitTerms.get(0)));
+ } else {
+ Term[] terms = new Term[overlappingTerms.size() + 1];
+ overlappingTerms.toArray(terms);
+ terms[overlappingTerms.size()] = splitTerms.get(0);
+ // go through the factory method, like the other synonym clauses in this method
+ q.add(newSynonymQuery(terms));
+ }
+ } else {
+ PhraseQuery.Builder pqBuilder = new PhraseQuery.Builder();
+ splitTerms.forEach(pqBuilder::add);
+ Query splitTermsQuery = pqBuilder.build();
+ if (overlappingTerms.isEmpty()) {
+ q.add(splitTermsQuery);
+ } else {
+ // either the overlapping term(s) or the phrase may match
+ BooleanQuery.Builder bqBuilder = newBooleanQuery();
+ BooleanClause.Occur operator = BooleanClause.Occur.SHOULD;
+ if (overlappingTerms.size() == 1) {
+ bqBuilder.add(newTermQuery(overlappingTerms.get(0)), operator);
+ } else {
+ bqBuilder.add(newSynonymQuery(overlappingTerms.toArray(new Term[overlappingTerms.size()])), operator);
+ }
+ bqBuilder.add(splitTermsQuery, operator);
+ q.add(bqBuilder.build());
+ }
+ }
+ splitTerms.clear();
+ overlappingTerms.clear();
+ }
+
+ /**
+ * Finds the whitespace-separated token whose bounds contain the given offset.
+ * Both bounds are inclusive, and the first token that matches wins.
+ *
+ * @param offset character offset to locate
+ * @param tokStart start offsets of the tokens
+ * @param tokEnd end offsets of the tokens, parallel to {@code tokStart}
+ * @return index of the containing token
+ * @throws Exception if no token contains {@code offset}; the message lists every token's bounds
+ */
+ private int getWsTokNum(int offset, List<Integer> tokStart, List<Integer> tokEnd) throws Exception {
+ final int tokCount = tokStart.size();
+ for (int idx = 0; idx < tokCount; idx++) {
+ boolean outside = offset < tokStart.get(idx) || offset > tokEnd.get(idx);
+ if ( ! outside) {
+ return idx;
+ }
+ }
+ // No token matched: list every (start,end) pair in the failure message
+ StringBuilder bounds = new StringBuilder();
+ for (int idx = 0; idx < tokCount; idx++) {
+ if (bounds.length() > 0) {
+ bounds.append(", ");
+ }
+ bounds.append("(").append(tokStart.get(idx)).append(",").append(tokEnd.get(idx)).append(")");
+ }
+ throw new Exception("offset " + offset + " is not within whitespace-separated token bounds: " + bounds);
+ }
+
+
/**
* Creates simple phrase query from the cached tokenstream contents
*/
diff --git a/lucene/queryparser/src/java/org/apache/lucene/queryparser/classic/QueryParserBase.java b/lucene/queryparser/src/java/org/apache/lucene/queryparser/classic/QueryParserBase.java
index fbe08a9..bd00cb3 100644
--- a/lucene/queryparser/src/java/org/apache/lucene/queryparser/classic/QueryParserBase.java
+++ b/lucene/queryparser/src/java/org/apache/lucene/queryparser/classic/QueryParserBase.java
@@ -472,7 +472,7 @@ public abstract class QueryParserBase extends QueryBuilder implements CommonQuer
*/
protected Query newFieldQuery(Analyzer analyzer, String field, String queryText, boolean quoted) throws ParseException {
BooleanClause.Occur occur = operator == Operator.AND ? BooleanClause.Occur.MUST : BooleanClause.Occur.SHOULD;
- return createFieldQuery(analyzer, occur, field, queryText, quoted || autoGeneratePhraseQueries, phraseSlop);
+ // quoted and autoGeneratePhraseQueries are now passed as separate flags instead of being OR-ed into one
+ return createFieldQuery(analyzer, occur, field, queryText, quoted, autoGeneratePhraseQueries, phraseSlop);
}
diff --git a/lucene/queryparser/src/test/org/apache/lucene/queryparser/util/QueryParserTestBase.java b/lucene/queryparser/src/test/org/apache/lucene/queryparser/util/QueryParserTestBase.java
index 2170193..f5b55a4 100644
--- a/lucene/queryparser/src/test/org/apache/lucene/queryparser/util/QueryParserTestBase.java
+++ b/lucene/queryparser/src/test/org/apache/lucene/queryparser/util/QueryParserTestBase.java
@@ -330,6 +330,15 @@ public abstract class QueryParserTestBase extends LuceneTestCase {
CommonQueryParserConfiguration qp = getParserConfig(analyzer);
setAutoGeneratePhraseQueries(qp, true);
assertEquals(expected, getQuery("中国",qp));
+
+ qp = getParserConfig(new MockSynonymAnalyzer());
+ setAutoGeneratePhraseQueries(qp, true); //nocommit
+ assertQueryEquals(qp, "field", "guinea pig", "(guinea cavy) pig"); // should NOT generate a phrase query
+ assertQueryEquals(qp, "field", "wi-fi", "wi-fi \"wi fi\"");
+
+ qp = getParserConfig(null);
+ setAutoGeneratePhraseQueries(qp, true);
+ assertQueryEquals(qp, "field", "one two", "one two"); // should NOT generate a phrase query
}
public void testSimple() throws Exception {
diff --git a/lucene/test-framework/src/java/org/apache/lucene/analysis/MockSynonymFilter.java b/lucene/test-framework/src/java/org/apache/lucene/analysis/MockSynonymFilter.java
index 1d8b513..8e87f93 100644
--- a/lucene/test-framework/src/java/org/apache/lucene/analysis/MockSynonymFilter.java
+++ b/lucene/test-framework/src/java/org/apache/lucene/analysis/MockSynonymFilter.java
@@ -56,6 +56,14 @@ public class MockSynonymFilter extends TokenFilter {
if (endOfInput == false && input.incrementToken()) {
if (termAtt.toString().equals("dogs")) {
addSynonymAndRestoreOrigToken("dog", 1, offsetAtt.endOffset());
+ } else if (termAtt.toString().equals("wi-fi")) {
+ AttributeSource origToken = cloneAttributes();
+ addSynonym("wi", 1, offsetAtt.endOffset());
+ origToken.copyTo(this);
+ termAtt.setEmpty().append("fi");
+ tokenQueue.add(cloneAttributes());
+ origToken.copyTo(this); // restore "wi-fi"
+ posLenAtt.setPositionLength(2);
} else if (termAtt.toString().equals("guinea")) {
AttributeSource firstSavedToken = cloneAttributes();
if (input.incrementToken()) {
@@ -68,6 +76,17 @@ public class MockSynonymFilter extends TokenFilter {
} else if (termAtt.toString().equals("dogs")) {
tokenQueue.add(cloneAttributes());
addSynonym("dog", 1, offsetAtt.endOffset());
+ } else if (termAtt.toString().equals("wi-fi")) {
+ AttributeSource origToken = cloneAttributes();
+ posLenAtt.setPositionLength(2);
+ tokenQueue.add(cloneAttributes());
+ origToken.copyTo(this);
+ addSynonym("wi", 1, offsetAtt.endOffset());
+ origToken.copyTo(this);
+ termAtt.setEmpty().append("fi");
+ tokenQueue.add(cloneAttributes());
+ } else {
+ tokenQueue.add(cloneAttributes());
}
} else {
endOfInput = true;
diff --git a/lucene/test-framework/src/test/org/apache/lucene/analysis/TestMockSynonymFilter.java b/lucene/test-framework/src/test/org/apache/lucene/analysis/TestMockSynonymFilter.java
index fb0d065..823ae6c 100644
--- a/lucene/test-framework/src/test/org/apache/lucene/analysis/TestMockSynonymFilter.java
+++ b/lucene/test-framework/src/test/org/apache/lucene/analysis/TestMockSynonymFilter.java
@@ -103,6 +103,15 @@ public class TestMockSynonymFilter extends BaseTokenStreamTestCase {
new int[]{1, 1, 1}, // position length
true); // check that offsets are correct
+ assertAnalyzesTo(analyzer, "guinea fowl",
+ new String[]{"guinea", "fowl"},
+ new int[]{0, 7}, // start offset
+ new int[]{6, 11}, // end offset
+ null,
+ new int[]{1, 1}, // position increment
+ new int[]{1, 1}, // position length
+ true); // check that offsets are correct
+
assertAnalyzesTo(analyzer, "dogs guinea",
new String[]{"dogs", "dog", "guinea"},
new int[]{0, 0, 5}, // start offset
@@ -147,5 +156,32 @@ public class TestMockSynonymFilter extends BaseTokenStreamTestCase {
new int[]{1, 1, 0, 1, 1, 1, 0, 1}, // position increment
new int[]{1, 1, 2, 1, 1, 1, 1, 1}, // position length
true); // check that offsets are correct
+
+ assertAnalyzesTo(analyzer, "wi-fi",
+ new String[]{"wi-fi", "wi", "fi"},
+ new int[]{0, 0, 0}, // start offset
+ new int[]{5, 5, 5}, // end offset
+ null,
+ new int[]{1, 0, 1}, // position increment
+ new int[]{2, 1, 1}, // position length
+ true); // check that offsets are correct
+
+ assertAnalyzesTo(analyzer, "guinea wi-fi",
+ new String[]{"guinea", "wi-fi", "wi", "fi"},
+ new int[]{0, 7, 7, 7}, // start offset
+ new int[]{6, 12, 12, 12}, // end offset
+ null,
+ new int[]{1, 1, 0, 1}, // position increment
+ new int[]{1, 2, 1, 1}, // position length
+ true); // check that offsets are correct
+
+ assertAnalyzesTo(analyzer, "wi-fi dogs",
+ new String[]{"wi-fi", "wi", "fi", "dogs", "dog"},
+ new int[]{0, 0, 0, 6, 6}, // start offset
+ new int[]{5, 5, 5, 10, 10}, // end offset
+ null,
+ new int[]{1, 0, 1, 1, 0}, // position increment
+ new int[]{2, 1, 1, 1, 1}, // position length
+ true); // check that offsets are correct
}
}
diff --git a/solr/core/src/java/org/apache/solr/parser/SolrQueryParserBase.java b/solr/core/src/java/org/apache/solr/parser/SolrQueryParserBase.java
index 24a6f3e..e8877fe 100644
--- a/solr/core/src/java/org/apache/solr/parser/SolrQueryParserBase.java
+++ b/solr/core/src/java/org/apache/solr/parser/SolrQueryParserBase.java
@@ -368,7 +368,7 @@ public abstract class SolrQueryParserBase extends QueryBuilder {
protected Query newFieldQuery(Analyzer analyzer, String field, String queryText, boolean quoted) throws SyntaxError {
BooleanClause.Occur occur = operator == Operator.AND ? BooleanClause.Occur.MUST : BooleanClause.Occur.SHOULD;
- return createFieldQuery(analyzer, occur, field, queryText, quoted || autoGeneratePhraseQueries, phraseSlop);
+ // quoted and autoGeneratePhraseQueries are now passed as separate flags instead of being OR-ed into one
+ return createFieldQuery(analyzer, occur, field, queryText, quoted, autoGeneratePhraseQueries, phraseSlop);
}