blob: 9acf827458914d4edc8b7b3e12acb066b9cd3d58 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.solr.handler.component;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;
import java.util.Map;
import java.util.TreeMap;
import java.util.function.BiConsumer;
import java.util.stream.Collectors;
import java.util.stream.IntStream;
import java.util.stream.Stream;
import org.apache.solr.SolrTestCaseJ4;
import org.apache.solr.handler.component.PhrasesIdentificationComponent.Phrase;
import org.apache.solr.common.SolrException;
import org.apache.solr.common.SolrException.ErrorCode;
import org.apache.solr.common.params.ShardParams;
import org.apache.solr.request.SolrQueryRequest;
import org.apache.solr.schema.IndexSchema;
import org.apache.solr.schema.SchemaField;
import org.junit.After;
import org.junit.BeforeClass;
import org.junit.Before;
import org.hamcrest.Description;
import org.hamcrest.Matcher;
import org.hamcrest.BaseMatcher;
public class PhrasesIdentificationComponentTest extends SolrTestCaseJ4 {
private static final String HANDLER = "/phrases";
@BeforeClass
public static void beforeClass() throws Exception {
initCore("solrconfig-phrases-identification.xml","schema-phrases-identification.xml");
}
@Before
public void addSomeDocs() throws Exception {
assertU(adoc("id", "42",
"title","Tale of the Brown Fox: was he lazy?",
"body", "No. The quick brown fox was a very brown fox who liked to get into trouble."));
assertU(adoc("id", "43",
"title","A fable in two acts",
"body", "The brOwn fOx jumped. The lazy dog did not"));
assertU(adoc("id", "44",
"title","Why the LazY dog was lazy",
"body", "News flash: Lazy Dog was not actually lazy, it just seemd so compared to Fox"));
assertU(adoc("id", "45",
"title","Why Are We Lazy?",
"body", "Because we are. that's why"));
assertU((commit()));
}
@After
public void deleteAllDocs() throws Exception {
assertU(delQ("*:*"));
assertU((commit()));
}
public void testWhiteBoxPhraseParsingLongInput() throws Exception {
final SchemaField field = h.getCore().getLatestSchema().getField("multigrams_body");
assertNotNull(field);
final List<Phrase> phrases = Phrase.extractPhrases
(" did a Quick brown FOX perniciously jump over the lAZy dog", field, 3, 7);
assertEquals(IntStream.rangeClosed((11-7+1), 11).sum(), // 11 words, max query phrase size is 7
phrases.size());
// spot check a few explicitly choosen phrases of various lengths...
{ // single term, close to edge so not as many super phrases as other terms might have
final Phrase lazy = phrases.get(phrases.size() - 1 - 2);
final String debug = lazy.toString();
assertEquals(debug, "lAZy", lazy.getSubSequence());
assertEquals(debug, 10, lazy.getPositionStart());
assertEquals(debug, 11, lazy.getPositionEnd());
assertEquals(debug, 1, lazy.getPositionLength());
assertEquals(debug, 54, lazy.getOffsetStart());
assertEquals(debug, 58, lazy.getOffsetEnd());
assertEquals(debug, 1, lazy.getIndividualIndexedTerms().size());
assertEquals(debug, 1, lazy.getLargestIndexedSubPhrases().size());
assertEquals(debug, lazy, lazy.getIndividualIndexedTerms().get(0));
assertEquals(debug, lazy, lazy.getLargestIndexedSubPhrases().get(0));
assertEquals(debug, 4, lazy.getIndexedSuperPhrases().size()); // (2 each: len=2, len=3)
}
{ // length 2, middle of the pack
final Phrase brown_fox = phrases.get((7 * 3) + 1);
final String debug = brown_fox.toString();
assertEquals(debug, "brown FOX", brown_fox.getSubSequence());
assertEquals(debug, 4, brown_fox.getPositionStart());
assertEquals(debug, 6, brown_fox.getPositionEnd());
assertEquals(debug, 2, brown_fox.getPositionLength());
assertEquals(debug, 17, brown_fox.getOffsetStart());
assertEquals(debug, 26, brown_fox.getOffsetEnd());
assertEquals(debug, 2, brown_fox.getIndividualIndexedTerms().size());
assertEquals(debug, 1, brown_fox.getLargestIndexedSubPhrases().size());
assertEquals(debug, brown_fox, brown_fox.getLargestIndexedSubPhrases().get(0));
assertEquals(debug, 2, brown_fox.getIndexedSuperPhrases().size()); // (2 @ len=3)
}
{ // length 3 (which is the max indexed size) @ start of the string
final Phrase daq = phrases.get(2);
final String debug = daq.toString();
assertEquals(debug, "did a Quick", daq.getSubSequence());
assertEquals(debug, 1, daq.getPositionStart());
assertEquals(debug, 4, daq.getPositionEnd());
assertEquals(debug, 3, daq.getPositionLength());
assertEquals(debug, 1, daq.getOffsetStart());
assertEquals(debug, 13, daq.getOffsetEnd());
assertEquals(debug, 3, daq.getIndividualIndexedTerms().size());
assertEquals(debug, 1, daq.getLargestIndexedSubPhrases().size());
assertEquals(debug, daq, daq.getLargestIndexedSubPhrases().get(0));
assertEquals(debug, 0, daq.getIndexedSuperPhrases().size());
}
{ // length 4 phrase (larger then the max indexed size)
final Phrase qbfp = phrases.get((7 * 2) + 3);
final String debug = qbfp.toString();
assertEquals(debug, "Quick brown FOX perniciously", qbfp.getSubSequence());
assertEquals(debug, 3, qbfp.getPositionStart());
assertEquals(debug, 7, qbfp.getPositionEnd());
assertEquals(debug, 4, qbfp.getPositionLength());
assertEquals(debug, 8, qbfp.getOffsetStart());
assertEquals(debug, 39, qbfp.getOffsetEnd());
assertEquals(debug, 4, qbfp.getIndividualIndexedTerms().size());
assertEquals(debug, 2, qbfp.getLargestIndexedSubPhrases().size());
assertEquals(debug, 0, qbfp.getIndexedSuperPhrases().size());
}
// some blanket assumptions about the results...
assertBasicSanityChecks(phrases, 11, 3, 7);
}
public void testWhiteBoxPhraseParsingShortInput() throws Exception {
// for input this short, either of these fields should be (mostly) equivalent
final Map<String,Integer> fields = new TreeMap<>();
fields.put("multigrams_body", 7);
fields.put("multigrams_body_short", 3);
for (Map.Entry<String,Integer> entry : fields.entrySet()) {
try {
final int maxQ = entry.getValue();
final SchemaField field = h.getCore().getLatestSchema().getField(entry.getKey());
assertNotNull(field);
// empty input shouldn't break anything
assertEquals(0, Phrase.extractPhrases(random().nextBoolean() ? "" : " ", field, 3, maxQ).size());
// input shorter them our index/query phrase sizes shouldn't break anything either....
final List<Phrase> phrases = Phrase.extractPhrases("brown FOX", field, 3, maxQ);
assertEquals(3, phrases.size());
{ // length 2
final Phrase brown_fox = phrases.get(1);
final String debug = brown_fox.toString();
assertEquals(debug, "brown FOX", brown_fox.getSubSequence());
assertEquals(debug, 1, brown_fox.getPositionStart());
assertEquals(debug, 3, brown_fox.getPositionEnd());
assertEquals(debug, 2, brown_fox.getPositionLength());
assertEquals(debug, 0, brown_fox.getOffsetStart());
assertEquals(debug, 9, brown_fox.getOffsetEnd());
assertEquals(debug, 2, brown_fox.getIndividualIndexedTerms().size());
assertEquals(debug, 1, brown_fox.getLargestIndexedSubPhrases().size());
assertEquals(debug, brown_fox, brown_fox.getLargestIndexedSubPhrases().get(0));
assertEquals(debug, 0, brown_fox.getIndexedSuperPhrases().size());
}
{ // length 1
final Phrase fox = phrases.get(2);
final String debug = fox.toString();
assertEquals(debug, "FOX", fox.getSubSequence());
assertEquals(debug, 2, fox.getPositionStart());
assertEquals(debug, 3, fox.getPositionEnd());
assertEquals(debug, 1, fox.getPositionLength());
assertEquals(debug, 6, fox.getOffsetStart());
assertEquals(debug, 9, fox.getOffsetEnd());
assertEquals(debug, 1, fox.getIndividualIndexedTerms().size());
assertEquals(debug, 1, fox.getLargestIndexedSubPhrases().size());
assertEquals(debug, fox, fox.getLargestIndexedSubPhrases().get(0));
assertEquals(debug, 1, fox.getIndexedSuperPhrases().size());
}
assertBasicSanityChecks(phrases, 2, 3, maxQ);
} catch (AssertionError e) {
throw new AssertionError(entry.getKey() + " => " + e.getMessage(), e);
}
}
}
/**
* Asserts some basic rules that should be enforced about all Phrases
* &amp; their linkages to oher phrases
*/
private void assertBasicSanityChecks(final List<Phrase> phrases,
final int inputPositionLength,
final int maxIndexedPositionLength,
final int maxQueryPositionLength) throws Exception {
assert 0 < phrases.size() : "Don't use this method if phrases might be empty";
assertEmptyStream("no phrase should be longer then "+maxQueryPositionLength+" positions",
phrases.stream().filter(p -> p.getPositionLength() > maxQueryPositionLength));
assertEmptyStream("no phrase should have a start offset < 0",
phrases.stream().filter(p -> p.getOffsetStart() < 0));
assertEmptyStream("no phrase should have a start position < 1",
phrases.stream().filter(p -> p.getPositionStart() < 1));
assertEmptyStream("If a phrase has a start offset of 0, then it must have position 1",
phrases.stream().filter(p -> (p.getOffsetStart() == 0)
&& (p.getPositionStart() != 1)));
final Phrase first = phrases.get(0);
final Phrase last = phrases.get(phrases.size()-1);
assertEmptyStream("no phrase should have a start offset < first phrase",
phrases.stream().filter(p -> p.getOffsetStart() < first.getOffsetStart()));
assertEmptyStream("no phrase should have an end offset > last phrase",
phrases.stream().filter(p -> last.getOffsetEnd() < p.getOffsetEnd()));
assertEmptyStream("no phrase should have a start position < first phrase",
phrases.stream().filter(p -> p.getPositionStart() < first.getPositionStart()));
assertEmptyStream("no phrase should have an end position > last phrase",
phrases.stream().filter(p -> last.getPositionEnd() < p.getPositionEnd()));
// NOTE: stuff below this point may not be true for all analyzers (ie: stopwords)
// but should be valid for the analyzers used in this test...
// (if we expand test to cover analyzers w/stopwords, refactor this into a new method)
for (int n = 1; n <= maxQueryPositionLength; n++) {
final int len = n;
final int expected = Math.max(0, 1 + inputPositionLength - n);
final List<Phrase> sizeN = phrases.stream().filter(p -> p.getPositionLength() == len
).collect(Collectors.toList());
assertEquals("Expected # phrases of size " + n + ": " + sizeN, expected, sizeN.size());
}
// check the quantities of sub-terms/phrases...
assertEmptyStream("no phrase should have num indexed terms != pos_len",
phrases.stream().filter
(p -> last.getPositionLength() != last.getIndividualIndexedTerms().size()));
assertEmptyStream("no phrase should have num sub-phrases != max(1, 1 + pos_len - "+maxIndexedPositionLength+")",
phrases.stream().filter
(p -> (Math.max(1, 1 + last.getPositionLength() - maxIndexedPositionLength)
!= last.getLargestIndexedSubPhrases().size())));
// NOTE: indexed super phrases can be of various lengths, and differing quantities near
// begining/end of input so don't worry about an exact count, just check their properties (below)
// check the properties of our sub/super phrases
for (Phrase phrase : phrases) {
final String debug = phrase.toString();
assertEmptyStream(debug + " should not have any indexed terms where pos_len != 1",
phrase.getIndividualIndexedTerms().stream().filter
(term -> 1 != term.getPositionLength()));
assertEmptyStream(debug + " should not have any sub-phrases where pos_len > min(pos_len, "
+ maxIndexedPositionLength+")",
phrase.getLargestIndexedSubPhrases().stream().filter
(inner -> (Math.min(phrase.getPositionLength(), maxIndexedPositionLength)
< inner.getPositionLength())));
assertEmptyStream(debug + " should not have any super-phrases where super.len <= phrase.len or "
+ maxIndexedPositionLength + " < super.len",
phrase.getIndexedSuperPhrases().stream().filter
(outer -> (outer.getPositionLength() <= phrase.getPositionLength() ||
maxIndexedPositionLength < outer.getPositionLength())));
}
}
public void testWhiteboxStats() throws Exception {
final SchemaField analysisField = h.getCore().getLatestSchema().getField("multigrams_body");
assertNotNull(analysisField);
final String input = "BROWN fox lAzY dog xxxyyyzzz";
// a function we'll re-use on phrases generated from the above input
// the multiplier let's us simulate multiple shards returning the same values
BiConsumer<Integer,List<Phrase>> assertions = (mult, phrases) -> {
final Phrase brown_fox = phrases.get(1);
assertEquals("BROWN fox", brown_fox.getSubSequence());
assertEquals(mult * 1, brown_fox.getTTF("multigrams_title"));
assertEquals(mult * 1, brown_fox.getDocFreq("multigrams_title"));
assertEquals(mult * 1, brown_fox.getConjunctionDocCount("multigrams_title"));
assertEquals(mult * 3, brown_fox.getTTF("multigrams_body"));
assertEquals(mult * 2, brown_fox.getDocFreq("multigrams_body"));
assertEquals(mult * 2, brown_fox.getConjunctionDocCount("multigrams_body"));
final Phrase fox_lazy = phrases.get(6);
assertEquals("fox lAzY", fox_lazy.getSubSequence());
assertEquals(mult * 0, fox_lazy.getTTF("multigrams_title"));
assertEquals(mult * 0, fox_lazy.getDocFreq("multigrams_title"));
assertEquals(mult * 1, fox_lazy.getConjunctionDocCount("multigrams_title"));
assertEquals(mult * 0, fox_lazy.getTTF("multigrams_body"));
assertEquals(mult * 0, fox_lazy.getDocFreq("multigrams_body"));
assertEquals(mult * 2, fox_lazy.getConjunctionDocCount("multigrams_body"));
final Phrase bfld = phrases.get(3);
assertEquals("BROWN fox lAzY dog", bfld.getSubSequence());
expectThrows(SolrException.class, () -> { bfld.getTTF("multigrams_title"); });
expectThrows(SolrException.class, () -> { bfld.getDocFreq("multigrams_title"); });
assertEquals(mult * 0, bfld.getConjunctionDocCount("multigrams_title"));
expectThrows(SolrException.class, () -> { bfld.getTTF("multigrams_body"); });
expectThrows(SolrException.class, () -> { bfld.getDocFreq("multigrams_body"); });
assertEquals(mult * 1, bfld.getConjunctionDocCount("multigrams_body"));
final Phrase xyz = phrases.get(phrases.size()-1);
assertEquals("xxxyyyzzz", xyz.getSubSequence());
assertEquals(mult * 0, xyz.getTTF("multigrams_title"));
assertEquals(mult * 0, xyz.getDocFreq("multigrams_title"));
assertEquals(mult * 0, xyz.getConjunctionDocCount("multigrams_title"));
assertEquals(mult * 0, xyz.getTTF("multigrams_body"));
assertEquals(mult * 0, xyz.getDocFreq("multigrams_body"));
assertEquals(mult * 0, xyz.getConjunctionDocCount("multigrams_body"));
return;
};
final List<Phrase> phrasesLocal = Phrase.extractPhrases(input, analysisField, 3, 7);
// freshly parsed phrases, w/o any stats populated, all the stats should be 0
assertions.accept(0, phrasesLocal);
// If we populate with our index stats, we should get the basic values in our BiConsumer
try (SolrQueryRequest req = req()) {
Phrase.populateStats(phrasesLocal, Arrays.asList("multigrams_body","multigrams_title"),
req.getSearcher());
}
assertions.accept(1, phrasesLocal);
// likewise, if we create a new freshly parsed set of phrases, and "merge" in the previous index stats
// (ie: merge results from one shard) we should get the same results
final List<Phrase> phrasesMerged = Phrase.extractPhrases(input, analysisField, 3, 7);
Phrase.populateStats(phrasesMerged, Phrase.formatShardResponse(phrasesLocal));
assertions.accept(1, phrasesMerged);
// if we merge in a second copy of the same results (ie: two identical shards)
// our results should be double what we had before
Phrase.populateStats(phrasesMerged, Phrase.formatShardResponse(phrasesLocal));
assertions.accept(2, phrasesMerged);
}
@SuppressWarnings({"unchecked"})
public void testWhiteboxScores() throws Exception {
final SchemaField analysisField = h.getCore().getLatestSchema().getField("multigrams_body");
assertNotNull(analysisField);
final Map<String,Double> fieldWeights = new TreeMap<>();
fieldWeights.put("multigrams_title", 1.0D);
fieldWeights.put("multigrams_body", 0.0D); // NOTE: 0 weighting should only affect total score
final String input = "xxxyyyzzz BROWN fox why are we lAzY";
final List<Phrase> phrases = Phrase.extractPhrases(input, analysisField, 3, 7);
try (SolrQueryRequest req = req()) {
Phrase.populateStats(phrases, fieldWeights.keySet(), req.getSearcher());
}
Phrase.populateScores(phrases, fieldWeights, 3, 7);
// do some basic sanity checks of the field & total scores...
for (Phrase xyz : phrases.subList(0, 7)) {
// first 7 all start with xyz which isn't in index (in either field) so all scores should be -1
assertEquals(xyz.toString(), -1.0D, xyz.getTotalScore(), 0.0D);
assertEquals(xyz.toString(), -1.0D, xyz.getFieldScore("multigrams_title"), 0.0D);
assertEquals(xyz.toString(), -1.0D, xyz.getFieldScore("multigrams_body"), 0.0D);
}
// any individual terms (past xyz) should score 0.0 because they are all actually in the index
// (in both fields)
for (Phrase term : phrases.subList(7, phrases.size()).stream().filter
((p -> 1 == p.getPositionLength())).collect(Collectors.toList())) {
assertEquals(term.toString(), 0.0D, term.getFieldScore("multigrams_title"), 0.0D);
assertEquals(term.toString(), 0.0D, term.getFieldScore("multigrams_body"), 0.0D);
assertEquals(term.toString(), 0.0D, term.getTotalScore(), 0.0D);
}
// "brown fox" should score positively in both fields, and overall...
final Phrase brown_fox = phrases.get(8);
assertEquals("BROWN fox", brown_fox.getSubSequence());
assertThat(brown_fox.toString(), brown_fox.getFieldScore("multigrams_title"), greaterThan(0.0D));
assertThat(brown_fox.toString(), brown_fox.getFieldScore("multigrams_body"), greaterThan(0.0D) );
assertThat(brown_fox.toString(), brown_fox.getTotalScore(), greaterThan(0.0D));
// "we lazy" does appear in a title value, but should score poorly given how often the terms
// are used in other contexts, and should score -1 against body -- but because of our weights,
// that shouldn't bring down the total
final Phrase we_lazy = phrases.get(phrases.size()-2);
assertEquals("we lAzY", we_lazy.getSubSequence());
assertEquals(we_lazy.toString(), -1.0D, we_lazy.getFieldScore("multigrams_body"), 0.0D);
assertThat(we_lazy.toString(), we_lazy.getFieldScore("multigrams_title"), lessThan(0.0D));
assertThat(we_lazy.toString(), we_lazy.getTotalScore(), lessThan(0.0D));
assertEquals(we_lazy.toString(), we_lazy.getFieldScore("multigrams_title"), we_lazy.getTotalScore(),
0.0D);
// "why are we lazy" is longer then the max indexed phrase size & appears verbatim in a title value
// it should score -1 against body -- but because of our weights, that shouldn't bring down the total
final Phrase wawl = phrases.get(phrases.size()-7);
assertEquals("why are we lAzY", wawl.getSubSequence());
assertEquals(wawl.toString(), -1.0D, wawl.getFieldScore("multigrams_body"), 0.0D);
assertThat(wawl.toString(), wawl.getFieldScore("multigrams_title"), greaterThan(0.0D));
assertThat(wawl.toString(), wawl.getTotalScore(), greaterThan(0.0D));
assertEquals(wawl.toString(), wawl.getFieldScore("multigrams_title"), wawl.getTotalScore(),
0.0D);
// "brown fox why are we" is longer then the max indexed phrase, and none of it's
// (longest) sub phrases exists in either field -- so all of it's scores should be -1
final Phrase bfwaw = phrases.get(11);
assertEquals("BROWN fox why are we", bfwaw.getSubSequence());
assertEquals(bfwaw.toString(), -1.0D, bfwaw.getFieldScore("multigrams_title"), 0.0D);
assertEquals(bfwaw.toString(), -1.0D, bfwaw.getFieldScore("multigrams_body"), 0.0D);
assertEquals(bfwaw.toString(), -1.0D, bfwaw.getTotalScore(), 0.0D);
}
@SuppressWarnings({"unchecked"})
public void testWhiteboxScorcesStopwords() throws Exception {
final String input = "why the lazy dog brown fox";
final Map<String,Double> fieldWeights = new TreeMap<>();
fieldWeights.put("multigrams_title", 1.0D);
fieldWeights.put("multigrams_title_stop", 1.0D);
{ // If our analysisField uses all terms,
// be we also generate scores from a field that filters stopwords...
final SchemaField analysisField = h.getCore().getLatestSchema().getField("multigrams_title");
assertNotNull(analysisField);
final List<Phrase> phrases = Phrase.extractPhrases(input, analysisField, 3, 7);
try (SolrQueryRequest req = req()) {
Phrase.populateStats(phrases, fieldWeights.keySet(), req.getSearcher());
}
Phrase.populateScores(phrases, fieldWeights, 3, 7);
// phrases that span the stop word should have valid scores from the field that doesn't care
// about stop words, but the stopword field should reject them
final Phrase why_the_lazy = phrases.get(2);
assertEquals("why the lazy", why_the_lazy.getSubSequence());
assertThat(why_the_lazy.toString(), why_the_lazy.getFieldScore("multigrams_title"), greaterThan(0.0D) );
assertEquals(why_the_lazy.toString(), -1.0D, why_the_lazy.getFieldScore("multigrams_title_stop"), 0.0D);
final Phrase the_lazy_dog = phrases.get(8);
assertEquals("the lazy dog", the_lazy_dog.getSubSequence());
assertThat(the_lazy_dog.toString(), the_lazy_dog.getFieldScore("multigrams_title"), greaterThan(0.0D) );
assertEquals(the_lazy_dog.toString(), -1.0D, the_lazy_dog.getFieldScore("multigrams_title_stop"), 0.0D);
// sanity check that good scores are still possible with stopwords
// "brown fox" should score positively in both fields, and overall...
final Phrase brown_fox = phrases.get(phrases.size()-2);
assertEquals("brown fox", brown_fox.getSubSequence());
assertThat(brown_fox.toString(), brown_fox.getFieldScore("multigrams_title"), greaterThan(0.0D));
assertThat(brown_fox.toString(), brown_fox.getFieldScore("multigrams_title_stop"), greaterThan(0.0D) );
assertThat(brown_fox.toString(), brown_fox.getTotalScore(), greaterThan(0.0D));
}
{ // now flip things: our analysisField filters stopwords,
// but we also generates scores from a field that doesn't know about them...
//
// (NOTE: the parser will still generate _some_ candidate phrases spaning the stop word position,
// but not ones that start with the stopword)
final SchemaField analysisField = h.getCore().getLatestSchema().getField("multigrams_title_stop");
assertNotNull(analysisField);
final List<Phrase> phrases = Phrase.extractPhrases(input, analysisField, 3, 7);
try (SolrQueryRequest req = req()) {
Phrase.populateStats(phrases, fieldWeights.keySet(), req.getSearcher());
}
Phrase.populateScores(phrases, fieldWeights, 3, 7);
assertTrue(phrases.toString(), 0 < phrases.size());
for (Phrase p : phrases) {
if (p.getPositionStart() <= 2 && 2 < p.getPositionEnd()) {
// phrases that span the stop word should have valid scores from the field that doesn't care
// about stop words, but the stopword field should reject them
assertEquals(p.toString(), -1.0D, p.getFieldScore("multigrams_title"), 0.0D);
assertEquals(p.toString(), -1.0D, p.getFieldScore("multigrams_title_stop"), 0.0D);
}
}
// sanity check that good scores are still possible with stopwords
// "brown fox" should score positively in both fields, and overall...
final Phrase brown_fox = phrases.get(phrases.size()-2);
assertEquals("brown fox", brown_fox.getSubSequence());
assertThat(brown_fox.toString(), brown_fox.getFieldScore("multigrams_title"), greaterThan(0.0D));
assertThat(brown_fox.toString(), brown_fox.getFieldScore("multigrams_title_stop"), greaterThan(0.0D) );
assertThat(brown_fox.toString(), brown_fox.getTotalScore(), greaterThan(0.0D));
}
}
public void testExpectedUserErrors() throws Exception {
assertQEx("empty field list should error",
"must specify a (weighted) list of fields",
req("q","foo", "phrases","true",
"phrases.fields", " "),
ErrorCode.BAD_REQUEST);
assertQEx("bogus field name should error",
"does not exist",
req("q","foo", "phrases","true",
"phrases.fields", "bogus1 bogus2"),
ErrorCode.BAD_REQUEST);
assertQEx("lack of shingles should cause error",
"Unable to determine max position length",
req("q","foo", "phrases","true",
"phrases.fields", "title"),
ErrorCode.BAD_REQUEST);
assertQEx("analyzer missmatch should cause error",
"must have the same fieldType",
req("q","foo", "phrases","true",
"phrases.fields", "multigrams_title multigrams_title_short"),
ErrorCode.BAD_REQUEST);
assertQEx("analysis field must exist",
"does not exist",
req("q","foo", "phrases","true",
"phrases.analysis.field", "bogus",
"phrases.fields", "multigrams_title multigrams_title_short"),
ErrorCode.BAD_REQUEST);
assertQEx("no query param should error",
"requires a query string",
req("qt", "/phrases",
"phrases.fields", "multigrams_title"),
ErrorCode.BAD_REQUEST);
}
public void testMaxShingleSizeHelper() throws Exception {
IndexSchema schema = h.getCore().getLatestSchema();
assertEquals(3, PhrasesIdentificationComponent.getMaxShingleSize
(schema.getFieldTypeByName("multigrams_3_7").getIndexAnalyzer()));
assertEquals(7, PhrasesIdentificationComponent.getMaxShingleSize
(schema.getFieldTypeByName("multigrams_3_7").getQueryAnalyzer()));
assertEquals(3, PhrasesIdentificationComponent.getMaxShingleSize
(schema.getFieldTypeByName("multigrams_3").getIndexAnalyzer()));
assertEquals(3, PhrasesIdentificationComponent.getMaxShingleSize
(schema.getFieldTypeByName("multigrams_3").getQueryAnalyzer()));
assertEquals(-1, PhrasesIdentificationComponent.getMaxShingleSize
(schema.getFieldTypeByName("text").getIndexAnalyzer()));
assertEquals(-1, PhrasesIdentificationComponent.getMaxShingleSize
(schema.getFieldTypeByName("text").getQueryAnalyzer()));
}
public void testSimplePhraseRequest() throws Exception {
final String input = " did a Quick brown FOX perniciously jump over the lazy dog";
final String expected = " did a Quick {brown FOX} perniciously jump over {the lazy dog}";
// should get same behavior regardless of wether we use "q" or "phrases.q"
for (String p : Arrays.asList("q", "phrases.q")) {
// basic request...
assertQ(req("qt", HANDLER, p, input)
// expect no search results...
, "count(//result)=0"
// just phrase info...
, "//lst[@name='phrases']/str[@name='input'][.='"+input+"']"
, "//lst[@name='phrases']/str[@name='summary'][.='"+expected+"']"
, "count(//lst[@name='phrases']/arr[@name='details']/lst) = 2"
//
, "//lst[@name='phrases']/arr[@name='details']/lst[1]/str[@name='text'][.='the lazy dog']"
, "//lst[@name='phrases']/arr[@name='details']/lst[1]/int[@name='offset_start'][.='50']"
, "//lst[@name='phrases']/arr[@name='details']/lst[1]/int[@name='offset_end'][.='62']"
, "//lst[@name='phrases']/arr[@name='details']/lst[1]/double[@name='score'][number(.) > 0]"
//
, "//lst[@name='phrases']/arr[@name='details']/lst[2]/str[@name='text'][.='brown FOX']"
, "//lst[@name='phrases']/arr[@name='details']/lst[2]/int[@name='offset_start'][.='17']"
, "//lst[@name='phrases']/arr[@name='details']/lst[2]/int[@name='offset_end'][.='26']"
, "//lst[@name='phrases']/arr[@name='details']/lst[2]/double[@name='score'][number(.) > 0]"
);
// empty input, empty phrases (and no error)...
assertQ(req("qt", HANDLER, p, "")
// expect no search results...
, "count(//result)=0"
// just empty phrase info for our empty input...
, "//lst[@name='phrases']/str[@name='input'][.='']"
, "//lst[@name='phrases']/str[@name='summary'][.='']"
, "count(//lst[@name='phrases']/arr[@name='details']) = 1"
, "count(//lst[@name='phrases']/arr[@name='details']/lst) = 0"
);
}
}
public void testSimpleSearchRequests() throws Exception {
final String input = "\"brown fox\"";
assertQ(req("q", input)
// basic search should have worked...
, "//result[@numFound='2']"
, "//result/doc/str[@name='id'][.='42']"
, "//result/doc/str[@name='id'][.='43']"
// and phrases should not be returned since they weren't requested...
, "0=count(//lst[@name='phrases'])"
);
assertQ(req("phrases", "false", "q", input)
// basic search should have worked...
, "//result[@numFound='2']"
, "//result/doc/str[@name='id'][.='42']"
, "//result/doc/str[@name='id'][.='43']"
// and phrases should not be returned since they were explicitly disabled...
, "0=count(//lst[@name='phrases'])"
);
// with input this short, all of these permutations of requests should produce the same output...
for (SolrQueryRequest req : Arrays.asList
( // simple, using 3/7 defaults
req("phrases","true", "q", input),
// simple, using just the 3/3 'short' fields
req("phrases","true", "q", input,
"phrases.fields", "multigrams_body_short multigrams_title_short^2"),
// diff analysers, but explicit override using 3/3 "short" field...
req("phrases","true", "q", input,
"phrases.fields", "multigrams_body multigrams_title_short^2",
"phrases.analysis.field", "multigrams_title_short"))) {
assertQ(req
// basic search should have worked...
, "//result[@numFound='2']"
, "//result/doc/str[@name='id'][.='42']"
, "//result/doc/str[@name='id'][.='43']"
// and we should have gotten phrase info...
, "//lst[@name='phrases']/str[@name='input'][.='"+input+"']"
, "//lst[@name='phrases']/str[@name='summary'][.='\"{brown fox}\"']"
, "count(//lst[@name='phrases']/arr[@name='details']/lst)=1"
, "//lst[@name='phrases']/arr[@name='details']/lst/str[@name='text'][.='brown fox']"
, "//lst[@name='phrases']/arr[@name='details']/lst/int[@name='offset_start'][.='1']"
, "//lst[@name='phrases']/arr[@name='details']/lst/int[@name='offset_end'][.='10']"
, "//lst[@name='phrases']/arr[@name='details']/lst/double[@name='score'][number(.) > 0]"
);
}
// override the query string to get different phrases
assertQ(req("phrases","true", "q", "*:*", "phrases.q", input)
// basic search should have found all docs...
, "//result[@numFound='4']"
// and we should have gotten phrase info for our alternative q string...
, "//lst[@name='phrases']/str[@name='input'][.='"+input+"']"
, "//lst[@name='phrases']/str[@name='summary'][.='\"{brown fox}\"']"
, "count(//lst[@name='phrases']/arr[@name='details']/lst)=1"
, "//lst[@name='phrases']/arr[@name='details']/lst/str[@name='text'][.='brown fox']"
, "//lst[@name='phrases']/arr[@name='details']/lst/int[@name='offset_start'][.='1']"
, "//lst[@name='phrases']/arr[@name='details']/lst/int[@name='offset_end'][.='10']"
, "//lst[@name='phrases']/arr[@name='details']/lst/double[@name='score'][number(.) > 0]"
);
// empty input, empty phrases (but no error)
assertQ(req("phrases","true", "q", "*:*", "phrases.q", "")
// basic search should have found all docs...
, "//result[@numFound='4']"
// and we should have gotten (empty) phrase info for our alternative q string...
, "//lst[@name='phrases']/str[@name='input'][.='']"
, "//lst[@name='phrases']/str[@name='summary'][.='']"
, "count(//lst[@name='phrases']/arr[@name='details']) = 1"
, "count(//lst[@name='phrases']/arr[@name='details']/lst) = 0"
);
}
public void testGreyboxShardSearchRequests() throws Exception {
final String input = "quick brown fox ran";
final String phrase_xpath = "//lst[@name='phrases']";
final String all_phrase_xpath = phrase_xpath + "/arr[@name='_all']";
// phrases requested, and correct request stage / shard purpose ...
assertQ(req("q", input,
"phrases","true",
ShardParams.IS_SHARD, "true",
ShardParams.SHARDS_PURPOSE, ""+PhrasesIdentificationComponent.SHARD_PURPOSE)
// this shard request should have caused stats to be returned about all phrases...
, "10=count("+ all_phrase_xpath +"/lst)"
// "quick" ...
, all_phrase_xpath + "/lst[1]/lst[@name='ttf']/long[@name='multigrams_body'][.='1']"
, all_phrase_xpath + "/lst[1]/lst[@name='ttf']/long[@name='multigrams_title'][.='0']"
// ...
// "brown fox"
, all_phrase_xpath + "/lst[6]/lst[@name='ttf']/long[@name='multigrams_body'][.='3']"
, all_phrase_xpath + "/lst[6]/lst[@name='ttf']/long[@name='multigrams_title'][.='1']"
, all_phrase_xpath + "/lst[6]/lst[@name='df']/long[@name='multigrams_body'][.='2']"
, all_phrase_xpath + "/lst[6]/lst[@name='df']/long[@name='multigrams_title'][.='1']"
, all_phrase_xpath + "/lst[6]/lst[@name='conj_dc']/long[@name='multigrams_body'][.='2']"
, all_phrase_xpath + "/lst[6]/lst[@name='conj_dc']/long[@name='multigrams_title'][.='1']"
// but no computed "scores"...
, "0=count("+phrase_xpath+"//*[@name='score'])"
);
// phrases requested, but incorrect request stage / shard purpose ...
assertQ(req("q", input,
"phrases","true",
ShardParams.IS_SHARD, "true",
ShardParams.SHARDS_PURPOSE, ""+ShardRequest.PURPOSE_GET_FIELDS)
, "0=count("+ phrase_xpath +"/lst)");
// phrases disabled, regardless of request stage / shard purpose ...
assertTrue("sanity check failed, stage was modified in code w/o updating test",
PhrasesIdentificationComponent.SHARD_PURPOSE != ShardRequest.PURPOSE_GET_FIELDS);
assertQ(req("q", input,
"phrases","false",
ShardParams.IS_SHARD, "true",
ShardParams.SHARDS_PURPOSE, ""+ShardRequest.PURPOSE_GET_FIELDS)
, "0=count("+ phrase_xpath +"/lst)");
assertQ(req("q", input,
"phrases","false",
ShardParams.IS_SHARD, "true",
ShardParams.SHARDS_PURPOSE, ""+PhrasesIdentificationComponent.SHARD_PURPOSE)
, "0=count("+ phrase_xpath +"/lst)");
}
// ////////////////////////////////////////////////////////////////
/**
* Trivial Helper method that collects &amp; compares to an empty List so
* the assertion shows the unexpected stream elements
*/
public <T> void assertEmptyStream(final String msg, final Stream<? extends T> stream) {
assertEquals(msg,
Collections.emptyList(),
stream.collect(Collectors.toList()));
}
/** helper, docs for future junit/hamcrest seems to have something similar */
@SuppressWarnings({"rawtypes"})
public static Matcher lessThan(double expected) {
return new BaseMatcher() {
@Override public boolean matches(Object actual) {
return ((Double)actual).compareTo(expected) < 0;
}
@Override public void describeTo(Description d) {
d.appendText("should be less than " + expected);
}
};
}
/** helper, docs for future junit/hamcrest seems to have something similar */
@SuppressWarnings({"rawtypes"})
public static Matcher greaterThan(double expected) {
return new BaseMatcher() {
@Override public boolean matches(Object actual) {
return 0 < ((Double)actual).compareTo(expected);
}
@Override public void describeTo(Description d) {
d.appendText("should be greater than " + expected);
}
};
}
}