blob: 14f9e5da05396f809470c7a1ba9ecd59c98dfc94 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.facet;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.SortedDocValuesField;
import org.apache.lucene.document.SortedSetDocValuesField;
import org.apache.lucene.document.StringField;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.RandomIndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.MatchAllDocsQuery;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.LuceneTestCase;
public class TestStringValueFacetCounts extends FacetTestCase {
/**
 * Indexes five documents that each carry exactly one {@link SortedSetDocValuesField} value in
 * "field" and checks the resulting facet counts.
 */
public void testBasicSingleValued() throws Exception {
  Directory directory = newDirectory();
  RandomIndexWriter indexWriter = new RandomIndexWriter(random(), directory);

  // One document per entry; the order matches the order in which documents are indexed.
  String[] valuePerDoc = {"foo", "foo", "bar", "bar", "baz"};
  for (String value : valuePerDoc) {
    Document document = new Document();
    document.add(new SortedSetDocValuesField("field", new BytesRef(value)));
    indexWriter.addDocument(document);
  }

  Map<String, Integer> countsByValue = new HashMap<>();
  countsByValue.put("foo", 2);
  countsByValue.put("bar", 2);
  countsByValue.put("baz", 1);

  // Every document has a value, so all of them contribute to the total.
  int docsWithValue = valuePerDoc.length;

  IndexSearcher indexSearcher = newSearcher(indexWriter.getReader());
  indexWriter.close();

  checkFacetResult(countsByValue, docsWithValue, indexSearcher, 10, 2, 1, 0);

  IOUtils.close(indexSearcher.getIndexReader(), directory);
}
/**
 * Counts all documents (no {@link FacetsCollector}) after one of two documents sharing the value
 * "foo" has been deleted, and expects only the surviving document to be counted.
 *
 * <p>See: LUCENE-10070
 */
public void testCountAll() throws Exception {
  Directory dir = newDirectory();
  RandomIndexWriter writer = new RandomIndexWriter(random(), dir);

  Document doc = new Document();
  doc.add(new StringField("id", "0", Field.Store.NO));
  doc.add(new SortedSetDocValuesField("field", new BytesRef("foo")));
  writer.addDocument(doc);

  doc = new Document();
  doc.add(new StringField("id", "1", Field.Store.NO));
  doc.add(new SortedSetDocValuesField("field", new BytesRef("foo")));
  writer.addDocument(doc);

  // Delete the first document so that only one live document holds "foo".
  writer.deleteDocuments(new Term("id", "0"));

  IndexSearcher searcher = newSearcher(writer.getReader());
  writer.close();

  StringDocValuesReaderState state =
      new StringDocValuesReaderState(searcher.getIndexReader(), "field");

  StringValueFacetCounts facets = new StringValueFacetCounts(state);

  // The expected text has to match FacetResult#toString exactly (after trim()).
  // NOTE(review): this assumes each child line is indented by two spaces, as in Lucene's
  // FacetResult#toString; the previous single-space literal could not match that format.
  assertEquals(
      "dim=field path=[] value=1 childCount=1\n  foo (1)",
      facets.getTopChildren(10, "field").toString().trim());

  IOUtils.close(searcher.getIndexReader(), dir);
}
/**
 * Same scenario as the single-valued sorted-set test, but the values are indexed with {@link
 * SortedDocValuesField} instead.
 */
public void testBasicSingleValuedUsingSortedDoc() throws Exception {
  Directory directory = newDirectory();
  RandomIndexWriter indexWriter = new RandomIndexWriter(random(), directory);

  // One document per entry; the order matches the order in which documents are indexed.
  String[] valuePerDoc = {"foo", "foo", "bar", "bar", "baz"};
  for (String value : valuePerDoc) {
    Document document = new Document();
    document.add(new SortedDocValuesField("field", new BytesRef(value)));
    indexWriter.addDocument(document);
  }

  Map<String, Integer> countsByValue = new HashMap<>();
  countsByValue.put("foo", 2);
  countsByValue.put("bar", 2);
  countsByValue.put("baz", 1);

  // Every document has a value, so all of them contribute to the total.
  int docsWithValue = valuePerDoc.length;

  IndexSearcher indexSearcher = newSearcher(indexWriter.getReader());
  indexWriter.close();

  checkFacetResult(countsByValue, docsWithValue, indexSearcher, 10, 2, 1, 0);

  IOUtils.close(indexSearcher.getIndexReader(), directory);
}
/**
 * Indexes three documents, two of which hold two values each in "field", and checks that each
 * value is counted once per document that contains it.
 */
public void testBasicMultiValued() throws Exception {
  Directory directory = newDirectory();
  RandomIndexWriter indexWriter = new RandomIndexWriter(random(), directory);

  // Outer array: one entry per document. Inner array: that document's values for "field".
  String[][] valuesPerDoc = {{"foo", "bar"}, {"foo", "baz"}, {"baz"}};
  for (String[] docValues : valuesPerDoc) {
    Document document = new Document();
    for (String value : docValues) {
      document.add(new SortedSetDocValuesField("field", new BytesRef(value)));
    }
    indexWriter.addDocument(document);
  }

  Map<String, Integer> countsByValue = new HashMap<>();
  countsByValue.put("foo", 2);
  countsByValue.put("bar", 1);
  countsByValue.put("baz", 2);

  // All three documents have at least one value.
  int docsWithValue = valuesPerDoc.length;

  IndexSearcher indexSearcher = newSearcher(indexWriter.getReader());
  indexWriter.close();

  checkFacetResult(countsByValue, docsWithValue, indexSearcher, 10, 2, 1, 0);

  IOUtils.close(indexSearcher.getIndexReader(), directory);
}
/**
 * Indexes two documents separated by a commit, each holding 100 distinct values in "field".
 *
 * <p>The few-hits / many-values shape is intended to push StringValueFacetCounts into its
 * "sparse" counting mode (small number of hits relative to the field cardinality).
 */
public void testSparseMultiSegmentCase() throws Exception {
  Directory directory = newDirectory();
  RandomIndexWriter indexWriter = new RandomIndexWriter(random(), directory);
  Map<String, Integer> countsByValue = new HashMap<>();

  // First document: foo_0 .. foo_99, each expected exactly once.
  Document fooDoc = new Document();
  for (int n = 0; n < 100; n++) {
    String label = "foo_" + n;
    fooDoc.add(new SortedSetDocValuesField("field", new BytesRef(label)));
    countsByValue.put(label, 1);
  }
  indexWriter.addDocument(fooDoc);
  // Commit between the two documents so they are written separately.
  indexWriter.commit();

  // Second document: bar_0 .. bar_99, each expected exactly once.
  Document barDoc = new Document();
  for (int n = 0; n < 100; n++) {
    String label = "bar_" + n;
    barDoc.add(new SortedSetDocValuesField("field", new BytesRef(label)));
    countsByValue.put(label, 1);
  }
  indexWriter.addDocument(barDoc);

  int docsWithValue = 2;

  IndexSearcher indexSearcher = newSearcher(indexWriter.getReader());
  indexWriter.close();

  checkFacetResult(countsByValue, docsWithValue, indexSearcher, 10, 2, 1, 0);

  IOUtils.close(indexSearcher.getIndexReader(), directory);
}
/**
 * Indexes three documents with a commit after each one, where the middle document has no value
 * for "field" at all, and checks that only the two documents with values count toward the total.
 */
public void testMissingSegment() throws Exception {
  Directory directory = newDirectory();
  RandomIndexWriter indexWriter = new RandomIndexWriter(random(), directory);

  Document twoValueDoc = new Document();
  twoValueDoc.add(new SortedSetDocValuesField("field", new BytesRef("foo")));
  twoValueDoc.add(new SortedSetDocValuesField("field", new BytesRef("bar")));
  indexWriter.addDocument(twoValueDoc);
  indexWriter.commit();

  // segment with no values
  Document emptyDoc = new Document();
  indexWriter.addDocument(emptyDoc);
  indexWriter.commit();

  Document oneValueDoc = new Document();
  oneValueDoc.add(new SortedSetDocValuesField("field", new BytesRef("baz")));
  indexWriter.addDocument(oneValueDoc);
  indexWriter.commit();

  Map<String, Integer> countsByValue = new HashMap<>();
  countsByValue.put("foo", 1);
  countsByValue.put("bar", 1);
  countsByValue.put("baz", 1);

  // Three documents were indexed, but the empty one is not expected in the total.
  int docsWithValue = 2;

  IndexSearcher indexSearcher = newSearcher(indexWriter.getReader());
  indexWriter.close();

  checkFacetResult(countsByValue, docsWithValue, indexSearcher, 10, 2, 1, 0);

  IOUtils.close(indexSearcher.getIndexReader(), directory);
}
/**
 * Builds a {@link StringDocValuesReaderState} from one reader, then collects hits with a searcher
 * over a newer reader, and expects counting with the mismatched state to be rejected with an
 * {@link IllegalStateException}.
 */
public void testStaleState() throws Exception {
  Directory directory = newDirectory();
  RandomIndexWriter indexWriter = new RandomIndexWriter(random(), directory);

  Document firstDoc = new Document();
  firstDoc.add(new SortedSetDocValuesField("field", new BytesRef("foo")));
  indexWriter.addDocument(firstDoc);

  // State is captured against the reader that only sees the first document.
  IndexReader originalReader = indexWriter.getReader();
  StringDocValuesReaderState staleState =
      new StringDocValuesReaderState(originalReader, "field");

  Document secondDoc = new Document();
  secondDoc.add(new SortedSetDocValuesField("field", new BytesRef("bar")));
  indexWriter.addDocument(secondDoc);

  // The searcher is opened on a fresh reader obtained after the second document was added.
  IndexSearcher indexSearcher = newSearcher(indexWriter.getReader());
  indexWriter.close();

  FacetsCollector collector = new FacetsCollector();
  indexSearcher.search(new MatchAllDocsQuery(), collector);

  // using a stale state
  expectThrows(
      IllegalStateException.class, () -> new StringValueFacetCounts(staleState, collector));

  IOUtils.close(originalReader, indexSearcher.getIndexReader(), directory);
}
/**
 * Randomized test: builds an index of documents holding random subsets of 50 tokens in "field",
 * tracks the expected per-value counts and the number of documents with at least one value, and
 * verifies the facet results for many random topN values.
 *
 * <p>Runs more index builds and more topN iterations when {@link LuceneTestCase#TEST_NIGHTLY} is
 * set.
 */
public void testRandom() throws Exception {

  int fullIterations = LuceneTestCase.TEST_NIGHTLY ? 20 : 3;
  for (int iter = 0; iter < fullIterations; iter++) {
    Directory dir = newDirectory();
    RandomIndexWriter writer = new RandomIndexWriter(random(), dir);

    // Build up test data
    String[] tokens = getRandomTokens(50); // 50 random values to pick from
    int numDocs = atLeast(1000);
    int expectedTotalDocCount = 0;
    Map<String, Integer> expected = new HashMap<>();
    for (int i = 0; i < numDocs; i++) {
      Document doc = new Document();

      // Sometimes we restrict docs to be single-valued, but most of the time they can have up
      // to 5:
      int maxValuesPerDoc = random().nextInt(10) < 8 ? 5 : 1;

      // Random#nextInt(bound) excludes the bound, so add one to make maxValuesPerDoc reachable.
      // Without it the "single-valued" case would always produce zero values. A draw of zero
      // still yields a document with no value for the field.
      int valCount = random().nextInt(maxValuesPerDoc + 1);

      Set<String> docVals = new HashSet<>();
      for (int j = 0; j < valCount; j++) {
        int tokenIdx = random().nextInt(tokens.length);
        String val = tokens[tokenIdx];
        // values should only be counted once per document; Set#add reports whether the value
        // is new for this document
        if (docVals.add(val)) {
          expected.put(val, expected.getOrDefault(val, 0) + 1);
        }
        // The same token may be drawn, and therefore added to the document, more than once.
        doc.add(new SortedSetDocValuesField("field", new BytesRef(val)));
      }

      // only docs with at least one value in the field should be counted in the total
      if (docVals.isEmpty() == false) {
        expectedTotalDocCount++;
      }

      writer.addDocument(doc);
      if (random().nextInt(10) == 0) {
        writer.commit(); // sometimes commit
      }
    }

    IndexSearcher searcher = newSearcher(writer.getReader());
    writer.close();

    // run iterations with random values of topN
    int iterations = LuceneTestCase.TEST_NIGHTLY ? 10_000 : 50;
    int[] topNs = new int[iterations];
    for (int i = 0; i < iterations; i++) {
      topNs[i] = atLeast(1);
    }

    checkFacetResult(expected, expectedTotalDocCount, searcher, topNs);

    IOUtils.close(searcher.getIndexReader(), dir);
  }
}
/**
 * Counts facets for "field" over all documents visible to {@code searcher} and compares the
 * results with the expected counts, once for every requested {@code topN}.
 *
 * <p>For each {@code topN} the facet counts are built either from a {@link FacetsCollector} that
 * collected a {@link MatchAllDocsQuery}, or without a collector (chosen at random); both are
 * expected to produce the same results. A {@code topN} of 0 is expected to be rejected with an
 * {@link IllegalArgumentException}; checking then continues with the remaining values.
 *
 * @param expectedCounts expected count for each value of "field"
 * @param expectedTotalDocsWithValue expected number of documents having at least one value
 * @param searcher searcher over the index under test
 * @param topNs the topN values to request, each checked independently
 * @throws IOException if reading the index fails
 */
private void checkFacetResult(
    Map<String, Integer> expectedCounts,
    int expectedTotalDocsWithValue,
    IndexSearcher searcher,
    int... topNs)
    throws IOException {

  StringDocValuesReaderState state =
      new StringDocValuesReaderState(searcher.getIndexReader(), "field");

  FacetsCollector c = new FacetsCollector();
  searcher.search(new MatchAllDocsQuery(), c);

  // The expectations do not depend on topN, so prepare them once instead of once per iteration.

  // sort expected counts by count, value
  List<Map.Entry<String, Integer>> expectedCountsSorted =
      new ArrayList<>(expectedCounts.entrySet());
  expectedCountsSorted.sort(
      (a, b) -> {
        int cmp = b.getValue().compareTo(a.getValue()); // high-to-low
        if (cmp == 0) {
          cmp = a.getKey().compareTo(b.getKey()); // low-to-high
        }
        return cmp;
      });

  // number of labels we expect is the number with a non-zero count
  int expectedLabelCount =
      (int) expectedCountsSorted.stream().filter(e -> e.getValue() > 0).count();

  // requesting this many children returns every label
  int maxTopN = expectedCountsSorted.size();

  for (int topN : topNs) {

    StringValueFacetCounts facets;
    // should get the same result whether-or-not we provide a FacetsCollector since it's doing
    // a MatchAllDocsQuery:
    if (random().nextBoolean()) {
      facets = new StringValueFacetCounts(state, c);
    } else {
      facets = new StringValueFacetCounts(state);
    }

    // topN == 0 is intentionally unsupported
    if (topN == 0) {
      assertThrows(IllegalArgumentException.class, () -> facets.getTopChildren(topN, "field"));
      // Move on to the next topN rather than returning, so that values listed after a 0 are
      // still verified.
      continue;
    }

    FacetResult facetResult = facets.getTopChildren(topN, "field");

    assertEquals("field", facetResult.dim);
    assertEquals(0, facetResult.path.length);
    assertEquals(expectedTotalDocsWithValue, facetResult.value);
    assertEquals(expectedLabelCount, facetResult.childCount);

    // getAllDims should return a singleton list with the same results as getTopChildren
    List<FacetResult> allDims = facets.getAllDims(topN);
    assertEquals(1, allDims.size());
    assertEquals(facetResult, allDims.get(0));

    // This is a little strange, but we request all labels at this point so that when we
    // secondarily sort by label value in order to compare to the expected results, we have
    // all the values. See LUCENE-9991:
    facetResult = facets.getTopChildren(maxTopN, "field");

    // also sort the returned labels by count, value (these will be sorted by count, ord -- but
    // since we have no insight into the ordinals assigned to the values, we resort)
    Arrays.sort(
        facetResult.labelValues,
        (a, b) -> {
          int cmp = Long.compare(b.value.longValue(), a.value.longValue()); // high-to-low
          if (cmp == 0) {
            cmp = a.label.compareTo(b.label); // low-to-high
          }
          return cmp;
        });

    for (int i = 0; i < Math.min(topN, maxTopN); i++) {
      String expectedKey = expectedCountsSorted.get(i).getKey();
      int expectedValue = expectedCountsSorted.get(i).getValue();
      assertEquals(expectedKey, facetResult.labelValues[i].label);
      assertEquals(expectedValue, facetResult.labelValues[i].value);
      // make sure getSpecificValue reports the same count
      assertEquals(expectedValue, facets.getSpecificValue("field", expectedKey));
    }

    // execute a "drill down" query on one of the values at random and compare the expected
    // count with the count reported by faceting.
    // NOTE(review): the hits returned by the drill-down search are discarded, so the query's
    // total hit count is never actually asserted here -- confirm whether that is intended.
    if (expectedCountsSorted.isEmpty() == false) {
      DrillDownQuery q = new DrillDownQuery(new FacetsConfig());
      int randomTestValIdx = random().nextInt(Math.min(expectedCountsSorted.size(), topN));
      q.add("field", expectedCountsSorted.get(randomTestValIdx).getKey());
      searcher.search(q, 1);
      assertEquals(
          expectedCountsSorted.get(randomTestValIdx).getValue(),
          facetResult.labelValues[randomTestValIdx].value);
    }
  }
}
}