blob: 82eb5411175c191bf50f4459dabe1f6e57ac8ed0 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.facet;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.IntPoint;
import org.apache.lucene.document.NumericDocValuesField;
import org.apache.lucene.document.SortedNumericDocValuesField;
import org.apache.lucene.document.StringField;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.RandomIndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.LongValuesSource;
import org.apache.lucene.search.MatchAllDocsQuery;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util.TestUtil;
/** Tests long value facets. */
public class TestLongValueFacetCounts extends LuceneTestCase {
/**
 * Indexes 100 single-valued documents whose values cycle through 0..4, plus one document
 * holding {@link Long#MAX_VALUE}, collects every document through a {@link FacetsCollector}
 * and checks the value-sorted facet counts.
 */
public void testBasic() throws Exception {
  Directory d = newDirectory();
  RandomIndexWriter w = new RandomIndexWriter(random(), d);
  for (long l = 0; l < 100; l++) {
    Document doc = new Document();
    doc.add(new NumericDocValuesField("field", l % 5));
    w.addDocument(doc);
  }

  // Also add Long.MAX_VALUE
  Document doc = new Document();
  doc.add(new NumericDocValuesField("field", Long.MAX_VALUE));
  w.addDocument(doc);

  IndexReader r = w.getReader();
  w.close();

  FacetsCollector fc = new FacetsCollector();
  IndexSearcher s = newSearcher(r);
  s.search(new MatchAllDocsQuery(), fc);

  LongValueFacetCounts facets = new LongValueFacetCounts("field", fc, false);

  FacetResult result = facets.getAllChildrenSortByValue();
  // FacetResult.toString() prints one child per line, each indented by two spaces.
  assertEquals(
      "dim=field path=[] value=101 childCount=6\n"
          + "  0 (20)\n"
          + "  1 (20)\n"
          + "  2 (20)\n"
          + "  3 (20)\n"
          + "  4 (20)\n"
          + "  9223372036854775807 (1)\n",
      result.toString());
  r.close();
  d.close();
}
/**
 * See: LUCENE-10070. Indexes ten documents with values alternating between 0 and 1, deletes
 * the document with id "0", and checks that the reader-based "count all" constructor reports
 * only the nine remaining documents.
 */
public void testCountAll() throws Exception {
  Directory d = newDirectory();
  RandomIndexWriter w = new RandomIndexWriter(random(), d);
  for (int i = 0; i < 10; i++) {
    Document doc = new Document();
    doc.add(new StringField("id", String.valueOf(i), Field.Store.NO));
    doc.add(new NumericDocValuesField("field", i % 2));
    w.addDocument(doc);
  }

  // The deleted document carried value 0, so value 0 is expected 4 times and value 1 5 times.
  w.deleteDocuments(new Term("id", "0"));

  IndexReader r = w.getReader();
  w.close();

  LongValueFacetCounts facets = new LongValueFacetCounts("field", r, false);

  FacetResult result = facets.getAllChildrenSortByValue();
  // FacetResult.toString() prints one child per line, each indented by two spaces.
  assertEquals(
      "dim=field path=[] value=9 childCount=2\n" + "  0 (4)\n" + "  1 (5)\n",
      result.toString());
  r.close();
  d.close();
}
/**
 * Indexes three documents holding the three largest {@code long} values and checks the
 * value-sorted facet counts gathered through a {@link FacetsCollector}.
 */
public void testOnlyBigLongs() throws Exception {
  Directory d = newDirectory();
  RandomIndexWriter w = new RandomIndexWriter(random(), d);
  for (long l = 0; l < 3; l++) {
    Document doc = new Document();
    doc.add(new NumericDocValuesField("field", Long.MAX_VALUE - l));
    w.addDocument(doc);
  }

  IndexReader r = w.getReader();
  w.close();

  FacetsCollector fc = new FacetsCollector();
  IndexSearcher s = newSearcher(r);
  s.search(new MatchAllDocsQuery(), fc);

  LongValueFacetCounts facets = new LongValueFacetCounts("field", fc, false);

  FacetResult result = facets.getAllChildrenSortByValue();
  // FacetResult.toString() prints one child per line, each indented by two spaces.
  assertEquals(
      "dim=field path=[] value=3 childCount=3\n"
          + "  9223372036854775805 (1)\n"
          + "  9223372036854775806 (1)\n"
          + "  9223372036854775807 (1)\n",
      result.toString());
  r.close();
  d.close();
}
/**
 * Builds the same index as {@code testBasic} (values 0..4 twenty times each plus one
 * {@link Long#MAX_VALUE}) and checks that {@link Facets#getAllDims(int)} returns a single
 * result for the one counted field.
 */
public void testGetAllDims() throws Exception {
  Directory d = newDirectory();
  RandomIndexWriter w = new RandomIndexWriter(random(), d);
  for (long l = 0; l < 100; l++) {
    Document doc = new Document();
    doc.add(new NumericDocValuesField("field", l % 5));
    w.addDocument(doc);
  }

  // Also add Long.MAX_VALUE
  Document doc = new Document();
  doc.add(new NumericDocValuesField("field", Long.MAX_VALUE));
  w.addDocument(doc);

  IndexReader r = w.getReader();
  w.close();

  FacetsCollector fc = new FacetsCollector();
  IndexSearcher s = newSearcher(r);
  s.search(new MatchAllDocsQuery(), fc);

  Facets facets = new LongValueFacetCounts("field", fc, false);

  List<FacetResult> result = facets.getAllDims(10);
  assertEquals(1, result.size());
  // FacetResult.toString() prints one child per line, each indented by two spaces.
  assertEquals(
      "dim=field path=[] value=101 childCount=6\n"
          + "  0 (20)\n"
          + "  1 (20)\n"
          + "  2 (20)\n"
          + "  3 (20)\n"
          + "  4 (20)\n"
          + "  9223372036854775807 (1)\n",
      result.get(0).toString());
  r.close();
  d.close();
}
/**
 * Randomized single-valued test. Indexes at least 1000 documents, each of which either holds
 * one random value in {@code [-maxValue, maxValue]} or no value at all, then repeatedly
 * compares the facet counts against counts computed directly from the indexed values: once
 * over all documents and once over a random inclusive id range, each sorted by value and by
 * count.
 */
public void testRandomSingleValued() throws Exception {
  Directory dir = newDirectory();
  RandomIndexWriter writer = new RandomIndexWriter(random(), dir);

  int docCount = atLeast(1000);
  double missingChance = random().nextDouble();
  // either any non-negative long, or a small bound below 1000
  long maxValue =
      random().nextBoolean() ? random().nextLong() & Long.MAX_VALUE : random().nextInt(1000);
  if (VERBOSE) {
    System.out.println(
        "TEST: valueCount="
            + docCount
            + " valueRange=-"
            + maxValue
            + "-"
            + maxValue
            + " missingChance="
            + missingChance);
  }

  // values[id] stays null for documents indexed without a value
  Long[] values = new Long[docCount];
  int missingCount = 0;
  for (int docId = 0; docId < docCount; docId++) {
    Document doc = new Document();
    doc.add(new IntPoint("id", docId));
    if (random().nextDouble() <= missingChance) {
      missingCount++;
    } else {
      long value = TestUtil.nextLong(random(), -maxValue, maxValue);
      doc.add(new NumericDocValuesField("field", value));
      values[docId] = value;
    }
    writer.addDocument(doc);
  }

  IndexReader reader = writer.getReader();
  writer.close();

  IndexSearcher searcher = newSearcher(reader);

  int iters = atLeast(100);
  for (int iter = 0; iter < iters; iter++) {
    FacetsCollector fc = new FacetsCollector();
    if (VERBOSE) {
      System.out.println("\nTEST: iter=" + iter);
      System.out.println(" test all docs");
    }

    // all docs
    Map<Long, Integer> expected = new HashMap<>();
    for (Long value : values) {
      if (value != null) {
        expected.merge(value, 1, Integer::sum);
      }
    }
    int expectedChildCount = expected.size();
    List<Map.Entry<Long, Integer>> expectedCounts = new ArrayList<>(expected.entrySet());

    // sort by value
    expectedCounts.sort(Map.Entry.comparingByKey());

    LongValueFacetCounts facetCounts;
    boolean viaCollector = random().nextBoolean();
    if (viaCollector) {
      searcher.search(new MatchAllDocsQuery(), fc);
    }
    boolean viaValueSource = random().nextBoolean();
    if (viaCollector) {
      if (viaValueSource) {
        if (VERBOSE) {
          System.out.println(" use value source");
        }
        facetCounts =
            new LongValueFacetCounts("field", LongValuesSource.fromLongField("field"), fc);
      } else {
        if (VERBOSE) {
          System.out.println(" use doc values");
        }
        facetCounts = new LongValueFacetCounts("field", fc, false);
      }
    } else {
      // optimized count all:
      if (viaValueSource) {
        if (VERBOSE) {
          System.out.println(" count all value source");
        }
        facetCounts =
            new LongValueFacetCounts("field", LongValuesSource.fromLongField("field"), reader);
      } else {
        if (VERBOSE) {
          System.out.println(" count all doc values");
        }
        facetCounts = new LongValueFacetCounts("field", reader, false);
      }
    }

    int docsWithValue = docCount - missingCount;
    FacetResult actual = facetCounts.getAllChildrenSortByValue();
    assertSame(
        "all docs, sort facets by value",
        expectedCounts,
        expectedChildCount,
        docsWithValue,
        actual,
        Integer.MAX_VALUE);

    // sort by count
    expectedCounts.sort(
        (left, right) -> {
          int byCount = Integer.compare(right.getValue(), left.getValue());
          // tie break by value
          return byCount != 0 ? byCount : Long.compare(left.getKey(), right.getKey());
        });

    int topN = random().nextBoolean() ? docCount : random().nextInt(docCount);
    if (VERBOSE) {
      System.out.println(" topN=" + topN);
    }

    actual = facetCounts.getTopChildrenSortByCount(topN);
    assertSame(
        "all docs, sort facets by count",
        expectedCounts,
        expectedChildCount,
        docsWithValue,
        actual,
        topN);

    // subset of docs
    int firstId = random().nextInt(docCount);
    int secondId = random().nextInt(docCount);
    int minId = Math.min(firstId, secondId);
    int maxId = Math.max(firstId, secondId);
    if (VERBOSE) {
      System.out.println(" test id range " + minId + "-" + maxId);
    }

    fc = new FacetsCollector();
    searcher.search(IntPoint.newRangeQuery("id", minId, maxId), fc);
    if (random().nextBoolean()) {
      if (VERBOSE) {
        System.out.println(" use doc values");
      }
      facetCounts = new LongValueFacetCounts("field", fc, false);
    } else {
      if (VERBOSE) {
        System.out.println(" use value source");
      }
      facetCounts =
          new LongValueFacetCounts("field", LongValuesSource.fromLongField("field"), fc);
    }

    // recompute the expectation for the inclusive id range only
    expected = new HashMap<>();
    int totCount = 0;
    for (int docId = minId; docId <= maxId; docId++) {
      Long value = values[docId];
      if (value == null) {
        continue;
      }
      totCount++;
      expected.merge(value, 1, Integer::sum);
    }
    expectedChildCount = expected.size();
    expectedCounts = new ArrayList<>(expected.entrySet());

    // sort by value
    expectedCounts.sort(Map.Entry.comparingByKey());
    String rangeDesc = "id " + minId + "-" + maxId;
    actual = facetCounts.getAllChildrenSortByValue();
    assertSame(
        rangeDesc + ", sort facets by value",
        expectedCounts,
        expectedChildCount,
        totCount,
        actual,
        Integer.MAX_VALUE);

    // sort by count
    expectedCounts.sort(
        (left, right) -> {
          int byCount = Integer.compare(right.getValue(), left.getValue());
          // tie break by value
          return byCount != 0 ? byCount : Long.compare(left.getKey(), right.getKey());
        });
    topN = random().nextBoolean() ? docCount : random().nextInt(docCount);
    actual = facetCounts.getTopChildrenSortByCount(topN);
    assertSame(
        rangeDesc + ", sort facets by count",
        expectedCounts,
        expectedChildCount,
        totCount,
        actual,
        topN);
  }
  reader.close();
  dir.close();
}
/**
 * Randomized multi-valued test. Indexes at least 1000 documents, each holding between one and
 * five random values (or exactly one when {@code allSingleValued} is drawn) or no value at
 * all, then repeatedly compares the facet counts against expectations built with
 * {@code setExpectedFrequencies}: once over all documents and once over a random inclusive id
 * range, each sorted by value and by count.
 */
public void testRandomMultiValued() throws Exception {
  Directory dir = newDirectory();
  RandomIndexWriter writer = new RandomIndexWriter(random(), dir);

  int docCount = atLeast(1000);
  double missingChance = random().nextDouble();

  // sometimes exercise codec optimizations when a claimed multi valued field is in fact single
  // valued:
  boolean allSingleValued = rarely();
  // either any non-negative long, or a small bound below 1000
  long maxValue =
      random().nextBoolean() ? random().nextLong() & Long.MAX_VALUE : random().nextInt(1000);
  if (VERBOSE) {
    System.out.println(
        "TEST: valueCount="
            + docCount
            + " valueRange=-"
            + maxValue
            + "-"
            + maxValue
            + " missingChance="
            + missingChance
            + " allSingleValued="
            + allSingleValued);
  }

  // values[id] stays null for documents indexed without any value
  long[][] values = new long[docCount][];
  int missingCount = 0;
  for (int docId = 0; docId < docCount; docId++) {
    Document doc = new Document();
    doc.add(new IntPoint("id", docId));
    if (random().nextDouble() > missingChance) {
      int valueCount = allSingleValued ? 1 : TestUtil.nextInt(random(), 1, 5);
      long[] docValues = new long[valueCount];
      values[docId] = docValues;
      for (int slot = 0; slot < valueCount; slot++) {
        long value = TestUtil.nextLong(random(), -maxValue, maxValue);
        docValues[slot] = value;
        doc.add(new SortedNumericDocValuesField("field", value));
      }
      if (VERBOSE) {
        System.out.println(" doc=" + docId + " values=" + Arrays.toString(docValues));
      }
      // sort values to enable duplicate detection by comparing with the previous value
      Arrays.sort(docValues);
    } else {
      missingCount++;
      if (VERBOSE) {
        System.out.println(" doc=" + docId + " missing values");
      }
    }
    writer.addDocument(doc);
  }

  IndexReader reader = writer.getReader();
  writer.close();

  IndexSearcher searcher = newSearcher(reader);

  int iters = atLeast(100);
  for (int iter = 0; iter < iters; iter++) {
    FacetsCollector fc = new FacetsCollector();
    if (VERBOSE) {
      System.out.println("\nTEST: iter=" + iter);
      System.out.println(" test all docs");
    }

    // all docs
    Map<Long, Integer> expected = new HashMap<>();
    int expectedTotalCount = 0;
    for (long[] docValues : values) {
      if (docValues == null || docValues.length == 0) {
        continue;
      }
      expectedTotalCount++;
      setExpectedFrequencies(docValues, expected);
    }
    List<Map.Entry<Long, Integer>> expectedCounts = new ArrayList<>(expected.entrySet());
    int expectedChildCount = expected.size();

    // sort by value
    expectedCounts.sort(Map.Entry.comparingByKey());

    LongValueFacetCounts facetCounts;
    if (random().nextBoolean()) {
      searcher.search(new MatchAllDocsQuery(), fc);
      if (VERBOSE) {
        System.out.println(" use doc values");
      }
      facetCounts = new LongValueFacetCounts("field", fc, true);
    } else {
      // optimized count all:
      if (VERBOSE) {
        System.out.println(" count all doc values");
      }
      facetCounts = new LongValueFacetCounts("field", reader, true);
    }

    FacetResult actual = facetCounts.getAllChildrenSortByValue();
    assertSame(
        "all docs, sort facets by value",
        expectedCounts,
        expectedChildCount,
        expectedTotalCount,
        actual,
        Integer.MAX_VALUE);

    // sort by count
    expectedCounts.sort(
        (left, right) -> {
          int byCount = Integer.compare(right.getValue(), left.getValue());
          // tie break by value
          return byCount != 0 ? byCount : Long.compare(left.getKey(), right.getKey());
        });

    int topN = random().nextBoolean() ? docCount : random().nextInt(docCount);
    if (VERBOSE) {
      System.out.println(" topN=" + topN);
    }

    actual = facetCounts.getTopChildrenSortByCount(topN);
    assertSame(
        "all docs, sort facets by count",
        expectedCounts,
        expectedChildCount,
        expectedTotalCount,
        actual,
        topN);

    // subset of docs
    int firstId = random().nextInt(docCount);
    int secondId = random().nextInt(docCount);
    int minId = Math.min(firstId, secondId);
    int maxId = Math.max(firstId, secondId);
    if (VERBOSE) {
      System.out.println(" test id range " + minId + "-" + maxId);
    }

    fc = new FacetsCollector();
    searcher.search(IntPoint.newRangeQuery("id", minId, maxId), fc);
    // cannot use value source here because we are multi valued
    facetCounts = new LongValueFacetCounts("field", fc, true);

    // recompute the expectation for the inclusive id range only
    expected = new HashMap<>();
    expectedTotalCount = 0;
    for (int docId = minId; docId <= maxId; docId++) {
      long[] docValues = values[docId];
      if (docValues == null || docValues.length == 0) {
        continue;
      }
      expectedTotalCount++;
      setExpectedFrequencies(docValues, expected);
    }
    expectedCounts = new ArrayList<>(expected.entrySet());
    expectedChildCount = expected.size();

    // sort by value
    expectedCounts.sort(Map.Entry.comparingByKey());
    String rangeDesc = "id " + minId + "-" + maxId;
    actual = facetCounts.getAllChildrenSortByValue();
    assertSame(
        rangeDesc + ", sort facets by value",
        expectedCounts,
        expectedChildCount,
        expectedTotalCount,
        actual,
        Integer.MAX_VALUE);

    // sort by count
    expectedCounts.sort(
        (left, right) -> {
          int byCount = Integer.compare(right.getValue(), left.getValue());
          // tie break by value
          return byCount != 0 ? byCount : Long.compare(left.getKey(), right.getKey());
        });
    topN = random().nextBoolean() ? docCount : random().nextInt(docCount);
    actual = facetCounts.getTopChildrenSortByCount(topN);
    assertSame(
        rangeDesc + ", sort facets by count",
        expectedCounts,
        expectedChildCount,
        expectedTotalCount,
        actual,
        topN);
  }
  reader.close();
  dir.close();
}
/**
 * Adds one document's values to the expected per-value counts, counting each distinct value
 * once. A value equal to its immediate predecessor in {@code values} is skipped, so
 * duplicates are only collapsed when they are adjacent (callers sort the array first).
 *
 * @param values the values of one document
 * @param expected running map from value to the number of documents containing it; updated
 *     in place
 */
private void setExpectedFrequencies(long[] values, Map<Long, Integer> expected) {
  for (int idx = 0; idx < values.length; idx++) {
    long current = values[idx];
    boolean repeatsPrevious = idx > 0 && values[idx - 1] == current;
    if (repeatsPrevious) {
      continue;
    }
    expected.put(current, expected.getOrDefault(current, 0) + 1);
  }
}
/**
 * Checks a {@link FacetResult} against the expected, already sorted, per-value counts.
 *
 * @param desc prefix for the assertion messages
 * @param expectedCounts expected (value, count) entries in the order the result should list
 *     them
 * @param expectedChildCount expected {@code childCount} of the result
 * @param expectedTotalCount expected {@code value} of the result
 * @param actual the result under test
 * @param topN maximum number of children requested; only the first
 *     {@code min(topN, expectedCounts.size())} expected entries are compared
 */
private static void assertSame(String desc, List<Map.Entry<Long, Integer>> expectedCounts,
    int expectedChildCount, int expectedTotalCount, FacetResult actual, int topN) {
  final int expectedTopN = Math.min(topN, expectedCounts.size());
  if (VERBOSE) {
    System.out.println(" expected topN=" + expectedTopN);
    for (int rank = 0; rank < expectedTopN; rank++) {
      Map.Entry<Long, Integer> entry = expectedCounts.get(rank);
      System.out.println(
          " " + rank + ": value=" + entry.getKey() + " count=" + entry.getValue());
    }
    System.out.println(" actual topN=" + actual.labelValues.length);
    for (int rank = 0; rank < actual.labelValues.length; rank++) {
      System.out.println(
          " "
              + rank
              + ": value="
              + actual.labelValues[rank].label
              + " count="
              + actual.labelValues[rank].value);
    }
  }

  // overall shape first, then the individual children
  assertEquals(desc + ": topN", expectedTopN, actual.labelValues.length);
  assertEquals(desc + ": childCount", expectedChildCount, actual.childCount);
  assertEquals(desc + ": totCount", expectedTotalCount, actual.value.intValue());
  assertTrue(actual.labelValues.length <= topN);

  for (int rank = 0; rank < expectedTopN; rank++) {
    Map.Entry<Long, Integer> entry = expectedCounts.get(rank);
    String expectedLabel = Long.toString(entry.getKey());
    int expectedCount = entry.getValue().intValue();
    assertEquals(desc + ": label[" + rank + "]", expectedLabel, actual.labelValues[rank].label);
    assertEquals(
        desc + ": counts[" + rank + "]",
        expectedCount,
        actual.labelValues[rank].value.intValue());
  }
}
/**
 * LUCENE-9964: Duplicate long values in a document field should only be counted once when using
 * SortedNumericDocValuesFields
 *
 * <p>Indexes two documents, each holding the same value twice (42 and 43 respectively), and
 * checks that the multi-valued "count all" path reports both values with a count of one.
 */
public void testDuplicateLongValues() throws Exception {
  Directory dir = newDirectory();
  RandomIndexWriter w = new RandomIndexWriter(random(), dir);

  Document doc = new Document();
  // these two values are not unique in a document
  doc.add(new SortedNumericDocValuesField("field", 42));
  doc.add(new SortedNumericDocValuesField("field", 42));
  w.addDocument(doc);
  doc = new Document();
  doc.add(new SortedNumericDocValuesField("field", 43));
  doc.add(new SortedNumericDocValuesField("field", 43));
  w.addDocument(doc);

  IndexReader r = w.getReader();
  w.close();
  LongValueFacetCounts facetCounts = new LongValueFacetCounts("field", r, true);

  FacetResult fr = facetCounts.getAllChildrenSortByValue();
  // Both distinct values must be reported; otherwise the loop below could pass vacuously.
  assertEquals(2, fr.labelValues.length);
  for (LabelAndValue labelAndValue : fr.labelValues) {
    // JUnit assertion instead of the `assert` keyword: it runs even when JVM assertions are
    // disabled and reports the expected and actual count on failure.
    assertEquals(
        "count for value " + labelAndValue.label, 1, labelAndValue.value.intValue());
  }

  r.close();
  dir.close();
}
}