package org.apache.lucene.search.grouping;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.document.BinaryDocValuesField;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.NumericDocValuesField;
import org.apache.lucene.document.SortedDocValuesField;
import org.apache.lucene.document.StringField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.FieldInfo.DocValuesType;
import org.apache.lucene.index.RandomIndexWriter;
import org.apache.lucene.index.StoredDocument;
import org.apache.lucene.index.Term;
import org.apache.lucene.queries.function.valuesource.BytesRefFieldSource;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Sort;
import org.apache.lucene.search.SortField;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.grouping.function.FunctionDistinctValuesCollector;
import org.apache.lucene.search.grouping.function.FunctionFirstPassGroupingCollector;
import org.apache.lucene.search.grouping.term.TermDistinctValuesCollector;
import org.apache.lucene.search.grouping.term.TermFirstPassGroupingCollector;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.TestUtil;
import org.apache.lucene.util.mutable.MutableValue;
import org.apache.lucene.util.mutable.MutableValueStr;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Random;
import java.util.Set;
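/**
 * Tests the {@code AbstractDistinctValuesCollector} implementations: a first pass grouping collector
 * selects the top "author" groups and a second pass collector counts the distinct "publisher" values
 * per group, exercising both the term based and the function based implementations across several
 * doc values types.
 */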
public class DistinctValuesCollectorTest extends AbstractGroupingTestCase {
private static final NullComparator nullComparator = new NullComparator();
private final String groupField = "author";
private final String dvGroupField = "author_dv";
private final String countField = "publisher";
private final String dvCountField = "publisher_dv";
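/**
 * Indexes a small fixed set of documents (with a commit to force a second segment) and verifies the
 * distinct "publisher" values per "author" group for the queries content:random, content:some and
 * content:blob.
 */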
public void testSimple() throws Exception {
Random random = random();
DocValuesType[] dvTypes = new DocValuesType[]{
DocValuesType.NUMERIC,
DocValuesType.BINARY,
DocValuesType.SORTED,
};
Directory dir = newDirectory();
RandomIndexWriter w = new RandomIndexWriter(
random,
dir,
newIndexWriterConfig(TEST_VERSION_CURRENT,
new MockAnalyzer(random)).setMergePolicy(newLogMergePolicy()));
boolean canUseDV = true;
DocValuesType dvType = canUseDV ? dvTypes[random.nextInt(dvTypes.length)] : null;
Document doc = new Document();
addField(doc, groupField, "1", dvType);
addField(doc, countField, "1", dvType);
doc.add(new TextField("content", "random text", Field.Store.NO));
doc.add(new StringField("id", "1", Field.Store.NO));
w.addDocument(doc);
// 1
doc = new Document();
addField(doc, groupField, "1", dvType);
addField(doc, countField, "1", dvType);
doc.add(new TextField("content", "some more random text blob", Field.Store.NO));
doc.add(new StringField("id", "2", Field.Store.NO));
w.addDocument(doc);
// 2
doc = new Document();
addField(doc, groupField, "1", dvType);
addField(doc, countField, "2", dvType);
doc.add(new TextField("content", "some more random textual data", Field.Store.NO));
doc.add(new StringField("id", "3", Field.Store.NO));
w.addDocument(doc);
w.commit(); // To ensure a second segment
// 3
doc = new Document();
addField(doc, groupField, "2", dvType);
doc.add(new TextField("content", "some random text", Field.Store.NO));
doc.add(new StringField("id", "4", Field.Store.NO));
w.addDocument(doc);
// 4
doc = new Document();
addField(doc, groupField, "3", dvType);
addField(doc, countField, "1", dvType);
doc.add(new TextField("content", "some more random text", Field.Store.NO));
doc.add(new StringField("id", "5", Field.Store.NO));
w.addDocument(doc);
// 5
doc = new Document();
addField(doc, groupField, "3", dvType);
addField(doc, countField, "1", dvType);
doc.add(new TextField("content", "random blob", Field.Store.NO));
doc.add(new StringField("id", "6", Field.Store.NO));
w.addDocument(doc);
// 6 -- no author field
doc = new Document();
doc.add(new TextField("content", "random word stuck in alot of other text", Field.Store.YES));
addField(doc, countField, "1", dvType);
doc.add(new StringField("id", "6", Field.Store.NO));
w.addDocument(doc);
IndexSearcher indexSearcher = newSearcher(w.getReader());
w.shutdown();
Comparator<AbstractDistinctValuesCollector.GroupCount<Comparable<Object>>> cmp = new Comparator<AbstractDistinctValuesCollector.GroupCount<Comparable<Object>>>() {
@Override
public int compare(AbstractDistinctValuesCollector.GroupCount<Comparable<Object>> groupCount1, AbstractDistinctValuesCollector.GroupCount<Comparable<Object>> groupCount2) {
if (groupCount1.groupValue == null) {
if (groupCount2.groupValue == null) {
return 0;
}
return -1;
} else if (groupCount2.groupValue == null) {
return 1;
} else {
return groupCount1.groupValue.compareTo(groupCount2.groupValue);
}
}
};
// === Search for content:random
AbstractFirstPassGroupingCollector<Comparable<Object>> firstCollector = createRandomFirstPassCollector(dvType, new Sort(), groupField, 10);
indexSearcher.search(new TermQuery(new Term("content", "random")), firstCollector);
AbstractDistinctValuesCollector<? extends AbstractDistinctValuesCollector.GroupCount<Comparable<Object>>> distinctValuesCollector
= createDistinctCountCollector(firstCollector, groupField, countField, dvType);
indexSearcher.search(new TermQuery(new Term("content", "random")), distinctValuesCollector);
List<? extends AbstractDistinctValuesCollector.GroupCount<Comparable<Object>>> gcs = distinctValuesCollector.getGroups();
Collections.sort(gcs, cmp);
assertEquals(4, gcs.size());
compareNull(gcs.get(0).groupValue);
List<Comparable<?>> countValues = new ArrayList<Comparable<?>>(gcs.get(0).uniqueValues);
assertEquals(1, countValues.size());
compare("1", countValues.get(0));
compare("1", gcs.get(1).groupValue);
countValues = new ArrayList<Comparable<?>>(gcs.get(1).uniqueValues);
Collections.sort(countValues, nullComparator);
assertEquals(2, countValues.size());
compare("1", countValues.get(0));
compare("2", countValues.get(1));
compare("2", gcs.get(2).groupValue);
countValues = new ArrayList<Comparable<?>>(gcs.get(2).uniqueValues);
assertEquals(1, countValues.size());
compareNull(countValues.get(0));
compare("3", gcs.get(3).groupValue);
countValues = new ArrayList<Comparable<?>>(gcs.get(3).uniqueValues);
assertEquals(1, countValues.size());
compare("1", countValues.get(0));
// === Search for content:some
firstCollector = createRandomFirstPassCollector(dvType, new Sort(), groupField, 10);
indexSearcher.search(new TermQuery(new Term("content", "some")), firstCollector);
distinctValuesCollector = createDistinctCountCollector(firstCollector, groupField, countField, dvType);
indexSearcher.search(new TermQuery(new Term("content", "some")), distinctValuesCollector);
gcs = distinctValuesCollector.getGroups();
Collections.sort(gcs, cmp);
assertEquals(3, gcs.size());
compare("1", gcs.get(0).groupValue);
countValues = new ArrayList<Comparable<?>>(gcs.get(0).uniqueValues);
assertEquals(2, countValues.size());
Collections.sort(countValues, nullComparator);
compare("1", countValues.get(0));
compare("2", countValues.get(1));
compare("2", gcs.get(1).groupValue);
countValues = new ArrayList<Comparable<?>>(gcs.get(1).uniqueValues);
assertEquals(1, countValues.size());
compareNull(countValues.get(0));
compare("3", gcs.get(2).groupValue);
countValues = new ArrayList<Comparable<?>>(gcs.get(2).uniqueValues);
assertEquals(1, countValues.size());
compare("1", countValues.get(0));
// === Search for content:blob
firstCollector = createRandomFirstPassCollector(dvType, new Sort(), groupField, 10);
indexSearcher.search(new TermQuery(new Term("content", "blob")), firstCollector);
distinctValuesCollector = createDistinctCountCollector(firstCollector, groupField, countField, dvType);
indexSearcher.search(new TermQuery(new Term("content", "blob")), distinctValuesCollector);
gcs = distinctValuesCollector.getGroups();
Collections.sort(gcs, cmp);
assertEquals(2, gcs.size());
compare("1", gcs.get(0).groupValue);
countValues = new ArrayList<Comparable<?>>(gcs.get(0).uniqueValues);
// Because only one document in the author "1" group matches "blob"
assertEquals(1, countValues.size());
compare("1", countValues.get(0));
compare("3", gcs.get(1).groupValue);
countValues = new ArrayList<Comparable<?>>(gcs.get(1).uniqueValues);
assertEquals(1, countValues.size());
compare("1", countValues.get(0));
indexSearcher.getIndexReader().close();
dir.close();
}
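/**
 * Builds a handful of random indexes and runs many random term searches against each, comparing the
 * collected groups and their unique count values against an independently computed expected result.
 */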
public void testRandom() throws Exception {
Random random = random();
int numberOfRuns = TestUtil.nextInt(random, 3, 6);
for (int indexIter = 0; indexIter < numberOfRuns; indexIter++) {
IndexContext context = createIndexContext();
for (int searchIter = 0; searchIter < 100; searchIter++) {
final IndexSearcher searcher = newSearcher(context.indexReader);
boolean useDv = context.dvType != null && random.nextBoolean();
DocValuesType dvType = useDv ? context.dvType : null;
String term = context.contentStrings[random.nextInt(context.contentStrings.length)];
Sort groupSort = new Sort(new SortField("id", SortField.Type.STRING));
int topN = 1 + random.nextInt(10);
List<AbstractDistinctValuesCollector.GroupCount<Comparable<?>>> expectedResult = createExpectedResult(context, term, groupSort, topN);
AbstractFirstPassGroupingCollector<Comparable<?>> firstCollector = createRandomFirstPassCollector(dvType, groupSort, groupField, topN);
searcher.search(new TermQuery(new Term("content", term)), firstCollector);
AbstractDistinctValuesCollector<? extends AbstractDistinctValuesCollector.GroupCount<Comparable<?>>> distinctValuesCollector
= createDistinctCountCollector(firstCollector, groupField, countField, dvType);
searcher.search(new TermQuery(new Term("content", term)), distinctValuesCollector);
@SuppressWarnings("unchecked")
List<AbstractDistinctValuesCollector.GroupCount<Comparable<?>>> actualResult = (List<AbstractDistinctValuesCollector.GroupCount<Comparable<?>>>) distinctValuesCollector.getGroups();
if (VERBOSE) {
System.out.println("Index iter=" + indexIter);
System.out.println("Search iter=" + searchIter);
System.out.println("1st pass collector class name=" + firstCollector.getClass().getName());
System.out.println("2nd pass collector class name=" + distinctValuesCollector.getClass().getName());
System.out.println("Search term=" + term);
System.out.println("DVType=" + dvType);
System.out.println("1st pass groups=" + firstCollector.getTopGroups(0, false));
System.out.println("Expected:");
printGroups(expectedResult);
System.out.println("Actual:");
printGroups(actualResult);
}
assertEquals(expectedResult.size(), actualResult.size());
for (int i = 0; i < expectedResult.size(); i++) {
AbstractDistinctValuesCollector.GroupCount<Comparable<?>> expected = expectedResult.get(i);
AbstractDistinctValuesCollector.GroupCount<Comparable<?>> actual = actualResult.get(i);
assertValues(expected.groupValue, actual.groupValue);
assertEquals(expected.uniqueValues.size(), actual.uniqueValues.size());
List<Comparable<?>> expectedUniqueValues = new ArrayList<>(expected.uniqueValues);
Collections.sort(expectedUniqueValues, nullComparator);
List<Comparable<?>> actualUniqueValues = new ArrayList<>(actual.uniqueValues);
Collections.sort(actualUniqueValues, nullComparator);
for (int j = 0; j < expectedUniqueValues.size(); j++) {
assertValues(expectedUniqueValues.get(j), actualUniqueValues.get(j));
}
}
}
context.indexReader.close();
context.directory.close();
}
}
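/** Prints each group value and its unique count values, decoding BytesRef values to UTF-8 strings. */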
private void printGroups(List<AbstractDistinctValuesCollector.GroupCount<Comparable<?>>> results) {
for(int i=0;i<results.size();i++) {
AbstractDistinctValuesCollector.GroupCount<Comparable<?>> group = results.get(i);
Object gv = group.groupValue;
if (gv instanceof BytesRef) {
System.out.println(i + ": groupValue=" + ((BytesRef) gv).utf8ToString());
} else {
System.out.println(i + ": groupValue=" + gv);
}
for(Object o : group.uniqueValues) {
if (o instanceof BytesRef) {
System.out.println(" " + ((BytesRef) o).utf8ToString());
} else {
System.out.println(" " + o);
}
}
}
}
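/** Compares an expected BytesRef (or null for a missing value) against an actual collector value. */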
private void assertValues(Object expected, Object actual) {
if (expected == null) {
compareNull(actual);
} else {
compare(((BytesRef) expected).utf8ToString(), actual);
}
}
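/**
 * Asserts that the actual value equals the expected string, converting the expectation to whatever
 * representation the collector produced (BytesRef, Double, Long or MutableValueStr).
 */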
private void compare(String expected, Object groupValue) {
if (BytesRef.class.isAssignableFrom(groupValue.getClass())) {
assertEquals(expected, ((BytesRef) groupValue).utf8ToString());
} else if (Double.class.isAssignableFrom(groupValue.getClass())) {
assertEquals(Double.parseDouble(expected), groupValue);
} else if (Long.class.isAssignableFrom(groupValue.getClass())) {
assertEquals(Long.parseLong(expected), groupValue);
} else if (MutableValue.class.isAssignableFrom(groupValue.getClass())) {
MutableValueStr mutableValue = new MutableValueStr();
mutableValue.value = new BytesRef(expected);
assertEquals(mutableValue, groupValue);
} else {
fail();
}
}
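/**
 * Asserts that the value represents a missing field: null for the term based impl, an empty or zero
 * default for the doc values based impls, or a non-existing MutableValue for the function based impl.
 */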
private void compareNull(Object groupValue) {
if (groupValue == null) {
return; // the term based impl reports a missing value as null
}
// the doc values based impls report a missing value as an empty/zero default:
if (BytesRef.class.isAssignableFrom(groupValue.getClass())) {
assertEquals("", ((BytesRef) groupValue).utf8ToString());
} else if (Double.class.isAssignableFrom(groupValue.getClass())) {
assertEquals(0.0d, groupValue);
} else if (Long.class.isAssignableFrom(groupValue.getClass())) {
assertEquals(0L, groupValue);
// Function based impl
} else if (MutableValue.class.isAssignableFrom(groupValue.getClass())) {
assertFalse(((MutableValue) groupValue).exists());
} else {
fail();
}
}
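/**
 * Adds the value as a stored StringField and, when a doc values type is given, also as a doc values
 * field named {@code field + "_dv"}.
 */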
private void addField(Document doc, String field, String value, DocValuesType type) {
doc.add(new StringField(field, value, Field.Store.YES));
if (type == null) {
return;
}
String dvField = field + "_dv";
Field valuesField = null;
switch (type) {
case NUMERIC:
valuesField = new NumericDocValuesField(dvField, Integer.parseInt(value));
break;
case BINARY:
valuesField = new BinaryDocValuesField(dvField, new BytesRef(value));
break;
case SORTED:
valuesField = new SortedDocValuesField(dvField, new BytesRef(value));
break;
}
doc.add(valuesField);
}
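/**
 * Creates the second pass distinct values collector that matches the given first pass collector: a
 * FunctionDistinctValuesCollector for a function based first pass, otherwise a TermDistinctValuesCollector.
 */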
@SuppressWarnings({"unchecked","rawtypes"})
private <T extends Comparable> AbstractDistinctValuesCollector<AbstractDistinctValuesCollector.GroupCount<T>> createDistinctCountCollector(AbstractFirstPassGroupingCollector<T> firstPassGroupingCollector,
String groupField,
String countField,
DocValuesType dvType) {
Random random = random();
Collection<SearchGroup<T>> searchGroups = firstPassGroupingCollector.getTopGroups(0, false);
if (FunctionFirstPassGroupingCollector.class.isAssignableFrom(firstPassGroupingCollector.getClass())) {
return (AbstractDistinctValuesCollector) new FunctionDistinctValuesCollector(new HashMap<>(), new BytesRefFieldSource(groupField), new BytesRefFieldSource(countField), (Collection) searchGroups);
} else {
return (AbstractDistinctValuesCollector) new TermDistinctValuesCollector(groupField, countField, (Collection) searchGroups);
}
}
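/** Randomly picks either a function based or a term based first pass grouping collector. */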
@SuppressWarnings({"unchecked","rawtypes"})
private <T> AbstractFirstPassGroupingCollector<T> createRandomFirstPassCollector(DocValuesType dvType, Sort groupSort, String groupField, int topNGroups) throws IOException {
Random random = random();
// The collector choice is independent of dvType; both branches behave the same for every doc values type used here.
if (random.nextBoolean()) {
return (AbstractFirstPassGroupingCollector<T>) new FunctionFirstPassGroupingCollector(new BytesRefFieldSource(groupField), new HashMap<>(), groupSort, topNGroups);
} else {
return (AbstractFirstPassGroupingCollector<T>) new TermFirstPassGroupingCollector(groupField, groupSort, topNGroups);
}
}
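/**
 * Computes the expected top-N groups and their distinct count values for the given term from the
 * bookkeeping recorded while the random index was built.
 */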
@SuppressWarnings({"unchecked","rawtypes"})
private List<AbstractDistinctValuesCollector.GroupCount<Comparable<?>>> createExpectedResult(IndexContext context, String term, Sort groupSort, int topN) {
class GroupCount extends AbstractDistinctValuesCollector.GroupCount<BytesRef> {
GroupCount(BytesRef groupValue, Collection<BytesRef> uniqueValues) {
super(groupValue);
this.uniqueValues.addAll(uniqueValues);
}
}
List result = new ArrayList();
Map<String, Set<String>> groupCounts = context.searchTermToGroupCounts.get(term);
int i = 0;
for (String group : groupCounts.keySet()) {
if (topN <= i++) {
break;
}
Set<BytesRef> uniqueValues = new HashSet<>();
for (String val : groupCounts.get(group)) {
uniqueValues.add(val != null ? new BytesRef(val) : null);
}
result.add(new GroupCount(group != null ? new BytesRef(group) : null, uniqueValues));
}
return result;
}
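/**
 * Builds a random index with "author" and "publisher" values (occasionally missing) and records, per
 * content term, which count values occur in which group, so that search results can be verified later.
 */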
private IndexContext createIndexContext() throws Exception {
Random random = random();
DocValuesType[] dvTypes = new DocValuesType[]{
DocValuesType.BINARY,
DocValuesType.SORTED
};
Directory dir = newDirectory();
RandomIndexWriter w = new RandomIndexWriter(
random,
dir,
newIndexWriterConfig(TEST_VERSION_CURRENT,
new MockAnalyzer(random)).setMergePolicy(newLogMergePolicy())
);
boolean canUseDV = true;
DocValuesType dvType = canUseDV ? dvTypes[random.nextInt(dvTypes.length)] : null;
int numDocs = 86 + random.nextInt(1087) * RANDOM_MULTIPLIER;
String[] groupValues = new String[numDocs / 5];
String[] countValues = new String[numDocs / 10];
for (int i = 0; i < groupValues.length; i++) {
groupValues[i] = generateRandomNonEmptyString();
}
for (int i = 0; i < countValues.length; i++) {
countValues[i] = generateRandomNonEmptyString();
}
List<String> contentStrings = new ArrayList<>();
Map<String, Map<String, Set<String>>> searchTermToGroupCounts = new HashMap<>();
for (int i = 1; i <= numDocs; i++) {
String groupValue = random.nextInt(23) == 14 ? null : groupValues[random.nextInt(groupValues.length)];
String countValue = random.nextInt(21) == 13 ? null : countValues[random.nextInt(countValues.length)];
String content = "random" + random.nextInt(numDocs / 20);
Map<String, Set<String>> groupToCounts = searchTermToGroupCounts.get(content);
if (groupToCounts == null) {
// Group order follows the first matching doc (docID asc), so LinkedHashMap insertion order matches the collector's group order
searchTermToGroupCounts.put(content, groupToCounts = new LinkedHashMap<>());
contentStrings.add(content);
}
Set<String> countsVals = groupToCounts.get(groupValue);
if (countsVals == null) {
groupToCounts.put(groupValue, countsVals = new HashSet<>());
}
countsVals.add(countValue);
Document doc = new Document();
doc.add(new StringField("id", String.format(Locale.ROOT, "%09d", i), Field.Store.YES));
if (groupValue != null) {
addField(doc, groupField, groupValue, dvType);
}
if (countValue != null) {
addField(doc, countField, countValue, dvType);
}
doc.add(new TextField("content", content, Field.Store.YES));
w.addDocument(doc);
}
DirectoryReader reader = w.getReader();
if (VERBOSE) {
for(int docID=0;docID<reader.maxDoc();docID++) {
StoredDocument doc = reader.document(docID);
System.out.println("docID=" + docID + " id=" + doc.get("id") + " content=" + doc.get("content") + " author=" + doc.get("author") + " publisher=" + doc.get("publisher"));
}
}
w.shutdown();
return new IndexContext(dir, reader, dvType, searchTermToGroupCounts, contentStrings.toArray(new String[contentStrings.size()]));
}
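/** Holds the random index together with the bookkeeping needed to compute expected results. */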
private static class IndexContext {
final Directory directory;
final DirectoryReader indexReader;
final DocValuesType dvType;
final Map<String, Map<String, Set<String>>> searchTermToGroupCounts;
final String[] contentStrings;
IndexContext(Directory directory, DirectoryReader indexReader, DocValuesType dvType,
Map<String, Map<String, Set<String>>> searchTermToGroupCounts, String[] contentStrings) {
this.directory = directory;
this.indexReader = indexReader;
this.dvType = dvType;
this.searchTermToGroupCounts = searchTermToGroupCounts;
this.contentStrings = contentStrings;
}
}
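/** Comparator that orders null before any non-null value and otherwise compares values naturally. */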
private static class NullComparator implements Comparator<Comparable<?>> {
@Override
@SuppressWarnings({"unchecked","rawtypes"})
public int compare(Comparable a, Comparable b) {
if (a == b) {
return 0;
} else if (a == null) {
return -1;
} else if (b == null) {
return 1;
} else {
return a.compareTo(b);
}
}
}
}