package org.apache.lucene.search.grouping;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.document.BinaryDocValuesField;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.NumericDocValuesField;
import org.apache.lucene.document.SortedDocValuesField;
import org.apache.lucene.document.StringField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.FieldInfo.DocValuesType;
import org.apache.lucene.index.RandomIndexWriter;
import org.apache.lucene.index.StoredDocument;
import org.apache.lucene.index.Term;
import org.apache.lucene.queries.function.valuesource.BytesRefFieldSource;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Sort;
import org.apache.lucene.search.SortField;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.grouping.function.FunctionDistinctValuesCollector;
import org.apache.lucene.search.grouping.function.FunctionFirstPassGroupingCollector;
import org.apache.lucene.search.grouping.term.TermDistinctValuesCollector;
import org.apache.lucene.search.grouping.term.TermFirstPassGroupingCollector;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.TestUtil;
import org.apache.lucene.util.mutable.MutableValue;
import org.apache.lucene.util.mutable.MutableValueStr;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Random;
import java.util.Set;
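/**
 * Tests the {@code AbstractDistinctValuesCollector} implementations: a first pass grouping collector
 * selects the top "author" groups and a second pass collector counts the distinct "publisher" values
 * per group, exercising both the term based and the function based implementations across several
 * doc values types.
 */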
public class DistinctValuesCollectorTest extends AbstractGroupingTestCase {
private static final NullComparator nullComparator = new NullComparator();
private final String groupField = "author";
private final String dvGroupField = "author_dv";
private final String countField = "publisher";
private final String dvCountField = "publisher_dv";
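/**
 * Indexes a small fixed set of documents (with a commit to force a second segment) and verifies the
 * distinct "publisher" values per "author" group for the queries content:random, content:some and
 * content:blob.
 */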
public void testSimple() throws Exception {
Random random = random();
DocValuesType[] dvTypes = new DocValuesType[]{
DocValuesType.NUMERIC,
DocValuesType.BINARY,
DocValuesType.SORTED,
};
Directory dir = newDirectory();
RandomIndexWriter w = new RandomIndexWriter(
random,
dir,
newIndexWriterConfig(TEST_VERSION_CURRENT,
new MockAnalyzer(random)).setMergePolicy(newLogMergePolicy()));
boolean canUseDV = true;
DocValuesType dvType = canUseDV ? dvTypes[random.nextInt(dvTypes.length)] : null;
Document doc = new Document();
addField(doc, groupField, "1", dvType);
addField(doc, countField, "1", dvType);
doc.add(new TextField("content", "random text", Field.Store.NO));
doc.add(new StringField("id", "1", Field.Store.NO));
w.addDocument(doc);
// 1
doc = new Document();
addField(doc, groupField, "1", dvType);
addField(doc, countField, "1", dvType);
doc.add(new TextField("content", "some more random text blob", Field.Store.NO));
doc.add(new StringField("id", "2", Field.Store.NO));
w.addDocument(doc);
// 2
doc = new Document();
addField(doc, groupField, "1", dvType);
addField(doc, countField, "2", dvType);
doc.add(new TextField("content", "some more random textual data", Field.Store.NO));
doc.add(new StringField("id", "3", Field.Store.NO));
w.addDocument(doc);
w.commit(); // To ensure a second segment
// 3
doc = new Document();
addField(doc, groupField, "2", dvType);
doc.add(new TextField("content", "some random text", Field.Store.NO));
doc.add(new StringField("id", "4", Field.Store.NO));
w.addDocument(doc);
// 4
doc = new Document();
addField(doc, groupField, "3", dvType);
addField(doc, countField, "1", dvType);
doc.add(new TextField("content", "some more random text", Field.Store.NO));
doc.add(new StringField("id", "5", Field.Store.NO));
w.addDocument(doc);
// 5
doc = new Document();
addField(doc, groupField, "3", dvType);
addField(doc, countField, "1", dvType);
doc.add(new TextField("content", "random blob", Field.Store.NO));
doc.add(new StringField("id", "6", Field.Store.NO));
w.addDocument(doc);
// 6 -- no author field
doc = new Document();
doc.add(new TextField("content", "random word stuck in alot of other text", Field.Store.YES));
addField(doc, countField, "1", dvType);
doc.add(new StringField("id", "6", Field.Store.NO));
w.addDocument(doc);
IndexSearcher indexSearcher = newSearcher(w.getReader());
w.shutdown();
Comparator<AbstractDistinctValuesCollector.GroupCount<Comparable<Object>>> cmp = new Comparator<AbstractDistinctValuesCollector.GroupCount<Comparable<Object>>>() {
@Override
public int compare(AbstractDistinctValuesCollector.GroupCount<Comparable<Object>> groupCount1, AbstractDistinctValuesCollector.GroupCount<Comparable<Object>> groupCount2) {
if (groupCount1.groupValue == null) {
if (groupCount2.groupValue == null) {
return 0;
}
return -1;
} else if (groupCount2.groupValue == null) {
return 1;
} else {
return groupCount1.groupValue.compareTo(groupCount2.groupValue);
}
}
};
// === Search for content:random
AbstractFirstPassGroupingCollector<Comparable<Object>> firstCollector = createRandomFirstPassCollector(dvType, new Sort(), groupField, 10);
indexSearcher.search(new TermQuery(new Term("content", "random")), firstCollector);
AbstractDistinctValuesCollector<? extends AbstractDistinctValuesCollector.GroupCount<Comparable<Object>>> distinctValuesCollector
= createDistinctCountCollector(firstCollector, groupField, countField, dvType);
indexSearcher.search(new TermQuery(new Term("content", "random")), distinctValuesCollector);
List<? extends AbstractDistinctValuesCollector.GroupCount<Comparable<Object>>> gcs = distinctValuesCollector.getGroups();
Collections.sort(gcs, cmp);
assertEquals(4, gcs.size());
compareNull(gcs.get(0).groupValue);
List<Comparable<?>> countValues = new ArrayList<Comparable<?>>(gcs.get(0).uniqueValues);
assertEquals(1, countValues.size());
compare("1", countValues.get(0));
compare("1", gcs.get(1).groupValue);
countValues = new ArrayList<Comparable<?>>(gcs.get(1).uniqueValues);
Collections.sort(countValues, nullComparator);
assertEquals(2, countValues.size());
compare("1", countValues.get(0));
compare("2", countValues.get(1));
compare("2", gcs.get(2).groupValue);
countValues = new ArrayList<Comparable<?>>(gcs.get(2).uniqueValues);
assertEquals(1, countValues.size());
compareNull(countValues.get(0));
compare("3", gcs.get(3).groupValue);
countValues = new ArrayList<Comparable<?>>(gcs.get(3).uniqueValues);
assertEquals(1, countValues.size());
compare("1", countValues.get(0));
// === Search for content:some
firstCollector = createRandomFirstPassCollector(dvType, new Sort(), groupField, 10);
indexSearcher.search(new TermQuery(new Term("content", "some")), firstCollector);
distinctValuesCollector = createDistinctCountCollector(firstCollector, groupField, countField, dvType);
indexSearcher.search(new TermQuery(new Term("content", "some")), distinctValuesCollector);
gcs = distinctValuesCollector.getGroups();
Collections.sort(gcs, cmp);
assertEquals(3, gcs.size());
compare("1", gcs.get(0).groupValue);
countValues = new ArrayList<Comparable<?>>(gcs.get(0).uniqueValues);
assertEquals(2, countValues.size());
Collections.sort(countValues, nullComparator);
compare("1", countValues.get(0));
compare("2", countValues.get(1));
compare("2", gcs.get(1).groupValue);
countValues = new ArrayList<Comparable<?>>(gcs.get(1).uniqueValues);
assertEquals(1, countValues.size());
compareNull(countValues.get(0));
compare("3", gcs.get(2).groupValue);
countValues = new ArrayList<Comparable<?>>(gcs.get(2).uniqueValues);
assertEquals(1, countValues.size());
compare("1", countValues.get(0));
// === Search for content:blob
firstCollector = createRandomFirstPassCollector(dvType, new Sort(), groupField, 10);
indexSearcher.search(new TermQuery(new Term("content", "blob")), firstCollector);
distinctValuesCollector = createDistinctCountCollector(firstCollector, groupField, countField, dvType);
indexSearcher.search(new TermQuery(new Term("content", "blob")), distinctValuesCollector);
gcs = distinctValuesCollector.getGroups();
Collections.sort(gcs, cmp);
assertEquals(2, gcs.size());
compare("1", gcs.get(0).groupValue);
countValues = new ArrayList<Comparable<?>>(gcs.get(0).uniqueValues);
// Because only one document in the author "1" group matches "blob"
assertEquals(1, countValues.size());
compare("1", countValues.get(0));
compare("3", gcs.get(1).groupValue);
countValues = new ArrayList<Comparable<?>>(gcs.get(1).uniqueValues);
assertEquals(1, countValues.size());
compare("1", countValues.get(0));
indexSearcher.getIndexReader().close();
dir.close();
}
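/**
 * Builds a handful of random indexes and runs many random term searches against each, comparing the
 * collected groups and their unique count values against an independently computed expected result.
 */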
public void testRandom() throws Exception {
Random random = random();
int numberOfRuns = TestUtil.nextInt(random, 3, 6);
for (int indexIter = 0; indexIter < numberOfRuns; indexIter++) {
IndexContext context = createIndexContext();
for (int searchIter = 0; searchIter < 100; searchIter++) {
final IndexSearcher searcher = newSearcher(context.indexReader);
boolean useDv = context.dvType != null && random.nextBoolean();
DocValuesType dvType = useDv ? context.dvType : null;
String term = context.contentStrings[random.nextInt(context.contentStrings.length)];
Sort groupSort = new Sort(new SortField("id", SortField.Type.STRING));
int topN = 1 + random.nextInt(10);
List<AbstractDistinctValuesCollector.GroupCount<Comparable<?>>> expectedResult = createExpectedResult(context, term, groupSort, topN);
AbstractFirstPassGroupingCollector<Comparable<?>> firstCollector = createRandomFirstPassCollector(dvType, groupSort, groupField, topN);
searcher.search(new TermQuery(new Term("content", term)), firstCollector);
AbstractDistinctValuesCollector<? extends AbstractDistinctValuesCollector.GroupCount<Comparable<?>>> distinctValuesCollector
= createDistinctCountCollector(firstCollector, groupField, countField, dvType);
searcher.search(new TermQuery(new Term("content", term)), distinctValuesCollector);
@SuppressWarnings("unchecked")
List<AbstractDistinctValuesCollector.GroupCount<Comparable<?>>> actualResult = (List<AbstractDistinctValuesCollector.GroupCount<Comparable<?>>>) distinctValuesCollector.getGroups();
if (VERBOSE) {
System.out.println("Index iter=" + indexIter);
System.out.println("Search iter=" + searchIter);
System.out.println("1st pass collector class name=" + firstCollector.getClass().getName());
System.out.println("2nd pass collector class name=" + distinctValuesCollector.getClass().getName());
System.out.println("Search term=" + term);
System.out.println("DVType=" + dvType);
System.out.println("1st pass groups=" + firstCollector.getTopGroups(0, false));
System.out.println("Expected:");
printGroups(expectedResult);
System.out.println("Actual:");
printGroups(actualResult);
}
assertEquals(expectedResult.size(), actualResult.size());
for (int i = 0; i < expectedResult.size(); i++) {
AbstractDistinctValuesCollector.GroupCount<Comparable<?>> expected = expectedResult.get(i);
AbstractDistinctValuesCollector.GroupCount<Comparable<?>> actual = actualResult.get(i);
assertValues(expected.groupValue, actual.groupValue);
assertEquals(expected.uniqueValues.size(), actual.uniqueValues.size());
List<Comparable<?>> expectedUniqueValues = new ArrayList<>(expected.uniqueValues);
Collections.sort(expectedUniqueValues, nullComparator);
List<Comparable<?>> actualUniqueValues = new ArrayList<>(actual.uniqueValues);
Collections.sort(actualUniqueValues, nullComparator);
for (int j = 0; j < expectedUniqueValues.size(); j++) {
assertValues(expectedUniqueValues.get(j), actualUniqueValues.get(j));
}
}
}
context.indexReader.close();
context.directory.close();
}
}
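/** Prints each group value and its unique count values, decoding BytesRef values to UTF-8 strings. */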
private void printGroups(List<AbstractDistinctValuesCollector.GroupCount<Comparable<?>>> results) {
for(int i=0;i<results.size();i++) {
AbstractDistinctValuesCollector.GroupCount<Comparable<?>> group = results.get(i);
Object gv = group.groupValue;
if (gv instanceof BytesRef) {
System.out.println(i + ": groupValue=" + ((BytesRef) gv).utf8ToString());
} else {
System.out.println(i + ": groupValue=" + gv);
}
for(Object o : group.uniqueValues) {
if (o instanceof BytesRef) {
System.out.println(" " + ((BytesRef) o).utf8ToString());
} else {
System.out.println(" " + o);
}
}
}
}
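/** Compares an expected BytesRef (or null for a missing value) against an actual collector value. */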
private void assertValues(Object expected, Object actual) {
if (expected == null) {
compareNull(actual);
} else {
compare(((BytesRef) expected).utf8ToString(), actual);
}
}
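/**
 * Asserts that the actual value equals the expected string, converting the expectation to whatever
 * representation the collector produced (BytesRef, Double, Long or MutableValueStr).
 */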
private void compare(String expected, Object groupValue) {
if (BytesRef.class.isAssignableFrom(groupValue.getClass())) {
assertEquals(expected, ((BytesRef) groupValue).utf8ToString());
} else if (Double.class.isAssignableFrom(groupValue.getClass())) {
assertEquals(Double.parseDouble(expected), groupValue);
} else if (Long.class.isAssignableFrom(groupValue.getClass())) {
assertEquals(Long.parseLong(expected), groupValue);
} else if (MutableValue.class.isAssignableFrom(groupValue.getClass())) {
MutableValueStr mutableValue = new MutableValueStr();
mutableValue.value = new BytesRef(expected);
assertEquals(mutableValue, groupValue);
} else {
fail();
}
}
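/**
 * Asserts that the value represents a missing field: null for the term based impl, an empty or zero
 * default for the doc values based impls, or a non-existing MutableValue for the function based impl.
 */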
private void compareNull(Object groupValue) {
if (groupValue == null) {
return; // the term based impl reports a missing value as null
}
// the doc values based impls report a missing value as an empty/zero default:
if (BytesRef.class.isAssignableFrom(groupValue.getClass())) {
assertEquals("", ((BytesRef) groupValue).utf8ToString());
} else if (Double.class.isAssignableFrom(groupValue.getClass())) {
assertEquals(0.0d, groupValue);
} else if (Long.class.isAssignableFrom(groupValue.getClass())) {
assertEquals(0L, groupValue);
// Function based impl
} else if (MutableValue.class.isAssignableFrom(groupValue.getClass())) {
assertFalse(((MutableValue) groupValue).exists());
} else {
fail();
}
}
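/**
 * Adds the value as a stored StringField and, when a doc values type is given, also as a doc values
 * field named {@code field + "_dv"}.
 */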
private void addField(Document doc, String field, String value, DocValuesType type) {
doc.add(new StringField(field, value, Field.Store.YES));
if (type == null) {
return;
}
String dvField = field + "_dv";
Field valuesField = null;
switch (type) {
case NUMERIC:
valuesField = new NumericDocValuesField(dvField, Integer.parseInt(value));
break;
case BINARY:
valuesField = new BinaryDocValuesField(dvField, new BytesRef(value));
break;
case SORTED:
valuesField = new SortedDocValuesField(dvField, new BytesRef(value));
break;
}
doc.add(valuesField);
}
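/**
 * Creates the second pass distinct values collector that matches the given first pass collector: a
 * FunctionDistinctValuesCollector for a function based first pass, otherwise a TermDistinctValuesCollector.
 */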
@SuppressWarnings({"unchecked","rawtypes"})
private <T extends Comparable> AbstractDistinctValuesCollector<AbstractDistinctValuesCollector.GroupCount<T>> createDistinctCountCollector(AbstractFirstPassGroupingCollector<T> firstPassGroupingCollector,
String groupField,
String countField,
DocValuesType dvType) {
Random random = random();
Collection<SearchGroup<T>> searchGroups = firstPassGroupingCollector.getTopGroups(0, false);
if (FunctionFirstPassGroupingCollector.class.isAssignableFrom(firstPassGroupingCollector.getClass())) {
return (AbstractDistinctValuesCollector) new FunctionDistinctValuesCollector(new HashMap<>(), new BytesRefFieldSource(groupField), new BytesRefFieldSource(countField), (Collection) searchGroups);
} else {
return (AbstractDistinctValuesCollector) new TermDistinctValuesCollector(groupField, countField, (Collection) searchGroups);
}
}
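/** Randomly picks either a function based or a term based first pass grouping collector. */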
@SuppressWarnings({"unchecked","rawtypes"})
private <T> AbstractFirstPassGroupingCollector<T> createRandomFirstPassCollector(DocValuesType dvType, Sort groupSort, String groupField, int topNGroups) throws IOException {
Random random = random();
// The collector choice is independent of dvType; both branches behave the same for every doc values type used here.
if (random.nextBoolean()) {
return (AbstractFirstPassGroupingCollector<T>) new FunctionFirstPassGroupingCollector(new BytesRefFieldSource(groupField), new HashMap<>(), groupSort, topNGroups);
} else {
return (AbstractFirstPassGroupingCollector<T>) new TermFirstPassGroupingCollector(groupField, groupSort, topNGroups);
}
}
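/**
 * Computes the expected top-N groups and their distinct count values for the given term from the
 * bookkeeping recorded while the random index was built.
 */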
@SuppressWarnings({"unchecked","rawtypes"})
private List<AbstractDistinctValuesCollector.GroupCount<Comparable<?>>> createExpectedResult(IndexContext context, String term, Sort groupSort, int topN) {
class GroupCount extends AbstractDistinctValuesCollector.GroupCount<BytesRef> {
GroupCount(BytesRef groupValue, Collection<BytesRef> uniqueValues) {
super(groupValue);
this.uniqueValues.addAll(uniqueValues);
}
}
List result = new ArrayList();
Map<String, Set<String>> groupCounts = context.searchTermToGroupCounts.get(term);
int i = 0;
for (String group : groupCounts.keySet()) {
if (topN <= i++) {
break;
}
Set<BytesRef> uniqueValues = new HashSet<>();
for (String val : groupCounts.get(group)) {
uniqueValues.add(val != null ? new BytesRef(val) : null);
}
result.add(new GroupCount(group != null ? new BytesRef(group) : null, uniqueValues));
}
return result;
}
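/**
 * Builds a random index with "author" and "publisher" values (occasionally missing) and records, per
 * content term, which count values occur in which group, so that search results can be verified later.
 */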
private IndexContext createIndexContext() throws Exception {
Random random = random();
DocValuesType[] dvTypes = new DocValuesType[]{
DocValuesType.BINARY,
DocValuesType.SORTED
};
Directory dir = newDirectory();
RandomIndexWriter w = new RandomIndexWriter(
random,
dir,
newIndexWriterConfig(TEST_VERSION_CURRENT,
new MockAnalyzer(random)).setMergePolicy(newLogMergePolicy())
);
boolean canUseDV = true;
DocValuesType dvType = canUseDV ? dvTypes[random.nextInt(dvTypes.length)] : null;
int numDocs = 86 + random.nextInt(1087) * RANDOM_MULTIPLIER;
String[] groupValues = new String[numDocs / 5];
String[] countValues = new String[numDocs / 10];
for (int i = 0; i < groupValues.length; i++) {
groupValues[i] = generateRandomNonEmptyString();
}
for (int i = 0; i < countValues.length; i++) {
countValues[i] = generateRandomNonEmptyString();
}
List<String> contentStrings = new ArrayList<>();
Map<String, Map<String, Set<String>>> searchTermToGroupCounts = new HashMap<>();
for (int i = 1; i <= numDocs; i++) {
String groupValue = random.nextInt(23) == 14 ? null : groupValues[random.nextInt(groupValues.length)];
String countValue = random.nextInt(21) == 13 ? null : countValues[random.nextInt(countValues.length)];
String content = "random" + random.nextInt(numDocs / 20);
Map<String, Set<String>> groupToCounts = searchTermToGroupCounts.get(content);
if (groupToCounts == null) {
// Group order follows the first matching doc (docID asc), so LinkedHashMap insertion order matches the collector's group order
searchTermToGroupCounts.put(content, groupToCounts = new LinkedHashMap<>());
contentStrings.add(content);
}
Set<String> countsVals = groupToCounts.get(groupValue);
if (countsVals == null) {
groupToCounts.put(groupValue, countsVals = new HashSet<>());
}
countsVals.add(countValue);
Document doc = new Document();
doc.add(new StringField("id", String.format(Locale.ROOT, "%09d", i), Field.Store.YES));
if (groupValue != null) {
addField(doc, groupField, groupValue, dvType);
}
if (countValue != null) {
addField(doc, countField, countValue, dvType);
}
doc.add(new TextField("content", content, Field.Store.YES));
w.addDocument(doc);
}
DirectoryReader reader = w.getReader();
if (VERBOSE) {
for(int docID=0;docID<reader.maxDoc();docID++) {
StoredDocument doc = reader.document(docID);
System.out.println("docID=" + docID + " id=" + doc.get("id") + " content=" + doc.get("content") + " author=" + doc.get("author") + " publisher=" + doc.get("publisher"));
}
}
w.shutdown();
return new IndexContext(dir, reader, dvType, searchTermToGroupCounts, contentStrings.toArray(new String[contentStrings.size()]));
}
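/** Holds the random index together with the bookkeeping needed to compute expected results. */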
private static class IndexContext {
final Directory directory;
final DirectoryReader indexReader;
final DocValuesType dvType;
final Map<String, Map<String, Set<String>>> searchTermToGroupCounts;
final String[] contentStrings;
IndexContext(Directory directory, DirectoryReader indexReader, DocValuesType dvType,
Map<String, Map<String, Set<String>>> searchTermToGroupCounts, String[] contentStrings) {
this.directory = directory;
this.indexReader = indexReader;
this.dvType = dvType;
this.searchTermToGroupCounts = searchTermToGroupCounts;
this.contentStrings = contentStrings;
}
}
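/** Comparator that orders null before any non-null value and otherwise compares values naturally. */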
private static class NullComparator implements Comparator<Comparable<?>> {
@Override
@SuppressWarnings({"unchecked","rawtypes"})
public int compare(Comparable a, Comparable b) {
if (a == b) {
return 0;
} else if (a == null) {
return -1;
} else if (b == null) {
return 1;
} else {
return a.compareTo(b);
}
}
}
}