| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| package org.apache.lucene.search; |
| |
| import java.io.IOException; |
| import java.util.Arrays; |
| import java.util.DoubleSummaryStatistics; |
| import java.util.LongSummaryStatistics; |
| import java.util.Objects; |
| import java.util.function.Predicate; |
| import java.util.stream.DoubleStream; |
| import java.util.stream.LongStream; |
| import java.util.stream.Stream; |
| |
| import org.apache.lucene.document.Document; |
| import org.apache.lucene.document.DoubleDocValuesField; |
| import org.apache.lucene.document.Field.Store; |
| import org.apache.lucene.document.NumericDocValuesField; |
| import org.apache.lucene.document.SortedDocValuesField; |
| import org.apache.lucene.document.SortedNumericDocValuesField; |
| import org.apache.lucene.document.SortedSetDocValuesField; |
| import org.apache.lucene.document.StringField; |
| import org.apache.lucene.index.DirectoryReader; |
| import org.apache.lucene.index.IndexReader; |
| import org.apache.lucene.index.IndexWriter; |
| import org.apache.lucene.index.Term; |
| import org.apache.lucene.search.DocValuesStats.DoubleDocValuesStats; |
| import org.apache.lucene.search.DocValuesStats.LongDocValuesStats; |
| import org.apache.lucene.search.DocValuesStats.SortedDocValuesStats; |
| import org.apache.lucene.search.DocValuesStats.SortedDoubleDocValuesStats; |
| import org.apache.lucene.search.DocValuesStats.SortedLongDocValuesStats; |
| import org.apache.lucene.search.DocValuesStats.SortedSetDocValuesStats; |
| import org.apache.lucene.store.Directory; |
| import org.apache.lucene.util.BytesRef; |
| import org.apache.lucene.util.LuceneTestCase; |
| import org.apache.lucene.util.TestUtil; |
| |
| /** Unit tests for {@link DocValuesStatsCollector}. */ |
| public class TestDocValuesStatsCollector extends LuceneTestCase { |
| |
| public void testNoDocsWithField() throws IOException { |
| try (Directory dir = newDirectory(); |
| IndexWriter indexWriter = new IndexWriter(dir, newIndexWriterConfig())) { |
| int numDocs = TestUtil.nextInt(random(), 1, 100); |
| for (int i = 0; i < numDocs; i++) { |
| indexWriter.addDocument(new Document()); |
| } |
| |
| try (DirectoryReader reader = DirectoryReader.open(indexWriter)) { |
| IndexSearcher searcher = new IndexSearcher(reader); |
| LongDocValuesStats stats = new LongDocValuesStats("foo"); |
| searcher.search(new MatchAllDocsQuery(), new DocValuesStatsCollector(stats)); |
| |
| assertEquals(0, stats.count()); |
| assertEquals(numDocs, stats.missing()); |
| } |
| } |
| } |
| |
| public void testOneDoc() throws IOException { |
| try (Directory dir = newDirectory(); |
| IndexWriter indexWriter = new IndexWriter(dir, newIndexWriterConfig())) { |
| String field = "numeric"; |
| Document doc = new Document(); |
| doc.add(new NumericDocValuesField(field, 1)); |
| doc.add(new StringField("id", "doc1", Store.NO)); |
| indexWriter.addDocument(doc); |
| |
| try (DirectoryReader reader = DirectoryReader.open(indexWriter)) { |
| IndexSearcher searcher = new IndexSearcher(reader); |
| LongDocValuesStats stats = new LongDocValuesStats(field); |
| searcher.search(new MatchAllDocsQuery(), new DocValuesStatsCollector(stats)); |
| |
| assertEquals(1, stats.count()); |
| assertEquals(0, stats.missing()); |
| assertEquals(1, stats.max().longValue()); |
| assertEquals(1, stats.min().longValue()); |
| assertEquals(1, stats.sum().longValue()); |
| assertEquals(1, stats.mean(), 0.0001); |
| assertEquals(0, stats.variance(), 0.0001); |
| assertEquals(0, stats.stdev(), 0.0001); |
| } |
| } |
| } |
| |
| public void testDocsWithLongValues() throws IOException { |
| try (Directory dir = newDirectory(); |
| IndexWriter indexWriter = new IndexWriter(dir, newIndexWriterConfig())) { |
| String field = "numeric"; |
| int numDocs = TestUtil.nextInt(random(), 1, 100); |
| long[] docValues = new long[numDocs]; |
| int nextVal = 1; |
| for (int i = 0; i < numDocs; i++) { |
| Document doc = new Document(); |
| if (random().nextBoolean()) { // not all documents have a value |
| doc.add(new NumericDocValuesField(field, nextVal)); |
| doc.add(new StringField("id", "doc" + i, Store.NO)); |
| docValues[i] = nextVal; |
| ++nextVal; |
| } |
| indexWriter.addDocument(doc); |
| } |
| |
| // 20% of cases delete some docs |
| if (random().nextDouble() < 0.2) { |
| for (int i = 0; i < numDocs; i++) { |
| if (random().nextBoolean()) { |
| indexWriter.deleteDocuments(new Term("id", "doc" + i)); |
| docValues[i] = 0; |
| } |
| } |
| } |
| |
| try (DirectoryReader reader = DirectoryReader.open(indexWriter)) { |
| IndexSearcher searcher = new IndexSearcher(reader); |
| LongDocValuesStats stats = new LongDocValuesStats(field); |
| searcher.search(new MatchAllDocsQuery(), new DocValuesStatsCollector(stats)); |
| |
| int expCount = (int) Arrays.stream(docValues).filter(v -> v > 0).count(); |
| assertEquals(expCount, stats.count()); |
| int numDocsWithoutField = (int) getZeroValues(docValues).count(); |
| assertEquals(computeExpMissing(numDocsWithoutField, numDocs, reader), stats.missing()); |
| if (stats.count() > 0) { |
| LongSummaryStatistics sumStats = getPositiveValues(docValues).summaryStatistics(); |
| assertEquals(sumStats.getMax(), stats.max().longValue()); |
| assertEquals(sumStats.getMin(), stats.min().longValue()); |
| assertEquals(sumStats.getAverage(), stats.mean(), 0.00001); |
| assertEquals(sumStats.getSum(), stats.sum().longValue()); |
| double variance = computeVariance(docValues, stats.mean, stats.count()); |
| assertEquals(variance, stats.variance(), 0.00001); |
| assertEquals(Math.sqrt(variance), stats.stdev(), 0.00001); |
| } |
| } |
| } |
| } |
| |
| public void testDocsWithDoubleValues() throws IOException { |
| try (Directory dir = newDirectory(); |
| IndexWriter indexWriter = new IndexWriter(dir, newIndexWriterConfig())) { |
| String field = "numeric"; |
| int numDocs = TestUtil.nextInt(random(), 1, 100); |
| double[] docValues = new double[numDocs]; |
| double nextVal = 1.0; |
| for (int i = 0; i < numDocs; i++) { |
| Document doc = new Document(); |
| if (random().nextBoolean()) { // not all documents have a value |
| doc.add(new DoubleDocValuesField(field, nextVal)); |
| doc.add(new StringField("id", "doc" + i, Store.NO)); |
| docValues[i] = nextVal; |
| ++nextVal; |
| } |
| indexWriter.addDocument(doc); |
| } |
| |
| // 20% of cases delete some docs |
| if (random().nextDouble() < 0.2) { |
| for (int i = 0; i < numDocs; i++) { |
| if (random().nextBoolean()) { |
| indexWriter.deleteDocuments(new Term("id", "doc" + i)); |
| docValues[i] = 0; |
| } |
| } |
| } |
| |
| try (DirectoryReader reader = DirectoryReader.open(indexWriter)) { |
| IndexSearcher searcher = new IndexSearcher(reader); |
| DoubleDocValuesStats stats = new DoubleDocValuesStats(field); |
| searcher.search(new MatchAllDocsQuery(), new DocValuesStatsCollector(stats)); |
| |
| int expCount = (int) Arrays.stream(docValues).filter(v -> v > 0).count(); |
| assertEquals(expCount, stats.count()); |
| int numDocsWithoutField = (int) getZeroValues(docValues).count(); |
| assertEquals(computeExpMissing(numDocsWithoutField, numDocs, reader), stats.missing()); |
| if (stats.count() > 0) { |
| DoubleSummaryStatistics sumStats = getPositiveValues(docValues).summaryStatistics(); |
| assertEquals(sumStats.getMax(), stats.max().doubleValue(), 0.00001); |
| assertEquals(sumStats.getMin(), stats.min().doubleValue(), 0.00001); |
| assertEquals(sumStats.getAverage(), stats.mean(), 0.00001); |
| assertEquals(sumStats.getSum(), stats.sum(), 0.00001); |
| double variance = computeVariance(docValues, stats.mean, stats.count()); |
| assertEquals(variance, stats.variance(), 0.00001); |
| assertEquals(Math.sqrt(variance), stats.stdev(), 0.00001); |
| } |
| } |
| } |
| } |
| |
| public void testDocsWithMultipleLongValues() throws IOException { |
| try (Directory dir = newDirectory(); |
| IndexWriter indexWriter = new IndexWriter(dir, newIndexWriterConfig())) { |
| String field = "numeric"; |
| int numDocs = TestUtil.nextInt(random(), 1, 100); |
| long[][] docValues = new long[numDocs][]; |
| long nextVal = 1; |
| for (int i = 0; i < numDocs; i++) { |
| Document doc = new Document(); |
| if (random().nextBoolean()) { // not all documents have a value |
| int numValues = TestUtil.nextInt(random(), 1, 5); |
| docValues[i] = new long[numValues]; |
| for (int j = 0; j < numValues; j++) { |
| doc.add(new SortedNumericDocValuesField(field, nextVal)); |
| docValues[i][j] = nextVal; |
| ++nextVal; |
| } |
| doc.add(new StringField("id", "doc" + i, Store.NO)); |
| } |
| indexWriter.addDocument(doc); |
| } |
| |
| // 20% of cases delete some docs |
| if (random().nextDouble() < 0.2) { |
| for (int i = 0; i < numDocs; i++) { |
| if (random().nextBoolean()) { |
| indexWriter.deleteDocuments(new Term("id", "doc" + i)); |
| docValues[i] = null; |
| } |
| } |
| } |
| |
| try (DirectoryReader reader = DirectoryReader.open(indexWriter)) { |
| IndexSearcher searcher = new IndexSearcher(reader); |
| SortedLongDocValuesStats stats = new SortedLongDocValuesStats(field); |
| searcher.search(new MatchAllDocsQuery(), new DocValuesStatsCollector(stats)); |
| |
| assertEquals(nonNull(docValues).count(), stats.count()); |
| int numDocsWithoutField = (int) isNull(docValues).count(); |
| assertEquals(computeExpMissing(numDocsWithoutField, numDocs, reader), stats.missing()); |
| if (stats.count() > 0) { |
| LongSummaryStatistics sumStats = filterAndFlatValues(docValues, (v) -> v != null).summaryStatistics(); |
| assertEquals(sumStats.getMax(), stats.max().longValue()); |
| assertEquals(sumStats.getMin(), stats.min().longValue()); |
| assertEquals(sumStats.getAverage(), stats.mean(), 0.00001); |
| assertEquals(sumStats.getSum(), stats.sum().longValue()); |
| assertEquals(sumStats.getCount(), stats.valuesCount()); |
| double variance = computeVariance(filterAndFlatValues(docValues, (v) -> v != null), stats.mean, stats.count()); |
| assertEquals(variance, stats.variance(), 0.00001); |
| assertEquals(Math.sqrt(variance), stats.stdev(), 0.00001); |
| } |
| } |
| } |
| } |
| |
| public void testDocsWithMultipleDoubleValues() throws IOException { |
| try (Directory dir = newDirectory(); |
| IndexWriter indexWriter = new IndexWriter(dir, newIndexWriterConfig())) { |
| String field = "numeric"; |
| int numDocs = TestUtil.nextInt(random(), 1, 100); |
| double[][] docValues = new double[numDocs][]; |
| double nextVal = 1; |
| for (int i = 0; i < numDocs; i++) { |
| Document doc = new Document(); |
| if (random().nextBoolean()) { // not all documents have a value |
| int numValues = TestUtil.nextInt(random(), 1, 5); |
| docValues[i] = new double[numValues]; |
| for (int j = 0; j < numValues; j++) { |
| doc.add(new SortedNumericDocValuesField(field, Double.doubleToRawLongBits(nextVal))); |
| docValues[i][j] = nextVal; |
| ++nextVal; |
| } |
| doc.add(new StringField("id", "doc" + i, Store.NO)); |
| } |
| indexWriter.addDocument(doc); |
| } |
| |
| // 20% of cases delete some docs |
| if (random().nextDouble() < 0.2) { |
| for (int i = 0; i < numDocs; i++) { |
| if (random().nextBoolean()) { |
| indexWriter.deleteDocuments(new Term("id", "doc" + i)); |
| docValues[i] = null; |
| } |
| } |
| } |
| |
| try (DirectoryReader reader = DirectoryReader.open(indexWriter)) { |
| IndexSearcher searcher = new IndexSearcher(reader); |
| SortedDoubleDocValuesStats stats = new SortedDoubleDocValuesStats(field); |
| searcher.search(new MatchAllDocsQuery(), new DocValuesStatsCollector(stats)); |
| |
| assertEquals(nonNull(docValues).count(), stats.count()); |
| int numDocsWithoutField = (int) isNull(docValues).count(); |
| assertEquals(computeExpMissing(numDocsWithoutField, numDocs, reader), stats.missing()); |
| if (stats.count() > 0) { |
| DoubleSummaryStatistics sumStats = filterAndFlatValues(docValues, (v) -> v != null).summaryStatistics(); |
| assertEquals(sumStats.getMax(), stats.max().longValue(), 0.00001); |
| assertEquals(sumStats.getMin(), stats.min().longValue(), 0.00001); |
| assertEquals(sumStats.getAverage(), stats.mean(), 0.00001); |
| assertEquals(sumStats.getSum(), stats.sum().doubleValue(), 0.00001); |
| assertEquals(sumStats.getCount(), stats.valuesCount()); |
| double variance = computeVariance(filterAndFlatValues(docValues, (v) -> v != null), stats.mean, stats.count()); |
| assertEquals(variance, stats.variance(), 0.00001); |
| assertEquals(Math.sqrt(variance), stats.stdev(), 0.00001); |
| } |
| } |
| } |
| } |
| |
| public void testDocsWithSortedValues() throws IOException { |
| try (Directory dir = newDirectory(); |
| IndexWriter indexWriter = new IndexWriter(dir, newIndexWriterConfig())) { |
| String field = "sorted"; |
| int numDocs = TestUtil.nextInt(random(), 1, 100); |
| BytesRef[] docValues = new BytesRef[numDocs]; |
| for (int i = 0; i < numDocs; i++) { |
| Document doc = new Document(); |
| if (random().nextBoolean()) { // not all documents have a value |
| BytesRef val = TestUtil.randomBinaryTerm(random()); |
| doc.add(new SortedDocValuesField(field, val)); |
| doc.add(new StringField("id", "doc" + i, Store.NO)); |
| docValues[i] = val; |
| } |
| indexWriter.addDocument(doc); |
| } |
| |
| // 20% of cases delete some docs |
| if (random().nextDouble() < 0.2) { |
| for (int i = 0; i < numDocs; i++) { |
| if (random().nextBoolean()) { |
| indexWriter.deleteDocuments(new Term("id", "doc" + i)); |
| docValues[i] = null; |
| } |
| } |
| } |
| |
| try (DirectoryReader reader = DirectoryReader.open(indexWriter)) { |
| IndexSearcher searcher = new IndexSearcher(reader); |
| SortedDocValuesStats stats = new SortedDocValuesStats(field); |
| searcher.search(new MatchAllDocsQuery(), new DocValuesStatsCollector(stats)); |
| |
| int expCount = (int) nonNull(docValues).count(); |
| assertEquals(expCount, stats.count()); |
| int numDocsWithoutField = (int) isNull(docValues).count(); |
| assertEquals(computeExpMissing(numDocsWithoutField, numDocs, reader), stats.missing()); |
| if (stats.count() > 0) { |
| assertEquals(nonNull(docValues).min(BytesRef::compareTo).get(), stats.min()); |
| assertEquals(nonNull(docValues).max(BytesRef::compareTo).get(), stats.max()); |
| } |
| } |
| } |
| } |
| |
| public void testDocsWithSortedSetValues() throws IOException { |
| try (Directory dir = newDirectory(); |
| IndexWriter indexWriter = new IndexWriter(dir, newIndexWriterConfig())) { |
| String field = "sorted"; |
| int numDocs = TestUtil.nextInt(random(), 1, 100); |
| BytesRef[][] docValues = new BytesRef[numDocs][]; |
| for (int i = 0; i < numDocs; i++) { |
| Document doc = new Document(); |
| if (random().nextBoolean()) { // not all documents have a value |
| int numValues = TestUtil.nextInt(random(), 1, 5); |
| docValues[i] = new BytesRef[numValues]; |
| for (int j = 0; j < numValues; j++) { |
| BytesRef val = TestUtil.randomBinaryTerm(random()); |
| doc.add(new SortedSetDocValuesField(field, val)); |
| docValues[i][j] = val; |
| } |
| doc.add(new StringField("id", "doc" + i, Store.NO)); |
| } |
| indexWriter.addDocument(doc); |
| } |
| |
| // 20% of cases delete some docs |
| if (random().nextDouble() < 0.2) { |
| for (int i = 0; i < numDocs; i++) { |
| if (random().nextBoolean()) { |
| indexWriter.deleteDocuments(new Term("id", "doc" + i)); |
| docValues[i] = null; |
| } |
| } |
| } |
| |
| try (DirectoryReader reader = DirectoryReader.open(indexWriter)) { |
| IndexSearcher searcher = new IndexSearcher(reader); |
| SortedSetDocValuesStats stats = new SortedSetDocValuesStats(field); |
| TotalHitCountCollector totalHitCount = new TotalHitCountCollector(); |
| searcher.search(new MatchAllDocsQuery(), MultiCollector.wrap(totalHitCount, new DocValuesStatsCollector(stats))); |
| |
| int expCount = (int) nonNull(docValues).count(); |
| assertEquals(expCount, stats.count()); |
| int numDocsWithoutField = (int) isNull(docValues).count(); |
| assertEquals(computeExpMissing(numDocsWithoutField, numDocs, reader), stats.missing()); |
| if (stats.count() > 0) { |
| assertEquals(nonNull(docValues).flatMap(Arrays::stream).min(BytesRef::compareTo).get(), stats.min()); |
| assertEquals(nonNull(docValues).flatMap(Arrays::stream).max(BytesRef::compareTo).get(), stats.max()); |
| } |
| } |
| } |
| } |
| |
| private static LongStream getPositiveValues(long[] values) { |
| return Arrays.stream(values).filter(v -> v > 0); |
| } |
| |
| private static DoubleStream getPositiveValues(double[] values) { |
| return Arrays.stream(values).filter(v -> v > 0); |
| } |
| |
| private static LongStream getZeroValues(long[] values) { |
| return Arrays.stream(values).filter(v -> v == 0); |
| } |
| |
| private static DoubleStream getZeroValues(double[] values) { |
| return Arrays.stream(values).filter(v -> v == 0); |
| } |
| |
| private static double computeVariance(long[] values, double mean, int count) { |
| return getPositiveValues(values).mapToDouble(v -> (v - mean) * (v-mean)).sum() / count; |
| } |
| |
| private static double computeVariance(double[] values, double mean, int count) { |
| return getPositiveValues(values).map(v -> (v - mean) * (v-mean)).sum() / count; |
| } |
| |
| private static LongStream filterAndFlatValues(long[][] values, Predicate<? super long[]> p) { |
| return nonNull(values).flatMapToLong(Arrays::stream); |
| } |
| |
| private static DoubleStream filterAndFlatValues(double[][] values, Predicate<? super double[]> p) { |
| return nonNull(values).flatMapToDouble(Arrays::stream); |
| } |
| |
| private static double computeVariance(LongStream values, double mean, int count) { |
| return values.mapToDouble(v -> (v - mean) * (v-mean)).sum() / count; |
| } |
| |
| private static double computeVariance(DoubleStream values, double mean, int count) { |
| return values.map(v -> (v - mean) * (v-mean)).sum() / count; |
| } |
| |
| private static <T> Stream<T> nonNull(T[] values) { |
| return filterValues(values, Objects::nonNull); |
| } |
| |
| private static <T> Stream<T> isNull(T[] values) { |
| return filterValues(values, Objects::isNull); |
| } |
| |
| private static <T> Stream<T> filterValues(T[] values, Predicate<? super T> p) { |
| return Arrays.stream(values).filter(p); |
| } |
| |
| private static int computeExpMissing(int numDocsWithoutField, int numIndexedDocs, IndexReader reader) { |
| // The number of missing documents equals the number of docs without the field (not indexed with it, or were |
| // deleted). However, in case we deleted all documents in a segment before the reader was opened, there will be |
| // a mismatch between numDocs (how many we indexed) to reader.maxDoc(), so compensate for that. |
| return numDocsWithoutField - reader.numDeletedDocs() - (numIndexedDocs - reader.maxDoc()); |
| } |
| } |