blob: 13fefa377099600a345a51eb96d075d9c4c9365e [file] [log] [blame]
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.solr.handler.admin;
import java.lang.invoke.MethodHandles;
import java.nio.file.Paths;
import java.util.HashMap;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.Map;
import java.util.Objects;
import java.util.concurrent.atomic.AtomicLong;
import java.util.function.Function;
import org.apache.commons.math3.stat.descriptive.SummaryStatistics;
import org.apache.lucene.codecs.StoredFieldsReader;
import org.apache.lucene.index.BinaryDocValues;
import org.apache.lucene.index.CodecReader;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.FieldInfos;
import org.apache.lucene.index.Fields;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.LeafReader;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.NumericDocValues;
import org.apache.lucene.index.PointValues;
import org.apache.lucene.index.PostingsEnum;
import org.apache.lucene.index.SortedDocValues;
import org.apache.lucene.index.SortedNumericDocValues;
import org.apache.lucene.index.SortedSetDocValues;
import org.apache.lucene.index.StandardDirectoryReader;
import org.apache.lucene.index.StoredFieldVisitor;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.PriorityQueue;
import org.apache.lucene.util.RamUsageEstimator;
import org.apache.lucene.util.SuppressForbidden;
import org.apache.lucene.util.UnicodeUtil;
import org.apache.solr.common.MapWriter;
import org.apache.solr.common.util.Utils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
 * Estimates the raw size of all uncompressed indexed data by scanning term, docValues and
 * stored fields data. This utility also provides detailed statistics about term, docValues,
 * postings and stored fields distributions.
 */
public class IndexSizeEstimator {
private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());
// Keys under which the estimate* methods store their per-category statistics maps.
public static final String TERMS = "terms";
public static final String STORED_FIELDS = "storedFields";
public static final String NORMS = "norms";
public static final String DOC_VALUES = "docValues";
public static final String POINTS = "points";
public static final String TERM_VECTORS = "termVectors";
// Keys of the top-level sections emitted by Estimate.writeMap.
public static final String SUMMARY = "summary";
public static final String DETAILS = "details";
public static final String FIELDS_BY_SIZE = "fieldsBySize";
public static final String TYPES_BY_SIZE = "typesBySize";
// Default number of documents above which sampling is applied.
public static final int DEFAULT_SAMPLING_THRESHOLD = 100_000;
// Default percentage of data that is read when sampling is applied.
public static final float DEFAULT_SAMPLING_PERCENT = 5.0f;
// Index being measured.
private final IndexReader reader;
// Capacity of the "top" priority queues that track the largest items per field.
private final int topN;
// Maximum length to which collected sample values are truncated.
private final int maxLength;
// Whether the Estimate returned by estimate() carries the summary / details sections.
private final boolean withSummary;
private final boolean withDetails;
private int samplingThreshold = DEFAULT_SAMPLING_THRESHOLD;
private float samplingPercent = DEFAULT_SAMPLING_PERCENT;
// Stride used when stepping through documents; 1 means every document is visited.
private int samplingStep = 1;
public static final class Estimate implements MapWriter {
private final Map<String, Long> fieldsBySize;
private final Map<String, Long> typesBySize;
private final Map<String, Object> summary;
private final Map<String, Object> details;
public Estimate(Map<String, Long> fieldsBySize, Map<String, Long> typesBySize, Map<String, Object> summary, Map<String, Object> details) {
this.fieldsBySize = fieldsBySize;
this.typesBySize = typesBySize;
this.summary = summary;
this.details = details;
public Map<String, Long> getFieldsBySize() {
return fieldsBySize;
public Map<String, Long> getTypesBySize() {
return typesBySize;
public Map<String, String> getHumanReadableFieldsBySize() {
LinkedHashMap<String, String> result = new LinkedHashMap<>();
fieldsBySize.forEach((field, size) -> result.put(field, RamUsageEstimator.humanReadableUnits(size)));
return result;
public Map<String, String> getHumanReadableTypesBySize() {
LinkedHashMap<String, String> result = new LinkedHashMap<>();
typesBySize.forEach((field, size) -> result.put(field, RamUsageEstimator.humanReadableUnits(size)));
return result;
public Map<String, Object> getSummary() {
return summary;
public Map<String, Object> getDetails() {
return details;
public void writeMap(EntryWriter ew) throws IOException {
ew.put(FIELDS_BY_SIZE, fieldsBySize);
ew.put(TYPES_BY_SIZE, typesBySize);
if (summary != null) {
ew.put(SUMMARY, summary);
if (details != null) {
ew.put(DETAILS, details);
/**
 * Creates an estimator for the given index.
 *
 * @param reader reader over the index to measure
 * @param topN capacity of the priority queues that collect the largest items
 * @param maxLength maximum length to which collected sample values are truncated
 * @param withSummary whether the resulting {@link Estimate} includes the summary section
 * @param withDetails whether the resulting {@link Estimate} includes the details section
 */
public IndexSizeEstimator(IndexReader reader, int topN, int maxLength, boolean withSummary, boolean withDetails) {
  // output options
  this.withSummary = withSummary;
  this.withDetails = withDetails;
  // collection limits
  this.topN = topN;
  this.maxLength = maxLength;
  // data source
  this.reader = reader;
}
* Set the sampling threshold. If the index has more documents than this threshold
* then only some values will be sampled and the totals will be extrapolated.
* @param threshold size threshold (number of documents). Default value is {@link #DEFAULT_SAMPLING_THRESHOLD}.
* Setting this to values &lt;= 0 means no threshold (and no sampling).
public void setSamplingThreshold(int threshold) {
if (threshold <= 0) {
threshold = Integer.MAX_VALUE;
this.samplingThreshold = threshold;
* Sampling percent (a number greater than 0 and less or equal to 100). When index size exceeds
* the threshold then approximately only this percent of data will be retrieved from the index and the
* totals will be extrapolated.
* @param percent sample percent. Default value is {@link #DEFAULT_SAMPLING_PERCENT}.
* @throws IllegalArgumentException when value is less than or equal to 0.0 or greater than 100.0, or
* the sampling percent is so small that less than 10 documents would be sampled.
public void setSamplingPercent(float percent) throws IllegalArgumentException {
if (percent <= 0 || percent > 100) {
throw new IllegalArgumentException("samplingPercent must be 0 < percent <= 100");
if (reader.maxDoc() > samplingThreshold) {
samplingStep = Math.round(100.0f / samplingPercent);
if (log.isInfoEnabled()) {"- number of documents {} larger than {}, sampling percent is {} and sampling step {}", reader.maxDoc(), samplingThreshold, samplingPercent, samplingStep);
if (reader.maxDoc() / samplingStep < 10) {
throw new IllegalArgumentException("Out of " + reader.maxDoc() + " less than 10 documents would be sampled, which is too unreliable. Increase the samplingPercent.");
this.samplingPercent = percent;
// Assembles the final Estimate: per-field sizes, per-type sizes and, depending on
// withSummary / withDetails, the size-ordered summary and the details map.
// NOTE(review): in this view no statement populates `details` before estimateSummary(...) is
// called, and the lambdas / if-blocks below end without visible closing lines — confirm against
// the complete file before modifying this method.
public Estimate estimate() throws Exception {
Map<String, Object> details = new LinkedHashMap<>();
Map<String, Object> summary = new LinkedHashMap<>();
// fold the per-type details into a per-field summary
estimateSummary(details, summary);
// sampling parameters are recorded only when sampling is in effect (step greater than 1)
if (samplingStep > 1) {
details.put("samplingPercent", samplingPercent);
details.put("samplingStep", samplingStep);
// queue capacity equals the number of summary entries
ItemPriorityQueue fieldSizeQueue = new ItemPriorityQueue(summary.size());
summary.forEach((field, perField) -> {
// each summary value is cast to a Map holding an AtomicLong under "totalSize"
long size = ((AtomicLong)((Map<String, Object>)perField).get("totalSize")).get();
// fields whose total size is zero are left out of the ranking
if (size > 0) {
fieldSizeQueue.insertWithOverflow(new Item(field, size));
// insertion-ordered copy of the queue contents: field name -> size
Map<String, Long> fieldsBySize = new LinkedHashMap<>();
fieldSizeQueue._forEachEntry((k, v) -> fieldsBySize.put((String)k, (Long)v));
// accumulate sizes per data type across all fields
Map<String, AtomicLong> typeSizes = new HashMap<>();
summary.forEach((field, perField) -> {
Map<String, Object> perType = (Map<String, Object>)((Map<String, Object>)perField).get("perType");
perType.forEach((type, size) -> {
// only keys containing "_lengths" are considered; the accumulator key has that marker removed
if (type.contains("_lengths")) {
AtomicLong totalSize = typeSizes.computeIfAbsent(type.replace("_lengths", ""), t -> new AtomicLong());
ItemPriorityQueue typesSizeQueue = new ItemPriorityQueue(typeSizes.size());
typeSizes.forEach((type, size) -> {
// types whose accumulated size is zero are left out of the ranking
if (size.get() > 0) {
typesSizeQueue.insertWithOverflow(new Item(type, size.get()));
Map<String, Long> typesBySize = new LinkedHashMap<>();
typesSizeQueue._forEachEntry((k, v) -> typesBySize.put((String)k, (Long)v));
// sort summary by field size
Map<String, Object> newSummary = new LinkedHashMap<>();
// re-insert summary entries following the key order of fieldsBySize
fieldsBySize.keySet().forEach(k -> newSummary.put(String.valueOf(k), summary.get(k)));
// convert everything to maps and primitives
// NOTE(review): no call to convert(...) is visible between the comment above and the return
return new Estimate(fieldsBySize, typesBySize, withSummary ? newSummary : null, withDetails ? details : null);
// Walks the entries of `result`, dispatching on the runtime type of each value
// (ItemPriorityQueue, MapWriterSummaryStatistics, AtomicLong, nested Map) and recursing
// into nested maps.
// NOTE(review): the statements that fill the freshly created maps and replace the entry
// values are not visible in this view; only the branch structure is.
private void convert(Map<String, Object> result) {
for (Map.Entry<String, Object> entry : result.entrySet()) {
Object value = entry.getValue();
if (value instanceof ItemPriorityQueue) {
ItemPriorityQueue queue = (ItemPriorityQueue)value;
// target map for the queue contents
Map<String, Object> map = new LinkedHashMap<>();
} else if (value instanceof MapWriterSummaryStatistics) {
MapWriterSummaryStatistics stats = (MapWriterSummaryStatistics)value;
// target map for the statistics
Map<String, Object> map = new LinkedHashMap<>();
} else if (value instanceof AtomicLong) {
} else if (value instanceof Map) {
// recurse
convert((Map<String, Object>)value);
// Builds the per-field `summary` from `details`, which is iterated as
// type -> field -> (statistic name -> value).
// NOTE(review): the opening line carries the remains of a log statement and the accumulation
// statements (adding to the AtomicLong totals) are not visible in this view.
private void estimateSummary(Map<String, Object> details, Map<String, Object> summary) {"- preparing summary...");
details.forEach((type, perType) -> {
((Map<String, Object>)perType).forEach((field, perField) -> {
// one summary map per field, created on first encounter
Map<String, Object> perFieldSummary = (Map<String, Object>)summary.computeIfAbsent(field, f -> new HashMap<>());
((Map<String, Object>)perField).forEach((k, val) -> {
// only SummaryStatistics values are considered
if (val instanceof SummaryStatistics) {
SummaryStatistics stats = (SummaryStatistics)val;
// only statistics whose key starts with "lengths" are considered here
if (k.startsWith("lengths")) {
// per-field running total, stored under "totalSize"
AtomicLong total = (AtomicLong)perFieldSummary.computeIfAbsent("totalSize", kt -> new AtomicLong());
// per-field breakdown keyed by "<type>_<statistic name>", stored under "perType"
Map<String, Object> perTypeSummary = (Map<String, Object>)perFieldSummary.computeIfAbsent("perType", pt -> new HashMap<>());
AtomicLong total = (AtomicLong)perTypeSummary.computeIfAbsent(type + "_" + k, t -> new AtomicLong());
// Collects per-field norms statistics over all leaf readers and stores them in `result`
// under NORMS.
// NOTE(review): several expressions below are incomplete in this view (empty argument
// positions, blocks without statements); comments describe only what is visible.
private void estimateNorms(Map<String, Object> result) throws IOException {"- estimating norms...");
Map<String, Map<String, Object>> stats = new HashMap<>();
for (LeafReaderContext leafReaderContext : reader.leaves()) {
LeafReader leafReader = leafReaderContext.reader();
FieldInfos fieldInfos = leafReader.getFieldInfos();
for (FieldInfo info : fieldInfos) {
NumericDocValues norms = leafReader.getNormValues(;
// null result is special-cased (branch body not visible)
if (norms == null) {
Map<String, Object> perField = stats.computeIfAbsent(, n -> new HashMap<>());
// one "lengths" statistics object per field, created on first use
SummaryStatistics lengthSummary = (SummaryStatistics)perField.computeIfAbsent("lengths", s -> new MapWriterSummaryStatistics());
// advance through the norms iterator samplingStep documents at a time
while (norms.advance(norms.docID() + samplingStep) != DocIdSetIterator.NO_MORE_DOCS) {
// runs samplingStep times per sampled document — presumably to extrapolate the sample; body not visible
for (int i = 0; i < samplingStep; i++) {
result.put(NORMS, stats);
// Collects per-field point statistics over all leaf readers and stores them in `result`
// under POINTS.
// NOTE(review): some expressions below are incomplete in this view (empty argument positions,
// a null-check without a visible body).
private void estimatePoints(Map<String, Object> result) throws IOException {"- estimating points...");
Map<String, Map<String, Object>> stats = new HashMap<>();
for (LeafReaderContext leafReaderContext : reader.leaves()) {
LeafReader leafReader = leafReaderContext.reader();
FieldInfos fieldInfos = leafReader.getFieldInfos();
for (FieldInfo info : fieldInfos) {
PointValues values = leafReader.getPointValues(;
// null result is special-cased (branch body not visible)
if (values == null) {
Map<String, Object> perField = stats.computeIfAbsent(, n -> new HashMap<>());
SummaryStatistics lengthSummary = (SummaryStatistics)perField.computeIfAbsent("lengths", s -> new MapWriterSummaryStatistics());
// one sample per leaf and field: point count * bytes per dimension * number of index dimensions
lengthSummary.addValue(values.size() * values.getBytesPerDimension() * values.getNumIndexDimensions());
result.put(POINTS, stats);
// Collects term-vector statistics by visiting every samplingStep-th document of each leaf
// reader and stores them in `result` under TERM_VECTORS.
// NOTE(review): the bodies of the three guard conditions below are not visible in this view.
private void estimateTermVectors(Map<String, Object> result) throws IOException {"- estimating term vectors...");
Map<String, Map<String, Object>> stats = new HashMap<>();
for (LeafReaderContext leafReaderContext : reader.leaves()) {
LeafReader leafReader = leafReaderContext.reader();
Bits liveDocs = leafReader.getLiveDocs();
for (int docId = 0; docId < leafReader.maxDoc(); docId += samplingStep) {
// guard for documents not marked live (only evaluated when a liveDocs bitset exists)
if (liveDocs != null && !liveDocs.get(docId)) {
Fields termVectors = leafReader.getTermVectors(docId);
// guard for documents without term vectors
if (termVectors == null) {
for (String field : termVectors) {
Terms terms = termVectors.terms(field);
// guard for fields without terms
if (terms == null) {
// isSampling = true: term statistics are collected in sampling mode
estimateTermStats(field, terms, stats, true);
result.put(TERM_VECTORS, stats);
// Collects docValues statistics for every field of every leaf reader, one countDocValues call
// per docValues flavour (binary, numeric, sorted, sortedNumeric, sortedSet), and stores the
// result under DOC_VALUES.
// NOTE(review): the field-name arguments and the terms-enum advance expressions are incomplete
// in this view; comments describe only what is visible.
private void estimateDocValues(Map<String, Object> result) throws IOException {"- estimating docValues...");
Map<String, Map<String, Object>> stats = new HashMap<>();
for (LeafReaderContext context : reader.leaves()) {
LeafReader leafReader = context.reader();
FieldInfos fieldInfos = leafReader.getFieldInfos();
for (FieldInfo info : fieldInfos) {
// binary
// value length = length of the current binary value; an IOException is mapped to length 0
countDocValues(stats,, "binary", leafReader.getBinaryDocValues(, values -> {
try {
BytesRef value = ((BinaryDocValues) values).binaryValue();
return value.length;
} catch (IOException e) {
// ignore
return 0;
// numeric
// every numeric value is counted as 8 bytes
countDocValues(stats,, "numeric", leafReader.getNumericDocValues(, values -> 8);
// sorted: length is taken from the values' terms enum; an IOException is mapped to length 0
// NOTE(review): as written the loop returns on its first iteration — confirm this is intended
countDocValues(stats,, "sorted", leafReader.getSortedDocValues(, values -> {
try {
TermsEnum termsEnum = ((SortedDocValues) values).termsEnum();
BytesRef term;
while ((term = != null) {
return term.length;
} catch (IOException e) {
// ignore
return 0;
// sortedNumeric: 8 bytes for each value of the current document
countDocValues(stats,, "sortedNumeric", leafReader.getSortedNumericDocValues(,
values -> ((SortedNumericDocValues) values).docValueCount() * 8);
// sortedSet: same shape as the "sorted" case above
countDocValues(stats,, "sortedSet", leafReader.getSortedSetDocValues(, values -> {
try {
TermsEnum termsEnum = ((SortedSetDocValues) values).termsEnum();
BytesRef term;
while ((term = != null) {
return term.length;
} catch (IOException e) {
// ignore
return 0;
result.put(DOC_VALUES, stats);
// Steps through `values` samplingStep documents at a time and measures each visited position
// with `valueLength`; statistics are kept per field under the key "lengths_" + type.
// NOTE(review): the body of the null-check and of the inner loop are not visible in this view.
private void countDocValues(Map<String, Map<String, Object>> stats, String field, String type, DocIdSetIterator values,
Function<DocIdSetIterator, Integer> valueLength) throws IOException {
// null iterator is special-cased
if (values == null) {
Map<String, Object> perField = stats.computeIfAbsent(field, n -> new HashMap<>());
SummaryStatistics lengthSummary = (SummaryStatistics)perField.computeIfAbsent("lengths_" + type, s -> new MapWriterSummaryStatistics());
while (values.advance(values.docID() + samplingStep) != DocIdSetIterator.NO_MORE_DOCS) {
int len = valueLength.apply(values);
// runs samplingStep times per sampled document — presumably to extrapolate the sample
for (int i = 0; i < samplingStep; i++) {
// Collects term statistics for every field of every leaf reader and stores them in `result`
// under TERMS.
// NOTE(review): the field-name arguments and the body of the null-check are not visible in
// this view.
private void estimateTerms(Map<String, Object> result) throws IOException {"- estimating terms...");
Map<String, Map<String, Object>> stats = new HashMap<>();
for (LeafReaderContext context : reader.leaves()) {
LeafReader leafReader = context.reader();
FieldInfos fieldInfos = leafReader.getFieldInfos();
for (FieldInfo info : fieldInfos) {
Terms terms = leafReader.terms(;
// guard for fields without terms
if (terms == null) {
// isSampling = false: the terms of the field are processed without sampling
estimateTermStats(, terms, stats, false);
result.put(TERMS, stats);
// Gathers statistics for the terms of one field: term lengths, document frequencies, postings
// lengths, optional payload lengths, and two top-N queues (longest terms, highest total term
// frequency). All of them live in the field's entry of `stats` and are created on first use.
// NOTE(review): the terms-enum advance expression and the statements that add values to the
// statistics objects are not visible in this view.
private void estimateTermStats(String field, Terms terms, Map<String, Map<String, Object>> stats, boolean isSampling) throws IOException {
Map<String, Object> perField = stats.computeIfAbsent(field, n -> new HashMap<>());
SummaryStatistics lengthSummary = (SummaryStatistics)perField.computeIfAbsent("lengths_terms", s -> new MapWriterSummaryStatistics());
SummaryStatistics docFreqSummary = (SummaryStatistics)perField.computeIfAbsent("docFreqs", s -> new MapWriterSummaryStatistics());
SummaryStatistics totalFreqSummary = (SummaryStatistics)perField.computeIfAbsent("lengths_postings", s -> new MapWriterSummaryStatistics());
// TODO: add this at some point
//SummaryStatistics impactsSummary = (SummaryStatistics)perField.computeIfAbsent("lengths_impacts", s -> new MapWriterSummaryStatistics());
// payload statistics exist only for fields that report payloads
SummaryStatistics payloadSummary = null;
if (terms.hasPayloads()) {
payloadSummary = (SummaryStatistics)perField.computeIfAbsent("lengths_payloads", s -> new MapWriterSummaryStatistics());
ItemPriorityQueue topLen = (ItemPriorityQueue)perField.computeIfAbsent("topLen", s -> new ItemPriorityQueue(topN));
ItemPriorityQueue topTotalFreq = (ItemPriorityQueue)perField.computeIfAbsent("topTotalFreq", s -> new ItemPriorityQueue(topN));
TermsEnum termsEnum = terms.iterator();
BytesRef term;
// reused across terms by termsEnum.postings(...)
PostingsEnum postings = null;
while ((term = != null) {
// in sampling mode the per-term work is repeated samplingStep times (loop body not visible)
if (isSampling) {
for (int i = 0; i < samplingStep; i++) {
} else {
// payloads require walking every position of every posting
if (terms.hasPayloads()) {
postings = termsEnum.postings(postings, PostingsEnum.ALL);
while (postings.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
int freq = postings.freq();
for (int i = 0; i < freq; i++) {
// a negative position is special-cased (branch body not visible)
if (postings.nextPosition() < 0) {
BytesRef payload = postings.getPayload();
if (payload != null) {
if (isSampling) {
for (int k = 0; k < samplingStep; k++) {
} else {
// term text used as the queue label, cut to at most maxLength characters
String value = term.utf8ToString();
if (value.length() > maxLength) {
value = value.substring(0, maxLength);
// ranked by byte length of the term
topLen.insertWithOverflow(new Item(value, term.length));
// ranked by total term frequency
topTotalFreq.insertWithOverflow(new Item(value, termsEnum.totalTermFreq()));
// Collects stored-field statistics by feeding every samplingStep-th document of each leaf
// reader to an EstimatingVisitor, and stores the result under STORED_FIELDS.
// NOTE(review): the bodies of the liveDocs guards and of the `mergeInstance != storedFieldsReader`
// check are not visible in this view.
private void estimateStoredFields(Map<String, Object> result) throws IOException {"- estimating stored fields...");
Map<String, Map<String, Object>> stats = new HashMap<>();
for (LeafReaderContext context : reader.leaves()) {
LeafReader leafReader = context.reader();
// a new visitor per leaf; all visitors write into the same `stats` map
EstimatingVisitor visitor = new EstimatingVisitor(stats, topN, maxLength, samplingStep);
Bits liveDocs = leafReader.getLiveDocs();
// CodecReader leaves are read through the stored fields reader's merge instance
if (leafReader instanceof CodecReader) {
CodecReader codecReader = (CodecReader)leafReader;
StoredFieldsReader storedFieldsReader = codecReader.getFieldsReader();
// this instance may be faster for a full sequential pass
StoredFieldsReader mergeInstance = storedFieldsReader.getMergeInstance();
for (int docId = 0; docId < leafReader.maxDoc(); docId += samplingStep) {
// guard for documents not marked live
if (liveDocs != null && !liveDocs.get(docId)) {
mergeInstance.visitDocument(docId, visitor);
// the merge instance may or may not be the same object as the original reader
if (mergeInstance != storedFieldsReader) {
} else {
// other leaf types go through the generic document(...) API
for (int docId = 0; docId < leafReader.maxDoc(); docId += samplingStep) {
if (liveDocs != null && !liveDocs.get(docId)) {
leafReader.document(docId, visitor);
result.put(STORED_FIELDS, stats);
public static class Item {
Object value;
long size;
public Item(Object value, long size) {
this.value = value;
this.size = size;
public String toString() {
return "size=" + size + ", value=" + value;
public static class MapWriterSummaryStatistics extends SummaryStatistics implements MapWriter {
public void writeMap(EntryWriter ew) throws IOException {
ew.put("n", getN());
ew.put("min", getMin());
ew.put("max", getMax());
ew.put("sum", getSum());
ew.put("mean", getMean());
ew.put("geoMean", getGeometricMean());
ew.put("variance", getVariance());
ew.put("populationVariance", getPopulationVariance());
ew.put("stddev", getStandardDeviation());
ew.put("secondMoment", getSecondMoment());
ew.put("sumOfSquares", getSumsq());
ew.put("sumOfLogs", getSumOfLogs());
// Priority queue of Item objects ordered by Item.size that can also write its contents
// through MapWriter.
// NOTE(review): the constructor body, the body of the toString loop and the update of `pos`
// in writeMap are not visible in this view.
public static class ItemPriorityQueue extends PriorityQueue<Item> implements MapWriter {
public ItemPriorityQueue(int maxSize) {
// ordering used by the queue: an item with a smaller size ranks lower
protected boolean lessThan(Item a, Item b) {
return a.size < b.size;
// builds a textual listing of the queued items
public String toString() {
StringBuilder sb = new StringBuilder();
Iterator<Item> it = iterator();
while (it.hasNext()) {
// true once something has already been appended — presumably to insert a separator
if (sb.length() > 0) {
return sb.toString();
// WARNING: destructive! empties the queue
public void writeMap(EntryWriter ew) throws IOException {
// popped items are stored into an array that is addressed starting from its last slot
Item[] items = new Item[size()];
int pos = size() - 1;
while (size() > 0) {
items[pos] = pop();
// emit one entry per item: string form of the value -> size
for (Item item : items) {
ew.put(String.valueOf(item.value), item.size);
// StoredFieldVisitor that records, per field, the sizes of the stored values it is shown.
private static class EstimatingVisitor extends StoredFieldVisitor {
// field name -> statistics for that field; supplied by the caller and updated in place
final Map<String, Map<String, Object>> stats;
// capacity of the per-field "topLen" queue
final int topN;
// maximum length kept of a value used as a queue label
final int maxLength;
// number of times each observed size is counted (see countItem)
final int samplingStep;
EstimatingVisitor(Map<String, Map<String, Object>> stats, int topN, int maxLength, int samplingStep) {
this.stats = stats;
this.topN = topN;
this.maxLength = maxLength;
this.samplingStep = samplingStep;
/** Process a binary field.
* @param value newly allocated byte array with the binary contents.
public void binaryField(FieldInfo fieldInfo, byte[] value) throws IOException {
// trim the value if needed
int len = value != null ? value.length : 0;
if (len > maxLength) {
byte[] newValue = new byte[maxLength];
System.arraycopy(value, 0, newValue, 0, maxLength);
value = newValue;
String strValue = new BytesRef(value).toString();
countItem(, strValue, len);
/** Process a string field. */
public void stringField(FieldInfo fieldInfo, String value) throws IOException {
// trim the value if needed
int len = value != null ? UnicodeUtil.calcUTF16toUTF8Length(value, 0, value.length()) : 0;
if (value.length() > maxLength) {
value = value.substring(0, maxLength);
countItem(, value, len);
/** Process a int numeric field. */
public void intField(FieldInfo fieldInfo, int value) throws IOException {
countItem(, String.valueOf(value), 4);
/** Process a long numeric field. */
public void longField(FieldInfo fieldInfo, long value) throws IOException {
countItem(, String.valueOf(value), 8);
/** Process a float numeric field. */
public void floatField(FieldInfo fieldInfo, float value) throws IOException {
countItem(, String.valueOf(value), 4);
/** Process a double numeric field. */
public void doubleField(FieldInfo fieldInfo, double value) throws IOException {
countItem(, String.valueOf(value), 8);
// Records one observed value for `field`: updates the field's "lengths" statistics and offers
// the value to the field's "topLen" queue, creating both on first use.
// NOTE(review): the body of the samplingStep loop is not visible in this view — presumably it
// adds `size` to `summary` once per iteration; confirm against the complete file.
private void countItem(String field, Object value, int size) {
Map<String, Object> perField = stats.computeIfAbsent(field, n -> new HashMap<>());
SummaryStatistics summary = (SummaryStatistics)perField.computeIfAbsent("lengths", s -> new MapWriterSummaryStatistics());
for (int i = 0; i < samplingStep; i++) {
ItemPriorityQueue topNqueue = (ItemPriorityQueue)perField.computeIfAbsent("topLen", s-> new ItemPriorityQueue(topN));
// the queue keeps at most topN items, ranked by size
topNqueue.insertWithOverflow(new Item(value, size));
// Always answers Status.YES, regardless of the field passed in.
public Status needsField(FieldInfo fieldInfo) throws IOException {
return Status.YES;
@SuppressForbidden(reason = "System.err and System.out required for a command-line utility")
public static void main(String[] args) throws Exception {
if (args.length == 0) {
System.err.println("Usage: " + IndexSizeEstimator.class.getName() + " [-topN NUM] [-maxLen NUM] [-summary] [-details] <indexDir>");
System.err.println("\t<indexDir>\tpath to the index (parent path of 'segments_N' file)");
System.err.println("\t-topN NUM\tnumber of top largest items to collect");
System.err.println("\t-maxLen NUM\ttruncate the largest items to NUM bytes / characters");
String path = null;
int topN = 20;
int maxLen = 100;
boolean details = false;
boolean summary = false;
for (int i = 0; i < args.length; i++) {
if (args[i].equals("-topN")) {
topN = Integer.parseInt(args[++i]);
} else if (args[i].equals("-maxLen")) {
maxLen = Integer.parseInt(args[++i]);
} else if (args[i].equals("-details")) {
details = true;
} else if (args[i].equals("-summary")) {
summary = true;
} else {
path = args[i];
if (path == null) {
System.err.println("ERROR: <indexDir> argument is required.");
Directory dir =;
DirectoryReader reader =;
IndexSizeEstimator stats = new IndexSizeEstimator(reader, topN, maxLen, summary, details);