| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| package org.apache.solr.handler.admin; |
| |
| import java.io.IOException; |
| import java.lang.invoke.MethodHandles; |
| import java.nio.charset.StandardCharsets; |
| import java.nio.file.Paths; |
| import java.util.HashMap; |
| import java.util.Iterator; |
| import java.util.LinkedHashMap; |
| import java.util.Map; |
| import java.util.Objects; |
| import java.util.concurrent.atomic.AtomicLong; |
| import java.util.function.Function; |
| |
| import org.apache.commons.math3.stat.descriptive.SummaryStatistics; |
| import org.apache.lucene.codecs.StoredFieldsReader; |
| import org.apache.lucene.index.BinaryDocValues; |
| import org.apache.lucene.index.CodecReader; |
| import org.apache.lucene.index.DirectoryReader; |
| import org.apache.lucene.index.FieldInfo; |
| import org.apache.lucene.index.FieldInfos; |
| import org.apache.lucene.index.Fields; |
| import org.apache.lucene.index.IndexReader; |
| import org.apache.lucene.index.LeafReader; |
| import org.apache.lucene.index.LeafReaderContext; |
| import org.apache.lucene.index.NumericDocValues; |
| import org.apache.lucene.index.PointValues; |
| import org.apache.lucene.index.PostingsEnum; |
| import org.apache.lucene.index.SortedDocValues; |
| import org.apache.lucene.index.SortedNumericDocValues; |
| import org.apache.lucene.index.SortedSetDocValues; |
| import org.apache.lucene.index.StandardDirectoryReader; |
| import org.apache.lucene.index.StoredFieldVisitor; |
| import org.apache.lucene.index.Terms; |
| import org.apache.lucene.index.TermsEnum; |
| import org.apache.lucene.search.DocIdSetIterator; |
| import org.apache.lucene.store.Directory; |
| import org.apache.lucene.store.FSDirectory; |
| import org.apache.lucene.util.Bits; |
| import org.apache.lucene.util.BytesRef; |
| import org.apache.lucene.util.PriorityQueue; |
| import org.apache.lucene.util.RamUsageEstimator; |
| import org.apache.lucene.util.SuppressForbidden; |
| import org.apache.solr.common.MapWriter; |
| import org.apache.solr.common.util.Utils; |
| import org.slf4j.Logger; |
| import org.slf4j.LoggerFactory; |
| |
| /** |
| * Estimates the raw size of all uncompressed indexed data by scanning terms, docValues, stored fields, |
| * norms, points and term vectors data. This utility also provides detailed statistics about term, |
| * docValues, postings and stored fields distributions. |
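| * <p>A minimal usage sketch (the index path and parameter values are illustrative): |
| * <pre>{@code |
| * try (Directory dir = FSDirectory.open(Paths.get("/path/to/index")); |
| *      DirectoryReader reader = DirectoryReader.open(dir)) { |
| *   IndexSizeEstimator estimator = new IndexSizeEstimator(reader, 20, 100, true, false); |
| *   estimator.setSamplingPercent(5.0f); |
| *   IndexSizeEstimator.Estimate estimate = estimator.estimate(); |
| *   estimate.getHumanReadableFieldsBySize() |
| *       .forEach((field, size) -> System.out.println(field + "\t" + size)); |
| * } |
| * }</pre> |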
| */ |
| public class IndexSizeEstimator { |
| private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass()); |
| |
| public static final String TERMS = "terms"; |
| public static final String STORED_FIELDS = "storedFields"; |
| public static final String NORMS = "norms"; |
| public static final String DOC_VALUES = "docValues"; |
| public static final String POINTS = "points"; |
| public static final String TERM_VECTORS = "termVectors"; |
| public static final String SUMMARY = "summary"; |
| public static final String DETAILS = "details"; |
| public static final String FIELDS_BY_SIZE = "fieldsBySize"; |
| public static final String TYPES_BY_SIZE = "typesBySize"; |
| |
| public static final int DEFAULT_SAMPLING_THRESHOLD = 100_000; |
| public static final float DEFAULT_SAMPLING_PERCENT = 5.0f; |
| |
| private final IndexReader reader; |
| private final int topN; |
| private final int maxLength; |
| private final boolean withSummary; |
| private final boolean withDetails; |
| private int samplingThreshold = DEFAULT_SAMPLING_THRESHOLD; |
| private float samplingPercent = DEFAULT_SAMPLING_PERCENT; |
| private int samplingStep = 1; |
| |
| public static final class Estimate implements MapWriter { |
| private final Map<String, Long> fieldsBySize; |
| private final Map<String, Long> typesBySize; |
| private final Map<String, Object> summary; |
| private final Map<String, Object> details; |
| |
| public Estimate(Map<String, Long> fieldsBySize, Map<String, Long> typesBySize, Map<String, Object> summary, Map<String, Object> details) { |
| Objects.requireNonNull(fieldsBySize); |
| Objects.requireNonNull(typesBySize); |
| this.fieldsBySize = fieldsBySize; |
| this.typesBySize = typesBySize; |
| this.summary = summary; |
| this.details = details; |
| } |
| |
| public Map<String, Long> getFieldsBySize() { |
| return fieldsBySize; |
| } |
| |
| public Map<String, Long> getTypesBySize() { |
| return typesBySize; |
| } |
| |
| public Map<String, String> getHumanReadableFieldsBySize() { |
| LinkedHashMap<String, String> result = new LinkedHashMap<>(); |
| fieldsBySize.forEach((field, size) -> result.put(field, RamUsageEstimator.humanReadableUnits(size))); |
| return result; |
| } |
| |
| public Map<String, String> getHumanReadableTypesBySize() { |
| LinkedHashMap<String, String> result = new LinkedHashMap<>(); |
| typesBySize.forEach((field, size) -> result.put(field, RamUsageEstimator.humanReadableUnits(size))); |
| return result; |
| } |
| |
| public Map<String, Object> getSummary() { |
| return summary; |
| } |
| |
| public Map<String, Object> getDetails() { |
| return details; |
| } |
| |
| @Override |
| public void writeMap(EntryWriter ew) throws IOException { |
| ew.put(FIELDS_BY_SIZE, fieldsBySize); |
| ew.put(TYPES_BY_SIZE, typesBySize); |
| if (summary != null) { |
| ew.put(SUMMARY, summary); |
| } |
| if (details != null) { |
| ew.put(DETAILS, details); |
| } |
| } |
| } |
| |
| public IndexSizeEstimator(IndexReader reader, int topN, int maxLength, boolean withSummary, boolean withDetails) { |
| this.reader = reader; |
| this.topN = topN; |
| this.maxLength = maxLength; |
| this.withSummary = withSummary; |
| this.withDetails = withDetails; |
| } |
| |
| /** |
| * Set the sampling threshold. If the index has more documents than this threshold |
| * then only some values will be sampled and the totals will be extrapolated. |
| * @param threshold size threshold (number of documents). Default value is {@link #DEFAULT_SAMPLING_THRESHOLD}. |
| * Setting this to a value less than or equal to 0 disables the threshold (and sampling). |
| */ |
| public void setSamplingThreshold(int threshold) { |
| if (threshold <= 0) { |
| threshold = Integer.MAX_VALUE; |
| } |
| this.samplingThreshold = threshold; |
| } |
| |
| /** |
| * Set the sampling percent (a number greater than 0 and less than or equal to 100). When the index size |
| * exceeds the threshold, only approximately this percent of the data will be retrieved from the index |
| * and the totals will be extrapolated. |
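| * <p>For example, with {@code percent = 5.0f} the sampling step becomes {@code Math.round(100.0f / 5.0f) == 20}: |
| * only every 20th document is inspected and each sampled value is counted 20 times when extrapolating the totals. |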
| * @param percent sample percent. Default value is {@link #DEFAULT_SAMPLING_PERCENT}. |
| * @throws IllegalArgumentException when value is less than or equal to 0.0 or greater than 100.0, or |
| * the sampling percent is so small that fewer than 10 documents would be sampled. |
| */ |
| public void setSamplingPercent(float percent) throws IllegalArgumentException { |
| if (percent <= 0 || percent > 100) { |
| throw new IllegalArgumentException("samplingPercent must be 0 < percent <= 100"); |
| } |
| if (reader.maxDoc() > samplingThreshold) { |
| samplingStep = Math.round(100.0f / percent); |
| if (log.isInfoEnabled()) { |
| log.info("- number of documents {} larger than {}, sampling percent is {} and sampling step {}", reader.maxDoc(), samplingThreshold, percent, samplingStep); |
| } |
| if (reader.maxDoc() / samplingStep < 10) { |
| throw new IllegalArgumentException("Out of " + reader.maxDoc() + " less than 10 documents would be sampled, which is too unreliable. Increase the samplingPercent."); |
| } |
| } |
| this.samplingPercent = percent; |
| } |
| |
| @SuppressWarnings({"unchecked"}) |
| public Estimate estimate() throws Exception { |
| Map<String, Object> details = new LinkedHashMap<>(); |
| Map<String, Object> summary = new LinkedHashMap<>(); |
| estimateStoredFields(details); |
| estimateTerms(details); |
| estimateNorms(details); |
| estimatePoints(details); |
| estimateTermVectors(details); |
| estimateDocValues(details); |
| estimateSummary(details, summary); |
| if (samplingStep > 1) { |
| details.put("samplingPercent", samplingPercent); |
| details.put("samplingStep", samplingStep); |
| } |
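| // rank fields by their estimated total size (largest first) |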
| ItemPriorityQueue fieldSizeQueue = new ItemPriorityQueue(summary.size()); |
| summary.forEach((field, perField) -> { |
| long size = ((AtomicLong)((Map<String, Object>)perField).get("totalSize")).get(); |
| if (size > 0) { |
| fieldSizeQueue.insertWithOverflow(new Item(field, size)); |
| } |
| }); |
| Map<String, Long> fieldsBySize = new LinkedHashMap<>(); |
| fieldSizeQueue._forEachEntry((k, v) -> fieldsBySize.put((String)k, (Long)v)); |
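| // aggregate sizes per data type from the per-field "*_lengths" summary entries |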
| Map<String, AtomicLong> typeSizes = new HashMap<>(); |
| summary.forEach((field, perField) -> { |
| Map<String, Object> perType = (Map<String, Object>)((Map<String, Object>)perField).get("perType"); |
| perType.forEach((type, size) -> { |
| if (type.contains("_lengths")) { |
| AtomicLong totalSize = typeSizes.computeIfAbsent(type.replace("_lengths", ""), t -> new AtomicLong()); |
| totalSize.addAndGet(((AtomicLong)size).get()); |
| } |
| }); |
| }); |
| ItemPriorityQueue typesSizeQueue = new ItemPriorityQueue(typeSizes.size()); |
| typeSizes.forEach((type, size) -> { |
| if (size.get() > 0) { |
| typesSizeQueue.insertWithOverflow(new Item(type, size.get())); |
| } |
| }); |
| Map<String, Long> typesBySize = new LinkedHashMap<>(); |
| typesSizeQueue._forEachEntry((k, v) -> typesBySize.put((String)k, (Long)v)); |
| // sort summary by field size |
| Map<String, Object> newSummary = new LinkedHashMap<>(); |
| fieldsBySize.keySet().forEach(k -> newSummary.put(String.valueOf(k), summary.get(k))); |
| // convert everything to maps and primitives |
| convert(newSummary); |
| convert(details); |
| return new Estimate(fieldsBySize, typesBySize, withSummary ? newSummary : null, withDetails ? details : null); |
| } |
| |
| @SuppressWarnings({"unchecked"}) |
| private void convert(Map<String, Object> result) { |
| for (Map.Entry<String, Object> entry : result.entrySet()) { |
| Object value = entry.getValue(); |
| if (value instanceof ItemPriorityQueue) { |
| ItemPriorityQueue queue = (ItemPriorityQueue)value; |
| Map<String, Object> map = new LinkedHashMap<>(); |
| queue.toMap(map); |
| entry.setValue(map); |
| } else if (value instanceof MapWriterSummaryStatistics) { |
| MapWriterSummaryStatistics stats = (MapWriterSummaryStatistics)value; |
| Map<String, Object> map = new LinkedHashMap<>(); |
| stats.toMap(map); |
| entry.setValue(map); |
| } else if (value instanceof AtomicLong) { |
| entry.setValue(((AtomicLong)value).longValue()); |
| } else if (value instanceof Map) { |
| // recurse |
| convert((Map<String, Object>)value); |
| } |
| } |
| } |
| |
| @SuppressWarnings({"unchecked"}) |
| private void estimateSummary(Map<String, Object> details, Map<String, Object> summary) { |
| log.info("- preparing summary..."); |
| details.forEach((type, perType) -> { |
| ((Map<String, Object>)perType).forEach((field, perField) -> { |
| Map<String, Object> perFieldSummary = (Map<String, Object>)summary.computeIfAbsent(field, f -> new HashMap<>()); |
| ((Map<String, Object>)perField).forEach((k, val) -> { |
| if (val instanceof SummaryStatistics) { |
| SummaryStatistics stats = (SummaryStatistics)val; |
| if (k.startsWith("lengths")) { |
| AtomicLong total = (AtomicLong)perFieldSummary.computeIfAbsent("totalSize", kt -> new AtomicLong()); |
| total.addAndGet((long)stats.getSum()); |
| } |
| Map<String, Object> perTypeSummary = (Map<String, Object>)perFieldSummary.computeIfAbsent("perType", pt -> new HashMap<>()); |
| AtomicLong total = (AtomicLong)perTypeSummary.computeIfAbsent(type + "_" + k, t -> new AtomicLong()); |
| total.addAndGet((long)stats.getSum()); |
| } |
| }); |
| }); |
| }); |
| } |
| |
| private void estimateNorms(Map<String, Object> result) throws IOException { |
| log.info("- estimating norms..."); |
| Map<String, Map<String, Object>> stats = new HashMap<>(); |
| for (LeafReaderContext leafReaderContext : reader.leaves()) { |
| LeafReader leafReader = leafReaderContext.reader(); |
| FieldInfos fieldInfos = leafReader.getFieldInfos(); |
| for (FieldInfo info : fieldInfos) { |
| NumericDocValues norms = leafReader.getNormValues(info.name); |
| if (norms == null) { |
| continue; |
| } |
| Map<String, Object> perField = stats.computeIfAbsent(info.name, n -> new HashMap<>()); |
| SummaryStatistics lengthSummary = (SummaryStatistics)perField.computeIfAbsent("lengths", s -> new MapWriterSummaryStatistics()); |
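| // the estimator assumes 8 bytes per norm value; each sampled value is counted samplingStep times to extrapolate |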
| while (norms.advance(norms.docID() + samplingStep) != DocIdSetIterator.NO_MORE_DOCS) { |
| for (int i = 0; i < samplingStep; i++) { |
| lengthSummary.addValue(8); |
| } |
| } |
| } |
| } |
| result.put(NORMS, stats); |
| } |
| |
| private void estimatePoints(Map<String, Object> result) throws IOException { |
| log.info("- estimating points..."); |
| Map<String, Map<String, Object>> stats = new HashMap<>(); |
| for (LeafReaderContext leafReaderContext : reader.leaves()) { |
| LeafReader leafReader = leafReaderContext.reader(); |
| FieldInfos fieldInfos = leafReader.getFieldInfos(); |
| for (FieldInfo info : fieldInfos) { |
| PointValues values = leafReader.getPointValues(info.name); |
| if (values == null) { |
| continue; |
| } |
| Map<String, Object> perField = stats.computeIfAbsent(info.name, n -> new HashMap<>()); |
| SummaryStatistics lengthSummary = (SummaryStatistics)perField.computeIfAbsent("lengths", s -> new MapWriterSummaryStatistics()); |
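| // total size of points in this segment: number of values * bytes per dimension * number of index dimensions |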
| lengthSummary.addValue(values.size() * values.getBytesPerDimension() * values.getNumIndexDimensions()); |
| } |
| } |
| result.put(POINTS, stats); |
| } |
| |
| private void estimateTermVectors(Map<String, Object> result) throws IOException { |
| log.info("- estimating term vectors..."); |
| Map<String, Map<String, Object>> stats = new HashMap<>(); |
| for (LeafReaderContext leafReaderContext : reader.leaves()) { |
| LeafReader leafReader = leafReaderContext.reader(); |
| Bits liveDocs = leafReader.getLiveDocs(); |
| for (int docId = 0; docId < leafReader.maxDoc(); docId += samplingStep) { |
| if (liveDocs != null && !liveDocs.get(docId)) { |
| continue; |
| } |
| Fields termVectors = leafReader.getTermVectors(docId); |
| if (termVectors == null) { |
| continue; |
| } |
| for (String field : termVectors) { |
| Terms terms = termVectors.terms(field); |
| if (terms == null) { |
| continue; |
| } |
| estimateTermStats(field, terms, stats, true); |
| } |
| } |
| } |
| result.put(TERM_VECTORS, stats); |
| } |
| |
| private void estimateDocValues(Map<String, Object> result) throws IOException { |
| log.info("- estimating docValues..."); |
| Map<String, Map<String, Object>> stats = new HashMap<>(); |
| for (LeafReaderContext context : reader.leaves()) { |
| LeafReader leafReader = context.reader(); |
| FieldInfos fieldInfos = leafReader.getFieldInfos(); |
| for (FieldInfo info : fieldInfos) { |
| // binary |
| countDocValues(stats, info.name, "binary", leafReader.getBinaryDocValues(info.name), values -> { |
| try { |
| BytesRef value = ((BinaryDocValues) values).binaryValue(); |
| return value.length; |
| } catch (IOException e) { |
| // ignore |
| } |
| return 0; |
| }); |
| // numeric |
| countDocValues(stats, info.name, "numeric", leafReader.getNumericDocValues(info.name), values -> 8); |
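| // sorted / sortedSet: the per-document length is approximated with the length of the first term in the dictionary |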
| countDocValues(stats, info.name, "sorted", leafReader.getSortedDocValues(info.name), values -> { |
| try { |
| TermsEnum termsEnum = ((SortedDocValues) values).termsEnum(); |
| BytesRef term; |
| while ((term = termsEnum.next()) != null) { |
| return term.length; |
| } |
| } catch (IOException e) { |
| // ignore |
| } |
| return 0; |
| }); |
| countDocValues(stats, info.name, "sortedNumeric", leafReader.getSortedNumericDocValues(info.name), |
| values -> ((SortedNumericDocValues) values).docValueCount() * 8); |
| countDocValues(stats, info.name, "sortedSet", leafReader.getSortedSetDocValues(info.name), values -> { |
| try { |
| TermsEnum termsEnum = ((SortedSetDocValues) values).termsEnum(); |
| BytesRef term; |
| while ((term = termsEnum.next()) != null) { |
| return term.length; |
| } |
| } catch (IOException e) { |
| // ignore |
| } |
| return 0; |
| }); |
| } |
| } |
| result.put(DOC_VALUES, stats); |
| } |
| |
| private void countDocValues(Map<String, Map<String, Object>> stats, String field, String type, DocIdSetIterator values, |
| Function<DocIdSetIterator, Integer> valueLength) throws IOException { |
| if (values == null) { |
| return; |
| } |
| Map<String, Object> perField = stats.computeIfAbsent(field, n -> new HashMap<>()); |
| SummaryStatistics lengthSummary = (SummaryStatistics)perField.computeIfAbsent("lengths_" + type, s -> new MapWriterSummaryStatistics()); |
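| // advance by samplingStep documents and count each sampled length samplingStep times so that totals extrapolate to the full index |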
| while (values.advance(values.docID() + samplingStep) != DocIdSetIterator.NO_MORE_DOCS) { |
| int len = valueLength.apply(values); |
| for (int i = 0; i < samplingStep; i++) { |
| lengthSummary.addValue(len); |
| } |
| } |
| } |
| |
| private void estimateTerms(Map<String, Object> result) throws IOException { |
| log.info("- estimating terms..."); |
| Map<String, Map<String, Object>> stats = new HashMap<>(); |
| for (LeafReaderContext context : reader.leaves()) { |
| LeafReader leafReader = context.reader(); |
| FieldInfos fieldInfos = leafReader.getFieldInfos(); |
| for (FieldInfo info : fieldInfos) { |
| Terms terms = leafReader.terms(info.name); |
| if (terms == null) { |
| continue; |
| } |
| estimateTermStats(info.name, terms, stats, false); |
| } |
| } |
| result.put(TERMS, stats); |
| } |
| |
| private void estimateTermStats(String field, Terms terms, Map<String, Map<String, Object>> stats, boolean isSampling) throws IOException { |
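| // isSampling is true when the terms come from per-document term vectors of sampled documents; each value is then counted samplingStep times to extrapolate |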
| Map<String, Object> perField = stats.computeIfAbsent(field, n -> new HashMap<>()); |
| SummaryStatistics lengthSummary = (SummaryStatistics)perField.computeIfAbsent("lengths_terms", s -> new MapWriterSummaryStatistics()); |
| SummaryStatistics docFreqSummary = (SummaryStatistics)perField.computeIfAbsent("docFreqs", s -> new MapWriterSummaryStatistics()); |
| SummaryStatistics totalFreqSummary = (SummaryStatistics)perField.computeIfAbsent("lengths_postings", s -> new MapWriterSummaryStatistics()); |
| // TODO: add this at some point |
| //SummaryStatistics impactsSummary = (SummaryStatistics)perField.computeIfAbsent("lengths_impacts", s -> new MapWriterSummaryStatistics()); |
| SummaryStatistics payloadSummary = null; |
| if (terms.hasPayloads()) { |
| payloadSummary = (SummaryStatistics)perField.computeIfAbsent("lengths_payloads", s -> new MapWriterSummaryStatistics()); |
| } |
| ItemPriorityQueue topLen = (ItemPriorityQueue)perField.computeIfAbsent("topLen", s -> new ItemPriorityQueue(topN)); |
| ItemPriorityQueue topTotalFreq = (ItemPriorityQueue)perField.computeIfAbsent("topTotalFreq", s -> new ItemPriorityQueue(topN)); |
| TermsEnum termsEnum = terms.iterator(); |
| BytesRef term; |
| PostingsEnum postings = null; |
| while ((term = termsEnum.next()) != null) { |
| if (isSampling) { |
| for (int i = 0; i < samplingStep; i++) { |
| lengthSummary.addValue(term.length); |
| docFreqSummary.addValue(termsEnum.docFreq()); |
| totalFreqSummary.addValue(termsEnum.totalTermFreq()); |
| } |
| } else { |
| lengthSummary.addValue(term.length); |
| docFreqSummary.addValue(termsEnum.docFreq()); |
| totalFreqSummary.addValue(termsEnum.totalTermFreq()); |
| } |
| if (terms.hasPayloads()) { |
| postings = termsEnum.postings(postings, PostingsEnum.ALL); |
| while (postings.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) { |
| int freq = postings.freq(); |
| for (int i = 0; i < freq; i++) { |
| if (postings.nextPosition() < 0) { |
| break; |
| } |
| BytesRef payload = postings.getPayload(); |
| if (payload != null) { |
| if (isSampling) { |
| for (int k = 0; k < samplingStep; k++) { |
| payloadSummary.addValue(payload.length); |
| } |
| } else { |
| payloadSummary.addValue(payload.length); |
| } |
| } |
| } |
| } |
| } |
| String value = term.utf8ToString(); |
| if (value.length() > maxLength) { |
| value = value.substring(0, maxLength); |
| } |
| topLen.insertWithOverflow(new Item(value, term.length)); |
| topTotalFreq.insertWithOverflow(new Item(value, termsEnum.totalTermFreq())); |
| } |
| } |
| |
| |
| private void estimateStoredFields(Map<String, Object> result) throws IOException { |
| log.info("- estimating stored fields..."); |
| Map<String, Map<String, Object>> stats = new HashMap<>(); |
| for (LeafReaderContext context : reader.leaves()) { |
| LeafReader leafReader = context.reader(); |
| EstimatingVisitor visitor = new EstimatingVisitor(stats, topN, maxLength, samplingStep); |
| Bits liveDocs = leafReader.getLiveDocs(); |
| if (leafReader instanceof CodecReader) { |
| CodecReader codecReader = (CodecReader)leafReader; |
| StoredFieldsReader storedFieldsReader = codecReader.getFieldsReader(); |
| // this instance may be faster for a full sequential pass |
| StoredFieldsReader mergeInstance = storedFieldsReader.getMergeInstance(); |
| for (int docId = 0; docId < leafReader.maxDoc(); docId += samplingStep) { |
| if (liveDocs != null && !liveDocs.get(docId)) { |
| continue; |
| } |
| mergeInstance.visitDocument(docId, visitor); |
| } |
| if (mergeInstance != storedFieldsReader) { |
| mergeInstance.close(); |
| } |
| } else { |
| for (int docId = 0; docId < leafReader.maxDoc(); docId += samplingStep) { |
| if (liveDocs != null && !liveDocs.get(docId)) { |
| continue; |
| } |
| leafReader.document(docId, visitor); |
| } |
| } |
| } |
| result.put(STORED_FIELDS, stats); |
| } |
| |
| public static class Item { |
| Object value; |
| long size; |
| |
| public Item(Object value, long size) { |
| this.value = value; |
| this.size = size; |
| } |
| |
| public String toString() { |
| return "size=" + size + ", value=" + value; |
| } |
| } |
| |
| public static class MapWriterSummaryStatistics extends SummaryStatistics implements MapWriter { |
| |
| @Override |
| public void writeMap(EntryWriter ew) throws IOException { |
| ew.put("n", getN()); |
| ew.put("min", getMin()); |
| ew.put("max", getMax()); |
| ew.put("sum", getSum()); |
| ew.put("mean", getMean()); |
| ew.put("geoMean", getGeometricMean()); |
| ew.put("variance", getVariance()); |
| ew.put("populationVariance", getPopulationVariance()); |
| ew.put("stddev", getStandardDeviation()); |
| ew.put("secondMoment", getSecondMoment()); |
| ew.put("sumOfSquares", getSumsq()); |
| ew.put("sumOfLogs", getSumOfLogs()); |
| } |
| } |
| |
| public static class ItemPriorityQueue extends PriorityQueue<Item> implements MapWriter { |
| |
| public ItemPriorityQueue(int maxSize) { |
| super(maxSize); |
| } |
| |
| @Override |
| protected boolean lessThan(Item a, Item b) { |
| return a.size < b.size; |
| } |
| |
| public String toString() { |
| StringBuilder sb = new StringBuilder(); |
| Iterator<Item> it = iterator(); |
| while (it.hasNext()) { |
| if (sb.length() > 0) { |
| sb.append('\n'); |
| } |
| sb.append(it.next()); |
| } |
| return sb.toString(); |
| } |
| |
| // WARNING: destructive! empties the queue |
| @Override |
| public void writeMap(EntryWriter ew) throws IOException { |
| Item[] items = new Item[size()]; |
| int pos = size() - 1; |
| while (size() > 0) { |
| items[pos] = pop(); |
| pos--; |
| } |
| for (Item item : items) { |
| ew.put(String.valueOf(item.value), item.size); |
| } |
| } |
| } |
| |
| private static class EstimatingVisitor extends StoredFieldVisitor { |
| final Map<String, Map<String, Object>> stats; |
| final int topN; |
| final int maxLength; |
| final int samplingStep; |
| |
| EstimatingVisitor(Map<String, Map<String, Object>> stats, int topN, int maxLength, int samplingStep) { |
| this.stats = stats; |
| this.topN = topN; |
| this.maxLength = maxLength; |
| this.samplingStep = samplingStep; |
| } |
| |
| /** Process a binary field. |
| * @param value newly allocated byte array with the binary contents. |
| */ |
| public void binaryField(FieldInfo fieldInfo, byte[] value) throws IOException { |
| // trim the value if needed |
| int len = value != null ? value.length : 0; |
| if (len > maxLength) { |
| byte[] newValue = new byte[maxLength]; |
| System.arraycopy(value, 0, newValue, 0, maxLength); |
| value = newValue; |
| } |
| String strValue = new BytesRef(value).toString(); |
| countItem(fieldInfo.name, strValue, len); |
| } |
| |
| /** Process a string field. */ |
| public void stringField(FieldInfo fieldInfo, byte[] value) throws IOException { |
| // trim the value if needed |
| int len = value != null ? value.length : 0; |
| if (len > maxLength) { |
| byte[] newValue = new byte[maxLength]; |
| System.arraycopy(value, 0, newValue, 0, maxLength); |
| value = newValue; |
| } |
| String strValue = new String(value, StandardCharsets.UTF_8); |
| countItem(fieldInfo.name, strValue, len); |
| } |
| |
| /** Process an int numeric field. */ |
| public void intField(FieldInfo fieldInfo, int value) throws IOException { |
| countItem(fieldInfo.name, String.valueOf(value), 4); |
| } |
| |
| /** Process a long numeric field. */ |
| public void longField(FieldInfo fieldInfo, long value) throws IOException { |
| countItem(fieldInfo.name, String.valueOf(value), 8); |
| } |
| |
| /** Process a float numeric field. */ |
| public void floatField(FieldInfo fieldInfo, float value) throws IOException { |
| countItem(fieldInfo.name, String.valueOf(value), 4); |
| } |
| |
| /** Process a double numeric field. */ |
| public void doubleField(FieldInfo fieldInfo, double value) throws IOException { |
| countItem(fieldInfo.name, String.valueOf(value), 8); |
| } |
| |
| private void countItem(String field, Object value, int size) { |
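| // stored fields are visited only for every samplingStep-th document, so each length is counted samplingStep times to extrapolate |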
| Map<String, Object> perField = stats.computeIfAbsent(field, n -> new HashMap<>()); |
| SummaryStatistics summary = (SummaryStatistics)perField.computeIfAbsent("lengths", s -> new MapWriterSummaryStatistics()); |
| for (int i = 0; i < samplingStep; i++) { |
| summary.addValue(size); |
| } |
| ItemPriorityQueue topNqueue = (ItemPriorityQueue)perField.computeIfAbsent("topLen", s-> new ItemPriorityQueue(topN)); |
| topNqueue.insertWithOverflow(new Item(value, size)); |
| } |
| |
| @Override |
| public Status needsField(FieldInfo fieldInfo) throws IOException { |
| return Status.YES; |
| } |
| } |
| |
| @SuppressForbidden(reason = "System.err and System.out required for a command-line utility") |
| public static void main(String[] args) throws Exception { |
| if (args.length == 0) { |
| System.err.println("Usage: " + IndexSizeEstimator.class.getName() + " [-topN NUM] [-maxLen NUM] [-summary] [-details] <indexDir>"); |
| System.err.println(); |
| System.err.println("\t<indexDir>\tpath to the index (parent path of 'segments_N' file)"); |
| System.err.println("\t-topN NUM\tnumber of top largest items to collect"); |
| System.err.println("\t-maxLen NUM\ttruncate the largest items to NUM bytes / characters"); |
| System.err.println("\t-summary\tinclude a summary of the estimated size per field"); |
| System.err.println("\t-details\tinclude detailed statistics per data type and field"); |
| System.exit(-1); |
| } |
| String path = null; |
| int topN = 20; |
| int maxLen = 100; |
| boolean details = false; |
| boolean summary = false; |
| for (int i = 0; i < args.length; i++) { |
| if (args[i].equals("-topN")) { |
| topN = Integer.parseInt(args[++i]); |
| } else if (args[i].equals("-maxLen")) { |
| maxLen = Integer.parseInt(args[++i]); |
| } else if (args[i].equals("-details")) { |
| details = true; |
| } else if (args[i].equals("-summary")) { |
| summary = true; |
| } else { |
| path = args[i]; |
| } |
| } |
| if (path == null) { |
| System.err.println("ERROR: <indexDir> argument is required."); |
| System.exit(-2); |
| } |
| Directory dir = FSDirectory.open(Paths.get(path)); |
| DirectoryReader reader = StandardDirectoryReader.open(dir); |
| IndexSizeEstimator stats = new IndexSizeEstimator(reader, topN, maxLen, summary, details); |
| System.out.println(Utils.toJSONString(stats.estimate())); |
| System.exit(0); |
| } |
| } |