| /* |
| * Licensed to the Apache Software Foundation (ASF) under one |
| * or more contributor license agreements. See the NOTICE file |
| * distributed with this work for additional information |
| * regarding copyright ownership. The ASF licenses this file |
| * to you under the Apache License, Version 2.0 (the |
| * "License"); you may not use this file except in compliance |
| * with the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, |
| * software distributed under the License is distributed on an |
| * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| * KIND, either express or implied. See the License for the |
| * specific language governing permissions and limitations |
| * under the License. |
| */ |
| |
| package org.apache.datasketches.req; |
| |
| import java.util.Arrays; |
| import java.util.List; |
| |
| import org.apache.datasketches.InequalitySearch; |
| |
| /** |
| * The Sorted View provides a view of the data retained by the sketch that would be cumbersome to get any other way. |
| * One can iterate of the contents of the sketch, but the result is not sorted. |
| * Trying to use getQuantiles would be very cumbersome since one doesn't know what ranks to use to supply the |
| * getQuantiles method. Even worse, suppose it is a large sketch that has retained 1000 values from a stream of |
| * millions (or billions). One would have to execute the getQuantiles method many thousands of times, and using |
| * trial & error, try to figure out what the sketch actually has retained. |
| * |
| * <p>The data from a Sorted view is an unbiased sample of the input stream that can be used for other kinds of |
| * analysis not directly provided by the sketch. A good example comparing two sketches using the Kolmogorov-Smirnov |
| * test. One needs this sorted view for the test.</p> |
| * |
| * <p>This sorted view can also be used for multiple getRank and getQuantile queries once it has been created. |
| * Because it takes some computational work to create this sorted view, it doesn't make sense to create this sorted view |
| * just for single getRank queries. For the first getQuantile queries, it must be created. But for all queries |
| * after the first, assuming the sketch has not been updated, the getQuantile and getRank queries are very fast.<p> |
| * @author Lee Rhodes |
| */ |
| public class ReqSketchSortedView { |
| private static final String LS = System.getProperty("line.separator"); |
| private float[] values; |
| private long[] cumWeights; |
| private final boolean hra; //used in merge |
| private final long N; |
| private final boolean dedup; |
| |
| /** |
| * |
| * Construct this sorted view with the given sketch. |
| * The number of values in this sorted view will be the same as the number of retained values in the sketch. |
| * @param sk the given sketch |
| */ |
| public ReqSketchSortedView(final ReqSketch sk) { |
| this(sk, false); |
| } |
| |
| /** |
| * Construct this sorted view with the given sketch and option to deduplicate the values. |
| * The weights will be combined for the duplicate values. |
| * The getQuantile() and getRank() methods will work correctly. |
| * @param sk the given sketch |
| * @param dedup if true, duplicate values will be combined into a single value with the combined weights. |
| */ |
| public ReqSketchSortedView(final ReqSketch sk, final boolean dedup) { |
| hra = sk.getHighRankAccuracy(); |
| N = sk.getN(); |
| this.dedup = dedup; |
| buildAuxTable(sk); |
| } |
| |
| /** |
| * Gets the quantile based on the given normalized rank, |
| * which must be in the range [0.0, 1.0], inclusive. |
| * @param normRank the given normalized rank |
| * @param inclusive determines the search criterion used. |
| * @return the quantile |
| */ |
| public float getQuantile(final double normRank, final boolean inclusive) { |
| final int len = cumWeights.length; |
| final long rank = (int)(normRank * N); |
| final InequalitySearch crit = inclusive ? InequalitySearch.GE : InequalitySearch.GT; |
| final int index = InequalitySearch.find(cumWeights, 0, len - 1, rank, crit); |
| if (index == -1) { |
| return values[len - 1]; //GT: normRank >= 1.0; GE: normRank > 1.0 |
| } |
| return values[index]; |
| } |
| |
| /** |
| * Gets the normalized rank based on the given value. |
| * @param value the given value |
| * @param inclusive determines the search criterion used. |
| * @return the normalized rank |
| */ |
| public double getRank(final float value, final boolean inclusive) { |
| final int len = values.length; |
| final InequalitySearch crit = inclusive ? InequalitySearch.LE : InequalitySearch.LT; |
| final int index = InequalitySearch.find(values, 0, len - 1, value, crit); |
| if (index == -1) { |
| return 0; //LT: value <= minValue; LE: value < minValue |
| } |
| return (double)cumWeights[index] / N; |
| } |
| |
| public ReqSketchSortedViewIterator iterator() { |
| return new ReqSketchSortedViewIterator(values, cumWeights); |
| } |
| |
| public String toString(final int precision, final int fieldSize) { |
| final StringBuilder sb = new StringBuilder(); |
| final int p = precision; |
| final int z = Math.max(fieldSize, 6); |
| final String ff = "%" + z + "." + p + "f"; |
| final String sf = "%" + z + "s"; |
| final String df = "%" + z + "d"; |
| final String dfmt = ff + df + LS; |
| final String sfmt = sf + sf + LS; |
| sb.append("REQ toString(): Sorted View Data:").append(LS + LS); |
| sb.append(String.format(sfmt, "Value", "CumWeight")); |
| final int totalCount = values.length; |
| for (int i = 0; i < totalCount; i++) { |
| final Row row = getRow(i); |
| sb.append(String.format(dfmt, row.value, row.cumWeight)); |
| } |
| return sb.toString(); |
| } |
| |
| //restricted methods |
| |
| private void buildAuxTable(final ReqSketch sk) { |
| final List<ReqCompactor> compactors = sk.getCompactors(); |
| final int numComp = compactors.size(); |
| final int totalValues = sk.getRetainedItems(); |
| values = new float[totalValues]; |
| cumWeights = new long[totalValues]; |
| int count = 0; |
| for (int i = 0; i < numComp; i++) { |
| final ReqCompactor c = compactors.get(i); |
| final FloatBuffer bufIn = c.getBuffer(); |
| final long bufWeight = 1 << c.getLgWeight(); |
| final int bufInLen = bufIn.getCount(); |
| mergeSortIn(bufIn, bufWeight, count); |
| count += bufInLen; |
| } |
| createCumulativeNativeRanks(); |
| if (dedup) { dedup(); } |
| } |
| |
| /** |
| * Specially modified version of FloatBuffer.mergeSortIn(). Here spaceAtBottom is always false and |
| * the ultimate array size has already been set. However, this must simultaneously deal with |
| * sorting the base FloatBuffer as well. |
| * |
| * @param bufIn given FloatBuffer. If not sorted it will be sorted here. |
| * @param bufWeight associated weight of input FloatBuffer |
| * @param count tracks number of values inserted into the class arrays |
| */ |
| private void mergeSortIn(final FloatBuffer bufIn, final long bufWeight, final int count) { |
| if (!bufIn.isSorted()) { bufIn.sort(); } |
| final float[] arrIn = bufIn.getArray(); //may be larger than its value count. |
| final int bufInLen = bufIn.getCount(); |
| final int totLen = count + bufInLen; |
| int i = count - 1; |
| int j = bufInLen - 1; |
| int h = hra ? bufIn.getCapacity() - 1 : bufInLen - 1; |
| for (int k = totLen; k-- > 0; ) { |
| if (i >= 0 && j >= 0) { //both valid |
| if (values[i] >= arrIn[h]) { |
| values[k] = values[i]; |
| cumWeights[k] = cumWeights[i--]; //not yet natRanks, just individual wts |
| } else { |
| values[k] = arrIn[h--]; j--; |
| cumWeights[k] = bufWeight; |
| } |
| } else if (i >= 0) { //i is valid |
| values[k] = values[i]; |
| cumWeights[k] = cumWeights[i--]; |
| } else if (j >= 0) { //j is valid |
| values[k] = arrIn[h--]; j--; |
| cumWeights[k] = bufWeight; |
| } else { |
| break; |
| } |
| } |
| } |
| |
| private void createCumulativeNativeRanks() { |
| final int len = values.length; |
| for (int i = 1; i < len; i++) { |
| cumWeights[i] += cumWeights[i - 1]; |
| } |
| assert cumWeights[len - 1] == N; |
| } |
| |
| private void dedup() { |
| final int valuesLen = values.length; |
| final float[] valuesB = new float[valuesLen]; |
| final long[] natRanksB = new long[valuesLen]; |
| int bidx = 0; |
| int i = 0; |
| while (i < valuesLen) { |
| int j = i + 1; |
| int hidup = j; |
| while (j < valuesLen && values[i] == values[j]) { |
| hidup = j++; |
| } |
| if (j - i == 1) { //no dups |
| valuesB[bidx] = values[i]; |
| natRanksB[bidx++] = cumWeights[i]; |
| i++; |
| continue; |
| } else { |
| valuesB[bidx] = values[hidup]; |
| natRanksB[bidx++] = cumWeights[hidup]; |
| i = j; |
| continue; |
| } |
| } |
| values = Arrays.copyOf(valuesB, bidx); |
| cumWeights = Arrays.copyOf(natRanksB, bidx); |
| } |
| |
| |
| |
| //used for testing and for public SortedView toString |
| |
| Row getRow(final int index) { |
| return new Row(values[index], cumWeights[index]); |
| } |
| |
| static class Row { |
| float value; |
| long cumWeight; |
| |
| Row(final float value, final long cumWeight) { |
| this.value = value; |
| this.cumWeight = cumWeight; |
| } |
| } |
| |
| } |