blob: c0cb99279da79b11d08c54132f085b75d70e25e0 [file] [log] [blame] [view]
---
layout: doc_page
---
<!--
Licensed to the Apache Software Foundation (ASF) under one
or more contributor license agreements. See the NOTICE file
distributed with this work for additional information
regarding copyright ownership. The ASF licenses this file
to you under the Apache License, Version 2.0 (the
"License"); you may not use this file except in compliance
with the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing,
software distributed under the License is distributed on an
"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
KIND, either express or implied. See the License for the
specific language governing permissions and limitations
under the License.
-->
# Quantiles/DoublesSketch Java Example
// simplified file operations and no error handling for clarity
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.Arrays;
import java.util.Random;
import org.apache.datasketches.memory.Memory;
import org.apache.datasketches.quantiles.DoublesSketch;
import org.apache.datasketches.quantiles.DoublesUnion;
import org.apache.datasketches.quantiles.UpdateDoublesSketch;
// this section generates two sketches from random data and serializes them into files
{
  // Feeds 10000 Gaussian samples into each of two sketches (the second with
  // its mean shifted by 1) and writes each sketch's serialized bytes to a file.
  Random rand = new Random();

  UpdateDoublesSketch sketch1 = DoublesSketch.builder().build(); // default k=128
  for (int i = 0; i < 10000; i++) {
    sketch1.update(rand.nextGaussian()); // mean=0, stddev=1
  }
  // try-with-resources closes the stream even if write() throws
  try (FileOutputStream out1 = new FileOutputStream("QuantilesDoublesSketch1.bin")) {
    out1.write(sketch1.toByteArray());
  }

  UpdateDoublesSketch sketch2 = DoublesSketch.builder().build(); // default k=128
  for (int i = 0; i < 10000; i++) {
    sketch2.update(rand.nextGaussian() + 1); // shift the mean for the second sketch
  }
  try (FileOutputStream out2 = new FileOutputStream("QuantilesDoublesSketch2.bin")) {
    out2.write(sketch2.toByteArray());
  }
}
// this section deserializes the sketches, produces a union and prints some results
{
  // Reads the two serialized sketches back from their files, merges them
  // through a union, and prints a summary, selected quantiles and histograms.

  // Files.readAllBytes returns the complete file contents. Sizing a buffer
  // from InputStream.available() and issuing a single read() is not guaranteed
  // to do so: available() is only an estimate and read() may return fewer bytes.
  byte[] bytes1 = Files.readAllBytes(Paths.get("QuantilesDoublesSketch1.bin"));
  DoublesSketch sketch1 = DoublesSketch.wrap(Memory.wrap(bytes1));

  byte[] bytes2 = Files.readAllBytes(Paths.get("QuantilesDoublesSketch2.bin"));
  DoublesSketch sketch2 = DoublesSketch.wrap(Memory.wrap(bytes2));

  DoublesUnion union = DoublesUnion.builder().build(); // default k=128
  union.update(sketch1);
  union.update(sketch2);
  DoublesSketch result = union.getResult();

  // Debug output from the sketch
  System.out.println(result.toString());

  System.out.println("Min, Median, Max values");
  System.out.println(Arrays.toString(result.getQuantiles(new double[] {0, 0.5, 1})));

  // Three split points define four bins; the PMF is computed once and reused
  // for both the probability and the frequency histogram.
  double[] splitPoints = {-2, 0, 2};
  double[] pmf = result.getPMF(splitPoints);

  System.out.println("Probability Histogram: estimated probability mass in 4 bins: (-inf, -2), [-2, 0), [0, 2), [2, +inf)");
  System.out.println(Arrays.toString(pmf));

  System.out.println("Frequency Histogram: estimated number of original values in the same bins");
  long totalCount = result.getN();
  double[] histogram = new double[pmf.length];
  for (int i = 0; i < pmf.length; i++) {
    histogram[i] = pmf[i] * totalCount; // scale the fractions by the total count of values
  }
  System.out.println(Arrays.toString(histogram));
}
Output:
### Quantiles HeapUpdateDoublesSketch SUMMARY:
Empty : false
Direct, Capacity bytes : false,
Estimation Mode : true
K : 128
N : 20,000
Levels (Needed, Total, Valid): 7, 7, 4
Level Bit Pattern : 1001110
BaseBufferCount : 32
Combined Buffer Capacity : 1,152
Retained Items : 544
Compact Storage Bytes : 4,384
Updatable Storage Bytes : 9,248
Normalized Rank Error : 1.725%
Min Value : -4.113
Max Value : 4.363
### END SKETCH SUMMARY
Min, Median, Max values
[-4.113097775288085, 0.49496152841809893, 4.362712872544037]
Probability Histogram: estimated probability mass in 4 bins: (-inf, -2), [-2, 0), [0, 2), [2, +inf)
[0.01445, 0.3071, 0.58545, 0.093]
Frequency Histogram: estimated number of original values in the same bins
[289.0, 6142.0, 11709.0, 1860.0]