blob: 600c5c67905b05dac9594b6c6b6b3bb16f07bd85 [file] [log] [blame]
<!DOCTYPE html>
<!-- Start _layouts/doc_page.html-->
<html lang="en">
<head>
<!-- Start _include/site_head.html -->
<meta charset="UTF-8" />
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<meta name="description" content="">
<meta name="author" content="datasketches">
<title>DataSketches | </title>
<link rel="shortcut icon" href="/img/favicon.png">
<!-- original source: https://maxcdn.bootstrapcdn.com/font-awesome/4.1.0/css/font-awesome.min.css -->
<link rel="stylesheet" href="/css/font-awesome.min.css">
<!-- original source: https://maxcdn.bootstrapcdn.com/bootstrap/3.2.0/css/bootstrap.min.css -->
<link rel="stylesheet" href="/css/bootstrap.min.css">
<link rel="stylesheet" href="/css/fonts.css" type="text/css">
<link rel="stylesheet" href="/css/main.css">
<link rel="stylesheet" href="/css/header.css">
<link rel="stylesheet" href="/css/footer.css">
<link rel="stylesheet" href="/css/syntax.css">
<link rel="stylesheet" href="/css/docs.css">
<script type="text/x-mathjax-config">
MathJax.Hub.Config({tex2jax: {inlineMath: [['$','$'], ['\\(','\\)']]},showMathMenu:false,showMathMenuMSIE:false,showProcessingMessages:false});
</script>
<!-- original source: https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.1/MathJax.js?config=TeX-AMX_HTML-full -->
<script type="text/javascript" src="/js/MathJax.js?config=TeX-AMS_HTML"></script>
<!-- original source: https://code.jquery.com/jquery.min.js -->
<script src="/js/jquery.min.js"></script>
<!-- original source: https://maxcdn.bootstrapcdn.com/bootstrap/3.2.0/js/bootstrap.min.js -->
<script src="/js/bootstrap.min.js"></script> <!-- 3.2.0-->
<!-- End _include/site_head.html -->
</head>
<body>
<!-- Start _include/nav_bar.html -->
<div class="navbar navbar-inverse navbar-static-top ds-nav">
<div class="container">
<div class="navbar-header">
<button type="button" class="navbar-toggle" data-toggle="collapse" data-target=".navbar-collapse">
<span class="sr-only">Toggle navigation</span>
<span class="icon-bar"></span>
<span class="icon-bar"></span>
<span class="icon-bar"></span>
</button>
<a href="/" style="padding-top: 0px; padding-bottom: 0px;">
<span class="ds-small-h-logo"></span></a>
</div>
<div class="navbar-collapse collapse">
<ul class="nav navbar-nav navbar-right">
<li>
<a href="/docs/Background/TheChallenge.html">
<span class="fa fa-info-circle"></span> DOCUMENTATION</a>
</li>
<li>
<a href="/docs/Community/Downloads.html">
<span class="fa fa-download"></span> DOWNLOAD</a>
</li>
<!--
<li>
<a href="/docs/Architecture/Components.html">
<span class="fa fa-github"></span> GITHUB</a>
</li>
-->
<li>
<a href="/docs/Community/Research.html">
<span class="fa fa-paper-plane"></span> RESEARCH</a>
</li>
<li>
<a href="/docs/Community/index.html" style="padding-top: 0; padding-bottom: 0;">
<img class="ds-small-man" src="/img/datasketches-ManWhite.svg"/>COMMUNITY</a>
</li>
<li>
<ul class="nav navbar-nav navbar-right ds-nav">
<li class="dropdown ds-nav" >
<a href="#" class="dropdown-toggle" data-toggle="dropdown" role="button" aria-haspopup="true" aria-expanded="false" style="padding-top: 0; padding-bottom: 0;"><img class="apache-logo" src="/img/feather.svg"/>Apache <span class="caret"></span></a>
<ul class="dropdown-menu ds-nav">
<li><a href="https://www.apache.org/" target="_blank">Foundation</a></li>
<li><a href="https://www.apache.org/events/current-event" target="_blank">Events</a></li>
<li><a href="https://www.apache.org/licenses/" target="_blank">License</a></li>
<li><a href="https://privacy.apache.org/policies/privacy-policy-public.html" target="_blank">Privacy Policy</a></li>
<li><a href="https://www.apache.org/foundation/thanks.html" target="_blank">Thanks</a></li>
<li><a href="https://www.apache.org/security/" target="_blank">Security</a></li>
<li><a href="https://www.apache.org/foundation/sponsorship.html" target="_blank">Sponsorship</a></li>
</ul>
</li>
</ul>
</li>
</ul>
</div>
</div>
</div>
<!-- End _include/nav_bar.html -->
<!-- Start _include/javadocs.html -->
<div class="ds-header">
<div class="container">
<h4>API Snapshots:
<a href="https://apache.github.io/datasketches-java/4.2.0/">Java Core</a>,
<a href="https://apache.github.io/datasketches-cpp/5.0.0/">C++ Core</a>,
<a href="https://apache.github.io/datasketches-python/main/">Python</a>,
<a href="https://apache.github.io/datasketches-memory/master/">Memory</a>,
<a href="/api/pig/snapshot/apidocs/index.html">Pig</a>,
<a href="/api/hive/snapshot/apidocs/index.html">Hive</a>,
</h4>
</div>
</div>
<!-- End _include/javadocs.html -->
<div class="container">
<div class="row">
<!-- Start ToC Block -->
<div class="col-md-3">
<div class="searchbox" style="position:relative">
<gcse:searchbox-only></gcse:searchbox-only>
</div>
<!-- Start _includes/toc.html -->
<!-- Computer Generated File, Do Not Edit! -->
<link rel="stylesheet" href="/css/toc.css">
<div id="toc" class="nav toc hidden-print">
<p id="background">
<a data-toggle="collapse" class="menu collapsed" href="#collapse_background">Background</a>
</p>
<div class="collapse" id="collapse_background">
<li><a href="/docs/Background/TheChallenge.html">•The Challenge</a></li>
<li><a href="/docs/Background/SketchOrigins.html">•Sketch Origins</a></li>
<li><a href="/docs/Background/SketchElements.html">•Sketch Elements</a></li>
<li><a href="/docs/Background/Presentations.html">•Presentations</a></li>
<li><a href="https://github.com/apache/datasketches-website/tree/master/docs/pdf/DataSketches_deck.pdf">•Overview Slide Deck</a></li>
</div>
<p id="architecture-and-design">
<a data-toggle="collapse" class="menu collapsed" href="#collapse_architecture_and_design">Architecture And Design</a>
</p>
<div class="collapse" id="collapse_architecture_and_design">
<li><a href="/docs/Architecture/MajorSketchFamilies.html">•The Major Sketch Families</a></li>
<li><a href="/docs/Architecture/LargeScale.html">•Large Scale Computing</a></li>
<li><a href="/docs/Architecture/KeyFeatures.html">•Key Features</a></li>
<li><a href="/docs/Architecture/SketchFeaturesMatrix.html">•Sketch Features Matrix</a></li>
<li><a href="/docs/Architecture/Components.html">•Components</a></li>
<li><a href="/docs/Architecture/SketchesByComponent.html">•Sketches by Component</a></li>
<li><a href="/docs/Architecture/SketchCriteria.html">•Sketch Criteria</a></li>
<p id="memory-component">
<a data-toggle="collapse" class="menu collapsed" href="#collapse_memory_component">Memory Component</a>
</p>
<div class="collapse" id="collapse_memory_component">
<li><a href="/docs/Memory/MemoryComponent.html">•Memory Component</a></li>
<li><a href="/docs/Memory/MemoryPerformance.html">•Memory Component Performance</a></li>
</div>
<li><a href="/docs/Architecture/OrderSensitivity.html">•Notes on Order Sensitivity</a></li>
<li><a href="/docs/Architecture/Concurrency.html">•Notes on Concurrency</a></li>
</div>
<p id="sketch-families">
<a data-toggle="collapse" class="menu collapsed" href="#collapse_sketch_families">Sketch Families</a>
</p>
<div class="collapse" id="collapse_sketch_families">
<p id="distinct-counting">
<a data-toggle="collapse" class="menu collapsed" href="#collapse_distinct_counting">Distinct Counting</a>
</p>
<div class="collapse" id="collapse_distinct_counting">
<li><a href="/docs/DistinctCountFeaturesMatrix.html">•Features Matrix</a></li>
<li><a href="/docs/DistinctCountMeritComparisons.html">•Figures-of-Merit Comparison</a></li>
<p id="cpc-sketches">
<a data-toggle="collapse" class="menu collapsed" href="#collapse_cpc_sketches">CPC Sketches</a>
</p>
<div class="collapse" id="collapse_cpc_sketches">
<li><a href="/docs/CPC/CPC.html">•CPC Sketch</a></li>
<li><a href="/docs/CPC/CpcPerformance.html">•CPC Sketch Performance</a></li>
<p id="cpc-examples">
<a data-toggle="collapse" class="menu collapsed" href="#collapse_cpc_examples">CPC Examples</a>
</p>
<div class="collapse" id="collapse_cpc_examples">
<li><a href="/docs/CPC/CpcJavaExample.html">•CPC Sketch Java Example</a></li>
<li><a href="/docs/CPC/CpcCppExample.html">•CPC Sketch C++ Example</a></li>
<li><a href="/docs/CPC/CpcPigExample.html">•CPC Sketch Pig UDFs</a></li>
<li><a href="/docs/CPC/CpcHiveExample.html">•CPC Sketch Hive UDFs</a></li>
</div>
</div>
<p id="hyperloglog-sketches">
<a data-toggle="collapse" class="menu collapsed" href="#collapse_hyperloglog_sketches">HyperLogLog Sketches</a>
</p>
<div class="collapse" id="collapse_hyperloglog_sketches">
<li><a href="/docs/HLL/HLL.html">•HLL Sketch</a></li>
<li><a href="/docs/HLL/HllMap.html">•HLL Map Sketch</a></li>
<p id="hll-examples">
<a data-toggle="collapse" class="menu collapsed" href="#collapse_hll_examples">HLL Examples</a>
</p>
<div class="collapse" id="collapse_hll_examples">
<li><a href="/docs/HLL/HllJavaExample.html">•HLL Sketch Java Example</a></li>
<li><a href="/docs/HLL/HllCppExample.html">•HLL Sketch C++ Example</a></li>
<li><a href="/docs/HLL/HllPigUDFs.html">•HLL Sketch Pig UDFs</a></li>
<li><a href="/docs/HLL/HllHiveUDFs.html">•HLL Sketch Hive UDFs</a></li>
</div>
<p id="hll-studies">
<a data-toggle="collapse" class="menu collapsed" href="#collapse_hll_studies">HLL Studies</a>
</p>
<div class="collapse" id="collapse_hll_studies">
<li><a href="/docs/HLL/HllPerformance.html">•HLL Sketch Performance</a></li>
<li><a href="/docs/HLL/Hll_vs_CS_Hllpp.html">•HLL vs Clearspring HLL++</a></li>
<li><a href="/docs/HLL/HllSketchVsDruidHyperLogLogCollector.html">•HLL Sketch vs Druid HyperLogLogCollector</a></li>
</div>
</div>
<p id="theta-sketches">
<a data-toggle="collapse" class="menu collapsed" href="#collapse_theta_sketches">Theta Sketches</a>
</p>
<div class="collapse" id="collapse_theta_sketches">
<li><a href="/docs/Theta/ThetaSketchFramework.html">•Theta Sketch Framework</a></li>
<p id="theta-examples">
<a data-toggle="collapse" class="menu collapsed" href="#collapse_theta_examples">Theta Examples</a>
</p>
<div class="collapse" id="collapse_theta_examples">
<li><a href="/docs/Theta/ConcurrentThetaSketch.html">•Concurrent Theta Sketch</a></li>
<li><a href="/docs/Theta/ThetaJavaExample.html">•Theta Sketch Java Example</a></li>
<li><a href="/docs/Theta/ThetaSparkExample.html">•Theta Sketch Spark Example</a></li>
<li><a href="/docs/Theta/ThetaPigUDFs.html">•Theta Sketch Pig UDFs</a></li>
<li><a href="/docs/Theta/ThetaHiveUDFs.html">•Theta Sketch Hive UDFs</a></li>
</div>
<p id="kmv-tutorial">
<a data-toggle="collapse" class="menu collapsed" href="#collapse_kmv_tutorial">KMV Tutorial</a>
</p>
<div class="collapse" id="collapse_kmv_tutorial">
<li><a href="/docs/Theta/InverseEstimate.html">•The Inverse Estimate</a></li>
<li><a href="/docs/Theta/KMVempty.html">•Empty Sketch</a></li>
<li><a href="/docs/Theta/KMVfirstEst.html">•First Estimator</a></li>
<li><a href="/docs/Theta/KMVbetterEst.html">•Better Estimator</a></li>
<li><a href="/docs/Theta/KMVrejection.html">•Rejection Rules</a></li>
<li><a href="/docs/Theta/KMVupdateVkth.html">•Update V(kth) Rule</a></li>
</div>
<p id="set-operations-and-p-sampling">
<a data-toggle="collapse" class="menu collapsed" href="#collapse_set_operations_and_p-sampling">Set Operations and P-sampling</a>
</p>
<div class="collapse" id="collapse_set_operations_and_p-sampling">
<li><a href="/docs/Theta/ThetaSketchSetOps.html">•Set Operations</a></li>
<li><a href="/docs/Theta/ThetaSetOpsCornerCases.html">•Model & Test Set Operations</a></li>
<li><a href="/docs/Theta/ThetaPSampling.html"><i>p</i>-Sampling</a></li>
</div>
<p id="accuracy">
<a data-toggle="collapse" class="menu collapsed" href="#collapse_accuracy">Accuracy</a>
</p>
<div class="collapse" id="collapse_accuracy">
<li><a href="/docs/Theta/ThetaAccuracy.html">•Basic Accuracy</a></li>
<li><a href="/docs/Theta/ThetaAccuracyPlots.html">•Accuracy Plots</a></li>
<li><a href="/docs/Theta/ThetaErrorTable.html">•Relative Error Table</a></li>
<li><a href="/docs/Theta/ThetaSketchSetOpsAccuracy.html">•SetOp Accuracy</a></li>
<li><a href="/docs/Theta/AccuracyOfDifferentKUnions.html">•Unions With Different k</a></li>
</div>
<p id="size">
<a data-toggle="collapse" class="menu collapsed" href="#collapse_size">Size</a>
</p>
<div class="collapse" id="collapse_size">
<li><a href="/docs/Theta/ThetaSize.html">•Theta Sketch Size</a></li>
</div>
<p id="speed">
<a data-toggle="collapse" class="menu collapsed" href="#collapse_speed">Speed</a>
</p>
<div class="collapse" id="collapse_speed">
<li><a href="/docs/Theta/ThetaUpdateSpeed.html">•Update Speed</a></li>
<li><a href="/docs/Theta/ThetaMergeSpeed.html">•Merge Speed</a></li>
</div>
<p id="theta-sketch-theory">
<a data-toggle="collapse" class="menu collapsed" href="#collapse_theta_sketch_theory">Theta Sketch Theory</a>
</p>
<div class="collapse" id="collapse_theta_sketch_theory">
<li><a href="https://github.com/apache/datasketches-website/tree/master/docs/pdf/ThetaSketchFramework.pdf">•Theta Sketch Framework (PDF)</a></li>
<li><a href="https://github.com/apache/datasketches-website/tree/master/docs/pdf/ThetaSketchEquations.pdf">•Theta Sketch Equations (PDF)</a></li>
<li><a href="https://github.com/apache/datasketches-website/tree/master/docs/pdf/DataSketches.pdf">•DataSketches (PDF)</a></li>
<li><a href="/docs/Theta/ThetaConfidenceIntervals.html">•Confidence Intervals Notes</a></li>
<li><a href="/docs/Theta/ThetaMergingAlgorithm.html">•Merging Algorithm Notes</a></li>
<li><a href="/docs/Theta/ThetaReferences.html">•Theta References</a></li>
</div>
</div>
<p id="tuple-sketches">
<a data-toggle="collapse" class="menu collapsed" href="#collapse_tuple_sketches">Tuple Sketches</a>
</p>
<div class="collapse" id="collapse_tuple_sketches">
<li><a href="/docs/Tuple/TupleOverview.html">•Tuple Overview</a></li>
<p id="tuple-examples">
<a data-toggle="collapse" class="menu collapsed" href="#collapse_tuple_examples">Tuple Examples</a>
</p>
<div class="collapse" id="collapse_tuple_examples">
<li><a href="/docs/Tuple/TupleJavaExample.html">•Tuple Java Example</a></li>
<li><a href="/docs/Tuple/TupleEngagementExample.html">•Tuple Engagement Example</a></li>
<li><a href="/docs/Tuple/TuplePigUDFs.html">•Tuple Pig UDFs</a></li>
<li><a href="/docs/Tuple/TupleHiveUDFs.html">•Tuple Hive UDFs</a></li>
</div>
</div>
</div>
<p id="most-frequent">
<a data-toggle="collapse" class="menu collapsed" href="#collapse_most_frequent">Most Frequent</a>
</p>
<div class="collapse" id="collapse_most_frequent">
<li><a href="/docs/Frequency/FrequencySketchesOverview.html">•Frequency Sketches Overview</a></li>
<p id="frequent-item-sketches">
<a data-toggle="collapse" class="menu collapsed" href="#collapse_frequent_item_sketches">Frequent Item Sketches</a>
</p>
<div class="collapse" id="collapse_frequent_item_sketches">
<li><a href="/docs/Frequency/FrequentItemsOverview.html">•Frequent Items Overview</a></li>
<li><a href="/docs/Frequency/FrequentItemsErrorTable.html">•Frequent Items Error Table</a></li>
<li><a href="/docs/Frequency/FrequentItemsReferences.html">•Frequent Items References</a></li>
<li><a href="/docs/Frequency/FrequentItemsPerformance.html">•Frequent Items Performance</a></li>
<p id="most-frequent-examples">
<a data-toggle="collapse" class="menu collapsed" href="#collapse_most_frequent_examples">Most Frequent Examples</a>
</p>
<div class="collapse" id="collapse_most_frequent_examples">
<li><a href="/docs/Frequency/FrequentItemsJavaExample.html">•Frequent Items Java Example</a></li>
<li><a href="/docs/Frequency/FrequentItemsCppExample.html">•Frequent Items C++ Example</a></li>
<li><a href="/docs/Frequency/FrequentItemsPigUDFs.html">•Frequent Items Pig UDFs</a></li>
<li><a href="/docs/Frequency/FrequentItemsHiveUDFs.html">•Frequent Items Hive UDFs</a></li>
</div>
</div>
<p id="frequent-distinct-sketches">
<a data-toggle="collapse" class="menu collapsed" href="#collapse_frequent_distinct_sketches">Frequent Distinct Sketches</a>
</p>
<div class="collapse" id="collapse_frequent_distinct_sketches">
<li><a href="/docs/Frequency/FrequentDistinctTuplesSketch.html">•Frequent Distinct Tuples Sketch</a></li>
</div>
</div>
<p id="quantiles-and-histograms">
<a data-toggle="collapse" class="menu collapsed" href="#collapse_quantiles_and_histograms">Quantiles And Histograms</a>
</p>
<div class="collapse" id="collapse_quantiles_and_histograms">
<li><a href="/docs/Quantiles/SketchingQuantilesAndRanksTutorial.html">•Quantiles and Ranks Tutorial</a></li>
<li><a href="/docs/Quantiles/QuantilesOverview.html">•Quantiles Overview</a></li>
<li><a href="/docs/KLL/KLLSketch.html">•KLL Floats sketch</a></li>
<li><a href="/docs/KLL/KLLAccuracyAndSize.html">•KLL Sketch Accuracy and Size</a></li>
<li><a href="/docs/REQ/ReqSketch.html">•REQ Floats sketch</a></li>
<li><a href="/docs/Quantiles/OrigQuantilesSketch.html">•Original QuantilesSketch</a></li>
<p id="quantiles-examples">
<a data-toggle="collapse" class="menu collapsed" href="#collapse_quantiles_examples">Quantiles Examples</a>
</p>
<div class="collapse" id="collapse_quantiles_examples">
<li><a href="/docs/Quantiles/QuantilesJavaExample.html">•Quantiles Sketch Java Example</a></li>
<li><a href="/docs/KLL/KLLCppExample.html">•KLL Quantiles Sketch C++ Example</a></li>
<li><a href="/docs/Quantiles/QuantilesPigUDFs.html">•Quantiles Sketch Pig UDFs</a></li>
<li><a href="/docs/Quantiles/QuantilesHiveUDFs.html">•Quantiles Sketch Hive UDFs</a></li>
</div>
<p id="quantiles-studies">
<a data-toggle="collapse" class="menu collapsed" href="#collapse_quantiles_studies">Quantiles Studies</a>
</p>
<div class="collapse" id="collapse_quantiles_studies">
<li><a href="/docs/QuantilesStudies/DruidApproxHistogramStudy.html">•Druid Approximate Histogram</a></li>
<li><a href="/docs/QuantilesStudies/MomentsSketchStudy.html">•Moments Sketch Study</a></li>
<li><a href="/docs/QuantilesStudies/QuantilesStreamAStudy.html">•Quantiles StreamA Study</a></li>
<li><a href="/docs/QuantilesStudies/ExactQuantiles.html">•Exact Quantiles for Studies</a></li>
</div>
<p id="quantiles-sketch-theory">
<a data-toggle="collapse" class="menu collapsed" href="#collapse_quantiles_sketch_theory">Quantiles Sketch Theory</a>
</p>
<div class="collapse" id="collapse_quantiles_sketch_theory">
<li><a href="https://github.com/apache/datasketches-website/tree/master/docs/pdf/Quantiles_KLL.pdf">•Optimal Quantile Approximation in Streams</a></li>
<li><a href="/docs/Quantiles/QuantilesReferences.html">•Quantiles References</a></li>
</div>
</div>
<p id="sampling">
<a data-toggle="collapse" class="menu collapsed" href="#collapse_sampling">Sampling</a>
</p>
<div class="collapse" id="collapse_sampling">
<li><a href="/docs/Sampling/ReservoirSampling.html">•Reservoir Sampling</a></li>
<li><a href="/docs/Sampling/ReservoirSamplingPerformance.html">•Reservoir Sampling Performance</a></li>
<li><a href="/docs/Sampling/VarOptSampling.html">•VarOpt Sampling</a></li>
<p id="sampling-examples">
<a data-toggle="collapse" class="menu collapsed" href="#collapse_sampling_examples">Sampling Examples</a>
</p>
<div class="collapse" id="collapse_sampling_examples">
<li><a href="/docs/Sampling/ReservoirSamplingJava.html">•Reservoir Sampling Java Example</a></li>
<li><a href="/docs/Sampling/ReservoirSamplingPigUDFs.html">•Reservoir Sampling Pig UDFs</a></li>
<li><a href="/docs/Sampling/VarOptSamplingJava.html">•VarOpt Sampling Java Example</a></li>
<li><a href="/docs/Sampling/VarOptPigUDFs.html">•VarOpt Sampling Pig UDFs</a></li>
</div>
</div>
</div>
<p id="system-integrations">
<a data-toggle="collapse" class="menu collapsed" href="#collapse_system_integrations">System Integrations</a>
</p>
<div class="collapse" id="collapse_system_integrations">
<li><a href="/docs/SystemIntegrations/ApacheDruidIntegration.html">•Using Sketches in ApacheDruid</a></li>
<li><a href="/docs/SystemIntegrations/ApacheHiveIntegration.html">•Using Sketches in Apache Hive</a></li>
<li><a href="/docs/SystemIntegrations/ApachePigIntegration.html">•Using Sketches in Apache Pig</a></li>
<li><a href="/docs/SystemIntegrations/PostgreSQLIntegration.html">•Using Sketches in PostgreSQL</a></li>
</div>
<p id="community">
<a data-toggle="collapse" class="menu collapsed" href="#collapse_community">Community</a>
</p>
<div class="collapse" id="collapse_community">
<li><a href="/docs/Community/index.html">•Community</a></li>
<li><a href="/docs/Community/Downloads.html">•Downloads</a></li>
<li><a href="/docs/Community/NewCommitterProcess.html">•Committer Process</a></li>
<li><a href="/docs/Community/ReleaseProcessForCppComponents.html">•Release Process For CPP Components</a></li>
<li><a href="/docs/Community/ReleaseProcessForJavaComponents.html">•Release Process For Java Components</a></li>
<li><a href="/docs/Community/Transitioning.html">•Transitioning from prior GitHub Site</a></li>
</div>
<p id="research">
<a data-toggle="collapse" class="menu collapsed" href="#collapse_research">Research</a>
</p>
<div class="collapse" id="collapse_research">
<li><a href="/docs/Community/Research.html">•Research</a></li>
</div>
</div>
<!-- End _includes/toc.html -->
<!-- Start _includes/tocScript.html -->
<script>
(function () {
var findLineItem = function (path) {
return document.querySelector(`#toc [href="${path}"]`);
};
function findNavItem(path) {
return document.querySelector(`.nav [href="${path}"]`);
}
var highlighLineItem = function (element) {
element.classList.add('highlight');
};
var checkHasClass = function (element, className) {
return element.className.split(' ').find(function (item) { return item === className || '' })
}
var findAllCollapseParents = function (element) {
var collapseMenus = [];
var elementPointer = element;
while (elementPointer !== document.body) {
if (checkHasClass(elementPointer, 'collapse')) {
collapseMenus.push(elementPointer);
}
elementPointer = elementPointer.parentElement
}
return collapseMenus
};
var openMenuItem = function (element) {
// $(element).collapse('show') would start a transition, adding `in` class instead.
element.classList.add('in');
};
var openAllFromList = function (elementList) {
elementList.forEach(openMenuItem);
};
var highlightAndOpenMenu = function () {
// Highlight & expand nav item in the TOC
var currentLineItem = findLineItem(document.location.pathname);
highlighLineItem(currentLineItem);
openAllFromList(findAllCollapseParents(currentLineItem));
// Highlight nav item in top navigation
highlighLineItem(findNavItem(document.location.pathname));
};
$(highlightAndOpenMenu);
}());
</script>
<!-- End _includes/tocScript.html -->
</div>
<!-- End ToC Block -->
<div class="col-md-9 doc-content">
<!--
Licensed to the Apache Software Foundation (ASF) under one
or more contributor license agreements. See the NOTICE file
distributed with this work for additional information
regarding copyright ownership. The ASF licenses this file
to you under the Apache License, Version 2.0 (the
"License"); you may not use this file except in compliance
with the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing,
software distributed under the License is distributed on an
"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
KIND, either express or implied. See the License for the
specific language governing permissions and limitations
under the License.
-->
<h2 id="example-of-using-thetasketch-in-spark">Example of using ThetaSketch in Spark</h2>
<p>The key idea with respect to performance here is to arrange a two-phase process. In the first phase all input is partitioned by Spark and sent to executors. One sketch is created per partition (or per dimensional combination in that partition) and updated with all the input without serializing the sketch until the end of the phase. In the second phase the sketches from the first phase are merged. Therefore serialization would happen only between the phases to transfer the results of the first phase to the executors performing the second phase. In the code examples below we convert UpdateSketches to CompactSketches during serialization, which results in transferring less data, and also serves as a proof that no serialization is done during the first phase since the deserialized sketches cannot be updated, but only merged.</p>
<p>Building one sketch using old Spark API:</p>
<div class="highlighter-rouge"><div class="highlight"><pre class="highlight"><code>import org.apache.spark.SparkContext;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.function.Function2;
import org.apache.datasketches.theta.PairwiseSetOperations;
import org.apache.datasketches.theta.CompactSketch;
import org.apache.datasketches.theta.UpdateSketch;
public class Aggregate {
public static void main(final String[] args) {
final SparkConf conf = new SparkConf();
final JavaSparkContext context = new JavaSparkContext(conf);
final JavaRDD&lt;String&gt; lines = context.textFile("words.txt"); // one word per line
final ThetaSketchJavaSerializable initialValue = new ThetaSketchJavaSerializable();
final ThetaSketchJavaSerializable sketch = lines.aggregate(initialValue, new Add(), new Combine());
System.out.println("Unique count: " + String.format("%,f", sketch.getEstimate()));
}
static class Add implements Function2&lt;ThetaSketchJavaSerializable, String, ThetaSketchJavaSerializable&gt; {
public ThetaSketchJavaSerializable call(final ThetaSketchJavaSerializable sketch, final String value) throws Exception {
sketch.update(value);
return sketch;
}
}
static class Combine implements Function2&lt;ThetaSketchJavaSerializable, ThetaSketchJavaSerializable, ThetaSketchJavaSerializable&gt; {
static final ThetaSketchJavaSerializable emptySketchWrapped = new ThetaSketchJavaSerializable(UpdateSketch.builder().build().compact());
public ThetaSketchJavaSerializable call(final ThetaSketchJavaSerializable sketch1, final ThetaSketchJavaSerializable sketch2) throws Exception {
if (sketch1.getSketch() == null &amp;&amp; sketch2.getSketch() == null) return emptySketchWrapped;
if (sketch1.getSketch() == null) return sketch2;
if (sketch2.getSketch() == null) return sketch1;
final CompactSketch compactSketch1 = sketch1.getCompactSketch();
final CompactSketch compactSketch2 = sketch2.getCompactSketch();
return new ThetaSketchJavaSerializable(PairwiseSetOperations.union(compactSketch1, compactSketch2));
}
}
}
</code></pre></div></div>
<p>Wrapper to make ThetaSketch serializable:</p>
<div class="highlighter-rouge"><div class="highlight"><pre class="highlight"><code>import java.io.ObjectInputStream;
import java.io.ObjectOutputStream;
import java.io.IOException;
import java.io.Serializable;
import org.apache.datasketches.theta.Sketch;
import org.apache.datasketches.theta.UpdateSketch;
import org.apache.datasketches.theta.CompactSketch;
import org.apache.datasketches.theta.Sketches;
import org.apache.datasketches.memory.Memory;
public class ThetaSketchJavaSerializable implements Serializable {
private Sketch sketch;
public ThetaSketchJavaSerializable() {
}
public ThetaSketchJavaSerializable(final Sketch sketch) {
this.sketch = sketch;
}
public Sketch getSketch() {
return sketch;
}
public CompactSketch getCompactSketch() {
if (sketch == null) return null;
if (sketch instanceof UpdateSketch) return ((UpdateSketch) sketch).compact();
return (CompactSketch) sketch;
}
public void update(final String value) {
if (sketch == null) sketch = UpdateSketch.builder().build();
if (sketch instanceof UpdateSketch) {
((UpdateSketch) sketch).update(value);
} else {
throw new RuntimeException("update() on read-only sketch");
}
}
public double getEstimate() {
if (sketch == null) return 0.0;
return sketch.getEstimate();
}
private void writeObject(final ObjectOutputStream out) throws IOException {
if (sketch == null) {
out.writeInt(0);
return;
}
final byte[] serializedSketchBytes = ((UpdateSketch) sketch).compact().toByteArray();
out.writeInt(serializedSketchBytes.length);
out.write(serializedSketchBytes);
}
private void readObject(final ObjectInputStream in) throws IOException, ClassNotFoundException {
final int length = in.readInt();
if (length == 0) return;
final byte[] serializedSketchBytes = new byte[length];
in.readFully(serializedSketchBytes);
sketch = Sketches.wrapSketch(Memory.wrap(serializedSketchBytes));
}
}
</code></pre></div></div>
<p>Building one sketch using new Spark 2.x API and reading input from a Hive table:</p>
<div class="highlighter-rouge"><div class="highlight"><pre class="highlight"><code>import org.apache.spark.sql.SparkSession;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.datasketches.theta.PairwiseSetOperations;
import org.apache.datasketches.theta.Sketch;
import org.apache.datasketches.theta.UpdateSketch;
import org.apache.datasketches.theta.CompactSketch;
import org.apache.spark.api.java.function.MapPartitionsFunction;
import org.apache.spark.api.java.function.ReduceFunction;
import org.apache.spark.sql.expressions.Aggregator;
import org.apache.spark.sql.Encoder;
import org.apache.spark.sql.Encoders;
import scala.Tuple2;
import java.util.Iterator;
import java.util.Arrays;
public class Spark2DatasetMapPartitionsReduceJavaSerialization {
public static void main(String[] args) {
final SparkSession spark = SparkSession
.builder()
.appName("Spark2Aggregate")
.enableHiveSupport()
.getOrCreate();
final Dataset&lt;Row&gt; data = spark.sql("select userid from my_data where userid is not null");
final Dataset&lt;ThetaSketchJavaSerializable&gt; mappedData = data.mapPartitions(new MapPartitionsFunction&lt;Row, ThetaSketchJavaSerializable&gt;() {
@Override
public Iterator&lt;ThetaSketchJavaSerializable&gt; call(Iterator&lt;Row&gt; it) {
ThetaSketchJavaSerializable sketch = new ThetaSketchJavaSerializable();
while (it.hasNext()) {
sketch.update((String) it.next().get(0));
}
return Arrays.asList(sketch).iterator();
}
}, Encoders.javaSerialization(ThetaSketchJavaSerializable.class));
final ThetaSketchJavaSerializable sketch = mappedData.reduce(new ReduceFunction&lt;ThetaSketchJavaSerializable&gt;() {
@Override
public ThetaSketchJavaSerializable call(ThetaSketchJavaSerializable sketch1, ThetaSketchJavaSerializable sketch2) throws Exception {
if (sketch1.getSketch() == null &amp;&amp; sketch2.getSketch() == null) return new ThetaSketchJavaSerializable(UpdateSketch.builder().build().compact());
if (sketch1.getSketch() == null) return sketch2;
if (sketch2.getSketch() == null) return sketch1;
final CompactSketch compactSketch1 = sketch1.getCompactSketch();
final CompactSketch compactSketch2 = sketch2.getCompactSketch();
return new ThetaSketchJavaSerializable(PairwiseSetOperations.union(compactSketch1, compactSketch2));
}
});
System.out.println("Unique count: " + String.format("%,f", sketch.getEstimate()));
spark.stop();
}
}
</code></pre></div></div>
<p>Building multiple sketches (one sketch per key or dimension):</p>
<div class="highlighter-rouge"><div class="highlight"><pre class="highlight"><code>import org.apache.spark.SparkContext;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.function.Function2;
import org.apache.spark.api.java.function.PairFlatMapFunction;
import org.apache.datasketches.theta.PairwiseSetOperations;
import org.apache.datasketches.theta.CompactSketch;
import org.apache.datasketches.theta.UpdateSketch;
import java.util.List;
import java.util.ArrayList;
import java.util.Map;
import java.util.HashMap;
import java.util.Iterator;
import scala.Tuple2;
public class MapPartitionsToPairReduceByKey {
static final ThetaSketchJavaSerializable emptySketchWrapped = new ThetaSketchJavaSerializable(UpdateSketch.builder().build().compact());
public static void main(final String[] args) {
final SparkConf conf = new SparkConf();
final JavaSparkContext context = new JavaSparkContext(conf);
final JavaRDD&lt;String&gt; lines = context.textFile("agg-by-key-input.txt"); // format: key\tvalue
final JavaPairRDD&lt;String, ThetaSketchJavaSerializable&gt; mappedSketches = lines.mapPartitionsToPair(
new PairFlatMapFunction&lt;Iterator&lt;String&gt;, String, ThetaSketchJavaSerializable&gt;() {
@Override
public Iterator&lt;Tuple2&lt;String, ThetaSketchJavaSerializable&gt;&gt; call(final Iterator&lt;String&gt; input) {
// This map might be too big if there are too many keys in the input data
// One possible solution is to set a threshold on the number of entries
// and flush the HashMap once the threshold is reached (not shown here).
final Map&lt;String, ThetaSketchJavaSerializable&gt; map = new HashMap();
while (input.hasNext()) {
final String line = input.next();
final String[] tokens = line.split("\t");
ThetaSketchJavaSerializable sketch = map.get(tokens[0]);
if (sketch == null) {
sketch = new ThetaSketchJavaSerializable();
map.put(tokens[0], sketch);
}
sketch.update(tokens[1]);
}
final List&lt;Tuple2&lt;String, ThetaSketchJavaSerializable&gt;&gt; list = new ArrayList();
for (final Map.Entry&lt;String, ThetaSketchJavaSerializable&gt; entry: map.entrySet()) {
list.add(new Tuple2(entry.getKey(), entry.getValue()));
}
return list.iterator();
}
}
);
final JavaPairRDD&lt;String, ThetaSketchJavaSerializable&gt; sketches = mappedSketches.reduceByKey(
new Function2&lt;ThetaSketchJavaSerializable, ThetaSketchJavaSerializable, ThetaSketchJavaSerializable&gt;() {
@Override
public ThetaSketchJavaSerializable call(final ThetaSketchJavaSerializable sketch1, final ThetaSketchJavaSerializable sketch2) {
if (sketch1.getSketch() == null &amp;&amp; sketch2.getSketch() == null) return emptySketchWrapped;
if (sketch1.getSketch() == null) return sketch2;
if (sketch2.getSketch() == null) return sketch1;
final CompactSketch compactSketch1 = sketch1.getCompactSketch();
final CompactSketch compactSketch2 = sketch2.getCompactSketch();
return new ThetaSketchJavaSerializable(PairwiseSetOperations.union(compactSketch1, compactSketch2));
}
}, 1 // number of output partitions
);
final Iterator&lt;Tuple2&lt;String, ThetaSketchJavaSerializable&gt;&gt; it = sketches.toLocalIterator();
while (it.hasNext()) {
final Tuple2&lt;String, ThetaSketchJavaSerializable&gt; pair = it.next();
System.out.println("Pair: (" + pair._1 + ", " + pair._2.getEstimate() + ")");
}
}
}
</code></pre></div></div>
<p>Building multiple sketches using SparkSession and reading input from a Hive table:</p>
<div class="highlighter-rouge"><div class="highlight"><pre class="highlight"><code>import org.apache.spark.sql.SparkSession;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.function.PairFlatMapFunction;
import org.apache.spark.api.java.function.Function2;
import org.apache.datasketches.theta.PairwiseSetOperations;
import org.apache.datasketches.theta.UpdateSketch;
import org.apache.datasketches.theta.CompactSketch;
import scala.Tuple2;
import java.util.List;
import java.util.ArrayList;
import java.util.Iterator;
public class AggregateByKey2 {
public static void main(String[] args) {
SparkSession spark = SparkSession
.builder()
.appName("AggregateByKey2")
.enableHiveSupport()
.getOrCreate();
Dataset&lt;Row&gt; data = spark.sql("select country, userid from my_data where userid is not null");
final JavaPairRDD&lt;String, String&gt; pairs = data.javaRDD().mapPartitionsToPair(
new PairFlatMapFunction&lt;Iterator&lt;Row&gt;, String, String&gt;() {
@Override
public Iterator&lt;Tuple2&lt;String, String&gt;&gt; call(final Iterator&lt;Row&gt; input) {
final List&lt;Tuple2&lt;String, String&gt;&gt; list = new ArrayList();
while (input.hasNext()) {
final Row row = input.next();
list.add(new Tuple2&lt;String, String&gt;((String) row.get(0), (String) row.get(1)));
}
return list.iterator();
}
}
);
final JavaPairRDD&lt;String, ThetaSketchJavaSerializable&gt; sketches = pairs.aggregateByKey(
new ThetaSketchJavaSerializable(),
1, // number of partitions
new Add(),
new Combine()
);
final Iterator&lt;Tuple2&lt;String, ThetaSketchJavaSerializable&gt;&gt; it = sketches.toLocalIterator();
while (it.hasNext()) {
final Tuple2&lt;String, ThetaSketchJavaSerializable&gt; pair = it.next();
System.out.println("Pair: (" + pair._1 + ", " + pair._2.getEstimate() + ")");
}
spark.stop();
}
static class Add implements Function2&lt;ThetaSketchJavaSerializable, String, ThetaSketchJavaSerializable&gt; {
@Override
public ThetaSketchJavaSerializable call(final ThetaSketchJavaSerializable sketch, final String value) throws Exception {
sketch.update(value);
return sketch;
}
}
static class Combine implements Function2&lt;ThetaSketchJavaSerializable, ThetaSketchJavaSerializable, ThetaSketchJavaSerializable&gt; {
static final ThetaSketchJavaSerializable emptySketchWrapped = new ThetaSketchJavaSerializable(UpdateSketch.builder().build().compact());
public ThetaSketchJavaSerializable call(final ThetaSketchJavaSerializable sketch1, final ThetaSketchJavaSerializable sketch2) throws Exception {
if (sketch1.getSketch() == null &amp;&amp; sketch2.getSketch() == null) return emptySketchWrapped;
if (sketch1.getSketch() == null) return sketch2;
if (sketch2.getSketch() == null) return sketch1;
final CompactSketch compactSketch1 = sketch1.getCompactSketch();
final CompactSketch compactSketch2 = sketch2.getCompactSketch();
return new ThetaSketchJavaSerializable(PairwiseSetOperations.union(compactSketch1, compactSketch2));
}
}
}
</code></pre></div></div>
</div> <!-- End content -->
</div> <!-- End row -->
</div> <!-- End Container -->
<!-- Start _include/page_footer.html -->
<footer class="ds-footer">
<div class="container">
<div class="text-center">
<p>
<div>Copyright © 2024 <a href="https://www.apache.org">Apache Software Foundation</a>,
Licensed under the Apache License, Version 2.0. All Rights Reserved.
| <a href="https://privacy.apache.org/policies/privacy-policy-public.html">Privacy Policy</a><br/>
Apache DataSketches, Apache, the Apache feather logo, and the Apache DataSketches project logos are trademarks of The Apache Software Foundation.<br/>
All other marks mentioned may be trademarks or registered trademarks of their respective owners.
</div>
</p>
</div>
</div>
</footer>
<!-- End _include/page_footer.html -->
</body>
</html>
<!-- End _layouts/doc_page.html-->