blob: 8702d120f2376261fac641763c31026db1395156 [file] [log] [blame]
<!DOCTYPE html>
<html class="writer-html5" lang="en" data-content_root="../">
<head>
<meta charset="utf-8" /><meta name="viewport" content="width=device-width, initial-scale=1" />
<meta name="viewport" content="width=device-width, initial-scale=1.0" />
<title>Exact and Bounded, Probabilitiy Proportional to Size (EBPPS) Sampling &mdash; datasketches 0.1 documentation</title>
<link rel="stylesheet" type="text/css" href="../_static/pygments.css?v=b86133f3" />
<link rel="stylesheet" type="text/css" href="../_static/css/theme.css?v=e59714d7" />
<script src="../_static/jquery.js?v=5d32c60e"></script>
<script src="../_static/_sphinx_javascript_frameworks_compat.js?v=2cd50e6c"></script>
<script src="../_static/documentation_options.js?v=2709fde1"></script>
<script src="../_static/doctools.js?v=9bcbadda"></script>
<script src="../_static/sphinx_highlight.js?v=dc90522c"></script>
<script src="../_static/js/theme.js"></script>
<link rel="index" title="Index" href="../genindex.html" />
<link rel="search" title="Search" href="../search.html" />
<link rel="next" title="Helper Classes" href="../helper/index.html" />
<link rel="prev" title="Variance Optimal Sampling (VarOpt)" href="varopt.html" />
</head>
<body class="wy-body-for-nav">
<div class="wy-grid-for-nav">
<nav data-toggle="wy-nav-shift" class="wy-nav-side">
<div class="wy-side-scroll">
<div class="wy-side-nav-search" >
<a href="../index.html" class="icon icon-home">
datasketches
</a>
<div role="search">
<form id="rtd-search-form" class="wy-form" action="../search.html" method="get">
<input type="text" name="q" placeholder="Search docs" aria-label="Search docs" />
<input type="hidden" name="check_keywords" value="yes" />
<input type="hidden" name="area" value="default" />
</form>
</div>
</div><div class="wy-menu wy-menu-vertical" data-spy="affix" role="navigation" aria-label="Navigation menu">
<ul>
<li class="toctree-l1"><a class="reference internal" href="../distinct_counting/index.html">Distinct Counting</a></li>
</ul>
<ul>
<li class="toctree-l1"><a class="reference internal" href="../quantiles/index.html">Quantiles Sketches</a></li>
</ul>
<ul>
<li class="toctree-l1"><a class="reference internal" href="../frequency/index.html">Frequency Sketches</a></li>
</ul>
<ul>
<li class="toctree-l1"><a class="reference internal" href="../vector/index.html">Vector Sketches</a></li>
</ul>
<ul class="current">
<li class="toctree-l1 current"><a class="reference internal" href="index.html">Random Sampling Sketches</a><ul class="current">
<li class="toctree-l2"><a class="reference internal" href="varopt.html">Variance Optimal Sampling (VarOpt)</a></li>
<li class="toctree-l2 current"><a class="current reference internal" href="#">Exact and Bounded, Probabilitiy Proportional to Size (EBPPS) Sampling</a><ul>
<li class="toctree-l3"><a class="reference internal" href="#datasketches.ebpps_sketch"><code class="docutils literal notranslate"><span class="pre">ebpps_sketch</span></code></a><ul>
<li class="toctree-l4"><a class="reference internal" href="#datasketches.ebpps_sketch.deserialize"><code class="docutils literal notranslate"><span class="pre">ebpps_sketch.deserialize()</span></code></a></li>
<li class="toctree-l4"><a class="reference internal" href="#datasketches.ebpps_sketch.__init__"><code class="docutils literal notranslate"><span class="pre">ebpps_sketch.__init__()</span></code></a></li>
<li class="toctree-l4"><a class="reference internal" href="#datasketches.ebpps_sketch.c"><code class="docutils literal notranslate"><span class="pre">ebpps_sketch.c</span></code></a></li>
<li class="toctree-l4"><a class="reference internal" href="#datasketches.ebpps_sketch.get_serialized_size_bytes"><code class="docutils literal notranslate"><span class="pre">ebpps_sketch.get_serialized_size_bytes</span></code></a></li>
<li class="toctree-l4"><a class="reference internal" href="#datasketches.ebpps_sketch.is_empty"><code class="docutils literal notranslate"><span class="pre">ebpps_sketch.is_empty</span></code></a></li>
<li class="toctree-l4"><a class="reference internal" href="#datasketches.ebpps_sketch.k"><code class="docutils literal notranslate"><span class="pre">ebpps_sketch.k</span></code></a></li>
<li class="toctree-l4"><a class="reference internal" href="#datasketches.ebpps_sketch.merge"><code class="docutils literal notranslate"><span class="pre">ebpps_sketch.merge</span></code></a></li>
<li class="toctree-l4"><a class="reference internal" href="#datasketches.ebpps_sketch.n"><code class="docutils literal notranslate"><span class="pre">ebpps_sketch.n</span></code></a></li>
<li class="toctree-l4"><a class="reference internal" href="#datasketches.ebpps_sketch.serialize"><code class="docutils literal notranslate"><span class="pre">ebpps_sketch.serialize</span></code></a></li>
<li class="toctree-l4"><a class="reference internal" href="#datasketches.ebpps_sketch.to_string"><code class="docutils literal notranslate"><span class="pre">ebpps_sketch.to_string</span></code></a></li>
<li class="toctree-l4"><a class="reference internal" href="#datasketches.ebpps_sketch.update"><code class="docutils literal notranslate"><span class="pre">ebpps_sketch.update</span></code></a></li>
</ul>
</li>
</ul>
</li>
</ul>
</li>
</ul>
<ul>
<li class="toctree-l1"><a class="reference internal" href="../helper/index.html">Helper Classes</a></li>
</ul>
</div>
</div>
</nav>
<section data-toggle="wy-nav-shift" class="wy-nav-content-wrap"><nav class="wy-nav-top" aria-label="Mobile navigation menu" >
<i data-toggle="wy-nav-top" class="fa fa-bars"></i>
<a href="../index.html">datasketches</a>
</nav>
<div class="wy-nav-content">
<div class="rst-content">
<div role="navigation" aria-label="Page navigation">
<ul class="wy-breadcrumbs">
<li><a href="../index.html" class="icon icon-home" aria-label="Home"></a></li>
<li class="breadcrumb-item"><a href="index.html">Random Sampling Sketches</a></li>
<li class="breadcrumb-item active">Exact and Bounded, Probabilitiy Proportional to Size (EBPPS) Sampling</li>
<li class="wy-breadcrumbs-aside">
<a href="../_sources/sampling/ebpps.rst.txt" rel="nofollow"> View page source</a>
</li>
</ul>
<hr/>
</div>
<div role="main" class="document" itemscope="itemscope" itemtype="http://schema.org/Article">
<div itemprop="articleBody">
<section id="exact-and-bounded-probabilitiy-proportional-to-size-ebpps-sampling">
<h1>Exact and Bounded, Probabilitiy Proportional to Size (EBPPS) Sampling<a class="headerlink" href="#exact-and-bounded-probabilitiy-proportional-to-size-ebpps-sampling" title="Link to this heading"></a></h1>
<p>An EBPPS sketch produces a randome sample of data from a stream of items, ensuring that the probability
of including an item is always exactly equal to the item’s size. The size of an item is defined as its
weight relative to the total weight of all items seen so far by the sketch. In contrast to VarOpt sampling,
this sketch may return fewer than <cite>k</cite> items in order to keep the probability of including an item strictly
proportional to its size.</p>
<p>This sketch is based on: B. Hentschel, P. J. Haas, Y. Tian
“Exact PPS Sampling with Bounded Sample Size”,
Information Processing Letters, 2023.</p>
<p>EBPPS sampling is related to reservoir sampling, but handles unequal item weights.
Feeding the sketch items with a uniform weight value will produce a sample equivalent to reservoir sampling.</p>
<div class="admonition note">
<p class="admonition-title">Note</p>
<p>Serializing and deserializing this sketch requires the use of a <a class="reference internal" href="../helper/serde.html#datasketches.PyObjectSerDe" title="datasketches.PyObjectSerDe"><code class="xref py py-class docutils literal notranslate"><span class="pre">PyObjectSerDe</span></code></a>.</p>
</div>
<dl class="py class">
<dt class="sig sig-object py" id="datasketches.ebpps_sketch">
<em class="property"><span class="k"><span class="pre">class</span></span><span class="w"> </span></em><span class="sig-name descname"><span class="pre">ebpps_sketch</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="o"><span class="pre">*</span></span><span class="n"><span class="pre">args</span></span></em>, <em class="sig-param"><span class="o"><span class="pre">**</span></span><span class="n"><span class="pre">kwargs</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#datasketches.ebpps_sketch" title="Link to this definition"></a></dt>
<dd><p class="rubric">Static Methods:</p>
<dl class="py method">
<dt class="sig sig-object py" id="datasketches.ebpps_sketch.deserialize">
<span class="sig-name descname"><span class="pre">deserialize</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">bytes</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">bytes</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">serde</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><a class="reference internal" href="../helper/serde.html#datasketches.PyObjectSerDe" title="_datasketches.PyObjectSerDe"><span class="pre">_datasketches.PyObjectSerDe</span></a></span></em><span class="sig-paren">)</span> <span class="sig-return"><span class="sig-return-icon">&#x2192;</span> <span class="sig-return-typehint"><a class="reference internal" href="#datasketches.ebpps_sketch" title="_datasketches.ebpps_sketch"><span class="pre">_datasketches.ebpps_sketch</span></a></span></span><a class="headerlink" href="#datasketches.ebpps_sketch.deserialize" title="Link to this definition"></a></dt>
<dd><p>Reads a bytes object and returns the corresponding ebpps_sketch</p>
</dd></dl>
<p class="rubric">Non-static Methods:</p>
<dl class="py method">
<dt class="sig sig-object py" id="datasketches.ebpps_sketch.__init__">
<span class="sig-name descname"><span class="pre">__init__</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">self</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">k</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">int</span></span></em><span class="sig-paren">)</span> <span class="sig-return"><span class="sig-return-icon">&#x2192;</span> <span class="sig-return-typehint"><span class="pre">None</span></span></span><a class="headerlink" href="#datasketches.ebpps_sketch.__init__" title="Link to this definition"></a></dt>
<dd><p>Creates a new EBPPS sketch instance</p>
<dl class="field-list simple">
<dt class="field-odd">Parameters<span class="colon">:</span></dt>
<dd class="field-odd"><p><strong>k</strong> (<em>int</em>) – Maximum number of samples in the sketch</p>
</dd>
</dl>
</dd></dl>
<dl class="py property">
<dt class="sig sig-object py" id="datasketches.ebpps_sketch.c">
<em class="property"><span class="k"><span class="pre">property</span></span><span class="w"> </span></em><span class="sig-name descname"><span class="pre">c</span></span><a class="headerlink" href="#datasketches.ebpps_sketch.c" title="Link to this definition"></a></dt>
<dd><p>The expected number of samples returned upon a call to get_result() or the creation of an iterator. The number is a floating point value, where the fractional portion represents the probability of including a “partial item” from the sample. The value C should be no larger than the sketch’s configured value of k, although numerical precision limitations mean it may exceed k by double precision floating point error margins in certain cases.</p>
</dd></dl>
<dl class="py attribute">
<dt class="sig sig-object py" id="datasketches.ebpps_sketch.get_serialized_size_bytes">
<span class="sig-name descname"><span class="pre">get_serialized_size_bytes</span></span><a class="headerlink" href="#datasketches.ebpps_sketch.get_serialized_size_bytes" title="Link to this definition"></a></dt>
<dd><p>Computes the size in bytes needed to serialize the current sketch</p>
</dd></dl>
<dl class="py attribute">
<dt class="sig sig-object py" id="datasketches.ebpps_sketch.is_empty">
<span class="sig-name descname"><span class="pre">is_empty</span></span><a class="headerlink" href="#datasketches.ebpps_sketch.is_empty" title="Link to this definition"></a></dt>
<dd><p>Returns True if the sketch is empty, otherwise False</p>
</dd></dl>
<dl class="py property">
<dt class="sig sig-object py" id="datasketches.ebpps_sketch.k">
<em class="property"><span class="k"><span class="pre">property</span></span><span class="w"> </span></em><span class="sig-name descname"><span class="pre">k</span></span><a class="headerlink" href="#datasketches.ebpps_sketch.k" title="Link to this definition"></a></dt>
<dd><p>The sketch’s maximum configured sample size</p>
</dd></dl>
<dl class="py attribute">
<dt class="sig sig-object py" id="datasketches.ebpps_sketch.merge">
<span class="sig-name descname"><span class="pre">merge</span></span><a class="headerlink" href="#datasketches.ebpps_sketch.merge" title="Link to this definition"></a></dt>
<dd><p>Merges the sketch with the given sketch</p>
</dd></dl>
<dl class="py property">
<dt class="sig sig-object py" id="datasketches.ebpps_sketch.n">
<em class="property"><span class="k"><span class="pre">property</span></span><span class="w"> </span></em><span class="sig-name descname"><span class="pre">n</span></span><a class="headerlink" href="#datasketches.ebpps_sketch.n" title="Link to this definition"></a></dt>
<dd><p>The total stream length</p>
</dd></dl>
<dl class="py attribute">
<dt class="sig sig-object py" id="datasketches.ebpps_sketch.serialize">
<span class="sig-name descname"><span class="pre">serialize</span></span><a class="headerlink" href="#datasketches.ebpps_sketch.serialize" title="Link to this definition"></a></dt>
<dd><p>Serializes the sketch into a bytes object</p>
</dd></dl>
<dl class="py attribute">
<dt class="sig sig-object py" id="datasketches.ebpps_sketch.to_string">
<span class="sig-name descname"><span class="pre">to_string</span></span><a class="headerlink" href="#datasketches.ebpps_sketch.to_string" title="Link to this definition"></a></dt>
<dd><p>Produces a string summary of the sketch and optionally prints the items</p>
</dd></dl>
<dl class="py attribute">
<dt class="sig sig-object py" id="datasketches.ebpps_sketch.update">
<span class="sig-name descname"><span class="pre">update</span></span><a class="headerlink" href="#datasketches.ebpps_sketch.update" title="Link to this definition"></a></dt>
<dd><p>Updates the sketch with the given value and weight</p>
</dd></dl>
</dd></dl>
</section>
</div>
</div>
<footer><div class="rst-footer-buttons" role="navigation" aria-label="Footer">
<a href="varopt.html" class="btn btn-neutral float-left" title="Variance Optimal Sampling (VarOpt)" accesskey="p" rel="prev"><span class="fa fa-arrow-circle-left" aria-hidden="true"></span> Previous</a>
<a href="../helper/index.html" class="btn btn-neutral float-right" title="Helper Classes" accesskey="n" rel="next">Next <span class="fa fa-arrow-circle-right" aria-hidden="true"></span></a>
</div>
<hr/>
<div role="contentinfo">
<p>&#169; Copyright 2023.</p>
</div>
Built with <a href="https://www.sphinx-doc.org/">Sphinx</a> using a
<a href="https://github.com/readthedocs/sphinx_rtd_theme">theme</a>
provided by <a href="https://readthedocs.org">Read the Docs</a>.
</footer>
</div>
</div>
</section>
</div>
<script>
jQuery(function () {
SphinxRtdTheme.Navigation.enable(true);
});
</script>
</body>
</html>