| |
| |
| <!DOCTYPE html> |
| <html class="writer-html5" lang="en" data-content_root="../"> |
| <head> |
| <meta charset="utf-8" /><meta name="viewport" content="width=device-width, initial-scale=1" /> |
| |
| <meta name="viewport" content="width=device-width, initial-scale=1.0" /> |
| <title>Frequent Items — datasketches 0.1 documentation</title> |
| <link rel="stylesheet" type="text/css" href="../_static/pygments.css?v=b86133f3" /> |
| <link rel="stylesheet" type="text/css" href="../_static/css/theme.css?v=e59714d7" /> |
| |
| |
| <script src="../_static/jquery.js?v=5d32c60e"></script> |
| <script src="../_static/_sphinx_javascript_frameworks_compat.js?v=2cd50e6c"></script> |
| <script src="../_static/documentation_options.js?v=2709fde1"></script> |
| <script src="../_static/doctools.js?v=9bcbadda"></script> |
| <script src="../_static/sphinx_highlight.js?v=dc90522c"></script> |
| <script async="async" src="https://cdn.jsdelivr.net/npm/mathjax@3/es5/tex-mml-chtml.js"></script> |
| <script src="../_static/js/theme.js"></script> |
| <link rel="index" title="Index" href="../genindex.html" /> |
| <link rel="search" title="Search" href="../search.html" /> |
| <link rel="next" title="CountMin Sketch" href="count_min_sketch.html" /> |
| <link rel="prev" title="Frequency Sketches" href="index.html" /> |
| </head> |
| |
| <body class="wy-body-for-nav"> |
| <div class="wy-grid-for-nav"> |
| <nav data-toggle="wy-nav-shift" class="wy-nav-side"> |
| <div class="wy-side-scroll"> |
| <div class="wy-side-nav-search" > |
| |
| |
| |
| <a href="../index.html" class="icon icon-home"> |
| datasketches |
| </a> |
| <div role="search"> |
| <form id="rtd-search-form" class="wy-form" action="../search.html" method="get"> |
| <input type="text" name="q" placeholder="Search docs" aria-label="Search docs" /> |
| <input type="hidden" name="check_keywords" value="yes" /> |
| <input type="hidden" name="area" value="default" /> |
| </form> |
| </div> |
| </div><div class="wy-menu wy-menu-vertical" data-spy="affix" role="navigation" aria-label="Navigation menu"> |
| <ul> |
| <li class="toctree-l1"><a class="reference internal" href="../distinct_counting/index.html">Distinct Counting</a></li> |
| </ul> |
| <ul> |
| <li class="toctree-l1"><a class="reference internal" href="../quantiles/index.html">Quantiles Sketches</a></li> |
| </ul> |
| <ul class="current"> |
| <li class="toctree-l1 current"><a class="reference internal" href="index.html">Frequency Sketches</a><ul class="current"> |
| <li class="toctree-l2 current"><a class="current reference internal" href="#">Frequent Items</a><ul> |
| <li class="toctree-l3"><a class="reference internal" href="#datasketches.frequent_items_error_type"><code class="docutils literal notranslate"><span class="pre">frequent_items_error_type</span></code></a><ul> |
| <li class="toctree-l4"><a class="reference internal" href="#datasketches.frequent_items_error_type.NO_FALSE_POSITIVES"><code class="docutils literal notranslate"><span class="pre">frequent_items_error_type.NO_FALSE_POSITIVES</span></code></a></li> |
| <li class="toctree-l4"><a class="reference internal" href="#datasketches.frequent_items_error_type.NO_FALSE_NEGATIVES"><code class="docutils literal notranslate"><span class="pre">frequent_items_error_type.NO_FALSE_NEGATIVES</span></code></a></li> |
| </ul> |
| </li> |
| <li class="toctree-l3"><a class="reference internal" href="#datasketches.frequent_items_sketch"><code class="docutils literal notranslate"><span class="pre">frequent_items_sketch</span></code></a><ul> |
| <li class="toctree-l4"><a class="reference internal" href="#datasketches.frequent_items_sketch.deserialize"><code class="docutils literal notranslate"><span class="pre">frequent_items_sketch.deserialize()</span></code></a></li> |
| <li class="toctree-l4"><a class="reference internal" href="#datasketches.frequent_items_sketch.get_epsilon_for_lg_size"><code class="docutils literal notranslate"><span class="pre">frequent_items_sketch.get_epsilon_for_lg_size()</span></code></a></li> |
| <li class="toctree-l4"><a class="reference internal" href="#datasketches.frequent_items_sketch.get_apriori_error"><code class="docutils literal notranslate"><span class="pre">frequent_items_sketch.get_apriori_error()</span></code></a></li> |
| <li class="toctree-l4"><a class="reference internal" href="#datasketches.frequent_items_sketch.__init__"><code class="docutils literal notranslate"><span class="pre">frequent_items_sketch.__init__()</span></code></a></li> |
| <li class="toctree-l4"><a class="reference internal" href="#datasketches.frequent_items_sketch.epsilon"><code class="docutils literal notranslate"><span class="pre">frequent_items_sketch.epsilon</span></code></a></li> |
| <li class="toctree-l4"><a class="reference internal" href="#datasketches.frequent_items_sketch.get_estimate"><code class="docutils literal notranslate"><span class="pre">frequent_items_sketch.get_estimate</span></code></a></li> |
| <li class="toctree-l4"><a class="reference internal" href="#datasketches.frequent_items_sketch.get_frequent_items"><code class="docutils literal notranslate"><span class="pre">frequent_items_sketch.get_frequent_items</span></code></a></li> |
| <li class="toctree-l4"><a class="reference internal" href="#datasketches.frequent_items_sketch.get_lower_bound"><code class="docutils literal notranslate"><span class="pre">frequent_items_sketch.get_lower_bound</span></code></a></li> |
| <li class="toctree-l4"><a class="reference internal" href="#datasketches.frequent_items_sketch.get_serialized_size_bytes"><code class="docutils literal notranslate"><span class="pre">frequent_items_sketch.get_serialized_size_bytes</span></code></a></li> |
| <li class="toctree-l4"><a class="reference internal" href="#datasketches.frequent_items_sketch.get_upper_bound"><code class="docutils literal notranslate"><span class="pre">frequent_items_sketch.get_upper_bound</span></code></a></li> |
| <li class="toctree-l4"><a class="reference internal" href="#datasketches.frequent_items_sketch.is_empty"><code class="docutils literal notranslate"><span class="pre">frequent_items_sketch.is_empty</span></code></a></li> |
| <li class="toctree-l4"><a class="reference internal" href="#datasketches.frequent_items_sketch.merge"><code class="docutils literal notranslate"><span class="pre">frequent_items_sketch.merge</span></code></a></li> |
| <li class="toctree-l4"><a class="reference internal" href="#datasketches.frequent_items_sketch.num_active_items"><code class="docutils literal notranslate"><span class="pre">frequent_items_sketch.num_active_items</span></code></a></li> |
| <li class="toctree-l4"><a class="reference internal" href="#datasketches.frequent_items_sketch.serialize"><code class="docutils literal notranslate"><span class="pre">frequent_items_sketch.serialize</span></code></a></li> |
| <li class="toctree-l4"><a class="reference internal" href="#datasketches.frequent_items_sketch.to_string"><code class="docutils literal notranslate"><span class="pre">frequent_items_sketch.to_string</span></code></a></li> |
| <li class="toctree-l4"><a class="reference internal" href="#datasketches.frequent_items_sketch.total_weight"><code class="docutils literal notranslate"><span class="pre">frequent_items_sketch.total_weight</span></code></a></li> |
| <li class="toctree-l4"><a class="reference internal" href="#datasketches.frequent_items_sketch.update"><code class="docutils literal notranslate"><span class="pre">frequent_items_sketch.update</span></code></a></li> |
| </ul> |
| </li> |
| <li class="toctree-l3"><a class="reference internal" href="#datasketches.frequent_strings_sketch"><code class="docutils literal notranslate"><span class="pre">frequent_strings_sketch</span></code></a><ul> |
| <li class="toctree-l4"><a class="reference internal" href="#datasketches.frequent_strings_sketch.deserialize"><code class="docutils literal notranslate"><span class="pre">frequent_strings_sketch.deserialize()</span></code></a></li> |
| <li class="toctree-l4"><a class="reference internal" href="#datasketches.frequent_strings_sketch.get_epsilon_for_lg_size"><code class="docutils literal notranslate"><span class="pre">frequent_strings_sketch.get_epsilon_for_lg_size()</span></code></a></li> |
| <li class="toctree-l4"><a class="reference internal" href="#datasketches.frequent_strings_sketch.get_apriori_error"><code class="docutils literal notranslate"><span class="pre">frequent_strings_sketch.get_apriori_error()</span></code></a></li> |
| <li class="toctree-l4"><a class="reference internal" href="#datasketches.frequent_strings_sketch.__init__"><code class="docutils literal notranslate"><span class="pre">frequent_strings_sketch.__init__()</span></code></a></li> |
| <li class="toctree-l4"><a class="reference internal" href="#datasketches.frequent_strings_sketch.epsilon"><code class="docutils literal notranslate"><span class="pre">frequent_strings_sketch.epsilon</span></code></a></li> |
| <li class="toctree-l4"><a class="reference internal" href="#datasketches.frequent_strings_sketch.get_estimate"><code class="docutils literal notranslate"><span class="pre">frequent_strings_sketch.get_estimate</span></code></a></li> |
| <li class="toctree-l4"><a class="reference internal" href="#datasketches.frequent_strings_sketch.get_frequent_items"><code class="docutils literal notranslate"><span class="pre">frequent_strings_sketch.get_frequent_items</span></code></a></li> |
| <li class="toctree-l4"><a class="reference internal" href="#datasketches.frequent_strings_sketch.get_lower_bound"><code class="docutils literal notranslate"><span class="pre">frequent_strings_sketch.get_lower_bound</span></code></a></li> |
| <li class="toctree-l4"><a class="reference internal" href="#datasketches.frequent_strings_sketch.get_serialized_size_bytes"><code class="docutils literal notranslate"><span class="pre">frequent_strings_sketch.get_serialized_size_bytes</span></code></a></li> |
| <li class="toctree-l4"><a class="reference internal" href="#datasketches.frequent_strings_sketch.get_upper_bound"><code class="docutils literal notranslate"><span class="pre">frequent_strings_sketch.get_upper_bound</span></code></a></li> |
| <li class="toctree-l4"><a class="reference internal" href="#datasketches.frequent_strings_sketch.is_empty"><code class="docutils literal notranslate"><span class="pre">frequent_strings_sketch.is_empty</span></code></a></li> |
| <li class="toctree-l4"><a class="reference internal" href="#datasketches.frequent_strings_sketch.merge"><code class="docutils literal notranslate"><span class="pre">frequent_strings_sketch.merge</span></code></a></li> |
| <li class="toctree-l4"><a class="reference internal" href="#datasketches.frequent_strings_sketch.num_active_items"><code class="docutils literal notranslate"><span class="pre">frequent_strings_sketch.num_active_items</span></code></a></li> |
| <li class="toctree-l4"><a class="reference internal" href="#datasketches.frequent_strings_sketch.serialize"><code class="docutils literal notranslate"><span class="pre">frequent_strings_sketch.serialize</span></code></a></li> |
| <li class="toctree-l4"><a class="reference internal" href="#datasketches.frequent_strings_sketch.to_string"><code class="docutils literal notranslate"><span class="pre">frequent_strings_sketch.to_string</span></code></a></li> |
| <li class="toctree-l4"><a class="reference internal" href="#datasketches.frequent_strings_sketch.total_weight"><code class="docutils literal notranslate"><span class="pre">frequent_strings_sketch.total_weight</span></code></a></li> |
| <li class="toctree-l4"><a class="reference internal" href="#datasketches.frequent_strings_sketch.update"><code class="docutils literal notranslate"><span class="pre">frequent_strings_sketch.update</span></code></a></li> |
| </ul> |
| </li> |
| </ul> |
| </li> |
| <li class="toctree-l2"><a class="reference internal" href="count_min_sketch.html">CountMin Sketch</a></li> |
| </ul> |
| </li> |
| </ul> |
| <ul> |
| <li class="toctree-l1"><a class="reference internal" href="../vector/index.html">Vector Sketches</a></li> |
| </ul> |
| <ul> |
| <li class="toctree-l1"><a class="reference internal" href="../sampling/index.html">Random Sampling Sketches</a></li> |
| </ul> |
| <ul> |
| <li class="toctree-l1"><a class="reference internal" href="../helper/index.html">Helper Classes</a></li> |
| </ul> |
| |
| </div> |
| </div> |
| </nav> |
| |
| <section data-toggle="wy-nav-shift" class="wy-nav-content-wrap"><nav class="wy-nav-top" aria-label="Mobile navigation menu" > |
| <i data-toggle="wy-nav-top" class="fa fa-bars"></i> |
| <a href="../index.html">datasketches</a> |
| </nav> |
| |
| <div class="wy-nav-content"> |
| <div class="rst-content"> |
| <div role="navigation" aria-label="Page navigation"> |
| <ul class="wy-breadcrumbs"> |
| <li><a href="../index.html" class="icon icon-home" aria-label="Home"></a></li> |
| <li class="breadcrumb-item"><a href="index.html">Frequency Sketches</a></li> |
| <li class="breadcrumb-item active">Frequent Items</li> |
| <li class="wy-breadcrumbs-aside"> |
| <a href="../_sources/frequency/frequent_items.rst.txt" rel="nofollow"> View page source</a> |
| </li> |
| </ul> |
| <hr/> |
| </div> |
| <div role="main" class="document" itemscope="itemscope" itemtype="http://schema.org/Article"> |
| <div itemprop="articleBody"> |
| |
| <section id="frequent-items"> |
| <h1>Frequent Items<a class="headerlink" href="#frequent-items" title="Link to this heading"></a></h1> |
| <p>This sketch is useful for tracking approximate frequencies of items (<code class="docutils literal notranslate"><span class="pre">object</span></code> or <code class="docutils literal notranslate"><span class="pre">string</span></code>) with optional associated |
| integer counts that are members of a multiset of such items. |
| The true frequency of an item is defined to be the sum of associated counts.</p> |
| <p>This implementation provides the following capabilities:</p> |
| <ul class="simple"> |
| <li><p>Estimate the <em>frequency</em> of an item.</p></li> |
| <li><p>Return <em>upper</em> and <em>lower bounds</em> of any item, such that the true frequency is always between the upper and lower bounds.</p></li> |
| <li><p>Return a global <em>maximum error</em> that holds for all items in the stream.</p></li> |
| <li><p>Return an array of frequent items that qualify either a <em>NO_FALSE_POSITIVES</em> or a <em>NO_FALSE_NEGATIVES</em> error type.</p></li> |
| <li><p><em>Merge</em> itself with another sketch object created from this class.</p></li> |
| <li><p><em>Serialize/Deserialize</em> to/from a byte array.</p></li> |
| </ul> |
| <p><strong>Space Usage</strong></p> |
| <p>The sketch is initialized with a maximum map size, <code class="docutils literal notranslate"><span class="pre">maxMapSize</span></code>, that specifies the maximum physical length of the internal hash map of the form <code class="docutils literal notranslate"><span class="pre">(object</span> <span class="pre">item,</span> <span class="pre">int</span> <span class="pre">count)</span></code>. |
| The maximum map size is always a power of 2, defined through the variables <code class="docutils literal notranslate"><span class="pre">lg_max_map_size</span></code>.</p> |
| <p>The hash map starts at a very small size (8 entries) and grows as needed up to the specified maximum map size.</p> |
| <p>Excluding external space required for the item objects, the internal memory space usage of this sketch is <code class="docutils literal notranslate"><span class="pre">18</span> <span class="pre">*</span> <span class="pre">mapSize</span></code> bytes (assuming 8 bytes for each reference), |
| plus a small constant number of additional bytes. |
| The internal memory space usage of this sketch will never exceed <code class="docutils literal notranslate"><span class="pre">18</span> <span class="pre">*</span> <span class="pre">maxMapSize</span></code> bytes, plus a small constant number of additional bytes.</p> |
| <p><strong>Maximum Capacity of the Sketch</strong></p> |
| <p>The <code class="docutils literal notranslate"><span class="pre">LOAD_FACTOR</span></code> for the hash map is internally set at <span class="math notranslate nohighlight">\(75\%\)</span>, which means at any time the map capacity of <code class="docutils literal notranslate"><span class="pre">(item,</span> <span class="pre">count)</span></code> pairs is <code class="docutils literal notranslate"><span class="pre">mapCap</span> <span class="pre">=</span> <span class="pre">0.75</span> <span class="pre">*</span> <span class="pre">mapSize</span></code>. |
| The maximum capacity of <code class="docutils literal notranslate"><span class="pre">(item,</span> <span class="pre">count)</span></code> pairs of the sketch is <code class="docutils literal notranslate"><span class="pre">maxMapCap</span> <span class="pre">=</span> <span class="pre">0.75</span> <span class="pre">*</span> <span class="pre">maxMapSize</span></code>.</p> |
| <p><strong>Updating the sketch with ``(item, count)`` pairs</strong></p> |
| <p>If the item is found in the hash map, the mapped count field (the “counter”) is incremented by the incoming count; otherwise, a new counter <code class="docutils literal notranslate"><span class="pre">(item,</span> <span class="pre">count)</span></code> pair is created. |
| If the number of tracked counters reaches the maximum capacity of the hash map, the sketch decrements all of the counters (by an approximately computed median) |
| and removes any non-positive counters.</p> |
| <p><strong>Accuracy</strong></p> |
| <p>If fewer than <code class="docutils literal notranslate"><span class="pre">0.75</span> <span class="pre">*</span> <span class="pre">maxMapSize</span></code> different items are inserted into the sketch, the estimated frequencies returned by the sketch will be exact.</p> |
| <p>The logic of the frequent items sketch is such that the stored counts and true counts are never too different. |
| More specifically, for any item, the sketch can return an estimate of the true frequency of item, along with upper and lower bounds on the frequency (that hold deterministically).</p> |
| <p>For this implementation and for a specific active item, it is guaranteed that the true frequency will be between the Upper Bound (UB) and the Lower Bound (LB) computed for that item. |
| Specifically, <code class="docutils literal notranslate"><span class="pre">(UB-</span> <span class="pre">LB)</span> <span class="pre">≤</span> <span class="pre">W</span> <span class="pre">*</span> <span class="pre">epsilon</span></code>, where <span class="math notranslate nohighlight">\(W\)</span> denotes the sum of all item counts, and <span class="math notranslate nohighlight">\(epsilon = 3.5/M\)</span>, where <span class="math notranslate nohighlight">\(epsilon = M\)</span> is the maxMapSize.</p> |
| <p>This is a worst-case guarantee that applies to arbitrary inputs. |
| For inputs typically seen in practice, <code class="docutils literal notranslate"><span class="pre">(UB-LB)</span></code> is usually much smaller.</p> |
| <p><strong>Background</strong></p> |
| <p>This code implements a variant of what is commonly known as the “Misra-Gries algorithm”. |
| Variants of it were discovered and rediscovered and redesigned several times over the years:</p> |
| <ul class="simple"> |
| <li><p><em>Finding repeated elements</em>, Misra, Gries, 1982</p></li> |
| <li><p><em>Frequency estimation of Internet packet streams with limited space</em> Demaine, Lopez-Ortiz, Munro, 2002</p></li> |
| <li><p><em>A simple algorithm for finding frequent elements in streams and bags</em> Karp, Shenker, Papadimitriou, 2003</p></li> |
| <li><p><em>Efficient Computation of Frequent and Top-k Elements in Data Streams</em> Metwally, Agrawal, Abbadi, 2006</p></li> |
| </ul> |
| <p>For speed, we do employ some randomization that introduces a small probability that our proof of the worst-case bound might not apply to a given run. |
| However, we have ensured that this probability is extremely small. |
| For example, if the stream causes one table purge (rebuild), our proof of the worst-case bound applies with a probability of at least <cite>1 - 1E-14</cite>. |
| If the stream causes <code class="docutils literal notranslate"><span class="pre">1E9</span></code> purges, our proof applies with a probability of at least <code class="docutils literal notranslate"><span class="pre">1</span> <span class="pre">-</span> <span class="pre">1E-5</span></code>.</p> |
| <p>There are two flavors of Frequent Items Sketches, one with generic items (objects) and another specific to strings. |
| The string version is a legacy name from before the library supported generic objects and is retained |
| only for backwards compatibility.</p> |
| <div class="admonition note"> |
| <p class="admonition-title">Note</p> |
| <p>The <a class="reference internal" href="#datasketches.frequent_items_sketch" title="datasketches.frequent_items_sketch"><code class="xref py py-class docutils literal notranslate"><span class="pre">frequent_items_sketch</span></code></a> uses an input object’s <code class="docutils literal notranslate"><span class="pre">__hash__</span></code> and <code class="docutils literal notranslate"><span class="pre">__eq__</span></code> methods.</p> |
| </div> |
| <div class="admonition note"> |
| <p class="admonition-title">Note</p> |
| <p>Serializing and deserializing the <a class="reference internal" href="#datasketches.frequent_items_sketch" title="datasketches.frequent_items_sketch"><code class="xref py py-class docutils literal notranslate"><span class="pre">frequent_items_sketch</span></code></a> requires the use of a <a class="reference internal" href="../helper/serde.html#datasketches.PyObjectSerDe" title="datasketches.PyObjectSerDe"><code class="xref py py-class docutils literal notranslate"><span class="pre">PyObjectSerDe</span></code></a>.</p> |
| </div> |
| <dl class="py class"> |
| <dt class="sig sig-object py" id="datasketches.frequent_items_error_type"> |
| <em class="property"><span class="k"><span class="pre">class</span></span><span class="w"> </span></em><span class="sig-name descname"><span class="pre">frequent_items_error_type</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="o"><span class="pre">*</span></span><span class="n"><span class="pre">values</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#datasketches.frequent_items_error_type" title="Link to this definition"></a></dt> |
| <dd><dl class="py attribute"> |
| <dt class="sig sig-object py" id="datasketches.frequent_items_error_type.NO_FALSE_POSITIVES"> |
| <span class="sig-name descname"><span class="pre">NO_FALSE_POSITIVES</span></span><em class="property"><span class="w"> </span><span class="pre">:</span> <span class="pre">Returns</span> <span class="pre">only</span> <span class="pre">true</span> <span class="pre">positives</span> <span class="pre">but</span> <span class="pre">may</span> <span class="pre">miss</span> <span class="pre">some</span> <span class="pre">heavy</span> <span class="pre">hitters.</span></em><a class="headerlink" href="#datasketches.frequent_items_error_type.NO_FALSE_POSITIVES" title="Link to this definition"></a></dt> |
| <dd></dd></dl> |
| |
| <dl class="py attribute"> |
| <dt class="sig sig-object py" id="datasketches.frequent_items_error_type.NO_FALSE_NEGATIVES"> |
| <span class="sig-name descname"><span class="pre">NO_FALSE_NEGATIVES</span></span><em class="property"><span class="w"> </span><span class="pre">:</span> <span class="pre">Does</span> <span class="pre">not</span> <span class="pre">miss</span> <span class="pre">any</span> <span class="pre">heavy</span> <span class="pre">hitters</span> <span class="pre">but</span> <span class="pre">may</span> <span class="pre">return</span> <span class="pre">false</span> <span class="pre">positives.</span></em><a class="headerlink" href="#datasketches.frequent_items_error_type.NO_FALSE_NEGATIVES" title="Link to this definition"></a></dt> |
| <dd></dd></dl> |
| |
| </dd></dl> |
| |
| <dl class="py class"> |
| <dt class="sig sig-object py" id="datasketches.frequent_items_sketch"> |
| <em class="property"><span class="k"><span class="pre">class</span></span><span class="w"> </span></em><span class="sig-name descname"><span class="pre">frequent_items_sketch</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="o"><span class="pre">*</span></span><span class="n"><span class="pre">args</span></span></em>, <em class="sig-param"><span class="o"><span class="pre">**</span></span><span class="n"><span class="pre">kwargs</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#datasketches.frequent_items_sketch" title="Link to this definition"></a></dt> |
| <dd><p class="rubric">Static Methods:</p> |
| <dl class="py method"> |
| <dt class="sig sig-object py" id="datasketches.frequent_items_sketch.deserialize"> |
| <span class="sig-name descname"><span class="pre">deserialize</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">bytes</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">bytes</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">serde</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><a class="reference internal" href="../helper/serde.html#datasketches.PyObjectSerDe" title="_datasketches.PyObjectSerDe"><span class="pre">_datasketches.PyObjectSerDe</span></a></span></em><span class="sig-paren">)</span> <span class="sig-return"><span class="sig-return-icon">→</span> <span class="sig-return-typehint"><a class="reference internal" href="#datasketches.frequent_items_sketch" title="_datasketches.frequent_items_sketch"><span class="pre">_datasketches.frequent_items_sketch</span></a></span></span><a class="headerlink" href="#datasketches.frequent_items_sketch.deserialize" title="Link to this definition"></a></dt> |
| <dd><p>Reads a bytes object using the provided serde and returns the corresponding frequent_strings_sketch.</p> |
| </dd></dl> |
| |
| <dl class="py method"> |
| <dt class="sig sig-object py" id="datasketches.frequent_items_sketch.get_epsilon_for_lg_size"> |
| <span class="sig-name descname"><span class="pre">get_epsilon_for_lg_size</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">lg_max_map_size</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">int</span></span></em><span class="sig-paren">)</span> <span class="sig-return"><span class="sig-return-icon">→</span> <span class="sig-return-typehint"><span class="pre">float</span></span></span><a class="headerlink" href="#datasketches.frequent_items_sketch.get_epsilon_for_lg_size" title="Link to this definition"></a></dt> |
| <dd><p>Returns the epsilon value used to compute a priori error for a given log2(max_map_size)</p> |
| </dd></dl> |
| |
| <dl class="py method"> |
| <dt class="sig sig-object py" id="datasketches.frequent_items_sketch.get_apriori_error"> |
| <span class="sig-name descname"><span class="pre">get_apriori_error</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">lg_max_map_size</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">int</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">estimated_total_weight</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">int</span></span></em><span class="sig-paren">)</span> <span class="sig-return"><span class="sig-return-icon">→</span> <span class="sig-return-typehint"><span class="pre">float</span></span></span><a class="headerlink" href="#datasketches.frequent_items_sketch.get_apriori_error" title="Link to this definition"></a></dt> |
| <dd><p>Returns the estimated a priori error given the max_map_size for the sketch and the estimated_total_stream_weight.</p> |
| </dd></dl> |
| |
| <p class="rubric">Non-static Methods:</p> |
| <dl class="py method"> |
| <dt class="sig sig-object py" id="datasketches.frequent_items_sketch.__init__"> |
| <span class="sig-name descname"><span class="pre">__init__</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">self</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">lg_max_k</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">int</span></span></em><span class="sig-paren">)</span> <span class="sig-return"><span class="sig-return-icon">→</span> <span class="sig-return-typehint"><span class="pre">None</span></span></span><a class="headerlink" href="#datasketches.frequent_items_sketch.__init__" title="Link to this definition"></a></dt> |
| <dd><p>Creates an instance of the sketch</p> |
| <dl class="field-list simple"> |
| <dt class="field-odd">Parameters<span class="colon">:</span></dt> |
| <dd class="field-odd"><p><strong>lg_max_k</strong> (<em>int</em>) – base 2 logarithm of the maximum size of the internal hash map of the sketch. Maximum capacity is 0.75 of this value, which is the maximum number of distinct items the sketch can contain.</p> |
| </dd> |
| </dl> |
| </dd></dl> |
| |
| <dl class="py property"> |
| <dt class="sig sig-object py" id="datasketches.frequent_items_sketch.epsilon"> |
| <em class="property"><span class="k"><span class="pre">property</span></span><span class="w"> </span></em><span class="sig-name descname"><span class="pre">epsilon</span></span><a class="headerlink" href="#datasketches.frequent_items_sketch.epsilon" title="Link to this definition"></a></dt> |
| <dd><p>The epsilon value used by the sketch to compute error</p> |
| </dd></dl> |
| |
| <dl class="py attribute"> |
| <dt class="sig sig-object py" id="datasketches.frequent_items_sketch.get_estimate"> |
| <span class="sig-name descname"><span class="pre">get_estimate</span></span><a class="headerlink" href="#datasketches.frequent_items_sketch.get_estimate" title="Link to this definition"></a></dt> |
| <dd><p>Returns the estimate of the weight (frequency) of the given item. |
| Note: The true frequency of a item would be the sum of the counts as a result of the two update functions.</p> |
| </dd></dl> |
| |
| <dl class="py attribute"> |
| <dt class="sig sig-object py" id="datasketches.frequent_items_sketch.get_frequent_items"> |
| <span class="sig-name descname"><span class="pre">get_frequent_items</span></span><a class="headerlink" href="#datasketches.frequent_items_sketch.get_frequent_items" title="Link to this definition"></a></dt> |
| <dd></dd></dl> |
| |
| <dl class="py attribute"> |
| <dt class="sig sig-object py" id="datasketches.frequent_items_sketch.get_lower_bound"> |
| <span class="sig-name descname"><span class="pre">get_lower_bound</span></span><a class="headerlink" href="#datasketches.frequent_items_sketch.get_lower_bound" title="Link to this definition"></a></dt> |
| <dd><p>Returns the guaranteed lower bound weight (frequency) of the given item.</p> |
| </dd></dl> |
| |
| <dl class="py attribute"> |
| <dt class="sig sig-object py" id="datasketches.frequent_items_sketch.get_serialized_size_bytes"> |
| <span class="sig-name descname"><span class="pre">get_serialized_size_bytes</span></span><a class="headerlink" href="#datasketches.frequent_items_sketch.get_serialized_size_bytes" title="Link to this definition"></a></dt> |
| <dd><p>Computes the size needed to serialize the current state of the sketch using the provided serde. This can be expensive since every item needs to be looked at.</p> |
| </dd></dl> |
| |
| <dl class="py attribute"> |
| <dt class="sig sig-object py" id="datasketches.frequent_items_sketch.get_upper_bound"> |
| <span class="sig-name descname"><span class="pre">get_upper_bound</span></span><a class="headerlink" href="#datasketches.frequent_items_sketch.get_upper_bound" title="Link to this definition"></a></dt> |
| <dd><p>Returns the guaranteed upper bound weight (frequency) of the given item.</p> |
| </dd></dl> |
| |
| <dl class="py attribute"> |
| <dt class="sig sig-object py" id="datasketches.frequent_items_sketch.is_empty"> |
| <span class="sig-name descname"><span class="pre">is_empty</span></span><a class="headerlink" href="#datasketches.frequent_items_sketch.is_empty" title="Link to this definition"></a></dt> |
| <dd><p>Returns True if the sketch is empty, otherwise False</p> |
| </dd></dl> |
| |
| <dl class="py attribute"> |
| <dt class="sig sig-object py" id="datasketches.frequent_items_sketch.merge"> |
| <span class="sig-name descname"><span class="pre">merge</span></span><a class="headerlink" href="#datasketches.frequent_items_sketch.merge" title="Link to this definition"></a></dt> |
| <dd><p>Merges the given sketch into this one</p> |
| </dd></dl> |
| |
| <dl class="py property"> |
| <dt class="sig sig-object py" id="datasketches.frequent_items_sketch.num_active_items"> |
| <em class="property"><span class="k"><span class="pre">property</span></span><span class="w"> </span></em><span class="sig-name descname"><span class="pre">num_active_items</span></span><a class="headerlink" href="#datasketches.frequent_items_sketch.num_active_items" title="Link to this definition"></a></dt> |
| <dd><p>The number of active items in the sketch</p> |
| </dd></dl> |
| |
| <dl class="py attribute"> |
| <dt class="sig sig-object py" id="datasketches.frequent_items_sketch.serialize"> |
| <span class="sig-name descname"><span class="pre">serialize</span></span><a class="headerlink" href="#datasketches.frequent_items_sketch.serialize" title="Link to this definition"></a></dt> |
| <dd><p>Serializes the sketch into a bytes object using the provided serde.</p> |
| </dd></dl> |
| |
| <dl class="py attribute"> |
| <dt class="sig sig-object py" id="datasketches.frequent_items_sketch.to_string"> |
| <span class="sig-name descname"><span class="pre">to_string</span></span><a class="headerlink" href="#datasketches.frequent_items_sketch.to_string" title="Link to this definition"></a></dt> |
| <dd><p>Produces a string summary of the sketch</p> |
| </dd></dl> |
| |
| <dl class="py property"> |
| <dt class="sig sig-object py" id="datasketches.frequent_items_sketch.total_weight"> |
| <em class="property"><span class="k"><span class="pre">property</span></span><span class="w"> </span></em><span class="sig-name descname"><span class="pre">total_weight</span></span><a class="headerlink" href="#datasketches.frequent_items_sketch.total_weight" title="Link to this definition"></a></dt> |
| <dd><p>The sum of the weights (frequencies) in the stream seen so far by the sketch</p> |
| </dd></dl> |
| |
| <dl class="py attribute"> |
| <dt class="sig sig-object py" id="datasketches.frequent_items_sketch.update"> |
| <span class="sig-name descname"><span class="pre">update</span></span><a class="headerlink" href="#datasketches.frequent_items_sketch.update" title="Link to this definition"></a></dt> |
| <dd><p>Updates the sketch with the given string and, optionally, a weight</p> |
| </dd></dl> |
| |
| </dd></dl> |
| |
| <dl class="py class"> |
| <dt class="sig sig-object py" id="datasketches.frequent_strings_sketch"> |
| <em class="property"><span class="k"><span class="pre">class</span></span><span class="w"> </span></em><span class="sig-name descname"><span class="pre">frequent_strings_sketch</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="o"><span class="pre">*</span></span><span class="n"><span class="pre">args</span></span></em>, <em class="sig-param"><span class="o"><span class="pre">**</span></span><span class="n"><span class="pre">kwargs</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#datasketches.frequent_strings_sketch" title="Link to this definition"></a></dt> |
| <dd><p class="rubric">Static Methods:</p> |
| <dl class="py method"> |
| <dt class="sig sig-object py" id="datasketches.frequent_strings_sketch.deserialize"> |
| <span class="sig-name descname"><span class="pre">deserialize</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">bytes</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">bytes</span></span></em><span class="sig-paren">)</span> <span class="sig-return"><span class="sig-return-icon">→</span> <span class="sig-return-typehint"><a class="reference internal" href="#datasketches.frequent_strings_sketch" title="_datasketches.frequent_strings_sketch"><span class="pre">_datasketches.frequent_strings_sketch</span></a></span></span><a class="headerlink" href="#datasketches.frequent_strings_sketch.deserialize" title="Link to this definition"></a></dt> |
| <dd><p>Reads a bytes object and returns the corresponding frequent_strings_sketch.</p> |
| </dd></dl> |
| |
| <dl class="py method"> |
| <dt class="sig sig-object py" id="datasketches.frequent_strings_sketch.get_epsilon_for_lg_size"> |
| <span class="sig-name descname"><span class="pre">get_epsilon_for_lg_size</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">lg_max_map_size</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">int</span></span></em><span class="sig-paren">)</span> <span class="sig-return"><span class="sig-return-icon">→</span> <span class="sig-return-typehint"><span class="pre">float</span></span></span><a class="headerlink" href="#datasketches.frequent_strings_sketch.get_epsilon_for_lg_size" title="Link to this definition"></a></dt> |
| <dd><p>Returns the epsilon value used to compute a priori error for a given log2(max_map_size)</p> |
| </dd></dl> |
| |
| <dl class="py method"> |
| <dt class="sig sig-object py" id="datasketches.frequent_strings_sketch.get_apriori_error"> |
| <span class="sig-name descname"><span class="pre">get_apriori_error</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">lg_max_map_size</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">int</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">estimated_total_weight</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">int</span></span></em><span class="sig-paren">)</span> <span class="sig-return"><span class="sig-return-icon">→</span> <span class="sig-return-typehint"><span class="pre">float</span></span></span><a class="headerlink" href="#datasketches.frequent_strings_sketch.get_apriori_error" title="Link to this definition"></a></dt> |
| <dd><p>Returns the estimated a priori error given the max_map_size for the sketch and the estimated_total_stream_weight.</p> |
| </dd></dl> |
| |
| <p class="rubric">Non-static Methods:</p> |
| <dl class="py method"> |
| <dt class="sig sig-object py" id="datasketches.frequent_strings_sketch.__init__"> |
| <span class="sig-name descname"><span class="pre">__init__</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">self</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">lg_max_k</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">int</span></span></em><span class="sig-paren">)</span> <span class="sig-return"><span class="sig-return-icon">→</span> <span class="sig-return-typehint"><span class="pre">None</span></span></span><a class="headerlink" href="#datasketches.frequent_strings_sketch.__init__" title="Link to this definition"></a></dt> |
| <dd><p>Creates an instance of the sketch</p> |
| <dl class="field-list simple"> |
| <dt class="field-odd">Parameters<span class="colon">:</span></dt> |
| <dd class="field-odd"><p><strong>lg_max_k</strong> (<em>int</em>) – base 2 logarithm of the maximum size of the internal hash map of the sketch. Maximum capacity is 0.75 of this value, which is the maximum number of distinct items the sketch can contain.</p> |
| </dd> |
| </dl> |
| </dd></dl> |
| |
| <dl class="py property"> |
| <dt class="sig sig-object py" id="datasketches.frequent_strings_sketch.epsilon"> |
| <em class="property"><span class="k"><span class="pre">property</span></span><span class="w"> </span></em><span class="sig-name descname"><span class="pre">epsilon</span></span><a class="headerlink" href="#datasketches.frequent_strings_sketch.epsilon" title="Link to this definition"></a></dt> |
| <dd><p>The epsilon value used by the sketch to compute error</p> |
| </dd></dl> |
| |
| <dl class="py attribute"> |
| <dt class="sig sig-object py" id="datasketches.frequent_strings_sketch.get_estimate"> |
| <span class="sig-name descname"><span class="pre">get_estimate</span></span><a class="headerlink" href="#datasketches.frequent_strings_sketch.get_estimate" title="Link to this definition"></a></dt> |
| <dd><p>Returns the estimate of the weight (frequency) of the given item. |
| Note: The true frequency of a item would be the sum of the counts as a result of the two update functions.</p> |
| </dd></dl> |
| |
| <dl class="py attribute"> |
| <dt class="sig sig-object py" id="datasketches.frequent_strings_sketch.get_frequent_items"> |
| <span class="sig-name descname"><span class="pre">get_frequent_items</span></span><a class="headerlink" href="#datasketches.frequent_strings_sketch.get_frequent_items" title="Link to this definition"></a></dt> |
| <dd></dd></dl> |
| |
| <dl class="py attribute"> |
| <dt class="sig sig-object py" id="datasketches.frequent_strings_sketch.get_lower_bound"> |
| <span class="sig-name descname"><span class="pre">get_lower_bound</span></span><a class="headerlink" href="#datasketches.frequent_strings_sketch.get_lower_bound" title="Link to this definition"></a></dt> |
| <dd><p>Returns the guaranteed lower bound weight (frequency) of the given item.</p> |
| </dd></dl> |
| |
| <dl class="py attribute"> |
| <dt class="sig sig-object py" id="datasketches.frequent_strings_sketch.get_serialized_size_bytes"> |
| <span class="sig-name descname"><span class="pre">get_serialized_size_bytes</span></span><a class="headerlink" href="#datasketches.frequent_strings_sketch.get_serialized_size_bytes" title="Link to this definition"></a></dt> |
| <dd><p>Computes the size needed to serialize the current state of the sketch. This can be expensive since every item needs to be looked at.</p> |
| </dd></dl> |
| |
| <dl class="py attribute"> |
| <dt class="sig sig-object py" id="datasketches.frequent_strings_sketch.get_upper_bound"> |
| <span class="sig-name descname"><span class="pre">get_upper_bound</span></span><a class="headerlink" href="#datasketches.frequent_strings_sketch.get_upper_bound" title="Link to this definition"></a></dt> |
| <dd><p>Returns the guaranteed upper bound weight (frequency) of the given item.</p> |
| </dd></dl> |
| |
| <dl class="py attribute"> |
| <dt class="sig sig-object py" id="datasketches.frequent_strings_sketch.is_empty"> |
| <span class="sig-name descname"><span class="pre">is_empty</span></span><a class="headerlink" href="#datasketches.frequent_strings_sketch.is_empty" title="Link to this definition"></a></dt> |
| <dd><p>Returns True if the sketch is empty, otherwise False</p> |
| </dd></dl> |
| |
| <dl class="py attribute"> |
| <dt class="sig sig-object py" id="datasketches.frequent_strings_sketch.merge"> |
| <span class="sig-name descname"><span class="pre">merge</span></span><a class="headerlink" href="#datasketches.frequent_strings_sketch.merge" title="Link to this definition"></a></dt> |
| <dd><p>Merges the given sketch into this one</p> |
| </dd></dl> |
| |
| <dl class="py property"> |
| <dt class="sig sig-object py" id="datasketches.frequent_strings_sketch.num_active_items"> |
| <em class="property"><span class="k"><span class="pre">property</span></span><span class="w"> </span></em><span class="sig-name descname"><span class="pre">num_active_items</span></span><a class="headerlink" href="#datasketches.frequent_strings_sketch.num_active_items" title="Link to this definition"></a></dt> |
| <dd><p>The number of active items in the sketch</p> |
| </dd></dl> |
| |
| <dl class="py attribute"> |
| <dt class="sig sig-object py" id="datasketches.frequent_strings_sketch.serialize"> |
| <span class="sig-name descname"><span class="pre">serialize</span></span><a class="headerlink" href="#datasketches.frequent_strings_sketch.serialize" title="Link to this definition"></a></dt> |
| <dd><p>Serializes the sketch into a bytes object.</p> |
| </dd></dl> |
| |
| <dl class="py attribute"> |
| <dt class="sig sig-object py" id="datasketches.frequent_strings_sketch.to_string"> |
| <span class="sig-name descname"><span class="pre">to_string</span></span><a class="headerlink" href="#datasketches.frequent_strings_sketch.to_string" title="Link to this definition"></a></dt> |
| <dd><p>Produces a string summary of the sketch</p> |
| </dd></dl> |
| |
| <dl class="py property"> |
| <dt class="sig sig-object py" id="datasketches.frequent_strings_sketch.total_weight"> |
| <em class="property"><span class="k"><span class="pre">property</span></span><span class="w"> </span></em><span class="sig-name descname"><span class="pre">total_weight</span></span><a class="headerlink" href="#datasketches.frequent_strings_sketch.total_weight" title="Link to this definition"></a></dt> |
| <dd><p>The sum of the weights (frequencies) in the stream seen so far by the sketch</p> |
| </dd></dl> |
| |
| <dl class="py attribute"> |
| <dt class="sig sig-object py" id="datasketches.frequent_strings_sketch.update"> |
| <span class="sig-name descname"><span class="pre">update</span></span><a class="headerlink" href="#datasketches.frequent_strings_sketch.update" title="Link to this definition"></a></dt> |
| <dd><p>Updates the sketch with the given string and, optionally, a weight</p> |
| </dd></dl> |
| |
| </dd></dl> |
| |
| </section> |
| |
| |
| </div> |
| </div> |
| <footer><div class="rst-footer-buttons" role="navigation" aria-label="Footer"> |
| <a href="index.html" class="btn btn-neutral float-left" title="Frequency Sketches" accesskey="p" rel="prev"><span class="fa fa-arrow-circle-left" aria-hidden="true"></span> Previous</a> |
| <a href="count_min_sketch.html" class="btn btn-neutral float-right" title="CountMin Sketch" accesskey="n" rel="next">Next <span class="fa fa-arrow-circle-right" aria-hidden="true"></span></a> |
| </div> |
| |
| <hr/> |
| |
| <div role="contentinfo"> |
| <p>© Copyright 2023.</p> |
| </div> |
| |
| Built with <a href="https://www.sphinx-doc.org/">Sphinx</a> using a |
| <a href="https://github.com/readthedocs/sphinx_rtd_theme">theme</a> |
| provided by <a href="https://readthedocs.org">Read the Docs</a>. |
| |
| |
| </footer> |
| </div> |
| </div> |
| </section> |
| </div> |
| <script> |
| jQuery(function () { |
| SphinxRtdTheme.Navigation.enable(true); |
| }); |
| </script> |
| |
| </body> |
| </html> |