blob: f874af953d728b045889e8337d314ea03bfbe9c9 [file] [log] [blame]
<!-- HTML header for doxygen 1.8.4-->
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
<head>
<meta http-equiv="Content-Type" content="text/xhtml;charset=UTF-8"/>
<meta http-equiv="X-UA-Compatible" content="IE=9"/>
<meta name="generator" content="Doxygen 1.8.13"/>
<meta name="keywords" content="madlib,postgres,greenplum,machine learning,data mining,deep learning,ensemble methods,data science,market basket analysis,affinity analysis,pca,lda,regression,elastic net,huber white,proportional hazards,k-means,latent dirichlet allocation,bayes,support vector machines,svm"/>
<title>MADlib: FM (Flajolet-Martin)</title>
<link href="tabs.css" rel="stylesheet" type="text/css"/>
<script type="text/javascript" src="jquery.js"></script>
<script type="text/javascript" src="dynsections.js"></script>
<link href="navtree.css" rel="stylesheet" type="text/css"/>
<script type="text/javascript" src="resize.js"></script>
<script type="text/javascript" src="navtreedata.js"></script>
<script type="text/javascript" src="navtree.js"></script>
<script type="text/javascript">
$(document).ready(initResizable);
</script>
<link href="search/search.css" rel="stylesheet" type="text/css"/>
<script type="text/javascript" src="search/searchdata.js"></script>
<script type="text/javascript" src="search/search.js"></script>
<script type="text/javascript">
$(document).ready(function() { init_search(); });
</script>
<script type="text/x-mathjax-config">
MathJax.Hub.Config({
extensions: ["tex2jax.js", "TeX/AMSmath.js", "TeX/AMSsymbols.js"],
jax: ["input/TeX","output/HTML-CSS"],
});
</script><script type="text/javascript" src="http://cdn.mathjax.org/mathjax/latest/MathJax.js"></script>
<!-- hack in the navigation tree -->
<script type="text/javascript" src="eigen_navtree_hacks.js"></script>
<link href="doxygen.css" rel="stylesheet" type="text/css" />
<link href="madlib_extra.css" rel="stylesheet" type="text/css"/>
<!-- google analytics -->
<script>
(function(i,s,o,g,r,a,m){i['GoogleAnalyticsObject']=r;i[r]=i[r]||function(){
(i[r].q=i[r].q||[]).push(arguments)},i[r].l=1*new Date();a=s.createElement(o),
m=s.getElementsByTagName(o)[0];a.async=1;a.src=g;m.parentNode.insertBefore(a,m)
})(window,document,'script','//www.google-analytics.com/analytics.js','ga');
ga('create', 'UA-45382226-1', 'madlib.apache.org');
ga('send', 'pageview');
</script>
</head>
<body>
<div id="top"><!-- do not remove this div, it is closed by doxygen! -->
<div id="titlearea">
<table cellspacing="0" cellpadding="0">
<tbody>
<tr style="height: 56px;">
<td id="projectlogo"><a href="http://madlib.apache.org"><img alt="Logo" src="madlib.png" height="50" style="padding-left:0.5em;" border="0"/ ></a></td>
<td style="padding-left: 0.5em;">
<div id="projectname">
<span id="projectnumber">1.16</span>
</div>
<div id="projectbrief">User Documentation for Apache MADlib</div>
</td>
<td> <div id="MSearchBox" class="MSearchBoxInactive">
<span class="left">
<img id="MSearchSelect" src="search/mag_sel.png"
onmouseover="return searchBox.OnSearchSelectShow()"
onmouseout="return searchBox.OnSearchSelectHide()"
alt=""/>
<input type="text" id="MSearchField" value="Search" accesskey="S"
onfocus="searchBox.OnSearchFieldFocus(true)"
onblur="searchBox.OnSearchFieldFocus(false)"
onkeyup="searchBox.OnSearchFieldChange(event)"/>
</span><span class="right">
<a id="MSearchClose" href="javascript:searchBox.CloseResultsWindow()"><img id="MSearchCloseImg" border="0" src="search/close.png" alt=""/></a>
</span>
</div>
</td>
</tr>
</tbody>
</table>
</div>
<!-- end header part -->
<!-- Generated by Doxygen 1.8.13 -->
<script type="text/javascript">
var searchBox = new SearchBox("searchBox", "search",false,'Search');
</script>
</div><!-- top -->
<div id="side-nav" class="ui-resizable side-nav-resizable">
<div id="nav-tree">
<div id="nav-tree-contents">
<div id="nav-sync" class="sync"></div>
</div>
</div>
<div id="splitbar" style="-moz-user-select:none;"
class="ui-resizable-handle">
</div>
</div>
<script type="text/javascript">
$(document).ready(function(){initNavTree('group__grp__fmsketch.html','');});
</script>
<div id="doc-content">
<!-- window showing the filter options -->
<div id="MSearchSelectWindow"
onmouseover="return searchBox.OnSearchSelectShow()"
onmouseout="return searchBox.OnSearchSelectHide()"
onkeydown="return searchBox.OnSearchSelectKey(event)">
</div>
<!-- iframe showing the search results (closed by default) -->
<div id="MSearchResultsWindow">
<iframe src="javascript:void(0)" frameborder="0"
name="MSearchResults" id="MSearchResults">
</iframe>
</div>
<div class="header">
<div class="headertitle">
<div class="title">FM (Flajolet-Martin)<div class="ingroups"><a class="el" href="group__grp__stats.html">Statistics</a> &raquo; <a class="el" href="group__grp__desc__stats.html">Descriptive Statistics</a> &raquo; <a class="el" href="group__grp__sketches.html">Cardinality Estimators</a></div></div> </div>
</div><!--header-->
<div class="contents">
<div class="toc"><b>Contents</b> <ul>
<li>
<a href="#syntax">Syntax</a> </li>
<li>
<a href="#examples">Examples</a> </li>
<li>
<a href="#literature">Literature</a> </li>
<li>
<a href="#related">Related Topics</a> </li>
</ul>
</div><p><a class="el" href="sketch_8sql__in.html#ae27d5aaa5e4b426bcfe55e05a89c8e0b">fmsketch_dcount</a> can be run on a column of any type. It returns an approximation to the number of distinct values (a la <code>COUNT(DISTINCT x)</code>), but faster and approximate. Like any aggregate, it can be combined with a GROUP BY clause to do distinct counts per group.</p>
<p><a class="anchor" id="syntax"></a></p><dl class="section user"><dt>Syntax</dt><dd></dd></dl>
<p>Get the number of distinct values in a designated column. </p><pre class="syntax">
fmsketch_dcount( col_name )
</pre><dl class="section note"><dt>Note</dt><dd>This is a <a href="https://www.postgresql.org/docs/current/static/xaggr.html">User Defined Aggregate</a> which returns the results when used in a query. Use "CREATE TABLE AS ", with the UDA as subquery if the results are to be stored. This is unlike the usual MADlib stored procedure interface which places the results in a table instead of returning it.</dd></dl>
<p><a class="anchor" id="examples"></a></p><dl class="section user"><dt>Examples</dt><dd><ol type="1">
<li>Generate some data. <pre class="example">
CREATE TABLE data(class INT, a1 INT);
INSERT INTO data SELECT 1,1 FROM generate_series(1,10000);
INSERT INTO data SELECT 1,2 FROM generate_series(1,15000);
INSERT INTO data SELECT 1,3 FROM generate_series(1,10000);
INSERT INTO data SELECT 2,5 FROM generate_series(1,1000);
INSERT INTO data SELECT 2,6 FROM generate_series(1,1000);
</pre></li>
<li>Find the distinct number of values for each class. <pre class="example">
SELECT class, fmsketch_dcount(a1)
FROM data
GROUP BY data.class;
</pre> Result: <pre class="result">
class | fmsketch_dcount
&#160;------+-----------------
2 | 2
1 | 3
(2 rows)
</pre></li>
</ol>
</dd></dl>
<p><a class="anchor" id="literature"></a></p><dl class="section user"><dt>Literature</dt><dd>[1] P. Flajolet and N.G. Martin. Probabilistic counting algorithms for data base applications, Journal of Computer and System Sciences 31(2), pp 182-209, 1985. <a href="http://algo.inria.fr/flajolet/Publications/FlMa85.pdf">http://algo.inria.fr/flajolet/Publications/FlMa85.pdf</a></dd></dl>
<p><a class="anchor" id="related"></a></p><dl class="section user"><dt>Related Topics</dt><dd>File <a class="el" href="sketch_8sql__in.html" title="SQL functions for sketch-based approximations of descriptive statistics. ">sketch.sql_in</a> documenting the SQL function. </dd></dl>
</div><!-- contents -->
</div><!-- doc-content -->
<!-- start footer part -->
<div id="nav-path" class="navpath"><!-- id is needed for treeview function! -->
<ul>
<li class="footer">Generated on Tue Jul 2 2019 22:35:51 for MADlib by
<a href="http://www.doxygen.org/index.html">
<img class="footer" src="doxygen.png" alt="doxygen"/></a> 1.8.13 </li>
</ul>
</div>
</body>
</html>