blob: 272e8c7cdb38b4bcba317f289f038c6bc179c2bf [file] [log] [blame]
<!-- HTML header for doxygen 1.8.4-->
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
<head>
<meta http-equiv="Content-Type" content="text/xhtml;charset=UTF-8"/>
<meta http-equiv="X-UA-Compatible" content="IE=9"/>
<meta name="generator" content="Doxygen 1.8.4"/>
<meta name="keywords" content="madlib,postgres,greenplum,machine learning,data mining,deep learning,ensemble methods,data science,market basket analysis,affinity analysis,pca,lda,regression,elastic net,huber white,proportional hazards,k-means,latent dirichlet allocation,bayes,support vector machines,svm"/>
<title>MADlib: Sparse Vectors</title>
<link href="tabs.css" rel="stylesheet" type="text/css"/>
<script type="text/javascript" src="jquery.js"></script>
<script type="text/javascript" src="dynsections.js"></script>
<link href="navtree.css" rel="stylesheet" type="text/css"/>
<script type="text/javascript" src="resize.js"></script>
<script type="text/javascript" src="navtree.js"></script>
<script type="text/javascript">
$(document).ready(initResizable);
$(window).load(resizeHeight);
</script>
<link href="search/search.css" rel="stylesheet" type="text/css"/>
<script type="text/javascript" src="search/search.js"></script>
<script type="text/javascript">
$(document).ready(function() { searchBox.OnSelectItem(0); });
</script>
<script type="text/x-mathjax-config">
MathJax.Hub.Config({
extensions: ["tex2jax.js", "TeX/AMSmath.js", "TeX/AMSsymbols.js"],
jax: ["input/TeX","output/HTML-CSS"],
});
</script><script src="../mathjax/MathJax.js"></script>
<link href="doxygen.css" rel="stylesheet" type="text/css" />
<link href="madlib_extra.css" rel="stylesheet" type="text/css"/>
<!-- google analytics -->
<script>
(function(i,s,o,g,r,a,m){i['GoogleAnalyticsObject']=r;i[r]=i[r]||function(){
(i[r].q=i[r].q||[]).push(arguments)},i[r].l=1*new Date();a=s.createElement(o),
m=s.getElementsByTagName(o)[0];a.async=1;a.src=g;m.parentNode.insertBefore(a,m)
})(window,document,'script','//www.google-analytics.com/analytics.js','ga');
ga('create', 'UA-45382226-1', 'auto');
ga('send', 'pageview');
</script>
</head>
<body>
<div id="top"><!-- do not remove this div, it is closed by doxygen! -->
<div id="titlearea">
<table cellspacing="0" cellpadding="0">
<tbody>
<tr style="height: 56px;">
<td style="padding-left: 0.5em;">
<div id="projectname">MADlib
&#160;<span id="projectnumber">1.2</span> <span style="font-size:10pt; font-style:italic"><a href="../latest/./group__grp__svec.html"> A newer version is available</a></span>
</div>
<div id="projectbrief">User Documentation</div>
</td>
<!--BEGIN VERSIONS LINKS-->
<td style="padding-left: 0.5em;">
<div class="versionlist"><ul>
<li class="head">More versions:</li>
<li><a href="../v1.1/index.html">v1.1</a></li>
<li><a href="../v1.0/index.html">v1.0</a></li>
<li><a href="../v0.7/index.html">v0.7</a></li>
<li><a href="../v0.5/index.html">v0.5</a></li></ul>
</div>
</td>
<td> <div id="MSearchBox" class="MSearchBoxInactive">
<span class="left">
<img id="MSearchSelect" src="search/mag_sel.png"
onmouseover="return searchBox.OnSearchSelectShow()"
onmouseout="return searchBox.OnSearchSelectHide()"
alt=""/>
<input type="text" id="MSearchField" value="Search" accesskey="S"
onfocus="searchBox.OnSearchFieldFocus(true)"
onblur="searchBox.OnSearchFieldFocus(false)"
onkeyup="searchBox.OnSearchFieldChange(event)"/>
</span><span class="right">
<a id="MSearchClose" href="javascript:searchBox.CloseResultsWindow()"><img id="MSearchCloseImg" border="0" src="search/close.png" alt=""/></a>
</span>
</div>
</td>
</tr>
</tbody>
</table>
</div>
<!-- end header part -->
<!-- Generated by Doxygen 1.8.4 -->
<script type="text/javascript">
var searchBox = new SearchBox("searchBox", "search",false,'Search');
</script>
</div><!-- top -->
<div id="side-nav" class="ui-resizable side-nav-resizable">
<div id="nav-tree">
<div id="nav-tree-contents">
<div id="nav-sync" class="sync"></div>
</div>
</div>
<div id="splitbar" style="-moz-user-select:none;"
class="ui-resizable-handle">
</div>
</div>
<script type="text/javascript">
$(document).ready(function(){initNavTree('group__grp__svec.html','');});
</script>
<div id="doc-content">
<!-- window showing the filter options -->
<div id="MSearchSelectWindow"
onmouseover="return searchBox.OnSearchSelectShow()"
onmouseout="return searchBox.OnSearchSelectHide()"
onkeydown="return searchBox.OnSearchSelectKey(event)">
<a class="SelectItem" href="javascript:void(0)" onclick="searchBox.OnSelectItem(0)"><span class="SelectionMark">&#160;</span>All</a><a class="SelectItem" href="javascript:void(0)" onclick="searchBox.OnSelectItem(1)"><span class="SelectionMark">&#160;</span>Files</a><a class="SelectItem" href="javascript:void(0)" onclick="searchBox.OnSelectItem(2)"><span class="SelectionMark">&#160;</span>Functions</a><a class="SelectItem" href="javascript:void(0)" onclick="searchBox.OnSelectItem(3)"><span class="SelectionMark">&#160;</span>Variables</a><a class="SelectItem" href="javascript:void(0)" onclick="searchBox.OnSelectItem(4)"><span class="SelectionMark">&#160;</span>Groups</a></div>
<!-- iframe showing the search results (closed by default) -->
<div id="MSearchResultsWindow">
<iframe src="javascript:void(0)" frameborder="0"
name="MSearchResults" id="MSearchResults">
</iframe>
</div>
<div class="header">
<div class="headertitle">
<div class="title">Sparse Vectors<div class="ingroups"><a class="el" href="group__grp__support.html">Support Modules</a></div></div> </div>
</div><!--header-->
<div class="contents">
<dl class="section user"><dt>About</dt><dd></dd></dl>
<p>This module implements a sparse vector data type, named "svec", which provides compressed storage of vectors that have many duplicate elements.</p>
<p>Arrays of floating point numbers for various calculations sometimes have long runs of zeros (or some other default value). This is common in applications like scientific computing, retail optimization, and text processing. Each floating point number takes 8 bytes of storage in memory and/or disk, so saving those zeros is often worthwhile. There are also many computations that can benefit from skipping over the zeros.</p>
<p>Consider, for example, the following array of doubles stored as a Postgres/Greenplum "float8[]" data type:</p>
<div class="fragment"><div class="line"><span class="stringliteral">&#39;{0, 33,...40,000 zeros..., 12, 22 }&#39;</span>::float8[]</div>
</div><!-- fragment --><p>This array would occupy slightly more than 320KB of memory or disk, most of it zeros. Even if we were to exploit the null bitmap and store the zeros as nulls, we would still end up with a 5KB null bitmap, which is still not nearly as memory efficient as we'd like. Also, as we perform various operations on the array, we do work on 40,000 fields that turn out to be unimportant.</p>
<p>To solve the problems associated with the processing of vectors discussed above, the svec type employs a simple Run Length Encoding (RLE) scheme to represent sparse vectors as pairs of count-value arrays. For example, the array above would be represented as</p>
<div class="fragment"><div class="line"><span class="stringliteral">&#39;{1,1,40000,1,1}:{0,33,0,12,22}&#39;</span>::MADlib.svec</div>
</div><!-- fragment --><p>which says there is 1 occurrence of 0, followed by 1 occurrence of 33, followed by 40,000 occurrences of 0, etc. This uses just 5 integers and 5 floating point numbers to store the array. Further, it is easy to implement vector operations that can take advantage of the RLE representation to make computations faster. The SVEC module provides a library of such functions.</p>
<p>The current version only supports sparse vectors of float8 values. Future versions will support other base types.</p>
<dl class="section user"><dt>Usage</dt><dd></dd></dl>
<p>An SVEC can be constructed directly with a constant expression, as follows: </p>
<div class="fragment"><div class="line"><a class="code" href="robust_8sql__in.html#ac9ebd21770ba37efb90e1ccee36fc103">SELECT</a> <span class="stringliteral">&#39;{n1,n2,...,nk}:{v1,v2,...vk}&#39;</span>::MADlib.svec;</div>
</div><!-- fragment --><p> where <code>n1,n2,...,nk</code> specifies the counts for the values <code>v1,v2,...,vk</code>.</p>
<p>A float array can be cast to an SVEC: </p>
<div class="fragment"><div class="line"><a class="code" href="robust_8sql__in.html#ac9ebd21770ba37efb90e1ccee36fc103">SELECT</a> (<span class="stringliteral">&#39;{v1,v2,...vk}&#39;</span>::<span class="keywordtype">float</span>[])::MADlib.svec;</div>
</div><!-- fragment --><p>An SVEC can be created with an aggregation: </p>
<div class="fragment"><div class="line"><a class="code" href="robust_8sql__in.html#ac9ebd21770ba37efb90e1ccee36fc103">SELECT</a> MADlib.svec_agg(v1) FROM generate_series(1,<a class="code" href="svd_8sql__in.html#ac55562e3550918c83a7a81f0ddc007dc">k</a>);</div>
</div><!-- fragment --><p>An SVEC can be created using the <code>madlib.svec_cast_positions_float8arr()</code> function by supplying an array of positions and an array of values at those positions: </p>
<div class="fragment"><div class="line"><a class="code" href="robust_8sql__in.html#ac9ebd21770ba37efb90e1ccee36fc103">SELECT</a> MADlib.svec_cast_positions_float8arr(</div>
<div class="line"> array[n1,n2,...nk], -- positions of values in vector</div>
<div class="line"> array[v1,v2,...vk], -- values at each position</div>
<div class="line"> length, -- length of vector </div>
<div class="line"> base) -- value at unspecified positions</div>
</div><!-- fragment --><p> For example, the following expression: </p>
<div class="fragment"><div class="line"><a class="code" href="robust_8sql__in.html#ac9ebd21770ba37efb90e1ccee36fc103">SELECT</a> MADlib.svec_cast_positions_float8arr(</div>
<div class="line"> array[1,3,5],</div>
<div class="line"> array[2,4,6],</div>
<div class="line"> 10,</div>
<div class="line"> 0.0)</div>
</div><!-- fragment --><p> produces this SVEC: </p>
<div class="fragment"><div class="line"><a class="code" href="svec_8sql__in.html#a0ee423729fbca5abd46c86fa81d51f23">svec_cast_positions_float8arr</a> </div>
<div class="line">-------------------------------</div>
<div class="line">{1,1,1,1,1,5}:{2,0,4,0,6,0}</div>
</div><!-- fragment --><p>Add MADlib to the search_path to use the svec operators defined in the module.</p>
<p>See the file <a class="el" href="svec_8sql__in.html" title="SQL type definitions and functions for sparse vector data type svec ">svec.sql_in</a> for complete syntax.</p>
<dl class="section user"><dt>Examples</dt><dd></dd></dl>
<p>We can use operations with svec type like &lt;, &gt;, *, **, /, =, +, SUM, etc, and they have meanings associated with typical vector operations. For example, the plus (+) operator adds each of the terms of two vectors having the same dimension together. </p>
<div class="fragment"><div class="line">sql&gt; <a class="code" href="robust_8sql__in.html#ac9ebd21770ba37efb90e1ccee36fc103">SELECT</a> (<span class="stringliteral">&#39;{0,1,5}&#39;</span>::float8[]::MADlib.svec + <span class="stringliteral">&#39;{4,3,2}&#39;</span>::float8[]::MADlib.svec)::float8[];</div>
<div class="line"> float8 </div>
<div class="line">---------</div>
<div class="line"> {4,4,7}</div>
</div><!-- fragment --><p>Without the casting into float8[] at the end, we get: </p>
<div class="fragment"><div class="line">sql&gt; SELECT <span class="stringliteral">&#39;{0,1,5}&#39;</span>::float8[]::MADlib.svec + <span class="stringliteral">&#39;{4,3,2}&#39;</span>::float8[]::MADlib.svec;</div>
<div class="line"> ?column? </div>
<div class="line">----------</div>
<div class="line">{2,1}:{4,7} </div>
</div><!-- fragment --><p>A dot product (%*%) between the two vectors will result in a scalar result of type float8. The dot product should be (0*4 + 1*3 + 5*2) = 13, like this: </p>
<div class="fragment"><div class="line">sql&gt; SELECT <span class="stringliteral">&#39;{0,1,5}&#39;</span>::float8[]::MADlib.svec %*% <span class="stringliteral">&#39;{4,3,2}&#39;</span>::float8[]::MADlib.svec;</div>
<div class="line"> ?column? </div>
<div class="line">----------</div>
<div class="line"> 13</div>
</div><!-- fragment --><p>Special vector aggregate functions are also available. SUM is self explanatory. SVEC_COUNT_NONZERO evaluates the count of non-zero terms in each column found in a set of n-dimensional svecs and returns an svec with the counts. For instance, if we have the vectors {0,1,5}, {10,0,3},{0,0,3},{0,1,0}, then executing the SVEC_COUNT_NONZERO() aggregate function would result in {1,2,3}:</p>
<div class="fragment"><div class="line">sql&gt; create table list (a MADlib.svec);</div>
<div class="line">sql&gt; insert into list values (<span class="stringliteral">&#39;{0,1,5}&#39;</span>::float8[]), (<span class="stringliteral">&#39;{10,0,3}&#39;</span>::float8[]), (<span class="stringliteral">&#39;{0,0,3}&#39;</span>::float8[]),(<span class="stringliteral">&#39;{0,1,0}&#39;</span>::float8[]);</div>
<div class="line"></div>
<div class="line">sql&gt; SELECT MADlib.svec_count_nonzero(a)::float8[] FROM list;</div>
<div class="line"><a class="code" href="svec_8sql__in.html#abebdcbe45de346aff874db008e842e65">svec_count_nonzero</a> </div>
<div class="line">-----------------</div>
<div class="line"> {1,2,3}</div>
</div><!-- fragment --><p>We do not use null bitmaps in the svec data type. A null value in an svec is represented explicitly as an NVP (No Value Present) value. For example, we have: </p>
<div class="fragment"><div class="line">sql&gt; SELECT <span class="stringliteral">&#39;{1,2,3}:{4,null,5}&#39;</span>::MADlib.svec;</div>
<div class="line"> svec </div>
<div class="line">-------------------</div>
<div class="line"> {1,2,3}:{4,NVP,5}</div>
<div class="line"></div>
<div class="line">sql&gt; SELECT <span class="stringliteral">&#39;{1,2,3}:{4,null,5}&#39;</span>::MADlib.svec + <span class="stringliteral">&#39;{2,2,2}:{8,9,10}&#39;</span>::MADlib.svec; </div>
<div class="line"> ?column? </div>
<div class="line"> --------------------------</div>
<div class="line"> {1,2,1,2}:{12,NVP,14,15}</div>
</div><!-- fragment --><p>An element of an svec can be accessed using the <a class="el" href="svec_8sql__in.html#a8787222aec691f94d9808d1369aa401c">svec_proj()</a> function, which takes an svec and the index of the element desired. </p>
<div class="fragment"><div class="line">sql&gt; SELECT MADlib.svec_proj(<span class="stringliteral">&#39;{1,2,3}:{4,5,6}&#39;</span>::MADlib.svec, 1) + MADlib.svec_proj(<span class="stringliteral">&#39;{4,5,6}:{1,2,3}&#39;</span>::MADlib.svec, 15); </div>
<div class="line"> ?column? </div>
<div class="line">----------</div>
<div class="line"> 7</div>
</div><!-- fragment --><p>A subvector of an svec can be accessed using the <a class="el" href="svec_8sql__in.html#a5cb3446de5fc117befe88ccb1ebb0e4e">svec_subvec()</a> function, which takes an svec and the start and end index of the subvector desired. </p>
<div class="fragment"><div class="line">sql&gt; SELECT MADlib.svec_subvec(<span class="stringliteral">&#39;{2,4,6}:{1,3,5}&#39;</span>::MADlib.svec, 2, 11);</div>
<div class="line"> <a class="code" href="svec_8sql__in.html#a5cb3446de5fc117befe88ccb1ebb0e4e">svec_subvec</a> </div>
<div class="line">----------------- </div>
<div class="line"> {1,4,5}:{1,3,5}</div>
</div><!-- fragment --><p>The elements/subvector of an svec can be changed using the function <a class="el" href="svec_8sql__in.html#a59407764a1cbf1937da39cf39a2f447c">svec_change()</a>. It takes three arguments: an m-dimensional svec sv1, a start index j, and an n-dimensional svec sv2 such that j + n - 1 &lt;= m, and returns an svec like sv1 but with the subvector sv1[j:j+n-1] replaced by sv2. An example follows: </p>
<div class="fragment"><div class="line">sql&gt; SELECT MADlib.svec_change(<span class="stringliteral">&#39;{1,2,3}:{4,5,6}&#39;</span>::MADlib.svec,3,<span class="stringliteral">&#39;{2}:{3}&#39;</span>::MADlib.svec);</div>
<div class="line"> <a class="code" href="svec_8sql__in.html#a59407764a1cbf1937da39cf39a2f447c">svec_change</a> </div>
<div class="line">---------------------</div>
<div class="line"> {1,1,2,2}:{4,5,3,6}</div>
</div><!-- fragment --><p>There are also higher-order functions for processing svecs. For example, the following is the corresponding function for lapply() in R. </p>
<div class="fragment"><div class="line">sql&gt; SELECT MADlib.svec_lapply(<span class="stringliteral">&#39;sqrt&#39;</span>, <span class="stringliteral">&#39;{1,2,3}:{4,5,6}&#39;</span>::MADlib.svec);</div>
<div class="line"> <a class="code" href="svec_8sql__in.html#a0d94c44dde95a00e3d802dee6d7c01eb">svec_lapply</a> </div>
<div class="line">-----------------------------------------------</div>
<div class="line"> {1,2,3}:{2,2.23606797749979,2.44948974278318}</div>
</div><!-- fragment --><p>The full list of functions available for operating on svecs is given in svec.sql_in.</p>
<p><b> A More Extensive Example</b> </p>
<pre class="fragment">For a text classification example, let's assume we have a dictionary
composed of words in a sorted text array:
</pre> <div class="fragment"><div class="line">sql&gt; create table features (a text[]);</div>
<div class="line">sql&gt; insert into features values </div>
<div class="line"> (<span class="stringliteral">&#39;{am,before,being,bothered,corpus,document,i,in,is,me,</span></div>
<div class="line"><span class="stringliteral"> never,now,one,really,second,the,third,this,until}&#39;</span>);</div>
</div><!-- fragment --><p> We have a set of documents, each represented as an array of words: </p>
<div class="fragment"><div class="line">sql&gt; create table documents(a <span class="keywordtype">int</span>,b text[]);</div>
<div class="line">sql&gt; insert into documents values</div>
<div class="line"> (1,<span class="stringliteral">&#39;{this,is,one,document,in,the,corpus}&#39;</span>),</div>
<div class="line"> (2,<span class="stringliteral">&#39;{i,am,the,second,document,in,the,corpus}&#39;</span>),</div>
<div class="line"> (3,<span class="stringliteral">&#39;{being,third,never,really,bothered,me,until,now}&#39;</span>),</div>
<div class="line"> (4,<span class="stringliteral">&#39;{the,document,before,me,is,the,third,document}&#39;</span>);</div>
</div><!-- fragment --> <pre class="fragment">Now we have a dictionary and some documents, we would like to do some
document categorization using vector arithmetic on word counts and
proportions of dictionary words in each document.
To start this process, we'll need to find the dictionary words in each
document. We'll prepare what is called a Sparse Feature Vector or SFV
for each document. An SFV is a vector of dimension N, where N is the
number of dictionary words, and in each cell of an SFV is a count of
each dictionary word in the document.
Inside the sparse vector library, we have a function that will create
an SFV from a document, so we can just do this:
</pre> <div class="fragment"><div class="line">sql&gt; SELECT MADlib.svec_sfv((SELECT a FROM features LIMIT 1),b)::float8[] </div>
<div class="line"> FROM documents;</div>
<div class="line"></div>
<div class="line"> <a class="code" href="svec_8sql__in.html#a375acd521ed9cb05f63b3696dcc10bf4">svec_sfv</a></div>
<div class="line">-----------------------------------------</div>
<div class="line"> {0,0,0,0,1,1,0,1,1,0,0,0,1,0,0,1,0,1,0}</div>
<div class="line"> {0,0,1,1,0,0,0,0,0,1,1,1,0,1,0,0,1,0,1}</div>
<div class="line"> {1,0,0,0,1,1,1,1,0,0,0,0,0,0,1,2,0,0,0}</div>
<div class="line"> {0,1,0,0,0,2,0,0,1,1,0,0,0,0,0,2,1,0,0}</div>
</div><!-- fragment --><p> Note that the output of MADlib.svec_sfv() is an svec for each document containing the count of each of the dictionary words in the ordinal positions of the dictionary. This can more easily be understood by lining up the feature vector and text like this: </p>
<div class="fragment"><div class="line">sql&gt; SELECT MADlib.svec_sfv((SELECT a FROM features LIMIT 1),b)::float8[]</div>
<div class="line"> , b </div>
<div class="line"> FROM documents;</div>
<div class="line"></div>
<div class="line"> <a class="code" href="svec_8sql__in.html#a375acd521ed9cb05f63b3696dcc10bf4">svec_sfv</a> | b </div>
<div class="line">-----------------------------------------+--------------------------------------------------</div>
<div class="line"> {1,0,0,0,1,1,1,1,0,0,0,0,0,0,1,2,0,0,0} | {i,am,the,second,document,in,the,corpus}</div>
<div class="line"> {0,1,0,0,0,2,0,0,1,1,0,0,0,0,0,2,1,0,0} | {the,document,before,me,is,the,third,document}</div>
<div class="line"> {0,0,0,0,1,1,0,1,1,0,0,0,1,0,0,1,0,1,0} | {<span class="keyword">this</span>,is,one,document,in,the,corpus}</div>
<div class="line"> {0,0,1,1,0,0,0,0,0,1,1,1,0,1,0,0,1,0,1} | {being,third,never,really,bothered,me,until,now}</div>
<div class="line"></div>
<div class="line">sql&gt; SELECT * FROM features;</div>
<div class="line"> a </div>
<div class="line">--------------------------------------------------------------------------------------------------------</div>
<div class="line">{am,before,being,bothered,corpus,document,i,in,is,me,never,now,one,really,second,the,third,<span class="keyword">this</span>,until}</div>
</div><!-- fragment --> <pre class="fragment">Now when we look at the document "i am the second document in the corpus",
its SFV is {1,3*0,1,1,1,1,6*0,1,2,3*0}. The word "am" is the first ordinate in
the dictionary and there is 1 instance of it in the SFV. The word "before"
has no instances in the document, so its value is "0" and so on.
The function MADlib.svec_sfv() can process large
numbers of documents into their SFVs in parallel at high speed.
The rest of the categorization process is all vector math. The actual
count is hardly ever used. Instead, it's turned into a weight. The most
common weight is called tf/idf for Term Frequency / Inverse Document
Frequency. The calculation for a given term in a given document is
</pre> <div class="fragment"><div class="line">{#Times in document} * log {#Documents / #Documents the term appears in}.</div>
</div><!-- fragment --><p> For instance, the term "document" in document 1 would have weight 1 * log (4/3). In document 4, it would have weight 2 * log (4/3). Terms that appear in every document would have tf/idf weight 0, since log (4/4) = log(1) = 0. (Our example has no term like that.) That usually sends a lot of values to 0.</p>
<p>For this part of the processing, we'll need to have a sparse vector of the dictionary dimension (19) with the values </p>
<div class="fragment"><div class="line">log(#documents/#Documents each term appears in). </div>
</div><!-- fragment --><p> There will be one such vector for the whole list of documents (aka the "corpus"). The #documents is just a count of all of the documents, in this case 4, but there is one divisor for each dictionary word and its value is the number of documents in which that word appears. This single vector for the whole corpus can then be multiplied element-wise with each document SFV to produce the Term Frequency/Inverse Document Frequency weights.</p>
<p>This can be done as follows: </p>
<div class="fragment"><div class="line">sql&gt; create table corpus as </div>
<div class="line"> (SELECT a, MADlib.svec_sfv((SELECT a FROM features LIMIT 1),b) sfv </div>
<div class="line"> FROM documents);</div>
<div class="line">sql&gt; create table weights as</div>
<div class="line"> (SELECT a docnum, MADlib.svec_mult(sfv, logidf) tf_idf </div>
<div class="line"> FROM (SELECT MADlib.svec_log(MADlib.svec_div(count(sfv)::MADlib.svec,MADlib.svec_count_nonzero(sfv))) logidf </div>
<div class="line"> FROM corpus) foo, corpus ORDER BY docnum);</div>
<div class="line">sql&gt; SELECT * FROM weights;</div>
<div class="line"></div>
<div class="line">docnum | tf_idf </div>
<div class="line">-------+----------------------------------------------------------------------</div>
<div class="line"> 1 | {4,1,1,1,2,3,1,2,1,1,1,1}:{0,0.69,0.28,0,0.69,0,1.38,0,0.28,0,1.38,0}</div>
<div class="line"> 2 | {1,3,1,1,1,1,6,1,1,3}:{1.38,0,0.69,0.28,1.38,0.69,0,1.38,0.57,0}</div>
<div class="line"> 3 | {2,2,5,1,2,1,1,2,1,1,1}:{0,1.38,0,0.69,1.38,0,1.38,0,0.69,0,1.38}</div>
<div class="line"> 4 | {1,1,3,1,2,2,5,1,1,2}:{0,1.38,0,0.57,0,0.69,0,0.57,0.69,0}</div>
</div><!-- fragment --> <pre class="fragment">We can now get the "angular distance" between one document and the rest
of the documents using the ACOS of the normalized dot product of the document vectors.
The following calculates the angular distance between the first document
and each of the other documents:
</pre> <div class="fragment"><div class="line">sql&gt; SELECT docnum,</div>
<div class="line"> 180. * ( ACOS( MADlib.svec_dmin( 1., MADlib.svec_dot(tf_idf, testdoc) </div>
<div class="line"> / (MADlib.svec_l2norm(tf_idf)*MADlib.svec_l2norm(testdoc))))/3.141592654) angular_distance </div>
<div class="line"> FROM weights,(SELECT tf_idf testdoc FROM weights WHERE docnum = 1 LIMIT 1) foo </div>
<div class="line"> ORDER BY 1;</div>
<div class="line"></div>
<div class="line">docnum | angular_distance </div>
<div class="line">--------+------------------</div>
<div class="line"> 1 | 0</div>
<div class="line"> 2 | 78.8235846096986</div>
<div class="line"> 3 | 89.9999999882484</div>
<div class="line"> 4 | 80.0232034288617</div>
</div><!-- fragment --><p> We can see that the angular distance between document 1 and itself is 0 degrees and between document 1 and 3 is 90 degrees because they share no features at all. The angular distance can now be plugged into machine learning algorithms that rely on a distance measure between data points.</p>
<p>SVEC also provides functionality for declaring an array given an array of positions and an array of values; the intermediate values between those positions are set to a base value that the user provides in the same function call. In the example below, the first array of integers represents the positions for the second array (the array of floats). Positions do not need to be in sorted order. The third value represents the desired maximum size of the array. This ensures that the array is of that size even if the last position is not. If max size &lt; 1, that value is ignored and the array will end at the last position in the position vector. The final value is a float representing the base value to be used between the declared ones (0 would be a common candidate): </p>
<div class="fragment"><div class="line">sql&gt; SELECT MADlib.svec_cast_positions_float8arr(<a class="code" href="robust_8sql__in.html#a510547c506e686972e73f497f0725d5c">ARRAY</a>[1,2,7,5,87],<a class="code" href="robust_8sql__in.html#a510547c506e686972e73f497f0725d5c">ARRAY</a>[.1,.2,.7,.5,.87],90,0.0);</div>
<div class="line"></div>
<div class="line"> <a class="code" href="svec_8sql__in.html#a0ee423729fbca5abd46c86fa81d51f23">svec_cast_positions_float8arr</a> </div>
<div class="line">-----------------------------------------------------</div>
<div class="line">{1,1,2,1,1,1,79,1,3}:{0.1,0.2,0,0.5,0,0.7,0,0.87,0}</div>
<div class="line">(1 row)</div>
</div><!-- fragment --> <pre class="fragment">Other examples of svec usage can be found in the k-means module.
</pre><dl class="section see"><dt>See Also</dt><dd>File <a class="el" href="svec_8sql__in.html" title="SQL type definitions and functions for sparse vector data type svec ">svec.sql_in</a> documenting the SQL functions. </dd></dl>
</div><!-- contents -->
</div><!-- doc-content -->
<!-- start footer part -->
<div id="nav-path" class="navpath"><!-- id is needed for treeview function! -->
<ul>
<li class="footer">Generated on Thu Jan 9 2014 20:35:40 for MADlib by
<a href="http://www.doxygen.org/index.html">
<img class="footer" src="doxygen.png" alt="doxygen"/></a> 1.8.4 </li>
</ul>
</div>
</body>
</html>