blob: 9fd8bac600704015455145870e1ec7612c537003 [file] [log] [blame]
<!-- HTML header for doxygen 1.8.4-->
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
<head>
<meta http-equiv="Content-Type" content="text/xhtml;charset=UTF-8"/>
<meta http-equiv="X-UA-Compatible" content="IE=9"/>
<meta name="generator" content="Doxygen 1.8.13"/>
<meta name="keywords" content="madlib,postgres,greenplum,machine learning,data mining,deep learning,ensemble methods,data science,market basket analysis,affinity analysis,pca,lda,regression,elastic net,huber white,proportional hazards,k-means,latent dirichlet allocation,bayes,support vector machines,svm"/>
<title>MADlib: Define Custom Functions</title>
<link href="tabs.css" rel="stylesheet" type="text/css"/>
<script type="text/javascript" src="jquery.js"></script>
<script type="text/javascript" src="dynsections.js"></script>
<link href="navtree.css" rel="stylesheet" type="text/css"/>
<script type="text/javascript" src="resize.js"></script>
<script type="text/javascript" src="navtreedata.js"></script>
<script type="text/javascript" src="navtree.js"></script>
<script type="text/javascript">
$(document).ready(initResizable);
</script>
<link href="search/search.css" rel="stylesheet" type="text/css"/>
<script type="text/javascript" src="search/searchdata.js"></script>
<script type="text/javascript" src="search/search.js"></script>
<script type="text/javascript">
$(document).ready(function() { init_search(); });
</script>
<script type="text/x-mathjax-config">
MathJax.Hub.Config({
extensions: ["tex2jax.js", "TeX/AMSmath.js", "TeX/AMSsymbols.js"],
jax: ["input/TeX","output/HTML-CSS"],
});
</script><script type="text/javascript" src="http://cdn.mathjax.org/mathjax/latest/MathJax.js"></script>
<!-- hack in the navigation tree -->
<script type="text/javascript" src="eigen_navtree_hacks.js"></script>
<link href="doxygen.css" rel="stylesheet" type="text/css" />
<link href="madlib_extra.css" rel="stylesheet" type="text/css"/>
<!-- google analytics -->
<script>
(function(i,s,o,g,r,a,m){i['GoogleAnalyticsObject']=r;i[r]=i[r]||function(){
(i[r].q=i[r].q||[]).push(arguments)},i[r].l=1*new Date();a=s.createElement(o),
m=s.getElementsByTagName(o)[0];a.async=1;a.src=g;m.parentNode.insertBefore(a,m)
})(window,document,'script','//www.google-analytics.com/analytics.js','ga');
ga('create', 'UA-45382226-1', 'madlib.apache.org');
ga('send', 'pageview');
</script>
</head>
<body>
<div id="top"><!-- do not remove this div, it is closed by doxygen! -->
<div id="titlearea">
<table cellspacing="0" cellpadding="0">
<tbody>
<tr style="height: 56px;">
<td id="projectlogo"><a href="http://madlib.apache.org"><img alt="Logo" src="madlib.png" height="50" style="padding-left:0.5em;" border="0"/ ></a></td>
<td style="padding-left: 0.5em;">
<div id="projectname">
<span id="projectnumber">1.21.0</span>
</div>
<div id="projectbrief">User Documentation for Apache MADlib</div>
</td>
<td> <div id="MSearchBox" class="MSearchBoxInactive">
<span class="left">
<img id="MSearchSelect" src="search/mag_sel.png"
onmouseover="return searchBox.OnSearchSelectShow()"
onmouseout="return searchBox.OnSearchSelectHide()"
alt=""/>
<input type="text" id="MSearchField" value="Search" accesskey="S"
onfocus="searchBox.OnSearchFieldFocus(true)"
onblur="searchBox.OnSearchFieldFocus(false)"
onkeyup="searchBox.OnSearchFieldChange(event)"/>
</span><span class="right">
<a id="MSearchClose" href="javascript:searchBox.CloseResultsWindow()"><img id="MSearchCloseImg" border="0" src="search/close.png" alt=""/></a>
</span>
</div>
</td>
</tr>
</tbody>
</table>
</div>
<!-- end header part -->
<!-- Generated by Doxygen 1.8.13 -->
<script type="text/javascript">
var searchBox = new SearchBox("searchBox", "search",false,'Search');
</script>
</div><!-- top -->
<div id="side-nav" class="ui-resizable side-nav-resizable">
<div id="nav-tree">
<div id="nav-tree-contents">
<div id="nav-sync" class="sync"></div>
</div>
</div>
<div id="splitbar" style="-moz-user-select:none;"
class="ui-resizable-handle">
</div>
</div>
<script type="text/javascript">
$(document).ready(function(){initNavTree('group__grp__custom__function.html','');});
</script>
<div id="doc-content">
<!-- window showing the filter options -->
<div id="MSearchSelectWindow"
onmouseover="return searchBox.OnSearchSelectShow()"
onmouseout="return searchBox.OnSearchSelectHide()"
onkeydown="return searchBox.OnSearchSelectKey(event)">
</div>
<!-- iframe showing the search results (closed by default) -->
<div id="MSearchResultsWindow">
<iframe src="javascript:void(0)" frameborder="0"
name="MSearchResults" id="MSearchResults">
</iframe>
</div>
<div class="header">
<div class="headertitle">
<div class="title">Define Custom Functions<div class="ingroups"><a class="el" href="group__grp__dl.html">Deep Learning</a> &raquo; <a class="el" href="group__grp__model__prep.html">Model Preparation</a></div></div> </div>
</div><!--header-->
<div class="contents">
<div class="toc"><b>Contents</b><ul>
<li class="level1">
<a href="#load_function">Load Function</a> </li>
<li class="level1">
<a href="#delete_function">Delete Function</a> </li>
<li class="level1">
<a href="#top_k_function">Top k Accuracy Function</a> </li>
<li class="level1">
<a href="#example">Examples</a> </li>
<li class="level1">
<a href="#literature">Literature</a> </li>
<li class="level1">
<a href="#related">Related Topics</a> </li>
</ul>
</div><dl class="section warning"><dt>Warning</dt><dd><em> For security reasons there are controls on custom functions in MADlib. You must be a superuser to create custom functions because they could theoretically allow execution of any untrusted Python code. Regular users with MADlib USAGE permission can use existing custom functions but cannot create new ones or update existing ones. See references [1] and [2] for information on privileges in Greenplum and PostgreSQL. </em></dd></dl>
<p>This function loads custom Python functions into a table for use by deep learning algorithms.</p>
<p>Custom functions can be useful if, for example, you need loss functions or metrics that are not built into the standard libraries. The functions to be loaded must be in the form of serialized Python objects created using Dill, which extends Python's pickle module to the majority of the built-in Python types [3].</p>
<p>Custom functions can also be used to return top k categorical accuracy in the case that you want a different k value than the default from Keras. This module includes a helper function to create the custom function automatically for a specified k.</p>
<p>There is also a utility function to delete a function from the table.</p>
<dl class="section note"><dt>Note</dt><dd>Do not specify a schema for the argument 'object_table' containing the Python objects, because the 'object_table' is automatically put in the MADlib schema. Also, any subsequent SQL queries on this table by regular users must specify '&lt;madlib_schema&gt;.object_table' in the usual way.</dd></dl>
<p><a class="anchor" id="load_function"></a></p><dl class="section user"><dt>Load Function</dt><dd></dd></dl>
<pre class="syntax">
load_custom_function(
object_table,
object,
name,
description
)
</pre><p> <b>Arguments</b> </p><dl class="arglist">
<dt>object_table </dt>
<dd><p class="startdd">VARCHAR. Table to load serialized Python objects. If this table does not exist, it will be created. If this table already exists, a new row is inserted into the existing table. Do not specify schema as part of the object table name, since it will be put in the MADlib schema automatically. </p>
<p class="enddd"></p>
</dd>
<dt>object </dt>
<dd><p class="startdd">BYTEA. PostgreSQL binary data type of the Python object. Object must be created with the Dill package for serializing Python objects.</p>
<dl class="section note"><dt>Note</dt><dd>The Dill package must be installed on all segments of the database cluster [3]. </dd></dl>
</dd>
<dt>name </dt>
<dd><p class="startdd">TEXT. Name of the object. Must be unique identifier in the table, since this name is used when passing the object to Keras. </p>
<p class="enddd"></p>
</dd>
<dt>description (optional) </dt>
<dd><p class="startdd">TEXT, default: NULL. Free text string to provide a description, if desired. </p>
<p class="enddd"></p>
</dd>
</dl>
<p><b>Output table</b> <br />
The output table contains the following columns: </p><table class="output">
<tr>
<th>id </th><td>SERIAL. Object ID. </td></tr>
<tr>
<th>name </th><td>TEXT PRIMARY KEY. Name of the object. </td></tr>
<tr>
<th>description </th><td>TEXT. Description of the object (free text). </td></tr>
<tr>
<th>object </th><td>BYTEA. Serialized Python object stored as a PostgreSQL binary data type. </td></tr>
</table>
<p><a class="anchor" id="delete_function"></a></p><dl class="section user"><dt>Delete Function</dt><dd></dd></dl>
<p>Delete by id: </p><pre class="syntax">
delete_custom_function(
object_table,
id
)
</pre><p> Or alternatively by name: </p><pre class="syntax">
delete_custom_function(
object_table,
name
)
</pre><p> <b>Arguments</b> </p><dl class="arglist">
<dt>object_table </dt>
<dd>VARCHAR. Table containing Python object to be deleted. Do not specify schema as part of the object table name. </dd>
<dt>id </dt>
<dd>INTEGER. The id of the object to be deleted. </dd>
<dt>name </dt>
<dd>TEXT. Name of the object to be deleted. </dd>
</dl>
<p><a class="anchor" id="top_k_function"></a></p><dl class="section user"><dt>Top k Accuracy Function</dt><dd></dd></dl>
<p>Create and load a custom function for a specific k into the custom functions table. The Keras accuracy parameter 'top_k_categorical_accuracy' returns top 5 accuracy by default [4]. If you want a different top k value, use this helper function to create a custom Python function to compute the top k accuracy that you specify.</p>
<pre class="syntax">
load_top_k_accuracy_function(
object_table,
k
)
</pre><p> <b>Arguments</b> </p><dl class="arglist">
<dt>object_table </dt>
<dd><p class="startdd">VARCHAR. Table to load serialized Python objects. If this table does not exist, it will be created. If this table already exists, a new row is inserted into the existing table. Do not specify schema as part of the object table name, since it will be put in the MADlib schema automatically. </p>
<p class="enddd"></p>
</dd>
<dt>k </dt>
<dd><p class="startdd">INTEGER. k value for the top k accuracy that you want. </p>
<p class="enddd"></p>
</dd>
</dl>
<p><b>Output table</b> <br />
The output table contains the following columns: </p><table class="output">
<tr>
<th>id </th><td>SERIAL. Object ID. </td></tr>
<tr>
<th>name </th><td>TEXT PRIMARY KEY. Name of the object. Generated with the following pattern: top_(k)_accuracy. </td></tr>
<tr>
<th>description </th><td>TEXT. Description of the object. </td></tr>
<tr>
<th>object </th><td>BYTEA. Serialized Python object stored as a PostgreSQL binary data type. </td></tr>
</table>
<p><a class="anchor" id="example"></a></p><dl class="section user"><dt>Examples</dt><dd><ol type="1">
<li>Load object using psycopg2. Psycopg is a PostgreSQL database adapter for the Python programming language. Note need to use the psycopg2.Binary() method to pass as bytes. <pre class="example">
# import database connector psycopg2 and create connection cursor
import psycopg2 as p2
conn = p2.connect('postgresql://gpadmin:8000/madlib')
cur = conn.cursor()
# import Dill and define functions
import dill
# custom loss
def squared_error(y_true, y_pred):
import tensorflow.keras.backend as K
return K.square(y_pred - y_true)
pb_squared_error=dill.dumps(squared_error)
# custom metric
def rmse(y_true, y_pred):
import tensorflow.keras.backend as K
return K.sqrt(K.mean(K.square(y_pred - y_true), axis=-1))
pb_rmse=dill.dumps(rmse)
# call load function
cur.execute("DROP TABLE IF EXISTS madlib.custom_function_table")
cur.execute("SELECT madlib.load_custom_function('custom_function_table', %s,'squared_error', 'squared error')", [p2.Binary(pb_squared_error)])
cur.execute("SELECT madlib.load_custom_function('custom_function_table', %s,'rmse', 'root mean square error')", [p2.Binary(pb_rmse)])
conn.commit()
</pre> List table to see objects: <pre class="example">
SELECT id, name, description FROM madlib.custom_function_table ORDER BY id;
</pre> <pre class="result">
id | name | description
----+---------------+------------------------
1 | squared_error | squared error
2 | rmse | root mean square error
</pre></li>
<li>Load object using a PL/Python function. First define the objects: <pre class="example">
CREATE OR REPLACE FUNCTION custom_function_squared_error()
RETURNS BYTEA AS
$$
import dill
def squared_error(y_true, y_pred):
import tensorflow.keras.backend as K
return K.square(y_pred - y_true)
pb_squared_error=dill.dumps(squared_error)
return pb_squared_error
$$ language plpythonu;
CREATE OR REPLACE FUNCTION custom_function_rmse()
RETURNS BYTEA AS
$$
import dill
def rmse(y_true, y_pred):
import tensorflow.keras.backend as K
return K.sqrt(K.mean(K.square(y_pred - y_true), axis=-1))
pb_rmse=dill.dumps(rmse)
return pb_rmse
$$ language plpythonu;
</pre> Now call loader: <pre class="result">
DROP TABLE IF EXISTS madlib.custom_function_table;
SELECT madlib.load_custom_function('custom_function_table',
custom_function_squared_error(),
'squared_error',
'squared error');
SELECT madlib.load_custom_function('custom_function_table',
custom_function_rmse(),
'rmse',
'root mean square error');
</pre></li>
<li>Delete an object by id: <pre class="example">
SELECT madlib.delete_custom_function( 'custom_function_table', 1);
SELECT id, name, description FROM madlib.custom_function_table ORDER BY id;
</pre> <pre class="result">
id | name | description
----+------+------------------------
2 | rmse | root mean square error
</pre> Delete an object by name: <pre class="example">
SELECT madlib.delete_custom_function( 'custom_function_table', 'rmse');
</pre> If all objects are deleted from the table using this function, the table itself will be dropped. </li>
<li>Load top 3 accuracy function followed by a top 10 accuracy function: <pre class="example">
DROP TABLE IF EXISTS custom_function_table;
SELECT madlib.load_top_k_accuracy_function('custom_function_table',
3);
SELECT madlib.load_top_k_accuracy_function('custom_function_table',
10);
SELECT id, name, description FROM madlib.custom_function_table ORDER BY id;
</pre> <pre class="result">
id | name | description
----+-----------------+-------------------------
1 | top_3_accuracy | returns top_3_accuracy
2 | top_10_accuracy | returns top_10_accuracy
</pre> <a class="anchor" id="literature"></a></li>
</ol>
</dd></dl>
<dl class="section user"><dt>Literature</dt><dd></dd></dl>
<p>[1] <a href="https://gpdb.docs.pivotal.io/latest/admin_guide/roles_privs.html">https://gpdb.docs.pivotal.io/latest/admin_guide/roles_privs.html</a></p>
<p>[2] <a href="https://www.postgresql.org/docs/current/ddl-priv.html">https://www.postgresql.org/docs/current/ddl-priv.html</a></p>
<p>[3] Python catalog for Dill package <a href="https://pypi.org/project/dill/">https://pypi.org/project/dill/</a></p>
<p>[4] <a href="https://keras.io/api/metrics/accuracy_metrics/#topkcategoricalaccuracy-class">https://keras.io/api/metrics/accuracy_metrics/#topkcategoricalaccuracy-class</a></p>
<p><a class="anchor" id="related"></a></p><dl class="section user"><dt>Related Topics</dt><dd></dd></dl>
<p>See <a class="el" href="madlib__keras__custom__function_8sql__in.html" title="Function to load serialized Python objects into a table. ">madlib_keras_custom_function.sql_in</a> </p>
</div><!-- contents -->
</div><!-- doc-content -->
<!-- start footer part -->
<div id="nav-path" class="navpath"><!-- id is needed for treeview function! -->
<ul>
<li class="footer">Generated on Thu Feb 23 2023 19:26:39 for MADlib by
<a href="http://www.doxygen.org/index.html">
<img class="footer" src="doxygen.png" alt="doxygen"/></a> 1.8.13 </li>
</ul>
</div>
</body>
</html>