| <!-- HTML header for doxygen 1.8.4--> |
| <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd"> |
| <html xmlns="http://www.w3.org/1999/xhtml"> |
| <head> |
| <meta http-equiv="Content-Type" content="text/xhtml;charset=UTF-8"/> |
| <meta http-equiv="X-UA-Compatible" content="IE=9"/> |
| <meta name="generator" content="Doxygen 1.8.4"/> |
| <meta name="keywords" content="madlib,postgres,greenplum,machine learning,data mining,deep learning,ensemble methods,data science,market basket analysis,affinity analysis,pca,lda,regression,elastic net,huber white,proportional hazards,k-means,latent dirichlet allocation,bayes,support vector machines,svm"/> |
| <title>MADlib: Data Preparation</title> |
| <link href="tabs.css" rel="stylesheet" type="text/css"/> |
| <script type="text/javascript" src="jquery.js"></script> |
| <script type="text/javascript" src="dynsections.js"></script> |
| <link href="navtree.css" rel="stylesheet" type="text/css"/> |
| <script type="text/javascript" src="resize.js"></script> |
| <script type="text/javascript" src="navtree.js"></script> |
| <script type="text/javascript"> |
| $(document).ready(initResizable); |
| $(window).load(resizeHeight); |
| </script> |
| <link href="search/search.css" rel="stylesheet" type="text/css"/> |
| <script type="text/javascript" src="search/search.js"></script> |
| <script type="text/javascript"> |
| $(document).ready(function() { searchBox.OnSelectItem(0); }); |
| </script> |
| <script type="text/x-mathjax-config"> |
| MathJax.Hub.Config({ |
| extensions: ["tex2jax.js", "TeX/AMSmath.js", "TeX/AMSsymbols.js"], |
| jax: ["input/TeX","output/HTML-CSS"], |
| }); |
| </script><script src="../mathjax/MathJax.js"></script> |
| <link href="doxygen.css" rel="stylesheet" type="text/css" /> |
| <link href="madlib_extra.css" rel="stylesheet" type="text/css"/> |
| </head> |
| <body> |
| <div id="top"><!-- do not remove this div, it is closed by doxygen! --> |
| <div id="titlearea"> |
| <table cellspacing="0" cellpadding="0"> |
| <tbody> |
| <tr style="height: 56px;"> |
| <td id="projectlogo"><a href="http://madlib.incubator.apache.org"><img alt="Logo" src="madlib.png" height="50" style="padding-left:0.5em;" border="0"/ ></a></td> |
| <td style="padding-left: 0.5em;"> |
| <div id="projectname"> |
| <span id="projectnumber">1.7</span> <span style="font-size:10pt; font-style:italic"><a href="../latest/./group__grp__data__prep.html"> A newer version is available</a></span> |
| </div> |
| <div id="projectbrief">User Documentation</div> |
| </td> |
| <td> <div id="MSearchBox" class="MSearchBoxInactive"> |
| <span class="left"> |
| <img id="MSearchSelect" src="search/mag_sel.png" |
| onmouseover="return searchBox.OnSearchSelectShow()" |
| onmouseout="return searchBox.OnSearchSelectHide()" |
| alt=""/> |
| <input type="text" id="MSearchField" value="Search" accesskey="S" |
| onfocus="searchBox.OnSearchFieldFocus(true)" |
| onblur="searchBox.OnSearchFieldFocus(false)" |
| onkeyup="searchBox.OnSearchFieldChange(event)"/> |
| </span><span class="right"> |
| <a id="MSearchClose" href="javascript:searchBox.CloseResultsWindow()"><img id="MSearchCloseImg" border="0" src="search/close.png" alt=""/></a> |
| </span> |
| </div> |
| </td> |
| </tr> |
| </tbody> |
| </table> |
| </div> |
| <!-- end header part --> |
| <!-- Generated by Doxygen 1.8.4 --> |
| <script type="text/javascript"> |
| var searchBox = new SearchBox("searchBox", "search",false,'Search'); |
| </script> |
| </div><!-- top --> |
| <div id="side-nav" class="ui-resizable side-nav-resizable"> |
| <div id="nav-tree"> |
| <div id="nav-tree-contents"> |
| <div id="nav-sync" class="sync"></div> |
| </div> |
| </div> |
| <div id="splitbar" style="-moz-user-select:none;" |
| class="ui-resizable-handle"> |
| </div> |
| </div> |
| <script type="text/javascript"> |
| $(document).ready(function(){initNavTree('group__grp__data__prep.html','');}); |
| </script> |
| <div id="doc-content"> |
| <!-- window showing the filter options --> |
| <div id="MSearchSelectWindow" |
| onmouseover="return searchBox.OnSearchSelectShow()" |
| onmouseout="return searchBox.OnSearchSelectHide()" |
| onkeydown="return searchBox.OnSearchSelectKey(event)"> |
| <a class="SelectItem" href="javascript:void(0)" onclick="searchBox.OnSelectItem(0)"><span class="SelectionMark"> </span>All</a><a class="SelectItem" href="javascript:void(0)" onclick="searchBox.OnSelectItem(1)"><span class="SelectionMark"> </span>Files</a><a class="SelectItem" href="javascript:void(0)" onclick="searchBox.OnSelectItem(2)"><span class="SelectionMark"> </span>Functions</a><a class="SelectItem" href="javascript:void(0)" onclick="searchBox.OnSelectItem(3)"><span class="SelectionMark"> </span>Groups</a></div> |
| |
| <!-- iframe showing the search results (closed by default) --> |
| <div id="MSearchResultsWindow"> |
| <iframe src="javascript:void(0)" frameborder="0" |
| name="MSearchResults" id="MSearchResults"> |
| </iframe> |
| </div> |
| |
| <div class="header"> |
| <div class="headertitle"> |
| <div class="title">Data Preparation<div class="ingroups"><a class="el" href="group__grp__support.html">Support Modules</a></div></div> </div> |
| </div><!--header--> |
| <div class="contents"> |
| <div class="toc"><b>Contents</b> </p> |
| <ul> |
| <li> |
| <a href="#categorical">Coding systems for categorical variables</a> </li> |
| <li> |
| <a href="#examples">Examples</a> </li> |
| </ul> |
| </div><p><a class="anchor" id="categorical"></a></p> |
| <dl class="section user"><dt>Coding systems for categorical variables</dt><dd>Categorical variables require special attention in regression analysis because, unlike dichotomous or continuous variables, they cannot be entered into the regression equation just as they are. For example, if you have a variable called race that is coded 1 = Hispanic, 2 = Asian, 3 = Black, 4 = White, then entering race in your regression will look at the linear effect of race, which is probably not what you intended. Instead, categorical variables like this need to be recoded into a series of indicator variables which can then be entered into the regression model. There are a variety of coding systems (also called as contrasts) that can be used when coding categorical variables. including dummy, effects, orthogonal, and helmert coding.</dd></dl> |
| <p>We currently only support the dummy coding technique. Dummy coding is used when a researcher wants to compare other groups of the predictor variable with one specific group of the predictor variable. Often, the specific group to compare with is called the reference group.</p> |
| <pre class="syntax"> |
| create_indicator_variables( |
| source_table, |
| output_table, |
| categorical_cols, |
| keep_null, |
| distributed_by |
| ) |
| </pre><p> <b>Arguments</b> </p> |
| <dl class="arglist"> |
| <dt>source_table </dt> |
| <dd>VARCHAR. Name of the source table, containing data for categorical variables. </dd> |
| <dt>output_table </dt> |
| <dd>VARCHAR. Name of result table. The output table has the same columns as the original table, adding new indicator variable columns for each categorical column. The column name for the indicator variable is <em>'categorical column name'</em>_<em>'categorical value'</em>. </dd> |
| <dt>categorical_cols </dt> |
| <dd>VARCHAR. Comma-separated string of column names of categorical variables that need to be dummy-coded. </dd> |
| <dt>keep_null (optional) </dt> |
| <dd>BOOLEAN. default: FALSE. Whether 'NULL' should be treated as one of the categories of the categorical variable. If True, then an indicator variable is created corresponding to the NULL value. If False, then all indicator variables for that record will be set to NULL. </dd> |
| <dt>distributed_by (optional) </dt> |
| <dd>VARCHAR. default: NULL. Columns to use for the distribution policy of the output table. When NULL, the distribution policy of 'source_table' will be used. This argument is not available for POSTGRESQL platforms. </dd> |
| </dl> |
| <p><a class="anchor" id="examples"></a></p> |
| <dl class="section user"><dt>Examples</dt><dd><ol type="1"> |
| <li>Use a subset of the abalone dataset. <pre class="example"> |
| DROP TABLE IF EXISTS abalone; |
| CREATE TABLE abalone ( |
| sex character varying, |
| length double precision, |
| diameter double precision, |
| height double precision |
| ); |
| COPY abalone (sex, length, diameter, height) FROM stdin WITH DELIMITER '|' NULL as '@'; |
| M| 0.455 | 0.365 | 0.095 |
| F| 0.53 | 0.42 | 0.135 |
| M| 0.35 | 0.265 | 0.09 |
| F| 0.53 | 0.415 | 0.15 |
| M| 0.44 | 0.365 | 0.125 |
| F| 0.545 | 0.425 | 0.125 |
| I| 0.33 | 0.255 | 0.08 |
| F| 0.55 | 0.44 | 0.15 |
| I| 0.425 | 0.30 | 0.095 |
| F| 0.525 | 0.38 | 0.140 |
| M| 0.475 | 0.37 | 0.125 |
| F| 0.535 | 0.405 | 0.145 |
| M| 0.43 | 0.358 | 0.11 |
| F| 0.47 | 0.355 | 0.100 |
| M| 0.49 | 0.38 | 0.135 |
| F| 0.44 | 0.340 | 0.100 |
| M| 0.5 | 0.400 | 0.13 |
| F| 0.565 | 0.44 | 0.155 |
| I| 0.355 | 0.280 | 0.085 |
| F| 0.550 | 0.415 | 0.135 |
| | 0.475 | 0.37 | 0.125 |
| \. |
| </pre></li> |
| <li>Create new table with dummy-coded indicator variables <pre class="example"> |
| drop table if exists abalone_out; |
| select madlib.create_indicator_variables ('abalone', 'abalone_out', 'sex'); |
| select * from abalone_out; |
| </pre> <pre class="result"> |
| sex | length | diameter | height | sex_F | sex_I | sex_M |
|   -----+--------+----------+--------+--------+--------+------- |
| F | 0.53 | 0.42 | 0.135 | 1 | 0 | 0 |
| F | 0.53 | 0.415 | 0.15 | 1 | 0 | 0 |
| F | 0.545 | 0.425 | 0.125 | 1 | 0 | 0 |
| F | 0.55 | 0.44 | 0.15 | 1 | 0 | 0 |
| F | 0.525 | 0.38 | 0.14 | 1 | 0 | 0 |
| F | 0.535 | 0.405 | 0.145 | 1 | 0 | 0 |
| F | 0.47 | 0.355 | 0.1 | 1 | 0 | 0 |
| F | 0.44 | 0.34 | 0.1 | 1 | 0 | 0 |
| F | 0.565 | 0.44 | 0.155 | 1 | 0 | 0 |
| F | 0.55 | 0.415 | 0.135 | 1 | 0 | 0 |
| M | 0.455 | 0.365 | 0.095 | 0 | 0 | 1 |
| M | 0.35 | 0.265 | 0.09 | 0 | 0 | 0 |
| M | 0.44 | 0.365 | 0.125 | 0 | 0 | 0 |
| I | 0.33 | 0.255 | 0.08 | 0 | 1 | 0 |
| I | 0.425 | 0.3 | 0.095 | 0 | 1 | 0 |
| M | 0.475 | 0.37 | 0.125 | 0 | 0 | 0 |
| M | 0.43 | 0.358 | 0.11 | 0 | 0 | 0 |
| M | 0.49 | 0.38 | 0.135 | 0 | 0 | 0 |
| M | 0.5 | 0.4 | 0.13 | 0 | 0 | 0 |
| I | 0.355 | 0.28 | 0.085 | 0 | 1 | 0 |
| NULL | 0.55 | 0.415 | 0.135 | NULL | NULL | NULL |
| </pre></li> |
| <li>Create indicator variable for 'NULL' value (note the additional column '"sex_NULL"') <pre class="example"> |
| drop table if exists abalone_out; |
| select madlib.create_indicator_variables'abalone', 'abalone_out', 'sex', True); |
| select * from abalone_out; |
| </pre> <pre class="result"> |
| sex | length | diameter | height | sex_F | sex_I | sex_M | sex_NULL |
|   ---—+-----—+-------—+-----—+-----—+-----—+----—+----— |
| F | 0.53 | 0.42 | 0.135 | 1 | 0 | 0 | 0 |
| F | 0.53 | 0.415 | 0.15 | 1 | 0 | 0 | 0 |
| F | 0.545 | 0.425 | 0.125 | 1 | 0 | 0 | 0 |
| F | 0.55 | 0.44 | 0.15 | 1 | 0 | 0 | 0 |
| F | 0.525 | 0.38 | 0.14 | 1 | 0 | 0 | 0 |
| F | 0.535 | 0.405 | 0.145 | 1 | 0 | 0 | 0 |
| F | 0.47 | 0.355 | 0.1 | 1 | 0 | 0 | 0 |
| F | 0.44 | 0.34 | 0.1 | 1 | 0 | 0 | 0 |
| F | 0.565 | 0.44 | 0.155 | 1 | 0 | 0 | 0 |
| F | 0.55 | 0.415 | 0.135 | 1 | 0 | 0 | 0 |
| M | 0.455 | 0.365 | 0.095 | 0 | 0 | 1 | 0 |
| M | 0.35 | 0.265 | 0.09 | 0 | 0 | 0 | 0 |
| M | 0.44 | 0.365 | 0.125 | 0 | 0 | 0 | 0 |
| I | 0.33 | 0.255 | 0.08 | 0 | 1 | 0 | 0 |
| I | 0.425 | 0.3 | 0.095 | 0 | 1 | 0 | 0 |
| M | 0.475 | 0.37 | 0.125 | 0 | 0 | 0 | 0 |
| M | 0.43 | 0.358 | 0.11 | 0 | 0 | 0 | 0 |
| M | 0.49 | 0.38 | 0.135 | 0 | 0 | 0 | 0 |
| M | 0.5 | 0.4 | 0.13 | 0 | 0 | 0 | 0 |
| I | 0.355 | 0.28 | 0.085 | 0 | 1 | 0 | 0 |
| NULL | 0.55 | 0.415 | 0.135 | 0 | 0 | 0 | 1 |
| </pre> </li> |
| </ol> |
| </dd></dl> |
| </div><!-- contents --> |
| </div><!-- doc-content --> |
| <!-- start footer part --> |
| <div id="nav-path" class="navpath"><!-- id is needed for treeview function! --> |
| <ul> |
| <li class="footer">Generated on Tue Dec 30 2014 21:44:13 for MADlib by |
| <a href="http://www.doxygen.org/index.html"> |
| <img class="footer" src="doxygen.png" alt="doxygen"/></a> 1.8.4 </li> |
| </ul> |
| </div> |
| </body> |
| </html> |