| <!-- HTML header for doxygen 1.8.4--> |
| <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd"> |
| <html xmlns="http://www.w3.org/1999/xhtml"> |
| <head> |
| <meta http-equiv="Content-Type" content="text/xhtml;charset=UTF-8"/> |
| <meta http-equiv="X-UA-Compatible" content="IE=9"/> |
| <meta name="generator" content="Doxygen 1.8.10"/> |
| <meta name="keywords" content="madlib,postgres,greenplum,machine learning,data mining,deep learning,ensemble methods,data science,market basket analysis,affinity analysis,pca,lda,regression,elastic net,huber white,proportional hazards,k-means,latent dirichlet allocation,bayes,support vector machines,svm"/> |
| <title>MADlib: Encoding Categorical Variables</title> |
| <link href="tabs.css" rel="stylesheet" type="text/css"/> |
| <script type="text/javascript" src="jquery.js"></script> |
| <script type="text/javascript" src="dynsections.js"></script> |
| <link href="navtree.css" rel="stylesheet" type="text/css"/> |
| <script type="text/javascript" src="resize.js"></script> |
| <script type="text/javascript" src="navtreedata.js"></script> |
| <script type="text/javascript" src="navtree.js"></script> |
| <script type="text/javascript"> |
| $(document).ready(initResizable); |
| $(window).load(resizeHeight); |
| </script> |
| <link href="search/search.css" rel="stylesheet" type="text/css"/> |
| <script type="text/javascript" src="search/searchdata.js"></script> |
| <script type="text/javascript" src="search/search.js"></script> |
| <script type="text/javascript"> |
| $(document).ready(function() { init_search(); }); |
| </script> |
| <script type="text/x-mathjax-config"> |
| MathJax.Hub.Config({ |
| extensions: ["tex2jax.js", "TeX/AMSmath.js", "TeX/AMSsymbols.js"], |
| jax: ["input/TeX","output/HTML-CSS"], |
| }); |
| </script><script src="../mathjax/MathJax.js"></script> |
| <!-- hack in the navigation tree --> |
| <script type="text/javascript" src="navtree_hack.js"></script> |
| <link href="doxygen.css" rel="stylesheet" type="text/css" /> |
| <link href="madlib_extra.css" rel="stylesheet" type="text/css"/> |
| <!-- google analytics --> |
| <script> |
| (function(i,s,o,g,r,a,m){i['GoogleAnalyticsObject']=r;i[r]=i[r]||function(){ |
| (i[r].q=i[r].q||[]).push(arguments)},i[r].l=1*new Date();a=s.createElement(o), |
| m=s.getElementsByTagName(o)[0];a.async=1;a.src=g;m.parentNode.insertBefore(a,m) |
| })(window,document,'script','//www.google-analytics.com/analytics.js','ga'); |
| ga('create', 'UA-45382226-1', 'auto'); |
| ga('send', 'pageview'); |
| </script> |
| </head> |
| <body> |
| <div id="top"><!-- do not remove this div, it is closed by doxygen! --> |
| <div id="titlearea"> |
| <table cellspacing="0" cellpadding="0"> |
| <tbody> |
| <tr style="height: 56px;"> |
| <td id="projectlogo"><a href="http://madlib.incubator.apache.org"><img alt="Logo" src="madlib.png" height="50" style="padding-left:0.5em;" border="0"/ ></a></td> |
| <td style="padding-left: 0.5em;"> |
| <div id="projectname"> |
| <span id="projectnumber">1.8</span> |
| </div> |
| <div id="projectbrief">User Documentation for MADlib</div> |
| </td> |
| <td> <div id="MSearchBox" class="MSearchBoxInactive"> |
| <span class="left"> |
| <img id="MSearchSelect" src="search/mag_sel.png" |
| onmouseover="return searchBox.OnSearchSelectShow()" |
| onmouseout="return searchBox.OnSearchSelectHide()" |
| alt=""/> |
| <input type="text" id="MSearchField" value="Search" accesskey="S" |
| onfocus="searchBox.OnSearchFieldFocus(true)" |
| onblur="searchBox.OnSearchFieldFocus(false)" |
| onkeyup="searchBox.OnSearchFieldChange(event)"/> |
| </span><span class="right"> |
| <a id="MSearchClose" href="javascript:searchBox.CloseResultsWindow()"><img id="MSearchCloseImg" border="0" src="search/close.png" alt=""/></a> |
| </span> |
| </div> |
| </td> |
| </tr> |
| </tbody> |
| </table> |
| </div> |
| <!-- end header part --> |
| <!-- Generated by Doxygen 1.8.10 --> |
| <script type="text/javascript"> |
| var searchBox = new SearchBox("searchBox", "search",false,'Search'); |
| </script> |
| </div><!-- top --> |
| <div id="side-nav" class="ui-resizable side-nav-resizable"> |
| <div id="nav-tree"> |
| <div id="nav-tree-contents"> |
| <div id="nav-sync" class="sync"></div> |
| </div> |
| </div> |
| <div id="splitbar" style="-moz-user-select:none;" |
| class="ui-resizable-handle"> |
| </div> |
| </div> |
| <script type="text/javascript"> |
| $(document).ready(function(){initNavTree('group__grp__data__prep.html','');}); |
| </script> |
| <div id="doc-content"> |
| <!-- window showing the filter options --> |
| <div id="MSearchSelectWindow" |
| onmouseover="return searchBox.OnSearchSelectShow()" |
| onmouseout="return searchBox.OnSearchSelectHide()" |
| onkeydown="return searchBox.OnSearchSelectKey(event)"> |
| </div> |
| |
| <!-- iframe showing the search results (closed by default) --> |
| <div id="MSearchResultsWindow"> |
| <iframe src="javascript:void(0)" frameborder="0" |
| name="MSearchResults" id="MSearchResults"> |
| </iframe> |
| </div> |
| |
| <div class="header"> |
| <div class="headertitle"> |
| <div class="title">Encoding Categorical Variables<div class="ingroups"><a class="el" href="group__grp__utility__functions.html">Utility Functions</a></div></div> </div> |
| </div><!--header--> |
| <div class="contents"> |
| <div class="toc"><b>Contents</b> </p><ul> |
| <li> |
| <a href="#categorical">Coding systems for categorical variables</a> </li> |
| <li> |
| <a href="#examples">Examples</a> </li> |
| </ul> |
| </div><p><a class="anchor" id="categorical"></a></p><dl class="section user"><dt>Coding systems for categorical variables</dt><dd>Categorical variables require special attention in regression analysis because, unlike dichotomous or continuous variables, they cannot be entered into the regression equation just as they are. For example, if you have a variable called race that is coded 1 = Hispanic, 2 = Asian, 3 = Black, 4 = White, then entering race in your regression will look at the linear effect of race, which is probably not what you intended. Instead, categorical variables like this need to be recoded into a series of indicator variables which can then be entered into the regression model. There are a variety of coding systems (also called as contrasts) that can be used when coding categorical variables. including dummy, effects, orthogonal, and helmert coding.</dd></dl> |
| <p>We currently only support the dummy coding technique. Dummy coding is used when a researcher wants to compare other groups of the predictor variable with one specific group of the predictor variable. Often, the specific group to compare with is called the reference group.</p> |
| <pre class="syntax"> |
| create_indicator_variables( |
| source_table, |
| output_table, |
| categorical_cols, |
| keep_null, |
| distributed_by |
| ) |
| </pre><p> <b>Arguments</b> </p><dl class="arglist"> |
| <dt>source_table </dt> |
| <dd>VARCHAR. Name of the source table, containing data for categorical variables. </dd> |
| <dt>output_table </dt> |
| <dd>VARCHAR. Name of result table. The output table has the same columns as the original table, adding new indicator variable columns for each categorical column. The column name for the indicator variable is <em>'categorical column name'</em>_<em>'categorical value'</em>. </dd> |
| <dt>categorical_cols </dt> |
| <dd>VARCHAR. Comma-separated string of column names of categorical variables that need to be dummy-coded. </dd> |
| <dt>keep_null (optional) </dt> |
| <dd>BOOLEAN. default: FALSE. Whether 'NULL' should be treated as one of the categories of the categorical variable. If True, then an indicator variable is created corresponding to the NULL value. If False, then all indicator variables for that record will be set to NULL. </dd> |
| <dt>distributed_by (optional) </dt> |
| <dd>VARCHAR. default: NULL. Columns to use for the distribution policy of the output table. When NULL, the distribution policy of 'source_table' will be used. This argument is not available for POSTGRESQL platforms. </dd> |
| </dl> |
| <p><a class="anchor" id="examples"></a></p><dl class="section user"><dt>Examples</dt><dd></dd></dl> |
| <ol type="1"> |
| <li>Use a subset of the abalone dataset. <pre class="example"> |
| DROP TABLE IF EXISTS abalone; |
| CREATE TABLE abalone ( |
| sex character varying, |
| length double precision, |
| diameter double precision, |
| height double precision |
| ); |
| COPY abalone (sex, length, diameter, height) FROM stdin WITH DELIMITER '|' NULL as '@'; |
| M| 0.455 | 0.365 | 0.095 |
| F| 0.53 | 0.42 | 0.135 |
| M| 0.35 | 0.265 | 0.09 |
| F| 0.53 | 0.415 | 0.15 |
| M| 0.44 | 0.365 | 0.125 |
| F| 0.545 | 0.425 | 0.125 |
| I| 0.33 | 0.255 | 0.08 |
| F| 0.55 | 0.44 | 0.15 |
| I| 0.425 | 0.30 | 0.095 |
| F| 0.525 | 0.38 | 0.140 |
| M| 0.475 | 0.37 | 0.125 |
| F| 0.535 | 0.405 | 0.145 |
| M| 0.43 | 0.358 | 0.11 |
| F| 0.47 | 0.355 | 0.100 |
| M| 0.49 | 0.38 | 0.135 |
| F| 0.44 | 0.340 | 0.100 |
| M| 0.5 | 0.400 | 0.13 |
| F| 0.565 | 0.44 | 0.155 |
| I| 0.355 | 0.280 | 0.085 |
| F| 0.550 | 0.415 | 0.135 |
| | 0.475 | 0.37 | 0.125 |
| \. |
| </pre></li> |
| <li>Create new table with dummy-coded indicator variables <pre class="example"> |
| drop table if exists abalone_out; |
| select madlib.create_indicator_variables ('abalone', 'abalone_out', 'sex'); |
| select * from abalone_out; |
| </pre> <pre class="result"> |
| sex | length | diameter | height | sex_F | sex_I | sex_M |
|   -----+--------+----------+--------+--------+--------+------- |
| F | 0.53 | 0.42 | 0.135 | 1 | 0 | 0 |
| F | 0.53 | 0.415 | 0.15 | 1 | 0 | 0 |
| F | 0.545 | 0.425 | 0.125 | 1 | 0 | 0 |
| F | 0.55 | 0.44 | 0.15 | 1 | 0 | 0 |
| F | 0.525 | 0.38 | 0.14 | 1 | 0 | 0 |
| F | 0.535 | 0.405 | 0.145 | 1 | 0 | 0 |
| F | 0.47 | 0.355 | 0.1 | 1 | 0 | 0 |
| F | 0.44 | 0.34 | 0.1 | 1 | 0 | 0 |
| F | 0.565 | 0.44 | 0.155 | 1 | 0 | 0 |
| F | 0.55 | 0.415 | 0.135 | 1 | 0 | 0 |
| M | 0.455 | 0.365 | 0.095 | 0 | 0 | 1 |
| M | 0.35 | 0.265 | 0.09 | 0 | 0 | 0 |
| M | 0.44 | 0.365 | 0.125 | 0 | 0 | 0 |
| I | 0.33 | 0.255 | 0.08 | 0 | 1 | 0 |
| I | 0.425 | 0.3 | 0.095 | 0 | 1 | 0 |
| M | 0.475 | 0.37 | 0.125 | 0 | 0 | 0 |
| M | 0.43 | 0.358 | 0.11 | 0 | 0 | 0 |
| M | 0.49 | 0.38 | 0.135 | 0 | 0 | 0 |
| M | 0.5 | 0.4 | 0.13 | 0 | 0 | 0 |
| I | 0.355 | 0.28 | 0.085 | 0 | 1 | 0 |
| NULL | 0.55 | 0.415 | 0.135 | NULL | NULL | NULL |
| </pre></li> |
| <li>Create indicator variable for 'NULL' value (note the additional column '"sex_NULL"') <pre class="example"> |
| drop table if exists abalone_out; |
| select madlib.create_indicator_variables'abalone', 'abalone_out', 'sex', True); |
| select * from abalone_out; |
| </pre> <pre class="result"> |
| sex | length | diameter | height | sex_F | sex_I | sex_M | sex_NULL |
|   ---—+-----—+-------—+-----—+-----—+-----—+----—+----— |
| F | 0.53 | 0.42 | 0.135 | 1 | 0 | 0 | 0 |
| F | 0.53 | 0.415 | 0.15 | 1 | 0 | 0 | 0 |
| F | 0.545 | 0.425 | 0.125 | 1 | 0 | 0 | 0 |
| F | 0.55 | 0.44 | 0.15 | 1 | 0 | 0 | 0 |
| F | 0.525 | 0.38 | 0.14 | 1 | 0 | 0 | 0 |
| F | 0.535 | 0.405 | 0.145 | 1 | 0 | 0 | 0 |
| F | 0.47 | 0.355 | 0.1 | 1 | 0 | 0 | 0 |
| F | 0.44 | 0.34 | 0.1 | 1 | 0 | 0 | 0 |
| F | 0.565 | 0.44 | 0.155 | 1 | 0 | 0 | 0 |
| F | 0.55 | 0.415 | 0.135 | 1 | 0 | 0 | 0 |
| M | 0.455 | 0.365 | 0.095 | 0 | 0 | 1 | 0 |
| M | 0.35 | 0.265 | 0.09 | 0 | 0 | 0 | 0 |
| M | 0.44 | 0.365 | 0.125 | 0 | 0 | 0 | 0 |
| I | 0.33 | 0.255 | 0.08 | 0 | 1 | 0 | 0 |
| I | 0.425 | 0.3 | 0.095 | 0 | 1 | 0 | 0 |
| M | 0.475 | 0.37 | 0.125 | 0 | 0 | 0 | 0 |
| M | 0.43 | 0.358 | 0.11 | 0 | 0 | 0 | 0 |
| M | 0.49 | 0.38 | 0.135 | 0 | 0 | 0 | 0 |
| M | 0.5 | 0.4 | 0.13 | 0 | 0 | 0 | 0 |
| I | 0.355 | 0.28 | 0.085 | 0 | 1 | 0 | 0 |
| NULL | 0.55 | 0.415 | 0.135 | 0 | 0 | 0 | 1 |
| </pre> </li> |
| </ol> |
| </div><!-- contents --> |
| </div><!-- doc-content --> |
| <!-- start footer part --> |
| <div id="nav-path" class="navpath"><!-- id is needed for treeview function! --> |
| <ul> |
| <li class="footer">Generated on Mon Jul 27 2015 20:37:45 for MADlib by |
| <a href="http://www.doxygen.org/index.html"> |
| <img class="footer" src="doxygen.png" alt="doxygen"/></a> 1.8.10 </li> |
| </ul> |
| </div> |
| </body> |
| </html> |