blob: 0054e0aedbc94de455a8209bccee0541559594df [file] [log] [blame]
/* ----------------------------------------------------------------------- *//**
*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*
*
* @file create_indicators.sql_in
*
* @brief SQL functions for dummy coding categorical variables
* @date June 2014
*
* @sa Calculates dummy-coded indicator variables for categorical variables
*
*//* ----------------------------------------------------------------------- */
m4_include(`SQLCommon.m4')
/**
@addtogroup grp_indicator
\warning <em> This version of encoding categorical variables has been deprecated.
The new module with more capability can be found here \ref grp_encode_categorical</em>
<div class="toc"><b>Contents</b>
<ul>
<li><a href="#categorical">Coding systems for categorical variables</a></li>
<li><a href="#examples">Examples</a></li>
</ul>
</div>
@brief Provides utility functions helpful for data preparation before modeling
@anchor categorical
@par Coding systems for categorical variables
Categorical variables require special attention in regression analysis because,
unlike dichotomous or continuous variables, they cannot be entered into the
regression equation just as they are. For example, if you have a variable
called race that is coded 1 = Hispanic, 2 = Asian, 3 = Black, 4 = White, then
entering race in your regression will look at the linear effect of race, which
is probably not what you intended. Instead, categorical variables like this need
to be recoded into a series of indicator variables which can then be entered
into the regression model. There are a variety of coding systems (also called
as contrasts) that can be used when coding categorical variables. including
dummy, effects, orthogonal, and helmert coding.
We currently only support the dummy coding technique. Dummy coding is used when
a researcher wants to compare other groups of the predictor variable with one
specific group of the predictor variable. Often, the specific group to compare
with is called the reference group.
<pre class="syntax">
create_indicator_variables(
source_table,
output_table,
categorical_cols,
keep_null,
distributed_by
)
</pre>
\b Arguments
<dl class="arglist">
<dt>source_table</dt>
<dd>VARCHAR. Name of the source table, containing data for categorical variables.</dd>
<dt>output_table</dt>
<dd>VARCHAR. Name of result table. The output table has the same columns as
the original table, adding new indicator variable columns for each
categorical column. The column name for the indicator variable is
<em>'categorical column name'</em>_<em>'categorical value'</em>.
</dd>
<dt>categorical_cols </dt>
<dd>VARCHAR. Comma-separated string of column names of categorical variables
that need to be dummy-coded.</dd>
<dt>keep_null (optional)</dt>
<dd>BOOLEAN. default: FALSE. Whether 'NULL' should be treated as one of the
categories of the categorical variable. If True, then an indicator
variable is created corresponding to the NULL value. If False, then
all indicator variables for that record will be set to NULL.
</dd>
<dt>distributed_by (optional)</dt>
<dd>VARCHAR. default: NULL. Columns to use for the distribution policy of
the output table. When NULL, the distribution policy of 'source_table'
will be used. This argument is not available for POSTGRESQL platforms.</dd>
</dl>
@anchor examples
@examp
-# Use a subset of the abalone dataset.
<pre class="example">
DROP TABLE IF EXISTS abalone;
CREATE TABLE abalone (
sex character varying,
length double precision,
diameter double precision,
height double precision
);
COPY abalone (sex, length, diameter, height) FROM stdin WITH DELIMITER '|' NULL as '@';
M| 0.455 | 0.365 | 0.095
F| 0.53 | 0.42 | 0.135
M| 0.35 | 0.265 | 0.09
F| 0.53 | 0.415 | 0.15
M| 0.44 | 0.365 | 0.125
F| 0.545 | 0.425 | 0.125
I| 0.33 | 0.255 | 0.08
F| 0.55 | 0.44 | 0.15
I| 0.425 | 0.30 | 0.095
F| 0.525 | 0.38 | 0.140
M| 0.475 | 0.37 | 0.125
F| 0.535 | 0.405 | 0.145
M| 0.43 | 0.358 | 0.11
F| 0.47 | 0.355 | 0.100
M| 0.49 | 0.38 | 0.135
F| 0.44 | 0.340 | 0.100
M| 0.5 | 0.400 | 0.13
F| 0.565 | 0.44 | 0.155
I| 0.355 | 0.280 | 0.085
F| 0.550 | 0.415 | 0.135
@| 0.475 | 0.37 | 0.125
\\.
</pre>
-# Create new table with dummy-coded indicator variables
<pre class="example">
drop table if exists abalone_out;
select madlib.create_indicator_variables ('abalone', 'abalone_out', 'sex');
select * from abalone_out;
</pre>
<pre class="result">
sex | length | diameter | height | sex_F | sex_I | sex_M
&nbsp; -----+--------+----------+--------+--------+--------+-------
F | 0.53 | 0.42 | 0.135 | 1 | 0 | 0
F | 0.53 | 0.415 | 0.15 | 1 | 0 | 0
F | 0.545 | 0.425 | 0.125 | 1 | 0 | 0
F | 0.55 | 0.44 | 0.15 | 1 | 0 | 0
F | 0.525 | 0.38 | 0.14 | 1 | 0 | 0
F | 0.535 | 0.405 | 0.145 | 1 | 0 | 0
F | 0.47 | 0.355 | 0.1 | 1 | 0 | 0
F | 0.44 | 0.34 | 0.1 | 1 | 0 | 0
F | 0.565 | 0.44 | 0.155 | 1 | 0 | 0
F | 0.55 | 0.415 | 0.135 | 1 | 0 | 0
M | 0.455 | 0.365 | 0.095 | 0 | 0 | 1
M | 0.35 | 0.265 | 0.09 | 0 | 0 | 0
M | 0.44 | 0.365 | 0.125 | 0 | 0 | 0
I | 0.33 | 0.255 | 0.08 | 0 | 1 | 0
I | 0.425 | 0.3 | 0.095 | 0 | 1 | 0
M | 0.475 | 0.37 | 0.125 | 0 | 0 | 0
M | 0.43 | 0.358 | 0.11 | 0 | 0 | 0
M | 0.49 | 0.38 | 0.135 | 0 | 0 | 0
M | 0.5 | 0.4 | 0.13 | 0 | 0 | 0
I | 0.355 | 0.28 | 0.085 | 0 | 1 | 0
NULL | 0.55 | 0.415 | 0.135 | NULL | NULL | NULL
</pre>
-# Create indicator variable for 'NULL' value (note the additional column '"sex_NULL"')
<pre class="example">
drop table if exists abalone_out;
select madlib.create_indicator_variables'abalone', 'abalone_out', 'sex', True);
select * from abalone_out;
</pre>
<pre class="result">
sex | length | diameter | height | sex_F | sex_I | sex_M | sex_NULL
&nbsp; ------+--------+----------+--------+--------+--------+-------+-------
F | 0.53 | 0.42 | 0.135 | 1 | 0 | 0 | 0
F | 0.53 | 0.415 | 0.15 | 1 | 0 | 0 | 0
F | 0.545 | 0.425 | 0.125 | 1 | 0 | 0 | 0
F | 0.55 | 0.44 | 0.15 | 1 | 0 | 0 | 0
F | 0.525 | 0.38 | 0.14 | 1 | 0 | 0 | 0
F | 0.535 | 0.405 | 0.145 | 1 | 0 | 0 | 0
F | 0.47 | 0.355 | 0.1 | 1 | 0 | 0 | 0
F | 0.44 | 0.34 | 0.1 | 1 | 0 | 0 | 0
F | 0.565 | 0.44 | 0.155 | 1 | 0 | 0 | 0
F | 0.55 | 0.415 | 0.135 | 1 | 0 | 0 | 0
M | 0.455 | 0.365 | 0.095 | 0 | 0 | 1 | 0
M | 0.35 | 0.265 | 0.09 | 0 | 0 | 0 | 0
M | 0.44 | 0.365 | 0.125 | 0 | 0 | 0 | 0
I | 0.33 | 0.255 | 0.08 | 0 | 1 | 0 | 0
I | 0.425 | 0.3 | 0.095 | 0 | 1 | 0 | 0
M | 0.475 | 0.37 | 0.125 | 0 | 0 | 0 | 0
M | 0.43 | 0.358 | 0.11 | 0 | 0 | 0 | 0
M | 0.49 | 0.38 | 0.135 | 0 | 0 | 0 | 0
M | 0.5 | 0.4 | 0.13 | 0 | 0 | 0 | 0
I | 0.355 | 0.28 | 0.085 | 0 | 1 | 0 | 0
NULL | 0.55 | 0.415 | 0.135 | 0 | 0 | 0 | 1
</pre>
*/
-------------------------------------------------------------------------
/**
* @brief Create new table containing dummy coded variables for categorical variables
*
* @param source_table Name of table containing categorical variable
* @param out_table Name of table to output dummy variables
* @param categorical_cols Comma-separated list of column names to dummy code
* @param keep_null Boolean to determine the behavior for rows with NULL value
* @param distributed_by Comma-separated list of column names to use for distribution of output
*
* @return Void
*
*/
-- We don't create the below table for PostgreSQL since it does contain a
-- distribution policy.
m4_changequote(<!,!>)
m4_ifdef(<!__POSTGRESQL__!>, <!!>, <!
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.create_indicator_variables(
source_table TEXT,
out_table TEXT,
categorical_cols TEXT,
keep_null BOOLEAN,
distributed_by TEXT
) RETURNS VOID AS $$
PythonFunction(utilities, create_indicators, create_indicator_variables)
$$ LANGUAGE plpythonu
m4_ifdef(<!__HAS_FUNCTION_PROPERTIES__!>, <!MODIFIES SQL DATA!>, <!!>);
!>)
m4_changequote(<!`!>, <!'!>)
/**
* @brief Create new table containing dummy coded variables for categorical variables
*
* @param source_table Name of table containing categorical variable
* @param out_table Name of table to output dummy variables
* @param categorical_cols Comma-separated list of column names to dummy code
* @param keep_null Boolean to determine the behavior for rows with NULL value
*
* @return Void
*
*/
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.create_indicator_variables(
source_table TEXT,
out_table TEXT,
categorical_cols TEXT,
keep_null BOOLEAN
) RETURNS VOID AS $$
PythonFunction(utilities, create_indicators, create_indicator_variables)
$$ LANGUAGE plpythonu
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.create_indicator_variables(
source_table TEXT,
out_table TEXT,
categorical_cols TEXT
) RETURNS VOID AS $$
SELECT MADLIB_SCHEMA.create_indicator_variables($1, $2, $3, False)
$$ LANGUAGE SQL
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
-- Online help
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.create_indicator_variables(
message VARCHAR
) RETURNS VARCHAR AS $$
PythonFunction(utilities, create_indicators, indicator_variables_help)
$$ LANGUAGE plpythonu IMMUTABLE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `CONTAINS SQL', `');
--------------------------------------------------------------------------------
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.create_indicator_variables()
RETURNS VARCHAR AS $$
SELECT MADLIB_SCHEMA.create_indicator_variables('');
$$ LANGUAGE sql IMMUTABLE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `CONTAINS SQL', `');
--------------------------------------------------------------------------------