| /* ----------------------------------------------------------------------- *//** |
| * |
| * Licensed to the Apache Software Foundation (ASF) under one |
| * or more contributor license agreements. See the NOTICE file |
| * distributed with this work for additional information |
| * regarding copyright ownership. The ASF licenses this file |
| * to you under the Apache License, Version 2.0 (the |
| * "License"); you may not use this file except in compliance |
| * with the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, |
| * software distributed under the License is distributed on an |
| * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| * KIND, either express or implied. See the License for the |
| * specific language governing permissions and limitations |
| * under the License. |
| * |
| * |
| * @file create_indicators.sql_in |
| * |
| * @brief SQL functions for dummy coding categorical variables |
| * @date June 2014 |
| * |
| * @sa Calculates dummy-coded indicator variables for categorical variables |
| * |
| *//* ----------------------------------------------------------------------- */ |
| |
| m4_include(`SQLCommon.m4') |
| |
| |
| /** |
| @addtogroup grp_indicator |
| |
| \warning <em> This version of encoding categorical variables has been deprecated. |
| The new module with more capability can be found here \ref grp_encode_categorical</em> |
| |
| <div class="toc"><b>Contents</b> |
| <ul> |
| <li><a href="#categorical">Coding systems for categorical variables</a></li> |
| <li><a href="#examples">Examples</a></li> |
| </ul> |
| </div> |
| |
| @brief Provides utility functions helpful for data preparation before modeling |
| |
| @anchor categorical |
| @par Coding systems for categorical variables |
| Categorical variables require special attention in regression analysis because, |
| unlike dichotomous or continuous variables, they cannot be entered into the |
| regression equation just as they are. For example, if you have a variable |
| called race that is coded 1 = Hispanic, 2 = Asian, 3 = Black, 4 = White, then |
| entering race in your regression will look at the linear effect of race, which |
| is probably not what you intended. Instead, categorical variables like this need |
| to be recoded into a series of indicator variables which can then be entered |
| into the regression model. There are a variety of coding systems (also called |
| as contrasts) that can be used when coding categorical variables. including |
| dummy, effects, orthogonal, and helmert coding. |
| |
| We currently only support the dummy coding technique. Dummy coding is used when |
| a researcher wants to compare other groups of the predictor variable with one |
| specific group of the predictor variable. Often, the specific group to compare |
| with is called the reference group. |
| |
| <pre class="syntax"> |
| create_indicator_variables( |
| source_table, |
| output_table, |
| categorical_cols, |
| keep_null, |
| distributed_by |
| ) |
| </pre> |
| \b Arguments |
| <dl class="arglist"> |
| <dt>source_table</dt> |
| <dd>VARCHAR. Name of the source table, containing data for categorical variables.</dd> |
| <dt>output_table</dt> |
| <dd>VARCHAR. Name of result table. The output table has the same columns as |
| the original table, adding new indicator variable columns for each |
| categorical column. The column name for the indicator variable is |
| <em>'categorical column name'</em>_<em>'categorical value'</em>. |
| </dd> |
| <dt>categorical_cols </dt> |
| <dd>VARCHAR. Comma-separated string of column names of categorical variables |
| that need to be dummy-coded.</dd> |
| <dt>keep_null (optional)</dt> |
| <dd>BOOLEAN. default: FALSE. Whether 'NULL' should be treated as one of the |
| categories of the categorical variable. If True, then an indicator |
| variable is created corresponding to the NULL value. If False, then |
| all indicator variables for that record will be set to NULL. |
| </dd> |
| <dt>distributed_by (optional)</dt> |
| <dd>VARCHAR. default: NULL. Columns to use for the distribution policy of |
| the output table. When NULL, the distribution policy of 'source_table' |
| will be used. This argument is not available for POSTGRESQL platforms.</dd> |
| </dl> |
| |
| @anchor examples |
| @examp |
| |
| -# Use a subset of the abalone dataset. |
| <pre class="example"> |
| DROP TABLE IF EXISTS abalone; |
| CREATE TABLE abalone ( |
| sex character varying, |
| length double precision, |
| diameter double precision, |
| height double precision |
| ); |
| COPY abalone (sex, length, diameter, height) FROM stdin WITH DELIMITER '|' NULL as '@'; |
| M| 0.455 | 0.365 | 0.095 |
| F| 0.53 | 0.42 | 0.135 |
| M| 0.35 | 0.265 | 0.09 |
| F| 0.53 | 0.415 | 0.15 |
| M| 0.44 | 0.365 | 0.125 |
| F| 0.545 | 0.425 | 0.125 |
| I| 0.33 | 0.255 | 0.08 |
| F| 0.55 | 0.44 | 0.15 |
| I| 0.425 | 0.30 | 0.095 |
| F| 0.525 | 0.38 | 0.140 |
| M| 0.475 | 0.37 | 0.125 |
| F| 0.535 | 0.405 | 0.145 |
| M| 0.43 | 0.358 | 0.11 |
| F| 0.47 | 0.355 | 0.100 |
| M| 0.49 | 0.38 | 0.135 |
| F| 0.44 | 0.340 | 0.100 |
| M| 0.5 | 0.400 | 0.13 |
| F| 0.565 | 0.44 | 0.155 |
| I| 0.355 | 0.280 | 0.085 |
| F| 0.550 | 0.415 | 0.135 |
| @| 0.475 | 0.37 | 0.125 |
| \\. |
| </pre> |
| |
| -# Create new table with dummy-coded indicator variables |
| <pre class="example"> |
| drop table if exists abalone_out; |
| select madlib.create_indicator_variables ('abalone', 'abalone_out', 'sex'); |
| select * from abalone_out; |
| </pre> |
| <pre class="result"> |
| sex | length | diameter | height | sex_F | sex_I | sex_M |
| -----+--------+----------+--------+--------+--------+------- |
| F | 0.53 | 0.42 | 0.135 | 1 | 0 | 0 |
| F | 0.53 | 0.415 | 0.15 | 1 | 0 | 0 |
| F | 0.545 | 0.425 | 0.125 | 1 | 0 | 0 |
| F | 0.55 | 0.44 | 0.15 | 1 | 0 | 0 |
| F | 0.525 | 0.38 | 0.14 | 1 | 0 | 0 |
| F | 0.535 | 0.405 | 0.145 | 1 | 0 | 0 |
| F | 0.47 | 0.355 | 0.1 | 1 | 0 | 0 |
| F | 0.44 | 0.34 | 0.1 | 1 | 0 | 0 |
| F | 0.565 | 0.44 | 0.155 | 1 | 0 | 0 |
| F | 0.55 | 0.415 | 0.135 | 1 | 0 | 0 |
| M | 0.455 | 0.365 | 0.095 | 0 | 0 | 1 |
| M | 0.35 | 0.265 | 0.09 | 0 | 0 | 0 |
| M | 0.44 | 0.365 | 0.125 | 0 | 0 | 0 |
| I | 0.33 | 0.255 | 0.08 | 0 | 1 | 0 |
| I | 0.425 | 0.3 | 0.095 | 0 | 1 | 0 |
| M | 0.475 | 0.37 | 0.125 | 0 | 0 | 0 |
| M | 0.43 | 0.358 | 0.11 | 0 | 0 | 0 |
| M | 0.49 | 0.38 | 0.135 | 0 | 0 | 0 |
| M | 0.5 | 0.4 | 0.13 | 0 | 0 | 0 |
| I | 0.355 | 0.28 | 0.085 | 0 | 1 | 0 |
| NULL | 0.55 | 0.415 | 0.135 | NULL | NULL | NULL |
| </pre> |
| |
| -# Create indicator variable for 'NULL' value (note the additional column '"sex_NULL"') |
| <pre class="example"> |
| drop table if exists abalone_out; |
| select madlib.create_indicator_variables'abalone', 'abalone_out', 'sex', True); |
| select * from abalone_out; |
| </pre> |
| <pre class="result"> |
| sex | length | diameter | height | sex_F | sex_I | sex_M | sex_NULL |
| ------+--------+----------+--------+--------+--------+-------+------- |
| F | 0.53 | 0.42 | 0.135 | 1 | 0 | 0 | 0 |
| F | 0.53 | 0.415 | 0.15 | 1 | 0 | 0 | 0 |
| F | 0.545 | 0.425 | 0.125 | 1 | 0 | 0 | 0 |
| F | 0.55 | 0.44 | 0.15 | 1 | 0 | 0 | 0 |
| F | 0.525 | 0.38 | 0.14 | 1 | 0 | 0 | 0 |
| F | 0.535 | 0.405 | 0.145 | 1 | 0 | 0 | 0 |
| F | 0.47 | 0.355 | 0.1 | 1 | 0 | 0 | 0 |
| F | 0.44 | 0.34 | 0.1 | 1 | 0 | 0 | 0 |
| F | 0.565 | 0.44 | 0.155 | 1 | 0 | 0 | 0 |
| F | 0.55 | 0.415 | 0.135 | 1 | 0 | 0 | 0 |
| M | 0.455 | 0.365 | 0.095 | 0 | 0 | 1 | 0 |
| M | 0.35 | 0.265 | 0.09 | 0 | 0 | 0 | 0 |
| M | 0.44 | 0.365 | 0.125 | 0 | 0 | 0 | 0 |
| I | 0.33 | 0.255 | 0.08 | 0 | 1 | 0 | 0 |
| I | 0.425 | 0.3 | 0.095 | 0 | 1 | 0 | 0 |
| M | 0.475 | 0.37 | 0.125 | 0 | 0 | 0 | 0 |
| M | 0.43 | 0.358 | 0.11 | 0 | 0 | 0 | 0 |
| M | 0.49 | 0.38 | 0.135 | 0 | 0 | 0 | 0 |
| M | 0.5 | 0.4 | 0.13 | 0 | 0 | 0 | 0 |
| I | 0.355 | 0.28 | 0.085 | 0 | 1 | 0 | 0 |
| NULL | 0.55 | 0.415 | 0.135 | 0 | 0 | 0 | 1 |
| </pre> |
| */ |
| |
| ------------------------------------------------------------------------- |
| |
| /** |
| * @brief Create new table containing dummy coded variables for categorical variables |
| * |
| * @param source_table Name of table containing categorical variable |
| * @param out_table Name of table to output dummy variables |
| * @param categorical_cols Comma-separated list of column names to dummy code |
| * @param keep_null Boolean to determine the behavior for rows with NULL value |
| * @param distributed_by Comma-separated list of column names to use for distribution of output |
| * |
| * @return Void |
| * |
| */ |
| |
| -- We don't create the below table for PostgreSQL since it does contain a |
| -- distribution policy. |
| m4_changequote(<!,!>) |
| m4_ifdef(<!__POSTGRESQL__!>, <!!>, <! |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.create_indicator_variables( |
| source_table TEXT, |
| out_table TEXT, |
| categorical_cols TEXT, |
| keep_null BOOLEAN, |
| distributed_by TEXT |
| ) RETURNS VOID AS $$ |
| PythonFunction(utilities, create_indicators, create_indicator_variables) |
| $$ LANGUAGE plpythonu |
| m4_ifdef(<!__HAS_FUNCTION_PROPERTIES__!>, <!MODIFIES SQL DATA!>, <!!>); |
| !>) |
| m4_changequote(<!`!>, <!'!>) |
| |
| /** |
| * @brief Create new table containing dummy coded variables for categorical variables |
| * |
| * @param source_table Name of table containing categorical variable |
| * @param out_table Name of table to output dummy variables |
| * @param categorical_cols Comma-separated list of column names to dummy code |
| * @param keep_null Boolean to determine the behavior for rows with NULL value |
| * |
| * @return Void |
| * |
| */ |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.create_indicator_variables( |
| source_table TEXT, |
| out_table TEXT, |
| categorical_cols TEXT, |
| keep_null BOOLEAN |
| ) RETURNS VOID AS $$ |
| PythonFunction(utilities, create_indicators, create_indicator_variables) |
| $$ LANGUAGE plpythonu |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `'); |
| |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.create_indicator_variables( |
| source_table TEXT, |
| out_table TEXT, |
| categorical_cols TEXT |
| ) RETURNS VOID AS $$ |
| SELECT MADLIB_SCHEMA.create_indicator_variables($1, $2, $3, False) |
| $$ LANGUAGE SQL |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `'); |
| |
| -- Online help |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.create_indicator_variables( |
| message VARCHAR |
| ) RETURNS VARCHAR AS $$ |
| PythonFunction(utilities, create_indicators, indicator_variables_help) |
| $$ LANGUAGE plpythonu IMMUTABLE |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `CONTAINS SQL', `'); |
| |
| -------------------------------------------------------------------------------- |
| |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.create_indicator_variables() |
| RETURNS VARCHAR AS $$ |
| SELECT MADLIB_SCHEMA.create_indicator_variables(''); |
| $$ LANGUAGE sql IMMUTABLE |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `CONTAINS SQL', `'); |
| -------------------------------------------------------------------------------- |