| /* ----------------------------------------------------------------------- *//** |
| * |
| * Licensed to the Apache Software Foundation (ASF) under one |
| * or more contributor license agreements. See the NOTICE file |
| * distributed with this work for additional information |
| * regarding copyright ownership. The ASF licenses this file |
| * to you under the Apache License, Version 2.0 (the |
| * "License"); you may not use this file except in compliance |
| * with the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, |
| * software distributed under the License is distributed on an |
| * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| * KIND, either express or implied. See the License for the |
| * specific language governing permissions and limitations |
| * under the License. |
| * |
| * |
| * @file encode_categorical.sql_in |
| * |
| * @brief SQL functions for encoding categorical variables to numerical values |
| * @date Dec 2016 |
| * |
| * @sa Encodes categorical variables to numerical values |
| * |
| *//* ----------------------------------------------------------------------- */ |
| |
| |
| m4_include(`SQLCommon.m4') |
| |
| |
| |
| /** |
| @addtogroup grp_encode_categorical |
| |
| <div class="toc"><b>Contents</b> |
| <ul> |
| <li><a href="#categorical">Coding Systems for Categorical Variables</a></li> |
| <li><a href="#examples">Examples</a></li> |
| <li><a href="#literature">Literature</a></li> |
| </ul> |
| </div> |
| |
| @brief Functions to encode categorical variables to prepare data for |
| input into predictive algorithms. |
| |
| @anchor categorical |
| @par Coding Systems for Categorical Variables |
| Categorical variables [1] require special attention in regression analysis because, |
| unlike dichotomous or continuous variables, they cannot be entered into the |
| regression equation just as they are. For example, if you have a variable |
| called race that is coded with 1=Hispanic, 2=Asian, 3=Black, 4=White, then |
| entering race in your regression will look at the linear effect of the race variable, which |
| is probably not what you intended. Instead, categorical variables like this need |
| to be coded into a series of indicator variables which can then be entered |
| into the regression model. There are a variety of coding systems that can |
| be used for coding categorical variables, including |
| one-hot, dummy, effects, orthogonal, and Helmert. |
| |
| We currently support one-hot and dummy coding techniques. |
| |
| Dummy coding is used when a researcher wants to compare other groups of the |
| predictor variable with one specific group of the predictor variable. |
| Often, the specific group to compare with is called the reference group. |
| |
| One-hot encoding is similar to dummy coding except it builds indicator (0/1) |
| columns (cast as numeric) for each value of each category. |
| Only one of these columns could take on the value 1 for each row (data point). |
| There is no reference category for this function. |
| |
| <pre class="syntax"> |
| encode_categorical_variables ( |
| source_table, |
| output_table, |
| categorical_cols, |
| categorical_cols_to_exclude, -- Optional |
| row_id, -- Optional |
| top, -- Optional |
| value_to_drop, -- Optional |
| encode_null, -- Optional |
| output_type, -- Optional |
| output_dictionary, -- Optional |
| distributed_by -- Optional |
| ) |
| </pre> |
| \b Arguments |
| <dl class="arglist"> |
| <dt>source_table</dt> |
| <dd>VARCHAR. Name of the table containing the source categorical data to encode.</dd> |
| |
| <dt>output_table</dt> |
| <dd>VARCHAR. Name of the result table. |
| |
| @note |
| If there are index columns in the 'source_table' specified by the parameter |
| 'row_id' (see below), then the output table will contain only the index |
| columns 'row_id' and the encoded columns. If the parameter 'row_id' is |
| not specified, then all columns from the 'source_table', with the |
| exception of the original columns that have been encoded, will be |
| included in the 'output_table'. |
| </dd> |
| |
| <dt>categorical_cols</dt> |
| <dd>VARCHAR. Comma-separated string of column names of categorical variables to encode. |
| Can also be '*' meaning all columns are to be encoded, except the ones |
| specified in 'categorical_cols_to_exclude' and 'row_id'. Please note that all Boolean, |
| integer and text columns are considered categorical columns and will be |
| encoded when ‘*’ is specified for this argument. |
| </dd> |
| |
| <dt>categorical_cols_to_exclude (optional)</dt> |
| <dd>VARCHAR. Comma-separated string of column names to exclude from the |
| categorical variables to encode. Applicable only if 'categorical_cols' = '*'. |
| </dd> |
| |
| <dt>row_id (optional)</dt> |
| <dd>VARCHAR. Comma-separated column name(s) corresponding to the primary key(s) of the |
| source table. This parameter determines the format of the 'output_table' as |
| described above. If 'categorical_cols' = '*', these columns will be |
| excluded from encoding (but will be included in the output table). |
| |
| @note |
| If you want to see both the raw categorical variable and its encoded form |
| in the output_table, then include the categorical variable in the 'row_id' parameter. |
| However, this will not work if you specify '*' for the parameter 'categorical_cols', |
| because in this case 'row_id' columns will not be encoded at all. |
| </dd> |
| |
| <dt>top (optional)</dt> |
| <dd>VARCHAR. default: NULL. If integer, encodes the top n values |
| by frequency. If float in the range (0.0, 1.0), encodes the specified fraction |
| of values by frequency (e.g., 0.1 means top 10%). Can be specified as a global |
| for all categorical columns, or as a dictionary with separate 'top' values for each |
| categorical variable. Set to NULL to encode all levels (values) for all categorical |
| columns. |
| </dd> |
| |
| <dt>value_to_drop (optional)</dt> |
| <dd>VARCHAR. Default: NULL. |
| |
| - For dummy coding, indicate the desired value (reference) to drop for each |
| categorical variable. Can be specified as a global for all categorical columns, |
| or a comma-separated string containing items of the form 'name=value', where |
| 'name' is the column name and 'value' is the reference value to be dropped. |
| |
| - Set to NULL for one-hot encoding (default) |
| |
| @note |
| If you specify both 'value_to_drop' and 'top' parameters, |
| the 'value_to_drop' will be applied first (takes priority), |
| then 'top' will be applied to the remaining values. |
| </dd> |
| |
| <dt>encode_null (optional)</dt> |
| <dd>BOOLEAN. default: FALSE. Whether NULL should be treated as one of the |
| values of the categorical variable. If TRUE, then an indicator |
| variable is created corresponding to the NULL value. If FALSE, then |
| all encoded values for that variable will be set to 0. |
| </dd> |
| |
| <dt>output_type (optional)</dt> |
| <dd>VARCHAR. default: 'column'. This parameter controls the output format |
| of the indicator variables. If 'column', a column is created for each indicator |
| variable. PostgreSQL limits the number of columns in a table. |
| If the total number of indicator columns exceeds the limit, then make this |
| parameter either 'array' to combine the indicator columns into an array or |
| 'svec' to cast the array output to <em>'madlib.svec'</em> type. |
| |
| Since the array output for any single tuple would be sparse |
| (only one non-zero entry for each categorical column), the 'svec' output would |
| be most efficient for storage. The 'array' output is useful if the array is |
| used for post-processing, including concatenating with other non-categorical |
| features. |
| |
| The order of the array is the same as specified in 'categorical_cols'. |
| A dictionary will be created when 'output_type' is 'array' or 'svec' to |
| define an index into the array. The dictionary table will be given the name |
| of the 'output_table' appended by '_dictionary'. |
| </dd> |
| |
| <dt>output_dictionary (optional)</dt> |
| <dd>BOOLEAN. default: FALSE. |
| This parameter is used to handle auto-generated column names that exceed the |
| PostgreSQL limit of 63 bytes. |
| |
| - If TRUE, column names will include numerical IDs and will create a dictionary |
| table called 'output_table_dictionary' |
| ('output_table' appended with '_dictionary'). |
| |
| - If FALSE, will auto-generate column names in the |
| usual way unless the limit of 63 bytes will be exceeded. In this case, a |
| dictionary output file will be created and a message given to the user. |
| </dd> |
| |
| <dt>distributed_by (optional)</dt> |
| <dd>VARCHAR. default: NULL. Columns to use for the distribution policy of |
| the output table. When NULL, either 'row_id' is used as distribution policy |
| (when provided), or else the distribution policy of 'source_table' will be used. |
| This argument does not apply to PostgreSQL platforms. |
| |
| - NULL: By default, the distribution policy of the source_table will be used. |
| - Comma-separated column names: Column(s) to be used for the distribution key. |
| - RANDOMLY: Use random distribution policy (only if there does not exist a column named 'randomly'). |
| |
| </dd> |
| </dl> |
| |
| @anchor examples |
| @examp |
| |
| -# Use a subset of the abalone dataset [2]: |
| <pre class="example"> |
| DROP TABLE IF EXISTS abalone; |
| CREATE TABLE abalone ( |
| id serial, |
| sex character varying, |
| length double precision, |
| diameter double precision, |
| height double precision, |
| rings int |
| ); |
| INSERT INTO abalone (sex, length, diameter, height, rings) VALUES |
| ('M', 0.455, 0.365, 0.095, 15), |
| ('M', 0.35, 0.265, 0.09, 7), |
| ('F', 0.53, 0.42, 0.135, 9), |
| ('M', 0.44, 0.365, 0.125, 10), |
| ('I', 0.33, 0.255, 0.08, 7), |
| ('I', 0.425, 0.3, 0.095, 8), |
| ('F', 0.53, 0.415, 0.15, 20), |
| ('F', 0.545, 0.425, 0.125, 16), |
| ('M', 0.475, 0.37, 0.125, 9), |
| (NULL, 0.55, 0.44, 0.15, 19), |
| ('F', 0.525, 0.38, 0.14, 14), |
| ('M', 0.43, 0.35, 0.11, 10), |
| ('M', 0.49, 0.38, 0.135, 11), |
| ('F', 0.535, 0.405, 0.145, 10), |
| ('F', 0.47, 0.355, 0.1, 10), |
| ('M', 0.5, 0.4, 0.13, 12), |
| ('I', 0.355, 0.28, 0.085, 7), |
| ('F', 0.44, 0.34, 0.1, 10), |
| ('M', 0.365, 0.295, 0.08, 7), |
| (NULL, 0.45, 0.32, 0.1, 9); |
| </pre> |
| |
| -# Create new table with one-hot encoding. |
| The column 'sex' is replaced by three columns encoding the |
| values 'F', 'M' and 'I'. Null values are not encoded by default: |
| <pre class="example"> |
| DROP TABLE IF EXISTS abalone_out, abalone_out_dictionary; |
| SELECT madlib.encode_categorical_variables ( |
| 'abalone', -- Source table |
| 'abalone_out', -- Output table |
| 'sex' -- Categorical columns |
| ); |
| SELECT * FROM abalone_out ORDER BY id; |
| </pre> |
| <pre class="result"> |
| id | length | diameter | height | rings | sex_F | sex_I | sex_M |
| ----+--------+----------+--------+-------+-------+-------+------- |
| 1 | 0.455 | 0.365 | 0.095 | 15 | 0 | 0 | 1 |
| 2 | 0.35 | 0.265 | 0.09 | 7 | 0 | 0 | 1 |
| 3 | 0.53 | 0.42 | 0.135 | 9 | 1 | 0 | 0 |
| 4 | 0.44 | 0.365 | 0.125 | 10 | 0 | 0 | 1 |
| 5 | 0.33 | 0.255 | 0.08 | 7 | 0 | 1 | 0 |
| 6 | 0.425 | 0.3 | 0.095 | 8 | 0 | 1 | 0 |
| 7 | 0.53 | 0.415 | 0.15 | 20 | 1 | 0 | 0 |
| 8 | 0.545 | 0.425 | 0.125 | 16 | 1 | 0 | 0 |
| 9 | 0.475 | 0.37 | 0.125 | 9 | 0 | 0 | 1 |
| 10 | 0.55 | 0.44 | 0.15 | 19 | 0 | 0 | 0 |
| 11 | 0.525 | 0.38 | 0.14 | 14 | 1 | 0 | 0 |
| 12 | 0.43 | 0.35 | 0.11 | 10 | 0 | 0 | 1 |
| 13 | 0.49 | 0.38 | 0.135 | 11 | 0 | 0 | 1 |
| 14 | 0.535 | 0.405 | 0.145 | 10 | 1 | 0 | 0 |
| 15 | 0.47 | 0.355 | 0.1 | 10 | 1 | 0 | 0 |
| 16 | 0.5 | 0.4 | 0.13 | 12 | 0 | 0 | 1 |
| 17 | 0.355 | 0.28 | 0.085 | 7 | 0 | 1 | 0 |
| 18 | 0.44 | 0.34 | 0.1 | 10 | 1 | 0 | 0 |
| 19 | 0.365 | 0.295 | 0.08 | 7 | 0 | 0 | 1 |
| 20 | 0.45 | 0.32 | 0.1 | 9 | 0 | 0 | 0 |
| (20 rows) |
| </pre> |
| |
| -# Now include NULL values in encoding (note the additional column 'sex_null'): |
| <pre class="example"> |
| DROP TABLE IF EXISTS abalone_out, abalone_out_dictionary; |
| SELECT madlib.encode_categorical_variables ( |
| 'abalone', -- Source table |
| 'abalone_out', -- Output table |
| 'sex', -- Categorical columns |
| NULL, -- Categorical columns to exclude |
| NULL, -- Index columns |
| NULL, -- Top values |
| NULL, -- Value to drop for dummy encoding |
| TRUE -- Encode nulls |
| ); |
| SELECT * FROM abalone_out ORDER BY id; |
| </pre> |
| <pre class="result"> |
| id | length | diameter | height | rings | sex_F | sex_I | sex_M | sex_null |
| ----+--------+----------+--------+-------+-------+-------+-------+---------- |
| 1 | 0.455 | 0.365 | 0.095 | 15 | 0 | 0 | 1 | 0 |
| 2 | 0.35 | 0.265 | 0.09 | 7 | 0 | 0 | 1 | 0 |
| 3 | 0.53 | 0.42 | 0.135 | 9 | 1 | 0 | 0 | 0 |
| 4 | 0.44 | 0.365 | 0.125 | 10 | 0 | 0 | 1 | 0 |
| 5 | 0.33 | 0.255 | 0.08 | 7 | 0 | 1 | 0 | 0 |
| 6 | 0.425 | 0.3 | 0.095 | 8 | 0 | 1 | 0 | 0 |
| 7 | 0.53 | 0.415 | 0.15 | 20 | 1 | 0 | 0 | 0 |
| 8 | 0.545 | 0.425 | 0.125 | 16 | 1 | 0 | 0 | 0 |
| 9 | 0.475 | 0.37 | 0.125 | 9 | 0 | 0 | 1 | 0 |
| 10 | 0.55 | 0.44 | 0.15 | 19 | 0 | 0 | 0 | 1 |
| 11 | 0.525 | 0.38 | 0.14 | 14 | 1 | 0 | 0 | 0 |
| 12 | 0.43 | 0.35 | 0.11 | 10 | 0 | 0 | 1 | 0 |
| 13 | 0.49 | 0.38 | 0.135 | 11 | 0 | 0 | 1 | 0 |
| 14 | 0.535 | 0.405 | 0.145 | 10 | 1 | 0 | 0 | 0 |
| 15 | 0.47 | 0.355 | 0.1 | 10 | 1 | 0 | 0 | 0 |
| 16 | 0.5 | 0.4 | 0.13 | 12 | 0 | 0 | 1 | 0 |
| 17 | 0.355 | 0.28 | 0.085 | 7 | 0 | 1 | 0 | 0 |
| 18 | 0.44 | 0.34 | 0.1 | 10 | 1 | 0 | 0 | 0 |
| 19 | 0.365 | 0.295 | 0.08 | 7 | 0 | 0 | 1 | 0 |
| 20 | 0.45 | 0.32 | 0.1 | 9 | 0 | 0 | 0 | 1 |
| (20 rows) |
| </pre> |
| |
| -# Encode all categorical variables in the source table. Also, specify the |
| column 'id' as the index (primary key) which changes the output table |
| to include only the index and the encoded variables: |
| <pre class="example"> |
| DROP TABLE IF EXISTS abalone_out, abalone_out_dictionary; |
| SELECT madlib.encode_categorical_variables ( |
| 'abalone', -- Source table |
| 'abalone_out', -- Output table |
| '*', -- Categorical columns |
| NULL, -- Categorical columns to exclude |
| 'id' -- Index columns |
| ); |
| SELECT * FROM abalone_out ORDER BY id; |
| </pre> |
| <pre class="result"> |
| id | sex_F | sex_I | sex_M | rings_7 | rings_8 | rings_9 | rings_10 | rings_11 | rings_12 | rings_14 | rings_15 | rings_16 | rings_19 | rings_20 |
| ----+-------+-------+-------+---------+---------+---------+----------+----------+----------+----------+----------+----------+----------+---------- |
| 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 |
| 2 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 3 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 4 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 5 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 6 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 7 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
| 8 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 |
| 9 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 10 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 |
| 11 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 |
| 12 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 13 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 |
| 14 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 15 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 16 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 |
| 17 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 18 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 19 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 20 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| (20 rows) |
| </pre> |
| |
| -# Now let's encode only the top values and group others into a |
| miscellaneous bucket column. Top values can be global across all |
| columns or specified by column. As an example of the latter, here |
| are the top 2 'sex' values and the top 50% of 'rings' values: |
| <pre class="example"> |
| DROP TABLE IF EXISTS abalone_out, abalone_out_dictionary; |
| SELECT madlib.encode_categorical_variables ( |
| 'abalone', -- Source table |
| 'abalone_out', -- Output table |
| '*', -- Categorical columns |
| NULL, -- Categorical columns to exclude |
| 'id', -- Index columns |
| 'sex=2, rings=0.5' -- Top values |
| ); |
| SELECT * FROM abalone_out ORDER BY id; |
| </pre> |
| <pre class="result"> |
| id | sex_M | sex_F | sex__misc__ | rings_10 | rings_7 | rings_9 | rings__misc__ |
| ----+-------+-------+-------------+----------+---------+---------+--------------- |
| 1 | 1 | 0 | 0 | 0 | 0 | 0 | 1 |
| 2 | 1 | 0 | 0 | 0 | 1 | 0 | 0 |
| 3 | 0 | 1 | 0 | 0 | 0 | 1 | 0 |
| 4 | 1 | 0 | 0 | 1 | 0 | 0 | 0 |
| 5 | 0 | 0 | 1 | 0 | 1 | 0 | 0 |
| 6 | 0 | 0 | 1 | 0 | 0 | 0 | 1 |
| 7 | 0 | 1 | 0 | 0 | 0 | 0 | 1 |
| 8 | 0 | 1 | 0 | 0 | 0 | 0 | 1 |
| 9 | 1 | 0 | 0 | 0 | 0 | 1 | 0 |
| 10 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
| 11 | 0 | 1 | 0 | 0 | 0 | 0 | 1 |
| 12 | 1 | 0 | 0 | 1 | 0 | 0 | 0 |
| 13 | 1 | 0 | 0 | 0 | 0 | 0 | 1 |
| 14 | 0 | 1 | 0 | 1 | 0 | 0 | 0 |
| 15 | 0 | 1 | 0 | 1 | 0 | 0 | 0 |
| 16 | 1 | 0 | 0 | 0 | 0 | 0 | 1 |
| 17 | 0 | 0 | 1 | 0 | 1 | 0 | 0 |
| 18 | 0 | 1 | 0 | 1 | 0 | 0 | 0 |
| 19 | 1 | 0 | 0 | 0 | 1 | 0 | 0 |
| 20 | 0 | 0 | 0 | 0 | 0 | 1 | 0 |
| (20 rows) |
| </pre> |
| |
| -# If you want to see both the raw categorical variable and its |
| encoded form in the output_table, then include the categorical |
| variable(s) in the index parameter. (Remember that this will |
| not work if you specify '*' for the parameter 'categorical_cols', |
| because in this case 'row_id' columns will not be encoded at all.) |
| <pre class="example"> |
| DROP TABLE IF EXISTS abalone_out, abalone_out_dictionary; |
| SELECT madlib.encode_categorical_variables ( |
| 'abalone', -- Source table |
| 'abalone_out', -- Output table |
| 'sex, rings', -- Categorical columns |
| NULL, -- Categorical columns to exclude |
| 'id, sex, rings' -- Index columns |
| ); |
| SELECT * FROM abalone_out ORDER BY id; |
| </pre> |
| <pre class="result"> |
| id | sex | rings | sex_F | sex_I | sex_M | rings_7 | rings_8 | rings_9 | rings_10 | rings_11 | rings_12 | rings_14 | rings_15 | rings_16 | rings_19 | rings_20 |
| ----+-----+-------+-------+-------+-------+---------+---------+---------+----------+----------+----------+----------+----------+----------+----------+---------- |
| 1 | M | 15 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 |
| 2 | M | 7 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 3 | F | 9 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 4 | M | 10 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 5 | I | 7 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 6 | I | 8 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 7 | F | 20 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
| 8 | F | 16 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 |
| 9 | M | 9 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 10 | | 19 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 |
| 11 | F | 14 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 |
| 12 | M | 10 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 13 | M | 11 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 |
| 14 | F | 10 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 15 | F | 10 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 16 | M | 12 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 |
| 17 | I | 7 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 18 | F | 10 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 19 | M | 7 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 20 | | 9 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| (20 rows) |
| </pre> |
| |
| -# For dummy encoding, let's make the 'I' value from the 'sex' variable as the reference. |
| Here we use the 'value_to_drop' parameter: |
| <pre class="example"> |
| DROP TABLE IF EXISTS abalone_out, abalone_out_dictionary; |
| SELECT madlib.encode_categorical_variables ( |
| 'abalone', -- Source table |
| 'abalone_out', -- Output table |
| '*', -- Categorical columns |
| 'rings', -- Categorical columns to exclude |
| 'id', -- Index columns |
| NULL, -- Top value |
| 'sex=I' -- Value to drop for dummy encoding |
| ); |
| SELECT * FROM abalone_out ORDER BY id; |
| </pre> |
| <pre class="result"> |
| id | sex_F | sex_M |
| ----+-------+------- |
| 1 | 0 | 1 |
| 2 | 0 | 1 |
| 3 | 1 | 0 |
| 4 | 0 | 1 |
| 5 | 0 | 0 |
| 6 | 0 | 0 |
| 7 | 1 | 0 |
| 8 | 1 | 0 |
| 9 | 0 | 1 |
| 10 | 0 | 0 |
| 11 | 1 | 0 |
| 12 | 0 | 1 |
| 13 | 0 | 1 |
| 14 | 1 | 0 |
| 15 | 1 | 0 |
| 16 | 0 | 1 |
| 17 | 0 | 0 |
| 18 | 1 | 0 |
| 19 | 0 | 1 |
| 20 | 0 | 0 |
| (20 rows) |
| </pre> |
| |
| -# Create an array output for the two categorical variables in the source table: |
| <pre class="example"> |
| DROP TABLE IF EXISTS abalone_out, abalone_out_dictionary; |
| SELECT madlib.encode_categorical_variables ( |
| 'abalone', -- Source table |
| 'abalone_out', -- Output table |
| '*', -- Categorical columns |
| NULL, -- Categorical columns to exclude |
| 'id', -- Index columns |
| NULL, -- Top values |
| NULL, -- Value to drop for dummy encoding |
| NULL, -- Encode nulls |
| 'array' -- Array output type |
| ); |
| SELECT * FROM abalone_out ORDER BY id; |
| </pre> |
| <pre class="result"> |
| id | __encoded_variables__ |
| ----+------------------------------- |
| 1 | {0,0,1,0,0,0,0,0,0,0,1,0,0,0} |
| 2 | {0,0,1,1,0,0,0,0,0,0,0,0,0,0} |
| 3 | {1,0,0,0,0,1,0,0,0,0,0,0,0,0} |
| 4 | {0,0,1,0,0,0,1,0,0,0,0,0,0,0} |
| 5 | {0,1,0,1,0,0,0,0,0,0,0,0,0,0} |
| 6 | {0,1,0,0,1,0,0,0,0,0,0,0,0,0} |
| 7 | {1,0,0,0,0,0,0,0,0,0,0,0,0,1} |
| 8 | {1,0,0,0,0,0,0,0,0,0,0,1,0,0} |
| 9 | {0,0,1,0,0,1,0,0,0,0,0,0,0,0} |
| 10 | {0,0,0,0,0,0,0,0,0,0,0,0,1,0} |
| 11 | {1,0,0,0,0,0,0,0,0,1,0,0,0,0} |
| 12 | {0,0,1,0,0,0,1,0,0,0,0,0,0,0} |
| 13 | {0,0,1,0,0,0,0,1,0,0,0,0,0,0} |
| 14 | {1,0,0,0,0,0,1,0,0,0,0,0,0,0} |
| 15 | {1,0,0,0,0,0,1,0,0,0,0,0,0,0} |
| 16 | {0,0,1,0,0,0,0,0,1,0,0,0,0,0} |
| 17 | {0,1,0,1,0,0,0,0,0,0,0,0,0,0} |
| 18 | {1,0,0,0,0,0,1,0,0,0,0,0,0,0} |
| 19 | {0,0,1,1,0,0,0,0,0,0,0,0,0,0} |
| 20 | {0,0,0,0,0,1,0,0,0,0,0,0,0,0} |
| (20 rows) |
| </pre> |
| View the dictionary table that gives the index into the array: |
| <pre class="example"> |
| SELECT * FROM abalone_out_dictionary; |
| </pre> |
| <pre class="result"> |
| encoded_column_name | index | variable | value |
| -----------------------+-------+----------+------- |
| __encoded_variables__ | 1 | sex | F |
| __encoded_variables__ | 2 | sex | I |
| __encoded_variables__ | 3 | sex | M |
| __encoded_variables__ | 4 | rings | 7 |
| __encoded_variables__ | 5 | rings | 8 |
| __encoded_variables__ | 6 | rings | 9 |
| __encoded_variables__ | 7 | rings | 10 |
| __encoded_variables__ | 8 | rings | 11 |
| __encoded_variables__ | 9 | rings | 12 |
| __encoded_variables__ | 10 | rings | 14 |
| __encoded_variables__ | 11 | rings | 15 |
| __encoded_variables__ | 12 | rings | 16 |
| __encoded_variables__ | 13 | rings | 19 |
| __encoded_variables__ | 14 | rings | 20 |
| (14 rows) |
| </pre> |
| -# Create a dictionary output: |
| <pre class="example"> |
| DROP TABLE IF EXISTS abalone_out, abalone_out_dictionary; |
| SELECT madlib.encode_categorical_variables ( |
| 'abalone', -- Source table |
| 'abalone_out', -- Output table |
| '*', -- Categorical columns |
| NULL, -- Categorical columns to exclude |
| 'id', -- Index columns |
| NULL, -- Top values |
| NULL, -- Value to drop for dummy encoding |
| NULL, -- Encode nulls |
| NULL, -- Output type |
| TRUE -- Dictionary output |
| ); |
| SELECT * FROM abalone_out ORDER BY id; |
| </pre> |
| <pre class="result"> |
| id | sex_1 | sex_2 | sex_3 | rings_1 | rings_2 | rings_3 | rings_4 | rings_5 | rings_6 | rings_7 | rings_8 | rings_9 | rings_10 | rings_11 |
| ----+-------+-------+-------+---------+---------+---------+---------+---------+---------+---------+---------+---------+----------+---------- |
| 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 |
| 2 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 3 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 4 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 5 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 6 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 7 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
| 8 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 |
| 9 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 10 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 |
| 11 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 |
| 12 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 13 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 |
| 14 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 15 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 16 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 |
| 17 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 18 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 19 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 20 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| (20 rows) |
| </pre> |
| View the dictionary table that defines the numerical columns in the output table: |
| <pre class="example"> |
| SELECT * FROM abalone_out_dictionary ORDER BY encoded_column_name; |
| </pre> |
| <pre class="result"> |
| encoded_column_name | index | variable | value |
| ---------------------+-------+----------+------- |
| "rings_1" | 1 | rings | 7 |
| "rings_10" | 10 | rings | 19 |
| "rings_11" | 11 | rings | 20 |
| "rings_2" | 2 | rings | 8 |
| "rings_3" | 3 | rings | 9 |
| "rings_4" | 4 | rings | 10 |
| "rings_5" | 5 | rings | 11 |
| "rings_6" | 6 | rings | 12 |
| "rings_7" | 7 | rings | 14 |
| "rings_8" | 8 | rings | 15 |
| "rings_9" | 9 | rings | 16 |
| "sex_1" | 1 | sex | F |
| "sex_2" | 2 | sex | I |
| "sex_3" | 3 | sex | M |
| (14 rows) |
| </pre> |
| |
| -# We can chose from various distribution policies of the output table, for examply RANDOMLY: |
| <pre class="example"> |
| DROP TABLE IF EXISTS abalone_out, abalone_out_dictionary; |
| SELECT madlib.encode_categorical_variables ( |
| 'abalone', -- Source table |
| 'abalone_out', -- Output table |
| '*', -- Categorical columns |
| NULL, -- Categorical columns to exclude |
| 'id', -- Index columns |
| NULL, -- Top values |
| NULL, -- Value to drop for dummy encoding |
| NULL, -- Encode nulls |
| NULL, -- Output type |
| NULL, -- Dictionary output |
| 'RANDOMLY' -- Distribution policy |
| ); |
| SELECT * FROM abalone_out ORDER BY id; |
| </pre> |
| <pre class="result"> |
| id | sex_F | sex_I | sex_M | rings_10 | rings_11 | rings_12 | rings_14 | rings_15 | rings_16 | rings_19 | rings_20 | rings_7 | rings_8 | rings_9 |
| ----+-------+-------+-------+----------+----------+----------+----------+----------+----------+----------+----------+---------+---------+--------- |
| 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 |
| 2 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 |
| 3 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
| 4 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 5 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 |
| 6 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 |
| 7 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 |
| 8 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 |
| 9 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
| 10 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 |
| 11 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 12 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 13 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 14 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 15 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 16 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 17 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 |
| 18 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 19 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 |
| 20 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
| (20 rows) |
| </pre> |
| |
| -# If you have a reason to encode FLOAT variables, you can cast them as TEXT |
| in the following way within the function call: |
| <pre class="example"> |
| DROP TABLE IF EXISTS abalone_out, abalone_out_dictionary; |
| SELECT madlib.encode_categorical_variables ( |
| 'abalone', -- Source table |
| 'abalone_out', -- Output table |
| 'height::TEXT' -- Categorical columns |
| ); |
| SELECT * FROM abalone_out ORDER BY id; |
| </pre> |
| <pre class="result"> |
| id | sex | length | diameter | height | rings | height::TEXT_0.08 | height::TEXT_0.085 | height::TEXT_0.09 | height::TEXT_0.095 | height::TEXT_0.1 | height::TEXT_0.11 | height::TEXT_0.125 | height::TEXT_0.13 | height::TEXT_0.135 | height::TEXT_0.14 | height::TEXT_0.145 | height::TEXT_0.15 |
| ----+-----+--------+----------+--------+-------+-------------------+--------------------+-------------------+--------------------+------------------+-------------------+--------------------+-------------------+--------------------+-------------------+--------------------+------------------- |
| 1 | M | 0.455 | 0.365 | 0.095 | 15 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 2 | M | 0.35 | 0.265 | 0.09 | 7 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 3 | F | 0.53 | 0.42 | 0.135 | 9 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 |
| 4 | M | 0.44 | 0.365 | 0.125 | 10 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 |
| 5 | I | 0.33 | 0.255 | 0.08 | 7 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 6 | I | 0.425 | 0.3 | 0.095 | 8 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 7 | F | 0.53 | 0.415 | 0.15 | 20 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
| 8 | F | 0.545 | 0.425 | 0.125 | 16 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 |
| 9 | M | 0.475 | 0.37 | 0.125 | 9 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 |
| 10 | | 0.55 | 0.44 | 0.15 | 19 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
| 11 | F | 0.525 | 0.38 | 0.14 | 14 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 |
| 12 | M | 0.43 | 0.35 | 0.11 | 10 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 |
| 13 | M | 0.49 | 0.38 | 0.135 | 11 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 |
| 14 | F | 0.535 | 0.405 | 0.145 | 10 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 |
| 15 | F | 0.47 | 0.355 | 0.1 | 10 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 16 | M | 0.5 | 0.4 | 0.13 | 12 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 |
| 17 | I | 0.355 | 0.28 | 0.085 | 7 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 18 | F | 0.44 | 0.34 | 0.1 | 10 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 19 | M | 0.365 | 0.295 | 0.08 | 7 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 20 | | 0.45 | 0.32 | 0.1 | 9 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| (20 rows) |
| </pre> |
| |
| -# You can also use a logical expression in the categorical columns, which will |
| be passed as boolean, and in the output table there will be two columns with |
| name logical_expression_true and logical_expression_false: |
| <pre class="example"> |
| DROP TABLE IF EXISTS abalone_out, abalone_out_dictionary; |
| SELECT madlib.encode_categorical_variables ( |
| 'abalone', -- Source table |
| 'abalone_out', -- Output table |
| 'height>.10' -- Categorical columns |
| ); |
| SELECT * FROM abalone_out ORDER BY id; |
| </pre> |
| <pre class="result"> |
| id | sex | length | diameter | height | rings | height>.10_false | height>.10_true |
| ----+-----+--------+----------+--------+-------+------------------+----------------- |
| 1 | M | 0.455 | 0.365 | 0.095 | 15 | 1 | 0 |
| 2 | M | 0.35 | 0.265 | 0.09 | 7 | 1 | 0 |
| 3 | F | 0.53 | 0.42 | 0.135 | 9 | 0 | 1 |
| 4 | M | 0.44 | 0.365 | 0.125 | 10 | 0 | 1 |
| 5 | I | 0.33 | 0.255 | 0.08 | 7 | 1 | 0 |
| 6 | I | 0.425 | 0.3 | 0.095 | 8 | 1 | 0 |
| 7 | F | 0.53 | 0.415 | 0.15 | 20 | 0 | 1 |
| 8 | F | 0.545 | 0.425 | 0.125 | 16 | 0 | 1 |
| 9 | M | 0.475 | 0.37 | 0.125 | 9 | 0 | 1 |
| 10 | | 0.55 | 0.44 | 0.15 | 19 | 0 | 1 |
| 11 | F | 0.525 | 0.38 | 0.14 | 14 | 0 | 1 |
| 12 | M | 0.43 | 0.35 | 0.11 | 10 | 0 | 1 |
| 13 | M | 0.49 | 0.38 | 0.135 | 11 | 0 | 1 |
| 14 | F | 0.535 | 0.405 | 0.145 | 10 | 0 | 1 |
| 15 | F | 0.47 | 0.355 | 0.1 | 10 | 1 | 0 |
| 16 | M | 0.5 | 0.4 | 0.13 | 12 | 0 | 1 |
| 17 | I | 0.355 | 0.28 | 0.085 | 7 | 1 | 0 |
| 18 | F | 0.44 | 0.34 | 0.1 | 10 | 1 | 0 |
| 19 | M | 0.365 | 0.295 | 0.08 | 7 | 1 | 0 |
| 20 | | 0.45 | 0.32 | 0.1 | 9 | 1 | 0 |
| (20 rows) |
| </pre> |
| |
| @anchor literature |
| @literature |
| |
| @anchor svm-lit-1 |
| [1] https://en.wikipedia.org/wiki/Categorical_variable |
| |
| [2] https://archive.ics.uci.edu/ml/datasets/Abalone |
| |
| */ |
| ------------------------------------------------------------------------- |
| |
| /** |
| * @brief Encode categorical columns using either one-hot encoding or dummy coding |
| * |
| * @param source_table Name of table containing categorical variable |
| * @param output_table Name of table to output dummy variables |
| * @param categorical_cols Comma-separated list of column names to dummy code (can be '*') |
| * @param categorical_cols_to_exclude Comma-separated list of column names to exclude (if categorical_cols = '*') |
| * @param row_id Columns from source table to index output table |
| * @param top Parameter to include only top values of a categorical variable |
| * @param value_to_drop Parameter to set reference column in dummy coding |
| * @param encode_null Boolean to determine the behavior for rows with NULL value |
| * @param output_type Parameter to set output data type: 'column', 'array' or 'svec' |
| * @param output_dictionary Boolean to simplify column naming and with a separate |
| * mapping table to actual values |
| * @param distributed_by Comma-separated list of column names to use for distribution of output |
| * |
| * @return Void |
| * |
| */ |
| |
| -- Create the below function for PostgreSQL but ensure that distributed_by is a no-op |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.encode_categorical_variables( |
| source_table VARCHAR, |
| output_table VARCHAR, |
| categorical_cols VARCHAR, |
| categorical_cols_to_exclude VARCHAR, |
| row_id VARCHAR, |
| top VARCHAR, |
| value_to_drop VARCHAR, |
| encode_null BOOLEAN, |
| output_type VARCHAR, |
| output_dictionary BOOLEAN, |
| distributed_by VARCHAR |
| ) RETURNS VOID AS $$ |
| PythonFunction(utilities, encode_categorical, encode_categorical_variables) |
| $$ LANGUAGE plpythonu |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `'); |
| |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.encode_categorical_variables( |
| source_table VARCHAR, |
| output_table VARCHAR, |
| categorical_cols VARCHAR, |
| categorical_cols_to_exclude VARCHAR, |
| row_id VARCHAR, |
| top VARCHAR, |
| value_to_drop VARCHAR, |
| encode_null BOOLEAN, |
| output_type VARCHAR, |
| output_dictionary BOOLEAN |
| ) RETURNS VOID AS $$ |
| PythonFunction(utilities, encode_categorical, encode_categorical_variables) |
| $$ LANGUAGE plpythonu |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `'); |
| |
| -- Overloaded functions -------------------------------------------------------- |
| -- Default values are set by underlying Python function |
| -------------------------------------------------------------------------------- |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.encode_categorical_variables( |
| source_table VARCHAR, |
| output_table VARCHAR, |
| categorical_cols VARCHAR, |
| categorical_cols_to_exclude VARCHAR, |
| row_id VARCHAR, |
| top VARCHAR, |
| value_to_drop VARCHAR, |
| encode_null BOOLEAN, |
| output_type VARCHAR |
| ) RETURNS VOID AS $$ |
| PythonFunction(utilities, encode_categorical, encode_categorical_variables) |
| $$ LANGUAGE plpythonu |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `'); |
| |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.encode_categorical_variables( |
| source_table VARCHAR, |
| output_table VARCHAR, |
| categorical_cols VARCHAR, |
| categorical_cols_to_exclude VARCHAR, |
| row_id VARCHAR, |
| top VARCHAR, |
| value_to_drop VARCHAR, |
| encode_null BOOLEAN |
| ) RETURNS VOID AS $$ |
| PythonFunction(utilities, encode_categorical, encode_categorical_variables) |
| $$ LANGUAGE plpythonu |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `'); |
| |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.encode_categorical_variables( |
| source_table VARCHAR, |
| output_table VARCHAR, |
| categorical_cols VARCHAR, |
| categorical_cols_to_exclude VARCHAR, |
| row_id VARCHAR, |
| top VARCHAR, |
| value_to_drop VARCHAR |
| ) RETURNS VOID AS $$ |
| PythonFunction(utilities, encode_categorical, encode_categorical_variables) |
| $$ LANGUAGE plpythonu |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `'); |
| |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.encode_categorical_variables( |
| source_table VARCHAR, |
| output_table VARCHAR, |
| categorical_cols VARCHAR, |
| categorical_cols_to_exclude VARCHAR, |
| row_id VARCHAR, |
| top VARCHAR |
| ) RETURNS VOID AS $$ |
| PythonFunction(utilities, encode_categorical, encode_categorical_variables) |
| $$ LANGUAGE plpythonu |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `'); |
| |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.encode_categorical_variables( |
| source_table VARCHAR, |
| output_table VARCHAR, |
| categorical_cols VARCHAR, |
| categorical_cols_to_exclude VARCHAR, |
| row_id VARCHAR |
| ) RETURNS VOID AS $$ |
| PythonFunction(utilities, encode_categorical, encode_categorical_variables) |
| $$ LANGUAGE plpythonu |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `'); |
| |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.encode_categorical_variables( |
| source_table VARCHAR, |
| output_table VARCHAR, |
| categorical_cols VARCHAR, |
| categorical_cols_to_exclude VARCHAR |
| ) RETURNS VOID AS $$ |
| PythonFunction(utilities, encode_categorical, encode_categorical_variables) |
| $$ LANGUAGE plpythonu |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `'); |
| |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.encode_categorical_variables( |
| source_table VARCHAR, |
| output_table VARCHAR, |
| categorical_cols VARCHAR |
| ) RETURNS VOID AS $$ |
| PythonFunction(utilities, encode_categorical, encode_categorical_variables) |
| $$ LANGUAGE plpythonu |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `'); |
| |
| -- Online help ----------------------------------------------------------------- |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.encode_categorical_variables( |
| message VARCHAR |
| ) RETURNS VARCHAR AS $$ |
| PythonFunction(utilities, encode_categorical, encode_categorical_help) |
| $$ LANGUAGE plpythonu IMMUTABLE |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `CONTAINS SQL', `'); |
| |
| -------------------------------------------------------------------------------- |
| |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.encode_categorical_variables() |
| RETURNS VARCHAR AS $$ |
| SELECT MADLIB_SCHEMA.encode_categorical_variables(''); |
| $$ LANGUAGE sql IMMUTABLE |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `CONTAINS SQL', `'); |
| -------------------------------------------------------------------------------- |