| /* ----------------------------------------------------------------------- *//** |
| * |
| * Licensed to the Apache Software Foundation (ASF) under one |
| * or more contributor license agreements. See the NOTICE file |
| * distributed with this work for additional information |
| * regarding copyright ownership. The ASF licenses this file |
| * to you under the Apache License, Version 2.0 (the |
| * "License"); you may not use this file except in compliance |
| * with the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, |
| * software distributed under the License is distributed on an |
| * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| * KIND, either express or implied. See the License for the |
| * specific language governing permissions and limitations |
| * under the License. |
| * |
| * |
| * @file pivot.sql_in |
| * |
| * @brief SQL functions for pivoting |
| * @date June 2016 |
| * |
| * @sa Creates a pivot table for data summarization. |
| * |
| *//* ----------------------------------------------------------------------- */ |
| |
| m4_include(`SQLCommon.m4') |
| |
| /** |
| @addtogroup grp_pivot |
| |
| <div class="toc"><b>Contents</b> |
| <ul> |
| <li><a href="#pivoting">Pivoting</a></li> |
| <li><a href="#notes">Notes</a></li> |
| <li><a href="#examples">Examples</a></li> |
| <li><a href="#literature">Literature</a></li> |
| </ul> |
| </div> |
| |
| @brief Pivoting and data summarization tools for preparing data |
| for modeling operations. |
| |
| @anchor pivoting |
| The goal of the MADlib pivot function is to provide a data summarization tool |
| that can do basic OLAP type operations on data stored in one table and output |
| the summarized data to a second table. |
| |
| |
| <pre class="syntax"> |
| pivot( |
| source_table, |
| output_table, |
| index, |
| pivot_cols, |
| pivot_values, |
| aggregate_func, |
| fill_value, |
| keep_null, |
| output_col_dictionary, |
| output_type |
| ) |
| </pre> |
| \b Arguments |
| <dl class="arglist"> |
| <dt>source_table</dt> |
| <dd>VARCHAR. Name of the source table (or view) containing data to |
| pivot.</dd> |
| |
| <dt>output_table</dt> |
| <dd>VARCHAR. Name of output table that contains the pivoted data. |
| The output table contains all the columns present in |
| the <em>'index'</em> column list, plus additional columns for each |
| distinct value in <em>'pivot_cols'</em>. |
| |
| @note The names of the columns in the output table are auto-generated. |
| Please see the examples section below to see how this works in practice. |
| The convention used is to concatenate the following strings and separate |
| each by an underscore '_' : |
| - name of the value column <em>'pivot_values'</em> |
| - aggregate function |
| - name of the pivot column <em>'pivot_cols'</em> |
| - values in the pivot column |
| </dd> |
| |
| <dt>index </dt> |
| <dd>VARCHAR. Comma-separated columns that will form the index of the output |
| pivot table. By index we mean the values to group by; these are the rows |
| in the output pivot table.</dd> |
| |
| <dt>pivot_cols </dt> |
| <dd>VARCHAR. Comma-separated columns that will form the columns of the |
| output pivot table.</dd> |
| |
| <dt>pivot_values </dt> |
| <dd>VARCHAR. Comma-separated columns that contain the values to be |
| summarized in the output pivot table.</dd> |
| |
| <dt>aggregate_func (optional)</dt> |
| <dd>VARCHAR. default: 'AVG'. A comma-separated list of aggregates to be |
| applied to values. These can be PostgreSQL built-in aggregates [1] or UDAs. It is |
| possible to assign a set of aggregates per value column. Please refer to the |
| examples 12\-14 below for syntax details.</dd> |
| |
| @note Only aggregates with |
| strict transition functions are permitted here. |
| A strict transition function means rows with null values are ignored; |
| the function is not called and the previous state value is retained. |
| If you need some other behavior for null inputs, this should |
| be done prior to calling the pivot function. |
| Aggregates with strict transition |
| functions are described in [2,3]. |
| |
| <dt>fill_value (optional)</dt> |
| <dd>VARCHAR. default: NULL. If specified, determines how to fill NULL |
| values resulting from pivot operation. This is a global parameter (not |
| applied per aggregate) and is applied post-aggregation to the output |
| table.</dd> |
| |
| <dt>keep_null (optional)</dt> |
| <dd>BOOLEAN. default: FALSE. If TRUE, then pivot columns are created |
| corresponding to NULL categories. If FALSE, then no pivot columns will be |
| created for NULL categories.</dd> |
| |
| <dt>output_col_dictionary (optional)</dt> |
| <dd>BOOLEAN. default: FALSE. This parameter is used to handle |
| auto-generated column names that exceed the PostgreSQL limit of 63 bytes |
| (could be a common occurrence). If TRUE, column names will be set as |
| numerical IDs and will create a dictionary table called output_table |
| appended with _dictionary. If FALSE, will auto-generate column names in |
| the usual way unless the limit of 63 bytes will be exceeded. In this case, |
| a dictionary output file will be created and a message given to the user. |
| </dd> |
| |
| <dt>output_type (optional)</dt> |
| <dd>VARCHAR. default: 'column'. This parameter controls the output format |
| of the pivoted variables. If 'column', a column is created for each pivot |
| variable. PostgreSQL limits the number of columns in a table |
| (250 - 1600 depending on column types). |
| If the total number of output columns exceeds this limit, then make this |
| parameter either 'array' (to combine the output columns into an array) or |
| 'svec' (to cast the array output to <em>'madlib.svec'</em> type). |
| If you have an 'aggregate_func' that has an array return type, |
| it cannot be combined with 'output_type'='array' or 'svec'. |
| |
| A dictionary will be created (<em>output_col_dictionary=TRUE</em>) |
| when 'output_type' is 'array' or 'svec' to define each index into the array. |
| </dd> |
| |
| </dl> |
| |
| @anchor notes |
| @note |
| - NULLs in the index column are treated like any other value. |
| - NULLs in the pivot column are ignored unless keep_null is TRUE. |
| - Only strict transition functions are |
| allowed so NULLs are ignored. |
| - It is not allowed to set the fill_value parameter without setting the |
| aggregate_func parameter due to possible ambiguity. Set |
| aggregate_func to NULL for the default behavior and use fill_value as desired. |
| Please note that full_value must be of the same type as the output of the |
| aggregate_func (or capable of being cast to the same type by PostgreSQL), |
| or else an error will result. |
| - It is not allowed to set the output_col_dictionary parameter without setting |
| the keep_null parameter due to possible ambiguity. Set |
| keep_null to NULL for the default behavior and use output_col_dictionary as |
| desired. |
| - Expressions (instead of column names) are not supported. Create a view with the |
| desired expressions and pass it as the input table (see example 3 below). |
| - It is allowed to pass a partial mapping for the aggregate_func parameter. The |
| missing value columns will be aggregated using the default function (average). |
| |
| @anchor examples |
| @examp |
| |
| -# Create a simple dataset to demonstrate a basic pivot: |
| <pre class="example"> |
| DROP TABLE IF EXISTS pivset CASCADE; -- View below may depend on table so use CASCADE |
| CREATE TABLE pivset( |
| id INTEGER, |
| piv INTEGER, |
| val FLOAT8 |
| ); |
| INSERT INTO pivset VALUES |
| (0, 10, 1), |
| (0, 10, 2), |
| (0, 20, 3), |
| (1, 20, 4), |
| (1, 30, 5), |
| (1, 30, 6), |
| (1, 10, 7), |
| (NULL, 10, 8), |
| (1, NULL, 9), |
| (1, 10, NULL); |
| </pre> |
| |
| -# Pivot the table: |
| <pre class="example"> |
| DROP TABLE IF EXISTS pivout; |
| SELECT madlib.pivot('pivset', 'pivout', 'id', 'piv', 'val'); |
| SELECT * FROM pivout ORDER BY id; |
| </pre> |
| <pre class="result"> |
| id | val_avg_piv_10 | val_avg_piv_20 | val_avg_piv_30 |
| ----+----------------+----------------+---------------- |
| 0 | 1.5 | 3 | |
| 1 | 7 | 4 | 5.5 |
| | 8 | | |
| </pre> |
| Here NULL is showing as an empty cell in the output. |
| |
| -# Now let's add some more columns to our data set and create a view: |
| <pre class="example"> |
| DROP VIEW IF EXISTS pivset_ext; |
| CREATE VIEW pivset_ext AS |
| SELECT *, |
| COALESCE(id + (val / 3)::integer, 0) AS id2, |
| COALESCE(100*(val / 3)::integer, 0) AS piv2, |
| COALESCE(val + 10, 0) AS val2 |
| FROM pivset; |
| SELECT id,id2,piv,piv2,val,val2 FROM pivset_ext |
| ORDER BY id,id2,piv,piv2,val,val2; |
| </pre> |
| <pre class="result"> |
| id | id2 | piv | piv2 | val | val2 |
| ----+-----+-----+------+-----+------ |
| 0 | 0 | 10 | 0 | 1 | 11 |
| 0 | 1 | 10 | 100 | 2 | 12 |
| 0 | 1 | 20 | 100 | 3 | 13 |
| 1 | 0 | 10 | 0 | | 0 |
| 1 | 2 | 20 | 100 | 4 | 14 |
| 1 | 3 | 10 | 200 | 7 | 17 |
| 1 | 3 | 30 | 200 | 5 | 15 |
| 1 | 3 | 30 | 200 | 6 | 16 |
| 1 | 4 | | 300 | 9 | 19 |
| | 0 | 10 | 300 | 8 | 18 |
| (10 rows) |
| </pre> |
| |
| -# Let's use a different aggregate function on the view we |
| just created: |
| <pre class="example"> |
| DROP TABLE IF EXISTS pivout; |
| SELECT madlib.pivot('pivset_ext', 'pivout', 'id', 'piv', 'val', 'sum'); |
| SELECT * FROM pivout ORDER BY id; |
| </pre> |
| <pre class="result"> |
| id | val_sum_piv_10 | val_sum_piv_20 | val_sum_piv_30 |
| ----+----------------+----------------+---------------- |
| 0 | 3 | 3 | |
| 1 | 7 | 4 | 11 |
| | 8 | | |
| </pre> |
| |
| -# Now create a custom aggregate. Note that the aggregate |
| must have a strict transition function: |
| <pre class="example"> |
| DROP FUNCTION IF EXISTS array_add1 (ANYARRAY, ANYELEMENT) CASCADE; |
| CREATE FUNCTION array_add1(ANYARRAY, ANYELEMENT) RETURNS ANYARRAY AS $$ |
| SELECT $1 || $2 |
| $$ LANGUAGE sql STRICT; |
| DROP AGGREGATE IF EXISTS array_accum1 (anyelement); |
| CREATE AGGREGATE array_accum1 (anyelement) ( |
| sfunc = array_add1, |
| stype = anyarray, |
| initcond = '{}' |
| ); |
| DROP TABLE IF EXISTS pivout; |
| SELECT madlib.pivot('pivset_ext', 'pivout', 'id', 'piv', 'val', 'array_accum1'); |
| SELECT * FROM pivout ORDER BY id; |
| </pre> |
| <pre class="result"> |
| id | val_array_accum1_piv_10 | val_array_accum1_piv_20 | val_array_accum1_piv_30 |
| ----+-------------------------+-------------------------+------------------------- |
| 0 | {1,2} | {3} | {} |
| 1 | {7} | {4} | {5,6} |
| | {8} | {} | {} |
| </pre> |
| |
| -# Keep null values in the pivot column: |
| <pre class="example"> |
| DROP TABLE IF EXISTS pivout; |
| SELECT madlib.pivot('pivset_ext', 'pivout', 'id', 'piv', 'val', 'sum', True); |
| SELECT * FROM pivout ORDER BY id; |
| </pre> |
| <pre class="result"> |
| id | val_sum_piv_10 | val_sum_piv_20 | val_sum_piv_30 | val_sum_piv_null |
| ----+----------------+----------------+----------------+------------------ |
| 0 | 3 | 3 | | |
| 1 | 7 | 4 | 11 | 9 |
| | 8 | | | |
| </pre> |
| |
| -# Fill null results with a value of interest: |
| <pre class="example"> |
| DROP TABLE IF EXISTS pivout; |
| SELECT madlib.pivot('pivset_ext', 'pivout', 'id', 'piv', 'val', 'sum', '111'); |
| SELECT * FROM pivout ORDER BY id; |
| </pre> |
| <pre class="result"> |
| id | val_sum_piv_10 | val_sum_piv_20 | val_sum_piv_30 |
| ----+----------------+----------------+---------------- |
| 0 | 3 | 3 | 111 |
| 1 | 7 | 4 | 11 |
| | 8 | 111 | 111 |
| </pre> |
| |
| -# Use multiple index columns: |
| <pre class="example"> |
| DROP TABLE IF EXISTS pivout; |
| SELECT madlib.pivot('pivset_ext', 'pivout', 'id,id2', 'piv', 'val'); |
| SELECT * FROM pivout ORDER BY id,id2; |
| </pre> |
| <pre class="result"> |
| id | id2 | val_avg_piv_10 | val_avg_piv_20 | val_avg_piv_30 |
| ----+-----+----------------+----------------+---------------- |
| 0 | 0 | 1 | | |
| 0 | 1 | 2 | 3 | |
| 1 | 0 | | | |
| 1 | 2 | | 4 | |
| 1 | 3 | 7 | | 5.5 |
| 1 | 4 | | | |
| | 0 | 8 | | |
| </pre> |
| |
| -# Use multiple pivot columns with columnar output: |
| <pre class="example"> |
| \\x on |
| DROP TABLE IF EXISTS pivout; |
| SELECT madlib.pivot('pivset_ext', 'pivout', 'id', 'piv, piv2', 'val'); |
| SELECT * FROM pivout ORDER BY id; |
| </pre> |
| <pre class="result"> |
| -[ RECORD 1 ]-----------+---- |
| id | 0 |
| val_avg_piv_10_piv2_0 | 1 |
| val_avg_piv_10_piv2_100 | 2 |
| val_avg_piv_10_piv2_200 | |
| val_avg_piv_10_piv2_300 | |
| val_avg_piv_20_piv2_0 | |
| val_avg_piv_20_piv2_100 | 3 |
| val_avg_piv_20_piv2_200 | |
| val_avg_piv_20_piv2_300 | |
| val_avg_piv_30_piv2_0 | |
| val_avg_piv_30_piv2_100 | |
| val_avg_piv_30_piv2_200 | |
| val_avg_piv_30_piv2_300 | |
| -[ RECORD 2 ]-----------+---- |
| id | 1 |
| val_avg_piv_10_piv2_0 | |
| val_avg_piv_10_piv2_100 | |
| val_avg_piv_10_piv2_200 | 7 |
| val_avg_piv_10_piv2_300 | |
| val_avg_piv_20_piv2_0 | |
| val_avg_piv_20_piv2_100 | 4 |
| val_avg_piv_20_piv2_200 | |
| val_avg_piv_20_piv2_300 | |
| val_avg_piv_30_piv2_0 | |
| val_avg_piv_30_piv2_100 | |
| val_avg_piv_30_piv2_200 | 5.5 |
| val_avg_piv_30_piv2_300 | |
| ... |
| </pre> |
| |
| -# Use multiple pivot columns (same as above) with an array output: |
| <pre class="example"> |
| DROP TABLE IF EXISTS pivout, pivout_dictionary; |
| SELECT madlib.pivot('pivset_ext', 'pivout', 'id', 'piv, piv2', 'val', |
| NULL, NULL, FALSE, FALSE, 'array'); |
| \\x off |
| SELECT * FROM pivout ORDER BY id; |
| </pre> |
| <pre class="result"> |
| id | val_avg |
| --------+------------------------------------------------------------ |
| 0 | {1,2,NULL,NULL,NULL,3,NULL,NULL,NULL,NULL,NULL,NULL} |
| 1 | {NULL,NULL,7,NULL,NULL,4,NULL,NULL,NULL,NULL,5.5,NULL} |
| [NULL] | {NULL,NULL,NULL,8,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL} |
| </pre> |
| <pre class="example"> |
| -- Use the dictionary to understand what each index of an array corresponds to |
| SELECT * FROM pivout_dictionary; |
| </pre> |
| <pre class="result"> |
| __pivot_cid__ | pval | agg | piv | piv2 | col_name |
| ---------------+------+-----+-----+------+--------------------------- |
| 1 | val | avg | 10 | 0 | "val_avg_piv_10_piv2_0" |
| 2 | val | avg | 10 | 100 | "val_avg_piv_10_piv2_100" |
| 3 | val | avg | 10 | 200 | "val_avg_piv_10_piv2_200" |
| 4 | val | avg | 10 | 300 | "val_avg_piv_10_piv2_300" |
| 5 | val | avg | 20 | 0 | "val_avg_piv_20_piv2_0" |
| 6 | val | avg | 20 | 100 | "val_avg_piv_20_piv2_100" |
| 7 | val | avg | 20 | 200 | "val_avg_piv_20_piv2_200" |
| 8 | val | avg | 20 | 300 | "val_avg_piv_20_piv2_300" |
| 9 | val | avg | 30 | 0 | "val_avg_piv_30_piv2_0" |
| 10 | val | avg | 30 | 100 | "val_avg_piv_30_piv2_100" |
| 11 | val | avg | 30 | 200 | "val_avg_piv_30_piv2_200" |
| 12 | val | avg | 30 | 300 | "val_avg_piv_30_piv2_300" |
| </pre> |
| |
| -# Use multiple value columns: |
| <pre class="example"> |
| DROP TABLE IF EXISTS pivout; |
| SELECT madlib.pivot('pivset_ext', 'pivout', 'id', 'piv', 'val, val2'); |
| \\x on |
| SELECT * FROM pivout ORDER BY id; |
| </pre> |
| <pre class="result"> |
| -[ RECORD 1 ]---+----- |
| id | 0 |
| val_avg_piv_10 | 1.5 |
| val_avg_piv_20 | 3 |
| val_avg_piv_30 | |
| val2_avg_piv_10 | 11.5 |
| val2_avg_piv_20 | 13 |
| val2_avg_piv_30 | |
| -[ RECORD 2 ]---+----- |
| id | 1 |
| val_avg_piv_10 | 7 |
| val_avg_piv_20 | 4 |
| val_avg_piv_30 | 5.5 |
| val2_avg_piv_10 | 8.5 |
| val2_avg_piv_20 | 14 |
| val2_avg_piv_30 | 15.5 |
| ... |
| </pre> |
| |
| |
| -# Use multiple aggregate functions on the same value column (cross product): |
| <pre class="example"> |
| DROP TABLE IF EXISTS pivout; |
| SELECT madlib.pivot('pivset_ext', 'pivout', 'id', 'piv', 'val', 'avg, sum'); |
| \\x on |
| SELECT * FROM pivout ORDER BY id; |
| </pre> |
| <pre class="result"> |
| -[ RECORD 1 ]--+---- |
| id | 0 |
| val_avg_piv_10 | 1.5 |
| val_avg_piv_20 | 3 |
| val_avg_piv_30 | |
| val_sum_piv_10 | 3 |
| val_sum_piv_20 | 3 |
| val_sum_piv_30 | |
| -[ RECORD 2 ]--+---- |
| id | 1 |
| val_avg_piv_10 | 7 |
| val_avg_piv_20 | 4 |
| val_avg_piv_30 | 5.5 |
| val_sum_piv_10 | 7 |
| val_sum_piv_20 | 4 |
| val_sum_piv_30 | 11 |
| ... |
| </pre> |
| |
| -# Use different aggregate functions for different value columns: |
| <pre class="example"> |
| DROP TABLE IF EXISTS pivout; |
| SELECT madlib.pivot('pivset_ext', 'pivout', 'id', 'piv', 'val, val2', |
| 'val=avg, val2=sum'); |
| \\x on |
| SELECT * FROM pivout ORDER BY id; |
| </pre> |
| <pre class="result"> |
| -[ RECORD 1 ]---+---- |
| id | 0 |
| val_avg_piv_10 | 1.5 |
| val_avg_piv_20 | 3 |
| val_avg_piv_30 | |
| val2_sum_piv_10 | 23 |
| val2_sum_piv_20 | 13 |
| val2_sum_piv_30 | |
| -[ RECORD 2 ]---+---- |
| id | 1 |
| val_avg_piv_10 | 7 |
| val_avg_piv_20 | 4 |
| val_avg_piv_30 | 5.5 |
| val2_sum_piv_10 | 17 |
| val2_sum_piv_20 | 14 |
| val2_sum_piv_30 | 31 |
| ... |
| </pre> |
| |
| -# Use multiple aggregate functions for different value columns: |
| <pre class="example"> |
| DROP TABLE IF EXISTS pivout; |
| SELECT madlib.pivot('pivset_ext', 'pivout', 'id', 'piv', 'val, val2', |
| 'val=avg, val2=[avg,sum]'); |
| \\x on |
| SELECT * FROM pivout ORDER BY id; |
| </pre> |
| <pre class="result"> |
| -[ RECORD 1 ]---+----- |
| id | 0 |
| val_avg_piv_10 | 1.5 |
| val_avg_piv_20 | 3 |
| val_avg_piv_30 | |
| val2_avg_piv_10 | 11.5 |
| val2_avg_piv_20 | 13 |
| val2_avg_piv_30 | |
| val2_sum_piv_10 | 23 |
| val2_sum_piv_20 | 13 |
| val2_sum_piv_30 | |
| -[ RECORD 2 ]---+----- |
| id | 1 |
| val_avg_piv_10 | 7 |
| val_avg_piv_20 | 4 |
| val_avg_piv_30 | 5.5 |
| val2_avg_piv_10 | 8.5 |
| val2_avg_piv_20 | 14 |
| val2_avg_piv_30 | 15.5 |
| val2_sum_piv_10 | 17 |
| val2_sum_piv_20 | 14 |
| val2_sum_piv_30 | 31 |
| ... |
| </pre> |
| |
| -# Combine all of the options: |
| <pre class="example"> |
| DROP TABLE IF EXISTS pivout; |
| SELECT madlib.pivot('pivset_ext', 'pivout', 'id, id2', 'piv, piv2', 'val, val2', |
| 'val=avg, val2=[avg,sum]', '111', True); |
| \\x on |
| SELECT * FROM pivout ORDER BY id,id2; |
| </pre> |
| <pre class="result"> |
| -[ RECORD 1 ]--------------+----- |
| id | 0 |
| id2 | 0 |
| val_avg_piv_null_piv2_0 | 111 |
| val_avg_piv_null_piv2_100 | 111 |
| val_avg_piv_null_piv2_200 | 111 |
| val_avg_piv_null_piv2_300 | 111 |
| val_avg_piv_10_piv2_0 | 1 |
| val_avg_piv_10_piv2_100 | 111 |
| val_avg_piv_10_piv2_200 | 111 |
| val_avg_piv_10_piv2_300 | 111 |
| val_avg_piv_20_piv2_0 | 111 |
| val_avg_piv_20_piv2_100 | 111 |
| val_avg_piv_20_piv2_200 | 111 |
| val_avg_piv_20_piv2_300 | 111 |
| val_avg_piv_30_piv2_0 | 111 |
| val_avg_piv_30_piv2_100 | 111 |
| val_avg_piv_30_piv2_200 | 111 |
| val_avg_piv_30_piv2_300 | 111 |
| val2_avg_piv_null_piv2_0 | 111 |
| val2_avg_piv_null_piv2_100 | 111 |
| val2_avg_piv_null_piv2_200 | 111 |
| val2_avg_piv_null_piv2_300 | 111 |
| val2_avg_piv_10_piv2_0 | 11 |
| val2_avg_piv_10_piv2_100 | 111 |
| ... |
| -[ RECORD 2 ]--------------+----- |
| id | 0 |
| id2 | 1 |
| val_avg_piv_null_piv2_0 | 111 |
| val_avg_piv_null_piv2_100 | 111 |
| val_avg_piv_null_piv2_200 | 111 |
| val_avg_piv_null_piv2_300 | 111 |
| val_avg_piv_10_piv2_0 | 111 |
| val_avg_piv_10_piv2_100 | 2 |
| val_avg_piv_10_piv2_200 | 111 |
| val_avg_piv_10_piv2_300 | 111 |
| val_avg_piv_20_piv2_0 | 111 |
| val_avg_piv_20_piv2_100 | 3 |
| val_avg_piv_20_piv2_200 | 111 |
| val_avg_piv_20_piv2_300 | 111 |
| val_avg_piv_30_piv2_0 | 111 |
| val_avg_piv_30_piv2_100 | 111 |
| val_avg_piv_30_piv2_200 | 111 |
| val_avg_piv_30_piv2_300 | 111 |
| val2_avg_piv_null_piv2_0 | 111 |
| val2_avg_piv_null_piv2_100 | 111 |
| val2_avg_piv_null_piv2_200 | 111 |
| val2_avg_piv_null_piv2_300 | 111 |
| ... |
| </pre> |
| |
| -# Create a dictionary for output column names: |
| <pre class="example"> |
| DROP TABLE IF EXISTS pivout, pivout_dictionary; |
| SELECT madlib.pivot('pivset_ext', 'pivout', 'id, id2', 'piv, piv2', 'val, val2', |
| 'val=avg, val2=[avg,sum]', '111', True, True); |
| \\x off |
| SELECT * FROM pivout_dictionary order by __pivot_cid__; |
| </pre> |
| <pre class="result"> |
| __pivot_cid__ | pval | agg | piv | piv2 | col_name |
| ---------------+------+-----+--------+------+------------------------------ |
| __p_1__ | val | avg | [NULL] | 0 | "val_avg_piv_null_piv2_0" |
| __p_2__ | val | avg | [NULL] | 100 | "val_avg_piv_null_piv2_100" |
| __p_3__ | val | avg | [NULL] | 200 | "val_avg_piv_null_piv2_200" |
| __p_4__ | val | avg | [NULL] | 300 | "val_avg_piv_null_piv2_300" |
| __p_5__ | val | avg | 10 | 0 | "val_avg_piv_10_piv2_0" |
| __p_6__ | val | avg | 10 | 100 | "val_avg_piv_10_piv2_100" |
| __p_7__ | val | avg | 10 | 200 | "val_avg_piv_10_piv2_200" |
| __p_8__ | val | avg | 10 | 300 | "val_avg_piv_10_piv2_300" |
| __p_9__ | val | avg | 20 | 0 | "val_avg_piv_20_piv2_0" |
| __p_10__ | val | avg | 20 | 100 | "val_avg_piv_20_piv2_100" |
| __p_11__ | val | avg | 20 | 200 | "val_avg_piv_20_piv2_200" |
| __p_12__ | val | avg | 20 | 300 | "val_avg_piv_20_piv2_300" |
| __p_13__ | val | avg | 30 | 0 | "val_avg_piv_30_piv2_0" |
| __p_14__ | val | avg | 30 | 100 | "val_avg_piv_30_piv2_100" |
| __p_15__ | val | avg | 30 | 200 | "val_avg_piv_30_piv2_200" |
| __p_16__ | val | avg | 30 | 300 | "val_avg_piv_30_piv2_300" |
| __p_17__ | val2 | avg | [NULL] | 0 | "val2_avg_piv_null_piv2_0" |
| __p_18__ | val2 | avg | [NULL] | 100 | "val2_avg_piv_null_piv2_100" |
| __p_19__ | val2 | avg | [NULL] | 200 | "val2_avg_piv_null_piv2_200" |
| __p_20__ | val2 | avg | [NULL] | 300 | "val2_avg_piv_null_piv2_300" |
| __p_21__ | val2 | avg | 10 | 0 | "val2_avg_piv_10_piv2_0" |
| ... |
| (48 rows) |
| </pre> |
| <pre class="example"> |
| \\x on |
| SELECT * FROM pivout ORDER BY id,id2; |
| </pre> |
| <pre class="result"> |
| -[ RECORD 1 ]---- |
| id | 0 |
| id2 | 0 |
| __p_1__ | 111 |
| __p_2__ | 111 |
| __p_3__ | 111 |
| __p_4__ | 111 |
| __p_5__ | 1 |
| __p_6__ | 111 |
| __p_7__ | 111 |
| __p_8__ | 111 |
| __p_9__ | 111 |
| __p_10__ | 111 |
| __p_11__ | 111 |
| __p_12__ | 111 |
| __p_13__ | 111 |
| ... |
| -[ RECORD 2 ]---- |
| id | 0 |
| id2 | 1 |
| __p_1__ | 111 |
| __p_2__ | 111 |
| __p_3__ | 111 |
| __p_4__ | 111 |
| __p_5__ | 111 |
| __p_6__ | 2 |
| __p_7__ | 111 |
| __p_8__ | 111 |
| __p_9__ | 111 |
| __p_10__ | 3 |
| __p_11__ | 111 |
| __p_12__ | 111 |
| __p_13__ | 111 |
| ... |
| -[ RECORD 3 ]---- |
| id | 1 |
| id2 | 0 |
| __p_1__ | 111 |
| __p_2__ | 111 |
| __p_3__ | 111 |
| __p_4__ | 111 |
| __p_5__ | 111 |
| __p_6__ | 111 |
| __p_7__ | 111 |
| __p_8__ | 111 |
| __p_9__ | 111 |
| __p_10__ | 111 |
| __p_11__ | 111 |
| __p_12__ | 111 |
| __p_13__ | 111 |
| ... |
| </pre> |
| |
| @anchor literature |
| @literature |
| |
| NOTE: The following links refer to documentation resources for the |
| current PostgreSQL database version. Depending upon your database |
| platform version, you may need to change "current" references in the |
| links to your database version. |
| |
| If your database platform uses the Greenplum Database (or related |
| variants), please check with the project community and/or your |
| database vendor to identify the PostgreSQL version it is based on. |
| |
| @anchor svm-lit-1 |
| [1] https://www.postgresql.org/docs/current/static/functions-aggregate.html |
| |
| [2] https://www.postgresql.org/docs/current/static/sql-createaggregate.html |
| |
| [3] https://www.postgresql.org/docs/current/static/xaggr.html |
| */ |
| |
| ------------------------------------------------------------------------- |
| |
| |
| /** |
| * @brief Helper function that can be used to pivot tables |
| * |
| * @param source_table The original data table |
| * @param output_table The output table that contains the dummy |
| * variable columns |
| * @param index The index columns to group by the records by |
| * @param pivot_cols The columns to pivot the table |
| * @param pivot_values The value columns to be summarized in the |
| * pivoted table |
| * @param aggregate_func The aggregate function to be applied to the |
| * values |
| * @param fill_value If specified, determines how to fill NULL |
| * values resulting from pivot operation |
| * @param keep_null The flag for determining how to handle NULL |
| * values in pivot columns |
| * @param output_col_dictionary The flag for enabling the creation of the |
| * output dictionary for shorter column names |
| * @return Void |
| * |
| */ |
| |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.pivot( |
| source_table TEXT, |
| output_table TEXT, |
| index TEXT, |
| pivot_cols TEXT, |
| pivot_values TEXT, |
| aggregate_func TEXT, |
| fill_value TEXT, |
| keep_null BOOLEAN, |
| output_col_dictionary BOOLEAN, |
| output_type TEXT |
| |
| ) RETURNS VOID AS $$ |
| PythonFunction(utilities, pivot, pivot) |
| $$ LANGUAGE plpythonu VOLATILE |
| m4_ifdef(`\_\_HAS_FUNCTION_PROPERTIES\_\_', `MODIFIES SQL DATA', `'); |
| |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.pivot( |
| source_table TEXT, |
| output_table TEXT, |
| index TEXT, |
| pivot_cols TEXT, |
| pivot_values TEXT, |
| aggregate_func TEXT, |
| fill_value TEXT, |
| keep_null BOOLEAN, |
| output_col_dictionary BOOLEAN |
| |
| ) RETURNS VOID AS $$ |
| SELECT MADLIB_SCHEMA.pivot($1, $2, $3, $4, $5, $6, $7, $8, $9, NULL) |
| $$ LANGUAGE sql VOLATILE |
| m4_ifdef(`\_\_HAS_FUNCTION_PROPERTIES\_\_', `CONTAINS SQL', `'); |
| |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.pivot( |
| source_table TEXT, |
| output_table TEXT, |
| index TEXT, |
| pivot_cols TEXT, |
| pivot_values TEXT, |
| aggregate_func TEXT, |
| fill_value TEXT, |
| keep_null BOOLEAN |
| |
| ) RETURNS VOID AS $$ |
| SELECT MADLIB_SCHEMA.pivot($1, $2, $3, $4, $5, $6, $7, $8, FALSE) |
| $$ LANGUAGE sql VOLATILE |
| m4_ifdef(`\_\_HAS_FUNCTION_PROPERTIES\_\_', `CONTAINS SQL', `'); |
| |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.pivot( |
| source_table TEXT, |
| output_table TEXT, |
| index TEXT, |
| pivot_cols TEXT, |
| pivot_values TEXT, |
| aggregate_func TEXT, |
| fill_value TEXT |
| |
| ) RETURNS VOID AS $$ |
| SELECT MADLIB_SCHEMA.pivot($1, $2, $3, $4, $5, $6, $7, FALSE, FALSE) |
| $$ LANGUAGE sql VOLATILE |
| m4_ifdef(`\_\_HAS_FUNCTION_PROPERTIES\_\_', `CONTAINS SQL', `'); |
| |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.pivot( |
| source_table TEXT, |
| output_table TEXT, |
| index TEXT, |
| pivot_cols TEXT, |
| pivot_values TEXT, |
| aggregate_func TEXT, |
| keep_null BOOLEAN |
| |
| ) RETURNS VOID AS $$ |
| SELECT MADLIB_SCHEMA.pivot($1, $2, $3, $4, $5, $6, NULL, $7, FALSE) |
| $$ LANGUAGE sql VOLATILE |
| m4_ifdef(`\_\_HAS_FUNCTION_PROPERTIES\_\_', `CONTAINS SQL', `'); |
| |
| |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.pivot( |
| source_table TEXT, |
| output_table TEXT, |
| index TEXT, |
| pivot_cols TEXT, |
| pivot_values TEXT, |
| aggregate_func TEXT |
| |
| ) RETURNS VOID AS $$ |
| SELECT MADLIB_SCHEMA.pivot($1, $2, $3, $4, $5, $6, NULL, FALSE, FALSE) |
| $$ LANGUAGE sql VOLATILE |
| m4_ifdef(`\_\_HAS_FUNCTION_PROPERTIES\_\_', `CONTAINS SQL', `'); |
| |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.pivot( |
| source_table TEXT, |
| output_table TEXT, |
| index TEXT, |
| pivot_cols TEXT, |
| pivot_values TEXT, |
| keep_null BOOLEAN |
| |
| ) RETURNS VOID AS $$ |
| SELECT MADLIB_SCHEMA.pivot($1, $2, $3, $4, $5, NULL, NULL, $6, FALSE) |
| $$ LANGUAGE sql VOLATILE |
| m4_ifdef(`\_\_HAS_FUNCTION_PROPERTIES\_\_', `CONTAINS SQL', `'); |
| |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.pivot( |
| source_table TEXT, |
| output_table TEXT, |
| index TEXT, |
| pivot_cols TEXT, |
| pivot_values TEXT |
| |
| ) RETURNS VOID AS $$ |
| SELECT MADLIB_SCHEMA.pivot($1, $2, $3, $4, $5, NULL, NULL, FALSE, FALSE) |
| $$ LANGUAGE sql VOLATILE |
| m4_ifdef(`\_\_HAS_FUNCTION_PROPERTIES\_\_', `CONTAINS SQL', `'); |
| |
| -- Online help |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.pivot( |
| message VARCHAR |
| ) RETURNS VARCHAR AS $$ |
| PythonFunction(utilities, pivot, pivot_help) |
| $$ LANGUAGE plpythonu IMMUTABLE |
| m4_ifdef(`\_\_HAS_FUNCTION_PROPERTIES\_\_', `CONTAINS SQL', `'); |
| |
| -------------------------------------------------------------------------------- |
| |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.pivot() |
| RETURNS VARCHAR AS $$ |
| SELECT MADLIB_SCHEMA.pivot(''); |
| $$ LANGUAGE sql IMMUTABLE |
| m4_ifdef(`\_\_HAS_FUNCTION_PROPERTIES\_\_', `CONTAINS SQL', `'); |
| -------------------------------------------------------------------------------- |
| |