blob: 925da2dc59355e110e0e1ae0d6ddb363fac314c6 [file] [log] [blame]
"""
@file pca_project.py_in
@namespace pca
"""
import plpy
import time
from linalg.matrix_ops import cast_dense_input_table_to_correct_columns
from linalg.matrix_ops import create_temp_sparse_matrix_table_with_dims
from linalg.matrix_ops import get_dims
from linalg.matrix_ops import validate_dense
from linalg.matrix_ops import validate_sparse
from utilities.utilities import __mad_version
from utilities.utilities import unique_string
from utilities.utilities import _assert
from utilities.utilities import _array_to_string
from utilities.validate_args import columns_exist_in_table
from utilities.validate_args import table_exists
from utilities.utilities import add_postfix
version_wrapper = __mad_version()
string_to_array = version_wrapper.select_vecfunc()
array_to_string = version_wrapper.select_vec_return()
ZERO_THRESHOLD = 1e-6
# Dense PCA help function
def pca_project_help(schema_madlib, usage_string=None, **kwargs):
"""
Given a usage string, give out function usage information.
"""
if usage_string is None:
usage_string = ''
if (usage_string.lower() == "usage"):
return """
----------------------------------------------------------------
Usage
----------------------------------------------------------------
SELECT {schema_madlib}.pca_project (
'tbl_source', -- Data table
'tbl_pc', -- Table with principal componenents
(obtained as output from pca_train)
'tbl_result', -- Result table
'row_id', -- Name of the column containing the row_id
-- Optional Parameters
----------------------------------------------------------------
'tbl_residual', -- Residual table (Default: NULL)
'tbl_result_summary', -- Result summary table (Default : NULL)
);
Output Tables
--------------------------------------------------------------------
The output is divided into three tables (two of which are optional)
--------------------------------------------------------------------
The output table ('tbl_result' above) encodes a dense matrix
with the projection onto the principal components. The matrix contains
the following columns:
'row_id' INTEGER, -- Row id of the output matrix
'row_vec' DOUBLE PRECISION[], -- A vector containing elements in the row of the matrix
--------------------------------------------------------------------
The residual table ('tbl_residual' above) encodes a dense residual
matrix which has the following columns
'row_id' INTEGER, -- Row id of the output matrix
'row_vec' DOUBLE PRECISION[], -- A vector containing elements in the row of the matrix
--------------------------------------------------------------------
The result summary table ('tbl_result_summary' above) has the following columns
'exec_time' INTEGER, -- Wall clock time (ms) of the function.
'residual_norm' DOUBLE PRECISION, -- Absolute error of the residuals
'relative_residual_norm' DOUBLE PRECISION -- Relative error of the residuals
----------------------------------------------------------------
""".format(schema_madlib=schema_madlib)
else:
return """
----------------------------------------------------------------
Summary: PCA Projection
----------------------------------------------------------------
PCA Projection: Projects a dataset to an already trained
space of principal components.
--
For function usage information, run
SELECT {schema_madlib}.pca_project('usage');
--
""".format(schema_madlib=schema_madlib)
# Sparse PCA help function
# ------------------------------------------------------------------------
def pca_sparse_project_help(schema_madlib, usage_string=None, **kwargs):
"""
Given a usage string, give out function usage information.
"""
if usage_string is None:
usage_string = ''
if (usage_string.lower() == "usage"):
return """
----------------------------------------------------------------
Usage
----------------------------------------------------------------
SELECT {schema_madlib}.pca_sparse_project (
'tbl_source', -- Data table
'tbl_pc', -- Table with principal componenents
(obtained as output from pca_train)
'tbl_result', -- Result table
'row_id', -- Name of the column containing the row_id
'col_id', -- Name of the column containing the col_id
'val_id', -- Name of the column containing the val_id
'row_dim' -- Row dimension of the sparse matrix
'col_dim' -- Column dimension of the sparse matrix
-- Optional Parameters
----------------------------------------------------------------
'tbl_residual', -- Residual table (Default: NULL)
'tbl_result_summary', -- Result summary table (Default : NULL)
);
Output Tables
----------------------------------------------------------------
The output is divided into three tables (two of which are optional)
-----------------------------------------------------------------------------------------
The output table ('tbl_result' above) encodes a dense matrix
with the projection onto the principal components. The matrix contains
the following columns:
'row_id' INTEGER, -- Row id of the output matrix
'row_vec' DOUBLE PRECISION[], -- A vector containing elements in the row of the matrix
-----------------------------------------------------------------------------------------
The residual table ('tbl_residual' above) encodes a dense residual
matrix which has the following columns
'row_id' INTEGER, -- Row id of the output matrix
'row_vec' DOUBLE PRECISION[], -- A vector containing elements in the row of the matrix
-----------------------------------------------------------------------------------------
The result summary table ('tbl_result_summary' above) has the following columns
'exec_time' INTEGER, -- Wall clock time (ms) of the function.
'residual_norm' DOUBLE PRECISION, -- Absolute error of the residuals
'relative_residual_norm' DOUBLE PRECISION -- Relative error of the residuals
----------------------------------------------------------------
""".format(schema_madlib=schema_madlib)
else:
return """
----------------------------------------------------------------
Summary: PCA Projection
----------------------------------------------------------------
PCA Projection: Projects a dataset to an already trained
space of principal components.
--
For function usage information, run
SELECT {schema_madlib}.pca_sparse_project('usage');
--
""".format(schema_madlib=schema_madlib)
# Validate arguments: Same as pca
# ------------------------------------------------------------------------
def _validate_args(schema_madlib,
source_table,
pc_table,
out_table,
row_id,
col_id=None,
val_id=None,
row_dim=None,
col_dim=None,
residual_table=None,
result_summary_table=None):
"""
Validates all arguments passed to the PCA function
Args:
@param schema_madlib Name of MADlib schema
@param source_table Name of the input table (containing data to project)
@param pc_table Name of table with principal components (output by the training function)
@param out_table Name of output table to store projection result
@param row_id Name of the row_id column
@param col_id Name of the col_id column (only for sparse matrices)
@param val_id Name of the val_id column (only for sparse matrices)
@param row_dim Number of rows in input matrix (only for sparse matrices)
@param col_dim Number of columns in input matrix (only for sparse matrices)
@param residual_table Name of the residual table (to store error in projection)
@param result_summary_table Name of result summary table
Returns:
None
Throws:
plpy.error if any argument is invalid
"""
_assert(source_table is not None and table_exists(source_table),
"PCA error: Source data table does not exist!")
_assert(pc_table is not None and table_exists(pc_table),
"PCA error: Principal comp. table does not exist!")
_assert(table_exists(add_postfix(pc_table, "_mean")),
"PCA error: Source data table column means does not exist!")
# Make sure that the output table does not exist
# Also check that the output table is not null
_assert(out_table and out_table.strip(),
"PCA error: Invalid output table name.")
_assert(not table_exists(out_table, only_first_schema=True),
"PCA error: Output table {0} already exists!".format(str(out_table)))
# Check that the result summary table is not empty
if result_summary_table is not None:
_assert(result_summary_table.strip(),
"PCA error: Invalid result summary table name!")
_assert(not table_exists(result_summary_table, only_first_schema=True),
"PCA error: Result summary table {0} already exists!".
format(result_summary_table))
# Check that the result summary table is not empty
if residual_table is not None:
_assert(residual_table.strip(),
"PCA error: Invalid residual table name!")
_assert(not table_exists(residual_table, only_first_schema=True),
"PCA error: Residual table {0} already exists!".
format(residual_table))
# Check that the row_id exists
_assert(columns_exist_in_table(source_table, [row_id], schema_madlib),
"PCA error: {1} column does not exist in {0}!".
format(source_table, "NULL" if row_id is None else row_id))
# For sparse inputs: Check that the row_id exists
if col_id or val_id:
_assert(col_id,
"PCA error: Column ID should be provided if value ID is input!")
_assert(val_id,
"PCA error: Value ID should be provided if column ID is input!")
_assert(columns_exist_in_table(source_table, [col_id], schema_madlib),
"PCA error: {1} column does not exist in {0}!".
format(source_table, col_id))
_assert(columns_exist_in_table(source_table, [val_id], schema_madlib),
"PCA error: {1} column does not exist in {0}!".
format(source_table, val_id))
_assert(row_dim > 0 and col_dim > 0,
"PCA error: row_dim/col_dim should be positive integer")
validate_sparse(source_table,
{'row': row_id, 'col': col_id, 'val': val_id},
checkCol=False)
# ------------------------------------------------------------------------
def pca_sparse_project(schema_madlib,
source_table,
pc_table,
out_table,
row_id,
col_id,
val_id,
row_dim,
col_dim,
residual_table,
result_summary_table,
**kwargs):
"""
PCA projection of the matrix in source_table.
This function is the specific call for pca projection. It projects
the input matrix into the principal components.
Args:
@param schema_madlib Name of MADlib schema
@param source_table Name of the input table (containing data to project)
@param pc_table Name of table with principal components (output by the training function)
@param out_table Name of output table to store projection result
@param row_id Name of the row_id column
@param col_id Name of the col_id column
@param val_id Name of the val_id column
@param row_dim Number of rows in input matrix
@param col_dim Number of columns in input matrix
@param residual_table Name of the residual table (to store error in projection)
@param result_summary_table Name of result summary table
Returns:
None
Throws:
plpy.error if any argument is invalid
"""
# Reset the message level to avoid random messages
old_msg_level = plpy.execute("""
SELECT setting
FROM pg_settings
WHERE name='client_min_messages'
""")[0]['setting']
plpy.execute('SET client_min_messages TO warning')
# Step 1: Validate the input arguments
_validate_args(schema_madlib,
source_table,
pc_table,
out_table,
row_id,
col_id,
val_id,
row_dim,
col_dim,
residual_table,
result_summary_table)
# Step 2: Create a copy of the sparse matrix and add row_dims and col_dims
# Warning: This changes the column names of the table
sparse_table_copy = "pg_temp." + unique_string() + "_sparse_table_copy"
create_temp_sparse_matrix_table_with_dims(source_table,
sparse_table_copy,
row_id,
col_id,
val_id,
row_dim,
col_dim)
# Step 3: Densify the input matrix
x_dense = "pg_temp." + unique_string() + "_dense"
plpy.execute("""
SELECT {schema_madlib}.matrix_densify(
'{sparse_table_copy}', 'row={row_id}, col={col_id}, val={val_id}',
'{x_dense}', 'row=row_id, col=col_id,val=row_vec')
""".format(**locals()))
# Step 4: Pass the densified matrix to regular PCA
pca_project(schema_madlib,
x_dense,
pc_table,
out_table,
'row_id',
residual_table,
result_summary_table)
# Step 4: Clean up
plpy.execute(
"""
DROP TABLE IF EXISTS {x_dense};
DROP TABLE IF EXISTS {sparse_table_copy};
""".format(x_dense=x_dense,
sparse_table_copy=sparse_table_copy))
plpy.execute("SET client_min_messages TO %s" % old_msg_level)
# ------------------------------------------------------------------------
def pca_project(schema_madlib,
source_table,
pc_table,
out_table,
row_id,
residual_table,
result_summary_table,
**kwargs):
"""
PCA projection of the matrix in source_table.
This function is the specific call for pca projection. It projects
the input matrix into the principal components.
Args:
@param schema_madlib Name of MADlib schema
@param source_table Name of the input table (containing data to project)
@param pc_table Name of table with principal components (output by the training function)
@param out_table Name of output table to store projection result
@param row_id Name of the row_id column
@param residual_table Name of the residual table (to store error in projection)
@param result_summary_table Name of result summary table
Returns:
None
Throws:
plpy.error if any argument is invalid
"""
t0 = time.time() # measure the starting time
# Reset the message level to avoid random messages
old_msg_level = plpy.execute("""
SELECT setting
FROM pg_settings
WHERE name='client_min_messages'
""")[0]['setting']
plpy.execute('SET client_min_messages TO warning')
# Step 1: Validate the input arguments
_validate_args(schema_madlib, source_table, pc_table, out_table,
row_id, None, None, None, None,
residual_table, result_summary_table)
# Make sure that the table has row_id and row_vec
source_table_copy = "pg_temp." + unique_string()
need_new_column_names = cast_dense_input_table_to_correct_columns(
schema_madlib, source_table, source_table_copy, row_id)
if(need_new_column_names):
source_table = source_table_copy
[row_dim, col_dim] = get_dims(source_table,
{'row': 'row_id', 'col': 'col_id',
'val': 'row_vec'})
validate_dense(source_table,
{'row': 'row_id', 'col': 'col_id', 'val': 'row_vec'},
checkCol=False, row_dim=row_dim)
# Step 2: Compute the PCA Projection matrix
# The R code to perform this step is
# p <- princomp(mat)
# low_rank_representation <- mat %*% p$loadings[,1:k]
# First normalize the data (Column means)
scaled_source_table = "pg_temp." + unique_string() + "_scaled_table"
x_std_str = _array_to_string([1] * col_dim)
pc_table_mean = add_postfix(pc_table, "_mean")
plpy.execute(
"""
CREATE TABLE {scaled_source_table} AS
SELECT
row_id,
({schema_madlib}.utils_normalize_data(
row_vec,
(select column_mean from {pc_table_mean}),
'{x_std_str}'::double precision[]))
AS row_vec
FROM {source_table}
""".format(schema_madlib=schema_madlib,
pc_table_mean=pc_table_mean,
source_table=source_table,
scaled_source_table=scaled_source_table,
x_std_str=x_std_str))
plpy.execute(
"""
SELECT {schema_madlib}.matrix_mult('{scaled_source_table}',
'trans=false,row=row_id, col=col_id, val=row_vec',
'{pc_table}',
'trans=TRUE, row=row_id, col=col_id, val=principal_components',
'{out_table}',
'row=row_id, col=col_id,val=row_vec');
""".format(schema_madlib=schema_madlib,
scaled_source_table=scaled_source_table,
pc_table=pc_table,
out_table=out_table))
# Step 3: Compute the Residual table (if required)
# Residual table: res = mat - proj
if residual_table or result_summary_table:
create_temp_residual_table = False
if not residual_table:
create_temp_residual_table = True
residual_table = "pg_temp." + unique_string() + "_temp_residual"
approx_table = "pg_temp." + unique_string() + "_approx"
# Build an approximate reconstruction of the data
plpy.execute(
"""
SELECT {schema_madlib}.matrix_mult('{out_table}',
'row=row_id, col=col_id, val=row_vec',
'{pc_table}',
'row=row_id, col=col_id, val=principal_components',
'{approx_table}',
'row=row_id, col=col_id, val=row_vec');
""".format(schema_madlib=schema_madlib,
out_table=out_table,
pc_table=pc_table,
approx_table=approx_table))
# Compute the difference between the reconstruction and real data
# Note that both the approximation and source data are recentered here
plpy.execute(
"""
SELECT {schema_madlib}.matrix_scale_and_add(
'{scaled_source_table}',
'row=row_id, col=col_id, val=row_vec',
'{approx_table}',
'row=row_id, col=col_id, val=row_vec',
-1,
'{residual_table}',
'row=row_id, col=col_id, val=row_vec');
""".format(schema_madlib=schema_madlib,
scaled_source_table=scaled_source_table,
approx_table=approx_table,
residual_table=residual_table))
# Step 4: Compute the results summary table (if required)
# If the residual table is not asked by the user, but he does ask for
# result summary table, then we need to compute the residuals
if result_summary_table:
source_table_norm = plpy.execute(
"""
SELECT {schema_madlib}.matrix_norm('{source_table}',
'row=row_id, col=col_id, val=row_vec') as r
""".format(schema_madlib=schema_madlib,
source_table=source_table,
row_id=row_id))[0]['r']
# Compute the norm of the residual table
residual_norm = plpy.execute(
"""
SELECT {schema_madlib}.matrix_norm('{residual_table_name}',
'row=row_id, col=col_id, val=row_vec') as r
""".format(schema_madlib=schema_madlib,
residual_table_name=residual_table,
row_id=row_id))[0]['r']
# Compute the relative error of the norm
# Prevent division by zero
if(source_table_norm > ZERO_THRESHOLD):
relative_residual_norm = residual_norm / source_table_norm
else:
relative_residual_norm = 0
plpy.execute(
"""
CREATE TABLE {result_summary_table} ( exec_time FLOAT8,
residual_norm FLOAT8,
relative_residual_norm FLOAT8);
""".format(result_summary_table=result_summary_table))
# Compute the time in milli-seconds
t1 = time.time()
dt = (t1 - t0) * 1000.
plpy.execute(
"""
INSERT INTO {result_summary_table} VALUES
({dt},
{residual_norm}::double precision,
{relative_residual_norm}::double precision);
""".format(dt=dt,
residual_norm=residual_norm,
result_summary_table=result_summary_table,
relative_residual_norm=relative_residual_norm))
plpy.execute("""
DROP TABLE IF EXISTS {approx_table};
""".format(approx_table=approx_table))
if create_temp_residual_table:
plpy.execute("""
DROP TABLE IF EXISTS {residual_table};
""".format(residual_table=residual_table))
plpy.execute("DROP TABLE IF EXISTS {0}".format(scaled_source_table))
plpy.execute("SET client_min_messages TO %s" % old_msg_level)