blob: 214500981f0b0f4aa046e64146c0400b14b5c5f2 [file] [log] [blame]
# coding=utf-8
"""
@file sparse_linear_systems.py_in
@brief sparse Linear Systems:
@namespace marginal
@brief Solve sparse Linear Systems
"""
import plpy
# use mad_vec to process arrays passed as strings in GPDB < 4.1 and PG < 9.0
import re
from utilities.utilities import __mad_version
from utilities.validate_args import table_exists
from utilities.validate_args import columns_exist_in_table
from utilities.validate_args import table_is_empty
from utilities.utilities import _string_to_array
version_wrapper = __mad_version()
string_to_array = version_wrapper.select_vecfunc()
array_to_string = version_wrapper.select_vec_return()
# Direct method: Algorithms dictionary
# ========================================================================
SUPPORTED_SOLVERS = ['direct', 'iterative']
DIRECT_ALG_DICT = {
'ldlt' : 1,
'llt' : 2}
ITERATIVE_IN_MEM_ALG_DICT = {
'cg-mem' : 1,
'bicgstab-mem' : 2,
'precond-cg-mem' : 3,
'precond-bicgstab-mem' : 4}
# In the future, when iterative disk algorithms are introduced, then these
# dictionaries should be differnt
ITERATIVE_ALG_DICT = ITERATIVE_IN_MEM_ALG_DICT
# ========================================================================
# Validate arguments
def _validate_args(schema_madlib,
lhs_table,
rhs_table,
out_table,
lhs_row_id,
lhs_col_id,
lhs_value,
rhs_row_id,
rhs_value,
num_vars,
grouping_cols,
optimizer,
optimizer_options):
"""
@brief Validate args for sparse linear systems
@param lhs_table String identifying the input A matrix
@param rhs_table String identifying the input b vector
@param out_table String identifying the output table to be created
@param lhs_row_id Column name containing the LHS of the equations
@param lhs_col_id Column name containing the LHS of the equations
@param lhs_value Column name containing the LHS of the equations
@param rhs_row_id Column name containing the RHS of the equations
@param rhs_value Column name containing the RHS of the equations
@param num_vars Number of variables in the system
@param grouping_sols Columns to group the linear systems by
@param optimzer Optimizer to be used
@param optimzer_options Optimizer options used
@return void
"""
# LHS and RHS Tables
if not lhs_table or lhs_table in ('null', '') or \
(not table_exists(lhs_table)):
plpy.error("Sparse Linear Systems error: LHS matrix table '%s' does not exist!" % lhs_table)
if not rhs_table or lhs_table in ('null', '') or \
(not table_exists(rhs_table)):
plpy.error("Sparse Linear Systems error: RHS table '%s' does not exist!" % rhs_table)
if table_is_empty(lhs_table):
plpy.error("Sparse Linear Systems error: LHS matrix table is empty!")
if table_is_empty(rhs_table):
plpy.error("Sparse Linear Systems error: RHS vector table is empty!")
# Output table checks
if out_table.lower() in ('null', ''):
plpy.error("Sparse Linear Systems error: Invalid output table name!")
if table_exists(out_table, only_first_schema=True):
plpy.error("Sparse Linear Systems error: Output table already exists!")
# Empty table checks
if table_is_empty(lhs_table):
plpy.error("Sparse Linear Systems error: Empty LHS table!")
if table_is_empty(rhs_table):
plpy.error("Sparse Linear Systems error: Empty RHS table!")
# Check the names for row_id, lhs and rhs
if not lhs_row_id or lhs_row_id.lower() in ('null', ''):
plpy.error("Sparse Linear Systems error: Invalid lhs_row_id column name!")
if not lhs_col_id or lhs_col_id.lower() in ('null', ''):
plpy.error("Sparse Linear Systems error: Invalid lhs_col_id column name!")
if not lhs_value or lhs_value.lower() in ('null', ''):
plpy.error("Sparse Linear Systems error: Invalid lhs_value column name!")
if not rhs_row_id or rhs_row_id.lower() in ('null', ''):
plpy.error("Sparse Linear Systems error: Invalid rhs_row_id column name!")
if not rhs_value or rhs_value.lower() in ('null', ''):
plpy.error("Sparse Linear Systems error: Invalid rhs_value column name!")
# Now check that the row_id, col_id etc are valid columns in teh table
if not columns_exist_in_table(lhs_table, [lhs_row_id]):
plpy.error("""
Sparse Linear Systems error: Column {lhs_row_id} does not exist in the table {lhs_table}
""".format(lhs_row_id = lhs_row_id,
lhs_table = lhs_table))
if not columns_exist_in_table(lhs_table, [lhs_col_id]):
plpy.error("""
Sparse Linear Systems error: Column {lhs_col_id} does not exist in the table {lhs_table}
""".format(lhs_col_id = lhs_col_id,
lhs_table = lhs_table))
if not columns_exist_in_table(lhs_table, [lhs_value]):
plpy.error("""
Sparse Linear Systems error: Column {lhs_value} does not exist in the table {lhs_table}
""".format(lhs_value= lhs_value,
lhs_table = lhs_table))
if not columns_exist_in_table(rhs_table, [rhs_row_id]):
plpy.error("""
Sparse Linear Systems error: Column {rhs_row_id} does not exist in the table {rhs_table}
""".format(rhs_row_id = rhs_row_id,
rhs_table = rhs_table))
if not columns_exist_in_table(rhs_table, [rhs_value]):
plpy.error("""
Sparse Linear Systems error: Column {rhs_value} does not exist in the table {rhs_table}
""".format(rhs_value= rhs_value,
rhs_table = rhs_table))
# Numvars must be positive
if num_vars <= 0:
plpy.error("Sparse Linear Systems error: Negative or null input for number of variables!")
# Check that the optimizer is supported
optimizer = optimizer.lower()
if optimizer not in SUPPORTED_SOLVERS:
plpy.error(""" Sparse Linear Systems error: Optimizer does not exist. Must be in ({supported_solvers}).
""".format(supported_solvers = ", ".join(SUPPORTED_SOLVERS)))
if grouping_cols:
if not columns_exist_in_table(lhs_table,
_string_to_array(grouping_cols), schema_madlib):
plpy.error("Sparse Linear Systems error: Grouping column does not exist!")
# ========================================================================
# Convert numeric array to SQL string
def _internal_py_array_to_sql_string(array):
"""
Convert a python list to an SQL ARRAY
@param array Input python list
@return output SQL array usable in a query
"""
return "ARRAY[%s]" % ','.join(map(str, array))
# Help function for sparse linear solvers
# ========================================================================
def linear_solver_sparse_help(schema_madlib, input_string = None, **kwargs):
"""
Given input string (eg. optimizer name) print out the related
information.
If a family name is given, print out the supported optimizer together
with its default optimizer. If an optimizer name is given, print out the
necessary parameters.
@param input_string Helper function notes depend on the inputs tring
"""
if input_string is None:
input_string = "help"
if (input_string.lower() == "usage" ):
return """
----------------------------------------------------------------
Usage
----------------------------------------------------------------
SELECT {schema_madlib}.linear_solver_sparse(
'lhs_tbl_source', -- Data table (A matrix)
'rhs_tbl_source', -- Data table (b vector)
'tbl_result', -- Result table
'lhs_row_id', -- Name of column containing row_id
'lhs_col_id', -- Name of column containing col_id
'lhs_value' , -- Name of column containing value
'rhs_row_id', -- Name of column containing row_id
'rhs_value' , -- Name of column containing value
num_vars , -- Number of variables
'grouping_cols', -- Grouping columns (Default: NULL)
'optimizer', -- Name of optimizer. Default: 'direct'
'optimizer_params',-- Text array of optimizer parameters
);
----------------------------------------------------------------
Output
----------------------------------------------------------------
The output table (tbl_result in the above) has the following columns:
solution DOUBLE PRECISION[], -- Solution
residual_norm DOUBLE PRECISION, -- Norm of the residual
iters INTEGER -- Iterations of the algorithm
----------------------------------------------------------------
Summary
----------------------------------------------------------------
The following solvers are supported:
(1) Direct Methods
-------------------
SELECT {schema_madlib}.linear_solve_sparse('direct');
to see more help.
(2) Iterative Methods
-------------------
SELECT {schema_madlib}.linear_solve_sparse('iterative');
to see more help.
""".format(schema_madlib = schema_madlib)
elif input_string.lower() == "direct":
return """
----------------------------------------------------------------
Direct methods for sparse linear systems (direct)
----------------------------------------------------------------
Direct methods are IN-MEMORY solutions for sparse linear systems.
The methods however will not work if the input matrices are larger
than 1GB.
There are several algorithms that can be classified as 'direct' methods
of solving linear systems. Madlib functions provide various algorithmic
options available for users.
Optional Parameters --------------------------------
algorithm - DEFAULT is ldlt
The following table provides a guideline on the choice
of algorithm based on conditions on the A matrix,
speed of the algorithms and memory usage.
Algorithm | Contitions on A | Speed | Memory
----------------------------------------------------------
llt | Sym. Pos Def | ++ | ---
ldlt | Sym. Pos Def | ++ | ---
For memory, '-' uses less memory than '--'
For speed, '++' is faster than '+'
"""
elif input_string.lower() == "iterative":
return """
----------------------------------------------------------------
Iterative methods for sparse linear systems (iterative)
----------------------------------------------------------------
Iterative methods are iterative solutions for sparse linear systems.
The methods however will not work if the input matrices are larger
than 1GB.
There are several algorithms that can be classified as 'iterative' methods
of solving linear systems. Madlib functions provide various algorithmic
options available for users.
Optional Parameters --------------------------------
algorithm - DEFAULT is cg-mem
The following table provides a guideline on the choice
of algorithm based on conditions on the A matrix,
speed of the algorithms and memory usage.
Algorithm | Contitions on A | Speed | Memory
----------------------------------------------------------
cg-mem | Sym. Pos Def | +++ | -
bicgstab-mem | Square | ++ | -
precond-cg-mem | Sym. Pos Def | ++ | -
precond-bicgstab-mem | Square | + | -
For memory, '-' uses less memory than '--'
For speed, '++' is faster than '+'
"""
else:
return """
----------------------------------------------------------------
Summary
----------------------------------------------------------------
The following solvers are supported:
(1) Direct Solve
--
Run:
SELECT {schema_madlib}.linear_solve_sparse('direct');
to see more help.
(2) Iterative Solve
--
Run:
SELECT {schema_madlib}.linear_solve_sparse('iterative');
to see more help.
--
Run: SELECT {schema_madlib}.linear_solve_sparse('usage');
to see how to use.
--
""".format(schema_madlib = schema_madlib)
# Load default options
# ========================================================================
def _load_default_options(optimizer):
"""
@brief Function to return a dictionary with default options for the solver
@param optimzer Optimizer to be used
@return dictionary
"""
OPTIONS_DICT = {}
# Load default options for all solvers here
if optimizer == 'direct':
OPTIONS_DICT['algorithm'] = 'ldlt'
elif optimizer == 'iterative':
OPTIONS_DICT['algorithm'] = 'cg-mem'
OPTIONS_DICT['toler'] = 1e-5
OPTIONS_DICT['maxit'] = 1e5
return OPTIONS_DICT
# ========================================================================
# Convert numeric array to SQL string
def _internal_py_array_to_sql_string(array):
return "ARRAY[%s]" % ','.join(map(str, array))
# Main function for sparse linear solvers
# ========================================================================
def linear_solver_sparse(schema_madlib,
lhs_table,
rhs_table,
out_table,
lhs_row_id,
lhs_col_id,
lhs_value,
rhs_row_id,
rhs_value,
num_vars,
grouping_cols,
optimizer,
optimizer_options,
**kwargs):
"""
@brief Main function for sparse linear systems
@param lhs_table String identifying the input A matrix
@param rhs_table String identifying the input b vector
@param out_table String identifying the output table to be created
@param lhs_row_id Column name containing the LHS of the equations
@param lhs_col_id Column name containing the LHS of the equations
@param lhs_value Column name containing the LHS of the equations
@param rhs_row_id Column name containing the RHS of the equations
@param rhs_value Column name containing the RHS of the equations
@param num_vars Number of variables in the system
@param grouping_sols Columns to group the linear systems by
@param optimzer Optimizer to be used
@param optimzer_options Optimizer options used
@return void
"""
# Reset the message level to avoid random messages
# -------------------------------------------------------------------------
old_msg_level = plpy.execute("""
SELECT setting
FROM pg_settings
WHERE name='client_min_messages'
""")[0]['setting']
plpy.execute('SET client_min_messages TO warning')
# Validate Inputs
# -------------------------------------------------------------------------
if optimizer is None:
optimizer = "direct"
if num_vars is None:
num_vars = -1
_validate_args(schema_madlib,
lhs_table,
rhs_table,
out_table,
lhs_row_id,
lhs_col_id,
lhs_value,
rhs_row_id,
rhs_value,
num_vars,
grouping_cols,
optimizer,
optimizer_options)
# Parse optional inputs
# -------------------------------------------------------------------------
optimizer = optimizer.lower()
OPTIONS_DICT = _load_default_options(optimizer)
if optimizer_options is not None:
optimizer_options = optimizer_options.split(',')
for opt in optimizer_options:
try:
[param, value] = opt.split("=")
OPTIONS_DICT[param.strip().lower()] = value.strip().lower()
except:
plpy.error("""Sparse linear systems error: Optimizer options must be of the form 'param1 = value1, param2 = value2'""")
# Sparse Linear System Solve
# -------------------------------------------------------------------------
# Step 1: Calculate the number of equations in the system
sql_output = plpy.execute("""
SELECT max({lhs_row_id}) AS numequations,
max({lhs_col_id}) AS max_col_id,
count({lhs_row_id}) AS nnz
FROM {lhs_table}
""".format( lhs_row_id = lhs_row_id,
lhs_col_id = lhs_col_id,
lhs_table = lhs_table))
numEquations = int(sql_output[0]['numequations']) + 1
nnz = int(sql_output[0]['nnz'])
max_col_id = int(sql_output[0]['max_col_id'])
# Check that the columns are consistent with num_vars
if num_vars <= max_col_id:
plpy.error("""
Sparse linear systems error: The num_vars input is {num_vars} but the largest entry in column {lhs_col_id} of table {lhs_table} is {max_col_id}.
""".format( num_vars = num_vars,
lhs_table = lhs_table,
lhs_col_id = lhs_col_id,
max_col_id = max_col_id))
# Step 2: Chose the right solver
if optimizer == 'direct':
# Call the SQL function to evaluate the direct linear system
sparse_solution = _direct_sparse_linear_system_solve(
schema_madlib,
lhs_table,
rhs_table,
lhs_row_id,
lhs_col_id,
lhs_value,
rhs_row_id,
rhs_value,
numEquations,
num_vars,
nnz,
OPTIONS_DICT)
elif optimizer == 'iterative':
# Call the SQL function to evaluate the iterative linear system
# Warning: Some iterative solvers are in-memory while others are on disk
sparse_solution = _iterative_sparse_linear_system_solve(
schema_madlib,
lhs_table,
rhs_table,
lhs_row_id,
lhs_col_id,
lhs_value,
rhs_row_id,
rhs_value,
numEquations,
num_vars,
nnz,
OPTIONS_DICT)
# Step 3: Insert the solution stats into table
plpy.execute("""
CREATE TABLE {out_table} (
solution DOUBLE PRECISION[],
residual_norm DOUBLE PRECISION,
iters INTEGER)
""".format(
out_table = out_table))
solution = sparse_solution["solution"]
residual_norm = sparse_solution["residual_norm"]
iters = sparse_solution["iters"]
# Check for NULL (Convert to None)
if iters is None:
iters = "NULL"
if residual_norm is None:
residual_norm = "NULL"
insert_string = """
INSERT INTO {out_table} VALUES ({solution}, {residual_norm}, {iters});
""".format(out_table = out_table,
solution = _internal_py_array_to_sql_string(solution),
residual_norm = residual_norm,
iters = iters)
# Step 4: Clean up output to make sure infinity and nan are cast properly
insert_string = re.sub('Infinity|inf', "'Infinity'::double precision", insert_string)
insert_string = re.sub('Nan|nan', "'Nan'::double precision", insert_string)
plpy.execute(insert_string)
# Reset the message level
plpy.execute("SET client_min_messages TO %s" % old_msg_level)
# ========================================================================
# Iterative sparse linear systems
def _iterative_sparse_linear_system_solve(schema_madlib,
lhs_table,
rhs_table,
lhs_row_id,
lhs_col_id,
lhs_value,
rhs_row_id,
rhs_value,
numEquations,
numVars,
nnz,
OPTIONS_DICT):
"""
@brief Run SQL for sparse direct linear systems
@param schema_madlib Schema for Madlib
@param lhs_table String identifying the input table
@param row_id Column name with row_id
@param left_hand_side Column name containing the LHS of the equations
@param right_hand_side Column name containing the RHS of the equations
@param numEquations Number of equations in the linear system
@param OPTIONS_DICT Dictionary with options
@return output_solution Dictionary pointing to SQL objects with the solution
"""
# Check that the options provided in the input are supported
if OPTIONS_DICT['algorithm'] not in ITERATIVE_ALG_DICT.keys():
plpy.error("""Iterative method supports only algorithms in ({alg_list})
""".format(alg_list = ','.join(ITERATIVE_ALG_DICT.keys())))
# Convert the algorithm string to the option (integer)
algorithm = ITERATIVE_ALG_DICT[OPTIONS_DICT['algorithm']]
termToler = float(OPTIONS_DICT['toler'])
maxIter = int(OPTIONS_DICT['maxit'])
# In memory iterative solvers
if OPTIONS_DICT['algorithm'] in ITERATIVE_IN_MEM_ALG_DICT:
# Run the SQL for sparse direct linear systems
sparse_solution = plpy.execute("""
SELECT (output).*
FROM (
SELECT {schema_madlib}.sparse_inmem_iterative_linear_system(
({lhs_table}.{lhs_row_id})::INTEGER,
({lhs_table}.{lhs_col_id})::INTEGER,
{lhs_table}.{lhs_value},
coalesce({rhs_table}.{rhs_value}, 0),
{numEquations},
{numVars},
{nnz},
{algorithm},
{maxIter},
{termToler}
) AS output
FROM {lhs_table} LEFT JOIN {rhs_table} on
({lhs_table}.{lhs_row_id} = {rhs_table}.{rhs_row_id}) )q
""".format(
schema_madlib = schema_madlib,
lhs_table = lhs_table,
rhs_table = rhs_table,
lhs_row_id = lhs_row_id,
lhs_col_id = lhs_col_id,
lhs_value = lhs_value,
rhs_value = rhs_value,
rhs_row_id = rhs_row_id,
numEquations = numEquations,
numVars = numVars,
nnz = nnz,
algorithm = algorithm,
maxIter = maxIter,
termToler = termToler
))
output_solution = {}
output_solution["solution"] = string_to_array(sparse_solution[0]["solution"])
output_solution["iters"] = sparse_solution[0]["iters"]
output_solution["residual_norm"] = sparse_solution[0]["residual_norm"]
return output_solution
# ========================================================================
# Direct sparse linear systems
def _direct_sparse_linear_system_solve(schema_madlib,
lhs_table,
rhs_table,
lhs_row_id,
lhs_col_id,
lhs_value,
rhs_row_id,
rhs_value,
numEquations,
numVars,
nnz,
OPTIONS_DICT):
"""
@brief Run SQL for sparse direct linear systems
@param schema_madlib Schema for Madlib
@param lhs_table String identifying the input table
@param row_id Column name with row_id
@param left_hand_side Column name containing the LHS of the equations
@param right_hand_side Column name containing the RHS of the equations
@param numEquations Number of equations in the linear system
@param OPTIONS_DICT Dictionary with options
@return output_solution Dictionary pointing to SQL objects with the solution
"""
# Check that the options provided in the input are supported
if OPTIONS_DICT['algorithm'] not in DIRECT_ALG_DICT.keys():
plpy.error("""Direct method supports only algorithms in ({alg_list})
""".format(alg_list = ','.join(DIRECT_ALG_DICT.keys())))
# Convert the algorithm string to the option (integer)
algorithm = DIRECT_ALG_DICT[OPTIONS_DICT['algorithm']]
# Run the SQL for sparse direct linear systems
sparse_solution = plpy.execute("""
SELECT (output).*
FROM (
SELECT {schema_madlib}.sparse_direct_linear_system(
({lhs_table}.{lhs_row_id})::INTEGER,
({lhs_table}.{lhs_col_id})::INTEGER,
{lhs_table}.{lhs_value},
coalesce({rhs_table}.{rhs_value}, 0),
{numEquations},
{numVars},
{nnz},
{algorithm}
) AS output
FROM {lhs_table} LEFT JOIN {rhs_table} on
({lhs_table}.{lhs_row_id} = {rhs_table}.{rhs_row_id}) )q
""".format(
schema_madlib = schema_madlib,
lhs_table = lhs_table,
rhs_table = rhs_table,
lhs_row_id = lhs_row_id,
lhs_col_id = lhs_col_id,
lhs_value = lhs_value,
rhs_value = rhs_value,
rhs_row_id = rhs_row_id,
numEquations = numEquations,
numVars = numVars,
nnz = nnz,
algorithm = algorithm
))
output_solution = {}
output_solution["solution"] = string_to_array(sparse_solution[0]["solution"])
output_solution["iters"] = None
output_solution["residual_norm"] = sparse_solution[0]["residual_norm"]
return output_solution