blob: f690d9472e892017e9f818b68c691b03fd689277 [file] [log] [blame]
/* ----------------------------------------------------------------------- *//**
*
* @file profile.sql_in
*
* @brief SQL function for single-pass table profiles
* @date January 2011
*
* @sa For a brief introduction to "profiles", see the module
* description grp_profile. Cf. also the module grp_sketches.
*
*//* ----------------------------------------------------------------------- */
/**
@addtogroup grp_profile
@about
This module computes a "profile" of a table or view: a configurable set of aggregates to be run on each column of a table.
Because the input schema of the table or view is unknown, we need to synthesize SQL to suit. This is done either via the UDF <c>profile_run</c>, or from the command line via the script <c>profile.py</c>.
@prereq
Requires MADlib sketch module, which provides single-pass approximations of various descriptive statistics.
The -k flag to the command line generates histograms, represented as PostgreSQL arrays. The only python DB API library we've used that works with arrays is pygresql. See Bugs below.
@usage
1) As a command-line script:
Running <c>python profile.py -h</c> gives usage information:
\code
Usage: profile.py [options] tablename
(note: database connection info taken from Config.yml, see -c flag)
Options:
-h, --help show this help message and exit
-n NUMERICAGGS, --numeric=NUMERICAGGS
array of aggs for integer columns
-t NON_NUMERICAGGS, --nonnumeric=NON_NUMERICAGGS
array of aggs for non-numeric columns
-k, --kitchensink compute longer list of statistics
-s SKIPCOLS, --skip=SKIPCOLS
array of columns to skip
-c CONFIGDIR, --configdir=CONFIGDIR
directory holding Config.yml (default madpack SCRIPTDIR if
already initialized, else madpy directory)
\endcode
2) As a database stored procedure:
Function: <tt>profile_run( '<em>input_table</em>')</tt>
Parameters:
- <em>input_table</em> : fully qualified table name to analyze
Example:
\code
select madlib.profile_run( 'pg_tables');
\endcode
\bug
- Known problem with psycopg2:
The python <c>psycopg2</c> DB API library seems unhappy with array
outputs, like those returned by the histogram UDAs invoked by
<c>profile.py -k</c>. This is a known bug that was supposedly closed in 2005
(http://osdir.com/ml/python.db.psycopg.devel/2005-12/msg00032.html) but still
seems to be present as of version 2.3.2. The simple workaround is to use
<c>pygresql</c> instead of <c>psycopg2</c>.
@sa file profile.sql_in (documenting the SQL function), module grp_sketches
*/
/**
* @brief Compute a "profile" of a table or view
*
* @param input_table table name to analyze
* @return Textual summary of the algorithm run
*/
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.profile_run( input_table text)
RETURNS text
AS $$
import sys
try:
from madlib import profile
except:
sys.path.append("PLPYTHON_LIBDIR")
from madlib import profile
plpy.execute( 'set client_min_messages=warning');
return profile.profile_run( input_table);
$$ LANGUAGE plpythonu;