blob: 74cce1b7db4c432004bb664bc12c2856ea6ebb66 [file] [log] [blame]
#!/usr/bin/python
import sys
# import psycopg2
from madpy.ext import plpy
# helper routine to munge the agg arguments
# replaces the string "%%" with the column argument
# and adds the suffix "_distinct" to the AS name of distinct aggs
def __munge_agg(x):
if x.split("(")[0] == x:
x += '(%%)'
fname = x.split("(")[0].split(".")[-1]
x += " AS " + "%%_" + fname
if x.lower().find("distinct") >= 0:
x += "_distinct"
return x
# helper routine to query the database catalog and get column names for
# table "table".
# return value is a list of lists; the first list is the
# integer columns, the second list is the non-integer columns
def catalog_columns(conn, table, skipcols):
# Open a cursor to perform database operations
cur = conn.cursor()
# Fetch integer columnnames from table
cur.execute("""select column_name from information_schema.columns where
table_name = %s and numeric_scale = 0 order by
ordinal_position;""",
(str(table),))
out = cur.fetchall()
numcols = [c[0] for c in out if c[0] not in skipcols]
cur.close()
# Fetch non-integer columnnames from table
cur = conn.cursor()
cur.execute("""select column_name from information_schema.columns where
table_name = %s and (numeric_scale is null or
numeric_scale > 0) order by ordinal_position;""",
(table,))
out = cur.fetchall()
non_numcols = [c[0] for c in out if c[0] not in skipcols]
# Make the changes to the database persistent
# conn.commit()
# Close communication with the database
cur.close()
return([numcols, non_numcols])
# Generates an SQL query for data profiling. Arguments:
# tablename: name of the table/view being profiled
# numaggs: a list of aggregation functions to run on numeric columns
# non_numaggs, a list of aggregation functions to run on all columns
# numcols: a list of numeric column expressions
# non_numcols: a list of non_numeric columns expressions
# The lists of aggs can include forms like "SUM", "SUM(%%)" or even
# "SUM(DISTINCT %%)", where the %% string is replaced by the corresponding
# column expression
def gen_profile_query(tablename, numaggs, non_numaggs, numcols, non_numcols):
retval = "SELECT COUNT(*)\n, "
nums = ", \n".join([", ".join([__munge_agg(a).replace('%%', c)
for a in numaggs+non_numaggs])
for c in numcols])
retval += nums
if nums != None:
retval += ", \n"
retval += ", \n".join([", ".join([__munge_agg(a).replace('%%', c)
for a in non_numaggs])
for c in non_numcols])
retval += "\n FROM "
retval += tablename + ";"
retval = retval.replace( "\n, \n", "\n")
return retval