src/ports/postgres/modules/assoc_rules/assoc_rules.py_in - madlib - Git at Google

 # coding=utf-8

 """
 @file assoc_rules.py_in

 @brief Association Rules - Apriori Algorithm Implementation.

 @namespace assoc_rules
 """
 import time
 import plpy
 from utilities.validate_args import columns_exist_in_table
 from utilities.validate_args import table_exists
 from utilities.validate_args import output_tbl_valid
 from utilities.utilities import unique_string
 from utilities.control import MinWarning


 """
 @brief if the given condition is false, then raise an error with the message
 @param condition    the condition to be assert
 @param msg          the error message to be reported
 """
 def __assert(condition, msg):
     if not condition :
         plpy.error(msg);

 """
 @brief test if the 1st value is greater than the 2nd value
 @param val1    a float number
 @param val2    a float number
 @return true if the 1st value greater than the 2nd value.
         Otherwise return false
 """
 def __float_gt(val1, val2):
     return val1 - val2 > 1E-14;


 """
 @brief test if the 1st value is less than or equal to the 2nd value
 @param val1    a float number
 @param val2    a float number
 @return true if the 1st value is less than or equal to the 2nd value.
         Otherwise return false
 """
 def __float_le(val1, val2):
     return (val1 - val2) < -1E-14 or abs(val1 - val2) < 1E-14;


 """
 @brief The entry function for the Apriori.
 @param support         minimum level of support needed for each itemset
                        to be included in result
 @param confidence      minimum level of confidence needed for each rule
                        to be included in result
 @param tid_col         name of the column storing the transaction ids
 @param item_col        name of the column storing the products
 @param input_table     name of the table where the data is stored
 @param output_schema   name of the schema where the final results will be stored
 @param verbose         determining if output contains comments
 @param max_itemset_size determines the maximum size of frequent itemsets allowed
                         to generate association rules from
 @param max_lhs_size    determines the maximum size of the lhs of the rule
 @param max_rhs_size    determines the maximum size of the rhs of the rule
 """

 @MinWarning("warning")
 def assoc_rules(madlib_schema, support, confidence, tid_col,
                 item_col, input_table, output_schema, verbose,
                 max_itemset_size, max_lhs_size, max_rhs_size):

     begin_func_exec = time.time();
     begin_step_exec = time.time();
     cal_itemsets_time = 0;
     if max_itemset_size is None:
         max_itemset_size = 10
     elif max_itemset_size <= 1:
         plpy.error("ERROR: max_itemset_size has to be greater than 1.")

     #Validate LHS RHS
     __assert(max_lhs_size is None or max_lhs_size > 0,
              "max_lhs_size param must be a positive number.")
     __assert(max_rhs_size is None or max_rhs_size > 0,
              "max_rhs_size param must be a positive number.")
     #check parameters
     __assert(
             support is not None and
             confidence is not None and
             tid_col is not None and
             item_col is not None and
             input_table is not None and
             verbose is not None,
             "none of the parameters except the output_schema should be null"
             );

     __assert(
             __float_gt(support, 0) and __float_le(support, 1),
             "the parameter support must be in range of (0, 1]"
             );

     __assert(
             __float_gt(confidence, 0) and __float_le(confidence, 1),
             "the parameter confidence must be in range of (0, 1]"
             );

     __assert(len(input_table) > 0 and table_exists(input_table, madlib_schema) and
              len(tid_col) > 0 and len(item_col) > 0 and
              columns_exist_in_table(input_table, [tid_col, item_col], madlib_schema),
             "the input table must exist and the tid_col and "
             "item_col must be in the input table")

     if output_schema is None :
         rv = plpy.execute("SELECT current_schema() as s");
         output_schema = rv[0]["s"];
     else :
         rv = plpy.execute("""
             SELECT count(*) as cnt
             FROM pg_namespace
             where quote_ident(nspname) = '{0}'
             """.format(output_schema)
             );
         __assert(rv[0]["cnt"] == 1, "the output schema does not exist");

     if verbose:
         plpy.info("finished checking parameters");

     # create the result table for keeping the generated rules
     output_tbl_valid("{0}.assoc_rules".format(output_schema), 'assoc rules')
     plpy.execute("""
         CREATE TABLE {0}.assoc_rules
             (
             ruleId      INT,
             pre         TEXT[],
             post        TEXT[],
             count       INT,
             support     FLOAT8,
             confidence  FLOAT8,
             lift        FLOAT8,
             conviction  FLOAT8
             )
         m4_ifdef(`__POSTGRESQL__', `', `DISTRIBUTED BY (ruleId)')""".format(output_schema)
         );

     # the auxiliary table for keeping the association rules
     assoc_rules_aux_tmp = unique_string('assoc_rules_aux_tmp')
     plpy.execute("""
         CREATE TEMP TABLE {0}
             (
             ruleId      SERIAL,
             pre         TEXT,
             post        TEXT,
             support     FLOAT8,
             confidence  FLOAT8,
             lift        FLOAT8,
             conviction  FLOAT8
             )
         m4_ifdef(`__POSTGRESQL__', `', `DISTRIBUTED BY (ruleId)')
         """.format(assoc_rules_aux_tmp));

     # if the tid in the input table doesn't start with 1 and the IDs are not
     # continuous, then we will make the IDs start with 1 and continuous.
     # note: duplicated records will be removed.
     rv = plpy.execute("""
         SELECT max({0}) as x, min({1}) as i, count({2}) as c
         FROM {3}""".format(tid_col, tid_col, tid_col, input_table));
     if rv[0]["i"] != 1 or rv[0]["x"] - rv[0]["i"] != rv[0]["c"] :
         assoc_input_unique = unique_string('assoc_input_unique')
         plpy.execute("""
              CREATE TEMP TABLE {5} AS
              SELECT
                 dense_rank() OVER (ORDER BY tid)::BIGINT as tid,
                 item
              FROM (
                 SELECT {0} as tid,
                        {1}::text as item
                 FROM {2}
                 WHERE {3} IS NOT NULL AND
                       {4}  IS NOT NULL
                 GROUP BY 1,2
              ) t
              m4_ifdef(`__POSTGRESQL__', `', `DISTRIBUTED BY (tid)')
              """.format(tid_col, item_col, input_table, tid_col, item_col,
                         assoc_input_unique)
              );

     if verbose :
         plpy.info("finished removing duplicates");

     rv = plpy.execute("""
         SELECT count(DISTINCT tid) as c1, count(DISTINCT item) as c2
         FROM {assoc_input_unique}
         """.format(**locals()));
     num_tranx = rv[0]["c1"];
     num_prod = rv[0]["c2"];
     min_supp_tranx = float(num_tranx) * support;
     # Set default values to the max possible number that an itemset can
     # have on its LHS or RHS for a given input dataset.
     max_lhs_size = num_prod if max_lhs_size is None else max_lhs_size
     max_rhs_size = num_prod if max_rhs_size is None else max_rhs_size

     # get the items whose counts are greater than the given
     # support counts. Each item will be given a continuous number.
     assoc_item_uniq = unique_string('assoc_item_uniq')
     plpy.execute("""
         CREATE TEMP TABLE {assoc_item_uniq} (item_id, item_text, cnt) AS
         SELECT row_number() OVER (ORDER BY cnt DESC)::BIGINT,
            item::TEXT,
            cnt::FLOAT8
         FROM (
            SELECT item AS item, count(tid) as cnt
            FROM {assoc_input_unique}
            GROUP BY item
         ) t
         WHERE cnt::FLOAT8 >= {min_supp_tranx}
         m4_ifdef(`__POSTGRESQL__', `', `DISTRIBUTED BY (item_id)')
         """.format(**locals()));

     if verbose :
         plpy.info("finished encoding items");

     rv = plpy.execute("SELECT count(item_id) as c FROM {assoc_item_uniq}".format(**locals()));
     num_supp_prod = rv[0]["c"];

     assoc_enc_input = unique_string('assoc_enc_input')
     plpy.execute("""
          CREATE TEMP TABLE {assoc_enc_input} (tid, item, cnt) AS
          SELECT t1.tid, t2.item_id, t2.cnt
          FROM {assoc_input_unique} t1, {assoc_item_uniq} t2
          WHERE t1.item = t2.item_text
          m4_ifdef(`__POSTGRESQL__', `', `DISTRIBUTED BY (tid)')
          """.format(**locals()));

     if verbose :
         plpy.info("finished encoding input table: {0}".format(
                 time.time() - begin_step_exec));
         plpy.info("Beginning iteration #1");

     begin_step_exec = time.time();

     # this table keeps the patterns in each iteration.
     assoc_rule_sets = unique_string('assoc_rule_sets')
     plpy.execute("""
         CREATE TEMP TABLE {assoc_rule_sets}
             (
             text_svec   TEXT,
             set_list    {madlib_schema}.svec,
             support     FLOAT8,
             iteration   INT
             )
         m4_ifdef(`__POSTGRESQL__', `', `DISTRIBUTED BY (text_svec)')
         """.format(**locals()));

     # this table keeps the patterns in the current iteration.
     assoc_rule_sets_loop = unique_string('assoc_rule_sets_loop')
     plpy.execute("""
         CREATE TEMP TABLE {assoc_rule_sets_loop}
             (
             id          BIGINT,
             set_list    {madlib_schema}.svec,
             support     FLOAT8,
             tids        {madlib_schema}.svec
             )
         m4_ifdef(`__POSTGRESQL__', `', `DISTRIBUTED BY (id)')
         """.format(**locals()));

     # this table adds a new column to order the table by its setlist
     assoc_rule_sets_loop_ordered = unique_string('assoc_rule_sets_loop_ordered')
     plpy.execute("""
         CREATE TEMP TABLE {assoc_rule_sets_loop_ordered}
             (
             id          BIGINT,
             set_list    {madlib_schema}.svec,
             support     FLOAT8,
             tids        {madlib_schema}.svec,
             newrownum   BIGINT
             )
         m4_ifdef(`__POSTGRESQL__', `', `DISTRIBUTED BY (newrownum)')
         """.format(**locals()));

     plpy.execute("""
          INSERT INTO {assoc_rule_sets_loop} (id, set_list, support, tids)
          SELECT
              t.item,
             {madlib_schema}.svec_cast_positions_float8arr
               (ARRAY[t.item], ARRAY[1], {num_supp_prod}, 0),
             t.cnt::FLOAT8 / {num_tranx},
             {madlib_schema}.svec_cast_positions_float8arr
                 (array_agg(t.tid), array_agg(1), {num_tranx}, 0)
          FROM {assoc_enc_input} t
          GROUP BY t.item, t.cnt
          """.format(**locals()));

     plpy.execute("""
             INSERT INTO {assoc_rule_sets_loop_ordered}(id, set_list, support,
                                                      tids, newrownum)
             SELECT *, row_number() over (order by set_list) as newrownum
             FROM {assoc_rule_sets_loop}
             """.format(**locals()));
     rv = plpy.execute("""
         SELECT count(id) as c1, max(id) as c2
         FROM {assoc_rule_sets_loop}
         """.format(**locals()));
     num_item_loop = rv[0]["c1"];
     max_item_loop = rv[0]["c2"];

     __assert(
             (num_item_loop == max_item_loop) or
             (num_item_loop == 0 and max_item_loop is None),
             "internal error: num_item_loop must be equal to max_item_loop"
         );

     # As two different svecs may have the same hash key,
     # we use this table to assign a unique ID for each svec.
     assoc_loop_aux = unique_string('al_aux')
     plpy.execute("""
          CREATE TEMP TABLE {assoc_loop_aux}
             (
             id              BIGSERIAL,
             set_list        {madlib_schema}.svec,
             support         FLOAT8,
             tids            {madlib_schema}.svec
             )
          m4_ifdef(`__POSTGRESQL__', `', `DISTRIBUTED BY (id)')
          """.format(**locals()));

     if verbose  :
         plpy.info("{0} Frequent itemsets found in this iteration".format(
             num_supp_prod));
         plpy.info("Completed iteration # 1. Time: {0}".format(
             time.time() - begin_step_exec));

     curr_iter = 0;
     num_products_threshold = num_supp_prod
     while num_item_loop > 0 and curr_iter < max_itemset_size:
         begin_step_exec = time.time();
         curr_iter = curr_iter + 1;

         if verbose  :
             plpy.info("Beginning iteration # {0}".format(curr_iter + 1));

         plpy.execute("""
              INSERT INTO {assoc_rule_sets}
                 (text_svec, set_list, support, iteration)
              SELECT array_to_string(
                     {madlib_schema}.svec_nonbase_positions(set_list, 0), ','),
                     set_list,
                     support, {curr_iter}
              FROM {assoc_rule_sets_loop}""".format(**locals()));

         if verbose  :
             plpy.info("time of preparing data: {0}".format(
                  time.time() - begin_step_exec));

         # generate the patterns for the next iteration

         # The vector operations are used to calculate the boolean OR operation
         # efficiently.
         # Example:
         # a1 = [1,1,0,0,0,1]            a2 = [1,1,0,0,1,1]
         # svec_a1 = {2,3,1}:{1,0,1}     svec_a2 = {2,2,2}:{1,0,1}
         # x = svec_a1 + svec_a2 = [2,2,0,0,1,2] -> {2,2,1,1}:{2,0,1,2}
         # y = svec_a1 * svec_a2 = [1,1,0,0,0,1] -> {2,3,1}:{1,0,1}
         # x - y = [1,1,0,0,1,1] -> {2,2,2}:{1,0,1}

         # The boolean OR operation is used to combine two itemsets to create a larger one.
         # (a,b,c) combined with (a,b,d) will give (a,b,c,d)
         # (a,b,c) combined with (c,d,e) will give (a,b,c,d,e)

         # The {assoc_rule_sets_loop_ordered} table is ordered by the setlist.
         # This ensures that the sets (a,b,c) and (a,b,d) are close to each other.
         # t3.newrownum-t1.newrownum <= {num_products_threshold} check ensures that we are
         # looking at a subset of pairs of itemsets instead of all pairs.

         # Let's assume we are trying to get to (a,b,c,d)
         # If the support for (a,b,c,d) is above the threshold; (a,b,c), (a,b,d),
         # (b,c,d) should all be already in our list.
         # In this case it doesn't matter if we skip trying (a,b,c) and (b,c,d)
         # combination because the result is already covered by (a,b,c) + (a,c,d).

         # The iterp1 check ensures that the new itemset is of a certain size.
         # At every iteration, we increase the target itemset size by one.

         plpy.execute("ALTER SEQUENCE {assoc_loop_aux}_id_seq RESTART WITH 1".format(**locals()));
         plpy.execute("TRUNCATE TABLE {assoc_loop_aux}".format(**locals()));
         plpy.execute("""
            INSERT INTO {assoc_loop_aux}(set_list, support, tids)
            SELECT DISTINCT ON({madlib_schema}.svec_to_string(set_list)) set_list,
                    {madlib_schema}.svec_l1norm(tids)::FLOAT8 / {num_tranx},
                    tids
            FROM (
              SELECT
                 {madlib_schema}.svec_minus(
                     {madlib_schema}.svec_plus(t1.set_list, t3.set_list),
                     {madlib_schema}.svec_mult(t1.set_list, t3.set_list)
                 ) as set_list,
                 {madlib_schema}.svec_mult(t1.tids, t3.tids) as tids
              FROM {assoc_rule_sets_loop_ordered} t1,
                   {assoc_rule_sets_loop_ordered} t3
              WHERE t1.newrownum < t3.newrownum AND
                    t3.newrownum-t1.newrownum <= {num_products_threshold}
            ) t
            WHERE {madlib_schema}.svec_l1norm(set_list)::INT = {iterp1} AND
                  {madlib_schema}.svec_l1norm(tids)::FLOAT8 >= {min_supp_tranx}
            """.format(iterp1=curr_iter + 1, **locals())
            );

         plpy.execute("TRUNCATE TABLE {assoc_rule_sets_loop}".format(**locals()));
         plpy.execute("""
             INSERT INTO {assoc_rule_sets_loop}(id, set_list, support, tids)
             SELECT * FROM {assoc_loop_aux}
             """.format(**locals()));
         plpy.execute("TRUNCATE TABLE {assoc_rule_sets_loop_ordered}".format(**locals()));
         plpy.execute("""
             INSERT INTO {assoc_rule_sets_loop_ordered}(id, set_list, support, tids, newrownum)
             SELECT *, row_number() over (order by set_list) as newrownum
             FROM {assoc_rule_sets_loop}
             """.format(**locals()));

         rv = plpy.execute("""
             SELECT count(id) as c1, max(id) as c2
             FROM {assoc_rule_sets_loop}
             """.format(**locals()));
         num_item_loop = rv[0]["c1"];
         max_item_loop = rv[0]["c2"];
         __assert(
             (num_item_loop == max_item_loop) or
             (num_item_loop == 0 and max_item_loop is None),
             "internal error: num_item_loop must be equal to max_item_loop"
             );
         # num_products_threshold should be equal to the number of distinct
         # products that are present in the previous iteration's frequent
         # itemsets.
         # We can ideally do away with the if condition, but it's a trade-off:
         # The query in the if condition might be a considerable overhead when
         # the original number of products is very low compared to the number
         # of rows in {assoc_rule_sets_loop} in a given iteration. On the other
         # hand, it can be a considerable improvement if we have a lot of
         # distinct products and only a small number of them are present in
         # frequent itemsets. But this is specific to datasets and parameters
         # (such as support), so the following if statment is a compromise.
         if num_item_loop < num_supp_prod:
             # Get number of 1's from all set_lists in {assoc_rule_sets_loop}
             num_products_threshold = plpy.execute("""
                 SELECT {madlib_schema}.svec_l1norm(
                     {madlib_schema}.svec_count_nonzero(a)) AS cnt
                 FROM (
                     SELECT {madlib_schema}.svec_sum(set_list) AS a
                     FROM {assoc_rule_sets_loop}
                     ) t
                 """.format(**locals()))[0]['cnt']

         if verbose :
             plpy.info("{0} Frequent itemsets found in this iteration".format(
                 num_item_loop));
             plpy.info("Completed iteration # {0}. Time: {1}".format(
                     curr_iter + 1, time.time() - begin_step_exec));

         cal_itemsets_time = cal_itemsets_time + (time.time() - begin_step_exec);

     if (num_item_loop == 0) and (curr_iter < 2) and verbose :
         plpy.info("No association rules found that meet given criteria");
         plpy.info("finished itemsets finding. Time: {0}".format(
             cal_itemsets_time));
         total_rules = 0;
     else :
         if (verbose) :
             plpy.info("begin to generate the final rules");

         begin_step_exec = time.time();
         # generate all the final rules
         plpy.execute("""
              INSERT INTO {assoc_rules_aux_tmp}
                 (pre, post, support, confidence, lift, conviction)
              SELECT
                 t.item[1],
                 t.item[2],
                 t.support_xy,
                 t.support_xy / x.support,
                 t.support_xy / (x.support * y.support),
                 CASE WHEN abs(t.support_xy / x.support - 1) < 1.0E-10 THEN
                     0
                 ELSE
                     (1 - y.support) / (1 - (t.support_xy / x.support))
                 END
             FROM (
                 SELECT
                     {madlib_schema}.gen_rules_from_cfp(text_svec,
                                                        iteration,
                                                        {max_lhs_size},
                                                        {max_rhs_size}) as item,
                     support as support_xy
                 FROM {assoc_rule_sets}
                 WHERE iteration > 1
             ) t, {assoc_rule_sets} x, {assoc_rule_sets} y
             WHERE t.item[1] = x.text_svec AND
                   t.item[2] = y.text_svec AND
                   (t.support_xy / x.support) >= {confidence}
             """.format(**locals()));

         # generate the readable rules
         pre_tmp_table = unique_string('pre_tmp_table')
         plpy.execute("""
              CREATE TEMP TABLE {pre_tmp_table} AS
              SELECT ruleId, array_agg(item_text) as pre
              FROM
                 (
                     SELECT
                         ruleId,
                         unnest(string_to_array(pre, ','))::BIGINT as pre_id
                     FROM {assoc_rules_aux_tmp}
                 ) s1, {assoc_item_uniq} s2
              WHERE s1.pre_id = s2.item_id
              GROUP BY ruleId
              m4_ifdef(`__POSTGRESQL__', `', `DISTRIBUTED BY (ruleId)')
              """.format(**locals()));

         post_tmp_table = unique_string('post_tmp_table')
         plpy.execute("""
              CREATE TEMP TABLE {post_tmp_table} AS
              SELECT ruleId, array_agg(item_text) as post
              FROM
                 (
                     SELECT
                         ruleId,
                         unnest(string_to_array(post, ','))::BIGINT as post_id
                     FROM {assoc_rules_aux_tmp}
                 ) s1, {assoc_item_uniq} s2
              WHERE s1.post_id = s2.item_id
              GROUP BY ruleId
              m4_ifdef(`__POSTGRESQL__', `', `DISTRIBUTED BY (ruleId)')
              """.format(**locals()));

         plpy.execute("""
              INSERT INTO {output_schema}.assoc_rules
              SELECT t1.ruleId, t2.pre, t3.post, t1.support*{num_tranx}::INT AS count, support,
                     confidence, lift, conviction
              FROM
                 {assoc_rules_aux_tmp} t1,
                 {pre_tmp_table} t2,
                 {post_tmp_table} t3
              WHERE t1.ruleId = t2.ruleId AND t1.ruleId = t3.ruleId
              """.format(**locals()));


         # if in verbose mode, we will keep all the intermediate tables
         if not verbose :
             plpy.execute("""
                 DROP TABLE IF EXISTS {post_tmp_table};
                 DROP TABLE IF EXISTS {pre_tmp_table};
                 DROP TABLE IF EXISTS {assoc_rules_aux_tmp};
                 DROP TABLE IF EXISTS {assoc_input_unique};
                 DROP TABLE IF EXISTS {assoc_item_uniq};
                 DROP TABLE IF EXISTS {assoc_enc_input};
                 DROP TABLE IF EXISTS {assoc_rule_sets};
                 DROP TABLE IF EXISTS {assoc_rule_sets_loop};
                 DROP TABLE IF EXISTS {assoc_rule_sets_loop_ordered};
                 DROP TABLE IF EXISTS {assoc_loop_aux};
                 """.format(**locals()));
         if verbose :
             rv = plpy.execute("""
                 SELECT count(*) as c FROM {0}.assoc_rules
                 """.format(output_schema));
             num_item_loop = rv[0]["c"];
             plpy.info("{0} Total association rules found. Time: {1}".format(
                     num_item_loop, time.time() - begin_step_exec));

         rv = plpy.execute("""
             SELECT count(*) as c FROM {0}.assoc_rules
             """.format(output_schema));
         total_rules = rv[0]["c"];

     return (
             output_schema,
             'assoc_rules',
             total_rules,
             time.time() - begin_func_exec
            );

 def assoc_rules_help_message(schema_madlib, message=None, **kwargs):
     """
     Given a help string, provide usage information

     Args:
         @param schema_madlib Name of the MADlib schema
         @param message  Helper message to print

     Returns:
         None
     """
     if message is not None and \
             message.lower() in ("usage", "help", "?"):
         return """
 -----------------------------------------------------------------------
                                 USAGE
 -----------------------------------------------------------------------
 SELECT {schema_madlib}.assoc_rules(
     support,            -- FLOAT8,  minimum level of support needed for each
                                     itemset to be included in result
     confidence,         -- FLOAT8,  minimum level of confidence needed for each
                                     rule to be included in result
     tid_col,            -- TEXT,    name of the column storing the transaction ids
     item_col,           -- TEXT,    name of the column storing the products
     input_table,        -- TEXT,    name of the table containing the input data
     output_schema,      -- TEXT,    name of the schema where the final results
                                     will be stored. The schema must be created
                                     before calling the function.
                                     Alternatively, use <tt>NULL</tt> to output
                                     to the current schema.
     verbose,            -- BOOLEAN, (optional, default: False) determines if
                                     details are printed for each iteration as
                                     the algorithm progresses
     max_itemset_size,   -- INTEGER, (optional, default: itemsets of all sizes)
                                     determines the maximum size of frequent
                                     itemsets allowed that are used for generating
                                     association rules. Value less than 2 throws an error.
     max_lhs_size,       -- INTEGER, (optional, default: NULL) determines the
                                     maximum size of the lhs of the rule.
                                     NULL means there is no restriction on the
                                     size of the left hand side.
     max_rhs_size        -- INTEGER  (optional, default: NULL) determines the
                                     maximum size of the rhs of the rule.
                                     NULL means there is no restriction on the
                                     size of the right hand side.
 );
 -------------------------------------------------------------------------
                                 OUTPUT TABLES
 -------------------------------------------------------------------------
 The output table "assoc_rules" in the "output_schema" contains a unique rule of the form "If X, then Y
 (i.e., X => Y)" in each row. X and Y are non-empty itemsets, called the antecedent and consequent, or
 the left-hand-side (LHS) and right-hand-side (LHS), of the rule respectively.

 in each row, with the following columns:
     ruleid,     -- INTEGER, row number
     pre,        -- TEXT, specifies the antecedent, or the LHS of the rule
     post,       -- DOUBLE, specifies the consequent, or the RHS of the rule
     support,    -- DOUBLE, support of the frequent itemset X,Y
     count,      -- INTEGER, number of transactions in the input table that contain X,Y
     confidence, -- DOUBLE, the ratio of number of transactions that contain X,Y to the number of transactions
                         that contain X
     lift,       -- DOUBLE, the ratio of observed support of X,Y to the expected support of X,Y, assuming X and
                         Y are independent.
     conviction  -- DOUBLE, the ratio of expected support of X occurring without Y assuming X and Y are
                         independent, to the observed support of X occuring without Y
         """.format(schema_madlib=schema_madlib)
     else:
         return """
 For an overview on usage, run: SELECT {schema_madlib}.assoc_rules('usage');
         """.format(schema_madlib=schema_madlib)