| # coding=utf-8 |
| |
| """@file bayes.py_in |
| |
| @brief Naive Bayes classification with user-defined smoothing factor (default: |
| Laplacian smoothing). |
| |
| @namespace bayes |
| |
| Naive Bayes: Setup Functions |
| |
| @internal |
| @implementation |
| |
| For the Naive Bayes Classification, we need a product over probabilities. |
| However, multiplying lots of small numbers can lead to an exponent overflow. |
| E.g., multiplying more than 324 numbers at most 0.1 will yield a product of 0 |
| in machine arithmetic. A safer way is therefore summing logarithms. |
| |
| By the IEEE 754 standard, the smallest number representable as |
| DOUBLE PRECISION (64bit) is $2^{-1022}$, i.e., approximately 2.225e-308. |
| See, e.g., http://en.wikipedia.org/wiki/Double_precision |
| Hence, log(x) = log_10(x) for any non-zero DOUBLE PRECISION @f$x \ge -308@f$. |
| |
| Note for theorists: |
| - Even adding infinitely many \f$ \log_{10}(x)@f$ for @f$0 < x \le 1 \f$ will |
| never cause an overflow because addition will have no effect once the sum |
  reaches approx $308 * 2^{53}$ (corresponding to the machine precision).
| |
| The functions __get_*_sql are private because we do not want to commit ourselves |
| to a particular interface. We might want to be able to change implementation |
| details should the need arise. |
| @endinternal |
| """ |
| |
| import plpy |
| from utilities.validate_args import table_exists |
| from utilities.validate_args import columns_exist_in_table |
| from utilities.validate_args import table_is_empty |
| |
def __nb_validate_source_table(tbl, attr_col, other_col, n_attr, kind):
    """Validate one data-source relation (training or classify data).

    Checks that the table name is non-empty, that the table exists and is
    not empty, that both required columns exist, and that 'numAttrs' does
    not exceed the length of the attributes array.

    @param tbl Name of the relation to check
    @param attr_col Name of the attributes-array column
    @param other_col Name of the class column (training) or of the key
           column (classify)
    @param n_attr Number of attributes to use for classification
    @param kind Either 'training' or 'classify'; only used in error messages
    """
    if tbl is None or tbl == '':
        plpy.error("Naive Bayes error: Invalid {0} table name!".format(kind))
    if not table_exists(tbl):
        plpy.error("Naive Bayes error: " + \
            "{0} table '{1}' does not exist!".format(kind.capitalize(), tbl))
    if table_is_empty(tbl):
        plpy.error("Naive Bayes error: " + \
            "{0} table '{1}' is empty!".format(kind.capitalize(), tbl))

    for c in (attr_col, other_col):
        if c is None or c == '':
            plpy.error("Naive Bayes error: Invalid column name!")
        if not columns_exist_in_table(tbl, [c]):
            plpy.error("Naive Bayes error: " + \
                "Column '{0}' does not exist!".format(c))

    # 'numAttrs' may not exceed the attributes-array length. Only the first
    # row is inspected; assumes all rows have equal-length arrays -- verify
    # against callers if that assumption ever changes.
    bound = plpy.execute("""SELECT {n_attr} <= array_upper({a_col}, 1)
        AS bound FROM {tbl} LIMIT 1""".format(n_attr=str(n_attr),
        a_col=attr_col, tbl=tbl))[0]['bound']
    if not bound:
        plpy.error("Naive Bayes error: " + \
            "'numAttrs' {0} is out of bound!".format(str(n_attr)))


def __nb_validate_args(**kwargs):
    """Validate parameters for Naive Bayes functions.

    This function performs general checks for all Naive Bayes functions,
    assuming valid if the argument is not in **kwargs.

    Possible keys in **kwargs:
    @param trainingSource name of relation containing training data
    @param trainingClassColumn name of column with class
    @param trainingAttrColumn name of column with attributes array
    @param numericAttrsColumnIndices list of column names that are to be treated as numeric
    @param numAttrs Number of attributes to use for classification
    @param numericFeatureStatsDestName Name of relation containing the descriptive statistics
           (mean,var) of the numerical features
    @param classPriorsDestName Name of class-priors relation to create
    @param featureProbsDestName Name of feature-probabilities relation to create
    @param classPriorsSource Relation (class, class_cnt, all_cnt) where
           class is c, class_cnt is \#c, all_cnt is the number of rows in
           \em trainingSource
    @param featureProbsSource Relation (class, attr, value, cnt, attr_cnt) where
           (class, attr, value) = (c,i,a), cnt = \#(c,i,a), and attr_cnt = \#i
    @param numericFeatureStatsSource Relation (class,attr,attr_mean,attr_var)
           where mean is the mean of the numeric attr and var is its stddev, used for gaussian smoothing
    @param classifySource Name of the relation that contains data to be classified
    @param classifyKeyColumn Name of column in \em classifySource that can
           serve as unique identifier
    @param classifyAttrColumn Name of attributes-array column in \em classifySource
    @param destName Name of the table or view to create
    """

    # Output relations must be named and must not exist yet.
    for k in ('classPriorsDestName', 'featureProbsDestName', 'destName',
              'numericFeatureStatsDestName'):
        if k in kwargs:
            out = kwargs[k]
            if out is None or out == '':
                plpy.error("Naive Bayes error: Invalid output table name!")
            if table_exists(out, only_first_schema=True):
                plpy.error("Naive Bayes error: Output table already exists. " + \
                    "Drop table '{0}' before calling the function.".format(out))

    if 'numAttrs' in kwargs:
        n_attr = kwargs['numAttrs']
        if n_attr is None or n_attr <= 0:
            plpy.error("Naive Bayes error: 'numAttrs' should be > 0!")

    # Required columns for each pre-computed model relation.
    model_tbl = {'classPriorsSource': ('class', 'class_cnt', 'all_cnt'),
                 'featureProbsSource': ('class', 'attr', 'value', 'cnt', 'attr_cnt'),
                 'numericFeatureStatsSource': ('class', 'attr', 'attr_mean', 'attr_var')}

    empty_tables = []

    for mdl in model_tbl:
        if mdl in kwargs:
            tbl = kwargs[mdl]

            if tbl is None or tbl == '':
                plpy.error("Naive Bayes error: Invalid {0} name!".format(mdl))
            if not table_exists(tbl):
                plpy.error("Naive Bayes error: " + \
                    "{0} '{1}' does not exist!".format(mdl, tbl))
            if table_is_empty(tbl):
                empty_tables += [mdl]

            # One call verifies all required columns at once. (The previous
            # per-column loop re-checked the whole column list on every
            # iteration, and its error message was garbled because the
            # string concatenation bound tighter than .format().)
            if not columns_exist_in_table(tbl, model_tbl[mdl]):
                plpy.error("Naive Bayes error: {0} '{1}' does not have the "
                           "necessary column(s)!".format(mdl, tbl))

    # classPriorsSource should never be empty
    if 'classPriorsSource' in empty_tables:
        plpy.error("Naive Bayes error: " + \
            "{0} '{1}' is empty!".format('classPriorsSource', kwargs['classPriorsSource']))

    # at most one of the other tables can be empty
    if 'featureProbsSource' in empty_tables and 'numericFeatureStatsSource' in empty_tables:
        plpy.error("Naive Bayes error: " + \
            "{0} '{1}' and {2} '{3}' are empty!".format(
                'featureProbsSource',
                kwargs['featureProbsSource'],
                'numericFeatureStatsSource',
                kwargs['numericFeatureStatsSource']))

    # Training and classify sources share the same validation logic.
    if 'trainingSource' in kwargs:
        __nb_validate_source_table(
            kwargs['trainingSource'],
            kwargs['trainingAttrColumn'],
            kwargs['trainingClassColumn'],
            kwargs['numAttrs'],
            'training')

    if 'classifySource' in kwargs:
        __nb_validate_source_table(
            kwargs['classifySource'],
            kwargs['classifyAttrColumn'],
            kwargs['classifyKeyColumn'],
            kwargs['numAttrs'],
            'classify')
| |
| def __get_numeric_attr_indices(numericFeatureStatsSource): |
| """ Return the SQL array containing the indices of the |
| numeric attributes in the numericFeatureStatsSource |
| table |
| """ |
| return """ |
| SELECT array_agg(DISTINCT(attr)) |
| FROM {0} as numericFeatureStatsSource |
| """.format(numericFeatureStatsSource) |
| |
| def __get_feature_probs_sql(**kwargs): |
| """Return SQL query with columns (class, attr, value, cnt, attr_cnt). |
| |
| For class c, attr i, and value a, cnt is #(c,i,a) and attr_cnt is \#i. |
| |
| Note that the query will contain a row for every pair (class, value) |
| occuring in the training data (so it might also contain rows where |
| \#(c,i,a) = 0). |
| |
| @param classPriorsSource Relation (class, class_cnt, all_cnt) where |
| class is c, class_cnt is \#c, all_cnt is the number of rows in |
| \em trainingSource |
| @param attrValuesSource Relation (attr, value) containing all distinct |
| attribute, value pairs. If omitted, will use __get_attr_values_sql() |
| @param attrCountsSource Relation (attr, attr_cnt) where attr is i and |
| attr_cnt is \#i. If omitted, will use __get_attr_counts_sql() |
| @param trainingSource name of relation containing training data |
| @param trainingClassColumn name of column with class |
| @param trainingAttrColumn name of column with attributes array |
| @param numAttrs Number of attributes to use for classification |
| @param numericAttrsColumnIndices Indices of the attributes which are numeric |
| and hence must be removed from the feature probs table |
| |
| For meanings of \#(c,i,a), \#c, and \#i see the general description of |
| \ref bayes. |
| """ |
| |
| if not 'attrValuesSource' in kwargs: |
| kwargs.update(dict( |
| attrValuesSource = "(" + __get_attr_values_sql(**kwargs) + ")" |
| )) |
| if not 'attrCountsSource' in kwargs: |
| kwargs.update(dict( |
| attrCountsSource = "(" + __get_attr_counts_sql(**kwargs) + ")" |
| )) |
| |
| # {trainingSource} cannot be a subquery, because we use it more than once in |
| # our generated SQL. |
| sql = """ |
| SELECT |
| class, |
| attr, |
| value, |
| cnt, |
| attr_cnt |
| FROM( |
| SELECT |
| class, |
| attr, |
| value, |
| coalesce(cnt, 0) AS cnt, |
| attr_cnt |
| FROM |
| ( |
| SELECT * |
| FROM |
| {classPriorsSource} AS classes |
| CROSS JOIN |
| {attrValuesSource} AS attr_values |
| ) AS required_triples |
| LEFT OUTER JOIN |
| ( |
| SELECT |
| trainingSource.{trainingClassColumn} AS class, |
| generate_series(1, {numAttrs}) AS attr, |
| unnest(trainingSource.{trainingAttrColumn}) AS value, |
| count(*) AS cnt |
| FROM |
| {trainingSource} AS trainingSource |
| WHERE |
| {schema_madlib}.array_contains_null({trainingAttrColumn}) = 'f' |
| GROUP BY |
| class, |
| attr, |
| value |
| ) AS triple_counts |
| USING (class, attr, value) |
| INNER JOIN |
| {attrCountsSource} AS attr_counts |
| USING (attr) |
| ) p |
| """ |
| |
| if kwargs.has_key("numericAttrsColumnIndices") \ |
| and kwargs['numericAttrsColumnIndices'].lower() != 'array[]': |
| sql += """ |
| WHERE NOT EXISTS ( |
| SELECT 1 |
| FROM( |
| SELECT unnest({numericAttrsColumnIndices}::INTEGER[]) as attr |
| ) q |
| WHERE p.attr=q.attr |
| ) |
| """ |
| |
| return sql.format(**kwargs) |
| |
| def __get_attr_values_sql(**kwargs): |
| """ |
| Return SQL query with columns (attr, value). |
| |
| The query contains a row for each pair that occurs in the training data. |
| |
| @param trainingSource Name of relation containing the training data |
| @param trainingAttrColumn Name of attributes-array column in training data |
| @param numAttrs Number of attributes to use for classification |
| |
| @internal |
| \par Implementation Notes: |
| If PostgreSQL supported count(DISTINCT ...) for window functions, we could |
| consolidate this function with __get_attr_counts_sql(): |
| @verbatim |
| [...] count(DISTINCT value) OVER (PARTITION BY attr) [...] |
| @endverbatim |
| @endinternal |
| |
| """ |
| |
| return """ |
| SELECT attr, value |
| FROM( |
| SELECT |
| generate_series(1, {numAttrs}) AS attr, |
| unnest(trainingSource.{trainingAttrColumn}) AS value |
| FROM |
| {trainingSource} AS trainingSource |
| WHERE |
| {schema_madlib}.array_contains_null({trainingAttrColumn}) = 'f' |
| GROUP BY attr, value |
| ) l |
| WHERE value IS NOT NULL |
| """.format(**kwargs) |
| |
| |
| def __get_attr_counts_sql(**kwargs): |
| """ |
| Return SQL query with columns (attr, attr_cnt) |
| |
| For attr i, attr_cnt is \#i. |
| |
| @param trainingSource Name of relation containing the training data |
| @param trainingAttrColumn Name of attributes-array column in training data |
| @param numAttrs Number of attributes to use for classification |
| |
| """ |
| |
| return """ |
| SELECT |
| attr, count(value) AS attr_cnt |
| FROM |
| ( |
| SELECT |
| attr, value |
| FROM ( |
| SELECT |
| generate_series(1, {numAttrs}) AS attr, |
| unnest(trainingSource.{trainingAttrColumn}) AS value |
| FROM |
| {trainingSource} AS trainingSource |
| WHERE |
| {schema_madlib}.array_contains_null({trainingAttrColumn}) = 'f' |
| ) l |
| GROUP BY attr, value |
| ) m |
| GROUP BY attr |
| """.format(**kwargs) |
| |
| def __get_attr_numeric_stats_sql(**kwargs): |
| """ |
| Return SQL query with columns (class,attr, attr_mean,attr_var) for |
| numeric attributes |
| |
| @param trainingSource Name of relation containing the training data |
| @param trainingClassColumn Name of the class column in the training source table |
| @param trainingAttrColumn Name of attributes-array column in training data |
| @param numAttrs Number of attributes to use for classification |
| @param numericAttrsColumnIndices Indices of attributes in trainingAttrColumn array to be treated as numeric |
| |
| """ |
| |
| return """ |
| SELECT |
| class,attr, avg(value) AS attr_mean , variance(value) as attr_var |
| FROM |
| ( |
| SELECT |
| l.class,l.attr,l.value |
| FROM ( |
| SELECT |
| trainingSource.{trainingClassColumn} as class, |
| generate_series(1, {numAttrs}) AS attr, |
| unnest(trainingSource.{trainingAttrColumn}) AS value |
| FROM |
| {trainingSource} AS trainingSource |
| WHERE |
| {schema_madlib}.array_contains_null({trainingAttrColumn}) = 'f' |
| ) l |
| INNER JOIN ( |
| SELECT |
| unnest({numericAttrsColumnIndices}::INTEGER[]) AS attr |
| ) m |
| ON (l.attr=m.attr) |
| ) p |
| GROUP BY class,attr |
| """.format(**kwargs) |
| |
| |
| def __get_class_priors_sql(**kwargs): |
| """ |
| Return SQL query with columns (class, class_cnt, all_cnt) |
| |
| For class c, class_cnt is \#c. all_cnt is the total number of records in the |
| training data. |
| |
| @param trainingSource Name of relation containing the training data |
| @param trainingClassColumn Name of class column in training data |
| |
| """ |
| |
| return """ |
| SELECT * FROM |
| ( |
| SELECT |
| trainingSource.{trainingClassColumn} AS class, |
| count(*) AS class_cnt |
| FROM {trainingSource} AS trainingSource |
| WHERE |
| {schema_madlib}.array_contains_null({trainingAttrColumn}) = 'f' |
| GROUP BY trainingSource.{trainingClassColumn} |
| ) l |
| CROSS JOIN |
| ( |
| SELECT |
| count(*) AS all_cnt |
| FROM {trainingSource} AS trainingSource |
| WHERE |
| {schema_madlib}.array_contains_null({trainingAttrColumn}) = 'f' |
| ) m |
| """.format(**kwargs) |
| |
| def __get_class_prior_prob_values_sql(**kwargs): |
| """ |
| Return SQL query with columns (key, class, log_prob), where log_prob |
| is the class prior probability. This will return all possible combinations |
| of key,class so that they can later be grouped together and added with the |
| log probabilites of the attributes. |
| |
| @param numAttrs Number of attributes to use for classification |
| @param classifySource Name of the relation that contains data to be classified |
| @param classifyKeyColumn Name of column in \em classifySource that can |
| serve as unique identifier |
| @param classifyAttrColumn Name of attributes-array column in \em classifySource |
| @param classPriorsSource |
| Relation (class, class_cnt, all_cnt) where |
| class is c, class_cnt is \#c, all_cnt is the number of training |
| samples. |
| """ |
| |
| return """ |
| SELECT |
| classify.{classifyKeyColumn} AS key, |
| classes.class, |
| log(classes.class_cnt::DOUBLE PRECISION/classes.all_cnt) |
| FROM |
| {classifySource} AS classify, |
| {classPriorsSource} AS classes |
| WHERE |
| {schema_madlib}.array_contains_null({classifyAttrColumn}) = 'f' |
| """.format(**kwargs) |
| |
| |
| |
| def __get_keys_and_prob_values_sql(**kwargs): |
| """ |
| Return SQL query with columns (key, class, log_prob). |
| |
| For class c and the attribute array identified by key k, log_prob is |
| log( P(C = c) * P(A = a(k)[] | C = c) ). |
| |
| For each key k and class c, the query also contains a row (k, c, NULL). This |
| is for technical reasons (we want every key-class pair to appear in the |
| query. NULL serves as a default value if there is insufficient training data |
| to compute a probability value). |
| |
| @param numAttrs Number of attributes to use for classification |
| @param classifySource Name of the relation that contains data to be classified |
| @param classifyKeyColumn Name of column in \em classifySource that can |
| serve as unique identifier |
| @param classifyAttrColumn Name of attributes-array column in \em classifySource |
| @param classPriorsSource |
| Relation (class, class_cnt, all_cnt) where |
| class is c, class_cnt is \#c, all_cnt is the number of training |
| samples. |
| @param featureProbsSource |
| Relation (class, attr, value, cnt, attr_cnt) where |
| (class, attr, value) = (c,i,a), cnt = \#(c,i,a), and attr_cnt = \#i |
| @param smoothingFactor Smoothing factor for computing feature |
| feature probabilities. Default value: 1.0 (Laplacian Smoothing). |
| |
| """ |
| |
| # {classifySource} cannot be a subquery, because we use it more than once in |
| # our generated SQL. |
| return """ |
| SELECT t2.key, t2.class, t1.log_prob FROM |
| ( |
| SELECT |
| classify.key, |
| classPriors.class, |
| CASE WHEN count(*) < 1 THEN 0 -- ignoring NULL attributes for now |
| -- {numAttrs} THEN NULL |
| ELSE |
| sum( log((featureProbs.cnt::DOUBLE PRECISION + |
| {smoothingFactor})/ (classPriors.class_cnt + |
| {smoothingFactor} * featureProbs.attr_cnt)) ) |
| END |
| AS log_prob |
| FROM |
| {featureProbsSource} AS featureProbs, |
| {classPriorsSource} AS classPriors, |
| ( |
| SELECT |
| classifySource.{classifyKeyColumn} AS key, |
| generate_series(1, {numAttrs}) AS attr, |
| unnest(classifySource.{classifyAttrColumn}) AS value |
| FROM |
| {classifySource} AS classifySource |
| WHERE |
| {schema_madlib}.array_contains_null({classifyAttrColumn}) = 'f' |
| ) AS classify |
| WHERE |
| featureProbs.class = classPriors.class AND |
| featureProbs.attr = classify.attr AND |
| ( CASE WHEN classify.value is NULL |
| THEN false |
| ELSE featureProbs.value = classify.value |
| END |
| ) |
| AND |
| ({smoothingFactor} > 0 OR featureProbs.cnt > 0) -- prevent division by 0 |
| GROUP BY |
| classify.key, classPriors.class, classPriors.class_cnt, classPriors.all_cnt |
| ) t1 |
| RIGHT OUTER JOIN |
| ( |
| SELECT |
| classify.{classifyKeyColumn} AS key, |
| classes.class |
| FROM |
| {classifySource} AS classify, |
| {classPriorsSource} AS classes |
| WHERE |
| {schema_madlib}.array_contains_null({classifyAttrColumn}) = 'f' |
| GROUP BY classify.{classifyKeyColumn}, classes.class |
| ) t2 |
| ON t1.key = t2.key AND t1.class=t2.class |
| """.format(**kwargs) |
| |
| def __get_keys_and_probs_values_numeric_attrs_sql(**kwargs): |
| """ |
| This calculates the log probabilites of numeric attributes |
| based on the gaussian smoothing formula. |
| This will result in a table of the form (key,class,<sum of prob_values for |
| all_numeric_attrs>).These probabilites can then be added to the log |
| probabilities obtained in __get_keys_and_probs_values and the class priors |
| computed in __get_class_prior_prob_values_sql to compute the final |
| log probabilities. |
| |
| @param numAttrs Number of attributes to use for classification |
| @param classifySource Name of the relation that contains data to be classified |
| @param classifyKeyColumn Name of column in \em classifySource that can |
| serve as unique identifier |
| @param classifyAttrColumn Name of attributes-array column in \em classifySource |
| @param classPriorsSource |
| Relation (class, class_cnt, all_cnt) where |
| class is c, class_cnt is \#c, all_cnt is the number of training |
| samples. |
| @param numericFeatureStatsSource |
| Relation (class, attr, attr_mean, attr_var) for gaussian smoothing |
| of continuous attributes. |
| """ |
| |
| # {classifySource} cannot be a subquery, because we use it more than once in |
| # our generated SQL. |
| return """ |
| SELECT s2.key, s2.class, s1.log_prob FROM |
| ( |
| SELECT |
| classify.key, |
| numericFeatureStatsSource.class, |
| sum(-0.5*(log(2*pi()*attr_var::DOUBLE PRECISION) + log(exp(1.0))*(((attr_mean::DOUBLE PRECISION-value)^2)/(attr_var::DOUBLE PRECISION)))) |
| AS log_prob |
| FROM |
| {numericFeatureStatsSource} AS numericFeatureStatsSource, |
| ( |
| SELECT |
| classifySource.{classifyKeyColumn} AS key, |
| generate_series(1, {numAttrs}) AS attr, |
| unnest(classifySource.{classifyAttrColumn}) AS value |
| FROM |
| {classifySource} AS classifySource |
| WHERE |
| {schema_madlib}.array_contains_null({classifyAttrColumn}) = 'f' |
| ) AS classify |
| WHERE |
| numericFeatureStatsSource.attr = classify.attr |
| GROUP BY |
| classify.key,numericFeatureStatsSource.class |
| ) s1 |
| RIGHT OUTER JOIN |
| ( |
| SELECT |
| classify.{classifyKeyColumn} AS key, |
| classes.class |
| FROM |
| {classifySource} AS classify, |
| {classPriorsSource} AS classes |
| WHERE |
| {schema_madlib}.array_contains_null({classifyAttrColumn}) = 'f' |
| GROUP BY classify.{classifyKeyColumn}, classes.class |
| ) s2 |
| ON s1.key = s2.key AND s1.class=s2.class |
| """.format(**kwargs) |
| |
| |
def __get_classification_sql(**kwargs):
    """
    Return SQL query with columns (key, nb_classification, nb_log_probability).

    @param keys_and_prob_values Relation (key, class, log_prob)

    """
    # Wrap the per-(key, class) log-probability query as a parenthesized
    # subquery before splicing it into the aggregation below.
    prob_values_subquery = "(" + __get_keys_and_prob_values_sql(**kwargs) + ")"
    kwargs.update(keys_and_prob_values=prob_values_subquery)
    return """
    SELECT
        key,
        {schema_madlib}.argmax(class, log_prob) AS nb_classification,
        max(log_prob) AS nb_log_probability
    FROM {keys_and_prob_values} AS keys_and_nb_values
    GROUP BY key
    """.format(**kwargs)
| |
def create_prepared_data_table(**kwargs):
    """Wrapper around create_prepared_data() to create a table (and not a view)
    """
    # Remember the session's current message level so it can be restored below.
    old_msg_level = plpy.execute("select setting from pg_settings where \
        name='client_min_messages'")[0]['setting']
    # Silence notices/warnings while the helper relations are (re)created.
    plpy.execute("set client_min_messages to error")
    kwargs.update(whatToCreate = 'TABLE')
    # Validate all arguments before doing any work.
    __nb_validate_args(**kwargs)
    create_prepared_data(**kwargs)
    plpy.execute("set client_min_messages to " + old_msg_level)
| |
def create_prepared_data(**kwargs):
    """Precompute all class priors and feature probabilities.

    When the precomputations are stored in a table, this function will create
    indices that speed up lookups necessary for Naive Bayes classification.
    Moreover, it runs ANALYZE on the new tables to allow for optimized query
    plans.

    Class priors are stored in a relation with columns
    (class, class_cnt, all_cnt).

    @param trainingSource Name of relation containing the training data
    @param trainingClassColumn Name of class column in training data
    @param trainingAttrColumn Name of attributes-array column in training data
    @param numericAttrsColumnIndices Name of indices-array (index starting from 1) that contains indices of numeric attrs
    @param numAttrs Number of attributes to use for classification

    @param whatToCreate (Optional) Either \c 'TABLE' OR \c 'VIEW' (the default).
    @param classPriorsDestName Name of class-priors relation to create
    @param featureProbsDestName Name of feature-probabilities relation to create
    @param numericFeatureStatsDestName Name of the statistics table for numeric attributes

    """
    # Membership tests use 'in' rather than the Python-2-only dict.has_key().
    if 'numAttrs' not in kwargs:
        plpy.error("'numAttrs' must be provided")

    if 'trainingSource' not in kwargs:
        plpy.error("'trainingSource' must be provided")

    if 'trainingAttrColumn' not in kwargs:
        plpy.error("'trainingAttrColumn' must be provided")

    __verify_attr_num(
        kwargs["trainingSource"],
        kwargs["trainingAttrColumn"],
        kwargs["numAttrs"])

    # if only one of these is present, error out
    if ('numericAttrsColumnIndices' in kwargs) ^ ('numericFeatureStatsDestName' in kwargs):
        plpy.error("Both 'numericAttrsColumnIndices' and 'numericFeatureStatsDestName' must be provided")

    # verify if the attributes specified in the numericAttrsColumnIndices are
    # really numeric in nature. TODO
    if 'numericAttrsColumnIndices' in kwargs and 'numericFeatureStatsDestName' in kwargs:
        __verify_numeric_attr_type(
            kwargs["trainingSource"],
            kwargs["trainingAttrColumn"],
            kwargs["numericAttrsColumnIndices"],
            kwargs["numAttrs"])

    # create the tables for the categorical kind attributes
    if kwargs['whatToCreate'] == 'TABLE':
        # FIXME: ANALYZE is not portable.
        kwargs.update(dict(
            attrCountsSource = '_madlib_nb_attr_counts',
            attrValuesSource = '_madlib_nb_attr_values'
        ))
        plpy.execute("""
            DROP TABLE IF EXISTS {attrCountsSource};
            CREATE TEMPORARY TABLE {attrCountsSource}
            AS
            {attr_counts_sql}
            m4_ifdef(`__POSTGRESQL__', `', `DISTRIBUTED BY (attr)');
            ALTER TABLE {attrCountsSource} ADD PRIMARY KEY (attr);
            ANALYZE {attrCountsSource};

            DROP TABLE IF EXISTS {attrValuesSource};
            CREATE TEMPORARY TABLE {attrValuesSource}
            AS
            {attr_values_sql}
            m4_ifdef(`__POSTGRESQL__', `', `DISTRIBUTED BY (attr, value)');

            ALTER TABLE {attrValuesSource} ADD PRIMARY KEY (attr, value);
            ANALYZE {attrValuesSource};
            """.format(
                attrCountsSource = kwargs['attrCountsSource'],
                attrValuesSource = kwargs['attrValuesSource'],
                attr_counts_sql = "(" + __get_attr_counts_sql(**kwargs) + ")",
                attr_values_sql = "(" + __get_attr_values_sql(**kwargs) + ")"
            )
        )

    kwargs.update(dict(
        sql = __get_class_priors_sql(**kwargs)
    ))

    plpy.execute("""
        CREATE {whatToCreate} {classPriorsDestName}
        AS
        {sql}
        m4_ifdef(`__POSTGRESQL__', `', `DISTRIBUTED BY (class)');
        """.format(**kwargs)
    )
    if kwargs['whatToCreate'] == 'TABLE':
        plpy.execute("""
            ALTER TABLE {classPriorsDestName} ADD PRIMARY KEY (class);
            ANALYZE {classPriorsDestName};
            """.format(**kwargs))

    kwargs.update(dict(
        classPriorsSource = kwargs['classPriorsDestName']
    ))


    # Create the attr-mean-var-class table (for later use in computing probs).
    # This can be directly computed using the sql query. no need for intermediate table
    # Also update here the pointer to this globally created table
    if 'numericAttrsColumnIndices' in kwargs and 'numericFeatureStatsDestName' in kwargs:
        plpy.execute("""
            CREATE {whatToCreate} {numericFeatureStatsDestName}
            AS
            {sql}
            m4_ifdef(`__POSTGRESQL__', `', `DISTRIBUTED BY (class, attr)');
            """.format(
                whatToCreate = kwargs['whatToCreate'],
                numericFeatureStatsDestName = kwargs['numericFeatureStatsDestName'],
                sql = __get_attr_numeric_stats_sql(**kwargs)
            )
        )
    else:
        # No numeric attributes: make __get_feature_probs_sql() keep all rows.
        kwargs.update(dict(
            numericAttrsColumnIndices = 'ARRAY[]'
        ))

    kwargs.update(dict(
        sql = __get_feature_probs_sql(**kwargs)
    ))


    plpy.execute("""
        CREATE {whatToCreate} {featureProbsDestName}
        AS
        {sql}
        m4_ifdef(`__POSTGRESQL__', `', `DISTRIBUTED BY (class, attr, value)');
        """.format(**kwargs)
    )


    if kwargs['whatToCreate'] == 'TABLE':
        plpy.execute("""
            ALTER TABLE {featureProbsDestName} ADD PRIMARY KEY (class, attr, value);
            ANALYZE {featureProbsDestName};
            DROP TABLE {attrCountsSource};
            DROP TABLE {attrValuesSource};
            """.format(**kwargs))

    if 'numericAttrsColumnIndices' in kwargs and 'numericFeatureStatsDestName' in kwargs:
        if kwargs['whatToCreate'] == 'TABLE':
            plpy.execute("""
                ALTER TABLE {numericFeatureStatsDestName} ADD PRIMARY KEY (class, attr);
                ANALYZE {numericFeatureStatsDestName};
                """.format(**kwargs))
| |
def create_classification_view(**kwargs):
    """Wrapper around create_classification() to create a view (and not a table)
    """
    # Remember the session's current message level so it can be restored below.
    old_msg_level = plpy.execute("select setting from pg_settings where \
        name='client_min_messages'")[0]['setting']
    # Silence notices/warnings while the view is created.
    plpy.execute("set client_min_messages to error")
    kwargs.update(whatToCreate = 'VIEW')
    # Validate all arguments before doing any work.
    __nb_validate_args(**kwargs)
    create_classification(**kwargs)
    plpy.execute("set client_min_messages to " + old_msg_level)
| |
| |
def create_classification(**kwargs):
    """
    Create a view/table with columns (key, nb_classification).

    The created relation will be

    <tt>{TABLE|VIEW} <em>destName</em> (key, nb_classification)</tt>

    where \c nb_classification is an array containing the most likely
    class(es) of the record in \em classifySource identified by \c key.

    There are two sets of arguments this function can be called with. The
    following parameters are always needed:
    @param numAttrs Number of attributes to use for classification
    @param destName Name of the table or view to create
    @param whatToCreate (Optional) Either \c 'TABLE' OR \c 'VIEW' (the default).
    @param smoothingFactor (Optional) Smoothing factor for computing feature
        feature probabilities. Default value: 1.0 (Laplacian Smoothing).
    @param classifySource Name of the relation that contains data to be classified
    @param classifyKeyColumn Name of column in \em classifySource that can
        serve as unique identifier
    @param classifyAttrColumn Name of attributes-array column in \em classifySource

    Furthermore, provide either:
    @param classPriorsSource
        Relation (class, class_cnt, all_cnt) where
        class is c, class_cnt is \#c, all_cnt is the number of training
        samples.
    @param featureProbsSource
        Relation (class, attr, value, cnt, attr_cnt) where
        (class, attr, value) = (c,i,a), cnt = \#(c,i,a), and attr_cnt = \#i
    In addition to the above, for numeric attributes provide:
    @param numericFeatureStatsSource
        Relation (class,attr,attr_mean,attr_var) where
        attr_mean and attr_var are the mean and variance of attr respectively

    Or have this function operate on the "raw" training data:
    @param trainingSource
        Name of relation containing the training data
    @param trainingClassColumn
        Name of class column in training data
    @param trainingAttrColumn
        Name of attributes-array column in \em trainingSource
    In addition to the above, for numeric attributes provide:
    @param numericAttrsColumnIndices
        Array containing attribute indices of numeric attributes

    """
    __init_prepared_data(kwargs)

    # '!=' replaces the Python-2-only '<>' operator; membership tests use
    # 'in' rather than dict.has_key(). Both must be supplied together.
    if ('trainingSource' in kwargs) != ('trainingAttrColumn' in kwargs):
        plpy.error("'trainingSource' and 'trainingAttrColumn' must be provided together")

    if 'numAttrs' not in kwargs:
        plpy.error("'numAttrs' must be provided")

    if 'trainingSource' in kwargs:
        __verify_attr_num(
            kwargs["trainingSource"],
            kwargs["trainingAttrColumn"],
            kwargs["numAttrs"])

    if 'classifySource' not in kwargs:
        plpy.error("'classifySource' must be provided")

    if 'classifyAttrColumn' not in kwargs:
        plpy.error("'classifyAttrColumn' must be provided")

    __verify_attr_num(
        kwargs["classifySource"],
        kwargs["classifyAttrColumn"],
        kwargs["numAttrs"])

    kwargs.update(
        keys_and_prob_values = "(" + __get_keys_and_prob_values_sql(**kwargs) + ")",
        class_prior_prob_values = "(" + __get_class_prior_prob_values_sql(**kwargs) + ")"
    )

    # if there are any numeric attributes in the test data, compute their probabilities as well.
    if 'numericFeatureStatsSource' in kwargs:
        kwargs.update(dict(
            numeric_attrs_keys_and_prob_values = "(" + __get_keys_and_probs_values_numeric_attrs_sql(**kwargs) + ")"
        ))
        # now add these probabilities to the original table
        plpy.execute("""
            CREATE {whatToCreate} {destName} AS
            SELECT
                p.key,
                {schema_madlib}.argmax(p.class, (p.log_prob)) AS nb_classification
            FROM(
                SELECT
                   key,
                   class,
                   sum(log_prob) as log_prob
                FROM(
                    {keys_and_prob_values}
                    UNION ALL
                    {numeric_attrs_keys_and_prob_values}
                    UNION ALL
                    {class_prior_prob_values}
                ) x
                GROUP BY key,class
            ) p
            GROUP BY p.key
            """.format(**kwargs))
    else:
        plpy.execute("""
            CREATE {whatToCreate} {destName} AS
            SELECT
                key,
                {schema_madlib}.argmax(class, log_prob) AS nb_classification
            FROM (
                SELECT
                   key,
                   class,
                   sum(log_prob) as log_prob
                FROM(
                    {keys_and_prob_values}
                    UNION ALL
                    {class_prior_prob_values}
                ) x
                GROUP BY key,class
            ) AS keys_and_nb_values
            GROUP BY key
            """.format(**kwargs))
| |
| |
def create_bayes_probabilities_view(**kwargs):
    """Wrapper around create_bayes_probabilities() to create a view (and not a table)

    Temporarily lowers client_min_messages to suppress notices emitted while
    the view is created, and restores the previous level afterwards — even if
    validation or view creation raises an error (hence the try/finally).
    """
    old_msg_level = plpy.execute("select setting from pg_settings where \
        name='client_min_messages'")[0]['setting']
    plpy.execute("set client_min_messages to error")
    try:
        kwargs.update(whatToCreate = 'VIEW')
        __nb_validate_args(**kwargs)
        create_bayes_probabilities(**kwargs)
    finally:
        # Restore the caller's message level on both success and failure.
        plpy.execute("set client_min_messages to " + old_msg_level)
| |
def create_bayes_probabilities(**kwargs):
    """Create table/view with columns (key, class, nb_prob)

    The created relation will be

    <tt>{TABLE|VIEW} <em>destName</em> (key, class, nb_prob)</tt>

    where \c nb_prob is the Naive-Bayes probability that \c class is the true
    class of the record in \em classifySource identified by \c key.

    There are two sets of arguments this function can be called with. The
    following parameters are always needed:
    @param numAttrs Number of attributes to use for classification
    @param destName Name of the table or view to create
    @param whatToCreate (Optional) Either \c 'TABLE' OR \c 'VIEW' (the default).
    @param smoothingFactor (Optional) Smoothing factor for computing feature
           probabilities. Default value: 1.0 (Laplacian Smoothing).

    Furthermore, provide either:
    @param classPriorsSource
        Relation (class, class_cnt, all_cnt) where
        class is c, class_cnt is \#c, all_cnt is the number of training
        samples.
    @param featureProbsSource
        Relation (class, attr, value, cnt, attr_cnt) where
        (class, attr, value) = (c,i,a), cnt = \#(c,i,a), and attr_cnt = \#i
    In addition to the above, for numeric attributes provide:
    @param numericFeatureStatsSource
        Relation (class,attr,attr_mean,attr_var) where
        attr_mean and attr_var are the mean and variance of attr respectively

    Or have this function operate on the "raw" training data:
    @param trainingSource
        Name of relation containing the training data
    @param trainingClassColumn
        Name of class column in training data
    @param trainingAttrColumn
        Name of attributes-array column in training data
    In addition to the above, for numeric attributes provide:
    @param numericAttrsColumnIndices
        Array containing attribute indices of numeric attributes

    @internal
    \par Implementation Notes:

    We have two numerical problems when computing the probabilities
    @verbatim
                          P(C = c) * P(A = a | C = c)
     P(C = c) = ---------------------------------    (*)
                --
                \ P(C = c') * P(A = a | C = c')
                /_
                c'
                __
     where P(A = a | C = c) = || P(A_i = a_i | C = c).
                              i
    @endverbatim

    1. P(A = a | C = c) could be a very small number not representable in
       double-precision floating-point arithmetic.
       - Solution: We have log( P(C = c) * P(A = a | C = c) ) as intermediate
         results. We will add the maximum absolute value of these intermediate
         results to all of them. This corresponds to multiplying numerator and
         denominator of (*) with the same factor. The "normalization" ensures
         that the numerator of (*) can never be 0 (in FP arithmetic) for all c.

    2. PostgreSQL raises an error in case of underflows, even when 0 is the
       desirable outcome.
       - Solution: if log_10 ( P(A = a | C = c) ) < -300, we interpret
         P(A = a | C = c) = 0. Note here that 1e-300 is roughly in the order of
         magnitude of the smallest double precision FP number.
    @endinternal
    """

    # Fill in derived/default parameters (mutates kwargs in place).
    __init_prepared_data(kwargs)

    # trainingSource and trainingAttrColumn only make sense as a pair.
    # (Boolean '!=' is an XOR here: error iff exactly one was given.)
    if ('trainingSource' in kwargs) != ('trainingAttrColumn' in kwargs):
        plpy.error("'trainingSource' and 'trainingAttrColumn' must be provided together")

    if 'numAttrs' not in kwargs:
        plpy.error("'numAttrs' must be provided")

    if 'trainingSource' in kwargs:
        __verify_attr_num(
            kwargs["trainingSource"],
            kwargs["trainingAttrColumn"],
            kwargs["numAttrs"])

    if 'classifySource' not in kwargs:
        plpy.error("'classifySource' must be provided")

    if 'classifyAttrColumn' not in kwargs:
        plpy.error("'classifyAttrColumn' must be provided")

    __verify_attr_num(
        kwargs["classifySource"],
        kwargs["classifyAttrColumn"],
        kwargs["numAttrs"])

    kwargs.update(
        keys_and_prob_values = "(" + __get_keys_and_prob_values_sql(**kwargs) + ")",
        class_prior_prob_values = "(" + __get_class_prior_prob_values_sql(**kwargs) + ")"
    )

    # if there are any numeric attributes in the test data, compute their probabilities as well.
    if 'numericFeatureStatsSource' in kwargs:
        kwargs.update(
            numeric_attrs_keys_and_prob_values = "(" + __get_keys_and_probs_values_numeric_attrs_sql(**kwargs) + ")"
        )
        # use the modified version of the query to calculate final probabilities
        # by adding the probabilities for the numeric attrs as well.
        # The CASE/pow(10, ...) normalization implements notes 1. and 2. above.
        plpy.execute("""
            CREATE {whatToCreate} {destName} AS
            SELECT
                key,
                class,
                nb_prob / sum(nb_prob) OVER (PARTITION BY key) AS nb_prob
            FROM
            (
                SELECT
                    p.key,
                    p.class,
                    CASE WHEN max(p.log_prob) - max(max(p.log_prob)) OVER (PARTITION BY p.key) < -300 THEN 0
                         ELSE pow(10, max(p.log_prob) - max(max(p.log_prob)) OVER (PARTITION BY p.key))
                    END AS nb_prob
                FROM
                (
                   SELECT
                      key,
                      class,
                      sum(log_prob) as log_prob
                   FROM(
                      {keys_and_prob_values}
                      UNION ALL
                      {numeric_attrs_keys_and_prob_values}
                      UNION ALL
                      {class_prior_prob_values}
                   ) x
                   GROUP BY key,class
                ) p
                GROUP BY
                    p.key, p.class
            ) AS keys_and_nb_values
            ORDER BY
                key, class
            """.format(**kwargs))

    else:
        plpy.execute("""
            CREATE {whatToCreate} {destName} AS
            SELECT
                key,
                class,
                nb_prob / sum(nb_prob) OVER (PARTITION BY key) AS nb_prob
            FROM
            (
                SELECT
                    key,
                    class,
                    CASE WHEN max(log_prob) - max(max(log_prob)) OVER (PARTITION BY key) < -300 THEN 0
                         ELSE pow(10, max(log_prob) - max(max(log_prob)) OVER (PARTITION BY key))
                    END AS nb_prob
                FROM(
                    SELECT
                        key,
                        class,
                        sum(log_prob) as log_prob
                    FROM(
                        {keys_and_prob_values}
                        UNION ALL
                        {class_prior_prob_values}
                    ) x
                    GROUP BY key,class
                ) p
                GROUP BY
                    key, class
            ) AS keys_and_nb_values
            ORDER BY
                key, class
            """.format(**kwargs))
| |
| |
def __init_prepared_data(kwargs):
    """
    Fill in values for optional parameters: Create subqueries instead of using
    a relation.

    Mutates the passed-in kwargs dict in place. Keys that are already present
    are left untouched; missing ones are derived from the training data via
    the __get_*_sql helpers.
    """

    if 'classPriorsSource' not in kwargs:
        kwargs.update(
            classPriorsSource = "(" + __get_class_priors_sql(**kwargs) + ")"
        )
    if 'numericAttrsColumnIndices' in kwargs and 'numericFeatureStatsSource' not in kwargs:
        # sanity check the indices before executing any sql query
        __verify_numeric_attr_type(
            kwargs["trainingSource"],
            kwargs["trainingAttrColumn"],
            kwargs["numericAttrsColumnIndices"],
            kwargs["numAttrs"])
        kwargs.update(
            numericFeatureStatsSource = "(" + __get_attr_numeric_stats_sql(**kwargs) + ")"
        )
    elif 'numericFeatureStatsSource' in kwargs:
        # Stats were supplied directly; recover the numeric indices from them.
        kwargs.update(
            numericAttrsColumnIndices = __get_numeric_attr_indices(kwargs['numericFeatureStatsSource'])
        )

    if 'featureProbsSource' not in kwargs:
        kwargs.update(
            featureProbsSource = "(" + __get_feature_probs_sql(**kwargs) + ")"
        )

    # Default smoothing factor: 1 (Laplacian smoothing).
    kwargs.setdefault('smoothingFactor', 1)
| |
def __verify_attr_num(sourceTable, attrColumn, numAttr):
    """
    Verify the number of attributes for each record is as expected.

    @param sourceTable relation containing the records to check
    @param attrColumn  name of the attributes-array column in sourceTable
    @param numAttr     expected length of each attributes array

    Raises an error (via plpy.error) if any row's array length differs from
    numAttr. NOTE: identifiers are interpolated directly into the query; this
    is an internal function and callers are trusted.
    """
    result = plpy.execute("""
        SELECT count(*) size FROM {0}
        WHERE array_upper({1}, 1) <> {2}
        """.format(sourceTable, attrColumn, numAttr)
        )

    dsize = result[0]['size']
    # '!=' replaces the Python-2-only '<>' operator; the '<>' in the SQL
    # above is SQL syntax and deliberately unchanged.
    if dsize != 0:
        plpy.error('found %d records in "%s" where "%s" was not of expected length (%d)'\
            % (dsize, sourceTable, attrColumn, numAttr))
| |
def __verify_numeric_attr_type(sourceTable, attrColumn, numericAttrsColumnIndices, numAttr):
    """
    Verify if the attribute indices provided in numericAttrsColumnIndices are
    actually numeric
    """
    # Every element of the index array must be a non-negative integer literal
    bad_entries = plpy.execute("""
        SELECT * FROM (
            SELECT unnest({0}::TEXT[])
        ) x
        WHERE x.unnest !~ '^[0-9]+$'
        """.format(numericAttrsColumnIndices)
        )

    if len(bad_entries) > 0:
        plpy.error("Invalid array index %s specified for numeric attribute"%bad_entries[0]['unnest'])

    # Fetch the extreme indices so we can bounds-check them below
    bounds = plpy.execute("""
        SELECT max(unnest),min(unnest)
        FROM unnest({0}::INTEGER[])
        """.format(numericAttrsColumnIndices)
        )

    # Both aggregates are NULL iff the index array is empty
    if bounds[0]['max'] is None and bounds[0]['min'] is None:
        plpy.error('No indices specified for numeric attributes')

    # Indices are 1-based and must not exceed the attribute count
    largest = int(bounds[0]['max'])
    smallest = int(bounds[0]['min'])
    attr_count = int(numAttr)
    if largest > attr_count:
        plpy.error('Invalid index value %d is greater than the number of attributes'%largest)
    if smallest < 1:
        plpy.error('Invalid index value %d is less than 1'%smallest)
| |