| /* ----------------------------------------------------------------------- *//** |
| * |
| * @file bayes.sql_in |
| * |
| * @brief SQL functions for naive Bayes |
| * @date January 2011 |
| * |
| * @sa For a brief introduction to Naive Bayes Classification, see the module |
| * description \ref grp_bayes. |
| * |
| *//* ----------------------------------------------------------------------- */ |
| |
| m4_include(`SQLCommon.m4') |
| |
| /** |
| @addtogroup grp_bayes |
| |
| <div class="toc"><b>Contents</b> |
| <ul> |
| <li><a href="#train">Training Function(s)</a></li> |
| <li><a href="#classify">Classify Function(s)</a></li> |
| <li><a href="#probabilities">Probabilities Function(s)</a></li> |
| <li><a href="#adhoc">Ad Hoc Computation</a></li> |
| <li><a href="#notes">Implementation Notes</a></li> |
| <li><a href="#examples">Examples</a></li> |
| <li><a href="#background">Technical Background</a></li> |
| <li><a href="#related">Related Topics</a></li> |
| </ul> |
| </div> |
| |
| @brief Constructs a classification model from a dataset where each attribute |
| independently contributes to the probability that a data point belongs to a |
| category. |
| |
| \warning <em> This MADlib method is still in early stage development. |
| Interface and implementation are subject to change. </em> |
| |
| Naive Bayes refers to a stochastic model where all independent variables |
| \f$ a_1, \dots, a_n \f$ (often referred to as attributes in this context) |
| independently contribute to the probability that a data point belongs to a |
| certain class \f$ c \f$. |
| |
| Naives Bayes classification estimates feature probabilities and class priors |
| using maximum likelihood or Laplacian smoothing. For numeric attributes, |
| Gaussian smoothing can be used to estimate the feature probabilities.These |
| parameters are then used to classify new data. |
| |
| |
| @anchor train |
| @par Training Function(s) |
| |
| |
| For data with only categorical attributes, precompute feature probabilities and class priors |
| using the following function: |
| |
| <pre class="syntax"> |
| create_nb_prepared_data_tables ( trainingSource, |
| trainingClassColumn, |
| trainingAttrColumn, |
| numAttrs, |
| featureProbsName, |
| classPriorsName |
| ) |
| </pre> |
| |
| For data containing both categorical and numeric attributes, use the following form to |
| precompute the Gaussian parameters (mean and variance) for numeric attributes alongside |
| the feature probabilities for categorical attributes and class priors. |
| |
| <pre class="syntax"> |
| create_nb_prepared_data_tables ( trainingSource, |
| trainingClassColumn, |
| trainingAttrColumn, |
| numericAttrsColumnIndices, |
| numAttrs, |
| featureProbsName, |
| numericAttrParamsName, |
| classPriorsName |
| ) |
| </pre> |
| |
| The \e trainingSource is expected to be of the following form: |
| <pre>{TABLE|VIEW} <em>trainingSource</em> ( |
| ... |
| <em>trainingClassColumn</em> INTEGER, |
| <em>trainingAttrColumn</em> INTEGER[] OR NUMERIC[] OR FLOAT8[], |
| ... |
| )</pre> |
| |
| \e numericAttrsColumnIndices should be of type TEXT, specified as an array of |
| indices (starting from 1) in the \e trainingAttrColumn attributes-array that |
| correspond to numeric attributes. |
| |
| The two output tables are: |
| - \e featureProbsName – stores feature probabilities |
| - \e classPriorsName – stores the class priors |
| |
| In addition to the above, if the function specifying numeric attributes is used, |
| an additional table \e numericAttrParamsName is created which stores the |
| Gaussian parameters for the numeric attributes. |
| |
| @anchor classify |
| @par Classify Function(s) |
| |
| Perform Naive Bayes classification: |
| <pre class="syntax"> |
| create_nb_classify_view ( featureProbsName, |
| classPriorsName, |
| classifySource, |
| classifyKeyColumn, |
| classifyAttrColumn, |
| numAttrs, |
| destName |
| ) |
| </pre> |
| |
| For data with numeric attributes, use the following version: |
| |
| <pre class="syntax"> |
| create_nb_classify_view ( featureProbsName, |
| classPriorsName, |
| classifySource, |
| classifyKeyColumn, |
| classifyAttrColumn, |
| numAttrs, |
| numericAttrParamsName, |
| destName |
| ) |
| </pre> |
| |
| |
| The <b>data to classify</b> is expected to be of the following form: |
| <pre>{TABLE|VIEW} <em>classifySource</em> ( |
| ... |
| <em>classifyKeyColumn</em> ANYTYPE, |
| <em>classifyAttrColumn</em> INTEGER[], |
| ... |
| )</pre> |
| |
| |
| This function creates the view <tt><em>destName</em></tt> mapping |
| <em>classifyKeyColumn</em> to the Naive Bayes classification. |
| <pre class="result"> |
| key | nb_classification |
| ---+------------------ |
| ... |
| </pre> |
| |
| @anchor probabilities |
| @par Probabilities Function(s) |
| |
| Compute Naive Bayes probabilities. |
| <pre class="syntax"> |
| create_nb_probs_view( featureProbsName, |
| classPriorsName, |
| classifySource, |
| classifyKeyColumn, |
| classifyAttrColumn, |
| numAttrs, |
| destName |
| ) |
| </pre> |
| |
| For data with numeric attributes , use the following version: |
| |
| <pre class="syntax"> |
| create_nb_probs_view( featureProbsName, |
| classPriorsName, |
| classifySource, |
| classifyKeyColumn, |
| classifyAttrColumn, |
| numAttrs, |
| numericAttrParamsName, |
| destName |
| ) |
| </pre> |
| |
| |
| This creates the view <tt><em>destName</em></tt> mapping |
| <em>classifyKeyColumn</em> and every single class to the Naive Bayes |
| probability: |
| <pre class="result"> |
| key | class | nb_prob |
| ---+-------+-------- |
| ... |
| </pre> |
| |
| @anchor adhoc |
| @par Ad Hoc Computation Function |
| |
| With ad hoc execution (no precomputation), the |
| functions create_nb_classify_view() and create_nb_probs_view() can |
| be used in an ad-hoc fashion without the |
| precomputation step. In this case, replace the function arguments |
| |
| <pre>'<em>featureProbsName</em>', '<em>classPriorsName</em>'</pre> |
| with |
| <pre>'<em>trainingSource</em>', '<em>trainingClassColumn</em>', '<em>trainingAttrColumn</em>'</pre> |
| for data without any any numeric attributes and with |
| <pre>'<em>trainingSource</em>', '<em>trainingClassColumn</em>', '<em>trainingAttrColumn</em>', '<em>numericAttrsColumnIndices</em>'</pre> |
| for data containing numeric attributes as well. |
| |
| |
| @anchor notes |
| @par Implementation Notes |
| - The probabilities computed on the platforms of PostgreSQL and Greenplum |
| database have a small difference due to the nature of floating point |
| computation. Usually this is not important. However, if a data point has |
| \f[ |
| P(C=c_i \mid A) \approx P(C=c_j \mid A) |
| \f] |
| for two classes, this data point might be classified into diferent classes on |
| PostgreSQL and Greenplum. This leads to the differences in classifications |
| on PostgreSQL and Greenplum for some data sets, but this should not |
| affect the quality of the results. |
| |
| - When two classes have equal and highest probability among all classes, |
| the classification result is an array of these two classes, but the order |
| of the two classes is random. |
| |
| - The current implementation of Naive Bayes classification is suitable |
| for discontinuous (categorial) attributes as well as continuous (numeric) |
| attributes.\n |
| For continuous data, a typical assumption, usually used for small datasets, |
| is that the continuous values associated with each class are distributed |
| according to a Gaussian distribution, |
| and the probabilities \f$ P(A_i = a \mid C=c) \f$ are estimated using the |
| Gaussian Distribution formula: |
| \f[ |
| P(A_i=a \mid C=c) = \frac{1}{\sqrt{2\pi\sigma^{2}_c}}exp\left(-\frac{(a-\mu_c)^{2}}{2\sigma^{2}_c}\right) |
| \f] |
| where \f$\mu_c\f$ and \f$\sigma^{2}_c\f$ are the population mean and variance |
| of the attribute for the class \f$c\f$.\n |
| Another common technique for handling continuous values, which is better for |
| large data sets, is to use binning to discretize the values, and convert the |
| continuous data into categorical bins. This approach is currently not implemented. |
| |
| - One can provide floating point data to the Naive Bayes |
| classification function. If the corresponding attribute index is not specified |
| in \e numericAttrsColumnIndices, floating point numbers will be used as symbolic |
| substitutions for categorial data. In this case, the classification would work |
| best if there are sufficient data points for each floating point attribute. However, |
| if floating point numbers are used as continuous data without the attribute being |
| marked as of type numeric in \e numericAttrsColumnIndices, no warning is raised |
| and the result may not be as expected. |
| |
| @anchor examples |
| @examp |
| |
| The following is an extremely simplified example of the above option #1 which |
| can by verified by hand. |
| |
| -# The training and the classification data. |
| <pre class="example"> |
| SELECT * FROM training; |
| </pre> |
| Result: |
| <pre class="result"> |
| id | class | attributes |
| ---+-------+------------ |
| 1 | 1 | {1,2,3} |
| 2 | 1 | {1,2,1} |
| 3 | 1 | {1,4,3} |
| 4 | 2 | {1,2,2} |
| 5 | 2 | {0,2,2} |
| 6 | 2 | {0,1,3} |
| (6 rows) |
| </pre> |
| <pre class="example"> |
| SELECT * FROM toclassify; |
| </pre> |
| Result: |
| <pre class="result"> |
| id | attributes |
| ---+------------ |
| 1 | {0,2,1} |
| 2 | {1,2,3} |
| (2 rows) |
| </pre> |
| |
| -# Precompute feature probabilities and class priors. |
| <pre class="example"> |
| SELECT madlib.create_nb_prepared_data_tables( 'training', |
| 'class', |
| 'attributes', |
| 3, |
| 'nb_feature_probs', |
| 'nb_class_priors' |
| ); |
| </pre> |
| |
| -# Optionally check the contents of the precomputed tables. |
| <pre class="example"> |
| SELECT * FROM nb_class_priors; |
| </pre> |
| Result: |
| <pre class="result"> |
| class | class_cnt | all_cnt |
| ------+-----------+--------- |
| 1 | 3 | 6 |
| 2 | 3 | 6 |
| (2 rows) |
| </pre> |
| <pre class="example"> |
| SELECT * FROM nb_feature_probs; |
| </pre> |
| Result: |
| <pre class="result"> |
| class | attr | value | cnt | attr_cnt |
| ------+------+-------+-----+---------- |
| 1 | 1 | 0 | 0 | 2 |
| 1 | 1 | 1 | 3 | 2 |
| 1 | 2 | 1 | 0 | 3 |
| 1 | 2 | 2 | 2 | 3 |
| ... |
| </pre> |
| |
| -# Create the view with Naive Bayes classification and check the results. |
| <pre class="example"> |
| SELECT madlib.create_nb_classify_view( 'nb_feature_probs', |
| 'nb_class_priors', |
| 'toclassify', |
| 'id', |
| 'attributes', |
| 3, |
| 'nb_classify_view_fast' |
| ); |
| |
| SELECT * FROM nb_classify_view_fast; |
| </pre> |
| Result: |
| <pre class="result"> |
| key | nb_classification |
| ----+------------------- |
| 1 | {2} |
| 2 | {1} |
| (2 rows) |
| </pre> |
| |
| -# Look at the probabilities for each class (note that we use "Laplacian smoothing"), |
| <pre class="example"> |
| SELECT madlib.create_nb_probs_view( 'nb_feature_probs', |
| 'nb_class_priors', |
| 'toclassify', |
| 'id', |
| 'attributes', |
| 3, |
| 'nb_probs_view_fast' |
| ); |
| |
| SELECT * FROM nb_probs_view_fast; |
| </pre> |
| Result: |
| <pre class="result"> |
| key | class | nb_prob |
| ----+-------+--------- |
| 1 | 1 | 0.4 |
| 1 | 2 | 0.6 |
| 2 | 1 | 0.75 |
| 2 | 2 | 0.25 |
| (4 rows) |
| </pre> |
| |
| The following is an example of using a dataset with both numeric and |
| categorical attributes |
| |
| -# The training and the classification data. Attributes {height(numeric),weight(numeric),shoe size(categorical)}, |
| Class{sex(1=male,2=female)} |
| <pre class="example"> |
| SELECT * FROM gaussian_data; |
| </pre> |
| Result: |
| <pre class="result"> |
| id | sex | attributes |
| ----+-----+--------------- |
| 1 | 1 | {6,180,12} |
| 2 | 1 | {5.92,190,12} |
| 3 | 1 | {5.58,170,11} |
| 4 | 1 | {5.92,165,11} |
| 5 | 2 | {5,100,6} |
| 6 | 2 | {5.5,150,6} |
| 7 | 2 | {5.42,130,7} |
| 8 | 2 | {5.75,150,8} |
| (8 rows) |
| </pre> |
| <pre class="example"> |
| SELECT * FROM gaussian_test; |
| </pre> |
| Result: |
| <pre class="result"> |
| id | sex | attributes |
| ----+-----+-------------- |
| 9 | 1 | {5.8,180,11} |
| 10 | 2 | {5,160,6} |
| (2 rows) |
| </pre> |
| |
| -# Precompute feature probabilities and class priors. |
| <pre class="example"> |
| SELECT madlib.create_nb_prepared_data_tables( 'gaussian_data', |
| 'sex', |
| 'attributes', |
| 'ARRAY[1,2]', |
| 3, |
| 'categ_feature_probs', |
| 'numeric_attr_params', |
| 'class_priors' |
| ); |
| </pre> |
| |
| -# Optionally check the contents of the precomputed tables. |
| <pre class="example"> |
| SELECT * FROM class_priors; |
| </pre> |
| Result: |
| <pre class="result"> |
| class | class_cnt | all_cnt |
| -------+-----------+--------- |
| 1 | 4 | 8 |
| 2 | 4 | 8 |
| (2 rows) |
| </pre> |
| <pre class="example"> |
| SELECT * FROM categ_feature_probs; |
| </pre> |
| Result: |
| <pre class="result"> |
| class | attr | value | cnt | attr_cnt |
| -------+------+-------+-----+---------- |
| 2 | 3 | 6 | 2 | 5 |
| 1 | 3 | 12 | 2 | 5 |
| 2 | 3 | 7 | 1 | 5 |
| 1 | 3 | 11 | 2 | 5 |
| 2 | 3 | 8 | 1 | 5 |
| 2 | 3 | 12 | 0 | 5 |
| 1 | 3 | 6 | 0 | 5 |
| 2 | 3 | 11 | 0 | 5 |
| 1 | 3 | 8 | 0 | 5 |
| 1 | 3 | 7 | 0 | 5 |
| (10 rows) |
| </pre> |
| <pre class="example"> |
| SELECT * FROM numeric_attr_params; |
| </pre> |
| Result: |
| <pre class="result"> |
| class | attr | attr_mean | attr_var |
| -------+------+----------------------+------------------------ |
| 1 | 1 | 5.8550000000000000 | 0.03503333333333333333 |
| 1 | 2 | 176.2500000000000000 | 122.9166666666666667 |
| 2 | 1 | 5.4175000000000000 | 0.09722500000000000000 |
| 2 | 2 | 132.5000000000000000 | 558.3333333333333333 |
| (4 rows) |
| </pre> |
| |
| -# Create the view with Naive Bayes classification and check the results. |
| <pre class="example"> |
| SELECT madlib.create_nb_classify_view( 'categ_feature_probs', |
| 'class_priors', |
| 'gaussian_test', |
| 'id', |
| 'attributes', |
| 3, |
| 'numeric_attr_params', |
| 'classify_view' |
| ); |
| |
| SELECT * FROM classify_view; |
| </pre> |
| Result: |
| <pre class="result"> |
| key | nb_classification |
| ----+------------------- |
| 9 | {1} |
| 10 | {2} |
| (2 rows) |
| </pre> |
| |
| -# Look at the probabilities for each class |
| <pre class="example"> |
| SELECT madlib.create_nb_probs_view( 'categ_feature_probs', |
| 'class_priors', |
| 'gaussian_test', |
| 'id', |
| 'attributes', |
| 3, |
| 'numeric_attr_params', |
| 'probs_view' |
| ); |
| |
| SELECT * FROM probs_view; |
| </pre> |
| Result: |
| <pre class="result"> |
| key | class | nb_prob |
| -----+-------+---------------------- |
| 9 | 1 | 0.993556745948775 |
| 9 | 2 | 0.00644325405122553 |
| 10 | 1 | 5.74057538627122e-05 |
| 10 | 2 | 0.999942594246137 |
| (4 rows) |
| </pre> |
| |
| |
| @anchor background |
| @par Technical Background |
| |
| In detail, \b Bayes' theorem states that |
| \f[ |
| \Pr(C = c \mid A_1 = a_1, \dots, A_n = a_n) |
| = \frac{\Pr(C = c) \cdot \Pr(A_1 = a_1, \dots, A_n = a_n \mid C = c)} |
| {\Pr(A_1 = a_1, \dots, A_n = a_n)} |
| \,, |
| \f] |
| and the \b naive assumption is that |
| \f[ |
| \Pr(A_1 = a_1, \dots, A_n = a_n \mid C = c) |
| = \prod_{i=1}^n \Pr(A_i = a_i \mid C = c) |
| \,. |
| \f] |
| Naives Bayes classification estimates feature probabilities and class priors |
| using maximum likelihood or Laplacian smoothing. These parameters are then used |
| to classifying new data. |
| |
| A Naive Bayes classifier computes the following formula: |
| \f[ |
| \text{classify}(a_1, ..., a_n) |
| = \arg\max_c \left\{ |
| \Pr(C = c) \cdot \prod_{i=1}^n \Pr(A_i = a_i \mid C = c) |
| \right\} |
| \f] |
| where \f$ c \f$ ranges over all classes in the training data and probabilites |
| are estimated with relative frequencies from the training set. |
| There are different ways to estimate the feature probabilities |
| \f$ P(A_i = a \mid C = c) \f$. The maximum likelihood estimate takes the |
| relative frequencies. That is: |
| \f[ |
| P(A_i = a \mid C = c) = \frac{\#(c,i,a)}{\#c} |
| \f] |
| where |
| - \f$ \#(c,i,a) \f$ denotes the # of training samples where attribute \f$ i \f$ |
| is \f$ a \f$ and class is \f$ c \f$ |
| - \f$ \#c \f$ denotes the # of training samples where class is \f$ c \f$. |
| |
| Since the maximum likelihood sometimes results in estimates of "0", you might |
| want to use a "smoothed" estimate. To do this, you add a number of "virtual" |
| samples and make the assumption that these samples are evenly distributed among |
| the values assumed by attribute \f$ i \f$ (that is, the set of all values |
| observed for attribute \f$ a \f$ for any class): |
| |
| \f[ |
| P(A_i = a \mid C = c) = \frac{\#(c,i,a) + s}{\#c + s \cdot \#i} |
| \f] |
| where |
| - \f$ \#i \f$ denotes the # of distinct values for attribute \f$ i \f$ (for all |
| classes) |
| - \f$ s \geq 0 \f$ denotes the smoothing factor. |
| |
| The case \f$ s = 1 \f$ is known as "Laplace smoothing". The case \f$ s = 0 \f$ |
| trivially reduces to maximum-likelihood estimates. |
| |
| |
| @anchor literature |
| @literature |
| |
| [1] Tom Mitchell: Machine Learning, McGraw Hill, 1997. Book chapter |
| <em>Generativ and Discriminative Classifiers: Naive Bayes and Logistic |
| Regression</em> available at: http://www.cs.cmu.edu/~tom/NewChapters.html |
| |
| [2] Wikipedia, Naive Bayes classifier, |
| http://en.wikipedia.org/wiki/Naive_Bayes_classifier |
| |
| @anchor related |
| @par Related Topics |
| File bayes.sql_in documenting the SQL functions. |
| |
| @internal |
| @sa namespace bayes (documenting the implementation in Python) |
| @endinternal |
| |
| */ |
| |
| -- Begin of argmax definition |
| |
| DROP TYPE IF EXISTS MADLIB_SCHEMA.args_and_value_double CASCADE; |
| CREATE TYPE MADLIB_SCHEMA.args_and_value_double AS ( |
| args INTEGER[], |
| value DOUBLE PRECISION |
| ); |
| |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.argmax_transition( |
| oldmax MADLIB_SCHEMA.ARGS_AND_VALUE_DOUBLE, |
| newkey INTEGER, |
| newvalue DOUBLE PRECISION) |
| RETURNS MADLIB_SCHEMA.args_and_value_double AS |
| $$ |
| SELECT CASE WHEN $3 < $1.value OR $2 IS NULL OR ($3 IS NULL AND NOT $1.value IS NULL) THEN $1 |
| WHEN $3 = $1.value OR ($3 IS NULL AND $1.value IS NULL AND NOT $1.args IS NULL) |
| THEN ($1.args || $2, $3)::MADLIB_SCHEMA.args_and_value_double |
| ELSE (array[$2], $3)::MADLIB_SCHEMA.args_and_value_double |
| END |
| $$ |
| LANGUAGE sql IMMUTABLE |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `CONTAINS SQL', `'); |
| |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.argmax_combine( |
| max1 MADLIB_SCHEMA.ARGS_AND_VALUE_DOUBLE, |
| max2 MADLIB_SCHEMA.ARGS_AND_VALUE_DOUBLE) |
| RETURNS MADLIB_SCHEMA.ARGS_AND_VALUE_DOUBLE AS |
| $$ |
| -- If SQL guaranteed short-circuit evaluation, the following could become |
| -- shorter. Unfortunately, this is not the case. |
| -- Section 6.3.3.3 of ISO/IEC 9075-1:2008 Framework (SQL/Framework): |
| -- |
| -- "However, it is implementation-dependent whether expressions are |
| -- actually evaluated left to right, particularly when operands or |
| -- operators might cause conditions to be raised or if the results of the |
| -- expressions can be determined without completely evaluating all parts |
| -- of the expression." |
| -- |
| -- Again, the optimizer does its job hopefully. |
| SELECT CASE WHEN $1 IS NULL THEN $2 |
| WHEN $2 IS NULL THEN $1 |
| WHEN ($1.value = $2.value) OR ($1.value IS NULL AND $2.value IS NULL) |
| THEN ($1.args || $2.args, $1.value)::MADLIB_SCHEMA.ARGS_AND_VALUE_DOUBLE |
| WHEN $1.value IS NULL OR $1.value < $2.value THEN $2 |
| ELSE $1 |
| END |
| $$ |
| LANGUAGE sql IMMUTABLE |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `CONTAINS SQL', `'); |
| |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.argmax_final( |
| finalstate MADLIB_SCHEMA.ARGS_AND_VALUE_DOUBLE) |
| RETURNS INTEGER[] AS |
| $$ |
| SELECT $1.args |
| $$ |
| LANGUAGE sql IMMUTABLE |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `CONTAINS SQL', `'); |
| |
| /** |
| * @internal |
| * @brief Argmax: Return the key of the row for which value is maximal |
| * |
| * The "index set" of the argmax function is of type INTEGER and we range over |
| * DOUBLE PRECISION values. It is not required that all keys are distinct. |
| * |
| * @note |
| * argmax should only be used on unsorted data because it will not exploit |
| * indices, and its running time is \f$ \Theta(n) \f$. |
| * |
| * @implementation |
| * The implementation is in SQL, with a flavor of functional programming. |
| * The hope is that the optimizer does a good job here. |
| */ |
| DROP AGGREGATE IF EXISTS MADLIB_SCHEMA.argmax(/*+ key */ INTEGER, /*+ value */ DOUBLE PRECISION) CASCADE; |
| CREATE AGGREGATE MADLIB_SCHEMA.argmax(/*+ key */ INTEGER, /*+ value */ DOUBLE PRECISION) ( |
| SFUNC=MADLIB_SCHEMA.argmax_transition, |
| STYPE=MADLIB_SCHEMA.ARGS_AND_VALUE_DOUBLE, |
| m4_ifdef(`__POSTGRESQL__', `', `prefunc=MADLIB_SCHEMA.argmax_combine,') |
| FINALFUNC=MADLIB_SCHEMA.argmax_final |
| ); |
| |
| |
| /** |
| * @brief Precompute all class priors and feature probabilities |
| * |
| * Feature probabilities are stored in a table of format |
| * <pre>TABLE <em>featureProbsDestName</em> ( |
| * class INTEGER, |
| * attr INTEGER, |
| * value INTEGER, |
| * cnt INTEGER, |
| * attr_cnt INTEGER |
| *)</pre> |
| * |
| * Class priors are stored in a table of format |
| * <pre>TABLE <em>classPriorsDestName</em> ( |
| * class INTEGER, |
| * class_cnt INTEGER, |
| * all_cnt INTEGER |
| *)</pre> |
| * |
| * @param trainingSource Name of relation containing the training data |
| * @param trainingClassColumn Name of class column in training data |
| * @param trainingAttrColumn Name of attributes-array column in training data |
| * @param numAttrs Number of attributes to use for classification |
| * @param featureProbsDestName Name of feature-probabilities table to create |
| * @param classPriorsDestName Name of class-priors table to create |
| * |
| * @usage |
| * Precompute feature probabilities and class priors: |
| * <pre>SELECT \ref create_nb_prepared_data_tables( |
| * '<em>trainingSource</em>', '<em>trainingClassColumn</em>', '<em>trainingAttrColumn</em>', |
| * <em>numAttrs</em>, '<em>featureProbsName</em>', '<em>classPriorsName</em>' |
| *);</pre> |
| * |
| * @internal |
| * @sa This function is a wrapper for bayes::create_prepared_data(). |
| */ |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.create_nb_prepared_data_tables( |
| "trainingSource" VARCHAR, |
| "trainingClassColumn" VARCHAR, |
| "trainingAttrColumn" VARCHAR, |
| "numAttrs" INTEGER, |
| "featureProbsDestName" VARCHAR, |
| "classPriorsDestName" VARCHAR) |
| RETURNS VOID |
| AS $$PythonFunction(bayes, bayes, create_prepared_data_table)$$ |
| LANGUAGE plpythonu VOLATILE |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `'); |
| |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.create_nb_prepared_data_tables( |
| "trainingSource" VARCHAR, |
| "trainingClassColumn" VARCHAR, |
| "trainingAttrColumn" VARCHAR, |
| "numericAttrsColumnIndices" VARCHAR, |
| "numAttrs" INTEGER, |
| "featureProbsDestName" VARCHAR, |
| "numericFeatureStatsDestName" VARCHAR, |
| "classPriorsDestName" VARCHAR) |
| RETURNS VOID |
| AS $$PythonFunction(bayes, bayes, create_prepared_data_table)$$ |
| LANGUAGE plpythonu VOLATILE |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `'); |
| |
| |
| /** |
| * @brief Create a view with columns <tt>(key, nb_classification)</tt> |
| * |
| * The created relation will be |
| * |
| * <tt>{TABLE|VIEW} <em>destName</em> (key, nb_classification)</tt> |
| * |
| * where \c nb_classification is an array containing the most likely |
| * class(es) of the record in \em classifySource identified by \c key. |
| * |
| * @param featureProbsSource Name of table with precomputed feature |
| * probabilities, as created with create_nb_prepared_data_tables() |
| * @param classPriorsSource Name of table with precomputed class priors, as |
| * created with create_nb_prepared_data_tables() |
| * @param classifySource Name of the relation that contains data to be classified |
| * @param classifyKeyColumn Name of column in \em classifySource that can |
| * serve as unique identifier (the key of the source relation) |
| * @param classifyAttrColumn Name of attributes-array column in \em classifySource |
| * @param numAttrs Number of attributes to use for classification |
| * @param destName Name of the view to create |
| * |
| * @note \c create_nb_classify_view can be called in an ad-hoc fashion. See |
| * \ref grp_bayes for instructions. |
| * |
| * @usage |
| * -# Create Naive Bayes classifications view: |
| * <pre>SELECT \ref create_nb_classify_view( |
| * '<em>featureProbsName</em>', '<em>classPriorsName</em>', |
| * '<em>classifySource</em>', '<em>classifyKeyColumn</em>', '<em>classifyAttrColumn</em>', |
| * <em>numAttrs</em>, '<em>destName</em>' |
| *);</pre> |
| * -# Show Naive Bayes classifications: |
| * <pre>SELECT * FROM <em>destName</em>;</pre> |
| * |
| * @internal |
| * @sa This function is a wrapper for bayes::create_classification(). See there |
| * for details. |
| */ |
| |
| /* default API without any numeric attributes and pre-trained data*/ |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.create_nb_classify_view( |
| "featureProbsSource" VARCHAR, |
| "classPriorsSource" VARCHAR, |
| "classifySource" VARCHAR, |
| "classifyKeyColumn" VARCHAR, |
| "classifyAttrColumn" VARCHAR, |
| "numAttrs" INTEGER, |
| "destName" VARCHAR) |
| RETURNS VOID |
| AS $$PythonFunction(bayes, bayes, create_classification_view)$$ |
| LANGUAGE plpythonu VOLATILE |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `'); |
| |
| /* API with numeric attributes and pre-trained data */ |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.create_nb_classify_view( |
| "featureProbsSource" VARCHAR, |
| "classPriorsSource" VARCHAR, |
| "classifySource" VARCHAR, |
| "classifyKeyColumn" VARCHAR, |
| "classifyAttrColumn" VARCHAR, |
| "numAttrs" INTEGER, |
| "numericFeatureStatsSource" VARCHAR, |
| "destName" VARCHAR) |
| RETURNS VOID |
| AS $$PythonFunction(bayes, bayes, create_classification_view)$$ |
| LANGUAGE plpythonu VOLATILE |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `'); |
| |
| /* API without numeric attrs but ad-hoc computation */ |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.create_nb_classify_view( |
| "trainingSource" VARCHAR, |
| "trainingClassColumn" VARCHAR, |
| "trainingAttrColumn" VARCHAR, |
| "classifySource" VARCHAR, |
| "classifyKeyColumn" VARCHAR, |
| "classifyAttrColumn" VARCHAR, |
| "numAttrs" INTEGER, |
| "destName" VARCHAR) |
| RETURNS VOID |
| AS $$PythonFunction(bayes, bayes, create_classification_view)$$ |
| LANGUAGE plpythonu VOLATILE |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `'); |
| |
| /* API with numeric attrs but ad-hoc computation */ |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.create_nb_classify_view( |
| "trainingSource" VARCHAR, |
| "trainingClassColumn" VARCHAR, |
| "trainingAttrColumn" VARCHAR, |
| "numericAttrsColumnIndices" VARCHAR, |
| "classifySource" VARCHAR, |
| "classifyKeyColumn" VARCHAR, |
| "classifyAttrColumn" VARCHAR, |
| "numAttrs" INTEGER, |
| "destName" VARCHAR) |
| RETURNS VOID |
| AS $$PythonFunction(bayes, bayes, create_classification_view)$$ |
| LANGUAGE plpythonu VOLATILE |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `'); |
| |
| /** |
| * @brief Create view with columns <tt>(key, class, nb_prob)</tt> |
| * |
| * The created view will be of the following form: |
| * |
| * <pre>VIEW <em>destName</em> ( |
| * key ANYTYPE, |
| * class INTEGER, |
| * nb_prob FLOAT8 |
| *)</pre> |
| * |
| * where \c nb_prob is the Naive-Bayes probability that \c class is the true |
| * class of the record in \em classifySource identified by \c key. |
| * |
| * @param featureProbsSource Name of table with precomputed feature |
| * probabilities, as created with create_nb_prepared_data_tables() |
| * @param classPriorsSource Name of table with precomputed class priors, as |
| * created with create_nb_prepared_data_tables() |
| * @param classifySource Name of the relation that contains data to be classified |
| * @param classifyKeyColumn Name of column in \em classifySource that can |
| * serve as unique identifier (the key of the source relation) |
| * @param classifyAttrColumn Name of attributes-array column in \em classifySource |
| * @param numAttrs Number of attributes to use for classification |
| * @param destName Name of the view to create |
| * |
| * @note \c create_nb_probs_view can be called in an ad-hoc fashion. See |
| * \ref grp_bayes for instructions. |
| * |
| * @usage |
| * -# Create Naive Bayes probabilities view: |
| * <pre>SELECT \ref create_nb_probs_view( |
| * '<em>featureProbsName</em>', '<em>classPriorsName</em>', |
| * '<em>classifySource</em>', '<em>classifyKeyColumn</em>', '<em>classifyAttrColumn</em>', |
| * <em>numAttrs</em>, '<em>destName</em>' |
| *);</pre> |
| * -# Show Naive Bayes probabilities: |
| * <pre>SELECT * FROM <em>destName</em>;</pre> |
| * |
| * @internal |
| * @sa This function is a wrapper for bayes::create_bayes_probabilities(). |
| */ |
| |
| /* API without support for numeric attributes and with support for pre-computed data */ |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.create_nb_probs_view( |
| "featureProbsSource" VARCHAR, |
| "classPriorsSource" VARCHAR, |
| "classifySource" VARCHAR, |
| "classifyKeyColumn" VARCHAR, |
| "classifyAttrColumn" VARCHAR, |
| "numAttrs" INTEGER, |
| "destName" VARCHAR) |
| RETURNS VOID |
| AS $$PythonFunction(bayes, bayes, create_bayes_probabilities_view)$$ |
| LANGUAGE plpythonu VOLATILE |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `'); |
| |
| /* API with support for numeric attributes and with support for pre-computed data */ |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.create_nb_probs_view( |
| "featureProbsSource" VARCHAR, |
| "classPriorsSource" VARCHAR, |
| "classifySource" VARCHAR, |
| "classifyKeyColumn" VARCHAR, |
| "classifyAttrColumn" VARCHAR, |
| "numAttrs" INTEGER, |
| "numericFeatureStatsSource" VARCHAR, |
| "destName" VARCHAR) |
| RETURNS VOID |
| AS $$PythonFunction(bayes, bayes, create_bayes_probabilities_view)$$ |
| LANGUAGE plpythonu VOLATILE |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `'); |
| |
| /* API without support for numeric attributes and without support for pre-computed data */ |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.create_nb_probs_view( |
| "trainingSource" VARCHAR, |
| "trainingClassColumn" VARCHAR, |
| "trainingAttrColumn" VARCHAR, |
| "classifySource" VARCHAR, |
| "classifyKeyColumn" VARCHAR, |
| "classifyAttrColumn" VARCHAR, |
| "numAttrs" INTEGER, |
| "destName" VARCHAR) |
| RETURNS VOID |
| AS $$PythonFunction(bayes, bayes, create_bayes_probabilities_view)$$ |
| LANGUAGE plpythonu VOLATILE |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `'); |
| |
| /* API with support for numeric attributes but without support for pre-computed data */ |
| CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.create_nb_probs_view( |
| "trainingSource" VARCHAR, |
| "trainingClassColumn" VARCHAR, |
| "trainingAttrColumn" VARCHAR, |
| "numericAttrsColumnIndices" VARCHAR, |
| "classifySource" VARCHAR, |
| "classifyKeyColumn" VARCHAR, |
| "classifyAttrColumn" VARCHAR, |
| "numAttrs" INTEGER, |
| "destName" VARCHAR) |
| RETURNS VOID |
| AS $$PythonFunction(bayes, bayes, create_bayes_probabilities_view)$$ |
| LANGUAGE plpythonu VOLATILE |
| m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `'); |