blob: a06824d0fb36ba946c1c4761b559c45cfb9611b2 [file] [log] [blame]
/* ----------------------------------------------------------------------- *//**
*
* @file bayes.sql_in
*
* @brief SQL functions for naive Bayes
* @date January 2011
*
* @sa For a brief introduction to Naive Bayes Classification, see the module
* description \ref grp_bayes.
*
*//* --------------------------------------------------------------------------
*
* This file is preprocessed with m4. Macro expansion can be turned off by
* enclosing text in <nom4> and </nom4>.
*/
changequote(`<nom4>', `</nom4>')
/**
@addtogroup grp_bayes
@about
Naive Bayes classification with user-defined smoothing factor (default:
Laplacian smoothing).
A Naive Bayes classifier computes the following formula:
\f[
\text{classify}(a_1, ..., a_n)
= \arg\max_c \left\{
P(C = c) \cdot \prod_{i=1}^n P(A_i = a_i \mid C = c)
\right\}
\f]
where $c$ ranges over all classes in the training data and probabilities are
estimated with relative frequencies from the training set.
There are different ways to estimate the feature probabilities
\f$ P(A_i = a \mid C = c) \f$. The maximum likelihood estimate takes the relative
frequencies. That is:
\f[
P(A_i = a \mid C = c) = \frac{\#(c,i,a)}{\#c}
\f]
where
- \f$ \#(c,i,a) \f$ denotes the # of training samples where attribute $i$ is $a$ and class
is $c$
- \f$ \#c \f$ denotes the # of training samples where class is $c$.
Since the maximum likelihood sometimes results in estimates of 0, it might
be desirable to use a "smoothed" estimate. Intuitively, one adds a number of
"virtual" samples and assumes that these samples are evenly distributed
among the values attribute $i$ can assume (i.e., the set of all values observed
for attribute $i$ for any class):
\f[
P(A_i = a \mid C = c) = \frac{\#(c,i,a) + s}{\#c + s \cdot \#i}
\f]
where
- \f$ \#i \f$ denotes the # of distinct values for attribute $i$ (for all
classes)
- \f$ s \geq 0 \f$ denotes the smoothing factor.
The case $s = 1$ is known as "Laplace smoothing" in the literature. The case
$s = 0$ trivially reduces to maximum-likelihood estimates.
@prereq
Implemented in Python and SQL for PostgreSQL/Greenplum. The Python code only
serves to generate SQL and should be portable.
@usage
-# The training data is expected to be of the following form:\n
<tt>{TABLE|VIEW} <em>trainingSource</em> ([...] <em>trainingClassColumn</em>
INTEGER, <em>trainingAttrColumn</em> INTEGER[]
[...])</tt>
-# (Optional step:) Precompute feature probabilities and class priors for faster
classification later:\n
<tt>SELECT create_nb_prepared_data_tables('<em>trainingSource</em>',
'<em>trainingClassColumn</em>', '<em>trainingAttrColumn</em>', numAttrs,
'<em>featureProbsName</em>', '<em>classPriorsName</em>')</tt>\n
\n
This creates the two tables <tt><em>featureProbsName</em>(class INTEGER,
attr INTEGER, value INTEGER, cnt INTEGER, attr_cnt INTEGER))</tt> and
<tt><em>classPriorsName</em>(class INTEGER, class_cnt INTEGER, all_cnt
INTEGER)</tt>.
-# The data that we want to run Naive Bayes classification on is expected to be
of the following form:\n
<tt>{TABLE|VIEW} <em>classifySource</em> ([...] <em>classifyKeyColumn</em>
ANYTYPE, <em>classifyAttrColumn</em> INTEGER[] [...])</tt>\n
-# Run Naive Bayes classification as follows:\n
<tt>SELECT create_nb_classify_view('<em>featureProbsName</em>',
'<em>classPriorsName</em>', '<em>classifySource</em>',
'<em>classifyKeyColumn</em>', '<em>classifyAttrColumn</em>', numAttrs,
'<em>destName</em>')</tt>\n
\n
This creates a view <tt><em>destName</em>(key ANYTYPE,
nb_classification INTEGER)</tt> which contains a row for every row in
<tt><em>classifySource</em></tt>, where <tt>key</tt> =
<tt><em>classifyKeyColumn</em></tt> and <tt>nb_classification</tt>
is the Naive Bayes classification.
@examp
The following is an extremely simplified example which can be verified by hand.
@verbatim
# -- The training data
# select * from bayes;
id | class | attributes
----+-------+------------
1 | 1 | {1,2,3}
3 | 1 | {1,4,3}
5 | 2 | {0,2,2}
2 | 1 | {1,2,1}
4 | 2 | {1,2,2}
6 | 2 | {0,1,3}
(6 rows)
# -- Precompute feature probabilities and class priors
# select create_nb_prepared_data_tables(
# 'bayes', 'class', 'attributes', 3,
# 'nb_feature_probs', 'nb_class_priors');
# select * from nb_feature_probs;
class | attr | value | cnt | attr_cnt
-------+------+-------+-----+----------
1 | 1 | 0 | 0 | 2
1 | 1 | 1 | 3 | 2
1 | 2 | 1 | 0 | 3
1 | 2 | 2 | 2 | 3
[...]
# select * from nb_class_priors;
class | class_cnt | all_cnt
-------+-----------+---------
1 | 3 | 6
2 | 3 | 6
(2 rows)
# -- Data we want to run Naive Bayes classification on
# select * from toclassify;
id | attributes
----+------------
1 | {0,2,1}
2 | {1,2,3}
# -- Run Naive Bayes
# select create_nb_classify_view(
# 'nb_feature_probs', 'nb_class_priors',
# 'toclassify', 'id', 'attributes', 3,
# 'nb_classify_view_fast');
# select * from nb_classify_view_fast;
key | nb_classification
-----+-------------------
1 | {2}
2 | {1}
(2 rows)
# -- Look at the probabilities for each class (Note we use Laplacian Smoothing):
# select create_nb_probs_view(
# 'nb_feature_probs', 'nb_class_priors',
# 'toclassify', 'id', 'attributes', 3,
# 'nb_probs_view_fast');
# select * from nb_probs_view_fast;
key | class | nb_prob
-----+-------+---------
1 | 1 | 0.4
1 | 2 | 0.6
2 | 1 | 0.75
2 | 2 | 0.25
(4 rows)
@endverbatim
@sa file bayes.sql_in (documenting the SQL functions)
@internal
@sa namespace bayes (documenting the implementation in Python)
@endinternal
@literature
[1] Tom Mitchell: Machine Learning, McGraw Hill, 1997. Book chapter
<em>Generative and Discriminative Classifiers: Naive Bayes and Logistic
Regression</em> available at: http://www.cs.cmu.edu/~tom/NewChapters.html
[2] Wikipedia, Naive Bayes classifier,
http://en.wikipedia.org/wiki/Naive_Bayes_classifier
*/
-- Begin of argmax definition
-- Internal state/result type for the argmax aggregate below. The "args"
-- array accumulates every key that attains the current maximum "value",
-- so ties are reported as a set instead of being broken arbitrarily.
CREATE TYPE MADLIB_SCHEMA.ARGS_AND_VALUE_DOUBLE AS (
args INTEGER[],
value DOUBLE PRECISION
);
-- Transition function of the argmax aggregate: fold one (key, value) pair
-- into the running state. Semantics under SQL three-valued logic:
-- * a NULL key is skipped (state unchanged);
-- * a NULL value ranks below every non-NULL value;
-- * on a tie (or when both the new and stored value are NULL), the key is
--   appended to the array of maximizing keys instead of replacing it.
CREATE FUNCTION MADLIB_SCHEMA.argmax_transition(
oldmax MADLIB_SCHEMA.ARGS_AND_VALUE_DOUBLE,
newkey INTEGER,
newvalue DOUBLE PRECISION)
RETURNS MADLIB_SCHEMA.ARGS_AND_VALUE_DOUBLE AS
$$
-- Case 1: keep the old state -- the new value is smaller, the new key is
-- NULL, or the new value is NULL while the stored maximum is not.
SELECT CASE WHEN $3 < $1.value OR $2 IS NULL OR ($3 IS NULL AND NOT $1.value IS NULL) THEN $1
-- Case 2: tie (or both values NULL with a non-empty key array): append.
WHEN $3 = $1.value OR ($3 IS NULL AND $1.value IS NULL AND NOT $1.args IS NULL)
THEN ($1.args || $2, $3)::MADLIB_SCHEMA.ARGS_AND_VALUE_DOUBLE
-- Case 3: strictly larger value (or first usable pair): restart the set.
ELSE (array[$2], $3)::MADLIB_SCHEMA.ARGS_AND_VALUE_DOUBLE
END
$$
LANGUAGE sql IMMUTABLE;
-- Combine (pre-)function of the argmax aggregate: merge two partial states
-- produced on different segments. A NULL state acts as the identity; equal
-- maxima (or two NULL maxima) concatenate their key arrays; otherwise the
-- state with the larger (non-NULL) value wins.
CREATE FUNCTION MADLIB_SCHEMA.argmax_combine(
max1 MADLIB_SCHEMA.ARGS_AND_VALUE_DOUBLE,
max2 MADLIB_SCHEMA.ARGS_AND_VALUE_DOUBLE)
RETURNS MADLIB_SCHEMA.ARGS_AND_VALUE_DOUBLE AS
$$
-- If SQL guaranteed short-circuit evaluation, the following could become
-- shorter. Unfortunately, this is not the case.
-- Section 6.3.3.3 of ISO/IEC 9075-1:2008 Framework (SQL/Framework):
--
-- "However, it is implementation-dependent whether expressions are
-- actually evaluated left to right, particularly when operands or
-- operators might cause conditions to be raised or if the results of the
-- expressions can be determined without completely evaluating all parts
-- of the expression."
--
-- Again, the optimizer does its job hopefully.
-- NULL states are the identity element of the combine step.
SELECT CASE WHEN $1 IS NULL THEN $2
WHEN $2 IS NULL THEN $1
-- Tied maxima (including the both-NULL case): union of the key arrays.
WHEN ($1.value = $2.value) OR ($1.value IS NULL AND $2.value IS NULL)
THEN ($1.args || $2.args, $1.value)::MADLIB_SCHEMA.ARGS_AND_VALUE_DOUBLE
-- A NULL maximum ranks below any non-NULL maximum.
WHEN $1.value IS NULL OR $1.value < $2.value THEN $2
ELSE $1
END
$$
LANGUAGE sql IMMUTABLE;
-- Final function of the argmax aggregate: project the accumulated array of
-- maximizing keys out of the internal state (the maximum itself is dropped).
CREATE FUNCTION MADLIB_SCHEMA.argmax_final(
    finalstate MADLIB_SCHEMA.ARGS_AND_VALUE_DOUBLE)
RETURNS INTEGER[] AS
$$
    SELECT ($1).args
$$
LANGUAGE sql IMMUTABLE;
/**
* @brief Argmax: Return the key of the row for which value is maximal
*
* The "index set" of the argmax function is of type INTEGER and we range over
* DOUBLE PRECISION values. It is not required that all keys are distinct.
*
* @note
* argmax should only be used on unsorted data because it will not exploit
* indices, and its running time is \f$ \Theta(n) \f$.
*
* @implementation
*
* The implementation is in SQL, with a flavor of functional programming.
* The hope is that the optimizer does a good job here.
*/
CREATE AGGREGATE MADLIB_SCHEMA.argmax(/*+ key */ INTEGER, /*+ value */ DOUBLE PRECISION) (
SFUNC=MADLIB_SCHEMA.argmax_transition,
STYPE=MADLIB_SCHEMA.ARGS_AND_VALUE_DOUBLE,
-- On Greenplum only: a combine step so segments can merge partial states.
ifdef(<nom4>GREENPLUM</nom4>,<nom4>prefunc = MADLIB_SCHEMA.argmax_combine,</nom4>)
FINALFUNC=MADLIB_SCHEMA.argmax_final
);
/**
* @brief Precompute all class priors and feature probabilities
*
* Feature probabilities are stored in a table with columns
* (class, attr, value, cnt, attr_cnt)
*
* Class priors are stored in a relation with columns
* (class, class_cnt, all_cnt).
*
* @param trainingSource Name of relation containing the training data
* @param trainingClassColumn Name of class column in training data
* @param trainingAttrColumn Name of attributes-array column in training data
* @param numAttrs Number of attributes to use for classification
* @param featureProbsDestName Name of feature-probabilities table to create
* @param classPriorsDestName Name of class-priors table to create
*
* @internal
* @sa This function is a wrapper for bayes::create_prepared_data().
*/
-- PL/Python wrapper: precompute feature probabilities and class priors into
-- the two destination tables. All work happens in bayes.create_prepared_data().
CREATE FUNCTION MADLIB_SCHEMA.create_nb_prepared_data_tables(
"trainingSource" VARCHAR,
"trainingClassColumn" VARCHAR,
"trainingAttrColumn" VARCHAR,
"numAttrs" INTEGER,
"featureProbsDestName" VARCHAR,
"classPriorsDestName" VARCHAR)
RETURNS VOID AS $$
import sys
try:
# Non-DEBUG builds import bayes from the madlib package; DEBUG builds
# import it as a top-level module (the m4 macro picks the spelling).
ifdef(<nom4>DEBUG</nom4>,,<nom4>from madlib </nom4>)import bayes
except:
# Fall back to the PL/Python library directory substituted at install time.
sys.path.append("PLPYTHON_LIBDIR")
ifdef(<nom4>DEBUG</nom4>,,<nom4>from madlib </nom4>)import bayes
# The bayes module reads its parameters from globals(); whatToCreate makes
# it materialize TABLEs (not views) for the precomputed data.
global whatToCreate
whatToCreate = 'TABLE'
bayes.create_prepared_data(**globals())
$$ LANGUAGE plpythonu VOLATILE;
/**
* @brief Create view that contains all keys of the source relation and the
* respective naive Bayes classifications
*
* @param featureProbsSource Name of table with precomputed feature
* probabilities, as created with create_nb_prepared_data_tables()
* @param classPriorsSource Name of table with precomputed class priors, as
* created with create_nb_prepared_data_tables()
* @param classifySource Name of the relation that contains data to be classified
* @param classifyKeyColumn Name of column in \em classifySource that can
* serve as unique identifier (the key of the source relation)
* @param classifyAttrColumn Name of attributes-array column in \em classifySource
* @param numAttrs Number of attributes to use for classification
* @param destName Name of the view to create
*
* @internal
* @sa This function is a wrapper for bayes::create_classification(). See there
* for details.
*/
-- PL/Python wrapper (precomputed-data variant): create a classification view
-- from existing feature-probability and class-prior tables.
CREATE FUNCTION MADLIB_SCHEMA.create_nb_classify_view(
"featureProbsSource" VARCHAR,
"classPriorsSource" VARCHAR,
"classifySource" VARCHAR,
"classifyKeyColumn" VARCHAR,
"classifyAttrColumn" VARCHAR,
"numAttrs" INTEGER,
"destName" VARCHAR)
RETURNS VOID AS $$
import sys
try:
# Non-DEBUG builds import bayes from the madlib package; DEBUG builds
# import it as a top-level module (the m4 macro picks the spelling).
ifdef(<nom4>DEBUG</nom4>,,<nom4>from madlib </nom4>)import bayes
except:
# Fall back to the PL/Python library directory substituted at install time.
sys.path.append("PLPYTHON_LIBDIR")
ifdef(<nom4>DEBUG</nom4>,,<nom4>from madlib </nom4>)import bayes
# Parameters are passed via globals(); whatToCreate selects a VIEW here.
global whatToCreate
whatToCreate = 'VIEW'
bayes.create_classification(**globals())
$$ LANGUAGE plpythonu VOLATILE;
/**
* @brief Create a view with columns <tt>(key, nb_classification)</tt>
*
* The created relation will be
*
* <tt>{TABLE|VIEW} <em>destName</em> (key, nb_classification)</tt>
*
* where \c nb_classification is an array containing the most likely
* class(es) of the record in \em classifySource identified by \c key.
*
* @param trainingSource
* Name of relation containing the training data
* @param trainingClassColumn
* Name of class column in training data
* @param trainingAttrColumn
* Name of attributes-array column in \em trainingSource
* @param classifySource Name of the relation that contains data to be classified
* @param classifyKeyColumn Name of column in \em classifySource that can
* serve as unique identifier (the key of the source relation)
* @param classifyAttrColumn Name of attributes-array column in \em classifySource
* @param numAttrs Number of attributes to use for classification
* @param destName Name of the view to create
*
* @internal
* @sa This function is a wrapper for bayes::create_classification().
*/
-- PL/Python wrapper (ad-hoc variant): create a classification view straight
-- from the training data, without precomputed probability tables.
CREATE FUNCTION MADLIB_SCHEMA.create_nb_classify_view(
"trainingSource" VARCHAR,
"trainingClassColumn" VARCHAR,
"trainingAttrColumn" VARCHAR,
"classifySource" VARCHAR,
"classifyKeyColumn" VARCHAR,
"classifyAttrColumn" VARCHAR,
"numAttrs" INTEGER,
"destName" VARCHAR)
RETURNS VOID AS $$
import sys
try:
# Non-DEBUG builds import bayes from the madlib package; DEBUG builds
# import it as a top-level module (the m4 macro picks the spelling).
ifdef(<nom4>DEBUG</nom4>,,<nom4>from madlib </nom4>)import bayes
except:
# Fall back to the PL/Python library directory substituted at install time.
sys.path.append("PLPYTHON_LIBDIR")
ifdef(<nom4>DEBUG</nom4>,,<nom4>from madlib </nom4>)import bayes
# Parameters are passed via globals(); whatToCreate selects a VIEW here.
global whatToCreate
whatToCreate = 'VIEW'
bayes.create_classification(**globals())
$$ LANGUAGE plpythonu VOLATILE;
/**
* @brief Create view with columns <tt>(key, class, nb_prob)</tt>
*
* The created view will be
*
* <tt>VIEW <em>destName</em> (key, class, nb_prob)</tt>
*
* where \c nb_prob is the Naive-Bayes probability that \c class is the true
* class of the record in \em classifySource identified by \c key.
*
* @param trainingSource
* Name of relation containing the training data
* @param trainingClassColumn
* Name of class column in training data
* @param trainingAttrColumn
* Name of attributes-array column in \em trainingSource
* @param classifySource Name of the relation that contains data to be classified
* @param classifyKeyColumn Name of column in \em classifySource that can
* serve as unique identifier (the key of the source relation)
* @param classifyAttrColumn Name of attributes-array column in \em classifySource
* @param numAttrs Number of attributes to use for classification
* @param destName Name of the view to create
*
* @internal
* @sa This function is a wrapper for bayes::create_bayes_probabilities().
*/
-- PL/Python wrapper (ad-hoc variant): create a per-class probability view
-- straight from the training data.
CREATE FUNCTION MADLIB_SCHEMA.create_nb_probs_view(
"trainingSource" VARCHAR,
"trainingClassColumn" VARCHAR,
"trainingAttrColumn" VARCHAR,
"classifySource" VARCHAR,
"classifyKeyColumn" VARCHAR,
"classifyAttrColumn" VARCHAR,
"numAttrs" INTEGER,
"destName" VARCHAR)
RETURNS VOID AS $$
import sys
try:
# Non-DEBUG builds import bayes from the madlib package; DEBUG builds
# import it as a top-level module (the m4 macro picks the spelling).
ifdef(<nom4>DEBUG</nom4>,,<nom4>from madlib </nom4>)import bayes
except:
# Fall back to the PL/Python library directory substituted at install time.
sys.path.append("PLPYTHON_LIBDIR")
ifdef(<nom4>DEBUG</nom4>,,<nom4>from madlib </nom4>)import bayes
# Parameters are passed via globals(); whatToCreate selects a VIEW here.
global whatToCreate
whatToCreate = 'VIEW'
bayes.create_bayes_probabilities(**globals())
$$ LANGUAGE plpythonu VOLATILE;
/**
* @brief Create view with columns <tt>(key, class, nb_prob)</tt>
*
* The created view will be
*
* <tt>VIEW <em>destName</em> (key, class, nb_prob)</tt>
*
* where \c nb_prob is the Naive-Bayes probability that \c class is the true
* class of the record in \em classifySource identified by \c key.
*
* @param featureProbsSource Name of table with precomputed feature
* probabilities, as created with create_nb_prepared_data_tables()
* @param classPriorsSource Name of table with precomputed class priors, as
* created with create_nb_prepared_data_tables()
* @param classifySource Name of the relation that contains data to be classified
* @param classifyKeyColumn Name of column in \em classifySource that can
* serve as unique identifier (the key of the source relation)
* @param classifyAttrColumn Name of attributes-array column in \em classifySource
* @param numAttrs Number of attributes to use for classification
* @param destName Name of the view to create
*
* @internal
* @sa This function is a wrapper for bayes::create_bayes_probabilities().
*/
-- PL/Python wrapper (precomputed-data variant): create a per-class
-- probability view from existing feature-probability and class-prior tables.
CREATE FUNCTION MADLIB_SCHEMA.create_nb_probs_view(
"featureProbsSource" VARCHAR,
"classPriorsSource" VARCHAR,
"classifySource" VARCHAR,
"classifyKeyColumn" VARCHAR,
"classifyAttrColumn" VARCHAR,
"numAttrs" INTEGER,
"destName" VARCHAR)
RETURNS VOID AS $$
import sys
try:
# Non-DEBUG builds import bayes from the madlib package; DEBUG builds
# import it as a top-level module (the m4 macro picks the spelling).
ifdef(<nom4>DEBUG</nom4>,,<nom4>from madlib </nom4>)import bayes
except:
# Fall back to the PL/Python library directory substituted at install time.
sys.path.append("PLPYTHON_LIBDIR")
ifdef(<nom4>DEBUG</nom4>,,<nom4>from madlib </nom4>)import bayes
# Parameters are passed via globals(); whatToCreate selects a VIEW here.
global whatToCreate
whatToCreate = 'VIEW'
bayes.create_bayes_probabilities(**globals())
$$ LANGUAGE plpythonu VOLATILE;
/**
* @brief Create a SQL function mapping arrays of attribute values to the Naive
* Bayes classification.
*
* The created SQL function will be:
*
* <tt>
* FUNCTION <em>destName</em> (attributes INTEGER[], smoothingFactor DOUBLE PRECISION)
* RETURNS INTEGER[]</tt>
*
* @note
* On Greenplum, the generated SQL function can only be called on the master.
*
* @param featureProbsSource Name of table with precomputed feature
* probabilities, as created with create_nb_prepared_data_tables()
* @param classPriorsSource Name of table with precomputed class priors, as
* created with create_nb_prepared_data_tables()
* @param numAttrs Number of attributes to use for classification
* @param destName Name of the function to create
*
* @internal
* @sa This function is a wrapper for bayes::create_classification_function().
*/
-- PL/Python wrapper (precomputed-data variant): generate a SQL classifier
-- function from existing feature-probability and class-prior tables.
CREATE FUNCTION MADLIB_SCHEMA.create_nb_classify_fn(
"featureProbsSource" VARCHAR,
"classPriorsSource" VARCHAR,
"numAttrs" INTEGER,
"destName" VARCHAR)
RETURNS VOID AS $$
import sys
try:
# Non-DEBUG builds import bayes from the madlib package; DEBUG builds
# import it as a top-level module (the m4 macro picks the spelling).
ifdef(<nom4>DEBUG</nom4>,,<nom4>from madlib </nom4>)import bayes
except:
# Fall back to the PL/Python library directory substituted at install time.
sys.path.append("PLPYTHON_LIBDIR")
ifdef(<nom4>DEBUG</nom4>,,<nom4>from madlib </nom4>)import bayes
# All named arguments are handed to the implementation via globals().
bayes.create_classification_function(**globals())
$$ LANGUAGE plpythonu VOLATILE;
/**
* @brief Create a SQL function mapping arrays of attribute values to the Naive
* Bayes classification.
*
* The created SQL function will be:
*
* <tt>
* FUNCTION <em>destName</em> (attributes INTEGER[], smoothingFactor DOUBLE PRECISION)
* RETURNS INTEGER[]</tt>
*
* @note
* On Greenplum, the generated SQL function can only be called on the master.
*
* @param trainingSource
* Name of relation containing the training data
* @param trainingClassColumn
* Name of class column in training data
* @param trainingAttrColumn
* Name of attributes-array column in \em trainingSource
* @param numAttrs Number of attributes to use for classification
* @param destName Name of the function to create
*
* @internal
* @sa This function is a wrapper for bayes::create_classification_function().
*/
-- PL/Python wrapper (ad-hoc variant): generate a SQL classifier function
-- straight from the training data.
CREATE FUNCTION MADLIB_SCHEMA.create_nb_classify_fn(
"trainingSource" VARCHAR,
"trainingClassColumn" VARCHAR,
"trainingAttrColumn" VARCHAR,
"numAttrs" INTEGER,
"destName" VARCHAR)
RETURNS VOID AS $$
import sys
try:
# Non-DEBUG builds import bayes from the madlib package; DEBUG builds
# import it as a top-level module (the m4 macro picks the spelling).
ifdef(<nom4>DEBUG</nom4>,,<nom4>from madlib </nom4>)import bayes
except:
# Fall back to the PL/Python library directory substituted at install time.
sys.path.append("PLPYTHON_LIBDIR")
ifdef(<nom4>DEBUG</nom4>,,<nom4>from madlib </nom4>)import bayes
# All named arguments are handed to the implementation via globals().
bayes.create_classification_function(**globals())
$$ LANGUAGE plpythonu VOLATILE;
ifdef(<nom4>DEBUG</nom4>,<nom4>
/*
* Function for initializing python paths to the paths in dynamic_library_path.
* This is only needed when debugging Python-based functions without installing
* them in a location where Python would find them automatically.
*/
-- Debug-only helper: mirror the server's dynamic_library_path into
-- sys.path so Python modules co-located with the shared libraries can be
-- imported without a proper installation.
CREATE FUNCTION MADLIB_SCHEMA.init_python_paths()
RETURNS VOID AS
$$
# FIXME: The following code should be common code and not reside in a specialized module
import sys
# Read the colon-separated dynamic_library_path setting from the server.
dyld_paths = plpy.execute(
"SHOW dynamic_library_path")[0]["dynamic_library_path"].split(':')
# Entries listed before the "$libdir" placeholder are prepended to sys.path
# (keeping their relative order via the running insert position); entries
# after it are appended, so search priority mirrors the server setting.
before_default = True
count = 0
for path in dyld_paths:
if path == "$libdir":
before_default = False
else:
if before_default:
sys.path.insert(count, path)
count += 1
else:
sys.path.append(path)
$$ LANGUAGE plpythonu VOLATILE;
</nom4>)