blob: a06824d0fb36ba946c1c4761b559c45cfb9611b2 [file] [log] [blame]
/* ----------------------------------------------------------------------- *//**
*
* @file bayes.sql_in
*
* @brief SQL functions for naive Bayes
* @date January 2011
*
* @sa For a brief introduction to Naive Bayes Classification, see the module
* description \ref grp_bayes.
*
*//* --------------------------------------------------------------------------
*
* This file is preprocessed with m4. Macro expansion can be turned off by
* enclosing text in <nom4> and </nom4>.
*/
changequote(`<nom4>', `</nom4>')
/**
@addtogroup grp_bayes
@about
Naive Bayes classification with user-defined smoothing factor (default:
Laplacian smoothing).
A Naive Bayes classifier computes the following formula:
\f[
\text{classify}(a_1, ..., a_n)
= \arg\max_c \left\{
P(C = c) \cdot \prod_{i=1}^n P(A_i = a_i \mid C = c)
\right\}
\f]
where $c$ ranges over all classes in the training data and probabilities are
estimated with relative frequencies from the training set.
There are different ways to estimate the feature probabilities
\f$ P(A_i = a \mid C = c) \f$. The maximum likelihood estimate takes the relative
frequencies. That is:
\f[
P(A_i = a \mid C = c) = \frac{\#(c,i,a)}{\#c}
\f]
where
- \f$ \#(c,i,a) \f$ denotes the # of training samples where attribute $i$ is $a$ and class
is $c$
- \f$ \#c \f$ denotes the # of training samples where class is $c$.
Since the maximum likelihood sometimes results in estimates of 0, it might
be desirable to use a "smoothed" estimate. Intuitively, one adds a number of
"virtual" samples and assumes that these samples are evenly distributed
among the values attribute $i$ can assume (i.e., the set of all values observed
for attribute $i$ for any class):
\f[
P(A_i = a \mid C = c) = \frac{\#(c,i,a) + s}{\#c + s \cdot \#i}
\f]
where
- \f$ \#i \f$ denotes the # of distinct values for attribute $i$ (for all
classes)
- \f$ s \geq 0 \f$ denotes the smoothing factor.
The case $s = 1$ is known as "Laplace smoothing" in the literature. The case
$s = 0$ trivially reduces to maximum-likelihood estimates.
@prereq
Implemented in Python and SQL for PostgreSQL/Greenplum. The Python code only
serves to generate SQL and should be portable.
@usage
-# The training data is expected to be of the following form:\n
<tt>{TABLE|VIEW} <em>trainingSource</em> ([...] <em>trainingClassColumn</em>
INTEGER, <em>trainingAttrColumn</em> INTEGER[]
[...])</tt>
-# (Optional step:) Precompute feature probabilities and class priors for faster
classification later:\n
<tt>SELECT create_nb_prepared_data_tables('<em>trainingSource</em>',
'<em>trainingClassColumn</em>', '<em>trainingAttrColumn</em>', numAttrs,
'<em>featureProbsName</em>', '<em>classPriorsName</em>')</tt>\n
\n
This creates the two tables <tt><em>featureProbsName</em>(class INTEGER,
attr INTEGER, value INTEGER, cnt INTEGER, attr_cnt INTEGER))</tt> and
<tt><em>classPriorsName</em>(class INTEGER, class_cnt INTEGER, all_cnt
INTEGER)</tt>.
-# The data that we want to run Naive Bayes classification on is expected to be
of the following form:\n
<tt>{TABLE|VIEW} <em>classifySource</em> ([...] <em>classifyKeyColumn</em>
ANYTYPE, <em>classifyAttrColumn</em> INTEGER[] [...])</tt>\n
-# Run Naive Bayes classification as follows:\n
<tt>SELECT create_nb_classify_view('<em>featureProbsName</em>',
'<em>classPriorsName</em>', '<em>classifySource</em>',
'<em>classifyKeyColumn</em>', '<em>classifyAttrColumn</em>', numAttrs,
'<em>destName</em>')</tt>\n
\n
This creates a view <tt><em>destName</em>(key ANYTYPE,
nb_classification INTEGER)</tt> which contains a row for every row in
<tt><em>classifySource</em></tt>, where <tt>key</tt> =
<tt><em>classifyKeyColumn</em></tt> and <tt>nb_classification</tt>
is the Naive Bayes classification.
@examp
The following is an extremely simplified example which can be verified by hand.
@verbatim
# -- The training data
# select * from bayes;
id | class | attributes
----+-------+------------
1 | 1 | {1,2,3}
3 | 1 | {1,4,3}
5 | 2 | {0,2,2}
2 | 1 | {1,2,1}
4 | 2 | {1,2,2}
6 | 2 | {0,1,3}
(6 rows)
# -- Precompute feature probabilities and class priors
# select create_nb_prepared_data_tables(
# 'bayes', 'class', 'attributes', 3,
# 'nb_feature_probs', 'nb_class_priors');
# select * from nb_feature_probs;
class | attr | value | cnt | attr_cnt
-------+------+-------+-----+----------
1 | 1 | 0 | 0 | 2
1 | 1 | 1 | 3 | 2
1 | 2 | 1 | 0 | 3
1 | 2 | 2 | 2 | 3
[...]
# select * from nb_class_priors;
class | class_cnt | all_cnt
-------+-----------+---------
1 | 3 | 6
2 | 3 | 6
(2 rows)
# -- Data we want to run Naive Bayes classification on
# select * from toclassify;
id | attributes
----+------------
1 | {0,2,1}
2 | {1,2,3}
# -- Run Naive Bayes
# select create_nb_classify_view(
# 'nb_feature_probs', 'nb_class_priors',
# 'toclassify', 'id', 'attributes', 3,
# 'nb_classify_view_fast');
# select * from nb_classify_view_fast;
key | nb_classification
-----+-------------------
1 | {2}
2 | {1}
(2 rows)
# -- Look at the probabilities for each class (Note we use Laplacian Smoothing):
# select create_nb_probs_view(
# 'nb_feature_probs', 'nb_class_priors',
# 'toclassify', 'id', 'attributes', 3,
# 'nb_probs_view_fast');
# select * from nb_probs_view_fast;
key | class | nb_prob
-----+-------+---------
1 | 1 | 0.4
1 | 2 | 0.6
2 | 1 | 0.75
2 | 2 | 0.25
(4 rows)
@endverbatim
@sa file bayes.sql_in (documenting the SQL functions)
@internal
@sa namespace bayes (documenting the implementation in Python)
@endinternal
@literature
[1] Tom Mitchell: Machine Learning, McGraw Hill, 1997. Book chapter
<em>Generative and Discriminative Classifiers: Naive Bayes and Logistic
Regression</em> available at: http://www.cs.cmu.edu/~tom/NewChapters.html
[2] Wikipedia, Naive Bayes classifier,
http://en.wikipedia.org/wiki/Naive_Bayes_classifier
*/
-- Begin of argmax definition
-- Internal state/result type for the argmax aggregate below. The "args"
-- array accumulates every key that attains the current maximum "value",
-- so ties are reported as a set instead of being broken arbitrarily.
CREATE TYPE MADLIB_SCHEMA.ARGS_AND_VALUE_DOUBLE AS (
args INTEGER[],
value DOUBLE PRECISION
);
-- Transition function of the argmax aggregate: fold one (key, value) pair
-- into the running state. Semantics under SQL three-valued logic:
-- * a NULL key is skipped (state unchanged);
-- * a NULL value ranks below every non-NULL value;
-- * on a tie (or when both the new and stored value are NULL), the key is
--   appended to the array of maximizing keys instead of replacing it.
CREATE FUNCTION MADLIB_SCHEMA.argmax_transition(
oldmax MADLIB_SCHEMA.ARGS_AND_VALUE_DOUBLE,
newkey INTEGER,
newvalue DOUBLE PRECISION)
RETURNS MADLIB_SCHEMA.ARGS_AND_VALUE_DOUBLE AS
$$
-- Case 1: keep the old state -- the new value is smaller, the new key is
-- NULL, or the new value is NULL while the stored maximum is not.
SELECT CASE WHEN $3 < $1.value OR $2 IS NULL OR ($3 IS NULL AND NOT $1.value IS NULL) THEN $1
-- Case 2: tie (or both values NULL with a non-empty key array): append.
WHEN $3 = $1.value OR ($3 IS NULL AND $1.value IS NULL AND NOT $1.args IS NULL)
THEN ($1.args || $2, $3)::MADLIB_SCHEMA.ARGS_AND_VALUE_DOUBLE
-- Case 3: strictly larger value (or first usable pair): restart the set.
ELSE (array[$2], $3)::MADLIB_SCHEMA.ARGS_AND_VALUE_DOUBLE
END
$$
LANGUAGE sql IMMUTABLE;
-- Combine (pre-)function of the argmax aggregate: merge two partial states
-- produced on different segments. A NULL state acts as the identity; equal
-- maxima (or two NULL maxima) concatenate their key arrays; otherwise the
-- state with the larger (non-NULL) value wins.
CREATE FUNCTION MADLIB_SCHEMA.argmax_combine(
max1 MADLIB_SCHEMA.ARGS_AND_VALUE_DOUBLE,
max2 MADLIB_SCHEMA.ARGS_AND_VALUE_DOUBLE)
RETURNS MADLIB_SCHEMA.ARGS_AND_VALUE_DOUBLE AS
$$
-- If SQL guaranteed short-circuit evaluation, the following could become
-- shorter. Unfortunately, this is not the case.
-- Section 6.3.3.3 of ISO/IEC 9075-1:2008 Framework (SQL/Framework):
--
-- "However, it is implementation-dependent whether expressions are
-- actually evaluated left to right, particularly when operands or
-- operators might cause conditions to be raised or if the results of the
-- expressions can be determined without completely evaluating all parts
-- of the expression."
--
-- Again, the optimizer does its job hopefully.
-- NULL states are the identity element of the combine step.
SELECT CASE WHEN $1 IS NULL THEN $2
WHEN $2 IS NULL THEN $1
-- Tied maxima (including the both-NULL case): union of the key arrays.
WHEN ($1.value = $2.value) OR ($1.value IS NULL AND $2.value IS NULL)
THEN ($1.args || $2.args, $1.value)::MADLIB_SCHEMA.ARGS_AND_VALUE_DOUBLE
-- A NULL maximum ranks below any non-NULL maximum.
WHEN $1.value IS NULL OR $1.value < $2.value THEN $2
ELSE $1
END
$$
LANGUAGE sql IMMUTABLE;
-- Final function of the argmax aggregate: project the accumulated array of
-- maximizing keys out of the internal state (the maximum itself is dropped).
CREATE FUNCTION MADLIB_SCHEMA.argmax_final(
    finalstate MADLIB_SCHEMA.ARGS_AND_VALUE_DOUBLE)
RETURNS INTEGER[] AS
$$
    SELECT ($1).args
$$
LANGUAGE sql IMMUTABLE;
/**
* @brief Argmax: Return the key of the row for which value is maximal
*
* The "index set" of the argmax function is of type INTEGER and we range over
* DOUBLE PRECISION values. It is not required that all keys are distinct.
*
* @note
* argmax should only be used on unsorted data because it will not exploit
* indices, and its running time is \f$ \Theta(n) \f$.
*
* @implementation
*
* The implementation is in SQL, with a flavor of functional programming.
* The hope is that the optimizer does a good job here.
*/
CREATE AGGREGATE MADLIB_SCHEMA.argmax(/*+ key */ INTEGER, /*+ value */ DOUBLE PRECISION) (
SFUNC=MADLIB_SCHEMA.argmax_transition,
STYPE=MADLIB_SCHEMA.ARGS_AND_VALUE_DOUBLE,
-- On Greenplum only: a combine step so segments can merge partial states.
ifdef(<nom4>GREENPLUM</nom4>,<nom4>prefunc = MADLIB_SCHEMA.argmax_combine,</nom4>)
FINALFUNC=MADLIB_SCHEMA.argmax_final
);
/**
* @brief Precompute all class priors and feature probabilities
*
* Feature probabilities are stored in a table with columns
* (class, attr, value, cnt, attr_cnt)
*
* Class priors are stored in a relation with columns
* (class, class_cnt, all_cnt).
*
* @param trainingSource Name of relation containing the training data
* @param trainingClassColumn Name of class column in training data
* @param trainingAttrColumn Name of attributes-array column in training data
* @param numAttrs Number of attributes to use for classification
* @param featureProbsDestName Name of feature-probabilities table to create
* @param classPriorsDestName Name of class-priors table to create
*
* @internal
* @sa This function is a wrapper for bayes::create_prepared_data().
*/
-- PL/Python wrapper: precompute feature probabilities and class priors into
-- the two destination tables. All work happens in bayes.create_prepared_data().
CREATE FUNCTION MADLIB_SCHEMA.create_nb_prepared_data_tables(
"trainingSource" VARCHAR,
"trainingClassColumn" VARCHAR,
"trainingAttrColumn" VARCHAR,
"numAttrs" INTEGER,
"featureProbsDestName" VARCHAR,
"classPriorsDestName" VARCHAR)
RETURNS VOID AS $$
import sys
try:
# Non-DEBUG builds import bayes from the madlib package; DEBUG builds
# import it as a top-level module (the m4 macro picks the spelling).
ifdef(<nom4>DEBUG</nom4>,,<nom4>from madlib </nom4>)import bayes
except:
# Fall back to the PL/Python library directory substituted at install time.
sys.path.append("PLPYTHON_LIBDIR")
ifdef(<nom4>DEBUG</nom4>,,<nom4>from madlib </nom4>)import bayes
# The bayes module reads its parameters from globals(); whatToCreate makes
# it materialize TABLEs (not views) for the precomputed data.
global whatToCreate
whatToCreate = 'TABLE'
bayes.create_prepared_data(**globals())
$$ LANGUAGE plpythonu VOLATILE;
/**
* @brief Create view that contains all keys of the source relation and the
* respective naive Bayes classifications
*
* @param featureProbsSource Name of table with precomputed feature
* probabilities, as created with create_nb_prepared_data_tables()
* @param classPriorsSource Name of table with precomputed class priors, as
* created with create_nb_prepared_data_tables()
* @param classifySource Name of the relation that contains data to be classified
* @param classifyKeyColumn Name of column in \em classifySource that can
* serve as unique identifier (the key of the source relation)
* @param classifyAttrColumn Name of attributes-array column in \em classifySource
* @param numAttrs Number of attributes to use for classification
* @param destName Name of the view to create
*
* @internal
* @sa This function is a wrapper for bayes::create_classification(). See there
* for details.
*/
-- PL/Python wrapper (precomputed-data variant): create a classification view
-- from existing feature-probability and class-prior tables.
CREATE FUNCTION MADLIB_SCHEMA.create_nb_classify_view(
"featureProbsSource" VARCHAR,
"classPriorsSource" VARCHAR,
"classifySource" VARCHAR,
"classifyKeyColumn" VARCHAR,
"classifyAttrColumn" VARCHAR,
"numAttrs" INTEGER,
"destName" VARCHAR)
RETURNS VOID AS $$
import sys
try:
# Non-DEBUG builds import bayes from the madlib package; DEBUG builds
# import it as a top-level module (the m4 macro picks the spelling).
ifdef(<nom4>DEBUG</nom4>,,<nom4>from madlib </nom4>)import bayes
except:
# Fall back to the PL/Python library directory substituted at install time.
sys.path.append("PLPYTHON_LIBDIR")
ifdef(<nom4>DEBUG</nom4>,,<nom4>from madlib </nom4>)import bayes
# Parameters are passed via globals(); whatToCreate selects a VIEW here.
global whatToCreate
whatToCreate = 'VIEW'
bayes.create_classification(**globals())
$$ LANGUAGE plpythonu VOLATILE;
/**
* @brief Create a view with columns <tt>(key, nb_classification)</tt>
*
* The created relation will be
*
* <tt>{TABLE|VIEW} <em>destName</em> (key, nb_classification)</tt>
*
* where \c nb_classification is an array containing the most likely
* class(es) of the record in \em classifySource identified by \c key.
*
* @param trainingSource
* Name of relation containing the training data
* @param trainingClassColumn
* Name of class column in training data
* @param trainingAttrColumn
* Name of attributes-array column in \em trainingSource
* @param classifySource Name of the relation that contains data to be classified
* @param classifyKeyColumn Name of column in \em classifySource that can
* serve as unique identifier (the key of the source relation)
* @param classifyAttrColumn Name of attributes-array column in \em classifySource
* @param numAttrs Number of attributes to use for classification
* @param destName Name of the view to create
*
* @internal
* @sa This function is a wrapper for bayes::create_classification().
*/
-- PL/Python wrapper (ad-hoc variant): create a classification view straight
-- from the training data, without precomputed probability tables.
CREATE FUNCTION MADLIB_SCHEMA.create_nb_classify_view(
"trainingSource" VARCHAR,
"trainingClassColumn" VARCHAR,
"trainingAttrColumn" VARCHAR,
"classifySource" VARCHAR,
"classifyKeyColumn" VARCHAR,
"classifyAttrColumn" VARCHAR,
"numAttrs" INTEGER,
"destName" VARCHAR)
RETURNS VOID AS $$
import sys
try:
# Non-DEBUG builds import bayes from the madlib package; DEBUG builds
# import it as a top-level module (the m4 macro picks the spelling).
ifdef(<nom4>DEBUG</nom4>,,<nom4>from madlib </nom4>)import bayes
except:
# Fall back to the PL/Python library directory substituted at install time.
sys.path.append("PLPYTHON_LIBDIR")
ifdef(<nom4>DEBUG</nom4>,,<nom4>from madlib </nom4>)import bayes
# Parameters are passed via globals(); whatToCreate selects a VIEW here.
global whatToCreate
whatToCreate = 'VIEW'
bayes.create_classification(**globals())
$$ LANGUAGE plpythonu VOLATILE;
/**
* @brief Create view with columns <tt>(key, class, nb_prob)</tt>
*
* The created view will be
*
* <tt>VIEW <em>destName</em> (key, class, nb_prob)</tt>
*
* where \c nb_prob is the Naive-Bayes probability that \c class is the true
* class of the record in \em classifySource identified by \c key.
*
* @param trainingSource
* Name of relation containing the training data
* @param trainingClassColumn
* Name of class column in training data
* @param trainingAttrColumn
* Name of attributes-array column in \em trainingSource
* @param classifySource Name of the relation that contains data to be classified
* @param classifyKeyColumn Name of column in \em classifySource that can
* serve as unique identifier (the key of the source relation)
* @param classifyAttrColumn Name of attributes-array column in \em classifySource
* @param numAttrs Number of attributes to use for classification
* @param destName Name of the view to create
*
* @internal
* @sa This function is a wrapper for bayes::create_bayes_probabilities().
*/
-- PL/Python wrapper (ad-hoc variant): create a per-class probability view
-- straight from the training data.
CREATE FUNCTION MADLIB_SCHEMA.create_nb_probs_view(
"trainingSource" VARCHAR,
"trainingClassColumn" VARCHAR,
"trainingAttrColumn" VARCHAR,
"classifySource" VARCHAR,
"classifyKeyColumn" VARCHAR,
"classifyAttrColumn" VARCHAR,
"numAttrs" INTEGER,
"destName" VARCHAR)
RETURNS VOID AS $$
import sys
try:
# Non-DEBUG builds import bayes from the madlib package; DEBUG builds
# import it as a top-level module (the m4 macro picks the spelling).
ifdef(<nom4>DEBUG</nom4>,,<nom4>from madlib </nom4>)import bayes
except:
# Fall back to the PL/Python library directory substituted at install time.
sys.path.append("PLPYTHON_LIBDIR")
ifdef(<nom4>DEBUG</nom4>,,<nom4>from madlib </nom4>)import bayes
# Parameters are passed via globals(); whatToCreate selects a VIEW here.
global whatToCreate
whatToCreate = 'VIEW'
bayes.create_bayes_probabilities(**globals())
$$ LANGUAGE plpythonu VOLATILE;
/**
* @brief Create view with columns <tt>(key, class, nb_prob)</tt>
*
* The created view will be
*
* <tt>VIEW <em>destName</em> (key, class, nb_prob)</tt>
*
* where \c nb_prob is the Naive-Bayes probability that \c class is the true
* class of the record in \em classifySource identified by \c key.
*
* @param featureProbsSource Name of table with precomputed feature
* probabilities, as created with create_nb_prepared_data_tables()
* @param classPriorsSource Name of table with precomputed class priors, as
* created with create_nb_prepared_data_tables()
* @param classifySource Name of the relation that contains data to be classified
* @param classifyKeyColumn Name of column in \em classifySource that can
* serve as unique identifier (the key of the source relation)
* @param classifyAttrColumn Name of attributes-array column in \em classifySource
* @param numAttrs Number of attributes to use for classification
* @param destName Name of the view to create
*
* @internal
* @sa This function is a wrapper for bayes::create_bayes_probabilities().
*/
-- PL/Python wrapper (precomputed-data variant): create a per-class
-- probability view from existing feature-probability and class-prior tables.
CREATE FUNCTION MADLIB_SCHEMA.create_nb_probs_view(
"featureProbsSource" VARCHAR,
"classPriorsSource" VARCHAR,
"classifySource" VARCHAR,
"classifyKeyColumn" VARCHAR,
"classifyAttrColumn" VARCHAR,
"numAttrs" INTEGER,
"destName" VARCHAR)
RETURNS VOID AS $$
import sys
try:
# Non-DEBUG builds import bayes from the madlib package; DEBUG builds
# import it as a top-level module (the m4 macro picks the spelling).
ifdef(<nom4>DEBUG</nom4>,,<nom4>from madlib </nom4>)import bayes
except:
# Fall back to the PL/Python library directory substituted at install time.
sys.path.append("PLPYTHON_LIBDIR")
ifdef(<nom4>DEBUG</nom4>,,<nom4>from madlib </nom4>)import bayes
# Parameters are passed via globals(); whatToCreate selects a VIEW here.
global whatToCreate
whatToCreate = 'VIEW'
bayes.create_bayes_probabilities(**globals())
$$ LANGUAGE plpythonu VOLATILE;
/**
* @brief Create a SQL function mapping arrays of attribute values to the Naive
* Bayes classification.
*
* The created SQL function will be:
*
* <tt>
* FUNCTION <em>destName</em> (attributes INTEGER[], smoothingFactor DOUBLE PRECISION)
* RETURNS INTEGER[]</tt>
*
* @note
* On Greenplum, the generated SQL function can only be called on the master.
*
* @param featureProbsSource Name of table with precomputed feature
* probabilities, as created with create_nb_prepared_data_tables()
* @param classPriorsSource Name of table with precomputed class priors, as
* created with create_nb_prepared_data_tables()
* @param numAttrs Number of attributes to use for classification
* @param destName Name of the function to create
*
* @internal
* @sa This function is a wrapper for bayes::create_classification_function().
*/
-- PL/Python wrapper (precomputed-data variant): generate a SQL classifier
-- function from existing feature-probability and class-prior tables.
CREATE FUNCTION MADLIB_SCHEMA.create_nb_classify_fn(
"featureProbsSource" VARCHAR,
"classPriorsSource" VARCHAR,
"numAttrs" INTEGER,
"destName" VARCHAR)
RETURNS VOID AS $$
import sys
try:
# Non-DEBUG builds import bayes from the madlib package; DEBUG builds
# import it as a top-level module (the m4 macro picks the spelling).
ifdef(<nom4>DEBUG</nom4>,,<nom4>from madlib </nom4>)import bayes
except:
# Fall back to the PL/Python library directory substituted at install time.
sys.path.append("PLPYTHON_LIBDIR")
ifdef(<nom4>DEBUG</nom4>,,<nom4>from madlib </nom4>)import bayes
# All named arguments are handed to the implementation via globals().
bayes.create_classification_function(**globals())
$$ LANGUAGE plpythonu VOLATILE;
/**
* @brief Create a SQL function mapping arrays of attribute values to the Naive
* Bayes classification.
*
* The created SQL function will be:
*
* <tt>
* FUNCTION <em>destName</em> (attributes INTEGER[], smoothingFactor DOUBLE PRECISION)
* RETURNS INTEGER[]</tt>
*
* @note
* On Greenplum, the generated SQL function can only be called on the master.
*
* @param trainingSource
* Name of relation containing the training data
* @param trainingClassColumn
* Name of class column in training data
* @param trainingAttrColumn
* Name of attributes-array column in \em trainingSource
* @param numAttrs Number of attributes to use for classification
* @param destName Name of the function to create
*
* @internal
* @sa This function is a wrapper for bayes::create_classification_function().
*/
-- PL/Python wrapper (ad-hoc variant): generate a SQL classifier function
-- straight from the training data.
CREATE FUNCTION MADLIB_SCHEMA.create_nb_classify_fn(
"trainingSource" VARCHAR,
"trainingClassColumn" VARCHAR,
"trainingAttrColumn" VARCHAR,
"numAttrs" INTEGER,
"destName" VARCHAR)
RETURNS VOID AS $$
import sys
try:
# Non-DEBUG builds import bayes from the madlib package; DEBUG builds
# import it as a top-level module (the m4 macro picks the spelling).
ifdef(<nom4>DEBUG</nom4>,,<nom4>from madlib </nom4>)import bayes
except:
# Fall back to the PL/Python library directory substituted at install time.
sys.path.append("PLPYTHON_LIBDIR")
ifdef(<nom4>DEBUG</nom4>,,<nom4>from madlib </nom4>)import bayes
# All named arguments are handed to the implementation via globals().
bayes.create_classification_function(**globals())
$$ LANGUAGE plpythonu VOLATILE;
ifdef(<nom4>DEBUG</nom4>,<nom4>
/*
* Function for initializing python paths to the paths in dynamic_library_path.
* This is only needed when debugging Python-based functions without installing
* them in a location where Python would find them automatically.
*/
-- Debug-only helper: mirror the server's dynamic_library_path into
-- sys.path so Python modules co-located with the shared libraries can be
-- imported without a proper installation.
CREATE FUNCTION MADLIB_SCHEMA.init_python_paths()
RETURNS VOID AS
$$
# FIXME: The following code should be common code and not reside in a specialized module
import sys
# Read the colon-separated dynamic_library_path setting from the server.
dyld_paths = plpy.execute(
"SHOW dynamic_library_path")[0]["dynamic_library_path"].split(':')
# Entries listed before the "$libdir" placeholder are prepended to sys.path
# (keeping their relative order via the running insert position); entries
# after it are appended, so search priority mirrors the server setting.
before_default = True
count = 0
for path in dyld_paths:
if path == "$libdir":
before_default = False
else:
if before_default:
sys.path.insert(count, path)
count += 1
else:
sys.path.append(path)
$$ LANGUAGE plpythonu VOLATILE;
</nom4>)