methods/regress/src/pg_gp/student.c - madlib - Git at Google

 /* ----------------------------------------------------------------------- *//**
  *
  * @file student.c
  *
  * @brief Evaluate the Student-T distribution function.
  * @author Florian Schoppmann
  * @date   November 2010
  *
  *//* --------------------------------------------------------------------------
  *
  * This file is compliant with the C99 standard (Compile with "-std=c99").
  */

 /**
  * @file student.c
  *
  * Emprirical results indicate that the numerical quality of the series
  * expansion from [1] (see notes below) is vastly superior to using continued
  * fractions for computing the cdf via the incomplete beta function.
  *
  * @literature
  *
  * [1] Abramowitz and Stegun, Handbook of Mathematical Functions with Formulas,
  *     Graphs, and Mathematical Tables, 1972
  *     page 948: http://people.math.sfu.ca/~cbm/aands/page_948.htm
  *
  * Further reading (for computing the Student-T cdf via the incomplete beta
  * function):
  *
  * [2] NIST Digital Library of Mathematical Functions, Ch. 8,
  *     Incomplete Gamma and Related Functions,
  *     http://dlmf.nist.gov/8.17
  *
  * [3] Lentz, Generating Bessel functions in Mie scattering calculations using
  *     continued fractions, Applied Optics, Vol. 15, No. 3, 1976
  *
  * [4] Thompson and Barnett, Coulomb and Bessel Functions of Complex Arguments
  *     and Order, Journal of Computational Physics, Vol. 64, 1986
  *
  * [5] Cuyt et al., Handbook of Continued Fractions for Special Functions,
  *     Springer, 2008
  *
  * [6] Gil et al., Numerical Methods for Special Functions, SIAM, 2008
  *
  * [7] Press et al., Numerical Recipes in C++, 3rd edition,
  *     Cambridge Univ. Press, 2007
  *
  * [8] DiDonato, Morris, Jr., Algorithm 708: Significant Digit Computation of
  *     the Incomplete Beta Function Ratios, ACM Transactions on Mathematical
  *     Software, Vol. 18, No. 3, 1992
  *
  * Approximating the Student-T distribution function with the normal
  * distribution:
  *
  * [9]  Gleason, A note on a proposed student t approximation, Computational
  *      Statistics & Data Analysis, Vol. 34, No. 1, 2000
  *
  * [10] Gaver and Kafadar, A Retrievable Recipe for Inverse t, The American
  *      Statistician, Vol. 38, No. 4, 1984
  */

 /* Type definitions for float8, uint32, ... occur in postgres.h.
  * Postgres-specific code only begins after: -- POSTGRES -- */
 #include "postgres.h"

 #include "student.h"

 #include <math.h>


 /* Prototypes of internal functions */

 static inline float8 normal_cdf(float8 t);
 static float8 studentT_cdf_approx(int64 nu, float8 t);


 /**
  * Compute Pr[T <= t] for Student-t distributed T with nu degrees of freedom.
  *
  * For nu >= 1000000, we just use the normal distribution as an approximation.
  * For 1000000 >= nu >= 200, we use a simple approximation from [9].
  * We are much more cautious than usual here (it is folklore that the normal
  * distribution is a "good" estimate for Student-T if nu >= 30), but we can
  * afford the extra work as this function is not designed to be called from
  * inner loops. Performance should still be reasonably good, with at most ~100
  * iterations in any case (just one if nu >= 200).
  *
  * For nu < 200, we use the series expansions 26.7.3 and 26.7.4 from [1] and
  * substitute sin(theta) = t/sqrt(n * z), where z = 1 + t^2/nu.
  *
  * This gives:
  * @verbatim
  *                          t
  *   A(t|1)  = 2 arctan( -------- ) ,
  *                       sqrt(nu)
  *
  *                                                    (nu-3)/2
  *             2   [            t              t         --    2 * 4 * ... * (2i)  ]
  *   A(t|nu) = - * [ arctan( -------- ) + ------------ * \  ---------------------- ]
  *             π   [         sqrt(nu)     sqrt(nu) * z   /_ 3 * ... * (2i+1) * z^i ]
  *                                                       i=0
  *           for odd nu > 1, and
  *
  *                         (nu-2)/2
  *                  t         -- 1 * 3 * ... * (2i - 1)
  *   A(t|nu) = ------------ * \  ------------------------ for even nu,
  *             sqrt(nu * z)   /_ 2 * 4 * ... * (2i) * z^i
  *                            i=0
  *
  * where A(t|nu) = Pr[|T| <= t].
  * @endverbatim
  *
  * @param nu Degree of freedom (>= 1)
  * @param t Argument to cdf.
  *
  * Note: The running time of calculating the series is proportional to nu. This
  * We therefore use the normal distribution as an approximation for large nu.
  * Another idea for handling this case can be found in reference [8].
  */

 float8 studentT_cdf(int64 nu, float8 t) {
 	float8		z,
 				t_by_sqrt_nu;
 	float8		A, /* contains A(t|nu) */
 				prod = 1.,
 				sum = 1.;

 	/* Handle extreme cases. See above. */

 	if (nu <= 0)
 		return NAN;
 	else if (nu >= 1000000)
 		return normal_cdf(t);
 	else if (nu >= 200)
 		return studentT_cdf_approx(nu, t);

 	/* Handle main case (nu < 200) in the rest of the function. */

 	z = 1. + t * t / nu;
 	t_by_sqrt_nu = fabs(t) / sqrt(nu);

 	if (nu == 1)
 	{
 		A = 2. / M_PI * atan(t_by_sqrt_nu);
 	}
 	else if (nu & 1) /* odd nu > 1 */
 	{
 		for (int j = 2; j <= nu - 3; j += 2)
 		{
 			prod = prod * j / ((j + 1) * z);
 			sum = sum + prod;
 		}
 		A = 2 / M_PI * ( atan(t_by_sqrt_nu) + t_by_sqrt_nu / z * sum );
 	}
 	else /* even nu */
 	{
 		for (int j = 2; j <= nu - 2; j += 2)
 		{
 			prod = prod * (j - 1) / (j * z);
 			sum = sum + prod;
 		}
 		A = t_by_sqrt_nu / sqrt(z) * sum;
 	}

 	/* A should obviously lie withing the interval [0,1] plus minus (hopefully
 	 * small) rounding errors. */
 	if (A > 1.)
 		A = 1.;
 	else if (A < 0.)
 		A = 0.;

 	/* The Student-T distribution is obviously symmetric around t=0... */
 	if (t < 0)
 		return .5 * (1. - A);
 	else
 		return 1. - .5 * (1. - A);
 }

 /**
  * Compute the normal distribution function using the library error function.
  *
  * This approximation satisfies
  * rel_error < 0.0001 || abs_error < 0.00000001
  * for all nu >= 1000000. (Tested on Mac OS X 10.6, gcc-4.2.)
  */

 static inline float8 normal_cdf(float8 t)
 {
 	return .5 + .5 * erf(t / sqrt(2.));
 }


 /**
  * Approximate Student-T distribution using a formula suggested in
  * [9], which goes back to an approximation suggested in [10].
  *
  * Compared to the series expansion, this approximation satisfies
  * rel_error < 0.0001 || abs_error < 0.00000001
  * for all nu >= 200. (Tested on Mac OS X 10.6, gcc-4.2.)
  */
 static float8 studentT_cdf_approx(int64 nu, float8 t)
 {
 	float8	g = (nu - 1.5) / ((nu - 1) * (nu - 1)),
 			z = sqrt( log(1. + t * t / nu) / g );

 	if (t < 0)
 		z *= -1.;

 	return normal_cdf(z);
 }

 /* -- POSTGRES -- PostgreSQL-specific code begins here. */

 /* Indicate "version 1" calling conventions for all exported functions. */

 PG_FUNCTION_INFO_V1(student_t_cdf);

 #include "utils/builtins.h"

 /**
  * Expose the studentT_cdf as a user-defined function.
  */

 Datum student_t_cdf(PG_FUNCTION_ARGS)
 {
 	int32		nu;
 	float8		t;

 	/*
 	 * Perform all the error checking needed to ensure that no one is
 	 * trying to call this in some sort of crazy way.
 	 */
 	if (PG_NARGS() != 2)
 	{
 		ereport(ERROR,
 				(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
 				 errmsg("function \"%s\" called with invalid parameters",
 					format_procedure(fcinfo->flinfo->fn_oid))));
 	}
 	if (PG_ARGISNULL(0) || PG_ARGISNULL(1))
 		PG_RETURN_NULL();

 	nu = PG_GETARG_INT32(0);
 	t = PG_GETARG_FLOAT8(1);

 	/* We want to ensure nu > 0 */
 	if (nu <= 0)
 		PG_RETURN_NULL();

 	PG_RETURN_FLOAT8(studentT_cdf(nu, t));
 }
	/* ----------------------------------------------------------------------- //*
	*
	* @file student.c
	*
	* @brief Evaluate the Student-T distribution function.
	* @author Florian Schoppmann
	* @date November 2010
	*
	// --------------------------------------------------------------------------
	*
	* This file is compliant with the C99 standard (Compile with "-std=c99").
	*/

	/**
	* @file student.c
	*
	* Emprirical results indicate that the numerical quality of the series
	* expansion from [1] (see notes below) is vastly superior to using continued
	* fractions for computing the cdf via the incomplete beta function.
	*
	* @literature
	*
	* [1] Abramowitz and Stegun, Handbook of Mathematical Functions with Formulas,
	* Graphs, and Mathematical Tables, 1972
	* page 948: http://people.math.sfu.ca/~cbm/aands/page_948.htm
	*
	* Further reading (for computing the Student-T cdf via the incomplete beta
	* function):
	*
	* [2] NIST Digital Library of Mathematical Functions, Ch. 8,
	* Incomplete Gamma and Related Functions,
	* http://dlmf.nist.gov/8.17
	*
	* [3] Lentz, Generating Bessel functions in Mie scattering calculations using
	* continued fractions, Applied Optics, Vol. 15, No. 3, 1976
	*
	* [4] Thompson and Barnett, Coulomb and Bessel Functions of Complex Arguments
	* and Order, Journal of Computational Physics, Vol. 64, 1986
	*
	* [5] Cuyt et al., Handbook of Continued Fractions for Special Functions,
	* Springer, 2008
	*
	* [6] Gil et al., Numerical Methods for Special Functions, SIAM, 2008
	*
	* [7] Press et al., Numerical Recipes in C++, 3rd edition,
	* Cambridge Univ. Press, 2007
	*
	* [8] DiDonato, Morris, Jr., Algorithm 708: Significant Digit Computation of
	* the Incomplete Beta Function Ratios, ACM Transactions on Mathematical
	* Software, Vol. 18, No. 3, 1992
	*
	* Approximating the Student-T distribution function with the normal
	* distribution:
	*
	* [9] Gleason, A note on a proposed student t approximation, Computational
	* Statistics & Data Analysis, Vol. 34, No. 1, 2000
	*
	* [10] Gaver and Kafadar, A Retrievable Recipe for Inverse t, The American
	* Statistician, Vol. 38, No. 4, 1984
	*/

	/* Type definitions for float8, uint32, ... occur in postgres.h.
	* Postgres-specific code only begins after: -- POSTGRES -- */
	#include "postgres.h"

	#include "student.h"

	#include <math.h>


	/* Prototypes of internal functions */

	static inline float8 normal_cdf(float8 t);
	static float8 studentT_cdf_approx(int64 nu, float8 t);


	/**
	* Compute Pr[T <= t] for Student-t distributed T with nu degrees of freedom.
	*
	* For nu >= 1000000, we just use the normal distribution as an approximation.
	* For 1000000 >= nu >= 200, we use a simple approximation from [9].
	* We are much more cautious than usual here (it is folklore that the normal
	* distribution is a "good" estimate for Student-T if nu >= 30), but we can
	* afford the extra work as this function is not designed to be called from
	* inner loops. Performance should still be reasonably good, with at most ~100
	* iterations in any case (just one if nu >= 200).
	*
	* For nu < 200, we use the series expansions 26.7.3 and 26.7.4 from [1] and
	* substitute sin(theta) = t/sqrt(n * z), where z = 1 + t^2/nu.
	*
	* This gives:
	* @verbatim
	* t
	* A(t\|1) = 2 arctan( -------- ) ,
	* sqrt(nu)
	*
	* (nu-3)/2
	* 2 [ t t -- 2 * 4 * ... * (2i) ]
	* A(t\|nu) = - * [ arctan( -------- ) + ------------ * \ ---------------------- ]
	* π [ sqrt(nu) sqrt(nu) * z /_ 3 * ... * (2i+1) * z^i ]
	* i=0
	* for odd nu > 1, and
	*
	* (nu-2)/2
	* t -- 1 * 3 * ... * (2i - 1)
	* A(t\|nu) = ------------ * \ ------------------------ for even nu,
	* sqrt(nu * z) /_ 2 * 4 * ... * (2i) * z^i
	* i=0
	*
	* where A(t\|nu) = Pr[\|T\| <= t].
	* @endverbatim
	*
	* @param nu Degree of freedom (>= 1)
	* @param t Argument to cdf.
	*
	* Note: The running time of calculating the series is proportional to nu. This
	* We therefore use the normal distribution as an approximation for large nu.
	* Another idea for handling this case can be found in reference [8].
	*/

	float8 studentT_cdf(int64 nu, float8 t) {
	float8 z,
	t_by_sqrt_nu;
	float8 A, /* contains A(t\|nu) */
	prod = 1.,
	sum = 1.;

	/* Handle extreme cases. See above. */

	if (nu <= 0)
	return NAN;
	else if (nu >= 1000000)
	return normal_cdf(t);
	else if (nu >= 200)
	return studentT_cdf_approx(nu, t);

	/* Handle main case (nu < 200) in the rest of the function. */

	z = 1. + t * t / nu;
	t_by_sqrt_nu = fabs(t) / sqrt(nu);

	if (nu == 1)
	{
	A = 2. / M_PI * atan(t_by_sqrt_nu);
	}
	else if (nu & 1) /* odd nu > 1 */
	{
	for (int j = 2; j <= nu - 3; j += 2)
	{
	prod = prod * j / ((j + 1) * z);
	sum = sum + prod;
	}
	A = 2 / M_PI * ( atan(t_by_sqrt_nu) + t_by_sqrt_nu / z * sum );
	}
	else /* even nu */
	{
	for (int j = 2; j <= nu - 2; j += 2)
	{
	prod = prod * (j - 1) / (j * z);
	sum = sum + prod;
	}
	A = t_by_sqrt_nu / sqrt(z) * sum;
	}

	/* A should obviously lie withing the interval [0,1] plus minus (hopefully
	* small) rounding errors. */
	if (A > 1.)
	A = 1.;
	else if (A < 0.)
	A = 0.;

	/* The Student-T distribution is obviously symmetric around t=0... */
	if (t < 0)
	return .5 * (1. - A);
	else
	return 1. - .5 * (1. - A);
	}

	/**
	* Compute the normal distribution function using the library error function.
	*
	* This approximation satisfies
	* rel_error < 0.0001 \|\| abs_error < 0.00000001
	* for all nu >= 1000000. (Tested on Mac OS X 10.6, gcc-4.2.)
	*/

	static inline float8 normal_cdf(float8 t)
	{
	return .5 + .5 * erf(t / sqrt(2.));
	}


	/**
	* Approximate Student-T distribution using a formula suggested in
	* [9], which goes back to an approximation suggested in [10].
	*
	* Compared to the series expansion, this approximation satisfies
	* rel_error < 0.0001 \|\| abs_error < 0.00000001
	* for all nu >= 200. (Tested on Mac OS X 10.6, gcc-4.2.)
	*/
	static float8 studentT_cdf_approx(int64 nu, float8 t)
	{
	float8 g = (nu - 1.5) / ((nu - 1) * (nu - 1)),
	z = sqrt( log(1. + t * t / nu) / g );

	if (t < 0)
	z *= -1.;

	return normal_cdf(z);
	}

	/* -- POSTGRES -- PostgreSQL-specific code begins here. */

	/* Indicate "version 1" calling conventions for all exported functions. */

	PG_FUNCTION_INFO_V1(student_t_cdf);

	#include "utils/builtins.h"

	/**
	* Expose the studentT_cdf as a user-defined function.
	*/

	Datum student_t_cdf(PG_FUNCTION_ARGS)
	{
	int32 nu;
	float8 t;

	/*
	* Perform all the error checking needed to ensure that no one is
	* trying to call this in some sort of crazy way.
	*/
	if (PG_NARGS() != 2)
	{
	ereport(ERROR,
	(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
	errmsg("function \"%s\" called with invalid parameters",
	format_procedure(fcinfo->flinfo->fn_oid))));
	}
	if (PG_ARGISNULL(0) \|\| PG_ARGISNULL(1))
	PG_RETURN_NULL();

	nu = PG_GETARG_INT32(0);
	t = PG_GETARG_FLOAT8(1);

	/* We want to ensure nu > 0 */
	if (nu <= 0)
	PG_RETURN_NULL();

	PG_RETURN_FLOAT8(studentT_cdf(nu, t));
	}