src/modules/prob/student.hpp - madlib - Git at Google

 /* ----------------------------------------------------------------------- *//**
  *
  * @file student.hpp
  *
  * @brief Evaluate the Student's t-distribution function.
  * @author Florian Schoppmann
  * @date   November 2010
  *
  *//* -------------------------------------------------------------------- *//**
  *
  * @file student.hpp
  *
  * Empirical results indicate that the numerical quality of the series
  * expansion from [1] (see notes below) is vastly superior to using continued
  * fractions for computing the cdf via the incomplete beta function.
  *
  * @literature
  *
  * [1] Abramowitz and Stegun, Handbook of Mathematical Functions with Formulas,
  *     Graphs, and Mathematical Tables, 1972
  *     page 948: http://people.math.sfu.ca/~cbm/aands/page_948.htm
  *
  * Further reading (for computing the Student-T cdf via the incomplete beta
  * function):
  *
  * [2] NIST Digital Library of Mathematical Functions, Ch. 8,
  *     Incomplete Gamma and Related Functions,
  *     http://dlmf.nist.gov/8.17
  *
  * [3] Lentz, Generating Bessel functions in Mie scattering calculations using
  *     continued fractions, Applied Optics, Vol. 15, No. 3, 1976
  *
  * [4] Thompson and Barnett, Coulomb and Bessel Functions of Complex Arguments
  *     and Order, Journal of Computational Physics, Vol. 64, 1986
  *
  * [5] Cuyt et al., Handbook of Continued Fractions for Special Functions,
  *     Springer, 2008
  *
  * [6] Gil et al., Numerical Methods for Special Functions, SIAM, 2008
  *
  * [7] Press et al., Numerical Recipes in C++, 3rd edition,
  *     Cambridge Univ. Press, 2007
  *
  * [8] DiDonato, Morris, Jr., Algorithm 708: Significant Digit Computation of
  *     the Incomplete Beta Function Ratios, ACM Transactions on Mathematical
  *     Software, Vol. 18, No. 3, 1992
  *
  * Approximating the Student-T distribution function with the normal
  * distribution:
  *
  * [9]  Gleason, A note on a proposed student t approximation, Computational
  *      Statistics & Data Analysis, Vol. 34, No. 1, 2000
  *
  * [10] Gaver and Kafadar, A Retrievable Recipe for Inverse t, The American
  *      Statistician, Vol. 38, No. 4, 1984
  */

 /**
  * @brief Declarations of the Student's t-distribution UDFs in module prob
  *
  * One DECLARE_UDF invocation each for the entry points named
  * students_t_cdf, students_t_pdf and students_t_quantile. The DECLARE_UDF
  * macro is defined outside this file.
  * NOTE(review): these invocations sit above the include guard, presumably
  * so that the declaration macro can be expanded on every inclusion --
  * confirm against the macro's definition.
  */
 DECLARE_UDF(prob, students_t_cdf)
 DECLARE_UDF(prob, students_t_pdf)
 DECLARE_UDF(prob, students_t_quantile)


 #ifndef MADLIB_MODULES_PROB_STUDENT_T_HPP
 #define MADLIB_MODULES_PROB_STUDENT_T_HPP

 #include <boost/math/distributions/detail/common_error_handling.hpp>
 #include <boost/math/distributions/normal.hpp>
 #include <boost/math/distributions/students_t.hpp>

 namespace madlib {

 namespace modules {

 namespace prob {

 /**
  * @brief Student's t-distribution over double using boost_mathkit_policy
  *
  * boost_mathkit_policy is not declared in this file.
  */
 typedef boost::math::students_t_distribution<double, boost_mathkit_policy>
     students_t;

 namespace {

 /**
  * @brief Compute one-sided Student's t cumulative distribution function
  *
  * We use the series expansions 26.7.3 and 26.7.4 from [1] and
  * substitute sin(theta) = t/sqrt(nu * z), where z = 1 + t^2/nu.
  *
  * This gives:
  * @verbatim
  *             2              t
  *   A(t|1)  = - * arctan( -------- ) ,
  *             π           sqrt(nu)
  *
  *                                                    (nu-3)/2
  *             2   [            t              t         --    2 * 4 * ... * (2i)  ]
  *   A(t|nu) = - * [ arctan( -------- ) + ------------ * \  ---------------------- ]
  *             π   [         sqrt(nu)     sqrt(nu) * z   /_ 3 * ... * (2i+1) * z^i ]
  *                                                       i=0
  *           for odd nu > 1, and
  *
  *                         (nu-2)/2
  *                  t         -- 1 * 3 * ... * (2i - 1)
  *   A(t|nu) = ------------ * \  ------------------------ for even nu,
  *             sqrt(nu * z)   /_ 2 * 4 * ... * (2i) * z^i
  *                            i=0
  *
  * where A(t|nu) = Pr[|T| <= t].
  * @endverbatim
  *
  * @param t
  * @param nu Degree of freedom \f$ \nu > 0 \f$
  * @return \f$ \Pr[|T| < t] \f$ where \f$ t \geq 0 \f$, \f$ T \f$ is a Student's
  *     T-distributed random variable with \f$ \nu \f$ degrees of
  *     freedom.
  *
  * Note: The running time of calculating the series is proportional to nu.
  * We therefore use the normal distribution as an approximation for large nu.
  * Another idea for handling this case can be found in reference [8].
  */
 template <class RealType>
 inline
 RealType
 oneSidedStudentsT_CDF(const RealType& t,  uint64_t nu) {
     /* Quantities shared by all branches: z = 1 + t^2/nu and |t|/sqrt(nu). */
     const double    dof = static_cast<double>(nu);
     const RealType  z = 1. + t * t / dof;
     const RealType  scaledAbsT = std::fabs(t) / std::sqrt(dof);

     /* Both series start with the i = 0 term, which equals 1. */
     RealType    term = 1.;
     RealType    series = 1.;
     RealType    prob; /* will hold A(t|nu) */

     if (nu == 1) {
         /* Closed form, no series needed. */
         prob = 2. / M_PI * std::atan(scaledAbsT);
     } else if ((nu & 1) != 0) {
         /* Odd nu > 1: each step multiplies the term by k / ((k+1) * z). */
         uint64_t k = 2;
         while (k + 3 <= nu) {
             term = term * static_cast<double>(k)
                  / (static_cast<double>(k + 1) * z);
             series = series + term;
             k += 2;
         }
         prob = 2 / M_PI
              * ( std::atan(scaledAbsT) + scaledAbsT / z * series );
     } else {
         /* Even nu: each step multiplies the term by (k-1) / (k * z). */
         uint64_t k = 2;
         while (k + 2 <= nu) {
             term = term * static_cast<double>(k - 1)
                  / (static_cast<double>(k) * z);
             series = series + term;
             k += 2;
         }
         prob = scaledAbsT / std::sqrt(z) * series;
     }

     /* Rounding may push the value slightly outside [0,1]; clamp it. The
      * explicit comparisons let a NaN pass through unchanged. */
     if (prob > 1.)
         return 1.;
     if (prob < 0.)
         return 0.;
     return prob;
 }

 /**
  * @brief Compute parameter for normal CDF for approximating the Student's T CDF
  *
  * Gleason suggested a formula for approximating the Student's
  * t-distribution [9], which goes back to an approximation suggested in [10].
  *
  * Compared to the series expansion, this approximation satisfies
  * rel_error < 0.0001 || abs_error < 0.00000001
  * for all nu >= 200. (Tested on Mac OS X 10.6, gcc-4.2.)
  *
  * @param t
  * @param nu Degree of freedom \f$ \nu > 0 \f$
  * @returns A value \f$ z \f$ such that for a Student's t-distributed
  *     random variable \f$ T \f$ with \f$ nu \f$ degrees of freedom and a
  *     standard normally distributed random variable \f$ Z \f$, it holds that
  *     \f$ \Pr[T \leq t] \approx \Pr[Z \leq z] \f$.
  */
 template <class RealType>
 inline
 RealType
 GleasonsNormalApproxForStudentsT(const RealType& t, const RealType& nu) {
     /* Correction factor g(nu) = (nu - 1.5) / (nu - 1)^2. All intermediate
      * values use RealType so that no precision is lost for types wider than
      * double. */
     const RealType nuMinusOne = nu - RealType(1);
     const RealType g = (nu - RealType(1.5)) / (nuMinusOne * nuMinusOne);

     /* |z| = sqrt( log(1 + t^2/nu) / g ). log1p is used because forming
      * 1 + t^2/nu explicitly rounds to exactly 1 once t^2/nu drops below the
      * machine epsilon, which would collapse the result to 0 for small |t|. */
     const RealType magnitude = std::sqrt(std::log1p(t * t / nu) / g);

     /* The transformation is odd in t: give the result the sign of t. */
     return t < 0 ? -magnitude : magnitude;
 }

 } // anonymous namespace

 /**
  * @brief Compute Student's cumulative distribution function
  *
  * If nu is not finite, or if its fractional part (nu - floor(nu)) exceeds 1%
  * of nu, we call the student-t CDF from boost (this test is made first).
  * Otherwise:
  * For nu >= 1000000, we just use the normal distribution as an approximation.
  * For 1000000 > nu >= 200, we use a simple approximation from [9].
  * For smaller nu, our approach should be much more precise than
  * using the incomplete beta function as boost does (see the references).
  *
  * We are much more cautious than usual here (it is folklore that the normal
  * distribution is a "good" estimate for Student-T if nu >= 30), but we can
  * afford the extra work as this function is not designed to be called from
  * inner loops. Performance should still be reasonably good, with at most ~100
  * iterations in any case (just one if nu >= 200).
  *
  * For nu < 200, we use the series expansions 26.7.3 and 26.7.4 from [1] and
  * substitute sin(theta) = t/sqrt(nu * z), where z = 1 + t^2/nu (using
  * oneSidedStudentsT_CDF()).
  *
  * @param dist A Student's t-distribution object, containing the degree of
  *     freedom \f$ \nu \f$
  * @param t
  * @return \f$ \Pr[T < t] \f$ for any real \f$ t \f$, where \f$ T \f$ is a
  *     Student's T-distributed random variable with \f$ \nu \f$ degrees of
  *     freedom.
  */
 template <class RealType, class Policy>
 inline
 RealType
 cdf(const boost::math::students_t_distribution<RealType, Policy>& dist,
     const RealType& t) {

     const RealType df = dist.degrees_of_freedom();

     /* Delegate to Boost when df is not finite or when its fractional part
      * exceeds 1% of df. The negated comparison (rather than <=) keeps a NaN
      * ratio on the non-delegating path.
      * FIXME: Add some justification/do some tests. */
     const bool nearlyIntegral = std::isfinite(df)
         && !(std::fabs(df - std::floor(df))/df > 0.01);
     if (!nearlyIntegral)
         return boost::math::cdf(dist, t);

     static const char* function = "madlib::modules::prob::cdf("
         "const students_t_distribution<%1%>&, %1%)";

     /* check_df() fills in the value to return when df is rejected. */
     RealType errorResult;
     if (!boost::math::detail::check_df(function, df, &errorResult, Policy()))
         return errorResult;

     if (df >= 200) {
         /* Large df: evaluate the standard normal CDF, either at t itself
          * (df >= 1000000) or at Gleason's transformed argument. */
         const boost::math::normal_distribution<RealType, Policy> stdNormal;
         const RealType x = (df >= 1000000)
             ? t
             : GleasonsNormalApproxForStudentsT(t, df);
         return boost::math::cdf(stdNormal, x);
     }

     /* Small df: the series expansion yields A = Pr[|T| < |t|]. */
     const RealType A = oneSidedStudentsT_CDF(t, static_cast<uint64_t>(df));

     /* The distribution is symmetric around 0, so the result is
      * (1 - A)/2 for negative t and (1 + A)/2 otherwise. The latter lies in
      * [0.5, 1], so adding 1 and A is harmless even if A << 1.
      * FIXME: For t < 0 and A close to 1, 1 - A loses significance. */
     return (t < 0) ? .5 * (1. - A) : .5 * (1. + A);
 }

 /**
  * @brief Compute the complement of Student's cumulative distribution function
  */
 template <class RealType, class Policy>
 inline
 RealType
 cdf(
     const boost::math::complemented2_type<
         boost::math::students_t_distribution<RealType, Policy>,
         RealType
     >& c
 ) {
     const RealType df = c.dist.degrees_of_freedom();

     /* Below 200 degrees of freedom (or for a NaN df) use the symmetry of
      * the distribution: the complement at x is the cdf at -x. */
     if (!(df >= 200))
         return prob::cdf(c.dist, -c.param);

     static const char* function = "madlib::modules::prob::cdf("
         "const complement(students_t_distribution<%1%>&), %1%)";

     /* check_df() fills in the value to return when df is rejected. */
     RealType errorResult;
     if (!boost::math::detail::check_df(function, df, &errorResult, Policy()))
         return errorResult;

     /* Large df: complement of the standard normal CDF, evaluated at the
      * argument itself (df >= 1000000) or at Gleason's transformed
      * argument. Both operands are named locals, so they stay alive while
      * the complement wrapper is in use. */
     const boost::math::normal_distribution<RealType, Policy> stdNormal;
     const RealType x = (df >= 1000000)
         ? c.param
         : GleasonsNormalApproxForStudentsT(c.param, df);
     return boost::math::cdf(complement(stdNormal, x));
 }

 /**
  * @brief Compute the Student's t probability density function
  *
  * Forwards both arguments unchanged to boost::math::pdf().
  *
  * @param dist A Student's t-distribution object
  * @param t Point at which boost::math::pdf() is evaluated
  * @return The value returned by boost::math::pdf(dist, t)
  */
 template <class RealType, class Policy>
 inline
 RealType
 pdf(const boost::math::students_t_distribution<RealType, Policy>& dist,
     const RealType& t) {
     return boost::math::pdf(dist, t);
 }

 /**
  * @brief Overload of pdf() for a complemented Student's t-distribution
  *
  * Forwards the complement wrapper unchanged to boost::math::pdf().
  * NOTE(review): confirm that Boost.Math actually provides a pdf() overload
  * for complemented2_type; being a template, this body is only checked when
  * it is instantiated.
  *
  * @param c Complement wrapper around a Student's t-distribution and a value
  * @return The value returned by boost::math::pdf(c)
  */
 template <class RealType, class Policy>
 inline
 RealType
 pdf(
     const boost::math::complemented2_type<
         boost::math::students_t_distribution<RealType, Policy>,
         RealType
     >& c
 ) {
     return boost::math::pdf(c);
 }

 /**
  * @brief Compute the quantile of Student's t-distribution
  *
  * Validates the degrees of freedom and the probability explicitly before
  * delegating to boost::math::quantile().
  *
  * @param dist A Student's t-distribution object
  * @param p Probability that is validated and then passed on to Boost
  * @return The result value filled in by the failed check if validation
  *     fails, otherwise boost::math::quantile(dist, p)
  */
 template <class RealType, class Policy>
 inline
 RealType
 quantile(const boost::math::students_t_distribution<RealType, Policy>& dist,
     const RealType& p) {

     static const char* function = "madlib::modules::prob::quantile("
         "const students_t_distribution<%1%>&, %1%)";

     // FIXME: Boost bug 6937 prevent proper argument validation.
     // https://svn.boost.org/trac/boost/ticket/6937
     // Until this is fixed upstream, both arguments are checked here. The
     // probability is only checked if the degrees of freedom are accepted.
     const RealType df = dist.degrees_of_freedom();
     RealType errorResult;
     const bool argumentsValid =
         boost::math::detail::check_df(function, df, &errorResult, Policy())
         && boost::math::detail::check_probability(
                function, p, &errorResult, Policy());
     if (!argumentsValid)
         return errorResult;

     return boost::math::quantile(dist, p);
 }

 /**
  * @brief Compute the quantile of the complemented Student's t-distribution
  *
  * Forwards the complement wrapper unchanged to boost::math::quantile().
  * Unlike the non-complemented overload in this file, no degrees-of-freedom
  * or probability check is made here before calling Boost.
  *
  * @param c Complement wrapper around a Student's t-distribution and a value
  * @return The value returned by boost::math::quantile(c)
  */
 template <class RealType, class Policy>
 inline
 RealType
 quantile(
     const boost::math::complemented2_type<
         boost::math::students_t_distribution<RealType, Policy>,
         RealType
     >& c
 ) {
     return boost::math::quantile(c);
 }

 } // namespace prob

 } // namespace modules

 } // namespace madlib

 #endif // defined(MADLIB_MODULES_PROB_STUDENT_T_HPP)
	/* ----------------------------------------------------------------------- *//**
	*
	* @file student.hpp
	*
	* @brief Evaluate the Student's t-distribution function.
	* @author Florian Schoppmann
	* @date November 2010
	*
	*//* -------------------------------------------------------------------- *//**
	*
	* @file student.hpp
	*
	* Empirical results indicate that the numerical quality of the series
	* expansion from [1] (see notes below) is vastly superior to using continued
	* fractions for computing the cdf via the incomplete beta function.
	*
	* @literature
	*
	* [1] Abramowitz and Stegun, Handbook of Mathematical Functions with Formulas,
	* Graphs, and Mathematical Tables, 1972
	* page 948: http://people.math.sfu.ca/~cbm/aands/page_948.htm
	*
	* Further reading (for computing the Student-T cdf via the incomplete beta
	* function):
	*
	* [2] NIST Digital Library of Mathematical Functions, Ch. 8,
	* Incomplete Gamma and Related Functions,
	* http://dlmf.nist.gov/8.17
	*
	* [3] Lentz, Generating Bessel functions in Mie scattering calculations using
	* continued fractions, Applied Optics, Vol. 15, No. 3, 1976
	*
	* [4] Thompson and Barnett, Coulomb and Bessel Functions of Complex Arguments
	* and Order, Journal of Computational Physics, Vol. 64, 1986
	*
	* [5] Cuyt et al., Handbook of Continued Fractions for Special Functions,
	* Springer, 2008
	*
	* [6] Gil et al., Numerical Methods for Special Functions, SIAM, 2008
	*
	* [7] Press et al., Numerical Recipes in C++, 3rd edition,
	* Cambridge Univ. Press, 2007
	*
	* [8] DiDonato, Morris, Jr., Algorithm 708: Significant Digit Computation of
	* the Incomplete Beta Function Ratios, ACM Transactions on Mathematical
	* Software, Vol. 18, No. 3, 1992
	*
	* Approximating the Student-T distribution function with the normal
	* distribution:
	*
	* [9] Gleason, A note on a proposed student t approximation, Computational
	* Statistics & Data Analysis, Vol. 34, No. 1, 2000
	*
	* [10] Gaver and Kafadar, A Retrievable Recipe for Inverse t, The American
	* Statistician, Vol. 38, No. 4, 1984
	*/

	/**
	* @brief Declarations of the Student's t-distribution UDFs in module prob
	*
	* One DECLARE_UDF invocation each for the entry points named
	* students_t_cdf, students_t_pdf and students_t_quantile. The DECLARE_UDF
	* macro is defined outside this file.
	* NOTE(review): the same three invocations already appear earlier in this
	* file -- confirm whether this repetition is intended.
	*/
	DECLARE_UDF(prob, students_t_cdf)
	DECLARE_UDF(prob, students_t_pdf)
	DECLARE_UDF(prob, students_t_quantile)


	#ifndef MADLIB_MODULES_PROB_STUDENT_T_HPP
	#define MADLIB_MODULES_PROB_STUDENT_T_HPP

	#include <boost/math/distributions/detail/common_error_handling.hpp>
	#include <boost/math/distributions/normal.hpp>
	#include <boost/math/distributions/students_t.hpp>

	namespace madlib {

	namespace modules {

	namespace prob {

	/**
	* @brief Student's t-distribution over double using boost_mathkit_policy
	*
	* boost_mathkit_policy is not declared in this file.
	*/
	typedef boost::math::students_t_distribution<double, boost_mathkit_policy>
	students_t;

	namespace {

	/**
	* @brief Compute one-sided Student's t cumulative distribution function
	*
	* We use the series expansions 26.7.3 and 26.7.4 from [1] and
	* substitute sin(theta) = t/sqrt(nu * z), where z = 1 + t^2/nu.
	*
	* This gives:
	* @verbatim
	*             2              t
	*   A(t|1)  = - * arctan( -------- ) ,
	*             π           sqrt(nu)
	*
	*                                                    (nu-3)/2
	*             2   [            t              t         --    2 * 4 * ... * (2i)  ]
	*   A(t|nu) = - * [ arctan( -------- ) + ------------ * \  ---------------------- ]
	*             π   [         sqrt(nu)     sqrt(nu) * z   /_ 3 * ... * (2i+1) * z^i ]
	*                                                       i=0
	*           for odd nu > 1, and
	*
	*                         (nu-2)/2
	*                  t         -- 1 * 3 * ... * (2i - 1)
	*   A(t|nu) = ------------ * \  ------------------------ for even nu,
	*             sqrt(nu * z)   /_ 2 * 4 * ... * (2i) * z^i
	*                            i=0
	*
	* where A(t|nu) = Pr[|T| <= t].
	* @endverbatim
	*
	* @param t
	* @param nu Degree of freedom \f$ \nu > 0 \f$
	* @return \f$ \Pr[|T| < t] \f$ where \f$ t \geq 0 \f$, \f$ T \f$ is a Student's
	* T-distributed random variable with \f$ \nu \f$ degrees of
	* freedom.
	*
	* Note: The running time of calculating the series is proportional to nu.
	* We therefore use the normal distribution as an approximation for large nu.
	* Another idea for handling this case can be found in reference [8].
	*/
	template <class RealType>
	inline
	RealType
	oneSidedStudentsT_CDF(const RealType& t, uint64_t nu) {
	    /* Quantities shared by all branches: z = 1 + t^2/nu and |t|/sqrt(nu). */
	    const double    dof = static_cast<double>(nu);
	    const RealType  z = 1. + t * t / dof;
	    const RealType  scaledAbsT = std::fabs(t) / std::sqrt(dof);

	    /* Both series start with the i = 0 term, which equals 1. */
	    RealType    term = 1.;
	    RealType    series = 1.;
	    RealType    prob; /* will hold A(t|nu) */

	    if (nu == 1) {
	        /* Closed form, no series needed. */
	        prob = 2. / M_PI * std::atan(scaledAbsT);
	    } else if ((nu & 1) != 0) {
	        /* Odd nu > 1: each step multiplies the term by k / ((k+1) * z). */
	        uint64_t k = 2;
	        while (k + 3 <= nu) {
	            term = term * static_cast<double>(k)
	                 / (static_cast<double>(k + 1) * z);
	            series = series + term;
	            k += 2;
	        }
	        prob = 2 / M_PI
	             * ( std::atan(scaledAbsT) + scaledAbsT / z * series );
	    } else {
	        /* Even nu: each step multiplies the term by (k-1) / (k * z). */
	        uint64_t k = 2;
	        while (k + 2 <= nu) {
	            term = term * static_cast<double>(k - 1)
	                 / (static_cast<double>(k) * z);
	            series = series + term;
	            k += 2;
	        }
	        prob = scaledAbsT / std::sqrt(z) * series;
	    }

	    /* Rounding may push the value slightly outside [0,1]; clamp it. The
	     * explicit comparisons let a NaN pass through unchanged. */
	    if (prob > 1.)
	        return 1.;
	    if (prob < 0.)
	        return 0.;
	    return prob;
	}

	/**
	* @brief Compute parameter for normal CDF for approximating the Student's T CDF
	*
	* Gleason suggested a formula for approximating the Student's
	* t-distribution [9], which goes back to an approximation suggested in [10].
	*
	* Compared to the series expansion, this approximation satisfies
	* rel_error < 0.0001 || abs_error < 0.00000001
	* for all nu >= 200. (Tested on Mac OS X 10.6, gcc-4.2.)
	*
	* @param t
	* @param nu Degree of freedom \f$ \nu > 0 \f$
	* @returns A value \f$ z \f$ such that for a Student's t-distributed
	* random variable \f$ T \f$ with \f$ nu \f$ degrees of freedom and a
	* standard normally distributed random variable \f$ Z \f$, it holds that
	* \f$ \Pr[T \leq t] \approx \Pr[Z \leq z] \f$.
	*/
	template <class RealType>
	inline
	RealType
	GleasonsNormalApproxForStudentsT(const RealType& t, const RealType& nu) {
	    /* Correction factor g(nu) = (nu - 1.5) / (nu - 1)^2. All intermediate
	     * values use RealType so that no precision is lost for types wider than
	     * double. */
	    const RealType nuMinusOne = nu - RealType(1);
	    const RealType g = (nu - RealType(1.5)) / (nuMinusOne * nuMinusOne);

	    /* |z| = sqrt( log(1 + t^2/nu) / g ). log1p is used because forming
	     * 1 + t^2/nu explicitly rounds to exactly 1 once t^2/nu drops below the
	     * machine epsilon, which would collapse the result to 0 for small |t|. */
	    const RealType magnitude = std::sqrt(std::log1p(t * t / nu) / g);

	    /* The transformation is odd in t: give the result the sign of t. */
	    return t < 0 ? -magnitude : magnitude;
	}

	} // anonymous namespace

	/**
	* @brief Compute Student's cumulative distribution function
	*
	* If nu is not finite, or if its fractional part (nu - floor(nu)) exceeds 1%
	* of nu, we call the student-t CDF from boost (this test is made first).
	* Otherwise:
	* For nu >= 1000000, we just use the normal distribution as an approximation.
	* For 1000000 > nu >= 200, we use a simple approximation from [9].
	* For smaller nu, our approach should be much more precise than
	* using the incomplete beta function as boost does (see the references).
	*
	* We are much more cautious than usual here (it is folklore that the normal
	* distribution is a "good" estimate for Student-T if nu >= 30), but we can
	* afford the extra work as this function is not designed to be called from
	* inner loops. Performance should still be reasonably good, with at most ~100
	* iterations in any case (just one if nu >= 200).
	*
	* For nu < 200, we use the series expansions 26.7.3 and 26.7.4 from [1] and
	* substitute sin(theta) = t/sqrt(nu * z), where z = 1 + t^2/nu (using
	* oneSidedStudentsT_CDF()).
	*
	* @param dist A Student's t-distribution object, containing the degree of
	* freedom \f$ \nu \f$
	* @param t
	* @return \f$ \Pr[T < t] \f$ for any real \f$ t \f$, where \f$ T \f$ is a
	* Student's T-distributed random variable with \f$ \nu \f$ degrees of
	* freedom.
	*/
	template <class RealType, class Policy>
	inline
	RealType
	cdf(const boost::math::students_t_distribution<RealType, Policy>& dist,
	    const RealType& t) {

	    const RealType df = dist.degrees_of_freedom();

	    /* Delegate to Boost when df is not finite or when its fractional part
	     * exceeds 1% of df. The negated comparison (rather than <=) keeps a NaN
	     * ratio on the non-delegating path.
	     * FIXME: Add some justification/do some tests. */
	    const bool nearlyIntegral = std::isfinite(df)
	        && !(std::fabs(df - std::floor(df))/df > 0.01);
	    if (!nearlyIntegral)
	        return boost::math::cdf(dist, t);

	    static const char* function = "madlib::modules::prob::cdf("
	        "const students_t_distribution<%1%>&, %1%)";

	    /* check_df() fills in the value to return when df is rejected. */
	    RealType errorResult;
	    if (!boost::math::detail::check_df(function, df, &errorResult, Policy()))
	        return errorResult;

	    if (df >= 200) {
	        /* Large df: evaluate the standard normal CDF, either at t itself
	         * (df >= 1000000) or at Gleason's transformed argument. */
	        const boost::math::normal_distribution<RealType, Policy> stdNormal;
	        const RealType x = (df >= 1000000)
	            ? t
	            : GleasonsNormalApproxForStudentsT(t, df);
	        return boost::math::cdf(stdNormal, x);
	    }

	    /* Small df: the series expansion yields A = Pr[|T| < |t|]. */
	    const RealType A = oneSidedStudentsT_CDF(t, static_cast<uint64_t>(df));

	    /* The distribution is symmetric around 0, so the result is
	     * (1 - A)/2 for negative t and (1 + A)/2 otherwise. The latter lies in
	     * [0.5, 1], so adding 1 and A is harmless even if A << 1.
	     * FIXME: For t < 0 and A close to 1, 1 - A loses significance. */
	    return (t < 0) ? .5 * (1. - A) : .5 * (1. + A);
	}

	/**
	* @brief Compute the complement of Student's cumulative distribution function
	*/
	template <class RealType, class Policy>
	inline
	RealType
	cdf(
	    const boost::math::complemented2_type<
	        boost::math::students_t_distribution<RealType, Policy>,
	        RealType
	    >& c
	) {
	    const RealType df = c.dist.degrees_of_freedom();

	    /* Below 200 degrees of freedom (or for a NaN df) use the symmetry of
	     * the distribution: the complement at x is the cdf at -x. */
	    if (!(df >= 200))
	        return prob::cdf(c.dist, -c.param);

	    static const char* function = "madlib::modules::prob::cdf("
	        "const complement(students_t_distribution<%1%>&), %1%)";

	    /* check_df() fills in the value to return when df is rejected. */
	    RealType errorResult;
	    if (!boost::math::detail::check_df(function, df, &errorResult, Policy()))
	        return errorResult;

	    /* Large df: complement of the standard normal CDF, evaluated at the
	     * argument itself (df >= 1000000) or at Gleason's transformed
	     * argument. Both operands are named locals, so they stay alive while
	     * the complement wrapper is in use. */
	    const boost::math::normal_distribution<RealType, Policy> stdNormal;
	    const RealType x = (df >= 1000000)
	        ? c.param
	        : GleasonsNormalApproxForStudentsT(c.param, df);
	    return boost::math::cdf(complement(stdNormal, x));
	}

	/**
	* @brief Compute the Student's t probability density function
	*
	* Forwards both arguments unchanged to boost::math::pdf().
	*
	* @param dist A Student's t-distribution object
	* @param t Point at which boost::math::pdf() is evaluated
	* @return The value returned by boost::math::pdf(dist, t)
	*/
	template <class RealType, class Policy>
	inline
	RealType
	pdf(const boost::math::students_t_distribution<RealType, Policy>& dist,
	const RealType& t) {
	return boost::math::pdf(dist, t);
	}

	/**
	* @brief Overload of pdf() for a complemented Student's t-distribution
	*
	* Forwards the complement wrapper unchanged to boost::math::pdf().
	* NOTE(review): confirm that Boost.Math actually provides a pdf() overload
	* for complemented2_type; being a template, this body is only checked when
	* it is instantiated.
	*
	* @param c Complement wrapper around a Student's t-distribution and a value
	* @return The value returned by boost::math::pdf(c)
	*/
	template <class RealType, class Policy>
	inline
	RealType
	pdf(
	const boost::math::complemented2_type<
	boost::math::students_t_distribution<RealType, Policy>,
	RealType
	>& c
	) {
	return boost::math::pdf(c);
	}

	/**
	* @brief Compute the quantile of Student's t-distribution
	*
	* Validates the degrees of freedom and the probability explicitly before
	* delegating to boost::math::quantile().
	*
	* @param dist A Student's t-distribution object
	* @param p Probability that is validated and then passed on to Boost
	* @return The result value filled in by the failed check if validation
	*     fails, otherwise boost::math::quantile(dist, p)
	*/
	template <class RealType, class Policy>
	inline
	RealType
	quantile(const boost::math::students_t_distribution<RealType, Policy>& dist,
	    const RealType& p) {

	    static const char* function = "madlib::modules::prob::quantile("
	        "const students_t_distribution<%1%>&, %1%)";

	    // FIXME: Boost bug 6937 prevent proper argument validation.
	    // https://svn.boost.org/trac/boost/ticket/6937
	    // Until this is fixed upstream, both arguments are checked here. The
	    // probability is only checked if the degrees of freedom are accepted.
	    const RealType df = dist.degrees_of_freedom();
	    RealType errorResult;
	    const bool argumentsValid =
	        boost::math::detail::check_df(function, df, &errorResult, Policy())
	        && boost::math::detail::check_probability(
	               function, p, &errorResult, Policy());
	    if (!argumentsValid)
	        return errorResult;

	    return boost::math::quantile(dist, p);
	}

	/**
	* @brief Compute the quantile of the complemented Student's t-distribution
	*
	* Forwards the complement wrapper unchanged to boost::math::quantile().
	* Unlike the non-complemented overload in this file, no degrees-of-freedom
	* or probability check is made here before calling Boost.
	*
	* @param c Complement wrapper around a Student's t-distribution and a value
	* @return The value returned by boost::math::quantile(c)
	*/
	template <class RealType, class Policy>
	inline
	RealType
	quantile(
	const boost::math::complemented2_type<
	boost::math::students_t_distribution<RealType, Policy>,
	RealType
	>& c
	) {
	return boost::math::quantile(c);
	}

	} // namespace prob

	} // namespace modules

	} // namespace madlib

	#endif // defined(MADLIB_MODULES_PROB_STUDENT_T_HPP)