blob: 8dcbedf0b91c61c8977b2d08fce0dc2386ee766c [file] [log] [blame]
/* ----------------------------------------------------------------------- *//**
*
* @file student.hpp
*
* @brief Evaluate the Student's t-distribution function.
* @author Florian Schoppmann
* @date November 2010
*
*//* -------------------------------------------------------------------- *//**
*
* @file student.hpp
*
* Emprirical results indicate that the numerical quality of the series
* expansion from [1] (see notes below) is vastly superior to using continued
* fractions for computing the cdf via the incomplete beta function.
*
* @literature
*
* [1] Abramowitz and Stegun, Handbook of Mathematical Functions with Formulas,
* Graphs, and Mathematical Tables, 1972
* page 948: http://people.math.sfu.ca/~cbm/aands/page_948.htm
*
* Further reading (for computing the Student-T cdf via the incomplete beta
* function):
*
* [2] NIST Digital Library of Mathematical Functions, Ch. 8,
* Incomplete Gamma and Related Functions,
* http://dlmf.nist.gov/8.17
*
* [3] Lentz, Generating Bessel functions in Mie scattering calculations using
* continued fractions, Applied Optics, Vol. 15, No. 3, 1976
*
* [4] Thompson and Barnett, Coulomb and Bessel Functions of Complex Arguments
* and Order, Journal of Computational Physics, Vol. 64, 1986
*
* [5] Cuyt et al., Handbook of Continued Fractions for Special Functions,
* Springer, 2008
*
* [6] Gil et al., Numerical Methods for Special Functions, SIAM, 2008
*
* [7] Press et al., Numerical Recipes in C++, 3rd edition,
* Cambridge Univ. Press, 2007
*
* [8] DiDonato, Morris, Jr., Algorithm 708: Significant Digit Computation of
* the Incomplete Beta Function Ratios, ACM Transactions on Mathematical
* Software, Vol. 18, No. 3, 1992
*
* Approximating the Student-T distribution function with the normal
* distribution:
*
* [9] Gleason, A note on a proposed student t approximation, Computational
* Statistics & Data Analysis, Vol. 34, No. 1, 2000
*
* [10] Gaver and Kafadar, A Retrievable Recipe for Inverse t, The American
* Statistician, Vol. 38, No. 4, 1984
*/
/**
* @brief Student-t cumulative distribution function
*/
DECLARE_UDF(prob, students_t_cdf)
DECLARE_UDF(prob, students_t_pdf)
DECLARE_UDF(prob, students_t_quantile)
#ifndef MADLIB_MODULES_PROB_STUDENT_T_HPP
#define MADLIB_MODULES_PROB_STUDENT_T_HPP
#include <boost/math/distributions/detail/common_error_handling.hpp>
#include <boost/math/distributions/normal.hpp>
#include <boost/math/distributions/students_t.hpp>
namespace madlib {
namespace modules {
namespace prob {
typedef boost::math::students_t_distribution<double, boost_mathkit_policy>
students_t;
namespace {
/**
* @brief Compute one-sided Student's t cumulative distribution function
*
* We use the series expansions 26.7.3 and 26.7.4 from [1] and
* substitute sin(theta) = t/sqrt(n * z), where z = 1 + t^2/nu.
*
* This gives:
* @verbatim
* t
* A(t|1) = 2 arctan( -------- ) ,
* sqrt(nu)
*
* (nu-3)/2
* 2 [ t t -- 2 * 4 * ... * (2i) ]
* A(t|nu) = - * [ arctan( -------- ) + ------------ * \ ---------------------- ]
* π [ sqrt(nu) sqrt(nu) * z /_ 3 * ... * (2i+1) * z^i ]
* i=0
* for odd nu > 1, and
*
* (nu-2)/2
* t -- 1 * 3 * ... * (2i - 1)
* A(t|nu) = ------------ * \ ------------------------ for even nu,
* sqrt(nu * z) /_ 2 * 4 * ... * (2i) * z^i
* i=0
*
* where A(t|nu) = Pr[|T| <= t].
* @endverbatim
*
* @param t
* @param nu Degree of freedom \f$ \nu > 0 \f$
* @return \f$ \Pr[|T| < t] \f$ where \f$ t \geq 0 \f$, \f$ T \f$ is a Student's
* T-distributed random variable with \f$ \nu \f$ degrees of
* freedom.
*
* Note: The running time of calculating the series is proportional to nu.
* We therefore use the normal distribution as an approximation for large nu.
* Another idea for handling this case can be found in reference [8].
*/
template <class RealType>
inline
RealType
oneSidedStudentsT_CDF(const RealType& t, uint64_t nu) {
RealType z,
t_by_sqrt_nu;
RealType A, /* contains A(t|nu) */
prod = 1.,
sum = 1.;
/* Handle main case (nu \in {1, ..., 200}) in the rest of the function. */
z = 1. + t * t / static_cast<double>(nu);
t_by_sqrt_nu = std::fabs(t) / std::sqrt(static_cast<double>(nu));
if (nu == 1)
{
A = 2. / M_PI * std::atan(t_by_sqrt_nu);
}
else if (nu & 1) /* odd nu > 1 */
{
for (uint64_t j = 2; j + 3 <= nu; j += 2)
{
prod = prod * static_cast<double>(j)
/ (static_cast<double>(j + 1) * z);
sum = sum + prod;
}
A = 2 / M_PI * ( std::atan(t_by_sqrt_nu) + t_by_sqrt_nu / z * sum );
}
else /* even nu */
{
for (uint64_t j = 2; j + 2 <= nu; j += 2)
{
prod = prod * static_cast<double>(j - 1)
/ (static_cast<double>(j) * z);
sum = sum + prod;
}
A = t_by_sqrt_nu / std::sqrt(z) * sum;
}
/* A should obviously be within the interval [0,1] plus minus (hopefully
* small) rounding errors. */
if (A > 1.)
A = 1.;
else if (A < 0.)
A = 0.;
return A;
}
/**
* @brief Compute parameter for normal CDF for approximating the Student's T CDF
*
* Gleason suggested a formula for approximating the Student's
* t-distribution [9], which goes back to an approximation suggested in [10].
*
* Compared to the series expansion, this approximation satisfies
* rel_error < 0.0001 || abs_error < 0.00000001
* for all nu >= 200. (Tested on Mac OS X 10.6, gcc-4.2.)
*
* @param t
* @param nu Degree of freedom \f$ \nu > 0 \f$
* @returns A value \f$ z \f$ such that for a Student's t-distributed
* random variable \f$ T \f$ with \f$ nu \f$ degrees of freedom and a
* standard normally distributed random variable \f$ Z \f$, it holds that
* \f$ \Pr[T \leq t] \approx \Pr[Z \leq z] \f$.
*/
template <class RealType>
inline
RealType
GleasonsNormalApproxForStudentsT(const RealType& t, const RealType& nu) {
double g = (nu - 1.5) / ((nu - 1) * (nu - 1)),
z = std::sqrt( std::log(1. + t * t / nu) / g );
if (t < 0)
z *= -1.;
return z;
}
} // anonymous namespace
/**
* @brief Compute Student's cumulative distribution function
*
* For nu >= 1000000, we just use the normal distribution as an approximation.
* For 1000000 >= nu >= 200, we use a simple approximation from [9].
* If nu is not within 0.01 of a natural number, we will call the student-t
* CDF from boost. Otherwise, our approach should be much more precise than
* using the incomplete beta function as boost does (see the references).
*
* We are much more cautious than usual here (it is folklore that the normal
* distribution is a "good" estimate for Student-T if nu >= 30), but we can
* afford the extra work as this function is not designed to be called from
* inner loops. Performance should still be reasonably good, with at most ~100
* iterations in any case (just one if nu >= 200).
*
* For nu < 200, we use the series expansions 26.7.3 and 26.7.4 from [1] and
* substitute sin(theta) = t/sqrt(n * z), where z = 1 + t^2/nu (using
* oneSidedStudentsT_CDF()).
*
* @param dist A Student's t-distribution object, containing the degree of
* freedom \f$ \nu \f$
* @param t
* @return \f$ \Pr[T < t] \f$ where \f$ t \geq 0 \f$, \f$ T \f$ is a Student's
* T-distributed random variable with \f$ \nu \f$ degrees of
* freedom.
*/
template <class RealType, class Policy>
inline
RealType
cdf(const boost::math::students_t_distribution<RealType, Policy>& dist,
const RealType& t) {
RealType df = dist.degrees_of_freedom();
// FIXME: Add some justification/do some tests.
if (!std::isfinite(df) || std::fabs(df - std::floor(df))/df > 0.01)
return boost::math::cdf(dist, t);
static const char* function = "madlib::modules::prob::cdf("
"const students_t_distribution<%1%>&, %1%)";
RealType result;
if (!boost::math::detail::check_df(function, df, &result, Policy()))
return result;
if (df >= 200)
return
boost::math::cdf(
boost::math::normal_distribution<RealType, Policy>(),
df >= 1000000
? t
: GleasonsNormalApproxForStudentsT(t, df)
);
// We first compute A = Pr[|T| < t]
RealType A = oneSidedStudentsT_CDF(t, static_cast<uint64_t>(df));
/* The Student-T distribution is obviously symmetric around t=0... */
if (t < 0)
/* FIXME: If A is approximately 1, we will face a loss of significance.
* */
return .5 * (1. - A);
else
/* While we only know A in [0,1] here, the end result will be in
* [0.5, 1]. Hence, there is no problem with adding 1 and A, even if
* A << 1. */
return .5 * (1. + A);
}
/**
* @brief Compute the complement of Student's cumulative distribution function
*/
template <class RealType, class Policy>
inline
RealType
cdf(
const boost::math::complemented2_type<
boost::math::students_t_distribution<RealType, Policy>,
RealType
>& c
) {
RealType df = c.dist.degrees_of_freedom();
if (df >= 200) {
static const char* function = "madlib::modules::prob::cdf("
"const complement(students_t_distribution<%1%>&), %1%)";
RealType result;
if (!boost::math::detail::check_df(function, df, &result, Policy()))
return result;
return
boost::math::cdf(complement(
boost::math::normal_distribution<RealType, Policy>(),
df >= 1000000
? c.param
: GleasonsNormalApproxForStudentsT(c.param, df)
));
}
return prob::cdf(c.dist, -c.param);
}
template <class RealType, class Policy>
inline
RealType
pdf(const boost::math::students_t_distribution<RealType, Policy>& dist,
const RealType& t) {
return boost::math::pdf(dist, t);
}
template <class RealType, class Policy>
inline
RealType
pdf(
const boost::math::complemented2_type<
boost::math::students_t_distribution<RealType, Policy>,
RealType
>& c
) {
return boost::math::pdf(c);
}
template <class RealType, class Policy>
inline
RealType
quantile(const boost::math::students_t_distribution<RealType, Policy>& dist,
const RealType& p) {
using namespace boost::math;
static const char* function = "madlib::modules::prob::quantile("
"const students_t_distribution<%1%>&, %1%)";
// FIXME: Boost bug 6937 prevent proper argument validation.
// https://svn.boost.org/trac/boost/ticket/6937
// Until this is fixed upstream, we do the following checks here.
RealType df = dist.degrees_of_freedom();
RealType result;
if (!detail::check_df(function, df, &result, Policy())
|| !detail::check_probability(function, p, &result, Policy()))
return result;
return boost::math::quantile(dist, p);
}
template <class RealType, class Policy>
inline
RealType
quantile(
const boost::math::complemented2_type<
boost::math::students_t_distribution<RealType, Policy>,
RealType
>& c
) {
return boost::math::quantile(c);
}
} // namespace prob
} // namespace modules
} // namespace madlib
#endif // defined(MADLIB_MODULES_PROB_STUDENT_T_HPP)