| /* ----------------------------------------------------------------------- *//** |
| * |
| * @file t_test.cpp |
| * |
| * @brief t-Test functions |
| * |
| *//* ----------------------------------------------------------------------- */ |
| |
| #include <dbconnector/dbconnector.hpp> |
| #include <modules/prob/boost.hpp> |
| #include <modules/prob/student.hpp> |
| #include <modules/shared/HandleTraits.hpp> |
| |
| #include "t_test.hpp" |
| |
| namespace madlib { |
| |
| namespace modules { |
| |
| namespace stats { |
| |
| namespace { |
| AnyType tStatsToResult(double inT, double inDegreeOfFreedom); |
| } |
| |
| /** |
| * @brief Transition state for t-Test functions |
| * |
| * Note: We assume that the DOUBLE PRECISION array is initialized by the |
| * database with length 6, and all elemenets are 0. |
| */ |
| template <class Handle> |
| class TTestTransitionState { |
| public: |
| TTestTransitionState(const AnyType &inArray) |
| : mStorage(inArray.getAs<Handle>()), |
| numX(&mStorage[0]), |
| x_sum(&mStorage[1]), |
| correctedX_square_sum(&mStorage[2]), |
| numY(&mStorage[3]), |
| y_sum(&mStorage[4]), |
| correctedY_square_sum(&mStorage[5]) { } |
| |
| /** |
| * @brief Convert to backend representation |
| * |
| * We define this function so that we can use TransitionState in the argument |
| * list and as a return type. |
| */ |
| inline operator AnyType() const { |
| return mStorage; |
| } |
| |
| private: |
| Handle mStorage; |
| |
| public: |
| typename HandleTraits<Handle>::ReferenceToUInt64 numX; |
| typename HandleTraits<Handle>::ReferenceToDouble x_sum; |
| typename HandleTraits<Handle>::ReferenceToDouble correctedX_square_sum; |
| |
| typename HandleTraits<Handle>::ReferenceToUInt64 numY; |
| typename HandleTraits<Handle>::ReferenceToDouble y_sum; |
| typename HandleTraits<Handle>::ReferenceToDouble correctedY_square_sum; |
| }; |
| |
| /** |
| * @brief Update the corrected sum of squares |
| * |
| * For numerical stability, we should not compute the sample variance in the |
| * naive way. The literature has many examples where this gives bad results |
| * even with moderately sized inputs. |
| * |
| * See: |
| * |
| * B. P. Welford (1962). "Note on a method for calculating corrected sums of |
| * squares and products". Technometrics 4(3):419–420. |
| * |
| * Chan, Tony F.; Golub, Gene H.; LeVeque, Randall J. (1979), "Updating |
| * Formulae and a Pairwise Algorithm for Computing Sample Variances.", Technical |
| * Report STAN-CS-79-773, Department of Computer Science, Stanford University. |
| * ftp://reports.stanford.edu/pub/cstr/reports/cs/tr/79/773/CS-TR-79-773.pdf |
| */ |
| inline |
| void |
| updateCorrectedSumOfSquares(double &ioLeftWeight, double &ioLeftSum, |
| double &ioLeftCorrectedSumSquares, double inRightWeight, double inRightSum, |
| double inRightCorrectedSumSquares) { |
| |
| if (inRightWeight <= 0) |
| return; |
| |
| // FIXME: Use compensated sums for numerical stability |
| // See Ogita et al., "Accurate Sum and Dot Product", SIAM Journal on |
| // Scientific Computing (SISC), 26(6):1955-1988, 2005. |
| if (ioLeftWeight <= 0) |
| ioLeftCorrectedSumSquares = inRightCorrectedSumSquares; |
| else { |
| double diff = inRightWeight / ioLeftWeight * ioLeftSum - inRightSum; |
| ioLeftCorrectedSumSquares |
| += inRightCorrectedSumSquares |
| + ioLeftWeight / (inRightWeight * (ioLeftWeight + inRightWeight)) |
| * diff * diff; |
| } |
| |
| ioLeftSum += inRightSum; |
| ioLeftWeight += inRightWeight; |
| } |
| |
| /** |
| * @brief Perform the one-sample t-test transition step |
| */ |
| AnyType |
| t_test_one_transition::run(AnyType &args) { |
| TTestTransitionState<MutableArrayHandle<double> > state = args[0]; |
| double x = args[1].getAs<double>(); |
| |
| updateCorrectedSumOfSquares( |
| state.numX.ref(), state.x_sum.ref(), state.correctedX_square_sum.ref(), |
| 1, x, 0); |
| |
| return state; |
| } |
| |
| /** |
| * @brief Perform the two-sample t-test transition step |
| */ |
| AnyType |
| t_test_two_transition::run(AnyType &args) { |
| TTestTransitionState<MutableArrayHandle<double> > state = args[0]; |
| bool firstSample = args[1].getAs<bool>(); |
| double value = args[2].getAs<double>(); |
| |
| if (firstSample) |
| updateCorrectedSumOfSquares( |
| state.numX.ref(), state.x_sum.ref(), |
| state.correctedX_square_sum.ref(), |
| 1, value, 0); |
| else |
| updateCorrectedSumOfSquares( |
| state.numY.ref(), state.y_sum.ref(), |
| state.correctedY_square_sum.ref(), |
| 1, value, 0); |
| |
| return state; |
| } |
| |
| /** |
| * @brief Perform the perliminary aggregation function: Merge transition states |
| */ |
| AnyType |
| t_test_merge_states::run(AnyType &args) { |
| TTestTransitionState<MutableArrayHandle<double> > stateLeft = args[0]; |
| TTestTransitionState<ArrayHandle<double> > stateRight = args[1]; |
| |
| // Merge states together and return |
| updateCorrectedSumOfSquares( |
| stateLeft.numX.ref(), stateLeft.x_sum.ref(), |
| stateLeft.correctedX_square_sum.ref(), |
| stateRight.numX.ref(), stateRight.x_sum, |
| stateRight.correctedX_square_sum); |
| updateCorrectedSumOfSquares( |
| stateLeft.numY.ref(), stateLeft.y_sum.ref(), |
| stateLeft.correctedY_square_sum.ref(), |
| stateRight.numY.ref(), stateRight.y_sum, |
| stateRight.correctedY_square_sum); |
| |
| return stateLeft; |
| } |
| |
| /** |
| * @brief Perform the one-sided t-Test final step |
| */ |
| AnyType |
| t_test_one_final::run(AnyType &args) { |
| TTestTransitionState<ArrayHandle<double> > state = args[0]; |
| |
| // If we haven't seen enough data, just return Null. This is the standard |
| // behavior of aggregate function on empty data sets (compare, e.g., |
| // how PostgreSQL handles stddev_samp on quasi-empty inputs) |
| if (state.numX <= 1) |
| return Null(); |
| |
| const double& numX = state.numX.ref(); |
| double degreeOfFreedom = numX - 1; |
| double sampleVariance = state.correctedX_square_sum |
| / degreeOfFreedom; |
| double t = std::sqrt(numX / sampleVariance) * (state.x_sum / numX); |
| |
| return tStatsToResult(t, degreeOfFreedom); |
| } |
| |
| /** |
| * @brief Perform the pooled (i.e., assuming equal variances) two-sample t-Test |
| * final step |
| */ |
| AnyType |
| t_test_two_pooled_final::run(AnyType &args) { |
| TTestTransitionState<ArrayHandle<double> > state = args[0]; |
| |
| // If we haven't seen enough data, just return Null. This is the standard |
| // behavior of aggregate function on empty data sets (compare, e.g., |
| // how PostgreSQL handles stddev_samp on quasi-empty inputs) |
| if (state.numX == 0 || state.numY == 0 || state.numX + state.numY <= 2) |
| return Null(); |
| |
| // Formulas taken from: |
| // http://www.itl.nist.gov/div898/handbook/eda/section3/eda353.htm |
| const double& numX = state.numX.ref(); |
| const double& numY = state.numY.ref(); |
| double dfEqualVar = numX + numY - 2; |
| double diffInMeans = state.x_sum / numX - state.y_sum / numY; |
| double sampleVariancePooled |
| = (state.correctedX_square_sum + state.correctedY_square_sum) |
| / dfEqualVar; |
| double tDenomEqualVar |
| = std::sqrt(sampleVariancePooled * (1. / numX + 1. / numY)); |
| double tEqualVar = diffInMeans / tDenomEqualVar; |
| |
| return tStatsToResult(tEqualVar, dfEqualVar); |
| } |
| |
| /** |
| * @brief Perform the unpooled (i.e., assuming unequal variances) two-sample |
| * t-Test final step |
| */ |
| AnyType |
| t_test_two_unpooled_final::run(AnyType &args) { |
| TTestTransitionState<ArrayHandle<double> > state = args[0]; |
| |
| // If we haven't seen enough data, just return Null. This is the standard |
| // behavior of aggregate function on empty data sets (compare, e.g., |
| // how PostgreSQL handles stddev_samp on quasi-empty inputs) |
| if (state.numX <= 1 || state.numY <= 1) |
| return Null(); |
| |
| // Formulas taken from: |
| // http://www.itl.nist.gov/div898/handbook/eda/section3/eda353.htm |
| const double& numX = state.numX.ref(); |
| const double& numY = state.numY.ref(); |
| double sampleVarianceX = state.correctedX_square_sum / (numX - 1); |
| double sampleVarianceY = state.correctedY_square_sum / (numY - 1); |
| |
| double sampleVarianceX_over_numX = sampleVarianceX / numX; |
| double sampleVarianceY_over_numY = sampleVarianceY / numY; |
| |
| double dfUnequalVar |
| = std::pow(sampleVarianceX_over_numX + sampleVarianceY_over_numY, 2) |
| / ( |
| std::pow(sampleVarianceX_over_numX, 2) / (numX - 1) |
| + std::pow(sampleVarianceY_over_numY, 2) / (numY - 1) |
| ); |
| double diffInMeans = state.x_sum / numX - state.y_sum / numY; |
| double tDenomUnequalVar |
| = std::sqrt(sampleVarianceX / numX + sampleVarianceY / numY); |
| double tUnequalVar = diffInMeans / tDenomUnequalVar; |
| |
| return tStatsToResult(tUnequalVar, dfUnequalVar); |
| } |
| |
| /** |
| * @brief Perform the F-test final step |
| */ |
| AnyType |
| f_test_final::run(AnyType &args) { |
| using boost::math::complement; |
| |
| TTestTransitionState<ArrayHandle<double> > state = args[0]; |
| |
| // If we haven't seen enough data, just return Null. This is the standard |
| // behavior of aggregate function on empty data sets (compare, e.g., |
| // how PostgreSQL handles stddev_samp on quasi-empty inputs) |
| if (state.numX <= 1 || state.numY <= 1) |
| return Null(); |
| |
| // Formulas taken from: |
| // http://www.itl.nist.gov/div898/handbook/eda/section3/eda359.htm |
| double dfX = static_cast<double>(state.numX - 1); |
| double dfY = static_cast<double>(state.numY - 1); |
| double sampleVarianceX = state.correctedX_square_sum / dfX; |
| double sampleVarianceY = state.correctedY_square_sum / dfY; |
| double statistic = sampleVarianceX / sampleVarianceY; |
| |
| AnyType tuple; |
| double pvalue_one_sided |
| = prob::cdf(complement(prob::fisher_f(dfX, dfY), statistic)); |
| double pvalue_two_sided = 2. * std::min(pvalue_one_sided, |
| 1. - pvalue_one_sided); |
| tuple |
| << statistic |
| << dfX |
| << dfY |
| << pvalue_one_sided |
| << pvalue_two_sided; |
| return tuple; |
| } |
| |
| namespace { |
| |
| inline |
| AnyType |
| tStatsToResult(double inT, double inDegreeOfFreedom) { |
| // Return t statistic, degrees of freedom, one-tailed p-value (Null |
| // hypothesis \mu <= \mu_0), and two-tailed p-value (\mu = \mu_0). |
| // Recall definition of p-value: The probability of observating a |
| // value at least as extreme as the one observed, assuming that the null |
| // hypothesis is true. |
| using boost::math::complement; |
| |
| AnyType tuple; |
| tuple |
| << inT |
| << inDegreeOfFreedom |
| << prob::cdf(complement(prob::students_t(inDegreeOfFreedom), inT)) |
| << 2. * prob::cdf(boost::math::complement( |
| prob::students_t(inDegreeOfFreedom), std::fabs(inT))); |
| return tuple; |
| } |
| |
| } |
| |
| } // namespace stats |
| |
| } // namespace modules |
| |
| } // namespace madlib |