src/modules/stats/one_way_anova.cpp - madlib - Git at Google

 /* ----------------------------------------------------------------------- *//**
  *
  * @file one_way_anova.cpp
  *
  * @brief One-way ANOVA functions
  *
  *//* ----------------------------------------------------------------------- */

 #include <dbconnector/dbconnector.hpp>
 #include <modules/prob/boost.hpp>
 #include <modules/shared/HandleTraits.hpp>
 #include <utils/Math.hpp>

 #include "one_way_anova.hpp"

 namespace madlib {

 namespace modules {

 namespace stats {

 /**
  * @brief Transition state for one-way ANOVA functions
  *
  * Note: We assume that the DOUBLE PRECISION array is initialized by the
  * database with length 2, and all elemenets are 0. Handle::operator[] will
  * perform bounds checking.
  */
 template <class Handle>
 class OWATransitionState {
 public:
     OWATransitionState(const AnyType &inArray)
       : mStorage(inArray.getAs<Handle>()) {

         rebind(utils::nextPowerOfTwo(static_cast<uint32_t>(mStorage[0])));
     }

     /**
      * @brief Convert to backend representation
      *
      * We define this function so that we can use TransitionState in the argument
      * list and as a return type.
      */
     inline operator AnyType() const {
         return mStorage;
     }

     /**
      * @brief Return the index (in the num, sum, and corrected_square_sum
      *     fields) of a group value
      *
      * If a value is not found, we add a new group to the transition state.
      * Since we do not want to reallocate too often, we reserve some buffer
      * space in the storage array. So we need to reallocate and copy memory only
      * whenever the number of groups hits a power of 2.
      */
     uint32_t idxOfGroup(const Allocator& inAllocator, int32_t inValue);

 private:
     static inline size_t arraySize(uint32_t inNumGroupsReserved) {
         return 1 + 5 * inNumGroupsReserved;
     }

     void rebind(uint32_t inNumGroupsReserved) {
         madlib_assert(mStorage.size() >= arraySize(inNumGroupsReserved),
             std::runtime_error("Out-of-bounds array access detected."));

         numGroups.rebind(&mStorage[0]);
         groupValues = &mStorage[1];
         posToIndices = &mStorage[1 + inNumGroupsReserved];
         num.rebind(&mStorage[1 + 2 * inNumGroupsReserved], inNumGroupsReserved);
         sum.rebind(&mStorage[1 + 3 * inNumGroupsReserved], inNumGroupsReserved);
         corrected_square_sum.rebind(
             &mStorage[1 + 4 * inNumGroupsReserved], inNumGroupsReserved);
     }

     Handle mStorage;

 public:
     typename HandleTraits<Handle>::ReferenceToUInt32 numGroups;
     typename HandleTraits<Handle>::DoublePtr groupValues;
     typename HandleTraits<Handle>::DoublePtr posToIndices;
     typename HandleTraits<Handle>::ColumnVectorTransparentHandleMap num;
     typename HandleTraits<Handle>::ColumnVectorTransparentHandleMap sum;
     typename HandleTraits<Handle>::ColumnVectorTransparentHandleMap corrected_square_sum;
 };

 template <>
 uint32_t
 OWATransitionState<ArrayHandle<double> >::idxOfGroup(
     const Allocator&, int32_t inValue) {

     std::ptrdiff_t pos = std::lower_bound(
         groupValues, groupValues + numGroups, inValue) - groupValues;

     if (static_cast<uint64_t>(pos) >= numGroups
         || groupValues[pos] != inValue) {
         // Did not find this group value. We have to start a new group.

         throw std::runtime_error("Could not find a grouping value during "
             "one-way ANOVA.");
     }
     return static_cast<uint32_t>(pos);
 }

 template <>
 uint32_t
 OWATransitionState<MutableArrayHandle<double> >::idxOfGroup(
     const Allocator& inAllocator, int32_t inValue) {

     std::ptrdiff_t pos = std::lower_bound(
         groupValues, groupValues + numGroups, inValue) - groupValues;

     if (static_cast<uint64_t>(pos) >= numGroups
         || groupValues[pos] != inValue) {
         // Did not find this group value. We have to start a new group.

         uint32_t numGroupsReserved = utils::nextPowerOfTwo(
             static_cast<uint32_t>(numGroups));
         if (numGroupsReserved > numGroups) {
             // We have enough reserve space allocated.
             std::copy(groupValues + pos, groupValues + numGroups,
                 groupValues + pos + 1);
             groupValues[pos] = inValue;

             std::copy(posToIndices + pos, posToIndices + numGroups,
                 posToIndices + pos + 1);
             posToIndices[pos] = numGroups++;
         } else {
             // We need to reallocate storage for the transition state
             // Save our current state, so we can subsequently restore it
             // with the new storage
             OWATransitionState oldSelf = *this;
             if (numGroupsReserved == 0)
                 numGroupsReserved = 1;
             else {
                 if (static_cast<uint64_t>(2) * numGroupsReserved >
                     std::numeric_limits<uint32_t>::max())
                     throw std::runtime_error("Too many groups.");

                 numGroupsReserved = 2U * numGroupsReserved;
             }
             mStorage = inAllocator.allocateArray<double, dbal::AggregateContext,
                 dbal::DoZero, dbal::ThrowBadAlloc>(arraySize(numGroupsReserved));
             rebind(numGroupsReserved);

             numGroups = oldSelf.numGroups + 1;
             std::copy(oldSelf.groupValues, oldSelf.groupValues + pos, groupValues);
             std::copy(oldSelf.groupValues + pos,
                 oldSelf.groupValues + oldSelf.numGroups, groupValues + pos + 1);
             groupValues[pos] = inValue;

             std::copy(oldSelf.posToIndices, oldSelf.posToIndices + pos, posToIndices);
             std::copy(oldSelf.posToIndices + pos,
                 oldSelf.posToIndices + oldSelf.numGroups, posToIndices + pos + 1);
             posToIndices[pos] = oldSelf.numGroups;

             num.segment(0, oldSelf.numGroups) << oldSelf.num;
             sum.segment(0, oldSelf.numGroups) << oldSelf.sum;
             corrected_square_sum.segment(0, oldSelf.numGroups) << oldSelf.corrected_square_sum;
         }
     }
     return static_cast<uint32_t>(posToIndices[pos]);
 }

 /**
  * @brief Update the corrected sum of squares
  *
  * For numerical stability, we should not compute the sample variance in the
  * naive way. The literature has many examples where this gives bad results
  * even with moderately sized inputs.
  *
  * See:
  *
  * B. P. Welford (1962). "Note on a method for calculating corrected sums of
  * squares and products". Technometrics 4(3):419–420.
  *
  * Chan, Tony F.; Golub, Gene H.; LeVeque, Randall J. (1979), "Updating
  * Formulae and a Pairwise Algorithm for Computing Sample Variances.", Technical
  * Report STAN-CS-79-773, Department of Computer Science, Stanford University.
  * ftp://reports.stanford.edu/pub/cstr/reports/cs/tr/79/773/CS-TR-79-773.pdf
  */
 inline
 void
 updateCorrectedSumOfSquares(double &ioLeftWeight, double &ioLeftSum,
     double &ioLeftCorrectedSumSquares, double inRightWeight, double inRightSum,
     double inRightCorrectedSumSquares) {

     if (inRightWeight <= 0)
         return;

     // See Ogita et al., "Accurate Sum and Dot Product", SIAM Journal on
     // Scientific Computing (SISC), 26(6):1955-1988, 2005.
     if (ioLeftWeight <= 0)
         ioLeftCorrectedSumSquares = inRightCorrectedSumSquares;
     else {
         double diff = inRightWeight / ioLeftWeight * ioLeftSum - inRightSum;
         ioLeftCorrectedSumSquares
                += inRightCorrectedSumSquares
                 + ioLeftWeight / (inRightWeight * (ioLeftWeight + inRightWeight))
                     * diff * diff;
     }

     ioLeftSum += inRightSum;
     ioLeftWeight += inRightWeight;
 }

 /**
  * @brief Perform the transition step
  */
 AnyType
 one_way_anova_transition::run(AnyType &args) {
     OWATransitionState<MutableArrayHandle<double> > state = args[0];
     int32_t group = args[1].getAs<int32_t>();
     double value = args[2].getAs<double>();

     uint32_t idx = state.idxOfGroup(*this, group);
     updateCorrectedSumOfSquares(
         state.num(idx), state.sum(idx), state.corrected_square_sum(idx),
         1, value, 0);

     return state;
 }

 /**
  * @brief Perform the perliminary aggregation function: Merge transition states
  */
 AnyType
 one_way_anova_merge_states::run(AnyType &args) {
     if (args[1].getAs<ArrayHandle<double> >().size() == 2) { return args[0]; }
     else if (args[0].getAs<ArrayHandle<double> >().size() == 2) { return args[1]; }

     OWATransitionState<MutableArrayHandle<double> > stateLeft = args[0];
     OWATransitionState<ArrayHandle<double> > stateRight = args[1];

     // Merge states together and return
     for (uint64_t posRight = 0; posRight < stateRight.numGroups; posRight++) {
         int32_t value
             = static_cast<int32_t>(stateRight.groupValues[posRight]);
         uint32_t idxRight = static_cast<uint32_t>(stateRight.posToIndices[posRight]);
         uint32_t idxLeft = stateLeft.idxOfGroup(*this, value);
         updateCorrectedSumOfSquares(
             stateLeft.num(idxLeft), stateLeft.sum(idxLeft),
                 stateLeft.corrected_square_sum(idxLeft),
             stateRight.num(idxRight), stateRight.sum(idxRight),
                 stateRight.corrected_square_sum(idxRight));
     }

     return stateLeft;
 }

 /**
  * @brief Perform the one-sided t-Test final step
  */
 AnyType
 one_way_anova_final::run(AnyType &args) {
     using boost::math::complement;

     OWATransitionState<ArrayHandle<double> > state = args[0];

     // If we haven't seen any data, just return Null. This is the standard
     // behavior of aggregate function on empty data sets (compare, e.g.,
     // how PostgreSQL handles sum or avg on empty inputs)
     if (state.numGroups == 0)
         return Null();

     double grand_mean = state.sum.sum() / state.num.sum();
     double sum_squares_between = 0;

     for (Index idx = 0; idx < static_cast<Index>(state.numGroups); ++idx)
         sum_squares_between += state.num(idx)
                              * std::pow(state.sum(idx) / state.num(idx)
                                         - grand_mean, 2);

     double sum_squares_within = state.corrected_square_sum.sum();
     double df_between = state.numGroups - 1;
     double df_within = state.num.sum() - state.numGroups;
     double mean_square_between = sum_squares_between / df_between;
     double mean_square_within = sum_squares_within / df_within;
     double statistic = mean_square_between / mean_square_within;

     AnyType tuple;
     tuple
         << sum_squares_between
         << sum_squares_within
         << static_cast<int64_t>(df_between)
         << static_cast<int64_t>(df_within)
         << mean_square_between
         << mean_square_within
         << statistic
         << ((df_between >= 1 && df_within >= 1)
             ? prob::cdf(
                 complement(prob::fisher_f(df_between, df_within), statistic))
             : Null());
     return tuple;
 }

 } // namespace stats

 } // namespace modules

 } // namespace madlib
	/* ----------------------------------------------------------------------- //*
	*
	* @file one_way_anova.cpp
	*
	* @brief One-way ANOVA functions
	*
	// ----------------------------------------------------------------------- */

	#include <dbconnector/dbconnector.hpp>
	#include <modules/prob/boost.hpp>
	#include <modules/shared/HandleTraits.hpp>
	#include <utils/Math.hpp>

	#include "one_way_anova.hpp"

	namespace madlib {

	namespace modules {

	namespace stats {

	/**
	* @brief Transition state for one-way ANOVA functions
	*
	* Note: We assume that the DOUBLE PRECISION array is initialized by the
	* database with length 2, and all elemenets are 0. Handle::operator[] will
	* perform bounds checking.
	*/
	template <class Handle>
	class OWATransitionState {
	public:
	OWATransitionState(const AnyType &inArray)
	: mStorage(inArray.getAs<Handle>()) {

	rebind(utils::nextPowerOfTwo(static_cast<uint32_t>(mStorage[0])));
	}

	/**
	* @brief Convert to backend representation
	*
	* We define this function so that we can use TransitionState in the argument
	* list and as a return type.
	*/
	inline operator AnyType() const {
	return mStorage;
	}

	/**
	* @brief Return the index (in the num, sum, and corrected_square_sum
	* fields) of a group value
	*
	* If a value is not found, we add a new group to the transition state.
	* Since we do not want to reallocate too often, we reserve some buffer
	* space in the storage array. So we need to reallocate and copy memory only
	* whenever the number of groups hits a power of 2.
	*/
	uint32_t idxOfGroup(const Allocator& inAllocator, int32_t inValue);

	private:
	static inline size_t arraySize(uint32_t inNumGroupsReserved) {
	return 1 + 5 * inNumGroupsReserved;
	}

	void rebind(uint32_t inNumGroupsReserved) {
	madlib_assert(mStorage.size() >= arraySize(inNumGroupsReserved),
	std::runtime_error("Out-of-bounds array access detected."));

	numGroups.rebind(&mStorage[0]);
	groupValues = &mStorage[1];
	posToIndices = &mStorage[1 + inNumGroupsReserved];
	num.rebind(&mStorage[1 + 2 * inNumGroupsReserved], inNumGroupsReserved);
	sum.rebind(&mStorage[1 + 3 * inNumGroupsReserved], inNumGroupsReserved);
	corrected_square_sum.rebind(
	&mStorage[1 + 4 * inNumGroupsReserved], inNumGroupsReserved);
	}

	Handle mStorage;

	public:
	typename HandleTraits<Handle>::ReferenceToUInt32 numGroups;
	typename HandleTraits<Handle>::DoublePtr groupValues;
	typename HandleTraits<Handle>::DoublePtr posToIndices;
	typename HandleTraits<Handle>::ColumnVectorTransparentHandleMap num;
	typename HandleTraits<Handle>::ColumnVectorTransparentHandleMap sum;
	typename HandleTraits<Handle>::ColumnVectorTransparentHandleMap corrected_square_sum;
	};

	template <>
	uint32_t
	OWATransitionState<ArrayHandle<double> >::idxOfGroup(
	const Allocator&, int32_t inValue) {

	std::ptrdiff_t pos = std::lower_bound(
	groupValues, groupValues + numGroups, inValue) - groupValues;

	if (static_cast<uint64_t>(pos) >= numGroups
	\|\| groupValues[pos] != inValue) {
	// Did not find this group value. We have to start a new group.

	throw std::runtime_error("Could not find a grouping value during "
	"one-way ANOVA.");
	}
	return static_cast<uint32_t>(pos);
	}

	template <>
	uint32_t
	OWATransitionState<MutableArrayHandle<double> >::idxOfGroup(
	const Allocator& inAllocator, int32_t inValue) {

	std::ptrdiff_t pos = std::lower_bound(
	groupValues, groupValues + numGroups, inValue) - groupValues;

	if (static_cast<uint64_t>(pos) >= numGroups
	\|\| groupValues[pos] != inValue) {
	// Did not find this group value. We have to start a new group.

	uint32_t numGroupsReserved = utils::nextPowerOfTwo(
	static_cast<uint32_t>(numGroups));
	if (numGroupsReserved > numGroups) {
	// We have enough reserve space allocated.
	std::copy(groupValues + pos, groupValues + numGroups,
	groupValues + pos + 1);
	groupValues[pos] = inValue;

	std::copy(posToIndices + pos, posToIndices + numGroups,
	posToIndices + pos + 1);
	posToIndices[pos] = numGroups++;
	} else {
	// We need to reallocate storage for the transition state
	// Save our current state, so we can subsequently restore it
	// with the new storage
	OWATransitionState oldSelf = *this;
	if (numGroupsReserved == 0)
	numGroupsReserved = 1;
	else {
	if (static_cast<uint64_t>(2) * numGroupsReserved >
	std::numeric_limits<uint32_t>::max())
	throw std::runtime_error("Too many groups.");

	numGroupsReserved = 2U * numGroupsReserved;
	}
	mStorage = inAllocator.allocateArray<double, dbal::AggregateContext,
	dbal::DoZero, dbal::ThrowBadAlloc>(arraySize(numGroupsReserved));
	rebind(numGroupsReserved);

	numGroups = oldSelf.numGroups + 1;
	std::copy(oldSelf.groupValues, oldSelf.groupValues + pos, groupValues);
	std::copy(oldSelf.groupValues + pos,
	oldSelf.groupValues + oldSelf.numGroups, groupValues + pos + 1);
	groupValues[pos] = inValue;

	std::copy(oldSelf.posToIndices, oldSelf.posToIndices + pos, posToIndices);
	std::copy(oldSelf.posToIndices + pos,
	oldSelf.posToIndices + oldSelf.numGroups, posToIndices + pos + 1);
	posToIndices[pos] = oldSelf.numGroups;

	num.segment(0, oldSelf.numGroups) << oldSelf.num;
	sum.segment(0, oldSelf.numGroups) << oldSelf.sum;
	corrected_square_sum.segment(0, oldSelf.numGroups) << oldSelf.corrected_square_sum;
	}
	}
	return static_cast<uint32_t>(posToIndices[pos]);
	}

	/**
	* @brief Update the corrected sum of squares
	*
	* For numerical stability, we should not compute the sample variance in the
	* naive way. The literature has many examples where this gives bad results
	* even with moderately sized inputs.
	*
	* See:
	*
	* B. P. Welford (1962). "Note on a method for calculating corrected sums of
	* squares and products". Technometrics 4(3):419–420.
	*
	* Chan, Tony F.; Golub, Gene H.; LeVeque, Randall J. (1979), "Updating
	* Formulae and a Pairwise Algorithm for Computing Sample Variances.", Technical
	* Report STAN-CS-79-773, Department of Computer Science, Stanford University.
	* ftp://reports.stanford.edu/pub/cstr/reports/cs/tr/79/773/CS-TR-79-773.pdf
	*/
	inline
	void
	updateCorrectedSumOfSquares(double &ioLeftWeight, double &ioLeftSum,
	double &ioLeftCorrectedSumSquares, double inRightWeight, double inRightSum,
	double inRightCorrectedSumSquares) {

	if (inRightWeight <= 0)
	return;

	// See Ogita et al., "Accurate Sum and Dot Product", SIAM Journal on
	// Scientific Computing (SISC), 26(6):1955-1988, 2005.
	if (ioLeftWeight <= 0)
	ioLeftCorrectedSumSquares = inRightCorrectedSumSquares;
	else {
	double diff = inRightWeight / ioLeftWeight * ioLeftSum - inRightSum;
	ioLeftCorrectedSumSquares
	+= inRightCorrectedSumSquares
	+ ioLeftWeight / (inRightWeight * (ioLeftWeight + inRightWeight))
	* diff * diff;
	}

	ioLeftSum += inRightSum;
	ioLeftWeight += inRightWeight;
	}

	/**
	* @brief Perform the transition step
	*/
	AnyType
	one_way_anova_transition::run(AnyType &args) {
	OWATransitionState<MutableArrayHandle<double> > state = args[0];
	int32_t group = args[1].getAs<int32_t>();
	double value = args[2].getAs<double>();

	uint32_t idx = state.idxOfGroup(*this, group);
	updateCorrectedSumOfSquares(
	state.num(idx), state.sum(idx), state.corrected_square_sum(idx),
	1, value, 0);

	return state;
	}

	/**
	* @brief Perform the perliminary aggregation function: Merge transition states
	*/
	AnyType
	one_way_anova_merge_states::run(AnyType &args) {
	if (args[1].getAs<ArrayHandle<double> >().size() == 2) { return args[0]; }
	else if (args[0].getAs<ArrayHandle<double> >().size() == 2) { return args[1]; }

	OWATransitionState<MutableArrayHandle<double> > stateLeft = args[0];
	OWATransitionState<ArrayHandle<double> > stateRight = args[1];

	// Merge states together and return
	for (uint64_t posRight = 0; posRight < stateRight.numGroups; posRight++) {
	int32_t value
	= static_cast<int32_t>(stateRight.groupValues[posRight]);
	uint32_t idxRight = static_cast<uint32_t>(stateRight.posToIndices[posRight]);
	uint32_t idxLeft = stateLeft.idxOfGroup(*this, value);
	updateCorrectedSumOfSquares(
	stateLeft.num(idxLeft), stateLeft.sum(idxLeft),
	stateLeft.corrected_square_sum(idxLeft),
	stateRight.num(idxRight), stateRight.sum(idxRight),
	stateRight.corrected_square_sum(idxRight));
	}

	return stateLeft;
	}

	/**
	* @brief Perform the one-sided t-Test final step
	*/
	AnyType
	one_way_anova_final::run(AnyType &args) {
	using boost::math::complement;

	OWATransitionState<ArrayHandle<double> > state = args[0];

	// If we haven't seen any data, just return Null. This is the standard
	// behavior of aggregate function on empty data sets (compare, e.g.,
	// how PostgreSQL handles sum or avg on empty inputs)
	if (state.numGroups == 0)
	return Null();

	double grand_mean = state.sum.sum() / state.num.sum();
	double sum_squares_between = 0;

	for (Index idx = 0; idx < static_cast<Index>(state.numGroups); ++idx)
	sum_squares_between += state.num(idx)
	* std::pow(state.sum(idx) / state.num(idx)
	- grand_mean, 2);

	double sum_squares_within = state.corrected_square_sum.sum();
	double df_between = state.numGroups - 1;
	double df_within = state.num.sum() - state.numGroups;
	double mean_square_between = sum_squares_between / df_between;
	double mean_square_within = sum_squares_within / df_within;
	double statistic = mean_square_between / mean_square_within;

	AnyType tuple;
	tuple
	<< sum_squares_between
	<< sum_squares_within
	<< static_cast<int64_t>(df_between)
	<< static_cast<int64_t>(df_within)
	<< mean_square_between
	<< mean_square_within
	<< statistic
	<< ((df_between >= 1 && df_within >= 1)
	? prob::cdf(
	complement(prob::fisher_f(df_between, df_within), statistic))
	: Null());
	return tuple;
	}

	} // namespace stats

	} // namespace modules

	} // namespace madlib