/* ----------------------------------------------------------------------- *//**
*
* @file linear.cpp
*
* @brief Linear-regression functions
*
*//* ----------------------------------------------------------------------- */
#include <modules/regress/linear.hpp>
#include <modules/prob/student.hpp>
#include <utils/Reference.hpp>
// Floating-point classification functions are in C99 and TR1, but not in the
// official C++ Standard (before C++0x). We therefore use the Boost implementation.
#include <boost/math/special_functions/fpclassify.hpp>
// Import names from Armadillo
using arma::vec;
using arma::mat;
using arma::as_scalar;
namespace madlib {
using utils::Reference;
namespace modules {
// Import names from other MADlib modules
using prob::studentT_cdf;
namespace regress {
/**
* @brief Transition state for linear-regression functions
*
* TransitionState encapsulates the transition state of the
* linear-regression aggregate functions. To the database, the state is exposed
* as a single DOUBLE PRECISION array; to the C++ code, it is a proper object
* containing scalars, a vector, and a matrix.
*
* Note: We assume that the DOUBLE PRECISION array is initialized by the
* database with length at least 5 and that all elements are 0.
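*
* For reference, the layout of the state array (matching arraySize() and
* rebind() below) is:
*
*   [0]                      numRows
*   [1]                      widthOfX
*   [2]                      y_sum
*   [3]                      y_square_sum
*   [4, 4 + widthOfX)        X_transp_Y (vector of length widthOfX)
*   [4 + widthOfX, end)      X_transp_X (widthOfX-by-widthOfX matrix)
*
* for a total of 4 + widthOfX + widthOfX^2 elements.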
*/
class LinearRegression::TransitionState {
public:
/**
* @internal Member initialization occurs in the order of declaration in the
* class (see ISO/IEC 14882:2003, Section 12.6.2). The order in the
* init list is irrelevant. It is important that mStorage gets
* initialized before the other members!
*/
TransitionState(AnyType inArg)
: mStorage(inArg.cloneIfImmutable()) {
rebind();
}
/**
* We define this function so that we can use TransitionState in the argument
* list and as a return type.
*/
inline operator AnyType() const {
return mStorage;
}
/**
* @brief Initialize the transition state. Only called for the first row.
*
* @param inAllocator Allocator for the transition-state memory. Must fill
* the memory block with zeros.
* @param inWidthOfX Number of independent variables. The first row of data
* determines the size of the transition state. This size is a quadratic
* function of inWidthOfX.
*/
inline void initialize(AllocatorSPtr inAllocator,
const uint16_t inWidthOfX) {
mStorage.rebind(inAllocator, boost::extents[ arraySize(inWidthOfX) ]);
rebind(inWidthOfX);
widthOfX = inWidthOfX;
}
/**
* @brief Merge with another TransitionState object
*/
TransitionState &operator+=(const TransitionState &inOtherState) {
if (mStorage.size() != inOtherState.mStorage.size())
throw std::logic_error("Internal error: Incompatible transition states");
for (uint32_t i = 0; i < mStorage.size(); i++)
mStorage[i] += inOtherState.mStorage[i];
widthOfX = inOtherState.widthOfX;
return *this;
}
private:
static inline uint32_t arraySize(const uint16_t inWidthOfX) {
// Widen before multiplying to avoid overflow in the promoted int arithmetic.
return 4 + inWidthOfX + static_cast<uint32_t>(inWidthOfX) * inWidthOfX;
}
/**
* @brief Who should own TransparentHandles pointing to slices of mStorage?
*/
AbstractHandle::MemoryController memoryController() const {
AbstractHandle::MemoryController ctrl =
mStorage.memoryHandle()->memoryController();
return (ctrl == AbstractHandle::kSelf ? AbstractHandle::kLocal : ctrl);
}
/**
* @brief Rebind to a new storage array
*
* @param inWidthOfX If this value is positive, use it as the number of
* independent variables. This is needed during initialization, when
* the storage array is still all zero but we already know the
* width of the design matrix.
*/
void rebind(uint16_t inWidthOfX = 0) {
numRows.rebind(&mStorage[0]);
widthOfX.rebind(&mStorage[1]);
if (inWidthOfX != 0)
widthOfX = inWidthOfX;
y_sum.rebind(&mStorage[2]);
y_square_sum.rebind(&mStorage[3]);
X_transp_Y.rebind(
TransparentHandle::create(&mStorage[4], widthOfX * sizeof(double),
memoryController()),
widthOfX);
X_transp_X.rebind(
TransparentHandle::create(&mStorage[4 + widthOfX],
widthOfX * widthOfX * sizeof(double),
memoryController()),
widthOfX, widthOfX);
}
Array<double> mStorage;
public:
Reference<double, uint64_t> numRows;
Reference<double, uint16_t> widthOfX;
Reference<double> y_sum;
Reference<double> y_square_sum;
DoubleCol X_transp_Y;
DoubleMat X_transp_X;
};
/**
* @brief Perform the linear-regression transition step
*
* We update: the number of rows \f$ n \f$, the partial sums
* \f$ \sum_{i=1}^n y_i \f$ and \f$ \sum_{i=1}^n y_i^2 \f$, the matrix
* \f$ X^T X \f$, and the vector \f$ X^T \boldsymbol y \f$.
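*
* In other words, for each new row \f$ (\boldsymbol x_i, y_i) \f$ (with
* \f$ \boldsymbol x_i \f$ a row vector), the code below performs the updates
* \f[
*     n \leftarrow n + 1, \quad
*     \sum y \leftarrow \sum y + y_i, \quad
*     \sum y^2 \leftarrow \sum y^2 + y_i^2,
* \f]
* \f[
*     X^T \boldsymbol y \leftarrow X^T \boldsymbol y + y_i \boldsymbol x_i^T,
*     \quad
*     X^T X \leftarrow X^T X + \boldsymbol x_i^T \boldsymbol x_i.
* \f]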
*/
AnyType LinearRegression::transition(AbstractDBInterface &db, AnyType args) {
AnyType::iterator arg(args);
// Arguments from SQL call. Immutable values passed by reference should be
// instantiated from the respective <tt>_const</tt> class. Otherwise, the
// abstraction layer will perform a deep copy (i.e., needlessly spend
// processor cycles).
TransitionState state = *arg++;
double y = *arg++;
DoubleRow_const x = *arg++;
// See MADLIB-138. At least on certain platforms and with certain versions,
// LAPACK will run into an infinite loop if pinv() is called for non-finite
// matrices. We also extend the check to the dependent variables.
if (!boost::math::isfinite(y))
throw std::invalid_argument("Dependent variables are not finite.");
else if (!x.is_finite())
throw std::invalid_argument("Design matrix is not finite.");
// Now do the transition step.
if (state.numRows == 0) {
if (x.n_elem > std::numeric_limits<uint16_t>::max())
throw std::domain_error("Number of independent variables cannot be "
"larger than 65535.");
state.initialize(
db.allocator(
AbstractAllocator::kAggregate,
AbstractAllocator::kZero
),
x.n_elem);
}
state.numRows++;
state.y_sum += y;
state.y_square_sum += y * y;
state.X_transp_Y += trans(x) * y;
state.X_transp_X += trans(x) * x;
return state;
}
/**
* @brief Perform the preliminary aggregation function: Merge transition states
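*
* This works because every statistic stored in the transition state (the row
* count, the sums of \f$ y \f$ and \f$ y^2 \f$, \f$ X^T X \f$, and
* \f$ X^T \boldsymbol y \f$) is a sum over rows, so the state for the union of
* two disjoint row sets is simply the element-wise sum of the two states.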
*/
AnyType LinearRegression::mergeStates(AbstractDBInterface & /* db */, AnyType args) {
TransitionState stateLeft = args[0].cloneIfImmutable();
const TransitionState stateRight = args[1];
// We first handle the trivial case where this function is called with one
// of the states being the initial state
if (stateLeft.numRows == 0)
return stateRight;
else if (stateRight.numRows == 0)
return stateLeft;
// Merge states together and return
stateLeft += stateRight;
return stateLeft;
}
/**
* @brief Perform the linear-regression final step
*/
AnyType LinearRegression::final(AbstractDBInterface &db, AnyType args) {
const TransitionState state = args[0];
// See MADLIB-138. At least on certain platforms and with certain versions,
// LAPACK will run into an infinite loop if pinv() is called for non-finite
// matrices. We also extend the check to the dependent variables.
if (!state.X_transp_X.is_finite() || !state.X_transp_Y.is_finite())
throw std::invalid_argument("Design matrix is not finite.");
// FIXME: We have essentially two calls to svd now (pinv calls svd, too).
// This is a waste of processor cycles and energy.
vec singularValues = svd(state.X_transp_X);
double condition_X_transp_X = max(singularValues) / min(singularValues);
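// The condition number used here is the ratio of the largest to the smallest
// singular value of X^T X; for a design matrix X with full column rank, this
// equals the square of the condition number of X itself.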
// See:
// Lichtblau, Daniel and Weisstein, Eric W. "Condition Number."
// From MathWorld--A Wolfram Web Resource.
// http://mathworld.wolfram.com/ConditionNumber.html
if (condition_X_transp_X > 1000)
db.out << "Matrix X^T X is ill-conditioned (condition number "
"= " << condition_X_transp_X << "). "
"Expect strong multicollinearity." << std::endl;
// Precompute (X^T * X)^+
mat inverse_of_X_transp_X = pinv(state.X_transp_X);
// Vector of coefficients: For efficiency reasons, we want to return this
// by reference, so we need to bind to db memory
DoubleCol coef(db.allocator(), state.widthOfX);
coef = inverse_of_X_transp_X * state.X_transp_Y;
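// In terms of the accumulated sufficient statistics, the sums of squares
// computed below are
//   ess = c^T * (X^T y) - (sum y)^2 / n     (explained sum of squares)
//   tss = sum(y^2)      - (sum y)^2 / n     (total sum of squares)
// i.e., both are centered around the mean of the dependent variable.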
// explained sum of squares (regression sum of squares)
double ess
= as_scalar(
trans(state.X_transp_Y) * coef
- ((state.y_sum * state.y_sum) / state.numRows)
);
// total sum of squares
double tss
= state.y_square_sum
- ((state.y_sum * state.y_sum) / state.numRows);
// With infinite precision, the following checks would be pointless: both tss
// and ess are non-negative by construction. Due to floating-point arithmetic,
// however, this need not hold at this point. Without a formal proof convincing
// us of the contrary, we should anticipate that numerical peculiarities might
// occur.
if (tss < 0)
tss = 0;
if (ess < 0)
ess = 0;
// Since we know tss with greater accuracy than ess, we do the following
// sanity adjustment to ess:
if (ess > tss)
ess = tss;
// coefficient of determination
// If tss == 0, then the regression perfectly fits the data, so the
// coefficient of determination is 1.
double r2 = (tss == 0 ? 1 : ess / tss);
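// Equivalently, r2 = 1 - rss/tss, since rss = tss - ess (see below).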
// In the case of linear regression:
// residual sum of squares (rss) = total sum of squares (tss) - explained
// sum of squares (ess)
// Proof: http://en.wikipedia.org/wiki/Sum_of_squares
double rss = tss - ess;
// Variance is also called the mean square error
double variance = rss / (state.numRows - state.widthOfX);
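// For each coefficient c_i, the loop below computes the standard error
//   stdErr(i) = sqrt( variance * [(X^T X)^+]_(i,i) )
// and the t-statistic
//   tStats(i) = c_i / stdErr(i),
// with the degenerate zero cases handled explicitly.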
// Vector of standard errors and t-statistics: For efficiency reasons, we
// want to return these by reference, so we need to bind to db memory
DoubleCol stdErr(db.allocator(), state.widthOfX);
DoubleCol tStats(db.allocator(), state.widthOfX);
for (int i = 0; i < state.widthOfX; i++) {
// Out of an abundance of caution, we allow for the tiny possibility that
// numerical instabilities in the pinv operation lead to negative values on
// the main diagonal of even an SPD matrix.
if (inverse_of_X_transp_X(i,i) < 0) {
stdErr(i) = 0;
} else {
stdErr(i) = std::sqrt( variance * inverse_of_X_transp_X(i,i) );
}
if (coef(i) == 0 && stdErr(i) == 0) {
// In this special case, 0/0 should be interpreted as 0:
// We know that 0 is the exact value for the coefficient, so
// the t-value should be 0 (corresponding to a p-value of 1)
tStats(i) = 0;
} else {
// If stdErr(i) == 0 then abs(tStats(i)) will be infinity, which is
// what we need.
tStats(i) = coef(i) / stdErr(i);
}
}
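// The two-sided p-values computed below are
//   pValues(i) = 2 * (1 - F(|tStats(i)|)),
// where F is the cumulative distribution function of Student's t-distribution
// with (numRows - widthOfX) degrees of freedom.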
// Vector of p-values: For efficiency reasons, we want to return this
// by reference, so we need to bind to db memory
DoubleCol pValues(db.allocator(), state.widthOfX);
for (int i = 0; i < state.widthOfX; i++)
pValues(i) = 2. * (1. - studentT_cdf(
state.numRows - state.widthOfX,
std::fabs( tStats(i) )));
// Return all coefficients, standard errors, etc. in a tuple
AnyTypeVector tuple;
ConcreteRecord::iterator tupleElement(tuple);
*tupleElement++ = coef;
*tupleElement++ = r2;
*tupleElement++ = stdErr;
*tupleElement++ = tStats;
*tupleElement = pValues;
return tuple;
}
} // namespace regress
} // namespace modules
} // namespace madlib