/* ----------------------------------------------------------------------- *//**
*
* @file linear.cpp
*
* @brief Linear-regression functions
*
*//* ----------------------------------------------------------------------- */
#include <modules/regress/linear.hpp>
#include <modules/prob/student.hpp>
#include <utils/Reference.hpp>
// Floating-point classification functions are in C99 and TR1, but not in the
// official C++ Standard (before C++0x). We therefore use the Boost implementation.
#include <boost/math/special_functions/fpclassify.hpp>
// Import names from Armadillo
using arma::vec;
using arma::mat;
using arma::as_scalar;
namespace madlib {
using utils::Reference;
namespace modules {
// Import names from other MADlib modules
using prob::studentT_cdf;
namespace regress {
/**
* @brief Transition state for linear-regression functions
*
* TransitionState encapsulates the transition state of the
* linear-regression aggregate functions. To the database, the state is exposed
* as a single DOUBLE PRECISION array; to the C++ code, it is a proper object
* containing scalars, a vector, and a matrix.
*
* Note: We assume that the DOUBLE PRECISION array is initialized by the
* database with length at least 5 and that all elements are 0.
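*
* For reference, the layout of the state array (matching arraySize() and
* rebind() below) is:
*
*   [0]                      numRows
*   [1]                      widthOfX
*   [2]                      y_sum
*   [3]                      y_square_sum
*   [4, 4 + widthOfX)        X_transp_Y (vector of length widthOfX)
*   [4 + widthOfX, end)      X_transp_X (widthOfX-by-widthOfX matrix)
*
* for a total of 4 + widthOfX + widthOfX^2 elements.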
*/
class LinearRegression::TransitionState {
public:
/**
* @internal Member initialization occurs in the order of declaration in the
* class (see ISO/IEC 14882:2003, Section 12.6.2). The order in the
* init list is irrelevant. It is important that mStorage gets
* initialized before the other members!
*/
TransitionState(AnyType inArg)
: mStorage(inArg.cloneIfImmutable()) {
rebind();
}
/**
* We define this function so that we can use TransitionState in the argument
* list and as a return type.
*/
inline operator AnyType() const {
return mStorage;
}
/**
* @brief Initialize the transition state. Only called for the first row.
*
* @param inAllocator Allocator for the transition-state memory. Must fill
* the memory block with zeros.
* @param inWidthOfX Number of independent variables. The first row of data
* determines the size of the transition state. This size is a quadratic
* function of inWidthOfX.
*/
inline void initialize(AllocatorSPtr inAllocator,
const uint16_t inWidthOfX) {
mStorage.rebind(inAllocator, boost::extents[ arraySize(inWidthOfX) ]);
rebind(inWidthOfX);
widthOfX = inWidthOfX;
}
/**
* @brief Merge with another TransitionState object
*/
TransitionState &operator+=(const TransitionState &inOtherState) {
if (mStorage.size() != inOtherState.mStorage.size())
throw std::logic_error("Internal error: Incompatible transition states");
for (uint32_t i = 0; i < mStorage.size(); i++)
mStorage[i] += inOtherState.mStorage[i];
widthOfX = inOtherState.widthOfX;
return *this;
}
private:
static inline uint32_t arraySize(const uint16_t inWidthOfX) {
// Widen before multiplying to avoid overflow in the promoted int arithmetic.
return 4 + inWidthOfX + static_cast<uint32_t>(inWidthOfX) * inWidthOfX;
}
/**
* @brief Who should own TransparentHandles pointing to slices of mStorage?
*/
AbstractHandle::MemoryController memoryController() const {
AbstractHandle::MemoryController ctrl =
mStorage.memoryHandle()->memoryController();
return (ctrl == AbstractHandle::kSelf ? AbstractHandle::kLocal : ctrl);
}
/**
* @brief Rebind to a new storage array
*
* @param inWidthOfX If this value is positive, use it as the number of
* independent variables. This is needed during initialization, when
* the storage array is still all zero but we already know the
* width of the design matrix.
*/
void rebind(uint16_t inWidthOfX = 0) {
numRows.rebind(&mStorage[0]);
widthOfX.rebind(&mStorage[1]);
if (inWidthOfX != 0)
widthOfX = inWidthOfX;
y_sum.rebind(&mStorage[2]);
y_square_sum.rebind(&mStorage[3]);
X_transp_Y.rebind(
TransparentHandle::create(&mStorage[4], widthOfX * sizeof(double),
memoryController()),
widthOfX);
X_transp_X.rebind(
TransparentHandle::create(&mStorage[4 + widthOfX],
widthOfX * widthOfX * sizeof(double),
memoryController()),
widthOfX, widthOfX);
}
Array<double> mStorage;
public:
Reference<double, uint64_t> numRows;
Reference<double, uint16_t> widthOfX;
Reference<double> y_sum;
Reference<double> y_square_sum;
DoubleCol X_transp_Y;
DoubleMat X_transp_X;
};
/**
* @brief Perform the linear-regression transition step
*
* We update: the number of rows \f$ n \f$, the partial sums
* \f$ \sum_{i=1}^n y_i \f$ and \f$ \sum_{i=1}^n y_i^2 \f$, the matrix
* \f$ X^T X \f$, and the vector \f$ X^T \boldsymbol y \f$.
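*
* In other words, for each new row \f$ (\boldsymbol x_i, y_i) \f$ (with
* \f$ \boldsymbol x_i \f$ a row vector), the code below performs the updates
* \f[
*     n \leftarrow n + 1, \quad
*     \sum y \leftarrow \sum y + y_i, \quad
*     \sum y^2 \leftarrow \sum y^2 + y_i^2,
* \f]
* \f[
*     X^T \boldsymbol y \leftarrow X^T \boldsymbol y + y_i \boldsymbol x_i^T,
*     \quad
*     X^T X \leftarrow X^T X + \boldsymbol x_i^T \boldsymbol x_i.
* \f]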
*/
AnyType LinearRegression::transition(AbstractDBInterface &db, AnyType args) {
AnyType::iterator arg(args);
// Arguments from SQL call. Immutable values passed by reference should be
// instantiated from the respective <tt>_const</tt> class. Otherwise, the
// abstraction layer will perform a deep copy (i.e., needlessly spend
// processor cycles).
TransitionState state = *arg++;
double y = *arg++;
DoubleRow_const x = *arg++;
// See MADLIB-138. At least on certain platforms and with certain versions,
// LAPACK will run into an infinite loop if pinv() is called for non-finite
// matrices. We also extend the check to the dependent variables.
if (!boost::math::isfinite(y))
throw std::invalid_argument("Dependent variables are not finite.");
else if (!x.is_finite())
throw std::invalid_argument("Design matrix is not finite.");
// Now do the transition step.
if (state.numRows == 0) {
if (x.n_elem > std::numeric_limits<uint16_t>::max())
throw std::domain_error("Number of independent variables cannot be "
"larger than 65535.");
state.initialize(
db.allocator(
AbstractAllocator::kAggregate,
AbstractAllocator::kZero
),
x.n_elem);
}
state.numRows++;
state.y_sum += y;
state.y_square_sum += y * y;
state.X_transp_Y += trans(x) * y;
state.X_transp_X += trans(x) * x;
return state;
}
/**
* @brief Perform the preliminary aggregation function: Merge transition states
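*
* This works because every statistic stored in the transition state (the row
* count, the sums of \f$ y \f$ and \f$ y^2 \f$, \f$ X^T X \f$, and
* \f$ X^T \boldsymbol y \f$) is a sum over rows, so the state for the union of
* two disjoint row sets is simply the element-wise sum of the two states.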
*/
AnyType LinearRegression::mergeStates(AbstractDBInterface & /* db */, AnyType args) {
TransitionState stateLeft = args[0].cloneIfImmutable();
const TransitionState stateRight = args[1];
// We first handle the trivial case where this function is called with one
// of the states being the initial state
if (stateLeft.numRows == 0)
return stateRight;
else if (stateRight.numRows == 0)
return stateLeft;
// Merge states together and return
stateLeft += stateRight;
return stateLeft;
}
/**
* @brief Perform the linear-regression final step
*/
AnyType LinearRegression::final(AbstractDBInterface &db, AnyType args) {
const TransitionState state = args[0];
// See MADLIB-138. At least on certain platforms and with certain versions,
// LAPACK will run into an infinite loop if pinv() is called for non-finite
// matrices. We also extend the check to the dependent variables.
if (!state.X_transp_X.is_finite() || !state.X_transp_Y.is_finite())
throw std::invalid_argument("Design matrix is not finite.");
// FIXME: We have essentially two calls to svd now (pinv calls svd, too).
// This is a waste of processor cycles and energy.
vec singularValues = svd(state.X_transp_X);
double condition_X_transp_X = max(singularValues) / min(singularValues);
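// The condition number used here is the ratio of the largest to the smallest
// singular value of X^T X; for a design matrix X with full column rank, this
// equals the square of the condition number of X itself.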
// See:
// Lichtblau, Daniel and Weisstein, Eric W. "Condition Number."
// From MathWorld--A Wolfram Web Resource.
// http://mathworld.wolfram.com/ConditionNumber.html
if (condition_X_transp_X > 1000)
db.out << "Matrix X^T X is ill-conditioned (condition number "
"= " << condition_X_transp_X << "). "
"Expect strong multicollinearity." << std::endl;
// Precompute (X^T * X)^+
mat inverse_of_X_transp_X = pinv(state.X_transp_X);
// Vector of coefficients: For efficiency reasons, we want to return this
// by reference, so we need to bind to db memory
DoubleCol coef(db.allocator(), state.widthOfX);
coef = inverse_of_X_transp_X * state.X_transp_Y;
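// In terms of the accumulated sufficient statistics, the sums of squares
// computed below are
//   ess = c^T * (X^T y) - (sum y)^2 / n     (explained sum of squares)
//   tss = sum(y^2)      - (sum y)^2 / n     (total sum of squares)
// i.e., both are centered around the mean of the dependent variable.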
// explained sum of squares (regression sum of squares)
double ess
= as_scalar(
trans(state.X_transp_Y) * coef
- ((state.y_sum * state.y_sum) / state.numRows)
);
// total sum of squares
double tss
= state.y_square_sum
- ((state.y_sum * state.y_sum) / state.numRows);
// With infinite precision, the following checks would be pointless: both tss
// and ess are non-negative by construction. Due to floating-point arithmetic,
// however, this need not hold at this point. Without a formal proof convincing
// us of the contrary, we should anticipate that numerical peculiarities might
// occur.
if (tss < 0)
tss = 0;
if (ess < 0)
ess = 0;
// Since we know tss with greater accuracy than ess, we do the following
// sanity adjustment to ess:
if (ess > tss)
ess = tss;
// coefficient of determination
// If tss == 0, then the regression perfectly fits the data, so the
// coefficient of determination is 1.
double r2 = (tss == 0 ? 1 : ess / tss);
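// Equivalently, r2 = 1 - rss/tss, since rss = tss - ess (see below).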
// In the case of linear regression:
// residual sum of squares (rss) = total sum of squares (tss) - explained
// sum of squares (ess)
// Proof: http://en.wikipedia.org/wiki/Sum_of_squares
double rss = tss - ess;
// Variance is also called the mean square error
double variance = rss / (state.numRows - state.widthOfX);
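// For each coefficient c_i, the loop below computes the standard error
//   stdErr(i) = sqrt( variance * [(X^T X)^+]_(i,i) )
// and the t-statistic
//   tStats(i) = c_i / stdErr(i),
// with the degenerate zero cases handled explicitly.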
// Vector of standard errors and t-statistics: For efficiency reasons, we
// want to return these by reference, so we need to bind to db memory
DoubleCol stdErr(db.allocator(), state.widthOfX);
DoubleCol tStats(db.allocator(), state.widthOfX);
for (int i = 0; i < state.widthOfX; i++) {
// Out of an abundance of caution, we allow for the tiny possibility that
// numerical instabilities in the pinv operation lead to negative values on
// the main diagonal of even an SPD matrix.
if (inverse_of_X_transp_X(i,i) < 0) {
stdErr(i) = 0;
} else {
stdErr(i) = std::sqrt( variance * inverse_of_X_transp_X(i,i) );
}
if (coef(i) == 0 && stdErr(i) == 0) {
// In this special case, 0/0 should be interpreted as 0:
// We know that 0 is the exact value for the coefficient, so
// the t-value should be 0 (corresponding to a p-value of 1)
tStats(i) = 0;
} else {
// If stdErr(i) == 0 then abs(tStats(i)) will be infinity, which is
// what we need.
tStats(i) = coef(i) / stdErr(i);
}
}
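// The two-sided p-values computed below are
//   pValues(i) = 2 * (1 - F(|tStats(i)|)),
// where F is the cumulative distribution function of Student's t-distribution
// with (numRows - widthOfX) degrees of freedom.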
// Vector of p-values: For efficiency reasons, we want to return this
// by reference, so we need to bind to db memory
DoubleCol pValues(db.allocator(), state.widthOfX);
for (int i = 0; i < state.widthOfX; i++)
pValues(i) = 2. * (1. - studentT_cdf(
state.numRows - state.widthOfX,
std::fabs( tStats(i) )));
// Return all coefficients, standard errors, etc. in a tuple
AnyTypeVector tuple;
ConcreteRecord::iterator tupleElement(tuple);
*tupleElement++ = coef;
*tupleElement++ = r2;
*tupleElement++ = stdErr;
*tupleElement++ = tStats;
*tupleElement = pValues;
return tuple;
}
} // namespace regress
} // namespace modules
} // namespace madlib