blob: 00aa9c0b4b35fe94d82293ed97be0613a222a545 [file] [log] [blame]
/* ----------------------------------------------------------------------- *//**
*
* @file LinearRegression_impl.hpp
*
*//* ----------------------------------------------------------------------- */
#ifndef MADLIB_MODULES_REGRESS_LINEAR_REGRESSION_IMPL_HPP
#define MADLIB_MODULES_REGRESS_LINEAR_REGRESSION_IMPL_HPP
#include <dbconnector/dbconnector.hpp>
#include <boost/math/distributions.hpp>
#include <modules/prob/student.hpp>
#include <modules/prob/boost.hpp>
#include <limits>
namespace madlib {
// Use Eigen
using namespace dbal::eigen_integration;
namespace modules {
namespace regress {
// Constructor: forwards the initialization handle to the base container and
// zero-initializes all accumulated state (numRows, sums, matrices).
template <class Container>
inline
LinearRegressionAccumulator<Container>::LinearRegressionAccumulator(
Init_type& inInitialization)
: Base(inInitialization) {
this->initialize();
}
/**
* @brief Bind all elements of the state to the data in the stream
*
 * The bind() is special in that even after running operator>>() on an element,
 * there is no guarantee yet that the element can indeed be accessed. It is
 * crucial to first check this.
 *
 * Provided that this method correctly lists all member variables, all other
 * methods can, however, rely on the fact that all variables are correctly
 * initialized and accessible.
*/
template <class Container>
inline
void
LinearRegressionAccumulator<Container>::bind(ByteStream_type& inStream) {
    // Read the fixed-size scalars first: widthOfX must be available before
    // the variable-length members below can be sized.
    inStream >> numRows >> widthOfX >> y_sum >> y_square_sum;

    uint16_t vectorWidth = 0;
    if (!widthOfX.isNull())
        vectorWidth = static_cast<uint16_t>(widthOfX);

    // Bind the variable-length members to appropriately sized storage.
    inStream
        >> X_transp_Y.rebind(vectorWidth)
        >> X_transp_X.rebind(vectorWidth, vectorWidth);
}
/**
* @brief Update the accumulation state
*
* We update the number of rows \f$ n \f$, the partial
* sums \f$ \sum_{i=1}^n y_i \f$ and \f$ \sum_{i=1}^n y_i^2 \f$, the matrix
* \f$ X^T X \f$, and the vector \f$ X^T \boldsymbol y \f$.
*/
template <class Container>
inline
LinearRegressionAccumulator<Container>&
LinearRegressionAccumulator<Container>::operator<<(const tuple_type& inTuple) {
    const MappedColumnVector& indep = std::get<0>(inTuple);
    const double& dep = std::get<1>(inTuple);

    // Guard clauses introduced with MADLIB-138: clear error messages are
    // preferable to NaN/Inf silently propagating into the running sums.
    if (!std::isfinite(dep))
        throw std::domain_error("Dependent variables are not finite.");
    if (!dbal::eigen_integration::isfinite(indep))
        throw std::domain_error("Design matrix is not finite.");
    if (indep.size() > std::numeric_limits<uint16_t>::max())
        throw std::domain_error("Number of independent variables cannot be "
            "larger than 65535.");

    if (numRows == 0) {
        // First observation fixes the number of features and sizes the state.
        widthOfX = static_cast<uint16_t>(indep.size());
        this->resize();
    } else if (widthOfX != static_cast<uint16_t>(indep.size())) {
        throw std::runtime_error("Inconsistent numbers of independent "
            "variables.");
    }

    ++numRows;
    y_sum += dep;
    y_square_sum += dep * dep;
    X_transp_Y.noalias() += indep * dep;

    // X^T X is symmetric, so accumulating only the lower triangle suffices.
    triangularView<Lower>(X_transp_X) += indep * trans(indep);
    return *this;
}
/**
* @brief Merge with another accumulation state
*/
template <class Container>
template <class OtherContainer>
inline
LinearRegressionAccumulator<Container>&
LinearRegressionAccumulator<Container>::operator<<(
        const LinearRegressionAccumulator<OtherContainer>& inOther) {
    // An empty state on either side makes the merge trivial.
    if (numRows == 0) {
        *this = inOther;
        return *this;
    }
    if (inOther.numRows == 0)
        return *this;
    if (widthOfX != inOther.widthOfX)
        throw std::runtime_error("Inconsistent numbers of independent "
            "variables.");

    // Both states are non-empty and compatible: sum element-wise.
    numRows += inOther.numRows;
    y_sum += inOther.y_sum;
    y_square_sum += inOther.y_square_sum;
    X_transp_Y.noalias() += inOther.X_transp_Y;
    triangularView<Lower>(X_transp_X) += inOther.X_transp_X;
    return *this;
}
// Cross-container assignment: delegates to the base class's copy(), which
// transfers the full accumulation state (counts, sums, and matrices).
template <class Container>
template <class OtherContainer>
inline
LinearRegressionAccumulator<Container>&
LinearRegressionAccumulator<Container>::operator=(
const LinearRegressionAccumulator<OtherContainer>& inOther) {
this->copy(inOther);
return *this;
}
// Convenience constructor: immediately transforms the given accumulation
// state into the final regression result via compute().
template <class Container>
LinearRegression::LinearRegression(
const LinearRegressionAccumulator<Container>& inState) {
compute(inState);
}
/**
* @brief Transform a linear-regression accumulation state into a result
*
* The result of the accumulation phase is \f$ X^T X \f$ and
* \f$ X^T \boldsymbol y \f$. We first compute the pseudo-inverse, then the
* regression coefficients, the model statistics, etc.
*
* @sa For the mathematical description, see \ref grp_linreg.
*/
template <class Container>
inline
LinearRegression&
LinearRegression::compute(
const LinearRegressionAccumulator<Container>& inState) {
Allocator& allocator = defaultAllocator();
// The following checks were introduced with MADLIB-138. It still seems
// useful to have clear error messages in case of infinite input values.
if (!dbal::eigen_integration::isfinite(inState.X_transp_X) ||
!dbal::eigen_integration::isfinite(inState.X_transp_Y))
throw std::domain_error("Design matrix is not finite.");
// Eigen-decompose X^T X; using the pseudo-inverse also covers the
// rank-deficient (collinear) case.
SymmetricPositiveDefiniteEigenDecomposition<Matrix> decomposition(
inState.X_transp_X, EigenvaluesOnly, ComputePseudoInverse);
// Precompute (X^T * X)^+
Matrix inverse_of_X_transp_X = decomposition.pseudoInverse();
conditionNo = decomposition.conditionNo();
// Vector of coefficients: For efficiency reasons, we want to return this
// by reference, so we need to bind to db memory
coef.rebind(allocator.allocateArray<double>(inState.widthOfX));
// coef = (X^T X)^+ X^T y, the normal-equations solution.
coef.noalias() = inverse_of_X_transp_X * inState.X_transp_Y;
// explained sum of squares (regression sum of squares)
// NOTE(review): ess/tss divide by numRows — if compute() can be reached
// with an empty state (numRows == 0) these are NaN; verify callers
// guarantee at least one row.
double ess = dot(inState.X_transp_Y, coef)
- (inState.y_sum * inState.y_sum / static_cast<double>(inState.numRows));
// total sum of squares
double tss = inState.y_square_sum
- (inState.y_sum * inState.y_sum / static_cast<double>(inState.numRows));
// With infinite precision, the following checks are pointless. But due to
// floating-point arithmetic, this need not hold at this point.
// Without a formal proof convincing us of the contrary, we should
// anticipate that numerical peculiarities might occur.
if (tss < 0)
tss = 0;
if (ess < 0)
ess = 0;
// Since we know tss with greater accuracy than ess, we do the following
// sanity adjustment to ess:
if (ess > tss)
ess = tss;
// coefficient of determination
// If tss == 0, then the regression perfectly fits the data, so the
// coefficient of determination is 1.
r2 = (tss == 0 ? 1 : ess / tss);
// In the case of linear regression:
// residual sum of squares (rss) = total sum of squares (tss) - explained
// sum of squares (ess)
// Proof: http://en.wikipedia.org/wiki/Sum_of_squares
double rss = tss - ess;
// Variance is also called the mean square error
// NOTE(review): divides by (numRows - widthOfX); for an exactly
// determined model (numRows == widthOfX) this is a division by zero
// yielding inf/NaN — presumably guarded upstream; confirm.
double variance = rss / static_cast<double>(inState.numRows - inState.widthOfX);
// Vector of standard errors and t-statistics: For efficiency reasons, we
// want to return these by reference, so we need to bind to db memory
stdErr.rebind(allocator.allocateArray<double>(inState.widthOfX));
tStats.rebind(allocator.allocateArray<double>(inState.widthOfX));
for (int i = 0; i < inState.widthOfX; i++) {
// In an abundance of caution, we see a tiny possibility that numerical
// instabilities in the pinv operation can lead to negative values on
// the main diagonal of even a SPD matrix
if (inverse_of_X_transp_X(i,i) < 0) {
stdErr(i) = 0;
} else {
stdErr(i) = std::sqrt( variance * inverse_of_X_transp_X(i,i) );
}
if (coef(i) == 0 && stdErr(i) == 0) {
// In this special case, 0/0 should be interpreted as 0:
// We know that 0 is the exact value for the coefficient, so
// the t-value should be 0 (corresponding to a p-value of 1)
tStats(i) = 0;
} else {
// If stdErr(i) == 0 then abs(tStats(i)) will be infinity, which
// is what we need.
tStats(i) = coef(i) / stdErr(i);
}
}
// Matrix of variance-covariance matrix : For efficiency reasons, we want to return this
// by reference, so we need to bind to db memory
vcov.rebind(allocator.allocateArray<double>(inState.widthOfX,
inState.widthOfX),
inState.widthOfX,
inState.widthOfX);
vcov = variance * inverse_of_X_transp_X;
// Vector of p-values: For efficiency reasons, we want to return this
// by reference, so we need to bind to db memory
pValues.rebind(allocator.allocateArray<double>(inState.widthOfX));
// Two-sided p-values from Student's t with (numRows - widthOfX) degrees
// of freedom.
// NOTE(review): only filled when numRows > widthOfX; otherwise the freshly
// allocated array is returned as-is — confirm callers handle that case.
if (inState.numRows > inState.widthOfX)
for (int i = 0; i < inState.widthOfX; i++)
pValues(i) = 2. * prob::cdf(
boost::math::complement(
prob::students_t(
static_cast<double>(inState.numRows - inState.widthOfX)
),
std::fabs(tStats(i))
));
return *this;
}
/*
* Robust Linear Regression: Huber-White Sandwich estimator
*/
// Constructor: forwards the initialization handle to the base container and
// zero-initializes all accumulated state.
template <class Container>
inline
RobustLinearRegressionAccumulator<Container>::RobustLinearRegressionAccumulator(
Init_type& inInitialization) : Base(inInitialization) {
this->initialize();
}
/**
* @brief Bind all elements of the state to the data in the stream
*
 * The bind() is special in that even after running operator>>() on an element,
 * there is no guarantee yet that the element can indeed be accessed. It is
 * crucial to first check this.
 *
 * Provided that this method correctly lists all member variables, all other
 * methods can, however, rely on the fact that all variables are correctly
 * initialized and accessible.
*/
template <class Container>
inline
void
RobustLinearRegressionAccumulator<Container>::bind(ByteStream_type& inStream) {
    // Read the fixed-size scalars first: widthOfX must be available before
    // the variable-length members below can be sized.
    inStream >> numRows >> widthOfX;

    uint16_t vectorWidth = 0;
    if (!widthOfX.isNull())
        vectorWidth = static_cast<uint16_t>(widthOfX);

    // Bind the variable-length members to appropriately sized storage.
    inStream
        >> ols_coef.rebind(vectorWidth)
        >> X_transp_X.rebind(vectorWidth, vectorWidth)
        >> X_transp_r2_X.rebind(vectorWidth, vectorWidth);
}
/**
* @brief Update the accumulation state
*
 * We update the number of rows \f$ n \f$, the matrix \f$ X^T X \f$,
 * and the matrix \f$ X^T \mathrm{diag}(r_1^2, r_2^2, \ldots, r_n^2) X \f$
*/
template <class Container>
inline
RobustLinearRegressionAccumulator<Container>&
RobustLinearRegressionAccumulator<Container>::operator<<(const tuple_type& inTuple) {
    // Inputs
    const MappedColumnVector& x = std::get<0>(inTuple);
    const double& y = std::get<1>(inTuple);
    const MappedColumnVector& coef = std::get<2>(inTuple);

    // Input validation (cf. MADLIB-138). The design-matrix finiteness check
    // was missing here although the plain linear accumulator performs it;
    // without it, a non-finite x silently poisons X^T X and X^T r^2 X.
    if (!std::isfinite(y))
        throw std::domain_error("Dependent variables are not finite.");
    else if (!dbal::eigen_integration::isfinite(x))
        throw std::domain_error("Design matrix is not finite.");
    else if (x.size() > std::numeric_limits<uint16_t>::max())
        throw std::domain_error("Number of independent variables cannot be "
            "larger than 65535.");

    // Initialize in first iteration
    if (numRows == 0) {
        widthOfX = static_cast<uint16_t>(x.size());
        this->resize();
        // The OLS coefficients are constant across rows; remember them once.
        ols_coef = coef;
    }
    // dimension check
    if (widthOfX != static_cast<uint16_t>(x.size())) {
        throw std::runtime_error("Inconsistent numbers of independent "
            "variables.");
    }

    numRows++;
    // Residual of this row under the previously fitted OLS coefficients.
    double r = y - trans(ols_coef)*x;

    // The following matrices are symmetric, so it is sufficient to
    // only fill a triangular part
    triangularView<Lower>(X_transp_X) += x * trans(x);
    triangularView<Lower>(X_transp_r2_X) += r * r * x * trans(x);
    return *this;
}
/**
* @brief Merge with another accumulation state
*/
template <class Container>
template <class OtherContainer>
inline
RobustLinearRegressionAccumulator<Container>&
RobustLinearRegressionAccumulator<Container>::operator<<(
        const RobustLinearRegressionAccumulator<OtherContainer>& inOther) {
    // Handle empty states explicitly, mirroring the plain linear merge.
    // Previously an empty *this (widthOfX == 0, ols_coef unset) was summed
    // element-wise with a non-empty state, adding matrices of different
    // sizes and losing ols_coef entirely.
    if (numRows == 0) {
        *this = inOther;
        return *this;
    } else if (inOther.numRows == 0) {
        return *this;
    } else if (widthOfX != inOther.widthOfX) {
        throw std::runtime_error("Inconsistent numbers of independent "
            "variables.");
    }

    numRows += inOther.numRows;
    triangularView<Lower>(X_transp_X) += inOther.X_transp_X;
    triangularView<Lower>(X_transp_r2_X) += inOther.X_transp_r2_X;
    return *this;
}
// Cross-container assignment: delegates to the base class's copy(), which
// transfers the full accumulation state (count, ols_coef, and matrices).
template <class Container>
template <class OtherContainer>
inline
RobustLinearRegressionAccumulator<Container>&
RobustLinearRegressionAccumulator<Container>::operator=(
const RobustLinearRegressionAccumulator<OtherContainer>& inOther) {
this->copy(inOther);
return *this;
}
// Convenience constructor: immediately transforms the given accumulation
// state into the final robust-regression result via compute().
template <class Container>
RobustLinearRegression::RobustLinearRegression(
const RobustLinearRegressionAccumulator<Container>& inState) {
compute(inState);
}
/**
* @brief Transform a robust linear-regression accumulation state into a result
*
* The result of the accumulation phase is \f$ X^T X \f$ and
 * \f$ X^T U X \f$. We first compute the pseudo-inverse, then
 * the robust model statistics.
*
* @sa For the mathematical description, see \ref grp_linreg.
*/
template <class Container>
inline
RobustLinearRegression&
RobustLinearRegression::compute(
const RobustLinearRegressionAccumulator<Container>& inState) {
Allocator& allocator = defaultAllocator();
// The following checks were introduced with MADLIB-138. It still seems
// useful to have clear error messages in case of infinite input values.
if (!dbal::eigen_integration::isfinite(inState.X_transp_X) ||
!dbal::eigen_integration::isfinite(inState.X_transp_r2_X))
throw std::domain_error("Design matrix is not finite.");
SymmetricPositiveDefiniteEigenDecomposition<Matrix> decomposition(
inState.X_transp_X, EigenvaluesOnly, ComputePseudoInverse);
// Precompute (X^T * X)^+
Matrix inverse_of_X_transp_X = decomposition.pseudoInverse();
// Calculate the robust variance covariance matrix as:
// (X^T X)^-1 X^T diag(r1^2,r2^2....rn^2)X (X^T X)^-1
// Where r_1, r_2 ... r_n are the residuals
// Note: X_transp_r2_X calculates X^T diag(r1^2,r2^2....rn^2)X
// Only the lower triangle of X_transp_r2_X was filled during accumulation;
// the strictly-lower part plus the transpose (which contributes the upper
// part including the diagonal) reconstructs the full symmetric matrix.
Matrix robust_var_cov = inState.X_transp_r2_X.template triangularView<Eigen::StrictlyLower>();
robust_var_cov = robust_var_cov + trans(inState.X_transp_r2_X);
robust_var_cov = inverse_of_X_transp_X * robust_var_cov * inverse_of_X_transp_X;
// Vector of standard errors and t-statistics: For efficiency reasons, we
// want to return these by reference, so we need to bind to db memory
coef.rebind(allocator.allocateArray<double>(inState.widthOfX));
stdErr.rebind(allocator.allocateArray<double>(inState.widthOfX));
tStats.rebind(allocator.allocateArray<double>(inState.widthOfX));
for (int i = 0; i < inState.widthOfX; i++) {
// In an abundance of caution, we see a tiny possibility that numerical
// instabilities in the pinv operation can lead to negative values on
// the main diagonal of even a SPD matrix
coef(i) = inState.ols_coef(i);
// NOTE(review): the guard inspects inverse_of_X_transp_X(i,i) but the
// sqrt below is taken of robust_var_cov(i,i); a negative diagonal entry
// of robust_var_cov would still yield NaN — confirm this is intended.
if (inverse_of_X_transp_X(i,i) < 0) {
stdErr(i) = 0;
} else {
stdErr(i) = std::sqrt(robust_var_cov(i,i));
}
if (inState.ols_coef(i) == 0 && stdErr(i) == 0) {
// In this special case, 0/0 should be interpreted as 0:
// We know that 0 is the exact value for the coefficient, so
// the t-value should be 0 (corresponding to a p-value of 1)
tStats(i) = 0;
} else {
// If stdErr(i) == 0 then abs(tStats(i)) will be infinity, which
// is what we need.
tStats(i) = inState.ols_coef(i) / stdErr(i);
}
}
// Vector of p-values: For efficiency reasons, we want to return this
// by reference, so we need to bind to db memory
pValues.rebind(allocator.allocateArray<double>(inState.widthOfX));
// Two-sided p-values from Student's t with (numRows - widthOfX) degrees
// of freedom; only computed when the degrees of freedom are positive.
if (inState.numRows > inState.widthOfX){
for (int i = 0; i < inState.widthOfX; i++){
pValues(i) = 2. * prob::cdf(
boost::math::complement(
prob::students_t(
static_cast<double>(inState.numRows - inState.widthOfX)
),
std::fabs(tStats(i))
));
}
}
return *this;
}
/*
Regression for the tests for heteroskedasticity
*/
// Constructor: forwards the initialization handle to the base container and
// zero-initializes all accumulated state.
template <class Container>
inline
HeteroLinearRegressionAccumulator<Container>::HeteroLinearRegressionAccumulator(
Init_type& inInitialization)
: Base(inInitialization) {
this->initialize();
}
/**
* @brief Bind all elements of the state to the data in the stream
*
 * The bind() is special in that even after running operator>>() on an element,
 * there is no guarantee yet that the element can indeed be accessed. It is
 * crucial to first check this.
 *
 * Provided that this method correctly lists all member variables, all other
 * methods can, however, rely on the fact that all variables are correctly
 * initialized and accessible.
*/
template <class Container>
inline
void
HeteroLinearRegressionAccumulator<Container>::bind(ByteStream_type& inStream) {
    // Read the fixed-size scalars first: widthOfX must be available before
    // the variable-length members below can be sized.
    inStream >> numRows >> widthOfX >> a_sum >> a_square_sum;

    uint16_t vectorWidth = 0;
    if (!widthOfX.isNull())
        vectorWidth = static_cast<uint16_t>(widthOfX);

    // Bind the variable-length members to appropriately sized storage.
    inStream
        >> X_transp_A.rebind(vectorWidth)
        >> X_transp_X.rebind(vectorWidth, vectorWidth);
}
/**
* @brief Update the accumulation state
*
 * We update the number of rows \f$ n \f$, the partial sums of the squared
 * residuals \f$ \sum_{i=1}^n a_i \f$ and \f$ \sum_{i=1}^n a_i^2 \f$ (where
 * \f$ a_i = (y_i - coef^T x_i)^2 \f$), the matrix \f$ X^T X \f$, and the
 * vector \f$ X^T \boldsymbol a \f$.
*/
template <class Container>
inline
HeteroLinearRegressionAccumulator<Container>&
HeteroLinearRegressionAccumulator<Container>::operator<<(const hetero_tuple_type& inTuple) {
    const MappedColumnVector& indep = std::get<0>(inTuple);
    const double& dep = std::get<1>(inTuple);
    const MappedColumnVector& fittedCoef = std::get<2>(inTuple);

    // Guard clauses: reject non-finite input and oversized models up front.
    if (!std::isfinite(dep))
        throw std::domain_error("Dependent variables are not finite.");
    if (!dbal::eigen_integration::isfinite(indep))
        throw std::domain_error("Design matrix is not finite.");
    if (indep.size() > std::numeric_limits<uint16_t>::max())
        throw std::domain_error("Number of independent variables cannot be "
            "larger than 65535.");

    // First observation fixes the number of features and sizes the state.
    if (numRows == 0) {
        widthOfX = static_cast<uint16_t>(indep.size());
        this->resize();
    }
    // Every subsequent row must match that width.
    if (widthOfX != static_cast<uint16_t>(indep.size())) {
        throw std::runtime_error("Inconsistent numbers of independent "
            "variables.");
    }

    // Squared residual of this row under the fitted coefficients.
    double residual = dep - trans(fittedCoef)*indep;
    double sqResidual = residual * residual;

    numRows++;
    a_sum += sqResidual;
    a_square_sum += sqResidual * sqResidual;
    X_transp_A.noalias() += indep * sqResidual;

    // X^T X is symmetric, so accumulating only the lower triangle suffices.
    triangularView<Lower>(X_transp_X) += indep * trans(indep);
    return *this;
}
/**
* @brief Merge with another accumulation state
*/
template <class Container>
template <class OtherContainer>
inline
HeteroLinearRegressionAccumulator<Container>&
HeteroLinearRegressionAccumulator<Container>::operator<<(
        const HeteroLinearRegressionAccumulator<OtherContainer>& inOther) {
    // Handle empty states explicitly, mirroring the plain linear merge.
    // Previously an empty *this (widthOfX == 0) was summed element-wise with
    // a non-empty state, adding matrices of different sizes.
    if (numRows == 0) {
        *this = inOther;
        return *this;
    } else if (inOther.numRows == 0) {
        return *this;
    } else if (widthOfX != inOther.widthOfX) {
        throw std::runtime_error("Inconsistent numbers of independent "
            "variables.");
    }

    numRows += inOther.numRows;
    a_sum += inOther.a_sum;
    a_square_sum += inOther.a_square_sum;
    X_transp_A.noalias() += inOther.X_transp_A;
    triangularView<Lower>(X_transp_X) += inOther.X_transp_X;
    return *this;
}
// Cross-container assignment: delegates to the base class's copy(), which
// transfers the full accumulation state (count, sums, and matrices).
template <class Container>
template <class OtherContainer>
inline
HeteroLinearRegressionAccumulator<Container>&
HeteroLinearRegressionAccumulator<Container>::operator=(
const HeteroLinearRegressionAccumulator<OtherContainer>& inOther) {
this->copy(inOther);
return *this;
}
// Convenience constructor: immediately transforms the given accumulation
// state into the final heteroskedasticity-test result via compute().
template <class Container>
HeteroLinearRegression::HeteroLinearRegression(
const HeteroLinearRegressionAccumulator<Container>& inState) {
compute(inState);
}
/**
* @brief Transform a linear-regression accumulation state into a result
*
* The result of the accumulation phase is \f$ X^T X \f$ and
* \f$ X^T \boldsymbol y \f$. We first compute the pseudo-inverse, then the
* regression coefficients, the model statistics, etc.
*
* @sa For the mathematical description, see \ref grp_linreg.
*/
template <class Container>
inline
HeteroLinearRegression&
HeteroLinearRegression::compute(
const HeteroLinearRegressionAccumulator<Container>& inState) {
// The following checks were introduced with MADLIB-138. It still seems
// useful to have clear error messages in case of infinite input values.
if (!dbal::eigen_integration::isfinite(inState.X_transp_X) ||
!dbal::eigen_integration::isfinite(inState.X_transp_A))
throw std::domain_error("Design matrix is not finite.");
SymmetricPositiveDefiniteEigenDecomposition<Matrix> decomposition(
inState.X_transp_X, EigenvaluesOnly, ComputePseudoInverse);
// Precompute (X^T * X)^+
Matrix inverse_of_X_transp_X = decomposition.pseudoInverse();
// Coefficients of the auxiliary regression of squared residuals on X.
ColumnVector coef;
coef = inverse_of_X_transp_X * inState.X_transp_A;
// explained sum of squares (regression sum of squares)
// NOTE(review): ess/tss divide by numRows — NaN if the state is empty;
// verify callers guarantee at least one row.
double ess = dot(inState.X_transp_A, coef)
- (inState.a_sum * inState.a_sum / static_cast<double>(inState.numRows));
// total sum of squares
double tss = inState.a_square_sum
- (inState.a_sum * inState.a_sum / static_cast<double>(inState.numRows));
// With infinite precision, the following checks are pointless. But due to
// floating-point arithmetic, this need not hold at this point.
// Without a formal proof convincing us of the contrary, we should
// anticipate that numerical peculiarities might occur.
if (tss < 0) tss = 0;
if (ess < 0) ess = 0;
// Since we know tss with greater accuracy than ess, we do the following
// sanity adjustment to ess:
if (ess > tss) ess = tss;
// Test statistic: numRows*Coefficient of determination
test_statistic = static_cast<double>(inState.numRows) * (tss == 0 ? 1 : ess / tss);
// NOTE(review): chi_squared requires positive degrees of freedom; for a
// model with widthOfX == 1 this constructs chi_squared(0), which Boost
// rejects — presumably the test always runs with an intercept plus at
// least one regressor; confirm.
pValue = prob::cdf(complement(prob::chi_squared(
static_cast<double>(inState.widthOfX-1)), test_statistic));
return *this;
}
} // namespace regress
} // namespace modules
} // namespace madlib
#endif // defined(MADLIB_MODULES_REGRESS_LINEAR_REGRESSION_IMPL_HPP)