blob: 93ac8ad7dc8ab915f0291b922f5d734ab1ee2c57 [file] [log] [blame]
/* ----------------------------------------------------------------------- *//**
*
* @file MultiResponseGLM_impl.hpp
*
*//* ----------------------------------------------------------------------- */
#ifndef MADLIB_MODULES_GLM_MULTIRESPONSEGLM_IMPL_HPP
#define MADLIB_MODULES_GLM_MULTIRESPONSEGLM_IMPL_HPP
#include <dbconnector/dbconnector.hpp>
#include <boost/math/distributions.hpp>
#include <limits>
namespace madlib {
namespace modules {
namespace glm {
// Use Eigen
using namespace dbal::eigen_integration;
template <class Container, class Family, class Link>
inline
MultiResponseGLMAccumulator<Container,Family,Link>::MultiResponseGLMAccumulator(
Init_type& inInitialization)
: Base(inInitialization), optimizer(inInitialization) {
this->initialize();
}
/**
* @brief Reset the accumulator before accumulating the first tuple
*/
template <class Container, class Family, class Link>
inline
void
MultiResponseGLMAccumulator<Container,Family,Link>::reset() {
num_rows = 0;
terminated = false;
loglik = 0.;
optimizer.reset();
}
/**
* @brief Bind all elements of the state to the data in the stream
*
* The bind() is special in that even after running operator>>() on an element,
* there is no guarantee yet that the element can indeed be accessed. It is
* cruicial to first check this.
*
* Provided that this methods correctly lists all member variables, all other
* methods can, however, rely on that fact that all variables are correctly
* initialized and accessible.
*/
template <class Container, class Family, class Link>
inline
void
MultiResponseGLMAccumulator<Container,Family,Link>::bind(
ByteStream_type& inStream) {
inStream
>> num_features
>> num_categories
>> num_rows
>> terminated
>> loglik
>> optimizer;
// bind vcov and optimizer.hessian onto the same memory as we don't need them
// at the same time
vcov.rebind(optimizer.hessian.memoryHandle(),
optimizer.hessian.rows(),
optimizer.hessian.cols());
}
/**
* @brief Update the accumulation state by feeding a tuple
*/
template <class Container, class Family, class Link>
inline
MultiResponseGLMAccumulator<Container,Family,Link>&
MultiResponseGLMAccumulator<Container,Family,Link>::operator<<(
const tuple_type& inTuple) {
const MappedColumnVector& x = std::get<0>(inTuple);
const int& y = static_cast<int>(std::get<1>(inTuple));
// convert y to an indicator vector
ColumnVector vecY(num_categories-1);
vecY.fill(0);
if (y != 0) { vecY(y - 1) = 1; }
// The following checks were introduced with MADLIB-138. It still seems
// useful to have clear error messages in case of infinite input values.
if (!dbal::eigen_integration::isfinite(x)) {
warning("Design matrix is not finite.");
} else if (x.size() > std::numeric_limits<uint16_t>::max()) {
warning("Number of independent variables cannot be "
"larger than 65535.");
} else if (num_features != static_cast<uint16_t>(x.size())) {
warning("Inconsistent numbers of independent variables.");
} else {
// normal case
uint16_t N = num_features;
uint16_t C = static_cast<uint16_t>(num_categories - 1);
ColumnVector mu(C);
ColumnVector ita(C);
Matrix G_prime(C, C);
Matrix V(C, C);
// initialize ita and mu in the first iteration
if (optimizer.beta.norm() == 0.) {
Link::init(mu);
Link::link_func(mu,ita);
} else {
for (int i = 0; i < ita.size(); i ++) {
ita(i) = trans(x) * optimizer.beta.segment(i*N, N);
}
Link::mean_func(ita, mu);
}
// intermediate values
Link::mean_derivative(ita, G_prime);
Family::variance(mu, V);
// log-likelihood
loglik += Family::loglik(vecY, mu, 1.);
// calcualte X_trans_W_X
Matrix GtVinvG(trans(G_prime) * V.inverse() * G_prime);
Matrix XXt(x * trans(x));
for (int i = 0; i < C; i ++) {
for (int j = 0; j < C; j ++) {
for (int x_i = 0; x_i < num_features; x_i ++) {
for (int x_j = 0; x_j < num_features; x_j ++) {
optimizer.hessian(i*num_features + x_i,
j*num_features + x_j)
+= GtVinvG(i,j) * XXt(x_i,x_j);
}
}
}
}
// calculate X_trans_W_Y
ColumnVector YVinvG(trans(vecY-mu) * V.inverse() * G_prime);
for (int i = 0; i < C; i ++) {
optimizer.grad.segment(i*N, N) -= YVinvG(i) * x;
}
num_rows ++;
return *this;
} // all tests passed and we were in the else
// error case
terminated = true;
return *this;
}
/**
* @brief Merge with another accumulation state
*/
template <class Container, class Family, class Link>
template <class C, class F, class L>
inline
MultiResponseGLMAccumulator<Container,Family,Link>&
MultiResponseGLMAccumulator<Container,Family,Link>::operator<<(
const MultiResponseGLMAccumulator<C,F,L>& inOther) {
if (this->empty()) {
*this = inOther;
} else if (inOther.empty()) {
} else if (num_features != inOther.num_features) {
warning("Inconsistent numbers of independent variables.");
terminated = true;
} else {
num_rows += inOther.num_rows;
loglik += inOther.loglik;
optimizer.grad += inOther.optimizer.grad;
optimizer.hessian += inOther.optimizer.hessian;
}
return *this;
}
/**
* @brief Copy from a previous state
*/
template <class Container, class Family, class Link>
template <class C, class F, class L>
inline
MultiResponseGLMAccumulator<Container,Family,Link>&
MultiResponseGLMAccumulator<Container,Family,Link>::operator=(
const MultiResponseGLMAccumulator<C,F,L>& inOther) {
this->copy(inOther);
return *this;
}
/**
* @brief Apply the accumulated intra-state values to inter-state member
*/
template <class Container, class Family, class Link>
inline
void
MultiResponseGLMAccumulator<Container,Family,Link>::apply() {
if (!dbal::eigen_integration::isfinite(optimizer.hessian) ||
!dbal::eigen_integration::isfinite(optimizer.grad)) {
warning("Hessian or gradient is not finite.");
terminated = true;
} else {
optimizer.apply();
}
}
// -----------------------------------------------------------------------
template <class Container>
MultiResponseGLMResult::MultiResponseGLMResult(const MultiResponseGLMAccumulator<Container>& state) {
compute(state);
}
/**
* @brief Transform an accumulation state into a result
*/
template <class Container>
inline
MultiResponseGLMResult&
MultiResponseGLMResult::compute(const MultiResponseGLMAccumulator<Container>& state) {
typedef Eigen::Map<Matrix> MMap;
typedef MMap::Scalar Scalar;
// Vector of coefficients: For efficiency reasons, we want to return this
// by reference, so we need to bind to db memory
Allocator& allocator = defaultAllocator();
Index N = static_cast<Index>(state.num_features);
Index C = static_cast<Index>(state.num_categories - 1);
coef.rebind(allocator.allocateArray<double>(C, N), N, C);
std_err.rebind(allocator.allocateArray<double>(C, N), N, C);
z_stats.rebind(allocator.allocateArray<double>(C, N), N, C);
p_values.rebind(allocator.allocateArray<double>(C, N), N, C);
// computation
loglik = state.loglik;
coef = MMap(const_cast<Scalar*>(state.optimizer.beta.data()), N, C);
Matrix tmp_sd(state.vcov.diagonal().cwiseSqrt());
tmp_sd.resize(N, C);
std_err = tmp_sd;
z_stats = coef.cwiseQuotient(std_err);
for (Index i = 0; i < N*C; i ++) {
p_values(i) = 2. * prob::cdf( prob::normal(), -std::abs(z_stats(i)));
}
num_rows_processed = state.num_rows;
return *this;
}
} // namespace glm
} // namespace modules
} // namespace madlib
#endif // defined(MADLIB_MODULES_GLM_MULTIRESPONSEGLM_IMPL_HPP)