src/modules/glm/MultiResponseGLM_impl.hpp - madlib - Git at Google

 /* ----------------------------------------------------------------------- *//**
  *
  * @file MultiResponseGLM_impl.hpp
  *
  *//* ----------------------------------------------------------------------- */

 #ifndef MADLIB_MODULES_GLM_MULTIRESPONSEGLM_IMPL_HPP
 #define MADLIB_MODULES_GLM_MULTIRESPONSEGLM_IMPL_HPP

 #include <dbconnector/dbconnector.hpp>
 #include <boost/math/distributions.hpp>
 #include <limits>

 namespace madlib {

 namespace modules {

 namespace glm {

 // Use Eigen
 using namespace dbal::eigen_integration;

 template <class Container, class Family, class Link>
 inline
 MultiResponseGLMAccumulator<Container,Family,Link>::MultiResponseGLMAccumulator(
         Init_type& inInitialization)
   : Base(inInitialization), optimizer(inInitialization) {

     this->initialize();
 }


 /**
  * @brief Reset the accumulator before accumulating the first tuple
  */
 template <class Container, class Family, class Link>
 inline
 void
 MultiResponseGLMAccumulator<Container,Family,Link>::reset() {
     num_rows = 0;
     terminated = false;
     loglik = 0.;
     optimizer.reset();
 }

 /**
  * @brief Bind all elements of the state to the data in the stream
  *
  * The bind() is special in that even after running operator>>() on an element,
  * there is no guarantee yet that the element can indeed be accessed. It is
  * cruicial to first check this.
  *
  * Provided that this methods correctly lists all member variables, all other
  * methods can, however, rely on that fact that all variables are correctly
  * initialized and accessible.
  */
 template <class Container, class Family, class Link>
 inline
 void
 MultiResponseGLMAccumulator<Container,Family,Link>::bind(
         ByteStream_type& inStream) {

     inStream
         >> num_features
         >> num_categories
         >> num_rows
         >> terminated
         >> loglik
         >> optimizer;

     // bind vcov and optimizer.hessian onto the same memory as we don't need them
     // at the same time
     vcov.rebind(optimizer.hessian.memoryHandle(),
                 optimizer.hessian.rows(),
                 optimizer.hessian.cols());
 }

 /**
  * @brief Update the accumulation state by feeding a tuple
  */
 template <class Container, class Family, class Link>
 inline
 MultiResponseGLMAccumulator<Container,Family,Link>&
 MultiResponseGLMAccumulator<Container,Family,Link>::operator<<(
         const tuple_type& inTuple) {

     const MappedColumnVector& x = std::get<0>(inTuple);
     const int& y = static_cast<int>(std::get<1>(inTuple));
     // convert y to an indicator vector
     ColumnVector vecY(num_categories-1);
     vecY.fill(0);
     if (y != 0) { vecY(y - 1) = 1; }

     // The following checks were introduced with MADLIB-138. It still seems
     // useful to have clear error messages in case of infinite input values.
     if (!dbal::eigen_integration::isfinite(x)) {
         warning("Design matrix is not finite.");
     } else if (x.size() > std::numeric_limits<uint16_t>::max()) {
         warning("Number of independent variables cannot be "
             "larger than 65535.");
     } else if (num_features != static_cast<uint16_t>(x.size())) {
         warning("Inconsistent numbers of independent variables.");
     } else {
         // normal case
         uint16_t N = num_features;
         uint16_t C = static_cast<uint16_t>(num_categories - 1);
         ColumnVector mu(C);
         ColumnVector ita(C);
         Matrix G_prime(C, C);
         Matrix V(C, C);

         // initialize ita and mu in the first iteration
         if (optimizer.beta.norm() == 0.) {
             Link::init(mu);
             Link::link_func(mu,ita);
         } else {
             for (int i = 0; i < ita.size(); i ++) {
                 ita(i) = trans(x) * optimizer.beta.segment(i*N, N);
             }
             Link::mean_func(ita, mu);
         }

         // intermediate values
         Link::mean_derivative(ita, G_prime);
         Family::variance(mu, V);

         // log-likelihood
         loglik += Family::loglik(vecY, mu, 1.);

         // calcualte X_trans_W_X
         Matrix GtVinvG(trans(G_prime) * V.inverse() * G_prime);
         Matrix XXt(x * trans(x));
         for (int i = 0; i < C; i ++) {
             for (int j = 0; j < C; j ++) {
                 for (int x_i = 0; x_i < num_features; x_i ++) {
                    for (int x_j = 0; x_j < num_features; x_j ++) {
                         optimizer.hessian(i*num_features + x_i,
                                           j*num_features + x_j)
                                 += GtVinvG(i,j) * XXt(x_i,x_j);
                    }
                 }
             }
         }

         // calculate X_trans_W_Y
         ColumnVector YVinvG(trans(vecY-mu) * V.inverse() * G_prime);
         for (int i = 0; i < C; i ++) {
             optimizer.grad.segment(i*N, N) -= YVinvG(i) * x;
         }
         num_rows ++;
         return *this;
     } // all tests passed and we were in the else

     // error case
     terminated = true;
     return *this;
 }

 /**
  * @brief Merge with another accumulation state
  */
 template <class Container, class Family, class Link>
 template <class C, class F, class L>
 inline
 MultiResponseGLMAccumulator<Container,Family,Link>&
 MultiResponseGLMAccumulator<Container,Family,Link>::operator<<(
         const MultiResponseGLMAccumulator<C,F,L>& inOther) {
     if (this->empty()) {
         *this = inOther;
     } else if (inOther.empty()) {
     } else if (num_features != inOther.num_features) {
         warning("Inconsistent numbers of independent variables.");
         terminated = true;
     } else {
         num_rows += inOther.num_rows;
         loglik += inOther.loglik;
         optimizer.grad += inOther.optimizer.grad;
         optimizer.hessian += inOther.optimizer.hessian;
     }

     return *this;
 }

 /**
  * @brief Copy from a previous state
  */
 template <class Container, class Family, class Link>
 template <class C, class F, class L>
 inline
 MultiResponseGLMAccumulator<Container,Family,Link>&
 MultiResponseGLMAccumulator<Container,Family,Link>::operator=(
         const MultiResponseGLMAccumulator<C,F,L>& inOther) {
     this->copy(inOther);
     return *this;
 }

 /**
  * @brief Apply the accumulated intra-state values to inter-state member
  */
 template <class Container, class Family, class Link>
 inline
 void
 MultiResponseGLMAccumulator<Container,Family,Link>::apply() {
     if (!dbal::eigen_integration::isfinite(optimizer.hessian) ||
             !dbal::eigen_integration::isfinite(optimizer.grad)) {
         warning("Hessian or gradient is not finite.");
         terminated = true;
     } else {
         optimizer.apply();
     }

 }
 // -----------------------------------------------------------------------

 template <class Container>
 MultiResponseGLMResult::MultiResponseGLMResult(const MultiResponseGLMAccumulator<Container>& state) {
     compute(state);
 }

 /**
  * @brief Transform an accumulation state into a result
  */
 template <class Container>
 inline
 MultiResponseGLMResult&
 MultiResponseGLMResult::compute(const MultiResponseGLMAccumulator<Container>& state) {
     typedef Eigen::Map<Matrix> MMap;
     typedef MMap::Scalar Scalar;
     // Vector of coefficients: For efficiency reasons, we want to return this
     // by reference, so we need to bind to db memory
     Allocator& allocator = defaultAllocator();
     Index N = static_cast<Index>(state.num_features);
     Index C = static_cast<Index>(state.num_categories - 1);
     coef.rebind(allocator.allocateArray<double>(C, N), N, C);
     std_err.rebind(allocator.allocateArray<double>(C, N), N, C);
     z_stats.rebind(allocator.allocateArray<double>(C, N), N, C);
     p_values.rebind(allocator.allocateArray<double>(C, N), N, C);

     // computation
     loglik = state.loglik;
     coef = MMap(const_cast<Scalar*>(state.optimizer.beta.data()), N, C);
     Matrix tmp_sd(state.vcov.diagonal().cwiseSqrt());
     tmp_sd.resize(N, C);
     std_err = tmp_sd;
     z_stats = coef.cwiseQuotient(std_err);
     for (Index i = 0; i < N*C; i ++) {
         p_values(i) = 2. * prob::cdf( prob::normal(), -std::abs(z_stats(i)));
     }
     num_rows_processed = state.num_rows;

     return *this;
 }

 } // namespace glm

 } // namespace modules

 } // namespace madlib

 #endif // defined(MADLIB_MODULES_GLM_MULTIRESPONSEGLM_IMPL_HPP)
	/* ----------------------------------------------------------------------- //*
	*
	* @file MultiResponseGLM_impl.hpp
	*
	// ----------------------------------------------------------------------- */

	#ifndef MADLIB_MODULES_GLM_MULTIRESPONSEGLM_IMPL_HPP
	#define MADLIB_MODULES_GLM_MULTIRESPONSEGLM_IMPL_HPP

	#include <dbconnector/dbconnector.hpp>
	#include <boost/math/distributions.hpp>
	#include <limits>

	namespace madlib {

	namespace modules {

	namespace glm {

	// Use Eigen
	using namespace dbal::eigen_integration;

	template <class Container, class Family, class Link>
	inline
	MultiResponseGLMAccumulator<Container,Family,Link>::MultiResponseGLMAccumulator(
	Init_type& inInitialization)
	: Base(inInitialization), optimizer(inInitialization) {

	this->initialize();
	}


	/**
	* @brief Reset the accumulator before accumulating the first tuple
	*/
	template <class Container, class Family, class Link>
	inline
	void
	MultiResponseGLMAccumulator<Container,Family,Link>::reset() {
	num_rows = 0;
	terminated = false;
	loglik = 0.;
	optimizer.reset();
	}

	/**
	* @brief Bind all elements of the state to the data in the stream
	*
	* The bind() is special in that even after running operator>>() on an element,
	* there is no guarantee yet that the element can indeed be accessed. It is
	* cruicial to first check this.
	*
	* Provided that this methods correctly lists all member variables, all other
	* methods can, however, rely on that fact that all variables are correctly
	* initialized and accessible.
	*/
	template <class Container, class Family, class Link>
	inline
	void
	MultiResponseGLMAccumulator<Container,Family,Link>::bind(
	ByteStream_type& inStream) {

	inStream
	>> num_features
	>> num_categories
	>> num_rows
	>> terminated
	>> loglik
	>> optimizer;

	// bind vcov and optimizer.hessian onto the same memory as we don't need them
	// at the same time
	vcov.rebind(optimizer.hessian.memoryHandle(),
	optimizer.hessian.rows(),
	optimizer.hessian.cols());
	}

	/**
	* @brief Update the accumulation state by feeding a tuple
	*/
	template <class Container, class Family, class Link>
	inline
	MultiResponseGLMAccumulator<Container,Family,Link>&
	MultiResponseGLMAccumulator<Container,Family,Link>::operator<<(
	const tuple_type& inTuple) {

	const MappedColumnVector& x = std::get<0>(inTuple);
	const int& y = static_cast<int>(std::get<1>(inTuple));
	// convert y to an indicator vector
	ColumnVector vecY(num_categories-1);
	vecY.fill(0);
	if (y != 0) { vecY(y - 1) = 1; }

	// The following checks were introduced with MADLIB-138. It still seems
	// useful to have clear error messages in case of infinite input values.
	if (!dbal::eigen_integration::isfinite(x)) {
	warning("Design matrix is not finite.");
	} else if (x.size() > std::numeric_limits<uint16_t>::max()) {
	warning("Number of independent variables cannot be "
	"larger than 65535.");
	} else if (num_features != static_cast<uint16_t>(x.size())) {
	warning("Inconsistent numbers of independent variables.");
	} else {
	// normal case
	uint16_t N = num_features;
	uint16_t C = static_cast<uint16_t>(num_categories - 1);
	ColumnVector mu(C);
	ColumnVector ita(C);
	Matrix G_prime(C, C);
	Matrix V(C, C);

	// initialize ita and mu in the first iteration
	if (optimizer.beta.norm() == 0.) {
	Link::init(mu);
	Link::link_func(mu,ita);
	} else {
	for (int i = 0; i < ita.size(); i ++) {
	ita(i) = trans(x) * optimizer.beta.segment(i*N, N);
	}
	Link::mean_func(ita, mu);
	}

	// intermediate values
	Link::mean_derivative(ita, G_prime);
	Family::variance(mu, V);

	// log-likelihood
	loglik += Family::loglik(vecY, mu, 1.);

	// calcualte X_trans_W_X
	Matrix GtVinvG(trans(G_prime) * V.inverse() * G_prime);
	Matrix XXt(x * trans(x));
	for (int i = 0; i < C; i ++) {
	for (int j = 0; j < C; j ++) {
	for (int x_i = 0; x_i < num_features; x_i ++) {
	for (int x_j = 0; x_j < num_features; x_j ++) {
	optimizer.hessian(i*num_features + x_i,
	j*num_features + x_j)
	+= GtVinvG(i,j) * XXt(x_i,x_j);
	}
	}
	}
	}

	// calculate X_trans_W_Y
	ColumnVector YVinvG(trans(vecY-mu) * V.inverse() * G_prime);
	for (int i = 0; i < C; i ++) {
	optimizer.grad.segment(iN, N) -= YVinvG(i) x;
	}
	num_rows ++;
	return *this;
	} // all tests passed and we were in the else

	// error case
	terminated = true;
	return *this;
	}

	/**
	* @brief Merge with another accumulation state
	*/
	template <class Container, class Family, class Link>
	template <class C, class F, class L>
	inline
	MultiResponseGLMAccumulator<Container,Family,Link>&
	MultiResponseGLMAccumulator<Container,Family,Link>::operator<<(
	const MultiResponseGLMAccumulator<C,F,L>& inOther) {
	if (this->empty()) {
	*this = inOther;
	} else if (inOther.empty()) {
	} else if (num_features != inOther.num_features) {
	warning("Inconsistent numbers of independent variables.");
	terminated = true;
	} else {
	num_rows += inOther.num_rows;
	loglik += inOther.loglik;
	optimizer.grad += inOther.optimizer.grad;
	optimizer.hessian += inOther.optimizer.hessian;
	}

	return *this;
	}

	/**
	* @brief Copy from a previous state
	*/
	template <class Container, class Family, class Link>
	template <class C, class F, class L>
	inline
	MultiResponseGLMAccumulator<Container,Family,Link>&
	MultiResponseGLMAccumulator<Container,Family,Link>::operator=(
	const MultiResponseGLMAccumulator<C,F,L>& inOther) {
	this->copy(inOther);
	return *this;
	}

	/**
	* @brief Apply the accumulated intra-state values to inter-state member
	*/
	template <class Container, class Family, class Link>
	inline
	void
	MultiResponseGLMAccumulator<Container,Family,Link>::apply() {
	if (!dbal::eigen_integration::isfinite(optimizer.hessian) \|\|
	!dbal::eigen_integration::isfinite(optimizer.grad)) {
	warning("Hessian or gradient is not finite.");
	terminated = true;
	} else {
	optimizer.apply();
	}

	}
	// -----------------------------------------------------------------------

	template <class Container>
	MultiResponseGLMResult::MultiResponseGLMResult(const MultiResponseGLMAccumulator<Container>& state) {
	compute(state);
	}

	/**
	* @brief Transform an accumulation state into a result
	*/
	template <class Container>
	inline
	MultiResponseGLMResult&
	MultiResponseGLMResult::compute(const MultiResponseGLMAccumulator<Container>& state) {
	typedef Eigen::Map<Matrix> MMap;
	typedef MMap::Scalar Scalar;
	// Vector of coefficients: For efficiency reasons, we want to return this
	// by reference, so we need to bind to db memory
	Allocator& allocator = defaultAllocator();
	Index N = static_cast<Index>(state.num_features);
	Index C = static_cast<Index>(state.num_categories - 1);
	coef.rebind(allocator.allocateArray<double>(C, N), N, C);
	std_err.rebind(allocator.allocateArray<double>(C, N), N, C);
	z_stats.rebind(allocator.allocateArray<double>(C, N), N, C);
	p_values.rebind(allocator.allocateArray<double>(C, N), N, C);

	// computation
	loglik = state.loglik;
	coef = MMap(const_cast<Scalar*>(state.optimizer.beta.data()), N, C);
	Matrix tmp_sd(state.vcov.diagonal().cwiseSqrt());
	tmp_sd.resize(N, C);
	std_err = tmp_sd;
	z_stats = coef.cwiseQuotient(std_err);
	for (Index i = 0; i < N*C; i ++) {
	p_values(i) = 2. * prob::cdf( prob::normal(), -std::abs(z_stats(i)));
	}
	num_rows_processed = state.num_rows;

	return *this;
	}

	} // namespace glm

	} // namespace modules

	} // namespace madlib

	#endif // defined(MADLIB_MODULES_GLM_MULTIRESPONSEGLM_IMPL_HPP)