src/modules/glm/GLM_impl.hpp - madlib - Git at Google

 /* ----------------------------------------------------------------------- *//**
  *
  * @file GLM_impl.hpp
  *
  *//* ----------------------------------------------------------------------- */

 #ifndef MADLIB_MODULES_GLM_GLM_IMPL_HPP
 #define MADLIB_MODULES_GLM_GLM_IMPL_HPP

 #include <dbconnector/dbconnector.hpp>
 #include <boost/math/distributions.hpp>
 #include <limits>

 namespace madlib {

 namespace modules {

 namespace glm {

 // Use Eigen
 using namespace dbal::eigen_integration;

 template <class Container, class Family, class Link>
 inline
 GLMAccumulator<Container,Family,Link>::GLMAccumulator(
         Init_type& inInitialization)
 : Base(inInitialization) {

     this->initialize();
 }


 /**
  * @brief Reset the accumulator before accumulating the first tuple
  */
 template <class Container, class Family, class Link>
 inline
 void
 GLMAccumulator<Container,Family,Link>::reset() {
     num_rows = 0;
     terminated = false;
     loglik = 0.;
     dispersion_accum = 0.;
     grad.setZero();
     hessian.setZero();
 }

 /**
  * @brief Bind all elements of the state to the data in the stream
  *
  * The bind() is special in that even after running operator>>() on an element,
  * there is no guarantee yet that the element can indeed be accessed. It is
  * cruicial to first check this.
  *
  * Provided that this methods correctly lists all member variables, all other
  * methods can, however, rely on that fact that all variables are correctly
  * initialized and accessible.
  */
 template <class Container, class Family, class Link>
 inline
 void
 GLMAccumulator<Container,Family,Link>::bind(
         ByteStream_type& inStream) {
     inStream
         >> num_rows
         >> terminated
         >> loglik
         >> dispersion
         >> dispersion_accum
         >> num_coef;

         uint16_t M = num_coef.isNull()
             ? static_cast<uint16_t>(0)
             : static_cast<uint16_t>(num_coef);
         inStream
             >> beta.rebind(M)
             >> grad.rebind(M)
             >> hessian.rebind(M, M);

     // bind vcov and optimizer.hessian onto the same memory as we don't need them
     // at the same time
     vcov.rebind(hessian.memoryHandle(),
                 hessian.rows(),
                 hessian.cols());
 }

 /**
  * @brief Update the accumulation state by feeding a tuple
  */
 template <class Container, class Family, class Link>
 inline
 GLMAccumulator<Container,Family,Link>&
 GLMAccumulator<Container,Family,Link>::operator<<(
         const tuple_type& inTuple) {
     const MappedColumnVector& x = std::get<0>(inTuple);
     const double& y = std::get<1>(inTuple);
     // The following checks were introduced with MADLIB-138. It still seems
     // useful to have clear error messages in case of infinite input values.
     if (!std::isfinite(y)) {
         warning("Dependent variables are not finite.");
     //domain check for y
     } else if (!Family::in_range(y)) {
         std::stringstream err_msg;
         err_msg << "Dependent variables are out of range: "
             << Family::out_of_range_err_msg();
         throw std::runtime_error(err_msg.str());
     } else if (!dbal::eigen_integration::isfinite(x)) {
         warning("Design matrix is not finite.");
     } else if (x.size() > std::numeric_limits<uint16_t>::max()) {
         warning("Number of independent variables cannot be "
             "larger than 65535.");
     } else if (num_coef != static_cast<uint16_t>(x.size())) {
         warning("Inconsistent numbers of independent variables.");
     } else {
         // normal case
         if (beta.norm() == 0.) {
             double mu = Link::init(y);
             double ita = Link::link_func(mu);
             double G_prime = Link::mean_derivative(ita);
             double V = Family::variance(mu);
             double w = G_prime * G_prime / V;
             // dispersion_accum += (y - mu) * (y - mu) / V;
             loglik += Family::loglik(y, mu, dispersion);
             hessian += x * trans(x) * w; // X_trans_W_X
             grad -= x * w * ita; // X_trans_W_Y
         } else {
             double ita = trans(x) * beta;
             double mu = Link::mean_func(ita);
             double G_prime = Link::mean_derivative(ita);
             double V = Family::variance(mu);
             double w = G_prime * G_prime / V;
             dispersion_accum += (y - mu) * (y - mu) / V;
             loglik += Family::loglik(y, mu, dispersion);

             if (!std::isfinite(static_cast<double>(loglik))) {
                 terminated = true;
                 warning("Log-likelihood becomes negative infinite. Maybe the model is not proper for this data set.");
                 return *this;
             }

             hessian += x * trans(x) * w; // X_trans_W_X
             grad -= x * (y - mu) * G_prime / V; // X_trans_W_Y
         }
         num_rows ++;
         return *this;
     }

     // error case
     terminated = true;
     return *this;
 }

 /**
  * @brief Merge with another accumulation state
  */
 template <class Container, class Family, class Link>
 template <class C, class F, class L>
 inline
 GLMAccumulator<Container,Family,Link>&
 GLMAccumulator<Container,Family,Link>::operator<<(
         const GLMAccumulator<C,F,L>& inOther) {
     if (this->empty()) {
         *this = inOther;
     } else if (inOther.empty()) {
     } else if (num_coef != inOther.num_coef) {
         warning("Inconsistent numbers of independent variables.");
         terminated = true;
     } else {
         num_rows += inOther.num_rows;
         loglik += inOther.loglik;
         grad += inOther.grad;
         hessian += inOther.hessian;
         dispersion_accum += inOther.dispersion_accum;
     }

     return *this;
 }

 /**
  * @brief Copy from a previous state
  */
 template <class Container, class Family, class Link>
 template <class C, class F, class L>
 inline
 GLMAccumulator<Container,Family,Link>&
 GLMAccumulator<Container,Family,Link>::operator=(
         const GLMAccumulator<C,F,L>& inOther) {
     this->copy(inOther);
     return *this;
 }

 /**
  * @brief Apply the accumulated intra-state values to inter-state member
  */
 template <class Container, class Family, class Link>
 inline
 void
 GLMAccumulator<Container,Family,Link>::apply() {
     if (!dbal::eigen_integration::isfinite(hessian) ||
             !dbal::eigen_integration::isfinite(grad)) {
         warning("Hessian or gradient is not finite.");
         terminated = true;
     } else {
         SymmetricPositiveDefiniteEigenDecomposition<Matrix> decomposition(
                 hessian, EigenvaluesOnly, ComputePseudoInverse);

         if (beta.norm() == 0.) {
             dispersion = 1.;
         } else {
             dispersion = dispersion_accum / static_cast<double>(num_rows);
         }
         hessian = decomposition.pseudoInverse(); // become inverse after apply
         beta -= hessian * grad;
     }
 }

 // -----------------------------------------------------------------------

 template <class Container>
 GLMResult::GLMResult(const GLMAccumulator<Container>& state) {
     compute(state);
 }

 /**
  * @brief Transform an accumulation state into a result
  */
 template <class Container>
 inline
 GLMResult&
 GLMResult::compute(const GLMAccumulator<Container>& state) {
     // Vector of coefficients: For efficiency reasons, we want to return this
     // by reference, so we need to bind to db memory
     Allocator& allocator = defaultAllocator();
     uint64_t n = state.num_rows;
     uint16_t p = state.num_coef;
     coef.rebind(allocator.allocateArray<double>(p));
     std_err.rebind(allocator.allocateArray<double>(p));
     z_stats.rebind(allocator.allocateArray<double>(p));
     p_values.rebind(allocator.allocateArray<double>(p));

     loglik = state.loglik;
     coef = state.beta;
     dispersion = state.dispersion *
             static_cast<double>(n) / static_cast<double>(n - p);
     std_err = state.vcov.diagonal().cwiseSqrt();
     z_stats = coef.cwiseQuotient(std_err);
     for (Index i = 0; i < p; i ++) {
         p_values(i) = 2. * prob::cdf(prob::normal(), -std::abs(z_stats(i)));
     }
     num_rows_processed = n;

     return *this;
 }

 } // namespace glm

 } // namespace modules

 } // namespace madlib

 #endif // defined(MADLIB_MODULES_GLM_GLM_IMPL_HPP)
	/* ----------------------------------------------------------------------- //*
	*
	* @file GLM_impl.hpp
	*
	// ----------------------------------------------------------------------- */

	#ifndef MADLIB_MODULES_GLM_GLM_IMPL_HPP
	#define MADLIB_MODULES_GLM_GLM_IMPL_HPP

	#include <dbconnector/dbconnector.hpp>
	#include <boost/math/distributions.hpp>
	#include <limits>

	namespace madlib {

	namespace modules {

	namespace glm {

	// Use Eigen
	using namespace dbal::eigen_integration;

	template <class Container, class Family, class Link>
	inline
	GLMAccumulator<Container,Family,Link>::GLMAccumulator(
	Init_type& inInitialization)
	: Base(inInitialization) {

	this->initialize();
	}


	/**
	* @brief Reset the accumulator before accumulating the first tuple
	*/
	template <class Container, class Family, class Link>
	inline
	void
	GLMAccumulator<Container,Family,Link>::reset() {
	num_rows = 0;
	terminated = false;
	loglik = 0.;
	dispersion_accum = 0.;
	grad.setZero();
	hessian.setZero();
	}

	/**
	* @brief Bind all elements of the state to the data in the stream
	*
	* The bind() is special in that even after running operator>>() on an element,
	* there is no guarantee yet that the element can indeed be accessed. It is
	* cruicial to first check this.
	*
	* Provided that this methods correctly lists all member variables, all other
	* methods can, however, rely on that fact that all variables are correctly
	* initialized and accessible.
	*/
	template <class Container, class Family, class Link>
	inline
	void
	GLMAccumulator<Container,Family,Link>::bind(
	ByteStream_type& inStream) {
	inStream
	>> num_rows
	>> terminated
	>> loglik
	>> dispersion
	>> dispersion_accum
	>> num_coef;

	uint16_t M = num_coef.isNull()
	? static_cast<uint16_t>(0)
	: static_cast<uint16_t>(num_coef);
	inStream
	>> beta.rebind(M)
	>> grad.rebind(M)
	>> hessian.rebind(M, M);

	// bind vcov and optimizer.hessian onto the same memory as we don't need them
	// at the same time
	vcov.rebind(hessian.memoryHandle(),
	hessian.rows(),
	hessian.cols());
	}

	/**
	* @brief Update the accumulation state by feeding a tuple
	*/
	template <class Container, class Family, class Link>
	inline
	GLMAccumulator<Container,Family,Link>&
	GLMAccumulator<Container,Family,Link>::operator<<(
	const tuple_type& inTuple) {
	const MappedColumnVector& x = std::get<0>(inTuple);
	const double& y = std::get<1>(inTuple);
	// The following checks were introduced with MADLIB-138. It still seems
	// useful to have clear error messages in case of infinite input values.
	if (!std::isfinite(y)) {
	warning("Dependent variables are not finite.");
	//domain check for y
	} else if (!Family::in_range(y)) {
	std::stringstream err_msg;
	err_msg << "Dependent variables are out of range: "
	<< Family::out_of_range_err_msg();
	throw std::runtime_error(err_msg.str());
	} else if (!dbal::eigen_integration::isfinite(x)) {
	warning("Design matrix is not finite.");
	} else if (x.size() > std::numeric_limits<uint16_t>::max()) {
	warning("Number of independent variables cannot be "
	"larger than 65535.");
	} else if (num_coef != static_cast<uint16_t>(x.size())) {
	warning("Inconsistent numbers of independent variables.");
	} else {
	// normal case
	if (beta.norm() == 0.) {
	double mu = Link::init(y);
	double ita = Link::link_func(mu);
	double G_prime = Link::mean_derivative(ita);
	double V = Family::variance(mu);
	double w = G_prime * G_prime / V;
	// dispersion_accum += (y - mu) * (y - mu) / V;
	loglik += Family::loglik(y, mu, dispersion);
	hessian += x * trans(x) * w; // X_trans_W_X
	grad -= x * w * ita; // X_trans_W_Y
	} else {
	double ita = trans(x) * beta;
	double mu = Link::mean_func(ita);
	double G_prime = Link::mean_derivative(ita);
	double V = Family::variance(mu);
	double w = G_prime * G_prime / V;
	dispersion_accum += (y - mu) * (y - mu) / V;
	loglik += Family::loglik(y, mu, dispersion);

	if (!std::isfinite(static_cast<double>(loglik))) {
	terminated = true;
	warning("Log-likelihood becomes negative infinite. Maybe the model is not proper for this data set.");
	return *this;
	}

	hessian += x * trans(x) * w; // X_trans_W_X
	grad -= x * (y - mu) * G_prime / V; // X_trans_W_Y
	}
	num_rows ++;
	return *this;
	}

	// error case
	terminated = true;
	return *this;
	}

	/**
	* @brief Merge with another accumulation state
	*/
	template <class Container, class Family, class Link>
	template <class C, class F, class L>
	inline
	GLMAccumulator<Container,Family,Link>&
	GLMAccumulator<Container,Family,Link>::operator<<(
	const GLMAccumulator<C,F,L>& inOther) {
	if (this->empty()) {
	*this = inOther;
	} else if (inOther.empty()) {
	} else if (num_coef != inOther.num_coef) {
	warning("Inconsistent numbers of independent variables.");
	terminated = true;
	} else {
	num_rows += inOther.num_rows;
	loglik += inOther.loglik;
	grad += inOther.grad;
	hessian += inOther.hessian;
	dispersion_accum += inOther.dispersion_accum;
	}

	return *this;
	}

	/**
	* @brief Copy from a previous state
	*/
	template <class Container, class Family, class Link>
	template <class C, class F, class L>
	inline
	GLMAccumulator<Container,Family,Link>&
	GLMAccumulator<Container,Family,Link>::operator=(
	const GLMAccumulator<C,F,L>& inOther) {
	this->copy(inOther);
	return *this;
	}

	/**
	* @brief Apply the accumulated intra-state values to inter-state member
	*/
	template <class Container, class Family, class Link>
	inline
	void
	GLMAccumulator<Container,Family,Link>::apply() {
	if (!dbal::eigen_integration::isfinite(hessian) \|\|
	!dbal::eigen_integration::isfinite(grad)) {
	warning("Hessian or gradient is not finite.");
	terminated = true;
	} else {
	SymmetricPositiveDefiniteEigenDecomposition<Matrix> decomposition(
	hessian, EigenvaluesOnly, ComputePseudoInverse);

	if (beta.norm() == 0.) {
	dispersion = 1.;
	} else {
	dispersion = dispersion_accum / static_cast<double>(num_rows);
	}
	hessian = decomposition.pseudoInverse(); // become inverse after apply
	beta -= hessian * grad;
	}
	}

	// -----------------------------------------------------------------------

	template <class Container>
	GLMResult::GLMResult(const GLMAccumulator<Container>& state) {
	compute(state);
	}

	/**
	* @brief Transform an accumulation state into a result
	*/
	template <class Container>
	inline
	GLMResult&
	GLMResult::compute(const GLMAccumulator<Container>& state) {
	// Vector of coefficients: For efficiency reasons, we want to return this
	// by reference, so we need to bind to db memory
	Allocator& allocator = defaultAllocator();
	uint64_t n = state.num_rows;
	uint16_t p = state.num_coef;
	coef.rebind(allocator.allocateArray<double>(p));
	std_err.rebind(allocator.allocateArray<double>(p));
	z_stats.rebind(allocator.allocateArray<double>(p));
	p_values.rebind(allocator.allocateArray<double>(p));

	loglik = state.loglik;
	coef = state.beta;
	dispersion = state.dispersion *
	static_cast<double>(n) / static_cast<double>(n - p);
	std_err = state.vcov.diagonal().cwiseSqrt();
	z_stats = coef.cwiseQuotient(std_err);
	for (Index i = 0; i < p; i ++) {
	p_values(i) = 2. * prob::cdf(prob::normal(), -std::abs(z_stats(i)));
	}
	num_rows_processed = n;

	return *this;
	}

	} // namespace glm

	} // namespace modules

	} // namespace madlib

	#endif // defined(MADLIB_MODULES_GLM_GLM_IMPL_HPP)