src/modules/glm/Ordinal_impl.hpp - madlib - Git at Google

 /* ----------------------------------------------------------------------- *//**
  *
  * @file Ordinal_impl.hpp
  *
  *//* ----------------------------------------------------------------------- */

 #ifndef MADLIB_MODULES_GLM_ORDINAL_IMPL_HPP
 #define MADLIB_MODULES_GLM_ORDINAL_IMPL_HPP

 #include <dbconnector/dbconnector.hpp>
 #include <boost/math/distributions.hpp>
 #include <limits>

 namespace madlib {

 namespace modules {

 namespace glm {

 // Use Eigen
 using namespace dbal::eigen_integration;

 template <class Container, class Family, class Link>
 inline
 OrdinalAccumulator<Container,Family,Link>::OrdinalAccumulator(
         Init_type& inInitialization)
   : Base(inInitialization), optimizer(inInitialization) {

     this->initialize();
 }


 /**
  * @brief Reset the accumulator before accumulating the first tuple
  */
 template <class Container, class Family, class Link>
 inline
 void
 OrdinalAccumulator<Container,Family,Link>::reset() {
     num_rows = 0;
     terminated = false;
     loglik = 0.;
     optimizer.reset();
 }

 /**
  * @brief Bind all elements of the state to the data in the stream
  *
  * The bind() is special in that even after running operator>>() on an element,
  * there is no guarantee yet that the element can indeed be accessed. It is
  * cruicial to first check this.
  *
  * Provided that this methods correctly lists all member variables, all other
  * methods can, however, rely on that fact that all variables are correctly
  * initialized and accessible.
  */
 template <class Container, class Family, class Link>
 inline
 void
 OrdinalAccumulator<Container,Family,Link>::bind(
         ByteStream_type& inStream) {

     inStream
         >> num_features
         >> num_categories
         >> num_rows
         >> terminated
         >> loglik
         >> optimizer;

     // bind vcov and optimizer.hessian onto the same memory as we don't need them
     // at the same time
     vcov.rebind(optimizer.hessian.memoryHandle(),
                 optimizer.hessian.rows(),
                 optimizer.hessian.cols());
 }

 /**
  * @brief Update the accumulation state by feeding a tuple
  */
 template <class Container, class Family, class Link>
 inline
 OrdinalAccumulator<Container,Family,Link>&
 OrdinalAccumulator<Container,Family,Link>::operator<<(
         const tuple_type& inTuple) {

     const MappedColumnVector& x = std::get<0>(inTuple);
     const int& y = static_cast<int>(std::get<1>(inTuple));
     // convert y to an indicator vector
     ColumnVector vecY(num_categories-1);
     vecY.fill(0);
     if (y != (num_categories-1)) { vecY(y) = 1; }

     // The following checks were introduced with MADLIB-138. It still seems
     // useful to have clear error messages in case of infinite input values.

     if (!dbal::eigen_integration::isfinite(x)) {
         warning("Design matrix is not finite.");
     } else if (x.size() > std::numeric_limits<uint16_t>::max()) {
         warning("Number of independent variables cannot be "
             "larger than 65535.");
     } else if (num_features != static_cast<uint16_t>(x.size())) {
         warning("Inconsistent numbers of independent variables.");
     } else {
         // normal case
         uint16_t N = num_features;
         uint16_t C = static_cast<uint16_t>(num_categories - 1);

         uint32_t state_size = 6 + (N + C)*(N + C) + 2 * (N + C);
         // GPDB limits the single array size to be 1GB, which means that the size
         // of a double array cannot be large than 134217727 because
         // (134217727 * 8) / (1024 * 1024) = 1023. And solve
         // state_size = x^2 + 2^x + 6 <= 134217727 will give x <= 11584.
         if(state_size > 134217727)
             throw std::domain_error(
                 "The sum of number of independent variables and number of "
                  "categories cannot be larger than 11584.");
         ColumnVector mu(C);
         ColumnVector ita(C);
         Matrix G_prime(C, C);
         Matrix V(C, C);

         if (optimizer.beta.norm() == 0.) {
             // beta is 0 in the first iteration
             Link::init(mu);
             Link::link_func(mu,ita);
         } else {
             //beta has non-zero value
             double Xtbeta = trans(x)*optimizer.beta.segment(C,N);
             for (int i=0; i<ita.size(); i++)
                 ita(i) = optimizer.beta(i) - Xtbeta;
             Link::mean_func(ita, mu);
        }

         Link::mean_derivative(ita, G_prime);
         Family::variance(mu, V);
         loglik += Family::loglik(vecY, mu, 1);
         Matrix Vinv(V.inverse());

         // calcualte X_trans_W_X
         Matrix GtVinvG(trans(G_prime)*Vinv*G_prime);
         ColumnVector h_vec(C);
         h_vec.fill(0);
         for (int i=0;i<C;i++)
             for (int j=0;j<C;j++)
                  h_vec(i) += G_prime(i,j);

         Matrix GtVinvhXt(trans(G_prime)*Vinv*h_vec*trans(x));
         GtVinvhXt = -1*GtVinvhXt;
         Matrix XhtVinvG(x*trans(h_vec)*Vinv*G_prime);
         XhtVinvG = -1*XhtVinvG;
         Matrix htVinvhXXt(x*trans(x));
         double temp  = trans(h_vec)*Vinv*h_vec;
         htVinvhXXt = temp * htVinvhXXt;

         for (int i=0;i<C;i++)
             for (int j=0;j<C;j++)
                 optimizer.hessian(i,j) += GtVinvG(i,j);
         for (int i=0;i<C;i++)
             for (int j=C;j<(C+N);j++)
                 optimizer.hessian(i,j) += GtVinvhXt(i,j-C);

         for (int i=C;i<(C+N);i++)
             for (int j=0;j<C;j++)
                 optimizer.hessian(i,j) += XhtVinvG(i-C,j);

         for (int i=C;i<(C+N);i++)
             for (int j=C;j<(C+N);j++)
                 optimizer.hessian(i,j) += htVinvhXXt(i-C,j-C);

         // calculate X_trans_W_Y
         ColumnVector YVinvG(trans(vecY - mu)*Vinv*G_prime);
         ColumnVector YVinvhX(x);
         temp = trans(vecY - mu)*Vinv*h_vec;
         YVinvhX = -1*temp * YVinvhX;

         optimizer.grad.segment(0, C) += YVinvG;
         optimizer.grad.segment(C, N) += YVinvhX;

         num_rows ++;
         return *this;
     } // all tests passed and we were in the else

     // error case
     terminated = true;
     return *this;
 }

 /**
  * @brief Merge with another accumulation state
  */
 template <class Container, class Family, class Link>
 template <class C, class F, class L>
 inline
 OrdinalAccumulator<Container,Family,Link>&
 OrdinalAccumulator<Container,Family,Link>::operator<<(
         const OrdinalAccumulator<C,F,L>& inOther) {
     if (this->empty()) {
         *this = inOther;
     } else if (inOther.empty()) {
     } else if (num_features != inOther.num_features) {
         warning("Inconsistent numbers of independent variables.");
         terminated = true;
     } else {
         num_rows += inOther.num_rows;
         loglik += inOther.loglik;
         optimizer.grad += inOther.optimizer.grad;
         optimizer.hessian += inOther.optimizer.hessian;
     }

     return *this;
 }

 /**
  * @brief Copy from a previous state
  */
 template <class Container, class Family, class Link>
 template <class C, class F, class L>
 inline
 OrdinalAccumulator<Container,Family,Link>&
 OrdinalAccumulator<Container,Family,Link>::operator=(
         const OrdinalAccumulator<C,F,L>& inOther) {
     this->copy(inOther);
     return *this;
 }

 /**
  * @brief Apply the accumulated intra-state values to inter-state member
  */
 template <class Container, class Family, class Link>
 inline
 void
 OrdinalAccumulator<Container,Family,Link>::apply() {
     if (!dbal::eigen_integration::isfinite(optimizer.hessian) ||
             !dbal::eigen_integration::isfinite(optimizer.grad)) {
         warning("Ordinal:Hessian or gradient is not finite. One possibility is that intercept is included in the independent variables. If that is the case, please drop the intercept and rerun the function.");
         terminated = true;
     } else {
                 vcov = optimizer.hessian.inverse();
                 optimizer.beta += vcov * optimizer.grad;
     }

 }
 // -----------------------------------------------------------------------

 template <class Container>
 OrdinalResult::OrdinalResult(const OrdinalAccumulator<Container>& state) {
     compute(state);
 }

 /**
  * @brief Transform an accumulation state into a result
  */
 template <class Container>
 inline
 OrdinalResult&
 OrdinalResult::compute(const OrdinalAccumulator<Container>& state) {
     // Vector of coefficients: For efficiency reasons, we want to return this
     // by reference, so we need to bind to db memory
     Allocator& allocator = defaultAllocator();
     Index N = static_cast<Index>(state.num_features);
     Index C = static_cast<Index>(state.num_categories - 1);

     coef_alpha.rebind(allocator.allocateArray<double>(C));
     std_err_alpha.rebind(allocator.allocateArray<double>(C));
     z_stats_alpha.rebind(allocator.allocateArray<double>(C));
     p_values_alpha.rebind(allocator.allocateArray<double>(C));

     coef_beta.rebind(allocator.allocateArray<double>(N));
     std_err_beta.rebind(allocator.allocateArray<double>(N));
     z_stats_beta.rebind(allocator.allocateArray<double>(N));
     p_values_beta.rebind(allocator.allocateArray<double>(N));

     // computation
     loglik = state.loglik;
     coef_alpha = state.optimizer.beta.segment(0,C);
     ColumnVector temp(state.vcov.diagonal().cwiseSqrt());
     std_err_alpha = temp.segment(0,C);
     z_stats_alpha = coef_alpha.cwiseQuotient(std_err_alpha);
     for (Index i = 0; i < C; i ++) {
         p_values_alpha(i) = 2. * prob::cdf( prob::normal(), -std::abs(z_stats_alpha(i)));
     }

     coef_beta = state.optimizer.beta.segment(C,N);
     std_err_beta = temp.segment(C,N);
     z_stats_beta = coef_beta.cwiseQuotient(std_err_beta);
     for (Index i = 0; i < N; i ++) {
         p_values_beta(i) = 2. * prob::cdf( prob::normal(), -std::abs(z_stats_beta(i)));
     }
     num_rows_processed = state.num_rows;

     return *this;
 }

 } // namespace glm

 } // namespace modules

 } // namespace madlib

 #endif // defined(MADLIB_MODULES_GLM_ORDINAL_IMPL_HPP)
	/* ----------------------------------------------------------------------- //*
	*
	* @file Ordinal_impl.hpp
	*
	// ----------------------------------------------------------------------- */

	#ifndef MADLIB_MODULES_GLM_ORDINAL_IMPL_HPP
	#define MADLIB_MODULES_GLM_ORDINAL_IMPL_HPP

	#include <dbconnector/dbconnector.hpp>
	#include <boost/math/distributions.hpp>
	#include <limits>

	namespace madlib {

	namespace modules {

	namespace glm {

	// Use Eigen
	using namespace dbal::eigen_integration;

	template <class Container, class Family, class Link>
	inline
	OrdinalAccumulator<Container,Family,Link>::OrdinalAccumulator(
	Init_type& inInitialization)
	: Base(inInitialization), optimizer(inInitialization) {

	this->initialize();
	}


	/**
	* @brief Reset the accumulator before accumulating the first tuple
	*/
	template <class Container, class Family, class Link>
	inline
	void
	OrdinalAccumulator<Container,Family,Link>::reset() {
	num_rows = 0;
	terminated = false;
	loglik = 0.;
	optimizer.reset();
	}

	/**
	* @brief Bind all elements of the state to the data in the stream
	*
	* The bind() is special in that even after running operator>>() on an element,
	* there is no guarantee yet that the element can indeed be accessed. It is
	* cruicial to first check this.
	*
	* Provided that this methods correctly lists all member variables, all other
	* methods can, however, rely on that fact that all variables are correctly
	* initialized and accessible.
	*/
	template <class Container, class Family, class Link>
	inline
	void
	OrdinalAccumulator<Container,Family,Link>::bind(
	ByteStream_type& inStream) {

	inStream
	>> num_features
	>> num_categories
	>> num_rows
	>> terminated
	>> loglik
	>> optimizer;

	// bind vcov and optimizer.hessian onto the same memory as we don't need them
	// at the same time
	vcov.rebind(optimizer.hessian.memoryHandle(),
	optimizer.hessian.rows(),
	optimizer.hessian.cols());
	}

	/**
	* @brief Update the accumulation state by feeding a tuple
	*/
	template <class Container, class Family, class Link>
	inline
	OrdinalAccumulator<Container,Family,Link>&
	OrdinalAccumulator<Container,Family,Link>::operator<<(
	const tuple_type& inTuple) {

	const MappedColumnVector& x = std::get<0>(inTuple);
	const int& y = static_cast<int>(std::get<1>(inTuple));
	// convert y to an indicator vector
	ColumnVector vecY(num_categories-1);
	vecY.fill(0);
	if (y != (num_categories-1)) { vecY(y) = 1; }

	// The following checks were introduced with MADLIB-138. It still seems
	// useful to have clear error messages in case of infinite input values.

	if (!dbal::eigen_integration::isfinite(x)) {
	warning("Design matrix is not finite.");
	} else if (x.size() > std::numeric_limits<uint16_t>::max()) {
	warning("Number of independent variables cannot be "
	"larger than 65535.");
	} else if (num_features != static_cast<uint16_t>(x.size())) {
	warning("Inconsistent numbers of independent variables.");
	} else {
	// normal case
	uint16_t N = num_features;
	uint16_t C = static_cast<uint16_t>(num_categories - 1);

	uint32_t state_size = 6 + (N + C)(N + C) + 2 (N + C);
	// GPDB limits the single array size to be 1GB, which means that the size
	// of a double array cannot be large than 134217727 because
	// (134217727 * 8) / (1024 * 1024) = 1023. And solve
	// state_size = x^2 + 2^x + 6 <= 134217727 will give x <= 11584.
	if(state_size > 134217727)
	throw std::domain_error(
	"The sum of number of independent variables and number of "
	"categories cannot be larger than 11584.");
	ColumnVector mu(C);
	ColumnVector ita(C);
	Matrix G_prime(C, C);
	Matrix V(C, C);

	if (optimizer.beta.norm() == 0.) {
	// beta is 0 in the first iteration
	Link::init(mu);
	Link::link_func(mu,ita);
	} else {
	//beta has non-zero value
	double Xtbeta = trans(x)*optimizer.beta.segment(C,N);
	for (int i=0; i<ita.size(); i++)
	ita(i) = optimizer.beta(i) - Xtbeta;
	Link::mean_func(ita, mu);
	}

	Link::mean_derivative(ita, G_prime);
	Family::variance(mu, V);
	loglik += Family::loglik(vecY, mu, 1);
	Matrix Vinv(V.inverse());

	// calcualte X_trans_W_X
	Matrix GtVinvG(trans(G_prime)VinvG_prime);
	ColumnVector h_vec(C);
	h_vec.fill(0);
	for (int i=0;i<C;i++)
	for (int j=0;j<C;j++)
	h_vec(i) += G_prime(i,j);

	Matrix GtVinvhXt(trans(G_prime)Vinvh_vec*trans(x));
	GtVinvhXt = -1*GtVinvhXt;
	Matrix XhtVinvG(xtrans(h_vec)Vinv*G_prime);
	XhtVinvG = -1*XhtVinvG;
	Matrix htVinvhXXt(x*trans(x));
	double temp = trans(h_vec)Vinvh_vec;
	htVinvhXXt = temp * htVinvhXXt;

	for (int i=0;i<C;i++)
	for (int j=0;j<C;j++)
	optimizer.hessian(i,j) += GtVinvG(i,j);
	for (int i=0;i<C;i++)
	for (int j=C;j<(C+N);j++)
	optimizer.hessian(i,j) += GtVinvhXt(i,j-C);

	for (int i=C;i<(C+N);i++)
	for (int j=0;j<C;j++)
	optimizer.hessian(i,j) += XhtVinvG(i-C,j);

	for (int i=C;i<(C+N);i++)
	for (int j=C;j<(C+N);j++)
	optimizer.hessian(i,j) += htVinvhXXt(i-C,j-C);

	// calculate X_trans_W_Y
	ColumnVector YVinvG(trans(vecY - mu)VinvG_prime);
	ColumnVector YVinvhX(x);
	temp = trans(vecY - mu)Vinvh_vec;
	YVinvhX = -1temp YVinvhX;

	optimizer.grad.segment(0, C) += YVinvG;
	optimizer.grad.segment(C, N) += YVinvhX;

	num_rows ++;
	return *this;
	} // all tests passed and we were in the else

	// error case
	terminated = true;
	return *this;
	}

	/**
	* @brief Merge with another accumulation state
	*/
	template <class Container, class Family, class Link>
	template <class C, class F, class L>
	inline
	OrdinalAccumulator<Container,Family,Link>&
	OrdinalAccumulator<Container,Family,Link>::operator<<(
	const OrdinalAccumulator<C,F,L>& inOther) {
	if (this->empty()) {
	*this = inOther;
	} else if (inOther.empty()) {
	} else if (num_features != inOther.num_features) {
	warning("Inconsistent numbers of independent variables.");
	terminated = true;
	} else {
	num_rows += inOther.num_rows;
	loglik += inOther.loglik;
	optimizer.grad += inOther.optimizer.grad;
	optimizer.hessian += inOther.optimizer.hessian;
	}

	return *this;
	}

	/**
	* @brief Copy from a previous state
	*/
	template <class Container, class Family, class Link>
	template <class C, class F, class L>
	inline
	OrdinalAccumulator<Container,Family,Link>&
	OrdinalAccumulator<Container,Family,Link>::operator=(
	const OrdinalAccumulator<C,F,L>& inOther) {
	this->copy(inOther);
	return *this;
	}

	/**
	* @brief Apply the accumulated intra-state values to inter-state member
	*/
	template <class Container, class Family, class Link>
	inline
	void
	OrdinalAccumulator<Container,Family,Link>::apply() {
	if (!dbal::eigen_integration::isfinite(optimizer.hessian) \|\|
	!dbal::eigen_integration::isfinite(optimizer.grad)) {
	warning("Ordinal:Hessian or gradient is not finite. One possibility is that intercept is included in the independent variables. If that is the case, please drop the intercept and rerun the function.");
	terminated = true;
	} else {
	vcov = optimizer.hessian.inverse();
	optimizer.beta += vcov * optimizer.grad;
	}

	}
	// -----------------------------------------------------------------------

	template <class Container>
	OrdinalResult::OrdinalResult(const OrdinalAccumulator<Container>& state) {
	compute(state);
	}

	/**
	* @brief Transform an accumulation state into a result
	*/
	template <class Container>
	inline
	OrdinalResult&
	OrdinalResult::compute(const OrdinalAccumulator<Container>& state) {
	// Vector of coefficients: For efficiency reasons, we want to return this
	// by reference, so we need to bind to db memory
	Allocator& allocator = defaultAllocator();
	Index N = static_cast<Index>(state.num_features);
	Index C = static_cast<Index>(state.num_categories - 1);

	coef_alpha.rebind(allocator.allocateArray<double>(C));
	std_err_alpha.rebind(allocator.allocateArray<double>(C));
	z_stats_alpha.rebind(allocator.allocateArray<double>(C));
	p_values_alpha.rebind(allocator.allocateArray<double>(C));

	coef_beta.rebind(allocator.allocateArray<double>(N));
	std_err_beta.rebind(allocator.allocateArray<double>(N));
	z_stats_beta.rebind(allocator.allocateArray<double>(N));
	p_values_beta.rebind(allocator.allocateArray<double>(N));

	// computation
	loglik = state.loglik;
	coef_alpha = state.optimizer.beta.segment(0,C);
	ColumnVector temp(state.vcov.diagonal().cwiseSqrt());
	std_err_alpha = temp.segment(0,C);
	z_stats_alpha = coef_alpha.cwiseQuotient(std_err_alpha);
	for (Index i = 0; i < C; i ++) {
	p_values_alpha(i) = 2. * prob::cdf( prob::normal(), -std::abs(z_stats_alpha(i)));
	}

	coef_beta = state.optimizer.beta.segment(C,N);
	std_err_beta = temp.segment(C,N);
	z_stats_beta = coef_beta.cwiseQuotient(std_err_beta);
	for (Index i = 0; i < N; i ++) {
	p_values_beta(i) = 2. * prob::cdf( prob::normal(), -std::abs(z_stats_beta(i)));
	}
	num_rows_processed = state.num_rows;

	return *this;
	}

	} // namespace glm

	} // namespace modules

	} // namespace madlib

	#endif // defined(MADLIB_MODULES_GLM_ORDINAL_IMPL_HPP)