blob: ceb85f58672f472d0000b04559a6f12e2a78d7c9 [file] [log] [blame]
/* ----------------------------------------------------------------------- *//**
*
* @file glm.cpp
*
* @brief Generalized linear model functions
*
*//* ----------------------------------------------------------------------- */
#include <dbconnector/dbconnector.hpp>
#include <modules/prob/student.hpp>
#include "GLM_proto.hpp"
#include "GLM_impl.hpp"
#include "glm.hpp"
#include <cmath>
namespace madlib {
namespace modules {
namespace glm {
// types of states
typedef GLMAccumulator<RootContainer> GLMState;
typedef GLMAccumulator<MutableRootContainer> MutableGLMState;
#define DEFINE_GLM_TRANSITION(_state_type) \
_state_type state = args[0].getAs<MutableByteString>(); \
if (state.terminated || args[1].isNull() || args[2].isNull()) { \
return args[0]; \
} \
double y = args[1].getAs<double>(); \
MappedColumnVector x; \
try { \
MappedColumnVector xx = args[2].getAs<MappedColumnVector>(); \
x.rebind(xx.memoryHandle(), xx.size()); \
} catch (const ArrayWithNullException &e) { \
return args[0]; \
} \
\
if (state.empty()) { \
state.num_coef = static_cast<uint16_t>(x.size()); \
state.resize(); \
if (!args[3].isNull()) { \
GLMState prev_state = args[3].getAs<ByteString>(); \
state = prev_state; \
state.reset(); \
} \
} \
\
state << MutableGLMState::tuple_type(x, y); \
return state.storage()
// ------------------------------------------------------------------------
typedef GLMAccumulator<MutableRootContainer,Poisson,Log>
MutableGLMPoissonLogState;
AnyType
glm_poisson_log_transition::run(AnyType& args) {
DEFINE_GLM_TRANSITION(MutableGLMPoissonLogState);
}
// ------------------------------------------------------------------------
typedef GLMAccumulator<MutableRootContainer,Poisson,Identity>
MutableGLMPoissonIdentityState;
AnyType
glm_poisson_identity_transition::run(AnyType& args) {
DEFINE_GLM_TRANSITION(MutableGLMPoissonIdentityState);
}
// ------------------------------------------------------------------------
typedef GLMAccumulator<MutableRootContainer,Poisson,Sqrt>
MutableGLMPoissonSqrtState;
AnyType
glm_poisson_sqrt_transition::run(AnyType& args) {
DEFINE_GLM_TRANSITION(MutableGLMPoissonSqrtState);
}
// ------------------------------------------------------------
typedef GLMAccumulator<MutableRootContainer,Gaussian,Log>
MutableGLMGaussianLogState;
AnyType
glm_gaussian_log_transition::run(AnyType& args) {
DEFINE_GLM_TRANSITION(MutableGLMGaussianLogState);
}
// ------------------------------------------------------------------------
typedef GLMAccumulator<MutableRootContainer,Gaussian,Identity>
MutableGLMGaussianIdentityState;
AnyType
glm_gaussian_identity_transition::run(AnyType& args) {
DEFINE_GLM_TRANSITION(MutableGLMGaussianIdentityState);
}
// ------------------------------------------------------------------------
typedef GLMAccumulator<MutableRootContainer,Gaussian,Inverse>
MutableGLMGaussianInverseState;
AnyType
glm_gaussian_inverse_transition::run(AnyType& args) {
DEFINE_GLM_TRANSITION(MutableGLMGaussianInverseState);
}
// ------------------------------------------------------------------------
typedef GLMAccumulator<MutableRootContainer,Gamma,Log>
MutableGLMGammaLogState;
AnyType
glm_gamma_log_transition::run(AnyType& args) {
DEFINE_GLM_TRANSITION(MutableGLMGammaLogState);
}
// ------------------------------------------------------------------------
typedef GLMAccumulator<MutableRootContainer,Gamma,Identity>
MutableGLMGammaIdentityState;
AnyType
glm_gamma_identity_transition::run(AnyType& args) {
DEFINE_GLM_TRANSITION(MutableGLMGammaIdentityState);
}
// ------------------------------------------------------------------------
typedef GLMAccumulator<MutableRootContainer,Gamma,Inverse>
MutableGLMGammaInverseState;
AnyType
glm_gamma_inverse_transition::run(AnyType& args) {
DEFINE_GLM_TRANSITION(MutableGLMGammaInverseState);
}
// ------------------------------------------------------------
typedef GLMAccumulator<MutableRootContainer, Binomial, Probit>
MutableGLMBinomialProbitState;
AnyType
glm_binomial_probit_transition::run(AnyType& args) {
DEFINE_GLM_TRANSITION(MutableGLMBinomialProbitState);
}
// ------------------------------------------------------------
typedef GLMAccumulator<MutableRootContainer, Binomial, Logit>
MutableGLMBinomialLogitState;
AnyType
glm_binomial_logit_transition::run(AnyType& args) {
DEFINE_GLM_TRANSITION(MutableGLMBinomialLogitState);
}
// ------------------------------------------------------------------------
typedef GLMAccumulator<MutableRootContainer,InverseGaussian,Log>
MutableGLMInverseGaussianLogState;
AnyType
glm_inverse_gaussian_log_transition::run(AnyType& args) {
DEFINE_GLM_TRANSITION(MutableGLMInverseGaussianLogState);
}
// ------------------------------------------------------------------------
typedef GLMAccumulator<MutableRootContainer,InverseGaussian,Identity>
MutableGLMInverseGaussianIdentityState;
AnyType
glm_inverse_gaussian_identity_transition::run(AnyType& args) {
DEFINE_GLM_TRANSITION(MutableGLMInverseGaussianIdentityState);
}
// ------------------------------------------------------------------------
typedef GLMAccumulator<MutableRootContainer,InverseGaussian,Inverse>
MutableGLMInverseGaussianInverseState;
AnyType
glm_inverse_gaussian_inverse_transition::run(AnyType& args) {
DEFINE_GLM_TRANSITION(MutableGLMInverseGaussianInverseState);
}
// ------------------------------------------------------------------------
typedef GLMAccumulator<MutableRootContainer,InverseGaussian,SqrInverse>
MutableGLMInverseGaussianSqrInverseState;
AnyType
glm_inverse_gaussian_sqr_inverse_transition::run(AnyType& args) {
DEFINE_GLM_TRANSITION(MutableGLMInverseGaussianSqrInverseState);
}
// ------------------------------------------------------------------------
AnyType
glm_merge_states::run(AnyType& args) {
MutableGLMState stateLeft = args[0].getAs<MutableByteString>();
GLMState stateRight = args[1].getAs<ByteString>();
stateLeft << stateRight;
return stateLeft.storage();
}
AnyType
glm_final::run(AnyType& args) {
MutableGLMState state = args[0].getAs<MutableByteString>();
// If we haven't seen any valid data, just return Null. This is the standard
// behavior of aggregate function on empty data sets (compare, e.g.,
// how PostgreSQL handles sum or avg on empty inputs)
if (state.empty() || state.terminated) { return Null(); }
state.apply();
return state.storage();
}
// ------------------------------------------------------------------------
AnyType
glm_result_z_stats::run(AnyType& args) {
if (args[0].isNull()) { return Null(); }
GLMState state = args[0].getAs<ByteString>();
GLMResult result(state);
AnyType tuple;
tuple << result.coef
<< result.loglik
<< result.std_err
<< result.z_stats
<< result.p_values
<< result.num_rows_processed
<< 1.; // when z-stats is calculated, dispersion parameter is always 1
return tuple;
}
// ------------------------------------------------------------------------
AnyType
glm_result_t_stats::run(AnyType& args) {
if (args[0].isNull()) { return Null(); }
GLMState state = args[0].getAs<ByteString>();
GLMResult result(state);
result.std_err = result.std_err * sqrt(result.dispersion);
result.z_stats = result.coef.cwiseQuotient(result.std_err);
uint64_t n = state.num_rows;
uint16_t p = state.num_coef;
for (Index i = 0; i < p; i++) {
result.p_values(i) = 2. * prob::cdf(
boost::math::complement(
prob::students_t(static_cast<double>(n - p)),
std::fabs(result.z_stats(i))
));
}
AnyType tuple;
tuple << result.coef
<< result.loglik
<< result.std_err
<< result.z_stats
<< result.p_values
<< result.num_rows_processed
<< result.dispersion;
return tuple;
}
// ------------------------------------------------------------------------
AnyType
glm_loglik_diff::run(AnyType& args) {
if (args[0].isNull() || args[1].isNull()) {
return std::numeric_limits<double>::infinity();
} else {
double a = GLMState(args[0].getAs<ByteString>()).loglik;
double b = GLMState(args[1].getAs<ByteString>()).loglik;
if (a >= 0. || b >= 0.) { return 0.; } // probability = 1
return std::abs(a - b) / std::min(std::abs(a), std::abs(b));
}
}
// ------------------------------------------------------------------------
AnyType glm_predict::run(AnyType &args) {
try {
args[0].getAs<MappedColumnVector>();
} catch (const ArrayWithNullException &e) {
throw std::runtime_error(
"GLM error: the coefficients contain NULL values");
}
// returns NULL if args[1] (features) contains NULL values
try {
args[1].getAs<MappedColumnVector>();
} catch (const ArrayWithNullException &e) {
return Null();
}
MappedColumnVector vec1 = args[0].getAs<MappedColumnVector>();
MappedColumnVector vec2 = args[1].getAs<MappedColumnVector>();
std::string link = args[2].getAs<std::string>();
if (vec1.size() != vec2.size())
throw std::runtime_error(
"Coefficients and independent variables are of incompatible length");
double dot = vec1.dot(vec2);
if (link.compare("identity")==0) return(dot);
if (link.compare("inverse")==0) return(1/dot);
if (link.compare("log")==0) return(exp(dot));
if (link.compare("sqrt")==0) return(dot*dot);
if (link.compare("sqr_inverse")==0) return(1/sqrt(dot));
if (link.compare("probit")==0) return(prob::cdf(prob::normal(), dot));
if (link.compare("logit")==0) return (1./(1 + exp(-dot)));
throw std::runtime_error("Invalid link function!");
}
AnyType glm_predict_binomial::run(AnyType &args) {
try {
args[0].getAs<MappedColumnVector>();
} catch (const ArrayWithNullException &e) {
throw std::runtime_error(
"GLM error: the coefficients contain NULL values");
}
// returns NULL if args[1] (features) contains NULL values
try {
args[1].getAs<MappedColumnVector>();
} catch (const ArrayWithNullException &e) {
return Null();
}
MappedColumnVector vec1 = args[0].getAs<MappedColumnVector>();
MappedColumnVector vec2 = args[1].getAs<MappedColumnVector>();
std::string link = args[2].getAs<std::string>();
if (vec1.size() != vec2.size())
throw std::runtime_error(
"Coefficients and independent variables are of incompatible length");
double dot = vec1.dot(vec2);
if (link.compare("probit")==0) return((prob::cdf(prob::normal(), dot)) >= 0.5);
if (link.compare("logit")==0) return ((1./(1 + exp(-dot))) >= 0.5);
throw std::runtime_error("Invalid link function!");
}
// ------------------------------------------------------------------------
AnyType glm_predict_poisson::run(AnyType &args) {
try {
args[0].getAs<MappedColumnVector>();
} catch (const ArrayWithNullException &e) {
throw std::runtime_error(
"GLM error: the coefficients contain NULL values");
}
// returns NULL if args[1] (features) contains NULL values
try {
args[1].getAs<MappedColumnVector>();
} catch (const ArrayWithNullException &e) {
return Null();
}
MappedColumnVector vec1 = args[0].getAs<MappedColumnVector>();
MappedColumnVector vec2 = args[1].getAs<MappedColumnVector>();
std::string link = args[2].getAs<std::string>();
if (vec1.size() != vec2.size())
throw std::runtime_error(
"Coefficients and independent variables are of incompatible length");
double dot = vec1.dot(vec2);
if (link.compare("identity")==0) return(round(dot));
if (link.compare("log")==0) return(round(exp(dot)));
if (link.compare("sqrt")==0) return(round(dot*dot));
throw std::runtime_error("Invalid link function!");
}
} // namespace glm
} // namespace modules
} // namespace madlib