blob: 627a18bae3da1151c85a51d9943980db232313b6 [file] [log] [blame]
// ---------------------------------------------------------------------------
// Marginal Effects Multi-Logistic
// ---------------------------------------------------------------------------
/**
* @brief State for marginal effects calculation for logistic regression
*
* TransitionState encapsualtes the transition state during the
* marginal effects calculation for the logistic-regression aggregate function.
* To the database, the state is exposed as a single DOUBLE PRECISION array,
* to the C++ code it is a proper object containing scalars and vectors.
*
* Note: We assume that the DOUBLE PRECISION array is initialized by the
* database with length at least 5, and all elemenets are 0.
*
*/
#include <dbconnector/dbconnector.hpp>
#include <modules/shared/HandleTraits.hpp>
#include <modules/prob/boost.hpp>
#include <boost/math/distributions.hpp>
#include <modules/prob/student.hpp>
#include "mlogr_margins.hpp"
#include <vector>
#include <fstream>
namespace madlib {
// Use Eigen
using namespace dbal::eigen_integration;
namespace modules {
// Import names from other MADlib modules
using dbal::NoSolutionFoundException;
namespace regress {
template <class Handle>
class mlogregrMarginalTransitionState {
// By ยง14.5.3/9: "Friend declarations shall not declare partial
// specializations." We do access protected members in operator+=().
template <class OtherHandle>
friend class mlogregrMarginalTransitionState;
public:
mlogregrMarginalTransitionState(const AnyType &inArray)
: mStorage(inArray.getAs<Handle>()) {
rebind(static_cast<uint16_t>(mStorage[0]),static_cast<uint16_t>(mStorage[1]));
}
/**
* @brief Convert to backend representation
*
* We define this function so that we can use State in the
* argument list and as a return type.
*/
inline operator AnyType() const {
return mStorage;
}
/**
* @brief Initialize the iteratively-reweighted-least-squares state.
*
* This function is only called for the first iteration, for the first row.
*/
inline void initialize(const Allocator &inAllocator,
uint16_t inWidthOfX, uint16_t inNumCategories, uint16_t inRefCategory) {
mStorage = inAllocator.allocateArray<double, dbal::AggregateContext,
dbal::DoZero, dbal::ThrowBadAlloc>(arraySize(inWidthOfX, inNumCategories));
rebind(inWidthOfX, inNumCategories);
widthOfX = inWidthOfX;
numCategories = inNumCategories;
ref_category = inRefCategory;
}
/**
* @brief We need to support assigning the previous state
*/
template <class OtherHandle> mlogregrMarginalTransitionState &operator=(
const mlogregrMarginalTransitionState<OtherHandle> &inOtherState) {
for (size_t i = 0; i < mStorage.size(); i++)
mStorage[i] = inOtherState.mStorage[i];
return *this;
}
/**
* @brief Merge with another State object by copying the intra-iteration
* fields
*/
template <class OtherHandle> mlogregrMarginalTransitionState &operator+=(
const mlogregrMarginalTransitionState<OtherHandle> &inOtherState) {
if (mStorage.size() != inOtherState.mStorage.size() ||
widthOfX != inOtherState.widthOfX)
throw std::logic_error("Internal error: Incompatible transition "
"states");
numRows += inOtherState.numRows;
margins_matrix += inOtherState.margins_matrix;
X_bar += inOtherState.X_bar;
X_transp_AX += inOtherState.X_transp_AX;
reference_margins+= inOtherState.reference_margins;
delta += inOtherState.delta;
return *this;
}
/**
* @brief Reset the inter-iteration fields.
*/
inline void reset() {
numRows = 0;
margins_matrix.fill(0);
X_bar.fill(0);
X_transp_AX.fill(0);
reference_margins.fill(0);
delta.fill(0);
}
private:
static inline uint32_t arraySize(const uint16_t inWidthOfX,
const uint16_t inNumCategories) {
return 4 + 2*inWidthOfX * inNumCategories + 2*inWidthOfX +
2 * inNumCategories * inWidthOfX * inWidthOfX * inNumCategories;
}
/**
* @brief Rebind to a new storage array
*
* @param inWidthOfX The number of independent variables.
* @param inNumCategories The number of categories of the dependant var
* Array layout (iteration refers to one aggregate-function call):
* Inter-iteration components (updated in final function):
* - 0: widthOfX (number of independant variables)
* - 1: numCategories (number of categories)
* - 2: ref_category
* - 3: coef (vector of coefficients)
*
* Intra-iteration components (updated in transition step):
* - 3 + widthOfX*numCategories: numRows (number of rows already processed in this iteration)
* - 4 + widthOfX * inNumCategories: margins_matrix
*/
void rebind(uint16_t inWidthOfX = 0, uint16_t inNumCategories = 0) {
widthOfX.rebind(&mStorage[0]);
numCategories.rebind(&mStorage[1]);
ref_category.rebind(&mStorage[2]);
coef.rebind(&mStorage[3], inWidthOfX * inNumCategories);
numRows.rebind(&mStorage[3 + inWidthOfX * inNumCategories]);
margins_matrix.rebind(&mStorage[4 + inWidthOfX * inNumCategories],
inNumCategories , inWidthOfX );
X_bar.rebind(&mStorage[4 + 2*inWidthOfX * inNumCategories], inWidthOfX);
reference_margins.rebind(&mStorage[4 + inWidthOfX + 2*inWidthOfX*inNumCategories],
inWidthOfX);
X_transp_AX.rebind(&mStorage[4 + 2*inWidthOfX * inNumCategories + 2*inWidthOfX],
inNumCategories*inWidthOfX, inWidthOfX*inNumCategories);
delta.rebind(&mStorage[4 + 2*inWidthOfX * inNumCategories + 2*inWidthOfX +
inNumCategories * inWidthOfX * inWidthOfX * inNumCategories],
inNumCategories*inWidthOfX, inWidthOfX*inNumCategories);
}
Handle mStorage;
public:
typename HandleTraits<Handle>::ReferenceToUInt16 widthOfX;
typename HandleTraits<Handle>::ReferenceToUInt16 numCategories;
typename HandleTraits<Handle>::ColumnVectorTransparentHandleMap coef;
typename HandleTraits<Handle>::ReferenceToUInt64 numRows;
typename HandleTraits<Handle>::MatrixTransparentHandleMap margins_matrix;
typename HandleTraits<Handle>::MatrixTransparentHandleMap X_transp_AX;
typename HandleTraits<Handle>::ColumnVectorTransparentHandleMap reference_margins;
typename HandleTraits<Handle>::ReferenceToUInt16 ref_category;
typename HandleTraits<Handle>::ColumnVectorTransparentHandleMap X_bar;
typename HandleTraits<Handle>::MatrixTransparentHandleMap delta;
};
// ----------------------------------------------------------------------
AnyType
mlogregr_marginal_step_transition::run(AnyType &args) {
mlogregrMarginalTransitionState<MutableArrayHandle<double> > state = args[0];
if (args[1].isNull() || args[2].isNull() || args[3].isNull() ||
args[4].isNull() || args[5].isNull()) {
return args[0];
}
// Get x as a vector of double
MappedColumnVector x;
try{
// an exception is raised in the backend if args[2] contains nulls
MappedColumnVector xx = args[4].getAs<MappedColumnVector>();
// x is a const reference, we can only rebind to change its pointer
x.rebind(xx.memoryHandle(), xx.size());
} catch (const ArrayWithNullException &e) {
return args[0];
}
// Get the category & numCategories as integer
int16_t category = static_cast<int16_t>(args[1].getAs<int>());
// Number of categories after pivoting (We pivot around ref_category)
int16_t numCategories = static_cast<int16_t>(args[2].getAs<int>() - 1);
int32_t ref_category = args[3].getAs<int32_t>();
MappedMatrix coefMat = args[5].getAs<MappedMatrix>();
// The following check was added with MADLIB-138.
if (!x.is_finite())
throw std::domain_error("Design matrix is not finite.");
if (state.numRows == 0) {
if (x.size() > std::numeric_limits<uint16_t>::max())
throw std::domain_error("Number of independent variables cannot be "
"larger than 65535.");
if (numCategories < 1)
throw std::domain_error("Number of cateogires must be at least 2");
if (category > numCategories)
throw std::domain_error("You have entered a category > numCategories"
"Categories must be of values {0,1... numCategories-1}");
// Init the state (requires x.size() and category.size())
state.initialize(*this,
static_cast<uint16_t>(x.size()) ,
static_cast<uint16_t>(numCategories),
static_cast<uint16_t>(ref_category));
Matrix mat = coefMat;
mat.transposeInPlace();
mat.resize(coefMat.size(), 1);
state.coef = mat;
}
// Now do the transition step
state.numRows++;
/*
Get y: Convert to 1/0 boolean vector
Example: Category 4 : 0 0 0 1 0 0
Storing it in this forms helps us get a nice closed form expression
*/
//To pivot around the specified reference category
ColumnVector y(numCategories);
y.fill(0);
if (category > ref_category) {
y(category - 1) = 1;
} else if (category < ref_category) {
y(category) = 1;
}
// Marginal Effect calculations
// ----------------------------------------------------------------------
Matrix coef = state.coef;
coef.resize(numCategories, state.widthOfX);
// prob is vector of size # categories
/*
Note: The above 2 lines could have been written as:
ColumnVector prob = -coef*x; but this creates warnings.
See multilog for details.
*/
ColumnVector prob(numCategories);
prob = coef*x;
// Calculate the odds ratio
prob = prob.array().exp();
double prob_sum = prob.sum();
prob = prob / (1 + prob_sum);
Matrix probDiag = prob.asDiagonal();
Matrix a(numCategories,numCategories);
a = prob * prob.transpose() - probDiag;
// Start the Hessian calculations
Matrix X_transp_AX(numCategories * state.widthOfX, numCategories * state.widthOfX);
/*
Again: The following 3 lines could have been written as
Matrix XXTrans = x * x.transpose();
but it creates warnings related to the type of x. Here is an easy fix
*/
Matrix cv_x = x;
Matrix XXTrans = trans(cv_x);
XXTrans = cv_x * XXTrans;
//Eigen doesn't supported outer-products for matrices, so we have to do our own.
//This operation is also known as a tensor-product.
for (int i1 = 0; i1 < state.widthOfX; i1++){
for (int i2 = 0; i2 <state.widthOfX; i2++){
int rowOffset = numCategories * i1;
int colOffset = numCategories * i2;
X_transp_AX.block(rowOffset, colOffset, numCategories, numCategories) = XXTrans(i1, i2) * a;
}
}
triangularView<Lower>(state.X_transp_AX) += X_transp_AX;
int numIndepVars = static_cast<int>(state.coef.size() / state.numCategories);
// Marginal effects (reference calculated separately)
ColumnVector coef_trans_prob;
coef_trans_prob = coef.transpose() * prob;
Matrix margins_matrix = coef;
margins_matrix.rowwise() -= coef_trans_prob.transpose();
margins_matrix = prob.asDiagonal() * margins_matrix;
int idx, index, e_j_J, e_k_K, e_k_K_j_J;
for (int K=0; K < numIndepVars; K++){
for (int J=0; J < numCategories; J++){
idx = K*numCategories + J;
for (int k=0; k < numIndepVars; k++){
for (int j=0; j < numCategories; j++){
e_j_J = (j==J) ? 1: 0;
e_k_K = (k==K) ? 1: 0;
e_k_K_j_J = (j==J && k==K) ? 1: 0;
index = k*numCategories + j;
state.delta(index, idx) +=
x(K) * (e_j_J - prob(J)) * margins_matrix(j,k);
state.delta(index, idx) += prob(j) *
(e_k_K_j_J - prob(J) * e_k_K - x(K) * margins_matrix(J,k));
}
}
}
}
state.margins_matrix += margins_matrix;
return state;
}
// ----------------------------------------------------------------------
/**
* @brief Perform the perliminary aggregation function: Merge transition states
This merge step is the same as that of logistic regression.
*/
AnyType
mlogregr_marginal_step_merge_states::run(AnyType &args) {
mlogregrMarginalTransitionState<MutableArrayHandle<double> > stateLeft = args[0];
mlogregrMarginalTransitionState<ArrayHandle<double> > stateRight = args[1];
// We first handle the trivial case where this function is called with one
// of the states being the initial state
if (stateLeft.numRows == 0)
return stateRight;
else if (stateRight.numRows == 0)
return stateLeft;
// Merge states together and return
stateLeft += stateRight;
return stateLeft;
}
AnyType mlogregr_marginalstateToResult(
const Allocator &inAllocator,
const Index numRows,
const ColumnVector &inCoef,
const ColumnVector &inMargins,
const ColumnVector &inVariance
) {
MutableNativeColumnVector margins(
inAllocator.allocateArray<double>(inMargins.size()));
MutableNativeColumnVector coef(
inAllocator.allocateArray<double>(inMargins.size()));
MutableNativeColumnVector stdErr(
inAllocator.allocateArray<double>(inMargins.size()));
MutableNativeColumnVector tStats(
inAllocator.allocateArray<double>(inMargins.size()));
MutableNativeColumnVector pValues(
inAllocator.allocateArray<double>(inMargins.size()));
for (Index i = 0; i < inMargins.size(); ++i) {
margins(i) = inMargins(i);
coef(i) = inCoef(i);
stdErr(i) = std::sqrt(inVariance(i));
tStats(i) = margins(i) / stdErr(i);
// P-values only make sense if numRows > coef.size()
if (numRows > inCoef.size())
pValues(i) = 2. * prob::cdf( prob::normal(),
-std::abs(tStats(i)));
}
// Return all coefficients, standard errors, etc. in a tuple
// Note: PValues will return NULL if numRows <= coef.size
AnyType tuple;
tuple << margins
<< coef
<< stdErr
<< tStats
<< (numRows > inCoef.size()? pValues: Null());
return tuple;
}
// ----------------------------------------------------------------------
/**
* @brief Perform the logistic-regression final step
*/
AnyType
mlogregr_marginal_step_final::run(AnyType &args) {
// We request a mutable object. Depending on the backend, this might perform
// a deep copy.
mlogregrMarginalTransitionState<ArrayHandle<double> > state = args[0];
// Aggregates that haven't seen any data just return Null.
if (state.numRows == 0)
return Null();
if(!state.coef.is_finite())
throw NoSolutionFoundException("Over- or underflow in Newton step, "
"while updating coefficients. Input data is likely of poor "
"numerical condition.");
// See MADLIB-138. At least on certain platforms and with certain versions,
// LAPACK will run into an infinite loop if pinv() is called for non-finite
// matrices. We extend the check also to the dependent variables.
if (!state.X_transp_AX.is_finite())
throw NoSolutionFoundException("Over- or underflow in intermediate "
"calculation. Input data is likely of poor numerical condition.");
// Include marginal effects of reference variable:
// FIXME: They have been taken out of the output for now
//const int size = state.coef.size() + numIndepVars;
const size_t size = state.coef.size();
// Variance-covariance calculation
// ----------------------------------------------------------
SymmetricPositiveDefiniteEigenDecomposition<Matrix> decomposition(
-1 * state.X_transp_AX, EigenvaluesOnly, ComputePseudoInverse);
// Precompute -(X^T * A * X)^-1
Matrix V = decomposition.pseudoInverse();
int numIndepVars = static_cast<int>(state.coef.size() / state.numCategories);
int numCategories = state.numCategories;
Matrix coef = state.coef;
coef.resize(state.numCategories, state.widthOfX);
// Variance & Marginal gradient
ColumnVector marginal_gradient(size);
ColumnVector variance(size);
variance.setOnes();
variance = (state.delta * V * trans(state.delta) / static_cast<double>(state.numRows*state.numRows)).diagonal();
// Add in reference variables to all the calculations
// ----------------------------------------------------------
ColumnVector coef_with_ref(size);
ColumnVector margins_with_ref(size);
// Vectorize the margins_matrix and add the reference variable
for (int k=0; k < numIndepVars; k++){
for (int j=0; j < numCategories; j++){
int index = k * numCategories + j;
coef_with_ref(index) = coef(j,k);
margins_with_ref(index) = state.margins_matrix(j,k) / static_cast<double>(state.numRows);
}
}
return mlogregr_marginalstateToResult(*this,
state.numRows,
coef_with_ref,
margins_with_ref,
variance);
}
}
}
}