blob: 6c5f41fa0677f21159c33a672c0523a4b68de6b9 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
/* ------------------------------------------------------
*
* @file logistic.cpp
*
* @brief Logistic-Regression functions
*
* We implement the conjugate-gradient method and the iteratively-reweighted-
* least-squares method.
*
*//* ----------------------------------------------------------------------- */
#include <limits>
#include <dbconnector/dbconnector.hpp>
#include <modules/shared/HandleTraits.hpp>
#include <modules/prob/boost.hpp>
#include <boost/math/distributions.hpp>
#include <modules/prob/student.hpp>
#include "simple_logistic.hpp"
namespace madlib {
// Use Eigen
using namespace dbal::eigen_integration;
namespace modules {
// Import names from other MADlib modules
using dbal::NoSolutionFoundException;
namespace hello_world {
// FIXME this enum should be accessed by all modules that may need grouping
// valid status values
enum { IN_PROCESS, COMPLETED, TERMINATED, NULL_EMPTY };
// Internal functions
AnyType stateToResult(const Allocator &inAllocator,
const HandleMap<const ColumnVector, TransparentHandle<double> >& inCoef,
const Matrix & hessian,
const double &logLikelihood,
int status,
const uint64_t &numRows);
// ---------------------------------------------------------------------------
// Logistic Regression States
// ---------------------------------------------------------------------------
/**
* @brief Inter- and intra-iteration state for conjugate-gradient method for
* logistic regression
*
* TransitionState encapsualtes the transition state during the
* logistic-regression aggregate function. To the database, the state is
* exposed as a single DOUBLE PRECISION array, to the C++ code it is a proper
* object containing scalars and vectors.
*
* Note: We assume that the DOUBLE PRECISION array is initialized by the
* database with length at least 5, and all elemenets are 0.
*
*/
template <class Handle>
class LogRegrSPTransitionState {
template <class OtherHandle>
friend class LogRegrSPTransitionState;
public:
LogRegrSPTransitionState(const AnyType &inArray)
: mStorage(inArray.getAs<Handle>()) {
rebind(static_cast<uint16_t>(mStorage[1]));
}
/**
* @brief Convert to backend representation
*
* We define this function so that we can use State in the
* argument list and as a return type.
*/
inline operator AnyType() const {
return mStorage;
}
/**
* @brief Initialize the conjugate-gradient state.
*
* This function is only called for the first iteration, for the first row.
*/
inline void initialize(const Allocator &inAllocator, uint16_t inWidthOfX) {
mStorage = inAllocator.allocateArray<double, dbal::AggregateContext,
dbal::DoZero, dbal::ThrowBadAlloc>(arraySize(inWidthOfX));
rebind(inWidthOfX);
widthOfX = inWidthOfX;
}
/**
* @brief We need to support assigning the previous state
*/
template <class OtherHandle>
LogRegrSPTransitionState &operator=(
const LogRegrSPTransitionState<OtherHandle> &inOtherState) {
for (size_t i = 0; i < mStorage.size(); i++)
mStorage[i] = inOtherState.mStorage[i];
return *this;
}
/**
* @brief Merge with another State object by copying the intra-iteration
* fields
*/
template <class OtherHandle>
LogRegrSPTransitionState &operator+=(
const LogRegrSPTransitionState<OtherHandle> &inOtherState) {
if (mStorage.size() != inOtherState.mStorage.size() ||
widthOfX != inOtherState.widthOfX)
throw std::logic_error("Internal error: Incompatible transition "
"states");
numRows += inOtherState.numRows;
gradNew += inOtherState.gradNew;
X_transp_AX += inOtherState.X_transp_AX;
logLikelihood += inOtherState.logLikelihood;
// merged state should have the higher status
// (see top of file for more on 'status' )
status = (inOtherState.status > status) ? inOtherState.status : status;
return *this;
}
/**
* @brief Reset the inter-iteration fields.
*/
inline void reset() {
numRows = 0;
X_transp_AX.fill(0);
gradNew.fill(0);
logLikelihood = 0;
status = IN_PROCESS;
}
private:
static inline size_t arraySize(const uint16_t inWidthOfX) {
return 6 + inWidthOfX * inWidthOfX + 4 * inWidthOfX;
}
/**
* @brief Rebind to a new storage array
*
* @param inWidthOfX The number of independent variables.
*
* Array layout (iteration refers to one aggregate-function call):
* Inter-iteration components (updated in final function):
* - 0: iteration (current iteration)
* - 1: widthOfX (number of coefficients)
* - 2: coef (vector of coefficients)
* - 2 + widthOfX: dir (direction)
* - 2 + 2 * widthOfX: grad (gradient)
* - 2 + 3 * widthOfX: beta (scale factor)
*
* Intra-iteration components (updated in transition step):
* - 3 + 3 * widthOfX: numRows (number of rows already processed in this iteration)
* - 4 + 3 * widthOfX: gradNew (intermediate value for gradient)
* - 4 + 4 * widthOfX: X_transp_AX (X^T A X)
* - 4 + widthOfX * widthOfX + 4 * widthOfX: logLikelihood ( ln(l(c)) )
*/
void rebind(uint16_t inWidthOfX) {
iteration.rebind(&mStorage[0]);
widthOfX.rebind(&mStorage[1]);
coef.rebind(&mStorage[2], inWidthOfX);
dir.rebind(&mStorage[2 + inWidthOfX], inWidthOfX);
grad.rebind(&mStorage[2 + 2 * inWidthOfX], inWidthOfX);
beta.rebind(&mStorage[2 + 3 * inWidthOfX]);
numRows.rebind(&mStorage[3 + 3 * inWidthOfX]);
gradNew.rebind(&mStorage[4 + 3 * inWidthOfX], inWidthOfX);
X_transp_AX.rebind(&mStorage[4 + 4 * inWidthOfX], inWidthOfX, inWidthOfX);
logLikelihood.rebind(&mStorage[4 + inWidthOfX * inWidthOfX + 4 * inWidthOfX]);
status.rebind(&mStorage[5 + inWidthOfX * inWidthOfX + 4 * inWidthOfX]);
}
Handle mStorage;
public:
typename HandleTraits<Handle>::ReferenceToUInt32 iteration;
typename HandleTraits<Handle>::ReferenceToUInt16 widthOfX;
typename HandleTraits<Handle>::ColumnVectorTransparentHandleMap coef;
typename HandleTraits<Handle>::ColumnVectorTransparentHandleMap dir;
typename HandleTraits<Handle>::ColumnVectorTransparentHandleMap grad;
typename HandleTraits<Handle>::ReferenceToDouble beta;
typename HandleTraits<Handle>::ReferenceToUInt64 numRows;
typename HandleTraits<Handle>::ColumnVectorTransparentHandleMap gradNew;
typename HandleTraits<Handle>::MatrixTransparentHandleMap X_transp_AX;
typename HandleTraits<Handle>::ReferenceToDouble logLikelihood;
typename HandleTraits<Handle>::ReferenceToUInt16 status;
};
/**
* @brief Logistic function
*/
inline double sigma(double x) {
return 1. / (1. + std::exp(-x));
}
/**
* @brief Perform the logistic-regression transition step
*/
AnyType
logregr_simple_step_transition::run(AnyType &args) {
LogRegrSPTransitionState<MutableArrayHandle<double> > state = args[0];
if (args[1].isNull() || args[2].isNull()) { return args[0]; }
double y = args[1].getAs<bool>() ? 1. : -1.;
MappedColumnVector x;
try {
// an exception is raised in the backend if args[2] contains nulls
MappedColumnVector xx = args[2].getAs<MappedColumnVector>();
// x is a const reference, we can only rebind to change its pointer
x.rebind(xx.memoryHandle(), xx.size());
} catch (const ArrayWithNullException &e) {
return args[0];
}
// The following check was added with MADLIB-138.
if (!dbal::eigen_integration::isfinite(x)) {
warning("Design matrix is not finite.");
state.status = TERMINATED;
return state;
}
if (state.numRows == 0) {
if (x.size() > std::numeric_limits<uint16_t>::max()){
warning("Number of independent variables cannot be larger than 65535.");
state.status = TERMINATED;
return state;
}
state.initialize(*this, static_cast<uint16_t>(x.size()));
if (!args[3].isNull()) {
LogRegrSPTransitionState<ArrayHandle<double> > previousState = args[3];
state = previousState;
state.reset();
}
}
// Now do the transition step
state.numRows++;
double xc = dot(x, state.coef);
state.gradNew.noalias() += sigma(-y * xc) * y * trans(x);
// Note: sigma(-x) = 1 - sigma(x).
double a = sigma(xc) * sigma(-xc);
state.X_transp_AX += x * trans(x) * a;
state.logLikelihood -= std::log( 1. + std::exp(-y * xc) );
return state;
}
/**
* @brief Perform the perliminary aggregation function: Merge transition states
*/
AnyType
logregr_simple_step_merge_states::run(AnyType &args) {
LogRegrSPTransitionState<MutableArrayHandle<double> > stateLeft = args[0];
LogRegrSPTransitionState<ArrayHandle<double> > stateRight = args[1];
// We first handle the trivial case where this function is called with one
// of the states being the initial state
if (stateLeft.numRows == 0)
return stateRight;
else if (stateRight.numRows == 0)
return stateLeft;
// Merge states together and return
stateLeft += stateRight;
return stateLeft;
}
/**
* @brief Perform the logistic-regression final step
*/
AnyType
logregr_simple_step_final::run(AnyType &args) {
// We request a mutable object. Depending on the backend, this might perform
// a deep copy.
LogRegrSPTransitionState<MutableArrayHandle<double> > state = args[0];
// Aggregates that haven't seen any data just return Null.
if (state.numRows == 0){
state.status = NULL_EMPTY;
return state;
}
// Note: k = state.iteration
if (state.iteration == 0) {
// Iteration computes the gradient
state.dir = state.gradNew;
state.grad = state.gradNew;
} else {
// We use the Hestenes-Stiefel update formula:
//
// g_k^T (g_k - g_{k-1})
// beta_k = -------------------------
// d_{k-1}^T (g_k - g_{k-1})
ColumnVector gradNewMinusGrad = state.gradNew - state.grad;
state.beta
= dot(state.gradNew, gradNewMinusGrad)
/ dot(state.dir, gradNewMinusGrad);
// Do a direction restart (Powell restart)
// Note: This is testing whether state.beta < 0 if state.beta were
// assigned according to Polak-Ribière
if (dot(state.gradNew, gradNewMinusGrad)
/ dot(state.grad, state.grad) <= std::numeric_limits<double>::denorm_min()) state.beta = 0;
// d_k = g_k - beta_k * d_{k-1}
state.dir = state.gradNew - state.beta * state.dir;
state.grad = state.gradNew;
}
// H_k = - X^T A_k X
// where A_k = diag(a_1, ..., a_n) and a_i = sigma(x_i c_{k-1}) sigma(-x_i c_{k-1})
//
// g_k^T d_k
// alpha_k = -------------
// d_k^T H_k d_k
//
// c_k = c_{k-1} - alpha_k * d_k
state.coef += dot(state.grad, state.dir) /
as_scalar(trans(state.dir) * state.X_transp_AX * state.dir)
* state.dir;
if(!state.coef.is_finite()){
// we don't throw an error - instead set the state status to allow
// other states (possibly trained as part of group by) to continue
// training
warning("Over- or underflow in conjugate-gradient step, while updating "
"coefficients. Input data is likely of poor numerical condition.");
state.status = TERMINATED;
return state;
}
state.iteration++;
return state;
}
/**
* @brief Return the difference in log-likelihood between two states
*/
AnyType
internal_logregr_simple_step_distance::run(AnyType &args) {
LogRegrSPTransitionState<ArrayHandle<double> > stateLeft = args[0];
LogRegrSPTransitionState<ArrayHandle<double> > stateRight = args[1];
if(stateLeft.status == NULL_EMPTY || stateRight.status == NULL_EMPTY){
return 0.0;
}
return std::abs(stateLeft.logLikelihood - stateRight.logLikelihood);
}
/**
* @brief Return the coefficients and diagnostic statistics of the state
*/
AnyType
internal_logregr_simple_result::run(AnyType &args) {
LogRegrSPTransitionState<ArrayHandle<double> > state = args[0];
if (state.status == NULL_EMPTY)
return Null();
SymmetricPositiveDefiniteEigenDecomposition<Matrix> decomposition(
state.X_transp_AX, EigenvaluesOnly, ComputePseudoInverse);
return stateToResult(*this, state.coef,
state.X_transp_AX, state.logLikelihood,
state.status, state.numRows);
}
/**
* @brief Compute the diagnostic statistics
*
* This function wraps the common parts of computing the results for both the
* CG and the IRLS method.
*/
AnyType stateToResult(
const Allocator &inAllocator,
const HandleMap<const ColumnVector, TransparentHandle<double> > &inCoef,
const Matrix & hessian,
const double &logLikelihood,
int status,
const uint64_t &numRows) {
SymmetricPositiveDefiniteEigenDecomposition<Matrix> decomposition(
hessian, EigenvaluesOnly, ComputePseudoInverse);
const Matrix &inverse_of_X_transp_AX = decomposition.pseudoInverse();
const ColumnVector &diagonal_of_X_transp_AX = inverse_of_X_transp_AX.diagonal();
MutableNativeColumnVector stdErr(
inAllocator.allocateArray<double>(inCoef.size()));
MutableNativeColumnVector waldZStats(
inAllocator.allocateArray<double>(inCoef.size()));
MutableNativeColumnVector waldPValues(
inAllocator.allocateArray<double>(inCoef.size()));
MutableNativeColumnVector oddsRatios(
inAllocator.allocateArray<double>(inCoef.size()));
for (Index i = 0; i < inCoef.size(); ++i) {
stdErr(i) = std::sqrt(diagonal_of_X_transp_AX(i));
waldZStats(i) = inCoef(i) / stdErr(i);
waldPValues(i) = 2. * prob::cdf( prob::normal(),
-std::abs(waldZStats(i)));
oddsRatios(i) = std::exp( inCoef(i) );
}
// Return all coefficients, standard errors, etc. in a tuple
AnyType tuple;
tuple << inCoef << logLikelihood << stdErr << waldZStats << waldPValues
<< oddsRatios << inverse_of_X_transp_AX
<< sqrt(decomposition.conditionNo()) << status << numRows;
return tuple;
}
} // namespace hello_world
} // namespace modules
} // namespace madlib