| /* ----------------------------------------------------------------------- *//** |
| * |
| * @file multilogistic.cpp |
| * |
| * @brief Multinomial Logistic-Regression functions |
| * |
| * We implement the iteratively-reweighted-least-squares method. |
| * |
| *//* ----------------------------------------------------------------------- */ |
| |
| #include <dbconnector/dbconnector.hpp> |
| |
| #include <modules/shared/HandleTraits.hpp> |
| #include <modules/prob/boost.hpp> |
| #include <boost/math/distributions.hpp> |
| #include <modules/prob/student.hpp> |
| #include "multilogistic.hpp" |
| |
| #include <vector> |
| #include <fstream> |
| |
| namespace madlib { |
| |
| // Use Eigen |
| using namespace dbal::eigen_integration; |
| |
| namespace modules { |
| |
| // Import names from other MADlib modules |
| using dbal::NoSolutionFoundException; |
| |
| namespace regress { |
| |
| /** |
| * @brief Logistic function |
| */ |
| inline double sigma(double x) { |
| return 1. / (1. + std::exp(-x)); |
| } |
| |
| |
| /** |
| * @brief Inter- and intra-iteration state for iteratively-reweighted-least- |
| * squares method for logistic regression |
| * |
| * TransitionState encapsualtes the transition state during the |
| * logistic-regression aggregate function. To the database, the state is |
| * exposed as a single DOUBLE PRECISION array, to the C++ code it is a proper |
| * object containing scalars, a vector, and a matrix. |
| * |
| * Note: We assume that the DOUBLE PRECISION array is initialized by the |
| * database with length at least 4, and all elemenets are 0. |
| */ |
| template <class Handle> |
| class MLogRegrIRLSTransitionState { |
| |
| // By ยง14.5.3/9: "Friend declarations shall not declare partial |
| // specializations." We do access protected members in operator+=(). |
| template <class OtherHandle> |
| friend class MLogRegrIRLSTransitionState; |
| |
| public: |
| |
| |
| MLogRegrIRLSTransitionState(const AnyType &inArray) |
| : mStorage(inArray.getAs<Handle>()) { |
| rebind(static_cast<uint16_t>(mStorage[0]),static_cast<uint16_t>(mStorage[1])); |
| } |
| |
| /** |
| * @brief Convert to backend representation |
| * |
| * We define this function so that we can use State in the |
| * argument list and as a return type. |
| */ |
| inline operator AnyType() const { |
| return mStorage; |
| } |
| |
| /** |
| * @brief Initialize the iteratively-reweighted-least-squares state. |
| * |
| * This function is only called for the first iteration, for the first row. |
| */ |
| inline void initialize( |
| const Allocator &inAllocator, |
| uint16_t inWidthOfX, |
| uint16_t inNumCategories, uint16_t inRefCategory) { |
| |
| size_t state_size = arraySize(inWidthOfX, inNumCategories); |
| // GPDB limits the single array size to be 1GB, which means that the size |
| // of a double array cannot be large than 134217727 because |
| // (134217727 * 8) / (1024 * 1024) = 1023. And solve |
| // state_size = x^2 + 2^x + 6 <= 134217727 will give x <= 11584. |
| if(state_size > 134217727) |
| throw std::domain_error( |
| "The product of number of independent variables and number of " |
| "categories cannot be larger than 11584."); |
| |
| mStorage = inAllocator.allocateArray<double, dbal::AggregateContext, |
| dbal::DoZero, dbal::ThrowBadAlloc>(state_size); |
| rebind(inWidthOfX, inNumCategories); |
| widthOfX = inWidthOfX; |
| numCategories = inNumCategories; |
| ref_category = inRefCategory; |
| } |
| |
| /** |
| * @brief We need to support assigning the previous state |
| */ |
| template <class OtherHandle> MLogRegrIRLSTransitionState &operator=( |
| const MLogRegrIRLSTransitionState<OtherHandle> &inOtherState) { |
| |
| for (size_t i = 0; i < mStorage.size(); i++) |
| mStorage[i] = inOtherState.mStorage[i]; |
| return *this; |
| } |
| |
| /** |
| * @brief Merge with another State object by copying the intra-iteration |
| * fields |
| */ |
| template <class OtherHandle> MLogRegrIRLSTransitionState &operator+=( |
| const MLogRegrIRLSTransitionState<OtherHandle> &inOtherState) { |
| if (mStorage.size() != inOtherState.mStorage.size() || |
| widthOfX != inOtherState.widthOfX) |
| throw std::logic_error("Internal error: Incompatible transition " |
| "states"); |
| |
| numRows += inOtherState.numRows; |
| gradient += inOtherState.gradient; |
| X_transp_AX += inOtherState.X_transp_AX; |
| logLikelihood += inOtherState.logLikelihood; |
| return *this; |
| } |
| |
| /** |
| * @brief Reset the inter-iteration fields. |
| */ |
| inline void reset() { |
| numRows = 0; |
| gradient.fill(0); |
| X_transp_AX.fill(0); |
| logLikelihood = 0; |
| } |
| |
| private: |
| static inline uint32_t arraySize(const uint16_t inWidthOfX, |
| const uint16_t inNumCategories) { |
| return 6 + inWidthOfX * inWidthOfX * inNumCategories * inNumCategories |
| + 2 * inWidthOfX * inNumCategories; |
| } |
| |
| void rebind(uint16_t inWidthOfX = 0, uint16_t inNumCategories = 0) { |
| widthOfX.rebind(&mStorage[0]); |
| numCategories.rebind(&mStorage[1]); |
| conditionNo.rebind(&mStorage[2]); |
| |
| coef.rebind(&mStorage[3], inWidthOfX*inNumCategories); |
| |
| numRows.rebind(&mStorage[3 + inWidthOfX*inNumCategories]); |
| |
| gradient.rebind(&mStorage[4 + inWidthOfX*inNumCategories],inWidthOfX*inNumCategories); |
| X_transp_AX.rebind(&mStorage[4 + 2 * inWidthOfX*inNumCategories], |
| inNumCategories*inWidthOfX, inWidthOfX*inNumCategories); |
| logLikelihood.rebind(&mStorage[4 + |
| inNumCategories*inNumCategories*inWidthOfX*inWidthOfX |
| + 2 * inWidthOfX*inNumCategories]); |
| ref_category.rebind(&mStorage[5 + |
| inNumCategories*inNumCategories*inWidthOfX*inWidthOfX |
| + 2 * inWidthOfX*inNumCategories]); |
| } |
| |
| Handle mStorage; |
| |
| public: |
| typename HandleTraits<Handle>::ReferenceToUInt16 widthOfX; |
| typename HandleTraits<Handle>::ReferenceToUInt16 numCategories; |
| typename HandleTraits<Handle>::ReferenceToDouble conditionNo; |
| |
| typename HandleTraits<Handle>::ColumnVectorTransparentHandleMap coef; |
| typename HandleTraits<Handle>::ReferenceToUInt64 numRows; |
| typename HandleTraits<Handle>::ColumnVectorTransparentHandleMap gradient; |
| typename HandleTraits<Handle>::MatrixTransparentHandleMap X_transp_AX; |
| typename HandleTraits<Handle>::ReferenceToDouble logLikelihood; |
| typename HandleTraits<Handle>::ReferenceToUInt16 ref_category; |
| }; |
| |
| |
| /** |
| * @brief Inter- and intra-iteration state for robust variance calculations |
| * |
| * TransitionState encapsualtes the transition state during the |
| * logistic-regression robust variance calculation. To the database, the state is |
| * exposed as a single DOUBLE PRECISION array, to the C++ code it is a proper |
| * object containing scalars, a vector, and a matrix. |
| * |
| * Note: We assume that the DOUBLE PRECISION array is initialized by the |
| * database with length at least 4, and all elemenets are 0. |
| */ |
| template <class Handle> |
| class MLogRegrRobustTransitionState { |
| |
| // By ยง14.5.3/9: "Friend declarations shall not declare partial |
| // specializations." We do access protected members in operator+=(). |
| template <class OtherHandle> |
| friend class MLogRegrRobustTransitionState; |
| |
| public: |
| MLogRegrRobustTransitionState(const AnyType &inArray) |
| : mStorage(inArray.getAs<Handle>()) { |
| rebind(static_cast<uint16_t>(mStorage[0]),static_cast<uint16_t>(mStorage[1])); |
| } |
| |
| /** |
| * @brief Convert to backend representation |
| * |
| * We define this function so that we can use State in the |
| * argument list and as a return type. |
| */ |
| inline operator AnyType() const { |
| return mStorage; |
| } |
| |
| /** |
| * @brief Initialize the iteratively-reweighted-least-squares state. |
| * |
| * This function is only called for the first iteration, for the first row. |
| */ |
| inline void initialize(const Allocator &inAllocator, |
| uint16_t inWidthOfX, uint16_t inNumCategories, uint16_t inRefCategory) { |
| mStorage = inAllocator.allocateArray<double, dbal::AggregateContext, |
| dbal::DoZero, dbal::ThrowBadAlloc>(arraySize(inWidthOfX, inNumCategories)); |
| rebind(inWidthOfX, inNumCategories); |
| widthOfX = inWidthOfX; |
| numCategories = inNumCategories; |
| ref_category = inRefCategory; |
| } |
| |
| /** |
| * @brief We need to support assigning the previous state |
| */ |
| template <class OtherHandle> MLogRegrRobustTransitionState &operator=( |
| const MLogRegrRobustTransitionState<OtherHandle> &inOtherState) { |
| for (size_t i = 0; i < mStorage.size(); i++) |
| mStorage[i] = inOtherState.mStorage[i]; |
| return *this; |
| } |
| |
| /** |
| * @brief Merge with another State object by copying the intra-iteration |
| * fields |
| */ |
| template <class OtherHandle> MLogRegrRobustTransitionState &operator+=( |
| const MLogRegrRobustTransitionState<OtherHandle> &inOtherState) { |
| if (mStorage.size() != inOtherState.mStorage.size() || |
| widthOfX != inOtherState.widthOfX) |
| throw std::logic_error("Internal error: Incompatible transition " |
| "states"); |
| |
| numRows += inOtherState.numRows; |
| X_transp_AX += inOtherState.X_transp_AX; |
| meat += inOtherState.meat; |
| return *this; |
| } |
| |
| /** |
| * @brief Reset the inter-iteration fields. |
| */ |
| inline void reset() { |
| numRows = 0; |
| meat.fill(0); |
| X_transp_AX.fill(0); |
| } |
| |
| private: |
| static inline uint32_t arraySize(const uint16_t inWidthOfX, |
| const uint16_t inNumCategories) { |
| return 4 + 2*inWidthOfX * inWidthOfX * inNumCategories * inNumCategories |
| + inWidthOfX * inNumCategories; |
| } |
| |
| /** |
| * @brief Rebind to a new storage array |
| * |
| * @param inWidthOfX The number of independent variables. |
| * @param inNumCategories The number of categories of the dependant var |
| |
| |
| * Array layout (iteration refers to one aggregate-function call): |
| * Inter-iteration components (updated in final function): |
| * - 0: widthOfX (number of independant variables) |
| * - 1: numCategories (number of categories) |
| * - 2: ref_category |
| * - 3: coef (vector of coefficients) |
| * |
| * Intra-iteration components (updated in transition step): |
| * - 3 + widthOfX*numCategories: numRows (number of rows already processed in this iteration) |
| * - 4 + widthOfX * inNumCategories: X_transp_AX (X^T A X). |
| * - 4 + widthOfX^2*numCategories^2: meat (The meat matrix) |
| */ |
| void rebind(uint16_t inWidthOfX = 0, uint16_t inNumCategories = 0) { |
| widthOfX.rebind(&mStorage[0]); |
| numCategories.rebind(&mStorage[1]); |
| ref_category.rebind(&mStorage[2]); |
| coef.rebind(&mStorage[3], inWidthOfX * inNumCategories); |
| numRows.rebind(&mStorage[3 + inWidthOfX * inNumCategories]); |
| X_transp_AX.rebind(&mStorage[4 + inWidthOfX * inNumCategories], |
| inNumCategories * inWidthOfX, inWidthOfX * inNumCategories); |
| meat.rebind(&mStorage[4 + |
| inNumCategories * inNumCategories * inWidthOfX * inWidthOfX |
| + inWidthOfX * inNumCategories], inWidthOfX * inNumCategories, inWidthOfX * inNumCategories); |
| } |
| |
| Handle mStorage; |
| |
| public: |
| typename HandleTraits<Handle>::ReferenceToUInt16 widthOfX; |
| typename HandleTraits<Handle>::ReferenceToUInt16 numCategories; |
| typename HandleTraits<Handle>::ReferenceToUInt16 ref_category; |
| |
| typename HandleTraits<Handle>::ColumnVectorTransparentHandleMap coef; |
| typename HandleTraits<Handle>::ReferenceToUInt64 numRows; |
| typename HandleTraits<Handle>::MatrixTransparentHandleMap X_transp_AX; |
| typename HandleTraits<Handle>::MatrixTransparentHandleMap meat; |
| }; |
| |
| /** |
| * @brief IRLS Transition |
| * @param args |
| * |
| * Arguments (Matched with PSQL wrapped) |
| * - 0: Current State |
| * - 1: y value (Integer) |
| * - 2: numCategories (Integer) |
| * - 3: ref_category (Integer) |
| * - 4: X value (Column Vector) |
| * - 5: Previous State |
| |
| */ |
| AnyType |
| __mlogregr_irls_step_transition::run(AnyType &args) { |
| MLogRegrIRLSTransitionState<MutableArrayHandle<double> > state = args[0]; |
| |
| if (args[1].isNull() || args[2].isNull() || args[3].isNull() || |
| args[4].isNull()) { |
| return args[0]; |
| } |
| |
| // Get x as a vector of double |
| MappedColumnVector x; |
| try{ |
| // an exception is raised in the backend if args[2] contains nulls |
| MappedColumnVector xx = args[4].getAs<MappedColumnVector>(); |
| // x is a const reference, we can only rebind to change its pointer |
| x.rebind(xx.memoryHandle(), xx.size()); |
| } catch (const ArrayWithNullException &e) { |
| return args[0]; |
| } |
| |
| // Get the category & numCategories as integer |
| int32_t category = args[1].getAs<int32_t>(); |
| // Number of categories after pivoting (we pivot around the first category) |
| int32_t numCategories = (args[2].getAs<int32_t>() - 1); |
| int32_t ref_category = args[3].getAs<int32_t>(); |
| |
| // The following check was added with MADLIB-138. |
| if (!x.is_finite()) |
| throw std::domain_error("Design matrix is not finite."); |
| |
| if (state.numRows == 0) { |
| if (x.size() > std::numeric_limits<uint16_t>::max()) |
| throw std::domain_error("Number of independent variables cannot be " |
| "larger than 65535."); |
| |
| if (numCategories < 1) |
| throw std::domain_error("Number of cateogires must be at least 2"); |
| |
| // Init the state (requires x.size() and category.size()) |
| state.initialize(*this, |
| static_cast<uint16_t>(x.size()) , |
| static_cast<uint16_t>(numCategories), |
| static_cast<uint16_t>(ref_category)); |
| |
| if (!args[5].isNull()) { |
| MLogRegrIRLSTransitionState<ArrayHandle<double> > |
| previousState = args[5]; |
| state = previousState; |
| state.reset(); |
| } |
| } |
| |
| |
| /* |
| * This check should be done for each iteration. Only checking the first |
| * run is not enough. |
| */ |
| if (category > numCategories || category < 0) |
| throw std::domain_error("Invalid category. Categories must be integer " |
| "values between 0 and (number of categories - 1)."); |
| |
| if (ref_category > numCategories || ref_category < 0) |
| throw std::domain_error("Invalid reference category. Reference category must be integer " |
| "value between 0 and (number of categories - 1)."); |
| |
| // Now do the transition step |
| state.numRows++; |
| /* Get y: Convert to 1/0 boolean vector |
| Example: Category 4 : 0 0 0 1 0 0 |
| Storing it in this forms helps us get a nice closed form expression |
| */ |
| |
| //To pivot around the specified reference category |
| ColumnVector y(numCategories); |
| y.fill(0); |
| if (category > ref_category) { |
| y(category - 1) = 1; |
| } else if (category < ref_category) { |
| y(category) = 1; |
| } |
| |
| /* |
| Compute the parameter vector (the 'pi' vector in the documentation) |
| for the data point being processed. |
| Casting the coefficients into a matrix makes the calculation simple. |
| */ |
| Matrix coef = state.coef; |
| coef.resize(numCategories, state.widthOfX); |
| |
| //Store the intermediate calculations because we'll reuse them in the LLH |
| ColumnVector t1 = x; //t1 is vector of size state.widthOfX |
| t1 = coef*x; |
| /* Note: The above 2 lines could have been written as: |
| ColumnVector t1 = -coef*x; |
| |
| but this creates warnings. These warnings are somehow related to the factor |
| that x is of an immutable type. The following alternative could resolve |
| the warnings (although not necessary the best alternative): |
| */ |
| |
| ColumnVector t2 = t1.array().exp(); |
| double t3 = 1 + t2.sum(); |
| ColumnVector pi = t2/t3; |
| |
| //The gradient matrix has numCategories rows and widthOfX columns |
| Matrix grad = -y*x.transpose() + pi*x.transpose(); |
| //We cast the gradient into a vector to make the Newton step calculations much easier. |
| grad.resize(numCategories*state.widthOfX,1); |
| |
| /* |
| a is a matrix of size JxJ where J is the number of categories |
| a_j1j2 = -pi(j1)*(1-pi(j2))if j1 == j2 |
| a_j1j2 = pi(j1)*pi(j2) if j1 != j2 |
| */ |
| Matrix a(numCategories,numCategories); |
| // Compute the 'a' matrix. |
| Matrix piDiag = pi.asDiagonal(); |
| a = pi * pi.transpose() - piDiag; |
| state.gradient.noalias() += grad; |
| |
| //Start the Hessian calculations |
| Matrix X_transp_AX(numCategories * state.widthOfX, numCategories * state.widthOfX); |
| |
| /* |
| Again: The following 3 lines could have been written as |
| Matrix XXTrans = x * x.transpose(); |
| but it creates warnings related to the type of x. Here is an easy fix |
| */ |
| Matrix cv_x = x; |
| Matrix XXTrans = trans(cv_x); |
| XXTrans = cv_x * XXTrans; |
| |
| //Eigen doesn't supported outer-products for matrices, so we have to do our own. |
| //This operation is also known as a tensor-product. |
| for (int i1 = 0; i1 < state.widthOfX; i1++){ |
| for (int i2 = 0; i2 <state.widthOfX; i2++){ |
| int rowOffset = numCategories * i1; |
| int colOffset = numCategories * i2; |
| |
| X_transp_AX.block(rowOffset, colOffset, numCategories, numCategories) = XXTrans(i1,i2)*a; |
| } |
| } |
| |
| triangularView<Lower>(state.X_transp_AX) += X_transp_AX; |
| |
| state.logLikelihood += y.transpose()*t1 - log(t3); |
| |
| return state; |
| |
| } |
| |
| /** |
| * @brief Perform the perliminary aggregation function: Merge transition states |
| This merge step is the same as that of logistic regression. |
| */ |
| AnyType |
| __mlogregr_irls_step_merge_states::run(AnyType &args) { |
| MLogRegrIRLSTransitionState<MutableArrayHandle<double> > stateLeft = args[0]; |
| MLogRegrIRLSTransitionState<ArrayHandle<double> > stateRight = args[1]; |
| |
| // We first handle the trivial case where this function is called with one |
| // of the states being the initial state |
| if (stateLeft.numRows == 0) |
| return stateRight; |
| else if (stateRight.numRows == 0) |
| return stateLeft; |
| |
| // Merge states together and return |
| stateLeft += stateRight; |
| return stateLeft; |
| } |
| |
| /** |
| * @brief Perform the logistic-regression final step |
| */ |
| AnyType |
| __mlogregr_irls_step_final::run(AnyType &args) { |
| // We request a mutable object. |
| // Depending on the backend, this might perform a deep copy. |
| MLogRegrIRLSTransitionState<MutableArrayHandle<double> > state = args[0]; |
| |
| // Aggregates that haven't seen any data just return Null. |
| if (state.numRows == 0) |
| return Null(); |
| |
| // See MADLIB-138. At least on certain platforms and with certain versions, |
| // LAPACK will run into an infinite loop if pinv() is called for non-finite |
| // matrices. We extend the check also to the dependent variables. |
| if (!state.X_transp_AX.is_finite() || !state.gradient.is_finite()) |
| throw NoSolutionFoundException("Over- or underflow in intermediate " |
| "calculation. Input data is likely of poor numerical condition."); |
| |
| SymmetricPositiveDefiniteEigenDecomposition<Matrix> decomposition( |
| -1 * state.X_transp_AX, EigenvaluesOnly, ComputePseudoInverse); |
| |
| // Precompute (X^T * A * X)^-1 |
| Matrix hessianInv = -1 * decomposition.pseudoInverse(); |
| |
| state.coef.noalias() += hessianInv * state.gradient; |
| |
| if(!state.coef.is_finite()) |
| throw NoSolutionFoundException("Over- or underflow in Newton step, " |
| "while updating coefficients. Input data is likely of poor " |
| "numerical condition."); |
| |
| // We use the intra-iteration field gradient for storing the diagonal |
| // of X^T A X, so that we don't have to recompute it in the result function. |
| // Likewise, we store the condition number. |
| // FIXME: This feels a bit like a hack. |
| state.conditionNo = decomposition.conditionNo(); |
| state.X_transp_AX = -1 * hessianInv; |
| |
| return state; |
| } |
| |
| /** |
| * @brief Compute the diagnostic statistics |
| * |
| * This function wraps the common parts of mlogregr state into the result. |
| * (the result data type is defined in multilogistic.sql_in) |
| */ |
| AnyType mLogstateToResult( |
| const Allocator &inAllocator, |
| MLogRegrIRLSTransitionState<ArrayHandle<double> > state) { |
| |
| int ref_category = state.ref_category; |
| const HandleMap<const ColumnVector, TransparentHandle<double> > &inCoef = state.coef; |
| double logLikelihood = state.logLikelihood; |
| uint64_t num_processed = state.numRows; |
| |
| // Per the hack at the end of the final function we place the inverse |
| // of the X_tranp_AX into the state.X_transp_AX |
| const Matrix & X_transp_AX_inverse = state.X_transp_AX; |
| const ColumnVector &diagonal_of_hessian = X_transp_AX_inverse.diagonal(); |
| |
| MutableNativeColumnVector stdErr( |
| inAllocator.allocateArray<double>(inCoef.size())); |
| MutableNativeColumnVector waldZStats( |
| inAllocator.allocateArray<double>(inCoef.size())); |
| MutableNativeColumnVector waldPValues( |
| inAllocator.allocateArray<double>(inCoef.size())); |
| MutableNativeColumnVector oddsRatios( |
| inAllocator.allocateArray<double>(inCoef.size())); |
| |
| for (Index i = 0; i < inCoef.size(); ++i) { |
| stdErr(i) = std::sqrt(diagonal_of_hessian(i)); |
| waldZStats(i) = inCoef(i) / stdErr(i); |
| waldPValues(i) = 2. * prob::cdf( prob::normal(), |
| -std::abs(waldZStats(i))); |
| oddsRatios(i) = std::exp( inCoef(i) ); |
| } |
| int num_iterations = 0; |
| // Return all coefficients, standard errors, etc. in a tuple |
| AnyType tuple; |
| tuple << ref_category << inCoef << logLikelihood << stdErr |
| << waldZStats << waldPValues << oddsRatios |
| << static_cast<double>(state.conditionNo) << num_iterations << num_processed; |
| return tuple; |
| } |
| |
| |
| /** |
| * @brief Return the difference in log-likelihood between two states |
| */ |
| AnyType |
| __internal_mlogregr_irls_step_distance::run(AnyType &args) { |
| MLogRegrIRLSTransitionState<ArrayHandle<double> > stateLeft = args[0]; |
| MLogRegrIRLSTransitionState<ArrayHandle<double> > stateRight = args[1]; |
| |
| return std::abs(stateLeft.logLikelihood - stateRight.logLikelihood); |
| } |
| |
| /** |
| * @brief Return the coefficients and diagnostic statistics of the state |
| */ |
| AnyType |
| __internal_mlogregr_irls_result::run(AnyType &args) { |
| MLogRegrIRLSTransitionState<ArrayHandle<double> > state = args[0]; |
| |
| return mLogstateToResult(*this, state); |
| // state.ref_category, state.coef, |
| // state.gradient, state.logLikelihood, state.X_transp_AX(0,0)); |
| } |
| |
| /** |
| * @brief Return the coefficients and diagnostic statistics of the state |
| */ |
| AnyType |
| __internal_mlogregr_summary_results::run(AnyType &args) { |
| MLogRegrIRLSTransitionState<ArrayHandle<double> > state = args[0]; |
| const Matrix & X_transp_AX_inverse = state.X_transp_AX; |
| // X_transp_AX is actually it's inverse - this is a hack added at the end of |
| // the final function |
| AnyType tuple; |
| Matrix coef = state.coef; |
| coef.resize(state.numCategories, state.widthOfX); |
| coef.transposeInPlace(); |
| tuple << coef << X_transp_AX_inverse; |
| return tuple; |
| } |
| // ----------------- End of Multinomial Logistic Regression -------------------- |
| |
| // --------------------------------------------------------------------------- |
| // Robust Variance Multi-Logistic |
| // --------------------------------------------------------------------------- |
| |
| AnyType |
| mlogregr_robust_step_transition::run(AnyType &args) { |
| using std::endl; |
| |
| MLogRegrRobustTransitionState<MutableArrayHandle<double> > state = args[0]; |
| |
| if (args[1].isNull() || args[2].isNull() || args[3].isNull() || |
| args[4].isNull() || args[5].isNull()) { |
| return args[0]; |
| } |
| |
| // Get x as a vector of double |
| MappedColumnVector x; |
| try{ |
| // an exception is raised in the backend if args[2] contains nulls |
| MappedColumnVector xx = args[4].getAs<MappedColumnVector>(); |
| // x is a const reference, we can only rebind to change its pointer |
| x.rebind(xx.memoryHandle(), xx.size()); |
| } catch (const ArrayWithNullException &e) { |
| return args[0]; |
| } |
| |
| // Get the category & numCategories as integer |
| int16_t category = static_cast<int16_t>(args[1].getAs<int>()); |
| // Number of categories after pivoting (We pivot around the first category) |
| int16_t numCategories = static_cast<int16_t>(args[2].getAs<int>() - 1); |
| int32_t ref_category = args[3].getAs<int32_t>(); |
| MappedMatrix coefMat = args[5].getAs<MappedMatrix>(); |
| |
| // The following check was added with MADLIB-138. |
| if (!x.is_finite()) |
| throw std::domain_error("Design matrix is not finite."); |
| |
| if (state.numRows == 0) { |
| if (x.size() > std::numeric_limits<uint16_t>::max()) |
| throw std::domain_error("Number of independent variables cannot be " |
| "larger than 65535."); |
| if (numCategories < 1) |
| throw std::domain_error("Number of cateogires must be at least 2"); |
| if (category > numCategories) |
| throw std::domain_error("You have entered a category > numCategories" |
| "Categories must be of values {0,1... numCategories-1}"); |
| |
| // Init the state (requires x.size() and category.size()) |
| state.initialize(*this, |
| static_cast<uint16_t>(x.size()) , |
| static_cast<uint16_t>(numCategories), |
| static_cast<uint16_t>(ref_category)); |
| |
| Matrix mat = coefMat; |
| mat.transposeInPlace(); |
| mat.resize(coefMat.size(), 1); |
| state.coef = mat; |
| } |
| |
| // Now do the transition step |
| state.numRows++; |
| /* |
| Get y: Convert to 1/0 boolean vector |
| Example: Category 4 : 0 0 0 1 0 0 |
| Storing it in this forms helps us get a nice closed form expression |
| */ |
| |
| //To pivot around the specified reference category |
| ColumnVector y(numCategories); |
| y.fill(0); |
| if (category > ref_category) { |
| y(category - 1) = 1; |
| } else if (category < ref_category) { |
| y(category) = 1; |
| } |
| |
| /* |
| Compute the parameter vector (the 'pi' vector in the documentation) |
| for the data point being processed. |
| Casting the coefficients into a matrix makes the calculation simple. |
| */ |
| |
| Matrix coef = state.coef; |
| coef.resize(numCategories, state.widthOfX); |
| |
| //Store the intermediate calculations because we'll reuse them in the LLH |
| ColumnVector t1 = x; //t1 is vector of size state.widthOfX |
| t1 = coef*x; |
| /* |
| Note: The above 2 lines could have been written as: |
| ColumnVector t1 = -coef*x; |
| |
| but this creates warnings. These warnings are somehow related to the factor |
| that x is an immutable type. |
| */ |
| |
| ColumnVector t2 = t1.array().exp(); |
| double t3 = 1 + t2.sum(); |
| ColumnVector pi = t2/t3; |
| |
| //The gradient matrix has numCategories rows and widthOfX columns |
| Matrix grad = -y * x.transpose() + pi * x.transpose(); |
| //We cast the gradient into a vector to make the math easier. |
| grad.resize(numCategories * state.widthOfX, 1); |
| |
| Matrix GradGradTranspose; |
| GradGradTranspose = grad * grad.transpose(); |
| state.meat += GradGradTranspose; |
| |
| /* |
| a is a matrix of size JxJ where J is the number of categories |
| a_j1j2 = -pi(j1)*(1-pi(j2))if j1 == j2 |
| a_j1j2 = pi(j1)*pi(j2) if j1 != j2 |
| */ |
| // Compute the 'a' matrix. |
| Matrix a(numCategories,numCategories); |
| Matrix piDiag = pi.asDiagonal(); |
| a = pi * pi.transpose() - piDiag; |
| |
| //Start the Hessian calculations |
| Matrix X_transp_AX(numCategories * state.widthOfX, numCategories * state.widthOfX); |
| |
| /* |
| Again: The following 3 lines could have been written as |
| Matrix XXTrans = x * x.transpose(); |
| but it creates warnings related to the type of x. Here is an easy fix |
| */ |
| Matrix cv_x = x; |
| Matrix XXTrans = trans(cv_x); |
| XXTrans = cv_x * XXTrans; |
| |
| //Eigen doesn't supported outer-products for matrices, so we have to do our own. |
| //This operation is also known as a tensor-product. |
| for (int i1 = 0; i1 < state.widthOfX; i1++){ |
| for (int i2 = 0; i2 <state.widthOfX; i2++){ |
| int rowOffset = numCategories * i1; |
| int colOffset = numCategories * i2; |
| |
| X_transp_AX.block(rowOffset, colOffset, numCategories, numCategories) = XXTrans(i1, i2) * a; |
| } |
| } |
| |
| triangularView<Lower>(state.X_transp_AX) += X_transp_AX; |
| |
| return state; |
| |
| } |
| |
| /** |
| * @brief Perform the perliminary aggregation function: Merge transition states |
| This merge step is the same as that of logistic regression. |
| */ |
| AnyType |
| mlogregr_robust_step_merge_states::run(AnyType &args) { |
| MLogRegrRobustTransitionState<MutableArrayHandle<double> > stateLeft = args[0]; |
| MLogRegrRobustTransitionState<ArrayHandle<double> > stateRight = args[1]; |
| |
| // We first handle the trivial case where this function is called with one |
| // of the states being the initial state |
| if (stateLeft.numRows == 0) |
| return stateRight; |
| else if (stateRight.numRows == 0) |
| return stateLeft; |
| |
| // Merge states together and return |
| stateLeft += stateRight; |
| return stateLeft; |
| } |
| |
| AnyType MLrobuststateToResult( |
| const Allocator &inAllocator, |
| int ref_category, |
| const HandleMap<const ColumnVector, TransparentHandle<double> >& inCoef, |
| const ColumnVector &diagonal_of_varianceMat) { |
| |
| MutableNativeColumnVector variance( |
| inAllocator.allocateArray<double>(inCoef.size())); |
| MutableNativeColumnVector stdErr( |
| inAllocator.allocateArray<double>(inCoef.size())); |
| MutableNativeColumnVector waldZStats( |
| inAllocator.allocateArray<double>(inCoef.size())); |
| MutableNativeColumnVector waldPValues( |
| inAllocator.allocateArray<double>(inCoef.size())); |
| |
| for (Index i = 0; i < inCoef.size(); ++i) { |
| variance(i) = diagonal_of_varianceMat(i); |
| stdErr(i) = std::sqrt(diagonal_of_varianceMat(i)); |
| waldZStats(i) = inCoef(i) / stdErr(i); |
| waldPValues(i) = 2. * prob::cdf( |
| prob::normal(), -std::abs(waldZStats(i))); |
| } |
| |
| // Return all coefficients, standard errors, etc. in a tuple |
| AnyType tuple; |
| tuple << ref_category << inCoef << stdErr << waldZStats << waldPValues; |
| return tuple; |
| } |
| |
| |
| /** |
| * @brief Perform the logistic-regression final step |
| */ |
| AnyType |
| mlogregr_robust_step_final::run(AnyType &args) { |
| // We request a mutable object. Depending on the backend, this might perform |
| // a deep copy. |
| MLogRegrRobustTransitionState<ArrayHandle<double> > state = args[0]; |
| // Aggregates that haven't seen any data just return Null. |
| if (state.numRows == 0) |
| return Null(); |
| |
| // See MADLIB-138. At least on certain platforms and with certain versions, |
| // LAPACK will run into an infinite loop if pinv() is called for non-finite |
| // matrices. We extend the check also to the dependent variables. |
| if (!state.X_transp_AX.is_finite()) |
| throw NoSolutionFoundException("Over- or underflow in intermediate " |
| "calculation. Input data is likely of poor numerical condition."); |
| |
| |
| SymmetricPositiveDefiniteEigenDecomposition<Matrix> decomposition( |
| -1 * state.X_transp_AX, EigenvaluesOnly, ComputePseudoInverse); |
| |
| // Precompute (X^T * A * X)^-1 |
| Matrix bread = decomposition.pseudoInverse(); |
| Matrix varianceMat; |
| varianceMat = bread * state.meat * bread; |
| |
| if(!state.coef.is_finite()) |
| throw NoSolutionFoundException("Over- or underflow in Newton step, " |
| "while updating coefficients. Input data is likely of poor " |
| "numerical condition."); |
| |
| return MLrobuststateToResult(*this, state.ref_category, state.coef, varianceMat.diagonal()); |
| } |
| // ------------------------ End of Robust Variance ----------------------------- |
| |
| AnyType __sub_array::run(AnyType &args) { |
| if (args[0].isNull() || args[1].isNull()) |
| return Null(); |
| |
| ArrayHandle<double> value = args[0].getAs<ArrayHandle<double> >(); |
| ArrayHandle<int32> index = args[1].getAs<ArrayHandle<int32> >(); |
| |
| for (size_t i = 0; i < index.size(); i++) { |
| if (index[i] < 1 || index[i] > static_cast<int>(value.size())) |
| throw std::domain_error("Invalid indices - out of bound"); |
| } |
| |
| MutableArrayHandle<double> res = |
| allocateArray<double, dbal::AggregateContext, dbal::DoZero, |
| dbal::ThrowBadAlloc>(index.size()); |
| |
| for (size_t i = 0; i < index.size(); i++) |
| res[i] = value[index[i] - 1]; |
| |
| return res; |
| } |
| |
| // ---------------------------------------------------------------------------- |
| typedef struct __sr_ctx{ |
| const double * inarray; |
| int32_t maxcall; |
| int32_t num_feature; |
| int32_t num_category; |
| int32_t ref_category; |
| int32_t curcall; |
| } sr_ctx; |
| |
| void * |
| __mlogregr_format::SRF_init(AnyType &args) { |
| sr_ctx *ctx = new sr_ctx; |
| ctx->curcall = 0; |
| |
| MutableArrayHandle<double> inarray = NULL; |
| try{ |
| inarray = args[0].getAs<MutableArrayHandle<double> >(); |
| } catch (const ArrayWithNullException &e) { |
| ctx->maxcall = 0; |
| return ctx; |
| } |
| |
| int32_t num_feature = args[1].getAs<int32_t>(); |
| int32_t num_category = args[2].getAs<int32_t>(); |
| int32_t ref_category = args[3].getAs<int32_t>(); |
| |
| ctx->inarray = inarray.ptr(); |
| ctx->maxcall = num_category - 1; |
| ctx->num_category = num_category - 1; |
| ctx->num_feature = num_feature; |
| ctx->ref_category = ref_category; |
| |
| if (num_feature * (num_category - 1) != |
| static_cast<int32_t>(inarray.size())) { |
| throw std::runtime_error("num_feature * (num_category - 1) != " |
| "inarray.size()"); |
| } |
| |
| if (ref_category >= num_category){ |
| throw std::runtime_error("ref_category >= num_category"); |
| } |
| |
| return ctx; |
| } |
| |
| AnyType |
| __mlogregr_format::SRF_next(void * user_fctx, bool * is_last_call) { |
| sr_ctx * ctx = (sr_ctx *) user_fctx; |
| if (ctx->curcall >= ctx->maxcall) { |
| *is_last_call = true; |
| return Null(); |
| } |
| |
| MutableArrayHandle<double> outarray = |
| allocateArray<double, dbal::FunctionContext, |
| dbal::DoZero, dbal::ThrowBadAlloc>(ctx->num_feature); |
| for(int i = 0; i < ctx->num_feature; i++) { |
| outarray[i] = ctx->inarray[i * ctx->num_category + ctx->curcall]; |
| } |
| |
| AnyType tuple; |
| tuple << (ctx->curcall < ctx->ref_category |
| ? ctx->curcall |
| : ctx->curcall + 1) |
| << outarray; |
| |
| ctx->curcall++; |
| |
| return tuple; |
| } |
| /* ------------------------------------------------------------ */ |
| |
| |
| AnyType mlogregr_predict_prob::run(AnyType &args) |
| { |
| // dimension: N x (L - 1), where L is the number of categories |
| MappedMatrix coef = args[0].getAs<MappedMatrix>(); |
| |
| int ref_category = args[1].getAs<int>(); |
| MappedColumnVector x; |
| try { |
| MappedColumnVector xx = args[2].getAs<MappedColumnVector>(); |
| x.rebind(xx.memoryHandle(), xx.size()); |
| } catch (const ArrayWithNullException &e) { |
| return Null(); |
| } |
| |
| int L = static_cast<int>(coef.cols()) + 1; // number of categories |
| |
| ColumnVector res(L); |
| int cat_index = 0; |
| for (int i = 0; i < L; i++) { |
| if (i == ref_category) { |
| res(i) = 1; |
| } else { |
| res(i) = exp(coef.col(cat_index).dot(x)); |
| cat_index++; |
| } |
| } |
| res /= res.sum(); |
| return res; |
| } |
| /* ------------------------------------------------------------ */ |
| |
| AnyType mlogregr_predict_response::run(AnyType &args) |
| { |
| // dimension: N x (L - 1), where L is the number of categories |
| MappedMatrix coef = args[0].getAs<MappedMatrix>(); |
| |
| int ref_category = args[1].getAs<int>(); |
| MappedColumnVector x; |
| try { |
| MappedColumnVector xx = args[2].getAs<MappedColumnVector>(); |
| x.rebind(xx.memoryHandle(), xx.size()); |
| } catch (const ArrayWithNullException &e) { |
| return Null(); |
| } |
| |
| ColumnVector linear_predictors(x.transpose() * coef); |
| Index max_cat = 0; |
| linear_predictors.maxCoeff(&max_cat); |
| double max_lp = linear_predictors(max_cat); |
| |
| if (exp(max_lp) < 1){ |
| // no category has high enough probability as reference category |
| return ref_category; |
| } else if (max_cat < ref_category){ |
| return static_cast<uint>(max_cat); |
| } |
| else{ |
| // since ref_category is not present in the coef matrix, index of |
| // categories after ref_category is 1 less than the actual index |
| return static_cast<uint>(max_cat + 1u); |
| } |
| } |
| |
| } // namespace regress |
| |
| } // namespace modules |
| |
| } // namespace madlib |