blob: a12f095220aa01db24261bfbe075455382fff81a [file] [log] [blame]
/* ------------------------------------------------------
*
* @file random_forest.cpp
*
* @brief Random Forest functions
*
*
*/ /* ----------------------------------------------------------------------- */
#include <iostream>
#include <sstream>
#include <vector>
#include <string>
#include <list>
#include <iterator>
#include <dbconnector/dbconnector.hpp>
#include <boost/random/discrete_distribution.hpp>
#include <boost/random/variate_generator.hpp>
#include "DT_proto.hpp"
#include "DT_impl.hpp"
#include "ConSplits.hpp"
#include <math.h> /* fabs */
#include "random_forest.hpp"
namespace madlib {
// Use Eigen
using namespace dbal::eigen_integration;
using boost::random::discrete_distribution;
using boost::random::variate_generator;
namespace modules {
namespace recursive_partitioning {
// Shorthand for the DecisionTree specialization on RootContainer; the
// functions in this file construct one from the ByteString passed as their
// first argument and call predict_response() on it.
typedef DecisionTree<RootContainer> Tree;
/**
 * @brief Permute each categorical variable and predict
 *
 * For every categorical feature in turn, the feature value is replaced by a
 * random draw from that feature's distribution, the tree is asked for a
 * prediction on the modified row, the prediction is scored against the
 * observed response, and the original value is restored. The scores are
 * averaged over all permutations.
 *
 * @param args Positional arguments:
 *   - args[0]: serialized decision tree (read as ByteString)
 *   - args[1]: categorical feature values of the row (integer array)
 *   - args[2]: continuous feature values of the row (double array, may be NULL)
 *   - args[3]: number of levels of each categorical feature
 *   - args[4]: number of permutations to run
 *   - args[5]: observed response y
 *   - args[6]: true for classification, false for regression
 *   - args[7]: matrix whose column i holds the sampling weights of
 *              categorical feature i (the first cat_n_levels(i) + 1 entries
 *              are used)
 *
 * @return NULL if args[0], args[1] or args[7] is NULL or an input array
 *         contains NULLs; otherwise a double array with one entry per
 *         categorical feature holding the mean score. The score of a single
 *         prediction is 1/0 (match / no match within 1e-3) for classification
 *         and the negative squared error for regression.
 */
AnyType
rf_cat_imp_score::run(AnyType &args) {
    if (args[0].isNull() || args[7].isNull()) { return Null(); }

    Tree dt = args[0].getAs<ByteString>();
    MutableNativeIntegerVector cat_features;
    NativeColumnVector con_features;
    try {
        if (args[1].isNull()) {
            // no cat features
            return Null();
        }
        else {
            MutableNativeIntegerVector xx_cat =
                args[1].getAs<MutableNativeIntegerVector>();
            cat_features.rebind(xx_cat.memoryHandle(), xx_cat.size());
        }
        if (args[2].isNull()) {
            con_features.rebind(this->allocateArray<double>(0));
        }
        else {
            NativeColumnVector xx_con = args[2].getAs<NativeColumnVector>();
            con_features.rebind(xx_con.memoryHandle(), xx_con.size());
        }
    } catch (const ArrayWithNullException &) {
        // not expect to reach here
        // if max_surr = 0, nulls are filtered
        // otherwise, mapped to -1 or NaN
        return Null();
    }

    MappedIntegerVector cat_n_levels = args[3].getAs<MappedIntegerVector>();
    int n_permutations = args[4].getAs<int>();
    double y = args[5].getAs<double>();
    bool is_classification = args[6].getAs<bool>();
    MappedMatrix distributions = args[7].getAs<MappedMatrix>();

    // returning
    MutableNativeColumnVector permuted_predictions(
        this->allocateArray<double>(cat_n_levels.size()));

    // permute each and predict
    NativeRandomNumberGenerator generator;
    for (int p = 0; p < n_permutations; p ++) {
        for (Index i = 0; i < cat_n_levels.size(); i ++) {
            int orig_i = cat_features(i);

            // Weights for feature i: cat_n_levels(i) + 1 entries of column i
            discrete_distribution<> ddist(
                distributions.col(i).data(),
                distributions.col(i).data() + cat_n_levels(i) + 1);
            variate_generator<NativeRandomNumberGenerator,
                              discrete_distribution<> > rvt(generator, ddist);

            // Shift the draw by one so that outcome 0 becomes -1
            // (presumably the encoding of a NULL level, see the note on
            // null mapping above)
            cat_features(i) = rvt() - 1;

            // calling NativeIntegerVector for a const cast
            // see EigenIntegration_impl.hpp in ports for details
            double prediction = dt.predict_response(
                NativeIntegerVector(cat_features.memoryHandle()), con_features);

            double score = 0.;
            if (is_classification) {
                // Compare the absolute difference: with a signed difference
                // every prediction greater than y would count as a match.
                score = fabs(y - prediction) < 1e-3 ? 1. : 0.;
            } else {
                score = - (y - prediction) * (y - prediction);
            }
            permuted_predictions(i) += score;

            // Restore the original value before permuting the next feature
            cat_features(i) = orig_i;
        }
    }
    permuted_predictions /= n_permutations;
    return permuted_predictions;
}
// ------------------------------------------------------------
/**
 * @brief Permute each continuous variable and predict
 *
 * For every continuous feature in turn, the feature value is replaced by a
 * value derived from a random draw over that feature's distribution column,
 * the tree is asked for a prediction on the modified row, the prediction is
 * scored against the observed response, and the original value is restored.
 * The scores are averaged over all permutations.
 *
 * @param args Positional arguments:
 *   - args[0]: serialized decision tree (read as ByteString)
 *   - args[1]: categorical feature values of the row (integer array, may be
 *              NULL)
 *   - args[2]: continuous feature values of the row (double array)
 *   - args[3]: serialized continuous splits (read as ByteString);
 *              con_splits is num_con_features x num_bins
 *   - args[4]: number of permutations to run
 *   - args[5]: observed response y
 *   - args[6]: true for classification, false for regression
 *   - args[7]: matrix whose column i holds the sampling weights of
 *              continuous feature i
 *
 * @return NULL if args[0], args[2] or args[7] is NULL or an input array
 *         contains NULLs; otherwise a double array with one entry per
 *         continuous feature holding the mean score. The score of a single
 *         prediction is 1/0 (match / no match within 1e-3) for classification
 *         and the negative squared error for regression.
 */
AnyType
rf_con_imp_score::run(AnyType &args) {
    if (args[0].isNull() || args[7].isNull()) { return Null(); }

    Tree dt = args[0].getAs<ByteString>();
    NativeIntegerVector cat_features;
    MutableNativeColumnVector con_features;
    try {
        if (args[1].isNull()) {
            // no cat features
            cat_features.rebind(this->allocateArray<int>(0));
        }
        else {
            NativeIntegerVector xx_cat = args[1].getAs<NativeIntegerVector>();
            cat_features.rebind(xx_cat.memoryHandle(), xx_cat.size());
        }
        if (args[2].isNull()) {
            //no con features
            return Null();
        }
        else {
            MutableNativeColumnVector xx_con =
                args[2].getAs<MutableNativeColumnVector>();
            con_features.rebind(xx_con.memoryHandle(), xx_con.size());
        }
    } catch (const ArrayWithNullException &) {
        // not expect to reach here
        // if max_surr = 0, nulls are filtered
        // otherwise, mapped to -1 or NaN
        return Null();
    }

    // con_splits size = num_con_features x num_bins
    // When num_con_features = 0, the input will be an empty string that is read
    // as a ByteString
    ConSplitsResult<RootContainer> splits_results = args[3].getAs<ByteString>();
    int n_permutations = args[4].getAs<int>();
    double y = args[5].getAs<double>();
    bool is_classification = args[6].getAs<bool>();
    MappedMatrix distributions = args[7].getAs<MappedMatrix>();

    // returning
    MutableNativeColumnVector permuted_predictions(
        this->allocateArray<double>(con_features.size()));

    // Largest outcome the distribution can produce (one weight per row)
    const int last_outcome = static_cast<int>(distributions.rows()) - 1;

    // permute each and predict
    NativeRandomNumberGenerator generator;
    for (int p = 0; p < n_permutations; p ++) {
        for (Index i = 0; i < con_features.size(); i ++) {
            double orig_i = con_features(i);

            discrete_distribution<> ddist(
                distributions.col(i).data(),
                distributions.col(i).data() + distributions.rows());
            variate_generator<NativeRandomNumberGenerator,
                              discrete_distribution<> > rvt(generator, ddist);
            int outcome = rvt();

            if (outcome == 0) {
                // outcome 0 is translated to a NaN feature value
                con_features(i) = std::numeric_limits<double>::quiet_NaN();
            } else if (outcome == last_outcome) {
                // bin value that is larger than the last separator (last value in con_splits)
                con_features(i) = splits_results.con_splits(i, outcome-2) + 1.;
            } else {
                con_features(i) = splits_results.con_splits(i, outcome-1);
            }

            // calling NativeColumnVector for a const cast
            // see EigenIntegration_impl.hpp in ports for details
            double prediction = dt.predict_response(
                cat_features, NativeColumnVector(con_features.memoryHandle()));

            double score = 0.;
            if (is_classification) {
                // Compare the absolute difference: with a signed difference
                // every prediction greater than y would count as a match.
                score = fabs(y - prediction) < 1e-3 ? 1. : 0.;
            } else {
                score = - (y - prediction) * (y - prediction);
            }
            permuted_predictions(i) += score;

            // Restore the original value before permuting the next feature
            con_features(i) = orig_i;
        }
    }
    permuted_predictions /= n_permutations;
    return permuted_predictions;
}
// ------------------------------------------------------------
/**
 * @brief Rescale a vector so that its elements add up to a target sum
 *
 * @param args Positional arguments:
 *   - args[0]: input vector of doubles
 *   - args[1]: the sum the output vector should have
 *
 * @return input * target / sum(input). If sum(input) is smaller than 1e-6
 *         (which includes zero and negative sums), 1e-6 is used as the
 *         divisor instead so that the division never hits zero.
 */
AnyType
normalize_sum_array::run(AnyType &args){
    // Lower bound for the divisor; guards against division by zero
    const double kMinDivisor = 1e-6;

    const MappedColumnVector values = args[0].getAs<MappedColumnVector>();
    const double target_total = args[1].getAs<double>();

    const double raw_total = values.sum();
    const double divisor = (raw_total < kMinDivisor) ? kMinDivisor : raw_total;

    ColumnVector scaled = values * target_total / divisor;
    return scaled;
}
} // namespace recursive_partitioning
} // namespace modules
} // namespace madlib