// src/modules/recursive_partitioning/random_forest.cpp - madlib - Git at Google

 /* ------------------------------------------------------
  *
  * @file random_forest.cpp
  *
  * @brief Random Forest functions
  *
  *
  */ /* ----------------------------------------------------------------------- */

 #include <cmath>
 #include <iostream>
 #include <iterator>
 #include <limits>
 #include <list>
 #include <sstream>
 #include <string>
 #include <vector>

 #include <dbconnector/dbconnector.hpp>
 #include <boost/random/discrete_distribution.hpp>
 #include <boost/random/variate_generator.hpp>

 #include "DT_proto.hpp"
 #include "DT_impl.hpp"
 #include "ConSplits.hpp"

 #include <math.h>       /* fabs */

 #include "random_forest.hpp"

 namespace madlib {

 // Use Eigen
 using namespace dbal::eigen_integration;

 using boost::random::discrete_distribution;
 using boost::random::variate_generator;

 namespace modules {

 namespace recursive_partitioning {

 typedef DecisionTree<RootContainer> Tree;

 /*
  * Permute each categorical variable and predict
  *
  * For every categorical feature i, the feature value is temporarily replaced
  * by a value drawn from column i of the supplied distributions, the tree
  * prediction is recomputed, and a score is accumulated for that feature.
  * The accumulated scores are divided by the number of permutations.
  *
  * Arguments, as read below:
  *   args[0] - serialized decision tree (ByteString)
  *   args[1] - categorical feature values (integer array)
  *   args[2] - continuous feature values (double array); NULL is treated as
  *             an empty array
  *   args[3] - number of levels of each categorical feature
  *   args[4] - number of permutations
  *   args[5] - observed response y
  *   args[6] - true for classification, false for regression
  *   args[7] - matrix of sampling weights, one column per categorical feature
  *
  * Score per draw:
  *   classification: 1 if |y - prediction| < 1e-3, otherwise 0
  *   regression:     -(y - prediction)^2
  *
  * Returns NULL if args[0], args[1] or args[7] is NULL, or if reading a
  * feature array raises ArrayWithNullException; otherwise a double array
  * with one averaged score per categorical feature.
  *
  * Note: the final division uses n_permutations as given; a value of 0
  * produces non-finite scores.
  */
 AnyType
 rf_cat_imp_score::run(AnyType &args) {
     if (args[0].isNull() || args[7].isNull()) { return Null(); }
     Tree dt = args[0].getAs<ByteString>();
     MutableNativeIntegerVector cat_features;
     NativeColumnVector con_features;
     try {
         if (args[1].isNull()){
             // no cat features
             return Null();
         }
         else {
             MutableNativeIntegerVector xx_cat = args[1].getAs<MutableNativeIntegerVector>();
             cat_features.rebind(xx_cat.memoryHandle(), xx_cat.size());
         }
         if (args[2].isNull()){
             con_features.rebind(this->allocateArray<double>(0));
         }
         else {
             NativeColumnVector xx_con = args[2].getAs<NativeColumnVector>();
             con_features.rebind(xx_con.memoryHandle(), xx_con.size());
         }
     } catch (const ArrayWithNullException &) {
         // not expect to reach here
         // if max_surr = 0, nulls are filtered
         // otherwise, mapped to -1 or NaN
         return Null();
     }

     MappedIntegerVector cat_n_levels = args[3].getAs<MappedIntegerVector>();

     int n_permutations = args[4].getAs<int>();
     double y = args[5].getAs<double>();
     bool is_classification = args[6].getAs<bool>();
     MappedMatrix distributions = args[7].getAs<MappedMatrix>();

     // returning: one accumulated score per categorical feature
     MutableNativeColumnVector permuted_predictions(
             this->allocateArray<double>(cat_n_levels.size()));

     // permute each and predict
     NativeRandomNumberGenerator generator;
     for (int p = 0; p < n_permutations; p ++) {
         for (Index i = 0; i < cat_n_levels.size(); i ++) {
             int orig_i = cat_features(i);
             // The first cat_n_levels(i) + 1 entries of column i are the
             // sampling weights for feature i.
             discrete_distribution<> ddist(distributions.col(i).data(),
                     distributions.col(i).data() + cat_n_levels(i) + 1);
             variate_generator<NativeRandomNumberGenerator, discrete_distribution<> >
                     rvt(generator, ddist);

             // Sampled index is shifted down by one, so index 0 becomes -1
             // (presumably the encoding of a NULL level -- see the note in
             // the catch block above).
             cat_features(i) = rvt() - 1;

             // calling NativeIntegerVector for a const cast
             // see EigenIntegration_impl.hpp in ports for details
             double prediction = dt.predict_response(
                 NativeIntegerVector(cat_features.memoryHandle()), con_features);
             double score = 0.;
             if (is_classification) {
                 // Compare the absolute difference: without it, every
                 // prediction larger than y would be counted as a match.
                 score = std::fabs(y - prediction) < 1e-3 ? 1. : 0.;
             } else {
                 score = - (y - prediction) * (y - prediction);
             }
             permuted_predictions(i) += score;

             // restore the original value before permuting the next feature
             cat_features(i) = orig_i;
         }
     }
     permuted_predictions /= n_permutations;
     return permuted_predictions;
 }
 // ------------------------------------------------------------


 /*
  * Permute each continuous variable and predict
  *
  * For every continuous feature i, the feature value is temporarily replaced
  * by a value derived from a bin drawn from column i of the supplied
  * distributions, the tree prediction is recomputed, and a score is
  * accumulated for that feature. The accumulated scores are divided by the
  * number of permutations.
  *
  * Arguments, as read below:
  *   args[0] - serialized decision tree (ByteString)
  *   args[1] - categorical feature values (integer array); NULL is treated
  *             as an empty array
  *   args[2] - continuous feature values (double array)
  *   args[3] - serialized continuous splits (ByteString)
  *   args[4] - number of permutations
  *   args[5] - observed response y
  *   args[6] - true for classification, false for regression
  *   args[7] - matrix of sampling weights, one column per continuous feature
  *
  * Mapping of the sampled bin index ("outcome") to a feature value:
  *   0                        -> NaN
  *   distributions.rows() - 1 -> last split value used + 1
  *   otherwise                -> con_splits(i, outcome - 1)
  *
  * Score per draw:
  *   classification: 1 if |y - prediction| < 1e-3, otherwise 0
  *   regression:     -(y - prediction)^2
  *
  * Returns NULL if args[0], args[2] or args[7] is NULL, or if reading a
  * feature array raises ArrayWithNullException; otherwise a double array
  * with one averaged score per continuous feature.
  *
  * Note: the final division uses n_permutations as given; a value of 0
  * produces non-finite scores.
  */
 AnyType
 rf_con_imp_score::run(AnyType &args) {
     if (args[0].isNull() || args[7].isNull()) { return Null(); }
     Tree dt = args[0].getAs<ByteString>();
     NativeIntegerVector cat_features;
     MutableNativeColumnVector con_features;
     try {
         if (args[1].isNull()){
             // no cat features
             cat_features.rebind(this->allocateArray<int>(0));
         }
         else {
             NativeIntegerVector xx_cat = args[1].getAs<NativeIntegerVector>();
             cat_features.rebind(xx_cat.memoryHandle(), xx_cat.size());
         }
         if (args[2].isNull()){
             //no con features
             return Null();
         }
         else {
             MutableNativeColumnVector xx_con = args[2].getAs<MutableNativeColumnVector>();
             con_features.rebind(xx_con.memoryHandle(), xx_con.size());
         }
     } catch (const ArrayWithNullException &) {
         // not expect to reach here
         // if max_surr = 0, nulls are filtered
         // otherwise, mapped to -1 or NaN
         return Null();
     }

     // con_splits size = num_con_features x num_bins
     // When num_con_features = 0, the input will be an empty string that is read
     // as a ByteString
     ConSplitsResult<RootContainer> splits_results = args[3].getAs<ByteString>();

     int n_permutations = args[4].getAs<int>();
     double y = args[5].getAs<double>();
     bool is_classification = args[6].getAs<bool>();
     MappedMatrix distributions = args[7].getAs<MappedMatrix>();

     // returning: one accumulated score per continuous feature
     MutableNativeColumnVector permuted_predictions(
             this->allocateArray<double>(con_features.size()));

     // Index of the last bin; it does not change inside the loops.
     const int last_outcome = static_cast<int>(distributions.rows()) - 1;

     // permute each and predict
     NativeRandomNumberGenerator generator;
     for (int p = 0; p < n_permutations; p ++) {
         for (Index i = 0; i < con_features.size(); i ++) {
             double orig_i = con_features(i);
             // All rows of column i are the sampling weights for feature i.
             discrete_distribution<> ddist(distributions.col(i).data(),
                     distributions.col(i).data() + distributions.rows());
             variate_generator<NativeRandomNumberGenerator, discrete_distribution<> >
                     rvt(generator, ddist);

             int outcome = rvt();
             if (outcome == 0) {
                 con_features(i) = std::numeric_limits<double>::quiet_NaN();
             } else if (outcome == last_outcome) {
                 // bin value that is larger than the last separator (last value in con_splits)
                 con_features(i) = splits_results.con_splits(i, outcome-2) + 1.;
             } else {
                 con_features(i) = splits_results.con_splits(i, outcome-1);
             }

             // calling NativeColumnVector for a const cast
             // see EigenIntegration_impl.hpp in ports for details
             double prediction = dt.predict_response(
                 cat_features, NativeColumnVector(con_features.memoryHandle()));
             double score = 0.;
             if (is_classification) {
                 // Compare the absolute difference: without it, every
                 // prediction larger than y would be counted as a match.
                 score = std::fabs(y - prediction) < 1e-3 ? 1. : 0.;
             } else {
                 score = - (y - prediction) * (y - prediction);
             }
             permuted_predictions(i) += score;

             // restore the original value before permuting the next feature
             con_features(i) = orig_i;
         }
     }
     permuted_predictions /= n_permutations;
     return permuted_predictions;
 }
 // ------------------------------------------------------------


 /*
  * Rescale a vector so that its elements add up to a target sum.
  *
  *   args[0] - input vector of doubles
  *   args[1] - target sum
  *
  * Each element is multiplied by the target sum and divided by the sum of
  * the input. A sum below 1e-6 is replaced by 1e-6 before dividing, so the
  * division never uses a zero (or smaller) divisor.
  */
 AnyType
 normalize_sum_array::run(AnyType &args){
     // Smallest divisor allowed; guards against division by zero.
     const double min_divisor = 1e-6;

     const MappedColumnVector values = args[0].getAs<MappedColumnVector>();
     const double target_sum = args[1].getAs<double>();

     const double raw_sum = values.sum();
     const double divisor = (raw_sum < min_divisor) ? min_divisor : raw_sum;

     ColumnVector scaled = values * target_sum / divisor;
     return scaled;
 }


 } // namespace recursive_partitioning
 } // namespace modules
 } // namespace madlib
	/* ------------------------------------------------------
	*
	* @file random_forest.cpp
	*
	* @brief Random Forest functions
	*
	*
	*/ /* ----------------------------------------------------------------------- */

	#include <iostream>
	#include <sstream>
	#include <vector>
	#include <string>
	#include <list>
	#include <iterator>

	#include <dbconnector/dbconnector.hpp>
	#include <boost/random/discrete_distribution.hpp>
	#include <boost/random/variate_generator.hpp>

	#include "DT_proto.hpp"
	#include "DT_impl.hpp"
	#include "ConSplits.hpp"

	#include <math.h> /* fabs */

	#include "random_forest.hpp"

	namespace madlib {

	// Use Eigen
	using namespace dbal::eigen_integration;

	using boost::random::discrete_distribution;
	using boost::random::variate_generator;

	namespace modules {

	namespace recursive_partitioning {

	typedef DecisionTree<RootContainer> Tree;

	/*
	 * Permute each categorical variable and predict
	 *
	 * For every categorical feature i, the feature value is temporarily replaced
	 * by a value drawn from column i of the supplied distributions, the tree
	 * prediction is recomputed, and a score is accumulated for that feature.
	 * The accumulated scores are divided by the number of permutations.
	 *
	 * Arguments, as read below:
	 *   args[0] - serialized decision tree (ByteString)
	 *   args[1] - categorical feature values (integer array)
	 *   args[2] - continuous feature values (double array); NULL is treated as
	 *             an empty array
	 *   args[3] - number of levels of each categorical feature
	 *   args[4] - number of permutations
	 *   args[5] - observed response y
	 *   args[6] - true for classification, false for regression
	 *   args[7] - matrix of sampling weights, one column per categorical feature
	 *
	 * Score per draw:
	 *   classification: 1 if |y - prediction| < 1e-3, otherwise 0
	 *   regression:     -(y - prediction)^2
	 *
	 * Returns NULL if args[0], args[1] or args[7] is NULL, or if reading a
	 * feature array raises ArrayWithNullException; otherwise a double array
	 * with one averaged score per categorical feature.
	 *
	 * Note: the final division uses n_permutations as given; a value of 0
	 * produces non-finite scores.
	 */
	AnyType
	rf_cat_imp_score::run(AnyType &args) {
	    if (args[0].isNull() || args[7].isNull()) { return Null(); }
	    Tree dt = args[0].getAs<ByteString>();
	    MutableNativeIntegerVector cat_features;
	    NativeColumnVector con_features;
	    try {
	        if (args[1].isNull()){
	            // no cat features
	            return Null();
	        }
	        else {
	            MutableNativeIntegerVector xx_cat = args[1].getAs<MutableNativeIntegerVector>();
	            cat_features.rebind(xx_cat.memoryHandle(), xx_cat.size());
	        }
	        if (args[2].isNull()){
	            con_features.rebind(this->allocateArray<double>(0));
	        }
	        else {
	            NativeColumnVector xx_con = args[2].getAs<NativeColumnVector>();
	            con_features.rebind(xx_con.memoryHandle(), xx_con.size());
	        }
	    } catch (const ArrayWithNullException &) {
	        // not expect to reach here
	        // if max_surr = 0, nulls are filtered
	        // otherwise, mapped to -1 or NaN
	        return Null();
	    }

	    MappedIntegerVector cat_n_levels = args[3].getAs<MappedIntegerVector>();

	    int n_permutations = args[4].getAs<int>();
	    double y = args[5].getAs<double>();
	    bool is_classification = args[6].getAs<bool>();
	    MappedMatrix distributions = args[7].getAs<MappedMatrix>();

	    // returning: one accumulated score per categorical feature
	    MutableNativeColumnVector permuted_predictions(
	            this->allocateArray<double>(cat_n_levels.size()));

	    // permute each and predict
	    NativeRandomNumberGenerator generator;
	    for (int p = 0; p < n_permutations; p ++) {
	        for (Index i = 0; i < cat_n_levels.size(); i ++) {
	            int orig_i = cat_features(i);
	            // The first cat_n_levels(i) + 1 entries of column i are the
	            // sampling weights for feature i.
	            discrete_distribution<> ddist(distributions.col(i).data(),
	                    distributions.col(i).data() + cat_n_levels(i) + 1);
	            variate_generator<NativeRandomNumberGenerator, discrete_distribution<> >
	                    rvt(generator, ddist);

	            // Sampled index is shifted down by one, so index 0 becomes -1
	            // (presumably the encoding of a NULL level -- see the note in
	            // the catch block above).
	            cat_features(i) = rvt() - 1;

	            // calling NativeIntegerVector for a const cast
	            // see EigenIntegration_impl.hpp in ports for details
	            double prediction = dt.predict_response(
	                NativeIntegerVector(cat_features.memoryHandle()), con_features);
	            double score = 0.;
	            if (is_classification) {
	                // Compare the absolute difference: without it, every
	                // prediction larger than y would be counted as a match.
	                score = std::fabs(y - prediction) < 1e-3 ? 1. : 0.;
	            } else {
	                score = - (y - prediction) * (y - prediction);
	            }
	            permuted_predictions(i) += score;

	            // restore the original value before permuting the next feature
	            cat_features(i) = orig_i;
	        }
	    }
	    permuted_predictions /= n_permutations;
	    return permuted_predictions;
	}
	// ------------------------------------------------------------


	/*
	 * Permute each continuous variable and predict
	 *
	 * For every continuous feature i, the feature value is temporarily replaced
	 * by a value derived from a bin drawn from column i of the supplied
	 * distributions, the tree prediction is recomputed, and a score is
	 * accumulated for that feature. The accumulated scores are divided by the
	 * number of permutations.
	 *
	 * Arguments, as read below:
	 *   args[0] - serialized decision tree (ByteString)
	 *   args[1] - categorical feature values (integer array); NULL is treated
	 *             as an empty array
	 *   args[2] - continuous feature values (double array)
	 *   args[3] - serialized continuous splits (ByteString)
	 *   args[4] - number of permutations
	 *   args[5] - observed response y
	 *   args[6] - true for classification, false for regression
	 *   args[7] - matrix of sampling weights, one column per continuous feature
	 *
	 * Mapping of the sampled bin index ("outcome") to a feature value:
	 *   0                        -> NaN
	 *   distributions.rows() - 1 -> last split value used + 1
	 *   otherwise                -> con_splits(i, outcome - 1)
	 *
	 * Score per draw:
	 *   classification: 1 if |y - prediction| < 1e-3, otherwise 0
	 *   regression:     -(y - prediction)^2
	 *
	 * Returns NULL if args[0], args[2] or args[7] is NULL, or if reading a
	 * feature array raises ArrayWithNullException; otherwise a double array
	 * with one averaged score per continuous feature.
	 *
	 * Note: the final division uses n_permutations as given; a value of 0
	 * produces non-finite scores.
	 */
	AnyType
	rf_con_imp_score::run(AnyType &args) {
	    if (args[0].isNull() || args[7].isNull()) { return Null(); }
	    Tree dt = args[0].getAs<ByteString>();
	    NativeIntegerVector cat_features;
	    MutableNativeColumnVector con_features;
	    try {
	        if (args[1].isNull()){
	            // no cat features
	            cat_features.rebind(this->allocateArray<int>(0));
	        }
	        else {
	            NativeIntegerVector xx_cat = args[1].getAs<NativeIntegerVector>();
	            cat_features.rebind(xx_cat.memoryHandle(), xx_cat.size());
	        }
	        if (args[2].isNull()){
	            //no con features
	            return Null();
	        }
	        else {
	            MutableNativeColumnVector xx_con = args[2].getAs<MutableNativeColumnVector>();
	            con_features.rebind(xx_con.memoryHandle(), xx_con.size());
	        }
	    } catch (const ArrayWithNullException &) {
	        // not expect to reach here
	        // if max_surr = 0, nulls are filtered
	        // otherwise, mapped to -1 or NaN
	        return Null();
	    }

	    // con_splits size = num_con_features x num_bins
	    // When num_con_features = 0, the input will be an empty string that is read
	    // as a ByteString
	    ConSplitsResult<RootContainer> splits_results = args[3].getAs<ByteString>();

	    int n_permutations = args[4].getAs<int>();
	    double y = args[5].getAs<double>();
	    bool is_classification = args[6].getAs<bool>();
	    MappedMatrix distributions = args[7].getAs<MappedMatrix>();

	    // returning: one accumulated score per continuous feature
	    MutableNativeColumnVector permuted_predictions(
	            this->allocateArray<double>(con_features.size()));

	    // Index of the last bin; it does not change inside the loops.
	    const int last_outcome = static_cast<int>(distributions.rows()) - 1;

	    // permute each and predict
	    NativeRandomNumberGenerator generator;
	    for (int p = 0; p < n_permutations; p ++) {
	        for (Index i = 0; i < con_features.size(); i ++) {
	            double orig_i = con_features(i);
	            // All rows of column i are the sampling weights for feature i.
	            discrete_distribution<> ddist(distributions.col(i).data(),
	                    distributions.col(i).data() + distributions.rows());
	            variate_generator<NativeRandomNumberGenerator, discrete_distribution<> >
	                    rvt(generator, ddist);

	            int outcome = rvt();
	            if (outcome == 0) {
	                con_features(i) = std::numeric_limits<double>::quiet_NaN();
	            } else if (outcome == last_outcome) {
	                // bin value that is larger than the last separator (last value in con_splits)
	                con_features(i) = splits_results.con_splits(i, outcome-2) + 1.;
	            } else {
	                con_features(i) = splits_results.con_splits(i, outcome-1);
	            }

	            // calling NativeColumnVector for a const cast
	            // see EigenIntegration_impl.hpp in ports for details
	            double prediction = dt.predict_response(
	                cat_features, NativeColumnVector(con_features.memoryHandle()));
	            double score = 0.;
	            if (is_classification) {
	                // Compare the absolute difference: without it, every
	                // prediction larger than y would be counted as a match.
	                score = std::fabs(y - prediction) < 1e-3 ? 1. : 0.;
	            } else {
	                score = - (y - prediction) * (y - prediction);
	            }
	            permuted_predictions(i) += score;

	            // restore the original value before permuting the next feature
	            con_features(i) = orig_i;
	        }
	    }
	    permuted_predictions /= n_permutations;
	    return permuted_predictions;
	}
	// ------------------------------------------------------------


	/*
	 * Rescale a vector so that its elements add up to a target sum.
	 *
	 *   args[0] - input vector of doubles
	 *   args[1] - target sum
	 *
	 * Each element is multiplied by the target sum and divided by the sum of
	 * the input. A sum below 1e-6 is replaced by 1e-6 before dividing, so the
	 * division never uses a zero (or smaller) divisor.
	 */
	AnyType
	normalize_sum_array::run(AnyType &args){
	    // Smallest divisor allowed; guards against division by zero.
	    const double min_divisor = 1e-6;

	    const MappedColumnVector values = args[0].getAs<MappedColumnVector>();
	    const double target_sum = args[1].getAs<double>();

	    const double raw_sum = values.sum();
	    const double divisor = (raw_sum < min_divisor) ? min_divisor : raw_sum;

	    ColumnVector scaled = values * target_sum / divisor;
	    return scaled;
	}


	} // namespace recursive_partitioning
	} // namespace modules
	} // namespace madlib