#-------------------------------------------------------------
#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
#
#-------------------------------------------------------------
# This script implements random forest for recoded and binned categorical and
# numerical input features. In detail, we train multiple CART (classification
# and regression trees) decision trees in parallel and use them as an ensemble
# classifier/regressor. Each tree is trained on a sample of observations (rows)
# and optionally a subset of features (columns). During tree construction, split
# candidates are additionally chosen from a sample of the remaining features.
#
# .. code-block:: text
#
# For example, given a feature matrix with features [a,b,c,d]
# and the following two trees, M (the output) would look as follows:
#
# (L1)            |a<7|                     |d<5|
#                 /     \                   /     \
# (L2)        |c<3|     |b<4|           |a<7|     P3:2
#             /   \     /   \           /   \
# (L3)     P1:2  P2:1 P3:1  P4:2     P1:2  P2:1
#
# --> M :=
# [[1, 7, 3, 3, 2, 4, 0, 2, 0, 1, 0, 1, 0, 2],  (1st tree)
#  [4, 5, 1, 7, 0, 2, 0, 2, 0, 1, 0, 0, 0, 0]]  (2nd tree)
#  |(L1)||   (L2)   ||         (L3)         |
#
# Each tree is prefixed by a 0/1 indicator vector of its sampled features
# (e.g., [1,1,1,0] if features a,b,c of the four were sampled); without
# feature sampling (feature_frac = 1) this prefix is all ones.
#
# .. code-block:: python
#
# >>> import numpy as np
# >>> from systemds.context import SystemDSContext
# >>> from systemds.operator.algorithm import randomForest, randomForestPredict
# >>>
# >>> # tiny toy dataset
# >>> X = np.array([[1],
# ... [2],
# ... [10],
# ... [11]], dtype=np.int64)
# >>> y = np.array([[1],
# ... [1],
# ... [2],
# ... [2]], dtype=np.int64)
# >>>
# >>> with SystemDSContext() as sds:
# ... X_sds = sds.from_numpy(X)
# ... y_sds = sds.from_numpy(y)
# ...
# ... ctypes = sds.from_numpy(np.array([[1, 2]], dtype=np.int64))
# ...
# ... # train a 4-tree forest (no sampling)
# ... M = randomForest(
# ... X_sds, y_sds, ctypes,
# ... num_trees = 4,
# ... sample_frac = 1.0,
# ... feature_frac = 1.0,
# ... max_depth = 3,
# ... min_leaf = 1,
# ... min_split = 2,
# ... seed = 42
# ... )
# ...
# ... preds = randomForestPredict(X_sds, ctypes, M).compute()
# ... print(preds)
# [[1.]
# [1.]
# [2.]
# [2.]]
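#
# Continuing the example above (a minimal sketch, assuming the model M and
# the numpy array X are still in scope inside the same SystemDSContext block):
# the first ncol(X) columns of the returned model are the per-tree 0/1
# feature-usage indicators, the remaining 2*(2^max_depth-1) columns hold the
# linearized trees.
#
# .. code-block:: python
#
# >>> M_np = M.compute()               # materialize the model as a numpy array
# >>> F_ind = M_np[:, :X.shape[1]]     # 0/1 feature indicators (all ones here)
# >>> trees = M_np[:, X.shape[1]:]     # linearized trees, 2*(2^3-1)=14 values each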
#
#
# INPUT:
# ------------------------------------------------------------------------------
# X             Feature matrix in recoded/binned representation
# y             Label matrix in recoded/binned representation
# ctypes        Row-Vector of column types [1 scale/ordinal, 2 categorical]
#               of shape 1-by-(ncol(X)+1), where the last entry is the y type
# num_trees     Number of trees to be learned in the random forest model
# sample_frac   Sample fraction of examples for each tree in the forest
# feature_frac  Sample fraction of features for each tree in the forest
# max_depth     Maximum depth of the learned tree (stopping criterion)
# min_leaf      Minimum number of samples in leaf nodes (stopping criterion)
# min_split     Minimum number of samples in leaf for attempting a split
# max_features  Parameter controlling the number of features used as split
#               candidates at tree nodes: m = ceil(num_features^max_features)
# max_values    Parameter controlling the number of values per feature used
#               as split candidates: nb = ceil(num_values^max_values)
# impurity      Impurity measure: entropy, gini (default), rss (regression)
# seed          Fixed seed for randomization of samples and split candidates
# verbose       Flag indicating verbose debug output
# ------------------------------------------------------------------------------
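#
# For example, with the default max_features = 0.5 and a feature matrix of
# 100 columns, each tree node evaluates m = ceil(100^0.5) = 10 candidate
# features; with max_values = 1.0, all nb = num_values distinct values of a
# feature remain split candidates.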
#
# OUTPUT:
# ------------------------------------------------------------------------------
# M Matrix M containing the learned trees, in linearized form.
# ------------------------------------------------------------------------------
m_randomForest = function(Matrix[Double] X, Matrix[Double] y, Matrix[Double] ctypes,
    Int num_trees = 16, Double sample_frac = 0.1, Double feature_frac = 1.0,
    Int max_depth = 10, Int min_leaf = 20, Int min_split = 50,
    Double max_features = 0.5, Double max_values = 1.0,
    String impurity = "gini", Int seed = -1, Boolean verbose = FALSE)
  return(Matrix[Double] M)
{
  t1 = time();
  # validation and initialization of reproducible seeds
  if(verbose) {
    print("randomForest: initialize with num_trees=" + num_trees + ", sample_frac=" + sample_frac
      + ", feature_frac=" + feature_frac + ", impurity=" + impurity + ", seed=" + seed + ".");
  }
  if(ncol(ctypes) != ncol(X)+1)
    stop("randomForest: inconsistent num features (incl. label) and col types: "+ncol(X)+" vs "+ncol(ctypes)+".");
  if( sum(X<=0) != 0 )
    stop("randomForest: feature matrix X is not properly recoded/binned: "+sum(X<=0));
  if(sum(y <= 0) != 0)
    stop("randomForest: y is not properly recoded and binned (contiguous positive integers).");
  if(max(y) == 1)
    stop("randomForest: y contains only one class label.");
  lseed = as.integer(ifelse(seed!=-1, seed, as.scalar(rand(rows=1,cols=1,min=0, max=1e9))));
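  # draw three seeds per tree: one for row sampling (si1), one for feature
  # sampling (si2), and one for split-candidate sampling in decisionTree (si3)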
  randSeeds = rand(rows = 3 * num_trees, cols = 1, seed=lseed, min=0, max=1e9);
  # training of num_trees decision trees
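  # M stores each tree in linearized form, 2 values per node of a complete
  # binary tree of depth max_depth; F tracks the per-tree 0/1 feature-usage
  # indicators (all ones unless feature_frac < 1)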
  M = matrix(0, rows=num_trees, cols=2*(2^max_depth-1));
  F = matrix(1, rows=num_trees, cols=ncol(X));
  parfor(i in 1:num_trees) {
    if( verbose )
      print("randomForest: start training tree "+i+"/"+num_trees+".");
    # step 1: sample data
    Xi = X; yi = y;
    if( sample_frac < 1.0 ) {
      si1 = as.integer(as.scalar(randSeeds[3*(i-1)+1,1]));
      I1 = rand(rows=nrow(X), cols=1, seed=si1) <= sample_frac;
      if( sum(I1) <= 1 ) # min 2 tuples
        I1[1:2,] = matrix(1,2,1);
      Xi = removeEmpty(target=X, margin="rows", select=I1);
      yi = removeEmpty(target=y, margin="rows", select=I1);
    }
    # step 2: sample features
    if( feature_frac < 1.0 ) {
      si2 = as.integer(as.scalar(randSeeds[3*(i-1)+2,1]));
      I2 = rand(rows=ncol(X), cols=1, seed=si2) <= feature_frac;
      Xi = removeEmpty(target=Xi, margin="cols", select=I2);
      F[i,] = t(I2);
    }
    if( verbose )
      print("-- ["+i+"] sampled "+nrow(Xi)+"/"+nrow(X)+" rows and "+ncol(Xi)+"/"+ncol(X)+" cols.");
    # step 3: train decision tree
    t2 = time();
    si3 = as.integer(as.scalar(randSeeds[3*(i-1)+3,1]));
    Mtemp = decisionTree(X=Xi, y=yi, ctypes=ctypes, max_depth=max_depth, min_split=min_split,
      min_leaf=min_leaf, max_features=max_features, max_values=max_values,
      impurity=impurity, seed=si3, verbose=verbose);
    M[i,1:length(Mtemp)] = matrix(Mtemp, rows=1, cols=length(Mtemp));
    if( verbose )
      print("-- ["+i+"] trained decision tree in "+(time()-t2)/1e9+" seconds.");
  }
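  # prepend the per-tree feature indicators: each output row is
  # [0/1 feature indicator | linearized tree]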
  M = cbind(F, M);
  if(verbose) {
    print("randomForest: trained ensemble with num_trees="+num_trees+" in "+(time()-t1)/1e9+" seconds.");
  }
}