#-------------------------------------------------------------
#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
#
#-------------------------------------------------------------

# This script implements a random forest for recoded and binned categorical and
# numerical input features. In detail, we train multiple CART (classification
# and regression trees) decision trees in parallel and use them as an ensemble
# classifier/regressor. Each tree is trained on a sample of observations (rows)
# and optionally a subset of features (columns). During tree construction, split
# candidates are additionally chosen from a sample of the remaining features.
#
# .. code-block:: text
#
#   For example, given a feature matrix with features [a,b,c,d]
#   and the following two trees, M (the output) would look as follows:
#
#   (L1)        |a<7|                   |d<5|
#              /     \                 /     \
#   (L2)    |c<3|     |b<4|         |a<7|    P3:2
#          /     \   /     \       /     \
#   (L3)  P1:2   P2:1 P3:1   P4:2  P1:2   P2:1
#
#   --> M :=
#   [[1, 7, 3, 3, 2, 4, 0, 2, 0, 1, 0, 1, 0, 2],  (1st tree)
#    [4, 5, 1, 7, 0, 2, 0, 2, 0, 1, 0, 0, 0, 0]]  (2nd tree)
#    |(L1)| |  (L2)  |  |        (L3)        |
#
#   Each tree is prefixed by a 0/1 indicator vector of its sampled
#   features, e.g., [1,1,1,0] if features a,b,c of the four were
#   sampled (all ones if feature_frac = 1.0, i.e., no feature sampling).
#
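# As an illustrative sketch of this layout (not the actual SystemDS
# predictor, and handling scale/ordinal splits only), a single linearized
# tree can be traversed as follows; predict_one is a hypothetical helper:
#
# .. code-block:: python
#
#   import numpy as np
#
#   def predict_one(tree_row, x):
#       # tree_row: (feature, value) pairs in heap order, i.e., node i
#       # has children 2i and 2i+1 (1-based); feature 0 marks a leaf
#       # whose value is the predicted class label
#       node = 1
#       while True:
#           feat = int(tree_row[2 * (node - 1)])
#           val = tree_row[2 * (node - 1) + 1]
#           if feat == 0:
#               return val  # leaf: return class label
#           # scale/ordinal split "x[feat] < val": left child if true
#           node = 2 * node if x[feat - 1] < val else 2 * node + 1
#
#   # first tree from the example above: a<7 and c<3 lead to P1:2
#   tree = np.array([1, 7, 3, 3, 2, 4, 0, 2, 0, 1, 0, 1, 0, 2.0])
#   print(predict_one(tree, np.array([5, 6, 2, 9])))  # -> 2.0
#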
# .. code-block:: python
#
#   >>> import numpy as np
#   >>> from systemds.context import SystemDSContext
#   >>> from systemds.operator.algorithm import randomForest, randomForestPredict
#   >>>
#   >>> # tiny toy dataset
#   >>> X = np.array([[1],
#   ...               [2],
#   ...               [10],
#   ...               [11]], dtype=np.int64)
#   >>> y = np.array([[1],
#   ...               [1],
#   ...               [2],
#   ...               [2]], dtype=np.int64)
#   >>>
#   >>> with SystemDSContext() as sds:
#   ...     X_sds = sds.from_numpy(X)
#   ...     y_sds = sds.from_numpy(y)
#   ...
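#   ...     # column types: one scale/ordinal feature (1), categorical label (2)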
#   ...     ctypes = sds.from_numpy(np.array([[1, 2]], dtype=np.int64))
#   ...
#   ...     # train a 4-tree forest (no sampling)
#   ...     M = randomForest(
#   ...         X_sds, y_sds, ctypes,
#   ...         num_trees=4,
#   ...         sample_frac=1.0,
#   ...         feature_frac=1.0,
#   ...         max_depth=3,
#   ...         min_leaf=1,
#   ...         min_split=2,
#   ...         seed=42
#   ...     )
#   ...
#   ...     preds = randomForestPredict(X_sds, ctypes, M).compute()
#   ...     print(preds)
#   [[1.]
#    [1.]
#    [2.]
#    [2.]]
#
#
# INPUT:
# ------------------------------------------------------------------------------
# X             Feature matrix in recoded/binned representation
# y             Label matrix in recoded/binned representation
# ctypes        Row vector of column types [1 scale/ordinal, 2 categorical]
#               of shape 1-by-(ncol(X)+1), where the last entry is the y type
# num_trees     Number of trees to be learned in the random forest model
# sample_frac   Sample fraction of examples for each tree in the forest
# feature_frac  Sample fraction of features for each tree in the forest
# max_depth     Maximum depth of the learned trees (stopping criterion)
# min_leaf      Minimum number of samples in leaf nodes (stopping criterion)
# min_split     Minimum number of samples at a node for attempting a split
# max_features  Parameter controlling the number of features used as split
#               candidates at tree nodes: m = ceil(num_features^max_features)
# max_values    Parameter controlling the number of values per feature used
#               as split candidates: nb = ceil(num_values^max_values)
# impurity      Impurity measure: entropy, gini (default), rss (regression)
# seed          Fixed seed for randomization of samples and split candidates
# verbose       Flag indicating verbose debug output
# ------------------------------------------------------------------------------
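#
# For example, with max_features = 0.5 and 100 remaining features, each node
# evaluates m = ceil(100^0.5) = 10 sampled features as split candidates;
# max_features = 1.0 uses all remaining features.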
#
# OUTPUT:
# ------------------------------------------------------------------------------
# M             Matrix M containing the learned trees in linearized form,
#               each row prefixed by the tree's 0/1 feature indicator vector
# ------------------------------------------------------------------------------

m_randomForest = function(Matrix[Double] X, Matrix[Double] y, Matrix[Double] ctypes,
    Int num_trees = 16, Double sample_frac = 0.1, Double feature_frac = 1.0,
    Int max_depth = 10, Int min_leaf = 20, Int min_split = 50,
    Double max_features = 0.5, Double max_values = 1.0,
    String impurity = "gini", Int seed = -1, Boolean verbose = FALSE)
  return(Matrix[Double] M)
{
  t1 = time();

  # validation and initialization of reproducible seeds
  if(verbose) {
    print("randomForest: initialize with num_trees=" + num_trees + ", sample_frac=" + sample_frac
      + ", feature_frac=" + feature_frac + ", impurity=" + impurity + ", seed=" + seed + ".");
  }
  if(ncol(ctypes) != ncol(X)+1)
    stop("randomForest: inconsistent num features (incl. label) and col types: "+ncol(X)+" vs "+ncol(ctypes)+".");
  if( sum(X<=0) != 0 )
    stop("randomForest: feature matrix X is not properly recoded/binned: "+sum(X<=0));
  if(sum(y <= 0) != 0)
    stop("randomForest: y is not properly recoded and binned (contiguous positive integers).");
  if(max(y) == 1)
    stop("randomForest: y contains only one class label.");

  lseed = as.integer(ifelse(seed!=-1, seed, as.scalar(rand(rows=1,cols=1,min=0, max=1e9))));
  randSeeds = rand(rows = 3 * num_trees, cols = 1, seed=lseed, min=0, max=1e9);
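  # (three seeds per tree: row sampling, feature sampling, and split candidates)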

  # training of num_trees decision trees
  M = matrix(0, rows=num_trees, cols=2*(2^max_depth-1));
  F = matrix(1, rows=num_trees, cols=ncol(X));
  parfor(i in 1:num_trees) {
    if( verbose )
      print("randomForest: start training tree "+i+"/"+num_trees+".");

    # step 1: sample data
    Xi = X; yi = y;
    if( sample_frac < 1.0 ) {
      si1 = as.integer(as.scalar(randSeeds[3*(i-1)+1,1]));
      I1 = rand(rows=nrow(X), cols=1, seed=si1) <= sample_frac;
      if( sum(I1) <= 1 ) # min 2 tuples
        I1[1:2,] = matrix(1,2,1);
      Xi = removeEmpty(target=X, margin="rows", select=I1);
      yi = removeEmpty(target=y, margin="rows", select=I1);
    }

    # step 2: sample features
    if( feature_frac < 1.0 ) {
      si2 = as.integer(as.scalar(randSeeds[3*(i-1)+2,1]));
      I2 = rand(rows=ncol(X), cols=1, seed=si2) <= feature_frac;
      Xi = removeEmpty(target=Xi, margin="cols", select=I2);
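      # record which features this tree uses (prepended to M below)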
      F[i,] = t(I2);
    }

    if( verbose )
      print("-- ["+i+"] sampled "+nrow(Xi)+"/"+nrow(X)+" rows and "+ncol(Xi)+"/"+ncol(X)+" cols.");

    # step 3: train decision tree
    t2 = time();
    si3 = as.integer(as.scalar(randSeeds[3*(i-1)+3,1]));
    Mtemp = decisionTree(X=Xi, y=yi, ctypes=ctypes, max_depth=max_depth, min_split=min_split,
      min_leaf=min_leaf, max_features=max_features, max_values=max_values,
      impurity=impurity, seed=si3, verbose=verbose);
    M[i,1:length(Mtemp)] = matrix(Mtemp, rows=1, cols=length(Mtemp));
    if( verbose )
      print("-- ["+i+"] trained decision tree in "+(time()-t2)/1e9+" seconds.");
  }
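  # prepend per-tree feature indicator vectors (all ones if feature_frac = 1.0)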
  M = cbind(F, M);

  if(verbose) {
    print("randomForest: trained ensemble with num_trees="+num_trees+" in "+(time()-t1)/1e9+" seconds.");
  }
}