#-------------------------------------------------------------
#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
#
#-------------------------------------------------------------
# This script implements random forest for recoded and binned categorical and
# numerical input features. In detail, we train multiple CART (classification
# and regression trees) decision trees in parallel and use them as an ensemble
# classifier/regressor. Each tree is trained on a sample of observations (rows)
# and optionally a subset of features (columns). During tree construction, split
# candidates are additionally chosen from a sample of the remaining features.
#
# .. code-block:: text
#
# For example, given a feature matrix with features [a,b,c,d]
# and the following two trees, M (the output) would look as follows:
#
# (L1)            |a<7|                     |d<5|
#                 /     \                   /     \
# (L2)        |c<3|     |b<4|           |a<7|     P3:2
#             /   \     /   \           /   \
# (L3)     P1:2  P2:1 P3:1  P4:2     P1:2  P2:1
#
# --> M :=
# [[1, 7, 3, 3, 2, 4, 0, 2, 0, 1, 0, 1, 0, 2],  (1st tree)
#  [4, 5, 1, 7, 0, 2, 0, 2, 0, 1, 0, 0, 0, 0]]  (2nd tree)
#  |(L1)||   (L2)   ||         (L3)         |
#
# Each tree is prefixed by a 0/1 indicator vector of its sampled features
# (e.g., [1,1,1,0] if features a,b,c of the four were sampled); without
# feature sampling (feature_frac = 1) this prefix is all ones.
#
# .. code-block:: python
#
# >>> import numpy as np
# >>> from systemds.context import SystemDSContext
# >>> from systemds.operator.algorithm import randomForest, randomForestPredict
# >>>
# >>> # tiny toy dataset
# >>> X = np.array([[1],
# ... [2],
# ... [10],
# ... [11]], dtype=np.int64)
# >>> y = np.array([[1],
# ... [1],
# ... [2],
# ... [2]], dtype=np.int64)
# >>>
# >>> with SystemDSContext() as sds:
# ... X_sds = sds.from_numpy(X)
# ... y_sds = sds.from_numpy(y)
# ...
# ... ctypes = sds.from_numpy(np.array([[1, 2]], dtype=np.int64))
# ...
# ... # train a 4-tree forest (no sampling)
# ... M = randomForest(
# ... X_sds, y_sds, ctypes,
# ... num_trees = 4,
# ... sample_frac = 1.0,
# ... feature_frac = 1.0,
# ... max_depth = 3,
# ... min_leaf = 1,
# ... min_split = 2,
# ... seed = 42
# ... )
# ...
# ... preds = randomForestPredict(X_sds, ctypes, M).compute()
# ... print(preds)
# [[1.]
# [1.]
# [2.]
# [2.]]
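#
# Continuing the example above (a minimal sketch, assuming the model M and
# the numpy array X are still in scope inside the same SystemDSContext block):
# the first ncol(X) columns of the returned model are the per-tree 0/1
# feature-usage indicators, the remaining 2*(2^max_depth-1) columns hold the
# linearized trees.
#
# .. code-block:: python
#
# >>> M_np = M.compute()               # materialize the model as a numpy array
# >>> F_ind = M_np[:, :X.shape[1]]     # 0/1 feature indicators (all ones here)
# >>> trees = M_np[:, X.shape[1]:]     # linearized trees, 2*(2^3-1)=14 values each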
#
#
# INPUT:
# ------------------------------------------------------------------------------
# X             Feature matrix in recoded/binned representation
# y             Label matrix in recoded/binned representation
# ctypes        Row-Vector of column types [1 scale/ordinal, 2 categorical]
#               of shape 1-by-(ncol(X)+1), where the last entry is the y type
# num_trees     Number of trees to be learned in the random forest model
# sample_frac   Sample fraction of examples for each tree in the forest
# feature_frac  Sample fraction of features for each tree in the forest
# max_depth     Maximum depth of the learned tree (stopping criterion)
# min_leaf      Minimum number of samples in leaf nodes (stopping criterion)
# min_split     Minimum number of samples in leaf for attempting a split
# max_features  Parameter controlling the number of features used as split
#               candidates at tree nodes: m = ceil(num_features^max_features)
# max_values    Parameter controlling the number of values per feature used
#               as split candidates: nb = ceil(num_values^max_values)
# impurity      Impurity measure: entropy, gini (default), rss (regression)
# seed          Fixed seed for randomization of samples and split candidates
# verbose       Flag indicating verbose debug output
# ------------------------------------------------------------------------------
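#
# For example, with the default max_features = 0.5 and a feature matrix of
# 100 columns, each tree node evaluates m = ceil(100^0.5) = 10 candidate
# features; with max_values = 1.0, all nb = num_values distinct values of a
# feature remain split candidates.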
#
# OUTPUT:
# ------------------------------------------------------------------------------
# M Matrix M containing the learned trees, in linearized form.
# ------------------------------------------------------------------------------
m_randomForest = function(Matrix[Double] X, Matrix[Double] y, Matrix[Double] ctypes,
    Int num_trees = 16, Double sample_frac = 0.1, Double feature_frac = 1.0,
    Int max_depth = 10, Int min_leaf = 20, Int min_split = 50,
    Double max_features = 0.5, Double max_values = 1.0,
    String impurity = "gini", Int seed = -1, Boolean verbose = FALSE)
  return(Matrix[Double] M)
{
  t1 = time();
  # validation and initialization of reproducible seeds
  if(verbose) {
    print("randomForest: initialize with num_trees=" + num_trees + ", sample_frac=" + sample_frac
      + ", feature_frac=" + feature_frac + ", impurity=" + impurity + ", seed=" + seed + ".");
  }
  if(ncol(ctypes) != ncol(X)+1)
    stop("randomForest: inconsistent num features (incl. label) and col types: "+ncol(X)+" vs "+ncol(ctypes)+".");
  if( sum(X<=0) != 0 )
    stop("randomForest: feature matrix X is not properly recoded/binned: "+sum(X<=0));
  if(sum(y <= 0) != 0)
    stop("randomForest: y is not properly recoded and binned (contiguous positive integers).");
  if(max(y) == 1)
    stop("randomForest: y contains only one class label.");
  lseed = as.integer(ifelse(seed!=-1, seed, as.scalar(rand(rows=1,cols=1,min=0, max=1e9))));
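  # draw three seeds per tree: one for row sampling (si1), one for feature
  # sampling (si2), and one for split-candidate sampling in decisionTree (si3)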
  randSeeds = rand(rows = 3 * num_trees, cols = 1, seed=lseed, min=0, max=1e9);
  # training of num_trees decision trees
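  # M stores each tree in linearized form, 2 values per node of a complete
  # binary tree of depth max_depth; F tracks the per-tree 0/1 feature-usage
  # indicators (all ones unless feature_frac < 1)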
  M = matrix(0, rows=num_trees, cols=2*(2^max_depth-1));
  F = matrix(1, rows=num_trees, cols=ncol(X));
  parfor(i in 1:num_trees) {
    if( verbose )
      print("randomForest: start training tree "+i+"/"+num_trees+".");
    # step 1: sample data
    Xi = X; yi = y;
    if( sample_frac < 1.0 ) {
      si1 = as.integer(as.scalar(randSeeds[3*(i-1)+1,1]));
      I1 = rand(rows=nrow(X), cols=1, seed=si1) <= sample_frac;
      if( sum(I1) <= 1 ) # min 2 tuples
        I1[1:2,] = matrix(1,2,1);
      Xi = removeEmpty(target=X, margin="rows", select=I1);
      yi = removeEmpty(target=y, margin="rows", select=I1);
    }
    # step 2: sample features
    if( feature_frac < 1.0 ) {
      si2 = as.integer(as.scalar(randSeeds[3*(i-1)+2,1]));
      I2 = rand(rows=ncol(X), cols=1, seed=si2) <= feature_frac;
      Xi = removeEmpty(target=Xi, margin="cols", select=I2);
      F[i,] = t(I2);
    }
    if( verbose )
      print("-- ["+i+"] sampled "+nrow(Xi)+"/"+nrow(X)+" rows and "+ncol(Xi)+"/"+ncol(X)+" cols.");
    # step 3: train decision tree
    t2 = time();
    si3 = as.integer(as.scalar(randSeeds[3*(i-1)+3,1]));
    Mtemp = decisionTree(X=Xi, y=yi, ctypes=ctypes, max_depth=max_depth, min_split=min_split,
      min_leaf=min_leaf, max_features=max_features, max_values=max_values,
      impurity=impurity, seed=si3, verbose=verbose);
    M[i,1:length(Mtemp)] = matrix(Mtemp, rows=1, cols=length(Mtemp));
    if( verbose )
      print("-- ["+i+"] trained decision tree in "+(time()-t2)/1e9+" seconds.");
  }
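  # prepend the per-tree feature indicators: each output row is
  # [0/1 feature indicator | linearized tree]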
  M = cbind(F, M);
  if(verbose) {
    print("randomForest: trained ensemble with num_trees="+num_trees+" in "+(time()-t1)/1e9+" seconds.");
  }
}