#-------------------------------------------------------------
#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
#
#-------------------------------------------------------------
# This script implements decision trees for recoded and binned categorical and
# numerical input features. We train a single CART (classification and
# regression tree) decision tree; depending on the provided labels y, it
# performs either classification (majority vote per leaf) or regression
# (average per leaf).
#
# .. code-block::
#
#   For example, given a feature matrix with features [a,b,c,d]
#   and the following tree, M would look as follows:
#
#   (L1)               |d<5|
#                     /     \
#   (L2)           P1:2    |a<7|
#                          /   \
#   (L3)                 P2:2  P3:1
#
#   --> M :=
#   [[4, 5, 0, 2, 1, 7, 0, 0, 0, 0, 0, 2, 0, 1]]
#    |(L1)| |  (L2)  | |        (L3)         |
#
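#   In this linearized heap encoding, node i occupies columns 2*i-1 and
#   2*i of M: inner nodes store the pair (feature, split value) with
#   <= semantics, leaf nodes store (0, label), children of node i have
#   IDs 2*i and 2*i+1, and unused node slots remain zero.
#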
# INPUT:
# ------------------------------------------------------------------------------
# X              Feature matrix in recoded/binned representation
# y              Label matrix in recoded/binned representation
# ctypes         Row vector of column types [1 scale/ordinal, 2 categorical]
#                of shape 1-by-(ncol(X)+1), where the last entry is the y type
# max_depth      Maximum depth of the learned tree (stopping criterion)
# min_leaf       Minimum number of samples in leaf nodes (stopping criterion),
#                odd number recommended to avoid 50/50 leaf label decisions
# min_split      Minimum number of samples in a node for attempting a split
# max_features   Parameter controlling the number of features used as split
#                candidates at tree nodes: m = ceil(num_features^max_features)
# max_values     Parameter controlling the number of values per feature used
#                as split candidates: nb = ceil(num_values^max_values)
# max_dataratio  Parameter in [0,1] controlling when to materialize data
#                subsets of X and y on node splits. When set to 0, we always
#                scan the original X and y, which has the benefit of avoiding
#                the allocation and maintenance of data for all active nodes.
#                When set to 0.01, we rematerialize whenever the sub-tree data
#                would be less than 1% of the last materialized parent data size.
# impurity       Impurity measure: entropy, gini (default), rss (regression)
# seed           Fixed seed for randomization of samples and split candidates
# verbose        Flag indicating verbose debug output
# ------------------------------------------------------------------------------
#
# OUTPUT:
# ------------------------------------------------------------------------------
# M              Matrix M containing the learned tree, in linearized form
# ------------------------------------------------------------------------------
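#
# Example (a minimal usage sketch; the file names and the choice of all
# scale/ordinal features with a categorical label are assumptions):
#
#   X = read("X.mtx");                          # recoded/binned features
#   y = read("y.mtx");                          # recoded/binned labels
#   ctypes = matrix(1, rows=1, cols=ncol(X)+1); # 1 = scale/ordinal
#   ctypes[1,ncol(X)+1] = 2;                    # categorical label type
#   M = decisionTree(X=X, y=y, ctypes=ctypes,
#         max_depth=8, min_leaf=11, verbose=TRUE);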
m_decisionTree = function(Matrix[Double] X, Matrix[Double] y, Matrix[Double] ctypes,
Int max_depth = 10, Int min_leaf = 20, Int min_split = 50,
Double max_features = 0.5, Double max_values = 1.0, Double max_dataratio = 0.25,
String impurity = "gini", Int seed = -1, Boolean verbose = FALSE)
return(Matrix[Double] M)
{
t1 = time();
# validation checks
if( max_depth > 32 )
stop("decisionTree: invalid max_depth > 32: "+max_depth);
if( sum(X<=0) != 0 )
stop("decisionTree: feature matrix X is not properly recoded/binned (values <= 0): "+sum(X<=0));
if( sum(abs(X-round(X))>1e-14) != 0 )
stop("decisionTree: feature matrix X is not properly recoded/binned (non-integer): "+sum(abs(X-round(X))>1e-14));
if( sum(y<=0) != 0 )
stop("decisionTree: label vector y is not properly recoded/binned: "+sum(y<=0));
# initialize input data and basic statistics
# (we keep y2 and the indicators I in transposed form for sparsity exploitation)
m = nrow(X); n = ncol(X);
classify = (as.scalar(ctypes[1,n+1]) == 2);
fdom = max(colMaxs(X),2); # num distinct per feature
foffb = t(cumsum(t(fdom))) - fdom; # feature begin
foffe = t(cumsum(t(fdom))); # feature end
rix = matrix(seq(1,m)%*%matrix(1,1,n), m*n, 1)
cix = matrix(X + foffb, m*n, 1);
X2 = table(rix, cix, 1, m, as.scalar(foffe[,n]), FALSE); #one-hot encoded
y2 = table(seq(1,m), y);
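# e.g., fdom=[3,2] gives foffb=[0,3] and foffe=[3,5]: value v of feature j
# maps to column foffb[1,j]+v of the m x 5 one-hot matrix X2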
cnt = colSums(X2);
I = matrix(1, rows=1, cols=nrow(X));
if( verbose ) {
print("decisionTree: initialize with max_depth=" + max_depth + ", max_features="
+ max_features + ", max_dataratio=" + max_dataratio + ", impurity="
+ impurity + ", seed=" + seed + ".");
print("decisionTree: basic statistics:");
print("-- impurity: " + as.scalar(computeImpurity(y2, I, impurity)));
print("-- minFeatureCount: " + min(cnt));
print("-- maxFeatureCount: " + max(cnt));
}
# queue-based node splitting
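# (heap layout: a full binary tree of depth max_depth has 2^max_depth-1
# nodes, each represented by two entries in M)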
M = matrix(0, rows=1, cols=2*(2^max_depth-1))
queue = list(list(1,I,X2,y2)); # entries: (node ID, row indicator, X2, y2)
maxPath = 1;
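# level-wise (FIFO) node processing: pop a node, find its best split, and
# either queue the two children or emit a leaf label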
while( length(queue) > 0 ) {
# pop next node from queue for splitting
[queue, node0] = remove(queue, 1);
node = as.list(node0);
nID = as.scalar(node[1]);
nI = as.matrix(node[2]);
X2 = as.matrix(node[3]);
y2 = as.matrix(node[4]);
if(verbose)
print("decisionTree: attempting split of node "+nID+" ("+sum(nI)+" rows)");
# optional rematerialization of data per node
if( sum(nI) < max_dataratio*ncol(nI) ) {
if(verbose)
print("-- compacting data: "+ncol(nI)+" --> "+sum(nI));
X2 = removeEmpty(target=X2, margin="rows", select=t(nI));
y2 = removeEmpty(target=y2, margin="rows", select=t(nI));
nI = matrix(1, rows=1, cols=nrow(X2)); # reset indicator: all rows of compacted data
}
# find best split attribute
nSeed = ifelse(seed==-1, seed, seed*nID);
[f, v, IDleft, Ileft, IDright, Iright] = findBestSplit(
X2, y2, foffb, foffe, nID, nI, min_leaf, max_features, max_values, impurity, nSeed);
validSplit = sum(Ileft) >= min_leaf & sum(Iright) >= min_leaf;
if(verbose)
print("-- best split: f"+f+" <= "+v+" --> valid="+validSplit);
if( validSplit )
M[, 2*nID-1:2*nID] = t(as.matrix(list(f,v)));
else
M[, 2*nID] = computeLeafLabel(y2, nI, classify, verbose);
maxPath = max(maxPath, floor(log(nID,2)+1));
# split data, finalize or recurse
if( validSplit ) {
if( sum(Ileft) >= min_split & floor(log(IDleft,2))+2 < max_depth )
queue = append(queue, list(IDleft,Ileft,X2,y2));
else
M[,2*IDleft] = computeLeafLabel(y2, Ileft, classify, verbose);
if( sum(Iright) >= min_split & floor(log(IDright,2))+2 < max_depth )
queue = append(queue, list(IDright,Iright,X2,y2));
else
M[,2*IDright] = computeLeafLabel(y2, Iright, classify, verbose);
maxPath = max(maxPath, floor(log(IDleft,2)+1));
}
}
# summary and encoding
M = M[1, 1:2*(2^maxPath-1)];
if(verbose) {
print("decisionTree: final constructed tree (linearized):");
print("--" + toString(M));
}
}
findBestSplit = function(Matrix[Double] X2, Matrix[Double] y2, Matrix[Double] foffb, Matrix[Double] foffe,
Int ID, Matrix[Double] I, Int min_leaf, Double max_features, Double max_values, String impurity, Int seed)
return(Int f, Int v, Int IDleft, Matrix[Double] Ileft, Int IDright, Matrix[Double] Iright)
{
# sample features iff max_features < 1
n = ncol(foffb);
numI = sum(I);
feat = seq(1,n);
if( max_features < 1.0 ) {
rI = rand(rows=n, cols=1, seed=seed) <= (n^max_features/n);
feat = removeEmpty(target=feat, margin="rows", select=rI);
if( sum(feat) == 0 ) #sample at least one
feat[1,1] = round(rand(rows=1, cols=1, min=1, max=n));
}
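# (Bernoulli sampling with p = n^max_features/n retains n^max_features
# features in expectation, matching m = ceil(num_features^max_features))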
# evaluate features and feature splits
# (both categorical and numerical are treated similarly by
# finding a cutoff point in the recoded/binned representation)
R = matrix(0, rows=3, cols=nrow(feat));
parfor( i in 1:nrow(feat) ) {
f = as.scalar(feat[i]); # feature
beg = as.scalar(foffb[1,f])+1; # feature start in X2
end = as.scalar(foffe[1,f]); # feature end in X2
belen = end-beg; #numFeat - 1
while(FALSE){} # make beg/end known
# construct 0/1 predicate vectors with <= semantics
# find rows that match at least one value and appear in I
# (vectorized evaluation, each column in P is a split candidate)
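# e.g., belen=3 gives fP = [1 1 1; 0 1 1; 0 0 1], where column j
# encodes the predicate value <= j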
fP = upper.tri(target=matrix(1,belen,belen), diag=TRUE);
vI = seq(1,belen);
if( max_values < 1.0 & ncol(fP)>10 ) {
rI2 = rand(rows=ncol(fP),cols=1,seed=seed) <= (ncol(fP)^max_values/ncol(fP));
fP = removeEmpty(target=fP, margin="cols", select=t(rI2));
vI = removeEmpty(target=vI, margin="rows", select=rI2);
}
P = matrix(0, ncol(X2), ncol(fP));
P[beg:end-1,1:ncol(fP)] = fP;
Ileft = (t(X2 %*% P) * I) != 0;
Iright = (Ileft==0) * I;
# compute information gain for all split candidates
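# IG = imp(D) - |Dleft|/|D| * imp(Dleft) - |Dright|/|D| * imp(Dright)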
ig = as.scalar(computeImpurity(y2, I, impurity))
- rowSums(Ileft)/numI * computeImpurity(y2, Ileft, impurity)
- rowSums(Iright)/numI * computeImpurity(y2, Iright, impurity);
ig = replace(target=ig, pattern=NaN, replacement=0);
# track best split value and index, incl validity
valid = (rowSums(Ileft)>=min_leaf) & (rowSums(Iright)>=min_leaf);
bestig = max(valid*ig);
bestv = ifelse(bestig>0, nrow(valid)-as.scalar(rowIndexMax(t(rev(valid*ig))))+beg, -1);
if( bestv >= 0 )
bestv = as.scalar(vI[bestv-beg+1,1])+beg-1;
R[,i] = as.matrix(list(f, bestig, bestv));
}
ix = as.scalar(rowIndexMax(R[2,]));
# extract indicators and IDs
IDleft = 2 * ID;
IDright= 2 * ID + 1;
f = as.integer(as.scalar(feat[ix,1]));
beg = as.scalar(foffb[1,f]);
v = as.integer(as.scalar(R[3,ix])-beg);
if( max(R[2,]) > 0 ) {
p = table(seq(beg+1, beg+v), 1, ncol(X2), 1); # predicate: feature f value <= v
Ileft = (t(X2 %*% p) * I) != 0;
Iright = I * (Ileft==0);
}
else { # no information gain
Ileft = as.matrix(0);
Iright = as.matrix(0);
}
}
computeImpurity = function(Matrix[Double] y2, Matrix[Double] I, String impurity)
return(Matrix[Double] score)
{
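# I may contain one row per split candidate (vectorized evaluation);
# f then holds per-candidate relative label frequencies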
f = (I %*% y2) / rowSums(I); # rel. freq. per category/bin
score = matrix(0, nrow(I), 1);
if( impurity == "gini" )
score = 1 - rowSums(f^2); # sum(f*(1-f));
else if( impurity == "entropy" )
score = rowSums(-f * log(f));
else if( impurity == "rss" ) { # residual sum of squares
yhat = f %*% seq(1,ncol(f)); # yhat
res = outer(yhat, t(rowIndexMax(y2)), "-"); # yhat-y
score = rowSums((I * res)^2); # sum((yhat-y)^2)
}
else
stop("decisionTree: unsupported impurity measure: "+impurity);
}
computeLeafLabel = function(Matrix[Double] y2, Matrix[Double] I, Boolean classify, Boolean verbose)
return(Double label)
{
f = (I %*% y2) / sum(I);
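# classification: majority class via rowIndexMax(f);
# regression: mean label as frequency-weighted sum f %*% seq(1,ncol(f))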
label = as.scalar(ifelse(classify,
rowIndexMax(f), f %*% seq(1,ncol(f))));
if(verbose)
print("-- leaf node label: " + label +" ("+sum(I)*max(f)+"/"+sum(I)+")");
}