#-------------------------------------------------------------
#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
#
#-------------------------------------------------------------
# This script implements decision trees for recoded and binned categorical and
# numerical input features. We train a single CART (classification and
# regression tree) decision tree; depending on the provided labels y, it
# performs either classification (majority vote per leaf) or regression
# (average per leaf).
#
# .. code-block::
#
#   For example, given a feature matrix with features [a,b,c,d]
#   and the following tree, M would look as follows:
#
#   (L1)               |d<5|
#                     /     \
#   (L2)           P1:2    |a<7|
#                          /   \
#   (L3)                 P2:2  P3:1
#
#   --> M :=
#   [[4, 5, 0, 2, 1, 7, 0, 0, 0, 0, 0, 2, 0, 1]]
#    |(L1)| |  (L2)  | |        (L3)         |
#
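#   In this linearized heap encoding, node i occupies columns 2*i-1 and
#   2*i of M: inner nodes store the pair (feature, split value) with
#   <= semantics, leaf nodes store (0, label), children of node i have
#   IDs 2*i and 2*i+1, and unused node slots remain zero.
#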
# INPUT:
# ------------------------------------------------------------------------------
# X              Feature matrix in recoded/binned representation
# y              Label matrix in recoded/binned representation
# ctypes         Row vector of column types [1 scale/ordinal, 2 categorical]
#                of shape 1-by-(ncol(X)+1), where the last entry is the y type
# max_depth      Maximum depth of the learned tree (stopping criterion)
# min_leaf       Minimum number of samples in leaf nodes (stopping criterion),
#                odd number recommended to avoid 50/50 leaf label decisions
# min_split      Minimum number of samples in a node for attempting a split
# max_features   Parameter controlling the number of features used as split
#                candidates at tree nodes: m = ceil(num_features^max_features)
# max_values     Parameter controlling the number of values per feature used
#                as split candidates: nb = ceil(num_values^max_values)
# max_dataratio  Parameter in [0,1] controlling when to materialize data
#                subsets of X and y on node splits. When set to 0, we always
#                scan the original X and y, which has the benefit of avoiding
#                the allocation and maintenance of data for all active nodes.
#                When set to 0.01, we rematerialize whenever the sub-tree data
#                would be less than 1% of the last materialized parent data size.
# impurity       Impurity measure: entropy, gini (default), rss (regression)
# seed           Fixed seed for randomization of samples and split candidates
# verbose        Flag indicating verbose debug output
# ------------------------------------------------------------------------------
#
# OUTPUT:
# ------------------------------------------------------------------------------
# M              Matrix M containing the learned tree, in linearized form
# ------------------------------------------------------------------------------
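#
# Example (a minimal usage sketch; the file names and the choice of all
# scale/ordinal features with a categorical label are assumptions):
#
#   X = read("X.mtx");                          # recoded/binned features
#   y = read("y.mtx");                          # recoded/binned labels
#   ctypes = matrix(1, rows=1, cols=ncol(X)+1); # 1 = scale/ordinal
#   ctypes[1,ncol(X)+1] = 2;                    # categorical label type
#   M = decisionTree(X=X, y=y, ctypes=ctypes,
#         max_depth=8, min_leaf=11, verbose=TRUE);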
m_decisionTree = function(Matrix[Double] X, Matrix[Double] y, Matrix[Double] ctypes,
Int max_depth = 10, Int min_leaf = 20, Int min_split = 50,
Double max_features = 0.5, Double max_values = 1.0, Double max_dataratio = 0.25,
String impurity = "gini", Int seed = -1, Boolean verbose = FALSE)
return(Matrix[Double] M)
{
t1 = time();
# validation checks
if( max_depth > 32 )
stop("decisionTree: invalid max_depth > 32: "+max_depth);
if( sum(X<=0) != 0 )
stop("decisionTree: feature matrix X is not properly recoded/binned (values <= 0): "+sum(X<=0));
if( sum(abs(X-round(X))>1e-14) != 0 )
stop("decisionTree: feature matrix X is not properly recoded/binned (non-integer): "+sum(abs(X-round(X))>1e-14));
if( sum(y<=0) != 0 )
stop("decisionTree: label vector y is not properly recoded/binned: "+sum(y<=0));
# initialize input data and basic statistics
# (we keep y2 and the indicators I in transposed form for sparsity exploitation)
m = nrow(X); n = ncol(X);
classify = (as.scalar(ctypes[1,n+1]) == 2);
fdom = max(colMaxs(X),2); # num distinct per feature
foffb = t(cumsum(t(fdom))) - fdom; # feature begin
foffe = t(cumsum(t(fdom))); # feature end
rix = matrix(seq(1,m)%*%matrix(1,1,n), m*n, 1)
cix = matrix(X + foffb, m*n, 1);
X2 = table(rix, cix, 1, m, as.scalar(foffe[,n]), FALSE); #one-hot encoded
y2 = table(seq(1,m), y);
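# e.g., fdom=[3,2] gives foffb=[0,3] and foffe=[3,5]: value v of feature j
# maps to column foffb[1,j]+v of the m x 5 one-hot matrix X2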
cnt = colSums(X2);
I = matrix(1, rows=1, cols=nrow(X));
if( verbose ) {
print("decisionTree: initialize with max_depth=" + max_depth + ", max_features="
+ max_features + ", max_dataratio=" + max_dataratio + ", impurity="
+ impurity + ", seed=" + seed + ".");
print("decisionTree: basic statistics:");
print("-- impurity: " + as.scalar(computeImpurity(y2, I, impurity)));
print("-- minFeatureCount: " + min(cnt));
print("-- maxFeatureCount: " + max(cnt));
}
# queue-based node splitting
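# (heap layout: a full binary tree of depth max_depth has 2^max_depth-1
# nodes, each represented by two entries in M)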
M = matrix(0, rows=1, cols=2*(2^max_depth-1))
queue = list(list(1,I,X2,y2)); # entries: (node ID, row indicator, X2, y2)
maxPath = 1;
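# level-wise (FIFO) node processing: pop a node, find its best split, and
# either queue the two children or emit a leaf label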
while( length(queue) > 0 ) {
# pop next node from queue for splitting
[queue, node0] = remove(queue, 1);
node = as.list(node0);
nID = as.scalar(node[1]);
nI = as.matrix(node[2]);
X2 = as.matrix(node[3]);
y2 = as.matrix(node[4]);
if(verbose)
print("decisionTree: attempting split of node "+nID+" ("+sum(nI)+" rows)");
# optional rematerialization of data per node
if( sum(nI) < max_dataratio*ncol(nI) ) {
if(verbose)
print("-- compacting data: "+ncol(nI)+" --> "+sum(nI));
X2 = removeEmpty(target=X2, margin="rows", select=t(nI));
y2 = removeEmpty(target=y2, margin="rows", select=t(nI));
nI = matrix(1, rows=1, cols=nrow(X2)); # reset indicator: all rows of compacted data
}
# find best split attribute
nSeed = ifelse(seed==-1, seed, seed*nID);
[f, v, IDleft, Ileft, IDright, Iright] = findBestSplit(
X2, y2, foffb, foffe, nID, nI, min_leaf, max_features, max_values, impurity, nSeed);
validSplit = sum(Ileft) >= min_leaf & sum(Iright) >= min_leaf;
if(verbose)
print("-- best split: f"+f+" <= "+v+" --> valid="+validSplit);
if( validSplit )
M[, 2*nID-1:2*nID] = t(as.matrix(list(f,v)));
else
M[, 2*nID] = computeLeafLabel(y2, nI, classify, verbose);
maxPath = max(maxPath, floor(log(nID,2)+1));
# split data, finalize or recurse
if( validSplit ) {
if( sum(Ileft) >= min_split & floor(log(IDleft,2))+2 < max_depth )
queue = append(queue, list(IDleft,Ileft,X2,y2));
else
M[,2*IDleft] = computeLeafLabel(y2, Ileft, classify, verbose);
if( sum(Iright) >= min_split & floor(log(IDright,2))+2 < max_depth )
queue = append(queue, list(IDright,Iright,X2,y2));
else
M[,2*IDright] = computeLeafLabel(y2, Iright, classify, verbose);
maxPath = max(maxPath, floor(log(IDleft,2)+1));
}
}
# summary and encoding
M = M[1, 1:2*(2^maxPath-1)];
if(verbose) {
print("decisionTree: final constructed tree (linearized):");
print("--" + toString(M));
}
}
findBestSplit = function(Matrix[Double] X2, Matrix[Double] y2, Matrix[Double] foffb, Matrix[Double] foffe,
Int ID, Matrix[Double] I, Int min_leaf, Double max_features, Double max_values, String impurity, Int seed)
return(Int f, Int v, Int IDleft, Matrix[Double] Ileft, Int IDright, Matrix[Double] Iright)
{
# sample features iff max_features < 1
n = ncol(foffb);
numI = sum(I);
feat = seq(1,n);
if( max_features < 1.0 ) {
rI = rand(rows=n, cols=1, seed=seed) <= (n^max_features/n);
feat = removeEmpty(target=feat, margin="rows", select=rI);
if( sum(feat) == 0 ) #sample at least one
feat[1,1] = round(rand(rows=1, cols=1, min=1, max=n));
}
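# (Bernoulli sampling with p = n^max_features/n retains n^max_features
# features in expectation, matching m = ceil(num_features^max_features))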
# evaluate features and feature splits
# (both categorical and numerical are treated similarly by
# finding a cutoff point in the recoded/binned representation)
R = matrix(0, rows=3, cols=nrow(feat));
parfor( i in 1:nrow(feat) ) {
f = as.scalar(feat[i]); # feature
beg = as.scalar(foffb[1,f])+1; # feature start in X2
end = as.scalar(foffe[1,f]); # feature end in X2
belen = end-beg; #numFeat - 1
while(FALSE){} # make beg/end known
# construct 0/1 predicate vectors with <= semantics
# find rows that match at least one value and appear in I
# (vectorized evaluation, each column in P is a split candidate)
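# e.g., belen=3 gives fP = [1 1 1; 0 1 1; 0 0 1], where column j
# encodes the predicate value <= j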
fP = upper.tri(target=matrix(1,belen,belen), diag=TRUE);
vI = seq(1,belen);
if( max_values < 1.0 & ncol(fP)>10 ) {
rI2 = rand(rows=ncol(fP),cols=1,seed=seed) <= (ncol(fP)^max_values/ncol(fP));
fP = removeEmpty(target=fP, margin="cols", select=t(rI2));
vI = removeEmpty(target=vI, margin="rows", select=rI2);
}
P = matrix(0, ncol(X2), ncol(fP));
P[beg:end-1,1:ncol(fP)] = fP;
Ileft = (t(X2 %*% P) * I) != 0;
Iright = (Ileft==0) * I;
# compute information gain for all split candidates
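# IG = imp(D) - |Dleft|/|D| * imp(Dleft) - |Dright|/|D| * imp(Dright)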
ig = as.scalar(computeImpurity(y2, I, impurity))
- rowSums(Ileft)/numI * computeImpurity(y2, Ileft, impurity)
- rowSums(Iright)/numI * computeImpurity(y2, Iright, impurity);
ig = replace(target=ig, pattern=NaN, replacement=0);
# track best split value and index, incl validity
valid = (rowSums(Ileft)>=min_leaf) & (rowSums(Iright)>=min_leaf);
bestig = max(valid*ig);
bestv = ifelse(bestig>0, nrow(valid)-as.scalar(rowIndexMax(t(rev(valid*ig))))+beg, -1);
if( bestv >= 0 )
bestv = as.scalar(vI[bestv-beg+1,1])+beg-1;
R[,i] = as.matrix(list(f, bestig, bestv));
}
ix = as.scalar(rowIndexMax(R[2,]));
# extract indicators and IDs
IDleft = 2 * ID;
IDright= 2 * ID + 1;
f = as.integer(as.scalar(feat[ix,1]));
beg = as.scalar(foffb[1,f]);
v = as.integer(as.scalar(R[3,ix])-beg);
if( max(R[2,]) > 0 ) {
p = table(seq(beg+1, beg+v), 1, ncol(X2), 1); # predicate: feature f value <= v
Ileft = (t(X2 %*% p) * I) != 0;
Iright = I * (Ileft==0);
}
else { # no information gain
Ileft = as.matrix(0);
Iright = as.matrix(0);
}
}
computeImpurity = function(Matrix[Double] y2, Matrix[Double] I, String impurity)
return(Matrix[Double] score)
{
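# I may contain one row per split candidate (vectorized evaluation);
# f then holds per-candidate relative label frequencies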
f = (I %*% y2) / rowSums(I); # rel. freq. per category/bin
score = matrix(0, nrow(I), 1);
if( impurity == "gini" )
score = 1 - rowSums(f^2); # sum(f*(1-f));
else if( impurity == "entropy" )
score = rowSums(-f * log(f));
else if( impurity == "rss" ) { # residual sum of squares
yhat = f %*% seq(1,ncol(f)); # yhat
res = outer(yhat, t(rowIndexMax(y2)), "-"); # yhat-y
score = rowSums((I * res)^2); # sum((yhat-y)^2)
}
else
stop("decisionTree: unsupported impurity measure: "+impurity);
}
computeLeafLabel = function(Matrix[Double] y2, Matrix[Double] I, Boolean classify, Boolean verbose)
return(Double label)
{
f = (I %*% y2) / sum(I);
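# classification: majority class via rowIndexMax(f);
# regression: mean label as frequency-weighted sum f %*% seq(1,ncol(f))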
label = as.scalar(ifelse(classify,
rowIndexMax(f), f %*% seq(1,ncol(f))));
if(verbose)
print("-- leaf node label: " + label +" ("+sum(I)*max(f)+"/"+sum(I)+")");
}