blob: 4585d56d08f7240f01e9028a5a8fc6651d254982 [file] [log] [blame]
#-------------------------------------------------------------
#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
#
#-------------------------------------------------------------
# This script implements random forest prediction for recoded and binned
# categorical and numerical input features.
# Hummingbird paper (https://www.usenix.org/system/files/osdi20-nakandala.pdf).
#
# INPUT:
# ------------------------------------------------------------------------------
# X Feature matrix in recoded/binned representation
# y Label matrix in recoded/binned representation,
# optional for accuracy evaluation
# ctypes Row-Vector of column types [1 scale/ordinal, 2 categorical]
# M Matrix M holding the learned tree in linearized form
# see decisionTree() for the detailed tree representation.
# strategy Prediction strategy, can be one of ["GEMM", "TT", "PTT"],
# referring to "Generic matrix multiplication",
# "Tree traversal", and "Perfect tree traversal", respectively
# verbose Flag indicating verbose debug output
# ------------------------------------------------------------------------------
#
# OUTPUT:
# ------------------------------------------------------------------------------
# yhat Label vector of predictions
# ------------------------------------------------------------------------------
m_decisionTreePredict = function(Matrix[Double] X, Matrix[Double] y = matrix(0,0,0),
Matrix[Double] ctypes, Matrix[Double] M, String strategy="TT", Boolean verbose = FALSE)
return (Matrix[Double] yhat)
{
if( strategy == "TT" )
yhat = predict_TT(M, X);
else if( strategy == "GEMM" )
yhat = predict_GEMM(M, X);
else {
print ("No such strategy" + strategy)
yhat = matrix("0", rows=0, cols=0)
}
}
predict_TT = function (Matrix[Double] M, Matrix[Double] X)
return (Matrix[Double] yhat)
{
# initialization of model tensors and parameters
[N, N_L, N_R, N_F, N_T, C] = createTTNodeTensors(M)
nr = nrow(X); n = ncol(N_L);
tree_depth = ceiling(log(max(N)+1,2)) # max depth
Ti = matrix(1, nr, 1); # current nodes (start at root)
noChange = FALSE; i = 1;
while( !noChange & i <= tree_depth) {
P = table(seq(1,nr), Ti, nr, n);
TF = P %*% t(N_F); # get node feature indexes
Tv = rowSums(X * table(seq(1,nr),TF,nr,ncol(X))); # get feature values
Tt = P %*% t(N_T); # get node thresholds
TL = P %*% t(N_L); # get node left paths
TR = P %*% t(N_R); # get node right paths
# pick left or right path for each record separately
Ti_new = ifelse(Tv <= Tt, TL, TR);
noChange = (sum(Ti != Ti_new) == 0);
i = i + 1;
Ti = Ti_new;
}
# extract classes
yhat = table(seq(1,nr), Ti, nr, n) %*% C;
}
predict_GEMM = function (Matrix[Double] M, Matrix[Double] X)
return (Matrix[Double] Y)
{
# initialization of model tensors and parameters
[A, B, C, D, E] = createGEMMNodeTensors(M, ncol(X));
# scoring pipline, evaluating all nodes in parallel
Y = rowIndexMax(((((X %*% A) <= B) %*% C) == D) %*% E);
}
createTTNodeTensors = function( Matrix[Double] M )
return (Matrix[Double] N, Matrix[Double] N_L, Matrix[Double] N_R,
Matrix[Double] N_F, Matrix[Double] N_T, Matrix[Double] C)
{
# all tree nodes (inner and leaf nodes)
M2 = matrix(M, rows=ncol(M)/2, cols=2);
NID = seq(1, nrow(M2));
nI = (M2[,1]!=0 | M2[,2]!=0)
cnI = cumsum(nI);
N = t(removeEmpty(target=NID, margin="rows", select=nI));
n_nodes = ncol(N)
# left/right child node ids, default self-id
N_L = t(removeEmpty(target=ifelse(M2[,1]!=0, 2*NID, NID), margin="rows", select=nI));
N_R = t(removeEmpty(target=ifelse(M2[,1]!=0, 2*NID+1, NID), margin="rows", select=nI));
# recoding to node vector positions (removed non-existing nodes)
N_L = t(table(seq(1,n_nodes), t(N_L), n_nodes, nrow(M2)) %*% cnI);
N_R = t(table(seq(1,n_nodes), t(N_R), n_nodes, nrow(M2)) %*% cnI);
# node feature IDs (positions) and threshold values
N_F = t(removeEmpty(target=ifelse(M2[,1]!=0, M2[,1], 1), margin="rows", select=nI));
N_T = t(removeEmpty(target=ifelse(M2[,1]!=0, M2[,2], 0), margin="rows", select=nI));
C = removeEmpty(target=M2[,2], margin="rows", select=nI);
}
createGEMMNodeTensors = function( Matrix[Double] M, Int m )
return (Matrix[Double] A, Matrix[Double] B, Matrix[Double] C,
Matrix[Double] D, Matrix[Double] E)
{
M2 = matrix(M, rows=ncol(M)/2, cols=2)
NID = seq(1, nrow(M2))
# predicate map [#feat x #inodes] and values [1 x #inodes]
is_inner = M2[,1]!=0
I1 = removeEmpty(target=NID, margin="rows", select=is_inner)
pivot = removeEmpty(target=M2[,1], margin="rows", select=is_inner)
nin = nrow(I1)
A = table(pivot, seq(1,nin), m, nin)
B = t(removeEmpty(target=M2[,2], margin="rows", select=is_inner))
# bucket paths [#inodes x #paths] and path sums
is_leaf = (!is_inner & M2[,2]!=0)
leaf_ids = t(removeEmpty(target=NID, margin="rows", select=is_leaf))
last_leaf = as.scalar(leaf_ids[1,ncol(leaf_ids)])
leaf_classes = removeEmpty(target=M2[,2], margin="rows", select=is_leaf)
nl = ncol(leaf_ids)
# iterate over each inner node and check for each leaf node if it is the left subtree (1), right subtree (-1) or not included (0)
# | i |
# / \
# |2i| |2i+1|
# / \ / \
# |4i| |4i+1| |4i+2| |4i+3|
#
# left_subtree_of_node(i) = { x | (2^j)*i <= x < (2^j)*i + 2^(j-1), for j elem {1, 2, 3, ...}} -> j is the level of tree
# right_subtree_of_node(i) = { x | (2^j)*i + 2^(j-1) <= x < (2^j + 1)*i, for j elem {1, 2, 3, ...}}
C = matrix(0, nin, nl)
parfor(i in seq(1, nin)){
boundary_left = 2*as.scalar(I1[i, 1]) # initialize the left boundary with the left child of the inner node
out = matrix(0, 1, nl)
step_size = 1
# iterate each level of tree [log(max_node_id) iterations]
while(boundary_left < last_leaf) {
# left side
subset_lower_bound = leaf_ids >= boundary_left
boundary_right = boundary_left + step_size
subset_upper_bound = leaf_ids < boundary_right
ones = subset_lower_bound & subset_upper_bound
out = out + ones
# right side
subset_lower_bound = !subset_upper_bound #reuse by inverting
boundary_right = boundary_right + step_size
subset_upper_bound = leaf_ids < boundary_right
ones = subset_lower_bound & subset_upper_bound
out = out - ones
step_size = step_size*2 # with each level the amount of nodes in subtree level doubles
boundary_left = boundary_left*2 # new left boundary is the left child of the previous left boundary
}
C[i,] = out
}
D = colSums(max(C, 0));
# class map [#paths x #classes]
E = table(seq(1,ncol(C)),leaf_classes)
}