| #------------------------------------------------------------- |
| # |
| # Licensed to the Apache Software Foundation (ASF) under one |
| # or more contributor license agreements. See the NOTICE file |
| # distributed with this work for additional information |
| # regarding copyright ownership. The ASF licenses this file |
| # to you under the Apache License, Version 2.0 (the |
| # "License"); you may not use this file except in compliance |
| # with the License. You may obtain a copy of the License at |
| # |
| # http://www.apache.org/licenses/LICENSE-2.0 |
| # |
| # Unless required by applicable law or agreed to in writing, |
| # software distributed under the License is distributed on an |
| # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| # KIND, either express or implied. See the License for the |
| # specific language governing permissions and limitations |
| # under the License. |
| # |
| #------------------------------------------------------------- |
| |
# This script implements decision tree prediction for recoded and binned
# categorical and numerical input features. The prediction strategies follow
# the Hummingbird paper (https://www.usenix.org/system/files/osdi20-nakandala.pdf).
| # |
| # INPUT: |
| # ------------------------------------------------------------------------------ |
# X         Feature matrix in recoded/binned representation
# y         Label matrix in recoded/binned representation,
#           optional for accuracy evaluation
# ctypes    Row vector of column types [1: scale/ordinal, 2: categorical]
# M         Matrix M holding the learned tree in linearized form;
#           see decisionTree() for the detailed tree representation
# strategy  Prediction strategy, one of ["GEMM", "TT", "PTT"], referring to
#           "Generic matrix multiplication", "Tree traversal", and
#           "Perfect tree traversal", respectively
#           (note: "PTT" is not yet handled by this script)
# verbose   Flag indicating verbose debug output
| # ------------------------------------------------------------------------------ |
| # |
| # OUTPUT: |
| # ------------------------------------------------------------------------------ |
# yhat      Label vector of predictions
| # ------------------------------------------------------------------------------ |
| |
| m_decisionTreePredict = function(Matrix[Double] X, Matrix[Double] y = matrix(0,0,0), |
| Matrix[Double] ctypes, Matrix[Double] M, String strategy="TT", Boolean verbose = FALSE) |
| return (Matrix[Double] yhat) |
| { |
| if( strategy == "TT" ) |
| yhat = predict_TT(M, X); |
| else if( strategy == "GEMM" ) |
| yhat = predict_GEMM(M, X); |
| else { |
| print ("No such strategy" + strategy) |
| yhat = matrix("0", rows=0, cols=0) |
| } |
| } |
| |
| predict_TT = function (Matrix[Double] M, Matrix[Double] X) |
| return (Matrix[Double] yhat) |
| { |
| # initialization of model tensors and parameters |
| [N, N_L, N_R, N_F, N_T, C] = createTTNodeTensors(M) |
| nr = nrow(X); n = ncol(N_L); |
| tree_depth = ceiling(log(max(N)+1,2)) # max depth |
| |
| |
| Ti = matrix(1, nr, 1); # current nodes (start at root) |
| noChange = FALSE; i = 1; |
| while( !noChange & i <= tree_depth) { |
| P = table(seq(1,nr), Ti, nr, n); |
| TF = P %*% t(N_F); # get node feature indexes |
| Tv = rowSums(X * table(seq(1,nr),TF,nr,ncol(X))); # get feature values |
| Tt = P %*% t(N_T); # get node thresholds |
| TL = P %*% t(N_L); # get node left paths |
| TR = P %*% t(N_R); # get node right paths |
| # pick left or right path for each record separately |
| Ti_new = ifelse(Tv <= Tt, TL, TR); |
| noChange = (sum(Ti != Ti_new) == 0); |
| i = i + 1; |
| Ti = Ti_new; |
| } |
| |
| # extract classes |
| yhat = table(seq(1,nr), Ti, nr, n) %*% C; |
| } |
| |
| predict_GEMM = function (Matrix[Double] M, Matrix[Double] X) |
| return (Matrix[Double] Y) |
| { |
| # initialization of model tensors and parameters |
| [A, B, C, D, E] = createGEMMNodeTensors(M, ncol(X)); |
| |
# scoring pipeline, evaluating all nodes in parallel
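# tensor semantics (as constructed by createGEMMNodeTensors below):
#   A [#features x #inodes]: selects the split feature of each inner node
#   B [1 x #inodes]:         split thresholds per inner node
#   C [#inodes x #paths]:    +1 (left subtree) / -1 (right subtree) path membership
#   D [1 x #paths]:          number of left turns on each root-to-leaf path
#   E [#paths x #classes]:   leaf-to-class indicator
# a record satisfies the left-turn count of exactly its own root-to-leaf path,
# so the equality check selects one path per record and E maps it to a class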
| Y = rowIndexMax(((((X %*% A) <= B) %*% C) == D) %*% E); |
| } |
| |
| createTTNodeTensors = function( Matrix[Double] M ) |
| return (Matrix[Double] N, Matrix[Double] N_L, Matrix[Double] N_R, |
| Matrix[Double] N_F, Matrix[Double] N_T, Matrix[Double] C) |
| { |
| # all tree nodes (inner and leaf nodes) |
| M2 = matrix(M, rows=ncol(M)/2, cols=2); |
| NID = seq(1, nrow(M2)); |
| nI = (M2[,1]!=0 | M2[,2]!=0) |
| cnI = cumsum(nI); |
| N = t(removeEmpty(target=NID, margin="rows", select=nI)); |
| n_nodes = ncol(N) |
| |
| # left/right child node ids, default self-id |
| N_L = t(removeEmpty(target=ifelse(M2[,1]!=0, 2*NID, NID), margin="rows", select=nI)); |
| N_R = t(removeEmpty(target=ifelse(M2[,1]!=0, 2*NID+1, NID), margin="rows", select=nI)); |
# recode child ids to compacted node-vector positions (non-existing nodes removed)
| N_L = t(table(seq(1,n_nodes), t(N_L), n_nodes, nrow(M2)) %*% cnI); |
| N_R = t(table(seq(1,n_nodes), t(N_R), n_nodes, nrow(M2)) %*% cnI); |
| |
| # node feature IDs (positions) and threshold values |
| N_F = t(removeEmpty(target=ifelse(M2[,1]!=0, M2[,1], 1), margin="rows", select=nI)); |
| N_T = t(removeEmpty(target=ifelse(M2[,1]!=0, M2[,2], 0), margin="rows", select=nI)); |
| |
C = removeEmpty(target=M2[,2], margin="rows", select=nI); # per-node values of M2[,2]; at leaf positions these are the class labels
| } |
| |
| createGEMMNodeTensors = function( Matrix[Double] M, Int m ) |
| return (Matrix[Double] A, Matrix[Double] B, Matrix[Double] C, |
| Matrix[Double] D, Matrix[Double] E) |
| { |
| M2 = matrix(M, rows=ncol(M)/2, cols=2) |
| NID = seq(1, nrow(M2)) |
| |
| # predicate map [#feat x #inodes] and values [1 x #inodes] |
| is_inner = M2[,1]!=0 |
| I1 = removeEmpty(target=NID, margin="rows", select=is_inner) |
| pivot = removeEmpty(target=M2[,1], margin="rows", select=is_inner) |
| nin = nrow(I1) |
| A = table(pivot, seq(1,nin), m, nin) |
| B = t(removeEmpty(target=M2[,2], margin="rows", select=is_inner)) |
| |
| # bucket paths [#inodes x #paths] and path sums |
| is_leaf = (!is_inner & M2[,2]!=0) |
| leaf_ids = t(removeEmpty(target=NID, margin="rows", select=is_leaf)) |
| last_leaf = as.scalar(leaf_ids[1,ncol(leaf_ids)]) |
| leaf_classes = removeEmpty(target=M2[,2], margin="rows", select=is_leaf) |
| nl = ncol(leaf_ids) |
| |
# iterate over each inner node and check for each leaf node whether it lies in
# the node's left subtree (+1), its right subtree (-1), or neither (0)
#
#                      |  i   |
#                     /        \
#              | 2i  |          | 2i+1 |
#             /      \          /      \
#        | 4i |  | 4i+1 |  | 4i+2 |  | 4i+3 |
#
# left_subtree_of_node(i)  = { x | (2^j)*i           <= x < (2^j)*i + 2^(j-1) }
# right_subtree_of_node(i) = { x | (2^j)*i + 2^(j-1) <= x < (2^j)*(i+1) }
# for j in {1, 2, 3, ...}, where j is the level below node i
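# e.g., for i=1: level j=1 gives left {2} and right {3}; level j=2 gives
# left {4,5} and right {6,7}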
| |
| C = matrix(0, nin, nl) |
| parfor(i in seq(1, nin)){ |
| boundary_left = 2*as.scalar(I1[i, 1]) # initialize the left boundary with the left child of the inner node |
| out = matrix(0, 1, nl) |
| step_size = 1 |
| |
# iterate over each level of the tree [at most log2(max_node_id) iterations]
| while(boundary_left < last_leaf) { |
| |
| # left side |
| subset_lower_bound = leaf_ids >= boundary_left |
| boundary_right = boundary_left + step_size |
| subset_upper_bound = leaf_ids < boundary_right |
| ones = subset_lower_bound & subset_upper_bound |
| out = out + ones |
| |
| # right side |
subset_lower_bound = !subset_upper_bound # reuse the previous upper bound by inverting it
| boundary_right = boundary_right + step_size |
| subset_upper_bound = leaf_ids < boundary_right |
| ones = subset_lower_bound & subset_upper_bound |
| out = out - ones |
| |
step_size = step_size*2 # the number of nodes per subtree level doubles with each level
| boundary_left = boundary_left*2 # new left boundary is the left child of the previous left boundary |
| } |
| C[i,] = out |
| } |
| D = colSums(max(C, 0)); |
| |
| # class map [#paths x #classes] |
| E = table(seq(1,ncol(C)),leaf_classes) |
| } |