#-------------------------------------------------------------
#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
#
#-------------------------------------------------------------
# Learns a decision tree given training data.
# Writes out the learnt decision tree, consisting
# of a nodes matrix and an edges matrix.
# The representation is explained in detail below.
#
# arguments:
# 1st arg: data matrix (rows are samples)
# 2nd arg: column vector containing labels
# 3rd arg: name of file to write nodes matrix to
# 4th arg: name of file to write edges matrix to
# (minsplit, the minimum number of samples a node
# must contain before it is split, is hardcoded to
# 2 at the bottom of this script; see the example
# invocation below)
#
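# Example invocation (illustrative file names; assumes this script is
# saved as id3.dml and the SystemDS command-line launcher is on the
# path, with positional parameters passed via -args):
#   systemds id3.dml -args data.mtx labels.mtx nodes.mtx edges.mtx
#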
# What does this script do:
# Learns a decision tree for data containing
# categorical features and a categorical label.
# Does not handle scale (continuous) features yet.
# Does not perform pruning. Uses information
# gain to choose the feature to split on at each
# node (see the formula below). See Wikipedia's
# page on the ID3 decision tree learning
# algorithm for pseudocode.
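# For reference, the information gain of splitting a sample set T
# on attribute A is
#   IG(T,A) = H(T) - sum_v (|T_v|/|T|) * H(T_v),
# where H(T) = -sum_c p_c*log(p_c) is the entropy of the class
# labels in T and T_v is the subset of T with A = v (this script
# uses the natural logarithm).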
#
# Representing decision trees:
# Since SystemDS can only represent matrices,
# one question that needs answering is how to
# represent decision trees. One option is to
# restrict ourselves to building binary decision
# trees (this is what Netezza and Oracle do).
# Binary decision trees are fine, except that they
# are less compact than trees whose nodes can have
# more than two children. We avoid that restriction.
#
# Our decision trees are represented using two
# matrices. One is called the nodes matrix, which
# contains definitions of nodes in the decision tree,
# and the other is called the edges matrix, containing
# the edges in the decision tree. This is a standard
# way of representing trees (graphs, in fact) in
# databases.
#
# Nodes matrix: a 2-column matrix. The first column
# contains a column index into the input data, indicating
# the feature that is split on in this node. The
# first-column entry can also be -1, which is not a valid
# index: it marks the node as a leaf, in which case the
# second column holds the class label to be predicted if
# a sample reaches this leaf.
#
# Edges matrix: a 3-column matrix. The first column
# holds the row index of the parent node in the nodes
# matrix (i.e., a pointer into the nodes matrix). The
# third column holds the row index of the child node
# in the nodes matrix. The second column contains a
# value in the domain of the feature that is split on
# in the parent node. For instance, let p,v,c denote a
# row in the edges matrix. If the p-th row of the nodes
# matrix says feature f is split on in that node, then
# v has to be a value in the domain of f.
# Further notes:
# Because of the way SystemDS's groupedAggregate function
# works, it is a good idea to number the values in the domain
# of a feature contiguously from 1 through its domain size.
# Note that this is not a hard restriction: the script takes
# care to handle 0/1-valued features (i.e., feature values
# beginning at 0), and groupedAggregate can handle
# non-contiguously numbered values (but in that case the
# learnt decision tree may contain spurious nodes that no
# samples reach).
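#
# Example (illustrative): a tree that splits on feature 3 at the
# root and predicts label 1 for feature value 1 and label 2 for
# feature value 2 would be encoded as
#   nodes = [ 3  0 ]      edges = [ 1  1  2 ]
#           [-1  1 ]              [ 1  2  3 ]
#           [-1  2 ]
# i.e., rows 2 and 3 of the nodes matrix are leaves, reached from
# the root (row 1) via feature values 1 and 2, respectively.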
id3_learn = function(Matrix[Double] X, Matrix[Double] y, Matrix[Double] X_subset, Matrix[Double] attributes, Double minsplit) return (Matrix[Double] nodes, Matrix[Double] edges){
hist_labels = aggregate(target=X_subset, groups=y, fn="sum")
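#note: the print and the empty while-loop below act as barriers
#between statement blocks (a common DML idiom to force a cut)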
print(" ");
while(FALSE){}
#go through the histogram to compute the number of labels
#with non-zero samples
#and to pull out the most popular label
num_non_zero_labels = sum(hist_labels > 0);
most_popular_label = rowIndexMax(t(hist_labels));
num_remaining_attrs = sum(attributes)
num_samples = sum(X_subset)
print("num non zero labels: " + num_non_zero_labels)
mpl = as.scalar(most_popular_label)
print("most popular label: " + mpl)
print("num remaining attrs: " + num_remaining_attrs)
nodes = matrix(0, rows=1, cols=1)
edges = matrix(0, rows=1, cols=1)
#return a leaf node (predicting the most popular label) if all
#samples have the same label, no attributes remain, or fewer than
#minsplit samples are left
if(num_non_zero_labels == 1 | num_remaining_attrs == 0 | num_samples < minsplit){
nodes = matrix(0, rows=1, cols=2)
nodes[1,1] = -1
nodes[1,2] = most_popular_label
edges = matrix(-1, rows=1, cols=1)
}else{
#computing gains for all available attributes
hist_labels2 = aggregate(target=X_subset, groups=y, fn="sum")
num_samples2 = sum(X_subset)
print(num_samples2+" #samples")
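#entropy of the labels over the current subset; zero histogram
#entries are replaced by 1 inside the log so that 0*log(1)
#contributes 0 to the sum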
zero_entries_in_hist1 = (hist_labels2 == 0)
pi1 = hist_labels2/num_samples2
log_term1 = zero_entries_in_hist1*1 + (1-zero_entries_in_hist1)*pi1
entropy_vector1 = -pi1*log(log_term1)
ht = sum(entropy_vector1)
sz = nrow(attributes)
gains = matrix(0, rows=sz, cols=1)
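#for each still-available attribute, compute the conditional entropy
#of the labels given that attribute and, from it, the information gain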
for(i in 1:nrow(attributes)){
if(as.scalar(attributes[i,1]) == 1){
attr_vals = X[,i]
attr_domain = aggregate(target=X_subset, groups=attr_vals, fn="sum")
hxt_vector = matrix(0, rows=nrow(attr_domain), cols=1)
for(j in 1:nrow(attr_domain), check=0){
if(as.scalar(attr_domain[j,1]) != 0){
val = j
Tj = X_subset * (X[,i] == val)
#entropy = compute_entropy(Tj, y)
hist_labels1 = aggregate(target=Tj, groups=y, fn="sum")
num_samples1 = sum(Tj)
zero_entries_in_hist = (hist_labels1 == 0)
piv = hist_labels1/num_samples1
log_term = zero_entries_in_hist*1 + (1-zero_entries_in_hist)*piv
entropy_vector = -piv*log(log_term)
entropy = sum(entropy_vector)
hxt_vector[j,1] = sum(Tj)/sum(X_subset)*entropy
}
}
hxt = sum(hxt_vector)
gains[i,1] = (ht - hxt)
}
}
#pick out attr with highest gain
best_attr = -1
max_gain = 0
for(i4 in 1:nrow(gains)){
#print("best attr " + best_attr + " max gain " + max_gain)
if(as.scalar(attributes[i4,1]) == 1){
g = as.scalar(gains[i4,1])
if(best_attr == -1 | max_gain <= g){
max_gain = g
best_attr = i4
}
}
}
print("best attribute is: " + best_attr)
print("gain is: " + max_gain)
attr_vals = X[,best_attr]
attr_domain = aggregate(target=X_subset, groups=attr_vals, fn="sum")
print(" ");
while(FALSE){}
new_attributes = attributes
new_attributes[best_attr, 1] = 0
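#pre-allocate temporary stores for the child subtrees, using
#2*(#samples in this subset) as an assumed generous upper bound
#on the size of any subtree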
max_sz_subtree = 2*sum(X_subset)
sz2 = nrow(attr_domain)
sz1 = sz2*max_sz_subtree
while(FALSE){}
tempNodeStore = matrix(0, rows=2, cols=sz1)
tempEdgeStore = matrix(0, rows=3, cols=sz1)
numSubtreeNodes = matrix(0, rows=sz2, cols=1)
numSubtreeEdges = matrix(0, rows=sz2, cols=1)
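#recurse on each non-empty partition induced by the best attribute
#and stash the returned subtree nodes/edges in the temporary stores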
for(i1 in 1:nrow(attr_domain), check=0){
Ti = X_subset * (X[,best_attr] == i1)
num_nodes_Ti = sum(Ti)
if(num_nodes_Ti > 0){
[nodesi, edgesi] = id3_learn(X, y, Ti, new_attributes, minsplit)
start_pt = 1+(i1-1)*max_sz_subtree
tempNodeStore[,start_pt:(start_pt+nrow(nodesi)-1)] = t(nodesi)
numSubtreeNodes[i1,1] = nrow(nodesi)
if(nrow(edgesi)!=1 | ncol(edgesi)!=1 | as.scalar(edgesi[1,1])!=-1){
tempEdgeStore[,start_pt:(start_pt+nrow(edgesi)-1)] = t(edgesi)
numSubtreeEdges[i1,1] = nrow(edgesi)
}else{
numSubtreeEdges[i1,1] = 0
}
}
}
num_nodes_in_subtrees = sum(numSubtreeNodes)
num_edges_in_subtrees = sum(numSubtreeEdges)
#creating root node
sz = 1+num_nodes_in_subtrees
while(FALSE){}
nodes = matrix(0, rows=sz, cols=2)
nodes[1,1] = best_attr
numNodes = 1
#edges from root to children
sz = sum(numSubtreeNodes > 0) + num_edges_in_subtrees
while(FALSE){}
edges = matrix(1, rows=sz, cols=3)
numEdges = 0
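#first record the split values of the edges from the root to its
#non-empty children; the child row indices are filled in further below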
for(i6 in 1:nrow(attr_domain)){
num_nodesi = as.scalar(numSubtreeNodes[i6,1])
if(num_nodesi > 0){
edges[numEdges+1,2] = i6
numEdges = numEdges + 1
}
}
nonEmptyAttri = 0
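#copy the subtree nodes and edges into the final matrices, shifting
#subtree-local row indices by the number of nodes already emitted,
#and point each root edge at the corresponding subtree root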
for(i7 in 1:nrow(attr_domain)){
numNodesInSubtree = as.scalar(numSubtreeNodes[i7,1])
if(numNodesInSubtree > 0){
start_pt1 = 1 + (i7-1)*max_sz_subtree
nodes[numNodes+1:numNodes+numNodesInSubtree,] = t(tempNodeStore[,start_pt1:(start_pt1+numNodesInSubtree-1)])
numEdgesInSubtree = as.scalar(numSubtreeEdges[i7,1])
if(numEdgesInSubtree!=0){
edgesi1 = t(tempEdgeStore[,start_pt1:(start_pt1+numEdgesInSubtree-1)])
edgesi1[,1] = edgesi1[,1] + numNodes
edgesi1[,3] = edgesi1[,3] + numNodes
edges[numEdges+1:numEdges+numEdgesInSubtree,] = edgesi1
numEdges = numEdges + numEdgesInSubtree
}
edges[nonEmptyAttri+1,3] = numNodes + 1
nonEmptyAttri = nonEmptyAttri + 1
numNodes = numNodes + numNodesInSubtree
}
}
}
}
X = read($1);
y = read($2);
n = nrow(X)
m = ncol(X)
minsplit = 2
X_subset = matrix(1, rows=n, cols=1)
attributes = matrix(1, rows=m, cols=1)
# recoding inputs
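# (shift every feature and the label so that all values start at 1,
# as assumed by the aggregate() grouping; see notes in the header)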
featureCorrections = 1 - colMins(X)
onesMat = matrix(1, rows=n, cols=m)
X = onesMat %*% diag(t(featureCorrections)) + X
labelCorrection = 1 - min(y)
y = y + labelCorrection + 0
[nodes, edges] = id3_learn(X, y, X_subset, attributes, minsplit)
# decoding outputs
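# (undo the recoding: shift leaf labels in the nodes matrix and split
# values in the edges matrix back to their original ranges)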
nodes[,2] = nodes[,2] - labelCorrection * (nodes[,1] == -1)
#edges is a 1x1 sentinel (-1) when the learnt tree is a single leaf;
#only decode edge split values when real edges exist
if(ncol(edges) == 3){
  for(i3 in 1:nrow(edges)){
  #parfor(i3 in 1:nrow(edges)){
    e_parent = as.scalar(edges[i3,1])
    parent_feature = as.scalar(nodes[e_parent,1])
    correction = as.scalar(featureCorrections[1,parent_feature])
    edges[i3,2] = edges[i3,2] - correction
  }
}
write(nodes, $3, format="text")
write(edges, $4, format="text")