| #------------------------------------------------------------- |
| # |
| # Licensed to the Apache Software Foundation (ASF) under one |
| # or more contributor license agreements. See the NOTICE file |
| # distributed with this work for additional information |
| # regarding copyright ownership. The ASF licenses this file |
| # to you under the Apache License, Version 2.0 (the |
| # "License"); you may not use this file except in compliance |
| # with the License. You may obtain a copy of the License at |
| # |
| # http://www.apache.org/licenses/LICENSE-2.0 |
| # |
| # Unless required by applicable law or agreed to in writing, |
| # software distributed under the License is distributed on an |
| # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| # KIND, either express or implied. See the License for the |
| # specific language governing permissions and limitations |
| # under the License. |
| # |
| #------------------------------------------------------------- |
| |
# Learns a decision tree from given training data and writes
# out the learnt tree as a nodes matrix and an edges matrix.
# The representation is explained in detail below.
| # |
# arguments:
# 1st arg ($1): file containing the data matrix
#               (rows are samples, columns are categorical features)
# 2nd arg ($2): file containing the column vector of labels
# 3rd arg ($3): name of file to write the nodes matrix to
# 4th arg ($4): name of file to write the edges matrix to
# (minsplit, the minimum number of samples required before a
#  split is attempted, is hardcoded to 2 near the bottom of
#  this script)
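#
# Example invocation (a sketch only; assumes the standard SystemDS
# driver, where -f names the script and -args supplies the
# positional $-arguments; all file names, including the script
# name, are placeholders):
#   spark-submit SystemDS.jar -f id3.dml -args data.mtx labels.mtx nodes.out edges.out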
| # |
# What does this script do:
# Learns a decision tree for data containing categorical
# features and a categorical label. Does not yet handle
# scale (continuous) features and does not perform pruning.
# Uses information gain to choose the feature to split on.
# See Wikipedia's page on the ID3 decision tree learning
# algorithm for pseudocode.
| # |
# Representing decision trees:
# Since SystemDS can only represent matrices, one question
# that needs answering is how to represent decision trees.
# One option is to restrict ourselves to building binary
# decision trees (this is what Netezza and Oracle do).
# Binary decision trees work, but they are less compact
# than trees whose nodes may have more than two children,
# so we avoid that restriction.
| # |
| # Our decision trees are represented using two |
| # matrices. One is called the nodes matrix, which |
| # contains definitions of nodes in the decision tree, |
| # and the other is called the edges matrix, containing |
| # the edges in the decision tree. This is a standard |
| # way of representing trees (graphs, in fact) in |
| # databases. |
| # |
# Nodes matrix: a 2-column matrix. The first column contains
# the column index (into the input data) of the feature being
# split on at this node. The first column entry may also be -1,
# which is not a valid index: it marks the node as a leaf, in
# which case the second column entry denotes the class label to
# be predicted if a sample reaches this leaf.
| # |
# Edges matrix: a 3-column matrix. The first column denotes
# the row index of the parent node in the nodes matrix (i.e.,
# a pointer into the nodes matrix). The third column denotes
# the row index of the child node in the nodes matrix. The
# second column contains a value in the domain of the feature
# being split in the parent node. For instance, let p,v,c
# denote a row in the edges matrix: if the p-th row of the
# nodes matrix says feature f is being split at that node,
# then v has to be a value in the domain of f.
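#
# A small worked example (hypothetical data): a tree that splits
# on feature 2 at the root, where value 1 leads to a leaf
# predicting label 1 and value 2 leads to a leaf predicting
# label 3, is encoded as
#
#   nodes:   2  0      edges:   1  1  2
#           -1  1               1  2  3
#           -1  3
#
# Row 1 of nodes is the root (split on feature 2; the second
# column is unused for interior nodes); rows 2 and 3 are leaves
# (first column -1, second column the predicted label). Each
# edge row (p,v,c) points from parent row p to child row c for
# feature value v.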
| |
# Further notes:
# Because of the way SystemDS's groupedAggregate function
# works, it is a good idea to number the values in the domain
# of a feature contiguously, starting at 1 and running through
# to the domain size. This is not a hard restriction: the
# script takes care to recode features whose values begin at 0,
# and groupedAggregate can handle non-contiguously numbered
# values (though in that case the learnt decision tree may
# contain spurious nodes that no samples reach).
| |
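# id3_learn: recursive ID3 learner.
#   X          : full data matrix (rows are samples)
#   y          : column vector of labels
#   X_subset   : 0/1 column indicator marking the samples that
#                reach the current node
#   attributes : 0/1 column indicator marking the attributes that
#                are still available for splitting
#   minsplit   : minimum number of samples required to split
# Returns the nodes/edges matrices of the subtree rooted here; a
# leaf is returned as a single-row nodes matrix plus a 1x1 edges
# matrix containing -1 (a sentinel meaning "no edges").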
| id3_learn = function(Matrix[Double] X, Matrix[Double] y, Matrix[Double] X_subset, Matrix[Double] attributes, Double minsplit) return (Matrix[Double] nodes, Matrix[Double] edges){ |
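#per-label count of the samples in the current subset (X_subset
#is a 0/1 indicator, so the grouped sum is a count)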
| hist_labels = aggregate(target=X_subset, groups=y, fn="sum") |
| |
| print(" "); |
| while(FALSE){} |
| |
#scan the histogram to count the labels with non-zero support
#and to pull out the most popular label
| |
| num_non_zero_labels = sum(hist_labels > 0); |
| most_popular_label = rowIndexMax(t(hist_labels)); |
| num_remaining_attrs = sum(attributes) |
| |
| num_samples = sum(X_subset) |
| |
| print("num non zero labels: " + num_non_zero_labels) |
| mpl = as.scalar(most_popular_label) |
| print("most popular label: " + mpl) |
| print("num remaining attrs: " + num_remaining_attrs) |
| |
| nodes = matrix(0, rows=1, cols=1) |
| edges = matrix(0, rows=1, cols=1) |
| |
#return a leaf (predicting the most popular label) if all samples
#share one label, no attributes remain, or fewer than minsplit
#samples reach this node
| if(num_non_zero_labels == 1 | num_remaining_attrs == 0 | num_samples < minsplit){ |
| nodes = matrix(0, rows=1, cols=2) |
| nodes[1,1] = -1 |
| nodes[1,2] = most_popular_label |
| edges = matrix(-1, rows=1, cols=1) |
| }else{ |
#compute the information gain of every available attribute
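#information gain of splitting T on attribute A:
#  gain(T,A) = H(T) - sum_v (|T_v|/|T|) * H(T_v)
#where H(T) = -sum_i p_i*log(p_i) is the entropy of the label
#distribution in T and T_v is the subset of T with A = v;
#ht below is H(T) and hxt accumulates the weighted child entropies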
| hist_labels2 = aggregate(target=X_subset, groups=y, fn="sum") |
| num_samples2 = sum(X_subset) |
| print(num_samples2+" #samples") |
zero_entries_in_hist1 = (hist_labels2 == 0)
pi1 = hist_labels2/num_samples2
#replace zero probabilities by 1 inside the log so that
#p*log(p) evaluates to 0 instead of NaN for empty classes
log_term1 = zero_entries_in_hist1*1 + (1-zero_entries_in_hist1)*pi1
entropy_vector1 = -pi1*log(log_term1)
| ht = sum(entropy_vector1) |
| |
| sz = nrow(attributes) |
| gains = matrix(0, rows=sz, cols=1) |
| for(i in 1:nrow(attributes)){ |
| if(as.scalar(attributes[i,1]) == 1){ |
| attr_vals = X[,i] |
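#attr_domain: per-value count of subset samples for attribute i;
#zero entries mark values that do not occur in the current subset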
| attr_domain = aggregate(target=X_subset, groups=attr_vals, fn="sum") |
| |
| hxt_vector = matrix(0, rows=nrow(attr_domain), cols=1) |
| |
| for(j in 1:nrow(attr_domain), check=0){ |
| if(as.scalar(attr_domain[j,1]) != 0){ |
| val = j |
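#Tj: 0/1 indicator of the subset samples with attribute i == val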
| Tj = X_subset * (X[,i] == val) |
| |
| #entropy = compute_entropy(Tj, y) |
| hist_labels1 = aggregate(target=Tj, groups=y, fn="sum") |
| num_samples1 = sum(Tj) |
zero_entries_in_hist = (hist_labels1 == 0)
piv = hist_labels1/num_samples1
#same zero-probability guard as in the entropy of T above
log_term = zero_entries_in_hist*1 + (1-zero_entries_in_hist)*piv
entropy_vector = -piv*log(log_term)
| entropy = sum(entropy_vector) |
| |
| hxt_vector[j,1] = sum(Tj)/sum(X_subset)*entropy |
| } |
| } |
| hxt = sum(hxt_vector) |
| gains[i,1] = (ht - hxt) |
| } |
| } |
| |
| #pick out attr with highest gain |
| best_attr = -1 |
| max_gain = 0 |
| for(i4 in 1:nrow(gains)){ |
| #print("best attr " + best_attr + " max gain " + max_gain) |
| if(as.scalar(attributes[i4,1]) == 1){ |
| g = as.scalar(gains[i4,1]) |
| if(best_attr == -1 | max_gain <= g){ |
| max_gain = g |
| best_attr = i4 |
| } |
| } |
| } |
| |
| print("best attribute is: " + best_attr) |
| print("gain is: " + max_gain) |
| |
| attr_vals = X[,best_attr] |
| attr_domain = aggregate(target=X_subset, groups=attr_vals, fn="sum") |
| print(" "); |
| while(FALSE){} |
| |
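#mark the chosen attribute as used for the recursive calls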
| new_attributes = attributes |
| new_attributes[best_attr, 1] = 0 |
| |
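#buffer sizing: reserve room for up to 2*|subset| node/edge rows per
#subtree; the temp stores hold all child subtrees side by side
#(transposed, one column block per subtree)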
| max_sz_subtree = 2*sum(X_subset) |
| sz2 = nrow(attr_domain) |
| sz1 = sz2*max_sz_subtree |
| while(FALSE){} |
| |
| tempNodeStore = matrix(0, rows=2, cols=sz1) |
| tempEdgeStore = matrix(0, rows=3, cols=sz1) |
| numSubtreeNodes = matrix(0, rows=sz2, cols=1) |
| numSubtreeEdges = matrix(0, rows=sz2, cols=1) |
| |
| for(i1 in 1:nrow(attr_domain), check=0){ |
| |
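#Ti: indicator of the subset samples whose best_attr value is i1
#(num_nodes_Ti below actually counts samples, not nodes)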
| Ti = X_subset * (X[,best_attr] == i1) |
| num_nodes_Ti = sum(Ti) |
| |
| if(num_nodes_Ti > 0){ |
| [nodesi, edgesi] = id3_learn(X, y, Ti, new_attributes, minsplit) |
| |
| start_pt = 1+(i1-1)*max_sz_subtree |
| tempNodeStore[,start_pt:(start_pt+nrow(nodesi)-1)] = t(nodesi) |
| numSubtreeNodes[i1,1] = nrow(nodesi) |
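#a 1x1 edges matrix containing -1 is the sentinel for a leaf,
#i.e., there are no edges to copy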
| if(nrow(edgesi)!=1 | ncol(edgesi)!=1 | as.scalar(edgesi[1,1])!=-1){ |
| tempEdgeStore[,start_pt:(start_pt+nrow(edgesi)-1)] = t(edgesi) |
| numSubtreeEdges[i1,1] = nrow(edgesi) |
| }else{ |
| numSubtreeEdges[i1,1] = 0 |
| } |
| } |
| } |
| |
| num_nodes_in_subtrees = sum(numSubtreeNodes) |
| num_edges_in_subtrees = sum(numSubtreeEdges) |
| |
| #creating root node |
| sz = 1+num_nodes_in_subtrees |
| while(FALSE){} |
| |
| nodes = matrix(0, rows=sz, cols=2) |
| nodes[1,1] = best_attr |
| numNodes = 1 |
| |
| #edges from root to children |
| sz = sum(numSubtreeNodes > 0) + num_edges_in_subtrees |
| while(FALSE){} |
| |
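#every root-to-child edge has the root (row 1) as parent, hence
#the initialization with 1; column 2 gets the feature value here
#and column 3 (the child pointer) is filled in while splicing below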
| edges = matrix(1, rows=sz, cols=3) |
| numEdges = 0 |
| for(i6 in 1:nrow(attr_domain)){ |
| num_nodesi = as.scalar(numSubtreeNodes[i6,1]) |
| if(num_nodesi > 0){ |
| edges[numEdges+1,2] = i6 |
| numEdges = numEdges + 1 |
| } |
| } |
| |
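#splice each child subtree's nodes and edges into the result,
#shifting the subtree's row pointers by the number of node rows
#already emitted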
| nonEmptyAttri = 0 |
| for(i7 in 1:nrow(attr_domain)){ |
| numNodesInSubtree = as.scalar(numSubtreeNodes[i7,1]) |
| |
| if(numNodesInSubtree > 0){ |
| start_pt1 = 1 + (i7-1)*max_sz_subtree |
| nodes[numNodes+1:numNodes+numNodesInSubtree,] = t(tempNodeStore[,start_pt1:(start_pt1+numNodesInSubtree-1)]) |
| |
| numEdgesInSubtree = as.scalar(numSubtreeEdges[i7,1]) |
| |
| if(numEdgesInSubtree!=0){ |
| edgesi1 = t(tempEdgeStore[,start_pt1:(start_pt1+numEdgesInSubtree-1)]) |
| edgesi1[,1] = edgesi1[,1] + numNodes |
| edgesi1[,3] = edgesi1[,3] + numNodes |
| edges[numEdges+1:numEdges+numEdgesInSubtree,] = edgesi1 |
| numEdges = numEdges + numEdgesInSubtree |
| } |
| |
| edges[nonEmptyAttri+1,3] = numNodes + 1 |
| nonEmptyAttri = nonEmptyAttri + 1 |
| |
| numNodes = numNodes + numNodesInSubtree |
| } |
| } |
| } |
| } |
| |
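#main script: read the inputs, recode them to 1-based values, learn
#the tree, decode the outputs, and write them out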
| X = read($1); |
| y = read($2); |
| |
| n = nrow(X) |
| m = ncol(X) |
| |
#minimum number of samples required to attempt a split (hardcoded;
#see the argument notes at the top of this script)
minsplit = 2
| |
| |
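#initially every sample is in the subset and every attribute is
#available for splitting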
| X_subset = matrix(1, rows=n, cols=1) |
| attributes = matrix(1, rows=m, cols=1) |
# recoding inputs: shift every feature column (and the labels) so
# that the smallest value becomes 1, as expected by groupedAggregate
featureCorrections = 1 - colMins(X)
onesMat = matrix(1, rows=n, cols=m)

#onesMat %*% diag(t(featureCorrections)) is an n x m matrix whose
#j-th column is filled with the correction for feature j
X = onesMat %*% diag(t(featureCorrections)) + X
labelCorrection = 1 - min(y)
y = y + labelCorrection
| |
| [nodes, edges] = id3_learn(X, y, X_subset, attributes, minsplit) |
| |
# decoding outputs: undo the label shift on leaf predictions and
# the per-feature value shift on edge labels
nodes[,2] = nodes[,2] - labelCorrection * (nodes[,1] == -1)
#(this loop could equally be a parfor)
for(i3 in 1:nrow(edges)){
| e_parent = as.scalar(edges[i3,1]) |
| parent_feature = as.scalar(nodes[e_parent,1]) |
| correction = as.scalar(featureCorrections[1,parent_feature]) |
| edges[i3,2] = edges[i3,2] - correction |
| } |
| |
| write(nodes, $3, format="text") |
| write(edges, $4, format="text") |