#-------------------------------------------------------------
#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
#
#-------------------------------------------------------------

args <- commandArgs(TRUE)

library("Matrix")
library("matrixStats")

options(warn=-1)

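# ID3 decision tree learning.
# args[1] is the path prefix of the inputs X.mtx (categorical feature matrix)
# and y.mtx (label vector); args[2] is the path prefix under which the learned
# tree is written as a node list ("nodes") and an edge list ("edges").

# id3_learn recursively grows the tree on the samples selected by X_subset
# (an n x 1 0/1 indicator vector), considering only the attributes still marked
# as available in the m x 1 0/1 vector 'attributes', and it does not split
# partitions with fewer than minsplit samples. It returns list(a=nodes, b=edges):
# a node row is (attribute index, 0) for an inner node or (-1, predicted label)
# for a leaf; an edge row is (parent node index, attribute value, child node index).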
id3_learn = function(X, y, X_subset, attributes, minsplit)
{
  #check the base cases first

  #compute a histogram of the labels, counting only the samples in the current partition
  hist_labels_helper = as.matrix(aggregate(as.vector(X_subset), by=list(as.vector(y)), FUN=sum))
  hist_labels = as.matrix(hist_labels_helper[,2])

  #from the histogram, count the labels with non-zero samples
  #and pull out the most popular label
  num_non_zero_labels = sum(hist_labels > 0)
  most_popular_label = which.max(t(hist_labels))
  num_remaining_attrs = sum(attributes)

  num_samples = sum(X_subset)
  mpl = as.numeric(most_popular_label)

  nodes = matrix(0, 1, 1)
  edges = matrix(0, 1, 1)

  #return a leaf node with the most popular label if all remaining samples
  #share the same label, if no attributes remain, or if fewer than minsplit
  #samples are left in this partition
  if(num_non_zero_labels == 1 || num_remaining_attrs == 0 || num_samples < minsplit){
    nodes = matrix(0, 1, 2)
    nodes[1,1] = -1
    nodes[1,2] = most_popular_label
    edges = matrix(-1, 1, 1)   #1x1 sentinel: a leaf has no outgoing edges
  }else{
    #compute information gains for all available attributes;
    #start with the entropy ht = -sum(p*log(p)) of the labels in the current partition
    hist_labels2_helper = as.matrix(aggregate(as.vector(X_subset), by=list(as.vector(y)), FUN=sum))
    hist_labels2 = as.matrix(hist_labels2_helper[,2])
    num_samples2 = sum(X_subset)
    zero_entries_in_hist1 = (hist_labels2 == 0)
    pi1 = hist_labels2/num_samples2
    #replace zero probabilities by 1 inside the log so that 0*log(0) contributes 0
    log_term1 = zero_entries_in_hist1*1 + (1-zero_entries_in_hist1)*pi1
    entropy_vector1 = -pi1*log(log_term1)
    ht = sum(entropy_vector1)

    sz = nrow(attributes)
    gains = matrix(0, sz, 1)
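    #for each attribute that is still available: partition the current samples
    #by the attribute's values (coded as consecutive integers starting at 1),
    #compute the conditional entropy of the labels given that partition, and
    #record the information gain ht - hxt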
    for(i in 1:nrow(attributes)){
      if(as.numeric(attributes[i,1]) == 1){
        attr_vals = X[,i]
        #total weight of the current samples per distinct value of attribute i
        attr_domain_helper = as.matrix(aggregate(as.vector(X_subset), by=list(as.vector(attr_vals)), FUN=sum))
        attr_domain = as.matrix(attr_domain_helper[,2])

        hxt_vector = matrix(0, nrow(attr_domain), 1)

        for(j in 1:nrow(attr_domain)){
          if(as.numeric(attr_domain[j,1]) != 0){
            val = j
            Tj = X_subset * (X[,i] == val)

            #entropy of the labels within Tj (inlined compute_entropy(Tj, y))
            hist_labels1_helper = as.matrix(aggregate(as.vector(Tj), by=list(as.vector(y)), FUN=sum))
            hist_labels1 = as.matrix(hist_labels1_helper[,2])
            num_samples1 = sum(Tj)
            zero_entries_in_hist = (hist_labels1 == 0)
            piv = hist_labels1/num_samples1
            log_term = zero_entries_in_hist*1 + (1-zero_entries_in_hist)*piv
            entropy_vector = -piv*log(log_term)
            entropy = sum(entropy_vector)

            #weight the entropy of Tj by its share of the current samples
            hxt_vector[j,1] = sum(Tj)/sum(X_subset)*entropy
          }
        }
        hxt = sum(hxt_vector)
        gains[i,1] = (ht - hxt)
      }
    }

    #pick the available attribute with the highest gain
    best_attr = -1
    max_gain = 0
    for(i4 in 1:nrow(gains)){
      if(as.numeric(attributes[i4,1]) == 1){
        g = as.numeric(gains[i4,1])
        if(best_attr == -1 || max_gain <= g){
          max_gain = g
          best_attr = i4
        }
      }
    }
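    #split on best_attr: recompute its value domain over the current samples
    #and remove it from the attributes available to the subtrees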
    attr_vals = X[,best_attr]
    attr_domain_helper = as.matrix(aggregate(as.vector(X_subset), by=list(as.vector(attr_vals)), FUN=sum))
    attr_domain = as.matrix(attr_domain_helper[,2])

    new_attributes = attributes
    new_attributes[best_attr, 1] = 0

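    #recurse on every non-empty value of best_attr and buffer the returned
    #subtrees: child i1 stores its nodes (transposed) and edges in columns
    #start_pt .. start_pt+size-1 of tempNodeStore/tempEdgeStore, where
    #start_pt = 1+(i1-1)*max_sz_subtree reserves max_sz_subtree columns per child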
    max_sz_subtree = 2*sum(X_subset)
    sz2 = nrow(attr_domain)
    sz1 = sz2*max_sz_subtree

    tempNodeStore = matrix(0, 2, sz1)
    tempEdgeStore = matrix(0, 3, sz1)
    numSubtreeNodes = matrix(0, sz2, 1)
    numSubtreeEdges = matrix(0, sz2, 1)

    for(i1 in 1:nrow(attr_domain)){
      Ti = X_subset * (X[,best_attr] == i1)
      num_nodes_Ti = sum(Ti)

      if(num_nodes_Ti > 0){
        tmpRet <- id3_learn(X, y, Ti, new_attributes, minsplit)
        nodesi = as.matrix(tmpRet$a)
        edgesi = as.matrix(tmpRet$b)

        start_pt = 1+(i1-1)*max_sz_subtree
        tempNodeStore[,start_pt:(start_pt+nrow(nodesi)-1)] = t(nodesi)

        numSubtreeNodes[i1,1] = nrow(nodesi)
        #a 1x1 edge matrix holding -1 is the sentinel for a leaf child with no edges
        if(nrow(edgesi)!=1 || ncol(edgesi)!=1 || as.numeric(edgesi[1,1])!=-1){
          tempEdgeStore[,start_pt:(start_pt+nrow(edgesi)-1)] = t(edgesi)
          numSubtreeEdges[i1,1] = nrow(edgesi)
        }else{
          numSubtreeEdges[i1,1] = 0
        }
      }
    }

    num_nodes_in_subtrees = sum(numSubtreeNodes)
    num_edges_in_subtrees = sum(numSubtreeEdges)

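    #assemble the subtree rooted here: node 1 is an inner node labeled with
    #best_attr, followed by the buffered child subtrees; the first
    #sum(numSubtreeNodes > 0) edge rows connect the root (parent index 1, preset
    #by matrix(1, sz, 3)) to its children, the remaining rows hold the child
    #subtree edges with node indices shifted by each subtree's offset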
    #creating root node
    sz = 1+num_nodes_in_subtrees

    nodes = matrix(0, sz, 2)
    nodes[1,1] = best_attr
    numNodes = 1

    #edges from root to children
    sz = sum(numSubtreeNodes > 0) + num_edges_in_subtrees

    edges = matrix(1, sz, 3)
    numEdges = 0
    for(i6 in 1:nrow(attr_domain)){
      num_nodesi = as.numeric(numSubtreeNodes[i6,1])
      if(num_nodesi > 0){
        edges[numEdges+1,2] = i6
        numEdges = numEdges + 1
      }
    }

    nonEmptyAttri = 0
    for(i7 in 1:nrow(attr_domain)){
      numNodesInSubtree = as.numeric(numSubtreeNodes[i7,1])

      if(numNodesInSubtree > 0){
        start_pt1 = 1 + (i7-1)*max_sz_subtree
        nodes[(numNodes+1):(numNodes+numNodesInSubtree),] = t(tempNodeStore[,start_pt1:(start_pt1+numNodesInSubtree-1)])

        numEdgesInSubtree = as.numeric(numSubtreeEdges[i7,1])

        if(numEdgesInSubtree!=0){
          edgesi1 = t(tempEdgeStore[,start_pt1:(start_pt1+numEdgesInSubtree-1)])
          edgesi1[,1] = edgesi1[,1] + numNodes
          edgesi1[,3] = edgesi1[,3] + numNodes

          edges[(numEdges+1):(numEdges+numEdgesInSubtree),] = edgesi1
          numEdges = numEdges + numEdgesInSubtree
        }

        edges[nonEmptyAttri+1,3] = numNodes + 1
        nonEmptyAttri = nonEmptyAttri + 1

        numNodes = numNodes + numNodesInSubtree
      }
    }
  }

  return ( list(a=nodes, b=edges) )
}

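# main script: read the inputs, recode them so that all categorical codes start
# at 1, learn the tree, undo the recoding on the result, and write it out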
X = readMM(paste(args[1], "X.mtx", sep=""))
y = readMM(paste(args[1], "y.mtx", sep=""))

n = nrow(X)
m = ncol(X)

minsplit = 2

# all samples and all attributes are available at the root
X_subset = matrix(1, n, 1)
attributes = matrix(1, m, 1)

# recoding inputs: shift every feature column and the labels so that the
# smallest code becomes 1 (id3_learn indexes categorical values directly)
featureCorrections = as.vector(1 - colMins(as.matrix(X)))
onesMat = matrix(1, n, m)

X = onesMat %*% diag(featureCorrections, nrow=m) + X   #nrow=m keeps diag() an m x m matrix even when m == 1
labelCorrection = 1 - min(y)
y = y + labelCorrection + 0

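# learn the tree on all samples with all attributes available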
tmpRet <- id3_learn(X, y, X_subset, attributes, minsplit)
nodes = as.matrix(tmpRet$a)
edges = as.matrix(tmpRet$b)

# decoding outputs: shift leaf labels and edge values back to the original codes
nodes[,2] = nodes[,2] - labelCorrection * (nodes[,1] == -1)
for(i3 in 1:nrow(edges)){
  e_parent = as.numeric(edges[i3,1])
  parent_feature = as.numeric(nodes[e_parent,1])
  correction = as.numeric(featureCorrections[parent_feature])
  edges[i3,2] = edges[i3,2] - correction
}

writeMM(as(nodes,"CsparseMatrix"), paste(args[2],"nodes", sep=""), format = "text")
writeMM(as(edges,"CsparseMatrix"), paste(args[2],"edges", sep=""), format = "text")