# scripts/datagen/genRandData4NMF.dml - systemds - Git at Google

 #-------------------------------------------------------------
 #
 # Licensed to the Apache Software Foundation (ASF) under one
 # or more contributor license agreements.  See the NOTICE file
 # distributed with this work for additional information
 # regarding copyright ownership.  The ASF licenses this file
 # to you under the Apache License, Version 2.0 (the
 # "License"); you may not use this file except in compliance
 # with the License.  You may obtain a copy of the License at
 #
 #   http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing,
 # software distributed under the License is distributed on an
 # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 # KIND, either express or implied.  See the License for the
 # specific language governing permissions and limitations
 # under the License.
 #
 #-------------------------------------------------------------

 # generates random data for non-negative
 # matrix factorization
 #
 # Follows the generative model of LDA; see
 # Blei, Ng & Jordan, "Latent Dirichlet
 # Allocation", JMLR 2003.
 #
 # $1 is number of samples
 # $2 is number of features
 # $3 is number of latent factors
 # $4 is number of feature draws per sample
 # 	 (the same feature may be drawn more than
 #	 once). Use this to vary sparsity.
 # $5 is file to store sample mixtures
 # $6 is file to store factors
 # $7 is file to store generated data

 # Bind the positional script arguments to the names used below.
 numWordsPerDoc = $4   # $4: feature draws per sample
 numTopics = $3        # $3: latent factors
 numFeatures = $2      # $2: features
 numDocuments = $1     # $1: samples

 # Sample mixtures: a numDocuments x numTopics matrix of uniform(0,1) values
 # generated with sparsity=0.75 (so some cells are exactly zero) and a fixed
 # seed, i.e. the same matrix on every run.
 docTopicMixtures = Rand(rows=numDocuments, cols=numTopics, min=0.0, max=1.0, pdf="uniform", seed=0, sparsity=0.75)

 # Normalize every row to sum to 1. A row that is entirely zero stays zero:
 # its divisor is replaced by 0.1 purely to avoid a 0/0 division.
 denomsTM = rowSums(docTopicMixtures)
 denomsTM = denomsTM + 0.1 * (denomsTM == 0)
 # Dividing a matrix by a column vector is applied row by row, so no
 # per-column loop is needed.
 docTopicMixtures = docTopicMixtures / denomsTM

 # $5 receives the normalized (non-cumulative) mixtures.
 write(docTopicMixtures, $5, format="binary")

 # Convert each row into its running sum over the topics; the sampling code
 # further down uses these cumulative values for inverse-CDF lookups.
 # cumsum() accumulates down columns, hence the two transposes. This form is
 # also correct for numTopics == 1, whereas a for loop over 2:numTopics counts
 # downwards in that case and would index a non-existent column 2.
 docTopicMixtures = t(cumsum(t(docTopicMixtures)))

 # Factors: a numTopics x numFeatures matrix of uniform(0,1) values generated
 # with sparsity=0.75 and a fixed seed.
 topicDistributions = Rand(rows=numTopics, cols=numFeatures, min=0.0, max=1.0, pdf="uniform", seed=0, sparsity=0.75)

 # Normalize every row to sum to 1 in one vectorized step. An all-zero row
 # gets the divisor 0.1 (instead of 0) and therefore stays all zero.
 denomsTD = rowSums(topicDistributions)
 denomsTD = denomsTD + 0.1 * (denomsTD == 0)
 topicDistributions = topicDistributions / denomsTD

 # $6 receives the normalized (non-cumulative) factors.
 write(topicDistributions, $6, format="binary")

 # Convert each row into its running sum over the features for the
 # inverse-CDF lookups in the sampling loop. cumsum() works down columns,
 # hence the transposes. This also handles numFeatures == 1, where a for loop
 # over 2:numFeatures would count downwards and index a missing column 2.
 topicDistributions = t(cumsum(t(topicDistributions)))

 # Generated data: data[i, w] counts how often feature w was drawn for
 # sample i. Starts out as an all-zero matrix.
 data = matrix(0, rows=numDocuments, cols=numFeatures)

 parfor(i in 1:numDocuments){
 	# cumulative topic probabilities of sample i (1 x numTopics)
 	docTopic = docTopicMixtures[i,]

 	# per-sample feature counts, written back to data[i,] at the end
 	ldata = matrix(0, rows=1, cols=numFeatures)

 	# One uniform draw per word for the topic (r_z) and for the feature (r_w).
 	# The seeds are distinct and depend on i: with one shared seed both
 	# vectors would be identical (topic and feature draws perfectly
 	# correlated) and every sample would reuse the same draws. The output
 	# remains reproducible because the seeds are deterministic.
 	seedZ = 2*i - 1
 	seedW = 2*i
 	r_z = Rand(rows=numWordsPerDoc, cols=1, min=0, max=1, pdf="uniform", seed=seedZ)
 	r_w = Rand(rows=numWordsPerDoc, cols=1, min=0, max=1, pdf="uniform", seed=seedW)

 	for(j in 1:numWordsPerDoc){
 		# Topic lookup: first index k with rz <= docTopic[1,k]. Because the
 		# cumulative row is non-decreasing, that index equals the number of
 		# entries strictly below rz, plus one.
 		rz = as.scalar(r_z[j,1])
 		z = as.integer(sum(docTopic < rz)) + 1
 		if(z > numTopics){
 			# no cumulative value reached rz (e.g. an all-zero mixture row):
 			# report it and fall back to the last topic
 			print("z is unassigned: -1")
 			z = numTopics
 		}

 		# Feature lookup within the chosen topic, same scheme as above.
 		rw = as.scalar(r_w[j,1])
 		w = as.integer(sum(topicDistributions[z,] < rw)) + 1
 		if(w > numFeatures){
 			# no cumulative value reached rw: fall back to the last feature
 			print("w is unassigned: -1")
 			w = numFeatures
 		}

 		ldata[1,w] = ldata[1,w] + 1
 	}

 	data[i,] = ldata
 }

 # $7 receives the generated count matrix.
 write(data, $7, format="binary")
	#-------------------------------------------------------------
	#
	# Licensed to the Apache Software Foundation (ASF) under one
	# or more contributor license agreements. See the NOTICE file
	# distributed with this work for additional information
	# regarding copyright ownership. The ASF licenses this file
	# to you under the Apache License, Version 2.0 (the
	# "License"); you may not use this file except in compliance
	# with the License. You may obtain a copy of the License at
	#
	# http://www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing,
	# software distributed under the License is distributed on an
	# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
	# KIND, either express or implied. See the License for the
	# specific language governing permissions and limitations
	# under the License.
	#
	#-------------------------------------------------------------

	# generates random data for non-negative
	# matrix factorization
	#
	# Follows the generative model of LDA; see
	# Blei, Ng & Jordan, "Latent Dirichlet
	# Allocation", JMLR 2003.
	#
	# $1 is number of samples
	# $2 is number of features
	# $3 is number of latent factors
	# $4 is number of feature draws per sample
	#    (the same feature may be drawn more than
	#    once). Use this to vary sparsity.
	# $5 is file to store sample mixtures
	# $6 is file to store factors
	# $7 is file to store generated data

	# Bind the positional script arguments to the names used below.
	numWordsPerDoc = $4   # $4: feature draws per sample
	numTopics = $3        # $3: latent factors
	numFeatures = $2      # $2: features
	numDocuments = $1     # $1: samples

	# Sample mixtures: a numDocuments x numTopics matrix of uniform(0,1) values
	# generated with sparsity=0.75 (so some cells are exactly zero) and a fixed
	# seed, i.e. the same matrix on every run.
	docTopicMixtures = Rand(rows=numDocuments, cols=numTopics, min=0.0, max=1.0, pdf="uniform", seed=0, sparsity=0.75)

	# Normalize every row to sum to 1. A row that is entirely zero stays zero:
	# its divisor is replaced by 0.1 purely to avoid a 0/0 division. The
	# multiplication is written out explicitly; without the "*" operator the
	# expression does not parse.
	denomsTM = rowSums(docTopicMixtures)
	denomsTM = denomsTM + 0.1 * (denomsTM == 0)
	# Dividing a matrix by a column vector is applied row by row, so no
	# per-column loop is needed.
	docTopicMixtures = docTopicMixtures / denomsTM

	# $5 receives the normalized (non-cumulative) mixtures.
	write(docTopicMixtures, $5, format="binary")

	# Convert each row into its running sum over the topics; the sampling code
	# further down uses these cumulative values for inverse-CDF lookups.
	# cumsum() accumulates down columns, hence the two transposes. This form is
	# also correct for numTopics == 1, whereas a for loop over 2:numTopics counts
	# downwards in that case and would index a non-existent column 2.
	docTopicMixtures = t(cumsum(t(docTopicMixtures)))

	# Factors: a numTopics x numFeatures matrix of uniform(0,1) values generated
	# with sparsity=0.75 and a fixed seed.
	topicDistributions = Rand(rows=numTopics, cols=numFeatures, min=0.0, max=1.0, pdf="uniform", seed=0, sparsity=0.75)

	# Normalize every row to sum to 1 in one vectorized step. An all-zero row
	# gets the divisor 0.1 (instead of 0) and therefore stays all zero.
	denomsTD = rowSums(topicDistributions)
	denomsTD = denomsTD + 0.1 * (denomsTD == 0)
	topicDistributions = topicDistributions / denomsTD

	# $6 receives the normalized (non-cumulative) factors.
	write(topicDistributions, $6, format="binary")

	# Convert each row into its running sum over the features for the
	# inverse-CDF lookups in the sampling loop. cumsum() works down columns,
	# hence the transposes. This also handles numFeatures == 1, where a for loop
	# over 2:numFeatures would count downwards and index a missing column 2.
	topicDistributions = t(cumsum(t(topicDistributions)))

	# Generated data: data[i, w] counts how often feature w was drawn for
	# sample i. Starts out as an all-zero matrix.
	data = matrix(0, rows=numDocuments, cols=numFeatures)

	parfor(i in 1:numDocuments){
		# cumulative topic probabilities of sample i (1 x numTopics)
		docTopic = docTopicMixtures[i,]

		# per-sample feature counts, written back to data[i,] at the end
		ldata = matrix(0, rows=1, cols=numFeatures)

		# One uniform draw per word for the topic (r_z) and for the feature (r_w).
		# The seeds are distinct and depend on i: with one shared seed both
		# vectors would be identical (topic and feature draws perfectly
		# correlated) and every sample would reuse the same draws. The output
		# remains reproducible because the seeds are deterministic.
		seedZ = 2*i - 1
		seedW = 2*i
		r_z = Rand(rows=numWordsPerDoc, cols=1, min=0, max=1, pdf="uniform", seed=seedZ)
		r_w = Rand(rows=numWordsPerDoc, cols=1, min=0, max=1, pdf="uniform", seed=seedW)

		for(j in 1:numWordsPerDoc){
			# Topic lookup: first index k with rz <= docTopic[1,k]. Because the
			# cumulative row is non-decreasing, that index equals the number of
			# entries strictly below rz, plus one.
			rz = as.scalar(r_z[j,1])
			z = as.integer(sum(docTopic < rz)) + 1
			if(z > numTopics){
				# no cumulative value reached rz (e.g. an all-zero mixture row):
				# report it and fall back to the last topic
				print("z is unassigned: -1")
				z = numTopics
			}

			# Feature lookup within the chosen topic, same scheme as above.
			rw = as.scalar(r_w[j,1])
			w = as.integer(sum(topicDistributions[z,] < rw)) + 1
			if(w > numFeatures){
				# no cumulative value reached rw: fall back to the last feature
				print("w is unassigned: -1")
				w = numFeatures
			}

			ldata[1,w] = ldata[1,w] + 1
		}

		data[i,] = ldata
	}

	# $7 receives the generated count matrix.
	write(data, $7, format="binary")