# scripts/datagen/genRandData4NMFBlockwise.dml - systemds - Git at Google

 #-------------------------------------------------------------
 #
 # Licensed to the Apache Software Foundation (ASF) under one
 # or more contributor license agreements.  See the NOTICE file
 # distributed with this work for additional information
 # regarding copyright ownership.  The ASF licenses this file
 # to you under the Apache License, Version 2.0 (the
 # "License"); you may not use this file except in compliance
 # with the License.  You may obtain a copy of the License at
 #
 #   http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing,
 # software distributed under the License is distributed on an
 # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 # KIND, either express or implied.  See the License for the
 # specific language governing permissions and limitations
 # under the License.
 #
 #-------------------------------------------------------------

 # generates random data for non-negative
 # matrix factorization
 #
 # follows lda's generative model
 # see Blei, Ng & Jordan, JMLR'03 paper
 # titled Latent Dirichlet Allocation
 #
 # $1 is number of samples
 # $2 is number of features
 # $3 is number of latent factors
 # $4 is number of features per sample
 # 	 (may overlap). use this to vary
 #	 sparsity.
 # $5 is file to store sample mixtures
 # $6 is file to store factors
 # $7 is file to store generated data
 #
 # $8 is the blocksize, i.e., number of rows per block
 #    (should be set such that $8x$2 fits in mem budget)

 # ---- script arguments --------------------------------------------------
 numDocuments = $1    # number of samples
 numFeatures = $2     # number of features
 numTopics = $3       # number of latent factors
 numWordsPerDoc = $4  # number of feature draws per sample
 blocksize = $8       # number of rows handled per block

 # ---- per-sample mixtures over the latent factors -----------------------
 # Draw a sparse, non-negative numDocuments x numTopics matrix and scale
 # every row so that it sums to 1.
 docTopicMixtures = Rand(rows=numDocuments, cols=numTopics, min=0.0, max=1.0, pdf="uniform", seed=0, sparsity=0.75)
 denomsTM = rowSums(docTopicMixtures)
 # A row that is entirely zero has sum 0; use 0.1 as its divisor so the
 # division stays well defined (the row itself remains all zero).
 denomsTM = denomsTM + 0.1 * (denomsTM == 0)
 # Dividing the matrix by the column vector of row sums normalizes all
 # columns at once instead of one column per parfor iteration.
 docTopicMixtures = docTopicMixtures / denomsTM
 write(docTopicMixtures, $5, format="binary")

 # From here on docTopicMixtures holds, per row, the running totals of the
 # normalized mixture; they are used as thresholds when sampling a factor.
 # cumsum() accumulates down columns, hence the two transposes. Unlike a
 # 2:numTopics column loop this also works when numTopics is 1.
 docTopicMixtures = t(cumsum(t(docTopicMixtures)))

 # ---- per-factor distributions over the features ------------------------
 # Draw a sparse, non-negative numTopics x numFeatures matrix and scale
 # every row so that it sums to 1.
 topicDistributions = Rand(rows=numTopics, cols=numFeatures, min=0.0, max=1.0, pdf="uniform", seed=0, sparsity=0.75)
 denomsTD = rowSums(topicDistributions)
 # An all-zero row gets 0.1 as divisor (0 + 0.1) so the division below is
 # well defined; such a row stays all zero.
 denomsTD = denomsTD + 0.1 * (denomsTD == 0)
 # Matrix / column-vector division normalizes all rows in one operation
 # instead of one row per parfor iteration.
 topicDistributions = topicDistributions / denomsTD
 write(topicDistributions, $6, format="binary")

 # From here on topicDistributions holds, per row, the running totals of
 # the normalized distribution; they are used as thresholds when sampling
 # a feature. cumsum() accumulates down columns, hence the two transposes.
 # Unlike a 2:numFeatures column loop this also works for a single feature.
 topicDistributions = t(cumsum(t(topicDistributions)))

 # ---- blockwise generation of the count matrix --------------------------
 # data0 starts as an all-zero numDocuments x numFeatures matrix; each
 # sample row receives numWordsPerDoc increments (+1 per sampled feature).
 data0 = Rand(rows=numDocuments, cols=numFeatures, min=0, max=0, pdf="uniform")

 #outer-loop for blockwise computation
 for( k in seq(1,numDocuments,blocksize) )
 {
   # k is the first row of the block. Rows k..numDocuments are still to be
   # processed, which is numDocuments-k+1 rows, so the block covers rows
   # k..k+len-1. (Using numDocuments-k and k:(k+len) skipped the last row.)
   len = min(blocksize,numDocuments-k+1); #block length
   kEnd = k+len-1;                        #last row of this block
   data = data0[k:kEnd,];                 #obtain block

   parfor(i in 1:len){
     # i is the row inside the block; the matching sample in the full
     # matrices is row k+i-1.
     docTopic = docTopicMixtures[k+i-1,]

     # NOTE(review): both draws use seed=0, so r_z and r_w look identical
     # and the same for every sample - confirm whether that is intended.
     r_z = Rand(rows=numWordsPerDoc, cols=1, min=0, max=1, pdf="uniform", seed=0)
     r_w = Rand(rows=numWordsPerDoc, cols=1, min=0, max=1, pdf="uniform", seed=0)

     for(j in 1:numWordsPerDoc){
       # --- pick latent factor z: first column whose running total in
       # docTopic is >= the random draw rz
       rz = as.scalar(r_z[j,1])
       continue = 1   # 1 until the first match has been taken

       z = -1
       #this is a workaround
       #z=1

       for(k1 in 1:numTopics){
         prob = as.scalar(docTopic[1,k1])
         if(continue==1 & rz <= prob){
           z=k1
           continue=0
         }
       }

       # no threshold reached rz: report it and fall back to the last factor
       if(z==-1){
         print("z is unassigned: " + z)
         z = numTopics
       }

       # --- pick feature w the same way from row z of topicDistributions
       rw = as.scalar(r_w[j,1])
       continue = 1

       w = -1
       #this is a workaround
       #w = 1

       for(k2 in 1:numFeatures){
         prob = as.scalar(topicDistributions[z,k2])
         if(continue == 1 & rw <= prob){
           w = k2
           continue = 0
         }
       }

       # no threshold reached rw: report it and fall back to the last feature
       if(w==-1){
         print("w is unassigned: " + w)
         w = numFeatures
       }

       # count one occurrence of feature w for this sample
       data[i,w] = data[i,w] + 1
     }
   }

   data0[k:kEnd,] = data; # write block back
 }

 write(data0, $7, format="binary")
	#-------------------------------------------------------------
	#
	# Licensed to the Apache Software Foundation (ASF) under one
	# or more contributor license agreements. See the NOTICE file
	# distributed with this work for additional information
	# regarding copyright ownership. The ASF licenses this file
	# to you under the Apache License, Version 2.0 (the
	# "License"); you may not use this file except in compliance
	# with the License. You may obtain a copy of the License at
	#
	# http://www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing,
	# software distributed under the License is distributed on an
	# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
	# KIND, either express or implied. See the License for the
	# specific language governing permissions and limitations
	# under the License.
	#
	#-------------------------------------------------------------

	# generates random data for non-negative
	# matrix factorization
	#
	# follows lda's generative model
	# see Blei, Ng & Jordan, JMLR'03 paper
	# titled Latent Dirichlet Allocation
	#
	# $1 is number of samples
	# $2 is number of features
	# $3 is number of latent factors
	# $4 is number of features per sample
	# (may overlap). use this to vary
	# sparsity.
	# $5 is file to store sample mixtures
	# $6 is file to store factors
	# $7 is file to store generated data
	#
	# $8 is the blocksize, i.e., number of rows per block
	# (should be set such that $8x$2 fits in mem budget)

	# ---- script arguments --------------------------------------------------
	numDocuments = $1    # number of samples
	numFeatures = $2     # number of features
	numTopics = $3       # number of latent factors
	numWordsPerDoc = $4  # number of feature draws per sample
	blocksize = $8       # number of rows handled per block

	# ---- per-sample mixtures over the latent factors -----------------------
	# Draw a sparse, non-negative numDocuments x numTopics matrix and scale
	# every row so that it sums to 1.
	docTopicMixtures = Rand(rows=numDocuments, cols=numTopics, min=0.0, max=1.0, pdf="uniform", seed=0, sparsity=0.75)
	denomsTM = rowSums(docTopicMixtures)
	# A row that is entirely zero has sum 0; use 0.1 as its divisor so the
	# division stays well defined (the row itself remains all zero).
	# The multiplication operator is written out explicitly here.
	denomsTM = denomsTM + 0.1 * (denomsTM == 0)
	# Dividing the matrix by the column vector of row sums normalizes all
	# columns at once instead of one column per parfor iteration.
	docTopicMixtures = docTopicMixtures / denomsTM
	write(docTopicMixtures, $5, format="binary")

	# From here on docTopicMixtures holds, per row, the running totals of the
	# normalized mixture; they are used as thresholds when sampling a factor.
	# cumsum() accumulates down columns, hence the two transposes. Unlike a
	# 2:numTopics column loop this also works when numTopics is 1.
	docTopicMixtures = t(cumsum(t(docTopicMixtures)))

	# ---- per-factor distributions over the features ------------------------
	# Draw a sparse, non-negative numTopics x numFeatures matrix and scale
	# every row so that it sums to 1.
	topicDistributions = Rand(rows=numTopics, cols=numFeatures, min=0.0, max=1.0, pdf="uniform", seed=0, sparsity=0.75)
	denomsTD = rowSums(topicDistributions)
	# An all-zero row gets 0.1 as divisor (0 + 0.1) so the division below is
	# well defined; such a row stays all zero.
	denomsTD = denomsTD + 0.1 * (denomsTD == 0)
	# Matrix / column-vector division normalizes all rows in one operation
	# instead of one row per parfor iteration.
	topicDistributions = topicDistributions / denomsTD
	write(topicDistributions, $6, format="binary")

	# From here on topicDistributions holds, per row, the running totals of
	# the normalized distribution; they are used as thresholds when sampling
	# a feature. cumsum() accumulates down columns, hence the two transposes.
	# Unlike a 2:numFeatures column loop this also works for a single feature.
	topicDistributions = t(cumsum(t(topicDistributions)))

	# ---- blockwise generation of the count matrix --------------------------
	# data0 starts as an all-zero numDocuments x numFeatures matrix; each
	# sample row receives numWordsPerDoc increments (+1 per sampled feature).
	data0 = Rand(rows=numDocuments, cols=numFeatures, min=0, max=0, pdf="uniform")

	#outer-loop for blockwise computation
	for( k in seq(1,numDocuments,blocksize) )
	{
	  # k is the first row of the block. Rows k..numDocuments are still to be
	  # processed, which is numDocuments-k+1 rows, so the block covers rows
	  # k..k+len-1. (Using numDocuments-k and k:(k+len) skipped the last row.)
	  len = min(blocksize,numDocuments-k+1); #block length
	  kEnd = k+len-1;                        #last row of this block
	  data = data0[k:kEnd,];                 #obtain block

	  parfor(i in 1:len){
	    # i is the row inside the block; the matching sample in the full
	    # matrices is row k+i-1.
	    docTopic = docTopicMixtures[k+i-1,]

	    # NOTE(review): both draws use seed=0, so r_z and r_w look identical
	    # and the same for every sample - confirm whether that is intended.
	    r_z = Rand(rows=numWordsPerDoc, cols=1, min=0, max=1, pdf="uniform", seed=0)
	    r_w = Rand(rows=numWordsPerDoc, cols=1, min=0, max=1, pdf="uniform", seed=0)

	    for(j in 1:numWordsPerDoc){
	      # --- pick latent factor z: first column whose running total in
	      # docTopic is >= the random draw rz
	      rz = as.scalar(r_z[j,1])
	      continue = 1   # 1 until the first match has been taken

	      z = -1
	      #this is a workaround
	      #z=1

	      for(k1 in 1:numTopics){
	        prob = as.scalar(docTopic[1,k1])
	        if(continue==1 & rz <= prob){
	          z=k1
	          continue=0
	        }
	      }

	      # no threshold reached rz: report it and fall back to the last factor
	      if(z==-1){
	        print("z is unassigned: " + z)
	        z = numTopics
	      }

	      # --- pick feature w the same way from row z of topicDistributions
	      rw = as.scalar(r_w[j,1])
	      continue = 1

	      w = -1
	      #this is a workaround
	      #w = 1

	      for(k2 in 1:numFeatures){
	        prob = as.scalar(topicDistributions[z,k2])
	        if(continue == 1 & rw <= prob){
	          w = k2
	          continue = 0
	        }
	      }

	      # no threshold reached rw: report it and fall back to the last feature
	      if(w==-1){
	        print("w is unassigned: " + w)
	        w = numFeatures
	      }

	      # count one occurrence of feature w for this sample
	      data[i,w] = data[i,w] + 1
	    }
	  }

	  data0[k:kEnd,] = data; # write block back
	}

	write(data0, $7, format="binary")