| #------------------------------------------------------------- |
| # |
| # Licensed to the Apache Software Foundation (ASF) under one |
| # or more contributor license agreements. See the NOTICE file |
| # distributed with this work for additional information |
| # regarding copyright ownership. The ASF licenses this file |
| # to you under the Apache License, Version 2.0 (the |
| # "License"); you may not use this file except in compliance |
| # with the License. You may obtain a copy of the License at |
| # |
| # http://www.apache.org/licenses/LICENSE-2.0 |
| # |
| # Unless required by applicable law or agreed to in writing, |
| # software distributed under the License is distributed on an |
| # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| # KIND, either express or implied. See the License for the |
| # specific language governing permissions and limitations |
| # under the License. |
| # |
| #------------------------------------------------------------- |
| |
| # generates random data for non-negative |
| # matrix factorization |
| # |
| # follows LDA's generative model |
| # see Blei, Ng & Jordan, JMLR'03 paper |
| # titled Latent Dirichlet Allocation |
| # |
| # $1 is number of samples |
| # $2 is number of features |
| # $3 is number of latent factors |
| # $4 is number of features per sample |
| # (may overlap). use this to vary |
| # sparsity. |
| # $5 is file to store sample mixtures |
| # $6 is file to store factors |
| # $7 is file to store generated data |
| # |
| # $8 is the blocksize, i.e., number of rows per block |
| # (should be set such that $8x$2 fits in mem budget) |
| |
| # Bind the positional script arguments to named scalars. |
| numDocuments   = $1    # number of samples |
| numFeatures    = $2    # number of features |
| numTopics      = $3    # number of latent factors |
| numWordsPerDoc = $4    # features drawn per sample (draws may repeat) |
| blocksize      = $8    # rows handled per block of the generation loop |
| |
| # Sample mixtures: one row per document, one column per latent factor. |
| # Start from a sparse uniform draw and scale every row to sum to 1. |
| docTopicMixtures = Rand(rows=numDocuments, cols=numTopics, min=0.0, max=1.0, pdf="uniform", seed=0, sparsity=0.75) |
| denomsTM = rowSums(docTopicMixtures) |
| # A row with no non-zero entries has sum 0; substitute 0.1 as its |
| # denominator so the division is defined (the row itself stays all-zero). |
| zerosInDenomsTM = (denomsTM == 0) |
| denomsTM = 0.1*zerosInDenomsTM + (1-zerosInDenomsTM)*denomsTM |
| # Matrix / column-vector division is applied to every column at once, |
| # so no per-column loop is needed. |
| docTopicMixtures = docTopicMixtures / denomsTM |
| # The normalised mixtures are what gets stored. |
| write(docTopicMixtures, $5, format="binary") |
| # From here on docTopicMixtures holds row-wise running sums (column j is |
| # the sum of columns 1..j). cumsum runs down columns, hence the two |
| # transposes. This also works unchanged when numTopics is 1. |
| docTopicMixtures = t(cumsum(t(docTopicMixtures))) |
| |
| # Factors: one row per latent factor, one column per feature. |
| # Start from a sparse uniform draw and scale every row to sum to 1. |
| topicDistributions = Rand(rows=numTopics, cols=numFeatures, min=0.0, max=1.0, pdf="uniform", seed=0, sparsity=0.75) |
| denomsTD = rowSums(topicDistributions) |
| # A row with no non-zero entries has sum 0; substitute 0.1 as its |
| # denominator so the division is defined (the row itself stays all-zero). |
| zerosInDenomsTD = (denomsTD == 0) |
| denomsTD = 0.1*zerosInDenomsTD + (1-zerosInDenomsTD)*denomsTD |
| # Matrix / column-vector division scales all rows in one operation |
| # instead of one parfor iteration per row. |
| topicDistributions = topicDistributions / denomsTD |
| # The normalised factors are what gets stored. |
| write(topicDistributions, $6, format="binary") |
| # From here on topicDistributions holds row-wise running sums (column j |
| # is the sum of columns 1..j). cumsum runs down columns, hence the two |
| # transposes; this replaces a loop with one iteration per feature. |
| topicDistributions = t(cumsum(t(topicDistributions))) |
| |
| # Count matrix to fill: one row per document, one column per feature, |
| # initially all zeros. |
| data0 = matrix(0, rows=numDocuments, cols=numFeatures) |
| |
| # Outer loop for blockwise computation. Each pass handles the documents |
| # k .. k+len-1. At this point docTopicMixtures and topicDistributions |
| # hold row-wise running sums, so each row can be scanned as a CDF. |
| for( k in seq(1,numDocuments,blocksize) ) |
| { |
|   # Rows k..numDocuments are still open, i.e. numDocuments-k+1 rows; |
|   # take at most blocksize of them. len is always >= 1. |
|   len = min(blocksize, numDocuments-k+1); |
|   last = k + len - 1; |
|   data = data0[k:last,]; # obtain block (exactly len rows) |
| |
|   parfor(i in 1:len){ |
|     # Row i of the block is document docIdx of the full data set; the |
|     # mixture row must be looked up with the global index. |
|     docIdx = k + i - 1 |
|     docTopic = docTopicMixtures[docIdx,] |
| |
|     # Distinct, deterministic seeds per document and per purpose, so |
|     # the factor draws and the feature draws are different streams |
|     # while the output stays reproducible across runs. |
|     seedZ = 2*docIdx - 1 |
|     seedW = 2*docIdx |
|     r_z = Rand(rows=numWordsPerDoc, cols=1, min=0, max=1, pdf="uniform", seed=seedZ) |
|     r_w = Rand(rows=numWordsPerDoc, cols=1, min=0, max=1, pdf="uniform", seed=seedW) |
| |
|     for(j in 1:numWordsPerDoc){ |
|       # Pick latent factor z: first column whose running sum reaches rz. |
|       rz = as.scalar(r_z[j,1]) |
|       searching = 1 |
|       z = -1 |
|       for(k1 in 1:numTopics){ |
|         prob = as.scalar(docTopic[1,k1]) |
|         if(searching==1 & rz <= prob){ |
|           z = k1 |
|           searching = 0 |
|         } |
|       } |
| |
|       # No column reached rz (e.g. an all-zero row): use the last factor. |
|       if(z==-1){ |
|         print("z is unassigned: " + z) |
|         z = numTopics |
|       } |
| |
|       # Pick feature w the same way from row z of the factor matrix. |
|       rw = as.scalar(r_w[j,1]) |
|       searching = 1 |
|       w = -1 |
|       for(k2 in 1:numFeatures){ |
|         prob = as.scalar(topicDistributions[z,k2]) |
|         if(searching == 1 & rw <= prob){ |
|           w = k2 |
|           searching = 0 |
|         } |
|       } |
| |
|       # No column reached rw: use the last feature. |
|       if(w==-1){ |
|         print("w is unassigned: " + w) |
|         w = numFeatures |
|       } |
| |
|       # One more occurrence of feature w in this document. |
|       data[i,w] = data[i,w] + 1 |
|     } |
|   } |
| |
|   data0[k:last,] = data; # write block back |
| } |
| |
| write(data0, $7, format="binary") |