blob: a82ac4e0f19799d744e1f42b785552358b48d4c9 [file] [log] [blame]
#-------------------------------------------------------------
#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
#
#-------------------------------------------------------------
# generates random data for non-negative
# matrix factorization
#
# follows LDA's generative model;
# see the Blei, Ng & Jordan JMLR'03 paper
# titled "Latent Dirichlet Allocation"
#
# $1 is the number of samples
# $2 is the number of features
# $3 is the number of latent factors
# $4 is the number of feature draws per sample
# (draws may hit the same feature more than once).
# use this to vary sparsity.
# $5 is the file to store sample mixtures
# $6 is the file to store factors
# $7 is the file to store generated data
# Problem dimensions, taken from the positional command-line arguments.
numDocuments = $1
numFeatures = $2
numTopics = $3
numWordsPerDoc = $4

# Non-negative samples-by-topics matrix of raw mixture weights.
docTopicMixtures = Rand(rows=numDocuments, cols=numTopics, min=0.0, max=1.0, pdf="uniform", seed=0, sparsity=0.75)

# Per-sample normalizers. A row that sums to zero gets 0.1 as its divisor so
# the division below is well defined (such a row simply stays all-zero).
mixtureTotals = rowSums(docTopicMixtures)
mixtureTotals = mixtureTotals + 0.1 * (mixtureTotals == 0)

# Scale every row by its normalizer in a single matrix-vector operation.
docTopicMixtures = docTopicMixtures / mixtureTotals

# Persist the normalized (not yet cumulative) mixtures.
write(docTopicMixtures, $5, format="binary")
# Replace each row of docTopicMixtures by its running sum across topics, so a
# uniform draw can later be mapped to the first topic whose cumulative weight
# is >= the draw. cumsum() accumulates down columns, hence the two transposes.
# One built-in call avoids a left-indexing update per column and also covers
# numTopics = 1, where 2:numTopics is not a valid ascending column range.
docTopicMixtures = t(cumsum(t(docTopicMixtures)))
# Non-negative topics-by-features matrix of raw factor weights.
topicDistributions = Rand(rows=numTopics, cols=numFeatures, min=0.0, max=1.0, pdf="uniform", seed=0, sparsity=0.75)

# Per-topic normalizers. A topic whose weights sum to zero is divided by 0.1
# instead, which keeps the division well defined and leaves the row all-zero.
topicTotals = rowSums(topicDistributions)
topicTotals = topicTotals + 0.1 * (topicTotals == 0)

# Scale every row by its normalizer in a single matrix-vector operation.
topicDistributions = topicDistributions / topicTotals

# Persist the normalized (not yet cumulative) factors.
write(topicDistributions, $6, format="binary")
# Replace each row of topicDistributions by its running sum across features,
# so a uniform draw can later be mapped to the first feature whose cumulative
# weight is >= the draw. cumsum() accumulates down columns, hence the two
# transposes. One built-in call avoids a left-indexing update per feature
# column and also covers numFeatures = 1, where 2:numFeatures is not a valid
# ascending column range.
topicDistributions = t(cumsum(t(topicDistributions)))
# Build the samples-by-features count matrix: for every sample, perform
# numWordsPerDoc draws; each draw first picks a topic from the sample's
# cumulative mixture row and then a feature from that topic's cumulative
# factor row, and increments the corresponding count.
data = matrix(0, rows=numDocuments, cols=numFeatures)

# Uniform [0,1] draws for the topic choice and the feature choice, one row per
# sample and one column per draw. The two matrices use different fixed seeds:
# with the same seed and shape they would be identical, which would tie every
# feature draw to its topic draw. Generating them once for all samples also
# gives each sample its own draws while keeping the output reproducible.
topicDraws = Rand(rows=numDocuments, cols=numWordsPerDoc, min=0, max=1, pdf="uniform", seed=0)
featureDraws = Rand(rows=numDocuments, cols=numWordsPerDoc, min=0, max=1, pdf="uniform", seed=1)

parfor(i in 1:numDocuments){
    docTopic = docTopicMixtures[i,]
    r_z = topicDraws[i,]
    r_w = featureDraws[i,]

    # Feature counts of the current sample.
    ldata = matrix(0, rows=1, cols=numFeatures)

    for(j in 1:numWordsPerDoc){
        # Topic: first index whose cumulative mixture weight is >= the draw.
        rz = as.scalar(r_z[1,j])
        searching = 1
        z = -1
        for(k1 in 1:numTopics){
            prob = as.scalar(docTopic[1,k1])
            if(searching == 1 & rz <= prob){
                z = k1
                searching = 0
            }
        }
        # No cumulative weight reached the draw (e.g. an all-zero mixture
        # row): report it and fall back to the last topic.
        if(z == -1){
            print("z is unassigned: " + z)
            z = numTopics
        }

        # Feature: first index whose cumulative weight within topic z is >=
        # the draw.
        rw = as.scalar(r_w[1,j])
        searching = 1
        w = -1
        for(k2 in 1:numFeatures){
            prob = as.scalar(topicDistributions[z,k2])
            if(searching == 1 & rw <= prob){
                w = k2
                searching = 0
            }
        }
        # Same fallback as above, to the last feature.
        if(w == -1){
            print("w is unassigned: " + w)
            w = numFeatures
        }

        ldata[1,w] = ldata[1,w] + 1
    }

    data[i,] = ldata
}

write(data, $7, format="binary")