blob: 0faaa6feb021e9818237ae11012efa54445a5bb9 [file] [log] [blame]
#include <iostream>
#include <set>
#include <stdlib.h>
using namespace std;
int sampleWord(int topicid) {
int rword = random() % 100;
rword = rword + topicid * 100;
if (rword == 0) rword = 1;
return rword;
}
void sampleTopicDistrn(float * ret) {
float total = 0;
for (int i=0; i!=10; i++) {
float nextr = random() * 1.0 / RAND_MAX;
total += nextr;
if (total >= 1.0) {
ret[i] = 1;
for (int j=i+1; j<10; j++) ret[j] = 1;
break;
}
if (i == 9 && total < 1.0) { ret[i] = 9; break; }
ret[i] = total;
}
}
void printTopicDistrn(float * ret) {
cout << "(";
for (int i=0; i!=9; i++)
cout << ret[i] << ",";
cout << ret[9] << ")";
}
int sampleTopic(float * ret) {
float r = random() * 1.0 / RAND_MAX;
int topic = 0;
while (r >= ret[topic]) topic++;
return topic;
}
set<int> alphabet;
int maxword = 0;
int main() {
int numdocs, numwords;;
float ret[10];
cerr << "Please enter number of documents required.\n";
cin >> numdocs;
cerr << "Please enter number of words per document required.\n";
cin >> numwords;
cout << "DROP TABLE IF EXISTS madlib.plda_mycorpus;\n";
cout << "CREATE TABLE madlib.plda_mycorpus ( id int4, contents int4[] ) DISTRIBUTED BY (id);\n";
cout << "INSERT INTO madlib.plda_mycorpus VALUES \n";
for (int i=0; i!=numdocs; i++) {
sampleTopicDistrn(ret);
// printTopicDistrn(ret);
// cout << endl;
cout << " (" << i << ", '{";
for (int j=0; j!=numwords; j++) {
int topic = sampleTopic(ret);
int word = sampleWord(topic);
alphabet.insert(word);
if (word > maxword) maxword = word;
//cout << "(" << topic << "," << word << ") ";
cout << word;
if (j == numwords-1) cout << "}')";
else cout << ",";
}
if (i == numdocs-1) cout << ";\n";
else cout << ",\n";
}
cout << endl << endl;
cout << "DROP TABLE IF EXISTS madlib.plda_mydict;\n";
cout << "CREATE TABLE madlib.plda_mydict ( dict text[] ) DISTRIBUTED RANDOMLY;\n";
cout << "insert into madlib.plda_mydict values \n";
cout << " ('{";
for (int i=1; i!=maxword; i++)
cout << i << ",";
cout << maxword << "}');\n";
/*
set<int>::iterator p = alphabet.begin();
while (p != alphabet.end()) {
cout << *p;
if (++p == alphabet.end()) cout << "}');\n";
else cout << ",";
}
*/
}