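"""Import the 20 Newsgroups corpus and a stop-word list into a
PredictionIO event server for the text manipulation engine."""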
import argparse
import glob
import os

import predictionio

# Each subdirectory of ./data/20_newsgroups holds the documents for one
# newsgroup category; the directory names double as category labels.
categories = sorted(os.listdir('./data/20_newsgroups'))


def import_events(client):
    """Send every newsgroup document to the event server as a "documents" event."""
    count = 0
    print('Importing data.....')
    for k, category in enumerate(categories):
        cat_files = sorted(glob.glob('./data/20_newsgroups/' + category + '/*'))
        for file_ in cat_files:
            try:
                with open(file_) as f:
                    text = f.read()
                client.create_event(
                    event="documents",
                    entity_id=count,
                    entity_type="source",
                    properties={
                        "label": k,
                        "text": text,
                        "category": category
                    })
                count += 1
            except UnicodeDecodeError:
                # Some corpus files are not valid text in the default
                # encoding; skip them rather than abort the import.
                pass
            except predictionio.NotCreatedError:
                # Skip events the event server refuses to create.
                pass
    print("Imported {0} events.".format(count))


# Stop words are listed one per line; drop empty lines so a trailing
# newline does not get imported as an empty word.
with open('./data/stopwords.txt') as f:
    stop_words = [w for w in f.read().splitlines() if w]


def import_stopwords(client):
    """Send each stop word to the event server as a "stopwords" event."""
    count = 0
    print("Importing stop words.....")
    for elem in stop_words:
        count += 1
        client.create_event(
            event="stopwords",
            entity_id=count,
            entity_type="resource",
            properties={
                "word": elem
            })
    print("Imported {0} stop words.".format(count))


if __name__ == '__main__':
    parser = argparse.ArgumentParser(
        description="Import sample data for text manipulation engine")
    parser.add_argument('--access_key', default='invalid_access_key')
    parser.add_argument('--url', default="http://localhost:7070")
    args = parser.parse_args()

    # threads and qsize size the SDK's internal request pool for the import.
    client = predictionio.EventClient(
        access_key=args.access_key,
        url=args.url,
        threads=20,
        qsize=5000)
    import_events(client)
    import_stopwords(client)
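
# Example invocation (the script name is illustrative; the access key is
# the one printed by `pio app new`):
#
#   python import_eventserver.py --access_key <your_access_key>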