blob: a7350328c0a4d912a912d0cf59671856d5559fb6 [file] [log] [blame]
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction import text
import predictionio
import argparse
# Subset of the 20-newsgroups categories used as the sample corpus
# (10 of the 20 available groups).
categories = ['rec.sport.baseball', 'talk.religion.misc', 'rec.motorcycles', 'rec.sport.hockey',
'comp.sys.ibm.pc.hardware', 'rec.autos', 'comp.graphics', 'talk.politics.misc',
'comp.os.ms-windows.misc', 'sci.med']
# Fetch the training split at import time; shuffled with a fixed seed so
# the entity-id -> document mapping is reproducible across runs.
# NOTE(review): this downloads the dataset on first use — network required.
twenty_train = fetch_20newsgroups(subset = 'train',
shuffle=True,
random_state=10,
categories = categories)
# scikit-learn's built-in English stop-word list (a frozenset of strings).
stop_words = text.ENGLISH_STOP_WORDS
def import_events(client):
    """Send every 20-newsgroups training document to the event server.

    Each document becomes one "documents" event whose properties carry the
    numeric class label (as a float) and the raw text. Entity ids are
    assigned sequentially starting at 1, matching the shuffled dataset order.

    client: a predictionio.EventClient connected to the event server.
    """
    print('Importing data.....')
    count = 0  # stays 0 if the dataset is empty, so the summary still prints
    # Pair each label with its document directly instead of indexing by
    # position; enumerate supplies the 1-based entity id.
    for count, (label, doc) in enumerate(
            zip(twenty_train.target, twenty_train.data), start=1):
        client.create_event(
            event = "documents",
            entity_id = count,
            entity_type = "source",
            properties = {
                "label": float(label),
                "text": doc
            })
    print("Imported {0} events.".format(count))
def import_stopwords(client):
    """Send scikit-learn's English stop-word list to the event server.

    Each stop word becomes one "stopwords" event; entity ids are assigned
    sequentially starting at 1.

    client: a predictionio.EventClient connected to the event server.
    """
    print("Importing stop words.....")
    count = 0  # stays 0 if the stop-word set is empty
    # enumerate replaces the manual counter increment.
    for count, word in enumerate(stop_words, start=1):
        client.create_event(
            event = "stopwords",
            entity_id = count,
            entity_type = "resource",
            properties = {
                "word" : word
            })
    print("Imported {0} stop words.".format(count))
if __name__ == '__main__':
    # Command-line entry point: parse event-server settings, then push the
    # training documents followed by the stop-word list.
    parser = argparse.ArgumentParser(
        description="Import sample data for text manipulation engine")
    # Placeholder default (typo "invald" fixed); pass the app's real key
    # with --access_key — the placeholder is never a valid key.
    parser.add_argument('--access_key', default='invalid_access_key')
    parser.add_argument('--url', default="http://localhost:7070")
    args = parser.parse_args()
    # Async client: 20 worker threads, queue of up to 5000 pending events.
    client = predictionio.EventClient(
        access_key=args.access_key,
        url=args.url,
        threads=20,
        qsize=5000)
    import_events(client)
    import_stopwords(client)