import argparse

import predictionio
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction import text
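
# Import a sample of the 20 Newsgroups training set, plus scikit-learn's
# built-in English stop word list, into a PredictionIO event server for
# the text manipulation engine.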
categories = ['rec.sport.baseball', 'talk.religion.misc', 'rec.motorcycles',
              'rec.sport.hockey', 'comp.sys.ibm.pc.hardware', 'rec.autos',
              'comp.graphics', 'talk.politics.misc', 'comp.os.ms-windows.misc',
              'sci.med']

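# Fetch the training split of 20 Newsgroups, restricted to the categories
# above, with a fixed shuffle seed for reproducibility.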
twenty_train = fetch_20newsgroups(subset='train',
                                  shuffle=True,
                                  random_state=10,
                                  categories=categories)
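# ENGLISH_STOP_WORDS is a frozenset of common English words bundled with
# scikit-learn; they are sent to the engine as 'stopwords' events below.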
stop_words = text.ENGLISH_STOP_WORDS


def import_events(client):
    """Send every training document to the event server as a 'documents' event."""
    # Pair each document with its class label (cast to float for the engine).
    train = zip(map(float, twenty_train.target), twenty_train.data)
    count = 0
    print('Importing data.....')
    # Unpack into label/doc ('doc' avoids shadowing the imported text module).
    for label, doc in train:
        count += 1
        client.create_event(
            event="documents",
            entity_id=str(count),  # entity IDs are sent as strings
            entity_type="source",
            properties={
                "label": label,
                "text": doc
            })
    print("Imported {0} events.".format(count))


def import_stopwords(client):
    """Send each stop word to the event server as a 'stopwords' event."""
    count = 0
    print("Importing stop words.....")
    for word in stop_words:
        count += 1
        client.create_event(
            event="stopwords",
            entity_id=str(count),
            entity_type="resource",
            properties={
                "word": word
            })
    print("Imported {0} stop words.".format(count))


if __name__ == '__main__':
    parser = argparse.ArgumentParser(
        description="Import sample data for text manipulation engine")
    parser.add_argument('--access_key', default='invalid_access_key')
    parser.add_argument('--url', default="http://localhost:7070")
    args = parser.parse_args()

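    # threads/qsize size the SDK client's request thread pool and internal
    # queue (see predictionio.EventClient).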
    client = predictionio.EventClient(
        access_key=args.access_key,
        url=args.url,
        threads=20,
        qsize=5000)

    import_events(client)
    import_stopwords(client)
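
    # Not in the original script: close() waits for pending requests and
    # releases the client's HTTP connections (assumes the SDK BaseClient's
    # close() method).
    client.close()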