#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
#
import os
import random
import urllib.request

import numpy as np

download_dir = "/tmp/"
def check_exist_or_download(url):
    ''' download the file at url into download_dir if it is not already cached '''
name = url.rsplit('/', 1)[-1]
filename = os.path.join(download_dir, name)
if not os.path.isfile(filename):
print("Downloading %s" % url)
urllib.request.urlretrieve(url, filename)
return filename
def unzip_data(download_dir, data_zip):
data_dir = download_dir + "insuranceQA-master/V2/"
if not os.path.exists(data_dir):
print("extracting %s to %s" % (download_dir, data_dir))
from zipfile import ZipFile
with ZipFile(data_zip, 'r') as zipObj:
zipObj.extractall(download_dir)
return data_dir
def get_label2answer(data_dir):
import gzip
label2answer = dict()
with gzip.open(data_dir +
"/InsuranceQA.label2answer.token.encoded.gz") as fin:
for line in fin:
pair = line.decode().strip().split("\t")
idxs = pair[1].split(" ")
idxs = [int(idx.replace("idx_", "")) for idx in idxs]
label2answer[int(pair[0])] = idxs
return label2answer
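# Illustrative (made-up) entry of the mapping built by get_label2answer: each
# answer label maps to the answer sentence as a list of word indices, e.g.
# label2answer[17] -> [305, 12, 4887, ...]; decode the indices with idx2word.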
pad_idx = 0
pad_string = "<pad>"
pad_embed = np.zeros((300,))
insuranceqa_train_filename = "/InsuranceQA.question.anslabel.token.100.pool.solr.train.encoded.gz"
insuranceqa_test_filename = "/InsuranceQA.question.anslabel.token.100.pool.solr.test.encoded.gz"
insuranceQA_url = "https://github.com/shuzi/insuranceQA/archive/master.zip"
insuranceQA_cache_fp = download_dir + "insuranceQA_cache.pickle"
google_news_pretrain_embeddings_link = "https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz"
def get_idx2word(data_dir):
idx2word = dict()
with open(data_dir + "vocabulary", encoding="utf-8") as vc_f:
for line in vc_f:
pair = line.strip().split("\t")
idx = int(pair[0].replace("idx_", ""))
idx2word[idx] = pair[1]
# add padding string to idx2word lookup
idx2word[pad_idx] = pad_string
return idx2word
def get_train_raw(data_dir, data_filename):
    ''' deserialize a gzipped training/test data file
    args:
        data_dir: directory containing the data file
        data_filename: name of the data file inside data_dir
    return:
        train_raw: list of QnA tuples, one per sample. Each tuple has
            3 fields:
            0: question sentence as word indices; use idx2word to decode
                and idx2vec to get the embeddings.
            1: answer labels; each label refers to an answer sentence,
                decoded via label2answer.
            2: top-K candidate answer labels, used as negative answers
                during training.
    '''
train_raw = []
import gzip
with gzip.open(data_dir + data_filename) as fin:
for line in fin:
tpl = line.decode().strip().split("\t")
question = [
int(idx.replace("idx_", "")) for idx in tpl[1].split(" ")
]
ans = [int(label) for label in tpl[2].split(" ")]
candis = [int(label) for label in tpl[3].split(" ")]
train_raw.append((question, ans, candis))
return train_raw
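# A hedged sketch of one deserialized sample from get_train_raw (the index
# values below are made up for illustration, not taken from the dataset):
#   question = [412, 88, 9031]        # idx-encoded question words
#   ans      = [991, 1204]            # labels of ground-truth answers
#   candis   = [214, 991, 1873, ...]  # labels of negative candidate answers
# so train_raw[i] == (question, ans, candis).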
def limit_encode_train(train_raw, label2answer, idx2word, q_seq_limit,
ans_seq_limit, idx2vec):
    ''' convert raw training samples into padded word-vector sequences of a
    fixed length
    return:
        questions_encoded: np ndarray of shape
            (number of samples, seq length, vector size)
        poss_encoded: same layout, sequences for the positive answers
        negs_encoded: same layout, sequences for the negative answers
    '''
questions = [question for question, answers, candis in train_raw]
# choose 1 answer from answer pool
poss = [
label2answer[random.choice(answers)]
for question, answers, candis in train_raw
]
# choose 1 candidate from candidate pool
negs = [
label2answer[random.choice(candis)]
for question, answers, candis in train_raw
]
    # filter out words that have no embedding in idx2vec
questions_filtered = [
[idx for idx in q if idx in idx2vec] for q in questions
]
poss_filtered = [[idx for idx in ans if idx in idx2vec] for ans in poss]
negs_filtered = [[idx for idx in ans if idx in idx2vec] for ans in negs]
    # crop to the sequence limit and pad with pad_idx (0)
questions_crop = [
q[:q_seq_limit] + [0] * max(0, q_seq_limit - len(q))
for q in questions_filtered
]
poss_crop = [
ans[:ans_seq_limit] + [0] * max(0, ans_seq_limit - len(ans))
for ans in poss_filtered
]
negs_crop = [
ans[:ans_seq_limit] + [0] * max(0, ans_seq_limit - len(ans))
for ans in negs_filtered
]
    # encode: map each word idx to its word vector
questions_encoded = [[idx2vec[idx] for idx in q] for q in questions_crop]
poss_encoded = [[idx2vec[idx] for idx in ans] for ans in poss_crop]
negs_encoded = [[idx2vec[idx] for idx in ans] for ans in negs_crop]
    # convert to numpy ndarrays
questions_encoded = np.array(questions_encoded).astype(np.float32)
poss_encoded = np.array(poss_encoded).astype(np.float32)
negs_encoded = np.array(negs_encoded).astype(np.float32)
return questions_encoded, poss_encoded, negs_encoded
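# Hedged shape note for limit_encode_train: with illustrative limits
# q_seq_limit=10 and ans_seq_limit=50 (assumed values, not mandated by this
# module), the returned arrays have shapes
#   questions_encoded: (len(train_raw), 10, 300)
#   poss_encoded:      (len(train_raw), 50, 300)
#   negs_encoded:      (len(train_raw), 50, 300)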
def get_idx2vec_weights(wv, idx2word):
idx2vec = {k: wv[v] for k, v in idx2word.items() if v in wv}
# add padding embedding (all zeros) to idx2vec lookup
idx2vec[pad_idx] = pad_embed
return idx2vec
def prepare_data(use_cache=True):
import pickle
if not os.path.isfile(insuranceQA_cache_fp) or not use_cache:
        # no cache found (or cache disabled), preprocess data from scratch
print("prepare data from scratch")
        # get pretrained word vectors
from gensim.models.keyedvectors import KeyedVectors
google_news_pretrain_fp = check_exist_or_download(
google_news_pretrain_embeddings_link)
wv = KeyedVectors.load_word2vec_format(google_news_pretrain_fp,
binary=True)
# prepare insurance QA dataset
data_zip = check_exist_or_download(insuranceQA_url)
data_dir = unzip_data(download_dir, data_zip)
label2answer = get_label2answer(data_dir)
idx2word = get_idx2word(data_dir)
idx2vec = get_idx2vec_weights(wv, idx2word)
train_raw = get_train_raw(data_dir, insuranceqa_train_filename)
test_raw = get_train_raw(data_dir, insuranceqa_test_filename)
with open(insuranceQA_cache_fp, 'wb') as handle:
pickle.dump((train_raw, test_raw, label2answer, idx2word, idx2vec),
handle,
protocol=pickle.HIGHEST_PROTOCOL)
else:
# load from cached pickle
with open(insuranceQA_cache_fp, 'rb') as handle:
(train_raw, test_raw, label2answer, idx2word,
idx2vec) = pickle.load(handle)
return train_raw, test_raw, label2answer, idx2word, idx2vec
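# Typical entry point; the first call downloads and preprocesses everything,
# subsequent calls reuse the pickle cache at insuranceQA_cache_fp:
#   train_raw, test_raw, label2answer, idx2word, idx2vec = prepare_data()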
def limit_encode_eval(train_raw,
label2answer,
idx2word,
q_seq_limit,
ans_seq_limit,
idx2vec,
top_k_candi_limit=6):
    ''' convert raw samples into padded word-vector sequences for evaluation
    return:
        questions_encoded: np ndarray of shape
            (number of samples, seq length, vector size)
        candi_pools_encoded: np ndarray of shape
            (number of samples, top_k_candi_limit, seq length, vector size);
            each pool mixes the ground-truth answers with negative candidates
        ans_count: list of the number of ground-truth answers per sample
    '''
questions = [question for question, answers, candis in train_raw]
    # combine ground-truth and candidate answer labels into one pool per sample
candi_pools = [
list(answers + candis)[:top_k_candi_limit]
for question, answers, candis in train_raw
]
assert all([len(pool) == top_k_candi_limit for pool in candi_pools])
ans_count = [len(answers) for question, answers, candis in train_raw]
assert all([c > 0 for c in ans_count])
# encode ans
candi_pools_encoded = [[label2answer[candi_label]
for candi_label in pool]
for pool in candi_pools]
    # filter out words that have no embedding in idx2vec
questions_filtered = [
[idx for idx in q if idx in idx2vec] for q in questions
]
candi_pools_filtered = [[[idx
for idx in candi_encoded
if idx in idx2vec]
for candi_encoded in pool]
for pool in candi_pools_encoded]
    # crop to the sequence limit and pad with pad_idx (0)
questions_crop = [
q[:q_seq_limit] + [0] * max(0, q_seq_limit - len(q))
for q in questions_filtered
]
candi_pools_crop = [[
candi[:ans_seq_limit] + [0] * max(0, ans_seq_limit - len(candi))
for candi in pool
]
for pool in candi_pools_filtered]
    # encode: map each word idx to its word vector
questions_encoded = [[idx2vec[idx] for idx in q] for q in questions_crop]
candi_pools_encoded = [[[idx2vec[idx]
for idx in candi]
for candi in pool]
for pool in candi_pools_crop]
questions_encoded = np.array(questions_encoded).astype(np.float32)
candi_pools_encoded = np.array(candi_pools_encoded).astype(np.float32)
    # candi_pools_encoded shape:
    # (number of QnA samples,
    #  number of candidates per pool,
    #  number of word idx per candidate sequence,
    #  300-dim word embedding per word idx)
    # e.g. 10 QnA samples to test,
    #      5 candidate answers per question,
    #      8 words per answer,
    #      300-dim vector per word
return questions_encoded, candi_pools_encoded, ans_count
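

# Minimal end-to-end usage sketch; the sequence limits below are illustrative
# assumptions, not values mandated by this module. Note that the first run
# downloads the GoogleNews embeddings and the insuranceQA archive, which can
# take a while.
if __name__ == "__main__":
    train_raw, test_raw, label2answer, idx2word, idx2vec = prepare_data()
    q_train, pos_train, neg_train = limit_encode_train(train_raw,
                                                       label2answer,
                                                       idx2word,
                                                       q_seq_limit=10,
                                                       ans_seq_limit=50,
                                                       idx2vec=idx2vec)
    print("train:", q_train.shape, pos_train.shape, neg_train.shape)
    q_test, candi_pools, ans_count = limit_encode_eval(test_raw,
                                                       label2answer,
                                                       idx2word,
                                                       q_seq_limit=10,
                                                       ans_seq_limit=50,
                                                       idx2vec=idx2vec)
    print("eval:", q_test.shape, candi_pools.shape, len(ans_count))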