#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
#
import os
import random
import urllib.request

import numpy as np

download_dir = "/tmp/"
def check_exist_or_download(url):
    ''' download the file at url into download_dir if it is not already cached '''
name = url.rsplit('/', 1)[-1]
filename = os.path.join(download_dir, name)
if not os.path.isfile(filename):
print("Downloading %s" % url)
urllib.request.urlretrieve(url, filename)
return filename
def unzip_data(download_dir, data_zip):
data_dir = download_dir + "insuranceQA-master/V2/"
if not os.path.exists(data_dir):
print("extracting %s to %s" % (download_dir, data_dir))
from zipfile import ZipFile
with ZipFile(data_zip, 'r') as zipObj:
zipObj.extractall(download_dir)
return data_dir
def get_label2answer(data_dir):
import gzip
label2answer = dict()
with gzip.open(data_dir +
"/InsuranceQA.label2answer.token.encoded.gz") as fin:
for line in fin:
pair = line.decode().strip().split("\t")
idxs = pair[1].split(" ")
idxs = [int(idx.replace("idx_", "")) for idx in idxs]
label2answer[int(pair[0])] = idxs
return label2answer
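# Illustrative (made-up) entry of the mapping built by get_label2answer: each
# answer label maps to the answer sentence as a list of word indices, e.g.
# label2answer[17] -> [305, 12, 4887, ...]; decode the indices with idx2word.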
pad_idx = 0
pad_string = "<pad>"
pad_embed = np.zeros((300,))
insuranceqa_train_filename = "/InsuranceQA.question.anslabel.token.100.pool.solr.train.encoded.gz"
insuranceqa_test_filename = "/InsuranceQA.question.anslabel.token.100.pool.solr.test.encoded.gz"
insuranceQA_url = "https://github.com/shuzi/insuranceQA/archive/master.zip"
insuranceQA_cache_fp = download_dir + "insuranceQA_cache.pickle"
google_news_pretrain_embeddings_link = "https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz"
def get_idx2word(data_dir):
idx2word = dict()
with open(data_dir + "vocabulary", encoding="utf-8") as vc_f:
for line in vc_f:
pair = line.strip().split("\t")
idx = int(pair[0].replace("idx_", ""))
idx2word[idx] = pair[1]
# add padding string to idx2word lookup
idx2word[pad_idx] = pad_string
return idx2word
def get_train_raw(data_dir, data_filename):
    ''' deserialize a gzipped training/test data file
    args:
        data_dir: directory containing the data file
        data_filename: name of the data file inside data_dir
    return:
        train_raw: list of QnA tuples, one per sample. Each tuple has
            3 fields:
            0: question sentence as word indices; use idx2word to decode
                and idx2vec to get the embeddings.
            1: answer labels; each label refers to an answer sentence,
                decoded via label2answer.
            2: top-K candidate answer labels, used as negative answers
                during training.
    '''
train_raw = []
import gzip
with gzip.open(data_dir + data_filename) as fin:
for line in fin:
tpl = line.decode().strip().split("\t")
question = [
int(idx.replace("idx_", "")) for idx in tpl[1].split(" ")
]
ans = [int(label) for label in tpl[2].split(" ")]
candis = [int(label) for label in tpl[3].split(" ")]
train_raw.append((question, ans, candis))
return train_raw
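# A hedged sketch of one deserialized sample from get_train_raw (the index
# values below are made up for illustration, not taken from the dataset):
#   question = [412, 88, 9031]        # idx-encoded question words
#   ans      = [991, 1204]            # labels of ground-truth answers
#   candis   = [214, 991, 1873, ...]  # labels of negative candidate answers
# so train_raw[i] == (question, ans, candis).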
def limit_encode_train(train_raw, label2answer, idx2word, q_seq_limit,
ans_seq_limit, idx2vec):
    ''' convert raw training samples into padded word-vector sequences of a
    fixed length
    return:
        questions_encoded: np ndarray of shape
            (number of samples, seq length, vector size)
        poss_encoded: same layout, sequences for the positive answers
        negs_encoded: same layout, sequences for the negative answers
    '''
questions = [question for question, answers, candis in train_raw]
# choose 1 answer from answer pool
poss = [
label2answer[random.choice(answers)]
for question, answers, candis in train_raw
]
# choose 1 candidate from candidate pool
negs = [
label2answer[random.choice(candis)]
for question, answers, candis in train_raw
]
    # filter out words that have no embedding in idx2vec
questions_filtered = [
[idx for idx in q if idx in idx2vec] for q in questions
]
poss_filtered = [[idx for idx in ans if idx in idx2vec] for ans in poss]
negs_filtered = [[idx for idx in ans if idx in idx2vec] for ans in negs]
    # crop to the sequence limit and pad with pad_idx (0)
questions_crop = [
q[:q_seq_limit] + [0] * max(0, q_seq_limit - len(q))
for q in questions_filtered
]
poss_crop = [
ans[:ans_seq_limit] + [0] * max(0, ans_seq_limit - len(ans))
for ans in poss_filtered
]
negs_crop = [
ans[:ans_seq_limit] + [0] * max(0, ans_seq_limit - len(ans))
for ans in negs_filtered
]
    # encode: map each word idx to its word vector
questions_encoded = [[idx2vec[idx] for idx in q] for q in questions_crop]
poss_encoded = [[idx2vec[idx] for idx in ans] for ans in poss_crop]
negs_encoded = [[idx2vec[idx] for idx in ans] for ans in negs_crop]
    # convert to numpy ndarrays
questions_encoded = np.array(questions_encoded).astype(np.float32)
poss_encoded = np.array(poss_encoded).astype(np.float32)
negs_encoded = np.array(negs_encoded).astype(np.float32)
return questions_encoded, poss_encoded, negs_encoded
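# Hedged shape note for limit_encode_train: with illustrative limits
# q_seq_limit=10 and ans_seq_limit=50 (assumed values, not mandated by this
# module), the returned arrays have shapes
#   questions_encoded: (len(train_raw), 10, 300)
#   poss_encoded:      (len(train_raw), 50, 300)
#   negs_encoded:      (len(train_raw), 50, 300)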
def get_idx2vec_weights(wv, idx2word):
idx2vec = {k: wv[v] for k, v in idx2word.items() if v in wv}
# add padding embedding (all zeros) to idx2vec lookup
idx2vec[pad_idx] = pad_embed
return idx2vec
def prepare_data(use_cache=True):
import pickle
if not os.path.isfile(insuranceQA_cache_fp) or not use_cache:
        # no cache found (or cache disabled), preprocess data from scratch
print("prepare data from scratch")
        # get pretrained word vectors
from gensim.models.keyedvectors import KeyedVectors
google_news_pretrain_fp = check_exist_or_download(
google_news_pretrain_embeddings_link)
wv = KeyedVectors.load_word2vec_format(google_news_pretrain_fp,
binary=True)
# prepare insurance QA dataset
data_zip = check_exist_or_download(insuranceQA_url)
data_dir = unzip_data(download_dir, data_zip)
label2answer = get_label2answer(data_dir)
idx2word = get_idx2word(data_dir)
idx2vec = get_idx2vec_weights(wv, idx2word)
train_raw = get_train_raw(data_dir, insuranceqa_train_filename)
test_raw = get_train_raw(data_dir, insuranceqa_test_filename)
with open(insuranceQA_cache_fp, 'wb') as handle:
pickle.dump((train_raw, test_raw, label2answer, idx2word, idx2vec),
handle,
protocol=pickle.HIGHEST_PROTOCOL)
else:
# load from cached pickle
with open(insuranceQA_cache_fp, 'rb') as handle:
(train_raw, test_raw, label2answer, idx2word,
idx2vec) = pickle.load(handle)
return train_raw, test_raw, label2answer, idx2word, idx2vec
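# Typical entry point; the first call downloads and preprocesses everything,
# subsequent calls reuse the pickle cache at insuranceQA_cache_fp:
#   train_raw, test_raw, label2answer, idx2word, idx2vec = prepare_data()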
def limit_encode_eval(train_raw,
label2answer,
idx2word,
q_seq_limit,
ans_seq_limit,
idx2vec,
top_k_candi_limit=6):
    ''' convert raw samples into padded word-vector sequences for evaluation
    return:
        questions_encoded: np ndarray of shape
            (number of samples, seq length, vector size)
        candi_pools_encoded: np ndarray of shape
            (number of samples, top_k_candi_limit, seq length, vector size);
            each pool mixes the ground-truth answers with negative candidates
        ans_count: list of the number of ground-truth answers per sample
    '''
questions = [question for question, answers, candis in train_raw]
    # combine ground-truth and candidate answer labels into one pool per sample
candi_pools = [
list(answers + candis)[:top_k_candi_limit]
for question, answers, candis in train_raw
]
assert all([len(pool) == top_k_candi_limit for pool in candi_pools])
ans_count = [len(answers) for question, answers, candis in train_raw]
assert all([c > 0 for c in ans_count])
# encode ans
candi_pools_encoded = [[label2answer[candi_label]
for candi_label in pool]
for pool in candi_pools]
    # filter out words that have no embedding in idx2vec
questions_filtered = [
[idx for idx in q if idx in idx2vec] for q in questions
]
candi_pools_filtered = [[[idx
for idx in candi_encoded
if idx in idx2vec]
for candi_encoded in pool]
for pool in candi_pools_encoded]
    # crop to the sequence limit and pad with pad_idx (0)
questions_crop = [
q[:q_seq_limit] + [0] * max(0, q_seq_limit - len(q))
for q in questions_filtered
]
candi_pools_crop = [[
candi[:ans_seq_limit] + [0] * max(0, ans_seq_limit - len(candi))
for candi in pool
]
for pool in candi_pools_filtered]
    # encode: map each word idx to its word vector
questions_encoded = [[idx2vec[idx] for idx in q] for q in questions_crop]
candi_pools_encoded = [[[idx2vec[idx]
for idx in candi]
for candi in pool]
for pool in candi_pools_crop]
questions_encoded = np.array(questions_encoded).astype(np.float32)
candi_pools_encoded = np.array(candi_pools_encoded).astype(np.float32)
    # candi_pools_encoded shape:
    # (number of QnA samples,
    #  number of candidates per pool,
    #  number of word idx per candidate sequence,
    #  300-dim word embedding per word idx)
    # e.g. 10 QnA samples to test,
    #      5 candidate answers per question,
    #      8 words per answer,
    #      300-dim vector per word
return questions_encoded, candi_pools_encoded, ans_count
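

# Minimal end-to-end usage sketch; the sequence limits below are illustrative
# assumptions, not values mandated by this module. Note that the first run
# downloads the GoogleNews embeddings and the insuranceQA archive, which can
# take a while.
if __name__ == "__main__":
    train_raw, test_raw, label2answer, idx2word, idx2vec = prepare_data()
    q_train, pos_train, neg_train = limit_encode_train(train_raw,
                                                       label2answer,
                                                       idx2word,
                                                       q_seq_limit=10,
                                                       ans_seq_limit=50,
                                                       idx2vec=idx2vec)
    print("train:", q_train.shape, pos_train.shape, neg_train.shape)
    q_test, candi_pools, ans_count = limit_encode_eval(test_raw,
                                                       label2answer,
                                                       idx2word,
                                                       q_seq_limit=10,
                                                       ans_seq_limit=50,
                                                       idx2vec=idx2vec)
    print("eval:", q_test.shape, candi_pools.shape, len(ans_count))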