#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
#
import os
import re
import zipfile
from tempfile import TemporaryDirectory
import tensorflow as tf
import numpy as np
import random
from math import floor
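# Character-level sequence-to-sequence date normalizer (TensorFlow 1.x, tf.contrib.seq2seq).
# Reads tab-separated "target<TAB>source" pairs from date_train.txt, date_dev.txt and date_test.txt,
# trains an LSTM encoder with a Luong-attention LSTM decoder, and exports the inference graph
# plus the char dictionaries as a zipped SavedModel (normalizer.zip).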
def load_data(file):
    with open(file, encoding="utf-8") as f:
        target = []
        source = []
        for line in f:
            parts = re.split(r'\t+', line)
            target.append(parts[0].strip())
            source.append(parts[1].strip())
    return source, target
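# Encodes a list of strings into a matrix of char ids, zero-padded to the longest string in the list.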
def encode_name(char_dict, names):
    max_length = 0
    for name in names:
        length = len(name)
        if length > max_length:
            max_length = length

    # TODO: To be able to use padding for variable length sequences
    #       pad with the eos marker
    encoded_names = np.zeros((len(names), max_length))
    for bi in range(len(names)):
        for ci in range(len(names[bi])):
            encoded_names[bi, ci] = char_dict[names[bi][ci]]
    return encoded_names
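# Cuts out one batch of target/source strings and returns the encoded char-id matrices
# together with the per-example sequence lengths.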
def mini_batch(target_char_dict, target, source_char_dict, source, batch_size, batch_index):
    # Each batch covers its own contiguous slice of the data (offset by batch_index * batch_size)
    begin = batch_index * batch_size
    end = min(begin + batch_size, len(source))

    target_batch = target[begin:end]
    target_length = []
    for i in range(begin, end):
        target_length.append(len(target[i]) + 1)  # TODO: The correction should be done in the graph ...

    source_batch = source[begin:end]
    source_length = []
    for i in range(begin, end):
        source_length.append(len(source[i]))

    return encode_name(target_char_dict, target_batch), np.asarray(target_length), \
        encode_name(source_char_dict, source_batch), np.asarray(source_length)
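# Builds either the training graph ("TRAIN") or the greedy-decoding inference graph ("EVAL"):
# a char-embedding LSTM encoder feeding a Luong-attention LSTM decoder.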
def create_graph(mode, batch_size, encoder_nchars, max_target_length, decoder_nchars):
    # Hyper parameters
    encoder_char_dim = 100
    num_units = 256

    batch_size_ph = tf.placeholder_with_default(batch_size, shape=(), name="batch_size")

    # Encoder
    encoder_char_ids_ph = tf.placeholder(tf.int32, shape=[None, None], name="encoder_char_ids")
    encoder_lengths_ph = tf.placeholder(tf.int32, shape=[None], name="encoder_lengths")

    encoder_embedding_weights = tf.get_variable(name="char_embeddings", dtype=tf.float32,
                                                shape=[encoder_nchars, encoder_char_dim])
    encoder_emb_inp = tf.nn.embedding_lookup(encoder_embedding_weights, encoder_char_ids_ph)

    if mode == "TRAIN":
        encoder_emb_inp = tf.nn.dropout(encoder_emb_inp, 0.7)

    encoder_emb_inp = tf.transpose(encoder_emb_inp, perm=[1, 0, 2])

    encoder_cell = tf.nn.rnn_cell.BasicLSTMCell(num_units)
    initial_state = encoder_cell.zero_state(batch_size_ph, dtype=tf.float32)

    encoder_outputs, encoder_state = tf.nn.dynamic_rnn(
        encoder_cell, encoder_emb_inp, initial_state=initial_state,
        sequence_length=encoder_lengths_ph,
        time_major=True, swap_memory=True)
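    # Note: sequences are processed time-major ([max_time, batch, dim]); inputs are transposed accordingly.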
    # Decoder
    decoder_char_ids_ph = tf.placeholder(tf.int32, shape=[None, None], name="decoder_char_ids")
    decoder_lengths = tf.placeholder(tf.int32, shape=[None], name="decoder_lengths")

    # decoder output (decoder_input shifted to the left by one)
    decoder_char_dim = 100
    decoder_embedding_weights = tf.get_variable(name="decoder_char_embeddings", dtype=tf.float32,
                                                shape=[decoder_nchars, decoder_char_dim])

    projection_layer = tf.layers.Dense(units=decoder_nchars, use_bias=True)  # To predict one output char at a time ...

    attention_states = tf.transpose(encoder_outputs, [1, 0, 2])
    attention_mechanism = tf.contrib.seq2seq.LuongAttention(
        num_units, attention_states,
        memory_sequence_length=encoder_lengths_ph)

    decoder_cell = tf.nn.rnn_cell.BasicLSTMCell(num_units)
    decoder_cell = tf.contrib.seq2seq.AttentionWrapper(decoder_cell, attention_mechanism,
                                                       attention_layer_size=num_units)

    # decoder_initial_state = encoder_state
    decoder_initial_state = decoder_cell.zero_state(dtype=tf.float32, batch_size=batch_size_ph)
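    # Training uses teacher forcing (TrainingHelper) on the gold target characters;
    # inference feeds predictions back in via GreedyEmbeddingHelper.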
if "TRAIN" == mode:
decoder_input = tf.pad(decoder_char_ids_ph, tf.constant([[0,0], [1,0]]),
'CONSTANT', constant_values=(decoder_nchars-2))
decoder_emb_inp = tf.nn.embedding_lookup(decoder_embedding_weights, decoder_input)
decoder_emb_inp = tf.transpose(decoder_emb_inp, perm=[1, 0, 2])
helper = tf.contrib.seq2seq.TrainingHelper(
decoder_emb_inp, [max_target_length for _ in range(batch_size)], time_major=True)
decoder = tf.contrib.seq2seq.BasicDecoder(decoder_cell, helper,
decoder_initial_state, output_layer=projection_layer)
outputs, _, _ = tf.contrib.seq2seq.dynamic_decode(decoder, output_time_major=True, swap_memory=True )
logits = outputs.rnn_output
train_prediction = outputs.sample_id
decoder_output = tf.pad(tf.transpose(decoder_char_ids_ph, perm=[1, 0]), tf.constant([[0,1], [0,0]]),
'CONSTANT', constant_values=(decoder_nchars-1))
crossent = tf.nn.sparse_softmax_cross_entropy_with_logits(
labels=decoder_output, logits=logits, name="crossent")
loss = tf.reduce_sum(crossent * tf.to_float(decoder_lengths)) / (batch_size * max_target_length)
# Optimizer
# TODO: Tutorial suggest to swap to SGD for alter iterations
# optimizer = tf.train.AdamOptimizer()
optimizer = tf.train.RMSPropOptimizer(learning_rate=0.001)
gradients, v = zip(*optimizer.compute_gradients(loss))
gradients, _ = tf.clip_by_global_norm(gradients, 10.0)
optimize = optimizer.apply_gradients(zip(gradients, v))
return encoder_char_ids_ph, encoder_lengths_ph, decoder_char_ids_ph, decoder_lengths, optimize, train_prediction, outputs
if "EVAL" == mode:
helperE = tf.contrib.seq2seq.GreedyEmbeddingHelper(
decoder_embedding_weights,
tf.fill([batch_size_ph], decoder_nchars-2), decoder_nchars-1)
decoderE = tf.contrib.seq2seq.BasicDecoder(
decoder_cell, helperE, decoder_initial_state,
output_layer=projection_layer)
outputs, _, _ = tf.contrib.seq2seq.dynamic_decode(decoderE, maximum_iterations=20)
translations = tf.identity(outputs.sample_id, name="decode")
return encoder_char_ids_ph, encoder_lengths_ph, translations
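# Builds a char -> id dictionary from the union of all characters in the given strings.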
def encode_chars(names):
    char_set = set()
    for name in names:
        char_set = char_set.union(name)
    return {k: v for v, k in enumerate(char_set)}
# TODO: Deduplicate this, same as in namefinder.py
def write_mapping(tags, output_filename):
    with open(output_filename, 'w', encoding='utf-8') as f:
        for tag in tags:
            f.write('{}\n'.format(tag))
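# Trains for 20 epochs, reports exact-match accuracy on the dev set after each epoch,
# and finally exports the EVAL graph together with the char dictionaries as normalizer.zip.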
def main():
    checkpoints_path = "/tmp/model/checkpoints"

    source_train, target_train = load_data("date_train.txt")
    source_dev, target_dev = load_data("date_dev.txt")
    source_test, target_test = load_data("date_test.txt")

    source_char_dict = encode_chars(source_train + source_dev + source_test)
    source_char_dict[chr(0)] = 0

    target_char_dict = encode_chars(target_train + target_dev + target_test)

    # TODO: Find better chars for begin and end markers
    target_char_dict['S'] = len(target_char_dict)
    target_char_dict['E'] = len(target_char_dict)

    target_dict_rev = {v: k for k, v in target_char_dict.items()}

    batch_size = 20

    # TODO: Don't hard code this ...
    target_max_len = 9
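    # Separate graphs/sessions for training and for greedy-decoding evaluation;
    # weights are handed over through the checkpoint saved after every epoch.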
    train_graph = tf.Graph()
    eval_graph = tf.Graph()

    with train_graph.as_default():
        t_encoder_char_ids_ph, t_encoder_lengths_ph, t_decoder_char_ids_ph, t_decoder_lengths, t_optimize, t_train_prediction, t_dec_out = \
            create_graph("TRAIN", batch_size, len(source_char_dict), target_max_len, len(target_char_dict))
        train_saver = tf.train.Saver()

        train_sess = tf.Session()
        train_sess.run(tf.global_variables_initializer())

    with eval_graph.as_default():
        e_encoder_char_ids_ph, e_encoder_lengths_ph, e_dec_out = \
            create_graph("EVAL", batch_size, len(source_char_dict), target_max_len, len(target_char_dict))
        eval_saver = tf.train.Saver()

    eval_sess = tf.Session(graph=eval_graph)
    for epoch in range(20):
        print("Epoch " + str(epoch))

        with train_graph.as_default():
            for batch_index in range(floor(len(source_train) / batch_size)):
                if batch_index > 0 and batch_index % 100 == 0:
                    print("batch_index " + str(batch_index))

                target_batch, target_length, source_batch, source_length = \
                    mini_batch(target_char_dict, target_train, source_char_dict, source_train, batch_size, batch_index)

                # Char dropout: randomly replace a small fraction of the input char ids with 0
                for i, j in np.ndindex(source_batch.shape):
                    if random.uniform(0, 1) <= 0.0005:
                        source_batch[i][j] = 0

                feed_dict = {t_encoder_lengths_ph: source_length, t_encoder_char_ids_ph: source_batch,
                             t_decoder_lengths: target_length, t_decoder_char_ids_ph: target_batch}
                train_sess.run(t_optimize, feed_dict)

            # Save the train model, and restore it into the eval session
            checkpoint_path = train_saver.save(train_sess, checkpoints_path, global_step=epoch)

        eval_saver.restore(eval_sess, checkpoint_path)
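        # Greedy-decode the dev set and count exact string matches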
        with eval_graph.as_default():
            count_correct = 0

            for batch_index in range(floor(len(source_dev) / batch_size)):
                target_batch, target_length, source_batch, source_length = \
                    mini_batch(target_char_dict, target_dev, source_char_dict, source_dev, batch_size, batch_index)

                begin = batch_index * batch_size
                end = min(begin + batch_size, len(source_dev))
                target_strings = target_dev[begin:end]

                feed_dict = {e_encoder_lengths_ph: source_length, e_encoder_char_ids_ph: source_batch}
                result = eval_sess.run(e_dec_out, feed_dict)

                # Map char ids back to strings, dropping the end marker (id len(target_char_dict) - 1)
                decoded_dates = []
                for coded_date in result:
                    date = ""
                    for char_id in coded_date:
                        if not char_id == len(target_char_dict) - 1:
                            date = date + target_dict_rev[char_id]
                    decoded_dates.append(date)

                for i in range(len(target_strings)):
                    if target_strings[i] == decoded_dates[i]:
                        count_correct = count_correct + 1

            print("Dev: " + str(count_correct / len(target_dev)))
    with TemporaryDirectory() as temp_dir:
        temp_model_dir = temp_dir + "/model"

        with eval_graph.as_default():
            builder = tf.saved_model.builder.SavedModelBuilder(temp_model_dir)
            builder.add_meta_graph_and_variables(eval_sess, [tf.saved_model.tag_constants.SERVING])
            builder.save()

        write_mapping(source_char_dict, temp_model_dir + '/source_char_dict.txt')
        write_mapping(target_char_dict, temp_model_dir + '/target_char_dict.txt')

        with zipfile.ZipFile("normalizer.zip", 'w', zipfile.ZIP_DEFLATED) as zipf:
            for root, dirs, files in os.walk(temp_model_dir):
                for file in files:
                    model_file = os.path.join(root, file)
                    zipf.write(model_file, arcname=os.path.relpath(model_file, temp_model_dir))

if __name__ == "__main__":
    main()