.. Licensed to the Apache Software Foundation (ASF) under one
or more contributor license agreements. See the NOTICE file
distributed with this work for additional information
regarding copyright ownership. The ASF licenses this file
to you under the Apache License, Version 2.0 (the
"License"); you may not use this file except in compliance
with the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing,
software distributed under the License is distributed on an
"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
KIND, either express or implied. See the License for the
specific language governing permissions and limitations
under the License.
Google Neural Machine Translation
=================================
In this notebook, we are going to train Google NMT (GNMT) on the IWSLT 2015
English-Vietnamese dataset. The building process includes four steps: 1)
load and preprocess the dataset, 2) create the sampler and DataLoader, 3) build
the model, and 4) write the training epochs.
Load MXNet and Gluon
--------------------
.. code:: python
import warnings
warnings.filterwarnings('ignore')
import argparse
import time
import random
import os
import logging
import numpy as np
import mxnet as mx
from mxnet import gluon
import gluonnlp as nlp
import nmt
Hyper-parameters
----------------
.. code:: python
np.random.seed(100)
random.seed(100)
mx.random.seed(10000)
ctx = mx.gpu(0)
# parameters for dataset
dataset = 'IWSLT2015'
src_lang, tgt_lang = 'en', 'vi'
src_max_len, tgt_max_len = 50, 50
# parameters for model
num_hidden = 512
num_layers = 2
num_bi_layers = 1
dropout = 0.2
# parameters for training
batch_size, test_batch_size = 128, 32
num_buckets = 5
epochs = 1
clip = 5
lr = 0.001
lr_update_factor = 0.5
log_interval = 10
save_dir = 'gnmt_en_vi_u512'
#parameters for testing
beam_size = 10
lp_alpha = 1.0
lp_k = 5
nmt.utils.logging_config(save_dir)
Load and Preprocess Dataset
---------------------------
The following shows how to process the dataset and cache the processed
dataset for future use. The processing steps include: 1) clipping the source
and target sequences, 2) splitting the string input into a list of tokens, 3)
mapping each string token to its integer index in the vocabulary, and 4)
appending the end-of-sentence (EOS) token to the source sentence and adding the
BOS and EOS tokens to the target sentence.
.. code:: python
def cache_dataset(dataset, prefix):
"""Cache the processed npy dataset the dataset into a npz
Parameters
----------
dataset : gluon.data.SimpleDataset
        prefix : str
"""
if not os.path.exists(nmt._constants.CACHE_PATH):
os.makedirs(nmt._constants.CACHE_PATH)
src_data = np.array([ele[0] for ele in dataset])
tgt_data = np.array([ele[1] for ele in dataset])
np.savez(os.path.join(nmt._constants.CACHE_PATH, prefix + '.npz'), src_data=src_data, tgt_data=tgt_data)
def load_cached_dataset(prefix):
cached_file_path = os.path.join(nmt._constants.CACHE_PATH, prefix + '.npz')
if os.path.exists(cached_file_path):
print('Load cached data from {}'.format(cached_file_path))
dat = np.load(cached_file_path)
return gluon.data.ArrayDataset(np.array(dat['src_data']), np.array(dat['tgt_data']))
else:
return None
class TrainValDataTransform(object):
"""Transform the machine translation dataset.
        Clip the source and target sentences to the maximum length. For the source sentence, append the
        EOS token. For the target sentence, prepend the BOS token and append the EOS token.
Parameters
----------
src_vocab : Vocab
tgt_vocab : Vocab
src_max_len : int
tgt_max_len : int
"""
def __init__(self, src_vocab, tgt_vocab, src_max_len, tgt_max_len):
self._src_vocab = src_vocab
self._tgt_vocab = tgt_vocab
self._src_max_len = src_max_len
self._tgt_max_len = tgt_max_len
def __call__(self, src, tgt):
if self._src_max_len > 0:
src_sentence = self._src_vocab[src.split()[:self._src_max_len]]
else:
src_sentence = self._src_vocab[src.split()]
if self._tgt_max_len > 0:
tgt_sentence = self._tgt_vocab[tgt.split()[:self._tgt_max_len]]
else:
tgt_sentence = self._tgt_vocab[tgt.split()]
src_sentence.append(self._src_vocab[self._src_vocab.eos_token])
tgt_sentence.insert(0, self._tgt_vocab[self._tgt_vocab.bos_token])
tgt_sentence.append(self._tgt_vocab[self._tgt_vocab.eos_token])
src_npy = np.array(src_sentence, dtype=np.int32)
tgt_npy = np.array(tgt_sentence, dtype=np.int32)
return src_npy, tgt_npy
def process_dataset(dataset, src_vocab, tgt_vocab, src_max_len=-1, tgt_max_len=-1):
start = time.time()
dataset_processed = dataset.transform(TrainValDataTransform(src_vocab, tgt_vocab,
src_max_len,
tgt_max_len), lazy=False)
end = time.time()
print('Processing time spent: {}'.format(end - start))
return dataset_processed
def load_translation_data(dataset, src_lang='en', tgt_lang='vi'):
"""Load translation dataset
Parameters
----------
dataset : str
src_lang : str, default 'en'
tgt_lang : str, default 'vi'
Returns
-------
data_train_processed : Dataset
The preprocessed training sentence pairs
data_val_processed : Dataset
The preprocessed validation sentence pairs
data_test_processed : Dataset
The preprocessed test sentence pairs
val_tgt_sentences : list
The target sentences in the validation set
test_tgt_sentences : list
The target sentences in the test set
src_vocab : Vocab
Vocabulary of the source language
tgt_vocab : Vocab
Vocabulary of the target language
"""
common_prefix = 'IWSLT2015_{}_{}_{}_{}'.format(src_lang, tgt_lang,
src_max_len, tgt_max_len)
data_train = nlp.data.IWSLT2015('train', src_lang=src_lang, tgt_lang=tgt_lang)
data_val = nlp.data.IWSLT2015('val', src_lang=src_lang, tgt_lang=tgt_lang)
data_test = nlp.data.IWSLT2015('test', src_lang=src_lang, tgt_lang=tgt_lang)
src_vocab, tgt_vocab = data_train.src_vocab, data_train.tgt_vocab
data_train_processed = load_cached_dataset(common_prefix + '_train')
if not data_train_processed:
data_train_processed = process_dataset(data_train, src_vocab, tgt_vocab,
src_max_len, tgt_max_len)
cache_dataset(data_train_processed, common_prefix + '_train')
data_val_processed = load_cached_dataset(common_prefix + '_val')
if not data_val_processed:
data_val_processed = process_dataset(data_val, src_vocab, tgt_vocab)
cache_dataset(data_val_processed, common_prefix + '_val')
data_test_processed = load_cached_dataset(common_prefix + '_test')
if not data_test_processed:
data_test_processed = process_dataset(data_test, src_vocab, tgt_vocab)
cache_dataset(data_test_processed, common_prefix + '_test')
fetch_tgt_sentence = lambda src, tgt: tgt.split()
val_tgt_sentences = list(data_val.transform(fetch_tgt_sentence))
test_tgt_sentences = list(data_test.transform(fetch_tgt_sentence))
return data_train_processed, data_val_processed, data_test_processed, \
val_tgt_sentences, test_tgt_sentences, src_vocab, tgt_vocab
def get_data_lengths(dataset):
return list(dataset.transform(lambda srg, tgt: (len(srg), len(tgt))))
data_train, data_val, data_test, val_tgt_sentences, test_tgt_sentences, src_vocab, tgt_vocab\
= load_translation_data(dataset=dataset, src_lang=src_lang, tgt_lang=tgt_lang)
data_train_lengths = get_data_lengths(data_train)
data_val_lengths = get_data_lengths(data_val)
data_test_lengths = get_data_lengths(data_test)
with open(os.path.join(save_dir, 'val_gt.txt'), 'w', encoding='utf-8') as of:
for ele in val_tgt_sentences:
of.write(' '.join(ele) + '\n')
with open(os.path.join(save_dir, 'test_gt.txt'), 'w', encoding='utf-8') as of:
for ele in test_tgt_sentences:
of.write(' '.join(ele) + '\n')
data_train = data_train.transform(lambda src, tgt: (src, tgt, len(src), len(tgt)), lazy=False)
data_val = gluon.data.SimpleDataset([(ele[0], ele[1], len(ele[0]), len(ele[1]), i)
for i, ele in enumerate(data_val)])
data_test = gluon.data.SimpleDataset([(ele[0], ele[1], len(ele[0]), len(ele[1]), i)
for i, ele in enumerate(data_test)])
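As an optional sanity check, we can peek at one processed training example. Each
element of ``data_train`` is now a tuple of (source indices, target indices,
source length, target length); the exact index values depend on the vocabulary,
so the printout below is only illustrative.

.. code:: python

    # A minimal sketch: inspect one processed training sample.
    src_ids, tgt_ids, src_len, tgt_len = data_train[0]
    print('source length = {}, target length = {}'.format(src_len, tgt_len))
    print('first source indices:', src_ids[:5])
    print('first target indices:', tgt_ids[:5])   # begins with the BOS index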
Create Sampler and DataLoader
-----------------------------
Now that we have obtained ``data_train``, ``data_val``, and ``data_test``,
the next step is to construct the sampler and DataLoader. We first
construct the batchify function, which pads and stacks sequences to form
mini-batches.
.. code:: python
train_batchify_fn = nlp.data.batchify.Tuple(nlp.data.batchify.Pad(),
nlp.data.batchify.Pad(),
nlp.data.batchify.Stack(dtype='float32'),
nlp.data.batchify.Stack(dtype='float32'))
test_batchify_fn = nlp.data.batchify.Tuple(nlp.data.batchify.Pad(),
nlp.data.batchify.Pad(),
nlp.data.batchify.Stack(dtype='float32'),
nlp.data.batchify.Stack(dtype='float32'),
nlp.data.batchify.Stack())
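To see what these batchify functions do, we can run ``Pad`` and ``Stack`` on a
toy batch. The snippet below is a minimal sketch and assumes ``Pad``'s default
padding value.

.. code:: python

    # Sketch: Pad aligns variable-length sequences into one padded array,
    # while Stack simply stacks equal-shape inputs along a new batch axis.
    toy_seqs = [np.array([1, 2, 3], dtype='int32'), np.array([4, 5], dtype='int32')]
    print(nlp.data.batchify.Pad()(toy_seqs))     # shape (2, 3); the shorter row is padded
    print(nlp.data.batchify.Stack()([6, 7, 8]))  # shape (3,)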
We can then construct bucketing samplers, which generate batches by
grouping sequences with similar lengths. Here, the bucketing scheme is
empirically determined.
.. code:: python
bucket_scheme = nlp.data.ExpWidthBucket(bucket_len_step=1.2)
train_batch_sampler = nlp.data.FixedBucketSampler(lengths=data_train_lengths,
batch_size=batch_size,
num_buckets=num_buckets,
shuffle=True,
bucket_scheme=bucket_scheme)
logging.info('Train Batch Sampler:\n{}'.format(train_batch_sampler.stats()))
val_batch_sampler = nlp.data.FixedBucketSampler(lengths=data_val_lengths,
batch_size=test_batch_size,
num_buckets=num_buckets,
shuffle=False)
logging.info('Valid Batch Sampler:\n{}'.format(val_batch_sampler.stats()))
test_batch_sampler = nlp.data.FixedBucketSampler(lengths=data_test_lengths,
batch_size=test_batch_size,
num_buckets=num_buckets,
shuffle=False)
logging.info('Test Batch Sampler:\n{}'.format(test_batch_sampler.stats()))
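Each element yielded by a ``FixedBucketSampler`` is a list of dataset indices
whose sequences fall into the same length bucket. The following optional sketch
draws a single batch of indices from the training sampler.

.. code:: python

    # Sketch: inspect one batch of indices produced by the bucketing sampler.
    for batch_indices in train_batch_sampler:
        print('batch size = {}'.format(len(batch_indices)))
        print('first few indices:', batch_indices[:5])
        break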
Given the samplers, we can create the DataLoaders, which are iterable.
.. code:: python
train_data_loader = gluon.data.DataLoader(data_train,
batch_sampler=train_batch_sampler,
batchify_fn=train_batchify_fn,
num_workers=4)
val_data_loader = gluon.data.DataLoader(data_val,
batch_sampler=val_batch_sampler,
batchify_fn=test_batchify_fn,
num_workers=4)
test_data_loader = gluon.data.DataLoader(data_test,
batch_sampler=test_batch_sampler,
batchify_fn=test_batchify_fn,
num_workers=4)
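As an optional check, we can fetch one mini-batch from ``train_data_loader`` and
inspect the shapes produced by the batchify function.

.. code:: python

    # Sketch: each training mini-batch is
    # (padded source, padded target, source valid lengths, target valid lengths).
    for src_seq, tgt_seq, src_valid_length, tgt_valid_length in train_data_loader:
        print('src_seq:', src_seq.shape, 'tgt_seq:', tgt_seq.shape)
        print('src_valid_length:', src_valid_length.shape,
              'tgt_valid_length:', tgt_valid_length.shape)
        break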
Build GNMT Model
----------------
After obtaining the DataLoaders, we can build the model. The GNMT encoder and
decoder can be constructed by calling the
``get_gnmt_encoder_decoder`` function. Then, we feed the encoder and
decoder into ``NMTModel`` to construct the GNMT model. ``model.hybridize``
allows computation to be done using the symbolic backend.
.. code:: python
encoder, decoder = nmt.gnmt.get_gnmt_encoder_decoder(hidden_size=num_hidden,
dropout=dropout,
num_layers=num_layers,
num_bi_layers=num_bi_layers)
model = nmt.translation.NMTModel(src_vocab=src_vocab, tgt_vocab=tgt_vocab, encoder=encoder, decoder=decoder,
embed_size=num_hidden, prefix='gnmt_')
model.initialize(init=mx.init.Uniform(0.1), ctx=ctx)
static_alloc = True
model.hybridize(static_alloc=static_alloc)
logging.info(model)
# Due to the paddings, we need to mask out the losses corresponding to padding tokens.
loss_function = nmt.loss.SoftmaxCEMaskedLoss()
loss_function.hybridize(static_alloc=static_alloc)
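Before training, it can be helpful to run a dummy forward pass to confirm the
expected output shapes. The snippet below is a minimal sketch that uses
arbitrary token ids (assumed to be valid vocabulary indices); real batches come
from ``train_data_loader``.

.. code:: python

    # Sketch: dummy forward pass and masked loss on arbitrary token ids.
    dummy_src = mx.nd.ones((2, 7), ctx=ctx)      # (batch_size, src_length)
    dummy_tgt = mx.nd.ones((2, 6), ctx=ctx)      # (batch_size, tgt_length)
    dummy_src_len = mx.nd.array([7, 5], ctx=ctx)
    dummy_tgt_len = mx.nd.array([6, 4], ctx=ctx)
    # Teacher forcing: feed tgt[:, :-1] and score against tgt[:, 1:], as in training below.
    out, _ = model(dummy_src, dummy_tgt[:, :-1], dummy_src_len, dummy_tgt_len - 1)
    print(out.shape)        # expected: (2, 5, len(tgt_vocab))
    dummy_loss = loss_function(out, dummy_tgt[:, 1:], dummy_tgt_len - 1)
    print(dummy_loss.shape) # one masked loss value per sequence in the batch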
We also build the beam search translator.
.. code:: python
translator = nmt.translation.BeamSearchTranslator(model=model, beam_size=beam_size,
scorer=nlp.model.BeamSearchScorer(alpha=lp_alpha,
K=lp_k),
max_length=tgt_max_len + 100)
logging.info('Use beam_size={}, alpha={}, K={}'.format(beam_size, lp_alpha, lp_k))
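The translator can be exercised right away to check the plumbing, although its
output is meaningless until the model has been trained. The snippet below is a
minimal sketch that translates a single, already-tokenized sentence.

.. code:: python

    # Sketch: translate one tokenized English sentence with beam search.
    sample = 'I love reading books .'
    sample_ids = src_vocab[sample.split()] + [src_vocab[src_vocab.eos_token]]
    sample_src = mx.nd.array(sample_ids, ctx=ctx).reshape((1, -1))
    sample_src_len = mx.nd.array([sample_src.shape[1]], ctx=ctx)
    samples, scores, sample_valid_length = translator.translate(
        src_seq=sample_src, src_valid_length=sample_src_len)
    best = samples[0, 0, :].asnumpy()
    best_len = int(sample_valid_length[0, 0].asscalar())
    # Strip the BOS and EOS tokens, as in the ``evaluate`` function defined below.
    print(' '.join(tgt_vocab.idx_to_token[int(idx)] for idx in best[1:best_len - 1]))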
We define the evaluation function as follows. The ``evaluate`` function uses
the beam search translator to generate outputs for the validation and
testing datasets.
.. code:: python
def evaluate(data_loader):
"""Evaluate given the data loader
Parameters
----------
data_loader : gluon.data.DataLoader
Returns
-------
avg_loss : float
Average loss
real_translation_out : list of list of str
The translation output
"""
translation_out = []
all_inst_ids = []
avg_loss_denom = 0
avg_loss = 0.0
for _, (src_seq, tgt_seq, src_valid_length, tgt_valid_length, inst_ids) \
in enumerate(data_loader):
src_seq = src_seq.as_in_context(ctx)
tgt_seq = tgt_seq.as_in_context(ctx)
src_valid_length = src_valid_length.as_in_context(ctx)
tgt_valid_length = tgt_valid_length.as_in_context(ctx)
# Calculating Loss
out, _ = model(src_seq, tgt_seq[:, :-1], src_valid_length, tgt_valid_length - 1)
loss = loss_function(out, tgt_seq[:, 1:], tgt_valid_length - 1).mean().asscalar()
all_inst_ids.extend(inst_ids.asnumpy().astype(np.int32).tolist())
avg_loss += loss * (tgt_seq.shape[1] - 1)
avg_loss_denom += (tgt_seq.shape[1] - 1)
# Translate
samples, _, sample_valid_length =\
translator.translate(src_seq=src_seq, src_valid_length=src_valid_length)
max_score_sample = samples[:, 0, :].asnumpy()
sample_valid_length = sample_valid_length[:, 0].asnumpy()
for i in range(max_score_sample.shape[0]):
translation_out.append(
[tgt_vocab.idx_to_token[ele] for ele in
max_score_sample[i][1:(sample_valid_length[i] - 1)]])
avg_loss = avg_loss / avg_loss_denom
real_translation_out = [None for _ in range(len(all_inst_ids))]
for ind, sentence in zip(all_inst_ids, translation_out):
real_translation_out[ind] = sentence
return avg_loss, real_translation_out
def write_sentences(sentences, file_path):
with open(file_path, 'w', encoding='utf-8') as of:
for sent in sentences:
of.write(' '.join(sent) + '\n')
Training Epochs
---------------
Before entering the training stage, we need to create a trainer for
updating the parameters. In the following example, we create a trainer
that uses the Adam optimizer.
.. code:: python
trainer = gluon.Trainer(model.collect_params(), 'adam', {'learning_rate': lr})
We can then write the training loop. During training, we evaluate on
the validation and testing datasets every epoch, and record the
parameters that give the highest BLEU score on the validation dataset.
Before performing the forward and backward passes, we first use the
``as_in_context`` function to copy the mini-batch to the GPU. The statement
``with mx.autograd.record()`` tells the Gluon backend to compute the
gradients for the operations inside the block.
.. code:: python
best_valid_bleu = 0.0
for epoch_id in range(epochs):
log_avg_loss = 0
log_avg_gnorm = 0
log_wc = 0
log_start_time = time.time()
for batch_id, (src_seq, tgt_seq, src_valid_length, tgt_valid_length)\
in enumerate(train_data_loader):
# logging.info(src_seq.context) Context suddenly becomes GPU.
src_seq = src_seq.as_in_context(ctx)
tgt_seq = tgt_seq.as_in_context(ctx)
src_valid_length = src_valid_length.as_in_context(ctx)
tgt_valid_length = tgt_valid_length.as_in_context(ctx)
with mx.autograd.record():
out, _ = model(src_seq, tgt_seq[:, :-1], src_valid_length, tgt_valid_length - 1)
loss = loss_function(out, tgt_seq[:, 1:], tgt_valid_length - 1).mean()
loss = loss * (tgt_seq.shape[1] - 1) / (tgt_valid_length - 1).mean()
loss.backward()
grads = [p.grad(ctx) for p in model.collect_params().values()]
gnorm = gluon.utils.clip_global_norm(grads, clip)
trainer.step(1)
src_wc = src_valid_length.sum().asscalar()
tgt_wc = (tgt_valid_length - 1).sum().asscalar()
step_loss = loss.asscalar()
log_avg_loss += step_loss
log_avg_gnorm += gnorm
log_wc += src_wc + tgt_wc
if (batch_id + 1) % log_interval == 0:
wps = log_wc / (time.time() - log_start_time)
logging.info('[Epoch {} Batch {}/{}] loss={:.4f}, ppl={:.4f}, gnorm={:.4f}, '
'throughput={:.2f}K wps, wc={:.2f}K'
.format(epoch_id, batch_id + 1, len(train_data_loader),
log_avg_loss / log_interval,
np.exp(log_avg_loss / log_interval),
log_avg_gnorm / log_interval,
wps / 1000, log_wc / 1000))
log_start_time = time.time()
log_avg_loss = 0
log_avg_gnorm = 0
log_wc = 0
valid_loss, valid_translation_out = evaluate(val_data_loader)
valid_bleu_score, _, _, _, _ = nmt.bleu.compute_bleu([val_tgt_sentences], valid_translation_out)
logging.info('[Epoch {}] valid Loss={:.4f}, valid ppl={:.4f}, valid bleu={:.2f}'
.format(epoch_id, valid_loss, np.exp(valid_loss), valid_bleu_score * 100))
test_loss, test_translation_out = evaluate(test_data_loader)
test_bleu_score, _, _, _, _ = nmt.bleu.compute_bleu([test_tgt_sentences], test_translation_out)
logging.info('[Epoch {}] test Loss={:.4f}, test ppl={:.4f}, test bleu={:.2f}'
.format(epoch_id, test_loss, np.exp(test_loss), test_bleu_score * 100))
write_sentences(valid_translation_out,
os.path.join(save_dir, 'epoch{:d}_valid_out.txt').format(epoch_id))
write_sentences(test_translation_out,
os.path.join(save_dir, 'epoch{:d}_test_out.txt').format(epoch_id))
if valid_bleu_score > best_valid_bleu:
best_valid_bleu = valid_bleu_score
save_path = os.path.join(save_dir, 'valid_best.params')
logging.info('Save best parameters to {}'.format(save_path))
model.save_parameters(save_path)
if epoch_id + 1 >= (epochs * 2) // 3:
new_lr = trainer.learning_rate * lr_update_factor
logging.info('Learning rate change to {}'.format(new_lr))
trainer.set_learning_rate(new_lr)
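As an optional follow-up, we can reload the best parameters recorded above and
re-evaluate on the test set, so that the reported BLEU corresponds to the best
validation checkpoint rather than to the last epoch. This sketch assumes that at
least one checkpoint improved the validation BLEU and was therefore saved.

.. code:: python

    # Sketch: reload the best checkpoint saved during training and re-evaluate.
    best_path = os.path.join(save_dir, 'valid_best.params')
    if os.path.exists(best_path):
        model.load_parameters(best_path, ctx=ctx)
        final_test_loss, final_test_out = evaluate(test_data_loader)
        final_test_bleu, _, _, _, _ = nmt.bleu.compute_bleu([test_tgt_sentences],
                                                            final_test_out)
        logging.info('Best checkpoint: test loss={:.4f}, test bleu={:.2f}'
                     .format(final_test_loss, final_test_bleu * 100))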
Summary
-------
In this notebook, we have shown how to train a GNMT model on the IWSLT 2015
English-Vietnamese dataset using the GluonNLP toolkit. The complete training
script can be found
`here <https://github.com/dmlc/gluon-nlp/blob/master/scripts/nmt/train_gnmt.py>`__.
The command to reproduce the results can be found on the `nmt scripts
page <http://gluon-nlp.mxnet.io/scripts/index.html#machine-translation>`__.