This tutorial demonstrates how to build a character-level language model with a recurrent neural network (RNN) using the MXNet R package. We will train the model on the tinyshakespeare dataset. You will need the following R packages to run this tutorial:

```R
library("readr")
library("stringr")
library("stringi")
library("mxnet")
library("magrittr")  # provides the %>% pipe used below
```
Download the data:
```R
download.data <- function(data_dir) {
    dir.create(data_dir, showWarnings = FALSE)
    if (!file.exists(paste0(data_dir, 'input.txt'))) {
        download.file(url = 'https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/tinyshakespeare/input.txt',
                      destfile = paste0(data_dir, 'input.txt'),
                      method = 'wget')
    }
}
```
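For example, calling the helper with the current working directory (a usage sketch; any writable directory path ending in a separator works) places `input.txt` next to your script. If `wget` is not available on your system, you may need to drop the `method` argument so `download.file` falls back to its default.

```R
# Usage sketch: download input.txt into the current directory
download.data("./")
```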
Next, we transform the text into feature vectors to feed into the RNN model. The `make_data` function reads the dataset, cleans it of any non-printable characters, splits it into individual characters, and groups the characters into sequences of length `seq.len`.
```R
make_data <- function(path, seq.len = 32, dic = NULL) {

    text_vec <- read_file(file = path)
    text_vec <- stri_enc_toascii(str = text_vec)
    text_vec <- str_replace_all(string = text_vec, pattern = "[^[:print:]]", replacement = "")
    text_vec <- strsplit(text_vec, '') %>% unlist

    if (is.null(dic)) {
        char_keep <- sort(unique(text_vec))
    } else char_keep <- names(dic)[!dic == 0]

    # Remove terms not part of dictionary
    text_vec <- text_vec[text_vec %in% char_keep]

    # Build dictionary
    dic <- 1:length(char_keep)
    names(dic) <- char_keep

    # Reverse dictionary
    rev_dic <- names(dic)
    names(rev_dic) <- dic

    # Adjust by -1 to have a 1-lag for labels
    num.seq <- (length(text_vec) - 1) %/% seq.len

    features <- dic[text_vec[1:(seq.len * num.seq)]]
    labels <- dic[text_vec[1:(seq.len * num.seq) + 1]]

    features_array <- array(features, dim = c(seq.len, num.seq))
    labels_array <- array(labels, dim = c(seq.len, num.seq))

    return(list(features_array = features_array,
                labels_array = labels_array,
                dic = dic,
                rev_dic = rev_dic))
}

seq.len <- 100
data_prep <- make_data(path = "input.txt", seq.len = seq.len, dic = NULL)
```
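A quick sanity check on the returned structure can be helpful. The calls below (an illustrative sketch, not part of the original tutorial) print the dimensions of the feature array and the first few dictionary entries.

```R
# Illustrative check: each column of features_array is one sequence of seq.len character IDs
dim(data_prep$features_array)
head(data_prep$dic)
```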
Fetch the features and labels for training the model, and split the data into training and evaluation sets in a 9:1 ratio.
```R
X <- data_prep$features_array
Y <- data_prep$labels_array

dic <- data_prep$dic
rev_dic <- data_prep$rev_dic

vocab <- length(dic)

samples <- tail(dim(X), 1)
train.val.fraction <- 0.9

X.train.data <- X[, 1:as.integer(samples * train.val.fraction)]
X.val.data <- X[, -(1:as.integer(samples * train.val.fraction))]

X.train.label <- Y[, 1:as.integer(samples * train.val.fraction)]
X.val.label <- Y[, -(1:as.integer(samples * train.val.fraction))]

train_buckets <- list("100" = list(data = X.train.data, label = X.train.label))
eval_buckets <- list("100" = list(data = X.val.data, label = X.val.label))

train_buckets <- list(buckets = train_buckets, dic = dic, rev_dic = rev_dic)
eval_buckets <- list(buckets = eval_buckets, dic = dic, rev_dic = rev_dic)
```
Create iterators for training and evaluation datasets.
```R
vocab <- length(eval_buckets$dic)
batch.size <- 32

train.data <- mx.io.bucket.iter(buckets = train_buckets$buckets, batch.size = batch.size,
                                data.mask.element = 0, shuffle = TRUE)

eval.data <- mx.io.bucket.iter(buckets = eval_buckets$buckets, batch.size = batch.size,
                               data.mask.element = 0, shuffle = FALSE)
```
This model is a multi-layer RNN for sampling from character-level language models. It has a one-to-one configuration: for each input character, we want to predict the next one. For a sequence of length 100 there are also 100 labels, corresponding to the same sequence of characters offset by one position. The parameter `output_last_state` is set to `TRUE` so that the state of the RNN cells can be accessed when performing inference.
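To make the one-character offset concrete, the snippet below (an illustrative check, not part of the original code) decodes the first few feature and label IDs of the first sequence; the label row should simply be the feature row shifted by one character.

```R
# Illustrative check of the +1 offset between features and labels
rev_dic[as.character(X[1:10, 1])]  # first 10 input characters of sequence 1
rev_dic[as.character(Y[1:10, 1])]  # the same characters, shifted by one position
```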
```R
rnn_graph_one_one <- rnn.graph(num_rnn_layer = 3,
                               num_hidden = 96,
                               input_size = vocab,
                               num_embed = 64,
                               num_decode = vocab,
                               dropout = 0.2,
                               ignore_label = 0,
                               cell_type = "lstm",
                               masking = F,
                               output_last_state = T,
                               loss_output = "softmax",
                               config = "one-to-one")

graph.viz(rnn_graph_one_one, type = "graph", direction = "LR",
          graph.height.px = 180, shape = c(100, 64))

devices <- mx.cpu()

initializer <- mx.init.Xavier(rnd_type = "gaussian", factor_type = "avg", magnitude = 3)

optimizer <- mx.opt.create("adadelta", rho = 0.9, eps = 1e-5, wd = 1e-8,
                           clip_gradient = 5, rescale.grad = 1/batch.size)

logger <- mx.metric.logger()
epoch.end.callback <- mx.callback.log.train.metric(period = 1, logger = logger)
batch.end.callback <- mx.callback.log.train.metric(period = 50)

mx.metric.custom_nd <- function(name, feval) {
    init <- function() {
        c(0, 0)
    }
    update <- function(label, pred, state) {
        m <- feval(label, pred)
        state <- c(state[[1]] + 1, state[[2]] + m)
        return(state)
    }
    get <- function(state) {
        list(name = name, value = (state[[2]] / state[[1]]))
    }
    ret <- (list(init = init, update = update, get = get))
    class(ret) <- "mx.metric"
    return(ret)
}

mx.metric.Perplexity <- mx.metric.custom_nd("Perplexity", function(label, pred) {
    label <- mx.nd.reshape(label, shape = -1)
    label_probs <- as.array(mx.nd.choose.element.0index(pred, label))
    batch <- length(label_probs)
    NLL <- -sum(log(pmax(1e-15, as.array(label_probs)))) / batch
    Perplexity <- exp(NLL)
    return(Perplexity)
})

model <- mx.model.buckets(symbol = rnn_graph_one_one,
                          train.data = train.data,
                          eval.data = eval.data,
                          num.round = 20,
                          ctx = devices,
                          verbose = TRUE,
                          metric = mx.metric.Perplexity,
                          initializer = initializer,
                          optimizer = optimizer,
                          batch.end.callback = NULL,
                          epoch.end.callback = epoch.end.callback)

mx.model.save(model, prefix = "one_to_one_seq_model", iteration = 20)
```
```
Start training with 1 devices
[1] Train-Perplexity=13.7040474322178
[1] Validation-Perplexity=7.94617194460922
[2] Train-Perplexity=6.57039815554525
[2] Validation-Perplexity=6.60806110658011
[3] Train-Perplexity=5.65360504501481
[3] Validation-Perplexity=6.18932770630876
[4] Train-Perplexity=5.32547285727298
[4] Validation-Perplexity=6.02198756798859
[5] Train-Perplexity=5.14373631472579
[5] Validation-Perplexity=5.8095658243407
[6] Train-Perplexity=5.03077673487379
[6] Validation-Perplexity=5.72582993567431
[7] Train-Perplexity=4.94453383291536
[7] Validation-Perplexity=5.6445258528126
[8] Train-Perplexity=4.88635290100261
[8] Validation-Perplexity=5.6730024536433
[9] Train-Perplexity=4.84205646230548
[9] Validation-Perplexity=5.50960780230982
[10] Train-Perplexity=4.80441673535513
[10] Validation-Perplexity=5.57002263750006
[11] Train-Perplexity=4.77763413242626
[11] Validation-Perplexity=5.55152143269169
[12] Train-Perplexity=4.74937775290777
[12] Validation-Perplexity=5.44968305351486
[13] Train-Perplexity=4.72824849541467
[13] Validation-Perplexity=5.50889348298234
[14] Train-Perplexity=4.70980846981694
[14] Validation-Perplexity=5.51473225859859
[15] Train-Perplexity=4.69685776886122
[15] Validation-Perplexity=5.45391985233811
[16] Train-Perplexity=4.67837107034824
[16] Validation-Perplexity=5.46636764997829
[17] Train-Perplexity=4.66866961934873
[17] Validation-Perplexity=5.44267086113492
[18] Train-Perplexity=4.65611469144194
[18] Validation-Perplexity=5.4290169469462
[19] Train-Perplexity=4.64614689879405
[19] Validation-Perplexity=5.44221549833917
[20] Train-Perplexity=4.63764001963654
[20] Validation-Perplexity=5.42114250842862
```
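Because the epoch-end callback above records the metric into `logger`, you can optionally plot the learning curves. The sketch below assumes `logger$train` and `logger$eval` hold the per-epoch values logged by `mx.callback.log.train.metric`.

```R
# Optional: plot train vs. validation perplexity per epoch
# (assumes logger$train and logger$eval were populated by the epoch-end callback)
plot(logger$train, type = "l", col = "blue",
     xlab = "Epoch", ylab = "Perplexity",
     ylim = range(c(logger$train, logger$eval)))
lines(logger$eval, col = "red")
legend("topright", legend = c("Train", "Validation"), col = c("blue", "red"), lty = 1)
```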
We now use the saved model to perform inference, sampling text character by character so that the output resembles the original training data.
```R
set.seed(0)
model <- mx.model.load(prefix = "one_to_one_seq_model", iteration = 20)

internals <- model$symbol$get.internals()
sym_state <- internals$get.output(which(internals$outputs %in% "RNN_state"))
sym_state_cell <- internals$get.output(which(internals$outputs %in% "RNN_state_cell"))
sym_output <- internals$get.output(which(internals$outputs %in% "loss_output"))
symbol <- mx.symbol.Group(sym_output, sym_state, sym_state_cell)

infer_raw <- c("Thou ")
infer_split <- dic[strsplit(infer_raw, '') %>% unlist]
infer_length <- length(infer_split)

infer.data <- mx.io.arrayiter(data = matrix(infer_split), label = matrix(infer_split),
                              batch.size = 1, shuffle = FALSE)

infer <- mx.infer.rnn.one(infer.data = infer.data,
                          symbol = symbol,
                          arg.params = model$arg.params,
                          aux.params = model$aux.params,
                          input.params = NULL,
                          ctx = devices)

# Sample the character that follows the seed text
pred_prob <- as.numeric(as.array(mx.nd.slice.axis(
    infer$loss_output, axis = 0, begin = infer_length - 1, end = infer_length)))
pred <- sample(length(pred_prob), prob = pred_prob, size = 1) - 1

# Collect the sampled character IDs (initialized so c() has a vector to append to)
predict <- numeric()
predict <- c(predict, pred)

for (i in 1:200) {

    infer.data <- mx.io.arrayiter(data = as.matrix(pred), label = as.matrix(pred),
                                  batch.size = 1, shuffle = FALSE)

    # Feed the previous character back in, along with the RNN state from the last step
    infer <- mx.infer.rnn.one(infer.data = infer.data,
                              symbol = symbol,
                              arg.params = model$arg.params,
                              aux.params = model$aux.params,
                              input.params = list(rnn.state = infer[[2]],
                                                  rnn.state.cell = infer[[3]]),
                              ctx = devices)

    pred_prob <- as.numeric(as.array(infer$loss_output))
    pred <- sample(length(pred_prob), prob = pred_prob, size = 1, replace = T) - 1
    predict <- c(predict, pred)
}

predict_txt <- paste0(rev_dic[as.character(predict)], collapse = "")
predict_txt_tot <- paste0(infer_raw, predict_txt, collapse = "")
print(predict_txt_tot)
```
[1] "Thou NAknowledge thee my Comfort and his late she.FRIAR LAURENCE:Nothing a groats waterd forth. The lend he thank that;When she I am brother draw London: and not hear that know.BENVOLIO:How along, makes your "