This tutorial shows how to use an LSTM model to build a character-level language model and generate text from it. For demonstration purposes, we use a Shakespearean text; you can find the data on GitHub.
Load in the data and preprocess it:
require(mxnet)
## Loading required package: mxnet
## Loading required package: methods
Set the basic network parameters:
batch.size = 32
seq.len = 32
num.hidden = 16
num.embed = 16
num.lstm.layer = 1
num.round = 1
learning.rate = 0.1
wd = 0.00001
clip_gradient = 1
update.period = 1
Download the data:
download.data <- function(data_dir) {
    dir.create(data_dir, showWarnings = FALSE)
    if (!file.exists(paste0(data_dir, 'input.txt'))) {
        download.file(url='https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/tinyshakespeare/input.txt',
                      destfile=paste0(data_dir, 'input.txt'), method='wget')
    }
}
Make a dictionary from the text:
make.dict <- function(text, max.vocab=10000) {
    text <- strsplit(text, '')
    dic <- list()
    idx <- 1
    for (c in text[[1]]) {
        if (!(c %in% names(dic))) {
            dic[[c]] <- idx
            idx <- idx + 1
        }
    }
    if (length(dic) == max.vocab - 1)
        dic[["UNKNOWN"]] <- idx
    cat(paste0("Total unique char: ", length(dic), "\n"))
    return (dic)
}
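For instance, a quick illustration (not part of the original tutorial): applying make.dict to a short string assigns each new character the next index in order of first appearance.

dic.small <- make.dict("hello")
## Total unique char: 4
dic.small[["h"]]   # 1
dic.small[["l"]]   # 3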
Transform the text into a feature array of character indices:
make.data <- function(file.path, seq.len=32, max.vocab=10000, dic=NULL) {
    fi <- file(file.path, "r")
    text <- paste(readLines(fi), collapse="\n")
    close(fi)

    if (is.null(dic))
        dic <- make.dict(text, max.vocab)
    lookup.table <- list()
    for (c in names(dic)) {
        idx <- dic[[c]]
        lookup.table[[idx]] <- c
    }

    char.lst <- strsplit(text, '')[[1]]
    num.seq <- as.integer(length(char.lst) / seq.len)
    char.lst <- char.lst[1:(num.seq * seq.len)]
    data <- array(0, dim=c(seq.len, num.seq))
    idx <- 1
    for (i in 1:num.seq) {
        for (j in 1:seq.len) {
            if (char.lst[idx] %in% names(dic))
                data[j, i] <- dic[[ char.lst[idx] ]] - 1
            else {
                data[j, i] <- dic[["UNKNOWN"]] - 1
            }
            idx <- idx + 1
        }
    }
    return (list(data=data, dic=dic, lookup.table=lookup.table))
}
Drop the tail of the data so that the number of sequences is a multiple of the batch size:
drop.tail <- function(X, batch.size) {
    shape <- dim(X)
    nstep <- as.integer(shape[2] / batch.size)
    return (X[, 1:(nstep * batch.size)])
}
Get the labels for X (each character's label is the next character in the text):
get.label <- function(X) {
    label <- array(0, dim=dim(X))
    d <- dim(X)[1]
    w <- dim(X)[2]
    for (i in 0:(w-1)) {
        for (j in 1:d) {
            label[i*d+j] <- X[(i*d+j) %% (w*d) + 1]
        }
    }
    return (label)
}
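To see what get.label produces, here is a toy illustration (not part of the original code): each entry's label is the next entry in column-major order, i.e. the next character in the text, with the final position wrapping around to the first.

X.small <- matrix(1:6, nrow=3)
get.label(X.small)
##      [,1] [,2]
## [1,]    2    5
## [2,]    3    6
## [3,]    4    1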
Get the training data and evaluation data:
download.data("./data/") ret <- make.data("./data/input.txt", seq.len=seq.len)
## Total unique char: 65
X <- ret$data
dic <- ret$dic
lookup.table <- ret$lookup.table

vocab <- length(dic)

shape <- dim(X)
train.val.fraction <- 0.9
size <- shape[2]

X.train.data <- X[, 1:as.integer(size * train.val.fraction)]
X.val.data <- X[, -(1:as.integer(size * train.val.fraction))]
X.train.data <- drop.tail(X.train.data, batch.size)
X.val.data <- drop.tail(X.val.data, batch.size)

X.train.label <- get.label(X.train.data)
X.val.label <- get.label(X.val.data)

X.train <- list(data=X.train.data, label=X.train.label)
X.val <- list(data=X.val.data, label=X.val.label)
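As a quick sanity check (not in the original tutorial), both splits should have seq.len rows and a number of columns that is a multiple of batch.size:

dim(X.train.data)   # seq.len rows; columns are a multiple of batch.size
dim(X.val.data)
stopifnot(dim(X.train.data)[2] %% batch.size == 0,
          dim(X.val.data)[2] %% batch.size == 0)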
In mxnet, we have a function called mx.lstm so that users can build a general LSTM model:
model <- mx.lstm(X.train, X.val,
                 ctx=mx.cpu(),
                 num.round=num.round,
                 update.period=update.period,
                 num.lstm.layer=num.lstm.layer,
                 seq.len=seq.len,
                 num.hidden=num.hidden,
                 num.embed=num.embed,
                 num.label=vocab,
                 batch.size=batch.size,
                 input.size=vocab,
                 initializer=mx.init.uniform(0.1),
                 learning.rate=learning.rate,
                 wd=wd,
                 clip_gradient=clip_gradient)
## Epoch [31] Train: NLL=3.53787130224343, Perp=34.3936275728271
## Epoch [62] Train: NLL=3.43087958036949, Perp=30.903813186055
## Epoch [93] Train: NLL=3.39771238228587, Perp=29.8956319855751
## Epoch [124] Train: NLL=3.37581711716687, Perp=29.2481732041015
## Epoch [155] Train: NLL=3.34523331338447, Perp=28.3671933405139
## Epoch [186] Train: NLL=3.30756356274787, Perp=27.31848454823
## Epoch [217] Train: NLL=3.25642968403829, Perp=25.9566978956055
## Epoch [248] Train: NLL=3.19825967486207, Perp=24.4898727477925
## Epoch [279] Train: NLL=3.14013971549828, Perp=23.1070950525017
## Epoch [310] Train: NLL=3.08747601837462, Perp=21.9216781782189
## Epoch [341] Train: NLL=3.04015595674863, Perp=20.9085038031042
## Epoch [372] Train: NLL=2.99839339255659, Perp=20.0532932584534
## Epoch [403] Train: NLL=2.95940091012609, Perp=19.2864139984503
## Epoch [434] Train: NLL=2.92603311380224, Perp=18.6534872738302
## Epoch [465] Train: NLL=2.89482756896395, Perp=18.0803835531869
## Epoch [496] Train: NLL=2.86668230478397, Perp=17.5786009078994
## Epoch [527] Train: NLL=2.84089368534943, Perp=17.1310684830416
## Epoch [558] Train: NLL=2.81725862932279, Perp=16.7309220880514
## Epoch [589] Train: NLL=2.79518870141492, Perp=16.3657166956952
## Epoch [620] Train: NLL=2.77445683225304, Perp=16.0299176962855
## Epoch [651] Train: NLL=2.75490970113174, Perp=15.719621374694
## Epoch [682] Train: NLL=2.73697900634351, Perp=15.4402696117257
## Epoch [713] Train: NLL=2.72059739336781, Perp=15.1893935780915
## Epoch [744] Train: NLL=2.70462837571585, Perp=14.948760335793
## Epoch [775] Train: NLL=2.68909904683828, Perp=14.7184093476224
## Epoch [806] Train: NLL=2.67460054451836, Perp=14.5065539595711
## Epoch [837] Train: NLL=2.66078997776751, Perp=14.3075873113043
## Epoch [868] Train: NLL=2.6476781639279, Perp=14.1212134100373
## Epoch [899] Train: NLL=2.63529039846876, Perp=13.9473621677371
## Epoch [930] Train: NLL=2.62367693518974, Perp=13.7863219168709
## Epoch [961] Train: NLL=2.61238282674384, Perp=13.6314936713501
## Iter [1] Train: Time: 10301.6818172932 sec, NLL=2.60536539345356, Perp=13.5361704272949
## Iter [1] Val: NLL=2.26093848746227, Perp=9.59208699731232
Define helper functions for random sampling:
cdf <- function(weights) {
    total <- sum(weights)
    result <- c()
    cumsum <- 0
    for (w in weights) {
        cumsum <- cumsum + w
        result <- c(result, cumsum / total)
    }
    return (result)
}

search.val <- function(cdf, x) {
    l <- 1
    r <- length(cdf)
    while (l <= r) {
        m <- as.integer((l+r)/2)
        if (cdf[m] < x) {
            l <- m+1
        } else {
            r <- m-1
        }
    }
    return (l)
}

choice <- function(weights) {
    cdf.vals <- cdf(as.array(weights))
    x <- runif(1)
    idx <- search.val(cdf.vals, x)
    return (idx)
}
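As a quick check of these helpers (an illustration, not part of the original code), choice should return each index roughly in proportion to its weight:

table(replicate(1000, choice(c(1, 7, 2))))
## roughly 100 draws of index 1, 700 of index 2, and 200 of index 3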
Choose the output either by random sampling or deterministically, by picking the index with the greatest probability:
make.output <- function(prob, sample=FALSE) {
    if (!sample) {
        idx <- which.max(as.array(prob))
    } else {
        idx <- choice(prob)
    }
    return (idx)
}
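For example, with illustrative probabilities: sample=FALSE always returns the index of the largest probability, while sample=TRUE draws from the distribution:

p <- c(0.1, 0.7, 0.2)
make.output(p)                 # always 2
make.output(p, sample=TRUE)    # usually 2, occasionally 1 or 3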
In mxnet, we have a function called mx.lstm.inference that builds an inference model from a trained LSTM model; the mx.lstm.forward function then produces the forward output from that inference model.
Build an inference model from the trained model:
infer.model <- mx.lstm.inference(num.lstm.layer=num.lstm.layer,
                                 input.size=vocab,
                                 num.hidden=num.hidden,
                                 num.embed=num.embed,
                                 num.label=vocab,
                                 arg.params=model$arg.params,
                                 ctx=mx.cpu())
Generate a sequence of 75 characters using the mx.lstm.forward function:
start <- 'a'
seq.len <- 75
random.sample <- TRUE

last.id <- dic[[start]]
out <- "a"
for (i in (1:(seq.len-1))) {
    input <- c(last.id - 1)
    ret <- mx.lstm.forward(infer.model, input, FALSE)
    infer.model <- ret$model
    prob <- ret$prob
    last.id <- make.output(prob, random.sample)
    out <- paste0(out, lookup.table[[last.id]])
}
cat(paste0(out, "\n"))
The result:
ah not a drobl greens Settled asing lately sistering sounted to their hight
In mxnet, other RNN models, like custom RNN and GRU, are also provided:

You can replace mx.lstm with mx.rnn to train an RNN model. Likewise, you can replace mx.lstm.inference and mx.lstm.forward with mx.rnn.inference and mx.rnn.forward to build inference from an RNN model and get the forward result from the inference model.

You can replace mx.lstm with mx.gru to train a GRU model. Likewise, you can replace mx.lstm.inference and mx.lstm.forward with mx.gru.inference and mx.gru.forward to build inference from a GRU model and get the forward result from the inference model.
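For example, here is a rough sketch of the GRU variant of the calls above. It assumes that mx.gru and mx.gru.inference accept the same arguments as mx.lstm and mx.lstm.inference, apart from the layer-count argument, whose exact name (written here as num.gru.layer) should be confirmed against the package documentation:

# Sketch only: argument names are assumed to mirror the LSTM functions above;
# confirm them with ?mx.gru and ?mx.gru.inference in your installed package.
gru.model <- mx.gru(X.train, X.val,
                    ctx=mx.cpu(),
                    num.round=num.round,
                    update.period=update.period,
                    num.gru.layer=num.lstm.layer,    # assumed name; mirrors num.lstm.layer
                    seq.len=seq.len,
                    num.hidden=num.hidden,
                    num.embed=num.embed,
                    num.label=vocab,
                    batch.size=batch.size,
                    input.size=vocab,
                    initializer=mx.init.uniform(0.1),
                    learning.rate=learning.rate,
                    wd=wd,
                    clip_gradient=clip_gradient)

gru.infer <- mx.gru.inference(num.gru.layer=num.lstm.layer,   # assumed name
                              input.size=vocab,
                              num.hidden=num.hidden,
                              num.embed=num.embed,
                              num.label=vocab,
                              arg.params=gru.model$arg.params,
                              ctx=mx.cpu())

# The generation loop is unchanged apart from the forward call:
ret <- mx.gru.forward(gru.infer, c(dic[['a']] - 1), FALSE)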