R-package/vignettes/CharRnnModel.Rmd - mxnet - Git at Google


 # Character-level Language Model using RNN

 This tutorial will demonstrate creating a language model using a character level RNN model using MXNet-R package. You will need the following R packages to run this tutorial -
  - readr
  - stringr
  - stringi
  - mxnet

 We will use the [tinyshakespeare](https://github.com/dmlc/web-data/tree/master/mxnet/tinyshakespeare) dataset to build this model.


 ```R
 library("readr")
 library("stringr")
 library("stringi")
 library("mxnet")
 ```

 ## Preprocess and prepare the data

 Download the data:


 ```R
 download.data <- function(data_dir) {
     dir.create(data_dir, showWarnings = FALSE)
     if (!file.exists(paste0(data_dir,'input.txt'))) {
         download.file(url='https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/tinyshakespeare/input.txt',
                       destfile=paste0(data_dir,'input.txt'), method='wget')
     }
 }
 ```

 Next we transform the test into feature vectors that is fed into the RNN model. The `make_data` function reads the dataset, cleans it of any non-alphanumeric characters, splits it into individual characters and groups it into sequences of length `seq.len`.


 ```R
 make_data <- function(path, seq.len = 32, dic=NULL) {

   text_vec <- read_file(file = path)
   text_vec <- stri_enc_toascii(str = text_vec)
   text_vec <- str_replace_all(string = text_vec, pattern = "[^[:print:]]", replacement = "")
   text_vec <- strsplit(text_vec, '') %>% unlist

   if (is.null(dic)) {
     char_keep <- sort(unique(text_vec))
   } else char_keep <- names(dic)[!dic == 0]

   # Remove terms not part of dictionary
   text_vec <- text_vec[text_vec %in% char_keep]

   # Build dictionary
   dic <- 1:length(char_keep)
   names(dic) <- char_keep

   # reverse dictionary
   rev_dic <- names(dic)
   names(rev_dic) <- dic

   # Adjust by -1 to have a 1-lag for labels
   num.seq <- (length(text_vec) - 1) %/% seq.len

   features <- dic[text_vec[1:(seq.len * num.seq)]]
   labels <- dic[text_vec[1:(seq.len * num.seq)+1]]

   features_array <- array(features, dim = c(seq.len, num.seq))
   labels_array <- array(labels, dim = c(seq.len, num.seq))

   return (list(features_array = features_array, labels_array = labels_array, dic = dic, rev_dic = rev_dic))
 }


 seq.len <- 100
 data_prep <- make_data(path = "input.txt", seq.len = seq.len, dic = NULL)
 ```

 Fetch the features and labels for training the model, and split the data into training and evaluation in 9:1 ratio.


 ```R
 X <- data_prep$features_array
 Y <- data_prep$labels_array
 dic <- data_prep$dic
 rev_dic <- data_prep$rev_dic
 vocab <- length(dic)

 samples <- tail(dim(X), 1)
 train.val.fraction <- 0.9

 X.train.data <- X[, 1:as.integer(samples * train.val.fraction)]
 X.val.data <- X[, -(1:as.integer(samples * train.val.fraction))]

 X.train.label <- Y[, 1:as.integer(samples * train.val.fraction)]
 X.val.label <- Y[, -(1:as.integer(samples * train.val.fraction))]

 train_buckets <- list("100" = list(data = X.train.data, label = X.train.label))
 eval_buckets <- list("100" = list(data = X.val.data, label = X.val.label))

 train_buckets <- list(buckets = train_buckets, dic = dic, rev_dic = rev_dic)
 eval_buckets <- list(buckets = eval_buckets, dic = dic, rev_dic = rev_dic)
 ```

 Create iterators for training and evaluation datasets.


 ```R
 vocab <- length(eval_buckets$dic)

 batch.size <- 32

 train.data <- mx.io.bucket.iter(buckets = train_buckets$buckets, batch.size = batch.size,
                                 data.mask.element = 0, shuffle = TRUE)

 eval.data <- mx.io.bucket.iter(buckets = eval_buckets$buckets, batch.size = batch.size,
                                data.mask.element = 0, shuffle = FALSE)
 ```

 ## Train the Model


 This model is a multi-layer RNN for sampling from character-level language models. It has a one-to-one model configuration since for each character, we want to predict the next one. For a sequence of length 100, there are also 100 labels, corresponding the same sequence of characters but offset by a position of +1. The parameters output_last_state is set to TRUE in order to access the state of the RNN cells when performing inference.


 ```R
 rnn_graph_one_one <- rnn.graph(num_rnn_layer = 3,
                                num_hidden = 96,
                                input_size = vocab,
                                num_embed = 64,
                                num_decode =vocab,
                                dropout = 0.2,
                                ignore_label = 0,
                                cell_type = "lstm",
                                masking = F,
                                output_last_state = T,
                                loss_output = "softmax",
                                config = "one-to-one")

 graph.viz(rnn_graph_one_one, type = "graph", direction = "LR",
           graph.height.px = 180, shape=c(100, 64))

 devices <- mx.cpu()

 initializer <- mx.init.Xavier(rnd_type = "gaussian", factor_type = "avg", magnitude = 3)

 optimizer <- mx.opt.create("adadelta", rho = 0.9, eps = 1e-5, wd = 1e-8,
                            clip_gradient = 5, rescale.grad = 1/batch.size)

 logger <- mx.metric.logger()
 epoch.end.callback <- mx.callback.log.train.metric(period = 1, logger = logger)
 batch.end.callback <- mx.callback.log.train.metric(period = 50)

 mx.metric.custom_nd <- function(name, feval) {
   init <- function() {
     c(0, 0)
   }
   update <- function(label, pred, state) {
     m <- feval(label, pred)
     state <- c(state[[1]] + 1, state[[2]] + m)
     return(state)
   }
   get <- function(state) {
     list(name = name, value = (state[[2]]/state[[1]]))
   }
   ret <- (list(init = init, update = update, get = get))
   class(ret) <- "mx.metric"
   return(ret)
 }

 mx.metric.Perplexity <- mx.metric.custom_nd("Perplexity", function(label, pred) {
   label <- mx.nd.reshape(label, shape = -1)
   label_probs <- as.array(mx.nd.choose.element.0index(pred, label))
   batch <- length(label_probs)
   NLL <- -sum(log(pmax(1e-15, as.array(label_probs)))) / batch
   Perplexity <- exp(NLL)
   return(Perplexity)
 })

 model <- mx.model.buckets(symbol = rnn_graph_one_one,
                           train.data = train.data, eval.data = eval.data,
                           num.round = 20, ctx = devices, verbose = TRUE,
                           metric = mx.metric.Perplexity,
                           initializer = initializer, optimizer = optimizer,
                           batch.end.callback = NULL,
                           epoch.end.callback = epoch.end.callback)

 mx.model.save(model, prefix = "one_to_one_seq_model", iteration = 20)
 ```

     Start training with 1 devices
     [1] Train-Perplexity=13.7040474322178
     [1] Validation-Perplexity=7.94617194460922
     [2] Train-Perplexity=6.57039815554525
     [2] Validation-Perplexity=6.60806110658011
     [3] Train-Perplexity=5.65360504501481
     [3] Validation-Perplexity=6.18932770630876
     [4] Train-Perplexity=5.32547285727298
     [4] Validation-Perplexity=6.02198756798859
     [5] Train-Perplexity=5.14373631472579
     [5] Validation-Perplexity=5.8095658243407
     [6] Train-Perplexity=5.03077673487379
     [6] Validation-Perplexity=5.72582993567431
     [7] Train-Perplexity=4.94453383291536
     [7] Validation-Perplexity=5.6445258528126
     [8] Train-Perplexity=4.88635290100261
     [8] Validation-Perplexity=5.6730024536433
     [9] Train-Perplexity=4.84205646230548
     [9] Validation-Perplexity=5.50960780230982
     [10] Train-Perplexity=4.80441673535513
     [10] Validation-Perplexity=5.57002263750006
     [11] Train-Perplexity=4.77763413242626
     [11] Validation-Perplexity=5.55152143269169
     [12] Train-Perplexity=4.74937775290777
     [12] Validation-Perplexity=5.44968305351486
     [13] Train-Perplexity=4.72824849541467
     [13] Validation-Perplexity=5.50889348298234
     [14] Train-Perplexity=4.70980846981694
     [14] Validation-Perplexity=5.51473225859859
     [15] Train-Perplexity=4.69685776886122
     [15] Validation-Perplexity=5.45391985233811
     [16] Train-Perplexity=4.67837107034824
     [16] Validation-Perplexity=5.46636764997829
     [17] Train-Perplexity=4.66866961934873
     [17] Validation-Perplexity=5.44267086113492
     [18] Train-Perplexity=4.65611469144194
     [18] Validation-Perplexity=5.4290169469462
     [19] Train-Perplexity=4.64614689879405
     [19] Validation-Perplexity=5.44221549833917
     [20] Train-Perplexity=4.63764001963654
     [20] Validation-Perplexity=5.42114250842862


 ## Inference on the Model

 We now use the saved model to do inference and sample text character by character that will look like the original training data.


 ```R
 set.seed(0)
 model <- mx.model.load(prefix = "one_to_one_seq_model", iteration = 20)

 internals <- model$symbol$get.internals()
 sym_state <- internals$get.output(which(internals$outputs %in% "RNN_state"))
 sym_state_cell <- internals$get.output(which(internals$outputs %in% "RNN_state_cell"))
 sym_output <- internals$get.output(which(internals$outputs %in% "loss_output"))
 symbol <- mx.symbol.Group(sym_output, sym_state, sym_state_cell)

 infer_raw <- c("Thou ")
 infer_split <- dic[strsplit(infer_raw, '') %>% unlist]
 infer_length <- length(infer_split)

 infer.data <- mx.io.arrayiter(data = matrix(infer_split), label = matrix(infer_split),
                               batch.size = 1, shuffle = FALSE)

 infer <- mx.infer.rnn.one(infer.data = infer.data,
                           symbol = symbol,
                           arg.params = model$arg.params,
                           aux.params = model$aux.params,
                           input.params = NULL,
                           ctx = devices)

 pred_prob <- as.numeric(as.array(mx.nd.slice.axis(
     infer$loss_output, axis=0, begin = infer_length-1, end = infer_length)))
 pred <- sample(length(pred_prob), prob = pred_prob, size = 1) - 1
 predict <- c(predict, pred)

 for (i in 1:200) {

   infer.data <- mx.io.arrayiter(data = as.matrix(pred), label = as.matrix(pred),
                                 batch.size = 1, shuffle = FALSE)

   infer <- mx.infer.rnn.one(infer.data = infer.data,
                             symbol = symbol,
                             arg.params = model$arg.params,
                             aux.params = model$aux.params,
                             input.params = list(rnn.state = infer[[2]],
                                                 rnn.state.cell = infer[[3]]),
                             ctx = devices)

   pred_prob <- as.numeric(as.array(infer$loss_output))
   pred <- sample(length(pred_prob), prob = pred_prob, size = 1, replace = T) - 1
   predict <- c(predict, pred)
 }

 predict_txt <- paste0(rev_dic[as.character(predict)], collapse = "")
 predict_txt_tot <- paste0(infer_raw, predict_txt, collapse = "")
 print(predict_txt_tot)
 ```

     [1] "Thou NAknowledge thee my Comfort and his late she.FRIAR LAURENCE:Nothing a groats waterd forth. The lend he thank that;When she I am brother draw London: and not hear that know.BENVOLIO:How along, makes your "


 <!-- INSERT SOURCE DOWNLOAD BUTTONS -->

	# Character-level Language Model using RNN

	This tutorial will demonstrate creating a language model using a character level RNN model using MXNet-R package. You will need the following R packages to run this tutorial -
	- readr
	- stringr
	- stringi
	- mxnet

	We will use the [tinyshakespeare](https://github.com/dmlc/web-data/tree/master/mxnet/tinyshakespeare) dataset to build this model.


	```R
	library("readr")
	library("stringr")
	library("stringi")
	library("mxnet")
	```

	## Preprocess and prepare the data

	Download the data:


	```R
	download.data <- function(data_dir) {
	dir.create(data_dir, showWarnings = FALSE)
	if (!file.exists(paste0(data_dir,'input.txt'))) {
	download.file(url='https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/tinyshakespeare/input.txt',
	destfile=paste0(data_dir,'input.txt'), method='wget')
	}
	}
	```

	Next we transform the test into feature vectors that is fed into the RNN model. The `make_data` function reads the dataset, cleans it of any non-alphanumeric characters, splits it into individual characters and groups it into sequences of length `seq.len`.


	```R
	make_data <- function(path, seq.len = 32, dic=NULL) {

	text_vec <- read_file(file = path)
	text_vec <- stri_enc_toascii(str = text_vec)
	text_vec <- str_replace_all(string = text_vec, pattern = "[^[:print:]]", replacement = "")
	text_vec <- strsplit(text_vec, '') %>% unlist

	if (is.null(dic)) {
	char_keep <- sort(unique(text_vec))
	} else char_keep <- names(dic)[!dic == 0]

	# Remove terms not part of dictionary
	text_vec <- text_vec[text_vec %in% char_keep]

	# Build dictionary
	dic <- 1:length(char_keep)
	names(dic) <- char_keep

	# reverse dictionary
	rev_dic <- names(dic)
	names(rev_dic) <- dic

	# Adjust by -1 to have a 1-lag for labels
	num.seq <- (length(text_vec) - 1) %/% seq.len

	features <- dic[text_vec[1:(seq.len * num.seq)]]
	labels <- dic[text_vec[1:(seq.len * num.seq)+1]]

	features_array <- array(features, dim = c(seq.len, num.seq))
	labels_array <- array(labels, dim = c(seq.len, num.seq))

	return (list(features_array = features_array, labels_array = labels_array, dic = dic, rev_dic = rev_dic))
	}


	seq.len <- 100
	data_prep <- make_data(path = "input.txt", seq.len = seq.len, dic = NULL)
	```

	Fetch the features and labels for training the model, and split the data into training and evaluation in 9:1 ratio.


	```R
	X <- data_prep$features_array
	Y <- data_prep$labels_array
	dic <- data_prep$dic
	rev_dic <- data_prep$rev_dic
	vocab <- length(dic)

	samples <- tail(dim(X), 1)
	train.val.fraction <- 0.9

	X.train.data <- X[, 1:as.integer(samples * train.val.fraction)]
	X.val.data <- X[, -(1:as.integer(samples * train.val.fraction))]

	X.train.label <- Y[, 1:as.integer(samples * train.val.fraction)]
	X.val.label <- Y[, -(1:as.integer(samples * train.val.fraction))]

	train_buckets <- list("100" = list(data = X.train.data, label = X.train.label))
	eval_buckets <- list("100" = list(data = X.val.data, label = X.val.label))

	train_buckets <- list(buckets = train_buckets, dic = dic, rev_dic = rev_dic)
	eval_buckets <- list(buckets = eval_buckets, dic = dic, rev_dic = rev_dic)
	```

	Create iterators for training and evaluation datasets.


	```R
	vocab <- length(eval_buckets$dic)

	batch.size <- 32

	train.data <- mx.io.bucket.iter(buckets = train_buckets$buckets, batch.size = batch.size,
	data.mask.element = 0, shuffle = TRUE)

	eval.data <- mx.io.bucket.iter(buckets = eval_buckets$buckets, batch.size = batch.size,
	data.mask.element = 0, shuffle = FALSE)
	```

	## Train the Model


	This model is a multi-layer RNN for sampling from character-level language models. It has a one-to-one model configuration since for each character, we want to predict the next one. For a sequence of length 100, there are also 100 labels, corresponding the same sequence of characters but offset by a position of +1. The parameters output_last_state is set to TRUE in order to access the state of the RNN cells when performing inference.


	```R
	rnn_graph_one_one <- rnn.graph(num_rnn_layer = 3,
	num_hidden = 96,
	input_size = vocab,
	num_embed = 64,
	num_decode =vocab,
	dropout = 0.2,
	ignore_label = 0,
	cell_type = "lstm",
	masking = F,
	output_last_state = T,
	loss_output = "softmax",
	config = "one-to-one")

	graph.viz(rnn_graph_one_one, type = "graph", direction = "LR",
	graph.height.px = 180, shape=c(100, 64))

	devices <- mx.cpu()

	initializer <- mx.init.Xavier(rnd_type = "gaussian", factor_type = "avg", magnitude = 3)

	optimizer <- mx.opt.create("adadelta", rho = 0.9, eps = 1e-5, wd = 1e-8,
	clip_gradient = 5, rescale.grad = 1/batch.size)

	logger <- mx.metric.logger()
	epoch.end.callback <- mx.callback.log.train.metric(period = 1, logger = logger)
	batch.end.callback <- mx.callback.log.train.metric(period = 50)

	mx.metric.custom_nd <- function(name, feval) {
	init <- function() {
	c(0, 0)
	}
	update <- function(label, pred, state) {
	m <- feval(label, pred)
	state <- c(state[[1]] + 1, state[[2]] + m)
	return(state)
	}
	get <- function(state) {
	list(name = name, value = (state[[2]]/state[[1]]))
	}
	ret <- (list(init = init, update = update, get = get))
	class(ret) <- "mx.metric"
	return(ret)
	}

	mx.metric.Perplexity <- mx.metric.custom_nd("Perplexity", function(label, pred) {
	label <- mx.nd.reshape(label, shape = -1)
	label_probs <- as.array(mx.nd.choose.element.0index(pred, label))
	batch <- length(label_probs)
	NLL <- -sum(log(pmax(1e-15, as.array(label_probs)))) / batch
	Perplexity <- exp(NLL)
	return(Perplexity)
	})

	model <- mx.model.buckets(symbol = rnn_graph_one_one,
	train.data = train.data, eval.data = eval.data,
	num.round = 20, ctx = devices, verbose = TRUE,
	metric = mx.metric.Perplexity,
	initializer = initializer, optimizer = optimizer,
	batch.end.callback = NULL,
	epoch.end.callback = epoch.end.callback)

	mx.model.save(model, prefix = "one_to_one_seq_model", iteration = 20)
	```

	Start training with 1 devices
	[1] Train-Perplexity=13.7040474322178
	[1] Validation-Perplexity=7.94617194460922
	[2] Train-Perplexity=6.57039815554525
	[2] Validation-Perplexity=6.60806110658011
	[3] Train-Perplexity=5.65360504501481
	[3] Validation-Perplexity=6.18932770630876
	[4] Train-Perplexity=5.32547285727298
	[4] Validation-Perplexity=6.02198756798859
	[5] Train-Perplexity=5.14373631472579
	[5] Validation-Perplexity=5.8095658243407
	[6] Train-Perplexity=5.03077673487379
	[6] Validation-Perplexity=5.72582993567431
	[7] Train-Perplexity=4.94453383291536
	[7] Validation-Perplexity=5.6445258528126
	[8] Train-Perplexity=4.88635290100261
	[8] Validation-Perplexity=5.6730024536433
	[9] Train-Perplexity=4.84205646230548
	[9] Validation-Perplexity=5.50960780230982
	[10] Train-Perplexity=4.80441673535513
	[10] Validation-Perplexity=5.57002263750006
	[11] Train-Perplexity=4.77763413242626
	[11] Validation-Perplexity=5.55152143269169
	[12] Train-Perplexity=4.74937775290777
	[12] Validation-Perplexity=5.44968305351486
	[13] Train-Perplexity=4.72824849541467
	[13] Validation-Perplexity=5.50889348298234
	[14] Train-Perplexity=4.70980846981694
	[14] Validation-Perplexity=5.51473225859859
	[15] Train-Perplexity=4.69685776886122
	[15] Validation-Perplexity=5.45391985233811
	[16] Train-Perplexity=4.67837107034824
	[16] Validation-Perplexity=5.46636764997829
	[17] Train-Perplexity=4.66866961934873
	[17] Validation-Perplexity=5.44267086113492
	[18] Train-Perplexity=4.65611469144194
	[18] Validation-Perplexity=5.4290169469462
	[19] Train-Perplexity=4.64614689879405
	[19] Validation-Perplexity=5.44221549833917
	[20] Train-Perplexity=4.63764001963654
	[20] Validation-Perplexity=5.42114250842862


	## Inference on the Model

	We now use the saved model to do inference and sample text character by character that will look like the original training data.


	```R
	set.seed(0)
	model <- mx.model.load(prefix = "one_to_one_seq_model", iteration = 20)

	internals <- model$symbol$get.internals()
	sym_state <- internals$get.output(which(internals$outputs %in% "RNN_state"))
	sym_state_cell <- internals$get.output(which(internals$outputs %in% "RNN_state_cell"))
	sym_output <- internals$get.output(which(internals$outputs %in% "loss_output"))
	symbol <- mx.symbol.Group(sym_output, sym_state, sym_state_cell)

	infer_raw <- c("Thou ")
	infer_split <- dic[strsplit(infer_raw, '') %>% unlist]
	infer_length <- length(infer_split)

	infer.data <- mx.io.arrayiter(data = matrix(infer_split), label = matrix(infer_split),
	batch.size = 1, shuffle = FALSE)

	infer <- mx.infer.rnn.one(infer.data = infer.data,
	symbol = symbol,
	arg.params = model$arg.params,
	aux.params = model$aux.params,
	input.params = NULL,
	ctx = devices)

	pred_prob <- as.numeric(as.array(mx.nd.slice.axis(
	infer$loss_output, axis=0, begin = infer_length-1, end = infer_length)))
	pred <- sample(length(pred_prob), prob = pred_prob, size = 1) - 1
	predict <- c(predict, pred)

	for (i in 1:200) {

	infer.data <- mx.io.arrayiter(data = as.matrix(pred), label = as.matrix(pred),
	batch.size = 1, shuffle = FALSE)

	infer <- mx.infer.rnn.one(infer.data = infer.data,
	symbol = symbol,
	arg.params = model$arg.params,
	aux.params = model$aux.params,
	input.params = list(rnn.state = infer[[2]],
	rnn.state.cell = infer[[3]]),
	ctx = devices)

	pred_prob <- as.numeric(as.array(infer$loss_output))
	pred <- sample(length(pred_prob), prob = pred_prob, size = 1, replace = T) - 1
	predict <- c(predict, pred)
	}

	predict_txt <- paste0(rev_dic[as.character(predict)], collapse = "")
	predict_txt_tot <- paste0(infer_raw, predict_txt, collapse = "")
	print(predict_txt_tot)
	```

	[1] "Thou NAknowledge thee my Comfort and his late she.FRIAR LAURENCE:Nothing a groats waterd forth. The lend he thank that;When she I am brother draw London: and not hear that know.BENVOLIO:How along, makes your "


	<!-- INSERT SOURCE DOWNLOAD BUTTONS -->