This is a simple example of how a recurrent neural network can be used to build a character-level language model and generate text from it. For demonstration purposes, we use text from Shakespeare.

Preliminaries

The following lines define some preliminary functions that we will need.

    require(mxnet)

Make a dictionary from the text

    make.dict <- function(text, max.vocab=10000) {
        text <- strsplit(text, '')
        dic <- list()
        idx <- 1
        # Assign an integer index to every character, in order of first appearance.
        for (c in text[[1]]) {
            if (!(c %in% names(dic))) {
                dic[[c]] <- idx
                idx <- idx + 1
            }
        }
        # Reserve the last dictionary slot for unknown characters once the
        # vocabulary limit is reached.
        if (length(dic) == max.vocab - 1)
            dic[["UNKNOWN"]] <- idx
        cat(paste0("Total unique char: ", length(dic), "\n"))
        return (dic)
    }
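
A quick sanity check on a toy string (a hypothetical example, not part of the tutorial data) shows how make.dict assigns indices in order of first appearance:

    toy.dic <- make.dict("hello world")
    # Prints: Total unique char: 8
    toy.dic[["l"]]   # 3, since "l" is the third distinct character encountered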

Transform the text into a tensor (data array) format:

    make.data <- function(file.path, seq.len=32, max.vocab=10000, dic=NULL) {
        fi <- file(file.path, "r")
        text <- paste(readLines(fi), collapse="\n")
        close(fi)

        if (is.null(dic))
            dic <- make.dict(text, max.vocab)
        # Build the reverse mapping from index back to character.
        lookup.table <- list()
        for (c in names(dic)) {
            idx <- dic[[c]]
            lookup.table[[idx]] <- c
        }

        # Cut the text into num.seq sequences of seq.len characters each,
        # discarding any leftover characters at the end.
        char.lst <- strsplit(text, '')[[1]]
        num.seq <- as.integer(length(char.lst) / seq.len)
        char.lst <- char.lst[1:(num.seq * seq.len)]
        data <- array(0, dim=c(seq.len, num.seq))
        idx <- 1
        for (i in 1:num.seq) {
            for (j in 1:seq.len) {
                # Store zero-based character indices; unseen characters map to UNKNOWN.
                if (char.lst[idx] %in% names(dic))
                    data[j, i] <- dic[[ char.lst[idx] ]] - 1
                else {
                    data[j, i] <- dic[["UNKNOWN"]] - 1
                }
                idx <- idx + 1
            }
        }
        return (list(data=data, dic=dic, lookup.table=lookup.table))
    }
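
As a quick illustration of the returned structure (using a throwaway temporary file, purely hypothetical), the list holds the data array, the dictionary, and the reverse lookup table:

    tmp <- tempfile()
    writeLines("hello world", tmp)
    toy <- make.data(tmp, seq.len=4)
    dim(toy$data)           # 4 2: two sequences of four characters each
    toy$lookup.table[[1]]   # "h", the character mapped to index 1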

Drop the tail of the data so that the number of sequences is a multiple of the batch size:

    drop.tail <- function(X, batch.size) {
        shape <- dim(X)
        # Keep only as many sequences (columns) as fit into whole batches.
        nstep <- as.integer(shape[2] / batch.size)
        return (X[, 1:(nstep * batch.size)])
    }
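
For example (with a hypothetical toy array), a 4 x 10 array and a batch size of 4 keep only the first 8 columns, so the columns divide evenly into batches:

    toy.X <- array(1:40, dim=c(4, 10))
    dim(drop.tail(toy.X, batch.size=4))   # 4 8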

Get the labels of X (each label is simply the next character in the text):

    get.label <- function(X) {
        label <- array(0, dim=dim(X))
        d <- dim(X)[1]
        w <- dim(X)[2]
        # The label of every position is the element that follows it in
        # column-major order, wrapping around at the very end.
        for (i in 0:(w-1)) {
            for (j in 1:d) {
                label[i*d+j] <- X[(i*d+j) %% (w*d) + 1]
            }
        }
        return (label)
    }
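
A toy check (hypothetical 2 x 3 input) makes the behaviour visible: each label is simply the next element of X in column-major order, wrapping around at the end:

    toy.X <- matrix(1:6, nrow=2)   # consecutive characters stored column by column
    get.label(toy.X)
    #      [,1] [,2] [,3]
    # [1,]    2    4    6
    # [2,]    3    5    1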

Main model training

First, we set the basic network and training parameters:

    batch.size = 32        # number of sequences per mini-batch
    seq.len = 32           # characters per training sequence
    num.hidden = 16        # size of the LSTM hidden state
    num.embed = 16         # size of the character embedding
    num.lstm.layer = 1     # number of stacked LSTM layers
    num.round = 40         # number of training rounds
    learning.rate = 0.1
    wd = 0.00001           # weight decay
    clip_gradient = 1      # gradient clipping threshold
    update.period = 1

Then we load the data:

    ret <- make.data("input.txt", seq.len=seq.len)

Then we prepare the training and validation data for the model:

    X <- ret$data
    dic <- ret$dic
    lookup.table <- ret$lookup.table

    vocab <- length(dic)

    # Split the sequences into a training set and a validation set.
    shape <- dim(X)
    train.val.fraction <- 0.9
    size <- shape[2]

    X.train.data <- X[, 1:as.integer(size * train.val.fraction)]
    X.val.data <- X[, -(1:as.integer(size * train.val.fraction))]
    X.train.data <- drop.tail(X.train.data, batch.size)
    X.val.data <- drop.tail(X.val.data, batch.size)

    X.train.label <- get.label(X.train.data)
    X.val.label <- get.label(X.val.data)

    X.train <- list(data=X.train.data, label=X.train.label)
    X.val <- list(data=X.val.data, label=X.val.label)
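
As an optional sanity check (illustrative only, not part of the original tutorial), the preparation above guarantees that both splits contain a whole number of batches and that the labels share the shape of their data:

    stopifnot(dim(X.train.data)[2] %% batch.size == 0,
              dim(X.val.data)[2] %% batch.size == 0,
              all(dim(X.train.label) == dim(X.train.data)))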

Finally, we use the mx.lstm function from mxnet to train an LSTM model.

    model <- mx.lstm(X.train, X.val,
                     ctx=mx.cpu(),
                     num.round=num.round,
                     update.period=update.period,
                     num.lstm.layer=num.lstm.layer,
                     seq.len=seq.len,
                     num.hidden=num.hidden,
                     num.embed=num.embed,
                     num.label=vocab,
                     batch.size=batch.size,
                     input.size=vocab,
                     initializer=mx.init.uniform(0.1),
                     learning.rate=learning.rate,
                     wd=wd,
                     clip_gradient=clip_gradient)

Build an inference model

Some helper functions are needed for sampling characters from the predicted distribution:

    # Turn a vector of unnormalised weights into a cumulative distribution.
    cdf <- function(weights) {
        total <- sum(weights)
        result <- c()
        cumsum <- 0
        for (w in weights) {
            cumsum <- cumsum + w
            result <- c(result, cumsum / total)
        }
        return (result)
    }

    # Binary search: the first index whose cumulative value reaches x.
    search.val <- function(cdf, x) {
        l <- 1
        r <- length(cdf)
        while (l <= r) {
            m <- as.integer((l+r)/2)
            if (cdf[m] < x) {
                l <- m+1
            } else {
                r <- m-1
            }
        }
        return (l)
    }

    # Draw a random index with probability proportional to its weight.
    choice <- function(weights) {
        cdf.vals <- cdf(as.array(weights))
        x <- runif(1)
        idx <- search.val(cdf.vals, x)
        return (idx)
    }

    # Pick the next character: either the arg-max or a random sample.
    make.output <- function(prob, sample=FALSE) {
        if (!sample) {
            idx <- which.max(as.array(prob))
        }
        else {
            idx <- choice(prob)
        }
        return (idx)
    }
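
These helpers can be checked in isolation with hypothetical weights: cdf turns unnormalised weights into a cumulative distribution, search.val finds the first index whose cumulative value reaches a uniform draw, and make.output either takes the arg-max or samples:

    cdf(c(1, 2, 3))                   # 0.1666667 0.5000000 1.0000000
    make.output(c(0.1, 0.7, 0.2))     # 2, the index of the largest probability
    choice(c(0.1, 0.7, 0.2))          # a random index; 2 about 70% of the time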

In mxnet, the mx.lstm.inference function builds an inference model from a trained LSTM model, and the mx.lstm.forward function then produces the output of one forward step through that model.

Build an inference model from the trained model:

    infer.model <- mx.lstm.inference(num.lstm.layer=num.lstm.layer,
                                     input.size=vocab,
                                     num.hidden=num.hidden,
                                     num.embed=num.embed,
                                     num.label=vocab,
                                     arg.params=model$arg.params,
                                     ctx=mx.cpu())

Generate a sequence of 75 characters using the mx.lstm.forward function:

    # The dictionary is keyed by single characters, so seed with one character.
    start <- 'a'
    seq.len <- 75
    random.sample <- TRUE

    last.id <- dic[[start]]
    out <- start
    for (i in (1:(seq.len-1))) {
        input <- c(last.id-1)   # the model expects zero-based character ids
        ret <- mx.lstm.forward(infer.model, input, FALSE)
        infer.model <- ret$model
        prob <- ret$prob
        # Sample (or take the arg-max of) the next character and append it.
        last.id <- make.output(prob, random.sample)
        out <- paste0(out, lookup.table[[last.id]])
    }
    cat(paste0(out, "\n"))