| { |
| "cells": [ |
| { |
| "cell_type": "markdown", |
| "metadata": {}, |
| "source": [ |
| "Licensed to the Apache Software Foundation (ASF) under one or more contributor license agreements; and to You under the Apache License, Version 2.0. " |
| ] |
| }, |
| { |
| "cell_type": "markdown", |
| "metadata": {}, |
| "source": [ |
| "# RNN for Character Level Language Modeling" |
| ] |
| }, |
| { |
| "cell_type": "markdown", |
| "metadata": {}, |
| "source": [ |
| "## Dataset pre-processing\n", |
| "\n", |
| "### sample data" |
| ] |
| }, |
| { |
| "cell_type": "code", |
| "execution_count": 1, |
| "metadata": {}, |
| "outputs": [], |
| "source": [ |
| "from __future__ import division\n", |
| "from __future__ import print_function\n", |
| "from future import standard_library\n", |
| "standard_library.install_aliases()\n", |
| "from builtins import zip\n", |
| "from builtins import range\n", |
| "from builtins import object\n", |
| "from past.utils import old_div\n", |
| "import pickle as pickle\n", |
| "import numpy as np\n", |
| "import argparse\n", |
| "import sys\n", |
| "from tqdm import tnrange, tqdm_notebook\n", |
| "\n", |
| "# sys.path.append(os.path.join(os.path.dirname(__file__), '../../build/python'))\n", |
| "from singa import layer\n", |
| "from singa import loss\n", |
| "from singa import device\n", |
| "from singa import tensor\n", |
| "from singa import optimizer\n", |
| "from singa import initializer\n", |
| "from singa.proto import model_pb2\n", |
| "from singa import utils" |
| ] |
| }, |
| { |
| "cell_type": "code", |
| "execution_count": 2, |
| "metadata": { |
| "collapsed": true |
| }, |
| "outputs": [], |
| "source": [ |
| "class Data(object):\n", |
| "    '''Character-level dataset built from a plain text file.\n", |
| "\n", |
| "    The text is mapped to integer character ids, cut into rows of\n", |
| "    seq_length + 1 ids (the inputs plus the shifted labels), shuffled,\n", |
| "    and split into a training part and a validation part.\n", |
| "    '''\n", |
| "\n", |
| "    def __init__(self, fpath, batch_size=32, seq_length=100, train_ratio=0.8):\n", |
| "        '''Data object for loading a plain text file.\n", |
| "\n", |
| "        Args:\n", |
| "            fpath, path to the text file.\n", |
| "            batch_size, number of sequences per mini-batch; only used to\n", |
| "                compute num_train_batch and num_test_batch.\n", |
| "            seq_length, number of input characters per sequence.\n", |
| "            train_ratio, split the sequences into train and test sets, where\n", |
| "                train_ratio of the sequences are in the train set.\n", |
| "        '''\n", |
| "        # a context manager closes the file handle as soon as it is read\n", |
| "        with open(fpath, 'r') as fd:\n", |
| "            self.raw_data = fd.read()  # read text file\n", |
| "        # sort the vocabulary so the char <-> index mapping is the same on\n", |
| "        # every run (set iteration order can differ between runs)\n", |
| "        chars = sorted(set(self.raw_data))\n", |
| "        self.vocab_size = len(chars)\n", |
| "        self.char_to_idx = {ch: i for i, ch in enumerate(chars)}\n", |
| "        self.idx_to_char = {i: ch for i, ch in enumerate(chars)}\n", |
| "        data = [self.char_to_idx[c] for c in self.raw_data]\n", |
| "        # seq_length + 1 for the data + label\n", |
| "        nsamples = len(data) // (1 + seq_length)\n", |
| "        # drop the tail that does not fill a whole sequence\n", |
| "        data = data[0:nsamples * (1 + seq_length)]\n", |
| "        data = np.asarray(data, dtype=np.int32)\n", |
| "        data = np.reshape(data, (-1, seq_length + 1))\n", |
| "        # shuffle all sequences\n", |
| "        np.random.shuffle(data)\n", |
| "        self.train_dat = data[0:int(data.shape[0] * train_ratio)]\n", |
| "        self.num_train_batch = self.train_dat.shape[0] // batch_size\n", |
| "        # everything after the training rows is kept for validation\n", |
| "        self.val_dat = data[self.train_dat.shape[0]:]\n", |
| "        self.num_test_batch = self.val_dat.shape[0] // batch_size\n", |
| "        self.batch_size = batch_size\n", |
| "        self.seq_length = seq_length\n", |
| "        print('train dat', self.train_dat.shape)\n", |
| "        print('val dat', self.val_dat.shape)\n", |
| "\n", |
| "\n", |
| "def numpy2tensors(npx, npy, dev):\n", |
| "    '''batch, seq, dim -- > seq, batch, dim\n", |
| "\n", |
| "    The first two axes of both arrays are swapped, then every slice along\n", |
| "    the new first axis is turned into a tensor and moved with\n", |
| "    to_device(dev).\n", |
| "\n", |
| "    Returns:\n", |
| "        (inputs, labels), two lists holding one tensor per slice.\n", |
| "    '''\n", |
| "    seq_major_x = np.swapaxes(npx, 0, 1)\n", |
| "    seq_major_y = np.swapaxes(npy, 0, 1)\n", |
| "    inputs, labels = [], []\n", |
| "    for step in range(seq_major_x.shape[0]):\n", |
| "        step_input = tensor.from_numpy(seq_major_x[step])\n", |
| "        step_label = tensor.from_numpy(seq_major_y[step])\n", |
| "        for created in (step_input, step_label):\n", |
| "            created.to_device(dev)\n", |
| "        inputs.append(step_input)\n", |
| "        labels.append(step_label)\n", |
| "    return inputs, labels\n", |
| "\n", |
| "\n", |
| "def convert(batch, batch_size, seq_length, vocab_size, dev):\n", |
| "    '''convert a batch of data into a sequence of input tensors\n", |
| "\n", |
| "    Args:\n", |
| "        batch, 2-D integer array of character ids; columns 0..seq_length-1\n", |
| "            are the inputs and columns 1.. are the labels.\n", |
| "        batch_size, number of rows of batch that are one-hot encoded.\n", |
| "        seq_length, number of input steps per row.\n", |
| "        vocab_size, size of the one-hot dimension.\n", |
| "        dev, device handed on to numpy2tensors.\n", |
| "\n", |
| "    Returns:\n", |
| "        the (inputs, labels) pair produced by numpy2tensors.\n", |
| "\n", |
| "    Raises:\n", |
| "        IndexError, if batch has fewer than batch_size rows or fewer than\n", |
| "            seq_length input columns, or contains an id outside the vocabulary.\n", |
| "    '''\n", |
| "    y = batch[:, 1:]\n", |
| "    x1 = batch[:, :seq_length]\n", |
| "    if x1.shape[0] < batch_size or x1.shape[1] < seq_length:\n", |
| "        # fail loudly instead of letting NumPy broadcast a short batch\n", |
| "        raise IndexError('batch of shape %s is smaller than (%d, %d)'\n", |
| "                         % (batch.shape, batch_size, seq_length))\n", |
| "    x = np.zeros((batch_size, seq_length, vocab_size), dtype=np.float32)\n", |
| "    # one-hot encode with a single indexed assignment instead of a Python\n", |
| "    # double loop over every (sample, step) pair\n", |
| "    rows = np.arange(batch_size)[:, None]\n", |
| "    cols = np.arange(seq_length)\n", |
| "    x[rows, cols, x1[:batch_size]] = 1\n", |
| "    return numpy2tensors(x, y, dev)" |
| ] |
| }, |
| { |
| "cell_type": "markdown", |
| "metadata": {}, |
| "source": [ |
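| "Prepare the dataset. Download [all works of Shakespeare concatenated](http://cs.stanford.edu/people/karpathy/char-rnn/shakespeare_input.txt) and save it as `static/shakespeare_input.txt`, the path used when the `Data` object is created below. Other plain text files can also be used." |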
| ] |
| }, |
| { |
| "cell_type": "markdown", |
| "metadata": {}, |
| "source": [ |
| "## Create the network" |
| ] |
| }, |
| { |
| "cell_type": "code", |
| "execution_count": 3, |
| "metadata": {}, |
| "outputs": [ |
| { |
| "name": "stdout", |
| "output_type": "stream", |
| "text": [ |
| "train dat (36224, 101)\n", |
| "val dat (9056, 101)\n", |
| "dense w (32, 67)\n", |
| "dense b (67,)\n", |
| "dense weight l1 = 0.154445\n", |
| "dense b l1 = 0.000000\n" |
| ] |
| } |
| ], |
| "source": [ |
| "def get_lr(epoch):\n", |
| "    '''Learning rate for the given epoch: 0.001, halved after every 50 epochs.'''\n", |
| "    halvings = old_div(epoch, 50)\n", |
| "    return old_div(0.001, float(1 << halvings))\n", |
| "\n", |
| "hidden_size = 32\n", |
| "num_stacks = 1\n", |
| "dropout = 0.5\n", |
| "\n", |
| "data = Data('static/shakespeare_input.txt')\n", |
| "# RMSProp (not plain SGD) with an L2 constraint of 5 on the gradients\n", |
| "opt = optimizer.RMSProp(constraint=optimizer.L2Constraint(5))\n", |
| "cuda = device.create_cuda_gpu()\n", |
| "rnn = layer.LSTM(name='lstm', hidden_size=hidden_size, num_stacks=num_stacks,\n", |
| "                 dropout=dropout, input_sample_shape=(data.vocab_size,))\n", |
| "rnn.to_device(cuda)\n", |
| "# the LSTM exposes its parameters as the first entry of param_values()\n", |
| "rnn_w = rnn.param_values()[0]\n", |
| "rnn_w.uniform(-0.08, 0.08)\n", |
| "\n", |
| "# the dense layer consumes the LSTM output, so its input size must follow\n", |
| "# hidden_size instead of repeating the literal 32\n", |
| "dense = layer.Dense('dense', data.vocab_size, input_sample_shape=(hidden_size,))\n", |
| "dense.to_device(cuda)\n", |
| "dense_w = dense.param_values()[0]\n", |
| "dense_b = dense.param_values()[1]\n", |
| "print('dense w ', dense_w.shape)\n", |
| "print('dense b ', dense_b.shape)\n", |
| "initializer.uniform(dense_w, dense_w.shape[0], 0)\n", |
| "print('dense weight l1 = %f' % (dense_w.l1()))\n", |
| "dense_b.set_value(0)\n", |
| "print('dense b l1 = %f' % (dense_b.l1()))\n", |
| "\n", |
| "# buffers that accumulate the dense-layer gradients over all time steps\n", |
| "g_dense_w = tensor.Tensor(dense_w.shape, cuda)\n", |
| "g_dense_b = tensor.Tensor(dense_b.shape, cuda)" |
| ] |
| }, |
| { |
| "cell_type": "markdown", |
| "metadata": {}, |
| "source": [ |
| "## Conduct SGD" |
| ] |
| }, |
| { |
| "cell_type": "code", |
| "execution_count": 4, |
| "metadata": {}, |
| "outputs": [ |
| { |
| "data": { |
| "application/vnd.jupyter.widget-view+json": { |
| "model_id": "f97e3eae043e4cafb09b9860af94ef3c" |
| } |
| }, |
| "metadata": {}, |
| "output_type": "display_data" |
| }, |
| { |
| "name": "stdout", |
| "output_type": "stream", |
| "text": [ |
| "\n", |
| "\n", |
| "Epoch 0, train loss is 2.722489\n" |
| ] |
| }, |
| { |
| "data": { |
| "application/vnd.jupyter.widget-view+json": { |
| "model_id": "b614c1d388d94b839723aaf8272e968f" |
| } |
| }, |
| "metadata": {}, |
| "output_type": "display_data" |
| }, |
| { |
| "name": "stdout", |
| "output_type": "stream", |
| "text": [ |
| "\n", |
| "\n", |
| "Epoch 1, train loss is 4.940666\n" |
| ] |
| }, |
| { |
| "data": { |
| "application/vnd.jupyter.widget-view+json": { |
| "model_id": "77878ccc79ab444d9b5d7bb9cc3b95dd" |
| } |
| }, |
| "metadata": {}, |
| "output_type": "display_data" |
| }, |
| { |
| "name": "stdout", |
| "output_type": "stream", |
| "text": [ |
| "\n", |
| "\n", |
| "Epoch 2, train loss is 7.043295\n" |
| ] |
| } |
| ], |
| "source": [ |
| "lossfun = loss.SoftmaxCrossEntropy()\n", |
| "for epoch in range(3):\n", |
| "    # reset the accumulator every epoch so the printed value is the mean\n", |
| "    # loss of this epoch only, not a running sum over all epochs so far\n", |
| "    train_loss = 0\n", |
| "    # the learning rate only depends on the epoch, so compute it once\n", |
| "    lr = get_lr(epoch)\n", |
| "    bar = tnrange(data.num_train_batch, desc='Epoch %d' % epoch)\n", |
| "    for b in bar:\n", |
| "        batch = data.train_dat[b * data.batch_size: (b + 1) * data.batch_size]\n", |
| "        inputs, labels = convert(batch, data.batch_size, data.seq_length, data.vocab_size, cuda)\n", |
| "        # two empty tensors follow the per-step inputs; presumably they stand\n", |
| "        # for the initial hidden and cell states (the sampling cell passes\n", |
| "        # hx and cx in the same positions)\n", |
| "        inputs.append(tensor.Tensor())\n", |
| "        inputs.append(tensor.Tensor())\n", |
| "\n", |
| "        # keep the per-step outputs, drop the last two returned tensors\n", |
| "        outputs = rnn.forward(model_pb2.kTrain, inputs)[0:-2]\n", |
| "        grads = []\n", |
| "        batch_loss = 0\n", |
| "        g_dense_w.set_value(0.0)\n", |
| "        g_dense_b.set_value(0.0)\n", |
| "        for output, label in zip(outputs, labels):\n", |
| "            act = dense.forward(model_pb2.kTrain, output)\n", |
| "            lvalue = lossfun.forward(model_pb2.kTrain, act, label)\n", |
| "            batch_loss += lvalue.l1()\n", |
| "            grad = lossfun.backward()\n", |
| "            grad /= data.batch_size\n", |
| "            grad, gwb = dense.backward(model_pb2.kTrain, grad)\n", |
| "            grads.append(grad)\n", |
| "            # the dense layer is shared by all steps: sum its gradients\n", |
| "            g_dense_w += gwb[0]\n", |
| "            g_dense_b += gwb[1]\n", |
| "        bar.set_postfix(train_loss=old_div(batch_loss, data.seq_length))\n", |
| "        train_loss += batch_loss\n", |
| "\n", |
| "        # two empty tensors again, mirroring the two extra forward inputs\n", |
| "        grads.append(tensor.Tensor())\n", |
| "        grads.append(tensor.Tensor())\n", |
| "        g_rnn_w = rnn.backward(model_pb2.kTrain, grads)[1][0]\n", |
| "        dense_w, dense_b = dense.param_values()\n", |
| "        opt.apply_with_lr(epoch, lr, g_rnn_w, rnn_w, 'rnnw')\n", |
| "        opt.apply_with_lr(epoch, lr, g_dense_w, dense_w, 'dense_w')\n", |
| "        opt.apply_with_lr(epoch, lr, g_dense_b, dense_b, 'dense_b')\n", |
| "    print('\\nEpoch %d, train loss is %f' % (epoch, train_loss / data.num_train_batch / data.seq_length))" |
| ] |
| }, |
| { |
| "cell_type": "markdown", |
| "metadata": {}, |
| "source": [ |
| "## Checkpoint" |
| ] |
| }, |
| { |
| "cell_type": "code", |
| "execution_count": 5, |
| "metadata": {}, |
| "outputs": [ |
| { |
| "name": "stdout", |
| "output_type": "stream", |
| "text": [ |
| "saving model to static/model_2.bin\n" |
| ] |
| } |
| ], |
| "source": [ |
| "model_path = 'static/model_' + str(epoch) + '.bin'\n", |
| "\n", |
| "# build the whole payload first, so a failing conversion cannot leave an\n", |
| "# empty checkpoint file behind\n", |
| "d = {}\n", |
| "for name, w in zip(['rnn_w', 'dense_w', 'dense_b'], [rnn_w, dense_w, dense_b]):\n", |
| "    d[name] = tensor.to_numpy(w)\n", |
| "d['idx_to_char'] = data.idx_to_char\n", |
| "d['char_to_idx'] = data.char_to_idx\n", |
| "d['hidden_size'] = hidden_size\n", |
| "d['num_stacks'] = num_stacks\n", |
| "d['dropout'] = dropout\n", |
| "\n", |
| "print('saving model to %s' % model_path)\n", |
| "# the with-statement closes the file, so no explicit fd.close() is needed\n", |
| "with open(model_path, 'wb') as fd:\n", |
| "    pickle.dump(d, fd)" |
| ] |
| }, |
| { |
| "cell_type": "markdown", |
| "metadata": {}, |
| "source": [ |
| "## Sample" |
| ] |
| }, |
| { |
| "cell_type": "code", |
| "execution_count": 6, |
| "metadata": {}, |
| "outputs": [ |
| { |
| "name": "stdout", |
| "output_type": "stream", |
| "text": [ |
| "Before we proceed any further, hear me speak.\n", |
| "\n", |
| "BRANCANBHAND:\n", |
| "But yey toor ssen!\n", |
| "\n", |
| "CRROSLA:\n", |
| "Ony chorsery,\n", |
| "I sty hit to ruse's\n", |
| "'bae\n", |
| "As bit.\n", |
| "Hew, sfohmzero nitl\n", |
| "No Wimen;\n", |
| "A astherter!\n", |
| "\n", |
| "CAORTEUS:\n", |
| "Dodt;\n", |
| "Wighble a cavinn a nooms;\n", |
| "Pepeif,\n", |
| "That by peryer,\n", |
| "Cisher jay thay ro ou hough me me awow, and fer,\n", |
| "Got thy\n", |
| "zith shone sort in and kides Eok spand.\n", |
| "\n", |
| "\n" |
| ] |
| } |
| ], |
| "source": [ |
| "nsamples = 300\n", |
| "seed_text = \"Before we proceed any further, hear me speak.\"\n", |
| "do_sample = True\n", |
| "\n", |
| "# NOTE: pickle.load can execute arbitrary code; only load checkpoints that\n", |
| "# were written by this notebook or that come from a trusted source.\n", |
| "with open(model_path, 'rb') as fd:\n", |
| "    d = pickle.load(fd)\n", |
| "rnn_w = tensor.from_numpy(d['rnn_w'])\n", |
| "idx_to_char = d['idx_to_char']\n", |
| "char_to_idx = d['char_to_idx']\n", |
| "vocab_size = len(idx_to_char)\n", |
| "dense_w = tensor.from_numpy(d['dense_w'])\n", |
| "dense_b = tensor.from_numpy(d['dense_b'])\n", |
| "hidden_size = d['hidden_size']\n", |
| "num_stacks = d['num_stacks']\n", |
| "dropout = d['dropout']\n", |
| "\n", |
| "# rebuild both layers from the checkpoint and copy the saved parameters in\n", |
| "rnn = layer.LSTM(name='lstm', hidden_size=hidden_size,\n", |
| "                 num_stacks=num_stacks, dropout=dropout,\n", |
| "                 input_sample_shape=(vocab_size,))\n", |
| "rnn.to_device(cuda)\n", |
| "rnn.param_values()[0].copy_data(rnn_w)\n", |
| "dense = layer.Dense('dense', vocab_size, input_sample_shape=(hidden_size,))\n", |
| "dense.to_device(cuda)\n", |
| "dense.param_values()[0].copy_data(dense_w)\n", |
| "dense.param_values()[1].copy_data(dense_b)\n", |
| "# zero-initialised recurrent state for a batch of one sequence\n", |
| "hx = tensor.Tensor((num_stacks, 1, hidden_size), cuda)\n", |
| "cx = tensor.Tensor((num_stacks, 1, hidden_size), cuda)\n", |
| "hx.set_value(0.0)\n", |
| "cx.set_value(0.0)\n", |
| "\n", |
| "\n", |
| "def forward_char(char_idx, hx, cx):\n", |
| "    '''Feed one character id through the LSTM and the dense layer.\n", |
| "\n", |
| "    Args:\n", |
| "        char_idx, index of the input character in the vocabulary.\n", |
| "        hx, cx, the recurrent state tensors passed to rnn.forward.\n", |
| "\n", |
| "    Returns:\n", |
| "        (y, hx, cx), the softmax output for the next character and the\n", |
| "        second and third outputs of rnn.forward, which are fed back as\n", |
| "        hx and cx for the next step.\n", |
| "    '''\n", |
| "    x = np.zeros((1, vocab_size), dtype=np.float32)\n", |
| "    x[0, char_idx] = 1\n", |
| "    tx = tensor.from_numpy(x)\n", |
| "    tx.to_device(cuda)\n", |
| "    outputs = rnn.forward(model_pb2.kEval, [tx, hx, cx])\n", |
| "    y = tensor.softmax(dense.forward(model_pb2.kEval, outputs[0]))\n", |
| "    return y, outputs[1], outputs[2]\n", |
| "\n", |
| "\n", |
| "if len(seed_text) > 0:\n", |
| "    # warm up the recurrent state on the seed text; only the distribution\n", |
| "    # after the last seed character is used for sampling\n", |
| "    for c in seed_text:\n", |
| "        y, hx, cx = forward_char(char_to_idx[c], hx, cx)\n", |
| "    sys.stdout.write(seed_text)\n", |
| "else:\n", |
| "    # without a seed, start from a uniform distribution over the vocabulary\n", |
| "    y = tensor.Tensor((1, vocab_size), cuda)\n", |
| "    y.set_value(old_div(1.0, vocab_size))\n", |
| "\n", |
| "for i in range(nsamples):\n", |
| "    y.to_host()\n", |
| "    # renormalise in float64: np.random.choice rejects p when the float32\n", |
| "    # softmax output does not sum to 1 within its tolerance\n", |
| "    prob = tensor.to_numpy(y)[0].astype(np.float64)\n", |
| "    prob /= prob.sum()\n", |
| "    if do_sample:\n", |
| "        cur = np.random.choice(vocab_size, 1, p=prob)[0]\n", |
| "    else:\n", |
| "        cur = np.argmax(prob)\n", |
| "    sys.stdout.write(idx_to_char[cur])\n", |
| "    y, hx, cx = forward_char(cur, hx, cx)\n", |
| "print('')" |
| ] |
| }, |
| { |
| "cell_type": "code", |
| "execution_count": null, |
| "metadata": { |
| "collapsed": true |
| }, |
| "outputs": [], |
| "source": [] |
| } |
| ], |
| "metadata": { |
| "anaconda-cloud": {}, |
| "kernelspec": { |
| "display_name": "py3", |
| "language": "python", |
| "name": "py3" |
| }, |
| "language_info": { |
| "codemirror_mode": { |
| "name": "ipython", |
| "version": 3 |
| }, |
| "file_extension": ".py", |
| "mimetype": "text/x-python", |
| "name": "python", |
| "nbconvert_exporter": "python", |
| "pygments_lexer": "ipython3", |
| "version": "3.5.3" |
| } |
| }, |
| "nbformat": 4, |
| "nbformat_minor": 2 |
| } |