{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Licensed to the Apache Software Foundation (ASF) under one or more contributor license agreements; and to You under the Apache License, Version 2.0. "
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# RNN for Character Level Language Modeling"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Dataset pre-processing\n",
"\n",
"### sample data"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"from __future__ import division\n",
"from __future__ import print_function\n",
"from future import standard_library\n",
"standard_library.install_aliases()\n",
"from builtins import zip\n",
"from builtins import range\n",
"from builtins import object\n",
"from past.utils import old_div\n",
"import pickle as pickle\n",
"import numpy as np\n",
"import argparse\n",
"import sys\n",
"from tqdm import tnrange, tqdm_notebook\n",
"\n",
"# sys.path.append(os.path.join(os.path.dirname(__file__), '../../build/python'))\n",
"from singa import layer\n",
"from singa import loss\n",
"from singa import device\n",
"from singa import tensor\n",
"from singa import optimizer\n",
"from singa import initializer\n",
"from singa.proto import model_pb2\n",
"from singa import utils"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"class Data(object):\n",
"\n",
" def __init__(self, fpath, batch_size=32, seq_length=100, train_ratio=0.8):\n",
" '''Data object for loading a plain text file.\n",
"\n",
" Args:\n",
" fpath, path to the text file.\n",
" train_ratio, split the text file into train and test sets, where\n",
" train_ratio of the characters are in the train set.\n",
" '''\n",
" self.raw_data = open(fpath, 'r').read() # read text file\n",
" chars = list(set(self.raw_data))\n",
" self.vocab_size = len(chars)\n",
" self.char_to_idx = {ch: i for i, ch in enumerate(chars)}\n",
" self.idx_to_char = {i: ch for i, ch in enumerate(chars)}\n",
" data = [self.char_to_idx[c] for c in self.raw_data]\n",
" # seq_length + 1 for the data + label\n",
" nsamples = old_div(len(data), (1 + seq_length))\n",
" data = data[0:nsamples * (1 + seq_length)]\n",
" data = np.asarray(data, dtype=np.int32)\n",
" data = np.reshape(data, (-1, seq_length + 1))\n",
" # shuffle all sequences\n",
" np.random.shuffle(data)\n",
" self.train_dat = data[0:int(data.shape[0]*train_ratio)]\n",
" self.num_train_batch = old_div(self.train_dat.shape[0], batch_size)\n",
" self.val_dat = data[self.train_dat.shape[0]:]\n",
" self.num_test_batch = old_div(self.val_dat.shape[0], batch_size)\n",
" self.batch_size = batch_size\n",
" self.seq_length = seq_length\n",
" print('train dat', self.train_dat.shape)\n",
" print('val dat', self.val_dat.shape)\n",
"\n",
"\n",
"def numpy2tensors(npx, npy, dev):\n",
" '''batch, seq, dim -- > seq, batch, dim'''\n",
" tmpx = np.swapaxes(npx, 0, 1)\n",
" tmpy = np.swapaxes(npy, 0, 1)\n",
" inputs = []\n",
" labels = []\n",
" for t in range(tmpx.shape[0]):\n",
" x = tensor.from_numpy(tmpx[t])\n",
" y = tensor.from_numpy(tmpy[t])\n",
" x.to_device(dev)\n",
" y.to_device(dev)\n",
" inputs.append(x)\n",
" labels.append(y)\n",
" return inputs, labels\n",
"\n",
"\n",
"def convert(batch, batch_size, seq_length, vocab_size, dev):\n",
" '''convert a batch of data into a sequence of input tensors'''\n",
" y = batch[:, 1:]\n",
" x1 = batch[:, :seq_length]\n",
" x = np.zeros((batch_size, seq_length, vocab_size), dtype=np.float32)\n",
" for b in range(batch_size):\n",
" for t in range(seq_length):\n",
" c = x1[b, t]\n",
" x[b, t, c] = 1\n",
" return numpy2tensors(x, y, dev)"
]
},
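{
"cell_type": "markdown",
"metadata": {},
"source": [
"To see the one-hot layout that `convert` builds before it is wrapped into SINGA tensors, here is a NumPy-only sketch on a toy batch (the 4-character vocabulary is made up for illustration; no device is needed):"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# NumPy-only illustration of the one-hot encoding done inside convert().\n",
"# The toy vocabulary of 4 characters is hypothetical.\n",
"toy = np.array([[0, 2, 1, 3], [1, 1, 0, 2]], dtype=np.int32)  # (batch, seq_length + 1)\n",
"toy_seq_length = toy.shape[1] - 1\n",
"toy_vocab_size = 4\n",
"toy_y = toy[:, 1:]  # labels: the next character at each step\n",
"toy_x1 = toy[:, :toy_seq_length]  # inputs: the current character at each step\n",
"toy_x = np.zeros((toy.shape[0], toy_seq_length, toy_vocab_size), dtype=np.float32)\n",
"for b in range(toy.shape[0]):\n",
"    for t in range(toy_seq_length):\n",
"        toy_x[b, t, toy_x1[b, t]] = 1  # one-hot encode the character index\n",
"print(toy_x.shape, toy_y.shape)  # (2, 3, 4) (2, 3)"
]
},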
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Prepare the dataset. Download [all works of Shakespeare concatenated](http://cs.stanford.edu/people/karpathy/char-rnn/shakespeare_input.txt). Other plain text files can also be used. "
]
},
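{
"cell_type": "markdown",
"metadata": {},
"source": [
"A minimal download sketch, assuming network access and a writable `static/` directory (a convenience addition, not part of the original script):"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Fetch the Shakespeare corpus into static/ if it is not already there.\n",
"import os\n",
"import urllib.request\n",
"\n",
"url = 'http://cs.stanford.edu/people/karpathy/char-rnn/shakespeare_input.txt'\n",
"os.makedirs('static', exist_ok=True)\n",
"if not os.path.exists('static/shakespeare_input.txt'):\n",
"    urllib.request.urlretrieve(url, 'static/shakespeare_input.txt')"
]
},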
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Create the network"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"train dat (36224, 101)\n",
"val dat (9056, 101)\n",
"dense w (32, 67)\n",
"dense b (67,)\n",
"dense weight l1 = 0.154445\n",
"dense b l1 = 0.000000\n"
]
}
],
"source": [
"def get_lr(epoch):\n",
" return old_div(0.001, float(1 << (old_div(epoch, 50))))\n",
"\n",
"hidden_size=32\n",
"num_stacks=1\n",
"dropout=0.5\n",
"\n",
"data = Data('static/shakespeare_input.txt')\n",
"# SGD with L2 gradient normalization\n",
"opt = optimizer.RMSProp(constraint=optimizer.L2Constraint(5))\n",
"cuda = device.create_cuda_gpu()\n",
"rnn = layer.LSTM(name='lstm', hidden_size=hidden_size, num_stacks=num_stacks, dropout=dropout, input_sample_shape=(data.vocab_size,))\n",
"rnn.to_device(cuda)\n",
"rnn_w = rnn.param_values()[0]\n",
"rnn_w.uniform(-0.08, 0.08) \n",
"\n",
"dense = layer.Dense('dense', data.vocab_size, input_sample_shape=(32,))\n",
"dense.to_device(cuda)\n",
"dense_w = dense.param_values()[0]\n",
"dense_b = dense.param_values()[1]\n",
"print('dense w ', dense_w.shape)\n",
"print('dense b ', dense_b.shape)\n",
"initializer.uniform(dense_w, dense_w.shape[0], 0)\n",
"print('dense weight l1 = %f' % (dense_w.l1()))\n",
"dense_b.set_value(0)\n",
"print('dense b l1 = %f' % (dense_b.l1()))\n",
"\n",
"g_dense_w = tensor.Tensor(dense_w.shape, cuda)\n",
"g_dense_b = tensor.Tensor(dense_b.shape, cuda)"
]
},
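{
"cell_type": "markdown",
"metadata": {},
"source": [
"Note that the cuDNN-backed LSTM exposes all of its gate weights and biases as one flattened parameter tensor, which is why the single `rnn_w.uniform` call above initializes everything. A quick check (sketch):"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# The LSTM packs all gate weights and biases into one flat tensor.\n",
"print('lstm flattened weight', rnn_w.shape)"
]
},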
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Conduct SGD"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "f97e3eae043e4cafb09b9860af94ef3c"
}
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"\n",
"Epoch 0, train loss is 2.722489\n"
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "b614c1d388d94b839723aaf8272e968f"
}
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"\n",
"Epoch 1, train loss is 4.940666\n"
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "77878ccc79ab444d9b5d7bb9cc3b95dd"
}
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"\n",
"Epoch 2, train loss is 7.043295\n"
]
}
],
"source": [
"lossfun = loss.SoftmaxCrossEntropy()\n",
"train_loss = 0\n",
"for epoch in range(3):\n",
" bar = tnrange(data.num_train_batch, desc='Epoch %d' % 0)\n",
" for b in bar:\n",
" batch = data.train_dat[b * data.batch_size: (b + 1) * data.batch_size]\n",
" inputs, labels = convert(batch, data.batch_size, data.seq_length, data.vocab_size, cuda)\n",
" inputs.append(tensor.Tensor())\n",
" inputs.append(tensor.Tensor())\n",
"\n",
" outputs = rnn.forward(model_pb2.kTrain, inputs)[0:-2]\n",
" grads = []\n",
" batch_loss = 0\n",
" g_dense_w.set_value(0.0)\n",
" g_dense_b.set_value(0.0)\n",
" for output, label in zip(outputs, labels):\n",
" act = dense.forward(model_pb2.kTrain, output)\n",
" lvalue = lossfun.forward(model_pb2.kTrain, act, label)\n",
" batch_loss += lvalue.l1()\n",
" grad = lossfun.backward()\n",
" grad /= data.batch_size\n",
" grad, gwb = dense.backward(model_pb2.kTrain, grad)\n",
" grads.append(grad)\n",
" g_dense_w += gwb[0]\n",
" g_dense_b += gwb[1]\n",
" # print output.l1(), act.l1()\n",
" bar.set_postfix(train_loss=old_div(batch_loss, data.seq_length))\n",
" train_loss += batch_loss\n",
"\n",
" grads.append(tensor.Tensor())\n",
" grads.append(tensor.Tensor())\n",
" g_rnn_w = rnn.backward(model_pb2.kTrain, grads)[1][0]\n",
" dense_w, dense_b = dense.param_values()\n",
" opt.apply_with_lr(epoch, get_lr(epoch), g_rnn_w, rnn_w, 'rnnw')\n",
" opt.apply_with_lr(epoch, get_lr(epoch), g_dense_w, dense_w, 'dense_w')\n",
" opt.apply_with_lr(epoch, get_lr(epoch), g_dense_b, dense_b, 'dense_b')\n",
" print('\\nEpoch %d, train loss is %f' % (epoch, train_loss / data.num_train_batch / data.seq_length))"
]
},
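{
"cell_type": "markdown",
"metadata": {},
"source": [
"The `Data` object also holds a validation split (`val_dat`, `num_test_batch`) that the loop above never touches. A minimal evaluation sketch, reusing the same forward calls with the `kEval` flag and skipping the backward pass and parameter updates (assuming `lossfun.forward` accepts `kEval`):"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Validation loss: identical forward path, no gradients, no updates.\n",
"val_loss = 0\n",
"for b in range(data.num_test_batch):\n",
"    batch = data.val_dat[b * data.batch_size: (b + 1) * data.batch_size]\n",
"    inputs, labels = convert(batch, data.batch_size, data.seq_length, data.vocab_size, cuda)\n",
"    inputs.append(tensor.Tensor())  # empty initial hidden state hx\n",
"    inputs.append(tensor.Tensor())  # empty initial cell state cx\n",
"    outputs = rnn.forward(model_pb2.kEval, inputs)[0:-2]  # drop the final hy, cy\n",
"    for output, label in zip(outputs, labels):\n",
"        act = dense.forward(model_pb2.kEval, output)\n",
"        val_loss += lossfun.forward(model_pb2.kEval, act, label).l1()\n",
"print('validation loss per char = %f' % (val_loss / data.num_test_batch / data.seq_length))"
]
},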
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Checkpoint"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"saving model to static/model_2.bin\n"
]
}
],
"source": [
"model_path= 'static/model_' + str(epoch) + '.bin'\n",
"\n",
"with open(model_path, 'wb') as fd:\n",
" print('saving model to %s' % model_path)\n",
" d = {}\n",
" for name, w in zip(['rnn_w', 'dense_w', 'dense_b'],[rnn_w, dense_w, dense_b]):\n",
" d[name] = tensor.to_numpy(w)\n",
" d['idx_to_char'] = data.idx_to_char\n",
" d['char_to_idx'] = data.char_to_idx\n",
" d['hidden_size'] = hidden_size\n",
" d['num_stacks'] = num_stacks\n",
" d['dropout'] = dropout\n",
" pickle.dump(d, fd)\n",
"fd.close()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Sample"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Before we proceed any further, hear me speak.\n",
"\n",
"BRANCANBHAND:\n",
"But yey toor ssen!\n",
"\n",
"CRROSLA:\n",
"Ony chorsery,\n",
"I sty hit to ruse's\n",
"'bae\n",
"As bit.\n",
"Hew, sfohmzero nitl\n",
"No Wimen;\n",
"A astherter!\n",
"\n",
"CAORTEUS:\n",
"Dodt;\n",
"Wighble a cavinn a nooms;\n",
"Pepeif,\n",
"That by peryer,\n",
"Cisher jay thay ro ou hough me me awow, and fer,\n",
"Got thy\n",
"zith shone sort in and kides Eok spand.\n",
"\n",
"\n"
]
}
],
"source": [
"nsamples = 300\n",
"seed_text = \"Before we proceed any further, hear me speak.\"\n",
"do_sample = True\n",
"\n",
"with open(model_path, 'rb') as fd:\n",
" d = pickle.load(fd)\n",
" rnn_w = tensor.from_numpy(d['rnn_w'])\n",
" idx_to_char = d['idx_to_char']\n",
" char_to_idx = d['char_to_idx']\n",
" vocab_size = len(idx_to_char)\n",
" dense_w = tensor.from_numpy(d['dense_w'])\n",
" dense_b = tensor.from_numpy(d['dense_b'])\n",
" hidden_size = d['hidden_size']\n",
" num_stacks = d['num_stacks']\n",
" dropout = d['dropout']\n",
"\n",
"rnn = layer.LSTM(name='lstm', hidden_size=hidden_size,\n",
" num_stacks=num_stacks, dropout=dropout,\n",
" input_sample_shape=(len(idx_to_char),))\n",
"rnn.to_device(cuda)\n",
"rnn.param_values()[0].copy_data(rnn_w)\n",
"dense = layer.Dense('dense', vocab_size, input_sample_shape=(hidden_size,))\n",
"dense.to_device(cuda)\n",
"dense.param_values()[0].copy_data(dense_w)\n",
"dense.param_values()[1].copy_data(dense_b)\n",
"hx = tensor.Tensor((num_stacks, 1, hidden_size), cuda)\n",
"cx = tensor.Tensor((num_stacks, 1, hidden_size), cuda)\n",
"hx.set_value(0.0)\n",
"cx.set_value(0.0)\n",
"if len(seed_text) > 0:\n",
" for c in seed_text:\n",
" x = np.zeros((1, vocab_size), dtype=np.float32)\n",
" x[0, char_to_idx[c]] = 1\n",
" tx = tensor.from_numpy(x)\n",
" tx.to_device(cuda)\n",
" inputs = [tx, hx, cx]\n",
" outputs = rnn.forward(model_pb2.kEval, inputs)\n",
" y = dense.forward(model_pb2.kEval, outputs[0])\n",
" y = tensor.softmax(y)\n",
" hx = outputs[1]\n",
" cx = outputs[2]\n",
" sys.stdout.write(seed_text)\n",
"else:\n",
" y = tensor.Tensor((1, vocab_size), cuda)\n",
" y.set_value(old_div(1.0, vocab_size))\n",
"\n",
"for i in range(nsamples):\n",
" y.to_host()\n",
" prob = tensor.to_numpy(y)[0]\n",
" if do_sample:\n",
" cur = np.random.choice(vocab_size, 1, p=prob)[0]\n",
" else:\n",
" cur = np.argmax(prob)\n",
" sys.stdout.write(idx_to_char[cur])\n",
" x = np.zeros((1, vocab_size), dtype=np.float32)\n",
" x[0, cur] = 1\n",
" tx = tensor.from_numpy(x)\n",
" tx.to_device(cuda)\n",
" inputs = [tx, hx, cx]\n",
" outputs = rnn.forward(model_pb2.kEval, inputs)\n",
" y = dense.forward(model_pb2.kEval, outputs[0])\n",
" y = tensor.softmax(y)\n",
" hx = outputs[1]\n",
" cx = outputs[2]\n",
"print('')"
]
},
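{
"cell_type": "markdown",
"metadata": {},
"source": [
"A common refinement of the sampling loop above, not in the original script, is temperature sampling: rescaling the log-probabilities before drawing a character. Lower temperatures sharpen the distribution (more conservative text), higher ones flatten it (more surprising text). A small NumPy sketch with a hypothetical helper `sample_with_temperature`:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Temperature sampling sketch (hypothetical helper, not part of the original).\n",
"def sample_with_temperature(prob, temperature=0.8):\n",
"    logp = np.log(np.maximum(prob, 1e-12)) / temperature  # rescale log-probs\n",
"    p = np.exp(logp - logp.max())  # numerically stable re-normalization\n",
"    p /= p.sum()\n",
"    return np.random.choice(len(p), p=p)\n",
"\n",
"# Usage inside the sampling loop, replacing np.random.choice:\n",
"# cur = sample_with_temperature(prob, temperature=0.8)"
]
},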
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": []
}
],
"metadata": {
"anaconda-cloud": {},
"kernelspec": {
"display_name": "py3",
"language": "python",
"name": "py3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.5.3"
}
},
"nbformat": 4,
"nbformat_minor": 2
}