# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# =============================================================================
""" CIFAR10 dataset is at https://www.cs.toronto.edu/~kriz/cifar.html.
It includes 5 binary dataset, each contains 10000 images. 1 row (1 image)
includes 1 label & 3072 pixels. 3072 pixels are 3 channels of a 32x32 image
"""
from __future__ import division
from __future__ import print_function
from builtins import zip
from builtins import str
from builtins import range
try:
    import cPickle as pickle  # Python 2: prefer the faster C implementation
except ImportError:
    import pickle  # Python 3
import numpy as np
import os
import argparse
from tqdm import trange
from singa import utils
from singa import optimizer
from singa import device
from singa import tensor
from caffe import caffe_net
import cnn
import vgg
import resnet
def load_dataset(filepath):
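    """Load one CIFAR10 batch file and return (image, label) numpy arrays.

    Images have shape (N, 3, 32, 32) and labels have shape (N, 1).
    """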
print('Loading data file %s' % filepath)
with open(filepath, 'rb') as fd:
        try:
            # Python 3: the batch files were pickled by Python 2, so decode
            # byte strings as latin1.
            cifar10 = pickle.load(fd, encoding='latin1')
        except TypeError:
            # Python 2's pickle.load does not accept an encoding argument.
            cifar10 = pickle.load(fd)
image = cifar10['data'].astype(dtype=np.uint8)
image = image.reshape((-1, 3, 32, 32))
label = np.asarray(cifar10['labels'], dtype=np.uint8)
label = label.reshape(label.size, 1)
return image, label
def load_train_data(dir_path, num_batches=5):
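    """Load num_batches training batch files (10000 images each) and return
    the stacked float32 images and int32 labels."""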
labels = []
batchsize = 10000
images = np.empty((num_batches * batchsize, 3, 32, 32), dtype=np.uint8)
for did in range(1, num_batches + 1):
        fname_train_data = os.path.join(dir_path, "data_batch_{}".format(did))
image, label = load_dataset(fname_train_data)
images[(did - 1) * batchsize:did * batchsize] = image
labels.extend(label)
images = np.array(images, dtype=np.float32)
labels = np.array(labels, dtype=np.int32)
return images, labels
def load_test_data(dir_path):
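    """Load the test batch; images as float32, labels as int32."""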
    images, labels = load_dataset(os.path.join(dir_path, "test_batch"))
return np.array(images, dtype=np.float32), np.array(labels, dtype=np.int32)
def normalize_for_vgg(train_x, test_x):
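    """Standardize both splits with the global mean/std of the training set."""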
mean = train_x.mean()
std = train_x.std()
train_x -= mean
test_x -= mean
train_x /= std
test_x /= std
return train_x, test_x
def normalize_for_alexnet(train_x, test_x):
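    """Subtract the mean image of the training set (AlexNet-style)."""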
mean = np.average(train_x, axis=0)
train_x -= mean
test_x -= mean
return train_x, test_x
def vgg_lr(epoch):
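    """Start from 0.1 and halve the learning rate every 25 epochs."""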
return 0.1 / float(1 << (epoch // 25))
def alexnet_lr(epoch):
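    """Step schedule: 1e-3 before epoch 120, 1e-4 before 130, 1e-5 after."""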
if epoch < 120:
return 0.001
elif epoch < 130:
return 0.0001
else:
return 0.00001
def resnet_lr(epoch):
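    """Step schedule: 0.1 before epoch 81, 0.01 before 122, 0.001 after."""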
if epoch < 81:
return 0.1
elif epoch < 122:
return 0.01
else:
return 0.001
def caffe_lr(epoch):
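    """Step schedule for the cifar10_quick config: 1e-3 for the first 8
    epochs, then 1e-4."""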
if epoch < 8:
return 0.001
else:
return 0.0001
def train(data, net, max_epoch, get_lr, weight_decay, batch_size=100,
use_cpu=False):
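    """Train net with SGD on CIFAR10.

    data is a (train_x, train_y, test_x, test_y) tuple and get_lr maps an
    epoch index to a learning rate. The net is evaluated on the test set
    after every epoch, and its parameters are checkpointed to 'model' at
    the end of training.
    """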
    print('Start initialization............')
if use_cpu:
print('Using CPU')
dev = device.get_default_device()
else:
print('Using GPU')
dev = device.create_cuda_gpu()
net.to_device(dev)
opt = optimizer.SGD(momentum=0.9, weight_decay=weight_decay)
for (p, specs) in zip(net.param_names(), net.param_specs()):
opt.register(p, specs)
tx = tensor.Tensor((batch_size, 3, 32, 32), dev)
ty = tensor.Tensor((batch_size,), dev, tensor.int32)
train_x, train_y, test_x, test_y = data
num_train_batch = train_x.shape[0] // batch_size
num_test_batch = test_x.shape[0] // batch_size
idx = np.arange(train_x.shape[0], dtype=np.int32)
for epoch in range(max_epoch):
np.random.shuffle(idx)
loss, acc = 0.0, 0.0
with trange(num_train_batch) as t:
t.set_description('Epoch={}'.format(epoch))
for b in t:
x = train_x[idx[b * batch_size: (b + 1) * batch_size]]
y = train_y[idx[b * batch_size: (b + 1) * batch_size]]
tx.copy_from_numpy(x)
ty.copy_from_numpy(y)
grads, (l, a) = net.train(tx, ty)
loss += l
acc += a
for (s, p, g) in zip(net.param_names(), net.param_values(), grads):
opt.apply_with_lr(epoch, get_lr(epoch), g, p, str(s), b)
t.set_postfix(loss=l, accuracy=a)
info = 'Training loss = %f, training accuracy = %f, lr = %f' \
% ((loss / num_train_batch), (acc / num_train_batch), get_lr(epoch))
print(info)
loss, acc = 0.0, 0.0
for b in range(num_test_batch):
x = test_x[b * batch_size: (b + 1) * batch_size]
y = test_y[b * batch_size: (b + 1) * batch_size]
tx.copy_from_numpy(x)
ty.copy_from_numpy(y)
l, a = net.evaluate(tx, ty)
loss += l
acc += a
print('Test loss = %f, test accuracy = %f' %
((loss / num_test_batch), (acc / num_test_batch)))
net.save('model', 20) # save model params into checkpoint file
if __name__ == '__main__':
parser = argparse.ArgumentParser(description='Train dcnn for cifar10')
    parser.add_argument('model', choices=['vgg', 'cnn', 'resnet', 'caffe'],
                        nargs='?', default='vgg')
    parser.add_argument('data', nargs='?', default='cifar-10-batches-py')
parser.add_argument('--use_cpu', action='store_true')
args = parser.parse_args()
    assert os.path.exists(args.data), \
        'Please download the cifar10 dataset via "python download_data.py py"'
print('Loading data ..................')
train_x, train_y = load_train_data(args.data)
test_x, test_y = load_test_data(args.data)
if args.model == 'caffe':
train_x, test_x = normalize_for_alexnet(train_x, test_x)
net = caffe_net.create_net(args.use_cpu)
# for cifar10_full_train_test.prototxt
train((train_x, train_y, test_x, test_y), net, 160, alexnet_lr, 0.004,
use_cpu=args.use_cpu)
# for cifar10_quick_train_test.prototxt
# train((train_x, train_y, test_x, test_y), net, 18, caffe_lr, 0.004,
# use_cpu=args.use_cpu)
elif args.model == 'cnn':
train_x, test_x = normalize_for_alexnet(train_x, test_x)
net = cnn.create_net(args.use_cpu)
train((train_x, train_y, test_x, test_y), net, 2, alexnet_lr, 0.004,
use_cpu=args.use_cpu)
elif args.model == 'vgg':
train_x, test_x = normalize_for_vgg(train_x, test_x)
net = vgg.create_net(args.use_cpu)
train((train_x, train_y, test_x, test_y), net, 250, vgg_lr, 0.0005,
use_cpu=args.use_cpu)
else:
train_x, test_x = normalize_for_alexnet(train_x, test_x)
net = resnet.create_net(args.use_cpu)
train((train_x, train_y, test_x, test_y), net, 200, resnet_lr, 1e-4,
use_cpu=args.use_cpu)