# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# =============================================================================
""" CIFAR10 dataset is at https://www.cs.toronto.edu/~kriz/cifar.html.
It includes 5 binary dataset, each contains 10000 images. 1 row (1 image)
includes 1 label & 3072 pixels. 3072 pixels are 3 channels of a 32x32 image
"""
import cPickle
import numpy as np
import os
import argparse
# sys.path.append(os.path.join(os.path.dirname(__file__), '../../build/python'))
from singa import utils
from singa import optimizer
from singa import device
from singa import tensor
from singa.proto import core_pb2
import alexnet
import vgg
import resnet


def load_dataset(filepath):
    '''Load one pickled CIFAR-10 batch into image and label arrays.'''
    print 'Loading data file %s' % filepath
    with open(filepath, 'rb') as fd:
        cifar10 = cPickle.load(fd)
    image = cifar10['data'].astype(dtype=np.uint8)
    image = image.reshape((-1, 3, 32, 32))
    label = np.asarray(cifar10['labels'], dtype=np.uint8)
    label = label.reshape(label.size, 1)
    return image, label
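
# For reference, a single call returns arrays shaped as below (a sketch,
# using the standard batch file names):
#
#     image, label = load_dataset('cifar-10-batches-py/data_batch_1')
#     # image: (10000, 3, 32, 32) uint8; label: (10000, 1) uint8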


def load_train_data(dir_path, num_batches=5):
    '''Load the five training batches and stack them into single arrays.'''
    labels = []
    batchsize = 10000
    images = np.empty((num_batches * batchsize, 3, 32, 32), dtype=np.uint8)
    for did in range(1, num_batches + 1):
        fname_train_data = dir_path + "/data_batch_{}".format(did)
        image, label = load_dataset(fname_train_data)
        images[(did - 1) * batchsize:did * batchsize] = image
        labels.extend(label)
    images = np.array(images, dtype=np.float32)
    labels = np.array(labels, dtype=np.int32)
    return images, labels


def load_test_data(dir_path):
    '''Load the test batch as float32 images and int32 labels.'''
    images, labels = load_dataset(dir_path + "/test_batch")
    return np.array(images, dtype=np.float32), np.array(labels, dtype=np.int32)


def normalize_for_vgg(train_x, test_x):
    '''Normalize with the global mean and std of the training images.'''
    mean = train_x.mean()
    std = train_x.std()
    train_x -= mean
    test_x -= mean
    train_x /= std
    test_x /= std
    return train_x, test_x


def normalize_for_alexnet(train_x, test_x):
    '''Subtract the per-pixel mean computed over the training images.'''
    mean = np.average(train_x, axis=0)
    train_x -= mean
    test_x -= mean
    return train_x, test_x
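
# Note the difference between the two normalizations: normalize_for_vgg
# uses one scalar mean/std for the whole dataset, while
# normalize_for_alexnet subtracts a (3, 32, 32) per-pixel mean array,
# broadcast over the batch dimension.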


def vgg_lr(epoch):
    '''Halve the initial rate of 0.1 every 25 epochs.'''
    return 0.1 / float(1 << ((epoch / 25)))
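    # e.g. vgg_lr(0) == 0.1, vgg_lr(25) == 0.05, vgg_lr(50) == 0.025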


def alexnet_lr(epoch):
    '''Step schedule: 0.001 until epoch 120, then 0.0001, then 0.00001.'''
    if epoch < 120:
        return 0.001
    elif epoch < 130:
        return 0.0001
    else:
        return 0.00001


def resnet_lr(epoch):
    '''Step schedule: 0.02 until epoch 80, then 0.005, then 0.001.'''
    if epoch < 80:
        return 0.02
    elif epoch < 120:
        return 0.005
    else:
        return 0.001
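
# train() below only calls get_lr(epoch), so any callable mapping an epoch
# number to a rate can be passed in; a hypothetical alternative with a 10x
# decay every 50 epochs:
#
#     def my_lr(epoch):
#         return 0.01 * (0.1 ** (epoch / 50))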


def train(data, net, max_epoch, get_lr, weight_decay, batch_size=100,
          use_cpu=False):
    '''Train net for max_epoch epochs and evaluate it after every epoch.

    get_lr is a callable mapping the epoch number to a learning rate.
    '''
    print 'Start initialization............'
    if use_cpu:
        print 'Using CPU'
        dev = device.get_default_device()
    else:
        print 'Using GPU'
        dev = device.create_cuda_gpu()

    net.to_device(dev)
    opt = optimizer.SGD(momentum=0.9, weight_decay=weight_decay)
    for (p, specs) in zip(net.param_names(), net.param_specs()):
        opt.register(p, specs)

    # device-side buffers for one batch of images and labels
    tx = tensor.Tensor((batch_size, 3, 32, 32), dev)
    ty = tensor.Tensor((batch_size,), dev, core_pb2.kInt)
    train_x, train_y, test_x, test_y = data
    num_train_batch = train_x.shape[0] / batch_size
    num_test_batch = test_x.shape[0] / batch_size
    idx = np.arange(train_x.shape[0], dtype=np.int32)
    for epoch in range(max_epoch):
        np.random.shuffle(idx)
        loss, acc = 0.0, 0.0
        print 'Epoch %d' % epoch
        for b in range(num_train_batch):
            x = train_x[idx[b * batch_size: (b + 1) * batch_size]]
            y = train_y[idx[b * batch_size: (b + 1) * batch_size]]
            tx.copy_from_numpy(x)
            ty.copy_from_numpy(y)
            # one forward+backward pass; returns the gradients and the
            # batch (loss, accuracy)
            grads, (l, a) = net.train(tx, ty)
            loss += l
            acc += a
            for (s, p, g) in zip(net.param_names(), net.param_values(), grads):
                opt.apply_with_lr(epoch, get_lr(epoch), g, p, str(s))
            # update progress bar
            utils.update_progress(b * 1.0 / num_train_batch,
                                  'training loss = %f, accuracy = %f' % (l, a))
        info = '\ntraining loss = %f, training accuracy = %f, lr = %f' \
            % (loss / num_train_batch, acc / num_train_batch, get_lr(epoch))
        print info

        # evaluate on the test set, without parameter updates
        loss, acc = 0.0, 0.0
        for b in range(num_test_batch):
            x = test_x[b * batch_size: (b + 1) * batch_size]
            y = test_y[b * batch_size: (b + 1) * batch_size]
            tx.copy_from_numpy(x)
            ty.copy_from_numpy(y)
            l, a = net.evaluate(tx, ty)
            loss += l
            acc += a
        print 'test loss = %f, test accuracy = %f' \
            % (loss / num_test_batch, acc / num_test_batch)
    net.save('model.bin')  # save model params into checkpoint file
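
# The checkpoint written above can be restored into a net with the same
# architecture in a later run; a sketch, assuming SINGA's matching
# FeedForwardNet.load API:
#
#     net = alexnet.create_net(use_cpu=True)
#     net.load('model.bin')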


if __name__ == '__main__':
    parser = argparse.ArgumentParser(
        description='Train vgg/alexnet/resnet for cifar10')
    parser.add_argument('model', choices=['vgg', 'alexnet', 'resnet'],
                        default='alexnet')
    parser.add_argument('data', default='cifar-10-batches-py')
    parser.add_argument('--use_cpu', action='store_true')
    args = parser.parse_args()
    assert os.path.exists(args.data), \
        'Please download the cifar10 dataset via "download_data.py py"'
    print 'Loading data ..................'
    train_x, train_y = load_train_data(args.data)
    test_x, test_y = load_test_data(args.data)
    if args.model == 'alexnet':
        train_x, test_x = normalize_for_alexnet(train_x, test_x)
        net = alexnet.create_net(args.use_cpu)
        train((train_x, train_y, test_x, test_y), net, 160, alexnet_lr, 0.004,
              use_cpu=args.use_cpu)
    elif args.model == 'vgg':
        train_x, test_x = normalize_for_vgg(train_x, test_x)
        net = vgg.create_net(args.use_cpu)
        train((train_x, train_y, test_x, test_y), net, 250, vgg_lr, 0.0005,
              use_cpu=args.use_cpu)
    else:  # resnet
        train_x, test_x = normalize_for_alexnet(train_x, test_x)
        net = resnet.create_net(args.use_cpu)
        train((train_x, train_y, test_x, test_y), net, 200, resnet_lr, 1e-4,
              use_cpu=args.use_cpu)
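
# Example invocations, assuming this script is saved as train.py next to
# alexnet.py, vgg.py and resnet.py, with the extracted dataset directory
# alongside it (run under Python 2, matching the print-statement syntax
# used above):
#
#     python train.py alexnet cifar-10-batches-py
#     python train.py vgg cifar-10-batches-py
#     python train.py resnet cifar-10-batches-py --use_cpu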