Merge pull request #1048 from apache/dev
Dev
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 29043ab..9f49efa 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -29,10 +29,10 @@
#string(REGEX REPLACE "^[0-9]+\\.[0-9]+\\.([0-9]+).*" "\\1" VERSION_PATCH "${VERSION}")
-SET(PACKAGE_VERSION 3.3.0) # ${VERSION})
-SET(VERSION 3.3.0)
-SET(SINGA_MAJOR_VERSION 3)
-SET(SINGA_MINOR_VERSION 3)
+SET(PACKAGE_VERSION 4.0.0) # ${VERSION})
+SET(VERSION 4.0.0)
+SET(SINGA_MAJOR_VERSION 4)
+SET(SINGA_MINOR_VERSION 0)
SET(SINGA_PATCH_VERSION 0)
#SET(SINGA_MAJOR_VERSION ${VERSION_MAJOR}) # 0 -
#SET(SINGA_MINOR_VERSION ${VERSION_MINOR}) # 0 - 9
diff --git a/RELEASE_NOTES b/RELEASE_NOTES
index d426ac2..bbe6772 100644
--- a/RELEASE_NOTES
+++ b/RELEASE_NOTES
@@ -1,3 +1,40 @@
+Release Notes - SINGA - Version singa-4.0.0
+
+SINGA is a distributed deep learning library.
+
+This release includes the following changes:
+
+ * Enhance distributed training
+ * Add support for configuration of number of GPUs to be used.
+ * Increase max epoch for better convergence.
+ * Print intermediate mini-batch information.
+ * Add support for switching between CPU and GPU devices.
+
+ * Enhance example code
+ * Update the args of normalize forward function in the transforms of the BloodMnist example.
+ * Update the xceptionnet in the cnn example.
+ * Add arguments for weight decay, momentum and learning rates in the cnn example.
+ * Add training scripts for more datasets and model types in the cnn example.
+ * Add resnet dist version for the large dataset cnn example.
+ * Add cifar 10 multi process for the large dataset cnn example.
+ * Add sparsification implementation for mnist in the large dataset cnn example.
+ * Update the cifar datasets downloading to local directories.
+ * Extend the cifar datasets load function for customized directories.
+
+ * Enhance the webpage
+ * Update online documentation for distributed training.
+
+ * Promote code quality
+ * Update inline comments for preprocessing and data loading.
+
+ * Update the PIL image module
+
+ * Update the runtime Dockerfile
+
+ * Update the conda files
+
+----------------------------------------------------------------------------------------------
+
Release Notes - SINGA - Version singa-3.3.0
SINGA is a distributed deep learning library.
diff --git a/examples/cifar_distributed_cnn/autograd/cifar10_multiprocess.py b/examples/cifar_distributed_cnn/autograd/cifar10_multiprocess.py
old mode 100755
new mode 100644
index b5e51ad..df2dba8
--- a/examples/cifar_distributed_cnn/autograd/cifar10_multiprocess.py
+++ b/examples/cifar_distributed_cnn/autograd/cifar10_multiprocess.py
@@ -26,7 +26,7 @@
# Generate a NCCL ID to be used for collective communication
nccl_id = singa.NcclIdHolder()
- # number of GPUs to be used
+ # Configure number of GPUs to be used
world_size = int(sys.argv[1])
# Testing the experimental partial-parameter update asynchronous training
diff --git a/examples/cifar_distributed_cnn/autograd/resnet_cifar10.py b/examples/cifar_distributed_cnn/autograd/resnet_cifar10.py
index 0d6379b..a8e6efd 100644
--- a/examples/cifar_distributed_cnn/autograd/resnet_cifar10.py
+++ b/examples/cifar_distributed_cnn/autograd/resnet_cifar10.py
@@ -138,7 +138,7 @@
return output
-# Function to sychronize SINGA TENSOR initial model parameters
+# Function to synchronize SINGA TENSOR initial model parameters
def synchronize(tensor, dist_opt):
dist_opt.all_reduce(tensor.data)
dist_opt.wait()
@@ -159,7 +159,7 @@
nccl_id=None,
partial_update=False):
- # Define the hypermeters for the train_cifar10
+ # Define the hyperparameters for the train_cifar10
sgd = opt.SGD(lr=0.005, momentum=0.9, weight_decay=1e-5)
max_epoch = 5
batch_size = 32
@@ -199,7 +199,7 @@
idx = np.arange(train_x.shape[0], dtype=np.int32)
if DIST:
- #Sychronize the initial parameters
+ # Synchronize the initial parameters
autograd.training = True
x = np.random.randn(batch_size, 3, IMG_SIZE,
IMG_SIZE).astype(np.float32)
@@ -220,7 +220,7 @@
if ((DIST == False) or (sgd.global_rank == 0)):
print('Starting Epoch %d:' % (epoch))
- #Training phase
+ # Training phase
autograd.training = True
train_correct = np.zeros(shape=[1], dtype=np.float32)
test_correct = np.zeros(shape=[1], dtype=np.float32)
@@ -258,11 +258,11 @@
flush=True)
if partial_update:
- # Sychronize parameters before evaluation phase
+ # Synchronize parameters before evaluation phase
for p in param:
synchronize(p, sgd)
- #Evaulation phase
+ # Evaluation phase
autograd.training = False
for b in range(num_test_batch):
x = test_x[b * batch_size:(b + 1) * batch_size]
@@ -275,7 +275,7 @@
to_categorical(y, num_classes))
if DIST:
- # Reduce the evaulation accuracy from multiple devices
+ # Reduce the evaluation accuracy from multiple devices
test_correct = reduce_variable(test_correct, sgd, reducer)
# Output the evaluation accuracy
diff --git a/examples/cifar_distributed_cnn/autograd/xceptionnet.py b/examples/cifar_distributed_cnn/autograd/xceptionnet.py
index 357e47d..ce28640 100644
--- a/examples/cifar_distributed_cnn/autograd/xceptionnet.py
+++ b/examples/cifar_distributed_cnn/autograd/xceptionnet.py
@@ -140,7 +140,7 @@
self.conv2 = layer.Conv2d(32, 64, 3, 1, 1, bias=False)
self.bn2 = layer.BatchNorm2d(64)
self.relu2 = layer.ReLU()
- # do relu here
+ # Relu Layer
self.block1 = Block(64,
128,
@@ -225,7 +225,7 @@
self.bn3 = layer.BatchNorm2d(1536)
self.relu3 = layer.ReLU()
- # do relu here
+ # Relu Layer
self.conv4 = layer.SeparableConv2d(1536, 2048, 3, 1, 1)
self.bn4 = layer.BatchNorm2d(2048)
@@ -279,9 +279,8 @@
if __name__ == '__main__':
model = Xception(num_classes=1000)
- print('Start intialization............')
+ print('Start initialization............')
dev = device.create_cuda_gpu_on(0)
- #dev = device.create_cuda_gpu()
niters = 20
batch_size = 16
diff --git a/examples/cifar_distributed_cnn/benchmark.py b/examples/cifar_distributed_cnn/benchmark.py
index 9156927..997d5a4 100644
--- a/examples/cifar_distributed_cnn/benchmark.py
+++ b/examples/cifar_distributed_cnn/benchmark.py
@@ -42,7 +42,7 @@
# For distributed training, sequential has better throughput in the current version
if DIST == True:
- sgd = opt.DistOpt(sgd)
+ sgd = opt.DistOpt(sgd) # Need to make sure DistOpt is working for multiple GPUs/nodes
world_size = sgd.world_size
local_rank = sgd.local_rank
global_rank = sgd.global_rank
diff --git a/examples/cifar_distributed_cnn/data/cifar10.py b/examples/cifar_distributed_cnn/data/cifar10.py
index 3b83ad7..1f57d03 100644
--- a/examples/cifar_distributed_cnn/data/cifar10.py
+++ b/examples/cifar_distributed_cnn/data/cifar10.py
@@ -82,7 +82,7 @@
val_x[:, ch, :, :] /= std[ch]
return train_x, val_x
-def load():
+def load(): # Need to pass in the path for loading training data
train_x, train_y = load_train_data()
val_x, val_y = load_test_data()
train_x, val_x = normalize(train_x, val_x)
diff --git a/examples/cifar_distributed_cnn/data/download_mnist.py b/examples/cifar_distributed_cnn/data/download_mnist.py
index 65acb0e..18d71b9 100644
--- a/examples/cifar_distributed_cnn/data/download_mnist.py
+++ b/examples/cifar_distributed_cnn/data/download_mnist.py
@@ -23,7 +23,7 @@
def check_exist_or_download(url):
- download_dir = '/tmp/'
+ download_dir = '/tmp/' # downloaded to the /tmp/ folder
name = url.rsplit('/', 1)[-1]
filename = os.path.join(download_dir, name)
@@ -36,14 +36,14 @@
if __name__ == '__main__':
- #url of the mnist dataset
+ # List urls of the mnist dataset
train_x_url = 'http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz'
train_y_url = 'http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz'
valid_x_url = 'http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz'
valid_y_url = 'http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz'
- #download the mnist dataset
+ # Download the mnist dataset
check_exist_or_download(train_x_url)
check_exist_or_download(train_y_url)
check_exist_or_download(valid_x_url)
- check_exist_or_download(valid_y_url)
+ check_exist_or_download(valid_y_url)
\ No newline at end of file
diff --git a/examples/cifar_distributed_cnn/run-rtx.sh b/examples/cifar_distributed_cnn/run-rtx.sh
index 9b73653..2f4262c 100755
--- a/examples/cifar_distributed_cnn/run-rtx.sh
+++ b/examples/cifar_distributed_cnn/run-rtx.sh
@@ -20,7 +20,19 @@
#!/usr/bin/env python -W ignore::DeprecationWarning
# resnet
+mpiexec -np 8 python train_mpi.py resnet mnist -l 0.015 -b 32
mpiexec -np 8 python train_mpi.py resnet cifar10 -l 0.015 -b 32
+mpiexec -np 8 python train_mpi.py resnet cifar100 -l 0.015 -b 32
# cnn
+mpiexec -np 8 python train_mpi.py cnn mnist -l 0.015 -b 32
mpiexec -np 8 python train_mpi.py cnn cifar10 -l 0.015 -b 32
+mpiexec -np 8 python train_mpi.py cnn cifar100 -l 0.015 -b 32
+
+# mlp
+mpiexec -np 8 python train_mpi.py mlp cifar10 -l 0.015 -b 32
+mpiexec -np 8 python train_mpi.py mlp cifar100 -l 0.015 -b 32
+
+# alexnet
+mpiexec -np 8 python train_mpi.py alexnet cifar10 -l 0.015 -b 32
+mpiexec -np 8 python train_mpi.py alexnet cifar100 -l 0.015 -b 32
diff --git a/examples/cifar_distributed_cnn/train_cnn.py b/examples/cifar_distributed_cnn/train_cnn.py
old mode 100755
new mode 100644
index d102623..eb0fbf3
--- a/examples/cifar_distributed_cnn/train_cnn.py
+++ b/examples/cifar_distributed_cnn/train_cnn.py
@@ -111,7 +111,7 @@
dist_option='plain',
spars=None,
precision='float32'):
- dev = device.create_cuda_gpu_on(local_rank)
+ dev = device.create_cuda_gpu_on(local_rank) # need to change to CPU device for CPU-only machines
dev.SetRandSeed(0)
np.random.seed(0)
@@ -224,6 +224,8 @@
start = time.time()
for b in range(niters):
+ # if b % 100 == 0:
+ # print ("b: \n", b)
# Generate the patch data in this iteration
# Train the model
model(tx, ty, dist_option, spars)
@@ -265,7 +267,7 @@
dest='precision')
parser.add_argument('-m',
'--max-epoch',
- default=10,
+ default=20,
type=int,
help='maximum epochs',
dest='max_epoch')
diff --git a/examples/cnn/autograd/cifar10_multiprocess.py b/examples/cnn/autograd/cifar10_multiprocess.py
index b5e51ad..4b3cb0f 100644
--- a/examples/cnn/autograd/cifar10_multiprocess.py
+++ b/examples/cnn/autograd/cifar10_multiprocess.py
@@ -26,7 +26,7 @@
# Generate a NCCL ID to be used for collective communication
nccl_id = singa.NcclIdHolder()
- # number of GPUs to be used
+ # Configure the number of GPUs to be used
world_size = int(sys.argv[1])
# Testing the experimental partial-parameter update asynchronous training
@@ -40,4 +40,4 @@
partial_update)))
for p in process:
- p.start()
+ p.start()
\ No newline at end of file
diff --git a/examples/cnn/autograd/resnet_cifar10.py b/examples/cnn/autograd/resnet_cifar10.py
index 3c6876f..7541736 100644
--- a/examples/cnn/autograd/resnet_cifar10.py
+++ b/examples/cnn/autograd/resnet_cifar10.py
@@ -199,7 +199,7 @@
idx = np.arange(train_x.shape[0], dtype=np.int32)
if DIST:
- #Sychronize the initial parameters
+ # Synchronize the initial parameters
autograd.training = True
x = np.random.randn(batch_size, 3, IMG_SIZE,
IMG_SIZE).astype(np.float32)
@@ -220,7 +220,7 @@
if ((DIST == False) or (sgd.global_rank == 0)):
print('Starting Epoch %d:' % (epoch))
- #Training phase
+ # Training phase
autograd.training = True
train_correct = np.zeros(shape=[1], dtype=np.float32)
test_correct = np.zeros(shape=[1], dtype=np.float32)
@@ -262,7 +262,7 @@
for p in param:
synchronize(p, sgd)
- #Evaulation phase
+ # Evaluation phase
autograd.training = False
for b in range(num_test_batch):
x = test_x[b * batch_size:(b + 1) * batch_size]
diff --git a/examples/cnn/autograd/xceptionnet.py b/examples/cnn/autograd/xceptionnet.py
index 357e47d..8fb23d8 100644
--- a/examples/cnn/autograd/xceptionnet.py
+++ b/examples/cnn/autograd/xceptionnet.py
@@ -225,7 +225,7 @@
self.bn3 = layer.BatchNorm2d(1536)
self.relu3 = layer.ReLU()
- # do relu here
+ # Relu Layer
self.conv4 = layer.SeparableConv2d(1536, 2048, 3, 1, 1)
self.bn4 = layer.BatchNorm2d(2048)
diff --git a/examples/cnn/data/cifar10.py b/examples/cnn/data/cifar10.py
index d61d84e..5caaf30 100644
--- a/examples/cnn/data/cifar10.py
+++ b/examples/cnn/data/cifar10.py
@@ -40,7 +40,7 @@
return image, label
-def load_train_data(dir_path='/tmp/cifar-10-batches-py', num_batches=5):
+def load_train_data(dir_path='/tmp/cifar-10-batches-py', num_batches=5): # need to save to specific local directories
labels = []
batchsize = 10000
images = np.empty((num_batches * batchsize, 3, 32, 32), dtype=np.uint8)
@@ -54,7 +54,7 @@
return images, labels
-def load_test_data(dir_path='/tmp/cifar-10-batches-py'):
+def load_test_data(dir_path='/tmp/cifar-10-batches-py'): # need to save to specific local directories
images, labels = load_dataset(check_dataset_exist(dir_path + "/test_batch"))
return np.array(images, dtype=np.float32), np.array(labels, dtype=np.int32)
diff --git a/examples/cnn/data/download_cifar10.py b/examples/cnn/data/download_cifar10.py
index a010b2e..8e44679 100755
--- a/examples/cnn/data/download_cifar10.py
+++ b/examples/cnn/data/download_cifar10.py
@@ -30,7 +30,7 @@
if os.path.exists(filepath):
print('The tar file does exist. Extracting it now..')
with tarfile.open(filepath, 'r') as f:
- f.extractall('/tmp/')
+ f.extractall('/tmp/') # need to specify a local directory
print('Finished!')
sys.exit(0)
@@ -43,7 +43,7 @@
if __name__ == '__main__':
- dirpath = '/tmp/'
+ dirpath = '/tmp/' # need to specify a local directory
gzfile = dirpath + 'cifar-10-python.tar.gz'
url = 'http://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz'
do_download(dirpath, gzfile, url)
diff --git a/examples/cnn/data/download_cifar100.py b/examples/cnn/data/download_cifar100.py
index 59f9d23..5f1e21b 100755
--- a/examples/cnn/data/download_cifar100.py
+++ b/examples/cnn/data/download_cifar100.py
@@ -20,7 +20,7 @@
from download_cifar10 import do_download
if __name__ == '__main__':
- dirpath = '/tmp/'
+ dirpath = '/tmp/' # need to specify a local directory
gzfile = dirpath + 'cifar-100-python.tar.gz'
url = 'http://www.cs.toronto.edu/~kriz/cifar-100-python.tar.gz'
do_download(dirpath, gzfile, url)
diff --git a/examples/cnn/data/mnist.py b/examples/cnn/data/mnist.py
index 9cd1a84..b25bf5e 100644
--- a/examples/cnn/data/mnist.py
+++ b/examples/cnn/data/mnist.py
@@ -34,10 +34,10 @@
def load_dataset():
- train_x_path = '/tmp/train-images-idx3-ubyte.gz'
- train_y_path = '/tmp/train-labels-idx1-ubyte.gz'
- valid_x_path = '/tmp/t10k-images-idx3-ubyte.gz'
- valid_y_path = '/tmp/t10k-labels-idx1-ubyte.gz'
+ train_x_path = '/tmp/train-images-idx3-ubyte.gz' # need to change to local disk
+ train_y_path = '/tmp/train-labels-idx1-ubyte.gz' # need to change to local disk
+ valid_x_path = '/tmp/t10k-images-idx3-ubyte.gz' # need to change to local disk
+ valid_y_path = '/tmp/t10k-labels-idx1-ubyte.gz' # need to change to local disk
train_x = read_image_file(check_dataset_exist(train_x_path)).astype(
np.float32)
diff --git a/examples/cnn/run.sh b/examples/cnn/run.sh
new file mode 100644
index 0000000..9f1c4aa
--- /dev/null
+++ b/examples/cnn/run.sh
@@ -0,0 +1,36 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+
+### mnist
+python train_cnn.py mlp mnist
+python train_cnn.py cnn mnist
+python train_cnn.py resnet mnist
+python train_cnn.py alexnet mnist
+
+### cifar10
+python train_cnn.py mlp cifar10
+python train_cnn.py cnn cifar10
+python train_cnn.py resnet cifar10
+python train_cnn.py alexnet cifar10
+
+### cifar100
+python train_cnn.py mlp cifar100
+python train_cnn.py cnn cifar100
+python train_cnn.py resnet cifar100
+python train_cnn.py alexnet cifar100
diff --git a/examples/cnn/train_cnn.py b/examples/cnn/train_cnn.py
index bcccc51..5c06470 100644
--- a/examples/cnn/train_cnn.py
+++ b/examples/cnn/train_cnn.py
@@ -107,7 +107,7 @@
dist_option='plain',
spars=None,
precision='float32'):
- dev = device.create_cuda_gpu_on(local_rank)
+ dev = device.create_cuda_gpu_on(local_rank) # need to change to CPU device for CPU-only machines
dev.SetRandSeed(0)
np.random.seed(0)
@@ -165,13 +165,6 @@
train_x, train_y, val_x, val_y = partition(global_rank, world_size,
train_x, train_y, val_x,
val_y)
- '''
- # check dataset shape correctness
- if global_rank == 0:
- print("Check the shape of dataset:")
- print(train_x.shape)
- print(train_y.shape)
- '''
if model.dimension == 4:
tx = tensor.Tensor(
@@ -207,6 +200,8 @@
model.train()
for b in range(num_train_batch):
+ # if b % 100 == 0:
+ # print ("b: \n", b)
# Generate the patch data in this iteration
x = train_x[idx[b * batch_size:(b + 1) * batch_size]]
if model.dimension == 4:
@@ -282,7 +277,7 @@
dest='precision')
parser.add_argument('-m',
'--max-epoch',
- default=10,
+ default=100,
type=int,
help='maximum epochs',
dest='max_epoch')
diff --git a/examples/demos/Classification/BloodMnist/ClassDemo.py b/examples/demos/Classification/BloodMnist/ClassDemo.py
index faf0036..a6872f8 100644
--- a/examples/demos/Classification/BloodMnist/ClassDemo.py
+++ b/examples/demos/Classification/BloodMnist/ClassDemo.py
@@ -181,13 +181,13 @@
return correct
-# define pre-processing methods (transforms)
+# Define pre-processing methods (transforms)
transforms = Compose([
ToTensor(),
Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
])
-# dataset loading
+# Dataset loading
dataset_path = "./bloodmnist"
train_path = os.path.join(dataset_path, "train")
val_path = os.path.join(dataset_path, "val")
@@ -201,12 +201,12 @@
batch_size = 256
-# model configuration
+# Model configuration for CNN
model = CNNModel(num_classes=num_class)
criterion = layer.SoftMaxCrossEntropy()
optimizer_ft = opt.Adam(lr=1e-3)
-# start training
+# Start training
dev = device.create_cpu_device()
dev.SetRandSeed(0)
np.random.seed(0)
@@ -234,10 +234,10 @@
test_correct = np.zeros(shape=[1], dtype=np.float32)
train_loss = np.zeros(shape=[1], dtype=np.float32)
- # training part
+ # Training part
model.train()
for b in tqdm(range(num_train_batch)):
- # extract batch from image list
+ # Extract batch from image list
x, y = train_dataset.batchgenerator(idx[b * batch_size:(b + 1) * batch_size],
batch_size=batch_size, data_size=(3, model.input_size, model.input_size))
x = x.astype(np_dtype['float32'])
@@ -252,7 +252,7 @@
(train_loss, train_correct /
(num_train_batch * batch_size)))
- # validation part
+ # Validation part
model.eval()
for b in tqdm(range(num_val_batch)):
x, y = train_dataset.batchgenerator(idx[b * batch_size:(b + 1) * batch_size],
diff --git a/examples/demos/Classification/BloodMnist/transforms.py b/examples/demos/Classification/BloodMnist/transforms.py
index 1375ffd..5b51117 100644
--- a/examples/demos/Classification/BloodMnist/transforms.py
+++ b/examples/demos/Classification/BloodMnist/transforms.py
@@ -84,14 +84,14 @@
if not isinstance(pic, Image.Image):
raise TypeError('pic should be PIL Image. Got {}'.format(type(pic)))
- # handle PIL Image
+ # Handle PIL Image
mode_to_nptype = {'I': np.int32, 'I;16': np.int16, 'F': np.float32}
img = np.array(pic, mode_to_nptype.get(pic.mode, np.uint8), copy=True)
if pic.mode == '1':
img = 255 * img
- # put it from HWC to CHW format
+ # Put it from HWC to CHW format
img = np.transpose(img, (2, 0, 1))
if img.dtype == np.uint8:
@@ -128,7 +128,7 @@
self.std = std
self.inplace = inplace
- def forward(self, img: np.ndarray) -> np.ndarray:
+ def forward(self, img: np.ndarray):
"""
Args:
img (Numpy ndarray): Array image to be normalized.
diff --git a/examples/largedataset_cnn/autograd/cifar10_multiprocess.py b/examples/largedataset_cnn/autograd/cifar10_multiprocess.py
new file mode 100644
index 0000000..df2dba8
--- /dev/null
+++ b/examples/largedataset_cnn/autograd/cifar10_multiprocess.py
@@ -0,0 +1,43 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+
+from resnet_cifar10 import *
+import multiprocessing
+import sys
+
+if __name__ == '__main__':
+
+ # Generate a NCCL ID to be used for collective communication
+ nccl_id = singa.NcclIdHolder()
+
+ # Configure number of GPUs to be used
+ world_size = int(sys.argv[1])
+
+ # Testing the experimental partial-parameter update asynchronous training
+ partial_update = True
+
+ process = []
+ for local_rank in range(0, world_size):
+ process.append(
+ multiprocessing.Process(target=train_cifar10,
+ args=(True, local_rank, world_size, nccl_id,
+ partial_update)))
+
+ for p in process:
+ p.start()
diff --git a/examples/largedataset_cnn/autograd/mnist_cnn.py b/examples/largedataset_cnn/autograd/mnist_cnn.py
new file mode 100644
index 0000000..9fa921c
--- /dev/null
+++ b/examples/largedataset_cnn/autograd/mnist_cnn.py
@@ -0,0 +1,304 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+
+from singa import singa_wrap as singa
+from singa import autograd
+from singa import layer
+from singa import tensor
+from singa import device
+from singa import opt
+import numpy as np
+import os
+import sys
+import gzip
+import codecs
+import time
+
+
+class CNN:
+
+ def __init__(self):
+ self.conv1 = layer.Conv2d(1, 20, 5, padding=0)
+ self.conv2 = layer.Conv2d(20, 50, 5, padding=0)
+ self.linear1 = layer.Linear(4 * 4 * 50, 500)
+ self.linear2 = layer.Linear(500, 10)
+ self.pooling1 = layer.MaxPool2d(2, 2, padding=0)
+ self.pooling2 = layer.MaxPool2d(2, 2, padding=0)
+ self.relu1 = layer.ReLU()
+ self.relu2 = layer.ReLU()
+ self.relu3 = layer.ReLU()
+ self.flatten = layer.Flatten()
+
+ def forward(self, x):
+ y = self.conv1(x)
+ y = self.relu1(y)
+ y = self.pooling1(y)
+ y = self.conv2(y)
+ y = self.relu2(y)
+ y = self.pooling2(y)
+ y = self.flatten(y)
+ y = self.linear1(y)
+ y = self.relu3(y)
+ y = self.linear2(y)
+ return y
+
+
+def check_dataset_exist(dirpath):
+ if not os.path.exists(dirpath):
+ print(
+ 'The MNIST dataset does not exist. Please download the mnist dataset using download_mnist.py (e.g. python3 download_mnist.py)'
+ )
+ sys.exit(0)
+ return dirpath
+
+
+def load_dataset():
+ train_x_path = '/tmp/train-images-idx3-ubyte.gz'
+ train_y_path = '/tmp/train-labels-idx1-ubyte.gz'
+ valid_x_path = '/tmp/t10k-images-idx3-ubyte.gz'
+ valid_y_path = '/tmp/t10k-labels-idx1-ubyte.gz'
+
+ train_x = read_image_file(check_dataset_exist(train_x_path)).astype(
+ np.float32)
+ train_y = read_label_file(check_dataset_exist(train_y_path)).astype(
+ np.float32)
+ valid_x = read_image_file(check_dataset_exist(valid_x_path)).astype(
+ np.float32)
+ valid_y = read_label_file(check_dataset_exist(valid_y_path)).astype(
+ np.float32)
+ return train_x, train_y, valid_x, valid_y
+
+
+def read_label_file(path):
+ with gzip.open(path, 'rb') as f:
+ data = f.read()
+ assert get_int(data[:4]) == 2049
+ length = get_int(data[4:8])
+ parsed = np.frombuffer(data, dtype=np.uint8, offset=8).reshape((length))
+ return parsed
+
+
+def get_int(b):
+ return int(codecs.encode(b, 'hex'), 16)
+
+
+def read_image_file(path):
+ with gzip.open(path, 'rb') as f:
+ data = f.read()
+ assert get_int(data[:4]) == 2051
+ length = get_int(data[4:8])
+ num_rows = get_int(data[8:12])
+ num_cols = get_int(data[12:16])
+ parsed = np.frombuffer(data, dtype=np.uint8, offset=16).reshape(
+ (length, 1, num_rows, num_cols))
+ return parsed
+
+
+def to_categorical(y, num_classes):
+ y = np.array(y, dtype="int")
+ n = y.shape[0]
+ categorical = np.zeros((n, num_classes))
+ categorical[np.arange(n), y] = 1
+ categorical = categorical.astype(np.float32)
+ return categorical
+
+
+def accuracy(pred, target):
+ y = np.argmax(pred, axis=1)
+ t = np.argmax(target, axis=1)
+ a = y == t
+ return np.array(a, "int").sum()
+
+
+# Function to all reduce NUMPY accuracy and loss from multiple devices
+def reduce_variable(variable, dist_opt, reducer):
+ reducer.copy_from_numpy(variable)
+ dist_opt.all_reduce(reducer.data)
+ dist_opt.wait()
+ output = tensor.to_numpy(reducer)
+ return output
+
+
+# Function to synchronize SINGA TENSOR initial model parameters
+def synchronize(tensor, dist_opt):
+ dist_opt.all_reduce(tensor.data)
+ dist_opt.wait()
+ tensor /= dist_opt.world_size
+
+
+# Data augmentation
+def augmentation(x, batch_size):
+ xpad = np.pad(x, [[0, 0], [0, 0], [4, 4], [4, 4]], 'symmetric')
+ for data_num in range(0, batch_size):
+ offset = np.random.randint(8, size=2)
+ x[data_num, :, :, :] = xpad[data_num, :, offset[0]:offset[0] + 28,
+ offset[1]:offset[1] + 28]
+ if_flip = np.random.randint(2)
+ if (if_flip):
+ x[data_num, :, :, :] = x[data_num, :, :, ::-1]
+ return x
+
+
+# Data partition
+def data_partition(dataset_x, dataset_y, global_rank, world_size):
+ data_per_rank = dataset_x.shape[0] // world_size
+ idx_start = global_rank * data_per_rank
+ idx_end = (global_rank + 1) * data_per_rank
+ return dataset_x[idx_start:idx_end], dataset_y[idx_start:idx_end]
+
+
+def train_mnist_cnn(DIST=False,
+ local_rank=None,
+ world_size=None,
+ nccl_id=None,
+ spars=0,
+ topK=False,
+ corr=True):
+
+ # Define the hyperparameters for the mnist_cnn
+ max_epoch = 10
+ batch_size = 64
+ sgd = opt.SGD(lr=0.005, momentum=0.9, weight_decay=1e-5)
+
+ # Prepare training and validation data
+ train_x, train_y, test_x, test_y = load_dataset()
+ IMG_SIZE = 28
+ num_classes = 10
+ train_y = to_categorical(train_y, num_classes)
+ test_y = to_categorical(test_y, num_classes)
+
+ # Normalization
+ train_x = train_x / 255
+ test_x = test_x / 255
+
+ if DIST:
+ # For distributed GPU training
+ sgd = opt.DistOpt(sgd,
+ nccl_id=nccl_id,
+ local_rank=local_rank,
+ world_size=world_size)
+ dev = device.create_cuda_gpu_on(sgd.local_rank)
+
+ # Dataset partition for distributed training
+ train_x, train_y = data_partition(train_x, train_y, sgd.global_rank,
+ sgd.world_size)
+ test_x, test_y = data_partition(test_x, test_y, sgd.global_rank,
+ sgd.world_size)
+ world_size = sgd.world_size
+ else:
+ # For single GPU
+ dev = device.create_cuda_gpu()
+ world_size = 1
+
+ # Create model
+ model = CNN()
+
+ tx = tensor.Tensor((batch_size, 1, IMG_SIZE, IMG_SIZE), dev, tensor.float32)
+ ty = tensor.Tensor((batch_size, num_classes), dev, tensor.int32)
+ num_train_batch = train_x.shape[0] // batch_size
+ num_test_batch = test_x.shape[0] // batch_size
+ idx = np.arange(train_x.shape[0], dtype=np.int32)
+
+ if DIST:
+ # Synchronize the initial parameters
+ autograd.training = True
+ x = np.random.randn(batch_size, 1, IMG_SIZE,
+ IMG_SIZE).astype(np.float32)
+ y = np.zeros(shape=(batch_size, num_classes), dtype=np.int32)
+ tx.copy_from_numpy(x)
+ ty.copy_from_numpy(y)
+ out = model.forward(tx)
+ loss = autograd.softmax_cross_entropy(out, ty)
+ for p, g in autograd.backward(loss):
+ synchronize(p, sgd)
+
+ # Training and evaluation loop
+ for epoch in range(max_epoch):
+ start_time = time.time()
+ np.random.shuffle(idx)
+
+ if ((DIST == False) or (sgd.global_rank == 0)):
+ print('Starting Epoch %d:' % (epoch))
+
+ # Training phase
+ autograd.training = True
+ train_correct = np.zeros(shape=[1], dtype=np.float32)
+ test_correct = np.zeros(shape=[1], dtype=np.float32)
+ train_loss = np.zeros(shape=[1], dtype=np.float32)
+
+ for b in range(num_train_batch):
+ x = train_x[idx[b * batch_size:(b + 1) * batch_size]]
+ x = augmentation(x, batch_size)
+ y = train_y[idx[b * batch_size:(b + 1) * batch_size]]
+ tx.copy_from_numpy(x)
+ ty.copy_from_numpy(y)
+ out = model.forward(tx)
+ loss = autograd.softmax_cross_entropy(out, ty)
+ train_correct += accuracy(tensor.to_numpy(out), y)
+ train_loss += tensor.to_numpy(loss)[0]
+ if DIST:
+ if (spars == 0):
+ sgd.backward_and_update(loss, threshold=50000)
+ else:
+ sgd.backward_and_sparse_update(loss,
+ spars=spars,
+ topK=topK,
+ corr=corr)
+ else:
+ sgd(loss)
+
+ if DIST:
+ # Reduce the evaluation accuracy and loss from multiple devices
+ reducer = tensor.Tensor((1,), dev, tensor.float32)
+ train_correct = reduce_variable(train_correct, sgd, reducer)
+ train_loss = reduce_variable(train_loss, sgd, reducer)
+
+ # Output the training loss and accuracy
+ if ((DIST == False) or (sgd.global_rank == 0)):
+ print('Training loss = %f, training accuracy = %f' %
+ (train_loss, train_correct /
+ (num_train_batch * batch_size * world_size)),
+ flush=True)
+
+ # Evaluation phase
+ autograd.training = False
+ for b in range(num_test_batch):
+ x = test_x[b * batch_size:(b + 1) * batch_size]
+ y = test_y[b * batch_size:(b + 1) * batch_size]
+ tx.copy_from_numpy(x)
+ ty.copy_from_numpy(y)
+ out_test = model.forward(tx)
+ test_correct += accuracy(tensor.to_numpy(out_test), y)
+
+ if DIST:
+ # Reduce the evaluation accuracy from multiple devices
+ test_correct = reduce_variable(test_correct, sgd, reducer)
+
+ # Output the evaluation accuracy
+ if ((DIST == False) or (sgd.global_rank == 0)):
+ print('Evaluation accuracy = %f, Elapsed Time = %fs' %
+ (test_correct / (num_test_batch * batch_size * world_size),
+ time.time() - start_time),
+ flush=True)
+
+
+if __name__ == '__main__':
+
+ DIST = False
+ train_mnist_cnn(DIST=DIST)
\ No newline at end of file
diff --git a/examples/largedataset_cnn/autograd/mnist_dist.py b/examples/largedataset_cnn/autograd/mnist_dist.py
new file mode 100644
index 0000000..e705c31
--- /dev/null
+++ b/examples/largedataset_cnn/autograd/mnist_dist.py
@@ -0,0 +1,25 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+
+from mnist_cnn import *
+
+if __name__ == '__main__':
+
+ DIST = True
+ train_mnist_cnn(DIST=DIST)
\ No newline at end of file
diff --git a/examples/largedataset_cnn/autograd/mnist_multiprocess.py b/examples/largedataset_cnn/autograd/mnist_multiprocess.py
new file mode 100644
index 0000000..f51344f
--- /dev/null
+++ b/examples/largedataset_cnn/autograd/mnist_multiprocess.py
@@ -0,0 +1,39 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+
+from mnist_cnn import *
+import multiprocessing
+import sys
+
+# Script entry point: start one train_mnist_cnn process per requested GPU.
+# Usage: python mnist_multiprocess.py <number of GPUs>
+if __name__ == '__main__':
+
+    # Number of GPUs to be used. It is a required positional argument, so
+    # report a usage error instead of an IndexError/ValueError traceback.
+    try:
+        world_size = int(sys.argv[1])
+    except (IndexError, ValueError):
+        sys.exit('Usage: python %s <number of GPUs>' % sys.argv[0])
+    if world_size < 1:
+        # range(0, world_size) would be empty and nothing would be trained.
+        sys.exit('The number of GPUs must be at least 1, got %d' % world_size)
+
+    # Generate a NCCL ID to be used for collective communication
+    nccl_id = singa.NcclIdHolder()
+
+    # Every process gets the same NCCL ID and world size, and its own rank.
+    process = []
+    for local_rank in range(0, world_size):
+        process.append(
+            multiprocessing.Process(target=train_mnist_cnn,
+                                    args=(True, local_rank, world_size,
+                                          nccl_id)))
+
+    for p in process:
+        p.start()
diff --git a/examples/largedataset_cnn/autograd/resnet_cifar10.py b/examples/largedataset_cnn/autograd/resnet_cifar10.py
new file mode 100644
index 0000000..a8e6efd
--- /dev/null
+++ b/examples/largedataset_cnn/autograd/resnet_cifar10.py
@@ -0,0 +1,292 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+
+try:
+ import pickle
+except ImportError:
+ import cPickle as pickle
+
+from singa import singa_wrap as singa
+from singa import autograd
+from singa import tensor
+from singa import device
+from singa import opt
+from PIL import Image
+import numpy as np
+import os
+import sys
+import time
+
+
+def load_dataset(filepath):
+    """Read one pickled CIFAR-10 batch file.
+
+    Args:
+        filepath: path of a pickle file holding a dict with a 'data' array
+            and a 'labels' sequence.
+
+    Returns:
+        A tuple (image, label). image is the 'data' entry cast to uint8 and
+        reshaped to (-1, 3, 32, 32); label is a uint8 array of shape (N, 1).
+    """
+    with open(filepath, 'rb') as batch_file:
+        try:
+            batch = pickle.load(batch_file, encoding='latin1')
+        except TypeError:
+            # Fallback for a pickle.load() that rejects the `encoding`
+            # keyword (presumably the Python 2 signature).
+            batch = pickle.load(batch_file)
+
+    pixels = batch['data'].astype(dtype=np.uint8).reshape((-1, 3, 32, 32))
+    classes = np.asarray(batch['labels'], dtype=np.uint8)
+    # One label per row, kept as a column vector.
+    return pixels, classes.reshape(classes.size, 1)
+
+
+def load_train_data(dir_path='cifar-10-batches-py', num_batches=5):
+    """Load the CIFAR-10 training batches data_batch_1 .. data_batch_N.
+
+    Args:
+        dir_path: directory that contains the `data_batch_<i>` files.
+        num_batches: how many batch files to read, starting at 1. Each file
+            is expected to hold 10000 images.
+
+    Returns:
+        A tuple (images, labels): images is float32 with shape
+        (num_batches * 10000, 3, 32, 32); labels is an int32 array built
+        from the per-file label columns.
+    """
+    batchsize = 10000
+    images = np.empty((num_batches * batchsize, 3, 32, 32), dtype=np.uint8)
+    labels = []
+    for slot, did in enumerate(range(1, num_batches + 1)):
+        # The existence check exits the program if the file is missing.
+        batch_path = check_dataset_exist(dir_path +
+                                         "/data_batch_{}".format(did))
+        image, label = load_dataset(batch_path)
+        first = slot * batchsize
+        images[first:first + batchsize] = image
+        labels.extend(label)
+    return (np.array(images, dtype=np.float32),
+            np.array(labels, dtype=np.int32))
+
+
+def load_test_data(dir_path='cifar-10-batches-py'):
+    """Load the CIFAR-10 `test_batch` file found in `dir_path`.
+
+    Returns:
+        A tuple (images, labels) converted to float32 and int32 arrays.
+    """
+    test_file = check_dataset_exist(dir_path + "/test_batch")
+    images, labels = load_dataset(test_file)
+    images = np.array(images, dtype=np.float32)
+    labels = np.array(labels, dtype=np.int32)
+    return images, labels
+
+
+def check_dataset_exist(dirpath):
+    """Return `dirpath` unchanged if it exists, otherwise abort the program.
+
+    Args:
+        dirpath: path of a dataset file or directory.
+
+    Returns:
+        `dirpath`, so the call can be nested inside a loader call.
+
+    Raises:
+        SystemExit: with status 1 when the path does not exist; the
+            download hint is written to stderr first.
+    """
+    if not os.path.exists(dirpath):
+        print(
+            'Please download the cifar10 dataset using download_data.py (e.g. python ~/singa/examples/cifar10/download_data.py py)',
+            file=sys.stderr)
+        # A missing dataset is a failure: exit with a non-zero status so
+        # shell scripts and job launchers do not treat the run as successful.
+        sys.exit(1)
+    return dirpath
+
+
+def normalize_for_resnet(train_x, test_x):
+    """Divide by 255 and standardize every channel of both sets in place.
+
+    Args:
+        train_x: float array indexed as [sample, channel, row, column] with
+            three channels; modified in place.
+        test_x: array with the same layout; modified in place.
+
+    Returns:
+        The same two array objects, (train_x, test_x), after normalization.
+    """
+    mean = [0.4914, 0.4822, 0.4465]
+    std = [0.2023, 0.1994, 0.2010]
+    train_x /= 255
+    test_x /= 255
+    # Cover every entry of mean/std. The earlier range(0, 2) stopped after
+    # channel 1 and left the third channel unstandardized.
+    for ch in range(len(mean)):
+        train_x[:, ch, :, :] -= mean[ch]
+        train_x[:, ch, :, :] /= std[ch]
+        test_x[:, ch, :, :] -= mean[ch]
+        test_x[:, ch, :, :] /= std[ch]
+    return train_x, test_x
+
+
+def resize_dataset(x, IMG_SIZE):
+    """Resize every channel plane of a batch to IMG_SIZE x IMG_SIZE.
+
+    Args:
+        x: array indexed as [sample, channel, row, column].
+        IMG_SIZE: edge length of the square output planes.
+
+    Returns:
+        A new float32 array of shape (x.shape[0], x.shape[1], IMG_SIZE,
+        IMG_SIZE); `x` itself is not modified.
+    """
+    num_data, dim = x.shape[0], x.shape[1]
+    target = (IMG_SIZE, IMG_SIZE)
+    X = np.zeros(shape=(num_data, dim, IMG_SIZE, IMG_SIZE), dtype=np.float32)
+    for n in range(num_data):
+        for d in range(dim):
+            # Each 2-D plane goes through PIL on its own, with bilinear
+            # interpolation.
+            plane = Image.fromarray(x[n, d, :, :])
+            X[n, d, :, :] = np.array(plane.resize(target, Image.BILINEAR),
+                                     dtype=np.float32)
+    return X
+
+
+def augmentation(x, batch_size):
+    """Randomly crop and mirror the first `batch_size` images of `x` in place.
+
+    Each image is padded by 4 pixels on both spatial axes (symmetric
+    padding), a 32x32 window is cut out at a random offset in [0, 8), and
+    with probability 1/2 the last axis is reversed.
+
+    Args:
+        x: array indexed as [sample, channel, row, column] holding 32x32
+            images; overwritten with the augmented images.
+        batch_size: number of leading samples to augment.
+
+    Returns:
+        The same array object `x`.
+    """
+    padded = np.pad(x, [[0, 0], [0, 0], [4, 4], [4, 4]], 'symmetric')
+    for i in range(batch_size):
+        top, left = np.random.randint(8, size=2)
+        x[i] = padded[i, :, top:top + 32, left:left + 32]
+        # Second random draw per sample: decides whether to flip.
+        if np.random.randint(2):
+            x[i] = x[i, :, :, ::-1]
+    return x
+
+
+def accuracy(pred, target):
+    """Count the rows where `pred` and `target` share the same arg-max column.
+
+    Args:
+        pred: 2-D array of per-class scores, one row per sample.
+        target: 2-D array with the same layout (e.g. one-hot labels).
+
+    Returns:
+        The number of matching rows as a numpy integer scalar (a count, not
+        a ratio).
+    """
+    predicted = np.argmax(pred, axis=1)
+    expected = np.argmax(target, axis=1)
+    return (predicted == expected).astype("int").sum()
+
+
+def to_categorical(y, num_classes):
+    """One-hot encode integer class labels.
+
+    Args:
+        y: sequence or array of class indices; both a flat vector and an
+            (n, 1) column are accepted.
+        num_classes: width of the encoding.
+
+    Returns:
+        A float32 array of shape (n, num_classes) with a 1 at each label.
+    """
+    indices = np.array(y, dtype="int")
+    one_hot = np.zeros((indices.shape[0], num_classes))
+    # Row-wise assignment keeps (n, 1) label columns working: each row's
+    # entry may itself be a length-1 array.
+    for row, cls in enumerate(indices):
+        one_hot[row, cls] = 1
+    return one_hot.astype(np.float32)
+
+
+# All-reduce a NUMPY accuracy or loss value across multiple devices
+def reduce_variable(variable, dist_opt, reducer):
+    """All-reduce a numpy value through a device tensor.
+
+    Args:
+        variable: numpy array to reduce.
+        dist_opt: distributed optimizer providing all_reduce() and wait().
+        reducer: tensor used as the communication buffer; its previous
+            content is overwritten.
+
+    Returns:
+        The content of `reducer` after the all-reduce, as a numpy array.
+    """
+    reducer.copy_from_numpy(variable)
+    dist_opt.all_reduce(reducer.data)
+    # Wait for the collective to finish before reading the buffer back.
+    dist_opt.wait()
+    return tensor.to_numpy(reducer)
+
+
+# Synchronize a SINGA TENSOR (e.g. an initial model parameter) across ranks
+def synchronize(tensor, dist_opt):
+    """All-reduce `tensor` and divide it in place by the world size.
+
+    Args:
+        tensor: tensor whose `.data` is all-reduced; updated in place.
+        dist_opt: distributed optimizer providing all_reduce(), wait() and
+            world_size.
+    """
+    dist_opt.all_reduce(tensor.data)
+    dist_opt.wait()
+    # Dividing by the number of ranks turns the reduced value into an
+    # average (assuming all_reduce sums - TODO confirm).
+    num_ranks = dist_opt.world_size
+    tensor /= num_ranks
+
+
+# Data partition
+def data_partition(dataset_x, dataset_y, global_rank, world_size):
+    """Return the contiguous shard of a dataset that belongs to one rank.
+
+    Args:
+        dataset_x: samples, sliced along the first axis.
+        dataset_y: labels, sliced with the same index range.
+        global_rank: index of the shard to return.
+        world_size: number of equally sized shards.
+
+    Returns:
+        A tuple (shard_x, shard_y). Samples beyond
+        world_size * (len(dataset_x) // world_size) belong to no shard.
+    """
+    shard_size = dataset_x.shape[0] // world_size
+    shard = slice(global_rank * shard_size, (global_rank + 1) * shard_size)
+    return dataset_x[shard], dataset_y[shard]
+
+
+def train_cifar10(DIST=False,
+ local_rank=None,
+ world_size=None,
+ nccl_id=None,
+ partial_update=False):
+ # Train resnet50 on CIFAR-10 for max_epoch epochs and print, after every
+ # epoch, the training loss/accuracy and the evaluation accuracy.
+ #
+ # Args:
+ # DIST: when true, wrap the optimizer in opt.DistOpt, shard the train
+ # and test sets by rank and all-reduce the reported metrics.
+ # local_rank, world_size, nccl_id: forwarded to opt.DistOpt; only read
+ # when DIST is true.
+ # partial_update: when true, call sgd.backward_and_partial_update
+ # instead of sgd.backward_and_update and re-synchronize the parameters
+ # before each evaluation phase.
+
+ # Define the hyperparameters for the train_cifar10
+ sgd = opt.SGD(lr=0.005, momentum=0.9, weight_decay=1e-5)
+ max_epoch = 5
+ batch_size = 32
+
+ train_x, train_y = load_train_data()
+ test_x, test_y = load_test_data()
+ train_x, test_x = normalize_for_resnet(train_x, test_x)
+ # The 32x32 images are resized to IMG_SIZE per batch (see resize_dataset
+ # calls below), not up front.
+ IMG_SIZE = 224
+ num_classes = 10
+
+ if DIST:
+ # For distributed GPU training
+ sgd = opt.DistOpt(sgd,
+ nccl_id=nccl_id,
+ local_rank=local_rank,
+ world_size=world_size)
+ dev = device.create_cuda_gpu_on(sgd.local_rank)
+
+ # Dataset partition for distributed training
+ train_x, train_y = data_partition(train_x, train_y, sgd.global_rank,
+ sgd.world_size)
+ test_x, test_y = data_partition(test_x, test_y, sgd.global_rank,
+ sgd.world_size)
+ # From here on world_size is the value reported by the optimizer, not
+ # the argument.
+ world_size = sgd.world_size
+ else:
+ # For single GPU
+ dev = device.create_cuda_gpu()
+ world_size = 1
+
+ from resnet import resnet50
+ model = resnet50(num_classes=num_classes)
+
+ # Device-side input and label buffers, refilled for every batch.
+ tx = tensor.Tensor((batch_size, 3, IMG_SIZE, IMG_SIZE), dev, tensor.float32)
+ ty = tensor.Tensor((batch_size,), dev, tensor.int32)
+ # Integer division: samples beyond the last full batch are never used.
+ num_train_batch = train_x.shape[0] // batch_size
+ num_test_batch = test_x.shape[0] // batch_size
+ idx = np.arange(train_x.shape[0], dtype=np.int32)
+
+ if DIST:
+ # Synchronize the initial parameters
+ # One forward/backward pass on random input so autograd.backward yields
+ # the parameters; each one is then passed through synchronize().
+ autograd.training = True
+ x = np.random.randn(batch_size, 3, IMG_SIZE,
+ IMG_SIZE).astype(np.float32)
+ y = np.zeros(shape=(batch_size,), dtype=np.int32)
+ tx.copy_from_numpy(x)
+ ty.copy_from_numpy(y)
+ out = model(tx)
+ loss = autograd.softmax_cross_entropy(out, ty)
+ # NOTE(review): `param` is only assigned on this DIST path, yet it is
+ # read below whenever partial_update is true.
+ param = []
+ for p, _ in autograd.backward(loss):
+ synchronize(p, sgd)
+ param.append(p)
+
+ for epoch in range(max_epoch):
+ start_time = time.time()
+ np.random.shuffle(idx)
+
+ if ((DIST == False) or (sgd.global_rank == 0)):
+ print('Starting Epoch %d:' % (epoch))
+
+ # Training phase
+ autograd.training = True
+ # Per-epoch counters, kept as 1-element float32 arrays so they can be
+ # handed to reduce_variable() in the distributed case.
+ train_correct = np.zeros(shape=[1], dtype=np.float32)
+ test_correct = np.zeros(shape=[1], dtype=np.float32)
+ train_loss = np.zeros(shape=[1], dtype=np.float32)
+
+ for b in range(num_train_batch):
+ x = train_x[idx[b * batch_size:(b + 1) * batch_size]]
+ x = augmentation(x, batch_size)
+ x = resize_dataset(x, IMG_SIZE)
+ y = train_y[idx[b * batch_size:(b + 1) * batch_size]]
+ tx.copy_from_numpy(x)
+ ty.copy_from_numpy(y)
+ out = model(tx)
+ loss = autograd.softmax_cross_entropy(out, ty)
+ train_correct += accuracy(tensor.to_numpy(out),
+ to_categorical(y, num_classes)).astype(
+ np.float32)
+ # Per-batch losses are summed; the printed value is this sum, not a
+ # mean.
+ train_loss += tensor.to_numpy(loss)[0]
+ if not partial_update:
+ sgd.backward_and_update(loss)
+ else:
+ sgd.backward_and_partial_update(loss)
+
+ if DIST:
+ # Reduce the training accuracy and loss from multiple devices
+ # `reducer` exists only on this DIST path; it is reused for
+ # test_correct further down.
+ reducer = tensor.Tensor((1,), dev, tensor.float32)
+ train_correct = reduce_variable(train_correct, sgd, reducer)
+ train_loss = reduce_variable(train_loss, sgd, reducer)
+
+ # Output the training loss and accuracy
+ if ((DIST == False) or (sgd.global_rank == 0)):
+ print('Training loss = %f, training accuracy = %f' %
+ (train_loss, train_correct /
+ (num_train_batch * batch_size * world_size)),
+ flush=True)
+
+ if partial_update:
+ # Synchronize parameters before evaluation phase
+ for p in param:
+ synchronize(p, sgd)
+
+ # Evaluation phase
+ autograd.training = False
+ for b in range(num_test_batch):
+ x = test_x[b * batch_size:(b + 1) * batch_size]
+ x = resize_dataset(x, IMG_SIZE)
+ y = test_y[b * batch_size:(b + 1) * batch_size]
+ tx.copy_from_numpy(x)
+ ty.copy_from_numpy(y)
+ out_test = model(tx)
+ test_correct += accuracy(tensor.to_numpy(out_test),
+ to_categorical(y, num_classes))
+
+ if DIST:
+ # Reduce the evaluation accuracy from multiple devices
+ test_correct = reduce_variable(test_correct, sgd, reducer)
+
+ # Output the evaluation accuracy
+ if ((DIST == False) or (sgd.global_rank == 0)):
+ print('Evaluation accuracy = %f, Elapsed Time = %fs' %
+ (test_correct / (num_test_batch * batch_size * world_size),
+ time.time() - start_time),
+ flush=True)
+
+
+# Script entry point: runs train_cifar10 on its non-distributed path (DIST
+# is false, so a single device from device.create_cuda_gpu() is used).
+if __name__ == '__main__':
+
+ DIST = False
+ train_cifar10(DIST=DIST)
diff --git a/examples/largedataset_cnn/autograd/resnet_dist.py b/examples/largedataset_cnn/autograd/resnet_dist.py
new file mode 100755
index 0000000..6f9b56c
--- /dev/null
+++ b/examples/largedataset_cnn/autograd/resnet_dist.py
@@ -0,0 +1,87 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+
+# the code is modified from
+# https://github.com/pytorch/vision/blob/master/torchvision/models/resnet.py
+
+from singa import autograd
+from singa import tensor
+from singa import device
+from singa import opt
+
+import numpy as np
+from tqdm import trange
+
+# Benchmark script: repeatedly trains resnet50 on one fixed random batch
+# through opt.DistOpt and reports throughput and per-phase timings on
+# global rank 0.
+if __name__ == "__main__":
+ sgd = opt.SGD(lr=0.1, momentum=0.9, weight_decay=1e-5)
+ sgd = opt.DistOpt(sgd)
+
+ if (sgd.global_rank == 0):
+ print("Start intialization...........", flush=True)
+
+ dev = device.create_cuda_gpu_on(sgd.local_rank)
+
+ from resnet import resnet50
+ model = resnet50()
+
+ niters = 100
+ batch_size = 32
+ IMG_SIZE = 224
+
+ # Synthetic input: the same random images and labels are used for every
+ # iteration.
+ tx = tensor.Tensor((batch_size, 3, IMG_SIZE, IMG_SIZE), dev)
+ ty = tensor.Tensor((batch_size,), dev, tensor.int32)
+ autograd.training = True
+ x = np.random.randn(batch_size, 3, IMG_SIZE, IMG_SIZE).astype(np.float32)
+ # Labels are drawn from [0, 1000) - presumably matching resnet50()'s
+ # default number of classes; TODO confirm.
+ y = np.random.randint(0, 1000, batch_size, dtype=np.int32)
+ tx.copy_from_numpy(x)
+ ty.copy_from_numpy(y)
+
+ import time
+
+ dev.Sync()
+ start = time.time()
+ # Accumulated seconds spent in the forward pass and in the loss.
+ fd = 0
+ softmax = 0
+ # NOTE(review): `update` is never read or modified below.
+ update = 0
+ with trange(niters) as t:
+ for _ in t:
+ # dev.Sync() before each timestamp so the measured interval covers
+ # the work actually finished on the device.
+ dev.Sync()
+ tick = time.time()
+ x = model(tx)
+ dev.Sync()
+ fd += time.time() - tick
+ tick = time.time()
+ loss = autograd.softmax_cross_entropy(x, ty)
+ dev.Sync()
+ softmax += time.time() - tick
+ sgd.backward_and_update(loss)
+
+ dev.Sync()
+ end = time.time()
+ # Samples processed per second, summed over all ranks.
+ throughput = float(sgd.world_size * niters * batch_size) / (end - start)
+ titer = (end - start) / float(niters)
+ tforward = float(fd) / float(niters)
+ tsoftmax = float(softmax) / float(niters)
+ # Backward time is the remainder of an iteration, so it also contains
+ # the optimizer update and any per-iteration overhead.
+ tbackward = titer - tforward - tsoftmax
+
+ if (sgd.global_rank == 0):
+ print("\nThroughput = {} per second".format(throughput), flush=True)
+ print("Total={}, forward={}, softmax={}, backward={}".format(
+ titer, tforward, tsoftmax, tbackward),
+ flush=True)
diff --git a/examples/largedataset_cnn/autograd/sparsification_mnist.py b/examples/largedataset_cnn/autograd/sparsification_mnist.py
new file mode 100644
index 0000000..315605a
--- /dev/null
+++ b/examples/largedataset_cnn/autograd/sparsification_mnist.py
@@ -0,0 +1,45 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+
+from mnist_cnn import *
+import multiprocessing
+import sys
+
+# Script entry point: start one train_mnist_cnn process per requested GPU,
+# with gradient sparsification enabled.
+# Usage: python sparsification_mnist.py <number of GPUs>
+if __name__ == '__main__':
+
+    # Number of GPUs to be used. It is a required positional argument, so
+    # report a usage error instead of an IndexError/ValueError traceback.
+    try:
+        world_size = int(sys.argv[1])
+    except (IndexError, ValueError):
+        sys.exit('Usage: python %s <number of GPUs>' % sys.argv[0])
+    if world_size < 1:
+        # range(0, world_size) would be empty and nothing would be trained.
+        sys.exit('The number of GPUs must be at least 1, got %d' % world_size)
+
+    # Generate a NCCL ID to be used for collective communication
+    nccl_id = singa.NcclIdHolder()
+
+    # Use sparsification with parameters
+    topK = False  # When topK = False, Sparsification based on a constant absolute threshold
+    corr = True  # If True, uses local accumulate gradient for the correction
+    sparsThreshold = 0.05  # The constant absolute threshold for sparsification
+
+    # The sparsification settings are passed positionally after the NCCL ID;
+    # their order must match train_mnist_cnn's signature.
+    process = []
+    for local_rank in range(0, world_size):
+        process.append(
+            multiprocessing.Process(target=train_mnist_cnn,
+                                    args=(True, local_rank, world_size, nccl_id,
+                                          sparsThreshold, topK, corr)))
+
+    for p in process:
+        p.start()
diff --git a/examples/largedataset_cnn/autograd/xceptionnet.py b/examples/largedataset_cnn/autograd/xceptionnet.py
new file mode 100644
index 0000000..ce28640
--- /dev/null
+++ b/examples/largedataset_cnn/autograd/xceptionnet.py
@@ -0,0 +1,302 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# =============================================================================
+
+from singa import autograd
+from singa import tensor
+from singa import device
+from singa import layer
+from singa import opt
+
+import numpy as np
+from tqdm import trange
+
+# the code is modified from
+# https://github.com/Cadene/pretrained-models.pytorch/blob/master/pretrainedmodels/models/xception.py
+
+
+class Block(layer.Layer):
+    """Xception building block: a stack of ReLU / separable-conv / batch-norm
+    units whose output is added to a skip path.
+
+    Args:
+        in_filters: number of input channels.
+        out_filters: number of output channels.
+        reps: number of separable-conv units in the main path.
+        strides: when not 1, a MaxPool2d with this stride ends the main path
+            and the skip path uses a strided 1x1 convolution.
+        padding: padding of the skip convolution; the pooling layer uses
+            padding + 1.
+        start_with_relu: when false, the leading ReLU of the main path is
+            dropped.
+        grow_first: when true the channel change happens in the first unit,
+            otherwise in the last one.
+    """
+
+    def __init__(self,
+                 in_filters,
+                 out_filters,
+                 reps,
+                 strides=1,
+                 padding=0,
+                 start_with_relu=True,
+                 grow_first=True):
+        super(Block, self).__init__()
+
+        # A projection (1x1 conv + BN) is needed on the skip path whenever
+        # the main path changes the channel count or the resolution.
+        needs_projection = out_filters != in_filters or strides != 1
+        if needs_projection:
+            self.skip = layer.Conv2d(in_filters,
+                                     out_filters,
+                                     1,
+                                     stride=strides,
+                                     padding=padding,
+                                     bias=False)
+            self.skipbn = layer.BatchNorm2d(out_filters)
+        else:
+            self.skip = None
+
+        def sep_unit(n_in, n_out):
+            # One ReLU -> 3x3 separable conv -> batch-norm unit.
+            return [
+                layer.ReLU(),
+                layer.SeparableConv2d(n_in,
+                                      n_out,
+                                      3,
+                                      stride=1,
+                                      padding=1,
+                                      bias=False),
+                layer.BatchNorm2d(n_out),
+            ]
+
+        self.layers = []
+        filters = in_filters
+        if grow_first:
+            self.layers.extend(sep_unit(in_filters, out_filters))
+            filters = out_filters
+
+        for _ in range(reps - 1):
+            self.layers.extend(sep_unit(filters, filters))
+
+        if not grow_first:
+            self.layers.extend(sep_unit(in_filters, out_filters))
+
+        if start_with_relu:
+            self.layers[0] = layer.ReLU()
+        else:
+            # Drop the leading ReLU of the first unit.
+            self.layers = self.layers[1:]
+
+        if strides != 1:
+            self.layers.append(layer.MaxPool2d(3, strides, padding + 1))
+
+        self.register_layers(*self.layers)
+
+        self.add = layer.Add()
+
+    def forward(self, x):
+        """Run the main path on `x` and add the skip path to its output."""
+        y = self.layers[0](x)
+        for op in self.layers[1:]:
+            # Some layers may return a tuple; only its first element is fed
+            # to the next layer.
+            if isinstance(y, tuple):
+                y = y[0]
+            y = op(y)
+
+        if self.skip is None:
+            skip = x
+        else:
+            skip = self.skipbn(self.skip(x))
+        return self.add(y, skip)
+
+
+# Names exported by `from xceptionnet import *`: only the Xception model.
+__all__ = ['Xception']
+
+
+class Xception(layer.Layer):
+    """
+    Xception optimized for the ImageNet dataset, as specified in
+    https://arxiv.org/pdf/1610.02357.pdf
+
+    The network is a two-convolution stem, twelve `Block`s (block1 ..
+    block12), two separable convolutions and a pooled linear classifier.
+    """
+
+    def __init__(self, num_classes=1000):
+        """ Constructor
+        Args:
+            num_classes: number of classes
+        """
+        super(Xception, self).__init__()
+        self.num_classes = num_classes
+
+        # Stem: two plain convolutions, each followed by BN and ReLU.
+        self.conv1 = layer.Conv2d(3, 32, 3, 2, 0, bias=False)
+        self.bn1 = layer.BatchNorm2d(32)
+        self.relu1 = layer.ReLU()
+
+        self.conv2 = layer.Conv2d(32, 64, 3, 1, 1, bias=False)
+        self.bn2 = layer.BatchNorm2d(64)
+        self.relu2 = layer.ReLU()
+
+        # Blocks 1-3 widen the features 64 -> 128 -> 256 -> 728, each with
+        # two units and stride 2. Only block1 skips its leading ReLU.
+        self.block1 = Block(64, 128, 2, 2, padding=0,
+                            start_with_relu=False, grow_first=True)
+        self.block2 = Block(128, 256, 2, 2, padding=0,
+                            start_with_relu=True, grow_first=True)
+        self.block3 = Block(256, 728, 2, 2, padding=0,
+                            start_with_relu=True, grow_first=True)
+
+        # Blocks 4-11 are eight identical 728-channel, stride-1 blocks with
+        # three units each; they are created in ascending order as the
+        # attributes block4 .. block11.
+        for index in range(4, 12):
+            setattr(
+                self, 'block%d' % index,
+                Block(728, 728, 3, 1, start_with_relu=True, grow_first=True))
+
+        # Block 12 widens to 1024 channels in its last unit.
+        self.block12 = Block(728, 1024, 2, 2,
+                             start_with_relu=True, grow_first=False)
+
+        self.conv3 = layer.SeparableConv2d(1024, 1536, 3, 1, 1)
+        self.bn3 = layer.BatchNorm2d(1536)
+        self.relu3 = layer.ReLU()
+
+        self.conv4 = layer.SeparableConv2d(1536, 2048, 3, 1, 1)
+        self.bn4 = layer.BatchNorm2d(2048)
+
+        # Classifier head, applied by logits().
+        self.relu4 = layer.ReLU()
+        self.globalpooling = layer.MaxPool2d(10, 1)
+        self.flatten = layer.Flatten()
+        self.fc = layer.Linear(2048, num_classes)
+
+    def features(self, input):
+        """Return the feature map produced by everything up to bn4."""
+        x = self.relu1(self.bn1(self.conv1(input)))
+        x = self.relu2(self.bn2(self.conv2(x)))
+
+        # block1 .. block12, applied in order.
+        for index in range(1, 13):
+            x = getattr(self, 'block%d' % index)(x)
+
+        x = self.relu3(self.bn3(self.conv3(x)))
+        return self.bn4(self.conv4(x))
+
+    def logits(self, features):
+        """Map a feature map from features() to per-class scores."""
+        pooled = self.globalpooling(self.relu4(features))
+        return self.fc(self.flatten(pooled))
+
+    def forward(self, input):
+        """Full forward pass: features() followed by logits()."""
+        return self.logits(self.features(input))
+
+
+# Smoke-test script: builds Xception and runs niters SGD steps on one fixed
+# random batch on GPU 0.
+if __name__ == '__main__':
+ model = Xception(num_classes=1000)
+ print('Start initialization............')
+ dev = device.create_cuda_gpu_on(0)
+
+ niters = 20
+ batch_size = 16
+ IMG_SIZE = 299
+ sgd = opt.SGD(lr=0.1, momentum=0.9, weight_decay=1e-5)
+
+ # Synthetic input: random images and random labels in [0, 1000), copied
+ # to the device once and reused for every iteration.
+ tx = tensor.Tensor((batch_size, 3, IMG_SIZE, IMG_SIZE), dev)
+ ty = tensor.Tensor((batch_size,), dev, tensor.int32)
+ autograd.training = True
+ x = np.random.randn(batch_size, 3, IMG_SIZE, IMG_SIZE).astype(np.float32)
+ y = np.random.randint(0, 1000, batch_size, dtype=np.int32)
+ tx.copy_from_numpy(x)
+ ty.copy_from_numpy(y)
+
+ with trange(niters) as t:
+ for _ in t:
+ # `x` is rebound to the model output here; the input stays in tx.
+ x = model(tx)
+ loss = autograd.softmax_cross_entropy(x, ty)
+ # Calling the optimizer with the loss performs the training step
+ # (presumably backward pass plus parameter update - TODO confirm).
+ sgd(loss)
diff --git a/examples/largedataset_cnn/train_largedata.py b/examples/largedataset_cnn/train_largedata.py
index 6dfd548..a6b7704 100755
--- a/examples/largedataset_cnn/train_largedata.py
+++ b/examples/largedataset_cnn/train_largedata.py
@@ -142,13 +142,6 @@
train_x, train_y, val_x, val_y = partition(global_rank, world_size,
train_x, train_y, val_x,
val_y)
- '''
- # Check dataset shape correctness
- if global_rank == 0:
- print("Check the shape of dataset:")
- print(train_x.shape)
- print(train_y.shape)
- '''
if model.dimension == 4:
tx = tensor.Tensor(
diff --git a/examples/singa_easy/singa_easy/__init__.py b/examples/singa_easy/singa_easy/__init__.py
new file mode 100644
index 0000000..b6cab62
--- /dev/null
+++ b/examples/singa_easy/singa_easy/__init__.py
@@ -0,0 +1,20 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+
+#!/usr/bin/env python
\ No newline at end of file
diff --git a/examples/singa_easy/singa_easy/modules/__init__.py b/examples/singa_easy/singa_easy/modules/__init__.py
new file mode 100644
index 0000000..752cd17
--- /dev/null
+++ b/examples/singa_easy/singa_easy/modules/__init__.py
@@ -0,0 +1,21 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+
+#!/usr/bin/env python
+
diff --git a/examples/singa_easy/singa_easy/modules/explanations/__init__.py b/examples/singa_easy/singa_easy/modules/explanations/__init__.py
new file mode 100644
index 0000000..b6cab62
--- /dev/null
+++ b/examples/singa_easy/singa_easy/modules/explanations/__init__.py
@@ -0,0 +1,20 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+
+#!/usr/bin/env python
\ No newline at end of file
diff --git a/examples/singa_easy/singa_easy/modules/explanations/lime/__init__.py b/examples/singa_easy/singa_easy/modules/explanations/lime/__init__.py
index fe95886..b6cab62 100644
--- a/examples/singa_easy/singa_easy/modules/explanations/lime/__init__.py
+++ b/examples/singa_easy/singa_easy/modules/explanations/lime/__init__.py
@@ -16,3 +16,5 @@
# specific language governing permissions and limitations
# under the License.
#
+
+#!/usr/bin/env python
\ No newline at end of file