Merge pull request #1048 from apache/dev

Dev
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 29043ab..9f49efa 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -29,10 +29,10 @@
 #string(REGEX REPLACE "^[0-9]+\\.[0-9]+\\.([0-9]+).*" "\\1" VERSION_PATCH "${VERSION}")
 
 
-SET(PACKAGE_VERSION 3.3.0) # ${VERSION})
-SET(VERSION 3.3.0)
-SET(SINGA_MAJOR_VERSION 3)
-SET(SINGA_MINOR_VERSION 3)
+SET(PACKAGE_VERSION 4.0.0) # ${VERSION})
+SET(VERSION 4.0.0)
+SET(SINGA_MAJOR_VERSION 4)
+SET(SINGA_MINOR_VERSION 0)
 SET(SINGA_PATCH_VERSION 0)
 #SET(SINGA_MAJOR_VERSION ${VERSION_MAJOR})  # 0 -
 #SET(SINGA_MINOR_VERSION ${VERSION_MINOR})  # 0 - 9
diff --git a/RELEASE_NOTES b/RELEASE_NOTES
index d426ac2..bbe6772 100644
--- a/RELEASE_NOTES
+++ b/RELEASE_NOTES
@@ -1,3 +1,40 @@
+Release Notes - SINGA - Version singa-4.0.0
+
+SINGA is a distributed deep learning library.
+
+This release includes the following changes:
+
+  * Enhance distributed training
+    * Add support for configuring the number of GPUs to be used.
+    * Increase max epoch for better convergence.
+    * Print intermediate mini-batch information.
+    * Add support for switching between CPU and GPU devices.
+
+  * Enhance example code
+    * Update the arguments of the normalize forward function in the transforms of the BloodMnist example.
+    * Update the xceptionnet in the cnn example.
+    * Add arguments for weight decay, momentum and learning rates in the cnn example.
+    * Add training scripts for more datasets and model types in the cnn example.
+    * Add a resnet dist version for the large dataset cnn example.
+    * Add a cifar10 multi-process version for the large dataset cnn example.
+    * Add sparsification implementation for mnist in the large dataset cnn example.
+    * Update the cifar dataset downloads to use local directories.
+    * Extend the cifar dataset load functions to support customized directories.
+
+  * Enhance the webpage
+    * Update online documentation for distributed training.
+
+  * Promote code quality
+    * Update inline comments for preprocessing and data loading.
+
+  * Update the PIL image module
+
+  * Update the runtime Dockerfile
+
+  * Update the conda files
+
+----------------------------------------------------------------------------------------------
+
 Release Notes - SINGA - Version singa-3.3.0
 
 SINGA is a distributed deep learning library.
diff --git a/examples/cifar_distributed_cnn/autograd/cifar10_multiprocess.py b/examples/cifar_distributed_cnn/autograd/cifar10_multiprocess.py
old mode 100755
new mode 100644
index b5e51ad..df2dba8
--- a/examples/cifar_distributed_cnn/autograd/cifar10_multiprocess.py
+++ b/examples/cifar_distributed_cnn/autograd/cifar10_multiprocess.py
@@ -26,7 +26,7 @@
     # Generate a NCCL ID to be used for collective communication
     nccl_id = singa.NcclIdHolder()
 
-    # number of GPUs to be used
+    # Configure number of GPUs to be used
     world_size = int(sys.argv[1])
 
     # Testing the experimental partial-parameter update asynchronous training
diff --git a/examples/cifar_distributed_cnn/autograd/resnet_cifar10.py b/examples/cifar_distributed_cnn/autograd/resnet_cifar10.py
index 0d6379b..a8e6efd 100644
--- a/examples/cifar_distributed_cnn/autograd/resnet_cifar10.py
+++ b/examples/cifar_distributed_cnn/autograd/resnet_cifar10.py
@@ -138,7 +138,7 @@
     return output
 
 
-# Function to sychronize SINGA TENSOR initial model parameters
+# Function to synchronize SINGA TENSOR initial model parameters
 def synchronize(tensor, dist_opt):
     dist_opt.all_reduce(tensor.data)
     dist_opt.wait()
@@ -159,7 +159,7 @@
                   nccl_id=None,
                   partial_update=False):
 
-    # Define the hypermeters for the train_cifar10
+    # Define the hyperparameters for the train_cifar10
     sgd = opt.SGD(lr=0.005, momentum=0.9, weight_decay=1e-5)
     max_epoch = 5
     batch_size = 32
@@ -199,7 +199,7 @@
     idx = np.arange(train_x.shape[0], dtype=np.int32)
 
     if DIST:
-        #Sychronize the initial parameters
+        # Synchronize the initial parameters
         autograd.training = True
         x = np.random.randn(batch_size, 3, IMG_SIZE,
                             IMG_SIZE).astype(np.float32)
@@ -220,7 +220,7 @@
         if ((DIST == False) or (sgd.global_rank == 0)):
             print('Starting Epoch %d:' % (epoch))
 
-        #Training phase
+        # Training phase
         autograd.training = True
         train_correct = np.zeros(shape=[1], dtype=np.float32)
         test_correct = np.zeros(shape=[1], dtype=np.float32)
@@ -258,11 +258,11 @@
                   flush=True)
 
         if partial_update:
-            # Sychronize parameters before evaluation phase
+            # Synchronize parameters before evaluation phase
             for p in param:
                 synchronize(p, sgd)
 
-        #Evaulation phase
+        # Evaluation phase
         autograd.training = False
         for b in range(num_test_batch):
             x = test_x[b * batch_size:(b + 1) * batch_size]
@@ -275,7 +275,7 @@
                                      to_categorical(y, num_classes))
 
         if DIST:
-            # Reduce the evaulation accuracy from multiple devices
+            # Reduce the evaluation accuracy from multiple devices
             test_correct = reduce_variable(test_correct, sgd, reducer)
 
         # Output the evaluation accuracy
diff --git a/examples/cifar_distributed_cnn/autograd/xceptionnet.py b/examples/cifar_distributed_cnn/autograd/xceptionnet.py
index 357e47d..ce28640 100644
--- a/examples/cifar_distributed_cnn/autograd/xceptionnet.py
+++ b/examples/cifar_distributed_cnn/autograd/xceptionnet.py
@@ -140,7 +140,7 @@
         self.conv2 = layer.Conv2d(32, 64, 3, 1, 1, bias=False)
         self.bn2 = layer.BatchNorm2d(64)
         self.relu2 = layer.ReLU()
-        # do relu here
+        # Relu Layer
 
         self.block1 = Block(64,
                             128,
@@ -225,7 +225,7 @@
         self.bn3 = layer.BatchNorm2d(1536)
         self.relu3 = layer.ReLU()
 
-        # do relu here
+        # Relu Layer
         self.conv4 = layer.SeparableConv2d(1536, 2048, 3, 1, 1)
         self.bn4 = layer.BatchNorm2d(2048)
 
@@ -279,9 +279,8 @@
 
 if __name__ == '__main__':
     model = Xception(num_classes=1000)
-    print('Start intialization............')
+    print('Start initialization............')
     dev = device.create_cuda_gpu_on(0)
-    #dev = device.create_cuda_gpu()
 
     niters = 20
     batch_size = 16
diff --git a/examples/cifar_distributed_cnn/benchmark.py b/examples/cifar_distributed_cnn/benchmark.py
index 9156927..997d5a4 100644
--- a/examples/cifar_distributed_cnn/benchmark.py
+++ b/examples/cifar_distributed_cnn/benchmark.py
@@ -42,7 +42,7 @@
 
     # For distributed training, sequential has better throughput in the current version
     if DIST == True:
-        sgd = opt.DistOpt(sgd)
+        sgd = opt.DistOpt(sgd)  # Need to make sure DistOpt is working for multiple GPUs/nodes
         world_size = sgd.world_size
         local_rank = sgd.local_rank
         global_rank = sgd.global_rank
diff --git a/examples/cifar_distributed_cnn/data/cifar10.py b/examples/cifar_distributed_cnn/data/cifar10.py
index 3b83ad7..1f57d03 100644
--- a/examples/cifar_distributed_cnn/data/cifar10.py
+++ b/examples/cifar_distributed_cnn/data/cifar10.py
@@ -82,7 +82,7 @@
         val_x[:, ch, :, :] /= std[ch]
     return train_x, val_x
 
-def load():
+def load():  # Need to pass in the path for loading training data
     train_x, train_y = load_train_data()
     val_x, val_y = load_test_data()
     train_x, val_x = normalize(train_x, val_x)
diff --git a/examples/cifar_distributed_cnn/data/download_mnist.py b/examples/cifar_distributed_cnn/data/download_mnist.py
index 65acb0e..18d71b9 100644
--- a/examples/cifar_distributed_cnn/data/download_mnist.py
+++ b/examples/cifar_distributed_cnn/data/download_mnist.py
@@ -23,7 +23,7 @@
 
 def check_exist_or_download(url):
 
-    download_dir = '/tmp/'
+    download_dir = '/tmp/'  # downloaded to the /tmp/ folder
     name = url.rsplit('/', 1)[-1]
     filename = os.path.join(download_dir, name)
 
@@ -36,14 +36,14 @@
 
 if __name__ == '__main__':
 
-    #url of the mnist dataset
+    # List urls of the mnist dataset
     train_x_url = 'http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz'
     train_y_url = 'http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz'
     valid_x_url = 'http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz'
     valid_y_url = 'http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz'
 
-    #download the mnist dataset
+    # Download the mnist dataset
     check_exist_or_download(train_x_url)
     check_exist_or_download(train_y_url)
     check_exist_or_download(valid_x_url)
-    check_exist_or_download(valid_y_url)
+    check_exist_or_download(valid_y_url)
\ No newline at end of file
diff --git a/examples/cifar_distributed_cnn/run-rtx.sh b/examples/cifar_distributed_cnn/run-rtx.sh
index 9b73653..2f4262c 100755
--- a/examples/cifar_distributed_cnn/run-rtx.sh
+++ b/examples/cifar_distributed_cnn/run-rtx.sh
@@ -20,7 +20,19 @@
 #!/usr/bin/env python -W ignore::DeprecationWarning
 
 # resnet
+mpiexec -np 8 python train_mpi.py resnet mnist -l 0.015 -b 32
 mpiexec -np 8 python train_mpi.py resnet cifar10 -l 0.015 -b 32
+mpiexec -np 8 python train_mpi.py resnet cifar100 -l 0.015 -b 32
 
 # cnn
+mpiexec -np 8 python train_mpi.py cnn mnist -l 0.015 -b 32
 mpiexec -np 8 python train_mpi.py cnn cifar10 -l 0.015 -b 32
+mpiexec -np 8 python train_mpi.py cnn cifar100 -l 0.015 -b 32
+
+# mlp 
+mpiexec -np 8 python train_mpi.py mlp cifar10 -l 0.015 -b 32
+mpiexec -np 8 python train_mpi.py mlp cifar100 -l 0.015 -b 32
+
+# alexnet 
+mpiexec -np 8 python train_mpi.py alexnet cifar10 -l 0.015 -b 32
+mpiexec -np 8 python train_mpi.py alexnet cifar100 -l 0.015 -b 32
diff --git a/examples/cifar_distributed_cnn/train_cnn.py b/examples/cifar_distributed_cnn/train_cnn.py
old mode 100755
new mode 100644
index d102623..eb0fbf3
--- a/examples/cifar_distributed_cnn/train_cnn.py
+++ b/examples/cifar_distributed_cnn/train_cnn.py
@@ -111,7 +111,7 @@
         dist_option='plain',
         spars=None,
         precision='float32'):
-    dev = device.create_cuda_gpu_on(local_rank)
+    dev = device.create_cuda_gpu_on(local_rank)  # need to change to CPU device for CPU-only machines
     dev.SetRandSeed(0)
     np.random.seed(0)
 
@@ -224,6 +224,8 @@
     start = time.time()
 
     for b in range(niters):
+        # if b % 100 == 0:
+        #     print ("b: \n", b)
         # Generate the patch data in this iteration
         # Train the model
         model(tx, ty, dist_option, spars)
@@ -265,7 +267,7 @@
                         dest='precision')
     parser.add_argument('-m',
                         '--max-epoch',
-                        default=10,
+                        default=20,
                         type=int,
                         help='maximum epochs',
                         dest='max_epoch')
diff --git a/examples/cnn/autograd/cifar10_multiprocess.py b/examples/cnn/autograd/cifar10_multiprocess.py
index b5e51ad..4b3cb0f 100644
--- a/examples/cnn/autograd/cifar10_multiprocess.py
+++ b/examples/cnn/autograd/cifar10_multiprocess.py
@@ -26,7 +26,7 @@
     # Generate a NCCL ID to be used for collective communication
     nccl_id = singa.NcclIdHolder()
 
-    # number of GPUs to be used
+    # Configure the number of GPUs to be used
     world_size = int(sys.argv[1])
 
     # Testing the experimental partial-parameter update asynchronous training
@@ -40,4 +40,4 @@
                                           partial_update)))
 
     for p in process:
-        p.start()
+        p.start()
\ No newline at end of file
diff --git a/examples/cnn/autograd/resnet_cifar10.py b/examples/cnn/autograd/resnet_cifar10.py
index 3c6876f..7541736 100644
--- a/examples/cnn/autograd/resnet_cifar10.py
+++ b/examples/cnn/autograd/resnet_cifar10.py
@@ -199,7 +199,7 @@
     idx = np.arange(train_x.shape[0], dtype=np.int32)
 
     if DIST:
-        #Sychronize the initial parameters
+        # Synchronize the initial parameters
         autograd.training = True
         x = np.random.randn(batch_size, 3, IMG_SIZE,
                             IMG_SIZE).astype(np.float32)
@@ -220,7 +220,7 @@
         if ((DIST == False) or (sgd.global_rank == 0)):
             print('Starting Epoch %d:' % (epoch))
 
-        #Training phase
+        # Training phase
         autograd.training = True
         train_correct = np.zeros(shape=[1], dtype=np.float32)
         test_correct = np.zeros(shape=[1], dtype=np.float32)
@@ -262,7 +262,7 @@
             for p in param:
                 synchronize(p, sgd)
 
-        #Evaulation phase
+        # Evaluation phase
         autograd.training = False
         for b in range(num_test_batch):
             x = test_x[b * batch_size:(b + 1) * batch_size]
diff --git a/examples/cnn/autograd/xceptionnet.py b/examples/cnn/autograd/xceptionnet.py
index 357e47d..8fb23d8 100644
--- a/examples/cnn/autograd/xceptionnet.py
+++ b/examples/cnn/autograd/xceptionnet.py
@@ -225,7 +225,7 @@
         self.bn3 = layer.BatchNorm2d(1536)
         self.relu3 = layer.ReLU()
 
-        # do relu here
+        # Relu Layer
         self.conv4 = layer.SeparableConv2d(1536, 2048, 3, 1, 1)
         self.bn4 = layer.BatchNorm2d(2048)
 
diff --git a/examples/cnn/data/cifar10.py b/examples/cnn/data/cifar10.py
index d61d84e..5caaf30 100644
--- a/examples/cnn/data/cifar10.py
+++ b/examples/cnn/data/cifar10.py
@@ -40,7 +40,7 @@
     return image, label
 
 
-def load_train_data(dir_path='/tmp/cifar-10-batches-py', num_batches=5):
+def load_train_data(dir_path='/tmp/cifar-10-batches-py', num_batches=5):  # need to save to specific local directories
     labels = []
     batchsize = 10000
     images = np.empty((num_batches * batchsize, 3, 32, 32), dtype=np.uint8)
@@ -54,7 +54,7 @@
     return images, labels
 
 
-def load_test_data(dir_path='/tmp/cifar-10-batches-py'):
+def load_test_data(dir_path='/tmp/cifar-10-batches-py'):  # need to save to specific local directories
     images, labels = load_dataset(check_dataset_exist(dir_path + "/test_batch"))
     return np.array(images, dtype=np.float32), np.array(labels, dtype=np.int32)
 
diff --git a/examples/cnn/data/download_cifar10.py b/examples/cnn/data/download_cifar10.py
index a010b2e..8e44679 100755
--- a/examples/cnn/data/download_cifar10.py
+++ b/examples/cnn/data/download_cifar10.py
@@ -30,7 +30,7 @@
     if os.path.exists(filepath):
         print('The tar file does exist. Extracting it now..')
         with tarfile.open(filepath, 'r') as f:
-            f.extractall('/tmp/')
+            f.extractall('/tmp/')  # need to specify a local directory
         print('Finished!')
         sys.exit(0)
 
@@ -43,7 +43,7 @@
 
 
 if __name__ == '__main__':
-    dirpath = '/tmp/'
+    dirpath = '/tmp/'  # need to specify a local directory
     gzfile = dirpath + 'cifar-10-python.tar.gz'
     url = 'http://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz'
     do_download(dirpath, gzfile, url)
diff --git a/examples/cnn/data/download_cifar100.py b/examples/cnn/data/download_cifar100.py
index 59f9d23..5f1e21b 100755
--- a/examples/cnn/data/download_cifar100.py
+++ b/examples/cnn/data/download_cifar100.py
@@ -20,7 +20,7 @@
 from download_cifar10 import do_download
 
 if __name__ == '__main__':
-    dirpath = '/tmp/'
+    dirpath = '/tmp/'  # need to specify a local directory
     gzfile = dirpath + 'cifar-100-python.tar.gz'
     url = 'http://www.cs.toronto.edu/~kriz/cifar-100-python.tar.gz'
     do_download(dirpath, gzfile, url)
diff --git a/examples/cnn/data/mnist.py b/examples/cnn/data/mnist.py
index 9cd1a84..b25bf5e 100644
--- a/examples/cnn/data/mnist.py
+++ b/examples/cnn/data/mnist.py
@@ -34,10 +34,10 @@
 
 
 def load_dataset():
-    train_x_path = '/tmp/train-images-idx3-ubyte.gz'
-    train_y_path = '/tmp/train-labels-idx1-ubyte.gz'
-    valid_x_path = '/tmp/t10k-images-idx3-ubyte.gz'
-    valid_y_path = '/tmp/t10k-labels-idx1-ubyte.gz'
+    train_x_path = '/tmp/train-images-idx3-ubyte.gz'  # need to change to local disk
+    train_y_path = '/tmp/train-labels-idx1-ubyte.gz'  # need to change to local disk
+    valid_x_path = '/tmp/t10k-images-idx3-ubyte.gz'  # need to change to local disk
+    valid_y_path = '/tmp/t10k-labels-idx1-ubyte.gz'  # need to change to local disk
 
     train_x = read_image_file(check_dataset_exist(train_x_path)).astype(
         np.float32)
diff --git a/examples/cnn/run.sh b/examples/cnn/run.sh
new file mode 100644
index 0000000..9f1c4aa
--- /dev/null
+++ b/examples/cnn/run.sh
@@ -0,0 +1,36 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+
+### mnist
+python train_cnn.py mlp mnist
+python train_cnn.py cnn mnist
+python train_cnn.py resnet mnist
+python train_cnn.py alexnet mnist
+
+### cifar10
+python train_cnn.py mlp cifar10
+python train_cnn.py cnn cifar10
+python train_cnn.py resnet cifar10
+python train_cnn.py alexnet cifar10
+
+### cifar100
+python train_cnn.py mlp cifar100
+python train_cnn.py cnn cifar100
+python train_cnn.py resnet cifar100
+python train_cnn.py alexnet cifar100
diff --git a/examples/cnn/train_cnn.py b/examples/cnn/train_cnn.py
index bcccc51..5c06470 100644
--- a/examples/cnn/train_cnn.py
+++ b/examples/cnn/train_cnn.py
@@ -107,7 +107,7 @@
         dist_option='plain',
         spars=None,
         precision='float32'):
-    dev = device.create_cuda_gpu_on(local_rank)
+    dev = device.create_cuda_gpu_on(local_rank)  # need to change to CPU device for CPU-only machines
     dev.SetRandSeed(0)
     np.random.seed(0)
 
@@ -165,13 +165,6 @@
         train_x, train_y, val_x, val_y = partition(global_rank, world_size,
                                                    train_x, train_y, val_x,
                                                    val_y)
-    '''
-    # check dataset shape correctness
-    if global_rank == 0:
-        print("Check the shape of dataset:")
-        print(train_x.shape)
-        print(train_y.shape)
-    '''
 
     if model.dimension == 4:
         tx = tensor.Tensor(
@@ -207,6 +200,8 @@
 
         model.train()
         for b in range(num_train_batch):
+            # if b % 100 == 0:
+            #     print ("b: \n", b)
             # Generate the patch data in this iteration
             x = train_x[idx[b * batch_size:(b + 1) * batch_size]]
             if model.dimension == 4:
@@ -282,7 +277,7 @@
                         dest='precision')
     parser.add_argument('-m',
                         '--max-epoch',
-                        default=10,
+                        default=100,
                         type=int,
                         help='maximum epochs',
                         dest='max_epoch')
diff --git a/examples/demos/Classification/BloodMnist/ClassDemo.py b/examples/demos/Classification/BloodMnist/ClassDemo.py
index faf0036..a6872f8 100644
--- a/examples/demos/Classification/BloodMnist/ClassDemo.py
+++ b/examples/demos/Classification/BloodMnist/ClassDemo.py
@@ -181,13 +181,13 @@
     return correct
 
 
-# define pre-processing methods (transforms)
+# Define pre-processing methods (transforms)
 transforms = Compose([
     ToTensor(),
     Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
 ])
 
-# dataset loading
+# Dataset loading
 dataset_path = "./bloodmnist"
 train_path = os.path.join(dataset_path, "train")
 val_path = os.path.join(dataset_path, "val") 
@@ -201,12 +201,12 @@
 
 batch_size = 256
 
-# model configuration
+# Model configuration for CNN
 model = CNNModel(num_classes=num_class)
 criterion = layer.SoftMaxCrossEntropy()
 optimizer_ft = opt.Adam(lr=1e-3)
 
-# start training
+# Start training
 dev = device.create_cpu_device()
 dev.SetRandSeed(0)
 np.random.seed(0)
@@ -234,10 +234,10 @@
     test_correct = np.zeros(shape=[1], dtype=np.float32)
     train_loss = np.zeros(shape=[1], dtype=np.float32)
 
-    # training part
+    # Training part
     model.train()
     for b in tqdm(range(num_train_batch)):
-        # extract batch from image list
+        # Extract batch from image list
         x, y = train_dataset.batchgenerator(idx[b * batch_size:(b + 1) * batch_size], 
             batch_size=batch_size, data_size=(3, model.input_size, model.input_size))
         x = x.astype(np_dtype['float32'])
@@ -252,7 +252,7 @@
                   (train_loss, train_correct /
                    (num_train_batch * batch_size)))
 
-    # validation part
+    # Validation part
     model.eval()
     for b in tqdm(range(num_val_batch)):
         x, y = train_dataset.batchgenerator(idx[b * batch_size:(b + 1) * batch_size], 
diff --git a/examples/demos/Classification/BloodMnist/transforms.py b/examples/demos/Classification/BloodMnist/transforms.py
index 1375ffd..5b51117 100644
--- a/examples/demos/Classification/BloodMnist/transforms.py
+++ b/examples/demos/Classification/BloodMnist/transforms.py
@@ -84,14 +84,14 @@
         if not isinstance(pic, Image.Image):
            raise TypeError('pic should be PIL Image. Got {}'.format(type(pic)))
 
-        # handle PIL Image
+        # Handle PIL Image
         mode_to_nptype = {'I': np.int32, 'I;16': np.int16, 'F': np.float32}
         img = np.array(pic, mode_to_nptype.get(pic.mode, np.uint8), copy=True)
 
         if pic.mode == '1':
             img = 255 * img
 
-        # put it from HWC to CHW format
+        # Put it from HWC to CHW format
         img = np.transpose(img, (2, 0, 1))
 
         if img.dtype == np.uint8:
@@ -128,7 +128,7 @@
         self.std = std
         self.inplace = inplace
 
-    def forward(self, img: np.ndarray) -> np.ndarray:
+    def forward(self, img: np.ndarray):
         """
         Args:
             img (Numpy ndarray): Array image to be normalized.
diff --git a/examples/largedataset_cnn/autograd/cifar10_multiprocess.py b/examples/largedataset_cnn/autograd/cifar10_multiprocess.py
new file mode 100644
index 0000000..df2dba8
--- /dev/null
+++ b/examples/largedataset_cnn/autograd/cifar10_multiprocess.py
@@ -0,0 +1,43 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+
+from resnet_cifar10 import *
+import multiprocessing
+import sys
+
+if __name__ == '__main__':
+
+    # Generate a NCCL ID to be used for collective communication
+    nccl_id = singa.NcclIdHolder()
+
+    # Configure number of GPUs to be used
+    world_size = int(sys.argv[1])
+
+    # Testing the experimental partial-parameter update asynchronous training
+    partial_update = True
+
+    process = []
+    for local_rank in range(0, world_size):
+        process.append(
+            multiprocessing.Process(target=train_cifar10,
+                                    args=(True, local_rank, world_size, nccl_id,
+                                          partial_update)))
+
+    for p in process:
+        p.start()
diff --git a/examples/largedataset_cnn/autograd/mnist_cnn.py b/examples/largedataset_cnn/autograd/mnist_cnn.py
new file mode 100644
index 0000000..9fa921c
--- /dev/null
+++ b/examples/largedataset_cnn/autograd/mnist_cnn.py
@@ -0,0 +1,304 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+
+from singa import singa_wrap as singa
+from singa import autograd
+from singa import layer
+from singa import tensor
+from singa import device
+from singa import opt
+import numpy as np
+import os
+import sys
+import gzip
+import codecs
+import time
+
+
+class CNN:
+
+    def __init__(self):
+        self.conv1 = layer.Conv2d(1, 20, 5, padding=0)
+        self.conv2 = layer.Conv2d(20, 50, 5, padding=0)
+        self.linear1 = layer.Linear(4 * 4 * 50, 500)
+        self.linear2 = layer.Linear(500, 10)
+        self.pooling1 = layer.MaxPool2d(2, 2, padding=0)
+        self.pooling2 = layer.MaxPool2d(2, 2, padding=0)
+        self.relu1 = layer.ReLU()
+        self.relu2 = layer.ReLU()
+        self.relu3 = layer.ReLU()
+        self.flatten = layer.Flatten()
+
+    def forward(self, x):
+        y = self.conv1(x)
+        y = self.relu1(y)
+        y = self.pooling1(y)
+        y = self.conv2(y)
+        y = self.relu2(y)
+        y = self.pooling2(y)
+        y = self.flatten(y)
+        y = self.linear1(y)
+        y = self.relu3(y)
+        y = self.linear2(y)
+        return y
+
+
+def check_dataset_exist(dirpath):
+    if not os.path.exists(dirpath):
+        print(
+            'The MNIST dataset does not exist. Please download the mnist dataset using download_mnist.py (e.g. python3 download_mnist.py)'
+        )
+        sys.exit(0)
+    return dirpath
+
+
+def load_dataset():
+    train_x_path = '/tmp/train-images-idx3-ubyte.gz'
+    train_y_path = '/tmp/train-labels-idx1-ubyte.gz'
+    valid_x_path = '/tmp/t10k-images-idx3-ubyte.gz'
+    valid_y_path = '/tmp/t10k-labels-idx1-ubyte.gz'
+
+    train_x = read_image_file(check_dataset_exist(train_x_path)).astype(
+        np.float32)
+    train_y = read_label_file(check_dataset_exist(train_y_path)).astype(
+        np.float32)
+    valid_x = read_image_file(check_dataset_exist(valid_x_path)).astype(
+        np.float32)
+    valid_y = read_label_file(check_dataset_exist(valid_y_path)).astype(
+        np.float32)
+    return train_x, train_y, valid_x, valid_y
+
+
+def read_label_file(path):
+    with gzip.open(path, 'rb') as f:
+        data = f.read()
+        assert get_int(data[:4]) == 2049
+        length = get_int(data[4:8])
+        parsed = np.frombuffer(data, dtype=np.uint8, offset=8).reshape((length))
+        return parsed
+
+
+def get_int(b):
+    return int(codecs.encode(b, 'hex'), 16)
+
+
+def read_image_file(path):
+    with gzip.open(path, 'rb') as f:
+        data = f.read()
+        assert get_int(data[:4]) == 2051
+        length = get_int(data[4:8])
+        num_rows = get_int(data[8:12])
+        num_cols = get_int(data[12:16])
+        parsed = np.frombuffer(data, dtype=np.uint8, offset=16).reshape(
+            (length, 1, num_rows, num_cols))
+        return parsed
+
+
+def to_categorical(y, num_classes):
+    y = np.array(y, dtype="int")
+    n = y.shape[0]
+    categorical = np.zeros((n, num_classes))
+    categorical[np.arange(n), y] = 1
+    categorical = categorical.astype(np.float32)
+    return categorical
+
+
+def accuracy(pred, target):
+    y = np.argmax(pred, axis=1)
+    t = np.argmax(target, axis=1)
+    a = y == t
+    return np.array(a, "int").sum()
+
+
+# Function to all reduce NUMPY accuracy and loss from multiple devices
+def reduce_variable(variable, dist_opt, reducer):
+    reducer.copy_from_numpy(variable)
+    dist_opt.all_reduce(reducer.data)
+    dist_opt.wait()
+    output = tensor.to_numpy(reducer)
+    return output
+
+
+# Function to synchronize SINGA TENSOR initial model parameters
+def synchronize(tensor, dist_opt):
+    dist_opt.all_reduce(tensor.data)
+    dist_opt.wait()
+    tensor /= dist_opt.world_size
+
+
+# Data augmentation
+def augmentation(x, batch_size):
+    xpad = np.pad(x, [[0, 0], [0, 0], [4, 4], [4, 4]], 'symmetric')
+    for data_num in range(0, batch_size):
+        offset = np.random.randint(8, size=2)
+        x[data_num, :, :, :] = xpad[data_num, :, offset[0]:offset[0] + 28,
+                                    offset[1]:offset[1] + 28]
+        if_flip = np.random.randint(2)
+        if (if_flip):
+            x[data_num, :, :, :] = x[data_num, :, :, ::-1]
+    return x
+
+
+# Data partition
+def data_partition(dataset_x, dataset_y, global_rank, world_size):
+    data_per_rank = dataset_x.shape[0] // world_size
+    idx_start = global_rank * data_per_rank
+    idx_end = (global_rank + 1) * data_per_rank
+    return dataset_x[idx_start:idx_end], dataset_y[idx_start:idx_end]
+
+
+def train_mnist_cnn(DIST=False,
+                    local_rank=None,
+                    world_size=None,
+                    nccl_id=None,
+                    spars=0,
+                    topK=False,
+                    corr=True):
+
+    # Define the hyperparameters for the mnist_cnn
+    max_epoch = 10
+    batch_size = 64
+    sgd = opt.SGD(lr=0.005, momentum=0.9, weight_decay=1e-5)
+
+    # Prepare training and validation data
+    train_x, train_y, test_x, test_y = load_dataset()
+    IMG_SIZE = 28
+    num_classes = 10
+    train_y = to_categorical(train_y, num_classes)
+    test_y = to_categorical(test_y, num_classes)
+
+    # Normalization
+    train_x = train_x / 255
+    test_x = test_x / 255
+
+    if DIST:
+        # For distributed GPU training
+        sgd = opt.DistOpt(sgd,
+                          nccl_id=nccl_id,
+                          local_rank=local_rank,
+                          world_size=world_size)
+        dev = device.create_cuda_gpu_on(sgd.local_rank)
+
+        # Dataset partition for distributed training
+        train_x, train_y = data_partition(train_x, train_y, sgd.global_rank,
+                                          sgd.world_size)
+        test_x, test_y = data_partition(test_x, test_y, sgd.global_rank,
+                                        sgd.world_size)
+        world_size = sgd.world_size
+    else:
+        # For single GPU
+        dev = device.create_cuda_gpu()
+        world_size = 1
+
+    # Create model
+    model = CNN()
+
+    tx = tensor.Tensor((batch_size, 1, IMG_SIZE, IMG_SIZE), dev, tensor.float32)
+    ty = tensor.Tensor((batch_size, num_classes), dev, tensor.int32)
+    num_train_batch = train_x.shape[0] // batch_size
+    num_test_batch = test_x.shape[0] // batch_size
+    idx = np.arange(train_x.shape[0], dtype=np.int32)
+
+    if DIST:
+        # Synchronize the initial parameters
+        autograd.training = True
+        x = np.random.randn(batch_size, 1, IMG_SIZE,
+                            IMG_SIZE).astype(np.float32)
+        y = np.zeros(shape=(batch_size, num_classes), dtype=np.int32)
+        tx.copy_from_numpy(x)
+        ty.copy_from_numpy(y)
+        out = model.forward(tx)
+        loss = autograd.softmax_cross_entropy(out, ty)
+        for p, g in autograd.backward(loss):
+            synchronize(p, sgd)
+
+    # Training and evaluation loop
+    for epoch in range(max_epoch):
+        start_time = time.time()
+        np.random.shuffle(idx)
+
+        if ((DIST == False) or (sgd.global_rank == 0)):
+            print('Starting Epoch %d:' % (epoch))
+
+        # Training phase
+        autograd.training = True
+        train_correct = np.zeros(shape=[1], dtype=np.float32)
+        test_correct = np.zeros(shape=[1], dtype=np.float32)
+        train_loss = np.zeros(shape=[1], dtype=np.float32)
+
+        for b in range(num_train_batch):
+            x = train_x[idx[b * batch_size:(b + 1) * batch_size]]
+            x = augmentation(x, batch_size)
+            y = train_y[idx[b * batch_size:(b + 1) * batch_size]]
+            tx.copy_from_numpy(x)
+            ty.copy_from_numpy(y)
+            out = model.forward(tx)
+            loss = autograd.softmax_cross_entropy(out, ty)
+            train_correct += accuracy(tensor.to_numpy(out), y)
+            train_loss += tensor.to_numpy(loss)[0]
+            if DIST:
+                if (spars == 0):
+                    sgd.backward_and_update(loss, threshold=50000)
+                else:
+                    sgd.backward_and_sparse_update(loss,
+                                                   spars=spars,
+                                                   topK=topK,
+                                                   corr=corr)
+            else:
+                sgd(loss)
+
+        if DIST:
+            # Reduce the evaluation accuracy and loss from multiple devices
+            reducer = tensor.Tensor((1,), dev, tensor.float32)
+            train_correct = reduce_variable(train_correct, sgd, reducer)
+            train_loss = reduce_variable(train_loss, sgd, reducer)
+
+        # Output the training loss and accuracy
+        if ((DIST == False) or (sgd.global_rank == 0)):
+            print('Training loss = %f, training accuracy = %f' %
+                  (train_loss, train_correct /
+                   (num_train_batch * batch_size * world_size)),
+                  flush=True)
+
+        # Evaluation phase
+        autograd.training = False
+        for b in range(num_test_batch):
+            x = test_x[b * batch_size:(b + 1) * batch_size]
+            y = test_y[b * batch_size:(b + 1) * batch_size]
+            tx.copy_from_numpy(x)
+            ty.copy_from_numpy(y)
+            out_test = model.forward(tx)
+            test_correct += accuracy(tensor.to_numpy(out_test), y)
+
+        if DIST:
+            # Reduce the evaluation accuracy from multiple devices
+            test_correct = reduce_variable(test_correct, sgd, reducer)
+
+        # Output the evaluation accuracy
+        if ((DIST == False) or (sgd.global_rank == 0)):
+            print('Evaluation accuracy = %f, Elapsed Time = %fs' %
+                  (test_correct / (num_test_batch * batch_size * world_size),
+                   time.time() - start_time),
+                  flush=True)
+
+
+if __name__ == '__main__':
+
+    DIST = False
+    train_mnist_cnn(DIST=DIST)
\ No newline at end of file
diff --git a/examples/largedataset_cnn/autograd/mnist_dist.py b/examples/largedataset_cnn/autograd/mnist_dist.py
new file mode 100644
index 0000000..e705c31
--- /dev/null
+++ b/examples/largedataset_cnn/autograd/mnist_dist.py
@@ -0,0 +1,25 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+
+from mnist_cnn import *
+
+if __name__ == '__main__':
+
+    DIST = True
+    train_mnist_cnn(DIST=DIST)
\ No newline at end of file
diff --git a/examples/largedataset_cnn/autograd/mnist_multiprocess.py b/examples/largedataset_cnn/autograd/mnist_multiprocess.py
new file mode 100644
index 0000000..f51344f
--- /dev/null
+++ b/examples/largedataset_cnn/autograd/mnist_multiprocess.py
@@ -0,0 +1,39 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+
+from mnist_cnn import *
+import multiprocessing
+import sys
+
+if __name__ == '__main__':
+
+    # Generate a NCCL ID to be used for collective communication
+    nccl_id = singa.NcclIdHolder()
+
+    # Number of GPUs to be used
+    world_size = int(sys.argv[1])
+
+    process = []
+    for local_rank in range(0, world_size):
+        process.append(
+            multiprocessing.Process(target=train_mnist_cnn,
+                                    args=(True, local_rank, world_size, nccl_id)))
+
+    for p in process:
+        p.start()
diff --git a/examples/largedataset_cnn/autograd/resnet_cifar10.py b/examples/largedataset_cnn/autograd/resnet_cifar10.py
new file mode 100644
index 0000000..a8e6efd
--- /dev/null
+++ b/examples/largedataset_cnn/autograd/resnet_cifar10.py
@@ -0,0 +1,292 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+
+try:
+    import pickle
+except ImportError:
+    import cPickle as pickle
+
+from singa import singa_wrap as singa
+from singa import autograd
+from singa import tensor
+from singa import device
+from singa import opt
+from PIL import Image
+import numpy as np
+import os
+import sys
+import time
+
+
+def load_dataset(filepath):
+    with open(filepath, 'rb') as fd:
+        try:
+            cifar10 = pickle.load(fd, encoding='latin1')
+        except TypeError:
+            cifar10 = pickle.load(fd)
+    image = cifar10['data'].astype(dtype=np.uint8)
+    image = image.reshape((-1, 3, 32, 32))
+    label = np.asarray(cifar10['labels'], dtype=np.uint8)
+    label = label.reshape(label.size, 1)
+    return image, label
+
+
+def load_train_data(dir_path='cifar-10-batches-py', num_batches=5):
+    labels = []
+    batchsize = 10000
+    images = np.empty((num_batches * batchsize, 3, 32, 32), dtype=np.uint8)
+    for did in range(1, num_batches + 1):
+        fname_train_data = dir_path + "/data_batch_{}".format(did)
+        image, label = load_dataset(check_dataset_exist(fname_train_data))
+        images[(did - 1) * batchsize:did * batchsize] = image
+        labels.extend(label)
+    images = np.array(images, dtype=np.float32)
+    labels = np.array(labels, dtype=np.int32)
+    return images, labels
+
+
+def load_test_data(dir_path='cifar-10-batches-py'):
+    images, labels = load_dataset(check_dataset_exist(dir_path + "/test_batch"))
+    return np.array(images, dtype=np.float32), np.array(labels, dtype=np.int32)
+
+
+def check_dataset_exist(dirpath):
+    if not os.path.exists(dirpath):
+        print(
+            'Please download the cifar10 dataset using download_data.py (e.g. python ~/singa/examples/cifar10/download_data.py py)'
+        )
+        sys.exit(0)
+    return dirpath
+
+
+def normalize_for_resnet(train_x, test_x):
+    mean = [0.4914, 0.4822, 0.4465]
+    std = [0.2023, 0.1994, 0.2010]
+    train_x /= 255
+    test_x /= 255
+    for ch in range(0, 3):
+        train_x[:, ch, :, :] -= mean[ch]
+        train_x[:, ch, :, :] /= std[ch]
+        test_x[:, ch, :, :] -= mean[ch]
+        test_x[:, ch, :, :] /= std[ch]
+    return train_x, test_x
+
+
+def resize_dataset(x, IMG_SIZE):
+    num_data = x.shape[0]
+    dim = x.shape[1]
+    X = np.zeros(shape=(num_data, dim, IMG_SIZE, IMG_SIZE), dtype=np.float32)
+    for n in range(0, num_data):
+        for d in range(0, dim):
+            X[n, d, :, :] = np.array(Image.fromarray(x[n, d, :, :]).resize(
+                (IMG_SIZE, IMG_SIZE), Image.BILINEAR),
+                                     dtype=np.float32)
+    return X
+
+
+def augmentation(x, batch_size):
+    xpad = np.pad(x, [[0, 0], [0, 0], [4, 4], [4, 4]], 'symmetric')
+    for data_num in range(0, batch_size):
+        offset = np.random.randint(8, size=2)
+        x[data_num, :, :, :] = xpad[data_num, :, offset[0]:offset[0] + 32,
+                                    offset[1]:offset[1] + 32]
+        if_flip = np.random.randint(2)
+        if (if_flip):
+            x[data_num, :, :, :] = x[data_num, :, :, ::-1]
+    return x
+
+
+def accuracy(pred, target):
+    y = np.argmax(pred, axis=1)
+    t = np.argmax(target, axis=1)
+    a = y == t
+    return np.array(a, "int").sum()
+
+
+def to_categorical(y, num_classes):
+    y = np.array(y, dtype="int")
+    n = y.shape[0]
+    categorical = np.zeros((n, num_classes))
+    for i in range(0, n):
+        categorical[i, y[i]] = 1
+        categorical = categorical.astype(np.float32)
+    return categorical
+
+
+# Function to all reduce NUMPY accuracy and loss from multiple devices
+def reduce_variable(variable, dist_opt, reducer):
+    reducer.copy_from_numpy(variable)
+    dist_opt.all_reduce(reducer.data)
+    dist_opt.wait()
+    output = tensor.to_numpy(reducer)
+    return output
+
+
+# Function to synchronize SINGA TENSOR initial model parameters
+def synchronize(tensor, dist_opt):
+    dist_opt.all_reduce(tensor.data)
+    dist_opt.wait()
+    tensor /= dist_opt.world_size
+
+
+# Data partition
+def data_partition(dataset_x, dataset_y, global_rank, world_size):
+    data_per_rank = dataset_x.shape[0] // world_size
+    idx_start = global_rank * data_per_rank
+    idx_end = (global_rank + 1) * data_per_rank
+    return dataset_x[idx_start:idx_end], dataset_y[idx_start:idx_end]
+
+
+def train_cifar10(DIST=False,
+                  local_rank=None,
+                  world_size=None,
+                  nccl_id=None,
+                  partial_update=False):
+
+    # Define the hyperparameters for the train_cifar10
+    sgd = opt.SGD(lr=0.005, momentum=0.9, weight_decay=1e-5)
+    max_epoch = 5
+    batch_size = 32
+
+    train_x, train_y = load_train_data()
+    test_x, test_y = load_test_data()
+    train_x, test_x = normalize_for_resnet(train_x, test_x)
+    IMG_SIZE = 224
+    num_classes = 10
+
+    if DIST:
+        # For distributed GPU training
+        sgd = opt.DistOpt(sgd,
+                          nccl_id=nccl_id,
+                          local_rank=local_rank,
+                          world_size=world_size)
+        dev = device.create_cuda_gpu_on(sgd.local_rank)
+
+        # Dataset partition for distributed training
+        train_x, train_y = data_partition(train_x, train_y, sgd.global_rank,
+                                          sgd.world_size)
+        test_x, test_y = data_partition(test_x, test_y, sgd.global_rank,
+                                        sgd.world_size)
+        world_size = sgd.world_size
+    else:
+        # For single GPU
+        dev = device.create_cuda_gpu()
+        world_size = 1
+
+    from resnet import resnet50
+    model = resnet50(num_classes=num_classes)
+
+    tx = tensor.Tensor((batch_size, 3, IMG_SIZE, IMG_SIZE), dev, tensor.float32)
+    ty = tensor.Tensor((batch_size,), dev, tensor.int32)
+    num_train_batch = train_x.shape[0] // batch_size
+    num_test_batch = test_x.shape[0] // batch_size
+    idx = np.arange(train_x.shape[0], dtype=np.int32)
+
+    if DIST:
+        # Synchronize the initial parameters
+        autograd.training = True
+        x = np.random.randn(batch_size, 3, IMG_SIZE,
+                            IMG_SIZE).astype(np.float32)
+        y = np.zeros(shape=(batch_size,), dtype=np.int32)
+        tx.copy_from_numpy(x)
+        ty.copy_from_numpy(y)
+        out = model(tx)
+        loss = autograd.softmax_cross_entropy(out, ty)
+        param = []
+        for p, _ in autograd.backward(loss):
+            synchronize(p, sgd)
+            param.append(p)
+
+    for epoch in range(max_epoch):
+        start_time = time.time()
+        np.random.shuffle(idx)
+
+        if ((DIST == False) or (sgd.global_rank == 0)):
+            print('Starting Epoch %d:' % (epoch))
+
+        # Training phase
+        autograd.training = True
+        train_correct = np.zeros(shape=[1], dtype=np.float32)
+        test_correct = np.zeros(shape=[1], dtype=np.float32)
+        train_loss = np.zeros(shape=[1], dtype=np.float32)
+
+        for b in range(num_train_batch):
+            x = train_x[idx[b * batch_size:(b + 1) * batch_size]]
+            x = augmentation(x, batch_size)
+            x = resize_dataset(x, IMG_SIZE)
+            y = train_y[idx[b * batch_size:(b + 1) * batch_size]]
+            tx.copy_from_numpy(x)
+            ty.copy_from_numpy(y)
+            out = model(tx)
+            loss = autograd.softmax_cross_entropy(out, ty)
+            train_correct += accuracy(tensor.to_numpy(out),
+                                      to_categorical(y, num_classes)).astype(
+                                          np.float32)
+            train_loss += tensor.to_numpy(loss)[0]
+            if not partial_update:
+                sgd.backward_and_update(loss)
+            else:
+                sgd.backward_and_partial_update(loss)
+
+        if DIST:
+            # Reduce the evaluation accuracy and loss from multiple devices
+            reducer = tensor.Tensor((1,), dev, tensor.float32)
+            train_correct = reduce_variable(train_correct, sgd, reducer)
+            train_loss = reduce_variable(train_loss, sgd, reducer)
+
+        # Output the training loss and accuracy
+        if ((DIST == False) or (sgd.global_rank == 0)):
+            print('Training loss = %f, training accuracy = %f' %
+                  (train_loss, train_correct /
+                   (num_train_batch * batch_size * world_size)),
+                  flush=True)
+
+        if partial_update:
+            # Synchronize parameters before evaluation phase
+            for p in param:
+                synchronize(p, sgd)
+
+        # Evaluation phase
+        autograd.training = False
+        for b in range(num_test_batch):
+            x = test_x[b * batch_size:(b + 1) * batch_size]
+            x = resize_dataset(x, IMG_SIZE)
+            y = test_y[b * batch_size:(b + 1) * batch_size]
+            tx.copy_from_numpy(x)
+            ty.copy_from_numpy(y)
+            out_test = model(tx)
+            test_correct += accuracy(tensor.to_numpy(out_test),
+                                     to_categorical(y, num_classes))
+
+        if DIST:
+            # Reduce the evaluation accuracy from multiple devices
+            test_correct = reduce_variable(test_correct, sgd, reducer)
+
+        # Output the evaluation accuracy
+        if ((DIST == False) or (sgd.global_rank == 0)):
+            print('Evaluation accuracy = %f, Elapsed Time = %fs' %
+                  (test_correct / (num_test_batch * batch_size * world_size),
+                   time.time() - start_time),
+                  flush=True)
+
+
+if __name__ == '__main__':
+
+    DIST = False
+    train_cifar10(DIST=DIST)
diff --git a/examples/largedataset_cnn/autograd/resnet_dist.py b/examples/largedataset_cnn/autograd/resnet_dist.py
new file mode 100755
index 0000000..6f9b56c
--- /dev/null
+++ b/examples/largedataset_cnn/autograd/resnet_dist.py
@@ -0,0 +1,87 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+
+# the code is modified from
+# https://github.com/pytorch/vision/blob/master/torchvision/models/resnet.py
+
+from singa import autograd
+from singa import tensor
+from singa import device
+from singa import opt
+
+import numpy as np
+from tqdm import trange
+
+if __name__ == "__main__":
+    sgd = opt.SGD(lr=0.1, momentum=0.9, weight_decay=1e-5)
+    sgd = opt.DistOpt(sgd)
+
+    if (sgd.global_rank == 0):
+        print("Start intialization...........", flush=True)
+
+    dev = device.create_cuda_gpu_on(sgd.local_rank)
+
+    from resnet import resnet50
+    model = resnet50()
+
+    niters = 100
+    batch_size = 32
+    IMG_SIZE = 224
+
+    tx = tensor.Tensor((batch_size, 3, IMG_SIZE, IMG_SIZE), dev)
+    ty = tensor.Tensor((batch_size,), dev, tensor.int32)
+    autograd.training = True
+    x = np.random.randn(batch_size, 3, IMG_SIZE, IMG_SIZE).astype(np.float32)
+    y = np.random.randint(0, 1000, batch_size, dtype=np.int32)
+    tx.copy_from_numpy(x)
+    ty.copy_from_numpy(y)
+
+    import time
+
+    dev.Sync()
+    start = time.time()
+    fd = 0
+    softmax = 0
+    update = 0
+    with trange(niters) as t:
+        for _ in t:
+            dev.Sync()
+            tick = time.time()
+            x = model(tx)
+            dev.Sync()
+            fd += time.time() - tick
+            tick = time.time()
+            loss = autograd.softmax_cross_entropy(x, ty)
+            dev.Sync()
+            softmax += time.time() - tick
+            sgd.backward_and_update(loss)
+
+    dev.Sync()
+    end = time.time()
+    throughput = float(sgd.world_size * niters * batch_size) / (end - start)
+    titer = (end - start) / float(niters)
+    tforward = float(fd) / float(niters)
+    tsoftmax = float(softmax) / float(niters)
+    tbackward = titer - tforward - tsoftmax
+
+    if (sgd.global_rank == 0):
+        print("\nThroughput = {} per second".format(throughput), flush=True)
+        print("Total={}, forward={}, softmax={}, backward={}".format(
+            titer, tforward, tsoftmax, tbackward),
+              flush=True)
diff --git a/examples/largedataset_cnn/autograd/sparsification_mnist.py b/examples/largedataset_cnn/autograd/sparsification_mnist.py
new file mode 100644
index 0000000..315605a
--- /dev/null
+++ b/examples/largedataset_cnn/autograd/sparsification_mnist.py
@@ -0,0 +1,45 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+
+from mnist_cnn import *
+import multiprocessing
+import sys
+
+if __name__ == '__main__':
+
+    # Generate a NCCL ID to be used for collective communication
+    nccl_id = singa.NcclIdHolder()
+
+    # Number of GPUs to be used
+    world_size = int(sys.argv[1])
+
+    # Use sparsification with parameters
+    topK = False  # When topK = False, sparsification is based on a constant absolute threshold
+    corr = True  # If True, uses the locally accumulated gradient for the correction
+    sparsThreshold = 0.05  # The constant absolute threshold for sparsification
+
+    process = []
+    for local_rank in range(0, world_size):
+        process.append(
+            multiprocessing.Process(target=train_mnist_cnn,
+                                    args=(True, local_rank, world_size, nccl_id,
+                                          sparsThreshold, topK, corr)))
+
+    for p in process:
+        p.start()
diff --git a/examples/largedataset_cnn/autograd/xceptionnet.py b/examples/largedataset_cnn/autograd/xceptionnet.py
new file mode 100644
index 0000000..ce28640
--- /dev/null
+++ b/examples/largedataset_cnn/autograd/xceptionnet.py
@@ -0,0 +1,302 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# =============================================================================
+
+from singa import autograd
+from singa import tensor
+from singa import device
+from singa import layer
+from singa import opt
+
+import numpy as np
+from tqdm import trange
+
+# the code is modified from
+# https://github.com/Cadene/pretrained-models.pytorch/blob/master/pretrainedmodels/models/xception.py
+
+
+class Block(layer.Layer):
+
+    def __init__(self,
+                 in_filters,
+                 out_filters,
+                 reps,
+                 strides=1,
+                 padding=0,
+                 start_with_relu=True,
+                 grow_first=True):
+        super(Block, self).__init__()
+
+        if out_filters != in_filters or strides != 1:
+            self.skip = layer.Conv2d(in_filters,
+                                     out_filters,
+                                     1,
+                                     stride=strides,
+                                     padding=padding,
+                                     bias=False)
+            self.skipbn = layer.BatchNorm2d(out_filters)
+        else:
+            self.skip = None
+
+        self.layers = []
+
+        filters = in_filters
+        if grow_first:
+            self.layers.append(layer.ReLU())
+            self.layers.append(
+                layer.SeparableConv2d(in_filters,
+                                      out_filters,
+                                      3,
+                                      stride=1,
+                                      padding=1,
+                                      bias=False))
+            self.layers.append(layer.BatchNorm2d(out_filters))
+            filters = out_filters
+
+        for i in range(reps - 1):
+            self.layers.append(layer.ReLU())
+            self.layers.append(
+                layer.SeparableConv2d(filters,
+                                      filters,
+                                      3,
+                                      stride=1,
+                                      padding=1,
+                                      bias=False))
+            self.layers.append(layer.BatchNorm2d(filters))
+
+        if not grow_first:
+            self.layers.append(layer.ReLU())
+            self.layers.append(
+                layer.SeparableConv2d(in_filters,
+                                      out_filters,
+                                      3,
+                                      stride=1,
+                                      padding=1,
+                                      bias=False))
+            self.layers.append(layer.BatchNorm2d(out_filters))
+
+        if not start_with_relu:
+            self.layers = self.layers[1:]
+        else:
+            self.layers[0] = layer.ReLU()
+
+        if strides != 1:
+            self.layers.append(layer.MaxPool2d(3, strides, padding + 1))
+
+        self.register_layers(*self.layers)
+
+        self.add = layer.Add()
+
+    def forward(self, x):
+        y = self.layers[0](x)
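+        # Chain the remaining layers in order; if a layer returns a tuple, only
+        # its first (tensor) element is passed on to the next layer.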
+        for lyr in self.layers[1:]:
+            if isinstance(y, tuple):
+                y = y[0]
+            y = lyr(y)
+
+        if self.skip is not None:
+            skip = self.skip(x)
+            skip = self.skipbn(skip)
+        else:
+            skip = x
+        y = self.add(y, skip)
+        return y
+
+
+__all__ = ['Xception']
+
+
+class Xception(layer.Layer):
+    """
+    Xception optimized for the ImageNet dataset, as specified in
+    https://arxiv.org/pdf/1610.02357.pdf
+    """
+
+    def __init__(self, num_classes=1000):
+        """ Constructor
+        Args:
+            num_classes: number of classes
+        """
+        super(Xception, self).__init__()
+        self.num_classes = num_classes
+
+        self.conv1 = layer.Conv2d(3, 32, 3, 2, 0, bias=False)
+        self.bn1 = layer.BatchNorm2d(32)
+        self.relu1 = layer.ReLU()
+
+        self.conv2 = layer.Conv2d(32, 64, 3, 1, 1, bias=False)
+        self.bn2 = layer.BatchNorm2d(64)
+        self.relu2 = layer.ReLU()
+        # Relu Layer
+
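+        # Entry-flow blocks: each downsamples by 2 and grows the channel count
+        # from 64 to 128, 256 and finally 728.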
+        self.block1 = Block(64,
+                            128,
+                            2,
+                            2,
+                            padding=0,
+                            start_with_relu=False,
+                            grow_first=True)
+        self.block2 = Block(128,
+                            256,
+                            2,
+                            2,
+                            padding=0,
+                            start_with_relu=True,
+                            grow_first=True)
+        self.block3 = Block(256,
+                            728,
+                            2,
+                            2,
+                            padding=0,
+                            start_with_relu=True,
+                            grow_first=True)
+
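+        # Middle-flow blocks: eight identical 728-channel blocks, each with
+        # three separable convolutions and stride 1.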
+        self.block4 = Block(728,
+                            728,
+                            3,
+                            1,
+                            start_with_relu=True,
+                            grow_first=True)
+        self.block5 = Block(728,
+                            728,
+                            3,
+                            1,
+                            start_with_relu=True,
+                            grow_first=True)
+        self.block6 = Block(728,
+                            728,
+                            3,
+                            1,
+                            start_with_relu=True,
+                            grow_first=True)
+        self.block7 = Block(728,
+                            728,
+                            3,
+                            1,
+                            start_with_relu=True,
+                            grow_first=True)
+
+        self.block8 = Block(728,
+                            728,
+                            3,
+                            1,
+                            start_with_relu=True,
+                            grow_first=True)
+        self.block9 = Block(728,
+                            728,
+                            3,
+                            1,
+                            start_with_relu=True,
+                            grow_first=True)
+        self.block10 = Block(728,
+                             728,
+                             3,
+                             1,
+                             start_with_relu=True,
+                             grow_first=True)
+        self.block11 = Block(728,
+                             728,
+                             3,
+                             1,
+                             start_with_relu=True,
+                             grow_first=True)
+
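+        # Exit-flow transition: grows from 728 to 1024 channels with stride 2.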
+        self.block12 = Block(728,
+                             1024,
+                             2,
+                             2,
+                             start_with_relu=True,
+                             grow_first=False)
+
+        self.conv3 = layer.SeparableConv2d(1024, 1536, 3, 1, 1)
+        self.bn3 = layer.BatchNorm2d(1536)
+        self.relu3 = layer.ReLU()
+
+        # Relu Layer
+        self.conv4 = layer.SeparableConv2d(1536, 2048, 3, 1, 1)
+        self.bn4 = layer.BatchNorm2d(2048)
+
+        self.relu4 = layer.ReLU()
+        self.globalpooling = layer.MaxPool2d(10, 1)
+        self.flatten = layer.Flatten()
+        self.fc = layer.Linear(2048, num_classes)
+
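+    # features() runs the stem convolutions, the twelve blocks and the final
+    # separable convolutions; logits() applies the last ReLU, global pooling,
+    # flattening and the fully connected classifier.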
+    def features(self, input):
+        x = self.conv1(input)
+        x = self.bn1(x)
+        x = self.relu1(x)
+
+        x = self.conv2(x)
+        x = self.bn2(x)
+        x = self.relu2(x)
+
+        x = self.block1(x)
+        x = self.block2(x)
+        x = self.block3(x)
+        x = self.block4(x)
+        x = self.block5(x)
+        x = self.block6(x)
+        x = self.block7(x)
+        x = self.block8(x)
+        x = self.block9(x)
+        x = self.block10(x)
+        x = self.block11(x)
+        x = self.block12(x)
+
+        x = self.conv3(x)
+        x = self.bn3(x)
+        x = self.relu3(x)
+
+        x = self.conv4(x)
+        x = self.bn4(x)
+        return x
+
+    def logits(self, features):
+        x = self.relu4(features)
+        x = self.globalpooling(x)
+        x = self.flatten(x)
+        x = self.fc(x)
+        return x
+
+    def forward(self, input):
+        x = self.features(input)
+        x = self.logits(x)
+        return x
+
+
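+# A small smoke test: run a few training iterations on random data on one GPU.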
+if __name__ == '__main__':
+    model = Xception(num_classes=1000)
+    print('Start initialization............')
+    dev = device.create_cuda_gpu_on(0)
+
+    niters = 20
+    batch_size = 16
+    IMG_SIZE = 299
+    sgd = opt.SGD(lr=0.1, momentum=0.9, weight_decay=1e-5)
+
+    tx = tensor.Tensor((batch_size, 3, IMG_SIZE, IMG_SIZE), dev)
+    ty = tensor.Tensor((batch_size,), dev, tensor.int32)
+    autograd.training = True
+    x = np.random.randn(batch_size, 3, IMG_SIZE, IMG_SIZE).astype(np.float32)
+    y = np.random.randint(0, 1000, batch_size, dtype=np.int32)
+    tx.copy_from_numpy(x)
+    ty.copy_from_numpy(y)
+
+    with trange(niters) as t:
+        for _ in t:
+            x = model(tx)
+            loss = autograd.softmax_cross_entropy(x, ty)
+            sgd(loss)
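+
+    # To move from this smoke test to real training one would, illustratively,
+    # replace the random x/y batches above with data loaded from disk and build
+    # the model with the dataset's label count, e.g. Xception(num_classes=10).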
diff --git a/examples/largedataset_cnn/train_largedata.py b/examples/largedataset_cnn/train_largedata.py
index 6dfd548..a6b7704 100755
--- a/examples/largedataset_cnn/train_largedata.py
+++ b/examples/largedataset_cnn/train_largedata.py
@@ -142,13 +142,6 @@
         train_x, train_y, val_x, val_y = partition(global_rank, world_size,
                                                    train_x, train_y, val_x,
                                                    val_y)
-    '''
-    # Check dataset shape correctness
-    if global_rank == 0:
-        print("Check the shape of dataset:")
-        print(train_x.shape)
-        print(train_y.shape)
-    '''
 
     if model.dimension == 4:
         tx = tensor.Tensor(
diff --git a/examples/singa_easy/singa_easy/__init__.py b/examples/singa_easy/singa_easy/__init__.py
new file mode 100644
index 0000000..b6cab62
--- /dev/null
+++ b/examples/singa_easy/singa_easy/__init__.py
@@ -0,0 +1,20 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+
+#!/usr/bin/env python
\ No newline at end of file
diff --git a/examples/singa_easy/singa_easy/modules/__init__.py b/examples/singa_easy/singa_easy/modules/__init__.py
new file mode 100644
index 0000000..752cd17
--- /dev/null
+++ b/examples/singa_easy/singa_easy/modules/__init__.py
@@ -0,0 +1,21 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+
+#!/usr/bin/env python
+
diff --git a/examples/singa_easy/singa_easy/modules/explanations/__init__.py b/examples/singa_easy/singa_easy/modules/explanations/__init__.py
new file mode 100644
index 0000000..b6cab62
--- /dev/null
+++ b/examples/singa_easy/singa_easy/modules/explanations/__init__.py
@@ -0,0 +1,20 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+
+#!/usr/bin/env python
\ No newline at end of file
diff --git a/examples/singa_easy/singa_easy/modules/explanations/lime/__init__.py b/examples/singa_easy/singa_easy/modules/explanations/lime/__init__.py
index fe95886..b6cab62 100644
--- a/examples/singa_easy/singa_easy/modules/explanations/lime/__init__.py
+++ b/examples/singa_easy/singa_easy/modules/explanations/lime/__init__.py
@@ -16,3 +16,5 @@
 # specific language governing permissions and limitations
 # under the License.
 #
+
+#!/usr/bin/env python
\ No newline at end of file