Merge pull request #836 from lzjpaul/cnn-autograd

Update CNN autograd examples
diff --git a/examples/cnn/autograd/mnist_cnn.py b/examples/cnn/autograd/mnist_cnn.py
index ff2e1dc..16752ce 100644
--- a/examples/cnn/autograd/mnist_cnn.py
+++ b/examples/cnn/autograd/mnist_cnn.py
@@ -126,7 +126,7 @@
     return np.array(a, "int").sum()
 
 
-# Function to all reduce NUMPY Accuracy and Loss from Multiple Devices
+# Function to all-reduce NumPy accuracy and loss from multiple devices
 def reduce_variable(variable, dist_opt, reducer):
     reducer.copy_from_numpy(variable)
     dist_opt.all_reduce(reducer.data)
@@ -171,7 +171,7 @@
                     topK=False,
                     corr=True):
 
-    # Define the hypermeters good for the mnist_cnn
+    # Define the hyperparameters for the mnist_cnn
     max_epoch = 10
     batch_size = 64
     sgd = opt.SGD(lr=0.005, momentum=0.9, weight_decay=1e-5)
@@ -188,12 +188,13 @@
     test_x = test_x / 255
 
     if DIST:
-        # For Distributed GPU Training
+        # For distributed GPU training
         sgd = opt.DistOpt(sgd,
                           nccl_id=nccl_id,
                           local_rank=local_rank,
                           world_size=world_size)
         dev = device.create_cuda_gpu_on(sgd.local_rank)
+
         # Dataset partition for distributed training
         train_x, train_y = data_partition(train_x, train_y, sgd.global_rank,
                                           sgd.world_size)
@@ -201,11 +202,11 @@
                                         sgd.world_size)
         world_size = sgd.world_size
     else:
-        # For Single GPU
+        # For single GPU
         dev = device.create_cuda_gpu()
         world_size = 1
 
-    # create model
+    # Create model
     model = CNN()
 
     tx = tensor.Tensor((batch_size, 1, IMG_SIZE, IMG_SIZE), dev, tensor.float32)
@@ -227,7 +228,7 @@
         for p, g in autograd.backward(loss):
             synchronize(p, sgd)
 
-    # Training and Evaulation Loop
+    # Training and evaluation loop
     for epoch in range(max_epoch):
         start_time = time.time()
         np.random.shuffle(idx)
@@ -235,7 +236,7 @@
         if ((DIST == False) or (sgd.global_rank == 0)):
             print('Starting Epoch %d:' % (epoch))
 
-        # Training Phase
+        # Training phase
         autograd.training = True
         train_correct = np.zeros(shape=[1], dtype=np.float32)
         test_correct = np.zeros(shape=[1], dtype=np.float32)
@@ -263,19 +264,19 @@
                 sgd(loss)
 
         if DIST:
-            # Reduce the Evaluation Accuracy and Loss from Multiple Devices
+            # Reduce the evaluation accuracy and loss from multiple devices
             reducer = tensor.Tensor((1,), dev, tensor.float32)
             train_correct = reduce_variable(train_correct, sgd, reducer)
             train_loss = reduce_variable(train_loss, sgd, reducer)
 
-        # Output the Training Loss and Accuracy
+        # Output the training loss and accuracy
         if ((DIST == False) or (sgd.global_rank == 0)):
             print('Training loss = %f, training accuracy = %f' %
                   (train_loss, train_correct /
                    (num_train_batch * batch_size * world_size)),
                   flush=True)
 
-        # Evaluation Phase
+        # Evaluation phase
         autograd.training = False
         for b in range(num_test_batch):
             x = test_x[b * batch_size:(b + 1) * batch_size]
@@ -286,10 +287,10 @@
             test_correct += accuracy(tensor.to_numpy(out_test), y)
 
         if DIST:
-            # Reduce the Evaulation Accuracy from Multiple Devices
+            # Reduce the evaluation accuracy from multiple devices
             test_correct = reduce_variable(test_correct, sgd, reducer)
 
-        # Output the Evaluation Accuracy
+        # Output the evaluation accuracy
         if ((DIST == False) or (sgd.global_rank == 0)):
             print('Evaluation accuracy = %f, Elapsed Time = %fs' %
                   (test_correct / (num_test_batch * batch_size * world_size),
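
Note: the comment reworded in the first hunk above documents the helper that all-reduces NumPy accuracy and loss values across ranks. Only its first lines fall inside the hunk; the full pattern looks roughly like the sketch below (the trailing wait-and-copy-back step is an assumption, since it lies outside the hunk):

    def reduce_variable(variable, dist_opt, reducer):
        # Copy the host-side NumPy value into the device tensor used as a buffer
        reducer.copy_from_numpy(variable)
        # Sum the buffer across all ranks with an NCCL all-reduce
        dist_opt.all_reduce(reducer.data)
        # Assumed tail: wait for the collective to finish, then copy back to NumPy
        dist_opt.wait()
        return tensor.to_numpy(reducer)
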
diff --git a/examples/cnn/autograd/mnist_multiprocess.py b/examples/cnn/autograd/mnist_multiprocess.py
index f5c2763..f51344f 100644
--- a/examples/cnn/autograd/mnist_multiprocess.py
+++ b/examples/cnn/autograd/mnist_multiprocess.py
@@ -26,7 +26,7 @@
     # Generate a NCCL ID to be used for collective communication
     nccl_id = singa.NcclIdHolder()
 
-    # number of GPUs to be used
+    # Number of GPUs to be used
     world_size = int(sys.argv[1])
 
     process = []
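
Note: this hunk ends just before the worker processes are created. The usual launch pattern with the shared NCCL id and world_size shown above is roughly the sketch below; the entry point train_mnist_cnn and its argument order are assumptions, since they are not visible in this diff:

    from multiprocessing import Process

    process = []
    for local_rank in range(world_size):
        # One training process per GPU, each given the shared NCCL id,
        # its local rank and the total number of GPUs (assumed signature)
        process.append(
            Process(target=train_mnist_cnn,
                    args=(nccl_id, local_rank, world_size)))

    for p in process:
        p.start()
    for p in process:
        p.join()
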
diff --git a/examples/cnn/autograd/resnet_cifar10.py b/examples/cnn/autograd/resnet_cifar10.py
index 14005bc..3c6876f 100644
--- a/examples/cnn/autograd/resnet_cifar10.py
+++ b/examples/cnn/autograd/resnet_cifar10.py
@@ -129,7 +129,7 @@
     return categorical
 
 
-# Function to all reduce NUMPY Accuracy and Loss from Multiple Devices
+# Function to all-reduce NumPy accuracy and loss from multiple devices
 def reduce_variable(variable, dist_opt, reducer):
     reducer.copy_from_numpy(variable)
     dist_opt.all_reduce(reducer.data)
@@ -159,7 +159,7 @@
                   nccl_id=None,
                   partial_update=False):
 
-    # Define the hypermeters good for the train_cifar10
+    # Define the hyperparameters for the train_cifar10
     sgd = opt.SGD(lr=0.005, momentum=0.9, weight_decay=1e-5)
     max_epoch = 5
     batch_size = 32
@@ -171,12 +171,13 @@
     num_classes = 10
 
     if DIST:
-        # For Distributed GPU Training
+        # For distributed GPU training
         sgd = opt.DistOpt(sgd,
                           nccl_id=nccl_id,
                           local_rank=local_rank,
                           world_size=world_size)
         dev = device.create_cuda_gpu_on(sgd.local_rank)
+
         # Dataset partition for distributed training
         train_x, train_y = data_partition(train_x, train_y, sgd.global_rank,
                                           sgd.world_size)
@@ -184,7 +185,7 @@
                                         sgd.world_size)
         world_size = sgd.world_size
     else:
-        # For Single GPU
+        # For single GPU
         dev = device.create_cuda_gpu()
         world_size = 1
 
@@ -219,7 +220,7 @@
         if ((DIST == False) or (sgd.global_rank == 0)):
             print('Starting Epoch %d:' % (epoch))
 
-        #Training Phase
+        # Training phase
         autograd.training = True
         train_correct = np.zeros(shape=[1], dtype=np.float32)
         test_correct = np.zeros(shape=[1], dtype=np.float32)
@@ -244,12 +245,12 @@
                 sgd.backward_and_partial_update(loss)
 
         if DIST:
-            # Reduce the Evaluation Accuracy and Loss from Multiple Devices
+            # Reduce the evaluation accuracy and loss from multiple devices
             reducer = tensor.Tensor((1,), dev, tensor.float32)
             train_correct = reduce_variable(train_correct, sgd, reducer)
             train_loss = reduce_variable(train_loss, sgd, reducer)
 
-        # Output the Training Loss and Accuracy
+        # Output the training loss and accuracy
         if ((DIST == False) or (sgd.global_rank == 0)):
             print('Training loss = %f, training accuracy = %f' %
                   (train_loss, train_correct /
@@ -257,11 +258,11 @@
                   flush=True)
 
         if partial_update:
-            # sychronize parameters before evaluation phase
+            # Synchronize parameters before evaluation phase
             for p in param:
                 synchronize(p, sgd)
 
-        #Evaulation Phase
+        # Evaluation phase
         autograd.training = False
         for b in range(num_test_batch):
             x = test_x[b * batch_size:(b + 1) * batch_size]
@@ -274,10 +275,10 @@
                                      to_categorical(y, num_classes))
 
         if DIST:
-            # Reduce the Evaulation Accuracy from Multiple Devices
+            # Reduce the evaluation accuracy from multiple devices
             test_correct = reduce_variable(test_correct, sgd, reducer)
 
-        # Output the Evaluation Accuracy
+        # Output the evaluation accuracy
         if ((DIST == False) or (sgd.global_rank == 0)):
             print('Evaluation accuracy = %f, Elapsed Time = %fs' %
                   (test_correct / (num_test_batch * batch_size * world_size),
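
Note: the first hunk of this file opens on the last line of to_categorical, the one-hot encoder used in the evaluation phase above. A minimal NumPy sketch of that kind of encoder is shown below as an illustration; it is not necessarily the file's exact implementation:

    import numpy as np

    def to_categorical(y, num_classes):
        # Map integer class labels to one-hot rows of length num_classes
        y = np.asarray(y, dtype="int")
        categorical = np.zeros((len(y), num_classes), dtype=np.float32)
        categorical[np.arange(len(y)), y] = 1.0
        return categorical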

diff --git a/examples/cnn/autograd/sparsification_mnist.py b/examples/cnn/autograd/sparsification_mnist.py
index cc9b585..315605a 100644
--- a/examples/cnn/autograd/sparsification_mnist.py
+++ b/examples/cnn/autograd/sparsification_mnist.py
@@ -26,7 +26,7 @@
     # Generate a NCCL ID to be used for collective communication
     nccl_id = singa.NcclIdHolder()
 
-    # number of GPUs to be used
+    # Number of GPUs to be used
     world_size = int(sys.argv[1])
 
     # Use sparsification with parameters
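
Note: all of the distributed examples touched by this pull request shard the training and test sets per rank through data_partition (see the hunks above). A minimal sketch of that helper, assuming equal contiguous slices per rank (its actual implementation is not part of this diff):

    def data_partition(x, y, global_rank, world_size):
        # Give each rank an equal, contiguous slice of the dataset
        data_per_rank = x.shape[0] // world_size
        start = global_rank * data_per_rank
        end = start + data_per_rank
        return x[start:end], y[start:end]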