Merge pull request #27 from chrishkchris/singa3.1

Update of documentation in distributed training and graph
diff --git a/docs-site/docs/dist-train.md b/docs-site/docs/dist-train.md
index c456440..24d956f 100644
--- a/docs-site/docs/dist-train.md
+++ b/docs-site/docs/dist-train.md
@@ -42,37 +42,63 @@
 1. Define the neural network model:
 
 ```python
-class CNN:
-    def __init__(self):
-        self.conv1 = autograd.Conv2d(1, 20, 5, padding=0)
-        self.conv2 = autograd.Conv2d(20, 50, 5, padding=0)
-        self.linear1 = autograd.Linear(4 * 4 * 50, 500)
-        self.linear2 = autograd.Linear(500, 10)
-        self.pooling1 = autograd.MaxPool2d(2, 2, padding=0)
-        self.pooling2 = autograd.MaxPool2d(2, 2, padding=0)
+class CNN(model.Model):
+
+    def __init__(self, num_classes=10, num_channels=1):
+        super(CNN, self).__init__()
+        self.conv1 = layer.Conv2d(num_channels, 20, 5, padding=0, activation="RELU")
+        self.conv2 = layer.Conv2d(20, 50, 5, padding=0, activation="RELU")
+        self.linear1 = layer.Linear(500)
+        self.linear2 = layer.Linear(num_classes)
+        self.pooling1 = layer.MaxPool2d(2, 2, padding=0)
+        self.pooling2 = layer.MaxPool2d(2, 2, padding=0)
+        self.relu = layer.ReLU()
+        self.flatten = layer.Flatten()
+        self.softmax_cross_entropy = layer.SoftMaxCrossEntropy()
 
     def forward(self, x):
         y = self.conv1(x)
-        y = autograd.relu(y)
         y = self.pooling1(y)
         y = self.conv2(y)
-        y = autograd.relu(y)
         y = self.pooling2(y)
-        y = autograd.flatten(y)
+        y = self.flatten(y)
         y = self.linear1(y)
-        y = autograd.relu(y)
+        y = self.relu(y)
         y = self.linear2(y)
         return y
 
+    def train_one_batch(self, x, y, dist_option='fp32', spars=0):
+        out = self.forward(x)
+        loss = self.softmax_cross_entropy(out, y)
+
+        # Allow different options for distributed training
+        # See the section "Optimizations for Distributed Training"
+        if dist_option == 'fp32':
+            self.optimizer(loss)
+        elif dist_option == 'fp16':
+            self.optimizer.backward_and_update_half(loss)
+        elif dist_option == 'partialUpdate':
+            self.optimizer.backward_and_partial_update(loss)
+        elif dist_option == 'sparseTopK':
+            self.optimizer.backward_and_sparse_update(loss,
+                                                      topK=True,
+                                                      spars=spars)
+        elif dist_option == 'sparseThreshold':
+            self.optimizer.backward_and_sparse_update(loss,
+                                                      topK=False,
+                                                      spars=spars)
+        return out, loss
+
 # create model
 model = CNN()
 ```
 
-2. Create the `DistOpt` instance:
+2. Create the `DistOpt` instance and attach it to the model:
 
 ```python
 sgd = opt.SGD(lr=0.005, momentum=0.9, weight_decay=1e-5)
 sgd = opt.DistOpt(sgd)
+model.set_optimizer(sgd)
 dev = device.create_cuda_gpu_on(sgd.local_rank)
 ```
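+
+Each process creates its own `DistOpt` instance. Besides wrapping the SGD
+optimizer, it exposes attributes such as `local_rank` (used above to select the
+GPU) and `world_size`. A minimal sketch that restricts logging to a single
+process, assuming `DistOpt` also provides a `global_rank` attribute:
+
+```python
+# report the configuration from one process only (global_rank is assumed here)
+if sgd.global_rank == 0:
+    print("Distributed training with %d processes" % sgd.world_size)
+```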
 
@@ -116,26 +142,23 @@
 
 A partition of the dataset is returned for this `dev`.
 
+Here, `world_size` represents the total number of processes in all the nodes you
+are using for distributed training.
+
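+As an illustration of how `world_size` is used, a minimal sketch of slicing the
+training data so that each process trains on its own shard (`train_x` and
+`train_y` are assumed to be NumPy arrays holding the full dataset, and
+`global_rank` is an assumed `DistOpt` attribute):
+
+```python
+# partition the dataset evenly across all processes
+data_per_rank = train_x.shape[0] // sgd.world_size
+idx_start = sgd.global_rank * data_per_rank
+idx_end = idx_start + data_per_rank
+train_x = train_x[idx_start:idx_end]
+train_y = train_y[idx_start:idx_end]
+```
+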
 4. Initialize and synchronize the model parameters among all workers:
 
 ```python
-def synchronize(tensor, dist_opt):
-    dist_opt.all_reduce(tensor.data)
-    tensor /= dist_opt.world_size
-
 #Synchronize the initial parameter
 tx = tensor.Tensor((batch_size, 1, IMG_SIZE, IMG_SIZE), dev, tensor.float32)
 ty = tensor.Tensor((batch_size, num_classes), dev, tensor.int32)
+model.compile([tx], is_train=True, use_graph=graph, sequential=True)
 ...
-out = model.forward(tx)
-loss = autograd.softmax_cross_entropy(out, ty)
-for p, g in autograd.backward(loss):
-    synchronize(p, sgd)
+#Use the same random seed for different ranks
+seed = 0
+dev.SetRandSeed(seed)
+np.random.seed(seed)
 ```
 
-Here, `world_size` represents the total number of processes in all the nodes you
-are using for distributed training.
-
 5. Run BackPropagation and distributed SGD
 
 ```python
@@ -149,6 +172,8 @@
-        loss = autograd.softmax_cross_entropy(out, ty)
-        # do backpropagation and all-reduce
-        sgd.backward_and_update(loss)
+        # Train the model
+        out, loss = model(tx, ty)
 ```
 
 ### Execution Instruction
@@ -321,10 +346,18 @@
 reduce the communication cost. Refer to the API for `DistOpt` for the
 configuration of each strategy.
 
+When we use `model.Model` to build a model, the options for distributed training
+are configured in the `train_one_batch` method. Please refer to the example code
+at the top of this page. The code handling these options can simply be copied and
+reused in other models.
+
+With the options defined, we can pass the arguments `dist_option` and `spars`
+when starting the training with `model(tx, ty, dist_option, spars)`.
+
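+For example, a sketch of a training loop that forwards these arguments (the
+value of `spars` below is purely illustrative):
+
+```python
+# choose the top-K sparsification scheme for every mini-batch
+for b in range(num_train_batch):
+    # ... copy the current mini-batch into tx and ty ...
+    out, loss = model(tx, ty, dist_option='sparseTopK', spars=0.05)
+```
+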
 ### No Optimizations
 
 ```python
-sgd.backward_and_update(loss)
+out, loss = model(tx, ty)
 ```
 
 `loss` is the output tensor from the loss function, e.g., cross-entropy for
@@ -333,7 +366,7 @@
 ### Half-precision Gradients
 
 ```python
-sgd.backward_and_update_half(loss)
+out, loss = model(tx, ty, dist_option = 'fp16')
 ```
 
 It converts each gradient value to 16-bit representation (i.e., half-precision)
@@ -342,7 +375,7 @@
 ### Partial Synchronization
 
 ```python
-sgd.backward_and_partial_update(loss)
+out, loss = model(tx, ty, dist_option = 'partialUpdate')
 ```
 
 In each iteration, every rank does the local SGD update. Then, only a chunk of
@@ -351,10 +384,6 @@
 
 ### Gradient Sparsification
 
-```python
-sgd.backward_and_sparse_update(loss)
-```
-
 It applies sparsification schemes to select a subset of gradients for
 all-reduce. There are two schemes:
 
@@ -362,14 +391,14 @@
   elements selected.
 
 ```python
-sgd.backward_and_sparse_update(loss = loss, spars = spars, topK = True)
+out, loss = model(tx, ty, dist_option = 'sparseTopK', spars = spars)
 ```
 
- All gradients whose absolute values are larger than the predefined threshold `spars`
   are selected.
 
 ```python
-sgd.backward_and_sparse_update(loss = loss, spars = spars, topK = False)
+out, loss = model(tx, ty, dist_option = 'sparseThreshold', spars = spars)
 ```
 
 The hyper-parameters are configured when creating the `DistOpt` instance.
diff --git a/docs-site/docs/graph.md b/docs-site/docs/graph.md
index fb4ba69..abce638 100644
--- a/docs-site/docs/graph.md
+++ b/docs-site/docs/graph.md
@@ -18,59 +18,60 @@
 
 ## Example
 
-The following code illustrates the usage of the `Module` API.
+The following code illustrates the usage of the `Model` API.
 
-1. Implement the new model as a subclass the Module class.
+1. Implement the new model as a subclass of the Model class.
 
 ```Python
-class CNN(module.Module):
+class CNN(model.Model):
 
-    def __init__(self, optimizer):
+    def __init__(self, num_classes=10, num_channels=1):
         super(CNN, self).__init__()
-        # define layers
-        self.conv1 = autograd.Conv2d(1, 20, 5, padding=0)
-        self.conv2 = autograd.Conv2d(20, 50, 5, padding=0)
-        self.linear1 = autograd.Linear(4 * 4 * 50, 500)
-        self.linear2 = autograd.Linear(500, 10)
-        self.pooling1 = autograd.MaxPool2d(2, 2, padding=0)
-        self.pooling2 = autograd.MaxPool2d(2, 2, padding=0)
-
-        self.optimizer = optimizer
+        self.conv1 = layer.Conv2d(num_channels, 20, 5, padding=0, activation="RELU")
+        self.conv2 = layer.Conv2d(20, 50, 5, padding=0, activation="RELU")
+        self.linear1 = layer.Linear(500)
+        self.linear2 = layer.Linear(num_classes)
+        self.pooling1 = layer.MaxPool2d(2, 2, padding=0)
+        self.pooling2 = layer.MaxPool2d(2, 2, padding=0)
+        self.relu = layer.ReLU()
+        self.flatten = layer.Flatten()
+        self.softmax_cross_entropy = layer.SoftMaxCrossEntropy()
 
     def forward(self, x):
-        # define the forward operations
         y = self.conv1(x)
-        y = autograd.relu(y)
         y = self.pooling1(y)
         y = self.conv2(y)
-        y = autograd.relu(y)
         y = self.pooling2(y)
-        y = autograd.flatten(y)
+        y = self.flatten(y)
         y = self.linear1(y)
-        y = autograd.relu(y)
+        y = self.relu(y)
         y = self.linear2(y)
         return y
 
-    def loss(self, x, ty):
-        # define the training loss
-        return autograd.softmax_cross_entropy(x, ty)
-
-    def optim(self, loss):
-        # update the parameters using SGD algorithms
-        self.optimizer.backward_and_update(loss)
+    def train_one_batch(self, x, y):
+        out = self.forward(x)
+        loss = self.softmax_cross_entropy(out, y)
+        self.optimizer(loss)
+        return out, loss
 ```
 
-2. Create an instance of the model, and do some configurations
+2. Create instances of the model, optimizer, and device, then compile the model.
 
 ```python
-model = CNN(sgd)
-# set the mode of running the operations:
-# True for training; False for evaluation
-model.train(mode=True)
-# set the device for running the operations
-model.on_device(dev)
-# whether to create the graph or run the operations imperatively
-model.graph(mode=True)
+model = CNN()
+
+# initialize optimizer and attach it to the model
+sgd = opt.SGD(lr=0.005, momentum=0.9, weight_decay=1e-5)
+model.set_optimizer(sgd)
+
+# initialize device
+dev = device.create_cuda_gpu()
+
+# input and target placeholders for the model
+tx = tensor.Tensor((batch_size, 1, IMG_SIZE, IMG_SIZE), dev, tensor.float32)
+ty = tensor.Tensor((batch_size, num_classes), dev, tensor.int32)
+
+# compile the model before training
+model.compile([tx], is_train=True, use_graph=True, sequential=False)
 ```
 
 3. Train the model iteratively
@@ -84,11 +85,8 @@
     tx.copy_from_numpy(x)
     ty.copy_from_numpy(y)
 
-    # run forward propagation
-    out = model(tx)
-    loss = model.loss(out, ty)
-    # run backward propagation
-    model.optim(loss)
+    # Training with one batch
+    out, loss = model(tx, ty)
 ```
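+
+After training, the same object can be switched out of training mode for
+inference. A minimal sketch, assuming the `Model` API keeps the `train()`/`eval()`
+mode switches of the earlier `Module` API:
+
+```python
+# switch to evaluation mode so that calling the model runs forward() only
+model.eval()
+out = model(tx)
+```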
 
 A Google Colab notebook of this example is available
@@ -116,13 +114,19 @@
 class
 
 ```python
-class MLP(module.Module):
+class MLP(model.Model):
+
+    def __init__(self, data_size=10, perceptron_size=100, num_classes=10):
+        super(MLP, self).__init__()
+        self.linear1 = layer.Linear(perceptron_size)
+        ...
 
     def forward(self, inputs):
-        x = autograd.matmul(inputs, self.w0)
+        y = self.linear1(inputs)
         ...
 ```
 
+The `Linear` layer is implemented with the `matmul` operator.
 `autograd` implements the `matmul` operator by calling the function `Mult`
 exposed from CPP via SWIG.
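+
+A minimal sketch of calling the operator directly (the shapes below are chosen
+only for illustration):
+
+```python
+from singa import autograd, tensor
+
+# two small operands on the default (host) device
+x = tensor.Tensor((4, 10))
+x.gaussian(0.0, 0.1)
+w = tensor.Tensor((10, 100))
+w.gaussian(0.0, 0.1)
+
+# multiply them with the matmul operator; the result has shape (4, 100)
+y = autograd.matmul(x, w)
+```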
 
@@ -148,7 +152,7 @@
 addition, it also has the information about the blocks (a block is a chunk of
 memory for a tensor) to be read and written by this function.
 
-Once `Module.forward()` has been executed once, all operations are buffered by
+After `Model.forward()` has been executed once, all operations are buffered by
 `Device`. Next, the read/write information of all operations is analyzed to
 create the computational graph. For example, if a block `b` is written by one
 operation O1 and is later read by another operation O2, we would know O2 depends