Merge updates from Chris
diff --git a/docs-site/docs/autograd.md b/docs-site/docs/autograd.md
index ece3b53..20cfaa5 100644
--- a/docs-site/docs/autograd.md
+++ b/docs-site/docs/autograd.md
@@ -192,7 +192,7 @@
 
 ### Using the Model API
 
-The following
+The following
 [example](https://github.com/apache/singa/blob/master/examples/cnn/model/cnn.py)
 implements a CNN model using the [Model API](./graph).
 
@@ -206,57 +206,78 @@
 ```python
 class MLP(model.Model):  # the model is a subclass of Model
 
-    def __init__(self, optimizer):
+    def __init__(self, data_size=10, perceptron_size=100, num_classes=10):
         super(MLP, self).__init__()
 
         # init the operators, layers and other objects
-        self.w0 = Tensor(shape=(2, 3), requires_grad=True, stores_grad=True)
-        self.w0.gaussian(0.0, 0.1)
-        self.b0 = Tensor(shape=(3,), requires_grad=True, stores_grad=True)
-        self.b0.set_value(0.0)
-
-        self.w1 = Tensor(shape=(3, 2), requires_grad=True, stores_grad=True)
-        self.w1.gaussian(0.0, 0.1)
-        self.b1 = Tensor(shape=(2,), requires_grad=True, stores_grad=True)
-        self.b1.set_value(0.0)
-
-        # init the optimizer
-        self.optimizer = optimizer
+        self.relu = layer.ReLU()
+        self.linear1 = layer.Linear(perceptron_size)
+        self.linear2 = layer.Linear(num_classes)
+        self.softmax_cross_entropy = layer.SoftMaxCrossEntropy()
 
     def forward(self, inputs):  # define the forward function
-        x = autograd.matmul(inputs, self.w0)
-        x = autograd.add_bias(x, self.b0)
-        x = autograd.relu(x)
-        x = autograd.matmul(x, self.w1)
-        x = autograd.add_bias(x, self.b1)
-        return x
+        y = self.linear1(inputs)
+        y = self.relu(y)
+        y = self.linear2(y)
+        return y
 
-    def loss(self, out, target): # define the loss function
-        # can use the loss operations provided by SINGA or self-defined function
-        return autograd.softmax_cross_entropy(out, target)
+    def train_one_batch(self, x, y):
+        out = self.forward(x)
+        loss = self.softmax_cross_entropy(out, y)
+        self.optimizer(loss)
+        return out, loss
 
-    def optim(self, loss):       # define the optim function
-        # can use the optimizer provided by SINGA or self-defined function
-        return self.optimizer.backward_and_update(loss)
+    def set_optimizer(self, optimizer):  # attach an optimizer
+        self.optimizer = optimizer
 ```
 
 #### Training
 
 ```python
 # create a model instance
-model = MLP(sgd)
-# declare what device to train on
-model.on_device(dev)
-# declare execution mode and order
-model.graph(graph, sequential)
+model = MLP()
+# initialize optimizer and attach it to the model
+sgd = opt.SGD(lr=0.005, momentum=0.9, weight_decay=1e-5)
+model.set_optimizer(sgd)
+# input and target placeholders for the model
+tx = tensor.Tensor((batch_size, 1, IMG_SIZE, IMG_SIZE), dev, tensor.float32)
+ty = tensor.Tensor((batch_size, num_classes), dev, tensor.int32)
+# compile the model before training
+model.compile([tx], is_train=True, use_graph=True, sequential=False)
 
-for i in range(niters):
-    out = model(inputs)
-    loss = model.loss(out, target)
-    model.optim(loss)
+# train the model iteratively
+for b in range(num_train_batch):
+    # generate the next mini-batch
+    x, y = ...
 
-    if i % (niters / 10) == 0 and rank_in_global == 0:
-        print("training loss = ", tensor.to_numpy(loss)[0], flush=True)
+    # Copy the data into input tensors
+    tx.copy_from_numpy(x)
+    ty.copy_from_numpy(y)
+
+    # Training with one batch
+    out, loss = model(tx, ty)
+```
+
+#### Save a model checkpoint
+
+```python
+# define the path to save the checkpoint
+checkpointpath = "checkpoint.zip"
+
+# save a checkpoint
+model.save_states(fpath=checkpointpath)
+```
+
+#### Load a model checkpoint
+
+```python
+import os
+
+# define the path to load the checkpoint
+checkpointpath = "checkpoint.zip"
+
+# load a checkpoint if it exists
+if os.path.exists(checkpointpath):
+    model.load_states(fpath=checkpointpath)
 ```
 
 ### Python API
diff --git a/docs-site/docs/graph.md b/docs-site/docs/graph.md
index e08e69f..bf6bac0 100644
--- a/docs-site/docs/graph.md
+++ b/docs-site/docs/graph.md
@@ -71,6 +71,7 @@
 
 # initialize optimizer and attach it to the model
 sgd = opt.SGD(lr=0.005, momentum=0.9, weight_decay=1e-5)
+model.set_optimizer(sgd)
 
 # initialize device
 dev = device.create_cuda_gpu()
diff --git a/docs-site/docs/optimizer.md b/docs-site/docs/optimizer.md
new file mode 100644
index 0000000..4949471
--- /dev/null
+++ b/docs-site/docs/optimizer.md
@@ -0,0 +1,127 @@
+---
+id: optimizer
+title: Optimizer
+---
+
+<!--- Licensed to the Apache Software Foundation (ASF) under one or more contributor license agreements.  See the NOTICE file distributed with this work for additional information regarding copyright ownership.  The ASF licenses this file to you under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License.  You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  See the License for the specific language governing permissions and limitations under the License.  -->
+
+SINGA supports various popular optimizers, including stochastic gradient
+descent with momentum, Adam, RMSProp, and AdaGrad. For each optimizer, a decay
+scheduler can be used to schedule the learning rate applied in different
+epochs. The optimizers and the decay schedulers are included in `singa/opt.py`.
+
+## Create an optimizer
+
+1. SGD with momentum
+
+```python
+# define hyperparameter learning rate
+lr = 0.001
+# define hyperparameter momentum
+momentum = 0.9
+# define hyperparameter weight decay
+weight_decay = 0.0001
+
+from singa import opt
+sgd = opt.SGD(lr=lr, momentum=momentum, weight_decay=weight_decay)
+```
+
+2. RMSProp
+
+```python
+# define hyperparameter learning rate
+lr = 0.001
+# define hyperparameter rho
+rho = 0.9
+# define hyperparameter epsilon
+epsilon = 1e-8
+# define hyperparameter weight decay
+weight_decay = 0.0001
+
+from singa import opt
+rmsprop = opt.RMSProp(lr=lr, rho=rho, epsilon=epsilon, weight_decay=weight_decay)
+```
+
+3. AdaGrad
+
+```python
+# define hyperparameter learning rate
+lr = 0.001
+# define hyperparameter epsilon
+epsilon = 1e-8
+# define hyperparameter weight decay
+weight_decay = 0.0001
+
+from singa import opt
+adagrad = opt.AdaGrad(lr=lr, epsilon=epsilon, weight_decay=weight_decay)
+```
+
+4. Adam
+
+```python
+# define hyperparameter learning rate
+lr = 0.001
+# define hyperparameter beta 1
+beta_1 = 0.9
+# define hyperparameter beta 2
+beta_2 = 0.999
+# define hyperparameter epsilon
+epsilon = 1e-8
+# define hyperparameter weight decay
+weight_decay = 0.0001
+
+from singa import opt
+adam = opt.Adam(lr=lr, beta_1=beta_1, beta_2=beta_2, epsilon=epsilon, weight_decay=weight_decay)
+```
+
+## Create a Decay Scheduler
+
+```python
+from singa import opt
+
+# define initial learning rate
+lr_init = 0.001
+# define the rate of decay in the decay scheduler
+decay_rate = 0.95
+# define whether the learning rate schedule is a staircase shape
+staircase = True
+# define the decay step of the decay scheduler (here the lr is decreased every 2 steps)
+decay_steps = 2
+
+# create the decay scheduler; the scheduled lr becomes lr_init * (decay_rate ^ (step // decay_steps))
+lr = opt.ExponentialDecay(lr_init, decay_steps, decay_rate, staircase)
+# Use the lr to create an optimizer
+sgd = opt.SGD(lr=lr, momentum=0.9, weight_decay=0.0001)
+```
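+
+To make the staircase schedule concrete, here is a small plain-Python sketch
+(independent of the SINGA API) that evaluates the formula
+`lr_init * (decay_rate ** (step // decay_steps))` from the comment above for
+the first few steps:
+
+```python
+lr_init = 0.001
+decay_rate = 0.95
+decay_steps = 2
+
+for step in range(6):
+    # the exponent only increases once every decay_steps steps
+    lr_at_step = lr_init * (decay_rate ** (step // decay_steps))
+    print(step, lr_at_step)
+# steps 0-1 keep 0.001, steps 2-3 drop to ~0.00095, steps 4-5 to ~0.0009
+```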
+
+## Use the optimizer in Model API
+
+When we create the model, we need to attach an optimizer to it.
+
+```python
+# create a CNN using the Model API
+model = CNN()
+
+# initialize optimizer and attach it to the model
+sgd = opt.SGD(lr=0.005, momentum=0.9, weight_decay=1e-5)
+model.set_optimizer(sgd)
+```
+
+Then, when we call the model, it runs the `train_one_batch` method, which uses
+the attached optimizer.
+
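+For reference, a minimal `train_one_batch` (mirroring the MLP example in the
+[autograd](./autograd) documentation) computes the loss and passes it to the
+attached optimizer:
+
+```python
+# inside a subclass of model.Model (see the MLP example in the autograd docs)
+def train_one_batch(self, x, y):
+    out = self.forward(x)                      # forward pass through the model
+    loss = self.softmax_cross_entropy(out, y)  # compute the training loss
+    self.optimizer(loss)                       # backward pass and parameter update
+    return out, loss
+```
+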
+Hence, an example of an iterative loop to optimize the model is:
+
+```python
+for b in range(num_train_batch):
+    # generate the next mini-batch
+    x, y = ...
+
+    # Copy the data into input tensors
+    tx.copy_from_numpy(x)
+    ty.copy_from_numpy(y)
+
+    # Training with one batch
+    out, loss = model(tx, ty)
+```
diff --git a/docs-site/docs/time-profiling.md b/docs-site/docs/time-profiling.md
new file mode 100644
index 0000000..8f9fda1
--- /dev/null
+++ b/docs-site/docs/time-profiling.md
@@ -0,0 +1,165 @@
+---
+id: time-profiling
+title: Time Profiling
+---
+
+<!--- Licensed to the Apache Software Foundation (ASF) under one or more contributor license agreements.  See the NOTICE file distributed with this work for additional information regarding copyright ownership.  The ASF licenses this file to you under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License.  You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  See the License for the specific language governing permissions and limitations under the License.  -->
+
+SINGA supports time profiling of each operator buffered in the graph. To use
+the time profiling function, we first call the `device.SetVerbosity` method to
+set the verbosity of the time profiler, and then call
+`device.PrintTimeProfiling` to print out the time profiling results.
+
+## Setup the Time Profiling Verbosity
+
+To use the time profiling function, we need to set the verbosity. There are
+three levels of verbosity. With the default value `verbosity == 0`, no time
+profiling is done. With `verbosity == 1`, the forward and backward propagation
+time is profiled. With `verbosity == 2`, the time spent on every buffered
+operation in the graph is profiled.
+
+The following example code sets up the time profiling function:
+
+```python
+# create a device
+from singa import device
+dev = device.create_cuda_gpu()
+# set the verbosity
+verbosity = 2
+dev.SetVerbosity(verbosity)
+# optional: skip the first 5 iterations when profiling the time
+dev.SetSkipIteration(5)
+```
+
+Then, after training completes at the end of the program, we can print the
+time profiling results by calling the `device.PrintTimeProfiling` method:
+
+```python
+dev.PrintTimeProfiling()
+```
+
+## Example Outputs for Different Verbosity Levels
+
+We can run the ResNet
+[example](https://github.com/apache/singa/blob/master/examples/cnn/benchmark.py)
+to see the output for different verbosity settings:
+
+1. `verbosity == 1`
+
+```
+Time Profiling:
+Forward Propagation Time : 0.0409127 sec
+Backward Propagation Time : 0.114813 sec
+```
+
+2. `verbosity == 2`
+
+```
+Time Profiling:
+OP_ID0. SetValue : 1.73722e-05 sec
+OP_ID1. cudnnConvForward : 0.000612724 sec
+OP_ID2. GpuBatchNormForwardTraining : 0.000559449 sec
+OP_ID3. ReLU : 0.000375004 sec
+OP_ID4. GpuPoolingForward : 0.000240041 sec
+OP_ID5. SetValue : 3.4176e-06 sec
+OP_ID6. cudnnConvForward : 0.000115619 sec
+OP_ID7. GpuBatchNormForwardTraining : 0.000150415 sec
+OP_ID8. ReLU : 9.95494e-05 sec
+OP_ID9. SetValue : 3.22432e-06 sec
+OP_ID10. cudnnConvForward : 0.000648668 sec
+OP_ID11. GpuBatchNormForwardTraining : 0.000149793 sec
+OP_ID12. ReLU : 9.92118e-05 sec
+OP_ID13. SetValue : 3.37728e-06 sec
+OP_ID14. cudnnConvForward : 0.000400953 sec
+OP_ID15. GpuBatchNormForwardTraining : 0.000572181 sec
+OP_ID16. SetValue : 3.21312e-06 sec
+OP_ID17. cudnnConvForward : 0.000398698 sec
+OP_ID18. GpuBatchNormForwardTraining : 0.00056836 sec
+OP_ID19. Add : 0.000542246 sec
+OP_ID20. ReLU : 0.000372783 sec
+OP_ID21. SetValue : 3.25312e-06 sec
+OP_ID22. cudnnConvForward : 0.000260731 sec
+OP_ID23. GpuBatchNormForwardTraining : 0.000149041 sec
+OP_ID24. ReLU : 9.9072e-05 sec
+OP_ID25. SetValue : 3.10592e-06 sec
+OP_ID26. cudnnConvForward : 0.000637481 sec
+OP_ID27. GpuBatchNormForwardTraining : 0.000152577 sec
+OP_ID28. ReLU : 9.90518e-05 sec
+OP_ID29. SetValue : 3.28224e-06 sec
+OP_ID30. cudnnConvForward : 0.000404586 sec
+OP_ID31. GpuBatchNormForwardTraining : 0.000569679 sec
+OP_ID32. Add : 0.000542291 sec
+OP_ID33. ReLU : 0.00037211 sec
+OP_ID34. SetValue : 3.13696e-06 sec
+OP_ID35. cudnnConvForward : 0.000261219 sec
+OP_ID36. GpuBatchNormForwardTraining : 0.000148281 sec
+OP_ID37. ReLU : 9.89299e-05 sec
+OP_ID38. SetValue : 3.25216e-06 sec
+OP_ID39. cudnnConvForward : 0.000633644 sec
+OP_ID40. GpuBatchNormForwardTraining : 0.000150711 sec
+OP_ID41. ReLU : 9.84902e-05 sec
+OP_ID42. SetValue : 3.18176e-06 sec
+OP_ID43. cudnnConvForward : 0.000402752 sec
+OP_ID44. GpuBatchNormForwardTraining : 0.000571523 sec
+OP_ID45. Add : 0.000542435 sec
+OP_ID46. ReLU : 0.000372539 sec
+OP_ID47. SetValue : 3.24672e-06 sec
+OP_ID48. cudnnConvForward : 0.000493054 sec
+OP_ID49. GpuBatchNormForwardTraining : 0.000293142 sec
+OP_ID50. ReLU : 0.000190047 sec
+OP_ID51. SetValue : 3.14784e-06 sec
+OP_ID52. cudnnConvForward : 0.00148837 sec
+OP_ID53. GpuBatchNormForwardTraining : 8.34794e-05 sec
+OP_ID54. ReLU : 5.23254e-05 sec
+OP_ID55. SetValue : 3.40096e-06 sec
+OP_ID56. cudnnConvForward : 0.000292971 sec
+OP_ID57. GpuBatchNormForwardTraining : 0.00029174 sec
+OP_ID58. SetValue : 3.3248e-06 sec
+OP_ID59. cudnnConvForward : 0.000590154 sec
+OP_ID60. GpuBatchNormForwardTraining : 0.000294149 sec
+OP_ID61. Add : 0.000275119 sec
+OP_ID62. ReLU : 0.000189268 sec
+OP_ID63. SetValue : 3.2704e-06 sec
+OP_ID64. cudnnConvForward : 0.000341232 sec
+OP_ID65. GpuBatchNormForwardTraining : 8.3304e-05 sec
+OP_ID66. ReLU : 5.23667e-05 sec
+OP_ID67. SetValue : 3.19936e-06 sec
+OP_ID68. cudnnConvForward : 0.000542484 sec
+OP_ID69. GpuBatchNormForwardTraining : 8.60537e-05 sec
+OP_ID70. ReLU : 5.2479e-05 sec
+OP_ID71. SetValue : 3.41824e-06 sec
+OP_ID72. cudnnConvForward : 0.000291295 sec
+OP_ID73. GpuBatchNormForwardTraining : 0.000292795 sec
+OP_ID74. Add : 0.000274438 sec
+OP_ID75. ReLU : 0.000189689 sec
+OP_ID76. SetValue : 3.21984e-06 sec
+OP_ID77. cudnnConvForward : 0.000338776 sec
+OP_ID78. GpuBatchNormForwardTraining : 8.484e-05 sec
+OP_ID79. ReLU : 5.29408e-05 sec
+OP_ID80. SetValue : 3.18208e-06 sec
+OP_ID81. cudnnConvForward : 0.000545542 sec
+OP_ID82. GpuBatchNormForwardTraining : 8.40976e-05 sec
+OP_ID83. ReLU : 5.2256e-05 sec
+OP_ID84. SetValue : 3.36256e-06 sec
+OP_ID85. cudnnConvForward : 0.000293003 sec
+OP_ID86. GpuBatchNormForwardTraining : 0.0002989 sec
+OP_ID87. Add : 0.000275041 sec
+OP_ID88. ReLU : 0.000189867 sec
+OP_ID89. SetValue : 3.1184e-06 sec
+OP_ID90. cudnnConvForward : 0.000340417 sec
+OP_ID91. GpuBatchNormForwardTraining : 8.39395e-05 sec
+OP_ID92. ReLU : 5.26544e-05 sec
+OP_ID93. SetValue : 3.2336e-06 sec
+OP_ID94. cudnnConvForward : 0.000539787 sec
+OP_ID95. GpuBatchNormForwardTraining : 8.2753e-05 sec
+OP_ID96. ReLU : 4.86758e-05 sec
+OP_ID97. SetValue : 3.24384e-06 sec
+OP_ID98. cudnnConvForward : 0.000287108 sec
+OP_ID99. GpuBatchNormForwardTraining : 0.000293127 sec
+OP_ID100. Add : 0.000269478 sec
+.
+.
+.
+```
diff --git a/docs-site/website/sidebars.json b/docs-site/website/sidebars.json
index 2db3746..7bbe66a 100644
--- a/docs-site/website/sidebars.json
+++ b/docs-site/website/sidebars.json
@@ -9,9 +9,11 @@
       "device",
       "tensor",
       "autograd",
+      "optimizer",
       "graph",
       "onnx",
-      "dist-train"
+      "dist-train",
+      "time-profiling"
     ],
     "Development": [
       "download-singa",