update docs due to the change from the Module API to the Model API
diff --git a/docs-site/docs/autograd.md b/docs-site/docs/autograd.md
index 4d42070..bc28a8c 100644
--- a/docs-site/docs/autograd.md
+++ b/docs-site/docs/autograd.md
@@ -190,73 +190,72 @@
             sgd.update(p, gp)
 ```
 
-### Using the Module API
+### Using the Model API
 
 The following
 [example](https://github.com/apache/singa/blob/master/examples/autograd/cnn_module.py)
-implements a CNN model using the Module provided by the module.
+implements a CNN model using the Model class provided by the model module.
 
-#### Define the subclass of Module
+#### Define the subclass of Model
 
-Define the model class, it should be the subclass of the Module. In this way,
-all operations used during traing phase will form a calculation graph and will
-be analyzed. The operations in the graph will be scheduled and executed
-efficiently. Layers can also be included in the module class.
+Define the model class as a subclass of Model. In this way, all operations
+used during the training phase will form a computational graph and will be
+analyzed. The operations in the graph will be scheduled and executed
+efficiently. Layers can also be included in the model class.
 
 ```python
-class MLP(module.Module):  # the model is a subclass of Module
+class MLP(model.Model):  # the model is a subclass of Model
 
-    def __init__(self, optimizer):
+    def __init__(self, data_size=10, perceptron_size=100, num_classes=10):
         super(MLP, self).__init__()
 
         # init the operators, layers and other objects
-        self.w0 = Tensor(shape=(2, 3), requires_grad=True, stores_grad=True)
-        self.w0.gaussian(0.0, 0.1)
-        self.b0 = Tensor(shape=(3,), requires_grad=True, stores_grad=True)
-        self.b0.set_value(0.0)
-
-        self.w1 = Tensor(shape=(3, 2), requires_grad=True, stores_grad=True)
-        self.w1.gaussian(0.0, 0.1)
-        self.b1 = Tensor(shape=(2,), requires_grad=True, stores_grad=True)
-        self.b1.set_value(0.0)
-
-        # init the optimizer
-        self.optimizer = optimizer
+        self.relu = layer.ReLU()
+        self.linear1 = layer.Linear(perceptron_size)
+        self.linear2 = layer.Linear(num_classes)
+        self.softmax_cross_entropy = layer.SoftMaxCrossEntropy()
 
     def forward(self, inputs):  # define the forward function
-        x = autograd.matmul(inputs, self.w0)
-        x = autograd.add_bias(x, self.b0)
-        x = autograd.relu(x)
-        x = autograd.matmul(x, self.w1)
-        x = autograd.add_bias(x, self.b1)
-        return x
+        y = self.linear1(inputs)
+        y = self.relu(y)
+        y = self.linear2(y)
+        return y
 
-    def loss(self, out, target): # define the loss function
-        # can use the loss operations provided by SINGA or self-defined function
-        return autograd.softmax_cross_entropy(out, target)
+    def train_one_batch(self, x, y):
+        out = self.forward(x)                      # forward propagation
+        loss = self.softmax_cross_entropy(out, y)  # compute the loss
+        self.optimizer(loss)                       # backward propagation and parameter update
+        return out, loss
 
-    def optim(self, loss):       # define the optim function
-        # can use the optimizer provided by SINGA or self-defined function
-        return self.optimizer.backward_and_update(loss)
+    def set_optimizer(self, optimizer):  # attach an optimizer
+        self.optimizer = optimizer
 ```
 
 #### Training
 
 ```python
 # create a model instance
-model = MLP(sgd)
-# declare what device to train on
-model.on_device(dev)
-# declare execution mode and order
-model.graph(graph, sequential)
+model = MLP()
+# initialize optimizer and attach it to the model
+sgd = opt.SGD(lr=0.005, momentum=0.9, weight_decay=1e-5)
+model.set_optimizer(sgd)
+# input and target placeholders for the model
+tx = tensor.Tensor((batch_size, 1, IMG_SIZE, IMG_SIZE), dev, tensor.float32)
+ty = tensor.Tensor((batch_size, num_classes), dev, tensor.int32)
+# compile the model before training: use_graph=True buffers the operations
+# into a computational graph, and sequential=False lets the scheduler decide
+# the execution order instead of running them in the buffered (serial) order
+model.compile([tx], is_train=True, use_graph=True, sequential=False)
 
-for i in range(niters):
-    out = model(inputs)
-    loss = model.loss(out, target)
-    model.optim(loss)
+# train the model iteratively
+for b in range(num_train_batch):
+    # generate the next mini-batch
+    x, y = ...
 
-    if i % (niters / 10) == 0 and rank_in_global == 0:
-        print("training loss = ", tensor.to_numpy(loss)[0], flush=True)
+    # Copy the data into input tensors
+    tx.copy_from_numpy(x)
+    ty.copy_from_numpy(y)
+
+    # Train with one batch; in training mode, calling the model
+    # instance dispatches to train_one_batch
+    out, loss = model(tx, ty)
 ```
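+
+After `compile`, the model can also be switched to evaluation mode for
+inference. A minimal sketch, assuming the `model.eval()` mode switch of the
+Model API and a hypothetical test batch `x_test`:
+
+```python
+# switch the model from training mode to evaluation mode
+model.eval()
+
+# with only the input tensor passed, the call runs forward()
+# instead of train_one_batch()
+tx.copy_from_numpy(x_test)
+out_test = model(tx)
+```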
 
 ### Python API
diff --git a/docs-site/docs/graph.md b/docs-site/docs/graph.md
index abce638..e4dc7ba 100644
--- a/docs-site/docs/graph.md
+++ b/docs-site/docs/graph.md
@@ -13,7 +13,7 @@
 speed and memory optimization can be conducted by scheduling the execution of
 the operations and memory allocation/release intelligently. In SINGA, users only
 need to define the neural network model using the
-[Module](https://github.com/apache/singa/blob/master/python/singa/module.py)
+[Model](https://github.com/apache/singa/blob/master/python/singa/model.py)
 API. The graph is constructed and optimized at the C++ backend automatically.
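+
+Whether the graph is constructed, and how the buffered operations are
+scheduled, is decided when the model is compiled. A minimal sketch of the
+three configurations benchmarked later in this page (assuming `model` and the
+input placeholder `tx` are defined as in the example below):
+
+```python
+# execute the operations eagerly, without constructing a graph
+model.compile([tx], is_train=True, use_graph=False, sequential=False)
+
+# construct the graph and let the scheduler decide the execution order (bfs)
+model.compile([tx], is_train=True, use_graph=True, sequential=False)
+
+# construct the graph and run the operations in the buffered (serial) order
+model.compile([tx], is_train=True, use_graph=True, sequential=True)
+```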
 
 ## Example
@@ -62,6 +62,7 @@
 
 # initialize optimizer and attach it to the model
 sgd = opt.SGD(lr=0.005, momentum=0.9, weight_decay=1e-5)
+model.set_optimizer(sgd)
 
 # initialize device
 dev = device.create_cuda_gpu()
@@ -94,7 +95,7 @@
 
 More examples:
 
-- [MLP](https://github.com/apache/singa/blob/master/examples/mlp/module.py)
+- [MLP](https://github.com/apache/singa/blob/master/examples/mlp/model.py)
 - [CNN](https://github.com/apache/singa/blob/master/examples/cnn/model/cnn.py)
 - [ResNet](https://github.com/apache/singa/blob/master/examples/cnn/model/resnet.py)
 
@@ -109,7 +110,7 @@
 3. create the nodes and edges based on the dependencies
 
 Take the matrix multiplication operation from the dense layer of a
-[MLP model](https://github.com/apache/singa/blob/master/examples/mlp/module.py)
+[MLP model](https://github.com/apache/singa/blob/master/examples/mlp/model.py)
 as an example. The operation is called in the `forward` function of the MLP
 class
 
@@ -315,7 +316,7 @@
   - Model
-    - Using layer: ResNet50 in
-      [resnet.py](https://github.com/apache/singa/blob/master/examples/cnn/autograd/resnet_cifar10.py)
-    - Using module: ResNet50 in
+    - Using layer: ResNet50 in
+      [resnet_cifar10.py](https://github.com/apache/singa/blob/master/examples/cnn/autograd/resnet_cifar10.py)
+    - Using model: ResNet50 in
       [resnet.py](https://github.com/apache/singa/blob/master/examples/cnn/model/resnet.py)
   - GPU: NVIDIA RTX 2080Ti
 - Notations
@@ -350,7 +351,7 @@
           <td>1.0000</td>
       </tr>
       <tr>
-          <td nowrap>module:disable graph</td>
+          <td nowrap>model:disable graph</td>
           <td>4995</td>
           <td>14.1264</td>
           <td>14.1579</td>
@@ -359,7 +360,7 @@
           <td>1.0049</td>
       </tr>
       <tr>
-          <td nowrap>module:enable graph, bfs</td>
+          <td nowrap>model:enable graph, bfs</td>
           <td>3283</td>
           <td>13.7438</td>
           <td>14.5520</td>
@@ -368,7 +369,7 @@
           <td>1.0328</td>
       </tr>
       <tr>
-          <td nowrap>module:enable graph, serial</td>
+          <td nowrap>model:enable graph, serial</td>
           <td>3265</td>
           <td>13.7420</td>
           <td>14.5540</td>
@@ -387,7 +388,7 @@
           <td>1.0000</td>
       </tr>
       <tr>
-          <td nowrap>module:disable graph</td>
+          <td nowrap>model:disable graph</td>
           <td>10109</td>
           <td>13.2952</td>
           <td>7.5315</td>
@@ -396,7 +397,7 @@
           <td>1.0123</td>
       </tr>
       <tr>
-          <td nowrap>module:enable graph, bfs</td>
+          <td nowrap>model:enable graph, bfs</td>
           <td>6839</td>
           <td>13.1059</td>
           <td>7.6302</td>
@@ -405,7 +406,7 @@
           <td>1.0269</td>
       </tr>
       <tr>
-          <td nowrap>module:enable graph, serial</td>
+          <td nowrap>model:enable graph, serial</td>
           <td>6845</td>
           <td>13.0489</td>
           <td>7.6635</td>
@@ -421,7 +422,7 @@
   - Model
     - using Layer: ResNet50 in
       [resnet_dist.py](https://github.com/apache/singa/blob/master/examples/cnn/autograd/resnet_dist.py)
-    - using Module: ResNet50 in
+    - using Model: ResNet50 in
       [resnet.py](https://github.com/apache/singa/blob/master/examples/cnn/model/resnet.py)
   - GPU: NVIDIA RTX 2080Ti \* 2
   - MPI: two MPI processes on one node
@@ -449,7 +450,7 @@
           <td>1.0000</td>
       </tr>
       <tr>
-          <td nowrap>module:disable graph</td>
+          <td nowrap>model:disable graph</td>
           <td>5427</td>
           <td>17.8232</td>
           <td>11.2213</td>
@@ -458,7 +459,7 @@
           <td>0.9725</td>
       </tr>
       <tr>
-          <td nowrap>module:enable graph, bfs</td>
+          <td nowrap>model:enable graph, bfs</td>
           <td>3389</td>
           <td>18.2310</td>
           <td>10.9703</td>
@@ -467,7 +468,7 @@
           <td>0.9507</td>
       </tr>
       <tr>
-          <td nowrap>module:enable graph, serial</td>
+          <td nowrap>model:enable graph, serial</td>
           <td>3437</td>
           <td>17.0389</td>
           <td>11.7378</td>
@@ -486,7 +487,7 @@
           <td>1.0000</td>
       </tr>
       <tr>
-          <td nowrap>module:disable graph</td>
+          <td nowrap>model:disable graph</td>
           <td>10503</td>
           <td>14.7746</td>
           <td>6.7684</td>
@@ -495,7 +496,7 @@
           <td>1.0060</td>
       </tr>
       <tr>
-          <td nowrap>module:enable graph, bfs</td>
+          <td nowrap>model:enable graph, bfs</td>
           <td>6935</td>
           <td>14.8553</td>
           <td>6.7316</td>
@@ -504,7 +505,7 @@
           <td>1.0006</td>
       </tr>
       <tr>
-          <td nowrap>module:enable graph, serial</td>
+          <td nowrap>model:enable graph, serial</td>
           <td>7027</td>
           <td>14.3271</td>
           <td>6.9798</td>