Merge pull request #1170 from dcslin/feature/ms_model_mlp

update msmlp
diff --git a/examples/cnn_ms/README.md b/examples/cnn_ms/README.md
new file mode 100644
index 0000000..177ae4d
--- /dev/null
+++ b/examples/cnn_ms/README.md
@@ -0,0 +1,44 @@
+<!--
+    Licensed to the Apache Software Foundation (ASF) under one
+    or more contributor license agreements.  See the NOTICE file
+    distributed with this work for additional information
+    regarding copyright ownership.  The ASF licenses this file
+    to you under the Apache License, Version 2.0 (the
+    "License"); you may not use this file except in compliance
+    with the License.  You may obtain a copy of the License at
+      http://www.apache.org/licenses/LICENSE-2.0
+    Unless required by applicable law or agreed to in writing,
+    software distributed under the License is distributed on an
+    "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+    KIND, either express or implied.  See the License for the
+    specific language governing permissions and limitations
+    under the License.
+-->
+
+# Image Classification using Convolutional Neural Networks
+
+The examples in this folder show how to train CNN models using
+SINGA for image classification.
+
+* `data` includes the scripts for preprocessing image datasets.
+  MNIST, CIFAR10 and CIFAR100 are currently supported.
+
+* `model` includes the CNN model construction code, which creates
+  a subclass of `Module` to wrap the neural network operations
+  of each model. The computational graph can then be enabled to
+  optimize memory usage and execution efficiency; a minimal sketch
+  of this pattern is given after this list.
+
+* `autograd` includes the code to train CNN models by calling the
+  [neural network operations](../../python/singa/autograd.py) imperatively.
+  No computational graph is created.
+
+* `train_cnn.py` is the training script, which controls the training flow by
+  performing backpropagation and SGD updates.
+
+* `train_multiprocess.py` is the script for distributed training on a single
+  node with multiple GPUs; it uses Python's multiprocessing module and NCCL.
+
+* `train_mpi.py` is the script for distributed training (among multiple nodes) 
+  using MPI and NCCL for communication.
+
+* `benchmark.py` tests the training throughput using `ResNet50` as the workload.
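+
+The following is a minimal sketch of the `Module`-subclass pattern, for
+illustration only. The base-class name and layer constructors here are
+assumptions modelled on the bundled examples; depending on the SINGA
+release the base class is `module.Module` or `model.Model`.
+
+```python
+from singa import layer
+from singa import model
+
+
+class MLP(model.Model):
+    """A small two-layer classifier wrapped for graph execution."""
+
+    def __init__(self, num_classes=10):
+        super(MLP, self).__init__()
+        # the hidden width (500) is a placeholder, not taken from a real model
+        self.linear1 = layer.Linear(500)
+        self.relu = layer.ReLU()
+        self.linear2 = layer.Linear(num_classes)
+
+    def forward(self, x):
+        y = self.linear1(x)
+        y = self.relu(y)
+        return self.linear2(y)
+```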
\ No newline at end of file
diff --git a/examples/cnn_ms/autograd/cifar10_multiprocess.py b/examples/cnn_ms/autograd/cifar10_multiprocess.py
new file mode 100644
index 0000000..815d011
--- /dev/null
+++ b/examples/cnn_ms/autograd/cifar10_multiprocess.py
@@ -0,0 +1,43 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+
+from resnet_cifar10 import *
+import multiprocessing
+import sys
+
+if __name__ == '__main__':
+
+    # Generate an NCCL ID to be used for collective communication
+    nccl_id = singa.NcclIdHolder()
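+    # (the same ID must be shared by every worker process so that all
+    # ranks join a single NCCL communicator)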
+
+    # Configure the number of GPUs to be used
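+    # (e.g. launched as: python cifar10_multiprocess.py 2 for two GPUs)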
+    world_size = int(sys.argv[1])
+
+    # Enable the experimental asynchronous training with partial parameter updates
+    partial_update = True
+
+    processes = []
+    for local_rank in range(0, world_size):
+        processes.append(
+            multiprocessing.Process(target=train_cifar10,
+                                    args=(True, local_rank, world_size, nccl_id,
+                                          partial_update)))
+
+    for p in processes:
+        p.start()
+
+    # wait for every worker to finish before exiting
+    for p in processes:
+        p.join()
diff --git a/examples/ms_model_mlp/native.py b/examples/ms_model_mlp/native.py
new file mode 100644
index 0000000..a82ec3b
--- /dev/null
+++ b/examples/ms_model_mlp/native.py
@@ -0,0 +1,137 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+
+from singa import tensor
+from singa.tensor import Tensor
+from singa import autograd
+from singa import opt
+import numpy as np
+from singa import device
+import argparse
+
+np_dtype = {"float16": np.float16, "float32": np.float32}
+
+singa_dtype = {"float16": tensor.float16, "float32": tensor.float32}
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument('-p',
+                        choices=['float32', 'float16'],
+                        default='float32',
+                        dest='precision')
+    parser.add_argument('-m',
+                        '--max-epoch',
+                        default=1001,
+                        type=int,
+                        help='maximum epochs',
+                        dest='max_epoch')
+    args = parser.parse_args()
+
+    np.random.seed(0)
+
+    autograd.training = True
+
+    # prepare training data in numpy array
+
+    # generate the boundary
+    f = lambda x: (5 * x + 1)
+    bd_x = np.linspace(-1.0, 1, 200)
+    bd_y = f(bd_x)
+
+    # generate the training data
+    x = np.random.uniform(-1, 1, 400)
+    y = f(x) + 2 * np.random.randn(len(x))
+
+    # label each point by which side of the boundary it falls on, and
+    # stack the (x, y) pairs into a 2-D feature matrix
+    label = np.asarray([5 * a + 1 > b for (a, b) in zip(x, y)])
+    data = np.array([[a, b] for (a, b) in zip(x, y)], dtype=np.float32)
+
+    def to_categorical(y, num_classes):
+        """
+        Converts a class vector (integers) to binary class matrix.
+
+        Args:
+            y: class vector to be converted into a matrix
+                (integers from 0 to num_classes).
+            num_classes: total number of classes.
+
+        Returns:
+            A binary matrix representation of the input.
+        """
+        y = np.array(y, dtype="int")
+        n = y.shape[0]
+        categorical = np.zeros((n, num_classes))
+        categorical[np.arange(n), y] = 1
+        return categorical
+
+    label = to_categorical(label, 2).astype(np.float32)
+    print("train_data_shape:", data.shape)
+    print("train_label_shape:", label.shape)
+
+    precision = singa_dtype[args.precision]
+    np_precision = np_dtype[args.precision]
+
+    dev = device.create_cuda_gpu()
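+    # assumes a CUDA-enabled build; device.get_default_device() would
+    # run on the host CPU instead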
+
+    inputs = Tensor(data=data, device=dev)
+    target = Tensor(data=label, device=dev)
+
+    inputs = inputs.as_type(precision)
+    target = target.as_type(tensor.int32)
+
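+    # parameters of a 2-3-2 MLP: weights drawn from a zero-mean Gaussian
+    # (std 0.1), biases initialised to zero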
+    w0_np = np.random.normal(0, 0.1, (2, 3)).astype(np_precision)
+    w0 = Tensor(data=w0_np,
+                device=dev,
+                dtype=precision,
+                requires_grad=True,
+                stores_grad=True)
+    b0 = Tensor(shape=(3,),
+                device=dev,
+                dtype=precision,
+                requires_grad=True,
+                stores_grad=True)
+    b0.set_value(0.0)
+
+    w1_np = np.random.normal(0, 0.1, (3, 2)).astype(np_precision)
+    w1 = Tensor(data=w1_np,
+                device=dev,
+                dtype=precision,
+                requires_grad=True,
+                stores_grad=True)
+    b1 = Tensor(shape=(2,),
+                device=dev,
+                dtype=precision,
+                requires_grad=True,
+                stores_grad=True)
+    b1.set_value(0.0)
+
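+    # plain SGD with learning rate 0.05 and momentum 0.8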
+    sgd = opt.SGD(0.05, 0.8)
+
+    # training process
+    for i in range(args.max_epoch):
+        x = autograd.matmul(inputs, w0)
+        x = autograd.add_bias(x, b0)
+        x = autograd.relu(x)
+        x = autograd.matmul(x, w1)
+        x = autograd.add_bias(x, b1)
+        loss = autograd.softmax_cross_entropy(x, target)
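+        # calling the optimizer on the loss runs the backward pass and
+        # updates every tensor created with requires_grad and stores_grad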
+        sgd(loss)
+
+        if i % 100 == 0:
+            print("epoch %d, training loss = %f" %
+                  (i, tensor.to_numpy(loss)[0]))