Merge pull request #436 from moazreyad/SINGA-428

SINGA-428 Move Docker images under Apache user name

The latest images are pushed to https://hub.docker.com/r/apache/singa
diff --git a/CMakeLists.txt b/CMakeLists.txt
index b630497..be42858 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -19,10 +19,10 @@
 CMAKE_MINIMUM_REQUIRED(VERSION 2.8)
 
 PROJECT(singa)
-SET(PACKAGE_VERSION "1.1.1")
+SET(PACKAGE_VERSION "1.2.0")
 SET(SINGA_MAJOR_VERSION 1)  # 0 -
-SET(SINGA_MINOR_VERSION 1)  # 0 - 9
-SET(SINGA_PATCH_VERSION 1)  # 0 - 99
+SET(SINGA_MINOR_VERSION 2)  # 0 - 9
+SET(SINGA_PATCH_VERSION 0)  # 0 - 99
 MATH(EXPR SINGA_VERSION "${SINGA_MAJOR_VERSION} * 1000 + ${SINGA_MINOR_VERSION} * 100 + ${SINGA_PATCH_VERSION}")
 
 LIST(APPEND CMAKE_MODULE_PATH ${PROJECT_SOURCE_DIR}/cmake/Thirdparty)
@@ -66,6 +66,7 @@
 OPTION(ENABLE_DIST "Enable distributed training" OFF)
 OPTION(DISABLE_WARNINGS "Disable warnings under windows" ON)
 OPTION(USE_MODULES "Compile dependent libs as submodules together with singa" OFF)
+OPTION(USE_MKLDNN "Use mkl-dnn libs" OFF)
 
 
 # TODO: remove all USE_CBLAS in codes
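As a quick cross-check of the MATH(EXPR ...) expression above, the packed version integer for release 1.2.0 works out to 1200; a plain Python sketch of the same arithmetic:

    major, minor, patch = 1, 2, 0  # SINGA_MAJOR/MINOR/PATCH_VERSION above
    singa_version = major * 1000 + minor * 100 + patch
    assert singa_version == 1200   # the value CMake stores in SINGA_VERSION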
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index 6fd833c..1b855fa 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -1,3 +1,21 @@
+<!--
+    Licensed to the Apache Software Foundation (ASF) under one
+    or more contributor license agreements.  See the NOTICE file
+    distributed with this work for additional information
+    regarding copyright ownership.  The ASF licenses this file
+    to you under the Apache License, Version 2.0 (the
+    "License"); you may not use this file except in compliance
+    with the License.  You may obtain a copy of the License at
+
+      http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing,
+    software distributed under the License is distributed on an
+    "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+    KIND, either express or implied.  See the License for the
+    specific language governing permissions and limitations
+    under the License.
+-->
 # How to contribute 
 
 [Getting Started](./doc/en/develop/how-contribute.md)
diff --git a/LICENSE b/LICENSE
index 9a119a7..d3358c1 100644
--- a/LICENSE
+++ b/LICENSE
@@ -296,6 +296,7 @@
 =====================================================================
 SINGA bundles the following under MIT license:
 cmake/ThirdParty/FindOpenCL.cmake
+Open Neural Network Exchange
 
 Copyright (c) 2010-2016 Institute for Microelectronics,
                         Institute for Analysis and Scientific Computing, TU Wien.
@@ -304,6 +305,9 @@
 is owned by The United States Government, and operated by UChicago Argonne, LLC
 under provision of a contract with the Department of Energy.
 
+Copyright (c) ONNX Project Contributors.
+All rights reserved.
+
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
 in the Software without restriction, including without limitation the rights
@@ -419,3 +423,55 @@
 THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+=====================================================================
+SINGA bundles the following under Apache License v2.0:
+mkl-dnn
+
+Copyright 2017-2018 Intel Corporation
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+
+
+=====================================================================
+SINGA bundles the following under Apache License v2.0:
+examples/imagenet/inception/convert.py
+
+Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+
+
+=====================================================================
+SINGA bundles the following under New BSD license: 
+doc/en/docs/notebook/utils.py 
+
+Copyright (c) 2008–2013, Theano Development Team All rights reserved.
+
+Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
+
+        Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
+        Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
+        Neither the name of Theano nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
\ No newline at end of file
diff --git a/RELEASE_NOTES b/RELEASE_NOTES
index 43d4004..a75ceea 100644
--- a/RELEASE_NOTES
+++ b/RELEASE_NOTES
@@ -1,3 +1,102 @@
+Release Notes - SINGA - Version singa-incubating-2.0.0
+
+SINGA is a general distributed deep learning platform for training big deep
+learning models over large datasets.
+
+This release includes the following features:
+
+  * Core components
+    * [SINGA-434] Support tensor broadcasting (see the sketch below)
+    * [SINGA-370] Improvement to tensor reshape and various misc. changes related to SINGA-341 and 351
+
+  * Model components
+    * [SINGA-333] Add support for Open Neural Network Exchange (ONNX) format
+    * [SINGA-385] Add new python module for optimizers
+    * [SINGA-394] Improve the CPP operations via Intel MKL DNN lib
+    * [SINGA-425] Add 3 operators, Abs(), Exp() and leakyrelu(), for Autograd
+    * [SINGA-410] Add two functions, set_params() and get_params(), for the Autograd Layer class
+    * [SINGA-383] Add Separable Convolution for autograd
+    * [SINGA-388] Develop some RNN layers by calling tiny operations like matmul, addbias.
+    * [SINGA-382] Implement concat operation for autograd    
+    * [SINGA-378] Implement maxpooling operation and its related functions for autograd
+    * [SINGA-379] Implement batchnorm operation and its related functions for autograd
+
+  * Utility functions and CI
+    * [SINGA-432] Update dependent lib versions in conda-build config
+    * [SINGA-429] Update docker images for latest cuda and cudnn
+    * [SINGA-428] Move Docker images under Apache user name
+
+  * Documentation and usability
+    * [SINGA-395] Add documentation for autograd APIs
+    * [SINGA-344] Add a GAN example
+    * [SINGA-390] Update installation.md
+    * [SINGA-384] Implement ResNet using autograd API
+    * [SINGA-352] Complete SINGA documentation in Chinese version
+      
+  * Bugs fixed
+    * [SINGA-431] Unit Test failed - Tensor Transpose
+    * [SINGA-422] ModuleNotFoundError: No module named "_singa_wrap"
+    * [SINGA-418] Unsupported type 'long' in python3
+    * [SINGA-409] Basic `singa-cpu` import throws error
+    * [SINGA-408] Unsupported function definition in python3
+    * [SINGA-380] Fix bugs from Reshape
+
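As promised above, a minimal sketch of what tensor broadcasting ([SINGA-434]) means, written with numpy and assuming SINGA follows the usual numpy/ONNX broadcasting rules:

    import numpy as np

    a = np.ones((3, 1))   # shape (3, 1)
    b = np.arange(4)      # shape (4,)
    c = a + b             # trailing dims align; result broadcasts to (3, 4)
    assert c.shape == (3, 4)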
+---------------------------------------------------------------
+Release Notes - SINGA - Version singa-incubating-1.2.0
+
+SINGA is a general distributed deep learning platform for training big deep
+learning models over large datasets.
+
+This release includes the following features:
+
+  * Core components
+      * [SINGA-290] Upgrade to Python 3
+      * [SINGA-341] Added stride functionality to tensors for CPP
+      * [SINGA-347] Create a function that supports einsum
+      * [SINGA-351] Added stride support and cudnn codes to cuda
+
+  * Model components
+      * [SINGA-300] Add residual networks for imagenet classification
+      * [SINGA-312] Rename layer parameters
+      * [SINGA-313] Add L2 norm layer
+      * [SINGA-315] Reduce memory footprint by Python generator for parameter
+      * [SINGA-316] Add SigmoidCrossEntropy
+      * [SINGA-324] Extend RNN layer to accept variant seq length across batches
+      * [SINGA-326] Add Inception V4 for ImageNet classification
+      * [SINGA-328] Add VGG models for ImageNet classification
+      * [SINGA-329] Support layer freezing during training (fine-tuning)
+      * [SINGA-346] Update cudnn from V5 to V7
+      * [SINGA-349] Create layer operations for autograd
+      * [SINGA-363] Add DenseNet for Imagenet classification
+
+  * Utility functions and CI
+      * [SINGA-274] Improve Debian packaging with CPack
+      * [SINGA-303] Create conda packages
+      * [SINGA-337] Add test cases for code
+      * [SINGA-348] Support autograd MLP Example
+      * [SINGA-345] Update Jenkins and fix bugs in compilation
+      * [SINGA-354] Update travis scripts to use conda-build for all platforms
+      * [SINGA-358] Consolidated RUN steps and cleaned caches in Docker containers
+      * [SINGA-359] Create alias for conda packages
+
+  * Documentation and usability
+      * [SINGA-223] Fix side navigation menu in the website
+      * [SINGA-294] Add instructions to run CUDA unit tests on Windows
+      * [SINGA-305] Add jupyter notebooks for SINGA V1 tutorial
+      * [SINGA-319] Fix link errors on the index page
+      * [SINGA-352] Complete SINGA documentation in Chinese version
+      * [SINGA-361] Add git instructions for contributors and committers
+
+  * Bugs fixed
+      * [SINGA-330] fix openblas building on i7 7700k
+      * [SINGA-331] Fix the bug of tensor division operation
+      * [SINGA-350] Error from python3 test
+      * [SINGA-356] Error using travis tool to build SINGA on mac os
+      * [SINGA-363] Fix some bugs in imagenet examples
+      * [SINGA-368] Fix the bug in Cifar10 examples
+      * [SINGA-369] the errors of examples in testing
+
+---------------------------------------------------------------
 Release Notes - SINGA - Version singa-incubating-1.1.0
 
 SINGA is a general distributed deep learning platform for training big deep learning models over large datasets.
diff --git a/cmake/Cuda.cmake b/cmake/Cuda.cmake
index 35109aa..5f72f27 100644
--- a/cmake/Cuda.cmake
+++ b/cmake/Cuda.cmake
@@ -31,10 +31,10 @@
 IF(USE_CUDNN)
 #include(cmake/Modules/Cudnn.cmake)
     FIND_PACKAGE(CUDNN REQUIRED)
-    INCLUDE_DIRECTORIES(SYSTEM ${CUDNN_INCLUDE_DIR})
+    INCLUDE_DIRECTORIES( ${CUDNN_INCLUDE_DIR})
     LIST(APPEND SINGA_LINKER_LIBS ${CUDNN_LIBRARIES})
 ENDIF()
 
-INCLUDE_DIRECTORIES(SYSTEM ${CUDA_INCLUDE_DIRS})
+INCLUDE_DIRECTORIES( ${CUDA_INCLUDE_DIRS})
 LIST(APPEND SINGA_LINKER_LIBS ${CUDA_CUDART_LIBRARY} ${CUDA_curand_LIBRARY} ${CUDA_CUBLAS_LIBRARIES})
 #MESSAGE(STATUS "libs " ${SINGA_LINKER_LIBS})
diff --git a/cmake/Dependencies.cmake b/cmake/Dependencies.cmake
index e221aa8..d1d8060 100644
--- a/cmake/Dependencies.cmake
+++ b/cmake/Dependencies.cmake
@@ -41,12 +41,12 @@
     SET(PROTOBUF_LIBRARY "${CMAKE_BINARY_DIR}/lib/libprotobuf.a")
     SET(PROTOBUF_PROTOC_LIBRARY "${CMAKE_BINARY_DIR}/lib/libprotobuf.a")
     SET(PROTOBUF_PROTOC_EXECUTABLE "${CMAKE_BINARY_DIR}/bin/protoc")
-    INCLUDE_DIRECTORIES(SYSTEM ${PROTOBUF_INCLUDE_DIR})
+    INCLUDE_DIRECTORIES( ${PROTOBUF_INCLUDE_DIR})
     LIST(APPEND SINGA_LINKER_LIBS ${PROTOBUF_LIBRARY})
     #IF(USE_CBLAS)
     SET(CBLAS_INCLUDE_DIR "${CMAKE_BINARY_DIR}/include")
     SET(CBLAS_LIBRARIES "${CMAKE_BINARY_DIR}/lib/libopenblas.a")
-    INCLUDE_DIRECTORIES(SYSTEM ${CBLAS_INCLUDE_DIR})
+    INCLUDE_DIRECTORIES( ${CBLAS_INCLUDE_DIR})
     LIST(APPEND SINGA_LINKER_LIBS ${CBLAS_LIBRARIES})
     #ENDIF()
     #ENDIF()
@@ -56,7 +56,7 @@
     LIST(APPEND SINGA_LINKER_LIBS ${PROTOBUF_LIBRARY})
     #IF(USE_CBLAS)
     FIND_PACKAGE(CBLAS REQUIRED)
-    INCLUDE_DIRECTORIES(SYSTEM ${CBLAS_INCLUDE_DIR})
+    INCLUDE_DIRECTORIES( ${CBLAS_INCLUDE_DIR})
     LIST(APPEND SINGA_LINKER_LIBS ${CBLAS_LIBRARIES})
     #MESSAGE(STATUS "Found cblas at ${CBLAS_LIBRARIES}")
     #ENDIF()
@@ -76,7 +76,7 @@
 
 IF(USE_LMDB)
     FIND_PACKAGE(LMDB REQUIRED)
-    INCLUDE_DIRECTORIES(SYSTEM ${LMDB_INCLUDE_DIR})
+    INCLUDE_DIRECTORIES( ${LMDB_INCLUDE_DIR})
     LIST(APPEND SINGA_LINKER_LIBS ${LMDB_LIBRARIES})
     #MESSAGE(STATUS "FOUND lmdb at ${LMDB_INCLUDE_DIR}")
 ENDIF()
@@ -95,14 +95,14 @@
     IF(NOT OPENCL_FOUND)
         MESSAGE(SEND_ERROR "OpenCL was requested, but not found.")
     ELSE()
-        INCLUDE_DIRECTORIES(SYSTEM ${OPENCL_INCLUDE_DIR})
+        INCLUDE_DIRECTORIES( ${OPENCL_INCLUDE_DIR})
         LIST(APPEND SINGA_LINKER_LIBS ${OPENCL_LIBRARIES})
         FIND_PACKAGE(ViennaCL REQUIRED)
         IF(NOT ViennaCL_FOUND)
             MESSAGE(SEND_ERROR "ViennaCL is required if OpenCL is enabled.")
         ELSE()
             #MESSAGE(STATUS "Found ViennaCL headers at ${ViennaCL_INCLUDE_DIR}")
-            INCLUDE_DIRECTORIES(SYSTEM ${ViennaCL_INCLUDE_DIR})
+            INCLUDE_DIRECTORIES( ${ViennaCL_INCLUDE_DIR})
             LIST(APPEND SINGA_LINKER_LIBS ${ViennaCL_LIBRARIES})
         ENDIF()
     ENDIF()
@@ -116,7 +116,7 @@
 IF(USE_OPENCV)
     FIND_PACKAGE(OpenCV REQUIRED)
     MESSAGE(STATUS "Found OpenCV_${OpenCV_VERSION} at ${OpenCV_INCLUDE_DIRS}")
-    INCLUDE_DIRECTORIES(SYSTEM ${OpenCV_INCLUDE_DIRS})
+    INCLUDE_DIRECTORIES( ${OpenCV_INCLUDE_DIRS})
     LIST(APPEND SINGA_LINKER_LIBS ${OpenCV_LIBRARIES})
 ENDIF()
 
@@ -141,3 +141,11 @@
     FIND_PACKAGE(JNI REQUIRED)
     FIND_PACKAGE(SWIG 3.0 REQUIRED)
 ENDIF()
+
+IF(USE_MKLDNN)
+    FIND_PATH(MKLDNN_INCLUDE_DIR NAME "mkldnn.hpp" PATHS "$ENV{CMAKE_INCLUDE_PATH}")
+    FIND_LIBRARY(MKLDNN_LIBRARIES NAME "mkldnn" PATHS "$ENV{CMAKE_LIBRARY_PATH}")
+    MESSAGE(STATUS "Found MKLDNN at ${MKLDNN_INCLUDE_DIR}")
+    INCLUDE_DIRECTORIES(${MKLDNN_INCLUDE_DIR})
+    LIST(APPEND SINGA_LINKER_LIBS ${MKLDNN_LIBRARIES})
+ENDIF()
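Note that the FIND_PATH/FIND_LIBRARY calls above pass the CMAKE_INCLUDE_PATH and CMAKE_LIBRARY_PATH environment variables as extra search locations, so a custom MKL-DNN install should be reachable through them (besides CMake's standard search paths). Roughly, that part of the lookup behaves like this Python sketch (an illustration only, not part of the build):

    import os

    def find_in_env(var, filename):
        # scan the separator-delimited directories in an environment
        # variable, as FIND_PATH/FIND_LIBRARY do with PATHS "$ENV{...}"
        for d in os.environ.get(var, "").split(os.pathsep):
            if d and os.path.exists(os.path.join(d, filename)):
                return d
        return None

    print(find_in_env("CMAKE_INCLUDE_PATH", "mkldnn.hpp"))   # header dir or None
    print(find_in_env("CMAKE_LIBRARY_PATH", "libmkldnn.so")) # lib dir or None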
diff --git a/cmake/Templates/singa_config.h.in b/cmake/Templates/singa_config.h.in
index e35230c..1fb7645 100644
--- a/cmake/Templates/singa_config.h.in
+++ b/cmake/Templates/singa_config.h.in
@@ -51,3 +51,5 @@
 // #cmakedefine CUDNN_MINOR_VERSION @CUDNN_MINOR_VERSION@
 // #cmakedefine CUDNN_PATCH_VERSION @CUDNN_PATCH_VERSION@
 // #cmakedefine CUDNN_VERSION @CUDNN_VERSION@
+
+#cmakedefine USE_MKLDNN
diff --git a/doc/_static/style.css b/doc/_static/style.css
index 86cc1f9..6a0bbc3 100644
--- a/doc/_static/style.css
+++ b/doc/_static/style.css
@@ -1,3 +1,22 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
 .wy-nav-content {
     max-width: none;
 }
diff --git a/doc/build.sh b/doc/build.sh
old mode 100755
new mode 100644
index 44eb1c2..c00be39
--- a/doc/build.sh
+++ b/doc/build.sh
@@ -36,4 +36,5 @@
     $SPHINXBUILD -b html -c . -d $BUILDDIR/doctree ${LANG_ARR[i]} $BUILDDIR/html/${LANG_ARR[i]}
   done
   echo "<script language=\"javascript\" type=\"text/javascript\">window.location.href='en/index.html';</script>" > $BUILDDIR/html/index.html
+  ( cat Doxyfile ; echo "OUTPUT_DIRECTORY=$BUILDDIR/html/doxygen" ) | doxygen - 
 fi
diff --git a/doc/en/community/issue-tracking.md b/doc/en/community/issue-tracking.md
index 26b23dd..c6ff200 100644
--- a/doc/en/community/issue-tracking.md
+++ b/doc/en/community/issue-tracking.md
@@ -1,3 +1,21 @@
+<!--
+    Licensed to the Apache Software Foundation (ASF) under one
+    or more contributor license agreements.  See the NOTICE file
+    distributed with this work for additional information
+    regarding copyright ownership.  The ASF licenses this file
+    to you under the Apache License, Version 2.0 (the
+    "License"); you may not use this file except in compliance
+    with the License.  You may obtain a copy of the License at
+
+      http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing,
+    software distributed under the License is distributed on an
+    "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+    KIND, either express or implied.  See the License for the
+    specific language governing permissions and limitations
+    under the License.
+-->
 ## Issue Tracking
 
 ___
diff --git a/doc/en/develop/contribute-code.md b/doc/en/develop/contribute-code.md
index 39d11f8..bbb22de 100644
--- a/doc/en/develop/contribute-code.md
+++ b/doc/en/develop/contribute-code.md
@@ -1,3 +1,21 @@
+<!--
+    Licensed to the Apache Software Foundation (ASF) under one
+    or more contributor license agreements.  See the NOTICE file
+    distributed with this work for additional information
+    regarding copyright ownership.  The ASF licenses this file
+    to you under the Apache License, Version 2.0 (the
+    "License"); you may not use this file except in compliance
+    with the License.  You may obtain a copy of the License at
+
+      http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing,
+    software distributed under the License is distributed on an
+    "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+    KIND, either express or implied.  See the License for the
+    specific language governing permissions and limitations
+    under the License.
+-->
 ## How to Contribute Code
 
 
diff --git a/doc/en/develop/how-contribute.md b/doc/en/develop/how-contribute.md
index ab4ee66..399a954 100644
--- a/doc/en/develop/how-contribute.md
+++ b/doc/en/develop/how-contribute.md
@@ -1,3 +1,21 @@
+<!--
+    Licensed to the Apache Software Foundation (ASF) under one
+    or more contributor license agreements.  See the NOTICE file
+    distributed with this work for additional information
+    regarding copyright ownership.  The ASF licenses this file
+    to you under the Apache License, Version 2.0 (the
+    "License"); you may not use this file except in compliance
+    with the License.  You may obtain a copy of the License at
+
+      http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing,
+    software distributed under the License is distributed on an
+    "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+    KIND, either express or implied.  See the License for the
+    specific language governing permissions and limitations
+    under the License.
+-->
 # How to Contribute to SINGA
 
 As with any open source project, there are several ways you can help:
diff --git a/doc/en/docs/autograd.md b/doc/en/docs/autograd.md
index 6070629..f80f2cb 100644
--- a/doc/en/docs/autograd.md
+++ b/doc/en/docs/autograd.md
@@ -1,4 +1,24 @@
-# Autograd in Singa
+<!--
+    Licensed to the Apache Software Foundation (ASF) under one
+    or more contributor license agreements.  See the NOTICE file
+    distributed with this work for additional information
+    regarding copyright ownership.  The ASF licenses this file
+    to you under the Apache License, Version 2.0 (the
+    "License"); you may not use this file except in compliance
+    with the License.  You may obtain a copy of the License at
+
+      http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing,
+    software distributed under the License is distributed on an
+    "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+    KIND, either express or implied.  See the License for the
+    specific language governing permissions and limitations
+    under the License.
+-->
+
+
+# Autograd in Singa
 
 There are two typical ways to implement autograd: via symbolic differentiation, as in [Theano](http://deeplearning.net/software/theano/index.html), or via reverse differentiation, as in [Pytorch](https://pytorch.org/docs/stable/notes/autograd.html). Singa follows the Pytorch way, recording the computation graph and applying backward propagation automatically after the forward propagation. The autograd algorithm is explained in detail [here](https://pytorch.org/docs/stable/notes/autograd.html). We explain the relevant modules in Singa and give an example to illustrate the usage.
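To make the recording idea concrete, here is a minimal, self-contained sketch of reverse differentiation in plain Python. It is a generic illustration of the record-then-replay mechanism, not Singa's actual API:

    class Var:
        def __init__(self, value, parents=()):
            self.value = value
            self.grad = 0.0
            self.parents = parents  # pairs of (parent Var, local gradient)

        def __add__(self, other):
            return Var(self.value + other.value,
                       parents=[(self, 1.0), (other, 1.0)])

        def __mul__(self, other):
            # forward pass computes the value and records local derivatives
            return Var(self.value * other.value,
                       parents=[(self, other.value), (other, self.value)])

        def backward(self, seed=1.0):
            # reverse pass walks the recorded graph, accumulating gradients
            # (each path from the output to an input contributes separately)
            self.grad += seed
            for parent, local in self.parents:
                parent.backward(seed * local)

    x, y = Var(2.0), Var(3.0)
    z = x * y + x     # forward run records the graph on the fly; z.value == 8.0
    z.backward()      # dz/dx = y + 1 = 4, dz/dy = x = 2
    assert (x.grad, y.grad) == (4.0, 2.0)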
 
diff --git a/doc/en/docs/cnn.md b/doc/en/docs/cnn.md
old mode 100755
new mode 100644
index 6609137..64aad5a
--- a/doc/en/docs/cnn.md
+++ b/doc/en/docs/cnn.md
@@ -1,3 +1,21 @@
+<!--
+    Licensed to the Apache Software Foundation (ASF) under one
+    or more contributor license agreements.  See the NOTICE file
+    distributed with this work for additional information
+    regarding copyright ownership.  The ASF licenses this file
+    to you under the Apache License, Version 2.0 (the
+    "License"); you may not use this file except in compliance
+    with the License.  You may obtain a copy of the License at
+
+      http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing,
+    software distributed under the License is distributed on an
+    "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+    KIND, either express or implied.  See the License for the
+    specific language governing permissions and limitations
+    under the License.
+-->
 # Quickstart - Cifar10 example
 A convolutional neural network (CNN) is a type of feed-forward artificial neural network widely used for image classification. In this example, we will use a deep CNN model to classify images from the [CIFAR10 dataset](http://www.cs.toronto.edu/~kriz/cifar.html).
 
diff --git a/doc/en/docs/dependencies.md b/doc/en/docs/dependencies.md
index a812c05..febf6da 100644
--- a/doc/en/docs/dependencies.md
+++ b/doc/en/docs/dependencies.md
@@ -1,3 +1,21 @@
+<!--
+    Licensed to the Apache Software Foundation (ASF) under one
+    or more contributor license agreements.  See the NOTICE file
+    distributed with this work for additional information
+    regarding copyright ownership.  The ASF licenses this file
+    to you under the Apache License, Version 2.0 (the
+    "License"); you may not use this file except in compliance
+    with the License.  You may obtain a copy of the License at
+
+      http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing,
+    software distributed under the License is distributed on an
+    "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+    KIND, either express or implied.  See the License for the
+    specific language governing permissions and limitations
+    under the License.
+-->
 # Dependent library installation
 
 ## Windows
diff --git a/doc/en/docs/docker.md b/doc/en/docs/docker.md
index 8e5743e..287e52d 100644
--- a/doc/en/docs/docker.md
+++ b/doc/en/docs/docker.md
@@ -1,3 +1,21 @@
+<!--
+    Licensed to the Apache Software Foundation (ASF) under one
+    or more contributor license agreements.  See the NOTICE file
+    distributed with this work for additional information
+    regarding copyright ownership.  The ASF licenses this file
+    to you under the Apache License, Version 2.0 (the
+    "License"); you may not use this file except in compliance
+    with the License.  You may obtain a copy of the License at
+
+      http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing,
+    software distributed under the License is distributed on an
+    "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+    KIND, either express or implied.  See the License for the
+    specific language governing permissions and limitations
+    under the License.
+-->
 # Docker Images
 
 
@@ -31,28 +49,23 @@
 
 The `<TAG>` is named as
 
-    devel|runtime[-OS][-CUDA|OPENCL][-CUDNN]
+    devel|runtime[-CUDA|CPU][-CUDNN]
 
 * devel: development images with all dependent libs' header files and SINGA's source code installed; runtime: minimal images that can run SINGA programs.
-* OS: ubuntu, ubuntu14.04, centos, centos6
-* CUDA: cuda, cuda8.0, cuda7.0
-* CUDNN: cudnn, cudnn5, cudnn4
-* OPENCL: opencl, opencl1.2
+* CUDA: cuda10.0, cuda9.0
+* CUDNN: cudnn7
 
-By default, if the version is not included in the tag, the latest stable version is used.
-The default OS is Ubuntu. The version is the latest stable version (e.g., 16.04 for now).
-For -cuda version, the **cudnn** is included by default. Their versions are also the latest stable version, i.e., cuda8.0 and cudnn5 for now.
+Here are some example tags:
 
-Here are some example tags,
+`devel-cuda9-cudnn7`, `devel-cuda10-cudnn7`, `devel-cpu`, `runtime-gpu` and `runtime-cpu`
 
-`devel`, `devel-cuda`, `runtime`, `runtime-cuda`, `devel-centos7-cuda`, `devel-ubuntu14.04`, `devel-ubuntu14.04-cuda7.5-cudnn4`
 
 Please follow the existing Dockerfiles under tool/docker/ to create other Dockerfiles.
 The folder structure is like
 
     level1: devel|runtime
     level2: Dockerfile, OS
-    level3: Dockerfile, CUDA|OPENCL
-    level4: CUDNN
+    level3: Dockerfile, CUDA|MKLDNN
 
-For example, the path of the Dockerfile for `devel-cuda` is `tool/docker/devel/cuda/Dockerfile`.
+
+For example, the path of the Dockerfile for `devel-cuda9-cudnn7` is `tool/docker/devel/ubuntu/cuda9/Dockerfile`.
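Since the images now live under the apache namespace on Docker Hub, pulling one is a single command (`docker pull apache/singa:<TAG>`); the same thing via the Docker SDK for Python, using one of the example tags above:

    import docker  # pip install docker

    client = docker.from_env()
    # pull the minimal CPU runtime image from the apache namespace
    image = client.images.pull("apache/singa", tag="runtime-cpu")
    print(image.tags)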
diff --git a/doc/en/docs/installation.md b/doc/en/docs/installation.md
old mode 100755
new mode 100644
index e7462fa..40bf83c
--- a/doc/en/docs/installation.md
+++ b/doc/en/docs/installation.md
@@ -1,3 +1,21 @@
+<!--
+    Licensed to the Apache Software Foundation (ASF) under one
+    or more contributor license agreements.  See the NOTICE file
+    distributed with this work for additional information
+    regarding copyright ownership.  The ASF licenses this file
+    to you under the Apache License, Version 2.0 (the
+    "License"); you may not use this file except in compliance
+    with the License.  You may obtain a copy of the License at
+
+      http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing,
+    software distributed under the License is distributed on an
+    "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+    KIND, either express or implied.  See the License for the
+    specific language governing permissions and limitations
+    under the License.
+-->
 # Installation
 
 ## From Conda
@@ -100,6 +118,7 @@
     * `USE_PYTHON=ON`, used for compiling PySINGA
     * `USE_PYTHON3=ON`, used for compiling with Python 3 support. (The default is Python 2)
     * `USE_OPENCL=ON`, used for compiling with OpenCL support
+    * `USE_MKLDNN=ON`, used for compiling with Intel MKL-DNN support
     * `PACKAGE=ON`, used for building the Debian package
     * `ENABLE_TEST`, used for compiling unit test cases
 
@@ -203,6 +222,21 @@
 
     $ cmake -DUSE_OPENCL=ON ..
     $ make
+    
+#### USE_MKLDNN
+
+Users can enable MKL-DNN to enhance the performance of CPU computation.
+
+The installation guide for MKL-DNN can be found [here](https://github.com/intel/mkl-dnn#installation).
+
+SINGA has been tested with MKL-DNN v0.17.2.
+
+To build SINGA with MKL-DNN support:
+
+    # Dependent libs are installed already
+    $ cmake -DUSE_MKLDNN=ON ..
+    $ make
+
 
 #### PACKAGE
 
diff --git a/doc/en/docs/neural-net.md b/doc/en/docs/neural-net.md
index 0a97f21..e59a20c 100644
--- a/doc/en/docs/neural-net.md
+++ b/doc/en/docs/neural-net.md
@@ -1,3 +1,21 @@
+<!--
+    Licensed to the Apache Software Foundation (ASF) under one
+    or more contributor license agreements.  See the NOTICE file
+    distributed with this work for additional information
+    regarding copyright ownership.  The ASF licenses this file
+    to you under the Apache License, Version 2.0 (the
+    "License"); you may not use this file except in compliance
+    with the License.  You may obtain a copy of the License at
+
+      http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing,
+    software distributed under the License is distributed on an
+    "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+    KIND, either express or implied.  See the License for the
+    specific language governing permissions and limitations
+    under the License.
+-->
 # Neural Net
 
 
diff --git a/doc/en/docs/notebook/cnn.ipynb b/doc/en/docs/notebook/cnn.ipynb
index d5198b2..7d33e1a 100644
--- a/doc/en/docs/notebook/cnn.ipynb
+++ b/doc/en/docs/notebook/cnn.ipynb
@@ -4,6 +4,13 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
+    "Licensed to the Apache Software Foundation (ASF) under one or more contributor license agreements; and to You under the Apache License, Version 2.0. "
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
     "# Classify images from MNIST using LeNet"
    ]
   },
diff --git a/doc/en/docs/notebook/core.ipynb b/doc/en/docs/notebook/core.ipynb
index c16cc1f..50bdcf2 100644
--- a/doc/en/docs/notebook/core.ipynb
+++ b/doc/en/docs/notebook/core.ipynb
@@ -2,6 +2,13 @@
  "cells": [
   {
    "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Licensed to the Apache Software Foundation (ASF) under one or more contributor license agreements; and to You under the Apache License, Version 2.0. "
+   ]
+  },
+  {
+   "cell_type": "markdown",
    "metadata": {
     "collapsed": true
    },
diff --git a/doc/en/docs/notebook/index.ipynb b/doc/en/docs/notebook/index.ipynb
index 29f689f..22b9678 100644
--- a/doc/en/docs/notebook/index.ipynb
+++ b/doc/en/docs/notebook/index.ipynb
@@ -4,6 +4,13 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
+    "Licensed to the Apache Software Foundation (ASF) under one or more contributor license agreements; and to You under the Apache License, Version 2.0. "
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
     "![Apache Singa](http://singa.apache.org/en/_static/singa.png)\n",
     "\n",
     "# A Tutorial of SINGA V1\n",
diff --git a/doc/en/docs/notebook/installation.ipynb b/doc/en/docs/notebook/installation.ipynb
index 65093dc..5d8793c 100644
--- a/doc/en/docs/notebook/installation.ipynb
+++ b/doc/en/docs/notebook/installation.ipynb
@@ -4,6 +4,13 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
+    "Licensed to the Apache Software Foundation (ASF) under one or more contributor license agreements; and to You under the Apache License, Version 2.0. "
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
     "The easiast way to install SINGA is via [conda](https://conda.io/docs/).\n",
     "\n",
     "## Install Conda\n",
diff --git a/doc/en/docs/notebook/mlp.ipynb b/doc/en/docs/notebook/mlp.ipynb
old mode 100755
new mode 100644
index de5fae5..199c44a
--- a/doc/en/docs/notebook/mlp.ipynb
+++ b/doc/en/docs/notebook/mlp.ipynb
@@ -4,6 +4,13 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
+    "Licensed to the Apache Software Foundation (ASF) under one or more contributor license agreements; and to You under the Apache License, Version 2.0. "
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
     "# Train a multi-layer perceptron (MLP) model \n",
     "\n",
     "In this notebook, we are going to use PySINGA to train a MLP model for classifying 2-d points into two categories (i.e., positive and negative). We use this example to illustrate the usage of PySINGA's modules. Please refer to the [documentation page](http://singa.apache.org/en/docs/index.html) for the functions of each module."
diff --git a/doc/en/docs/notebook/model.ipynb b/doc/en/docs/notebook/model.ipynb
index 6888435..35380da 100644
--- a/doc/en/docs/notebook/model.ipynb
+++ b/doc/en/docs/notebook/model.ipynb
@@ -4,6 +4,13 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
+    "Licensed to the Apache Software Foundation (ASF) under one or more contributor license agreements; and to You under the Apache License, Version 2.0. "
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
     "# SINGA Model Classes\n",
     "\n",
     "<img src=\"http://singa.apache.org/en/_static/images/singav1-sw.png\" width=\"500px\"/>"
diff --git a/doc/en/docs/notebook/rbm.ipynb b/doc/en/docs/notebook/rbm.ipynb
old mode 100755
new mode 100644
index fd3309c..44c7125
--- a/doc/en/docs/notebook/rbm.ipynb
+++ b/doc/en/docs/notebook/rbm.ipynb
@@ -4,6 +4,13 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
+    "Licensed to the Apache Software Foundation (ASF) under one or more contributor license agreements; and to You under the Apache License, Version 2.0. "
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
     "# Train a RBM model\n",
     "\n",
     "This notebook woul train a Restricted Boltzmann Machine (RBM) over the MNIST dataset using PySINGA. The RBM model would learn a feature representation of a digit image like MNIST images.\n",
diff --git a/doc/en/docs/notebook/regression.ipynb b/doc/en/docs/notebook/regression.ipynb
old mode 100755
new mode 100644
index 4484564..a1b8884
--- a/doc/en/docs/notebook/regression.ipynb
+++ b/doc/en/docs/notebook/regression.ipynb
@@ -4,6 +4,13 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
+    "Licensed to the Apache Software Foundation (ASF) under one or more contributor license agreements; and to You under the Apache License, Version 2.0. "
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
     "# Train a linear regression model\n",
     "\n",
     "In this notebook, we are going to use the tensor module from PySINGA to train a linear regression model. We use this example to illustrate the usage of tensor of PySINGA. Please refer the [documentation page](http://singa.apache.org/en/docs/tensor.html) to for more tensor functions provided by PySINGA. "
diff --git a/doc/en/docs/notebook/requirements.txt b/doc/en/docs/notebook/requirements.txt
index 21e293b..a128d1b 100644
--- a/doc/en/docs/notebook/requirements.txt
+++ b/doc/en/docs/notebook/requirements.txt
@@ -1,3 +1,22 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+
 matplotlib=2.0.0=np112py27_0
 nb_conda_kernels=2.0.0=py27_0
 nb_conda=2.0.0=py27_0
diff --git a/doc/en/docs/notebook/rnn.ipynb b/doc/en/docs/notebook/rnn.ipynb
index f05c5b6..c34230d 100644
--- a/doc/en/docs/notebook/rnn.ipynb
+++ b/doc/en/docs/notebook/rnn.ipynb
@@ -4,6 +4,13 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
+    "Licensed to the Apache Software Foundation (ASF) under one or more contributor license agreements; and to You under the Apache License, Version 2.0. "
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
     "# RNN for Character Level Language Modeling"
    ]
   },
diff --git a/doc/en/docs/notebook/utils.py b/doc/en/docs/notebook/utils.py
index 3af9ec5..dd18964 100755
--- a/doc/en/docs/notebook/utils.py
+++ b/doc/en/docs/notebook/utils.py
@@ -21,9 +21,14 @@
     return ndar
 
 
-def tile_raster_images(X, img_shape, tile_shape, tile_spacing=(0, 0),
-                       scale_rows_to_unit_interval=True,
-                       output_pixel_vals=True):
+def tile_raster_images(
+    X,
+    img_shape,
+    tile_shape,
+    tile_spacing=(0, 0),
+    scale_rows_to_unit_interval=True,
+    output_pixel_vals=True,
+):
     """
     Transform an array with one flattened image per row, into an array in
     which images are reshaped and layed out like tiles on a floor.
@@ -76,17 +81,19 @@
         assert len(X) == 4
         # Create an output numpy ndarray to store the image
         if output_pixel_vals:
-            out_array = numpy.zeros((out_shape[0], out_shape[1], 4),
-                                    dtype='uint8')
+            out_array = numpy.zeros(
+                (out_shape[0], out_shape[1], 4), dtype="uint8"
+            )
         else:
-            out_array = numpy.zeros((out_shape[0], out_shape[1], 4),
-                                    dtype=X.dtype)
+            out_array = numpy.zeros(
+                (out_shape[0], out_shape[1], 4), dtype=X.dtype
+            )
 
-        #colors default to 0, alpha defaults to 1 (opaque)
+        # colors default to 0, alpha defaults to 1 (opaque)
         if output_pixel_vals:
             channel_defaults = [0, 0, 0, 255]
         else:
-            channel_defaults = [0., 0., 0., 1.]
+            channel_defaults = [0.0, 0.0, 0.0, 1.0]
 
         for i in range(4):
             if X[i] is None:
@@ -94,17 +101,21 @@
                 # dtype
                 dt = out_array.dtype
                 if output_pixel_vals:
-                    dt = 'uint8'
-                out_array[:, :, i] = numpy.zeros(
-                    out_shape,
-                    dtype=dt
-                ) + channel_defaults[i]
+                    dt = "uint8"
+                out_array[:, :, i] = (
+                    numpy.zeros(out_shape, dtype=dt) + channel_defaults[i]
+                )
             else:
                 # use a recurrent call to compute the channel and store it
                 # in the output
                 out_array[:, :, i] = tile_raster_images(
-                    X[i], img_shape, tile_shape, tile_spacing,
-                    scale_rows_to_unit_interval, output_pixel_vals)
+                    X[i],
+                    img_shape,
+                    tile_shape,
+                    tile_spacing,
+                    scale_rows_to_unit_interval,
+                    output_pixel_vals,
+                )
         return out_array
 
     else:
@@ -115,7 +126,7 @@
         # generate a matrix to store the output
         dt = X.dtype
         if output_pixel_vals:
-            dt = 'uint8'
+            dt = "uint8"
         out_array = numpy.zeros(out_shape, dtype=dt)
 
         for tile_row in range(tile_shape[0]):
@@ -127,7 +138,8 @@
                         # do this by calling the `scale_to_unit_interval`
                         # function
                         this_img = scale_to_unit_interval(
-                            this_x.reshape(img_shape))
+                            this_x.reshape(img_shape)
+                        )
                     else:
                         this_img = this_x.reshape(img_shape)
                     # add the slice to the corresponding position in the
@@ -136,7 +148,7 @@
                     if output_pixel_vals:
                         c = 255
                     out_array[
-                        tile_row * (H + Hs): tile_row * (H + Hs) + H,
-                        tile_col * (W + Ws): tile_col * (W + Ws) + W
-                    ] = this_img * c
+                        tile_row * (H + Hs) : tile_row * (H + Hs) + H,
+                        tile_col * (W + Ws) : tile_col * (W + Ws) + W,
+                    ] = (this_img * c)
         return out_array
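For reference, a small usage sketch of the function above; the data is hypothetical, and the import assumes the caller sits next to utils.py:

    import numpy
    from utils import tile_raster_images

    # 100 flattened 28x28 "images", one per row
    X = numpy.random.rand(100, 28 * 28)
    tiled = tile_raster_images(X, img_shape=(28, 28),
                               tile_shape=(10, 10), tile_spacing=(1, 1))
    # per the docstring, output shape is ((28+1)*10-1, (28+1)*10-1)
    assert tiled.shape == (289, 289) and tiled.dtype == numpy.uint8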
diff --git a/doc/en/docs/software_stack.md b/doc/en/docs/software_stack.md
index c60b6a5..141f4ab 100644
--- a/doc/en/docs/software_stack.md
+++ b/doc/en/docs/software_stack.md
@@ -1,3 +1,21 @@
+<!--
+    Licensed to the Apache Software Foundation (ASF) under one
+    or more contributor license agreements.  See the NOTICE file
+    distributed with this work for additional information
+    regarding copyright ownership.  The ASF licenses this file
+    to you under the Apache License, Version 2.0 (the
+    "License"); you may not use this file except in compliance
+    with the License.  You may obtain a copy of the License at
+
+      http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing,
+    software distributed under the License is distributed on an
+    "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+    KIND, either express or implied.  See the License for the
+    specific language governing permissions and limitations
+    under the License.
+-->
 # Software Stack
 
 SINGA's software stack includes three major components, namely, core, IO and
diff --git a/doc/en/downloads.md b/doc/en/downloads.md
index 1226d6a..f65222e 100644
--- a/doc/en/downloads.md
+++ b/doc/en/downloads.md
@@ -1,3 +1,21 @@
+<!--
+    Licensed to the Apache Software Foundation (ASF) under one
+    or more contributor license agreements.  See the NOTICE file
+    distributed with this work for additional information
+    regarding copyright ownership.  The ASF licenses this file
+    to you under the Apache License, Version 2.0 (the
+    "License"); you may not use this file except in compliance
+    with the License.  You may obtain a copy of the License at
+
+      http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing,
+    software distributed under the License is distributed on an
+    "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+    KIND, either express or implied.  See the License for the
+    specific language governing permissions and limitations
+    under the License.
+-->
 ## Download SINGA
 
 * To verify the downloaded tar.gz file, download the KEY and ASC files and then execute the following commands
diff --git a/doc/en/index.rst b/doc/en/index.rst
old mode 100755
new mode 100644
index a62a3e3..b92a618
--- a/doc/en/index.rst
+++ b/doc/en/index.rst
@@ -82,7 +82,7 @@
 -------------
 
 * Documentation and Python APIs are listed `here <docs.html>`_.
-* `C++ APIs <http://www.comp.nus.edu.sg/~dbsystem/singa/api/>`_ are generated by Doxygen.
+* `C++ APIs <../doxygen/html/index.html>`_ are generated by Doxygen.
 * Research publication list is available `here <http://www.comp.nus.edu.sg/~dbsystem/singa/research/publication/>`_.
 
 How to contribute
diff --git a/doc/en/releases/RELEASE_NOTES_0.1.0.md b/doc/en/releases/RELEASE_NOTES_0.1.0.md
index f0de7a5..b2d8bfb 100644
--- a/doc/en/releases/RELEASE_NOTES_0.1.0.md
+++ b/doc/en/releases/RELEASE_NOTES_0.1.0.md
@@ -1,3 +1,21 @@
+<!--
+    Licensed to the Apache Software Foundation (ASF) under one
+    or more contributor license agreements.  See the NOTICE file
+    distributed with this work for additional information
+    regarding copyright ownership.  The ASF licenses this file
+    to you under the Apache License, Version 2.0 (the
+    "License"); you may not use this file except in compliance
+    with the License.  You may obtain a copy of the License at
+
+      http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing,
+    software distributed under the License is distributed on an
+    "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+    KIND, either express or implied.  See the License for the
+    specific language governing permissions and limitations
+    under the License.
+-->
 # singa-incubating-0.1.0 Release Notes
 
 ---
diff --git a/doc/en/releases/RELEASE_NOTES_0.2.0.md b/doc/en/releases/RELEASE_NOTES_0.2.0.md
index f2133e3..d933f54 100644
--- a/doc/en/releases/RELEASE_NOTES_0.2.0.md
+++ b/doc/en/releases/RELEASE_NOTES_0.2.0.md
@@ -1,3 +1,21 @@
+<!--
+    Licensed to the Apache Software Foundation (ASF) under one
+    or more contributor license agreements.  See the NOTICE file
+    distributed with this work for additional information
+    regarding copyright ownership.  The ASF licenses this file
+    to you under the Apache License, Version 2.0 (the
+    "License"); you may not use this file except in compliance
+    with the License.  You may obtain a copy of the License at
+
+      http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing,
+    software distributed under the License is distributed on an
+    "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+    KIND, either express or implied.  See the License for the
+    specific language governing permissions and limitations
+    under the License.
+-->
 # singa-incubating-0.2.0 Release Notes
 
 ---
diff --git a/doc/en/releases/RELEASE_NOTES_0.3.0.md b/doc/en/releases/RELEASE_NOTES_0.3.0.md
index 4298aa6..a5fa1bb 100644
--- a/doc/en/releases/RELEASE_NOTES_0.3.0.md
+++ b/doc/en/releases/RELEASE_NOTES_0.3.0.md
@@ -1,3 +1,21 @@
+<!--
+    Licensed to the Apache Software Foundation (ASF) under one
+    or more contributor license agreements.  See the NOTICE file
+    distributed with this work for additional information
+    regarding copyright ownership.  The ASF licenses this file
+    to you under the Apache License, Version 2.0 (the
+    "License"); you may not use this file except in compliance
+    with the License.  You may obtain a copy of the License at
+
+      http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing,
+    software distributed under the License is distributed on an
+    "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+    KIND, either express or implied.  See the License for the
+    specific language governing permissions and limitations
+    under the License.
+-->
 # singa-incubating-0.3.0 Release Notes
 
 ---
diff --git a/doc/en/releases/RELEASE_NOTES_1.0.0.md b/doc/en/releases/RELEASE_NOTES_1.0.0.md
index dde2c63..b45e600 100644
--- a/doc/en/releases/RELEASE_NOTES_1.0.0.md
+++ b/doc/en/releases/RELEASE_NOTES_1.0.0.md
@@ -1,3 +1,21 @@
+<!--
+    Licensed to the Apache Software Foundation (ASF) under one
+    or more contributor license agreements.  See the NOTICE file
+    distributed with this work for additional information
+    regarding copyright ownership.  The ASF licenses this file
+    to you under the Apache License, Version 2.0 (the
+    "License"); you may not use this file except in compliance
+    with the License.  You may obtain a copy of the License at
+
+      http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing,
+    software distributed under the License is distributed on an
+    "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+    KIND, either express or implied.  See the License for the
+    specific language governing permissions and limitations
+    under the License.
+-->
 # singa-incubating-1.0.0 Release Notes
 
 ---
diff --git a/doc/en/releases/RELEASE_NOTES_1.1.0.md b/doc/en/releases/RELEASE_NOTES_1.1.0.md
index 75d086d..fffe6b0 100644
--- a/doc/en/releases/RELEASE_NOTES_1.1.0.md
+++ b/doc/en/releases/RELEASE_NOTES_1.1.0.md
@@ -1,3 +1,21 @@
+<!--
+    Licensed to the Apache Software Foundation (ASF) under one
+    or more contributor license agreements.  See the NOTICE file
+    distributed with this work for additional information
+    regarding copyright ownership.  The ASF licenses this file
+    to you under the Apache License, Version 2.0 (the
+    "License"); you may not use this file except in compliance
+    with the License.  You may obtain a copy of the License at
+
+      http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing,
+    software distributed under the License is distributed on an
+    "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+    KIND, either express or implied.  See the License for the
+    specific language governing permissions and limitations
+    under the License.
+-->
 # singa-incubating-1.1.0 Release Notes
 
 ---
diff --git a/doc/en/releases/RELEASE_NOTES_1.2.0.md b/doc/en/releases/RELEASE_NOTES_1.2.0.md
index ca4ace6..2c7a134 100644
--- a/doc/en/releases/RELEASE_NOTES_1.2.0.md
+++ b/doc/en/releases/RELEASE_NOTES_1.2.0.md
@@ -1,3 +1,21 @@
+<!--
+    Licensed to the Apache Software Foundation (ASF) under one
+    or more contributor license agreements.  See the NOTICE file
+    distributed with this work for additional information
+    regarding copyright ownership.  The ASF licenses this file
+    to you under the Apache License, Version 2.0 (the
+    "License"); you may not use this file except in compliance
+    with the License.  You may obtain a copy of the License at
+
+      http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing,
+    software distributed under the License is distributed on an
+    "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+    KIND, either express or implied.  See the License for the
+    specific language governing permissions and limitations
+    under the License.
+-->
 # singa-incubating-1.2.0 Release Notes
 
 ---
diff --git a/doc/en/releases/RELEASE_NOTES_2.0.0.md b/doc/en/releases/RELEASE_NOTES_2.0.0.md
new file mode 100644
index 0000000..3a9159c
--- /dev/null
+++ b/doc/en/releases/RELEASE_NOTES_2.0.0.md
@@ -0,0 +1,45 @@
+# singa-incubating-2.0.0 Release Notes
+
+---
+
+SINGA is a general distributed deep learning platform for training big deep
+learning models over large datasets.
+
+This release includes the following features:
+
+  * Core components
+    * [SINGA-434] Support tensor broadcasting
+    * [SINGA-370] Improvement to tensor reshape and various misc. changes related to SINGA-341 and 351
+
+  * Model components
+    * [SINGA-333] Add support for Open Neural Network Exchange (ONNX) format
+    * [SINGA-385] Add new python module for optimizers
+    * [SINGA-394] Improve the CPP operations via Intel MKL DNN lib
+    * [SINGA-425] Add 3 operators, Abs(), Exp() and leakyrelu(), for Autograd
+    * [SINGA-410] Add two functions, set_params() and get_params(), for the Autograd Layer class
+    * [SINGA-383] Add Separable Convolution for autograd
+    * [SINGA-388] Develop some RNN layers by calling tiny operations like matmul, addbias.
+    * [SINGA-382] Implement concat operation for autograd    
+    * [SINGA-378] Implement maxpooling operation and its related functions for autograd
+    * [SINGA-379] Implement batchnorm operation and its related functions for autograd
+
+  * Utility functions and CI
+    * [SINGA-432] Update dependent lib versions in conda-build config
+    * [SINGA-429] Update docker images for latest cuda and cudnn
+    * [SINGA-428] Move Docker images under Apache user name
+
+  * Documentation and usability
+    * [SINGA-395] Add documentation for autograd APIs
+    * [SINGA-344] Add a GAN example
+    * [SINGA-390] Update installation.md
+    * [SINGA-384] Implement ResNet using autograd API
+    * [SINGA-352] Complete SINGA documentation in Chinese version
+      
+
+  * Bugs fixed
+    * [SINGA-431] Unit Test failed - Tensor Transpose
+    * [SINGA-422] ModuleNotFoundError: No module named "_singa_wrap"
+    * [SINGA-418] Unsupported type 'long' in python3
+    * [SINGA-409] Basic `singa-cpu` import throws error
+    * [SINGA-408] Unsupported function definition in python3
+    * [SINGA-380] Fix bugs from Reshape
diff --git a/doc/notebook/index.ipynb b/doc/notebook/index.ipynb
index f4e1e49..f96fe66 100644
--- a/doc/notebook/index.ipynb
+++ b/doc/notebook/index.ipynb
@@ -4,6 +4,13 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
+    "Licensed to the Apache Software Foundation (ASF) under one or more contributor license agreements; and to You under the Apache License, Version 2.0. "
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
     "![Apache Singa](http://singa.apache.org/en/_static/singa.png)"
    ]
   },
diff --git a/doc/zh/community/source-repository.md b/doc/zh/community/source-repository.md
index cc4cb9c..d5e7de0 100644
--- a/doc/zh/community/source-repository.md
+++ b/doc/zh/community/source-repository.md
@@ -1,3 +1,21 @@
+<!--
+    Licensed to the Apache Software Foundation (ASF) under one
+    or more contributor license agreements.  See the NOTICE file
+    distributed with this work for additional information
+    regarding copyright ownership.  The ASF licenses this file
+    to you under the Apache License, Version 2.0 (the
+    "License"); you may not use this file except in compliance
+    with the License.  You may obtain a copy of the License at
+
+      http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing,
+    software distributed under the License is distributed on an
+    "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+    KIND, either express or implied.  See the License for the
+    specific language governing permissions and limitations
+    under the License.
+-->
 # 源代码库
 
 ___
diff --git a/doc/zh/docs/data.md b/doc/zh/docs/data.md
index f60f7fc..93d5043 100644
--- a/doc/zh/docs/data.md
+++ b/doc/zh/docs/data.md
@@ -1,3 +1,21 @@
+<!--
+    Licensed to the Apache Software Foundation (ASF) under one
+    or more contributor license agreements.  See the NOTICE file
+    distributed with this work for additional information
+    regarding copyright ownership.  The ASF licenses this file
+    to you under the Apache License, Version 2.0 (the
+    "License"); you may not use this file except in compliance
+    with the License.  You may obtain a copy of the License at
+
+      http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing,
+    software distributed under the License is distributed on an
+    "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+    KIND, either express or implied.  See the License for the
+    specific language governing permissions and limitations
+    under the License.
+-->
 # 数据(Data)
 
 这个模块包含加载和预获取批数据的类。
diff --git a/doc/zh/docs/image_tool.md b/doc/zh/docs/image_tool.md
index fa7c425..3e78924 100644
--- a/doc/zh/docs/image_tool.md
+++ b/doc/zh/docs/image_tool.md
@@ -1,3 +1,21 @@
+<!--
+    Licensed to the Apache Software Foundation (ASF) under one
+    or more contributor license agreements.  See the NOTICE file
+    distributed with this work for additional information
+    regarding copyright ownership.  The ASF licenses this file
+    to you under the Apache License, Version 2.0 (the
+    "License"); you may not use this file except in compliance
+    with the License.  You may obtain a copy of the License at
+
+      http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing,
+    software distributed under the License is distributed on an
+    "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+    KIND, either express or implied.  See the License for the
+    specific language governing permissions and limitations
+    under the License.
+-->
 # 图像工具
 
 图像增强的模型。
diff --git a/doc/zh/docs/initializer.md b/doc/zh/docs/initializer.md
index b2686cf..91ddb47 100644
--- a/doc/zh/docs/initializer.md
+++ b/doc/zh/docs/initializer.md
@@ -1,3 +1,21 @@
+<!--
+    Licensed to the Apache Software Foundation (ASF) under one
+    or more contributor license agreements.  See the NOTICE file
+    distributed with this work for additional information
+    regarding copyright ownership.  The ASF licenses this file
+    to you under the Apache License, Version 2.0 (the
+    "License"); you may not use this file except in compliance
+    with the License.  You may obtain a copy of the License at
+
+      http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing,
+    software distributed under the License is distributed on an
+    "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+    KIND, either express or implied.  See the License for the
+    specific language governing permissions and limitations
+    under the License.
+-->
 # 初始化器(Initializer)
 
 ## Python API
diff --git a/doc/zh/docs/layer.md b/doc/zh/docs/layer.md
index db7d712..4ca3ba1 100644
--- a/doc/zh/docs/layer.md
+++ b/doc/zh/docs/layer.md
@@ -1,3 +1,21 @@
+<!--
+    Licensed to the Apache Software Foundation (ASF) under one
+    or more contributor license agreements.  See the NOTICE file
+    distributed with this work for additional information
+    regarding copyright ownership.  The ASF licenses this file
+    to you under the Apache License, Version 2.0 (the
+    "License"); you may not use this file except in compliance
+    with the License.  You may obtain a copy of the License at
+
+      http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing,
+    software distributed under the License is distributed on an
+    "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+    KIND, either express or implied.  See the License for the
+    specific language governing permissions and limitations
+    under the License.
+-->
 # 层(Layer)
 
 ## Python API
diff --git a/doc/zh/docs/optimizer.md b/doc/zh/docs/optimizer.md
index 6657e41..2913123 100644
--- a/doc/zh/docs/optimizer.md
+++ b/doc/zh/docs/optimizer.md
@@ -1,3 +1,21 @@
+<!--
+    Licensed to the Apache Software Foundation (ASF) under one
+    or more contributor license agreements.  See the NOTICE file
+    distributed with this work for additional information
+    regarding copyright ownership.  The ASF licenses this file
+    to you under the Apache License, Version 2.0 (the
+    "License"); you may not use this file except in compliance
+    with the License.  You may obtain a copy of the License at
+
+      http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing,
+    software distributed under the License is distributed on an
+    "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+    KIND, either express or implied.  See the License for the
+    specific language governing permissions and limitations
+    under the License.
+-->
 # 优化器(Optimizer)
 
 这个模块包含一系列用于模型参数更新的优化器。
diff --git a/doc/zh/docs/snapshot.md b/doc/zh/docs/snapshot.md
index 9978ca1..5401e35 100644
--- a/doc/zh/docs/snapshot.md
+++ b/doc/zh/docs/snapshot.md
@@ -1,3 +1,21 @@
+<!--
+    Licensed to the Apache Software Foundation (ASF) under one
+    or more contributor license agreements.  See the NOTICE file
+    distributed with this work for additional information
+    regarding copyright ownership.  The ASF licenses this file
+    to you under the Apache License, Version 2.0 (the
+    "License"); you may not use this file except in compliance
+    with the License.  You may obtain a copy of the License at
+
+      http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing,
+    software distributed under the License is distributed on an
+    "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+    KIND, either express or implied.  See the License for the
+    specific language governing permissions and limitations
+    under the License.
+-->
 # Snapshot
 
 此模块包含io::snapshot类及其方法。
diff --git a/doc/zh/docs/utils.md b/doc/zh/docs/utils.md
index 41a60d1..17df7ad 100644
--- a/doc/zh/docs/utils.md
+++ b/doc/zh/docs/utils.md
@@ -1,3 +1,21 @@
+<!--
+    Licensed to the Apache Software Foundation (ASF) under one
+    or more contributor license agreements.  See the NOTICE file
+    distributed with this work for additional information
+    regarding copyright ownership.  The ASF licenses this file
+    to you under the Apache License, Version 2.0 (the
+    "License"); you may not use this file except in compliance
+    with the License.  You may obtain a copy of the License at
+
+      http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing,
+    software distributed under the License is distributed on an
+    "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+    KIND, either express or implied.  See the License for the
+    specific language governing permissions and limitations
+    under the License.
+-->
 # Utils
 
 ---
diff --git a/doc/zh/downloads.md b/doc/zh/downloads.md
index 819fd34..6018dc3 100644
--- a/doc/zh/downloads.md
+++ b/doc/zh/downloads.md
@@ -1,3 +1,21 @@
+<!--
+    Licensed to the Apache Software Foundation (ASF) under one
+    or more contributor license agreements.  See the NOTICE file
+    distributed with this work for additional information
+    regarding copyright ownership.  The ASF licenses this file
+    to you under the Apache License, Version 2.0 (the
+    "License"); you may not use this file except in compliance
+    with the License.  You may obtain a copy of the License at
+
+      http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing,
+    software distributed under the License is distributed on an
+    "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+    KIND, either express or implied.  See the License for the
+    specific language governing permissions and limitations
+    under the License.
+-->
 ## 下载 SINGA
 
 * 要验证下载的tar.gz文件,请下载KEY和ASC文件,然后执行以下命令
diff --git a/doc/zh/index.rst b/doc/zh/index.rst
index aec5343..5a069e4 100644
--- a/doc/zh/index.rst
+++ b/doc/zh/index.rst
@@ -75,7 +75,7 @@
 -------------
 
 * `这里 <docs.html>`_ 列出了文档和 Python API.
-* `C++ APIs <http://www.comp.nus.edu.sg/~dbsystem/singa/api/>`_ 由 Doxygen 生成.
+* `C++ APIs <../doxygen/html/index.html>`_ 由 Doxygen 生成.
 * 研究出版物清单可在 `此处 <http://www.comp.nus.edu.sg/~dbsystem/singa/research/publication/>`_ 查阅.
 
 如何贡献
diff --git a/examples/autograd/mlp.py b/examples/autograd/mlp.py
index dfc67b3..e2bc7ac 100755
--- a/examples/autograd/mlp.py
+++ b/examples/autograd/mlp.py
@@ -24,7 +24,7 @@
 import numpy as np
 
 
-if __name__ == '__main__':
+if __name__ == "__main__":
 
     autograd.training = True
 
@@ -32,7 +32,7 @@
 
     # generate the boundary
     f = lambda x: (5 * x + 1)
-    bd_x = np.linspace(-1., 1, 200)
+    bd_x = np.linspace(-1.0, 1, 200)
     bd_y = f(bd_x)
     # generate the training data
     x = np.random.uniform(-1, 1, 400)
@@ -42,7 +42,7 @@
     data = np.array([[a, b] for (a, b) in zip(x, y)], dtype=np.float32)
 
     def to_categorical(y, num_classes):
-        '''
+        """
         Converts a class vector (integers) to binary class matrix.
 
         Args
@@ -52,16 +52,16 @@
 
         Return
             A binary matrix representation of the input.
-        '''
-        y = np.array(y, dtype='int')
+        """
+        y = np.array(y, dtype="int")
         n = y.shape[0]
         categorical = np.zeros((n, num_classes))
         categorical[np.arange(n), y] = 1
         return categorical
 
     label = to_categorical(label, 2).astype(np.float32)
-    print('train_data_shape:', data.shape)
-    print('train_label_shape:', label.shape)
+    print("train_data_shape:", data.shape)
+    print("train_label_shape:", label.shape)
 
     inputs = Tensor(data=data)
     target = Tensor(data=label)
@@ -87,7 +87,7 @@
         x = autograd.softmax(x)
         loss = autograd.cross_entropy(x, target)
         for p, gp in autograd.backward(loss):
-            sgd.apply(0, gp, p, '')
+            sgd.apply(0, gp, p, "")
 
-        if (i % 100 == 0):
-            print('training loss = ', tensor.to_numpy(loss)[0])
+        if i % 100 == 0:
+            print("training loss = ", tensor.to_numpy(loss)[0])
diff --git a/examples/autograd/mnist_cnn.py b/examples/autograd/mnist_cnn.py
index 62ae5b2..99ef49c 100755
--- a/examples/autograd/mnist_cnn.py
+++ b/examples/autograd/mnist_cnn.py
@@ -29,14 +29,14 @@
 
 def load_data(path):
     f = np.load(path)
-    x_train, y_train = f['x_train'], f['y_train']
-    x_test, y_test = f['x_test'], f['y_test']
+    x_train, y_train = f["x_train"], f["y_train"]
+    x_test, y_test = f["x_test"], f["y_test"]
     f.close()
     return (x_train, y_train), (x_test, y_test)
 
 
 def to_categorical(y, num_classes):
-    '''
+    """
     Converts a class vector (integers) to binary class matrix.
 
     Args
@@ -46,8 +46,8 @@
 
     Return
         A binary matrix representation of the input.
-    '''
-    y = np.array(y, dtype='int')
+    """
+    y = np.array(y, dtype="int")
     n = y.shape[0]
     categorical = np.zeros((n, num_classes))
     categorical[np.arange(n), y] = 1
@@ -66,24 +66,25 @@
     y = np.argmax(pred, axis=1)
     t = np.argmax(target, axis=1)
     a = y == t
-    return np.array(a, 'int').sum() / float(len(t))
+    return np.array(a, "int").sum() / float(len(t))
 
 
-if __name__ == '__main__':
+if __name__ == "__main__":
 
-    parser = argparse.ArgumentParser(description='Train CNN over MNIST')
-    parser.add_argument('file_path', type=str, help='the dataset path')
-    parser.add_argument('--use_cpu', action='store_true')
+    parser = argparse.ArgumentParser(description="Train CNN over MNIST")
+    parser.add_argument("file_path", type=str, help="the dataset path")
+    parser.add_argument("--use_cpu", action="store_true")
     args = parser.parse_args()
 
-    assert os.path.exists(args.file_path), \
-        'Pls download the MNIST dataset from https://s3.amazonaws.com/img-datasets/mnist.npz'
+    assert os.path.exists(
+        args.file_path
+    ), "Pls download the MNIST dataset from https://s3.amazonaws.com/img-datasets/mnist.npz"
 
     if args.use_cpu:
-        print('Using CPU')
+        print("Using CPU")
         dev = device.get_default_device()
     else:
-        print('Using GPU')
+        print("Using GPU")
         dev = device.create_cuda_gpu()
 
     train, test = load_data(args.file_path)
@@ -99,10 +100,10 @@
 
     x_test = preprocess(test[0])
     y_test = to_categorical(test[1], num_classes)
-    print('the shape of training data is', x_train.shape)
-    print('the shape of training label is', y_train.shape)
-    print('the shape of testing data is', x_test.shape)
-    print('the shape of testing label is', y_test.shape)
+    print("the shape of training data is", x_train.shape)
+    print("the shape of training label is", y_train.shape)
+    print("the shape of testing data is", x_test.shape)
+    print("the shape of testing label is", y_test.shape)
 
     # operations initialization
     conv1 = autograd.Conv2d(1, 32, 3, padding=1, bias=False)
@@ -134,18 +135,30 @@
     autograd.training = True
     for epoch in range(epochs):
         for i in range(batch_number):
-            inputs = tensor.Tensor(device=dev, data=x_train[
-                                   i * 100:(1 + i) * 100], stores_grad=False)
-            targets = tensor.Tensor(device=dev, data=y_train[
-                                    i * 100:(1 + i) * 100], requires_grad=False, stores_grad=False)
+            inputs = tensor.Tensor(
+                device=dev,
+                data=x_train[i * 100 : (1 + i) * 100],
+                stores_grad=False,
+            )
+            targets = tensor.Tensor(
+                device=dev,
+                data=y_train[i * 100 : (1 + i) * 100],
+                requires_grad=False,
+                stores_grad=False,
+            )
 
             loss, y = forward(inputs, targets)
 
-            accuracy_rate = accuracy(tensor.to_numpy(y),
-                                     tensor.to_numpy(targets))
-            if (i % 5 == 0):
-                print('accuracy is:', accuracy_rate, 'loss is:',
-                      tensor.to_numpy(loss)[0])
+            accuracy_rate = accuracy(
+                tensor.to_numpy(y), tensor.to_numpy(targets)
+            )
+            if i % 5 == 0:
+                print(
+                    "accuracy is:",
+                    accuracy_rate,
+                    "loss is:",
+                    tensor.to_numpy(loss)[0],
+                )
 
             for p, gp in autograd.backward(loss):
                 sgd.update(p, gp)
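
The script above asserts that mnist.npz is already on disk. A minimal fetch sketch, assuming network access and using the URL named in the assertion (not part of this PR):

    import os
    import urllib.request

    URL = "https://s3.amazonaws.com/img-datasets/mnist.npz"
    if not os.path.exists("mnist.npz"):
        # downloads the archive once; load_data("mnist.npz") can then be called
        urllib.request.urlretrieve(URL, "mnist.npz")
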
diff --git a/examples/autograd/xceptionnet.py b/examples/autograd/xceptionnet.py
index f52a8ac..22070a9 100755
--- a/examples/autograd/xceptionnet.py
+++ b/examples/autograd/xceptionnet.py
@@ -1,3 +1,20 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# =============================================================================
+
 from singa import autograd
 from singa import tensor
 from singa import device
diff --git a/examples/imagenet/inception/convert.py b/examples/imagenet/inception/convert.py
index 6406b70..973debd 100644
--- a/examples/imagenet/inception/convert.py
+++ b/examples/imagenet/inception/convert.py
@@ -1,17 +1,3 @@
-# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
 """Converting tensorflow checkpoint file to key-val pkl file."""
 from __future__ import absolute_import
 from __future__ import division
@@ -29,74 +15,80 @@
 
 try:
     import cPickle as pickle
-except ModuleNotFoundError:
+except ImportError:
     import pickle
 
 FLAGS = None
 
 
 def rename(name, suffix):
-    p = name.rfind('/')
+    p = name.rfind("/")
     if p == -1:
-        print('Bad name=%s' % name)
-    return name[0:p+1] + suffix
+        print("Bad name=%s" % name)
+    return name[0 : p + 1] + suffix
 
 
 def convert(model, file_name):
-    if model == 'v3':
+    if model == "v3":
         net, _ = inception_v3.create_net()
     else:
         net, _ = inception_v4.create_net()
-    params = {'SINGA_VERSION': 1101}
+    params = {"SINGA_VERSION": 1101}
     try:
         reader = pywrap_tensorflow.NewCheckpointReader(file_name)
         for pname, pval in zip(net.param_names(), net.param_values()):
-            if 'weight' in pname:
-                val = reader.get_tensor(rename(pname, 'weights'))
-                if 'Conv' in pname:
+            if "weight" in pname:
+                val = reader.get_tensor(rename(pname, "weights"))
+                if "Conv" in pname:
                     val = val.transpose((3, 2, 0, 1))
                     val = val.reshape((val.shape[0], -1))
-            elif 'bias' in pname:
-                val = reader.get_tensor(rename(pname, 'biases'))
-            elif 'mean' in pname:
-                val = reader.get_tensor(rename(pname, 'moving_mean'))
-            elif 'var' in pname:
-                val = reader.get_tensor(rename(pname, 'moving_variance'))
-            elif 'beta' in pname:
-                val= reader.get_tensor(pname)
-            elif 'gamma' in pname:
+            elif "bias" in pname:
+                val = reader.get_tensor(rename(pname, "biases"))
+            elif "mean" in pname:
+                val = reader.get_tensor(rename(pname, "moving_mean"))
+            elif "var" in pname:
+                val = reader.get_tensor(rename(pname, "moving_variance"))
+            elif "beta" in pname:
+                val = reader.get_tensor(pname)
+            elif "gamma" in pname:
                 val = np.ones(pval.shape)
             else:
-                print('not matched param %s' % pname)
-            assert val.shape == pval.shape, ('the shapes not match ',
-                    val.shape, pval.shape)
+                print("not matched param %s" % pname)
+            assert val.shape == pval.shape, (
+                "the shapes not match ",
+                val.shape,
+                pval.shape,
+            )
             params[pname] = val.astype(np.float32)
-            print('converting:', pname, pval.shape)
+            print("converting:", pname, pval.shape)
         var_to_shape_map = reader.get_variable_to_shape_map()
         for key in var_to_shape_map:
-            if 'weights' in key:
-                key = rename(key, 'weight')
-            elif 'biases' in key:
-                key = rename(key, 'bias')
-            elif 'moving_mean' in key:
-                key = rename(key, 'mean')
-            elif 'moving_variance' in key:
-                key = rename(key, 'var')
+            if "weights" in key:
+                key = rename(key, "weight")
+            elif "biases" in key:
+                key = rename(key, "bias")
+            elif "moving_mean" in key:
+                key = rename(key, "mean")
+            elif "moving_variance" in key:
+                key = rename(key, "var")
             if key not in params:
-                print('key=%s not in the net' % key)
-        '''
+                print("key=%s not in the net" % key)
+        """
         for key in var_to_shape_map:
             print("tensor_name: ", key, var_to_shape_map[key])
-        '''
-        with open(os.path.splitext(file_name)[0] + '.pickle', 'wb') as fd:
+        """
+        with open(os.path.splitext(file_name)[0] + ".pickle", "wb") as fd:
             pickle.dump(params, fd)
     except Exception as e:  # pylint: disable=broad-except
         print(str(e))
         if "corrupted compressed block contents" in str(e):
-            print("It's likely that your checkpoint file has been compressed "
-                    "with SNAPPY.")
-        if ("Data loss" in str(e) and
-            (any([e in file_name for e in [".index", ".meta", ".data"]]))):
+            print(
+                "It's likely that your checkpoint file has been compressed "
+                "with SNAPPY."
+            )
+        if "Data loss" in str(e) and (
+            any([e in file_name for e in [".index", ".meta", ".data"]])
+        ):
             proposed_file = ".".join(file_name.split(".")[0:-1])
             v2_file_error_template = """
     It's likely that this is a V2 checkpoint and you need to provide
@@ -105,7 +97,6 @@
         print(v2_file_error_template.format(proposed_file))
 
 
-
 def main(unused_argv):
     if not FLAGS.file_name:
         print("Usage: convert.py --file_name=checkpoint_file_name ")
@@ -117,7 +108,7 @@
 if __name__ == "__main__":
     parser = argparse.ArgumentParser()
     parser.register("type", "bool", lambda v: v.lower() == "true")
-    parser.add_argument("model", choices=['v3', 'v4'], help="inception version")
+    parser.add_argument("model", choices=["v3", "v4"], help="inception version")
     parser.add_argument("file_name", help="Checkpoint path")
     FLAGS, unparsed = parser.parse_known_args()
     app.run(main=main, argv=[sys.argv[0]] + unparsed)
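
For clarity on the conversion above: rename only swaps the last path component of a TensorFlow variable name, which is how SINGA's 'weight' maps onto TF's 'weights' and so on. A quick illustration (the variable name is hypothetical; the bad-name warning is omitted):

    def rename(name, suffix):
        # same logic as convert.py's rename()
        p = name.rfind("/")
        return name[0 : p + 1] + suffix

    assert rename("InceptionV3/Conv2d_1a/weight", "weights") == \
        "InceptionV3/Conv2d_1a/weights"
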
diff --git a/examples/onnx/backend.py b/examples/onnx/backend.py
new file mode 100644
index 0000000..1420786
--- /dev/null
+++ b/examples/onnx/backend.py
@@ -0,0 +1,46 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+
+
+# load and run the onnx model exported from pytorch
+# https://github.com/onnx/tutorials/blob/master/tutorials/PytorchOnnxExport.ipynb
+
+
+import argparse
+from singa import device
+from singa import sonnx
+from singa import tensor
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description="Load model from pytorch")
+    parser.add_argument("--use_cpu", action="store_true")
+    args = parser.parse_args()
+    if args.use_cpu:
+        print("Using CPU")
+        dev = device.get_default_device()
+    else:
+        print("Using GPU")
+        dev = device.create_cuda_gpu()
+    model = sonnx.load("alexnet.onnx")
+    backend = sonnx.prepare(model, dev)
+    input_name = model.graph.inputs[0].name
+    inputs = tensor.Tensor(shape=(2, 3, 224, 224), device=dev, name=input_name)
+    inputs.gaussian(0, 0.01)
+    y = backend.run([inputs])[0]
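
backend.py assumes an alexnet.onnx file in the working directory. Following the PyTorch tutorial linked in the header comment, one plausible way to produce it (a sketch assuming torch and torchvision are installed; not part of this PR):

    import torch
    import torchvision

    model = torchvision.models.alexnet(pretrained=True)
    dummy = torch.randn(2, 3, 224, 224)  # matches the (2, 3, 224, 224) input above
    torch.onnx.export(model, dummy, "alexnet.onnx")
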
diff --git a/examples/onnx/cnn.py b/examples/onnx/cnn.py
new file mode 100755
index 0000000..ce3bcbe
--- /dev/null
+++ b/examples/onnx/cnn.py
@@ -0,0 +1,196 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+
+"""
+1. Export singa model to onnx
+2. Load onnx model and run it via singa backend
+"""
+
+import numpy as np
+import os
+
+from singa import device
+from singa import tensor
+from singa import autograd
+from singa import opt
+from singa import sonnx
+
+
+def load_data(path):
+    f = np.load(path)
+    x_train, y_train = f["x_train"], f["y_train"]
+    x_test, y_test = f["x_test"], f["y_test"]
+    f.close()
+    return (x_train, y_train), (x_test, y_test)
+
+
+def to_categorical(y, num_classes):
+    """
+    Converts a class vector (integers) to binary class matrix.
+    Args
+        y: class vector to be converted into a matrix
+            (integers from 0 to num_classes).
+        num_classes: total number of classes.
+    Return
+        A binary matrix representation of the input.
+    """
+    y = np.array(y, dtype="int")
+    n = y.shape[0]
+    categorical = np.zeros((n, num_classes))
+    categorical[np.arange(n), y] = 1
+    categorical = categorical.astype(np.float32)
+    return categorical
+
+
+def preprocess(data):
+    data = data.astype(np.float32)
+    data /= 255
+    data = np.expand_dims(data, axis=1)
+    return data
+
+
+def common(use_cpu):
+    file_path = "mnist.npz"
+    assert os.path.exists(
+        file_path
+    ), "Pls download the MNIST dataset from https://s3.amazonaws.com/img-datasets/mnist.npz"
+    if use_cpu:
+        print("Using CPU")
+        dev = device.get_default_device()
+    else:
+        print("Using GPU")
+        dev = device.create_cuda_gpu()
+
+    train, test = load_data(file_path)
+    print(train[0].shape)
+    x_train = preprocess(train[0])
+    y_train = to_categorical(train[1], 10)
+
+    x_test = preprocess(test[0])
+    y_test = to_categorical(test[1], 10)
+    print("the shape of training data is", x_train.shape)
+    print("the shape of training label is", y_train.shape)
+    print("the shape of testing data is", x_test.shape)
+    print("the shape of testing label is", y_test.shape)
+    return (x_train, y_train), (x_test, y_test), dev
+
+
+def accuracy(pred, target):
+    y = np.argmax(pred, axis=1)
+    t = np.argmax(target, axis=1)
+    a = y == t
+    return np.array(a, "int").sum() / float(len(t))
+
+
+def singa_to_onnx(epochs, use_cpu=False, batchsize=32):
+    sgd = opt.SGD(lr=0.1)
+
+    # operations initialization
+    conv1 = autograd.Conv2d(1, 8, 3, 2, padding=1)  # 28 -> 14
+    conv2 = autograd.Conv2d(8, 4, 3, 2, padding=1)  # 14 -> 7
+    pooling = autograd.MaxPool2d(3, 2, padding=1)  # 7 -> 4
+    linear = autograd.Linear(64, 10)
+
+    def forward(x, t):
+        y = conv1(x)
+        y = autograd.relu(y)
+        y = conv2(y)
+        y = autograd.relu(y)
+        y = pooling(y)
+        y = autograd.flatten(y)
+        y = linear(y)
+        loss = autograd.softmax_cross_entropy(y, t)
+        return loss, y
+
+    autograd.training = True
+    (x_train, y_train), (x_test, y_test), dev = common(use_cpu)
+
+    niter = 1  # truncated for this demo; a full epoch is x_train.shape[0] // batchsize
+    for epoch in range(epochs):
+        accuracy_rate = 0.0
+        loss_rate = 0.0
+        for i in range(niter):
+            inputs = tensor.Tensor(
+                device=dev,
+                data=x_train[i * batchsize : (i + 1) * batchsize],
+                stores_grad=False,
+                name="input",
+            )
+            targets = tensor.Tensor(
+                device=dev,
+                data=y_train[i * batchsize : (i + 1) * batchsize],
+                requires_grad=False,
+                stores_grad=False,
+                name="target",
+            )
+            loss, y = forward(inputs, targets)
+            accuracy_rate += accuracy(
+                tensor.to_numpy(y), y_train[i * batchsize : (i + 1) * batchsize]
+            )
+            loss_rate += tensor.to_numpy(loss)[0]
+            for p, gp in autograd.backward(loss):
+                sgd.update(p, gp)
+        print( "accuracy is {}, loss is {}".format( accuracy_rate / niter, loss_rate / niter))
+    model = sonnx.to_onnx_model([inputs], [y])
+    sonnx.save(model, "cnn.onnx")
+
+
+def onnx_to_singa(epochs, use_cpu=False, batchsize=32):
+    (x_train, y_train), (x_test, y_test), dev = common(use_cpu)
+    model = sonnx.load("cnn.onnx")
+    backend = sonnx.prepare(model, dev)
+    autograd.training = True
+    sgd = opt.SGD(lr=0.01)
+    niter = x_train.shape[0] // batchsize
+    for epoch in range(epochs):
+        accuracy_rate = 0.0
+        loss_rate = 0.0
+        for i in range(niter):
+            inputs = tensor.Tensor(
+                device=dev,
+                data=x_train[i * batchsize : (i + 1) * batchsize],
+                stores_grad=False,
+                name="input",
+            )
+            targets = tensor.Tensor(
+                device=dev,
+                data=y_train[i * batchsize : (i + 1) * batchsize],
+                requires_grad=False,
+                stores_grad=False,
+                name="target",
+            )
+            y = backend.run([inputs])[0]
+            loss = autograd.softmax_cross_entropy(y, targets)
+
+            accuracy_rate += accuracy(
+                tensor.to_numpy(y), y_train[i * batchsize : (i + 1) * batchsize]
+            )
+            loss_rate += tensor.to_numpy(loss)[0]
+
+            for p, gp in autograd.backward(loss):
+                sgd.update(p, gp)
+
+        print("accuracy is {}, loss is {}".format(accuracy_rate / niter, loss_rate / niter))
+
+
+if __name__ == "__main__":
+    print("Train a model and convert it into onnx")
+    singa_to_onnx(3, True)
+    print("Load the onnx model and continue training")
+    onnx_to_singa(3, True)
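
Once singa_to_onnx has run, the saved cnn.onnx is a regular ONNX file and can be sanity-checked with the onnx package itself, e.g. (a sketch, assuming onnx is installed):

    import onnx

    model = onnx.load("cnn.onnx")
    onnx.checker.check_model(model)  # validates the graph structure
    print(onnx.helper.printable_graph(model.graph))
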
diff --git a/examples/onnx/mlp.py b/examples/onnx/mlp.py
new file mode 100644
index 0000000..9f74d13
--- /dev/null
+++ b/examples/onnx/mlp.py
@@ -0,0 +1,164 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+
+"""
+1. Export singa model to onnx
+2. Load onnx model and run it via singa backend
+"""
+
+from singa.tensor import Tensor
+from singa import tensor
+from singa import device
+from singa import autograd
+from singa import opt
+from singa import sonnx
+import numpy as np
+
+# prepare training data in numpy array
+# generate the boundary
+f = lambda x: (5 * x + 1)
+bd_x = np.linspace(-1.0, 1, 200)
+bd_y = f(bd_x)
+# generate the training data
+x = np.random.uniform(-1, 1, 300)
+y = f(x) + 2 * np.random.randn(len(x))
+# convert training data to 2d space
+label = np.asarray([5 * a + 1 > b for (a, b) in zip(x, y)])
+data = np.array([[a, b] for (a, b) in zip(x, y)], dtype=np.float32)
+autograd.training = True
+
+
+def to_categorical(y, num_classes):
+    """
+    Converts a class vector (integers) to binary class matrix.
+
+    Args
+        y: class vector to be converted into a matrix
+            (integers from 0 to num_classes).
+        num_classes: total number of classes.
+
+    Return
+        A binary matrix representation of the input.
+    """
+    y = np.array(y, dtype="int")
+    n = y.shape[0]
+    categorical = np.zeros((n, num_classes))
+    categorical[np.arange(n), y] = 1
+    return categorical
+
+
+label = to_categorical(label, 2).astype(np.float32)
+print("train_data_shape:", data.shape)
+print("train_label_shape:", label.shape)
+
+
+def accuracy(pred, target):
+    y = np.argmax(pred, axis=1)
+    t = np.argmax(target, axis=1)
+    a = y == t
+    return np.array(a, "int").sum() / float(len(t))
+
+
+def singa_to_onnx(niter, use_cpu=False):
+    if use_cpu:
+        print("Using CPU")
+        dev = device.get_default_device()
+    else:
+        print("Using GPU")
+        dev = device.create_cuda_gpu()
+    inputs = Tensor(
+        data=data,
+        device=dev,
+        requires_grad=False,
+        stores_grad=False,
+        name="input",
+    )
+    target = Tensor(
+        data=label,
+        device=dev,
+        requires_grad=False,
+        stores_grad=False,
+        name="target",
+    )
+
+    w0 = Tensor(shape=(2, 3), device=dev, requires_grad=True, stores_grad=True)
+    w0.gaussian(0.0, 0.1)
+    b0 = Tensor(shape=(3,), device=dev, requires_grad=True, stores_grad=True)
+    b0.set_value(0.0)
+
+    w1 = Tensor(shape=(3, 2), device=dev, requires_grad=True, stores_grad=True)
+    w1.gaussian(0.0, 0.1)
+    b1 = Tensor(shape=(2,), device=dev, requires_grad=True, stores_grad=True)
+    b1.set_value(0.0)
+
+    sgd = opt.SGD(0.1)
+    # training process
+    for i in range(100):
+        x = autograd.matmul(inputs, w0)
+        x = autograd.add_bias(x, b0)
+        x = autograd.relu(x)
+        x = autograd.matmul(x, w1)
+        x = autograd.add_bias(x, b1)
+        loss = autograd.softmax_cross_entropy(x, target)
+        for p, gp in autograd.backward(loss):
+            sgd.update(p, gp)
+
+        print("training loss = ", tensor.to_numpy(loss)[0])
+    sonnx.export([inputs], [x], file_path="mlp.onnx")
+
+
+def onnx_to_singa(niter, use_cpu=False):
+    if use_cpu:
+        print("Using CPU")
+        dev = device.get_default_device()
+    else:
+        print("Using GPU")
+        dev = device.create_cuda_gpu()
+    model = sonnx.load("mlp.onnx")
+    backend = sonnx.prepare(model, device=dev)
+    sgd = opt.SGD(0.1)
+    inputs = Tensor(
+        data=data,
+        device=dev,
+        requires_grad=False,
+        stores_grad=False,
+        name="input",
+    )
+    target = Tensor(
+        data=label,
+        device=dev,
+        requires_grad=False,
+        stores_grad=False,
+        name="target",
+    )
+
+    for i in range(100):
+        y = backend.run([inputs])[0]
+        loss = autograd.softmax_cross_entropy(y, target)
+        for p, gp in autograd.backward(loss):
+            sgd.update(p, gp)
+        loss_rate = tensor.to_numpy(loss)[0]
+        accuracy_rate = accuracy(tensor.to_numpy(y), label)
+
+        print("Iter {}, accurate={}, loss={}".format(i, accuracy_rate, loss_rate))
+
+
+if __name__ == "__main__":
+    singa_to_onnx(3, True)
+    onnx_to_singa(3, True)
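
Since mlp.onnx is standard ONNX, it should also run outside SINGA. A sketch with onnxruntime, assuming that runtime is installed and that sonnx preserves the tensor name "input" assigned above:

    import onnxruntime as ort

    sess = ort.InferenceSession("mlp.onnx")
    outputs = sess.run(None, {"input": data})  # data: the (N, 2) float32 array above
    print(outputs[0].shape)
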
diff --git a/include/singa/core/common.h b/include/singa/core/common.h
index 2c6d1d8..b6500ef 100644
--- a/include/singa/core/common.h
+++ b/include/singa/core/common.h
@@ -39,6 +39,10 @@
 #include "singa/utils/opencl_utils.h"
 #endif  // USE_OPENCL
 
+#ifdef USE_MKLDNN
+#include <mkldnn.hpp>
+#endif  // USE_MKLDNN
+
 using std::atomic;
 
 namespace singa {
@@ -111,6 +115,10 @@
   long vcl_ctx_id;
 #endif
 
+#ifdef USE_MKLDNN
+  mkldnn::engine *engine;
+#endif  // USE_MKLDNN
+
 } Context;
 
 }  // namespace singa
diff --git a/include/singa/core/device.h b/include/singa/core/device.h
index 1a960d8..7a5dda1 100644
--- a/include/singa/core/device.h
+++ b/include/singa/core/device.h
@@ -39,6 +39,10 @@
 #include "singa/utils/opencl_utils.h"
 #endif // USE_OPENCL
 
+#ifdef USE_MKLDNN
+#include "singa/utils/mkldnn_utils.h"
+#endif  // USE_MKLDNN
+
 using std::vector;
 using std::string;
 using std::function;
@@ -141,7 +145,7 @@
 /// It runs cpp code.
 class CppCPU : public Device {
  public:
-  ~CppCPU() {};
+  ~CppCPU();
   CppCPU();
 
   std::shared_ptr<Device> host() const override { return defaultDevice;}
diff --git a/include/singa/core/tensor.h b/include/singa/core/tensor.h
index 905da27..ed37cdb 100755
--- a/include/singa/core/tensor.h
+++ b/include/singa/core/tensor.h
@@ -103,20 +103,21 @@
 
   bool empty() const { return nDim() == 0; }
 
-  /// Check if the tensor's last stride==1
+  /// Strides should decrease; dims with stride=0 (from broadcasting) are ignored
   bool transpose() const {
-    if (!strides_.empty()) {
-      auto last = strides_.front();
-      for (auto s : strides_) {
-        if (s > last)
+    if (!stride_.empty()) {
+      auto last = stride_.front();
+      for (auto s : stride_) {
+        if (s > last && last > 0)
           return true;
-        last = s;
+        if (s > 0)
+          last = s;
       }
     }
     return false;
   }
 
-  const vector<int>& strides() const { return strides_; }
+  const vector<int>& stride() const { return stride_; }
 
   /// Return true if the content of the tensor is initialized
   bool initailized() const {
@@ -151,30 +152,13 @@
   /// Return average L2 norm
   float L2() const;
   // --------------------------------------------------------------------------
-  // ---Following methods changes the internal members
+  // ---Following methods change the internal data
   // --------------------------------------------------------------------------
 
-  /// Reset the shape, device, and data type as given tensor.
-  /// If block size changes, then reallocate a new block.
-  /// The previous block would be deleted.
-  void ResetLike(const Tensor &t);
-
-  /// Reset the data type, it would reallocate block if type changes.
-  void AsType(const DataType type);
-
-  /// Reset the device.
-  /// If the target device is a diff device, then do deep data copy.
-  void ToDevice(std::shared_ptr<Device> dev);
-
-  /// Equivalent to ToDevice(host_dev).
-  void ToHost();
-
   /// Set each element of the tensor to be x
   template <typename SType>
   void SetValue(const SType x);
 
-  void SetShape(const Shape& shape);
-
   /// For init the tensor values, copy 'num' elements from 'src' to the internal
   /// memory with 'offset' (elements).
   template <typename SType>
@@ -242,6 +226,10 @@
   /// change the shape (and stride); the block may be reallocated.
   Tensor &Reshape(const Shape &shape);
 
+
+  /// Resize the memory and return itself
+  Tensor& Resize(const Shape& shape);
+
   /// Matrix transpose.  Valid only if shape.size() == 2.
   Tensor& T();
 
@@ -251,13 +239,32 @@
   /// Change the axes
   Tensor& Transpose(const vector<size_t> &axes);
 
+  /// Return a view of the input tensor whose shape is broadcasted to be
+  /// compatible with the given shape
+  Tensor& Broadcast(const Shape& shape);
+
+  /// Reset the shape, device, and data type as given tensor.
+  /// If block size changes, then reallocate a new block.
+  /// The previous block would be deleted.
+  Tensor& ResetLike(const Tensor &t);
+
+  /// Reset the data type, it would reallocate block if type changes.
+  Tensor& AsType(const DataType type);
+
+  /// Reset the device.
+  /// If the target device is a diff device, then do deep data copy.
+  Tensor& ToDevice(std::shared_ptr<Device> dev);
+
+  /// Equivalent to ToDevice(host_dev).
+  Tensor& ToHost();
+
  protected:
 
   //generate strides automatically if stride field is not passed
-  void generate_strides() {
-    strides_.clear();
+  void generate_stride() {
+    stride_.clear();
     if (shape_.size() == 0) {
-      strides_.push_back(1);
+      stride_.push_back(1);
       return;
     }
 
@@ -265,12 +272,12 @@
     int cumulative_product = 1;
     for (size_t n = 0; n < shape_.size(); ++n) {
       cumulative_product = cumulative_product * shape_[n];
-      strides_.push_back(dim / cumulative_product);
+      stride_.push_back(dim / cumulative_product);
     }
   }
 
   void set_strides(const vector<int> new_strides) {
-    strides_ = new_strides;
+    stride_ = new_strides;
   }
 
  protected:
@@ -280,7 +287,7 @@
   /// If you want to get an allocated Block, use block() instead of block_.
   Block *block_ = nullptr;
   Shape shape_ = {};
-  vector<int> strides_ = {};
+  vector<int> stride_ = {};
 }; //end of tensor class
 
 
@@ -306,14 +313,21 @@
   return static_cast<ToType>(x);
 }
 
+Tensor Broadcast(const Shape& shape);
 
-/// Reshape the given tensor and generate a new tensor,
+/// Reshape the given tensor into a new tensor with the same total volume,
 /// which shares the memory with in if possible
 Tensor Reshape(const Tensor &in, const Shape &s);
 
+Tensor Resize(const Tensor &in, const Shape &s);
+
 /// Reverse the shape vector
 Tensor Transpose(const Tensor& in);
 
+/// Return a view of the input tensor whose shape is broadcasted to be
+/// compatible with the given shape
+Tensor Broadcast(const Tensor& in, const Shape& shape);
+
 /// Change the axes
 Tensor Transpose(const Tensor& in, const vector<size_t> &axes);
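
The reworked transpose() above flags a tensor as transposed when a stride increases, while skipping stride-0 dimensions that come from broadcasting. The same rule in a small Python sketch (is_transposed is a hypothetical helper, for illustration only):

    def is_transposed(stride):
        # mirrors Tensor::transpose(): a growing stride means a permuted layout,
        # but stride-0 (broadcast) dims neither trigger nor update the check
        last = stride[0] if stride else 0
        for s in stride:
            if s > last and last > 0:
                return True
            if s > 0:
                last = s
        return False

    assert not is_transposed([6, 3, 1])   # contiguous layout
    assert is_transposed([1, 3])          # a 2-d transpose
    assert not is_transposed([0, 3, 1])   # broadcast dim does not count
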
 
diff --git a/include/singa/utils/mkldnn_utils.h b/include/singa/utils/mkldnn_utils.h
new file mode 100644
index 0000000..59bcf63
--- /dev/null
+++ b/include/singa/utils/mkldnn_utils.h
@@ -0,0 +1,56 @@
+/*********************************************************
+*
+* Licensed to the Apache Software Foundation (ASF) under one
+* or more contributor license agreements.  See the NOTICE file
+* distributed with this work for additional information
+* regarding copyright ownership.  The ASF licenses this file
+* to you under the Apache License, Version 2.0 (the
+* "License"); you may not use this file except in compliance
+* with the License.  You may obtain a copy of the License at
+*
+*   http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing,
+* software distributed under the License is distributed on an
+* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+* KIND, either express or implied.  See the License for the
+* specific language governing permissions and limitations
+* under the License.
+*
+************************************************************/
+#ifndef SINGA_UTILS_MKLDNN_UTILS_H_
+#define SINGA_UTILS_MKLDNN_UTILS_H_
+
+#include <mkldnn.hpp>
+
+namespace singa {
+  /*
+   supported data type by mkldnn
+   mkldnn_f32 - 32-bit/single-precision floating point.
+   mkldnn_s32 - 32-bit signed integer.
+   mkldnn_s16 - 16-bit signed integer.
+   mkldnn_s8 - 8-bit signed integer.
+   mkldnn_u8 - 8-bit unsigned integer.
+   */
+  inline mkldnn::memory::data_type GetMKLDNNDataType(DataType dtype) {
+    mkldnn::memory::data_type ret = mkldnn::memory::data_type::f32;
+    switch (dtype) {
+      case kFloat32:
+        ret = mkldnn::memory::data_type::f32;
+        break;
+      case kDouble:
+        LOG(FATAL) << "The data type " << DataType_Name(dtype)
+                   << " is not support by mkldnn";
+        break;
+      case kFloat16:
+        LOG(FATAL) << "The data type " << DataType_Name(dtype)
+                   << " is not support by mkldnn";
+        break;
+      default:
+        LOG(FATAL) << "The data type " << DataType_Name(dtype)
+                   << " is not support by mkldnn";
+    }
+    return ret;
+  }
+}  // namespace singa
+#endif // SINGA_UTILS_MKLDNN_UTILS_H_
diff --git a/python/singa/autograd.py b/python/singa/autograd.py
old mode 100755
new mode 100644
index 8e272e8..c9b43ce
--- a/python/singa/autograd.py
+++ b/python/singa/autograd.py
@@ -24,53 +24,61 @@
 import math
 
 from .tensor import Tensor
-from . import layer
-from singa.proto import model_pb2
 from . import singa_wrap as singa
-#from .tensor import einsum
+
+# from .tensor import einsum
 
 CTensor = singa.Tensor
 training = False
 
 
 def infer_dependency(op):
-    '''
+    """
     Infer the dependency of all operations with the
     given op as the last operation.
-    Operation A is depending on B is A uses the output(s) of B.
+    Operation A depends on B if A uses the output(s) of B.
+
     Args:
         op: an Operation instance, e.g. the loss operation.
+
     Return:
         a Counter instance with the operation as the key,
-        and the number of operations that are depending on it as the value
-    '''
-    # not count the dependency of current op.
+        and the number of operations that depend on it as the value;
+        and a Counter instance with the id of the output tensor as the key,
+        and the number of operations that depend on it as the value.
+    """
+
+    # the current op itself is not counted in op_count.
     # if the current op is not a terminal op, then this function may just
     # count dependency of a branch.
-    dependency_count = Counter()
+    op_count = Counter()
+    tensor_count = Counter()
     queue = deque([op])
     while len(queue) > 0:
         cur_op = queue.pop()
-        for src_op, _, _, _ in cur_op.src:
-            if src_op not in dependency_count:
-                # dependency[src_op] = [Counter() for _ in src_op.y_id2idx]
-                if isinstance(src_op, Dummy):
-                    # only when a Dummy operator needs store grads, its
-                    # dependency needs to be counted.
-                    if src_op.stores_grad:
-                        dependency_count[src_op] = 0
-                        queue.append(src_op)
-                else:
-                    dependency_count[src_op] = 0
-                    queue.append(src_op)
-            # y_idx = src_op.y_id2idx[x_id]
-            # dependency[src_op][y_idx][cur_op] += 1
-            if dependency_count.has_key(src_op):
-                dependency_count[src_op] += 1
-    return dependency_count
+        for src_op, xid, _, _ in cur_op.src:
+            if src_op not in op_count:
+                op_count[src_op] = 1
+                queue.append(src_op)
+            else:
+                op_count[src_op] += 1
+            tensor_count[xid] += 1
+    return op_count, tensor_count
 
 
 def gradients(y, dy=None):
+    """
+    Compute the gradients of the output w.r.t the parameters
+
+    Args:
+        y: the output tensor, e.g., the loss
+        dy: gradient of the objective w.r.t y; None indicates the gradient is 1.0;
+            it can be used to rescale the loss.
+
+    Return:
+        a dictionary storing the gradient tensors of all tensors
+            whose stores_grad is true (e.g. parameter tensors)
+    """
     grads = {}  # mapping: x->dx if x.stores_grad
     for p, dp in backward(y, dy):
         grads[p] = dp
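
For orientation, gradients()/backward() are the user-facing entry points for the dependency counting above. A minimal usage sketch, assuming the singa Python package from this PR is installed:

    import numpy as np
    from singa import autograd, device, tensor

    autograd.training = True
    dev = device.get_default_device()
    x = tensor.Tensor(device=dev, data=np.random.randn(4, 2).astype(np.float32))
    w = tensor.Tensor(device=dev, data=np.random.randn(2, 2).astype(np.float32),
                      requires_grad=True, stores_grad=True)
    t = tensor.Tensor(device=dev, data=np.eye(2, dtype=np.float32)[[0, 1, 0, 1]])
    loss = autograd.softmax_cross_entropy(autograd.matmul(x, w), t)
    for p, gp in autograd.backward(loss):  # yields (param, gradient) pairs
        print(p.shape, gp.shape)
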
@@ -78,20 +86,21 @@
 
 
 def backward(y, dy=None):
-    '''
+    """
     Run the backward propagation starting at y.
     Args:
         y: a Tensor instance, usually the loss
         dy: a number or a Tensor instance, for the gradient of the
-            objective/loss w.r.t y, usually 1.0
+            objective/loss w.r.t y, usually None, i.e., 1.0
     Return:
-        a dictionary storing the gradient tensors of all tensors
-        whose stores_grad is true (e.g. parameter tensors)
-    '''
-    assert isinstance(y, Tensor), 'wrong input type.'
-    dependency = infer_dependency(y.creator)
-    assert y.size() == 1, 'y must be a Tensor with a single value;'\
-        'size of y is % d' % y.size()
+        yield the parameter (tensor with stores_grad true) and the
+            gradient tensors.
+    """
+    assert isinstance(y, Tensor), "wrong input type."
+    op_dep, tensor_dep = infer_dependency(y.creator)
+    assert y.size() == 1, (
+        "y must be a Tensor with a single value;" "size of y is % d" % y.size()
+    )
 
     # by default the dy is a tensor with 1.0 for each sample;
     if dy is None:
@@ -106,7 +115,7 @@
     not_ready = {}  # mapping: op->[dy]
 
     if y.stores_grad:
-        #gradients[y] = dy
+        # gradients[y] = dy
         if isinstance(dy, float):
             g = np.array(dy)
         else:
@@ -121,9 +130,11 @@
         # if not isinstance(op, tensor.Dummy):
         dxs = op._do_backward(*dys)
         # TODO src and dx must match
-        assert len(op.src) == len(dxs), \
-            'the number of src ops (=%d) and dx (=%d) not match' \
+
+        assert len(op.src) == len(dxs), (
+            "the number of src ops (=%d) and dx (=%d) not match"
             % (len(op.src), len(dxs))
+        )
         for (src_op, x_id, y, y_stores_grad), dx in zip(op.src, dxs):
             # prefix x is w.r.t op; prefix y is w.r.t src_op.
             # x_id is the python id of one input arg of src_op, denoted as x.
@@ -137,9 +148,8 @@
             # operations have been backwarded.
             # y is None if y.stores_grad is false; otherwise it is a Tensor
 
-            if isinstance(src_op, Dummy):
-                if not src_op.stores_grad:
-                    continue
+            if isinstance(src_op, Dummy) and (not src_op.stores_grad):
+                continue
 
             y_idx = src_op.y_id2idx[x_id]
             if src_op not in not_ready:
@@ -147,60 +157,93 @@
                 not_ready[src_op] = [None for _ in src_op.y_id2idx]
                 not_ready[src_op][y_idx] = dx
             else:
-                dxs = not_ready[src_op]
-                if dxs[y_idx] is None:
-                    dxs[y_idx] = dx
+                dxs_ = not_ready[src_op]
+                if dxs_[y_idx] is None:
+                    dxs_[y_idx] = dx
                 else:
                     # add the gradient from another children operation that
                     # uses y_idx'th output of src_op as input arg
-                    dxs[y_idx] += dx
+                    dxs_[y_idx] += dx
 
-            dependency[src_op] -= 1
+            op_dep[src_op] -= 1
+            tensor_dep[x_id] -= 1
 
-            if y_stores_grad:
-                if dependency[src_op] == 0:
-                    # store the gradient for final return, e.g. if x is parameter
-                    # may cause a delay output, as only after src_op is ready
-                    # then output, not the current outlet of src_op is ready
-                    # then output.
-                    g = not_ready[src_op][y_idx]
-                    tg = Tensor(device=g.device(), data=g)
-                    yield (y, tg)
+            if y_stores_grad and tensor_dep[x_id] == 0:
+                # store the gradient for final return, e.g. for parameters.
+                # the yield may be delayed: only after all of src_op's output
+                # tensors have received their gradients is the pair emitted
+                g = not_ready[src_op][y_idx]
+                tg = Tensor(
+                    device=g.device(), data=g, name=src_op.grad_name(y_idx)
+                )
+                yield (y, tg)
 
-            if src_op.requires_grad is True:
-                if dependency[src_op] == 0:
-                    if not isinstance(src_op, Dummy):
-                        # Dummy can be in not_ready list but cannot be in ready
-                        # list.
-                        ready.append((src_op, not_ready[src_op]))
-                    del not_ready[src_op]
+            if op_dep[src_op] == 0:
+                if src_op.requires_grad is True:
+                    assert not isinstance(
+                        src_op, Dummy
+                    ), "Dummy op does not do backward()"
+                    ready.append((src_op, not_ready[src_op]))
+                del not_ready[src_op]
         del op  # delete the operation to free all tensors from this op
 
 
 class Operation(object):
-    '''
+    """
     An operation includes the forward and backward function of
     tensor calculation.
     Steps to add a specific operation Xxxx:
     1. create a subclass of Operation, name it as Xxxx
     2. override the forward() and backward(); The arguments of forward()
        and backward() should only include CTensor;
-    '''
+    """
+
+    op_count = 0
+
+    def __init__(self, name=None):
+        if name is None:
+            self.name = "{}#{}".format(
+                self.__class__.__name__, Operation.op_count
+            )
+            Operation.op_count += 1
+        else:
+            self.name = name
 
     def __call__(self, *xs):
         return self._do_forward(*xs)
 
+    def output_name(self, idx):
+        """
+        Args:
+            idx: index of the output among all outputs
+
+        Return:
+            the name of the output tensor
+        """
+        return "{}:{}".format(self.name, idx)
+
+    def grad_name(self, idx):
+        """
+        Args:
+            idx: index of the output among all outputs
+
+        Return:
+            the name of the gradient of the output tensor
+        """
+        return "{}_g".format(self.output_name(idx))
+
     def _do_forward(self, *xs):
-        '''
+        """
         Do not call this function from user code. It is called by __call__().
         Args:
             xs, Tensor instance(s)
         Returns:
             Tensor instance(s)
-        '''
+        """
         # TODO add the pre hook
-        assert all([isinstance(x, Tensor) for x in xs]), \
-            'xs should include only Tensor instances'
+        assert all(
+            [isinstance(x, Tensor) for x in xs]
+        ), "xs should include only Tensor instances"
 
         # need to do backward if any of its input arg needs gradient
         self.requires_grad = any([x.requires_grad for x in xs])
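
A quick illustration of the naming scheme introduced above; the counter value depends on how many operations the process has already created, so the printed names are indicative only:

    from singa import autograd

    op = autograd.ReLU()       # default name is "<class>#<counter>"
    print(op.name)             # e.g. ReLU#0
    print(op.output_name(0))   # e.g. ReLU#0:0
    print(op.grad_name(0))     # e.g. ReLU#0:0_g
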
@@ -223,10 +266,16 @@
             ys = (ys,)
         # create Tensor based on CTensor(data);
         # assume outputs are all Tensor instances
-        ys = tuple(Tensor(device=y.device(),
-                          data=y,
-                          requires_grad=self.requires_grad,
-                          creator=self) for y in ys)
+        ys = tuple(
+            Tensor(
+                device=y.device(),
+                data=y,
+                requires_grad=self.requires_grad,
+                creator=self,
+                name=self.output_name(idx),
+            )
+            for idx, y in enumerate(ys)
+        )
         # map from python id to output index
         self.y_id2idx = {id(y): i for i, y in enumerate(ys)}
         # TODO add the post hook
@@ -239,21 +288,21 @@
         return dxs
 
     def forward(self, *xs):
-        '''Forward propagation.
+        """Forward propagation.
         Args:
             xs: input args consisting of only CTensors.
         Returns:
             CTensor instance(s)
-        '''
+        """
         raise NotImplementedError
 
     def backward(self, *dys):
-        ''' Backward propagation.
+        """ Backward propagation.
         Args:
             dys: input args consisting of only CTensors.
         Returns:
             CTensor instance(s)
-        '''
+        """
         raise NotImplementedError
 
     def get_params(self):
@@ -261,39 +310,47 @@
 
 
 class Dummy(Operation):
-    '''Dummy operation whice serves as a placehoder for autograd
+    """Dummy operation whice serves as a placehoder for autograd
     Args:
         name(string): set it for debug
-    '''
+    """
 
     def __init__(self, tensor, name=None):
-        self.name = name
+        super(Dummy, self).__init__(name)
         self.src = []
         self.y_id2idx = {id(tensor): 0}
         self.stores_grad = tensor.stores_grad
         self.requires_grad = False
 
+    def output_name(self, idx):
+        return self.name
+
+    def grad_name(self, idx):
+        return "{}_g".format(self.name)
+
 
 class ReLU(Operation):
+    def __init__(self):
+        super(ReLU, self).__init__()
 
     def forward(self, x):
-        '''
+        """
         Args:
             x(CTensor): input tensor
         Returns:
             a new CTensor whose element y = x if x >= 0; otherwise 0;
-        '''
+        """
         if training:
             self.input = x
         return singa.ReLU(x)
 
     def backward(self, dy):
-        '''
+        """
         Args:
             dy(CTensor): dL / dy
         Returns:
             dx(CTensor): dL / dx = dy if x >= 0; otherwise 0;
-        '''
+        """
         dx = singa.GTFloat(self.input, 0.0)
         return singa.__mul__(dy, dx)
 
@@ -303,30 +360,35 @@
 
 
 class Matmul(Operation):
-    '''For matrix multiplication'''
+    """For matrix multiplication"""
+
+    def __init__(self):
+        super(Matmul, self).__init__()
 
     def forward(self, x, w):
-        '''Do forward propgation.
+        """Do forward propgation.
         Store the x(or w) if w(or x) requires gradient.
         Args:
             x (CTensor): matrix
             w (CTensor): matrix
         Returns:
             a CTensor for the result
-        '''
+        """
         if training:
             self.input = (x, w)
         return singa.Mult(x, w)
 
     def backward(self, dy):
-        '''
+        """
         Args:
             dy (CTensor): data for the dL / dy, L is the loss
         Returns:
             a tuple for (dx, dw)
-        '''
-        return singa.Mult(dy, singa.DefaultTranspose(self.input[1])), \
-            singa.Mult(singa.DefaultTranspose(self.input[0]), dy)
+        """
+        return (
+            singa.Mult(dy, singa.DefaultTranspose(self.input[1])),
+            singa.Mult(singa.DefaultTranspose(self.input[0]), dy),
+        )
 
 
 def matmul(x, w):
@@ -334,26 +396,27 @@
 
 
 class AddBias(Operation):
-    '''
+    """
     Add Bias to each row / column of the Tensor, depending on the axis arg.
-    '''
+    """
 
     def __init__(self, axis=0):
-        '''
+        """
         To indicate the calculation axis, 0 for row, 1 for column.
         Args:
             axis: 0 or 1, default is 0.
-        '''
+        """
+        super(AddBias, self).__init__()
         self.axis = axis
 
     def forward(self, x, b):
-        '''
+        """
         Args:
             x: matrix.
             b: bias to be added.
         Return:
             the result Tensor
-        '''
+        """
         if self.axis == 0:
             singa.AddRow(b, x)
         elif self.axis == 1:
@@ -361,13 +424,13 @@
         return x
 
     def backward(self, dy):
-        '''
+        """
         Args:
             dy (CTensor): data for the dL / dy, L is the loss.
         Return:
             a tuple for (db, dx), db is data for dL / db, dx is data
             for dL / dx.
-        '''
+        """
         if self.axis == 0:
             return dy, singa.Sum(dy, 0)
         elif self.axis == 1:
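For the axis=0 case handled above, the gradient passes through to x unchanged while the bias gradient sums over the batch; in plain numpy (illustrative):

```python
import numpy as np

x = np.zeros((2, 3), dtype=np.float32)
b = np.array([1.0, 2.0, 3.0], dtype=np.float32)
y = x + b                  # axis=0: b is added to every row (AddRow)
dy = np.ones_like(y)
dx = dy                    # dL/dx: identity
db = dy.sum(axis=0)        # dL/db: [2., 2., 2.], summed over rows
```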
@@ -379,6 +442,8 @@
 
 
 class Add(Operation):
+    def __init__(self):
+        super(Add, self).__init__()
 
     def forward(self, a, b):
         return singa.__add__(a, b)
@@ -392,21 +457,22 @@
 
 
 class SoftMax(Operation):
-    '''
+    """
     Apply SoftMax for each row of the Tensor or each column of the Tensor
     according to the parameter axis.
-    '''
+    """
 
     def __init__(self, axis=0):
+        super(SoftMax, self).__init__()
         self.axis = axis
 
     def forward(self, x):
-        '''
+        """
         Args:
             x(data): the input 1d or 2d tensor
         Returns:
             the result Tensor
-        '''
+        """
         if self.axis == 1:
             x = singa.DefaultTranspose(x)
         self.output = singa.SoftMax(x)
@@ -416,32 +482,32 @@
             return singa.DefaultTranspose(self.output)
 
     def backward(self, dy):
-        '''
+        """
         Args:
             dy (CTensor): data for the dL / dy, L is the loss
         Returns:
             dx (Ctensor): data for the dL / dx, L is the loss,
             x is the input of current Opertion
-        '''
+        """
         # calculations are made on numpy array
         if self.axis == 1:
             dy = singa.DefaultTranspose(dy)
         grad = ctensor2numpy(dy)
         output = ctensor2numpy(self.output)
-        out_1 = np.einsum('ki,ki->ki', grad, output)
-        medium_out = np.einsum('ki,kj->kij', output, output)
-        out_2 = np.einsum('kij,kj->ki', medium_out, grad)
+        out_1 = np.einsum("ki,ki->ki", grad, output)
+        medium_out = np.einsum("ki,kj->kij", output, output)
+        out_2 = np.einsum("kij,kj->ki", medium_out, grad)
         out = out_1 - out_2
         dx = CTensor(out_1.shape)
         dx.CopyFloatDataFromHostPtr(out.flatten())
-        '''grad = Tensor(data=dy)
+        """grad = Tensor(data=dy)
         output = Tensor(data=self.output)
         out_1 = einsum('ki,ki->ki', grad, output)
         medium_out = einsum('ki,kj->kij', output, output)
         out_2 = einsum('kij,kj->ki', medium_out, grad)
         out = out_1 - out_2
         dx = CTensor(out_1.data.shape)
-        dx.CopyFloatDataFromHostPtr(out.data.flatten())'''
+        dx.CopyFloatDataFromHostPtr(out.data.flatten())"""
         if self.axis == 0:
             return dx
         elif self.axis == 1:
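The einsum contractions above implement the softmax Jacobian-vector product dx_i = y_i * (dy_i - sum_j y_j * dy_j); the same computation in plain numpy (illustrative):

```python
import numpy as np

y = np.array([[0.2, 0.3, 0.5]], dtype=np.float32)    # softmax output
dy = np.array([[1.0, 0.0, 0.0]], dtype=np.float32)   # dL/dy
out_1 = y * dy                                       # 'ki,ki->ki'
out_2 = y * (y * dy).sum(axis=1, keepdims=True)      # 'ki,kj->kij' then 'kij,kj->ki'
dx = out_1 - out_2
```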
@@ -453,19 +519,22 @@
 
 
 class CrossEntropy(Operation):
-    '''
+    """
     Calculte negative log likelihood loss for a batch of training data.
-    '''
+    """
+
+    def __init__(self):
+        super(CrossEntropy, self).__init__()
 
     def forward(self, x, t):
-        '''
+        """
         Args:
             x (CTensor): 1d or 2d tensor, the prediction data(output)
                          of current network.
             t (CTensor): 1d or 2d tensor, the target data for training.
         Returns:
             loss (CTensor): scalar.
-        '''
+        """
         loss = CTensor((1,))
         loss_data = -singa.SumAsFloat(singa.__mul__(t, singa.Log(x)))
         loss.SetFloatValue(loss_data / x.shape()[0])
@@ -475,7 +544,7 @@
         return loss
 
     def backward(self, dy=1.0):
-        '''
+        """
         Args:
             dy (float or CTensor): scalar, accumulate gradient from outside
                                 of current network, usually equal to 1.0
@@ -483,7 +552,7 @@
             dx (CTensor): data for the dL /dx, L is the loss, x is the output
                           of current network. note that this is true for
                           dy = 1.0
-        '''
+        """
         dx = singa.__div__(self.t, self.x)
         dx *= float(-1 / self.x.shape()[0])
         if isinstance(dy, float):
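The loss above is -sum(t * log(x)) averaged over the batch, and the gradient is -t/x scaled the same way; a small worked example in numpy (illustrative):

```python
import numpy as np

x = np.array([[0.7, 0.2, 0.1]], dtype=np.float32)   # predicted probabilities
t = np.array([[1.0, 0.0, 0.0]], dtype=np.float32)   # one-hot target
loss = -(t * np.log(x)).sum() / x.shape[0]          # ~= 0.3567 (= -log 0.7)
dx = -(t / x) / x.shape[0]                          # [[-1.4286, 0., 0.]]
```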
@@ -499,8 +568,8 @@
 
 
 class SoftMaxCrossEntropy(Operation):
-
     def __init__(self, t):
+        super(SoftMaxCrossEntropy, self).__init__()
         self.t = t.data
 
     def forward(self, x):
@@ -521,6 +590,8 @@
 
 
 class MeanSquareError(Operation):
+    def __init__(self):
+        super(MeanSquareError, self).__init__()
 
     def forward(self, x, t):
         self.err = singa.__sub__(x, t)
@@ -545,20 +616,20 @@
 
 
 def ctensor2numpy(x):
-    '''
+    """
     To be used in SoftMax Operation.
     Convert a singa_tensor to numpy_tensor.
-    '''
+    """
     np_array = x.GetFloatValue(int(x.Size()))
     return np_array.reshape(x.shape())
 
 
 class Flatten(Operation):
-
     def __init__(self, start_axis=1):
+        super(Flatten, self).__init__()
         # flatten all axis after (inclusive) start_axis
         self.start_axis = start_axis
-        assert start_axis == 1, 'must flatten into 2d array not'
+        assert start_axis == 1, "must flatten into 2d array"
 
     def forward(self, x):
         # TODO Do flatten start from axis != 1
@@ -576,14 +647,14 @@
 
 
 class Layer(object):
-
     def __init__(self):
         pass
 
     def device_check(self, *inputs):
         x_device = inputs[0].device
+        x_dev_id = x_device.id()
         for var in inputs:
-            if var.device.id() != x_device:
+            if var.device.id() != x_dev_id:
                 var.to_device(x_device)
 
     def find_sublayers(self):
@@ -609,45 +680,46 @@
         # Layer.set_params(**{'block1':{'linear1':{'W':np.ones((in, out),
         # dtype=np.float32)}}})
         for (parameter_name, parameter_value) in parameters.items():
-            #assert isinstance(self.__dict__[parameter_name], Layer)
-            assert parameter_name in self.__dict__, 'please input correct parameters.'
+            # assert isinstance(self.__dict__[parameter_name], Layer)
+            assert (
+                parameter_name in self.__dict__
+            ), "please input correct parameters."
             if isinstance(self.__dict__[parameter_name], Layer):
                 self.__dict__[parameter_name].set_params(
-                    **parameters[parameter_name])
+                    **parameters[parameter_name]
+                )
             elif isinstance(self.__dict__[parameter_name], Tensor):
                 self.set_one_param(parameter_name, parameter_value)
             else:
-                raise ValueError('please input correct parameters.')
+                raise ValueError("please input correct parameters.")
 
     def set_one_param(self, parameter_name, parameter_value):
-        assert parameter_name in self.allow_params, 'please input allowed parameters.'
-        assert parameter_value.shape == self.__dict__[
-            parameter_name].shape, 'Shape dismatched.'
+        assert (
+            parameter_name in self.allow_params
+        ), "please input allowed parameters."
+        assert (
+            parameter_value.shape == self.__dict__[parameter_name].shape
+        ), "Shape dismatched."
         if isinstance(parameter_value, Tensor):
-            self.__dict__[parameter_name].reset_like(
-                parameter_value)
+            self.__dict__[parameter_name].reset_like(parameter_value)
         elif isinstance(parameter_value, np.ndarray):
-            self.__dict__[parameter_name].copy_from_numpy(
-                parameter_value)
+            self.__dict__[parameter_name].copy_from_numpy(parameter_value)
         else:
-            raise ValueError('parameters should be Tensor or Numpy array.')
+            raise ValueError("parameters should be Tensor or Numpy array.")
 
 
 class Linear(Layer):
-
     def __init__(self, in_features, out_features, bias=True):
         w_shape = (in_features, out_features)
-        b_shape = (1, out_features)
+        b_shape = (out_features,)
         self.bias = bias
 
-        self.W = Tensor(shape=w_shape,
-                        requires_grad=True, stores_grad=True)
+        self.W = Tensor(shape=w_shape, requires_grad=True, stores_grad=True)
         std = math.sqrt(2.0 / (in_features + out_features))
         self.W.gaussian(0.0, std)
 
         if self.bias:
-            self.b = Tensor(shape=b_shape,
-                            requires_grad=True, stores_grad=True)
+            self.b = Tensor(shape=b_shape, requires_grad=True, stores_grad=True)
             self.b.set_value(0.0)
 
     def __call__(self, x):
@@ -662,25 +734,26 @@
 
     def get_params(self):
         if self.bias:
-            return {'W': self.W, 'b': self.b}
+            return {"W": self.W, "b": self.b}
         else:
-            return {'W': self.W}
+            return {"W": self.W}
 
     def set_params(self, **parameters):
+        # TODO(wangwei) remove this function as Operation's set_params() is enough
         # set parameters for Linear Layer
         # input should be either a PyTensor or numpy ndarray.
         # examples: Linear.set_params(W=np.ones((in, out), dtype=np.float32)),
         # Linear.set_params(**{'W':np.ones((in, out), dtype=np.float32)})
-        self.allow_params = ['W', 'b']
+        self.allow_params = ["W", "b"]
         super(Linear, self).set_params(**parameters)
         for parameter_name in parameters:
-            if parameter_name is 'b':
+            if parameter_name is "b":
                 self.bias = True
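A hypothetical usage sketch of the layer above, including the set_params protocol it documents (names and shapes are illustrative):

```python
import numpy as np
from singa import autograd, tensor

fc = autograd.Linear(3, 2)
x = tensor.Tensor(shape=(4, 3))     # batch of 4 on the default device
x.gaussian(0.0, 1.0)
y = fc(x)                           # shape (4, 2)

# overwrite the weights, as described in the comments above
fc.set_params(W=np.ones((3, 2), dtype=np.float32))
```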
 
 
 class Concat(Operation):
-
     def __init__(self, axis=0):
+        super(Concat, self).__init__()
         self.axis = axis
 
     def forward(self, *xs):
@@ -695,8 +768,9 @@
 
     def backward(self, dy):
         assert hasattr(
-            self, 'slice_point'), 'Please set training as True before do BP. '
-        assert self.slice_point[-1] == dy.shape()[self.axis], 'Shape dismatched.'
+            self, "slice_point"
+        ), "Please set training as True before do BP. "
+        assert self.slice_point[-1] == dy.shape()[self.axis], "Shape mismatch."
         dxs = []
         last_offset = 0
         for p in self.slice_point:
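The slice points recorded in forward are the cumulative sizes along the concat axis, so backward splits dy back at those offsets; the numpy equivalent (illustrative):

```python
import numpy as np

a = np.zeros((2, 3), dtype=np.float32)
b = np.zeros((2, 5), dtype=np.float32)
y = np.concatenate([a, b], axis=1)     # slice_point would be [3, 8]
dy = np.ones_like(y)
da, db = np.split(dy, [3], axis=1)     # back to (2, 3) and (2, 5)
```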
@@ -711,301 +785,384 @@
 
 
 class _Conv2d(Operation):
-
     def __init__(self, handle):
+        super(_Conv2d, self).__init__()
         self.handle = handle
 
     def forward(self, x, W, b):
-        assert x.nDim() == 4, 'The dimensions of input should be 4D.'
+        assert x.nDim() == 4, "The dimensions of input should be 4D."
 
         if training:
             if self.handle.bias_term:
                 self.inputs = (x, W, b)
             else:
                 self.inputs = (x, W)
-
-        if self.handle.device_id == -1:
-            return singa.CpuConvForward(x, W, b, self.handle)
-
-        else:
+        if isinstance(self.handle, singa.CudnnConvHandle):
             return singa.GpuConvForward(x, W, b, self.handle)
+        else:
+            return singa.CpuConvForward(x, W, b, self.handle)
 
     def backward(self, dy):
         assert training is True and hasattr(
-            self, 'inputs'), 'Please set training as True before do BP. '
-
-        if dy.device().id() != self.handle.device_id:
-            dy.ToDevice(self.inputs[0].device())
-
-        if self.handle.device_id == -1:
-            dx = singa.CpuConvBackwardx(
-                dy, self.inputs[1], self.inputs[0], self.handle)
-            dW = singa.CpuConvBackwardW(
-                dy, self.inputs[0], self.inputs[1], self.handle)
-            if self.handle.bias_term:
-                db = singa.CpuConvBackwardb(dy, self.inputs[2], self.handle)
-                return dx, dW, db
-            else:
-                return dx, dW, None
-        else:
+            self, "inputs"
+        ), "Please set training as True before do BP. "
+        
+        if isinstance(self.handle, singa.CudnnConvHandle):
             dx = singa.GpuConvBackwardx(
-                dy, self.inputs[1], self.inputs[0], self.handle)
+                dy, self.inputs[1], self.inputs[0], self.handle
+            )
             dW = singa.GpuConvBackwardW(
-                dy, self.inputs[0], self.inputs[1], self.handle)
+                dy, self.inputs[0], self.inputs[1], self.handle
+            )
             if self.handle.bias_term:
                 db = singa.GpuConvBackwardb(dy, self.inputs[2], self.handle)
                 return dx, dW, db
             else:
                 return dx, dW, None
+        else:
+            dx = singa.CpuConvBackwardx(
+                dy, self.inputs[1], self.inputs[0], self.handle
+            )
+            dW = singa.CpuConvBackwardW(
+                dy, self.inputs[0], self.inputs[1], self.handle
+            )
+            if self.handle.bias_term:
+                db = singa.CpuConvBackwardb(dy, self.inputs[2], self.handle)
+                return dx, dW, db
+            else:
+                return dx, dW, None
 
-
-def conv2d(handle, x, W, b):
-    return _Conv2d(handle)(x, W, b)[0]
+def conv2d(handle, x, W, b=None):
+    if b is None:
+        return _Conv2d(handle)(x, W)[0]
+    else:
+        return _Conv2d(handle)(x, W, b)[0]
 
 
 class Conv2d(Layer):
-
-    def __init__(self, in_channels, out_channels, kernel_size, stride=1,
-                 padding=0, dilation=1, groups=1, bias=True, **kwargs):
+    def __init__(
+        self,
+        in_channels,
+        out_channels,
+        kernel_size,
+        stride=1,
+        padding=0,
+        dilation=1,
+        group=1,
+        bias=True,
+        **kwargs
+    ):
 
         self.in_channels = in_channels
         self.out_channels = out_channels
 
-        self.groups = groups
+        self.group = group
 
-        assert self.groups >= 1 and self.in_channels % self.groups == 0, 'please set reasonable groups.'
+        assert (
+            self.group >= 1 and self.in_channels % self.group == 0
+        ), "please set reasonable group."
 
-        # each group should contribute equally to the output feature maps. shown as the later part of
-        # the following judgement.
-        assert self.out_channels >= self.groups and self.out_channels % self.groups == 0, 'out_channels and groups dismatched.'
+        assert (
+            self.out_channels >= self.group
+            and self.out_channels % self.group == 0
+        ), "out_channels and group dismatched."
 
         if isinstance(kernel_size, int):
             self.kernel_size = (kernel_size, kernel_size)
         elif isinstance(kernel_size, tuple):
             self.kernel_size = kernel_size
         else:
-            raise TypeError('Wrong kernel_size type.')
+            raise TypeError("Wrong kernel_size type.")
 
         if isinstance(stride, int):
             self.stride = (stride, stride)
         elif isinstance(stride, tuple):
             self.stride = stride
         else:
-            raise TypeError('Wrong stride type.')
+            raise TypeError("Wrong stride type.")
 
         if isinstance(padding, int):
             self.padding = (padding, padding)
         elif isinstance(padding, tuple):
             self.padding = padding
         else:
-            raise TypeError('Wrong padding type.')
+            raise TypeError("Wrong padding type.")
 
         if dilation != 1:
-            raise ValueError('Not implemented yet')
+            raise ValueError("Not implemented yet")
 
         self.bias = bias
 
-        self.inner_params = {'cudnn_prefer': 'fastest',
-                             'workspace_MB_limit': 1024}
+        self.inner_params = {
+            "cudnn_prefer": "fastest",
+            "workspace_MB_limit": 1024,
+        }
         # TODO valid value of inner_params check
 
         for kwarg in kwargs:
             if kwarg not in self.inner_params:
-                raise TypeError('Keyword argument not understood:', kwarg)
+                raise TypeError("Keyword argument not understood:", kwarg)
             else:
                 self.inner_params[kwarg] = kwargs[kwarg]
 
-        w_shape = (self.out_channels, int(self.in_channels / self.groups),
-                   self.kernel_size[0], self.kernel_size[1])
+        w_shape = (
+            self.out_channels,
+            int(self.in_channels / self.group),
+            self.kernel_size[0],
+            self.kernel_size[1],
+        )
 
         self.W = Tensor(shape=w_shape, requires_grad=True, stores_grad=True)
         # std = math.sqrt(
         # 2.0 / (self.in_channels * self.kernel_size[0] * self.kernel_size[1] +
         # self.out_channels))
         std = math.sqrt(
-            2.0 / (w_shape[1] * self.kernel_size[0] * self.kernel_size[1] + self.out_channels))
+            2.0
+            / (
+                w_shape[1] * self.kernel_size[0] * self.kernel_size[1]
+                + self.out_channels
+            )
+        )
         self.W.gaussian(0.0, std)
 
         if self.bias:
             b_shape = (self.out_channels,)
-            self.b = Tensor(shape=b_shape, requires_grad=True,
-                            stores_grad=True)
+            self.b = Tensor(shape=b_shape, requires_grad=True, stores_grad=True)
             self.b.set_value(0.0)
         else:
             # to keep consistency when to do forward.
-            self.b = Tensor(data=CTensor(
-                []), requires_grad=False, stores_grad=False)
+            self.b = None
+            # Tensor(data=CTensor([]), requires_grad=False, stores_grad=False)
 
     def __call__(self, x):
-        assert x.shape[1] == self.in_channels, 'in_channels dismatched'
+        assert x.shape[1] == self.in_channels, "in_channels mismatched"
 
         self.device_check(x, self.W, self.b)
 
         if x.device.id() == -1:
-            if self.groups != 1:
-                raise ValueError('Not implemented yet')
+            if self.group != 1:
+                raise ValueError("Not implemented yet")
             else:
-                if not hasattr(self, 'handle'):
-                    self.handle = singa.ConvHandle(x.data, self.kernel_size, self.stride,
-                                                   self.padding, self.in_channels, self.out_channels, self.bias)
-                elif x.shape[0] != self.handle.batchsize:
-                    self.handle = singa.ConvHandle(x.data, self.kernel_size, self.stride,
-                                                   self.padding, self.in_channels, self.out_channels, self.bias)
+                if (not hasattr(self, "handle")) or (
+                    x.shape[0] != self.handle.batchsize
+                ):
+                    self.handle = singa.ConvHandle(
+                        x.data,
+                        self.kernel_size,
+                        self.stride,
+                        self.padding,
+                        self.in_channels,
+                        self.out_channels,
+                        self.bias,
+                        self.group,
+                    )
         else:
-            if not hasattr(self, 'handle'):
-                self.handle = singa.CudnnConvHandle(x.data, self.kernel_size, self.stride,
-                                                    self.padding, self.in_channels, self.out_channels, self.bias, self.groups)
-            elif x.shape[0] != self.handle.batchsize:
-                self.handle = singa.CudnnConvHandle(x.data, self.kernel_size, self.stride,
-                                                    self.padding, self.in_channels, self.out_channels, self.bias, self.groups)
-        self.handle.device_id = x.device.id()
+            if (not hasattr(self, "handle")) or (
+                x.shape[0] != self.handle.batchsize
+            ):
+                self.handle = singa.CudnnConvHandle(
+                    x.data,
+                    self.kernel_size,
+                    self.stride,
+                    self.padding,
+                    self.in_channels,
+                    self.out_channels,
+                    self.bias,
+                    self.group,
+                )
 
         y = conv2d(self.handle, x, self.W, self.b)
         return y
 
     def get_params(self):
         if self.bias:
-            return {'W': self.W, 'b': self.b}
+            return {"W": self.W, "b": self.b}
         else:
-            return {'W': self.W}
+            return {"W": self.W}
 
     def set_params(self, **parameters):
-        # set parameters for Conv2d Layer
+        # TODO(wangwei) remove it as Operation's set_params() is enough
         # input should be either a PyTensor or numpy ndarray.
-        # examples: Conv2d.set_params(W=np.ones((n, c, h, w), dtype=np.float32)),
-        #          Conv2d.set_params(**{'W':np.ones((n, c, h, w), dtype=np.float32)})
-        self.allow_params = ['W', 'b']
+        # Conv2d.set_params(W=np.ones((n, c, h, w), dtype=np.float32)),
+        # Conv2d.set_params(**{'W':np.ones((n, c, h, w), dtype=np.float32)})
+        self.allow_params = ["W", "b"]
         super(Conv2d, self).set_params(**parameters)
         for parameter_name in parameters:
-            if parameter_name is 'b':
+            if parameter_name is "b":
                 self.bias = True
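The __call__ above builds a ConvHandle (or CudnnConvHandle on GPU) lazily and rebuilds it only when the batch size changes; a hypothetical usage sketch:

```python
from singa import autograd, tensor

conv = autograd.Conv2d(3, 8, kernel_size=3, padding=1)
x = tensor.Tensor(shape=(16, 3, 32, 32))   # NCHW on the default device
x.gaussian(0.0, 1.0)
y = conv(x)    # first call creates the handle for batchsize 16
y = conv(x)    # same batch size, so the cached handle is reused
```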
 
 
 class SeparableConv2d(Layer):
+    def __init__(
+        self,
+        in_channels,
+        out_channels,
+        kernel_size,
+        stride=1,
+        padding=0,
+        bias=False,
+    ):
+        self.depthwise_conv = Conv2d(
+            in_channels,
+            in_channels,
+            kernel_size,
+            stride,
+            padding,
+            group=in_channels,
+            bias=bias,
+        )
 
-    def __init__(self, in_channels, out_channels, kernel_size, stride=1, padding=0, bias=False):
-
-        self.spacial_conv = Conv2d(
-            in_channels, in_channels, kernel_size, stride, padding, groups=in_channels, bias=bias)
-
-        self.depth_conv = Conv2d(in_channels, out_channels, 1, bias=bias)
+        self.point_conv = Conv2d(in_channels, out_channels, 1, bias=bias)
 
     def __call__(self, x):
-        y = self.spacial_conv(x)
-        y = self.depth_conv(y)
+        y = self.depthwise_conv(x)
+        y = self.point_conv(y)
         return y
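A depthwise-separable convolution replaces one full convolution with a per-channel spatial convolution plus a 1x1 pointwise mix, which is where the parameter savings come from; a quick count with illustrative numbers:

```python
cin, cout, k = 32, 64, 3
full = cout * cin * k * k              # 18432 weights for a full conv
depthwise = cin * 1 * k * k            # 288  (group=cin, as above)
pointwise = cout * cin * 1 * 1         # 2048 (the 1x1 conv above)
separable = depthwise + pointwise      # 2336, roughly 8x smaller
```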
 
 
 class BatchNorm2d(Layer):
-
     def __init__(self, num_features, momentum=0.9):
         self.channels = num_features
         self.momentum = momentum
 
         param_shape = (self.channels,)
 
-        self.scale = Tensor(shape=param_shape,
-                            requires_grad=True, stores_grad=True)
+        self.scale = Tensor(
+            shape=param_shape, requires_grad=True, stores_grad=True
+        )
         self.scale.set_value(1.0)
 
-        self.bias = Tensor(shape=param_shape,
-                           requires_grad=True, stores_grad=True)
+        self.bias = Tensor(
+            shape=param_shape, requires_grad=True, stores_grad=True
+        )
         self.bias.set_value(0.0)
 
         self.running_mean = Tensor(
-            shape=param_shape, requires_grad=False, stores_grad=False)
+            shape=param_shape, requires_grad=False, stores_grad=False
+        )
         self.running_var = Tensor(
-            shape=param_shape, requires_grad=False, stores_grad=False)
+            shape=param_shape, requires_grad=False, stores_grad=False
+        )
 
     def __call__(self, x):
-        assert x.shape[1] == self.channels, 'number of channels dismatched. %d vs %d' % (
-            x.shape[1], self.channels)
+        assert x.shape[1] == self.channels, (
+            "number of channels dismatched. %d vs %d"
+            % (x.shape[1], self.channels)
+        )
 
-        self.device_check(x, self.scale, self.bias,
-                          self.running_mean, self.running_var)
+        self.device_check(
+            x, self.scale, self.bias, self.running_mean, self.running_var
+        )
 
         if x.device.id() == -1:
-            raise NotImplementedError
-
-        else:
-            if not hasattr(self, 'handle'):
-                self.handle = singa.CudnnBatchNormHandle(
-                    self.momentum, x.data)
+            if not hasattr(self, "handle"):
+                self.handle = singa.BatchNormHandle(self.momentum, x.data)
             elif x.shape[0] != self.handle.batchsize:
-                self.handle = singa.CudnnBatchNormHandle(
-                    self.momentum, x.data)
-        self.handle.device_id = x.device.id()
+                self.handle = singa.BatchNormHandle(self.momentum, x.data)
+        else:
+            if not hasattr(self, "handle"):
+                self.handle = singa.CudnnBatchNormHandle(self.momentum, x.data)
+            elif x.shape[0] != self.handle.batchsize:
+                self.handle = singa.CudnnBatchNormHandle(self.momentum, x.data)
 
-        y = batchnorm_2d(self.handle, x, self.scale, self.bias,
-                         self.running_mean, self.running_var)
+        y = batchnorm_2d(
+            self.handle,
+            x,
+            self.scale,
+            self.bias,
+            self.running_mean,
+            self.running_var,
+        )
         return y
 
     def get_params(self):
-        return {'scale': self.scale, 'bias': self.bias}
+        return {"scale": self.scale, "bias": self.bias}
 
     def set_params(self, **parameters):
         # set parameters for BatchNorm2d Layer
         # input should be either a PyTensor or numpy ndarray.
-        # examples: Batchnorm2d.set_params(scale=np.ones((1,), dtype=np.float32)),
-        #          Batchnorm2d.set_params(**{'bias':np.ones((1), dtype=np.float32)})
-        self.allow_params = ['scale', 'bias']
+        # examples:
+        #   Batchnorm2d.set_params(scale=np.ones((1,), dtype=np.float32)),
+        #   Batchnorm2d.set_params(**{'bias':np.ones((1), dtype=np.float32)})
+        self.allow_params = ["scale", "bias"]
         super(BatchNorm2d, self).set_params(**parameters)
 
 
 class _BatchNorm2d(Operation):
-
-    def __init__(self, handle, running_mean, running_var):
-        self.running_mean = running_mean.data
-        self.running_var = running_var.data
+    def __init__(self, handle, name=None):
+        super(_BatchNorm2d, self).__init__(name)
         self.handle = handle
 
-    def forward(self, x, scale, bias):
+    def forward(self, x, scale, bias, running_mean, running_var):
+        self.running_mean = running_mean
+        self.running_var = running_var
         if training:
 
-            if self.handle.device_id == -1:
-                raise NotImplementedError
+            if isinstance(self.handle, singa.BatchNormHandle):
+                y, mean, var = singa.CpuBatchNormForwardTraining(
+                    self.handle, x, scale, bias, running_mean, running_var
+                )
+                self.cache = (x, scale, mean, var)
             else:
-                y, mean, var = singa.GpuBatchNormForwardTraining(self.handle,
-                                                                 x, scale, bias, self.running_mean, self.running_var)
+                y, mean, var = singa.GpuBatchNormForwardTraining(
+                    self.handle, x, scale, bias, running_mean, running_var
+                )
+
                 self.cache = (x, scale, mean, var)
         else:
-            if self.handle.device_id == -1:
-                raise NotImplementedError
-            else:
+            if isinstance(self.handle, singa.CudnnBatchNormHandle):
                 y = singa.GpuBatchNormForwardInference(
-                    self.handle, x, scale, bias, self.running_mean, self.running_var)
+                    self.handle,
+                    x,
+                    scale,
+                    bias,
+                    running_mean,
+                    running_var,
+                )
+            else:
+                y = singa.CpuBatchNormForwardInference(
+                    self.handle,
+                    x,
+                    scale,
+                    bias,
+                    running_mean,
+                    running_var,
+                )
+
         return y
 
     def backward(self, dy):
         assert training is True and hasattr(
-            self, 'cache'), 'Please set training as True before do BP. '
+            self, "cache"
+        ), "Please set training as True before do BP. "
 
-        if dy.device().id() != self.handle.device_id:
-            dy.ToDevice(self.cache[0].device())
-
-        if self.handle.device_id == -1:
-            raise NotImplementedError
-        else:
-            x, scale, mean, var = self.cache
+        x, scale, mean, var = self.cache
+        if isinstance(self.handle, singa.CudnnBatchNormHandle):
             dx, ds, db = singa.GpuBatchNormBackward(
-                self.handle, dy, x, scale, mean, var)
-            return dx, ds, db
+                self.handle, dy, x, scale, mean, var
+            )
+        else:
+            dx, ds, db = singa.CpuBatchNormBackward(
+                self.handle, dy, x, scale, mean, var
+            )
+
+        return dx, ds, db
 
 
 def batchnorm_2d(handle, x, scale, bias, running_mean, running_var):
-    return _BatchNorm2d(handle, running_mean, running_var)(x, scale, bias)[0]
+    return _BatchNorm2d(handle)(x, scale, bias, running_mean, running_var)[0]
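A plain-numpy sketch of the training-mode arithmetic these handles dispatch to (standard batch normalization; eps and the exact running-stat update convention are details of the underlying kernels):

```python
import numpy as np

def batchnorm_train(x, scale, bias, r_mean, r_var, momentum=0.9, eps=1e-5):
    # x: (N, C, H, W); statistics are per channel
    mean = x.mean(axis=(0, 2, 3))
    var = x.var(axis=(0, 2, 3))
    r_mean = momentum * r_mean + (1 - momentum) * mean
    r_var = momentum * r_var + (1 - momentum) * var
    x_hat = (x - mean[None, :, None, None]) / \
        np.sqrt(var[None, :, None, None] + eps)
    y = scale[None, :, None, None] * x_hat + bias[None, :, None, None]
    return y, r_mean, r_var
```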
 
 
 class _Pooling2d(Operation):
-
     def __init__(self, handle):
+        super(_Pooling2d, self).__init__()
         self.handle = handle
 
     def forward(self, x):
-        if self.handle.device_id == -1:
-            raise NotImplementedError
-        else:
+        if isinstance(self.handle, singa.CudnnPoolingHandle):
             y = singa.GpuPoolingForward(self.handle, x)
+        else:
+            y = singa.CpuPoolingForward(self.handle, x)
 
         if training:
             self.cache = (x, y)
@@ -1013,11 +1170,15 @@
         return y
 
     def backward(self, dy):
-        if self.handle.device_id == -1:
-            raise NotImplementedError
+        if isinstance(self.handle, singa.CudnnPoolingHandle):
+            dx = singa.GpuPoolingBackward(
+                self.handle, dy, self.cache[0], self.cache[1]
+            )
         else:
-            dx = singa.GpuPoolingBackward(self.handle,
-                                          dy, self.cache[0], self.cache[1])
+            dx = singa.CpuPoolingBackward(
+                self.handle, dy, self.cache[0], self.cache[1]
+            )
+
         return dx
 
 
@@ -1026,14 +1187,13 @@
 
 
 class Pooling2d(Layer):
-
     def __init__(self, kernel_size, stride=None, padding=0, is_max=True):
         if isinstance(kernel_size, int):
             self.kernel_size = (kernel_size, kernel_size)
         elif isinstance(kernel_size, tuple):
             self.kernel_size = kernel_size
         else:
-            raise TypeError('Wrong kernel_size type.')
+            raise TypeError("Wrong kernel_size type.")
 
         if stride is None:
             self.stride = self.kernel_size
@@ -1041,80 +1201,116 @@
             self.stride = (stride, stride)
         elif isinstance(stride, tuple):
             self.stride = stride
-            assert stride[0] > 0 or (kernel_size[0] == 1 and padding[
-                0] == 0), 'stride[0]=0, but kernel_size[0]=%d, padding[0]=%d' % (kernel_size[0], padding[0])
+            assert stride[0] > 0 or (kernel_size[0] == 1 and padding[0] == 0), (
+                "stride[0]=0, but kernel_size[0]=%d, padding[0]=%d"
+                % (kernel_size[0], padding[0])
+            )
         else:
-            raise TypeError('Wrong stride type.')
+            raise TypeError("Wrong stride type.")
 
         if isinstance(padding, int):
             self.padding = (padding, padding)
         elif isinstance(padding, tuple):
             self.padding = padding
         else:
-            raise TypeError('Wrong padding type.')
+            raise TypeError("Wrong padding type.")
 
         self.is_max = is_max
 
     def __call__(self, x):
 
-        out_shape_h = int(
-            (x.shape[2] + 2 * self.padding[0] - self.kernel_size[0]) // self.stride[0]) + 1
-        out_shape_w = int(
-            (x.shape[3] + 2 * self.padding[1] - self.kernel_size[1]) // self.stride[1]) + 1
+        out_shape_h = (
+            int(
+                (x.shape[2] + 2 * self.padding[0] - self.kernel_size[0])
+                // self.stride[0]
+            )
+            + 1
+        )
+        out_shape_w = (
+            int(
+                (x.shape[3] + 2 * self.padding[1] - self.kernel_size[1])
+                // self.stride[1]
+            )
+            + 1
+        )
         if x.device.id() == -1:
-            if not hasattr(self, 'handle'):
+            if not hasattr(self, "handle"):
                 self.handle = singa.PoolingHandle(
-                    x.data, self.kernel_size, self.stride, self.padding, self.is_max)
-            elif x.shape[0] != self.handle.batchsize or out_shape_h != self.handle.pooled_height or \
-                    out_shape_w != self.handle.pooled_width:
-                self.handle = singa.PoolingHandle(x.data, self.kernel_size, self.stride,
-                                                  self.padding, self.is_max)
+                    x.data,
+                    self.kernel_size,
+                    self.stride,
+                    self.padding,
+                    self.is_max,
+                )
+            elif (
+                x.shape[0] != self.handle.batchsize
+                or out_shape_h != self.handle.pooled_height
+                or out_shape_w != self.handle.pooled_width
+            ):
+                self.handle = singa.PoolingHandle(
+                    x.data,
+                    self.kernel_size,
+                    self.stride,
+                    self.padding,
+                    self.is_max,
+                )
         else:
-            if not hasattr(self, 'handle'):
-                self.handle = singa.CudnnPoolingHandle(x.data, self.kernel_size, self.stride,
-                                                       self.padding, self.is_max)
-            elif x.shape[0] != self.handle.batchsize or out_shape_h != self.handle.pooled_height or \
-                    out_shape_w != self.handle.pooled_width:
-                self.handle = singa.CudnnPoolingHandle(x.data, self.kernel_size, self.stride,
-                                                       self.padding, self.is_max)
-
-        self.handle.device_id = x.device.id()
+            if not hasattr(self, "handle"):
+                self.handle = singa.CudnnPoolingHandle(
+                    x.data,
+                    self.kernel_size,
+                    self.stride,
+                    self.padding,
+                    self.is_max,
+                )
+            elif (
+                x.shape[0] != self.handle.batchsize
+                or out_shape_h != self.handle.pooled_height
+                or out_shape_w != self.handle.pooled_width
+            ):
+                self.handle = singa.CudnnPoolingHandle(
+                    x.data,
+                    self.kernel_size,
+                    self.stride,
+                    self.padding,
+                    self.is_max,
+                )
 
         y = pooling_2d(self.handle, x)
         return y
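out_shape_h and out_shape_w above follow the usual pooling formula out = (in + 2 * pad - kernel) // stride + 1; for example:

```python
in_h, pad, kernel, stride = 32, 1, 3, 2
out_h = (in_h + 2 * pad - kernel) // stride + 1   # (32 + 2 - 3) // 2 + 1 = 16
```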
 
 
 class MaxPool2d(Pooling2d):
-
     def __init__(self, kernel_size, stride=None, padding=0):
         super(MaxPool2d, self).__init__(kernel_size, stride, padding, True)
 
 
 class AvgPool2d(Pooling2d):
-
     def __init__(self, kernel_size, stride=None, padding=0):
         super(AvgPool2d, self).__init__(kernel_size, stride, padding, False)
 
 
 class MaxPool1d(Pooling2d):
-
     def __init__(self, kernel_size, stride=None, padding=0):
         if stride is None:
             stride = kernel_size
-        super(MaxPool2d, self).__init__(
-            (1, kernel_size), (0, stride), (0, padding), True)
+        super(MaxPool1d, self).__init__(
+            (1, kernel_size), (0, stride), (0, padding), True
+        )
 
 
 class AvgPool1d(Pooling2d):
-
     def __init__(self, kernel_size, stride=None, padding=0):
         if stride is None:
             stride = kernel_size
-        super(MaxPool2d, self).__init__(
-            (1, kernel_size), (0, stride), (0, padding), False)
+        super(AvgPool1d, self).__init__(
+            (1, kernel_size), (0, stride), (0, padding), False
+        )
 
 
 class Tanh(Operation):
+    def __init__(self):
+        super(Tanh, self).__init__()
 
     def forward(self, x):
         out = singa.Tanh(x)
@@ -1135,6 +1331,8 @@
 
 
 class Sigmoid(Operation):
+    def __init__(self):
+        super(Sigmoid, self).__init__()
 
     def forward(self, x):
         out = singa.Sigmoid(x)
@@ -1155,6 +1353,8 @@
 
 
 class ElemMatmul(Operation):
+    def __init__(self):
+        super(ElemMatmul, self).__init__()
 
     def forward(self, x1, x2):
         if training:
@@ -1181,7 +1381,6 @@
 
 
 class RNN_Base(Layer):
-
     def __init__(self):
         raise NotImplementedError
 
@@ -1193,8 +1392,17 @@
 
 
 class RNN(RNN_Base):
-
-    def __init__(self, input_size, hidden_size, num_layers=1, nonlinearity='tanh', bias=True, batch_first=False, dropout=0, bidirectional=False):
+    def __init__(
+        self,
+        input_size,
+        hidden_size,
+        num_layers=1,
+        nonlinearity="tanh",
+        bias=True,
+        batch_first=False,
+        dropout=0,
+        bidirectional=False,
+    ):
         self.nonlinearity = nonlinearity
 
         Wx_shape = (input_size, hidden_size)
@@ -1217,7 +1425,7 @@
             xs = tuple(xs)
         inputs = xs + (h0,)
         self.device_check(*inputs)
-        #self.device_check(inputs[0], *self.params)
+        # self.device_check(inputs[0], *self.params)
         self.device_check(inputs[0], self.Wx, self.Wh, self.b)
         batchsize = xs[0].shape[0]
         out = []
@@ -1234,9 +1442,9 @@
         y1 = matmul(x, Wx)
         y = add(y2, y1)
         y = add_bias(y, b, axis=0)
-        if self.nonlinearity == 'tanh':
+        if self.nonlinearity == "tanh":
             y = tanh(y)
-        elif self.nonlinearity == 'relu':
+        elif self.nonlinearity == "relu":
             y = relu(y)
         else:
             raise ValueError
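step_forward above is the usual vanilla-RNN cell, h_t = f(x_t Wx + h_{t-1} Wh + b) with f either tanh or relu; a numpy sketch (illustrative):

```python
import numpy as np

def rnn_step(x, h, Wx, Wh, b, nonlinearity="tanh"):
    y = x @ Wx + h @ Wh + b
    return np.tanh(y) if nonlinearity == "tanh" else np.maximum(y, 0.0)
```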
@@ -1244,8 +1452,17 @@
 
 
 class LSTM(RNN_Base):
-
-    def __init__(self, input_size, hidden_size, nonlinearity='tanh', num_layers=1, bias=True, batch_first=False, dropout=0, bidirectional=False):
+    def __init__(
+        self,
+        input_size,
+        hidden_size,
+        nonlinearity="tanh",
+        num_layers=1,
+        bias=True,
+        batch_first=False,
+        dropout=0,
+        bidirectional=False,
+    ):
         self.nonlinearity = nonlinearity
 
         Wx_shape = (input_size, hidden_size)
@@ -1269,7 +1486,6 @@
             b.set_value(0.0)
             self.Bx.append(b)
 
-        Bh_shape = (hidden_size,)
         self.Bh = []
         for i in range(4):
             b = Tensor(shape=Bx_shape, requires_grad=True, stores_grad=True)
@@ -1286,17 +1502,19 @@
             xs = list(xs)
         inputs = xs + list((h0, c0))
         self.device_check(*inputs)
-        #self.device_check(inputs[0], *self.params)
+        # self.device_check(inputs[0], *self.params)
         self.device_check(inputs[0], *(self.Wx + self.Wh + self.Bx + self.Bh))
         batchsize = xs[0].shape[0]
         out = []
         h, c = self.step_forward(
-            xs[0], h0, c0, self.Wx, self.Wh, self.Bx, self.Bh)
+            xs[0], h0, c0, self.Wx, self.Wh, self.Bx, self.Bh
+        )
         out.append(h)
         for x in xs[1:]:
             assert x.shape[0] == batchsize
             h, c = self.step_forward(
-                x, h, c, self.Wx, self.Wh, self.Bx, self.Bh)
+                x, h, c, self.Wx, self.Wh, self.Bx, self.Bh
+            )
             out.append(h)
         return out, h, c
 
@@ -1336,9 +1554,9 @@
         hout = tanh(cout)
         hout = mul(o, hout)
         return hout, cout
-    
-class Abs(Operation):
 
+
+class Abs(Operation):
     def forward(self, a):
         if training:
             self.input = a
@@ -1348,11 +1566,12 @@
         dx = singa.Sign(self.input)
         return singa.__mul__(dy, dx)
 
+
 def abs(a):
     return Abs()(a)[0]
 
-class Exp(Operation):
 
+class Exp(Operation):
     def forward(self, a):
         if training:
             self.input = a
@@ -1362,29 +1581,34 @@
         dx = singa.Exp(self.input)
         return singa.__mul__(dy, dx)
 
+
 def exp(a):
     return Exp()(a)[0]
 
-class LeakyRelu(Operation):
 
-    def forward(self, x, a):
+class LeakyRelu(Operation):
+    def __init__(self, a):
+        super(LeakyRelu, self).__init__()
+        self.a = a
+
+    def forward(self, x):
         if training:
             self.input = x
         x1 = singa.LTFloat(x, 0.0)
         x1 = singa.__mul__(x, x1)
-        x1 = singa.MultFloat(x1, a)  
+        x1 = singa.MultFloat(x1, self.a)
         x2 = singa.ReLU(x)
         x1 = singa.__add__(x1, x2)
         return x1
 
-    def backward(self, dy, a):
-        
+    def backward(self, dy):
+        # TODO(wangwei) check the correctness
         dx1 = singa.GTFloat(self.input, 0.0)
         dx2 = singa.LTFloat(self.input, 0.0)
-        dx2 = singa.MultFloat(x1, a) 
-        dx =  singa.__add__(x1, x2) 
+        dx2 = singa.MultFloat(dx2, self.a)
+        dx = singa.__add__(dx1, dx2)
         return singa.__mul__(dy, dx)
 
 
-def leakyrelu(x,a=0.01):
-    return LeakyRelu()(x,a)[0]
+def leakyrelu(x, a=0.01):
+    return LeakyRelu(a)(x)[0]
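The op above computes y = x for x > 0 and y = a * x otherwise; a plain-numpy check of the forward and gradient rules (illustrative, and sidestepping the x == 0 corner case):

```python
import numpy as np

a = 0.01
x = np.array([-2.0, 3.0], dtype=np.float32)
y = np.where(x > 0, x, a * x)        # [-0.02,  3.  ]
dy = np.ones_like(x)
dx = np.where(x > 0, 1.0, a) * dy    # [ 0.01,  1.  ]
```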
diff --git a/python/singa/opt.py b/python/singa/opt.py
index f744f57..76ac55d 100755
--- a/python/singa/opt.py
+++ b/python/singa/opt.py
@@ -127,6 +127,7 @@
                 grad(Tensor): param gradients; the values may be updated
                         in this function; cannot use it anymore
         """
+        assert param.shape == grad.shape, ("shape mismatch", param.shape, grad.shape)
         group = self.default_config
         if param in self.param2config:
             group = self.param2config[param]
diff --git a/python/singa/sonnx.py b/python/singa/sonnx.py
new file mode 100755
index 0000000..e68f3a4
--- /dev/null
+++ b/python/singa/sonnx.py
@@ -0,0 +1,479 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+
+
+from __future__ import division
+
+import warnings
+from collections import deque
+from onnx import helper, checker
+from onnx import TensorProto
+from onnx import numpy_helper
+from onnx.backend.base import BackendRep as backendRep
+from onnx.backend.base import Backend as backend
+import onnx
+
+from . import singa_wrap as singa
+from . import autograd
+from . import tensor
+
+
+class Handle(object):
+    @staticmethod
+    def conv(inputs, attrs):
+        # inputs: a list of the input tensors
+        kernel = tuple(attrs["kernel_shape"])
+        padding = tuple(attrs["pads"][0:2])
+        stride = tuple(attrs["strides"])
+        group = attrs["group"]
+
+        bias = len(inputs) == 3
+        x = inputs[0]
+        x_shape = inputs[0].shape
+        in_channels = x_shape[1]
+        w_shape = inputs[1].shape
+        out_channels = w_shape[0]
+        assert w_shape[1] == in_channels // group
+
+        if inputs[0].device.id() == -1:
+            if group != 1:
+                raise NotImplementedError
+            else:
+                handle = singa.ConvHandle(
+                    x.data,
+                    kernel,
+                    stride,
+                    padding,
+                    in_channels,
+                    out_channels,
+                    bias,
+                    group
+                )
+        else:
+            handle = singa.CudnnConvHandle(
+                x.data,
+                kernel,
+                stride,
+                padding,
+                in_channels,
+                out_channels,
+                bias,
+                group
+            )
+        return handle
+
+    @staticmethod
+    def max_pool(inputs, attrs):
+        x = inputs[0]
+        kernel = tuple(attrs["kernel_shape"])
+        padding = tuple(attrs["pads"][0:2])
+        stride = tuple(attrs["strides"])
+        if x.device.id() == -1:
+            handle = singa.PoolingHandle(x.data, kernel, stride, padding, True)
+        else:
+            handle = singa.CudnnPoolingHandle(
+                x.data, kernel, stride, padding, True
+            )
+        return handle
+
+    @staticmethod
+    def avg_pool(inputs, attrs):
+        x = inputs[0]
+        kernel = tuple(attrs["kernel_shape"])
+        padding = tuple(attrs["pads"][0:2])
+        stride = tuple(attrs["strides"])
+        if x.device.id() == -1:
+            handle = singa.PoolingHandle(
+                x.data, kernel, stride, padding, False
+            )
+        else:
+            handle = singa.CudnnPoolingHandle(
+                x.data, kernel, stride, padding, False
+            )
+        return handle
+
+    @staticmethod
+    def batchnorm(inputs, attrs):
+        x = inputs[0]
+        factor = attrs["momentum"]
+        if x.device.id() == -1:
+            raise NotImplementedError
+        else:
+            handle = singa.CudnnBatchNormHandle(factor, x.data)
+        return handle
+
+
+UnaryOp = {
+    "Relu": autograd.relu,
+    "Softmax": autograd.softmax,
+    "Flatten": autograd.flatten,
+    "Tanh": autograd.tanh,
+    "Sigmoid": autograd.sigmoid,
+}
+BinaryOp = {
+    "Add": autograd.add_bias,
+    "Mul": autograd.mul,
+    "MatMul": autograd.matmul,
+}
+
+OtherOp = {
+    "Conv": (Handle.conv, autograd.conv2d),
+    "MaxPool": (Handle.max_pool, autograd.pooling_2d),
+    "AveragePool": (Handle.avg_pool, autograd.pooling_2d),
+    "BatchNormalization": (Handle.batchnorm, autograd.batchnorm_2d),
+}
+
+
+class SingaBackendRep(backendRep):
+    def __init__(self, model, device, tensor_dict):
+        """
+        Args:
+            model: onnx model proto
+            device: singa device
+            tensor_dict: dict for weight tensors
+        """
+        self.model = model
+        self.device = device
+        self.tensor_dict = tensor_dict
+        self.handle_dict = {}
+
+    @staticmethod
+    def run_node(node, inputs, handles):
+        """
+        Args:
+            node: onnx node proto
+            inputs: a list of input tensors
+            handles: dict from node name to handle
+
+        Return:
+            a list of output tensors
+        """
+        attrs = attribute2dict(node)
+        op = node.op_type
+        if op in UnaryOp:
+            out = UnaryOp[op](inputs[0])
+        elif op in BinaryOp:
+            out = BinaryOp[op](inputs[0], inputs[1])
+        elif op in OtherOp:
+            handle, forward = OtherOp[op]
+            if node.name not in handles:
+                handles[node.name] = handle(inputs, attrs)
+            out = forward(handles[node.name], *inputs)
+        elif op == "Concat":
+            out = autograd.cat(tuple(inputs), attrs["axis"])
+        else:
+            raise NotImplementedError("Not supported op: {}".format(op))
+        return [out]
+
+    def run(self, inputs):
+        """
+        Run the graph with given inputs.
+
+        Args:
+            inputs: a list of tensors whose name and order match the
+                graph inputs.
+
+        Return:
+            a list of output tensors whose order match the graph outputs.
+        """
+        # input_dict: dict from input name to numpy array
+        tensors = self.tensor_dict.copy()
+        for i, x in enumerate(inputs):
+            tensors[x.name] = x
+            if x.name != self.model.graph.input[i].name:
+                warnings.warn("the inputs do not match the graph inputs")
+
+        for node in self.model.graph.node:
+            if node.op_type != "Constant":
+                inputs = [tensors[x] for x in node.input]
+                outputs = SingaBackendRep.run_node(
+                    node, inputs, self.handle_dict
+                )
+                for (key, val) in zip(node.output, outputs):
+                    tensors[key] = val
+        y = []
+        for i in self.model.graph.output:
+            y.append(tensors[i.name])
+        return y
+
+
+def attribute2dict(node):
+    # create a dictionary from the node attribute name to value
+    attr = {}
+    for a in node.attribute:
+        attr[a.name] = helper.get_attribute_value(a)
+    return attr
+
+
+class SingaBackend(backend):
+    @classmethod
+    def prepare(
+        cls,
+        model,  # type: ModelProto
+        device,  # type: singa device
+        **kwargs  # type: Any
+    ):  # type: (...) -> Optional[BackendRep]
+        """
+        Args:
+            model: onnx model proto
+            device: singa device
+        Return:
+            SingaBackendRep instance
+        """
+        super(SingaBackend, cls).prepare(model, device, **kwargs)
+        name2tensor = {}
+        for node in model.graph.node:
+            if node.op_type == "Constant":
+                data = helper.get_attribute_value(node.attribute[0])
+                requires_grad, stores_grad = True, True
+                if len(node.attribute) == 3:
+                    requires_grad = helper.get_attribute_value(
+                        node.attribute[1]
+                    )
+                    stores_grad = helper.get_attribute_value(node.attribute[2])
+                t = tensor.Tensor(
+                    device=device,
+                    data=numpy_helper.to_array(data),
+                    requires_grad=requires_grad,
+                    stores_grad=stores_grad,
+                )
+
+                name2tensor[node.output[0]] = t
+
+        return SingaBackendRep(model, device, name2tensor)
+
+    @classmethod
+    def run_node(cls, node, inputs, device, outputs_info=None, **kwargs):
+        """
+        Args:
+            node: onnx node proto
+            inputs: list of singa tensors; the names should match
+                node inputs
+        Return:
+            a list of singa tensors as the node outputs
+        """
+        super(SingaBackend, cls).run_node(node, inputs, device)
+        handles = {}
+        outputs = SingaBackendRep.run_node(node, inputs, handles)
+        return outputs
+
+
+def to_onnx_model(inputs, y, model_name="sonnx"):
+    """
+    get onnx model from singa computational graph
+    Args:
+        inputs: a list of input tensors (each is initialized with a name)
+        y: a list of tensors, usually the outputs of the graph
+    Return:
+        the onnx model
+    """
+    assert len(y) == 1  # assume there is only one output
+    y = y[0]
+    node = []
+    dependency, _ = autograd.infer_dependency(y.creator)
+
+    input_ids = set(id(x) for x in inputs)
+    X = []
+    for x in inputs:
+        dtype = TensorProto.FLOAT
+        if x.dtype == tensor.int32:
+            dtype = TensorProto.INT32
+        X.append(helper.make_tensor_value_info(x.name, dtype, x.shape))
+    Y = [helper.make_tensor_value_info(y.name, TensorProto.FLOAT, y.shape)]
+    ready = deque([y.creator])
+
+    while len(ready) > 0:
+        op = ready.pop()
+        assert not isinstance(op, autograd.Dummy)
+        outputs = [op.output_name(idx) for yid, idx in op.y_id2idx.items()]
+        inputs = [
+            srcop.output_name(srcop.y_id2idx[yid])
+            for (srcop, yid, _, _) in op.src
+        ]
+    opname = op.name
+        optype = str(op).split(".")[-1].split(" ")[0]
+        if isinstance(op, autograd.Concat):
+            node.append(
+                helper.make_node(
+                    "Concat",
+                    inputs=inputs,
+                    outputs=outputs,
+                    name=opname,
+                    axis=op.axis,
+                )
+            )
+        elif isinstance(op, autograd._Conv2d):
+            pads = [
+                op.handle.pad_h,
+                op.handle.pad_w,
+                op.handle.pad_w,
+                op.handle.pad_h,
+            ]
+            stride = [op.handle.stride_h, op.handle.stride_w]
+            k = [op.handle.kernel_h, op.handle.kernel_w]
+            node.append(
+                helper.make_node(
+                    "Conv",
+                    inputs=inputs,
+                    outputs=outputs,
+                    name=opname,
+                    kernel_shape=k,
+                    pads=pads,
+                    strides=stride,
+                    group=op.handle.group,
+                )
+            )
+        elif isinstance(op, autograd._Pooling2d):
+            k = [op.handle.kernel_h, op.handle.kernel_w]
+            s = [op.handle.stride_h, op.handle.stride_w]
+            p = [
+                op.handle.pad_h,
+                op.handle.pad_w,
+                op.handle.pad_w,
+                op.handle.pad_h,
+            ]
+            if op.handle.is_max_pooling:
+                node.append(
+                    helper.make_node(
+                        "MaxPool",
+                        inputs=inputs,
+                        outputs=outputs,
+                        name=opname,
+                        kernel_shape=k,
+                        pads=p,
+                        strides=s,
+                    )
+                )
+            else:
+                node.append(
+                    helper.make_node(
+                        "AveragePool",
+                        inputs=inputs,
+                        outputs=outputs,
+                        name=opname,
+                        kernel_shape=k,
+                        pads=p,
+                        strides=s,
+                    )
+                )
+        elif isinstance(op, autograd._BatchNorm2d):
+            node.append(
+                helper.make_node(
+                    "BatchNormalization",
+                    inputs=inputs,
+                    outputs=outputs,
+                    name=opname,
+                    momentum=op.handle.factor,
+                )
+            )
+            # [(<singa.autograd.Sigmoid object at 0x7fd5ec09cb90>, 140556764852432, None, False),
+            # (<singa.autograd.Dummy object at 0x7fd5ec09c390>, 140556764824208,
+            # <singa.tensor.Tensor object at 0x7fd5ec09c290>, True),
+            # (<singa.autograd.Dummy object at 0x7fd5ec09c490>, 140556764824528,
+            # <singa.tensor.Tensor object at 0x7fd5ec09c3d0>, True),
+            # (<singa.autograd.Dummy object at 0x7fd5ec09c590>, 140556764824784, None, False),
+            # (<singa.autograd.Dummy object at 0x7fd5ec09c690>, 140556764825040, None, False)])
+            # two dummy operators do not have values, so take the values from handle
+            """
+            dummy0 = tensor.to_numpy(
+                tensor.Tensor(
+                    device=op.running_mean.device(), data=op.running_mean
+                )
+            )
+            dummy1 = tensor.to_numpy(
+                tensor.Tensor(
+                    device=op.running_var.device(), data=op.running_var
+                )
+            )
+            dummy0 = helper.make_node(
+                "Constant",
+                inputs=[],
+                outputs=[inputs[3]],
+                value=numpy_helper.from_array(dummy0),
+            )
+            dummy1 = helper.make_node(
+                "Constant",
+                inputs=[],
+                outputs=[inputs[4]],
+                value=numpy_helper.from_array(dummy1),
+            )
+            node.append(dummy0)
+            node.append(dummy1)
+            """
+        else:
+            singa2onnx = {
+                "SoftMax": "Softmax",
+                "AddBias": "Add",
+                "Add": "Add",
+                "Matmul": "MatMul",
+                "ReLU": "Relu",
+                "ElemMatmul": "Mul",
+                "Flatten": "Flatten",
+                "Tanh": "Tanh",
+                "Sigmoid": "Sigmoid"
+            }
+            assert optype in singa2onnx, "Unsupported op:{}".format(optype)
+            onnx_op = singa2onnx[optype]
+            node.append(
+                helper.make_node(
+                    onnx_op, inputs=inputs, outputs=outputs, name=opname
+                )
+            )
+
+        for srcop, yid, y, _ in op.src:
+            dependency[srcop] -= 1
+            if dependency[srcop] == 0:
+                if isinstance(srcop, autograd.Dummy):
+                    if yid not in input_ids:
+                        tmp = helper.make_node(
+                            "Constant",
+                            inputs=[],
+                            outputs=[srcop.output_name(0)],
+                            value=helper.make_tensor(
+                                name=opname,
+                                data_type=TensorProto.FLOAT,
+                                dims=y.shape,
+                                vals=tensor.to_numpy(y)
+                                .flatten()
+                                .astype(float),
+                            ),
+                        )
+                        node.append(tmp)
+                else:
+                    ready.append(srcop)
+
+    # print(node)
+    onnx_model = helper.make_model(
+        helper.make_graph(node[::-1], model_name, X, Y)
+    )
+    checker.check_model(onnx_model)
+    return onnx_model
+
+
+def export(inputs, y, file_path, model_name="sonnx"):
+    onnx_model = to_onnx_model(inputs, y, model_name)
+    onnx.save(onnx_model, file_path)
+
+
+run_model = SingaBackend.run_model
+run_node = SingaBackend.run_node
+supports_device = SingaBackend.supports_device
+prepare = SingaBackend.prepare
+save = onnx.save
+load = onnx.load
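
A minimal usage sketch of the sonnx API added above, assuming an autograd
graph whose forward pass produced output tensor `y` from input tensor `x`;
the file name "mlp.onnx" is only an example:

    from singa import sonnx

    # SINGA -> ONNX: serialize the graph that produced `y`
    sonnx.export([x], y, file_path="mlp.onnx", model_name="mlp")

    # ONNX -> SINGA: load a model and execute it with the backend
    model = sonnx.load("mlp.onnx")
    rep = sonnx.prepare(model)  # may take extra arguments, e.g. the device
    outputs = rep.run([x])
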
diff --git a/python/singa/tensor.py b/python/singa/tensor.py
index bcbb9fe..07e5d5f 100755
--- a/python/singa/tensor.py
+++ b/python/singa/tensor.py
@@ -87,10 +87,10 @@
                      grad but not store grad; But if a tensor stores grad
                      then it must require grad.
     '''
-
+    tensor_count = 0
     def __init__(self, shape=(), device=None, dtype=float32,
                  data=None, requires_grad=True, stores_grad=False,
-                 creator=None):
+                 creator=None, name=None):
         if device is None:
             device = get_default_device()
         if isinstance(data, np.ndarray):
@@ -107,9 +107,14 @@
         self.dtype = self.data.data_type()
         self.requires_grad = requires_grad
         self.stores_grad = stores_grad
+        if name is None:
+            self.name = 'Dummy#{}'.format(Tensor.tensor_count)
+            Tensor.tensor_count += 1
+        else:
+            self.name = name
         if creator is None:
             from . import autograd
-            self.creator = autograd.Dummy(self)
+            self.creator = autograd.Dummy(self, name)
         else:
             self.creator = creator
 
@@ -472,9 +477,9 @@
             x (float or Tensor):
         '''
         if isinstance(x, Tensor):
-            self.data /= x.data
+            self.data *= (1.0/x.data)
         else:
-            self.data /= float(x)
+            self.data *= (1.0/float(x))
         return self
 
     '''
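
The `name` parameter added above exists so the ONNX exporter can emit
stable node names; a short sketch of the new behaviour (the exact Dummy
counter value depends on how many unnamed tensors were created before):

    import numpy as np
    from singa import tensor

    a = tensor.Tensor(shape=(2, 3), name="input")
    b = tensor.Tensor(data=np.ones((2, 3), dtype=np.float32))
    print(a.name)  # "input"
    print(b.name)  # e.g. "Dummy#0"
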
diff --git a/rat-excludes b/rat-excludes
index 9dd92e6..0930056 100644
--- a/rat-excludes
+++ b/rat-excludes
@@ -1,13 +1 @@
-rat-excludes
-Doxyfile
-Makefile.*
-configure
-.gitignore
-doc/*
-config/*
-\.dirstamp
-config.*
-stamp-h1
-.*\.conf
-.*\.md
-control
+rat-excludes
\ No newline at end of file
diff --git a/src/api/.gitignore b/src/api/.gitignore
deleted file mode 100644
index adb5d03..0000000
--- a/src/api/.gitignore
+++ /dev/null
@@ -1 +0,0 @@
-config.i
diff --git a/src/api/config.i.in b/src/api/config.i.in
index 05ddf6e..5bbfa1d 100644
--- a/src/api/config.i.in
+++ b/src/api/config.i.in
@@ -1,8 +1,28 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+
+
 // Pass in cmake configurations to swig
 #cmakedefine01 USE_CUDA
 #cmakedefine01 USE_CUDNN
 #cmakedefine01 USE_OPENCL
 #cmakedefine01 USE_PYTHON
+#cmakedefine01 USE_MKLDNN
 #cmakedefine01 USE_JAVA
 #cmakedefine CUDNN_VERSION ${CUDNN_VERSION}
 
diff --git a/src/api/model_operation.i b/src/api/model_operation.i
index 56141d8..02c957a 100755
--- a/src/api/model_operation.i
+++ b/src/api/model_operation.i
@@ -1,3 +1,24 @@
+/************************************************************
+*
+* Licensed to the Apache Software Foundation (ASF) under one
+* or more contributor license agreements.  See the NOTICE file
+* distributed with this work for additional information
+* regarding copyright ownership.  The ASF licenses this file
+* to you under the Apache License, Version 2.0 (the
+* "License"); you may not use this file except in compliance
+* with the License.  You may obtain a copy of the License at
+*
+*   http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing,
+* software distributed under the License is distributed on an
+* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+* KIND, either express or implied.  See the License for the
+* specific language governing permissions and limitations
+* under the License.
+*
+*************************************************************/
+
 %module model_operation
 
 %include "config.i"
@@ -17,9 +38,18 @@
   ConvHandle(const Tensor &input, const std::vector<size_t>& kernel_size,
              const std::vector<size_t>& stride, const std::vector<size_t>& padding,
              const size_t in_channels, const size_t out_channels,
-             const bool bias);
+             const bool bias, const size_t groups);
   bool bias_term;
   size_t batchsize;
+  size_t pad_w;
+  size_t pad_h;
+  size_t stride_h;
+  size_t stride_w;
+  size_t kernel_h;
+  size_t kernel_w;
+  size_t channels;
+  size_t num_filters;
+  size_t group;
 };
 
 Tensor CpuConvForward(const Tensor &x, Tensor &W,  Tensor &b, const ConvHandle &ch);
@@ -38,6 +68,31 @@
     size_t batchsize;
 };
 
+#if USE_MKLDNN
+
+
+Tensor CpuBatchNormForwardInference(const BatchNormHandle &bnh,
+                                    const Tensor &x,
+                                    const Tensor &bnScale,
+                                    const Tensor &bnBias,
+                                    Tensor &running_mean,
+                                    Tensor &running_var);
+
+const std::vector<Tensor> CpuBatchNormForwardTraining(const BatchNormHandle &bnh,
+                                                      const Tensor &x,
+                                                      const Tensor &bnScale,
+                                                      const Tensor &bnBias,
+                                                      Tensor &running_mean,
+                                                      Tensor &running_var);
+
+const std::vector<Tensor> CpuBatchNormBackwardx(const BatchNormHandle &bnh,
+                                                const Tensor &y, const Tensor &dy,
+                                                const Tensor &x,
+                                                const Tensor &bnScale, const Tensor &bnBias,
+                                                const Tensor &mean, const Tensor &var);
+
+#endif  //USE_MKLDNN
+
 
 class PoolingHandle {
  public:
@@ -46,11 +101,23 @@
                 const bool is_max=true);
 
   int batchsize;
-
+  int stride_h;
+  int stride_w;
+  int kernel_h;
+  int kernel_w;
+  int pad_h;
+  int pad_w;
   int pooled_height;
   int pooled_width;
+  bool is_max_pooling;
 };
 
+#if USE_MKLDNN
+
+Tensor CpuPoolingForward(const PoolingHandle &ph, const Tensor &x);
+Tensor CpuPoolingBackward(const PoolingHandle &ph, const Tensor &dy,
+                            const Tensor& x, const Tensor& y);
+#endif  //USE_MKLDNN
 
 #if USE_CUDNN
 class CudnnConvHandle: public ConvHandle {
@@ -62,6 +129,15 @@
                   const std::string& prefer = "fastest");
   bool bias_term;
   size_t batchsize;
+  size_t pad_w;
+  size_t pad_h;
+  size_t stride_h;
+  size_t stride_w;
+  size_t kernel_h;
+  size_t kernel_w;
+  size_t channels;
+  size_t num_filters;
+  size_t group;
 };
 
 Tensor GpuConvForward(const Tensor &x, const Tensor &W, const Tensor &b, const CudnnConvHandle &cch);
@@ -76,8 +152,9 @@
 class CudnnBatchNormHandle: public BatchNormHandle{
     public:
       CudnnBatchNormHandle(const float momentum, const Tensor& input);
-
+    size_t channels;
     size_t batchsize;
+    float factor;
 };
 
 const std::vector<Tensor> GpuBatchNormForwardTraining(const CudnnBatchNormHandle &cbnh,
@@ -100,6 +177,14 @@
 
   int pooled_height;
   int pooled_width;
+  int kernel_h;
+  int kernel_w;
+  int pad_h;
+  int pad_w;
+
+  int stride_h;
+  int stride_w;
+
 };
 
 Tensor GpuPoolingForward(const CudnnPoolingHandle &cph, const Tensor &x);
diff --git a/src/core/device/cpp_cpu.cc b/src/core/device/cpp_cpu.cc
index 04209ab..ac2d42c 100644
--- a/src/core/device/cpp_cpu.cc
+++ b/src/core/device/cpp_cpu.cc
@@ -24,9 +24,18 @@
 
 CppCPU::CppCPU() : Device(-1, 1) {
   lang_ = kCpp;
+#ifdef USE_MKLDNN
+  ctx_.engine = new mkldnn::engine(mkldnn::engine::cpu, 0);
+#endif //USE_MKLDNN
   //host_ = nullptr;
 }
 
+CppCPU::~CppCPU() {
+#ifdef USE_MKLDNN
+  delete ctx_.engine;
+#endif  // USE_MKLDNN
+}
 
 void CppCPU::SetRandSeed(unsigned seed) {
   ctx_.random_generator.seed(seed);
diff --git a/src/core/tensor/tensor.cc b/src/core/tensor/tensor.cc
index 720ef90..8c50437 100755
--- a/src/core/tensor/tensor.cc
+++ b/src/core/tensor/tensor.cc
@@ -16,6 +16,7 @@
  * limitations under the License.
  */
 #include "singa/core/tensor.h"
+// #include "singa/utils/stacktrace.h"
 #include "./tensor_math.h"
 #include "./tensor_math_cpp.h"
 #include "./tensor_math_cuda.h"
@@ -36,7 +37,7 @@
 
 Tensor::Tensor() {
   device_ = defaultDevice;
-  strides_ = {1};
+  stride_ = {1};
 }
 
 //non-strided constructors
@@ -45,7 +46,7 @@
   size_t size = Product(shape_) * SizeOf(data_type_);
   if (size)
     block_ = device_->NewBlock((int)size);
-  generate_strides();
+  generate_stride();
 }
 
 
@@ -56,13 +57,13 @@
   size_t size = Product(shape_) * SizeOf(data_type_);
   if (size)
     block_ = device_->NewBlock((int)size);
-  generate_strides();
+  generate_stride();
 }
 
 
 Tensor::Tensor(const Tensor &in) : data_type_(in.data_type_),
   device_(in.device_),  block_(in.block()),  shape_(in.shape_),
-  strides_(in.strides_) {
+  stride_(in.stride_) {
   if (block_ != nullptr)
     block_->IncRefCount();
 }
@@ -70,13 +71,13 @@
 
 Tensor::Tensor(Tensor &&in) : data_type_(in.data_type_),
   device_(in.device_), shape_(std::move(in.shape_)),
-  strides_(std::move(in.strides_)) {
+  stride_(std::move(in.stride_)) {
   block_ = in.block_;
   in.block_ = nullptr;
 }
 
 
-void Tensor::ResetLike(const Tensor &in) {
+Tensor& Tensor::ResetLike(const Tensor &in) {
   if (block_ == nullptr || device_ != in.device_ || MemSize() != in.MemSize()) {
     if (block_ != nullptr && block_->DecRefCount() == 0)
       device_->FreeBlock(block_);
@@ -85,30 +86,39 @@
     block_ = device_->NewBlock((int)in.MemSize());
   }
   shape_ = in.shape_;
-  strides_ = in.strides_;
+  stride_ = in.stride_;
+  return *this;
 }
 
-void Tensor::SetShape(const Shape& shape) {
-  if (Product(shape_) != Product(shape)) {
+Tensor& Tensor::Resize(const Shape& shape) {
+  if (Size() != Product(shape)) {
     if (block_ != nullptr && block_->DecRefCount() == 0)
       device_->FreeBlock(block_);
     block_ = device_->NewBlock((int)(Product(shape) * SizeOf(data_type_)));
   }
   shape_ = shape;
-  generate_strides();
+  generate_stride();
+  return *this;
+}
+
+Tensor Resize(const Tensor& in, const Shape& shape) {
+  Tensor out(in);
+  out.Resize(shape);
+  return out;
 }
 
 
-void Tensor::AsType(const DataType type) {
+Tensor& Tensor::AsType(const DataType type) {
   if (data_type_ != type) {
     if (block_ != nullptr && block_->DecRefCount() == 0)
       device_->FreeBlock(block_);
     block_ = device_->NewBlock((int)(Product(shape_) * SizeOf(type)));
     data_type_ = type;
   }
+  return *this;
 }
 
-void Tensor::ToDevice(std::shared_ptr<Device> dst) {
+Tensor& Tensor::ToDevice(std::shared_ptr<Device> dst) {
   // TODO(wangwei) the comparison is restricted. May compare against device ID?
   if (device_ != dst) {
     Tensor tmp(shape_, dst, data_type_);
@@ -120,10 +130,12 @@
     tmp.block_ = nullptr;
     device_ = dst;
   }
+  return *this;
 }
 
-void Tensor::ToHost() {
+Tensor& Tensor::ToHost() {
   if (device_ != defaultDevice) ToDevice(device_->host());
+  return *this;
 }
 
 template <typename DType>
@@ -179,8 +191,8 @@
   data_type_ = proto.data_type();
   block_ = device_->NewBlock((int)(Product(shape()) * SizeOf(data_type_)));
   //transpose_ = proto.transpose();
-  strides_.clear();
-  for (int32_t s : proto.strides()) strides_.push_back(s);
+  stride_.clear();
+  for (int32_t s : proto.stride()) stride_.push_back(s);
   switch (data_type_) {
   case kFloat32: {
     std::unique_ptr<float[]> data_ptr(new float[Product(shape_)]);
@@ -230,9 +242,9 @@
   }
   proto->set_data_type(data_type_);
   //proto->set_transpose(transpose_);
-  proto->clear_strides();
-  for (auto s : strides_) {
-    proto->add_strides(s);
+  proto->clear_stride();
+  for (auto s : stride_) {
+    proto->add_stride(s);
   }
   switch (data_type_) {
   case kFloat32: {
@@ -314,7 +326,7 @@
     }
   }
   Tensor t(tshape, device_);
-  //t.strides_.push_back(1);
+  //t.stride_.push_back(1);
   t.RepeatData(repeats, axis, total_repeats, *this);
   return t;
 }
@@ -323,11 +335,36 @@
   if (device == nullptr) device = device_;
   Tensor t(shape_, device_, data_type_);
   //t.transpose_ = transpose_;
-  t.strides_ = strides_;
+  t.stride_ = stride_;
   t.CopyData(*this);
   return t;
 }
 
+Tensor& Tensor::Broadcast(const Shape& shape) {
+  // TODO(wangwei) do we need to transform the mem layout if the tensor was
+  // transposed?
+  auto m = shape_.size() - 1, n = shape.size() - 1;
+  for (size_t i = 0; i <= std::min(m, n); i++) {
+    if ((shape.at(n - i) != shape_.at(m - i)) && (shape.at(n - i) != 1)) {
+      CHECK_EQ(shape_.at(m - i), 1) << "i= " << i << "\n"; // << Backtrace();
+      shape_.at(m - i) = shape.at(n - i);
+      stride_.at(m - i) = 0;
+    }
+  }
+  if (m < n) {
+    for (size_t i = m + 1; i <= n; i++) {
+      shape_.emplace(shape_.begin(), shape.at(n - i));
+      stride_.emplace(stride_.begin(), 0);
+    }
+  }
+  return *this;
+}
+
+Tensor Broadcast(const Tensor& in, const Shape& shape) {
+  Tensor out(in);
+  return out.Broadcast(shape);
+}
+
 Tensor& Tensor::T() {
   // this function only works for 2d tensors
   CHECK_EQ(shape_.size(), 2u);
@@ -338,7 +375,7 @@
 //normal transpose without axes
 Tensor& Tensor::Transpose() {
   std::reverse(shape_.begin(), shape_.end());
-  std::reverse(strides_.begin(), strides_.end());
+  std::reverse(stride_.begin(), stride_.end());
   return *this;
 }
 
@@ -348,12 +385,12 @@
                                        "Tranpose axes's length should be equal to shape";
 
   auto shape = shape_;
-  auto strides = strides_;
+  auto stride = stride_;
   shape_.clear();
-  strides_.clear();
+  stride_.clear();
   for (size_t n = 0; n < axes.size(); ++n) {
     shape_.push_back(shape[axes[n]]);
-    strides_.push_back(strides[axes[n]]);
+    stride_.push_back(stride[axes[n]]);
   }
   return *this;
 }
@@ -375,7 +412,7 @@
 Tensor &Tensor::operator=(const Tensor &in) {
   if (block_ != nullptr && block_->DecRefCount() == 0)
     device_->FreeBlock(block_);
-  strides_ = in.strides_;
+  stride_ = in.stride_;
   data_type_ = in.data_type_;
   shape_ = in.shape_;
   device_ = in.device_;
@@ -388,7 +425,7 @@
 Tensor &Tensor::operator=(Tensor &&in) {
   if (block_ != nullptr && block_->DecRefCount() == 0)
     device_->FreeBlock(block_);
-    strides_ = std::move(in.strides_);
+  stride_ = std::move(in.stride_);
   data_type_ = in.data_type_;
   shape_ = std::move(in.shape_);
   device_ = in.device_;
@@ -659,14 +696,33 @@
 
 #define GenBinaryTensorFn(op, fn)                              \
   Tensor op(const Tensor &lhs, const Tensor &rhs) {            \
-    Tensor ret(lhs.shape(), lhs.device(), lhs.data_type());    \
-    fn(lhs, rhs, &ret);                                        \
-    return ret;                                                \
+    if (lhs.shape() != rhs.shape()) {                          \
+      auto lhs_ = Broadcast(lhs, rhs.shape());                 \
+      auto rhs_ = Broadcast(rhs, lhs.shape());                 \
+      Tensor ret(lhs_.shape(), lhs.device(), lhs.data_type()); \
+      fn(lhs_, rhs_, &ret);                                      \
+      return ret;                                              \
+    } else {                                                   \
+      Tensor ret(lhs.shape(), lhs.device(), lhs.data_type());  \
+      fn(lhs, rhs, &ret);                                      \
+      return ret;                                              \
+    }                                                          \
   }                                                            \
   void fn(const Tensor &lhs, const Tensor &rhs, Tensor *ret) { \
-    EltwiseBinaryTensorFn(fn, lhs, rhs, ret);                  \
+    CHECK_EQ(lhs.device(), ret->device());                     \
+    CHECK_EQ(rhs.device(), ret->device());                     \
+    if (lhs.shape() != rhs.shape()) {                          \
+      auto lhs_ = Broadcast(lhs, rhs.shape());                 \
+      auto rhs_ = Broadcast(rhs, lhs.shape());                 \
+      CHECK(lhs_.shape() == ret->shape());                    \
+      EltwiseBinaryTensorFn(fn, lhs_, rhs_, ret);              \
+    } else {                                                   \
+      CHECK(lhs.shape() == ret->shape());                      \
+      EltwiseBinaryTensorFn(fn, lhs, rhs, ret);                \
+    }                                                          \
   }
 
+// broadcasting operations: https://github.com/onnx/onnx/blob/master/docs/Broadcasting.md
 GenBinaryTensorFn(operator+, Add);
 GenBinaryTensorFn(operator-, Sub);
 GenBinaryTensorFn(operator*, EltwiseMult);
@@ -676,6 +732,7 @@
 GenBinaryTensorFn(operator<=, LE);
 GenBinaryTensorFn(operator>, GT);
 GenBinaryTensorFn(operator>=, GE);
+
 #define EltwiseTensorScalarFn(fn, t, x, ret)                            \
   do {                                                                  \
     TYPE_LANG_SWITCH(t.data_type(), DType, t.device()->lang(), Lang, {  \
@@ -1225,9 +1282,12 @@
 }
 
 
-// if tensor is not transposed yet, we change the shape and generate new strides
-// if tensor is already transposed, we reallocate the memory and generate strides
+// if tensor is not transposed yet, we change the shape and generate new stride
+// if tensor is already transposed, we reallocate the memory and generate stride
 Tensor& Tensor::Reshape(const Shape &shape) {
+  // Check that the original volume matches the new one;
+  // do not use Product(shape_) due to stride 0 from broadcasting.
+  CHECK_EQ(Product(shape), Size());
   if (transpose()) {
     Tensor t(shape, device_, data_type_);
     singa::Transform(*this, &t);
@@ -1235,8 +1295,8 @@
     std::swap(t.block_, block_);
   } else {
     shape_ = shape;
-    generate_strides();
   }
+  generate_stride();
   return *this;
 }
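
The Broadcast routine above implements ONNX/NumPy-style multidirectional
broadcasting by growing the shape and setting the stride of expanded axes
to 0, so no data is copied; NumPy exposes the same stride-0 trick, used
here purely for illustration:

    import numpy as np

    x = np.arange(3, dtype=np.float32).reshape(1, 3)   # shape (1, 3)
    b = np.lib.stride_tricks.as_strided(
        x, shape=(4, 3), strides=(0, x.strides[1]))    # stride 0 on axis 0
    assert (b == np.broadcast_to(x, (4, 3))).all()
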
 
diff --git a/src/core/tensor/tensor_math.h b/src/core/tensor/tensor_math.h
index f5fbc84..10d79cf 100644
--- a/src/core/tensor/tensor_math.h
+++ b/src/core/tensor/tensor_math.h
@@ -18,6 +18,11 @@
 #ifndef SINGA_CORE_MATH_H_
 #define SINGA_CORE_MATH_H_
 #include <type_traits>
+#include <string> 
+#include <algorithm> 
+#include <sstream> 
+#include <iterator> 
+#include <iostream> 
 #include "singa/core/common.h"
 #include "singa/core/tensor.h"
 #include "singa/utils/logging.h"
@@ -50,10 +55,32 @@
 /// 7. Use size_t for the number of elements, rows or columns.
 /// 8. Use the same name for the Tensor and Tensor level math functions.
 
+// defined inline because this header is included by multiple translation units
+inline const std::string vec2str(const std::vector<int>& vec) {
+  std::ostringstream vts;
+  if (!vec.empty()) {
+    // Convert all but the last element to avoid a trailing ","
+    std::copy(vec.begin(), vec.end() - 1,
+              std::ostream_iterator<int>(vts, ", "));
+    // then append the last element without a delimiter
+    vts << vec.back();
+  }
+  return vts.str();
+}
+
+inline const std::string vec2str(const std::vector<size_t>& vec) {
+  std::ostringstream vts;
+  if (!vec.empty()) {
+    // Convert all but the last element to avoid a trailing ","
+    std::copy(vec.begin(), vec.end() - 1,
+              std::ostream_iterator<size_t>(vts, ", "));
+    // then append the last element without a delimiter
+    vts << vec.back();
+  }
+  return vts.str();
+}
 
 // **************************************
-// Element-wise functions
-// **************************************
+// Element-wise functions
+// Cpp tensors support multi-dimensional broadcasting;
+// Cuda supports unidirectional broadcasting,
+// i.e., the lhs and the output have the same shape
+// **************************************
 
 /// out[i] = |in[i]|
 template <typename DType, typename Lang>
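
The comment above distinguishes multidirectional broadcasting (both inputs
may expand) from unidirectional broadcasting (the lhs already has the
output shape); the NumPy equivalents, shown only to illustrate the
contract:

    import numpy as np

    a, b = np.zeros((4, 1)), np.zeros((1, 3))
    print((a + b).shape)  # (4, 3): multidirectional, both inputs expand

    c, d = np.zeros((4, 3)), np.zeros((1, 3))
    print((c + d).shape)  # (4, 3): unidirectional, only the rhs expands
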
diff --git a/src/core/tensor/tensor_math_cpp.h b/src/core/tensor/tensor_math_cpp.h
index c1af523..fc487a3 100644
--- a/src/core/tensor/tensor_math_cpp.h
+++ b/src/core/tensor/tensor_math_cpp.h
@@ -19,10 +19,15 @@
 #define SINGA_CORE_TENSOR_TENSOR_MATH_CPP_H_
 
 #include "./tensor_math.h"
+//#include "./stacktrace.h"
 #include <cfloat>
 #include "singa/core/common.h"
 #include "singa/core/tensor.h"
 #include <math.h>
+#include <algorithm>
+#include <sstream>
+#include <iterator>
+#include <iostream>
 
 #ifdef USE_CBLAS
 #include <cblas.h>
@@ -103,13 +108,28 @@
   update_base_index(x, traversal_info);
   traversal_info[x.shape().size() + 1] = determine_order(shape_multipliers, counter);
   traversal_info[x.shape().size()] = traversal_info[traversal_info[x.shape().size() + 1]] +
-                                     x.strides()[x.strides().size() - traversal_info[x.shape().size() + 1] - 1];
+                                     x.stride()[x.stride().size() - traversal_info[x.shape().size() + 1] - 1];
 };
 
+inline int next_offset(int offset, const vector<size_t>& shape, const vector<int>& stride, vector<int> *index) {
+  for (int k = shape.size() - 1; k >= 0; k--) {
+    if (index->at(k) + 1 < int(shape.at(k))) {
+      offset += stride.at(k);
+      index->at(k) += 1;
+      break;
+    }
+    index->at(k) = 0;
+    offset -= stride.at(k) * (shape.at(k) - 1);
+  }
+  return offset;
+}
+
 template <typename DType>
-void traverse_unary(const Tensor &in, Tensor* out, std::function<DType(DType)> func) {
+void traverse_unary(const Tensor &in, Tensor *out, std::function<DType(DType)> func) {
   DType *outPtr = static_cast<DType *>(out->block()->mutable_data());
   const DType *inPtr = static_cast<const DType *>(in.block()->data());
+  /*
   vector<int> traversal_info = generate_traversal_info(in);
   vector<int> shape_multipliers = generate_shape_multipliers(in);
 
@@ -117,14 +137,31 @@
     outPtr[i] = func(inPtr[traversal_info[in.shape().size()]]);
     traverse_next(in, shape_multipliers, traversal_info, i + 1);
   }
+  */
+  CHECK(in.shape() == out->shape());
+  if (in.stride() == out->stride()) {
+    for (size_t i = 0; i < in.Size(); i++)
+      outPtr[i] = func(inPtr[i]);
+  } else {
+    LOG(INFO) << "not equal stride";
+    size_t in_offset = 0, out_offset = 0;
+    vector<int> in_idx(in.nDim(), 0), out_idx(out->nDim(), 0);
+    for (size_t i = 0; i < Product(in.shape()); i++) {
+      outPtr[out_offset] = func(inPtr[in_offset]);
+      out_offset = next_offset(out_offset, out->shape(), out->stride(), &out_idx);
+      in_offset = next_offset(in_offset, in.shape(), in.stride(), &in_idx);
+    }
+  }
 }
 
+
 template <typename DType>
 void traverse_binary(const Tensor &in1, const Tensor &in2, Tensor* out,
-                    std::function<DType(DType, DType)> func) {
+                     std::function<DType(DType, DType)> func) {
   DType *outPtr = static_cast<DType *>(out->block()->mutable_data());
   const DType *in1Ptr = static_cast<const DType *>(in1.block()->data());
   const DType *in2Ptr = static_cast<const DType *>(in2.block()->data());
+  /*
   vector<int> traversal_info_in1 = generate_traversal_info(in1);
   vector<int> traversal_info_in2 = generate_traversal_info(in2);
   vector<int> shape_multipliers_in1 = generate_shape_multipliers(in1);
@@ -136,6 +173,41 @@
     traverse_next(in1, shape_multipliers_in1, traversal_info_in1, i + 1);
     traverse_next(in2, shape_multipliers_in2, traversal_info_in2, i + 1);
   }
+  */
+  auto prod = Product(in1.shape());
+  CHECK(in1.shape() == out->shape());
+  CHECK(in2.shape() == out->shape());
+  if ((in1.stride() == out->stride()) && (in2.stride() == in1.stride())) {
+    for (size_t i = 0; i < prod; i++)
+      outPtr[i] = func(in1Ptr[i], in2Ptr[i]);
+  } else {
+    /*
+    LOG(INFO) << "not equal stride";
+    std::ostringstream s1, s2, s3, s4, s5, s6;
+    std::copy(in1.stride().begin(), in1.stride().end(), std::ostream_iterator<int>(s1, ", "));
+    std::copy(in2.stride().begin(), in2.stride().end(), std::ostream_iterator<int>(s2, ", "));
+    std::copy(out->stride().begin(), out->stride().end(), std::ostream_iterator<int>(s3, ", "));
+
+    std::copy(in1.shape().begin(), in1.shape().end(), std::ostream_iterator<int>(s4, ", "));
+    std::copy(in2.shape().begin(), in2.shape().end(), std::ostream_iterator<int>(s5, ", "));
+    std::copy(out->shape().begin(), out->shape().end(), std::ostream_iterator<int>(s6, ", "));
+
+    LOG(INFO) << s1.str() << ": " << s4.str();
+    LOG(INFO) << s2.str() << ": " << s5.str();
+    LOG(INFO) << s3.str() << ": " << s6.str();
+    LOG(INFO) << Backtrace();
+    */
+
+    size_t in1_offset = 0, in2_offset = 0, out_offset = 0;
+    vector<int> in1_idx(in1.nDim(), 0), in2_idx(in2.nDim(), 0), out_idx(out->nDim(), 0);
+    for (size_t i = 0; i < prod; i++) {
+      outPtr[out_offset] = func(in1Ptr[in1_offset], in2Ptr[in2_offset]);
+      out_offset = next_offset(out_offset, out->shape(), out->stride(), &out_idx);
+      in1_offset = next_offset(in1_offset, in1.shape(), in1.stride(), &in1_idx);
+      in2_offset = next_offset(in2_offset, in2.shape(), in2.stride(), &in2_idx);
+      // LOG(INFO) <<  in1_offset << ", " << in2_offset << ", " << out_offset;
+    }
+  }
 }
 
 // ******************************************************************************************
@@ -182,35 +254,15 @@
 template <>
 void Div<float, lang::Cpp>(const float x, const Tensor& in, Tensor* out,
                            Context *ctx) {
-  const float *inPtr = static_cast<const float *>(in.block()->data());
-  float *outPtr = static_cast<float *>(out->block()->mutable_data());
-  vector<int> traversal_info = generate_traversal_info(in);
-  vector<int> shape_multipliers = generate_shape_multipliers(in);
-
-  for (size_t i = 0; i < in.Size(); i++) {
-    CHECK_NE(inPtr[traversal_info[in.shape().size()]], 0.f);
-    outPtr[i] = x / inPtr[traversal_info[in.shape().size()]];
-    traverse_next(in, shape_multipliers, traversal_info, i + 1);
-  }
+  auto const_div = [&x](float a) {CHECK_NE(a, 0.f); return x / a;};
+  traverse_unary<float>(in, out, const_div);
 }
 
 template <>
 void Div<float, lang::Cpp>(const Tensor& in1, const Tensor& in2,
                            Tensor* out, Context *ctx) {
-  float *outPtr = static_cast<float *>(out->block()->mutable_data());
-  const float *in1Ptr = static_cast<const float *>(in1.block()->data());
-  const float *in2Ptr = static_cast<const float *>(in2.block()->data());
-  vector<int> traversal_info_in1 = generate_traversal_info(in1);
-  vector<int> traversal_info_in2 = generate_traversal_info(in2);
-  vector<int> shape_multipliers_in1 = generate_shape_multipliers(in1);
-  vector<int> shape_multipliers_in2 = generate_shape_multipliers(in2);
-
-  for (size_t i = 0; i < in1.Size(); i++) {
-    CHECK_NE(in2Ptr[traversal_info_in2[in2.shape().size()]], 0.f);
-    outPtr[i] = in1Ptr[traversal_info_in1[in1.shape().size()]] / in2Ptr[traversal_info_in2[in2.shape().size()]];
-    traverse_next(in1, shape_multipliers_in1, traversal_info_in1, i + 1);
-    traverse_next(in2, shape_multipliers_in2, traversal_info_in2, i + 1);
-  }
+  auto binary_div = [](float a, float b) {CHECK_NE(b, 0.f); return a / b;};
+  traverse_binary<float>(in1, in2, out, binary_div);
 }
 
 template <>
@@ -293,16 +345,8 @@
 template <>
 void Log<float, lang::Cpp>(const Tensor& in, Tensor* out,
                            Context *ctx) {
-  float *outPtr = static_cast<float *>(out->block()->mutable_data());
-  const float *inPtr = static_cast<const float *>(in.block()->data());
-  vector<int> traversal_info = generate_traversal_info(in);
-  vector<int> shape_multipliers = generate_shape_multipliers(in);
-
-  for (size_t i = 0; i < in.Size(); i++) {
-    CHECK_GT(inPtr[traversal_info[in.shape().size()]], 0.f);
-    outPtr[i] = log(inPtr[traversal_info[in.shape().size()]]);
-    traverse_next(in, shape_multipliers, traversal_info, i + 1);
-  }
+  auto ulog = [](float a) {CHECK_GT(a, 0.f); return log(a);};
+  traverse_unary<float>(in, out, ulog);
 }
 
 template <>
@@ -382,16 +426,8 @@
 template <>
 void Sqrt<float, lang::Cpp>(const Tensor& in, Tensor* out,
                             Context *ctx) {
-  float *outPtr = static_cast<float *>(out->block()->mutable_data());
-  const float *inPtr = static_cast<const float *>(in.block()->data());
-  vector<int> traversal_info = generate_traversal_info(in);
-  vector<int> shape_multipliers = generate_shape_multipliers(in);
-
-  for (size_t i = 0; i < in.Size(); i++) {
-    CHECK_GE(inPtr[traversal_info[in.shape().size()]], 0.f);
-    outPtr[i] = sqrt(inPtr[traversal_info[in.shape().size()]]);
-    traverse_next(in, shape_multipliers, traversal_info, i + 1);
-  }
+  auto usqrt = [](float a) {CHECK_GE(a, 0.f); return sqrt(a);};
+  traverse_unary<float>(in, out, usqrt);
 }
 
 template <>
@@ -428,16 +464,9 @@
 
 template <>
 void Transform<float, lang::Cpp>(const Tensor& in, Tensor* out,
-                            Context *ctx) {
-  float *outPtr = static_cast<float *>(out->block()->mutable_data());
-  const float *inPtr = static_cast<const float *>(in.block()->data());
-  vector<int> traversal_info = generate_traversal_info(in);
-  vector<int> shape_multipliers = generate_shape_multipliers(in);
-
-  for (size_t i = 0; i < in.Size(); i++) {
-    outPtr[i] = inPtr[traversal_info[in.shape().size()]];
-    traverse_next(in, shape_multipliers, traversal_info, i + 1);
-  }
+                                 Context *ctx) {
+  auto identity = [](float a) {return a;};
+  traverse_unary<float>(in, out, identity);
 }
 
 template <>
@@ -482,23 +511,23 @@
   float *outPtr = static_cast<float *>(out->block()->mutable_data());
   const size_t nrow = M.shape(0);
   const size_t ncol = M.shape(1);
-  vector<int> traversal_info = generate_traversal_info(M);
-  vector<int> shape_multipliers = generate_shape_multipliers(M);
 
   if (side_right) {
     for (size_t r = 0; r < nrow; r++) {
-      size_t offset = r * ncol;
+      size_t in_offset = M.stride()[0] * r, out_offset = out->stride()[0] * r;
       for (size_t c = 0; c < ncol; c++) {
-        outPtr[traversal_info[M.shape().size()]] = MPtr[traversal_info[M.shape().size()]] * vPtr[c];
-        traverse_next(M, shape_multipliers, traversal_info, offset + c + 1);
+        outPtr[out_offset] = MPtr[in_offset] * vPtr[c];
+        in_offset += M.stride()[1];
+        out_offset += out->stride()[1];
       }
     }
   } else {
     for (size_t r = 0; r < nrow; r++) {
-      size_t offset = r * ncol;
+      size_t in_offset = M.stride()[0] * r, out_offset = out->stride()[0] * r;
       for (size_t c = 0; c < ncol; c++) {
-        outPtr[traversal_info[M.shape().size()]] = MPtr[traversal_info[M.shape().size()]] * vPtr[r];
-        traverse_next(M, shape_multipliers, traversal_info, offset + c + 1);
+        outPtr[out_offset] = MPtr[in_offset] * vPtr[r];
+        in_offset += M.stride()[1];
+        out_offset += out->stride()[1];
       }
     }
   }
@@ -524,7 +553,7 @@
 // void Axpy<float, lang::Cpp>(const float alpha,
 //                             const Tensor& in, Tensor *out, Context *ctx) {
 //   //check input tensor for strides first
-//   if (in.strides() == out->strides()) {
+//   if (in.stride() == out->stride()) {
 //     const float *inPtr = static_cast<const float *>(in.block()->data());
 //     float *outPtr = static_cast<float *>(out->block()->mutable_data());
 //     cblas_saxpy(in.Size(), alpha, inPtr, 1, outPtr, 1);
@@ -541,7 +570,7 @@
   const float *inPtr = static_cast<const float *>(in.block()->data());
   float *outPtr = static_cast<float *>(out->block()->mutable_data());
 
-  if (in.strides() == out->strides()) {
+  if (in.stride() == out->stride()) {
     cblas_saxpy(in.Size(), alpha, inPtr, 1, outPtr, 1);
   } else {
     //LOG(FATAL) << "Axpy, input and output strides do not match." ;
@@ -556,7 +585,7 @@
 // void Axpy<float, lang::Cpp>(const float alpha,
 //                            const Tensor& in, Tensor *out, Context *ctx) {
 //  //check input tensor for strides first
-//  if (in.strides() == out->strides()) {
+//  if (in.stride() == out->stride()) {
 //    const float *inPtr = static_cast<const float *>(in.block()->data());
 //    float *outPtr = static_cast<float *>(out->block()->mutable_data());
 //    cblas_saxpy(in.Size(), alpha, inPtr, 1, outPtr, 1);
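
A Python transliteration of the next_offset traversal above, to make the
carry logic explicit: it walks a tensor in row-major logical order while
maintaining the flat offset incrementally, and a stride of 0 (a broadcast
axis) simply revisits the same data:

    def next_offset(offset, shape, stride, index):
        for k in range(len(shape) - 1, -1, -1):
            if index[k] + 1 < shape[k]:
                offset += stride[k]
                index[k] += 1
                break
            index[k] = 0
            offset -= stride[k] * (shape[k] - 1)
        return offset

    # a (2, 3) view broadcast along axis 0 (stride 0) visits offsets
    # 0 1 2 0 1 2
    shape, stride = (2, 3), (0, 1)
    off, idx = 0, [0, 0]
    for _ in range(6):
        print(off)
        off = next_offset(off, shape, stride, idx)
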
diff --git a/src/core/tensor/tensor_math_cuda.h b/src/core/tensor/tensor_math_cuda.h
index dfe5724..c92d80e 100644
--- a/src/core/tensor/tensor_math_cuda.h
+++ b/src/core/tensor/tensor_math_cuda.h
@@ -57,13 +57,14 @@
   Shape shape = x.shape();
   CHECK_LE(shape.size(), 5) << "Dimensions (shape) beyond 5 are currently not supported" ;
   vector<int> shape_arr;
-  if (shape.size() <= 4) {
-    for (int n = 0; n < 4 - shape.size(); ++n) {
+  if (shape.size() < 4) {
+    for (int n = 0; n < 4 - int(shape.size()); ++n) {
       shape_arr.push_back(1);
     }
   }
-  for(auto x: shape)
+  for(auto x: shape) {
     shape_arr.push_back(static_cast<int>(x));
+  }
   return shape_arr;
 }
 
@@ -84,11 +85,11 @@
   */
 vector<int> generate_strides_cuda(const Tensor& x) {
   Shape shape = x.shape();
-  auto& strides = x.strides();
+  auto& strides = x.stride();
   vector<int> strides_arr;
   int product = Product(shape);
-  if (shape.size() <= 4) {
-    for (int n = 0; n < 4 - shape.size(); ++n) {
+  if (shape.size() < 4) {
+    for (int n = 0; n < 4 - int(shape.size()); ++n) {
       strides_arr.push_back(product);
     }
   }
@@ -100,11 +101,30 @@
 cudnnTensorDescriptor_t generate_tensor_nd_desc(const Tensor& x) {
   cudnnTensorDescriptor_t x_desc;
   check_cudnn(cudnnCreateTensorDescriptor(&x_desc));
+  // LOG(INFO) << vec2str(x.shape());
+  // LOG(INFO) << vec2str(x.stride());
+  auto st = x.stride();
+  std::vector<size_t> sh;
+  bool reshape = false;
+  for(size_t i = 0; i < st.size(); i++) {
+    if (st[i] == 0) {
+      sh.push_back(1);
+      reshape = true;
+    } else {
+      sh.push_back(x.shape(i));
+    }
+  }
+  auto y = x;
+  if (reshape)
+    y = Reshape(x, sh);
+  auto shape = generate_shape_cuda(y);
+  auto stride = generate_strides_cuda(y);
+ 
+  // LOG(INFO) << vec2str(shape);
+  // LOG(INFO) << vec2str(stride);
+  // LOG(INFO) << "";
   check_cudnn(cudnnSetTensorNdDescriptor(x_desc, CUDNN_DATA_FLOAT,
-                             generate_dim_cuda(x),
-                             generate_shape_cuda(x).data(),
-                             generate_strides_cuda(x).data()
-                            ));
+    generate_dim_cuda(y), shape.data(), stride.data()));
 
   return x_desc;
 }
@@ -244,7 +264,7 @@
   float* outPtr = static_cast<float*>(out->block()->mutable_data());
   const size_t num = in.Size();
   //if both in and out strides are the same, we proceed to normal cuda::clamp
-  if (in.strides() == out->strides()) {
+  if (in.stride() == out->stride()) {
     cuda::clamp(num, low, high, inPtr, outPtr, ctx->stream);
   } else { //else we transform in to out to store first
     Transform<float, lang::Cuda>(in, out, ctx);
@@ -263,7 +283,7 @@
 
   //if both in1 and in2 are not transposed, and have the same strides,
   //we proceed to normal cuda::div
-  if (!in1.transpose() && !in2.transpose() && (in1.strides() == in2.strides())) {
+  if (!in1.transpose() && !in2.transpose() && (in1.stride() == in2.stride())) {
     cuda::div(num, inPtr1, inPtr2, outPtr, ctx->stream);
   } else { //else we check whether in1 or in2 or both are transposed
     if (in1.transpose() && in2.transpose()) {
@@ -290,7 +310,7 @@
   float* outPtr = static_cast<float*>(out->block()->mutable_data());
   const size_t num = in.Size();
 
-  if (in.strides() == out->strides()) {
+  if (in.stride() == out->stride()) {
     cuda::div(num, x, inPtr, outPtr, ctx->stream);
   } else { //else we transform in to out to store first
     Transform<float, lang::Cuda>(in, out, ctx);
@@ -324,7 +344,7 @@
 
   //if both in1 and in2 are not transposed, and have the same strides,
   //we proceed to normal cuda::mult
-  if (!in1.transpose() && !in2.transpose() && (in1.strides() == in2.strides())) {
+  if (!in1.transpose() && !in2.transpose() && (in1.stride() == in2.stride())) {
     cuda::mult(num, inPtr1, inPtr2, outPtr, ctx->stream);
   } else { //else we check whether in1 or in2 or both are transposed
     if (in1.transpose() && in2.transpose()) {
@@ -352,7 +372,7 @@
   float* outPtr = static_cast<float*>(out->block()->mutable_data());
   const size_t num = in.Size();
 
-  if (in.strides() == out->strides()) {
+  if (in.stride() == out->stride()) {
     cuda::exp(num, inPtr, outPtr, ctx->stream);
   } else { //else we transform in to out to store first
     Transform<float, lang::Cuda>(in, out, ctx);
@@ -367,7 +387,7 @@
   const float* inPtr = static_cast<const float*>(in.block()->data());
   const size_t num = in.Size();
 
-  if (in.strides() == out->strides()) {
+  if (in.stride() == out->stride()) {
     cuda::ge(num, inPtr, x, outPtr, ctx->stream);
   } else { //else we transform in to out to store first
     Transform<float, lang::Cuda>(in, out, ctx);
@@ -391,7 +411,7 @@
   const float* inPtr = static_cast<const float*>(in.block()->data());
   const size_t num = in.Size();
 
-  if (in.strides() == out->strides()) {
+  if (in.stride() == out->stride()) {
     cuda::gt(num, inPtr, x, outPtr, ctx->stream);
   } else { //else we transform in to out to store first
     Transform<float, lang::Cuda>(in, out, ctx);
@@ -414,7 +434,7 @@
   const float* inPtr = static_cast<const float*>(in.block()->data());
   const size_t num = in.Size();
 
-  if (in.strides() == out->strides()) {
+  if (in.stride() == out->stride()) {
     cuda::le(num, inPtr, x, outPtr, ctx->stream);
   } else { //else we transform in to out to store first
     Transform<float, lang::Cuda>(in, out, ctx);
@@ -438,7 +458,7 @@
   float* outPtr = static_cast<float*>(out->block()->mutable_data());
   const size_t num = in.Size();
 
-  if (in.strides() == out->strides()) {
+  if (in.stride() == out->stride()) {
     cuda::log(num, inPtr, outPtr, ctx->stream);
   } else { //else we transform in to out to store first
     Transform<float, lang::Cuda>(in, out, ctx);
@@ -453,7 +473,7 @@
   const float* inPtr = static_cast<const float*>(in.block()->data());
   const size_t num = in.Size();
 
-  if (in.strides() == out->strides()) {
+  if (in.stride() == out->stride()) {
     cuda::lt(num, inPtr, x, outPtr, ctx->stream);
   } else { //else we transform in to out to store first
     Transform<float, lang::Cuda>(in, out, ctx);
@@ -477,7 +497,7 @@
   float* outPtr = static_cast<float*>(out->block()->mutable_data());
   const size_t num = in.Size();
 
-  if (in.strides() == out->strides()) {
+  if (in.stride() == out->stride()) {
     cuda::pow(num, inPtr, x, outPtr, ctx->stream);
   } else { //else we transform in to out to store first
     Transform<float, lang::Cuda>(in, out, ctx);
@@ -495,7 +515,7 @@
 
   //if both in1 and in2 are not transposed, and have the same strides,
   //we proceed to normal cuda::pow
-  if (!in1.transpose() && !in2.transpose() && (in1.strides() == in2.strides())) {
+  if (!in1.transpose() && !in2.transpose() && (in1.stride() == in2.stride())) {
     cuda::pow(num, inPtr1, inPtr2, outPtr, ctx->stream);
   } else { //else we check whether in1 or in2 or both are transposed
     if (in1.transpose() && in2.transpose()) {
@@ -553,7 +573,7 @@
   float* outPtr = static_cast<float*>(out->block()->mutable_data());
   const size_t num = in.Size();
 
-  if (in.strides() == out->strides()) {
+  if (in.stride() == out->stride()) {
     cuda::relu(num, inPtr, outPtr, ctx->stream);
   } else { //else we transform in to out to store first
     Transform<float, lang::Cuda>(in, out, ctx);
@@ -601,7 +621,7 @@
   float* outPtr = static_cast<float*>(out->block()->mutable_data());
   const size_t num = in.Size();
 
-  if (in.strides() == out->strides()) {
+  if (in.stride() == out->stride()) {
     cuda::sigmoid(num, inPtr, outPtr, ctx->stream);
   } else { //else we transform in to out to store first
     Transform<float, lang::Cuda>(in, out, ctx);
@@ -617,7 +637,7 @@
   float* outPtr = static_cast<float*>(out->block()->mutable_data());
   const size_t num = in.Size();
 
-  if (in.strides() == out->strides()) {
+  if (in.stride() == out->stride()) {
     cuda::sign(num, inPtr, outPtr, ctx->stream);
   } else { //else we transform in to out to store first
     Transform<float, lang::Cuda>(in, out, ctx);
@@ -657,7 +677,7 @@
   float* outPtr = static_cast<float*>(out->block()->mutable_data());
   const size_t num = in.Size();
 
-  if (in.strides() == out->strides()) {
+  if (in.stride() == out->stride()) {
     cuda::square(num, inPtr, outPtr, ctx->stream);
   } else { //else we transform in to out to store first
     Transform<float, lang::Cuda>(in, out, ctx);
@@ -713,7 +733,7 @@
   float* outPtr = static_cast<float*>(out->block()->mutable_data());
   const size_t num = in.Size();
 
-  if (in.strides() == out->strides()) {
+  if (in.stride() == out->stride()) {
     cuda::tanh(num, inPtr, outPtr, ctx->stream);
   } else { //else we transform in to out to store first
     Transform<float, lang::Cuda>(in, out, ctx);
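
The descriptor helpers above left-pad shapes with 1s (and strides with the
total element count) because cuDNN Nd descriptors require at least four
dimensions; a small sketch of that padding rule:

    def pad_to_4d(shape, stride):
        prod = 1
        for s in shape:
            prod *= s
        missing = max(0, 4 - len(shape))
        return [1] * missing + list(shape), [prod] * missing + list(stride)

    print(pad_to_4d((3, 5), (5, 1)))  # ([1, 1, 3, 5], [15, 15, 5, 1])
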
diff --git a/src/io/image_transformer.cc b/src/io/image_transformer.cc
index 0f49321..d9a8be1 100644
--- a/src/io/image_transformer.cc
+++ b/src/io/image_transformer.cc
@@ -226,7 +226,7 @@
           }
         }
       }
-      output.SetShape(Shape{channel, crop_height, crop_width});
+      output.Resize(Shape{channel, crop_height, crop_width});
       output.CopyDataFromHostPtr<float>(out, crop_height * crop_width * channel);
       delete[] out;
     } else if (image_dim_order == "HWC") {
@@ -244,7 +244,7 @@
           }
         }
       }
-      output.SetShape(Shape{crop_height, crop_width, channel});
+      output.Resize(Shape{crop_height, crop_width, channel});
       output.CopyDataFromHostPtr<float>(out, crop_height * crop_width * channel);
       delete[] out;
     } else {
@@ -263,7 +263,7 @@
         out[out_idx] = in[in_idx];
       }
     }
-    output.SetShape(Shape{crop_height, crop_width});
+    output.Resize(Shape{crop_height, crop_width});
     output.CopyDataFromHostPtr<float>(out, crop_height * crop_width);
     delete[] out;
   }
@@ -301,7 +301,7 @@
           }
         }
       }
-      output.SetShape(Shape{channel, height, width});
+      output.Resize(Shape{channel, height, width});
       output.CopyDataFromHostPtr<float>(out, height * width * channel);
       delete[] out;
     } else if (image_dim_order == "HWC") {
@@ -322,7 +322,7 @@
           }
         }
       }
-      output.SetShape(Shape{height, width, channel});
+      output.Resize(Shape{height, width, channel});
       output.CopyDataFromHostPtr<float>(out, height * width * channel);
       delete[] out;
     } else {
@@ -344,7 +344,7 @@
         out[out_idx] = in[in_idx];
       }
     }
-    output.SetShape(Shape{height, width});
+    output.Resize(Shape{height, width});
     output.CopyDataFromHostPtr<float>(out, height * width);
     delete[] out;
   }
diff --git a/src/model/layer/batchnorm.cc b/src/model/layer/batchnorm.cc
index d2b0c3e..a4b9b24 100644
--- a/src/model/layer/batchnorm.cc
+++ b/src/model/layer/batchnorm.cc
@@ -44,7 +44,7 @@
   else
     is_2d_ = false;
 
-  bnScale_.SetShape(Shape{channels_});
+  bnScale_.Resize(Shape{channels_});
   bnBias_.ResetLike(bnScale_);
   runningMean_.ResetLike(bnScale_);
   runningVariance_.ResetLike(bnScale_);
diff --git a/src/model/layer/convolution.cc b/src/model/layer/convolution.cc
index 3718d8d..7913913 100755
--- a/src/model/layer/convolution.cc
+++ b/src/model/layer/convolution.cc
@@ -96,9 +96,9 @@
   col_width_ = conv_height_ * conv_width_;
 
   // Setup shape of weight_ and bias_
-  weight_.SetShape(Shape{num_filters_, col_height_});
+  weight_.Resize(Shape{num_filters_, col_height_});
   if (bias_term_)
-    bias_.SetShape(Shape{num_filters_});
+    bias_.Resize(Shape{num_filters_});
   // Assume the order of param is: weight, bias
   for (const auto &spec : conf.param()) param_specs_.push_back(spec);
 }
diff --git a/src/model/layer/cudnn_batchnorm.cc b/src/model/layer/cudnn_batchnorm.cc
index 4816817..c5baefb 100755
--- a/src/model/layer/cudnn_batchnorm.cc
+++ b/src/model/layer/cudnn_batchnorm.cc
@@ -39,8 +39,8 @@
 
 void CudnnBatchNorm::Setup(const Shape& in_sample, const LayerConf& conf) {
   BatchNorm::Setup(in_sample, conf);
-  resultSaveMean_.SetShape(Shape{channels_});
-  resultSaveVariance_.SetShape(Shape{channels_});
+  resultSaveMean_.Resize(Shape{channels_});
+  resultSaveVariance_.Resize(Shape{channels_});
 }
 
 void CudnnBatchNorm::InitCudnn(const Shape& shape, DataType dtype) {
diff --git a/src/model/layer/dense.cc b/src/model/layer/dense.cc
index 36a7a91..385d5cd 100644
--- a/src/model/layer/dense.cc
+++ b/src/model/layer/dense.cc
@@ -40,11 +40,11 @@
   transpose_ = dense_conf.transpose();
   bias_term_ = dense_conf.bias_term();
   if (transpose_)  // was {vdim_, hdim} by zhaojing?
-    weight_.SetShape(Shape{hdim_, vdim_});
+    weight_.Resize(Shape{hdim_, vdim_});
   else
-    weight_.SetShape(Shape{vdim_, hdim_});
+    weight_.Resize(Shape{vdim_, hdim_});
   if (bias_term_)
-    bias_.SetShape(Shape{hdim_});
+    bias_.Resize(Shape{hdim_});
   for (auto specs: conf.param())
     param_specs_.push_back(specs);
 }
diff --git a/src/model/layer/lrn.cc b/src/model/layer/lrn.cc
index a1776fa..18e5d06 100644
--- a/src/model/layer/lrn.cc
+++ b/src/model/layer/lrn.cc
@@ -60,6 +60,7 @@
       tmp = Pow(tmp, beta_);
 
       ch = CopyRows(image, c, c + 1);
+      ch.Reshape(tmp.shape());
       ch = ch / tmp;
       ch.Reshape(Shape{input.shape(2), input.shape(3)});
       channels.push_back(ch);
@@ -135,6 +136,7 @@
     }
     Tensor tmp2 = ConcatenateRows(images);
     tmp2 *= (-2.0f * beta_ * alpha_);
+    tmp2.Reshape(x.shape());
     tmp2 = tmp2 * x;
     dx = dx + tmp2;
     dx.Reshape(grad.shape());
diff --git a/src/model/layer/prelu.cc b/src/model/layer/prelu.cc
index e567172..fe6447f 100644
--- a/src/model/layer/prelu.cc
+++ b/src/model/layer/prelu.cc
@@ -64,6 +64,7 @@
     } else {
       LOG(FATAL) << "Incorrect input format for prelu layer.";
     }
+    temp.Reshape(input.shape());
     output = input * ((input > 0.f) + temp);
   } else {
     // share the first param of Tensor A along all channels
diff --git a/src/model/layer/rnn.cc b/src/model/layer/rnn.cc
index e565abc..0fb920c 100644
--- a/src/model/layer/rnn.cc
+++ b/src/model/layer/rnn.cc
@@ -79,7 +79,7 @@
       dim = hidden_size_ * (hidden_size_ +  hidden_size_ + 2);
     weight_size += mult * dim;
   }
-  weight_.SetShape(Shape{weight_size});
+  weight_.Resize(Shape{weight_size});
 }
 
 const vector<Tensor> RNN::Forward(int flag, const vector<Tensor>& inputs) {
diff --git a/src/model/operation/batchnorm.cc b/src/model/operation/batchnorm.cc
index 4673919..7a5a0f1 100755
--- a/src/model/operation/batchnorm.cc
+++ b/src/model/operation/batchnorm.cc
@@ -1,3 +1,23 @@
+/*********************************************************
+*
+* Licensed to the Apache Software Foundation (ASF) under one
+* or more contributor license agreements.  See the NOTICE file
+* distributed with this work for additional information
+* regarding copyright ownership.  The ASF licenses this file
+* to you under the Apache License, Version 2.0 (the
+* "License"); you may not use this file except in compliance
+* with the License.  You may obtain a copy of the License at
+*
+*   http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing,
+* software distributed under the License is distributed on an
+* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+* KIND, either express or implied.  See the License for the
+* specific language governing permissions and limitations
+* under the License.
+*
+************************************************************/
 #include "./batchnorm.h"
 
 namespace singa {
@@ -17,8 +37,201 @@
   } else {
     LOG(FATAL) << "The dimension of input should either be 4D or 2D.";
   }
+
+
+#ifdef USE_MKLDNN
+  if (input.device()->lang() == kCpp) {
+    dtype = GetMKLDNNDataType(input.data_type());
+    epsilon = 1e-5f;
+    data_memory_format = is_2d ? mkldnn::memory::format::nc : mkldnn::memory::format::nchw;
+    if (is_2d) {
+      x_dims = {(int)batchsize, (int)channels};
+      y_dims = {(int)batchsize, (int)channels};
+    } else {
+      x_dims = {(int)batchsize, (int)channels, (int)height, (int)width};
+      y_dims = {(int)batchsize, (int)channels, (int)height, (int)width};
+    }
+
+    auto eng = *input.device()->context(0)->engine;
+    x_md = new mkldnn::memory::desc(x_dims, dtype, data_memory_format);
+    dx_md = new mkldnn::memory::desc(x_dims, dtype, data_memory_format);
+    bn_fwd_d = new mkldnn::batch_normalization_forward::desc(mkldnn::forward_training, *x_md, epsilon,
+        mkldnn::use_scale_shift);
+    bn_fwd_pd = new mkldnn::batch_normalization_forward::primitive_desc(*bn_fwd_d, eng);
+  }
+#endif // USE_MKLDNN
+
 };
 
+
+BatchNormHandle::~BatchNormHandle() {
+#ifdef USE_MKLDNN
+  if (x_md != nullptr) {
+    delete (x_md);
+    delete (dx_md);
+    delete (bn_fwd_d);
+    delete (bn_fwd_pd);
+  }
+#endif // USE_MKLDNN
+}
+
+#ifdef USE_MKLDNN
+
+Tensor CpuBatchNormForwardInference(const BatchNormHandle &bnh, const Tensor& x, const Tensor& bnScale, const Tensor& bnBias,
+                                    Tensor& running_mean, Tensor& running_var) {
+
+  CHECK_EQ(x.device()->lang(), kCpp);
+  Tensor y;
+  y.ResetLike(x);
+
+
+  Tensor w = get_bn_weight_from(bnScale, bnBias);
+
+  y.device()->Exec([&y, &x, &running_mean, &running_var, &w, &bnh](Context * ctx) {
+    try {
+      auto eng = *ctx->engine;
+      using namespace mkldnn;
+      auto x_mem = memory({{{bnh.x_dims}, bnh.dtype, bnh.data_memory_format}, eng}, x.block()->mutable_data());
+      auto y_mem = memory({{{bnh.y_dims}, bnh.dtype, bnh.data_memory_format}, eng}, y.block()->mutable_data());
+
+      // indicates using scale&bias and running mean&var
+      auto flags = use_scale_shift | use_global_stats;
+      auto bn_fwd_d = batch_normalization_forward::desc(forward_inference, *bnh.x_md, bnh.epsilon, flags);
+      auto bn_fwd_pd = batch_normalization_forward::primitive_desc(bn_fwd_d, eng);
+
+      auto m_mem = memory(bn_fwd_pd.mean_primitive_desc(), running_mean.block()->mutable_data());
+      auto v_mem = memory(bn_fwd_pd.variance_primitive_desc(), running_var.block()->mutable_data());
+      auto w_mem = memory(bn_fwd_pd.weights_primitive_desc(), w.block()->mutable_data());
+
+      // inputs require explicitly be indicated by casting according to
+      // https://intel.github.io/mkl-dnn/structmkldnn_1_1batch__normalization__forward.html
+      auto bn = batch_normalization_forward(bn_fwd_pd, x_mem, (const primitive::at)m_mem, (const primitive::at)v_mem, w_mem, y_mem);
+
+      stream(stream::kind::eager).submit({bn}).wait();
+    } catch (mkldnn::error &e) {
+      InitLogging("");
+      LOG(FATAL) << "MKLDNN Batch Norm " << "Status: " << e.status << " Message: " << e.message;
+    }
+
+  }, {y.block(), x.block(), w.block()}, {y.block()});
+
+  return y;
+
+}
+
+const std::vector<Tensor>
+CpuBatchNormForwardTraining(const BatchNormHandle &bnh, const Tensor &x, const Tensor &bnScale, const Tensor &bnBias,
+                            Tensor &running_mean, Tensor &running_var) {
+
+  Tensor y;
+  y.ResetLike(x);
+
+  // mean and var for local batch
+  Tensor mean;
+  mean.ResetLike(running_mean);
+  Tensor var;
+  var.ResetLike(running_var);
+
+  // combine scale and bias to construct weight tensor in required format for backward
+  Tensor w = get_bn_weight_from(bnScale, bnBias);
+
+  y.device()->Exec([&x, &y, &mean, &var, &w, &bnh](Context * ctx) {
+    try {
+      auto eng = *ctx->engine;
+      using namespace mkldnn;
+
+      auto x_mem = memory({{{bnh.x_dims}, bnh.dtype, bnh.data_memory_format}, eng},
+      x.block()->mutable_data());
+      auto y_mem = memory({{{bnh.x_dims}, bnh.dtype, bnh.data_memory_format}, eng},
+      y.block()->mutable_data());
+      auto m_mem = memory(bnh.bn_fwd_pd->mean_primitive_desc(), mean.block()->mutable_data());
+
+      auto v_mem = memory(bnh.bn_fwd_pd->variance_primitive_desc(), var.block()->mutable_data());
+
+      auto w_mem = memory(bnh.bn_fwd_pd->weights_primitive_desc(), w.block()->mutable_data());
+
+      auto bn_fwd = batch_normalization_forward(*bnh.bn_fwd_pd, x_mem, w_mem, y_mem, m_mem, v_mem);
+
+      stream(stream::kind::eager).submit({bn_fwd}).wait();
+    } catch (mkldnn::error &e) {
+      singa::InitLogging("");
+      LOG(FATAL) << "MKLDNN Batch Norm Backward" << "Status: " << e.status << " Message: " << e.message;
+    }
+  }, {x.block(), w.block()}, {y.block(), mean.block(), var.block()});
+
+
+  // local implemented running mean as mkldnn does not support it yet:
+  // https://github.com/intel/mkl-dnn/issues/371
+  running_mean = running_mean * bnh.factor + mean * (1 - bnh.factor);
+  running_var = running_var * bnh.factor + var * (1 - bnh.factor);
+
+
+  return {y, running_mean, running_var};
+
+}
+
+const std::vector<Tensor> CpuBatchNormBackwardx(const BatchNormHandle &bnh,
+    const Tensor &y, const Tensor &dy,
+    const Tensor &x,
+    const Tensor &bnScale, const Tensor &bnBias,
+    const Tensor &mean, const Tensor &var) {
+  Tensor dx;
+  dx.ResetLike(dy);
+
+  // combine scale and bias to construct weight tensor in required format for backward
+  Tensor w = get_bn_weight_from(bnScale, bnBias);
+
+  Tensor dw(Shape{bnScale.Size(), 2});
+
+  dx.device()->Exec([&dw, &x, &dx, &y, &dy, &w, &mean, &var, &bnh](Context * ctx) {
+
+    try {
+      auto eng = *ctx->engine;
+      using namespace mkldnn;
+
+      auto  x_mem = memory({{{bnh.x_dims}, bnh.dtype, bnh.data_memory_format}, eng},  x.block()->mutable_data());
+      auto dx_mem = memory({{{bnh.x_dims}, bnh.dtype, bnh.data_memory_format}, eng}, dx.block()->mutable_data());
+      auto  y_mem = memory({{{bnh.x_dims}, bnh.dtype, bnh.data_memory_format}, eng},  y.block()->mutable_data());
+      auto dy_mem = memory({{{bnh.x_dims}, bnh.dtype, bnh.data_memory_format}, eng}, dy.block()->mutable_data());
+
+      auto m_mem = memory(bnh.bn_fwd_pd->mean_primitive_desc(), mean.block()->mutable_data());
+      auto v_mem = memory(bnh.bn_fwd_pd->variance_primitive_desc(), var.block()->mutable_data());
+      auto w_mem = memory(bnh.bn_fwd_pd->weights_primitive_desc(), w.block()->mutable_data());
+
+
+      auto bn_bwd_d = batch_normalization_backward::desc(backward, *bnh.dx_md, *bnh.x_md, bnh.epsilon, use_scale_shift);
+      auto bn_bwd_pd = batch_normalization_backward::primitive_desc(bn_bwd_d, eng, *bnh.bn_fwd_pd);
+
+
+      auto dw_mem = memory(bn_bwd_pd.diff_weights_primitive_desc(), dw.block()->mutable_data());
+
+      auto bn_bwd = batch_normalization_backward(bn_bwd_pd, x_mem, m_mem, v_mem, dy_mem, w_mem, dx_mem, dw_mem);
+
+      stream(stream::kind::eager).submit({bn_bwd}).wait();
+    } catch (mkldnn::error &e) {
+      singa::InitLogging("");
+      LOG(FATAL) << "MKLDNN Batch Norm Backward" << "Status: " << e.status << " Message: " << e.message;
+    }
+
+  }, {x.block(), dy.block(), mean.block(), var.block()},
+  {dx.block(), dw.block()});
+
+  singa::Tensor dbnScale(bnScale.shape());
+  CopyDataToFrom(&dbnScale, dw, bnScale.Size(), 0, 0);
+  singa::Tensor dbnBias(bnBias.shape());
+  CopyDataToFrom(&dbnBias, dw, bnBias.Size(), 0, bnScale.Size());
+
+  CHECK(dbnScale.nDim() == bnScale.nDim()) << "dbnScale ndim does not match bnScale";
+  CHECK(dbnBias.nDim() == bnBias.nDim()) << "dbnBias ndim does not match bnBias";
+  CHECK(dbnScale.shape()[0] == bnScale.shape()[0]) << "dbnScale shape does not match bnScale";
+  CHECK(dbnBias.shape()[0] == bnBias.shape()[0]) << "dbnBias shape does not match bnBias";
+
+  return {dx, dbnScale, dbnBias};
+}
+
+
+#endif  // USE_MKLDNN
+
 #ifdef USE_CUDNN
 CudnnBatchNormHandle::CudnnBatchNormHandle(const float momentum,
     const Tensor& input): BatchNormHandle(momentum, input) {
@@ -39,8 +252,8 @@
 };
 
 const std::vector<Tensor> GpuBatchNormForwardTraining(const CudnnBatchNormHandle &cbnh,
-                                   const Tensor& x, const Tensor& bnScale, const Tensor& bnBias,
-                                   Tensor& running_mean, Tensor& running_var) {
+    const Tensor& x, const Tensor& bnScale, const Tensor& bnBias,
+    Tensor& running_mean, Tensor& running_var) {
   CHECK_EQ(x.device()->lang(), kCuda);
   CHECK_EQ(bnScale.device()->lang(), kCuda);
   CHECK_EQ(bnBias.device()->lang(), kCuda);
diff --git a/src/model/operation/batchnorm.h b/src/model/operation/batchnorm.h
index f4372e3..648614a 100755
--- a/src/model/operation/batchnorm.h
+++ b/src/model/operation/batchnorm.h
@@ -1,3 +1,23 @@
+/*********************************************************
+*
+* Licensed to the Apache Software Foundation (ASF) under one
+* or more contributor license agreements.  See the NOTICE file
+* distributed with this work for additional information
+* regarding copyright ownership.  The ASF licenses this file
+* to you under the Apache License, Version 2.0 (the
+* "License"); you may not use this file except in compliance
+* with the License.  You may obtain a copy of the License at
+*
+*   http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing,
+* software distributed under the License is distributed on an
+* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+* KIND, either express or implied.  See the License for the
+* specific language governing permissions and limitations
+* under the License.
+*
+************************************************************/
 //#ifndef SINGA_MODEL_OPERATION_BATCHNORM_H_
 //#define SINGA_MODEL_OPERATION_BATCHNORM_H_
 
@@ -7,13 +27,28 @@
 #ifdef USE_CUDNN
 #include <cudnn.h>
 #include "../layer/cudnn_utils.h" // check_cudnn
-#endif // USE_CUDNN 
+#endif // USE_CUDNN
+
+#ifdef USE_MKLDNN
+#include <mkldnn.hpp>
+
+// combine scale and bias into the 2 x C weight format recognised by the mkldnn api
+static inline singa::Tensor get_bn_weight_from(const singa::Tensor &s, const singa::Tensor &b) {
+  singa::Tensor w(singa::Shape{2, s.Size()});
+  CopyDataToFrom(&w, s, s.Size(), 0, 0);
+  CopyDataToFrom(&w, b, b.Size(), s.Size(), 0);
+  return w;
+}
+
+
+#endif // USE_MKLDNN
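A usage sketch of the helper above (hypothetical values, assuming the singa headers
are in scope; CopyDataFromHostPtr is the same call the unit tests below use): for a
channel count C, the packed tensor holds the C scale values followed by the C shift
values, which is the 2 x C layout mkldnn expects for use_scale_shift.

    const float sv[] = {1.f, 2.f}, bv[] = {3.f, 4.f};
    singa::Tensor s(singa::Shape{2}), b(singa::Shape{2});
    s.CopyDataFromHostPtr(sv, 2);
    b.CopyDataFromHostPtr(bv, 2);
    singa::Tensor w = get_bn_weight_from(s, b);
    // w is 2 x 2 and holds {1, 2, 3, 4}: row 0 = scale, row 1 = shift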
 
 namespace singa {
 
 class BatchNormHandle {
  public:
   BatchNormHandle(const float momentum, const Tensor& input);
+  ~BatchNormHandle();
 
   float factor;
 
@@ -23,13 +58,37 @@
   size_t width;
   bool is_2d;
   //bool train = true;
+#ifdef USE_MKLDNN
+  mkldnn::memory::data_type dtype;
+  mkldnn::memory::dims x_dims;
+  mkldnn::memory::dims y_dims;
+  mkldnn::memory::desc *x_md = nullptr;
+  mkldnn::memory::desc *dx_md = nullptr;
+  mkldnn::batch_normalization_forward::desc *bn_fwd_d = nullptr;
+  mkldnn::batch_normalization_forward::primitive_desc *bn_fwd_pd = nullptr;
+  float epsilon;
+  mkldnn::memory::format data_memory_format;
+#endif //USE_MKLDNN
 };
 
-//Tensor CpuBatchNormForwardTraining();
 
-//Tensor CpuBatchNormForwardInference();
+#ifdef USE_MKLDNN
 
-//Tensor CpuBatchNormBackwardx();
+Tensor
+CpuBatchNormForwardInference(const BatchNormHandle &bnh, const Tensor &x, const Tensor &bnScale, const Tensor &bnBias,
+                             Tensor &running_mean, Tensor &running_var);
+
+const std::vector<Tensor>
+CpuBatchNormForwardTraining(const BatchNormHandle &bnh, const Tensor &x, const Tensor &bnScale, const Tensor &bnBias,
+                            Tensor &running_mean, Tensor &running_var);
+
+const std::vector<Tensor> CpuBatchNormBackwardx(const BatchNormHandle &bnh,
+    const Tensor &y, const Tensor &dy,
+    const Tensor &x,
+    const Tensor &bnScale, const Tensor &bnBias,
+    const Tensor &mean, const Tensor &var);
+
+#endif // USE_MKLDNN
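The intended calling order, as exercised by the unit tests added later in this patch:
forward training returns {y, batch mean, batch variance}, and those returned
statistics are exactly what the backward operation consumes. A condensed sketch:

    BatchNormHandle bnh(0.9f, x);  // momentum, input
    auto fwd = CpuBatchNormForwardTraining(bnh, x, scale, bias, running_mean, running_var);
    // fwd[0] = y, fwd[1] = batch mean, fwd[2] = batch variance
    auto bwd = CpuBatchNormBackwardx(bnh, fwd[0], dy, x, scale, bias, fwd[1], fwd[2]);
    // bwd[0] = dx, bwd[1] = dscale, bwd[2] = dbias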
 
 
 #ifdef USE_CUDNN
diff --git a/src/model/operation/convolution.cc b/src/model/operation/convolution.cc
index beb824d..f2dc9a3 100755
--- a/src/model/operation/convolution.cc
+++ b/src/model/operation/convolution.cc
@@ -1,14 +1,33 @@
+/*********************************************************
+*
+* Licensed to the Apache Software Foundation (ASF) under one
+* or more contributor license agreements.  See the NOTICE file
+* distributed with this work for additional information
+* regarding copyright ownership.  The ASF licenses this file
+* to you under the Apache License, Version 2.0 (the
+* "License"); you may not use this file except in compliance
+* with the License.  You may obtain a copy of the License at
+*
+*   http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing,
+* software distributed under the License is distributed on an
+* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+* KIND, either express or implied.  See the License for the
+* specific language governing permissions and limitations
+* under the License.
+*
+************************************************************/
 #include "./convolution.h"
 #include "../layer/convolution.h"
 
-
 namespace singa {
 
 ConvHandle::ConvHandle(const Tensor &input,
                        const std::vector<size_t>& kernel_size,
                        const std::vector<size_t>& stride, const std::vector<size_t>& padding,
                        const size_t in_channels, const size_t out_channels,
-                       const bool bias) {
+                       const bool bias, const size_t groups) {
   kernel_h = kernel_size[0];
   kernel_w = kernel_size[1];
 
@@ -18,8 +37,10 @@
   stride_h = stride[0];
   stride_w = stride[1];
 
+
   channels = in_channels;
   num_filters = out_channels;
+  group = groups;
 
   bias_term = bias;
 
@@ -37,9 +58,52 @@
   col_height = in_channels * kernel_w * kernel_h;
   col_width = conv_height * conv_width;
   imagesize = input.Size() / batchsize;
+
+#ifdef USE_MKLDNN
+  if (input.device()->lang() == kCpp) {
+    const int groups = 1; // the mkldnn path supports only groups == 1 for now; this shadows and ignores the ctor argument
+    dtype = GetMKLDNNDataType(input.data_type());
+
+    x_dims = {(int)input.shape(0), (int)in_channels, (int)input.shape(2), (int)input.shape(3)};
+    b_dims = {(int)out_channels};
+    s_dims = {(int)stride_h, (int)stride_w};
+    p_dims = {(int)pad_h, (int)pad_w};
+    o_dims = {(int)input.shape(0), (int)out_channels, (int)conv_height, (int)conv_width};
+    w_dims = {groups, (int)out_channels / groups, (int)in_channels / groups, (int)kernel_size[0], (int)kernel_size[1] };
+
+    x_md = new mkldnn::memory::desc( x_dims, dtype, mkldnn::memory::format::nchw);
+    w_md = new mkldnn::memory::desc( w_dims, dtype, mkldnn::memory::format::goihw);
+    b_md = new mkldnn::memory::desc( b_dims, dtype, mkldnn::memory::format::x);
+    y_md = new mkldnn::memory::desc( o_dims, dtype, mkldnn::memory::format::nchw);
+
+    // the convolution forward primitive descriptor is shared between the forward and backward passes
+    conv_d = new mkldnn::convolution_forward::desc(
+      mkldnn::prop_kind::forward_inference, mkldnn::convolution_direct, *x_md,
+      *w_md, *b_md, *y_md, s_dims,
+      p_dims, p_dims, mkldnn::padding_kind::zero);
+
+    auto eng = *input.device()->context(0)->engine;
+    conv_pd = new mkldnn::convolution_forward::primitive_desc(*conv_d, eng);
+
+    // mkldnn computes dw and db in one primitive; db is cached here so CpuConvBackwardb can stay compatible with the singa api
+    db = new Tensor(Shape{num_filters}, input.device(), input.data_type() );
+  }
+#endif // USE_MKLDNN
 }
 
-
+ConvHandle::~ConvHandle() {
+#ifdef USE_MKLDNN
+  if (x_md != nullptr) {
+    delete(x_md);
+    delete(w_md);
+    delete(b_md);
+    delete(y_md);
+    delete(conv_d);
+    delete(conv_pd);
+    delete(db);
+  }
+#endif // USE_MKLDNN
+}
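The explicit new/delete pairs above work, but the same ownership could be expressed
with smart pointers so the compiler generates the cleanup; a sketch of that
alternative (not what this patch does):

    // in the handle:  std::unique_ptr<mkldnn::memory::desc> x_md;
    // in the ctor:    x_md.reset(new mkldnn::memory::desc(x_dims, dtype,
    //                                mkldnn::memory::format::nchw));
    // the explicit ~ConvHandle() body then becomes unnecessary for these members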
 
 Tensor CpuConvForward(const Tensor &x, Tensor &W,  Tensor &b,
                       const ConvHandle &ch) {
@@ -52,6 +116,39 @@
         W.shape(2) == ch.kernel_h
         && W.shape(3) == ch.kernel_w) << "weights shape should not change";
 
+#ifdef USE_MKLDNN
+
+  DataType dtype = x.data_type();
+  auto dev = x.device();
+
+  Shape shape{ch.batchsize, ch.num_filters, ch.conv_height, ch.conv_width};
+  Tensor output(shape, dev, dtype);
+
+  output.device()->Exec([&output, &x, &W, &b, &ch](Context * ctx) {
+    Block *inblock = x.block(), *outblock = output.block(), *wblock = W.block(), *bblock = b.block();
+
+    try {
+      using namespace mkldnn;
+
+      auto eng = *ctx->engine;
+      auto x_mem = memory({{{ch.x_dims}, ch.dtype, memory::format::nchw}, eng}, inblock->mutable_data());
+      auto w_mem = memory({{{ch.w_dims}, ch.dtype, memory::format::goihw}, eng},  wblock->mutable_data());
+      auto b_mem = memory({{{ch.b_dims}, ch.dtype, memory::format::x},    eng},  bblock->mutable_data());
+      auto y_mem = memory(ch.conv_pd->dst_primitive_desc(), outblock->mutable_data());
+
+      auto conv_fwd = convolution_forward(*ch.conv_pd, x_mem, w_mem, b_mem, y_mem);
+
+      stream(stream::kind::eager).submit({conv_fwd}).wait();
+    } catch (mkldnn::error &e) {
+      singa::InitLogging("");
+      LOG(FATAL) << "MKLDNN conv fwd " << "Status: " << e.status << " Message: " << e.message;
+    }
+
+  }, {x.block(), W.block(), b.block()}, {output.block()});
+
+  return output;
+
+#else // ifndef USE_MKLDNN
   Shape w_shape = W.shape();
   Shape b_shape;
   if (ch.bias_term)
@@ -86,6 +183,7 @@
   if (ch.bias_term)
     b.Reshape(b_shape);
   return output;
+#endif  // USE_MKLDNN
 }
 
 Tensor CpuConvBackwardx(const Tensor &dy, Tensor &W, const Tensor &x,
@@ -99,6 +197,41 @@
         W.shape(2) == ch.kernel_h
         && W.shape(3) == ch.kernel_w) << "weights shape should not change";
 
+
+#ifdef USE_MKLDNN
+
+  Tensor dx;
+  dx.ResetLike(x);
+
+  dy.device()->Exec([&x, &dx, &dy, &W, &ch](Context * ctx) {
+    Block *wblock = W.block(), *dyblock = dy.block(), *dxblock = dx.block(), *inblock = x.block();
+
+    try {
+      auto eng = *ctx->engine;
+      using namespace mkldnn;
+      auto x_mem = memory({{{ch.x_dims}, ch.dtype, memory::format::nchw}, eng}, inblock->mutable_data());
+      auto w_mem = memory({{{ch.w_dims}, ch.dtype, memory::format::goihw}, eng}, wblock->mutable_data());
+      auto dx_mem = memory({{{ch.x_dims}, ch.dtype, memory::format::nchw}, eng}, dxblock->mutable_data());
+      auto dy_mem = memory({{{ch.o_dims}, ch.dtype, memory::format::nchw}, eng}, dyblock->mutable_data());
+
+
+      auto conv_bwd_data_d = convolution_backward_data::desc(convolution_direct, *ch.x_md, *ch.w_md, *ch.y_md, ch.s_dims,
+                             ch.p_dims, ch.p_dims, padding_kind::zero);
+      auto conv_bwd_data_pd = convolution_backward_data::primitive_desc(conv_bwd_data_d, eng, *ch.conv_pd);
+      auto conv_bwd_data = convolution_backward_data(conv_bwd_data_pd, dy_mem, w_mem, dx_mem);
+
+
+      stream(stream::kind::eager).submit({conv_bwd_data}).wait();
+    } catch (mkldnn::error &e) {
+      singa::InitLogging("");
+      LOG(FATAL) << "MKLDNN conv bwd data " << "Status: " << e.status << " Message: " << e.message;
+    }
+
+  }, {x.block(), dy.block(), W.block()}, {dx.block()});
+
+  return dx;
+
+#else // ifndef USE_MKLDNN
   Shape w_shape = W.shape();
   W.Reshape(Shape{ch.num_filters, ch.col_height});
 
@@ -119,6 +252,7 @@
   }
   W.Reshape(w_shape);
   return dx;
+#endif  // USE_MKLDNN
 }
 
 Tensor CpuConvBackwardW(const Tensor &dy, const Tensor &x, const Tensor &W,
@@ -131,6 +265,39 @@
   CHECK(x.shape(1) == ch.channels && x.shape(2) == ch.height &&
         x.shape(3) == ch.width) << "input sample shape should not change";
 
+#ifdef USE_MKLDNN
+  Tensor dW;
+  dW.ResetLike(W);
+
+  dy.device()->Exec([&x, &dy, &dW, &ch](Context * ctx) {
+    Block *dwblock = dW.block(), *dyblock = dy.block(), *inblock = x.block(), *dbblock = ch.db->block();
+
+    try {
+      auto eng = *ctx->engine;
+      using namespace mkldnn;
+
+      auto x_mem = memory({{{ch.x_dims}, ch.dtype, memory::format::nchw}, eng}, inblock->mutable_data());
+      auto dy_mem = memory({{{ch.o_dims}, ch.dtype, memory::format::nchw}, eng}, dyblock->mutable_data());
+      auto dw_mem = memory({{{ch.w_dims}, ch.dtype, memory::format::goihw}, eng}, dwblock->mutable_data());
+      auto db_mem = memory({{{ch.b_dims}, ch.dtype, memory::format::x}, eng}, dbblock->mutable_data());
+
+      auto conv_dw_d = convolution_backward_weights::desc(convolution_direct, *ch.x_md, *ch.w_md, *ch.b_md, *ch.y_md,
+                       ch.s_dims, ch.p_dims, ch.p_dims, padding_kind::zero);
+      auto conv_dw_pd = convolution_backward_weights::primitive_desc(conv_dw_d, eng, *ch.conv_pd);
+      auto conv_dw = convolution_backward_weights(conv_dw_pd, x_mem, dy_mem, dw_mem, db_mem);
+
+      mkldnn::stream(mkldnn::stream::kind::eager).submit({conv_dw}).wait();
+    } catch (mkldnn::error &e) {
+      singa::InitLogging("");
+      LOG(FATAL) << "MKLDNN Conv backward W " << "Status: " << e.status << " Message: " << e.message;
+    }
+
+  }, {x.block(), dy.block(), W.block()}, {dW.block(), ch.db->block()});
+
+  return dW;
+
+
+#else // USE_MKLDNN
   Tensor dW;
   dW.ResetLike(W);
   dW.SetValue(0.0f);
@@ -153,6 +320,7 @@
   }
   dW.Reshape(w_shape);
   return dW;
+#endif // USE_MKLDNN
 }
 
 Tensor CpuConvBackwardb(const Tensor &dy, const Tensor &b,
@@ -164,6 +332,10 @@
 
   CHECK(b.shape(0) == ch.num_filters) << "bias shape should not change";
 
+#ifdef USE_MKLDNN
+  Tensor db = ch.db->Clone();
+  return db;
+#else // USE_MKLDNN
   Tensor db;
   db.ResetLike(b);
 
@@ -177,6 +349,7 @@
   SumRows(tmp3, &db);
 
   return db;
+#endif // USE_MKLDNN
 };
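Because the mkldnn weights-backward primitive produces dW and db in one go, ch.db is
only valid after CpuConvBackwardW has executed; callers on the mkldnn path should
keep that order, as the python test later in this patch does:

    Tensor dW = CpuConvBackwardW(dy, x, W, ch);  // also fills ch.db as a side effect
    Tensor db = CpuConvBackwardb(dy, b, ch);     // returns a clone of the cached db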
 
 #ifdef USE_CUDNN
@@ -187,7 +360,7 @@
                                  const size_t groups,
                                  const size_t workspace_byte_limit, const std::string& prefer)
   : ConvHandle(input, kernel_size, stride, padding, in_channels, out_channels,
-               bias) {
+               bias, groups) {
 
   DataType dtype = input.data_type();
   auto dev = input.device();
@@ -219,8 +392,7 @@
                                              ));
   if (CUDNN_MAJOR >= 7 && groups > 1) {
     CUDNN_CHECK(cudnnSetConvolutionGroupCount(conv_desc, groups));
-  }
-  else if (groups > 1) {LOG(FATAL) << "The current version of cuDNN not support grouped convolution.";};
+  } else if (groups > 1) {LOG(FATAL) << "The current version of cuDNN does not support grouped convolution.";};
 
   CUDNN_CHECK(cudnnSetFilter4dDescriptor(filter_desc, GetCudnnDataType(dtype),
                                          CUDNN_TENSOR_NCHW, num_filters,
@@ -311,6 +483,12 @@
 Tensor GpuConvForward(const Tensor &x, const Tensor &W, const Tensor &b,
                       const CudnnConvHandle &cch) {
   CHECK_EQ(x.device()->lang(), kCuda);
+  CHECK(x.shape(1) == cch.channels && x.shape(2) == cch.height &&
+        x.shape(3) == cch.width) << "input sample shape should not change";
+
+  CHECK(W.shape(0) == cch.num_filters && W.shape(1) == cch.channels &&
+        W.shape(2) == cch.kernel_h
+        && W.shape(3) == cch.kernel_w) << "weights shape should not change";
 
   DataType dtype = x.data_type();
   auto dev = x.device();
diff --git a/src/model/operation/convolution.h b/src/model/operation/convolution.h
index 7fd1ce7..3148633 100755
--- a/src/model/operation/convolution.h
+++ b/src/model/operation/convolution.h
@@ -1,3 +1,23 @@
+/*********************************************************
+*
+* Licensed to the Apache Software Foundation (ASF) under one
+* or more contributor license agreements.  See the NOTICE file
+* distributed with this work for additional information
+* regarding copyright ownership.  The ASF licenses this file
+* to you under the Apache License, Version 2.0 (the
+* "License"); you may not use this file except in compliance
+* with the License.  You may obtain a copy of the License at
+*
+*   http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing,
+* software distributed under the License is distributed on an
+* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+* KIND, either express or implied.  See the License for the
+* specific language governing permissions and limitations
+* under the License.
+*
+************************************************************/
 #ifndef SINGA_MODEL_OPERATION_CONVOLUTION_H_
 #define SINGA_MODEL_OPERATION_CONVOLUTION_H_
 
@@ -12,16 +32,21 @@
 #include "../layer/cudnn_utils.h"
 #endif // USE_CUDNN
 
+#ifdef USE_MKLDNN
+#include <mkldnn.hpp>
+#endif // USE_MKLDNN
 
 namespace singa {
 
 class ConvHandle {
 
-public:
+ public:
   ConvHandle(const Tensor &input, const std::vector<size_t>& kernel_size,
              const std::vector<size_t>& stride, const std::vector<size_t>& padding,
              const size_t in_channels, const size_t out_channels,
-             const bool bias);
+             const bool bias, const size_t groups = 1);
+
+  ~ConvHandle();
 
   size_t kernel_w;
   size_t pad_w;
@@ -32,6 +57,7 @@
 
   size_t channels;
   size_t num_filters;
+  size_t group;
 
   bool bias_term;
 
@@ -44,6 +70,25 @@
   size_t col_height;
   size_t col_width;
   size_t imagesize;
+
+#ifdef USE_MKLDNN
+  mkldnn::memory::data_type dtype;
+  mkldnn::memory::dims b_dims;
+  mkldnn::memory::dims s_dims;
+  mkldnn::memory::dims p_dims;
+  mkldnn::memory::dims x_dims;
+  mkldnn::memory::dims o_dims;
+  mkldnn::memory::dims w_dims;
+
+  const mkldnn::memory::desc *x_md = nullptr;
+  const mkldnn::memory::desc *w_md = nullptr;
+  const mkldnn::memory::desc *b_md = nullptr;
+  const mkldnn::memory::desc *y_md = nullptr;
+  const mkldnn::convolution_forward::desc *conv_d = nullptr;
+  const mkldnn::convolution_forward::primitive_desc *conv_pd = nullptr;
+
+  const Tensor *db = nullptr;
+#endif // USE_MKLDNN
 };
 
 
@@ -59,7 +104,7 @@
 
 #ifdef USE_CUDNN
 class CudnnConvHandle: public ConvHandle {
-public:
+ public:
   CudnnConvHandle(const Tensor &input, const std::vector<size_t>& kernel_size,
                   const std::vector<size_t>& stride, const std::vector<size_t>& padding,
                   const size_t in_channels, const size_t out_channels,
diff --git a/src/model/operation/pooling.cc b/src/model/operation/pooling.cc
index efc03ff..4878e49 100755
--- a/src/model/operation/pooling.cc
+++ b/src/model/operation/pooling.cc
@@ -1,3 +1,23 @@
+/*********************************************************
+*
+* Licensed to the Apache Software Foundation (ASF) under one
+* or more contributor license agreements.  See the NOTICE file
+* distributed with this work for additional information
+* regarding copyright ownership.  The ASF licenses this file
+* to you under the Apache License, Version 2.0 (the
+* "License"); you may not use this file except in compliance
+* with the License.  You may obtain a copy of the License at
+*
+*   http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing,
+* software distributed under the License is distributed on an
+* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+* KIND, either express or implied.  See the License for the
+* specific language governing permissions and limitations
+* under the License.
+*
+************************************************************/
 #include "./pooling.h"
 #include <cmath>
 
@@ -25,12 +45,131 @@
 
   if (stride_h > 0)
     pooled_height = std::floor(
-      ((height + 2 * pad_h - kernel_h) / stride_h)) + 1;
+                      ((height + 2 * pad_h - kernel_h) / stride_h)) + 1;
   pooled_width = std::floor(
-    ((width + 2 * pad_w - kernel_w) / stride_w)) + 1;
+                   ((width + 2 * pad_w - kernel_w) / stride_w)) + 1;
   is_max_pooling = is_max;
+
+
+#ifdef USE_MKLDNN
+  if (input.device()->lang() == kCpp) {
+    dtype = GetMKLDNNDataType(input.data_type());
+    x_dims = {batchsize, channels, height, width};
+    y_dims = {batchsize, channels, pooled_height, pooled_width};
+    s_dims = {stride};
+    k_dims = {kernel_size};
+    p_dims = {padding};
+
+    auto eng = *input.device()->context(0)->engine;
+    x_md = new mkldnn::memory::desc({x_dims}, dtype, mkldnn::memory::format::nchw);
+    y_md = new mkldnn::memory::desc({y_dims}, dtype, mkldnn::memory::format::nchw);
+
+    // max or average pooling, following the cudnn implementation convention
+    pooling_algo = mkldnn::pooling_avg_exclude_padding;
+    if (is_max_pooling)
+      pooling_algo = mkldnn::pooling_max;
+
+    pool_fwd_d = new mkldnn::pooling_forward::desc(mkldnn::forward_training, pooling_algo, *x_md, *y_md, s_dims,
+        k_dims, p_dims, p_dims, mkldnn::padding_kind::zero);
+    pool_fwd_pd = new mkldnn::pooling_forward::primitive_desc(*pool_fwd_d, eng);
+
+    if (is_max_pooling) {
+      // During training, max pooling requires a workspace on the forward
+      // (mkldnn_forward_training) and backward (mkldnn_backward) passes to
+      // save the indices where the maximum was found. The workspace layout is
+      // opaque and the indices cannot be restored from it; however, backward
+      // pooling can be used to perform up-sampling (used in some detection
+      // topologies).
+      // Copy the workspace primitive descriptor to the heap so the pointer
+      // stored in the handle does not dangle when this scope exits.
+      pool_ws_d = new mkldnn::memory::primitive_desc(
+        pool_fwd_pd->workspace_primitive_desc());
+      ws_mem = new mkldnn::memory(*pool_ws_d);
+    }
+  }
+#endif // USE_MKLDNN
 }
 
+PoolingHandle::~PoolingHandle() {
+#ifdef USE_MKLDNN
+  if (x_md != nullptr) {
+    delete(x_md);
+    delete(y_md);
+    delete(pool_fwd_d);
+    delete(pool_fwd_pd);
+    if (is_max_pooling) {
+      delete(pool_ws_d);
+      delete(ws_mem);
+    }
+  }
+#endif // USE_MKLDNN
+}
+
+#ifdef USE_MKLDNN
+
+Tensor CpuPoolingForward(const PoolingHandle &ph, const Tensor &x) {
+  Tensor y({(size_t) ph.batchsize, (size_t) ph.channels,
+            (size_t) ph.pooled_height, (size_t) ph.pooled_width},
+           x.device(), x.data_type());
+
+  y.device()->Exec([&y, &x, &ph](Context * ctx) {
+    try {
+      auto eng = *ctx->engine;
+      using namespace mkldnn;
+
+      auto y_mem = memory(ph.pool_fwd_pd->dst_primitive_desc(), y.block()->mutable_data());
+      auto x_mem = memory({{{ph.x_dims}, ph.dtype, memory::format::nchw}, eng},
+                          x.block()->mutable_data());
+
+      auto p_fwd = ph.is_max_pooling
+                   ? pooling_forward(*ph.pool_fwd_pd, x_mem, y_mem, *ph.ws_mem)
+                   : pooling_forward(*ph.pool_fwd_pd, x_mem, y_mem);
+
+      stream(stream::kind::eager).submit({p_fwd}).wait();
+    } catch (mkldnn::error &e) {
+      LOG(FATAL) << "MKLDNN pooling fwd" << "Status: " << e.status << " Message: " << e.message;
+    }
+
+  }, {x.block()}, {y.block()});
+
+  return y;
+
+}
+
+Tensor CpuPoolingBackward(const PoolingHandle &ph, const Tensor &grad,
+                          const Tensor &x, const Tensor &y) {
+  Tensor in_grad;
+  in_grad.ResetLike(x);
+
+  in_grad.device()->Exec([&in_grad, &grad, &ph](Context * ctx) {
+    try {
+      auto eng = *ctx->engine;
+      using namespace mkldnn;
+      auto pool_bwd_d = pooling_backward::desc(ph.pooling_algo, *ph.x_md, *ph.y_md,
+                                               ph.s_dims, ph.k_dims, ph.p_dims,
+                                               ph.p_dims, padding_kind::zero);
+      auto pool_bwd_pd = pooling_backward::primitive_desc(pool_bwd_d, eng, *ph.pool_fwd_pd);
+
+      auto dx_mem = memory({{{ph.x_dims}, ph.dtype, memory::format::nchw}, eng},
+                           in_grad.block()->mutable_data());
+      auto dy_mem = memory({{{ph.y_dims}, ph.dtype, memory::format::nchw}, eng},
+                           grad.block()->mutable_data());
+
+      auto p_bwd = ph.is_max_pooling
+                   ? pooling_backward(pool_bwd_pd, dy_mem, *ph.ws_mem, dx_mem)
+                   : pooling_backward(pool_bwd_pd, dy_mem, dx_mem);
+
+      stream(stream::kind::eager).submit({p_bwd}).wait();
+    } catch (mkldnn::error &e) {
+      LOG(FATAL) << "MKLDNN pooling bwd" << "Status: " << e.status << " Message: " << e.message;
+    }
+
+  }, {x.block(), y.block(), grad.block()}, {in_grad.block()});
+
+  return in_grad;
+
+}
+
+#endif // USE_MKLDNN
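For max pooling, the backward primitive reads the workspace (ph.ws_mem) that the
forward primitive wrote, so the forward operation must run with the same handle
before the backward one. A condensed sketch of the required order:

    PoolingHandle ph(x, {2, 2}, {1, 1}, {0, 0}, true /*max*/);
    Tensor y  = CpuPoolingForward(ph, x);          // fills ph.ws_mem with the max indices
    Tensor dx = CpuPoolingBackward(ph, dy, x, y);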
+
 #ifdef USE_CUDNN
 
 CudnnPoolingHandle::CudnnPoolingHandle(const Tensor &input,
diff --git a/src/model/operation/pooling.h b/src/model/operation/pooling.h
index b6a4d21..9154a5b 100644
--- a/src/model/operation/pooling.h
+++ b/src/model/operation/pooling.h
@@ -1,9 +1,33 @@
+/*********************************************************
+*
+* Licensed to the Apache Software Foundation (ASF) under one
+* or more contributor license agreements.  See the NOTICE file
+* distributed with this work for additional information
+* regarding copyright ownership.  The ASF licenses this file
+* to you under the Apache License, Version 2.0 (the
+* "License"); you may not use this file except in compliance
+* with the License.  You may obtain a copy of the License at
+*
+*   http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing,
+* software distributed under the License is distributed on an
+* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+* KIND, either express or implied.  See the License for the
+* specific language governing permissions and limitations
+* under the License.
+*
+************************************************************/
 #ifndef SINGA_MODEL_OPERATION_POOLING_H_
 #define SINGA_MODEL_OPERATION_POOLING_H_
 
 #include <string>
 #include "singa/core/tensor.h"
 
+#ifdef USE_MKLDNN
+#include <mkldnn.hpp>
+#endif // USE_MKLDNN
+
 #ifdef USE_CUDNN
 #include <cudnn.h>
 #include "../layer/cudnn_utils.h"
@@ -16,6 +40,7 @@
   PoolingHandle(const Tensor &input, const std::vector<int>& kernel_size,
                 const std::vector<int>& stride, const std::vector<int>& padding,
                 const bool is_max = true);
+  ~PoolingHandle();
 
   int kernel_w;
   int pad_w;
@@ -33,8 +58,32 @@
   int pooled_width;
 
   bool is_max_pooling;
+
+#ifdef USE_MKLDNN
+  mkldnn::memory::data_type dtype;
+  mkldnn::memory::dims x_dims;
+  mkldnn::memory::dims y_dims;
+  mkldnn::memory::dims s_dims;
+  mkldnn::memory::dims k_dims;
+  mkldnn::memory::dims p_dims;
+  mkldnn::algorithm pooling_algo;
+  const mkldnn::memory::desc *x_md = nullptr;
+  const mkldnn::memory::desc *y_md = nullptr;
+  const mkldnn::pooling_forward::desc *pool_fwd_d = nullptr;
+  const mkldnn::pooling_forward::primitive_desc *pool_fwd_pd = nullptr;
+  const mkldnn::memory::primitive_desc *pool_ws_d = nullptr;
+  const mkldnn::memory *ws_mem = nullptr;
+#endif // USE_MKLDNN
 };
 
+#ifdef USE_MKLDNN
+
+Tensor CpuPoolingForward(const PoolingHandle &ph, const Tensor &x);
+Tensor CpuPoolingBackward(const PoolingHandle &ph, const Tensor &dy,
+                          const Tensor& x, const Tensor& y);
+
+#endif // USE_MKLDNN
+
 #ifdef USE_CUDNN
 class CudnnPoolingHandle : public PoolingHandle {
  public:
diff --git a/src/model/updater/local_updater.cc b/src/model/updater/local_updater.cc
index 04593f4..f672134 100644
--- a/src/model/updater/local_updater.cc
+++ b/src/model/updater/local_updater.cc
@@ -43,7 +43,7 @@
   int nth = dev_index_[name]++;
   auto key = std::make_pair(nth, name);
   if (grad_buffer_[key].Size() != grad.Size()) {
-    grad_buffer_[key].SetShape(grad.shape());
+    grad_buffer_[key].Resize(grad.shape());
     grad_buffer_[key].AsType(grad.data_type());
   }
   grad_buffer_[key].CopyData(grad);
@@ -56,7 +56,7 @@
     }
   } else {
     if (param_buffer_[name].Size() != value.Size()) {
-      param_buffer_[name].SetShape(value.shape());
+      param_buffer_[name].Resize(value.shape());
       param_buffer_[name].AsType(value.data_type());
       param_buffer_[name].CopyData(value);
       sum_[name].ResetLike(param_buffer_[name]);
diff --git a/src/proto/core.proto b/src/proto/core.proto
index fd25607..5c4d997 100644
--- a/src/proto/core.proto
+++ b/src/proto/core.proto
@@ -71,7 +71,7 @@
   repeated uint32 shape = 1;
   optional DataType data_type = 2;
   //optional bool transpose = 3;
-  repeated int32 strides = 3;
+  repeated int32 stride = 3;
   repeated float float_data = 4 [packed = true];
   repeated double double_data = 5 [packed = true];
   repeated int32 int_data = 6 [packed = true];
diff --git a/test/python/test_mkldnn.py b/test/python/test_mkldnn.py
new file mode 100755
index 0000000..3c2b115
--- /dev/null
+++ b/test/python/test_mkldnn.py
@@ -0,0 +1,189 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# =============================================================================
+
+import unittest
+from singa import singa_wrap
+
+
+class TestPythonOperation(unittest.TestCase):
+
+    def test_conv2d(self):
+        print("TEST CONV2D FORWARD")
+        x_shape = [2, 1, 3, 3]
+        x = singa_wrap.Tensor(x_shape)
+        x.CopyFloatDataFromHostPtr(
+            [1, 2, 3, 4, 5, 6, 7, 8, 9, 1, 2, 3, 4, 5, 6, 7, 8, 9])
+
+        W_shape = [1, 1, 3, 3]
+        W = singa_wrap.Tensor(W_shape)
+        W.CopyFloatDataFromHostPtr([1, 1, 0, 0, 0, -1, 0, 1, 0])
+
+        b_shape = [1]
+        b = singa_wrap.Tensor(b_shape)
+        b.CopyFloatDataFromHostPtr([1])
+
+        dy_shape = [2, 1, 2, 2]
+        dy = singa_wrap.Tensor(dy_shape)
+        dy.CopyFloatDataFromHostPtr([0.1, 0.2, 0.3, 0.4, 0.1, 0.2, 0.3, 0.4])
+
+        handle = singa_wrap.ConvHandle(x, (3, 3), (2, 2), (1, 1), 1, 1, True)
+        y = singa_wrap.CpuConvForward(x, W, b, handle)
+
+        self.assertListEqual([2, 1, 2, 2], list(y.shape()))
+
+        _y = y.GetFloatValue(int(y.Size()))
+        self.assertAlmostEqual(3.0, _y[0])
+        self.assertAlmostEqual(7.0, _y[1])
+        self.assertAlmostEqual(-3.0, _y[2])
+        self.assertAlmostEqual(12.0, _y[3])
+        self.assertAlmostEqual(3.0, _y[4])
+        self.assertAlmostEqual(7.0, _y[5])
+        self.assertAlmostEqual(-3.0, _y[6])
+        self.assertAlmostEqual(12.0, _y[7])
+
+        print("TEST CONV2D DATA BACKWARD")
+
+        dx = singa_wrap.CpuConvBackwardx(dy, W, x, handle)
+        self.assertListEqual([2, 1, 3, 3], list(dx.shape()))
+
+        _dx = dx.GetFloatValue(int(dx.Size()))
+        self.assertAlmostEqual(0.0, _dx[0])
+        self.assertAlmostEqual(-0.1, _dx[1])
+        self.assertAlmostEqual(0.0, _dx[2])
+        self.assertAlmostEqual(0.4, _dx[3])
+        self.assertAlmostEqual(0.4, _dx[4])
+        self.assertAlmostEqual(0.6, _dx[5])
+        self.assertAlmostEqual(0.0, _dx[6])
+        self.assertAlmostEqual(-0.3, _dx[7])
+
+        print("TEST CONV2D WEIGHT BACKWARD")
+        dW = singa_wrap.CpuConvBackwardW(dy, x, W, handle)
+        self.assertListEqual([1, 1, 3, 3], list(dW.shape()))
+
+        _dW = dW.GetFloatValue(int(dW.Size()))
+        self.assertAlmostEqual(4.0, _dW[0], places=5)
+        self.assertAlmostEqual(7.2, _dW[1], places=5)
+        self.assertAlmostEqual(3.0, _dW[2], places=5)
+        self.assertAlmostEqual(7.2, _dW[3], places=5)
+        self.assertAlmostEqual(12.8, _dW[4], places=5)
+        self.assertAlmostEqual(5.2, _dW[5], places=5)
+        self.assertAlmostEqual(2.0, _dW[6], places=5)
+        self.assertAlmostEqual(3.2, _dW[7], places=5)
+        self.assertAlmostEqual(1.0, _dW[8], places=5)
+
+        print("TEST CONV2D DATA BACKWARD")
+        db = singa_wrap.CpuConvBackwardb(dy, b, handle)
+        self.assertEqual(1, dW.shape()[0])
+
+        _db = db.GetFloatValue(int(db.Size()))
+        print(_db)
+        self.assertAlmostEqual(2.0, _db[0], places=5)
+
+    def test_pooling(self):
+        x_shape = [2, 1, 3, 3]
+        x = singa_wrap.Tensor(x_shape)
+        x.CopyFloatDataFromHostPtr(
+            [1, 2, 3, 4, 5, 6, 7, 8, 9, 1, 2, 3, 4, 5, 6, 7, 8, 9])
+
+        y_shape = [2, 1, 2, 2]
+        dy = singa_wrap.Tensor(y_shape)
+        dy.CopyFloatDataFromHostPtr([0.1, 0.2, 0.3, 0.4, 0.1, 0.2, 0.3, 0.4])
+
+        k_dim = [2, 2]
+        s_dim = [1, 1]
+        p_dim = [0, 0]
+
+        # max pooling
+        handle = singa_wrap.PoolingHandle(x, k_dim, s_dim, p_dim, True)
+        y = singa_wrap.CpuPoolingForward(handle, x)
+        self.assertListEqual([2, 1, 2, 2], list(y.shape()))
+        dx = singa_wrap.CpuPoolingBackward(handle, dy, x, y)
+        self.assertListEqual([2, 1, 3, 3], list(dx.shape()))
+
+        # avg pooling
+        handle = singa_wrap.PoolingHandle(x, k_dim, s_dim, p_dim, False)
+        y = singa_wrap.CpuPoolingForward(handle, x)
+        self.assertListEqual([2, 1, 2, 2], list(y.shape()))
+        dx = singa_wrap.CpuPoolingBackward(handle, dy, x, y)
+        self.assertListEqual([2, 1, 3, 3], list(dx.shape()))
+
+    def test_batch_norm(self):
+        x_shape = [2, 2]
+        x = singa_wrap.Tensor(x_shape)
+        x.CopyFloatDataFromHostPtr([1, 2, 3, 4])
+
+        dy_shape = [2, 2]
+        dy = singa_wrap.Tensor(dy_shape)
+        dy.CopyFloatDataFromHostPtr([4, 3, 2, 1])
+
+        scale_shape = [2]
+        scale = singa_wrap.Tensor(scale_shape)
+        scale.CopyFloatDataFromHostPtr([1, 1])
+
+        bias_shape = [2]
+        bias = singa_wrap.Tensor(bias_shape)
+        bias.CopyFloatDataFromHostPtr([0, 0])
+
+        mean_shape = [2]
+        mean = singa_wrap.Tensor(mean_shape)
+        mean.CopyFloatDataFromHostPtr([1, 2])
+        var = singa_wrap.Tensor(mean_shape)
+        var.CopyFloatDataFromHostPtr([1, 2])
+
+        handle = singa_wrap.BatchNormHandle(0.9, x)
+
+        # 2D Forward Inference
+        y = singa_wrap.CpuBatchNormForwardInference(handle, x, scale, bias,
+                                                    mean, var)
+        self.assertListEqual([2, 2], list(y.shape()))
+
+        # 2D Forward Training
+        (y, mean_updated, var_updated) = singa_wrap.CpuBatchNormForwardTraining(
+            handle, x, scale, bias, mean, var)
+        self.assertListEqual([2, 2], list(y.shape()))
+        self.assertListEqual([2], list(mean_updated.shape()))
+        self.assertListEqual([2], list(var_updated.shape()))
+
+        # 2D Backward dx
+        (dx, dscale, dbias) = singa_wrap.CpuBatchNormBackwardx(handle, y, dy, x,
+                                                               scale, bias,
+                                                               mean_updated,
+                                                               var_updated)
+        self.assertListEqual([2, 2], list(dx.shape()))
+        self.assertListEqual([2], list(dscale.shape()))
+        self.assertListEqual([2], list(dbias.shape()))
+
+        # 4D Forward Inference
+
+        x2_shape = [1, 2, 4, 4]
+        x2 = singa_wrap.Tensor(x2_shape)
+        x2.CopyFloatDataFromHostPtr(
+            [0.0736655, 0.0459045, 0.0779517, 0.0771059, 0.0586862, 0.0561263,
+             0.0708457, 0.0977273, 0.0405025, -0.170897, 0.0208982, 0.136865,
+             -0.0367905, -0.0618205, -0.0103908, -0.0522777, -0.122161,
+             -0.025427, -0.0718576, -0.185941, 0.0166533, 0.178679, -0.0576606,
+             -0.137817, 0.150676, 0.153442, -0.0929899, -0.148675, -0.112459,
+             -0.106284, -0.103074, -0.0668811])
+
+        handle = singa_wrap.BatchNormHandle(0.9, x2)
+        y2 = singa_wrap.CpuBatchNormForwardInference(handle, x2, scale, bias,
+                                                     mean, var)
+        self.assertListEqual([1, 2, 4, 4], list(y2.shape()))
+
+
+if __name__ == '__main__':
+    unittest.main()
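Assuming a build configured with -DUSE_MKLDNN=ON and the generated singa_wrap module
on PYTHONPATH, the new tests can be run directly (hypothetical invocation, not part
of the patch):

    python test/python/test_mkldnn.py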
diff --git a/test/python/test_operation.py b/test/python/test_operation.py
index 3e11156..4d49287 100755
--- a/test/python/test_operation.py
+++ b/test/python/test_operation.py
@@ -1,3 +1,20 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# =============================================================================
+
 import unittest
 from builtins import str
 
diff --git a/test/singa/test_cross_entropy.cc b/test/singa/test_cross_entropy.cc
index 3d704c8..eb2d606 100644
--- a/test/singa/test_cross_entropy.cc
+++ b/test/singa/test_cross_entropy.cc
@@ -29,9 +29,9 @@
 class TestSoftmaxCrossEntropy : public ::testing::Test {
  protected:
   virtual void SetUp() {
-    p.Reshape(singa::Shape{2, 4});
-    t.Reshape(singa::Shape{2, 1});
-    ta.Reshape(singa::Shape{2, 4});
+    p.Resize(singa::Shape{2, 4});
+    t.Resize(singa::Shape{2, 1});
+    ta.Resize(singa::Shape{2, 4});
   }
   const float pdat[8] = {0.1f, 0.1f, 0.1f, 0.1f, 0.1f, 0.1f, 0.1f, 0.1f };
   const int tdat[2] = {0, 2};
diff --git a/test/singa/test_mse.cc b/test/singa/test_mse.cc
index 7aa3326..98a86ab 100644
--- a/test/singa/test_mse.cc
+++ b/test/singa/test_mse.cc
@@ -28,8 +28,8 @@
 class TestMSE : public ::testing::Test {
  protected:
   virtual void SetUp() {
-    p.Reshape(singa::Shape{2, 3});
-    t.Reshape(singa::Shape{2, 3});
+    p.Resize(singa::Shape{2, 3});
+    t.Resize(singa::Shape{2, 3});
     p.CopyDataFromHostPtr(pdat, sizeof(pdat) / sizeof(float));
     t.CopyDataFromHostPtr(tdat, sizeof(pdat) / sizeof(float));
   }
diff --git a/test/singa/test_operation_batchnorm.cc b/test/singa/test_operation_batchnorm.cc
new file mode 100644
index 0000000..fc8c8b0
--- /dev/null
+++ b/test/singa/test_operation_batchnorm.cc
@@ -0,0 +1,252 @@
+/*********************************************************
+*
+* Licensed to the Apache Software Foundation (ASF) under one
+* or more contributor license agreements.  See the NOTICE file
+* distributed with this work for additional information
+* regarding copyright ownership.  The ASF licenses this file
+* to you under the Apache License, Version 2.0 (the
+* "License"); you may not use this file except in compliance
+* with the License.  You may obtain a copy of the License at
+*
+*   http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing,
+* software distributed under the License is distributed on an
+* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+* KIND, either express or implied.  See the License for the
+* specific language governing permissions and limitations
+* under the License.
+*
+************************************************************/
+
+#include "../src/model/operation/batchnorm.h"
+#include "gtest/gtest.h"
+#include <iostream>
+
+using namespace singa;
+
+#ifdef USE_MKLDNN
+
+TEST(OperationBatchNorm, ForwardInference) {
+  const float x_data[] = {1, 2,
+                          3, 4};
+  Tensor in(Shape{2, 2});
+  in.CopyDataFromHostPtr(x_data, 2 * 2);
+
+  const float alpha_[] = {1, 1};
+  Tensor alpha(Shape{2});
+  alpha.CopyDataFromHostPtr(alpha_, 2);
+
+  const float beta_[] = {2, 2};
+  Tensor beta(Shape{2});
+  beta.CopyDataFromHostPtr(beta_, 2);
+
+  const float mean_[] = {2, 3};
+  Tensor moving_mean(Shape{2});
+  moving_mean.CopyDataFromHostPtr(mean_, 2);
+
+  const float var_[] = {1, 1};
+  Tensor moving_var(Shape{2});
+  moving_var.CopyDataFromHostPtr(var_, 2);
+
+  // momentum (not used by the inference path)
+  BatchNormHandle batch_norm_handle(0.0f, in);
+  Tensor y = CpuBatchNormForwardInference(batch_norm_handle, in, alpha, beta, moving_mean, moving_var);
+
+
+  const float *outptr = y.data<float>();
+  const auto &shape = y.shape();
+  EXPECT_EQ(2u, shape.size());
+  EXPECT_EQ(2u, shape[0]);
+  EXPECT_EQ(2u, shape[1]);
+  EXPECT_NEAR(1.0f, outptr[0], 1e-4f);
+  EXPECT_NEAR(1.0f, outptr[1], 1e-4f);
+  EXPECT_NEAR(3.0f, outptr[2], 1e-4f);
+  EXPECT_NEAR(3.0f, outptr[3], 1e-4f);
+}
+
+TEST(OperationBatchNorm, ForwardInference4D) {
+  float x_data[] = {
+      0.0736655, 0.0459045, 0.0779517, 0.0771059,
+      0.0586862, 0.0561263, 0.0708457, 0.0977273,
+      0.0405025, -0.170897, 0.0208982, 0.136865,
+      -0.0367905, -0.0618205, -0.0103908, -0.0522777,
+      -0.122161, -0.025427, -0.0718576, -0.185941,
+      0.0166533, 0.178679, -0.0576606, -0.137817,
+      0.150676, 0.153442, -0.0929899, -0.148675,
+      -0.112459, -0.106284, -0.103074, -0.0668811
+  };
+  Tensor in(Shape{1, 2, 4, 4});
+  in.CopyDataFromHostPtr(x_data, 2*4*4);
+
+  const float alpha_[] = {1,1};
+  Tensor alpha(Shape{2});
+  alpha.CopyDataFromHostPtr(alpha_, 2);
+
+  const float beta_[] = {0,0};
+  Tensor beta(Shape{2});
+  beta.CopyDataFromHostPtr(beta_, 2);
+
+
+  const float mean_[] = { 0.02650639, -0.04573606};
+  Tensor moving_mean(Shape{2});
+  moving_mean.CopyDataFromHostPtr(mean_, 2);
+
+  const float var_[] = {0.00546934, 0.01202502};
+  Tensor moving_var(Shape{2});
+  moving_var.CopyDataFromHostPtr(var_, 2);
+
+  // momentum (not used by the inference path)
+  BatchNormHandle batch_norm_handle(0.0f, in);
+  Tensor y = CpuBatchNormForwardInference(batch_norm_handle, in, alpha, beta, moving_mean, moving_var);
+
+
+  // expected: y = (x - moving_mean) / sqrt(moving_var + epsilon), per channel
+  const float *outptr = y.data<float>();
+  const auto &shape = y.shape();
+  EXPECT_EQ(4u, shape.size());
+  EXPECT_EQ(1u, shape[0]);
+  EXPECT_EQ(2u, shape[1]);
+  EXPECT_EQ(4u, shape[2]);
+  EXPECT_EQ(4u, shape[3]);
+  EXPECT_NEAR(0.637092, outptr[0],  1e-4f);
+  EXPECT_NEAR(0.262057, outptr[1],  1e-4f);
+  EXPECT_NEAR(0.694995, outptr[2],  1e-4f);
+  EXPECT_NEAR(0.683569, outptr[3],  1e-4f);
+  EXPECT_NEAR(0.434730, outptr[4],  1e-4f);
+  EXPECT_NEAR(0.400147, outptr[5],  1e-4f);
+  EXPECT_NEAR(0.598998, outptr[6],  1e-4f);
+  EXPECT_NEAR(0.962152, outptr[7],  1e-4f);
+  EXPECT_NEAR(0.189079, outptr[8],  1e-4f);
+  EXPECT_NEAR(-2.66680, outptr[9],  1e-4f);
+  EXPECT_NEAR(-0.07576, outptr[10], 1e-4f);
+  EXPECT_NEAR(1.490880, outptr[11], 1e-4f);
+  EXPECT_NEAR(-0.85510, outptr[12], 1e-4f);
+  EXPECT_NEAR(-1.19324, outptr[13], 1e-4f);
+  EXPECT_NEAR(-0.49845, outptr[14], 1e-4f);
+  EXPECT_NEAR(-1.06433, outptr[15], 1e-4f);
+  EXPECT_NEAR(-0.69664, outptr[16], 1e-4f);
+  EXPECT_NEAR(0.185125, outptr[17], 1e-4f);
+  EXPECT_NEAR(-0.23810, outptr[18], 1e-4f);
+  EXPECT_NEAR(-1.27803, outptr[19], 1e-4f);
+  EXPECT_NEAR(0.568704, outptr[20], 1e-4f);
+  EXPECT_NEAR(2.045640, outptr[21], 1e-4f);
+  EXPECT_NEAR(-0.10869, outptr[22], 1e-4f);
+  EXPECT_NEAR(-0.83935, outptr[23], 1e-4f);
+  EXPECT_NEAR(1.790380, outptr[24], 1e-4f);
+  EXPECT_NEAR(1.815590, outptr[25], 1e-4f);
+  EXPECT_NEAR(-0.43073, outptr[26], 1e-4f);
+  EXPECT_NEAR(-0.93833, outptr[27], 1e-4f);
+  EXPECT_NEAR(-0.60820, outptr[28], 1e-4f);
+  EXPECT_NEAR(-0.55192, outptr[29], 1e-4f);
+  EXPECT_NEAR(-0.52265, outptr[30], 1e-4f);
+  EXPECT_NEAR(-0.19274, outptr[31], 1e-4f);
+}
+
+TEST(OperationBatchNorm, ForwardTraining) {
+  const float x_data[] = {1, 2, 3, 4};
+  Tensor x(Shape{2, 2});
+  x.CopyDataFromHostPtr(x_data, 2 * 2);
+
+  const float y_data[] = {9, 9, 9, 9};
+  Tensor y(Shape{2, 2});
+  y.CopyDataFromHostPtr(y_data, 2 * 2);
+
+  const float dy_data[] = {4, 3, 2, 1};
+  Tensor dy(Shape{2, 2});
+  dy.CopyDataFromHostPtr(dy_data, 2 * 2);
+
+  const float alpha_[] = {1, 1};
+  Tensor alpha(Shape{2});
+  alpha.CopyDataFromHostPtr(alpha_, 2);
+
+  const float beta_[] = {0, 0};
+  Tensor beta(Shape{2});
+  beta.CopyDataFromHostPtr(beta_, 2);
+
+
+  // momentum 0.3: running stats = 0.3 * previous + 0.7 * batch statistics
+  BatchNormHandle batch_norm_handle(0.3f, x);
+  const float running_mean_[] = {0,0};
+  Tensor running_mean(Shape{2});
+  Tensor running_var(Shape{2});
+  running_mean.CopyDataFromHostPtr(running_mean_, 2);
+  running_var.CopyDataFromHostPtr(running_mean_, 2);
+
+
+  // the training operation computes the running mean and var needed by backward
+  auto ret1 = CpuBatchNormForwardTraining(batch_norm_handle, x, alpha, beta, running_mean, running_var);
+  const float *yptr = ret1[0].data<float>();
+  EXPECT_NEAR(-1.0f, yptr[0], 1e-4f);
+  EXPECT_NEAR(-1.0f, yptr[1], 1e-4f);
+  EXPECT_NEAR(1.0f, yptr[2], 1e-4f);
+  EXPECT_NEAR(1.0f, yptr[3], 1e-4f);
+  const float *meanptr = ret1[1].data<float>();
+  EXPECT_NEAR(1.4f, meanptr[0], 1e-4f);
+  EXPECT_NEAR(2.1f, meanptr[1], 1e-4f);
+  const float *varptr = ret1[2].data<float>();
+  EXPECT_NEAR(0.7f, varptr[0], 1e-4f);
+  EXPECT_NEAR(0.7f, varptr[1], 1e-4f);
+}
+
+TEST(OperationBatchNorm, Backward) {
+  const float x_data[] = {1, 2, 3, 4};
+  Tensor x(Shape{2, 2});
+  x.CopyDataFromHostPtr(x_data, 2 * 2);
+
+  const float y_data[] = {9, 9, 9, 9};
+  Tensor y(Shape{2, 2});
+  y.CopyDataFromHostPtr(y_data, 2 * 2);
+
+  const float dy_data[] = {4, 3, 2, 1};
+  Tensor dy(Shape{2, 2});
+  dy.CopyDataFromHostPtr(dy_data, 2 * 2);
+
+  const float alpha_[] = {1, 1};
+  Tensor alpha(Shape{2});
+  alpha.CopyDataFromHostPtr(alpha_, 2);
+
+  const float beta_[] = {0, 0};
+  Tensor beta(Shape{2});
+  beta.CopyDataFromHostPtr(beta_, 2);
+
+
+  // momentum 0 ignores the previous running mean and var
+  BatchNormHandle batch_norm_handle(0.0f, x);
+  const float running_mean_[] = {1,2};
+  Tensor running_mean(Shape{2});
+  Tensor running_var(Shape{2});
+  running_mean.CopyDataFromHostPtr(running_mean_, 2);
+  running_var.CopyDataFromHostPtr(running_mean_, 2);
+
+
+  // the training operation computes the running mean and var needed by backward
+  auto ret1 = CpuBatchNormForwardTraining(batch_norm_handle, x, alpha, beta, running_mean, running_var);
+
+  // calculate dx, dscale, dbias
+  auto ret2 = CpuBatchNormBackwardx(batch_norm_handle, y, dy, x, alpha, beta, ret1[1], ret1[2]);
+
+  const auto &shape = ret2[0].shape();
+  EXPECT_EQ(2u, shape.size());
+  EXPECT_EQ(2u, shape[0]);
+  EXPECT_EQ(2u, shape[1]);
+  const float *dxptr = ret2[0].data<float>();
+  EXPECT_NEAR(.0f, dxptr[0], 1e-4f);
+  EXPECT_NEAR(.0f, dxptr[1], 1e-4f);
+  EXPECT_NEAR(.0f, dxptr[2], 1e-4f);
+  EXPECT_NEAR(.0f, dxptr[3], 1e-4f);
+
+
+  const auto &dbnScaleShape = ret2[1].shape();
+  EXPECT_EQ(2u, dbnScaleShape[0]);
+  const auto &dbnBiasShape = ret2[2].shape();
+  EXPECT_EQ(2u, dbnBiasShape[0]);
+  const float *dbnScaleptr = ret2[1].data<float>();
+  EXPECT_NEAR(-2.0f, dbnScaleptr[0], 1e-4f);
+  EXPECT_NEAR(-2.0f, dbnScaleptr[1], 1e-4f);
+  const float *dbnBiasptr = ret2[2].data<float>();
+  EXPECT_NEAR(6.0f, dbnBiasptr[0], 1e-4f);
+  EXPECT_NEAR(4.0f, dbnBiasptr[1], 1e-4f);
+}
+
+#endif // USE_MKLDNN
diff --git a/test/singa/test_operation_convolution.cc b/test/singa/test_operation_convolution.cc
new file mode 100644
index 0000000..81bea38
--- /dev/null
+++ b/test/singa/test_operation_convolution.cc
@@ -0,0 +1,171 @@
+/************************************************************
+*
+* Licensed to the Apache Software Foundation (ASF) under one
+* or more contributor license agreements.  See the NOTICE file
+* distributed with this work for additional information
+* regarding copyright ownership.  The ASF licenses this file
+* to you under the Apache License, Version 2.0 (the
+* "License"); you may not use this file except in compliance
+* with the License.  You may obtain a copy of the License at
+*
+*   http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing,
+* software distributed under the License is distributed on an
+* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+* KIND, either express or implied.  See the License for the
+* specific language governing permissions and limitations
+* under the License.
+*
+*************************************************************/
+#include "singa/singa_config.h"
+
+#ifdef USE_CBLAS
+
+#include "../src/model/operation/convolution.h"
+
+#include "gtest/gtest.h"
+
+using namespace singa;
+
+#ifdef USE_MKLDNN
+
+TEST(Operation_Convolution, Forward) {
+  const size_t batch_size = 2, c = 1, h = 3, w = 3;
+  const float x[batch_size * c * h * w] = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f,
+                                          7.0f, 8.0f, 9.0f, 1.0f, 2.0f, 3.0f,
+                                          4.0f, 5.0f, 6.0f, 7.0f, 8.0f, 9.0f};
+  Tensor in(Shape{batch_size, c, h, w});
+  in.CopyDataFromHostPtr(x, batch_size * c * h * w);
+
+  const size_t num_filters = 1;
+  const size_t kernel_w = 3;
+  const size_t kernel_h = 3;
+  const std::vector<size_t> stride = {2, 2};
+  const std::vector<size_t> padding = {1, 1};
+  const bool bias_flag = true;
+
+  const float we[num_filters * kernel_w * kernel_h] = {1.0f, 1.0f, 0.0f,
+                                                       0.0f, 0.0f, -1.0f,
+                                                       0.0f, 1.0f, 0.0f};
+  Tensor weight(Shape{num_filters, num_filters, 3, 3});
+  weight.CopyDataFromHostPtr(we, num_filters * num_filters * kernel_w * kernel_h);
+
+  const float b[num_filters] = {1.0f};
+  Tensor bias(Shape{num_filters});
+  bias.CopyDataFromHostPtr(b, num_filters);
+
+
+  ConvHandle conv_handle(in, {kernel_w, kernel_h}, stride, padding, c, num_filters, bias_flag);
+  Tensor out1 = CpuConvForward(in, weight, bias, conv_handle);
+
+  const float *out_ptr1 = out1.data<float>();
+  // Input: 3*3; kernel: 3*3; stride: 2*2; padding: 1*1.
+  EXPECT_EQ(8u, out1.Size());
+
+  EXPECT_EQ(3.0f, out_ptr1[0]);
+  EXPECT_EQ(7.0f, out_ptr1[1]);
+  EXPECT_EQ(-3.0f, out_ptr1[2]);
+  EXPECT_EQ(12.0f, out_ptr1[3]);
+  EXPECT_EQ(3.0f, out_ptr1[4]);
+  EXPECT_EQ(7.0f, out_ptr1[5]);
+  EXPECT_EQ(-3.0f, out_ptr1[6]);
+  EXPECT_EQ(12.0f, out_ptr1[7]);
+}
+
+TEST(Operation_Convolution, Backward) {
+  const size_t batch_size = 2, c = 1, h = 3, w = 3;
+  const float x[batch_size * c * h * w] = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f,
+                                           7.0f, 8.0f, 9.0f, 1.0f, 2.0f, 3.0f,
+                                           4.0f, 5.0f, 6.0f, 7.0f, 8.0f, 9.0f};
+  Tensor in(Shape{batch_size, c, h, w});
+  in.CopyDataFromHostPtr(x, batch_size * c * h * w);
+
+  const size_t num_filters = 1;
+  const size_t kernel_w = 3;
+  const size_t kernel_h = 3;
+  const std::vector<size_t> stride = {2, 2};
+  const std::vector<size_t> padding = {1, 1};
+  const bool bias_flag = true;
+
+  const float we[num_filters * kernel_w * kernel_h] = {1.0f, 1.0f, 0.0f,
+                                                       0.0f, 0.0f, -1.0f,
+                                                       0.0f, 1.0f, 0.0f};
+  Tensor weight(Shape{num_filters, num_filters, 3, 3});
+  weight.CopyDataFromHostPtr(we, num_filters * num_filters * kernel_w * kernel_h);
+
+  const float b[num_filters] = {1.0f};
+  Tensor bias(Shape{num_filters});
+  bias.CopyDataFromHostPtr(b, num_filters);
+
+
+  ConvHandle conv_handle(in, {kernel_w, kernel_h}, stride, padding, c, num_filters, bias_flag);
+  Tensor out1 = CpuConvForward(in, weight, bias, conv_handle);
+
+  // grad
+  const size_t grad_h = 2, grad_w = 2;
+  const float dy[batch_size * num_filters * grad_h * grad_w] = {
+      0.1f, 0.2f, 0.3f, 0.4f, 0.1f, 0.2f, 0.3f, 0.4f};
+  Tensor grad(Shape{batch_size, num_filters, grad_h, grad_w});
+  grad.CopyDataFromHostPtr(dy, batch_size * num_filters * grad_h * grad_w);
+
+  Tensor in_grad = CpuConvBackwardx(grad, weight, in, conv_handle);
+
+  const float *dx = in_grad.data<float>();
+  const float *wptr = we;
+  EXPECT_EQ(18u, in_grad.Size());
+  EXPECT_EQ(dy[0] * wptr[4], dx[0]);
+  EXPECT_EQ(dy[0] * wptr[5] + dy[1] * wptr[3], dx[1]);
+  EXPECT_EQ(dy[1] * wptr[4], dx[2]);
+  EXPECT_EQ(dy[0] * wptr[7] + dy[2] * wptr[1], dx[3]);
+  EXPECT_EQ(
+      dy[0] * wptr[8] + dy[1] * wptr[6] + dy[2] * wptr[2] + dy[3] * wptr[0],
+      dx[4]);
+  EXPECT_EQ(dy[1] * wptr[7] + dy[3] * wptr[1], dx[5]);
+  EXPECT_EQ(dy[2] * wptr[4], dx[6]);
+  EXPECT_EQ(dy[2] * wptr[5] + dy[3] * wptr[3], dx[7]);
+  EXPECT_EQ(dy[3] * wptr[4], dx[8]);
+  EXPECT_EQ(dy[4] * wptr[4], dx[9]);
+  EXPECT_EQ(dy[4] * wptr[5] + dy[1] * wptr[3], dx[10]);
+  EXPECT_EQ(dy[5] * wptr[4], dx[11]);
+  EXPECT_EQ(dy[4] * wptr[7] + dy[2] * wptr[1], dx[12]);
+  EXPECT_EQ(
+      dy[4] * wptr[8] + dy[5] * wptr[6] + dy[6] * wptr[2] + dy[7] * wptr[0],
+      dx[13]);
+  EXPECT_EQ(dy[5] * wptr[7] + dy[7] * wptr[1], dx[14]);
+  EXPECT_EQ(dy[6] * wptr[4], dx[15]);
+  EXPECT_EQ(dy[6] * wptr[5] + dy[7] * wptr[3], dx[16]);
+  EXPECT_EQ(dy[7] * wptr[4], dx[17]);
+
+
+  Tensor dw = CpuConvBackwardW(grad, in, weight, conv_handle);
+  Tensor db = CpuConvBackwardb(grad, bias, conv_handle);
+
+
+  const float *dbptr = db.data<float>();
+  EXPECT_FLOAT_EQ(dy[0] + dy[1] + dy[2] + dy[3] + dy[4] + dy[5] + dy[6] + dy[7],
+                  dbptr[0]);
+
+  const float *dwptr = dw.data<float>();
+  EXPECT_EQ(9u, dw.Size());
+  EXPECT_FLOAT_EQ(dy[3] * x[4] + dy[7] * x[13], dwptr[0]);
+  EXPECT_FLOAT_EQ(dy[3] * x[5] + dy[7] * x[14] + dy[2] * x[3] + dy[6] * x[12],
+                  dwptr[1]);
+  EXPECT_FLOAT_EQ(dy[2] * x[4] + dy[6] * x[13], dwptr[2]);
+  EXPECT_FLOAT_EQ(dy[1] * x[1] + dy[5] * x[10] + dy[3] * x[7] + dy[7] * x[16],
+                  dwptr[3]);
+  EXPECT_FLOAT_EQ(dy[0] * x[0] + dy[4] * x[9] + dy[1] * x[2] + dy[5] * x[11] +
+                  dy[2] * x[6] + dy[6] * x[15] + dy[3] * x[8] +
+                  dy[7] * x[17],
+                  dwptr[4]);
+  EXPECT_FLOAT_EQ(dy[0] * x[1] + dy[4] * x[10] + dy[2] * x[7] + dy[6] * x[16],
+                  dwptr[5]);
+  EXPECT_FLOAT_EQ(dy[1] * x[4] + dy[5] * x[13], dwptr[6]);
+  EXPECT_FLOAT_EQ(dy[0] * x[3] + dy[4] * x[12] + dy[1] * x[5] + dy[5] * x[14],
+                  dwptr[7]);
+  EXPECT_FLOAT_EQ(dy[0] * x[4] + dy[4] * x[13], dwptr[8]);
+}
+
+#endif  // USE_MKLDNN
+
+#endif  // USE_CBLAS
diff --git a/test/singa/test_operation_pooling.cc b/test/singa/test_operation_pooling.cc
new file mode 100644
index 0000000..3bcc896
--- /dev/null
+++ b/test/singa/test_operation_pooling.cc
@@ -0,0 +1,183 @@
+/************************************************************
+*
+* Licensed to the Apache Software Foundation (ASF) under one
+* or more contributor license agreements.  See the NOTICE file
+* distributed with this work for additional information
+* regarding copyright ownership.  The ASF licenses this file
+* to you under the Apache License, Version 2.0 (the
+* "License"); you may not use this file except in compliance
+* with the License.  You may obtain a copy of the License at
+*
+*   http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing,
+* software distributed under the License is distributed on an
+* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+* KIND, either express or implied.  See the License for the
+* specific language governing permissions and limitations
+* under the License.
+*
+*************************************************************/
+#include "../src/model/operation/pooling.h"
+
+#include "gtest/gtest.h"
+
+using namespace singa;
+
+#ifdef USE_MKLDNN
+TEST(OperationPooling, Forward) {
+  const size_t batchsize = 2, c = 1, h = 3, w = 3;
+  const float x[batchsize * c * h * w] = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f,
+                                          7.0f, 8.0f, 9.0f, 1.0f, 2.0f, 3.0f,
+                                          4.0f, 5.0f, 6.0f, 7.0f, 8.0f, 9.0f};
+  Tensor in(Shape{batchsize, c, h, w});
+  in.CopyDataFromHostPtr(x, batchsize * c * h * w);
+
+
+  PoolingHandle pool_handle(in, {2, 2}, {1,1}, {0,0}, true);
+  Tensor out1 = CpuPoolingForward(pool_handle, in);
+
+  const float *outptr1 = out1.data<float>();
+  // Input: 3*3; kernel: 2*2; stride: 1*1; no padding.
+  EXPECT_EQ(8u, out1.Size());
+  EXPECT_EQ(5.0f, outptr1[0]);
+  EXPECT_EQ(6.0f, outptr1[1]);
+  EXPECT_EQ(8.0f, outptr1[2]);
+  EXPECT_EQ(9.0f, outptr1[3]);
+  EXPECT_EQ(5.0f, outptr1[4]);
+  EXPECT_EQ(6.0f, outptr1[5]);
+  EXPECT_EQ(8.0f, outptr1[6]);
+  EXPECT_EQ(9.0f, outptr1[7]);
+
+}
+
+TEST(OperationPooling, ForwardAverage) {
+  const size_t batchsize = 2, c = 1, h = 3, w = 3;
+  const float x[batchsize * c * h * w] = {1.0f, 2.0f, 3.0f,
+                                          4.0f, 5.0f, 6.0f,
+                                          7.0f, 8.0f, 9.0f,
+
+                                          1.0f, 2.0f, 3.0f,
+                                          4.0f, 5.0f, 6.0f,
+                                          7.0f, 8.0f, 9.0f};
+  Tensor in(Shape{batchsize, c, h, w});
+  in.CopyDataFromHostPtr(x, batchsize * c * h * w);
+
+
+  PoolingHandle pool_handle(in, {2, 2}, {1, 1}, {0, 0}, false);
+  Tensor out1 = CpuPoolingForward(pool_handle, in);
+
+  // Parameter "flag" does not influence pooling
+  const float *outptr1 = out1.data<float>();
+  // Input: 3*3; kernel: 2*2; stride: 1*1; no padding.
+  EXPECT_EQ(8u, out1.Size());
+  EXPECT_EQ(3.0f, outptr1[0]);
+  EXPECT_EQ(4.0f, outptr1[1]);
+  EXPECT_EQ(6.0f, outptr1[2]);
+  EXPECT_EQ(7.0f, outptr1[3]);
+  EXPECT_EQ(3.0f, outptr1[4]);
+  EXPECT_EQ(4.0f, outptr1[5]);
+  EXPECT_EQ(6.0f, outptr1[6]);
+  EXPECT_EQ(7.0f, outptr1[7]);
+
+}
+
+
+TEST(OperationPooling, Backward) {
+  // src_data
+  const size_t batchsize = 2, c = 1, src_h = 3, src_w = 3;
+  const float x[batchsize * c * src_h * src_w] = {
+      1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f, 9.0f,
+      1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f, 9.0f};
+  Tensor in(Shape{batchsize, c, src_h, src_w});
+  in.CopyDataFromHostPtr(x, batchsize * c * src_h * src_w);
+
+
+  PoolingHandle pool_handle(in, {2, 2}, {1, 1}, {0, 0}, true);
+
+  Tensor out = CpuPoolingForward(pool_handle, in);
+
+  // grad - bwd
+  const size_t grad_h = 2, grad_w = 2;
+  const float dy[batchsize * c * grad_h * grad_w] = {0.1f, 0.2f, 0.3f, 0.4f,
+                                                     0.1f, 0.2f, 0.3f, 0.4f};
+  Tensor grad(Shape{batchsize, c, grad_h, grad_w});
+  grad.CopyDataFromHostPtr(dy, batchsize * c * grad_h * grad_w);
+
+  Tensor in_grad = CpuPoolingBackward(pool_handle, grad, in, out);
+
+
+  const float *dx = in_grad.data<float>();
+  EXPECT_EQ(18u, in_grad.Size());
+  EXPECT_EQ(0.0f, dx[0]);
+  EXPECT_EQ(0.0f, dx[1]);
+  EXPECT_EQ(0.0f, dx[2]);
+  EXPECT_EQ(0.0f, dx[3]);
+  EXPECT_EQ(0.1f, dx[4]);
+  EXPECT_EQ(0.2f, dx[5]);
+  EXPECT_EQ(0.0f, dx[6]);
+  EXPECT_EQ(0.3f, dx[7]);
+  EXPECT_EQ(0.4f, dx[8]);
+  EXPECT_EQ(0.0f, dx[9]);
+  EXPECT_EQ(0.0f, dx[10]);
+  EXPECT_EQ(0.0f, dx[11]);
+  EXPECT_EQ(0.0f, dx[12]);
+  EXPECT_EQ(0.1f, dx[13]);
+  EXPECT_EQ(0.2f, dx[14]);
+  EXPECT_EQ(0.0f, dx[15]);
+  EXPECT_EQ(0.3f, dx[16]);
+  EXPECT_EQ(0.4f, dx[17]);
+}
+
+TEST(OperationPooling, BackwardAvg) {
+  // src_data
+  const size_t batchsize = 2, c = 1, src_h = 3, src_w = 3;
+  const float x[batchsize * c * src_h * src_w] = {
+      1.0f, 2.0f, 3.0f,
+      4.0f, 5.0f, 6.0f,
+      7.0f, 8.0f, 9.0f,
+
+      1.0f, 2.0f, 3.0f,
+      4.0f, 5.0f, 6.0f,
+      7.0f, 8.0f, 9.0f};
+  Tensor in(Shape{batchsize, c, src_h, src_w});
+  in.CopyDataFromHostPtr(x, batchsize * c * src_h * src_w);
+
+
+  PoolingHandle pool_handle(in, {2, 2}, {1, 1}, {0, 0}, false);
+
+  Tensor out = CpuPoolingForward(pool_handle, in);
+
+  // grad - bwd
+  const size_t grad_h = 2, grad_w = 2;
+  const float dy[batchsize * c * grad_h * grad_w] = {0.1f, 0.2f, 0.3f, 0.4f,
+                                                     0.1f, 0.2f, 0.3f, 0.4f};
+  Tensor grad(Shape{batchsize, c, grad_h, grad_w});
+  grad.CopyDataFromHostPtr(dy, batchsize * c * grad_h * grad_w);
+
+  Tensor in_grad = CpuPoolingBackward(pool_handle, grad, in, out);
+
+  const float *dx = in_grad.data<float>();
+  EXPECT_EQ(18u, in_grad.Size());
+  EXPECT_NEAR(0.0250f, dx[0], 1e-5f);
+  EXPECT_NEAR(0.0750f, dx[1], 1e-5f);
+  EXPECT_NEAR(0.0500f, dx[2], 1e-5f);
+  EXPECT_NEAR(0.1000f, dx[3], 1e-5f);
+  EXPECT_NEAR(0.2500f, dx[4], 1e-5f);
+  EXPECT_NEAR(0.1500f, dx[5], 1e-5f);
+  EXPECT_NEAR(0.0750f, dx[6], 1e-5f);
+  EXPECT_NEAR(0.1750f, dx[7], 1e-5f);
+  EXPECT_NEAR(0.1000f, dx[8], 1e-5f);
+  EXPECT_NEAR(0.0250f, dx[9], 1e-5f);
+  EXPECT_NEAR(0.0750f, dx[10], 1e-5f);
+  EXPECT_NEAR(0.0500f, dx[11], 1e-5f);
+  EXPECT_NEAR(0.1000f, dx[12], 1e-5f);
+  EXPECT_NEAR(0.2500f, dx[13], 1e-5f);
+  EXPECT_NEAR(0.1500f, dx[14], 1e-5f);
+  EXPECT_NEAR(0.0750f, dx[15], 1e-5f);
+  EXPECT_NEAR(0.1750f, dx[16], 1e-5f);
+  EXPECT_NEAR(0.1000f, dx[17], 1e-5f);
+}
+
+#endif // USE_MKLDNN
diff --git a/test/singa/test_tensor.cc b/test/singa/test_tensor.cc
index c8df3ee..5c42b7e 100644
--- a/test/singa/test_tensor.cc
+++ b/test/singa/test_tensor.cc
@@ -22,8 +22,8 @@
 using singa::Shape;
 using singa::Device;
 
-TEST(TensorTest, TestConstructor) {
-  singa::Tensor float_t(singa::Shape{2,3});
+TEST(TensorClass, Constructor) {
+  singa::Tensor float_t(singa::Shape{2, 3});
   EXPECT_EQ(6u, float_t.Size());
   EXPECT_EQ(sizeof(float) * 6, float_t.MemSize());
   EXPECT_EQ(singa::kFloat32, float_t.data_type());
@@ -33,7 +33,7 @@
 
   EXPECT_NE(float_t.device(), nullptr);
 
-  singa::Tensor float16_t(Shape{2,3}, singa::kFloat16);
+  singa::Tensor float16_t(Shape{2, 3}, singa::kFloat16);
   EXPECT_EQ(singa::kFloat16, float16_t.data_type());
   EXPECT_EQ(6u, float16_t.Size());
   EXPECT_EQ(12u, float16_t.block()->size());
@@ -53,18 +53,18 @@
 
 TEST(TensorClass, Reshape) {
   Tensor t;
-  t.Reshape(Shape{2,3});
-  EXPECT_TRUE((Shape{2,3} == t.shape()));
+  t.Resize(Shape{2, 3});
+  EXPECT_TRUE((Shape{2, 3} == t.shape()));
 
-  t.Reshape(Shape{3,3, 4});
-  EXPECT_TRUE((Shape{3,3, 4} == t.shape()));
+  t.Resize(Shape{3, 3, 4});
+  EXPECT_TRUE((Shape{3, 3, 4} == t.shape()));
 
-  t.Reshape(Shape{12});
+  t.Resize(Shape{12});
   EXPECT_TRUE((Shape{12} == t.shape()));
 
   Tensor o;
   EXPECT_TRUE(o.shape() != t.shape());
-  o.Reshape(Shape{3, 3});
+  o.Resize(Shape{3, 3});
   EXPECT_TRUE(o.shape() != t.shape());
 }
 
@@ -76,7 +76,7 @@
 }
 
 TEST(TensorClass, ToDevice) {
-  Tensor t(Shape{2,3});
+  Tensor t(Shape{2, 3});
   EXPECT_EQ(singa::defaultDevice, t.device());
   auto dev = std::make_shared<singa::CppCPU>();
   t.ToDevice(dev);
@@ -119,9 +119,10 @@
 }
 
 TEST(TensorClass, T) {
-  Tensor t(Shape{2,3});
+  Tensor t(Shape{2, 3});
   EXPECT_FALSE(t.transpose());
-  Tensor o = t.T();
+  Tensor o = t.T();  // o is the transposed view, shape {3, 2}
+  t.T();             // t itself keeps its original shape {2, 3}
   EXPECT_EQ(true, o.transpose());
   EXPECT_EQ(t.block(), o.block());
   EXPECT_EQ(t.data_type(), o.data_type());
@@ -134,7 +135,7 @@
   Tensor t(Shape{3});
   t.CopyDataFromHostPtr(data, 3);
 
-  Tensor o = t.Repeat(vector <size_t>{2},9999);
+  Tensor o = t.Repeat(vector <size_t> {2}, 9999);
   const float* dptr = static_cast<const float*>(o.block()->data());
   EXPECT_FLOAT_EQ(1.0f, dptr[0]);
   EXPECT_FLOAT_EQ(1.0f, dptr[1]);
@@ -144,13 +145,13 @@
   EXPECT_FLOAT_EQ(3.0f, dptr[5]);
 }
 
-TEST(TensorCLass, RepeatData) {
+TEST(TensorClass, RepeatData) {
   float data[] = {1.0f, 2.0f, 3.0f};
   Tensor t(Shape{3});
   t.CopyDataFromHostPtr(data, 3);
 
   Tensor o(Shape{6});
-  o.RepeatData({2},9999,2,t);
+  o.RepeatData({2}, 9999, 2, t);
   const float* dptr = static_cast<const float*>(o.block()->data());
   EXPECT_FLOAT_EQ(1.0f, dptr[0]);
   EXPECT_FLOAT_EQ(1.0f, dptr[1]);
@@ -160,3 +161,63 @@
   EXPECT_FLOAT_EQ(3.0f, dptr[5]);
 }
 
+TEST(TensorClass, Broadcast) {
+  {
+    Tensor a1(Shape{2, 3, 4, 5}), b1(Shape{5});
+    auto c1 = Broadcast(a1, b1.shape()).shape();
+    auto c2 = Broadcast(b1, a1.shape()).shape();
+    EXPECT_EQ(c1[0], 2);
+    EXPECT_EQ(c1[1], 3);
+    EXPECT_EQ(c1[2], 4);
+    EXPECT_EQ(c1[3], 5);
+
+    EXPECT_EQ(c2[0], 2);
+    EXPECT_EQ(c2[1], 3);
+    EXPECT_EQ(c2[2], 4);
+    EXPECT_EQ(c2[3], 5);
+  }
+  {
+    Tensor a1(Shape{4, 5}), b1(Shape{2, 3, 4, 5});
+    auto c1 = Broadcast(a1, b1.shape()).shape();
+    auto c2 = Broadcast(b1, a1.shape()).shape();
+    EXPECT_EQ(c1[0], 2);
+    EXPECT_EQ(c1[1], 3);
+    EXPECT_EQ(c1[2], 4);
+    EXPECT_EQ(c1[3], 5);
+
+    EXPECT_EQ(c2[0], 2);
+    EXPECT_EQ(c2[1], 3);
+    EXPECT_EQ(c2[2], 4);
+    EXPECT_EQ(c2[3], 5);
+  }
+  {
+    Tensor a1(Shape{1, 4, 5}), b1(Shape{2, 3, 1, 1});
+    auto c1 = Broadcast(a1, b1.shape()).shape();
+    auto c2 = Broadcast(b1, a1.shape()).shape();
+ 
+    EXPECT_EQ(c1[0], 2);
+    EXPECT_EQ(c1[1], 3);
+    EXPECT_EQ(c1[2], 4);
+    EXPECT_EQ(c1[3], 5);
+
+    EXPECT_EQ(c2[0], 2);
+    EXPECT_EQ(c2[1], 3);
+    EXPECT_EQ(c2[2], 4);
+    EXPECT_EQ(c2[3], 5);
+  }
+  {
+    Tensor a1(Shape{3, 4, 5}), b1(Shape{2, 1, 1, 1});
+    auto c1 = Broadcast(a1, b1.shape()).shape();
+    auto c2 = Broadcast(b1, a1.shape()).shape();
+ 
+    EXPECT_EQ(c1[0], 2);
+    EXPECT_EQ(c1[1], 3);
+    EXPECT_EQ(c1[2], 4);
+    EXPECT_EQ(c1[3], 5);
+
+    EXPECT_EQ(c2[0], 2);
+    EXPECT_EQ(c2[1], 3);
+    EXPECT_EQ(c2[2], 4);
+    EXPECT_EQ(c2[3], 5);
+  }
+}
diff --git a/test/singa/test_tensor_math.cc b/test/singa/test_tensor_math.cc
index 116262c..6228607 100644
--- a/test/singa/test_tensor_math.cc
+++ b/test/singa/test_tensor_math.cc
@@ -22,14 +22,14 @@
 using singa::Shape;
 using singa::Device;
 
-class TestTensorMath : public ::testing::Test {
+class TensorMath : public ::testing::Test {
  protected:
   virtual void SetUp() {
-    a.Reshape(singa::Shape{6});
-    b.Reshape(singa::Shape{6});
-    c.Reshape(singa::Shape{6, 1});
-    d.Reshape(singa::Shape{3, 2});
-    e.Reshape(singa::Shape{3, 2});
+    a.Resize(singa::Shape{6});
+    b.Resize(singa::Shape{6});
+    c.Resize(singa::Shape{6, 1});
+    d.Resize(singa::Shape{3, 2});
+    e.Resize(singa::Shape{3, 2});
 
     a.CopyDataFromHostPtr<float>(dat1, 6);
     b.CopyDataFromHostPtr<float>(dat2, 6);
@@ -40,7 +40,7 @@
   const float dat2[6] = {1.1f, 2.1f, 3.1f, 4.1f, 5.1f, 6.1f};
 };
 
-TEST_F(TestTensorMath, MemberAbs) {
+TEST_F(TensorMath, AbsCpp) {
   Tensor aa = a.Clone();
   Tensor bb = b.Clone();
   Tensor cc = aa - bb;
@@ -56,7 +56,7 @@
   EXPECT_NEAR(0.1, dptr1[2], 1e-5);
 }
 
-TEST_F(TestTensorMath, MemberExp) {
+TEST_F(TensorMath, ExpCpp) {
   Tensor p = Exp(a);
   const float *dptr1 = p.data<float>();
   EXPECT_NEAR(exp(1.0f), dptr1[0], 1e-5);
@@ -64,7 +64,17 @@
   EXPECT_NEAR(exp(3.0f), dptr1[2], 1e-5);
 }
 
-TEST_F(TestTensorMath, MemberLog) {
+TEST_F(TensorMath, ExpStrideCpp) {
+  auto x = singa::Tensor(singa::Shape{2, 1, 3});
+  auto y = singa::Transpose(x, {1, 2, 0});
+  Exp(singa::Reshape(a, singa::Shape{1, 3, 2}), &y);
+  const float *dptr1 = y.data<float>();
+  EXPECT_NEAR(exp(dat1[0]), dptr1[0], 1e-5);
+  EXPECT_NEAR(exp(dat1[4]), dptr1[2], 1e-5);
+  EXPECT_NEAR(exp(dat1[3]), dptr1[4], 1e-5);
+}
+
+TEST_F(TensorMath, LogCpp) {
   Tensor p = Log(a);
   const float *dptr1 = p.data<float>();
   EXPECT_NEAR(log(1.0f), dptr1[0], 1e-5);
@@ -72,7 +82,7 @@
   EXPECT_NEAR(log(3.0f), dptr1[2], 1e-5);
 }
 
-TEST_F(TestTensorMath, MemberReLU) {
+TEST_F(TensorMath, ReLUCpp) {
   Tensor aa = a.Clone();
   Tensor cc = aa - 2.0f;
   const float *dptr = cc.data<float>();
@@ -87,7 +97,7 @@
   EXPECT_NEAR(1.0f, dptr1[2], 1e-5);
 }
 
-TEST_F(TestTensorMath, MemberSigmoid) {
+TEST_F(TensorMath, SigmoidCpp) {
   Tensor p = Sigmoid(a);
   const float *dptr1 = p.data<float>();
   EXPECT_NEAR(1.0f / (1.0f + exp(-1.0f)), dptr1[0], 1e-5);
@@ -95,7 +105,7 @@
   EXPECT_NEAR(1.0f / (1.0f + exp(-3.0f)), dptr1[2], 1e-5);
 }
 
-TEST_F(TestTensorMath, MemberSign) {
+TEST_F(TensorMath, SignCpp) {
   Tensor aa = a.Clone();
   Tensor cc = aa - 2.0f;
   const float *dptr = cc.data<float>();
@@ -110,7 +120,7 @@
   EXPECT_EQ(1.0f, dptr1[2]);
 }
 
-TEST_F(TestTensorMath, MemberSqrt) {
+TEST_F(TensorMath, SqrtCpp) {
   Tensor p = Sqrt(a);
   const float *dptr1 = p.data<float>();
   EXPECT_NEAR(sqrt(1.0), dptr1[0], 1e-5);
@@ -118,7 +128,7 @@
   EXPECT_NEAR(sqrt(3.0), dptr1[2], 1e-5);
 }
 
-TEST_F(TestTensorMath, MemberSquare) {
+TEST_F(TensorMath, SquareCpp) {
   Tensor p = Square(a);
   const float *dptr1 = p.data<float>();
   EXPECT_NEAR(1.0, dptr1[0], 1e-5);
@@ -126,7 +136,7 @@
   EXPECT_NEAR(9.0, dptr1[2], 1e-5);
 }
 
-TEST_F(TestTensorMath, MemberTanh) {
+TEST_F(TensorMath, TanhCpp) {
   Tensor p = Tanh(a);
   const float *dptr1 = p.data<float>();
   EXPECT_NEAR(tanh(1.0), dptr1[0], 1e-5);
@@ -134,7 +144,7 @@
   EXPECT_NEAR(tanh(3.0), dptr1[2], 1e-5);
 }
 
-TEST_F(TestTensorMath, Sum) {
+TEST_F(TensorMath, SumCpp) {
   Tensor p1 = Sum(e, 0);
   const float *dptr1 = p1.data<float>();
   EXPECT_FLOAT_EQ(9.0f, dptr1[0]);
@@ -148,7 +158,7 @@
   EXPECT_FLOAT_EQ(11.0f, dptr2[2]);
 }
 
-TEST_F(TestTensorMath, SoftMax) {
+TEST_F(TensorMath, SoftMaxCpp) {
   Tensor p1 = SoftMax(Reshape(e, Shape{1, 6}));
   const float *dptr1 = p1.data<float>();
   float sum = 0;
@@ -166,7 +176,7 @@
   EXPECT_NEAR(exp(2) / (exp(1) + exp(2)), dptr2[1], 1e-5);
 }
 
-TEST_F(TestTensorMath, MemberLT) {
+TEST_F(TensorMath, LTCpp) {
   Tensor p1 = a < 2.0f;
   const float *dptr1 = p1.data<float>();
   EXPECT_FLOAT_EQ(1.0f, dptr1[0]);
@@ -174,7 +184,7 @@
   EXPECT_FLOAT_EQ(0.0f, dptr1[2]);
 }
 
-TEST_F(TestTensorMath, MemberLE) {
+TEST_F(TensorMath, LECpp) {
   Tensor p1 = a <= 2.0f;
   const float *dptr1 = p1.data<float>();
   EXPECT_FLOAT_EQ(1.0f, dptr1[0]);
@@ -182,7 +192,7 @@
   EXPECT_FLOAT_EQ(0.0f, dptr1[2]);
 }
 
-TEST_F(TestTensorMath, MemberGT) {
+TEST_F(TensorMath, GTCpp) {
   Tensor p1 = a > 2.0f;
   const float *dptr1 = p1.data<float>();
   EXPECT_FLOAT_EQ(0.0f, dptr1[0]);
@@ -190,7 +200,7 @@
   EXPECT_FLOAT_EQ(1.0f, dptr1[2]);
 }
 
-TEST_F(TestTensorMath, MemberGE) {
+TEST_F(TensorMath, GECpp) {
   Tensor p1 = a >= 2.0f;
   const float *dptr1 = p1.data<float>();
   EXPECT_FLOAT_EQ(0.0f, dptr1[0]);
@@ -198,7 +208,7 @@
   EXPECT_FLOAT_EQ(1.0f, dptr1[2]);
 }
 
-TEST_F(TestTensorMath, MemberPow) {
+TEST_F(TensorMath, PowCpp) {
   Tensor p1 = Pow(b, 3.0f);
   const float *dptr1 = p1.data<float>();
   EXPECT_FLOAT_EQ(pow(1.1f, 3.0f), dptr1[0]);
@@ -214,7 +224,7 @@
   // EXPECT_FLOAT_EQ(pow(3.0f,3.1f), dptr2[2]);
 }
 
-TEST_F(TestTensorMath, MemberSub) {
+TEST_F(TensorMath, SubCpp) {
   Tensor p1 = a - b;
   const float *dptr1 = p1.data<float>();
   EXPECT_NEAR(-0.1, dptr1[0], 1e-5);
@@ -222,7 +232,7 @@
   EXPECT_NEAR(-0.1, dptr1[2], 1e-5);
 }
 
-TEST_F(TestTensorMath, MemberEltwiseMult) {
+TEST_F(TensorMath, EltwiseMultCpp) {
   Tensor p1 = a * b;
   const float *dptr1 = p1.data<float>();
   EXPECT_NEAR(1.0 * 1.1, dptr1[0], 1e-5);
@@ -230,7 +240,7 @@
   EXPECT_NEAR(3.0 * 3.1, dptr1[2], 1e-5);
 }
 
-TEST_F(TestTensorMath, MemberDiv) {
+TEST_F(TensorMath, DivCpp) {
   Tensor p1 = a / b;
   const float *dptr1 = p1.data<float>();
   EXPECT_NEAR(1.0 / 1.1, dptr1[0], 1e-5);
@@ -250,7 +260,7 @@
   EXPECT_NEAR(3.0 / 8.0, dptr3[2], 1e-5);
 }
 
-TEST_F(TestTensorMath, MemberBernoulli) {
+TEST_F(TensorMath, BernoulliCpp) {
   Tensor p1(Shape{10000});
   Bernoulli(0.3f, &p1);
   const float *dptr1 = p1.data<float>();
@@ -265,7 +275,7 @@
   EXPECT_NEAR(variance, 0.3 * 0.7, 1e-2);
 }
 
-TEST_F(TestTensorMath, MemberUniform) {
+TEST_F(TensorMath, UniformCpp) {
   Tensor p1(Shape{10000});
   Uniform(0.1f, 0.2f, &p1);
   const float *dptr1 = p1.data<float>();
@@ -280,7 +290,7 @@
   EXPECT_NEAR(variance, 0.01f / 12, 1e-3);
 }
 
-TEST_F(TestTensorMath, MemberGaussian) {
+TEST_F(TensorMath, GaussianCpp) {
   Tensor p1(Shape{50000});
   Gaussian(0.0f, 1.0f, &p1);
   const float *dptr1 = p1.data<float>();
@@ -295,7 +305,7 @@
   EXPECT_NEAR(variance, 1.0, 1e-2);
 }
 
-TEST_F(TestTensorMath, MemberAddTensor) {
+TEST_F(TensorMath, AddTensorCpp) {
   Tensor aa = a.Clone();
   aa += a;
   const float *dptr = aa.data<float>();
@@ -319,7 +329,7 @@
   EXPECT_FLOAT_EQ(12.1f, dptr2[5]);
 }
 
-TEST_F(TestTensorMath, AddTensors) {
+TEST_F(TensorMath, AddTensorsCpp) {
   Tensor ret(a.shape(), a.device(), a.data_type());
   Add(a, b, &ret);
   const float *dptr = ret.data<float>();
@@ -343,14 +353,14 @@
   EXPECT_FLOAT_EQ(12.1f, dptr1[5]);
 }
 
-TEST_F(TestTensorMath, SetValue) {
+TEST_F(TensorMath, SetValueCpp) {
   Tensor t(Shape{4});
   t.SetValue(0.3f);
   const float *ptr = t.data<float>();
   for (int i = 0; i < 4; i++) EXPECT_FLOAT_EQ(ptr[i], 0.3f);
 }
 
-TEST_F(TestTensorMath, Reshape) {
+TEST_F(TensorMath, ReshapeCpp) {
   Tensor t(Shape{4});
   t.SetValue(0.3f);
   Tensor p = Reshape(t, Shape{4, 1});
@@ -359,14 +369,84 @@
   EXPECT_EQ(p.shape(1), 1u);
   for (int i = 0; i < 4; i++) EXPECT_FLOAT_EQ(ptr[i], 0.3f);
 }
+
+
+TEST_F(TensorMath, BroadcastCpp) {
+  Tensor x(Shape{1});
+  x.SetValue(1.0f);
+  {
+    auto y = x + a;
+    const float *dptr = y.data<float>();
+    EXPECT_FLOAT_EQ(2.0f, dptr[0]);
+    EXPECT_FLOAT_EQ(3.0f, dptr[1]);
+    EXPECT_FLOAT_EQ(4.0f, dptr[2]);
+  }
+
+  {
+    auto y = x + e;
+    const float *dptr = y.data<float>();
+    EXPECT_FLOAT_EQ(2.0f, dptr[0]);
+    EXPECT_FLOAT_EQ(3.0f, dptr[1]);
+    EXPECT_FLOAT_EQ(4.0f, dptr[2]);
+  }
+
+  auto p = Reshape(e, Shape{3, 1, 2});
+  {
+    Tensor q(Shape{3, 1, 1});
+    q.CopyDataFromHostPtr(dat1, 3);
+    auto z = p + q;
+    const float *dptr = z.data<float>();
+    EXPECT_FLOAT_EQ(2.0f, dptr[0]);
+    EXPECT_FLOAT_EQ(3.0f, dptr[1]);
+    EXPECT_FLOAT_EQ(5.0f, dptr[2]);
+    EXPECT_FLOAT_EQ(6.0f, dptr[3]);
+    EXPECT_FLOAT_EQ(8.0f, dptr[4]);
+    EXPECT_FLOAT_EQ(9.0f, dptr[5]);
+  }
+
+  {
+    Tensor q(Shape{2});
+    q.CopyDataFromHostPtr(dat1, 2);
+    auto z = p + q;
+    const float *dptr = z.data<float>();
+    EXPECT_FLOAT_EQ(2.0f, dptr[0]);
+    EXPECT_FLOAT_EQ(4.0f, dptr[1]);
+    EXPECT_FLOAT_EQ(4.0f, dptr[2]);
+    EXPECT_FLOAT_EQ(6.0f, dptr[3]);
+    EXPECT_FLOAT_EQ(6.0f, dptr[4]);
+    EXPECT_FLOAT_EQ(8.0f, dptr[5]);
+  }
+
+  {
+    Tensor q(Shape{3, 1, 2, 1});
+    q.CopyDataFromHostPtr(dat1, 6);
+    auto z = p + q;
+    EXPECT_EQ(z.shape().size(), 4);
+    EXPECT_EQ(z.shape(0), 3);
+    EXPECT_EQ(z.shape(1), 3);
+    EXPECT_EQ(z.shape(2), 2);
+    EXPECT_EQ(z.shape(3), 2);
+    const float *dptr = z.data<float>();
+    EXPECT_FLOAT_EQ(2.0f, dptr[0]);
+    EXPECT_FLOAT_EQ(3.0f, dptr[1]);
+    EXPECT_FLOAT_EQ(3.0f, dptr[2]);
+    EXPECT_FLOAT_EQ(4.0f, dptr[3]);
+    EXPECT_FLOAT_EQ(6.0f, dptr[16]);
+    EXPECT_FLOAT_EQ(7.0f, dptr[17]);
+    EXPECT_FLOAT_EQ(7.0f, dptr[18]);
+    EXPECT_FLOAT_EQ(8.0f, dptr[19]);
+  }
+}
+
+
 #ifdef USE_CBLAS
-TEST_F(TestTensorMath, L2Cpp) {
+TEST_F(TensorMath, L2Cpp) {
   float l2 = a.L2();
   float target = 0.0f;
   for (size_t i = 0; i < a.Size(); i++) target += dat1[i] * dat1[i];
   EXPECT_FLOAT_EQ(l2, sqrt(target) / a.Size());
 }
-TEST_F(TestTensorMath, MultCpp) {
+TEST_F(TensorMath, MultCpp) {
   const float x[4] = {1.0f, 2.0f, 3.0f, 4.0f};
   Tensor t(Shape{2, 2});
   t.CopyDataFromHostPtr(x, 4);
@@ -413,7 +493,7 @@
   }
 }
 
-TEST_F(TestTensorMath, AddColumnCpp) {
+TEST_F(TensorMath, AddColumnCpp) {
   const float x[3] = {1.0f, 2.0f, 3.0f};
   Tensor t(Shape{3});
   t.CopyDataFromHostPtr(x, 3);
@@ -426,7 +506,7 @@
     }
   }
 }
-TEST_F(TestTensorMath, SubColumnCpp) {
+TEST_F(TensorMath, SubColumnCpp) {
   const float x[3] = {1.0f, 2.0f, 3.0f};
   Tensor t(Shape{3});
   t.CopyDataFromHostPtr(x, 3);
@@ -440,7 +520,7 @@
   }
 }
 
-TEST_F(TestTensorMath, DivColumnCpp) {
+TEST_F(TensorMath, DivColumnCpp) {
   const float x[3] = {1.0f, 2.0f, 3.0f};
   Tensor t(Shape{3});
   t.CopyDataFromHostPtr(x, 3);
@@ -454,7 +534,7 @@
   }
 }
 
-TEST_F(TestTensorMath, AddRowCpp) {
+TEST_F(TensorMath, AddRowCpp) {
   const float x[2] = {1.1f, 2.1f};
   Tensor t(Shape{2});
   t.CopyDataFromHostPtr(x, 2);
@@ -468,7 +548,7 @@
   }
 }
 
-TEST_F(TestTensorMath, SubRowCpp) {
+TEST_F(TensorMath, SubRowCpp) {
   const float x[2] = {1.1f, 2.1f};
   Tensor t(Shape{2});
   t.CopyDataFromHostPtr(x, 2);
@@ -482,7 +562,7 @@
   }
 }
 
-TEST_F(TestTensorMath, MultRowCpp) {
+TEST_F(TensorMath, MultRowCpp) {
   const float x[2] = {1.1f, 2.1f};
   Tensor t(Shape{2});
   t.CopyDataFromHostPtr(x, 2);
@@ -496,11 +576,39 @@
   }
 }
 
-TEST_F(TestTensorMath, SumRowsCpp) {
+TEST_F(TensorMath, MultColumnCpp) {
+  const float x[3] = {1.0f, 2.0f, 3.0f};
+  Tensor t(Shape{3});
+  t.CopyDataFromHostPtr(x, 3);
+  d.CopyDataFromHostPtr(dat1, 6);
+  MultColumn(t, &d);
+  const float *xptr = d.data<float>();
+  for (int i = 0; i < 3; i++) {
+    for (int j = 0; j < 2; j++) {
+      EXPECT_FLOAT_EQ(xptr[i * 2 + j], dat1[i * 2 + j] * x[i]);
+    }
+  }
+}
+
+TEST_F(TensorMath, DivRowCpp) {
+  const float x[2] = {1.1f, 2.1f};
+  Tensor t(Shape{2});
+  t.CopyDataFromHostPtr(x, 2);
+  d.CopyDataFromHostPtr(dat1, 6);
+  DivRow(t, &d);
+  const float *xptr = d.data<float>();
+  for (int i = 0; i < 3; i++) {
+    for (int j = 0; j < 2; j++) {
+      EXPECT_FLOAT_EQ(xptr[i * 2 + j], dat1[i * 2 + j] / x[j]);
+    }
+  }
+}
+
+TEST_F(TensorMath, SumRowsCpp) {
   Tensor t(Shape{2});
   float dat[6];
   for (int i = 0; i < 6; i ++)
-    dat[i] = (float)rand()/(float)(RAND_MAX/ 10);
+    dat[i] = (float)rand() / (float)(RAND_MAX / 10);
   d.CopyDataFromHostPtr(dat, 6);
   SumRows(d, &t);
   const float *tptr = t.data<float>();
@@ -513,7 +621,7 @@
   }
 }
 
-TEST_F(TestTensorMath, SumColumnsCpp) {
+TEST_F(TensorMath, SumColumnsCpp) {
   Tensor t(Shape{3});
   d.CopyDataFromHostPtr(dat1, 6);
   SumColumns(d, &t);
@@ -526,9 +634,62 @@
     EXPECT_FLOAT_EQ(tptr[i], tmp);
   }
 }
+
+TEST_F(TensorMath, ConcatenateRowsCpp) {
+  d.CopyDataFromHostPtr<float>(dat1, 6);
+  e.CopyDataFromHostPtr<float>(dat2, 6);
+  const auto ret = singa::ConcatenateRows(vector<Tensor> {d, e});
+  EXPECT_EQ(ret.shape(0), d.shape(0) + e.shape(0));
+  EXPECT_EQ(ret.shape(1), d.shape(1));
+  const float *retPtr = ret.data<float>();
+  for (int i = 0; i < 6; i++) EXPECT_FLOAT_EQ(retPtr[i], dat1[i]);
+  for (int i = 0; i < 6; i++) EXPECT_FLOAT_EQ(retPtr[i + 6], dat2[i]);
+}
+
+TEST_F(TensorMath, ConcatenateColumnsCpp) {
+  d.CopyDataFromHostPtr<float>(dat1, 6);
+  e.CopyDataFromHostPtr<float>(dat2, 6);
+  const auto ret = singa::ConcatenateColumns(vector<Tensor> {d, e});
+  EXPECT_EQ(ret.shape(0), d.shape(0));
+  EXPECT_EQ(ret.shape(1), d.shape(1) + e.shape(1));
+
+  const float *retPtr = ret.data<float>();
+  for (int i = 0; i < 3; i++) {
+    for (int j = 0; j < 2; j++)
+      EXPECT_FLOAT_EQ(retPtr[i * 4 + j], dat1[i * 2 + j]);
+    for (int j = 0; j < 2; j++)
+      EXPECT_FLOAT_EQ(retPtr[i * 4 + 2 + j], dat2[i * 2 + j]);
+  }
+}
+
+TEST_F(TensorMath, CopyRowsCpp) {
+  const auto ret = singa::CopyRows(e, 1, 2);
+  EXPECT_EQ(ret.shape(0), 1u);
+  EXPECT_EQ(ret.shape(1), e.shape(1));
+  const float *retPtr = ret.data<float>();
+  for (size_t i = 0; i < ret.Size(); i++)
+    EXPECT_FLOAT_EQ(retPtr[i], dat1[1 * 2 + i]);
+}
+
+TEST_F(TensorMath, CopyColumnsCpp) {
+  a.Reshape(Shape{2, 3});
+  const auto ret = singa::CopyColumns(a, 1, 3);
+  EXPECT_EQ(ret.shape(0), a.shape(0));
+  EXPECT_EQ(ret.shape(1), 2u);
+  const float *retPtr = ret.data<float>();
+  for (size_t i = 0; i < ret.shape(0); i++)
+    for (size_t j = 0; j < ret.shape(1); j++)
+      EXPECT_FLOAT_EQ(retPtr[i * ret.shape(1) + j],
+                      dat1[i * a.shape(1) + j + 1]);
+}
 #endif
+
+//////////////////////////////////////////////////////////
 #ifdef USE_CUDA
-TEST_F(TestTensorMath, L2Cuda) {
+TEST_F(TensorMath, L2Cuda) {
   auto dev = std::make_shared<singa::CudaGPU>();
   Tensor t(Shape{3, 2}, dev);
   t.CopyDataFromHostPtr(dat1, 6);
@@ -537,7 +698,7 @@
   for (size_t i = 0; i < t.Size(); i++) target += dat1[i] * dat1[i];
   EXPECT_FLOAT_EQ(l2, sqrt(target) / t.Size());
 }
-TEST_F(TestTensorMath, MultCuda) {
+TEST_F(TensorMath, MultCuda) {
   const float x[4] = {1.0f, 2.0f, 3.0f, 4.0f};
   auto dev = std::make_shared<singa::CudaGPU>();
   Tensor t(Shape{2, 2}, dev);
@@ -590,7 +751,7 @@
   p.ToHost();
 }
 
-TEST_F(TestTensorMath, AddColumnCuda) {
+TEST_F(TensorMath, AddColumnCuda) {
   const float x[3] = {1.0f, 2.0f, 3.0f};
   auto dev = std::make_shared<singa::CudaGPU>();
   Tensor t(Shape{3}, dev);
@@ -607,7 +768,7 @@
   }
 }
 
-TEST_F(TestTensorMath, SubColumnCuda) {
+TEST_F(TensorMath, SubColumnCuda) {
   const float x[3] = {1.0f, 2.0f, 3.0f};
   auto dev = std::make_shared<singa::CudaGPU>();
   Tensor t(Shape{3}, dev);
@@ -623,22 +784,8 @@
     }
   }
 }
-#endif
-TEST_F(TestTensorMath, MultColumnCpp) {
-  const float x[3] = {1.0f, 2.0f, 3.0f};
-  Tensor t(Shape{3});
-  t.CopyDataFromHostPtr(x, 3);
-  d.CopyDataFromHostPtr(dat1, 6);
-  MultColumn(t, &d);
-  const float *xptr = d.data<float>();
-  for (int i = 0; i < 3; i++) {
-    for (int j = 0; j < 2; j++) {
-      EXPECT_FLOAT_EQ(xptr[i * 2 + j], dat1[i * 2 + j] * x[i]);
-    }
-  }
-}
-#ifdef USE_CUDA
-TEST_F(TestTensorMath, MultColumnCuda) {
+
+TEST_F(TensorMath, MultColumnCuda) {
   const float x[3] = {1.0f, 2.0f, 3.0f};
   auto dev = std::make_shared<singa::CudaGPU>();
   Tensor t(Shape{3}, dev);
@@ -654,7 +801,7 @@
     }
   }
 }
-TEST_F(TestTensorMath, DivColumnCuda) {
+TEST_F(TensorMath, DivColumnCuda) {
   const float x[3] = {1.0f, 2.0f, 3.0f};
   auto dev = std::make_shared<singa::CudaGPU>();
   Tensor t(Shape{3}, dev);
@@ -670,7 +817,7 @@
     }
   }
 }
-TEST_F(TestTensorMath, AddRowCuda) {
+TEST_F(TensorMath, AddRowCuda) {
   const float x[2] = {1.1f, 2.1f};
   auto dev = std::make_shared<singa::CudaGPU>();
   Tensor t(Shape{2}, dev);
@@ -686,7 +833,7 @@
     }
   }
 }
-TEST_F(TestTensorMath, SubRowCuda) {
+TEST_F(TensorMath, SubRowCuda) {
   const float x[2] = {1.1f, 2.1f};
   auto dev = std::make_shared<singa::CudaGPU>();
   Tensor t(Shape{2}, dev);
@@ -702,7 +849,7 @@
     }
   }
 }
-TEST_F(TestTensorMath, MultRowCuda) {
+TEST_F(TensorMath, MultRowCuda) {
   const float x[2] = {1.1f, 2.1f};
   auto dev = std::make_shared<singa::CudaGPU>();
   Tensor t(Shape{2}, dev);
@@ -718,22 +865,8 @@
     }
   }
 }
-#endif
-TEST_F(TestTensorMath, DivRowCpp) {
-  const float x[2] = {1.1f, 2.1f};
-  Tensor t(Shape{2});
-  t.CopyDataFromHostPtr(x, 2);
-  d.CopyDataFromHostPtr(dat1, 6);
-  DivRow(t, &d);
-  const float *xptr = d.data<float>();
-  for (int i = 0; i < 3; i++) {
-    for (int j = 0; j < 2; j++) {
-      EXPECT_FLOAT_EQ(xptr[i * 2 + j], dat1[i * 2 + j] / x[j]);
-    }
-  }
-}
-#ifdef USE_CUDA
-TEST_F(TestTensorMath, DivRowCuda) {
+
+TEST_F(TensorMath, DivRowCuda) {
   const float x[2] = {1.1f, 2.1f};
   auto dev = std::make_shared<singa::CudaGPU>();
   Tensor t(Shape{2}, dev);
@@ -749,7 +882,7 @@
     }
   }
 }
-TEST_F(TestTensorMath, SumRowsCuda) {
+TEST_F(TensorMath, SumRowsCuda) {
   auto dev = std::make_shared<singa::CudaGPU>();
   Tensor t(Shape{2}, dev);
   d.CopyDataFromHostPtr(dat1, 6);
@@ -766,7 +899,7 @@
   }
   d.ToHost();
 }
-TEST_F(TestTensorMath, SumColumnCuda) {
+TEST_F(TensorMath, SumColumnCuda) {
   auto dev = std::make_shared<singa::CudaGPU>();
   Tensor t(Shape{3}, dev);
   d.CopyDataFromHostPtr(dat1, 6);
@@ -784,65 +917,28 @@
   d.ToHost();
 }
 
-#endif
-
-TEST_F(TestTensorMath, ConcatenateRowsCpp) {
+TEST_F(TensorMath, ExpStrideCuda) {
+  auto dev = std::make_shared<singa::CudaGPU>();
+  a.ToDevice(dev);
+  auto x = singa::Tensor(singa::Shape{2, 1, 3});
+  x.ToDevice(dev);
   d.CopyDataFromHostPtr<float>(dat1, 6);
-  e.CopyDataFromHostPtr<float>(dat2, 6);
-  const auto ret = singa::ConcatenateRows(vector<Tensor>{d, e});
-  EXPECT_EQ(ret.shape(0), d.shape(0) + e.shape(0));
-  EXPECT_EQ(ret.shape(1), d.shape(1));
-  const float *retPtr = ret.data<float>();
-  for (int i = 0; i < 6; i++) EXPECT_FLOAT_EQ(retPtr[i], dat1[i]);
-  for (int i = 0; i < 6; i++) EXPECT_FLOAT_EQ(retPtr[i + 6], dat2[i]);
+  auto y = singa::Transpose(x, {1, 2, 0});
+  Exp(singa::Reshape(a, singa::Shape{1, 3, 2}), &y);
+  y.ToHost();
+  const float *dptr1 = y.data<float>();
+  EXPECT_NEAR(exp(dat1[0]), dptr1[0], 1e-5);
+  EXPECT_NEAR(exp(dat1[4]), dptr1[2], 1e-5);
+  EXPECT_NEAR(exp(dat1[3]), dptr1[4], 1e-5);
 }
 
-TEST_F(TestTensorMath, ConcatenateColumnsCpp) {
-  d.CopyDataFromHostPtr<float>(dat1, 6);
-  e.CopyDataFromHostPtr<float>(dat2, 6);
-  const auto ret = singa::ConcatenateColumns(vector<Tensor>{d, e});
-  EXPECT_EQ(ret.shape(0), d.shape(0));
-  EXPECT_EQ(ret.shape(1), d.shape(1) + e.shape(1));
-
-  const float *retPtr = ret.data<float>();
-  for (int i = 0; i < 3; i++) {
-    for (int j = 0; j < 2; j++)
-      EXPECT_FLOAT_EQ(retPtr[i * 4 + j], dat1[i * 2 + j]);
-    for (int j = 0; j < 2; j++)
-      EXPECT_FLOAT_EQ(retPtr[i * 4 + 2 + j], dat2[i * 2 + j]);
-  }
-}
-
-TEST_F(TestTensorMath, CopyRowsCpp) {
-  const auto ret = singa::CopyRows(e, 1, 2);
-  EXPECT_EQ(ret.shape(0), 1u);
-  EXPECT_EQ(ret.shape(1), e.shape(1));
-  const float *retPtr = ret.data<float>();
-  for (size_t i = 0; i < ret.Size(); i++)
-    EXPECT_FLOAT_EQ(retPtr[i], dat1[1 * 2 + i]);
-}
-
-TEST_F(TestTensorMath, CopyColumnsCpp) {
-  a.Reshape(Shape{2, 3});
-  const auto ret = singa::CopyColumns(a, 1, 3);
-  EXPECT_EQ(ret.shape(0), a.shape(0));
-  EXPECT_EQ(ret.shape(1), 2u);
-  const float *retPtr = ret.data<float>();
-  for (size_t i = 0; i < ret.shape(0); i++)
-    for (size_t j = 0; j < ret.shape(1); j++)
-      EXPECT_FLOAT_EQ(retPtr[i * ret.shape(1) + j],
-                      dat1[i * a.shape(1) + j + 1]);
-}
-
-#ifdef USE_CUDA
-
-TEST_F(TestTensorMath, ConcatenateRowsCuda) {
+TEST_F(TensorMath, ConcatenateRowsCuda) {
   auto dev = std::make_shared<singa::CudaGPU>();
   d.ToDevice(dev);
   e.ToDevice(dev);
   d.CopyDataFromHostPtr<float>(dat1, 6);
   e.CopyDataFromHostPtr<float>(dat2, 6);
-  auto ret = singa::ConcatenateRows(vector<Tensor>{d, e});
+  auto ret = singa::ConcatenateRows(vector<Tensor> {d, e});
   EXPECT_EQ(ret.shape(0), d.shape(0) + e.shape(0));
   EXPECT_EQ(ret.shape(1), d.shape(1));
   ret.ToHost();
@@ -851,13 +947,13 @@
   for (int i = 0; i < 6; i++) EXPECT_FLOAT_EQ(retPtr[i + 6], dat2[i]);
 }
 
-TEST_F(TestTensorMath, ConcatenateColumnsCuda) {
+TEST_F(TensorMath, ConcatenateColumnsCuda) {
   auto dev = std::make_shared<singa::CudaGPU>();
   d.ToDevice(dev);
   e.ToDevice(dev);
   d.CopyDataFromHostPtr<float>(dat1, 6);
   e.CopyDataFromHostPtr<float>(dat2, 6);
-  auto ret = singa::ConcatenateColumns(vector<Tensor>{d, e});
+  auto ret = singa::ConcatenateColumns(vector<Tensor> {d, e});
   ret.ToHost();
   EXPECT_EQ(ret.shape(0), d.shape(0));
   EXPECT_EQ(ret.shape(1), d.shape(1) + e.shape(1));
@@ -871,7 +967,7 @@
   }
 }
 
-TEST_F(TestTensorMath, CopyRowsCuda) {
+TEST_F(TensorMath, CopyRowsCuda) {
   auto dev = std::make_shared<singa::CudaGPU>();
   e.ToDevice(dev);
   auto ret = singa::CopyRows(e, 1, 2);
@@ -883,7 +979,7 @@
     EXPECT_FLOAT_EQ(retPtr[i], dat1[1 * 2 + i]);
 }
 
-TEST_F(TestTensorMath, CopyColumnsCuda) {
+TEST_F(TensorMath, CopyColumnsCuda) {
   auto dev = std::make_shared<singa::CudaGPU>();
   a.Reshape(Shape{2, 3});
   a.ToDevice(dev);
@@ -898,4 +994,85 @@
                       dat1[i * a.shape(1) + j + 1]);
 }
 
+
+TEST_F(TensorMath, BroadcastCuda) {
+  auto dev = std::make_shared<singa::CudaGPU>();
+  Tensor x(Shape{1});
+  x.ToDevice(dev);
+  x.SetValue(1.0f);
+  a.ToDevice(dev);
+  {
+    auto y = a + x;
+    y.ToHost();
+    const float *dptr = y.data<float>();
+    EXPECT_FLOAT_EQ(2.0f, dptr[0]);
+    EXPECT_FLOAT_EQ(3.0f, dptr[1]);
+    EXPECT_FLOAT_EQ(4.0f, dptr[2]);
+  }
+
+  e.ToDevice(dev);
+  {
+    auto y = e + x;
+    y.ToHost();
+    const float *dptr = y.data<float>();
+    EXPECT_FLOAT_EQ(2.0f, dptr[0]);
+    EXPECT_FLOAT_EQ(3.0f, dptr[1]);
+    EXPECT_FLOAT_EQ(4.0f, dptr[2]);
+  }
+
+  auto p = Reshape(e, Shape{3, 1, 2});
+  {
+    Tensor q(Shape{3, 1, 1}, dev);
+    q.CopyDataFromHostPtr(dat1, 3);
+    auto z = p + q;
+    z.ToHost();
+    const float *dptr = z.data<float>();
+    EXPECT_FLOAT_EQ(2.0f, dptr[0]);
+    EXPECT_FLOAT_EQ(3.0f, dptr[1]);
+    EXPECT_FLOAT_EQ(5.0f, dptr[2]);
+    EXPECT_FLOAT_EQ(6.0f, dptr[3]);
+    EXPECT_FLOAT_EQ(8.0f, dptr[4]);
+    EXPECT_FLOAT_EQ(9.0f, dptr[5]);
+  }
+
+  {
+    Tensor q(Shape{2}, dev);
+    q.CopyDataFromHostPtr(dat1, 2);
+    auto z = p + q;
+    EXPECT_EQ(z.shape().size(), 3);
+    EXPECT_EQ(z.shape(0), 3);
+    EXPECT_EQ(z.shape(1), 1);
+    EXPECT_EQ(z.shape(2), 2);
+    z.ToHost();
+    const float *dptr = z.data<float>();
+    EXPECT_FLOAT_EQ(2.0f, dptr[0]);
+    EXPECT_FLOAT_EQ(4.0f, dptr[1]);
+    EXPECT_FLOAT_EQ(4.0f, dptr[2]);
+    EXPECT_FLOAT_EQ(6.0f, dptr[3]);
+    EXPECT_FLOAT_EQ(6.0f, dptr[4]);
+    EXPECT_FLOAT_EQ(8.0f, dptr[5]);
+  }
+/*
+  {
+    Tensor q(Shape{3, 1, 2, 1}, dev);
+    q.CopyDataFromHostPtr(dat1, 6);
+    auto z = p + q;
+    z.ToHost();
+    EXPECT_EQ(z.shape().size(), 4);
+    EXPECT_EQ(z.shape(0), 3);
+    EXPECT_EQ(z.shape(1), 3);
+    EXPECT_EQ(z.shape(2), 2);
+    EXPECT_EQ(z.shape(3), 2);
+    const float *dptr = z.data<float>();
+    EXPECT_FLOAT_EQ(2.0f, dptr[0]);
+    EXPECT_FLOAT_EQ(3.0f, dptr[1]);
+    EXPECT_FLOAT_EQ(3.0f, dptr[2]);
+    EXPECT_FLOAT_EQ(4.0f, dptr[3]);
+    EXPECT_FLOAT_EQ(6.0f, dptr[16]);
+    EXPECT_FLOAT_EQ(7.0f, dptr[17]);
+    EXPECT_FLOAT_EQ(7.0f, dptr[18]);
+    EXPECT_FLOAT_EQ(8.0f, dptr[19]);
+  }
+  */
+}
 #endif
diff --git a/tool/conda/cpu/meta.yaml b/tool/conda/cpu/meta.yaml
index 6c6f7cb..169dea8 100644
--- a/tool/conda/cpu/meta.yaml
+++ b/tool/conda/cpu/meta.yaml
@@ -24,7 +24,7 @@
 
 requirements:
   run:
-    - singa 1.2.0 py36_cpu
+    - singa 1.2.0 cpu
 
 about:
   home: http://singa.apache.org/
diff --git a/tool/docker/devel/conda/cuda/Dockerfile b/tool/conda/docker/cuda10/Dockerfile
similarity index 87%
copy from tool/docker/devel/conda/cuda/Dockerfile
copy to tool/conda/docker/cuda10/Dockerfile
index debafe6..c95e652 100644
--- a/tool/docker/devel/conda/cuda/Dockerfile
+++ b/tool/conda/docker/cuda10/Dockerfile
@@ -15,8 +15,15 @@
 # limitations under the License.
 #
 
-# Change tags to build with different cuda/cudnn versions:
-FROM nvidia/cuda:9.0-cudnn7-devel-ubuntu16.04
+# 18.04 has errors in ssh
+FROM nvidia/cuda:10.0-devel-ubuntu16.04
+
+ENV CUDNN_VERSION 7.4.2.24
+RUN apt-get update && apt-get install -y --no-install-recommends \
+        libcudnn7=$CUDNN_VERSION-1+cuda10.0 \
+        libcudnn7-dev=$CUDNN_VERSION-1+cuda10.0 && \
+    apt-mark hold libcudnn7 && \
+    rm -rf /var/lib/apt/lists/*
 
 # install dependencies
 RUN apt-get update \
@@ -45,8 +52,6 @@
     # config ssh service
     && mkdir /var/run/sshd \
     && echo 'root:singa' | chpasswd \
-    # for ubuntu 14.04
-    # RUN sed -i 's/PermitRootLogin without-password/PermitRootLogin yes/' /etc/ssh/sshd_config
     # for ubuntu 16.04 prohibit
     && sed -i 's/PermitRootLogin prohibit-password/PermitRootLogin yes/' /etc/ssh/sshd_config \
     # SSH login fix. Otherwise user is kicked off after login
diff --git a/tool/docker/devel/conda/cuda/Dockerfile b/tool/conda/docker/cuda9/Dockerfile
similarity index 88%
rename from tool/docker/devel/conda/cuda/Dockerfile
rename to tool/conda/docker/cuda9/Dockerfile
index debafe6..fc0ffc4 100644
--- a/tool/docker/devel/conda/cuda/Dockerfile
+++ b/tool/conda/docker/cuda9/Dockerfile
@@ -16,7 +16,16 @@
 #
 
 # Change tags to build with different cuda/cudnn versions:
-FROM nvidia/cuda:9.0-cudnn7-devel-ubuntu16.04
+FROM nvidia/cuda:9.0-devel-ubuntu16.04
+
+
+ENV CUDNN_VERSION 7.4.2.24
+
+RUN apt-get update && apt-get install -y --no-install-recommends \
+        libcudnn7=$CUDNN_VERSION-1+cuda9.0 \
+        libcudnn7-dev=$CUDNN_VERSION-1+cuda9.0 && \
+    apt-mark hold libcudnn7 && \
+    rm -rf /var/lib/apt/lists/*
 
 # install dependencies
 RUN apt-get update \
@@ -45,8 +54,6 @@
     # config ssh service
     && mkdir /var/run/sshd \
     && echo 'root:singa' | chpasswd \
-    # for ubuntu 14.04
-    # RUN sed -i 's/PermitRootLogin without-password/PermitRootLogin yes/' /etc/ssh/sshd_config
     # for ubuntu 16.04 prohibit
     && sed -i 's/PermitRootLogin prohibit-password/PermitRootLogin yes/' /etc/ssh/sshd_config \
     # SSH login fix. Otherwise user is kicked off after login
diff --git a/tool/conda/gpu/meta.yaml b/tool/conda/gpu/meta.yaml
index b78ef29..57c5b21 100644
--- a/tool/conda/gpu/meta.yaml
+++ b/tool/conda/gpu/meta.yaml
@@ -23,7 +23,7 @@
 
 requirements:
   run:
-    - singa 1.2.0 py36_cuda9.0_cudnn7.1.2
+    - singa 1.2.0 cudnn7.3.1_cuda10.0
 
 about:
   home: http://singa.apache.org/
diff --git a/tool/conda/singa/README.md b/tool/conda/singa/README.md
index 0a3f42d..8d970be 100644
--- a/tool/conda/singa/README.md
+++ b/tool/conda/singa/README.md
@@ -23,28 +23,20 @@
 
 ## Environment variables
 
-Build string is a part of the conda package specification. We include the cuda and cudnn version in it if Singa is compiled with CUDA enabled.
+We export the CUDA version if Singa is compiled with CUDA enabled. The cuDNN version is fixed by Singa, and cuDNN is installed from [anaconda cloud](https://anaconda.org/anaconda/cudnn).
 
-	# for singa with gpu, e.g. cuda8.0-cudnn7.0.5
-    export BUILD_STR=cudax.y-cudnna.b.c
+    # for singa with gpu, e.g. cuda9.0-cudnn7.3.1
+    export CUDA=9.0
 
-    # for singa running only on cpu
-    export BUILD_STR=cpu
-
-
-To package Singa with CUDA and CUDNN,
-
-    export CUDNN_PATH=<path to cudnn folder>
-
-this folder should include a subfolder `include/cudnn.h` for the header file, and another subfolder `lib64` for the shared libraries. The BUILD_STR and CUDNN_PATH must be consistent. For example, if CUDNN_PATH is set, then BUILD_STR must be like cudax.y-cudnna.b.c. CUDNN must be provided if we want to compiled Singa with CUDA enabled.
+For the CPU-only version, we do not export CUDA.
 
 ## Instruction
 
 After exporting the environment variables, execute the following command to compile Singa and package it
 
-    conda-build .  --python 3.6  (or 2.7)
+    conda-build .  --python 3.6
 
-You will see the package path from the screen output.
+You will see the package path from the screen output, e.g., `xx/yy/singa-1.2.0-cpu.tar.bz2` or `xx/yy/singa-1.2.0-cudnn7.3.1_cuda9.0.tar.bz2`.
 
 To clean the cache
 
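
For context, a minimal end-to-end packaging session following the README above might look like this; it is a sketch, assuming conda-build is installed and, for the GPU package, that the CUDA 9.0 toolkit is available:

    # from the repository root (illustrative session)
    cd tool/conda/singa
    export CUDA=9.0              # omit this line for the CPU-only package
    conda-build . --python 3.6
    # install the freshly built package from the local channel
    conda install --use-local singa

The resulting build string (e.g., `cudnn7.3.1_cuda9.0` or `cpu`) is selected by `conda_build_config.yaml` below, based on the exported `CUDA` value.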
diff --git a/tool/conda/singa/build.sh b/tool/conda/singa/build.sh
index b54e451..a486b56 100644
--- a/tool/conda/singa/build.sh
+++ b/tool/conda/singa/build.sh
@@ -23,23 +23,23 @@
 export CMAKE_INCLUDE_PATH=$PREFIX/include:$CMAKE_INCLUDE_PATH
 export CMAKE_LIBRARY_PATH=$PREFIX/lib:$CMAKE_LIBRARY_PATH
 
-echo "----------------------$CUDNN_PATH---------------"
 
-if [ -z ${CUDNN_PATH+x} ]; then
+
+# USE_PYTHON3=OFF
+# PY3K is set by conda
+# if  [ "$PY3K" == "1" ]; then USE_PYTHON3=ON; fi
+
+# if [ -z ${CUDA+x} ]; then
+if [ -z "$CUDA" ]; then
 	USE_CUDA=OFF
 else
 	USE_CUDA=ON
-	cp $CUDNN_PATH/include/* $PREFIX/include/ 
-	cp -P $CUDNN_PATH/lib64/libcudnn.so* $PREFIX/lib/
 fi
 
-USE_PYTHON3=OFF
-# PY3K is set by conda
-if  [ "$PY3K" == "1" ]; then USE_PYTHON3=ON; fi
-
 
 mkdir build
 cd build
-cmake -DCMAKE_INSTALL_PREFIX=$PREFIX -DUSE_CUDA=$USE_CUDA -DUSE_PYTHON3=$USE_PYTHON3 ..
+cmake -DCMAKE_INSTALL_PREFIX=$PREFIX -DUSE_CUDA=$USE_CUDA \
+	-DUSE_PYTHON3=ON -DUSE_MKLDNN=ON ..
 make
 make install
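
For readers who want to reproduce the same configuration outside conda, a rough equivalent of the cmake invocation in build.sh is sketched below (illustrative install prefix; assumes protobuf, openblas, glog, swig, and mkl-dnn are already installed):

    mkdir build && cd build
    cmake -DCMAKE_INSTALL_PREFIX=/usr/local \
          -DUSE_CUDA=ON -DUSE_PYTHON3=ON -DUSE_MKLDNN=ON ..
    make
    make install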
diff --git a/tool/conda/singa/conda_build_config.yaml b/tool/conda/singa/conda_build_config.yaml
new file mode 100644
index 0000000..aa516db
--- /dev/null
+++ b/tool/conda/singa/conda_build_config.yaml
@@ -0,0 +1,32 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+
+c_compiler_version:         # [linux]
+    - 5.4                   # [linux]
+cxx_compiler_version:       # [linux]
+    - 5.4                   # [linux]
+cudnn:                      # [linux]
+    - "7.3.1 cuda10.0_0"    # [environ.get("CUDA")=="10.0"]
+    - "7.3.1 cuda9.0_0"     # [environ.get("CUDA")=="9.0"]
+mkldnn:
+    - 0.14
+build_str:
+    - "cudnn7.3.1_cuda10.0" # [environ.get("CUDA")=="10.0"]
+    - "cudnn7.3.1_cuda9.0"  # [environ.get("CUDA")=="9.0"]
+    - "cpu"                 # [environ.get("CUDA", "")== ""]
\ No newline at end of file
diff --git a/tool/conda/singa/meta.yaml b/tool/conda/singa/meta.yaml
index ff576ea..6cca7c4 100644
--- a/tool/conda/singa/meta.yaml
+++ b/tool/conda/singa/meta.yaml
@@ -22,39 +22,46 @@
   version: "{{ GIT_DESCRIBE_TAG }}"
 
 source:
-  # path: /path to/incubator-singa/
+  # path: ../../../
   git_url: https://github.com/apache/incubator-singa.git
 
 build:
   number: 0
-  string: py{{py}}_{{ environ.get('BUILD_STR', 'cpu') }}
   script_env:
-    - CUDNN_PATH
+    - CUDA   # E.g., export CUDA=9.0
+  string: {{ build_str }}
+  # py{{py}}_{{ environ.get('BUILD_STR', 'cpu') }}
 
 requirements:
   build:
-    - swig 3.0.10
-    - openblas 0.2.19
-    - protobuf 3.2.0
-    - glog 0.3.4
-    - libgfortran 3.0.0 # [osx]
-    - gcc 4.8.5 # [linux]
+    - {{ compiler('cxx') }}
+    - {{ compiler('c') }}
+    - cmake >=3.12.2
+    - make # [unix]
+
+  host:
+    - swig 3.0.12
+    - openblas 0.3.3
+    - protobuf 3.4.0
+    - glog 0.3.5
     - python 3.6*
-    - numpy 1.12.0
+    - numpy 1.16.0
+    - cudnn {{ cudnn }}       # ['cudnn' in str(build_str)]
+    - mkl-dnn {{ mkldnn }}
 
   run:
-    - openblas 0.2.19
-    - protobuf 3.2.0
-    - glog 0.3.4
-    - libgfortran 3.0.0 # [osx]
-    - libgcc 4.8.5 # [linux]
+    - {{ pin_compatible('libopenblas', max_pin='x.x') }}
+    - {{ pin_compatible('libprotobuf', max_pin='x.x') }}
+    - {{ pin_compatible('glog', max_pin='x.x') }}
+    - {{ pin_compatible('numpy', max_pin='x.x') }}
+    - {{ pin_compatible('mkl-dnn', max_pin='x.x') }}
+    - cudnn {{ cudnn }}       # ['cudnn' in str(build_str)]
     - python 3.6*
-    - numpy >=1.12.0
-    - flask >=0.10.1
-    - flask-cors >=3.0.2
-    - pillow >=2.3.0
-    - future >=0.16.0
+    - pillow
+    - future
     - tqdm
+    - onnx >=1.3.0
+
 
 test:
   source_files:
diff --git a/tool/docker/README.md b/tool/docker/README.md
index c95ab49..287e52d 100644
--- a/tool/docker/README.md
+++ b/tool/docker/README.md
@@ -16,23 +16,56 @@
     specific language governing permissions and limitations
     under the License.
 -->
-# SINGA Docker Images
-
-## Availabe images
+# Docker Images
 
 
-| Tag | OS version | devel/runtime | Device|CUDA/CUDNN|Python|
-|:----|:-----------|:--------------|:------|:---------|:-----|
-|runtime| Ubuntu16.04|runtime|CPU|-|3.6|
-|conda-cuda9.0| Ubuntu16.04|devel|GPU|CUDA9.0+CUDNN7.1.2|3.6|
-|cuda9.0-py2| Ubuntu16.04|devel|GPU|CUDA9.0+CUDNN7.1.2|2.7|
-|cuda9.0-py3| Ubuntu16.04|devel|GPU|CUDA9.0+CUDNN7.1.2|3.6|
+## Available tags
 
-runtime and conda-xxx image has installed miniconda3;
-cudaxxx images have installed all depedent libs using apt-get.
+* `devel`, with SINGA and the development packages installed on Ubuntu16.04 (no GPU)
+* `devel-cuda`, with SINGA, CUDA8.0, CUDNN5, and other development packages installed on Ubuntu16.04
 
-## Usage
+## Use the existing Docker images
 
-    docker pull nusdbsystem/singa:<Tag>
-    docker run -it nusdbsystem/singa:<Tag> /bin/bash
-    nvidia-docker run -it nusdbsystem/singa:<Tag> /bin/bash
+Users can pull the Docker images from Dockerhub via
+
+    docker pull apache/singa:devel
+    # or
+    docker pull apache/singa:devel-cuda
+
+Run the docker container using
+
+    docker run -it apache/singa:devel /bin/bash
+    # or
+    docker run -it apache/singa:devel-cuda /bin/bash
+
+The latest SINGA code is under the `incubator-singa` folder.
+
+## Create new Docker images from Dockerfile
+
+New Docker images can be created by executing the following command within the
+Dockerfile folder, e.g., tool/docker/devel/
+
+    docker build -t singa:<TAG> -f Dockerfile .
+
+The `<TAG>` is named as
+
+    devel|runtime[-CUDA|CPU][-CUDNN]
+
+* devel: development images with all dependent libs' header files and SINGA's source code installed
+* runtime: minimal images that can run SINGA programs
+* CUDA: cuda10.0, cuda9.0
+* CUDNN: cudnn7
+
+Here are some example tags:
+
+`devel-cuda9-cudnn7`, `devel-cuda10-cudnn7`, `devel-cpu`, `runtime-gpu`, and `runtime-cpu`
+
+
+Please follow the existing Dockerfiles under tool/docker/ to create other Dockerfiles.
+The folder structure is as follows:
+
+    level1: devel|runtime
+    level2: Dockerfile, OS
+    level3: Dockerfile, CUDA|MKLDNN
+
+
+For example, the path of the Dockerfile for `devel-cuda9-cudnn7` is `tool/docker/devel/ubuntu/cuda9/Dockerfile`.
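
As a concrete illustration of the build command above (a hypothetical invocation; the tag follows the naming scheme described in this README):

    # build the CUDA 9 development image from the repository root
    docker build -t singa:devel-cuda9-cudnn7 \
        -f tool/docker/devel/ubuntu/cuda9/Dockerfile .
    # run it with GPU access (requires the NVIDIA container runtime)
    docker run --runtime=nvidia -it singa:devel-cuda9-cudnn7 /bin/bash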
diff --git a/tool/docker/build.sh b/tool/docker/build.sh
index 666c91f..d0bcaa3 100755
--- a/tool/docker/build.sh
+++ b/tool/docker/build.sh
@@ -30,8 +30,7 @@
 echo "###################"
 echo "build singa:cudax.y"
 echo "###################"
-docker build tool/docker/devel/native/ubuntu/cuda/py2 --force-rm -t nusdbsystem/singa:cuda9.0-cudnn7.1.2-py2
-docker build tool/docker/devel/native/ubuntu/cuda/py3 --force-rm -t nusdbsystem/singa:cuda9.0-cudnn7.1.2-py3
+docker build tool/docker/devel/native/ubuntu/cuda9 --force-rm -t nusdbsystem/singa:cuda9-cudnn7
 
 if [ $1 = "PUSH" ]; then
   echo "##########################################"
diff --git a/tool/docker/devel/native/centos6/Dockerfile b/tool/docker/devel/centos6/Dockerfile
similarity index 100%
rename from tool/docker/devel/native/centos6/Dockerfile
rename to tool/docker/devel/centos6/Dockerfile
diff --git a/tool/docker/devel/native/ubuntu/cuda/py2/Dockerfile b/tool/docker/devel/native/ubuntu/cuda/py2/Dockerfile
deleted file mode 100644
index c48adf2..0000000
--- a/tool/docker/devel/native/ubuntu/cuda/py2/Dockerfile
+++ /dev/null
@@ -1,71 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-# 
-# Change tags to build with different cuda/cudnn versions:
-FROM nvidia/cuda:9.0-cudnn7-devel-ubuntu16.04
-
-# install dependencies
-RUN apt-get update \
-    && apt-get install -y --no-install-recommends \
-        git \
-        build-essential \
-        autoconf \
-        libtool \
-        cmake \
-        libprotobuf-dev \
-        libopenblas-dev \
-        libpcre3-dev \
-        protobuf-compiler \
-        wget \
-        openssh-server \
-        swig \
-        python-dev \
-        python-pip \
-        python-setuptools \
-    && apt-get clean \
-    && apt-get autoremove \
-    && apt-get autoclean \
-    && rm -rf /var/lib/apt/lists/* \
-    && pip install -U --no-cache \
-        pip \
-        wheel \
-        numpy \
-        setuptools \
-        protobuf \
-        future \
-    # install swig > 3.0.10 for ubuntu < 16.04
-    # RUN wget http://prdownloads.sourceforge.net/swig/swig-3.0.10.tar.gz && \
-    #     tar zxf swig-3.0.10.tar.gz && cd swig-3.0.10 && \
-    #     ./configure && make && make install
-    # set environment
-    # ENV CMAKE_INCLUDE_PATH /usr/local/cuda/include:${CMAKE_INCLUDE_PATH}
-    # ENV CMAKE_LIBRARY_PATH /usr/local/cuda/lib64:${CMAKE_LIBRARY_PATH}
-    # config ssh service
-    && mkdir /var/run/sshd \
-    && echo 'root:singa' | chpasswd \
-    # for ubuntu 14.04
-    # RUN sed -i 's/PermitRootLogin without-password/PermitRootLogin yes/' /etc/ssh/sshd_config
-    # for ubuntu 16.04 prohibit
-    && sed -i 's/PermitRootLogin prohibit-password/PermitRootLogin yes/' /etc/ssh/sshd_config \
-    # SSH login fix. Otherwise user is kicked off after login
-    && sed 's@session\s*required\s*pam_loginuid.so@session optional pam_loginuid.so@g' -i /etc/pam.d/sshd \
-    # dump environment variables into files, so that ssh can see also
-    && env | grep _ >> /etc/environment
-
-EXPOSE 22
-
-CMD ["/usr/sbin/sshd", "-D"]
diff --git a/tool/docker/devel/ubuntu/cpu/Dockerfile b/tool/docker/devel/ubuntu/cpu/Dockerfile
new file mode 100644
index 0000000..d8a6cc1
--- /dev/null
+++ b/tool/docker/devel/ubuntu/cpu/Dockerfile
@@ -0,0 +1,75 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+FROM ubuntu:18.04
+
+# install dependencies
+RUN apt-get update \
+    && apt-get install -y --no-install-recommends \
+        git \
+        build-essential \
+        autoconf \
+        libtool \
+        libprotobuf-dev \
+        libopenblas-dev \
+        libpcre3-dev \
+        protobuf-compiler \
+        wget \
+        swig \
+        openssh-server \
+        python3-dev \
+        python3-pip \
+        python3-setuptools \
+        libgoogle-glog-dev \
+        cmake \
+    && apt-get clean \
+    && apt-get autoremove \
+    && apt-get autoclean \
+    && rm -rf /var/lib/apt/lists/* \
+    && pip3 install -U --no-cache \
+        wheel \
+        numpy \
+        setuptools \
+        protobuf \
+        future
+
+# install mkldnn
+RUN wget https://github.com/intel/mkl-dnn/archive/v0.18.tar.gz -P /tmp/ \
+    && tar zxf /tmp/v0.18.tar.gz -C /tmp/ \
+    && cd /tmp/mkl-dnn-0.18/ \
+    && mkdir -p build && cd build && cmake .. \
+    && make && make install
+
+# config ssh service
+RUN mkdir /var/run/sshd \
+    && echo 'root:singa' | chpasswd \
+    && sed -ri 's/^#?PermitRootLogin\s+.*/PermitRootLogin yes/' /etc/ssh/sshd_config \
+    && sed -ri 's/UsePAM yes/#UsePAM yes/g' /etc/ssh/sshd_config \
+    && mkdir /root/.ssh
+
+# build incubator singa
+RUN git clone https://github.com/apache/incubator-singa.git $HOME/incubator-singa \
+    && cd $HOME/incubator-singa \
+    && mkdir build && cd build \
+    && cmake -DENABLE_TEST=ON -DUSE_PYTHON3=ON -DUSE_MKLDNN=ON ..
+RUN cd $HOME/incubator-singa/build && make && make install
+
+WORKDIR $HOME/incubator-singa
+EXPOSE 22
+
+CMD ["/usr/sbin/sshd", "-D"]
+
diff --git a/tool/docker/devel/ubuntu/cuda10/Dockerfile b/tool/docker/devel/ubuntu/cuda10/Dockerfile
new file mode 100644
index 0000000..6d29f9b
--- /dev/null
+++ b/tool/docker/devel/ubuntu/cuda10/Dockerfile
@@ -0,0 +1,86 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# 
+# Change tags to build with different cuda/cudnn versions:
+FROM nvidia/cuda:10.0-devel-ubuntu18.04
+
+ENV CUDNN_VERSION 7.4.2.24
+RUN apt-get update && apt-get install -y --no-install-recommends \
+        libcudnn7=$CUDNN_VERSION-1+cuda10.0 \
+        libcudnn7-dev=$CUDNN_VERSION-1+cuda10.0 && \
+    apt-mark hold libcudnn7 && \
+    rm -rf /var/lib/apt/lists/*
+
+# install dependencies
+RUN apt-get update \
+    && apt-get install -y --no-install-recommends \
+        git \
+        build-essential \
+        autoconf \
+        libtool \
+        libprotobuf-dev \
+        libopenblas-dev \
+        libpcre3-dev \
+        protobuf-compiler \
+        wget \
+        swig \
+        openssh-server \
+        python3-dev \
+        python3-pip \
+        python3-setuptools \
+        libgoogle-glog-dev \
+    && apt-get clean \
+    && apt-get autoremove \
+    && apt-get autoclean \
+    && rm -rf /var/lib/apt/lists/* \
+    && pip3 install -U --no-cache \
+        wheel \
+        numpy \
+        setuptools \
+        protobuf \
+        future
+
+# install cmake to correctly find CUDA 10
+RUN wget https://github.com/Kitware/CMake/releases/download/v3.12.2/cmake-3.12.2.tar.gz -P /tmp/ \
+    && tar zxf /tmp/cmake-3.12.2.tar.gz -C /tmp/ \
+    && cd /tmp/cmake-3.12.2/ && ./bootstrap && make -j4 && make install
+
+# install mkldnn
+RUN wget https://github.com/intel/mkl-dnn/archive/v0.18.tar.gz -P /tmp/ \
+    && tar zxf /tmp/v0.18.tar.gz -C /tmp/ \
+    && cd /tmp/mkl-dnn-0.18/ \
+    && mkdir -p build && cd build && cmake .. \
+    && make && make install
+
+# config ssh service
+RUN mkdir /var/run/sshd \
+    && echo 'root:singa' | chpasswd \
+    && sed -ri 's/^#?PermitRootLogin\s+.*/PermitRootLogin yes/' /etc/ssh/sshd_config \
+    && sed -ri 's/UsePAM yes/#UsePAM yes/g' /etc/ssh/sshd_config \
+    && mkdir /root/.ssh
+
+# build incubator singa
+RUN git clone https://github.com/apache/incubator-singa.git $HOME/incubator-singa \
+    && cd $HOME/incubator-singa \
+    && mkdir build && cd build \
+    && /usr/local/bin/cmake -DENABLE_TEST=ON -DUSE_CUDA=ON -DUSE_PYTHON3=ON -DUSE_MKLDNN=ON ..
+RUN cd $HOME/incubator-singa/build && make && make install
+
+WORKDIR $HOME/incubator-singa
+EXPOSE 22
+
+CMD ["/usr/sbin/sshd", "-D"]
diff --git a/tool/docker/devel/native/ubuntu/cuda/py3/Dockerfile b/tool/docker/devel/ubuntu/cuda9/Dockerfile
similarity index 62%
rename from tool/docker/devel/native/ubuntu/cuda/py3/Dockerfile
rename to tool/docker/devel/ubuntu/cuda9/Dockerfile
index edae0f1..98a0a88 100644
--- a/tool/docker/devel/native/ubuntu/cuda/py3/Dockerfile
+++ b/tool/docker/devel/ubuntu/cuda9/Dockerfile
@@ -16,7 +16,15 @@
 #
 # 
 # Change tags to build with different cuda/cudnn versions:
-FROM nvidia/cuda:9.0-cudnn7-devel-ubuntu16.04
+FROM nvidia/cuda:9.0-devel-ubuntu16.04
+
+ENV CUDNN_VERSION 7.4.2.24
+
+RUN apt-get update && apt-get install -y --no-install-recommends \
+        libcudnn7=$CUDNN_VERSION-1+cuda9.0 \
+        libcudnn7-dev=$CUDNN_VERSION-1+cuda9.0 && \
+    apt-mark hold libcudnn7 && \
+    rm -rf /var/lib/apt/lists/*
 
 # install dependencies
 RUN apt-get update \
@@ -25,17 +33,17 @@
         build-essential \
         autoconf \
         libtool \
-        cmake \
         libprotobuf-dev \
         libopenblas-dev \
         libpcre3-dev \
         protobuf-compiler \
         wget \
         openssh-server \
-        swig \
         python3-dev \
         python3-pip \
         python3-setuptools \
+        libgoogle-glog-dev \
+        cmake \
     && apt-get clean \
     && apt-get autoremove \
     && apt-get autoclean \
@@ -45,19 +53,23 @@
         numpy \
         setuptools \
         protobuf \
-        future \
-    # install swig > 3.0.10 for ubuntu < 16.04
-    # RUN wget http://prdownloads.sourceforge.net/swig/swig-3.0.10.tar.gz && \
-    #     tar zxf swig-3.0.10.tar.gz && cd swig-3.0.10 && \
-    #     ./configure && make && make install
-    # set environment
-    # ENV CMAKE_INCLUDE_PATH /usr/local/cuda/include:${CMAKE_INCLUDE_PATH}
-    # ENV CMAKE_LIBRARY_PATH /usr/local/cuda/lib64:${CMAKE_LIBRARY_PATH}
-    # config ssh service
-    && mkdir /var/run/sshd \
+        future
+
+# install swig > 3.0.10
+RUN wget http://prdownloads.sourceforge.net/swig/swig-3.0.10.tar.gz -P /tmp/ \
+    && tar zxf /tmp/swig-3.0.10.tar.gz -C /tmp/ \
+    && cd /tmp/swig-3.0.10 && ./configure && make && make install
+
+# install mkldnn
+RUN wget https://github.com/intel/mkl-dnn/archive/v0.18.tar.gz -P /tmp/ \
+    && tar zxf /tmp/v0.18.tar.gz -C /tmp/ \
+    && cd /tmp/mkl-dnn-0.18/ \
+    && mkdir -p build && cd build && cmake .. \
+    && make && make install
+
+# config ssh service
+RUN mkdir /var/run/sshd \
     && echo 'root:singa' | chpasswd \
-    # for ubuntu 14.04
-    # RUN sed -i 's/PermitRootLogin without-password/PermitRootLogin yes/' /etc/ssh/sshd_config
     # for ubuntu 16.04 (the default is PermitRootLogin prohibit-password)
     && sed -i 's/PermitRootLogin prohibit-password/PermitRootLogin yes/' /etc/ssh/sshd_config \
     # SSH login fix. Otherwise user is kicked off after login
@@ -65,6 +77,14 @@
     # dump environment variables into files, so that ssh sessions can also see them
     && env | grep _ >> /etc/environment
 
+# build incubator singa
+RUN git clone https://github.com/apache/incubator-singa.git $HOME/incubator-singa \
+    && cd $HOME/incubator-singa \
+    && mkdir build && cd build \
+    && cmake -DENABLE_TEST=ON -DUSE_CUDA=ON -DUSE_MKLDNN=ON -DUSE_PYTHON3=ON ..
+RUN cd $HOME/incubator-singa/build && make && make install
+
+WORKDIR $HOME/incubator-singa
 EXPOSE 22
 
 CMD ["/usr/sbin/sshd", "-D"]
diff --git a/tool/docker/runtime/Dockerfile b/tool/docker/runtime/cpu/Dockerfile
similarity index 71%
rename from tool/docker/runtime/Dockerfile
rename to tool/docker/runtime/cpu/Dockerfile
index d250612..c4717c1 100644
--- a/tool/docker/runtime/Dockerfile
+++ b/tool/docker/runtime/cpu/Dockerfile
@@ -17,34 +17,36 @@
 # limitations under the License.
 #
 # Base ubuntu 16.04 image
-FROM ubuntu:latest
+FROM ubuntu:18.04
 
 MAINTAINER incubator-singa dev@singa.incubator.apache.org
 
 # install dependencies
 RUN apt-get update \
-    && apt-get install -y --no-install-recommends subversion git wget openssh-server bzip2\
+    && apt-get install -y --no-install-recommends subversion git wget openssh-server bzip2 \
     && apt-get clean && apt-get autoremove && apt-get autoclean \
     && rm -rf /var/lib/apt/lists/*
 
-RUN wget --no-check-certificate https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh -O miniconda.sh;
+# install conda
+RUN wget --no-check-certificate https://repo.continuum.io/miniconda/Miniconda3-4.5.12-Linux-x86_64.sh -O miniconda.sh;
 RUN bash miniconda.sh -b -p /root/miniconda
 ENV PATH /root/miniconda/bin:${PATH}
 RUN conda config --set always_yes yes --set changeps1 no
-RUN conda update -q conda
+RUN conda install -c nusdbsystem singa-cpu
 RUN conda install -c conda-forge sphinx
 RUN conda install -c conda-forge sphinx_rtd_theme
 RUN conda install -c conda-forge recommonmark
-RUN conda install -c nusdbsystem singa_cpu
 
-RUN mkdir /var/run/sshd
-RUN echo 'root:singa' | chpasswd
-RUN sed -i 's/PermitRootLogin prohibit-password/PermitRootLogin yes/' /etc/ssh/sshd_config
-# SSH login fix. Otherwise user is kicked off after login
-RUN sed 's@session\s*required\s*pam_loginuid.so@session optional pam_loginuid.so@g' -i /etc/pam.d/sshd
 
-# dump environment variables into files, so that ssh can see also
-# RUN env | grep _ >> /etc/environment
+# config ssh service
+RUN mkdir /var/run/sshd \
+    && echo 'root:singa' | chpasswd \
+    && sed -ri 's/^#?PermitRootLogin\s+.*/PermitRootLogin yes/' /etc/ssh/sshd_config \
+    && sed -ri 's/UsePAM yes/#UsePAM yes/g' /etc/ssh/sshd_config \
+    && mkdir /root/.ssh
+
+# add conda bin path for login shells (e.g., ssh sessions) via /etc/profile
+RUN echo PATH=$PATH:/root/miniconda/bin >> /etc/profile
 
 EXPOSE 22
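A sketch of using the CPU runtime image once built; the tag singa:runtime-cpu, container name, and host port are illustrative assumptions:

    docker build -t singa:runtime-cpu .
    docker run --name singa-cpu -d -p 2222:22 singa:runtime-cpu
    # log in as root (password 'singa'); the session is a login shell,
    # so /etc/profile puts conda's python on PATH
    ssh -p 2222 root@localhost
    # inside the session, a quick import check of the conda package:
    python -c "from singa import tensor"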
 
diff --git a/tool/docker/runtime/Dockerfile b/tool/docker/runtime/gpu/Dockerfile
similarity index 71%
copy from tool/docker/runtime/Dockerfile
copy to tool/docker/runtime/gpu/Dockerfile
index d250612..89f7754 100644
--- a/tool/docker/runtime/Dockerfile
+++ b/tool/docker/runtime/gpu/Dockerfile
@@ -17,34 +17,37 @@
 # limitations under the License.
 #
 # Base ubuntu 16.04 image
-FROM ubuntu:latest
+FROM nvidia/cuda:9.0-devel-ubuntu16.04
 
 MAINTAINER incubator-singa dev@singa.incubator.apache.org
 
 # install dependencies
 RUN apt-get update \
-    && apt-get install -y --no-install-recommends subversion git wget openssh-server bzip2\
+    && apt-get install -y --no-install-recommends subversion git wget openssh-server bzip2 \
     && apt-get clean && apt-get autoremove && apt-get autoclean \
     && rm -rf /var/lib/apt/lists/*
 
-RUN wget --no-check-certificate https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh -O miniconda.sh;
+# install conda
+RUN wget --no-check-certificate https://repo.continuum.io/miniconda/Miniconda3-4.5.12-Linux-x86_64.sh -O miniconda.sh;
 RUN bash miniconda.sh -b -p /root/miniconda
 ENV PATH /root/miniconda/bin:${PATH}
 RUN conda config --set always_yes yes --set changeps1 no
-RUN conda update -q conda
+RUN conda install -c nusdbsystem singa-gpu
 RUN conda install -c conda-forge sphinx
 RUN conda install -c conda-forge sphinx_rtd_theme
 RUN conda install -c conda-forge recommonmark
-RUN conda install -c nusdbsystem singa_cpu
 
-RUN mkdir /var/run/sshd
-RUN echo 'root:singa' | chpasswd
-RUN sed -i 's/PermitRootLogin prohibit-password/PermitRootLogin yes/' /etc/ssh/sshd_config
-# SSH login fix. Otherwise user is kicked off after login
-RUN sed 's@session\s*required\s*pam_loginuid.so@session optional pam_loginuid.so@g' -i /etc/pam.d/sshd
 
-# dump environment variables into files, so that ssh can see also
-# RUN env | grep _ >> /etc/environment
+# config ssh service
+RUN mkdir /var/run/sshd \
+    && echo 'root:singa' | chpasswd \
+    && sed -ri 's/^#?PermitRootLogin\s+.*/PermitRootLogin yes/' /etc/ssh/sshd_config \
+    && sed -ri 's/UsePAM yes/#UsePAM yes/g' /etc/ssh/sshd_config \
+    && mkdir /root/.ssh
+
+# add conda bin path for login shells (e.g., ssh sessions) via /etc/profile
+RUN echo PATH=$PATH:/root/miniconda/bin >> /etc/profile
+
 
 EXPOSE 22
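Because ENV PATH only affects processes that Docker launches directly, the /etc/profile line above is what makes the conda-installed singa-gpu visible to interactive ssh logins. A sketch of exercising the GPU runtime image; tag, container name, and host port are illustrative assumptions:

    nvidia-docker run --name singa-gpu -d -p 2223:22 singa:runtime-gpu
    ssh -p 2223 root@localhost
    # the login shell sources /etc/profile, so python resolves to conda:
    which python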
 
diff --git a/tool/jenkins/README.md b/tool/jenkins/README.md
index 26275e8..bc24189 100644
--- a/tool/jenkins/README.md
+++ b/tool/jenkins/README.md
@@ -103,7 +103,7 @@
 
 To run the docker images,
 
-    nvidia-docker run --name <node name> -d <Image ID> -P
+    nvidia-docker run --name <node name> -P -d <Image ID>
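+
+The reordering matters because docker parses options only up to the image ID; anything after it is handed to the container as its command, so the old form never published the exposed ports. With `-P` in front, the binding can be checked with docker port (container name is a placeholder):
+
+    # show the host port bound to the container's exposed port 22
+    docker port <node name> 22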
 
 To add the container into a network for easy access
 
@@ -124,9 +124,9 @@
 The working nodes (or Docker containers) are configured in Jenkins-Manage Jenkins-Manage Nodes.
 Each node should configure the following environment variable
 
-    export CUDNN_PATH=<path to cudnn folder>
+    export CUDA=<cuda version, e.g., 9.0>
 
-where the cudnn folder should include `inlcude/cudnn.h` and `lib64/libcudnn.so*`. [Dockerfiles](../docker/README.md) are provided to create the working nodes.
+[Dockerfiles](../conda/docker) are provided to create the working nodes.
 
 ## Configure Jenkins for Singa Website Updates