Merge branch 'v1-rc0'
diff --git a/.gitignore b/.gitignore
index 035d147..887c409 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,55 +1,14 @@
-*.orig
 *.swp
 *.o
-*.lo
 *.bin
 *.a
 *.so
-*.dat
-*~
 *.bak
-*.P
-*.odp
-*.project
-*.cproject
 *.log
-*.nfs*
 *.pb.h
 *.pb.cc
-*.hosts
-*.id
-*.pids
-*.tmp
-*.out
-tool/pb2/*
-tool/python/pb2/*
-src/test/data/*
-tmp
-log*
+*.cxx
 build/
-tmp/
-.sync
-*lmdb
-*.binaryproto
-singa
-singatool
-singatest
-.libs
-*.la
-*.deps
-*.dirstamp
-libtool
-stamp-h1
-*.cache
-*.status
-config.h
-Makefile
-config/*
-config.h.in
-configure
-aclocal.m4
-Makefile.in
 thirdparty/*
 !thirdparty/install.sh
-!include/singa
-doc/
+test/samples/
diff --git a/.gitmodules b/.gitmodules
new file mode 100644
index 0000000..cd0a9d2
--- /dev/null
+++ b/.gitmodules
@@ -0,0 +1,3 @@
+[submodule "lib/cnmem"]
+	path = lib/cnmem
+	url = https://github.com/NVIDIA/cnmem.git
diff --git a/.travis.yml b/.travis.yml
new file mode 100644
index 0000000..8b1f89c
--- /dev/null
+++ b/.travis.yml
@@ -0,0 +1,21 @@
+sudo: required
+language: cpp
+compiler: gcc
+dist: trusty
+
+before_install:
+ - sudo apt-get -qq update
+ - sudo apt-get install -qq -y libopenblas-dev libgoogle-glog-dev libprotobuf-dev protobuf-compiler
+ - sudo apt-get install -qq -y opencl-headers ocl-icd-*
+ - wget https://github.com/KhronosGroup/OpenCL-CLHPP/releases/download/v2.0.9/cl2.hpp
+ - sudo mv cl2.hpp /usr/include/CL/
+#- sudo apt-get install -qq libgtest-dev
+
+before_script:
+ - mkdir build && cd build
+ - cmake .. -DUSE_CUDA=OFF -DUSE_CUDNN=OFF -DUSE_PYTHON=OFF -DBUILD_OPENCL_TESTS=OFF
+
+script:
+ - make
+ - ./bin/test_singa --gtest_output=xml:./../gtest.xml
+
diff --git a/CMakeLists.txt b/CMakeLists.txt
new file mode 100644
index 0000000..2bed134
--- /dev/null
+++ b/CMakeLists.txt
@@ -0,0 +1,77 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+# 
+#     http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# 
+
+CMAKE_MINIMUM_REQUIRED(VERSION 2.6)
+
+PROJECT(singa)
+SET(PACKAGE_VERSION "1.0.0")
+SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 -g -O2 ")
+
+LIST(APPEND CMAKE_MODULE_PATH ${PROJECT_SOURCE_DIR}/cmake/Thirdparty)
+#message(STATUS "module path: ${CMAKE_MODULE_PATH}")
+
+# Flags
+IF(UNIX OR APPLE)
+  SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fPIC -Wall")
+ENDIF()
+IF(CMAKE_BUILD_TYPE MATCHES "Debug")
+  SET(NVCC_FLAG "${NVCC_FLAG} -g -G ")
+ENDIF()
+#message(STATUS "${CMAKE_CXX_FLAGS}")
+SET(SINGA_INCLUDE_DIR
+    "${CMAKE_SOURCE_DIR}/include;${CMAKE_SOURCE_DIR}/lib/cnmem/include;${PROJECT_BINARY_DIR}")
+INCLUDE_DIRECTORIES(${SINGA_INCLUDE_DIR})
+
+OPTION(USE_CBLAS "Use CBlas libs" ON)
+OPTION(USE_CUDA "Use Cuda libs" OFF)
+OPTION(USE_CUDNN "Use Cudnn libs" ON)
+OPTION(USE_OPENCV "Use opencv" OFF)
+OPTION(USE_LMDB "Use LMDB libs" OFF)
+OPTION(USE_PYTHON "Generate py wrappers" ON)
+OPTION(USE_OPENCL "Use OpenCL" OFF)
+OPTION(ENABLE_DIST "enable distributed training" OFF)
+#OPTION(BUILD_OPENCL_TESTS "Build OpenCL tests" OFF)
+
+INCLUDE("cmake/Dependencies.cmake")
+INCLUDE("cmake/Utils.cmake")
+ADD_DEFINITIONS(-DUSE_CMAKE)
+#message(STATUS "${SINGA_INCLUDE_DIR}")
+
+CONFIGURE_FILE (
+    "${PROJECT_SOURCE_DIR}/cmake/Templates/singa_config.h.in"
+    "${PROJECT_BINARY_DIR}/include/singa/singa_config.h")
+
+#set(SINGA_CONFIGURE_SRC "${PROJECT_BINARY_DIR}/singa_config.h")
+#LIST(APPEND SRCS ${SINGA_CONFIGURE_SRCS} ${PROJECT_BINARY_DIR}/singa_config.h)
+
+SET(LIBRARY_OUTPUT_PATH ${PROJECT_BINARY_DIR}/lib)
+SET(EXECUTABLE_OUTPUT_PATH ${PROJECT_BINARY_DIR}/bin)
+
+IF (USE_CUDA)
+    ADD_SUBDIRECTORY(lib/cnmem)
+    LIST(APPEND SINGA_LINKER_LIBS cnmem)
+ENDIF()
+
+# TODO(wangwei) detect the ev lib
+IF (ENABLE_DIST)
+  LIST(APPEND SINGA_LINKER_LIBS ev)
+ENDIF()
+
+ADD_SUBDIRECTORY(src)
+ADD_SUBDIRECTORY(test)
+ADD_SUBDIRECTORY(examples)
diff --git a/LICENSE b/LICENSE
index 75001c1..4f9f923 100644
--- a/LICENSE
+++ b/LICENSE
@@ -205,25 +205,35 @@
 subcomponents is subject to the terms and conditions of the following
 licenses.
 
-============================================================================
-SINGA bundles the following under BSD 2-clause license: include/singa/utils/blob.h,
-src/utils/blob.cc, include/singa/utils/common.h, src/utils/common.cc, include/singa/utils/cuda_utils.h
 
-Copyright (c) 2014, 2015, The Regents of the University of California (Regents)
-Copyright (c) 2014, the respective contributors
-https://github.com/BVLC/caffe/blob/master/LICENSE
+============================================================================
+SINGA bundles the following under BSD 3-clause license:
+cmake/Protobuf.cmake
+
+Copyright 2009 Kitware, Inc.
+Copyright 2009-2011 Philip Lowman <philip@yhbt.com>
+Copyright 2008 Esben Mose Hansen, Ange Optimization ApS
 
 =====================================================================
-SINGA bundles the following under BSD 2-clause license: include/singa/utils/tinydir.h
+SINGA bundles the following under BSD 2-clause license:
+include/singa/utils/tinydir.h
 
 Copyright (c) 2013, Cong Xu, Baudouin Feildel
 https://github.com/cxong/tinydir/blob/master/COPYING
 
-=====================================================================
-SINGA bundles the following under Apache v2.0 license: include/mshadow/*
 
-Copyright (c) 2014 by Contributors
-https://github.com/dmlc/mshadow/blob/master/LICENSE
+===========================================================================
+SINGA bundles the following under BSD 2-clause license:
+include/singa/utils/cuda_utils.h, src/core/tensor/distribution.cl
+
+All contributions by the University of California:
+Copyright (c) 2014, 2015, The Regents of the University of California (Regents)
+All rights reserved.
+
+All other contributions:
+Copyright (c) 2014, 2015, the respective contributors
+All rights reserved.
+https://github.com/BVLC/caffe/blob/master/LICENSE
 
 =====================================================================
 SINGA bundles the following under New BSD license: include/gtest/*
@@ -238,9 +248,13 @@
 https://github.com/google/styleguide/tree/gh-pages/cpplint
 
 =====================================================================
-SINGA bundles the following under New BSD license: examples/rnnlm/create_data.cc
+SINGA bundles the following under New BSD license: lib/cnmem/*
 
-Copyright (c) 2010-2012 Tomas Mikolov
-Copyright (c) 2013 Cantab Research Ltd
+Copyright (c) 2015, NVIDIA CORPORATION. All rights reserved.
+https://github.com/NVIDIA/cnmem
+
+=====================================================================
+SINGA bundles the following under New BSD license: src/python/swig/numpy.i
+
+Copyright (c) 2005-2015, NumPy Developers.
 All rights reserved.
-http://www.fit.vutbr.cz/~imikolov/rnnlm/
diff --git a/Makefile.am b/Makefile.am
deleted file mode 100644
index a30b9d1..0000000
--- a/Makefile.am
+++ /dev/null
@@ -1,381 +0,0 @@
-ACLOCAL_AMFLAGS = -I config
-AUTOMAKE_OPTIONS = foreign subdir-objects
-
-#AM_CPPFLAGS = -I$(top_srcdir)/src
-#AM_LDFLAGS = $(LD_FLAGS)
-
-MSHADOW_FLAGS = -DMSHADOW_USE_CUDA=0 -DMSHADOW_USE_CBLAS=1 -DMSHADOW_USE_MKL=0
-DEFAULT_FLAGS = -Wall -pthread -fPIC -std=c++11 -Wno-unknown-pragmas \
-              $(MSHADOW_FLAGS) -funroll-loops -DTHREADED
-
-CFLAGS = $(DEBUG)
-CXXFLAGS = $(DEBUG)
-#AC_CXXFLAGS = $(DEBUG)
-
-INCLUDES = -I$(top_srcdir)/include
-
-PROTOS := $(top_srcdir)/src/proto/singa.proto \
-          $(top_srcdir)/src/proto/job.proto \
-          $(top_srcdir)/src/proto/common.proto
-PROTO_SRCS := src/proto/singa.pb.cc \
-              src/proto/job.pb.cc \
-              src/proto/common.pb.cc
-PROTO_HDRS := include/proto/singa.pb.h \
-              include/proto/job.pb.h \
-              include/proto/common.pb.h
-PROTO_PYS := tool/python/pb2/singa_pb2.py \
-             tool/python/pb2/job_pb2.py \
-             tool/python/pb2/common_pb2.py
-
-CUDA_SRCS := src/utils/math_kernel.cu
-CUDA_OBJS := src/utils/math_kernel.o
-CUDA_HDRS := include/singa/utils/math_kernel.h
-
-CUDNN_SRCS := src/neuralnet/loss_layer/cudnn_softmaxloss.cc \
-			  src/neuralnet/neuron_layer/cudnn_softmax.cc \
-			  src/neuralnet/neuron_layer/cudnn_pooling.cc \
-			  src/neuralnet/neuron_layer/cudnn_activation.cc \
-			  src/neuralnet/neuron_layer/cudnn_lrn.cc \
-			  src/neuralnet/neuron_layer/cudnn_convolution.cc \
-			  src/neuralnet/neuron_layer/cudnn_bm.cc
-
-PY_SRCS := tool/python/singa/driver_wrap.cxx \
-           src/driver.cc
-
-ZOOKEEPER_SRCS := src/utils/zk_service.cc
-ZOOKEEPER_HDRS := include/singa/utils/zk_service.h
-
-HDFS_SRCS := src/io/hdfsfile.cc \
-             src/io/hdfsfile_store.cc
-HDFS_HDRS := include/singa/io/hdfsfile.h \
-             include/singa/io/hdfsfile_store.h
-
-SINGA_SRCS := src/driver.cc \
-              src/server.cc \
-              src/worker.cc \
-              src/stub.cc \
-              src/neuralnet/layer.cc \
-              src/neuralnet/connection_layer/bridge.cc \
-              src/neuralnet/connection_layer/concate.cc \
-              src/neuralnet/connection_layer/slice.cc \
-              src/neuralnet/connection_layer/split.cc \
-              src/neuralnet/connection_layer/rnn_dummy.cc \
-              src/neuralnet/input_layer/char_rnn.cc \
-              src/neuralnet/input_layer/onehot.cc \
-              src/neuralnet/input_layer/csv.cc \
-              src/neuralnet/input_layer/image_preprocess.cc \
-              src/neuralnet/input_layer/record.cc \
-              src/neuralnet/input_layer/deprecated.cc \
-              src/neuralnet/input_layer/store.cc \
-              src/neuralnet/input_layer/rnn_label.cc \
-              src/neuralnet/output_layer/accuracy.cc \
-              src/neuralnet/output_layer/argsort.cc \
-              src/neuralnet/output_layer/csv.cc \
-              src/neuralnet/output_layer/record.cc \
-              src/neuralnet/output_layer/char_rnn.cc \
-              src/neuralnet/loss_layer/euclidean.cc \
-              src/neuralnet/loss_layer/softmax.cc \
-              src/neuralnet/neuron_layer/activation.cc \
-              src/neuralnet/neuron_layer/bm.cc \
-              src/neuralnet/neuron_layer/convolution.cc \
-              src/neuralnet/neuron_layer/dropout.cc \
-              src/neuralnet/neuron_layer/dummy.cc \
-              src/neuralnet/neuron_layer/embedding.cc \
-              src/neuralnet/neuron_layer/inner_product.cc \
-              src/neuralnet/neuron_layer/lrn.cc \
-              src/neuralnet/neuron_layer/pooling.cc \
-              src/neuralnet/neuron_layer/rbm.cc \
-              src/neuralnet/neuron_layer/gru.cc \
-              src/neuralnet/neuron_layer/relu.cc \
-              src/neuralnet/neuron_layer/sigmoid.cc \
-              src/neuralnet/neuron_layer/softmax.cc \
-              src/neuralnet/neuron_layer/stanh.cc \
-              src/neuralnet/neuralnet.cc \
-              src/comm/socket.cc \
-              src/comm/msg.cc \
-              src/io/kvfile.cc \
-              src/io/kvfile_store.cc \
-              src/io/textfile_store.cc \
-              src/io/store.cc \
-              src/utils/cluster.cc \
-              src/utils/cluster_rt.cc \
-              src/utils/graph.cc \
-              src/utils/common.cc \
-              src/utils/param.cc \
-              src/utils/updater.cc \
-              src/utils/blob.cc \
-              src/utils/image_transform.cc \
-              src/utils/job_manager.cc
-
-
-SINGA_HDRS := include/singa.h \
-              include/singa/utils/math_blob.h \
-              include/singa/utils/math_addr.h \
-              include/singa/utils/cluster.h \
-              include/utils/cluster_rt.h \
-              include/utils/param.h \
-              include/utils/common.h \
-              include/utils/factory.h \
-              include/utils/data_shard.h \
-              include/utils/singleton.h \
-              include/utils/graph.h \
-              include/utils/blob.h \
-              include/utils/updater.h \
-              include/utils/tinydir.h \
-              include/utils/tokenizer.h \
-              include/utils/image_transform.h \
-              include/utils/job_manager.h \
-              include/server.h \
-              include/worker.h \
-              include/stub.h \
-              include/neuralnet/layer.h \
-              include/neuralnet/output_layer.h \
-              include/neuralnet/input_layer.h \
-              include/neuralnet/loss_layer.h \
-              include/neuralnet/neuron_layer.h \
-              include/neuralnet/connection_layer.h \
-              include/neuralnet/neuralnet.h \
-              include/singa/comm/msg.h \
-              include/singa/comm/socket.h \
-              include/singa/io/store.h \
-              include/singa/io/kvfile.h \
-              include/singa/io/kvfile_store.h \
-              include/singa/io/textfile_store.h \
-              include/mshadow/cxxnet_op.h \
-              include/mshadow/tensor_expr.h \
-              include/mshadow/tensor_container.h \
-              include/mshadow/tensor_expr_ext.h \
-              include/mshadow/tensor.h \
-              include/mshadow/tensor_io.h \
-              include/mshadow/tensor_base.h \
-              include/mshadow/tensor_random.h
-
-GTEST_SRCS := include/gtest/gtest-all.cc
-GTEST_HRDS := include/gtest/gtest.h
-TEST_SRCS := include/gtest/gtest_main.cc \
-             src/test/test_cluster.cc \
-             src/test/test_common.cc \
-             src/test/test_msg.cc \
-             src/test/test_math.cc \
-             src/test/test_neuralnet.cc \
-             src/test/test_paramslicer.cc \
-             src/test/test_kvfile.cc \
-             src/test/test_store.cc \
-             src/test/test_connection_layers.cc \
-             src/test/test_record_input_layer.cc \
-             src/test/test_csv_input_layer.cc \
-             src/test/test_gru_layer.cc \
-             src/test/test_unrolling.cc
-
-#EXTRA_PROGRAMS = $(PROGS)
-EXTRA_PROGRAMS = singatest test
-#EXTRA_LTLIBRARIES = $(LTLIBS)
-EXTRA_LTLIBRARIES = libgtest.la _driver.la
-
-lib_LTLIBRARIES = libsinga.la $(LTLIBS)
-bin_PROGRAMS = singa singatool $(PROGS)
-pydir = $(CURDIR)/tool/python/singa/
-py_LTLIBRARIES = $(PY_PROGS)
-#gpudir = $(CURDIR)/.libs
-#gpu_LTLIBRARIES = libsingagpu.so
-
-#lib_LTLIBRARIES = libsinga.la
-libsinga_la_SOURCES = $(PROTO_SRCS) $(SINGA_SRCS)
-libsinga_la_CXXFLAGS = $(DEFAULT_FLAGS) -msse3 -fpermissive -I$(top_srcdir)/include
-libsinga_la_LDFLAGS =
-if LMDB
-libsinga_la_CXXFLAGS += -DUSE_LMDB
-endif
-
-if DCUDNN
-libsinga_la_SOURCES += $(CUDNN_SRCS)
-libsinga_la_CXXFLAGS += $(CUDNN_CFLAGS)
-libsinga_la_LDFLAGS += $(CUDNN_LDFLAGS) $(CUDNN_LIBS)
-endif
-
-if DCUDA
-libsinga_la_SOURCES += $(CUDA_SRCS) $(CUDA_HDRS)
-libsinga_la_CXXFLAGS += $(CUDA_CFLAGS)
-libsinga_la_LDFLAGS += $(CUDA_LDFLAGS) $(CUDA_LIBS) -L./ -lsingagpu -Wl,-rpath=.
-libsinga_la_LIBADD = libsingagpu.so
-endif
-
-if DDIST
-libsinga_la_SOURCES += $(ZOOKEEPER_SRCS)
-libsinga_la_CXXFLAGS += $(DIST_CFLAGS)
-libsinga_la_LDFLAGS += $(DIST_LDFLAGS) $(DIST_LIBS)
-endif
-
-if DHDFS
-libsinga_la_SOURCES += $(HDFS_SRCS)
-libsinga_la_CXXFLAGS += $(HDFS_CFLAGS)
-libsinga_la_LDFLAGS += $(HDFS_LDFLAGS) $(HDFS_LIBS)
-endif
-
-#bin_PROGRAMS = singa
-singa_SOURCES = src/main.cc
-singa_CXXFLAGS = $(DEFAULT_FLAGS) -MMD -I$(top_srcdir)/include
-singa_LDFLAGS = -lsinga \
-                -lglog  \
-                -lprotobuf \
-                -lopenblas \
-                -lczmq
-if LMDB
-singa_LDFLAGS += -llmdb
-endif
-
-if DCUDNN
-singa_SOURCES += $(CUDNN_SRCS)
-singa_CXXFLAGS += $(CUDNN_CFLAGS)
-singa_LDFLAGS += $(CUDNN_LDFLAGS) $(CUDNN_LIBS)
-endif
-
-if DCUDA
-singa_SOURCES += $(CUDA_SRCS) $(CUDA_HDRS)
-singa_CXXFLAGS += $(CUDA_CFLAGS)
-singa_LDFLAGS += $(CUDA_LDFLAGS) $(CUDA_LIBS)
-endif
-
-if DDIST
-singa_SOURCES += $(ZOOKEEPER_SRCS)
-singa_CXXFLAGS += $(DIST_CFLAGS)
-singa_LDFLAGS += $(DIST_LDFLAGS) $(DIST_LIBS)
-endif
-
-if DHDFS
-singa_SOURCES += $(HDFS_SRCS)
-singa_CXXFLAGS += $(HDFS_CFLAGS)
-singa_LDFLAGS += $(HDFS_LDFLAGS) $(HDFS_LIBS)
-endif
-#bin_PROGRAMS += singatool
-singatool_SOURCES = src/utils/tool.cc #$(CUDA_SRCS) $(CUDA_HDRS) $(CUDNN_SRCS)
-singatool_CXXFLAGS = -Wall -pthread -fPIC -std=c++11 -MMD -Wno-unknown-pragmas \
-              -funroll-loops -DTHREADED -I$(top_srcdir)/include $(DEFAULT_FLAGS)
-singatool_LDFLAGS = -lsinga \
-                    -lglog  \
-                    -lprotobuf
-
-if DDIST
-singatool_SOURCES += $(ZOOKEEPER_SRCS)
-singatool_CXXFLAGS += $(DIST_CFLAGS)
-singatool_LDFLAGS += $(DIST_LDFLAGS) $(DIST_LIBS)
-endif
-
-#lib_LTLIBRARIES += libgtest.la
-libgtest_la_SOURCES = $(GTEST_HDRS) $(GTEST_SRCS)
-libgtest_la_CXXFLAGS = $(DEFAULT_FLAGS) -msse3 -fpermissive -I$(top_srcdir)/include
-if LMDB
-libgtest_la_CXXFLAGS += -DUSE_LMDB
-endif
-#libgtest_la_LDFLAGS = -I$(top_srcdir)/include
-
-#bin_PROGRAMS += test
-
-singatest_SOURCES = $(GTEST_HDRS) $(TEST_SRCS)
-singatest_CXXFLAGS = $(DEFAULT_FLAGS) -I$(top_srcdir)/include
-singatest_LDADD = ./libgtest.la
-singatest_LDFLAGS = -lsinga \
-                -lglog  \
-                -lprotobuf \
-                -lopenblas \
-                -lczmq \
-                -lgtest
-if LMDB
-singatest_LDFLAGS += -llmdb
-endif
-
-if DCUDNN
-singatest_SOURCES += $(CUDNN_SRCS)
-singatest_CXXFLAGS += $(CUDNN_CFLAGS)
-singatest_LDFLAGS += $(CUDNN_LDFLAGS) $(CUDNN_LIBS)
-endif
-
-if DCUDA
-singatest_SOURCES += $(CUDA_SRCS) $(CUDA_HDRS)
-singatest_CXXFLAGS += $(CUDA_CFLAGS)
-singatest_LDFLAGS += $(CUDA_LDFLAGS) $(CUDA_LIBS)
-endif
-
-if DDIST
-singatest_SOURCES += $(ZOOKEEPER_SRCS)
-singatest_CXXFLAGS += $(DIST_CFLAGS)
-singatest_LDFLAGS += $(DIST_LDFLAGS) $(DIST_LIBS)
-endif
-
-_driver_la_SOURCES = $(PY_SRCS)
-_driver_la_CXXFLAGS = $(DEFAULT_FLAGS) $(MSHADOW_FLAGS) -I$(top_srcdir)/include $(PYFLAGS)
-_driver_la_LDFLAGS = -lsinga -module -shared $(PYLIBS) -avoid-version -rpath $(pydir)
-
-if DCUDNN
-_driver_la_CXXFLAGS += $(CUDNN_CFLAGS)
-_driver_la_LDFLAGS += $(CUDNN_LDFLAGS) $(CUDNN_LIBS)
-endif
-
-if DCUDA
-_driver_la_CXXFLAGS += $(CUDA_CFLAGS)
-_driver_la_LDFLAGS += $(CUDA_LDFLAGS) $(CUDA_LIBS)
-endif
-
-clean-local:
-	rm -rf $(PROTO_SRCS) $(PROTO_HDRS)
-	rm -rf $(PROTO_PYS)
-	rm -rf neuralnet/neuron_layer/*.o
-	rm -rf src/utils/math_kernel.o
-	rm -rf rat_check
-	rm -rf tool/python/pb2
-	rm -rf libsingagpu.so
-
-# Add scrips for py driver installation
-all-local:
-	@if [ -f ".libs/_driver.so" ]; then \
-		echo "Copy libs for python wrapper"; \
-		cp -f .libs/_driver.so tool/python/singa/; \
-		touch tool/python/singa/__init__.py; \
-	fi
-	@if [ -f "libsingagpu.so" ]; then \
-		cp libsingagpu.so .libs/; \
-	fi
-
-# For rat check
-rat:
-	@if test ! -z '$(shell command -v java 2>/dev/null)'; then \
-		if test ! -z '$(shell echo $$RAT_PATH)'; then \
-			make distclean;\
-			java -jar $(RAT_PATH) -E rat-excludes -d . > rat_check; \
-		else \
-			echo "RAT_PATH is not set to correct jar file. Apache RAT can be downloaded at http://creadur.apache.org/rat/download_rat.cgi"; \
-		fi \
-	else \
-		echo "java is not found"; \
-	fi
-
-.cu.o: .cu
-	$(NVCC) $(MSHADOW_FLAGS) --shared -Xcompiler -fPIC $(CUDA_CFLAGS) $(CUDA_LDFLAGS) $(CUDA_LIBS) -I$(top_srcdir)/include -std=c++11 -G -c -o $@ $<
-
-# Generate gpu libs for singa
-libsingagpu.so: $(CUDA_OBJS)
-	$(NVCC) -o libsingagpu.so -shared -Xcompiler -fPIC $(CUDA_OBJS)  $(CUDA_CFLAGS) $(CUDA_LDFLAGS) $(CUDA_LIBS) -I$(top_srcdir)/include -std=c++11 -G
-
-# Create python class files
-install-pyLTLIBRARIES: $(py_LTLIBRARIES)
-	touch tool/python/singa/__init__.py
-	@if [ -f ".libs/_driver.so" ]; then \
-	  cp -f .libs/_driver.so tool/python/singa/;\
-	fi
-
-uninstall-pyLTLIBRARIES:
-	rm -f tool/python/singa/__init__.py
-	rm -f tool/python/singa/_driver.so
-
-# For autorun singatest
-test: singatest
-	@./singatest
-
-$(PROTO_HDRS) $(PROTO_SRCS): $(PROTOS)
-	protoc --proto_path=$(top_srcdir)/src/proto --cpp_out=$(top_srcdir)/src/proto $(PROTOS)
-	mkdir -p $(top_srcdir)/tool/python/pb2/
-	touch $(top_srcdir)/tool/python/pb2/__init__.py
-	protoc --proto_path=$(top_srcdir)/src/proto --python_out=$(top_srcdir)/tool/python/pb2 $(PROTOS)
-	mkdir -p $(top_srcdir)/include/singa/proto/
-	cp $(top_srcdir)/src/proto/*.pb.h $(top_srcdir)/include/singa/proto/
-	@echo
diff --git a/Makefile.example b/Makefile.example
deleted file mode 100644
index 4fb0c66..0000000
--- a/Makefile.example
+++ /dev/null
@@ -1,116 +0,0 @@
-#/**
-# * Licensed to the Apache Software Foundation (ASF) under one
-# * or more contributor license agreements.  See the NOTICE file
-# * distributed with this work for additional information
-# * regarding copyright ownership.  The ASF licenses this file
-# * to you under the Apache License, Version 2.0 (the
-# * "License"); you may not use this file except in compliance
-# * with the License.  You may obtain a copy of the License at
-# *
-# *     http://www.apache.org/licenses/LICENSE-2.0
-# *
-# * Unless required by applicable law or agreed to in writing, software
-# * distributed under the License is distributed on an "AS IS" BASIS,
-# * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# * See the License for the specific language governing permissions and
-# * limitations under the License.
-# */
-
-###################User Config Varaibles #############################
-# third-party library installation folder
-HOME_DIR := /usr
-# Lib folder for system and external libs. You may need to change it.
-LIBRARY_DIRS := $(HOME_DIR)/lib64 $(HOME_DIR)/lib $(HOME_DIR)/local/lib
-# Header folder for system and external libs. You may need to change it.
-INCLUDE_DIRS := $(HOME_DIR)/include ./include $(HOME_DIR)/local/include/zookeeper
-# g++ location, should support c++11, tested with 4.8.1
-CXX := g++
-
-######################Setting Varialbes#######################################
-LIBRARIES := glog protobuf openblas zmq czmq zookeeper_mt
-
-LDFLAGS := $(foreach librarydir, $(LIBRARY_DIRS), -L$(librarydir))\
-	$(foreach library, $(LIBRARIES), -l$(library))
-# Folder to store compiled files
-BUILD_DIR := .libs
-MSHADOW_FLAGS :=-DMSHADOW_USE_CUDA=0 -DMSHADOW_USE_CBLAS=1 -DMSHADOW_USE_MKL=0
-ZK_FLAGS :=-DTHREADED -fpermissive
-CXXFLAGS := -O2 -msse3 -Wall -pthread -fPIC -std=c++11 -Wno-unknown-pragmas \
-	$(MSHADOW_FLAGS) -DCPU_ONLY=1 $(ZK_FLAGS)\
-	-funroll-loops $(foreach includedir, $(INCLUDE_DIRS), -I$(includedir))
-
-# find user defined .proto file, and then compute the corresponding .h, .cc
-# files, which cannot be found by shell find, because they haven't been
-# generated currently
-PROTOS := $(shell find src/proto/ -name "*.proto")
-PROTO_SRCS :=$(PROTOS:.proto=.pb.cc)
-PROTO_HDRS :=$(patsubst src%, include%, $(PROTOS:.proto=.pb.h))
-PROTO_OBJS :=$(addprefix $(BUILD_DIR)/, $(PROTO_SRCS:.cc=.o))
-
-# each singa src file will generate a .o file
-SINGA_SRCS := $(shell find src/ \( -path "src/test" -o -path "src/main.cc" -o -path "src/utils/tool.cc" \) \
-	-prune -o \( -name "*.cc" -type f \) -print )
-SINGA_OBJS := $(sort $(addprefix $(BUILD_DIR)/, $(SINGA_SRCS:.cc=.o)) \
-	$(PROTO_OBJS) )
--include $(SINGA_OBJS:%.o=%.P)
-
-TEST_SRCS :=$(shell find src/test/ -maxdepth 1 -name "*.cc")
-TEST_OBJS := $(sort $(addprefix $(BUILD_DIR)/, $(TEST_SRCS:.cc=.o)))
--include $(TEST_OBJS:%.o=%.P)
-
-GTEST_SRC := include/gtest/gtest-all.cc
-GTEST_HDR := include/gtest/gtest.h
-GTEST_LIB := $(BUILD_DIR)/libgtest.a
-
-OBJS := $(sort $(SINGA_OBJS) $(TEST_OBJS) )
-
-########################Compilation Section###################################
-.PHONY: singa test
-
-singa: $(PROTO_OBJS) $(SINGA_OBJS)
-	$(CXX) -shared -o $(BUILD_DIR)/libsinga.so $(SINGA_OBJS)
-	$(CXX) $(SINGA_OBJS) src/main.cc -o singa $(CXXFLAGS) $(LDFLAGS)
-	@echo
-	$(CXX) $(SINGA_OBJS) src/utils/tool.cc -o singatool $(CXXFLAGS) $(LDFLAGS)
-	@echo
-
-loader: proto $(LOADER_OBJS)
-	$(CXX) $(LOADER_OBJS) -o $(BUILD_DIR)/loader $(CXXFLAGS) $(LDFLAGS)
-	@echo
-
-test:  proto $(GTEST_LIB) $(TEST_OBJS) $(SINGA_OBJS)
-	$(CXX) $(TEST_OBJS) include/gtest/gtest_main.cc $(GTEST_LIB) \
-		$(SINGA_OBJS) -o $(BUILD_DIR)/test $(CXXFLAGS) $(LDFLAGS)
-	@echo
-
-$(GTEST_LIB): $(GTEST_HDR) $(GTEST_SRC)
-	$(CXX) $(GTEST_SRC) -c -o $(BUILD_DIR)/gtest-all.o $(CXXFLAGS)
-	ar -rv $(GTEST_LIB) $(BUILD_DIR)/gtest-all.o
-
-# compile all files
-$(OBJS):$(BUILD_DIR)/%.o : %.cc
-	@mkdir -p $(dir $@)
-	$(CXX) $<  $(CXXFLAGS) -MMD -c -o $@
-	cp $(BUILD_DIR)/$*.d $(BUILD_DIR)/$*.P; \
-	sed -e 's/#.*//' -e 's/^[^:]*: *//' -e 's/ *\\$$//' \
-		-e '/^$$/ d' -e 's/$$/ :/' < $(BUILD_DIR)/$*.d >> $(BUILD_DIR)/$*.P; \
-	rm -f $*.d
-
-proto: $(PROTO_OBJS)
-
-$(PROTO_SRCS): $(PROTOS)
-	protoc --proto_path=src/proto --cpp_out=src/proto $(PROTOS)
-	mkdir -p include/proto/
-	cp src/proto/*.pb.h include/proto/
-	mkdir -p tool/pb2/
-	touch tool/pb2/__init__.py
-	protoc --proto_path=src/proto --python_out=tool/pb2/ $(PROTOS)
-	@echo
-
-clean:
-	rm -rf *.a *.so
-	rm -rf include/proto/*
-	rm -rf src/proto/*.pb.h src/proto/*.pb.cc
-	rm -rf tool/pb2/*
-	rm -rf $(BUILD_DIR)
-	@echo
diff --git a/Makefile.gpu b/Makefile.gpu
deleted file mode 100644
index 0615f6b..0000000
--- a/Makefile.gpu
+++ /dev/null
@@ -1,154 +0,0 @@
-#/**
-# * Licensed to the Apache Software Foundation (ASF) under one
-# * or more contributor license agreements.  See the NOTICE file
-# * distributed with this work for additional information
-# * regarding copyright ownership.  The ASF licenses this file
-# * to you under the Apache License, Version 2.0 (the
-# * "License"); you may not use this file except in compliance
-# * with the License.  You may obtain a copy of the License at
-# *
-# *     http://www.apache.org/licenses/LICENSE-2.0
-# *
-# * Unless required by applicable law or agreed to in writing, software
-# * distributed under the License is distributed on an "AS IS" BASIS,
-# * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# * See the License for the specific language governing permissions and
-# * limitations under the License.
-# */
-
-###################User Config Varaibles #############################
-# third-party library installation folder
-HOME_DIR := /home/wangwei/local
-
-# must config the cudnn folder if using cudnn
-CUDNN_DIR := $(HOME_DIR)/cudnn
-
-CUDA_DIR := /usr/local/cuda
-
-# Lib folder for system and external libs. You may need to change it.
-LIBRARY_DIRS := $(HOME_DIR)/lib64 $(HOME_DIR)/lib $(CUDNN_DIR)/lib64 $(CUDA_DIR)/lib64 $(CUDA_DIR)/lib
-# Header folder for system and external libs. You may need to change it.
-INCLUDE_DIRS := ./include $(HOME_DIR)/include $(CUDNN_DIR)/include $(CUDA_DIR)/include
-# g++ location, should support c++11, tested with 4.8.1
-CXX := g++
-CUCXX := nvcc
-
-######################Setting Varialbes#######################################
-LIBRARIES := glog protobuf openblas zmq czmq zookeeper_mt
-
-ifneq ($(CUDA_DIR),)
-	LIBRARIES := $(LIBRARIES) cublas cudart curand cudnn
-endif
-
-LDFLAGS := $(foreach librarydir, $(LIBRARY_DIRS), -L$(librarydir))\
-	$(foreach library, $(LIBRARIES), -l$(library))
-# Folder to store compiled files
-BUILD_DIR := .libs
-MSHADOW_FLAGS :=-DMSHADOW_USE_CUDA=0 -DMSHADOW_USE_CBLAS=1 -DMSHADOW_USE_MKL=0
-ZK_FLAGS :=-DTHREADED -fpermissive
-CXXFLAGS := -O2 -msse3 -Wall -pthread -fPIC -std=c++11 -Wno-unknown-pragmas \
-	$(MSHADOW_FLAGS) -DUSE_CUDNN $(ZK_FLAGS)\
-	-funroll-loops $(foreach includedir, $(INCLUDE_DIRS), -I$(includedir))
-CUCXXFLAGS := -DUSE_CUDNN $(MSHADOW_FLAGS) -std=c++11 $(CUDA_ARCH) \
-	$(foreach includedir, $(INCLUDE_DIRS), -I$(includedir))
-
-#Add device compile option
-ifeq ($(CUDA_DIR),)
-	MSHADOW_FLAGS := $(MSHADOW_FLAGS) -DCPU_ONLY
-	CXXFLAGS := $(CXXFLAGS) -DCPU_ONLY
-else
-	CXXFLAGS := $(CXXFLAGS) -DUSE_GPU
-endif
-
-# find user defined .proto file, and then compute the corresponding .h, .cc
-# files, which cannot be found by shell find, because they haven't been
-# generated currently
-PROTOS := $(shell find src/proto/ -name "*.proto")
-PROTO_SRCS :=$(PROTOS:.proto=.pb.cc)
-PROTO_HDRS :=$(patsubst src%, include%, $(PROTOS:.proto=.pb.h))
-PROTO_OBJS :=$(addprefix $(BUILD_DIR)/, $(PROTO_SRCS:.cc=.o))
-
-# each singa src file will generate a .o file
-SINGA_SRCS := $(shell find src/ \( -path "src/test" -o -path "src/main.cc" -o -path "src/utils/tool.cc" \) \
-	-prune -o \( -name "*.cc" -type f \) -print )
-SINGA_OBJS := $(sort $(addprefix $(BUILD_DIR)/, $(SINGA_SRCS:.cc=.o)) \
-	$(PROTO_OBJS) )
--include $(SINGA_OBJS:%.o=%.P)
-
-TEST_SRCS :=$(shell find src/test/ -maxdepth 1 -name "*.cc")
-TEST_OBJS := $(sort $(addprefix $(BUILD_DIR)/, $(TEST_SRCS:.cc=.o)))
--include $(TEST_OBJS:%.o=%.P)
-
-TEST_CUDA_SRCS :=$(shell find src/test/ -maxdepth 1 -name "*.cu")
-TEST_CUDA_OBJS := $(sort $(addprefix $(BUILD_DIR)/, $(TEST_CUDA_SRCS:.cu=.o)))
--include $(TEST_CUDA_OBJS:%.o=%.P)
-
-SINGA_CUDA_SRCS := $(shell find src/ \( -path "src/test" \) -prune -o \( -name "*.cu" -type f \) -print )
-SINGA_CUDA_OBJS := $(sort $(addprefix $(BUILD_DIR)/, $(SINGA_CUDA_SRCS:.cu=.o)))
--include $(SINGA_CUDA_OBJS:%.o=%.P)
-
-GTEST_SRC := include/gtest/gtest-all.cc
-GTEST_HDR := include/gtest/gtest.h
-GTEST_LIB := $(BUILD_DIR)/libgtest.a
-
-OBJS := $(sort $(SINGA_OBJS) $(TEST_OBJS) )
-CUOBJS := $(sort $(SINGA_CUDA_OBJS) $(TEST_CUDA_OBJS) )
-
-########################Compilation Section###################################
-.PHONY: singa test
-
-singa: $(PROTO_OBJS) $(SINGA_OBJS) $(SINGA_CUDA_OBJS)
-	$(CXX) -shared -o $(BUILD_DIR)/libsinga.so $(SINGA_OBJS)
-	$(CXX) $(SINGA_OBJS) $(SINGA_CUDA_OBJS) src/main.cc -o singa $(CXXFLAGS) $(LDFLAGS)
-	@echo
-	$(CXX) $(BUILD_DIR)/libsinga.so src/utils/tool.cc -o singatool $(CXXFLAGS) $(LDFLAGS) -Wl,-unresolved-symbols=ignore-in-shared-libs
-	@echo
-
-loader: proto $(LOADER_OBJS)
-	$(CXX) $(LOADER_OBJS) -o $(BUILD_DIR)/loader $(CXXFLAGS) $(LDFLAGS)
-	@echo
-
-test:  proto $(GTEST_LIB) $(TEST_OBJS) $(TEST_CUDA_OBJS) $(SINGA_OBJS) $(SINGA_CUDA_OBJS)
-	$(CXX) $(TEST_OBJS) $(TEST_CUDA_OBJS) include/gtest/gtest_main.cc $(GTEST_LIB) \
-		$(SINGA_OBJS) $(SINGA_CUDA_OBJS) -o $(BUILD_DIR)/test $(CXXFLAGS) $(LDFLAGS)
-	@echo
-
-$(GTEST_LIB): $(GTEST_HDR) $(GTEST_SRC)
-	$(CXX) $(GTEST_SRC) -c -o $(BUILD_DIR)/gtest-all.o $(CXXFLAGS)
-	ar -rv $(GTEST_LIB) $(BUILD_DIR)/gtest-all.o
-
-# compile all files
-$(OBJS):$(BUILD_DIR)/%.o : %.cc
-	@mkdir -p $(dir $@)
-	$(CXX) $<  $(CXXFLAGS) -MMD -c -o $@
-	cp $(BUILD_DIR)/$*.d $(BUILD_DIR)/$*.P; \
-	sed -e 's/#.*//' -e 's/^[^:]*: *//' -e 's/ *\\$$//' \
-		-e '/^$$/ d' -e 's/$$/ :/' < $(BUILD_DIR)/$*.d >> $(BUILD_DIR)/$*.P; \
-	rm -f $*.d
-
-$(CUOBJS):$(BUILD_DIR)/%.o : %.cu
-	@mkdir -p $(dir $@)
-	$(CUCXX) $< -c -o $@ $(CUCXXFLAGS)
-	cp $(BUILD_DIR)/$*.d $(BUILD_DIR)/$*.P; \
-	sed -e 's/#.*//' -e 's/^[^:]*: *//' -e 's/ *\\$$//' \
-		-e '/^$$/ d' -e 's/$$/ :/' < $(BUILD_DIR)/$*.d >> $(BUILD_DIR)/$*.P; \
-	rm -f $*.d
-
-proto: $(PROTO_OBJS)
-
-$(PROTO_SRCS): $(PROTOS)
-	protoc --proto_path=src/proto --cpp_out=src/proto $(PROTOS)
-	mkdir -p include/proto/
-	cp src/proto/*.pb.h include/singa/proto/
-	mkdir -p tool/pb2/
-	touch tool/pb2/__init__.py
-	protoc --proto_path=src/proto --python_out=tool/pb2/ $(PROTOS)
-	@echo
-
-clean:
-	rm -rf *.a *.so
-	rm -rf include/proto/*
-	rm -rf src/proto/*.pb.h src/proto/*.pb.cc
-	rm -rf tool/pb2/*
-	rm -rf $(BUILD_DIR)
-	@echo
diff --git a/NOTICE b/NOTICE
index c74e53a..092ec36 100644
--- a/NOTICE
+++ b/NOTICE
@@ -4,4 +4,4 @@
 This product includes software developed at
 The Apache Software Foundation (http://www.apache.org/).
 
-Portions of this software were developed at the National University of Singapore.
+Codebase originally donated by National University of Singapore.
diff --git a/README.md b/README.md
index 4d124aa..fc80db4 100644
--- a/README.md
+++ b/README.md
@@ -1,228 +1,16 @@
 #Apache SINGA
 
+[![Build Status](https://travis-ci.org/apache/incubator-singa.png)](https://travis-ci.org/apache/incubator-singa)
+
 Distributed deep learning system
 
-##Project Website
 
-All the details can be found in [Project Website](http://singa.incubator.apache.org), including the following instructions.
+## Quick Start
+
+* [Installation](doc/en/docs/installation.md)
+* [Examples](examples)
 
 ##Mailing Lists
 
 * [Development Mailing List](mailto:dev-subscribe@singa.incubator.apache.org) ([Archive](http://mail-archives.apache.org/mod_mbox/singa-dev/))
 * [Commits Mailing List](mailto:commits-subscribe@singa.incubator.apache.org) ([Archive](http://mail-archives.apache.org/mod_mbox/singa-commits/))
-
-<a name="Dependencies"</a>
-##Dependencies
-The current code depends on the following external libraries:
-
-  * `glog` (New BSD)
-  * `google-protobuf` (New BSD)
-  * `openblas` (New BSD)
-
-###Optional dependencies
-For advanced features, the following libraries are needed:
-
-  * `zeromq` (LGPLv3 + static link exception),`czmq` (Mozilla Public License Version 2.0) and `zookeeper` (Apache 2.0), for distributed training with multiple processes. Compile SINGA with `--enable-dist`
-  * `cuda` (NVIDIA CUDA Toolkit EUL) for training using NVIDIA GPUs.
-  * `cudnn` (NVIDIA CuDNN EULA) for training using NVIDIA's CuDNN library.
-  * `Apache Mesos` (Apache 2.0)
-  * `Apache Hadoop` (Apache 2.0)
-  * `libhdfs3` (Apache 2.0)
-  * `swig` (GPL) for using Python Binding.
-
-We have tested SINGA on Ubuntu 12.04, Ubuntu 14.01 and CentOS 6.
-You can install all dependencies (including optional dependencies) into `$PREFIX` folder by
-
-    ./thirdparty/install.sh all $PREFIX
-
-If `$PREFIX` is not a system path (e.g., `/usr/local/`), please export the following
-variables to continue the building instructions,
-
-    $ export LD_LIBRARY_PATH=$PREFIX/lib:$LD_LIBRARY_PATH
-    $ export CPLUS_INCLUDE_PATH=$PREFIX/include:$CPLUS_INCLUDE_PATH
-    $ export LIBRARY_PATH=$PREFIX/lib:$LIBRARY_PATH
-    $ export PATH=$PREFIX/bin:$PATH
-
-
-##Documentation
-
-Full documentation is available online at [Official Documentation](https://singa.incubator.apache.org/docs/overview.html).
-
-##Building SINGA
-
-Please make sure you have `g++ >= 4.8.1` before building SINGA.
-
-    $ ./autogen.sh
-    # refer to the FAQs below for errors during configure, including blas_segmm() error
-    $ ./configure
-    # refer to the FAQs below for error during make
-    $ make
-
-To compile with GPU support, you should run:
-
-    $ ./configure --enable-cuda --with-cuda=/CUDA/PATH --enable-cudnn --with-cudnn=/CUDNN/PATH
-
---with-cuda and --with-cudnn are optional as by default the script will search system paths. We have tested with CUDA V7.0 and V7.5, CUDNN V3 and V4.
-Please kindly set proper environment parameters (LD_LIBRARY_PATH, LIBRARY_PATH, etc.) when you run the code.
-
-To compile with HDFS support, you should run:
-
-    $ ./configure --enable-hdfs --with-libhdfs=/PATH/TO/HDFS3
-
---with-libhdfs is optional as by default the path is /usr/local/.
-
-To compile with python wrappers, you should run:
-
-	$ ./tool/python/singa/generatepy.sh
-	$ ./configure --enable-python --with-python=/PATH/TO/Python.h
-
---with-python is optional as by default the path is /usr/local/include.
-
-You can also run the following command for further configuration.
-
-    $ ./configure --help
-
-##Running Examples
-
-Let us train the [CNN model](http://papers.nips.cc/paper/4824-imagenet-classification-with-deep-convolutional-neural-networks) over the
-[CIFAR-10](http://www.cs.toronto.edu/~kriz/cifar.html) dataset without parallelism as an example. The hyper-parameters
-are set following [cuda-convnet](https://code.google.com/p/cuda-convnet/). More details about this example are available
-at [CNN example](http://singa.incubator.apache.org/docs/cnn).
-
-First, download the dataset and create data shards:
-
-    $ cd examples/cifar10/
-    $ cp Makefile.example Makefile
-    $ make download
-    $ make create
-
-If it reports errors due to library missing, e.g., `libopenblas` or `libprotobuf`,
-please export the environment variables shown in the [Dependencies](#Dependencies) section and
-continue with the following instructions,
-
-    # delete the newly created folders
-    $ rm -rf cifar10_t*
-    $ make create
-
-Next, start the training:
-
-    $ cd ../../
-    $ ./singa -conf examples/cifar10/job.conf
-
-For GPU training or distributed training, please refer to the [online guide](http://singa.apache.org/docs).
-
-##LICENSE
-
-Apache SINGA is licensed under the [Apache License, Version 2.0](http://www.apache.org/licenses/LICENSE-2.0).
-
-For additional information, see the `LICENSE` and `NOTICE` files.
-
-## FAQ
-
-* Q1:I get error `./configure --> cannot find blas_segmm() function` even I
-have installed `OpenBLAS`.
-
-  A1: This means the compiler cannot find the `OpenBLAS` library. If you have installed `OpenBLAS` via `apt-get install`, then export the path to `$LD_LIBRARY_PATH` (e.g. `/usr/lib/openblas-base`). If you installed it with
-  `./thirdparty/install.sh`, then export the correct path based on `$PREFIX` (e.g. `/opt/OpenBLAS/lib`):
-
-      # using apt-get install for openblas
-      $ export LIBRARY_PATH=$PATH_TO_OPENBLAS_LIB:$LIBRARY_PATH
-
-      # using ./thirdparty/install.sh for openblas:
-      $ export LIBRARY_PATH=/opt/OpenBLAS/lib:$LIBRARY_PATH
-
-
-* Q2: I get error `cblas.h no such file or directory exists`.
-
-  A2: You need to include the folder containing `cblas.h` into `$CPLUS_INCLUDE_PATH`,
-  e.g.,
-
-      $ export CPLUS_INCLUDE_PATH=$PREFIX/include:$CPLUS_INCLUDE_PATH
-      # e.g.,
-      $ export CPLUS_INCLUDE_PATH=/opt/OpenBLAS/include:$CPLUS_INCLUDE_PATH
-      # then reconfigure and make SINGA
-      $ ./configure
-      $ make
-
-
-* Q3: When compiling, I get error `SSE2 instruction set not enabled`
-
-  A3: You can try following command:
-
-      $ make CFLAGS='-msse2' CXXFLAGS='-msse2'
-
-
-* Q4: I get `ImportError: cannot import name enum_type_wrapper` from
-`google.protobuf.internal` when I try to import `.py` files.
-
-  A4: After installing `protobuf` by `make install`, we should install `python`
-  runtime libraries. Go to `protobuf` source directory, run:
-
-      $ cd /PROTOBUF/SOURCE/FOLDER
-      $ cd python
-      $ python setup.py build
-      $ python setup.py install
-
-  You may need `sudo` when you try to install `python` runtime libraries in
-  the system folder.
-
-
-* Q5: I get a linking error caused by `gflags`.
-
-  A5: SINGA does not depend on `gflags`. But you may have installed the `glog` with
-  `gflags`. In that case you can reinstall `glog` using `thirdparty/install.sh` into
-  a another folder and export the `$LDFLAGS` and `$CPPFLAGS` to include that folder.
-
-
-* Q6: While compiling SINGA and installing `glog` on mac OS X, I get fatal error
-`'ext/slist' file not found`
-
-  A6: We have not done thorough test on Mac OS. If you want to install `glog`, please goto glog folder and try:
-
-      $ make CFLAGS='-stdlib=libstdc++' CXXFLAGS='stdlib=libstdc++'
-
-* Q7: When I start a training job, it reports error related to `ZOO_ERROR...zk retcode=-4...`.
-
-  A7: This is because `zookeeper` is not started. Please start the service
-
-      $ ./bin/zk-service.sh start
-
-  If the error still exists, probably that you do not have `java`. You can simply
-  check it by
-
-      $ java --version
-
-* Q8: When I build `OpenBLAS` from source, I am told that I need a fortran compiler.
-
-  A8: You can compile `OpenBLAS` by
-
-      $ make ONLY_CBLAS=1
-
-  or install it using
-
-      $ sudo apt-get install openblas-dev
-
-  or
-
-      $ sudo yum install openblas-devel
-
-  It is worth noting that you need root access to run the last two commands.
-  Remember to set the environment variables to include the header and library
-  paths of `OpenBLAS` after installation (please refer to the [Dependencies](#Dependencies) section).
-
-* Q9: When I build protocol buffer, it reports that `GLIBC++_3.4.20 not found in /usr/lib64/libstdc++.so.6`.
-
-  A9: This means the linker found `libstdc++.so.6` but that library
-  belongs to an older version of `GCC` than was used to compile and link the
-  program. The program depends on code defined in
-  the newer `libstdc++` that belongs to the newer version of GCC, so the linker
-  must be told how to find the newer `libstdc++` shared library.
-  The simplest way to fix this is to find the correct `libstdc++` and export it to
-  `$LD_LIBRARY_PATH`. For example, if `GLIBC++_3.4.20` is listed in the output of the
-  following command,
-
-      $ strings /usr/local/lib64/libstdc++.so.6|grep GLIBC++
-
-  then just set your environment variable as
-
-      $ export LD_LIBRARY_PATH=/usr/local/lib64:$LD_LIBRARY_PATH
diff --git a/RELEASE_NOTES b/RELEASE_NOTES
index 5786ad2..e36dce8 100644
--- a/RELEASE_NOTES
+++ b/RELEASE_NOTES
@@ -1,3 +1,91 @@
+Release Notes - SINGA - Version singa-incubating-1.0.0
+
+SINGA is a general distributed deep learning platform for training big deep learning models over large datasets.
+
+This release includes following features:
+
+  * Core abstractions including Tensor and Device
+      * [SINGA-207]  Update Tensor functions for matrices
+      * [SINGA-205]  Enable slice and concatenate operations for Tensor objects
+      * [SINGA-197]  Add CNMem as a submodule in lib/
+      * [SINGA-196]  Rename class Blob to Block
+      * [SINGA-194]  Add a Platform singleton
+      * [SINGA-175]  Add memory management APIs and implement a subclass using CNMeM
+      * [SINGA-173]  OpenCL Implementation
+      * [SINGA-171]  Create CppDevice and CudaDevice
+      * [SINGA-168]  Implement Cpp Math functions APIs
+      * [SINGA-162]  Overview of features for V1.x
+      * [SINGA-165]  Add cross-platform timer API to singa
+      * [SINGA-167]  Add Tensor Math function APIs
+      * [SINGA-166]  light built-in logging for making glog optional
+      * [SINGA-164]  Add the base Tensor class
+
+
+  * IO components for file read/write, network and data pre-processing
+      * [SINGA-233]  New communication interface
+      * [SINGA-215]  Implement Image Transformation for Image Pre-processing
+      * [SINGA-214]  Add LMDBReader and LMDBWriter for LMDB
+      * [SINGA-213]  Implement Encoder and Decoder for CSV
+      * [SINGA-211]  Add TextFileReader and TextFileWriter for CSV files
+      * [SINGA-210]  Enable checkpoint and resume for v1.0
+      * [SINGA-208]  Add DataIter base class and a simple implementation
+      * [SINGA-203]  Add OpenCV detection for cmake compilation
+      * [SINGA-202]  Add reader and writer for binary file
+      * [SINGA-200]  Implement Encoder and Decoder for data pre-processing
+
+
+
+  * Module components including layer classes, training algorithms and Python binding
+      * [SINGA-235]  Unify the engines for cudnn and singa layers
+      * [SINGA-230]  OpenCL Convolution layer and Pooling layer
+      * [SINGA-222]  Fixed bugs in IO
+      * [SINGA-218]  Implementation for RNN CUDNN version
+      * [SINGA-204]  Support the training of feed-forward neural nets
+      * [SINGA-199]  Implement Python classes for SGD optimizers
+      * [SINGA-198]  Change Layer::Setup API to include input Tensor shapes
+      * [SINGA-193]  Add Python layers
+      * [SINGA-192]  Implement optimization algorithms for Singa v1 (nesterove, adagrad, rmsprop)
+      * [SINGA-191]  Add "autotune" for CudnnConvolution Layer
+      * [SINGA-190]  Add prelu layer and flatten layer
+      * [SINGA-189]  Generate python outputs of proto files
+      * [SINGA-188]  Add Dense layer
+      * [SINGA-187]  Add popular parameter initialization methods
+      * [SINGA-186]  Create Python Tensor class
+      * [SINGA-184]  Add Cross Entropy loss computation
+      * [SINGA-183]  Add the base classes for optimizer, constraint and regularizer
+      * [SINGA-180]  Add Activation layer and Softmax layer
+      * [SINGA-178]  Add Convolution layer and Pooling layer
+      * [SINGA-176]  Add loss and metric base classes
+      * [SINGA-174]  Add Batch Normalization layer and Local Response Normalization layer.
+      * [SINGA-170]  Add Dropout layer and CudnnDropout layer.
+      * [SINGA-169]  Add base Layer class for V1.0
+
+
+  * Examples
+      * [SINGA-232]  Alexnet on Imagenet
+      * [SINGA-231]  Batchnormlized VGG model for cifar-10
+      * [SINGA-228]  Add Cpp Version of Convolution and Pooling layer
+      * [SINGA-227]  Add Split and Merge Layer and add ResNet Implementation
+
+  * Documentation
+      * [SINGA-239]  Transfer documentation files of v0.3.0 to github
+      * [SINGA-238]  RBM on mnist
+      * [SINGA-225]  Documentation for installation and Cifar10 example
+      * [SINGA-223]  Use Sphinx to create the website
+
+  * Tools for compilation and some utility code
+      * [SINGA-229]  Complete install targets
+      * [SINGA-221]  Support for Travis-CI
+      * [SINGA-217]  build python package with setup.py
+      * [SINGA-216]  add jenkins for CI support
+      * [SINGA-212]  Disable the compilation of libcnmem if USE_CUDA is OFF
+      * [SINGA-195]  Channel for sending training statistics
+      * [SINGA-185]  Add CBLAS and GLOG detection for singav1
+      * [SINGA-181]  Add NVCC supporting for .cu files
+      * [SINGA-177]  Add fully cmake supporting for the compilation of singa_v1
+      * [SINGA-172]  Add CMake supporting for Cuda and Cudnn libs
+
+----------------------------------------------------------
 Release Notes - SINGA - Version singa-incubating-0.3.0
 
 SINGA is a general distributed deep learning platform for training big deep learning models over large datasets.
diff --git a/cmake/Cuda.cmake b/cmake/Cuda.cmake
new file mode 100644
index 0000000..e590bb1
--- /dev/null
+++ b/cmake/Cuda.cmake
@@ -0,0 +1,42 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+# 
+#     http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# 
+
+
+FIND_PACKAGE(CUDA 5.5 QUIET)
+
+IF(NOT CUDA_FOUND)
+    return()
+ENDIF()
+
+SET(HAVE_CUDA TRUE)
+MESSAGE(STATUS "Found cuda_v${CUDA_VERSION}")
+#ADD_DEFINITIONS(-DUSE_CUDA)
+#message(STATUS "linking: ${CUDA_CUDART_LIBRARY} ${CUDA_curand_LIBRARY} ${CUDA_CUBLAS_LIBRARIES}")
+
+IF(USE_CUDNN)
+#include(cmake/Modules/Cudnn.cmake)
+    FIND_PACKAGE(CUDNN REQUIRED)
+    INCLUDE_DIRECTORIES(SYSTEM ${CUDNN_INCLUDE_DIR})
+    LIST(APPEND SINGA_LINKER_LIBS ${CUDNN_LIBRARIES})
+    #ADD_DEFINITIONS(-DUSE_CUDNN)
+    #ADD_DEFINITIONS(-DCUDNN_VERSION_MAJOR=${CUDNN_VERSION_MAJOR})
+ENDIF()
+
+INCLUDE_DIRECTORIES(SYSTEM ${CUDA_INCLUDE_DIRS})
+LIST(APPEND SINGA_LINKER_LIBS ${CUDA_CUDART_LIBRARY} ${CUDA_curand_LIBRARY} ${CUDA_CUBLAS_LIBRARIES})
+#MESSAGE(STATUS "libs " ${SINGA_LINKER_LIBS})
diff --git a/cmake/Dependencies.cmake b/cmake/Dependencies.cmake
new file mode 100644
index 0000000..b5fda6d
--- /dev/null
+++ b/cmake/Dependencies.cmake
@@ -0,0 +1,91 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+# 
+#     http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# 
+
+SET(SINGA_LINKER_LIBS "")
+
+#INCLUDE("cmake/ProtoBuf.cmake")
+
+FIND_PACKAGE( Protobuf REQUIRED )
+INCLUDE_DIRECTORIES(SYSTEM ${PROTOBUF_INCLUDE_DIR})
+MESSAGE(STATUS "proto libs " ${PROTOBUF_LIBRARIES})
+LIST(APPEND SINGA_LINKER_LIBS ${PROTOBUF_LIBRARIES})
+INCLUDE("cmake/Protobuf.cmake")
+
+#FIND_PACKAGE(Glog)
+#IF(GLOG_FOUND)
+#    MESSAGE(STATUS "GLOG FOUND at ${GLOG_INCLUDE_DIR}")
+#    ADD_DEFINITIONS("-DUSE_GLOG")
+#    LIST(APPEND SINGA_LINKER_LIBS ${GLOG_LIBRARIES})
+#ENDIF()
+
+IF(USE_LMDB)
+    FIND_PACKAGE(LMDB REQUIRED)
+    INCLUDE_DIRECTORIES(SYSTEM ${LMDB_INCLUDE_DIR})
+    LIST(APPEND SINGA_LINKER_LIBS ${LMDB_LIBRARIES})
+    MESSAGE(STATUS "FOUND lmdb at ${LMDB_INCLUDE_DIR}")
+ENDIF()
+
+IF(USE_CUDA)
+    INCLUDE("cmake/Cuda.cmake")
+ELSE()
+    SET(USE_CUDNN FALSE)
+ENDIF()
+
+IF(USE_CBLAS)
+    FIND_PACKAGE(CBLAS REQUIRED)
+    INCLUDE_DIRECTORIES(SYSTEM ${CBLAS_INCLUDE_DIR})
+    LIST(APPEND SINGA_LINKER_LIBS ${CBLAS_LIBRARIES})
+    MESSAGE(STATUS "FOUND cblas at ${CBLAS_LIBRARIES}")
+ENDIF()
+
+IF(USE_OPENCL)
+    FIND_PACKAGE(OpenCL REQUIRED)
+    IF(NOT OPENCL_FOUND)
+        MESSAGE(SEND_ERROR "OpenCL was requested, but not found.")
+    ELSE()
+        INCLUDE_DIRECTORIES(SYSTEM ${OpenCL_INCPATH})
+        LIST(APPEND SINGA_LINKER_LIBS ${OPENCL_LIBRARIES})
+        MESSAGE(STATUS "Found OpenCL at ${OPENCL_INCLUDE_DIRS}")
+        IF(NOT OPENCL_HAS_CPP_BINDINGS)
+            MESSAGE(SEND_ERROR "OpenCL C++ bindings cl2.hpp was not found.")
+        ELSE()
+            MESSAGE(STATUS "Found OpenCL C++ bindings.")
+        ENDIF()
+    ENDIF()
+ENDIF()
+
+FIND_PACKAGE(Glog REQUIRED)
+INCLUDE_DIRECTORIES(SYSTEM ${GLOG_INCLUDE_DIRS})
+LIST(APPEND SINGA_LINKER_LIBS ${GLOG_LIBRARIES})
+#MESSAGE(STATUS "Found glog at ${GLOG_INCLUDE_DIRS}")
+
+IF(USE_OPENCV)
+    FIND_PACKAGE(OpenCV REQUIRED)
+    MESSAGE(STATUS "Found OpenCV_${OpenCV_VERSION} at ${OpenCV_INCLUDE_DIRS}")
+    INCLUDE_DIRECTORIES(SYSTEM ${OpenCV_INCLUDE_DIRS})
+    LIST(APPEND SINGA_LINKER_LIBS ${OpenCV_LIBRARIES})
+ENDIF()
+
+#LIST(APPEND SINGA_LINKER_LIBS "/home/wangwei/local/lib/libopenblas.so")
+#MESSAGE(STATUS "link lib : " ${SINGA_LINKER_LIBS})
+
+IF(USE_PYTHON)
+    FIND_PACKAGE(PythonLibs 2.7 REQUIRED)
+    FIND_PACKAGE(PythonInterp 2.7 REQUIRED)
+    FIND_PACKAGE(SWIG 3.0 REQUIRED)
+ENDIF()
diff --git a/cmake/Protobuf.cmake b/cmake/Protobuf.cmake
new file mode 100644
index 0000000..70cf0fe
--- /dev/null
+++ b/cmake/Protobuf.cmake
@@ -0,0 +1,31 @@
+# This script is taken from
+# https://github.com/Kitware/CMake/blob/master/Modules/FindProtobuf.cmake
+# and modified to our compilation.
+
+function(PROTOBUF_GENERATE_PYTHON OUTPUT)
+    if(NOT ARGN)
+        message(SEND_ERROR "Error: PROTOBUF_GENERATE_PYTHON() called 
+        without any proto files")
+        return()
+    endif(NOT ARGN)
+
+    set(${OUTPUT})
+    foreach(FIL ${ARGN})
+        get_filename_component(ABS_FIL ${FIL} ABSOLUTE)
+        get_filename_component(FIL_WE ${FIL} NAME_WE)
+        get_filename_component(PATH ${FIL} PATH)
+
+        list(APPEND ${OUTPUT} "${CMAKE_BINARY_DIR}/python/singa/proto/${FIL_WE}_pb2.py")
+
+        add_custom_command(
+            OUTPUT "${CMAKE_BINARY_DIR}/python/singa/proto/${FIL_WE}_pb2.py"
+            COMMAND ${PROTOBUF_PROTOC_EXECUTABLE}
+            ARGS --python_out ${CMAKE_BINARY_DIR}/python/singa/proto
+                 --proto_path ${PATH} ${ABS_FIL}
+            DEPENDS ${ABS_FIL}
+            COMMENT "Running Python protocol buffer compiler on ${FIL}" VERBATIM)
+    endforeach()
+    
+    set_source_files_properties(${${SRCS}} ${${HDRS}} PROPERTIES GENERATED TRUE)
+    set(${OUTPUT} ${${OUTPUT}} PARENT_SCOPE)
+endfunction()
diff --git a/cmake/Templates/singa_config.h.in b/cmake/Templates/singa_config.h.in
new file mode 100644
index 0000000..f3500d0
--- /dev/null
+++ b/cmake/Templates/singa_config.h.in
@@ -0,0 +1,45 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+*/
+
+// Source directory
+#define SOURCE_FOLDER "${PROJECT_SOURCE_DIR}"
+
+// Binaries directory
+#define BINARY_FOLDER "${PROJECT_BINARY_DIR}"
+
+#cmakedefine CPU_ONLY
+
+#cmakedefine USE_CBLAS
+
+#cmakedefine USE_OPENCV
+// cuda
+#cmakedefine USE_CUDA
+
+#cmakedefine USE_CUDNN
+#cmakedefine CUDNN_VERSION_MAJOR @CUDNN_VERSION_MAJOR@
+#cmakedefine CUDNN_VERSION_MINOR @CUDNN_VERSION_MINOR@
+#cmakedefine CUDNN_VERSION_PATCH @CUDNN_VERSION_PATCH@
+#cmakedefine CUDNN_VERSION_SWIG @CUDNN_VERSION_SWIG@
+
+#cmakedefine USE_OPENCL
+
+#cmakedefine ENABLE_DIST
+
+// lmdb
+#cmakedefine USE_LMDB
+
diff --git a/cmake/Thirdparty/FindCBLAS.cmake b/cmake/Thirdparty/FindCBLAS.cmake
new file mode 100644
index 0000000..76c9118
--- /dev/null
+++ b/cmake/Thirdparty/FindCBLAS.cmake
@@ -0,0 +1,29 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+# 
+#     http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# 
+
+
+FIND_PATH(CBLAS_INCLUDE_DIR NAMES cblas.h PATHS "$ENV{CBLAS_DIR}/include")
+FIND_LIBRARY(CBLAS_LIBRARIES NAMES openblas PATHS "$ENV{CBLAS_DIR}/lib")
+
+INCLUDE(FindPackageHandleStandardArgs)
+find_package_handle_standard_args(CBLAS DEFAULT_MSG CBLAS_INCLUDE_DIR CBLAS_LIBRARIES)
+
+IF(CBLAS_FOUND)
+    #    MESSAGE(STATUS "Found cblas at ${CBLAS_INCLUDE_DIR}")
+    MARK_AS_ADVANCED(CBLAS_INCLUDE_DIR CBLAS_LIBRARIES)
+ENDIF()
diff --git a/cmake/Thirdparty/FindCUDNN.cmake b/cmake/Thirdparty/FindCUDNN.cmake
new file mode 100644
index 0000000..451b79b
--- /dev/null
+++ b/cmake/Thirdparty/FindCUDNN.cmake
@@ -0,0 +1,51 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+# 
+#     http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# 
+
+
+FIND_PATH(CUDNN_INCLUDE_DIR NAME "cudnn.h" PATHS "$ENV{CMAKE_INCLUDE_PATH}")
+FIND_LIBRARY(CUDNN_LIBRARIES NAME "libcudnn.so" PATHS "$ENV{CMAKE_LIBRARY_PATH}")
+
+#message("cudnn include path:${CUDNN_INCLUDE_DIR}  lib path: ${CUDNN_LIBRARIES}")
+#message("env include path:$ENV{CUDNN_DIR} next: $ENV{CMAKE_INCLUDE_PATH}")
+INCLUDE(FindPackageHandleStandardArgs)
+find_package_handle_standard_args(CUDNN DEFAULT_MSG CUDNN_INCLUDE_DIR CUDNN_LIBRARIES)
+
+IF(CUDNN_FOUND)
+    FILE(READ ${CUDNN_INCLUDE_DIR}/cudnn.h CUDNN_VERSION_FILE_CONTENTS)
+    STRING(REGEX MATCH "define CUDNN_MAJOR * +([0-9]+)"
+        CUDNN_VERSION_MAJOR "${CUDNN_VERSION_FILE_CONTENTS}")
+    STRING(REGEX REPLACE "define CUDNN_MAJOR * +([0-9]+)" "\\1"
+        CUDNN_VERSION_MAJOR "${CUDNN_VERSION_MAJOR}")
+    STRING(REGEX MATCH "define CUDNN_MINOR * +([0-9]+)"
+        CUDNN_VERSION_MINOR "${CUDNN_VERSION_FILE_CONTENTS}")
+    STRING(REGEX REPLACE "define CUDNN_MINOR * +([0-9]+)" "\\1"
+        CUDNN_VERSION_MINOR "${CUDNN_VERSION_MINOR}")
+    STRING(REGEX MATCH "define CUDNN_PATCHLEVEL * +([0-9]+)"
+        CUDNN_VERSION_PATCH "${CUDNN_VERSION_FILE_CONTENTS}")
+    STRING(REGEX REPLACE "define CUDNN_PATCHLEVEL * +([0-9]+)" "\\1"
+        CUDNN_VERSION_PATCH "${CUDNN_VERSION_PATCH}")
+
+    IF(NOT CUDNN_VERSION_MAJOR)
+        SET(CUDNN_VERSION "???")
+    ELSE()
+      MATH(EXPR CUDNN_VERSION_SWIG "${CUDNN_VERSION_MAJOR} * 1000 + ${CUDNN_VERSION_MINOR} * 100 + ${CUDNN_VERSION_PATCH}")
+    ENDIF()
+    MESSAGE(STATUS "Found Cudnn_v${CUDNN_VERSION_SWIG} at ${CUDNN_INCLUDE_DIR} ${CUDNN_LIBRARIES}")
+    MARK_AS_ADVANCED(CUDNN_INCLUDE_DIR CUDNN_LIBRARIES)
+
+ENDIF()
diff --git a/cmake/Thirdparty/FindGlog.cmake b/cmake/Thirdparty/FindGlog.cmake
new file mode 100644
index 0000000..e18c602
--- /dev/null
+++ b/cmake/Thirdparty/FindGlog.cmake
@@ -0,0 +1,29 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+# 
+#     http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# 
+
+
+FIND_PATH(GLOG_INCLUDE_DIR NAMES glog/logging.h PATHS "$ENV{GLOG_DIR}/include")
+FIND_LIBRARY(GLOG_LIBRARIES NAMES glog)
+
+INCLUDE(FindPackageHandleStandardArgs)
+find_package_handle_standard_args(GLOG DEFAULT_MSG GLOG_INCLUDE_DIR GLOG_LIBRARIES)
+
+IF(GLOG_FOUND)
+    #    MESSAGE(STATUS "Found glog at ${GLOG_INCLUDE_DIR}")
+    MARK_AS_ADVANCED(GLOG_INCLUDE_DIR GLOG_LIBRARIES)
+ENDIF()
diff --git a/cmake/Thirdparty/FindLMDB.cmake b/cmake/Thirdparty/FindLMDB.cmake
new file mode 100644
index 0000000..0553b19
--- /dev/null
+++ b/cmake/Thirdparty/FindLMDB.cmake
@@ -0,0 +1,30 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+# 
+#     http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# 
+
+
+FIND_PATH(LMDB_INCLUDE_DIR NAMES lmdb.h PATHS "$ENV{LMDB_DIR}/include")
+FIND_LIBRARY(LMDB_LIBRARIES NAMES lmdb PATHS "$ENV{LMDB_DIR}/lib")
+
+INCLUDE(FindPackageHandleStandardArgs)
+find_package_handle_standard_args(LMDB DEFAULT_MSG LMDB_INCLUDE_DIR LMDB_LIBRARIES)
+
+IF(LMDB_FOUND)
+    MESSAGE(STATUS "Found lmdb at ${LMDB_INCLUDE_DIR}")
+    MARK_AS_ADVANCED(LMDB_INCLUDE_DIR LMDB_LIBRARIES)
+    
+ENDIF()
diff --git a/cmake/Utils.cmake b/cmake/Utils.cmake
new file mode 100644
index 0000000..a0373b8
--- /dev/null
+++ b/cmake/Utils.cmake
@@ -0,0 +1,70 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+# 
+#     http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# 
+
+
+macro(swig_generate_cxx pylist_variable)
+    if(NOT EXISTS "${CMAKE_BINARY_DIR}/python")
+        execute_process(
+            COMMAND mkdir ${CMAKE_BINARY_DIR}/python
+            COMMAND mkdir ${CMAKE_BINARY_DIR}/python/singa
+            COMMAND mkdir ${CMAKE_BINARY_DIR}/python/singa/proto
+            ERROR_QUIET)
+    endif()
+    execute_process(
+        COMMAND swig -c++ -python -I${CMAKE_SOURCE_DIR}/include 
+        -outdir ${CMAKE_BINARY_DIR}/python/singa
+        ${ARGN})
+
+    set(${pylist_variable} "${CMAKE_SOURCE_DIR}/src/python/swig/singa_wrap.cxx")
+endmacro()
+
+function (create_symlinks)
+    # Do nothing if building in-source
+    if (${CMAKE_CURRENT_BINARY_DIR} STREQUAL ${CMAKE_CURRENT_SOURCE_DIR})
+        return()
+    endif()
+
+    foreach (path_file ${ARGN})
+        get_filename_component(folder ${path_file} PATH)
+
+        # Create REAL folder
+        file(MAKE_DIRECTORY "${CMAKE_BINARY_DIR}/${folder}")
+
+        # Delete symlink if it exists
+        file(REMOVE "${CMAKE_BINARY_DIR}/${path_file}")
+
+        # Get OS dependent path to use in `execute_process`
+        file(TO_NATIVE_PATH "${CMAKE_BINARY_DIR}/${path_file}" link)
+        file(TO_NATIVE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/${path_file}" target)
+
+        if (UNIX)
+            set(command ln -s ${target} ${link})
+        else()
+            set(command cmd.exe /c mklink ${link} ${target})
+        endif()
+
+        execute_process(COMMAND ${command} 
+                        RESULT_VARIABLE result
+                        ERROR_VARIABLE output)
+
+        if (NOT ${result} EQUAL 0)
+            message(FATAL_ERROR "Could not create symbolic link for: ${target} --> ${output}")
+        endif()
+
+    endforeach(path_file)
+endfunction(create_symlinks)
diff --git a/conf/hostfile b/conf/hostfile
deleted file mode 100644
index 2fbb50c..0000000
--- a/conf/hostfile
+++ /dev/null
@@ -1 +0,0 @@
-localhost
diff --git a/conf/profile b/conf/profile
deleted file mode 100644
index 72a8600..0000000
--- a/conf/profile
+++ /dev/null
@@ -1,3 +0,0 @@
-# Please add here the environment variables that cannot be recognized after ssh.
-# This file will be `source`ed upon ssh
-
diff --git a/conf/singa.conf b/conf/singa.conf
deleted file mode 100644
index 20cff98..0000000
--- a/conf/singa.conf
+++ /dev/null
@@ -1,7 +0,0 @@
-# point to your active zookeeper service
-# this is comma separated host:port pairs, each corresponding to a zk server
-# e.g. "127.0.0.1:3000,127.0.0.1:3001,127.0.0.1:3002"
-zookeeper_host: "localhost:2181"
-
-# set if you want to change log directory
-log_dir: "/tmp/singa-log/"
diff --git a/configure.ac b/configure.ac
deleted file mode 100644
index 21f699c..0000000
--- a/configure.ac
+++ /dev/null
@@ -1,311 +0,0 @@
-#/**
-# * Licensed to the Apache Software Foundation (ASF) under one
-# * or more contributor license agreements.  See the NOTICE file
-# * distributed with this work for additional information
-# * regarding copyright ownership.  The ASF licenses this file
-# * to you under the Apache License, Version 2.0 (the
-# * "License"); you may not use this file except in compliance
-# * with the License.  You may obtain a copy of the License at
-# *
-# *     http://www.apache.org/licenses/LICENSE-2.0
-# *
-# * Unless required by applicable law or agreed to in writing, software
-# * distributed under the License is distributed on an "AS IS" BASIS,
-# * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# * See the License for the specific language governing permissions and
-# * limitations under the License.
-# */
-#                                               -*- Autoconf -*-
-# Process this file with autoconf to produce a configure script.
-
-AC_PREREQ(2.59)
-AC_INIT(singa, 0.1.0, dev@singa.incubator.apache.org)
-AC_CONFIG_AUX_DIR(config)
-AC_CONFIG_MACRO_DIR(config)
-AC_CONFIG_SRCDIR([src/utils/common.cc])
-AC_CONFIG_HEADER([config.h])
-AM_INIT_AUTOMAKE([subdir-objects foreign])
-m4_ifdef([AM_SILENT_RULES], [AM_SILENT_RULES([yes])])
-ACLOCAL_AMFLAGS = -I m4 --install
-
-# Checks for programs.
-AC_PROG_CXX([g++])
-AC_PROG_CC
-AC_DISABLE_STATIC
-AC_PROG_LIBTOOL
-
-# Checks for libraries.
-AC_SEARCH_LIBS([cblas_sgemm], [openblas], [], [
-  AC_MSG_ERROR([unable to find cblas_sgemm() function])
-  ])
-AC_SEARCH_LIBS([zmsg_new], [czmq], [], [
-  AC_MSG_ERROR([unable to find zmsg_new() function])
-  ])
-AC_CHECK_LIB([glog], [main], [], [
-  AC_MSG_ERROR([unable to find glog library])
-  ])
-AC_CHECK_LIB([protobuf], [main], [], [
-  AC_MSG_ERROR([unable to find protobuf library])
-  ])
-
-PROGS=''
-LTLIBS=''
-
-# Setup custom CUDNN paths
-AC_ARG_ENABLE([cudnn],
-    [AS_HELP_STRING(--enable-cudnn,enable CUDNN support)],
-    [enable_cudnn="yes"], [enable_cudnn="no"])
-AM_CONDITIONAL(DCUDNN, [test "$enable_cudnn" = "yes"])
-AC_ARG_WITH([cudnn],
-    [AS_HELP_STRING([--with-cudnn=PATH], [prefix where CUDNN is installed])],
-    [cudnn_prefix=$withval], [cudnn_prefix="/usr/local/cudnn"])
-if test "$cudnn_prefix" == "yes"; then
-    if test "$withval" == "yes"; then
-        cudnn_prefix="/usr/local/cudnn"
-    fi
-fi
-if test x"$enable_cudnn" == x"yes"; then
-    CUDNN_CFLAGS="-I$cudnn_prefix/include"
-    CUDNN_LDFLAGS="-L$cudnn_prefix/lib64 -L$cudnn_prefix/lib"
-    CUDNN_LIBS="-lcudnn"
-    LIBS="$LIBS $CUDNN_LIBS"
-    LDFLAGS="$LDFLAGS $CUDNN_LDFLAGS"
-    DEBUG="-DUSE_CUDNN"
-    AC_DEFINE(DCUDNN,[1],[Defined if CUDNN should be used])
-    AC_CHECK_LIB([cudnn], [main], [], [
-        AC_MSG_ERROR([unable to find cudnn library])
-        ])
-else
-    CUDNN_CFLAGS=""
-    CUDNN_LDFLAGS=""
-    CUDNN_LIBS=""
-fi
-AC_SUBST(CUDNN_CFLAGS)
-AC_SUBST(CUDNN_LDFLAGS)
-AC_SUBST(CUDNN_LIBS)
-
-# Setup custom CUDA paths
-AC_ARG_ENABLE(cuda,
-  [AS_HELP_STRING(--enable-cuda,enable CUDA support)],
-    cudaval="yes",
-    cudaval="no")
-AM_CONDITIONAL(DCUDA, [test "$cudaval" = "yes"])
-AC_ARG_WITH([cuda],
-   [AS_HELP_STRING(
-        [--with-cuda=PATH],
-        [prefix where CUDA is installed])],
-   [cuda_prefix=$withval],
-   [cuda_prefix="/usr/local/cuda"])
-if test "$cuda_prefix" == "yes"; then
-    if test "$withval" == "yes"; then
-        cuda_prefix="/usr/local/cuda"
-    fi
-fi
-if test x"$cudaval" = x"yes"; then
-    AC_MSG_CHECKING([nvcc in $cuda_prefix/bin])
-    if test -x "$cuda_prefix/bin/nvcc"; then
-        AC_MSG_RESULT([found])
-        AC_DEFINE_UNQUOTED([NVCC_PATH], ["$cuda_prefix/bin/nvcc"], [Path to nvcc binary])
-    else
-        AC_MSG_RESULT([not found!])
-        AC_MSG_FAILURE([nvcc was not found in $cuda_prefix/bin])
-    fi
-    CUDA_CFLAGS="-I$cuda_prefix/include"
-    CUDA_LDFLAGS="-L$cuda_prefix/lib64 -L$cuda_prefix/lib"
-    CUDA_LIBS="-lcublas -lcudart -lcurand"
-    LIBS="$LIBS $CUDA_LIBS"
-    LDFLAGS="$LDFLAGS $CUDA_LDFLAGS -L./"
-    LIBTOOL='LD_LIBRARY_PATH=$(PWD) $(SHELL) $(top_builddir)/libtool'
-    NVCC="nvcc"
-  DEBUG+=" -DUSE_GPU"
-    AC_DEFINE(DCUDA,[1],[Defined if CUDA should be used])
-  AC_CHECK_LIB([cublas], [main], [], [
-     AC_MSG_ERROR([unable to find cuda library])
-    ])
-  AC_CHECK_LIB([cudart], [main], [], [
-      AC_MSG_ERROR([unable to find cudart library])
-    ])
-  AC_CHECK_LIB([curand], [main], [], [
-      AC_MSG_ERROR([unable to find curand library])
-    ])
-else
-    CUDA_CFLAGS=""
-    CUDA_LDFLAGS=""
-    CUDA_LIBS=""
-    NVCC=""
-    DEBUG="-DCPU_ONLY"
-fi
-AC_SUBST(LIBTOOL)
-AC_SUBST(NVCC)
-AC_SUBST(CUDA_LDFLAGS)
-AC_SUBST(CUDA_LIBS)
-AC_SUBST(CUDA_CFLAGS)
-
-# Setup custom zookeeper and zmq paths
-AC_ARG_ENABLE(dist,
-  AS_HELP_STRING([--enable-dist],[enable dist support]),
-  [enable_dist="yes"],[enable_dist="no"])
-AM_CONDITIONAL(DDIST, test "$enable_dist" = "yes")
-AC_ARG_WITH([dist],
-    [AS_HELP_STRING([--with-dist=PATH], [prefix where dist libraries,i.e.
-     zookeeper/zmq is installed])],
-    [dist_prefix=$withval], [dist_prefix="/usr/local"])
-if test "$dist_prefix" == "yes"; then
-    if test "$withval" == "yes"; then
-        dist_prefix="/usr/local"
-    fi
-fi
-if test x"$enable_dist" == x"yes"; then
-  AC_CHECK_LIB([zookeeper_mt], [main], [], [
-                AC_MSG_ERROR([unable to find zookeeper library])
-        ])
-  AC_SEARCH_LIBS([zmq_ctx_new], [zmq], [], [
-                  AC_MSG_ERROR([unable to find zmq_ctx_new() function])
-        ])
-  DIST_CFLAGS="-I$dist_prefix/include"
-  DIST_LDFLAGS="-L$dist_prefix/lib"
-  DIST_LIBS="-lzookeeper_mt -lzmq"
-  LIBS="$LIBS $DIST_LIBS"
-  LDFLAGS="$LDFLAGS $DIST_LDFLAGS"
-  DEBUG+=" -DUSE_ZOOKEEPER -DUSE_ZMQ"
-  AC_DEFINE(DDIST,[1],[Defined if dist should be used])
-else
-  DIST_CFLAGS=""
-  DIST_LDFLAGS=""
-  DIST_LIBS=""
-fi
-AC_SUBST(DIST_CFLAGS)
-AC_SUBST(DIST_LDFLAGS)
-AC_SUBST(DIST_LIBS)
-
-# Setup custom lmdb paths
-AC_ARG_ENABLE(lmdb,
-     AS_HELP_STRING([--enable-lmdb],[enable debug option]),
-   [enable_lmdb=yes],[enable_lmdb=no])
-AM_CONDITIONAL(LMDB, test "$enable_lmdb" = yes)
-if test x"$enable_lmdb" = x"yes"; then
-  AC_SEARCH_LIBS([mdb_env_create], [lmdb], [], [
-    AC_MSG_ERROR([unable to find mdb_env_create() function])
-    ])
-  AC_DEFINE(LMDB, 1, [Enable Option layer])
-fi
-
-# Setup custom libhdfs paths
-AC_ARG_ENABLE(hdfs,
-  AS_HELP_STRING([--enable-hdfs],[enable hdfs support]),
-  [enable_hdfs=yes],[enable_hdfs=no])
-AM_CONDITIONAL(DHDFS, test "$enable_hdfs" = yes)
-AC_ARG_WITH([libhdfs],
-    [AS_HELP_STRING([--with-libhdfs=PATH], [prefix where libhdfs is installed])],
-    [hdfs_prefix=$withval], [hdfs_prefix="/usr/local"])
-if test "$hdfs_prefix" == "yes"; then
-    if test "$withval" == "yes"; then
-        cudnn_prefix="/usr/local"
-    fi
-fi
-if test x"$enable_hdfs" != x"no"; then
-  HDFS_CFLAGS="-I$hdfs_prefix/include"
-  HDFS_LDFLAGS="-L$hdfs_prefix/lib"
-  HDFS_LIBS="-lhdfs3"
-  LIBS="$LIBS $HDFS_LIBS"
-  LDFLAGS="$LDFLAGS $HDFS_LDFLAGS"
-  DEBUG+=" -DUSE_HDFS"
-  AC_DEFINE(DHDFS,[1],[Defined if HDFS should be used])
-  AC_CHECK_LIB([hdfs3], [main], [], [
-      AC_MSG_ERROR([unable to find hdfs3 library])
-      ])
-else
-  HDFS_CFLAGS=""
-  HDFS_LDFLAGS=""
-  HDFS_LIBS=""
-fi
-AC_SUBST(HDFS_CFLAGS)
-AC_SUBST(HDFS_LDFLAGS)
-AC_SUBST(HDFS_LIBS)
-
-# Setup for test args
-AC_ARG_ENABLE(test,
-  AS_HELP_STRING([--enable-test],[enable singa test]),
-  [enable_test=yes],[enable_test=no])
-AM_CONDITIONAL(SINGATEST, test "$enable_test" = yes)
-if test x"$enable_test" != x"no"; then
-  PROGS+='singatest test '
-  LTLIBS+='libgtest.la '
-else
-  PROGS+=''
-  LTLIBS+=''
-fi
-
-# Setup for debug args
-AC_ARG_ENABLE(debug,
-  AS_HELP_STRING([--enable-debug],[enable debug mode]),
-  [enable_debug=yes],[enable_debug=no])
-AM_CONDITIONAL(SINGADEBUG, [test "$enable_debug" = yes])
-if test x"$enable_debug" == x"yes"; then
-  DEBUG+=' -g'
-else
-  DEBUG+=' -O2'
-fi
-
-# Setup for python args
-AC_ARG_ENABLE(python,
-    [AS_HELP_STRING([--enable-python],[enable python binding])],
-    [enable_python=yes],[enable_python=no])
-AM_CONDITIONAL([PY], [test "enable_python" = "yes"])
-if test x"$enable_python" != x"no"; then
-    AC_CHECK_PROG(PY_CHECK,python,yes)
-    if test x"$PY_CHECK" != x"yes"; then
-        AC_MSG_ERROR([Cannot find command "python". Please intall before make.])
-    else
-        PYLIBS="-lpython`python -V 2>&1 | awk '{print substr($2,1,3)}'`"
-    fi
-fi
-AC_SUBST(PYLIBS)
-
-# Setup for python paths
-AC_ARG_WITH([python],
-    [AS_HELP_STRING([--with-python=PATH],[prefix where python is installed])],
-    [python_prefix=$withval],[python_prefix="/usr/include/python`python -V 2>&1 | awk '{print substr($2,1,3)}'`"])
-if test "$python_prefix" == "yes"; then
-    if test "$withval" == "yes"; then
-        python_prefix="/usr/include/python`python -V 2>&1 | awk '{print substr($2,1,3)}'`"
-    fi
-fi
-if test x"$enable_python" != x"no"; then
-    AC_MSG_CHECKING([Python.h in $python_prefix])
-    if test -f "$python_prefix/Python.h"; then
-        AC_MSG_RESULT([Python.h found])
-        AC_DEFINE_UNQUOTED([PYTHON_PATH], ["$python_prefix"], [Path to python binary])
-    else
-        AC_MSG_RESULT([Python.h not found!])
-        AC_MSG_FAILURE([Python.h was not found in $python_prefix])
-    fi
-    #AC_DEFINE(PY,[1],[Defined if PY should be used])
-    PY_PROGS='_driver.la '
-    PYFLAGS="-I$python_prefix "
-else
-    PY_PROGS=''
-    PYFLAGS=''
-fi
-AC_SUBST([PROGS])
-AC_SUBST([LTLIBS])
-AC_SUBST([DEBUG])
-AC_SUBST([PYFLAGS])
-AC_SUBST([PY_PROGS])
-
-# Checks for header files.
-AC_HEADER_STDC
-AC_CHECK_HEADERS([fcntl.h malloc.h stdlib.h])
-
-# Checks for typedefs, structures, and compiler characteristics.
-AC_HEADER_STDBOOL
-AC_C_CONST
-AC_C_INLINE
-AC_TYPE_SIZE_T
-AC_C_VOLATILE
-
-# Checks for library functions.
-AC_FUNC_MALLOC
-AC_FUNC_STAT
-AC_CHECK_FUNCS([gethostname memset mkdir pow sqrt])
-AC_OUTPUT(Makefile)
diff --git a/Doxyfile b/doc/Doxyfile
similarity index 99%
rename from Doxyfile
rename to doc/Doxyfile
index 79d2d7b..5fc0e94 100644
--- a/Doxyfile
+++ b/doc/Doxyfile
@@ -44,7 +44,7 @@
 # for a project that appears at the top of each page and should give viewer a
 # quick idea about the purpose of the project. Keep the description short.
 
-PROJECT_BRIEF          = "A General Distributed Deep Learning Platform"
+PROJECT_BRIEF          = "A General Distributed Deep Learning Library"
 
 # With the PROJECT_LOGO tag one can specify a logo or an icon that is included
 # in the documentation. The maximum height of the logo should not exceed 55
@@ -58,7 +58,7 @@
 # entered, it will be relative to the location where doxygen was started. If
 # left blank the current directory will be used.
 
-OUTPUT_DIRECTORY       = doc/doxygen
+OUTPUT_DIRECTORY       = doxygen
 
 # If the CREATE_SUBDIRS tag is set to YES then doxygen will create 4096 sub-
 # directories (in 2 levels) under the output directory of each output format and
@@ -758,7 +758,7 @@
 # spaces.
 # Note: If this tag is empty the current directory is searched.
 
-INPUT                  =  "include" "src"
+INPUT                  =  "../include"
 
 # This tag can be used to specify the character encoding of the source files
 # that doxygen parses. Internally doxygen uses the UTF-8 encoding. Doxygen uses
diff --git a/doc/Readme.md b/doc/Readme.md
deleted file mode 100644
index 2fe1af8..0000000
--- a/doc/Readme.md
+++ /dev/null
@@ -1,3 +0,0 @@
-To generate docs, run "doxygen" in the incubator-singa/ directory
-
-Doxygen >= 1.8 recommended
diff --git a/doc/_static/apache.jpg b/doc/_static/apache.jpg
new file mode 100755
index 0000000..697e02d
--- /dev/null
+++ b/doc/_static/apache.jpg
Binary files differ
diff --git a/doc/_static/images/mlp-net.png b/doc/_static/images/mlp-net.png
new file mode 100644
index 0000000..742c5dc
--- /dev/null
+++ b/doc/_static/images/mlp-net.png
Binary files differ
diff --git a/doc/_static/images/model-category.png b/doc/_static/images/model-category.png
new file mode 100644
index 0000000..608340c
--- /dev/null
+++ b/doc/_static/images/model-category.png
Binary files differ
diff --git a/doc/_static/images/overview.png b/doc/_static/images/overview.png
new file mode 100644
index 0000000..a3244b3
--- /dev/null
+++ b/doc/_static/images/overview.png
Binary files differ
diff --git a/doc/_static/images/partition_fc.png b/doc/_static/images/partition_fc.png
new file mode 100644
index 0000000..030f18a
--- /dev/null
+++ b/doc/_static/images/partition_fc.png
Binary files differ
diff --git a/doc/_static/images/rbm-rnn.png b/doc/_static/images/rbm-rnn.png
new file mode 100644
index 0000000..892c5bd
--- /dev/null
+++ b/doc/_static/images/rbm-rnn.png
Binary files differ
diff --git a/doc/_static/images/sgd.png b/doc/_static/images/sgd.png
new file mode 100644
index 0000000..a0ec66f
--- /dev/null
+++ b/doc/_static/images/sgd.png
Binary files differ
diff --git a/doc/_static/images/singa.png b/doc/_static/images/singa.png
new file mode 100644
index 0000000..d9ce10f
--- /dev/null
+++ b/doc/_static/images/singa.png
Binary files differ
diff --git a/doc/_static/images/singav1-sw.png b/doc/_static/images/singav1-sw.png
new file mode 100644
index 0000000..e443c6e
--- /dev/null
+++ b/doc/_static/images/singav1-sw.png
Binary files differ
diff --git a/doc/_static/singa.png b/doc/_static/singa.png
new file mode 100755
index 0000000..30be5c1
--- /dev/null
+++ b/doc/_static/singa.png
Binary files differ
diff --git a/doc/_static/style.css b/doc/_static/style.css
new file mode 100644
index 0000000..b07bdb1
--- /dev/null
+++ b/doc/_static/style.css
@@ -0,0 +1,3 @@
+.wy-nav-content {
+    max-width: none;
+}
diff --git a/doc/_templates/layout.html b/doc/_templates/layout.html
new file mode 100755
index 0000000..b149652
--- /dev/null
+++ b/doc/_templates/layout.html
@@ -0,0 +1,57 @@
+{#
+ Licensed to the Apache Software Foundation (ASF) under one
+ or more contributor license agreements.  See the NOTICE file
+ distributed with this work for additional information
+ regarding copyright ownership.  The ASF licenses this file
+ to you under the Apache License, Version 2.0 (the
+ "License"); you may not use this file except in compliance
+ with the License.  You may obtain a copy of the License at
+
+      http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+#}
+{% extends "!layout.html" %}
+
+{% block extrahead %}
+    <link href="{{ pathto("_static/style.css", True) }}" rel="stylesheet" type="text/css">
+{% endblock %}
+
+{% block footer %}
+
+<div class="rst-versions shift-up" data-toggle="rst-versions" role="note" aria-label="versions">
+<a href="http://incubator.apache.org/">
+<img src= "{{pathto('_static/'+ 'apache.jpg' , 1) }}">
+</a>
+
+  <span class="rst-current-version" data-toggle="rst-current-version">
+    <span class="fa fa-book"> incubator-singa </span>
+    v: {{ version }}
+    <span class="fa fa-caret-down"></span>
+  </span>
+    <div class="rst-other-versions">
+        <dl>
+            <dt>Languages</dt>
+            <dd><a href="{{ pathto('../en/index.html', 1) }}">English</a></dd>
+            <dd><a href="{{ pathto('../zh/index.html', 1) }}">中文</a></dd>
+        </dl>
+        <dl>
+            <dt>Versions</dt>
+            <dd><a href="http://singa.apache.org/v0.3.0/">0.3</a></dd>
+        </dl>
+
+    </div>
+</div>
+
+ <a href="https://github.com/apache/incubator-singa">
+    <img style="position: absolute; top: 0; right: 0; border: 0; z-index: 10000;"
+        src="https://s3.amazonaws.com/github/ribbons/forkme_right_orange_ff7600.png"
+        alt="Fork me on GitHub">
+</a>
+
+{{ super() }}
+{% endblock %}
diff --git a/doc/build.sh b/doc/build.sh
new file mode 100755
index 0000000..eb5b90c
--- /dev/null
+++ b/doc/build.sh
@@ -0,0 +1,38 @@
+#!/bin/bash
+
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+SPHINXBUILD="sphinx-build"
+BUILDDIR="_build"
+LANG_ARR=(en zh)
+
+if [ "$1"x = "clean"x ]; then
+	rm -rf $BUILDDIR/*
+	rm -rf en/docs/examples
+	echo "clean up $BUILDDIR"
+fi
+
+
+if [ "$1"x = "html"x ]; then
+	cp -rf ../examples en/docs/
+	for (( i=0; i<${#LANG_ARR[@]}; i++)) do
+		echo "building language ${LANG_ARR[i]} ..."
+		$SPHINXBUILD -b html -c . -d $BUILDDIR/doctree ${LANG_ARR[i]} $BUILDDIR/html/${LANG_ARR[i]}
+	done
+	echo "<script language=\"javascript\" type=\"text/javascript\">window.location.href='en/index.html';</script>" > $BUILDDIR/html/index.html
+fi
diff --git a/doc/conf.py b/doc/conf.py
new file mode 100755
index 0000000..08e391e
--- /dev/null
+++ b/doc/conf.py
@@ -0,0 +1,354 @@
+# -*- coding: utf-8 -*-
+#
+# incubator-singa documentation build configuration file, created by
+# sphinx-quickstart on Sat Jul  9 20:36:57 2016.
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# This file is execfile()d with the current directory set to its
+# containing dir.
+#
+# Note that not all possible configuration values are present in this
+# autogenerated file.
+#
+# All configuration values have a default; values that are commented out
+# serve to show the default.
+
+# If extensions (or modules to document with autodoc) are in another directory,
+# add these directories to sys.path here. If the directory is relative to the
+# documentation root, use os.path.abspath to make it absolute, like shown here.
+#
+import os
+import sys
+sys.path.insert(0, os.path.abspath('.'))
+sys.path.insert(1, os.path.abspath('../build/python/'))
+
+# -- General configuration ------------------------------------------------
+from recommonmark.parser import CommonMarkParser
+
+source_parsers = {
+    '.md': CommonMarkParser,
+}
+
+# If your documentation needs a minimal Sphinx version, state it here.
+#
+# needs_sphinx = '1.0'
+
+# Add any Sphinx extension module names here, as strings. They can be
+# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
+# ones.
+extensions = ['sphinx.ext.autodoc', 'sphinx.ext.napoleon']
+napoleon_google_docstring = True
+# Add any paths that contain templates here, relative to this directory.
+templates_path = ['_templates']
+
+# The suffix(es) of source filenames.
+# You can specify multiple suffix as a list of string:
+#
+# source_suffix = ['.rst', '.md']
+source_suffix = ['.rst', '.md']
+
+# The encoding of source files.
+#
+# source_encoding = 'utf-8-sig'
+
+# The master toctree document.
+master_doc = 'index'
+
+# General information about the project.
+project = u'incubator-singa'
+copyright = u'2016 The Apache Software Foundation. All rights reserved. Apache Singa, Apache, the Apache feather logo, and the Apache Singa project logos are trademarks of The Apache Software Foundation. All other marks mentioned may be trademarks or registered trademarks of their respective owners.'
+author = u'moaz'
+
+# The version info for the project you're documenting, acts as replacement for
+# |version| and |release|, also used in various other places throughout the
+# built documents.
+#
+# The short X.Y version.
+version = u'1.0.0'
+# The full version, including alpha/beta/rc tags.
+release = u'1.0.0'
+
+# The language for content autogenerated by Sphinx. Refer to documentation
+# for a list of supported languages.
+#
+# This is also used if you do content translation via gettext catalogs.
+# Usually you set "language" from the command line for these cases.
+language = None
+
+# There are two options for replacing |today|: either, you set today to some
+# non-false value, then it is used:
+#
+# today = ''
+#
+# Else, today_fmt is used as the format for a strftime call.
+#
+# today_fmt = '%B %d, %Y'
+
+# List of patterns, relative to source directory, that match files and
+# directories to ignore when looking for source files.
+# This patterns also effect to html_static_path and html_extra_path
+exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store']
+
+# The reST default role (used for this markup: `text`) to use for all
+# documents.
+#
+# default_role = None
+
+# If true, '()' will be appended to :func: etc. cross-reference text.
+#
+# add_function_parentheses = True
+
+# If true, the current module name will be prepended to all description
+# unit titles (such as .. function::).
+#
+# add_module_names = True
+
+# If true, sectionauthor and moduleauthor directives will be shown in the
+# output. They are ignored by default.
+#
+# show_authors = False
+
+# The name of the Pygments (syntax highlighting) style to use.
+pygments_style = 'sphinx'
+
+# A list of ignored prefixes for module index sorting.
+# modindex_common_prefix = []
+
+# If true, keep warnings as "system message" paragraphs in the built documents.
+# keep_warnings = False
+
+# If true, `todo` and `todoList` produce output, else they produce nothing.
+todo_include_todos = False
+
+
+# -- Options for HTML output ----------------------------------------------
+
+# The theme to use for HTML and HTML Help pages.  See the documentation for
+# a list of builtin themes.
+#
+html_theme = 'sphinx_rtd_theme'
+
+# Theme options are theme-specific and customize the look and feel of a theme
+# further.  For a list of options available for each theme, see the
+# documentation.
+#
+# html_theme_options = {}
+
+# Add any paths that contain custom themes here, relative to this directory.
+# html_theme_path = []
+
+# The name for this set of Sphinx documents.
+# "<project> v<release> documentation" by default.
+#
+# html_title = u'Singa v1.0.0'
+
+# A shorter title for the navigation bar.  Default is the same as html_title.
+#
+# html_short_title = None
+
+# The name of an image file (relative to this directory) to place at the top
+# of the sidebar.
+#
+html_logo = '/singa.png'
+
+# The name of an image file (relative to this directory) to use as a favicon of
+# the docs.  This file should be a Windows icon file (.ico) being 16x16 or 32x32
+# pixels large.
+#
+# html_favicon = None
+
+# Add any paths that contain custom static files (such as style sheets) here,
+# relative to this directory. They are copied after the builtin static files,
+# so a file named "default.css" will overwrite the builtin "default.css".
+html_static_path = ['_static']
+
+# Add any extra paths that contain custom files (such as robots.txt or
+# .htaccess) here, relative to this directory. These files are copied
+# directly to the root of the documentation.
+#
+# html_extra_path = []
+
+# If not None, a 'Last updated on:' timestamp is inserted at every page
+# bottom, using the given strftime format.
+# The empty string is equivalent to '%b %d, %Y'.
+#
+# html_last_updated_fmt = None
+
+# If true, SmartyPants will be used to convert quotes and dashes to
+# typographically correct entities.
+#
+# html_use_smartypants = True
+
+# Custom sidebar templates, maps document names to template names.
+#
+# html_sidebars = {}
+
+# Additional templates that should be rendered to pages, maps page names to
+# template names.
+#
+# html_additional_pages = {}
+
+# If false, no module index is generated.
+#
+# html_domain_indices = True
+
+# If false, no index is generated.
+#
+# html_use_index = True
+
+# If true, the index is split into individual pages for each letter.
+#
+# html_split_index = False
+
+# If true, links to the reST sources are added to the pages.
+#
+html_show_sourcelink = False
+
+# If true, "Created using Sphinx" is shown in the HTML footer. Default is True.
+#
+# html_show_sphinx = True
+
+# If true, "(C) Copyright ..." is shown in the HTML footer. Default is True.
+#
+# html_show_copyright = True
+
+# If true, an OpenSearch description file will be output, and all pages will
+# contain a <link> tag referring to it.  The value of this option must be the
+# base URL from which the finished HTML is served.
+#
+# html_use_opensearch = ''
+
+# This is the file name suffix for HTML files (e.g. ".xhtml").
+# html_file_suffix = None
+
+# Language to be used for generating the HTML full-text search index.
+# Sphinx supports the following languages:
+#   'da', 'de', 'en', 'es', 'fi', 'fr', 'hu', 'it', 'ja'
+#   'nl', 'no', 'pt', 'ro', 'ru', 'sv', 'tr', 'zh'
+#
+# html_search_language = 'en'
+
+# A dictionary with options for the search language support, empty by default.
+# 'ja' uses this config value.
+# 'zh' user can custom change `jieba` dictionary path.
+#
+# html_search_options = {'type': 'default'}
+
+# The name of a javascript file (relative to the configuration directory) that
+# implements a search results scorer. If empty, the default will be used.
+#
+# html_search_scorer = 'scorer.js'
+
+# Output file base name for HTML help builder.
+htmlhelp_basename = 'Singadoc'
+
+# -- Options for LaTeX output ---------------------------------------------
+
+latex_elements = {
+     # The paper size ('letterpaper' or 'a4paper').
+     #
+     # 'papersize': 'letterpaper',
+
+     # The font size ('10pt', '11pt' or '12pt').
+     #
+     # 'pointsize': '10pt',
+
+     # Additional stuff for the LaTeX preamble.
+     #
+     # 'preamble': '',
+
+     # Latex figure (float) alignment
+     #
+     # 'figure_align': 'htbp',
+}
+
+# Grouping the document tree into LaTeX files. List of tuples
+# (source start file, target name, title,
+#  author, documentclass [howto, manual, or own class]).
+latex_documents = [
+    (master_doc, 'incubator-singa.tex', u'incubator-singa Documentation',
+     u'moaz', 'manual'),
+]
+
+# The name of an image file (relative to this directory) to place at the top of
+# the title page.
+#
+# latex_logo = None
+
+# For "manual" documents, if this is true, then toplevel headings are parts,
+# not chapters.
+#
+# latex_use_parts = False
+
+# If true, show page references after internal links.
+#
+# latex_show_pagerefs = False
+
+# If true, show URL addresses after external links.
+#
+# latex_show_urls = False
+
+# Documents to append as an appendix to all manuals.
+#
+# latex_appendices = []
+
+# If false, no module index is generated.
+#
+# latex_domain_indices = True
+
+
+# -- Options for manual page output ---------------------------------------
+
+# One entry per manual page. List of tuples
+# (source start file, name, description, authors, manual section).
+man_pages = [
+    (master_doc, 'incubator-singa', u'incubator-singa Documentation',
+     [author], 1)
+]
+
+# If true, show URL addresses after external links.
+#
+# man_show_urls = False
+
+
+# -- Options for Texinfo output -------------------------------------------
+
+# Grouping the document tree into Texinfo files. List of tuples
+# (source start file, target name, title, author,
+#  dir menu entry, description, category)
+texinfo_documents = [
+    (master_doc, 'incubator-singa', u'incubator-singa Documentation',
+     author, 'incubator-singa', 'One line description of project.',
+     'Miscellaneous'),
+]
+
+# Documents to append as an appendix to all manuals.
+#
+# texinfo_appendices = []
+
+# If false, no module index is generated.
+#
+# texinfo_domain_indices = True
+
+# How to display URL addresses: 'footnote', 'no', or 'inline'.
+#
+# texinfo_show_urls = 'footnote'
+
+# If true, do not generate a @detailmenu in the "Top" node's menu.
+#
+# texinfo_no_detailmenu = False
diff --git a/doc/en/_templates/layout.html b/doc/en/_templates/layout.html
new file mode 100755
index 0000000..2f9ca0d
--- /dev/null
+++ b/doc/en/_templates/layout.html
@@ -0,0 +1,56 @@
+{#
+ Licensed to the Apache Software Foundation (ASF) under one
+ or more contributor license agreements.  See the NOTICE file
+ distributed with this work for additional information
+ regarding copyright ownership.  The ASF licenses this file
+ to you under the Apache License, Version 2.0 (the
+ "License"); you may not use this file except in compliance
+ with the License.  You may obtain a copy of the License at
+
+      http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+#}
+{% extends "!layout.html" %}
+
+{% block extrahead %}
+    <link href="{{ pathto("_static/style.css", True) }}" rel="stylesheet" type="text/css">
+{% endblock %}
+
+{% block footer %}
+
+<div class="rst-versions shift-up" data-toggle="rst-versions" role="note" aria-label="versions">
+<a href="http://incubator.apache.org/">
+<img src= "{{pathto('_static/'+ 'apache.jpg' , 1) }}">
+</a>
+
+  <span class="rst-current-version" data-toggle="rst-current-version">
+    <span class="fa fa-book"> incubator-singa </span>
+    v: {{ version }}
+    <span class="fa fa-caret-down"></span>
+  </span>
+  <div class="rst-other-versions">
+    <dl>
+       <dd><a href="">English</a></dd>
+       <dd><a href="{{pathto('zh/'+ 'index.html' , 1) }}">中文</a></dd>
+	  <!--dd><a href="/jp/latest/">日本語</a></dd>
+	  <dd><a href="/kr/latest/">한국어</a></dd>
+	  <dd><a href="/it/latest/">Italiano</a></dd>
+	  <dd><a href="/ar/latest/">العربية</a></dd-->
+    </dl>
+
+  </div>
+</div>
+
+ <a href="https://github.com/apache/incubator-singa">
+    <img style="position: absolute; top: 0; right: 0; border: 0; z-index: 10000;"
+        src="https://s3.amazonaws.com/github/ribbons/forkme_right_orange_ff7600.png"
+        alt="Fork me on GitHub">
+</a>
+
+{{ super() }}
+{% endblock %}
diff --git a/doc/en/community/issue-tracking.md b/doc/en/community/issue-tracking.md
new file mode 100644
index 0000000..26b23dd
--- /dev/null
+++ b/doc/en/community/issue-tracking.md
@@ -0,0 +1,9 @@
+## Issue Tracking
+
+___
+
+SINGA uses [JIRA](https://www.atlassian.com/software/jira), a J2EE-based issue tracking and project management application.
+
+Issues, bugs, and feature requests should be submitted to the following issue tracking system for this project.
+
+* https://issues.apache.org/jira/browse/singa
diff --git a/doc/en/community/mail-lists.rst b/doc/en/community/mail-lists.rst
new file mode 100644
index 0000000..a170042
--- /dev/null
+++ b/doc/en/community/mail-lists.rst
@@ -0,0 +1,28 @@
+.. Licensed to the Apache Software Foundation (ASF) under one
+   or more contributor license agreements.  See the NOTICE file
+   distributed with this work for additional information
+   regarding copyright ownership.  The ASF licenses this file
+   to you under the Apache License, Version 2.0 (the
+   "License"); you may not use this file except in compliance
+   with the License.  You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing,
+   software distributed under the License is distributed on an
+   "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+   KIND, either express or implied.  See the License for the
+   specific language governing permissions and limitations
+   under the License.
+
+
+Project Mailing Lists
+=====================
+
+These are the mailing lists that have been established for this project. For each list, there is a subscribe, unsubscribe, and an archive link.
+
+.. csv-table:: Mailing Lists
+	:header: "Name", "Post", "Subscribe", "Unsubscribe", "Archive"
+
+        "Development", "dev@singa.incubator.apache.org", "`Subscribe <mailto:dev-subscribe@singa.incubator.apache.org>`_", "`Unsubscribe <mailto:dev-unsubscribe@singa.incubator.apache.org>`_", "`mail-archives.apache.org <http://mail-archives.apache.org/mod_mbox/singa-dev/>`_"
+        "Commits", "commits@singa.incubator.apache.org", "`Subscribe <mailto:commits-subscribe@singa.incubator.apache.org>`_", "`Unsubscribe <mailto:commits-unsubscribe@singa.incubator.apache.org>`_", "`mail-archives.apache.org  <http://mail-archives.apache.org/mod_mbox/singa-commits/>`_"
diff --git a/doc/en/community/source-repository.md b/doc/en/community/source-repository.md
new file mode 100644
index 0000000..8864629
--- /dev/null
+++ b/doc/en/community/source-repository.md
@@ -0,0 +1,22 @@
+# Source Repository
+
+___
+
+This project uses [Git](http://git-scm.com/) to manage its source code. Instructions on Git use can be found at [http://git-scm.com/documentation](http://git-scm.com/documentation).
+
+## Web Access
+
+The following is a link to the online source repository.
+
+* [https://git-wip-us.apache.org/repos/asf?p=incubator-singa.git;a=summary](https://git-wip-us.apache.org/repos/asf?p=incubator-singa.git;a=summary)
+
+
+## Upstream for committers
+
+Committers need to set the upstream endpoint to the Apache git (not github) repo address, e.g.,
+
+    $ git remote add asf https://git-wip-us.apache.org/repos/asf/incubator-singa.git
+
+Then you (committer) can push your code in this way,
+
+    $ git push asf <local-branch>:<remote-branch>
diff --git a/doc/en/community/team-list.rst b/doc/en/community/team-list.rst
new file mode 100644
index 0000000..abff0a8
--- /dev/null
+++ b/doc/en/community/team-list.rst
@@ -0,0 +1,82 @@
+.. Licensed to the Apache Software Foundation (ASF) under one
+   or more contributor license agreements.  See the NOTICE file
+   distributed with this work for additional information
+   regarding copyright ownership.  The ASF licenses this file
+   to you under the Apache License, Version 2.0 (the
+   "License"); you may not use this file except in compliance
+   with the License.  You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing,
+   software distributed under the License is distributed on an
+   "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+   KIND, either express or implied.  See the License for the
+   specific language governing permissions and limitations
+   under the License.
+
+
+The SINGA Team
+==============
+
+A successful project requires many people to play many roles. Some members write code or documentation, while others are valuable as testers, submitting patches and suggestions.
+
+Mentors
+-------
+
+==================   ============
+Name                 Email
+==================   ============
+Daniel Dai           daijy@apache.org
+Ted Dunning	     tdunning@apache.org
+Alan Gates	     gates@apache.org
+Thejas Nair	     thejas@apache.org
+==================   ============
+
+Developers
+----------
+
++-------------------+--------------------------------+----------------------------------------------+
+| Name              |  Email                         |  Organization                                |
++-------------------+--------------------------------+----------------------------------------------+
+|Gang Chen          |  cg@zju.edu.cn                 |   Zhejiang University                        |
++-------------------+--------------------------------+----------------------------------------------+
+| Haibo Chen        | hzchenhaibo@corp.netease.com   |  NetEase                                     |
++-------------------+--------------------------------+----------------------------------------------+
+| Anh Dinh	    |     dinhtta@apache.org	     |         National University of Singapore     |
++-------------------+--------------------------------+----------------------------------------------+
+| Jinyang Gao	    |     jinyang@apache.org	     |         National University of Singapore	    |
++-------------------+--------------------------------+----------------------------------------------+
+| Xing Ji	    |         jixin@comp.nus.edu.sg  |          National University of Singapore    |
++-------------------+--------------------------------+----------------------------------------------+
+| Chonho Lee	    |  chonho@gmail.com              |   National University of Singapore           |
++-------------------+--------------------------------+----------------------------------------------+
+| Zhaojing Luo	    | zhaojing@apache.org	     | National University of Singapore	            |
++-------------------+--------------------------------+----------------------------------------------+
+| Beng Chin Ooi	    | ooibc@comp.nus.edu.sg          | National University of Singapore	            |
++-------------------+--------------------------------+----------------------------------------------+
+| Kian-Lee Tan	    |    tankl@apache.org            | National University of Singapore	            |
++-------------------+--------------------------------+----------------------------------------------+
+|Anthony K. H. Tung |  atung@comp.nus.edu.sg         |   National University of Singapore	    |
++-------------------+--------------------------------+----------------------------------------------+
+| Ji Wang	    |         wangji@comp.nus.edu.sg |	      National University of Singapore	    |
++-------------------+--------------------------------+----------------------------------------------+
+| Sheng Wang	    |    wangsh@apache.org           | National University of Singapore	            |
++-------------------+--------------------------------+----------------------------------------------+
+| Wei Wang	    |    wangwei@apache.org	     |         National University of Singapore	    |
++-------------------+--------------------------------+----------------------------------------------+
+| Yuan Wang         |  wangyuan@corp.netease.com     |   NetEase                                    |
++-------------------+--------------------------------+----------------------------------------------+
+| Wenfeng Wu	    |     wuwf@comp.nus.edu.sg       |  National University of Singapore            |
++-------------------+--------------------------------+----------------------------------------------+
+| Zhongle Xie	    |     zhongle@apache.org	     |        National University of Singapore      |
++-------------------+--------------------------------+----------------------------------------------+
+| Meihui Zhang	    |     meihui_zhang@sutd.edu.sg   |Singapore University of Technology and Design |
++-------------------+--------------------------------+----------------------------------------------+
+| Kaiping Zheng     |     kaiping@apache.org	     |         National University of Singapore	    |
++-------------------+--------------------------------+----------------------------------------------+
+| Ming Zhong        | hzzhongming15@corp.netease.com |   Zhejiang University                        |
++-------------------+--------------------------------+----------------------------------------------+
+
+
+
diff --git a/doc/en/develop/contribute-code.md b/doc/en/develop/contribute-code.md
new file mode 100644
index 0000000..98e5aee
--- /dev/null
+++ b/doc/en/develop/contribute-code.md
@@ -0,0 +1,60 @@
+## How to Contribute Code
+
+_____
+
+### Coding Style
+
+The SINGA codebase follows the [Google C++ Style Guide](http://google-styleguide.googlecode.com/svn/trunk/cppguide.xml).
+
+To check if your code follows the style, you can use the provided cpplint tool:
+    
+    $ ./tool/cpplint.py YOUR_FILE
+
+
+### JIRA format
+
+Like other Apache projects, SINGA uses JIRA to track bugs, improvements and
+other high-level discussions (e.g., system design and features).  Github pull requests are
+used for implementation discussions, e.g., code review and code merge.
+
+* Provide a descriptive Title.
+* Write a detailed Description. For bug reports, this should ideally include a
+  short reproduction of the problem. For new features, it may include a design
+  document.
+* Set [required fields](https://cwiki.apache.org/confluence/display/SPARK/Contributing+to+Spark#ContributingtoSpark-JIRA)
+
+### Pull Request
+
+The work flow is
+
+* Fork the [SINGA Github repository](https://github.com/apache/incubator-singa) to
+your own Github account.
+
+* Clone your fork, create a new branch (e.g., feature-foo or fixbug-foo),
+ work on it. After finishing your job,
+ [rebase](https://git-scm.com/book/en/v2/Git-Branching-Rebasing) it to the
+ current latest master and push commits to your own Github account (the new
+ branch).
+
+* Open a pull request against the master branch of apache/incubator-singa.
+The PR title should be of the form SINGA-xxxx Title, where
+SINGA-xxxx is the relevant JIRA number, and Title may be the JIRA's title or a
+more specific title describing the PR itself, for example, "SINGA-6 Implement thread-safe singleton". Detailed description can be copied from the JIRA.
+Consider identifying committers or other contributors who have worked on the
+code being changed. Find the file(s) in Github and click "Blame" to see a
+line-by-line annotation of who changed the code last.  You can add @username in
+the PR description to ping them immediately.
+Please state that the contribution is your original work and that you license
+the work to the project under the project's open source license. Further commits (e.g., bug fix)
+to your new branch will be added to this pull request automatically by Github.
+
+* Wait for one committer to review the patch. If no conflicts, the committers will merge it with
+the master branch. The merge should a) not use rebase b) disable fast forward merge c) check the 
+commit message format and test the code/feature.
+
+* If there are too many small commit messages, you will be told to squash your commits into fewer meaningful
+commits. If your commit message does not follow the format (i.e., SINGA-xxxx), you will be told to
+reword your commit message. Both changes can be done using interactive git rebase. Once you
+get the commits corrected, push them to your own GitHub account again. Your pull request 
+will be automatically updated. For details, please refer to 
+[Rebase Pull Requests](https://github.com/edx/edx-platform/wiki/How-to-Rebase-a-Pull-Request).
\ No newline at end of file
diff --git a/doc/en/develop/contribute-docs.md b/doc/en/develop/contribute-docs.md
new file mode 100644
index 0000000..5e21a0f
--- /dev/null
+++ b/doc/en/develop/contribute-docs.md
@@ -0,0 +1,28 @@
+# How to Contribute Documentation
+
+___
+
+
+## Website
+This document gives step-by-step instructions for deploying [Singa website](http://singa.incubator.apache.org).
+
+Singa website is built by [Sphinx](http://www.sphinx-doc.org) 1.4.4 from a source tree stored in git: https://github.com/apache/incubator-singa/tree/master/doc.
+
+To install Sphinx on Ubuntu:
+
+    $ apt-get install python-sphinx
+
+To install the markdown support for Sphinx:
+
+    $ pip install recommonmark
+
+You can build the website by executing the following command from the doc folder:
+
+    $ make html
+
+The procedure for contributing documentation is the same as [contributing code](contribute-code.html).
+
+
+## CPP API
+
+To generate docs, run "doxygen" from the doc folder (Doxygen >= 1.8 recommended)
diff --git a/doc/en/develop/how-contribute.md b/doc/en/develop/how-contribute.md
new file mode 100644
index 0000000..8687b5a
--- /dev/null
+++ b/doc/en/develop/how-contribute.md
@@ -0,0 +1,11 @@
+# How to Contribute to SINGA
+
+___
+
+As with any open source project, there are several ways you can help:
+
+* Join the [mailing list](../community/mail-lists.html) and answer other user's questions.
+* [Build Singa](../quick-start.html) for yourself, in order to fix bugs.
+* Report bugs, feature requests and other issues in the [issue tracking](../community/issue-tracking.html) application.
+* Check SINGA's [development schedule](schedule.html) and [contribute code](contribute-code.html) by providing patches.
+* [Help with the documentation](contribute-docs.html) by updating webpages that are lacking or unclear.
diff --git a/doc/en/develop/schedule.rst b/doc/en/develop/schedule.rst
new file mode 100644
index 0000000..73d713c
--- /dev/null
+++ b/doc/en/develop/schedule.rst
@@ -0,0 +1,57 @@
+.. Licensed to the Apache Software Foundation (ASF) under one
+   or more contributor license agreements.  See the NOTICE file
+   distributed with this work for additional information
+   regarding copyright ownership.  The ASF licenses this file
+   to you under the Apache License, Version 2.0 (the
+   "License"); you may not use this file except in compliance
+   with the License.  You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing,
+   software distributed under the License is distributed on an
+   "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+   KIND, either express or implied.  See the License for the
+   specific language governing permissions and limitations
+   under the License.
+
+
+Development Schedule
+====================
+
+.. csv-table::
+	:header: "Release", "Module", "Feature", "Status"
+
+	" 0.1 Sep 2015     "," Neural Network          "," Feed forward neural network, including CNN, MLP                                                                 "," done  "
+	"                  ","                         "," RBM-like model, including RBM                                                                                   "," done   "
+	"                  ","                         "," Recurrent neural network, including standard RNN                                                                "," done   "
+	"                  ","  Architecture           "," One worker group on single node (with data partition)                                                           "," done   "
+	"                  ","                         "," Multi worker groups on single node using [Hogwild](http://www.eecs.berkeley.edu/~brecht/papers/hogwildTR.pdf)      ","done"
+	"                  ","                         "," Distributed Hogwild","done"
+	"                  ","                         "," Multi groups across nodes, like [Downpour](http://papers.nips.cc/paper/4687-large-scale-distributed-deep-networks) ","done"
+	"                  ","                         "," All-Reduce training architecture like [DeepImage](http://arxiv.org/abs/1501.02876) ","done"
+	"                  ","                         "," Load-balance among servers "," done"
+	"                  ","  Failure recovery       "," Checkpoint and restore ","done"
+	"                  ","  Tools                  "," Installation with GNU auto tools"," done"
+	"0.2 Jan 2016      "," Neural Network          "," Feed forward neural network, including AlexNet, cuDNN layers, etc."," done "
+	"                  ","                         "," Recurrent neural network, including GRULayer and BPTT","done "
+	"                  ","                         "," Model partition and hybrid partition","done"
+	"      		   "," Tools                   "," Integration with Mesos for resource management","done"
+	"         	   ","                         "," Prepare Docker images for deployment","done"
+	"              	   ","                         "," Visualization of neural net and debug information ","done"
+	"                  "," Binding                 "," Python binding for major components ","done"
+	"                  "," GPU                     "," Single node with multiple GPUs ","done"
+	"0.3 April 2016    "," GPU                     "," Multiple nodes, each with multiple GPUs","done"
+	"                  ","                         "," Heterogeneous training using both GPU and CPU [CcT](http://arxiv.org/abs/1504.04343)","done"
+	"                  ","                         "," Support cuDNN v4 "," done"
+	"                  "," Installation            "," Remove dependency on ZeroMQ, CZMQ, Zookeeper for single node training","done"
+	"                  "," Updater                 "," Add new SGD updaters including Adam, AdamMax and AdaDelta","done"
+	"                  "," Binding                 "," Enhance Python binding for training","done"
+	"1.0 Aug 2016     "," Programming abstraction ","Tensor with linear algebra, neural net and random operations "," "
+	"                  ","                         ","Updater for distributed parameter updating ",""
+	"                  "," Optimization            "," Execution and memory optimization",""
+	"                  "," Hardware                "," Use Cuda and Cudnn for Nvidia GPU",""
+	"                  ","                         "," Use OpenCL for AMD GPU or other devices",""
+	"                  "," Cross-platform          "," To extend from Linux to MacOS and Windows",""
+	"                  "," Examples                "," Speech recognition example",""
+	"                  ","                         ","Large image models, e.g., [VGG](https://arxiv.org/pdf/1409.1556.pdf) and [Residual Net](http://arxiv.org/abs/1512.03385)",""
diff --git a/doc/en/docs.rst b/doc/en/docs.rst
new file mode 100644
index 0000000..1b94d02
--- /dev/null
+++ b/doc/en/docs.rst
@@ -0,0 +1,23 @@
+.. Licensed to the Apache Software Foundation (ASF) under one
+   or more contributor license agreements.  See the NOTICE file
+   distributed with this work for additional information
+   regarding copyright ownership.  The ASF licenses this file
+   to you under the Apache License, Version 2.0 (the
+   "License"); you may not use this file except in compliance
+   with the License.  You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing,
+   software distributed under the License is distributed on an
+   "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+   KIND, either express or implied.  See the License for the
+   specific language governing permissions and limitations
+   under the License.
+
+
+Documentation
+=============
+
+.. toctree::
+   docs/index
diff --git a/doc/en/docs/cnn.md b/doc/en/docs/cnn.md
new file mode 100755
index 0000000..21ef1f7
--- /dev/null
+++ b/doc/en/docs/cnn.md
@@ -0,0 +1,141 @@
+# Quickstart - Cifar10 example

+Convolution neural network (CNN) is a type of feed-forward artificial neural network widely used for image classification. In this example, we will use a deep CNN model to do image classification for the [CIFAR10 dataset](http://www.cs.toronto.edu/~kriz/cifar.html).

+

+## Running instructions for CPP version

+Please refer to the [Installation](installation.html) page for how to install SINGA. Currently, the CNN example requires cuDNN, hence both CUDA and cuDNN should be installed and SINGA should be compiled with CUDA and cuDNN enabled.

+

+The Cifar10 dataset could be downloaded by running

+

+    # switch to cifar10 directory

+    $ cd ../examples/cifar10

+    # download data for CPP version

+    $ python download_data.py bin

+

+'bin' is for downloading binary version of Cifar10 data.

+

+During downloading, you should see the detailed output like

+

+     Downloading CIFAR10 from http://www.cs.toronto.edu/~kriz/cifar-10-binary.tar.gz

+     The tar file does exist. Extracting it now..

+     Finished!

+

+Now you have prepared the data for this Cifar10 example, the final step is to execute the `run.sh` script,

+

+    # in SINGA_ROOT/examples/cifar10/

+    $ ./run.sh

+

+You should see the detailed output as follows: first read the data files in order, show the statistics of training and testing data, then show the details of neural net structure with some parameter information, finally illustrate the performance details during training and validation process. The number of epochs can be specified in `run.sh` file.

+

+    Start training

+    Reading file cifar-10-batches-bin/data_batch_1.bin

+    Reading file cifar-10-batches-bin/data_batch_2.bin

+    Reading file cifar-10-batches-bin/data_batch_3.bin

+    Reading file cifar-10-batches-bin/data_batch_4.bin

+    Reading file cifar-10-batches-bin/data_batch_5.bin

+    Reading file cifar-10-batches-bin/test_batch.bin

+    Training samples = 50000, Test samples = 10000

+    conv1(32, 32, 32, )

+    pool1(32, 16, 16, )

+    relu1(32, 16, 16, )

+    lrn1(32, 16, 16, )

+    conv2(32, 16, 16, )

+    relu2(32, 16, 16, )

+    pool2(32, 8, 8, )

+    lrn2(32, 8, 8, )

+    conv3(64, 8, 8, )

+    relu3(64, 8, 8, )

+    pool3(64, 4, 4, )

+    flat(1024, )

+    ip(10, )

+    conv1_weight : 8.09309e-05

+    conv1_bias : 0

+    conv2_weight : 0.00797731

+    conv2_bias : 0

+    conv3_weight : 0.00795888

+    conv3_bias : 0

+    ip_weight : 0.00798683

+    ip_bias : 0

+    Messages will be appended to an existed file: train_perf

+    Messages will be appended to an existed file: val_perf

+    Epoch 0, training loss = 1.828369, accuracy = 0.329420, lr = 0.001000

+    Epoch 0, val loss = 1.561823, metric = 0.420600

+    Epoch 1, training loss = 1.465898, accuracy = 0.469940, lr = 0.001000

+    Epoch 1, val loss = 1.361778, metric = 0.513300

+    Epoch 2, training loss = 1.320708, accuracy = 0.529000, lr = 0.001000

+    Epoch 2, val loss = 1.242080, metric = 0.549100

+    Epoch 3, training loss = 1.213776, accuracy = 0.571620, lr = 0.001000

+    Epoch 3, val loss = 1.175346, metric = 0.582000

+

+The training details are stored in `train_perf` file in the same directory and the validation details in `val_perf` file.

+

+

+## Running instructions for Python version

+To run CNN example in Python version, we need to compile SINGA with Python binding,

+

+    $ mkdir build && cd build

+    $ cmake -DUSE_PYTHON=ON ..

+    $ make

+

+Now download the Cifar10 dataset,

+

+    # switch to cifar10 directory

+    $ cd ../examples/cifar10

+    # download data for Python version

+    $ python download_data.py py

+

+During downloading, you should see the detailed output like

+

+     Downloading CIFAR10 from http://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz

+     The tar file does exist. Extracting it now..

+     Finished!

+

+Then execute the `train.py` script to build the model

+

+    $ python train.py

+

+You should see the output as follows including the details of neural net structure with some parameter information, reading data files, and the performance details during training and testing process.

+

+    (32L, 32L, 32L)

+    (32L, 16L, 16L)

+    (32L, 16L, 16L)

+    (32L, 16L, 16L)

+    (32L, 16L, 16L)

+    (32L, 16L, 16L)

+    (32L, 8L, 8L)

+    (32L, 8L, 8L)

+    (64L, 8L, 8L)

+    (64L, 8L, 8L)

+    (64L, 4L, 4L)

+    (1024L,)

+    Start intialization............

+    conv1_weight gaussian 7.938460476e-05

+    conv1_bias constant 0.0

+    conv2_weight gaussian 0.00793507322669

+    conv2_bias constant 0.0

+    conv3_weight gaussian 0.00799657031894

+    conv3_bias constant 0.0

+    dense_weight gaussian 0.00804364029318

+    dense_bias constant 0.0

+    Loading data ..................

+    Loading data file cifar-10-batches-py/data_batch_1

+    Loading data file cifar-10-batches-py/data_batch_2

+    Loading data file cifar-10-batches-py/data_batch_3

+    Loading data file cifar-10-batches-py/data_batch_4

+    Loading data file cifar-10-batches-py/data_batch_5

+    Loading data file cifar-10-batches-py/test_batch

+    Epoch 0

+    training loss = 1.881866, training accuracy = 0.306360 accuracy = 0.420000

+    test loss = 1.602577, test accuracy = 0.412200

+    Epoch 1

+    training loss = 1.536011, training accuracy = 0.441940 accuracy = 0.500000

+    test loss = 1.378170, test accuracy = 0.507600

+    Epoch 2

+    training loss = 1.333137, training accuracy = 0.519960 accuracy = 0.520000

+    test loss = 1.272205, test accuracy = 0.540600

+    Epoch 3

+    training loss = 1.185212, training accuracy = 0.574120 accuracy = 0.540000

+    test loss = 1.211573, test accuracy = 0.567600

+

+This script will call `alexnet.py` file to build the alexnet model. After the training is finished, SINGA will save the model parameters into a checkpoint file `model.bin` in the same directory. Then we can use this `model.bin` file for prediction.

+

+    $ python predict.py

diff --git a/doc/en/docs/device.rst b/doc/en/docs/device.rst
new file mode 100644
index 0000000..57993f9
--- /dev/null
+++ b/doc/en/docs/device.rst
@@ -0,0 +1,54 @@
+.. Licensed to the Apache Software Foundation (ASF) under one
+   or more contributor license agreements.  See the NOTICE file
+   distributed with this work for additional information
+   regarding copyright ownership.  The ASF licenses this file
+   to you under the Apache License, Version 2.0 (the
+   "License"); you may not use this file except in compliance
+   with the License.  You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing,
+   software distributed under the License is distributed on an
+   "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+   KIND, either express or implied.  See the License for the
+   specific language governing permissions and limitations
+   under the License.
+
+
+Device
+=======
+
+
+The Device abstraction represents any hardware device with memory and computation units.
+All `Tensor operations <tensor.html>`_ are scheduled by the resident device for execution.
+Tensor memory is also managed by the device's memory manager. Therefore, optimization
+of memory and execution are implemented in the Device class.
+
+Specific devices
+----------------
+Currently, SINGA has three Device implementations,
+
+1. CudaGPU for an Nvidia GPU card which runs Cuda code
+2. CppCPU for a CPU which runs Cpp code
+3. OpenclGPU for a GPU card which runs OpenCL code
+
+
+Python API
+----------
+
+.. automodule:: singa.device
+   :members: create_cuda_gpus, create_cuda_gpus_on, get_default_device
+
+
+The following code provides examples of creating devices::
+
+   from singa import device
+   cuda = device.create_cuda_gpu_on(0)  # use GPU card of ID 0
+   host = device.get_default_device()  # get the default host device (a CppCPU)
+   ary1 = device.create_cuda_gpus(2)  # create 2 devices, starting from ID 0
+   ary2 = device.create_cuda_gpus([0,2])  # create 2 devices on ID 0 and 2
+
+
+CPP API
+---------
diff --git a/doc/en/docs/index.rst b/doc/en/docs/index.rst
new file mode 100644
index 0000000..d6d7516
--- /dev/null
+++ b/doc/en/docs/index.rst
@@ -0,0 +1,33 @@
+.. Licensed to the Apache Software Foundation (ASF) under one
+   or more contributor license agreements.  See the NOTICE file
+   distributed with this work for additional information
+   regarding copyright ownership.  The ASF licenses this file
+   to you under the Apache License, Version 2.0 (the
+   "License"); you may not use this file except in compliance
+   with the License.  You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing,
+   software distributed under the License is distributed on an
+   "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+   KIND, either express or implied.  See the License for the
+   specific language governing permissions and limitations
+   under the License.
+
+
+Documentation
+=============
+
+.. toctree::
+
+   installation
+   software_stack
+   device
+   tensor
+   layer
+   initializer
+   loss
+   metric
+   optimizer
+   examples/index
diff --git a/doc/en/docs/initializer.rst b/doc/en/docs/initializer.rst
new file mode 100644
index 0000000..6790a8e
--- /dev/null
+++ b/doc/en/docs/initializer.rst
@@ -0,0 +1,30 @@
+.. Licensed to the Apache Software Foundation (ASF) under one
+   or more contributor license agreements.  See the NOTICE file
+   distributed with this work for additional information
+   regarding copyright ownership.  The ASF licenses this file
+   to you under the Apache License, Version 2.0 (the
+   "License"); you may not use this file except in compliance
+   with the License.  You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing,
+   software distributed under the License is distributed on an
+   "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+   KIND, either express or implied.  See the License for the
+   specific language governing permissions and limitations
+   under the License.
+
+
+Initializer
+===========
+
+Python API
+----------
+
+.. automodule:: singa.initializer
+   :members: uniform, gaussian
+   :member-order: bysource
+
+CPP API
+--------
diff --git a/doc/en/docs/installation.md b/doc/en/docs/installation.md
new file mode 100755
index 0000000..9f112f4
--- /dev/null
+++ b/doc/en/docs/installation.md
@@ -0,0 +1,233 @@
+# Installation
+
+## Dependencies
+
+### Required
+* google protobuf (>=2.5,<3)
+* blas (tested with openblas >=0.2.10)
+* cmake (>=2.6)
+
+
+### Optional
+* glog
+* opencv (tested with 2.4.8)
+* lmdb (tested with 0.9)
+* cuda (tested with 6.5, 7.0 and 7.5)
+* cudnn (v4 and v5)
+
+PySINGA has additional dependencies
+
+* python(==2.7)
+* pip(>=1.5)
+* swig(>=3.0)
+* numpy(>=1.11.0)
+* openblas (>=0.2.10)
+
+Users are encouraged to install the cuda and [cudnn](https://developer.nvidia.com/cudnn) for running SINGA on GPUs to
+get better performance.
+Most of the dependent libraries could be installed via package managers like
+apt-get or homebrew.
+
+    # for ubuntu users, tested on 14.04
+    sudo apt-get install libprotobuf-dev libopenblas-dev libopencv-dev protobuf-compiler libgoogle-glog-dev liblmdb-dev python2.7-dev python-pip python-numpy
+
+    # for Mac OS users
+    brew install -vd glog lmdb
+    brew tap homebrew/science
+    brew install opencv
+    brew install openblas
+    brew tap homebrew/python
+    brew install python
+    brew install numpy  --with-openblas
+
+
+## Install PySINGA
+
+### From wheel
+
+After installing the dependencies for SINGA and PySINGA, please download the correct binary:
+
+    # Ubuntu/Linux 64-bit, CPU only, Python 2.7, Protobuf 2.5
+    $ export SINGA_WHEEL_URL=http://comp.nus.edu.sg/~dbsystem/singa/assets/file/pb2.5/singa-1.0.0-cp27-none-linux_x86_64.whl
+
+    # Ubuntu/Linux 64-bit, CPU only, Python 2.7, Protobuf 2.6
+    $ export SINGA_WHEEL_URL=http://comp.nus.edu.sg/~dbsystem/singa/assets/file/pb2.6/singa-1.0.0-cp27-none-linux_x86_64.whl
+
+    # Ubuntu/Linux 64-bit, GPU enabled, Python 2.7, Protobuf 2.5, CUDA toolkit 7.5 and CuDNN v5
+    $ export SINGA_WHEEL_URL=http://comp.nus.edu.sg/~dbsystem/singa/assets/file/pb2.5-cuda7.5-cudnn5/singa-1.0.0-cp27-none-linux_x86_64.whl
+
+    # Ubuntu/Linux 64-bit, GPU enabled, Python 2.7, Protobuf 2.6, CUDA toolkit 7.5 and CuDNN v5
+    $ export SINGA_WHEEL_URL=http://comp.nus.edu.sg/~dbsystem/singa/assets/file/pb2.6-cuda7.5-cudnn5/singa-1.0.0-cp27-none-linux_x86_64.whl
+
+Then, run the following command
+
+    $ sudo pip install --upgrade $SINGA_WHEEL_URL
+
+If you do not have sudo right, you can run `pip install` in a python virtual environment.
+Note that in python virtual environment, you may need to reset the `PYTHONPATH` to empty
+to avoid the conflicts of system path and virtual environment path.
+
+
+### From source
+
+Please compile SINGA from source (see the next section) with the 'USE_PYTHON' option on,
+and then run the following commands,
+
+    # under the build directory
+    $ cd python
+    $ sudo pip install .
+
+If you are using a virtual environment, you can ignore the `sudo` keyword.
+
+Developers can build the wheel file via
+
+    # under the build directory
+    $ cd python
+    $ python setup.py bdist_wheel
+
+The generated wheel file is under "dist" directory.
+To build cnmem into the wheel file, please change CMakeLists.txt by replacing
+'SHARED' with 'STATIC'.
+
+
+## Build SINGA from source
+
+Please clone the newest code from [Github](https://github.com/apache/incubator-singa) and execute the following commands,
+
+    $ git clone https://github.com/apache/incubator-singa.git
+    $ cd incubator-singa/
+
+If you use CUDA, then [CNMeM](https://github.com/NVIDIA/cnmem) is necessary,
+which could be downloaded as
+
+    $ git submodule init
+    $ git submodule update
+
+
+### Linux & MacOS
+
+GCC (>=4.8.1) is required to compile SINGA on Linux.
+For Mac OS users, you can use either GCC or Clang.
+
+In SINGA_ROOT, execute the following commands for compiling SINGA,
+
+    $ mkdir build && cd build
+    $ cmake ..
+    $ make
+    $ make install
+
+Note that if you are using CUDNN and it is not installed under system default
+folder, you need to let cmake know the paths to CUDNN,
+
+    $ export CMAKE_INCLUDE_PATH=<path to cudnn>/include:$CMAKE_INCLUDE_PATH
+    $ export CMAKE_LIBRARY_PATH=<path to cudnn>/lib64:$CMAKE_LIBRARY_PATH
+
+You can use `ccmake ..` to configure the compilation options, including
+generating the python binding and changing the installation folder.
+If the dependent libraries are not in the system default paths, you need to export
+the following environment variables
+
+    export CMAKE_INCLUDE_PATH=<path to your header file folder>
+    export CMAKE_LIBRARY_PATH=<path to your lib file folder>
+
+After compiling SINGA, you can run the unit tests by
+
+    $ ./bin/test_singa
+
+You can see all the testing cases with testing results. If SINGA passes all
+tests, then you have successfully installed SINGA. Please proceed to try the examples!
+
+
+### Windows
+To be added.
+
+
+## FAQ
+
+* Q: Error from running `cmake ..`, which cannot find the dependent libraries.
+
+    A: If you haven't installed the libraries, please install them. If you installed
+    the libraries in a folder that is outside of the system folder, e.g. /usr/local,
+    please export the following variables
+
+        export CMAKE_INCLUDE_PATH=<path to your header file folder>
+        export CMAKE_LIBRARY_PATH=<path to your lib file folder>
+
+
+* Q: Error from `make`, e.g. the linking phase
+
+    A: If your libraries are in other folders than system default paths, you need
+    to export the following variables
+
+    $ export LIBRARY_PATH=<path to your lib file folder>
+    $ export LD_LIBRARY_PATH=<path to your lib file folder>
+
+
+* Q: Error from header files, e.g. 'cblas.h no such file or directory exists'
+
+    A: You need to include the folder of the cblas.h into CPLUS_INCLUDE_PATH,
+    e.g.,
+
+        $ export CPLUS_INCLUDE_PATH=/opt/OpenBLAS/include:$CPLUS_INCLUDE_PATH
+
+* Q:While compiling SINGA, I get error `SSE2 instruction set not enabled`
+
+    A:You can try following command:
+
+        $ make CFLAGS='-msse2' CXXFLAGS='-msse2'
+
+* Q:I get `ImportError: cannot import name enum_type_wrapper` from google.protobuf.internal when I try to import .py files.
+
+    A: You need to install the python binding of protobuf, which could be installed via
+
+        $ sudo apt-get install protobuf
+
+    or from source
+
+        $ cd /PROTOBUF/SOURCE/FOLDER
+        $ cd python
+        $ python setup.py build
+        $ python setup.py install
+
+* Q: When I build OpenBLAS from source, I am told that I need a Fortran compiler.
+
+    A: You can compile OpenBLAS by
+
+        $ make ONLY_CBLAS=1
+
+    or install it using
+
+        $ sudo apt-get install libopenblas-dev
+
+* Q: When I build protocol buffer, it reports that GLIBC++_3.4.20 not found in /usr/lib64/libstdc++.so.6.
+
+    A: This means the linker found libstdc++.so.6 but that library
+    belongs to an older version of GCC than was used to compile and link the
+    program. The program depends on code defined in
+    the newer libstdc++ that belongs to the newer version of GCC, so the linker
+    must be told how to find the newer libstdc++ shared library.
+    The simplest way to fix this is to find the correct libstdc++ and export it to
+    LD_LIBRARY_PATH. For example, if GLIBC++_3.4.20 is listed in the output of the
+    following command,
+
+        $ strings /usr/local/lib64/libstdc++.so.6|grep GLIBC++
+
+    then you just set your environment variable as
+
+        $ export LD_LIBRARY_PATH=/usr/local/lib64:$LD_LIBRARY_PATH
+
+* Q: When I build glog, it reports that "src/logging_unittest.cc:83:20: error: ‘gflags’ is not a namespace-name"
+
+    A: It maybe that you have installed gflags with a different namespace such as "google". so glog can't find 'gflags' namespace.
+    Because it is not necessary to have gflags to build glog. So you can change the configure.ac file to ignore gflags.
+
+        1. cd to glog src directory
+        2. change line 125 of configure.ac  to "AC_CHECK_LIB(gflags, main, ac_cv_have_libgflags=0, ac_cv_have_libgflags=0)"
+        3. autoreconf
+
+    After this, you can build glog again.
+
+* Q: When using virtual environment, every time I run pip install, it would reinstall numpy. However, the numpy would not be used when I `import numpy`
+
+    A: It could be caused by the `PYTHONPATH` which should be set to empty when you are using virtual environment to avoid the conflicts with the path of
+    the virtual environment.
diff --git a/doc/en/docs/layer.rst b/doc/en/docs/layer.rst
new file mode 100644
index 0000000..1a576f1
--- /dev/null
+++ b/doc/en/docs/layer.rst
@@ -0,0 +1,32 @@
+.. Licensed to the Apache Software Foundation (ASF) under one
+   or more contributor license agreements.  See the NOTICE file
+   distributed with this work for additional information
+   regarding copyright ownership.  The ASF licenses this file
+   to you under the Apache License, Version 2.0 (the
+   "License"); you may not use this file except in compliance
+   with the License.  You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing,
+   software distributed under the License is distributed on an
+   "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+   KIND, either express or implied.  See the License for the
+   specific language governing permissions and limitations
+   under the License.
+
+
+Layer
+======
+
+Python API
+-----------
+.. automodule:: singa.layer
+   :members:
+   :member-order: bysource
+   :show-inheritance:
+   :undoc-members:
+
+
+CPP API
+--------
diff --git a/doc/en/docs/loss.rst b/doc/en/docs/loss.rst
new file mode 100644
index 0000000..18c587a
--- /dev/null
+++ b/doc/en/docs/loss.rst
@@ -0,0 +1,25 @@
+.. Licensed to the Apache Software Foundation (ASF) under one
+   or more contributor license agreements.  See the NOTICE file
+   distributed with this work for additional information
+   regarding copyright ownership.  The ASF licenses this file
+   to you under the Apache License, Version 2.0 (the
+   "License"); you may not use this file except in compliance
+   with the License.  You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing,
+   software distributed under the License is distributed on an
+   "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+   KIND, either express or implied.  See the License for the
+   specific language governing permissions and limitations
+   under the License.
+
+
+Loss
+=========
+
+
+.. automodule:: singa.loss
+   :members:
+   :show-inheritance:
diff --git a/doc/en/docs/metric.rst b/doc/en/docs/metric.rst
new file mode 100644
index 0000000..20a7144
--- /dev/null
+++ b/doc/en/docs/metric.rst
@@ -0,0 +1,26 @@
+.. Licensed to the Apache Software Foundation (ASF) under one
+   or more contributor license agreements.  See the NOTICE file
+   distributed with this work for additional information
+   regarding copyright ownership.  The ASF licenses this file
+   to you under the Apache License, Version 2.0 (the
+   "License"); you may not use this file except in compliance
+   with the License.  You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing,
+   software distributed under the License is distributed on an
+   "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+   KIND, either express or implied.  See the License for the
+   specific language governing permissions and limitations
+   under the License.
+
+
+Metric
+=========
+
+
+.. automodule:: singa.metric
+   :members:
+   :show-inheritance:
+   :member-order: bysource
diff --git a/doc/en/docs/neural-net.md b/doc/en/docs/neural-net.md
new file mode 100644
index 0000000..c10baf8
--- /dev/null
+++ b/doc/en/docs/neural-net.md
@@ -0,0 +1,327 @@
+# Neural Net
+
+---
+
+`NeuralNet` in SINGA represents an instance of user's neural net model. As the
+neural net typically consists of a set of layers, `NeuralNet` comprises
+a set of unidirectionally connected [Layer](layer.html)s.
+This page describes how to convert a user's neural net into
+the configuration of `NeuralNet`.
+
+<img src="../_static/images/model-category.png" align="center" width="200px"/>
+<span><strong>Figure 1 - Categorization of popular deep learning models.</strong></span>
+
+## Net structure configuration
+
+Users configure the `NeuralNet` by listing all layers of the neural net and
+specifying each layer's source layer names. Popular deep learning models can be
+categorized as Figure 1. The subsequent sections give details for each
+category.
+
+### Feed-forward models
+
+<div align = "left">
+<img src="../_static/images/mlp-net.png" align="center" width="200px"/>
+<span><strong>Figure 2 - Net structure of a MLP model.</strong></span>
+</div>
+
+Feed-forward models, e.g., CNN and MLP, can easily get configured as their layer
+connections are undirected without circles. The
+configuration for the MLP model shown in Figure 1 is as follows,
+
+    net {
+      layer {
+        name : 'data'
+        type : kData
+      }
+      layer {
+        name : 'image'
+        type : kImage
+        srclayer: 'data'
+      }
+      layer {
+        name : 'label'
+        type : kLabel
+        srclayer: 'data'
+      }
+      layer {
+        name : 'hidden'
+        type : kHidden
+        srclayer: 'image'
+      }
+      layer {
+        name : 'softmax'
+        type : kSoftmaxLoss
+        srclayer: 'hidden'
+        srclayer: 'label'
+      }
+    }
+
+### Energy models
+
+<img src="../_static/images/rbm-rnn.png" align="center" width="500px"/>
+<span><strong>Figure 3 - Convert connections in RBM and RNN.</strong></span>
+
+
+For energy models including RBM, DBM,
+etc., their connections are undirected (i.e., Category B). To represent these models using
+`NeuralNet`, users can simply replace each connection with two directed
+connections, as shown in Figure 3a. In other words, for each pair of connected layers, their source
+layer field should include each other's name.
+The full [RBM example](rbm.html) has
+detailed neural net configuration for a RBM model, which looks like
+
+    net {
+      layer {
+        name : "vis"
+        type : kVisLayer
+        param {
+          name : "w1"
+        }
+        srclayer: "hid"
+      }
+      layer {
+        name : "hid"
+        type : kHidLayer
+        param {
+          name : "w2"
+          share_from: "w1"
+        }
+        srclayer: "vis"
+      }
+    }
+
+### RNN models
+
+For recurrent neural networks (RNN), users can remove the recurrent connections
+by unrolling the recurrent layer.  For example, in Figure 3b, the original
+layer is unrolled into a new layer with 4 internal layers. In this way, the
+model is like a normal feed-forward model, thus can be configured similarly.
+The [RNN example](rnn.html) has a full neural net
+configuration for a RNN model.
+
+
+## Configuration for multiple nets
+
+Typically, a training job includes three neural nets for
+training, validation and test phase respectively. The three neural nets share most
+layers except the data layer, loss layer or output layer, etc..  To avoid
+redundant configurations for the shared layers, users can use the `exclude`
+field to filter a layer in the neural net, e.g., the following layer will be
+filtered when creating the testing `NeuralNet`.
+
+
+    layer {
+      ...
+      exclude : kTest # filter this layer for creating test net
+    }
+
+
+
+## Neural net partitioning
+
+A neural net can be partitioned in different ways to distribute the training
+over multiple workers.
+
+### Batch and feature dimension
+
+<img src="../_static/images/partition_fc.png" align="center" width="400px"/>
+<span><strong>Figure 4 - Partitioning of a fully connected layer.</strong></span>
+
+
+Every layer's feature blob is considered a matrix whose rows are feature
+vectors. Thus, one layer can be split on two dimensions. Partitioning on
+dimension 0 (also called batch dimension) slices the feature matrix by rows.
+For instance, if the mini-batch size is 256 and the layer is partitioned into 2
+sub-layers, each sub-layer would have 128 feature vectors in its feature blob.
+Partitioning on this dimension has no effect on the parameters, as every
+[Param](param.html) object is replicated in the sub-layers. Partitioning on dimension
+1 (also called feature dimension) slices the feature matrix by columns. For
+example, suppose the original feature vector has 50 units, after partitioning
+into 2 sub-layers, each sub-layer would have 25 units. This partitioning may
+result in [Param](param.html) object being split, as shown in
+Figure 4. Both the bias vector and weight matrix are
+partitioned into two sub-layers.
+
+
+### Partitioning configuration
+
+There are 4 partitioning schemes, whose configurations are given below,
+
+  1. Partitioning each single layer into sub-layers on batch dimension (see
+  below). It is enabled by configuring the partition dimension of the layer to
+  0, e.g.,
+
+          # with other fields omitted
+          layer {
+            partition_dim: 0
+          }
+
+  2. Partitioning each single layer into sub-layers on feature dimension (see
+  below).  It is enabled by configuring the partition dimension of the layer to
+  1, e.g.,
+
+          # with other fields omitted
+          layer {
+            partition_dim: 1
+          }
+
+  3. Partitioning all layers into different subsets. It is enabled by
+  configuring the location ID of a layer, e.g.,
+
+          # with other fields omitted
+          layer {
+            location: 1
+          }
+          layer {
+            location: 0
+          }
+
+
+  4. Hybrid partitioning of strategy 1, 2 and 3. The hybrid partitioning is
+  useful for large models. An example application is to implement the
+  [idea proposed by Alex](http://arxiv.org/abs/1404.5997).
+  Hybrid partitioning is configured like,
+
+          # with other fields omitted
+          layer {
+            location: 1
+          }
+          layer {
+            location: 0
+          }
+          layer {
+            partition_dim: 0
+            location: 0
+          }
+          layer {
+            partition_dim: 1
+            location: 0
+          }
+
+Currently SINGA supports strategy-2 well. Other partitioning strategies
+are under test and will be released in a later version.
+
+## Parameter sharing
+
+Parameters can be shared in two cases,
+
+  * sharing parameters among layers via user configuration. For example, the
+  visible layer and hidden layer of a RBM shares the weight matrix, which is configured through
+  the `share_from` field as shown in the above RBM configuration. The
+  configurations must be the same (except name) for shared parameters.
+
+  * due to neural net partitioning, some `Param` objects are replicated into
+  different workers, e.g., partitioning one layer on batch dimension. These
+  workers share parameter values. SINGA controls this kind of parameter
+  sharing automatically, users do not need to do any configuration.
+
+  * the `NeuralNet` for training and testing (and validation) share most layers
+  , thus share `Param` values.
+
+If the shared `Param` instances resident in the same process (may in different
+threads), they use the same chunk of memory space for their values. But they
+would have different memory spaces for their gradients. In fact, their
+gradients will be averaged by the stub or server.
+
+## Advanced user guide
+
+### Creation
+
+    static NeuralNet* NeuralNet::Create(const NetProto& np, Phase phase, int num);
+
+The above function creates a `NeuralNet` for a given phase, and returns a
+pointer to the `NeuralNet` instance. The phase is in {kTrain,
+kValidation, kTest}. `num` is used for net partitioning which indicates the
+number of partitions.  Typically, a training job includes three neural nets for
+training, validation and test phase respectively. The three neural nets share most
+layers except the data layer, loss layer or output layer, etc.. The `Create`
+function takes in the full net configuration including layers for training,
+validation and test.  It removes layers for phases other than the specified
+phase based on the `exclude` field in
+[layer configuration](layer.html):
+
+    layer {
+      ...
+      exclude : kTest # filter this layer for creating test net
+    }
+
+The filtered net configuration is passed to the constructor of `NeuralNet`:
+
+    NeuralNet::NeuralNet(NetProto netproto, int npartitions);
+
+The constructor creates a graph representing the net structure firstly in
+
+    Graph* NeuralNet::CreateGraph(const NetProto& netproto, int npartitions);
+
+Next, it creates a layer for each node and connects layers if their nodes are
+connected.
+
+    void NeuralNet::CreateNetFromGraph(Graph* graph, int npartitions);
+
+Since the `NeuralNet` instance may be shared among multiple workers, the
+`Create` function returns a pointer to the `NeuralNet` instance .
+
+### Parameter sharing
+
+ `Param` sharing
+is enabled by first sharing the Param configuration (in `NeuralNet::Create`)
+to create two similar (e.g., the same shape) Param objects, and then calling
+(in `NeuralNet::CreateNetFromGraph`),
+
+    void Param::ShareFrom(const Param& from);
+
+It is also possible to share `Param`s of two nets, e.g., sharing parameters of
+the training net and the test net,
+
+    void NeuralNet:ShareParamsFrom(NeuralNet* other);
+
+It will call `Param::ShareFrom` for each Param object.
+
+### Access functions
+`NeuralNet` provides a couple of access function to get the layers and params
+of the net:
+
+    const std::vector<Layer*>& layers() const;
+    const std::vector<Param*>& params() const ;
+    Layer* name2layer(string name) const;
+    Param* paramid2param(int id) const;
+
+
+### Partitioning
+
+
+#### Implementation
+
+SINGA partitions the neural net in `CreateGraph` function, which creates one
+node for each (partitioned) layer. For example, if one layer's partition
+dimension is 0 or 1, then it creates `npartition` nodes for it; if the
+partition dimension is -1, a single node is created, i.e., no partitioning.
+Each node is assigned a partition (or location) ID. If the original layer is
+configured with a location ID, then the ID is assigned to each newly created node.
+These nodes are connected according to the connections of the original layers.
+Some connection layers will be added automatically.
+For instance, if two connected sub-layers are located at two
+different workers, then a pair of bridge layers is inserted to transfer the
+feature (and gradient) blob between them. When two layers are partitioned on
+different dimensions, a concatenation layer which concatenates feature rows (or
+columns) and a slice layer which slices feature rows (or columns) would be
+inserted. These connection layers help making the network communication and
+synchronization transparent to the users.
+
+#### Dispatching partitions to workers
+
+Each (partitioned) layer is assigned a location ID, based on which it is dispatched to one
+worker. Particularly, the pointer to the `NeuralNet` instance is passed
+to every worker within the same group, but each worker only computes over the
+layers that have the same partition (or location) ID as the worker's ID.  When
+every worker computes the gradients of the entire model parameters
+(strategy-2), we refer to this process as data parallelism.  When different
+workers compute the gradients of different parameters (strategy-3 or
+strategy-1), we call this process model parallelism.  The hybrid partitioning
+leads to hybrid parallelism where some workers compute the gradients of the
+same subset of model parameters while other workers compute on different model
+parameters.  For example, to implement the hybrid parallelism for the
+[DCNN model](http://arxiv.org/abs/1404.5997), we set `partition_dim = 0` for
+lower layers and `partition_dim = 1` for higher layers.
+
diff --git a/doc/en/docs/optimizer.rst b/doc/en/docs/optimizer.rst
new file mode 100644
index 0000000..e6f1da9
--- /dev/null
+++ b/doc/en/docs/optimizer.rst
@@ -0,0 +1,29 @@
+.. Licensed to the Apache Software Foundation (ASF) under one
+   or more contributor license agreements.  See the NOTICE file
+   distributed with this work for additional information
+   regarding copyright ownership.  The ASF licenses this file
+   to you under the Apache License, Version 2.0 (the
+   "License"); you may not use this file except in compliance
+   with the License.  You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing,
+   software distributed under the License is distributed on an
+   "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+   KIND, either express or implied.  See the License for the
+   specific language governing permissions and limitations
+   under the License.
+
+
+Optimizer
+=========
+
+
+.. automodule:: singa.optimizer
+   :members:
+   :member-order: bysource
+   :show-inheritance:
+   :undoc-members:
+
+
diff --git a/doc/en/docs/software_stack.md b/doc/en/docs/software_stack.md
new file mode 100644
index 0000000..c60b6a5
--- /dev/null
+++ b/doc/en/docs/software_stack.md
@@ -0,0 +1,99 @@
+# Software Stack
+
+SINGA's software stack includes three major components, namely, core, IO and
+model. Figure 1 illustrates these components together with the hardware.
+The core component provides memory management and tensor operations;
+IO has classes for reading (and writing) data from (to) disk and network; The
+model component provides data structures and algorithms for machine learning models,
+e.g., layers for neural network models, optimizers/initializer/metric/loss for
+general machine learning models.
+
+
+<img src="../_static/images/singav1-sw.png" align="center" width="500px"/>
+<br/>
+<span><strong>Figure 1 - SINGA V1 software stack.</strong></span>
+
+## Core
+
+[Tensor](tensor.html) and [Device](device.html) are two core abstractions in SINGA. Tensor class represents a
+multi-dimensional array, which stores model variables and provides linear algebra
+operations for machine learning
+algorithms, including matrix multiplication and random functions. Each tensor
+instance (i.e. a tensor) is allocated on a Device instance.
+Each Device instance (i.e. a device) is created against one hardware device,
+e.g. a GPU card or a CPU core. Devices manage the memory of tensors and execute
+tensor operations on its execution units, e.g. CPU threads or CUDA streams.
+
+Depending on the hardware and the programming language, SINGA have implemented
+the following specific device classes:
+
+* **CudaGPU** represents an Nvidia GPU card. The execution units are the CUDA streams.
+* **CppCPU** represents a normal CPU. The execution units are the CPU threads.
+* **OpenclGPU** represents normal GPU card from both Nvidia and AMD.
+  The execution units are the CommandQueues. Given that OpenCL is compatible with
+  many hardware devices, e.g. FPGA and ARM, the OpenclGPU has the potential to be
+  extended for other devices.
+
+Different types of devices use different programming languages to write the kernel
+functions for tensor operations,
+
+* CppMath (tensor_math_cpp.h) implements the tensor operations using Cpp for CppCPU
+* CudaMath (tensor_math_cuda.h) implements the tensor operations using CUDA for CudaGPU
+* OpenclMath (tensor_math_opencl.h) implements the tensor operations using OpenCL for OpenclGPU
+
+In addition, different types of data, such as float32 and float16, could be supported by adding
+the corresponding tensor functions.
+
+Typically, users would create a device instance and pass it to create multiple
+tensor instances. When users call the Tensor functions, these function would invoke
+the corresponding implementation (CppMath/CudaMath/OpenclMath) automatically. In
+other words, the implementation of Tensor operations is transparent to users.
+
+Most machine learning algorithms could be expressed using (dense or sparse) tensors.
+Therefore, with the Tensor abstraction, SINGA would be able to run a wide range of models,
+including deep learning models and other traditional machine learning models.
+
+The Tensor and Device abstractions are extensible to support a wide range of hardware device
+using different programming languages. A new hardware device would be supported by
+adding a new Device subclass and the corresponding implementation of the Tensor
+operations (xxxMath).
+
+Optimizations in terms of speed and memory could be implemented by Device, which
+manages both operation execution and memory malloc/free. More optimization details
+would be described in the [Device page](device.html).
+
+
+## Model
+
+On top of the Tensor and Device abstractions, SINGA provides some higher level
+classes for machine learning modules.
+
+* [Layer](layer.html) and its subclasses are specific for neural networks. Every layer provides
+  functions for forward propagating features and backward propagating gradients w.r.t the training loss functions.
+  They wrap the complex layer operations so that users can easily create neural nets
+  by connecting a set of layers.
+
+* [Initializer](initializer.html) and its subclasses provide variant methods of initializing
+  model parameters (stored in Tensor instances), following Uniform, Gaussian, etc.
+
+* [Loss](loss.html) and its subclasses define the training objective loss functions.
+  Both functions of computing the loss values and computing the gradient of the prediction w.r.t the
+  objective loss are implemented. Example loss functions include squared error and cross entropy.
+
+* [Metric](metric.html) and its subclasses provide the function to measure the
+  performance of the model, e.g., the accuracy.
+
+* [Optimizer](optimizer.html) and its subclasses implement the methods for updating
+  model parameter values using parameter gradients, including SGD, AdaGrad, RMSProp etc.
+
+
+## IO
+
+The IO module consists of classes for data loading, data preprocessing and message passing.
+
+* Reader and its subclasses load string records from disk files
+* Writer and its subclasses write string records to disk files
+* Encoder and its subclasses encode Tensor instances into string records
+* Decoder and its subclasses decode string records into Tensor instances
+* Endpoint represents a communication endpoint which provides functions for passing messages to each other.
+* Message represents communication messages between Endpoint instances. It carries both meta data and payload.
diff --git a/doc/en/docs/tensor.rst b/doc/en/docs/tensor.rst
new file mode 100644
index 0000000..d9e7f18
--- /dev/null
+++ b/doc/en/docs/tensor.rst
@@ -0,0 +1,48 @@
+.. Licensed to the Apache Software Foundation (ASF) under one
+   or more contributor license agreements.  See the NOTICE file
+   distributed with this work for additional information
+   regarding copyright ownership.  The ASF licenses this file
+   to you under the Apache License, Version 2.0 (the
+   "License"); you may not use this file except in compliance
+   with the License.  You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing,
+   software distributed under the License is distributed on an
+   "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+   KIND, either express or implied.  See the License for the
+   specific language governing permissions and limitations
+   under the License.
+
+
+Tensor
+========
+
+Each Tensor instance is a multi-dimensional array allocated on a specific
+Device instance. Tensor instances store variables and provide
+linear algebra operations over different types of hardware devices without user
+awareness. Note that users need to make sure the tensor operands are
+allocated on the same device except copy functions.
+
+
+Tensor implementation
+---------------------
+
+SINGA has three different sets of implementations of Tensor functions, one for each
+type of Device.
+
+* 'tensor_math_cpp.h' implements operations using Cpp (with CBLAS) for CppCPU devices.
+* 'tensor_math_cuda.h' implements operations using Cuda (with cuBLAS) for CudaGPU devices.
+* 'tensor_math_opencl.h' implements operations using OpenCL for OpenclGPU devices.
+
+Python API
+----------
+
+
+.. automodule:: singa.tensor
+   :members:
+
+
+CPP API
+---------
diff --git a/doc/en/docs/utils.rst b/doc/en/docs/utils.rst
new file mode 100644
index 0000000..4736ce1
--- /dev/null
+++ b/doc/en/docs/utils.rst
@@ -0,0 +1,24 @@
+.. Licensed to the Apache Software Foundation (ASF) under one
+   or more contributor license agreements.  See the NOTICE file
+   distributed with this work for additional information
+   regarding copyright ownership.  The ASF licenses this file
+   to you under the Apache License, Version 2.0 (the
+   "License"); you may not use this file except in compliance
+   with the License.  You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing,
+   software distributed under the License is distributed on an
+   "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+   KIND, either express or implied.  See the License for the
+   specific language governing permissions and limitations
+   under the License.
+
+
+Misc.
+=========
+
+
+.. automodule:: singa.utils
+   :members:
diff --git a/doc/en/downloads.md b/doc/en/downloads.md
new file mode 100644
index 0000000..fe0c30a
--- /dev/null
+++ b/doc/en/downloads.md
@@ -0,0 +1,66 @@
+## Download SINGA
+
+* Latest code: please clone the dev branch from [Github](https://github.com/apache/incubator-singa)
+
+* v0.3.0 (20 April 2016):
+    * [Apache SINGA 0.3.0](http://www.apache.org/dyn/closer.cgi/incubator/singa/0.3.0/apache-singa-incubating-0.3.0.tar.gz)
+      [\[MD5\]](https://dist.apache.org/repos/dist/release/incubator/singa/0.3.0/apache-singa-incubating-0.3.0.tar.gz.md5)
+      [\[KEYS\]](https://dist.apache.org/repos/dist/release/incubator/singa/0.3.0/KEYS)
+    * [Release Notes 0.3.0](releases/RELEASE_NOTES_0.3.0.html)
+    * New features and major updates,
+        * [Training on GPU cluster](v0.3.0/gpu.html) enables training of deep learning models over a GPU cluster.
+        * [Python wrapper improvement](v0.3.0/python.html) makes it easy to configure the job, including neural net and SGD algorithm.
+        * [New SGD updaters](v0.3.0/updater.html) are added, including Adam, AdaDelta and AdaMax.
+        * [Installation](v0.3.0/installation.html) has fewer dependent libraries for single node training.
+        * Heterogeneous training with CPU and GPU.
+        * Support cuDNN V4.
+        * Data prefetching.
+        * Fix some bugs.
+
+
+
+* v0.2.0 (14 January 2016):
+    * [Apache SINGA 0.2.0](http://www.apache.org/dyn/closer.cgi/incubator/singa/0.2.0/apache-singa-incubating-0.2.0.tar.gz)
+      [\[MD5\]](https://archive.apache.org/dist/incubator/singa/0.2.0/apache-singa-incubating-0.2.0.tar.gz.md5)
+      [\[KEYS\]](https://archive.apache.org/dist/incubator/singa/0.2.0/KEYS)
+    * [Release Notes 0.2.0](releases/RELEASE_NOTES_0.2.0.html)
+    * New features and major updates,
+        * [Training on GPU](v0.2.0/gpu.html) enables training of complex models on a single node with multiple GPU cards.
+        * [Hybrid neural net partitioning](v0.2.0/hybrid.html) supports data and model parallelism at the same time.
+        * [Python wrapper](v0.2.0/python.html) makes it easy to configure the job, including neural net and SGD algorithm.
+        * [RNN model and BPTT algorithm](v0.2.0/general-rnn.html) are implemented to support applications based on RNN models, e.g., GRU.
+        * [Cloud software integration](v0.2.0/distributed-training.html) includes Mesos, Docker and HDFS.
+        * Visualization of neural net structure and layer information, which is helpful for debugging.
+        * Linear algebra functions and random functions against Blobs and raw data pointers.
+        * New layers, including SoftmaxLayer, ArgSortLayer, DummyLayer, RNN layers and cuDNN layers.
+        * Update Layer class to carry multiple data/grad Blobs.
+        * Extract features and test performance for new data by loading previously trained model parameters.
+        * Add Store class for IO operations.
+
+
+* v0.1.0 (8 October 2015):
+    * [Apache SINGA 0.1.0](http://www.apache.org/dyn/closer.cgi/incubator/singa/apache-singa-incubating-0.1.0.tar.gz)
+      [\[MD5\]](https://archive.apache.org/dist/incubator/singa/apache-singa-incubating-0.1.0.tar.gz.md5)
+      [\[KEYS\]](https://archive.apache.org/dist/incubator/singa/KEYS)
+    * [Amazon EC2 image](https://console.aws.amazon.com/ec2/v2/home?region=ap-southeast-1#LaunchInstanceWizard:ami=ami-b41001e6)
+    * [Release Notes 0.1.0](releases/RELEASE_NOTES_0.1.0.html)
+    * Major features include,
+        * Installation using GNU build utility
+        * Scripts for job management with zookeeper
+        * Programming model based on NeuralNet and Layer abstractions.
+        * System architecture based on Worker, Server and Stub.
+        * Training models from three different model categories, namely, feed-forward models, energy models and RNN models.
+        * Synchronous and asynchronous distributed training frameworks using CPU
+        * Checkpoint and restore
+        * Unit test using gtest
+
+**Disclaimer**
+
+Apache SINGA is an effort undergoing incubation at The Apache Software
+Foundation (ASF), sponsored by the Apache Incubator PMC. Incubation is
+required of all newly accepted projects until a further review indicates that
+the infrastructure, communications, and decision making process have stabilized
+in a manner consistent with other successful ASF projects. While incubation
+status is not necessarily a reflection of the completeness or stability of the
+code, it does indicate that the project has yet to be fully endorsed by the
+ASF.
diff --git a/doc/en/index.rst b/doc/en/index.rst
new file mode 100755
index 0000000..bdf5b1d
--- /dev/null
+++ b/doc/en/index.rst
@@ -0,0 +1,124 @@
+.. Licensed to the Apache Software Foundation (ASF) under one
+   or more contributor license agreements.  See the NOTICE file
+   distributed with this work for additional information
+   regarding copyright ownership.  The ASF licenses this file
+   to you under the Apache License, Version 2.0 (the
+   "License"); you may not use this file except in compliance
+   with the License.  You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing,
+   software distributed under the License is distributed on an
+   "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+   KIND, either express or implied.  See the License for the
+   specific language governing permissions and limitations
+   under the License.
+
+
+.. Singa documentation master file, created by
+   sphinx-quickstart on Sat Jul  9 20:36:57 2016.
+   You can adapt this file completely to your liking, but it should at least
+   contain the root `toctree` directive.
+Welcome to Apache Singa
+=======================
+
+Recent News
+-----------
+
+* The **third release** is now available, 20 April, 2016. `Download SINGA v0.3.0 <downloads.html>`_
+
+* The **second release** is now available, 14 Jan, 2016. `Download SINGA v0.2.0 <downloads.html>`_.
+
+* SINGA will be presented at `Strata+Hadoop <http://strataconf.com/big-data-conference-sg-2015/public/schedule/detail/45123>`_ on 2 Dec, 2015
+
+* SINGA was presented at `ACM Multimedia <http://www.acmmm.org/2015/at-a-glance/>`_ Best Paper session and Open Source Software Competition session, 26-30 Oct, 2015 (`Slides <files/mm2015.ppt>`_)
+
+* The **first release** is now available, 8 Oct, 2015. `Download SINGA v0.1.0 <downloads.html>`_.
+
+* SINGA was presented at `workshop on deep learning <http://www.comp.nus.edu.sg/~dbsystem/singa/workshop>`_  held on 16 Sep, 2015
+
+* SINGA was presented at `BOSS <http://boss.dima.tu-berlin.de/>`_ of `VLDB 2015 <http://www.vldb.org/2015/>`_ at Hawaii, 4 Sep, 2015. (slides: `overview <files/singa-vldb-boss.pptx>`_, `basic <files/basic-user-guide.pptx>`_, `advanced <files/advanced-user-guide.pptx>`_)
+
+* SINGA was presented at `ADSC/I2R Deep Learning Workshop <http://adsc.illinois.edu/contact-us>`_, 25 Aug, 2015.
+
+* A tutorial on SINGA was given at VLDB summer school at Tsinghua University,  25-31 July, 2015.
+
+* A half day tutorial on SINGA was given at I2R, 29 June, 2015.
+
+* SINGA was presented at `DanaC <http://danac.org/>`_ of `SIGMOD 2015 <http://www.sigmod2015.org/index.shtml>`_ at Melbourne, 31 May - 4 June, 2015.
+
+* SINGA has been accepted by `Apache Incubator <http://incubator.apache.org/>`_, 17 March, 2015.
+
+Getting Started
+---------------
+* The `Software stack <docs/software_stack.html>`_ page gives an overview of SINGA.
+
+* The `Installation <docs/installation.html>`_ guide describes details on downloading and installing SINGA.
+
+* Please follow the `Examples <docs/examples/index.html>`_ guide to run simple applications on SINGA.
+
+Documentation
+-------------
+
+* Documentations are listed `here <docs.html>`_.
+
+* Research publication list is available `here <http://www.comp.nus.edu.sg/~dbsystem/singa/research/publication/>`_.
+
+How to contribute
+----------------------
+
+* Please subscribe to our development mailing list dev-subscribe@singa.incubator.apache.org.
+
+* If you find any issues using SINGA, please report it to the `Issue Tracker <https://issues.apache.org/jira/browse/singa>`_.
+
+* You can also contact `SINGA committers <community.html>`_ directly.
+
+More details on contributing to SINGA is described `here <develop/how-contribute.html>`_ .
+
+Citing SINGA
+------------
+
+Please cite the following two papers if you use SINGA in your research:
+
+* B. C. Ooi, K.-L. Tan, S. Wang, W. Wang, Q. Cai, G. Chen, J. Gao, Z. Luo, A. K. H. Tung, Y. Wang, Z. Xie, M. Zhang, and K. Zheng. `SINGA: A distributed deep learning platform <http://www.comp.nus.edu.sg/~ooibc/singaopen-mm15.pdf>`_. ACM Multimedia (Open Source Software Competition) 2015 (`BibTex <http://www.comp.nus.edu.sg/~dbsystem/singa//assets/file/bib-oss.txt>`_).
+
+* W. Wang, G. Chen, T. T. A. Dinh, B. C. Ooi, K.-L.Tan, J. Gao, and S. Wang. `SINGA: putting deep learning in the hands of multimedia users <http://www.comp.nus.edu.sg/~ooibc/singa-mm15.pdf>`_. ACM Multimedia 2015 (`BibTex <http://www.comp.nus.edu.sg/~dbsystem/singa//assets/file/bib-singa.txt>`_, `Slides <files/mm2015.ppt>`_).
+
+.. toctree::
+   :hidden:
+
+   downloads
+   docs/index
+
+.. toctree::
+   :hidden:
+   :maxdepth: 2
+   :caption: Development
+
+   develop/schedule
+   develop/how-contribute
+   develop/contribute-code
+   develop/contribute-docs
+
+.. toctree::
+   :hidden:
+   :maxdepth: 2
+   :caption: Community
+
+   community/source-repository
+   community/mail-lists
+   community/issue-tracking
+   community/team-list
+
+
+
+License
+----------
+SINGA is released under `Apache License Version 2.0 <http://www.apache.org/licenses/LICENSE-2.0>`_.
+
+Disclaimers
+-----------
+
+Apache SINGA is an effort undergoing incubation at The Apache Software Foundation (ASF), sponsored by the Apache Incubator. Incubation is required of all newly accepted projects until a further review indicates that the infrastructure, communications, and decision making process have stabilized in a manner consistent with other successful ASF projects. While incubation status is not necessarily a reflection of the completeness or stability of the code, it does indicate that the project has yet to be fully endorsed by the ASF.
+
diff --git a/doc/en/releases/RELEASE_NOTES_0.1.0.md b/doc/en/releases/RELEASE_NOTES_0.1.0.md
new file mode 100644
index 0000000..2674d90
--- /dev/null
+++ b/doc/en/releases/RELEASE_NOTES_0.1.0.md
@@ -0,0 +1,99 @@
+# singa-incubating-0.1.0 Release Notes
+
+---
+
+SINGA is a general distributed deep learning platform for training big deep learning models over large datasets. It is
+designed with an intuitive programming model based on the layer abstraction. SINGA supports a wide variety of popular
+deep learning models.
+
+This release includes following features:
+
+  * Job management
+    * [SINGA-3](https://issues.apache.org/jira/browse/SINGA-3)  Use Zookeeper to check stopping (finish) time of the system
+    * [SINGA-16](https://issues.apache.org/jira/browse/SINGA-16)  Runtime Process id Management
+    * [SINGA-25](https://issues.apache.org/jira/browse/SINGA-25)  Setup glog output path
+    * [SINGA-26](https://issues.apache.org/jira/browse/SINGA-26)  Run distributed training in a single command
+    * [SINGA-30](https://issues.apache.org/jira/browse/SINGA-30)  Enhance easy-to-use feature and support concurrent jobs
+    * [SINGA-33](https://issues.apache.org/jira/browse/SINGA-33)  Automatically launch a number of processes in the cluster
+    * [SINGA-34](https://issues.apache.org/jira/browse/SINGA-34)  Support external zookeeper service
+    * [SINGA-38](https://issues.apache.org/jira/browse/SINGA-38)  Support concurrent jobs
+    * [SINGA-39](https://issues.apache.org/jira/browse/SINGA-39)  Avoid ssh in scripts for single node environment
+    * [SINGA-43](https://issues.apache.org/jira/browse/SINGA-43)  Remove Job-related output from workspace
+    * [SINGA-56](https://issues.apache.org/jira/browse/SINGA-56)  No automatic launching of zookeeper service
+    * [SINGA-73](https://issues.apache.org/jira/browse/SINGA-73)  Refine the selection of available hosts from host list
+
+
+  * Installation with GNU Auto tool
+    * [SINGA-4](https://issues.apache.org/jira/browse/SINGA-4)  Refine thirdparty-dependency installation
+    * [SINGA-13](https://issues.apache.org/jira/browse/SINGA-13)  Separate intermediate files of compilation from source files
+    * [SINGA-17](https://issues.apache.org/jira/browse/SINGA-17)  Add root permission within thirdparty/install.
+    * [SINGA-27](https://issues.apache.org/jira/browse/SINGA-27)  Generate python modules for proto objects
+    * [SINGA-53](https://issues.apache.org/jira/browse/SINGA-53)  Add lmdb compiling options
+    * [SINGA-62](https://issues.apache.org/jira/browse/SINGA-62)  Remove building scrips and auxiliary files
+    * [SINGA-67](https://issues.apache.org/jira/browse/SINGA-67)  Add singatest into build targets
+
+
+  * Distributed training
+    * [SINGA-7](https://issues.apache.org/jira/browse/SINGA-7)  Implement shared memory Hogwild algorithm
+    * [SINGA-8](https://issues.apache.org/jira/browse/SINGA-8)  Implement distributed Hogwild
+    * [SINGA-19](https://issues.apache.org/jira/browse/SINGA-19)  Slice large Param objects for load-balance
+    * [SINGA-29](https://issues.apache.org/jira/browse/SINGA-29)  Update NeuralNet class to enable layer partition type customization
+    * [SINGA-24](https://issues.apache.org/jira/browse/SINGA-24)  Implement Downpour training framework
+    * [SINGA-32](https://issues.apache.org/jira/browse/SINGA-32)  Implement AllReduce training framework
+    * [SINGA-57](https://issues.apache.org/jira/browse/SINGA-57)  Improve Distributed Hogwild
+
+
+  * Training algorithms for different model categories
+    * [SINGA-9](https://issues.apache.org/jira/browse/SINGA-9)  Add Support for Restricted Boltzmann Machine (RBM) model
+    * [SINGA-10](https://issues.apache.org/jira/browse/SINGA-10)  Add Support for Recurrent Neural Networks (RNN)
+
+
+  * Checkpoint and restore
+    * [SINGA-12](https://issues.apache.org/jira/browse/SINGA-12)  Support Checkpoint and Restore
+
+
+  * Unit test
+    * [SINGA-64](https://issues.apache.org/jira/browse/SINGA-64)  Add the test module for utils/common
+
+
+  * Programming model
+    * [SINGA-36](https://issues.apache.org/jira/browse/SINGA-36)  Refactor job configuration, driver program and scripts
+    * [SINGA-37](https://issues.apache.org/jira/browse/SINGA-37)  Enable users to set parameter sharing in model configuration
+    * [SINGA-54](https://issues.apache.org/jira/browse/SINGA-54)  Refactor job configuration to move fields in ModelProto out
+    * [SINGA-55](https://issues.apache.org/jira/browse/SINGA-55)  Refactor main.cc and singa.h
+    * [SINGA-61](https://issues.apache.org/jira/browse/SINGA-61)  Support user defined classes
+    * [SINGA-65](https://issues.apache.org/jira/browse/SINGA-65)  Add an example of writing user-defined layers
+
+
+  * Other features
+    * [SINGA-6](https://issues.apache.org/jira/browse/SINGA-6)  Implement thread-safe singleton
+    * [SINGA-18](https://issues.apache.org/jira/browse/SINGA-18)  Update API for displaying performance metric
+    * [SINGA-77](https://issues.apache.org/jira/browse/SINGA-77)  Integrate with Apache RAT
+
+
+Some bugs are fixed during the development of this release
+
+  * [SINGA-2](https://issues.apache.org/jira/browse/SINGA-2) Check failed: zsock_connect
+  * [SINGA-5](https://issues.apache.org/jira/browse/SINGA-5) Server early terminate when zookeeper singa folder is not initially empty
+  * [SINGA-15](https://issues.apache.org/jira/browse/SINGA-15) Fix a bug from ConnectStub function which gets stuck for connecting layer_dealer_
+  * [SINGA-22](https://issues.apache.org/jira/browse/SINGA-22) Cannot find openblas library when it is installed in default path
+  * [SINGA-23](https://issues.apache.org/jira/browse/SINGA-23) Libtool version mismatch error.
+  * [SINGA-28](https://issues.apache.org/jira/browse/SINGA-28) Fix a bug from topology sort of Graph
+  * [SINGA-42](https://issues.apache.org/jira/browse/SINGA-42) Issue when loading checkpoints
+  * [SINGA-44](https://issues.apache.org/jira/browse/SINGA-44) A bug when resetting metric values
+  * [SINGA-46](https://issues.apache.org/jira/browse/SINGA-46) Fix a bug in updater.cc to scale the gradients
+  * [SINGA-47](https://issues.apache.org/jira/browse/SINGA-47) Fix a bug in data layers that leads to out-of-memory when group size is too large
+  * [SINGA-48](https://issues.apache.org/jira/browse/SINGA-48) Fix a bug in trainer.cc that assigns the same NeuralNet instance to workers from diff groups
+  * [SINGA-49](https://issues.apache.org/jira/browse/SINGA-49) Fix a bug in HandlePutMsg func that sets param fields to invalid values
+  * [SINGA-66](https://issues.apache.org/jira/browse/SINGA-66) Fix bugs in Worker::RunOneBatch function and ClusterProto
+  * [SINGA-79](https://issues.apache.org/jira/browse/SINGA-79) Fix bug in singatool that can not parse -conf flag
+
+
+Features planned for the next release
+
+  * [SINGA-11](https://issues.apache.org/jira/browse/SINGA-11) Start SINGA using Mesos
+  * [SINGA-31](https://issues.apache.org/jira/browse/SINGA-31) Extend Blob to support xpu (cpu or gpu)
+  * [SINGA-35](https://issues.apache.org/jira/browse/SINGA-35) Add random number generators
+  * [SINGA-40](https://issues.apache.org/jira/browse/SINGA-40) Support sparse Param update
+  * [SINGA-41](https://issues.apache.org/jira/browse/SINGA-41) Support single node single GPU training
+
diff --git a/doc/en/releases/RELEASE_NOTES_0.2.0.md b/doc/en/releases/RELEASE_NOTES_0.2.0.md
new file mode 100644
index 0000000..38f498a
--- /dev/null
+++ b/doc/en/releases/RELEASE_NOTES_0.2.0.md
@@ -0,0 +1,84 @@
+# singa-incubating-0.2.0 Release Notes
+
+---
+
+SINGA is a general distributed deep learning platform for training big deep
+learning models over large datasets. It is designed with an intuitive
+programming model based on the layer abstraction. SINGA supports a wide variety
+of popular deep learning models.
+
+This release includes the following **major features**:
+
+* [Training on GPU](../docs/gpu.html) enables training of complex models on a single node with multiple GPU cards.
+* [Hybrid neural net partitioning](../docs/hybrid.html) supports data and model parallelism at the same time.
+* [Python wrapper](../docs/python.html) makes it easy to configure the job, including neural net and SGD algorithm.
+* [RNN model and BPTT algorithm](../docs/general-rnn.html) are implemented to support applications based on RNN models, e.g., GRU.
+* [Cloud software integration](../docs/distributed-training.md) includes Mesos, Docker and HDFS.
+
+
+**More details** are listed as follows,
+
+  * Programming model
+    * [SINGA-80] New Blob Level and Address Level Math Operation Interface
+    * [SINGA-82] Refactor input layers using data store abstraction
+    * [SINGA-87] Replace exclude field to include field for layer configuration
+    * [SINGA-110] Add Layer member datavec_ and gradvec_
+    * [SINGA-120] Implemented GRU and BPTT (BPTTWorker)
+
+
+  * Neuralnet layers
+    * [SINGA-91] Add SoftmaxLayer and ArgSortLayer
+    * [SINGA-106] Add dummy layer for test purpose
+    * [SINGA-120] Implemented GRU and BPTT (GRULayer and OneHotLayer)
+
+
+  * GPU training support
+    * [SINGA-100] Implement layers using CUDNN for GPU training
+    * [SINGA-104] Add Context Class
+    * [SINGA-105] Update GNU make files for compiling cuda related code
+    * [SINGA-98] Add Support for AlexNet ImageNet Classification Model
+
+
+  * Model/Hybrid partition
+    * [SINGA-109] Refine bridge layers
+    * [SINGA-111] Add slice, concate and split layers
+    * [SINGA-113] Model/Hybrid Partition Support
+
+
+  * Python binding
+    * [SINGA-108] Add Python wrapper to singa
+
+
+  * Predict-only mode
+    * [SINGA-85] Add functions for extracting features and test new data
+
+
+  * Integrate with third-party tools
+    * [SINGA-11] Start SINGA on Apache Mesos
+    * [SINGA-78] Use Doxygen to generate documentation
+    * [SINGA-89] Add Docker support
+
+
+  * Unit test
+    * [SINGA-95] Add make test after building
+
+
+  * Other improvements
+    * [SINGA-84] Header Files Rearrange
+    * [SINGA-93] Remove the asterisk in the log tcp://169.254.12.152:*:49152
+    * [SINGA-94] Move call to google::InitGoogleLogging() from Driver::Init() to main()
+    * [SINGA-96] Add Momentum to Cifar10 Example
+    * [SINGA-101] Add ll (ls -l) command in .bashrc file when using docker
+    * [SINGA-114] Remove short logs in tmp directory
+    * [SINGA-115] Print layer debug information in the neural net graph file
+    * [SINGA-118] Make protobuf LayerType field id easy to assign
+    * [SINGA-97] Add HDFS Store
+
+
+  * Bugs fixed
+    * [SINGA-85] Fix compilation errors in examples
+    * [SINGA-90] Miscellaneous trivial bug fixes
+    * [SINGA-107] Error from loading pre-trained params for training stacked RBMs
+    * [SINGA-116] Fix a bug in InnerProductLayer caused by weight matrix sharing
+
+
diff --git a/doc/en/releases/RELEASE_NOTES_0.3.0.md b/doc/en/releases/RELEASE_NOTES_0.3.0.md
new file mode 100644
index 0000000..c169e12
--- /dev/null
+++ b/doc/en/releases/RELEASE_NOTES_0.3.0.md
@@ -0,0 +1,37 @@
+# singa-incubating-0.3.0 Release Notes
+
+---
+
+SINGA is a general distributed deep learning platform for training big deep
+learning models over large datasets. It is designed with an intuitive
+programming model based on the layer abstraction. SINGA supports a wide variety
+of popular deep learning models.
+
+This release includes following features:
+
+  * GPU Support
+    * [SINGA-131] Implement and optimize hybrid training using both CPU and GPU
+    * [SINGA-136] Support cuDNN v4
+    * [SINGA-134] Extend SINGA to run over a GPU cluster
+    * [Singa-157] Change the priority of cudnn library and install libsingagpu.so
+
+  * Remove Dependences
+    * [SINGA-156] Remove the dependency on ZMQ for single process training
+    * [SINGA-155] Remove zookeeper for single-process training
+
+  * Python Binding
+    * [SINGA-126] Python Binding for Interactive Training
+
+  * Other Improvements
+    * [SINGA-80] New Blob Level and Address Level Math Operation Interface
+    * [SINGA-130] Data Prefetching
+    * [SINGA-145] New SGD based optimization Updaters: AdaDelta, Adam, AdamMax
+
+  * Bugs Fixed
+    * [SINGA-148] Race condition between Worker threads and Driver
+    * [SINGA-150] Mesos Docker container failed
+    * [SINGA-141] Undesired Hash collision when locating process id to worker…
+    * [SINGA-149] Docker build fail
+    * [Singa-143] The compilation cannot detect libsingagpu.so file
+
+
diff --git a/doc/zh/index.rst b/doc/zh/index.rst
new file mode 100644
index 0000000..3d59dd3
--- /dev/null
+++ b/doc/zh/index.rst
@@ -0,0 +1,27 @@
+.. Licensed to the Apache Software Foundation (ASF) under one
+   or more contributor license agreements.  See the NOTICE file
+   distributed with this work for additional information
+   regarding copyright ownership.  The ASF licenses this file
+   to you under the Apache License, Version 2.0 (the
+   "License"); you may not use this file except in compliance
+   with the License.  You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing,
+   software distributed under the License is distributed on an
+   "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+   KIND, either express or implied.  See the License for the
+   specific language governing permissions and limitations
+   under the License.
+
+
+SINGA 中文文档
+==============
+
+.. toctree::
+
+   overview
+   installation_source
+   programming-guide
+
diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt
new file mode 100644
index 0000000..0bb6c2f
--- /dev/null
+++ b/examples/CMakeLists.txt
@@ -0,0 +1,20 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+# 
+#     http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# 
+
+ADD_SUBDIRECTORY(cifar10)
+ADD_SUBDIRECTORY(imagenet)
diff --git a/examples/alexnet/Makefile.example b/examples/alexnet/Makefile.example
deleted file mode 100644
index f895146..0000000
--- a/examples/alexnet/Makefile.example
+++ /dev/null
@@ -1,29 +0,0 @@
-#/**
-# * Licensed to the Apache Software Foundation (ASF) under one
-# * or more contributor license agreements.  See the NOTICE file
-# * distributed with this work for additional information
-# * regarding copyright ownership.  The ASF licenses this file
-# * to you under the Apache License, Version 2.0 (the
-# * "License"); you may not use this file except in compliance
-# * with the License.  You may obtain a copy of the License at
-# *
-# *     http://www.apache.org/licenses/LICENSE-2.0
-# *
-# * Unless required by applicable law or agreed to in writing, software
-# * distributed under the License is distributed on an "AS IS" BASIS,
-# * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# * See the License for the specific language governing permissions and
-# * limitations under the License.
-# */
-
-libs :=singa glog protobuf
-
-.PHONY: all create
-
-create:
-	$(CXX) im2rec.cc `pkg-config opencv --cflags --libs` -std=c++11 -lsinga -lprotobuf -lglog \
-		-I../../include -L../../.libs/ -Wl,-unresolved-symbols=ignore-in-shared-libs \
-		-Wl,-rpath=../../.libs/  -o im2rec.bin
-	$(CXX) rec2im_test.cc `pkg-config opencv --cflags --libs` -std=c++11 -lsinga -lprotobuf -lglog \
-		-I../../include -L../../.libs/ -Wl,-unresolved-symbols=ignore-in-shared-libs \
-		-Wl,-rpath=../../.libs/  -o rec2im_test.bin
diff --git a/examples/alexnet/cudnn.conf b/examples/alexnet/cudnn.conf
deleted file mode 100644
index e8d14c6..0000000
--- a/examples/alexnet/cudnn.conf
+++ /dev/null
@@ -1,448 +0,0 @@
-name: "alexnet"
-train_steps: 450000
-test_steps: 500
-test_freq: 1000
-disp_freq: 20
-checkpoint_freq: 100000
-checkpoint_after: 100000
-gpu: 0
-#debug: true
-#checkpoint_path: "examples/alexnet/checkpoint/step10000-worker0"
-train_one_batch {
-  alg: kBP
-}
-updater{
-  type: kSGD
-  weight_decay: 0.0005
-  momentum: 0.9
-  learning_rate {
-    type: kStep
-    base_lr: 0.01
-    step_conf {
-      gamma: 0.1
-      change_freq: 100000
-    }
-  }
-}
-neuralnet {
-  layer{
-    name: "data"
-    type: kRecordInput
-    store_conf {
-      backend: "kvfile"
-      path :"/data/dataset/imagenet/train_record.bin"
-      mean_file: "/data/dataset/imagenet/image_mean.bin"
-      batchsize: 256
-      #random_skip: 1000
-      shape: 3
-      shape: 256
-      shape: 256
-    }
-    include: kTrain
-  }
-  layer{
-    name: "data"
-    type: kRecordInput
-    store_conf {
-      backend: "kvfile"
-      path :"/data/dataset/imagenet/val_record.bin"
-      mean_file: "/data/dataset/imagenet/image_mean.bin"
-      batchsize: 100
-      shape: 3
-      shape: 256
-      shape: 256
-    }
-    include: kTest
-  }
-  layer{
-    name: "image"
-    type: kImagePreprocess
-    rgbimage_conf {
-      cropsize: 227
-      mirror: true
-    }
-#    partition_dim: 0
-    srclayers: "data"
-  }
-  layer{
-    name: "conv1"
-    type: kCudnnConv
-    srclayers: "image"
-    convolution_conf {
-      num_filters: 96
-      kernel: 11
-      stride: 4
-    }
-#    partition_dim: 0
-    param {
-      name: "w1"
-      init {
-        type: kGaussian
-        std: 0.01
-      }
-    }
-    param {
-      name: "b1"
-      lr_scale: 2
-      wd_scale: 0
-      init {
-        type: kConstant
-        value: 0
-      }
-    }
-  }
-  layer {
-    name: "relu1"
-    type: kCudnnActivation
-    activation_conf {
-      type: RELU
-    }
-    share_src_blobs: true
-    srclayers: "conv1"
-#    partition_dim: 0
-  }
-  layer {
-    name: "pool1"
-    type: kCudnnPool
-    pooling_conf {
-      pool: MAX
-      kernel: 3
-      stride: 2
-    }
-    srclayers: "relu1"
-#    partition_dim: 0
-  }
-  layer {
-    name: "norm1"
-    type: kCudnnLRN
-    lrn_conf {
-      local_size: 5
-      alpha: 0.0001
-      beta: 0.75
-      knorm: 1
-    }
-    srclayers: "pool1"
-#    partition_dim: 0
-  }
-
-  layer{
-    name: "conv2"
-    type: kCudnnConv
-    srclayers: "norm1"
-    convolution_conf {
-      num_filters: 256
-      kernel: 5
-      pad: 2
-    }
-#    partition_dim: 0
-    param {
-      name: "w2"
-      init {
-        type: kGaussian
-        std: 0.01
-      }
-    }
-    param {
-      name: "b2"
-      lr_scale: 2
-      wd_scale: 0
-      init {
-        type: kConstant
-        value: 1
-      }
-    }
-  }
-  layer {
-    name: "relu2"
-    type: kCudnnActivation
-    activation_conf {
-      type: RELU
-    }
-    share_src_blobs: true
-    srclayers: "conv2"
-#    partition_dim: 0
-  }
-  layer {
-    name: "pool2"
-    type: kCudnnPool
-    pooling_conf {
-      pool: MAX
-      kernel: 3
-      stride: 2
-    }
-    srclayers: "relu2"
-#    partition_dim: 0
-  }
-
-  layer {
-    name: "norm2"
-    type: kCudnnLRN
-    lrn_conf {
-      local_size: 5
-      alpha: 0.0001
-      beta: 0.75
-      knorm: 1
-    }
-    srclayers: "pool2"
-#    partition_dim: 0
-  }
-  layer{
-    name: "conv3"
-    type: kCudnnConv
-    srclayers: "norm2"
-    convolution_conf {
-      num_filters: 384
-      kernel: 3
-      pad: 1
-    }
-#    partition_dim: 0
-    param {
-      name: "w3"
-      init {
-        type: kGaussian
-        std: 0.01
-      }
-    }
-    param {
-      name: "b3"
-      lr_scale: 2
-      wd_scale: 0
-      init {
-        type: kConstant
-        value: 0
-      }
-    }
-  }
-  layer {
-    name: "relu3"
-    type: kCudnnActivation
-    activation_conf {
-      type: RELU
-    }
-    share_src_blobs: true
-    srclayers: "conv3"
-#    partition_dim: 0
-  }
-  layer{
-    name: "conv4"
-    type: kCudnnConv
-    srclayers: "relu3"
-    convolution_conf {
-      num_filters: 384
-      kernel: 3
-      pad: 1
-    }
-#    partition_dim: 0
-    param {
-      name: "w4"
-      init {
-        type: kGaussian
-        std: 0.01
-      }
-    }
-    param {
-      name: "b4"
-      lr_scale:2
-      wd_scale:0
-      init {
-        type: kConstant
-        value: 1
-      }
-    }
-  }
-  layer {
-    name: "relu4"
-    type: kCudnnActivation
-    activation_conf {
-      type: RELU
-    }
-    share_src_blobs: true
-    srclayers: "conv4"
-#    partition_dim: 0
-  }
-  layer{
-    name: "conv5"
-    type: kCudnnConv
-    srclayers: "relu4"
-    convolution_conf {
-      num_filters: 256
-      kernel: 3
-      pad: 1
-    }
-#    partition_dim: 0
-    param {
-      name: "w5"
-      init {
-        type: kGaussian
-        std: 0.01
-      }
-    }
-    param {
-      name: "b5"
-      lr_scale: 2
-      wd_scale: 0
-      init {
-        type: kConstant
-        value: 1
-      }
-    }
-  }
-  layer {
-    name: "relu5"
-    type: kCudnnActivation
-    activation_conf {
-      type: RELU
-    }
-    share_src_blobs: true
-    srclayers: "conv5"
-#    partition_dim: 0
-  }
-  layer {
-    name: "pool5"
-    type: kCudnnPool
-    pooling_conf {
-      pool: MAX
-      kernel: 3
-      stride: 2
-    }
-    srclayers: "relu5"
-#    partition_dim: 0
-  }
-  layer {
-    name: "ip6"
-    type: kInnerProduct
-    innerproduct_conf {
-      num_output: 4096
-    }
-    param {
-      name: "w6"
-      init {
-        type: kGaussian
-        std: 0.005
-      }
-    }
-    param {
-      name: "b6"
-      lr_scale: 2
-      wd_scale: 0
-      init {
-        type: kConstant
-        value: 1
-      }
-    }
-    srclayers: "pool5"
-#    partition_dim: 1
-  }
-  layer {
-    name: "relu6"
-    type: kCudnnActivation
-    activation_conf {
-      type: RELU
-    }
-    share_src_blobs: true
-    srclayers: "ip6"
-#    partition_dim: 1
-  }
-  layer {
-    name: "drop6"
-    type: kDropout
-    srclayers: "relu6"
-#    partition_dim: 1
-  }
-  layer {
-    name: "ip7"
-    type: kInnerProduct
-    innerproduct_conf {
-      num_output: 4096
-    }
-#    partition_dim: 1
-    param {
-      name: "w7"
-      init {
-        type: kGaussian
-        std: 0.005
-      }
-    }
-    param {
-      name: "b7"
-      lr_scale: 2
-      wd_scale: 0
-      init {
-        type: kConstant
-        value: 1
-      }
-    }
-    srclayers: "drop6"
-  }
-  layer {
-    name: "relu7"
-    type: kCudnnActivation
-    activation_conf {
-      type: RELU
-    }
-    share_src_blobs: true
-    srclayers: "ip7"
-#    partition_dim: 1
-  }
-  layer {
-    name: "drop7"
-    type: kDropout
-    srclayers: "relu7"
-#    partition_dim: 1
-  }
-  layer {
-    name: "ip8"
-    type: kInnerProduct
-    innerproduct_conf {
-      num_output: 1000
-    }
-#    partition_dim: 1
-    param {
-      name: "w8"
-      init {
-        type: kGaussian
-        std: 0.01
-      }
-    }
-    param {
-      name: "b8"
-      lr_scale: 2
-      wd_scale: 0
-      init {
-        type: kConstant
-        value: 0
-      }
-    }
-    srclayers: "drop7"
-  }
-  layer {
-    name: "loss"
-    type: kCudnnSoftmaxLoss
-    softmaxloss_conf {
-      topk:1
-    }
-    srclayers: "ip8"
-    srclayers: "data"
-    include: kTrain
-  }
-  layer {
-   name : "softmax"
-   type: kCudnnSoftmax
-   srclayers: "ip8"
-   include: kTest
-  }
-  layer {
-   name : "accuracy"
-   type: kAccuracy
-   srclayers: "softmax"
-   srclayers: "data"
-   include: kTest
-  }
-}
-cluster {
-  nworker_groups: 1
-  nserver_groups: 1
-  nworkers_per_group: 1
-  nworkers_per_procs: 1
-  workspace: "examples/alexnet"
-}
diff --git a/examples/alexnet/im2rec.cc b/examples/alexnet/im2rec.cc
deleted file mode 100644
index 58ee44f..0000000
--- a/examples/alexnet/im2rec.cc
+++ /dev/null
@@ -1,157 +0,0 @@
-/************************************************************
-*
-* Licensed to the Apache Software Foundation (ASF) under one
-* or more contributor license agreements.  See the NOTICE file
-* distributed with this work for additional information
-* regarding copyright ownership.  The ASF licenses this file
-* to you under the Apache License, Version 2.0 (the
-* "License"); you may not use this file except in compliance
-* with the License.  You may obtain a copy of the License at
-*
-*   http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an
-* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-* KIND, either express or implied.  See the License for the
-* specific language governing permissions and limitations
-* under the License.
-*
-*************************************************************/
-
-
-#include <glog/logging.h>
-#include <opencv2/opencv.hpp>
-#include <algorithm>
-#include <random>
-#include <chrono>
-#include <fstream>
-#include <string>
-#include <cstdint>
-#include <iostream>
-#include <vector>
-
-#include "singa/io/store.h"
-#include "singa/proto/common.pb.h"
-
-using std::string;
-
-const int kImageSize = 256;
-const int kImageNBytes = 256*256*3;
-
-void create_data(const string& image_list,
-    const string& input_folder,
-    const string& output_folder,
-    const string& backend = "kvfile") {
-  singa::RecordProto image;
-  image.add_shape(3);
-  image.add_shape(kImageSize);
-  image.add_shape(kImageSize);
-
-  singa::RecordProto mean;
-  mean.CopyFrom(image);
-  for (int i = 0; i < kImageNBytes; ++i)
-    mean.add_data(0.f);
-
-  auto store = singa::io::CreateStore(backend);
-  if (backend == "lmdb")
-    CHECK(store->Open(output_folder + "/image_record", singa::io::kCreate));
-  else
-    CHECK(store->Open(output_folder + "/image_record.bin", singa::io::kCreate));
-
-  LOG(INFO) << "Generating image record";
-
-  std::ifstream image_list_file(image_list.c_str(), std::ios::in);
-  CHECK(image_list_file.is_open()) << "Unable to open image list";
-
-  string image_file_name;
-  int label;
-  char str_buffer[kImageNBytes];
-  string rec_buf;
-  cv::Mat img, res;
-  std::vector<std::pair<string, int>> file_list;
-  while (image_list_file >> image_file_name >> label)
-    file_list.push_back(std::make_pair(image_file_name, label));
-  LOG(INFO) << "Data Shuffling";
-  unsigned seed = std::chrono::system_clock::now().time_since_epoch().count();
-  std::shuffle(file_list.begin(), file_list.end()
-      , std::default_random_engine());
-  LOG(INFO) << "Total number of images is " << file_list.size();
-  int ImageNum = file_list.size();
-
-  for (int imageid = 0; imageid < ImageNum; ++imageid) {
-    string path = input_folder + "/" + file_list[imageid].first;
-    img = cv::imread(path, CV_LOAD_IMAGE_COLOR);
-    CHECK(img.data != NULL) << "OpenCV load image fail" << path;
-    cv::resize(img, res, cv::Size(kImageSize, kImageSize),
-        0, 0, CV_INTER_LINEAR);
-    for (int h = 0; h < kImageSize; ++h) {
-      const uchar* ptr = res.ptr<uchar>(h);
-      int img_index = 0;
-      for (int w = 0; w < kImageSize; ++w)
-        for (int c = 0; c < 3; ++c)
-          str_buffer[(c*kImageSize+h)*kImageSize+w] =
-            static_cast<uint8_t>(ptr[img_index++]);
-    }
-    /*
-    for (int i = 0; i < kImageSize; ++i) {
-      for (int j = 0; j < kImageSize; ++j) {
-        cv::Vec3b pixel = res.at<cv::Vec3b>(j, i);
-        str_buffer[i*kImageSize+j] = static_cast<uint8_t>(pixel.val[2]);
-        str_buffer[kImageSize*kImageSize+i*kImageSize+j] = static_cast<uint8_t>(pixel.val[1]);
-        str_buffer[kImageSize*kImageSize*2+i*kImageSize+j] = static_cast<uint8_t>(pixel.val[0]);
-      }
-    }
-    */
-    image.set_label(file_list[imageid].second);
-    image.set_pixel(str_buffer, kImageNBytes);
-    image.SerializeToString(&rec_buf);
-
-    int length = snprintf(str_buffer, kImageNBytes, "%08d", imageid);
-    CHECK(store->Write(string(str_buffer, length), rec_buf));
-    if ((imageid+1) % 1000 == 0) {
-      store->Flush();
-      LOG(INFO) << imageid+1 << " files processed.";
-    }
-    const string& pixels = image.pixel();
-    for (int i = 0; i < kImageNBytes; ++i)
-      mean.set_data(i, mean.data(i) + static_cast<uint8_t>(pixels[i]));
-  }
-  if (ImageNum % 1000 != 0)
-      LOG(INFO) << ImageNum << " files processed.";
-
-  store->Flush();
-  store->Close();
-
-  LOG(INFO) << "Create image mean";
-  if (backend == "lmdb")
-    CHECK(store->Open(output_folder + "/image_mean", singa::io::kCreate));
-  else
-    CHECK(store->Open(output_folder + "/image_mean.bin", singa::io::kCreate));
-  for (int i = 0; i < kImageNBytes; i++)
-    mean.set_data(i, mean.data(i) / ImageNum);
-  mean.SerializeToString(&rec_buf);
-  store->Write("mean", rec_buf);
-  store->Flush();
-  store->Close();
-  delete store;
-
-  LOG(INFO) << "Done!";
-}
-
-int main(int argc, char** argv) {
-  if (argc < 4) {
-    std::cout << "Create data stores for ImageNet dataset.\n"
-      << "Usage: <image_list_file> <input_image_folder> <output_folder>"
-      << " <Optional: backend {lmdb, kvfile} default: kvfile>\n";
-  } else {
-    google::InitGoogleLogging(argv[0]);
-    FLAGS_alsologtostderr = 1;
-    if (argc == 4)
-      create_data(string(argv[1]), string(argv[2]), string(argv[3]));
-    else
-      create_data(string(argv[1]), string(argv[2]),
-          string(argv[3]), string(argv[4]));
-  }
-  return 0;
-}
diff --git a/examples/alexnet/job.conf b/examples/alexnet/job.conf
deleted file mode 100644
index 3b7eaf4..0000000
--- a/examples/alexnet/job.conf
+++ /dev/null
@@ -1,403 +0,0 @@
-name: "alexnet"
-train_steps: 450000
-test_steps: 500
-test_freq: 1000
-disp_freq: 20
-checkpoint_freq: 100000
-checkpoint_after: 100000
-train_one_batch {
-  alg: kBP
-}
-updater{
-  type: kSGD
-  weight_decay: 0.0005
-  momentum: 0.9
-  learning_rate {
-    type: kStep
-    base_lr: 0.01
-    step_conf {
-      gamma: 0.1
-      change_freq: 100000
-    }
-  }
-}
-neuralnet {
-  layer{
-    name: "data"
-    type: kRecordInput
-    store_conf {
-      backend: "kvfile"
-      path :"/data/dataset/imagenet/train_record.bin"
-      mean_file: "/data/dataset/imagenet/image_mean.bin"
-      batchsize: 256
-      #random_skip: 1000
-      shape: 3
-      shape: 256
-      shape: 256
-    }
-    include: kTrain
-  }
-  layer{
-    name: "data"
-    type: kRecordInput
-    store_conf {
-      backend: "kvfile"
-      path :"/data/dataset/imagenet/val_record.bin"
-      mean_file: "/data/dataset/imagenet/image_mean.bin"
-      batchsize: 100
-      shape: 3
-      shape: 256
-      shape: 256
-    }
-    include: kTest
-  }
-  layer{
-    name: "image"
-    type: kImagePreprocess
-    rgbimage_conf {
-      cropsize: 227
-      mirror: true
-    }
-#    partition_dim: 0
-    srclayers: "data"
-  }
-  layer{
-    name: "conv1"
-    type: kConvolution
-    srclayers: "image"
-    convolution_conf {
-      num_filters: 96
-      kernel: 11
-      stride: 4
-    }
-#    partition_dim: 0
-    param {
-      name: "w1"
-      init {
-        type: kGaussian
-        std: 0.01
-      }
-    }
-    param {
-      name: "b1"
-      lr_scale: 2
-      wd_scale: 0
-      init {
-        type: kConstant
-        value: 0
-      }
-    }
-  }
-  layer {
-    name: "relu1"
-    type: kReLU
-    srclayers: "conv1"
-#    partition_dim: 0
-  }
-  layer {
-    name: "pool1"
-    type: kPooling
-    pooling_conf {
-      pool: MAX
-      kernel: 3
-      stride: 2
-    }
-    srclayers: "relu1"
-#    partition_dim: 0
-  }
-  layer {
-    name: "norm1"
-    type: kLRN
-    lrn_conf {
-      local_size: 5
-      alpha: 0.0001
-      beta: 0.75
-      knorm: 1
-    }
-    srclayers: "pool1"
-#    partition_dim: 0
-  }
-
-  layer{
-    name: "conv2"
-    type: kConvolution
-    srclayers: "norm1"
-    convolution_conf {
-      num_filters: 256
-      kernel: 5
-      pad: 2
-    }
-#    partition_dim: 0
-    param {
-      name: "w2"
-      init {
-        type: kGaussian
-        std: 0.01
-      }
-    }
-    param {
-      name: "b2"
-      lr_scale: 2
-      wd_scale: 0
-      init {
-        type: kConstant
-        value: 1
-      }
-    }
-  }
-  layer {
-    name: "relu2"
-    type: kReLU
-    srclayers: "conv2"
-#    partition_dim: 0
-  }
-  layer {
-    name: "pool2"
-    type: kPooling
-    pooling_conf {
-      pool: MAX
-      kernel: 3
-      stride: 2
-    }
-    srclayers: "relu2"
-#    partition_dim: 0
-  }
-
-  layer {
-    name: "norm2"
-    type: kLRN
-    lrn_conf {
-      local_size: 5
-      alpha: 0.0001
-      beta: 0.75
-      knorm: 1
-    }
-    srclayers: "pool2"
-#    partition_dim: 0
-  }
-  layer{
-    name: "conv3"
-    type: kConvolution
-    srclayers: "norm2"
-    convolution_conf {
-      num_filters: 384
-      kernel: 3
-      pad: 1
-    }
-#    partition_dim: 0
-    param {
-      name: "w3"
-      init {
-        type: kGaussian
-        std: 0.01
-      }
-    }
-    param {
-      name: "b3"
-      lr_scale: 2
-      wd_scale: 0
-      init {
-        type: kConstant
-        value: 0
-      }
-    }
-  }
-  layer {
-    name: "relu3"
-    type: kReLU
-    srclayers: "conv3"
-#    partition_dim: 0
-  }
-  layer{
-    name: "conv4"
-    type: kConvolution
-    srclayers: "relu3"
-    convolution_conf {
-      num_filters: 384
-      kernel: 3
-      pad: 1
-    }
-#    partition_dim: 0
-    param {
-      name: "w4"
-      init {
-        type: kGaussian
-        std: 0.01
-      }
-    }
-    param {
-      name: "b4"
-      lr_scale:2
-      wd_scale:0
-      init {
-        type: kConstant
-        value: 1
-      }
-    }
-  }
-  layer {
-    name: "relu4"
-    type: kReLU
-    srclayers: "conv4"
-#    partition_dim: 0
-  }
-  layer{
-    name: "conv5"
-    type: kConvolution
-    srclayers: "relu4"
-    convolution_conf {
-      num_filters: 256
-      kernel: 3
-      pad: 1
-    }
-#    partition_dim: 0
-    param {
-      name: "w5"
-      init {
-        type: kGaussian
-        std: 0.01
-      }
-    }
-    param {
-      name: "b5"
-      lr_scale: 2
-      wd_scale: 0
-      init {
-        type: kConstant
-        value: 1
-      }
-    }
-  }
-  layer {
-    name: "relu5"
-    type: kReLU
-    srclayers: "conv5"
-#    partition_dim: 0
-  }
-  layer {
-    name: "pool5"
-    type: kPooling
-    pooling_conf {
-      pool: MAX
-      kernel: 3
-      stride: 2
-    }
-    srclayers: "relu5"
-#    partition_dim: 0
-  }
-  layer {
-    name: "ip6"
-    type: kInnerProduct
-    innerproduct_conf {
-      num_output: 4096
-    }
-    param {
-      name: "w6"
-      init {
-        type: kGaussian
-        std: 0.005
-      }
-    }
-    param {
-      name: "b6"
-      lr_scale: 2
-      wd_scale: 0
-      init {
-        type: kConstant
-        value: 1
-      }
-    }
-    srclayers: "pool5"
-#    partition_dim: 1
-  }
-  layer {
-    name: "relu6"
-    type: kReLU
-    srclayers: "ip6"
-#    partition_dim: 1
-  }
-  layer {
-    name: "drop6"
-    type: kDropout
-    srclayers: "relu6"
-#    partition_dim: 1
-  }
-  layer {
-    name: "ip7"
-    type: kInnerProduct
-    innerproduct_conf {
-      num_output: 4096
-    }
-#    partition_dim: 1
-    param {
-      name: "w7"
-      init {
-        type: kGaussian
-        std: 0.005
-      }
-    }
-    param {
-      name: "b7"
-      lr_scale: 2
-      wd_scale: 0
-      init {
-        type: kConstant
-        value: 1
-      }
-    }
-    srclayers: "drop6"
-  }
-  layer {
-    name: "relu7"
-    type: kReLU
-    srclayers: "ip7"
-#    partition_dim: 1
-  }
-  layer {
-    name: "drop7"
-    type: kDropout
-    srclayers: "relu7"
-#    partition_dim: 1
-  }
-  layer {
-    name: "ip8"
-    type: kInnerProduct
-    innerproduct_conf {
-      num_output: 1000
-    }
-#    partition_dim: 1
-    param {
-      name: "w8"
-      init {
-        type: kGaussian
-        std: 0.01
-      }
-    }
-    param {
-      name: "b8"
-      lr_scale: 2
-      wd_scale: 0
-      init {
-        type: kConstant
-        value: 0
-      }
-    }
-    srclayers: "drop7"
-  }
-  layer {
-    name: "loss"
-    type: kSoftmaxLoss
-    softmaxloss_conf {
-      topk:1
-    }
-    srclayers: "ip8"
-    srclayers: "data"
-  }
-}
-cluster {
-  nworker_groups: 1
-  nserver_groups: 1
-  nworkers_per_group: 1
-  nworkers_per_procs: 1
-  workspace: "examples/alexnet"
-}
diff --git a/examples/alexnet/rec2im_test.cc b/examples/alexnet/rec2im_test.cc
deleted file mode 100644
index bb92d95..0000000
--- a/examples/alexnet/rec2im_test.cc
+++ /dev/null
@@ -1,116 +0,0 @@
-/************************************************************
-*
-* Licensed to the Apache Software Foundation (ASF) under one
-* or more contributor license agreements.  See the NOTICE file
-* distributed with this work for additional information
-* regarding copyright ownership.  The ASF licenses this file
-* to you under the Apache License, Version 2.0 (the
-* "License"); you may not use this file except in compliance
-* with the License.  You may obtain a copy of the License at
-*
-*   http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an
-* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-* KIND, either express or implied.  See the License for the
-* specific language governing permissions and limitations
-* under the License.
-*
-*************************************************************/
-
-
-#include <glog/logging.h>
-#include <opencv2/opencv.hpp>
-
-#include <algorithm>
-#include <random>
-#include <chrono>
-#include <fstream>
-#include <string>
-#include <cstdint>
-#include <iostream>
-#include <vector>
-
-#include "singa/io/store.h"
-#include "singa/proto/common.pb.h"
-
-using std::string;
-
-const int kImageSize = 256;
-const int kImageNBytes = 256*256*3;
-
-void generate_image(const string& output_folder,
-    const string& key,
-    const string& val) {
-  float image_buf[kImageNBytes];
-  singa::RecordProto image;
-  image.ParseFromString(val);
-  cv::Mat img = cv::Mat::zeros(kImageSize, kImageSize, CV_8UC3);
-  string pixel = image.pixel();
-  int label = image.label();
-  string image_name = output_folder+"/"+key+"_"+std::to_string(label)+".jpg";
-  std::cout << "Writing to " << image_name << "...\n";
-  for (int h = 0; h < kImageSize; ++h) {
-    uchar* ptr = img.ptr<uchar>(h);
-    int img_index = 0;
-    for (int w = 0; w < kImageSize; ++w) {
-      for (int c = 0; c < 3; ++c)
-        ptr[img_index++] =
-          static_cast<uchar>(
-              static_cast<uint8_t>(
-                pixel[(c * kImageSize + h) * kImageSize + w]));
-    }
-  }
-
-  cv::imwrite(image_name, img);
-}
-
-void visualize(const string& input_file,
-    const string& output_folder,
-    const string& id_list) {
-  auto store = singa::io::OpenStore("kvfile", input_file,
-      singa::io::kRead);
-
-  std::vector<int> image_id_list;
-
-  std::ifstream id_list_file(id_list.c_str(), std::ios::in);
-  CHECK(id_list_file.is_open()) << "Unable to open image id list";
-  string id_;
-  while (id_list_file >> id_) {
-    int x;
-    x = std::stoi(id_);
-    image_id_list.push_back(x);
-  }
-  std::sort(image_id_list.begin(), image_id_list.end());
-
-  string key, val;
-  for (int i = 0; i < image_id_list[0]; ++i)
-    if (!store->Read(&key, &val)) {
-      store->SeekToFirst();
-      CHECK(store->Read(&key, &val));
-    }
-  generate_image(output_folder, key, val);
-
-  for (size_t i = 1; i != image_id_list.size(); ++i) {
-    for (int j = 0; j < image_id_list[i]-image_id_list[i-1]; ++j)
-      if (!store->Read(&key, &val)) {
-        store->SeekToFirst();
-        CHECK(store->Read(&key, &val));
-      }
-    generate_image(output_folder, key, val);
-  }
-}
-
-int main(int argc, char** argv) {
-  if (argc != 4) {
-    std::cout << "Visualize images from binary kvfile records.\n"
-      << "Usage: <input_file> <output_folder> <id_list>\n";
-  } else {
-    google::InitGoogleLogging(argv[0]);
-    FLAGS_alsologtostderr = 1;
-    visualize(string(argv[1]), string(argv[2]), string(argv[3]));
-  }
-
-  return 0;
-}
diff --git a/examples/char-rnn/README.md b/examples/char-rnn/README.md
new file mode 100644
index 0000000..dcaf652
--- /dev/null
+++ b/examples/char-rnn/README.md
@@ -0,0 +1,33 @@
+# Train Char-RNN over plain text
+
+Recurrent neural networks (RNN) are widely used for modelling sequential data,
+e.g., natural language sentences. This example describes how to implement a RNN
+application (or model) using SINGA's RNN layers.
+We will use the [char-rnn](https://github.com/karpathy/char-rnn) model as an
+example, which trains over sentences or
+source code, with each character as an input unit. Particularly, we will train
+a RNN using GRU over Linux kernel source code. After training, we expect to
+generate meaningful code from the model.
+
+
+## Instructions
+
+* Compile and install SINGA. Currently the RNN implementation depends on Cudnn with version >= 5.05.
+
+* Prepare the dataset. Download the [kernel source code](http://cs.stanford.edu/people/karpathy/char-rnn/).
+Other plain text files can also be used.
+
+* Start the training,
+
+        python train.py linux_input.txt
+
+  Some hyper-parameters could be set through command line,
+
+        python train.py -h
+
+* Sample characters from the model by providing the number of characters to sample and the seed string.
+
+        python sample.py 'model.bin' 100 --seed '#include <std'
+
+  Please replace 'model.bin' with the path to one of the checkpoint paths.
+
diff --git a/examples/char-rnn/data.py b/examples/char-rnn/data.py
deleted file mode 100644
index 20e7262..0000000
--- a/examples/char-rnn/data.py
+++ /dev/null
@@ -1,32 +0,0 @@
-#!/usr/bin/env python
-
-#/************************************************************
-#*
-#* Licensed to the Apache Software Foundation (ASF) under one
-#* or more contributor license agreements.  See the NOTICE file
-#* distributed with this work for additional information
-#* regarding copyright ownership.  The ASF licenses this file
-#* to you under the Apache License, Version 2.0 (the
-#* "License"); you may not use this file except in compliance
-#* with the License.  You may obtain a copy of the License at
-#*
-#*   http://www.apache.org/licenses/LICENSE-2.0
-#*
-#* Unless required by applicable law or agreed to in writing,
-#* software distributed under the License is distributed on an
-#* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-#* KIND, either express or implied.  See the License for the
-#* specific language governing permissions and limitations
-#* under the License.
-#*
-#*************************************************************/
-
-
-# pls get linux_input.txt from http://cs.stanford.edu/people/karpathy/char-rnn/
-data = open('linux_input.txt', 'r').read() # should be simple plain text file
-chars = list(set(data))
-data_size, vocab_size = len(data), len(chars)
-print 'data has %d characters, %d unique.' % (data_size, vocab_size)
-with open('vocab.txt', 'w') as fd:
-  fd.write("".join(chars))
-  fd.flush()
diff --git a/examples/char-rnn/job.conf b/examples/char-rnn/job.conf
deleted file mode 100644
index bd648f1..0000000
--- a/examples/char-rnn/job.conf
+++ /dev/null
@@ -1,253 +0,0 @@
-name:"char-rnn"
-train_steps: 100000
-disp_freq: 100
-#debug: true
-gpu: 0
-train_one_batch {
-  alg: kBPTT
-}
-
-updater {
-  type: kRMSProp
-  rmsprop_conf {
-    rho: 0.95
-  }
-  learning_rate {
-    type: kStep
-    base_lr: 0.002
-    step_conf {
-      gamma: 0.97
-      change_freq: 2000
-    }
-  }
-  clip_low: -5
-  clip_high: 5
-}
-
-neuralnet {
-  unroll_len: 50
-  layer {
-    name: "data"
-    type: kCharRNN
-    unroll_len: 1
-    char_rnn_conf {
-      path: "examples/char-rnn/linux_input.txt"
-      vocab_path:"examples/char-rnn/vocab.txt"
-      batchsize: 50
-      unroll_len: 50
-    }
-  }
-  layer {
-    name: "onehot"
-    type: kOneHot
-    srclayers: "data"
-    unroll_conn_type: kUnrollOneToAll
-    onehot_conf {
-      vocab_size: 101
-    }
-  }
-
-  layer {
-    name: "label"
-    type: kRNNLabel
-    srclayers: "data"
-    unroll_conn_type: kUnrollOneToAll
-  }
-
-  layer {
-    name: "gru1"
-    type: kGRU
-    srclayers: "onehot"
-    gru_conf {
-      dim_hidden: 512
-    }
-    param {
-      name: "z_hx"
-      init {
-        type: kUniform
-        low: -0.08
-        high: 0.08
-      }
-    }
-    param {
-      name: "r_hx"
-      init {
-        type: kUniform
-        low: -0.08
-        high: 0.08
-      }
-    }
-    param {
-      name: "c_hx"
-      init {
-        type: kUniform
-        low: -0.08
-        high: 0.08
-      }
-    }
-    param {
-      name: "z_hh"
-      init {
-        type: kUniform
-        low: -0.08
-        high: 0.08
-      }
-    }
-    param {
-      name: "r_hh"
-      init {
-        type: kUniform
-        low: -0.08
-        high: 0.08
-      }
-    }
-    param {
-      name: "c_hh"
-      init {
-        type: kUniform
-        low: -0.08
-        high: 0.08
-      }
-    }
-    param {
-      name: "z_b"
-      init {
-        type: kUniform
-        low: -0.08
-        high: 0.08
-      }
-    }
-    param {
-      name: "r_b"
-      init {
-        type: kUniform
-        low: -0.08
-        high: 0.08
-      }
-    }
-    param {
-      name: "c_b"
-      init {
-        type: kUniform
-        low: -0.08
-        high: 0.08
-      }
-    }
-
-  }
-#  layer {
-#    name: "gru2"
-#    type: kGRU
-#    srclayers: "gru1"
-#    gru_conf {
-#      dim_hidden: 512
-#    }
-#    param {
-#      name: "z_hx2"
-#      init {
-#        type: kUniform
-#        low: -0.08
-#        high: 0.08
-#      }
-#    }
-#    param {
-#      name: "r_hx2"
-#      init {
-#        type: kUniform
-#        low: -0.08
-#        high: 0.08
-#      }
-#    }
-#    param {
-#      name: "c_hx2"
-#      init {
-#        type: kUniform
-#        low: -0.08
-#        high: 0.08
-#      }
-#    }
-#    param {
-#      name: "z_hh2"
-#      init {
-#        type: kUniform
-#        low: -0.08
-#        high: 0.08
-#      }
-#    }
-#    param {
-#      name: "r_hh2"
-#      init {
-#        type: kUniform
-#        low: -0.08
-#        high: 0.08
-#      }
-#    }
-#    param {
-#      name: "c_hh2"
-#      init {
-#        type: kUniform
-#        low: -0.08
-#        high: 0.08
-#      }
-#    }
-#    param {
-#      name: "z_b2"
-#      init {
-#        type: kUniform
-#        low: -0.08
-#        high: 0.08
-#      }
-#    }
-#    param {
-#      name: "r_b2"
-#      init {
-#        type: kUniform
-#        low: -0.08
-#        high: 0.08
-#      }
-#    }
-#    param {
-#      name: "c_b2"
-#      init {
-#        type: kUniform
-#        low: -0.08
-#        high: 0.08
-#      }
-#    }
-#  }
-#
-  layer {
-    name: "ip1"
-    type: kInnerProduct
-    srclayers: "gru1"
-    innerproduct_conf {
-      num_output: 101
-    }
-    param {
-      name: "w"
-      init {
-        type: kUniform
-        low: -0.08
-        high: 0.08
-      }
-    }
-    param {
-      name: "b"
-      init {
-        type: kUniform
-        low: -0.08
-        high: 0.08
-      }
-    }
-  }
-  layer {
-    name: "loss"
-    type: kSoftmaxLoss
-    srclayers: "ip1"
-    srclayers: "label"
-  }
-}
-
-cluster {
-  workspace: "examples/char-rnn/"
-}
diff --git a/examples/char-rnn/sample.conf b/examples/char-rnn/sample.conf
deleted file mode 100644
index b15ef9e..0000000
--- a/examples/char-rnn/sample.conf
+++ /dev/null
@@ -1,212 +0,0 @@
-name:"char-rnn"
-test_steps: 100
-#debug: true
-gpu: 0
-checkpoint_path: "examples/char-rnn/checkpoint/step2000-worker0"
-train_one_batch {
-  alg: kBPTT
-}
-
-neuralnet {
-  layer {
-    name: "data"
-    type: kRNNDummy
-    rnn_dummy_conf {
-      shape: 1
-      integer: true
-      low: 0
-      high: 101
-      dynamic_srclayer: "argsort"
-    }
-  }
-  layer {
-    name: "onehot"
-    type: kOneHot
-    srclayers: "data"
-  }
-
-  layer {
-    name: "gru1"
-    type: kGRU
-    srclayers: "onehot"
-    gru_conf {
-      dim_hidden: 512
-    }
-    param {
-      name: "z_hx"
-      init {
-        type: kUniform
-        low: -0.08
-        high: 0.08
-      }
-    }
-    param {
-      name: "r_hx"
-      init {
-        type: kUniform
-        low: -0.08
-        high: 0.08
-      }
-    }
-    param {
-      name: "c_hx"
-      init {
-        type: kUniform
-        low: -0.08
-        high: 0.08
-      }
-    }
-    param {
-      name: "z_hh"
-      init {
-        type: kUniform
-        low: -0.08
-        high: 0.08
-      }
-    }
-    param {
-      name: "r_hh"
-      init {
-        type: kUniform
-        low: -0.08
-        high: 0.08
-      }
-    }
-    param {
-      name: "c_hh"
-      init {
-        type: kUniform
-        low: -0.08
-        high: 0.08
-      }
-    }
-    param {
-      name: "z_b"
-      init {
-        type: kUniform
-        low: -0.08
-        high: 0.08
-      }
-    }
-    param {
-      name: "r_b"
-      init {
-        type: kUniform
-        low: -0.08
-        high: 0.08
-      }
-    }
-    param {
-      name: "c_b"
-      init {
-        type: kUniform
-        low: -0.08
-        high: 0.08
-      }
-    }
-  }
-  layer {
-    name: "gru2"
-    type: kGRU
-    srclayers: "gru1"
-    gru_conf {
-      dim_hidden: 512
-    }
-    param {
-      name: "z_hx2"
-      init {
-        type: kUniform
-        low: -0.08
-        high: 0.08
-      }
-    }
-    param {
-      name: "r_hx2"
-      init {
-        type: kUniform
-        low: -0.08
-        high: 0.08
-      }
-    }
-    param {
-      name: "c_hx2"
-      init {
-        type: kUniform
-        low: -0.08
-        high: 0.08
-      }
-    }
-    param {
-      name: "z_hh2"
-      init {
-        type: kUniform
-        low: -0.08
-        high: 0.08
-      }
-    }
-    param {
-      name: "r_hh2"
-      init {
-        type: kUniform
-        low: -0.08
-        high: 0.08
-      }
-    }
-    param {
-      name: "c_hh2"
-      init {
-        type: kUniform
-        low: -0.08
-        high: 0.08
-      }
-    }
-  }
-
-
-  layer {
-    name: "ip1"
-    type: kInnerProduct
-    srclayers: "gru2"
-    innerproduct_conf {
-      num_output: 101
-    }
-    param {
-      name: "w"
-      init {
-        type: kUniform
-        low: -0.08
-        high: 0.08
-      }
-    }
-    param {
-      name: "b"
-      init {
-        type: kUniform
-        low: -0.08
-        high: 0.08
-      }
-    }
-  }
-  layer {
-    name: "softmax"
-    type: kSoftmax
-    srclayers: "ip1"
-  }
-  layer {
-    name: "argsort"
-    type: kArgSort
-    srclayers: "softmax"
-  }
-  layer {
-    name: "sampling"
-    type: kCharRNNOutput
-    srclayers: "argsort"
-    char_rnn_conf {
-      vocab_path: "examples/char-rnn/vocab.txt"
-    }
-  }
-}
-
-cluster {
-  workspace: "examples/char-rnn/"
-}
diff --git a/examples/char-rnn/sample.py b/examples/char-rnn/sample.py
new file mode 100644
index 0000000..bbfb28f
--- /dev/null
+++ b/examples/char-rnn/sample.py
@@ -0,0 +1,102 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# =============================================================================
+'''Sample characters from the pre-trained model'''
+import sys
+import cPickle as pickle
+import numpy as np
+import argparse
+
+# sys.path.append(os.path.join(os.path.dirname(__file__), '../../build/python'))
+from singa import layer
+from singa import tensor
+from singa import device
+from singa.proto import model_pb2
+
+
+def sample(model_path, nsamples=100, seed_text='', do_sample=True):
+    with open(model_path, 'rb') as fd:
+        d = pickle.load(fd)
+        rnn_w = tensor.from_numpy(d['rnn_w'])
+        idx_to_char = d['idx_to_char']
+        char_to_idx = d['char_to_idx']
+        vocab_size = len(idx_to_char)
+        dense_w = tensor.from_numpy(d['dense_w'])
+        dense_b = tensor.from_numpy(d['dense_b'])
+        hidden_size = d['hidden_size']
+        num_stacks = d['num_stacks']
+        dropout = d['dropout']
+
+    cuda = device.create_cuda_gpu()
+    rnn = layer.LSTM(name='lstm', hidden_size=hidden_size,
+                     num_stacks=num_stacks, dropout=dropout,
+                     input_sample_shape=(len(idx_to_char),))
+    rnn.to_device(cuda)
+    rnn.param_values()[0].copy_data(rnn_w)
+    dense = layer.Dense('dense', vocab_size, input_sample_shape=(hidden_size,))
+    dense.to_device(cuda)
+    dense.param_values()[0].copy_data(dense_w)
+    dense.param_values()[1].copy_data(dense_b)
+    hx = tensor.Tensor((num_stacks, 1, hidden_size), cuda)
+    cx = tensor.Tensor((num_stacks, 1, hidden_size), cuda)
+    hx.set_value(0.0)
+    cx.set_value(0.0)
+    if len(seed_text) > 0:
+        for c in seed_text:
+            x = np.zeros((1, vocab_size), dtype=np.float32)
+            x[0, char_to_idx[c]] = 1
+            tx = tensor.from_numpy(x)
+            tx.to_device(cuda)
+            inputs = [tx, hx, cx]
+            outputs = rnn.forward(model_pb2.kEval, inputs)
+            y = dense.forward(model_pb2.kEval, outputs[0])
+            y = tensor.softmax(y)
+            hx = outputs[1]
+            cx = outputs[2]
+        sys.stdout.write(seed_text)
+    else:
+        y = tensor.Tensor((1, vocab_size), cuda)
+        y.set_value(1.0 / vocab_size)
+
+    for i in range(nsamples):
+        y.to_host()
+        prob = tensor.to_numpy(y)[0]
+        if do_sample:
+            cur = np.random.choice(vocab_size, 1, p=prob)[0]
+        else:
+            cur = np.argmax(prob)
+        sys.stdout.write(idx_to_char[cur])
+        x = np.zeros((1, vocab_size), dtype=np.float32)
+        x[0, cur] = 1
+        tx = tensor.from_numpy(x)
+        tx.to_device(cuda)
+        inputs = [tx, hx, cx]
+        outputs = rnn.forward(model_pb2.kEval, inputs)
+        y = dense.forward(model_pb2.kEval, outputs[0])
+        y = tensor.softmax(y)
+        hx = outputs[1]
+        cx = outputs[2]
+    print ''
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser(description='sample chars from char-rnn')
+    parser.add_argument('model', type=int, help='the model checkpoint file')
+    parser.add_argument('n', type=int, help='num of characters to sample')
+    parser.add_argument('--seed', help='seed text string which warms up the '
+                        ' rnn states for sampling', default='')
+    args = parser.parse_args()
+    assert args.n > 0, 'n must > 0'
+    sample(args.model, args.n, seed_text=args.seed)
diff --git a/examples/char-rnn/train.py b/examples/char-rnn/train.py
new file mode 100644
index 0000000..d28646e
--- /dev/null
+++ b/examples/char-rnn/train.py
@@ -0,0 +1,229 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# =============================================================================
+'''Train a Char-RNN model using plain text files.
+The model is created following https://github.com/karpathy/char-rnn
+The train file could be any text file,
+e.g., http://cs.stanford.edu/people/karpathy/char-rnn/
+'''
+import cPickle as pickle
+import numpy as np
+import argparse
+
+# sys.path.append(os.path.join(os.path.dirname(__file__), '../../build/python'))
+from singa import layer
+from singa import loss
+from singa import device
+from singa import tensor
+from singa import optimizer
+from singa import initializer
+from singa.proto import model_pb2
+from singa import utils
+
+
+class Data(object):
+
+    def __init__(self, fpath, batch_size=32, seq_length=100, train_ratio=0.8):
+        '''Data object for loading a plain text file.
+
+        Args:
+            fpath, path to the text file.
+            train_ratio, split the text file into train and test sets, where
+                train_ratio of the characters are in the train set.
+        '''
+        self.raw_data = open(fpath, 'r').read()  # read text file
+        chars = list(set(self.raw_data))
+        self.vocab_size = len(chars)
+        self.char_to_idx = {ch: i for i, ch in enumerate(chars)}
+        self.idx_to_char = {i: ch for i, ch in enumerate(chars)}
+        data = [self.char_to_idx[c] for c in self.raw_data]
+        # seq_length + 1 for the data + label
+        nsamples = len(data) / (1 + seq_length)
+        data = data[0:nsamples * (1 + seq_length)]
+        data = np.asarray(data, dtype=np.int32)
+        data = np.reshape(data, (-1, seq_length + 1))
+        # shuffle all sequences
+        np.random.shuffle(data)
+        self.train_dat = data[0:int(data.shape[0]*train_ratio)]
+        self.num_train_batch = self.train_dat.shape[0] / batch_size
+        self.val_dat = data[self.train_dat.shape[0]:]
+        self.num_test_batch = self.val_dat.shape[0] / batch_size
+        print 'train dat', self.train_dat.shape
+        print 'val dat', self.val_dat.shape
+
+
+def numpy2tensors(npx, npy, dev):
+    '''batch, seq, dim -- > seq, batch, dim'''
+    tmpx = np.swapaxes(npx, 0, 1)
+    tmpy = np.swapaxes(npy, 0, 1)
+    inputs = []
+    labels = []
+    for t in range(tmpx.shape[0]):
+        x = tensor.from_numpy(tmpx[t])
+        y = tensor.from_numpy(tmpy[t])
+        x.to_device(dev)
+        y.to_device(dev)
+        inputs.append(x)
+        labels.append(y)
+    return inputs, labels
+
+
+def convert(batch, batch_size, seq_length, vocab_size, dev):
+    '''convert a batch of data into a sequence of input tensors'''
+    y = batch[:, 1:]
+    x1 = batch[:, :seq_length]
+    x = np.zeros((batch_size, seq_length, vocab_size), dtype=np.float32)
+    for b in range(batch_size):
+        for t in range(seq_length):
+            c = x1[b, t]
+            x[b, t, c] = 1
+    return numpy2tensors(x, y, dev)
+
+
+def get_lr(epoch):
+    return 0.001 / float(1 << (epoch / 50))
+
+
+def train(data, max_epoch, hidden_size=100, seq_length=100, batch_size=16,
+          num_stacks=1, dropout=0.5, model_path='model'):
+    # SGD with L2 gradient normalization
+    opt = optimizer.RMSProp(constraint=optimizer.L2Constraint(5))
+    cuda = device.create_cuda_gpu()
+    rnn = layer.LSTM(
+        name='lstm',
+        hidden_size=hidden_size,
+        num_stacks=num_stacks,
+        dropout=dropout,
+        input_sample_shape=(
+            data.vocab_size,
+        ))
+    rnn.to_device(cuda)
+    print 'created rnn'
+    rnn_w = rnn.param_values()[0]
+    rnn_w.uniform(-0.08, 0.08)  # init all rnn parameters
+    print 'rnn weight l1 = %f' % (rnn_w.l1())
+    dense = layer.Dense(
+        'dense',
+        data.vocab_size,
+        input_sample_shape=(
+            hidden_size,
+        ))
+    dense.to_device(cuda)
+    dense_w = dense.param_values()[0]
+    dense_b = dense.param_values()[1]
+    print 'dense w ', dense_w.shape
+    print 'dense b ', dense_b.shape
+    initializer.uniform(dense_w, dense_w.shape[0], 0)
+    print 'dense weight l1 = %f' % (dense_w.l1())
+    dense_b.set_value(0)
+    print 'dense b l1 = %f' % (dense_b.l1())
+
+    g_dense_w = tensor.Tensor(dense_w.shape, cuda)
+    g_dense_b = tensor.Tensor(dense_b.shape, cuda)
+
+    lossfun = loss.SoftmaxCrossEntropy()
+    for epoch in range(max_epoch):
+        train_loss = 0
+        for b in range(data.num_train_batch):
+            batch = data.train_dat[b * batch_size: (b + 1) * batch_size]
+            inputs, labels = convert(batch, batch_size, seq_length,
+                                     data.vocab_size, cuda)
+            inputs.append(tensor.Tensor())
+            inputs.append(tensor.Tensor())
+
+            outputs = rnn.forward(model_pb2.kTrain, inputs)[0:-2]
+            grads = []
+            batch_loss = 0
+            g_dense_w.set_value(0.0)
+            g_dense_b.set_value(0.0)
+            for output, label in zip(outputs, labels):
+                act = dense.forward(model_pb2.kTrain, output)
+                lvalue = lossfun.forward(model_pb2.kTrain, act, label)
+                batch_loss += lvalue.l1()
+                grad = lossfun.backward()
+                grad /= batch_size
+                grad, gwb = dense.backward(model_pb2.kTrain, grad)
+                grads.append(grad)
+                g_dense_w += gwb[0]
+                g_dense_b += gwb[1]
+                # print output.l1(), act.l1()
+            utils.update_progress(
+                b * 1.0 / data.num_train_batch, 'training loss = %f' %
+                (batch_loss / seq_length))
+            train_loss += batch_loss
+
+            grads.append(tensor.Tensor())
+            grads.append(tensor.Tensor())
+            g_rnn_w = rnn.backward(model_pb2.kTrain, grads)[1][0]
+            dense_w, dense_b = dense.param_values()
+            opt.apply_with_lr(epoch, get_lr(epoch), g_rnn_w, rnn_w, 'rnnw')
+            opt.apply_with_lr(
+                epoch, get_lr(epoch),
+                g_dense_w, dense_w, 'dense_w')
+            opt.apply_with_lr(
+                epoch, get_lr(epoch),
+                g_dense_b, dense_b, 'dense_b')
+        print '\nEpoch %d, train loss is %f' % \
+            (epoch, train_loss / data.num_train_batch / seq_length)
+
+        eval_loss = 0
+        for b in range(data.num_test_batch):
+            batch = data.val_dat[b * batch_size: (b + 1) * batch_size]
+            inputs, labels = convert(batch, batch_size, seq_length,
+                                     data.vocab_size, cuda)
+            inputs.append(tensor.Tensor())
+            inputs.append(tensor.Tensor())
+            outputs = rnn.forward(model_pb2.kEval, inputs)[0:-2]
+            for output, label in zip(outputs, labels):
+                output = dense.forward(model_pb2.kEval, output)
+                eval_loss += lossfun.forward(model_pb2.kEval,
+                                             output, label).l1()
+        print 'Epoch %d, evaluation loss is %f' % \
+            (epoch, eval_loss / data.num_test_batch / seq_length)
+
+        if (epoch + 1) % 30 == 0:
+            # checkpoint the file model
+            with open('%s_%d.bin' % (model_path, epoch), 'wb') as fd:
+                print 'saving model to %s' % model_path
+                d = {}
+                for name, w in zip(
+                        ['rnn_w', 'dense_w', 'dense_b'],
+                        [rnn_w, dense_w, dense_b]):
+                    w.to_host()
+                    d[name] = tensor.to_numpy(w)
+                    w.to_device(cuda)
+                d['idx_to_char'] = data.idx_to_char
+                d['char_to_idx'] = data.char_to_idx
+                d['hidden_size'] = hidden_size
+                d['num_stacks'] = num_stacks
+                d['dropout'] = dropout
+
+                pickle.dump(d, fd)
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser(
+        description='Train multi-stack LSTM for '
+        'modeling  character sequence from plain text files')
+    parser.add_argument('data', type=str, help='training file')
+    parser.add_argument('-b', type=int, default=32, help='batch_size')
+    parser.add_argument('-l', type=int, default=64, help='sequence length')
+    parser.add_argument('-d', type=int, default=128, help='hidden size')
+    parser.add_argument('-s', type=int, default=2, help='num of stacks')
+    parser.add_argument('-m', type=int, default=50, help='max num of epoch')
+    args = parser.parse_args()
+    data = Data(args.data, batch_size=args.b, seq_length=args.l)
+    train(data, args.m,  hidden_size=args.d, num_stacks=args.s,
+          seq_length=args.l, batch_size=args.b)
diff --git a/examples/cifar10/CMakeLists.txt b/examples/cifar10/CMakeLists.txt
new file mode 100644
index 0000000..313c0eb
--- /dev/null
+++ b/examples/cifar10/CMakeLists.txt
@@ -0,0 +1,36 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+# 
+#     http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# 
+
+INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR})
+INCLUDE_DIRECTORIES(${CMAKE_BINARY_DIR}/include)
+
+IF(USE_CUDNN)
+ADD_EXECUTABLE(alexnet alexnet.cc)
+ADD_DEPENDENCIES(alexnet singa_core singa_model singa_utils)
+TARGET_LINK_LIBRARIES(alexnet singa_core singa_utils singa_model protobuf ${SINGA_LIBKER_LIBS})
+
+ADD_EXECUTABLE(alexnet-parallel alexnet-parallel.cc)
+ADD_DEPENDENCIES(alexnet-parallel singa_core singa_model singa_utils)
+TARGET_LINK_LIBRARIES(alexnet-parallel singa_core singa_utils singa_model protobuf ${SINGA_LIBKER_LIBS})
+SET_TARGET_PROPERTIES(alexnet-parallel PROPERTIES LINK_FLAGS "${LINK_FLAGS} -pthread")
+
+ADD_EXECUTABLE(vgg-parallel vgg-parallel.cc)
+ADD_DEPENDENCIES(vgg-parallel singa_core singa_model singa_utils)
+TARGET_LINK_LIBRARIES(vgg-parallel singa_core singa_utils singa_model protobuf ${SINGA_LIBKER_LIBS})
+SET_TARGET_PROPERTIES(vgg-parallel PROPERTIES LINK_FLAGS "${LINK_FLAGS} -pthread")
+ENDIF(USE_CUDNN)
diff --git a/examples/cifar10/Makefile.example b/examples/cifar10/Makefile.example
deleted file mode 100644
index 40e85b1..0000000
--- a/examples/cifar10/Makefile.example
+++ /dev/null
@@ -1,41 +0,0 @@
-#/**
-# * Licensed to the Apache Software Foundation (ASF) under one
-# * or more contributor license agreements.  See the NOTICE file
-# * distributed with this work for additional information
-# * regarding copyright ownership.  The ASF licenses this file
-# * to you under the Apache License, Version 2.0 (the
-# * "License"); you may not use this file except in compliance
-# * with the License.  You may obtain a copy of the License at
-# *
-# *     http://www.apache.org/licenses/LICENSE-2.0
-# *
-# * Unless required by applicable law or agreed to in writing, software
-# * distributed under the License is distributed on an "AS IS" BASIS,
-# * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# * See the License for the specific language governing permissions and
-# * limitations under the License.
-# */
-
-libs :=singa glog protobuf
-
-.PHONY: all download create
-
-HDFS_DIR := hdfs://node0:9000/examples/cifar10
-
-download: cifar-10-binary-bin
-
-cifar-10-binary-bin:
-	wget http://www.cs.toronto.edu/~kriz/cifar-10-binary.tar.gz
-	tar xf cifar-10-binary.tar.gz
-
-create:
-	$(CXX) create_data.cc -std=c++11 -lsinga -lprotobuf -lglog \
-		-I../../include -L../../.libs/ -Wl,-unresolved-symbols=ignore-in-shared-libs \
-		-Wl,-rpath=../../.libs/  -o create_data.bin
-	./create_data.bin cifar-10-batches-bin .
-
-create_hdfs:
-	$(CXX) create_data.cc -std=c++11 -lsinga -lprotobuf -lglog -lhdfs3 \
-		-I../../include -L../../.libs/ -Wl,-unresolved-symbols=ignore-in-shared-libs \
-		-Wl,-rpath=../../.libs/  -o create_data.bin
-	./create_data.bin cifar-10-batches-bin $(HDFS_DIR)
diff --git a/examples/cifar10/README.md b/examples/cifar10/README.md
new file mode 100644
index 0000000..bd5ed7d
--- /dev/null
+++ b/examples/cifar10/README.md
@@ -0,0 +1,77 @@
+# Train CNN over Cifar-10
+
+
+Convolution neural network (CNN) is a type of feed-forward artificial neural
+network widely used for image and video classification. In this example, we
+will train three deep CNN models to do image classification for the CIFAR-10 dataset,
+
+1. [AlexNet](https://code.google.com/p/cuda-convnet/source/browse/trunk/example-layers/layers-18pct.cfg)
+the best validation accuracy (without data augmentation) we achieved was about 82%.
+
+2. [VGGNet](http://torch.ch/blog/2015/07/30/cifar.html), the best validation accuracy (without data augmentation) we achieved was about 89%.
+3. [ResNet](https://github.com/facebook/fb.resnet.torch), the best validation accuracy (without data augmentation) we achieved was about 83%.
+
+
+## Instructions
+
+
+### SINGA installation
+
+Users can compile and install SINGA from source or install the Python version.
+The code can ran on both CPU and GPU. For GPU training, CUDA and CUDNN (V4 or V5)
+are required. Please refer to the installation page for detailed instructions.
+
+### Data preparation
+
+The binary Cifar-10 dataset could be downloaded by
+
+    python download_data.py bin
+
+The Python version could be downloaded by
+
+    python download_data.py py
+
+### Training
+
+There are four training programs
+
+1. train.py. The following command would train the VGG model using the python
+version of the Cifar-10 dataset in 'cifar-10-batches-py' folder.
+
+        python train.py vgg cifar-10-batches-py
+
+    To train other models, please replace 'vgg' to 'alexnet' or 'resnet'. By default
+    the training would run on a CudaGPU device, to run it on CppCPU, add an additional
+    argument
+
+        python train.py vgg cifar-10-batches-py  --use_cpu
+
+2. alexnet.cc. It trains the AlexNet model using the CPP APIs on a CudaGPU,
+
+        ./run.sh
+
+3. alexnet-parallel.cc. It trains the AlexNet model using the CPP APIs on two CudaGPU devices.
+The two devices run synchronously to compute the gradients of the mode parameters, which are
+averaged on the host CPU device and then be applied to update the parameters.
+
+        ./run-parallel.sh
+
+4. vgg-parallel.cc. It train the VGG model using the CPP APIs on two CudaGPU devices similar to alexnet-parallel.cc.
+
+### Prediction
+
+predict.py includes the prediction function
+
+        def predict(net, images, dev, topk=5)
+
+The net is created by loading the previously trained model; Images consist of
+a numpy array of images (one row per image); dev is the training device, e.g.,
+a CudaGPU device or the host CppCPU device; topk labels of each image would be
+returned.
+
+
+
+
+
+
+
diff --git a/examples/cifar10/alexnet-parallel.cc b/examples/cifar10/alexnet-parallel.cc
new file mode 100644
index 0000000..8cc3352
--- /dev/null
+++ b/examples/cifar10/alexnet-parallel.cc
@@ -0,0 +1,265 @@
+/************************************************************
+*
+* Licensed to the Apache Software Foundation (ASF) under one
+* or more contributor license agreements.  See the NOTICE file
+* distributed with this work for additional information
+* regarding copyright ownership.  The ASF licenses this file
+* to you under the Apache License, Version 2.0 (the
+* "License"); you may not use this file except in compliance
+* with the License.  You may obtain a copy of the License at
+*
+*   http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing,
+* software distributed under the License is distributed on an
+* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+* KIND, either express or implied.  See the License for the
+* specific language governing permissions and limitations
+* under the License.
+*
+*************************************************************/
+
+#include "cifar10.h"
+#include "singa/model/feed_forward_net.h"
+#include "singa/model/optimizer.h"
+#include "singa/model/updater.h"
+#include "singa/model/initializer.h"
+#include "singa/model/metric.h"
+#include "singa/utils/channel.h"
+#include "singa/utils/string.h"
+#include "singa/core/memory.h"
+#include <thread>
+#include <memory>
+
+namespace singa {
+const std::string engine = "cudnn";
+
+LayerConf GenConvConf(string name, int nb_filter, int kernel, int stride,
+                      int pad, float std) {
+  LayerConf conf;
+  conf.set_name(name);
+  conf.set_type(engine + "_convolution");
+  ConvolutionConf *conv = conf.mutable_convolution_conf();
+  conv->set_num_output(nb_filter);
+  conv->add_kernel_size(kernel);
+  conv->add_stride(stride);
+  conv->add_pad(pad);
+  conv->set_bias_term(true);
+
+  ParamSpec *wspec = conf.add_param();
+  wspec->set_name(name + "_weight");
+  auto wfill = wspec->mutable_filler();
+  wfill->set_type("Gaussian");
+  wfill->set_std(std);
+
+  ParamSpec *bspec = conf.add_param();
+  bspec->set_name(name + "_bias");
+  bspec->set_lr_mult(2);
+  //  bspec->set_decay_mult(0);
+  return conf;
+}
+
+LayerConf GenPoolingConf(string name, bool max_pool, int kernel, int stride,
+                         int pad) {
+  LayerConf conf;
+  conf.set_name(name);
+  conf.set_type(engine + "_pooling");
+  PoolingConf *pool = conf.mutable_pooling_conf();
+  pool->set_kernel_size(kernel);
+  pool->set_stride(stride);
+  pool->set_pad(pad);
+  if (!max_pool) pool->set_pool(PoolingConf_PoolMethod_AVE);
+  return conf;
+}
+
+LayerConf GenReLUConf(string name) {
+  LayerConf conf;
+  conf.set_name(name);
+  conf.set_type(engine + "_relu");
+  return conf;
+}
+
+LayerConf GenDenseConf(string name, int num_output, float std, float wd) {
+  LayerConf conf;
+  conf.set_name(name);
+  conf.set_type("singa_dense");
+  DenseConf *dense = conf.mutable_dense_conf();
+  dense->set_num_output(num_output);
+
+  ParamSpec *wspec = conf.add_param();
+  wspec->set_name(name + "_weight");
+  wspec->set_decay_mult(wd);
+  auto wfill = wspec->mutable_filler();
+  wfill->set_type("Gaussian");
+  wfill->set_std(std);
+
+  ParamSpec *bspec = conf.add_param();
+  bspec->set_name(name + "_bias");
+  bspec->set_lr_mult(2);
+  bspec->set_decay_mult(0);
+
+  return conf;
+}
+
+LayerConf GenLRNConf(string name) {
+  LayerConf conf;
+  conf.set_name(name);
+  conf.set_type(engine + "_lrn");
+  LRNConf *lrn = conf.mutable_lrn_conf();
+  lrn->set_local_size(3);
+  lrn->set_alpha(5e-05);
+  lrn->set_beta(0.75);
+  return conf;
+}
+
+LayerConf GenFlattenConf(string name) {
+  LayerConf conf;
+  conf.set_name(name);
+  conf.set_type("singa_flatten");
+  return conf;
+}
+
+FeedForwardNet CreateNet() {
+  FeedForwardNet net;
+  Shape s{3, 32, 32};
+
+  net.Add(GenConvConf("conv1", 32, 5, 1, 2, 0.0001), &s);
+  net.Add(GenPoolingConf("pool1", true, 3, 2, 1));
+  net.Add(GenReLUConf("relu1"));
+  net.Add(GenLRNConf("lrn1"));
+  net.Add(GenConvConf("conv2", 32, 5, 1, 2, 0.01));
+  net.Add(GenReLUConf("relu2"));
+  net.Add(GenPoolingConf("pool2", false, 3, 2, 1));
+  net.Add(GenLRNConf("lrn2"));
+  net.Add(GenConvConf("conv3", 64, 5, 1, 2, 0.01));
+  net.Add(GenReLUConf("relu3"));
+  net.Add(GenPoolingConf("pool3", false, 3, 2, 1));
+  net.Add(GenFlattenConf("flat"));
+  net.Add(GenDenseConf("ip", 10, 0.01, 250));
+  return net;
+}
+
+void Train(float lr, int num_epoch, string data_dir) {
+  Cifar10 data(data_dir);
+  Tensor train_x, train_y, test_x, test_y;
+  Tensor train_x_1, train_x_2, train_y_1, train_y_2;
+  {
+    auto train = data.ReadTrainData();
+    size_t nsamples = train.first.shape(0);
+    auto mtrain =
+        Reshape(train.first, Shape{nsamples, train.first.Size() / nsamples});
+    const Tensor &mean = Average(mtrain, 0);
+    SubRow(mean, &mtrain);
+    train_x = Reshape(mtrain, train.first.shape());
+    train_y = train.second;
+
+    LOG(INFO) << "Slicing training data...";
+    train_x_1.Reshape(Shape{nsamples / 2, train.first.shape(1),
+        train.first.shape(2), train.first.shape(3)});
+    LOG(INFO) << "Copying first data slice...";
+    CopyDataToFrom(&train_x_1, train_x, train_x.Size() / 2);
+    train_x_2.Reshape(Shape{nsamples / 2, train.first.shape(1),
+        train.first.shape(2), train.first.shape(3)});
+    LOG(INFO) << "Copying second data slice...";
+    CopyDataToFrom(&train_x_2, train_x, train_x.Size() / 2, 0,
+                   train_x.Size() / 2);
+    train_y_1.Reshape(Shape{nsamples / 2});
+    train_y_1.AsType(kInt);
+    LOG(INFO) << "Copying first label slice...";
+    CopyDataToFrom(&train_y_1, train_y, train_y.Size() / 2);
+    train_y_2.Reshape(Shape{nsamples / 2});
+    train_y_2.AsType(kInt);
+    LOG(INFO) << "Copying second label slice...";
+    CopyDataToFrom(&train_y_2, train_y, train_y.Size() / 2, 0,
+                   train_y.Size() / 2);
+
+    auto test = data.ReadTestData();
+    nsamples = test.first.shape(0);
+    auto mtest =
+        Reshape(test.first, Shape{nsamples, test.first.Size() / nsamples});
+    SubRow(mean, &mtest);
+    test_x = Reshape(mtest, test.first.shape());
+    test_y = test.second;
+  }
+
+  CHECK_EQ(train_x.shape(0), train_y.shape(0));
+  CHECK_EQ(test_x.shape(0), test_y.shape(0));
+  LOG(INFO) << "Total Training samples = " << train_y.shape(0)
+            << ", Total Test samples = " << test_y.shape(0);
+  CHECK_EQ(train_x_1.shape(0), train_y_1.shape(0));
+  LOG(INFO) << "On net 1, Training samples = " << train_y_1.shape(0)
+            << ", Test samples = " << test_y.shape(0);
+  CHECK_EQ(train_x_2.shape(0), train_y_2.shape(0));
+  LOG(INFO) << "On net 2, Training samples = " << train_y_2.shape(0);
+
+  auto net_1 = CreateNet();
+  auto net_2 = CreateNet();
+
+  SGD sgd;
+  OptimizerConf opt_conf;
+  opt_conf.set_momentum(0.9);
+  auto reg = opt_conf.mutable_regularizer();
+  reg->set_coefficient(0.004);
+  sgd.Setup(opt_conf);
+  sgd.SetLearningRateGenerator([lr](int step) {
+    if (step <= 120)
+      return 0.001;
+    else if (step <= 130)
+      return 0.0001;
+    else
+      return 0.00001;
+  });
+
+  SoftmaxCrossEntropy loss_1, loss_2;
+  Accuracy acc_1, acc_2;
+  /// Create updater aggregating gradient on CPU
+  std::shared_ptr<Updater> updater = std::make_shared<LocalUpdater>(2, &sgd);
+
+  /// Only need to register parameter once.
+  net_1.Compile(true, true, updater, &loss_1, &acc_1);
+  net_2.Compile(true, false, updater, &loss_2, &acc_1);
+
+  MemPoolConf mem_conf;
+  mem_conf.add_device(0);
+  mem_conf.add_device(1);
+  std::shared_ptr<DeviceMemPool> mem_pool(new CnMemPool(mem_conf));
+  std::shared_ptr<CudaGPU> dev_1(new CudaGPU(0, mem_pool));
+  std::shared_ptr<CudaGPU> dev_2(new CudaGPU(1, mem_pool));
+
+  net_1.ToDevice(dev_1);
+  net_2.ToDevice(dev_2);
+
+  train_x_1.ToDevice(dev_1);
+  train_y_1.ToDevice(dev_1);
+  test_x.ToDevice(dev_1);
+  test_y.ToDevice(dev_1);
+  train_x_2.ToDevice(dev_2);
+  train_y_2.ToDevice(dev_2);
+
+  // net.Train(100, num_epoch, train_x, train_y, test_x, test_y);
+
+  LOG(INFO) << "Launching thread...";
+  std::thread t1 =
+      net_1.TrainThread(50, num_epoch, train_x_1, train_y_1, test_x, test_y);
+  std::thread t2 = net_2.TrainThread(50, num_epoch, train_x_2, train_y_2);
+  t1.join();
+  t2.join();
+}
+}
+
+int main(int argc, char **argv) {
+  singa::InitChannel(nullptr);
+  int pos = singa::ArgPos(argc, argv, "-epoch");
+  int nEpoch = 1;
+  if (pos != -1) nEpoch = atoi(argv[pos + 1]);
+  pos = singa::ArgPos(argc, argv, "-lr");
+  float lr = 0.001;
+  if (pos != -1) lr = atof(argv[pos + 1]);
+  pos = singa::ArgPos(argc, argv, "-data");
+  string data = "cifar-10-batches-bin";
+  if (pos != -1) data = argv[pos + 1];
+
+  LOG(INFO) << "Start training";
+  singa::Train(lr, nEpoch, data);
+  LOG(INFO) << "End training";
+}
diff --git a/examples/cifar10/alexnet.cc b/examples/cifar10/alexnet.cc
new file mode 100644
index 0000000..8a506d2
--- /dev/null
+++ b/examples/cifar10/alexnet.cc
@@ -0,0 +1,203 @@
+/************************************************************
+*
+* Licensed to the Apache Software Foundation (ASF) under one
+* or more contributor license agreements.  See the NOTICE file
+* distributed with this work for additional information
+* regarding copyright ownership.  The ASF licenses this file
+* to you under the Apache License, Version 2.0 (the
+* "License"); you may not use this file except in compliance
+* with the License.  You may obtain a copy of the License at
+*
+*   http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing,
+* software distributed under the License is distributed on an
+* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+* KIND, either express or implied.  See the License for the
+* specific language governing permissions and limitations
+* under the License.
+*
+*************************************************************/
+
+#include "./cifar10.h"
+#include "singa/model/feed_forward_net.h"
+#include "singa/model/optimizer.h"
+#include "singa/model/initializer.h"
+#include "singa/model/metric.h"
+#include "singa/utils/channel.h"
+#include "singa/utils/string.h"
+namespace singa {
+// currently supports 'cudnn' and 'singacpp'
+const std::string engine = "cudnn";
+LayerConf GenConvConf(string name, int nb_filter, int kernel, int stride,
+                      int pad, float std) {
+  LayerConf conf;
+  conf.set_name(name);
+  conf.set_type(engine + "_convolution");
+  ConvolutionConf *conv = conf.mutable_convolution_conf();
+  conv->set_num_output(nb_filter);
+  conv->add_kernel_size(kernel);
+  conv->add_stride(stride);
+  conv->add_pad(pad);
+  conv->set_bias_term(true);
+
+  ParamSpec *wspec = conf.add_param();
+  wspec->set_name(name + "_weight");
+  auto wfill = wspec->mutable_filler();
+  wfill->set_type("Gaussian");
+  wfill->set_std(std);
+
+  ParamSpec *bspec = conf.add_param();
+  bspec->set_name(name + "_bias");
+  bspec->set_lr_mult(2);
+//  bspec->set_decay_mult(0);
+  return conf;
+}
+
+LayerConf GenPoolingConf(string name, bool max_pool, int kernel, int stride,
+                         int pad) {
+  LayerConf conf;
+  conf.set_name(name);
+  conf.set_type(engine + "_pooling");
+  PoolingConf *pool = conf.mutable_pooling_conf();
+  pool->set_kernel_size(kernel);
+  pool->set_stride(stride);
+  pool->set_pad(pad);
+  if (!max_pool) pool->set_pool(PoolingConf_PoolMethod_AVE);
+  return conf;
+}
+
+LayerConf GenReLUConf(string name) {
+  LayerConf conf;
+  conf.set_name(name);
+  conf.set_type(engine + "_relu");
+  return conf;
+}
+
+LayerConf GenDenseConf(string name, int num_output, float std, float wd) {
+  LayerConf conf;
+  conf.set_name(name);
+  conf.set_type("singa_dense");
+  DenseConf *dense = conf.mutable_dense_conf();
+  dense->set_num_output(num_output);
+
+  ParamSpec *wspec = conf.add_param();
+  wspec->set_name(name + "_weight");
+  wspec->set_decay_mult(wd);
+  auto wfill = wspec->mutable_filler();
+  wfill->set_type("Gaussian");
+  wfill->set_std(std);
+
+  ParamSpec *bspec = conf.add_param();
+  bspec->set_name(name + "_bias");
+  bspec->set_lr_mult(2);
+  bspec->set_decay_mult(0);
+
+  return conf;
+}
+
+LayerConf GenLRNConf(string name) {
+  LayerConf conf;
+  conf.set_name(name);
+  conf.set_type(engine + "_lrn");
+  LRNConf *lrn = conf.mutable_lrn_conf();
+  lrn->set_local_size(3);
+  lrn->set_alpha(5e-05);
+  lrn->set_beta(0.75);
+  return conf;
+}
+
+LayerConf GenFlattenConf(string name) {
+  LayerConf conf;
+  conf.set_name(name);
+  conf.set_type("singa_flatten");
+  return conf;
+}
+
+FeedForwardNet CreateNet() {
+  FeedForwardNet net;
+  Shape s{3, 32, 32};
+
+  net.Add(GenConvConf("conv1", 32, 5, 1, 2, 0.0001), &s);
+  net.Add(GenPoolingConf("pool1", true, 3, 2, 1));
+  net.Add(GenReLUConf("relu1"));
+  net.Add(GenLRNConf("lrn1"));
+  net.Add(GenConvConf("conv2", 32, 5, 1, 2, 0.01));
+  net.Add(GenReLUConf("relu2"));
+  net.Add(GenPoolingConf("pool2", false, 3, 2, 1));
+  net.Add(GenLRNConf("lrn2"));
+  net.Add(GenConvConf("conv3", 64, 5, 1, 2, 0.01));
+  net.Add(GenReLUConf("relu3"));
+  net.Add(GenPoolingConf("pool3", false, 3, 2, 1));
+  net.Add(GenFlattenConf("flat"));
+  net.Add(GenDenseConf("ip", 10, 0.01, 250));
+  return net;
+}
+
+void Train(int num_epoch, string data_dir) {
+  Cifar10 data(data_dir);
+  Tensor train_x, train_y, test_x, test_y;
+  {
+    auto train = data.ReadTrainData();
+    size_t nsamples = train.first.shape(0);
+    auto mtrain =
+        Reshape(train.first, Shape{nsamples, train.first.Size() / nsamples});
+    const Tensor& mean = Average(mtrain, 0);
+    SubRow(mean, &mtrain);
+    train_x = Reshape(mtrain, train.first.shape());
+    train_y = train.second;
+    auto test = data.ReadTestData();
+    nsamples = test.first.shape(0);
+    auto mtest =
+        Reshape(test.first, Shape{nsamples, test.first.Size() / nsamples});
+    SubRow(mean, &mtest);
+    test_x = Reshape(mtest, test.first.shape());
+    test_y = test.second;
+  }
+  CHECK_EQ(train_x.shape(0), train_y.shape(0));
+  CHECK_EQ(test_x.shape(0), test_y.shape(0));
+  LOG(INFO) << "Training samples = " << train_y.shape(0)
+            << ", Test samples = " << test_y.shape(0);
+  auto net = CreateNet();
+  SGD sgd;
+  OptimizerConf opt_conf;
+  opt_conf.set_momentum(0.9);
+  auto reg = opt_conf.mutable_regularizer();
+  reg->set_coefficient(0.004);
+  sgd.Setup(opt_conf);
+  sgd.SetLearningRateGenerator([](int step) {
+    if (step <= 120)
+      return 0.001;
+    else if (step <= 130)
+      return 0.0001;
+    else
+      return 0.00001;
+  });
+
+  SoftmaxCrossEntropy loss;
+  Accuracy acc;
+  net.Compile(true, &sgd, &loss, &acc);
+
+  auto dev = std::make_shared<CudaGPU>();
+  net.ToDevice(dev);
+  train_x.ToDevice(dev);
+  train_y.ToDevice(dev);
+  test_x.ToDevice(dev);
+  test_y.ToDevice(dev);
+  net.Train(100, num_epoch, train_x, train_y, test_x, test_y);
+}
+}
+
+int main(int argc, char **argv) {
+  singa::InitChannel(nullptr);
+  int pos = singa::ArgPos(argc, argv, "-epoch");
+  int nEpoch = 1;
+  if (pos != -1) nEpoch = atoi(argv[pos + 1]);
+  pos = singa::ArgPos(argc, argv, "-data");
+  string data = "cifar-10-batches-bin";
+  if (pos != -1) data = argv[pos + 1];
+
+  LOG(INFO) << "Start training";
+  singa::Train(nEpoch, data);
+  LOG(INFO) << "End training";
+}
diff --git a/examples/cifar10/alexnet.py b/examples/cifar10/alexnet.py
new file mode 100644
index 0000000..02437b3
--- /dev/null
+++ b/examples/cifar10/alexnet.py
@@ -0,0 +1,61 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# =============================================================================
+''' This model is created following the structure from
+https://code.google.com/p/cuda-convnet/source/browse/trunk/example-layers/layers-18pct.cfg
+Following the same setting for hyper-parameters and data pre-processing, the final
+validation accuracy would be about 82%.
+'''
+
+# sys.path.append(os.path.join(os.path.dirname(__file__), '../../build/python'))
+from singa import layer
+from singa import metric
+from singa import loss
+from singa import net as ffnet
+
+
+def create_net(use_cpu=False):
+    if use_cpu:
+        layer.engine = 'singacpp'
+
+    net = ffnet.FeedForwardNet(loss.SoftmaxCrossEntropy(), metric.Accuracy())
+    W0_specs = {'init': 'gaussian', 'mean': 0, 'std': 0.0001}
+    W1_specs = {'init': 'gaussian', 'mean': 0, 'std': 0.01}
+    W2_specs = {'init': 'gaussian', 'mean': 0, 'std': 0.01, 'decay_mult': 250}
+
+    b_specs = {'init': 'constant', 'value': 0, 'lr_mult': 2, 'decay_mult': 0}
+    net.add(layer.Conv2D('conv1', 32, 5, 1, W_specs=W0_specs.copy(), b_specs=b_specs.copy(), pad=2, input_sample_shape=(3,32,32,)))
+    net.add(layer.MaxPooling2D('pool1', 3, 2, pad=1))
+    net.add(layer.Activation('relu1'))
+    net.add(layer.LRN(name='lrn1', size=3, alpha=5e-5))
+    net.add(layer.Conv2D('conv2', 32, 5, 1, W_specs=W1_specs.copy(), b_specs=b_specs.copy(), pad=2))
+    net.add(layer.Activation('relu2'))
+    net.add(layer.AvgPooling2D('pool2', 3, 2,  pad=1))
+    net.add(layer.LRN('lrn2', size=3, alpha=5e-5))
+    net.add(layer.Conv2D('conv3', 64, 5, 1, W_specs=W1_specs.copy(), b_specs=b_specs.copy(), pad=2))
+    net.add(layer.Activation('relu3'))
+    net.add(layer.AvgPooling2D('pool3', 3, 2, pad=1))
+    net.add(layer.Flatten('flat'))
+    net.add(layer.Dense( 'dense', 10, W_specs=W2_specs.copy(), b_specs=b_specs.copy()))
+    for (p, specs) in zip(net.param_values(), net.param_specs()):
+        filler = specs.filler
+        if filler.type == 'gaussian':
+            p.gaussian(filler.mean, filler.std)
+        else:
+            p.set_value(0)
+        print specs.name, filler.type, p.l1()
+
+    return net
diff --git a/examples/cifar10/cifar10.h b/examples/cifar10/cifar10.h
new file mode 100644
index 0000000..d2b9225
--- /dev/null
+++ b/examples/cifar10/cifar10.h
@@ -0,0 +1,98 @@
+/************************************************************
+*
+* Licensed to the Apache Software Foundation (ASF) under one
+* or more contributor license agreements.  See the NOTICE file
+* distributed with this work for additional information
+* regarding copyright ownership.  The ASF licenses this file
+* to you under the Apache License, Version 2.0 (the
+* "License"); you may not use this file except in compliance
+* with the License.  You may obtain a copy of the License at
+*
+*   http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing,
+* software distributed under the License is distributed on an
+* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+* KIND, either express or implied.  See the License for the
+* specific language governing permissions and limitations
+* under the License.
+*
+*************************************************************/
+#include <fstream>
+#include <string>
+#include <cstdint>
+#include <iostream>
+#include "singa/core/tensor.h"
+using std::string;
+namespace singa {
+/// For reading cifar10 binary data as tensors.
+class Cifar10 {
+ public:
+  /// 'dir_path': path to the folder including the *.bin files
+  Cifar10(string dir_path, bool normalize = true) : dir_path_(dir_path) {}
+
+  /// read all training data into an image Tensor and a label Tensor
+  const std::pair<Tensor, Tensor> ReadTrainData();
+  /// read all test data into an image Tensor and a label Tensor
+  const std::pair<Tensor, Tensor> ReadTestData();
+  /// read data from one file into an image Tensor and a label Tensor
+  const std::pair<Tensor, Tensor> ReadFile(string file);
+
+  void ReadImage(std::ifstream* file, int* label, char* buffer);
+
+ private:
+  const size_t kImageSize = 32;
+  const size_t kImageVol = 3072;
+  const size_t kBatchSize = 10000;
+  const size_t kTrainFiles = 5;
+
+  string dir_path_;
+};
+
+void Cifar10::ReadImage(std::ifstream* file, int* label, char* buffer) {
+  char label_char;
+  file->read(&label_char, 1);
+  *label = static_cast<int>(label_char);
+  file->read(buffer, kImageVol);
+  return;
+}
+const std::pair<Tensor, Tensor> Cifar10::ReadFile(string file) {
+  Tensor images(Shape{kBatchSize, 3, kImageSize, kImageSize});
+  Tensor labels(Shape{kBatchSize}, kInt);
+  if (dir_path_.back() != '/') dir_path_.push_back('/');
+  LOG(INFO) << "Reading file " << dir_path_ + file;
+  std::ifstream data_file((dir_path_ + file).c_str(),
+                          std::ios::in | std::ios::binary);
+  CHECK(data_file.is_open()) << "Unable to open file " << dir_path_ + file;
+  int label;
+  char image[kImageVol];
+  float float_image[kImageVol];
+  int tmplabels[kBatchSize];
+  for (size_t itemid = 0; itemid < kBatchSize; ++itemid) {
+    // LOG(INFO) << "reading " << itemid << "-th image";
+    ReadImage(&data_file, &label, image);
+    for (size_t i = 0; i < kImageVol; i++)
+      float_image[i] = static_cast<float>(static_cast<uint8_t>(image[i]));
+    images.CopyDataFromHostPtr(float_image, kImageVol, itemid * kImageVol);
+    tmplabels[itemid] = label;
+  }
+  labels.CopyDataFromHostPtr(tmplabels, kBatchSize);
+  return std::make_pair(images, labels);
+}
+
+const std::pair<Tensor, Tensor> Cifar10::ReadTrainData() {
+  Tensor images(Shape{kBatchSize * kTrainFiles, 3, kImageSize, kImageSize});
+  Tensor labels(Shape{kBatchSize * kTrainFiles}, kInt);
+  for (size_t fileid = 0; fileid < kTrainFiles; ++fileid) {
+    string file = "data_batch_" + std::to_string(fileid + 1) + ".bin";
+    const auto ret = ReadFile(file);
+    CopyDataToFrom(&images, ret.first, ret.first.Size(),
+                   fileid * ret.first.Size());
+    CopyDataToFrom(&labels, ret.second, kBatchSize, fileid * kBatchSize);
+  }
+  return std::make_pair(images, labels);
+}
+const std::pair<Tensor, Tensor> Cifar10::ReadTestData() {
+  return ReadFile("test_batch.bin");
+}
+}  // namespace singa
diff --git a/examples/cifar10/create_data.cc b/examples/cifar10/create_data.cc
deleted file mode 100644
index 5564c38..0000000
--- a/examples/cifar10/create_data.cc
+++ /dev/null
@@ -1,138 +0,0 @@
-/************************************************************
-*
-* Licensed to the Apache Software Foundation (ASF) under one
-* or more contributor license agreements.  See the NOTICE file
-* distributed with this work for additional information
-* regarding copyright ownership.  The ASF licenses this file
-* to you under the Apache License, Version 2.0 (the
-* "License"); you may not use this file except in compliance
-* with the License.  You may obtain a copy of the License at
-*
-*   http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an
-* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-* KIND, either express or implied.  See the License for the
-* specific language governing permissions and limitations
-* under the License.
-*
-*************************************************************/
-
-
-/**
- * Create training and test DataShard for CIFAR dataset. 
- * It is adapted from convert_cifar_data from Caffe. 
- *    create_shard.bin <input> <output_folder> 
- * 
- * Read from JobConf object the option to use KVfile, HDFS or other (1st layer
- * store_conf object). 
- * To load to HDFS, specify "hdfs://namenode/examples" as the output folder
- */
-
-#include <glog/logging.h>
-#include <fstream>
-#include <string>
-#include <cstdint>
-#include <iostream>
-
-#include "singa/io/store.h"
-#include "singa/proto/common.pb.h"
-#include "singa/utils/common.h"
-
-using std::string;
-
-const int kCIFARSize = 32;
-const int kCIFARImageNBytes = 3072;
-const int kCIFARBatchSize = 10000;
-const int kCIFARTrainBatches = 5;
-
-void read_image(std::ifstream* file, int* label, char* buffer) {
-  char label_char;
-  file->read(&label_char, 1);
-  *label = label_char;
-  file->read(buffer, kCIFARImageNBytes);
-  return;
-}
-
-void create_data(const string& input_folder, const string& output_folder) {
-  int label;
-  char str_buffer[kCIFARImageNBytes];
-  string rec_buf;
-  singa::RecordProto image;
-  image.add_shape(3);
-  image.add_shape(kCIFARSize);
-  image.add_shape(kCIFARSize);
-
-  singa::RecordProto mean;
-  mean.CopyFrom(image);
-  for (int i = 0; i < kCIFARImageNBytes; i++)
-    mean.add_data(0.f);
-
-  string store_backend = output_folder.find("hdfs") !=-1 ?
-                         "hdfsfile" : "kvfile";
-  auto store = singa::io::CreateStore(store_backend);
-  CHECK(store->Open(output_folder + "/train_data.bin", singa::io::kCreate));
-  LOG(INFO) << "Preparing training data";
-  int count = 0;
-  for (int fileid = 0; fileid < kCIFARTrainBatches; ++fileid) {
-    LOG(INFO) << "Training Batch " << fileid + 1;
-    snprintf(str_buffer, kCIFARImageNBytes, "/data_batch_%d.bin", fileid + 1);
-    std::ifstream data_file((input_folder + str_buffer).c_str(),
-        std::ios::in | std::ios::binary);
-    CHECK(data_file.is_open()) << "Unable to open train file #" << fileid + 1;
-    for (int itemid = 0; itemid < kCIFARBatchSize; ++itemid) {
-      read_image(&data_file, &label, str_buffer);
-      image.set_label(label);
-      image.set_pixel(str_buffer, kCIFARImageNBytes);
-      image.SerializeToString(&rec_buf);
-      int length = snprintf(str_buffer, kCIFARImageNBytes, "%05d", count);
-      CHECK(store->Write(string(str_buffer, length), rec_buf));
-
-      const string& pixels = image.pixel();
-      for (int i = 0; i < kCIFARImageNBytes; i++)
-        mean.set_data(i, mean.data(i) + static_cast<uint8_t>(pixels[i]));
-      count += 1;
-    }
-  }
-  store->Flush();
-  store->Close();
-
-  LOG(INFO) << "Create image mean";
-  store->Open(output_folder + "/image_mean.bin", singa::io::kCreate);
-  for (int i = 0; i < kCIFARImageNBytes; i++)
-    mean.set_data(i, mean.data(i) / count);
-  mean.SerializeToString(&rec_buf);
-  store->Write("mean", rec_buf);
-  store->Flush();
-  store->Close();
-
-  LOG(INFO) << "Create test data";
-  store->Open(output_folder + "/test_data.bin", singa::io::kCreate);
-  std::ifstream data_file((input_folder + "/test_batch.bin").c_str(),
-      std::ios::in | std::ios::binary);
-  CHECK(data_file.is_open()) << "Unable to open test file.";
-  for (int itemid = 0; itemid < kCIFARBatchSize; ++itemid) {
-    read_image(&data_file, &label, str_buffer);
-    image.set_label(label);
-    image.set_pixel(str_buffer, kCIFARImageNBytes);
-    image.SerializeToString(&rec_buf);
-    int length = snprintf(str_buffer, kCIFARImageNBytes, "%05d", itemid);
-    CHECK(store->Write(string(str_buffer, length), rec_buf));
-  }
-  store->Flush();
-  store->Close();
-}
-
-int main(int argc, char** argv) {
-  if (argc != 3) {
-    std::cout <<"Create train and test DataShard for Cifar dataset.\n"
-      << "Usage:\n"
-      << "    create_data.bin input_folder output_folder\n"
-      << "Where the input folder should contain the binary batch files.\n";
-  } else {
-    google::InitGoogleLogging(argv[0]);
-    create_data(string(argv[1]), string(argv[2]));
-  }
-  return 0;
-}
diff --git a/examples/cifar10/cudnn.conf b/examples/cifar10/cudnn.conf
deleted file mode 100644
index 0f9402e..0000000
--- a/examples/cifar10/cudnn.conf
+++ /dev/null
@@ -1,297 +0,0 @@
-name: "cifar10-convnet"
-train_steps: 70000
-test_steps: 100
-test_freq: 1000
-#validate_steps: 100
-#validate_freq: 300
-disp_freq: 200
-gpu: 0
-#checkpoint_path: "examples/cifar10/checkpoint/step1000-worker0"
-train_one_batch {
-  alg: kBP
-}
-updater{
-  type: kSGD
-  weight_decay:0.004
-  momentum:0.9
-  learning_rate {
-    type: kFixedStep
-    fixedstep_conf:{
-      step:0
-      step:60000
-      step:65000
-      step_lr:0.001
-      step_lr:0.0001
-      step_lr:0.00001
-    }
-  }
-}
-neuralnet {
-  layer{
-    name: "data"
-    type: kRecordInput
-    store_conf {
-      backend: "kvfile"
-      path: "examples/cifar10/train_data.bin"
-      mean_file: "examples/cifar10/image_mean.bin"
-      batchsize: 100
-      #random_skip: 5000
-      shape: 3
-      shape: 32
-      shape: 32
-    }
-    include: kTrain
-  }
-#  layer{
-#    name: "data"
-#    type: kRecordInput
-#    store_conf {
-#      backend: "kvfile"
-#      path: "examples/cifar10/val_data.bin"
-#      mean_file: "examples/cifar10/image_mean.bin"
-#      batchsize: 64
-#      random_skip: 5000
-#      shape: 3
-#      shape: 32
-#      shape: 32
-#    }
-#    include: kVal
-#  }
-  layer{
-    name: "data"
-    type: kRecordInput
-    store_conf {
-      backend: "kvfile"
-      path: "examples/cifar10/test_data.bin"
-      mean_file: "examples/cifar10/image_mean.bin"
-      batchsize: 100
-      shape: 3
-      shape: 32
-      shape: 32
-    }
-    include: kTest
-  }
-
-  layer {
-    name: "conv1"
-    type: kCudnnConv
-    srclayers: "data"
-    convolution_conf {
-      num_filters: 32
-      kernel: 5
-      stride: 1
-      pad:2
-    }
-    param {
-      name: "w1"
-      init {
-        type:kGaussian
-        std:0.0001
-      }
-    }
-    param {
-      name: "b1"
-      lr_scale:2.0
-      wd_scale: 0
-      init {
-        type: kConstant
-        value:0
-      }
-    }
-  }
-
-  layer {
-    name: "pool1"
-    type: kCudnnPool
-    srclayers: "conv1"
-    pooling_conf {
-      pool: MAX
-      kernel: 3
-      stride: 2
-    }
-  }
-  layer {
-    name: "relu1"
-    type: kCudnnActivation
-    activation_conf {
-      type: RELU
-    }
-    share_src_blobs: true
-    srclayers:"pool1"
-  }
-  layer {
-    name: "norm1"
-    type: kCudnnLRN
-    lrn_conf {
-      local_size: 3
-      alpha: 5e-05
-      beta: 0.75
-    }
-    srclayers:"relu1"
-  }
-  layer {
-    name: "conv2"
-    type: kCudnnConv
-    srclayers: "norm1"
-    convolution_conf {
-      num_filters: 32
-      kernel: 5
-      stride: 1
-      pad:2
-    }
-    param {
-      name: "w2"
-      init {
-        type:kGaussian
-        std:0.01
-      }
-    }
-    param {
-      name: "b2"
-      lr_scale:2.0
-      wd_scale: 0
-      init {
-        type: kConstant
-        value:0
-      }
-    }
-  }
-  layer {
-    name: "relu2"
-    type: kCudnnActivation
-    activation_conf {
-      type: RELU
-    }
-    share_src_blobs: true
-    srclayers:"conv2"
-  }
-  layer {
-    name: "pool2"
-    type: kCudnnPool
-    srclayers: "relu2"
-    pooling_conf {
-      pool: AVG
-      kernel: 3
-      stride: 2
-    }
-  }
-  layer {
-    name: "norm2"
-    type: kCudnnLRN
-    lrn_conf {
-      local_size: 3
-      alpha: 5e-05
-      beta: 0.75
-    }
-    srclayers:"pool2"
-  }
-  layer {
-    name: "conv3"
-    type: kCudnnConv
-    srclayers: "norm2"
-    convolution_conf {
-      num_filters: 64
-      kernel: 5
-      stride: 1
-      pad:2
-    }
-    param {
-      name: "w3"
-      init {
-        type:kGaussian
-        std:0.01
-      }
-    }
-    param {
-      name: "b3"
-      lr_scale: 2
-      wd_scale: 0
-      init {
-        type: kConstant
-        value:0
-      }
-    }
-  }
-  layer {
-    name: "relu3"
-    type: kCudnnActivation
-    activation_conf {
-      type: RELU
-    }
-    share_src_blobs: true
-    srclayers:"conv3"
-  }
-  layer {
-    name: "pool3"
-    type: kCudnnPool
-    srclayers: "relu3"
-    pooling_conf {
-      pool: AVG
-      kernel: 3
-      stride: 2
-    }
-  }
-  layer {
-    name: "ip1"
-    type: kInnerProduct
-    srclayers:"pool3"
-    innerproduct_conf {
-      num_output: 10
-    }
-    param {
-      name: "w4"
-      wd_scale:250
-      init {
-        type:kGaussian
-        std:0.01
-      }
-    }
-    param {
-      name: "b4"
-      lr_scale:2.0
-      wd_scale:0
-      init {
-        type: kConstant
-        value:0
-      }
-    }
-  }
-  layer {
-   name : "softmax"
-   type: kCudnnSoftmax
-   srclayers: "ip1"
-   include: kTest
-  }
-
-  layer {
-   name : "accuracy"
-   type: kAccuracy
-   srclayers: "softmax"
-   srclayers: "data"
-   include: kTest
-  }
-  layer{
-    name: "loss"
-    type: kSoftmaxLoss
-    srclayers:"ip1"
-    srclayers: "data"
-    include : kTrain
-  }
-# uncomment "softmax", "argsort", "output" layer and comment "loss" layer
-# to extract features from argsort
-#  layer {
-#    name : "output"
-#    type: kCSVOutput
-#    srclayers: "argsort"
-#    store_conf {
-#      path: "examples/cifar10/out.csv"
-#    }
-#  }
-}
-cluster {
-  nworker_groups: 1
-  nserver_groups: 1
-  nworkers_per_group: 1
-  nworkers_per_procs: 1
-  workspace: "examples/cifar10"
-}
diff --git a/examples/cifar10/cudnn_bm.conf b/examples/cifar10/cudnn_bm.conf
deleted file mode 100644
index 2ca30cb..0000000
--- a/examples/cifar10/cudnn_bm.conf
+++ /dev/null
@@ -1,376 +0,0 @@
-name: "cifar10-convnet"
-train_steps: 70000
-test_steps: 100
-test_freq: 1000
-#validate_steps: 100
-#validate_freq: 300
-disp_freq: 200
-gpu: 0
-#checkpoint_path: "examples/cifar10/checkpoint/step1000-worker0"
-train_one_batch {
-  alg: kBP
-}
-updater{
-  type: kSGD
-  weight_decay:0.004
-  momentum:0.9
-  learning_rate {
-    type: kFixedStep
-    fixedstep_conf:{
-      step:0
-      step:60000
-      step:65000
-      step_lr:0.001
-      step_lr:0.0001
-      step_lr:0.00001
-    }
-  }
-}
-neuralnet {
-  layer{
-    name: "data"
-    type: kRecordInput
-    store_conf {
-      backend: "kvfile"
-      path: "examples/cifar10/train_data.bin"
-      mean_file: "examples/cifar10/image_mean.bin"
-      batchsize: 100
-      #random_skip: 5000
-      shape: 3
-      shape: 32
-      shape: 32
-    }
-    include: kTrain
-  }
-#  layer{
-#    name: "data"
-#    type: kRecordInput
-#    store_conf {
-#      backend: "kvfile"
-#      path: "examples/cifar10/val_data.bin"
-#      mean_file: "examples/cifar10/image_mean.bin"
-#      batchsize: 64
-#      random_skip: 5000
-#      shape: 3
-#      shape: 32
-#      shape: 32
-#    }
-#    include: kVal
-#  }
-  layer{
-    name: "data"
-    type: kRecordInput
-    store_conf {
-      backend: "kvfile"
-      path: "examples/cifar10/test_data.bin"
-      mean_file: "examples/cifar10/image_mean.bin"
-      batchsize: 100
-      shape: 3
-      shape: 32
-      shape: 32
-    }
-    include: kTest
-  }
-
-  layer {
-    name: "conv1"
-    type: kCudnnConv
-    srclayers: "data"
-    convolution_conf {
-      num_filters: 32
-      kernel: 5
-      stride: 1
-      pad:2
-    }
-    param {
-      name: "w1"
-      init {
-        type:kGaussian
-        std:0.0001
-      }
-    }
-    param {
-      name: "b1"
-      lr_scale:2.0
-      wd_scale: 0
-      init {
-        type: kConstant
-        value:0
-      }
-    }
-  }
-
-  layer {
-    name: "pool1"
-    type: kCudnnPool
-    srclayers: "conv1"
-    pooling_conf {
-      pool: MAX
-      kernel: 3
-      stride: 2
-    }
-  }
-  layer {
-    name: "bm1"
-    type: kCudnnBM
-      param {
-        name: "s11"
-        init {
- 		  type:kConstant
- 		  value:1
-        }
-      }
-	  param {
-		name: "s12"
-		init {
-		  type:kConstant
-		  value:0
-		}
-	  }
-	  param {
-		name: "s13"
-		init {
-		  type:kConstant
-		  value:0
-		}
-	  }
-	  param {
-		name: "s14"
-		init {
-		  type:kConstant
-		  value:0
-		}
-	  }
-    srclayers:"pool1"
-  }
-  layer {
-    name: "relu1"
-    type: kCudnnActivation
-    activation_conf {
-      type: RELU
-    }
-    share_src_blobs: true
-    srclayers:"bm1"
-  }
-  layer {
-    name: "conv2"
-    type: kCudnnConv
-    srclayers: "relu1"
-    convolution_conf {
-      num_filters: 32
-      kernel: 5
-      stride: 1
-      pad:2
-    }
-    param {
-      name: "w2"
-      init {
-        type:kGaussian
-        std:0.01
-      }
-    }
-    param {
-      name: "b2"
-      lr_scale:2.0
-      wd_scale: 0
-      init {
-        type: kConstant
-        value:0
-      }
-    }
-  }
-  layer {
-    name: "bm2"
-    type: kCudnnBM
-      param {
-        name: "s21"
-        init {
- 		  type:kConstant
- 		  value:1
-        }
-      }
-	  param {
-		name: "s22"
-		init {
-		  type:kConstant
-		  value:0
-		}
-	  }
-	  param {
-		name: "s23"
-		init {
-		  type:kConstant
-		  value:0
-		}
-	  }
-	  param {
-		name: "s24"
-		init {
-		  type:kConstant
-		  value:0
-		}
-	  }
-    srclayers:"conv2"
-  }
-  layer {
-    name: "relu2"
-    type: kCudnnActivation
-    activation_conf {
-      type: RELU
-    }
-    share_src_blobs: true
-    srclayers:"bm2"
-  }
-  layer {
-    name: "pool2"
-    type: kCudnnPool
-    srclayers: "relu2"
-    pooling_conf {
-      pool: AVG
-      kernel: 3
-      stride: 2
-    }
-  }
-  layer {
-    name: "conv3"
-    type: kCudnnConv
-    srclayers: "relu2"
-    convolution_conf {
-      num_filters: 64
-      kernel: 5
-      stride: 1
-      pad:2
-    }
-    param {
-      name: "w3"
-      init {
-        type:kGaussian
-        std:0.01
-      }
-    }
-    param {
-      name: "b3"
-      lr_scale: 2
-      wd_scale: 0
-      init {
-        type: kConstant
-        value:0
-      }
-    }
-  }
-  layer {
-    name: "bm3"
-    type: kCudnnBM
-      param {
-        name: "s31"
-        init {
- 		  type:kConstant
- 		  value:1
-        }
-      }
-	  param {
-		name: "s32"
-		init {
-		  type:kConstant
-		  value:0
-		}
-	  }
-	  param {
-		name: "s33"
-		init {
-		  type:kConstant
-		  value:0
-		}
-	  }
-	  param {
-		name: "s34"
-		init {
-		  type:kConstant
-		  value:0
-		}
-	  }
-    srclayers:"conv3"
-  }
-  layer {
-    name: "relu3"
-    type: kCudnnActivation
-    activation_conf {
-      type: RELU
-    }
-    share_src_blobs: true
-    srclayers:"bm3"
-  }
-  layer {
-    name: "pool3"
-    type: kCudnnPool
-    srclayers: "relu3"
-    pooling_conf {
-      pool: AVG
-      kernel: 3
-      stride: 2
-    }
-  }
-  layer {
-    name: "ip1"
-    type: kInnerProduct
-    srclayers:"pool3"
-    innerproduct_conf {
-      num_output: 10
-    }
-    param {
-      name: "w4"
-      wd_scale:250
-      init {
-        type:kGaussian
-        std:0.01
-      }
-    }
-    param {
-      name: "b4"
-      lr_scale:2.0
-      wd_scale:0
-      init {
-        type: kConstant
-        value:0
-      }
-    }
-  }
-  layer {
-   name : "softmax"
-   type: kCudnnSoftmax
-   srclayers: "ip1"
-   include: kTest
-  }
-
-  layer {
-   name : "accuracy"
-   type: kAccuracy
-   srclayers: "softmax"
-   srclayers: "data"
-   include: kTest
-  }
-  layer{
-    name: "loss"
-    type: kSoftmaxLoss
-    srclayers:"ip1"
-    srclayers: "data"
-    include : kTrain
-  }
-# uncomment "softmax", "argsort", "output" layer and comment "loss" layer
-# to extract features from argsort
-#  layer {
-#    name : "output"
-#    type: kCSVOutput
-#    srclayers: "argsort"
-#    store_conf {
-#      path: "examples/cifar10/out.csv"
-#    }
-#  }
-}
-cluster {
-  nworker_groups: 1
-  nserver_groups: 1
-  nworkers_per_group: 1
-  nworkers_per_procs: 1
-  workspace: "examples/cifar10"
-}
diff --git a/examples/cifar10/cudnn_hybrid.conf b/examples/cifar10/cudnn_hybrid.conf
deleted file mode 100644
index a11145c..0000000
--- a/examples/cifar10/cudnn_hybrid.conf
+++ /dev/null
@@ -1,306 +0,0 @@
-name: "cifar10-convnet"
-train_steps: 10000
-test_steps: 0
-test_freq: 200
-#validate_steps: 100
-#validate_freq: 300
-disp_freq: 200
-gpu: 0
-gpu: 1
-#debug: true
-#checkpoint_path: "examples/cifar10/checkpoint/step1000-worker0"
-train_one_batch {
-  alg: kBP
-}
-updater{
-  type: kSGD
-  weight_decay:0.004
-  momentum:0.9
-  learning_rate {
-    type: kFixedStep
-    fixedstep_conf:{
-      step:0
-      step:60000
-      step:65000
-      step_lr:0.001
-      step_lr:0.0001
-      step_lr:0.00001
-    }
-  }
-}
-neuralnet {
-  layer{
-    name: "data"
-    type: kRecordInput
-    store_conf {
-      backend: "kvfile"
-      path: "examples/cifar10/train_data.bin"
-      mean_file: "examples/cifar10/image_mean.bin"
-      batchsize: 100
-      #random_skip: 5000
-      shape: 3
-      shape: 32
-      shape: 32
-    }
-    include: kTrain
-    partition_dim: 0
-  }
-#  layer{
-#    name: "data"
-#    type: kRecordInput
-#    store_conf {
-#      backend: "kvfile"
-#      path: "examples/cifar10/val_data.bin"
-#      mean_file: "examples/cifar10/image_mean.bin"
-#      batchsize: 64
-#      random_skip: 5000
-#      shape: 3
-#      shape: 32
-#      shape: 32
-#    }
-#    include: kVal
-#  }
-  layer{
-    name: "data"
-    type: kRecordInput
-    store_conf {
-      backend: "kvfile"
-      path: "examples/cifar10/test_data.bin"
-      mean_file: "examples/cifar10/image_mean.bin"
-      batchsize: 100
-      shape: 3
-      shape: 32
-      shape: 32
-    }
-    include: kTest
-    partition_dim: 0
-  }
-
-  layer {
-    partition_dim: 0
-    name: "conv1"
-    type: kCudnnConv
-    srclayers: "data"
-    convolution_conf {
-      num_filters: 32
-      kernel: 5
-      stride: 1
-      pad:2
-    }
-    param {
-      name: "w1"
-      init {
-        type:kGaussian
-        std:0.0001
-      }
-    }
-    param {
-      name: "b1"
-      lr_scale:2.0
-      init {
-        type: kConstant
-        value:0
-      }
-    }
-  }
-
-  layer {
-    partition_dim: 0
-    name: "pool1"
-    type: kCudnnPool
-    srclayers: "conv1"
-    pooling_conf {
-      pool: MAX
-      kernel: 3
-      stride: 2
-    }
-  }
-  layer {
-    partition_dim: 0
-    name: "relu1"
-    type: kCudnnActivation
-    activation_conf {
-      type: RELU
-    }
-    srclayers:"pool1"
-  }
-  layer {
-    partition_dim: 0
-    name: "norm1"
-    type: kCudnnLRN
-    lrn_conf {
-      local_size: 3
-      alpha: 5e-05
-      beta: 0.75
-    }
-    srclayers:"relu1"
-  }
-  layer {
-    partition_dim: 0
-    name: "conv2"
-    type: kCudnnConv
-    srclayers: "norm1"
-    convolution_conf {
-      num_filters: 32
-      kernel: 5
-      stride: 1
-      pad:2
-    }
-    param {
-      name: "w2"
-      init {
-        type:kGaussian
-        std:0.01
-      }
-    }
-    param {
-      name: "b2"
-      lr_scale:2.0
-      init {
-        type: kConstant
-        value:0
-      }
-    }
-  }
-  layer {
-    partition_dim: 0
-    name: "relu2"
-    type: kCudnnActivation
-    activation_conf {
-      type: RELU
-    }
-    srclayers:"conv2"
-  }
-  layer {
-    partition_dim: 0
-    name: "pool2"
-    type: kCudnnPool
-    srclayers: "relu2"
-    pooling_conf {
-      pool: AVG
-      kernel: 3
-      stride: 2
-    }
-  }
-  layer {
-    partition_dim: 0
-    name: "norm2"
-    type: kCudnnLRN
-    lrn_conf {
-      local_size: 3
-      alpha: 5e-05
-      beta: 0.75
-    }
-    srclayers:"pool2"
-  }
-  layer {
-    partition_dim: 0
-    name: "conv3"
-    type: kCudnnConv
-    srclayers: "norm2"
-    convolution_conf {
-      num_filters: 64
-      kernel: 5
-      stride: 1
-      pad:2
-    }
-    param {
-      name: "w3"
-      init {
-        type:kGaussian
-        std:0.01
-      }
-    }
-    param {
-      name: "b3"
-      init {
-        type: kConstant
-        value:0
-      }
-    }
-  }
-  layer {
-    partition_dim: 0
-    name: "relu3"
-    type: kCudnnActivation
-    activation_conf {
-      type: RELU
-    }
-    srclayers:"conv3"
-  }
-  layer {
-    partition_dim: 0
-    name: "pool3"
-    type: kCudnnPool
-    srclayers: "relu3"
-    pooling_conf {
-      pool: AVG
-      kernel: 3
-      stride: 2
-    }
-  }
-  layer {
-    partition_dim: 1
-    name: "ip1"
-    type: kInnerProduct
-    srclayers:"pool3"
-    innerproduct_conf {
-      num_output: 10
-    }
-    param {
-      name: "w4"
-      wd_scale:250
-      init {
-        type:kGaussian
-        std:0.01
-      }
-    }
-    param {
-      name: "b4"
-      lr_scale:2.0
-      wd_scale:0
-      init {
-        type: kConstant
-        value:0
-      }
-    }
-  }
-#  layer {
-#   name : "softmax"
-#   type: kSoftmax
-#   srclayers: "ip1"
-#  }
-#
-#  layer {
-#   name : "argsort"
-#   type: kArgSort
-#   srclayers: "softmax"
-#  }
-  layer{
-    partition_dim: 0
-    name: "loss"
-    type: kSoftmaxLoss
-    softmaxloss_conf{
-      topk:1
-    }
-    srclayers:"ip1"
-    srclayers: "data"
-  }
-# uncomment "softmax", "argsort", "output" layer and comment "loss" layer
-# to extract features from argsort
-#  layer {
-#    name : "output"
-#    type: kCSVOutput
-#    srclayers: "argsort"
-#    store_conf {
-#      path: "examples/cifar10/out.csv"
-#    }
-#  }
-}
-cluster {
-  nworker_groups: 1
-  nserver_groups: 1
-  nworkers_per_group: 2
-  nworkers_per_procs: 2
-  workspace: "examples/cifar10"
-}
diff --git a/examples/cifar10/download_data.py b/examples/cifar10/download_data.py
new file mode 100755
index 0000000..7129b03
--- /dev/null
+++ b/examples/cifar10/download_data.py
@@ -0,0 +1,70 @@
+#!/usr/bin/env python
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+# 
+#     http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# 
+
+import urllib
+import tarfile
+import os
+import sys
+import argparse
+
+
+def extract_tarfile(filepath):
+    if os.path.exists(filepath):
+        print 'The tar file does exist. Extracting it now..'
+        with tarfile.open(filepath, 'r') as f:
+            f.extractall('.')
+        print 'Finished!'
+        sys.exit(0)
+
+
+def check_dir_exist(dirpath):
+    if os.path.exists(dirpath):
+        print 'Directory %s does exist. To redownload the files, '\
+            'remove the existing directory and %s.tar.gz' % (dirpath, dirpath)
+        return True
+    else:
+        return False
+
+
+def do_download(dirpath, gzfile, url):
+    if check_dir_exist(dirpath):
+        sys.exit(0)
+    print 'Downloading CIFAR10 from %s' % (url)
+    urllib.urlretrieve(url, gzfile)
+    extract_tarfile(gzfile)
+    print 'Finished!'
+
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser(description='Download Cifar10 datasets')
+    parser.add_argument(
+        'file',
+        type=str,
+        choices=['py', 'bin'])
+    args = parser.parse_args()
+    if args.file == 'bin':
+        dirpath = 'cifar-10-batches-bin'
+        gzfile = 'cifar-10-binary' + '.tar.gz'
+        url = 'http://www.cs.toronto.edu/~kriz/cifar-10-binary.tar.gz'
+        do_download(dirpath, gzfile, url)
+    else:
+        dirpath = 'cifar-10-batches-py'
+        gzfile = 'cifar-10-python' + '.tar.gz'
+        url = 'http://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz'
+        do_download(dirpath, gzfile, url)
diff --git a/examples/cifar10/hybrid.conf b/examples/cifar10/hybrid.conf
deleted file mode 100644
index ec3da0c..0000000
--- a/examples/cifar10/hybrid.conf
+++ /dev/null
@@ -1,292 +0,0 @@
-name: "cifar10-convnet"
-train_steps: 1000
-test_steps: 0
-test_freq: 200
-#validate_steps: 100
-#validate_freq: 300
-disp_freq: 30
-#debug: true
-#checkpoint_path: "examples/cifar10/checkpoint/step1000-worker0"
-train_one_batch {
-  alg: kBP
-}
-updater{
-  type: kSGD
-  weight_decay:0.004
-  momentum:0.9
-  learning_rate {
-    type: kFixedStep
-    fixedstep_conf:{
-      step:0
-      step:60000
-      step:65000
-      step_lr:0.001
-      step_lr:0.0001
-      step_lr:0.00001
-    }
-  }
-}
-neuralnet {
-  layer{
-    name: "data"
-    type: kRecordInput
-    store_conf {
-      backend: "kvfile"
-      path: "examples/cifar10/train_data.bin"
-      mean_file: "examples/cifar10/image_mean.bin"
-      batchsize: 100
-      #random_skip: 5000
-      shape: 3
-      shape: 32
-      shape: 32
-    }
-    include: kTrain
-  }
-#  layer{
-#    name: "data"
-#    type: kRecordInput
-#    store_conf {
-#      backend: "kvfile"
-#      path: "examples/cifar10/val_data.bin"
-#      mean_file: "examples/cifar10/image_mean.bin"
-#      batchsize: 64
-#      random_skip: 5000
-#      shape: 3
-#      shape: 32
-#      shape: 32
-#    }
-#    include: kVal
-#  }
-  layer{
-    name: "data"
-    type: kRecordInput
-    store_conf {
-      backend: "kvfile"
-      path: "examples/cifar10/test_data.bin"
-      mean_file: "examples/cifar10/image_mean.bin"
-      batchsize: 100
-      shape: 3
-      shape: 32
-      shape: 32
-    }
-    include: kTest
-  }
-
-  layer {
-    partition_dim: 0
-    name: "conv1"
-    type: kCConvolution
-    srclayers: "data"
-    convolution_conf {
-      num_filters: 32
-      kernel: 5
-      stride: 1
-      pad:2
-    }
-    param {
-      name: "w1"
-      init {
-        type:kGaussian
-        std:0.0001
-      }
-    }
-    param {
-      name: "b1"
-      lr_scale:2.0
-      init {
-        type: kConstant
-        value:0
-      }
-    }
-  }
-
-  layer {
-    partition_dim: 0
-    name: "pool1"
-    type: kCPooling
-    srclayers: "conv1"
-    pooling_conf {
-      pool: MAX
-      kernel: 3
-      stride: 2
-    }
-  }
-  layer {
-    partition_dim: 0
-    name: "relu1"
-    type: kReLU
-    srclayers:"pool1"
-  }
-  layer {
-    partition_dim: 0
-    name: "norm1"
-    type: kLRN
-    lrn_conf {
-      local_size: 3
-      alpha: 5e-05
-      beta: 0.75
-    }
-    srclayers:"relu1"
-  }
-  layer {
-    partition_dim: 0
-    name: "conv2"
-    type: kCConvolution
-    srclayers: "norm1"
-    convolution_conf {
-      num_filters: 32
-      kernel: 5
-      stride: 1
-      pad:2
-    }
-    param {
-      name: "w2"
-      init {
-        type:kGaussian
-        std:0.01
-      }
-    }
-    param {
-      name: "b2"
-      lr_scale:2.0
-      init {
-        type: kConstant
-        value:0
-      }
-    }
-  }
-  layer {
-    partition_dim: 0
-    name: "relu2"
-    type: kReLU
-    srclayers:"conv2"
-  }
-  layer {
-    partition_dim: 0
-    name: "pool2"
-    type: kCPooling
-    srclayers: "relu2"
-    pooling_conf {
-      pool: AVG
-      kernel: 3
-      stride: 2
-    }
-  }
-  layer {
-    partition_dim: 0
-    name: "norm2"
-    type: kLRN
-    lrn_conf {
-      local_size: 3
-      alpha: 5e-05
-      beta: 0.75
-    }
-    srclayers:"pool2"
-  }
-  layer {
-    partition_dim: 0
-    name: "conv3"
-    type: kCConvolution
-    srclayers: "norm2"
-    convolution_conf {
-      num_filters: 64
-      kernel: 5
-      stride: 1
-      pad:2
-    }
-    param {
-      name: "w3"
-      init {
-        type:kGaussian
-        std:0.01
-      }
-    }
-    param {
-      name: "b3"
-      init {
-        type: kConstant
-        value:0
-      }
-    }
-  }
-  layer {
-    partition_dim: 0
-    name: "relu3"
-    type: kReLU
-    srclayers:"conv3"
-  }
-  layer {
-    partition_dim: 0
-    name: "pool3"
-    type: kCPooling
-    srclayers: "relu3"
-    pooling_conf {
-      pool: AVG
-      kernel: 3
-      stride: 2
-    }
-  }
-  layer {
-    partition_dim: 1
-    name: "ip1"
-    type: kInnerProduct
-    srclayers:"pool3"
-    innerproduct_conf {
-      num_output: 10
-    }
-    param {
-      name: "w4"
-      wd_scale:250
-      init {
-        type:kGaussian
-        std:0.01
-      }
-    }
-    param {
-      name: "b4"
-      lr_scale:2.0
-      wd_scale:0
-      init {
-        type: kConstant
-        value:0
-      }
-    }
-  }
-#  layer {
-#   name : "softmax"
-#   type: kSoftmax
-#   srclayers: "ip1"
-#  }
-#
-#  layer {
-#   name : "argsort"
-#   type: kArgSort
-#   srclayers: "softmax"
-#  }
-  layer{
-    name: "loss"
-    type: kSoftmaxLoss
-    softmaxloss_conf{
-      topk:1
-    }
-    srclayers:"ip1"
-    srclayers: "data"
-  }
-# uncomment "softmax", "argsort", "output" layer and comment "loss" layer
-# to extract features from argsort
-#  layer {
-#    name : "output"
-#    type: kCSVOutput
-#    srclayers: "argsort"
-#    store_conf {
-#      path: "examples/cifar10/out.csv"
-#    }
-#  }
-}
-cluster {
-  nworker_groups: 1
-  nserver_groups: 1
-  nworkers_per_group: 2
-  nworkers_per_procs: 2
-  workspace: "examples/cifar10"
-}
diff --git a/examples/cifar10/job.conf b/examples/cifar10/job.conf
deleted file mode 100644
index d20b452..0000000
--- a/examples/cifar10/job.conf
+++ /dev/null
@@ -1,279 +0,0 @@
-name: "cifar10-convnet"
-train_steps: 1000
-test_steps: 100
-test_freq: 200
-#validate_steps: 100
-#validate_freq: 300
-disp_freq: 50
-#checkpoint_path: "examples/cifar10/checkpoint/step1000-worker0"
-train_one_batch {
-  alg: kBP
-}
-updater{
-  type: kSGD
-  weight_decay:0.004
-  momentum:0.9
-  learning_rate {
-    type: kFixedStep
-    fixedstep_conf:{
-      step:0
-      step:60000
-      step:65000
-      step_lr:0.001
-      step_lr:0.0001
-      step_lr:0.00001
-    }
-  }
-}
-neuralnet {
-  layer{
-    name: "data"
-    type: kRecordInput
-    store_conf {
-      backend: "kvfile"
-      path: "examples/cifar10/train_data.bin"
-      mean_file: "examples/cifar10/image_mean.bin"
-      batchsize: 100
-      #random_skip: 5000
-      shape: 3
-      shape: 32
-      shape: 32
-    }
-    include: kTrain
-  }
-#  layer{
-#    name: "data"
-#    type: kRecordInput
-#    store_conf {
-#      backend: "kvfile"
-#      path: "examples/cifar10/val_data.bin"
-#      mean_file: "examples/cifar10/image_mean.bin"
-#      batchsize: 64
-#      random_skip: 5000
-#      shape: 3
-#      shape: 32
-#      shape: 32
-#    }
-#    include: kVal
-#  }
-  layer{
-    name: "data"
-    type: kRecordInput
-    store_conf {
-      backend: "kvfile"
-      path: "examples/cifar10/test_data.bin"
-      mean_file: "examples/cifar10/image_mean.bin"
-      batchsize: 100
-      shape: 3
-      shape: 32
-      shape: 32
-    }
-    include: kTest
-  }
-
-  layer {
-    name: "conv1"
-    type: kCConvolution
-    srclayers: "data"
-    convolution_conf {
-      num_filters: 32
-      kernel: 5
-      stride: 1
-      pad:2
-    }
-    param {
-      name: "w1"
-      init {
-        type:kGaussian
-        std:0.0001
-      }
-    }
-    param {
-      name: "b1"
-      lr_scale:2.0
-      init {
-        type: kConstant
-        value:0
-      }
-    }
-  }
-
-  layer {
-    name: "pool1"
-    type: kCPooling
-    srclayers: "conv1"
-    pooling_conf {
-      pool: MAX
-      kernel: 3
-      stride: 2
-    }
-  }
-  layer {
-    name: "relu1"
-    type: kReLU
-    srclayers:"pool1"
-  }
-  layer {
-    name: "norm1"
-    type: kLRN
-    lrn_conf {
-      local_size: 3
-      alpha: 5e-05
-      beta: 0.75
-    }
-    srclayers:"relu1"
-  }
-  layer {
-    name: "conv2"
-    type: kCConvolution
-    srclayers: "norm1"
-    convolution_conf {
-      num_filters: 32
-      kernel: 5
-      stride: 1
-      pad:2
-    }
-    param {
-      name: "w2"
-      init {
-        type:kGaussian
-        std:0.01
-      }
-    }
-    param {
-      name: "b2"
-      lr_scale:2.0
-      init {
-        type: kConstant
-        value:0
-      }
-    }
-  }
-  layer {
-    name: "relu2"
-    type: kReLU
-    srclayers:"conv2"
-  }
-  layer {
-    name: "pool2"
-    type: kCPooling
-    srclayers: "relu2"
-    pooling_conf {
-      pool: AVG
-      kernel: 3
-      stride: 2
-    }
-  }
-  layer {
-    name: "norm2"
-    type: kLRN
-    lrn_conf {
-      local_size: 3
-      alpha: 5e-05
-      beta: 0.75
-    }
-    srclayers:"pool2"
-  }
-  layer {
-    name: "conv3"
-    type: kCConvolution
-    srclayers: "norm2"
-    convolution_conf {
-      num_filters: 64
-      kernel: 5
-      stride: 1
-      pad:2
-    }
-    param {
-      name: "w3"
-      init {
-        type:kGaussian
-        std:0.01
-      }
-    }
-    param {
-      name: "b3"
-      init {
-        type: kConstant
-        value:0
-      }
-    }
-  }
-  layer {
-    name: "relu3"
-    type: kReLU
-    srclayers:"conv3"
-  }
-  layer {
-    name: "pool3"
-    type: kCPooling
-    srclayers: "relu3"
-    pooling_conf {
-      pool: AVG
-      kernel: 3
-      stride: 2
-    }
-  }
-  layer {
-    name: "ip1"
-    type: kInnerProduct
-    srclayers:"pool3"
-    innerproduct_conf {
-      num_output: 10
-    }
-    param {
-      name: "w4"
-      wd_scale:250
-      init {
-        type:kGaussian
-        std:0.01
-      }
-    }
-    param {
-      name: "b4"
-      lr_scale:2.0
-      wd_scale:0
-      init {
-        type: kConstant
-        value:0
-      }
-    }
-  }
-#  layer {
-#   name : "softmax"
-#   type: kSoftmax
-#   srclayers: "ip1"
-#  }
-#
-#  layer {
-#   name : "argsort"
-#   type: kArgSort
-#   srclayers: "softmax"
-#  }
-  layer{
-    name: "loss"
-    type: kSoftmaxLoss
-    softmaxloss_conf{
-      topk:1
-    }
-    srclayers:"ip1"
-    srclayers: "data"
-  }
-# uncomment "softmax", "argsort", "output" layer and comment "loss" layer
-# to extract features from argsort
-#  layer {
-#    name : "output"
-#    type: kCSVOutput
-#    srclayers: "argsort"
-#    store_conf {
-#      path: "examples/cifar10/out.csv"
-#    }
-#  }
-}
-cluster {
-  nworker_groups: 1
-  nserver_groups: 1
-  nworkers_per_group: 1
-  nworkers_per_procs: 1
-  workspace: "examples/cifar10"
-}
diff --git a/examples/cifar10/predict.py b/examples/cifar10/predict.py
new file mode 100644
index 0000000..307a610
--- /dev/null
+++ b/examples/cifar10/predict.py
@@ -0,0 +1,90 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# =============================================================================
+import cPickle as pickle
+import numpy as np
+
+# sys.path.append(os.path.join(os.path.dirname(__file__), '../../build/python'))
+
+from singa import device
+from singa import tensor
+import net as ffnet
+
+
+def predict(net, images, dev, topk=5):
+    '''Predict the label of each image.
+
+    Args:
+        net, a pretrained neural net
+        images, a batch of images [batch_size, 3, 32, 32], which have been
+            pre-processed
+        dev, the training device
+        topk, return the topk labels for each image.
+    '''
+    x = tensor.from_numpy(images.astype(np.float32))
+    x.to_device(dev)
+    y = net.predict(x)
+    y.to_host()
+    y = tensor.to_numpy(y)
+    prob = np.average(y, 0)
+    labels = np.flipud(np.argsort(prob))  # sort prob in descending order
+    return labels[0:topk], prob[labels[0:topk]]
+
+
+def load_dataset(filepath):
+    print 'Loading data file %s' % filepath
+    with open(filepath, 'rb') as fd:
+        cifar10 = pickle.load(fd)
+    image = cifar10['data'].astype(dtype=np.uint8)
+    image = image.reshape((-1, 3, 32, 32))
+    label = np.asarray(cifar10['labels'], dtype=np.uint8)
+    label = label.reshape(label.size, 1)
+    return image, label
+
+
+def load_train_data(dir_path, num_batches=5):
+    labels = []
+    batchsize = 10000
+    images = np.empty((num_batches * batchsize, 3, 32, 32), dtype=np.uint8)
+    for did in range(1, num_batches + 1):
+        fname_train_data = dir_path + "/data_batch_{}".format(did)
+        image, label = load_dataset(fname_train_data)
+        images[(did - 1) * batchsize:did * batchsize] = image
+        labels.extend(label)
+    images = np.array(images, dtype=np.float32)
+    labels = np.array(labels, dtype=np.int32)
+    return images, labels
+
+
+def load_test_data(dir_path):
+    images, labels = load_dataset(dir_path + "/test_batch")
+    return np.array(images,  dtype=np.float32), np.array(labels, dtype=np.int32)
+
+
+def compute_image_mean(train_dir):
+    images = np.load(train_dir)
+    return np.average(images, 0)
+
+if __name__ == '__main__':
+    model = ffnet.create_alexnet()
+    model.load('model.bin')
+    cuda = device.create_cuda_gpu()
+    model.to_device(cuda)
+
+    mean = compute_image_mean('cifar-10-batches-py')
+    test_images, _ = load_test_data('cifar-10-batches-py')
+    # minus mean is for alexnet; vgg uses a different pre-processing strategy
+    print predict(model, test_images - mean, cuda)
diff --git a/examples/cifar10/resnet.py b/examples/cifar10/resnet.py
new file mode 100644
index 0000000..6b573e9
--- /dev/null
+++ b/examples/cifar10/resnet.py
@@ -0,0 +1,95 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# =============================================================================
+"""The resnet model is adapted from http://torch.ch/blog/2016/02/04/resnets.html
+The best validation accuracy we achieved is about 83% without data augmentation.
+The performance could be improved by tuning some hyper-parameters, including
+learning rate, weight decay, max_epoch, parameter initialization, etc.
+"""
+
+import cPickle as pickle
+
+# sys.path.append(os.path.join(os.path.dirname(__file__), '../../build/python'))
+# use the python modules by installing py singa in build/python
+# pip install -e .
+
+from singa import layer
+from singa import initializer
+from singa import metric
+from singa import loss
+from singa import net as ffnet
+
+
+def Block(net, name, nb_filters, stride):
+    split = net.add(layer.Split(name + "-split", 2))
+    if stride > 1:
+        net.add(layer.Conv2D(name + "-br2-conv", nb_filters, 1, stride, pad=0), split)
+        br2bn = net.add(layer.BatchNormalization(name + "-br2-bn"))
+    net.add(layer.Conv2D(name + "-br1-conv1", nb_filters, 3, stride, pad=1), split)
+    net.add(layer.BatchNormalization(name + "-br1-bn1"))
+    net.add(layer.Activation(name + "-br1-relu"))
+    net.add(layer.Conv2D(name + "-br1-conv2", nb_filters, 3, 1, pad=1))
+    br1bn2 = net.add(layer.BatchNormalization(name + "-br1-bn2"))
+    if stride > 1:
+        net.add(layer.Merge(name + "-merge"), [br1bn2, br2bn])
+    else:
+        net.add(layer.Merge(name + "-merge"), [br1bn2, split])
+
+
+def create_net(use_cpu=False):
+    if use_cpu:
+        layer.engine = 'singacpp'
+
+    net = ffnet.FeedForwardNet(loss.SoftmaxCrossEntropy(), metric.Accuracy())
+    net.add(layer.Conv2D("conv1", 16, 3, 1, pad=1, input_sample_shape=(3, 32, 32)))
+    net.add(layer.BatchNormalization("bn1"))
+    net.add(layer.Activation("relu1"))
+
+    Block(net, "2a", 16, 1)
+    Block(net, "2b", 16, 1)
+    Block(net, "2c", 16, 1)
+
+    Block(net, "3a", 32, 2)
+    Block(net, "3b", 32, 1)
+    Block(net, "3c", 32, 1)
+
+    Block(net, "4a", 64, 2)
+    Block(net, "4b", 64, 1)
+    Block(net, "4c", 64, 1)
+
+    net.add(layer.AvgPooling2D("pool4", 8, 8, border_mode='valid'))
+    net.add(layer.Flatten('flat'))
+    net.add(layer.Dense('ip5', 10))
+    print 'Start intialization............'
+    for (p, name) in zip(net.param_values(), net.param_names()):
+        # print name, p.shape
+        if 'mean' in name or 'beta' in name:
+            p.set_value(0.0)
+        elif 'var' in name:
+            p.set_value(1.0)
+        elif 'gamma' in name:
+            initializer.uniform(p, 0, 1)
+        elif len(p.shape) > 1:
+            if 'conv' in name:
+                # initializer.gaussian(p, 0, math.sqrt(2.0/p.shape[1]))
+                initializer.gaussian(p, 0, 9.0 * p.shape[0])
+            else:
+                initializer.uniform(p, p.shape[0], p.shape[1])
+        else:
+            p.set_value(0)
+        # print name, p.l1()
+
+    return net
diff --git a/examples/cifar10/run-parallel.sh b/examples/cifar10/run-parallel.sh
new file mode 100755
index 0000000..91b3b54
--- /dev/null
+++ b/examples/cifar10/run-parallel.sh
@@ -0,0 +1,21 @@
+#!/usr/bin/env sh
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+# 
+#     http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# 
+
+../../build/bin/alexnet-parallel -epoch 4
+#../../build/bin/vgg-parallel -epoch 4
diff --git a/examples/cifar10/run.sh b/examples/cifar10/run.sh
new file mode 100755
index 0000000..279edf0
--- /dev/null
+++ b/examples/cifar10/run.sh
@@ -0,0 +1,20 @@
+#!/usr/bin/env sh
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+# 
+#     http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# 
+
+../../build/bin/alexnet -epoch 140
diff --git a/examples/cifar10/train.py b/examples/cifar10/train.py
new file mode 100644
index 0000000..d2d70df
--- /dev/null
+++ b/examples/cifar10/train.py
@@ -0,0 +1,186 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# =============================================================================
+""" CIFAR10 dataset is at https://www.cs.toronto.edu/~kriz/cifar.html.
+It includes 5 binary dataset, each contains 10000 images. 1 row (1 image)
+includes 1 label & 3072 pixels.  3072 pixels are 3 channels of a 32x32 image
+"""
+
+import cPickle
+import numpy as np
+import os
+import argparse
+
+# sys.path.append(os.path.join(os.path.dirname(__file__), '../../build/python'))
+from singa import utils
+from singa import optimizer
+from singa import device
+from singa import tensor
+from singa.proto import core_pb2
+
+import alexnet
+import vgg
+import resnet
+
+def load_dataset(filepath):
+    print 'Loading data file %s' % filepath
+    with open(filepath, 'rb') as fd:
+        cifar10 = cPickle.load(fd)
+    image = cifar10['data'].astype(dtype=np.uint8)
+    image = image.reshape((-1, 3, 32, 32))
+    label = np.asarray(cifar10['labels'], dtype=np.uint8)
+    label = label.reshape(label.size, 1)
+    return image, label
+
+
+def load_train_data(dir_path, num_batches=5):
+    labels = []
+    batchsize = 10000
+    images = np.empty((num_batches * batchsize, 3, 32, 32), dtype=np.uint8)
+    for did in range(1, num_batches + 1):
+        fname_train_data = dir_path + "/data_batch_{}".format(did)
+        image, label = load_dataset(fname_train_data)
+        images[(did - 1) * batchsize:did * batchsize] = image
+        labels.extend(label)
+    images = np.array(images, dtype=np.float32)
+    labels = np.array(labels, dtype=np.int32)
+    return images, labels
+
+
+def load_test_data(dir_path):
+    images, labels = load_dataset(dir_path + "/test_batch")
+    return np.array(images,  dtype=np.float32), np.array(labels, dtype=np.int32)
+
+
+def normalize_for_vgg(train_x, test_x):
+    mean = train_x.mean()
+    std = train_x.std()
+    train_x -= mean
+    test_x -= mean
+    train_x /= std
+    test_x /= std
+    return train_x, test_x
+
+
+def normalize_for_alexnet(train_x, test_x):
+    mean = np.average(train_x, axis=0)
+    train_x -= mean
+    test_x -= mean
+    return train_x, test_x
+
+
+def vgg_lr(epoch):
+    return 0.1 / float(1 << ((epoch / 25)))
+
+
+def alexnet_lr(epoch):
+    if epoch < 120:
+        return 0.001
+    elif epoch < 130:
+        return 0.0001
+    else:
+        return 0.00001
+
+def resnet_lr(epoch):
+    if epoch < 80:
+        return 0.02
+    elif epoch < 120:
+        return 0.005
+    else:
+        return 0.001
+
+def train(data, net, max_epoch, get_lr, weight_decay, batch_size=100,
+          use_cpu=False):
+    print 'Start intialization............'
+    if use_cpu:
+        print 'Using CPU'
+        dev = device.get_default_device()
+    else:
+        print 'Using GPU'
+        dev = device.create_cuda_gpu()
+
+    net.to_device(dev)
+    opt = optimizer.SGD(momentum=0.9, weight_decay=weight_decay)
+    for (p, specs) in zip(net.param_names(), net.param_specs()):
+        opt.register(p, specs)
+
+    tx = tensor.Tensor((batch_size, 3, 32, 32), dev)
+    ty = tensor.Tensor((batch_size,), dev, core_pb2.kInt)
+    train_x, train_y, test_x, test_y = data
+    num_train_batch = train_x.shape[0] / batch_size
+    num_test_batch = test_x.shape[0] / batch_size
+    idx = np.arange(train_x.shape[0], dtype=np.int32)
+    for epoch in range(max_epoch):
+        np.random.shuffle(idx)
+        loss, acc = 0.0, 0.0
+        print 'Epoch %d' % epoch
+        for b in range(num_train_batch):
+            x = train_x[idx[b * batch_size: (b + 1) * batch_size]]
+            y = train_y[idx[b * batch_size: (b + 1) * batch_size]]
+            tx.copy_from_numpy(x)
+            ty.copy_from_numpy(y)
+            grads, (l, a) = net.train(tx, ty)
+            loss += l
+            acc += a
+            for (s, p, g) in zip(net.param_names(), net.param_values(), grads):
+                opt.apply_with_lr(epoch, get_lr(epoch), g, p, str(s))
+            # update progress bar
+            utils.update_progress(b * 1.0 / num_train_batch,
+                                  'training loss = %f, accuracy = %f' % (l, a))
+        info = '\ntraining loss = %f, training accuracy = %f, lr = %f' \
+            % (loss / num_train_batch, acc / num_train_batch, get_lr(epoch))
+        print info
+
+        loss, acc = 0.0, 0.0
+        for b in range(num_test_batch):
+            x = test_x[b * batch_size: (b + 1) * batch_size]
+            y = test_y[b * batch_size: (b + 1) * batch_size]
+            tx.copy_from_numpy(x)
+            ty.copy_from_numpy(y)
+            l, a = net.evaluate(tx, ty)
+            loss += l
+            acc += a
+
+        print 'test loss = %f, test accuracy = %f' \
+            % (loss / num_test_batch, acc / num_test_batch)
+    net.save('model.bin')  # save model params into checkpoint file
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser(description='Train vgg/alexnet for cifar10')
+    parser.add_argument('model', choices=['vgg', 'alexnet', 'resnet'], default='alexnet')
+    parser.add_argument('data', default='cifar-10-batches-py')
+    parser.add_argument('--use_cpu', action='store_true')
+    args = parser.parse_args()
+    assert os.path.exists(args.data), \
+        'Pls download the cifar10 dataset via "download_data.py py"'
+    print 'Loading data ..................'
+    train_x, train_y = load_train_data(args.data)
+    test_x, test_y = load_test_data(args.data)
+    if args.model == 'alexnet':
+        train_x, test_x = normalize_for_alexnet(train_x, test_x)
+        net = alexnet.create_net(args.use_cpu)
+        train((train_x, train_y, test_x, test_y), net, 160, alexnet_lr, 0.004,
+              use_cpu=args.use_cpu)
+    elif args.model == 'vgg':
+        train_x, test_x = normalize_for_vgg(train_x, test_x)
+        net = vgg.create_net(args.use_cpu)
+        train((train_x, train_y, test_x, test_y), net, 250, vgg_lr, 0.0005,
+              use_cpu=args.use_cpu)
+    else:
+        train_x, test_x = normalize_for_alexnet(train_x, test_x)
+        net = resnet.create_net(args.use_cpu)
+        train((train_x, train_y, test_x, test_y), net, 200, resnet_lr, 1e-4,
+              use_cpu=args.use_cpu)
diff --git a/examples/cifar10/vgg-parallel.cc b/examples/cifar10/vgg-parallel.cc
new file mode 100644
index 0000000..90e9fce
--- /dev/null
+++ b/examples/cifar10/vgg-parallel.cc
@@ -0,0 +1,327 @@
+/************************************************************
+*
+* Licensed to the Apache Software Foundation (ASF) under one
+* or more contributor license agreements.  See the NOTICE file
+* distributed with this work for additional information
+* regarding copyright ownership.  The ASF licenses this file
+* to you under the Apache License, Version 2.0 (the
+* "License"); you may not use this file except in compliance
+* with the License.  You may obtain a copy of the License at
+*
+*   http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing,
+* software distributed under the License is distributed on an
+* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+* KIND, either express or implied.  See the License for the
+* specific language governing permissions and limitations
+* under the License.
+*
+*************************************************************/
+
+#include "cifar10.h"
+#include "singa/model/feed_forward_net.h"
+#include "singa/model/optimizer.h"
+#include "singa/model/updater.h"
+#include "singa/model/initializer.h"
+#include "singa/model/metric.h"
+#include "singa/utils/channel.h"
+#include "singa/utils/string.h"
+#include "singa/core/memory.h"
+#include <thread>
+#include <memory>
+#include <cmath>
+
+namespace singa {
+
+// currently supports 'cudnn' and 'singacpp'
+const std::string engine = "cudnn";
+const float default_wd  = 0.0005f;
+
+LayerConf GenConvConf(string name, int nb_filter, int kernel, int stride,
+                      int pad, float std = .02f, float bias = .0f) {
+  LayerConf conf;
+  conf.set_name(name);
+  conf.set_type(engine + "_convolution");
+  ConvolutionConf *conv = conf.mutable_convolution_conf();
+  conv->set_num_output(nb_filter);
+  conv->add_kernel_size(kernel);
+  conv->add_stride(stride);
+  conv->add_pad(pad);
+  conv->set_bias_term(true);
+
+  ParamSpec *wspec = conf.add_param();
+  wspec->set_name(name + "_weight");
+  auto wfill = wspec->mutable_filler();
+  wfill->set_type("Gaussian");
+  wfill->set_std(sqrt(2.0f/(nb_filter*9.0f)));
+
+  ParamSpec *bspec = conf.add_param();
+  bspec->set_name(name + "_bias");
+  auto bfill = bspec->mutable_filler();
+  bfill->set_value(bias);
+  //  bspec->set_lr_mult(2);
+  //  bspec->set_decay_mult(0);
+  return conf;
+}
+
+LayerConf GenPoolingConf(string name, bool max_pool, int kernel, int stride,
+                         int pad) {
+  LayerConf conf;
+  conf.set_name(name);
+  conf.set_type(engine + "_pooling");
+  PoolingConf *pool = conf.mutable_pooling_conf();
+  pool->set_kernel_size(kernel);
+  pool->set_stride(stride);
+  pool->set_pad(pad);
+  if (!max_pool) pool->set_pool(PoolingConf_PoolMethod_AVE);
+  return conf;
+}
+
+LayerConf GenReLUConf(string name) {
+  LayerConf conf;
+  conf.set_name(name);
+  conf.set_type(engine + "_relu");
+  return conf;
+}
+
+LayerConf GenDenseConf(string name, int num_output, float std, float wd = default_wd) {
+  LayerConf conf;
+  conf.set_name(name);
+  conf.set_type("singa_dense");
+  DenseConf *dense = conf.mutable_dense_conf();
+  dense->set_num_output(num_output);
+
+  ParamSpec *wspec = conf.add_param();
+  wspec->set_name(name + "_weight");
+  wspec->set_decay_mult(wd);
+  auto wfill = wspec->mutable_filler();
+  wfill->set_type("Gaussian");
+  wfill->set_std(std);
+
+  ParamSpec *bspec = conf.add_param();
+  bspec->set_name(name + "_bias");
+  bspec->set_lr_mult(2);
+  bspec->set_decay_mult(0);
+
+  return conf;
+}
+
+LayerConf GenFlattenConf(string name) {
+  LayerConf conf;
+  conf.set_name(name);
+  conf.set_type("singa_flatten");
+  return conf;
+}
+
+LayerConf GenBatchNormConf(string name) {
+  LayerConf conf;
+  conf.set_name(name);
+  conf.set_type(engine + "_batchnorm");
+  ParamSpec *gammaspec = conf.add_param();
+  gammaspec->set_name(name + "_gamma");
+  auto gammafill = gammaspec->mutable_filler();
+  gammafill->set_type("uniform");
+  gammafill->set_min(0);
+  gammafill->set_max(1);
+
+  ParamSpec *betaspec = conf.add_param();
+  betaspec->set_name(name + "_beta");
+  auto betafill = betaspec->mutable_filler();
+  betafill->set_type("constant");
+  betafill->set_value(0);
+
+  ParamSpec *meanspec = conf.add_param();
+  meanspec->set_name(name + "_mean");
+  auto meanfill = meanspec->mutable_filler();
+  meanfill->set_type("constant");
+  meanfill->set_value(0);
+
+  ParamSpec *varspec = conf.add_param();
+  varspec->set_name(name + "_var");
+  auto varfill = varspec->mutable_filler();
+  varfill->set_type("constant");
+  varfill->set_value(1);
+
+  return conf;
+}
+
+LayerConf GenDropoutConf(string name, float dropout_ratio) {
+  LayerConf conf;
+  conf.set_name(name);
+  conf.set_type(engine + "_dropout");
+  DropoutConf *dropout = conf.mutable_dropout_conf();
+  dropout->set_dropout_ratio(dropout_ratio);
+
+  return conf;
+}
+
+void ConvBNReLU(FeedForwardNet& net, string name, int nb_filter, Shape* shape = nullptr) {
+  net.Add(GenConvConf(name+"_conv", nb_filter, 3, 1, 1), shape);
+  net.Add(GenBatchNormConf(name+"_bn"));
+  net.Add(GenReLUConf(name+"_relu"));
+}
+
+FeedForwardNet CreateNet() {
+  FeedForwardNet net;
+  Shape s{3, 32, 32};
+  ConvBNReLU(net, "conv1_1", 64, &s);
+  net.Add(GenDropoutConf("drop1", 0.3));
+  ConvBNReLU(net, "conv1_2", 64);
+  net.Add(GenPoolingConf("pool1", true, 2, 2, 0));
+  ConvBNReLU(net, "conv2_1", 128);
+  net.Add(GenDropoutConf("drop2", 0.4));
+  ConvBNReLU(net, "conv2_2", 128);
+  net.Add(GenPoolingConf("pool2", true, 2, 2, 0));
+  ConvBNReLU(net, "conv3_1", 256);
+  net.Add(GenDropoutConf("drop3_1", 0.4));
+  ConvBNReLU(net, "conv3_2", 256);
+  net.Add(GenDropoutConf("drop3_2", 0.4));
+  ConvBNReLU(net, "conv3_3", 256);
+  net.Add(GenPoolingConf("pool3", true, 2, 2, 0));
+  ConvBNReLU(net, "conv4_1", 512);
+  net.Add(GenDropoutConf("drop4_1", 0.4));
+  ConvBNReLU(net, "conv4_2", 512);
+  net.Add(GenDropoutConf("drop4_2", 0.4));
+  ConvBNReLU(net, "conv4_3", 512);
+  net.Add(GenPoolingConf("pool4", true, 2, 2, 0));
+  ConvBNReLU(net, "conv5_1", 512);
+  net.Add(GenDropoutConf("drop5_1", 0.4));
+  ConvBNReLU(net, "conv5_2", 512);
+  net.Add(GenDropoutConf("drop5_2", 0.4));
+  ConvBNReLU(net, "conv5_3", 512);
+  net.Add(GenPoolingConf("pool5", true, 2, 2, 0));
+  net.Add(GenFlattenConf("flat"));
+  net.Add(GenDropoutConf("flat_drop", 0.5));
+  net.Add(GenDenseConf("ip1", 512, 0.02));
+  net.Add(GenBatchNormConf("ip1_bn"));
+  net.Add(GenReLUConf("ip1_relu"));
+  net.Add(GenDropoutConf("ip1_drop", 0.5));
+  net.Add(GenDenseConf("ip2", 10, 0.02));
+
+  return net;
+}
+
+void Train(float lr, int num_epoch, string data_dir) {
+  Cifar10 data(data_dir);
+  Tensor train_x, train_y, test_x, test_y;
+  Tensor train_x_1, train_x_2, train_y_1, train_y_2;
+  {
+    auto train = data.ReadTrainData();
+    size_t nsamples = train.first.shape(0);
+    auto mtrain =
+        Reshape(train.first, Shape{nsamples, train.first.Size() / nsamples});
+    const Tensor &mean = Average(mtrain, 0);
+    SubRow(mean, &mtrain);
+    Tensor std = Square(mtrain);
+    std = Average(std, 0);
+    std = Sqrt(std);;
+    std += 1e-6f;
+    DivRow(std, &mtrain);
+
+    train_x = Reshape(mtrain, train.first.shape());
+    train_y = train.second;
+
+    LOG(INFO) << "Slicing training data...";
+    train_x_1.Reshape(Shape{nsamples / 2, train.first.shape(1),
+        train.first.shape(2), train.first.shape(3)});
+    LOG(INFO) << "Copying first data slice...";
+    CopyDataToFrom(&train_x_1, train_x, train_x.Size() / 2);
+    train_x_2.Reshape(Shape{nsamples / 2, train.first.shape(1),
+        train.first.shape(2), train.first.shape(3)});
+    LOG(INFO) << "Copying second data slice...";
+    CopyDataToFrom(&train_x_2, train_x, train_x.Size() / 2, 0,
+                   train_x.Size() / 2);
+    train_y_1.Reshape(Shape{nsamples / 2});
+    train_y_1.AsType(kInt);
+    LOG(INFO) << "Copying first label slice...";
+    CopyDataToFrom(&train_y_1, train_y, train_y.Size() / 2);
+    train_y_2.Reshape(Shape{nsamples / 2});
+    train_y_2.AsType(kInt);
+    LOG(INFO) << "Copying second label slice...";
+    CopyDataToFrom(&train_y_2, train_y, train_y.Size() / 2, 0,
+                   train_y.Size() / 2);
+
+    auto test = data.ReadTestData();
+    nsamples = test.first.shape(0);
+    auto mtest =
+        Reshape(test.first, Shape{nsamples, test.first.Size() / nsamples});
+    SubRow(mean, &mtest);
+    DivRow(std, &mtest);
+    test_x = Reshape(mtest, test.first.shape());
+    test_y = test.second;
+  }
+
+  CHECK_EQ(train_x.shape(0), train_y.shape(0));
+  CHECK_EQ(test_x.shape(0), test_y.shape(0));
+  LOG(INFO) << "Total Training samples = " << train_y.shape(0)
+            << ", Total Test samples = " << test_y.shape(0);
+  CHECK_EQ(train_x_1.shape(0), train_y_1.shape(0));
+  LOG(INFO) << "On net 1, Training samples = " << train_y_1.shape(0)
+            << ", Test samples = " << test_y.shape(0);
+  CHECK_EQ(train_x_2.shape(0), train_y_2.shape(0));
+  LOG(INFO) << "On net 2, Training samples = " << train_y_2.shape(0);
+
+  auto net_1 = CreateNet();
+  auto net_2 = CreateNet();
+
+  SGD sgd;
+  OptimizerConf opt_conf;
+  opt_conf.set_momentum(0.9);
+  auto reg = opt_conf.mutable_regularizer();
+  reg->set_coefficient(0.0005);
+  sgd.Setup(opt_conf);
+  sgd.SetLearningRateGenerator([lr](int epoch) {
+    return 0.01f / static_cast<float>(1u << (epoch/30));
+  });
+
+  SoftmaxCrossEntropy loss_1, loss_2;
+  Accuracy acc_1, acc_2;
+  /// Create updater aggregating gradient on CPU
+  std::shared_ptr<Updater> updater = std::make_shared<LocalUpdater>(2, &sgd);
+
+  /// Only need to register parameter once.
+  net_1.Compile(true, true, updater, &loss_1, &acc_1);
+  net_2.Compile(true, false, updater, &loss_2, &acc_2);
+
+  MemPoolConf mem_conf;
+  mem_conf.add_device(0);
+  mem_conf.add_device(1);
+  std::shared_ptr<DeviceMemPool> mem_pool(new CnMemPool(mem_conf));
+  std::shared_ptr<CudaGPU> dev_1(new CudaGPU(0, mem_pool));
+  std::shared_ptr<CudaGPU> dev_2(new CudaGPU(1, mem_pool));
+  net_1.ToDevice(dev_1);
+  net_2.ToDevice(dev_2);
+
+  train_x_1.ToDevice(dev_1);
+  train_y_1.ToDevice(dev_1);
+  test_x.ToDevice(dev_1);
+  test_y.ToDevice(dev_1);
+  train_x_2.ToDevice(dev_2);
+  train_y_2.ToDevice(dev_2);
+
+  LOG(INFO) << "Launching thread...";
+  std::thread t1 =
+      net_1.TrainThread(50, num_epoch, train_x_1, train_y_1, test_x, test_y);
+  std::thread t2 = net_2.TrainThread(50, num_epoch, train_x_2, train_y_2);
+  t1.join();
+  t2.join();
+}
+}
+
+int main(int argc, char **argv) {
+  singa::InitChannel(nullptr);
+  int pos = singa::ArgPos(argc, argv, "-epoch");
+  int nEpoch = 1;
+  if (pos != -1) nEpoch = atoi(argv[pos + 1]);
+  pos = singa::ArgPos(argc, argv, "-lr");
+  float lr = 0.001;
+  if (pos != -1) lr = atof(argv[pos + 1]);
+  pos = singa::ArgPos(argc, argv, "-data");
+  string data = "cifar-10-batches-bin";
+  if (pos != -1) data = argv[pos + 1];
+
+  LOG(INFO) << "Start training";
+  singa::Train(lr, nEpoch, data);
+  LOG(INFO) << "End training";
+}
diff --git a/examples/cifar10/vgg.py b/examples/cifar10/vgg.py
new file mode 100644
index 0000000..89c6fe8
--- /dev/null
+++ b/examples/cifar10/vgg.py
@@ -0,0 +1,94 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# =============================================================================
+""" The VGG model is adapted from http://torch.ch/blog/2015/07/30/cifar.html.
+The best validation accuracy we achieved is about 89% without data augmentation.
+The performance could be improved by tuning some hyper-parameters, including
+learning rate, weight decay, max_epoch, parameter initialization, etc.
+"""
+
+# sys.path.append(os.path.join(os.path.dirname(__file__), '../../build/python'))
+
+from singa import layer
+from singa import initializer
+from singa import metric
+from singa import loss
+from singa import net as ffnet
+
+
+def ConvBnReLU(net, name, nb_filers, sample_shape=None):
+    net.add(layer.Conv2D(name + '_1', nb_filers, 3, 1, pad=1,
+                         input_sample_shape=sample_shape))
+    net.add(layer.BatchNormalization(name + '_2'))
+    net.add(layer.Activation(name + '_3'))
+
+
+def create_net(use_cpu=False):
+    if use_cpu:
+        layer.engine = 'singacpp'
+    net = ffnet.FeedForwardNet(loss.SoftmaxCrossEntropy(), metric.Accuracy())
+    ConvBnReLU(net, 'conv1_1', 64, (3, 32, 32))
+    net.add(layer.Dropout('drop1', 0.3))
+    ConvBnReLU(net, 'conv1_2', 64)
+    net.add(layer.MaxPooling2D('pool1', 2, 2, border_mode='valid'))
+    ConvBnReLU(net, 'conv2_1', 128)
+    net.add(layer.Dropout('drop2_1', 0.4))
+    ConvBnReLU(net, 'conv2_2', 128)
+    net.add(layer.MaxPooling2D('pool2', 2, 2, border_mode='valid'))
+    ConvBnReLU(net, 'conv3_1', 256)
+    net.add(layer.Dropout('drop3_1', 0.4))
+    ConvBnReLU(net, 'conv3_2', 256)
+    net.add(layer.Dropout('drop3_2', 0.4))
+    ConvBnReLU(net, 'conv3_3', 256)
+    net.add(layer.MaxPooling2D('pool3', 2, 2, border_mode='valid'))
+    ConvBnReLU(net, 'conv4_1', 512)
+    net.add(layer.Dropout('drop4_1', 0.4))
+    ConvBnReLU(net, 'conv4_2', 512)
+    net.add(layer.Dropout('drop4_2', 0.4))
+    ConvBnReLU(net, 'conv4_3', 512)
+    net.add(layer.MaxPooling2D('pool4', 2, 2, border_mode='valid'))
+    ConvBnReLU(net, 'conv5_1', 512)
+    net.add(layer.Dropout('drop5_1', 0.4))
+    ConvBnReLU(net, 'conv5_2', 512)
+    net.add(layer.Dropout('drop5_2', 0.4))
+    ConvBnReLU(net, 'conv5_3', 512)
+    net.add(layer.MaxPooling2D('pool5', 2, 2, border_mode='valid'))
+    net.add(layer.Flatten('flat'))
+    net.add(layer.Dropout('drop_flat', 0.5))
+    net.add(layer.Dense('ip1', 512))
+    net.add(layer.BatchNormalization('batchnorm_ip1'))
+    net.add(layer.Activation('relu_ip1'))
+    net.add(layer.Dropout('drop_ip2', 0.5))
+    net.add(layer.Dense('ip2', 10))
+    print 'Start intialization............'
+    for (p, name) in zip(net.param_values(), net.param_names()):
+        print name, p.shape
+        if 'mean' in name or 'beta' in name:
+            p.set_value(0.0)
+        elif 'var' in name:
+            p.set_value(1.0)
+        elif 'gamma' in name:
+            initializer.uniform(p, 0, 1)
+        elif len(p.shape) > 1:
+            if 'conv' in name:
+                initializer.gaussian(p, 0, 3 * 3 * p.shape[0])
+            else:
+                p.gaussian(0, 0.02)
+        else:
+            p.set_value(0)
+        print name, p.l1()
+
+    return net
diff --git a/examples/imagenet/CMakeLists.txt b/examples/imagenet/CMakeLists.txt
new file mode 100644
index 0000000..465245a
--- /dev/null
+++ b/examples/imagenet/CMakeLists.txt
@@ -0,0 +1,34 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+# 
+#     http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# 
+
+INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR})
+INCLUDE_DIRECTORIES(${CMAKE_BINARY_DIR}/include)
+
+IF(USE_CUDNN)
+  IF(USE_OPENCV)
+  SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fopenmp ")
+    ADD_EXECUTABLE(imagenet alexnet.cc)
+    ADD_DEPENDENCIES(imagenet singa_core singa_model singa_utils singa_io)
+    TARGET_LINK_LIBRARIES(imagenet singa_core singa_utils singa_model singa_io protobuf ${SINGA_LIBKER_LIBS})
+
+    ADD_EXECUTABLE(createdata ilsvrc12.cc)
+    ADD_DEPENDENCIES(createdata singa_core singa_io singa_model singa_utils)
+    TARGET_LINK_LIBRARIES(createdata singa_core singa_utils singa_io singa_model protobuf ${SINGA_LIBKER_LIBS})
+    #SET_TARGET_PROPERTIES(createdata PROPERTIES LINK_FLAGS "${LINK_FLAGS}")
+  ENDIF(USE_OPENCV)
+ENDIF(USE_CUDNN)
diff --git a/examples/imagenet/README.md b/examples/imagenet/README.md
new file mode 100644
index 0000000..be6797c
--- /dev/null
+++ b/examples/imagenet/README.md
@@ -0,0 +1,58 @@
+# Train AlexNet over ImageNet
+
+Convolutional neural network (CNN) is a type of feed-forward neural
+network widely used for image and video classification. In this example, we will
+use a [deep CNN model](http://papers.nips.cc/paper/4824-imagenet-classification-with-deep-convolutional-neural-networks)
+to do image classification against the ImageNet dataset.
+
+## Instructions
+
+### Compile SINGA
+
+Please compile SINGA with CUDA, CUDNN and OpenCV. You can manually turn on the
+options in CMakeLists.txt or run `ccmake ..` in build/ folder.
+
+We have tested CUDNN V4 and V5 (V5 requires CUDA 7.5)
+
+### Data download
+* Please refer to step1-3 on [Instructions to create ImageNet 2012 data](https://github.com/amd/OpenCL-caffe/wiki/Instructions-to-create-ImageNet-2012-data)
+  to download and decompress the data.
+* You can download the training and validation list by
+  [get_ilsvrc_aux.sh](https://github.com/BVLC/caffe/blob/master/data/ilsvrc12/get_ilsvrc_aux.sh)
+  or from [Imagenet](http://www.image-net.org/download-images).
+
+### Data preprocessing
+* Assuming you have downloaded the data and the list.
+  Now we should transform the data into binary files. You can run:
+
+          sh create_data.sh
+
+  The script will generate a test file(`test.bin`), a mean file(`mean.bin`) and
+  several training files(`trainX.bin`) in the specified output folder.
+* You can also change the parameters in `create_data.sh`.
+  + `-trainlist <file>`: the file of training list;
+  + `-trainfolder <folder>`: the folder of training images;
+  + `-testlist <file>`: the file of test list;
+  + `-testfolder <folder>`: the folder of test images;
+  + `-outdata <folder>`: the folder to save output files, including mean, training and test files.
+    The script will generate these files in the specified folder;
+  + `-filesize <int>`: number of training images stored in each binary file.
+
+### Training
+* After preparing data, you can run the following command to train the Alexnet model.
+
+          sh run.sh
+
+* You may change the parameters in `run.sh`.
+  + `-epoch <int>`: number of epoch to be trained, default is 90;
+  + `-lr <float>`: base learning rate; the learning rate decays every 20 epochs,
+    more specifically, `lr = lr * 0.1 ^ (epoch / 20)`;
+  + `-batchsize <int>`: batchsize, it should be changed regarding to your memory;
+  + `-filesize <int>`: number of training images stored in each binary file; it is the
+    same as the `filesize` in data preprocessing;
+  + `-ntrain <int>`: number of training images;
+  + `-ntest <int>`: number of test images;
+  + `-data <folder>`: the folder which stores the binary files, it is exactly the output
+    folder in data preprocessing step;
+  + `-pfreq <int>`: the frequency(in batch) of printing current model status(loss and accuracy);
+  + `-nthreads <int>`: the number of threads used to load the data fed to the model.
diff --git a/examples/imagenet/alexnet.cc b/examples/imagenet/alexnet.cc
new file mode 100644
index 0000000..4ac1130
--- /dev/null
+++ b/examples/imagenet/alexnet.cc
@@ -0,0 +1,402 @@
+/************************************************************
+*
+* Licensed to the Apache Software Foundation (ASF) under one
+* or more contributor license agreements.  See the NOTICE file
+* distributed with this work for additional information
+* regarding copyright ownership.  The ASF licenses this file
+* to you under the Apache License, Version 2.0 (the
+* "License"); you may not use this file except in compliance
+* with the License.  You may obtain a copy of the License at
+*
+*   http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing,
+* software distributed under the License is distributed on an
+* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+* KIND, either express or implied.  See the License for the
+* specific language governing permissions and limitations
+* under the License.
+*
+*************************************************************/
+
+#include "singa/singa_config.h"
+#ifdef USE_OPENCV
+#include <cmath>
+#include "./ilsvrc12.h"
+#include "singa/io/snapshot.h"
+#include "singa/model/feed_forward_net.h"
+#include "singa/model/initializer.h"
+#include "singa/model/metric.h"
+#include "singa/model/optimizer.h"
+#include "singa/utils/channel.h"
+#include "singa/utils/string.h"
+#include "singa/utils/timer.h"
+namespace singa {
+
+// currently supports 'cudnn' and 'singacpp'
+const std::string engine = "cudnn";
+LayerConf GenConvConf(string name, int nb_filter, int kernel, int stride,
+                      int pad, float std, float bias = .0f) {
+  LayerConf conf;
+  conf.set_name(name);
+  conf.set_type(engine + "_convolution");
+  ConvolutionConf *conv = conf.mutable_convolution_conf();
+  conv->set_num_output(nb_filter);
+  conv->add_kernel_size(kernel);
+  conv->add_stride(stride);
+  conv->add_pad(pad);
+  conv->set_bias_term(true);
+
+  ParamSpec *wspec = conf.add_param();
+  wspec->set_name(name + "_weight");
+  auto wfill = wspec->mutable_filler();
+  wfill->set_type("Gaussian");
+  wfill->set_std(std);
+
+  ParamSpec *bspec = conf.add_param();
+  bspec->set_name(name + "_bias");
+  bspec->set_lr_mult(2);
+  bspec->set_decay_mult(0);
+  auto bfill = bspec->mutable_filler();
+  bfill->set_value(bias);
+  return conf;
+}
+
+LayerConf GenPoolingConf(string name, bool max_pool, int kernel, int stride,
+                         int pad) {
+  LayerConf conf;
+  conf.set_name(name);
+  conf.set_type(engine + "_pooling");
+  PoolingConf *pool = conf.mutable_pooling_conf();
+  pool->set_kernel_size(kernel);
+  pool->set_stride(stride);
+  pool->set_pad(pad);
+  if (!max_pool) pool->set_pool(PoolingConf_PoolMethod_AVE);
+  return conf;
+}
+
+LayerConf GenReLUConf(string name) {
+  LayerConf conf;
+  conf.set_name(name);
+  conf.set_type(engine + "_relu");
+  return conf;
+}
+
+LayerConf GenDenseConf(string name, int num_output, float std, float wd,
+                       float bias = .0f) {
+  LayerConf conf;
+  conf.set_name(name);
+  conf.set_type("singa_dense");
+  DenseConf *dense = conf.mutable_dense_conf();
+  dense->set_num_output(num_output);
+
+  ParamSpec *wspec = conf.add_param();
+  wspec->set_name(name + "_weight");
+  wspec->set_decay_mult(wd);
+  auto wfill = wspec->mutable_filler();
+  wfill->set_type("Gaussian");
+  wfill->set_std(std);
+
+  ParamSpec *bspec = conf.add_param();
+  bspec->set_name(name + "_bias");
+  bspec->set_lr_mult(2);
+  bspec->set_decay_mult(0);
+  auto bfill = bspec->mutable_filler();
+  bfill->set_value(bias);
+
+  return conf;
+}
+
+LayerConf GenLRNConf(string name) {
+  LayerConf conf;
+  conf.set_name(name);
+  conf.set_type(engine + "_lrn");
+  LRNConf *lrn = conf.mutable_lrn_conf();
+  lrn->set_local_size(5);
+  lrn->set_alpha(1e-04);
+  lrn->set_beta(0.75);
+  return conf;
+}
+
+LayerConf GenFlattenConf(string name) {
+  LayerConf conf;
+  conf.set_name(name);
+  conf.set_type("singa_flatten");
+  return conf;
+}
+
+LayerConf GenDropoutConf(string name, float dropout_ratio) {
+  LayerConf conf;
+  conf.set_name(name);
+  conf.set_type(engine + "_dropout");
+  DropoutConf *dropout = conf.mutable_dropout_conf();
+  dropout->set_dropout_ratio(dropout_ratio);
+  return conf;
+}
+
+FeedForwardNet CreateNet() {
+  FeedForwardNet net;
+  Shape s{3, 227, 227};
+
+  net.Add(GenConvConf("conv1", 96, 11, 4, 0, 0.01), &s);
+  net.Add(GenReLUConf("relu1"));
+  net.Add(GenPoolingConf("pool1", true, 3, 2, 0));
+  net.Add(GenLRNConf("lrn1"));
+  net.Add(GenConvConf("conv2", 256, 5, 1, 2, 0.01, 1.0));
+  net.Add(GenReLUConf("relu2"));
+  net.Add(GenPoolingConf("pool2", true, 3, 2, 0));
+  net.Add(GenLRNConf("lrn2"));
+  net.Add(GenConvConf("conv3", 384, 3, 1, 1, 0.01));
+  net.Add(GenReLUConf("relu3"));
+  net.Add(GenConvConf("conv4", 384, 3, 1, 1, 0.01, 1.0));
+  net.Add(GenReLUConf("relu4"));
+  net.Add(GenConvConf("conv5", 256, 3, 1, 1, 0.01, 1.0));
+  net.Add(GenReLUConf("relu5"));
+  net.Add(GenPoolingConf("pool5", true, 3, 2, 0));
+  net.Add(GenFlattenConf("flat"));
+  net.Add(GenDenseConf("ip6", 4096, 0.005, 1, 1.0));
+  net.Add(GenReLUConf("relu6"));
+  net.Add(GenDropoutConf("drop6", 0.5));
+  net.Add(GenDenseConf("ip7", 4096, 0.005, 1, 1.0));
+  net.Add(GenReLUConf("relu7"));
+  net.Add(GenDropoutConf("drop7", 0.5));
+  net.Add(GenDenseConf("ip8", 1000, 0.01, 1));
+
+  return net;
+}
+
+void TrainOneEpoch(FeedForwardNet &net, ILSVRC &data,
+                   std::shared_ptr<Device> device, int epoch, string bin_folder,
+                   size_t num_train_files, size_t batchsize, float lr,
+                   Channel *train_ch, size_t pfreq, int nthreads) {
+  float loss = 0.0f, metric = 0.0f;
+  float load_time = 0.0f, train_time = 0.0f;
+  size_t b = 0;
+  size_t n_read;
+  Timer timer, ttr;
+  Tensor prefetch_x, prefetch_y;
+  string binfile = bin_folder + "/train1.bin";
+  timer.Tick();
+  data.LoadData(kTrain, binfile, batchsize, &prefetch_x, &prefetch_y, &n_read,
+                nthreads);
+  load_time += timer.Elapsed();
+  CHECK_EQ(n_read, batchsize);
+  Tensor train_x(prefetch_x.shape(), device);
+  Tensor train_y(prefetch_y.shape(), device, kInt);
+  std::thread th;
+  for (size_t fno = 1; fno <= num_train_files; fno++) {
+    binfile = bin_folder + "/train" + std::to_string(fno) + ".bin";
+    while (true) {
+      if (th.joinable()) {
+        th.join();
+        load_time += timer.Elapsed();
+        // LOG(INFO) << "num of samples: " << n_read;
+        if (n_read < batchsize) {
+          if (n_read > 0) {
+            LOG(WARNING) << "Pls set batchsize to make num_total_samples "
+                         << "% batchsize == 0. Otherwise, the last " << n_read
+                         << " samples would not be used";
+          }
+          break;
+        }
+      }
+      if (n_read == batchsize) {
+        train_x.CopyData(prefetch_x);
+        train_y.CopyData(prefetch_y);
+      }
+      timer.Tick();
+      th = data.AsyncLoadData(kTrain, binfile, batchsize, &prefetch_x,
+                              &prefetch_y, &n_read, nthreads);
+      if (n_read < batchsize) continue;
+      CHECK_EQ(train_x.shape(0), train_y.shape(0));
+      ttr.Tick();
+      auto ret = net.TrainOnBatch(epoch, train_x, train_y);
+      train_time += ttr.Elapsed();
+      loss += ret.first;
+      metric += ret.second;
+      b++;
+    }
+    if (b % pfreq == 0) {
+      train_ch->Send(
+          "Epoch " + std::to_string(epoch) + ", training loss = " +
+          std::to_string(loss / b) + ", accuracy = " +
+          std::to_string(metric / b) + ", lr = " + std::to_string(lr) +
+          ", time of loading " + std::to_string(batchsize) + " images = " +
+          std::to_string(load_time / b) +
+          " ms, time of training (batchsize = " + std::to_string(batchsize) +
+          ") = " + std::to_string(train_time / b) + " ms.");
+      loss = 0.0f;
+      metric = 0.0f;
+      load_time = 0.0f;
+      train_time = 0.0f;
+      b = 0;
+    }
+  }
+}
+
+void TestOneEpoch(FeedForwardNet &net, ILSVRC &data,
+                  std::shared_ptr<Device> device, int epoch, string bin_folder,
+                  size_t num_test_images, size_t batchsize, Channel *val_ch,
+                  int nthreads) {
+  float loss = 0.0f, metric = 0.0f;
+  float load_time = 0.0f, eval_time = 0.0f;
+  size_t n_read;
+  string binfile = bin_folder + "/test.bin";
+  Timer timer, tte;
+  Tensor prefetch_x, prefetch_y;
+  timer.Tick();
+  data.LoadData(kEval, binfile, batchsize, &prefetch_x, &prefetch_y, &n_read,
+                nthreads);
+  load_time += timer.Elapsed();
+  Tensor test_x(prefetch_x.shape(), device);
+  Tensor test_y(prefetch_y.shape(), device, kInt);
+  int remain = (int)num_test_images - n_read;
+  CHECK_EQ(n_read, batchsize);
+  std::thread th;
+  while (true) {
+    if (th.joinable()) {
+      th.join();
+      load_time += timer.Elapsed();
+      remain -= n_read;
+      if (remain < 0) break;
+      if (n_read < batchsize) break;
+    }
+    test_x.CopyData(prefetch_x);
+    test_y.CopyData(prefetch_y);
+    timer.Tick();
+    th = data.AsyncLoadData(kEval, binfile, batchsize, &prefetch_x, &prefetch_y,
+                            &n_read, nthreads);
+
+    CHECK_EQ(test_x.shape(0), test_y.shape(0));
+    tte.Tick();
+    auto ret = net.EvaluateOnBatch(test_x, test_y);
+    eval_time += tte.Elapsed();
+    ret.first.ToHost();
+    ret.second.ToHost();
+    loss += Sum(ret.first);
+    metric += Sum(ret.second);
+  }
+  loss /= num_test_images;
+  metric /= num_test_images;
+  val_ch->Send("Epoch " + std::to_string(epoch) + ", val loss = " +
+               std::to_string(loss) + ", accuracy = " + std::to_string(metric) +
+               ", time of loading " + std::to_string(num_test_images) +
+               " images = " + std::to_string(load_time) +
+               " ms, time of evaluating " + std::to_string(num_test_images) +
+               " images = " + std::to_string(eval_time) + " ms.");
+}
+
+void Checkpoint(FeedForwardNet &net, string prefix) {
+  Snapshot snapshot(prefix, Snapshot::kWrite, 200);
+  auto names = net.GetParamNames();
+  auto values = net.GetParamValues();
+  for (size_t k = 0; k < names.size(); k++) {
+    values.at(k).ToHost();
+    snapshot.Write(names.at(k), values.at(k));
+  }
+  LOG(INFO) << "Write snapshot into " << prefix;
+}
+
+void Train(int num_epoch, float lr, size_t batchsize, size_t train_file_size,
+           string bin_folder, size_t num_train_images, size_t num_test_images,
+           size_t pfreq, int nthreads) {
+  ILSVRC data;
+  data.ReadMean(bin_folder + "/mean.bin");
+  auto net = CreateNet();
+  auto cuda = std::make_shared<CudaGPU>(0);
+  net.ToDevice(cuda);
+  SGD sgd;
+  OptimizerConf opt_conf;
+  opt_conf.set_momentum(0.9);
+  auto reg = opt_conf.mutable_regularizer();
+  reg->set_coefficient(0.0005);
+  sgd.Setup(opt_conf);
+  sgd.SetLearningRateGenerator(
+      [lr](int epoch) { return lr * std::pow(0.1, epoch / 20); });
+
+  SoftmaxCrossEntropy loss;
+  Accuracy acc;
+  net.Compile(true, &sgd, &loss, &acc);
+
+  Channel *train_ch = GetChannel("train_perf");
+  train_ch->EnableDestStderr(true);
+  Channel *val_ch = GetChannel("val_perf");
+  val_ch->EnableDestStderr(true);
+  size_t num_train_files = num_train_images / train_file_size +
+                           (num_train_images % train_file_size ? 1 : 0);
+  for (int epoch = 0; epoch < num_epoch; epoch++) {
+    float epoch_lr = sgd.GetLearningRate(epoch);
+    TrainOneEpoch(net, data, cuda, epoch, bin_folder, num_train_files,
+                  batchsize, epoch_lr, train_ch, pfreq, nthreads);
+    if (epoch % 10 == 0 && epoch > 0) {
+      string prefix = "snapshot_epoch" + std::to_string(epoch);
+      Checkpoint(net, prefix);
+    }
+    TestOneEpoch(net, data, cuda, epoch, bin_folder, num_test_images, batchsize,
+                 val_ch, nthreads);
+  }
+}
+}
+
+int main(int argc, char **argv) {
+  singa::InitChannel(nullptr);
+  int pos = singa::ArgPos(argc, argv, "-h");
+  if (pos != -1) {
+    std::cout << "Usage:\n"
+              << "\t-epoch <int>: number of epoch to be trained, default is 90;\n"
+              << "\t-lr <float>: base learning rate;\n"
+              << "\t-batchsize <int>: batchsize, it should be changed regarding "
+                 "to your memory;\n"
+              << "\t-filesize <int>: number of training images that stores in "
+                 "each binary file;\n"
+              << "\t-ntrain <int>: number of training images;\n"
+              << "\t-ntest <int>: number of test images;\n"
+              << "\t-data <folder>: the folder which stores the binary files;\n"
+              << "\t-pfreq <int>: the frequency(in batch) of printing current "
+                 "model status(loss and accuracy);\n"
+              << "\t-nthreads <int>`: the number of threads to load data which "
+                 "feed to the model.\n";
+    return 0;
+  }
+  pos = singa::ArgPos(argc, argv, "-epoch");
+  int nEpoch = 90;
+  if (pos != -1) nEpoch = atoi(argv[pos + 1]);
+
+  pos = singa::ArgPos(argc, argv, "-lr");
+  float lr = 0.01;
+  if (pos != -1) lr = atof(argv[pos + 1]);
+
+  pos = singa::ArgPos(argc, argv, "-batchsize");
+  int batchsize = 256;
+  if (pos != -1) batchsize = atof(argv[pos + 1]);
+
+  pos = singa::ArgPos(argc, argv, "-filesize");
+  size_t train_file_size = 1280;
+  if (pos != -1) train_file_size = atoi(argv[pos + 1]);
+
+  pos = singa::ArgPos(argc, argv, "-ntrain");
+  size_t num_train_images = 1281167;
+  if (pos != -1) num_train_images = atoi(argv[pos + 1]);
+
+  pos = singa::ArgPos(argc, argv, "-ntest");
+  size_t num_test_images = 50000;
+  if (pos != -1) num_test_images = atoi(argv[pos + 1]);
+
+  pos = singa::ArgPos(argc, argv, "-data");
+  string bin_folder = "imagenet_data";
+  if (pos != -1) bin_folder = argv[pos + 1];
+
+  pos = singa::ArgPos(argc, argv, "-pfreq");
+  size_t pfreq = 100;
+  if (pos != -1) pfreq = atoi(argv[pos + 1]);
+
+  pos = singa::ArgPos(argc, argv, "-nthreads");
+  int nthreads = 12;
+  if (pos != -1) nthreads = atoi(argv[pos + 1]);
+
+  LOG(INFO) << "Start training";
+  singa::Train(nEpoch, lr, batchsize, train_file_size, bin_folder,
+               num_train_images, num_test_images, pfreq, nthreads);
+  LOG(INFO) << "End training";
+}
+#endif
diff --git a/examples/imagenet/create_data.sh b/examples/imagenet/create_data.sh
new file mode 100755
index 0000000..4c2c034
--- /dev/null
+++ b/examples/imagenet/create_data.sh
@@ -0,0 +1,21 @@
+#!/usr/bin/env sh
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+# 
+#     http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# 
+
+../../build/bin/createdata -trainlist "imagenet/label/train.txt" -trainfolder "imagenet/ILSVRC2012_img_train" \
+  -testlist "imagenet/label/val.txt" -testfolder "imagenet/ILSVRC2012_img_val" -outdata "imagenet_data" -filesize 1280
diff --git a/examples/imagenet/ilsvrc12.cc b/examples/imagenet/ilsvrc12.cc
new file mode 100644
index 0000000..c9e6d2f
--- /dev/null
+++ b/examples/imagenet/ilsvrc12.cc
@@ -0,0 +1,70 @@
+/************************************************************
+*
+* Licensed to the Apache Software Foundation (ASF) under one
+* or more contributor license agreements.  See the NOTICE file
+* distributed with this work for additional information
+* regarding copyright ownership.  The ASF licenses this file
+* to you under the Apache License, Version 2.0 (the
+* "License"); you may not use this file except in compliance
+* with the License.  You may obtain a copy of the License at
+*
+*   http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing,
+* software distributed under the License is distributed on an
+* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+* KIND, either express or implied.  See the License for the
+* specific language governing permissions and limitations
+* under the License.
+*
+*************************************************************/
+#include "singa/singa_config.h"
+#ifdef USE_OPENCV
+#include "ilsvrc12.h"
+#include "singa/utils/channel.h"
+#include "singa/utils/string.h"
+int main(int argc, char **argv) {
+  int pos = singa::ArgPos(argc, argv, "-h");
+  if (pos != -1) {
+    std::cout << "Usage:\n"
+              << "\t-trainlist <file>: the file of training list;\n"
+              << "\t-trainfolder <folder>: the folder of training images;\n"
+              << "\t-testlist <file>: the file of test list;\n"
+              << "\t-testfolder <folder>: the folder of test images;\n"
+              << "\t-outdata <folder>: the folder to save output files;\n"
+              << "\t-filesize <int>: number of training images that stores in "
+                 "each binary file.\n";
+    return 0;
+  }
+  pos = singa::ArgPos(argc, argv, "-trainlist");
+  string train_image_list = "imagenet/label/train.txt";
+  if (pos != -1) train_image_list = argv[pos + 1];
+
+  pos = singa::ArgPos(argc, argv, "-trainfolder");
+  string train_image_folder = "imagenet/ILSVRC2012_img_train";
+  if (pos != -1) train_image_folder = argv[pos + 1];
+
+  pos = singa::ArgPos(argc, argv, "-testlist");
+  string test_image_list = "imagenet/label/val.txt";
+  if (pos != -1) test_image_list = argv[pos + 1];
+
+  pos = singa::ArgPos(argc, argv, "-testfolder");
+  string test_image_folder = "imagenet/ILSVRC2012_img_val";
+  if (pos != -1) test_image_folder = argv[pos + 1];
+
+  pos = singa::ArgPos(argc, argv, "-outdata");
+  string bin_folder = "imagenet_data";
+  if (pos != -1) bin_folder = argv[pos + 1];
+
+  pos = singa::ArgPos(argc, argv, "-filesize");
+  size_t train_file_size = 1280;
+  if (pos != -1) train_file_size = atoi(argv[pos + 1]);
+  singa::ILSVRC data;
+  LOG(INFO) << "Creating training and test data...";
+  data.CreateTrainData(train_image_list, train_image_folder, bin_folder,
+                       train_file_size);
+  data.CreateTestData(test_image_list, test_image_folder, bin_folder);
+  LOG(INFO) << "Data created!";
+  return 0;
+}
+#endif  // USE_OPENCV
diff --git a/examples/imagenet/ilsvrc12.h b/examples/imagenet/ilsvrc12.h
new file mode 100644
index 0000000..a6d4238
--- /dev/null
+++ b/examples/imagenet/ilsvrc12.h
@@ -0,0 +1,380 @@
+/************************************************************
+*
+* Licensed to the Apache Software Foundation (ASF) under one
+* or more contributor license agreements.  See the NOTICE file
+* distributed with this work for additional information
+* regarding copyright ownership.  The ASF licenses this file
+* to you under the Apache License, Version 2.0 (the
+* "License"); you may not use this file except in compliance
+* with the License.  You may obtain a copy of the License at
+*
+*   http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing,
+* software distributed under the License is distributed on an
+* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+* KIND, either express or implied.  See the License for the
+* specific language governing permissions and limitations
+* under the License.
+*
+*************************************************************/
+#include "singa/singa_config.h"
+#ifdef USE_OPENCV
+#ifndef SINGA_EXAMPLES_IMAGENET_ILSVRC12_H_
+#define SINGA_EXAMPLES_IMAGENET_ILSVRC12_H_
+#include <omp.h>
+#include <cstdint>
+#include <fstream>
+#include <iostream>
+#include <opencv2/highgui/highgui.hpp>
+#include <opencv2/imgproc/imgproc.hpp>
+#include <string>
+#include <thread>
+#include <vector>
+#include "singa/core/tensor.h"
+#include "singa/io/decoder.h"
+#include "singa/io/encoder.h"
+#include "singa/io/reader.h"
+#include "singa/io/transformer.h"
+#include "singa/io/writer.h"
+#include "singa/proto/io.pb.h"
+#include "singa/utils/timer.h"
+
+using std::string;
+using namespace singa::io;
+namespace singa {
+/// For reading ILSVRC2012 image data as tensors.
+class ILSVRC {
+ public:
+  /// Setup encoder, decoder
+  ILSVRC();
+  ~ILSVRC() {
+    if (encoder != nullptr) delete encoder;
+    if (decoder != nullptr) delete decoder;
+    if (transformer != nullptr) delete transformer;
+    if (reader != nullptr) {
+      reader->Close();
+      delete reader;
+    }
+    if (writer != nullptr) {
+      writer->Close();
+      delete writer;
+    }
+  }
+  /// Create binary files for training data
+  /// train_image_list: list file of training images
+  /// train_image_folder: folder where stores original training images
+  /// train_bin_folder: folder to store binary files
+  /// train_file_size: number of images that are contained in one binary file
+  void CreateTrainData(string train_image_list, string train_image_folder,
+                       string train_bin_folder, size_t train_file_size);
+  /// Create binary files for test data
+  /// test_image_list: list file of test images
+  /// test_image_folder: folder where the original test images are saved
+  /// test_bin_folder: folder to save binary files
+  void CreateTestData(string test_image_list, string test_image_folder,
+                      string test_bin_folder);
+  /// Load data from a binary file,  return <images, labels> pair
+  /// suppose the data will be loaded file by file.
+  /// flag: kTrain or kTest
+  /// file: binary file which stores the images
+  /// read_size: number of images to be loaded
+  /// offset: offset in the file
+  /// n_read: number of images which are read
+  size_t LoadData(int flag, string file, size_t read_size, Tensor *x, Tensor *y,
+                  size_t *n_read, int nthreads);
+
+  std::thread AsyncLoadData(int flag, string file, size_t read_size, Tensor *x,
+                            Tensor *y, size_t *n_read, int nthreads);
+
+  void DecodeTransform(int flag, int thid, int nthreads,
+                       vector<string *> images, Tensor *x, Tensor *y);
+  std::thread AsyncDecodeTransform(int flag, int thid, int nthreads,
+                                   vector<string *> images, Tensor *x,
+                                   Tensor *y);
+
+  /// Read mean from path
+  void ReadMean(string path);
+
+ protected:
+  /// Read one image at path, resize the image
+  Tensor ReadImage(string path);
+  /// Write buff to the file in kCreate/kAppend mode
+  void Write(string outfile, singa::io::Mode mode);
+  void WriteMean(Tensor &mean, string path);
+
+ private:
+  /// size for resizing
+  const size_t kImageSize = 256;
+  const size_t kImageNBytes = 3 * kImageSize * kImageSize;
+  /// size for cropping
+  const size_t kCropSize = 227;
+  Tensor mean;
+  string last_read_file = "";
+
+  JPGEncoder *encoder = nullptr;
+  JPGDecoder *decoder = nullptr;
+  ImageTransformer *transformer = nullptr;
+  BinFileReader *reader = nullptr;
+  BinFileWriter *writer = nullptr;
+};
+
+ILSVRC::ILSVRC() {
+  EncoderConf en_conf;
+  en_conf.set_image_dim_order("CHW");
+  encoder = new JPGEncoder();
+  encoder->Setup(en_conf);
+
+  DecoderConf de_conf;
+  de_conf.set_image_dim_order("CHW");
+  decoder = new JPGDecoder();
+  decoder->Setup(de_conf);
+
+  TransformerConf trans_conf;
+  trans_conf.add_crop_shape(kCropSize);
+  trans_conf.add_crop_shape(kCropSize);
+  trans_conf.set_image_dim_order("CHW");
+  trans_conf.set_horizontal_mirror(true);
+  transformer = new ImageTransformer();
+  transformer->Setup(trans_conf);
+}
+
+Tensor ILSVRC::ReadImage(string path) {
+  cv::Mat mat = cv::imread(path, CV_LOAD_IMAGE_COLOR);
+  CHECK(mat.data != NULL) << "OpenCV load image fail: " << path;
+  cv::Size size(kImageSize, kImageSize);
+  cv::Mat resized;
+  cv::resize(mat, resized, size);
+  CHECK_EQ((size_t)resized.size().height, kImageSize);
+  CHECK_EQ((size_t)resized.size().width, kImageSize);
+  // dimension_order: CHW
+  Shape shape{(size_t)resized.channels(), (size_t)resized.rows,
+              (size_t)resized.cols};
+  Tensor image(shape, singa::kUChar);
+  unsigned char *data = new unsigned char[kImageNBytes];
+  for (int i = 0; i < resized.rows; i++)
+    for (int j = 0; j < resized.cols; j++)
+      for (int k = 0; k < resized.channels(); k++)
+        data[k * kImageSize * kImageSize + i * kImageSize + j] =
+            resized.at<cv::Vec3b>(i, j)[k];
+  image.CopyDataFromHostPtr<unsigned char>(data, kImageNBytes);
+  delete[] data;
+
+  return image;
+}
+
+void ILSVRC::WriteMean(Tensor &mean, string path) {
+  Tensor mean_lb(Shape{1}, kInt);
+  std::vector<Tensor> input;
+  input.push_back(mean);
+  input.push_back(mean_lb);
+  BinFileWriter bfwriter;
+  bfwriter.Open(path, kCreate);
+  bfwriter.Write(path, encoder->Encode(input));
+  bfwriter.Flush();
+  bfwriter.Close();
+}
+
+void ILSVRC::CreateTrainData(string image_list, string input_folder,
+                             string output_folder, size_t file_size = 12800) {
+  std::vector<std::pair<string, int>> file_list;
+  size_t *sum = new size_t[kImageNBytes];
+  for (size_t i = 0; i < kImageNBytes; i++) sum[i] = 0u;
+  string image_file_name;
+  int label;
+  string outfile;
+  std::ifstream image_list_file(image_list.c_str(), std::ios::in);
+  while (image_list_file >> image_file_name >> label)
+    file_list.push_back(std::make_pair(image_file_name, label));
+  LOG(INFO) << "Data Shuffling";
+  std::shuffle(file_list.begin(), file_list.end(),
+               std::default_random_engine());
+  LOG(INFO) << "Total number of training images is " << file_list.size();
+  size_t num_train_images = file_list.size();
+  num_train_images = 12900;
+  if (file_size == 0) file_size = num_train_images;
+  // todo: accelerate with omp
+  for (size_t imageid = 0; imageid < num_train_images; imageid++) {
+    string path = input_folder + "/" + file_list[imageid].first;
+    Tensor image = ReadImage(path);
+    auto image_data = image.data<unsigned char>();
+    for (size_t i = 0; i < kImageNBytes; i++)
+      sum[i] += static_cast<size_t>(image_data[i]);
+    label = file_list[imageid].second;
+    Tensor lb(Shape{1}, kInt);
+    lb.CopyDataFromHostPtr<int>(&label, 1);
+    std::vector<Tensor> input;
+    input.push_back(image);
+    input.push_back(lb);
+    //  LOG(INFO) << path << "\t" << label;
+    string encoded_str = encoder->Encode(input);
+    if (writer == nullptr) {
+      writer = new BinFileWriter();
+      outfile = output_folder + "/train" +
+                std::to_string(imageid / file_size + 1) + ".bin";
+      writer->Open(outfile, kCreate);
+    }
+    writer->Write(path, encoded_str);
+    if ((imageid + 1) % file_size == 0) {
+      writer->Flush();
+      writer->Close();
+      LOG(INFO) << "Write " << file_size << " images into " << outfile;
+      delete writer;
+      writer = nullptr;
+    }
+  }
+  if (writer != nullptr) {
+    writer->Flush();
+    writer->Close();
+    LOG(INFO) << "Write " << num_train_images % file_size << " images into "
+              << outfile;
+    delete writer;
+    writer = nullptr;
+  }
+  size_t num_file =
+      num_train_images / file_size + ((num_train_images % file_size) ? 1 : 0);
+  LOG(INFO) << "Write " << num_train_images << " images into " << num_file
+            << " binary files";
+  Tensor mean = Tensor(Shape{3, kImageSize, kImageSize}, kUChar);
+  unsigned char *mean_data = new unsigned char[kImageNBytes];
+  for (size_t i = 0; i < kImageNBytes; i++)
+    mean_data[i] = static_cast<unsigned char>(sum[i] / num_train_images);
+  mean.CopyDataFromHostPtr<unsigned char>(mean_data, kImageNBytes);
+  string mean_path = output_folder + "/mean.bin";
+  WriteMean(mean, mean_path);
+  delete[] mean_data;
+  delete[] sum;
+}
+
+void ILSVRC::CreateTestData(string image_list, string input_folder,
+                            string output_folder) {
+  std::vector<std::pair<string, int>> file_list;
+  string image_file_name;
+  string outfile = output_folder + "/test.bin";
+  int label;
+  std::ifstream image_list_file(image_list.c_str(), std::ios::in);
+  while (image_list_file >> image_file_name >> label)
+    file_list.push_back(std::make_pair(image_file_name, label));
+  LOG(INFO) << "Total number of test images is " << file_list.size();
+  size_t num_test_images = file_list.size();
+  num_test_images = 500;
+  for (size_t imageid = 0; imageid < num_test_images; imageid++) {
+    string path = input_folder + "/" + file_list[imageid].first;
+    Tensor image = ReadImage(path);
+    label = file_list[imageid].second;
+    Tensor lb(Shape{1}, singa::kInt);
+    lb.CopyDataFromHostPtr<int>(&label, 1);
+    std::vector<Tensor> input;
+    input.push_back(image);
+    input.push_back(lb);
+    string encoded_str = encoder->Encode(input);
+    if (writer == nullptr) {
+      writer = new BinFileWriter();
+      writer->Open(outfile, kCreate);
+    }
+    writer->Write(path, encoded_str);
+  }
+  if (writer != nullptr) {
+    writer->Flush();
+    writer->Close();
+    delete writer;
+    writer = nullptr;
+  }
+  LOG(INFO) << "Write " << num_test_images << " images into " << outfile;
+}
+
+void ILSVRC::ReadMean(string path) {
+  BinFileReader bfreader;
+  string key, value;
+  bfreader.Open(path);
+  bfreader.Read(&key, &value);
+  auto ret = decoder->Decode(value);
+  bfreader.Close();
+  mean = ret[0];
+}
+/// A wrapper method to spawn a thread to execute LoadData() method.
+std::thread ILSVRC::AsyncLoadData(int flag, string file, size_t read_size,
+                                  Tensor *x, Tensor *y, size_t *n_read,
+                                  int nthreads) {
+  return std::thread(
+      [=]() { LoadData(flag, file, read_size, x, y, n_read, nthreads); });
+}
+
+size_t ILSVRC::LoadData(int flag, string file, size_t read_size, Tensor *x,
+                        Tensor *y, size_t *n_read, int nthreads) {
+  x->Reshape(Shape{read_size, 3, kCropSize, kCropSize});
+  y->AsType(kInt);
+  y->Reshape(Shape{read_size});
+  if (file != last_read_file) {
+    if (reader != nullptr) {
+      reader->Close();
+      delete reader;
+      reader = nullptr;
+    }
+    reader = new BinFileReader();
+    reader->Open(file, 100 << 20);
+    last_read_file = file;
+  } else if (reader == nullptr) {
+    reader = new BinFileReader();
+    reader->Open(file, 100 << 20);
+  }
+  vector<string *> images;
+  for (size_t i = 0; i < read_size; i++) {
+    string image_path;
+    string *image = new string();
+    bool ret = reader->Read(&image_path, image);
+    if (ret == false) {
+      reader->Close();
+      delete reader;
+      reader = nullptr;
+      break;
+    }
+    images.push_back(image);
+  }
+  int nimg = images.size();
+  *n_read = nimg;
+
+  vector<std::thread> threads;
+  for (int i = 1; i < nthreads; i++) {
+    threads.push_back(AsyncDecodeTransform(flag, i, nthreads, images, x, y));
+  }
+  DecodeTransform(flag, 0, nthreads, images, x, y);
+  for (size_t i = 0; i < threads.size(); i++) threads[i].join();
+  for (int k = 0; k < nimg; k++) delete images.at(k);
+  return nimg;
+}
+
+/// A wrapper method to spawn a thread to execute DecodeTransform() method.
+std::thread ILSVRC::AsyncDecodeTransform(int flag, int thid, int nthreads,
+                                         vector<string *> images, Tensor *x,
+                                         Tensor *y) {
+  return std::thread(
+      [=]() { DecodeTransform(flag, thid, nthreads, images, x, y); });
+}
+
+void ILSVRC::DecodeTransform(int flag, int thid, int nthreads,
+                             vector<string *> images, Tensor *x, Tensor *y) {
+  int nimg = images.size();
+  int start = nimg / nthreads * thid;
+  int end = start + nimg / nthreads;
+  for (int k = start; k < end; k++) {
+    std::vector<Tensor> pair = decoder->Decode(*images.at(k));
+    auto tmp_image = pair[0] - mean;
+    Tensor aug_image = transformer->Apply(flag, tmp_image);
+    CopyDataToFrom(x, aug_image, aug_image.Size(), k * aug_image.Size());
+    CopyDataToFrom(y, pair[1], 1, k);
+  }
+  if (thid == 0) {
+    for (int k = nimg / nthreads * nthreads; k < nimg; k++) {
+      std::vector<Tensor> pair = decoder->Decode(*images.at(k));
+      auto tmp_image = pair[0] - mean;
+      Tensor aug_image = transformer->Apply(flag, tmp_image);
+      CopyDataToFrom(x, aug_image, aug_image.Size(), k * aug_image.Size());
+      CopyDataToFrom(y, pair[1], 1, k);
+    }
+  }
+}
+}  // namespace singa
+
+#endif  // SINGA_EXAMPLES_IMAGENET_ILSVRC12_H_
+#endif  // USE_OPENCV
diff --git a/examples/imagenet/run.sh b/examples/imagenet/run.sh
new file mode 100755
index 0000000..6277d23
--- /dev/null
+++ b/examples/imagenet/run.sh
@@ -0,0 +1,21 @@
+#!/usr/bin/env sh
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+# 
+#     http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# 
+
+../../build/bin/imagenet -epoch 90 -lr 0.01 -batchsize 256 -filesize 1280 -ntrain 1281167 -ntest 50000 \
+  -data "imagenet_data" -pfreq 100 -nthreads 12
diff --git a/examples/index.rst b/examples/index.rst
new file mode 100644
index 0000000..b501b36
--- /dev/null
+++ b/examples/index.rst
@@ -0,0 +1,28 @@
+.. 
+.. Licensed to the Apache Software Foundation (ASF) under one
+.. or more contributor license agreements.  See the NOTICE file
+.. distributed with this work for additional information
+.. regarding copyright ownership.  The ASF licenses this file
+.. to you under the Apache License, Version 2.0 (the
+.. "License"); you may not use this file except in compliance
+.. with the License.  You may obtain a copy of the License at
+.. 
+..     http://www.apache.org/licenses/LICENSE-2.0
+.. 
+.. Unless required by applicable law or agreed to in writing, software
+.. distributed under the License is distributed on an "AS IS" BASIS,
+.. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+.. See the License for the specific language governing permissions and
+.. limitations under the License.
+.. 
+
+Examples
+========
+
+.. toctree::
+
+   cifar10/README
+   char-rnn/README
+   imagenet/README
+
+
diff --git a/examples/mnist/Makefile.example b/examples/mnist/Makefile.example
deleted file mode 100644
index a041359..0000000
--- a/examples/mnist/Makefile.example
+++ /dev/null
@@ -1,49 +0,0 @@
-#
-#/**
-# * Licensed to the Apache Software Foundation (ASF) under one
-# * or more contributor license agreements.  See the NOTICE file
-# * distributed with this work for additional information
-# * regarding copyright ownership.  The ASF licenses this file
-# * to you under the Apache License, Version 2.0 (the
-# * "License"); you may not use this file except in compliance
-# * with the License.  You may obtain a copy of the License at
-# *
-# *     http://www.apache.org/licenses/LICENSE-2.0
-# *
-# * Unless required by applicable law or agreed to in writing, software
-# * distributed under the License is distributed on an "AS IS" BASIS,
-# * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# * See the License for the specific language governing permissions and
-# * limitations under the License.
-# */
-
-libs :=singa glog protobuf
-
-.PHONY: all download create
-
-HDFS_MNIST_TRAIN := hdfs://node0:9000/examples/mnist/train_data.bin
-HDFS_MNIST_TEST := hdfs://node0:9000/examples/mnist/test_data.bin
-
-download: mnist
-
-mnist:
-	wget http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz
-	wget http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz
-	wget http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz
-	wget http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz
-	gunzip train-images-idx3-ubyte.gz && gunzip train-labels-idx1-ubyte.gz
-	gunzip t10k-images-idx3-ubyte.gz && gunzip t10k-labels-idx1-ubyte.gz
-
-create:
-	$(CXX) create_data.cc -std=c++11 -lsinga -lprotobuf -lglog -I../../include \
-		-L../../.libs/ -Wl,-unresolved-symbols=ignore-in-shared-libs -Wl,-rpath=../../.libs/ \
-		-o create_data.bin
-	./create_data.bin train-images-idx3-ubyte train-labels-idx1-ubyte train_data.bin
-	./create_data.bin t10k-images-idx3-ubyte t10k-labels-idx1-ubyte test_data.bin
-
-create_hdfs:
-	$(CXX) create_data.cc -std=c++11 -lsinga -lprotobuf -lglog -lhdfs3 -I../../include \
-		-L../../.libs/ -Wl,-unresolved-symbols=ignore-in-shared-libs -Wl,-rpath=../../.libs/ \
-		-o create_data.bin
-	./create_data.bin train-images-idx3-ubyte train-labels-idx1-ubyte $(HDFS_MNIST_TRAIN)
-	./create_data.bin t10k-images-idx3-ubyte t10k-labels-idx1-ubyte $(HDFS_MNIST_TEST)
diff --git a/examples/mnist/README.md b/examples/mnist/README.md
new file mode 100644
index 0000000..60a85e0
--- /dev/null
+++ b/examples/mnist/README.md
@@ -0,0 +1,18 @@
+# Train a RBM model against MNIST dataset
+
+This example is to train an RBM model using the
+MNIST dataset. The RBM model and its hyper-parameters are set following
+[Hinton's paper](http://www.cs.toronto.edu/~hinton/science.pdf)
+
+## Running instructions
+
+1. Download the pre-processed [MNIST dataset](https://github.com/mnielsen/neural-networks-and-deep-learning/raw/master/data/mnist.pkl.gz)
+
+2. Start the training
+
+        python train.py mnist.pkl.gz
+
+By default the training code would run on CPU. To run it on a GPU card, please start
+the program with an additional argument
+
+        python train.py mnist.pkl.gz --use_gpu
diff --git a/examples/mnist/conv.conf b/examples/mnist/conv.conf
deleted file mode 100644
index 7818af1..0000000
--- a/examples/mnist/conv.conf
+++ /dev/null
@@ -1,187 +0,0 @@
-name: "conv"
-train_steps: 10000
-test_steps:100
-test_freq:500
-disp_freq:50
-train_one_batch {
-  alg: kBP
-}
-updater {
-  momentum:0.9
-  weight_decay:0.0005
-  type: kSGD
-  learning_rate {
-    type : kInverse
-    base_lr:0.01
-    inverse_conf {
-      gamma:0.0001
-      pow:0.75
-    }
-  }
-}
-neuralnet {
-  layer {
-    name: "data"
-    type: kRecordInput
-    store_conf {
-      backend: "kvfile"
-      path: "examples/mnist/train_data.bin"
-      batchsize: 64
-      std_value: 255
-      random_skip: 5000
-      shape: 1
-      shape: 28
-      shape: 28
-    }
-    include: kTrain
-  }
-
-  layer {
-    name: "data"
-    type: kRecordInput
-    store_conf {
-      backend: "kvfile"
-      path: "examples/mnist/test_data.bin"
-      std_value: 255
-      batchsize: 100
-      shape: 1
-      shape: 28
-      shape: 28
-    }
-    include: kTest
-  }
-
-  layer {
-    name: "conv1"
-    type: kCConvolution
-    srclayers: "data"
-    convolution_conf {
-      num_filters: 20
-      kernel: 5
-      stride: 1
-    }
-    param{
-      name: "w1"
-      init {
-        type : kUniformSqrtFanIn
-      }
-    }
-    param{
-      name: "b1"
-      init {
-        type : kConstant
-        value:0
-      }
-      lr_scale:2.0
-    }
-  }
-  layer {
-    name: "pool1"
-    type: kCPooling
-    srclayers: "conv1"
-    pooling_conf {
-      pool: MAX
-      kernel: 2
-      stride: 2
-    }
-  }
-  layer {
-    name: "conv2"
-    type: kCConvolution
-    srclayers: "pool1"
-    convolution_conf {
-      num_filters: 50
-      kernel: 5
-      stride: 1
-    }
-    param{
-      name: "w2"
-      init {
-        type :kUniformSqrtFanIn
-      }
-    }
-    param{
-      name: "b2"
-      init {
-        type : kConstant
-        value:0
-      }
-      lr_scale:2.0
-    }
-  }
-  layer {
-    name: "pool2"
-    type: kCPooling
-    srclayers: "conv2"
-    pooling_conf {
-      pool: MAX
-      kernel: 2
-      stride: 2
-    }
-  }
-  layer {
-    name: "ip1"
-    type: kInnerProduct
-    srclayers:"pool2"
-    innerproduct_conf {
-      num_output: 500
-    }
-    param{
-      name: "w3"
-      init {
-        type :kUniformSqrtFanIn
-      }
-    }
-    param{
-      name: "b3"
-      init {
-        type : kConstant
-        value:0
-      }
-      lr_scale:2.0
-    }
-  }
-
-  layer {
-    name: "relu1"
-    type: kReLU
-    srclayers:"ip1"
-  }
-
-  layer {
-    name: "ip2"
-    type: kInnerProduct
-    srclayers:"relu1"
-    innerproduct_conf {
-      num_output: 10
-    }
-    param {
-      name: "w4"
-      init {
-        type :kUniformSqrtFanIn
-      }
-    }
-    param {
-      name: "b4"
-      init {
-        type : kConstant
-        value:0
-      }
-      lr_scale:2
-    }
-  }
-  layer{
-    name: "loss"
-    type: kSoftmaxLoss
-    softmaxloss_conf{
-      topk:1
-    }
-    srclayers:"ip2"
-    srclayers:"data"
-  }
-}
-cluster {
-  nworker_groups: 1
-  nserver_groups: 1
-  workspace: "examples/mnist"
-}
diff --git a/examples/mnist/create_data.cc b/examples/mnist/create_data.cc
deleted file mode 100644
index 34c287f..0000000
--- a/examples/mnist/create_data.cc
+++ /dev/null
@@ -1,125 +0,0 @@
-/************************************************************
-*
-* Licensed to the Apache Software Foundation (ASF) under one
-* or more contributor license agreements.  See the NOTICE file
-* distributed with this work for additional information
-* regarding copyright ownership.  The ASF licenses this file
-* to you under the Apache License, Version 2.0 (the
-* "License"); you may not use this file except in compliance
-* with the License.  You may obtain a copy of the License at
-*
-*   http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an
-* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-* KIND, either express or implied.  See the License for the
-* specific language governing permissions and limitations
-* under the License.
-*
-*************************************************************/
-
-//
-// This code creates DataShard for MNIST dataset.
-// It is adapted from the convert_mnist_data from Caffe
-//
-// Usage:
-//    create_shard.bin input_image_file input_label_file output_folder
-// The MNIST dataset could be downloaded at
-//    http://yann.lecun.com/exdb/mnist/
-
-#include <glog/logging.h>
-#include <cstdint>
-#include <iostream>
-
-#include <fstream>
-#include <string>
-
-#include "singa/io/store.h"
-#include "singa/utils/common.h"
-#include "singa/proto/common.pb.h"
-
-using std::string;
-
-uint32_t swap_endian(uint32_t val) {
-    val = ((val << 8) & 0xFF00FF00) | ((val >> 8) & 0xFF00FF);
-    return (val << 16) | (val >> 16);
-}
-
-// output is the full path, unlike create_data in CIFAR with only
-// specifies the directory
-void create_data(const char* image_filename, const char* label_filename,
-        const char* output) {
-  // Open files
-  std::ifstream image_file(image_filename, std::ios::in | std::ios::binary);
-  std::ifstream label_file(label_filename, std::ios::in | std::ios::binary);
-  CHECK(image_file) << "Unable to open file " << image_filename;
-  CHECK(label_file) << "Unable to open file " << label_filename;
-  // Read the magic and the meta data
-  uint32_t magic;
-  uint32_t num_items;
-  uint32_t num_labels;
-  uint32_t rows;
-  uint32_t cols;
-
-  image_file.read(reinterpret_cast<char*>(&magic), 4);
-  magic = swap_endian(magic);
-  CHECK_EQ(magic, 2051) << "Incorrect image file magic.";
-  label_file.read(reinterpret_cast<char*>(&magic), 4);
-  magic = swap_endian(magic);
-  CHECK_EQ(magic, 2049) << "Incorrect label file magic.";
-  image_file.read(reinterpret_cast<char*>(&num_items), 4);
-  num_items = swap_endian(num_items);
-  label_file.read(reinterpret_cast<char*>(&num_labels), 4);
-  num_labels = swap_endian(num_labels);
-  CHECK_EQ(num_items, num_labels);
-  image_file.read(reinterpret_cast<char*>(&rows), 4);
-  rows = swap_endian(rows);
-  image_file.read(reinterpret_cast<char*>(&cols), 4);
-  cols = swap_endian(cols);
-
-  // read backend from the job.conf
-  string store_backend = string(output).find("hdfs") != -1 ?
-                         "hdfsfile" : "kvfile";
-  auto store = singa::io::OpenStore(store_backend, output, singa::io::kCreate);
-  char label;
-  char* pixels = new char[rows * cols];
-  int count = 0;
-  const int kMaxKeyLength = 10;
-  char key[kMaxKeyLength];
-  string value;
-
-  singa::RecordProto image;
-  image.add_shape(rows);
-  image.add_shape(cols);
-  LOG(INFO) << "A total of " << num_items << " items.";
-  LOG(INFO) << "Rows: " << rows << " Cols: " << cols;
-  for (int item_id = 0; item_id < num_items; ++item_id) {
-    image_file.read(pixels, rows * cols);
-    label_file.read(&label, 1);
-    image.set_pixel(pixels, rows*cols);
-    image.set_label(label);
-    snprintf(key, kMaxKeyLength, "%08d", item_id);
-    image.SerializeToString(&value);
-    store->Write(string(key), value);
-  }
-  delete pixels;
-  store->Flush();
-  delete store;
-}
-
-int main(int argc, char** argv) {
-  if (argc != 4) {
-    std::cout << "This program create a DataShard for a MNIST dataset\n"
-        "Usage:\n"
-        "    create_shard.bin  input_image_file input_label_file"
-        " output_db_file\n"
-        "The MNIST dataset could be downloaded at\n"
-        "    http://yann.lecun.com/exdb/mnist/\n"
-        "You should gunzip them after downloading.";
-  } else {
-    google::InitGoogleLogging(argv[0]);
-    create_data(argv[1], argv[2], argv[3]);
-  }
-  return 0;
-}
diff --git a/examples/mnist/job.conf b/examples/mnist/job.conf
deleted file mode 100644
index 41d6b6f..0000000
--- a/examples/mnist/job.conf
+++ /dev/null
@@ -1,241 +0,0 @@
-name: "mlp"
-train_steps: 1000
-test_steps:10
-test_freq:60
-disp_freq:10
-train_one_batch {
-  alg: kBP
-}
-updater{
-  type: kSGD
-  learning_rate{
-    type : kStep
-    base_lr: 0.001
-    step_conf{
-      change_freq: 60
-      gamma: 0.997
-    }
-  }
-}
-
-neuralnet {
-  layer {
-    name: "data"
-    type: kRecordInput
-    store_conf {
-      backend: "kvfile"
-      path: "examples/mnist/train_data.bin"
-      random_skip: 5000
-      batchsize: 64
-      shape: 784
-      std_value: 127.5
-      mean_value: 127.5
-    }
-    include: kTrain
-  }
-
-  layer {
-    name: "data"
-    type: kRecordInput
-    store_conf {
-      backend: "kvfile"
-      path: "examples/mnist/test_data.bin"
-      batchsize: 100
-      shape: 784
-      std_value: 127.5
-      mean_value: 127.5
-    }
-    include: kTest
-  }
-
-  layer{
-    name: "fc1"
-    type: kInnerProduct
-    srclayers:"data"
-    innerproduct_conf{
-      num_output: 2500
-    }
-    param{
-      name: "w1"
-      init {
-        type: kUniform
-        low:-0.05
-        high:0.05
-      }
-    }
-    param{
-      name: "b1"
-      init {
-        type : kUniform
-        low: -0.05
-        high:0.05
-      }
-    }
-  }
-
-  layer{
-    name: "tanh1"
-    type: kSTanh
-    srclayers:"fc1"
-  }
-  layer{
-    name: "fc2"
-    type: kInnerProduct
-    srclayers:"tanh1"
-    innerproduct_conf{
-      num_output: 2000
-    }
-    param{
-      name: "w2"
-      init {
-        type: kUniform
-        low:-0.05
-        high:0.05
-      }
-    }
-    param{
-      name: "b2"
-      init {
-        type: kUniform
-        low: -0.05
-        high:0.05
-      }
-    }
-  }
-
-  layer{
-    name: "tanh2"
-    type: kSTanh
-    srclayers:"fc2"
-  }
-  layer{
-    name: "fc3"
-    type:  kInnerProduct
-    srclayers:"tanh2"
-    innerproduct_conf{
-      num_output: 1500
-    }
-    param{
-      name: "w3"
-      init{
-        type: kUniform
-        low:-0.05
-        high:0.05
-      }
-    }
-    param{
-      name: "b3"
-      init {
-        type : kUniform
-        low: -0.05
-        high:0.05
-      }
-    }
-
-  }
-
-  layer{
-    name: "tanh3"
-    type: kSTanh
-    srclayers:"fc3"
-  }
-  layer{
-    name: "fc4"
-    type: kInnerProduct
-    srclayers:"tanh3"
-    innerproduct_conf{
-      num_output: 1000
-    }
-    param{
-      name: "w4"
-      init {
-        type : kUniform
-        low:-0.05
-        high:0.05
-      }
-    }
-    param{
-      name: "b4"
-      init {
-        type : kUniform
-        low: -0.05
-        high:0.05
-      }
-    }
-
-  }
-
-  layer{
-    name: "tanh4"
-    type: kSTanh
-    srclayers:"fc4"
-  }
-  layer{
-    name: "fc5"
-    type: kInnerProduct
-    srclayers:"tanh4"
-    innerproduct_conf{
-      num_output: 500
-    }
-    param{
-      name: "w5"
-      init {
-        type : kUniform
-        low:-0.05
-        high:0.05
-      }
-    }
-    param{
-      name: "b5"
-      init {
-        type : kUniform
-        low: -0.05
-        high:0.05
-      }
-    }
-  }
-
-  layer{
-    name: "tanh5"
-    type: kSTanh
-    srclayers:"fc5"
-  }
-  layer{
-    name: "fc6"
-    type: kInnerProduct
-    srclayers:"tanh5"
-    innerproduct_conf{
-      num_output: 10
-    }
-    param{
-      name: "w6"
-      init {
-        type : kUniform
-        low:-0.05
-        high:0.05
-      }
-    }
-    param{
-      name: "b6"
-      init {
-        type : kUniform
-        low: -0.05
-        high:0.05
-      }
-    }
-  }
-  layer{
-    name: "loss"
-    type:kSoftmaxLoss
-    softmaxloss_conf{
-      topk:1
-    }
-    srclayers:"fc6"
-    srclayers:"data"
-  }
-}
-cluster {
-  nworker_groups: 1
-  nserver_groups: 1
-  workspace: "examples/mnist"
-}
diff --git a/examples/mnist/rbm_job.conf b/examples/mnist/rbm_job.conf
deleted file mode 100644
index 59a58dd..0000000
--- a/examples/mnist/rbm_job.conf
+++ /dev/null
@@ -1,95 +0,0 @@
-cluster {
-  nworker_groups: 1
-  nserver_groups: 1
-  nservers_per_group: 1
-  nworkers_per_group: 1
-  workspace: "examples/mnist"
-}
-
-model {
-  name: "deep-big-simple-dbm"
-  train_steps: 46000
-  test_steps:1
-  test_frequency:1000
-  display_frequency: 100
-  alg: kContrastiveDivergence
-  pcd_k: 15
-  updater{
-    base_lr: 0.1
-    lr_change: kFixed
-    type: kSGD
-  }
-
-  neuralnet {
-  layer {
-    name: "data"
-    type: kShardData
-    sharddata_conf {
-      path: "examples/mnist/mnist_train_shard"
-      batchsize: 20
-    }
-    include: kTrain
-  }
-
-
-  layer {
-    name: "data"
-    type: kShardData
-    sharddata_conf {
-      path: "examples/mnist/mnist_test_shard"
-      batchsize: 20
-    }
-    include: kTest
-  }
-
-
-  layer{
-    name:"mnist"
-    type: kMnist
-    srclayers: "data"
-    mnist_conf {
-      norm_a: 255
-      norm_b: 0
-    }
-  }
-
-  layer{
-    name: "RBMVis"
-    type: kRBMVis
-    srclayers:"mnist"
-    srclayers:"RBMHid"
-    rbmvis_conf{
-      num_output: 500
-    }
-    param{
-      name: "w1"
-      init_method: kUniformSqrtFanInOut
-      low:-9.79
-      high:9.79
-    }
-    param{
-      name: "b1"
-      init_method: kConstant
-      value: 0.0
-    }
-  }
-
-  layer{
-    name: "RBMHid"
-    type: kRBMHid
-    srclayers:"RBMVis"
-    rbmhid_conf{
-      hid_dim: 500
-    }
-    param{
-      name: "w2"
-      share_from: "w1"
-    }
-    param{
-      name: "b2"
-      init_method: kConstant
-      value: 0.0
-    }
-  }
-  }
-}
diff --git a/examples/mnist/train.py b/examples/mnist/train.py
new file mode 100644
index 0000000..0a00358
--- /dev/null
+++ b/examples/mnist/train.py
@@ -0,0 +1,133 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# =============================================================================
+
+import numpy as np
+import os
+import gzip
+import argparse
+import cPickle
+from singa import initializer
+from singa import utils
+from singa import optimizer
+from singa import device
+from singa import tensor
+
+
+from singa.proto import core_pb2
+
+
+
+def load_train_data(file_path):
+    f = gzip.open(file_path, 'rb')
+    train_set, valid_set, test_set = cPickle.load(f)
+    traindata = train_set[0].astype(np.float32)
+    validdata = valid_set[0].astype(np.float32)
+    print traindata.shape, validdata.shape
+    return traindata, validdata
+
+
+
+def train(data_file, use_gpu, num_epoch=10, batch_size=100):
+    print 'Start intialization............'
+    lr = 0.1   # Learning rate
+    weight_decay  = 0.0002
+    hdim = 1000
+    vdim = 784
+    opt = optimizer.SGD(momentum=0.8, weight_decay=weight_decay)
+
+    tweight = tensor.Tensor((vdim, hdim))
+    tweight.gaussian(0.0, 0.1)
+    tvbias = tensor.from_numpy(np.zeros(vdim, dtype = np.float32))
+    thbias = tensor.from_numpy(np.zeros(hdim, dtype = np.float32))
+    opt = optimizer.SGD(momentum=0.5, weight_decay=weight_decay)
+
+    print 'Loading data ..................'
+    train_x, valid_x = load_train_data(data_file)
+
+    if use_gpu:
+        dev = device.create_cuda_gpu()
+    else:
+        dev = device.get_default_device()
+
+    for t in [tweight, tvbias, thbias]:
+        t.to_device(dev)
+
+    num_train_batch = train_x.shape[0] / batch_size
+    print "num_train_batch = %d " % (num_train_batch)
+    for epoch in range(num_epoch):
+        trainerrorsum = 0.0
+        print 'Epoch %d' % epoch
+        for b in range(num_train_batch):
+            # positive phase
+            tdata = tensor.from_numpy(
+                    train_x[(b * batch_size):((b + 1) * batch_size), : ])
+            tdata.to_device(dev)
+            tposhidprob = tensor.mult(tdata, tweight)
+            tposhidprob.add_row(thbias)
+            tposhidprob = tensor.sigmoid(tposhidprob)
+            tposhidrandom = tensor.Tensor(tposhidprob.shape, dev)
+            tposhidrandom.uniform(0.0, 1.0)
+            tposhidsample = tensor.gt(tposhidprob, tposhidrandom)
+
+            # negative phase
+            tnegdata = tensor.mult(tposhidsample, tweight.T())
+            tnegdata.add_row(tvbias)
+            tnegdata = tensor.sigmoid(tnegdata)
+
+            tneghidprob = tensor.mult(tnegdata, tweight)
+            tneghidprob.add_row(thbias)
+            tneghidprob = tensor.sigmoid(tneghidprob)
+            error = tensor.sum(tensor.square((tdata - tnegdata)))
+            trainerrorsum = error + trainerrorsum
+
+            tgweight = tensor.mult(tnegdata.T(), tneghidprob) -\
+                    tensor.mult(tdata.T(), tposhidprob)
+            tgvbias = tensor.sum(tnegdata, 0) - tensor.sum(tdata, 0)
+            tghbias = tensor.sum(tneghidprob, 0) - tensor.sum(tposhidprob, 0)
+
+            opt.apply_with_lr(epoch, lr / batch_size, tgweight, tweight, 'w')
+            opt.apply_with_lr(epoch, lr / batch_size, tgvbias, tvbias, 'vb')
+            opt.apply_with_lr(epoch, lr / batch_size, tghbias, thbias, 'hb')
+
+        print 'training errorsum = %f' % (trainerrorsum)
+
+        tvaliddata = tensor.from_numpy(valid_x)
+        tvaliddata.to_device(dev)
+        tvalidposhidprob = tensor.mult(tvaliddata, tweight)
+        tvalidposhidprob.add_row(thbias)
+        tvalidposhidprob = tensor.sigmoid(tvalidposhidprob)
+        tvalidposhidrandom = tensor.Tensor(tvalidposhidprob.shape, dev)
+        initializer.uniform(tvalidposhidrandom, 0.0, 1.0)
+        tvalidposhidsample = tensor.gt(tvalidposhidprob, tvalidposhidrandom)
+
+        tvalidnegdata = tensor.mult(tvalidposhidsample, tweight.T())
+        tvalidnegdata.add_row(tvbias)
+        tvalidnegdata = tensor.sigmoid(tvalidnegdata)
+
+        validerrorsum = tensor.sum(tensor.square((tvaliddata - tvalidnegdata)))
+        print 'valid errorsum = %f' % (validerrorsum)
+
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser(description='Train RBM over MNIST')
+    parser.add_argument('file', type=str, help='the dataset path')
+    parser.add_argument('--use_gpu', action='store_true')
+    args = parser.parse_args()
+
+    assert os.path.exists(args.file), 'Pls download the MNIST dataset from' \
+            'https://github.com/mnielsen/neural-networks-and-deep-learning/raw/master/data/mnist.pkl.gz'
+    train(args.file, args.use_gpu)
diff --git a/examples/rbm/autoencoder.conf b/examples/rbm/autoencoder.conf
deleted file mode 100644
index 223ad0d..0000000
--- a/examples/rbm/autoencoder.conf
+++ /dev/null
@@ -1,229 +0,0 @@
-name: "auto-encoder"
-train_steps: 12200
-test_steps:100
-test_freq:1000
-disp_freq:100
-checkpoint_path: "examples/rbm/rbm4/checkpoint/step6000-worker0"
-checkpoint_path: "examples/rbm/rbm3/checkpoint/step6000-worker0"
-checkpoint_path: "examples/rbm/rbm2/checkpoint/step6000-worker0"
-checkpoint_path: "examples/rbm/rbm1/checkpoint/step6000-worker0"
-train_one_batch{
-  alg: kBP
-}
-updater{
-  type: kAdaGrad
-  learning_rate{
-  base_lr: 0.01
-  type: kFixed
-  }
-}
-
-neuralnet {
-  layer {
-    name: "data"
-    type: kRecordInput
-    store_conf {
-      backend: "kvfile"
-      path: "examples/mnist/train_data.bin"
-      batchsize: 100
-      std_value: 255
-      shape: 784
-    }
-    include: kTrain
-  }
-
-  layer {
-    name: "data"
-    type: kRecordInput
-    store_conf {
-      backend: "kvfile"
-      path: "examples/mnist/test_data.bin"
-      std_value: 255
-      batchsize: 100
-      shape: 784
-    }
-    include: kTest
-  }
-
-
-  layer{
-    name: "Inner1"
-    type: kInnerProduct
-    srclayers:"data"
-    innerproduct_conf{
-      num_output: 1000
-    }
-    param{
-      name: "w1"
-    }
-    param{
-      name: "b12"
-    }
-  }
-
-  layer{
-    name: "Sigmoid1"
-    type: kSigmoid
-    srclayers:"Inner1"
-  }
-  layer{
-    name: "Inner2"
-    type: kInnerProduct
-    srclayers:"Sigmoid1"
-    innerproduct_conf{
-      num_output: 500
-    }
-    param{
-      name: "w2"
-    }
-    param{
-      name: "b22"
-    }
-  }
-
-  layer{
-    name: "Sigmoid2"
-    type: kSigmoid
-    srclayers:"Inner2"
-  }
-
-  layer{
-    name: "Inner3"
-    type:  kInnerProduct
-    srclayers:"Sigmoid2"
-    innerproduct_conf{
-      num_output: 250
-    }
-    param{
-      name: "w3"
-    }
-    param{
-      name: "b32"
-    }
-  }
-
-  layer{
-    name: "Sigmoid3"
-    type: kSigmoid
-    srclayers:"Inner3"
-  }
-
-  layer{
-    name: "Inner4"
-    type: kInnerProduct
-    srclayers:"Sigmoid3"
-    innerproduct_conf{
-      num_output: 30
-    }
-    param{
-      name: "w4"
-    }
-    param{
-      name: "b42"
-
-    }
-  }
-
-  layer{
-    name: "Inner5"
-    type: kInnerProduct
-    #srclayers:"Sigmoid4"
-    srclayers:"Inner4"
-    innerproduct_conf{
-      num_output: 250
-      transpose: true
-    }
-    param{
-      name: "w5"
-      share_from: "w4"
-    }
-    param{
-      name: "b41"
-    }
-  }
-
-  layer{
-    name: "Sigmoid5"
-    type: kSigmoid
-    srclayers:"Inner5"
-  }
-  layer{
-    name: "Inner6"
-    type: kInnerProduct
-    srclayers:"Sigmoid5"
-    innerproduct_conf{
-      num_output: 500
-      transpose: true
-    }
-    param{
-      name: "w6"
-      share_from: "w3"
-    }
-    param{
-      name: "b31"
-    }
-  }
-
-  layer{
-    name: "Sigmoid6"
-    type: kSigmoid
-    srclayers:"Inner6"
-  }
- layer{
-    name: "Inner7"
-    type: kInnerProduct
-    srclayers:"Sigmoid6"
-    innerproduct_conf{
-      num_output: 1000
-      transpose: true
-    }
-    param{
-      name: "w7"
-      share_from: "w2"
-    }
-    param{
-      name: "b21"
-    }
-
-  }
-
-  layer{
-    name: "Sigmoid7"
-    type: kSigmoid
-    srclayers:"Inner7"
-  }
- layer{
-    name: "Inner8"
-    type: kInnerProduct
-    srclayers:"Sigmoid7"
-    innerproduct_conf{
-      num_output: 784
-      transpose: true
-    }
-    param{
-      name: "w8"
-      share_from: "w1"
-    }
-    param{
-      name: "b11"
-    }
-  }
-
-  layer{
-    name: "Sigmoid8"
-    type: kSigmoid
-    srclayers:"Inner8"
-  }
-
-  layer{
-    name: "loss"
-    type:kEuclideanLoss
-    srclayers:"Sigmoid8"
-    srclayers:"data"
-  }
-}
-cluster {
-  nworker_groups: 1
-  nserver_groups: 1
-  workspace: "examples/rbm/autoencoder/"
-}
diff --git a/examples/rbm/rbm1.conf b/examples/rbm/rbm1.conf
deleted file mode 100644
index 696a8cb..0000000
--- a/examples/rbm/rbm1.conf
+++ /dev/null
@@ -1,101 +0,0 @@
-name: "rbm1"
-train_steps: 6000
-test_steps:100
-test_freq:500
-disp_freq: 100
-train_one_batch{
-  alg: kCD
-}
-updater{
-  type: kSGD
-  momentum: 0.8
-  weight_decay: 0.0002
-  learning_rate{
-    base_lr: 0.1
-    type: kFixed
-  }
-}
-
-neuralnet {
-  layer {
-    name: "data"
-    type: kRecordInput
-    store_conf {
-      backend: "kvfile"
-      path: "examples/mnist/train_data.bin"
-      batchsize: 100
-      std_value: 255
-      shape: 784
-    }
-    include: kTrain
-  }
-
-  layer {
-    name: "data"
-    type: kRecordInput
-    store_conf {
-      backend: "kvfile"
-      path: "examples/mnist/test_data.bin"
-      std_value: 255
-      batchsize: 100
-      shape: 784
-    }
-    include: kTest
-  }
-
-layer{
-  name: "RBMVis"
-  type: kRBMVis
-  srclayers:"data"
-  srclayers:"RBMHid"
-  rbm_conf{
-    hdim: 1000
-  }
-  param{
-    name: "w1"
-    init{
-      type: kGaussian
-      mean: 0.0
-      std: 0.1
-    }
-  }
-
-  param{
-    name: "b11"
-    wd_scale: 0
-    init{
-      type: kConstant
-      value: 0.0
-    }
-  }
-}
-
-layer{
-  name: "RBMHid"
-  type: kRBMHid
-  srclayers:"RBMVis"
-  rbm_conf{
-    hdim: 1000
-  }
-  param{
-    name: "w1_"
-    share_from: "w1"
-  }
-
-  param{
-    name: "b12"
-    wd_scale: 0
-    init{
-      type: kConstant
-      value: 0.0
-    }
-  }
-}
-}
-cluster {
-  nworker_groups: 1
-  nserver_groups: 1
-  nservers_per_group: 1
-  nworkers_per_group: 1
-  workspace: "examples/rbm/rbm1/"
-}
diff --git a/examples/rbm/rbm2.conf b/examples/rbm/rbm2.conf
deleted file mode 100644
index ddb9681..0000000
--- a/examples/rbm/rbm2.conf
+++ /dev/null
@@ -1,122 +0,0 @@
-name: "rbm2"
-train_steps: 6000
-test_steps:100
-test_freq:500
-disp_freq: 100
-train_one_batch{
-  alg: kCD
-}
-checkpoint_path: "examples/rbm/rbm1/checkpoint/step6000-worker0"
-updater{
-  type: kSGD
-  momentum: 0.8
-  weight_decay: 0.0002
-  learning_rate{
-  base_lr: 0.1
-  type: kFixed
-  }
-}
-
-neuralnet {
-  layer {
-    name: "data"
-    type: kRecordInput
-    store_conf {
-      backend: "kvfile"
-      path: "examples/mnist/train_data.bin"
-      batchsize: 100
-      std_value: 255
-      shape: 784
-    }
-    include: kTrain
-  }
-
-  layer {
-    name: "data"
-    type: kRecordInput
-    store_conf {
-      backend: "kvfile"
-      path: "examples/mnist/test_data.bin"
-      std_value: 255
-      batchsize: 100
-      shape: 784
-    }
-    include: kTest
-  }
-
-layer{
-  name: "Inner1"
-  type: kInnerProduct
-  srclayers:"data"
-  innerproduct_conf{
-    num_output: 1000
-  }
-  param{
-    name: "w1"
-  }
-  param{
-    name: "b12"
-  }
-}
-
-layer{
-  name: "Sigmoid1"
-  type: kSigmoid
-  srclayers:"Inner1"
-}
-
-layer{
-  name: "RBMVis"
-  type: kRBMVis
-  srclayers:"Sigmoid1"
-  srclayers:"RBMHid"
-  rbm_conf{
-    hdim: 500
-  }
-  param{
-    name: "w2"
-    init{
-      type: kGaussian
-      mean: 0.0
-      std: 0.1
-    }
-  }
-
-  param{
-    name: "b21"
-    wd_scale: 0
-    init{
-    type: kConstant
-    value: 0.0
-    }
-  }
-}
-
-layer{
-  name: "RBMHid"
-  type: kRBMHid
-  srclayers:"RBMVis"
-  rbm_conf{
-    hdim: 500
-  }
-  param{
-    name: "w2_"
-    share_from: "w2"
-  }
- param{
-    name: "b22"
-    wd_scale: 0
-    init{
-      type: kConstant
-      value: 0.0
-    }
-  }
-}
-}
-cluster {
-  nworker_groups: 1
-  nserver_groups: 1
-  nservers_per_group: 1
-  nworkers_per_group: 1
-  workspace: "examples/rbm/rbm2/"
-}
diff --git a/examples/rbm/rbm3.conf b/examples/rbm/rbm3.conf
deleted file mode 100644
index 44eae77..0000000
--- a/examples/rbm/rbm3.conf
+++ /dev/null
@@ -1,147 +0,0 @@
-name: "rbm3"
-train_steps: 6000
-test_steps:100
-test_freq:500
-disp_freq: 100
-train_one_batch{
-  alg: kCD
-}
-checkpoint_path: "examples/rbm/rbm2/checkpoint/step6000-worker0"
-checkpoint_path: "examples/rbm/rbm1/checkpoint/step6000-worker0"
-
-updater{
-  type: kSGD
-  momentum: 0.8
-  weight_decay: 0.0002
-  learning_rate{
-    base_lr: 0.1
-    type: kFixed
-  }
-}
-
-
-neuralnet {
-  layer {
-    name: "data"
-    type: kRecordInput
-    store_conf {
-      backend: "kvfile"
-      path: "examples/mnist/train_data.bin"
-      batchsize: 100
-      std_value: 255
-      shape: 784
-    }
-    include: kTrain
-  }
-
-  layer {
-    name: "data"
-    type: kRecordInput
-    store_conf {
-      backend: "kvfile"
-      path: "examples/mnist/test_data.bin"
-      std_value: 255
-      batchsize: 100
-      shape: 784
-    }
-    include: kTest
-  }
-
-
-layer{
-    name: "Inner1"
-    type: kInnerProduct
-    srclayers:"data"
-    innerproduct_conf{
-      num_output: 1000
-    }
-    param{
-      name: "w1"
-    }
-    param{
-      name: "b12"
-    }
-  }
-
-  layer{
-    name: "Sigmoid1"
-    type: kSigmoid
-    srclayers:"Inner1"
-  }
-
-layer{
-    name: "Inner2"
-    type: kInnerProduct
-    srclayers:"Sigmoid1"
-    innerproduct_conf{
-      num_output: 500
-    }
-    param{
-      name: "w2"
-    }
-    param{
-      name: "b22"
-    }
-  }
-
-  layer{
-    name: "Sigmoid2"
-    type: kSigmoid
-    srclayers:"Inner2"
-  }
-layer{
-  name: "RBMVis"
-  type: kRBMVis
-  srclayers:"Sigmoid2"
-  srclayers:"RBMHid"
-  rbm_conf{
-    hdim: 250
-  }
-  param{
-    name: "w3"
-    init{
-      type: kGaussian
-      mean: 0.0
-      std: 0.1
-    }
-  }
-
-  param{
-    name: "b31"
-    wd_scale: 0
-    init{
-    type: kConstant
-    value: 0.0
-    }
-  }
-}
-
-layer{
-  name: "RBMHid"
-  type: kRBMHid
-  srclayers:"RBMVis"
-  rbm_conf{
-    hdim: 250
-  }
-  param{
-    name: "w3_"
-    share_from: "w3"
-  }
-
-  param{
-    name: "b32"
-    wd_scale: 0
-    init{
-    type: kConstant
-    value: 0.0
-    }
-  }
-}
-}
-cluster {
-  nworker_groups: 1
-  nserver_groups: 1
-  nservers_per_group: 1
-  nworkers_per_group: 1
-  workspace: "examples/rbm/rbm3/"
-}
diff --git a/examples/rbm/rbm4.conf b/examples/rbm/rbm4.conf
deleted file mode 100644
index bb023c4..0000000
--- a/examples/rbm/rbm4.conf
+++ /dev/null
@@ -1,167 +0,0 @@
-name: "rbm4"
-train_steps: 6000
-test_steps: 100
-test_freq: 500
-disp_freq: 100
-train_one_batch{
-  alg: kCD
-}
-checkpoint_path: "examples/rbm/rbm3/checkpoint/step6000-worker0"
-checkpoint_path: "examples/rbm/rbm2/checkpoint/step6000-worker0"
-checkpoint_path: "examples/rbm/rbm1/checkpoint/step6000-worker0"
-updater{
-    type: kSGD
-    momentum: 0.8
-    weight_decay: 0.0002
-    learning_rate{
-      base_lr: 0.001
-      type: kFixed
-    }
-}
-
-neuralnet {
-  layer {
-    name: "data"
-    type: kRecordInput
-    store_conf {
-      backend: "kvfile"
-      path: "examples/mnist/train_data.bin"
-      batchsize: 100
-      std_value: 255
-      shape: 784
-    }
-    include: kTrain
-  }
-
-  layer {
-    name: "data"
-    type: kRecordInput
-    store_conf {
-      backend: "kvfile"
-      path: "examples/mnist/test_data.bin"
-      std_value: 255
-      batchsize: 100
-      shape: 784
-    }
-    include: kTest
-  }
-
-
-  layer{
-    name: "Inner1"
-    type: kInnerProduct
-    srclayers:"data"
-    innerproduct_conf{
-      num_output: 1000
-    }
-    param{
-      name: "w1"
-    }
-    param{
-      name: "b12"
-    }
-  }
-
-  layer{
-    name: "Sigmoid1"
-    type: kSigmoid
-    srclayers:"Inner1"
-  }
-
-layer{
-    name: "Inner2"
-    type: kInnerProduct
-    srclayers:"Sigmoid1"
-    innerproduct_conf{
-      num_output: 500
-    }
-    param{
-      name: "w2"
-    }
-    param{
-      name: "b22"
-    }
-  }
-
-  layer{
-    name: "Sigmoid2"
-    type: kSigmoid
-    srclayers:"Inner2"
-  }
-
-layer{
-    name: "Inner3"
-    type: kInnerProduct
-    srclayers:"Sigmoid2"
-    innerproduct_conf{
-      num_output: 250
-    }
-    param{
-      name: "w3"
-    }
-    param{
-      name: "b32"
-    }
-  }
-
-  layer{
-    name: "Sigmoid3"
-    type: kSigmoid
-    srclayers:"Inner3"
-  }
-
-layer{
-  name: "RBMVis"
-  type: kRBMVis
-  srclayers:"Sigmoid3"
-  srclayers:"RBMHid"
-  rbm_conf{
-    hdim: 30
-  }
-  param{
-    name: "w4"
-    init{
-      type: kGaussian
-      mean: 0.0
-      std: 0.1
-    }
-  }
-  param{
-    name: "b41"
-    wd_scale: 0
-    init{
-    type: kConstant
-    value: 0.0
-    }
-  }
-}
-
-layer{
-  name: "RBMHid"
-  type: kRBMHid
-  srclayers:"RBMVis"
-  rbm_conf{
-    hdim: 30
-    gaussian: true
-  }
-  param{
-    name: "w4_"
-    share_from: "w4"
-  }
-  param{
-    name: "b42"
-    wd_scale: 0
-    init{
-    type: kConstant
-    value: 0.0
-    }
-  }
-}
-}
-cluster {
-  nworker_groups: 1
-  nserver_groups: 1
-  nservers_per_group: 1
-  nworkers_per_group: 1
-  workspace: "examples/rbm/rbm4/"
-}
diff --git a/examples/rnnlm/Makefile.example b/examples/rnnlm/Makefile.example
deleted file mode 100644
index 13c5e42..0000000
--- a/examples/rnnlm/Makefile.example
+++ /dev/null
@@ -1,52 +0,0 @@
-#/************************************************************
-#*
-#* Licensed to the Apache Software Foundation (ASF) under one
-#* or more contributor license agreements.  See the NOTICE file
-#* distributed with this work for additional information
-#* regarding copyright ownership.  The ASF licenses this file
-#* to you under the Apache License, Version 2.0 (the
-#* "License"); you may not use this file except in compliance
-#* with the License.  You may obtain a copy of the License at
-#*
-#*   http://www.apache.org/licenses/LICENSE-2.0
-#*
-#* Unless required by applicable law or agreed to in writing,
-#* software distributed under the License is distributed on an
-#* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-#* KIND, either express or implied.  See the License for the
-#* specific language governing permissions and limitations
-#* under the License.
-#*
-#*************************************************************/
-
-MSHADOW_FLAGS :=-DMSHADOW_USE_CUDA=0 -DMSHADOW_USE_CBLAS=1 -DMSHADOW_USE_MKL=0
-
-libs :=singa glog protobuf
-filename = rnnlm-0.4b.tgz
-# note: filelink for rnnlm-0.4b may change
-filelink = https://f25ea9ccb7d3346ce6891573d543960492b92c30.googledrive.com/host/0ByxdPXuxLPS5RFM5dVNvWVhTd0U
-dirname = $(patsubst %.tgz,%, $(filename))
-numclass = 100
-dirshards = train_shard valid_shard test_shard
-
-
-
-download:
-	wget $(filelink)/$(filename)
-	tar zxf $(filename)
-	rm $(filename)
-
-create:
-	protoc --proto_path=../../src/proto --proto_path=. --cpp_out=. rnnlm.proto
-	$(CXX) create_data.cc rnnlm.pb.cc -std=c++11 -lsinga -lprotobuf -lzookeeper_mt -lglog -I../../include -I../../include/singa/proto \
-		-L../../.libs/ -L/usr/local/lib -Wl,-unresolved-symbols=ignore-in-shared-libs -Wl,-rpath=../../.libs/ \
-		-o create_data.bin
-	for d in $(dirshards); do mkdir -p $${d}; done
-	./create_data.bin -train $(dirname)/train -test $(dirname)/test -valid $(dirname)/valid -class_size $(numclass)
-
-
-rnnlm:
-	protoc --proto_path=../../src/proto --proto_path=. --cpp_out=. rnnlm.proto
-	$(CXX) main.cc rnnlm.cc rnnlm.pb.cc $(MSHADOW_FLAGS) -msse3 -std=c++11 -lsinga -lglog -lprotobuf -lopenblas -I../../include -I../../include/singa/proto \
-		-L../../.libs/ -L/usr/local  -Wl,-unresolved-symbols=ignore-in-shared-libs -Wl,-rpath=../../.libs/\
-		-o rnnlm.bin
diff --git a/examples/rnnlm/README.md b/examples/rnnlm/README.md
deleted file mode 100644
index 9e83686..0000000
--- a/examples/rnnlm/README.md
+++ /dev/null
@@ -1,52 +0,0 @@
-This example trains the [RNN model](http://www.fit.vutbr.cz/research/groups/speech/publi/2010/mikolov_interspeech2010_IS100722.pdf) proposed by Tomas Mikolov for [language modeling](https://en.wikipedia.org/wiki/Language_model) over text dataset contains 71350 words, provided at [RNNLM Toolkit](https://f25ea9ccb7d3346ce6891573d543960492b92c30.googledrive.com/host/0ByxdPXuxLPS5RFM5dVNvWVhTd0U).
-The training objective (loss) is to minimize the [perplexity per word](https://en.wikipedia.org/wiki/Perplexity), which is equivalent to maximize the probability of predicting the next word given the current word in a sentence.
-The purpose of this example is to show users how to implement and use their own layers for RNN in SINGA.
-The example RNN model consists of six layers, namely RnnDataLayer, WordLayer, RnnLabelLayer, EmbeddingLayer, HiddenLayer, and OutputLayer. 
-
-## File description
-
-The files in this folder include:
-
-* rnnlm.proto, definition of the configuration protocol of the layers.
-* rnnlm.h, declaration of the layers.
-* rnnlm.cc, definition of the layers.
-* main.cc, main function that register the layers.
-* Makefile.exmaple, Makefile for compiling all source code in this folder.
-* job.conf, the job configuration for training the RNN language model.
-
-
-## Data preparation
-
-To use the RNNLM dataset, we can download it and create DataShard by typing
-
-    # in rnnlm/ folder
-    cp Makefile.example Makefile
-    make download
-    make create
-
-## Compilation
-
-The *Makefile.example* contains instructions for compiling the source code.
-
-    # in rnnlm/ folder
-    cp Makefile.example Makefile
-    make rnnlm
-
-It will generate an executable file *rnnlm.bin*.
-
-## Running
-
-Make sure that there is one example job configuration file, named *job.conf*.
-
-Before running SINGA, we need to export the `LD_LIBRARY_PATH` to
-include the libsinga.so by the following script.
-
-    # at the root folder of SINGA
-    export LD_LIBRARY_PATH=.libs:$LD_LIBRARY_PATH
-
-Then, we can run SINGA as follows. 
-
-    # at the root folder of SINGA
-    ./bin/singa-run.sh -exec examples/rnnlm/rnnlm.bin -conf examples/rnnlm/job.conf
-
-You will see the values of loss and ppl at each training step.
diff --git a/examples/rnnlm/create_data.cc b/examples/rnnlm/create_data.cc
deleted file mode 100644
index d1edbdb..0000000
--- a/examples/rnnlm/create_data.cc
+++ /dev/null
@@ -1,444 +0,0 @@
-/************************************************************
-*
-* Licensed to the Apache Software Foundation (ASF) under one
-* or more contributor license agreements.  See the NOTICE file
-* distributed with this work for additional information
-* regarding copyright ownership.  The ASF licenses this file
-* to you under the Apache License, Version 2.0 (the
-* "License"); you may not use this file except in compliance
-* with the License.  You may obtain a copy of the License at
-*
-*   http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an
-* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-* KIND, either express or implied.  See the License for the
-* specific language governing permissions and limitations
-* under the License.
-*
-*************************************************************/
-/*
- * This file include code from rnnlmlib-0.4 under BSD new license.
- * Copyright (c) 2010-2012 Tomas Mikolov
- * Copyright (c) 2013 Cantab Research Ltd
- * All rights reserved.
- */
-
-
-//
-// This code creates DataShard for RNNLM dataset.
-// The RNNLM dataset could be downloaded at
-//    http://www.rnnlm.org/
-//
-// Usage:
-//    create_shard.bin -train [train_file] -valid [valid_file]
-//                     -test [test_file] -class_size [# of classes]
-
-#include <cstring>
-#include <cstdlib>
-#include <cstdio>
-#include <cmath>
-#include <algorithm>
-#include <fstream>
-
-#include "singa/io/store.h"
-#include "singa/utils/common.h"
-#include "singa/proto/common.pb.h"
-#include "./rnnlm.pb.h"
-
-#define MAX_STRING 100
-#define BUFFER_LEN 32
-#define NL_STRING  "</s>"
-
-using std::string;
-using std::max;
-using std::min;
-
-struct vocab_word {
-  int cn;
-  char word[MAX_STRING];
-  int class_index;
-};
-
-struct vocab_word *vocab;
-int vocab_max_size;
-int vocab_size;
-int *vocab_hash;
-int vocab_hash_size;
-int debug_mode;
-int old_classes;
-int *class_start;
-int *class_end;
-int class_size;
-
-char train_file[MAX_STRING];
-char valid_file[MAX_STRING];
-char test_file[MAX_STRING];
-
-int valid_mode;
-int test_mode;
-
-unsigned int getWordHash(char *word) {
-  unsigned int hash, a;
-
-  hash = 0;
-  for (a = 0; a < strlen(word); a++) hash = hash * 237 + word[a];
-  hash = hash % vocab_hash_size;
-
-  return hash;
-}
-
-int searchVocab(char *word) {
-  int a;
-  unsigned int hash;
-
-  hash = getWordHash(word);
-
-  if (vocab_hash[hash] == -1) return -1;
-  if (!strcmp(word, vocab[vocab_hash[hash]].word)) return vocab_hash[hash];
-
-  for (a = 0; a < vocab_size; a++) {   // search in vocabulary
-    if (!strcmp(word, vocab[a].word)) {
-      vocab_hash[hash] = a;
-      return a;
-    }
-  }
-
-  return -1;   // return OOV if not found
-}
-
-int addWordToVocab(char *word) {
-  unsigned int hash;
-
-  snprintf(vocab[vocab_size].word, strlen(word)+1, "%s", word);
-  vocab[vocab_size].cn = 0;
-  vocab_size++;
-
-  if (vocab_size + 2 >= vocab_max_size) {   // reallocate memory if needed
-    vocab_max_size += 100;
-    vocab = (struct vocab_word *) realloc(
-        vocab,
-        vocab_max_size * sizeof(struct vocab_word));
-  }
-
-  hash = getWordHash(word);
-  vocab_hash[hash] = vocab_size - 1;
-
-  return vocab_size - 1;
-}
-
-void readWord(char *word, FILE *fin) {
-  int a = 0, ch;
-
-  while (!feof(fin)) {
-    ch = fgetc(fin);
-
-    if (ch == 13) continue;
-
-    if ((ch == ' ') || (ch == '\t') || (ch == '\n')) {
-      if (a > 0) {
-        if (ch == '\n') ungetc(ch, fin);
-        break;
-      }
-
-      if (ch == '\n') {
-        snprintf(word, strlen(NL_STRING) + 1,
-            "%s", const_cast<char *>(NL_STRING));
-        return;
-      } else {
-        continue;
-      }
-    }
-
-    word[a] = static_cast<char>(ch);
-    a++;
-
-    if (a >= MAX_STRING) {
-      // printf("Too long word found!\n");   //truncate too long words
-      a--;
-    }
-  }
-  word[a] = 0;
-}
-
-void sortVocab() {
-  int a, b, max;
-  vocab_word swap;
-
-  for (a = 1; a < vocab_size; a++) {
-    max = a;
-    for (b = a + 1; b < vocab_size; b++)
-      if (vocab[max].cn < vocab[b].cn) max = b;
-
-    swap = vocab[max];
-    vocab[max] = vocab[a];
-    vocab[a] = swap;
-  }
-}
-
-int learnVocabFromTrainFile() {
-  char word[MAX_STRING];
-  FILE *fin;
-  int a, i, train_wcn;
-
-  for (a = 0; a < vocab_hash_size; a++) vocab_hash[a] = -1;
-
-  fin = fopen(train_file, "rb");
-
-  vocab_size = 0;
-
-  addWordToVocab(const_cast<char *>(NL_STRING));
-
-  train_wcn = 0;
-  while (1) {
-    readWord(word, fin);
-    if (feof(fin)) break;
-
-    train_wcn++;
-
-    i = searchVocab(word);
-    if (i == -1) {
-      a = addWordToVocab(word);
-      vocab[a].cn = 1;
-    } else {
-      vocab[i].cn++;
-    }
-  }
-
-  sortVocab();
-
-  if (debug_mode > 0) {
-    printf("Vocab size: %d\n", vocab_size);
-    printf("Words in train file: %d\n", train_wcn);
-  }
-
-  fclose(fin);
-  return 0;
-}
-
-int splitClasses() {
-  double df, dd;
-  int i, a, b;
-
-  df = 0;
-  dd = 0;
-  a = 0;
-  b = 0;
-
-  class_start = reinterpret_cast<int *>(calloc(class_size, sizeof(int)));
-  memset(class_start, 0x7f, sizeof(int) * class_size);
-  class_end = reinterpret_cast<int *>(calloc(class_size, sizeof(int)));
-  memset(class_end, 0, sizeof(int) * class_size);
-
-  if (old_classes) {    // old classes
-    for (i = 0; i < vocab_size; i++)
-      b += vocab[i].cn;
-    for (i = 0; i < vocab_size; i++) {
-      df += vocab[i].cn / static_cast<double>(b);
-      if (df > 1) df = 1;
-      if (df > (a + 1) / static_cast<double>(class_size)) {
-        vocab[i].class_index = a;
-        if (a < class_size - 1) a++;
-      } else {
-        vocab[i].class_index = a;
-      }
-    }
-  } else {            // new classes
-    for (i = 0; i < vocab_size; i++)
-      b += vocab[i].cn;
-    for (i = 0; i < vocab_size; i++)
-      dd += sqrt(vocab[i].cn / static_cast<double>(b));
-    for (i = 0; i < vocab_size; i++) {
-      df += sqrt(vocab[i].cn / static_cast<double>(b)) / dd;
-      if (df > 1) df = 1;
-      if (df > (a + 1) / static_cast<double>(class_size)) {
-        vocab[i].class_index = a;
-        if (a < class_size - 1) a++;
-      } else {
-        vocab[i].class_index = a;
-      }
-    }
-  }
-
-  // after dividing classes, update class start and class end information
-  for (i = 0; i < vocab_size; i++)  {
-    a = vocab[i].class_index;
-    class_start[a] = min(i, class_start[a]);
-    class_end[a] = max(i + 1, class_end[a]);
-  }
-  return 0;
-}
-
-int init_class() {
-  // debug_mode = 1;
-  debug_mode = 0;
-  vocab_max_size = 100;  // largest length value for each word
-  vocab_size = 0;
-  vocab = (struct vocab_word *) calloc(vocab_max_size,
-      sizeof(struct vocab_word));
-  vocab_hash_size = 100000000;
-  vocab_hash = reinterpret_cast<int *>(calloc(vocab_hash_size, sizeof(int)));
-  old_classes = 1;
-
-  // read vocab
-  learnVocabFromTrainFile();
-
-  // split classes
-  splitClasses();
-
-  return 0;
-}
-
-int create_data(const char *input_file, const char *output) {
-  auto* store = singa::io::OpenStore("kvfile", output, singa::io::kCreate);
-  WordRecord wordRecord;
-
-  FILE *fin;
-  int a, i;
-  fin = fopen(input_file, "rb");
-
-  int wcnt = 0;
-  char key[BUFFER_LEN];
-  char wordstr[MAX_STRING];
-  string value;
-  while (1) {
-    readWord(wordstr, fin);
-    if (feof(fin)) break;
-    i = searchVocab(wordstr);
-    if (i == -1) {
-      if (debug_mode) printf("unknown word [%s] detected!", wordstr);
-    } else {
-      wordRecord.set_word(string(wordstr));
-      wordRecord.set_word_index(i);
-      int class_idx = vocab[i].class_index;
-      wordRecord.set_class_index(class_idx);
-      wordRecord.set_class_start(class_start[class_idx]);
-      wordRecord.set_class_end(class_end[class_idx]);
-      int length = snprintf(key, BUFFER_LEN, "%05d", wcnt++);
-      wordRecord.SerializeToString(&value);
-      store->Write(string(key, length), value);
-    }
-  }
-
-  fclose(fin);
-  store->Flush();
-  delete store;
-  return 0;
-}
-
-int argPos(char *str, int argc, char **argv) {
-  int a;
-
-  for (a = 1; a < argc; a++)
-    if (!strcmp(str, argv[a]))
-      return a;
-
-  return -1;
-}
-
-int main(int argc, char **argv) {
-  int i;
-  FILE *f;
-
-  // set debug mode
-  i = argPos(const_cast<char *>("-debug"), argc, argv);
-  if (i > 0) {
-    debug_mode = 1;
-    if (debug_mode > 0)
-      printf("debug mode: %d\n", debug_mode);
-  }
-
-  // search for train file
-  i = argPos(const_cast<char *>("-train"), argc, argv);
-  if (i > 0) {
-    if (i + 1 == argc) {
-      printf("ERROR: training data file not specified!\n");
-      return 0;
-    }
-
-    snprintf(train_file, strlen(argv[i + 1])+1, "%s", argv[i + 1]);
-
-    if (debug_mode > 0)
-      printf("train file: %s\n", train_file);
-
-    f = fopen(train_file, "rb");
-    if (f == NULL) {
-      printf("ERROR: training data file not found!\n");
-      return 0;
-    }
-    fclose(f);
-  } else {
-    printf("ERROR: training data must be set.\n");
-  }
-
-  // search for valid file
-  i = argPos(const_cast<char *>("-valid"), argc, argv);
-  if (i > 0) {
-    if (i + 1 == argc) {
-      printf("ERROR: validating data file not specified!\n");
-      return 0;
-    }
-
-    snprintf(valid_file, strlen(argv[i + 1])+1, "%s", argv[i + 1]);
-
-    if (debug_mode > 0)
-      printf("valid file: %s\n", valid_file);
-
-    f = fopen(valid_file, "rb");
-    if (f == NULL) {
-      printf("ERROR: validating data file not found!\n");
-      return 0;
-    }
-    fclose(f);
-    valid_mode = 1;
-  }
-
-  // search for test file
-  i = argPos(const_cast<char *>("-test"), argc, argv);
-  if (i > 0) {
-    if (i + 1 == argc) {
-      printf("ERROR: testing data file not specified!\n");
-      return 0;
-    }
-
-    snprintf(test_file, strlen(argv[i + 1])+1, "%s", argv[i + 1]);
-
-    if (debug_mode > 0)
-      printf("test file: %s\n", test_file);
-
-    f = fopen(test_file, "rb");
-    if (f == NULL) {
-      printf("ERROR: testing data file not found!\n");
-      return 0;
-    }
-    fclose(f);
-    test_mode = 1;
-  }
-
-  // search for class size
-  i = argPos(const_cast<char *>("-class_size"), argc, argv);
-  if (i > 0) {
-    if (i + 1 == argc) {
-      printf("ERROR: class size not specified!\n");
-      return 0;
-    }
-
-    class_size = atoi(argv[i + 1]);
-
-    if (debug_mode > 0)
-      printf("class size: %d\n", class_size);
-  }
-  if (class_size <= 0) {
-    printf("ERROR: no or invalid class size received!\n");
-    return 0;
-  }
-
-  init_class();
-
-  create_data(train_file, "train_data.bin");
-  if (valid_mode) create_data(valid_file, "valid_data.bin");
-  if (test_mode) create_data(test_file, "test_data.bin");
-
-  return 0;
-}
diff --git a/examples/rnnlm/job.conf b/examples/rnnlm/job.conf
deleted file mode 100644
index aca1166..0000000
--- a/examples/rnnlm/job.conf
+++ /dev/null
@@ -1,120 +0,0 @@
-name: "rnnlm"
-#To scan the training file (81350) 10 times
-train_steps:81350
-#To scan the validation file (6828) once
-validate_steps:683
-validate_freq:8135
-#disp_freq is specific to training
-disp_freq:8135
-train_one_batch {
-alg: kBP
-}
-updater{
-  type: kSGD
-  learning_rate {
-    type: kFixedStep
-    fixedstep_conf:{
-      step:0
-      step:48810
-      step:56945
-      step:65080
-      step:73215
-      step_lr:0.1
-      step_lr:0.05
-      step_lr:0.025
-      step_lr:0.0125
-      step_lr:0.00625
-    }
-  }
-}
-
-neuralnet {
-layer {
-  name: "data"
-  user_type: "kData"
-  [data_conf] {
-    backend: "kvfile"
-    path: "examples/rnnlm/train_data.bin"
-    max_window: 10
-  }
-  include: kTrain
-}
-
-layer {
-  name: "data"
-  user_type: "kData"
-  [data_conf] {
-    path: "examples/rnnlm/valid_data.bin"
-    max_window: 10
-  }
-  include: kVal
-}
-
-layer{
-  name: "embedding"
-  user_type: "kEmbedding"
-  srclayers: "data"
-  [embedding_conf] {
-    word_dim: 15
-    vocab_size: 3720
-  }
-    param {
-    name: "w1"
-    init {
-       type: kUniform
-       low:-0.3
-       high:0.3
-    }
-  }
-}
-
-layer{
-  name: "hidden"
-  user_type: "kHidden"
-  srclayers:"embedding"
-  param{
-    name: "w2"
-    init {
-      type: kUniform
-      low:-0.3
-      high:0.3
-    }
-  }
-}
-layer{
-  name: "loss"
-  user_type: "kLoss"
-  srclayers:"hidden"
-  srclayers:"data"
-  [loss_conf] {
-    nclass:100
-    vocab_size: 3720
-  }
-  param{
-    name: "w3"
-    init {
-      type: kUniform
-      low:-0.3
-      high:0.3
-    }
-  }
-  param{
-    name: "w4"
-    init {
-      type: kUniform
-      low:-0.3
-      high:0.3
-    }
-  }
-}
-
-}
-cluster {
-  nworker_groups: 1
-  nserver_groups: 1
-  nservers_per_group: 1
-  nworkers_per_group: 1
-  nservers_per_procs: 1
-  nworkers_per_procs: 1
-  workspace: "examples/rnnlm/"
-}
diff --git a/examples/rnnlm/main.cc b/examples/rnnlm/main.cc
deleted file mode 100644
index 9124383..0000000
--- a/examples/rnnlm/main.cc
+++ /dev/null
@@ -1,49 +0,0 @@
-/************************************************************
-*
-* Licensed to the Apache Software Foundation (ASF) under one
-* or more contributor license agreements.  See the NOTICE file
-* distributed with this work for additional information
-* regarding copyright ownership.  The ASF licenses this file
-* to you under the Apache License, Version 2.0 (the
-* "License"); you may not use this file except in compliance
-* with the License.  You may obtain a copy of the License at
-*
-*   http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an
-* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-* KIND, either express or implied.  See the License for the
-* specific language governing permissions and limitations
-* under the License.
-*
-*************************************************************/
-
-#include <glog/logging.h>
-#include <string>
-#include "singa/singa.h"
-#include "rnnlm.h"
-#include "rnnlm.pb.h"
-
-int main(int argc, char **argv) {
-  // initialize glog before creating the driver
-  google::InitGoogleLogging(argv[0]);
-  
-  singa::Driver driver;
-  driver.Init(argc, argv);
-
-  // if -resume in argument list, set resume to true; otherwise false
-  int resume_pos = singa::ArgPos(argc, argv, "-resume");
-  bool resume = (resume_pos != -1);
-
-  // register all layers for rnnlm
-  driver.RegisterLayer<rnnlm::EmbeddingLayer, std::string>("kEmbedding");
-  driver.RegisterLayer<rnnlm::HiddenLayer, std::string>("kHidden");
-  driver.RegisterLayer<rnnlm::LossLayer, std::string>("kLoss");
-  driver.RegisterLayer<rnnlm::DataLayer, std::string>("kData");
-
-  singa::JobProto jobConf = driver.job_conf();
-
-  driver.Train(resume, jobConf);
-  return 0;
-}
diff --git a/examples/rnnlm/rnnlm.cc b/examples/rnnlm/rnnlm.cc
deleted file mode 100644
index 641b465..0000000
--- a/examples/rnnlm/rnnlm.cc
+++ /dev/null
@@ -1,335 +0,0 @@
-/************************************************************
-*
-* Licensed to the Apache Software Foundation (ASF) under one
-* or more contributor license agreements.  See the NOTICE file
-* distributed with this work for additional information
-* regarding copyright ownership.  The ASF licenses this file
-* to you under the Apache License, Version 2.0 (the
-* "License"); you may not use this file except in compliance
-* with the License.  You may obtain a copy of the License at
-*
-*   http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an
-* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-* KIND, either express or implied.  See the License for the
-* specific language governing permissions and limitations
-* under the License.
-*
-*************************************************************/
-#include "./rnnlm.h"
-
-#include <string>
-#include <algorithm>
-#include "mshadow/tensor.h"
-#include "mshadow/tensor_expr.h"
-#include "mshadow/cxxnet_op.h"
-#include "./rnnlm.pb.h"
-
-namespace rnnlm {
-using std::vector;
-using std::string;
-
-using namespace mshadow;
-using mshadow::cpu;
-using mshadow::Shape;
-using mshadow::Shape1;
-using mshadow::Shape2;
-using mshadow::Tensor;
-
-inline Tensor<cpu, 2> RTensor2(Blob<float>* blob) {
-  const vector<int>& shape = blob->shape();
-  Tensor<cpu, 2> tensor(blob->mutable_cpu_data(),
-      Shape2(shape[0], blob->count() / shape[0]));
-  return tensor;
-}
-
-inline Tensor<cpu, 1> RTensor1(Blob<float>* blob) {
-  Tensor<cpu, 1> tensor(blob->mutable_cpu_data(), Shape1(blob->count()));
-  return tensor;
-}
-
-
-/*******DataLayer**************/
-DataLayer::~DataLayer() {
-  if (store_ != nullptr)
-    delete store_;
-}
-
-void DataLayer::Setup(const LayerProto& conf, const vector<Layer*>& srclayers) {
-  RNNLayer::Setup(conf, srclayers);
-  string key;
-  max_window_ = conf.GetExtension(data_conf).max_window();
-  data_.Reshape(vector<int>{max_window_ + 1, 4});
-  window_ = 0;
-}
-
-void SetInst(int k, const WordRecord& word, Blob<float>* to) {
-  float* dptr = to->mutable_cpu_data() + k * 4;
-  dptr[0] = static_cast<float>(word.word_index());
-  dptr[1] = static_cast<float>(word.class_index());
-  dptr[2] = static_cast<float>(word.class_start());
-  dptr[3] = static_cast<float>(word.class_end());
-}
-
-void ShiftInst(int from, int to,  Blob<float>* data) {
-  const float* f = data->cpu_data() + from * 4;
-  float* t = data->mutable_cpu_data() + to * 4;
-  // hard code the feature dim to be 4;
-  t[0] = f[0]; t[1] = f[1]; t[2] = f[2]; t[3] = f[3];
-}
-
-void DataLayer::ComputeFeature(int flag, const vector<Layer*>& srclayers) {
-  string key, value;
-  WordRecord word;
-  if (store_ == nullptr) {
-    store_ = singa::io::OpenStore(
-        layer_conf_.GetExtension(data_conf).backend(),
-        layer_conf_.GetExtension(data_conf).path(),
-        singa::io::kRead);
-    store_->Read(&key, &value);
-    word.ParseFromString(value);
-    SetInst(0, word, &data_);
-  }
-  ShiftInst(window_, 0, &data_);
-  window_ = max_window_;
-  for (int i = 1; i <= max_window_; i++) {
-    if (!store_->Read(&key, &value)) {
-      store_->SeekToFirst();
-      CHECK(store_->Read(&key, &value));
-    }
-    word.ParseFromString(value);
-    SetInst(i, word, &data_);
-    if (word.word_index() == 0) {
-      window_ = i;
-      break;
-    }
-  }
-}
-
-/*******LabelLayer**************
-void LabelLayer::Setup(const LayerProto& conf,
-    const vector<Layer*>& srclayers) {
-  RNNLayer::Setup(conf, srclayers);
-  CHECK_EQ(srclayers.size(), 1);
-  int max_window = dynamic_cast<DataLayer*>(srclayers[0])->max_window();
-  data_.Reshape(vector<int>{max_window, 4});
-}
-
-void LabelLayer::ComputeFeature(int flag, const vector<Layer*>& srclayers) {
-  const auto& records = dynamic_cast<DataLayer*>(srclayers[0])->records();
-  float *label = data_.mutable_cpu_data();
-  window_ = dynamic_cast<RNNLayer*>(srclayers[0])->window();
-  for (int i = 0; i < window_; i++) {
-    WordRecord wordrecord = records[i + 1].GetExtension(word);
-    label[4 * i + 0] = wordrecord.class_start();
-    label[4 * i + 1] = wordrecord.class_end();
-    label[4 * i + 2] = wordrecord.word_index();
-    label[4 * i + 3] = wordrecord.class_index();
-  }
-}
-*/
-
-/*******EmbeddingLayer**************/
-EmbeddingLayer::~EmbeddingLayer() {
-  delete embed_;
-}
-
-void EmbeddingLayer::Setup(const LayerProto& conf,
-    const vector<Layer*>& srclayers) {
-  RNNLayer::Setup(conf, srclayers);
-  CHECK_EQ(srclayers.size(), 1);
-  int max_window = srclayers[0]->data(this).shape()[0];
-  word_dim_ = conf.GetExtension(embedding_conf).word_dim();
-  data_.Reshape(vector<int>{max_window, word_dim_});
-  grad_.ReshapeLike(data_);
-  vocab_size_ = conf.GetExtension(embedding_conf).vocab_size();
-  embed_ = Param::Create(conf.param(0));
-  embed_->Setup(vector<int>{vocab_size_, word_dim_});
-}
-
-void EmbeddingLayer::ComputeFeature(int flag, const vector<Layer*>& srclayers) {
-  auto datalayer = dynamic_cast<DataLayer*>(srclayers[0]);
-  window_ = datalayer->window();
-  auto words = RTensor2(&data_);
-  auto embed = RTensor2(embed_->mutable_data());
-
-  const float* idxptr = datalayer->data(this).cpu_data();
-  for (int t = 0; t < window_; t++) {
-    int idx = static_cast<int>(idxptr[t * 4]);
-    CHECK_GE(idx, 0);
-    CHECK_LT(idx, vocab_size_);
-    Copy(words[t], embed[idx]);
-  }
-}
-
-void EmbeddingLayer::ComputeGradient(int flag,
-    const vector<Layer*>& srclayers) {
-  auto grad = RTensor2(&grad_);
-  auto gembed = RTensor2(embed_->mutable_grad());
-  auto datalayer = dynamic_cast<DataLayer*>(srclayers[0]);
-  gembed = 0;
-  const float* idxptr = datalayer->data(this).cpu_data();
-  for (int t = 0; t < window_; t++) {
-    int idx = static_cast<int>(idxptr[t * 4]);
-    Copy(gembed[idx], grad[t]);
-  }
-}
-/***********HiddenLayer**********/
-HiddenLayer::~HiddenLayer() {
-  delete weight_;
-}
-
-void HiddenLayer::Setup(const LayerProto& conf,
-    const vector<Layer*>& srclayers) {
-  RNNLayer::Setup(conf, srclayers);
-  CHECK_EQ(srclayers.size(), 1);
-  const auto& innerproductData = srclayers[0]->data(this);
-  data_.ReshapeLike(srclayers[0]->data(this));
-  grad_.ReshapeLike(srclayers[0]->grad(this));
-  int word_dim = data_.shape()[1];
-  weight_ = Param::Create(conf.param(0));
-  weight_->Setup(std::vector<int>{word_dim, word_dim});
-}
-
-// hid[t] = sigmoid(hid[t-1] * W + src[t])
-void HiddenLayer::ComputeFeature(int flag, const vector<Layer*>& srclayers) {
-  window_ = dynamic_cast<RNNLayer*>(srclayers[0])->window();
-  auto data = RTensor2(&data_);
-  auto src = RTensor2(srclayers[0]->mutable_data(this));
-  auto weight = RTensor2(weight_->mutable_data());
-  for (int t = 0; t < window_; t++) {  // Skip the 1st component
-    if (t == 0) {
-      data[t] = expr::F<op::sigmoid>(src[t]);
-    } else {
-      data[t] = dot(data[t - 1], weight);
-      data[t] += src[t];
-      data[t] = expr::F<op::sigmoid>(data[t]);
-    }
-  }
-}
-
-void HiddenLayer::ComputeGradient(int flag, const vector<Layer*>& srclayers) {
-  auto data = RTensor2(&data_);
-  auto grad = RTensor2(&grad_);
-  auto weight = RTensor2(weight_->mutable_data());
-  auto gweight = RTensor2(weight_->mutable_grad());
-  auto gsrc = RTensor2(srclayers[0]->mutable_grad(this));
-  gweight = 0;
-  TensorContainer<cpu, 1> tmp(Shape1(data_.shape()[1]));
-  // Check!!
-  for (int t = window_ - 1; t >= 0; t--) {
-    if (t < window_ - 1) {
-      tmp = dot(grad[t + 1], weight.T());
-      grad[t] += tmp;
-    }
-    grad[t] = expr::F<op::sigmoid_grad>(data[t])* grad[t];
-  }
-  gweight = dot(data.Slice(0, window_-1).T(), grad.Slice(1, window_));
-  Copy(gsrc, grad);
-}
-
-/*********** Implementation for LossLayer **********/
-LossLayer::~LossLayer() {
-  delete word_weight_;
-  delete class_weight_;
-}
-
-void LossLayer::Setup(const LayerProto& conf, const vector<Layer*>& srclayers) {
-  RNNLayer::Setup(conf, srclayers);
-  CHECK_EQ(srclayers.size(), 2);
-  const auto& src = srclayers[0]->data(this);
-  int max_window = src.shape()[0];
-  int vdim = src.count() / max_window;   // Dimension of input
-  int vocab_size = conf.GetExtension(loss_conf).vocab_size();
-  int nclass = conf.GetExtension(loss_conf).nclass();
-  word_weight_ = Param::Create(conf.param(0));
-  word_weight_->Setup(vector<int>{vocab_size, vdim});
-  class_weight_ = Param::Create(conf.param(1));
-  class_weight_->Setup(vector<int>{nclass, vdim});
-
-  pword_.resize(max_window);
-  pclass_.Reshape(vector<int>{max_window, nclass});
-}
-
-void LossLayer::ComputeFeature(int flag, const vector<Layer*>& srclayers) {
-  window_ = dynamic_cast<RNNLayer*>(srclayers[0])->window();
-  auto pclass = RTensor2(&pclass_);
-  auto src = RTensor2(srclayers[0]->mutable_data(this));
-  auto word_weight = RTensor2(word_weight_->mutable_data());
-  auto class_weight = RTensor2(class_weight_->mutable_data());
-  const float * label = srclayers[1]->data(this).cpu_data();
-
-  float loss = 0.f, ppl = 0.f;
-  for (int t = 0; t < window_; t++) {
-    // label is the next word
-    int start = static_cast<int>(label[(t + 1) * 4 + 2]);
-    int end = static_cast<int>(label[(t + 1) * 4 + 3]);
-
-    auto wordWeight = word_weight.Slice(start, end);
-    CHECK_GT(end, start);
-    pword_[t].Reshape(std::vector<int>{end-start});
-    auto pword = RTensor1(&pword_[t]);
-    pword = dot(src[t], wordWeight.T());
-    Softmax(pword, pword);
-
-    pclass[t] = dot(src[t], class_weight.T());
-    Softmax(pclass[t], pclass[t]);
-
-    int wid = static_cast<int>(label[(t + 1) * 4 + 0]);
-    int cid = static_cast<int>(label[(t + 1) * 4 + 1]);
-    CHECK_GT(end, wid);
-    CHECK_GE(wid, start);
-    loss_ += -log(std::max(pword[wid - start] * pclass[t][cid], FLT_MIN));
-    ppl_ += log10(std::max(pword[wid - start] * pclass[t][cid], FLT_MIN));
-  }
-  num_ += window_;
-}
-
-void LossLayer::ComputeGradient(int flag, const vector<Layer*>& srclayers) {
-  auto pclass = RTensor2(&pclass_);
-  auto src = RTensor2(srclayers[0]->mutable_data(this));
-  auto gsrc = RTensor2(srclayers[0]->mutable_grad(this));
-  auto word_weight = RTensor2(word_weight_->mutable_data());
-  auto gword_weight = RTensor2(word_weight_->mutable_grad());
-  auto class_weight = RTensor2(class_weight_->mutable_data());
-  auto gclass_weight = RTensor2(class_weight_->mutable_grad());
-  const float * label = srclayers[1]->data(this).cpu_data();
-  gclass_weight = 0;
-  gword_weight = 0;
-  for (int t = 0; t < window_; t++) {
-    int start = static_cast<int>(label[(t + 1) * 4 + 2]);
-    int end = static_cast<int>(label[(t + 1) * 4 + 3]);
-    int wid = static_cast<int>(label[(t + 1) * 4 + 0]);
-    int cid = static_cast<int>(label[(t + 1) * 4 + 1]);
-    auto pword = RTensor1(&pword_[t]);
-    CHECK_GT(end, wid);
-    CHECK_GE(wid, start);
-
-    // gL/gclass_act
-    pclass[t][cid] -= 1.0;
-    // gL/gword_act
-    pword[wid - start] -= 1.0;
-
-    // gL/gword_weight
-    gword_weight.Slice(start, end) += dot(pword.FlatTo2D().T(),
-                                          src[t].FlatTo2D());
-    // gL/gclass_weight
-    gclass_weight += dot(pclass[t].FlatTo2D().T(),
-                         src[t].FlatTo2D());
-
-    gsrc[t] = dot(pword, word_weight.Slice(start, end));
-    gsrc[t] += dot(pclass[t], class_weight);
-  }
-}
-
-const std::string LossLayer::ToString(bool debug, int flag) {
-  float loss = loss_ / num_;
-  float ppl = exp10(- ppl_ / num_);
-  loss_ = 0;
-  num_ = 0;
-  ppl_ = 0;
-  return "loss = " + std::to_string(loss) + ", ppl = " + std::to_string(ppl);
-}
-}   // end of namespace rnnlm
diff --git a/examples/rnnlm/rnnlm.h b/examples/rnnlm/rnnlm.h
deleted file mode 100644
index 0e415e3..0000000
--- a/examples/rnnlm/rnnlm.h
+++ /dev/null
@@ -1,158 +0,0 @@
-/************************************************************
-*
-* Licensed to the Apache Software Foundation (ASF) under one
-* or more contributor license agreements.  See the NOTICE file
-* distributed with this work for additional information
-* regarding copyright ownership.  The ASF licenses this file
-* to you under the Apache License, Version 2.0 (the
-* "License"); you may not use this file except in compliance
-* with the License.  You may obtain a copy of the License at
-*
-*   http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an
-* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-* KIND, either express or implied.  See the License for the
-* specific language governing permissions and limitations
-* under the License.
-*
-*************************************************************/
-
-#ifndef EXAMPLES_RNNLM_RNNLM_H_
-#define EXAMPLES_RNNLM_RNNLM_H_
-
-#include <string>
-#include <vector>
-#include "singa/singa.h"
-#include "./rnnlm.pb.h"
-
-namespace rnnlm {
-using std::vector;
-using singa::LayerProto;
-using singa::Layer;
-using singa::Param;
-using singa::Blob;
-using singa::Metric;
-/**
- * Base RNN layer. May make it a base layer of SINGA.
- */
-class RNNLayer : virtual public singa::Layer {
- public:
-  /**
-   * The recurrent layers may be unrolled different times for different
-   * iterations, depending on the applications. For example, the ending word
-   * of a sentence may stop the unrolling; unrolling also stops when the max
-   * window size is reached. Every layer must reset window_ in its
-   * ComputeFeature function.
-   *
-   * @return the effective BPTT length, which is <= max_window.
-   */
-  inline int window() { return window_; }
-
- protected:
-  //!< effect window size for BPTT
-  int window_;
-};
-
-/**
- * Input layer that get read records from data shard
- */
-class DataLayer : public RNNLayer, public singa::InputLayer {
- public:
-  ~DataLayer();
-  void Setup(const LayerProto& conf, const vector<Layer*>& srclayers) override;
-  void ComputeFeature(int flag, const vector<Layer*>& srclayers) override;
-  int max_window() const {
-    return max_window_;
-  }
-
- private:
-  int max_window_;
-  singa::io::Store* store_ = nullptr;
-};
-
-
-/**
- * LabelLayer that read records_[1] to records_[window_] from DataLayer to
- * offer label information
-class LabelLayer : public RNNLayer {
- public:
-  void Setup(const LayerProto& conf, const vector<Layer*>& srclayers) override;
-  void ComputeFeature(int flag, const vector<Layer*>& srclayers) override;
-  void ComputeGradient(int flag, const vector<Layer*>& srclayers) override {}
-};
- */
-
-
-/**
- * Word embedding layer that get one row from the embedding matrix for each
- * word based on the word index
- */
-class EmbeddingLayer : public RNNLayer {
- public:
-  ~EmbeddingLayer();
-  void Setup(const LayerProto& conf, const vector<Layer*>& srclayers) override;
-  void ComputeFeature(int flag, const vector<Layer*>& srclayers) override;
-  void ComputeGradient(int flag, const vector<Layer*>& srclayers) override;
-  const std::vector<Param*> GetParams() const override {
-    std::vector<Param*> params{embed_};
-    return params;
-  }
-
-
- private:
-  int word_dim_;
-  int vocab_size_;
-  //!< word embedding matrix of size vocab_size_ x word_dim_
-  Param* embed_;
-};
-
-
-/**
- * hid[t] = sigmoid(hid[t-1] * W + src[t])
- */
-class HiddenLayer : public RNNLayer {
- public:
-  ~HiddenLayer();
-  void Setup(const LayerProto& conf, const vector<Layer*>& srclayers) override;
-  void ComputeFeature(int flag, const vector<Layer*>& srclayers) override;
-  void ComputeGradient(int flag, const vector<Layer*>& srclayers) override;
-
-  const std::vector<Param*> GetParams() const override {
-    std::vector<Param*> params{weight_};
-    return params;
-  }
-
-
- private:
-  Param* weight_;
-};
-
-/**
- * p(word at t+1 is from class c) = softmax(src[t]*Wc)[c]
- * p(w|c) = softmax(src[t]*Ww[Start(c):End(c)])
- * p(word at t+1 is w)=p(word at t+1 is from class c)*p(w|c)
- */
-class LossLayer : public RNNLayer {
- public:
-  ~LossLayer();
-  void Setup(const LayerProto& conf, const vector<Layer*>& srclayers) override;
-  void ComputeFeature(int flag, const vector<Layer*>& srclayers) override;
-  void ComputeGradient(int flag, const vector<Layer*>& srclayers) override;
-
-  const std::string ToString(bool debug, int flag) override;
-  const std::vector<Param*> GetParams() const override {
-    std::vector<Param*> params{word_weight_, class_weight_};
-    return params;
-  }
-
- private:
-  std::vector<Blob<float>> pword_;
-  Blob<float> pclass_;
-  Param* word_weight_, *class_weight_;
-  float loss_, ppl_;
-  int num_;
-};
-}  // namespace rnnlm
-#endif  // EXAMPLES_RNNLM_RNNLM_H_
diff --git a/examples/rnnlm/rnnlm.proto b/examples/rnnlm/rnnlm.proto
deleted file mode 100644
index 4a4dcbc..0000000
--- a/examples/rnnlm/rnnlm.proto
+++ /dev/null
@@ -1,53 +0,0 @@
-/************************************************************
-*
-* Licensed to the Apache Software Foundation (ASF) under one
-* or more contributor license agreements.  See the NOTICE file
-* distributed with this work for additional information
-* regarding copyright ownership.  The ASF licenses this file
-* to you under the Apache License, Version 2.0 (the
-* "License"); you may not use this file except in compliance
-* with the License.  You may obtain a copy of the License at
-*
-*   http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an
-* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-* KIND, either express or implied.  See the License for the
-* specific language governing permissions and limitations
-* under the License.
-*
-*************************************************************/
-
-import "job.proto";
-import "common.proto";
-
-message EmbeddingProto {
-  optional int32 word_dim = 1;
-  optional int32 vocab_size = 2;
-}
-
-message LossProto {
-  optional int32 nclass = 1;
-  optional int32 vocab_size = 2;
-}
-
-message DataProto {
-  required string path = 1;
-  optional int32 max_window = 2;
-  optional string backend = 3 [default = "kvfile"];
-}
-
-extend singa.LayerProto {
-  optional EmbeddingProto embedding_conf = 1001;
-  optional LossProto loss_conf = 1002;
-  optional DataProto data_conf = 1003;
-}
-
-message WordRecord {
-  optional string word = 1;
-  optional int32 word_index = 2;
-  optional int32 class_index = 3;
-  optional int32 class_start = 4;
-  optional int32 class_end = 5;
-}
diff --git a/include/mshadow/cuda/cuda_reduce.cuh b/include/mshadow/cuda/cuda_reduce.cuh
deleted file mode 100644
index b7808a6..0000000
--- a/include/mshadow/cuda/cuda_reduce.cuh
+++ /dev/null
@@ -1,117 +0,0 @@
-#ifndef MSHADOW_CUDA_REDUCE_CUH
-#define MSHADOW_CUDA_REDUCE_CUH
-/*!
- * \file cuda_reduce.cuh
- * \brief helper functions to do reduction
- * \author Tianqi Chen
- */
-namespace mshadow{
-    namespace cuda{
-        /*
-         * \brief reduce over the dimension x
-         * \tparam Reducer reducer
-         * \tparam x_bits dimension = 1<<x_bits
-         */
-        template<typename Reducer,int x_bits>
-        inline __device__ void Reduce1D( volatile real_t buf[1<<x_bits] );
-        /*
-         * \brief reduce over the dimension x
-         * \tparam Reducer reducer
-         * \tparam xmax_bits maximum size of buffer
-         * \param xsize size of x dimension, not sure if aligned
-         */
-        template<typename Reducer, int xmax_bits>
-        inline __device__ void Reduce1DNotAlign( volatile real_t buf[1<<xmax_bits], int xsize );
-    };
-};
-
-// ===============================================x===
-//  implementations afterwards, 
-//  no need to read if only use the functions
-// --------------------------------------------------
-#ifdef  __DEVICE_EMULATION__
-#define __MSHADOW_EMUSYNC__ __syncthreads()
-#else
-#define __MSHADOW_EMUSYNC__ 
-#endif
-
-namespace mshadow{
-    namespace cuda{        
-        template<typename Reducer, int x_bits>
-        inline __device__ void ReduceX( volatile real_t buf[], int tid ){
-            if( x_bits >= 10 ){
-                if( tid < 512 ) Reducer::Reduce( buf[tid] , buf[tid + 512] );
-                __syncthreads(); 
-            }
-            if( x_bits >= 9 ){
-                if( tid < 256 ) Reducer::Reduce( buf[tid] , buf[tid + 256] );
-                __syncthreads(); 
-            }
-            if( x_bits >= 8 ){
-                if( tid < 128 ) Reducer::Reduce( buf[tid] , buf[tid + 128] );
-                __syncthreads(); 
-            }
-            if( x_bits >= 7 ){
-                if( tid < 64  ) Reducer::Reduce( buf[tid] , buf[tid + 64 ] );
-                __syncthreads(); 
-            }            
-            if( x_bits >= 6 ){
-                if( tid < 32 ) Reducer::Reduce( buf[tid] , buf[tid + 32] );
-                __syncthreads();
-            }
-            // in warp optimization
-            if( x_bits >= 5 ){
-                if( tid < 16 ) Reducer::Reduce( buf[tid] , buf[tid + 16] );
-                __MSHADOW_EMUSYNC__;
-            }
-            if( x_bits >= 4 ){
-                if( tid < 8 ) Reducer::Reduce( buf[tid] , buf[tid + 8 ] );
-                __MSHADOW_EMUSYNC__;            
-            }
-            if( x_bits >= 3 ){
-                if( tid < 4 ) Reducer::Reduce( buf[tid] , buf[tid + 4 ] );
-                __MSHADOW_EMUSYNC__;
-            }
-            if( x_bits >= 2 ){
-                if( tid < 2 ) Reducer::Reduce( buf[tid] , buf[tid + 2 ] );
-                __MSHADOW_EMUSYNC__;
-            }
-            if( x_bits >= 1 ){
-                if( tid < 1 ) Reducer::Reduce( buf[tid] , buf[tid + 1 ] );
-                __MSHADOW_EMUSYNC__;
-            }  
-        };
-        
-        template<typename Reducer,int x_bits>
-        inline __device__ void Reduce1D( volatile real_t buf[1<<x_bits] ){
-            ReduceX<Reducer,x_bits>( buf, threadIdx.x );
-        }
-
-        // reduce with a upper bound
-        #define __RD_NON_ALIGN(els,x_bits)                              \
-            els                                                         \
-            if( xmax_bits >= x_bits && x_size >= (1 << x_bits) ){       \
-                if( tid < (1 << x_bits) && tid + (1<<x_bits) < x_size ){ \
-                    Reducer::Reduce( buf[tid] , buf[tid + (1<<x_bits)] ); \
-                }                                                       \
-                __syncthreads();                                        \
-                ReduceX<Reducer, x_bits>( buf, tid );                   \
-            }                                                           \
-            
-        template<typename Reducer, int xmax_bits>
-        inline __device__ void Reduce1DNotAlign( volatile real_t buf[], int x_size ){
-            int tid = threadIdx.x;
-            __RD_NON_ALIGN(, 8)
-            __RD_NON_ALIGN(else, 7)
-            __RD_NON_ALIGN(else, 6)
-            __RD_NON_ALIGN(else, 5) 
-            __RD_NON_ALIGN(else, 4) 
-            __RD_NON_ALIGN(else, 3) 
-            __RD_NON_ALIGN(else, 2) 
-            __RD_NON_ALIGN(else, 1)                     
-        }
-    };
-};
-
-#endif // MSHADOW_CUDA_REDUCE_CUH
-
diff --git a/include/mshadow/cuda/tensor_gpu-inl.cuh b/include/mshadow/cuda/tensor_gpu-inl.cuh
deleted file mode 100644
index 61e477c..0000000
--- a/include/mshadow/cuda/tensor_gpu-inl.cuh
+++ /dev/null
@@ -1,231 +0,0 @@
-#ifndef MSHADOW_TENSOR_GPU_INL_CUH
-#define MSHADOW_TENSOR_GPU_INL_CUH
-/*!
- * \file tensor_gpu-inl.cuh
- * \brief implementation of GPU code using CUDA
- * \author Bing Xu, Tianqi Chen
- */
-#include "../tensor.h"
-#include "cuda_reduce.cuh"
-
-namespace mshadow{
-    namespace cuda{
-        #ifndef __CUDA_ARCH__
-        #warning "__CUDA_ARCH__ is not defined, I will assume compiling with CUDA verion greater than 2.0"
-        #endif
-        /* load unit for memory access */
-        #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 200
-        const int kMemUnitBits = 5;
-        const int kMaxThreadsPerBlock = 1024;
-        #else
-        const int kMemUnitBits = 4;
-        const int kMaxThreadsPerBlock = 512;
-        #endif
-        /*! \brief number of units that can do synchronized update, half warp size */
-        const int kMemUnit     = 1 << kMemUnitBits;
-        /*! \brief mask that could be helpful sometime */
-        const int kMemUnitMask = kMemUnit - 1;
-        /*! \brief suggested thread number(logscale) for mapping kernel */
-        const int kBaseThreadBits = 8;
-        /*! \brief suggested thread number for mapping kernel */
-        const int kBaseThreadNum  = 1 << kBaseThreadBits;
-        /*! \brief maximum value of grid */
-        const int kMaxGridNum     = 65535;
-        /*! \brief suggested grid number for mapping kernel */
-        const int kBaseGridNum    = 1024;
-        
-        /*! \brief get align stride for given size in x dimension */
-        inline index_t GetAlignStride( index_t xsize, index_t xstride ){ 
-            if( (xstride & (kMemUnit-1)) == 0 ){
-                return ( (xsize  + kMemUnit - 1) >> kMemUnitBits) << kMemUnitBits;
-            }else{
-                // if originally space is not aligned, no necessary to to alligned thread allocation
-                return xsize;
-            }
-        }
-        inline void CheckLaunchParam( dim3 dimGrid, dim3 dimBlock, const char *estr = "" ){
-            if( dimBlock.x*dimBlock.y*dimBlock.z > (unsigned)kMaxThreadsPerBlock ||
-                dimGrid.x > 65535 || dimGrid.y > 65535 ){
-                fprintf( stderr, "%s[%u,%u,%u]:", estr, dimBlock.x, dimBlock.y, dimBlock.z );
-                utils::Error( "too large launch parameter\n");
-            } 
-        }        
-    };
-
-    namespace cuda {
-        template<typename Saver, typename Plan, int block_dim_bits>
-        __device__ void MapPlanProc( Tensor<gpu,2> dst, const index_t xstride, const Plan exp, int block_idx ){
-            const index_t tid = (block_idx << block_dim_bits) + threadIdx.x;
-            const int y   = tid / xstride;
-            const int x   = tid % xstride;
-            if (y < dst.shape[1] && x < dst.shape[0]) {
-                Saver::Save(dst[y][x], exp.Eval(y,x));
-            }
-        }
-        template<typename Saver, typename Plan, int block_dim_bits>
-        __global__ void MapPlanKernel( Tensor<gpu,2> dst, const index_t xstride, const Plan exp ){
-            MapPlanProc<Saver, Plan,block_dim_bits>( dst, xstride, exp, blockIdx.x );
-        }
-        template<typename Saver, typename Plan, int block_dim_bits, int grid_size>
-        __global__ void MapPlanLargeKernel( Tensor<gpu,2> dst, const index_t xstride, const Plan exp, int repeat ){
-            for( int i = 0; i < repeat; ++i ){
-                MapPlanProc<Saver, Plan,block_dim_bits>( dst, xstride, exp, blockIdx.x + i*grid_size );
-            }
-        }        
-        
-        template<typename Saver, typename E>
-        inline void MapPlan( Tensor<gpu,2> dst, const expr::Plan<E> &plan ){
-            const index_t xstride = GetAlignStride( dst.shape[0], dst.shape.stride_ );
-            const int num_block = ( dst.shape[1]*xstride + kBaseThreadNum-1) / kBaseThreadNum;
-            dim3 dimBlock(kBaseThreadNum, 1, 1);
-
-            if (num_block < kMaxGridNum) {
-                dim3 dimGrid(num_block, 1, 1);
-                MapPlanKernel<Saver, expr::Plan<E>, kBaseThreadBits>   \
-                    <<<dimGrid,dimBlock>>>(dst, xstride, plan);
-            } else {
-                int repeat = (num_block + kBaseGridNum-1) / kBaseGridNum;
-                dim3 dimGrid( kBaseGridNum, 1 , 1 );
-                MapPlanLargeKernel<Saver,expr::Plan<E>, kBaseThreadBits, kBaseGridNum> \
-                    <<<dimGrid,dimBlock>>>(dst, xstride, plan, repeat );
-            }
-        }        
-    }; // namespace cuda
-    
-    namespace cuda{
-        template<typename Saver,typename Reducer, int warp_bits, typename Plan>
-        __global__ void MapRedKeepLowestKernel( Tensor<gpu,1> dst, Plan plan, real_t scale, Shape<2> eshape ){
-            const unsigned warp_size = 1 << warp_bits;
-            const unsigned x = (blockIdx.x<<warp_bits) + threadIdx.x;
-            // to avoid bank conflict
-            __shared__ real_t s_res[ warp_size ][ warp_size + 1 ];
-
-            // note: reverse store [y][x], so that we can reduce over threadIdx.x, use warp optimization
-            if( threadIdx.y < eshape[1] && x < eshape[0] ){
-                s_res[ threadIdx.x ][ threadIdx.y ] = plan.Eval( threadIdx.y, x );
-            }
-            for( unsigned y = warp_size; y < eshape[1]; y += warp_size ){
-                if( threadIdx.y + y < eshape[1] && x < eshape[0] ){
-                    Reducer::Reduce( s_res[ threadIdx.x ][ threadIdx.y ], plan.Eval( threadIdx.y + y, x ) );
-                }
-            } 
-            __syncthreads();
-            if( eshape[1] >= warp_size ){
-                Reduce1D<Reducer,warp_bits>( s_res[ threadIdx.y ] );
-            }else{
-                Reduce1DNotAlign<Reducer,warp_bits>( s_res[ threadIdx.y ], eshape[1] );
-            }
-            __syncthreads();            
-            
-            if( threadIdx.y == 0 && x < eshape[0] ){
-                Saver::Save( dst[x],  s_res[ threadIdx.x ][ 0 ] * scale );
-            } 
-        }        
-        
-        template<typename Saver, typename Reducer, typename E>
-        inline void MapReduceKeepLowest( Tensor<gpu,1> dst, const expr::Plan<E> &plan, real_t scale, Shape<2> eshape ){
-            dim3 dimBlock( kMemUnit, kMemUnit );
-            dim3 dimGrid ( (eshape[0]+kMemUnit-1) >> kMemUnitBits );
-            CheckLaunchParam( dimGrid, dimBlock, "MapRedKeepLowestKernel" );
-            MapRedKeepLowestKernel<Saver,Reducer,kMemUnitBits><<<dimGrid,dimBlock>>>( dst, plan, scale, eshape );
-        } 
-    }; // namespace cuda
-    
-    namespace cuda{
-        template<typename Saver,typename Reducer, int block_dim_bits, typename Plan>
-        __global__ void MapReduceKeepDim2Kernel( Tensor<gpu,1> dst, Plan plan, real_t scale, Shape<4> pshape ){
-            const int block_size = 1 << block_dim_bits;
-            __shared__ real_t s_rec[ block_size ];
-            const int c = blockIdx.x;            
-            const index_t tot = pshape[0]*pshape[1]*pshape[3];
-
-            real_t res = Reducer::kInitV;
-            for( index_t i_offset = 0; i_offset < tot; i_offset += block_size ){
-                index_t i = i_offset + threadIdx.x;
-                if( i< tot ){
-                    const index_t x = i % pshape[0];
-                    i /= pshape[0]; 
-                    const index_t y = i % pshape[1];
-                    const index_t n = i / pshape[1];
-                    Reducer::Reduce( res, plan.Eval( (n*pshape[2] + c) * pshape[1] + y, x ) );
-                }
-            }                
-            s_rec[ threadIdx.x ] = res;
-            __syncthreads();
-            Reduce1D<Reducer,block_dim_bits>( s_rec );
-            if( threadIdx.x == 0 ){
-                Saver::Save( dst[c], s_rec[0]*scale );
-            }
-        }
-
-        template<typename Saver, typename Reducer, typename Plan>
-        inline void MapReduceKeepDim2( Tensor<gpu,1> dst, const Plan &plan, real_t scale, Shape<4> pshape ){  
-            dim3 dimBlock( kBaseThreadNum );
-            dim3 dimGrid ( dst.shape[0] );
-            CheckLaunchParam( dimGrid, dimBlock, "MapReduceKeepDim2" );
-            MapReduceKeepDim2Kernel<Saver,Reducer,kBaseThreadBits>
-                <<<dimGrid,dimBlock>>>( dst, plan, scale, pshape );
-        }
-    };
-    
-    namespace cuda{
-        template<int x_bits>        
-        __global__ void SoftmaxKernel( Tensor<gpu,2> dst, Tensor<gpu,2> src ){
-            const unsigned x_size = 1 << x_bits;  
-            const int y = blockIdx.x;
-            __shared__ real_t s_rec[ x_size ];
-            
-            // step 1: get max
-            if( threadIdx.x < dst.shape[ 0 ] ){
-                s_rec[ threadIdx.x ] = src[ y ][ threadIdx.x ] ; 
-            }
-            for( unsigned x = x_size; x < dst.shape[0]; x += x_size ){
-                if( x + threadIdx.x < dst.shape[0] ){
-                    real_t a = src[ y ][ x + threadIdx.x ];
-                    s_rec[ threadIdx.x ] = max( a, s_rec[ threadIdx.x] );
-                }
-            }
-            __syncthreads();
-            if( threadIdx.x >= dst.shape[0] ){
-                s_rec[ threadIdx.x ] = s_rec[0];
-            }
-            __syncthreads();
-            Reduce1D<red::maximum,x_bits>( s_rec );
-            __syncthreads();
-            real_t smax = s_rec[0];            
-            __syncthreads();
-            s_rec[ threadIdx.x ] = 0.0f;
-            __syncthreads();
-
-            // calculate normalizer, with writeback
-            for( unsigned x = 0; x < dst.shape[0]; x += x_size ){
-                if( x + threadIdx.x < dst.shape[0] ){
-                    real_t p = expf( src[ y ][ x + threadIdx.x ] - smax );
-                    s_rec[ threadIdx.x ] += p;
-                    // write back first, will fetch later
-                    dst[ y ][ x + threadIdx.x ] = p;
-                }
-            }
-            // calculate normalizer
-            __syncthreads();
-            Reduce1D<red::sum,x_bits>( s_rec );
-            __syncthreads();
-            real_t ssum = s_rec[0];
-
-            for( unsigned x = 0; x < dst.shape[0]; x += x_size ){
-                if( x + threadIdx.x < dst.shape[0] ){
-                    dst[ y ][ x + threadIdx.x ] /= ssum;
-                }
-            }
-        }
-    
-        inline void Softmax( Tensor<gpu,2> &dst, const Tensor<gpu,2> &src ){
-            dim3 dimBlock( kBaseThreadNum );
-            dim3 dimGrid ( dst.shape[1] );
-            utils::Assert( dst.shape == src.shape, "Softmax: shape mismatch" );
-            CheckLaunchParam( dimGrid, dimBlock, "Softmax" );
-            SoftmaxKernel<kBaseThreadBits><<<dimGrid,dimBlock>>>( dst, src );
-        }
-    }; // namespace cuda
-}; // namespace mshadow
-#endif // TENSOR_GPU_INL_H
diff --git a/include/mshadow/cxxnet_op.h b/include/mshadow/cxxnet_op.h
deleted file mode 100644
index 1422070..0000000
--- a/include/mshadow/cxxnet_op.h
+++ /dev/null
@@ -1,127 +0,0 @@
-#ifndef CXXNET_OP_H
-#define CXXNET_OP_H
-#pragma once
-/*!
- * \file cxxnet_op.h
- * \brief extra mshadow operation for cxxnet
- * \author Bing Xu
- */
-#include "mshadow/tensor.h"
-
-namespace mshadow {
-    /*! \brief operations for algorithm */
-    namespace op {
-        struct sigmoid {
-            MSHADOW_XINLINE static real_t Map(real_t a) {
-                return 1.0f / (1.0f + expf(-a));
-            }
-        };
-        struct sigmoid_grad {
-            MSHADOW_XINLINE static real_t Map(real_t a) {
-                return a * ( 1.0f - a );
-            }
-        };
-
-        /*! \brief Rectified Linear Operation */
-        struct relu {
-            MSHADOW_XINLINE static real_t Map(real_t a) {
-                using namespace std;
-                return max( a, 0.0f );
-            }
-        };
-        struct relu_grad {
-            MSHADOW_XINLINE static real_t Map(real_t a) {
-                return a > 0.0f ? 1.0f : 0.0f;
-            }
-        };
-
-        struct tanh {
-            MSHADOW_XINLINE static real_t Map(real_t a) {
-                return tanhf( a );
-            }
-        };
-        struct tanh_grad {
-            MSHADOW_XINLINE static real_t Map(real_t a) {
-                return 1.0f - a * a;
-            }
-        };
-        struct softplus {
-            MSHADOW_XINLINE static real_t Map(real_t a) {
-                return logf(1 + expf(a));
-            }
-        };
-        struct softplus_grad {
-            MSHADOW_XINLINE static real_t Map(real_t a) {
-                return 1.0f / (1.0f + expf(-a));
-            }
-        };
-        struct bnll {
-            MSHADOW_XINLINE static real_t Map(real_t a) {
-                return a > 0.0f ? a + logf(1.0f + expf(-a)) : logf(1.0f + expf(a));
-            }
-        };
-        struct bnll_grad {
-            MSHADOW_XINLINE static real_t Map(real_t a) {
-                real_t expval = a > 50.0f ? 50.0f : a; // kBNLL_THRESHOLD = 50.0f
-                expval = expf(-expval);
-                return 1.0f / (1.0f + expval);
-            }
-        };
-
-        struct square {
-            MSHADOW_XINLINE static real_t Map(real_t a) {
-                return a * a;
-            }
-        };
-       /*! \brief scaled tanh, hard code the scale factor*/
-        struct stanh {
-            MSHADOW_XINLINE static real_t Map(real_t a) {
-              return  1.7159047*tanhf(0.66666667 *a);
-            }
-        };
-        /*! \breif back prop for scaled tanh: */
-        struct stanh_grad {
-            MSHADOW_XINLINE static real_t Map(real_t a) {
-                return 0.66666667*1.7159047 -0.66666667/1.7159047*a*a;
-            }
-        };
-
-        struct abs{
-            MSHADOW_XINLINE static real_t Map(real_t a) {
-                return a < 0 ? -a : a;
-            }
-        };
-
-    }; //namespace op
-
-}; //namespace mshadow
-
-namespace mshadow {
-    namespace op {
-        /*! \brief used for generate Bernoulli mask */
-        struct threshold {
-            MSHADOW_XINLINE static real_t Map(real_t a, real_t b) {
-                return a < b ? 1.0f : 0.0f;
-            }
-        };
-
-        /*! \brief used for generate element of power */
-        struct power {
-            MSHADOW_XINLINE static real_t Map(real_t a, real_t b) {
-                return powf( a, b );
-            }
-        };
-        struct sqrtop {
-            MSHADOW_XINLINE static real_t Map(real_t a, real_t b) {
-                return sqrt(a+b);
-            }
-        };
-        struct max {
-            MSHADOW_XINLINE static real_t Map(real_t a, real_t b) {
-                return a > b ? a : b;
-            }
-        };
-    }; // namespace op
-}; // namespace mshadow
-
-#endif // CXXNET_OP_H
diff --git a/include/mshadow/tensor.h b/include/mshadow/tensor.h
deleted file mode 100644
index 42d13d3..0000000
--- a/include/mshadow/tensor.h
+++ /dev/null
@@ -1,472 +0,0 @@
-#ifndef MSHADOW_TENSOR_H
-#define MSHADOW_TENSOR_H
-/*!
- * \file tensor.h
- * \brief header file of tensor data structure and functions
- *        covention: this lib requires explicit memory allocation and de-allocation
- *                   all the data structure Tensor<cpu,1>, Tensor<gpu,1> are like handles(pointers),
- *                   no memory allocation is happening during calculation
- * \author Bing Xu, Tianqi Chen
- */
-#include "tensor_base.h"
-#include "tensor_expr.h"
-
-namespace mshadow {
-    /*!
-     * \brief shape of a tensor
-     *       IMPORTANT NOTE: this shape is different from numpy.shape
-     *       shape[0] gives the lowest dimension, shape[dimension-1] gives the highest dimension
-     *       shape[k] corresponds to k-th dimension of tensor
-     * \tparam dimension dimension of tensor
-     */
-    template<int dimension>
-    struct Shape {
-    public:
-        /*! \brief maximum dimension of tensor */
-        const static int kMaxShape = dimension;
-        /*! \brief maximum dimension minus 1 */
-        const static int kSubShape = dimension - 1;
-    public:
-        /*! \brief default constructor, do nothing */
-        MSHADOW_XINLINE Shape(void) {}
-        /*! \brief constuctor */
-        MSHADOW_XINLINE Shape( const Shape<dimension> &s ){
-            #pragma unroll
-            for( int i = 0; i < kMaxShape; ++i ){
-                this->shape_[i] = s[i];
-            }
-            this->stride_ = s.stride_;
-        }
-        /*!
-         * \brief get corresponding index
-         * \param idx dimension index
-         * \return the corresponding dimension size
-         */
-        MSHADOW_XINLINE index_t& operator[](index_t idx) {
-            return shape_[ idx ];
-        }
-        /*!
-         * \brief get corresponding index
-         * \param idx dimension index
-         * \return the corresponding dimension size
-         */
-        MSHADOW_XINLINE const index_t& operator[](index_t idx) const {
-            return shape_[ idx ];
-        }
-        /*! \return whether two shape equals */
-        MSHADOW_XINLINE bool operator==(const Shape<kMaxShape> &s) const {
-            #pragma unroll
-            for ( int i = 0; i < kMaxShape; ++i ) {
-                if (s.shape_[i] != this->shape_[i]) return false;
-            }
-            return true;
-        }
-        /*!
-         * flatten the higher dimension to second dimension, return a 2D shape
-         * \return the flat 2d shape
-         */
-        MSHADOW_XINLINE Shape<2> FlatTo2D(void) const {
-            Shape<2> s;
-            s.stride_ = this->stride_;
-            s.shape_[ 0 ] = this->shape_[ 0 ];
-            index_t ymax = 1;
-
-            #pragma unroll
-            for (int i = 1; i < kMaxShape; ++i) {
-                ymax *= this->shape_[ i ];
-            }
-            s.shape_[1] = ymax;
-            return s;
-        }
-        /*! \return number of valid elements */
-        MSHADOW_XINLINE size_t Size(void) const{
-            size_t memsz = this->shape_[ 0 ];
-            #pragma unroll
-            for (int i = 1; i < kMaxShape; ++i) {
-                memsz *= this->shape_[ i ];
-            }
-            return memsz;
-        }
-        /*! \return memory size, including the aligned x dimension */
-        MSHADOW_XINLINE size_t MSize(void) const {
-            size_t memsz = this->stride_;
-            #pragma unroll
-            for (int i = 1; i < kMaxShape; ++i) {
-                memsz *= this->shape_[ i ];
-            }
-            return memsz;
-        }
-        /*!
-         * \return product shape in [dimstart,dimend)
-         * \param dimstart start dimension
-         * \param dimend   end dimension
-         */
-        MSHADOW_XINLINE index_t ProdShape( int dimstart, int dimend ) const{
-            index_t num = 1;
-            #pragma unroll
-            for (int i = dimstart; i < dimend; ++i) {
-                num *= this->shape_[ i ];
-            }
-            return num;
-        }
-        /*!
-         * \brief get subshape
-         * \return subshape
-         */
-        MSHADOW_XINLINE Shape<kSubShape> SubShape(void) const {
-            Shape<kSubShape> s;
-            s.stride_ = this->stride_;
-            // for cuda
-            #pragma unroll
-            for (int i = 0; i < kSubShape; ++i) {
-                s.shape_[ i ] = this->shape_[ i ];
-            }
-            return s;
-        }
-
-    public:
-        /*! \brief storing the dimension information */
-        index_t shape_[ kMaxShape ];
-        /*!
-         * \brief storing the stride information in x dimension
-         *    this is used to deal with pitch allocation in gpu or sse(align x dimension to 64bit) for efficiency
-         */
-        index_t stride_;
-    };
-    // useful construction functions to generate shape
-    /*!
-     * \brief construct a one dimension shape, stride will equal s0
-     * \param s0 size of dimension 0
-     * \return the shape construction
-     */
-    MSHADOW_XINLINE Shape<1> Shape1( index_t s0 ){
-        Shape<1> s; s[0] = s0; s.stride_ = s0;
-        return s;
-    }
-    /*!
-     * \brief construct a two dimension shape, stride will equal s0
-     * \param s1 size of dimension 1
-     * \param s0 size of dimension 0
-     * \return the shape construction
-     */
-    MSHADOW_XINLINE Shape<2> Shape2( index_t s1, index_t s0 ){
-        Shape<2> s; s[0] = s0; s[1] = s1; s.stride_ = s0;
-        return s;
-    }
-    /*!
-     * \brief construct a three dimension shape, stride will equal s0
-     * \param s2 size of dimension 2
-     * \param s1 size of dimension 1
-     * \param s0 size of dimension 0
-     * \return the shape construction
-     */
-    MSHADOW_XINLINE Shape<3> Shape3( index_t s2, index_t s1, index_t s0 ){
-        Shape<3> s;
-        s[0] = s0; s[1] = s1; s[2] = s2; s.stride_ = s0;
-        return s;
-    }
-    /*!
-     * \brief construct a four dimension shape, stride will equal s0
-     * \param s3 size of dimension 3
-     * \param s2 size of dimension 2
-     * \param s1 size of dimension 1
-     * \param s0 size of dimension 0
-     * \return the shape construction
-     */
-    MSHADOW_XINLINE Shape<4> Shape4( index_t s3, index_t s2, index_t s1, index_t s0 ){
-        Shape<4> s;
-        s[0] = s0; s[1] = s1; s[2] = s2; s[3] = s3; s.stride_ = s0;
-        return s;
-    }
-}; // namespace mshadow
-
-namespace mshadow {
-    /*! \brief device name CPU */
-    struct cpu {
-        /*! \brief whether this device is CPU or not */
-        const static bool kDevCPU = true;
-        /*! \brief device flag number, identifies this device */
-        const static int kDevMask = 1<<0;
-    };
-    /*! \brief device name CPU */
-    struct gpu {
-        /*! \brief whether this device is CPU or not */
-        const static bool kDevCPU = false;
-        /*! \brief device flag number, identifies this device */
-        const static int kDevMask = 1<<1;
-    };
-
-    // more compact template
-    /*!
-     * \brief general tensor
-     * \tparam Device which device the tensor is on
-     * \tparam dimension dimension of the tensor
-     */
-    template<typename Device, int dimension>
-    struct Tensor: public expr::ContainerExp< Tensor<Device,dimension> >{
-    public:
-        /*! \brief whether current type lies in cpu */
-        const static bool kDevCPU = Device::kDevCPU;
-        /*! \brief dimension of subtype */
-        const static int  kSubdim = dimension - 1;
-
-    public:
-        /*! \brief pointer to the data */
-        real_t *dptr;
-        /*! \brief shape of the tensor */
-        Shape<dimension> shape;
-    public:
-        /*! \brief default constructor */
-        MSHADOW_XINLINE Tensor(void) {}
-        /*! \brief constructor from shape  */
-        MSHADOW_XINLINE Tensor(const Shape<dimension> &shape): shape(shape) {}
-        /*! \brief constructor from data pointer and shape  */
-        MSHADOW_XINLINE Tensor(real_t *dptr, const Shape<dimension> &shape): dptr((real_t*)dptr), shape(shape) {}
-        /*!
-         * \brief flatten the tensor to 2 dimension, collapse the higher dimensions together
-         * \return tensor after flatten
-         */
-        MSHADOW_XINLINE Tensor<Device, 2> FlatTo2D(void) const {
-            return Tensor<Device, 2>(reinterpret_cast<real_t*> \
-                                     (dptr), shape.FlatTo2D());
-        }
-        /*!
-         * \brief get a element of dimension - 1
-         * \param idx index
-         * \return the result tensor
-         */
-        MSHADOW_XINLINE Tensor<Device, kSubdim> operator[](index_t idx) const {
-            Shape<kSubdim> s = shape.SubShape();
-            return Tensor<Device, kSubdim>(reinterpret_cast<real_t*> \
-                                           (dptr) + s.MSize() * idx, s);
-        }
-        /*!
-         * \brief slice the tensor in highest dimension [begin,end)
-         * \param begin begin position of slice
-         * \param end end position of slice
-         * \return tensor after slice
-         */
-        MSHADOW_XINLINE Tensor<Device, dimension> Slice(index_t begin, index_t end) const {
-            Shape<dimension> s = this->shape;
-            s[ dimension - 1 ] = end - begin;
-            return Tensor<Device, dimension>(reinterpret_cast<real_t*>\
-                                             (dptr) + s.SubShape().MSize() * begin, s);
-        }
-    public:
-        /*!\brief functions to fit expression template */
-        inline Tensor<Device,dimension>& operator=( real_t s ){
-            return this->__assign( s );
-        }
-        /*!\brief functions to fit expression template */
-        template<typename E>
-        inline Tensor<Device,dimension>& operator=( const expr::Exp<E,expr::type::kMapper> &exp ){
-            return this->__assign( exp );
-        }
-        /*!\brief functions to fit expression template */
-        template<typename E>
-        inline Tensor<Device,dimension>& operator=( const expr::Exp<E,expr::type::kComplex> &exp ){
-            return this->__assign( exp );
-        }
-    };
-
-    /*
-     *  respecialized class Tensor1D,thei is due to different implementation in operator[]
-     */
-    template<typename Device>
-    struct Tensor<Device,1>: public expr::ContainerExp< Tensor<Device,1> >{
-    public:
-        real_t *dptr;
-        Shape<1> shape;
-    public:
-        MSHADOW_XINLINE Tensor(void) {}
-        MSHADOW_XINLINE Tensor(const Shape<1> &shape): shape(shape) {}
-        MSHADOW_XINLINE Tensor(real_t *dptr, Shape<1> shape) :dptr(dptr), shape(shape) {}
-
-        MSHADOW_XINLINE Tensor<Device, 2> FlatTo2D(void) const {
-            return Tensor<Device, 2>(reinterpret_cast<real_t*> \
-                                     (dptr), shape.FlatTo2D());
-        }
-        MSHADOW_XINLINE Tensor<Device, 1> Slice(index_t begin, index_t end) const {
-            Shape<1> s;
-            s[0] = s.stride_ = end  - begin;
-            return Tensor<Device, 1>(reinterpret_cast<real_t*> \
-                                     (dptr) + begin, s);
-        }
-        MSHADOW_XINLINE real_t &operator[](index_t idx) { return dptr[ idx ]; }
-        MSHADOW_XINLINE const real_t &operator[](index_t idx)const { return dptr[ idx ]; }
-    public:
-        // functions to fit expression template
-        inline Tensor<Device,1>& operator=( double s ){
-            return this->__assign( s );
-        }
-        template<typename E>
-        inline Tensor<Device,1>& operator=( const expr::Exp<E,expr::type::kMapper> &exp ){
-            return this->__assign( exp );
-        }
-        template<typename E>
-        inline Tensor<Device,1>& operator=( const expr::Exp<E,expr::type::kComplex> &exp ){
-            return this->__assign( exp );
-        }
-    };
-}; // namespace mshadow
-
-// add unroll loops for the shape
-namespace mshadow {
-    // function declarations
-    /*!
-     * \brief initialize tensor engine, used to call intialization functions of dependent libs
-     *        this function should be called before all GPU tensor operations,
-     *        for using tensors in CPU, this call is actually not needed
-     * \param device_id GPU device id to be choosed
-     */
-    inline void InitTensorEngine( int device_id=0 );
-    /*!
-     * \brief Shutdown tensor engine,
-     *        this function should be called after all GPU tensor operations,
-     *        for using tensors in CPU, this call is actually not needed
-     */
-    inline void ShutdownTensorEngine( void );
-
-    /*!
-     * \brief CPU/CPU: allocate space for CTensor, according to the shape in the obj
-     *        this function is responsible to set the stride_ in each obj.shape
-     * \tparam dim specify the dim of tensor
-     * \param obj the tensor object, with shape specified
-     * \param pad whether padding dimension 0, to make last dimension aligned,
-     *            padding may help improve efficiency of matrix multiplications
-     *            if true, will allocate space with stride_ that may not equals shape[0]
-     *            if false, will allocate continuous space
-     */
-    template<int dim>
-    inline void AllocSpace(Tensor<cpu,dim> &obj, bool pad = MSHADOW_ALLOC_PAD);
-    /*! \brief refer to comment of cpu ver \sa AllocSpace */
-    template<int dim>
-    inline void AllocSpace(Tensor<gpu,dim> &obj, bool pad = MSHADOW_ALLOC_PAD);
-
-    /*!
-     * \brief CPU/GPU: free the space of tensor, will set obj.dptr to NULL
-     * \tparam dim specify the dim of tensor
-     * \param obj the tensor object
-     */
-    template<int dim>
-    inline void FreeSpace(Tensor<cpu,dim> &obj);
-    /*! \brief refer to comment of cpu ver \sa FreeSpace */
-    template<int dim>
-    inline void FreeSpace(Tensor<gpu,dim> &obj);
-
-    /*!
-     * \brief CPU/GPU: short cut to allocate and initialize a Tensor
-     * \tparam Device device of tensor
-     * \tparam dim dimention of tensor
-     * \param shape: shape of tensor
-     * \param initv: initialization value
-     * \param pad : padding option
-     * \sa AllocSpace
-     */
-    template<typename Device, int dim>
-    inline Tensor<Device,dim> NewTensor(const Shape<dim> &shape, real_t initv, bool pad = MSHADOW_ALLOC_PAD);
-
-    /*!
-     * \brief copy data from one tensor to another, with same shape
-     * \tparam dim specify the dim of tensor
-     * \param dst target tensor
-     * \param src source tensor
-     */
-    template<int dim>
-    inline void Copy(Tensor<cpu,dim> dst, const Tensor<cpu,dim> &src );
-    /*! \brief refer to comment of cpu ver \sa Copy */
-    template<int dim>
-    inline void Copy(Tensor<cpu,dim> dst, const Tensor<gpu,dim> &src );
-    /*! \brief refer to comment of cpu ver \sa Copy */
-    template<int dim>
-    inline void Copy(Tensor<gpu,dim> dst, const Tensor<cpu,dim> &src );
-    /*! \brief refer to comment of cpu ver \sa Copy */
-    template<int dim>
-    inline void Copy(Tensor<gpu,dim> dst, const Tensor<gpu,dim> &src );
-
-
-    /*!
-     * \brief CPU/GPU: normalize softmax: dst[i][j] = exp( energy[i][j] ) /( sum_j exp( energy[i][j] ) )
-     * \param dst destination
-     * \param energy input energy
-     */
-    inline void Softmax( Tensor<cpu,2> dst, const Tensor<cpu,2> &energy );
-    /*! \brief refer to comment of cpu ver \sa Softmax */
-    inline void Softmax( Tensor<gpu,2> dst, const Tensor<gpu,2> &energy );
-
-}; // namespace mshadow
-
-
-namespace mshadow{
-    // function declarations to support expression, no need to understand them
-    // these functions do not need to be directly used
-
-    /*!
-     * \brief CPU/GPU: map a expression to a tensor, this function calls MapPlan
-     * \tparam Saver specify storage method
-     * \tparam dim dim of the tensor, during usage, there is no need to specify this parameter
-     * \tparam E specifies the expression type, not need to specify this parameter during usage
-     * \tparam etype expression type
-     * \param dst destination
-     * \param exp expression
-     * \sa namespace mshadow:sv, mshadow::op, mshadow::expr
-     */
-    template<typename Saver, int dim, typename E, int etype>
-    inline void MapExp(Tensor<cpu,dim> dst, const expr::Exp<E,etype> &exp );
-    /*! \brief refer to comment of cpu ver \sa MapExp */
-    template<typename Saver, int dim, typename E, int etype>
-    inline void MapExp(Tensor<gpu,dim> dst, const expr::Exp<E,etype> &exp );
-
-    /*!
-     * \brief CPU/GPU: map a expression, do reduction to 1D Tensor in lowest dimension (dimension 0)
-     * \tparam Saver specify storage method
-     * \tparam Reducer specify a reducer method
-     * \tparam E specifies the expression type, not need to specify this parameter during usage
-     * \tparam etype expression type
-     * \param dst destination
-     * \param exp expression
-     * \param scale scale the result before save
-     * \sa namespace mshadow:sv, mshadow::op, mshadow::red, mshadow::expr
-     */
-    template<typename Saver, typename Reducer, typename E, int etype>
-    inline void MapReduceKeepLowest( Tensor<cpu,1> dst, const expr::Exp<E,etype> &exp, real_t scale = 1.0f );
-    /*! \brief refer to comment of cpu ver \sa MapReduceKeepLowest */
-    template<typename Saver, typename Reducer, typename E, int etype>
-    inline void MapReduceKeepLowest( Tensor<gpu,1> dst, const expr::Exp<E,etype> &exp, real_t scale = 1.0f );
-
-
-    /*!
-     * \brief CPU/GPU: map a expression, do reduction to 1D Tensor in third dimension (dimension 2)
-     * \tparam Saver specify storage method
-     * \tparam Reducer specify a reducer method
-     * \tparam E specifies the expression type, not need to specify this parameter during usage
-     * \tparam dimkeep the target dimension to be kept, should be larger than 0, for 0, use MapReduceKeepLowest
-     * \tparam etype expression type
-     * \param dst destination
-     * \param exp expression
-     * \param scale scale the result before save
-     * \sa namespace mshadow:sv, mshadow::op, mshadow::red, mshadow::expr
-     */
-    template<typename Saver, typename Reducer, int dimkeep, typename E, int etype>
-    inline void MapReduceKeepHighDim( Tensor<cpu,1> dst, const expr::Exp<E,etype> &exp, real_t scale = 1.0f );
-    /*! \brief refer to comment of cpu ver \sa MapReduceKeepHighDim */
-    template<typename Saver, typename Reducer, int dimkeep, typename E, int etype>
-    inline void MapReduceKeepHighDim( Tensor<gpu,1> dst, const expr::Exp<E,etype> &exp, real_t scale = 1.0f );
-
-};// namespace mshadow
-
-// execution implementation of expression evaluations
-#include "tensor_expr_engine-inl.hpp"
-// cpu implementation of functions
-#include "tensor_cpu-inl.hpp"
-// gpu implementation of functions
-#include "tensor_gpu-inl.hpp"
-// extension of expressions
-#include "tensor_expr_ext.h"
-// io 
-#include "tensor_io.h"
-// container
-#include "tensor_container.h"
-// random number generator
-#include "tensor_random.h"
-#endif // TENSOR_H
diff --git a/include/mshadow/tensor_base.h b/include/mshadow/tensor_base.h
deleted file mode 100644
index b251cba..0000000
--- a/include/mshadow/tensor_base.h
+++ /dev/null
@@ -1,298 +0,0 @@
-#ifndef MSHADOW_TENSOR_BASE_H
-#define MSHADOW_TENSOR_BASE_H
-/*!
- * \file tensor_base.h
- * \brief definitions of base types, macros functions
- *
- * \author Bing Xu, Tianqi Chen
- */
-#include <cmath>
-#include <cstdio>
-#include <cfloat>
-#include <climits>
-#include <algorithm>
-// macro defintiions
-
-/*!\brief if this macro is define to be 1, mshadow should compile without any of other libs */
-#ifndef MSHADOW_STAND_ALONE
-    #define MSHADOW_STAND_ALONE 0
-#endif
-
-/*! \brief whether do padding during allocation */
-#ifndef MSHADOW_ALLOC_PAD
-    #define MSHADOW_ALLOC_PAD true
-#endif
-
-/*! 
- * \brief x dimension of data must be bigger pad_size * ratio to be alloced padded memory, otherwise use tide allocation 
- *        for example, if pad_ratio=2, GPU memory alignement size is 32, then we will only allocate padded memory if x dimension > 64
- *        set it to 0 then we will always allocate padded memory
- */
-#ifndef MSHADOW_MIN_PAD_RATIO
-    #define MSHADOW_MIN_PAD_RATIO 2
-#endif
-
-#if MSHADOW_STAND_ALONE
-   #define MSHADOW_USE_CBLAS 0
-   #define MSHADOW_USE_MKL   0
-   #define MSHADOW_USE_CUDA  0
-#endif
-
-/*! \brief use CBLAS for CBLAS */
-#ifndef MSHADOW_USE_CBLAS
-   #define MSHADOW_USE_CBLAS 0
-#endif
-/*! \brief use MKL for BLAS */
-#ifndef MSHADOW_USE_MKL
-   #define MSHADOW_USE_MKL   1
-#endif
-/*! \brief use CUDA support, must ensure that the cuda include path is correct, or directly compile using nvcc */
-#ifndef MSHADOW_USE_CUDA
-  #define MSHADOW_USE_CUDA   1
-#endif
-/*! \brief use single precition float */
-#ifndef MSHADOW_SINGLE_PRECISION
-  #define MSHADOW_SINGLE_PRECISION 1
-#endif
-/*! \brief whether use SSE */
-#ifndef MSHADOW_USE_SSE
-  #define MSHADOW_USE_SSE 1
-#endif
-/*! \brief whether use NVML to get dynamic info */
-#ifndef MSHADOW_USE_NVML
-  #define MSHADOW_USE_NVML 0
-#endif
-// SSE is conflict with cudacc
-#ifdef __CUDACC__
-  #undef MSHADOW_USE_SSE
-  #define MSHADOW_USE_SSE 0
-#endif
-
-#if MSHADOW_USE_CBLAS
-extern "C"{
-    #include <cblas.h>
-}
-#elif MSHADOW_USE_MKL
-  #include <mkl.h>
-  #include <mkl_cblas.h>
-  #include <mkl_vsl.h>
-  #include <mkl_vsl_functions.h>
-#endif
-
-#if MSHADOW_USE_CUDA
-  #include <cublas.h>
-  #include <curand.h>
-#endif
-
-#if MSHADOW_USE_NVML
-  #include <nvml.h>
-#endif
-// --------------------------------
-// MSHADOW_XINLINE is used for inlining template code for both CUDA and CPU code.
-#ifdef MSHADOW_XINLINE
-  #error "MSHADOW_XINLINE must not be defined"
-#endif
-#ifdef __CUDACC__
-  #define MSHADOW_XINLINE inline __attribute__((always_inline)) __device__ __host__
-#else
-  #define MSHADOW_XINLINE inline __attribute__((always_inline))
-#endif
-/*! \brief cpu force inline */
-#define MSHADOW_CINLINE inline __attribute__((always_inline))
-
-#if defined(__GXX_EXPERIMENTAL_CXX0X) || defined(__GXX_EXPERIMENTAL_CXX0X__) || __cplusplus >= 201103L
-  #define MSHADOW_CONSTEXPR constexpr
-#else
-  #define MSHADOW_CONSTEXPR const
-#endif
-
-/*! \brief namespace for mshadow */
-namespace mshadow {
-    /*! \brief buffer size for each random number generator */
-    const unsigned kRandBufferSize = 1000000;
-    /*! \brief pi  */
-    const float kPi = 3.1415926f;
-
-#if MSHADOW_SINGLE_PRECISION
-    /*! \brief type that will be used for content */
-    typedef float real_t;
-#else
-    typedef double real_t;
-#endif
-    /*! \brief type that will be used for index */
-    typedef unsigned index_t;
-}; // namespace mshadow
-
-namespace mshadow {
-    /*! \brief namespace for operators */
-    namespace op {
-        // binary operator
-        /*! \brief mul operator */
-        struct mul{
-            /*! \brief map a, b to result using defined operation */
-            MSHADOW_XINLINE static real_t Map(real_t a, real_t b) {
-                return a * b;
-            }
-        };
-        /*! \brief plus operator */
-        struct plus {
-            /*! \brief map a, b to result using defined operation */
-            MSHADOW_XINLINE static real_t Map(real_t a, real_t b) {
-                return a + b;
-            }
-        };
-        /*! \brief minus operator */
-        struct minus {
-            /*! \brief map a, b to result using defined operation */
-            MSHADOW_XINLINE static real_t Map(real_t a, real_t b) {
-                return a - b;
-            }
-        };
-        /*! \brief divide operator */
-        struct div {
-            /*! \brief map a, b to result using defined operation */
-            MSHADOW_XINLINE static real_t Map(real_t a, real_t b) {
-                return a / b;
-            }
-        };
-        /*! \brief get rhs */
-        struct right {
-            /*! \brief map a, b to result using defined operation */
-            MSHADOW_XINLINE static real_t Map(real_t a, real_t b) {
-                return b;
-            }
-        };
-    }; // namespace op
-
-    /*! \brief namespace for savers */
-    namespace sv {
-        /*! \brief save to saver: = */
-        struct saveto {
-            /*! \brief save b to a using save method */
-            MSHADOW_XINLINE static void Save(real_t& a, real_t b) {
-                a  = b;
-            }
-            /*! \brief helper constant to use BLAS, alpha */
-            MSHADOW_CONSTEXPR static real_t kAlphaBLAS = 1.0f;
-            /*! \brief helper constant to use BLAS, beta */
-            MSHADOW_CONSTEXPR static real_t kBetaBLAS  = 0.0f;
-            /*! \brief corresponding binary operator type */
-            typedef op::right OPType;
-        };
-        /*! \brief save to saver: += */
-        struct plusto {
-            /*! \brief save b to a using save method */
-            MSHADOW_XINLINE static void Save(real_t& a, real_t b) {
-                a += b;
-            }
-            /*! \brief helper constant to use BLAS, alpha */
-            MSHADOW_CONSTEXPR static real_t kAlphaBLAS = 1.0f;
-            /*! \brief helper constant to use BLAS, beta */
-            MSHADOW_CONSTEXPR static real_t kBetaBLAS  = 1.0f;
-            /*! \brief corresponding binary operator type */
-            typedef op::plus OPType;
-        };
-        /*! \brief minus to saver: -= */
-        struct minusto {
-            /*! \brief save b to a using save method */
-            MSHADOW_XINLINE static void Save(real_t& a, real_t b) {
-                a -= b;
-            }
-            /*! \brief helper constant to use BLAS, alpha */
-            MSHADOW_CONSTEXPR static real_t kAlphaBLAS = -1.0f;
-            /*! \brief helper constant to use BLAS, beta */
-            MSHADOW_CONSTEXPR static real_t kBetaBLAS  = 1.0f;
-            /*! \brief corresponding binary operator type */
-            typedef op::minus OPType;
-        };
-        /*! \brief multiply to saver: *= */
-        struct multo {
-            /*! \brief save b to a using save method */
-            MSHADOW_XINLINE static void Save(real_t& a, real_t b) {
-                a *= b;
-            }
-            /*! \brief corresponding binary operator type */
-            typedef op::mul OPType;
-        };
-        /*! \brief divide to saver: /= */
-        struct divto {
-            /*! \brief save b to a using save method */
-            MSHADOW_XINLINE static void Save(real_t& a, real_t b) {
-                a /= b;
-            }
-            /*! \brief corresponding binary operator type */
-            typedef op::div OPType;
-        };
-    }; // namespace sv
-
-
-    namespace op {
-        // unary operator/ function: example
-        // these operators can be defined by user, in the same style as binary and unary operator
-        // to use, simply write F<op::identity>( src )
-        /*! \brief identity function that maps a real number to it self */
-        struct identity{
-            /*! \brief map a to result using defined operation */
-            MSHADOW_XINLINE static real_t Map(real_t a) {
-                return a;
-            }
-        };
-    }; // namespace op
-
-    /*! \brief namespace for potential reducer operations */
-    namespace red {
-        /*! \brief sum reducer */
-        struct sum {
-            /*! \brief do reduction into dst */
-            MSHADOW_XINLINE static void Reduce( volatile real_t& dst,  volatile real_t src ) {
-                dst += src;
-            }
-            /*! \brief calculate gradient of redres with respect to redsrc,  redres: reduced result, redsrc: one of reduction element */
-            MSHADOW_XINLINE static real_t PartialGrad( real_t redres, real_t redsrc ) {
-                return 1.0f;
-            }
-            /*! \brief an intial value of reducer */
-            MSHADOW_CONSTEXPR static real_t kInitV = 0.0f;
-        };
-        /*! \brief maximum reducer */
-        struct maximum {
-            /*! \brief do reduction into dst */
-            MSHADOW_XINLINE static void Reduce( volatile real_t& dst,  volatile real_t src ) {
-                using namespace std;
-                dst = max( dst, src );
-            }
-            /*! \brief calculate gradient of redres with respect to redsrc,  redres: reduced result, redsrc: one of reduction element */
-            MSHADOW_XINLINE static real_t PartialGrad( real_t redres, real_t redsrc ) {
-                return redres == redsrc ? 1.0f: 0.0f;
-            }
-            /*! \brief an intial value of reducer */
-#if MSHADOW_SINGLE_PRECISION
-            MSHADOW_CONSTEXPR static real_t kInitV = -FLT_MAX;
-#else
-            MSHADOW_CONSTEXPR static real_t kInitV = -DBL_MAX;
-#endif
-        };
-    };
-
-    /*! \brief namespace for helper utils of the project */
-    namespace utils{
-        /*! \brief send error message then exit */
-        inline void Error( const char *msg ){
-            fprintf( stderr, "Error:%s\n",msg );
-            exit( -1 );
-        }
-        /*! \brief assert a expression is true */
-        inline void Assert( bool exp ){
-            if( !exp ) Error( "AssertError" );
-        }
-        /*! \brief assert a expression is true */
-        inline void Assert( bool exp, const char *msg ){
-            if( !exp ) Error( msg );
-        }
-        /*! \brief warning */
-        inline void Warning( const char *msg ){
-            fprintf( stderr, "warning:%s\n",msg );
-        }
-    }; // namespace utils
-}; // namespace mshadow
-#endif // TENSOR_BASE_H
diff --git a/include/mshadow/tensor_container.h b/include/mshadow/tensor_container.h
deleted file mode 100644
index f0699e7..0000000
--- a/include/mshadow/tensor_container.h
+++ /dev/null
@@ -1,152 +0,0 @@
-#ifndef MSHADOW_TENSOR_CONTAINER_H
-#define MSHADOW_TENSOR_CONTAINER_H
-/*!
- * \file tensor_container.h
- * \brief tensor container that does memory allocation and resize like STL
- * \author Tianqi Chen
- */
-#include "tensor.h"
-#include "tensor_io.h"
-
-namespace mshadow{
-    /*!
-     * \brief tensor container that does memory allocation and resize like STL,
-     *        use it to save the lines of FreeSpace in class.
-     *        Do not abuse it, efficiency can come from pre-allocation and no re-allocation
-     *
-     * \tparam Device which device the tensor is on
-     * \tparam dimension dimension of the tensor
-     */
-    template<typename Device, int dimension>
-    class TensorContainer: public Tensor<Device,dimension>{
-    public:
-        /*! 
-         * \brief constructor 
-         * \param pad whether use padding alignment in space allocation
-         */
-        TensorContainer( bool pad = MSHADOW_ALLOC_PAD ){
-            this->pad_ = pad;
-            this->dptr = data_.dptr = NULL;
-            this->shape[0] = 0;
-            this->shape.stride_ = 0;
-            this->data_.shape.stride_ = 0;
-            this->data_.shape[1] = 0;
-        }
-        /*! 
-         * \brief constructor 
-         * \param shape intial shape
-         */
-        TensorContainer( const Shape<dimension> &shape ){
-            this->pad_ = MSHADOW_ALLOC_PAD;
-            data_.dptr = NULL;
-            this->AllocByShape( shape );
-        }
-        /*! 
-         * \brief constructor 
-         * \param shape intial shape
-         * \param initv intial value
-         */
-        TensorContainer( const Shape<dimension> &shape, real_t initv ){
-            this->pad_ = MSHADOW_ALLOC_PAD;
-            data_.dptr = NULL;
-            this->AllocByShape( shape );
-            (*this) = initv;
-        }
-        ~TensorContainer( void ){
-            this->FreeSpace();
-        }
-        /*! 
-         * \brief resize the container to given shape, content is NOT preserved
-         * \param shape target shape
-         */
-        inline void Resize( const Shape<dimension> &shape ){
-            Shape<2> s2 = shape.FlatTo2D();            
-            if( s2.shape_[0] > data_.shape.stride_ || s2.shape_[1] > data_.shape[1] ){
-                this->AllocByShape( shape );
-            }else{
-                this->shape = shape;
-                if( this->pad_ ){
-                    this->shape.stride_ = data_.shape.stride_;
-                }else{
-                    this->shape.stride_ = this->shape[ 0 ];
-                }
-            }
-        }
-        /*! 
-         * \brief resize the container to given shape, and initialize, content is NOT preserved
-         * \param shape target shape
-         * \param initv initialization value
-         */
-        inline void Resize( const Shape<dimension> &shape, real_t initv ){
-            this->Resize( shape );
-            (*this) = initv;
-        }
-        /*! \brief set whether padding is allowed in tensor */
-        inline void set_pad( bool pad ){
-            this->pad_ = pad;
-        }
-        /*! 
-         * \brief save by binary format
-         * \param fo output binary stream
-         * \tparam TStream type of stream, need to support Read, Write, one example is utils::IStream.
-         */
-        template<typename TStream>
-        inline void SaveBinary( TStream &fo ) const{
-            mshadow::SaveBinary( fo, *this );
-        }
-        /*! 
-         * \brief load by binary format, a temp Tensor<cpu,dim> storage will be allocated
-         * \param fi input binary stream
-         * \tparam TStream type of stream, need to support Read, Write, one example is utils::IStream.
-         */
-        template<typename TStream>
-        inline void LoadBinary( TStream &fi ) {
-            Tensor<cpu,dimension> tmp;
-            mshadow::LoadBinary( fi, tmp, false );
-            this->Resize( tmp.shape );
-            Copy( *this, tmp );
-            mshadow::FreeSpace( tmp );
-        }
-    public:
-        // functions to fit exp template
-        inline Tensor<Device,dimension>& operator=( real_t s ){
-            return this->__assign( s );
-        }
-        template<typename E>
-        inline Tensor<Device,dimension>& operator=( const expr::Exp<E,expr::type::kMapper> &exp ){
-            return this->__assign( exp );
-        }
-        template<typename E>
-        inline Tensor<Device,dimension>& operator=( const expr::Exp<E,expr::type::kComplex> &exp ){
-            return this->__assign( exp );
-        }
-    private:
-        /*! \brief whether we do padding in the space */
-        bool pad_;
-        /*! \brief the shape of data_ is actually current data space */
-        Tensor<Device, 2> data_;
-    private:
-        inline void FreeSpace (void){
-            if( data_.dptr != NULL ){
-                mshadow::FreeSpace( data_ );
-                data_.dptr = this->dptr = NULL;
-            }
-        }
-        inline void AllocByShape (const Shape<dimension>& shape){
-            if( data_.dptr != NULL ){
-                this->FreeSpace();
-            }
-            data_.shape = shape.FlatTo2D();
-            mshadow::AllocSpace( data_, pad_ );
-            this->dptr  = data_.dptr;
-            this->shape = shape;
-            if( this->pad_ ){
-                this->shape.stride_ = data_.shape.stride_;
-            }else{
-                this->shape.stride_ = shape[0];
-            }
-        }
-    };
-};// namespace mshadow
-
-#endif
diff --git a/include/mshadow/tensor_cpu-inl.hpp b/include/mshadow/tensor_cpu-inl.hpp
deleted file mode 100644
index 0fa3cfa..0000000
--- a/include/mshadow/tensor_cpu-inl.hpp
+++ /dev/null
@@ -1,168 +0,0 @@
-#ifndef MSHADOW_TENSOR_CPU_INL_HPP
-#define MSHADOW_TENSOR_CPU_INL_HPP
-/*!
- * \file tensor_cpu-inl.hpp
- * \brief implementation of CPU host code
- * \author Bing Xu, Tianqi Chen
- */
-#include <cstring>
-#include "tensor_base.h"
-#include "tensor_sse-inl.hpp"
-
-namespace mshadow {
-    template<int dim>
-    inline void AllocSpace(Tensor<cpu,dim> &obj, bool pad ){
-        size_t pitch;
-        if( pad ){
-            obj.dptr = (real_t*)sse2::AlignedMallocPitch
-                ( pitch, obj.shape[0] * sizeof(real_t), obj.FlatTo2D().shape[1] );
-            obj.shape.stride_ = static_cast<index_t>( pitch / sizeof(real_t) );
-        }else{
-            obj.shape.stride_ = obj.shape[0];
-            obj.dptr = (real_t*)sse2::AlignedMallocPitch
-                ( pitch, obj.shape.Size() * sizeof(real_t), 1 );
-        }
-    }
-
-    template<typename Device, int dim>
-    inline Tensor<Device,dim> NewTensor(const Shape<dim> &shape, real_t initv, bool pad ){
-        Tensor<Device, dim> obj( shape );
-        AllocSpace( obj, pad );
-        MapExp<sv::saveto>( obj, expr::ScalarExp( initv ) );
-        return obj;
-    }
-
-    template<int dim>
-    inline void FreeSpace(Tensor<cpu,dim> &obj){
-        sse2::AlignedFree( obj.dptr );
-        obj.dptr = NULL;
-    }
-
-    template<int dim>
-    inline void Copy(Tensor<cpu,dim> _dst, const Tensor<cpu,dim> &_src ){
-        utils::Assert( _dst.shape == _src.shape, "Copy:shape mismatch" );
-        Tensor<cpu,2> dst = _dst.FlatTo2D();
-        Tensor<cpu,2> src = _src.FlatTo2D();
-        for (index_t y = 0; y < dst.shape[1]; ++y ) {
-            memcpy( dst[y].dptr, src[y].dptr, sizeof(real_t) * dst.shape[0] );
-        }
-    }
-
-    template<typename Saver, typename E, int dim>
-    inline void MapPlan(Tensor<cpu,dim> _dst, const expr::Plan<E> &plan){
-        Tensor<cpu,2> dst = _dst.FlatTo2D();
-        for (index_t y = 0; y < dst.shape[1]; ++y ) {
-            for (index_t x = 0; x < dst.shape[0]; ++x ) {
-                // trust your compiler! -_- they will optimize it
-                Saver::Save(dst[y][x], plan.Eval( y, x ) );
-            }
-        }
-    }
-
-    // code to handle SSE optimization
-    template<bool pass_check,typename Saver, int dim, typename E, int etype>
-    struct MapExpCPUEngine;
-    template<typename SV, int dim, typename E, int etype>
-    struct MapExpCPUEngine<false,SV,dim,E,etype>{
-        inline static void Map(Tensor<cpu,dim> dst, const expr::Exp<E,etype> &exp ){
-            MapPlan<SV>( dst, MakePlan( exp.self() ) );
-        }
-    };
-
-    #if MSHADOW_USE_SSE
-    template<typename SV, int dim, typename E, int etype>
-    struct MapExpCPUEngine<true,SV,dim,E,etype>{
-        inline static void Map(Tensor<cpu,dim> dst, const expr::Exp<E,etype> &exp ){
-            using namespace expr;
-            if( SSEAlignCheck<dim,E>::Check( exp.self() ) && SSEAlignCheck< dim,Tensor<cpu,dim> >::Check(dst) ){
-                MapSSEPlan<SV>( dst, MakeSSEPlan( exp.self() ) );
-            }else{
-                MapPlan<SV>( dst, MakePlan( exp.self() ) );
-            }
-        }
-    };
-    #endif
-
-    template<typename Saver, int dim, typename E, int etype>
-    inline void MapExp(Tensor<cpu,dim> dst, const expr::Exp<E,etype> &exp ){
-        using namespace expr;
-        TypeCheckPass< TypeCheck<cpu,dim,E>::kMapPass >::Error_All_Tensor_in_Exp_Must_Have_Same_Type();
-        Shape<dim> eshape = ShapeCheck<dim,E>::Check( exp.self() );
-        utils::Assert( eshape[0] == 0 || eshape == dst.shape, "Assignment: Shape of Tensors in expression is not consistent with target" );
-        #if MSHADOW_USE_SSE
-        MapExpCPUEngine< SSECheck<E>::kPass,Saver,dim,E,etype >::Map( dst, exp );
-        #else
-        MapExpCPUEngine< false,Saver,dim,E,etype >::Map( dst, exp );
-        #endif
-    }
-
-    template<typename Saver, typename Reducer, typename E, int etype>
-    inline void MapReduceKeepLowest( Tensor<cpu,1> dst, const expr::Exp<E,etype> &exp, real_t scale ){
-        using namespace expr;
-        TypeCheckPass< TypeCheck<cpu,1,E>::kRedPass >::Error_TypeCheck_Not_Pass_For_Reduce_Exp();
-        Shape<2> eshape = ShapeCheck< ExpInfo<E>::kDim, E >::Check( exp.self() ).FlatTo2D();
-
-        utils::Assert( eshape[0] == dst.shape[0], "reduction dimension do not match" );
-        utils::Assert( eshape[1] != 0, "can not reduce over empty tensor" );
-        // execution
-        expr::Plan<E> plan = MakePlan( exp.self() );
-        for( index_t x = 0; x < eshape[0]; ++x ){
-            real_t res = plan.Eval( 0, x );
-            for( index_t y = 1; y < eshape[1]; ++y ){
-                Reducer::Reduce( res, plan.Eval( y, x ) );
-            }
-            Saver::Save( dst[x], res*scale );
-        }
-    }
-
-    template<typename Saver, typename Reducer, int dimkeep, typename E, int etype>
-    inline void MapReduceKeepHighDim( Tensor<cpu,1> dst, const expr::Exp<E,etype> &exp, real_t scale ){
-        using namespace expr;
-        TypeCheckPass< TypeCheck<cpu,dimkeep,E>::kRedPass >::Error_TypeCheck_Not_Pass_For_Reduce_Exp();
-        typedef Shape< ExpInfo<E>::kDim > EShape;
-        EShape eshape = ShapeCheck< ExpInfo<E>::kDim, E >::Check( exp.self() );
-        utils::Assert( eshape[dimkeep] == dst.shape[0], "reduction dimension do not match" );
-        // use equvalent form
-        Shape<4> pshape = Shape4( eshape.ProdShape(dimkeep+1,EShape::kMaxShape), eshape[dimkeep], 
-                                  eshape.ProdShape(1,dimkeep), eshape[0] );
-
-        // execution
-        expr::Plan<E> plan = MakePlan( exp.self() );
-
-        for( index_t c = 0; c < pshape[2]; ++c ){
-            real_t res = Reducer::kInitV;
-            for( index_t n = 0; n < pshape[3]; ++n ){
-                real_t tres = Reducer::kInitV;
-                for( index_t y = 0; y < pshape[1]; ++y ){
-                    for( index_t x = 0; x < pshape[0]; ++x ){
-                        Reducer::Reduce( tres, plan.Eval( (n*pshape[2] + c) * pshape[1] + y, x ) );
-                    }
-                }
-                Reducer::Reduce( res, tres );
-            }
-            Saver::Save( dst[c], res*scale );
-        }
-    }
-
-    inline void Softmax( Tensor<cpu,1> dst, const Tensor<cpu,1>& energy ){
-        real_t mmax = energy[0];
-        for( real_t x = 1; x < dst.shape[0]; ++x )
-            if( mmax < energy[x] ) mmax = energy[x];
-        real_t sum = 0.0f;
-        for( index_t x = 0; x < dst.shape[0]; ++x ){
-            dst[x] = std::exp( energy[x] - mmax );
-            sum += dst[x];
-        }
-        for( index_t x = 0; x < dst.shape[0]; ++x ){
-            dst[x] /= sum;
-        }
-    }
-    inline void Softmax( Tensor<cpu,2> dst, const Tensor<cpu,2>& energy ){
-        utils::Assert( dst.shape == energy.shape, "Softmax: shape mismatch" );
-        for( index_t y = 0; y < dst.shape[1]; ++y ){
-            Softmax( dst[y], energy[y] );
-        }
-    }
-}; // namespace mshadow
-
-#endif // TENSOR_CPU_INL_HPP
diff --git a/include/mshadow/tensor_expr.h b/include/mshadow/tensor_expr.h
deleted file mode 100644
index ac8fde7..0000000
--- a/include/mshadow/tensor_expr.h
+++ /dev/null
@@ -1,367 +0,0 @@
-#ifndef MSHADOW_TENSOR_EXPR_H
-#define MSHADOW_TENSOR_EXPR_H
-/*!
- * \file tensor_expr.h
- * \brief definitions of abstract expressions and expressions template
- * \author Tianqi Chen, Bing Xu
- */
-#include "tensor_base.h"
-
-namespace mshadow{
-    /*!
-     * \brief namespace for abstract expressions and expressions template,
-     *        have no dependecy on tensor.h,
-     *        These data structure takes no charge in computations,
-     *        they are only used to define operations and represent expression in a symbolic way
-     */
-    namespace expr{
-
-        /*! \brief type of expressions */
-        namespace type{
-            /*! \brief this expression directly correspnds to a data class */
-            const int kContainer = 0;
-            /*! \brief this only contains element-wise vector operations */
-            const int kMapper    = 1;
-            /*! \brief othercase: e.g dot product */
-            const int kComplex   = 3;
-        };
-
-        /*!
-         * \brief expression engine that actually interprets these expressions
-         *        this is a function template that needed to be implemented for specific expressions
-         */
-        template<typename Saver,typename Container>
-        struct ExpEngine{
-            template<typename EType>
-            inline static void Eval( Container& dst, const EType &exp );
-        };
-
-        template<typename Container>
-        class ContainerExp;
-        class ScalarExp;
-
-        /*!
-         * \brief base class for expression
-         * \tparam SubType inheritated class must put their type into this parameter
-         * \tparam exp_type expression type, see namespace type
-         */
-        template<typename SubType, int exp_type>
-        struct Exp{
-        public:
-            /*! \return  subtype instance of current class */
-            inline const SubType& self( void ) const{
-                return *static_cast<const SubType*>(this);
-            }
-            /*! \return reference of subtype instance of current class */
-            inline SubType& refself( void ){
-                return *static_cast<SubType*>(this);
-            }
-        };
-
-        /*! \brief scalar expression */
-        struct ScalarExp: public Exp<ScalarExp, type::kMapper>{
-            /*! \brief scalar value */
-            real_t scalar_;
-            /*! \brief constructor */
-            ScalarExp( real_t scalar ):scalar_(scalar){}
-        };
-
-        /*! \brief represent a transpose expression of a container */
-        template<typename EType>
-        struct TransposeExp: public Exp< TransposeExp<EType>, type::kComplex >{
-        public:
-            /*! \brief expression to be transposed */
-            const EType &exp;
-            /*! \brief constructor */
-            TransposeExp( const EType &e ):exp(e){}
-            /*! \brief transpose expression */
-            inline const EType & T( void ) const{
-                return exp;
-            }
-        };
-        
-        /*!
-         * \brief base class of all variables, that can be assigned to values
-         * \tparam Container the actually class of data container, e.g. CTensor1D
-         */
-        template<typename Container>
-        class ContainerExp: public Exp< Container, type::kContainer >{
-        public:
-            /*!
-             *\brief transpose of a matrix
-             *\return transpose of current expression
-             */
-            inline const TransposeExp<Container> T( void ) const{
-                return TransposeExp<Container>( this->self() );
-            }
-        public:
-            /*! \brief operator overload */
-            inline Container &operator+=( real_t s ){
-                ExpEngine<sv::plusto,Container>::Eval( this->refself(), ScalarExp(s) );
-                return this->refself();
-            }
-            /*! \brief operator overload */
-            inline Container &operator-=( real_t s ){
-                ExpEngine<sv::minusto,Container>::Eval( this->refself(), ScalarExp(s) );
-                return this->refself();
-            }
-            /*! \brief operator overload */
-            inline Container &operator*=( real_t s ){
-                ExpEngine<sv::multo,Container>::Eval( this->refself(), ScalarExp(s) );
-                return this->refself();
-            }
-            /*! \brief operator overload */
-            inline Container &operator/=( real_t s ){
-                ExpEngine<sv::divto,Container>::Eval( this->refself(), ScalarExp(s) );
-                return this->refself();
-            }
-            /*! \brief operator overload */
-            inline Container &__assign( real_t s ){
-                ExpEngine<sv::saveto,Container>::Eval( this->refself(), ScalarExp(s) );
-                return this->refself();
-            }
-        public:
-            /*! \brief implementation of operator=, note that we can not define container = container */
-            template<typename E>
-            inline Container &__assign( const Exp<E,type::kMapper> &exp ){
-                ExpEngine<sv::saveto,Container>::Eval( this->refself(), exp.self() );
-                return this->refself();
-            }
-            /*! \brief implementation of operator=, note that we can not define container = container */
-            template<typename E>
-            inline Container &__assign( const Exp<E,type::kComplex> &exp ){
-                ExpEngine<sv::saveto,Container>::Eval( this->refself(), exp.self() );
-                return this->refself();
-            }
-            /*! \brief implementation of operator+= */
-            template<typename E,int etype>
-            inline Container &operator+=( const Exp<E,etype> &exp ){
-                ExpEngine<sv::plusto,Container>::Eval( this->refself(), exp.self() );
-                return this->refself();
-            }
-            /*! \brief implementation of operator-= */
-            template<typename E,int etype>
-            inline Container &operator-=( const Exp<E,etype> &exp ){
-                ExpEngine<sv::minusto,Container>::Eval( this->refself(), exp.self() );
-                return this->refself();
-            }
-            /*! \brief implementation of operator*= */
-            template<typename E,int etype>
-            inline Container &operator*=( const Exp<E,etype> &exp ){
-                ExpEngine<sv::multo,Container>::Eval( this->refself(), exp.self() );
-                return this->refself();
-            }
-            /*! \brief implementation of operator/= */
-            template<typename E,int etype>
-            inline Container &operator/=( const Exp<E,etype> &exp ){
-                ExpEngine<sv::divto,Container>::Eval( this->refself(), exp.self() );
-                return this->refself();
-            }
-        };
-    }; // namespace expr
-
-    namespace expr{
-        /*!
-         * \brief matrix multiplication expression dot( lhs[.T], rhs[.T] )
-         * \tparam TA type of lhs
-         * \tparam TB type of rhs
-         * \tparam ltrans whether lhs is transposed
-         * \tparam rtrans whether rhs is transposed
-         */
-        template<typename TA,typename TB,bool ltrans,bool rtrans>
-        struct DotExp: public Exp< DotExp<TA,TB,ltrans,rtrans>, type::kComplex >{
-            /*! \brief left operand */
-            const TA& lhs_;
-            /*! \brief right operand */
-            const TB& rhs_;
-            /*! \brief scale over result */
-            real_t scale_;
-            /*! \brief constructor */
-            DotExp( const TA &lhs, const TB &rhs, real_t scale )
-                :lhs_(lhs),rhs_(rhs),scale_(scale){}
-        };
-
-        /*! \brief dot operator def */
-        template<typename TA, typename TB>
-        inline DotExp<TA,TB,false,false> dot( const ContainerExp<TA> &lhs, const ContainerExp<TB> &rhs ){
-            return DotExp<TA,TB,false,false>( lhs.self(), rhs.self(), 1.0f );
-        }
-        /*! \brief dot operator def */
-        template<typename TA, typename TB>
-        inline DotExp<TA,TB,true,false> dot( const TransposeExp<TA> &lhs, const ContainerExp<TB> &rhs ){
-            return DotExp<TA,TB,true,false>( lhs.exp, rhs.self(), 1.0f );
-        }
-        /*! \brief dot operator def */
-        template<typename TA, typename TB>
-        inline DotExp<TA,TB,false,true> dot( const ContainerExp<TA> &lhs, const TransposeExp<TB> &rhs ){
-            return DotExp<TA,TB,false,true>( lhs.self(), rhs.exp, 1.0f );
-        }
-        /*! \brief dot operator def */
-        template<typename TA, typename TB>
-        inline DotExp<TA,TB,true,true> dot( const TransposeExp<TA> &lhs, const TransposeExp<TB> &rhs ){
-            return DotExp<TA,TB,true,true>( lhs.exp, rhs.exp, 1.0f );
-        }
-        /*! \brief dot operator def */
-        template<typename TA, typename TB, bool ltrans, bool rtrans >
-        inline DotExp<TA,TB,ltrans,rtrans> operator*( const DotExp<TA,TB,ltrans,rtrans> &lhs, real_t rhs ){
-            return DotExp<TA,TB,ltrans,rtrans>( lhs.lhs_, lhs.rhs_, lhs.scale_ * rhs );
-        }
-        /*! \brief scale of dot operation */
-        template<typename TA, typename TB, bool ltrans, bool rtrans >
-        inline DotExp<TA,TB,ltrans,rtrans> operator*( real_t lhs, const DotExp<TA,TB,ltrans,rtrans> &rhs ){
-            return DotExp<TA,TB,ltrans,rtrans>( rhs.lhs_, rhs.rhs_, rhs.scale_ * lhs );
-        }
-    }; // namespace expr
-
-    namespace expr{
-        /*!
-         * \brief binary map expression lhs [op] rhs
-         * \tparam OP operator
-         * \tparam TA type of lhs
-         * \tparam TB type of rhs
-         * \tparam etype expression type, sa namespace::type
-         */
-        template<typename OP, typename TA, typename TB, int etype >
-        struct BinaryMapExp: public Exp< BinaryMapExp<OP,TA,TB,etype>, etype >{
-            /*! \brief left operand */
-            const TA& lhs_;
-            /*! \brief right operand */
-            const TB& rhs_;
-            /*! \brief constructor */
-            BinaryMapExp( const TA &lhs, const TB &rhs )
-                :lhs_(lhs), rhs_(rhs){}
-        };
-
-        /*! \brief make expression */
-        template<typename OP,typename TA, typename TB, int ta, int tb>
-        inline BinaryMapExp<OP,TA,TB, (ta|tb|type::kMapper) > MakeExp( const Exp<TA,ta> &lhs, const Exp<TB,tb> &rhs ){
-            return BinaryMapExp<OP,TA,TB, (ta|tb|type::kMapper) >( lhs.self(), rhs.self() );
-        }
-
-        /*! 
-         * \brief short hand for MakeExp, usage F<op>(lhs, rhs). create a binary operation expression 
-         * \param lhs left operand
-         * \param rhs right operand
-         * \tparam binary operator 
-         * \tparam TA lhs expression
-         * \tparam ta lhs expression type
-         * \tparam TB rhs expression
-         * \tparam tb rhs expression type
-         * \sa mshadow::op
-         */
-        template<typename OP,typename TA, typename TB, int ta, int tb>
-        inline BinaryMapExp<OP,TA,TB, (ta|tb|type::kMapper) > F( const Exp<TA,ta> &lhs, const Exp<TB,tb> &rhs ){
-            return MakeExp<OP>( lhs, rhs );
-        }
-        /*! \brief operator overload for const */
-        template<typename OP,typename TA, int ta>
-        inline BinaryMapExp<OP,TA,ScalarExp, (ta|type::kMapper) > F( const Exp<TA,ta> &lhs, const ScalarExp &rhs ){
-            return MakeExp<OP>( lhs, rhs );
-        }
-        /*! \brief operator overload for const */
-        template<typename OP,typename TB, int tb>
-        inline BinaryMapExp<OP,ScalarExp,TB, (tb|type::kMapper) > F( const ScalarExp &lhs, const Exp<TB,tb>& rhs ){
-            return MakeExp<OP>( lhs, rhs );
-        }
-
-        // operator rules
-        /*! \brief operator overload */
-        template<typename TA, typename TB, int ta, int tb>
-        inline BinaryMapExp<op::plus,TA,TB, (ta|tb|type::kMapper) > operator+( const Exp<TA,ta> &lhs, const Exp<TB,tb> &rhs ){
-            return MakeExp<op::plus>( lhs, rhs );
-        }
-        /*! \brief operator overload */
-        template<typename TA, typename TB, int ta, int tb>
-        inline BinaryMapExp<op::minus,TA,TB, (ta|tb|type::kMapper) > operator-( const Exp<TA,ta> &lhs, const Exp<TB,tb> &rhs ){
-            return MakeExp<op::minus>( lhs, rhs );
-        }
-        /*! \brief operator overload */
-        template<typename TA, typename TB, int ta, int tb>
-        inline BinaryMapExp<op::mul,TA,TB, (ta|tb|type::kMapper) > operator*( const Exp<TA,ta> &lhs, const Exp<TB,tb> &rhs ){
-            return MakeExp<op::mul>( lhs, rhs );
-        }
-        /*! \brief operator overload */
-        template<typename TA, typename TB, int ta, int tb>
-        inline BinaryMapExp<op::div,TA,TB, (ta|tb|type::kMapper) > operator/( const Exp<TA,ta> &lhs, const Exp<TB,tb> &rhs ){
-            return MakeExp<op::div>( lhs, rhs );
-        }
-        // constant operators
-        /*! \brief operator overload */
-        template<typename TA, int ta>
-        inline BinaryMapExp<op::plus, TA, ScalarExp, (ta|type::kMapper) > operator+( const Exp<TA,ta>& lhs,  const ScalarExp& rhs ){
-            return MakeExp<op::plus>( lhs, rhs );
-        }
-        /*! \brief operator overload */
-        template<typename TA, int ta>
-        inline BinaryMapExp<op::minus, TA, ScalarExp, (ta|type::kMapper) > operator-( const Exp<TA,ta>& lhs,  const ScalarExp& rhs ){
-            return MakeExp<op::minus>( lhs, rhs );
-        }
-        /*! \brief operator overload */
-        template<typename TA, int ta>
-        inline BinaryMapExp<op::mul, TA, ScalarExp, (ta|type::kMapper) > operator*( const Exp<TA,ta>& lhs,  const ScalarExp& rhs ){
-            return MakeExp<op::mul>( lhs, rhs );
-        }
-        /*! \brief operator overload */
-        template<typename TA, int ta>
-        inline BinaryMapExp<op::div, TA, ScalarExp, (ta|type::kMapper) > operator/( const Exp<TA,ta>& lhs,  const ScalarExp& rhs ){
-            return MakeExp<op::div>( lhs, rhs );
-        }
-        // constant operators 2
-        /*! \brief operator overload */
-        template<typename TB, int tb>
-        inline BinaryMapExp<op::plus, ScalarExp, TB, (tb|type::kMapper) > operator+( const ScalarExp& lhs, const Exp<TB,tb>& rhs ){
-            return MakeExp<op::plus>( lhs, rhs );
-        }
-        /*! \brief operator overload */
-        template<typename TB, int tb>
-        inline BinaryMapExp<op::minus, ScalarExp, TB, (tb|type::kMapper) > operator-( const ScalarExp& lhs, const Exp<TB,tb>& rhs ){
-            return MakeExp<op::minus>( lhs, rhs );
-        }
-        /*! \brief operator overload */
-        template<typename TB, int tb>
-        inline BinaryMapExp<op::mul, ScalarExp, TB, (tb|type::kMapper) > operator*( const ScalarExp& lhs, const Exp<TB,tb>& rhs ){
-            return MakeExp<op::mul>( lhs, rhs );
-        }
-        /*! \brief operator overload */
-        template<typename TB, int tb>
-        inline BinaryMapExp<op::div, ScalarExp, TB, (tb|type::kMapper) > operator/( const ScalarExp& lhs, const Exp<TB,tb>& rhs ){
-            return MakeExp<op::div>( lhs, rhs );
-        }
-    };
-
-    namespace expr{
-        /*!
-         * \brief unary map expression op(src)
-         * \tparam OP operator
-         * \tparam TA type of src
-         * \tparam etype expression type, sa namespace::type
-         */
-        template<typename OP, typename TA, int etype >
-        struct UnaryMapExp: public Exp< UnaryMapExp<OP,TA,etype>, etype >{
-            /*! \brief source expression */
-            const TA& src_;
-            /*! \brief constructor */
-            UnaryMapExp( const TA &src ):src_(src){}
-        };
-
-        /*! \brief make expression */
-        template<typename OP,typename TA, int ta>
-        inline UnaryMapExp<OP,TA,(ta|type::kMapper) > MakeExp( const Exp<TA,ta> &src ){
-            return UnaryMapExp<OP,TA, (ta|type::kMapper) >( src.self() );
-        }
-
-        /*! 
-         * \brief short hand for MakeExp, usage F<op>(src), create a unary operation expression 
-         * \param src source expression
-         * \tparam operator 
-         * \tparam TA source expression
-         * \tparam ta source expression type
-         * \sa mshadow::op
-         */
-        template<typename OP,typename TA, int ta>
-        inline UnaryMapExp<OP,TA,(ta|type::kMapper) > F( const Exp<TA,ta> &src ){
-            return MakeExp<OP>(src);
-        }
-    };
-};
-#endif
diff --git a/include/mshadow/tensor_expr_engine-inl.hpp b/include/mshadow/tensor_expr_engine-inl.hpp
deleted file mode 100644
index 9c5f2c7..0000000
--- a/include/mshadow/tensor_expr_engine-inl.hpp
+++ /dev/null
@@ -1,416 +0,0 @@
-#ifndef MSHADOW_TENSOR_EXPR_ENGINE_INL_HPP
-#define MSHADOW_TENSOR_EXPR_ENGINE_INL_HPP
-/*!
- * \file tensor_expr_engine-inl.hpp
- * \brief definitions of how expressions should be evaluated
- * \author Tianqi Chen, Bing Xu
- */
-#include "tensor_expr.h"
-#include "tensor.h"
-
-namespace mshadow{
-    namespace expr{
-        /*! 
-         * \brief a general class that allows extension that makes tensors of some shape
-         * \tparam SubType type of subclass
-         * \tparam SrcExp source expression of the MakeTensorExp, the source of operation
-         * \tparam dim dimension of the expression
-         */
-        template<typename SubType, typename SrcExp, int dim>
-        struct MakeTensorExp: public Exp< MakeTensorExp<SubType,SrcExp,dim>, type::kMapper >{
-            /*! \brief the shape of this expression */
-            Shape<dim> shape_;
-            /*! \brief true self of subtype */
-            inline const SubType& real_self( void ) const{
-                return *static_cast<const SubType*>(this);
-            }
-        };
-    };
-    
-    namespace expr{
-        /*! \brief This part of code gives plan that can be used to carry out execution */
-        template<typename ExpType>
-        class Plan{
-        public:
-            /*!
-             * \brief evaluate the expression at index [y][x]
-             *        to be implemented by SubType
-             */
-            MSHADOW_XINLINE real_t Eval( index_t y, index_t x ) const;
-        };
-
-        template <typename Device, int dim>
-        class Plan< Tensor<Device,dim> >{
-        public:
-            Plan( const Tensor<Device,dim> &t )
-                :dptr_(t.dptr),stride_(t.shape.stride_){}
-            MSHADOW_XINLINE real_t Eval( index_t y, index_t x ) const{
-                return dptr_[ y * stride_ + x ];
-            }
-        private:
-            const real_t  *dptr_;
-            index_t stride_;
-        };
-        // special evaluation case for 1d tensor
-        template <typename Device>
-        class Plan< Tensor<Device,1> >{
-        public:
-            Plan( const Tensor<Device,1> &t ):dptr_(t.dptr){}
-            MSHADOW_XINLINE real_t Eval( index_t y, index_t x ) const{
-                return dptr_[ x ];
-            }
-        private:
-            const real_t  *dptr_;
-        };
-        
-        template<>
-        class Plan<ScalarExp>{
-        public:
-            Plan( real_t scalar ):scalar_(scalar){}
-            /*! \brief evaluate at [y][x] */
-            MSHADOW_XINLINE real_t Eval( index_t y, index_t x ) const{
-                    return scalar_;
-            }
-        private:
-            real_t scalar_;
-        };
-
-        template<typename OP, typename TA, typename TB,int etype>
-        class Plan< BinaryMapExp<OP,TA,TB,etype> >{
-        public:
-            Plan( const Plan<TA> &lhs, const Plan<TB> &rhs )
-                :lhs_(lhs), rhs_(rhs){}
-            MSHADOW_XINLINE real_t Eval( index_t y, index_t x ) const{
-                return OP::Map( lhs_.Eval( y, x ), rhs_.Eval( y, x ) );
-            }
-        private:
-            Plan<TA> lhs_;
-            Plan<TB> rhs_;
-        };
-
-        template<typename OP, typename TA, int etype>
-        class Plan< UnaryMapExp<OP,TA,etype> >{
-        public:
-            Plan( const Plan<TA> &src ):src_(src){}
-            MSHADOW_XINLINE real_t Eval( index_t y, index_t x ) const{
-                return OP::Map( src_.Eval( y, x ) );
-            }
-        private:
-            Plan<TA> src_;
-        };
-
-        
-        template<typename SubType, typename SrcExp, int dim>
-        struct Plan< MakeTensorExp<SubType,SrcExp,dim> >{
-        public:
-            Plan( const Plan<SubType> &src ):src_(src){}
-            MSHADOW_XINLINE real_t Eval( index_t y, index_t x ) const{
-                return src_.Eval( y, x );
-            }
-        private:
-            Plan<SubType> src_;  
-        };
-
-        // allow UnaryMap see the plan
-        template<typename OP, typename TA, typename TB, int etype>
-        inline Plan< BinaryMapExp<OP,TA,TB,etype> > MakePlan( const BinaryMapExp<OP,TA,TB,etype> &e );
-
-        // translate from exp to execution plan
-        inline Plan<ScalarExp> MakePlan( const ScalarExp &e ){
-            return Plan<ScalarExp>( e.scalar_ );
-        }
-
-        template<typename T>
-        inline Plan<T> MakePlan( const ContainerExp<T> &e ){
-            return Plan<T>( e.self() );
-        }
-
-        template<typename T, typename SrcExp, int dim>
-        inline Plan< T > MakePlan( const MakeTensorExp<T,SrcExp,dim> &e ){
-            return Plan< T >( e.real_self() );
-        }
-
-        template<typename OP, typename TA, int etype>
-        inline Plan< UnaryMapExp<OP,TA,etype> > MakePlan( const UnaryMapExp<OP,TA,etype> &e ){
-            return Plan< UnaryMapExp<OP,TA,etype> >( MakePlan(e.src_) );
-        }
-
-        template<typename OP, typename TA, typename TB, int etype>
-        inline Plan< BinaryMapExp<OP,TA,TB,etype> > MakePlan( const BinaryMapExp<OP,TA,TB,etype> &e ){
-            return Plan< BinaryMapExp<OP,TA,TB,etype> >( MakePlan(e.lhs_), MakePlan(e.rhs_) );
-        }
-    }; // namespace expr
-
-    namespace expr{
-        /*!
-         * \brief static type inference template, 
-         *        used to get the dimension of each expression, 
-         *        if ExpInfo<E>::kDim == -1, this means here are mismatch in expression
-         *        if ( ExpInfo<E>::kDevMask & cpu::kDevMask ) != 0, this means this expression can be assigned to cpu
-         * \tparam E expression
-         */
-        template<typename E>
-        struct ExpInfo{
-            const static int kDim = -1;
-            const static int kDevMask = 0;
-        };
-        template<>
-        struct ExpInfo<ScalarExp>{
-            const static int kDim = 0;
-            const static int kDevMask = 0xffff;
-        };
-        template<typename Device, int dim>
-        struct ExpInfo< Tensor<Device,dim> >{
-            const static int kDim = dim;
-            const static int kDevMask = Device::kDevMask;            
-        };
-        template<typename T, typename SrcExp, int dim>
-        struct ExpInfo< MakeTensorExp<T,SrcExp,dim> >{
-            const static int kDimSrc = ExpInfo<SrcExp>::kDim;
-            const static int kDim = kDimSrc >= 0 ? dim : -1;
-            const static int kDevMask = ExpInfo<SrcExp>::kDevMask;
-        };
-        template<typename OP, typename TA, int etype>
-        struct ExpInfo< UnaryMapExp<OP,TA,etype> >{
-            const static int kDim = ExpInfo<TA>::kDim;
-            const static int kDevMask = ExpInfo<TA>::kDevMask;
-        };
-        template<typename OP, typename TA, typename TB, int etype>
-        struct ExpInfo< BinaryMapExp<OP,TA,TB,etype> >{
-            const static int kDimLhs = ExpInfo<TA>::kDim;
-            const static int kDimRhs = ExpInfo<TB>::kDim;
-            const static int kDim = (kDimLhs>=0 && kDimRhs >= 0) ? \
-                ( kDimLhs==0 ? kDimRhs : ( (kDimRhs==0||kDimLhs==kDimRhs) ? kDimLhs : -1 ) ):-1;
-            const static int kDevMask = ExpInfo<TA>::kDevMask & ExpInfo<TB>::kDevMask;
-        };
-
-        /*! \brief template to do type check */
-        template<typename Device, int dim, typename E>
-        struct TypeCheck{
-            /*! \brief dimension of expression*/
-            const static int kExpDim = ExpInfo<E>::kDim;
-            /*! \brief whether the expression device type matches */
-            const static bool kDevPass = (ExpInfo<E>::kDevMask & Device::kDevMask) != 0;
-            /*! \brief whether the expression can be mapped to expression of dim */
-            const static bool kMapPass = (kExpDim == 0 || kExpDim == dim) && kDevPass;
-            /*! \brief whether the expression can be reduced to expression of dim */
-            const static bool kRedPass = (kExpDim > dim) && kDevPass;
-        };
-
-        template<bool kPass>
-        struct TypeCheckPass;
-        template<>
-        struct TypeCheckPass<false>{};
-        template<>
-        struct TypeCheckPass<true>{
-            inline static void Error_All_Tensor_in_Exp_Must_Have_Same_Type( void ){}
-            inline static void Error_TypeCheck_Not_Pass_For_Reduce_Exp( void ){}
-            inline static void Error_Expression_Does_Not_Meet_Dimension_Req( void ){}
-        };
-    }; // namespace expr
-    
-    namespace expr{
-        // check shape consistency
-        template<int dim,typename E>
-        struct ShapeCheck{
-            inline static Shape<dim> Check( const E &t );
-        };
-        
-        template<int dim>
-        struct ShapeCheck<dim,ScalarExp>{
-            inline static Shape<dim> Check( const ScalarExp &exp ){
-                // use lowest dimension to mark scalar exp
-                Shape<dim> shape; shape[0] = 0; 
-                return shape;
-            }
-        };
-        template<int dim,typename Device>
-        struct ShapeCheck<dim,Tensor<Device,dim> >{
-            inline static Shape<dim> Check( const Tensor<Device,dim> &t ){
-                return t.shape;
-            }
-        };
-        template<int dim,typename SrcExp,typename T>
-        struct ShapeCheck<dim,MakeTensorExp<T,SrcExp,dim> >{
-            inline static Shape<dim> Check( const MakeTensorExp<T,SrcExp,dim> &t ){
-                return t.shape_;
-            }
-        };
-        template<int dim, typename OP, typename TA, int etype>
-        struct ShapeCheck< dim,UnaryMapExp<OP,TA,etype> >{
-            inline static Shape<dim> Check( const UnaryMapExp<OP,TA,etype> &t ){
-                Shape<dim> s = ShapeCheck<dim,TA>::Check( t.src_ );
-                return s;
-            }
-        };
-        template<int dim, typename OP, typename TA, typename TB, int etype>
-        struct ShapeCheck< dim, BinaryMapExp<OP,TA,TB,etype> >{
-            inline static Shape<dim> Check( const BinaryMapExp<OP,TA,TB,etype> &t ){
-                Shape<dim> shape1 = ShapeCheck<dim,TA>::Check( t.lhs_ );
-                Shape<dim> shape2 = ShapeCheck<dim,TB>::Check( t.rhs_ );
-                if( shape1[0] == 0 ) return shape2;
-                if( shape2[0] == 0 ) return shape1;
-                utils::Assert( shape1 == shape2, "BinaryMapExp: Shapes of two tensors in BinaryMapExp expression is not the same");
-                return shape1;
-            }
-        };
-    }; // namespace expr
-
-    // the matrix OP depends on BLAS
-    namespace expr{
-        template<typename SV,typename Device, int ddim, int ldim, int rdim, bool ltrans, bool rtrans>
-        struct DotEngine{
-            inline static void Eval( Tensor<Device,ddim> &dst, const Tensor<Device,ldim> &lhs, const Tensor<Device,rdim> &rhs, real_t scale );
-        };
-
-        // handles the dot
-        template<typename Device>
-        struct BLASEngine;
-
-        #if (MSHADOW_USE_CBLAS||MSHADOW_USE_MKL)
-        template<>
-        struct BLASEngine<cpu>{
-            inline static CBLAS_TRANSPOSE GetT( bool t ){
-                return t ? CblasTrans : CblasNoTrans;
-            }
-            inline static void gemm( bool transa, bool transb, int m, int n, int k, float alpha, \
-                                     const float *A, int lda, const float *B, int ldb, float beta, float *C, int ldc ){
-                cblas_sgemm(CblasColMajor, GetT(transa), GetT(transb), m,n,k,alpha,A,lda,B,ldb,beta,C,ldc);
-            }
-            inline static void gemm( bool transa, bool transb, int m, int n, int k, double alpha, \
-                                     const double *A, int lda, const double *B, int ldb, double beta, double *C, int ldc ){
-                cblas_dgemm(CblasColMajor, GetT(transa), GetT(transb), m,n,k,alpha,A,lda,B,ldb,beta,C,ldc);
-            }
-            inline static void gemv( bool trans, int m, int n, float alpha, const float *A, int lda, \
-                                     const float *X, int incX, float beta, float *Y, int incY ){
-                cblas_sgemv(CblasColMajor, GetT(trans), m,n,alpha,A,lda,X,incX,beta,Y,incY);
-            }
-            inline static void gemv( bool trans, int m, int n, double alpha, const double *A, int lda, \
-                                     const double *X, int incX, double beta, double *Y, int incY ){
-                cblas_dgemv(CblasColMajor, GetT(trans), m,n,alpha,A,lda,X,incX,beta,Y,incY);
-            }
-            inline static void ger( int m, int n, float alpha, const float *X, int incX, const float *Y, int incY, float *A, int lda ){
-                cblas_sger(CblasColMajor,m,n,alpha,X,incX,Y,incY,A,lda);
-            }
-            inline static void ger( int m, int n, double alpha, const double *X, int incX, const double *Y, int incY, double *A, int lda ){
-                cblas_dger(CblasColMajor,m,n,alpha,X,incX,Y,incY,A,lda);
-            }
-        };
-        #endif // MSHADOW_USE_CBLAS || MSHADOW_USE_MKL
-
-        #if MSHADOW_USE_CUDA
-        // All CuBLAS goes to here, use legacy API: not threadsafe
-        template<>
-        struct BLASEngine<gpu>{
-            inline static char GetT( bool t ){
-                return t ? 'T' : 'N';
-            }
-            inline static void gemm( bool transa, bool transb, int m, int n, int k, float alpha, 
-                                     const float *A, int lda, const float *B, int ldb, float beta, float *C, int ldc ){
-                cublasSgemm(GetT(transa),GetT(transb),m,n,k,alpha,A,lda,B,ldb,beta,C,ldc);
-            }
-            inline static void gemm( bool transa, bool transb, int m, int n, int k, double alpha, 
-                                     const double *A, int lda, const double *B, int ldb, double beta, double *C, int ldc ){
-                cublasDgemm(GetT(transa),GetT(transb),m,n,k,alpha,A,lda,B,ldb,beta,C,ldc);                
-            }
-            inline static void gemv( bool trans, int m, int n, float alpha, const float *A, int lda, \
-                                     const float *X, int incX, float beta, float *Y, int incY ){
-                cublasSgemv(GetT(trans), m,n,alpha,A,lda,X,incX,beta,Y,incY);
-            }
-            inline static void gemv( bool trans, int m, int n, double alpha, const double *A, int lda, \
-                                     const double *X, int incX, double beta, double *Y, int incY ){
-                cublasDgemv(GetT(trans), m,n,alpha,A,lda,X,incX,beta,Y,incY);
-            }
-            inline static void ger( int m, int n, float alpha, const float *X, int incX, const float *Y, int incY, float *A, int lda ){
-                cublasSger(m,n,alpha,X,incX,Y,incY,A,lda);
-            }
-            inline static void ger( int m, int n, double alpha, const double *X, int incX, const double *Y, int incY, double *A, int lda ){
-                cublasDger(m,n,alpha,X,incX,Y,incY,A,lda);
-            }
-        };
-        #endif
-
-        // helper function to decide which shape we are in 
-        inline static Shape<2> GetShape( const Shape<2> &shape, bool transpose ){
-            return transpose ? Shape2(shape[0],shape[1]) : shape;
-        }
-        // dst = dot( lhs[.T], rhs[.T] )
-        template<typename SV, typename xpu, bool transpose_left, bool transpose_right>
-        struct DotEngine<SV,xpu,2,2,2,transpose_left,transpose_right>{
-            inline static void Eval( Tensor<xpu,2> &dst, const Tensor<xpu,2> &lhs, const Tensor<xpu,2> &rhs, real_t scale ) {
-                Shape<2> sleft  = GetShape( lhs.shape, transpose_left );
-                Shape<2> sright = GetShape( rhs.shape, transpose_right );
-                utils::Assert( dst.shape[1] == sleft[1] && dst.shape[0] == sright[0] \
-                               && sleft[0] == sright[1] , "dot-gemm: matrix shape mismatch" );
-                // use column major argument to compatible with most BLAS
-                BLASEngine<xpu>::gemm
-                    ( transpose_right , transpose_left,
-                      transpose_right ? rhs.shape[1] : rhs.shape[0],
-                      transpose_left  ? lhs.shape[0] : lhs.shape[1],
-                      transpose_right ? rhs.shape[0] : rhs.shape[1], 
-                      scale * SV::kAlphaBLAS, 
-                      rhs.dptr, rhs.shape.stride_,
-                      lhs.dptr, lhs.shape.stride_,
-                      SV::kBetaBLAS, 
-                      dst.dptr, dst.shape.stride_ );
-            }
-        };
-        template<typename SV, typename xpu, bool transpose_right>
-        struct DotEngine<SV,xpu,1,1,2,false,transpose_right>{
-            inline static void Eval( Tensor<xpu,1> &dst, const Tensor<xpu,1> &lhs, const Tensor<xpu,2> &rhs, real_t scale ) {
-                Shape<2> sright = GetShape( rhs.shape, transpose_right );
-                utils::Assert( dst.shape[0] == sright[0] && lhs.shape[0] == sright[1], "dot-gemv: matrix shape mismatch");
-                BLASEngine<xpu>::gemv
-                    ( transpose_right, 
-                      rhs.shape[0], rhs.shape[1], scale * SV::kAlphaBLAS,
-                      rhs.dptr, rhs.shape.stride_,
-                      lhs.dptr, 1, SV::kBetaBLAS,
-                      dst.dptr, 1 );
-            }
-        };        
-        template<typename SV, typename xpu>
-        struct DotEngine<SV,xpu,2,1,1,true,false>{
-            inline static void Eval( Tensor<xpu,2> &dst, const Tensor<xpu,1> &lhs, const Tensor<xpu,1> &rhs, real_t scale ) {
-                utils::Assert( dst.shape[1] == lhs.shape[0] && dst.shape[0] == rhs.shape[0], "dot-ger: matrix shape mismatch" );
-                if( SV::kBetaBLAS < 1e-6f ){
-                    BLASEngine<xpu>::ger
-                        ( rhs.shape[0], lhs.shape[0], scale * SV::kAlphaBLAS,
-                          rhs.dptr, 1, lhs.dptr, 1, dst.dptr, dst.shape.stride_ );
-                }else{
-                    DotEngine<SV,xpu,2,2,2,true,false>::Eval( dst, lhs.FlatTo2D(), rhs.FlatTo2D(), scale );
-                }
-            }
-        };
-
-    }; // namespace expr
-
-    namespace expr{
-        /*! \brief some engine that evaluate complex expression */
-        template<typename SV, typename Device, int dim, typename E>
-        struct ExpComplexEngine{
-            inline static void Eval( Tensor<Device,dim>& dst, const E &exp );
-        };
-        template<typename SV, typename Device, int dim>
-        struct ExpEngine<SV, Tensor<Device,dim> >{
-            template<typename E>
-            inline static void Eval( Tensor<Device,dim>& dst, const Exp<E,type::kMapper> &exp ){
-                MapExp<SV,dim,E>( dst, exp );
-            }
-            template<typename E>
-            inline static void Eval( Tensor<Device,dim>& dst, const Exp<E,type::kContainer> &exp ){
-                MapExp<SV,dim,E>( dst, exp );
-            }
-            template<typename E>
-            inline static void Eval( Tensor<Device,dim>& dst, const Exp<E,type::kComplex> &exp ){
-                ExpComplexEngine<SV,Device,dim,E>::Eval( dst, exp.self() );
-            }
-        };
-        template<typename SV, typename Device, int dim, int ldim,int rdim,bool ltrans,bool rtrans>
-        struct ExpComplexEngine< SV, Device, dim, DotExp< Tensor<Device,ldim>, Tensor<Device,rdim>, ltrans, rtrans > >{
-            inline static void Eval( Tensor<Device,dim> &dst, const DotExp< Tensor<Device,ldim>, Tensor<Device,rdim>, ltrans, rtrans > &exp ){
-                DotEngine<SV,Device,dim,ldim,rdim,ltrans,rtrans>::Eval( dst, exp.lhs_, exp.rhs_, exp.scale_ );
-            }
-        };
-    }; // namespace expr
-};
-#endif
diff --git a/include/mshadow/tensor_expr_ext.h b/include/mshadow/tensor_expr_ext.h
deleted file mode 100644
index 8399b1b..0000000
--- a/include/mshadow/tensor_expr_ext.h
+++ /dev/null
@@ -1,978 +0,0 @@
-#ifndef MSHADOW_TENSOR_EXPR_EXT_H
-#define MSHADOW_TENSOR_EXPR_EXT_H
-/*!
- * \file tensor_expr_ext.h
- * \brief some extension of expressions, used to support something beyond elementwise op
- * \author Tianqi Chen, Bing Xu
- */
-#include "tensor_expr_engine-inl.hpp"
-namespace mshadow{
-    // Declaration of expressions goes here
-    namespace expr{
-        /*!
-         * \brief broadcast Tensor1D into a higher dimension Tensor
-         * input: Tensor<Device,1>: ishape[0]
-         * output: Tensor<Device,dimdst> : oshape[dimcast] = ishape[0]
-         * \tparam Device which device it lies
-         * \tparam dimdst  target tensor dimension
-         * \tparam dimcast the dimension where the 1D tensor fills in by index
-         */
-        template<typename Device, int dimdst, int dimcast>
-        struct Broadcast1DExp: public MakeTensorExp< Broadcast1DExp<Device,dimdst,dimcast>,Tensor<Device,1>,dimdst>{
-            /*! \brief source operand */
-            const Tensor<Device,1> src_;
-            /*! \brief constructor */
-            Broadcast1DExp( const Tensor<Device,1> &src, Shape<dimdst> shape ):src_(src){
-                this->shape_ = shape;
-            }
-        };
-
-        /*!
-         * \brief unpack local (overlap) patches of image to column of mat, can be used to implement convolution, this expression allow unpack of a batch        
-         *  this is a version support unpacking multiple images
-         *  after getting unpacked mat, we can use: output = dot( weight, mat ) to get covolved results, the relations:
-         * \tparam SrcExp source expression
-         * \tparam dstdim destination dimension
-         */
-        template<typename SrcExp, int srcdim>
-        struct UnpackPatchToColXExp: public MakeTensorExp< UnpackPatchToColXExp<SrcExp,srcdim>, SrcExp, 2>{
-            /*! \brief source operand */
-            const SrcExp& img_;
-            /*! \brief patch size */
-            index_t psize_;
-            /*! \brief patch stride */
-            index_t pstride_;
-            /*! \brief number of input channel */
-            index_t i_channel_;
-            /*! \brief height of img */
-            index_t i_height_;
-            /*! \brief width of img */
-            index_t i_width_;            
-            /*! \brief constructor */
-            UnpackPatchToColXExp( const SrcExp &img, index_t psize, index_t pstride )
-                :img_(img), psize_(psize), pstride_(pstride){
-                Shape<srcdim> imshape = ShapeCheck<srcdim,SrcExp>::Check( img_ );
-                utils::Assert( imshape[0] >= psize && imshape[1] >= psize, "UnpackPatchToCol:image shape smaller than patch size");
-                this->i_channel_ = imshape[2];
-                this->i_height_  = imshape[1];
-                this->i_width_   = imshape[0];
-                // calculate number of batches 
-                const index_t num = imshape.ProdShape( 3, srcdim );
-                const index_t o_height = ( i_height_ - psize ) / pstride + 1;
-                const index_t o_width  = ( i_width_  - psize ) / pstride + 1;
-                this->shape_[0] = o_height * o_width * num;
-                this->shape_[1] = psize * psize * imshape[2];
-            }
-        };
-
-        /*!
-         * \brief reverse operation of UnpackPatchToCol, used to backprop gradient back
-         *    this is a version supporting multiple images
-         * \tparam Device which device it lies
-         * \tparam dstdim destination dimension
-         */
-        template<typename Device, int dstdim>
-        struct PackColToPatchXExp: public MakeTensorExp< PackColToPatchXExp<Device,dstdim>, Tensor<Device,2>, dstdim>{
-            /*! \brief source operand */
-            const Tensor<Device,2>& mat_;
-            /*! \brief patch size */
-            index_t psize_;
-            /*! \brief patch stride */
-            index_t pstride_;
-            /*! \brief constructor */
-            PackColToPatchXExp( const Tensor<Device,2> &mat, Shape<dstdim> imshape, index_t psize, index_t pstride )
-                :mat_(mat), psize_(psize), pstride_(pstride){
-                this->shape_ = imshape;
-                const index_t o_height = ( imshape[1]  - psize ) / pstride + 1;                
-                const index_t o_width  = ( imshape[0]  - psize ) / pstride + 1;                
-                utils::Assert( mat.shape[0] == o_height * o_width * imshape.ProdShape(3,dstdim), "PackColToPatchExp: mat.shape[0] mismatch" );
-                utils::Assert( mat.shape[1] == psize * psize * imshape[2], "PackColToPatchExp: mat.shape[1] mismatch" );
-            }
-        };
-
-        /*!
-         * \brief reshape the content to another shape
-         * input: Tensor<Device,dimsrc>: ishape
-         * output: Tensor<Device,dimdst> ishape.Size() == oshape.Size()
-         * \tparam SrcExp source expression
-         * \tparam dimdst target dimension
-         * \tparam dimsrc source dimension
-         */
-        template<typename SrcExp, int dimdst, int dimsrc>
-        struct ReshapeExp: public MakeTensorExp< ReshapeExp<SrcExp,dimdst,dimsrc>, SrcExp, dimdst>{
-            /*! \brief source expression */
-            const SrcExp& src_;
-            /*! \brief smallest dimension of input */
-            index_t ishape0_;
-            /*! \brief constructor */
-            ReshapeExp( const SrcExp &src, Shape<dimdst> shape ):src_(src){
-                Shape<dimsrc> ishape = ShapeCheck<dimsrc,SrcExp>::Check( src_ );
-                utils::Assert( ishape.Size() == shape.Size(), "reshape size must match" );
-                ishape0_ = ishape[0];
-                this->shape_ = shape;
-            }
-        };
-
-        /*!
-         * \brief swap two axis of a tensor
-         * input: Tensor<Device,dim>: ishape
-         * output: Tensor<Device,dimdst> oshape[a1],oshape[a2] = ishape[a2],oshape[a1]
-         *
-         * \tparam SrcExp type of source expression
-         * \tparam dimsrc source dimension
-         * \tparam a1 smaller dimension to be swapped
-         * \tparam a2 larger dimension to be swapped
-         */
-        template<typename SrcExp,int dimsrc, int a1, int a2>
-        struct SwapAxisExp: public MakeTensorExp< SwapAxisExp<SrcExp,dimsrc,a1,a2>, SrcExp, dimsrc>{
-            /*! \brief source expression */
-            const SrcExp& src_;
-            /*! \brief constructor */
-            SwapAxisExp( const SrcExp &src ):src_(src){                
-                this->shape_ = ShapeCheck<dimsrc,SrcExp>::Check(src); 
-                std::swap( this->shape_[a1], this->shape_[a2] );
-            }
-        };
-
-        /*!
-         * \brief reduction to 1 dimension tensor
-         * input: Tensor<Device,k>: ishape
-         * output: Tensor<Device,1> shape[0] = ishape[dimkeep];
-         *
-         * \tparam EType type of expression to be reduced
-         * \tparam Reducer which reducer to use
-         * \tparam srcdim dimension of source
-         * \tparam dimkeep which dimension to be kept,
-         */
-        template<typename EType, typename Reducer,int dimkeep>
-        struct ReduceTo1DExp: public Exp< ReduceTo1DExp<EType,Reducer, dimkeep>, type::kComplex >{
-            /*! \brief source operand */
-            const EType& src_;
-            /*! \brief source operand, scale of the  */
-            real_t scale_;
-            /*! \brief construct a repmat expression from src and nrow */
-            ReduceTo1DExp( const EType& src, real_t scale ):src_(src),scale_(scale){}
-        };
-
-        /*!
-         * \brief pooling expression, do reduction over local patches of a image
-         * \tparam Reducer reduction method during pooling
-         * \tparam SrcExp source expression to be pooled from
-         * \tparam srcdim dimension of src
-         */
-        template<typename Reducer, typename SrcExp, int srcdim>
-        struct PoolingExp: public MakeTensorExp< PoolingExp<Reducer, SrcExp,srcdim>, SrcExp, srcdim> {
-            /*! \brief source operand */
-            const SrcExp& src_;
-            /*! \brief kernel size */
-            index_t ksize_;
-            /*! \brief kernel stride */
-            index_t kstride_;
-            /*! \brief source height shape[1] */
-            index_t src_height_;
-            /*! \brief source width shape[0] */
-            index_t src_width_;
-            /*! \brief constructor */
-            PoolingExp( const SrcExp &src, index_t ksize, index_t kstride )
-                : src_(src), ksize_(ksize), kstride_(kstride) {
-                Shape< srcdim > sshape = ShapeCheck< srcdim,SrcExp>::Check( src_ );
-                utils::Assert( sshape[0] >= ksize && sshape[1] >= ksize, "pool: kernel must be smaller than image" );
-                this->src_height_ = sshape[1];
-                this->src_width_  = sshape[0];
-                this->shape_ = sshape;
-                this->shape_[1] =  (src_height_ - ksize) / kstride + 1;                
-                this->shape_[0] =  (src_width_  - ksize) / kstride + 1;
-            }
-            /*! \brief constructor, specify shape */
-            PoolingExp( const SrcExp &src, Shape<2> pshape, index_t ksize, index_t kstride )
-                : src_(src), ksize_(ksize), kstride_(kstride) {
-                Shape< srcdim > sshape = ShapeCheck< srcdim,SrcExp>::Check( src_ );
-                utils::Assert( sshape[0] >= ksize && sshape[1] >= ksize, "pool: kernel must be smaller than image" );
-                this->src_height_ = sshape[1];
-                this->src_width_  = sshape[0];
-                this->shape_    = sshape;
-                this->shape_[1] = pshape[1];
-                this->shape_[0] = pshape[0];
-            } 
-        };
-
-        /*!
-         * \brief unpooling expr reverse operation of pooling, used to pass gradient back
-         * \tparam Reducer specifies reduction operation during pooling
-         * \tparam Device which device it lies
-         */
-        template<typename Reducer, typename Device>
-        struct UnPoolingExp: public MakeTensorExp< UnPoolingExp<Reducer, Device>, Tensor<Device,4>, 4> {
-            /*! \brief source input, corresponds to src in pooling */
-            const Tensor<Device, 4>& data_src_;
-            /*! \brief result of pooled data, corresponds to result of pooling */
-            const Tensor<Device, 4>& data_pooled_;
-            /*! \brief gradient data of pooled part, to be propgate down */
-            const Tensor<Device, 4>& grad_pooled_;
-            /*! \brief kernel size */
-            index_t ksize_;
-            /*! \brief kernel stride */
-            index_t kstride_;
-            /*! \brief constructor */
-            UnPoolingExp( const Tensor<Device,4> &data_src,  const Tensor<Device,4> &data_pooled,
-                          const Tensor<Device,4> &grad_pooled, index_t ksize, index_t kstride )
-                : data_src_(data_src), data_pooled_(data_pooled), grad_pooled_(grad_pooled),
-                  ksize_(ksize), kstride_(kstride) {
-                utils::Assert( grad_pooled.shape == data_pooled.shape, "UnPoolingExp: pooled shape mismatch" );
-                utils::Assert( grad_pooled.shape[2] == data_src.shape[2], "UnPoolingExp: pool and src shape mismatch" );
-                utils::Assert( grad_pooled.shape[3] == data_src.shape[3], "UnPoolingExp: pool and src shape mismatch" );
-                this->shape_ = data_src_.shape;
-            }
-        };
-
-        /*!
-         * \brief padding expression, pad a image with zeros
-         * \tparam SrcExp source expression to be pooled from
-         * \tparam srcdim dimension of src
-         */
-        template<typename SrcExp, int srcdim>
-        struct PaddingExp : public MakeTensorExp<PaddingExp<SrcExp, srcdim>, SrcExp, srcdim> {
-            /*! \brief source operand */
-            const SrcExp& src_;
-            /*! \brief pad size */
-            index_t pad_;
-            /*! \brief source tensor height */
-            index_t src_height_;
-            /*! \brief source tensor width */
-            index_t src_width_;
-            /*! \brief constructor */
-            PaddingExp( const SrcExp &src, index_t pad )
-                : src_(src), pad_(pad) {
-                this->shape_ = ShapeCheck<srcdim,SrcExp>::Check( src_ );
-                src_height_ = this->shape_[1];
-                src_width_  = this->shape_[0];
-                this->shape_[1] += pad * 2; // height
-                this->shape_[0] += pad * 2; // width
-            }
-        };
-
-        /*!
-         * \brief crop expression, cut off the boundary region, reverse operation of padding
-         * \tparam SrcExp source expression to be pooled from
-         * \tparam srcdim dimension of src
-         */
-        template<typename SrcExp, int srcdim>
-        struct CroppingExp : public MakeTensorExp< CroppingExp<SrcExp, srcdim>, SrcExp, srcdim> {
-            /*! \brief source operand */
-            const SrcExp& src_;
-            /*! \brief pad height */
-            index_t pad_height_;
-            /*! \brief pad height */
-            index_t pad_width_;
-            /*! \brief src height */
-            index_t src_height_;
-            /*! \brief constructor */
-            CroppingExp(const SrcExp &src, Shape<2> cshape ): src_(src) {
-                this->shape_ = ShapeCheck<srcdim,SrcExp>::Check( src_ );
-                utils::Assert(this->shape_[1] >= cshape[1], "CroppingExp: height requirement not met");
-                utils::Assert(this->shape_[0] >= cshape[0], "CroppingExp: width requirement not met");
-                pad_height_ = (this->shape_[1] - cshape[1]) / 2;
-                pad_width_ = (this->shape_[0] - cshape[0]) / 2;
-                src_height_ = this->shape_[1];
-                this->shape_[1] = cshape[1]; // width
-                this->shape_[0] = cshape[0]; // height
-            }
-            /*! \brief constructor */
-            CroppingExp(const SrcExp &src, Shape<2> cshape, index_t start_height, index_t start_width  )
-                : src_(src), pad_height_(start_height), pad_width_(start_width) {
-                this->shape_ = ShapeCheck<srcdim,SrcExp>::Check( src_ );
-                utils::Assert(this->shape_[1] >= cshape[1], "CroppingExp: height requirement not met");
-                utils::Assert(this->shape_[0] >= cshape[0], "CroppingExp: width requirement not met");
-                src_height_ = this->shape_[1];
-                this->shape_[1] = cshape[1]; // width
-                this->shape_[0] = cshape[0]; // height
-            }
-
-        }; // struct CroppingExp
-
-
-        /*!
-         * \brief mirror expression, mirror a image in width
-         * \tparam SrcExp source expression to be mirrored
-         * \tparam srcdim dimension of src
-         */
-        template<typename SrcExp, int srcdim>
-        struct MirroringExp : public MakeTensorExp<MirroringExp<SrcExp, srcdim>, SrcExp, srcdim> {
-            /*! \brief source operand */
-            const SrcExp& src_;
-            /*! \brief constructor */
-            MirroringExp( const SrcExp &src ): src_(src) {
-                this->shape_ = ShapeCheck<srcdim,SrcExp>::Check( src_ );
-            }
-        };
-
-        /*!
-         * \brief channel pooling expression, do reduction over (local nearby) channels, used to implement local response normalization
-         * \tparam Reducer reduction method during pooling
-         * \tparam SrcExp source expression to be pooled from
-         * \tparam srcdim dimension of src
-         */
-        template<typename Reducer, typename SrcExp, int srcdim>
-        struct ChannelPoolingExp: public MakeTensorExp< ChannelPoolingExp<Reducer, SrcExp,srcdim>, SrcExp, srcdim> {
-            /*! \brief source operand */
-            const SrcExp& src_;
-            /*! \brief neighbor size */
-            index_t nsize_;            
-            /*! \brief constructor */
-            ChannelPoolingExp( const SrcExp &src, index_t nsize ): src_(src), nsize_(nsize){
-                utils::Assert( nsize % 2 == 1, "ChannelPoolingExp: local size must be odd, to make it symmetric" );
-                this->shape_ = ShapeCheck<srcdim,SrcExp>::Check( src_ );
-                utils::Assert( this->shape_[2] >= nsize_, "ChannelPoolingExp: local size need to be smaller than number of channels" );
-            }
-        };
-    }; // namespace expr
-
-
-    // Declaration of all functions go here
-    namespace expr{
-        /*! \brief operator overload */
-        template<typename E, typename R,int d>
-        inline ReduceTo1DExp<E,R,d> operator*( const ReduceTo1DExp<E,R,d> &e, real_t scale ){
-            return ReduceTo1DExp<E,R,d>( e.src_, e.scale_*scale );
-        }
-        /*! \brief operator overload */
-        template<typename E, typename R,int d>
-        inline ReduceTo1DExp<E,R,d> operator*( real_t scale, const ReduceTo1DExp<E,R,d> &e ){
-            return ReduceTo1DExp<E,R,d>( e.src_, e.scale_*scale );
-        }
-
-        /*!
-         * \brief a expression that replicate a 1 dimension tensor in dimension dimcast
-         * \param src Tensor<Device,1>: shape[0]
-         * \param shape shape of output
-         * \return a expresion with type Tensor<Device,dimdst>
-         * \tparam dimcast target dimension where the 1D tensor will be broadcasted
-         * \tparam Device which device it lies
-         * \tparam dimdst dimension of destination tensor
-         */
-        template<int dimcast,typename Device,int dimdst>
-        inline Broadcast1DExp<Device,dimdst,dimcast> broadcast( const Tensor<Device,1> &src, Shape<dimdst> shape ){
-            TypeCheckPass< dimcast<dimdst >::Error_Expression_Does_Not_Meet_Dimension_Req();
-            utils::Assert( src.shape[0] == shape[dimcast], "broadcast, shape mismatch" );
-            return Broadcast1DExp<Device,dimdst,dimcast>( src, shape );
-        }
-
-        /*!
-         * \brief  unpack local (overlap) patches of image to column of mat, can be used to implement convolution
-         *  after getting unpacked mat, we can use: output = dot( weight, mat ) to get covolved results, the relations:
-         *
-         *  weight; shape[1]: out_channel, shape[0]: ichannel*psize*psize
-         *  output; shape[1]: out_channel, shape[0]: out_height*out_width * num_of_images
-         *  out_height = ( in_height - psize ) / pstride + 1, this means we pad inperfect patch with 0
-         *  out_width  = ( in_width - psize ) / pstride + 1
-         *
-         * \return mat target matrix; shape[1]: in_channel*psize*psize  shape[0]: out_height*out_width * num_of_images
-         * \param img source image; shape[2]:  in_channels, shape[1]: in_height, shape[0]: in_width, can be 3D or 4D tensor(multiple images)
-         * \param psize height and width of each patch
-         * \param pstride stride of each patch
-         * \tparam SrcExp source expression
-         * \tparam etype type of expression
-         */
-        template<typename SrcExp, int etype>
-        inline UnpackPatchToColXExp<SrcExp, ExpInfo<SrcExp>::kDim > unpack_patch2col( const Exp<SrcExp,etype> &img, index_t psize, index_t pstride ){
-            TypeCheckPass< ExpInfo<SrcExp>::kDim >= 3 >::Error_Expression_Does_Not_Meet_Dimension_Req();
-            return UnpackPatchToColXExp<SrcExp, ExpInfo<SrcExp>::kDim >( img.self(), psize, pstride );
-        }
-
-        /*!
-         * \brief reverse operation of pack_col2patch, can be used to implement deconvolution
-         * \return packed img expression
-         * \param mat source matrix
-         * \param imshape shape of target img
-         * \param psize height and width of each patch
-         * \param pstride stride of each patch
-         * \tparam Device the Device where input data lies
-         */
-        template<typename Device, int dstdim>
-        inline PackColToPatchXExp<Device,dstdim> pack_col2patch( const Tensor<Device,2> &mat, Shape<dstdim> imshape, index_t psize, index_t pstride ){
-            utils::Assert( imshape[0] >= psize && imshape[1] >= psize, "PackColToPatch:image shape smaller than patch size");
-            return PackColToPatchXExp<Device,dstdim>( mat, imshape, psize, pstride );
-        }
-        /*!
-         * \brief a expression that reshapes a tensor to another shape
-         * \param src Tensor<Device,dimsrc>:
-         * \param oshape target shape
-         * \return a expresion with type Tensor<Device,dimdst>
-         * \tparam SrcExp source expression
-         * \tparam etype source expression type
-         * \tparam dimdst target dimension
-         */
-        template<typename SrcExp, int etype, int dimdst>
-        inline ReshapeExp< SrcExp,dimdst, ExpInfo<SrcExp>::kDim > reshape( const Exp<SrcExp,etype> &src, Shape<dimdst> oshape ){
-            return ReshapeExp< SrcExp,dimdst, ExpInfo<SrcExp>::kDim >( src.self(), oshape );
-        }
-
-        /*!
-         * \brief a expression that reshapes a tensor to another shape
-         * \param src Tensor<Device,dimsrc>:
-         * \return a expresion with type Tensor<Device,dimdst>
-         * \tparam a1 smaller dimension to be swapped
-         * \tparam a2 larger dimension to be swapped
-         * \tparam SrcExp source expression
-         * \tparam etype source expression type
-         */
-        template<int a1, int a2, typename SrcExp, int etype>
-        inline SwapAxisExp< SrcExp, ExpInfo<SrcExp>::kDim, a1,a2> swapaxis( const Exp<SrcExp,etype> &src ){ 
-            typedef ExpInfo<SrcExp> Info;
-            TypeCheckPass< Info::kDim>=a1+1 && Info::kDim >= a2+1 && a1+1 <= a2 >::Error_Expression_Does_Not_Meet_Dimension_Req();
-            return SwapAxisExp< SrcExp,Info::kDim,a1,a2>( src.self() );
-        }
-
-        /*!
-         * \brief a sum over all dimensions, except dimkeep
-         * \param exp input expression that must be a matrix Tensor<?,2>
-         * \return a expresion with type Tensor<Device,1>
-         * \tparam dimkeep the dimension that will be kept
-         * \tparam SrcExp expression
-         * \tparam etype type of expression
-         */
-        template<int dimkeep,  typename SrcExp, int etype>
-        inline ReduceTo1DExp<SrcExp, red::sum, dimkeep > sumall_except_dim( const Exp<SrcExp,etype> &exp ){
-            return ReduceTo1DExp<SrcExp,red::sum,dimkeep>( exp.self(), 1.0f );
-        }
-
-        /*!
-         * \brief pooling subregion results together
-         * \param src source image, shape[3]: batch, shape[2]: channel shape[1]: height shape[0]:width
-         * \param ksize kernel size
-         * \param kstride stride for each kernel
-         * \return expression of pooled result
-         * \tparam Reducer reducer type
-         * \tparam SrcExp source expression
-         * \tparam etype type of expression
-         */
-        template<typename Reducer, typename SrcExp, int etype>
-        inline PoolingExp<Reducer,SrcExp, ExpInfo<SrcExp>::kDim > pool( const Exp<SrcExp,etype> &src, index_t ksize, index_t kstride ) {
-            TypeCheckPass< ExpInfo<SrcExp>::kDim >= 2 >::Error_Expression_Does_Not_Meet_Dimension_Req();
-            return PoolingExp<Reducer,SrcExp, ExpInfo<SrcExp>::kDim >(src.self(), ksize, kstride);
-        }
-        /*! 
-         * \brief same as pool, except the output shape is specified by pshape
-         * \param src source image
-         * \param pshape ouput shape 
-         * \param ksize kernel size
-         * \param kstride stride for each kernel
-         * \return expression of pooled result
-         * \tparam Reducer reducer type
-         * \tparam SrcExp source expression
-         * \tparam etype type of expression
-         */
-        template<typename Reducer, typename SrcExp, int etype>
-        inline PoolingExp<Reducer,SrcExp, ExpInfo<SrcExp>::kDim > pool( const Exp<SrcExp,etype> &src, Shape<2> pshape, index_t ksize, index_t kstride ) {
-            TypeCheckPass< ExpInfo<SrcExp>::kDim >= 2 >::Error_Expression_Does_Not_Meet_Dimension_Req();
-            return PoolingExp<Reducer,SrcExp, ExpInfo<SrcExp>::kDim >(src.self(), pshape, ksize, kstride);
-        }
-        /*!
-         * \brief unpooling gradient for 4D, backprop gradient value back, revserse operation of pooling
-         * \param data_src  source input, corresponds to src in pooling
-         * \param data_pooled result of pooled data, corresponds to result of pooling
-         * \param grad_pooled gradient data of pooled part, to be propgate down
-         * \param ksize kernel size
-         * \param kstride stride for each kernel
-         * \return expression corresponding to unpooled 4D Tensor, storing backproped gradient
-         * \tparam Reducer reducer type
-         * \tparam Device device where data lies
-         */
-         template<typename Reducer, typename Device>
-         inline UnPoolingExp<Reducer, Device> unpool( const Tensor<Device,4>&data_src, const Tensor<Device,4> &data_pooled,
-                                                      const Tensor<Device,4> &grad_pooled, index_t ksize, index_t kstride ) {
-             return UnPoolingExp<Reducer, Device>(data_src, data_pooled, grad_pooled,ksize, kstride);
-         }
-
-        /*!
-         * \brief padding expression, pad a image with zeros on boundaries, padding affects shape[0], and shape[1]
-         * \param src original image batches
-         * \param pad padding size
-         * \return expression corresponding to padded result
-         * \tparam SrcExp source expression
-         * \tparam etype type of expression
-         */
-         template<typename SrcExp, int etype>
-         inline PaddingExp<SrcExp, ExpInfo<SrcExp>::kDim> pad(const Exp<SrcExp, etype> &src, index_t pad) {
-             TypeCheckPass< ExpInfo<SrcExp>::kDim >= 2 >::Error_Expression_Does_Not_Meet_Dimension_Req();
-             return PaddingExp<SrcExp, ExpInfo<SrcExp>::kDim>(src.self(), pad);
-         }
-
-        /*!
-         * \brief revserse operationg of padding, cut off boundaries, crop output from center of input
-         * \param src original image batches
-         * \param oshape output shape to be cropped
-         * \return expression corresponding to padded result
-         * \tparam SrcExp source expression
-         * \tparam etype type of expression
-         */
-         template<typename SrcExp, int etype>
-         inline CroppingExp<SrcExp, ExpInfo<SrcExp>::kDim> crop( const Exp<SrcExp, etype> &src, Shape<2> oshape ) {
-             TypeCheckPass< ExpInfo<SrcExp>::kDim >= 2 >::Error_Expression_Does_Not_Meet_Dimension_Req();
-             return CroppingExp<SrcExp, ExpInfo<SrcExp>::kDim>(src.self(), oshape);
-         }
-        /*!
-         * \brief same as crop, but can specify starting position to do cropping
-         * \param src original image batches
-         * \param oshape output shape to be cropped
-         * \param start_height start height position to do cropping
-         * \param start_width  start width position to do cropping
-         * \return expression corresponding to padded result
-         * \tparam SrcExp source expression
-         * \tparam etype type of expression
-         */
-         template<typename SrcExp, int etype>
-         inline CroppingExp<SrcExp, ExpInfo<SrcExp>::kDim> crop( const Exp<SrcExp, etype> &src, Shape<2> oshape, index_t start_height, index_t start_width ) {
-             TypeCheckPass< ExpInfo<SrcExp>::kDim >= 2 >::Error_Expression_Does_Not_Meet_Dimension_Req();
-             return CroppingExp<SrcExp, ExpInfo<SrcExp>::kDim>(src.self(), oshape, start_height, start_width);
-         }
-
-        /*!
-         * \brief mirroring expression, mirror images in width
-         * \param src original image batches
-         * \return expression corresponding to mirrored result
-         * \tparam SrcExp source expression
-         * \tparam etype type of expression
-         */
-         template<typename SrcExp, int etype>
-         inline MirroringExp<SrcExp, ExpInfo<SrcExp>::kDim> mirror(const Exp<SrcExp, etype> &src) {
-             TypeCheckPass< ExpInfo<SrcExp>::kDim >= 2 >::Error_Expression_Does_Not_Meet_Dimension_Req();
-             return MirroringExp<SrcExp, ExpInfo<SrcExp>::kDim>(src.self());
-         }
-
-        /*!
-         * \brief  channel pooling, do reduction over (local nearby) channels, used to implement local response normalization
-         * \param src source data 
-         * \param nsize neighbor size 
-         * \return expression of pooled result
-         * \tparam Reducer reducer type
-         * \tparam SrcExp source expression
-         * \tparam etype type of expression
-         */
-        template<typename Reducer, typename SrcExp, int etype>
-        inline ChannelPoolingExp<Reducer,SrcExp, ExpInfo<SrcExp>::kDim > chpool( const Exp<SrcExp,etype> &src, index_t nsize ) {
-            TypeCheckPass< ExpInfo<SrcExp>::kDim >= 3 >::Error_Expression_Does_Not_Meet_Dimension_Req();
-            return ChannelPoolingExp<Reducer,SrcExp, ExpInfo<SrcExp>::kDim >(src.self(),nsize);
-        }
-        // short cut functions
-        /*!
-         * \brief a expression that replicate a 1 dimension tensor for nrow times
-         * \param src Tensor<Device,1>: shape[0]
-         * \param nrow number of rows to replicate
-         * \return a expresion with type Tensor<Device,2> shape[0], shape[1] = nrow
-         * \tparam Device which device it lies
-         */
-        template<typename Device>
-        inline Broadcast1DExp<Device,2,0> repmat( const Tensor<Device,1> &src, index_t nrow ){
-            return broadcast<0>( src, Shape2( nrow, src.shape[0] ) );
-        }
-        /*!
-         * \brief a expression that sum over rows of a matrix
-         * \param exp input expression that must be a matrix Tensor<?,2>
-         * \return a expresion with type Tensor<Device,1>
-         * \tparam SrcExp expression
-         * \tparam etype type of expression
-         */
-        template<typename SrcExp, int etype>
-        inline ReduceTo1DExp<SrcExp, red::sum, 0 > sum_rows( const Exp<SrcExp,etype> &exp ){
-            return sumall_except_dim<0>( exp );
-        }
-
-    }; // namespace expr
-}; // namespace mshadow
-
-// ==================================================
-//  implementations afterwards,
-//  no need to read if only use the functions
-// --------------------------------------------------
-namespace mshadow{
-    namespace expr{
-        template<typename SV, typename Device, typename EType, typename Reducer, int dimkeep>
-        struct ExpComplexEngine< SV, Device, 1, ReduceTo1DExp<EType,Reducer,dimkeep> >{
-            inline static void Eval( Tensor<Device,1> &dst, const ReduceTo1DExp<EType,Reducer,dimkeep> &exp ){
-                TypeCheckPass< dimkeep!=0 >::Error_Expression_Does_Not_Meet_Dimension_Req();
-                MapReduceKeepHighDim<SV,Reducer,dimkeep>( dst, exp.src_, exp.scale_ );
-            }
-        };
-
-        template<typename SV, typename Device, typename EType, typename Reducer>
-        struct ExpComplexEngine< SV, Device, 1, ReduceTo1DExp<EType,Reducer,0> >{
-            inline static void Eval( Tensor<Device,1> &dst, const ReduceTo1DExp<EType,Reducer,0> &exp ){
-                MapReduceKeepLowest<SV,Reducer>( dst, exp.src_, exp.scale_ );
-            }
-        };
-    }; // namespace expr
-
-    namespace expr{
-        /*! \brief execution plan of Broadcast1DExp */
-        template<typename Device, int dimdst, int dimcast>
-        struct Plan< Broadcast1DExp<Device,dimdst,dimcast> >{
-        public:
-            Plan( const Broadcast1DExp<Device,dimdst,dimcast> &e )
-                : dptr_( e.src_.dptr ), 
-                  ystride_( e.shape_.ProdShape(1,dimcast) ),
-                  length_(e.shape_[dimcast]){
-                TypeCheckPass< dimcast!=0 >::Error_Expression_Does_Not_Meet_Dimension_Req();
-            }
-            MSHADOW_XINLINE real_t Eval( index_t y, index_t x ) const{
-                return dptr_[ (y / ystride_) % length_ ];
-            }
-        private:
-            const real_t  *dptr_;
-            const index_t  ystride_, length_;
-        };
-
-        /*! \brief execution plan of Broadcast1DExp */
-        template<typename Device, int dimdst>
-        struct Plan< Broadcast1DExp<Device,dimdst,0> >{
-        public:
-            Plan( const Broadcast1DExp<Device,dimdst,0> &e ): dptr_( e.src_.dptr ){}
-            MSHADOW_XINLINE real_t Eval( index_t y, index_t x ) const{
-                return dptr_[ x ];
-            }
-        private:
-            const real_t *dptr_;
-        };
-    }; // namespace expr
-
-    namespace expr{
-        template<typename SrcExp, int srcdim>
-        struct Plan< UnpackPatchToColXExp<SrcExp,srcdim> >{
-        public:
-            Plan( const UnpackPatchToColXExp<SrcExp,srcdim> &e )
-                :src_(MakePlan(e.img_)),psize_(e.psize_), pstride_(e.pstride_),
-                 i_channel_(e.i_channel_), i_height_(e.i_height_), i_width_(e.i_width_),                 
-                 o_height_(( i_height_  - psize_ ) / pstride_ + 1),
-                 o_width_ (( i_width_   - psize_ ) / pstride_ + 1){
-            }
-            MSHADOW_XINLINE real_t Eval( index_t i, index_t j ) const{
-                const index_t x_offset = i % psize_;
-                const index_t idivp    = i / psize_;
-                const index_t y_offset = idivp % psize_;
-                const index_t c = idivp / psize_;                
-                const index_t x = (j % o_width_) * pstride_ + x_offset;
-                const index_t jdivw = j / o_width_;
-                const index_t y = (jdivw % o_height_) * pstride_ + y_offset;
-                const index_t n = jdivw / o_height_;
-
-                if( x < i_width_ && y < i_height_ ){
-                    return src_.Eval( ( n * i_channel_  + c ) * i_height_ + y, x );
-                }else{
-                    return 0.0f;
-                }
-            }
-        private:
-            Plan<SrcExp> src_;
-            const index_t psize_, pstride_, i_channel_, i_height_, i_width_, o_height_, o_width_;
-        };
-
-        template<typename Device, int dstdim>
-        struct Plan< PackColToPatchXExp<Device, dstdim> >{
-        public:
-            Plan( const PackColToPatchXExp<Device, dstdim> &e )
-                :mat_(e.mat_), psize_(e.psize_), pstride_(e.pstride_),
-                 i_channel_(e.shape_[2]), i_height_(e.shape_[1]),
-                 o_width_(( e.shape_[0]  - psize_ ) / pstride_ + 1),
-                 o_height_(( e.shape_[1]  - psize_ ) / pstride_ + 1){
-                // note: i/o convention are same as unpack
-            }
-            MSHADOW_XINLINE real_t Eval( index_t i, index_t j ) const{
-                using namespace std;
-                const index_t y = i % i_height_;
-                const index_t idivh = i / i_height_;                
-                const index_t c = idivh % i_channel_;
-                const index_t n = idivh / i_channel_; 
-                const index_t x = j;
-                const index_t py_min = y < psize_ ? 0 : (y-psize_+pstride_)/pstride_;
-                const index_t px_min = x < psize_ ? 0 : (x-psize_+pstride_)/pstride_;
-                const index_t py_max = min( (y+pstride_)/pstride_, o_height_);
-                const index_t px_max = min( (x+pstride_)/pstride_, o_width_ );
-                real_t res = 0.0f;
-                for( index_t py = py_min; py < py_max; ++py ){
-                    for( index_t px = px_min; px < px_max; ++px ){
-                        res += mat_[ (c * psize_ + y - py*pstride_) * psize_ + x - px*pstride_ ][ (n * o_height_ + py) * o_width_+px ];
-                    }
-                }
-                return res;
-            }
-        private:
-            Tensor<Device,2> mat_;
-            const index_t psize_, pstride_, i_channel_, i_height_, o_width_, o_height_;
-        };
-    };
-
-    namespace expr{
-        template<typename SrcExp, int dimdst, int dimsrc>
-        struct Plan< ReshapeExp<SrcExp,dimdst,dimsrc> >{
-        public:
-            Plan( const ReshapeExp<SrcExp,dimdst,dimsrc> &e )
-                : src_(MakePlan(e.src_)), oshape0_(e.shape_[0]), ishape0_(e.ishape0_){
-            }
-            MSHADOW_XINLINE real_t Eval( index_t y, index_t x ) const{
-                const index_t idx = y * oshape0_ + x;
-                return src_.Eval( idx / ishape0_, idx % ishape0_ );
-            }
-        private:
-            Plan<SrcExp> src_;
-            const index_t oshape0_, ishape0_;
-        };
-        // special work plan for 1 dimensional data
-        template<typename SrcExp,int dimdst>
-        struct Plan< ReshapeExp<SrcExp,dimdst,1> >{
-        public:
-            Plan( const ReshapeExp<SrcExp,dimdst,1> &e )
-                : src_(MakePlan(e.src_)), oshape0_(e.shape_[0]){
-            }
-            MSHADOW_XINLINE real_t Eval( index_t y, index_t x ) const{
-                return src_.Eval( 0, y * oshape0_ + x );
-            }
-        private:
-            Plan<SrcExp> src_;
-            const index_t oshape0_;
-        };
-    };
-    
-    namespace expr{
-        template<typename SrcExp,int dimsrc, int a1, int a2>
-        struct Plan< SwapAxisExp<SrcExp,dimsrc,a1,a2> >{
-        public:
-            Plan( const SwapAxisExp<SrcExp,dimsrc,a1,a2> &e )
-                : src_(MakePlan(e.src_)),
-                  shape1_( e.shape_.ProdShape( 1, a1 ) ),
-                  shape2_( e.shape_[a1] ),
-                  shape3_( e.shape_.ProdShape( a1+1, a2 ) ),
-                  shape4_( e.shape_[a2] ){
-            }
-            MSHADOW_XINLINE real_t Eval( index_t i, index_t j ) const{
-                const index_t y = i % shape1_;
-                i /= shape1_; 
-                const index_t z = i % shape2_;
-                i /= shape2_;
-                const index_t c = i % shape3_;
-                i /= shape3_;
-                const index_t n = i % shape4_;
-                // swap z and n
-                return src_.Eval( ((((i/shape4_)*shape2_ + z) * shape3_+c) * shape4_ + n ) * shape1_ + y, j ); 
-            }
-        private:
-            Plan<SrcExp> src_;
-            const index_t shape1_, shape2_, shape3_, shape4_;
-        };
-
-        template<typename SrcExp,int dimsrc, int a2>
-        struct Plan< SwapAxisExp<SrcExp,dimsrc,0,a2> >{
-        public:
-            Plan( const SwapAxisExp<SrcExp,dimsrc,0,a2> &e )
-                : src_(MakePlan(e.src_)),
-                  shape0_( e.shape_[0] ),
-                  shape1_( e.shape_.ProdShape(1,a2) ),
-                  shape2_( e.shape_[a2] ){
-            }
-            MSHADOW_XINLINE real_t Eval( index_t i, index_t x ) const{
-                // swap x and z
-                const index_t y = i % shape1_;
-                i /= shape1_; 
-                const index_t z = i % shape2_;
-                const index_t n = i / shape2_;
-                return src_.Eval(  ( n*shape0_ + x ) * shape1_ + y , z ); 
-            }
-        private:
-            Plan<SrcExp> src_;
-            const index_t shape0_, shape1_, shape2_;
-        };
-    };
-
-    namespace expr{
-        template<typename Reducer, typename SrcExp, int srcdim>
-        struct Plan< PoolingExp< Reducer, SrcExp, srcdim> > {
-        public:
-            Plan( const PoolingExp<Reducer, SrcExp, srcdim> &e )
-                : src_( MakePlan( e.src_ ) ), ksize_(e.ksize_), kstride_(e.kstride_),
-                  src_height_(e.src_height_),src_width_(e.src_width_), new_height_(e.shape_[1]) {
-            }
-            MSHADOW_XINLINE real_t Eval(index_t i, index_t j) const {
-                using namespace std;
-                const index_t py = i % new_height_;
-                const index_t y_start = py * kstride_;
-                const index_t y_end = min( y_start + ksize_, src_height_ );
-                const index_t px = j;
-                const index_t x_start = px * kstride_;
-                const index_t x_end = min( x_start + ksize_, src_width_ );
-                const index_t c = i / new_height_;
-
-                real_t res = Reducer::kInitV;
-                for (index_t y = y_start; y < y_end; ++y) {
-                    for (index_t x = x_start; x < x_end; ++x) {
-                        Reducer::Reduce( res, src_.Eval( c*src_height_+y, x ) );
-                    }
-                }
-                return res;
-            }
-        private:
-            Plan<SrcExp> src_;
-            const index_t ksize_, kstride_;
-            const index_t src_height_, src_width_;
-            const index_t new_height_;
-        };
-
-        template<typename Reducer, typename Device>
-        struct Plan<UnPoolingExp<Reducer, Device> > {
-        public:
-            Plan(const UnPoolingExp<Reducer, Device> &e)
-                : data_src_(e.data_src_), data_pooled_(e.data_pooled_), grad_pooled_(e.grad_pooled_),
-                  ksize_(e.ksize_), kstride_(e.kstride_) {}
-            MSHADOW_XINLINE real_t Eval(index_t i, index_t j) const {
-                using namespace std;
-                const index_t x = j;
-                const index_t y = i % data_src_.shape[1];
-                const index_t c = i / data_src_.shape[1];
-                const real_t vsrc = data_src_[0][c][y][x];
-
-                const index_t py_min = y < ksize_ ? 0 : (y-ksize_+kstride_)/kstride_;
-                const index_t px_min = x < ksize_ ? 0 : (x-ksize_+kstride_)/kstride_;
-                const index_t py_max = min( (y+kstride_)/kstride_, data_pooled_.shape[1]);
-                const index_t px_max = min( (x+kstride_)/kstride_, data_pooled_.shape[0]);
-
-                real_t val = 0;
-                for( index_t py = py_min; py < py_max; ++py ){
-                    for( index_t px = px_min; px < px_max; ++px ){
-                        val += Reducer::PartialGrad(vsrc, data_pooled_[0][c][py][px]) * grad_pooled_[0][c][py][px];
-                    }
-                }
-                return val;
-            }
-        private:
-            Tensor<Device, 4> data_src_, data_pooled_, grad_pooled_;
-            const index_t ksize_;
-            const index_t kstride_;
-        };
-    }; // namespace expr
-
-    namespace expr{
-        template<typename SrcExp, int srcdim>
-        struct Plan< PaddingExp<SrcExp, srcdim> > {
-        public:
-            Plan(const PaddingExp<SrcExp, srcdim> &e)
-                : src_(MakePlan(e.src_)), pad_(e.pad_), new_height_(e.shape_[1]),
-                  src_height_(e.src_height_), src_width_(e.src_width_) {}
-            MSHADOW_XINLINE real_t Eval(index_t i, index_t j) const {
-                const index_t x = j;
-                const index_t y = i % new_height_;
-                const index_t c = i / new_height_;
-                if (y < pad_ || x < pad_) return 0.0f;
-                const index_t h = y - pad_;
-                const index_t w = x - pad_;
-                if (h < src_height_ && w < src_width_) {
-                    return src_.Eval(c * src_height_ + h, w);
-                } else {
-                    return 0.0f;
-                }
-            }
-        private:
-            Plan<SrcExp> src_;
-            const index_t pad_;
-            const index_t new_height_;
-            const index_t src_height_;
-            const index_t src_width_;
-        };
-
-        template<typename SrcExp, int srcdim>
-        struct Plan<CroppingExp<SrcExp, srcdim> > {
-        public:
-            Plan(const CroppingExp<SrcExp, srcdim> &e)
-                : src_(MakePlan(e.src_)), pad_height_(e.pad_height_),pad_width_(e.pad_width_), 
-                  new_height_(e.shape_[1]), src_height_(e.src_height_) {}
-            MSHADOW_XINLINE real_t Eval(index_t i, index_t j) const {
-                const index_t x = j;
-                const index_t y = i % new_height_;
-                const index_t c = i / new_height_;
-                const index_t h = y + pad_height_;
-                const index_t w = x + pad_width_;
-                return src_.Eval(c * src_height_ + h, w);
-            }
-        private:
-            Plan<SrcExp> src_;
-            const index_t pad_height_, pad_width_;
-            const index_t new_height_;
-            const index_t src_height_;
-        };
-
-        template<typename SrcExp, int srcdim>
-        struct Plan< MirroringExp<SrcExp, srcdim> > {
-        public:
-            Plan(const MirroringExp<SrcExp, srcdim> &e)
-                : src_(MakePlan(e.src_)), width_(e.shape_[0]){}
-            MSHADOW_XINLINE real_t Eval(index_t i, index_t j) const {
-                return src_.Eval( i, width_ - j - 1 );
-            }
-        private:
-            Plan<SrcExp> src_;
-            const index_t width_;
-        };
-    }; // namespace expr
-
-    namespace expr{
-        template<typename Reducer, typename SrcExp, int srcdim>
-        struct Plan< ChannelPoolingExp< Reducer, SrcExp, srcdim> > {
-        public:
-            Plan( const ChannelPoolingExp<Reducer, SrcExp, srcdim> &e )
-                : src_( MakePlan( e.src_ ) ), channel_(e.shape_[2]),
-                  height_(e.shape_[1]),width_(e.shape_[0]), hnsize_(e.nsize_/2){
-            }
-            MSHADOW_XINLINE real_t Eval(index_t i, index_t j) const {
-                using namespace std;
-                const index_t y = i % height_;
-                i /= height_;
-                const index_t c = i % channel_;
-                const index_t n = i / channel_;
-                const index_t x = j;
-                const index_t cstart = c < hnsize_ ? 0  : c - hnsize_;
-                const index_t cend   = min( c + hnsize_ + 1, channel_ );
-                real_t res = Reducer::kInitV;
-                for( index_t cc = cstart; cc < cend; ++ cc ){
-                    Reducer::Reduce( res, src_.Eval( (n*channel_+cc)*height_ + y, x ) );
-                }
-                return res;
-            }
-        private:
-            Plan<SrcExp> src_;
-            const index_t channel_, height_, width_, hnsize_;
-        };
-    };
-}; // namespace mshadow
-
-#if MSHADOW_USE_SSE
-// implementations of SSE support, if possible
-#include "tensor_sse-inl.hpp"
-namespace mshadow{
-    namespace expr{
-        template<int dimdst>
-        struct SSECheck< Broadcast1DExp<cpu,dimdst,0> >{
-            const static bool kPass = true;
-        };
-        template<int dimdst>
-        struct SSEAlignCheck<2, Broadcast1DExp<cpu,dimdst,0> >{
-            inline static bool Check( const Broadcast1DExp<cpu,dimdst,0> &exp ){
-                return sse2::CheckAlign( exp.src_.dptr );
-            }
-        };
-        template<int dimdst>
-        class SSEPlan< Broadcast1DExp<cpu,dimdst,0> >{
-        public:
-            SSEPlan( const Broadcast1DExp<cpu,dimdst,0> &t )
-                :dptr_(t.src_.dptr){}
-            MSHADOW_CINLINE sse2::FVec<real_t> EvalSSE( index_t y, index_t x ) const{
-                return sse2::FVec<real_t>( &dptr_[ x ] );
-            }
-            MSHADOW_CINLINE real_t Eval( index_t y, index_t x ) const{
-                return dptr_[ x ];
-            }
-        private:
-            const real_t  *dptr_;
-        };
-    };
-};
-#endif
-
-#endif
-
diff --git a/include/mshadow/tensor_gpu-inl.hpp b/include/mshadow/tensor_gpu-inl.hpp
deleted file mode 100644
index a2c1fc4..0000000
--- a/include/mshadow/tensor_gpu-inl.hpp
+++ /dev/null
@@ -1,148 +0,0 @@
-#ifndef MSHADOW_TENSOR_GPU_INL_HPP
-#define MSHADOW_TENSOR_GPU_INL_HPP
-/*!
- * \file tensor_gpu-inl.hpp
- * \brief implementation of GPU host code
- * \author Bing Xu, Tianqi Chen
- */
-#include "tensor.h"
-
-#if !(MSHADOW_USE_CUDA)
-namespace mshadow {
-    // do nothing if no GPU operation is involved
-    inline void InitTensorEngine( int dev_id ){
-    }
-    inline void ShutdownTensorEngine( void ){
-    }
-};
-#else
-namespace mshadow {
-    #if (MSHADOW_USE_NVML)
-    inline int AutoSelectDevice(int device_count) {
-        // TODO nvml device id and cuda device id are not consistent
-        return 0;
-    }
-    #endif
-    inline void InitTensorEngine(int dev_id){
-        cudaDeviceProp prop;
-        int device_id = 0;
-        int device_count = 0;
-        cudaGetDeviceCount(&device_count);
-        utils::Assert(device_count > 0, "Cannot find CUDA device. Please check CUDA-Configuration");
-        if (dev_id < 0) {
-            #if (MSHADOW_USE_NVML)
-            device_id = AutoSelectDevice(device_count);
-            #endif
-        } else {
-            device_id = dev_id;
-        }
-        utils::Assert( device_id < device_count, "Incorrect Device ID" );
-        utils::Assert( cudaSetDevice(device_id) == cudaSuccess, "cannot set device" );
-        cudaGetDeviceProperties(&prop, device_id);
-        printf("Use CUDA Device %d: %s\n", device_id, prop.name);
-        cublasInit();
-    }
-    inline void ShutdownTensorEngine( void ){
-        cublasShutdown();
-    }
-
-    template<int dim>
-    inline void AllocSpace(Tensor<gpu,dim> &obj, bool pad){
-        size_t pitch;
-        // common choice for cuda mem align unit is 32
-        if( pad && obj.shape[0] >= MSHADOW_MIN_PAD_RATIO * 32 ){
-            cudaError_t err = cudaMallocPitch( (void**)&obj.dptr, &pitch, \
-                                               obj.shape[0] * sizeof(real_t), obj.FlatTo2D().shape[1] );
-            utils::Assert( err == cudaSuccess, cudaGetErrorString(err) );
-            obj.shape.stride_ = static_cast<index_t>( pitch / sizeof(real_t) );
-        }else{
-            obj.shape.stride_ = obj.shape[0];
-            cudaError_t err = cudaMallocPitch( (void**)&obj.dptr, &pitch, \
-                                               obj.shape.Size() * sizeof(real_t), 1 );
-            utils::Assert( err == cudaSuccess, cudaGetErrorString(err) );
-        }
-    }
-
-    template<int dim>
-    inline void FreeSpace(Tensor<gpu,dim> &obj){
-        cudaFree( obj.dptr ); obj.dptr = NULL;
-    }
-
-    template<typename A,typename B, int dim>
-    inline void Copy(Tensor<A,dim> _dst, Tensor<B,dim> _src, cudaMemcpyKind kind){
-        utils::Assert( _dst.shape == _src.shape, "Copy:shape mismatch" );
-        Tensor<A,2> dst = _dst.FlatTo2D();
-        Tensor<B,2> src = _src.FlatTo2D();
-        cudaError_t err = cudaMemcpy2D( dst.dptr, dst.shape.stride_ * sizeof(real_t),
-                                        src.dptr, src.shape.stride_ * sizeof(real_t),
-                                        dst.shape[0] * sizeof(real_t),
-                                        dst.shape[1], kind );
-        utils::Assert( err == cudaSuccess, cudaGetErrorString(err) );
-    }
-    template<int dim>
-    inline void Copy(Tensor<cpu,dim> dst, const Tensor<gpu,dim> &src){
-        Copy( dst, src, cudaMemcpyDeviceToHost );
-    }
-    template<int dim>
-    inline void Copy(Tensor<gpu,dim> dst, const Tensor<gpu,dim> &src){
-        Copy( dst, src, cudaMemcpyDeviceToDevice );
-    }
-    template<int dim>
-    inline void Copy(Tensor<gpu,dim> dst, const Tensor<cpu,dim> &src){
-        Copy( dst, src, cudaMemcpyHostToDevice );
-    }
-};
-
-#ifdef __CUDACC__
-// the following part is included only if compiler is nvcc
-#include "cuda/tensor_gpu-inl.cuh"
-
-namespace mshadow{
-    template<typename Saver, typename E, int dim>
-    inline void MapPlan(Tensor<gpu,dim> _dst, const expr::Plan<E> &plan){
-        cuda::MapPlan<Saver>( _dst.FlatTo2D(), plan );
-    }
-
-    template<typename Saver, int dim, typename E, int etype>
-    inline void MapExp(Tensor<gpu,dim> dst, const expr::Exp<E,etype> &exp ){
-        using namespace expr;
-        TypeCheckPass< TypeCheck<gpu,dim,E>::kMapPass >::Error_All_Tensor_in_Exp_Must_Have_Same_Type();
-        Shape<dim> eshape = ShapeCheck<dim,E>::Check( exp.self() );
-        utils::Assert( eshape[0] == 0 || eshape == dst.shape, "Assignment: Shape of Tensors in expression is not consistent with target" );
-        MapPlan<Saver>( dst, MakePlan( exp.self() ) );
-    }
-
-    template<typename Saver, typename Reducer, typename E, int etype>
-    inline void MapReduceKeepLowest( Tensor<gpu,1> dst, const expr::Exp<E,etype> &exp, real_t scale ){
-        using namespace expr;
-        TypeCheckPass< TypeCheck<gpu,1,E>::kRedPass >::Error_TypeCheck_Not_Pass_For_Reduce_Exp();
-        Shape<2> eshape = ShapeCheck< ExpInfo<E>::kDim, E >::Check( exp.self() ).FlatTo2D();
-
-        utils::Assert( eshape[0] == dst.shape[0], "reduction dimension do not match" );
-        utils::Assert( eshape[1] != 0, "can not reduce over empty tensor" );
-        cuda::MapReduceKeepLowest<Saver,Reducer>( dst, MakePlan( exp.self() ), scale, eshape );
-    }
-
-    template<typename Saver, typename Reducer, int dimkeep, typename E, int etype>
-    inline void MapReduceKeepHighDim( Tensor<gpu,1> dst, const expr::Exp<E,etype> &exp, real_t scale ){
-        using namespace expr;
-        TypeCheckPass< TypeCheck<gpu,dimkeep,E>::kRedPass >::Error_TypeCheck_Not_Pass_For_Reduce_Exp();
-        typedef Shape< ExpInfo<E>::kDim > EShape;
-        EShape eshape = ShapeCheck< ExpInfo<E>::kDim, E >::Check( exp.self() );
-        utils::Assert( eshape[dimkeep] == dst.shape[0], "reduction dimension do not match" );
-        // use equvalent form
-        Shape<4> pshape = Shape4( eshape.ProdShape(dimkeep+1,EShape::kMaxShape), eshape[dimkeep],
-                                  eshape.ProdShape(1,dimkeep), eshape[0] );
-        // call equavalent map red dim 2
-        cuda::MapReduceKeepDim2<Saver,Reducer>( dst, MakePlan( exp.self() ), scale, pshape );
-    }
-
-    inline void Softmax( Tensor<gpu,2> dst, const Tensor<gpu,2>& src ){
-        cuda::Softmax( dst, src );
-    }
-}; // namespace mshadow
-
-#endif // __CUDACC__
-
-#endif // MSHADOW_USE_CUDA
-#endif // TENSOR_GPU_INL_HPP
diff --git a/include/mshadow/tensor_io.h b/include/mshadow/tensor_io.h
deleted file mode 100644
index 2ce28b3..0000000
--- a/include/mshadow/tensor_io.h
+++ /dev/null
@@ -1,137 +0,0 @@
-#ifndef MSHADOW_TENSOR_IO_H
-#define MSHADOW_TENSOR_IO_H
-/*!
- * \file tensor_io.h
- * \brief definitions of I/O functions for mshadow tensor
- * \author Tianqi Chen
- */
-#include <cstdio>
-#include "tensor.h"
-
-namespace mshadow{
-    namespace utils{
-        /*! 
-         * \brief interface of stream I/O, used to serialize data, 
-         *   it is not restricted to only this interface in SaveBinary/LoadBinary
-         *   mshadow accept all class that implements Read and Write
-         */
-        class IStream{
-        public:
-            /*! 
-             * \brief read data from stream
-             * \param ptr pointer to memory buffer
-             * \param size size of block
-             * \return usually is the size of data readed
-             */
-            virtual size_t Read( void *ptr, size_t size ) = 0;        
-            /*! 
-             * \brief write data to stream
-             * \param ptr pointer to memory buffer
-             * \param size size of block
-             */
-            virtual void Write( const void *ptr, size_t size ) = 0;
-            /*! \brief virtual destructor */
-            virtual ~IStream( void ){}
-        };
-    };
-    
-    /*! 
-     * \brief CPU/GPU: save a tensor by binary format, for GPU version, a temp Tensor<cpu,dim> storage will be allocated
-     * \param fo output binary stream
-     * \param src source data file
-     * \tparam dim dimension of tensor
-     * \tparam TStream type of stream, need to support Read, Write, one example is utils::IStream.
-     */
-    template<int dim,typename TStream>
-    inline void SaveBinary( TStream &fo, const Tensor<cpu,dim> &src );
-    /*! \brief refer to comment of cpu ver \sa SaveBinary */
-    template<int dim,typename TStream>
-    inline void SaveBinary( TStream &fo, const Tensor<gpu,dim> &src );
-
-    /*! 
-     * \brief CPU/GPU: load a tensor by binary format, for GPU version, a temp Tensor<cpu,dim> storage will be allocated
-     *       if pre_alloc is true , then space in dst is preallocated, and must have same shape of the tensor loaded
-     *       if pre_alloc is false, then dst originally does not have space allocated, LoadBinary will allocate space for dst
-     * \param fi output binary stream
-     * \param dst destination file
-     * \param pre_alloc whether space is pre-allocated, if false, space allocation will happen
-     * \tparam dim dimension of tensor     
-     * \tparam TStream type of stream, need to support Read, Write, one example is utils::IStream.
-     */
-    template<int dim,typename TStream>
-    inline void LoadBinary( TStream &fi, Tensor<cpu,dim> &dst, bool pre_alloc );
-    /*! \brief refer to comment of cpu ver \sa LoadBinary */
-    template<int dim,typename TStream>
-    inline void LoadBinary( TStream &fi, Tensor<gpu,dim> &dst, bool pre_alloc );
-    
-    namespace utils{
-        /*! \brief implementation of file i/o stream */
-        class FileStream: public IStream{
-        public:
-            /*! \brief constructor */
-            FileStream( FILE *fp ):fp_(fp){}
-            virtual size_t Read( void *ptr, size_t size ){
-                return fread( ptr, size, 1, fp_ );
-            }
-            virtual void Write( const void *ptr, size_t size ){
-                fwrite( ptr, size, 1, fp_ );
-            }
-            /*! \brief close file */
-            inline void Close( void ){
-                fclose( fp_ );
-            }
-        private:
-            FILE *fp_;
-        };
-    };
-};
-
-namespace mshadow{
-    // implementations
-    template<int dim, typename TStream>
-    inline void SaveBinary( TStream &fo, const Tensor<cpu,dim> &src_ ){
-        fo.Write( src_.shape.shape_, sizeof(index_t) * dim );
-        Tensor<cpu,2> src = src_.FlatTo2D();
-        for( index_t i = 0; i < src.shape[1]; ++ i ){
-            fo.Write( src[i].dptr, sizeof(real_t)*src.shape[0] );
-        }
-    }
-    template<int dim, typename TStream>
-    inline void SaveBinary( TStream &fo, const Tensor<gpu,dim> &src ){
-        // copy to CPU, then save
-        Tensor<cpu,dim> tmp( src.shape ); 
-        AllocSpace( tmp );
-        Copy( tmp, src );
-        SaveBinary( fo, tmp );
-        FreeSpace( tmp );
-    }
-
-    template<int dim, typename TStream>
-    inline void LoadBinary( TStream &fi, Tensor<cpu,dim> &dst_, bool pre_alloc ){
-        Shape<dim> shape;
-        utils::Assert( fi.Read( shape.shape_, sizeof(index_t) * dim ) != 0, "mshadow::LoadBinary" );
-        if( pre_alloc ){
-            utils::Assert( shape == dst_.shape );
-        }else{
-            dst_.shape = shape; AllocSpace( dst_ );
-        }
-        Tensor<cpu,2> dst = dst_.FlatTo2D();
-        if( dst.shape[0] == 0 ) return;        
-        for( index_t i = 0; i < dst.shape[1]; ++ i ){
-            utils::Assert( fi.Read( dst[i].dptr, sizeof(real_t)*dst.shape[0] ) != 0, "mshadow::LoadBinary" );
-        }
-    } 
-    template<int dim, typename TStream>
-    inline void LoadBinary( TStream &fi, Tensor<gpu,dim> &dst, bool pre_alloc ){
-        Tensor<cpu,dim> tmp;
-        LoadBinary( fi, tmp, false );
-        if( pre_alloc ){
-            utils::Assert( tmp.shape == dst.shape );
-        }else{
-            dst.shape = tmp.shape; AllocSpace( dst );
-        }
-        Copy( dst, tmp );
-        FreeSpace( tmp );
-    }
-};
-#endif // TENSOR_IO_H
diff --git a/include/mshadow/tensor_random.h b/include/mshadow/tensor_random.h
deleted file mode 100644
index ae2836a..0000000
--- a/include/mshadow/tensor_random.h
+++ /dev/null
@@ -1,369 +0,0 @@
-#ifndef MSHADOW_TENSOR_RANDOM_H
-#define MSHADOW_TENSOR_RANDOM_H
-/*!
- *  \file tensor_random.h
- *  \brief Random inline functions for tensor.
- *  \author Bing Xu, Tianqi Chen
- *   Based on curand|MKL|stdlib
- */
-#include <cstdlib>
-#include <random>
-#include <chrono>
-#include "tensor.h"
-#include "tensor_container.h"
-
-namespace mshadow {
-    /*!
-     * \brief random number generator
-     * \tparam Device the device of random number generator
-     *
-     * Note: replaced rand (srand) with c++11's random functions.
-     */
-    template<typename Device>
-    class Random {};
-
-    /*! \brief CPU random number generator */
-    template<>
-    class Random<cpu> {
-    public:
-        /*!
-         * \brief constructor of random engine using default seed
-         */
-        Random<cpu> (){
-          // obtain a seed from the system clock:
-          unsigned s= std::chrono::system_clock::now().time_since_epoch().count();
-          Seed(s);
-        }
-        /*!
-         * \brief constructor of random engine
-         * \param seed random number seed
-         */
-        Random<cpu>( int seed ){
-            #if MSHADOW_USE_MKL
-            int status = vslNewStream(&vStream_, VSL_BRNG_MT19937, seed);
-            utils::Assert( status == VSL_STATUS_OK, "MKL VSL Random engine failed to be initialized.\n" );
-            #else
-            //srand(seed);
-            gen_.seed(seed);
-            #endif
-            buffer_.Resize( Shape1( kRandBufferSize ) );
-        }
-        ~Random<cpu>() {
-            #if MSHADOW_USE_MKL
-            vslDeleteStream(&vStream_);
-            #endif
-        }
-        /*!
-         * \brief seed random number generator using this seed
-         * \param seed seed of prng
-         */
-        inline void Seed( int seed ){
-            #if MSHADOW_USE_MKL
-            int status = vslDeleteStream(&vStream_);
-            utils::Assert(status == VSL_STATUS_OK);
-            status = vslNewStream(&vStream_, VSL_BRNG_MT19937, seed);
-            utils::Assert(status == VSL_STATUS_OK);
-            #else
-            // srand( seed );
-            gen_.seed(seed);
-            #endif
-        }
-        template<int dim>
-        inline void SampleBinary(Tensor<cpu, dim> &src) {
-          SampleBinary(src, src);
-        }
-
-        /*!
-         * \brief generate binary data according to a probability matrix
-         * \param src source
-         * \param dst destination
-         * \param a lower bound of uniform
-         * \param b upper bound of uniform
-         * \tparam dim dimension of tensor
-         */
-        template<int dim>
-        inline void SampleBinary(Tensor<cpu, dim> &dst, Tensor<cpu, dim> &src) {
-            real_t a=0.0f;
-            real_t b=1.0f;
-            Tensor<cpu, 2> dmat = dst.FlatTo2D();
-            Tensor<cpu, 2> smat = src.FlatTo2D();
-            std::uniform_real_distribution<real_t> distribution (a,b);
-            for ( index_t i = 0; i < dmat.shape[1]; ++i ) {
-                #if MSHADOW_USE_MKL
-                #if MSHADOW_SINGLE_PRECISION
-                int status = vsRngUniform( 0, vStream_, mat.shape[0], mat[i].dptr, a, b );
-                #else
-                int status = vdRngUniform( 0, vStream_, mat.shape[0], mat[i].dptr, a, b );
-                #endif
-                utils::Assert(status == VSL_STATUS_OK, "Failed to generate random number by MKL.\n" );
-                #else
-                // use stdlib
-                /*
-                for ( index_t j = 0; j < mat.shape[0]; ++j ) {
-                    mat[i][j] = this->RandNext()*(b-a) + a;
-                }
-                */
-                for ( index_t j = 0; j < dmat.shape[0]; ++j ) {
-                    dmat[i][j] = distribution(gen_) > smat[i][j] ? 0.0f: 1.0f;
-                }
-                #endif
-            }
-        }
-        /*!
-         * \brief generate data from uniform [a,b)
-         * \param dst destination
-         * \param a lower bound of uniform
-         * \param b upper bound of uniform
-         * \tparam dim dimension of tensor
-         */
-        template<int dim>
-        inline void SampleUniform( Tensor<cpu, dim> &dst, real_t a=0.0f, real_t b=1.0f ) {
-            Tensor<cpu, 2> mat = dst.FlatTo2D();
-            std::uniform_real_distribution<real_t> distribution (a,b);
-            for ( index_t i = 0; i < mat.shape[1]; ++i ) {
-                #if MSHADOW_USE_MKL
-                #if MSHADOW_SINGLE_PRECISION
-                int status = vsRngUniform( 0, vStream_, mat.shape[0], mat[i].dptr, a, b );
-                #else
-                int status = vdRngUniform( 0, vStream_, mat.shape[0], mat[i].dptr, a, b );
-                #endif
-                utils::Assert(status == VSL_STATUS_OK, "Failed to generate random number by MKL.\n" );
-                #else
-                // use stdlib
-                /*
-                for ( index_t j = 0; j < mat.shape[0]; ++j ) {
-                    mat[i][j] = this->RandNext()*(b-a) + a;
-                }
-                */
-                for ( index_t j = 0; j < mat.shape[0]; ++j ) {
-                    mat[i][j] = distribution(gen_);
-                }
-                #endif
-            }
-        }
-        /*!
-         * \brief generate data from standard gaussian
-         * \param dst destination
-         * \param mu mean variable
-         * \param sigma standard deviation
-         * \tparam dim dimension of tensor
-         */
-        template<int dim>
-        inline void SampleGaussian( Tensor<cpu, dim> &dst, real_t mu = 0.0f, real_t sigma = 1.0f ) {
-            if( sigma <= 0.0f ) {
-                dst = mu; return;
-            }
-            Tensor<cpu, 2> mat = dst.FlatTo2D();
-            std::normal_distribution<real_t> distribution (mu, sigma);
-            for (index_t i = 0; i < mat.shape[1]; ++i) {
-                #if MSHADOW_USE_MKL
-                #if MSHADOW_SINGLE_PRECISION
-                int status = vsRngGaussian( 0, vStream_, mat.shape[0], mat[i].dptr, mu, sigma );
-                #else
-                int status = vdRngGaussian( 0, vStream_, mat.shape[0], mat[i].dptr, mu, sigma );
-                #endif
-                utils::Assert(status == VSL_STATUS_OK, "Failed to generate random number by MKL.\n" );
-                #else
-                /*
-                real_t g1 = 0.0f, g2 = 0.0f;
-                for (index_t j = 0; j < mat.shape[0]; ++j) {
-                    if( (j & 1) == 0 ){
-                        this->SampleNormal2D( g1, g2 );
-                        mat[i][j] = mu + g1 * sigma;
-                    }else{
-                        mat[i][j] = mu + g2 * sigma;
-                    }
-                }
-                */
-                for (index_t j = 0; j < mat.shape[0]; ++j) {
-                  mat[i][j] = distribution(gen_);
-                }
-                #endif
-            }
-        }
-        /*!
-         * \brief return a temporal expression storing standard gaussian random variables
-         *        the temporal tensor is only valid before next call of gaussian or uniform
-         *        can be used as part of expression
-         *  Caution: this means expression such as A = gaussian(s1) * gaussian(s2) will give invalid result,
-         *           since second call of gaussian(s2) makes gaussian(s1) invalid
-         *           A = gaussian(s1)*B+C; is correct; use one gaussian/uniform in each expression
-         * \param shape shape of the tensor
-         * \tparam dim dimension of tensor
-         */
-        template<int dim>
-        inline expr::ReshapeExp<Tensor<cpu,1>,dim,1> gaussian( Shape<dim> shape ){
-            buffer_.Resize( Shape1( shape.Size() ) );
-            this->SampleGaussian( buffer_, 0.0f, 1.0f );
-            return expr::reshape( buffer_, shape );
-        }
-        /*!
-         * \brief return a temporal expression storing standard uniform [0,1)
-         *        the temporal tensor is only valid before next call of gaussian or uniform
-         *        can be used as part of expression
-         *  Caution: this means expression such as A = gaussian(s1) * gaussian(s2) will give invalid result,
-         *           since second call of gaussian(s2) makes gaussian(s1) invalid
-         *           A = gaussian(s1)*B+C; is correct; use one gaussian/uniform in each expression
-         * \param shape shape of the tensor
-         * \tparam dim dimension of tensor
-         */
-        template<int dim>
-        inline expr::ReshapeExp<Tensor<cpu,1>,dim,1> uniform( Shape<dim> shape ){
-            buffer_.Resize( Shape1( shape.Size() ) );
-            this->SampleUniform( buffer_, 0.0f, 1.0f );
-            return expr::reshape( buffer_, shape );
-        }
-    private:
-        /*! \brief get next random number from rand */
-        inline real_t RandNext( void ){
-            return static_cast<real_t>(rand()) / (static_cast<real_t>(RAND_MAX)+1.0f);
-        }
-        /*! \brief return a real numer uniform in (0,1) */
-        inline real_t RandNext2( void ){
-            return (static_cast<real_t>( rand() ) + 1.0 ) / (static_cast<real_t>(RAND_MAX) + 2.0);
-        }
-        /*!
-         * \brief sample iid xx,yy ~N(0,1)
-         * \param xx first  gaussian output
-         * \param yy second gaussian output
-         */
-        inline void SampleNormal2D( real_t &xx, real_t &yy ){
-            real_t x,y,s;
-            do{
-                x = 2.0f * RandNext2() - 1.0f;
-                y = 2.0f * RandNext2() - 1.0f;
-                s = x*x + y*y;
-            }while( s >= 1.0f || s == 0.0f );
-            real_t t = std::sqrt( -2.0f * std::log( s ) / s ) ;
-            xx = x * t; yy = y * t;
-        }
-    private:
-        #if MSHADOW_USE_MKL
-        /*! \brief stream used by MKL VSL */
-        VSLStreamStatePtr vStream_;
-        #endif
-        /*! \brief temporal space used to store random numbers */
-        TensorContainer<cpu,1> buffer_;
-
-        /*! \brief c++11 random generator, added for SINGA use */
-        std::mt19937 gen_;
-    }; // class Random<cpu>
-
-#if MSHADOW_USE_CUDA
-// __CUDACC__
-    /*! \brief GPU random number generator */
-    template<>
-    class Random<gpu> {
-    public:
-        /*!
-         * \brief constructor of random engine
-         * \param seed random number seed
-         */
-        Random<gpu>(int seed) {
-            curandStatus_t status;
-            status = curandCreateGenerator(&gen_, CURAND_RNG_PSEUDO_DEFAULT);
-            utils::Assert(status == CURAND_STATUS_SUCCESS, "Can not create CURAND Generator");
-            this->Seed( seed );
-            buffer_.Resize( Shape1(kRandBufferSize) );
-        }
-
-        ~Random<gpu>() {
-            curandStatus_t status;
-            status = curandDestroyGenerator(gen_);
-            utils::Assert(status == CURAND_STATUS_SUCCESS, "Destory CURAND Gen failed");
-        }
-        /*!
-         * \brief seed random number generator using this seed
-         * \param seed seed of prng
-         */
-        inline void Seed( int seed ){
-            curandStatus_t status;
-            status = curandSetPseudoRandomGeneratorSeed(gen_, seed);
-            utils::Assert(status == CURAND_STATUS_SUCCESS, "Set CURAND seed failed.");
-        }
-        /*!
-         * \brief generate data from uniform [a,b)
-         * \param dst destination
-         * \param a lower bound of uniform
-         * \param b upper bound of uniform
-         * \tparam dim dimension of tensor
-         */
-        template<int dim>
-        inline void SampleUniform(Tensor<gpu, dim> &dst, real_t a=0.0f, real_t b=1.0f) {
-            if( a == 0.0f && b == 1.0f ){
-                dst = this->uniform( dst.shape );
-            }else{
-                dst = this->uniform( dst.shape ) *(b-a) + a;
-            }
-        }
-        /*!
-         * \brief generate data from standard gaussian
-         * \param dst destination
-         * \param mu mean variable
-         * \param sigma standard deviation
-         * \tparam dim dimension of tensor
-         */
-        template<int dim>
-        inline void SampleGaussian(Tensor<gpu, dim> &dst, real_t mu = 0.0f, real_t sigma = 1.0f) {
-            dst = this->gaussian( dst.shape, mu, sigma );
-        }
-        /*!
-         * \brief return a temporal expression storing standard gaussian random variables
-         *        the temporal tensor is only valid before next call of gaussian or uniform
-         *        can be used as part of expression
-         *  Caution: this means expression such as A = gaussian(s1) * gaussian(s2) will give invalid result,
-         *           since second call of gaussian(s2) makes gaussian(s1) invalid
-         *           A = gaussian(s1)*B+C; is correct; use one gaussian/uniform in each expression
-         * \param shape shape of the tensor
-         * \param mu mean
-         * \param sigma variance
-         * \tparam dim dimension of tensor
-         */
-        template<int dim>
-        inline expr::ReshapeExp<Tensor<gpu,1>,dim,1> gaussian( Shape<dim> shape, real_t mu=0.0f, real_t sigma=1.0f){
-            size_t aligned_sz = ((shape.Size() + 1UL)>>1)<<1;
-            // allocate alligned size
-            buffer_.Resize( Shape1( aligned_sz ) );
-            buffer_.Resize( Shape1( shape.Size() ) );
-            curandStatus_t status;
-            #if MSHADOW_SINGLE_PRECISION
-            status = curandGenerateNormal(gen_, buffer_.dptr, aligned_sz , mu, sigma);
-            #else
-            status = curandGenerateNormalDouble(gen_, buffer_.dptr, buffer_.shape[0], mu, sigma);
-            #endif
-            utils::Assert(status == CURAND_STATUS_SUCCESS, "CURAND Gen Uniform failed\n");
-            return expr::reshape( buffer_, shape );
-        }
-        /*!
-         * \brief return a temporal expression storing standard uniform [0,1)
-         *        the temporal tensor is only valid before next call of gaussian or uniform
-         *        can be used as part of expression
-         *  Caution: this means expression such as A = gaussian(s1) * gaussian(s2) will give invalid result,
-         *           since second call of gaussian(s2) makes gaussian(s1) invalid
-         *           A = gaussian(s1)*B+C; is correct; use one gaussian/uniform in each expression
-         * \param shape shape of the tensor
-         * \tparam dim dimension of tensor
-         */
-        template<int dim>
-        inline expr::ReshapeExp<Tensor<gpu,1>,dim,1> uniform(Shape<dim> shape) {
-            buffer_.Resize( Shape1( shape.Size() ) );
-            curandStatus_t status;
-            #if MSHADOW_SINGLE_PRECISION
-            status = curandGenerateUniform(gen_, buffer_.dptr, buffer_.shape[0] );
-            #else
-            status = curandGenerateUniformDouble(gen_, buffer_.dptr, buffer_.shape[0] );
-            #endif
-            utils::Assert(status == CURAND_STATUS_SUCCESS, "CURAND Gen Uniform failed\n");
-            return expr::reshape( buffer_, shape );
-        }
-    private:
-        /*! \brief random numbeer generator */
-        curandGenerator_t gen_;
-        /*! \brief templ buffer */
-        TensorContainer<gpu, 1> buffer_;
-    }; // class Random<gpu>
-    #endif
-
-}; // namespace mshadow
-
-#endif // MSHADOW_TENSOR_RANDOM_H
diff --git a/include/mshadow/tensor_sse-inl.hpp b/include/mshadow/tensor_sse-inl.hpp
deleted file mode 100644
index b98383e..0000000
--- a/include/mshadow/tensor_sse-inl.hpp
+++ /dev/null
@@ -1,431 +0,0 @@
-#ifndef MSHADOW_TENSOR_SSE_INL_HPP
-#define MSHADOW_TENSOR_SSE_INL_HPP
-/*!
- * \file tensor_sse-inl.hpp
- * \brief support of sse2 optimization of some operations
- * \author Tianqi Chen
- */
-#ifdef __APPLE__
-#include <stdlib.h>
-#else
-#include <malloc.h>
-#endif
-
-#include "tensor_expr.h"
-#include "tensor.h"
-
-namespace mshadow {
-    /*! \brief namespace to support sse2 vectorization */
-    namespace sse2{
-        /*! 
-         * \brief analog to cudaMallocPitch, allocate a aligned space with num_line * lspace cells
-         * \param pitch output parameter, the actuall space allocated for each line
-         * \param lspace number of cells required for each line
-         * \param num_line number of lines to be allocated
-         */
-        inline void* AlignedMallocPitch( size_t &pitch, size_t lspace, size_t num_line ){
-            pitch = ((lspace+15) >> 4) << 4;
-            #ifdef _MSC_VER
-            void * res = _aligned_malloc( pitch*num_line, 16 ); 
-            #else
-            #ifdef __APPLE__
-            void *res = malloc( pitch * num_line );
-            #else
-            void * res = memalign( 16, pitch*num_line ); 
-            #endif
-            #endif
-            utils::Assert( res != NULL, "AlignedMallocPitch failed" );
-            return res;
-        }
-        /*! 
-         * \brief free aligned space 
-         * \param ptr pointer to space to be freed
-         */
-        inline void AlignedFree( void *ptr ){
-            #ifdef _MSC_VER
-            _aligned_free( ptr );
-            #else
-            free( ptr );
-            #endif
-        }
-        /*! \brief check if a pointer is aligned */
-        inline bool CheckAlign( size_t pitch ){
-            return !(pitch & ((1<<4)-1));
-        }
-        /*! \brief check if a pointer is aligned */
-        inline bool CheckAlign( void *ptr ){
-            return CheckAlign( (size_t)ptr );
-        }
-        /*! 
-         * \brief get upper bound of aligned index of size 
-         * \param size size of the array
-         * \param fsize size of float
-         */
-        inline index_t UpperAlign( index_t size, size_t fsize ){
-            return (( (size*fsize+15) >> 4 ) << 4) / fsize;
-        }
-        /*! 
-         * \brief get lower bound of aligned index of size 
-         * \param size size of the array
-         * \param fsize size of float
-         */
-        inline index_t LowerAlign( index_t size, size_t fsize ){
-            return (( (size*fsize) >> 4 ) << 4) / fsize;
-        }
-    }; // namespace sse2
-}; // namespace  mshadow
-
-#if MSHADOW_USE_SSE
-// sse types are not compatible with nvcc, only use them in cpu mode
-#include <emmintrin.h>
-
-namespace mshadow{
-    namespace sse2{
-        /*! 
-         * \brief float vector real type, used for vectorization 
-         * \tparam FloatType double or float
-         */
-        template<typename FloatType> struct FVec{};
-        
-        /*! \brief vector real type for float */
-        template<> 
-        struct FVec<float> {
-        public:
-            typedef __m128 DType;
-            /*! \brief number of float in vector */
-            const static index_t kSize = 4;
-            /*! \brief data content */
-            DType data_;
-        public:
-            /* constructors */
-            FVec( void ){}
-            FVec( DType data ):data_(data){}
-            /* set the float */
-            FVec( const float &s ){
-                data_ = _mm_set1_ps( s );
-            }
-            /*!\brief load from pointer src */
-            FVec( const float *src ){
-                data_ = _mm_load_ps( src );                
-            } 
-        public:
-            /*! \brief store data into dst space */
-            inline void Store( float *dst ) const{
-                return _mm_store_ps( dst, data_ );
-            }
-            /*! \brief sum of all content */
-            inline float Sum( void ) const{
-                DType ans  = _mm_add_ps( data_, _mm_movehl_ps( data_, data_ ) );
-                DType rst  = _mm_add_ss( ans, _mm_shuffle_ps( ans, ans, 1 ) );
-                #if defined(_MSC_VER) && ( _MSC_VER <= 1500 ) && defined(_WIN64)
-                return rst.m128_f32[ 0 ];
-                #else
-                float rr = _mm_cvtss_f32( rst ) ;
-                return rr;
-                #endif
-            }
-        };
-
-        /*! \brief vector real type for float */
-        template<> 
-        struct FVec<double> {
-        public:
-            typedef __m128d DType;
-            /*! \brief number of float in vector */
-            const static index_t kSize = 2;
-            /*! \brief data content */
-            DType data_;
-        public:
-            /* constructors */
-            FVec( void ){}
-            FVec( DType data ):data_(data){}
-            /* set the float */
-            FVec( const double &s ){
-                data_ = _mm_set1_pd( s );
-            }
-            /*!\brief load from pointer src */
-            FVec( const double *src ){
-                data_ = _mm_load_pd( src );                
-            } 
-        public:
-            /*! \brief store data into dst space */
-            inline void Store( double *dst ) const{
-                return _mm_store_pd( dst, data_ );
-            }
-            /*! \brief sum of all content */
-            inline double Sum( void ) const{
-                DType tmp =  _mm_add_sd( data_, _mm_unpackhi_pd( data_,data_ ) ) ;
-                #if defined(_MSC_VER) && ( _MSC_VER <= 1500 ) && defined(_WIN64)
-                return tmp.m128d_f64[0];
-                #else
-                double ans = _mm_cvtsd_f64( tmp );
-                return ans;
-                #endif
-            }
-        };
-    };
-
-    namespace sse2{
-        /*! \brief sse2 operator type of certain operator */
-        template<typename OP>
-        struct SSEOp{
-            const static bool kEnabled = false;
-        };        
-        template<>
-        struct SSEOp<op::plus>{
-            const static bool kEnabled = true;
-            MSHADOW_CINLINE static FVec<float> Map( const FVec<float> &lhs, const FVec<float> &rhs ){
-                return FVec<float>( _mm_add_ps( lhs.data_, rhs.data_ ) );
-            }
-            MSHADOW_CINLINE static FVec<double> Map( const FVec<double> &lhs, const FVec<double> &rhs ){
-                return FVec<double>( _mm_add_pd( lhs.data_, rhs.data_ ) );
-            }
-        };
-        template<>
-        struct SSEOp<op::minus>{
-            const static bool kEnabled = true;
-            MSHADOW_CINLINE static FVec<float> Map( const FVec<float> &lhs, const FVec<float> &rhs ){
-                return FVec<float>( _mm_sub_ps( lhs.data_, rhs.data_ ) );
-            }
-            MSHADOW_CINLINE static FVec<double> Map( const FVec<double> &lhs, const FVec<double> &rhs ){
-                return FVec<double>( _mm_sub_pd( lhs.data_, rhs.data_ ) );
-            }
-        };
-        template<>
-        struct SSEOp<op::mul>{
-            const static bool kEnabled = true;
-            MSHADOW_CINLINE static FVec<float> Map( const FVec<float> &lhs, const FVec<float> &rhs ){
-                return FVec<float>( _mm_mul_ps( lhs.data_, rhs.data_ ) );
-            }
-            MSHADOW_CINLINE static FVec<double> Map( const FVec<double> &lhs, const FVec<double> &rhs ){
-                return FVec<double>( _mm_mul_pd( lhs.data_, rhs.data_ ) );
-            }
-        };
-        template<>
-        struct SSEOp<op::div>{
-            const static bool kEnabled = true;
-            MSHADOW_CINLINE static FVec<float> Map( const FVec<float> &lhs, const FVec<float> &rhs ){
-                return FVec<float>( _mm_div_ps( lhs.data_, rhs.data_ ) );
-            }
-            MSHADOW_CINLINE static FVec<double> Map( const FVec<double> &lhs, const FVec<double> &rhs ){
-                return FVec<double>( _mm_div_pd( lhs.data_, rhs.data_ ) );
-            }
-        };
-
-        template<>
-        struct SSEOp<op::identity>{
-            const static bool kEnabled = true;
-            MSHADOW_CINLINE static FVec<float> Map( const FVec<float> &src ){
-                return src;
-            }
-            MSHADOW_CINLINE static FVec<double> Map( const FVec<double> &src ){
-                return src;
-            }
-        };
-    }; // namespace sse2
-    
-    namespace sse2{
-        // savers to do storage
-        template<typename SV, typename TFloat>
-        struct Saver{
-            MSHADOW_CINLINE static void Save( TFloat *dst, const FVec<TFloat> &src ){
-                FVec<TFloat> lhs( dst );
-                FVec<TFloat> ans = SSEOp<typename SV::OPType>::Map( lhs, src );
-                ans.Store( dst );
-            }
-        };
-        template<typename TFloat>
-        struct Saver<sv::saveto,TFloat>{
-            MSHADOW_CINLINE static void Save( TFloat *dst, const FVec<TFloat> &src ){
-                src.Store( dst );
-            }
-        };        
-    }; // namespace sse2
-}; // namespace mshadow
-
-namespace mshadow{
-    namespace expr{
-        // same as plan, but use sse2
-        template<typename ExpType>
-        class SSEPlan {
-        public:
-            /*!
-             * \brief evaluate the expression at index [y][x], x will be aligned to 4
-             *        to be implemented by SubType
-             */
-            MSHADOW_CINLINE sse2::FVec<real_t> EvalSSE( index_t y, index_t x ) const;
-            MSHADOW_CINLINE real_t Eval( index_t y, index_t x ) const;
-        };
-
-        template <typename Device, int dim>
-        class SSEPlan< Tensor<Device,dim> >{
-        public:
-            SSEPlan( const Tensor<Device,dim> &t )
-                :dptr_(t.dptr),stride_(t.shape.stride_){}
-            MSHADOW_CINLINE sse2::FVec<real_t> EvalSSE( index_t y, index_t x ) const{
-                return sse2::FVec<real_t>( &dptr_[ y*stride_+x ] );
-            }
-            MSHADOW_CINLINE real_t Eval( index_t y, index_t x ) const{
-                return dptr_[ y * stride_ + x ];
-            }
-        private:
-            const real_t  *dptr_;
-            index_t stride_;
-        };
-
-        template<>
-        class SSEPlan<ScalarExp>{
-        public:
-            SSEPlan( real_t scalar ):scalar_(scalar){}
-            MSHADOW_CINLINE sse2::FVec<real_t> EvalSSE( index_t y, index_t x ) const{
-                return sse2::FVec<real_t>( scalar_ );
-            }
-            MSHADOW_CINLINE real_t Eval( index_t y, index_t x ) const{
-                return scalar_;
-            }
-        private:
-            real_t scalar_;
-        };
-
-        template<typename OP, typename TA, typename TB,int etype>
-        class SSEPlan< BinaryMapExp<OP,TA,TB,etype> >{
-        public:
-            SSEPlan( const SSEPlan<TA> &lhs, const SSEPlan<TB> &rhs )
-                :lhs_(lhs), rhs_(rhs){}
-            MSHADOW_CINLINE sse2::FVec<real_t> EvalSSE( index_t y, index_t x ) const{
-                return sse2::SSEOp<OP>::Map( lhs_.EvalSSE( y, x ), rhs_.EvalSSE( y, x ) );
-            }
-            MSHADOW_CINLINE real_t Eval( index_t y, index_t x ) const{
-                return OP::Map( lhs_.Eval( y, x ), rhs_.Eval( y, x ) );
-            }
-        private:
-            SSEPlan<TA> lhs_;
-            SSEPlan<TB> rhs_;
-        };
-
-        template<typename OP, typename TA, int etype>
-        class SSEPlan< UnaryMapExp<OP,TA,etype> >{
-        public:
-            SSEPlan( const SSEPlan<TA> &src ):src_(src){}
-            MSHADOW_CINLINE sse2::FVec<real_t> EvalSSE( index_t y, index_t x ) const{
-                return sse2::SSEOp<OP>::Map( src_.EvalSSE( y, x ) );
-            }
-            MSHADOW_CINLINE real_t Eval( index_t y, index_t x ) const{
-                return OP::Map( src_.Eval( y, x ) );
-            }
-        private:
-            SSEPlan<TA> src_;
-        };
-
-        template<typename OP, typename TA, typename TB, int etype>
-        inline SSEPlan< BinaryMapExp<OP,TA,TB,etype> > MakeSSEPlan( const BinaryMapExp<OP,TA,TB,etype> &e );
-
-        inline SSEPlan<ScalarExp> MakeSSEPlan( const ScalarExp &e ){
-            return SSEPlan<ScalarExp>( e.scalar_ );
-        }
-
-        template<typename T>
-        inline SSEPlan<T> MakeSSEPlan( const ContainerExp<T> &e ){
-            return SSEPlan<T>( e.self() );
-        }
-
-        template<typename T,int dim>
-        inline SSEPlan<T> MakeSSEPlan( const MakeTensorExp<T,cpu,dim> &e ){
-            return SSEPlan<T>( e.real_self() );
-        }
-
-        template<typename OP, typename TA, int etype>
-        inline SSEPlan< UnaryMapExp<OP,TA,etype> > MakeSSEPlan( const UnaryMapExp<OP,TA,etype> &e ){
-            return SSEPlan< UnaryMapExp<OP,TA,etype> >( MakeSSEPlan(e.src_) );
-        }
-
-        template<typename OP, typename TA, typename TB, int etype>
-        inline SSEPlan< BinaryMapExp<OP,TA,TB,etype> > MakeSSEPlan( const BinaryMapExp<OP,TA,TB,etype> &e ){
-                return SSEPlan< BinaryMapExp<OP,TA,TB,etype> >( MakeSSEPlan(e.lhs_), MakeSSEPlan(e.rhs_) );
-        }
-    };
-
-    namespace expr{
-        /*!
-         * \brief static check sse enable
-         *        if a expression E can not be evaluated using sse, then kPass = false
-         * \tparam Device the type of Device
-         * \tparam dim dimension of the tensor
-         * \tparam E expression
-         */
-        template<typename E>
-        struct SSECheck{
-            const static bool kPass = false;
-        };
-        template<>
-        struct SSECheck<ScalarExp>{
-            const static bool kPass = true;
-        };
-        template<int dim>
-        struct SSECheck<Tensor<cpu,dim> >{
-            const static bool kPass = true;
-        };
-        
-        template<typename OP, typename TA, int etype>
-        struct SSECheck<UnaryMapExp<OP,TA,etype> >{
-            const static bool kPass = SSECheck<TA>::kPass && sse2::SSEOp<OP>::kEnabled;
-        };
-        template<typename OP, typename TA, typename TB, int etype>
-        struct SSECheck< BinaryMapExp<OP,TA,TB,etype> >{
-            const static bool kPass = SSECheck<TA>::kPass && SSECheck<TB>::kPass && sse2::SSEOp<OP>::kEnabled;
-        }; 
-    }; // namespace expr
-    namespace expr{
-        // check if data is aligned and allow sse operation
-        template<int dim,typename E>
-        struct SSEAlignCheck{
-            inline static bool Check( const E &exp ){
-                return false;
-            }
-        };
-        template<int dim>
-        struct SSEAlignCheck< dim, ScalarExp >{
-            inline static bool Check( const ScalarExp &exp ){
-                return true;
-            }
-        };
-        template<int dim>
-        struct SSEAlignCheck< dim,Tensor<cpu,dim> >{
-            inline static bool Check( const Tensor<cpu,dim> &t ){
-                return sse2::CheckAlign( t.dptr ) && sse2::CheckAlign( t.shape.stride_ * sizeof( real_t ) );
-            }
-        };
-        template<int dim, typename OP, typename TA, int etype>
-        struct SSEAlignCheck< dim, UnaryMapExp<OP,TA,etype> >{
-            inline static bool Check( const UnaryMapExp<OP,TA,etype> &t ){
-                return SSEAlignCheck<dim,TA>::Check( t.src_);
-            }
-        };
-        template<int dim, typename OP, typename TA, typename TB, int etype>
-        struct SSEAlignCheck< dim, BinaryMapExp<OP,TA,TB,etype> >{ 
-            inline static bool Check( const BinaryMapExp<OP,TA,TB,etype> &t ){
-                return SSEAlignCheck<dim,TA>::Check( t.lhs_ ) && 
-                    SSEAlignCheck<dim,TB>::Check( t.rhs_ );
-            }
-        };
-    }; // namespace expr
-
-    /*! 
-     * \brief use SSEPlan to compute result
-     */
-    template<typename SV, typename E, int dim>
-    inline void MapSSEPlan(Tensor<cpu,dim> _dst, const expr::SSEPlan<E> &plan){        
-        Tensor<cpu,2> dst = _dst.FlatTo2D();
-        const index_t xlen = sse2::LowerAlign( dst.shape[0], sizeof(real_t) );
-        for ( index_t y = 0; y < dst.shape[1]; y ++ ) {
-            for( index_t x = 0; x < xlen; x += sse2::FVec<real_t>::kSize ){
-                sse2::Saver<SV,real_t>::Save( &dst[y][x], plan.EvalSSE( y,x ) );
-            }
-            for( index_t x = xlen; x < dst.shape[0]; x ++ ){
-                SV::Save( dst[y][x], plan.Eval(y,x) );
-            }
-        }
-    }
-}; // namespace mshadow
-#endif // MSHADOW_USE_SSE
-#endif // MSHADOW_TENSOR_SSE_INL_HPP
diff --git a/include/singa/comm/msg.h b/include/singa/comm/msg.h
deleted file mode 100644
index 8e03cd5..0000000
--- a/include/singa/comm/msg.h
+++ /dev/null
@@ -1,243 +0,0 @@
-/************************************************************
-*
-* Licensed to the Apache Software Foundation (ASF) under one
-* or more contributor license agreements.  See the NOTICE file
-* distributed with this work for additional information
-* regarding copyright ownership.  The ASF licenses this file
-* to you under the Apache License, Version 2.0 (the
-* "License"); you may not use this file except in compliance
-* with the License.  You may obtain a copy of the License at
-*
-*   http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an
-* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-* KIND, either express or implied.  See the License for the
-* specific language governing permissions and limitations
-* under the License.
-*
-*************************************************************/
-
-#ifndef SINGA_COMM_MSG_H_
-#define SINGA_COMM_MSG_H_
-
-#include <utility>
-
-// TODO(wangwei): make it a compiler argument
-// #define USE_ZMQ
-
-#include <vector>
-#ifdef USE_ZMQ
-#include <czmq.h>
-#endif
-
-namespace singa {
-/**
- * Wrapper to generate message address
- * @param grp worker/server group id
- * @param id_or_proc worker/server id or procs id
- * @param type msg type
- */
-inline int Addr(int grp, int id_or_proc, int type) {
-  return (grp << 16) | (id_or_proc << 8) | type;
-}
-
-/**
- * Parse group id from addr.
- *
- * @return group id
- */
-inline int AddrGrp(int addr) {
-  return addr >> 16;
-}
-
-/**
- * Parse worker/server id from addr.
- *
- * @return id
- */
-inline int AddrID(int addr) {
-  static const int mask = (1 << 8) - 1;
-  return (addr >> 8) & mask;
-}
-
-/**
- * Parse worker/server procs from addr.
- *
- * @return procs id
- */
-inline int AddrProc(int addr) {
-  return AddrID(addr);
-}
-
-/**
- * Parse msg type from addr
- * @return msg type
- */
-inline int AddrType(int addr) {
-  static const int mask = (1 << 8) -1;
-  return addr & mask;
-}
-
-/**
- * Msg used to transfer Param info (gradient or value), feature blob, etc.
- * between workers, stubs and servers.
- *
- * Each msg has a source addr and dest addr identified by a unique integer.
- * It is also associated with a target field (value and version) for ease of
- * getting some meta info (e.g., parameter id) from the msg.
- *
- * Other data is added into the message as frames.
- */
-class Msg {
- public:
-  ~Msg();
-  Msg();
-  /**
-   * Construct the msg providing source and destination addr.
-   */
-  Msg(int src, int dst);
-  /**
-   * Copy constructor.
-   */
-  Msg(const Msg& msg);
-  /**
-   * Swap the src/dst addr
-   */
-  void SwapAddr();
-  /**
-   * Add a frame (a chunck of bytes) into the message
-   */
-  void AddFrame(const void* addr, int nBytes);
-  /**
-   * @return num of bytes of the current frame.
-   */
-  int FrameSize();
-  /**
-   * @return the pointer to the current frame data.
-   */
-  void* FrameData();
-  /**
-   * @return the data of the current frame as c string
-   */
-  char* FrameStr();
-  /**
-   * Move the cursor to the first frame.
-   */
-  void FirstFrame();
-  /**
-   * Move the cursor to the last frame.
-   */
-  void LastFrame();
-  /**
-   * Move the cursor to the next frame
-   * @return true if the next frame is not NULL; otherwise false
-   */
-  bool NextFrame();
-  /**
-   *  Add a 'format' frame to the msg (like CZMQ's zsock_send).
-   *
-   *  The format is a string that defines the type of each field.
-   *  The format can contain any of these characters, each corresponding to
-   *  one or two arguments:
-   *  i = int (signed)
-   *  1 = uint8_t
-   *  2 = uint16_t
-   *  4 = uint32_t
-   *  8 = uint64_t
-   *  p = void * (sends the pointer value, only meaningful over inproc)
-   *  s = char**
-   *
-   *  Returns size of the added content.
-   */
-  int AddFormatFrame(const char *format, ...);
-  /**
-   *  Parse the current frame added using AddFormatFrame(const char*, ...).
-   *
-   *  The format is a string that defines the type of each field.
-   *  The format can contain any of these characters, each corresponding to
-   *  one or two arguments:
-   *  i = int (signed)
-   *  1 = uint8_t
-   *  2 = uint16_t
-   *  4 = uint32_t
-   *  8 = uint64_t
-   *  p = void * (sends the pointer value, only meaningful over inproc)
-   *  s = char**
-   *
-   *  Returns size of the parsed content.
-   */
-  int ParseFormatFrame(const char* format, ...);
-
-#ifdef USE_ZMQ
-  void ParseFromZmsg(zmsg_t* msg);
-  zmsg_t* DumpToZmsg();
-#endif
-
-  /**
-   * @return msg size in terms of bytes, ignore meta info.
-   */
-  int size() const;
-  /**
-   * Set source addr.
-   * @param addr unique identify one worker/server/stub in the current job
-   */
-  inline void set_src(int addr) { src_ = addr; }
-  /**
-   * @return source addr.
-   */
-  inline int src() const { return src_; }
-  /**
-   * Set destination addr.
-   * @param addr unique identify one worker/server/stub in the current job
-   */
-  inline void set_dst(int addr) { dst_ = addr; }
-  /**
-   * @return dst addr.
-   */
-  inline int dst() const { return dst_; }
-  /**
-   * Set msg type, e.g., kPut, kGet, kUpdate, kRequest
-   */
-  inline void set_type(int type) { type_ = type; }
-  /**
-   * @return msg type.
-   */
-  inline int type() const { return type_; }
-  /**
-   * Set msg target.
-   *
-   * One msg has a target to identify some entity in worker/server/stub.
-   * The target is associated with a version, e.g., Param version.
-   */
-  inline void set_trgt(int val, int version) {
-    trgt_val_ = val;
-    trgt_version_ = version;
-  }
-  inline int trgt_val() const { return trgt_val_; }
-  inline int trgt_version() const { return trgt_version_; }
-
- protected:
-  int src_ = 0;
-  int dst_ = 0;
-  int type_ = 0;
-  int trgt_val_ = 0;
-  int trgt_version_ = 0;
-#ifdef USE_ZMQ
-  zmsg_t* msg_ = nullptr;
-  zframe_t *frame_ = nullptr;
-#else
-  std::vector<std::pair<void*, int>> frames_;
-  unsigned idx_ = 0;
-#endif
-};
-
-inline void DeleteMsg(Msg** msg) {
-  delete *msg;
-  *msg = nullptr;
-}
-
-}  // namespace singa
-
-#endif  // SINGA_COMM_MSG_H_
diff --git a/include/singa/comm/socket.h b/include/singa/comm/socket.h
deleted file mode 100644
index 40d4cc3..0000000
--- a/include/singa/comm/socket.h
+++ /dev/null
@@ -1,123 +0,0 @@
-/************************************************************
-*
-* Licensed to the Apache Software Foundation (ASF) under one
-* or more contributor license agreements.  See the NOTICE file
-* distributed with this work for additional information
-* regarding copyright ownership.  The ASF licenses this file
-* to you under the Apache License, Version 2.0 (the
-* "License"); you may not use this file except in compliance
-* with the License.  You may obtain a copy of the License at
-*
-*   http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an
-* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-* KIND, either express or implied.  See the License for the
-* specific language governing permissions and limitations
-* under the License.
-*
-*************************************************************/
-
-#ifndef SINGA_COMM_SOCKET_H_
-#define SINGA_COMM_SOCKET_H_
-
-#ifdef USE_ZMQ
-#include <czmq.h>
-#endif
-
-#include <map>
-#include <string>
-#include <vector>
-#include <unordered_map>
-#include "singa/utils/safe_queue.h"
-#include "singa/comm/msg.h"
-
-namespace singa {
-/**
- * Worker and Server use Dealer to communicate with Stub.
- * Stub uses Dealer to communicate with remote Stub.
- */
-class Dealer {
- public:
-   /**
-    * @param id used for identifying the msg queue of this dealer.
-    */
-   explicit Dealer(int id);
-  ~Dealer();
-  /**
-   * Setup the connection with the remote router.
-   *
-   * For local router, there is no need to connect it.
-   *
-   * @param endpoint Identifier of the remote router to connect. It follows
-   * ZeroMQ's format, i.e., IP:port, where IP is the connected process.
-   * @return 1 connection sets up successfully; 0 otherwise
-   */
-  int Connect(const std::string& endpoint);
-  /**
-   * Send a message to the local router (id=-1) or remote outer. It is
-   * non-blocking. The message will be deallocated after sending, thus
-   * should not be used after calling Send();
-   */
-  int Send(Msg** msg);
-  /**
-   * Recv msg from local router.
-   *
-   * @param timeout return if waiting for timeout microseconds.
-   * @return a message pointer if success; nullptr if failure
-   */
-  Msg* Receive(int timeout = 0);
-
- protected:
-  std::string endpoint_;
-  int id_;
-#ifdef USE_ZMQ
-  zsock_t* dealer_ = nullptr;
-#endif
-};
-/**
- * In Singa, since each process has one router used by Stub, hence we fix the
- * router to use the msg queue indexed by -1.
- */
-class Router {
- public:
-  ~Router();
-  Router();
-  /**
-   * Bind the router to an endpoint for recv msg from remote dealer.
-   * If the router is used for intra-communication only, then no need to call
-   * Bind.
-   *
-   * @param endpoint identifier for the Dealer socket in other process
-   * to connect. It has the format IP:Port, where IP is the host machine.
-   * @return number of connected dealers.
-   */
-  int Bind(const std::string& endpoint);
-  /**
-   * Send msg to local dealers by pushing the msg into the msg queue indexed by
-   * dst of the msg.
-   */
-  int Send(Msg** msg);
-  /**
-   * Recv msg from local (msg queue) or remote dealer (via zmq).
-   */
-  Msg* Receive(int timeout = 0);
-
- protected:
-  std::string endpoint_;
-#ifdef USE_ZMQ
-  zsock_t* router_ = nullptr;
-  zpoller_t* poller_ = nullptr;
-#endif
-};
-
-/**
- * Used for intra-process communication.
- * Each dealer/router has a SafeQueue for recieving msgs.
- * The sender pushes msgs onto the queue of the reciever's queue.
- */
-extern std::unordered_map<int, SafeQueue<Msg*>> msgQueues;
-}  // namespace singa
-
-#endif  // SINGA_COMM_SOCKET_H_
diff --git a/include/singa/core/common.h b/include/singa/core/common.h
new file mode 100644
index 0000000..53a9726
--- /dev/null
+++ b/include/singa/core/common.h
@@ -0,0 +1,121 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef SINGA_CORE_COMMON_H_
+#define SINGA_CORE_COMMON_H_
+#include <random>
+#include <chrono>
+#include "./singa/singa_config.h"
+#include <atomic>
+#include <memory>
+#include "singa/utils/logging.h"
+
+#ifdef USE_CUDA
+#include <cuda_runtime.h>
+#include <cublas_v2.h>
+#include <curand.h>
+#ifdef USE_CUDNN
+#include <cudnn.h>
+#endif
+#endif // USE_CUDA
+
+
+#ifdef USE_OPENCL
+#define CL_HPP_MINIMUM_OPENCL_VERSION 120
+#define CL_HPP_TARGET_OPENCL_VERSION 120
+#include <CL/cl2.hpp>
+#include <unordered_map>
+#endif  // USE_OPENCL
+
+using std::atomic;
+
+namespace singa {
+
+namespace lang {
+/// To implement functions using cpp libraries
+typedef struct _Cpp { } Cpp;
+/// To implement functions using cuda libraries
+typedef struct _Cuda { } Cuda;
+/// To implement function using opencl libraries
+typedef struct _Opencl { } Opencl;
+}  // namespace lang
+
+/// Block represent a chunk of memory (on device or host).
+class Block {
+ public:
+  Block(void* ptr, size_t size, size_t offset = 0)
+      : data_(ptr), size_(size), offset_(offset) {
+    ref_count_ = 1;  // std::make_shared<std::atomic<int>>(1);
+  }
+  // Disabled as it is not used currently.
+  // Block(void* ptr, size_t size, size_t offset, std::shared_ptr<atomic<int>>
+  //  ref) : data_(ptr), size_(size), offset_(offset), ref_count_(ref) {}
+  void* mutable_data() {
+    initialized_ = true;
+    return static_cast<char*>(data_) + offset_;
+  }
+  const void* data() const {
+    CHECK(initialized_) << "Must initialize data before reading it";
+    return static_cast<char*>(data_) + offset_;
+  }
+  size_t size() const { return size_; }
+  size_t offset() const { return offset_; }
+  int IncRefCount() {
+    return ++ref_count_;  // Note do not use ref_count_++;
+  }
+  int DecRefCount() {
+    return --ref_count_;
+  }
+  int ref_count() const { return ref_count_.load(); }
+
+  bool initialized() const {
+    return initialized_;
+  }
+
+ private:
+  Block() {}
+  void* data_ = nullptr;
+  size_t size_ = 0;
+  size_t offset_ = 0;
+  bool initialized_ = false;
+  // Disabled as it is not used currently.
+  // std::shared_ptr<std::atomic<int>> ref_count_ = nullptr;
+  std::atomic<int> ref_count_;
+};
+
+typedef struct _Context {
+  std::mt19937 random_generator;
+#ifdef USE_CUDA
+  cublasHandle_t cublas_handle;
+  cudaStream_t stream;
+  curandGenerator_t curand_generator;
+#ifdef USE_CUDNN
+  cudnnHandle_t cudnn_handle;
+#endif
+#endif // USE_CUDA
+
+#ifdef USE_OPENCL
+  std::shared_ptr<std::unordered_map<std::string, cl::Kernel>> kernels;
+  cl::CommandQueue ocl_cmdq;
+  cl::Context ocl_ctx;
+#endif
+
+} Context;
+
+}  // namespace singa
+#endif  // SINGA_CORE_COMMON_H_
diff --git a/include/singa/core/device.h b/include/singa/core/device.h
new file mode 100644
index 0000000..810d41f
--- /dev/null
+++ b/include/singa/core/device.h
@@ -0,0 +1,381 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef SINGA_CORE_DEVICE_H_
+#define SINGA_CORE_DEVICE_H_
+
+#include <type_traits>
+#include <vector>
+#include <string>
+#include <functional>
+#include <memory>
+
+#include "singa/singa_config.h"
+#include "singa/core/common.h"
+#include "singa/core/memory.h"
+#include "singa/core/scheduler.h"
+#include "singa/proto/core.pb.h"
+
+#ifdef USE_CUDA
+#include "singa/utils/cuda_utils.h"
+#endif // USE_CUDA
+
+#ifdef USE_OPENCL
+// http://github.khronos.org/OpenCL-CLHPP/
+// cl2.hpp includes cl.h, do not re-include.
+#define CL_HPP_MINIMUM_OPENCL_VERSION 120
+#define CL_HPP_TARGET_OPENCL_VERSION 120
+#include <unordered_map>
+#include <CL/cl2.hpp>
+#include "singa/utils/opencl_utils.h"
+#endif // USE_OPENCL
+
+using std::vector;
+using std::string;
+using std::function;
+using std::shared_ptr;
+
+namespace singa {
+
+/// Allocate memory and execute Tensor operations.
+/// There are three types of devices distinguished by their programming
+/// languages, namely cpp, cuda and opencl.
+class Device {
+  public:
+  // Device() = default;
+  virtual ~Device() {}
+  /// Constructor with device ID, num of executors (e.g., cuda streams),
+  /// max mem size to use (in MB)
+  Device(int id, int num_executors);
+
+  virtual void SetRandSeed(unsigned seed) = 0;
+
+  /// Called by Tensor.
+  Block* NewBlock(int size);
+
+  /// Called by Tensor.
+  void FreeBlock(Block* block);
+
+  /// Return the size (bytes) of memory in use
+  /// TODO(wangwei) override this function for all devices.
+  virtual size_t GetAllocatedMem() {
+    return 0u;
+  }
+
+  /// Copy data within or across devices.
+  void CopyDataToFrom(Block* dst, Block* src, size_t nBytes,
+                      CopyDirection direction, int dst_offset, int src_offset);
+
+  void CopyDataFromHostPtr(Block* dst, const void* src, size_t nBytes,
+                           size_t dst_offset = 0);
+  /// Submit the operation to the device, which may execute it right now or
+  /// delay it depending on the scheduler.
+  void Exec(function<void(Context*)>&& fn, const vector<Block*> read_blocks,
+                    const vector<Block*> write_blocks,
+                    bool use_rand_generator = false);
+
+  // Wait for one event.
+  // void WaitFor();
+
+  /// wait for all operations submitted to this device.
+  void Sync();
+
+  /// Return the programming language for this device.
+  LangType lang() const {
+    return lang_;
+  }
+
+  virtual std::shared_ptr<Device> host() const { return host_;}
+
+  Context* context(int k) {
+    return &ctx_;
+  }
+
+  int id() const { return id_; }
+
+ private:
+  Device() {};
+
+ protected:
+  /// Execute one operation on one executor.
+  virtual void DoExec(function<void(Context*)>&& fn, int executor) = 0;
+
+  virtual void CopyToFrom(void* dst, const void* src, size_t nBytes,
+                          CopyDirection direction, Context* ctx) = 0;
+
+  /// Allocate device memory.
+  virtual void* Malloc(int size) = 0;
+
+  /// Free device memory.
+  virtual void Free(void* ptr) = 0;
+
+ protected:
+  int id_ = 0;
+  int num_executors_ = 0;
+  unsigned seed_ = 0;
+  // Scheduler* scheduler_ = nullptr;
+  // VirtualMemory* vm_ = nullptr;
+  /// Programming language type, could be kCpp, kCuda, kOpencl
+  LangType lang_;
+  // SafeQueue<Operation> op_queue_;
+  // SafeQueue<Operation> op_log_;
+  /// The host device
+  std::shared_ptr<Device> host_;
+  // TODO(wangwei) define multiple contexts, one per executor
+  Context ctx_;
+};
+
+/// a singleton CppDevice as the host for all devices.
+extern std::shared_ptr<Device> defaultDevice;
+
+/// Represent a CPU device which may have multiple threads/executors.
+/// It runs cpp code.
+class CppCPU : public Device {
+ public:
+  ~CppCPU() {};
+  CppCPU();
+
+  std::shared_ptr<Device> host() const override { return defaultDevice;}
+  void SetRandSeed(unsigned seed) override;
+
+ protected:
+  void DoExec(function<void(Context*)>&& fn, int executor) override;
+
+  void CopyToFrom(void* dst, const void* src, size_t nBytes,
+                  CopyDirection direction, Context* ctx) override;
+
+  /// Allocate cpu memory.
+  void* Malloc(int size) override;
+
+  /// Free cpu memory.
+  void Free(void* ptr) override;
+};
+
+
+// Implement Device using OpenCL libs.
+// class OpenclDevice : public Device { };
+
+#ifdef USE_CUDA
+// Represent a Nvidia GPU which runs cuda code.
+class CudaGPU : public Device {
+ public:
+  ~CudaGPU();
+  /// Construct the device using default mem pool setting.
+  CudaGPU(int id = 0);
+  /// Construct the device given the physical device ID and memory pool.
+  CudaGPU(int id, std::shared_ptr<DeviceMemPool> pool);
+
+  void SetRandSeed(unsigned seed) override;
+  size_t GetAllocatedMem() override;
+
+ protected:
+  void DoExec(function<void(Context*)>&& fn, int executor) override;
+
+  void CopyToFrom(void* dst, const void* src, size_t nBytes,
+                  CopyDirection direction, Context* ctx) override;
+
+  /// Allocate cpu memory.
+  void* Malloc(int size) override;
+
+  /// Free cpu memory.
+  void Free(void* ptr) override;
+
+ private:
+  void Setup();
+
+ private:
+	shared_ptr<DeviceMemPool> pool_;
+};
+
+/// CudaCPU which uses cudaMallocHost to allocate pinned memory for host.
+
+#endif  // USE_CUDA
+
+#ifdef USE_OPENCL
+// Implement Device using OpenCL libs.
+class OpenclDevice : public singa::Device {
+public:
+
+  // TODO: Constructor arguments to consider:
+  // Path to kernel sources?
+  // Select only certain device types?
+  OpenclDevice(int id = 0, int num_executors = 1);
+  ~OpenclDevice();
+
+  /// Get the specified kernel.
+  cl::Kernel GetKernel(const std::string& kname, cl_int* status = nullptr);
+
+  /// Get the command queue associated with this device.
+  cl::CommandQueue GetCmdQ() { return cmdq; }
+
+  /// Prints information about all Devices in each Platform.
+  void PrintAllDeviceInfo();
+
+  /// Prints status about CL source code builds.
+  void PrintClBuildInfo(cl::Program &p);
+
+// Overridden, inherited methods
+  void SetRandSeed(unsigned seed) override;
+
+  void CopyDataToFrom(Block* dst, Block* src, size_t nBytes,
+                      CopyDirection direction, int dst_offset = 0,
+                      int src_offset = 0);
+/*
+  void CopyDataFromHostPtr(Block* dst, const void* src, size_t nBytes = 0,
+                           size_t dst_offset = 0) override;*/
+
+protected:
+  /// The OpenCL device that this object represents.
+  /// Each OpenclDevice contains exactly one cl::Device for the lifetime of the
+  /// object.
+  cl::Device this_device;
+
+  /// Each OpenclDevice has one OpenCL context. It is created along with the
+  /// creation of this object.
+  cl::Context ocl_ctx;
+
+  /// The CommandQueue that is associated with this device.
+  /// Since each OpenclDevice contains only one cl::Device and one cl::Context,
+  /// it naturally also contains one cl::CommandQueue that is associated
+  /// with said Device and Context.
+  cl::CommandQueue cmdq;
+
+  /// A list of kernels that has been compiled on this device.
+  std::shared_ptr<std::unordered_map<std::string, cl::Kernel>> kernels;
+
+  /// Searches the given paths for all .cl files and builds
+  /// OpenCL programs, then stores them in the Kernels map.
+  void BuildPrograms(const std::string &kdir = cl_src_path);
+
+// Overridden, inherited methods.
+
+  void DoExec(function<void(Context*)>&& fn, int executor) override;
+
+  void CopyToFrom(void* dst, const void* src, size_t nBytes,
+                  CopyDirection direction, Context* ctx = nullptr) override;
+
+  /// Allocates memory on this OpenCL device
+  /// by creating and returning an empty cl::Buffer object.
+  /// with the indicated size.
+  void* Malloc(int size) override;
+
+  /// Converts the void pointer into a Buffer object, then deletes the object.
+  /// This has the effect of freeing up device memory.
+  void Free(void* ptr) override;
+
+private:
+
+  /// Copies a data block from host to device.
+  /// src: a pointer to an array of data.
+  /// dst: a pointer to a cl::Buffer object.
+  void WriteToDevice(cl::Buffer* dst, const void* src, const size_t size);
+
+  /// Reads a data block from device to host.
+  /// src: a pointer to an cl::Buffer object.
+  /// dst: a pointer to an malloc'ed empty array.
+  void ReadFromDevice(void* dst, const cl::Buffer* src, const size_t size);
+
+  /// Duplicates a block of data on the device.
+  /// src: a pointer to the original cl::Buffer object.
+  /// dst: a pointer to the new cl::Buffer object to copy the data into.
+  void CopyDeviceBuffer(cl::Buffer* dst, const cl::Buffer* src, const size_t size);
+
+  static const std::string cl_src_path;
+};
+#endif  // USE_OPENCL
+
+/// This class queries all available calculating devices on a given machine
+/// grouped according to manufacturer or device drivers. All methods should be static.
+/// If CUDA or OPENCL are not enabled, then the respective related methods should
+/// return something that indicates their absence (for example, 0 devices);
+/// however they should always be available regardless of compile-time switches.
+class Platform {
+public:
+
+  /// Return the default host device
+  static std::shared_ptr<Device> GetDefaultDevice() {
+    return defaultDevice;
+  }
+
+#ifdef USE_CUDA
+  /// Return the number of total available GPUs
+  static int GetNumGPUs();
+
+  /// Return the device IDs of available GPUs.
+  /// TODO(wangwei) return the IDs according to free memory in descending order
+  static const std::vector<int> GetGPUIDs();
+
+  static const std::pair<size_t, size_t> GetGPUMemSize(const int device);
+
+  /// Return the memory of a GPU <free, total>
+  static const std::vector<std::pair<size_t, size_t>> GetGPUMemSize();
+
+  /// Return a string containing all hardware info, e.g., version, memory size.
+  static const std::string DeviceQuery(int id, bool verbose = false);
+
+  /// Create a set of CudaGPU Device using 'num_devices' free GPUs.
+  static const std::vector<std::shared_ptr<Device>>
+  CreateCudaGPUs(const size_t num_devices, size_t init_size = 0);
+
+  /// Create a set of CudaGPU Device using given GPU IDs.
+  static const std::vector<std::shared_ptr<Device>>
+  CreateCudaGPUsOn(const std::vector<int> &devices, size_t init_size = 0);
+#endif // USE_CUDA
+
+  /// Create a \p num_devices set of valid OpenCL devices, regardless of
+  /// platforms.  If there are fewer valid devices than requested, then this
+  /// method will return as many as possible. If OpenCL is not in use, this
+  /// method will return an empty array.
+  const std::vector<std::shared_ptr<Device> > CreateOpenclDevices(
+             const size_t num_devices);
+
+  /// Create a set of valid OpenCL devices, regardless of platforms, assigning
+  /// \p id to each device in sequence.
+  /// If there are fewer valid devices than requested, then this method will
+  /// return as many as possible.
+  /// If OpenCL is not in use, this method will return an empty array.
+  const std::vector<std::shared_ptr<Device> >
+  CreateOpenclDevices(const vector<int> &id);
+
+  /// This function is implemented by Caffe (http://caffe.berkeleyvision.org/).
+  /// This function checks the availability of GPU #device_id.
+  /// It attempts to create a context on the device by calling cudaFree(0).
+  /// cudaSetDevice() alone is not sufficient to check the availability.
+  /// It lazily records device_id, however, does not initialize a
+  /// context. So it does not know if the host thread has the permission to use
+  /// the device or not.
+  ///
+  /// In a shared environment where the devices are set to EXCLUSIVE_PROCESS
+  /// or EXCLUSIVE_THREAD mode, cudaSetDevice() returns cudaSuccess
+  /// even if the device is exclusively occupied by another process or thread.
+  /// Cuda operations that initialize the context are needed to check
+  /// the permission. cudaFree(0) is one of those with no side effect,
+  /// except the context initialization.
+  static bool CheckDevice(const int device_id);
+
+
+private:
+#ifdef USE_OPENCL
+  cl::Platform clPlatform;
+#endif  // USE_OPENCL
+};
+
+
+}  // namespace singa
+
+#endif  // SINGA_CORE_DEVICE_H_
diff --git a/include/singa/core/memory.h b/include/singa/core/memory.h
new file mode 100644
index 0000000..f664f95
--- /dev/null
+++ b/include/singa/core/memory.h
@@ -0,0 +1,91 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef SINGA_CORE_MEMORY_H_
+#define SINGA_CORE_MEMORY_H_
+
+#include <mutex>
+#include <atomic>
+#include "singa/proto/core.pb.h"
+#include "singa/singa_config.h"
+
+#ifdef USE_CUDA
+#include "cnmem.h"
+#endif
+
+
+namespace singa {
+
+/// Manage device memory pool including garbage collection, memory opt.
+class VirtualMemory {};
+
+class DeviceMemPool {
+ public:
+  virtual void Malloc(void** ptr, const size_t size)  = 0;
+  virtual void Free(void* ptr)  = 0;
+
+  /// Return a pair for free and total memory managed by this pool.
+  virtual std::pair<size_t, size_t> GetMemUsage() {
+    return std::make_pair(0u, 0u);
+  }
+  virtual ~DeviceMemPool(){};
+
+ protected:
+  size_t usage_;
+//  size_t init_size_ = 0, max_size_ = 0;
+};
+
+#ifdef USE_CUDA
+class CnMemPool : public DeviceMemPool {
+ public:
+  // Create the mem pool by setting the devices [0, numDevices), and
+  // initial pool size (MB), and max pool size (no effect currently).
+  CnMemPool(int numDevices = 1, size_t init_size = 256, size_t max_size = 0);
+  CnMemPool(const MemPoolConf& conf);
+
+  void Malloc(void** ptr, const size_t size);
+  void Free(void* ptr);
+
+  std::pair<size_t, size_t> GetMemUsage() override;
+
+  // release all memory and set cnmem manager to uninitialized
+  ~CnMemPool();
+
+ protected:
+  void Init();
+
+
+ private:
+
+  MemPoolConf conf_;
+  // whether the (global) memory pool has been initialized
+  bool initialized_ = false;
+  // lock on the initialized variable
+  std::mutex mtx_;
+
+  static std::atomic<int> pool_count;
+};
+
+class CudaMemPool : public DeviceMemPool {
+ public:
+  void Malloc(void** ptr, const size_t size) override;
+  void Free(void* ptr) override;
+};
+#endif
+}  // namespace singa
+#endif  // SINGA_CORE_MEMORY_H_
diff --git a/include/singa/core/scheduler.h b/include/singa/core/scheduler.h
new file mode 100644
index 0000000..3673c6b
--- /dev/null
+++ b/include/singa/core/scheduler.h
@@ -0,0 +1,27 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef SINGA_CORE_SCHEDULER_H_
+#define SINGA_CORE_SCHEDULER_H_
+
+namespace singa {
+
+/// Scheduling Tensor operations with dependency detection.
+class Scheduler {};
+
+}  // namespace singa
+#endif  // SINGA_CORE_SCHEDULER_H_
diff --git a/include/singa/core/tensor.h b/include/singa/core/tensor.h
new file mode 100644
index 0000000..2075b5d
--- /dev/null
+++ b/include/singa/core/tensor.h
@@ -0,0 +1,466 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef SINGA_CORE_TENSOR_H_
+#define SINGA_CORE_TENSOR_H_
+
+#include <vector>
+#include <tuple>
+#include <memory>
+
+#include "singa/core/common.h"
+#include "singa/core/device.h"
+#include "singa/proto/core.pb.h"
+#include "singa/utils/logging.h"
+
+using std::vector;
+using std::tuple;
+namespace singa {
+
+typedef vector<size_t> Shape;
+/// hardcode the width of types defined in DataType
+const size_t kDataWidth[] = {sizeof(float),  sizeof(float) / 2,
+                             sizeof(int),    sizeof(char),
+                             sizeof(double), sizeof(unsigned char)};
+inline size_t SizeOf(DataType t) {
+  static_assert(kNumDataType == sizeof(kDataWidth) / sizeof(size_t),
+                "Num of data types not match num of data width");
+  CHECK_GT(kNumDataType, t);
+  return kDataWidth[t];
+}
+
+/// A Tensor instance is a multi-dimensional array resident on a Device
+/// (default device is the host CPU). The internal data is allocated in lazy
+/// manner.
+/// Linear algebra, neural net and random operations are provided against
+/// Tensor.
+/// For all operations, if the result tensor is passed as an argument,
+/// then it must be set up correctly (shape, device). Otherwise, runtime error
+/// like SegmentFault would happen. Simply type/device check would be conducted.
+class Tensor {
+ public:
+  ~Tensor();
+  Tensor();
+  explicit Tensor(Shape &&shape, DataType dtype = kFloat32);
+  explicit Tensor(const Shape &shape, DataType dtype = kFloat32);
+  Tensor(Shape &&shape, std::shared_ptr<Device> dev, DataType dtype = kFloat32);
+  Tensor(const Shape &shape, std::shared_ptr<Device> dev,
+         DataType dtype = kFloat32);
+
+  /// Copy Tensor to share the internal data.  No deep copy.
+  Tensor(const Tensor &from);
+  /// Copy Tensor to share the internal data.  No deep copy.
+  Tensor(Tensor &&from);
+
+  /// For functions in xx_math.cc to access the block.
+  /// Users should not operate against Block directly.
+  /// block_ is allocated in constructors.
+  Block *block() const { return block_; }
+  void SetBlock(Block *block);
+
+  std::shared_ptr<Device> device() const { return device_; }
+
+  /// return immutable Tensor values with given type.
+  template <typename SType>
+  const SType *data() const {
+    return static_cast<const SType *>(block()->data());
+  }
+
+  /// used for swig code to convert Tensor into numpy array.
+  /// It gets data into 'value'
+  template <typename SType>
+  void GetValue(SType *value, const size_t num) {
+    CHECK(device_ == defaultDevice);
+    const SType* ptr = data<SType>();
+    for(size_t i = 0; i < num; i++) value[i] = ptr[i];
+  }
+
+  /// data type, including kFloat16, kFloat32, kInt
+  const DataType data_type() const { return data_type_; }
+
+  const Shape &shape() const { return shape_; }
+
+  const size_t shape(const size_t idx) const {
+    CHECK_LT(idx, shape_.size());
+    return shape_.at(idx);
+  }
+
+  size_t nDim() const { return shape_.size(); }
+
+  bool transpose() const { return transpose_; }
+
+  /// return number of total elements
+  size_t Size() const {
+    if (block_ == nullptr) return 0u;
+    CHECK_EQ(block_->size() % SizeOf(data_type_), 0u);
+    return block_->size() / SizeOf(data_type_);
+  }
+
+  /// return memory size (i.e., Bytes)
+  size_t MemSize() const { return block_->size(); }
+
+  /// Reset the tensor shape, it may reallocate block, if MemSize() changes.
+  void Reshape(const Shape &shape);
+  void Reshape(Shape &&shape);
+
+  /// Reset the shape, device, and data type as given tensor.
+  /// If block size changes, then reallocate a new block. The previous block
+  /// would
+  /// be deleted.
+  void ResetLike(const Tensor &t);
+
+  /// Reset the data type, it would reallocate block if type changes.
+  void AsType(const DataType type);
+
+  /// Reset the device.
+  /// If the target device is a diff device, then do deep data copy.
+  void ToDevice(std::shared_ptr<Device> dev);
+
+  /// Equivalent to ToDevice(host_dev).
+  void ToHost();
+
+  /// Set each element of the tensor to be x
+  template <typename SType>
+  void SetValue(const SType x);
+
+  /// For init the tensor values, copy 'num' elements from 'src' to the internal
+  /// memory with 'offset' (elements).
+  template <typename SType>
+  void CopyDataFromHostPtr(const SType *src, const size_t num,
+                           const size_t offset = 0);
+
+  /// Copy data from another Tensor which may be on a diff device.
+  /// Meta data would not be copied!
+  void CopyData(const Tensor &other);
+
+  /// Deserialize data, shape and transpose from protobuf object.
+  void FromProto(const singa::TensorProto &proto);
+
+  /// Serialize data, shape and transpose to protobuf object.
+  void ToProto(singa::TensorProto *proto) const;
+
+  /// return an exactly the same Tensor with data been deep copied to the given
+/// device. If 'device' is nullptr, then clone it on the current device.
+  Tensor Clone(std::shared_ptr<Device> device = nullptr) const;
+
+  // Tensor operations
+
+  /// Matrix transpose.  Valid only if shape.size() == 2.
+/// No data copy, just set the transpose_ field of the returned tensor.
+  Tensor T() const;
+
+  /// Copy the meta info with data block shared.
+  Tensor &operator=(const Tensor &in);
+
+  /// Copy the meta info with data block shared.
+  Tensor &operator=(Tensor &&in);
+
+  Tensor &operator+=(const Tensor &in);
+  // void operator+=(Tensor&& in);
+  Tensor &operator-=(const Tensor &in);
+  // void operator-=(Tensor&& in);
+  Tensor &operator*=(const Tensor &in);
+  // void operator*=(Tensor&& in);
+  Tensor &operator/=(const Tensor &in);
+  // void operator/=(Tensor&& in);
+
+  // Scalar operations.
+
+  /// SType is a scalar type
+  template <typename SType>
+  Tensor &operator+=(const SType x);
+
+  /// SType is a scalar type
+  template <typename SType>
+  Tensor &operator-=(const SType x);
+
+  /// SType is a scalar type
+  template <typename SType>
+  Tensor &operator*=(const SType x);
+
+  /// SType is a scalar type
+  template <typename SType>
+  Tensor &operator/=(const SType x);
+
+  /// Return average L1 norm
+  float L1() const;
+  /// Return average L2 norm
+  float L2() const;
+
+ protected:
+  bool transpose_ = false;
+  DataType data_type_ = kFloat32;
+  std::shared_ptr<Device> device_ = nullptr;
+  /// Note: block_ is allocated in lazy manner to avoid frequent malloc/free.
+  /// If you want to get an allocated Block, use block() instead of block_.
+  Block *block_ = nullptr;
+  Shape shape_ = {};
+};
+
+typedef Shape::iterator ShapeIter;
+inline size_t Product(const Shape &shape, int start = 0, size_t len = 0) {
+  if (len == 0) len = shape.size();
+  if (len == 0)
+    return 0;
+  CHECK_LE(len, shape.size());
+  size_t v = 1;
+  for (unsigned int i = start; i < len; i++) v *= shape[i];
+  return v;
+}
+
+inline void CheckDataTypeAndLang(const Tensor &in1, const Tensor &in2) {
+  CHECK_EQ(in1.data_type(), in2.data_type());
+  CHECK_EQ(in1.device()->lang(), in2.device()->lang());
+}
+
+template <typename FromType, typename ToType>
+ToType TypeCast(const FromType &x) {
+  // TODO(wangwei) cast fp16; prevent some casts, e.g., float to char
+  return static_cast<ToType>(x);
+}
+
+Tensor Reshape(const Tensor &in, const Shape &s);
+Tensor Reshape(const Tensor &in, Shape &&s);
+
+// For tensors with sparse content, e.g., missing columns or rows.
+// class SparseTensor : public Tensor {};
+
+/// Copy 'num' elements of src to dst.
+/// The first 'src_offset' ('dst_offset') elements will be skipped.
+void CopyDataToFrom(Tensor *dst, const Tensor &src, const size_t num,
+                    const size_t dst_offset = 0, const size_t src_offset = 0);
+
+// =============Element-wise operations====================================
+Tensor Abs(const Tensor &in);
+Tensor Exp(const Tensor &in);
+Tensor Log(const Tensor &in);
+Tensor ReLU(const Tensor &in);
+Tensor Sigmoid(const Tensor &in);
+Tensor Sign(const Tensor &in);
+Tensor Sqrt(const Tensor &in);
+Tensor Square(const Tensor &in);
+Tensor Tanh(const Tensor &in);
+
+void Abs(const Tensor &in, Tensor *out);
+void Exp(const Tensor &in, Tensor *out);
+void Log(const Tensor &in, Tensor *out);
+void ReLU(const Tensor &in, Tensor *out);
+void Sigmoid(const Tensor &in, Tensor *out);
+void Sign(const Tensor &in, Tensor *out);
+void Sqrt(const Tensor &in, Tensor *out);
+void Square(const Tensor &in, Tensor *out);
+void Tanh(const Tensor &in, Tensor *out);
+
+/// Element-wise operation, out[i]=in[i]^x
+template <typename SType>
+Tensor Pow(const Tensor &in, const SType x);
+/// Element-wise operation, out[i]=in[i]^x
+template <typename SType>
+void Pow(const Tensor &in, const SType x, Tensor *out);
+/// Element-wise operation, out[i]=base[i]^exp[i]
+Tensor Pow(const Tensor &base, const Tensor &exp);
+/// Element-wise operation, out[i]=base[i]^exp[i]
+void Pow(const Tensor &base, const Tensor &exp, Tensor *out);
+
+/// Element-wise operation, out[i]= (in[i] < x) ? 1.f : 0.f
+template <typename SType>
+Tensor operator<(const Tensor &in, const SType x);
+template <typename SType>
+void LT(const Tensor &in, const SType x, Tensor *out);
+
+/// Element-wise operation, out[i]= (in1[i] < in2[i]) ? 1.f : 0.f
+Tensor operator<(const Tensor &in1, const Tensor& in2);
+void LT(const Tensor &in1, const Tensor& in2, Tensor *out);
+
+/// Element-wise operation, out[i]= (in[i] <= x) ? 1.f : 0.f
+template <typename SType>
+Tensor operator<=(const Tensor &in, const SType x);
+template <typename SType>
+void LE(const Tensor &in, const SType x, Tensor *out);
+
+/// Element-wise operation, out[i]= (in1[i] <= in2[i]) ? 1.f : 0.f
+Tensor operator<=(const Tensor &in1, const Tensor& in2);
+void LE(const Tensor &in1, const Tensor& in2, Tensor *out);
+
+/// Element-wise operation, out[i]= (in[i] > x) ? 1.f : 0.f
+template <typename SType>
+Tensor operator>(const Tensor &in, const SType x);
+template <typename SType>
+void GT(const Tensor &in, const SType x, Tensor *out);
+
+/// Element-wise operation, out[i]= (in1[i] > in2[i]) ? 1.f : 0.f
+Tensor operator>(const Tensor &in1, const Tensor& in2);
+void GT(const Tensor &in1, const Tensor& in2, Tensor *out);
+
+
+/// Element-wise operation, out[i]= (in[i] >= x) ? 1.f : 0.f
+template <typename SType>
+Tensor operator>=(const Tensor &in, const SType x);
+template <typename SType>
+void GE(const Tensor &in, const SType x, Tensor *out);
+
+/// Element-wise operation, out[i]= (in1[i] >= in2[i]) ? 1.f : 0.f
+Tensor operator>=(const Tensor &in1, const Tensor& in2);
+void GE(const Tensor &in1, const Tensor& in2, Tensor *out);
+
+
+Tensor operator+(const Tensor &lhs, const Tensor &rhs);
+void Add(const Tensor &lhs, const Tensor &rhs, Tensor *out);
+Tensor operator-(const Tensor &lhs, const Tensor &rhs);
+void Sub(const Tensor &lhs, const Tensor &rhs, Tensor *out);
+Tensor operator*(const Tensor &lhs, const Tensor &rhs);
+void EltwiseMult(const Tensor &lhs, const Tensor &rhs, Tensor *out);
+Tensor operator/(const Tensor &lhs, const Tensor &rhs);
+void Div(const Tensor &lhs, const Tensor &rhs, Tensor *out);
+
+template <typename SType>
+Tensor operator+(const Tensor &in, const SType x);
+template <typename SType>
+void Add(const Tensor &in, const SType x, Tensor *out);
+
+template <typename SType>
+Tensor operator-(const Tensor &in, const SType x);
+template <typename SType>
+void Sub(const Tensor &in, const SType x, Tensor *out);
+
+template <typename SType>
+Tensor operator*(const Tensor &in, const SType x);
+template <typename SType>
+void EltwiseMult(const Tensor &in, const SType x, Tensor *out);
+
+/// For each element e of Tensor 'in', compute e / x
+template <typename SType>
+Tensor operator/(const Tensor &in, const SType x);
+/// For each element e of Tensor 'in', compute e / x into out
+template <typename SType>
+void Div(const Tensor &in, const SType x, Tensor *out);
+
+/// For each element e of Tensor 'in', compute x/e
+template <typename SType>
+Tensor Div(const SType x, const Tensor &in);
+/// For each element e of Tensor 'in', compute x/e into 'out'
+template <typename SType>
+void Div(const SType x, const Tensor &in, Tensor *out);
+
+template <typename SType = float>
+SType Sum(const Tensor &in);
+// ============Matrix (row/column) operations==================================
+/// Average elements in the Tensor, currently only support vector and matrix.
+/// if 'axis' is 0, average all rows into a single row
+/// if 'axis' is 1, average all columns into a single column
+/// TODO(wangwei) support arbitrary Tensor like numpy.average
+Tensor Average(const Tensor &in, const int axis);
+
+/// Add column 'v' with each column of matrix M
+void AddColumn(const Tensor &v, Tensor *M);
+/// For each column 'c' of matrix out, do c=alpha*v + beta*c
+template <typename SType>
+void AddColumn(const SType alpha, const SType beta, const Tensor &v,
+               Tensor *out);
+/// Add row 'v' with each row of matrix M; write results into 'out'
+void AddRow(const Tensor &v, Tensor *out);
+/// For each row 'r' of matrix out, do r=alpha*v + beta*r
+template <typename SType>
+void AddRow(const SType alpha, const SType beta, const Tensor &v, Tensor *M);
+/// Divide column 'v' by each column of matrix M; write results into 'out'
+void DivColumn(const Tensor &v, Tensor *M);
+/// Divide row 'v' by each row of matrix M; write results into 'out'
+void DivRow(const Tensor &v, Tensor *M);
+/// Multiply column 'v' and each column of matrix M; write results into 'out'
+void MultColumn(const Tensor &v, Tensor *M);
+/// Multiply row 'v' with each row of matrix M; write results into 'out'
+void MultRow(const Tensor &v, Tensor *M);
+/// Do softmax for each row. 'in' could be a 1-d or 2-d Tensor.
+Tensor SoftMax(const Tensor &in);
+/// Do softmax for each row. 'in' could be a 1-d or 2-d Tensor.
+void SoftMax(const Tensor &in, Tensor *out);
+/// Sub column 'v' by each column of matrix M
+void SubColumn(const Tensor &v, Tensor *M);
+/// Sub row 'v' by each row of matrix M; write results into 'out'
+void SubRow(const Tensor &v, Tensor *M);
+/// Sum all columns of matrix M into a single column as 'out'
+void SumColumns(const Tensor &M, Tensor *out);
+/// Sum all rows of matrix M into a single row as 'out'
+void SumRows(const Tensor &M, Tensor *out);
+
+/// Sum elements in the Tensor, currently only support vector and matrix.
+/// if 'axis' is 0, sum all rows into a single row
+/// if 'axis' is 1, sum all columns into a single column
+/// TODO(wangwei) support arbitrary Tensor like numpy.sum
+Tensor Sum(const Tensor &in, const int axis);
+
+// ================Random operations==========================================
+/// For each element x set x = 1 if random() < p; otherwise x = 0.
+template <typename SType>
+void Bernoulli(const SType p, Tensor *out);
+/// Fill in Tensor 't' following Gaussian distribution.
+template <typename SType>
+void Gaussian(const SType mean, const SType std, Tensor *out);
+/// Fill in Tensor 't' following uniform distribution.
+template <typename SType>
+void Uniform(const SType low, const SType high, Tensor *out);
+
+// ================Blas operations============================================
+// TODO(wangwei) make amax/amin/asum a member function of tensor
+
+/// out = alpha*in + out
+template <typename SType>
+void Axpy(SType alpha, const Tensor &in, Tensor *out);
+
+/// Do matrix vector multiplication or matrix matrix multiplication depending
+/// on the Tensor shape.  result = A * B
+Tensor Mult(const Tensor &A, const Tensor &B);
+/// Do matrix vector multiplication or matrix matrix multiplication depending
+/// on the Tensor shape.  C = A * B
+void Mult(const Tensor &A, const Tensor &B, Tensor *C);
+/// Do matrix vector multiplication or matrix matrix multiplication depending
+/// on the Tensor shape. out = alpha lhs * rhs + beta * out
+template <typename SType>
+void Mult(const SType alpha, const Tensor &A, const Tensor &B, const SType beta,
+          Tensor *C);
+
+// *****************
+// Misc.
+// ****************
+/// Compute the cross entropy loss given the prediction probability 'p' and
+/// the target (ground truth) labels 't'. 'p' and 't' are either 1-d vector
+/// or 2-d matrix. 'loss' is 1-d vector. The loss is computed into p.
+void ComputeCrossEntropy(const Tensor &p, const Tensor &t, Tensor *loss);
+/// Compute the dx, given prediction probability 'p' (p=softmax(x)) and
+/// the target (ground truth) labels 't'. 'p' and 't' are either 1-d vector
+/// or 2-d matrix. 'grad' has the same shape as 'p'. dx is computed into p.
+void SoftmaxCrossEntropyBwd(const Tensor &t, Tensor *p);
+
+/// Return a tensor consisting of rows ([start, end)) from 'in'. It shares the
+/// memory with 'in'. 'in' is a 1D or 2D Tensor.
+Tensor SliceRows(const Tensor &in, const size_t start, const size_t end);
+/// Return a tensor consisting of rows ([start, end)) from 'in'. It copies the
+/// values from 'in'. 'in' is a 2D Tensor.
+Tensor CopyRows(const Tensor &in, const size_t start, const size_t end);
+/// Return a tensor consisting of columns ([start, end)) from 'in'. It copies
+/// the values from 'in'. 'in' is a  2D Tensor.
+Tensor CopyColumns(const Tensor &in, const size_t start, const size_t end);
+/// Return a tensor which is vertically stacked from tensors in 'in'. Each
+/// tensor in 'in' is a 2D tensor. Values are copied, no memory sharing.
+Tensor ConcatenateRows(const vector<Tensor> &in);
+/// Return a tensor which is horizontally stacked from tensors in 'in'. Each
+/// tensor in 'in' is a 2D tensor. Values are copied, no memory sharing.
+Tensor ConcatenateColumns(const vector<Tensor> &in);
+}  // namespace singa
+
+#endif  // SINGA_CORE_TENSOR_H_
diff --git a/include/singa/driver.h b/include/singa/driver.h
deleted file mode 100644
index 0105158..0000000
--- a/include/singa/driver.h
+++ /dev/null
@@ -1,264 +0,0 @@
-/************************************************************
-*
-* Licensed to the Apache Software Foundation (ASF) under one
-* or more contributor license agreements.  See the NOTICE file
-* distributed with this work for additional information
-* regarding copyright ownership.  The ASF licenses this file
-* to you under the Apache License, Version 2.0 (the
-* "License"); you may not use this file except in compliance
-* with the License.  You may obtain a copy of the License at
-*
-*   http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an
-* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-* KIND, either express or implied.  See the License for the
-* specific language governing permissions and limitations
-* under the License.
-*
-*************************************************************/
-#ifndef SINGA_DRIVER_H_
-#define SINGA_DRIVER_H_
-
-#include <vector>
-#include <string>
-#include "singa/proto/job.pb.h"
-#include "singa/proto/singa.pb.h"
-#include "singa/utils/factory.h"
-#include "singa/utils/param.h"
-#include "singa/utils/singleton.h"
-#include "singa/utils/updater.h"
-#include "singa/neuralnet/layer.h"
-#include "singa/worker.h"
-#include "singa/server.h"
-
-namespace singa {
-using std::vector;
-class Driver {
- public:
-  /**
-   * Init SINGA
-   * - init glog
-   * - parse job id and job conf from cmd line
-   * - register built-in layer, worker, updater, param subclasses.
-   *
-   * May be used for MPI init if it is used for message passing.
-   */
-  void Init(int argc, char** argv);
-  /**
-   * Init SINGA LOG
-   * Used for python binding. Users can also directly call it as a C++ API.
-   * - init glog with given parameters
-   *
-   */
-  void InitLog(char *arg);
-  /**
-   * Update job configuration and call Train(const JobProto&) to start the
-   * training.
-   *
-   * It sets up the logging path and checkpoing files (if resume), and checks
-   * the existence of the workspace folder .
-   *
-   * @param[in] resume if true resume the training from the latest checkpoint
-   * files.
-   * @param[in] job_conf job configuration.
-   */
-  void Train(bool resume, const JobProto& job_conf);
-  /**
-   * Used for python binding. Users can also directly call it as a C++ API.
-   *
-   * It completes the functions as defined above but accept serialized string
-   * parameters.
-   *
-   * @param[in] resume if true resume the training from the latest checkpoint
-   * files.
-   * @param[in] str serialized string recorded job configuration.
-   */
-  void Train(bool resume, const std::string str);
-  /**
-   * Create workers and servers to conduct the training.
-   *
-   * @param[in] job_conf job configuration with all necessary fields set (e.g.,
-   * by Train(bool, const JobProto&).
-   */
-  void Train(const JobProto& job_conf);
-  /**
-   * Test the pre-trained model by loading parameters from checkpoint files.
-   *
-   * It can be used for both computing accuracy of test data, and extracting
-   * features (predicting label) of new data.
-   * @param[in] job_conf job configuration, which should include the checkpoint
-   * files and test settings (e.g., test steps). To extract features, the output
-   * layers should be added.
-   */
-  void Test(const JobProto& job_conf);
-  /**
-   * Used for python binding. Users can also directly call it as a C++ API.
-   *
-   * It completes the functions as defined above but accept serialized string
-   * parameters.
-   *
-   * @param[in] str serialized string recorded job configuration.
-   */
-  void Test(const std::string str);
-  /**
-   * Setting the checkpoint field of the job configuration to resume training.
-   *
-   * The checkpoint folder will be searched to get the files for the latest
-   * checkpoint, which will be added into the checkpoint field. The workers
-   * would then load the values of params from the checkpoint files.
-   *
-   * @param job_conf job configuration
-   */
-  void SetupForResume(JobProto* job_conf);
-  /**
-   * Create server instances.
-   *
-   * @param[in] job_conf job configuration.
-   * @param[in] net training neural network.
-   * @return server instances
-   */
-  const vector<Server*> CreateServers(const JobProto& job_conf, NeuralNet* net);
-  /**
-   * Create workers instances.
-   * @param[in] job_conf job configuration.
-   * @param[in] net training neural network.
-   * @return worker instances
-   */
-  const vector<Worker*> CreateWorkers(const JobProto& job_conf, NeuralNet* net);
-
-
-  /*********** Subclasses registers *************************/
-  /**
-   * Register a Layer subclass.
-   *
-   * @param type layer type ID. If called to register built-in subclasses,
-   * it is from LayerType; if called to register user-defined
-   * subclass, it is a string;
-   * @return 0 if success; otherwise -1.
-   */
-  template<typename Subclass, typename Type>
-  int RegisterLayer(const Type& type);
-  /**
-   * Register an Updater subclass.
-   *
-   * @param type ID of the subclass. If called to register built-in subclasses,
-   * it is from UpdaterType; if called to register user-defined
-   * subclass, it is a string;
-   * @return 0 if success; otherwise -1.
-   */
-  template<typename Subclass, typename Type>
-  int RegisterUpdater(const Type& type);
-  /**
-   * Register a learning rate generator subclasses.
-   *
-   * @param type ID of the subclass. If called to register built-in subclasses,
-   * it is from ChangeMethod; if called to register user-defined
-   * subclass, it is a string;
-   * @return 0 if success; otherwise -1.
-   */
-  template<typename Subclass, typename Type>
-  int RegisterLRGenerator(const Type& type);
-  /**
-   * Register a Worker subclass.
-   *
-   * @param type ID of the subclass. If called to register built-in subclasses,
-   * it is from TrainOneBatchAlg; if called to register user-defined
-   * subclass, it is a string;
-   * @return 0 if success; otherwise -1.
-   */
-  template<typename Subclass, typename Type>
-  int RegisterWorker(const Type& type);
-  /**
-   * Register a Param subclass.
-   * @param type ID of the subclass. If called to register built-in subclasses,
-   * it is from ParamType; if called to register user-defined
-   * subclass, it is a string;
-   *
-   * @return 0 if success; otherwise -1.
-   */
-  template<typename Subclass, typename Type>
-  int RegisterParam(const Type& type);
-  /**
-   * Register ParamGenerator subclasses for initalizing Param objects.
-   *
-   * @param type ID of the subclass. If called to register built-in subclasses,
-   * it is from InitMethod; if called to register user-defined
-   * subclass, it is a string;
-   * @return 0 if success; otherwise -1.
-   */
-  template<typename Subclass, typename Type>
-  int RegisterParamGenerator(const Type& type);
-
-  /****************** Access function ********************/
-  /**
-   * @return job ID which is generated by zookeeper and passed in by the
-   * launching script.
-   */
-  inline int job_id() const { return job_id_; }
-  /**
-   * @return job conf path which is passed by users at the command line. It
-   * should at least contains the cluster configuration.
-   */
-  inline JobProto job_conf() const { return job_conf_; }
-
- private:
-  int job_id_;
-  std::string hostip_;
-  JobProto job_conf_;
-  SingaProto singa_conf_;
-};
-
-/************* Implementation of template functions*************************
-* Must put the implementation in driver.h file instead of driver.cc.
-* Otherwise there would be linking error caused by unknown registration
-* functions, becuase these function cannot be generated merely based on its
-* declearation in driver.h.
-*/
-
-template<typename Subclass, typename Type>
-int Driver::RegisterLayer(const Type& type) {
-  auto factory = Singleton<Factory<singa::Layer>>::Instance();
-  factory->Register(type, CreateInstance(Subclass, Layer));
-  return 1;
-}
-
-template<typename Subclass, typename Type>
-int Driver::RegisterParam(const Type& type) {
-  auto factory = Singleton<Factory<singa::Param>>::Instance();
-  factory->Register(type, CreateInstance(Subclass, Param));
-  return 1;
-}
-
-template<typename Subclass, typename Type>
-int Driver::RegisterParamGenerator(const Type& type) {
-  auto factory = Singleton<Factory<singa::ParamGenerator>>::Instance();
-  factory->Register(type, CreateInstance(Subclass, ParamGenerator));
-  return 1;
-}
-
-template<typename Subclass, typename Type>
-int Driver::RegisterUpdater(const Type& type) {
-  auto factory = Singleton<Factory<singa::Updater>>::Instance();
-  factory->Register(type, CreateInstance(Subclass, Updater));
-  return 1;
-}
-
-template<typename Subclass, typename Type>
-int Driver::RegisterLRGenerator(const Type& type) {
-  auto factory = Singleton<Factory<singa::LRGenerator>>::Instance();
-  factory->Register(type, CreateInstance(Subclass, LRGenerator));
-  return 1;
-}
-
-template<typename Subclass, typename Type>
-int Driver::RegisterWorker(const Type& type) {
-  auto factory = Singleton<Factory<singa::Worker>>::Instance();
-  factory->Register(type, CreateInstance(Subclass, Worker));
-  return 1;
-}
-
-}  // namespace singa
-
-#endif  // SINGA_DRIVER_H_
diff --git a/include/singa/io/decoder.h b/include/singa/io/decoder.h
new file mode 100644
index 0000000..bf9a1bc
--- /dev/null
+++ b/include/singa/io/decoder.h
@@ -0,0 +1,74 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef SINGA_IO_DECODER_H_
+#define SINGA_IO_DECODER_H_
+
+#include <vector>
+#include <string>
+#include "singa/core/tensor.h"
+#include "singa/proto/io.pb.h"
+
+namespace singa {
+/// The base decoder that converts a string into a set of tensors.
+class Decoder {
+ public:
+  Decoder() { }
+  virtual ~Decoder() { }
+
+  virtual void Setup(const DecoderConf& conf) {}
+
+  /// Decode value to get data and labels
+  virtual std::vector<Tensor> Decode(std::string value) = 0;
+};
+
+#ifdef USE_OPENCV
+/// Decode the string as an ImageRecord object and convert it into a image
+/// tensor (dtype is kFloat32) and a label tensor (dtype is kInt).
+class JPGDecoder : public Decoder {
+ public:
+  void Setup(const DecoderConf& conf) override {
+    image_dim_order_ = conf.image_dim_order();
+  }
+  std::vector<Tensor> Decode(std::string value) override;
+
+  const std::string image_dim_order() const { return image_dim_order_; }
+
+ private:
+  /// Indicate the dimension order for the output image tensor.
+  std::string image_dim_order_ = "CHW";
+};
+#endif
+
+/// Decode the string of csv formatted data into data tensor
+/// (dtype is kFloat32) and optionally a label tensor (dtype is kInt).
+class CSVDecoder : public Decoder {
+ public:
+  void Setup(const DecoderConf& conf) override {
+    has_label_ = conf.has_label();
+  }
+  std::vector<Tensor> Decode(std::string value) override;
+
+  const bool has_label() const { return has_label_; }
+
+ private:
+  /// if true the first value is the label
+  bool has_label_ = false;
+};
+} // namespace singa
+#endif // SINGA_IO_DECODER_H_
diff --git a/include/singa/io/encoder.h b/include/singa/io/encoder.h
new file mode 100644
index 0000000..5ca7b21
--- /dev/null
+++ b/include/singa/io/encoder.h
@@ -0,0 +1,73 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef SINGA_IO_ENCODER_H_
+#define SINGA_IO_ENCODER_H_
+
+#include <vector>
+#include <string>
+#include "singa/core/tensor.h"
+#include "singa/proto/io.pb.h"
+
+namespace singa {
+
+/// Base encoder class that convert a set of tensors into string for storage.
+class Encoder {
+ public:
+  Encoder() {}
+  virtual ~Encoder() {}
+
+  virtual void Setup(const EncoderConf& conf) {}
+
+  /// Format each sample data as a string,
+  /// whose structure depends on the proto definition.
+  virtual std::string Encode(vector<Tensor>& data) = 0;
+};
+
+#ifdef USE_OPENCV
+/// Convert an image and its label into an ImageRecord (protobuf message).
+class JPGEncoder : public Encoder {
+ public:
+  void Setup(const EncoderConf& conf) override {
+    image_dim_order_ = conf.image_dim_order();
+  }
+  /// 'data' has two tensors, one for the image pixels (3D) and one for the
+  /// label. The image tensor's data type is kUChar.
+  /// The dimension order is indicated in the EncoderConf, i.e. image_dim_order.
+  /// The label tensor's data type is kInt.
+  std::string Encode(vector<Tensor>& data) override;
+
+  const std::string image_dim_order() const { return image_dim_order_; }
+
+ private:
+  /// Indicate the input image tensor's dimension order.
+  std::string image_dim_order_ = "CHW";
+};
+#endif  // USE_OPENCV
+
+/// Convert values from tensors into a csv formatted string.
+class CSVEncoder : public Encoder {
+ public:
+  void Setup(const EncoderConf& conf) override {}
+  /// 'data' has two tensors, one for the data vector (1D) and one for the
+  /// label. The data tensor's data type is kFloat.
+  /// The label tensor's data type is kInt.
+  std::string Encode(vector<Tensor>& data) override;
+};
+} // namespace singa
+#endif  // SINGA_IO_ENCODER_H_
diff --git a/include/singa/io/hdfs_store.h b/include/singa/io/hdfs_store.h
deleted file mode 100644
index 1fb9258..0000000
--- a/include/singa/io/hdfs_store.h
+++ /dev/null
@@ -1,58 +0,0 @@
-/************************************************************
-*
-* Licensed to the Apache Software Foundation (ASF) under one
-* or more contributor license agreements.  See the NOTICE file
-* distributed with this work for additional information
-* regarding copyright ownership.  The ASF licenses this file
-* to you under the Apache License, Version 2.0 (the
-* "License"); you may not use this file except in compliance
-* with the License.  You may obtain a copy of the License at
-*
-*   http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an
-* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-* KIND, either express or implied.  See the License for the
-* specific language governing permissions and limitations
-* under the License.
-*
-*************************************************************/
-
-#ifndef SINGA_IO_HDFS_STORE_H_
-#define SINGA_IO_HDFS_STORE_H_
-
-#include <string>
-#include "singa/io/store.h"
-#include "singa/io/hdfsfile.h"
-
-namespace singa {
-namespace io {
-
-/**
- * HDFS implementation of the Store interface. The store manages key-value 
- * records storing in HDFS files. 
- *
- * The store consists of records of the following format:
- *      [<length><content>] 
- */
-class HDFSStore : public Store {
- public:
-  ~HDFSStore() { Close();}
-  bool Open(const std::string& source, Mode mode) override;
-  void Close() override;
-  bool Read(std::string* key, std::string* value) override;
-  void SeekToFirst() override;
-  void Seek(int offset) override;
-  bool Write(const std::string& key, const std::string& value) override;
-  void Flush() override;
-
- private:
-  HDFSFile* file_ = nullptr;
-  Mode mode_;
-};
-
-}  // namespace io
-}  // namespace singa
-
-#endif  // SINGA_IO_HDFS_STORE_H_
diff --git a/include/singa/io/hdfsfile.h b/include/singa/io/hdfsfile.h
deleted file mode 100644
index cd3ded3..0000000
--- a/include/singa/io/hdfsfile.h
+++ /dev/null
@@ -1,131 +0,0 @@
-/************************************************************
-*
-* Licensed to the Apache Software Foundation (ASF) under one
-* or more contributor license agreements.  See the NOTICE file
-* distributed with this work for additional information
-* regarding copyright ownership.  The ASF licenses this file
-* to you under the Apache License, Version 2.0 (the
-* "License"); you may not use this file except in compliance
-* with the License.  You may obtain a copy of the License at
-*
-*   http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an
-* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-* KIND, either express or implied.  See the License for the
-* specific language governing permissions and limitations
-* under the License.
-*
-*************************************************************/
-
-#ifndef SINGA_IO_HDFSFILE_H_
-#define SINGA_IO_HDFSFILE_H_
-
-#include <fstream>
-#include <string>
-#include <unordered_set>
-
-
-#define USE_PROTOBUF 1
-
-#ifdef USE_PROTOBUF
-#include <google/protobuf/message.h>
-#endif
-
-#include <hdfs/hdfs.h>
-
-namespace singa {
-namespace io {
-
-/**
- * HDFSFile represents a specific partition of the HDFS file storing training/validation
- * or test data. HDFS library maintains its own buffer, so we don't need one. 
- * 
- * Each record is of the form: <length><content>
- */
-class HDFSFile {
- public:
-  enum Mode {
-    // read only mode used in training
-    kRead = 0,
-    // write mode used in creating HDFSFile (will overwrite previous one)
-    kCreate = 1,
-    // append mode, e.g. used when previous creating crashes
-    kAppend = 2
-  };
-
-  /**
-   * HDFSFile constructor.
-   *
-   * @param path path to file, of the form "hdfs://namenode/file_path"
-   * @param mode HDFSFile::kRead, HDFSFile::kCreate or HDFSFile::kAppend
-   */
-  HDFSFile(const std::string& path, Mode mode);
-  ~HDFSFile();
-
-#ifdef USE_PROTOBUF
-  /**
-   * read next tuple from the HDFSFile.
-   *
-   * @param val Record of type Message
-   * @return false if read unsuccess, e.g., the tuple was not inserted
-   *         completely.
-   */
-  bool Next(google::protobuf::Message* val);
-  /**
-   * Append one record to the HDFSFile.
-   *
-   * @param val
-   * @return false if unsucess, e.g., inserted before
-   */
-  bool Insert(const google::protobuf::Message& tuple);
-#endif
-
-  /**
-   * Read next record from the HDFSFile.
-   *
-   * @param val Record of type string
-   * @return false if unsuccess, e.g. the tuple was not inserted completely.
-   */
-  bool Next(std::string* val);
-  /**
-   * Append record to the KVFile.
-   *
-   * @param key e.g., image path
-   * @param val
-   * @return false if unsucess, e.g., inserted before
-   */
-  bool Insert(const std::string& tuple);
-  /**
-   * Move the read pointer to the head of the KVFile file.
-   * Used for repeated reading.
-   */
-  void Seek(int offset);
-
-  /**
-   * Flush buffered data to disk.
-   * Used only for kCreate or kAppend.
-   */
-  void Flush();
-    /**
-   * @return path to HDFSFile file
-   */
-  inline std::string path() { return path_; }
-
- private:
-  std::string path_ = "";
-  Mode mode_;
-  // handle to HDFS
-  hdfsFS fs_;
-  // handle to the HDFS open file
-  hdfsFile file_;
-
-  //!< to avoid replicated record
-  std::unordered_set<std::string> keys_;
-};
-}  // namespace io
-
-}  // namespace singa
-
-#endif  // SINGA_IO_HDFSFILE_H_
diff --git a/include/singa/io/integer.h b/include/singa/io/integer.h
new file mode 100644
index 0000000..9c2799d
--- /dev/null
+++ b/include/singa/io/integer.h
@@ -0,0 +1,73 @@
+/************************************************************
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ *
+ *************************************************************/
+
+#ifndef INTEGER_H_
+#define INTEGER_H_
+
+#include <cstdint>
+
+namespace singa{
+static bool isNetworkOrder() {
+    int test = 1;
+    return (1 != *(uint8_t*)&test);
+}
+
+template <typename T>
+static inline T byteSwap(const T& v) {
+    int size = sizeof(v);
+    T ret;
+    uint8_t *dest = reinterpret_cast<uint8_t *>(&ret);
+    uint8_t *src = const_cast<uint8_t*>(reinterpret_cast<const uint8_t*>(&v));
+    for (int i = 0; i < size; ++i) {
+        dest[i] = src[size - i - 1];
+    }
+    return ret;
+}
+
+template <typename T>
+static inline T hton(const T& v)
+{
+    return isNetworkOrder() ? v : byteSwap(v);
+}
+
+template <typename T>
+static inline T ntoh(const T& v) 
+{
+    return hton(v);
+}
+
+static inline int appendInteger(char* buf) {return 0;}
+static inline int readInteger(char* buf) {return 0;}
+
+template<typename Type, typename... Types>
+static int appendInteger(char* buf, Type value, Types... values) {
+    *(Type*)buf = hton(value);
+    return sizeof(Type) + appendInteger(buf + sizeof(Type), values...);
+}
+
+template<typename Type, typename... Types>
+static int readInteger(char* buf, Type& value, Types&... values) {
+    value = ntoh(*(Type*)buf);
+    return sizeof(Type) + readInteger(buf + sizeof(Type), values...);
+}
+
+}
+#endif
diff --git a/include/singa/io/kvfile.h b/include/singa/io/kvfile.h
deleted file mode 100644
index 6d9a709..0000000
--- a/include/singa/io/kvfile.h
+++ /dev/null
@@ -1,182 +0,0 @@
-/************************************************************
-*
-* Licensed to the Apache Software Foundation (ASF) under one
-* or more contributor license agreements.  See the NOTICE file
-* distributed with this work for additional information
-* regarding copyright ownership.  The ASF licenses this file
-* to you under the Apache License, Version 2.0 (the
-* "License"); you may not use this file except in compliance
-* with the License.  You may obtain a copy of the License at
-*
-*   http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an
-* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-* KIND, either express or implied.  See the License for the
-* specific language governing permissions and limitations
-* under the License.
-*
-*************************************************************/
-
-#ifndef SINGA_IO_KVFILE_H_
-#define SINGA_IO_KVFILE_H_
-
-#include <fstream>
-#include <string>
-#include <unordered_set>
-
-#define USE_PROTOBUF 1
-
-#ifdef USE_PROTOBUF
-#include <google/protobuf/message.h>
-#endif
-
-namespace singa {
-namespace io {
-
-/**
- * KVFile stores training/validation/test tuples.
- * Every worker node should have a KVFile for training data (validation/test
- * KVFile is optional).
- * KVFile consists of a set of unordered tuples. Each tuple is
- * encoded as [key_len key val_len val] (key_len and val_len are of type
- * uint32, which indicate the bytes of key and value respectively.
- *
- * When KVFile is created, it will remove the last tuple if the value size
- * and key size do not match because the last write crashed.
- *
- * TODO(wangwei) split one KVFile into multiple KVFile s.
- *
- */
-class KVFile {
- public:
-  enum Mode {
-    // read only mode used in training
-    kRead = 0,
-    // write mode used in creating KVFile (will overwrite previous one)
-    kCreate = 1,
-    // append mode, e.g. used when previous creating crashes
-    kAppend = 2
-  };
-
-  /**
-   * KVFile constructor.
-   *
-   * @param path path to the disk KVFile, it can be
-   *  - a path to local disk file.
-   *  - a path to local directory. This is to be compatible with the older
-   *    version (DataShard). The KVFile is shard.dat under that directory
-   *  - a hdfs file starting with "hdfs://"
-   * @param mode KVFile open mode, KVFile::kRead, KVFile::kWrite or
-   * KVFile::kAppend
-   * @param bufsize Cache bufsize bytes data for every disk op (read or write),
-   * default is 10MB.
-   */
-  KVFile(const std::string& path, Mode mode, int bufsize = 10485760);
-  ~KVFile();
-
-#ifdef USE_PROTOBUF
-  /**
-   * read next tuple from the KVFile.
-   *
-   * @param key Tuple key
-   * @param val Record of type Message
-   * @return false if read unsuccess, e.g., the tuple was not inserted
-   *         completely.
-   */
-  bool Next(std::string* key, google::protobuf::Message* val);
-  /**
-   * Append one tuple to the KVFile.
-   *
-   * @param key e.g., image path
-   * @param val
-   * @return false if unsucess, e.g., inserted before
-   */
-  bool Insert(const std::string& key, const google::protobuf::Message& tuple);
-#endif
-  /**
-   * read next tuple from the KVFile.
-   *
-   * @param key Tuple key
-   * @param val Record of type string
-   * @return false if unsuccess, e.g. the tuple was not inserted completely.
-   */
-  bool Next(std::string* key, std::string* val);
-  /**
-   * Append one tuple to the KVFile.
-   *
-   * @param key e.g., image path
-   * @param val
-   * @return false if unsucess, e.g., inserted before
-   */
-  bool Insert(const std::string& key, const std::string& tuple);
-  /**
-   * Move the read pointer to the head of the KVFile file.
-   * Used for repeated reading.
-   */
-  void SeekToFirst();
-  /**
-   * Flush buffered data to disk.
-   * Used only for kCreate or kAppend.
-   */
-  void Flush();
-  /**
-   * Iterate through all tuples to get the num of all tuples.
-   *
-   * @return num of tuples
-   */
-  int Count();
-  /**
-   * @return path to KVFile file
-   */
-  inline std::string path() { return path_; }
-
- protected:
-  /**
-   * Read the next key and prepare buffer for reading value.
-   *
-   * @param key
-   * @return length (i.e., bytes) of value field.
-   */
-  int Next(std::string* key);
-  /**
-   * Setup the disk pointer to the right position for append in case that
-   * the pervious write crashes.
-   *
-   * @param path KVFile path.
-   * @return offset (end pos) of the last success written record.
-   */
-  int PrepareForAppend(const std::string& path);
-  /**
-   * Read data from disk if the current data in the buffer is not a full field.
-   *
-   * @param size size of the next field.
-   */
-  bool PrepareNextField(int size);
-
- private:
-  std::string path_ = "";
-  Mode mode_;
-  //!< either ifstream or ofstream
-  std::fstream fdat_;
-  //!< to avoid replicated record
-  std::unordered_set<std::string> keys_;
-  //!< internal buffer
-  char* buf_ = nullptr;
-  //!< offset inside the buf_
-  int offset_ = 0;
-  //!< allocated bytes for the buf_
-  int capacity_ = 0;
-  //!< bytes in buf_, used in reading
-  int bufsize_ = 0;
-};
-}  // namespace io
-
-/**
- * @deprecated {ShardData is deprecated! Use KVFile}.
- */
-using DataShard = io::KVFile;
-}  // namespace singa
-
-#endif  // SINGA_IO_KVFILE_H_
diff --git a/include/singa/io/kvfile_store.h b/include/singa/io/kvfile_store.h
deleted file mode 100644
index 50b8f4f..0000000
--- a/include/singa/io/kvfile_store.h
+++ /dev/null
@@ -1,56 +0,0 @@
-/************************************************************
-*
-* Licensed to the Apache Software Foundation (ASF) under one
-* or more contributor license agreements.  See the NOTICE file
-* distributed with this work for additional information
-* regarding copyright ownership.  The ASF licenses this file
-* to you under the Apache License, Version 2.0 (the
-* "License"); you may not use this file except in compliance
-* with the License.  You may obtain a copy of the License at
-*
-*   http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an
-* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-* KIND, either express or implied.  See the License for the
-* specific language governing permissions and limitations
-* under the License.
-*
-*************************************************************/
-
-#ifndef SINGA_IO_KVFILE_STORE_H_
-#define SINGA_IO_KVFILE_STORE_H_
-
-#include <string>
-#include "singa/io/store.h"
-#include "singa/io/kvfile.h"
-
-namespace singa {
-namespace io {
-
-/**
- * Use the KVFile as the data storage.
- *
- * KVFile is a binary file. Each tuple is stored as byte string.
- */
-class KVFileStore : public Store {
- public:
-  ~KVFileStore() { Close();}
-  bool Open(const std::string& source, Mode mode) override;
-  void Close() override;
-  bool Read(std::string* key, std::string* value) override;
-  void SeekToFirst() override;
-  void Seek(int offset) override;
-  bool Write(const std::string& key, const std::string& value) override;
-  void Flush() override;
-
- private:
-  KVFile* file_ = nullptr;
-  Mode mode_;
-};
-
-}  // namespace io
-}  // namespace singa
-
-#endif  // SINGA_IO_KVFILE_STORE_H_
diff --git a/include/singa/io/network.h b/include/singa/io/network.h
new file mode 100644
index 0000000..63983ad
--- /dev/null
+++ b/include/singa/io/network.h
@@ -0,0 +1,171 @@
+/************************************************************
+*
+* Licensed to the Apache Software Foundation (ASF) under one
+* or more contributor license agreements.  See the NOTICE file
+* distributed with this work for additional information
+* regarding copyright ownership.  The ASF licenses this file
+* to you under the Apache License, Version 2.0 (the
+* "License"); you may not use this file except in compliance
+* with the License.  You may obtain a copy of the License at
+*
+*   http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing,
+* software distributed under the License is distributed on an
+* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+* KIND, either express or implied.  See the License for the
+* specific language governing permissions and limitations
+* under the License.
+*
+*************************************************************/
+
+#ifndef SINGA_COMM_NETWORK_H_
+#define SINGA_COMM_NETWORK_H_
+#include "singa/singa_config.h"
+#ifdef ENABLE_DIST
+#include <ev.h>
+#include <thread>
+#include <unordered_map>
+#include <map>
+#include <vector>
+#include <condition_variable>
+#include <mutex>
+#include <atomic>
+#include <string>
+#include <netinet/in.h>
+#include <queue>
+
+namespace singa {
+
+#define LOCKED 1
+#define UNLOCKED 0
+
+#define SIG_EP 1
+#define SIG_MSG 2
+
+#define CONN_INIT 0
+#define CONN_PENDING 1
+#define CONN_EST 2
+#define CONN_ERROR 3
+
+#define MAX_RETRY_CNT 3
+
+#define EP_TIMEOUT 5.
+
+#define MSG_DATA 0
+#define MSG_ACK 1
+
+class NetworkThread;
+class EndPoint;
+class EndPointFactory;
+
+class Message {
+private:
+  uint8_t type_;
+  uint32_t id_;
+  std::size_t msize_ = 0;
+  std::size_t psize_ = 0;
+  std::size_t processed_ = 0;
+  char *msg_ = nullptr;
+  static const int hsize_ =
+      sizeof(id_) + 2 * sizeof(std::size_t) + sizeof(type_);
+  char mdata_[hsize_];
+  friend class NetworkThread;
+  friend class EndPoint;
+
+public:
+  Message(int = MSG_DATA, uint32_t = 0);
+  Message(const Message &) = delete;
+  Message(Message &&);
+  ~Message();
+
+  void setMetadata(const void *, int);
+  void setPayload(const void *, int);
+
+  std::size_t getMetadata(void **);
+  std::size_t getPayload(void **);
+
+  std::size_t getSize();
+  void setId(uint32_t);
+};
+
+class EndPoint {
+private:
+  std::queue<Message *> send_;
+  std::queue<Message *> recv_;
+  std::queue<Message *> to_ack_;
+  std::condition_variable cv_;
+  std::mutex mtx_;
+  struct sockaddr_in addr_;
+  ev_timer timer_;
+  ev_tstamp last_msg_time_;
+  int fd_[2] = { -1, -1 }; // two endpoints simultaneously connect to each other
+  int pfd_ = -1;
+  bool is_socket_loop_ = false;
+  int conn_status_ = CONN_INIT;
+  int pending_cnt_ = 0;
+  int retry_cnt_ = 0;
+  NetworkThread *thread_ = nullptr;
+  EndPoint(NetworkThread *t);
+  ~EndPoint();
+  friend class NetworkThread;
+  friend class EndPointFactory;
+
+public:
+  int send(Message *);
+  Message *recv();
+};
+
+class EndPointFactory {
+private:
+  std::unordered_map<uint32_t, EndPoint *> ip_ep_map_;
+  std::condition_variable map_cv_;
+  std::mutex map_mtx_;
+  NetworkThread *thread_;
+  EndPoint *getEp(uint32_t ip);
+  EndPoint *getOrCreateEp(uint32_t ip);
+  friend class NetworkThread;
+
+public:
+  EndPointFactory(NetworkThread *thread) : thread_(thread) {}
+  ~EndPointFactory();
+  EndPoint *getEp(const char *host);
+  void getNewEps(std::vector<EndPoint *> &neps);
+};
+
+class NetworkThread {
+private:
+  struct ev_loop *loop_;
+  ev_async ep_sig_;
+  ev_async msg_sig_;
+  ev_io socket_watcher_;
+  int port_;
+  int socket_fd_;
+  std::thread *thread_;
+  std::unordered_map<int, ev_io> fd_wwatcher_map_;
+  std::unordered_map<int, ev_io> fd_rwatcher_map_;
+  std::unordered_map<int, EndPoint *> fd_ep_map_;
+  std::map<int, Message> pending_msgs_;
+
+  void handleConnLost(int, EndPoint *, bool = true);
+  void doWork();
+  int asyncSend(int);
+  void asyncSendPendingMsg(EndPoint *);
+  void afterConnEst(EndPoint *ep, int fd, bool active);
+
+public:
+  EndPointFactory *epf_;
+
+  NetworkThread(int);
+  void notify(int signal);
+
+  void onRecv(int fd);
+  void onSend(int fd = -1);
+  void onConnEst(int fd);
+  void onNewEp();
+  void onNewConn();
+  void onTimeout(struct ev_timer *timer);
+};
+}
+#endif  // ENABLE_DIST
+#endif
diff --git a/include/singa/io/reader.h b/include/singa/io/reader.h
new file mode 100644
index 0000000..66d7e37
--- /dev/null
+++ b/include/singa/io/reader.h
@@ -0,0 +1,188 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef SINGA_IO_READER_H_
+#define SINGA_IO_READER_H_
+
+#include <cstring>
+#include <fstream>
+#include <string>
+#include "singa/singa_config.h"
+
+#ifdef USE_LMDB
+#include <lmdb.h>
+#include <sys/stat.h>
+#include <vector>
+#endif  // USE_LMDB
+
+namespace singa {
+namespace io {
+
+using std::string;
+
+/// General Reader that provides functions for reading tuples.
+/// Subclasses implement the functions for a specific data storage, e.g., CSV
+/// file, HDFS, kvfile, leveldb, lmdb, etc.
+class Reader {
+ public:
+  /// In case that users forget to call Close() to release resources, e.g.,
+  /// memory, you can release them here.
+  virtual ~Reader() {}
+
+  /// path is the path to the storage, could be a file path, database
+  /// connection, or hdfs path.
+  /// return true if open successfully, otherwise false.
+  virtual bool Open(const std::string& path) = 0;
+
+  /// Release resources.
+  virtual void Close() = 0;
+
+  /// Read a tuple.
+  /// return true if read successfully;
+  /// return false if coming to the end of the file;
+  /// LOG(FATAL) if error happens.
+  virtual bool Read(std::string* key, std::string* value) = 0;
+
+  /// Iterate through all tuples to get the num of all tuples.
+  /// return num of tuples
+  virtual int Count() = 0;
+
+  /// Seek to the first tuple when the cursor arrives to the end of the file
+  virtual void SeekToFirst() = 0;
+};
+
+/// Binfilereader reads tuples from binary file with key-value pairs.
+class BinFileReader : public Reader {
+ public:
+  ~BinFileReader() { Close(); }
+  /// \copydoc Open(const std::string& path)
+  bool Open(const std::string& path) override;
+  /// \copydoc Open(const std::string& path), user defines capacity
+  bool Open(const std::string& path, int capacity);
+  /// \copydoc Close()
+  void Close() override;
+  /// \copydoc Read(std::string* key, std::string* value)
+  bool Read(std::string* key, std::string* value) override;
+  /// \copydoc Count()
+  int Count() override;
+  /// \copydoc SeekToFirst()
+  void SeekToFirst() override;
+  /// return path to binary file
+  inline std::string path() { return path_; }
+
+ protected:
+  /// Open a file with path_ and initialize buf_
+  bool OpenFile();
+  /// Read the next field, including content_len and content;
+  /// return true if succeed.
+  bool ReadField(std::string* content);
+  /// Read data from disk if the current data in the buffer is not a full field.
+  /// size is the size of the next field.
+  bool PrepareNextField(int size);
+
+ private:
+  /// file to be read
+  std::string path_ = "";
+  /// ifstream
+  std::ifstream fdat_;
+  /// internal buffer
+  char* buf_ = nullptr;
+  /// offset inside the buf_
+  int offset_ = 0;
+  /// allocated bytes for the buf_, default is 10M
+  int capacity_ = 10485760;
+  /// bytes in buf_
+  int bufsize_ = 0;
+  /// magic word
+  const char kMagicWord[2] = {'s', 'g'};
+};
+
+/// TextFileReader reads tuples from CSV file.
+class TextFileReader : public Reader {
+ public:
+  ~TextFileReader() { Close(); }
+  /// \copydoc Open(const std::string& path)
+  bool Open(const std::string& path) override;
+  /// \copydoc Close()
+  void Close() override;
+  /// \copydoc Read(std::string* key, std::string* value)
+  bool Read(std::string* key, std::string* value) override;
+  /// \copydoc Count()
+  int Count() override;
+  /// \copydoc SeekToFirst()
+  void SeekToFirst() override;
+  /// return path to text file
+  inline std::string path() { return path_; }
+
+ private:
+  /// file to be read
+  std::string path_ = "";
+  /// ifstream
+  std::ifstream fdat_;
+  /// current line number
+  int lineNo_ = 0;
+};
+
+#ifdef USE_LMDB
+/// LMDBReader reads tuples from LMDB.
+class LMDBReader : public Reader {
+ public:
+  ~LMDBReader() { Close(); }
+  /// \copydoc Open(const std::string& path)
+  bool Open(const std::string& path) override;
+  /// \copydoc Close()
+  void Close() override;
+  /// \copydoc Read(std::string* key, std::string* value)
+  bool Read(std::string* key, std::string* value) override;
+  /// \copydoc Count()
+  int Count() override;
+  /// \copydoc SeekToFirst()
+  void SeekToFirst() override;
+  /// Return path to text file
+  inline std::string path() { return path_; }
+  /// Return valid, to indicate SeekToFirst();
+  inline bool valid() { return valid_; }
+
+ protected:
+  /// Seek to a certain position: MDB_FIRST, MDB_NEXT
+  void Seek(MDB_cursor_op op);
+  inline void MDB_CHECK(int mdb_status);
+
+ private:
+  /// file to be read
+  std::string path_ = "";
+  /// lmdb env variable
+  MDB_env* mdb_env_ = nullptr;
+  /// lmdb db instance
+  MDB_dbi mdb_dbi_;
+  /// lmdb transaction
+  MDB_txn* mdb_txn_ = nullptr;
+  /// lmdb cursor
+  MDB_cursor* mdb_cursor_ = nullptr;
+  /// lmdb key-value pair
+  MDB_val mdb_key_, mdb_value_;
+  /// whether the pair is found
+  bool valid_;
+  /// whether the cursor is at the first place
+  bool first_;
+};
+#endif  // USE_LMDB
+}  // namespace io
+}  // namespace singa
+
+#endif  // SINGA_IO_READER_H_
diff --git a/include/singa/io/snapshot.h b/include/singa/io/snapshot.h
new file mode 100644
index 0000000..0d5aa66
--- /dev/null
+++ b/include/singa/io/snapshot.h
@@ -0,0 +1,81 @@
+/************************************************************
+*
+* Licensed to the Apache Software Foundation (ASF) under one
+* or more contributor license agreements.  See the NOTICE file
+* distributed with this work for additional information
+* regarding copyright ownership.  The ASF licenses this file
+* to you under the Apache License, Version 2.0 (the
+* "License"); you may not use this file except in compliance
+* with the License.  You may obtain a copy of the License at
+*
+*   http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing,
+* software distributed under the License is distributed on an
+* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+* KIND, either express or implied.  See the License for the
+* specific language governing permissions and limitations
+* under the License.
+*
+*************************************************************/
+
+#ifndef SINGA_UTILS_SNAPSHOT_H_
+#define SINGA_UTILS_SNAPSHOT_H_
+
+#include "singa/io/reader.h"
+#include "singa/io/writer.h"
+#include "singa/utils/logging.h"
+#include "singa/proto/core.pb.h"
+#include "singa/core/tensor.h"
+
+#include <string>
+#include <unordered_set>
+#include <unordered_map>
+#include <memory>
+
+namespace singa {
+/// The snapshot management.
+/// It dumps the model parameter snapshot as checkpoint files, which could be
+/// used for fine-tuning and deployment.
+/// The model parameters are separated from model definition, i.e., net
+/// construction. Users either randomly initialize the layer parameters or using
+/// the parameters from checkpoint files using Snapshot after creating the
+/// neural network.
+class Snapshot {
+ public:
+  enum Mode { kRead, kWrite };
+  /// <prefix>.model is the binary file for parameter key-value pair.
+  /// <prefix>.meta is the text file describing information about parameters,
+  /// i.e.
+  /// name and shape, one line per parameter.
+  /// kRead for reading snapshot, whereas kWrite for dumping out snapshot.
+  /// max_param_size: in MB
+  Snapshot(const std::string& prefix, Mode mode, int max_param_size = 10);
+  ~Snapshot() {}
+  /// Read parameters saved as tensors from checkpoint file.
+  std::vector<std::pair<std::string, Tensor>> Read();
+  /// Read parameter shapes from description file.
+  std::vector<std::pair<std::string, Shape>> ReadShape();
+  /// Read parameter returned as a tensor for a given parameter name.
+  Tensor Read(const std::string& Key);
+  /// Read parameter shape for a given parameter name.
+  Shape ReadShape(const std::string& key);
+  /// Serialize and dump out parameter. This method will write two files, one
+  /// binary file is for serialized tensors, the other csv file is for parameter
+  /// names and shapes.
+  void Write(const std::string& key, const Tensor& param);
+
+ private:
+  std::string prefix_;
+  Mode mode_;
+  std::unique_ptr<io::BinFileWriter> bin_writer_ptr_;
+  std::unique_ptr<io::Writer> text_writer_ptr_;
+  std::unique_ptr<io::BinFileReader> bin_reader_ptr_;
+  /// Check whether parameter name is unique.
+  std::unordered_set<std::string> param_names_;
+  /// Preload key-parameter tensor pairs for seeking a specified key.
+  std::unordered_map<std::string, Tensor> param_map_;
+};
+}  //  namespace singa
+
+#endif  //  SINGA_UTILS_SNAPSHOT_H_
diff --git a/include/singa/io/store.h b/include/singa/io/store.h
deleted file mode 100644
index a63a981..0000000
--- a/include/singa/io/store.h
+++ /dev/null
@@ -1,111 +0,0 @@
-/************************************************************
-*
-* Licensed to the Apache Software Foundation (ASF) under one
-* or more contributor license agreements.  See the NOTICE file
-* distributed with this work for additional information
-* regarding copyright ownership.  The ASF licenses this file
-* to you under the Apache License, Version 2.0 (the
-* "License"); you may not use this file except in compliance
-* with the License.  You may obtain a copy of the License at
-*
-*   http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an
-* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-* KIND, either express or implied.  See the License for the
-* specific language governing permissions and limitations
-* under the License.
-*
-*************************************************************/
-
-#ifndef SINGA_IO_STORE_H_
-#define SINGA_IO_STORE_H_
-
-#include <string>
-
-namespace singa {
-namespace io {
-
-using std::string;
-enum Mode { kCreate, kRead, kAppend };
-
-/**
- * General key-value store that provides functions for reading and writing
- * tuples.
- *
- * Subclasses implement the functions for a specific data storage, e.g., CSV
- * file, HDFS, image folder, singa::io::SFile, leveldb, lmdb, etc.
- */
-class Store {
- public:
-  Store() { }
-  /**
-   * In case that users forget to call Close() to release resources, e.g.,
-   * memory, you can release them here.
-   */
-  virtual ~Store() { }
-  /**
-   * @param[in] source path to the storage, could be a file path, folder path
-   * or hdfs path, or even a http url.
-   * @param[in] mode
-   * @return true if open successfully, otherwise false.
-   */
-  virtual bool Open(const std::string& source, Mode mode) = 0;
-  /**
-   * Release resources.
-   */
-  virtual void Close() = 0;
-  /**
-   * Read a tuple.
-   *
-   * @param[out] key
-   * @param[out] value
-   * @return true if read successfully, otherwise false.
-   */
-  virtual bool Read(std::string* key, std::string* value) = 0;
-  /**
-   * Seek the read header to the first tuple.
-   */
-  virtual void SeekToFirst() = 0;
-
-  /**
-   * Seek to an offset. This allows concurrent workers to start reading from
-   * different positions (HDFS). 
-   */
-  virtual void Seek(int offset) = 0; 
-  /**
-   * Write a tuple.
-   *
-   * @param[in] key
-   * @param[in] value
-   * @return true if success, otherwise false.
-   */
-  virtual bool Write(const std::string& key, const std::string& value) = 0;
-  /**
-   * Flush writing buffer if it has.
-   */
-  virtual void Flush() {}
-};
-
-/**
- * Create a Store object.
- *
- * @param[in] backend identifier for a specific backend. Two backends are
- * inluced currently, i.e., "kvfile", "textfile"
- * @return a pointer to the newly created Store.
- */
-Store* CreateStore(const string& backend);
-/**
- * Create and open a Store object.
- *
- * @param[in] backend, @see CreateStore().
- * @param[in] path
- * @param[in] mode kRead or kCreate or kAppend
- */
-Store* OpenStore(const string& backend, const string& path, Mode mode);
-
-}  // namespace io
-}  // namespace singa
-
-#endif  // SINGA_IO_STORE_H_
diff --git a/include/singa/io/textfile_store.h b/include/singa/io/textfile_store.h
deleted file mode 100644
index 83bcbfa..0000000
--- a/include/singa/io/textfile_store.h
+++ /dev/null
@@ -1,57 +0,0 @@
-/************************************************************
-*
-* Licensed to the Apache Software Foundation (ASF) under one
-* or more contributor license agreements.  See the NOTICE file
-* distributed with this work for additional information
-* regarding copyright ownership.  The ASF licenses this file
-* to you under the Apache License, Version 2.0 (the
-* "License"); you may not use this file except in compliance
-* with the License.  You may obtain a copy of the License at
-*
-*   http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an
-* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-* KIND, either express or implied.  See the License for the
-* specific language governing permissions and limitations
-* under the License.
-*
-*************************************************************/
-
-#ifndef SINGA_IO_TEXTFILE_STORE_H_
-#define SINGA_IO_TEXTFILE_STORE_H_
-
-#include <fstream>
-#include <string>
-#include "singa/io/store.h"
-
-namespace singa {
-namespace io {
-/**
- * Use text file as the data storage, one line per tuple.
- *
- * It is used for storeing CSV format data where the key is the line No. and
- * the value is the line.
- */
-class TextFileStore : public Store {
- public:
-  ~TextFileStore() { Close(); }
-  bool Open(const std::string& source, Mode mode) override;
-  void Close() override;
-  bool Read(std::string* key, std::string* value) override;
-  void SeekToFirst() override;
-  void Seek(int offset) override;
-  bool Write(const std::string& key, const std::string& value) override;
-  void Flush() override;
-
- private:
-  int lineNo_ = 0;
-  std::fstream* fs_ = nullptr;
-  Mode mode_;
-};
-
-}  // namespace io
-}  // namespace singa
-
-#endif  // SINGA_IO_TEXTFILE_STORE_H_
diff --git a/include/singa/io/transformer.h b/include/singa/io/transformer.h
new file mode 100644
index 0000000..d9a9263
--- /dev/null
+++ b/include/singa/io/transformer.h
@@ -0,0 +1,89 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef SINGA_IO_TRANSFORMER_H_
+#define SINGA_IO_TRANSFORMER_H_
+
+#include <vector>
+#include <string>
+#include "singa/core/tensor.h"
+#include "singa/proto/io.pb.h"
+#include "singa/proto/model.pb.h"
+
+namespace singa {
+
+/// Base class that applies data transformations in the pre-processing stage.
+class Transformer {
+ public:
+  Transformer() {}
+  virtual ~Transformer() {}
+
+  virtual void Setup(const TransformerConf& conf) {}
+
+  virtual Tensor Apply(int flag, Tensor& input) = 0;
+};
+
+class ImageTransformer: public Transformer {
+ public:
+  void Setup(const TransformerConf& conf) override {
+    featurewise_center_ = conf.featurewise_center();
+    featurewise_std_norm_ = conf.featurewise_std_norm();
+    resize_height_ = conf.resize_height();
+    resize_width_ = conf.resize_width();
+    rescale_ = conf.rescale();
+    horizontal_mirror_ = conf.horizontal_mirror();
+    image_dim_order_ = conf.image_dim_order();
+    
+    /// If crop_shape does not contain 2 elements, ignore the crop option.
+    if (conf.crop_shape_size() == 2)
+      crop_shape_ = {conf.crop_shape(0), conf.crop_shape(1)};      
+  }
+
+  Tensor Apply(int flag, Tensor& input) override;
+
+  const bool featurewise_center() const { return featurewise_center_; }
+  const bool featurewise_std_norm() const { return featurewise_std_norm_; }
+  const bool horizontal_mirror() const { return horizontal_mirror_; }
+  const int resize_height() const { return resize_height_; }
+  const int resize_width() const { return resize_width_; }
+  const float rescale() const { return rescale_; }
+  const Shape crop_shape() const { return crop_shape_; }
+  const string image_dim_order() const { return image_dim_order_; }
+
+ private:
+  bool featurewise_center_ = false;
+  bool featurewise_std_norm_ = false;
+  bool horizontal_mirror_ = false;
+  int resize_height_ = 0;
+  int resize_width_ = 0;
+  float rescale_ = 0.f;
+  Shape crop_shape_ = {};
+  std::string image_dim_order_ = "CHW";
+};
+
+#ifdef USE_OPENCV
+Tensor resize(Tensor& input, const size_t resize_height, 
+         const size_t resize_width, const string& image_dim_order);
+#endif
+Tensor crop(Tensor& input, const size_t crop_height, 
+             const size_t crop_width, const size_t crop_h_offset, 
+             const size_t crop_w_offset, const string& image_dim_order);
+Tensor mirror(Tensor& input, const bool horizontal_mirror, 
+             const bool vertical_mirror, const string& image_dim_order);
+} // namespace singa
+#endif  // SINGA_IO_TRANSFORMER_H_
diff --git a/include/singa/io/writer.h b/include/singa/io/writer.h
new file mode 100644
index 0000000..bd4043a
--- /dev/null
+++ b/include/singa/io/writer.h
@@ -0,0 +1,171 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef SINGA_IO_WRITER_H_
+#define SINGA_IO_WRITER_H_
+
+#include <cstring>
+#include <fstream>
+#include <string>
+#include "singa/singa_config.h"
+
+#ifdef USE_LMDB
+#include <lmdb.h>
+#include <sys/stat.h>
+#include <vector>
+#endif  // USE_LMDB
+
+namespace singa {
+namespace io {
+
+using std::string;
+enum Mode { kCreate, kAppend };
+
+/// General Writer that provides functions for writing tuples.
+/// Subclasses implement the functions for a specific data storage, e.g., CSV
+/// file, HDFS, image folder, leveldb, lmdb, etc.
+class Writer {
+ public:
+  /// In case that users forget to call Close() to release resources, e.g.,
+  /// memory, you can release them here.
+  virtual ~Writer() {}
+
+  /// Open a file.
+  /// path is the path to the disk BinFile, it can be
+  ///  - a path to local disk file.
+  ///  - a path to local directory. This is to be compatible with the older
+  ///    version (DataShard). The KVFile is shard.dat under that directory
+  ///  - a hdfs file starting with "hdfs://"
+  /// mode is the open mode (kCreate, kAppend).
+  virtual bool Open(const std::string &path, Mode mode) = 0;
+
+  /// Release resources.
+  virtual void Close() = 0;
+
+  /// Write a key-value tuple.
+  /// return true if success, otherwise false.
+  virtual bool Write(const std::string &key, const std::string &value) = 0;
+
+  /// Flush writing buffer if it has.
+  virtual void Flush() = 0;
+};
+
+/// BinFile stores training/validation/test tuples.
+/// Each tuple is encoded as [magic_word, key_len, key, val_len, val]:
+///  - magic_word has 4 bytes; the first two are "s" and "g", the third one
+/// indicates whether key is null, the last one is reserved for future use.
+///  - key_len and val_len are of type uint32, which indicate the bytes of key
+/// and value respectively;
+///  - key_len and key are optional.
+/// When BinFile is created, it will remove the last tuple if the value size
+/// and key size do not match because the last write crashed.
+class BinFileWriter : public Writer {
+ public:
+  ~BinFileWriter() { Close(); }
+  /// \copydoc Open(const std::string &path, Mode mode)
+  bool Open(const std::string &path, Mode mode) override;
+  /// \copydoc Open(const std::string& path), user defines capacity
+  bool Open(const std::string &path, Mode mode, int capacity);
+  /// \copydoc Close()
+  void Close() override;
+  /// \copydoc Write(const std::string& key, const std::string& value) override;
+  bool Write(const std::string &key, const std::string &value) override;
+  /// \copydoc Flush()
+  void Flush() override;
+  /// return path to binary file
+  inline std::string path() { return path_; }
+
+ protected:
+  /// Open a file with path_ and initialize buf_
+  bool OpenFile();
+
+ private:
+  /// file to be written
+  std::string path_ = "";
+  Mode mode_;
+  /// ofstream
+  std::ofstream fdat_;
+  /// internal buffer
+  char *buf_ = nullptr;
+  /// allocated bytes for the buf_
+  int capacity_ = 10485760;
+  /// bytes in buf_
+  int bufsize_ = 0;
+  /// magic word
+  const char kMagicWord[2] = {'s', 'g'};
+};
+
+/// TextFileWriter write training/validation/test tuples in CSV file.
+class TextFileWriter : public Writer {
+ public:
+  ~TextFileWriter() { Close(); }
+  /// \copydoc Open(const std::string &path, Mode mode)
+  bool Open(const std::string &path, Mode mode) override;
+  /// \copydoc Close()
+  void Close() override;
+  /// \copydoc Write(const std::string& key, const std::string& value) override;
+  bool Write(const std::string &key, const std::string &value) override;
+  /// \copydoc Flush()
+  void Flush() override;
+  /// return path to text file
+  inline std::string path() { return path_; }
+
+ private:
+  /// file to be written
+  std::string path_ = "";
+  Mode mode_;
+  /// ofstream
+  std::ofstream fdat_;
+};
+
+#ifdef USE_LMDB
+/// LMDBWriter write training/validation/test tuples into LMDB.
+class LMDBWriter : public Writer {
+ public:
+  ~LMDBWriter() { Close(); }
+  /// \copydoc Open(const std::string &path, Mode mode)
+  bool Open(const std::string &path, Mode mode) override;
+  /// \copydoc Close()
+  void Close() override;
+  /// \copydoc Write(const std::string& key, const std::string& value) override;
+  bool Write(const std::string &key, const std::string &value) override;
+  /// \copydoc Flush()
+  void Flush() override;
+  /// return path to text file
+  inline std::string path() { return path_; }
+
+ protected:
+  void DoubleMapSize();
+  inline void MDB_CHECK(int mdb_status);
+
+ private:
+  /// file to be written
+  std::string path_ = "";
+  /// kCreate or kAppend
+  Mode mode_;
+  /// lmdb env variable
+  MDB_env *mdb_env_ = nullptr;
+  /// buffer for key-value pairs
+  std::vector<string> keys, values;
+};
+#endif  // USE_LMDB
+
+}  // namespace io
+}  // namespace singa
+
+#endif  // SINGA_IO_WRITER_H_
diff --git a/include/singa/model/feed_forward_net.h b/include/singa/model/feed_forward_net.h
new file mode 100644
index 0000000..1bf112c
--- /dev/null
+++ b/include/singa/model/feed_forward_net.h
@@ -0,0 +1,166 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef SINGA_MODEL_FEED_FORWARD_NET_H_
+#define SINGA_MODEL_FEED_FORWARD_NET_H_
+#include "singa/model/layer.h"
+#include "singa/model/loss.h"
+#include "singa/model/metric.h"
+#include "singa/model/updater.h"
+#include <thread>
+#include <memory>
+namespace singa {
+
+/// The feed-forward neural net.
+/// It provides functions for constructing the layers, access layer parameters,
+/// and conducting training, evaluation and prediction.
+class FeedForwardNet {
+ public:
+  FeedForwardNet() = default;
+  /// Delete all layers.
+  ~FeedForwardNet();
+
+  /// Add a layer with the assumption that
+  /// 1. this function is called in correct order, i.e., the layers are added
+  ///    following the topological order.
+  /// 2. this layer has already been setup (Setup function is called outside).
+  /// The layer will be freed in the destructor of FeedForwardNet.
+  std::shared_ptr<Layer> Add(std::shared_ptr<Layer> layer);
+
+  // TODO(wangwei) add ConcatenateLayer and SliceLayer
+  // AddConcatenateLayer(vector<Layer*> src, Layer *dst);
+  // AddSliceLayer(Layer* layer, vector<Layer*> dst);
+
+  /// Add a layer by providing its configuration, and setup it.
+  /// Assume the layer is added in correct order.
+  /// For the first layer, 'sample_shape' (the input sample shape) is necessary
+  /// for calling Setup().
+  std::shared_ptr<Layer> Add(const LayerConf& conf,
+      const Shape* sample_shape = nullptr);
+
+  /// Set some fields used for training and evaluating the neural net.
+  /// This method will instantiate an Updater, then wrap the Optimizer into
+  /// Updater and always register the parameters of the net instance.
+  /// If the neural net is constructed for evaluation only, then 'opt' is not
+  /// necessary; But for training, both 'opt' and 'loss' are necessary.
+  /// 'shuffle' indicates shuffling training samples within one epoch it is
+  /// valid using Train(). If to_register is set true, parameter will be
+  /// registered in Updater.;
+  void Compile(bool shuffle, Optimizer* opt, Loss* loss, Metric* metric);
+  /// Set some fields used for training and evaluating the neural net.
+  /// This method is mainly used in parallel training, where we need
+  /// multiple neuralnet instances.
+  /// If the neural net is constructed for evaluation only, then 'updater' is
+  /// not
+  /// necessary; But for training, both 'opt' and 'loss' are necessary.
+  /// 'shuffle' indicates shuffling training samples within one epoch it is
+  /// valid using Train(). If to_register is set true, parameter will be
+  /// registered in Updater.;
+  void Compile(bool shuffle, bool to_register, std::shared_ptr<Updater> updater,
+               Loss* loss, Metric* metric);
+
+  /// Conduct the training giving the training data 'x' and label 'y'.
+  /// 'val_split' of training data is used for
+  /// validation. Validation is performed before every epoch.
+  /// Due to memory limit, 'x' and 'y' could not be very large. Hence, it is
+  /// typically used for small training datasets, e.g., cifar10 and MNIST which
+  /// can be stored in main memory.
+  void Train(size_t batchsize, int nb_epoch, const Tensor& x, const Tensor& y,
+             float val_split = 0.0f);
+  /// Conduct the training given the training and validation data.
+  /// Validation is performed before every epoch.
+  /// Due to memory limit, 'x' and 'y' could not be very large. Hence, it is
+  /// typically used for small training datasets, e.g., cifar10 and MNIST which
+  /// can be stored in main memory.
+  void Train(size_t batchsize, int nb_epoch, const Tensor& x, const Tensor& y,
+             const Tensor& val_x, const Tensor& val_y);
+  /// Train the neural net over one batch of training data.
+  const std::pair<float, float> TrainOnBatch(int epoch, const Tensor& x,
+                                             const Tensor& y);
+
+  /// Evaluate the neural net with given data.
+  /// Returns one tensor for loss values and one tensor for metric values;
+  /// Each sample would have a loss value and a metric value (if 'metric' is set
+  /// in Compile()).'batchsize' is used for controlling the memory footprint.
+  /// It should be smaller than the total number of samples.
+  /// Due to memory limit, 'x' and 'y' could not be very large. Hence, it is
+  /// typically used for small training datasets, e.g., cifar10 and MNIST which
+  /// can be stored in main memory.
+  std::pair<Tensor, Tensor> Evaluate(const Tensor& x, const Tensor& y,
+                                     size_t batchsize = 128);
+  /// Evaluate the neural net for one batch of data
+  std::pair<Tensor, Tensor> EvaluateOnBatch(const Tensor& x, const Tensor& y);
+
+  /// Predict the probability distribution over candidate classes for each
+  /// data sample. 'batchsize' is used for controlling the memory footprint.
+  /// It should be smaller than the total number of samples.
+  /// Due to memory limit, 'x' and 'y' could not be very large. Hence, it is
+  /// typically used for small training datasets, e.g., cifar10 and MNIST which
+  /// can be stored in main memory.
+  const Tensor Predict(const Tensor& x, size_t batchsize = 128);
+  /// Predict for one batch data.
+  const Tensor PredictOnBatch(const Tensor& x);
+
+  /// Forward layers one by one using the data batch 'x'.
+  /// Returns the prediction results (from the last layer).
+  const Tensor Forward(int flag, const Tensor& x);
+  /// Backward layers one by one using the gradient batch 'grad'.
+  /// Returns the parameter gradients.
+  const vector<Tensor> Backward(int flag, const Tensor& grad);
+
+  /// Clone the neural net by cloning every layer to the given device.
+  /// If 'device' is nullptr, then clone it on the current device.
+  FeedForwardNet Clone(std::shared_ptr<Device> device);
+  /// Move the layer data to the given device.
+  void ToDevice(std::shared_ptr<Device> device);
+  void ToHost() { ToDevice(defaultDevice); }
+  /// Set the data type of each layer.
+  void AsType(DataType dtype);
+
+  /// A wrapper method to spawn a thread to execute Train() method.
+  std::thread TrainThread(size_t batchsize, int nb_epoch, const Tensor& x,
+                          const Tensor& y, const Tensor& val_x,
+                          const Tensor& val_y) {
+    return std::thread(
+        [=]() { Train(batchsize, nb_epoch, x, y, val_x, val_y); });
+  }
+
+  /// A wrapper method to spawn a thread to execute Train() method.
+  std::thread TrainThread(size_t batchsize, int nb_epoch, const Tensor& x,
+                          const Tensor& y) {
+    return std::thread([=]() { Train(batchsize, nb_epoch, x, y); });
+  }
+
+  const vector<std::shared_ptr<Layer>> layers() const { return layers_; }
+  const vector<string> GetParamNames() const;
+  const vector<ParamSpec> GetParamSpecs() const;
+  const vector<Tensor> GetParamValues() const;
+
+ protected:
+  vector<std::shared_ptr<Layer>> layers_;
+  std::shared_ptr<Updater> updater_;
+  Loss* loss_;
+  Metric* metric_;
+
+  bool shuffle_ = true;
+  Device* device_ = nullptr;
+  DataType dtype_ = kFloat32;
+};
+
+} /* singa */
+
+#endif  // SINGA_MODEL_FEED_FORWARD_NET_H_
diff --git a/include/singa/model/initializer.h b/include/singa/model/initializer.h
new file mode 100644
index 0000000..4592af5
--- /dev/null
+++ b/include/singa/model/initializer.h
@@ -0,0 +1,128 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef SINGA_MODEL_INITIALIZER_H_
+#define SINGA_MODEL_INITIALIZER_H_
+#include <string>
+#include "singa/core/tensor.h"
+#include "singa/proto/model.pb.h"
+#include "singa/utils/string.h"
+namespace singa {
+/// Base class for initializing parameter values.
+using InitializerConf = FillerConf;
+class Initializer {
+ public:
+  Initializer() = default;
+  void Setup(const std::string& str) {
+    InitializerConf conf;
+    conf.ParseFromString(str);
+    Setup(conf);
+  }
+
+  /// Set meta fields from user configurations.
+  virtual void Setup(const InitializerConf& conf) {}
+
+  virtual void Fill(Tensor& t) = 0;
+};
+
+namespace init {
+class Constant : public Initializer {
+public:
+  Constant() = default;
+  Constant(const float x) : v_(x) {}
+  void Setup(const InitializerConf& conf) override { v_ = conf.value(); }
+  void Fill(Tensor& t) override { t.SetValue(v_); }
+
+ private:
+  float v_ = 0;
+};
+
+class Uniform : public Initializer {
+public:
+  Uniform() = default;
+  Uniform(const float low, const float high) : min_(low), max_(high) {}
+  void Setup(const InitializerConf& conf) override {
+    min_ = conf.min();
+    max_ = conf.max();
+  }
+  void Fill(Tensor& t) override { singa::Uniform(min_, max_, &t); }
+
+ private:
+  float min_ = 0, max_ = 1;
+};
+
+class Gaussian : public Initializer {
+public:
+  Gaussian() = default;
+  Gaussian(const float m, const float s): mean_(m), std_(s) {}
+  void Setup(const InitializerConf& conf) override {
+    mean_ = conf.mean();
+    std_ = conf.std();
+  }
+  void Fill(Tensor& t) override { singa::Gaussian(mean_, std_, &t); }
+
+ private:
+  float mean_ = 0, std_ = 1;
+};
+
+/// Ref: [Bengio and Glorot 2010] Understanding the difficulty of training deep
+/// feedforward neural networks
+class Xavier : public Initializer {
+public:
+  void Fill(Tensor& t) override {
+    CHECK_EQ(t.nDim(), 2u);
+    float scale = sqrt(6.0f / (t.shape(0) + t.shape(1)));
+    LOG(INFO) << "xavier scale " << scale;
+    singa::Uniform(-scale, scale, &t);
+  }
+};
+
+/// Ref: [He, Zhang, Ren and Sun 2015]: Delving Deep into Rectifiers:
+/// Surpassing Human-Level Performance on ImageNet Classification
+class MSRA : public Initializer {
+ public:
+  void Fill(Tensor& t) override {
+    CHECK_EQ(t.nDim(), 2u);
+    float std = sqrt(2.0f / t.shape(0));
+    singa::Gaussian(0.0f, std, &t);
+  }
+};
+
+}  // namespace init
+
+/// TODO(wangwei) create the initializers from factory like that for Layer.
+std::shared_ptr<Initializer> CreateInitializer(const InitializerConf& conf) {
+  std::shared_ptr<Initializer> init;
+  if (ToLowerCase(conf.type()) == "constant") {
+    init = std::make_shared<init::Constant>();
+  } else if (ToLowerCase(conf.type()) == "uniform") {
+    init = std::make_shared<init::Uniform>();
+  } else if (ToLowerCase(conf.type()) == "gaussian") {
+    init = std::make_shared<init::Gaussian>();
+  } else if (ToLowerCase(conf.type()) == "xavier") {
+    init = std::make_shared<init::Xavier>();
+  } else if (ToLowerCase(conf.type()) == "msra") {
+    init = std::make_shared<init::MSRA>();
+  } else {
+    LOG(FATAL) << "Unknown initialization type: " << conf.type();
+  }
+  init->Setup(conf);
+  return init;
+}
+}  // namespace singa
+#endif  // SINGA_MODEL_INITIALIZER_H_
diff --git a/include/singa/model/layer.h b/include/singa/model/layer.h
new file mode 100644
index 0000000..e67fcc5
--- /dev/null
+++ b/include/singa/model/layer.h
@@ -0,0 +1,253 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef SINGA_MODEL_LAYER_H_
+#define SINGA_MODEL_LAYER_H_
+
+#include <vector>
+#include <string>
+#include <stack>
+#include <utility>
+#include <memory>
+#include "singa/core/tensor.h"
+#include "singa/proto/model.pb.h"
+#include "singa/utils/factory.h"
+
+namespace singa {
+
+typedef vector<size_t> Shape;
+/// The base layer class.
+/// Generally, a layer conducts feature transformation against a set of Tensor
+/// to generate a set of Tensor. Each layer may have some parameters.
+class Layer {
+ public:
+  Layer() = default;
+
+  /// Set meta data fields from a string representing a proto message.
+  /// 'in_shape' is the shape of the input feature for one sample
+  void Setup(const Shape& in_shape, const string& proto_str) {
+    LayerConf conf;
+    conf.ParseFromString(proto_str);
+    this->Setup(in_shape, conf);
+  }
+
+  /// 'in_shapes' is the shape of the input feature for one sample
+  void Setup(const vector<Shape>& in_shapes, const string& proto_str) {
+    LayerConf conf;
+    conf.ParseFromString(proto_str);
+    this->Setup(in_shapes, conf);
+  }
+
+
+  // ============= Following Functions could be override =====================
+  /// Destruct objects created by this layer.
+  virtual ~Layer() {};
+
+  /// Each layer sub-class would optionally have a type name.
+  /// Used for debugging and logging.
+  virtual const std::string layer_type() const { return "Unknown"; }
+
+  /// Set meta data fields configured in 'conf' (a proto message).
+  /// Some layers would use input tensor shapes for setting its parameter
+  /// shapes (e.g., dense layer and convolution layer). 'in_shape' provides such
+  /// shape info. It represents the shape of the Tensor (with a single sample)
+  /// from the last layer.
+  /// After calling Setup, the shape info of parameters should be accessed
+  /// correctly. Internal buffer/fields are set assuming batchsize is 1.
+  virtual void Setup(const Shape& in_sample, const LayerConf& conf) {
+    name_ = conf.name();
+    // TODO(wangwei) load param values from checkpoint files.
+  }
+
+  /// Used for layers that have multiple input tensors, e.g., concatenate layer.
+  virtual void Setup(const vector<Shape>& in_samples,
+                     const LayerConf& conf) {
+    name_ = conf.name();
+    // TODO(wangwei) load param values from checkpoint files.
+  }
+
+  /// Return the shape of the generated Tensor without the batchsize dimension
+  virtual const Shape GetOutputSampleShape() const {
+    LOG(FATAL) << "Pls override this function";
+    return vector<size_t>{};
+  }
+  /// Return the shape of the k-th generated tensor without the batchsize
+  /// dimension. Used for layers that generate multiple tensors.
+  virtual const Shape GetOutputSampleShape(int k) {
+    LOG(FATAL) << "Pls override this function";
+    return vector<size_t>{};
+  }
+
+  /// Do feature transformation for the given 'input' tensor (denoted as x).
+  /// 'flag' is either kTrain or kEval for feed-forward nets, and
+  /// would be used for other phases of training other nets. For example, when
+  /// training RBM, we may create an alias of this function as ComputeFeature
+  /// where flag could be kPositive and kNegative.
+  /// It will return a Tensor (denoted as y).
+  /// If the 'input' or 'output' is required for computing the gradients in
+  /// Backward(), then buffer them as internal data.
+  virtual const Tensor Forward(int flag, const Tensor& input) {
+    LOG(FATAL) << "Not implemented";
+    Tensor t;
+    return t;
+  }
+
+  /// \copydoc Forward(int flag, const Tensor& input)
+  /// Accept multiple input tensors and generate multiple output tensors.
+  /// If there is only one input tensor, it will call Forward(int, const
+  /// Tensor&) by default. Users can override this function for layers who
+  /// generate more than one outputs.
+  virtual const vector<Tensor> Forward(int flag, const vector<Tensor>& inputs) {
+    vector<Tensor> ret;
+    if (inputs.size() == 1) ret.push_back(Forward(flag, inputs.at(0)));
+
+    LOG(FATAL) << "Not implemented";
+    return ret;
+  }
+
+  /// Compute gradients of this layer.
+  /// Specifically, there are two types of gradients:
+  /// 1. gradient of the preceding layer, i.e., dx.
+  /// 2. gradients of parameters of this layer, e.g., dw for weight matrix.
+  /// 1 is an empty tensor if there is no preceding layer or there is no need to
+  /// compute dx (e.g., x is from a data layer); 2 is an empty vector if this
+  /// layer has no parameters.
+  /// 'flag' is either kTrain or kEval for feed-forward nets, and
+  /// would be used for other phases when training other nets.
+  /// 'grad' is a Tensor for gradient (dy) from the upper layer.
+  virtual const std::pair<Tensor, vector<Tensor>> Backward(int flag,
+                                                           const Tensor& grad) {
+    LOG(FATAL) << "Not implemented!";
+    Tensor t;
+    return std::make_pair(t, vector<Tensor>{});
+  }
+
+  /// \copydoc Backward(int, const vector<Tensor>&)
+  /// For Forward(int, const vector<Tensor>&)
+  virtual const std::pair<vector<Tensor>, vector<Tensor>> Backward(
+      int flag, const vector<Tensor>& grads) {
+    vector<Tensor> input_grad, param_grad;
+    if (grads.size() == 1u) {
+      auto ret = Backward(flag, grads.at(0));
+      input_grad.push_back(ret.first);
+      param_grad = ret.second;
+    } else {
+      LOG(FATAL) << "Not implemented";
+    }
+    return std::make_pair(input_grad, param_grad);
+  }
+
+  /// Clone the layer to the given device. Layer data (e.g., parameters) are
+  /// deep copied. If 'device' is nullptr, then clone it on the current device.
+  // virtual Layer* Clone(std::shared_ptr<Device> device);
+  /// Move the layer (including its parameters and other internal Tensor) onto
+  /// the given device
+  virtual void ToDevice(std::shared_ptr<Device> device) {
+  }
+
+  /// Set the data type of Tensor in this layer.
+  virtual void AsType(DataType dtype) {
+  }
+
+  /// Serialize the layer info (including params) into a LayerConf proto message
+  virtual void ToProto(LayerConf* conf) const {
+    //conf->set_name(name_);
+    //for (const auto& spec : param_specs_) {
+    //  ParamSpec* p = conf->add_param();
+    //  p->CopyFrom(spec);
+    //}
+    // TODO(wangwei) add param values into conf;
+  }
+
+  // ========================================================================
+
+  /// Serialize the layer info, including params_, into a string representing
+  /// a LayerParameter message.
+  std::string ToProtoStr() const {
+    LayerConf conf;
+    ToProto(&conf);
+    string str;
+    conf.SerializeToString(&str);
+    return str;
+  }
+  /// Return specs/configuration of all parameter instances of this layer.
+  /// \ref ParamSpec.
+  const vector<ParamSpec> param_specs() { return param_specs_; }
+
+  /// Return the i-th ParamSpec.
+  const ParamSpec& param_specs(size_t i) {
+    CHECK_LT(i, param_specs_.size());
+    return param_specs_.at(i);
+  }
+
+  /// Return pointers to parameter Tensor s.
+  virtual const vector<Tensor> param_values() {
+    return vector<Tensor>{};
+  }
+
+  /// Return names of all parameters.
+  const vector<string> param_names() {
+    vector<string> pname;
+    for (const auto& spec : param_specs_) pname.push_back(spec.name());
+    return pname;
+  }
+
+  /// Return the 'i'-th parameter name.
+  const string& param_name(size_t i) {
+    CHECK_LT(i, param_specs_.size());
+    return param_specs_.at(i).name();
+  }
+
+  /// Each layer instance would optionally have a name.
+  /// Used for debugging and logging.
+  const std::string name() const { return name_; }
+
+ protected:
+  std::string name_;
+  vector<ParamSpec> param_specs_;
+};
+
+/// Name should be formated as cudnn_xxx, singacpp_xxx, singacuda_xxx,
+/// singacl_xxx, where xxx is the real layer type, e.g., convolution, relu, etc.
+/// xxx should only have lower case letters.
+/// if the implementation is transparent to cpp/cuda/opencl, then register all
+/// possible identifiers. For instance, Dropout is registered three times,
+/// RegisterLayerClass("singacpp_dropout", Dropout)
+/// RegisterLayerClass("singacl_dropout", Dropout)
+/// RegisterLayerClass("singacuda_dropout", Dropout)
+/// to be compatible with previous commits, the following identifier is
+/// registered. Better avoid using it, as it would be deprecated.
+/// RegisterLayerClass("singa_dropout", Dropout)
+#define RegisterLayerClass(Name, SubLayer) \
+  static Registra<Layer, SubLayer> Name##SubLayer(#Name);
+
+inline std::shared_ptr<Layer> CreateLayer(const std::string type) {
+  std::shared_ptr<Layer> layer(Factory<Layer>::Create(type));
+  return layer;
+}
+
+inline const std::vector<std::string> GetRegisteredLayers() {
+  vector<std::string> ret;
+  for (const string type : Factory<Layer>::GetIDs()) {
+    auto layer = CreateLayer(type);
+    ret.push_back("Register type: " + type);
+  }
+  return ret;
+}
+}  // namespace singa
+#endif  // SINGA_MODEL_LAYER_H_
diff --git a/include/singa/model/loss.h b/include/singa/model/loss.h
new file mode 100644
index 0000000..4ee41cb
--- /dev/null
+++ b/include/singa/model/loss.h
@@ -0,0 +1,106 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef SINGA_MODEL_LOSS_H_
+#define SINGA_MODEL_LOSS_H_
+#include <stack>
+#include "singa/proto/model.pb.h"
+#include "singa/core/tensor.h"
+namespace singa {
+
+/// The base loss class, which declares the APIs for computing the objective
+/// score (loss) for a pair of prediction (from the model) and the target (i.e.
+/// the ground truth). It also computes the gradients of the objective w.r.t.
+/// the prediction. It has similar APIs as Layer.
+// template <typename T = Tensor>
+class Loss {
+public:
+  Loss() = default;
+  void Setup(const string &conf) {
+    LossConf loss;
+    loss.ParseFromString(conf);
+    Setup(loss);
+  }
+  virtual ~Loss() {};
+  virtual void ToDevice(std::shared_ptr<Device> device) {}
+  /// Set meta fields from user configurations.
+  virtual void Setup(const LossConf &conf) {}
+
+  /// Compute the loss values for each sample/instance given the prediction
+  /// and the target.
+  virtual Tensor Forward(int flag, const Tensor &prediction,
+                         const Tensor &target) = 0;
+
+  /// Average loss values for all samples in the mini-batch
+  /// It calls Forward() internally. The calling pattern should be
+  /// [Evaluate|Forward] Backward.
+  float Evaluate(int flag, const Tensor &prediction, const Tensor &target) {
+    Tensor loss = Forward(flag, prediction, target);
+    return Sum<float>(loss) / (1.0f * loss.Size());
+  }
+
+  /// Compute the gradients of the loss values w.r.t. the prediction.
+  virtual Tensor Backward() = 0;
+};
+
+// ============= Mean Squared Error ===========================================
+/// MSE is for mean squared error or squared euclidean distance.
+class MSE : public Loss {
+ public:
+  /// Compute the loss values for each sample/instance given the prediction
+  /// and the target, which is 0.5/||prediction-target||^2
+  /// Users can call Average(const Tensor&) to get the average
+  /// loss value over all samples in the batch.
+  Tensor Forward(int flag, const Tensor& prediction, const Tensor& target) override;
+
+  /// Compute the gradients of the loss values w.r.t. the prediction,
+  /// which is (prediction-target)/batchsize
+  Tensor Backward() override;
+
+ private:
+  // to buffer intermediate data, i.e., prediction-target
+  std::stack<Tensor> buf_;
+};
+
+
+// ===============Softamx Cross Entropy =======================================
+/// Softmax + cross entropy for multi-category classification
+class SoftmaxCrossEntropy : public Loss {
+ public:
+  /// Compute the loss values for each sample/instance given the prediction
+  /// and the target, which is -log(p[idx_truth]), idx_truth is the truth
+  /// category's index and p[] is the probability for each category, computed
+  /// from Softmax(prediction).
+  /// Users can call Average(const Tensor&) to get the average
+  /// loss value over all samples in the batch.
+  Tensor Forward(int flag, const Tensor& prediction, const Tensor& target) override;
+
+  /// Compute the gradients of the loss values w.r.t. the prediction,
+  /// which is: p[idx] - 1 if idx is the truth category's index; else,
+  /// p[idx]
+  Tensor Backward() override;
+
+ private:
+  // to buffer intermediate data, i.e., probability for each category and
+  // the target (ground truth)
+  std::stack<Tensor> buf_;
+};
+
+}  // namespace singa
+
+#endif  // SINGA_MODEL_LOSS_H_
diff --git a/include/singa/model/metric.h b/include/singa/model/metric.h
new file mode 100644
index 0000000..ad7f717
--- /dev/null
+++ b/include/singa/model/metric.h
@@ -0,0 +1,81 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef SINGA_MODEL_METRIC_H_
+#define SINGA_MODEL_METRIC_H_
+#include "singa/core/tensor.h"
+#include "singa/proto/model.pb.h"
+namespace singa {
+
+/// The base metric class, which declares the APIs for computing the performance
+/// evaluation metrics given the prediction of the model and the ground truth,
+/// i.e., the target.
+/// The target type is a template argument.  For data samples with a single
+/// label, T could be 1-d tensor (or vector<int>); If each data sample has
+/// multiple labels, T could be vector<vector<int>>, one vector per sample.
+// template <typename T = Tensor>
+class Metric {
+ public:
+  // TODO(wangwei) call Setup using a default MetricConf.
+  Metric() = default;
+  virtual ~Metric() {}
+  virtual void ToDevice(std::shared_ptr<Device> device) {}
+  void Setup(const string& conf) {
+    MetricConf metric;
+    metric.ParseFromString(conf);
+    Setup(metric);
+  }
+
+  /// Set meta fields from user configurations.
+  virtual void Setup(const MetricConf& conf) {}
+
+  /// Compute the metric for each data sample
+  virtual Tensor Forward(const Tensor& prediction, const Tensor& target) = 0;
+
+  /// Compute the metric value averaged over all samples (in a batch)
+  float Evaluate(const Tensor& prediction, const Tensor& target) {
+    const Tensor metric = Forward(prediction, target);
+    return Sum<float>(metric) / (1.0f * metric.Size());
+  }
+};
+/// Compute the accuracy of the prediction, which is matched against the
+/// ground truth labels.
+/// TODO(wangwei) consider multi-label cases.
+class Accuracy : public Metric {
+ public:
+  /// Set meta fields from user configurations.
+  void Setup(const MetricConf& conf) override { top_k_ = conf.top_k(); }
+
+  /// Check the prediction against the target (ground truth) for each data
+  /// sample. The returned Tensor has a float value for each sample, 0 for wrong
+  /// and 1 for correct. Users can call Sum(const Tensor&) / Tensor::Size() to
+  /// get the accuracy.
+  Tensor Forward(const Tensor& prediction, const Tensor& target);
+
+ private:
+  /// \copydoc Match(const Tensor&, const Tensor&);
+  Tensor Match(const Tensor& prediction, const vector<int>& target);
+  /// If the ground truth label is in the top k predicted labels, then the
+  /// prediction is correct.
+  size_t top_k_ = 1;
+};
+
+
+}  // namespace singa
+
+#endif  // SINGA_MODEL_METRIC_H_
diff --git a/include/singa/model/optimizer.h b/include/singa/model/optimizer.h
new file mode 100644
index 0000000..e6e6d1c
--- /dev/null
+++ b/include/singa/model/optimizer.h
@@ -0,0 +1,302 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef SINGA_MODEL_OPTIMIZER_H_
+#define SINGA_MODEL_OPTIMIZER_H_
+
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+#include "singa/core/tensor.h"
+#include "singa/proto/model.pb.h"
+
+using std::string;
+using std::vector;
+using std::unordered_map;
+namespace singa {
+class Constraint;
+class Regularizer;
+/// The base class for gradient descent algorithms used to update the model
+/// parameters in order to optimize the objective (loss) function.
+/// It updates parameters based on the gradients of the loss w.r.t each
+/// parameter. Most sub-classes uses first order gradients.
+/// An overview of gradient descent algorithms,
+/// http://sebastianruder.com/optimizing-gradient-descent/
+class Optimizer {
+ public:
+  Optimizer() = default;
+  virtual ~Optimizer();
+  /// Setup the optimizer using configurations from serialized string (for
+  /// binding languages).
+  void Setup(const string& str) {
+    OptimizerConf conf;
+    conf.ParseFromString(str);
+    this->Setup(conf);
+  }
+
+  /// Setup the meta fields of the optimizer
+  virtual void Setup(const OptimizerConf& conf);
+  /// Register the parameter, e.g., create Constraint and Regularizers.
+  /// If there is no constraint or regularizer, then no need to register the
+  /// parameter.
+  virtual void Register(const string& name, const ParamSpec& specs);
+
+  /// Apply the updating algorithm.
+  /// No learning rate scaling, gradient constraints/regularization will be
+  /// conducted. It assumes all these operations are done either by users or
+  /// by Apply(int, const string&, Tensor*, Tensor*).
+  /// All sub-classes should override this function.
+  virtual void Apply(int step, float lr, const string& name, const Tensor& grad,
+                     Tensor& value) = 0;
+
+  /// Apply the updating algorithm.
+  /// It will apply regularization and constraint to the parameters if
+  /// configured during Register(). If will also scale the learning rate if
+  /// configured in ParamSpecs (see Register).
+  void Apply(int step, const string& name, Tensor& grad, Tensor& value);
+
+  /// The argument is a function that returns the learning rate given the
+  /// current step (i.e., current running iteration).
+  void SetLearningRateGenerator(function<float(int)> func) {
+    learning_rate_generator_ = func;
+  }
+  float GetLearningRate(int step) {
+    if (learning_rate_generator_)
+      return learning_rate_generator_(step);
+    else
+      return 0;
+  }
+
+ protected:
+  function<float(int)> learning_rate_generator_;
+  std::unordered_map<std::string, float> learning_rate_multplier_;
+  std::unordered_map<std::string, float> weight_decay_multplier_;
+  std::unordered_map<std::string, Constraint*> constraints_;
+  std::unordered_map<std::string, Regularizer*> regularizers_;
+  Constraint* constraint_ = nullptr;
+  Regularizer* regularizer_ = nullptr;
+};
+
+/// Apply constraints for parameters (gradient).
+/// E.g., restrict the norm of parameter gradients to be within a threshold.
+/// \ref http://keras.io/constraints/
+/// TODO(wangwei) implement a sub-class for each type of constraint
+class Constraint {
+ public:
+  Constraint() = default;
+  explicit Constraint(const ConstraintConf& conf) { Setup(conf); }
+  Constraint(const string& type, float threshold)
+      : type_(type), threshold_(threshold) {}
+  void Setup(const ConstraintConf& conf);
+  void Setup(const string& conf_str) {
+    ConstraintConf conf;
+    conf.ParseFromString(conf_str);
+    Setup(conf);
+  }
+  /// Apply the constraint to a single parameter object, e.g., W, or b
+  /// e.g., clip each gradient if it is too large w.r.t the threshold,
+  /// \ref
+  /// https://www.reddit.com/r/MachineLearning/comments/31b6x8/gradient_clipping_rnns/
+  void Apply(int step, Tensor& grad, Tensor& value);
+  /// Apply the constraint for multiple parameter objects together.
+  /// \ref https://github.com/Lasagne/Lasagne/blob/master/lasagne/updates.py
+  void Apply(int step, const vector<Tensor>& grads,
+             const vector<Tensor>& values);
+
+ private:
+  /// currently only support "L2" norm constraint, i.e., the norm should be less
+  /// than the configured threshold_, otherwise, the parameters would be clipped
+  /// to make the norm within that threshold.
+  /// TODO(wangwei) consider other constraint, e.g., hard clip and unitnorm.
+  string type_ = "Unknown";
+  float threshold_;
+};
+
+inline std::shared_ptr<Constraint> CreateConstraint(std::string type) {
+  return std::make_shared<Constraint>();
+}
+/// Apply regularization for parameters (gradient), e.g., L1 norm and L2 norm.
+/// TODO(wangwei) implement a sub-class for each type of regularizer
+class Regularizer {
+ public:
+  Regularizer() = default;
+  explicit Regularizer(const RegularizerConf& conf) { Setup(conf); }
+  Regularizer(const string& type, float coefficient)
+      : type_(type), coefficient_(coefficient) {}
+  void Setup(const RegularizerConf& conf);
+  void Setup(const string& conf_str) {
+    RegularizerConf conf;
+    conf.ParseFromString(conf_str);
+    Setup(conf);
+  }
+
+  /// Apply the regularizer to a single parameter object, e.g., W, or b
+  /// e.g., clip each gradient if it is too large w.r.t the threshold,
+  /// \ref
+  /// https://www.reddit.com/r/MachineLearning/comments/31b6x8/gradient_clipping_rnns/
+  void Apply(int step, Tensor& grad, Tensor& value, float scale = 1.0f);
+  /// Apply the regularizer for multiple parameter objects together.
+  /// \ref https://github.com/Lasagne/Lasagne/blob/master/lasagne/updates.py
+  void Apply(int step, const vector<Tensor>& grads,
+             const vector<Tensor>& values);
+
+ private:
+  /// currently only support "L2" regularizer. type_ is case insensitive.
+  /// TODO(wangwei) add more regularizer, e.g., L1.
+  string type_ = "NotSet";
+  float coefficient_;
+};
+inline std::shared_ptr<Regularizer> CreateRegularizer(std::string type) {
+  return std::make_shared<Regularizer>();
+}
+
+
+
+// =============Vanilla SGD with Momentum=====================================
+class SGD : public Optimizer {
+ public:
+  void Setup(const OptimizerConf& conf);
+  /// Apply the updating algorithm.
+  void Apply(int step, float lr, const string& name, const Tensor& grad,
+             Tensor& value) override;
+
+  /// The argument function returns the momentum value given the current running
+  /// step (i.e., iterations/mini-batches).
+  void SetMomentumGenerator(std::function<float(int)> func) {
+    momentum_generator_ = func;
+  }
+
+ private:
+  std::unordered_map<string, Tensor> history_gradient_;
+  std::function<float(int)> momentum_generator_;
+};
+
+// =============Nesterov======================================================
+class Nesterov : public Optimizer {
+ public:
+  void Setup(const OptimizerConf& conf);
+  /// Apply the updating algorithm.
+  void Apply(int step, float lr, const string& name, const Tensor& grad,
+             Tensor& value) override;
+
+  /// The argument function returns the momentum value given the current running
+  /// step (i.e., iterations/mini-batches).
+  void SetMomentumGenerator(std::function<float(int)> func) {
+    momentum_generator_ = func;
+  }
+
+ private:
+  std::unordered_map<string, Tensor> history_gradient_;
+  std::function<float(int)> momentum_generator_;
+};
+
+// =============Adagrad=======================================================
+class AdaGrad : public Optimizer {
+ public:
+  void Setup(const OptimizerConf& conf);
+  /// Apply the updating algorithm.
+  void Apply(int step, float lr, const string& name, const Tensor& grad,
+             Tensor& value) override;
+
+ private:
+  std::unordered_map<string, Tensor> history_gradient_;
+  float delta_;
+};
+// =============RMSProp=======================================================
+class RMSProp : public Optimizer {
+ public:
+  void Setup(const OptimizerConf& conf);
+  /// Apply the updating algorithm.
+  void Apply(int step, float lr, const string& name, const Tensor& grad,
+             Tensor& value) override;
+  virtual ~RMSProp() = default;
+
+ private:
+  std::unordered_map<string, Tensor> history_gradient_;
+  float delta_, rho_;
+};
+
+
+inline std::shared_ptr<Optimizer> CreateOptimizer(const string& type) {
+  std::shared_ptr<Optimizer>  opt;
+  if (type == "SGD")
+    opt = std::shared_ptr<Optimizer>(new SGD());
+  else if (type == "RMSProp")
+    opt = std::shared_ptr<Optimizer>(new RMSProp());
+  else if (type == "AdaGrad")
+    opt = std::shared_ptr<Optimizer>(new AdaGrad());
+  else if (type == "Nesterov")
+    opt = std::shared_ptr<Optimizer>(new Nesterov());
+  else
+    LOG(FATAL) << "Unknown optimizer type : " << type;
+  return opt;
+}
+// ============LocalAllReduce for single node multiple workers ==============
+/// Updater for training models on a single node with multiple devices (workers)
+/// All model parameters are partitioned such that each parameter is updated on
+/// one device. In specific, each worker has a model replica. All workers share
+/// the same LocalAllReduce instance. Parameters are registered at first, and
+/// then after every iteration, the gradients are aggregated by one worker (or
+/// device) for parameter updating.
+/*
+class LocalAllReduce : public Optimizer{
+ public:
+  LocalAllReduce(Optimizer* opt);
+  void Setup(const string& str) {
+    AllReduce conf;
+    conf.ParseFromString(str);
+    this->Setup(conf);
+  }
+  void Setup(const AllReduce& conf) {}
+
+  /// Register all model parameters.
+  /// Instructions include:
+  /// 1. Copy parameters from the master worker (who initialized the parameters)
+  /// to others.
+  /// 2. Partition parameters onto worker devices. For example, model parameter
+  /// set is {A, B, C}, nb_workers = 3, then worker 0/1/2 would be in charge of
+  /// updating A/B/C respectively. A gradient Tensor for A/B/C would be created
+  /// on device 0/1/2, denoted as GA/GB/GC. 0/1/2 would call the internal opt to
+register the specs
+  /// for A/B/C.
+  void Register(const vector<string>& names,
+                const vector<Tensor>& values,
+                const vector<ParamSpecs>& specs) override;
+
+  /// Aggregate parameter gradients and call internal opt to do the update.
+  /// Continue with the example for Register(), worker 0 would copy B's gradient
+  /// to device 1 and add it with GB.  A callback func is added to
+  /// 1. check UpdateNow() and call opt to do the real update.
+  /// 2. broadcast the new parameters back to worker 0 and 2.
+  void Update(int step, float lr, const string& name, const Tensor& grad,
+              Tensor* param) override;
+
+  /// Decide when to call the internal Optimizer for real update.
+  /// One simple implementation would return true until all workers has
+  /// aggregated their gradients. We can also add a user configuration field
+  /// to control this, e.g., if do it when 80% workers has aggregated.
+  bool UpdateNow();
+
+ private:
+  int nb_workers_;
+  vector<Tensor> aggregated_gradients_;
+};
+*/
+}
+#endif  // SINGA_MODEL_OPTIMIZER_H_
diff --git a/include/singa/model/updater.h b/include/singa/model/updater.h
new file mode 100644
index 0000000..e0a656c
--- /dev/null
+++ b/include/singa/model/updater.h
@@ -0,0 +1,97 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef SINGA_MODEL_UPDATER_H_
+#define SINGA_MODEL_UPDATER_H_
+
+#include "singa/model/optimizer.h"
+#include "singa/core/device.h"
+#include "singa/core/tensor.h"
+#include "singa/utils/logging.h"
+
+#include <memory>
+#include <vector>
+#include <mutex>
+#include <condition_variable>
+#include <string>
+#include <utility>
+#include <unordered_map>
+#include <atomic>
+
+namespace singa {
+/// Basic Updater class just forward all the method function call
+/// to the wrapped Optimizer.
+class Updater {
+ public:
+  explicit Updater(Optimizer* opt) : opt_{opt} {}
+  virtual ~Updater() {}
+  /// Forward Setup() to Optimizer.
+  virtual void Setup(const OptimizerConf& conf);
+  /// Forward Register() to Optimizer.
+  virtual void Register(const string& name, const ParamSpec& specs);
+  /// Forward Apply() to Optimizer.
+  virtual void Apply(int step, const string& name, Tensor& grad, Tensor& value);
+  Optimizer* GetOptimizer() { return opt_; }
+
+  // No copy allowed.
+  Updater(const Updater&) = delete;
+  void operator=(const Updater&) = delete;
+
+ protected:
+  Optimizer* opt_;
+};
+
+/// LocalUpdater do gradient aggregation and update gradient calling
+/// the wrapped Optimizer on a specific device (i.e., CPU or GPU).
+class LocalUpdater : public Updater {
+ public:
+  LocalUpdater(int total_num, Optimizer* opt,
+               std::shared_ptr<Device> dev = defaultDevice)
+      : Updater(opt), total_num_{total_num}, dev_(dev) {}
+  virtual ~LocalUpdater() override {}
+  /// Forward Register() to Optimizer.
+  virtual void Register(const string& name, const ParamSpec& specs) override;
+  /// Update parameter value based on given gradient by invoking optimizer
+  /// algorithm. When training net call this function will be blocked until
+  /// all the partial gradients are aggregated in a synchronized style training.
+  virtual void Apply(int step, const string& name, Tensor& grad,
+                     Tensor& value) override;
+ private:
+  template <typename T1, typename T2>
+  struct key_hasher {
+    size_t operator() (const std::pair<T1, T2>& p) const {
+      auto h1 = std::hash<T1>{}(p.first);
+      auto h2 = std::hash<T2>{}(p.second);
+      return h1 ^ h2;
+    }
+  };
+
+  int total_num_;
+  std::shared_ptr<Device> dev_;
+  std::unordered_map<std::string, std::atomic<int>> dev_index_;
+  std::unordered_map<std::string, int> to_updater_finished_;
+  std::unordered_map<std::pair<int, std::string>, Tensor,
+    key_hasher<int, std::string>> grad_buffer_;
+  std::unordered_map<std::string, Tensor> sum_, param_buffer_;
+  std::unordered_map<std::string, std::mutex> mtx_;
+  std::unordered_map<std::string, std::condition_variable>
+    to_updater_all_finished_;
+};
+}  //  namespace singa
+
+#endif  //  SINGA_MODEL_UPDATER_H_
diff --git a/include/singa/neuralnet/connection_layer.h b/include/singa/neuralnet/connection_layer.h
deleted file mode 100644
index 481d991..0000000
--- a/include/singa/neuralnet/connection_layer.h
+++ /dev/null
@@ -1,187 +0,0 @@
-/************************************************************
-*
-* Licensed to the Apache Software Foundation (ASF) under one
-* or more contributor license agreements.  See the NOTICE file
-* distributed with this work for additional information
-* regarding copyright ownership.  The ASF licenses this file
-* to you under the Apache License, Version 2.0 (the
-* "License"); you may not use this file except in compliance
-* with the License.  You may obtain a copy of the License at
-*
-*   http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an
-* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-* KIND, either express or implied.  See the License for the
-* specific language governing permissions and limitations
-* under the License.
-*
-*************************************************************/
-
-#ifndef SINGA_NEURALNET_CONNECTION_LAYER_H_
-#define SINGA_NEURALNET_CONNECTION_LAYER_H_
-
-#include <string>
-#include <unordered_map>
-#include <vector>
-#include "singa/comm/socket.h"
-#include "singa/neuralnet/layer.h"
-
-namespace singa {
-/**
- * Used inside SplitLayer and SliceLayer to locate the out-going connection
- * index given the Layer pointer.
- */
-class Layer2Index {
- public:
-  int Get(const Layer* layer) {
-    if (layer2idx_.find(layer) == layer2idx_.end()) {
-      int idx =  layer2idx_.size();
-      layer2idx_[layer] = idx;
-    }
-    return layer2idx_[layer];
-  }
-
- private:
-  std::unordered_map<const Layer*, int> layer2idx_;
-};
-
-
-class BridgeLayer : public ConnectionLayer {
- public:
-  void set_ready(bool a) { ready_ = a; }
-  bool ready() const { return ready_; }
-  // Bind the layer with dealer instance by worker at runtime
-  void MakePaired(Layer* pair, int grp_id, Dealer* dealer,
-                  std::unordered_map<std::string, Layer*>* name2bridge);
-  // Send blobs to other workers due to model partitions
-  void SendBlobs(bool handle_data);
-  // Receive blobs from other workers due to model partitions;
-  void ReceiveBlobs(bool handle_data);
-
- protected:
-  //!< true if received grad from BridgeDstLayer
-  bool ready_ = false;
-  int group_id_ = 0;
-  Layer* pair_ = nullptr;
-  Dealer* dealer_ = nullptr;
-  std::unordered_map<std::string, Layer*>* name2bridge_ = nullptr;
-};
-
-/**
- * For sending data to layer on other threads which may resident on other nodes
- * due to layer/data partition.
- */
-class BridgeSrcLayer : public BridgeLayer {
- public:
-  void Setup(const LayerProto& conf, const vector<Layer*>& srclayers) override;
-  void ComputeFeature(int flag, const vector<Layer*>& srclayers) override;
-  void ComputeGradient(int flag, const vector<Layer*>& srclayers) override;
-};
-
-/**
- * For recv data from layer on other threads which may resident on other nodes
- * due to layer/data partiton
- */
-class BridgeDstLayer : public BridgeLayer {
- public:
-  void Setup(const LayerProto& conf, const vector<Layer*>& srclayers) override;
-  void ComputeFeature(int flag, const vector<Layer*>& srclayers) override;
-  void ComputeGradient(int flag, const vector<Layer*>& srclayers) override;
-};
-/**
- * Connect multiple (src) layers with a single (dst) layer.
- *
- * It concates feature Blobs (i.e., matrix) of src layers on one dimension.
- * The concated feature Blob will be fed into the dst layer.
- */
-class ConcateLayer : public ConnectionLayer {
- public:
-  void Setup(const LayerProto& proto, const vector<Layer*>& srclayers) override;
-  void ComputeFeature(int flag, const vector<Layer*>& srclayers) override;
-  void ComputeGradient(int flag, const vector<Layer*>& srclayers) override;
-
- private:
-  int num_concates_ = 0;
-  int concate_dim_ = 0;
-};
-
-/**
- * Connect a single (src) layer with multiple (dst) layers.
- *
- * It slices the feature Blob (i.e., matrix) of the src layer on one dimension.
- * The sliced feature Blobs will be fed into dst layers.
- */
-class SliceLayer : public ConnectionLayer {
- public:
-  ~SliceLayer();
-  void Setup(const LayerProto& proto, const vector<Layer*>& srclayers) override;
-  void ComputeFeature(int flag, const vector<Layer*>& srclayers) override;
-  void ComputeGradient(int flag, const vector<Layer*>& srclayers) override;
-  const std::string ToString(bool debug, int flag) override;
-  const Blob<float>& data(const Layer* from) override;
-  const Blob<float>& grad(const Layer* from) override;
-  Blob<float>* mutable_data(const Layer* from) override;
-  Blob<float>* mutable_grad(const Layer* from) override;
-
- private:
-  int num_slices_ = 0;
-  int slice_dim_ = 0;
-  Layer2Index layer_idx_;
-};
-
-/**
- * Connect a single (src) layer with multiple dst layers.
- *
- * It replicates the feature Blob of the src layer.
- * Each replicated feature Blob will be fed into one dst layer.
- * It aggregates gradients set by all dst layers and set it to the src layer.
- */
-class SplitLayer : public ConnectionLayer {
- public:
-  ~SplitLayer();
-  void Setup(const LayerProto& proto, const vector<Layer*>& srclayers) override;
-  void ComputeFeature(int flag, const vector<Layer*>& srclayers) override;
-  void ComputeGradient(int flag, const vector<Layer*>& srclayers) override;
-  const std::string ToString(bool debug, int flag) override;
-  const Blob<float>& grad(const Layer* from) override;
-  Blob<float>* mutable_grad(const Layer* from) override;
-
- private:
-  int num_splits_ = 0;
-  Layer2Index layer_idx_;
-};
-
-/**
- * Dummy layer for RNN models, which provides input for other layers.
- *
- * Particularly, it is used in the test phase of RNN models to connect other
- * layers and avoid cycles in the neural net config.
- */
-class RNNDummyLayer : public ConnectionLayer {
- public:
-  void Setup(const LayerProto& proto, const vector<Layer*>& srclayers) override;
-  void ComputeFeature(int flag, const vector<Layer*>& srclayers) override;
-  void ComputeGradient(int flag, const vector<Layer*>& srclayers) {
-    LOG(FATAL) << "Not implemented";
-  }
-
-  const string srclayer(int step) const {
-    if (step > 0)
-      return dynamic_src_;
-    else
-      return "";
-  }
-
- private:
-  string dynamic_src_;
-  float low_, high_;
-  bool integer_;
-  Layer* srclayer_;
-};
-
-
-}  // namespace singa
-
-#endif  // SINGA_NEURALNET_CONNECTION_LAYER_H_
diff --git a/include/singa/neuralnet/input_layer.h b/include/singa/neuralnet/input_layer.h
deleted file mode 100644
index 0499c4b..0000000
--- a/include/singa/neuralnet/input_layer.h
+++ /dev/null
@@ -1,336 +0,0 @@
-/************************************************************
-*
-* Licensed to the Apache Software Foundation (ASF) under one
-* or more contributor license agreements.  See the NOTICE file
-* distributed with this work for additional information
-* regarding copyright ownership.  The ASF licenses this file
-* to you under the Apache License, Version 2.0 (the
-* "License"); you may not use this file except in compliance
-* with the License.  You may obtain a copy of the License at
-*
-*   http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an
-* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-* KIND, either express or implied.  See the License for the
-* specific language governing permissions and limitations
-* under the License.
-*
-*************************************************************/
-
-#ifndef SINGA_NEURALNET_INPUT_LAYER_H_
-#define SINGA_NEURALNET_INPUT_LAYER_H_
-
-#include <string>
-#include <vector>
-#include <thread>
-#include "singa/io/store.h"
-#include "singa/io/kvfile.h"
-#include "singa/neuralnet/layer.h"
-
-namespace singa {
-
-/**
- * Base class for loading data from Store.
- */
-class StoreInputLayer : virtual public InputLayer {
- public:
-  ~StoreInputLayer();
-  void Setup(const LayerProto& proto, const vector<Layer*>& srclayers) override;
-  void ComputeFeature(int flag, const vector<Layer*>& srclayers) override;
-
- protected:
-  /**
-   * Helper method for doing the prefetching, basically read (key,value) pairs
-   * to buf_keys and buf_vals_ vector of size batchsize_.
-   */
-  void fetch_data();
-  /**
-   * Parsing the (key, val) tuple to get feature (and label).
-   * Subclasses must implment this function.
-   * @param[in] k parse this tuple as the k-th instance of one mini-batch.
-   * @param[in] flag used to guide the parsing, e.g., kDeploy phase should not
-   * parse labels from the tuple.
-   * @param[in] key
-   * @param[in] val
-   */
-  virtual bool Parse(int k, int flag, const string& key, const string& val) = 0;
-
- protected:
-  int batchsize_ = 1;
-  int random_skip_ = 0;
-  io::Store* store_ = nullptr;
-  vector<std::string> buf_keys_, buf_vals_;
-  std::thread *thread_ = nullptr;  // prefetching thread
-};
-
-/**
- * Base layer for parsing a key-value tuple as a feature vector with fixed
- * length. The feature shape is indicated by users in the configuration.
- * Each tuple may has a label.
- */
-class SingleLabelRecordLayer : public StoreInputLayer {
- public:
-  void Setup(const LayerProto& proto, const vector<Layer*>& srclayers) override;
-  void ComputeFeature(int flag, const vector<Layer*>& srclayers) override;
-
- protected:
-  /**
-   * Load a single record (tuple), e.g., the mean or standard variance vector.
-   */
-  virtual void LoadRecord(const string& backend, const string& path,
-      Blob<float>* to) = 0;
-
- protected:
-  /**
-   * Feature standardization by processing each feature dimension via
-   * @f$ y = (x - mu)/ std @f$
-   * <a href= "http://ufldl.stanford.edu/wiki/index.php/Data_Preprocessing">
-   * UFLDL</a>
-   */
-  Blob<float> mean_, std_;
-};
-/**
- * Specific layer that parses the value string loaded by Store as a line from
- * a CSV file.
- *
- * It assumes the first column is the label except that has_label_ is configured
- * to false. Or the data is used in deploy mode.
- */
-class CSVInputLayer : public SingleLabelRecordLayer {
- public:
-  void Setup(const LayerProto& proto, const vector<Layer*>& srclayers) override;
-
- protected:
-  bool Parse(int k, int flag, const string& key, const string& val) override;
-  void LoadRecord(const string& backend,
-                  const string& path,
-                  Blob<float>* to) override;
-
- private:
-  std::string sep_;
-  bool has_label_;
-};
-
-
-/**
- * Specific layer that parses the value string loaded by Store into a
- * RecordProto.
- */
-class RecordInputLayer : public SingleLabelRecordLayer {
- public:
-  void Setup(const LayerProto& proto, const vector<Layer*>& srclayers) override;
-
- protected:
-  /**
-   * Parse key as instance ID and val into RecordProto.
-   * @copydetails StoreInputLayer::Parse()
-   */
-  bool Parse(int k, int flag, const string& key, const string& val) override;
-  void LoadRecord(const string& backend,
-                  const string& path,
-                  Blob<float>* to) override;
-
- private:
-  // TODO(wangwei) decode the image
-  bool encoded_;
-};
-
-/**
- * Do preprocessing for images, including cropping, mirroring, resizing.
- */
-class ImagePreprocessLayer : public InputLayer {
- public:
-  void Setup(const LayerProto& proto, const vector<Layer*>& srclayers) override;
-  void ComputeFeature(int flag, const vector<Layer*>& srclayers);
-
- private:
-  bool mirror_ = false;
-  int cropsize_ = 0;
-  int resize_ = 0;
-  float scale_ = 1;
-};
-
-class OneHotLayer : public InputLayer {
- public:
-  void Setup(const LayerProto& proto, const vector<Layer*>& srclayers) override;
-  void ComputeFeature(int flag, const vector<Layer*>& srclayers);
-
- private:
-  int batchsize_, dim_;
-};
-
-/**
- *  * Read the ASCII file as a large string used for RNN model where each character
- *   * is a single input to the unrolled RNN layer.
- *    * max string length is string::max_size();
- *     */
-class CharRNNInputLayer : public InputLayer {
- public:
-  void Setup(const LayerProto& proto, const vector<Layer*>& srclayers) override;
-  void ComputeFeature(int flag, const vector<Layer*>& srclayers);
-
- private:
-  int batchsize_ = 0, unroll_len_ = 1;
-  unsigned offset_ = 0;
-  string path_, vocab_path_;
-  string buf_;
-  vector<int> start_;
-  std::unordered_map<char, int> char2index_;
-};
-
-/**
- * Label layer for fetching labels from the src input layer for RNN models.
- * The i-th unrolled layer fetch label from the input layer via data(i+1).
- * Particularly, it shares data_ Blob with data(i+1) of its src layer.
- */
-class RNNLabelLayer : public InputLayer {
- public:
-  void Setup(const LayerProto& proto, const vector<Layer*>& srclayers);
-  void ComputeFeature(int flag, const vector<Layer*>& srclayers);
-};
-
-
-/****************Deprecated layers******************/
-/**
- * @deprecated please use the StoreInputLayer.
- *
- * Base layer for reading ::Record  from local Shard, HDFS, lmdb, etc.
- */
-class DataLayer: virtual public InputLayer {
- public:
-  Blob<float>* mutable_data(const Layer* layer) override { return nullptr; }
-  ConnectionType dst_layer_connection() const override {
-    return kOneToMany;
-  }
-
-  inline int batchsize() const { return batchsize_; }
-  virtual const Record& sample() const {
-    return sample_;
-  }
-  /**
-   * @return the loaded records
-   */
-  virtual const std::vector<Record>& records() const {
-    return records_;
-  }
-
- protected:
-  int random_skip_;
-  int batchsize_;
-  Record sample_;
-  std::vector<Record> records_;
-};
-/**
- * @deprecated Please use the subclasses of StoreInputLayer.
- *
- * Layer for loading Record from DataShard.
- */
-class ShardDataLayer : public DataLayer {
- public:
-  ~ShardDataLayer();
-
-  void Setup(const LayerProto& proto, const vector<Layer*>& srclayers) override;
-  void ComputeFeature(int flag, const vector<Layer*>& srclayers) override;
-
- private:
-  DataShard* shard_;
-};
-/**
- * @deprecated please use the subclasses of StoreInputLayer.
- *
- * Layer for loading Record from LMDB.
- */
-#ifdef USE_LMDB
-#include <lmdb.h>
-class LMDBDataLayer : public DataLayer {
- public:
-  ~LMDBDataLayer();
-
-  void Setup(const LayerProto& proto, const vector<Layer*>& srclayers) override;
-  void OpenLMDB(const std::string& path);
-  void ComputeFeature(int flag, const vector<Layer*>& srclayers) override;
-  void ConvertCaffeDatumToRecord(const CaffeDatum& datum,
-                                 SingleLabelImageRecord* record);
-
- private:
-  MDB_env* mdb_env_;
-  MDB_dbi mdb_dbi_;
-  MDB_txn* mdb_txn_;
-  MDB_cursor* mdb_cursor_;
-  MDB_val mdb_key_, mdb_value_;
-};
-#endif
-
-/******************Parser layers***************/
-/**
- * @deprecated Please use the subclasses of StoreInputLayer which load and parse
- * data in a single layer.
- *
- * Base layer for parsing the input records into Blobs.
- */
-class ParserLayer : public InputLayer {
- public:
-  void ComputeFeature(int flag, const vector<Layer*>& srclayers) override;
-  void ComputeGradient(int flag, const vector<Layer*>& srclayers) override {}
-  ConnectionType dst_layer_connection() const override {
-    return kOneToMany;
-  }
-  /**
-   * Parse records from DataLayer into blob.
-   */
-  virtual void ParseRecords(int flag, const std::vector<Record>& records,
-      Blob<float>* blob) = 0;
-};
-/**
- *
- * @deprecated Please use the SingleLabelRecordLayer which parses both feature
- * and label for each record. Its aux_data() function returns the parsed labels.
- *
- * Derived from ParserLayer to parse label in SingaleLabelImageRecord loaded by
- * ShardDataLayer.
- */
-class LabelLayer : public ParserLayer {
- public:
-  void Setup(const LayerProto& proto, const vector<Layer*>& srclayers) override;
-  void ParseRecords(int flag, const std::vector<Record>& records,
-                    Blob<float>* blob) override;
-};
-
-/**
- * @deprecated Please use the subclasses of StoreInputLayer.
- *
- * Derived from ParserLayer to parse MNIST feature from SingaleLabelImageRecord.
- */
-class MnistLayer : public ParserLayer {
- public:
-  void Setup(const LayerProto& proto, const vector<Layer*>& srclayers) override;
-  void ParseRecords(int flag, const std::vector<Record>& records,
-                    Blob<float>* blob) override;
-
- protected:
-  float norm_a_, norm_b_;
-};
-/**
- * @deprecated please use the ImagePreprocessLayer which preprocess image
- * feature from data Blob of source layers.
- *
- * Derived from ParserLayer to parse RGB image feature from
- * SingaleLabelImageRecord.
- */
-class RGBImageLayer : public ParserLayer {
- public:
-  void Setup(const LayerProto& proto, const vector<Layer*>& srclayers) override;
-  void ParseRecords(int flag, const std::vector<Record>& records,
-                    Blob<float>* blob) override;
-
- private:
-  float scale_;
-  int cropsize_;
-  bool mirror_;
-  Blob<float> mean_;
-};
-}  // namespace singa
-
-#endif  // SINGA_NEURALNET_INPUT_LAYER_H_
diff --git a/include/singa/neuralnet/layer.h b/include/singa/neuralnet/layer.h
deleted file mode 100644
index c8ea3fc..0000000
--- a/include/singa/neuralnet/layer.h
+++ /dev/null
@@ -1,376 +0,0 @@
-/************************************************************
-*
-* Licensed to the Apache Software Foundation (ASF) under one
-* or more contributor license agreements.  See the NOTICE file
-* distributed with this work for additional information
-* regarding copyright ownership.  The ASF licenses this file
-* to you under the Apache License, Version 2.0 (the
-* "License"); you may not use this file except in compliance
-* with the License.  You may obtain a copy of the License at
-*
-*   http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an
-* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-* KIND, either express or implied.  See the License for the
-* specific language governing permissions and limitations
-* under the License.
-*
-*************************************************************/
-
-#ifndef SINGA_NEURALNET_LAYER_H_
-#define SINGA_NEURALNET_LAYER_H_
-
-#include <string>
-#include <vector>
-#include "singa/proto/common.pb.h"
-#include "singa/proto/job.pb.h"
-#include "singa/utils/common.h"
-#include "singa/utils/blob.h"
-#include "singa/utils/param.h"
-
-namespace singa {
-using std::vector;
-using std::string;
-
-// TODO(wangwei) make AuxType a template argument for Layer.
-using AuxType = int;
-
-inline const string AddUnrollingPrefix(int unroll_idx, const string& name) {
-  return std::to_string(unroll_idx) + "#" + name;
-}
-inline const string AddPartitionSuffix(int partition_idx, const string& name) {
-  return name + "@" + std::to_string(partition_idx);
-}
-
-
-inline const string AddPrefixSuffix(int unroll_idx, int partition_idx,
-    const string& name) {
-  return std::to_string(unroll_idx) + "#" + name + "@" +
-    std::to_string(partition_idx);
-}
-/**
- * Base layer class.
- *
- * Subclasses should implement at least
- * Layer::ComputeFeature() and Layer::ComputGradient()
- * functions in accordance with the NeuralNet::TrainOneBatch function.
- */
-
-class Layer {
- public:
-  /**
-   * Create a sub-layer instance based on proto.type();
-   *
-   * @param proto configuration of the layer instance.
-   * @return pointer to the newly created layer instance.
-   */
-  static Layer* Create(const LayerProto& proto);
-
-  Layer() {}
-  virtual ~Layer() {}
-
-  /**
-   * Create for python binding, production test mode
-   *
-   */
-  static Layer* CreateLayer(const string str);
-  static void SetupLayer(Layer* layer, const string str, const vector<Layer*>& srclayers);
-
-  /**
-   * Setup layer properties.
-   *
-   * Setup members e.g., shapes of Param objects based on the layer
-   * configuration and connected layers.
-   * It should check the partition setting when setup the properties.
-   *
-   * @param conf layer configuration.
-   * @param srclayers source layers that connect to this layer.
-   */
-  virtual void Setup(const LayerProto& conf, const vector<Layer*>& srclayers) {
-    layer_conf_ = conf;
-    datavec_.push_back(&data_);
-    gradvec_.push_back(&grad_);
-  }
-
-
-  /**
-   * Compute features of this layer based on connected layers.
-   *
-   * @param[in] flag set by the TrainOneBatch function, e.g., to indicate the
-   * running phase (kForward|kTrain, kForward|kTest, etc).
-   * @param[in] srclayers source layers that connect to this layer.
-   */
-  virtual void ComputeFeature(int flag, const vector<Layer*>& srclayers) = 0;
-  /**
-   * Compute gradients for parameters associated with this layer.
-   * It may also compute the gradients of the loss w.r.t the source layers.
-   *
-   * \copydetails ComputeFeature().
-   */
-  virtual void ComputeGradient(int flag, const vector<Layer*>& srclayers) = 0;
-  /**
-   * Layers that have paramters must override this function to return all Param
-   * objects associated with this layer.
-   *
-   * @return parameters associated with this layer.
-   */
-  virtual const std::vector<Param*> GetParams() const {
-    return std::vector<Param*> {};
-  }
-  virtual void SetParams(std::vector<Param*>) {}
-  /**
-   * Return the connection type between one neuron of this layer and its source
-   * layer.
-   *
-   * Currently support two connection types: kOneToOne, and kOneToAll.
-   * - kOneToOne indicates the neuron depends on only one neuron from src layer.
-   * - kOneToAll indicates the neuron depends on all neurons from src layer.
-   * TODO(wangwei) support kOneToMany.
-   *
-   * @param[in] k index of source layer, current only support k = 0.
-   * @return connection type.
-   */
-  virtual ConnectionType src_neuron_connection(int k) const {
-    // CHECK_LT(k, srclayers_.size());
-    return kOneToOne;
-  }
-  /**
-   * Return the connection type of this layer and all dst layers.
-   *
-   * Currently support two connection types: kOneToOne, and kOneToMany.
-   * - kOneToOne indicates the users implement the ComputeFeature and
-   * ComputeGradient function considering only one dst layer. In this case,
-   * a SplitLayer will be added automatically to connect this layer with all
-   * dest layer.
-   * - kOneToMany indicates this layer has already considered multiple dst
-   *   layers in the implementation.
-   *
-   * @return connection type default is kOneToOne.
-   */
-  virtual ConnectionType dst_layer_connection() const {
-    return kOneToOne;
-  }
-  /**
-   * To display layer info, e.g., aggreated loss/accuracy, or norm of feature
-   * vector and norm of parameters.
-   *
-   * @param[in] debug whether print the debug info
-   * @param[in] flag used to get the calling phase, e.g., forward of training
-   * (kForward | kTrain).
-   * @return info string about this layer, which is printed into the log.
-   */
-  virtual const std::string ToString(bool debug, int flag);
-  /**
-   * @return partition dimension of this layer,
-   * - -1 for no partition.
-   * -  0 for partition on the data dimension, i.e., partitioning the mini-batch
-   *    into sub-mini-batches.
-   * -  1 for partition this layer on feature dimension, i.e., the feature
-   *    vector of each instance is partitioned into sub-vectors.
-   */
-  inline int partition_dim() const {
-    CHECK_LE(layer_conf_.partition_dim(), 1);
-    return layer_conf_.partition_dim();
-  }
-  /**
-   * @return the partition ID (i.e., the worker ID to whom is layer is
-   * dispatched) of this layer, which is a sublayer partitioned from the
-   * original layer.
-   */
-  inline int partition_id() const { return layer_conf_.partition_id(); }
-  /**
-   * @return total number of partitions (i.e., sub-layers) of the original
-   * layer of this layer.
-   */
-  inline int num_partitions() const { return layer_conf_.num_partitions(); }
-  /**
-   * @return the type of this layer, only valid for built-in layer (types).
-   */
-  inline LayerType type() const { return layer_conf_.type(); }
-  /**
-   * @return user-defined layer type.
-   */
-  inline const std::string& user_type() const {
-    return layer_conf_.user_type();
-  }
-  /**
-   * Return name of this layer
-   */
-  inline const std::string& name() const { return layer_conf_.name(); }
-  /**
-   * Return the index of the unrolled layer within the unrolling group, which
-   * should be [0, max_unrolling_length)
-   */
-  inline const int unroll_index() const { return layer_conf_.unroll_index(); }
-
-  /**
-   * @return a const ref for Blob vector storing feature values of this layer.
-   */
-  virtual const vector<Blob<float>*>& data() {
-    return datavec_;
-  }
-
-  /**
-   * @param[in] from pointer to one of the dst layer. For some layers, they have
-   * more than one data Blob. In this case, this argument identifies the layer
-   * that is requesting the data Blob.
-   * @return a const ref for Blob storing feature values of this layer.
-   * @deprecated {This function will be deleted, use
-   * virtual const vector<Blob<float>>& data() const or
-   * virtual const Blob<float>& data(int k) const instead}.
-   */
-  virtual const Blob<float>& data(const Layer* from) {
-    return data_;
-  }
-  /**
-   * @return a const ref for the kth Blob.
-   * TODO(wangwei) if make this function const, there will be a warning
-   * indicating that data(const Layer*) and this function are ambiguous for
-   * data(0).
-   */
-  virtual const Blob<float>& data(int k) {
-    return *datavec_.at(k);
-  }
-
-  /**
-   * @see data().
-   * @return the pointer to the Blob storing feature values of this layer.
-   * @deprecated {This function will be deleted, use
-   * virtual Blob<float>* mutable_data(int k) instead}.
-   */
-  virtual Blob<float>* mutable_data(const Layer* from) {
-    return &data_;
-  }
-  /**
-   * @return the pointer to the kth Blob.
-   */
-  virtual Blob<float>* mutable_data(int k) {
-    return datavec_.at(k);
-  }
-  /**
-   * @return auxiliary data, e.g., image label.
-   */
-  virtual const vector<AuxType>& aux_data(const Layer* from = nullptr) {
-    return aux_data_;
-  }
-  /**
-   * @see data().
-   * @return the const ref of the Blob for the gradient of this layer, mainly
-   * used in BP algorithm.
-   * @deprecated {This function will be deleted, use
-   * virtual const vector<Blob<float>>& grad() const or
-   * virtual const Blob<float>& grad(int k) const instead}.
-   */
-  virtual const Blob<float>& grad(const Layer* from) {
-    return grad_;
-  }
-  /**
-   * @see data().
-   * @return the const ref of the Blob vector for the gradient of this layer.
-   */
-  virtual const vector<Blob<float>*>& grad() const {
-    return gradvec_;
-  }
-  /**
-   * @return the const ref of the kth Blob for the gradient of this layer.
-   */
-  virtual const Blob<float>& grad(int k) const {
-    return *gradvec_.at(k);
-  }
-  /**
-   * @see data().
-   * @return a pointer to the Blob storing gradients of this layer, mainly
-   * used in BP algorithm.
-   */
-  virtual Blob<float>* mutable_grad(const Layer* from) {
-    return &grad_;
-  }
-  /**
-   * @see data().
-   * @return a pointer to the kth Blob storing gradients of this layer, mainly
-   * used in BP algorithm.
-   */
-  virtual Blob<float>* mutable_grad(int k) {
-    return gradvec_.at(k);
-  }
-
- protected:
-  LayerProto layer_conf_;
-  Blob<float> data_, grad_;
-  vector<AuxType> aux_data_;
-  vector<Blob<float>*> datavec_, gradvec_;
-};
-/**************** Layer categories *****************/
-/**
- * Base layer for connecting layers when neural net is partitioned.
- */
-class ConnectionLayer : virtual public Layer {
-  // defined as a layer category
-};
-
-
-/**
- * Base layer for getting input data. May include layers for loading records,
- * parsing records.
- */
-class InputLayer : virtual public Layer {
- public:
-  void ComputeGradient(int flag, const vector<Layer*>& srclayers) override {}
-  ConnectionType dst_layer_connection() const override { return kOneToMany; }
-  Blob<float>* mutable_grad(const Layer* layer) override {
-    return nullptr;
-    // LOG(FATAL) << "Input layer has no gradient blob";
-  }
-  const Blob<float>& grad(const Layer* from) override {
-    return grad_;
-    // LOG(FATAL) << "Input layer has no gradient blob";
-  }
-};
-
-using SingleLabelImageRecord = RecordProto;
-
-/**
- * Base layer for feature transformation, e.g., ConvolutionLayer, PoolingLayer,
- * etc.
- */
-class NeuronLayer : virtual public Layer {
-  // defined as a layer category
-};
-
-
-/**
- * Base layer for calculating loss and doing BackPropagation.
- */
-class LossLayer : virtual public Layer {
- public:
-  Blob<float>* mutable_grad(const Layer* layer) override {
-    return nullptr;
-    // LOG(FATAL) << "Loss layer has no gradient blob";
-  }
-  const Blob<float>& grad(const Layer* from) override {
-    return grad_;
-    // LOG(FATAL) << "Loss layer has no gradient blob";
-  }
-};
-
-/**
- * Base layer for collecting features into disk file, HTTP stream, etc.
- */
-class OutputLayer : virtual public Layer {
- public:
-  void ComputeGradient(int flag, const vector<Layer*>& srclayers) override {}
-  Blob<float>* mutable_grad(const Layer* layer) override {
-    return nullptr;
-    // LOG(FATAL) << "Output layer has no gradient blob";
-  }
-  const Blob<float>& grad(const Layer* from) override {
-    return grad_;
-    // LOG(FATAL) << "Output layer has no gradient blob";
-  }
-};
-
-
-}  // namespace singa
-#endif  // SINGA_NEURALNET_LAYER_H_
diff --git a/include/singa/neuralnet/loss_layer.h b/include/singa/neuralnet/loss_layer.h
deleted file mode 100644
index 53ddc82..0000000
--- a/include/singa/neuralnet/loss_layer.h
+++ /dev/null
@@ -1,83 +0,0 @@
-/************************************************************
-*
-* Licensed to the Apache Software Foundation (ASF) under one
-* or more contributor license agreements.  See the NOTICE file
-* distributed with this work for additional information
-* regarding copyright ownership.  The ASF licenses this file
-* to you under the Apache License, Version 2.0 (the
-* "License"); you may not use this file except in compliance
-* with the License.  You may obtain a copy of the License at
-*
-*   http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an
-* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-* KIND, either express or implied.  See the License for the
-* specific language governing permissions and limitations
-* under the License.
-*
-*************************************************************/
-
-#ifndef SINGA_NEURALNET_LOSS_LAYER_H_
-#define SINGA_NEURALNET_LOSS_LAYER_H_
-
-#include <vector>
-#include <string>
-#include "singa/neuralnet/layer.h"
-#include "singa/neuralnet/neuron_layer.h"
-
-namespace singa {
-using std::vector;
-/**
- * Squared Euclidean loss as @f$0.5 ||p - t||^2@f$, where p is prediction
- * result, t is the ground truth.
- */
-class EuclideanLossLayer : public LossLayer {
- public:
-  void Setup(const LayerProto& conf, const vector<Layer*>& srclayers) override;
-  void ComputeFeature(int flag, const vector<Layer*>& srclayers) override;
-  void ComputeGradient(int flag, const vector<Layer*>& srclayers) override;
-  const std::string ToString(bool debug, int flag) override;
-
- private:
-  int counter_ = 0;
-  float loss_ = 0.0f;
-};
-/**
- * Cross-entropy loss applied to the probabilities computed from Softmax.
- * @f$ L_i = -log P_{t_i}, t_i\in [0, C] @f$ is the label for the i-th object,
- * C is the total number of classes.
- */
-class SoftmaxLossLayer : public LossLayer {
- public:
-  void Setup(const LayerProto& conf, const vector<Layer*>& srclayers) override;
-  void ComputeFeature(int flag, const vector<Layer*>& srclayers) override;
-  void ComputeGradient(int flag, const vector<Layer*>& srclayers) override;
-  const std::string ToString(bool debug, int flag) override;
-
- private:
-  int batchsize_, topk_, dim_, counter_ = 0;
-  float scale_;
-  float loss_ = 0.0f, accuracy_ = 0.0f;
-};
-
-#ifdef USE_CUDNN
-class CudnnSoftmaxLossLayer : public LossLayer{
- public:
-  void Setup(const LayerProto& conf, const vector<Layer*>& srclayers) override;
-  void ComputeFeature(int flag, const vector<Layer*>& srclayers) override;
-  void ComputeGradient(int flag, const vector<Layer*>& srclayers) override;
-  const std::string ToString(bool debug, int flag) override;
-
- private:
-  int batchsize_, dim_;
-  int counter_ = 0;
-  float loss_ = 0.0f;
-
-  CudnnSoftmaxLayer softmax_;
-};
-#endif
-}  // namespace singa
-
-#endif  // SINGA_NEURALNET_LOSS_LAYER_H_
diff --git a/include/singa/neuralnet/neuralnet.h b/include/singa/neuralnet/neuralnet.h
deleted file mode 100644
index 33ad38c..0000000
--- a/include/singa/neuralnet/neuralnet.h
+++ /dev/null
@@ -1,173 +0,0 @@
-/************************************************************
-*
-* Licensed to the Apache Software Foundation (ASF) under one
-* or more contributor license agreements.  See the NOTICE file
-* distributed with this work for additional information
-* regarding copyright ownership.  The ASF licenses this file
-* to you under the Apache License, Version 2.0 (the
-* "License"); you may not use this file except in compliance
-* with the License.  You may obtain a copy of the License at
-*
-*   http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an
-* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-* KIND, either express or implied.  See the License for the
-* specific language governing permissions and limitations
-* under the License.
-*
-*************************************************************/
-
-#ifndef SINGA_NEURALNET_NEURALNET_H_
-#define SINGA_NEURALNET_NEURALNET_H_
-
-#include <string>
-#include <vector>
-#include <unordered_map>
-
-#include "singa/neuralnet/layer.h"
-#include "singa/proto/job.pb.h"
-#include "singa/utils/factory.h"
-#include "singa/utils/graph.h"
-
-namespace singa {
-using std::unordered_map;
-using std::string;
-using std::vector;
-/**
- * The neural network is constructed from user configurations in NetProto.
- *
- * Some layers, e.g., SplitLayer and BridgeSrcLayer/BridgeDstLayer
- * will be added implicitly to partition the neural network.
- * TODO(wangwei) create wrappers for popular models, e.g., MLP, CNN.
- */
-class NeuralNet {
- public:
-  /**
-   * Create the neural network for training, test or validation.
-   *
-   * Parameters for test/validation net can share those from training after
-   * setup (done outside of this funcion).
-   *
-   * @param net_conf proto for the neural network
-   * @param phase test/training/validation
-   * @param npartitions num of partitions, do partitioning if num > 1
-   * @return pointer to a neural net
-   */
-  static NeuralNet* Create(const NetProto& net_conf, Phase phase,
-                           int npartitions);
-
-  static const NetProto Unrolling(const NetProto& net_conf);
-  /**
-   * construct the net structure from protocol buffer.
-   * @param netproto neural net config
-   * @param npartitions num of partitions. 1 for no partitioning.
-   */
-  NeuralNet(NetProto net_conf, int num_partitions);
-  ~NeuralNet();
-  /**
-   * Load net params from checkpoint fiels.
-   * @param path checkpoint files
-   */
-  void Load(const vector<string>& path);
-  /**
-   * load specified Param objects from from checkpoint files.
-   *
-   * Param objects and blobs are matched based on name.
-   * The param from previous checkpoint files will be overwritten by
-   * the param with the same name in later checkpoint files.
-   *
-   * @param[in] path
-   * @param[in,out] params load Blobs with the same name as the Params in this
-   * this dictionary. The Param values are copied into the corresponding Param
-   * objects.
-   */
-  static void Load(const vector<string>& path,
-                   const unordered_map<string, Param*>& params);
-  /**
-   * To display the adjacency layers
-  std::string ToAdjacency();
-   */
-  /**
-   * Share memory of parameter values from other neuralnet
-   * @param[in] other the neural net from which to share the Params
-   * @param[in] cpu_only if true only share cpu memory; else, share both cpu
-   * and gpu memory.
-   */
-  void ShareParamsFrom(NeuralNet* other, bool cpu_only);
-  inline const std::vector<Layer*>& layers() const { return layers_; }
-  inline const std::vector<Param*>& params() const { return params_; }
-  inline Layer* name2layer(std::string name) const {
-    if (name2layer_.find(name) == name2layer_.end())
-      return nullptr;
-    else
-      return name2layer_.at(name);
-  }
-  inline const std::vector<Layer*>& srclayers(const Layer* layer) const {
-    CHECK(src_map_.find(layer) != src_map_.end())
-      << "layer (" << layer->name() << " ) has no source layers";
-    return src_map_.at(layer);
-  }
-  Layer* last_unroll_layer(const Layer* layer) const {
-    auto pos = layer->name().find("#");
-    if (pos == std::string::npos)
-      return nullptr;
-    string last_name = std::to_string(unroll_len_) + layer->name().substr(pos);
-    CHECK(name2layer_.find(last_name) != name2layer_.end())
-      << "layer name = " << last_name << " has no unroll layers";
-    return name2layer_.at(last_name);
-  }
-  inline Param* paramid2param(int id) const { return paramid2param_.at(id); }
-
-  /**
-   * Conver the neural net into graph representation.
-   * Each layer is converted into a node.
-   * @param include_shape if true label the node with shape info
-   */
-  const Graph ToGraph(bool include_shape) const;
-
- protected:
-  /**
-   * Create a neural net graph, one node for each layer.
-   *
-   * Partition the graph if npartitions > 1, each layer is sliced according to
-   * its own partition setting.
-   * @param netproto
-   * @npartitions
-   * @return neural net graph
-   */
-  Graph* CreateGraph(const NetProto& netproto, int num_partitions);
-  /**
-   * Create neural net from graph, one layer per node.
-   */
-  void CreateNetFromGraph(Graph* graph);
-  /**
-   * prepare data structures, e.g., params_, layers_, etc.
-   */
-  void PrepareDataStructures();
-  void PrepareDataStructures(const NetProto& proto);
-  /**
-   * add split layers, due to connections to multiple dst-layers
-   */
-  NetProto AddModelSplitLayers(const NetProto& netproto);
-  /**
-   * add connection layers, due to partition of the whole nerualnet
-   * this should be done after AddModelSplitLayers()
-   */
-  NetProto AddPartitionConnectionLayers(const NetProto& netproto,
-                                        int npartitions);
-
- protected:
-  int unroll_len_ = 1;
-  std::vector<Layer*> layers_;
-  std::vector<Param*> params_;
-
-  unordered_map<std::string, Layer*> name2layer_;
-  unordered_map<int, Param*> paramid2param_;
-  unordered_map<const Layer*, std::vector<Layer*>> src_map_;
-};
-
-}  // namespace singa
-
-#endif  // SINGA_NEURALNET_NEURALNET_H_
diff --git a/include/singa/neuralnet/neuron_layer.h b/include/singa/neuralnet/neuron_layer.h
deleted file mode 100644
index e6f0fd5..0000000
--- a/include/singa/neuralnet/neuron_layer.h
+++ /dev/null
@@ -1,560 +0,0 @@
-/************************************************************
-*
-* Licensed to the Apache Software Foundation (ASF) under one
-* or more contributor license agreements.  See the NOTICE file
-* distributed with this work for additional information
-* regarding copyright ownership.  The ASF licenses this file
-* to you under the Apache License, Version 2.0 (the
-* "License"); you may not use this file except in compliance
-* with the License.  You may obtain a copy of the License at
-*
-*   http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an
-* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-* KIND, either express or implied.  See the License for the
-* specific language governing permissions and limitations
-* under the License.
-*
-*************************************************************/
-
-#ifndef SINGA_NEURALNET_NEURON_LAYER_H_
-#define SINGA_NEURALNET_NEURON_LAYER_H_
-
-#include <vector>
-#include <string>
-#include "singa/neuralnet/layer.h"
-#include "singa/proto/job.pb.h"
-#include "singa/utils/context.h"
-#include "singa/utils/singleton.h"
-
-#ifdef USE_CUDNN
-#include <cudnn.h>
-#endif
-
-namespace singa {
-
-/* Activation layer applies following activations,
- * - "relu",    @f$ f(x) = max(0, x)@f$
- * - "sigmoid", @f$ f(x)=1/(1+exp(-x)) @f$
- * - "tanh",    @f$ f(x) = tanh(x) @f$
- * - "stanh",   scaled tanh @f$f(x)=1.7159047 * tanh(0.66666667 * x)@f$, valid
- *   only for CPU training.
- * It may share data and grad with its (single) source layer depending on
- * the share_srclayer_blob configuration field.
- */
-class ActivationLayer : public NeuronLayer {
- public:
-  void Setup(const LayerProto& conf, const vector<Layer*>& srclayers) override;
-  void ComputeFeature(int flag, const vector<Layer*>& srclayers) override;
-  void ComputeGradient(int flag, const vector<Layer*>& srclayers) override;
-
- protected:
-  bool share_with_srclayer = false;
-  std::string method_;
-};
-
-/**
- * Convolution layer.
- * Currently using Mshadow to do convolution operations. TODO(wangwei) remove
- * dependency on Mshadow and using im2col from Caffe to implement this for CPU
- * version. For GPU version, there is class CudnnConvLayer.
- */
-class ConvolutionLayer : public NeuronLayer {
- public:
-  ~ConvolutionLayer();
-
-  void Setup(const LayerProto& proto, const vector<Layer*>& srclayers) override;
-  void ComputeFeature(int flag, const vector<Layer*>& srclayers) override;
-  void ComputeGradient(int flag, const vector<Layer*>& srclayers) override;
-  const std::vector<Param*> GetParams() const override {
-    std::vector<Param*> params{weight_, bias_};
-    return params;
-  }
-  ConnectionType src_neuron_connection(int k) const  override {
-    // CHECK_LT(k, srclayers_.size());
-    return kOneToAll;
-  }
-
- protected:
-  int kernel_x_, pad_x_,  stride_x_;
-  int kernel_y_, pad_y_,  stride_y_;
-  int batchsize_,  channels_, height_, width_;
-  int col_height_, col_width_, conv_height_, conv_width_, num_filters_;
-  Param* weight_ = nullptr, *bias_ = nullptr;
-  Blob<float> col_data_, col_grad_;
-};
-
-/**
- * Implement convolution operations using im2col from Caffe.
- */
-class CConvolutionLayer : public ConvolutionLayer {
- public:
-  void ComputeFeature(int flag, const vector<Layer*>& srclayers) override;
-  void ComputeGradient(int flag, const vector<Layer*>& srclayers) override;
-};
-
-/**
- * Layer that drops out some neurons randomly according to a user defined drop
- * ratio (default is 0.5). It helps reduce overfitting.
- */
-class DropoutLayer : public NeuronLayer {
- public:
-  void Setup(const LayerProto& proto, const vector<Layer*>& srclayers) override;
-  void ComputeFeature(int flag, const vector<Layer*>& srclayers) override;
-  void ComputeGradient(int flag, const vector<Layer*>& srclayers) override;
- protected:
-  // drop probability
-  float pdrop_;
-  /* record which neuron is dropped, required for back propagating gradients,
-   * if mask[i]=0, then the i-th neuron is dropped.
-   */
-  Blob<float> mask_;
-};
-/**
- * This layer is dummy and do no real work.
- * It is used for testing purpose only.
- *
- * Use it as input layer, it will generate random data;
- * Use it as output layer, it will generate random grad;
- * Use it as neuron layer, it will replicates data and grad.
- */
-class DummyLayer: public NeuronLayer {
- public:
-  void Setup(const std::string str, const vector<Layer*>& srclayers);
-  void Setup(const LayerProto& proto, const vector<Layer*>& srclayers) override;
-  void ComputeFeature(int flag, const vector<Layer*>& srclayers) override;
-  void ComputeGradient(int flag, const vector<Layer*>& srclayers) override;
-  void Feed(int batchsize, vector<float>& data, vector<int>& aux_data);
-  Layer* ToLayer() { return this;}
-
- private:
-  bool input_ = false;  // use as input layer
-  bool output_ = false;  // use as output layer
-  int batchsize_ = 1;  // use for input layer
-};
-
-/**
- * Embedding layer that converts an array of index ID into a matrix.
- *
- * Each index ID corresponds to a word (or feature) vector in the vocabulary
- * matrix maintained by the embedding layer.
- * The index ID ranges within [0, |D|), where |D| is the size of the vocabulary,
- * i.e., the number of rows of the vocabulary matrix.
- * If the index is -1, which means it is a padding word. A feature vector with
- * all values 0 will be constructed and inserted into the feature Blob.
- * Users handle special words by themseleves. For example, the index 0 could be
- * the starting word/symbol of a sentence, the index 1 could be the ending
- * word/symbol of a sentence.
- */
-class EmbeddingLayer : public NeuronLayer {
- public:
-  ~EmbeddingLayer() {
-    delete vocab_;
-  }
-  void Setup(const LayerProto& proto, const vector<Layer*>& srclayers) override;
-  void ComputeFeature(int flag, const vector<Layer*>& srclayers) override;
-  void ComputeGradient(int flag, const vector<Layer*>& srclayers) override;
-  const std::vector<Param*> GetParams() const override {
-    std::vector<Param*> params;
-    params.push_back(vocab_);
-    return params;
-  }
-
- private:
-  int vocab_size_, feature_dim_, batchsize_;
-  //!< the vocabulary matrix to be learned
-  Param *vocab_;
-};
-
-class GRULayer : public NeuronLayer {
- public:
-  ~GRULayer();
-  void Setup(const LayerProto& proto, const vector<Layer*>& srclayers) override;
-  void ComputeFeature(int flag, const vector<Layer*>& srclayers) override;
-  void ComputeGradient(int flag, const vector<Layer*>& srclayers) override;
-  ConnectionType dst_layer_connection() const override {
-    return kOneToMany;
-  }
-  Blob<float>* mutable_grad(const Layer* from) override {
-    if (typeid(*from) == typeid(GRULayer))
-      return gradvec_[1];
-    else
-      return gradvec_[0];
-  }
-  const Blob<float>& grad(const Layer* from) override {
-    if (typeid(*from) == typeid(GRULayer))
-      return *gradvec_[1];
-    else
-      return *gradvec_[0];
-  }
-  const std::vector<Param*> GetParams() const override {
-    std::vector<Param*> params{weight_z_hx_, weight_r_hx_, weight_c_hx_,
-      weight_z_hh_, weight_r_hh_, weight_c_hh_};
-
-    if (bias_z_ != nullptr && bias_r_ != nullptr && bias_c_ != nullptr) {
-      params.push_back(bias_z_);
-      params.push_back(bias_r_);
-      params.push_back(bias_c_);
-    }
-    return params;
-  }
-
- private:
-  int batchsize_;  // batch size
-  int vdim_, hdim_;  // dimensions
-  Blob<float> *update_gate_, *reset_gate_, *new_memory_;
-  Param *weight_z_hx_, *weight_z_hh_, *bias_z_;  // update gate
-  Param *weight_r_hx_, *weight_r_hh_, *bias_r_;  // reset gate
-  Param *weight_c_hx_, *weight_c_hh_, *bias_c_;  // new memory
-};
-
-/**
- * Layer that applys linear transformations as
- * @f$ h = v*W+b @f$, where W and b are weight matrix and bias vector.
- */
-class InnerProductLayer : public NeuronLayer {
- public:
-  ~InnerProductLayer();
-  void Setup(const LayerProto& proto, const vector<Layer*>& srclayers) override;
-  void ComputeFeature(int flag, const vector<Layer*>& srclayers) override;
-  void ComputeGradient(int flag, const vector<Layer*>& srclayers) override;
-  ConnectionType src_neuron_connection(int k) const override {
-    return kOneToAll;
-  }
-  const std::vector<Param*> GetParams() const override {
-    std::vector<Param*> params{weight_, bias_};
-    return params;
-  }
-
-  void SetParams(std::vector<Param*> params) {
-    weight_ = params.at(0);
-    bias_ = params.at(1);
-  }
-
- private:
-  int batchsize_;
-  int vdim_, hdim_;
-  bool transpose_;
-  Param *weight_, *bias_;
-};
-
-/**
- * Local Response Normalization edge
- *
- * @f$ b_i=a_i/x_i^beta @f$
- * @f$x_i=knorm+alpha*\sum_{j=max(0,i-n/2)}^{min(N,i+n/2)}(a_j)^2 @f$
- * n is size of local response area.
- * @f$a_i@f$, the activation (after ReLU) of a neuron convolved with the i-th kernel.
- * @f$b_i@f$, the neuron after normalization, N is the total num of kernels
- */
-class LRNLayer : public NeuronLayer {
-  void Setup(const LayerProto& proto, const vector<Layer*>& srclayers) override;
-  void ComputeFeature(int flag, const vector<Layer*>& srclayers) override;
-  void ComputeGradient(int flag, const vector<Layer*>& srclayers) override;
-
- protected:
-  //!< shape of the feature blob of the src layer
-  int batchsize_, channels_, height_, width_;
-  //!< size local response (neighbor) area
-  int lsize_;
-  //!< hyper-parameter
-  float alpha_, beta_, knorm_;
-  Blob<float> norm_;
-};
-
-/**
- * Layer that applies the pooling operation.
- * TODO(wangwei) remove dependenices on mshadow
- */
-class PoolingLayer : public NeuronLayer {
- public:
-  void Setup(const LayerProto& proto, const vector<Layer*>& srclayers) override;
-  void ComputeFeature(int flag, const vector<Layer*>& srclayers) override;
-  void ComputeGradient(int flag, const vector<Layer*>& srclayers) override;
-
- protected:
-  int kernel_x_, pad_x_, stride_x_;
-  int kernel_y_, pad_y_, stride_y_;
-  int batchsize_, channels_, height_, width_, pooled_height_, pooled_width_;
-  PoolingProto_PoolMethod pool_;
-};
-/**
- * Use book-keeping for BP following Caffe's pooling implementation
- */
-class CPoolingLayer : public PoolingLayer {
- public:
-  void Setup(const LayerProto& proto, const vector<Layer*>& srclayers);
-  void ComputeFeature(int flag, const vector<Layer*>& srclayers) override;
-  void ComputeGradient(int flag, const vector<Layer*>& srclayers) override;
-
- private:
-  Blob<float> mask_;
-};
-
-/**
- * @deprecated {please use ActivationLayer}
- */
-class ReLULayer : public NeuronLayer {
- public:
-  void Setup(const LayerProto& proto, const vector<Layer*>& srclayers) override;
-  void ComputeFeature(int flag, const vector<Layer*>& srclayers) override;
-  void ComputeGradient(int flag, const vector<Layer*>& srclayers) override;
-};
-
-/**
- * Softmax layer applies softmax transformation to features from source layers.
- * The feature blob of this layer is of shape (batchsize,
- * num_softmax_per_instance, count_per_softmax), where num_softmax_per_instance
- * is controled by users (default is 1),
- * @f$ count_per_softmax = count / batchsize / num_softmax_per_instance @f$.
- * The softmax is conducted over count_per_softmax elements each time.
-  */
-class SoftmaxLayer : public NeuronLayer {
- public:
-  void Setup(const LayerProto& proto, const vector<Layer*>& srclayers) override;
-  void ComputeFeature(int flag, const vector<Layer*>& srclayers) override;
-  void ComputeGradient(int flag, const vector<Layer*>& srclayers) override;
-  /**
-   * This layer is not recommendeded for partition because it requires the whole
-   * src layer for normalization.
-   */
-  ConnectionType src_neuron_connection(int k) const override {
-    // CHECK_LT(k, srclayers_.size());
-    return kOneToAll;
-  }
- protected:
-  int batchsize_, dim_;
-  //!< set by users (default is 1)
-  // int num_softmax_per_instance_;
-  //!< size of the softmax area/length
-  // int count_per_softmax_;
-};
-/**
- * @deprecated {please use ActivationLayer}
- *
- * This layer apply Sigmoid function to neuron activations.
- * f(x)=1/(1+exp(-x))
- * f'(x)=f(x)*(1-f(x))
- */
-class SigmoidLayer: public Layer {
- public:
-  using Layer::ComputeFeature;
-  using Layer::ComputeGradient;
-
-  void Setup(const LayerProto& proto, const vector<Layer*>& srclayers) override;
-  void ComputeFeature(int flag, const vector<Layer*>& srclayers) override;
-  void ComputeGradient(int flag, const vector<Layer*>& srclayers) override;
-};
-
-/**
- * @deprecated {please use ActivationLayer}
- * This layer apply scaled Tanh function to neuron activations.
- * f(x)=1.7159047  tanh(0.66666667 x)
- */
-class STanhLayer : public NeuronLayer {
- public:
-  void Setup(const LayerProto& proto, const vector<Layer*>& srclayers) override;
-  void ComputeFeature(int flag, const vector<Layer*>& srclayers) override;
-  void ComputeGradient(int flag, const vector<Layer*>& srclayers) override;
-};
-
-
-class BMLayer : public NeuronLayer {
- public:
-  void Setup(const LayerProto& proto, const vector<Layer*>& srclayers) override;
-  void ComputeFeature(int flag, const vector<Layer*>& srclayers) override;
-  void ComputeGradient(int flag, const vector<Layer*>& srclayers) override;
- protected:
-  Param *bnScale_, *bnBias_;
-  Param *resultRunningMean_, *resultRunningInvVariance_;
-  int batchsize_,  channels_, height_, width_;
-};
-
-/*************** Layers implemented using cudnn v3 ***************/
-#ifdef USE_CUDNN
-#define CHECK_CUDNN(x) CHECK_EQ(x, CUDNN_STATUS_SUCCESS)
-
-class CudnnBase : virtual public NeuronLayer {
- public:
-  ~CudnnBase() {
-    if (src_desc_ != nullptr)
-      CHECK_CUDNN(cudnnDestroyTensorDescriptor(src_desc_));
-    if (my_desc_ != nullptr)
-      CHECK_CUDNN(cudnnDestroyTensorDescriptor(my_desc_));
-  }
-  void virtual InitCudnn() {
-    CHECK(!has_init_cudnn_);
-    CHECK_CUDNN(cudnnCreateTensorDescriptor(&src_desc_));
-    CHECK_CUDNN(cudnnCreateTensorDescriptor(&my_desc_));
-    handle_ = Singleton<Context>::Instance()->cudnn_handle();
-    has_init_cudnn_ = true;
-  }
- protected:
-  bool has_init_cudnn_ = false;
-  cudnnHandle_t handle_ = nullptr;
-  cudnnTensorDescriptor_t src_desc_ = nullptr, my_desc_ = nullptr;
-};
-
-/**
- * Activation layer implemented using cudnn v3.
- * Activation methods including
- * - SIGMOID
- * - TANH
- * - RELU
- */
-class CudnnActivationLayer : public ActivationLayer, public CudnnBase {
- public:
-  void InitCudnn() override;
-  void ComputeFeature(int flag, const vector<Layer*>& srclayers) override;
-  void ComputeGradient(int flag, const vector<Layer*>& srclayers) override;
-
- protected:
-  cudnnActivationMode_t mode_;
-};
-
-/**
- * Convolution layer implemeneted using cudnn (v3 version backward functions).
- */
-class CudnnConvLayer : public ConvolutionLayer, public CudnnBase {
- public:
-  ~CudnnConvLayer();
-  void InitCudnn() override;
-  void ComputeFeature(int flag, const vector<Layer*>& srclayers) override;
-  void ComputeGradient(int flag, const vector<Layer*>& srclayers) override;
-
- protected:
-  cudnnTensorDescriptor_t bias_desc_;
-  cudnnFilterDescriptor_t filter_desc_;
-  cudnnConvolutionDescriptor_t conv_desc_;
-  cudnnConvolutionFwdAlgo_t fp_alg_;
-  cudnnConvolutionBwdFilterAlgo_t bp_filter_alg_;
-  cudnnConvolutionBwdDataAlgo_t bp_data_alg_;
-  size_t workspace_byte_limit_, workspace_count_;
-};
-
-class CudnnLRNLayer : public LRNLayer, public CudnnBase {
- public:
-  ~CudnnLRNLayer();
-  void InitCudnn() override;
-  void ComputeFeature(int flag, const vector<Layer*>& srclayers) override;
-  void ComputeGradient(int flag, const vector<Layer*>& srclayers) override;
-
- protected:
-  cudnnLRNMode_t mode_;
-  cudnnLRNDescriptor_t norm_desc_;
-};
-/**
- * Pooling layer implemented using cudnn.
- */
-class CudnnPoolLayer : public PoolingLayer, public CudnnBase {
- public:
-  ~CudnnPoolLayer();
-  void InitCudnn() override;
-  void ComputeFeature(int flag, const vector<Layer*>& srclayers) override;
-  void ComputeGradient(int flag, const vector<Layer*>& srclayers) override;
-
- protected:
-  cudnnPoolingDescriptor_t pool_desc_;
-};
-
-/**
- * Cudnn Softmax layer.
- */
-class CudnnSoftmaxLayer : public SoftmaxLayer, public CudnnBase {
- public:
-  void InitCudnn() override;
-  void ComputeFeature(int flag, const vector<Layer*>& srclayers) override;
-  void ComputeGradient(int flag, const vector<Layer*>& srclayers) override;
-};
-
-
-#if CUDNN_MAJOR == 4
-/**
- * Cudnn Batch Normalization layer -- supported by cudnn_v4
- */
-class CudnnBMLayer : public BMLayer, public CudnnBase {
- public:
-  ~CudnnBMLayer();
-  void InitCudnn() override;
-  void ComputeFeature(int flag, const vector<Layer*>& srclayers) override;
-  void ComputeGradient(int flag, const vector<Layer*>& srclayers) override;
-  const std::vector<Param*> GetParams() const override {
-    std::vector<Param*> params{bnScale_, bnBias_,
-        resultRunningMean_, resultRunningInvVariance_};
-    return params;
-  }
- protected:
-  cudnnBatchNormMode_t mode_;
-  cudnnTensorDescriptor_t bnScaleBiasMeanVar_desc_;
-  cudnnTensorDescriptor_t bnScaleBiasDiff_desc_;
-  Blob<float> resultSaveMean_;
-  Blob<float> resultSaveInvVariance_;
-};
-#endif
-#endif  // USE_CUDNN
-
-/******************** RBM layers *****************/
-/**
- * Base layer for RBM models.
- */
-class RBMLayer: virtual public Layer {
- public:
-  virtual ~RBMLayer() {}
-  void Setup(const LayerProto& proto, const vector<Layer*>& srclayers) override;
-  const std::vector<Param*> GetParams() const override {
-    std::vector<Param*> params{weight_, bias_};
-    return params;
-  }
-  virtual Blob<float>* Sample(int flat);
-
- protected:
-  //! if ture, sampling according to guassian distribution
-  bool gaussian_;
-  //! dimension of the hidden layer
-  int hdim_;
-  //! dimension of the visible layer
-  int vdim_;
-  int batchsize_;
-  bool first_gibbs_;
-  Param* weight_, *bias_;
-  Blob<float> pos_data_;
-  Blob<float> neg_data_;
-  Blob<float> neg_sample_;
-  Blob<float> pos_sample_;
-};
-
-/**
- * RBM visible layer
- */
-class RBMVisLayer: public RBMLayer, public LossLayer {
- public:
-  ~RBMVisLayer();
-  void Setup(const LayerProto& proto, const vector<Layer*>& srclayers) override;
-  void ComputeFeature(int flag, const vector<Layer*>& srclayers) override;
-  void ComputeGradient(int flag, const vector<Layer*>& srclayers) override;
-  const std::string ToString(bool debug, int flag) override;
-
- private:
-  RBMLayer* hid_layer_;
-  Layer* input_layer_;
-  float error_ = 0.0f;
-  int counter_ = 0;
-};
-/**
- * RBM hidden layer
- */
-class RBMHidLayer: public RBMLayer {
- public:
-  ~RBMHidLayer();
-  void Setup(const LayerProto& proto, const vector<Layer*>& srclayers) override;
-  void ComputeFeature(int flag, const vector<Layer*>& srclayers) override;
-  void ComputeGradient(int flag, const vector<Layer*>& srclayers) override;
-
- private:
-  RBMLayer *vis_layer_;
-};
-
-}  // namespace singa
-#endif  // SINGA_NEURALNET_NEURON_LAYER_H_
diff --git a/include/singa/neuralnet/output_layer.h b/include/singa/neuralnet/output_layer.h
deleted file mode 100644
index 9071f33..0000000
--- a/include/singa/neuralnet/output_layer.h
+++ /dev/null
@@ -1,99 +0,0 @@
-/************************************************************
-*
-* Licensed to the Apache Software Foundation (ASF) under one
-* or more contributor license agreements.  See the NOTICE file
-* distributed with this work for additional information
-* regarding copyright ownership.  The ASF licenses this file
-* to you under the Apache License, Version 2.0 (the
-* "License"); you may not use this file except in compliance
-* with the License.  You may obtain a copy of the License at
-*
-*   http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an
-* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-* KIND, either express or implied.  See the License for the
-* specific language governing permissions and limitations
-* under the License.
-*
-*************************************************************/
-
-#ifndef SINGA_NEURALNET_OUTPUT_LAYER_H_
-#define SINGA_NEURALNET_OUTPUT_LAYER_H_
-
-#include <vector>
-#include <string>
-#include "singa/neuralnet/layer.h"
-#include "singa/io/store.h"
-
-namespace singa {
-/**
- * ArgSort layer used to get topk prediction labels.
- *
- * It sort the labels based on its score (e.g., probability) from large to
- * small. Topk labels will be kepted in the data field. It should not be called
- * during training because this layer does not implement ComputeGradient()
- * function.
- */
-class ArgSortLayer : public OutputLayer {
- public:
-  void Setup(const LayerProto& proto, const vector<Layer*>& srclayers) override;
-  void ComputeFeature(int flag, const vector<Layer*>& srclayers) override;
-
- protected:
-  int batchsize_, dim_;
-  int topk_;
-};
-
-class AccuracyLayer : public ArgSortLayer {
- public:
-  void Setup(const LayerProto& proto, const vector<Layer*>& srclayers) override;
-  void ComputeFeature(int flag, const vector<Layer*>& srclayers) override;
-  const std::string ToString(bool debug, int flag) override;
-
- private:
-  int counter_ = 0;
-  float accuracy_ = 0.0f;
-};
-/**
- * Output data (and label) for its source layer.
- */
-class CSVOutputLayer : public OutputLayer {
- public:
-  ~CSVOutputLayer() { delete store_; }
-  void Setup(const LayerProto& proto, const vector<Layer*>& srclayers) override;
-  void ComputeFeature(int flag, const vector<Layer*>& srclayers) override;
-
- private:
-  int inst_ = 0;
-  io::Store* store_ = nullptr;
-};
-
-class RecordOutputLayer : public OutputLayer {
- public:
-  ~RecordOutputLayer() { delete store_; }
-  void Setup(const LayerProto& proto, const vector<Layer*>& srclayers) override;
-  void ComputeFeature(int flag, const vector<Layer*>& srclayers) override;
-
- private:
-  int inst_ = 0;  //!< instance No.
-  io::Store* store_ = nullptr;
-};
-
-/**
- * Output layer for char rnn model, which convert sample id back to char and
- * dump to stdout.
- */
-class CharRNNOutputLayer : public OutputLayer {
- public:
-  void Setup(const LayerProto& proto, const vector<Layer*>& srclayers) override;
-
-  void ComputeFeature(int flag, const vector<Layer*>& srclayers) override;
-
- private:
-  string vocab_;
-};
-
-}  // namespace singa
-#endif  // SINGA_NEURALNET_OUTPUT_LAYER_H_
diff --git a/include/singa/server.h b/include/singa/server.h
deleted file mode 100644
index d95862d..0000000
--- a/include/singa/server.h
+++ /dev/null
@@ -1,135 +0,0 @@
-/************************************************************
-*
-* Licensed to the Apache Software Foundation (ASF) under one
-* or more contributor license agreements.  See the NOTICE file
-* distributed with this work for additional information
-* regarding copyright ownership.  The ASF licenses this file
-* to you under the Apache License, Version 2.0 (the
-* "License"); you may not use this file except in compliance
-* with the License.  You may obtain a copy of the License at
-*
-*   http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an
-* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-* KIND, either express or implied.  See the License for the
-* specific language governing permissions and limitations
-* under the License.
-*
-*************************************************************/
-
-#ifndef SINGA_SERVER_H_
-#define SINGA_SERVER_H_
-
-#include <unordered_map>
-#include <vector>
-#include "singa/comm/socket.h"
-#include "singa/proto/job.pb.h"
-#include "singa/utils/param.h"
-#include "singa/utils/updater.h"
-
-namespace singa {
-
- /* Repsond to worker's get/put/udpate request, and periodically syncing with
-  * other servers.
-  *
-  * Normally, the Server creates a response message for each request which
-  * will be sent back to the one who issued the request. However, if the request
-  * are not processed successfully, the original message will be returned. The
-  * sever does not know the returned message is a response or the original
-  * message. It just sends it to the router. The router will decided to
-  * re-send the request to the server or send it to the worker.
-  */
-class Server {
- public:
-  ~Server();
-  Server(int group_id, int server_id,
-      const JobProto& job_conf,
-      const std::vector<int>& slice2group,
-      const std::vector<int>& slice2server);
-  void Run();
-  inline int grp_id() const { return grp_id_; }
-  inline int id() const { return id_; }
-
- protected:
-  /**
-   * Process GET request.
-   *
-   * @return the orignal message or a response message which contains the values
-   * of the Param with the request version.
-   */
-  Msg* HandleGet(Msg** msg);
-  /**
-   * Process Update request.
-   *
-   * It waits until received the gradients from all workers from the same worker
-   * group. After updating, it responses to each sender with the new Param
-   * values. It may generate a sync message to the server group that maintains
-   * the global version of the updated Param (slice).
-   *
-   * Note: there is no counter for each worker group on the number of received
-   * update requests. Hence it is possible that the server would conduct the
-   * update when it receives x requests from group a and y requests from group
-   * b where x + y = group size. To avoid this problem, we can
-   * -# maintain request list for each group for each Param at the server side
-   * -# do not span a worker group among multiple nodes. then the updates from
-   * the same group would be locally aggregated on the worker node. And the
-   * server would conduct the update immediately after receiving the aggregated
-   * request.
-   * -# launch only one worker group.
-   *
-   * @return the orignal message or response message
-   */
-  const std::vector<Msg*> HandleUpdate(Msg **msg);
-  /**
-   * Process PUT request.
-   *
-   * @return the original message or response message. If we don't want to
-   * acknowledge the put request, then return nullptr.
-   */
-  Msg* HandlePut(Msg **msg);
-  /**
-   * Handle sync request from other server groups.
-   *
-   * It adds updates of Param (slice) from other server groups directly to
-   * local Param (slice). Currently, each Param (slice) has a master group,
-   * i.e., slice2group_[sliceid], which would receive such requests from all
-   * other server groups for the Param object.
-   *
-   * @param msg request msg containing the parameter updates
-   * @return response msg that contains the fresh parameter values.
-   */
-  Msg* HandleSyncRequest(Msg** msg);
-  /**
-   * Handle sync response.
-   *
-   * The response msg includes the latest values of a Param object from the
-   * server group that maintainers this Param object.
-   * The local Param values are replaced with the addition result of local
-   * udpates since the sync request was sent and the received Param values.
-   *
-   * @param response message
-   */
-  void HandleSyncResponse(Msg** msg);
-
- protected:
-  int grp_id_ = -1;
-  int id_ = -1;
-  Updater* updater_ = nullptr;
-  //!< map from slice ID to slice and deleted in the destructor
-  std::unordered_map<int, ParamEntry*> shard_;
-  std::vector<int> slice2group_, slice2server_;
-  //!< num of updates from last sync with master server group for a param/slice
-  std::vector<int> n_updates_;
-  //!< num of sync requests that have not been responded
-  std::vector<int> n_pending_sync_;
-  std::vector<Blob<float>> last_sync_;
-  std::unordered_map<int, std::vector<Msg*>> buffer_requests_;
-
-  Dealer* dealer_;
-};
-
-}  // namespace singa
-
-#endif  // SINGA_SERVER_H_
diff --git a/include/singa/singa.h b/include/singa/singa.h
deleted file mode 100644
index 9bc5ba5..0000000
--- a/include/singa/singa.h
+++ /dev/null
@@ -1,37 +0,0 @@
-/************************************************************
-*
-* Licensed to the Apache Software Foundation (ASF) under one
-* or more contributor license agreements.  See the NOTICE file
-* distributed with this work for additional information
-* regarding copyright ownership.  The ASF licenses this file
-* to you under the Apache License, Version 2.0 (the
-* "License"); you may not use this file except in compliance
-* with the License.  You may obtain a copy of the License at
-*
-*   http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an
-* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-* KIND, either express or implied.  See the License for the
-* specific language governing permissions and limitations
-* under the License.
-*
-*************************************************************/
-
-#ifndef SINGA_SINGA_H_
-#define SINGA_SINGA_H_
-
-#include "singa/comm/socket.h"
-#include "singa/io/store.h"
-#include "singa/neuralnet/neuralnet.h"
-#include "singa/neuralnet/layer.h"
-#include "singa/proto/job.pb.h"
-#include "singa/proto/singa.pb.h"
-#include "singa/utils/common.h"
-#include "singa/utils/param.h"
-#include "singa/utils/singleton.h"
-#include "singa/utils/factory.h"
-#include "singa/driver.h"
-
-#endif  // SINGA_SINGA_H_
diff --git a/include/singa/stub.h b/include/singa/stub.h
deleted file mode 100644
index 4802535..0000000
--- a/include/singa/stub.h
+++ /dev/null
@@ -1,108 +0,0 @@
-/************************************************************
-*
-* Licensed to the Apache Software Foundation (ASF) under one
-* or more contributor license agreements.  See the NOTICE file
-* distributed with this work for additional information
-* regarding copyright ownership.  The ASF licenses this file
-* to you under the Apache License, Version 2.0 (the
-* "License"); you may not use this file except in compliance
-* with the License.  You may obtain a copy of the License at
-*
-*   http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an
-* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-* KIND, either express or implied.  See the License for the
-* specific language governing permissions and limitations
-* under the License.
-*
-*************************************************************/
-
-#ifndef SINGA_STUB_H_
-#define SINGA_STUB_H_
-
-#include <queue>
-#include <unordered_map>
-#include <vector>
-#include <string>
-#include "singa/comm/socket.h"
-#include "singa/neuralnet/neuralnet.h"
-#include "singa/proto/job.pb.h"
-#include "singa/proto/singa.pb.h"
-#include "singa/utils/factory.h"
-#include "singa/utils/param.h"
-#include "singa/utils/singleton.h"
-#include "singa/server.h"
-#include "singa/worker.h"
-
-namespace singa {
-
-class Stub {
- public:
-  ~Stub();
-  /**
-   * Find an endpoint to bind.
-   */
-  void Setup();
-  /**
-   * The Stub instance runs this function in the main thread to handle (e.g.,
-   * forward) messages from workers and servers.
-   *
-   * @param[in] slice2server the k-th value is the ID of the server that is in
-   * charge of updating the Param slice with ID k. Large Param objects are
-   * sliced into subsets for load-balance. Different subsets are updated by
-   * different servers.
-   */
-  void Run(const vector<int>& slice2server,
-      const std::vector<Worker*>& workers,
-      const std::vector<Server*>& servers);
-
-  void set_router(Router* router) {
-    router_ = router;
-  }
-
- protected:
-  /**
-   * Create a socket to send msg to the specified process
-   * @param dst_procs the dst process (logical) ID
-   * @return the newly created socket
-   */
-  Dealer* CreateInterProcsDealer(int dst_procs);
-  /**
-   * Generate a request message to Get the parameter object.
-   */
-  const std::vector<Msg*> HandleGetRequest(ParamEntry* entry, Msg** msg);
-  void HandleGetResponse(ParamEntry* entry, Msg** msg);
-  /**
-   * Generate a request message to Update the parameter object.
-   */
-  const std::vector<Msg*> HandleUpdateRequest(ParamEntry* entry, Msg** msg);
-  /**
-   * Handle response msg from servers for the update requests.
-   */
-  void HandleUpdateResponse(ParamEntry* entry, Msg** msg);
-  /**
-   * Generate a request message to Put the parameter object.
-   */
-  const std::vector<Msg*> HandlePutRequest(ParamEntry* entry, Msg** msg);
-  /**
-   * Called by HandlePut, HandleUpdate and HandleGet functions
-   * @param type message type
-   * @param version param version
-   * @param entry
-   * @param msg
-   * @param ret generated messages
-   */
-  void GenMsgs(int type, int version, ParamEntry* entry,
-    Msg* msg, std::vector<Msg*> *ret);
-
-
- protected:
-  Router *router_ = nullptr;
-  std::vector<int> slice2server_;
-};
-
-}  // namespace singa
-
-#endif  // SINGA_STUB_H_
diff --git a/include/singa/utils/blob.h b/include/singa/utils/blob.h
deleted file mode 100644
index 1a0a592..0000000
--- a/include/singa/utils/blob.h
+++ /dev/null
@@ -1,414 +0,0 @@
-/**************************************************************
-*
-* Licensed to the Apache Software Foundation (ASF) under one
-* or more contributor license agreements.  See the NOTICE file
-* distributed with this work for additional information
-* regarding copyright ownership.  The ASF licenses this file
-* to you under the Apache License, Version 2.0 (the
-* "License"); you may not use this file except in compliance
-* with the License.  You may obtain a copy of the License at
-*
-*   http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an
-* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-* KIND, either express or implied.  See the License for the
-* specific language governing permissions and limitations
-* under the License.
-*
-*************************************************************/
-
-/**
- * The code is adapted from that of Caffe which is under BSD 2 Clause License.
- * COPYRIGHT
- * All contributions by the University of California:
- * Copyright (c) 2014, The Regents of the University of California (Regents)
- * All rights reserved.
- * All other contributions:
- * Copyright (c) 2014, the respective contributors
- * All rights reserved.
- */
-#ifndef SINGA_UTILS_BLOB_H_
-#define SINGA_UTILS_BLOB_H_
-
-#include <glog/logging.h>
-#include <memory>
-#include <vector>
-#include "singa/proto/common.pb.h"
-#include "mshadow/tensor.h"
-#include "mshadow/cxxnet_op.h"
-
-namespace singa {
-
-// TODO(wangwei) use cudaMallocHost depending on Context::device.
-inline void MallocHost(void** ptr, size_t size) {
-  *ptr = malloc(size);
-  // cudaMallocHost(ptr, size);
-}
-
-inline void FreeHost(void* ptr) {
-  free(ptr);
-  // cudaFreeHost(ptr);
-}
-
-/**
- * @brief Manages memory allocation and synchronization between the host (CPU)
- *        and device (GPU).
- *
- * TODO(dox): more thorough description.
- */
-class SyncedMemory {
- public:
-  enum SyncedHead { UNINITIALIZED,
-                    HEAD_AT_CPU,
-                    HEAD_AT_GPU,
-                    SYNCED };
-
-  SyncedMemory() {}
-  explicit SyncedMemory(size_t size) : size_(size) {}
-  ~SyncedMemory();
-
-  const void* cpu_data();
-  const void* gpu_data();
-  void* mutable_cpu_data();
-  void* mutable_gpu_data();
-  void set_cpu_data(void* data);
-  inline SyncedHead head() { return head_; }
-  inline size_t size() { return size_; }
-
- private:
-  void to_cpu();
-  void to_gpu();
-
-  void* cpu_ptr_ = nullptr;
-  void* gpu_ptr_ = nullptr;
-  size_t size_ = 0;
-  SyncedHead head_ = UNINITIALIZED;
-  bool own_cpu_data_ = false;
-};  // class SyncedMemory
-
-
-template <typename Dtype>
-class Blob {
- public:
-  Blob() {}
-  /**
-   * Blob constructor with given shape.
-   * @param shape specifies the size of each dimension, shape[0] is the highest
-   * dimension, i.e., stride[0] = shape[1] * shape[2] * ...
-   */
-  explicit Blob(const std::vector<int>& shape) { Reshape(shape); }
-  /**
-   * Blob constructor with given shape.
-   * @param[in] dim0 total num of elements.
-   */
-  explicit Blob(int dim0) { Reshape(dim0); }
-  /**
-   * Blob constructor with given shape.
-   * @param[in] dim0 size of the highest dimension
-   * @param[in] dim1 size of the second highest dimension
-   */
-  explicit Blob(int dim0, int dim1) { Reshape(dim0, dim1); }
-  /**
-   * Blob constructor with given shape.
-   * @param[in] dim0 size of the highest dimension
-   * @param[in] dim1
-   * @param[in] dim2
-   */
-  explicit Blob(int dim0, int dim1, int dim2) { Reshape(dim0, dim1, dim2); }
-  /**
-   * Blob constructor with given shape.
-   * @param[in] dim0 size of the highest dimension
-   * @param[in] dim1
-   * @param[in] dim2
-   * @param[in] dim3
-   */
-  explicit Blob(int dim0, int dim1, int dim2, int dim3) {
-    Reshape(dim0, dim1, dim2, dim3);
-  }
-  /**
-   * Change the shape of the blob, re-allocate memory if Blob size() changes.
-   *
-   * @param[in] shape specifies the size of each dimension, shape[0] is the
-   * highest * dimension, i.e., stride[0] = shape[1] * shape[2] * ...
-   */
-  void Reshape(const std::vector<int>& shape);
-  /**
-   * Helper for Reshape(const std::vector<int>& shape) with shape.size() = 1.
-   *
-   * @see Reshape(const std::vector<int>&).
-   * @param[in] dim0 total num of elements.
-   */
-  void Reshape(int dim0) {
-    Reshape(std::vector<int>{dim0});
-  }
-  /**
-   * Helper for Reshape(const std::vector<int>& shape) with shape.size() = 2.
-   *
-   * @param dim0 the highest dimension size, i.e., dim0 = shape[0]. E.g., dim0
-   * could the batchsize.
-   * @param[in] dim1, dim1 = shape[1], e.g., dim1 could be the length of the
-   * feature vector.
-   */
-  void Reshape(int dim0, int dim1) {
-    Reshape(std::vector<int>{dim0, dim1});
-  }
-  /**
-   * Helper for Reshape(const std::vector<int>& shape) with shape.size() = 3.
-   *
-   * @param[in] dim0, dim0 = shape[0]
-   * @param[in] dim1, dim1 = shape[1]
-   * @param[in] dim2, dim2 = shape[2]
-   */
-  void Reshape(int dim0, int dim1, int dim2) {
-    Reshape(std::vector<int>{dim0, dim1, dim2});
-  }
-  /**
-   * Helper for Reshape(const std::vector<int>& shape) with shape.size() = 4.
-   *
-   * @param[in] dim0, dim0 = shape[0]
-   * @param[in] dim1, dim1 = shape[1]
-   * @param[in] dim2, dim2 = shape[2]
-   * @param[in] dim3, dim3 = shape[3]
-   */
-  void Reshape(int dim0, int dim1, int dim2, int dim3) {
-    Reshape(std::vector<int>{dim0, dim1, dim2, dim3});
-  }
-  /**
-   * Reshape as the shape of *other* Blob.
-   * @param[in] other
-   */
-  void ReshapeLike(const Blob& other);
-  /**
-   * @brief Copy from a source Blob.
-   *
-   * @param source the Blob to copy from
-   * @param reshape if false, require this Blob to be pre-shaped to the shape
-   * of other (and die otherwise); if true, Reshape this Blob to other's
-   * shape if necessary
-   */
-  void CopyFrom(const Blob<Dtype>& source, bool reshape);
-  /**
-   * call CopyFrom(const Blob<Dtype>& source, bool reshape) with reshape = false
-   */
-  void CopyFrom(const Blob<Dtype>& source);
-
-  void FromProto(const singa::BlobProto& proto);
-  void ToProto(singa::BlobProto* proto) const;
-  /**
-   * Set each element to be v
-   */
-  void SetValue(Dtype v);
-  /**
-   * Compute the sum of absolute values (L1 norm) of the data.
-  Dtype AsumData() const;
-   */
-  /**
-   * Sum all elements
-  Dtype SumData() const;
-   */
-  /**
-   * Share data with the other Blob.
-   * Set the data_ shared_ptr to point to the SyncedMemory holding the data_
-   * of Blob other.
-   *
-   * It may deallocate the SyncedMemory holding this Blob's data_, as
-   * shared_ptr calls its destructor when reset with the "=" operator.
-   * @param other the Blob who owns the data
-   * @param cpu_only if true, only share the cpu data; if false, share the whole
-   * data_ field. For training with multi-gpu cards, cpu_only must be true,
-   * becuase gpu memory cannot be shared among different devices.
-   */
-  void ShareData(Blob* other, bool cpu_only = true);
-
-  /*
-  void Swap(Blob& other);
-  */
-  /**
-   * @return the shape vector.
-   */
-  inline const std::vector<int>& shape() const { return shape_; }
-  /**
-   * @return the size of the k-th dimension.
-   */
-  inline int shape(int k) const {
-    CHECK_LT(k, shape_.size());
-    return shape_.at(k);
-  }
-  inline int count() const {
-    return count_;
-  }
-  inline int version() const {
-    return version_;
-  }
-  inline void set_version(int v) {
-    version_ = v;
-  }
-  inline const Dtype* cpu_data() const {
-    CHECK(data_);
-    return static_cast<const Dtype*>(data_->cpu_data());
-  }
-  inline void set_cpu_data(Dtype* data) {
-    CHECK(data);
-    data_->set_cpu_data(data);
-  }
-  inline const Dtype* gpu_data() const {
-    CHECK(data_);
-    return static_cast<const Dtype*>(data_->gpu_data());
-  }
-  inline Dtype* mutable_cpu_data() {
-    CHECK(data_);
-    return static_cast<Dtype*>(data_->mutable_cpu_data());
-  }
-  inline Dtype* mutable_gpu_data() {
-    CHECK(data_);
-    return static_cast<Dtype*>(data_->mutable_gpu_data());
-  }
-  inline void set_transpose(bool val) {
-    transpose_ = val;
-  }
-  inline bool transpose() const {
-    return transpose_;
-  }
-  inline const Blob<Dtype> T() const {
-    Blob<Dtype> ret(*this);
-    ret.transpose_ = !transpose_;
-    return ret;
-  }
-  // to check if two blob has the exact same content
-  bool check_equal(Blob* other) const {
-    if (transpose() != other->transpose()) return false;
-    if (count() != other->count()) return false;
-    if (shape().size() != other->shape().size()) return false;
-    for (unsigned int i = 0; i < shape().size(); i++) {
-      if (shape(i) != other->shape(i)) return false;
-    }
-    const Dtype * a = cpu_data();
-    const Dtype * b = other->cpu_data();
-    for (int i = 0; i < count(); i++) {
-      if (a[i] != b[i]) return false;
-    }
-    return true;
-  }
-
- protected:
-  std::shared_ptr<SyncedMemory> data_ = nullptr;
-  std::vector<int> shape_;
-  int count_ = 0;
-  int capacity_ = 0;
-  int version_ = -1;
-  bool transpose_ = false;
-};  // class Blob
-
-/**
- * Reshape a Blob.
- * @return a new Blob with the given shape, it shares the internal data_ with
- * the original Blob, i.e., no memory copy and allocation.
- */
-template <typename Dtype>
-Blob<Dtype>* Reshape(const Blob<Dtype> & A, const std::vector<int>& shape) {
-  Blob<Dtype>* res = new Blob<Dtype>(A);
-  res->Reshape(shape);
-  return res;
-}
-
-/**
- * Helper of Reshape(const Blob<Dtype>, const std::vector<int>*).
- */
-template <typename Dtype>
-Blob<Dtype>* Reshape(const Blob<Dtype> & A, int count) {
-  std::vector<int> tmpshape;
-  tmpshape.push_back(count);
-  return Reshape(A, tmpshape);
-}
-/**
- * Helper of Reshape(const Blob<Dtype>, const std::vector<int>*).
- */
-template <typename Dtype>
-Blob<Dtype>* Reshape(const Blob<Dtype> & A, int dim0, int dim1) {
-  std::vector<int> tmpshape;
-  tmpshape.push_back(dim0);
-  tmpshape.push_back(dim1);;
-  return Reshape(A, tmpshape);
-}
-/**
- * Helper of Reshape(const Blob<Dtype>, const std::vector<int>*).
- */
-template <typename Dtype>
-Blob<Dtype>* Reshape(const Blob<Dtype> & A, int dim0, int dim1, int dim2) {
-  std::vector<int> tmpshape;
-  tmpshape.push_back(dim0);
-  tmpshape.push_back(dim1);
-  tmpshape.push_back(dim2);
-  return Reshape(A, tmpshape);
-}
-/**
- * Helper of Reshape(const Blob<Dtype>, const std::vector<int>*).
- */
-template <typename Dtype>
-Blob<Dtype>* Reshape(const Blob<Dtype> & A, int dim0, int dim1, int dim2,
-    int dim3) {
-  std::vector<int> tmpshape;
-  tmpshape.push_back(dim0);
-  tmpshape.push_back(dim1);
-  tmpshape.push_back(dim2);
-  tmpshape.push_back(dim3);
-  return Reshape(A, tmpshape);
-}
-
-/**
- * @return a new Blob which share all internal members with the input Blob
- * except that the transpose_ field is set to the opposite value.
- */
-template <typename Dtype>
-Blob<Dtype>* Transpose(const Blob<Dtype> & A) {
-  Blob<Dtype>* res = new Blob<Dtype>(A);
-  bool origin = A.transpose();
-  res->set_transpose(!origin);
-  return res;
-}
-
-// TODO(wangwei) remove mshadow functions.
-using namespace mshadow;
-using mshadow::cpu;
-
-using mshadow::Shape;
-using mshadow::Shape1;
-using mshadow::Shape2;
-using mshadow::Shape3;
-using mshadow::Shape4;
-using mshadow::Tensor;
-
-using std::vector;
-
-inline Tensor<cpu, 4> Tensor4(Blob<float>* blob) {
-  const vector<int>& shape = blob->shape();
-  Tensor<cpu, 4> tensor(blob->mutable_cpu_data(),
-      Shape4(shape[0], shape[1], shape[2], shape[3]));
-  return tensor;
-}
-
-inline Tensor<cpu, 3> Tensor3(Blob<float>* blob) {
-  const vector<int>& shape = blob->shape();
-  Tensor<cpu, 3> tensor(blob->mutable_cpu_data(),
-      Shape3(shape[0], shape[1], blob->count() / shape[0] / shape[1]));
-  return tensor;
-}
-
-inline Tensor<cpu, 2> Tensor2(Blob<float>* blob) {
-  const vector<int>& shape = blob->shape();
-  Tensor<cpu, 2> tensor(blob->mutable_cpu_data(),
-      Shape2(shape[0], blob->count() / shape[0]));
-  return tensor;
-}
-
-inline Tensor<cpu, 1> Tensor1(Blob<float>* blob) {
-  Tensor<cpu, 1> tensor(blob->mutable_cpu_data(), Shape1(blob->count()));
-  return tensor;
-}
-
-
-}  // namespace singa
-
-#endif  // SINGA_UTILS_BLOB_H_
diff --git a/include/singa/utils/channel.h b/include/singa/utils/channel.h
new file mode 100644
index 0000000..b640e90
--- /dev/null
+++ b/include/singa/utils/channel.h
@@ -0,0 +1,85 @@
+/************************************************************
+*
+* Licensed to the Apache Software Foundation (ASF) under one
+* or more contributor license agreements.  See the NOTICE file
+* distributed with this work for additional information
+* regarding copyright ownership.  The ASF licenses this file
+* to you under the Apache License, Version 2.0 (the
+* "License"); you may not use this file except in compliance
+* with the License.  You may obtain a copy of the License at
+*
+*   http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing,
+* software distributed under the License is distributed on an
+* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+* KIND, either express or implied.  See the License for the
+* specific language governing permissions and limitations
+* under the License.
+*
+*************************************************************/
+
+#ifndef SINGA_UTILS_CHANNEL_H_
+#define SINGA_UTILS_CHANNEL_H_
+
+#include <google/protobuf/message.h>
+
+#include <iostream>
+#include <fstream>
+#include <map>
+#include <string>
+
+namespace singa {
+
+/// Channel for appending metrics or other information into files or screen.
+class Channel {
+ public:
+  explicit Channel(const std::string& name);
+  ~Channel();
+
+  /// Return the channel name, which is also used for naming the output file.
+  inline const std::string& GetName() { return name_; }
+  /// Disabled by default.
+  inline void EnableDestStderr(bool enable) { stderr_ = enable; }
+  /// Enabled by default.
+  inline void EnableDestFile(bool enable) { file_ = enable; }
+  /// Reset the output file path.
+  /// The dest file is named as global dir + channel name by default.
+  void SetDestFilePath(const std::string& file);
+  /// Append a string message
+  void Send(const std::string& message);
+  /// Append a protobuf message
+  void Send(const google::protobuf::Message& message);
+
+ private:
+  std::string name_ = "";
+  bool stderr_ = false;
+  bool file_ = false;
+  std::ofstream os_;
+};
+
+class ChannelManager {
+ public:
+  ChannelManager() {}
+  ~ChannelManager();
+
+  void Init();
+  void SetDefaultDir(const char* dir);
+  Channel* GetInstance(const std::string& channel);
+
+ private:
+  std::string dir_ = "";
+  std::map<std::string, Channel*> name2ptr_;
+};
+
+/// Initial function for global usage of channel.
+/// 'argv' is for future use.
+void InitChannel(const char* argv);
+/// Set the directory name for persisting channel content
+void SetChannelDirectory(const char* path);
+/// Get the channel instance
+Channel* GetChannel(const std::string& channel_name);
+
+}  // namespace singa
+
+#endif  // SINGA_UTILS_CHANNEL_H__
diff --git a/include/singa/utils/cluster.h b/include/singa/utils/cluster.h
deleted file mode 100644
index 9e36cf8..0000000
--- a/include/singa/utils/cluster.h
+++ /dev/null
@@ -1,161 +0,0 @@
-/************************************************************
-*
-* Licensed to the Apache Software Foundation (ASF) under one
-* or more contributor license agreements.  See the NOTICE file
-* distributed with this work for additional information
-* regarding copyright ownership.  The ASF licenses this file
-* to you under the Apache License, Version 2.0 (the
-* "License"); you may not use this file except in compliance
-* with the License.  You may obtain a copy of the License at
-*
-*   http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an
-* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-* KIND, either express or implied.  See the License for the
-* specific language governing permissions and limitations
-* under the License.
-*
-*************************************************************/
-
-#ifndef SINGA_UTILS_CLUSTER_H_
-#define SINGA_UTILS_CLUSTER_H_
-
-#include <glog/logging.h>
-#include <string>
-#include <unordered_map>
-#include <memory>
-#include <vector>
-#include "singa/proto/job.pb.h"
-#include "singa/proto/singa.pb.h"
-#include "singa/utils/cluster_rt.h"
-#include "singa/utils/common.h"
-#include "singa/utils/singleton.h"
-
-namespace singa {
-
-/**
- * Cluster is a singleton object, which provides cluster configuations,
- * e.g., the topology of the cluster.
- * All IDs start from 0.
- */
-class Cluster {
- public:
-  // Cluster is a global singleton in a process
-  static Cluster* Setup(int job_id, const SingaProto& singaConf,
-                        const ClusterProto& clusterConf);
-  static Cluster* Get();
-
-  inline int nserver_groups() const { return cluster_.nserver_groups(); }
-  inline int nworker_groups() const { return cluster_.nworker_groups(); }
-  inline int nworkers_per_group() const { return cluster_.nworkers_per_group();}
-  inline int nservers_per_group() const { return cluster_.nservers_per_group();}
-  inline int nworkers_per_procs() const { return cluster_.nworkers_per_procs();}
-  inline int nservers_per_procs() const { return cluster_.nservers_per_procs();}
-  inline int nworker_groups_per_server_group() const {
-    if (nserver_groups() == 0 || nservers_per_group() == 0)
-      return 1;
-    else
-      return cluster_.nworker_groups() / cluster_.nserver_groups();
-  }
-  /**
-   * @return true if the calling procs has server threads, otherwise false
-   */
-  inline bool has_server() const {
-    if (server_worker_separate()) {
-      CHECK_LT(procs_id_, nprocs_);
-      return procs_id_ >= nworker_procs();
-    } else {
-      return procs_id_ < nserver_procs();
-    }
-  }
-  /**
-   * @return true if the calling procs has worker threads.
-   */
-  inline bool has_worker() const {
-    return procs_id_ < nworker_procs();
-  }
-  /**
-   * @return global procs id, which starts from 0.
-   */
-  inline int procs_id() const { return procs_id_; }
-  inline void set_procs_id(int procs_id) { procs_id_ = procs_id; }
-  inline bool server_worker_separate() const {
-    return cluster_.server_worker_separate();
-  }
-  inline int nworker_procs() const {
-    return nworker_groups() * nworkers_per_group() / nworkers_per_procs();
-  }
-  inline int nserver_procs() const {
-    return nserver_groups() * nservers_per_group() / nservers_per_procs();
-  }
-  inline int nprocs() const { return nprocs_; }
-  /**
-   * @return endpoint of the router of a procs with the specified id
-   */
-  inline std::string endpoint(int procs_id) const {
-    CHECK_LT(procs_id, nprocs());
-    CHECK_GE(procs_id, 0);
-    return cluster_rt_->GetProcHost(procs_id);
-  }
-  inline std::string workspace() const { return cluster_.workspace(); }
-  inline std::string vis_folder() const {
-    return cluster_.workspace() + "/visualization";
-  }
-  inline std::string checkpoint_folder() const {
-    return cluster_.workspace() + "/checkpoint";
-  }
-  /*
-  const int stub_timeout() const { return cluster_.stub_timeout(); }
-  const int worker_timeout() const { return cluster_.worker_timeout(); }
-  const int server_timeout() const { return cluster_.server_timeout(); }
-  */
-  inline bool share_memory() const { return cluster_.share_memory(); }
-  inline int sync_freq() const { return cluster_.sync_freq(); }
-  inline int poll_time() const { return cluster_.poll_time(); }
-  ClusterRuntime* runtime() const { return cluster_rt_; }
-
-  /**
-   * @return logical procs ID
-   */
-  inline int ProcsIDOf(int group_id, int id, int flag) {
-    return procs_ids_.at(Hash(group_id, id, flag));
-  }
-
-  /**
-   * @param pid, processs ID
-   * @param group_size, num of executors in a group
-   * @param procs_size, num of executors in a procs
-   *
-   * @return a vector with 4 integers:
-   * [group start, group end), [start executor, end executor)
-   */
-  const std::vector<int> ExecutorRng(int pid, int group_size, int procs_size);
-  /**
-   * Register this process.
-   *
-   * @param pid physical process id get from OS, all other procs ID refers to
-   * logical process ID.
-   * @param endpoint unique string for other procs to connect
-   */
-  void Register(int pid, const std::string& endpoint);
-
- private:
-  void Init(int job, const SingaProto& singaConf,
-          const ClusterProto& clusterConf);
-  void SetupFolders(const ClusterProto &cluster);
-  int Hash(int gid, int id, int flag);
-
-  int procs_id_ = -1;
-  int nprocs_ = 0;
-  // cluster config proto
-  ClusterProto cluster_;
-  SingaProto singa_;
-  ClusterRuntime* cluster_rt_ = nullptr;
-  std::unordered_map<int, int> procs_ids_;
-};
-
-}  // namespace singa
-
-#endif  // SINGA_UTILS_CLUSTER_H_
diff --git a/include/singa/utils/cluster_rt.h b/include/singa/utils/cluster_rt.h
deleted file mode 100644
index 4ab48bd..0000000
--- a/include/singa/utils/cluster_rt.h
+++ /dev/null
@@ -1,105 +0,0 @@
-/************************************************************
-*
-* Licensed to the Apache Software Foundation (ASF) under one
-* or more contributor license agreements.  See the NOTICE file
-* distributed with this work for additional information
-* regarding copyright ownership.  The ASF licenses this file
-* to you under the Apache License, Version 2.0 (the
-* "License"); you may not use this file except in compliance
-* with the License.  You may obtain a copy of the License at
-* 
-*   http://www.apache.org/licenses/LICENSE-2.0
-* 
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an
-* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-* KIND, either express or implied.  See the License for the
-* specific language governing permissions and limitations
-* under the License.
-*
-*************************************************************/
-
-#ifndef SINGA_UTILS_CLUSTER_RT_H_
-#define SINGA_UTILS_CLUSTER_RT_H_
-
-#include <map>
-#include <mutex>
-#include <string>
-#include <vector>
-
-namespace singa {
-
-typedef void (*rt_callback)(void *contest);
-
-struct RTCallback {
-  rt_callback fn;
-  void* ctx;
-};
-
-/**
- * ClusterRuntime is a runtime service that manages dynamic configuration
- * and status of the whole cluster. It mainly provides following services:
- *    1)  Provide running status of each server/worker
- *    2)  Translate process id to (hostname:port)
- */
-class ClusterRuntime {
- public:
-  // ClusterRuntime have different implementation determined when compiling
-  static ClusterRuntime* Create(const std::string&host, int job_id);
-
-  virtual ~ClusterRuntime() {}
-  /**
-   * Initialize the runtime instance
-   */
-  virtual bool Init() = 0;
-  /**
-   * register the process, and get a unique process id
-   *
-   * \return the process id, -1 if failed
-   */
-  virtual int RegistProc(const std::string& host_addr, int pid) = 0;
-  /**
-   * translate the process id to host address
-   *
-   * \return the host and port, "" if no such proc id 
-   */
-  virtual std::string GetProcHost(int proc_id) = 0;
-  /**
-   * Server: watch all workers in a server group,
-   * will be notified when all workers have left
-   */
-  virtual bool WatchSGroup(int gid, int sid, rt_callback fn, void* ctx) = 0;
-  /**
-   * Worker: join a server group (i.e. start to read/update these servers)
-   */
-  virtual bool JoinSGroup(int gid, int wid, int s_group) = 0;
-  /**
-   * Worker: leave a server group (i.e. finish its all work)
-   */
-  virtual bool LeaveSGroup(int gid, int wid, int s_group) = 0;
-};
-
-/*
- * A ClusterRuntime implementation for single-process environment
- */
-class SPClusterRT : public ClusterRuntime {
- public:
-  ~SPClusterRT();
-
-  bool Init() override;
-  int RegistProc(const std::string& host_addr, int pid) override;
-  std::string GetProcHost(int proc_id) override;
-  bool WatchSGroup(int gid, int sid, rt_callback fn, void* ctx) override;
-  bool JoinSGroup(int gid, int wid, int s_group) override;
-  bool LeaveSGroup(int gid, int wid, int s_group) override;
-
- private:
-  std::vector<std::string> proc_list_;
-  std::map<int, std::vector<RTCallback*>> grp_callbacks_;
-  std::map<int, int> grp_count_;
-  std::mutex lock_;
-};
-
-}  // namespace singa
-
-#endif  // SINGA_UTILS_CLUSTER_RT_H_
diff --git a/include/singa/utils/common.h b/include/singa/utils/common.h
deleted file mode 100644
index 0bcec58..0000000
--- a/include/singa/utils/common.h
+++ /dev/null
@@ -1,165 +0,0 @@
-/************************************************************
-*
-* Licensed to the Apache Software Foundation (ASF) under one
-* or more contributor license agreements.  See the NOTICE file
-* distributed with this work for additional information
-* regarding copyright ownership.  The ASF licenses this file
-* to you under the Apache License, Version 2.0 (the
-* "License"); you may not use this file except in compliance
-* with the License.  You may obtain a copy of the License at
-*
-*   http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an
-* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-* KIND, either express or implied.  See the License for the
-* specific language governing permissions and limitations
-* under the License.
-*
-*************************************************************/
-
-#ifndef SINGA_UTILS_COMMON_H_
-#define SINGA_UTILS_COMMON_H_
-
-#include <google/protobuf/message.h>
-#include <unordered_map>
-#include <sstream>
-#include <string>
-#include <vector>
-#include <utility>
-#include "singa/proto/common.pb.h"
-
-namespace singa {
-
-using std::vector;
-using std::string;
-std::string IntVecToString(const std::vector<int>& vec);
-std::string VStringPrintf(std::string fmt, va_list l);
-std::string StringPrintf(std::string fmt, ...);
-
-/**
- * Locate the position of the arg in arglist.
- *
- * @param argc total num of arguments
- * @param arglist all arguments
- * @param the searched argument
- * @return the position of arg in the arglist; -1 if not found.
- */
-int ArgPos(int argc, char** arglist, const char* arg);
-void CreateFolder(const std::string name);
-/**
- * Slice a set of large Params into small pieces such that they can be roughtly
- * equally partitioned into a fixed number of boxes.
- *
- * @param num total number of boxes to store the small pieces
- * @param sizes size of all Params
- * @return all slices for each Param
- */
-const std::vector<std::vector<int>> Slice(int num,
-    const std::vector<int>& sizes);
-/**
- * Partition slices into boxes.
- *
- * @param num number of boxes
- * @param slices slice sizes
- * @return box id for each slice
- */
-const std::vector<int> PartitionSlices(int num, const std::vector<int>& slices);
-/*
-inline void Sleep(int millisec=1){
-  std::this_thread::sleep_for(std::chrono::milliseconds(millisec));
-}
-*/
-int gcd(int a, int b);
-int LeastCommonMultiple(int a, int b);
-/*
-inline float rand_real() {
-  return  static_cast<float>(rand_r())/(RAND_MAX+1.0f);
-}
-*/
-std::string GetHostIP();
-void SetupLog(const std::string& workspace, const std::string& model);
-
-/**
- * Performance mtrics.
- */
-class Metric {
- public:
-  Metric() {}
-  explicit Metric(const std::string& str);
-  /**
-   * Add one metric.
-   *
-   * If the metric exist, the aggregate. Otherwise create a new entry for it.
-   *
-   * @param name metric name, e.g., 'loss'
-   * @param value metric value
-   */
-  void Add(const std::string& name, float value);
-  void Add(const std::string& name, float value, int count);
-  /**
-   * reset all metric counter and value to 0
-   */
-  void Reset();
-  /**
-   * Generate a one-line string for logging
-   */
-  std::string ToLogString() const;
-  /**
-   * Serialize the object into a string
-   */
-  std::string ToString() const;
-  /**
-   * Parse the metric from a string
-   */
-  void ParseFrom(const std::string& msg);
-
- private:
-  std::unordered_map<std::string, std::pair<int, float>> entry_;
-};
-
-using google::protobuf::Message;
-void Im2col(const float* data_im, const int channels,
-    const int height, const int width, const int kernel_h, const int kernel_w,
-    const int pad_h, const int pad_w, const int stride_h, const int stride_w,
-    float* data_col);
-void Col2im(const float* data_col, const int channels,
-    const int height, const int width, const int patch_h, const int patch_w,
-    const int pad_h, const int pad_w, const int stride_h, const int stride_w,
-    float* data_im);
-void ForwardMaxPooling(const float* bottom, const int num, const int channels,
-    const int height, const int width, const int kernel_h, const int kernel_w,
-    const int pad_h, const int pad_w, const int stride_h, const int stride_w,
-    float* top, float* mask);
-void BackwardMaxPooling(const float* top, const float* mask, const int num,
-    const int channels, const int height, const int width,
-    const int kernel_h, const int kernel_w, const int pad_h, const int pad_w,
-    const int stride_h, const int stride_w,
-    float* bottom);
-void ForwardAvgPooling(const float* bottom, const int num, const int channels,
-    const int height, const int width, const int kernel_h, const int kernel_w,
-    const int pad_h, const int pad_w, const int stride_h, const int stride_w,
-    float* top);
-void BackwardAvgPooling(const float* top, const int num, const int channels,
-    const int height, const int width, const int kernel_h, const int kernel_w,
-    const int pad_h, const int pad_w, const int stride_h, const int stride_w,
-    float* bottom);
-
-void ReadProtoFromTextFile(const char* filename, Message* proto);
-void WriteProtoToTextFile(const Message& proto, const char* filename);
-void ReadProtoFromBinaryFile(const char* filename, Message* proto);
-void WriteProtoToBinaryFile(const Message& proto, const char* filename);
-
-/**
- * Write a string (e.g., graph reprensetation of a net) into a text file.
- */
-void WriteStringToTextFile(const string& filename, const string& context);
-
-/**
- * Parse metric pairs (key = value[, key = value]) from string
- */
-const vector<std::pair<string, float>> GetMetricFromString(const string& disp);
-}  // namespace singa
-
-#endif  // SINGA_UTILS_COMMON_H_
diff --git a/include/singa/utils/context.h b/include/singa/utils/context.h
deleted file mode 100644
index 3490d29..0000000
--- a/include/singa/utils/context.h
+++ /dev/null
@@ -1,276 +0,0 @@
-/************************************************************
-*
-* Licensed to the Apache Software Foundation (ASF) under one
-* or more contributor license agreements.  See the NOTICE file
-* distributed with this work for additional information
-* regarding copyright ownership.  The ASF licenses this file
-* to you under the Apache License, Version 2.0 (the
-* "License"); you may not use this file except in compliance
-* with the License.  You may obtain a copy of the License at
-*
-*   http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an
-* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-* KIND, either express or implied.  See the License for the
-* specific language governing permissions and limitations
-* under the License.
-*
-*************************************************************/
-
-#ifndef SINGA_UTILS_CONTEXT_H_
-#define SINGA_UTILS_CONTEXT_H_
-
-#include <glog/logging.h>
-#include <chrono>
-#include <random>
-#include <thread>
-#include <unordered_map>
-#include <vector>
-
-#ifdef USE_GPU
-#include "singa/utils/cuda_utils.h"
-
-#ifdef USE_CUDNN
-#include <cudnn.h>
-#endif
-
-#endif
-
-namespace singa {
-
-/**
- * Context is used as a global singleton, which stores the mapping from CPU
- * thread id to GPU device id. If a thread has no GPU, then its associated
- * device id is -1. It manages (e.g., creating) the handlers for GPU
- * devices. It also manages the GPU and CPU random generators, which are created
- * when accessed. One CPU thread has a CPU random generator. A GPU device
- * has a GPU random generator, which is accessible after assigning the GPU
- * device with a CPU thread via SetupDevice.
- */
-class Context {
- public:
-   /**
-    * Destructor, release random generators and handlers.
-    */
-  ~Context() {
-#ifdef USE_GPU
-    for (auto& entry : device_id_) {
-      if (entry.second != -1) {
-        cudaSetDevice(entry.second);
-        if (cublas_handle_[entry.second] != nullptr) {
-          cublasDestroy(cublas_handle_[entry.second]);
-          cublas_handle_[entry.second] = nullptr;
-        }
-        if (curand_generator_[entry.second] != nullptr) {
-          curandDestroyGenerator(curand_generator_[entry.second]);
-          curand_generator_[entry.second] = nullptr;
-        }
-      }
-    }
-#ifdef USE_CUDNN
-    for (auto& handle : cudnn_handle_) {
-      if (handle != nullptr)
-        CHECK_EQ(cudnnDestroy(handle), CUDNN_STATUS_SUCCESS);
-      handle = nullptr;
-    }
-#endif
-#endif
-    for (auto& entry : rand_generator_) {
-      if (entry.second != nullptr) {
-        delete entry.second;
-        entry.second = nullptr;
-      }
-    }
-  }
-  /**
-   * Constructor, init handlers and GPU rand generators to nullptr.
-   */
-  Context() {
-    for (int i = 0; i < kMaxNumGPU; i++) {
-#ifdef USE_GPU
-      cublas_handle_.push_back(nullptr);
-      curand_generator_.push_back(nullptr);
-#ifdef USE_CUDNN
-      cudnn_handle_.push_back(nullptr);
-#endif
-#endif
-    }
-  }
-
-  /**
-   * @return the device ID of the current thread.
-   */
-  int device_id() {
-    return device_id(std::this_thread::get_id());
-  }
-  /**
-   * @return the ID of the device attached to a given CPU thread, or -1 if this
-   * thread has not been attached GPU device.
-   */
-  int device_id(const std::thread::id& tid) {
-    if (device_id_.find(tid) != device_id_.end())
-      return device_id_[tid];
-    else
-      return -2;
-  }
-  /**
-   * Setup the CPU thread, which may be assigned a GPU device.
-   * If there is no GPU device, then set did to -1.
-   * Set the random seed to -1.
-   * @param[in] thread::id CPU thread ID
-   * @param[in] device_id GPU device ID
-   */
-  void SetupDevice(const std::thread::id& tid, const int did) {
-    SetupDevice(tid, did, -1);
-  }
-  /**
-   * @copy SetupDevice(const int, const int);
-   * @param[in] seed random seed
-   */
-  void SetupDevice(const std::thread::id& tid, const int did, const int seed) {
-    device_id_[tid] = did;
-    seed_[tid] = seed;
-  }
-
-  /**
-   * Activate the GPU device by calling cudaSetDevice.
-   */
-  void ActivateDevice(const int device_id) {
-    CHECK_GE(device_id, 0);
-#ifdef USE_GPU
-    cudaSetDevice(device_id);
-#endif
-  }
-
-  /**
-   * \copybreif rand_generator(const std::thread::id&);
-   * @return the CPU random generator for the calling thread.
-   */
-  std::mt19937* rand_generator() {
-    return rand_generator(std::this_thread::get_id());
-  }
-  /**
-   * Get the CPU random generator.
-   * If the generator does not exist, then create it now.
-   * If the seed is not set, i.e., seed=-1, then get a seed from system time.
-   * @param[in] thread::id CPU thread ID
-   * @return the CPU random generator
-   */
-  std::mt19937* rand_generator(const std::thread::id& tid) {
-    if (rand_generator_.find(tid) == rand_generator_.end()) {
-      // CHECK(seed_.find(tid) != seed_.end());
-      auto seed = static_cast<unsigned>(seed_[tid]);
-      if (seed_.find(tid) == seed_.end() || seed_.at(tid) == -1)
-        seed = std::chrono::system_clock::now().time_since_epoch().count();
-      rand_generator_[tid] = new std::mt19937(seed);
-    }
-    return rand_generator_[tid];
-  }
-#ifdef USE_GPU
-  /**
-   * \copybreif cublas_handle_(const std::thread::id&);
-   * @return cublas handle for the calling thread.
-   */
-  cublasHandle_t cublas_handle() {
-    return cublas_handle(std::this_thread::get_id());
-  }
-  /**
-   * Get the handler of the GPU which is assigned to the given thread.
-   * Calls cublas_handle(const int);
-   */
-  cublasHandle_t cublas_handle(const std::thread::id thread_id) {
-    return cublas_handle(device_id(thread_id));
-  }
-  /**
-   * Get the handler of the GPU device given its device ID. The device
-   * must be set up via SetupDevice(const std::thread::id, const int) before
-   * calling this function.
-   * @param[in] device_id GPU device ID
-   * @return the GPU handler
-   */
-  cublasHandle_t cublas_handle(const int device_id) {
-    CHECK_GE(device_id, 0);
-    if (cublas_handle_.at(device_id) == nullptr) {
-      cudaSetDevice(device_id);
-      cublasCreate(&cublas_handle_[device_id]);
-    }
-    return cublas_handle_[device_id];
-  }
-  /**
-   * Get the rand generator of the GPU device assigned to the given thread.
-   */
-  curandGenerator_t curand_generator(const std::thread::id thread_id) {
-    return curand_generator(device_id(thread_id));
-  }
-  /**
-   * Get the random generator of the GPU device given the device id.
-   * @param[in] device_id GPU device ID
-   * @return random generator. If it does not exist, then create one.
-   * The random seed will be set to CURAND_RNG_PSEUDO_DEFAULT if it is not set.
-   */
-  curandGenerator_t curand_generator(const int device_id) {
-    CHECK_GE(device_id, 0);
-    CHECK_LT(device_id, cudnn_handle_.size());
-    if (curand_generator_.at(device_id) == nullptr) {
-      // TODO(wangwei) handle user set seed
-      /*
-      CHECK(seed_.find(tid) != seed_.end());
-      auto seed = seed_[tid];
-      */
-      ActivateDevice(device_id);
-      curandCreateGenerator(&curand_generator_[device_id],
-          CURAND_RNG_PSEUDO_DEFAULT);
-    }
-    return curand_generator_[device_id];
-  }
-
-#ifdef USE_CUDNN
-  cudnnHandle_t cudnn_handle() {
-    return cudnn_handle(std::this_thread::get_id());
-  }
-
-  cudnnHandle_t cudnn_handle(const std::thread::id thread_id) {
-    return cudnn_handle(device_id(thread_id));
-  }
-
-  cudnnHandle_t cudnn_handle(const int device_id) {
-    CHECK_GE(device_id, 0);
-    CHECK_LT(device_id, cudnn_handle_.size());
-    if (cudnn_handle_.at(device_id) == nullptr) {
-      ActivateDevice(device_id);
-      // LOG(ERROR) << "create cudnn handle for device " << device_id;
-      CHECK_EQ(cudnnCreate(&cudnn_handle_[device_id]), CUDNN_STATUS_SUCCESS);
-    }
-    // LOG(ERROR) << "use cudnn handle from device " << device_id;
-    return cudnn_handle_[device_id];
-  }
-#endif
-
-#endif
-
- protected:
-  //!< max num of GPUs per process
-  const int kMaxNumGPU = 64;
-  //!< map from thread id to device id
-  std::unordered_map<std::thread::id, int> device_id_;
-  //!< map from thread id to cpu rand generator
-  std::unordered_map<std::thread::id, std::mt19937 *> rand_generator_;
-  //!< map from thread id to cpu rand generator seed
-  std::unordered_map<std::thread::id, int> seed_;
-#ifdef USE_GPU
-  //!< cublas handler indexed by GPU device ID
-  std::vector<cublasHandle_t> cublas_handle_;
-  //!< cublas rand generator indexed by GPU device ID
-  std::vector<curandGenerator_t> curand_generator_;
-
-#ifdef USE_CUDNN
-  std::vector<cudnnHandle_t> cudnn_handle_;
-#endif
-#endif
-};
-
-}  // namespace singa
-
-#endif  // SINGA_UTILS_CONTEXT_H_
diff --git a/include/singa/utils/cuda_utils.h b/include/singa/utils/cuda_utils.h
index 1270e92..2fe7d27 100644
--- a/include/singa/utils/cuda_utils.h
+++ b/include/singa/utils/cuda_utils.h
@@ -1,42 +1,80 @@
-/************************************************************
-*
-* Licensed to the Apache Software Foundation (ASF) under one
-* or more contributor license agreements.  See the NOTICE file
-* distributed with this work for additional information
-* regarding copyright ownership.  The ASF licenses this file
-* to you under the Apache License, Version 2.0 (the
-* "License"); you may not use this file except in compliance
-* with the License.  You may obtain a copy of the License at
-*
-*   http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an
-* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-* KIND, either express or implied.  See the License for the
-* specific language governing permissions and limitations
-* under the License.
-*
-*************************************************************/
-
-/**
- * The code is adapted from that of Caffe which is under BSD 2 Clause License.
- *
- * COPYRIGHT
- * All contributions by the University of California:
- * Copyright (c) 2014, The Regents of the University of California (Regents)
- * All rights reserved.
- * All other contributions:
- * Copyright (c) 2014, the respective contributors
- * All rights reserved.
- */
+// from caffe include/caffe/util/device_alternative.hpp
 #ifndef SINGA_UTILS_CUDA_UTILS_H_
 #define SINGA_UTILS_CUDA_UTILS_H_
+
+#include "singa/singa_config.h"
+#ifdef USE_CUDA
 #include <cublas_v2.h>
 #include <cuda.h>
 #include <cuda_runtime.h>
 #include <curand.h>
 
+inline const char* cublasGetErrorString(cublasStatus_t error) {
+  switch (error) {
+  case CUBLAS_STATUS_SUCCESS:
+    return "CUBLAS_STATUS_SUCCESS";
+  case CUBLAS_STATUS_NOT_INITIALIZED:
+    return "CUBLAS_STATUS_NOT_INITIALIZED";
+  case CUBLAS_STATUS_ALLOC_FAILED:
+    return "CUBLAS_STATUS_ALLOC_FAILED";
+  case CUBLAS_STATUS_INVALID_VALUE:
+    return "CUBLAS_STATUS_INVALID_VALUE";
+  case CUBLAS_STATUS_ARCH_MISMATCH:
+    return "CUBLAS_STATUS_ARCH_MISMATCH";
+  case CUBLAS_STATUS_MAPPING_ERROR:
+    return "CUBLAS_STATUS_MAPPING_ERROR";
+  case CUBLAS_STATUS_EXECUTION_FAILED:
+    return "CUBLAS_STATUS_EXECUTION_FAILED";
+  case CUBLAS_STATUS_INTERNAL_ERROR:
+    return "CUBLAS_STATUS_INTERNAL_ERROR";
+#if CUDA_VERSION >= 6000
+  case CUBLAS_STATUS_NOT_SUPPORTED:
+    return "CUBLAS_STATUS_NOT_SUPPORTED";
+#endif
+#if CUDA_VERSION >= 6050
+  case CUBLAS_STATUS_LICENSE_ERROR:
+    return "CUBLAS_STATUS_LICENSE_ERROR";
+#endif
+  }
+  return "Unknown cublas status";
+}
+
+inline const char* curandGetErrorString(curandStatus_t error) {
+  switch (error) {
+  case CURAND_STATUS_SUCCESS:
+    return "CURAND_STATUS_SUCCESS";
+  case CURAND_STATUS_VERSION_MISMATCH:
+    return "CURAND_STATUS_VERSION_MISMATCH";
+  case CURAND_STATUS_NOT_INITIALIZED:
+    return "CURAND_STATUS_NOT_INITIALIZED";
+  case CURAND_STATUS_ALLOCATION_FAILED:
+    return "CURAND_STATUS_ALLOCATION_FAILED";
+  case CURAND_STATUS_TYPE_ERROR:
+    return "CURAND_STATUS_TYPE_ERROR";
+  case CURAND_STATUS_OUT_OF_RANGE:
+    return "CURAND_STATUS_OUT_OF_RANGE";
+  case CURAND_STATUS_LENGTH_NOT_MULTIPLE:
+    return "CURAND_STATUS_LENGTH_NOT_MULTIPLE";
+  case CURAND_STATUS_DOUBLE_PRECISION_REQUIRED:
+    return "CURAND_STATUS_DOUBLE_PRECISION_REQUIRED";
+  case CURAND_STATUS_LAUNCH_FAILURE:
+    return "CURAND_STATUS_LAUNCH_FAILURE";
+  case CURAND_STATUS_PREEXISTING_FAILURE:
+    return "CURAND_STATUS_PREEXISTING_FAILURE";
+  case CURAND_STATUS_INITIALIZATION_FAILED:
+    return "CURAND_STATUS_INITIALIZATION_FAILED";
+  case CURAND_STATUS_ARCH_MISMATCH:
+    return "CURAND_STATUS_ARCH_MISMATCH";
+  case CURAND_STATUS_INTERNAL_ERROR:
+    return "CURAND_STATUS_INTERNAL_ERROR";
+  }
+  return "Unknown curand status";
+}
+
+//
+// CUDA macros
+//
+
 // CUDA: various checks for different function calls.
 #define CUDA_CHECK(condition) \
   /* Code block avoids redefinition of cudaError_t error */ \
@@ -45,4 +83,20 @@
     CHECK_EQ(error, cudaSuccess) << " " << cudaGetErrorString(error); \
   } while (0)
 
+#define CUBLAS_CHECK(condition) \
+  do { \
+    cublasStatus_t status = condition; \
+    CHECK_EQ(status, CUBLAS_STATUS_SUCCESS) << " " \
+      << cublasGetErrorString(status); \
+  } while (0)
+
+#define CURAND_CHECK(condition) \
+  do { \
+    curandStatus_t status = condition; \
+    CHECK_EQ(status, CURAND_STATUS_SUCCESS) << " " \
+      << curandGetErrorString(status); \
+  } while (0)
+
+
+#endif  // USE_CUDA
 #endif  // SINGA_UTILS_CUDA_UTILS_H_
diff --git a/include/singa/utils/factory.h b/include/singa/utils/factory.h
index 3af25f0..b53caef 100644
--- a/include/singa/utils/factory.h
+++ b/include/singa/utils/factory.h
@@ -7,9 +7,9 @@
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
-* 
+*
 *   http://www.apache.org/licenses/LICENSE-2.0
-* 
+*
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
@@ -22,11 +22,11 @@
 #ifndef SINGA_UTILS_FACTORY_H_
 #define SINGA_UTILS_FACTORY_H_
 
-#include <glog/logging.h>
 #include <functional>
 #include <map>
 #include <string>
 
+#include "singa/utils/logging.h"
 /**
  * Macro that creats a function which instantiate a subclass instance and
  * returns pointer to the base class.
@@ -41,7 +41,7 @@
  * 2. call Create() func to call the creation function and return
  * a pointer to the base calss.
  */
-template<typename T>
+template<typename T, typename ID = std::string>
 class Factory {
  public:
   /**
@@ -51,50 +51,47 @@
    * @param id Identifier of the creating function/class
    * @param func a function that creates a layer instance
    */
-  inline void Register(const std::string& id,
-                       const std::function<T*(void)>& func) {
-    CHECK(str2func_.find(id) == str2func_.end())
-      << "The id has been registered by another function";
-    str2func_[id] = func;
+  static void Register(const ID& id,
+                       const std::function<T*(void)>& creator) {
+    Registry* reg = GetRegistry();
+    // CHECK(reg->find(id) == reg->end())
+    //  << "The id " << id << " has been registered";
+    (*reg)[id] = creator;
   }
-  /**
-   * Register functions to create user defined classes.
-   * This function is called by the REGISTER_FACTORY macro.
-   *
-   * @param id Identifier of the creating function/class
-   * @param func a function that creates a layer instance
-   */
-  inline void Register(int id,
-                       const std::function<T*(void)>& func) {
-    CHECK(id2func_.find(id) == id2func_.end())
-      << "The id has been registered by another function";
-    id2func_[id] = func;
-  }
+
   /**
    * create an instance by providing its id
    *
    * @param id
    */
-  inline T* Create(const std::string& id) {
-    CHECK(str2func_.find(id) != str2func_.end())
+  static T* Create(const ID& id) {
+    Registry* reg = GetRegistry();
+    CHECK(reg->find(id) != reg->end())
       << "The creation function for " << id << " has not been registered";
-    return str2func_[id]();
+    return (*reg)[id]();
   }
-  /**
-   * create an instance by providing its id
-   *
-   * @param id
-   */
-  inline T* Create(int id) {
-    CHECK(id2func_.find(id) != id2func_.end())
-      << "The creation function for " << id << " has not been registered";
-    return id2func_[id]();
+
+  static const std::vector<ID> GetIDs() {
+    std::vector<ID> keys;
+    for (const auto entry : *GetRegistry())
+      keys.push_back(entry.first);
+    return keys;
   }
 
  private:
   // Map that stores the registered creation functions
-  std::map<std::string, std::function<T*(void)>> str2func_;
-  std::map<int, std::function<T*(void)>> id2func_;
+  typedef std::map<ID, std::function<T*(void)>> Registry;
+  static Registry* GetRegistry() {
+    static Registry reg;
+    return &reg;
+  }
 };
 
+template<typename Base, typename Sub, typename ID = std::string>
+class Registra {
+ public:
+  Registra(const ID& id) {
+    Factory<Base, ID>::Register(id, [](void) { return new Sub(); });
+  }
+};
 #endif  // SINGA_UTILS_FACTORY_H_
diff --git a/include/singa/utils/graph.h b/include/singa/utils/graph.h
deleted file mode 100644
index 2462808..0000000
--- a/include/singa/utils/graph.h
+++ /dev/null
@@ -1,196 +0,0 @@
-/************************************************************
-*
-* Licensed to the Apache Software Foundation (ASF) under one
-* or more contributor license agreements.  See the NOTICE file
-* distributed with this work for additional information
-* regarding copyright ownership.  The ASF licenses this file
-* to you under the Apache License, Version 2.0 (the
-* "License"); you may not use this file except in compliance
-* with the License.  You may obtain a copy of the License at
-*
-*   http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an
-* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-* KIND, either express or implied.  See the License for the
-* specific language governing permissions and limitations
-* under the License.
-*
-*************************************************************/
-
-#ifndef SINGA_UTILS_GRAPH_H_
-#define SINGA_UTILS_GRAPH_H_
-
-#include <stack>
-#include <string>
-#include <map>
-#include <vector>
-namespace singa {
-using std::string;
-using std::map;
-
-/**
- * Node class representing a layer in a neural net.
- *
- * TODO remove layer dependent fields, like origin, and partition_id, to make
- * it an independent and simple class.
- */
-class Node {
- public:
-  /**
-   * Node constructor.
-   *
-   * @param name identifier of the node, e.g, layer name.
-   */
-  explicit Node(string name);
-  /**
-   * Construct a node with specified attributes.
-   * @param name node identifier
-   * @param attrs node attributes for printing, including "shape", "color", etc.
-   * Depending on the visulization engine, if using graphviz, then the attribute
-   * list is http://www.graphviz.org/content/attrs.
-   */
-  Node(string name, const std::map<string, string>& attrs);
-  /**
-   * @deprecated {to make the Graph class an independent class.}
-   *
-   * Node constructor used for model partitioning.
-   *
-   * This node is a partition of some node.
-   * @param name node name
-   * @param origin  name of the original node
-   * @param id partition id of this node
-   * @param proto conf of the corresponding layer
-   */
-  Node(const string& name, const std::string& origin, int id, void* proto);
-  ~Node() {}  // the proto field is deleted outside by other functions
-
-
-  void AddDstNode(Node* dst);
-  void AddSrcNode(Node* src);
-  void RemoveDstNode(Node* dst);
-  void RemoveSrcNode(Node* src);
-
-  string name = "";
-  //! name of the origin node/layer from which is node is derived
-  string origin = "";
-  //! partition id
-  int partition_id = 0;
-  //! proto of the corresponding layer
-  void* proto = nullptr;
-  std::vector<Node*> srcnodes;
-  std::vector<Node*> dstnodes;
-  //!< node attribute including shape, color, etc.
-  std::map<string, string> attrs;
-};
-
-/**
- * Neuralnet is constructed by creating a graph with each node representing one
- * layer at first. After topology sort for graph nodes, layers are created and
- * connected.
- */
-class Graph {
- public:
-  Graph() {}
-  ~Graph();
-  const Graph Reverse() const;
-  /**
-   * @return all nodes of the graph
-   */
-  inline const std::vector<Node*>& nodes() const {
-    return nodes_;
-  }
-  /**
-   * @param name node name
-   * @return return the node of given name
-   */
-  inline Node* node(const string& name) const {
-    return name2node_.at(name);
-  }
-  /**
-   * Add an exiting node into this graph.
-   */
-  void AddNode(Node* node);
-  /**
-   * Creat an node with the given name and add it into the graph.
-   * @return the newly created node.
-   */
-  Node* AddNode(const string& name);
-  /**
-   * Create an node with the given name and attributes.
-   */
-  Node* AddNode(const string& name, const std::map<string, string>& attrs);
-  /**
-   * @deprecated {remove layer related info from node attrs}
-   * Add a node with given name and other info.
-   */
-  Node* AddNode(const std::string& name, const std::string& origin, int id,
-                void* proto);
-  /**
-   * Add an edge connecting the two given nodes.
-   */
-  void AddEdge(Node* srcnode, Node* dstnode);
-  /**
-   * Add an edge connecting the two nodes with the given name.
-   */
-  void AddEdge(const string& src, const std::string& dst);
-  /**
-   * Add an edge connecting the two given nodes, the edge attributes are also
-   * given.
-   */
-  void AddEdge(Node* srcnode, Node* dstnode,
-      const std::map<string, string>& attrs);
-  /**
-   * Add an edge connecting the two nodes with the given names, the edge
-   * attributes are also given, which are used for printing.
-   * http://www.graphviz.org/content/attrs
-   */
-  void AddEdge(const string& src, const std::string& dst,
-      const std::map<string, string>& attrs);
-
-  /**
-   * Remove the edge connecting the two given nodes.
-   */
-  void RemoveEdge(Node* src, Node* dst);
-  /**
-   * Remove the edge connecting two nodes with the given names.
-   */
-  void RemoveEdge(const string &src, const std::string& dst);
-  /**
-   * Dump the graph into json string which can be used to draw a picture by
-   * graphviz.
-   *
-   * It calls ToJson(const std::map<std::string, std::string>& label) with
-   * empty label mapping.
-   */
-  string ToJson() const;
-  /**
-   * \copybreif ToJson()
-   *
-   * @param label information to be displayed as label for each node
-   */
-  string ToJson(const map<std::string, std::string>& label) const;
-  /**
-   * Do topology sort for all nodes of the graph.
-   */
-  void Sort();
-
- private:
-  /**
-   *
-   * @return the name of the edge connecting src to dst
-   */
-  const string GetEdgeName(const string& src, const string& dst) const {
-    return src + "-->" + dst;
-  }
-
- private:
-  std::vector<Node*> nodes_;
-  std::map<string, Node*> name2node_;
-  std::map<string, std::map<string, string>> edge_attrs_;
-};
-
-}  // namespace singa
-
-#endif  // SINGA_UTILS_GRAPH_H_
diff --git a/include/singa/utils/image_transform.h b/include/singa/utils/image_transform.h
deleted file mode 100644
index 2867ad2..0000000
--- a/include/singa/utils/image_transform.h
+++ /dev/null
@@ -1,35 +0,0 @@
-/************************************************************
-*
-* Licensed to the Apache Software Foundation (ASF) under one
-* or more contributor license agreements.  See the NOTICE file
-* distributed with this work for additional information
-* regarding copyright ownership.  The ASF licenses this file
-* to you under the Apache License, Version 2.0 (the
-* "License"); you may not use this file except in compliance
-* with the License.  You may obtain a copy of the License at
-*
-*   http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an
-* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-* KIND, either express or implied.  See the License for the
-* specific language governing permissions and limitations
-* under the License.
-*
-*************************************************************/
-
-#ifndef SINGA_UTILS_IMAGE_TRANSFORM_H_
-#define SINGA_UTILS_IMAGE_TRANSFORM_H_
-
-#include <glog/logging.h>
-// TODO(wangwei) provide image transformation API, the implementation can be
-// done by opencv, manual transform, or mshadow.
-namespace singa {
-
-void ImageTransform(const float* in, const float* mean, bool mirror, int h_crop,
-    int w_crop, int h_offset, int w_offset, int channel, int height, int width,
-    float scale, float* out);
-}  // namespace singa
-
-#endif  // SINGA_UTILS_IMAGE_TRANSFORM_H_
diff --git a/include/singa/utils/integer.h b/include/singa/utils/integer.h
new file mode 100644
index 0000000..9c2799d
--- /dev/null
+++ b/include/singa/utils/integer.h
@@ -0,0 +1,73 @@
+/************************************************************
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ *
+ *************************************************************/
+
+#ifndef INTEGER_H_
+#define INTEGER_H_
+
+#include <cstdint>
+
+namespace singa{
+static bool isNetworkOrder() {
+    int test = 1;
+    return (1 != *(uint8_t*)&test);
+}
+
+template <typename T>
+static inline T byteSwap(const T& v) {
+    int size = sizeof(v);
+    T ret;
+    uint8_t *dest = reinterpret_cast<uint8_t *>(&ret);
+    uint8_t *src = const_cast<uint8_t*>(reinterpret_cast<const uint8_t*>(&v));
+    for (int i = 0; i < size; ++i) {
+        dest[i] = src[size - i - 1];
+    }
+    return ret;
+}
+
+template <typename T>
+static inline T hton(const T& v)
+{
+    return isNetworkOrder() ? v : byteSwap(v);
+}
+
+template <typename T>
+static inline T ntoh(const T& v) 
+{
+    return hton(v);
+}
+
+static inline int appendInteger(char* buf) {return 0;}
+static inline int readInteger(char* buf) {return 0;}
+
+template<typename Type, typename... Types>
+static int appendInteger(char* buf, Type value, Types... values) {
+    *(Type*)buf = hton(value);
+    return sizeof(Type) + appendInteger(buf + sizeof(Type), values...);
+}
+
+template<typename Type, typename... Types>
+static int readInteger(char* buf, Type& value, Types&... values) {
+    value = ntoh(*(Type*)buf);
+    return sizeof(Type) + readInteger(buf + sizeof(Type), values...);
+}
+
+}
+#endif
diff --git a/include/singa/utils/job_manager.h b/include/singa/utils/job_manager.h
deleted file mode 100644
index 7f1b4f1..0000000
--- a/include/singa/utils/job_manager.h
+++ /dev/null
@@ -1,79 +0,0 @@
-/************************************************************
-*
-* Licensed to the Apache Software Foundation (ASF) under one
-* or more contributor license agreements.  See the NOTICE file
-* distributed with this work for additional information
-* regarding copyright ownership.  The ASF licenses this file
-* to you under the Apache License, Version 2.0 (the
-* "License"); you may not use this file except in compliance
-* with the License.  You may obtain a copy of the License at
-* 
-*   http://www.apache.org/licenses/LICENSE-2.0
-* 
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an
-* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-* KIND, either express or implied.  See the License for the
-* specific language governing permissions and limitations
-* under the License.
-*
-*************************************************************/
-
-#ifndef SINGA_UTILS_JOB_MANAGER_H_
-#define SINGA_UTILS_JOB_MANAGER_H_
-
-#include <string>
-#include <vector>
-
-#ifdef USE_ZOOKEEPER
-#include "singa/utils/zk_service.h"
-#endif
-
-namespace singa {
-
-struct JobInfo {
-  int id;
-  int procs;
-  std::string name;
-};
-
-class JobManager {
- public:
-  // host is comma separated host:port pairs, each corresponding to a zk server.
-  // e.g. "127.0.0.1:3000,127.0.0.1:3001,127.0.0.1:3002"
-  explicit JobManager(const std::string& host);
-
-  // NOTICE: Init must be called once, before start to use other functions
-  bool Init();
-  // generate a unique job id
-  bool GenerateJobID(int* id);
-  // generate a list of hosts for a job conf
-  bool GenerateHostList(const char* host_file, const char* job_file,
-                        std::vector<std::string>* list);
-  // list all jobs recorded in zk
-  bool ListJobs(std::vector<JobInfo>* jobs);
-  // list running processes for a job
-  bool ListJobProcs(int job, std::vector<std::string>* procs);
-  // remove a job path in zk
-  bool Remove(int job);
-  // remove all job paths in zk
-  bool RemoveAllJobs();
-  // remove all singa related paths in zk
-  bool CleanUp();
-
- private:
-  const int kJobsNotRemoved = 10;
-
-  bool CleanPath(const std::string& path, bool remove);
-  std::string ExtractClusterConf(const char* job_file);
-
-  std::string host_ = "";
-#ifdef USE_ZOOKEEPER
-  int timeout_ = 30000;
-  ZKService zk_;
-#endif
-};
-
-}  // namespace singa
-
-#endif  // SINGA_UTILS_JOB_MANAGER_H_
diff --git a/include/singa/utils/logging.h b/include/singa/utils/logging.h
new file mode 100644
index 0000000..00ac02b
--- /dev/null
+++ b/include/singa/utils/logging.h
@@ -0,0 +1,293 @@
+/************************************************************
+*
+* Licensed to the Apache Software Foundation (ASF) under one
+* or more contributor license agreements.  See the NOTICE file
+* distributed with this work for additional information
+* regarding copyright ownership.  The ASF licenses this file
+* to you under the Apache License, Version 2.0 (the
+* "License"); you may not use this file except in compliance
+* with the License.  You may obtain a copy of the License at
+*
+*   http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing,
+* software distributed under the License is distributed on an
+* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+* KIND, either express or implied.  See the License for the
+* specific language governing permissions and limitations
+* under the License.
+*
+*************************************************************/
+
+// adapted from google::tensorflow::logging
+
+#ifndef SINGA_UTILS_LOGGING_H_
+#define SINGA_UTILS_LOGGING_H_
+
+#include <stdlib.h>
+#include <sstream>
+#include <string>
+#ifdef USE_GLOG
+#include <glog/logging.h>
+#endif
+
+namespace singa {
+
+/// Global functions for both glog and built-in log
+void InitLogging(const char *argv);
+/// Make it so that all log messages go only to stderr
+void LogToStderr();
+/// Make it so that all log messages of at least a particular severity are
+/// logged to stderr (in addtion to logging to the usual log files)
+void SetStderrLogging(int severity);
+/// Set the file name for logging (and disable logging to stderr)
+void SetLogDestination(int severity, const char* path);
+
+using std::string;
+
+const int INFO = 0;            // base_logging::INFO;
+const int WARNING = 1;         // base_logging::WARNING;
+const int ERROR = 2;           // base_logging::ERROR;
+const int FATAL = 3;           // base_logging::FATAL;
+const int NUM_SEVERITIES = 4;  // base_logging::NUM_SEVERITIES;
+
+#ifndef USE_GLOG
+namespace logging {
+
+class LogMessage : public std::basic_ostringstream<char> {
+ public:
+  LogMessage(const char* fname, int line, int severity);
+  ~LogMessage();
+
+ protected:
+  void GenerateLogMessage();
+  void DoLogging(FILE* file, const struct tm& tm_time);
+
+ private:
+  const char* fname_;
+  int line_;
+  int severity_;
+};
+
+// LogMessageFatal ensures the process will exit in failure after
+// logging this message.
+class LogMessageFatal : public LogMessage {
+ public:
+  LogMessageFatal(const char* file, int line);
+  ~LogMessageFatal();
+};
+
+#define _SINGA_LOG_INFO \
+  ::singa::logging::LogMessage(__FILE__, __LINE__, singa::INFO)
+#define _SINGA_LOG_WARNING \
+  ::singa::logging::LogMessage(__FILE__, __LINE__, singa::WARNING)
+#define _SINGA_LOG_ERROR \
+  ::singa::logging::LogMessage(__FILE__, __LINE__, singa::ERROR)
+#define _SINGA_LOG_FATAL \
+  ::singa::logging::LogMessageFatal(__FILE__, __LINE__)
+
+#define LOG(severity) _SINGA_LOG_##severity
+
+/// CHECK dies with a fatal error if condition is not true.  It is *not*
+/// controlled by NDEBUG, so the check will be executed regardless of
+/// compilation mode.  Therefore, it is safe to do things like:
+///    CHECK(fp->Write(x) == 4)
+#define CHECK(condition)              \
+  if (!(condition)) \
+  LOG(FATAL) << "Check failed: " #condition " "
+
+// Function is overloaded for integral types to allow static const
+// integrals declared in classes and not defined to be used as arguments to
+// CHECK* macros. It's not encouraged though.
+template <typename T>
+  inline const T& GetReferenceableValue(const T& t) {
+    return t;
+  }
+inline char GetReferenceableValue(char t) { return t; }
+inline unsigned char GetReferenceableValue(unsigned char t) { return t; }
+inline signed char GetReferenceableValue(signed char t) { return t; }
+inline short GetReferenceableValue(short t) { return t; }
+inline unsigned short GetReferenceableValue(unsigned short t) { return t; }
+inline int GetReferenceableValue(int t) { return t; }
+inline unsigned int GetReferenceableValue(unsigned int t) { return t; }
+inline long GetReferenceableValue(long t) { return t; }
+inline unsigned long GetReferenceableValue(unsigned long t) { return t; }
+inline long long GetReferenceableValue(long long t) { return t; }
+inline unsigned long long GetReferenceableValue(unsigned long long t) {
+  return t;
+}
+
+// This formats a value for a failing CHECK_XX statement.  Ordinarily,
+// it uses the definition for operator<<, with a few special cases below.
+template <typename T>
+inline void MakeCheckOpValueString(std::ostream* os, const T& v) {
+  (*os) << v;
+}
+
+// Overrides for char types provide readable values for unprintable
+// characters.
+template <>
+void MakeCheckOpValueString(std::ostream* os, const char& v);
+template <>
+void MakeCheckOpValueString(std::ostream* os, const signed char& v);
+template <>
+void MakeCheckOpValueString(std::ostream* os, const unsigned char& v);
+
+// We need an explicit specialization for std::nullptr_t.
+template <>
+void MakeCheckOpValueString(std::ostream* os, const std::nullptr_t& p);
+
+// A container for a string pointer which can be evaluated to a bool -
+// true iff the pointer is non-NULL.
+struct CheckOpString {
+  CheckOpString(string* str) : str_(str) {}
+  // No destructor: if str_ is non-NULL, we're about to LOG(FATAL),
+  // so there's no point in cleaning up str_.
+  operator bool() const { return str_ != NULL; }
+  string* str_;
+};
+
+// Build the error message string. Specify no inlining for code size.
+template <typename T1, typename T2>
+string* MakeCheckOpString(const T1& v1, const T2& v2,
+    const char* exprtext);
+
+// A helper class for formatting "expr (V1 vs. V2)" in a CHECK_XX
+// statement.  See MakeCheckOpString for sample usage.  Other
+// approaches were considered: use of a template method (e.g.,
+// base::BuildCheckOpString(exprtext, base::Print<T1>, &v1,
+// base::Print<T2>, &v2), however this approach has complications
+// related to volatile arguments and function-pointer arguments).
+class CheckOpMessageBuilder {
+ public:
+  // Inserts "exprtext" and " (" to the stream.
+  explicit CheckOpMessageBuilder(const char* exprtext);
+  // Deletes "stream_".
+  ~CheckOpMessageBuilder();
+  // For inserting the first variable.
+  std::ostream* ForVar1() { return stream_; }
+  // For inserting the second variable (adds an intermediate " vs. ").
+  std::ostream* ForVar2();
+  // Get the result (inserts the closing ")").
+  string* NewString();
+
+ private:
+  std::ostringstream* stream_;
+};
+
+template <typename T1, typename T2>
+string* MakeCheckOpString(const T1& v1, const T2& v2, const char* exprtext) {
+  CheckOpMessageBuilder comb(exprtext);
+  MakeCheckOpValueString(comb.ForVar1(), v1);
+  MakeCheckOpValueString(comb.ForVar2(), v2);
+  return comb.NewString();
+}
+
+// Helper functions for CHECK_OP macro.
+// The (int, int) specialization works around the issue that the compiler
+// will not instantiate the template version of the function on values of
+// unnamed enum type - see comment below.
+#define SINGA_DEFINE_CHECK_OP_IMPL(name, op)                         \
+  template <typename T1, typename T2>                                \
+  inline string* name##Impl(const T1& v1, const T2& v2,              \
+                            const char* exprtext) {                  \
+    if (v1 op v2)                                                    \
+      return NULL;                                                   \
+    else                                                             \
+      return ::singa::logging::MakeCheckOpString(v1, v2, exprtext); \
+  }                                                                  \
+  inline string* name##Impl(int v1, int v2, const char* exprtext) {  \
+    return name##Impl<int, int>(v1, v2, exprtext);                   \
+  }
+
+// We use the full name Check_EQ, Check_NE, etc. in case the file including
+// base/logging.h provides its own #defines for the simpler names EQ, NE, etc.
+// This happens if, for example, those are used as token names in a
+// yacc grammar.
+SINGA_DEFINE_CHECK_OP_IMPL(Check_EQ,
+                           == )  // Compilation error with CHECK_EQ(NULL, x)?
+SINGA_DEFINE_CHECK_OP_IMPL(Check_NE, != )  // Use CHECK(x == NULL) instead.
+SINGA_DEFINE_CHECK_OP_IMPL(Check_LE, <= )
+SINGA_DEFINE_CHECK_OP_IMPL(Check_LT, < )
+SINGA_DEFINE_CHECK_OP_IMPL(Check_GE, >= )
+SINGA_DEFINE_CHECK_OP_IMPL(Check_GT, > )
+#undef SINGA_DEFINE_CHECK_OP_IMPL
+
+// In optimized mode, use CheckOpString to hint to compiler that
+// the while condition is unlikely.
+#define CHECK_OP_LOG(name, op, val1, val2)                      \
+  while (::singa::logging::CheckOpString _result =              \
+             ::singa::logging::name##Impl(                      \
+                 ::singa::logging::GetReferenceableValue(val1), \
+                 ::singa::logging::GetReferenceableValue(val2), \
+                 #val1 " " #op " " #val2))                      \
+  ::singa::logging::LogMessageFatal(__FILE__, __LINE__) << *(_result.str_)
+
+#define CHECK_OP(name, op, val1, val2) CHECK_OP_LOG(name, op, val1, val2)
+
+// CHECK_EQ/NE/...
+#define CHECK_EQ(val1, val2) CHECK_OP(Check_EQ, ==, val1, val2)
+#define CHECK_NE(val1, val2) CHECK_OP(Check_NE, !=, val1, val2)
+#define CHECK_LE(val1, val2) CHECK_OP(Check_LE, <=, val1, val2)
+#define CHECK_LT(val1, val2) CHECK_OP(Check_LT, <, val1, val2)
+#define CHECK_GE(val1, val2) CHECK_OP(Check_GE, >=, val1, val2)
+#define CHECK_GT(val1, val2) CHECK_OP(Check_GT, >, val1, val2)
+#define CHECK_NOTNULL(val)                            \
+  ::singa::logging::CheckNotNull(__FILE__, __LINE__, \
+                                  "'" #val "' Must be non NULL", (val))
+
+#ifndef NDEBUG
+// DCHECK_EQ/NE/...
+#define DCHECK(condition) CHECK(condition)
+#define DCHECK_EQ(val1, val2) CHECK_EQ(val1, val2)
+#define DCHECK_NE(val1, val2) CHECK_NE(val1, val2)
+#define DCHECK_LE(val1, val2) CHECK_LE(val1, val2)
+#define DCHECK_LT(val1, val2) CHECK_LT(val1, val2)
+#define DCHECK_GE(val1, val2) CHECK_GE(val1, val2)
+#define DCHECK_GT(val1, val2) CHECK_GT(val1, val2)
+
+#else
+
+#define DCHECK(condition) \
+  while (false && (condition)) LOG(FATAL)
+
+// NDEBUG is defined, so DCHECK_EQ(x, y) and so on do nothing.
+// However, we still want the compiler to parse x and y, because
+// we don't want to lose potentially useful errors and warnings.
+// _DCHECK_NOP is a helper, and should not be used outside of this file.
+#define _SINGA_DCHECK_NOP(x, y) \
+  while (false && ((void)(x), (void)(y), 0)) LOG(FATAL)
+
+#define DCHECK_EQ(x, y) _SINGA_DCHECK_NOP(x, y)
+#define DCHECK_NE(x, y) _SINGA_DCHECK_NOP(x, y)
+#define DCHECK_LE(x, y) _SINGA_DCHECK_NOP(x, y)
+#define DCHECK_LT(x, y) _SINGA_DCHECK_NOP(x, y)
+#define DCHECK_GE(x, y) _SINGA_DCHECK_NOP(x, y)
+#define DCHECK_GT(x, y) _SINGA_DCHECK_NOP(x, y)
+
+#endif
+
+// These are for when you don't want a CHECK failure to print a verbose
+// stack trace.  The implementation of CHECK* in this file already doesn't.
+#define QCHECK(condition) CHECK(condition)
+#define QCHECK_EQ(x, y) CHECK_EQ(x, y)
+#define QCHECK_NE(x, y) CHECK_NE(x, y)
+#define QCHECK_LE(x, y) CHECK_LE(x, y)
+#define QCHECK_LT(x, y) CHECK_LT(x, y)
+#define QCHECK_GE(x, y) CHECK_GE(x, y)
+#define QCHECK_GT(x, y) CHECK_GT(x, y)
+
+template <typename T>
+T&& CheckNotNull(const char* file, int line, const char* exprtext, T&& t) {
+  if (t == nullptr) {
+    LogMessageFatal(file, line) << string(exprtext);
+  }
+  return std::forward<T>(t);
+}
+
+}  // namespace logging
+#endif
+
+}  // namespace singa
+
+#endif  // SINGA_UTILS_LOGGING_H_
diff --git a/include/singa/utils/math_addr.h b/include/singa/utils/math_addr.h
deleted file mode 100644
index cf1d227..0000000
--- a/include/singa/utils/math_addr.h
+++ /dev/null
@@ -1,279 +0,0 @@
-/************************************************************
-*
-* Licensed to the Apache Software Foundation (ASF) under one
-* or more contributor license agreements.  See the NOTICE file
-* distributed with this work for additional information
-* regarding copyright ownership.  The ASF licenses this file
-* to you under the Apache License, Version 2.0 (the
-* "License"); you may not use this file except in compliance
-* with the License.  You may obtain a copy of the License at
-*
-*   http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an
-* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-* KIND, either express or implied.  See the License for the
-* specific language governing permissions and limitations
-* under the License.
-*
-*************************************************************/
-
-#ifndef SINGA_UTILS_MATH_ADDR_H_
-#define SINGA_UTILS_MATH_ADDR_H_
-
-extern "C" {
-#include <cblas.h>
-}
-#ifdef USE_GPU
-#include <cuda_runtime.h>
-#include <cublas_v2.h>
-#endif
-
-#include "singa/utils/singa_op.h"
-
-namespace singa {
-template<typename Dtype>
-Dtype cpu_asum(int n, const Dtype* A, int inc) {
-  return cblas_sasum(n, A, inc);
-}
-
-template<typename Dtype>
-void cpu_gemm(const Dtype * A, const Dtype * B,
-    const int m, const int n, const int k, const Dtype alpha, const Dtype beta,
-    const bool TranA, const bool TranB, Dtype * C) {
-  int lda, ldb;
-  CBLAS_TRANSPOSE tA, tB;
-  lda = TranA ? m : k;
-  ldb = TranB ? k : n;
-  tA = TranA ? CblasTrans : CblasNoTrans;
-  tB = TranB ? CblasTrans : CblasNoTrans;
-  cblas_sgemm(CblasRowMajor, tA, tB, m, n, k, alpha, A, lda,
-      B, ldb, beta, C, n);
-}
-
-// should be very careful:
-// m is the length of B, and n is the length of C , A is a n*m matrix
-template<typename Dtype>
-void cpu_gemv(const Dtype * A, const Dtype * B, const int m, const int n,
-    const Dtype alpha, const Dtype beta, const bool TranA, Dtype * C) {
-  CBLAS_TRANSPOSE tA;
-  tA = TranA ? CblasTrans : CblasNoTrans;
-  cblas_sgemv(CblasRowMajor, tA, m, n, alpha, A, n, B, 1, beta, C, 1);
-}
-
-template<typename Dtype>
-void cpu_axpy(const int n, const Dtype alpha, const Dtype * A, Dtype * B) {
-  cblas_saxpy(n, alpha, A, 1, B, 1);
-}
-
-template<typename Dtype>
-void cpu_scale(const int n, const Dtype alpha, Dtype * A) {
-  cblas_sscal(n, alpha, A, 1);
-}
-
-template<typename Dtype>
-void cpu_copy(const int n, const Dtype* A, Dtype *B) {
-  cblas_scopy(n, A, 1, B, 1);
-}
-
-template<typename Dtype>
-Dtype cpu_dot(const int n, const Dtype * A, const Dtype * B) {
-  Dtype sum = 0;
-  for (int i = 0 ; i < n ; i++)
-    sum += A[i] * B[i];
-  return sum;
-}
-
-// element-wise
-template<typename Op, typename Dtype>
-void cpu_e_f(const int n, const Dtype * A, Dtype * B) {
-  for (int i = 0 ; i < n ; i++) {
-    Op::Map(A[i], &B[i]);
-  }
-}
-
-template<typename Op, typename Dtype>
-void cpu_e_f(const int n, const Dtype * A, const Dtype * B, Dtype * C) {
-  for (int i = 0 ; i < n ; i++) {
-    Op::Map(A[i], B[i], &C[i]);
-  }
-}
-template<typename Op, typename Dtype>
-void cpu_e_f(const int n, const Dtype alpha, const Dtype * A, Dtype * B) {
-  for (int i = 0 ; i < n ; i++) {
-    Op::Map(alpha, A[i], &B[i]);
-  }
-}
-
-template<typename Op, typename Dtype>
-void cpu_e_f(const int n, const Dtype alpha, const Dtype * A, const Dtype * B,
-    Dtype * C) {
-  for (int i = 0 ; i < n ; i++) {
-    Op::Map(alpha, A[i], B[i], &C[i]);
-  }
-}
-// element-wise generalized operation defined in Op
-
-
-// matrix/vector expand/reduce
-
-template<typename Op, typename Dtype>
-void cpu_reduce_f(const Dtype * A, const int m, const int n, Dtype * B) {
-  for (int i = 0 ; i < m ; i++) {
-    Op::Map(A+i*n, n, B[i]);
-  }
-}
-// reduce each row of A to an element of B e.g. the sum operation in softmax
-template<typename Op, typename Dtype>
-void cpu_expand_f(const Dtype * A, const int m, const int n, Dtype * B) {
-  for (int i = 0 ; i < m ; i++) {
-    Op::Map(A[i], n, B+i*n);
-  }
-}
-
-
-template<typename Dtype>
-void cpu_softmax(int nb_rows, int nb_cols, const Dtype* A, Dtype* B) {
-  for (int i = 0; i < nb_rows; i++) {
-    const Dtype* dptr = A + i * nb_cols;
-    Dtype mmax = dptr[0];
-    for (int x = 1; x < nb_cols; ++x)
-      if (mmax < dptr[x]) mmax = dptr[x];
-    Dtype sum = 0.0f;
-    for (int x = 0; x < nb_cols; ++x) {
-      dptr[x] = std::exp(dptr[x] - mmax);
-      sum += dptr[x];
-    }
-    for (int x = 0; x < nb_cols; ++x) {
-      dptr[x] /= sum;
-    }
-  }
-}
-
-
-
-template<typename Dtype, typename URNG>
-void cpu_sample_uniform(URNG& g, int n, Dtype low, Dtype high, Dtype* A) {
-  std::uniform_real_distribution<Dtype> distribution(low, high);
-  for (int i = 0; i < n; i++)
-    A[i] = distribution(g);
-}
-
-template<typename Dtype, typename URNG>
-void cpu_sample_gaussian(URNG& g, int n, Dtype mean, Dtype std, Dtype* A) {
-  std::normal_distribution<Dtype> distribution(mean, std);
-  for (int i = 0; i < n; i++)
-    A[i] = distribution(g);
-}
-
-#ifdef USE_GPU
-template<typename Dtype>
-Dtype gpu_asum(cublasHandle_t handle, int n, const Dtype* A, int inc) {
-  Dtype result = 0.0;
-  cublasSasum(handle, n, A, inc, &result);
-  return result;
-}
-
-template<typename Dtype>
-void gpu_gemm(cublasHandle_t handle, const Dtype * A, const Dtype * B,
-    const int m, const int n, const int k, const Dtype alpha, const Dtype beta,
-    const bool TranA, const bool TranB, Dtype * C) {
-  int lda = TranA ? m : k;
-  int ldb = TranB ? k : n;
-  int ldc = n;
-  cublasOperation_t tA = (TranA == false) ? CUBLAS_OP_N : CUBLAS_OP_T;
-  cublasOperation_t tB = (TranB == false) ? CUBLAS_OP_N : CUBLAS_OP_T;
-  cublasSgemm(handle, tB, tA, n, m, k, &alpha, B, ldb,
-      A, lda, &beta, C, ldc);
-}
-
-template<typename Dtype>
-void gpu_gemv(cublasHandle_t handle, const Dtype * A, const Dtype * B,
-    const int m, const int n, const Dtype alpha, const Dtype beta,
-    const bool TranA, Dtype * C) {
-  int lda = n;
-  cublasOperation_t tA = (TranA == true) ? CUBLAS_OP_N : CUBLAS_OP_T;
-  cublasSgemv(handle, tA, n, m, &alpha , A, lda, B, 1, &beta, C, 1);
-}
-
-template<typename Dtype>
-void gpu_axpy(cublasHandle_t handle, const int n, const Dtype alpha,
-    const Dtype * A, Dtype * B) {
-  cublasSaxpy(handle, n, &alpha, A, 1, B, 1);
-}
-
-template<typename Dtype>
-void gpu_scale(cublasHandle_t handle, const int n, const Dtype alpha,
-    Dtype * A) {
-  cublasSscal(handle, n, &alpha, A, 1);
-}
-
-template<typename Dtype>
-Dtype gpu_dot(cublasHandle_t handle, const int n, const Dtype * A,
-    const Dtype * B) {
-  Dtype result = 0.0;
-  cublasSdot(handle, n, A, 1, B, 1, &result);
-  return result;
-}
-
-// element-wise
-template<typename Op, typename Dtype>
-void gpu_e_f(const int n, const Dtype alpha, Dtype * A) {
-  Op::CudaMap(alpha, A, n);
-}
-
-template<typename Op, typename Dtype>
-void gpu_e_f(const int n, const Dtype * A, Dtype * B) {
-  Op::CudaMap(A, B, n);
-}
-
-template<typename Op, typename Dtype>
-void gpu_e_f(const int n, const Dtype * A, const Dtype * B, Dtype * C) {
-  Op::CudaMap(A, B, C, n);
-}
-
-template<typename Op, typename Dtype>
-void gpu_e_f(const int n, const Dtype alpha, const Dtype * A, Dtype * B) {
-  Op::CudaMap(alpha, A, B, n);
-}
-
-template<typename Op, typename Dtype>
-void gpu_e_f(const int n, const Dtype alpha, const Dtype beta,
-  const Dtype * A, const Dtype * B, Dtype * C) {
-  Op::CudaMap(alpha, beta, A, B, C, n);
-}
-// element-wise generalized operation defined in Op
-
-// matrix/vector expand/reduce
-
-template<typename Op, typename Dtype>
-void gpu_reduce_f(const Dtype * A, const int m, const int n, Dtype * B) {
-  for (int i = 0 ; i < m ; i++) {
-    Op::CudaMap(A+i*n, n, B[i]);
-  }
-}
-// reduce each row of A to an element of B e.g. the sum operation in softmax
-template<typename Op, typename Dtype>
-void gpu_expand_f(const Dtype * A, const int m, const int n, Dtype * B) {
-  for (int i = 0 ; i < m ; i++) {
-    Op::CudaMap(A[i], n, B+i*n);
-  }
-}
-
-
-template<typename Dtype, typename URNG>
-void gpu_sample_uniform(URNG g, int n, Dtype low, Dtype high, Dtype* A) {
-  curandGenerateUniform(g, A, n);
-}
-
-template<typename Dtype, typename URNG>
-void gpu_sample_gaussian(URNG g, int n, Dtype mean, Dtype std, Dtype* A) {
-  curandGenerateNormal(g, A, n, mean, std);
-}
-
-// expand each element in A into a row of B
-#endif  // USE_GPU
-
-}  // namespace singa
-#endif  // SINGA_UTILS_MATH_ADDR_H_
diff --git a/include/singa/utils/math_blob.h b/include/singa/utils/math_blob.h
deleted file mode 100644
index abe7722..0000000
--- a/include/singa/utils/math_blob.h
+++ /dev/null
@@ -1,762 +0,0 @@
-/************************************************************
-*
-* Licensed to the Apache Software Foundation (ASF) under one
-* or more contributor license agreements.  See the NOTICE file
-* distributed with this work for additional information
-* regarding copyright ownership.  The ASF licenses this file
-* to you under the Apache License, Version 2.0 (the
-* "License"); you may not use this file except in compliance
-* with the License.  You may obtain a copy of the License at
-*
-*   http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an
-* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-* KIND, either express or implied.  See the License for the
-* specific language governing permissions and limitations
-* under the License.
-*
-*************************************************************/
-
-#ifndef SINGA_UTILS_MATH_BLOB_H_
-#define SINGA_UTILS_MATH_BLOB_H_
-
-#include <vector>
-#include <algorithm>
-#include <thread>
-#include "singa/utils/blob.h"
-#include "singa/utils/singa_op.h"
-#include "singa/utils/math_addr.h"
-#include "singa/utils/singleton.h"
-#include "singa/utils/context.h"
-
-namespace singa {
-
-#define NO_GPU LOG(FATAL) << "Not compiled with GPU";
-/**
- * \file math_blob.h is not tested thorough.
- * Only GEMM() and MMDot() MVSumRow() andMVAddRow() are used now.
- */
-/************* BLAS level 1 *****************/
-/**
- * Scale each element of A with alpha, and put the result into A.
- * Ai = alpha*Ai
- * Use blas scale internally.
- */
-template<typename Dtype>
-void Scale(Dtype alpha, Blob<Dtype> * B) {
-  auto context = Singleton<Context>::Instance();
-  int device = context->device_id(std::this_thread::get_id());
-  if (device < 0) {
-    cpu_scale(B->count(), alpha, B->mutable_cpu_data());
-  } else {
-#ifdef USE_GPU
-    gpu_scale(context->cublas_handle(device), B->count(), alpha,
-        B->mutable_gpu_data());
-#else
-    NO_GPU;
-#endif
-  }
-}
-
-/**
- * Element-wise operation: Bi = alpha*Ai+Bi. A and B should have the same size
- */
-template<typename Dtype>
-void AXPY(Dtype alpha, const Blob<Dtype> & A, Blob<Dtype> * B) {
-  CHECK_EQ(A.count(), B->count());
-  auto context = Singleton<Context>::Instance();
-  int device = context->device_id(std::this_thread::get_id());
-  if (device < 0) {
-    cpu_axpy(A.count(), alpha, A.cpu_data(), B->mutable_cpu_data());
-  } else {
-#ifdef USE_GPU
-    gpu_axpy(context->cublas_handle(device), A.count(), alpha, A.gpu_data(),
-        B->mutable_gpu_data());
-#else
-    NO_GPU;
-#endif
-  }
-}
-
-/************* BLAS level 2 *****************/
-/**
- * Matrix vector multiplication, C = alpha A(.T) * B + beta C.
- * Loose shape checking:
- * - dim of A >=2
- * - row of A is shape(0) (no transpose)
- * - column of A(.T) == B.count()
- * - rows of A(.T) == C.count()
- *
- * @param[in] alpha
- * @param[in] beta
- * @param[in] A, matrix
- * @param[in] B, vector
- * @param[in, out] C, vector
- */
-template<typename Dtype>
-void GEMV(Dtype alpha, Dtype beta, const Blob<Dtype>& A,
-    const Blob<Dtype>& B, Blob<Dtype>* C) {
-  CHECK_EQ(A.shape().size(), 2);
-  int a1, a2, m, n;
-  a1 = A.transpose() ? A.count() / A.shape(0) : A.shape(0);
-  a2 = A.transpose() ? A.shape(0) : A.count() / A.shape(0);
-  m = B.count();
-  n = C->count();
-  CHECK_EQ(a2, m) << "# columns of A(.T) must = length of B";
-  CHECK_EQ(a1, n) << "# rows of A(.T) must = length of C";
-
-  bool TranA = A.transpose();
-  auto context = Singleton<Context>::Instance();
-  int device = context->device_id(std::this_thread::get_id());
-  if (device < 0) {
-    cpu_gemv(A.cpu_data(), B.cpu_data(), m, n, alpha, beta, TranA,
-        C->mutable_cpu_data());
-  } else {
-#ifdef USE_GPU
-    gpu_gemv(context->cublas_handle(device), A.gpu_data(), B.gpu_data(), m, n,
-        alpha, beta, TranA, C->mutable_gpu_data());
-#else
-    NO_GPU;
-#endif  // USE_GPU
-  }
-}
-/**
- * Matrix vector multiplication, C = A(.T) * B, transpose is considered.
- * Loose shape checking:
- * - dim of A >=2
- * - A.count() % B.count() == 0
- * - B.count() == C.count()
- *
- * @param[in] A input matrix
- * @param[in] B input vector
- * @param[out] C output vector
- */
-template <typename Dtype>
-void MVDot(const Blob<Dtype>& A, const Blob<Dtype>& B,
-    Blob<Dtype>* C) {
-  GEMV(Dtype(1), Dtype(0), A, B, C);
-}
-
-/************* BLAS level 3 *****************/
-/**
- * Matrix multiplication, C = alpha A*B + beta C, A, B and C are matrix.
- *
- * Tranpose is considered for A and B.
- * Loose shape checking:
- * - the first dimension is row (no transpose) or col (with transpose) size
- * - shapes match for matrix multiplication
- *
- * @param[in] alpha
- * @param[in] beta
- * @param[in] A, matrix
- * @param[in] B, matrix
- * @param[in, out] C, matrix
- */
-template <typename Dtype>
-void GEMM(Dtype alpha, Dtype beta, const Blob<Dtype>& A, const Blob<Dtype>& B,
-    Blob<Dtype> * C) {
-  CHECK_GE(A.shape().size(), 2);
-  CHECK_GE(B.shape().size(), 2);
-  CHECK_GE(C->shape().size(), 2);
-  int a1, a2, b1, b2, m, n;
-  CHECK(!C->transpose());
-  a1 = A.transpose() ? A.count() / A.shape(0) : A.shape(0);
-  a2 = A.count() / a1;
-  b1 = B.transpose() ? B.count() /B.shape(0) : B.shape(0);
-  b2 = B.count() / b1;
-  m = C->shape(0);
-  n = C->count() / m;
-  CHECK_EQ(a2, b1);
-  CHECK_EQ(a1, m);
-  CHECK_EQ(b2, n);
-
-  int k = a2;
-  bool TranA = A.transpose();
-  bool TranB = B.transpose();
-  auto context = Singleton<Context>::Instance();
-  int device = context->device_id(std::this_thread::get_id());
-  if (device < 0) {
-    cpu_gemm(A.cpu_data(), B.cpu_data(), m, n, k, alpha, beta, TranA, TranB,
-        C->mutable_cpu_data());
-  } else {
-#ifdef USE_GPU
-    gpu_gemm(context->cublas_handle(device), A.gpu_data(), B.gpu_data(),
-        m, n, k, alpha, beta, TranA, TranB, C->mutable_gpu_data());
-#else
-    NO_GPU;
-#endif  // USE_GPU
-  }
-}
-/**
- * Matrix multiplication, C = A(.T) * B(.T), transpose is considered.
- * Strict shape checking:
- * - all are matrix
- * - shapes match for matrix multiplication
- *
- * @param[in] A input matrix
- * @param[in] B input matrix
- * @param[out] C output matrix
- */
-template <typename Dtype>
-void MMDot(const Blob<Dtype>& A, const Blob<Dtype>& B,
-    Blob<Dtype>* C) {
-  GEMM(Dtype(1), Dtype(0), A, B, C);
-}
-
-
-/*********************** Inner and Outer product****************************/
-/**
- * Inner product for two vectors.
- * Loose shape checking, A.count() == B.count.
- *
- * @param[in] A, input vector (shape checking using A.count()).
- * @param[in] B, input vector (shape checking using B.count()).
- * @return inner product value.
- */
-template <typename Dtype>
-Dtype VVDot(const Blob<Dtype> & A, const Blob<Dtype> & B) {
-  Dtype res = 0;
-  CHECK_EQ(A.count(), B.count());
-  int n = A.count();
-  auto context = Singleton<Context>::Instance();
-  int device = context->device_id(std::this_thread::get_id());
-  if (device < 0) {
-    res = cpu_dot(n, A.cpu_data(), B.cpu_data());
-  } else {
-#ifdef USE_GPU
-    res = gpu_dot(context->cublas_handle(device), n, A.gpu_data(),
-        B.gpu_data());
-#else
-    NO_GPU;
-#endif  // USE_GPU
-  }
-  return res;
-}
-
-/**
- * Outer product, C = A ** B, transpose is disabled.
- * Loose shape checking, A.count() * B.count() == C.count()
- *
- * @param[in] A, input vector
- * @param[in] B, input vector
- * @param[out] C, output matrix
- */
-template <typename Dtype>
-void OuterProduct(const Blob<Dtype>& A, const Blob<Dtype>& B, Blob<Dtype> * C) {
-  CHECK(!C->transpose());  // do not support C.T now.
-
-  int m = A.count();
-  int n = B.count();
-  CHECK_EQ(C->count(), m * n);
-  auto context = Singleton<Context>::Instance();
-  int device = context->device_id(std::this_thread::get_id());
-  if (device < 0) {
-    cpu_gemm(A.cpu_data(), B.cpu_data(), m, n, 1, Dtype(1), Dtype(0), false,
-        false, C->mutable_cpu_data());
-  } else {
-#ifdef USE_GPU
-    gpu_gemm(context->cublas_handle(device), A.gpu_data(), B.gpu_data(),
-        m, n, 1, Dtype(1), Dtype(0), false, false, C->mutable_gpu_data());
-#else
-    NO_GPU;
-#endif  // USE_GPU
-  }
-}
-/*********************** Element-wise functions ***********************/
-/**
- * Apply the function from Op for each element in A and put the result into B,
- * i.e., Bi = Op(Ai).
- * Loose shape checking, A.count() == B.count().
- */
-template<typename Op, typename Dtype>
-void Map(const Blob<Dtype> & A, Blob<Dtype> * B) {
-  CHECK_EQ(A.count(), B->count()) << "Blobs must have the same size";
-  auto context = Singleton<Context>::Instance();
-  int device = context->device_id(std::this_thread::get_id());
-  if (device < 0) {
-    cpu_e_f<Op>(A.count(), A.cpu_data(), B->mutable_cpu_data());
-  } else {
-#ifdef USE_GPU
-    gpu_e_f<Op>(A.count(), A.gpu_data(), B->mutable_gpu_data());
-#else
-    NO_GPU;
-#endif  // USE_GPU
-  }
-}
-
-/**
- * Apply the function from Op for each element in A and B, and put the result
- * into C, i.e., Ci = Op(Ai, Bi).
- * Loose shape checking, A, B and C are of the same size.
- */
-template<typename Op, typename Dtype>
-void Map(const Blob<Dtype> & A, const Blob<Dtype> & B, Blob<Dtype> * C) {
-  CHECK_EQ(A.count(), B.count()) << "Blobs must have the same size";
-  CHECK_EQ(A.count(), C->count()) << "Blobs must have the same size";
-  // cpu_e_f<Op>(A.count(), A.cpu_data(), B.cpu_data(), C->mutable_cpu_data());
-  auto context = Singleton<Context>::Instance();
-  int device = context->device_id(std::this_thread::get_id());
-  if (device < 0) {
-    cpu_e_f<Op>(A.count(), A.cpu_data(), B.cpu_data(), C->mutable_cpu_data());
-  } else {
-#ifdef USE_GPU
-    gpu_e_f<Op>(A.count(), A.gpu_data(), B.gpu_data(), C->mutable_gpu_data());
-#else
-    NO_GPU;
-#endif  // USE_GPU
-  }
-}
-
-/**
- * Bi = Op(alpha, Ai)
- * Loose shape checking, A.count() == B.count().
- */
-template<typename Op, typename Dtype>
-void Map(Dtype alpha, const Blob<Dtype>& A, Blob<Dtype>* B) {
-  CHECK_EQ(A.count(), B->count()) << "Blobs must have the same size";
-  auto context = Singleton<Context>::Instance();
-  int device = context->device_id(std::this_thread::get_id());
-  if (device < 0) {
-    cpu_e_f<Op>(A.count(), alpha, A.cpu_data(), B->mutable_cpu_data());
-  } else {
-#ifdef USE_GPU
-    gpu_e_f<Op>(A.count(), alpha, A.gpu_data(), B->mutable_gpu_data());
-#else
-    NO_GPU;
-#endif  // USE_GPU
-  }
-}
-/**
- * Ci = Op(alpha, Ai, Bi)
- * Loose shape checking, A, B and C are of the same size.
- */
-template<typename Op, typename Dtype>
-void Map(Dtype alpha, const Blob<Dtype>& A, const Blob<Dtype>& B,
-    Blob<Dtype>* C) {
-  CHECK_EQ(A.count(), B->count()) << "Blobs must have the same size";
-  auto context = Singleton<Context>::Instance();
-  int device = context->device_id(std::this_thread::get_id());
-  if (device < 0) {
-    cpu_e_f<Op>(A.count(), alpha, A.cpu_data(), B->cpu_data(),
-        C->mutable_cpu_data());
-  } else {
-    // TODO(wangwei) implement gpu version.
-    NO_GPU;
-  }
-}
-
-/**
- * Currently use std::copy which has shown better performance than memcpy.
- * http://stackoverflow.com/questions/4707012/c-memcpy-vs-stdcopy
- * TODO(wangwei) test blas copy vs std::copy.
- *
- * Loose shape checking, A.count() == B.count().
- */
-template<typename Dtype>
-void Copy(const Blob<Dtype>& A, Blob<Dtype>* B) {
-  CHECK_EQ(A.count(), B->count()) << "Blobs must have the same size";
-  auto context = Singleton<Context>::Instance();
-  int device = context->device_id(std::this_thread::get_id());
-  if (device < 0) {
-    std::copy(A.cpu_data(), A.cpu_data() + A.count(), B->mutable_cpu_data());
-  } else {
-#ifdef USE_GPU
-  CUDA_CHECK(cudaMemcpy(static_cast<Dtype*>(B->mutable_gpu_data()),
-             A.gpu_data(), sizeof(Dtype) * A.count(), cudaMemcpyDefault));
-#else
-  NO_GPU;
-#endif
-  }
-}
-
-
-/**
- * B = alpha + A
- * Implemented using Copy and AXPY.
- */
-template<typename Dtype>
-void Add(Dtype alpha,  const Blob<Dtype> & A, Blob<Dtype> * B) {
-  Map<singa::op::Add<Dtype>, Dtype>(alpha, A, B);
-}
-
-/**
- * C = A + B
- * Implemented using Copy and AXPY.
- */
-template<typename Dtype>
-void Add(const Blob<Dtype> & A, const Blob<Dtype> & B,
-    Blob<Dtype> * C) {
-  Copy(A, C);
-  AXPY(Dtype(1), B, C);
-}
-
-/**
- * B = alpha - A
- * Implemented using Copy and AXPY.
- */
-template<typename Dtype>
-void Sub(Dtype alpha, const Blob<Dtype> & A, Blob<Dtype>* B) {
-  Map<singa::op::Sub<Dtype>, Dtype>(alpha, A, B);
-}
-
-/**
- * C = A - B
- * Implemented using Copy and AXPY.
- */
-template<typename Dtype>
-void Sub(const Blob<Dtype> & A, const Blob<Dtype> & B,
-    Blob<Dtype> * C) {
-  Copy(A, C);
-  AXPY(Dtype(-1), B, C);
-}
-
-/**
- * C = A * B, implemented using
- * Map(const Blob<Dtype>&, const Blob<Dtype>&, Blob<Dtype>*).
- */
-template<typename Dtype>
-void Mult(const Blob<Dtype> & A, const Blob<Dtype> & B,
-    Blob<Dtype> * C) {
-  Map<singa::op::Mult<Dtype>, Dtype>(A, B, C);
-  // TODO(wangwei) use MKL's vector func
-}
-
-/**
- * C = A / B, implemented using
- * Map(const Blob<Dtype>&, const Blob<Dtype>&, Blob<Dtype>*).
- */
-template<typename Dtype>
-void Div(const Blob<Dtype> & A, const Blob<Dtype> & B,
-    Blob<Dtype> * C) {
-  Map<singa::op::Div<Dtype>, Dtype>(A, B, C);
-  // TODO(wangwei) use MKL's vector func
-}
-/**
- * B = sqrt(A)
- */
-template<typename Dtype>
-void Sqrt(const Blob<Dtype> & A, Blob<Dtype>* B) {
-  Map<singa::op::Sqrt<Dtype>, Dtype>(A, B);
-}
-/**
- * B = square(A)
- */
-template<typename Dtype>
-void Square(const Blob<Dtype> & A, Blob<Dtype>* B) {
-  Map<singa::op::Square<Dtype>, Dtype>(A, B);
-}
-/**
- * B = exp(A)
- */
-template<typename Dtype>
-void Exp(const Blob<Dtype> & A, Blob<Dtype>* B) {
-  Map<singa::op::Exp<Dtype>, Dtype>(A, B);
-}
-/**
- * B = log(A)
- */
-template<typename Dtype>
-void Log(const Blob<Dtype>& A, Blob<Dtype>* B) {
-  Map<singa::op::Log<Dtype>, Dtype>(A, B);
-}
-/**
- * B = tanh(A)
- */
-template<typename Dtype>
-void Tanh(const Blob<Dtype>& A, Blob<Dtype>* B) {
-  Map<singa::op::Tanh<Dtype>, Dtype>(A, B);
-}
-/*************************1D<-->2D op/transform***************************/
-/**
- * Add A to each column of B, i.e., Bij = alpha*Ai + beta*Bij
- * Loose shape checking, B.count() % A.count() == 0.
- * # columns of B = B.count() / A.count().
- */
-template<typename Dtype>
-void MVAddCol(Dtype alpha, Dtype beta, const Blob<Dtype> & A, Blob<Dtype> * B) {
-  if (B->transpose()) {
-    B->set_transpose(false);
-    MVAddRow(alpha, beta, A, B);
-    B->set_transpose(true);
-  } else {
-    CHECK_EQ(B->count() % A.count(), 0) << "#col of B not match length of A";
-    int m = A.count(), n = B->count() / m;
-    Blob<Dtype> one(n);
-    one.SetValue(1);
-    auto context = Singleton<Context>::Instance();
-    int device = context->device_id(std::this_thread::get_id());
-    if (device < 0) {
-      cpu_gemm(A.cpu_data(), one.cpu_data(), m, n, 1, alpha, beta, false, false,
-          B->mutable_cpu_data());
-    } else {
-#ifdef USE_GPU
-      gpu_gemm(context->cublas_handle(device), A.gpu_data(), one.gpu_data(), m,
-          n, 1, alpha, beta, false, false, B->mutable_gpu_data());
-#else
-      NO_GPU;
-#endif  // USE_GPU
-    }
-  }
-}
-/**
- * Add A to each column of B, i.e., Bij = Ai + Bij
- * Loose shape checking, B.count() % A.count() == 0.
- * # columns of B = B.count() / A.count().
- */
-template<typename Dtype>
-void MVAddCol(const Blob<Dtype> & A, Blob<Dtype>* B) {
-  MVAddCol(Dtype(1), Dtype(1), A, B);
-}
-
-/**
- * Add A to each row of B, i.e., Bij = alpha*Aj + beta*Bij
- * Loose shape checking, B.count() % A.count() == 0.
- * # rows of B = B.count() / A.count().
- */
-template<typename Dtype>
-void MVAddRow(Dtype alpha, Dtype beta, const Blob<Dtype> & A, Blob<Dtype> * B) {
-  if (B->transpose()) {
-    B->set_transpose(false);
-    MVAddCol(alpha, beta, A, B);
-    B->set_transpose(true);
-  } else {
-    CHECK_EQ(B->count() % A.count(), 0) << "#col of B not match length of A";
-    int n = A.count(), m = B->count() / n;
-    auto context = Singleton<Context>::Instance();
-    int device = context->device_id(std::this_thread::get_id());
-    if (device < 0) {
-      Blob<Dtype> one(m);
-      one.SetValue(1);
-      cpu_gemm(one.cpu_data(), A.cpu_data(), m, n, 1, alpha, beta,
-          false, false, B->mutable_cpu_data());
-    } else {
-#ifdef USE_GPU
-      singa_gpu_add_vec_row(A.gpu_data(), B->gpu_data(), B->mutable_gpu_data(),
-          m, n, n);
-#else
-      NO_GPU;
-#endif  // USE_GPU
-    }
-  }
-}
-/**
- * Add A to each row of B, i.e., Bij = Aj + Bij
- * Loose shape checking, B.count() % A.count() == 0.
- * # rows of B = B.count() / A.count().
- */
-template<typename Dtype>
-void MVAddRow(const Blob<Dtype> & A, Blob<Dtype>* B) {
-  MVAddRow(Dtype(1), Dtype(1), A, B);
-}
-
-/**
- * Copy A to each column of B, i.e., Bij = Ai
- * Loose shape checking, B.count() % A.count() == 0,
- * # columns of B = B.count() / A.count().
- */
-template<typename Dtype>
-void RepmatCol(const Blob<Dtype> & A, Blob<Dtype> * B) {
-  MVAddCol(Dtype(1), Dtype(0), A, B);
-}
-
-/**
- * Copy A to each row of B, i.e., Bij = Aj
- * Loose shape checking, B.count() % A.count() == 0,
- * # rows of B = B.count() / A.count().
- */
-template<typename Dtype>
-void RepmatRow(const Blob<Dtype> & A, Blob<Dtype> * B) {
-  MVAddRow(Dtype(1), Dtype(0), A, B);
-}
-
-/**
- * Sum all columns of matrix A to a column vector B,
- * i.e., Bi = \sum_j {alpha*Aij}+beta*Bi
- * Loose shape checking, A.count() % B.count() == 0.
- * # columns of A = A.count() / B.count().
- */
-template<typename Dtype>
-void MVSumCol(Dtype alpha, Dtype beta, const Blob<Dtype> & A, Blob<Dtype> * B) {
-  CHECK_EQ(A.count() % B->count(), 0) << "length of B must = # of cols of A";
-  int m = B->count(), n = A.count() / m;
-  auto context = Singleton<Context>::Instance();
-  int device = context->device_id(std::this_thread::get_id());
-  if (device < 0) {
-    Blob<Dtype> one(n);
-    one.SetValue(1);
-    cpu_gemm(A.cpu_data(), one.cpu_data(), m, 1, n, alpha, beta,
-        A.transpose(), false, B->mutable_cpu_data());
-  } else {
-#ifdef USE_GPU
-    singa_gpu_sum_col(A.gpu_data(), B->mutable_gpu_data(), m, n, n);
-#else
-    NO_GPU;
-#endif  // USE_GPU
-  }
-}
-
-/**
- * Sum all rows of matrix A to a row vector B,
- * i.e., Bj = \sum_i {alpha*Aij}+beta*Bj
- * Loose shape checking, A.count() % B.count() == 0.
- * # rows of A = A.count() / B.count().
- */
-template<typename Dtype>
-void MVSumRow(Dtype alpha, Dtype beta, const Blob<Dtype> & A, Blob<Dtype> * B) {
-  CHECK_EQ(A.count() % B->count(), 0) << "length of B must = # of cols of A";
-  int n = B->count(), m = A.count() / n;
-  auto context = Singleton<Context>::Instance();
-  int device = context->device_id(std::this_thread::get_id());
-  if (device < 0) {
-    Blob<Dtype> one(m);
-    one.SetValue(1);
-    cpu_gemm(one.cpu_data(), A.cpu_data(), 1, n, m, alpha, beta, false,
-             A.transpose(), B->mutable_cpu_data());
-  } else {
-#ifdef USE_GPU
-    singa_gpu_sum_row(A.gpu_data(), B->mutable_gpu_data(), m, n, n);
-#else
-    NO_GPU;
-#endif  // USE_GPU
-  }
-}
-
-/**
- * Reduce each row of A to an element of B.
- * Loose shape checking, A.count() % B.count() == 0.
- * # columns of A = A.count() / B.count().
- */
-template<typename Op, typename Dtype>
-void Reduce2D(const Blob<Dtype> & A, Blob<Dtype> * B) {
-  CHECK_EQ(A.count() % B->count(), 0) << "Row size not match B length";
-  int m = B->count(), n = A.count() / m;
-  auto context = Singleton<Context>::Instance();
-  int device = context->device_id(std::this_thread::get_id());
-  if (device < 0) {
-    cpu_reduce_f<Op>(A.cpu_data(), m, n, B->mutable_cpu_data());
-  } else {
-#ifdef USE_GPU
-    gpu_reduce_f<Op>(A.gpu_data(), m, n, B->mutable_gpu_data());
-#else
-    NO_GPU;
-#endif  // USE_GPU
-  }
-}
-/**
- * Duplicate each element of A into a row of B.
- * Loose shape checking, B.count() % A.count() == 0.
- * # columns of B = B.count() / A.count().
- */
-template<typename Op, typename Dtype>
-void Expand2D(const Blob<Dtype> & A, Blob<Dtype> * B) {
-  CHECK_EQ(B->count() % A.count(), 0) << "Row size of B not match length of A";
-  int m = A.count(), n = B->count() / m;
-  auto context = Singleton<Context>::Instance();
-  int device = context->device_id(std::this_thread::get_id());
-  if (device < 0) {
-    cpu_expand_f<Op>(A.cpu_data(), m, n, B->mutable_cpu_data());
-  } else {
-#ifdef USE_GPU
-    gpu_expand_f<Op>(A.gpu_data(), m, n, B->mutable_gpu_data());
-#else
-    NO_GPU;
-#endif  // USE_GPU
-  }
-}
-
-/**
- * Average the absolute values.
- */
-template<typename Dtype>
-Dtype Asum(const Blob<Dtype>& A) {
-  if (A.count() == 0) return Dtype(0);
-  auto context = Singleton<Context>::Instance();
-  int device = context->device_id(std::this_thread::get_id());
-  Dtype ret = Dtype(0);
-  if (device < 0) {
-    ret = cpu_asum(A.count(), A.cpu_data(), 1) / A.count();
-  } else {
-#ifdef USE_GPU
-    ret = gpu_asum(context->cublas_handle(device), A.count(), A.gpu_data(), 1)
-      / A.count();
-#else
-    NO_GPU;
-#endif
-  }
-  return ret;
-}
-
-
-/*************Random Sample***************/
-template<typename Dtype>
-void SampleUniform(Dtype low, Dtype high, Blob<Dtype>* A) {
-  auto context = Singleton<Context>::Instance();
-  const auto& thread = std::this_thread::get_id();
-  int device = context->device_id(thread);
-  if (device < 0) {
-    cpu_sample_uniform(*context->rand_generator(thread), A->count(), low, high,
-        A->mutable_cpu_data());
-  } else {
-#ifdef USE_GPU
-    gpu_sample_uniform(context->curand_generator(thread), A->count(), low, high,
-        A->mutable_gpu_data());
-#else
-    NO_GPU;
-#endif
-  }
-}
-
-template<typename Dtype>
-void SampleGaussian(Dtype mean, Dtype std, Blob<Dtype>* A) {
-  auto context = Singleton<Context>::Instance();
-  const auto& thread = std::this_thread::get_id();
-  int device = context->device_id(thread);
-  if (device < 0) {
-    cpu_sample_gaussian(*context->rand_generator(thread), A->count(), mean, std,
-        A->mutable_cpu_data());
-  } else {
-#ifdef USE_GPU
-    gpu_sample_gaussian(context->curand_generator(thread), A->count(),
-        mean, std, A->mutable_gpu_data());
-#else
-    NO_GPU;
-#endif
-  }
-}
-
-/************** Other functions ****************/
-template<typename Dtype>
-void Softmax(int nb_rows, const Blob<Dtype>& A, Blob<Dtype>* B) {
-  CHECK_GT(nb_rows, 0);
-  CHECK_EQ(A.count() % nb_rows, 0);
-  CHECK_EQ(A.count(), B->count());
-  auto context = Singleton<Context>::Instance();
-  int device = context->device_id(std::this_thread::get_id());
-  if (device < 0) {
-    cpu_softmax(nb_rows, A.count() / nb_rows, A.cpu_data(),
-      B->mutable_cpu_data());
-  } else {
-    // TODO(wangwei) implement the GPU version.
-    NO_GPU;
-  }
-}
-
-template<typename Dtype>
-void Zero(Blob<Dtype>* B) {
-  auto context = Singleton<Context>::Instance();
-  int device = context->device_id(std::this_thread::get_id());
-  if (device < 0) {
-    B->SetValue(0);
-  } else {
-#ifdef USE_GPU
-    cudaMemset(B->mutable_gpu_data(), 0, B->count() * sizeof(float));
-#else
-    NO_GPU;
-#endif  // USE_GPU
-  }
-}
-}  // end of namespace singa
-
-#endif  // SINGA_UTILS_MATH_BLOB_H_
diff --git a/include/singa/utils/math_kernel.h b/include/singa/utils/math_kernel.h
deleted file mode 100644
index 0239d3d..0000000
--- a/include/singa/utils/math_kernel.h
+++ /dev/null
@@ -1,88 +0,0 @@
-/************************************************************
-*
-* Licensed to the Apache Software Foundation (ASF) under one
-* or more contributor license agreements.  See the NOTICE file
-* distributed with this work for additional information
-* regarding copyright ownership.  The ASF licenses this file
-* to you under the Apache License, Version 2.0 (the
-* "License"); you may not use this file except in compliance
-* with the License.  You may obtain a copy of the License at
-*
-*   http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an
-* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-* KIND, either express or implied.  See the License for the
-* specific language governing permissions and limitations
-* under the License.
-*
-*************************************************************/
-#ifndef SINGA_UTILS_MATH_KERNEL_H_
-#define SINGA_UTILS_MATH_KERNEL_H_
-
-namespace singa {
-
-extern "C" {
-  void singa_gpu_softmaxloss_forward(int n, int dim, const float *prob,
-      const int *label, float *loss);
-
-  void singa_gpu_softmaxloss_backward(int n, int dim, float scale,
-      const int *label, float *grad);
-
-  void singa_gpu_sum_vec(float *data, float *sum , int n);
-
-  void singa_gpu_sum_col(const float *src_mat_data, float *dst_vec_data,
-    int rows, int cols, int stride);
-
-  void singa_gpu_sum_row(const float *src_mat_data, float *dst_vec_data,
-    int rows, int cols, int stride);
-
-  void singa_gpu_add_vec_row(const float *src_vec_data,
-    const float *src_mat_data, float *des_mat_data,
-    int rows, int cols, int stride);
-
-  void singa_gpu_exp(const float *src_data, float *des_data, int n);
-
-  void singa_gpu_log(const float *src_data, float *des_data, int n);
-
-  void singa_gpu_sigmoid(const float *src_data, float *des_data, int n);
-
-  void singa_gpu_sigmoid_grad(const float *src_data, float *des_data, int n);
-
-  void singa_gpu_relu(const float *src_data, float *des_data, int n);
-
-  void singa_gpu_relu_grad(const float *src_data, float *des_data, int n);
-
-  void singa_gpu_tanh(const float *src_data, float *des_data, int n);
-
-  void singa_gpu_tanh_grad(const float *src_data, float *des_data, int n);
-
-  void singa_gpu_softplus(const float *src_data, float *des_data, int n);
-
-  void singa_gpu_softplus_grad(const float *src_data, float *des_data, int n);
-
-  void singa_gpu_square(const float *src_data, float *des_data, int n);
-
-  void singa_gpu_square_grad(const float *src_data, float *des_data, int n);
-
-  void singa_gpu_sqrt(const float *src_data, float *des_data, int n);
-
-  void singa_gpu_pow(const float *src_data_a, const float *src_data_b,
-    float *des_data, int n);
-
-  void singa_gpu_mult(const float *src_data_a, const float *src_data_b,
-    float *des_data, int n);
-
-  void singa_gpu_div(const float *src_data_a, const float *src_data_b,
-    float *des_data, int n);
-
-  void singa_gpu_set_value(float *data, float value, int n);
-
-  void singa_gpu_threshold(const float *src_data, float *des_data,
-      float alpha, int n);
-};
-
-}  // namespace singa
-
-#endif  // SINGA_UTILS_MATH_KERNEL_H_
diff --git a/include/singa/utils/opencl_utils.h b/include/singa/utils/opencl_utils.h
new file mode 100644
index 0000000..664a9e1
--- /dev/null
+++ b/include/singa/utils/opencl_utils.h
@@ -0,0 +1,144 @@
+/************************************************************
+*
+* Licensed to the Apache Software Foundation (ASF) under one
+* or more contributor license agreements.  See the NOTICE file
+* distributed with this work for additional information
+* regarding copyright ownership.  The ASF licenses this file
+* to you under the Apache License, Version 2.0 (the
+* "License"); you may not use this file except in compliance
+* with the License.  You may obtain a copy of the License at
+*
+*   http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing,
+* software distributed under the License is distributed on an
+* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+* KIND, either express or implied.  See the License for the
+* specific language governing permissions and limitations
+* under the License.
+*
+*************************************************************/
+
+#ifndef SINGA_UTILS_OPENCL_UTILS_H_
+#define SINGA_UTILS_OPENCL_UTILS_H_
+
+#ifdef USE_OPENCL
+
+#include <iostream>
+
+// http://github.khronos.org/OpenCL-CLHPP/
+// cl2.hpp includes cl.h, do not re-include.
+#define CL_HPP_MINIMUM_OPENCL_VERSION 120
+#define CL_HPP_TARGET_OPENCL_VERSION 120
+#include <CL/cl2.hpp>
+
+#define CL_BREAK_ON_FAILURE if (status != CL_SUCCESS) return;
+
+
+inline const char* clGetBuildInfoString(const cl_build_status status) {
+  switch (status) {
+	case CL_BUILD_NONE: return "CL_BUILD_NONE";
+	case CL_BUILD_ERROR: return "CL_BUILD_ERROR";
+	case CL_BUILD_SUCCESS: return "CL_BUILD_SUCCESS";
+	case CL_BUILD_IN_PROGRESS: return "CL_BUILD_IN_PROGRESS";
+	default: return "";
+  }
+}
+
+
+inline const char* clGetErrorString(const cl_int status) {
+
+  switch(status) {
+
+  // Run-time and JIT compiler errors
+  case 0: return "CL_SUCCESS";
+  case -1: return "CL_DEVICE_NOT_FOUND";
+  case -2: return "CL_DEVICE_NOT_AVAILABLE";
+  case -3: return "CL_COMPILER_NOT_AVAILABLE";
+  case -4: return "CL_MEM_OBJECT_ALLOCATION_FAILURE";
+  case -5: return "CL_OUT_OF_RESOURCES";
+  case -6: return "CL_OUT_OF_HOST_MEMORY";
+  case -7: return "CL_PROFILING_INFO_NOT_AVAILABLE";
+  case -8: return "CL_MEM_COPY_OVERLAP";
+  case -9: return "CL_IMAGE_FORMAT_MISMATCH";
+  case -10: return "CL_IMAGE_FORMAT_NOT_SUPPORTED";
+  case -11: return "CL_BUILD_PROGRAM_FAILURE";
+  case -12: return "CL_MAP_FAILURE";
+  case -13: return "CL_MISALIGNED_SUB_BUFFER_OFFSET";
+  case -14: return "CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST";
+  case -15: return "CL_COMPILE_PROGRAM_FAILURE";
+  case -16: return "CL_LINKER_NOT_AVAILABLE";
+  case -17: return "CL_LINK_PROGRAM_FAILURE";
+  case -18: return "CL_DEVICE_PARTITION_FAILED";
+  case -19: return "CL_KERNEL_ARG_INFO_NOT_AVAILABLE";
+
+  // Compile-time errors
+  case -30: return "CL_INVALID_VALUE";
+  case -31: return "CL_INVALID_DEVICE_TYPE";
+  case -32: return "CL_INVALID_PLATFORM";
+  case -33: return "CL_INVALID_DEVICE";
+  case -34: return "CL_INVALID_CONTEXT";
+  case -35: return "CL_INVALID_QUEUE_PROPERTIES";
+  case -36: return "CL_INVALID_COMMAND_QUEUE";
+  case -37: return "CL_INVALID_HOST_PTR";
+  case -38: return "CL_INVALID_MEM_OBJECT";
+  case -39: return "CL_INVALID_IMAGE_FORMAT_DESCRIPTOR";
+  case -40: return "CL_INVALID_IMAGE_SIZE";
+  case -41: return "CL_INVALID_SAMPLER";
+  case -42: return "CL_INVALID_BINARY";
+  case -43: return "CL_INVALID_BUILD_OPTIONS";
+  case -44: return "CL_INVALID_PROGRAM";
+  case -45: return "CL_INVALID_PROGRAM_EXECUTABLE";
+  case -46: return "CL_INVALID_KERNEL_NAME";
+  case -47: return "CL_INVALID_KERNEL_DEFINITION";
+  case -48: return "CL_INVALID_KERNEL";
+  case -49: return "CL_INVALID_ARG_INDEX";
+  case -50: return "CL_INVALID_ARG_VALUE";
+  case -51: return "CL_INVALID_ARG_SIZE";
+  case -52: return "CL_INVALID_KERNEL_ARGS";
+  case -53: return "CL_INVALID_WORK_DIMENSION";
+  case -54: return "CL_INVALID_WORK_GROUP_SIZE";
+  case -55: return "CL_INVALID_WORK_ITEM_SIZE";
+  case -56: return "CL_INVALID_GLOBAL_OFFSET";
+  case -57: return "CL_INVALID_EVENT_WAIT_LIST";
+  case -58: return "CL_INVALID_EVENT";
+  case -59: return "CL_INVALID_OPERATION";
+  case -60: return "CL_INVALID_GL_OBJECT";
+  case -61: return "CL_INVALID_BUFFER_SIZE";
+  case -62: return "CL_INVALID_MIP_LEVEL";
+  case -63: return "CL_INVALID_GLOBAL_WORK_SIZE";
+  case -64: return "CL_INVALID_PROPERTY";
+  case -65: return "CL_INVALID_IMAGE_DESCRIPTOR";
+  case -66: return "CL_INVALID_COMPILER_OPTIONS";
+  case -67: return "CL_INVALID_LINKER_OPTIONS";
+  case -68: return "CL_INVALID_DEVICE_PARTITION_COUNT";
+
+  // Extension errors
+  case -1000: return "CL_INVALID_GL_SHAREGROUP_REFERENCE_KHR";
+  case -1001: return "CL_PLATFORM_NOT_FOUND_KHR";
+  case -1002: return "CL_INVALID_D3D10_DEVICE_KHR";
+  case -1003: return "CL_INVALID_D3D10_RESOURCE_KHR";
+  case -1004: return "CL_D3D10_RESOURCE_ALREADY_ACQUIRED_KHR";
+  case -1005: return "CL_D3D10_RESOURCE_NOT_ACQUIRED_KHR";
+
+  default: return "Unknown OpenCL status";
+  }
+}
+
+
+/// Special function used to perform error checking and logging.
+inline bool OCL_CHECK(const cl_int status, const char* what) {
+  if (status == CL_SUCCESS) return true; // Nothing wrong.
+  LOG(ERROR) << status << ": " << clGetErrorString(status) << " " << what << std::endl;
+  return false;
+}
+
+/// Prints information about the specified Platform.
+void PrintPlatformInfo(const cl::Platform &p);
+
+/// Prints information about the specified Device.
+void PrintDeviceInfo(const cl::Device &dev);
+
+#endif // USE_OPENCL
+
+#endif // SINGA_UTILS_OPENCL_UTILS_H_
diff --git a/include/singa/utils/param.h b/include/singa/utils/param.h
deleted file mode 100644
index 319f2b4..0000000
--- a/include/singa/utils/param.h
+++ /dev/null
@@ -1,407 +0,0 @@
-/************************************************************
-*
-* Licensed to the Apache Software Foundation (ASF) under one
-* or more contributor license agreements.  See the NOTICE file
-* distributed with this work for additional information
-* regarding copyright ownership.  The ASF licenses this file
-* to you under the Apache License, Version 2.0 (the
-* "License"); you may not use this file except in compliance
-* with the License.  You may obtain a copy of the License at
-*
-*   http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an
-* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-* KIND, either express or implied.  See the License for the
-* specific language governing permissions and limitations
-* under the License.
-*
-*************************************************************/
-
-#ifndef SINGA_UTILS_PARAM_H_
-#define SINGA_UTILS_PARAM_H_
-
-#include <memory>
-#include <string>
-#include <vector>
-
-#include "singa/comm/msg.h"
-#include "singa/proto/job.pb.h"
-#include "singa/utils/blob.h"
-
-namespace singa {
-using std::vector;
-/**
- * Base parameter generator which intializes parameter values.
- */
-class ParamGenerator {
- public:
-  static ParamGenerator* Create(const ParamGenProto& proto);
-
-  virtual ~ParamGenerator() {}
-
-  virtual void Init(const ParamGenProto& proto) { proto_ = proto; }
-  virtual void Fill(Blob<float>* data);
-
- protected:
-  ParamGenProto proto_;
-};
-
-class GaussianGen : public ParamGenerator {
- public:
-  void  Fill(Blob<float>* data) override;
-};
-
-class GaussianSqrtFanInGen : public GaussianGen {
- public:
-  void  Fill(Blob<float>* data) override;
-};
-
-class UniformGen : public ParamGenerator {
- public:
-  void  Fill(Blob<float>* data) override;
-};
-
-class UniformSqrtFanInGen : public UniformGen {
- public:
-  void Fill(Blob<float>* data) override;
-};
-
-class UniformSqrtFanInOutGen : public UniformGen {
- public:
-  void Fill(Blob<float>* data) override;
-};
-
-/**
- * Base paramter class.
- *
- * The Param object is a set of parameters, e.g., the (sub) weight matrix or
- * (sub) bias vector.
- *
- * It has at a gradient Blob and data Blob for gradients and parameter values.
- * Since some layers (or neuralnet) share parameter values, the data Blob is a
- * shared pointer which can be assigned to many Param objects' data field.
- *
- * It provides access methods like data(), grad(). It also provides functions
- * for generating messages and parsing messages to transferring the Param
- * objects among worker-worker, worker-server and server-server.
- *
- * Param objects are of different sizes, which makes it hard to acheive
- * load-balance among servers. Hence, we slice large Param objects into small
- * pieces. At the server side, one slice is a Param object.
- */
-class Param {
- public:
-  /**
-   * Create an instance of (sub) Param class based on the type from the
-   * configuration.
-   *
-   * @param[in] conf configuration
-   * @param a pointer to an instance
-   */
-  static Param* Create(const ParamProto& conf);
-
-  /**
-   * Try to slice the Param objects (from a neural net) into a given number of
-   * servers (groups) evenly. This is to achieve load-balance among servers.
-   *
-   * It does not change the Param objects, but just computes the length of each
-   * slice.
-   *
-   * @param num number of servers (groups) for maintaining the Param objects.
-   * @param params all Param objects from a neural net.
-   * @return the length of each slice.
-   */
-  static const vector<int> ComputeSlices(int num, const vector<Param*>& params);
-  /**
-   * It computes the length of each slice and slices the Param objects by adding
-   * the slicing information into every Param object.
-   *
-   * @copydetails ComputeSlices()
-   */
-  static void SliceParams(int num, const vector<Param*>& params);
-
-  Param() {}
-  virtual ~Param() {}
-  void Init(const ParamProto& proto) { proto_ = proto; }
-  /**
-   * Setup param object
-   *
-   * @param conf param configuration, include learning rate multiplier etc.
-   * @param shape one value per dimension
-   */
-  virtual void Setup(const std::vector<int>& shape);
-  /*
-   * Fill the values according to init method, e.g., gaussian distribution.
-   *
-   * @param version initial version
-   */
-  virtual void InitValues();
-  virtual void InitValues(int version);
-  /**
-   * Share the data blob from other Param objects.
-   *
-   * @param other the Param object whose owner owns the data blob
-   * @param cpu_only if true, share only cpu memory (used for training with
-   * multi-gpu cards); else, share both cpu and gpu memory.
-   */
-  void ShareDataFrom(Param* other, bool cpu_only);
-  /**
-   * Share both data and grad from other param
-   */
-  void ShareFrom(Param* other);
-  /**
-   * Init param values from checkpoint blob.
-   */
-  void FromProto(const BlobProto& blob);
-  void FromProto(const std::string str);
-  /**
-   * Dump param values to blob.
-   */
-  void ToProto(BlobProto* blob);
-  /**
-   * Add a slice
-   *
-   * @param slice_id
-   * @param size num of floats for this slice
-   */
-  void AddSlice(int slice_id, int size);
-  /**
-   * Scale the learning rate when updating parameters in the Param object
-   */
-  inline float lr_scale() const { return proto_.lr_scale(); }
-  /**
-   * Scale the weight decay when updating parameters in the Param object
-   */
-  inline float wd_scale() const { return proto_.wd_scale(); }
-  /**
-   * Parameter name used for Param re-use in other model or sharing between
-   * layers
-   */
-  inline const std::string& name() const { return proto_.name(); }
-  inline void set_name(const std::string& name) { proto_.set_name(name); }
-  /**
-   * If it shares data from others, then owner is the id of that Param,
-   * otherwise it is itself's id.
-   */
-  inline int owner() const { return proto_.owner(); }
-  /**
-   * ID start from 0 and ordered for all Param from the same neuralnet
-   */
-  inline int id() const { return proto_.id(); }
-  /**
-   * Set ID
-   */
-  inline void set_id(int id) {
-    proto_.set_id(id);
-    proto_.set_owner(id);
-  }
-  inline int version() const { return version_; }
-  inline void set_version(int v) { version_ = v; }
-  /**
-   * @return the version of the Param when the last Update request was issued.
-   */
-  inline int last_version() const { return last_version_; }
-  inline void set_last_version(int v) { last_version_ = v; }
-
-  /**
-   * @return the sharing Param name which is configured by users in conf file.
-   */
-  inline const std::string& share_from() const { return proto_.share_from(); }
-   /**
-    * @return num of parameters in this Param obj.
-    */
-  inline const std::vector<int>& shape() const { return data_.shape(); }
-  inline int size() const { return data_.count(); }
-  inline const Blob<float>& data() const { return data_; }
-  inline Blob<float>* mutable_data() { return &data_; }
-  inline const Blob<float> &grad() const { return grad_; }
-  inline Blob<float> *mutable_grad() { return &grad_; }
-  inline float* mutable_cpu_data() { return data_.mutable_cpu_data(); }
-  inline float* mutable_cpu_grad() { return grad_.mutable_cpu_data(); }
-  inline float* mutable_cpu_history() { return history_.mutable_cpu_data(); }
-  inline float* mutable_cpu_update() { return update_.mutable_cpu_data(); }
-  /**
-   * @return slice start ID
-   */
-  inline int slice_start() const { return slice_start_; }
-  inline int num_slices() const { return num_slices_; }
-
-  /**
-   * Below are message/request related functions.
-   * The basic communication workflows are as follow:
-   *------------------------------------------------------------------------
-   *         |Put         |Get           |Update           |Sync
-   *------------------------------------------------------------------------
-   * Generate|(stub)      |(stub)        |(stub)           |(server)
-   * Message |GenPutMsg   |GenGetMsg     |GenUpdateMsg     |GenSyncMsg
-   *------------------------------------------------------------------------
-   * Handle  |(server)    |(server)      |(server)         |(server)
-   * Message |HandlePutMsg|HandleGetMsg  |ParseUpdateMsg   |HandleSyncMsg
-   *         |            |              |GenUpdateResMsg  |
-   *------------------------------------------------------------------------
-   * Handle  |            |(stub)        |(stub)           |(server)
-   * Response|            |ParseGetResMsg|ParseUpdateResMsg|ParseSyncResMsg
-   *------------------------------------------------------------------------
-   */
-
-  /**
-   * Generate the message for a put request, i.e., put parameters to a server
-   *
-   * This function is called at worker/stub side.
-   * @param copy decides whether to copy the parameter values from the server.
-   * @param slice_idx index of the slice from which the message is generated.
-   * @return generated message without setting src, dst, target fields.
-   */
-  virtual Msg* GenPutMsg(bool copy, int slice_idx);
-  /**
-   * Generate the message for a get request, i.e., get parameters from a server
-   * \copydetails GenPutMsg(bool, int);
-   */
-  virtual Msg* GenGetMsg(bool copy, int slice_idx);
-  /**
-   * Generate the message for a update request, i.e., pass info to server for
-   * parameter update.
-   * \copydetails GenPutMsg(bool, int);
-   */
-  virtual Msg* GenUpdateMsg(bool copy, int slice_idx);
-  /**
-   * Generate the message for a synchronization request between server groups.
-   *
-   * This function is called at server side where the Param is actually a slice
-   * of an original Param object.
-   * */
-  virtual Msg* GenSyncMsg(int offset, int size);
-  /**
-   * Server handling function for put request.
-   *
-   * @param msg request
-   * @param reserve if true reserve the msg space for the calling function;
-   * otherwise the msg should be freed inside the function.
-   * @return resposne message
-   */
-  virtual Msg* HandlePutMsg(Msg** msg, bool reserve);
-  /**
-   * Server handling function for put request.
-   *
-   * \copydetails HandleGetMsg(Msg**, bool reserve)
-   */
-  virtual Msg* HandleGetMsg(Msg** msg, bool reserve);
-  /**
-   * Server parse update requests.
-   * \copydetails GenUpdateResponseMsgs(const std::vector<Msg*>& msgs);
-   */
-  virtual void ParseUpdateMsgs(const std::vector<Msg*>& msgs);
-  /**
-   * Generate the messages to response the update requests.
-   *
-   * This function is called at the server side, where the Param is actually a
-   * slice of an original Param object.
-   *
-   * @param msgs for synchronous training, there would be multiple procs in
-   * which workers sharing the same Param (slice) objects. Their update requests
-   * is bufferred and handled together. For asynchrnous training, there is only
-   * request in msgs.
-   * @return response messages
-   */
-  virtual const std::vector<Msg*>
-    GenUpdateResponseMsgs(std::vector<Msg*>* msgs, bool reserve);
-  /**
-   * Server handling function for synchronization message
-   *
-   * \copydetails HandleGetMsg(Msg**, bool reserve)
-   */
-  virtual Msg* HandleSyncMsg(Msg** msg, bool reserve);
-  /**
-   * Worker/Stub parsing function for get response.
-   *
-   * @param msg
-   * @param slice_idx index for the slice
-   */
-  virtual int ParseGetResponseMsg(Msg* msg, int slice_idx);
-  /**
-   * Worker/Server parsing function for update response
-   *
-   * \copydetails ParseGetResponseMsg(Msg**, int);
-   */
-  virtual int ParseUpdateResponseMsg(Msg* msg, int slice_idx);
-  /**
-   * Server parsing function for synchronization response.
-   *
-   * \copydetails ParseGetResponseMsg(Msg** , int);
-   */
-  virtual int ParseSyncResponseMsg(Msg* msg, int slice_idx);
-
- protected:
-  /**
-   * Implement the common code of ParseGetResponseMsg and ParseUpdateResponseMsg
-   * \copydetails ParseSyncResponseMsg(Msg* msg, int slice_idx);
-   */
-  void ParseResponseMsg(Msg* msg, int slice_idx);
-
- protected:
-  //!< param version updated by the Update/Sync/Get response
-  //!< only the owner param is initialized.
-  int version_ = -1;
-  //!< param version before last Update/Sync/Get request, set from version_
-  int last_version_ = -1;
-  //!< the global ID of the first slice
-  int slice_start_ = 0;
-  //!< total num of slices for this Parm obj
-  int num_slices_ = 0;
-  // offset and size of each slice
-  std::vector<int> slice_offset_;
-  std::vector<int> slice_size_;
-  // for debug. Put request has no feedback, we do not track its pending status
-  std::vector<bool> pending_get_;
-  std::vector<bool> pending_update_;
-  int num_pending_requests_ = 0;
-  // data, gradient, history gradient of this parameter
-  Blob<float> data_, grad_, history_, update_;
-  ParamProto proto_;
-};
-
-/**
- * ParamEntry is used for aggregating gradients of Params shared by workers from
- * the same group.
- *
- * For each worker group, every unique Param object has a ParamEntry object.
- * Param objects sharing the same values are associated with the same
- * ParamEntry.
- */
-class ParamEntry {
- public:
-  ParamEntry() {}
-  ParamEntry(int total, Param* p);
-  /**
-   * Associate the counter to a Param object.
-   *
-   * @param p
-   * @param local 1 if it is used by workers in this procs, 0 otherwise
-   */
-  void AddParam(bool local, Param* p);
-  int next_version = -1;  // next_version & num_update are directly used by stub
-  int num_update = 0;
-  int num_local = 0;  //!< # local workers using the shared parameter
-  int num_total = 0;  //!< # total workers using the shared parameter
-  //!< Shares are deleted by neuralnet's destructor
-  std::vector<Param*> shares;
-};
-
-inline int ParamTrgt(int param_id, int slice_id) {
-  return (param_id << 16) | slice_id;
-}
-
-inline int ParamID(int param_trgt) {
-  return param_trgt >> 16;
-}
-
-inline int SliceID(int param_trgt) {
-  static const int mask = (1 << 16) -1;
-  return param_trgt & mask;
-}
-
-}  // namespace singa
-
-#endif  // SINGA_UTILS_PARAM_H_
diff --git a/include/singa/utils/singa_op.h b/include/singa/utils/singa_op.h
deleted file mode 100644
index 7499eb1..0000000
--- a/include/singa/utils/singa_op.h
+++ /dev/null
@@ -1,299 +0,0 @@
-/************************************************************
-*
-* Licensed to the Apache Software Foundation (ASF) under one
-* or more contributor license agreements.  See the NOTICE file
-* distributed with this work for additional information
-* regarding copyright ownership.  The ASF licenses this file
-* to you under the Apache License, Version 2.0 (the
-* "License"); you may not use this file except in compliance
-* with the License.  You may obtain a copy of the License at
-*
-*   http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an
-* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-* KIND, either express or implied.  See the License for the
-* specific language governing permissions and limitations
-* under the License.
-*
-*************************************************************/
-
-#ifndef SINGA_UTILS_SINGA_OP_H_
-#define SINGA_UTILS_SINGA_OP_H_
-
-#include <cmath>
-#include <algorithm>
-
-#ifdef USE_GPU
-#include <cuda_runtime.h>
-#include <cublas_v2.h>
-#include "singa/utils/math_kernel.h"
-#endif  // USE_GPU
-
-namespace singa {
-
-namespace op {
-
-/**
- * b = e^a
- */
-template<typename Dtype>
-struct Exp {
-  inline static void Map(const Dtype & a, Dtype * b) {
-    *b = exp(a);
-  }
-#ifdef USE_GPU
-  inline static void CudaMap(const Dtype * a, Dtype * b, int n) {
-    singa::singa_gpu_exp(a, b, n);
-  }
-#endif  // USE_GPU
-};
-/**
- * b = log(a), base is e
- */
-template<typename Dtype>
-struct Log {
-  inline static void Map(const Dtype & a, Dtype *b) {
-    *b = log(a);
-  }
-#ifdef USE_GPU
-  inline static void CudaMap(const Dtype * a, Dtype * b, int n) {
-    singa::singa_gpu_log(a, b, n);
-  }
-#endif  // USE_GPU
-};
-
-template<typename Dtype>
-struct Sigmoid {
-  inline static void Map(const Dtype & a, Dtype * b) {
-    *b = 1.0f / (1.0f + expf(-a));
-  }
-#ifdef USE_GPU
-  inline static void CudaMap(const Dtype * a, Dtype * b, int n) {
-    singa::singa_gpu_sigmoid(a, b, n);
-  }
-#endif  // USE_GPU
-};
-template<typename Dtype>
-struct SigmoidGrad {
-  inline static void Map(const Dtype & a, Dtype * b) {
-    *b = a * (1.0f - a);
-  }
-#ifdef USE_GPU
-  inline static void CudaMap(const Dtype * a, Dtype * b, int n) {
-    singa::singa_gpu_sigmoid_grad(a, b, n);
-  }
-#endif  // USE_GPU
-};
-
-template<typename Dtype>
-struct Relu {
-  inline static void Map(const Dtype & a, Dtype * b) {
-    *b = std::max(a, 0.0f);
-  }
-#ifdef USE_GPU
-  inline static void CudaMap(const Dtype * a, Dtype * b, int n) {
-    singa::singa_gpu_relu(a, b, n);
-  }
-#endif  // USE_GPU
-};
-
-template<typename Dtype>
-struct ReluGrad {
-  inline static void Map(const Dtype & a, Dtype * b) {
-    *b = a > 0 ? 1 : 0;
-  }
-#ifdef USE_GPU
-  inline static void CudaMap(const Dtype * a, Dtype * b, int n) {
-    singa::singa_gpu_relu_grad(a, b, n);
-  }
-#endif  // USE_GPU
-};
-
-template<typename Dtype>
-struct Tanh {
-  inline static void Map(const Dtype & a, Dtype * b) {
-    *b = tanhf(a);
-  }
-#ifdef USE_GPU
-  inline static void CudaMap(const Dtype * a, Dtype * b, int n) {
-    singa::singa_gpu_tanh(a, b, n);
-  }
-#endif  // USE_GPU
-};
-
-template<typename Dtype>
-struct TanhGrad {
-  inline static void Map(const Dtype & a, Dtype * b) {
-    *b = 1 - a * a;
-  }
-#ifdef USE_GPU
-  inline static void CudaMap(const Dtype * a, Dtype * b, int n) {
-    singa::singa_gpu_tanh_grad(a, b, n);
-  }
-#endif  // USE_GPU
-};
-
-template<typename Dtype>
-struct Softplus {
-  inline static void Map(const Dtype & a, Dtype * b) {
-    *b = logf(1 + expf(a));
-  }
-#ifdef USE_GPU
-  inline static void CudaMap(const Dtype * a, Dtype * b, int n) {
-    singa::singa_gpu_softplus(a, b, n);
-  }
-#endif  // USE_GPU
-};
-
-template<typename Dtype>
-struct SoftplusGrad {
-  inline static void Map(const Dtype & a, Dtype * b) {
-    *b = 1.0f / (1.0f + expf(-a));
-  }
-#ifdef USE_GPU
-  inline static void CudaMap(const Dtype * a, Dtype * b, int n) {
-    singa::singa_gpu_softplus_grad(a, b, n);
-  }
-#endif  // USE_GPU
-};
-
-template<typename Dtype>
-struct Square {
-  inline static void Map(const Dtype & a, Dtype * b) {
-    *b = a * a;
-  }
-#ifdef USE_GPU
-  inline static void CudaMap(const Dtype * a, Dtype * b, int n) {
-    singa::singa_gpu_square(a, b, n);
-  }
-#endif  // USE_GPU
-};
-
-template<typename Dtype>
-struct SquareGrad {
-  inline static void Map(const Dtype & a, Dtype * b) {
-    *b = 2 * sqrt(a);
-  }
-#ifdef USE_GPU
-  inline static void CudaMap(const Dtype * a, Dtype * b, int n) {
-    singa::singa_gpu_square_grad(a, b, 1, n);
-  }
-#endif  // USE_GPU
-};
-
-template<typename Dtype>
-struct Sqrt {
-  inline static void Map(const Dtype & a, Dtype * b) {
-    *b = sqrt(a);
-  }
-#ifdef USE_GPU
-  inline static void CudaMap(const Dtype * a, Dtype * b, int n) {
-    singa::singa_gpu_sqrt(a, b, n);
-  }
-#endif  // USE_GPU
-};
-
-/*********************************************************************/
-/**
- * c = pow(a, b), i.e., c = a^b
- */
-template<typename Dtype>
-struct Pow {
-  inline static void Map(const Dtype & a, const Dtype &b, Dtype * c) {
-    *c = pow(a, b);
-  }
-#ifdef USE_GPU
-  inline static void CudaMap(const Dtype * a,
-      const Dtype * b, Dtype * c, int n) {
-    singa::singa_gpu_pow(a, b, c, n);
-  }
-#endif  // USE_GPU
-};
-
-template<typename Dtype>
-struct Add {
-  inline static void Map(const Dtype & a, const Dtype & b, Dtype * c) {
-    *c =  a + b;
-  }
-#ifdef USE_GPU
-  inline static void CudaMap(const Dtype * a,
-      const Dtype * b, Dtype * c, int n) {
-//    singa::singa_gpu_add(a, b, c, n); // TODO(haibo)
-  }
-#endif  // USE_GPU
-};
-
-template<typename Dtype>
-struct Sub {
-  inline static void Map(const Dtype & a, const Dtype & b, Dtype * c) {
-    *c =  a - b;
-  }
-#ifdef USE_GPU
-  inline static void CudaMap(const Dtype * a,
-      const Dtype * b, Dtype * c, int n) {
-//    singa::singa_gpu_add(a, b, c, n);  // TODO(haibo)
-  }
-#endif  // USE_GPU
-};
-
-
-template<typename Dtype>
-struct Mult {
-  inline static void Map(const Dtype & a, const Dtype & b, Dtype * c) {
-    *c =  a * b;
-  }
-#ifdef USE_GPU
-  inline static void CudaMap(const Dtype * a,
-      const Dtype * b, Dtype * c, int n) {
-    singa::singa_gpu_mult(a, b, c, n);
-  }
-#endif  // USE_GPU
-};
-
-template<typename Dtype>
-struct Div {
-  inline static void Map(const Dtype & a, const Dtype & b, Dtype * c) {
-    *c =  a / b;
-  }
-#ifdef USE_GPU
-  inline static void CudaMap(const Dtype * a,
-      const Dtype * b, Dtype * c, int n) {
-    singa::singa_gpu_div(a, b, c, n);
-  }
-#endif  // USE_GPU
-};
-
-
-/*********************************************************************/
-template<typename Dtype>
-struct Set {
-  inline static void Map(Dtype alpha, Dtype * a) {
-    *a = alpha;
-  }
-#ifdef USE_GPU
-  inline static void CudaMap(Dtype alpha, Dtype * a, int n) {
-    singa::singa_gpu_set_value(a, alpha, n);
-  }
-#endif  // USE_GPU
-};
-
-template<typename Dtype>
-struct Threshold {
-  inline static void Map(Dtype alpha, const Dtype & a, Dtype * b) {
-    *b =  a < alpha ? 1.0f : 0.0f;
-  }
-#ifdef USE_GPU
-  inline static void CudaMap(Dtype alpha,  const Dtype * a,
-      Dtype * b, int n) {
-    singa::singa_gpu_threshold(a, b, alpha, n);
-  }
-#endif  // USE_GPU
-};
-
-};  // namespace op
-
-};  // namespace singa
-
-#endif  // SINGA_UTILS_SINGA_OP_H_
diff --git a/include/singa/utils/singleton.h b/include/singa/utils/singleton.h
index 4cf487e..de831c4 100644
--- a/include/singa/utils/singleton.h
+++ b/include/singa/utils/singleton.h
@@ -7,9 +7,9 @@
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
-* 
+*
 *   http://www.apache.org/licenses/LICENSE-2.0
-* 
+*
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
@@ -22,10 +22,8 @@
 #ifndef SINGA_UTILS_SINGLETON_H_
 #define SINGA_UTILS_SINGLETON_H_
 
-/**
-  * Thread-safe implementation for C++11 according to
-  * http://stackoverflow.com/questions/2576022/efficient-thread-safe-singleton-in-c
-  */
+/// Thread-safe implementation for C++11 according to
+//  http://stackoverflow.com/questions/2576022/efficient-thread-safe-singleton-in-c
 template<typename T>
 class Singleton {
  public:
@@ -35,18 +33,18 @@
   }
 };
 
-/**
- * Thread Specific Singleton
- *
- * Each thread will have its own data_ storage.
- */
+/// Thread Specific Singleton
+/// Each thread will have its own data_ storage.
+/*
 template<typename T>
 class TSingleton {
  public:
   static T* Instance() {
-    static thread_local T data_;
+    static thread_local T data_;  // thread_local is not available in some
+                                  // compilers
     return &data_;
   }
 };
+*/
 
 #endif  // SINGA_UTILS_SINGLETON_H_
diff --git a/include/singa/utils/tokenizer.h b/include/singa/utils/string.h
similarity index 67%
rename from include/singa/utils/tokenizer.h
rename to include/singa/utils/string.h
index 9637c75..b4c7c24 100644
--- a/include/singa/utils/tokenizer.h
+++ b/include/singa/utils/string.h
@@ -22,10 +22,46 @@
 #ifndef SINGA_UTILS_TOKENIZER_H_
 #define SINGA_UTILS_TOKENIZER_H_
 
-#include <glog/logging.h>
 #include <string>
+#include <algorithm>
+#include "singa/utils/logging.h"
 
 namespace singa {
+inline bool icasecmp(const string& l, const string& r) {
+  return l.size() == r.size() &&
+         equal(l.cbegin(), l.cend(), r.cbegin(),
+               [](string::value_type l1, string::value_type r1) {
+                 return toupper(l1) == toupper(r1);
+               });
+}
+
+inline string ToLowerCase(const string& input) {
+  string out;
+  out.resize(input.size());
+  std::transform(input.begin(), input.end(), out.begin(), ::tolower);
+  return out;
+}
+
+inline int ArgPos(int argc, char** arglist, const char* arg) {
+  for (int i = 0; i < argc; i++) {
+    if (strcmp(arglist[i], arg) == 0) {
+      return i;
+    }
+  }
+  return -1;
+}
+
+template<typename T>
+inline std::string VecToStr(const std::vector<T> & in) {
+  std::string out = "(";
+
+  for (auto x : in) {
+    out += std::to_string(x) + ", ";
+  }
+  out += ")";
+  return out;
+}
+
 /**
  * Tokenize a string.
  *
diff --git a/include/singa/utils/timer.h b/include/singa/utils/timer.h
new file mode 100644
index 0000000..291c733
--- /dev/null
+++ b/include/singa/utils/timer.h
@@ -0,0 +1,58 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef SINGA_UTILS_TIMER_H
+#define SINGA_UTILS_TIMER_H
+
+#include <chrono>
+
+namespace singa {
+
+/// For benchmarking the time cost of operations.
+class Timer {
+ public:
+  typedef std::chrono::duration<int> Seconds;
+  typedef std::chrono::duration<int, std::milli> Milliseconds;
+  typedef std::chrono::duration<int, std::ratio<60 * 60>> Hours;
+  typedef std::chrono::duration<int, std::micro> Microseconds;
+
+  /// Init the internal time point to the current time
+  Timer() { Tick(); }
+  /// Reset the internal time point to the current time
+  void Tick() { last_ = std::chrono::high_resolution_clock::now(); }
+  /// Return the duration since last call to Tick() or since the creation of
+  /// Timer. The template arg must be from Second or Millisecond or Hour.
+  /// The returned value is the count of the time metric.
+  template <typename T = Milliseconds>
+  int Elapsed() const {
+    static_assert(std::is_same<T, Seconds>::value ||
+                      std::is_same<T, Milliseconds>::value ||
+                      std::is_same<T, Hours>::value ||
+                      std::is_same<T, Microseconds>::value,
+                  "Template arg must be Seconds | Milliseconds | Hours | Microseconds");
+    auto now  = std::chrono::high_resolution_clock::now();
+    return std::chrono::duration_cast<T>(now - last_).count();
+  }
+  /// Return the string rep of current wall time
+  // std::string CurrentTime();
+
+ private:
+  std::chrono::high_resolution_clock::time_point last_;
+};
+}
+#endif
diff --git a/include/singa/utils/updater.h b/include/singa/utils/updater.h
deleted file mode 100644
index 33ad8a7..0000000
--- a/include/singa/utils/updater.h
+++ /dev/null
@@ -1,173 +0,0 @@
-/************************************************************
-*
-* Licensed to the Apache Software Foundation (ASF) under one
-* or more contributor license agreements.  See the NOTICE file
-* distributed with this work for additional information
-* regarding copyright ownership.  The ASF licenses this file
-* to you under the Apache License, Version 2.0 (the
-* "License"); you may not use this file except in compliance
-* with the License.  You may obtain a copy of the License at
-*
-*   http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an
-* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-* KIND, either express or implied.  See the License for the
-* specific language governing permissions and limitations
-* under the License.
-*
-*************************************************************/
-
-#ifndef SINGA_UTILS_UPDATER_H_
-#define SINGA_UTILS_UPDATER_H_
-
-#include <string>
-#include "singa/proto/job.pb.h"
-#include "singa/utils/param.h"
-#include "singa/neuralnet/layer.h"
-
-namespace singa {
-using std::string;
-/**
- * Base learning rate generator.
- *
- * Generate learning rate for a give training step/iteration.
- * There are many different ways to change the learning rate through time/step.
- * Users can inherint this class to implement their own change method.
- */
-class LRGenerator {
- public:
-  static LRGenerator* Create(const LRGenProto& proto);
-
-  virtual ~LRGenerator() {}
-
-  virtual void Init(const LRGenProto& proto) { proto_ = proto; }
-  /**
-   * @param step training step/iteration.
-   * @return base learning rate regardless of step
-   */
-  virtual float Get(int step) { return proto_.base_lr(); }
-
- protected:
-  LRGenProto proto_;
-};
-
-class FixedStepLRGen : public LRGenerator {
- public:
-  float Get(int step) override;
- private:
-  int last_idx_ = 0;
-};
-
-class StepLRGen : public LRGenerator {
- public:
-  float Get(int step) override;
-};
-
-class LinearLRGen : public LRGenerator {
- public:
-  float Get(int step) override;
-};
-
-class ExpLRGen : public LRGenerator {
- public:
-  float Get(int step) override;
-};
-
-class InvLRGen : public LRGenerator {
- public:
-  float Get(int step) override;
-};
-
-class InvTLRGen : public LRGenerator {
- public:
-  float Get(int step) override;
-};
-
-/**
- * Updater for Param.
- */
-class Updater {
- public:
-
-  /* added for python binding */
-  static Updater* CreateUpdater(const string str);
-  /* ------------------------ */
-
-  static Updater* Create(const UpdaterProto& proto);
-
-  virtual ~Updater() {}
-
-  virtual void Init(const UpdaterProto &proto);
-  virtual void Update(int step, Param* param, float grad_scale) = 0;
-  void Clip(const float low, const float high, Param* param);
- protected:
-  UpdaterProto proto_;
-  LRGenerator* lr_gen_;
-  float weight_decay_;
-  float momentum_;
-  float clip_low_, clip_high_;
-};
-
-class SGDUpdater : public Updater {
- public:
-  void Update(int step, Param* param, float grad_scale) override;
-};
-
-class AdaGradUpdater : public Updater {
- public:
-  void Update(int step, Param* param, float grad_scale) override;
-};
-
-
-class NesterovUpdater : public Updater {
- public:
-  void Update(int step, Param* param, float grad_scale) override;
-};
-
-class RMSPropUpdater : public Updater {
- public:
-  void Init(const UpdaterProto &proto) override;
-  void Update(int step, Param* param, float grad_scale) override;
-
- protected:
-  float rho_;
-  float delta_;
-};
-
-class AdaDeltaUpdater : public Updater {
- public:
-  void Init(const UpdaterProto &proto) override;
-  void Update(int step, Param* param, float grad_scale) override;
-
- protected:
-  float rho_;
-  float delta_;
-};
-
-class AdamUpdater : public Updater {
-  public:
-   void Init(const UpdaterProto &proto) override;
-   void Update(int step, Param* param, float grad_scale) override;
-
-  protected:
-   float beta1_;
-   float beta2_;
-   float delta_;
-};
-
-class AdamMaxUpdater : public Updater {
-  public:
-   void Init(const UpdaterProto &proto) override;
-   void Update(int step, Param* param, float grad_scale) override;
-
-  protected:
-   float beta1_;
-   float beta2_;
-   float delta_;
-};
-
-}  // namespace singa
-
-#endif  // SINGA_UTILS_UPDATER_H_
diff --git a/include/singa/utils/zk_service.h b/include/singa/utils/zk_service.h
deleted file mode 100644
index 789215b..0000000
--- a/include/singa/utils/zk_service.h
+++ /dev/null
@@ -1,116 +0,0 @@
-/************************************************************
-*
-* Licensed to the Apache Software Foundation (ASF) under one
-* or more contributor license agreements.  See the NOTICE file
-* distributed with this work for additional information
-* regarding copyright ownership.  The ASF licenses this file
-* to you under the Apache License, Version 2.0 (the
-* "License"); you may not use this file except in compliance
-* with the License.  You may obtain a copy of the License at
-* 
-*   http://www.apache.org/licenses/LICENSE-2.0
-* 
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an
-* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-* KIND, either express or implied.  See the License for the
-* specific language governing permissions and limitations
-* under the License.
-*
-*************************************************************/
-
-#ifndef SINGA_UTILS_ZK_SERVICE_H_
-#define SINGA_UTILS_ZK_SERVICE_H_
-
-#include <zookeeper/zookeeper.h>
-#include <string>
-#include <vector>
-
-#include "singa/utils/cluster_rt.h"
-
-namespace singa {
-
-const int kZKBufSize = 100;
-// following paths are global
-const std::string kZKPathSinga = "/singa";
-const std::string kZKPathSys =   "/singa/sys";
-const std::string kZKPathJLock = "/singa/sys/job-lock";
-const std::string kZKPathHostIdx = "/singa/sys/host-idx";
-const std::string kZKPathApp =   "/singa/app";
-const std::string kZKPathJob =   "/singa/app/job-";
-// following paths are local under /singa/app/job-X
-const std::string kZKPathJobGroup = "/group";
-const std::string kZKPathJobProc =  "/proc";
-const std::string kZKPathJobPLock = "/proc-lock";
-
-inline std::string GetZKJobWorkspace(int job_id) {
-  char buf[kZKBufSize];
-  snprintf(buf, kZKBufSize, "%010d", job_id);
-  return kZKPathJob + buf;
-}
-
-/*
- * A wrapper for zookeeper service which handles error code and reconnections
- */
-class ZKService {
- public:
-  static void ChildChanges(zhandle_t* zh, int type, int state,
-                           const char *path, void* watcherCtx);
-
-  ~ZKService();
-  bool Init(const std::string& host, int timeout);
-  bool CreateNode(const char* path, const char* val, int flag, char* output);
-  bool DeleteNode(const char* path);
-  bool Exist(const char* path);
-  bool UpdateNode(const char* path, const char* val);
-  bool GetNode(const char* path, char* output);
-  bool GetChild(const char* path, std::vector<std::string>* vt);
-  bool WGetChild(const char* path, std::vector<std::string>* vt,
-                   RTCallback *cb);
-
- private:
-  const int kNumRetry = 5;
-  const int kSleepSec = 1;
-
-  static void WatcherGlobal(zhandle_t* zh, int type, int state,
-                            const char *path, void* watcherCtx);
-
-  zhandle_t* zkhandle_ = nullptr;
-};
-
-/*
- * A ClusterRuntime implementation using zookeeper
- */
-class ZKClusterRT : public ClusterRuntime {
- public:
-  ZKClusterRT(const std::string& host, int job_id);
-  ~ZKClusterRT();
-
-  bool Init() override;
-  int RegistProc(const std::string& host_addr, int pid) override;
-  std::string GetProcHost(int proc_id) override;
-  bool WatchSGroup(int gid, int sid, rt_callback fn, void* ctx) override;
-  bool JoinSGroup(int gid, int wid, int s_group) override;
-  bool LeaveSGroup(int gid, int wid, int s_group) override;
-
- private:
-  inline std::string groupPath(int gid) {
-    return group_path_ + "/sg" + std::to_string(gid);
-  }
-  inline std::string workerPath(int gid, int wid) {
-    return "/g" + std::to_string(gid) + "_w" + std::to_string(wid);
-  }
-
-  int timeout_ = 30000;
-  std::string host_ = "";
-  ZKService zk_;
-  std::string workspace_ = "";
-  std::string group_path_ = "";
-  std::string proc_path_ = "";
-  std::string proc_lock_path_ = "";
-  std::vector<RTCallback*> cb_vec_;
-};
-
-}  // namespace singa
-
-#endif  // SINGA_UTILS_ZK_SERVICE_H_
diff --git a/include/singa/worker.h b/include/singa/worker.h
deleted file mode 100644
index d53e54b..0000000
--- a/include/singa/worker.h
+++ /dev/null
@@ -1,340 +0,0 @@
-/************************************************************
-*
-* Licensed to the Apache Software Foundation (ASF) under one
-* or more contributor license agreements.  See the NOTICE file
-* distributed with this work for additional information
-* regarding copyright ownership.  The ASF licenses this file
-* to you under the Apache License, Version 2.0 (the
-* "License"); you may not use this file except in compliance
-* with the License.  You may obtain a copy of the License at
-*
-*   http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an
-* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-* KIND, either express or implied.  See the License for the
-* specific language governing permissions and limitations
-* under the License.
-*
-*************************************************************/
-
-#ifndef SINGA_WORKER_H_
-#define SINGA_WORKER_H_
-
-#include <string>
-#include <unordered_map>
-#include <vector>
-#include "singa/comm/socket.h"
-#include "singa/neuralnet/neuralnet.h"
-#include "singa/proto/job.pb.h"
-#include "singa/neuralnet/connection_layer.h"
-#include "singa/neuralnet/neuron_layer.h"
-
-namespace singa {
-
-//!< sleep 5 milliseconds if the Param is not updated to the expected version
-const int kCollectSleepTime = 5;
-/**
- * The Worker class which runs the training algorithm.
- * The first worker group will initialize parameters of the Net,
- * and put them into the distributed memory/table.
- * The virtual function TrainOneBatch and TestOneBatch implement the
- * training and test algorithm for one mini-batch data.
- *
- * Child workers override the two functions to implement their training
- * algorithms, e.g., the BPWorker/CDWorker/BPTTWorker implements the BP/CD/BPTT
- * algorithm respectively.
- */
-class Worker {
- public:
-  /**
-   * Create an instance of the subclass of Worker.
-   *
-   * @param[in] conf configuration of the TrainOneBatch algorithm. Different
-   * Worker subclasses implement different algorithms. Hence the creation is
-   * based on the TrainOneBatch algorithm type. Currently SINGA
-   * provides two algorithms:
-   * -# Back-propagation for the feed-forward models, e.g., CNN and MLP, and the
-   *  recurrent neural networks.
-   * -# Contrastive divergence for the energy models, e.g., RBM.
-   *
-   * @return a pointer to the instance of the Worker subclass.
-   */
-  static Worker* CreateWorker(const std::string str);
-  static Worker* Create(const AlgProto& conf);
-  virtual ~Worker();
-  /**
-   * @param[in] grp_id global worker group ID
-   * @param[in] id worker ID within the group
-   * @param[in] conf job configuration
-   * @param[in] train_net pointer to the training neural net, which could be
-   * shared with other workers from the same group. Different workers run over
-   * differnt subset of layers.
-   * @param[in] val_net pointer to the validation neural net. Currently only the
-   * first worker from the first group would have validation neural net. All
-   * other workers receive nullptr for this argument.
-   * @param[in] test_net pointer to the test neural net. Currently only the
-   * first worker from the first group would have test neural net. All other
-   * workers receive nullptr for this argument.
-   */
-  virtual void Setup(int grp_id, int id, const JobProto& conf,
-      NeuralNet* train_net, NeuralNet* val_net, NeuralNet* test_net);
-  /**
-   * Main function of Worker.
-   *
-   * Train the neuralnet step by step, test/validation is done periodically.
-   */
-  void Run();
-  /**
-   * Run TestOneBatch() over the a neural net for a total number of steps.
-   *
-   * @param[in] steps total number of test iterations.
-   * @param[in] phase kVal or kTest
-   * @param[in] net run test over the passed in neural net
-   */
-  void Test(int steps, Phase phase, NeuralNet* net);
-  /**
-   * Init sockets in a worker, including:
-   * 1. a global socket communicates with stub
-   * 2. a bridge socket dedicated for bridge layer communications
-   *
-   * the bridge socket will be binded to each bridge layer
-   *
-   * @param[in] net pointer to a neural net whose bridge layer will be binded
-   * with a socket.
-   */
-  void InitSockets(const NeuralNet* net);
-  /**
-   * Init values of Param instances assocaited with local layers (i.e., layers
-   * dispatched to this worker).
-   *
-   * If one Param is owned by the worker, then it should be initialized and put
-   * to servers. Otherwise Get() should be called to get the Param. The Get()
-   * may not send get requests if the Param owner is in the same procs, for
-   * which case the memory space of the Param objects are shared. But if this
-   * worker and the Param owner worker run on different devices (e.g., GPUs),
-   * then the get request would be sent.
-   *
-   * If the training starts from scrath, every Param object is initialzed using
-   * ParamGenerator. After that, the worker may
-   * train for a couple of steps to warmup the params before put
-   * them to servers (warmup of JobProto controls this).
-   *
-   * If one Param object's name matches that of one Param object from the
-   * checkpoint files, its Param values would be loaded from checkpoint files.
-   *
-   * @param[in] job_conf job configuration which provides settings for
-   * checkpoint file paths, warmup steps and Param versions.
-   * @param[out] net pointer to a neural net whose Param values will be
-   * initialized.
-   */
-  void InitNetParams(const JobProto& job_conf, NeuralNet* net);
-  void InitNetParams(const std::string& folder, vector<Layer*> net);
-  /**
-   * Checkpoint all Param objects owned by the worker onto disk.
-   * The serialization is done using BlobProtos which includes the name, version
-   * and values of each Param object.
-   * Different workers would generate different checkpoint files. The file path
-   * is <workspace>/checkpoint-<jobname>-step<step>-worker<worker_id>.bin
-   * @param[in] step training step
-   * @param[in] folder directory to put the checkpoint file
-   * @param net the training net whose Param objects will be dumped.
-   */
-  void Checkpoint(int step, const std::string& folder, NeuralNet* net);
-  void Checkpoint(int step, const std::string& folder, vector<Layer*> net);
-  /**
-    * Train one mini-batch.
-    * Test/Validation is done before training.
-    *
-    * @param[in] step training step.
-    * @param[in] net neural net to be trained.
-    */
-  virtual void TrainOneBatch(int step, NeuralNet* net) = 0;
-  /**
-   * Test/validate one mini-batch data.
-   *
-   * @param[in] step test step.
-   * @param[in] phase test could be done for validation or test phase.
-   * @param[in] net neural net for test
-   */
-  virtual void TestOneBatch(int step, Phase phase, NeuralNet* net) = 0;
-  /**
-   * Display infomation from layers.
-   *
-   * @param flag could be a combination of multiple phases, e.g, kTest|kForward,
-   * it is passed to the Layer::ToString() function for each layer to decide
-   * what to display .
-   * @param prefix display prefix, e.g., 'Train step 100', 'Test step 90'.
-   * @param net display layers from this neural net.
-   */
-  virtual void Display(int flag, const std::string& prefix, NeuralNet* net);
-  /**
-   * Put Param values to server.
-   *
-   * @param param
-   * @param step used as current param version for the put request
-   */
-  int Put(int step, Param* param);
-  /**
-   * Get Param with specific version from server
-   * If the current version >= the requested version, then return.
-   * Otherwise send a get request to stub who would forwards it to servers.
-   * @param param
-   * @param step requested param version
-   */
-  int Get(int step, Param* param);
-  /**
-   * Update Param.
-   *
-   * @param param
-   * @param step training step used for updating (e.g., deciding learning rate).
-   */
-  int Update(int step, Param* param);
-  /**
-   * Wait for the response of the update/get requests.
-   *
-   * @param param
-   * @param step not used now.
-   */
-  int Collect(int step, Param* param);
-  /**
-   * Call Collect() for every param of net
-   */
-  int CollectAll(int step, NeuralNet* net);
-  /**
-   * @param[in] step
-   * @return true if it is time to display training info, e.g., loss; otherwise
-   * false.
-   */
-  inline bool DisplayNow(int step) const {
-    return job_conf_.disp_freq() > 0
-           && step >= job_conf_.disp_after()
-           && ((step - job_conf_.disp_after()) % job_conf_.disp_freq() == 0);
-  }
-  /**
-   * @param[in] step
-   * @return true if it is time to finish the training; otherwise false.
-   */
-  inline bool StopNow(int step) const {
-    return step >= job_conf_.train_steps();
-  }
-  /**
-   * @param[in] step
-   * @return true if it is time to do checkpoint Param objects; otherwise false.
-   */
-  inline bool CheckpointNow(int step) const {
-    return job_conf_.checkpoint_freq() > 0
-           && step >= job_conf_.checkpoint_after()
-           && ((step - job_conf_.checkpoint_after())
-              % job_conf_.checkpoint_freq() == 0);
-  }
-  /**
-   * @param[in] step
-   * @return true if it is time to do test over the test dataset.
-   */
-  inline bool TestNow(int step) const {
-    return job_conf_.test_freq() > 0
-      && job_conf_.test_steps() > 0
-      && step >= job_conf_.test_after()
-      && ((step - job_conf_.test_after()) % job_conf_.test_freq() == 0);
-  }
-  /**
-   * @param[in] step
-   * @return true if it is time to do test over the validation dataset.
-   */
-  inline bool ValidateNow(int step) const {
-    return job_conf_.validate_freq() > 0
-      && job_conf_.validate_steps() > 0
-      && step >= job_conf_.validate_after()
-      && ((step - job_conf_.validate_after()) % job_conf_.validate_freq() == 0);
-  }
-  /**
-   * @return a vector with pointers to all neural nets.
-   */
-  const std::vector<NeuralNet*> GetNets() const {
-    return std::vector<NeuralNet*> {train_net_, val_net_, test_net_};
-  }
-  /**
-   * @return training net.
-   */
-  inline NeuralNet* train_net() const {
-    return train_net_;
-  }
-  /**
-   * @return group ID
-   */
-  inline int grp_id() const { return grp_id_; }
-  /**
-   * @reutrn worker ID within the worker group.
-   */
-  inline int id() const { return id_; }
-
- protected:
-  int grp_id_ = -1, id_ = -1;
-  int step_ = 0;
-  JobProto job_conf_;
-  NeuralNet* train_net_ = nullptr;
-  NeuralNet* test_net_ = nullptr;
-  NeuralNet* val_net_ = nullptr;
-  Dealer* dealer_ = nullptr;
-  // bridge layer related
-  Dealer* bridge_dealer_ = nullptr;
-  std::unordered_map<std::string, Layer*> name2bridge_;
-};
-
-class BPWorker: public Worker {
- public:
-  void TrainOneBatch(int step, NeuralNet* net) override;
-  void TestOneBatch(int step, Phase phase, NeuralNet* net) override;
-  virtual void Forward(int step, Phase phase, NeuralNet* net);
-  virtual void Backward(int step, NeuralNet* net);
-};
-
-/**
- * Subclass of Worker that implements BPTT (Backpropagation through time)
- * algorithm for computing gradients of RNN models.
- * Max BPTT/unrolling length is configured by users.
- */
-class BPTTWorker: public BPWorker {
- public:
-  void Forward(int step, Phase phase, NeuralNet* net) override;
-  void Backward(int step, NeuralNet* net) override;
-  void Display(int flag, const std::string& prefix, NeuralNet* net) override;
-
- private:
-  /*
-   * indicator used in truncted BPTT, which feeds the hidden state of the last
-   * unrolled unit to the first unit in Forward() for the next iteration.
-   * currently always feed the last hidden state to the first.
-   */
-  bool full_state_ = false;
-  //!< indicator used for the starting of a new pass of the dataset.
-  bool begin_ = false;
-};
-/**
- * Subclass of Worker that implements the Contrastive Divergence algorithm for
- * computing the gradients of paramters of energy models.
- */
-class CDWorker: public Worker {
- public:
-  void TrainOneBatch(int step, NeuralNet* net) override;
-  void TestOneBatch(int step, Phase phase, NeuralNet* net) override;
-};
-
-inline int BlobTrgt(int grp, int layer) {
-  return (grp << 16) | layer;
-}
-
-inline int BlobGrp(int blob_trgt) {
-  return blob_trgt >> 16;
-}
-
-inline int BlobLayer(int blob_trgt) {
-  static int mask = (1 << 16) -1;
-  return blob_trgt & mask;
-}
-
-}  // namespace singa
-
-#endif  // SINGA_WORKER_H_
diff --git a/tool/python/singa/generatepy.sh b/jenkins.sh
old mode 100755
new mode 100644
similarity index 61%
rename from tool/python/singa/generatepy.sh
rename to jenkins.sh
index 488d96a..347a55e
--- a/tool/python/singa/generatepy.sh
+++ b/jenkins.sh
@@ -1,5 +1,6 @@
-#!/usr/bin/env bash
+#!/usr/bin/env sh
 #/**
+# *
 # * Licensed to the Apache Software Foundation (ASF) under one
 # * or more contributor license agreements.  See the NOTICE file
 # * distributed with this work for additional information
@@ -17,10 +18,34 @@
 # * limitations under the License.
 # */
 
-#The following commands are only for developers adding new py apis.
-swig -c++ -python driver.i
-#g++ -fPIC ../../../src/driver.cc driver_wrap.cxx -shared -o _driver.so \
-# 	 -L../../../.libs/ -lsinga -DMSHADOW_USE_CUDA=0 \
-#    -DMSHADOW_USE_CBLAS=1 -DMSHADOW_USE_MKL=0 -std=c++11 \
-#    -I../../../include \
-#    -I/usr/include/python2.7/
+
+
+if [ $1 = "CPP" ]; then
+  echo "CPP test"
+  mkdir build
+  rm -f gtest.xml
+  cd build
+  cmake -DUSE_CUDNN=OFF -DUSE_CUDA=OFF -DUSE_PYTHON=OFF ../ 
+  make
+  ./bin/test_singa --gtest_output=xml:./../gtest.xml
+fi
+
+if [ $1 = "CUDNN" ]; then
+  echo "CUDNN test"
+  git submodule init
+  git submodule update
+  mkdir build
+  rm -f gtest.xml
+  cd build
+  cmake -DUSE_CUDNN=ON -DUSE_CUDA=ON -DUSE_PYTHON=OFF ../ 
+  make
+  ./bin/test_singa --gtest_output=xml:./../gtest.xml
+fi
+
+cd ..
+rm -rf build
+
+
+
+
+
diff --git a/lib/cnmem b/lib/cnmem
new file mode 160000
index 0000000..28a182d
--- /dev/null
+++ b/lib/cnmem
@@ -0,0 +1 @@
+Subproject commit 28a182d49529da49f4ac4e3941cec3edf16b3540
diff --git a/rat-excludes b/rat-excludes
index 63cb327..f7f79d2 100644
--- a/rat-excludes
+++ b/rat-excludes
@@ -3,7 +3,6 @@
 Makefile.*
 configure
 .gitignore
-conf/*
 doc/*
 config/*
 \.dirstamp
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
new file mode 100644
index 0000000..cc1ee0c
--- /dev/null
+++ b/src/CMakeLists.txt
@@ -0,0 +1,135 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+# generate protobuf sources
+
+FILE(GLOB proto_files proto/*.proto)
+protobuf_generate_cpp(proto_srcs proto_hdrs ${proto_files})
+IF (USE_PYTHON)
+    protobuf_generate_python(proto_pys ${proto_files})
+ENDIF()
+INCLUDE_DIRECTORIES("${CMAKE_BINARY_DIR}/include")
+
+#message(STATUS "include: ${CMAKE_BINARY_DIR} ")
+#message(STATUS "srcs: ${proto_srcs}")
+#message(STATUS "hdrs: ${proto_hdrs}")
+#message(STATUS "pys: ${proto_pys}")
+ADD_LIBRARY(singa_proto STATIC ${proto_hdrs} ${proto_srcs} ${proto_pys})
+FOREACH(fil ${proto_hdrs})
+    ADD_CUSTOM_COMMAND(
+        TARGET singa_proto PRE_BUILD
+        COMMAND ${CMAKE_COMMAND} -E make_directory "${CMAKE_BINARY_DIR}/include/singa/proto"
+        COMMAND ${CMAKE_COMMAND} -E copy ${fil} "${CMAKE_BINARY_DIR}/include/singa/proto"
+        #COMMAND ${CMAKE_COMMAND} -E echo "copy done"
+        )
+ENDFOREACH()
+LIST(APPEND SINGA_LINKER_LIBS singa_proto)
+
+SET(PREVIOUS_LINKER_LIBS ${SINGA_LINKER_LIBS})
+
+#FILE(GLOB_RECURSE utils_source ${CMAKE_CURRENT_SOURCE_DIR}/utils/ "*.cc")
+AUX_SOURCE_DIRECTORY(utils utils_source)
+#message(STATUS "UTILS ${utils_source}")
+ADD_LIBRARY(singa_utils SHARED ${utils_source})
+TARGET_LINK_LIBRARIES(singa_utils ${SINGA_LINKER_LIBS})
+LIST(APPEND SINGA_LINKER_LIBS singa_utils)
+
+#FILE(GLOB_RECURSE core_source ${CMAKE_CURRENT_SOURCE_DIR}/core/ "*.cc")
+AUX_SOURCE_DIRECTORY(core/device core_source)
+AUX_SOURCE_DIRECTORY(core/memory core_source)
+AUX_SOURCE_DIRECTORY(core/scheduler core_source)
+AUX_SOURCE_DIRECTORY(core/tensor core_source)
+IF (USE_CUDA)
+    FILE(GLOB_RECURSE cuda_source core "*.cu")
+    SET(FLAGS_BACKUP ${CMAKE_CXX_FLAGS})
+    SET(CMAKE_CXX_FLAGS "")
+    IF (CMAKE_BUILD_TYPE MATCHES DEBUG)
+        CUDA_COMPILE(cuda_objs SHARED ${cuda_source}
+            OPTIONS "-Xcompiler -fPIC -G -g")
+    ELSE (CMAKE_BUILD_TYPE MATCHES  DEBUG)
+        CUDA_COMPILE(cuda_objs SHARED ${cuda_source} OPTIONS "-Xcompiler -fPIC")
+    ENDIF (CMAKE_BUILD_TYPE MATCHES DEBUG)
+    include_directories("${CMAKE_CURRENT_SOURCE_DIR}/core/tensor")
+    SET(CMAKE_CXX_FLAGS ${FLAGS_BACKUP})
+ENDIF (USE_CUDA)
+#message(STATUS "FLAGS ${CMAKE_CXX_FLAGS}")
+#message(STATUS "CORE ${cuda_source}")
+#message(STATUS "OBJ ${cuda_objs}")
+ADD_LIBRARY(singa_core SHARED ${core_source} ${cuda_objs})
+TARGET_LINK_LIBRARIES(singa_core ${SINGA_LINKER_LIBS})
+LIST(APPEND SINGA_LINKER_LIBS singa_core)
+#MESSAGE(STATUS "link libs " ${SINGA_LINKER_LIBS})
+
+#FILE(GLOB_RECURSE model_source ${CMAKE_CURRENT_SOURCE_DIR}/model/ "*.cc")
+AUX_SOURCE_DIRECTORY(model model_source)
+AUX_SOURCE_DIRECTORY(model/layer model_source)
+AUX_SOURCE_DIRECTORY(model/optimizer model_source)
+AUX_SOURCE_DIRECTORY(model/loss model_source)
+AUX_SOURCE_DIRECTORY(model/metric model_source)
+AUX_SOURCE_DIRECTORY(model/updater model_source)
+#MESSAGE(STATUS "MODEL ${model_source}")
+ADD_LIBRARY(singa_model SHARED ${model_source})
+MESSAGE(STATUS "model linker libs ${SINGA_LINKER_LIBS}")
+TARGET_LINK_LIBRARIES(singa_model ${SINGA_LINKER_LIBS})
+LIST(APPEND SINGA_LINKER_LIBS singa_model)
+
+AUX_SOURCE_DIRECTORY(io io_source)
+AUX_SOURCE_DIRECTORY(io/network io_source)
+ADD_LIBRARY(singa_io SHARED ${io_source})
+TARGET_LINK_LIBRARIES(singa_io ${SINGA_LINKER_LIBS})
+LIST(APPEND SINGA_LINKER_LIBS singa_io)
+
+IF(USE_PYTHON)
+
+    FILE(REMOVE "${CMAKE_CURRENT_SOURCE_DIR}/python/swig/config.i")
+    CONFIGURE_FILE("${CMAKE_CURRENT_SOURCE_DIR}/python/swig/config.i.in" "${CMAKE_CURRENT_SOURCE_DIR}/python/swig/config.i")
+
+    FILE(GLOB python_files python/swig/singa.i)
+    # delete old .cxx file
+    FILE(REMOVE "${CMAKE_CURRENT_SOURCE_DIR}/python/swig/singa_wrap.cxx")
+
+    # generate cxx and wrap.py
+    swig_generate_cxx(python_srcs ${python_files})
+
+    #FILE(COPY python/ DESTINATION ${CMAKE_BINARY_DIR}/python/singa FILES_MATCHING PATTERN "swig" EXCLUDE PATTERN "*.py")
+    #Create symlinks for all python source files  Do not omit !!!RELATIVE!!!
+    file(GLOB_RECURSE python_source_files RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} *.py)
+
+    create_symlinks(${python_source_files})
+
+    SET(python_cxxs "${core_source};${io_source};${model_source};${utils_source}")
+    ADD_LIBRARY(_singa_wrap SHARED ${python_srcs} ${python_cxxs} ${cuda_objs})
+    SET(WRAPPER_LINKER_LIBS "${PREVIOUS_LINKER_LIBS}")
+    TARGET_LINK_LIBRARIES(_singa_wrap ${WRAPPER_LINKER_LIBS})
+    TARGET_INCLUDE_DIRECTORIES(_singa_wrap PRIVATE ${PYTHON_INCLUDE_DIRS})
+    #message(STATUS "PREVIOUS_LINKER_LIBS ${PREVIOUS_LINKER_LIBS}")
+
+    SET_TARGET_PROPERTIES(_singa_wrap
+        PROPERTIES PREFIX ""
+        LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/python/singa
+        )
+
+    #SETUP
+    SET(SETUP_PY_IN "python/setup.py.in")
+    SET(SETUP_PY    "${CMAKE_BINARY_DIR}/python/setup.py")
+    CONFIGURE_FILE(${SETUP_PY_IN} ${SETUP_PY})
+
+    #create python/singa/proto/__init__.py
+    FILE(WRITE ${CMAKE_BINARY_DIR}/python/singa/proto/__init__.py "")
+
+ENDIF(USE_PYTHON)
diff --git a/src/comm/msg.cc b/src/comm/msg.cc
deleted file mode 100644
index 8128b46..0000000
--- a/src/comm/msg.cc
+++ /dev/null
@@ -1,265 +0,0 @@
-/************************************************************
-*
-* Licensed to the Apache Software Foundation (ASF) under one
-* or more contributor license agreements.  See the NOTICE file
-* distributed with this work for additional information
-* regarding copyright ownership.  The ASF licenses this file
-* to you under the Apache License, Version 2.0 (the
-* "License"); you may not use this file except in compliance
-* with the License.  You may obtain a copy of the License at
-*
-*   http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an
-* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-* KIND, either express or implied.  See the License for the
-* specific language governing permissions and limitations
-* under the License.
-*
-*************************************************************/
-
-#include "singa/comm/msg.h"
-
-#include <glog/logging.h>
-#include <stdarg.h>
-
-namespace singa {
-
-Msg::~Msg() {
-#ifdef USE_ZMQ
-  if (msg_ != nullptr)
-    zmsg_destroy(&msg_);
-  frame_ = nullptr;
-#else
-  for (auto& frame : frames_)
-    delete static_cast<char*>(frame.first);
-#endif
-}
-
-Msg::Msg() {
-#ifdef USE_ZMQ
-  msg_ = zmsg_new();
-#endif
-}
-
-Msg::Msg(const Msg& msg) {
-  src_ = msg.src_;
-  dst_ = msg.dst_;
-  type_ = msg.type_;
-  trgt_val_ = msg.trgt_val_;
-  trgt_version_ = msg.trgt_version_;
-#ifdef USE_ZMQ
-  msg_ = zmsg_dup(msg.msg_);
-#endif
-}
-
-Msg::Msg(int src, int dst) {
-  src_ = src;
-  dst_ = dst;
-#ifdef USE_ZMQ
-  msg_ = zmsg_new();
-#endif
-}
-
-void Msg::SwapAddr() {
-  std::swap(src_, dst_);
-}
-
-#ifdef USE_ZMQ
-int Msg::size() const {
-  return zmsg_content_size(msg_);
-}
-void Msg::AddFrame(const void* addr, int nBytes) {
-  zmsg_addmem(msg_, addr, nBytes);
-}
-int Msg::FrameSize() {
-  return zframe_size(frame_);
-}
-char* Msg::FrameStr() {
-  return zframe_strdup(frame_);
-}
-void* Msg::FrameData() {
-  return zframe_data(frame_);
-}
-bool Msg::NextFrame() {
-  frame_ = zmsg_next(msg_);
-  return frame_ != nullptr;
-}
-void Msg::FirstFrame() {
-  frame_ = zmsg_first(msg_);
-}
-void Msg::LastFrame() {
-  frame_ = zmsg_last(msg_);
-}
-void Msg::ParseFromZmsg(zmsg_t* msg) {
-  char* tmp = zmsg_popstr(msg);
-  sscanf(tmp, "%d %d %d %d %d",
-         &src_, &dst_, &type_, &trgt_val_, &trgt_version_);
-  frame_ = zmsg_first(msg);
-  msg_ = msg;
-}
-
-zmsg_t* Msg::DumpToZmsg() {
-  zmsg_pushstrf(msg_, "%d %d %d %d %d",
-      src_, dst_, type_, trgt_val_, trgt_version_);
-  zmsg_t *tmp = msg_;
-  msg_ = nullptr;
-  return tmp;
-}
-
-#else
-
-int Msg::size() const {
-  int s = 0;
-  for (auto& entry : frames_)
-    s += entry.second;
-  return s;
-}
-
-void Msg::AddFrame(const void* addr, int nBytes) {
-  char* tmp = new char[nBytes];
-  memcpy(tmp, addr, nBytes);
-  frames_.push_back(std::make_pair(tmp, nBytes));
-}
-
-int Msg::FrameSize() {
-  return frames_.at(idx_).second;
-}
-
-char* Msg::FrameStr() {
-  char* ret = new char[frames_.at(idx_).second];
-  memcpy(ret, static_cast<char*>(frames_.at(idx_).first), 
-        frames_.at(idx_).second);
-  return ret;
-}
-
-void* Msg::FrameData() {
-  return frames_.at(idx_).first;
-}
-
-bool Msg::NextFrame() {
-  idx_++;
-//  LOG(ERROR) << "idx " << idx_ << " vs size " << frames_.size();
-  return idx_ < frames_.size();
-}
-
-void Msg::FirstFrame() {
-  idx_ = 0;
-}
-
-void Msg::LastFrame() {
-  idx_ = frames_.size() - 1;
-}
-
-#endif
-
-// frame marker indicating this frame is serialize like printf
-#define FMARKER "*singa*"
-
-#define kMaxFrameLen 2048
-
-int Msg::AddFormatFrame(const char *format, ...) {
-  va_list argptr;
-  va_start(argptr, format);
-  int size = strlen(FMARKER);
-  char dst[kMaxFrameLen];
-  memcpy(dst, FMARKER, size);
-  dst[size++] = 0;
-  while (*format) {
-    if (*format == 'i') {
-      int x = va_arg(argptr, int);
-      dst[size++] = 'i';
-      memcpy(dst + size, &x, sizeof(x));
-      size += sizeof(x);
-    } else if (*format == 'f') {
-      float x = static_cast<float> (va_arg(argptr, double));
-      dst[size++] = 'f';
-      memcpy(dst + size, &x, sizeof(x));
-      size += sizeof(x);
-    } else if (*format == '1') {
-      uint8_t x = va_arg(argptr, int);
-      memcpy(dst + size, &x, sizeof(x));
-      size += sizeof(x);
-    } else if (*format == '2') {
-      uint16_t x = va_arg(argptr, int);
-      memcpy(dst + size, &x, sizeof(x));
-      size += sizeof(x);
-    } else if (*format == '4') {
-      uint32_t x = va_arg(argptr, uint32_t);
-      memcpy(dst + size, &x, sizeof(x));
-      size += sizeof(x);
-    } else if (*format == 's') {
-      char* x = va_arg(argptr, char *);
-      dst[size++] = 's';
-      memcpy(dst + size, x, strlen(x));
-      size += strlen(x);
-      dst[size++] = 0;
-    } else if (*format == 'p') {
-      void* x = va_arg(argptr, void *);
-      dst[size++] = 'p';
-      memcpy(dst + size, &x, sizeof(x));
-      size += sizeof(x);
-    } else {
-      LOG(ERROR) << "Unknown format " << *format;
-    }
-    format++;
-    CHECK_LE(size, kMaxFrameLen);
-  }
-  va_end(argptr);
-  AddFrame(dst, size);
-  return size;
-}
-
-int Msg::ParseFormatFrame(const char *format, ...) {
-  va_list argptr;
-  va_start(argptr, format);
-  char* src = FrameStr();
-  CHECK_STREQ(FMARKER, src);
-  int size = strlen(FMARKER) + 1;
-  while (*format) {
-    if (*format == 'i') {
-      int *x = va_arg(argptr, int *);
-      CHECK_EQ(src[size++], 'i');
-      memcpy(x, src + size, sizeof(*x));
-      size += sizeof(*x);
-    } else if (*format == 'f') {
-      float *x = va_arg(argptr, float *);
-      CHECK_EQ(src[size++], 'f');
-      memcpy(x, src + size, sizeof(*x));
-      size += sizeof(*x);
-    } else if (*format == '1') {
-      uint8_t *x = va_arg(argptr, uint8_t *);
-      memcpy(x, src + size, sizeof(*x));
-      size += sizeof(*x);
-    } else if (*format == '2') {
-      uint16_t *x = va_arg(argptr, uint16_t *);
-      memcpy(x, src + size, sizeof(*x));
-      size += sizeof(*x);
-    } else if (*format == '4') {
-      uint32_t *x = va_arg(argptr, uint32_t *);
-      memcpy(x, src + size, sizeof(*x));
-      size += sizeof(*x);
-    } else if (*format == 's') {
-      char* x = va_arg(argptr, char *);
-      CHECK_EQ(src[size++], 's');
-      int len = strlen(src + size);
-      memcpy(x, src + size, len);
-      x[len] = 0;
-      size += len + 1;
-    } else if (*format == 'p') {
-      void** x = va_arg(argptr, void **);
-      CHECK_EQ(src[size++], 'p');
-      memcpy(x, src + size, sizeof(*x));
-      size += sizeof(*x);
-    } else {
-      LOG(ERROR) << "Unknown format type " << *format;
-    }
-    format++;
-  }
-  va_end(argptr);
-  //  delete src;
-  return size;
-}
-
-}  // namespace singa
diff --git a/src/comm/socket.cc b/src/comm/socket.cc
deleted file mode 100644
index eba6a0c..0000000
--- a/src/comm/socket.cc
+++ /dev/null
@@ -1,146 +0,0 @@
-/************************************************************
-*
-* Licensed to the Apache Software Foundation (ASF) under one
-* or more contributor license agreements.  See the NOTICE file
-* distributed with this work for additional information
-* regarding copyright ownership.  The ASF licenses this file
-* to you under the Apache License, Version 2.0 (the
-* "License"); you may not use this file except in compliance
-* with the License.  You may obtain a copy of the License at
-*
-*   http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an
-* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-* KIND, either express or implied.  See the License for the
-* specific language governing permissions and limitations
-* under the License.
-*
-*************************************************************/
-#include "singa/comm/socket.h"
-
-#include <glog/logging.h>
-
-namespace singa {
-const int TIME_OUT = 2;  // max blocking time in milliseconds.
-std::unordered_map<int, SafeQueue<Msg*>> msgQueues;
-Dealer::~Dealer() {
-#ifdef USE_ZMQ
-  zsock_destroy(&dealer_);
-#endif
-}
-
-Dealer::Dealer(int id) : id_ (id) {
-  msgQueues[id];
-}
-
-int Dealer::Connect(const std::string& endpoint) {
-  if (endpoint.length() > 0) {
-#ifdef USE_ZMQ
-    dealer_ = zsock_new(ZMQ_DEALER);
-    CHECK_NOTNULL(dealer_);
-    CHECK_EQ(zsock_connect(dealer_, "%s", endpoint.c_str()), 0);
-#else
-    LOG(FATAL) << "No message passing lib is linked";
-#endif
-    endpoint_ = endpoint;
-  }
-  return 1;
-}
-
-int Dealer::Send(Msg** msg) {
-  if (endpoint_.length()) {
-#ifdef USE_ZMQ
-    zmsg_t* zmsg = (*msg)->DumpToZmsg();
-    zmsg_send(&zmsg, dealer_);
-#else
-    LOG(FATAL) << "No message passing lib is linked";
-#endif
-    delete *msg;
-    *msg = nullptr;
-  } else {
-    msgQueues.at(-1).Push(*msg);
-  }
-  return 1;
-}
-
-Msg* Dealer::Receive(int timeout) {
-  Msg* msg = nullptr;
-  if (timeout > 0) {
-    if (!msgQueues.at(id_).Pop(msg, timeout))
-      return nullptr;
-  } else {
-    msgQueues.at(id_).Pop(msg);
-  }
-  msg->FirstFrame();
-  return msg;
-}
-
-Router::~Router() {
-#ifdef USE_ZMQ
-  zsock_destroy(&router_);
-#endif
-}
-
-Router::Router() {
-  msgQueues[-1];
-}
-
-int Router::Bind(const std::string& endpoint) {
-  int port = -1;
-  if (endpoint.length() > 0) {
-    endpoint_ = endpoint;
-#ifdef USE_ZMQ
-    router_ = zsock_new(ZMQ_ROUTER);
-    CHECK_NOTNULL(router_);
-    port = zsock_bind(router_, "%s", endpoint.c_str());
-    CHECK_NE(port, -1) << endpoint;
-    LOG(INFO) << "bind successfully to " << zsock_endpoint(router_);
-    poller_ = zpoller_new(router_);
-#else
-    LOG(FATAL) << "No message passing lib is linked";
-#endif
-  }
-  return port;
-}
-
-int Router::Send(Msg **msg) {
-  int dstid = (*msg)->dst();
-  if (msgQueues.find(dstid) != msgQueues.end()) {
-    msgQueues.at(dstid).Push(*msg);
-  } else {
-    LOG(FATAL) << "The dst queue not exist for dstid = " << dstid;
-  }
-  return 1;
-}
-
-Msg* Router::Receive(int timeout) {
-  Msg* msg = nullptr;
-  if (timeout == 0)
-    timeout = TIME_OUT;
-  while (msg == nullptr) {
-#ifdef USE_ZMQ
-    if (router_ != nullptr) {
-      zsock_t* sock = static_cast<zsock_t*>(zpoller_wait(poller_, timeout));
-      if (sock != NULL) {
-        zmsg_t* zmsg = zmsg_recv(router_);
-        if (zmsg == nullptr) {
-          LOG(ERROR) << "Connection broken!";
-          exit(0);
-        }
-        zframe_t* dealer = zmsg_pop(zmsg);
-        zframe_destroy(&dealer);
-        Msg* remote_msg = new Msg();
-        remote_msg->ParseFromZmsg(zmsg);
-        msgQueues.at(-1).Push(remote_msg);
-      }
-    }
-#endif
-    msgQueues.at(-1).Pop(msg, timeout * 10);
-  }
-  msg->FirstFrame();
-  return msg;
-}
-
-}  // namespace singa
diff --git a/src/core/device/cpp_cpu.cc b/src/core/device/cpp_cpu.cc
new file mode 100644
index 0000000..04209ab
--- /dev/null
+++ b/src/core/device/cpp_cpu.cc
@@ -0,0 +1,64 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "singa/core/device.h"
+
+namespace singa {
+
+std::shared_ptr<Device> defaultDevice=std::make_shared<CppCPU>();
+
+CppCPU::CppCPU() : Device(-1, 1) {
+  lang_ = kCpp;
+  //host_ = nullptr;
+}
+
+
+void CppCPU::SetRandSeed(unsigned seed) {
+  ctx_.random_generator.seed(seed);
+}
+
+
+void CppCPU::DoExec(function<void(Context*)>&& fn, int executor) {
+  CHECK_EQ(executor, 0);
+  fn(&ctx_);
+}
+
+
+void* CppCPU::Malloc(int size) {
+  if (size > 0) {
+    void *ptr = malloc(size);
+    memset(ptr, 0, size);
+    return ptr;
+  } else {
+    return nullptr;
+  }
+}
+
+
+void CppCPU::Free(void* ptr) {
+  if (ptr != nullptr)
+    free(ptr);
+}
+
+
+void CppCPU::CopyToFrom(void* dst, const void* src, size_t nBytes,
+                           CopyDirection direction, Context* ctx) {
+  memcpy(dst, src, nBytes);
+}
+
+}  // namespace singa
diff --git a/src/core/device/cuda_gpu.cc b/src/core/device/cuda_gpu.cc
new file mode 100644
index 0000000..0164752
--- /dev/null
+++ b/src/core/device/cuda_gpu.cc
@@ -0,0 +1,126 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "singa/singa_config.h"
+#ifdef USE_CUDA
+#include <cublas_v2.h>
+#include <cuda.h>
+#include <cuda_runtime.h>
+#include <curand.h>
+#include <chrono>
+#include <iostream>
+#include "singa/core/device.h"
+#include "singa/utils/cuda_utils.h"
+namespace singa {
+
+const cudaMemcpyKind copyKind[] = {cudaMemcpyHostToHost, cudaMemcpyHostToDevice,
+                                   cudaMemcpyDeviceToHost,
+                                   cudaMemcpyDeviceToDevice};
+
+CudaGPU::~CudaGPU() {
+  if (ctx_.cublas_handle) CUBLAS_CHECK(cublasDestroy(ctx_.cublas_handle));
+  if (ctx_.curand_generator)
+    CURAND_CHECK(curandDestroyGenerator(ctx_.curand_generator));
+#ifdef USE_CUDNN
+  if (ctx_.cudnn_handle) {
+    auto status = cudnnDestroy(ctx_.cudnn_handle);
+    CHECK_EQ(status, CUDNN_STATUS_SUCCESS) << cudnnGetErrorString(status);
+  }
+#endif
+}
+const int kNumCudaStream = 1;
+
+CudaGPU::CudaGPU(int id) : Device(id, kNumCudaStream) {
+  MemPoolConf conf;
+  conf.add_device(id);
+  pool_ = std::make_shared<CnMemPool>(conf);
+  Setup();
+}
+
+CudaGPU::CudaGPU(int id, std::shared_ptr<DeviceMemPool> pool)
+    : Device(id, kNumCudaStream) {
+  CHECK_NE(pool, nullptr);
+  pool_ = pool;
+  Setup();
+}
+
+void CudaGPU::Setup() {
+  lang_ = kCuda;
+  ctx_.stream = NULL;  // use the default sync stream
+  // TODO(wangwei) create one handle for each steam?
+  CUDA_CHECK(cudaSetDevice(id_));
+  // use curandCreateGeneratorHost for CudaHost device
+  CURAND_CHECK(
+      curandCreateGenerator(&ctx_.curand_generator, CURAND_RNG_PSEUDO_DEFAULT));
+  auto seed = std::chrono::system_clock::now().time_since_epoch().count();
+  SetRandSeed(seed);
+  // TODO(wangwei) if one generator per stream, then need diff offset per gen?
+  CURAND_CHECK(curandSetGeneratorOffset(ctx_.curand_generator, 0));
+  CUBLAS_CHECK(cublasCreate(&(ctx_.cublas_handle)));
+
+#ifdef USE_CUDNN
+  // TODO(wangwei) create one handle for each stream?
+  auto status = cudnnCreate(&ctx_.cudnn_handle);
+  CHECK_EQ(status, CUDNN_STATUS_SUCCESS) << cudnnGetErrorString(status);
+#endif  // USE_CUDNN
+}
+
+void CudaGPU::SetRandSeed(unsigned seed) {
+  CHECK(ctx_.curand_generator);
+  CURAND_CHECK(curandSetPseudoRandomGeneratorSeed(ctx_.curand_generator, seed));
+}
+
+void CudaGPU::DoExec(function<void(Context*)>&& fn, int executor) { fn(&ctx_); }
+
+void CudaGPU::CopyToFrom(void* dst, const void* src, size_t nBytes,
+                         CopyDirection direction, Context* ctx) {
+  cudaMemcpy(dst, src, nBytes, copyKind[direction]);
+  // TODO(wangwei) use async copy
+  // cudaMemcpyAsync(dst, src, nBytes,cudaMemcpyDefault, ctx_.stream);
+}
+
+size_t CudaGPU::GetAllocatedMem() {
+  if (pool_ != nullptr) {
+    auto ret = pool_->GetMemUsage();
+    return ret.second - ret.first;
+  }
+  LOG(ERROR) << "The memory pool is not set";
+  return 0u;
+}
+
+/// Allocate gpu memory.
+void* CudaGPU::Malloc(int size) {
+  void* ptr = nullptr;
+  if (size > 0) {
+    CUDA_CHECK(cudaSetDevice(id_));
+    pool_->Malloc((void**)&ptr, size);
+    // TODO(wangwei) remove the memset.
+    CUDA_CHECK(cudaMemset(ptr, 0, size));
+  }
+  return ptr;
+}
+
+/// Free gpu memory.
+void CudaGPU::Free(void* ptr) {
+  if (ptr != nullptr) {
+    CUDA_CHECK(cudaSetDevice(id_));
+    pool_->Free(ptr);
+  }
+}
+
+}  // namespace singa
+#endif  // USE_CUDA
diff --git a/src/core/device/device.cc b/src/core/device/device.cc
new file mode 100644
index 0000000..0220df0
--- /dev/null
+++ b/src/core/device/device.cc
@@ -0,0 +1,74 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "singa/core/device.h"
+
+namespace singa {
+Device::Device(int id, int num_executors)
+    : id_(id), num_executors_(num_executors) {
+  // TODO(wangwei) create scheduler and vm.
+  host_ = defaultDevice;
+}
+
+void Device::Exec(function<void(Context*)>&& fn, const vector<Block*> read_blocks,
+                    const vector<Block*> write_blocks, bool use_rand_generator) {
+  // TODO(wangwei) execute operations scheduled by the scheduler.
+  DoExec(std::move(fn), 0);
+}
+
+// TODO(wangwei) get Block from the memory manager
+Block* Device::NewBlock(int size) {
+  if (size > 0) {
+    void* ptr = Malloc(size);
+    return new Block(ptr, size);
+  } else {
+    return nullptr;
+  }
+}
+
+// TODO(wangwei) return Block to the memory manager
+void Device::FreeBlock(Block* block) {
+  if (block != nullptr) {
+    Free(block->mutable_data());
+    delete block;
+  }
+}
+
+void Device::CopyDataToFrom(Block* dst, Block* src, size_t nBytes,
+                            CopyDirection direct, int dst_offset,
+                            int src_offset) {
+  this->Exec(
+      [this, dst, src, nBytes, direct, dst_offset, src_offset](Context* ctx) {
+        this->CopyToFrom(
+            reinterpret_cast<char*>(dst->mutable_data()) + dst_offset,
+            reinterpret_cast<const char*>(src->data()) + src_offset, nBytes,
+            direct, ctx);
+      },
+      {src}, {dst});
+}
+
+void Device::CopyDataFromHostPtr(Block* dst, const void* src, size_t nBytes,
+                                 size_t dst_offset) {
+  auto direct = lang_ == kCpp ? kHostToHost : kHostToDevice;
+  void* dstptr = reinterpret_cast<char*>(dst->mutable_data()) + dst_offset;
+  Exec([this, dstptr, src, nBytes,
+        direct](Context* ctx) { CopyToFrom(dstptr, src, nBytes, direct, ctx); },
+       {}, {dst});
+}
+void Device::Sync() {}
+}  // namespace singa
diff --git a/src/core/device/opencl_device.cc b/src/core/device/opencl_device.cc
new file mode 100644
index 0000000..b941cd2
--- /dev/null
+++ b/src/core/device/opencl_device.cc
@@ -0,0 +1,248 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <iostream>
+#include <fstream>
+#include <sstream>
+#include <string>
+
+#include "singa/core/device.h"
+#include "singa/utils/tinydir.h"
+
+#ifdef USE_OPENCL
+
+using std::string;
+
+namespace singa {
+
+const string OpenclDevice::cl_src_path = "../src/core/tensor";
+
+OpenclDevice::OpenclDevice(int id, int num_executors)
+	: Device(id, num_executors) {
+  lang_ = kOpencl;
+  this->kernels = std::make_shared<std::unordered_map<string, cl::Kernel>>();
+
+  // Create the OpenCL Device, Context, and CommandQueue.
+  /// TODO: This merely chooses the first device on the first platform.
+  cl_int status = CL_SUCCESS;
+
+  std::vector<cl::Platform> platforms;
+  status = cl::Platform::get(&platforms);
+  OCL_CHECK(status, "Failed to find any OpenCL platforms!");
+
+  std::vector<cl::Device> devices;
+  status = platforms[0].getDevices(CL_DEVICE_TYPE_ALL, &devices);
+  OCL_CHECK(status, "Failed to get list of devices from platform!");
+
+  this->this_device = cl::Device(devices[0]);
+  this->ocl_ctx = cl::Context(this_device, nullptr, nullptr, nullptr, &status);
+  OCL_CHECK(status, "Failed to create context!");
+
+  this->cmdq = cl::CommandQueue(ocl_ctx, this_device, CL_QUEUE_PROFILING_ENABLE, &status);
+  OCL_CHECK(status, "Failed to create a command queue!");
+
+  BuildPrograms();
+
+  ctx_.kernels = kernels;
+  ctx_.ocl_cmdq = cmdq;
+  ctx_.ocl_ctx = ocl_ctx;
+}
+
+
+OpenclDevice::~OpenclDevice() {
+
+  // Flush and finish the command queue.
+  cmdq.flush();
+  cmdq.finish();
+}
+
+
+cl::Kernel OpenclDevice::GetKernel(const std::string& kname, cl_int* status) {
+  if (!status) *status = CL_SUCCESS;
+  if (kernels->find(kname) == kernels->end()) {
+    // TODO: Not found
+    LOG(ERROR) << "Error: Kernel " << kname << " could not be found!";
+    if (!status) *status = CL_INVALID_KERNEL;
+  }
+  return kernels->at(kname);
+}
+
+/*
+void OpenclDevice::PrintAllDeviceInfo() {
+  cl_int status = CL_SUCCESS;
+
+  for (auto dev : devices) {
+    PrintDeviceInfo(d);
+  }
+}
+*/
+
+
+void OpenclDevice::PrintClBuildInfo(cl::Program &p) {
+  cl_int status = CL_SUCCESS;
+
+  auto buildStatus = p.getBuildInfo<CL_PROGRAM_BUILD_STATUS>(&status);
+  for (auto pair : buildStatus)
+	std::cout << clGetBuildInfoString(pair.second) << std::endl;
+
+  auto buildLog = p.getBuildInfo<CL_PROGRAM_BUILD_LOG>(&status);
+  for (auto pair : buildLog)
+	std::cout << pair.second << std::endl;
+}
+
+
+void OpenclDevice::SetRandSeed(unsigned seed) { seed = seed; }
+
+
+void OpenclDevice::CopyDataToFrom(Block* dst, Block* src, size_t nBytes,
+                                  CopyDirection direction, int dst_offset, int src_offset) {
+  // Pointers must be valid.
+  if (!dst || !src) return;
+
+  CopyToFrom(dst->mutable_data(), src->data(), nBytes, direction);
+}
+
+/*
+void OpenclDevice::CopyDataFromHostPtr(Block* dst, const void* src, size_t nBytes, size_t dst_offset) {
+  CopyToFrom(dst->mutable_data(), src, 4, kHostToDevice);
+}
+*/
+
+void OpenclDevice::BuildPrograms(const std::string &kdir) {
+  cl_int status = CL_SUCCESS;
+
+  tinydir_dir dir;
+  tinydir_open(&dir, kdir.c_str());
+
+  while (dir.has_next) {
+	tinydir_file file;
+	tinydir_readfile(&dir, &file);
+	std::string ext(file.extension);
+	if (ext.compare("cl") != 0) {
+	  tinydir_next(&dir);
+	  continue;
+	}
+
+	std::ifstream clFile(file.path, std::ios_base::binary);
+	std::stringstream buffer;
+	buffer << clFile.rdbuf();
+	std::string clSrc(buffer.str());
+
+	cl::Program program(this->ocl_ctx, clSrc, false, &status);
+	OCL_CHECK(status, "Program creation failed.");
+	status = program.build({this_device}, "-cl-std=CL1.2");
+	if (status == CL_SUCCESS) {
+	  std::vector<cl::Kernel> built_kernels;
+	  status = program.createKernels(&built_kernels);
+	  OCL_CHECK(status, "Failed to create kernels in built program.");
+
+	  for (auto k : built_kernels) {
+		std::string name = k.getInfo<CL_KERNEL_FUNCTION_NAME>(&status);
+		this->kernels->insert(std::make_pair(name, k));
+	  }
+	} else {
+	  OCL_CHECK(status, "Build failed on source path");
+	  LOG(ERROR) << file.path << std::endl;
+	  PrintClBuildInfo(program);
+	}
+
+	tinydir_next(&dir);
+  }
+}
+
+// Device IO functions.
+// TODO:
+// Research - MapBuffers can improve performance when the device uses shared memory
+// but is more complex to understand. http://stackoverflow.com/questions/22057692/whats-the-difference-between-clenqueuemapbuffer-and-clenqueuewritebuffer
+// Intel graphics (and possibly AMD APUs) should use MapBuffers?
+// https://software.intel.com/en-us/articles/getting-the-most-from-opencl-12-how-to-increase-performance-by-minimizing-buffer-copies-on-intel-processor-graphics
+
+
+void OpenclDevice::DoExec(function<void(Context*)>&& fn, int executor) {
+  fn(&ctx_);
+}
+
+// NOTE: ASSUMES dst AND/OR src POINTERS CAN BE CAST TO cl::Buffer POINTERS!
+void OpenclDevice::CopyToFrom(void* dst, const void* src, size_t nBytes,
+                  CopyDirection direction, Context* ctx) {
+  // Pointers must be valid.
+  if (!dst || !src) return;
+
+  switch(direction) {
+  case kHostToDevice: {
+    WriteToDevice(static_cast<cl::Buffer*>(dst), src, nBytes);
+    return;
+  }
+  case kDeviceToHost: {
+    ReadFromDevice(dst, static_cast<const cl::Buffer*>(src), nBytes);
+    return;
+  }
+  case kDeviceToDevice: {
+    CopyDeviceBuffer(static_cast<cl::Buffer*>(dst), static_cast<const cl::Buffer*>(src), nBytes);
+    return;
+  }
+  default:
+    return;
+  }
+}
+
+
+void* OpenclDevice::Malloc(int size) {
+  cl_int status = CL_SUCCESS;
+
+  cl::Buffer* buffer = new cl::Buffer(ocl_ctx, CL_MEM_READ_WRITE, size, nullptr, &status);
+  OCL_CHECK(status, "Unable to allocate memory in OpenCL device.");
+
+  return static_cast<void*>(buffer);
+}
+
+
+void OpenclDevice::Free(void* p) {
+  if (!p) return;
+  cl::Buffer* buffer = static_cast<cl::Buffer*>(p);
+  delete buffer;
+}
+
+
+void OpenclDevice::WriteToDevice(cl::Buffer* dst, const void* src, const size_t size) {
+  cl_int status = CL_SUCCESS;
+
+  status = cmdq.enqueueWriteBuffer(*dst, CL_TRUE, 0, size, src);
+  OCL_CHECK(status, "Unable to write data to OpenCL device.");
+}
+
+
+void OpenclDevice::ReadFromDevice(void* dst, const cl::Buffer* src, const size_t size) {
+  cl_int status = CL_SUCCESS;
+
+  status = cmdq.enqueueReadBuffer(*src, CL_TRUE, 0, size, dst);
+  OCL_CHECK(status, "Unable to read data from OpenCL device.");
+}
+
+
+// dst: cl::Buffer pointer    src: cl::Buffer pointer
+void OpenclDevice::CopyDeviceBuffer(cl::Buffer* dst, const cl::Buffer* src, const size_t size) {
+  cl_int status = CL_SUCCESS;
+
+  status = cmdq.enqueueCopyBuffer(*src, *dst, 0, 0, size);
+  OCL_CHECK(status, "Unable to copy buffer in OpenCL device.");
+}
+
+} // namespace singa
+
+#endif // USE_OPENCL
diff --git a/src/core/device/platform.cc b/src/core/device/platform.cc
new file mode 100644
index 0000000..a3661f2
--- /dev/null
+++ b/src/core/device/platform.cc
@@ -0,0 +1,141 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "singa/core/device.h"
+#include "singa/singa_config.h"
+
+#ifdef USE_CUDA
+
+namespace singa {
+
+int Platform::GetNumGPUs() {
+  int count;
+  CUDA_CHECK(cudaGetDeviceCount(&count));
+  return count;
+}
+
+bool Platform::CheckDevice(const int device_id) {
+  bool r = ((cudaSuccess == cudaSetDevice(device_id)) &&
+            (cudaSuccess == cudaFree(0)));
+  // reset any error that may have occurred.
+  cudaGetLastError();
+  return r;
+}
+
+/// Return the total num of free GPUs
+const vector<int> Platform::GetGPUIDs() {
+  vector<int> gpus;
+  int count = Platform::GetNumGPUs();
+  for (int i = 0; i < count; i++) {
+    if (Platform::CheckDevice(i)) {
+      gpus.push_back(i);
+    }
+  }
+  return gpus;
+}
+
+const std::pair<size_t, size_t> Platform::GetGPUMemSize(const int device) {
+  std::pair<size_t, size_t> ret{ 0, 0 };
+  if (Platform::CheckDevice(device)) {
+    CUDA_CHECK(cudaSetDevice(device));
+    size_t free = 0, total = 0;
+    CUDA_CHECK(cudaMemGetInfo(&free, &total));
+    ret = std::make_pair(free, total);
+  } else {
+    LOG(ERROR) << "The device (ID = " << device << ") is not available";
+  }
+  return ret;
+}
+
+const vector<std::pair<size_t, size_t>> Platform::GetGPUMemSize() {
+  vector<std::pair<size_t, size_t>> mem;
+  int count = Platform::GetNumGPUs();
+  for (int i = 0; i < count; i++) {
+    mem.push_back(Platform::GetGPUMemSize(i));
+  }
+  return mem;
+}
+
+const string Platform::DeviceQuery(int device, bool verbose) {
+  if (cudaSuccess != cudaGetDevice(&device)) {
+    return "The device (ID = " + std::to_string(device) + " is not available" ;
+  }
+  cudaDeviceProp prop;
+  CUDA_CHECK(cudaGetDeviceProperties(&prop, device));
+  std::ostringstream out;
+  out << "Device id:                     " << device << '\n';
+  out << "Total global memory:           " << prop.totalGlobalMem << '\n';
+  out << "Total shared memory per block: " << prop.sharedMemPerBlock
+      << '\n';
+  out << "Maximum threads per block:     " << prop.maxThreadsPerBlock
+      << '\n';
+  out << "Maximum dimension of block:    "
+      << prop.maxThreadsDim[0 << '\n'] << ", " << prop.maxThreadsDim[1]
+      << ", " << prop.maxThreadsDim[2] << '\n';
+  out << "Maximum dimension of grid:     " << prop.maxGridSize[0] << ", "
+      << "Concurrent copy and execution: "
+      << (prop.deviceOverlap ? "Yes" : "No") << '\n';
+
+  if (verbose) {
+    out << "Major revision number:         " << prop.major << '\n';
+    out << "Minor revision number:         " << prop.minor << '\n';
+    out << "Name:                          " << prop.name << '\n';
+    out << "Total registers per block:     " << prop.regsPerBlock << '\n';
+    out << "Maximum memory pitch:          " << prop.memPitch << '\n';
+    out << "Warp size:                     " << prop.warpSize
+      << prop.maxGridSize[1] << ", " << prop.maxGridSize[2] << '\n';
+    out << "Clock rate:                    " << prop.clockRate << '\n';
+    out << "Number of multiprocessors:     " << prop.multiProcessorCount
+        << '\n';
+    out << "Kernel execution timeout:      "
+        << (prop.kernelExecTimeoutEnabled ? "Yes" : "No") << '\n';
+  }
+  return out.str();
+}
+
+const vector<shared_ptr<Device> >
+Platform::CreateCudaGPUs(const size_t num_devices, size_t init_size) {
+  const vector<int> gpus = GetGPUIDs();
+  CHECK_LE(num_devices, gpus.size());
+  vector<int> use_gpus(gpus.begin(), gpus.begin() + num_devices);
+  return CreateCudaGPUsOn(use_gpus, init_size);
+}
+
+const vector<shared_ptr<Device> >
+Platform::CreateCudaGPUsOn(const vector<int> &devices, size_t init_size) {
+  MemPoolConf conf;
+  if (init_size > 0)
+    conf.set_init_size(init_size);
+  size_t bytes = conf.init_size() << 20;
+  for (auto device : devices) {
+    conf.add_device(device);
+    CHECK_LE(bytes, Platform::GetGPUMemSize(device).first);
+  }
+  auto pool = std::make_shared<CnMemPool>(conf);
+
+  vector<shared_ptr<Device> > ret;
+  for (auto device : devices) {
+    auto dev = std::make_shared<CudaGPU>(device, pool);
+    ret.push_back(dev);
+  }
+  return ret;
+}
+
+}  // namespace singa
+
+#endif  // USE_CUDA
diff --git a/src/core/memory/memory.cc b/src/core/memory/memory.cc
new file mode 100644
index 0000000..cb33a48
--- /dev/null
+++ b/src/core/memory/memory.cc
@@ -0,0 +1,111 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "singa/core/memory.h"
+#include "singa/utils/logging.h"
+#include "singa/proto/core.pb.h"
+#include <iostream>
+
+#ifdef USE_CUDA
+namespace singa {
+std::atomic<int> CnMemPool::pool_count(0);
+std::pair<size_t, size_t> CnMemPool::GetMemUsage() {
+  size_t free, total;
+  auto status = cnmemMemGetInfo(&free, &total, NULL);
+  CHECK_EQ(status, cnmemStatus_t::CNMEM_STATUS_SUCCESS)
+    << cnmemGetErrorString(status);
+  return std::make_pair(free, total);
+}
+
+CnMemPool::CnMemPool(int numDevices, size_t init_size, size_t max_size) {
+  for (int i = 0; i < numDevices; i++)
+    conf_.add_device(i);
+  conf_.set_init_size(init_size);
+  conf_.set_max_size(max_size);
+  CHECK_LT(++pool_count, 2) << "CnMemPool must be used as a singleton.";
+}
+
+CnMemPool::CnMemPool(const MemPoolConf &conf) {
+  conf_ = conf;
+  CHECK_LT(++pool_count, 2) << "CnMemPool must be used as a singleton.";
+}
+
+void CnMemPool::Init() {
+  mtx_.lock();
+  if (!initialized_) {
+    const size_t kNBytesPerMB = (1u << 20);
+    CHECK_GE(conf_.device_size(), 1);
+    cnmemDevice_t *settingPtr = new cnmemDevice_t[conf_.device_size()];
+    CHECK_GT(conf_.init_size(), 0u);
+    int i = 0;
+    for (auto device : conf_.device()) {
+      settingPtr[i].device = device;
+      settingPtr[i].size = conf_.init_size() * kNBytesPerMB;
+      settingPtr[i].numStreams = 0;
+      settingPtr[i].streams = NULL;
+      settingPtr[i].streamSizes = 0;
+      i++;
+    }
+    auto status = cnmemInit(conf_.device_size(), settingPtr, conf_.flag());
+    CHECK_EQ(status, cnmemStatus_t::CNMEM_STATUS_SUCCESS)
+        << " " << cnmemGetErrorString(status);
+    delete[] settingPtr;
+    initialized_ = true;
+  }
+  mtx_.unlock();
+}
+
+CnMemPool::~CnMemPool() {
+  mtx_.lock();
+  if (initialized_) {
+    cnmemStatus_t status = cnmemFinalize();
+    CHECK_EQ(status, cnmemStatus_t::CNMEM_STATUS_SUCCESS)
+        << " " << cnmemGetErrorString(status);
+    initialized_ = false;
+    --pool_count;
+  }
+  mtx_.unlock();
+}
+
+void CnMemPool::Malloc(void **ptr, const size_t size) {
+  if (!initialized_)
+    Init();
+  cnmemStatus_t status = cnmemMalloc(ptr, size, NULL);
+  CHECK_EQ(status, cnmemStatus_t::CNMEM_STATUS_SUCCESS)
+      << " " << cnmemGetErrorString(status);
+}
+
+void CnMemPool::Free(void *ptr) {
+  CHECK(initialized_) << "Cannot free the memory as the pool is not initialzied";
+  cnmemStatus_t status = cnmemFree(ptr, NULL);
+  CHECK_EQ(status, cnmemStatus_t::CNMEM_STATUS_SUCCESS)
+      << " " << cnmemGetErrorString(status);
+}
+
+// ===========================================================================
+void CudaMemPool::Malloc(void **ptr, const size_t size) {
+  cudaError_t status = cudaMalloc(ptr, size);
+  CHECK_EQ(status, cudaError_t::cudaSuccess);
+}
+
+void CudaMemPool::Free(void *ptr) {
+  cudaError_t status = cudaFree(ptr);
+  CHECK_EQ(status, cudaError_t::cudaSuccess);
+}
+}
+#endif
diff --git a/src/core/scheduler/scheduler.cc b/src/core/scheduler/scheduler.cc
new file mode 100644
index 0000000..183674f
--- /dev/null
+++ b/src/core/scheduler/scheduler.cc
@@ -0,0 +1,19 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "singa/core/scheduler.h"
diff --git a/src/core/tensor/distribution.cl b/src/core/tensor/distribution.cl
new file mode 100644
index 0000000..ce298c0
--- /dev/null
+++ b/src/core/tensor/distribution.cl
@@ -0,0 +1,1020 @@
+// This code is adapted from https://github.com/amd/OpenCL-caffe/blob/stable/src/caffe/ocl/random.cl
+
+//Note: random generator has two parts
+//first part: the open sourced threefy random generator kernel from DE Shaw Research
+//second part: we wrap the kernel up to generate uniform, bernoulli and gaussian distribution generators.
+
+//begin: the open sourced random generator from DE Shaw Research
+//https://www.deshawresearch.com/resources_random123.html
+typedef uint uint32_t;
+
+struct r123array4x32 {
+  uint32_t v[4];
+};
+
+enum r123_enum_threefry32x4 {
+  R_32x4_0_0 = 10,
+  R_32x4_0_1 = 26,
+  R_32x4_1_0 = 11,
+  R_32x4_1_1 = 21,
+  R_32x4_2_0 = 13,
+  R_32x4_2_1 = 27,
+  R_32x4_3_0 = 23,
+  R_32x4_3_1 = 5,
+  R_32x4_4_0 = 6,
+  R_32x4_4_1 = 20,
+  R_32x4_5_0 = 17,
+  R_32x4_5_1 = 11,
+  R_32x4_6_0 = 25,
+  R_32x4_6_1 = 10,
+  R_32x4_7_0 = 18,
+  R_32x4_7_1 = 20
+};
+
+inline uint32_t RotL_32(uint32_t x, unsigned int N) {
+  return (x << (N & 31)) | (x >> ((32 - N) & 31));
+}
+
+typedef struct r123array4x32 threefry4x32_ctr_t;
+typedef struct r123array4x32 threefry4x32_key_t;
+typedef struct r123array4x32 threefry4x32_ukey_t;
+
+inline threefry4x32_ctr_t threefry4x32_R(unsigned int Nrounds, threefry4x32_ctr_t in, threefry4x32_key_t k) {
+  threefry4x32_ctr_t X;
+  uint32_t ks[4 + 1];
+  int i;
+  ks[4] = 0x1BD11BDA;
+
+  {
+    ks[0] = k.v[0];
+    X.v[0] = in.v[0];
+    ks[4] ^= k.v[0];
+
+    ks[1] = k.v[1];
+    X.v[1] = in.v[1];
+    ks[4] ^= k.v[1];
+
+    ks[2] = k.v[2];
+    X.v[2] = in.v[2];
+    ks[4] ^= k.v[2];
+
+    ks[3] = k.v[3];
+    X.v[3] = in.v[3];
+    ks[4] ^= k.v[3];
+  }
+
+  X.v[0] += ks[0];
+  X.v[1] += ks[1];
+  X.v[2] += ks[2];
+  X.v[3] += ks[3];
+
+  if (Nrounds > 0) {
+    X.v[0] += X.v[1];
+    X.v[1] = RotL_32(X.v[1], R_32x4_0_0);
+    X.v[1] ^= X.v[0];
+    X.v[2] += X.v[3];
+    X.v[3] = RotL_32(X.v[3], R_32x4_0_1);
+    X.v[3] ^= X.v[2];
+  }
+
+  if (Nrounds > 1) {
+    X.v[0] += X.v[3];
+    X.v[3] = RotL_32(X.v[3], R_32x4_1_0);
+    X.v[3] ^= X.v[0];
+    X.v[2] += X.v[1];
+    X.v[1] = RotL_32(X.v[1], R_32x4_1_1);
+    X.v[1] ^= X.v[2];
+  }
+
+  if (Nrounds > 2) {
+    X.v[0] += X.v[1];
+    X.v[1] = RotL_32(X.v[1], R_32x4_2_0);
+    X.v[1] ^= X.v[0];
+    X.v[2] += X.v[3];
+    X.v[3] = RotL_32(X.v[3], R_32x4_2_1);
+    X.v[3] ^= X.v[2];
+  }
+
+  if (Nrounds > 3) {
+    X.v[0] += X.v[3];
+    X.v[3] = RotL_32(X.v[3], R_32x4_3_0);
+    X.v[3] ^= X.v[0];
+    X.v[2] += X.v[1];
+    X.v[1] = RotL_32(X.v[1], R_32x4_3_1);
+    X.v[1] ^= X.v[2];
+  }
+
+  if (Nrounds > 3) {
+    X.v[0] += ks[1];
+    X.v[1] += ks[2];
+    X.v[2] += ks[3];
+    X.v[3] += ks[4];
+    X.v[4 - 1] += 1;
+  }
+
+  if (Nrounds > 4) {
+    X.v[0] += X.v[1];
+    X.v[1] = RotL_32(X.v[1], R_32x4_4_0);
+    X.v[1] ^= X.v[0];
+    X.v[2] += X.v[3];
+    X.v[3] = RotL_32(X.v[3], R_32x4_4_1);
+    X.v[3] ^= X.v[2];
+  }
+
+  if (Nrounds > 5) {
+    X.v[0] += X.v[3];
+    X.v[3] = RotL_32(X.v[3], R_32x4_5_0);
+    X.v[3] ^= X.v[0];
+    X.v[2] += X.v[1];
+    X.v[1] = RotL_32(X.v[1], R_32x4_5_1);
+    X.v[1] ^= X.v[2];
+  }
+
+  if (Nrounds > 6) {
+    X.v[0] += X.v[1];
+    X.v[1] = RotL_32(X.v[1], R_32x4_6_0);
+    X.v[1] ^= X.v[0];
+    X.v[2] += X.v[3];
+    X.v[3] = RotL_32(X.v[3], R_32x4_6_1);
+    X.v[3] ^= X.v[2];
+  }
+
+  if (Nrounds > 7) {
+    X.v[0] += X.v[3];
+    X.v[3] = RotL_32(X.v[3], R_32x4_7_0);
+    X.v[3] ^= X.v[0];
+    X.v[2] += X.v[1];
+    X.v[1] = RotL_32(X.v[1], R_32x4_7_1);
+    X.v[1] ^= X.v[2];
+  }
+
+  if (Nrounds > 7) {
+    X.v[0] += ks[2];
+    X.v[1] += ks[3];
+    X.v[2] += ks[4];
+    X.v[3] += ks[0];
+    X.v[4 - 1] += 2;
+  }
+
+  if (Nrounds > 8) {
+    X.v[0] += X.v[1];
+    X.v[1] = RotL_32(X.v[1], R_32x4_0_0);
+    X.v[1] ^= X.v[0];
+    X.v[2] += X.v[3];
+    X.v[3] = RotL_32(X.v[3], R_32x4_0_1);
+    X.v[3] ^= X.v[2];
+  }
+
+  if (Nrounds > 9) {
+    X.v[0] += X.v[3];
+    X.v[3] = RotL_32(X.v[3], R_32x4_1_0);
+    X.v[3] ^= X.v[0];
+    X.v[2] += X.v[1];
+    X.v[1] = RotL_32(X.v[1], R_32x4_1_1);
+    X.v[1] ^= X.v[2];
+  }
+
+  if (Nrounds > 10) {
+    X.v[0] += X.v[1];
+    X.v[1] = RotL_32(X.v[1], R_32x4_2_0);
+    X.v[1] ^= X.v[0];
+    X.v[2] += X.v[3];
+    X.v[3] = RotL_32(X.v[3], R_32x4_2_1);
+    X.v[3] ^= X.v[2];
+  }
+
+  if (Nrounds > 11) {
+    X.v[0] += X.v[3];
+    X.v[3] = RotL_32(X.v[3], R_32x4_3_0);
+    X.v[3] ^= X.v[0];
+    X.v[2] += X.v[1];
+    X.v[1] = RotL_32(X.v[1], R_32x4_3_1);
+    X.v[1] ^= X.v[2];
+  }
+
+  if (Nrounds > 11) {
+    X.v[0] += ks[3];
+    X.v[1] += ks[4];
+    X.v[2] += ks[0];
+    X.v[3] += ks[1];
+    X.v[4 - 1] += 3;
+  }
+
+  if (Nrounds > 12) {
+    X.v[0] += X.v[1];
+    X.v[1] = RotL_32(X.v[1], R_32x4_4_0);
+    X.v[1] ^= X.v[0];
+    X.v[2] += X.v[3];
+    X.v[3] = RotL_32(X.v[3], R_32x4_4_1);
+    X.v[3] ^= X.v[2];
+  }
+
+  if (Nrounds > 13) {
+    X.v[0] += X.v[3];
+    X.v[3] = RotL_32(X.v[3], R_32x4_5_0);
+    X.v[3] ^= X.v[0];
+    X.v[2] += X.v[1];
+    X.v[1] = RotL_32(X.v[1], R_32x4_5_1);
+    X.v[1] ^= X.v[2];
+  }
+
+  if (Nrounds > 14) {
+    X.v[0] += X.v[1];
+    X.v[1] = RotL_32(X.v[1], R_32x4_6_0);
+    X.v[1] ^= X.v[0];
+    X.v[2] += X.v[3];
+    X.v[3] = RotL_32(X.v[3], R_32x4_6_1);
+    X.v[3] ^= X.v[2];
+  }
+
+  if (Nrounds > 15) {
+    X.v[0] += X.v[3];
+    X.v[3] = RotL_32(X.v[3], R_32x4_7_0);
+    X.v[3] ^= X.v[0];
+    X.v[2] += X.v[1];
+    X.v[1] = RotL_32(X.v[1], R_32x4_7_1);
+    X.v[1] ^= X.v[2];
+  }
+
+  if (Nrounds > 15) {
+    X.v[0] += ks[4];
+    X.v[1] += ks[0];
+    X.v[2] += ks[1];
+    X.v[3] += ks[2];
+    X.v[4 - 1] += 4;
+  }
+
+  if (Nrounds > 16) {
+    X.v[0] += X.v[1];
+    X.v[1] = RotL_32(X.v[1], R_32x4_0_0);
+    X.v[1] ^= X.v[0];
+    X.v[2] += X.v[3];
+    X.v[3] = RotL_32(X.v[3], R_32x4_0_1);
+    X.v[3] ^= X.v[2];
+  }
+
+  if (Nrounds > 17) {
+    X.v[0] += X.v[3];
+    X.v[3] = RotL_32(X.v[3], R_32x4_1_0);
+    X.v[3] ^= X.v[0];
+    X.v[2] += X.v[1];
+    X.v[1] = RotL_32(X.v[1], R_32x4_1_1);
+    X.v[1] ^= X.v[2];
+  }
+
+  if (Nrounds > 18) {
+    X.v[0] += X.v[1];
+    X.v[1] = RotL_32(X.v[1], R_32x4_2_0);
+    X.v[1] ^= X.v[0];
+    X.v[2] += X.v[3];
+    X.v[3] = RotL_32(X.v[3], R_32x4_2_1);
+    X.v[3] ^= X.v[2];
+  }
+
+  if (Nrounds > 19) {
+    X.v[0] += X.v[3];
+    X.v[3] = RotL_32(X.v[3], R_32x4_3_0);
+    X.v[3] ^= X.v[0];
+    X.v[2] += X.v[1];
+    X.v[1] = RotL_32(X.v[1], R_32x4_3_1);
+    X.v[1] ^= X.v[2];
+  }
+
+  if (Nrounds > 19) {
+    X.v[0] += ks[0];
+    X.v[1] += ks[1];
+    X.v[2] += ks[2];
+    X.v[3] += ks[3];
+    X.v[4 - 1] += 5;
+  }
+
+  if (Nrounds > 20) {
+    X.v[0] += X.v[1];
+    X.v[1] = RotL_32(X.v[1], R_32x4_4_0);
+    X.v[1] ^= X.v[0];
+    X.v[2] += X.v[3];
+    X.v[3] = RotL_32(X.v[3], R_32x4_4_1);
+    X.v[3] ^= X.v[2];
+  }
+
+  if (Nrounds > 21) {
+    X.v[0] += X.v[3];
+    X.v[3] = RotL_32(X.v[3], R_32x4_5_0);
+    X.v[3] ^= X.v[0];
+    X.v[2] += X.v[1];
+    X.v[1] = RotL_32(X.v[1], R_32x4_5_1);
+    X.v[1] ^= X.v[2];
+  }
+
+  if (Nrounds > 22) {
+    X.v[0] += X.v[1];
+    X.v[1] = RotL_32(X.v[1], R_32x4_6_0);
+    X.v[1] ^= X.v[0];
+    X.v[2] += X.v[3];
+    X.v[3] = RotL_32(X.v[3], R_32x4_6_1);
+    X.v[3] ^= X.v[2];
+  }
+
+  if (Nrounds > 23) {
+    X.v[0] += X.v[3];
+    X.v[3] = RotL_32(X.v[3], R_32x4_7_0);
+    X.v[3] ^= X.v[0];
+    X.v[2] += X.v[1];
+    X.v[1] = RotL_32(X.v[1], R_32x4_7_1);
+    X.v[1] ^= X.v[2];
+  }
+
+  if (Nrounds > 23) {
+    X.v[0] += ks[1];
+    X.v[1] += ks[2];
+    X.v[2] += ks[3];
+    X.v[3] += ks[4];
+    X.v[4 - 1] += 6;
+  }
+
+  if (Nrounds > 24) {
+    X.v[0] += X.v[1];
+    X.v[1] = RotL_32(X.v[1], R_32x4_0_0);
+    X.v[1] ^= X.v[0];
+    X.v[2] += X.v[3];
+    X.v[3] = RotL_32(X.v[3], R_32x4_0_1);
+    X.v[3] ^= X.v[2];
+  }
+
+  if (Nrounds > 25) {
+    X.v[0] += X.v[3];
+    X.v[3] = RotL_32(X.v[3], R_32x4_1_0);
+    X.v[3] ^= X.v[0];
+    X.v[2] += X.v[1];
+    X.v[1] = RotL_32(X.v[1], R_32x4_1_1);
+    X.v[1] ^= X.v[2];
+  }
+
+  if (Nrounds > 26) {
+    X.v[0] += X.v[1];
+    X.v[1] = RotL_32(X.v[1], R_32x4_2_0);
+    X.v[1] ^= X.v[0];
+    X.v[2] += X.v[3];
+    X.v[3] = RotL_32(X.v[3], R_32x4_2_1);
+    X.v[3] ^= X.v[2];
+  }
+
+  if (Nrounds > 27) {
+    X.v[0] += X.v[3];
+    X.v[3] = RotL_32(X.v[3], R_32x4_3_0);
+    X.v[3] ^= X.v[0];
+    X.v[2] += X.v[1];
+    X.v[1] = RotL_32(X.v[1], R_32x4_3_1);
+    X.v[1] ^= X.v[2];
+  }
+
+  if (Nrounds > 27) {
+    X.v[0] += ks[2];
+    X.v[1] += ks[3];
+    X.v[2] += ks[4];
+    X.v[3] += ks[0];
+    X.v[4 - 1] += 7;
+  }
+
+  if (Nrounds > 28) {
+    X.v[0] += X.v[1];
+    X.v[1] = RotL_32(X.v[1], R_32x4_4_0);
+    X.v[1] ^= X.v[0];
+    X.v[2] += X.v[3];
+    X.v[3] = RotL_32(X.v[3], R_32x4_4_1);
+    X.v[3] ^= X.v[2];
+  }
+
+  if (Nrounds > 29) {
+    X.v[0] += X.v[3];
+    X.v[3] = RotL_32(X.v[3], R_32x4_5_0);
+    X.v[3] ^= X.v[0];
+    X.v[2] += X.v[1];
+    X.v[1] = RotL_32(X.v[1], R_32x4_5_1);
+    X.v[1] ^= X.v[2];
+  }
+
+  if (Nrounds > 30) {
+    X.v[0] += X.v[1];
+    X.v[1] = RotL_32(X.v[1], R_32x4_6_0);
+    X.v[1] ^= X.v[0];
+    X.v[2] += X.v[3];
+    X.v[3] = RotL_32(X.v[3], R_32x4_6_1);
+    X.v[3] ^= X.v[2];
+  }
+
+  if (Nrounds > 31) {
+    X.v[0] += X.v[3];
+    X.v[3] = RotL_32(X.v[3], R_32x4_7_0);
+    X.v[3] ^= X.v[0];
+    X.v[2] += X.v[1];
+    X.v[1] = RotL_32(X.v[1], R_32x4_7_1);
+    X.v[1] ^= X.v[2];
+  }
+
+  if (Nrounds > 31) {
+    X.v[0] += ks[3];
+    X.v[1] += ks[4];
+    X.v[2] += ks[0];
+    X.v[3] += ks[1];
+    X.v[4 - 1] += 8;
+  }
+
+  if (Nrounds > 32) {
+    X.v[0] += X.v[1];
+    X.v[1] = RotL_32(X.v[1], R_32x4_0_0);
+    X.v[1] ^= X.v[0];
+    X.v[2] += X.v[3];
+    X.v[3] = RotL_32(X.v[3], R_32x4_0_1);
+    X.v[3] ^= X.v[2];
+  }
+
+  if (Nrounds > 33) {
+    X.v[0] += X.v[3];
+    X.v[3] = RotL_32(X.v[3], R_32x4_1_0);
+    X.v[3] ^= X.v[0];
+    X.v[2] += X.v[1];
+    X.v[1] = RotL_32(X.v[1], R_32x4_1_1);
+    X.v[1] ^= X.v[2];
+  }
+
+  if (Nrounds > 34) {
+    X.v[0] += X.v[1];
+    X.v[1] = RotL_32(X.v[1], R_32x4_2_0);
+    X.v[1] ^= X.v[0];
+    X.v[2] += X.v[3];
+    X.v[3] = RotL_32(X.v[3], R_32x4_2_1);
+    X.v[3] ^= X.v[2];
+  }
+
+  if (Nrounds > 35) {
+    X.v[0] += X.v[3];
+    X.v[3] = RotL_32(X.v[3], R_32x4_3_0);
+    X.v[3] ^= X.v[0];
+    X.v[2] += X.v[1];
+    X.v[1] = RotL_32(X.v[1], R_32x4_3_1);
+    X.v[1] ^= X.v[2];
+  }
+
+  if (Nrounds > 35) {
+    X.v[0] += ks[4];
+    X.v[1] += ks[0];
+    X.v[2] += ks[1];
+    X.v[3] += ks[2];
+    X.v[4 - 1] += 9;
+  }
+
+  if (Nrounds > 36) {
+    X.v[0] += X.v[1];
+    X.v[1] = RotL_32(X.v[1], R_32x4_4_0);
+    X.v[1] ^= X.v[0];
+    X.v[2] += X.v[3];
+    X.v[3] = RotL_32(X.v[3], R_32x4_4_1);
+    X.v[3] ^= X.v[2];
+  }
+
+  if (Nrounds > 37) {
+    X.v[0] += X.v[3];
+    X.v[3] = RotL_32(X.v[3], R_32x4_5_0);
+    X.v[3] ^= X.v[0];
+    X.v[2] += X.v[1];
+    X.v[1] = RotL_32(X.v[1], R_32x4_5_1);
+    X.v[1] ^= X.v[2];
+  }
+
+  if (Nrounds > 38) {
+    X.v[0] += X.v[1];
+    X.v[1] = RotL_32(X.v[1], R_32x4_6_0);
+    X.v[1] ^= X.v[0];
+    X.v[2] += X.v[3];
+    X.v[3] = RotL_32(X.v[3], R_32x4_6_1);
+    X.v[3] ^= X.v[2];
+  }
+
+  if (Nrounds > 39) {
+    X.v[0] += X.v[3];
+    X.v[3] = RotL_32(X.v[3], R_32x4_7_0);
+    X.v[3] ^= X.v[0];
+    X.v[2] += X.v[1];
+    X.v[1] = RotL_32(X.v[1], R_32x4_7_1);
+    X.v[1] ^= X.v[2];
+  }
+
+  if (Nrounds > 39) {
+    X.v[0] += ks[0];
+    X.v[1] += ks[1];
+    X.v[2] += ks[2];
+    X.v[3] += ks[3];
+    X.v[4 - 1] += 10;
+  }
+
+  if (Nrounds > 40) {
+    X.v[0] += X.v[1];
+    X.v[1] = RotL_32(X.v[1], R_32x4_0_0);
+    X.v[1] ^= X.v[0];
+    X.v[2] += X.v[3];
+    X.v[3] = RotL_32(X.v[3], R_32x4_0_1);
+    X.v[3] ^= X.v[2];
+  }
+
+  if (Nrounds > 41) {
+    X.v[0] += X.v[3];
+    X.v[3] = RotL_32(X.v[3], R_32x4_1_0);
+    X.v[3] ^= X.v[0];
+    X.v[2] += X.v[1];
+    X.v[1] = RotL_32(X.v[1], R_32x4_1_1);
+    X.v[1] ^= X.v[2];
+  }
+  if (Nrounds > 42) {
+    X.v[0] += X.v[1];
+    X.v[1] = RotL_32(X.v[1], R_32x4_2_0);
+    X.v[1] ^= X.v[0];
+    X.v[2] += X.v[3];
+    X.v[3] = RotL_32(X.v[3], R_32x4_2_1);
+    X.v[3] ^= X.v[2];
+  }
+
+  if (Nrounds > 43) {
+    X.v[0] += X.v[3];
+    X.v[3] = RotL_32(X.v[3], R_32x4_3_0);
+    X.v[3] ^= X.v[0];
+    X.v[2] += X.v[1];
+    X.v[1] = RotL_32(X.v[1], R_32x4_3_1);
+    X.v[1] ^= X.v[2];
+  }
+
+  if (Nrounds > 43) {
+    X.v[0] += ks[1];
+    X.v[1] += ks[2];
+    X.v[2] += ks[3];
+    X.v[3] += ks[4];
+    X.v[4 - 1] += 11;
+  }
+
+  if (Nrounds > 44) {
+    X.v[0] += X.v[1];
+    X.v[1] = RotL_32(X.v[1], R_32x4_4_0);
+    X.v[1] ^= X.v[0];
+    X.v[2] += X.v[3];
+    X.v[3] = RotL_32(X.v[3], R_32x4_4_1);
+    X.v[3] ^= X.v[2];
+  }
+
+  if (Nrounds > 45) {
+    X.v[0] += X.v[3];
+    X.v[3] = RotL_32(X.v[3], R_32x4_5_0);
+    X.v[3] ^= X.v[0];
+    X.v[2] += X.v[1];
+    X.v[1] = RotL_32(X.v[1], R_32x4_5_1);
+    X.v[1] ^= X.v[2];
+  }
+
+  if (Nrounds > 46) {
+    X.v[0] += X.v[1];
+    X.v[1] = RotL_32(X.v[1], R_32x4_6_0);
+    X.v[1] ^= X.v[0];
+    X.v[2] += X.v[3];
+    X.v[3] = RotL_32(X.v[3], R_32x4_6_1);
+    X.v[3] ^= X.v[2];
+  }
+
+  if (Nrounds > 47) {
+    X.v[0] += X.v[3];
+    X.v[3] = RotL_32(X.v[3], R_32x4_7_0);
+    X.v[3] ^= X.v[0];
+    X.v[2] += X.v[1];
+    X.v[1] = RotL_32(X.v[1], R_32x4_7_1);
+    X.v[1] ^= X.v[2];
+  }
+
+  if (Nrounds > 47) {
+    X.v[0] += ks[2];
+    X.v[1] += ks[3];
+    X.v[2] += ks[4];
+    X.v[3] += ks[0];
+    X.v[4 - 1] += 12;
+  }
+
+  if (Nrounds > 48) {
+    X.v[0] += X.v[1];
+    X.v[1] = RotL_32(X.v[1], R_32x4_0_0);
+    X.v[1] ^= X.v[0];
+    X.v[2] += X.v[3];
+    X.v[3] = RotL_32(X.v[3], R_32x4_0_1);
+    X.v[3] ^= X.v[2];
+  }
+
+  if (Nrounds > 49) {
+    X.v[0] += X.v[3];
+    X.v[3] = RotL_32(X.v[3], R_32x4_1_0);
+    X.v[3] ^= X.v[0];
+    X.v[2] += X.v[1];
+    X.v[1] = RotL_32(X.v[1], R_32x4_1_1);
+    X.v[1] ^= X.v[2];
+  }
+
+  if (Nrounds > 50) {
+    X.v[0] += X.v[1];
+    X.v[1] = RotL_32(X.v[1], R_32x4_2_0);
+    X.v[1] ^= X.v[0];
+    X.v[2] += X.v[3];
+    X.v[3] = RotL_32(X.v[3], R_32x4_2_1);
+    X.v[3] ^= X.v[2];
+  }
+
+  if (Nrounds > 51) {
+    X.v[0] += X.v[3];
+    X.v[3] = RotL_32(X.v[3], R_32x4_3_0);
+    X.v[3] ^= X.v[0];
+    X.v[2] += X.v[1];
+    X.v[1] = RotL_32(X.v[1], R_32x4_3_1);
+    X.v[1] ^= X.v[2];
+  }
+
+  if (Nrounds > 51) {
+    X.v[0] += ks[3];
+    X.v[1] += ks[4];
+    X.v[2] += ks[0];
+    X.v[3] += ks[1];
+    X.v[4 - 1] += 13;
+  }
+
+  if (Nrounds > 52) {
+    X.v[0] += X.v[1];
+    X.v[1] = RotL_32(X.v[1], R_32x4_4_0);
+    X.v[1] ^= X.v[0];
+    X.v[2] += X.v[3];
+    X.v[3] = RotL_32(X.v[3], R_32x4_4_1);
+    X.v[3] ^= X.v[2];
+  }
+
+  if (Nrounds > 53) {
+    X.v[0] += X.v[3];
+    X.v[3] = RotL_32(X.v[3], R_32x4_5_0);
+    X.v[3] ^= X.v[0];
+    X.v[2] += X.v[1];
+    X.v[1] = RotL_32(X.v[1], R_32x4_5_1);
+    X.v[1] ^= X.v[2];
+  }
+
+  if (Nrounds > 54) {
+    X.v[0] += X.v[1];
+    X.v[1] = RotL_32(X.v[1], R_32x4_6_0);
+    X.v[1] ^= X.v[0];
+    X.v[2] += X.v[3];
+    X.v[3] = RotL_32(X.v[3], R_32x4_6_1);
+    X.v[3] ^= X.v[2];
+  }
+
+  if (Nrounds > 55) {
+    X.v[0] += X.v[3];
+    X.v[3] = RotL_32(X.v[3], R_32x4_7_0);
+    X.v[3] ^= X.v[0];
+    X.v[2] += X.v[1];
+    X.v[1] = RotL_32(X.v[1], R_32x4_7_1);
+    X.v[1] ^= X.v[2];
+  }
+
+  if (Nrounds > 55) {
+    X.v[0] += ks[4];
+    X.v[1] += ks[0];
+    X.v[2] += ks[1];
+    X.v[3] += ks[2];
+    X.v[4 - 1] += 14;
+  }
+
+  if (Nrounds > 56) {
+    X.v[0] += X.v[1];
+    X.v[1] = RotL_32(X.v[1], R_32x4_0_0);
+    X.v[1] ^= X.v[0];
+    X.v[2] += X.v[3];
+    X.v[3] = RotL_32(X.v[3], R_32x4_0_1);
+    X.v[3] ^= X.v[2];
+  }
+
+  if (Nrounds > 57) {
+    X.v[0] += X.v[3];
+    X.v[3] = RotL_32(X.v[3], R_32x4_1_0);
+    X.v[3] ^= X.v[0];
+    X.v[2] += X.v[1];
+    X.v[1] = RotL_32(X.v[1], R_32x4_1_1);
+    X.v[1] ^= X.v[2];
+  }
+
+  if (Nrounds > 58) {
+    X.v[0] += X.v[1];
+    X.v[1] = RotL_32(X.v[1], R_32x4_2_0);
+    X.v[1] ^= X.v[0];
+    X.v[2] += X.v[3];
+    X.v[3] = RotL_32(X.v[3], R_32x4_2_1);
+    X.v[3] ^= X.v[2];
+  }
+
+  if (Nrounds > 59) {
+    X.v[0] += X.v[3];
+    X.v[3] = RotL_32(X.v[3], R_32x4_3_0);
+    X.v[3] ^= X.v[0];
+    X.v[2] += X.v[1];
+    X.v[1] = RotL_32(X.v[1], R_32x4_3_1);
+    X.v[1] ^= X.v[2];
+  }
+
+  if (Nrounds > 59) {
+    X.v[0] += ks[0];
+    X.v[1] += ks[1];
+    X.v[2] += ks[2];
+    X.v[3] += ks[3];
+    X.v[4 - 1] += 15;
+  }
+
+  if (Nrounds > 60) {
+    X.v[0] += X.v[1];
+    X.v[1] = RotL_32(X.v[1], R_32x4_4_0);
+    X.v[1] ^= X.v[0];
+    X.v[2] += X.v[3];
+    X.v[3] = RotL_32(X.v[3], R_32x4_4_1);
+    X.v[3] ^= X.v[2];
+  }
+
+  if (Nrounds > 61) {
+    X.v[0] += X.v[3];
+    X.v[3] = RotL_32(X.v[3], R_32x4_5_0);
+    X.v[3] ^= X.v[0];
+    X.v[2] += X.v[1];
+    X.v[1] = RotL_32(X.v[1], R_32x4_5_1);
+    X.v[1] ^= X.v[2];
+  }
+
+  if (Nrounds > 62) {
+    X.v[0] += X.v[1];
+    X.v[1] = RotL_32(X.v[1], R_32x4_6_0);
+    X.v[1] ^= X.v[0];
+    X.v[2] += X.v[3];
+    X.v[3] = RotL_32(X.v[3], R_32x4_6_1);
+    X.v[3] ^= X.v[2];
+  }
+
+  if (Nrounds > 63) {
+    X.v[0] += X.v[3];
+    X.v[3] = RotL_32(X.v[3], R_32x4_7_0);
+    X.v[3] ^= X.v[0];
+    X.v[2] += X.v[1];
+    X.v[1] = RotL_32(X.v[1], R_32x4_7_1);
+    X.v[1] ^= X.v[2];
+  }
+
+  if (Nrounds > 63) {
+    X.v[0] += ks[1];
+    X.v[1] += ks[2];
+    X.v[2] += ks[3];
+    X.v[3] += ks[4];
+    X.v[4 - 1] += 16;
+  }
+
+  if (Nrounds > 64) {
+    X.v[0] += X.v[1];
+    X.v[1] = RotL_32(X.v[1], R_32x4_0_0);
+    X.v[1] ^= X.v[0];
+    X.v[2] += X.v[3];
+    X.v[3] = RotL_32(X.v[3], R_32x4_0_1);
+    X.v[3] ^= X.v[2];
+  }
+
+  if (Nrounds > 65) {
+    X.v[0] += X.v[3];
+    X.v[3] = RotL_32(X.v[3], R_32x4_1_0);
+    X.v[3] ^= X.v[0];
+    X.v[2] += X.v[1];
+    X.v[1] = RotL_32(X.v[1], R_32x4_1_1);
+    X.v[1] ^= X.v[2];
+  }
+
+  if (Nrounds > 66) {
+    X.v[0] += X.v[1];
+    X.v[1] = RotL_32(X.v[1], R_32x4_2_0);
+    X.v[1] ^= X.v[0];
+    X.v[2] += X.v[3];
+    X.v[3] = RotL_32(X.v[3], R_32x4_2_1);
+    X.v[3] ^= X.v[2];
+  }
+
+  if (Nrounds > 67) {
+    X.v[0] += X.v[3];
+    X.v[3] = RotL_32(X.v[3], R_32x4_3_0);
+    X.v[3] ^= X.v[0];
+    X.v[2] += X.v[1];
+    X.v[1] = RotL_32(X.v[1], R_32x4_3_1);
+    X.v[1] ^= X.v[2];
+  }
+
+  if (Nrounds > 67) {
+    X.v[0] += ks[2];
+    X.v[1] += ks[3];
+    X.v[2] += ks[4];
+    X.v[3] += ks[0];
+    X.v[4 - 1] += 17;
+  }
+
+  if (Nrounds > 68) {
+    X.v[0] += X.v[1];
+    X.v[1] = RotL_32(X.v[1], R_32x4_4_0);
+    X.v[1] ^= X.v[0];
+    X.v[2] += X.v[3];
+    X.v[3] = RotL_32(X.v[3], R_32x4_4_1);
+    X.v[3] ^= X.v[2];
+  }
+
+  if (Nrounds > 69) {
+    X.v[0] += X.v[3];
+    X.v[3] = RotL_32(X.v[3], R_32x4_5_0);
+    X.v[3] ^= X.v[0];
+    X.v[2] += X.v[1];
+    X.v[1] = RotL_32(X.v[1], R_32x4_5_1);
+    X.v[1] ^= X.v[2];
+  }
+
+  if (Nrounds > 70) {
+    X.v[0] += X.v[1];
+    X.v[1] = RotL_32(X.v[1], R_32x4_6_0);
+    X.v[1] ^= X.v[0];
+    X.v[2] += X.v[3];
+    X.v[3] = RotL_32(X.v[3], R_32x4_6_1);
+    X.v[3] ^= X.v[2];
+  }
+
+  if (Nrounds > 71) {
+    X.v[0] += X.v[3];
+    X.v[3] = RotL_32(X.v[3], R_32x4_7_0);
+    X.v[3] ^= X.v[0];
+    X.v[2] += X.v[1];
+    X.v[1] = RotL_32(X.v[1], R_32x4_7_1);
+    X.v[1] ^= X.v[2];
+  }
+
+  if (Nrounds > 71) {
+    X.v[0] += ks[3];
+    X.v[1] += ks[4];
+    X.v[2] += ks[0];
+    X.v[3] += ks[1];
+    X.v[4 - 1] += 18;
+  }
+  return X;
+}
+//end: the open sourced random generator from DE Shaw Research
+
+// **************************
+// BERNOULLI DISTRIBUTION
+// **************************
+
+__kernel void PRNG_threefry4x32_bernoulli(
+	__global float4 *randomnumber,
+	threefry4x32_ctr_t ctr_i,
+	float inf, float sup,
+	float threshold,
+	uint nrounds, uint numrandom) {
+
+  size_t gdx = get_global_id(0);
+
+  uint maxUint = 0;
+  maxUint--;
+  float r = (float)maxUint;
+
+  threefry4x32_ctr_t ctr = ctr_i;
+  threefry4x32_ukey_t ukey;
+
+  ukey.v[0] = ukey.v[1] = ukey.v[2] = ukey.v[3] = gdx;
+
+  threefry4x32_ctr_t random4;
+
+  if ( gdx < numrandom ) {
+    random4 = threefry4x32_R(nrounds, ctr, ukey);
+    float4 frnd;
+    frnd.x = ( (((float)random4.v[0]) / r) * (sup - inf) + inf ) < threshold ? 1.0f : 0.0f;
+    frnd.y = ( (((float)random4.v[1]) / r) * (sup - inf) + inf ) < threshold ? 1.0f : 0.0f;
+    frnd.z = ( (((float)random4.v[2]) / r) * (sup - inf) + inf ) < threshold ? 1.0f : 0.0f;
+    frnd.w = ( (((float)random4.v[3]) / r) * (sup - inf) + inf ) < threshold ? 1.0f : 0.0f;
+    randomnumber[gdx] = frnd;
+  }
+}
+
+// **************************
+// UNIFORM DISTRIBUTION (float)
+// **************************
+
+__kernel void PRNG_threefry4x32_uniform(
+	__global float4 *randomnumber,
+	threefry4x32_ctr_t ctr_i,
+	float inf, float sup,
+	uint nrounds, uint numrandom) {
+
+  size_t gdx = get_global_id(0);
+
+  uint maxUint = 0;
+  maxUint--;
+  float r = (float)maxUint;
+
+  threefry4x32_ctr_t ctr = ctr_i;
+  threefry4x32_ukey_t ukey;
+
+  ukey.v[0] = ukey.v[1] = ukey.v[2] = ukey.v[3] = gdx;
+
+  threefry4x32_ctr_t random4;
+
+  if ( gdx < numrandom ) {
+    random4 = threefry4x32_R(nrounds, ctr, ukey);
+    float4 frnd;
+    frnd.x = ( (((float)random4.v[0]) / r) * (sup - inf) + inf );
+    frnd.y = ( (((float)random4.v[1]) / r) * (sup - inf) + inf );
+    frnd.z = ( (((float)random4.v[2]) / r) * (sup - inf) + inf );
+    frnd.w = ( (((float)random4.v[3]) / r) * (sup - inf) + inf );
+    randomnumber[gdx] = frnd;
+  }
+}
+
+// **************************
+// UNIFORM DISTRIBUTION (uint)
+// **************************
+
+__kernel void PRNG_threefry4x32_uint_uniform(
+	__global uint4 *randomnumber,
+	threefry4x32_ctr_t ctr_i,
+	uint inf, uint sup,
+	uint nrounds, uint numrandom) {
+
+  size_t gdx = get_global_id(0);
+
+  threefry4x32_ctr_t ctr = ctr_i;
+  threefry4x32_ukey_t ukey;
+
+  ukey.v[0] = ukey.v[1] = ukey.v[2] = ukey.v[3] = gdx;
+
+  threefry4x32_ctr_t random4;
+
+  if ( gdx < numrandom ) {
+    random4 = threefry4x32_R(nrounds, ctr, ukey);
+    uint4 frnd;
+    frnd.x = random4.v[0] % (sup - inf) + inf;
+    frnd.y = random4.v[1] % (sup - inf) + inf;
+    frnd.z = random4.v[2] % (sup - inf) + inf;
+    frnd.w = random4.v[3] % (sup - inf) + inf;
+    randomnumber[gdx] = frnd;
+  }
+}
+
+// **************************
+// GAUSSIAN DISTRIBUTION
+// **************************
+
+__kernel void PRNG_threefry4x32_gaussian(
+	__global float4 *randomnumber,
+	threefry4x32_ctr_t ctr_i,
+	float E, float V,
+	uint nrounds, uint numrandom) {
+
+  size_t gdx = get_global_id(0);
+
+  uint maxUint = 0;
+  maxUint--;
+  float r = (float)maxUint;
+
+  threefry4x32_ctr_t ctr = ctr_i;
+  threefry4x32_ukey_t ukey1, ukey2;
+
+  ukey1.v[0] = ukey2.v[1] = ukey1.v[2] = ukey2.v[3] = gdx;
+  ukey2.v[0] = ukey1.v[1] = ukey2.v[2] = ukey1.v[3] = 0;
+
+  threefry4x32_ctr_t random1, random2;
+
+  if ( gdx < numrandom ) {
+    random1 = threefry4x32_R(nrounds, ctr, ukey1);
+    random2 = threefry4x32_R(nrounds, ctr, ukey2);
+    float4 frnd1;
+
+    float r1 = (((float)random1.v[0]) / r); // generate a random sequence of uniform distribution
+    float r2 = (((float)random2.v[0]) / r);
+    float r3 = (((float)random1.v[1]) / r);
+    float r4 = (((float)random2.v[1]) / r);
+    float r5 = (((float)random1.v[2]) / r);
+    float r6 = (((float)random2.v[2]) / r);
+    float r7 = (((float)random1.v[3]) / r);
+    float r8 = (((float)random2.v[3]) / r);
+
+    if(r2 == 0 || r4 == 0 || r6 == 0 || r8 == 0) {
+      r2 += 0.0001;
+      r4 += 0.0001;
+      r6 += 0.0001;
+      r8 += 0.0001;
+    }
+
+    frnd1.x = cos(2*M_PI*r1)*sqrt(-2.0*log(r2)) * V + E;// return a pseudo sequence of normal distribution using two above uniform noise data
+    //frnd2.x = sin(2*M_PI*r1)*sqrt(-2.0*log(r2));      // return the quadrature counterpart of the foregoing pseudo normal distribution sequence
+    frnd1.y = cos(2*M_PI*r3)*sqrt(-2.0*log(r4)) * V + E;// return a pseudo sequence of normal distribution using two above uniform noise data
+    //frnd2.y = sin(2*M_PI*r3)*sqrt(-2.0*log(r4));      // return the quadrature counterpart of the foregoing pseudo normal distribution sequence
+    frnd1.z = cos(2*M_PI*r5)*sqrt(-2.0*log(r6)) * V + E;// return a pseudo sequence of normal distribution using two above uniform noise data
+    //frnd2.z = sin(2*M_PI*r5)*sqrt(-2.0*log(r6));      // return the quadrature counterpart of the foregoing pseudo normal distribution sequence
+    frnd1.w = cos(2*M_PI*r7)*sqrt(-2.0*log(r8)) * V + E;// return a pseudo sequence of normal distribution using two above uniform noise data
+    //frnd2.w = sin(2*M_PI*r7)*sqrt(-2.0*log(r8));      // return the quadrature counterpart of the foregoing pseudo normal distribution sequence
+
+    randomnumber[gdx] = frnd1;
+  }
+}
diff --git a/src/core/tensor/math_kernel.cu b/src/core/tensor/math_kernel.cu
new file mode 100644
index 0000000..d3f3335
--- /dev/null
+++ b/src/core/tensor/math_kernel.cu
@@ -0,0 +1,649 @@
+/************************************************************
+*
+* Licensed to the Apache Software Foundation (ASF) under one
+* or more contributor license agreements.  See the NOTICE file
+* distributed with this work for additional information
+* regarding copyright ownership.  The ASF licenses this file
+* to you under the Apache License, Version 2.0 (the
+* "License"); you may not use this file except in compliance
+* with the License.  You may obtain a copy of the License at
+*
+*   http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing,
+* software distributed under the License is distributed on an
+* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+* KIND, either express or implied.  See the License for the
+* specific language governing permissions and limitations
+* under the License.
+*
+*************************************************************/
+
+#include "singa/singa_config.h"
+#ifdef USE_CUDA
+#include <cmath>
+#include <algorithm>
+#include <cfloat>
+#include "./math_kernel.h"
+
+#define CU2DBLOCK_X 32
+#define CU2DBLOCK_Y 32
+
+#define CU1DBLOCK 1024
+#define CU1DBLOCKF 1024.0
+
+namespace singa {
+// Cuda Kernel Functions
+namespace cuda {
+/*
+wangwei: Not used due to error in the code.
+__global__ void KernelSum(const size_t n, const float *in, float *out) {
+  int THREADS = blockDim.x;
+
+  __shared__ float aux[CU1DBLOCK];
+  int steps = (n - 1) / THREADS + 1;
+  aux[threadIdx.x] = in[threadIdx.x];
+
+  for (int i = 1; i < steps; ++i) {
+    if (threadIdx.x + i * THREADS < n) {
+      aux[threadIdx.x] += in[threadIdx.x + i * THREADS];
+    }
+  }
+
+  int total_threads = THREADS;
+  __syncthreads();
+
+  while (total_threads > 1) {
+    int half_point = ((1 + total_threads) >> 1);
+    if (threadIdx.x < half_point) {
+      if (threadIdx.x + half_point < total_threads) {
+        aux[threadIdx.x] += aux[threadIdx.x + half_point];
+      }
+    }
+    __syncthreads();
+    total_threads = ((total_threads + 1) >> 1);
+  }
+
+  __syncthreads();
+  *out = aux[0];
+}
+*/
+
+__global__ void KernelAdd(const size_t n, const float *in1, const float *in2,
+                          float *out) {
+  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n;
+       i += blockDim.x * gridDim.x) {
+    out[i] = in1[i] + in2[i];
+  }
+}
+
+__global__ void KernelAdd(const size_t n, const float *in, const float x,
+                          float *out) {
+  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n;
+       i += blockDim.x * gridDim.x) {
+    out[i] = in[i] + x;
+  }
+}
+
+__global__ void KernelSub(const size_t n, const float *in1, const float *in2,
+                          float *out) {
+  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n;
+       i += blockDim.x * gridDim.x) {
+    out[i] = in1[i] - in2[i];
+  }
+}
+
+__global__ void KernelExp(const size_t n, const float *in, float *out) {
+  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n;
+       i += blockDim.x * gridDim.x) {
+    out[i] = std::exp(in[i]);
+  }
+}
+
+__global__ void KernelLog(const size_t n, const float *in, float *out) {
+  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n;
+       i += blockDim.x * gridDim.x) {
+    out[i] = std::log(in[i]);
+  }
+}
+
+__global__ void KernelSigmoid(const size_t n, const float *in, float *out) {
+  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n;
+       i += blockDim.x * gridDim.x) {
+    out[i] = 1.0f / (1.0f + expf(-in[i]));
+  }
+}
+__global__ void KernelSign(const size_t n, const float *in, float *out) {
+  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n;
+       i += blockDim.x * gridDim.x) {
+    if (in[i] > 0.0f)
+      out[i] = 1.0f;
+    else if (in[i] < 0.0f)
+      out[i] = -1.0f;
+    else
+      out[i] = 0.0f;
+  }
+}
+
+__global__ void KernelClamp(const size_t n, const float low, const float high,
+                            const float *in, float *out) {
+  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n;
+       i += blockDim.x * gridDim.x) {
+    if (in[i] > high)
+      out[i] = high;
+    else if (in[i] < low)
+      out[i] = low;
+    else
+      out[i] = in[i];
+  }
+}
+
+__global__ void KernelRelu(const size_t n, const float *in, float *out) {
+  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n;
+       i += blockDim.x * gridDim.x) {
+    out[i] = max(in[i], 0.0f);
+  }
+}
+
+__global__ void KernelAbs(const size_t n, const float *in, float *out) {
+  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n;
+       i += blockDim.x * gridDim.x) {
+    out[i] =  max(in[i], -in[i]);
+  }
+}
+
+__global__ void KernelTanh(const size_t n, const float *in, float *out) {
+  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n;
+       i += blockDim.x * gridDim.x) {
+    out[i] = tanhf(in[i]);
+  }
+}
+
+__global__ void KernelSoftplus(const size_t n, const float *in, float *out) {
+  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n;
+       i += blockDim.x * gridDim.x) {
+    out[i] = logf(1 + expf(in[i]));
+  }
+}
+__global__ void KernelSquare(const size_t n, const float *in, float *out) {
+  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n;
+       i += blockDim.x * gridDim.x) {
+    out[i] = in[i] * in[i];
+  }
+}
+__global__ void KernelSqrt(const size_t n, const float *in, float *out) {
+  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n;
+       i += blockDim.x * gridDim.x) {
+    out[i] = std::sqrt(in[i]);
+  }
+}
+
+__global__ void KernelPow(const size_t n, const float *in1, const float *in2,
+                          float *out) {
+  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n;
+       i += blockDim.x * gridDim.x) {
+    out[i] = std::pow(in1[i], in2[i]);
+  }
+}
+
+__global__ void KernelPow(const size_t n, const float *in, const float x,
+                          float *out) {
+  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n;
+       i += blockDim.x * gridDim.x) {
+    out[i] = std::pow(in[i], x);
+  }
+}
+
+__global__ void KernelMult(const size_t n, const float *in1, const float *in2,
+                           float *out) {
+  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n;
+       i += blockDim.x * gridDim.x) {
+    out[i] = in1[i] * in2[i];
+  }
+}
+
+__global__ void KernelMult(const size_t n, const float *in, const float x,
+                           float *out) {
+  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n;
+       i += blockDim.x * gridDim.x) {
+    out[i] = in[i] * x;
+  }
+}
+
+__global__ void KernelDiv(const size_t n, const float *in1, const float *in2,
+                          float *out) {
+  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n;
+       i += blockDim.x * gridDim.x) {
+    out[i] = in1[i] / in2[i];
+  }
+}
+__global__ void KernelDiv(const size_t n, const float x, const float *in,
+                          float *out) {
+  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n;
+       i += blockDim.x * gridDim.x) {
+    out[i] = x / in[i];
+  }
+}
+// Fill: out[i] = x for all i in [0, n).
+__global__ static void KernelSet(const size_t n, const float x, float *out) {
+  // size_t index to match the unsigned bound n (was a signed int).
+  for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < n;
+       i += blockDim.x * gridDim.x) {
+    out[i] = x;
+  }
+}
+
+// Indicator of in[i] < x: 1.0f below the threshold, 0.0f otherwise.
+__global__ void KernelThreshold(const size_t n, const float x, const float *in,
+                                float *out) {
+  for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < n;
+       i += blockDim.x * gridDim.x) {
+    out[i] = in[i] < x ? 1.0f : 0.0f;
+  }
+}
+
+// Scalar comparison: out[i] = 1.0f when in[i] >= x, else 0.0f.
+__global__ void KernelGE(const size_t num, const float *in, const float x,
+                         float *out) {
+  for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < num;
+       i += blockDim.x * gridDim.x) {
+    out[i] = (in[i] >= x) ? 1.0f : 0.0f;
+  }
+}
+
+// Element-wise comparison: out[i] = 1.0f when in1[i] >= in2[i], else 0.0f.
+__global__ void KernelBGE(const size_t num, const float *in1, const float *in2,
+                         float *out) {
+  for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < num;
+       i += blockDim.x * gridDim.x) {
+    out[i] = (in1[i] >= in2[i]) ? 1.0f : 0.0f;
+  }
+}
+// Scalar comparison: out[i] = 1.0f when in[i] > x, else 0.0f.
+__global__ void KernelGT(const size_t num, const float *in, const float x,
+                         float *out) {
+  for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < num;
+       i += blockDim.x * gridDim.x) {
+    out[i] = (in[i] > x) ? 1.0f : 0.0f;
+  }
+}
+// Element-wise comparison: out[i] = 1.0f when in1[i] > in2[i], else 0.0f.
+__global__ void KernelBGT(const size_t num, const float *in1, const float *in2,
+                         float *out) {
+  for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < num;
+       i += blockDim.x * gridDim.x) {
+    out[i] = (in1[i] > in2[i]) ? 1.0f : 0.0f;
+  }
+}
+// Scalar comparison: out[i] = 1.0f when in[i] <= x, else 0.0f.
+__global__ void KernelLE(const size_t num, const float *in, const float x,
+                         float *out) {
+  for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < num;
+       i += blockDim.x * gridDim.x) {
+    out[i] = (in[i] <= x) ? 1.0f : 0.0f;
+  }
+}
+// Element-wise comparison: out[i] = 1.0f when in1[i] <= in2[i], else 0.0f.
+__global__ void KernelBLE(const size_t num, const float *in1, const float *in2,
+                         float *out) {
+  for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < num;
+       i += blockDim.x * gridDim.x) {
+    out[i] = (in1[i] <= in2[i]) ? 1.0f : 0.0f;
+  }
+}
+// Scalar comparison: out[i] = 1.0f when in[i] < x, else 0.0f.
+__global__ void KernelLT(const size_t num, const float *in, const float x,
+                         float *out) {
+  for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < num;
+       i += blockDim.x * gridDim.x) {
+    out[i] = (in[i] < x) ? 1.0f : 0.0f;
+  }
+}
+// Element-wise comparison: out[i] = 1.0f when in1[i] < in2[i], else 0.0f.
+__global__ void KernelBLT(const size_t num, const float *in1, const float *in2,
+                         float *out) {
+  for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < num;
+       i += blockDim.x * gridDim.x) {
+    out[i] = (in1[i] < in2[i]) ? 1.0f : 0.0f;
+  }
+}
+// Row-wise maximum of an nrow x ncol row-major matrix; one thread per row.
+__global__ void KernelRowMax(const size_t nrow, const size_t ncol, const float *inPtr,
+    float *outPtr) {
+  for (size_t idx = blockIdx.x * blockDim.x + threadIdx.x; idx < nrow;
+       idx += blockDim.x * gridDim.x) {
+    // size_t, not int: idx * ncol overflows int for large matrices.
+    size_t offset = idx * ncol;
+    float maxval = inPtr[offset];
+    for (size_t k = 1; k < ncol; k++) {
+      maxval = max(maxval, inPtr[offset + k]);
+    }
+    outPtr[idx] = maxval;
+  }
+}
+// Per-sample negative log-likelihood: loss[s] = -log(p[s][t[s]]),
+// with the probability clamped at FLT_MIN to avoid log(0).
+__global__ void KernelComputeCrossEntropy(const size_t batchsize,
+                                          const size_t dim, const float *p,
+                                          const int *t, float *loss) {
+  size_t sample = blockIdx.x * blockDim.x + threadIdx.x;
+  size_t num_threads = blockDim.x * gridDim.x;
+  for (; sample < batchsize; sample += num_threads) {
+    float prob_of_truth = p[sample * dim + t[sample]];
+    // logf: single-precision CUDA device function (was std::log).
+    loss[sample] = -logf(max(prob_of_truth, FLT_MIN));
+  }
+}
+
+// Gradient of softmax followed by cross-entropy: subtract 1 at the target
+// class; all other entries of grad are assumed prefilled with p.
+__global__ void KernelSoftmaxCrossEntropyBwd(const size_t batchsize,
+                                             const size_t dim, const float *p,
+                                             const int *t, float *grad) {
+  size_t sample = blockIdx.x * blockDim.x + threadIdx.x;
+  size_t num_threads = blockDim.x * gridDim.x;
+  for (; sample < batchsize; sample += num_threads) {
+    size_t pos = sample * dim + t[sample];
+    grad[pos] = p[pos] - 1.0f;  // TODO(wangwei) Consider p and grad are diff
+  }
+}
+
+
+
+// ********************************
+// Functions call kernels
+// ********************************
+// Host-side wrappers.  Each takes the caller's CUDA stream 's' and must pass
+// it in the launch configuration; the original launches omitted it, so all
+// kernels ran on the default stream and serialized against other streams.
+
+// out[i] = v for i in [0, n).
+void set(const size_t n, const float v, float *out, cudaStream_t s) {
+  KernelSet <<<ceil(n / CU1DBLOCKF), CU1DBLOCKF, 0, s>>> (n, v, out);
+}
+
+// out[i] = |in[i]|.
+void abs(const size_t n, const float *in, float *out, cudaStream_t s) {
+  KernelAbs <<<ceil(n / CU1DBLOCKF), CU1DBLOCKF, 0, s>>> (n, in, out);
+}
+
+// out[i] = sign(in[i]).
+void sign(const size_t n, const float *in, float *out, cudaStream_t s) {
+  KernelSign <<<ceil(n / CU1DBLOCKF), CU1DBLOCKF, 0, s>>> (n, in, out);
+}
+
+// out[i] = exp(in[i]).
+void exp(const size_t n, const float *in, float *out, cudaStream_t s) {
+  KernelExp <<<ceil(n / CU1DBLOCKF), CU1DBLOCKF, 0, s>>> (n, in, out);
+}
+
+// out[i] = log(in[i]).
+void log(const size_t n, const float *in, float *out, cudaStream_t s) {
+  KernelLog <<<ceil(n / CU1DBLOCKF), CU1DBLOCKF, 0, s>>> (n, in, out);
+}
+
+// out[i] = sqrt(in[i]).
+void sqrt(const size_t n, const float *in, float *out, cudaStream_t s) {
+  KernelSqrt <<<ceil(n / CU1DBLOCKF), CU1DBLOCKF, 0, s>>> (n, in, out);
+}
+
+// out[i] = in[i]^2.
+void square(const size_t n, const float *in, float *out, cudaStream_t s) {
+  KernelSquare <<<ceil(n / CU1DBLOCKF), CU1DBLOCKF, 0, s>>> (n, in, out);
+}
+
+// out[i] = tanh(in[i]).
+void tanh(const size_t n, const float *in, float *out, cudaStream_t s) {
+  KernelTanh <<<ceil(n / CU1DBLOCKF), CU1DBLOCKF, 0, s>>> (n, in, out);
+}
+
+// out[i] = max(in[i], 0).
+void relu(const size_t n, const float *in, float *out, cudaStream_t s) {
+  KernelRelu <<<ceil(n / CU1DBLOCKF), CU1DBLOCKF, 0, s>>> (n, in, out);
+}
+// out[i] = sigmoid(in[i]).
+void sigmoid(const size_t n, const float *in, float *out, cudaStream_t s) {
+  KernelSigmoid <<<ceil(n / CU1DBLOCKF), CU1DBLOCKF, 0, s>>> (n, in, out);
+}
+// out[i] = softplus(in[i]).
+void softplus(const size_t n, const float *in, float *out, cudaStream_t s) {
+  KernelSoftplus <<<ceil(n / CU1DBLOCKF), CU1DBLOCKF, 0, s>>> (n, in, out);
+}
+// out[i] = clamp(in[i]) into [low, high].
+void clamp(const size_t n, const float low, const float high, const float *in,
+           float *out, cudaStream_t s) {
+  KernelClamp <<<ceil(n / CU1DBLOCKF), CU1DBLOCKF, 0, s>>> (n, low, high, in, out);
+}
+
+// out[i] = in[i]^x, launched on stream s (previously launched on the
+// default stream, ignoring s).
+void pow(const size_t n, const float *in, const float x, float *out,
+         cudaStream_t s) {
+  KernelPow <<<ceil(n / CU1DBLOCKF), CU1DBLOCKF, 0, s>>> (n, in, x, out);
+}
+
+// out[i] = in[i] + x.
+void add(const size_t n, const float *in, const float x, float *out,
+         cudaStream_t s) {
+  KernelAdd <<<ceil(n / CU1DBLOCKF), CU1DBLOCKF, 0, s>>> (n, in, x, out);
+}
+
+// out[i] = in[i] * x.
+void mult(const size_t n, const float *in, const float x, float *out,
+          cudaStream_t s) {
+  KernelMult <<<ceil(n / CU1DBLOCKF), CU1DBLOCKF, 0, s>>> (n, in, x, out);
+}
+
+// out[i] = x / in[i].
+void div(const size_t n, const float x, const float *in, float *out,
+          cudaStream_t s) {
+  KernelDiv <<<ceil(n / CU1DBLOCKF), CU1DBLOCKF, 0, s>>> (n, x, in, out);
+}
+
+// out[i] = 1 when in[i] < x, else 0.
+void threshold(const size_t n, const float x, const float *in, float *out,
+               cudaStream_t s) {
+  KernelThreshold <<<ceil(n / CU1DBLOCKF), CU1DBLOCKF, 0, s>>> (n, x, in, out);
+}
+
+// Comparison wrappers; each launches on stream s (previously the stream
+// argument was ignored and the default stream was used).
+// out[i] = (in[i] > x).
+void gt(const size_t num, const float *in, const float x, float *out,
+        cudaStream_t s) {
+  KernelGT <<<ceil(num / CU1DBLOCKF), CU1DBLOCKF, 0, s>>> (num, in, x, out);
+}
+// out[i] = (in1[i] > in2[i]).
+void gt(const size_t num, const float *in1, const float *in2, float *out,
+        cudaStream_t s) {
+  KernelBGT <<<ceil(num / CU1DBLOCKF), CU1DBLOCKF, 0, s>>> (num, in1, in2, out);
+}
+// out[i] = (in[i] >= x).
+void ge(const size_t num, const float *in, const float x, float *out,
+        cudaStream_t s) {
+  KernelGE <<<ceil(num / CU1DBLOCKF), CU1DBLOCKF, 0, s>>> (num, in, x, out);
+}
+// out[i] = (in1[i] >= in2[i]).
+void ge(const size_t num, const float *in1, const float *in2, float *out,
+        cudaStream_t s) {
+  KernelBGE <<<ceil(num / CU1DBLOCKF), CU1DBLOCKF, 0, s>>> (num, in1, in2, out);
+}
+// out[i] = (in[i] < x).
+void lt(const size_t num, const float *in, const float x, float *out,
+        cudaStream_t s) {
+  KernelLT <<<ceil(num / CU1DBLOCKF), CU1DBLOCKF, 0, s>>> (num, in, x, out);
+}
+// out[i] = (in1[i] < in2[i]).
+void lt(const size_t num, const float *in1, const float *in2, float *out,
+        cudaStream_t s) {
+  KernelBLT <<<ceil(num / CU1DBLOCKF), CU1DBLOCKF, 0, s>>> (num, in1, in2, out);
+}
+// out[i] = (in[i] <= x).
+void le(const size_t num, const float *in, const float x, float *out,
+        cudaStream_t s) {
+  KernelLE <<<ceil(num / CU1DBLOCKF), CU1DBLOCKF, 0, s>>> (num, in, x, out);
+}
+// out[i] = (in1[i] <= in2[i]).
+void le(const size_t num, const float *in1, const float *in2, float *out,
+        cudaStream_t s) {
+  KernelBLE <<<ceil(num / CU1DBLOCKF), CU1DBLOCKF, 0, s>>> (num, in1, in2, out);
+}
+// out[i] = in1[i]^in2[i], launched on stream s (previously the stream
+// argument was ignored and the default stream was used).
+void pow(const size_t n, const float *in1, const float *in2, float *out,
+         cudaStream_t s) {
+  KernelPow <<<ceil(n / CU1DBLOCKF), CU1DBLOCKF, 0, s>>> (n, in1, in2, out);
+}
+
+// out[i] = in1[i] + in2[i].
+void add(const size_t n, const float *in1, const float *in2, float *out,
+         cudaStream_t s) {
+  KernelAdd <<<ceil(n / CU1DBLOCKF), CU1DBLOCKF, 0, s>>> (n, in1, in2, out);
+}
+
+// out[i] = in1[i] - in2[i].
+void sub(const size_t n, const float *in1, const float *in2, float *out,
+         cudaStream_t s) {
+  KernelSub <<<ceil(n / CU1DBLOCKF), CU1DBLOCKF, 0, s>>> (n, in1, in2, out);
+}
+
+// out[i] = in1[i] * in2[i].
+void mult(const size_t n, const float *in1, const float *in2, float *out,
+          cudaStream_t s) {
+  KernelMult <<<ceil(n / CU1DBLOCKF), CU1DBLOCKF, 0, s>>> (n, in1, in2, out);
+}
+
+// out[i] = in1[i] / in2[i].
+void div(const size_t n, const float *in1, const float *in2, float *out,
+         cudaStream_t s) {
+  KernelDiv <<<ceil(n / CU1DBLOCKF), CU1DBLOCKF, 0, s>>> (n, in1, in2, out);
+}
+
+/*
+void sum(const size_t n, const float *in, float *out, cudaStream_t s) {
+  int threads_per_block = n > CU1DBLOCK ? CU1DBLOCK : n;
+  //  here, we only need one block
+  int num_blocks = 1;
+  KernelSum <<<num_blocks, threads_per_block>>> (n, in, out);
+}
+*/
+
+// Per-sample cross-entropy loss; launched on 'stream' (previously the
+// stream argument was ignored and the default stream was used).
+void ComputeCrossEntropy(size_t batchsize, const size_t dim, const float *p,
+                         const int *t, float *loss, cudaStream_t stream) {
+  KernelComputeCrossEntropy <<<ceil(batchsize / CU1DBLOCKF), CU1DBLOCKF, 0,
+                               stream>>> (batchsize, dim, p, t, loss);
+}
+
+// Softmax + cross-entropy backward pass: grad[s][t[s]] -= 1.
+void SoftmaxCrossEntropyBwd(size_t batchsize, const size_t dim, const float *p,
+                            const int *t, float *grad, cudaStream_t stream) {
+  KernelSoftmaxCrossEntropyBwd <<<ceil(batchsize / CU1DBLOCKF), CU1DBLOCKF, 0,
+                                  stream>>> (batchsize, dim, p, t, grad);
+}
+
+// Row-wise maximum of an nrow x ncol row-major matrix.
+void RowMax(const size_t nrow, const size_t ncol, const float *inPtr,
+    float *outPtr, cudaStream_t stream) {
+  KernelRowMax <<<ceil(nrow / CU1DBLOCKF), CU1DBLOCKF, 0, stream>>>(nrow, ncol, inPtr, outPtr);
+}
+
+/*
+void square_grad(int n, const float *in, float *out, cudaStream_t s) {
+  kernel_square_grad <<<ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (in, out, n);
+}
+
+void tanh_grad(int n, const float *in, float *out, cudaStream_t s) {
+  kernel_tanh_grad <<<ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (in, out, n);
+}
+
+
+void relu_grad(int n, const float *in, float *out, cudaStream_t s) {
+  kernel_relu_grad <<<ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (in, out, n);
+}
+
+
+void sigmoid_grad(int n, const float *in, float *out, cudaStream_t s) {
+  kernel_sigmoid_grad <<<ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (in, out, n);
+}
+
+void softplus_grad(int n, const float *in, float *out, cudaStream_t s) {
+  kernel_softplus_grad <<<ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (in, out, n);
+}
+
+
+__global__ void kernel_sum_col(const float *src_mat_data, float *dst_vec_data,
+                               int rows, int cols, int stride) {
+  int index = blockIdx.x * blockDim.x + threadIdx.x;
+  int num_threads = blockDim.x * gridDim.x;
+  for (; index < rows; index += num_threads) {
+    dst_vec_data[index] = 0.0f;
+    for (int k = 0; k < cols; k++) {
+      dst_vec_data[index] += src_mat_data[index * stride + k];
+    }
+  }
+}
+
+__global__ void kernel_sum_row(const float *src_mat_data, float *dst_vec_data,
+                               int rows, int cols, int stride) {
+  int j = blockIdx.x;
+  int THREADS = blockDim.x;
+  if (j >= cols) {
+    return;
+  }
+
+  __shared__ float aux[CU1DBLOCK];
+  int steps = (rows - 1) / THREADS + 1;
+  aux[threadIdx.x] = src_mat_data[j + threadIdx.x * stride];
+  for (int i = 1; i < steps; ++i) {
+    if (threadIdx.x + i * THREADS < rows) {
+      aux[threadIdx.x] +=
+          src_mat_data[j + (threadIdx.x + i * THREADS) * stride];
+    }
+  }
+
+  int total_threads = THREADS;
+  __syncthreads();
+  while (total_threads > 1) {
+    int half_point = ((1 + total_threads) >> 1);
+    if (threadIdx.x < half_point) {
+      if (threadIdx.x + half_point < total_threads) {
+        aux[threadIdx.x] += aux[threadIdx.x + half_point];
+      }
+    }
+    __syncthreads();
+    total_threads = ((total_threads + 1) >> 1);
+  }
+
+  __syncthreads();
+  dst_vec_data[j] = aux[0];
+}
+
+
+__global__ void kernel_add_vec_row(const float *src_vec_data,
+                                   const float *src_mat_data,
+                                   float *des_mat_data, int rows, int cols,
+                                   int stride) {
+  int i = blockIdx.x * blockDim.x + threadIdx.x;
+  int j = blockIdx.y * blockDim.y + threadIdx.y;
+  int num_threads_x = blockDim.x * gridDim.x;
+  int num_threads_y = blockDim.y * gridDim.y;
+  int index = 0;
+  for (; i < cols && j < rows; i += num_threads_x, j += num_threads_y) {
+    index = j * stride + i;
+    des_mat_data[index] = src_mat_data[index] + src_vec_data[i];
+  }
+}
+
+__global__ void kernel_sigmoid_grad(const float *src_data, float *des_data,
+                                    int n) {
+  int index = blockIdx.x * blockDim.x + threadIdx.x;
+  int num_threads = blockDim.x * gridDim.x;
+  for (; index < n; index += num_threads) {
+    des_data[index] = src_data[index] * (1.0f - src_data[index]);
+  }
+}
+
+
+__global__ void kernel_relu_grad(const float *src_data, float *des_data,
+                                 int n) {
+  int index = blockIdx.x * blockDim.x + threadIdx.x;
+  int num_threads = blockDim.x * gridDim.x;
+  for (; index < n; index += num_threads) {
+    des_data[index] = src_data[index] > 0.0f ? 1.0f : 0.0f;
+  }
+}
+
+__global__ void kernel_tanh_grad(const float *src_data, float *des_data,
+                                 int n) {
+  int index = blockIdx.x * blockDim.x + threadIdx.x;
+  int num_threads = blockDim.x * gridDim.x;
+  for (; index < n; index += num_threads) {
+    des_data[index] = (1.0f - src_data[index] * src_data[index]);
+  }
+}
+
+
+__global__ void kernel_softplus_grad(const float *src_data, float *des_data,
+                                     int n) {
+  int index = blockIdx.x * blockDim.x + threadIdx.x;
+  int num_threads = blockDim.x * gridDim.x;
+  for (; index < n; index += num_threads) {
+    des_data[index] = 1.0f / (1.0f + expf(-src_data[index]));
+  }
+}
+__global__ void KernelSquareGrad(const float *src_data, float *des_data,
+                                   int n) {
+  int index = blockIdx.x * blockDim.x + threadIdx.x;
+  int num_threads = blockDim.x * gridDim.x;
+  for (; index < n; index += num_threads) {
+    des_data[index] = 2 * src_data[index];
+  }
+}
+__global__ void kernel_softmax_loss(const float *prob, const size_t *label,
+                                    float *loss, int n, int dim) {
+  int index = blockIdx.x * blockDim.x + threadIdx.x;
+  int num_threads = blockDim.x * gridDim.x;
+  for (; index < n; index += num_threads) {
+    float prob_of_truth = prob[index * dim + label[index]];
+    loss[index] -= std::log(max(prob_of_truth, FLT_MIN));
+  }
+}
+__global__ void kernel_softmax_gradient(float *grad, const size_t *label, int n,
+                                        int dim, float scale) {
+  int index = blockIdx.x * blockDim.x + threadIdx.x;
+  int num_threads = blockDim.x * gridDim.x;
+  for (; index < n; index += num_threads) {
+    int pos = index * dim + label[index];
+    grad[pos] = (grad[pos] - 1.0f) * scale;
+  }
+}
+*/
+
+
+}  // namespace cuda
+}  // namespace singa
+
+#endif  // USE_CUDA
diff --git a/src/core/tensor/math_kernel.h b/src/core/tensor/math_kernel.h
new file mode 100644
index 0000000..cb0cb6a
--- /dev/null
+++ b/src/core/tensor/math_kernel.h
@@ -0,0 +1,120 @@
+/************************************************************
+*
+* Licensed to the Apache Software Foundation (ASF) under one
+* or more contributor license agreements.  See the NOTICE file
+* distributed with this work for additional information
+* regarding copyright ownership.  The ASF licenses this file
+* to you under the Apache License, Version 2.0 (the
+* "License"); you may not use this file except in compliance
+* with the License.  You may obtain a copy of the License at
+*
+*   http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing,
+* software distributed under the License is distributed on an
+* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+* KIND, either express or implied.  See the License for the
+* specific language governing permissions and limitations
+* under the License.
+*
+*************************************************************/
+#ifndef SRC_CORE_TENSOR__MATH_KERNEL_H_
+#define SRC_CORE_TENSOR__MATH_KERNEL_H_
+
+
+#include "singa/singa_config.h"
+#ifdef USE_CUDA
+
+/// TODO(wangwei) Clean the function APIs as commented in tensor_math.h
+///  Add 'Context *ctx' as an argument of all cuda functions.
+namespace singa {
+
+// TODO(wangwei) make all function templates.
+namespace cuda {
+
+// 0 input
+// Fill out[0..n-1] with the constant v.
+void set(const size_t n, const float v, float *out, cudaStream_t s);
+
+// 1 input
+// Element-wise unary ops: out[i] = f(in[i]).
+void abs(const size_t n, const float *in, float *out, cudaStream_t s);
+void sign(const size_t n, const float *in, float *out, cudaStream_t s);
+void exp(const size_t n, const float *in, float *out, cudaStream_t s);
+void log(const size_t n, const float *in, float *out, cudaStream_t s);
+void sqrt(const size_t n, const float *in, float *out, cudaStream_t s);
+void square(const size_t n, const float *in, float *out, cudaStream_t s);
+void tanh(const size_t n, const float *in, float *out, cudaStream_t s);
+void relu(const size_t n, const float *in, float *out, cudaStream_t s);
+void sigmoid(const size_t n, const float *in, float *out, cudaStream_t s);
+void softplus(const size_t n, const float *in, float *out, cudaStream_t s);
+void clamp(const size_t n, const float low, const float high, const float *in,
+           float *out, cudaStream_t s);
+
+void pow(const size_t n, const float *in, const float x, float *out,
+         cudaStream_t s);
+
+void add(const size_t n, const float *in, const float x, float *out,
+         cudaStream_t s);
+
+void mult(const size_t n, const float *in, const float x, float *out,
+          cudaStream_t s);
+
+void div(const size_t n, const float x, const float *in, float *out,
+         cudaStream_t s);
+
+void threshold(const size_t n, const float x, const float *in, float *out,
+               cudaStream_t s);
+
+// Comparisons write 1.0f/0.0f masks; each has a tensor-vs-scalar and a
+// tensor-vs-tensor overload.
+void gt(const size_t num, const float *in, const float x, float *out,
+        cudaStream_t s);
+void gt(const size_t num, const float *in1, const float *in2, float *out,
+        cudaStream_t s);
+
+void ge(const size_t num, const float *in, const float x, float *out,
+        cudaStream_t s);
+void ge(const size_t num, const float *in1, const float *in2, float *out,
+        cudaStream_t s);
+
+
+void lt(const size_t num, const float *in, const float x, float *out,
+        cudaStream_t s);
+void lt(const size_t num, const float *in1, const float *in2, float *out,
+        cudaStream_t s);
+
+void le(const size_t num, const float *in, const float x, float *out,
+        cudaStream_t s);
+void le(const size_t num, const float *in1, const float *in2, float *out,
+        cudaStream_t s);
+
+// 2 inputs
+// Element-wise binary ops: out[i] = f(in1[i], in2[i]).
+void pow(const size_t n, const float *in1, const float *in2, float *out,
+         cudaStream_t s);
+
+void add(const size_t n, const float *in1, const float *in2, float *out,
+         cudaStream_t s);
+
+void sub(const size_t n, const float *in1, const float *in2, float *out,
+         cudaStream_t s);
+
+void mult(const size_t n, const float *in1, const float *in2, float *out,
+          cudaStream_t s);
+
+void div(const size_t n, const float *in1, const float *in2, float *out,
+         cudaStream_t s);
+
+// void sum(const size_t n, const float *in, float *out, cudaStream_t s);
+
+// Loss helpers: p is (batchsize x dim) probabilities, t the target indices.
+void ComputeCrossEntropy(const size_t batchsize, const size_t dim,
+                         const float *p, const int *t, float *loss,
+                         cudaStream_t stream);
+void SoftmaxCrossEntropyBwd(const size_t batchsize, const size_t dim,
+                            const float *p, const int *t, float *grad,
+                            cudaStream_t stream);
+
+// Row-wise maximum of an nrow x ncol row-major matrix.
+void RowMax(const size_t nrow, const size_t ncol, const float *inPtr,
+    float *outPtr, cudaStream_t stream);
+}  // namespace cuda
+
+}  // namespace singa
+
+#endif  // USE_CUDA
+#endif  // SRC_CORE_TENSOR__MATH_KERNEL_H_
diff --git a/src/core/tensor/sparse_tensor.cc b/src/core/tensor/sparse_tensor.cc
new file mode 100644
index 0000000..a8ae973
--- /dev/null
+++ b/src/core/tensor/sparse_tensor.cc
@@ -0,0 +1,19 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "singa/core/tensor.h"
+// Placeholder translation unit for sparse tensor support; nothing is
+// implemented yet.
+namespace singa {}
diff --git a/src/core/tensor/tensor.cc b/src/core/tensor/tensor.cc
new file mode 100644
index 0000000..670b27e
--- /dev/null
+++ b/src/core/tensor/tensor.cc
@@ -0,0 +1,1023 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "singa/core/tensor.h"
+#include "./tensor_math.h"
+#include "./tensor_math_cpp.h"
+#include "./tensor_math_cuda.h"
+#include "./tensor_math_opencl.h"
+#include <utility>
+
+namespace singa {
+
+// Release this tensor's share of the underlying block; the block is freed
+// only when the last referring tensor drops it (refcount reaches zero).
+Tensor::~Tensor() {
+  if (block_ != nullptr && block_->DecRefCount() == 0)
+    device_->FreeBlock(block_);
+  block_ = nullptr;
+}
+
+// Default tensor: empty shape, no block; bound to the default (host) device.
+Tensor::Tensor() { device_ = defaultDevice; }
+
+// Construct on the default device; allocates Product(shape) * SizeOf(dtype)
+// bytes unless that size is zero.
+Tensor::Tensor(const Shape &shape, DataType dtype)
+    : data_type_(dtype), device_(defaultDevice), shape_(shape) {
+  size_t size = Product(shape_) * SizeOf(data_type_);
+  if (size)
+    block_ = device_->NewBlock(size);
+}
+// Rvalue-shape overload: actually move the shape (the original copied it,
+// defeating the purpose of the && overload).
+Tensor::Tensor(Shape &&shape, DataType dtype)
+    : data_type_(dtype), device_(defaultDevice), shape_(std::move(shape)) {
+  size_t size = Product(shape_) * SizeOf(data_type_);
+  if (size)
+    block_ = device_->NewBlock(size);
+}
+// Construct on an explicit device.
+Tensor::Tensor(const Shape &shape, std::shared_ptr<Device> device,
+               DataType dtype)
+    : data_type_(dtype), device_(device), shape_(shape) {
+  size_t size = Product(shape_) * SizeOf(data_type_);
+  if (size)
+    block_ = device_->NewBlock(size);
+}
+// Rvalue-shape overload on an explicit device; moves the shape.
+Tensor::Tensor(Shape &&shape, std::shared_ptr<Device> device, DataType dtype)
+    : data_type_(dtype), device_(device), shape_(std::move(shape)) {
+  size_t size = Product(shape_) * SizeOf(data_type_);
+  if (size)
+    block_ = device_->NewBlock(size);
+}
+// Copy constructor: shallow copy — shares in's block and bumps its refcount.
+Tensor::Tensor(const Tensor &in)
+    : transpose_(in.transpose_),
+      data_type_(in.data_type_),
+      device_(in.device_),
+      block_(in.block()),
+      shape_(in.shape_) {
+  if (block_ != nullptr)
+    block_->IncRefCount();
+}
+
+// Move constructor: steals in's block; 'in' is left blockless.
+Tensor::Tensor(Tensor &&in)
+    : transpose_(in.transpose_),
+      data_type_(in.data_type_),
+      device_(in.device_),
+      shape_(std::move(in.shape_)) {
+  block_ = in.block_;
+  in.block_ = nullptr;
+}
+
+// Replace the underlying block directly.  Avoid: bypasses normal ownership
+// tracking of the block refcount.
+void Tensor::SetBlock(Block *block) {
+  LOG(WARNING) << "Pls avoid using this function, which may have side-effect.";
+  // Free the old block only when this was its last reference.  The original
+  // condition was inverted (freed while DecRefCount() was non-zero, i.e.
+  // while other tensors still referenced the block, and leaked it at zero);
+  // every other release site in this file uses DecRefCount() == 0.
+  if (block_ != nullptr && block_->DecRefCount() == 0)
+    device_->FreeBlock(block_);
+  block_ = block;
+}
+
+// Make this tensor match 'in' in device, dtype and shape, reallocating the
+// block only when the existing one cannot be reused.
+void Tensor::ResetLike(const Tensor &in) {
+  if (block_ == nullptr || device_ != in.device_ || MemSize() != in.MemSize()) {
+    if (block_ != nullptr && block_->DecRefCount() == 0)
+      device_->FreeBlock(block_);
+    device_ = in.device_;
+    block_ = device_->NewBlock(in.MemSize());
+  }
+  // dtype must track 'in' even when the block is reused: two types of equal
+  // width (e.g. float vs int) give equal MemSize, so the original left a
+  // stale data_type_ in that case.
+  data_type_ = in.data_type_;
+  shape_ = in.shape_;
+}
+
+// Change the shape.  If the element count differs, a fresh uninitialized
+// block is allocated and the old one released — existing data is discarded.
+void Tensor::Reshape(const Shape &shape) {
+  if (Product(shape_) != Product(shape)) {
+    if (block_ != nullptr && block_->DecRefCount() == 0)
+      device_->FreeBlock(block_);
+    block_ = device_->NewBlock(Product(shape) * SizeOf(data_type_));
+  }
+  shape_ = shape;
+}
+
+// Rvalue-shape overload of Reshape; same reallocation behavior.
+void Tensor::Reshape(Shape &&shape) {
+  if (Product(shape_) != Product(shape)) {
+    if (block_ != nullptr && block_->DecRefCount() == 0)
+      device_->FreeBlock(block_);
+    block_ = device_->NewBlock(Product(shape) * SizeOf(data_type_));
+  }
+  shape_ = std::move(shape);
+}
+
+// Change the element type.  NOTE(review): this allocates a fresh,
+// uninitialized block — existing values are dropped, not converted.
+void Tensor::AsType(const DataType type) {
+  if (data_type_ != type) {
+    if (block_ != nullptr && block_->DecRefCount() == 0)
+      device_->FreeBlock(block_);
+    block_ = device_->NewBlock(Product(shape_) * SizeOf(type));
+    data_type_ = type;
+  }
+}
+
+// Move the data to device 'dst': build a temporary tensor on dst, copy the
+// data across only if it is initialized, release the old block, then adopt
+// the temporary's block.
+void Tensor::ToDevice(std::shared_ptr<Device> dst) {
+  // TODO(wangwei) the comparison is very strict. May compare against device ID?
+  if (device_ != dst) {
+    Tensor tmp(shape_, dst, data_type_);
+    if (block_ != nullptr && Size() && block_->initialized())
+      tmp.CopyData(*this);
+    if (block_ != nullptr && block_->DecRefCount() == 0)
+      device_->FreeBlock(block_);
+    block_ = tmp.block_;
+    tmp.block_ = nullptr;
+    device_ = dst;
+  }
+}
+
+// Move the data to the host device of the current device (no-op when the
+// tensor is already on the default host device).
+void Tensor::ToHost() {
+  if (device_ != defaultDevice) ToDevice(device_->host());
+}
+
+// Copy 'num' elements from host memory 'src' into this tensor's block,
+// starting at element offset 'offset'.  The element width of DType must
+// match this tensor's data_type_; a null src only logs a warning.
+template <typename DType>
+void Tensor::CopyDataFromHostPtr(const DType *src, const size_t num,
+                                 const size_t offset) {
+  CHECK_EQ(sizeof(DType), SizeOf(data_type_))
+      << "data_type is " << DataType_Name(data_type_)
+      << " user given type is of size " << sizeof(DType);
+  if (src != nullptr) {
+    device_->CopyDataFromHostPtr(block(), src, sizeof(DType) * num,
+                                 sizeof(DType) * offset);
+  } else {
+    LOG(WARNING) << "Copy data from null host ptr";
+  }
+}
+// Explicit instantiations for the element types the library supports.
+template void Tensor::CopyDataFromHostPtr(const unsigned char *src,
+                                          const size_t num,
+                                          const size_t offset);
+template void Tensor::CopyDataFromHostPtr(const float *src, const size_t num,
+                                          const size_t offset);
+template void Tensor::CopyDataFromHostPtr(const int *src, const size_t num,
+                                          const size_t offset);
+
+// Copy src's contents into this tensor; element counts must match and this
+// tensor must already own a block.  Skips the copy when src has no block.
+void Tensor::CopyData(const Tensor &src) {
+  CHECK_EQ(Size(), src.Size());
+  CHECK(block_ != nullptr);
+  // Do copy only if the src's block is already initialized.
+  if (src.block_ != nullptr) {
+    singa::CopyDataToFrom(this, src, Size(), 0, 0);
+  }
+}
+
+// Deserialize from a TensorProto: release the old block, rebuild shape and
+// dtype, then stage the typed repeated field in a host buffer and copy it in
+// through CopyDataFromHostPtr.
+void Tensor::FromProto(const singa::TensorProto &proto) {
+  if (block_ != nullptr && block_->DecRefCount() == 0)
+    device_->FreeBlock(block_);
+  block_ = nullptr;
+  Shape shape;
+  for (uint32_t s : proto.shape()) shape.push_back(s);
+  data_type_ = proto.data_type();
+  // Reshape allocates a new block sized for the (new) dtype and shape.
+  Reshape(shape);
+  transpose_ = proto.transpose();
+  switch (data_type_) {
+    case kFloat32: {
+      std::unique_ptr<float[]> data_ptr(new float[Product(shape_)]);
+      for (size_t i = 0; i < Product(shape_); ++i)
+        data_ptr[i] = static_cast<float>(proto.float_data(i));
+      CopyDataFromHostPtr<float>(data_ptr.get(), Product(shape_));
+      break;
+    }
+    case kDouble: {
+      std::unique_ptr<double[]> data(new double[Product(shape_)]);
+      for (size_t i = 0; i < Product(shape_); ++i)
+        data[i] = proto.double_data(i);
+      CopyDataFromHostPtr<double>(data.get(), Product(shape_));
+      break;
+    }
+    case kInt: {
+      std::unique_ptr<int[]> data(new int[Product(shape_)]);
+      for (size_t i = 0; i < Product(shape_); ++i) data[i] = proto.int_data(i);
+      CopyDataFromHostPtr<int>(data.get(), Product(shape_));
+      break;
+    }
+    ///TODO(wangji): Implement to support C++ type char using bytes type in protobuf
+    /// which is equivalent to string type is different from the other cases. The kchar
+    /// and kUChar case is to be implemented.
+    /*
+    case kChar: {
+      std::unique_ptr<char[]> data(new char[Product(shape_)]);
+      for (size_t i = 0; i < Product(shape_); ++i)
+        data[i] = static_cast<char>(proto.bytes_data(i));
+      break;
+    }
+    case kUChar: {
+      std::unique_ptr<unsigned char[]> data(new unsigned char[Product(shape_)]);
+      for (size_t i = 0; i < Product(shape_); ++i)
+        data[i] = static_cast<unsigned char>(proto.bytes_data(i));
+      break;
+    }
+    */
+    default: { LOG(FATAL) << "Unsupported Type" << DataType_Name(data_type_); }
+  }
+}
+
+// Serialize into a TensorProto: shape, dtype and transpose flag, plus the
+// data in the repeated field matching this tensor's element type.
+// NOTE(review): data<T>() presumably requires host-accessible memory —
+// confirm behavior for device-resident tensors.
+void Tensor::ToProto(singa::TensorProto *proto) const {
+  proto->clear_shape();
+  for (auto s : shape_) {
+    proto->add_shape(s);
+  }
+  proto->set_data_type(data_type_);
+  proto->set_transpose(transpose_);
+  switch (data_type_) {
+    case kFloat32: {
+      proto->clear_float_data();
+      const float *data_ptr = data<float>();
+      for (size_t i = 0; i < Product(shape_); ++i)
+        proto->add_float_data(data_ptr[i]);
+      break;
+    }
+    case kDouble: {
+      proto->clear_double_data();
+      const double *data_ptr = data<double>();
+      for (size_t i = 0; i < Product(shape_); ++i)
+        proto->add_double_data(data_ptr[i]);
+      break;
+    }
+    case kInt: {
+      proto->clear_int_data();
+      const int *data_ptr = data<int>();
+      for (size_t i = 0; i < Product(shape_); ++i)
+        proto->add_int_data(data_ptr[i]);
+      break;
+    }
+    /*
+    case kChar: {
+      proto->clear_bytes_data();
+      const char *data = data<char>();
+      for (size_t i = 0; i < Product(shape_); ++i)
+        proto->add_bytes_data(static_cast<unsigned char>(data[i]));
+      break;
+    }
+    case kUChar: {
+      proto->clear_bytes_data();
+      const unsigned char *data = data<unsigned char>();
+      for (size_t i = 0; i < Product(shape_); ++i)
+        proto->add_bytes_data(static_cast<unsigned char>(data[i]));
+      break;
+    }
+    */
+    default: { LOG(FATAL) << "Unsupported Type" << DataType_Name(data_type_); }
+  }
+}
+
+// Deep copy.  When 'device' is given, the clone lives on that device;
+// otherwise it stays on this tensor's device.
+Tensor Tensor::Clone(std::shared_ptr<Device> device) const {
+  if (device == nullptr) device = device_;
+  // Use the requested device — the original passed device_ here, silently
+  // ignoring the parameter and always cloning onto the source's device.
+  Tensor t(shape_, device, data_type_);
+  t.transpose_ = transpose_;
+  t.CopyData(*this);
+  return t;
+}
+
+// Return a transposed shallow view of a 2-D tensor; shares the block.
+Tensor Tensor::T() const {
+  CHECK_EQ(shape_.size(), 2u);
+  Tensor t;
+  t.device_ = device_;
+  t.data_type_ = data_type_;
+  // Logical negation: the original used '~', which promotes the bool to int
+  // and always yields non-zero (true), so transposing twice never cleared
+  // the flag.
+  t.transpose_ = !transpose_;
+  t.shape_.push_back(shape_[1]);
+  t.shape_.push_back(shape_[0]);
+  t.block_ = block_;
+  block_->IncRefCount();
+  return t;
+}
+
+/// Copy assignment: release the currently held block (freeing it when this
+/// was the last reference) and take a new reference on 'in''s block.
+Tensor &Tensor::operator=(const Tensor &in) {
+  // LOG(ERROR) << "= const &";
+  // BUGFIX: guard against self-assignment. Without it, when this tensor
+  // holds the last reference, DecRefCount() frees the very block that is
+  // about to be re-acquired from 'in', leaving a dangling pointer.
+  if (this == &in) return *this;
+  if (block_ != nullptr && block_->DecRefCount() == 0)
+    device_->FreeBlock(block_);
+  transpose_ = in.transpose_;
+  data_type_ = in.data_type_;
+  shape_ = in.shape_;
+  device_ = in.device_;
+  block_ = in.block();
+  if (block_ != nullptr)
+    block_->IncRefCount();
+  return *this;
+}
+
+/// Move assignment: steal 'in''s block (no ref-count change) after releasing
+/// the block currently held by this tensor.
+Tensor &Tensor::operator=(Tensor &&in) {
+  // LOG(ERROR) << "= &&";
+  // BUGFIX: guard against self-move. Otherwise, with a refcount of 1,
+  // DecRefCount() frees the block and the subsequent steal reads freed
+  // memory (and nulls our own block_ via in.block_ = nullptr).
+  if (this == &in) return *this;
+  if (block_ != nullptr && block_->DecRefCount() == 0)
+    device_->FreeBlock(block_);
+  transpose_ = in.transpose_;
+  data_type_ = in.data_type_;
+  shape_ = std::move(in.shape_);
+  device_ = in.device_;
+  block_ = in.block_;
+  in.block_ = nullptr;
+  return *this;
+}
+
+/// Return a copy of 'in' (sharing the underlying block via the copy
+/// constructor) viewed with shape 's'.
+Tensor Reshape(const Tensor &in, const Shape &s) {
+  Tensor out = in;
+  out.Reshape(s);
+  return out;
+}
+
+/// Rvalue-shape overload: moves 's' into the reshaped copy.
+Tensor Reshape(const Tensor &in, Shape &&s) {
+  Tensor out = in;
+  out.Reshape(std::move(s));
+  return out;
+}
+
+// Defines a compound-assignment member operator (e.g. Tensor::operator+=)
+// that delegates to the free function 'fn' with *this as both the first
+// input and the output, i.e. fn(*this, in, this).
+#define GenUnaryTensorArgMemberFn(op, fn) \
+  Tensor &Tensor::op(const Tensor &in) {  \
+    fn(*this, in, this);                  \
+    return *this;                         \
+  }
+
+// Element-wise tensor-tensor compound assignments.
+GenUnaryTensorArgMemberFn(operator+=, Add);
+GenUnaryTensorArgMemberFn(operator-=, Sub);
+GenUnaryTensorArgMemberFn(operator*=, EltwiseMult);
+GenUnaryTensorArgMemberFn(operator/=, Div);
+
+// Defines a scalar compound-assignment member operator template delegating
+// to the free function 'fn', and explicitly instantiates it for float only
+// (the only scalar type instantiated in this translation unit).
+#define GenUnaryScalarArgMemberFn(op, fn) \
+  template <typename DType>               \
+  Tensor &Tensor::op(const DType x) {     \
+    fn(*this, x, this);                   \
+    return *this;                         \
+  }                                       \
+  template Tensor &Tensor::op<float>(const float x)
+
+// Element-wise tensor-scalar compound assignments.
+GenUnaryScalarArgMemberFn(operator-=, Sub);
+GenUnaryScalarArgMemberFn(operator+=, Add);
+GenUnaryScalarArgMemberFn(operator*=, EltwiseMult);
+GenUnaryScalarArgMemberFn(operator/=, Div);
+
+// ====================Tensor Operations=======================================
+/// Copy 'num' elements from 'src' (starting at element offset 'src_offset')
+/// into 'dst' (starting at element offset 'dst_offset'). Offsets and counts
+/// are in elements; both tensors must have the same element width. Copies
+/// across devices of different languages are routed through the non-cpp
+/// device; a direct CUDA<->OpenCL copy is not supported and aborts.
+void CopyDataToFrom(Tensor *dst, const Tensor &src, const size_t num,
+                    const size_t dst_offset, const size_t src_offset) {
+  auto width = SizeOf(src.data_type());
+  CHECK_EQ(width, SizeOf(dst->data_type()));
+  size_t nBytes = num * width;
+  auto d_offset = dst_offset * width;
+  auto s_offset = src_offset * width;
+  // Both blocks must cover the requested byte ranges.
+  CHECK_GE(src.MemSize(), s_offset + nBytes);
+  CHECK_GE(dst->MemSize(), d_offset + nBytes);
+
+  std::shared_ptr<Device> src_dev = src.device(), dst_dev = dst->device();
+  Block *from = src.block(), *to = dst->block();
+  if (dst_dev->lang() != src_dev->lang()) {
+    // Let the non-cpp device conduct the copy op.
+    if (dst_dev->lang() == kCpp) {
+      src_dev->CopyDataToFrom(to, from, nBytes, kDeviceToHost, d_offset,
+                              s_offset);
+    } else if (src_dev->lang() == kCpp) {
+      dst_dev->CopyDataToFrom(to, from, nBytes, kHostToDevice, d_offset,
+                              s_offset);
+    } else {
+      // BUGFIX: corrected typo in the fatal message ("betwee" -> "between").
+      LOG(FATAL) << "Not support mem copy between Cuda and OpenCL device";
+    }
+  } else {
+    auto direct = src_dev->lang() == kCpp ? kHostToHost : kDeviceToDevice;
+    src_dev->CopyDataToFrom(to, from, nBytes, direct, d_offset, s_offset);
+  }
+}
+//============================================================================
+/// typedef DType according to the type value.
+/// DType would be used in the code block __VA_ARGS__.
+/// Runtime-to-compile-time type dispatch: binds the C++ type matching the
+/// runtime DataType value 'type' to the alias DType and executes the
+/// statements in __VA_ARGS__ with that alias in scope. Supports kFloat32,
+/// kInt, kChar and kDouble; any other value aborts via LOG(FATAL).
+#define TYPE_SWITCH(type, DType, ...)                               \
+  do {                                                              \
+    switch (type) {                                                 \
+      case kFloat32: {                                              \
+        typedef float DType;                                        \
+        { __VA_ARGS__ }                                             \
+        break;                                                      \
+      }                                                             \
+      case kInt: {                                                  \
+        typedef int DType;                                          \
+        { __VA_ARGS__ }                                             \
+        break;                                                      \
+      }                                                             \
+      case kChar: {                                                 \
+        typedef char DType;                                         \
+        { __VA_ARGS__ }                                             \
+        break;                                                      \
+      }                                                             \
+      case kDouble: {                                               \
+        typedef double DType;                                       \
+        { __VA_ARGS__ }                                             \
+        break;                                                      \
+      }                                                             \
+      default:                                                      \
+        LOG(FATAL) << "Unknow data type = " << DataType_Name(type); \
+    }                                                               \
+  } while (0)
+
+/// typedef DType and Lang according to data type and device programming
+/// language respectively.
+/// type is from DataType, and lang is from LangType.
+/// DType and Lang would be used in __VA_ARGS__.
+// Combined dispatch over data type and device language: hashes the pair as
+// (dtype << 3) + ltype (the shift of 3 assumes all LangType values fit in 3
+// bits) and binds both DType and Lang aliases for __VA_ARGS__. Currently
+// only kFloat32 is supported, paired with kCuda, kCpp or kOpencl; any other
+// combination aborts via LOG(FATAL).
+#define TYPE_LANG_SWITCH(dtype, DType, ltype, Lang, ...)       \
+  do {                                                         \
+    const int _SwitchShift = 3;                                \
+    int _SwitchHash = ((dtype) << _SwitchShift) + (ltype);     \
+    switch (_SwitchHash) {                                     \
+      case ((kFloat32 << _SwitchShift) + kCuda): {             \
+        typedef float DType;                                   \
+        typedef lang::Cuda Lang;                               \
+        { __VA_ARGS__ }                                        \
+        break;                                                 \
+      }                                                        \
+      case ((kFloat32 << _SwitchShift) + kCpp): {              \
+        typedef float DType;                                   \
+        typedef lang::Cpp Lang;                                \
+        { __VA_ARGS__ }                                        \
+        break;                                                 \
+      }                                                        \
+      case ((kFloat32 << _SwitchShift) + kOpencl): {           \
+        typedef float DType;                                   \
+        typedef lang::Opencl Lang;                             \
+        { __VA_ARGS__ }                                        \
+        break;                                                 \
+      }                                                        \
+      default:                                                 \
+        LOG(FATAL) << "Unknown combination of data type "      \
+                   << DataType_Name(dtype) << " and language " \
+                   << LangType_Name(ltype);                    \
+    }                                                          \
+  } while (0)
+
+// =============Element-wise operations====================================
+// Sum of absolute values (BLAS asum) divided by the element count.
+// NOTE(review): the division by Size() makes this the MEAN absolute value
+// rather than the plain L1 norm — confirm the averaging is intended.
+// 'nrm' is captured by reference and read immediately after Exec() returns,
+// which assumes the Exec call completes synchronously — TODO confirm.
+float Tensor::L1() const {
+  float nrm = 0.0f;
+  TYPE_LANG_SWITCH(data_type_, DType, device_->lang(), Lang, {
+    device_->Exec([&nrm, this](Context *ctx) {
+      DType ret = DType(0);
+      Asum<DType, Lang>(this->Size(), this->block(), &ret, ctx);
+      nrm = TypeCast<DType, float>(ret);
+    }, {this->block()}, {});
+  });
+  return nrm / Size();
+}
+
+/// L2 norm, Do not use Nrm2 (name conflict).
+// Presumably Nrm2 computes the Euclidean norm (BLAS nrm2); the result is
+// then divided by Size(), i.e. this returns the norm scaled by 1/N rather
+// than the plain L2 norm — NOTE(review): confirm the scaling is intended.
+// As with L1(), 'nrm' is read right after Exec() returns, assuming a
+// synchronous Exec — TODO confirm.
+float Tensor::L2() const {
+  float nrm = 0.0f;
+  TYPE_LANG_SWITCH(data_type_, DType, device_->lang(), Lang, {
+    device_->Exec([&nrm, this](Context *ctx) {
+      DType ret = DType(0);
+      Nrm2<DType, Lang>(this->Size(), this->block(), &ret, ctx);
+      nrm = TypeCast<DType, float>(ret);
+    }, {this->block()}, {});
+  });
+  return nrm / Size();
+}
+
+// Fill every element with the scalar 'x'. The scalar's byte width must
+// match the tensor's element width (checked below); only the float
+// instantiation is emitted in this translation unit.
+template <typename SType>
+void Tensor::SetValue(const SType x) {
+  CHECK_EQ(sizeof(SType), SizeOf(data_type_));
+  auto size = Size();
+  auto ptr = block_;
+  TYPE_LANG_SWITCH(data_type_, DType, device_->lang(), Lang, {
+    // cast x to DType
+    device_->Exec([size, x, ptr](Context *ctx) {
+      Set<DType, Lang>(size, x, ptr, ctx);
+    }, {}, {ptr});
+  });
+}
+template void Tensor::SetValue<float>(const float x);
+
+// Schedules the element-wise unary kernel fn<DType, Lang> on ret's device,
+// reading t's block and writing ret's block.
+#define EltwiseUnaryTensorFn(fn, t, ret)                               \
+  do {                                                                 \
+    TYPE_LANG_SWITCH(t.data_type(), DType, t.device()->lang(), Lang, { \
+      ret->device()->Exec([t, ret](Context * ctx) {                    \
+        fn<DType, Lang>(t.Size(), t.block(), ret->block(), ctx);       \
+      }, {t.block()}, {ret->block()});                                 \
+    });                                                                \
+  } while (0)
+
+// Generates the two public forms of a unary op: a value-returning overload
+// that allocates the result, and an output-parameter overload.
+#define GenUnaryTensorFn(fn)                             \
+  Tensor fn(const Tensor &in) {                          \
+    Tensor ret(in.shape(), in.device(), in.data_type()); \
+    auto *retptr = &ret;                                 \
+    EltwiseUnaryTensorFn(fn, in, retptr);                \
+    return ret;                                          \
+  }                                                      \
+  void fn(const Tensor &in, Tensor *out) { EltwiseUnaryTensorFn(fn, in, out); }
+
+// Element-wise unary math functions.
+GenUnaryTensorFn(Abs);
+GenUnaryTensorFn(Exp);
+GenUnaryTensorFn(Log);
+GenUnaryTensorFn(ReLU);
+GenUnaryTensorFn(Sigmoid);
+GenUnaryTensorFn(Sign);
+GenUnaryTensorFn(Sqrt);
+GenUnaryTensorFn(Square);
+GenUnaryTensorFn(Tanh);
+
+// Schedules the element-wise binary kernel fn<DType, Lang> on ret's device.
+// The dispatch uses lhs's type/language; rhs is only checked for a matching
+// element width, and the result shape is taken from lhs.
+#define EltwiseBinaryTensorFn(fn, lhs, rhs, ret)                            \
+  do {                                                                      \
+    TYPE_LANG_SWITCH(lhs.data_type(), DType, lhs.device()->lang(), Lang, {  \
+      CHECK_EQ(sizeof(DType), SizeOf(rhs.data_type()));                     \
+      ret->device()->Exec([lhs, rhs, ret](Context * ctx) {                  \
+        fn<DType, Lang>(lhs.Size(), lhs.block(), rhs.block(), ret->block(), \
+                        ctx);                                               \
+      }, {lhs.block(), rhs.block()}, {ret->block()});                       \
+    });                                                                     \
+  } while (0)
+
+// Generates a binary operator 'op' (allocating form) plus the named
+// output-parameter function 'fn'.
+#define GenBinaryTensorFn(op, fn)                              \
+  Tensor op(const Tensor &lhs, const Tensor &rhs) {            \
+    Tensor ret(lhs.shape(), lhs.device(), lhs.data_type());    \
+    fn(lhs, rhs, &ret);                                        \
+    return ret;                                                \
+  }                                                            \
+  void fn(const Tensor &lhs, const Tensor &rhs, Tensor *ret) { \
+    EltwiseBinaryTensorFn(fn, lhs, rhs, ret);                  \
+  }
+
+// Element-wise arithmetic and comparison operators on tensor pairs.
+GenBinaryTensorFn(operator+, Add);
+GenBinaryTensorFn(operator-, Sub);
+GenBinaryTensorFn(operator*, EltwiseMult);
+GenBinaryTensorFn(operator/, Div);
+GenBinaryTensorFn(Pow, Pow);
+GenBinaryTensorFn(operator<, LT);
+GenBinaryTensorFn(operator<=, LE);
+GenBinaryTensorFn(operator>, GT);
+GenBinaryTensorFn(operator>=, GE);
+// Schedules the element-wise tensor-scalar kernel fn<DType, Lang>. The
+// static_assert requires the scalar type SType to be exactly the tensor's
+// element type DType (no implicit scalar conversion is performed).
+#define EltwiseTensorScalarFn(fn, t, x, ret)                            \
+  do {                                                                  \
+    TYPE_LANG_SWITCH(t.data_type(), DType, t.device()->lang(), Lang, {  \
+      static_assert(std::is_same<SType, DType>::value,                  \
+                    "The Scalar type must match the Tensor data type"); \
+      ret->device()->Exec([t, x, ret](Context * ctx) {                  \
+        fn<DType, Lang>(t.Size(), t.block(), x, ret->block(), ctx);     \
+      }, {t.block()}, {ret->block()});                                  \
+    });                                                                 \
+  } while (0)
+
+// Generates the tensor-scalar operator 'op' and named function 'fn',
+// explicitly instantiated for float scalars only.
+#define GenTensorScalarFn(op, fn)                             \
+  template <typename SType>                                   \
+  Tensor op(const Tensor &in, const SType x) {                \
+    Tensor ret(in.shape(), in.device(), in.data_type());      \
+    fn(in, x, &ret);                                          \
+    return ret;                                               \
+  }                                                           \
+  template <typename SType>                                   \
+  void fn(const Tensor &in, const SType x, Tensor *ret) {     \
+    EltwiseTensorScalarFn(fn, in, x, ret);                    \
+  }                                                           \
+  template Tensor op<float>(const Tensor &in, const float x); \
+  template void fn<float>(const Tensor &in, const float x, Tensor *ret)
+
+// Element-wise arithmetic and comparison against a scalar.
+GenTensorScalarFn(operator+, Add);
+GenTensorScalarFn(operator-, Sub);
+GenTensorScalarFn(operator*, EltwiseMult);
+GenTensorScalarFn(operator/, Div);
+GenTensorScalarFn(Pow, Pow);
+GenTensorScalarFn(operator<, LT);
+GenTensorScalarFn(operator<=, LE);
+GenTensorScalarFn(operator>, GT);
+GenTensorScalarFn(operator>=, GE);
+/// Scalar-over-tensor division, allocating form: ret[i] = alpha / in[i].
+template <typename SType>
+Tensor Div(const SType alpha, const Tensor &in) {
+  Tensor ret(in.shape(), in.device(), in.data_type());
+  Div(alpha, in, &ret);
+  return ret;
+}
+template Tensor Div<float>(const float, const Tensor &);
+
+// Scalar-over-tensor division: out[i] = alpha / in[i]. Requires matching
+// data type/language and identical shapes between 'in' and 'out'.
+template <typename SType>
+void Div(const SType alpha, const Tensor &in, Tensor *out) {
+  CheckDataTypeAndLang(in, *out);
+  CHECK(in.shape() == out->shape());
+  TYPE_LANG_SWITCH(in.data_type(), DType, in.device()->lang(), Lang, {
+    // TODO(wangwei) type cast SType to DType;
+    in.device()->Exec([alpha, in, out](Context *ctx) {
+      Div<DType, Lang>(in.Size(), alpha, in.block(), out->block(), ctx);
+    }, {in.block()}, {out->block()});
+  });
+}
+template void Div<float>(const float, const Tensor &, Tensor *);
+
+// =============Matrix operations============================================
+/// Mean of matrix M along 'axis' (0 or 1): Sum(M, axis) scaled by the
+/// inverse of the reduced dimension. The denominator is multiplied into a
+/// float because operator/ is only instantiated for float scalars.
+// TODO(wangwei) implement a generic scalar-cast helper for tensor
+// functions, e.g.
+//   template <typename S, typename D> D CastTo(S x) { return D(x); }
+// with specializations for special types such as fp16.
+Tensor Average(const Tensor &M, int axis) {
+  if (axis == 0) return Sum(M, 0) / (1.0f * M.shape(0));
+  CHECK_EQ(axis, 1);
+  return Sum(M, 1) / (1.0f * M.shape(1));
+}
+// TODO(wangwei) consider async execution
+// Sum of all elements, computed as dot(in, ones). Allocates a ones tensor
+// of the same size as 'in'.
+// NOTE(review): 's' is captured by reference and read right after Exec()
+// returns, so this assumes Exec completes synchronously here (see the TODO
+// above about async exec) — confirm.
+template <>
+float Sum<float>(const Tensor &in) {
+  float s = 0.0f;
+  Tensor one(in.shape(), in.device(), in.data_type());
+  one.SetValue(1.0f);
+  TYPE_LANG_SWITCH(in.data_type(), DType, in.device()->lang(), Lang, {
+    one.device()->Exec([in, one, &s](Context *ctx) {
+      DType ret = DType(0);
+      Dot<DType, Lang>(in.Size(), in.block(), one.block(), &ret, ctx);
+      s = ret;
+    }, {in.block(), one.block()}, {});
+  });
+  return s;
+}
+
+/// Reduce matrix M along 'axis': axis 0 sums over rows (result length =
+/// number of columns), axis 1 sums over columns (result length = number of
+/// rows). Only axes 0 and 1 are supported.
+Tensor Sum(const Tensor &M, int axis) {
+  if (axis == 0) {
+    Tensor colwise(Shape{M.shape(1)}, M.device(), M.data_type());
+    SumRows(M, &colwise);
+    return colwise;
+  }
+  CHECK_EQ(axis, 1) << "Not support Sum over axis = " << axis;
+  Tensor rowwise(Shape{M.shape(0)}, M.device(), M.data_type());
+  SumColumns(M, &rowwise);
+  return rowwise;
+}
+
+/// Softmax, allocating form: returns a fresh tensor with the same shape,
+/// device and data type as 'in'.
+Tensor SoftMax(const Tensor &in) {
+  Tensor ret(in.shape(), in.device(), in.data_type());
+  SoftMax(in, &ret);
+  return ret;
+}
+
+// Per-row maximum: treats 'in' as an (nrow x ncol) matrix (1-D input is
+// treated as a single row) and writes one max per row into 'ret'.
+// NOTE(review): for 1-D input 'ret' is allocated with in.shape(0) elements
+// but the kernel computes only one row maximum — confirm 1-D handling.
+Tensor RowMax(const Tensor &in) {
+  Tensor ret({in.shape(0)}, in.device(), in.data_type());
+  TYPE_LANG_SWITCH(in.data_type(), DType, in.device()->lang(), Lang, {
+    in.device()->Exec([in, ret](Context *ctx) {
+      size_t nrow = 1;
+      if (in.nDim() > 1) nrow = in.shape(0);
+      size_t ncol = in.Size() / nrow;
+      RowMax<DType, Lang>(nrow, ncol, in.block(), ret.block(), ctx);
+    }, {in.block()}, {ret.block()});
+  });
+  return ret;
+}
+
+// Row-wise softmax of a 1-D or 2-D tensor into 'out'. The computation is
+// the numerically-stable form: subtract each row's max, exponentiate, then
+// divide by the row sums. The statement order below is load-bearing.
+void SoftMax(const Tensor &in, Tensor *out) {
+  CHECK_LE(in.nDim(), 2u);
+  out->CopyData(in);
+  size_t nrow = 1, ncol = in.Size(), size = ncol;
+  if (in.nDim() == 2u) {
+    nrow = in.shape(0);
+    ncol = size / nrow;
+    out->Reshape(Shape{nrow, ncol});
+  }
+  // Stabilize: subtract the per-row maximum before exponentiating.
+  Tensor tmp = RowMax(*out);
+  SubColumn(tmp, out);
+  Exp(*out, out);
+
+  // Normalize each row by its sum; 'tmp' is reused for the row sums.
+  SumColumns(*out, &tmp);
+  DivColumn(tmp, out);
+  out->Reshape(in.shape());
+}
+
+void AddColumn(const Tensor &v, Tensor *M) { AddColumn(1, 1, v, M); }
+/// M = beta * M + alpha * v * ones(1, ncol), i.e. add the (scaled) column
+/// vector 'v' onto each column of matrix M. v.Size() must equal M's row
+/// count. Implemented as a rank-1 update via Mult.
+template <typename SType>
+void AddColumn(const SType alpha, const SType beta, const Tensor &v,
+               Tensor *M) {
+  if (M->transpose()) {
+    Tensor X = M->T();
+    // BUGFIX: forward alpha/beta to the transposed path. Previously this
+    // called AddRow(v, &X), silently dropping the scale factors (equivalent
+    // to alpha = beta = 1).
+    AddRow(alpha, beta, v, &X);
+  } else {
+    CHECK_EQ(M->nDim(), 2u);
+    // CHECK_EQ(v.nDim(), 1u); (chonho) shape of v is 2-element tuple
+    size_t nb_row = M->shape(0), nb_col = M->shape(1);
+    CHECK_EQ(nb_row, v.Size());
+
+    Tensor one(Shape{1, nb_col}, M->device(), M->data_type());
+    one.SetValue(1.0f);  // TODO(wangwei) cast type
+    Tensor vmat = Reshape(v, Shape{nb_row, 1});
+    Mult(alpha, vmat, one, beta, M);
+  }
+}
+template
+void AddColumn(const float alpha, const float beta, const Tensor &v, Tensor *M);
+
+void AddRow(const Tensor &v, Tensor *M) { AddRow(1, 1, v, M); }
+
+/// M = beta * M + alpha * ones(nrow, 1) * v, i.e. add the (scaled) row
+/// vector 'v' onto each row of matrix M. v.Size() must equal M's column
+/// count. (Previous comment wrongly described this as a column subtraction.)
+template <typename SType>
+void AddRow(const SType alpha, const SType beta, const Tensor &v, Tensor *M) {
+  if (M->transpose()) {
+    Tensor X = M->T();
+    // BUGFIX: forward alpha/beta to the transposed path. Previously this
+    // called AddColumn(v, &X), silently dropping the scale factors
+    // (equivalent to alpha = beta = 1).
+    AddColumn(alpha, beta, v, &X);
+  } else {
+    CHECK_EQ(M->nDim(), 2u);
+    // CHECK_EQ(v.nDim(), 1u); (chonho) shape of v is 2-element tuple
+    size_t nb_row = M->shape(0), nb_col = M->shape(1);
+    CHECK_EQ(nb_col, v.Size());
+
+    Tensor one(Shape{nb_row, 1}, M->device(), M->data_type());
+    one.SetValue(1.0f);
+    Tensor vmat = Reshape(v, Shape{1, nb_col});
+    Mult(alpha, one, vmat, beta, M);
+  }
+}
+template void AddRow(const float alpha, const float beta, const Tensor &v,
+                     Tensor *M);
+
+/// Divide column 'v' by each column of matrix M; write results into 'out'
+// Implemented as multiplication by the element-wise reciprocal of v:
+// M[i][j] = M[i][j] * (1 / v[i]).
+void DivColumn(const Tensor &v, Tensor *M) {
+  Tensor inv;
+  TYPE_SWITCH(v.data_type(), DType, { inv = Div(DType(1), v); });
+  MultColumn(inv, M);
+}
+
+/// Stack 2-D tensors vertically. Every input must be an untransposed 2-D
+/// tensor and all inputs must share the same column count; the result has
+/// the summed row count. Data is copied block after block since rows are
+/// contiguous.
+Tensor ConcatenateRows(const vector<Tensor> &in) {
+  CHECK(in.size());
+  size_t total_rows = 0, cols = 0;
+  for (const auto &t : in) {
+    CHECK(!t.transpose());
+    CHECK_EQ(t.nDim(), 2u);
+    total_rows += t.shape(0);
+    if (cols == 0)
+      cols = t.shape(1);
+    else
+      CHECK_EQ(cols, t.shape(1));
+  }
+  Tensor out(Shape{total_rows, cols}, in.at(0).device(), in.at(0).data_type());
+  size_t offset = 0;
+  for (const auto &t : in) {
+    CopyDataToFrom(&out, t, t.Size(), offset, 0);
+    offset += t.Size();
+  }
+  return out;
+}
+
+// TODO(wangwei) add a copy-patch function to improve the efficiency on GPU.
+// Stack 2-D tensors horizontally: all inputs must be untransposed 2-D
+// tensors with the same row count; the result has the summed column count.
+// Copies row by row because columns of different inputs interleave in the
+// output's row-major layout.
+Tensor ConcatenateColumns(const vector<Tensor> &in) {
+  size_t nrow = 0, ncol = 0;
+  CHECK(in.size());
+  for (const auto &x : in) {
+    CHECK(!x.transpose());
+    CHECK_EQ(x.nDim(), 2u);
+    ncol += x.shape(1);
+    if (nrow == 0)
+      nrow = x.shape(0);
+    else
+      CHECK_EQ(nrow, x.shape(0));
+  }
+  Tensor out(Shape{nrow, ncol}, in.at(0).device(), in.at(0).data_type());
+  for (size_t row = 0; row < nrow; row++) {
+    size_t dst_offset = row * ncol;
+    for (const auto &x : in) {
+      size_t src_offset = row * x.shape(1);
+      CopyDataToFrom(&out, x, x.shape(1), dst_offset, src_offset);
+      dst_offset += x.shape(1);
+    }
+    // Sanity check: each output row must be exactly filled.
+    CHECK_EQ(dst_offset, row * ncol + ncol);
+  }
+  return out;
+}
+/// Return rows [start, end) of 'in' as a new tensor. Works for any rank;
+/// the leading dimension is sliced and rows are copied in one contiguous
+/// chunk.
+Tensor CopyRows(const Tensor &in, const size_t start, const size_t end) {
+  CHECK_LT(start, end);
+  CHECK_GE(in.shape(0), end);
+  Shape out_shape = in.shape();
+  out_shape[0] = end - start;
+  const size_t row_elems = in.Size() / in.shape(0);
+  Tensor out(out_shape, in.device(), in.data_type());
+  CopyDataToFrom(&out, in, out.Size(), 0, start * row_elems);
+  return out;
+}
+/// Return columns [start, end) of the 2-D tensor 'in' as a new tensor.
+/// Copies row by row since the selected columns are not contiguous in
+/// row-major layout.
+Tensor CopyColumns(const Tensor &in, const size_t start, const size_t end) {
+  CHECK_EQ(in.nDim(), 2u);
+  CHECK_LT(start, end);
+  CHECK_GE(in.shape(1), end);
+  const size_t width = end - start;
+  Tensor out(Shape{in.shape(0), width}, in.device(), in.data_type());
+  for (size_t r = 0; r < out.shape(0); r++) {
+    const size_t from = r * in.shape(1) + start;
+    const size_t to = r * out.shape(1);
+    CopyDataToFrom(&out, in, width, to, from);
+  }
+  return out;
+}
+
+/// Divide row 'v' by each row of matrix M; write results into 'out'
+// Implemented as multiplication by the element-wise reciprocal of v:
+// M[i][j] = M[i][j] * (1 / v[j]).
+void DivRow(const Tensor &v, Tensor *M) {
+  Tensor inv;
+  TYPE_SWITCH(v.data_type(), DType, { inv = Div(DType(1), v); });
+  MultRow(inv, M);
+}
+
+/// Multiply column 'v' and each column of matrix M; write results into 'out'
+// In-place scaling of each row i of M by v[i], via a diagonal-matrix
+// multiply (DGMM with side flag false). M's block is both read and written.
+void MultColumn(const Tensor &v, Tensor *M) {
+  CHECK(!M->transpose()) << "Not supported yet";
+  CHECK_EQ(M->nDim(), 2u);
+  // CHECK_EQ(v.nDim(), 1u); (chonho) shape of v is 2-element tuple
+  CHECK_EQ(v.Size(), M->shape(0));
+  CheckDataTypeAndLang(*M, v);
+  TYPE_LANG_SWITCH(v.data_type(), DType, v.device()->lang(), Lang, {
+    v.device()->Exec([M, v](Context *ctx) {
+      DGMM<DType, Lang>(false, M->shape(0), M->shape(1), M->block(), v.block(),
+                        M->block(), ctx);
+    }, {M->block(), v.block()}, {M->block()});
+  });
+}
+
+/// Multiply row 'v' with each row of matrix M; write results into 'out'
+// In-place scaling of each column j of M by v[j], via a diagonal-matrix
+// multiply (DGMM with side flag true). M's block is both read and written.
+void MultRow(const Tensor &v, Tensor *M) {
+  CHECK(!M->transpose()) << "Not supported yet";
+  CHECK_EQ(M->nDim(), 2u);
+  // CHECK_EQ(v.nDim(), 1u); (chonho) shape of v is 2-element tuple
+  CHECK_EQ(v.Size(), M->shape(1));
+  CheckDataTypeAndLang(*M, v);
+  TYPE_LANG_SWITCH(v.data_type(), DType, v.device()->lang(), Lang, {
+    v.device()->Exec([M, v](Context *ctx) {
+      DGMM<DType, Lang>(true, M->shape(0), M->shape(1), M->block(), v.block(),
+                        M->block(), ctx);
+    }, {M->block(), v.block()}, {M->block()});
+  });
+}
+
+// Unimplemented stub: always aborts via LOG(FATAL). The commented-out code
+// below sketches the intended shape computation for a future implementation.
+Tensor SliceRows(const Tensor &in, const size_t start, const size_t end) {
+  LOG(FATAL) << "Tensor::SliceRows is not implemented";
+  Tensor ret;
+  /*
+  CHECK_LE(in.nDim(), 2);
+  CHECK_LT(start, end);
+  CHECK_LE(in.shape(0), end);
+  Shape s;
+  if (in.nDim() == 2)
+    s = Shape{end - start, in.shape(1)};
+  else
+    s = Shape{end - start};
+  Tensor out(s, in.device(), in.data_type());
+  Block *b = out.block();
+  */
+  return ret;
+}
+// Subtract column vector 'v' from each column of M (alpha = -1, beta = 1).
+void SubColumn(const Tensor &v, Tensor *M) { AddColumn(-1, 1, v, M); }
+
+// Subtract row vector 'v' from each row of M (alpha = -1, beta = 1).
+void SubRow(const Tensor &v, Tensor *M) { AddRow(-1, 1, v, M); }
+
+// Sum M along its columns: v[i] = sum_j M[i][j], i.e. one sum per row.
+// v must have M's row count. Implemented as a matrix-vector product with a
+// ones vector; transposed inputs are routed through SumRows on M^T.
+void SumColumns(const Tensor &M, Tensor *v) {
+  if (M.transpose()) {
+    Tensor X = M.T();
+    SumRows(X, v);
+  } else {
+    CHECK_EQ(M.nDim(), 2u);
+    // CHECK_EQ(v->nDim(), 1u); (chonho) shape of v is 2-element tuple
+    size_t nb_row = M.shape().at(0), nb_col = M.shape().at(1);
+    CHECK_EQ(nb_row, v->Size());
+
+    Tensor one(Shape{nb_col}, M.device(), M.data_type());
+    one.SetValue(1.0f);  // TODO(wangwei) cast type
+    Mult(M, one, v);
+  }
+}
+// Sum M along its rows: v[j] = sum_i M[i][j], i.e. one sum per column.
+// v must have M's column count. Computed as M^T * ones(nrow); transposed
+// inputs are routed through SumColumns on M^T.
+void SumRows(const Tensor &M, Tensor *v) {
+  if (M.transpose()) {
+    Tensor X = M.T();
+    SumColumns(X, v);
+  } else {
+    CHECK_EQ(M.nDim(), 2u);
+    // CHECK_EQ(v->nDim(), 1u); (chonho) shape of v is 2-element tuple
+    size_t nb_row = M.shape(0), nb_col = M.shape(1);
+    CHECK_EQ(nb_col, v->Size());
+
+    Tensor one(Shape{nb_row}, M.device(), M.data_type());
+    one.SetValue(1.0f);  // TODO(wangwei) cast type
+    Tensor X = M.T();
+    Mult(X, one, v);
+  }
+}
+// ====================Random operations=====================================
+// Fill 'out' with Bernoulli(p) samples. The trailing 'true' passed to Exec
+// presumably marks the op as needing the device's random-generator context
+// — NOTE(review): confirm against Device::Exec's signature.
+template <typename SType>
+void Bernoulli(const SType p, Tensor *out) {
+  TYPE_LANG_SWITCH(out->data_type(), DType, out->device()->lang(), Lang, {
+    auto prob = TypeCast<SType, DType>(p);
+    out->device()->Exec([prob, out](Context *ctx) {
+      Bernoulli<DType, Lang>(out->Size(), prob, out->block(), ctx);
+    }, {}, {out->block()}, true);
+  });
+}
+template void Bernoulli<float>(const float p, Tensor *out);
+
+// Fill 'out' with Uniform(low, high) samples; bounds are cast to the
+// tensor's element type. Trailing 'true' flag as in Bernoulli above.
+template <typename SType>
+void Uniform(const SType low, const SType high, Tensor *out) {
+  TYPE_LANG_SWITCH(out->data_type(), DType, out->device()->lang(), Lang, {
+    auto l = TypeCast<SType, DType>(low);
+    auto h = TypeCast<SType, DType>(high);
+    out->device()->Exec([l, h, out](Context *ctx) {
+      Uniform<DType, Lang>(out->Size(), l, h, out->block(), ctx);
+    }, {}, {out->block()}, true);
+  });
+}
+template void Uniform<float>(const float low, const float high, Tensor *out);
+
+// Fill 'out' with Gaussian(mean, std) samples; parameters are cast to the
+// tensor's element type. Trailing 'true' flag as in Bernoulli above.
+template <typename SType>
+void Gaussian(const SType mean, const SType std, Tensor *out) {
+  TYPE_LANG_SWITCH(out->data_type(), DType, out->device()->lang(), Lang, {
+    auto m = TypeCast<SType, DType>(mean);
+    auto s = TypeCast<SType, DType>(std);
+    out->device()->Exec([m, s, out](Context *ctx) {
+      Gaussian<DType, Lang>(out->Size(), m, s, out->block(), ctx);
+    }, {}, {out->block()}, true);
+  });
+}
+template void Gaussian<float>(const float mean, const float std, Tensor *out);
+
+// ================Blas operations============================================
+
+// BLAS axpy: out = alpha * in + out, element-wise. 'alpha' is cast to the
+// tensor's element type; only the float instantiation is emitted here.
+template <typename SType>
+void Axpy(const SType alpha, const Tensor &in, Tensor *out) {
+  TYPE_LANG_SWITCH(in.data_type(), DType, in.device()->lang(), Lang, {
+    auto a = TypeCast<SType, DType>(alpha);
+    out->device()->Exec([a, in, out](Context *ctx) {
+      Axpy<DType, Lang>(in.Size(), a, in.block(), out->block(), ctx);
+    }, {in.block(), out->block()}, {out->block()});
+  });
+}
+template
+void Axpy<float>(const float alpha, const Tensor &in, Tensor *out);
+
+/// Matrix(-vector) product, allocating form. The result is a vector of
+/// length A.shape(0) when B is 1-D, otherwise an A.shape(0) x B.shape(1)
+/// matrix.
+Tensor Mult(const Tensor &A, const Tensor &B) {
+  Shape result_shape{A.shape(0)};
+  if (B.nDim() == 2) result_shape.push_back(B.shape(1));
+  Tensor out(result_shape, A.device(), A.data_type());
+  Mult(A, B, &out);
+  return out;
+}
+
+// out = A * B (alpha = 1, beta = 0; any previous content of out is
+// overwritten).
+void Mult(const Tensor &A, const Tensor &B, Tensor *out) {
+  Mult(1.0f, A, B, 0.0f, out);
+}
+
+// General product: C = alpha * A * B + beta * C. A must be 2-D. Dispatches
+// to GEMV when B is 1-D and to GEMM otherwise; in the GEMM path C must not
+// be transposed. Dimensions are read from A's and B's (possibly transposed)
+// shapes; alpha/beta are cast to the element type.
+template <typename SType>
+void Mult(const SType alpha, const Tensor &A, const Tensor &B, const SType beta,
+          Tensor *C) {
+  CHECK_EQ(A.shape().size(), 2u);
+  if (B.nDim() == 1u) {
+    // Matrix-vector product via BLAS gemv.
+    TYPE_LANG_SWITCH(A.data_type(), DType, A.device()->lang(), Lang, {
+      auto a = TypeCast<SType, DType>(alpha);
+      auto b = TypeCast<SType, DType>(beta);
+      C->device()->Exec([a, A, b, B, C](Context *ctx) {
+        GEMV<DType, Lang>(A.transpose(), A.shape(0), A.shape(1), a, A.block(),
+                          B.block(), b, C->block(), ctx);
+      }, {A.block(), B.block()}, {C->block()});
+    });
+  } else {
+    // Matrix-matrix product via BLAS gemm; result tensor must be plain.
+    CHECK(!C->transpose());
+    TYPE_LANG_SWITCH(A.data_type(), DType, A.device()->lang(), Lang, {
+      auto a = TypeCast<SType, DType>(alpha);
+      auto b = TypeCast<SType, DType>(beta);
+      C->device()->Exec([a, A, b, B, C](Context *ctx) {
+        GEMM<DType, Lang>(A.transpose(), B.transpose(), A.shape(0), B.shape(1),
+                          A.shape(1), a, A.block(), B.block(), b, C->block(),
+                          ctx);
+      }, {A.block(), B.block()}, {C->block()});
+    });
+  }
+}
+
+// ************************
+// Misc.
+// ***********************
+// Cross-entropy loss per sample: 'p' holds predictions (1-D for a single
+// sample or 2-D with one row per sample), 't' the targets; the per-sample
+// losses are written into loss's block by the device kernel.
+void ComputeCrossEntropy(const Tensor &p, const Tensor &t, Tensor *loss) {
+  CHECK_LE(p.nDim(), 2u);
+  CHECK_LE(t.nDim(), 2u);  // TODO(wangwei) consider multi-labels.
+  size_t batchsize = 1;
+  if (p.nDim() == 2u) batchsize = p.shape(0);
+  size_t dim = p.Size() / batchsize;
+  TYPE_LANG_SWITCH(p.data_type(), DType, p.device()->lang(), Lang, {
+    p.device()->Exec([batchsize, dim, t, p, loss](Context *ctx) {
+      ComputeCrossEntropy<DType, Lang>(batchsize, dim, p.block(), t.block(),
+                                       loss->block(), ctx);
+    }, {p.block(), t.block()}, {loss->block()});
+  });
+}
+// Backward pass of softmax + cross-entropy: overwrites 'p' (the softmax
+// probabilities) in place with the gradient, using targets 't'. p's block
+// is both read and written by the kernel.
+void SoftmaxCrossEntropyBwd(const Tensor &t, Tensor *p) {
+  CHECK_LE(p->nDim(), 2u);
+  CHECK_LE(t.nDim(), 2u);  // TODO(wangwei) consider multi-labels.
+  size_t batchsize = 1;
+  if (p->nDim() == 2u) batchsize = p->shape(0);
+  size_t dim = p->Size() / batchsize;
+  TYPE_LANG_SWITCH(p->data_type(), DType, p->device()->lang(), Lang, {
+    p->device()->Exec([batchsize, dim, t, p](Context *ctx) {
+      SoftmaxCrossEntropyBwd<DType, Lang>(batchsize, dim, p->block(), t.block(),
+                                          p->block(), ctx);
+    }, {p->block(), t.block()}, {p->block()});
+  });
+}
+
+}  // namespace singa
diff --git a/src/core/tensor/tensor_math.h b/src/core/tensor/tensor_math.h
new file mode 100644
index 0000000..bf913c0
--- /dev/null
+++ b/src/core/tensor/tensor_math.h
@@ -0,0 +1,416 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef SINGA_CORE_MATH_H_
+#define SINGA_CORE_MATH_H_
+#include <type_traits>
+#include "singa/core/common.h"
+#include "singa/utils/logging.h"
+
+namespace singa {
+
+/// \file math.h Math functions for linear algebra, neural net and random
+/// operations.
+/// All functions have a template argument, DType for DataType, Lang for the
+/// device programming language, e.g., Langice::kCpp, Langice::kCuda
+///
+/// TODO(wangwei) Clean the functions to make the function APIs consistent:
+/// 1. All function names should be like XxxYyy or XY, i.e., capitalize the
+///    first letter.
+/// 2. Order functions based on function name in alphabetical order.
+/// 3. Function arguments order is [const basic type] [const Block] [mutable
+/// Block].
+/// 4. Function argument names, use 'num' for total number of elements in
+///    elementwise operations; use 'in1' 'in2' for in blocks; use 'out' for
+///    output block or value. With exceptions for some functions, e.g.,
+///      Scale(const float alpha, const Block* in, Block* out);
+///    For such cases, use x, v, alpha, etc for scalar types.
+///    For blas functions, follow the blas style for argument names.
+///    Use 'M' and 'v' for matrix and vector tensors in functions involving both
+///    matrix and vectors.
+/// 5. For Block argument xxx, name its raw pointer as xxxPtr.
+/// 6. Pass the 'cudaStream_t s' to every function in math_kernel.h
+/// 7. Use size_t for the number of elements, rows or columns.
+/// 8. Use the same name for the Tensor and Block level math functions.
+
+// **************************************
+// Element-wise functions
+// **************************************
+
+/// out[i] = |in[i]|
+template <typename DType, typename Lang>
+void Abs(const size_t num, const Block *in, Block *out, Context *ctx) {
+  LOG(FATAL) << "Abs Not Implemented";
+}
+
+/// out[i] = in[i] + x
+template <typename DType, typename Lang>
+void Add(const size_t num, const Block *in, const DType x, Block *out,
+         Context *ctx) {
+  LOG(FATAL) << "Add Not Implemented";
+}
+
+/// out[i] = in1[i] + in2[i]
+template <typename DType, typename Lang>
+void Add(const size_t num, const Block *in1, const Block *in2, Block *out,
+         Context *ctx) {
+  LOG(FATAL) << "Add-Pair Not Implemented";
+}
+/// Clamp every element into [low, high]
+/// if in[i]>high, then out[i]=high; if in[i]<low, then out[i]=low.
+template <typename DType, typename Lang>
+void Clamp(const size_t num, const DType low, const DType high, const Block *in,
+           Block *out, Context *ctx) {
+  LOG(FATAL) << "Clamp Not Implemented";
+}
+
+/// out[i] = x / in[i]
+template <typename DType, typename Lang>
+void Div(const size_t num, const DType x, const Block *in, Block *out,
+         Context *ctx) {
+  LOG(FATAL) << "Div Not Implemented";
+}
+
+/// out[i] = in[i] / x
+// Forward declaration: EltwiseMult is defined further down in this header
+// but is referenced here with an explicit template-argument list; without a
+// prior declaration the 'EltwiseMult<DType, Lang>' template-id would not
+// parse (the name must be known to be a template at this point).
+template <typename DType, typename Lang>
+void EltwiseMult(const size_t num, const Block *in, const DType x, Block *out,
+                 Context *ctx);
+
+template <typename DType, typename Lang>
+void Div(const size_t num, const Block *in, const DType x, Block *out,
+         Context *ctx) {
+  CHECK_NE(x, 0.f);  // division by zero is a programming error here
+  // Delegate to EltwiseMult so backends only need to specialize one op.
+  EltwiseMult<DType, Lang>(num, in, DType(1) / x, out, ctx);
+}
+
+/// out[i] = in1[i] / in2[i]
+template <typename DType, typename Lang>
+void Div(const size_t num, const Block *in1, const Block *in2, Block *out,
+         Context *ctx) {
+  LOG(FATAL) << "Div-Pair Not Implemented";
+}
+
+/// out[i] = in[i] * x
+template <typename DType, typename Lang>
+void EltwiseMult(const size_t num, const Block *in, const DType x, Block *out,
+                 Context *ctx) {
+  LOG(FATAL) << "EltwiseMult Not Implemented";
+}
+
+/// out[i] = in1[i] * in2[i]
+template <typename DType, typename Lang>
+void EltwiseMult(const size_t num, const Block *in1, const Block *in2, Block *out,
+                 Context *ctx) {
+  LOG(FATAL) << "EltwiseMult-Pair Not Implemented";
+}
+
+/// Base is e, Neper number. out[i]=exp(in[i])
+template <typename DType, typename Lang>
+void Exp(const size_t num, const Block *in, Block *out, Context *ctx) {
+  LOG(FATAL) << "Exp Not Implemented";
+}
+
+/// out[i]=(in[i]<=x)?1.f:0.f
+template <typename DType, typename Lang>
+void LE(const size_t num, const Block *in, const DType x, Block *out,
+        Context *ctx) {
+  LOG(FATAL) << "LE Not Implemented";
+}
+/// out[i]=(in1[i]<=in2[i])?1.f:0.f
+template <typename DType, typename Lang>
+void LE(const size_t num, const Block *in1, const Block *in2, Block *out,
+        Context *ctx) {
+  LOG(FATAL) << "Tensor-Tensor LE Not Implemented";
+}
+/// Natural logarithm, base e (Euler's number): out[i] = log(in[i]).
+template <typename DType, typename Lang>
+void Log(const size_t num, const Block *in, Block *out, Context *ctx) {
+  LOG(FATAL) << "Log Not Implemented";
+}
+/// out[i]=(in[i]<x)?1.f:0.f
+template <typename DType, typename Lang>
+void LT(const size_t num, const Block *in, const DType x, Block *out,
+        Context *ctx) {
+  LOG(FATAL) << "LT Not Implemented";
+}
+/// out[i]=(in1[i]<in2[i])?1.f:0.f
+template <typename DType, typename Lang>
+void LT(const size_t num, const Block *in1, const Block *in2, Block *out,
+        Context *ctx) {
+  LOG(FATAL) << "Tensor-Tensor LT Not Implemented";
+}
+/// out[i]=(in[i]>=x)?1.f:0.f
+template <typename DType, typename Lang>
+void GE(const size_t num, const Block *in, const DType x, Block *out,
+        Context *ctx) {
+  LOG(FATAL) << "GE Not Implemented";
+}
+/// out[i]=(in1[i]>=in2[i])?1.f:0.f
+template <typename DType, typename Lang>
+void GE(const size_t num, const Block *in1, const Block *in2, Block *out,
+        Context *ctx) {
+  LOG(FATAL) << "Tensor-Tensor GE Not Implemented";
+}
+/// out[i]=(in[i]>x)?1.f:0.f
+template <typename DType, typename Lang>
+void GT(const size_t num, const Block *in, const DType x, Block *out,
+        Context *ctx) {
+  LOG(FATAL) << "GT Not Implemented";
+}
+/// out[i]=(in1[i]>in2[i])?1.f:0.f
+// Renamed the first parameter 'in' -> 'in1' for consistency with the other
+// tensor-tensor comparison stubs (GE, LE, LT) in this header.
+template <typename DType, typename Lang>
+void GT(const size_t num, const Block *in1, const Block *in2, Block *out,
+        Context *ctx) {
+  LOG(FATAL) << "Tensor-Tensor GT Not Implemented";
+}
+/// out[i] = pow(in[i], x)
+template <typename DType, typename Lang>
+void Pow(const size_t num, const Block *in, const DType x, Block *out,
+         Context *ctx) {
+  LOG(FATAL) << "Pow Not Implemented";
+}
+
+/// out[i]=pow(in1[i], in2[i])
+template <typename DType, typename Lang>
+void Pow(const size_t num, const Block *in1, const Block *in2, Block *out,
+         Context *ctx) {
+  LOG(FATAL) << "Pow-Pair Not Implemented";
+}
+
+/// out[i]=max(0, in[i])
+template <typename DType, typename Lang>
+void ReLU(const size_t num, const Block *in, Block *out, Context *ctx) {
+  LOG(FATAL) << "ReLU Not Implemented";
+}
+
+/// out[i] = x
+template <typename DType, typename Lang>
+void Set(const size_t num, const DType x, Block *out, Context *ctx) {
+  LOG(FATAL) << "Set Not Implemented";
+}
+/// out[i]=sigmoid(in[i])
+template <typename DType, typename Lang>
+void Sigmoid(const size_t num, const Block *in, Block *out, Context *ctx) {
+  LOG(FATAL) << "Sigmoid Not Implemented";
+}
+
+/// out[i] = sign(in[i])
+template <typename DType, typename Lang>
+void Sign(const size_t num, const Block *in, Block *out, Context *ctx) {
+  LOG(FATAL) << "Sign Not Implemented";
+}
+/// out[i]=sqrt(in[i])
+template <typename DType, typename Lang>
+void Sqrt(const size_t num, const Block *in, Block *out, Context *ctx) {
+  LOG(FATAL) << "Sqrt Not Implemented";
+}
+
+/// out[i]=square(in[i])
+template <typename DType, typename Lang>
+void Square(const size_t num, const Block *in, Block *out, Context *ctx) {
+  EltwiseMult<DType, Lang>(num, in, in, out, ctx);
+}
+
+/// out[i] =  in[i] - x
+template <typename DType, typename Lang>
+void Sub(const size_t num, const Block *in, const DType x, Block *out,
+         Context *ctx) {
+  Add<DType, Lang>(num, in, -x, out, ctx);
+}
+
+/// out[i] = in1[i] - in2[i]
+template <typename DType, typename Lang>
+void Sub(const size_t num, const Block *in1, const Block *in2, Block *out,
+         Context *ctx) {
+  LOG(FATAL) << "Sub-Pair Not Implemented";
+}
+
+/// sum all elements of in into out
+template <typename DType, typename Lang>
+void Sum(const size_t num, const Block *in, DType *out, Context *ctx) {
+  LOG(FATAL) << "Sum Not Implemented";
+}
+
+/// out[i]=tanh(in[i])
+template <typename DType, typename Lang>
+void Tanh(const size_t num, const Block *in, Block *out, Context *ctx) {
+  LOG(FATAL) << "Tanh Not Implemented";
+}
+
+// **************************************
+// Random functions
+// **************************************
+/// Each element of out would be 1 with prob p and 0 with 1-p. 0<= p <= 1
+// Get the random generator from 'ctx'
+// If DType is not float, then convert the threshold to DType
+template <typename DType, typename Lang>
+void Bernoulli(const size_t num, const float p, Block *out, Context *ctx) {
+  LOG(FATAL) << "Bernoulli Not Implemented";
+}
+// The random generator should be extracted from ctx.
+// If DType is not float, then convert the mean and std to DType
+template <typename DType, typename Lang>
+void Gaussian(const size_t num, const float mean, const float std, Block *out,
+              Context *ctx) {
+  LOG(FATAL) << "Gaussian Not Implemented";
+}
+// The random generator should be extracted from ctx.
+// If DType is not float, then convert the low and high to DType
+template <typename DType, typename Lang>
+void Uniform(const size_t num, const float low, const float high, Block *out,
+             Context *ctx) {
+  LOG(FATAL) << "Uniform Not Implemented";
+}
+
+// *********************************************************
+// BLAS functions, ref to http://docs.nvidia.com/cuda/cublas
+// *********************************************************
+
+/// Return the index of the element with the max value into 'out'.
+template <typename DType, typename Lang>
+void Amax(const size_t num, const Block *in, size_t *out, Context *ctx) {
+  LOG(FATAL) << "Amax Not Implemented";
+}
+
+/// Return the index of the element with the min value into 'out'.
+template <typename DType, typename Lang>
+void Amin(const size_t num, const Block *in, size_t *out, Context *ctx) {
+  LOG(FATAL) << "Amin Not Implemented";
+}
+/// out = sum |x| for all x in in
+template <typename DType, typename Lang>
+void Asum(const size_t num, const Block *in, DType *out, Context *ctx) {
+  LOG(FATAL) << "Asum Not Implemented";
+}
+
+/// out = alpha * in + out
+template <typename DType, typename Lang>
+void Axpy(const size_t num, const DType alpha, const Block *in, Block *out,
+          Context *ctx) {
+  LOG(FATAL) << "Axpy Not Implemented";
+}
+
+/// out = ||in||_2, i.e., the L2 (Euclidean) norm.
+// NOTE(review): the original comment said ||in||_2^2, but the CBLAS backend
+// implements this with cblas_snrm2, which returns the NON-squared norm —
+// confirm the intended contract with callers.
+template <typename DType, typename Lang>
+void Nrm2(const size_t num, const Block *in, float *out, Context *ctx) {
+  LOG(FATAL) << "Nrm2 Not Implemented";
+}
+
+/// out *= x
+template <typename DType, typename Lang>
+void Scale(const size_t num, const DType x, Block *out, Context *ctx) {
+  LOG(FATAL) << "Scale Not Implemented";
+}
+
+/// inner product of array in1 and in2
+template <typename DType, typename Lang>
+void Dot(const size_t num, const Block *in1, const Block *in2, DType *out,
+         Context *ctx) {
+  LOG(FATAL) << "Dot Not Implemented";
+}
+
+/// out = alpha * A * v + beta * out.
+/// transA indicates if the internal data layout is transposed of A
+template <typename DType, typename Lang>
+void GEMV(bool trans, const size_t m, const size_t n, const DType alpha,
+          const Block *A, const Block *v, const DType beta, Block *out,
+          Context *ctx) {
+  LOG(FATAL) << "GEMV Not Implemented";
+}
+
+/// Multiply a matrix with a diagonal matrix constructed using values from 'v'.
+/// If side_right is true, do M*diag(v); else do diag(v)*M.
+template <typename DType, typename Lang>
+void DGMM(const bool side_right, const size_t nrow, const size_t ncol,
+          const Block *M, const Block *v, Block *out, Context *ctx) {
+  LOG(FATAL) << "DGMM Not Implemented";
+}
+
+/// C = alpha * A * B + beta * C.
+/// transA indicates if the internal data layout is transposed of A
+template <typename DType, typename Lang>
+void GEMM(const bool transA, const bool transB, const size_t nrowA,
+          const size_t ncolB, const size_t ncolA, const DType alpha,
+          const Block *A, const Block *B, const DType beta, Block *C,
+          Context *ctx) {
+  LOG(FATAL) << "GEMM Not Implemented";
+}
+
+/// Divide alpha by each element of 'in'.
+// following the consistency guide.
+template <typename DType, typename Lang>
+void ComputeCrossEntropy(const size_t batchsize, const size_t dim,
+                         const Block *p, const Block *t, Block *loss,
+                         Context *ctx) {
+  LOG(FATAL) << "Not Implemented";
+}
+
+template <typename DType, typename Lang>
+void SoftmaxCrossEntropyBwd(const size_t batchsize, const size_t dim,
+                            const Block *p, const Block *t, Block *grad,
+                            Context *ctx) {
+  LOG(FATAL) << "Not Implemented";
+}
+
+template <typename DType, typename Lang>
+void RowMax(const size_t nrow, const size_t ncol, const Block *in,
+    Block *ret, Context* ctx) {
+  LOG(FATAL) << "Not Implemented";
+}
+// **************************************
+// Matrix functions
+// **************************************
+/*
+/// Add the vector v to every column of A as the column of out
+template <typename DType, typename Lang>
+void AddCol(const size_t nrow, const size_t ncol, const Block *A, const Block *v,
+            Block *out, Context *ctx) {
+  LOG(FATAL) << "AddCol Not Implemented";
+}
+// TODO(wangwei) unify AddRow and AddCol.
+/// Add the vector v to every row of A as the row of out
+template <typename DType, typename Lang>
+void AddRow(const size_t nrow, const size_t ncol, const Block *A, const Block *v,
+            Block *out, Context *ctx) {
+  LOG(FATAL) << "AddRow Not Implemented";
+}
+/// outer-product.
+/// in1 and in2 are vectors of len m and n. out is matrix of shape m * n
+template <typename DType, typename Lang>
+void Outer(const size_t m, const size_t n, const Block *in1, const Block *in2,
+           Block *out, Context *ctx) {
+  LOG(FATAL) << "Outer Not Implemented";
+}
+
+/// Sum the columns of the in matrix into a vector
+template <typename DType, typename Lang>
+void SumColumns(const size_t nrow, const size_t ncol, const Block *in, Block *out,
+                Context *ctx) {
+  LOG(FATAL) << "SumColumns Not Implemented";
+}
+template <typename DType, typename Lang>
+void Set(const size_t num, const DType x, Block *out, Context *ctx) {
+  LOG(FATAL) << "Not Implemented";
+}
+
+// TODO(wangwei) unify SumRow and SumCol.
+/// Sum the rows of the in matrix into a vector
+template <typename DType, typename Lang>
+void SumRows(const size_t nrow, const size_t ncol, const Block *in, Block *out,
+             Context *ctx) {
+  LOG(FATAL) << "SumRows Not Implemented";
+}
+*/
+}  // namespace singa
+#endif  // SINGA_CORE_MATH_H_
diff --git a/src/core/tensor/tensor_math_cpp.h b/src/core/tensor/tensor_math_cpp.h
new file mode 100644
index 0000000..8c8a40a
--- /dev/null
+++ b/src/core/tensor/tensor_math_cpp.h
@@ -0,0 +1,705 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef SINGA_CORE_TENSOR_TENSOR_MATH_CPP_H_
+#define SINGA_CORE_TENSOR_TENSOR_MATH_CPP_H_
+
+#include "./tensor_math.h"
+#include <cfloat>
+#include <math.h>
+#include <random>
+#include "singa/core/common.h"
+
+#ifdef USE_CBLAS
+#include <cblas.h>
+#endif
+
+namespace singa {
+
+template <>
+void Abs<float, lang::Cpp>(const size_t num, const Block *in, Block *out,
+                           Context *ctx) {
+  // Element-wise absolute value: out[i] = |in[i]|.
+  const float *src = static_cast<const float *>(in->data());
+  float *dst = static_cast<float *>(out->mutable_data());
+  for (size_t i = 0; i < num; ++i) dst[i] = fabsf(src[i]);
+}
+
+template <>
+void Add<float, lang::Cpp>(const size_t num, const Block *in, const float x,
+                           Block *out, Context *ctx) {
+  float *outPtr = static_cast<float *>(out->mutable_data());
+  const float *inPtr = static_cast<const float *>(in->data());
+  for (size_t i = 0; i < num; i++) {
+    outPtr[i] = inPtr[i] + x;
+  }
+}
+
+template <>
+void Add<float, lang::Cpp>(const size_t num, const Block *in1, const Block *in2,
+                           Block *out, Context *ctx) {
+  // CHECK_EQ(ctx->stream, nullptr);
+  float *outPtr = static_cast<float *>(out->mutable_data());
+  const float *in1Ptr = static_cast<const float *>(in1->data());
+  const float *in2Ptr = static_cast<const float *>(in2->data());
+  for (size_t i = 0; i < num; i++) {
+    outPtr[i] = in1Ptr[i] + in2Ptr[i];
+  }
+}
+
+template <>
+void Clamp<float, lang::Cpp>(const size_t num, const float low,
+                             const float high, const Block *in, Block *out,
+                             Context *ctx) {
+  // Clip every element into [low, high].  The comparison order mirrors the
+  // original if/else chain, so NaN inputs (both comparisons false) pass
+  // through unchanged.
+  const float *src = static_cast<const float *>(in->data());
+  float *dst = static_cast<float *>(out->mutable_data());
+  for (size_t i = 0; i < num; ++i) {
+    const float v = src[i];
+    dst[i] = (v > high) ? high : ((v < low) ? low : v);
+  }
+}
+
+template <>
+void Div<float, lang::Cpp>(const size_t num, const Block *in1, const Block *in2,
+                           Block *out, Context *ctx) {
+  float *outPtr = static_cast<float *>(out->mutable_data());
+  const float *in1Ptr = static_cast<const float *>(in1->data());
+  const float *in2Ptr = static_cast<const float *>(in2->data());
+  for (size_t i = 0; i < num; i++) {
+    CHECK_NE(in2Ptr[i], 0.f);
+    outPtr[i] = in1Ptr[i] / in2Ptr[i];
+  }
+}
+
+template <>
+void Div<float, lang::Cpp>(const size_t num, const float x, const Block *in,
+                           Block *out, Context *ctx) {
+  const float *inPtr = static_cast<const float *>(in->data());
+  float *outPtr = static_cast<float *>(out->mutable_data());
+  for (size_t i = 0; i < num; i++) {
+    CHECK_NE(inPtr[i], 0.f);
+    outPtr[i] = x / inPtr[i];
+  }
+}
+
+template <>
+void EltwiseMult<float, lang::Cpp>(const size_t num, const Block *in,
+                                   const float x, Block *out, Context *ctx) {
+  float *outPtr = static_cast<float *>(out->mutable_data());
+  const float *inPtr = static_cast<const float *>(in->data());
+  for (size_t i = 0; i < num; i++) {
+    outPtr[i] = inPtr[i] * x;
+  }
+}
+
+template <>
+void EltwiseMult<float, lang::Cpp>(const size_t num, const Block *in1,
+                                   const Block *in2, Block *out, Context *ctx) {
+  float *outPtr = static_cast<float *>(out->mutable_data());
+  const float *in1Ptr = static_cast<const float *>(in1->data());
+  const float *in2Ptr = static_cast<const float *>(in2->data());
+  for (size_t i = 0; i < num; i++) {
+    outPtr[i] = in1Ptr[i] * in2Ptr[i];
+  }
+}
+template <>
+void Exp<float, lang::Cpp>(const size_t num, const Block *in, Block *out,
+                           Context *ctx) {
+  float *outPtr = static_cast<float *>(out->mutable_data());
+  const float *inPtr = static_cast<const float *>(in->data());
+  for (size_t i = 0; i < num; i++) {
+    outPtr[i] = exp(inPtr[i]);
+  }
+}
+
+template <>
+void GE<float, lang::Cpp>(const size_t num, const Block *in, const float x,
+                          Block *out, Context *ctx) {
+  float *outPtr = static_cast<float *>(out->mutable_data());
+  const float *inPtr = static_cast<const float *>(in->data());
+  for (size_t i = 0; i < num; i++) {
+    outPtr[i] = (inPtr[i] >= x) ? 1.f : 0.f;
+  }
+}
+
+template <>
+void GE<float, lang::Cpp>(const size_t num, const Block *in1, const Block *in2,
+                          Block *out, Context *ctx) {
+  float *outPtr = static_cast<float *>(out->mutable_data());
+  const float *inPtr1 = static_cast<const float *>(in1->data());
+  const float *inPtr2 = static_cast<const float *>(in2->data());
+  for (size_t i = 0; i < num; i++) {
+    outPtr[i] = (inPtr1[i] >= inPtr2[i]) ? 1.f : 0.f;
+  }
+}
+template <>
+void GT<float, lang::Cpp>(const size_t num, const Block *in, const float x,
+                          Block *out, Context *ctx) {
+  float *outPtr = static_cast<float *>(out->mutable_data());
+  const float *inPtr = static_cast<const float *>(in->data());
+  for (size_t i = 0; i < num; i++) {
+    outPtr[i] = (inPtr[i] > x) ? 1.f : 0.f;
+  }
+}
+template <>
+void GT<float, lang::Cpp>(const size_t num, const Block *in1, const Block *in2,
+                          Block *out, Context *ctx) {
+  float *outPtr = static_cast<float *>(out->mutable_data());
+  const float *inPtr1 = static_cast<const float *>(in1->data());
+  const float *inPtr2 = static_cast<const float *>(in2->data());
+  for (size_t i = 0; i < num; i++) {
+    outPtr[i] = (inPtr1[i] > inPtr2[i]) ? 1.f : 0.f;
+  }
+}
+
+template <>
+void LE<float, lang::Cpp>(const size_t num, const Block *in, const float x,
+                          Block *out, Context *ctx) {
+  float *outPtr = static_cast<float *>(out->mutable_data());
+  const float *inPtr = static_cast<const float *>(in->data());
+  for (size_t i = 0; i < num; i++) {
+    outPtr[i] = (inPtr[i] <= x) ? 1.f : 0.f;
+  }
+}
+template <>
+void LE<float, lang::Cpp>(const size_t num, const Block *in1, const Block *in2,
+                          Block *out, Context *ctx) {
+  float *outPtr = static_cast<float *>(out->mutable_data());
+  const float *inPtr1 = static_cast<const float *>(in1->data());
+  const float *inPtr2 = static_cast<const float *>(in2->data());
+  for (size_t i = 0; i < num; i++) {
+    outPtr[i] = (inPtr1[i] <= inPtr2[i]) ? 1.f : 0.f;
+  }
+}
+template <>
+void Log<float, lang::Cpp>(const size_t num, const Block *in, Block *out,
+                           Context *ctx) {
+  float *outPtr = static_cast<float *>(out->mutable_data());
+  const float *inPtr = static_cast<const float *>(in->data());
+  for (size_t i = 0; i < num; i++) {
+    CHECK_GT(inPtr[i], 0.f);
+    outPtr[i] = log(inPtr[i]);
+  }
+}
+template <>
+void LT<float, lang::Cpp>(const size_t num, const Block *in, const float x,
+                          Block *out, Context *ctx) {
+  float *outPtr = static_cast<float *>(out->mutable_data());
+  const float *inPtr = static_cast<const float *>(in->data());
+  for (size_t i = 0; i < num; i++) {
+    outPtr[i] = (inPtr[i] < x) ? 1.f : 0.f;
+  }
+}
+template <>
+void LT<float, lang::Cpp>(const size_t num, const Block *in1, const Block *in2,
+                          Block *out, Context *ctx) {
+  float *outPtr = static_cast<float *>(out->mutable_data());
+  const float *inPtr1 = static_cast<const float *>(in1->data());
+  const float *inPtr2 = static_cast<const float *>(in2->data());
+  for (size_t i = 0; i < num; i++) {
+    outPtr[i] = (inPtr1[i] < inPtr2[i]) ? 1.f : 0.f;
+  }
+}
+
+template <>
+void Pow<float, lang::Cpp>(const size_t num, const Block *in, const float x,
+                           Block *out, Context *ctx) {
+  float *outPtr = static_cast<float *>(out->mutable_data());
+  const float *inPtr = static_cast<const float *>(in->data());
+  for (size_t i = 0; i < num; i++) {
+    outPtr[i] = pow(inPtr[i], x);
+  }
+}
+
+template <>
+void Pow<float, lang::Cpp>(const size_t num, const Block *in1, const Block *in2,
+                           Block *out, Context *ctx) {
+  float *outPtr = static_cast<float *>(out->mutable_data());
+  const float *in1Ptr = static_cast<const float *>(in1->data());
+  const float *in2Ptr = static_cast<const float *>(in2->data());
+  for (size_t i = 0; i < num; i++) {
+    outPtr[i] = pow(in1Ptr[i], in2Ptr[i]);
+  }
+}
+template <>
+void ReLU<float, lang::Cpp>(const size_t num, const Block *in, Block *out,
+                            Context *ctx) {
+  float *outPtr = static_cast<float *>(out->mutable_data());
+  const float *inPtr = static_cast<const float *>(in->data());
+  for (size_t i = 0; i < num; i++) {
+    outPtr[i] = (inPtr[i] >= 0.f) ? inPtr[i] : 0.f;
+  }
+}
+template <>
+void Set<float, lang::Cpp>(const size_t num, const float x, Block *out,
+                           Context *ctx) {
+  float *outPtr = static_cast<float *>(out->mutable_data());
+  for (size_t i = 0; i < num; i++) outPtr[i] = x;
+}
+template <>
+void Sigmoid<float, lang::Cpp>(const size_t num, const Block *in, Block *out,
+                               Context *ctx) {
+  // Logistic sigmoid: out[i] = 1 / (1 + e^{-in[i]}).
+  const float *src = static_cast<const float *>(in->data());
+  float *dst = static_cast<float *>(out->mutable_data());
+  for (size_t i = 0; i < num; ++i) {
+    // Keep the intermediate in double (exp returns double) so rounding
+    // matches the single-expression form exactly.
+    const double e = exp(-src[i]);
+    dst[i] = 1.f / (1.f + e);
+  }
+}
+
+template <>
+void Sign<float, lang::Cpp>(const size_t num, const Block *in, Block *out,
+                            Context *ctx) {
+  // out[i] = sign(in[i]): -1 for negative, 0 for zero, +1 for positive.
+  // The previous implementation returned 0 for negative inputs, which
+  // contradicts the documented contract out[i] = sign(in[i]) in
+  // tensor_math.h and silently breaks uses such as L1-gradient computation.
+  float *outPtr = static_cast<float *>(out->mutable_data());
+  const float *inPtr = static_cast<const float *>(in->data());
+  for (size_t i = 0; i < num; i++) {
+    outPtr[i] = (inPtr[i] > 0.f) ? 1.0f : ((inPtr[i] < 0.f) ? -1.0f : 0.0f);
+  }
+}
+
+template <>
+void Sqrt<float, lang::Cpp>(const size_t num, const Block *in, Block *out,
+                            Context *ctx) {
+  float *outPtr = static_cast<float *>(out->mutable_data());
+  const float *inPtr = static_cast<const float *>(in->data());
+  for (size_t i = 0; i < num; i++) {
+    CHECK_GE(inPtr[i], 0.f);
+    outPtr[i] = sqrt(inPtr[i]);
+  }
+}
+/*
+template <>
+void Square<float, lang::Cpp>(const size_t num, const Block *in, Block *out,
+                              Context *ctx) {
+  float *outPtr = static_cast<float *>(out->mutable_data());
+  const float *inPtr = static_cast<const float *>(in->data());
+  for (size_t i = 0; i < num; i++) {
+    outPtr[i] = inPtr[i] * inPtr[i];
+  }
+}
+*/
+
+template <>
+void Sub<float, lang::Cpp>(const size_t num, const Block *in1, const Block *in2,
+                           Block *out, Context *ctx) {
+  // CHECK_EQ(ctx->stream, nullptr);
+  float *outPtr = static_cast<float *>(out->mutable_data());
+  const float *in1Ptr = static_cast<const float *>(in1->data());
+  const float *in2Ptr = static_cast<const float *>(in2->data());
+  for (size_t i = 0; i < num; i++) {
+    outPtr[i] = in1Ptr[i] - in2Ptr[i];
+  }
+}
+
+// sum all elements of input into out
+// TODO(wangwei) optimize using omp
+template <>
+void Sum<float, lang::Cpp>(const size_t num, const Block *in, float *out,
+                           Context *ctx) {
+  float s = 0.f;
+  const float *inPtr = static_cast<const float *>(in->data());
+  for (size_t i = 0; i < num; i++) {
+    s += inPtr[i];
+  }
+  *out = s;
+}
+
+template <>
+void Tanh<float, lang::Cpp>(const size_t num, const Block *in, Block *out,
+                            Context *ctx) {
+  float *outPtr = static_cast<float *>(out->mutable_data());
+  const float *inPtr = static_cast<const float *>(in->data());
+  for (size_t i = 0; i < num; i++) {
+    outPtr[i] = tanh(inPtr[i]);
+  }
+}
+
+// ===============Random operations==========================================
+template <>
+void Bernoulli<float, lang::Cpp>(const size_t num, const float p, Block *out,
+                                 Context *ctx) {
+  std::bernoulli_distribution distribution(p);
+  float *outPtr = static_cast<float *>(out->mutable_data());
+  for (size_t i = 0; i < num; i++) {
+    outPtr[i] = distribution(ctx->random_generator) ? 1.0f : 0.0f;
+  }
+}
+
+template <>
+void Gaussian<float, lang::Cpp>(const size_t num, const float mean,
+                                const float std, Block *out, Context *ctx) {
+  std::normal_distribution<float> distribution(mean, std);
+  float *outPtr = static_cast<float *>(out->mutable_data());
+  for (size_t i = 0; i < num; i++) {
+    outPtr[i] = static_cast<float>(distribution(ctx->random_generator));
+  }
+}
+template <>
+void Uniform<float, lang::Cpp>(const size_t num, const float low,
+                               const float high, Block *out, Context *ctx) {
+  std::uniform_real_distribution<float> distribution(low, high);
+  float *outPtr = static_cast<float *>(out->mutable_data());
+  for (size_t i = 0; i < num; i++) {
+    outPtr[i] = static_cast<float>(distribution(ctx->random_generator));
+  }
+}
+
+// ====================Blas operations======================================
+
+template <>
+void DGMM<float, lang::Cpp>(const bool side_right, const size_t nrow,
+                            const size_t ncol, const Block *M, const Block *v,
+                            Block *out, Context *ctx) {
+  // Multiply M (nrow x ncol, row-major) by a diagonal matrix built from 'v':
+  // side_right scales column c by v[c] (M * diag(v)); otherwise scales row r
+  // by v[r] (diag(v) * M).
+  const float *MPtr = static_cast<const float *>(M->data());
+  const float *vPtr = static_cast<const float *>(v->data());
+  float *outPtr = static_cast<float *>(out->mutable_data());
+  for (size_t r = 0; r < nrow; r++) {
+    const size_t offset = r * ncol;
+    for (size_t c = 0; c < ncol; c++) {
+      const float scale = side_right ? vPtr[c] : vPtr[r];
+      outPtr[offset + c] = MPtr[offset + c] * scale;
+    }
+  }
+}
+
+#ifdef USE_CBLAS
+// ---- BLAS-backed float specializations (active when USE_CBLAS is defined).
+// Each wrapper casts the raw Block storage to float* and forwards to CBLAS.
+template <>
+void Amax<float, lang::Cpp>(const size_t num, const Block *in, size_t *out,
+                            Context *ctx) {
+  const float *inPtr = static_cast<const float *>(in->data());
+  // cblas_isamax returns a 0-based index of the element with max |value|.
+  *out = cblas_isamax(num, inPtr, 1);
+}
+
+// *out = sum_i |in[i]|
+template <>
+void Asum<float, lang::Cpp>(const size_t num, const Block *in, float *out,
+                            Context *ctx) {
+  const float *inPtr = static_cast<const float *>(in->data());
+  *out = cblas_sasum(num, inPtr, 1);
+}
+
+// out[i] += alpha * in[i]
+template <>
+void Axpy<float, lang::Cpp>(const size_t num, const float alpha,
+                            const Block *in, Block *out, Context *ctx) {
+  const float *inPtr = static_cast<const float *>(in->data());
+  float *outPtr = static_cast<float *>(out->mutable_data());
+  cblas_saxpy(num, alpha, inPtr, 1, outPtr, 1);
+}
+
+// *out = sum_i in1[i] * in2[i]
+template <>
+void Dot<float, lang::Cpp>(const size_t num, const Block *in1, const Block *in2,
+                           float *out, Context *ctx) {
+  const float *in1Ptr = static_cast<const float *>(in1->data());
+  const float *in2Ptr = static_cast<const float *>(in2->data());
+  *out = cblas_sdot(num, in1Ptr, 1, in2Ptr, 1);
+}
+// In-place scaling: out[i] *= x.
+template <>
+void Scale<float, lang::Cpp>(const size_t num, const float x, Block *out,
+                             Context *ctx) {
+  float *outPtr = static_cast<float *>(out->mutable_data());
+  cblas_sscal(num, x, outPtr, 1);
+}
+// *out = sqrt(sum_i in[i]^2), the Euclidean norm.
+template <>
+void Nrm2<float, lang::Cpp>(const size_t num, const Block *in, float *out,
+                            Context *ctx) {
+  const float *inPtr = static_cast<const float *>(in->data());
+  *out = cblas_snrm2(num, inPtr, 1);
+}
+
+// out = alpha * op(A) * v + beta * out, with op(A) = A^T when trans.
+// NOTE(review): for the trans branch (n x m matrix, lda = m) the dimension
+// swap assumes A was stored as n rows by m columns — confirm against callers.
+template <>
+void GEMV<float, lang::Cpp>(bool trans, const size_t m, const size_t n,
+                            const float alpha, const Block *A, const Block *v,
+                            const float beta, Block *out, Context *ctx) {
+  const float *APtr = static_cast<const float *>(A->data());
+  const float *vPtr = static_cast<const float *>(v->data());
+  float *outPtr = static_cast<float *>(out->mutable_data());
+  if (!trans) {
+    cblas_sgemv(CblasRowMajor, CblasNoTrans, m, n, alpha, APtr, n, vPtr, 1,
+                beta, outPtr, 1);
+  } else {
+    cblas_sgemv(CblasRowMajor, CblasTrans, n, m, alpha, APtr, m, vPtr, 1, beta,
+                outPtr, 1);
+  }
+}
+
+// C = alpha * op(A) * op(B) + beta * C for row-major matrices.
+// Leading dimensions are the pre-transpose column counts of A and B.
+template <>
+void GEMM<float, lang::Cpp>(const bool transA, const bool transB,
+                            const size_t nrowA, const size_t ncolB,
+                            const size_t ncolA, const float alpha,
+                            const Block *A, const Block *B, const float beta,
+                            Block *C, Context *ctx) {
+  auto transa = transA ? CblasTrans : CblasNoTrans;
+  auto transb = transB ? CblasTrans : CblasNoTrans;
+  auto lda = transA ? nrowA : ncolA;
+  auto ldb = transB ? ncolA : ncolB;
+  auto ldc = ncolB;
+  const float *APtr = static_cast<const float *>(A->data());
+  const float *BPtr = static_cast<const float *>(B->data());
+  float *CPtr = static_cast<float *>(C->mutable_data());
+  cblas_sgemm(CblasRowMajor, transa, transb, nrowA, ncolB, ncolA, alpha, APtr,
+              lda, BPtr, ldb, beta, CPtr, ldc);
+}
+
+#else
+
+/// Non-BLAS fallback: write the index of the largest element into *out.
+/// Ties keep the earliest index; an empty input yields index 0.
+template <>
+void Amax<float, lang::Cpp>(const size_t num, const Block *in, size_t *out,
+                            Context *ctx) {
+  const float *inPtr = static_cast<const float *>(in->data());
+  size_t best = 0;
+  for (size_t i = 1; i < num; i++) {
+    if (inPtr[i] > inPtr[best]) {
+      best = i;
+    }
+  }
+  *out = best;
+}
+/// Non-BLAS fallback: write the index of the smallest element into *out.
+/// Ties keep the earliest index; an empty input yields index 0.
+template <>
+void Amin<float, lang::Cpp>(const size_t num, const Block *in, size_t *out,
+                            Context *ctx) {
+  size_t minPos = 0;
+  float minVal = 0;
+  const float *inPtr = static_cast<const float *>(in->data());
+  for (size_t i = 0; i < num; i++) {
+    if (i == 0) {
+      minVal = inPtr[i];
+    } else if (inPtr[i] < minVal) {  // fixed: was '>', which tracked the max
+      minVal = inPtr[i];
+      minPos = i;
+    }
+  }
+  *out = minPos;
+}
+
+/// Non-BLAS fallback: *out = sum_i |in[i]|.
+template <>
+void Asum<float, lang::Cpp>(const size_t num, const Block *in, float *out,
+                            Context *ctx) {
+  float sum = 0;
+  const float *inPtr = static_cast<const float *>(in->data());
+  for (size_t i = 0; i < num; i++) {
+    sum += fabs(inPtr[i]);
+  }
+  *out = sum;  // fixed: the accumulated result was never written back
+}
+
+/// Non-BLAS fallback: out[i] += alpha * in[i] for every i in [0, num).
+template <>
+void Axpy<float, lang::Cpp>(const size_t num, const float alpha,
+                            const Block *in, Block *out, Context *ctx) {
+  const float *src = static_cast<const float *>(in->data());
+  float *dst = static_cast<float *>(out->mutable_data());
+  for (size_t idx = 0; idx < num; ++idx) {
+    dst[idx] += alpha * src[idx];
+  }
+}
+
+/// Non-BLAS fallback: scale the output block in place, out[i] *= x.
+template <>
+void Scale<float, lang::Cpp>(const size_t num, const float x, Block *out,
+                             Context *ctx) {
+  float *data = static_cast<float *>(out->mutable_data());
+  for (size_t idx = 0; idx < num; ++idx) data[idx] *= x;
+}
+
+/// Non-BLAS fallback: *out = sum_i in1[i] * in2[i] (inner product).
+template <>
+void Dot<float, lang::Cpp>(const size_t num, const Block *in1, const Block *in2,
+                           float *out, Context *ctx) {
+  float sum = 0;
+  const float *in1Ptr = static_cast<const float *>(in1->data());
+  const float *in2Ptr = static_cast<const float *>(in2->data());
+  for (size_t i = 0; i < num; i++) {
+    sum += in1Ptr[i] * in2Ptr[i];
+  }
+  *out = sum;  // fixed: the accumulated result was never written back
+}
+
+/// Non-BLAS fallback GEMV: out = alpha * op(A) * v + beta * out.
+/// out has m entries; each is a dot product over n elements of v.
+template <>
+void GEMV<float, lang::Cpp>(bool trans, const size_t m, const size_t n,
+                            const float alpha, const Block *A, const Block *v,
+                            const float beta, Block *out, Context *ctx) {
+  float *outPtr = static_cast<float *>(out->mutable_data());
+  const float *APtr = static_cast<const float *>(A->data());
+  const float *vPtr = static_cast<const float *>(v->data());
+  for (size_t r = 0; r < m; r++) {
+    float sum = 0;
+    for (size_t c = 0; c < n; c++) {
+      // trans: read A as an n x m row-major matrix and use element (c, r),
+      // i.e. A^T(r, c); otherwise A is m x n row-major and we use A(r, c).
+      size_t idx = trans ? c * m + r : r * n + c;
+      sum += APtr[idx] * vPtr[c];
+    }
+    // beta scales the existing output — assumes out holds valid data when
+    // beta != 0 (same contract as BLAS gemv).
+    outPtr[r] = alpha * sum + beta * outPtr[r];
+  }
+}
+
+#endif  // USE_CBLAS
+/// Per-sample cross-entropy loss: loss[i] = -log(p[i][t[i]]), where p is a
+/// batchsize x dim row-major probability matrix and t holds the integer
+/// label of each sample. Probabilities are clamped to FLT_MIN so log never
+/// sees zero.
+template <>
+void ComputeCrossEntropy<float, lang::Cpp>(const size_t batchsize,
+                                           const size_t dim, const Block *p,
+                                           const Block *t, Block *loss,
+                                           Context *ctx) {
+  const float *pPtr = static_cast<const float *>(p->data());
+  // Labels are stored as int in the target block.
+  const int *tPtr = static_cast<const int *>(t->data());
+  float *lossPtr = static_cast<float *>(loss->mutable_data());
+  for (size_t i = 0; i < batchsize; i++) {
+    int truth_idx = tPtr[i];
+    // Only the lower bound is checked; truth_idx < dim is assumed —
+    // NOTE(review): an out-of-range label reads past the row. Confirm callers
+    // validate labels.
+    CHECK_GE(truth_idx, 0);
+    float prob_of_truth = pPtr[i * dim + truth_idx];
+    lossPtr[i] = -std::log(std::max(prob_of_truth, FLT_MIN));
+  }
+}
+
+/// Backward pass for softmax + cross-entropy: given that grad already holds
+/// the softmax probabilities (enforced by requiring p == grad), subtract 1
+/// from the probability of the true class so grad becomes (softmax - onehot).
+template <>
+void SoftmaxCrossEntropyBwd<float, lang::Cpp>(const size_t batchsize,
+                                              const size_t dim, const Block *p,
+                                              const Block *t, Block *grad,
+                                              Context *ctx) {
+  CHECK_EQ(p, grad) << "Use the same pointer to optimize performance";
+  // const float* pPtr = static_cast<const float*>(p->data());
+  const int *tPtr = static_cast<const int *>(t->data());
+  float *gradPtr = static_cast<float *>(grad->mutable_data());
+
+  for (size_t i = 0; i < batchsize; i++) {
+    int truth_idx = static_cast<int>(tPtr[i]);
+    // Only the lower bound is validated; truth_idx < dim is assumed.
+    CHECK_GE(truth_idx, 0);
+    gradPtr[i * dim + truth_idx] -= 1.0;
+  }
+}
+
+/// For each row of the nrow x ncol row-major matrix 'in', write the row's
+/// maximum value to out[r]. Assumes ncol >= 1 (inPtr[offset] is read
+/// unconditionally).
+template <>
+void RowMax<float, lang::Cpp>(const size_t nrow, const size_t ncol,
+                              const Block *in, Block *out, Context *ctx) {
+  const float *inPtr = static_cast<const float *>(in->data());
+  float *outPtr = static_cast<float *>(out->mutable_data());
+  for (size_t r = 0; r < nrow; r++) {
+    // size_t (was int): r * ncol can exceed INT_MAX for large matrices, and
+    // every other kernel in this file uses size_t offsets.
+    size_t offset = r * ncol;
+    float maxval = inPtr[offset];
+    for (size_t c = 1; c < ncol; c++)
+      maxval = std::max(maxval, inPtr[offset + c]);
+    outPtr[r] = maxval;
+  }
+}
+
+// =========Matrix operations ================================================
+/*
+template <>
+void AddCol<float, lang::Cpp>(const size_t nrow, const size_t ncol,
+                              const Block *A, const Block *v, Block *out,
+                              Context *ctx) {
+  float *outPtr = static_cast<float *>(out->mutable_data());
+  const float *APtr = static_cast<const float *>(A->data());
+  const float *vPtr = static_cast<const float *>(v->data());
+  for (size_t r = 0; r < nrow; r++) {
+    size_t offset = r * ncol;
+    for (size_t c = 0; c < ncol; c++) {
+      outPtr[offset + c] = APtr[offset + c] + vPtr[r];
+    }
+  }
+}
+
+template <>
+void AddRow<float, lang::Cpp>(const size_t nrow, const size_t ncol,
+                              const Block *A, const Block *v, Block *out,
+                              Context *ctx) {
+  float *outPtr = static_cast<float *>(out->mutable_data());
+  const float *APtr = static_cast<const float *>(A->data());
+  const float *vPtr = static_cast<const float *>(v->data());
+  for (size_t r = 0; r < nrow; r++) {
+    size_t offset = r * ncol;
+    for (size_t c = 0; c < ncol; c++) {
+      outPtr[offset + c] = APtr[offset + c] + vPtr[c];
+    }
+  }
+}
+template <>
+void Outer<float, lang::Cpp>(const size_t m, const size_t n, const Block *in1,
+                             const Block *in2, Block *out, Context *ctx) {
+  float *outPtr = static_cast<float *>(out->mutable_data());
+  const float *in1Ptr = static_cast<const float *>(in1->data());
+  const float *in2Ptr = static_cast<const float *>(in2->data());
+  for (size_t r = 0; r < m; r++) {
+    size_t offset = r * n;
+    for (size_t c = 0; c < n; c++) {
+      outPtr[offset + c] = in1Ptr[r] * in2Ptr[c];
+    }
+  }
+}
+template <>
+void Softmax<float, lang::Cpp>(const size_t nrow, const size_t ncol,
+                               const Block *in, Block *out, Context *ctx) {
+  float *outPtr = static_cast<float *>(out->mutable_data());
+  const float *inPtr = static_cast<const float *>(in->data());
+  float *bPtr = new float[ncol];
+  for (size_t r = 0; r < nrow; r++) {
+    size_t offset = r * ncol;
+    float denom = 0.f;
+    for (size_t c = 0; c < ncol; c++) {
+      bPtr[c] = exp(inPtr[offset + c]);
+      denom += bPtr[c];
+    }
+    for (size_t c = 0; c < ncol; c++) {
+      size_t idx = offset + c;
+      outPtr[idx] = bPtr[c] / denom;
+    }
+  }
+  delete[] bPtr;  // array form required to match new float[ncol]
+}
+
+template <>
+void SumColumns<float, lang::Cpp>(const size_t nrow, const size_t ncol,
+                                  const Block *in, Block *out, Context *ctx) {
+  float *outPtr = static_cast<float *>(out->mutable_data());
+  const float *inPtr = static_cast<const float *>(in->data());
+  for (size_t c = 0; c < ncol; c++) {
+    outPtr[c] = 0.f;
+  }
+  for (size_t r = 0; r < nrow; r++) {
+    size_t offset = r * ncol;
+    for (size_t c = 0; c < ncol; c++) {
+      outPtr[c] += inPtr[offset + c];
+    }
+  }
+}
+
+template <>
+void SumRows<float, lang::Cpp>(const size_t nrow, const size_t ncol,
+                               const Block *in, Block *out, Context *ctx) {
+  float *outPtr = static_cast<float *>(out->mutable_data());
+  const float *inPtr = static_cast<const float *>(in->data());
+  for (size_t r = 0; r < nrow; r++) {
+    size_t offset = r * ncol;
+    outPtr[r] = 0.f;
+    for (size_t c = 0; c < ncol; c++) {
+      outPtr[r] += inPtr[offset + c];
+    }
+  }
+}
+*/
+}  // namespace singa
+
+#endif  // SINGA_CORE_TENSOR_TENSOR_MATH_CPP_H_
diff --git a/src/core/tensor/tensor_math_cuda.h b/src/core/tensor/tensor_math_cuda.h
new file mode 100644
index 0000000..4daa97a
--- /dev/null
+++ b/src/core/tensor/tensor_math_cuda.h
@@ -0,0 +1,468 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef  SINGA_CORE_TENSOR_TENSOR_MATH_CUDA_H_
+#define  SINGA_CORE_TENSOR_TENSOR_MATH_CUDA_H_
+#include "singa/singa_config.h"
+#ifdef USE_CUDA
+#include "./tensor_math.h"
+#include "./math_kernel.h"
+#include "singa/utils/cuda_utils.h"
+#include "singa/core/common.h"
+#include <cuda_runtime.h>
+#include <cublas_v2.h>
+#include "singa/utils/cuda_utils.h"
+
+namespace singa {
+
+/// out[i] = |in[i]|
+template <>
+void Abs<float, lang::Cuda>(const size_t num, const Block* in, Block* out,
+                            Context* ctx) {
+  const float* inPtr = static_cast<const float*>(in->data());
+  float* outPtr = static_cast<float*>(out->mutable_data());
+  cuda::abs(num, inPtr, outPtr, ctx->stream);
+}
+/// out = in + x
+template <>
+void Add<float, lang::Cuda>(const size_t num, const Block* in, const float x,
+                            Block* out, Context* ctx) {
+  const float* inPtr = static_cast<const float*>(in->data());
+  float* outPtr = static_cast<float*>(out->mutable_data());
+  cuda::add(num, inPtr, x, outPtr, ctx->stream);
+}
+/// out = in1 + in2
+template <>
+void Add<float, lang::Cuda>(const size_t num, const Block* in1,
+                            const Block* in2, Block* out, Context* ctx) {
+  const float* inPtr1 = static_cast<const float*>(in1->data());
+  const float* inPtr2 = static_cast<const float*>(in2->data());
+  float* outPtr = static_cast<float*>(out->mutable_data());
+  cuda::add(num, inPtr1, inPtr2, outPtr, ctx->stream);
+}
+/// Element-wise operation, clamp every element into [low, high]
+/// if x>high, then x=high; if x<low, then x=low.
+template <>
+void Clamp<float, lang::Cuda>(const size_t num, const float low,
+                              const float high, const Block* in, Block* out,
+                              Context* ctx) {
+  const float* inPtr = static_cast<const float*>(in->data());
+  float* outPtr = static_cast<float*>(out->mutable_data());
+  cuda::clamp(num, low, high, inPtr, outPtr, ctx->stream);
+}
+/// out = in1 / in2
+template <>
+void Div<float, lang::Cuda>(const size_t num, const Block* in1,
+                            const Block* in2, Block* out, Context* ctx) {
+  const float* inPtr1 = static_cast<const float*>(in1->data());
+  const float* inPtr2 = static_cast<const float*>(in2->data());
+  float* outPtr = static_cast<float*>(out->mutable_data());
+  cuda::div(num, inPtr1, inPtr2, outPtr, ctx->stream);
+}
+
+template <>
+void Div<float, lang::Cuda>(const size_t num, const float x, const Block* in,
+                            Block* out, Context* ctx) {
+  const float* inPtr = static_cast<const float*>(in->data());
+  float* outPtr = static_cast<float*>(out->mutable_data());
+  cuda::div(num, x, inPtr, outPtr, ctx->stream);
+}
+
+/// out = in * x
+template <>
+void EltwiseMult<float, lang::Cuda>(const size_t num, const Block* in,
+                                    const float x, Block* out, Context* ctx) {
+  const float* inPtr = static_cast<const float*>(in->data());
+  float* outPtr = static_cast<float*>(out->mutable_data());
+  cuda::mult(num, inPtr, x, outPtr, ctx->stream);
+}
+/// out = in1 * in2
+template <>
+void EltwiseMult<float, lang::Cuda>(const size_t num, const Block* in1,
+                                    const Block* in2, Block* out,
+                                    Context* ctx) {
+  const float* inPtr1 = static_cast<const float*>(in1->data());
+  const float* inPtr2 = static_cast<const float*>(in2->data());
+  float* outPtr = static_cast<float*>(out->mutable_data());
+  cuda::mult(num, inPtr1, inPtr2, outPtr, ctx->stream);
+}
+/// Base is e. out[i]=e^in[i]
+template <>
+void Exp<float, lang::Cuda>(const size_t num, const Block* in, Block* out,
+                            Context* ctx) {
+  const float* inPtr = static_cast<const float*>(in->data());
+  float* outPtr = static_cast<float*>(out->mutable_data());
+  cuda::exp(num, inPtr, outPtr, ctx->stream);
+}
+
+template <>
+void GE<float, lang::Cuda>(const size_t num, const Block* in, const float x,
+                           Block* out, Context* ctx) {
+  float* outPtr = static_cast<float*>(out->mutable_data());
+  const float* inPtr = static_cast<const float*>(in->data());
+  cuda::ge(num, inPtr, x, outPtr, ctx->stream);
+}
+template <>
+void GE<float, lang::Cuda>(const size_t num, const Block* in1, const Block* in2,
+                           Block* out, Context* ctx) {
+  float* outPtr = static_cast<float*>(out->mutable_data());
+  const float* inPtr1 = static_cast<const float*>(in1->data());
+  const float* inPtr2 = static_cast<const float*>(in2->data());
+  cuda::ge(num, inPtr1, inPtr2, outPtr, ctx->stream);
+}
+
+
+template <>
+void GT<float, lang::Cuda>(const size_t num, const Block* in, const float x,
+                           Block* out, Context* ctx) {
+  float* outPtr = static_cast<float*>(out->mutable_data());
+  const float* inPtr = static_cast<const float*>(in->data());
+  cuda::gt(num, inPtr, x, outPtr, ctx->stream);
+}
+template <>
+void GT<float, lang::Cuda>(const size_t num, const Block* in1, const Block* in2,
+                           Block* out, Context* ctx) {
+  float* outPtr = static_cast<float*>(out->mutable_data());
+  const float* inPtr1 = static_cast<const float*>(in1->data());
+  const float* inPtr2 = static_cast<const float*>(in2->data());
+  cuda::gt(num, inPtr1, inPtr2, outPtr, ctx->stream);
+}
+template <>
+void LE<float, lang::Cuda>(const size_t num, const Block* in, const float x,
+                           Block* out, Context* ctx) {
+  float* outPtr = static_cast<float*>(out->mutable_data());
+  const float* inPtr = static_cast<const float*>(in->data());
+  cuda::le(num, inPtr, x, outPtr, ctx->stream);
+}
+template <>
+void LE<float, lang::Cuda>(const size_t num, const Block* in1, const Block* in2,
+                           Block* out, Context* ctx) {
+  float* outPtr = static_cast<float*>(out->mutable_data());
+  const float* inPtr1 = static_cast<const float*>(in1->data());
+  const float* inPtr2 = static_cast<const float*>(in2->data());
+  cuda::le(num, inPtr1, inPtr2, outPtr, ctx->stream);
+}
+
+/// Natual logarithm, the base is e, Neper number out[i]=ln(in[i]).
+template <>
+void Log<float, lang::Cuda>(const size_t num, const Block* in, Block* out,
+                            Context* ctx) {
+  const float* inPtr = static_cast<const float*>(in->data());
+  float* outPtr = static_cast<float*>(out->mutable_data());
+  cuda::log(num, inPtr, outPtr, ctx->stream);
+}
+template <>
+void LT<float, lang::Cuda>(const size_t num, const Block* in, const float x,
+                           Block* out, Context* ctx) {
+  float* outPtr = static_cast<float*>(out->mutable_data());
+  const float* inPtr = static_cast<const float*>(in->data());
+  cuda::lt(num, inPtr, x, outPtr, ctx->stream);
+}
+template <>
+void LT<float, lang::Cuda>(const size_t num, const Block* in1, const Block* in2,
+                           Block* out, Context* ctx) {
+  float* outPtr = static_cast<float*>(out->mutable_data());
+  const float* inPtr1 = static_cast<const float*>(in1->data());
+  const float* inPtr2 = static_cast<const float*>(in2->data());
+  cuda::lt(num, inPtr1, inPtr2, outPtr, ctx->stream);
+}
+/// Element-wise operation, out[i] = in[i]^x
+template <>
+void Pow<float, lang::Cuda>(const size_t num, const Block* in, const float x,
+                            Block* out, Context* ctx) {
+  const float* inPtr = static_cast<const float*>(in->data());
+  float* outPtr = static_cast<float*>(out->mutable_data());
+  cuda::pow(num, inPtr, x, outPtr, ctx->stream);
+}
+/// Element-wise operation, out[i] = in1[i]^in2[i]
+template <>
+void Pow<float, lang::Cuda>(const size_t num, const Block* in1,
+                            const Block* in2, Block* out, Context* ctx) {
+  const float* inPtr1 = static_cast<const float*>(in1->data());
+  const float* inPtr2 = static_cast<const float*>(in2->data());
+  float* outPtr = static_cast<float*>(out->mutable_data());
+  cuda::pow(num, inPtr1, inPtr2, outPtr, ctx->stream);
+}
+
+/// Element-wise operation, out[i]=max(0, in[i])
+template <>
+void ReLU<float, lang::Cuda>(const size_t num, const Block* in, Block* out,
+                             Context* ctx) {
+  const float* inPtr = static_cast<const float*>(in->data());
+  float* outPtr = static_cast<float*>(out->mutable_data());
+  cuda::relu(num, inPtr, outPtr, ctx->stream);
+}
+
+/// out[i] = x
+template <>
+void Set<float, lang::Cuda>(const size_t num, const float x, Block* out,
+                            Context* ctx) {
+  float* outPtr = static_cast<float*>(out->mutable_data());
+  cuda::set(num, x, outPtr, ctx->stream);
+}
+/// Element-wise operation, out[i]=sigmoid([in[i])
+template <>
+void Sigmoid<float, lang::Cuda>(const size_t num, const Block* in, Block* out,
+                                Context* ctx) {
+  const float* inPtr = static_cast<const float*>(in->data());
+  float* outPtr = static_cast<float*>(out->mutable_data());
+  cuda::sigmoid(num, inPtr, outPtr, ctx->stream);
+}
+// out[i] = sign(in[i])
+template <>
+void Sign<float, lang::Cuda>(const size_t num, const Block* in, Block* out,
+                             Context* ctx) {
+  const float* inPtr = static_cast<const float*>(in->data());
+  float* outPtr = static_cast<float*>(out->mutable_data());
+  cuda::sign(num, inPtr, outPtr, ctx->stream);
+}
+
+/// Element-wise operation, out[i]=sqrt([in[i])
+template <>
+void Sqrt<float, lang::Cuda>(const size_t num, const Block* in, Block* out,
+                             Context* ctx) {
+  const float* inPtr = static_cast<const float*>(in->data());
+  float* outPtr = static_cast<float*>(out->mutable_data());
+  cuda::sqrt(num, inPtr, outPtr, ctx->stream);
+}
+
+/// Element-wise operation, out[i]=in[i]^2
+template <>
+void Square<float, lang::Cuda>(const size_t num, const Block* in, Block* out,
+                               Context* ctx) {
+  const float* inPtr = static_cast<const float*>(in->data());
+  float* outPtr = static_cast<float*>(out->mutable_data());
+  cuda::square(num, inPtr, outPtr, ctx->stream);
+}
+/// out = in1 - in2
+template <>
+void Sub<float, lang::Cuda>(const size_t num, const Block* in1,
+                            const Block* in2, Block* out, Context* ctx) {
+  const float* inPtr1 = static_cast<const float*>(in1->data());
+  const float* inPtr2 = static_cast<const float*>(in2->data());
+  float* outPtr = static_cast<float*>(out->mutable_data());
+  cuda::sub(num, inPtr1, inPtr2, outPtr, ctx->stream);
+}
+
+/// sum all elements of input into out
+template <>
+void Sum<float, lang::Cuda>(const size_t num, const Block* in, float* out,
+                            Context* ctx) {
+  LOG(FATAL) << "Cuda Sum is not implemented!";
+  // const float* inPtr = static_cast<const float*>(in->data());
+  // cuda::sum(num, inPtr, out, ctx->stream);
+}
+
+/// Element-wise operation, out[i]=tanh([in[i])
+template <>
+void Tanh<float, lang::Cuda>(const size_t num, const Block* in, Block* out,
+                             Context* ctx) {
+  const float* inPtr = static_cast<const float*>(in->data());
+  float* outPtr = static_cast<float*>(out->mutable_data());
+  cuda::tanh(num, inPtr, outPtr, ctx->stream);
+}
+
+// ================Random functions===========================================
+/// Each element of out would be 1 with prob p and 0 with 1-p. 0<= p <= 1
+// Get the random generator from 'ctx'
+// If DType is not float, then convert the threshold to DType
+template <>
+void Bernoulli<float, lang::Cuda>(const size_t num, const float p, Block* out,
+                                  Context* ctx) {
+  auto rgen = ctx->curand_generator;
+  float* outPtr = static_cast<float*>(out->mutable_data());
+  CURAND_CHECK(curandGenerateUniform(rgen, outPtr, num));
+  cuda::threshold(num, p, outPtr, outPtr, ctx->stream);
+}
+
+// The random generator should be extracted from ctx.
+// If DType is not float, then convert the low and high to DType
+template <>
+void Uniform<float, lang::Cuda>(const size_t num, const float low,
+                                const float high, Block* out, Context* ctx) {
+  auto rgen = ctx->curand_generator;
+  float* outPtr = static_cast<float*>(out->mutable_data());
+  CURAND_CHECK(curandGenerateUniform(rgen, outPtr, num));
+  cuda::mult(num, outPtr, high - low, outPtr, ctx->stream);
+  cuda::add(num, outPtr, low, outPtr, ctx->stream);
+}
+
+// The random generator should be extracted from ctx.
+// If DType is not float, then convert the mean and delta to DType
+// Fill 'out' with num samples from N(mean, std^2).
+// NOTE(review): curandGenerateNormal requires an even sample count — confirm
+// callers never pass an odd num, or pad the request.
+template <>
+void Gaussian<float, lang::Cuda>(const size_t num, const float mean,
+                                 const float std, Block* out, Context* ctx) {
+  auto rgen = ctx->curand_generator;
+  float* outPtr = static_cast<float*>(out->mutable_data());
+  CURAND_CHECK(curandGenerateNormal(rgen, outPtr, num, mean, std));
+}
+
+// =========================Blas operations==================================
+// ref to http://docs.nvidia.com/cuda/cublas
+template <>
+void Amax<float, lang::Cuda>(const size_t num, const Block* in, size_t* out,
+                             Context* ctx) {
+  const float* inPtr = static_cast<const float*>(in->data());
+  auto handle = ctx->cublas_handle;  // TODO(wangwei) set cudastream
+  int idx = 1;
+  CUBLAS_CHECK(cublasIsamax(handle, num, inPtr, 1, &idx));
+  *out = idx - 1;  // cublas index starts from 1
+}
+
+/// return the index of the element with the min value.
+template <>
+void Amin<float, lang::Cuda>(const size_t num, const Block* in, size_t* out,
+                             Context* ctx) {
+  const float* inPtr = static_cast<const float*>(in->data());
+  auto handle = ctx->cublas_handle;  // TODO(wangwei) set cudastream
+  int idx = 1;
+  CUBLAS_CHECK(cublasIsamin(handle, num, inPtr, 1, &idx));
+  *out = idx - 1;
+}
+
+/// out = sum |x| for all x in in
+template <>
+void Asum<float, lang::Cuda>(const size_t num, const Block* in, float* out,
+                             Context* ctx) {
+  const float* inPtr = static_cast<const float*>(in->data());
+  auto handle = ctx->cublas_handle;  // TODO(wangwei) set cudastream
+  CUBLAS_CHECK(cublasSasum(handle, num, inPtr, 1, out));
+}
+
+/// out = alpha * in + out
+template <>
+void Axpy<float, lang::Cuda>(const size_t num, const float alpha,
+                             const Block* in, Block* out, Context* ctx) {
+  const float* inPtr = static_cast<const float*>(in->data());
+  float* outPtr = static_cast<float*>(out->mutable_data());
+  auto handle = ctx->cublas_handle;  // TODO(wangwei) set cudastream
+  CUBLAS_CHECK(cublasSaxpy(handle, num, &alpha, inPtr, 1, outPtr, 1));
+}
+
+/// out = \sum_i in1[i] * in2[i]
+template <>
+void Dot<float, lang::Cuda>(const size_t num, const Block* in1,
+                            const Block* in2, float* out, Context* ctx) {
+  const float* inPtr1 = static_cast<const float*>(in1->data());
+  const float* inPtr2 = static_cast<const float*>(in2->data());
+  auto handle = ctx->cublas_handle;  // TODO(wangwei) set cudastream
+  CUBLAS_CHECK(cublasSdot(handle, num, inPtr1, 1, inPtr2, 1, out));
+}
+/// *out = Euclidean norm of 'in' (sqrt of sum of squares).
+template <>
+void Nrm2<float, lang::Cuda>(const size_t num, const Block* in, float* out,
+                             Context* ctx) {
+  auto handle = ctx->cublas_handle;  // TODO(wangwei) set cudastream
+  const float* inPtr = static_cast<const float*>(in->data());
+  // CUBLAS_CHECK added for consistency with every other cuBLAS wrapper here;
+  // the bare call silently discarded the returned status.
+  CUBLAS_CHECK(cublasSnrm2(handle, num, inPtr, 1, out));
+}
+template <>
+void Scale<float, lang::Cuda>(const size_t num, const float x, Block* out,
+                              Context* ctx) {
+  auto handle = ctx->cublas_handle;  // TODO(wangwei) set cudastream
+  float* outPtr = static_cast<float*>(out->mutable_data());
+  CUBLAS_CHECK(cublasSscal(handle, num, &x, outPtr, 1));
+}
+// NOTE: cublas uses column major order.
+// http://peterwittek.com/cublas-matrix-c-style.html
+// Diagonal matrix multiply: scales the columns (side_right) or rows of the
+// row-major nrow x ncol matrix M by vector v. Because cuBLAS is
+// column-major, our row-major M is seen as its transpose, so the LEFT/RIGHT
+// side flags are swapped relative to the logical operation.
+template <>
+void DGMM<float, lang::Cuda>(const bool side_right, const size_t nrow,
+                             const size_t ncol, const Block* M, const Block* v,
+                             Block* out, Context* ctx) {
+  auto handle = ctx->cublas_handle;  // TODO(wangwei) set cudastream
+  const float* MPtr = static_cast<const float*>(M->data());
+  const float* vPtr = static_cast<const float*>(v->data());
+  float* outPtr = static_cast<float*>(out->mutable_data());
+  if (side_right) {
+    // Logical right-multiply by diag(v) == column scaling; in cuBLAS's
+    // transposed view this is SIDE_LEFT on an ncol x nrow matrix.
+    CUBLAS_CHECK(cublasSdgmm(handle, CUBLAS_SIDE_LEFT, ncol, nrow, MPtr, ncol,
+                             vPtr, 1, outPtr, ncol));
+  } else {
+    // Logical left-multiply by diag(v) == row scaling; SIDE_RIGHT when viewed
+    // column-major.
+    CUBLAS_CHECK(cublasSdgmm(handle, CUBLAS_SIDE_RIGHT, ncol, nrow, MPtr, ncol,
+                             vPtr, 1, outPtr, ncol));
+  }
+}
+/// out = alpha * op(A) * v + beta * out, where A is row-major m x n (or the
+/// transposed read when trans). cuBLAS is column-major, so our row-major A
+/// appears transposed to it — hence the op code and the m/n arguments are
+/// swapped relative to the logical operation.
+template <>
+void GEMV<float, lang::Cuda>(bool trans, const size_t m, const size_t n,
+                             const float alpha, const Block* A, const Block* v,
+                             const float beta, Block* out, Context* ctx) {
+  const float* APtr = static_cast<const float*>(A->data());
+  const float* vPtr = static_cast<const float*>(v->data());
+  float* outPtr = static_cast<float*>(out->mutable_data());
+  auto handle = ctx->cublas_handle;  // TODO(wangwei) set cudastream
+  if (!trans)
+    // Logical no-transpose: pass OP_T on the column-major view.
+    CUBLAS_CHECK(cublasSgemv(handle, CUBLAS_OP_T, n, m, &alpha, APtr, n, vPtr,
+                             1, &beta, outPtr, 1));
+  else
+    // Logical transpose: pass OP_N on the column-major view.
+    CUBLAS_CHECK(cublasSgemv(handle, CUBLAS_OP_N, m, n, &alpha, APtr, m, vPtr,
+                             1, &beta, outPtr, 1));
+}
+
+// http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm
+// C = alpha * op(A) * op(B) + beta * C for row-major matrices. Since cuBLAS
+// is column-major, we compute C^T = op(B)^T * op(A)^T by swapping the operand
+// order and the op codes — C^T in column-major is exactly row-major C.
+template <>
+void GEMM<float, lang::Cuda>(const bool transA, const bool transB,
+                             const size_t nrowA, const size_t ncolB,
+                             const size_t ncolA, const float alpha,
+                             const Block* A, const Block* B, const float beta,
+                             Block* C, Context* ctx) {
+  auto transa = transA ? CUBLAS_OP_T : CUBLAS_OP_N;
+  auto transb = transB ? CUBLAS_OP_T : CUBLAS_OP_N;
+  // Leading dimensions are the row-major column counts before transposition.
+  int lda = transA ? nrowA : ncolA;
+  int ldb = transB ? ncolA : ncolB;
+  int ldc = ncolB;
+  const float* APtr = static_cast<const float*>(A->data());
+  const float* BPtr = static_cast<const float*>(B->data());
+  float* CPtr = static_cast<float*>(C->mutable_data());
+  auto handle = ctx->cublas_handle;  // TODO(wangwei) set cudastream
+  // Note the swapped B/A order implementing the transposition trick above.
+  CUBLAS_CHECK(cublasSgemm(handle, transb, transa, ncolB, nrowA, ncolA, &alpha,
+                           BPtr, ldb, APtr, lda, &beta, CPtr, ldc));
+}
+
+template <>
+void ComputeCrossEntropy<float, lang::Cuda>(const size_t batchsize,
+                                            const size_t dim, const Block* p,
+                                            const Block* t, Block* loss,
+                                            Context* ctx) {
+  const float* pPtr = static_cast<const float*>(p->data());
+  const int* tPtr = static_cast<const int*>(t->data());
+  float* lossPtr = static_cast<float*>(loss->mutable_data());
+  cuda::ComputeCrossEntropy(batchsize, dim, pPtr, tPtr, lossPtr, ctx->stream);
+}
+template <>
+void SoftmaxCrossEntropyBwd<float, lang::Cuda>(const size_t batchsize,
+                                               const size_t dim, const Block* p,
+                                               const Block* t, Block* grad,
+                                               Context* ctx) {
+  CHECK_EQ(p, grad) << "Use the same pointer to optimize performance";
+  const float* pPtr = static_cast<const float*>(p->data());
+  const int* tPtr = static_cast<const int*>(t->data());
+  float* gradPtr = static_cast<float*>(grad->mutable_data());
+  cuda::SoftmaxCrossEntropyBwd(batchsize, dim, pPtr, tPtr, gradPtr,
+                               ctx->stream);
+}
+
+template <>
+void RowMax<float, lang::Cuda>(const size_t nrow, const size_t ncol,
+                               const Block* in, Block* out,
+                               Context* ctx) {
+  const float* inPtr = static_cast<const float*>(in->data());
+  float* outPtr = static_cast<float*>(out->mutable_data());
+  cuda::RowMax(nrow, ncol, inPtr, outPtr, ctx->stream);
+}
+}  // namespace singa
+
+#endif  // USE_CUDA
+#endif  // SINGA_CORE_TENSOR_TENSOR_MATH_CUDA_H_
diff --git a/src/core/tensor/tensor_math_opencl.cl b/src/core/tensor/tensor_math_opencl.cl
new file mode 100644
index 0000000..f9cf96e
--- /dev/null
+++ b/src/core/tensor/tensor_math_opencl.cl
@@ -0,0 +1,598 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// **************************************
+// Element-wise functions
+// **************************************
+// Each kernel below maps one work-item to one element; the `i >= num` guard
+// makes over-provisioned NDRanges safe.
+
+// NOTE(review): the reduction comment below actually describes clkernel_sum
+// further down in this file, not the element-wise kernels that follow here.
+// Sum is basically reduction.
+// This reduction code is serial reduction modified from AMD's example.
+// http://developer.amd.com/resources/documentation-articles/articles-whitepapers/opencl-optimization-case-study-simple-reductions/
+
+// out[i] = |in[i]|
+__kernel 
+void clkernel_abs(const int num, __global const float* in, __global float* out) {
+  const int i = get_global_id(0);
+  if (i >= num) return;
+  out[i] = fabs(in[i]);
+}
+
+// out[i] = in[i] + x
+__kernel
+void clkernel_add_scalar(const int num, float x, __global const float* in, __global float* out) {
+  const int i = get_global_id(0);
+  if (i >= num) return;
+  out[i] = in[i] + x;
+}
+
+// out[i] = in1[i] + in2[i]
+__kernel
+void clkernel_add(const int num, __global const float* in1, __global const float* in2, 
+		  __global float* out) {
+  const int i = get_global_id(0);
+  if (i >= num) return;
+  out[i] = in1[i] + in2[i];
+}
+
+// out[i] = in[i] clamped into [low, high]
+__kernel
+void clkernel_clamp(const int num, float low, float high, __global const float* in, 
+		  __global float* out) {
+  const int i = get_global_id(0);
+  if (i >= num) return;
+  out[i] = clamp(in[i], low, high);
+}
+
+// out[i] = in1[i] / x  (matrix over scalar)
+__kernel
+void clkernel_divide_scalar_matx(const int num, __global const float* in1, const float x,
+		  __global float* out) {
+  const int i = get_global_id(0);
+  if (i >= num) return;
+  out[i] = in1[i] / x;
+}
+
+// out[i] = x / in1[i]  (scalar over matrix)
+__kernel
+void clkernel_divide_scalar_xmat(const int num, const float x, __global const float* in1, 
+		  __global float* out) {
+  const int i = get_global_id(0);
+  if (i >= num) return;
+  out[i] = x / in1[i];
+}
+
+// out[i] = in1[i] / in2[i]
+__kernel
+void clkernel_divide(const int num, __global const float* in1, __global const float* in2, 
+		  __global float* out) {
+  const int i = get_global_id(0);
+  if (i >= num) return;
+  out[i] = in1[i] / in2[i];
+}
+
+// out[i] = in[i] * x
+__kernel
+void clkernel_eltmult_scalar(const int num, const float x, __global const float* in, 
+		  __global float* out) {
+  const int i = get_global_id(0);
+  if (i >= num) return;
+  out[i] = in[i] * x;
+}
+
+// out[i] = in1[i] * in2[i]
+__kernel
+void clkernel_eltmult(const int num, __global const float* in1, __global const float* in2, 
+		  __global float* out) {
+  const int i = get_global_id(0);
+  if (i >= num) return;
+  out[i] = in1[i] * in2[i];
+}
+
+// out[i] = e^(in[i])
+__kernel
+void clkernel_exp(const int num, __global const float* in, __global float* out) {
+  const int i = get_global_id(0);
+  if (i >= num) return;
+  out[i] = exp(in[i]);
+}
+
+// out[i] = 1.0f if in[i] <= x, else 0.0f
+__kernel
+void clkernel_le(const int num, __global const float* in, const float x, 
+		  __global float* out) {
+  const int i = get_global_id(0);
+  if (i >= num) return;
+  out[i] = (in[i] <= x) ? 1.0f : 0.0f;
+}
+
+// out[i] = natural logarithm of in[i]
+__kernel
+void clkernel_log(const int num, __global const float* in, __global float* out) {
+  const int i = get_global_id(0);
+  if (i >= num) return;
+  out[i] = log(in[i]);
+}
+
+// out[i] = 1.0f if in[i] < x, else 0.0f
+__kernel
+void clkernel_lt(const int num, __global const float* in, const float x, 
+		  __global float* out) {
+  const int i = get_global_id(0);
+  if (i >= num) return;
+  out[i] = (in[i] < x) ? 1.0f : 0.0f;
+}
+
+// out[i] = 1.0f if in[i] >= x, else 0.0f
+__kernel
+void clkernel_ge(const int num, __global const float* in, const float x, 
+		  __global float* out) {
+  const int i = get_global_id(0);
+  if (i >= num) return;
+  out[i] = (in[i] >= x) ? 1.0f : 0.0f;
+}
+
+// out[i] = 1.0f if in[i] > x, else 0.0f
+__kernel
+void clkernel_gt(const int num, __global const float* in, const float x, 
+		  __global float* out) {
+  const int i = get_global_id(0);
+  if (i >= num) return;
+  out[i] = (in[i] > x) ? 1.0f : 0.0f;
+}
+
+// out[i] = in[i] raised to the power x
+__kernel
+void clkernel_pow_scalar(const int num, const float x, __global const float* in, 
+		  __global float* out) {
+  const int i = get_global_id(0);
+  if (i >= num) return;
+  out[i] = pow(in[i], x);
+}
+
+// out[i] = in1[i] raised to the power in2[i]
+__kernel
+void clkernel_pow(const int num, __global const float* in1, __global const float* in2, 
+		  __global float* out) {
+  const int i = get_global_id(0);
+  if (i >= num) return;
+  out[i] = pow(in1[i], in2[i]);
+}
+
+// out[i] = max(in[i], 0)
+__kernel
+void clkernel_relu(const int num, __global const float* in, __global float* out) {
+  const int i = get_global_id(0);
+  if (i >= num) return;
+  out[i] = (in[i] >= 0.0f) ? in[i] : 0.0f;
+}
+
+// Fill out with the constant x.
+__kernel
+void clkernel_set(const int num, const float x, __global float* out) {
+  const int i = get_global_id(0);
+  if (i >= num) return;
+  out[i] = x;
+}
+
+// out[i] = logistic sigmoid 1 / (1 + e^-in[i])
+__kernel
+void clkernel_sigmoid(const int num, __global const float* in, __global float* out) {
+  const int i = get_global_id(0);
+  if (i >= num) return;
+  out[i] = 1 / (1 + exp(-(in[i])));
+}
+
+// out[i] = sign(in[i])
+__kernel
+void clkernel_sign(const int num, __global const float* in, __global float* out) {
+  const int i = get_global_id(0);
+  if (i >= num) return;
+  out[i] = sign(in[i]);
+}
+
+// out[i] = square root of in[i]
+__kernel
+void clkernel_sqrt(const int num, __global const float* in, __global float* out) {
+  const int i = get_global_id(0);
+  if (i >= num) return;
+  out[i] = sqrt(in[i]);
+}
+
+// kernel for square is called pow(2).
+
+// out[i] = in[i] - x
+__kernel
+void clkernel_subtract_scalar(const int num, __global const float* in, const float x, 
+							  __global float* out) {
+  const int i = get_global_id(0);
+  if (i >= num) return;
+  out[i] = in[i] - x;
+}
+
+// out[i] = in1[i] - in2[i]
+__kernel
+void clkernel_subtract(const int num, __global const float* in1, __global const float* in2, 
+					   __global float* out) {
+  const int i = get_global_id(0);
+  if (i >= num) return;
+  out[i] = in1[i] - in2[i];
+}
+
+// reduce3 kernel from
+// https://github.com/sschaetz/nvidia-opencl-examples/blob/master/OpenCL/src/oclReduction/oclReduction_kernel.cl
+// Each work-group reduces (2 * local_size) input elements to one partial sum,
+// written to out[group_id]; the host sums the per-group results.
+__kernel 
+void clkernel_sum(const int num, __global const float* in, __global float* out, 
+				  __local float* sdata) {
+  const int i = get_group_id(0)*(get_local_size(0)*2) + get_local_id(0);
+  const int tid = get_local_id(0);
+  sdata[tid] = (i < num) ? in[i] : 0.0f;
+
+  // Perform the first level of reduction.
+  if (i + get_local_size(0) < num) {
+    sdata[tid] += in[i + get_local_size(0)];
+  }
+  barrier(CLK_LOCAL_MEM_FENCE);
+
+  // Tree reduction in local memory. BUGFIX: the active threads are those
+  // with tid < s; the original tested tid > s, which both read past the
+  // valid range of sdata and dropped most partial sums.
+  for (int s = get_local_size(0)/2; s > 0; s >>= 1) {
+    if (tid < s) {
+      sdata[tid] += sdata[tid + s];
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+  }
+
+  if (tid == 0) {
+    out[get_group_id(0)] = sdata[0];
+  }
+}
+
+// out[i] = hyperbolic tangent of in[i]
+__kernel
+void clkernel_tanh(const int num, __global const float* in, __global float* out) {
+  const int i = get_global_id(0);
+  if (i >= num) return;
+  out[i] = tanh(in[i]);
+}
+
+// **************************************
+// Random functions
+// **************************************
+
+// See: distribution.cl
+
+// *********************************************************
+// BLAS functions, ref to http://docs.nvidia.com/cuda/cublas
+// *********************************************************
+
+// BLAS iSAMAX: index of the element with the largest absolute value, written
+// to ret[0]. Assumes a single work-group covers the input. The `temp`
+// argument is unused but kept so existing host-side setArg calls still work.
+__kernel
+void clkernel_amax(const int num, __global const float* in, __global int* ret, 
+				   __local uint* sdata, __local size_t* temp) {
+  const int gid = get_global_id(0);
+  const int tid = get_local_id(0);
+
+  // BUGFIX: sdata was read before ever being written; seed each slot with
+  // its own element index (clamped to 0 for out-of-range work-items).
+  sdata[tid] = (gid < num) ? (uint)gid : 0u;
+  barrier(CLK_LOCAL_MEM_FENCE);
+
+  for (int s = get_local_size(0)/2; s > 0; s >>= 1) {
+    if (tid < s) {
+      // BUGFIX: compare the |values| at the two candidate INDICES and keep
+      // the winning index (the original mixed indices with raw values).
+      if (fabs(in[sdata[tid + s]]) > fabs(in[sdata[tid]]))
+        sdata[tid] = sdata[tid + s];
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+  }
+  if (tid == 0) {
+    ret[0] = sdata[0];
+  }
+}
+
+
+/* TODO: Fix line 284:20.
+__kernel
+void clkernel_amin(const int num, __global const float* in, __global int* ret, 
+				   __local float* sdata, __local size_t* temp) {
+  const int gid = get_global_id(0);
+  const int tid = get_local_id(0);
+
+  // Initialize the values to pos infinity.
+  sdata[tid] = (gid < num) ? in[gid] : INFINITY;
+  barrier(CLK_LOCAL_MEM_FENCE);
+
+  for(int s = get_local_size(0)/2; s > 0; s >>= 1) {
+	if (tid < s) {
+	  sdata[tid] = (in[sdata[tid]] < in[tid+s]) ? sdata[tid] : tid;
+	}
+	barrier(CLK_LOCAL_MEM_FENCE);
+  }
+  if (tid == 0) {
+	ret[0] = sdata[0];
+  }
+}*/
+
+
+// BLAS SASUM: sum of absolute values, written to out[0].
+// Assumes a single work-group covers (2 * local_size) >= num elements.
+__kernel
+void clkernel_asum(const int num, __global const float* in, __global float* out, 
+				   __local float* sdata) {
+  const int tid = get_local_id(0);
+  const int i = get_global_id(0);
+
+  // BUGFIX: seed with |in[i]| and pad with 0 (the original seeded with the
+  // raw value and padded with INFINITY, which poisoned the total).
+  sdata[tid] = (i < num) ? fabs(in[i]) : 0.0f;
+  // Perform the first level of reduction.
+  if (i + get_local_size(0) < num) {
+    sdata[tid] += fabs(in[i + get_local_size(0)]);
+  }
+  barrier(CLK_LOCAL_MEM_FENCE);
+
+  // BUGFIX: accumulate partial sums (+=); the original overwrote sdata[tid]
+  // with fabs(sdata[tid + s]), discarding everything reduced so far.
+  for(int s = get_local_size(0)/2; s > 0; s >>= 1) {
+    if (tid < s) {
+      sdata[tid] += sdata[tid + s];
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+  }
+  if (tid == 0) {
+    out[0] = sdata[0];
+  }
+}
+
+// BLAS SAXPY: out[i] = alpha * in[i] + out[i], via fused multiply-add.
+__kernel
+void clkernel_axpy(const int num, float alpha, __global const float* in, 
+				   __global float* out) {
+  const int i = get_global_id(0);
+  if (i >= num) return;
+  out[i] = fma(alpha, in[i], out[i]);
+}
+
+// This kernel is essentially the same as Sum, except that values are squared
+// as they are loaded into local memory, and a square-root is applied to the
+// per-group result just before it is written to out[group_id] (L2 norm).
+__kernel
+void clkernel_nrm2(const int num, __global const float* in, __global float* out,
+				   __local float* sdata) {
+  const int i = get_group_id(0)*(get_local_size(0)*2) + get_local_id(0);
+  const int tid = get_local_id(0);
+  sdata[tid] = (i < num) ? (in[i] * in[i]) : 0.0f;
+
+  // First level of reduction. BUGFIX: square the second element too — the
+  // original added the raw value, mixing x and x^2 in the same total.
+  if (i + get_local_size(0) < num) {
+    const float v = in[i + get_local_size(0)];
+    sdata[tid] += v * v;
+  }
+  barrier(CLK_LOCAL_MEM_FENCE);
+
+  // BUGFIX: active threads are tid < s (the original tid > s read out of
+  // range and dropped partial sums).
+  for (int s = get_local_size(0)/2; s > 0; s >>= 1) {
+    if (tid < s) {
+      sdata[tid] += sdata[tid + s];
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+  }
+
+  if (tid == 0) {
+    out[get_group_id(0)] = sqrt(sdata[0]);
+  }
+}
+
+// BLAS SSCAL: out[i] = x * out[i], in place.
+__kernel
+void clkernel_scale(const int num, float x, __global float* out) {
+  const int i = get_global_id(0);
+  if (i >= num) return;
+  out[i] = x * out[i];
+}
+
+// Dot product with a local-memory tree reduction; each work-group writes its
+// partial result to out[group_id], which the host then sums.
+// BUGFIX: the original indexed scratch by global id, read in1/in2 at the
+// out-of-bounds offset (i << 2), and never reduced or wrote the result.
+__kernel
+void clkernel_dot(const int num, __global const float* in1, __global const float* in2, 
+	  			  __global float* out, __local float* scratch) {
+  const int i = get_global_id(0);
+  const int tid = get_local_id(0);
+  scratch[tid] = (i < num) ? in1[i] * in2[i] : 0.0f;
+  barrier(CLK_LOCAL_MEM_FENCE);
+
+  for (int s = get_local_size(0)/2; s > 0; s >>= 1) {
+    if (tid < s) {
+      scratch[tid] += scratch[tid + s];
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+  }
+  if (tid == 0) {
+    out[get_group_id(0)] = scratch[0];
+  }
+}
+
+// First kernel from http://www.bealto.com/gpu-gemv_intro.html
+// y = alpha*A*v + beta*y, A column-major with m rows and n columns; one
+// work-item computes one row of the result.
+// fma(a, b, c) == (a * b) + c with infinite precision
+__kernel
+void clkernel_gemv(const int m, const int n, const float alpha,
+				   __global const float* A, __global const float* v, 
+				   const float beta, __global float* out) {
+  const int i = get_global_id(0);
+  if (i >= m) return;
+  // BUGFIX: accumulate only A*v here. The original folded beta*out[i] into
+  // every one of the n loop iterations, so the old y was scaled by n*beta
+  // instead of beta.
+  float sum = 0.0f;
+  for (int k = 0; k < n; k++) {
+    sum += A[i + m * k] * v[k];
+  }
+  out[i] = fma(alpha, sum, beta * out[i]);
+}
+
+// http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-dgmm
+// X[j] = x[j*inc(x)] 						if inc(x) ≥ 0 
+//		= x[(χ − 1)*|inc(x)| − j*|inc(x)|] 	if inc(x) < 0
+
+// C = diag( X )*A
+// One work-item per row (gidx); M is addressed with row stride ncol.
+// NOTE(review): element (row, i) is multiplied by v[i], i.e. this scales
+// COLUMNS of a row-major M (M * diag(v)), which contradicts the comment
+// above unless M is column-major here — confirm against the host caller.
+__kernel
+void clkernel_dgmm_left(const int nrow, const int ncol,
+						__global const float* M, __global const float* v, 
+						__global float* out) {
+  const uint gidx = get_global_id(0);
+
+  uint offset = gidx * ncol;
+  for (uint i = 0; i < ncol; i++) {
+	out[offset + i] = M[offset + i] * v[i];
+  }
+}
+
+// C = A*diag( X )
+// NOTE(review): every element of row gidx is multiplied by v[gidx], i.e.
+// this scales ROWS of a row-major M (diag(v) * M) — see note on dgmm_left;
+// the left/right behaviors look swapped relative to their comments.
+__kernel
+void clkernel_dgmm_right(const int nrow, const int ncol,
+						 __global const float* M, __global const float* v, 
+						 __global float* out) {
+  const uint gidx = get_global_id(0);
+
+  uint offset = gidx * ncol;
+  for (uint i = 0; i < ncol; i++) {
+	out[offset + i] = M[offset + i] * v[gidx];
+  }
+}
+
+// TODO: Optimize with Reference from http://www.cedricnugteren.nl/tutorial.php?page=1
+//  C = α*A*B + β*C
+// Tiled matrix multiply with square local-memory tiles of side TS, where TS
+// is the work-group size in dimension 0 and Asub/Bsub each hold TS*TS floats.
+// Indexing is column-major: A is nrowA x ncolA, C is nrowA x ncolB.
+// NOTE(review): numtiles = ncolA / TS truncates, so remainder columns of A
+// (rows of B) are silently dropped unless ncolA is a multiple of TS; gidx
+// and gidy are also not bounds-checked, so nrowA and ncolB must likewise be
+// multiples of TS — confirm the host enforces these sizes.
+__kernel
+void clkernel_gemm(const uint nrowA, const uint ncolB, const uint ncolA, const float alpha,
+		 		   __global const float* A, __global const float* B, const float beta, 
+		  		   __global float* C, __local float* Asub, __local float* Bsub) {
+
+  const uint lidx = get_local_id(0);
+  const uint lidy = get_local_id(1);
+  const uint TS = get_local_size(0); // Tile size
+  const uint gidx = TS * get_group_id(0) + lidx; // Row ID of C (0..M)
+  const uint gidy = TS * get_group_id(1) + lidy; // Column ID of C (0..N)
+  
+  // Initialise the accumulation register
+  float acc = 0.0f;
+  
+  // Loop over all tiles
+  const int numtiles = ncolA / TS;
+  for (int t = 0; t < numtiles; t++) {
+    const int tiledRow = TS * t + lidx;
+    const int tiledCol = TS * t + lidy;
+    Asub[lidy * TS + lidx] = A[tiledCol * nrowA + gidx];
+    Bsub[lidy * TS + lidx] = B[gidy * ncolA + tiledRow];
+    
+    barrier(CLK_LOCAL_MEM_FENCE);
+    
+    // Multiply the two staged tiles; alpha is folded into each product term.
+    for(int k = 0; k < TS; k++) {
+      acc += Asub[k * TS + lidx] * Bsub[lidy * TS + k] * alpha;
+    }
+    
+    barrier(CLK_LOCAL_MEM_FENCE);
+  }
+  
+  // C = acc + beta * C, via fused multiply-add.
+  C[gidy * nrowA + gidx] = fma(beta, C[gidy * nrowA + gidx], acc);
+}
+
+
+// Per-sample cross-entropy loss: loss[i] = -log(p[i][t[i]]), where p is a
+// row-major batchsize x dim probability matrix and t holds one target class
+// index per sample.
+__kernel
+void clkernel_crossentropy(const uint batchsize, const uint dim, 
+						   __global const float* p, __global const int* t, 
+						   __global float* loss) {
+  const uint gidx = get_global_id(0);
+  if (gidx >= batchsize) return;
+
+  int truth_idx = t[gidx];
+  // BUGFIX: 0 is a valid class label; only reject negative indices
+  // (the original `<= 0` skipped every sample whose true class was 0).
+  if (truth_idx < 0) return;
+  // BUGFIX: index the sample's row as gidx * dim + truth_idx (the original
+  // gidx + truth_idx read the wrong element), and clamp with +FLT_MIN —
+  // -FLT_MIN is negative, so the original fmax was a no-op and log() of a
+  // zero probability produced -inf.
+  float prob_of_truth = p[gidx * dim + truth_idx];
+  loss[gidx] = -log(fmax(prob_of_truth, FLT_MIN));
+}
+
+
+// Backward pass of softmax + cross-entropy: grad is assumed to already hold
+// the softmax output; subtract 1 at the true-class position of each sample.
+__kernel
+void clkernel_softmaxentropy(const uint batchsize, const uint dim,
+							 __global const float* p, __global const int* t,
+							 __global float* grad) {
+  const uint gidx = get_global_id(0);
+  if (gidx >= batchsize) return;
+
+  int truth_idx = t[gidx];
+  // BUGFIX: class 0 is a valid label; only skip negative (missing) labels.
+  // Also use a float literal — `1.0` is double, which needs fp64 support.
+  if (truth_idx < 0) return;
+  grad[gidx * dim + truth_idx] -= 1.0f;
+}
+
+
+// **************************************
+// Matrix functions
+// **************************************
+/*
+__kernel
+void clkernel_addcol(int nrow, int ncol, __global const float* A, __global const float* v, __global float* out) {
+  const int i = get_global_id(0);
+  const int j = get_global_id(1);
+  if (i >= nrow) return;
+  if (j >= ncol) return;
+  ret[j] = A[j + nrow * i] + v[j];
+}
+
+__kernel
+void clkernel_addrow(int nrow, int ncol, __global const float* A, __global const float* v, __global float* out) {
+  const int i = get_global_id(0);
+  const int j = get_global_id(1);
+  if (i >= nrow) return;
+  if (j >= ncol) return;
+  out[i] = A[i + ncol * j] + v[i];
+}
+
+__kernel
+void clkernel_outerproduct(int m, const int n, __global const float* in1, __global const float* in2, __global float* out) {
+  const int col = get_global_id(0);
+  const int row = get_global_id(1);
+  
+  // TODO: This
+}
+
+__kernel
+void clkernel_sumcol(int nrow, int ncol, __global const float* in, __global float* out) {
+  const int i = get_global_id(0);
+  if (i >= nrow) return;
+
+  float sum = 0.0f;
+  for (int j = 0; j < nrow; j++) {
+	sum += input[nrow * i + j];
+  }
+  out[i] = sum;
+}
+*/
+// Sum each row of a row-major nrow x ncol matrix: out[row] = sum over cols.
+// One work-item per row.
+__kernel
+void clkernel_sumrow(int nrow, int ncol, __global const float* in, __global float* out) {
+  const int idx = get_global_id(0);
+  if (idx >= nrow) return;
+  
+  float sum = 0.0f;
+  for (int j = 0; j < ncol; j++) {
+	sum += in[j + ncol * idx];
+  }
+  out[idx] = sum;
+}
+
+
+// Adapted from http://code.haskell.org/HsOpenCL/tests/bench/transpose.cl
+// Tiled transpose of a row-major nrow x ncol matrix into out (ncol x nrow).
+// Requires a BLOCK_DIM x BLOCK_DIM local work size and sdata of at least
+// BLOCK_DIM * (BLOCK_DIM + 1) floats (the +1 padding avoids bank conflicts).
+#define BLOCK_DIM 16
+__kernel
+void clkernel_transpose(uint nrow, uint ncol, 
+						__global const float* in, __global float* out, 
+						__local float* sdata) {
+  uint gidx = get_global_id(0);
+  uint gidy = get_global_id(1);
+
+  // Stage one tile of the input in local memory.
+  if ((gidx < ncol) && (gidy < nrow)) {
+	uint id_in = gidy * ncol + gidx;
+	sdata[get_local_id(1) * (BLOCK_DIM+1) + get_local_id(0)] = in[id_in];
+  }
+
+  barrier(CLK_LOCAL_MEM_FENCE);
+
+  // Write the tile back with block coordinates and local indices swapped,
+  // producing the transposed layout.
+  gidx = get_group_id(1) * BLOCK_DIM + get_local_id(0);
+  gidy = get_group_id(0) * BLOCK_DIM + get_local_id(1);
+  if ((gidx < nrow) && (gidy < ncol)) {
+	uint id_out = gidy * nrow + gidx;
+	out[id_out] = sdata[get_local_id(0) * (BLOCK_DIM + 1) + get_local_id(1)];
+  }
+}
+/*
+__kernel
+void clkernel_transpose2(uint nrow, uint ncol, __global const float* in, __global float* out, __local float* sdata) {
+  const uint lidx = get_local_id(0);
+  const uint lidy = get_local_id(1);
+  const uint id0 = get_group_id(0) * ncol * lidx;
+  const uint id1 = get_group_id(1) * nrow * lidy;
+
+  if (id0 < nrow && id1 < ncol) {
+	sdata[lidx][lidy] = in[id1 * nrow + id0];
+  }
+
+  barrier(CLK_LOCAL_MEM_FENCE);
+
+  const uint new_id0 = get_group_id(1) * nrow + lidx;
+  const uint new_id1 = get_group_id(0) * ncol + lidy;
+
+  if (new_id0 < ncol && new_id1 < nrow) {
+	out[new_id1 * ncol + new_id0] = sdata[lidx][lidy];
+  }
+}*/
+
+// Expand vector vin (length vsize) into a vsize x vsize diagonal matrix;
+// work-item gid fills row gid, putting vin[gid] on the diagonal, 0 elsewhere.
+__kernel
+void clkernel_diagvec_left(uint vsize, __global const float* vin, __global float* out) {
+  const uint gid = get_global_id(0);
+
+  for (uint i = 0; i < vsize; i++)
+	out[gid * vsize + i] = (i == gid) ? vin[gid] : 0.0f;
+}
+
+
+// NOTE(review): identical to clkernel_diagvec_left — diag(v) is the same
+// matrix on either side of a product, so the duplicate presumably exists
+// only for host-side API symmetry.
+__kernel
+void clkernel_diagvec_right(uint vsize, __global const float* vin, __global float* out) {
+  const uint gid = get_global_id(0);
+
+  for (uint i = 0; i < vsize; i++)
+	out[gid * vsize + i] = (i == gid) ? vin[gid] : 0.0f;
+}
diff --git a/src/core/tensor/tensor_math_opencl.h b/src/core/tensor/tensor_math_opencl.h
new file mode 100644
index 0000000..c289a56
--- /dev/null
+++ b/src/core/tensor/tensor_math_opencl.h
@@ -0,0 +1,1113 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef  SINGA_CORE_TENSOR_TENSOR_MATH_OPENCL_H_
+// BUGFIX: the guard macro was tested but never defined, so the include
+// guard was ineffective and every inclusion re-processed the whole header.
+#define SINGA_CORE_TENSOR_TENSOR_MATH_OPENCL_H_
+
+#ifdef USE_OPENCL
+#include <limits>
+
+#include "singa/utils/opencl_utils.h"
+#include "tensor_math.h"
+
+namespace singa {
+
+// Some forward declarations of utility functions that only exist here.
+void Transpose(const size_t nrow, const size_t ncol, cl::Buffer& in, cl::Buffer& out, Context* ctx);
+void DiagVec_Left(const size_t size, cl::Buffer& in, cl::Buffer& out, Context* ctx);
+void DiagVec_Right(const size_t size, cl::Buffer& in, cl::Buffer& out, Context* ctx);
+
+// **************************************
+// Element-wise functions
+// **************************************
+
+// All of these OpenCL wrapper specializations follow one pattern: look up
+// the prebuilt kernel by name in ctx->kernels, bind the Blocks' cl::Buffers
+// and scalar arguments, then enqueue the kernel over a 1-D range of `num`
+// work-items on ctx->ocl_cmdq.
+
+// out[i] = |in[i]|
+template<>
+void Abs<float, lang::Opencl>(const size_t num, const Block* in, Block* out, Context* ctx) {
+  cl_int status = CL_SUCCESS;
+
+  std::string kname = "clkernel_abs";
+  auto kernel = ctx->kernels->at(kname);
+
+  cl::Buffer inbuf = *(static_cast<cl::Buffer*>(in->mutable_data()));
+  cl::Buffer outbuf = *(static_cast<cl::Buffer*>(out->mutable_data()));
+
+  kernel.setArg(0, (cl_int)num);
+  kernel.setArg(1, inbuf);
+  kernel.setArg(2, outbuf);
+
+  status = ctx->ocl_cmdq.enqueueNDRangeKernel(kernel, cl::NDRange(0), cl::NDRange(num));
+  OCL_CHECK(status, "Failed to enqueue kernel function!");
+}
+
+
+// out[i] = in[i] + x
+template<>
+void Add<float, lang::Opencl>(const size_t num, const Block* in, const float x, Block* out, Context* ctx) {
+  cl_int status = CL_SUCCESS;
+
+  std::string kname = "clkernel_add_scalar";
+  auto kernel = ctx->kernels->at(kname);
+
+  cl::Buffer inbuf = *(static_cast<cl::Buffer*>(in->mutable_data()));
+  cl::Buffer outbuf = *(static_cast<cl::Buffer*>(out->mutable_data()));
+
+  kernel.setArg(0, (cl_int)num);
+  kernel.setArg(1, x);
+  kernel.setArg(2, inbuf);
+  kernel.setArg(3, outbuf);
+
+  status = ctx->ocl_cmdq.enqueueNDRangeKernel(kernel, cl::NDRange(0), cl::NDRange(num));
+  OCL_CHECK(status, "Failed to enqueue kernel function!");
+}
+
+
+// out[i] = in1[i] + in2[i]
+template<>
+void Add<float, lang::Opencl>(const size_t num, const Block* in1, const Block* in2, Block* out, Context* ctx) {
+  cl_int status = CL_SUCCESS;
+
+  std::string kname = "clkernel_add";
+  auto kernel = ctx->kernels->at(kname);
+  
+  cl::Buffer in1buf = *(static_cast<cl::Buffer*>(in1->mutable_data()));
+  cl::Buffer in2buf = *(static_cast<cl::Buffer*>(in2->mutable_data()));
+  cl::Buffer outbuf = *(static_cast<cl::Buffer*>(out->mutable_data()));
+
+  kernel.setArg(0, (cl_int)num);
+  kernel.setArg(1, in1buf);
+  kernel.setArg(2, in2buf);
+  kernel.setArg(3, outbuf);
+
+  status = ctx->ocl_cmdq.enqueueNDRangeKernel(kernel, cl::NDRange(0), cl::NDRange(num));
+  OCL_CHECK(status, "Failed to enqueue kernel function!");
+}
+
+
+// out[i] = in[i] clamped into [low, high]
+template<>
+void Clamp<float, lang::Opencl>(const size_t num, const float low, const float high, const Block* in, Block* out, Context* ctx) {
+  cl_int status = CL_SUCCESS;
+
+  std::string kname = "clkernel_clamp";
+  auto kernel = ctx->kernels->at(kname);
+
+  cl::Buffer inbuf = *(static_cast<cl::Buffer*>(in->mutable_data()));
+  cl::Buffer outbuf = *(static_cast<cl::Buffer*>(out->mutable_data()));
+
+  kernel.setArg(0, (cl_int)num);
+  kernel.setArg(1, low);
+  kernel.setArg(2, high);
+  kernel.setArg(3, inbuf);
+  kernel.setArg(4, outbuf);
+
+  status = ctx->ocl_cmdq.enqueueNDRangeKernel(kernel, cl::NDRange(0), cl::NDRange(num));
+  OCL_CHECK(status, "Failed to enqueue kernel function!");
+}
+
+
+// out[i] = in[i] / x
+template<>
+void Div<float, lang::Opencl>(const size_t num, const Block* in, const float x, Block* out, Context* ctx) {
+  cl_int status = CL_SUCCESS;
+
+  std::string kname = "clkernel_divide_scalar_matx";
+  auto kernel = ctx->kernels->at(kname);
+  
+  cl::Buffer inbuf = *(static_cast<cl::Buffer*>(in->mutable_data()));
+  cl::Buffer outbuf = *(static_cast<cl::Buffer*>(out->mutable_data()));
+
+  kernel.setArg(0, (cl_int)num);
+  kernel.setArg(1, inbuf);
+  kernel.setArg(2, x);
+  kernel.setArg(3, outbuf);
+
+  status = ctx->ocl_cmdq.enqueueNDRangeKernel(kernel, cl::NDRange(0), cl::NDRange(num));
+  OCL_CHECK(status, "Failed to enqueue kernel function!");
+}
+
+
+// out[i] = x / in[i]
+template<>
+void Div<float, lang::Opencl>(const size_t num, const float x, const Block* in, Block* out, Context* ctx) {
+  cl_int status = CL_SUCCESS;
+
+  std::string kname = "clkernel_divide_scalar_xmat";
+  auto kernel = ctx->kernels->at(kname);
+  
+  cl::Buffer inbuf = *(static_cast<cl::Buffer*>(in->mutable_data()));
+  cl::Buffer outbuf = *(static_cast<cl::Buffer*>(out->mutable_data()));
+
+  kernel.setArg(0, (cl_int)num);
+  kernel.setArg(1, x);
+  kernel.setArg(2, inbuf);
+  kernel.setArg(3, outbuf);
+
+  status = ctx->ocl_cmdq.enqueueNDRangeKernel(kernel, cl::NDRange(0), cl::NDRange(num));
+  OCL_CHECK(status, "Failed to enqueue kernel function!");
+}
+
+
+// out[i] = in1[i] / in2[i]
+template<>
+void Div<float, lang::Opencl>(const size_t num, const Block* in1, const Block* in2, Block* out, Context* ctx) {
+  cl_int status = CL_SUCCESS;
+
+  std::string kname = "clkernel_divide";
+  auto kernel = ctx->kernels->at(kname);
+  
+  cl::Buffer in1buf = *(static_cast<cl::Buffer*>(in1->mutable_data()));
+  cl::Buffer in2buf = *(static_cast<cl::Buffer*>(in2->mutable_data()));
+  cl::Buffer outbuf = *(static_cast<cl::Buffer*>(out->mutable_data()));
+
+  kernel.setArg(0, (cl_int)num);
+  kernel.setArg(1, in1buf);
+  kernel.setArg(2, in2buf);
+  kernel.setArg(3, outbuf);
+
+  status = ctx->ocl_cmdq.enqueueNDRangeKernel(kernel, cl::NDRange(0), cl::NDRange(num));
+  OCL_CHECK(status, "Failed to enqueue kernel function!");
+}
+
+
+// out[i] = in[i] * x
+template<>
+void EltwiseMult<float, lang::Opencl>(const size_t num, const Block* in, const float x, Block* out, Context* ctx) {
+  cl_int status = CL_SUCCESS;
+
+  std::string kname = "clkernel_eltmult_scalar";
+  auto kernel = ctx->kernels->at(kname);
+
+  cl::Buffer inbuf = *(static_cast<cl::Buffer*>(in->mutable_data()));
+  cl::Buffer outbuf = *(static_cast<cl::Buffer*>(out->mutable_data()));
+
+  kernel.setArg(0, (cl_int)num);
+  kernel.setArg(1, x);
+  kernel.setArg(2, inbuf);
+  kernel.setArg(3, outbuf);
+
+  status = ctx->ocl_cmdq.enqueueNDRangeKernel(kernel, cl::NDRange(0), cl::NDRange(num));
+  OCL_CHECK(status, "Failed to enqueue kernel function!");
+}
+
+
+// out[i] = in1[i] * in2[i]
+template<>
+void EltwiseMult<float, lang::Opencl>(const size_t num, const Block* in1, const Block* in2, Block* out, Context* ctx) {
+  cl_int status = CL_SUCCESS;
+
+  std::string kname = "clkernel_eltmult";
+  auto kernel = ctx->kernels->at(kname);
+
+  cl::Buffer in1buf = *(static_cast<cl::Buffer*>(in1->mutable_data()));
+  cl::Buffer in2buf = *(static_cast<cl::Buffer*>(in2->mutable_data()));
+  cl::Buffer outbuf = *(static_cast<cl::Buffer*>(out->mutable_data()));
+
+  kernel.setArg(0, (cl_int)num);
+  kernel.setArg(1, in1buf);
+  kernel.setArg(2, in2buf);
+  kernel.setArg(3, outbuf);
+
+  status = ctx->ocl_cmdq.enqueueNDRangeKernel(kernel, cl::NDRange(0), cl::NDRange(num));
+  OCL_CHECK(status, "Failed to enqueue kernel function!");
+}
+
+
+// out[i] = e^(in[i])
+template<>
+void Exp<float, lang::Opencl>(const size_t num, const Block* in, Block* out, Context* ctx) {
+  cl_int status = CL_SUCCESS;
+
+  std::string kname = "clkernel_exp";
+  auto kernel = ctx->kernels->at(kname);
+
+  cl::Buffer inbuf = *(static_cast<cl::Buffer*>(in->mutable_data()));
+  cl::Buffer outbuf = *(static_cast<cl::Buffer*>(out->mutable_data()));
+
+  kernel.setArg(0, (cl_int)num);
+  kernel.setArg(1, inbuf);
+  kernel.setArg(2, outbuf);
+
+  status = ctx->ocl_cmdq.enqueueNDRangeKernel(kernel, cl::NDRange(0), cl::NDRange(num));
+  OCL_CHECK(status, "Failed to enqueue kernel function!");
+}
+
+
+// out[i] = 1.0f if in[i] <= x else 0.0f
+template<>
+void LE<float, lang::Opencl>(const size_t num, const Block *in, const float x, Block *out, Context *ctx) {
+  cl_int status = CL_SUCCESS;
+
+  std::string kname = "clkernel_le";
+  auto kernel = ctx->kernels->at(kname);
+
+  cl::Buffer inbuf = *(static_cast<cl::Buffer*>(in->mutable_data()));
+  cl::Buffer outbuf = *(static_cast<cl::Buffer*>(out->mutable_data()));
+
+  kernel.setArg(0, (cl_int)num);
+  kernel.setArg(1, inbuf);
+  kernel.setArg(2, x);
+  kernel.setArg(3, outbuf);
+
+  status = ctx->ocl_cmdq.enqueueNDRangeKernel(kernel, cl::NDRange(0), cl::NDRange(num));
+  OCL_CHECK(status, "Failed to enqueue kernel function!");
+}
+
+
+// out[i] = natural log of in[i]
+template<>
+void Log<float, lang::Opencl>(const size_t num, const Block* in, Block* out, Context* ctx) {
+  cl_int status = CL_SUCCESS;
+
+  std::string kname = "clkernel_log";
+  auto kernel = ctx->kernels->at(kname);
+
+  cl::Buffer inbuf = *(static_cast<cl::Buffer*>(in->mutable_data()));
+  cl::Buffer outbuf = *(static_cast<cl::Buffer*>(out->mutable_data()));
+
+  kernel.setArg(0, (cl_int)num);
+  kernel.setArg(1, inbuf);
+  kernel.setArg(2, outbuf);
+
+  status = ctx->ocl_cmdq.enqueueNDRangeKernel(kernel, cl::NDRange(0), cl::NDRange(num));
+  OCL_CHECK(status, "Failed to enqueue kernel function!");
+}
+
+
+// out[i] = 1.0f if in[i] < x else 0.0f
+template<>
+void LT<float, lang::Opencl>(const size_t num, const Block *in, const float x, Block *out, Context *ctx) {
+  cl_int status = CL_SUCCESS;
+
+  std::string kname = "clkernel_lt";
+  auto kernel = ctx->kernels->at(kname);
+
+  cl::Buffer inbuf = *(static_cast<cl::Buffer*>(in->mutable_data()));
+  cl::Buffer outbuf = *(static_cast<cl::Buffer*>(out->mutable_data()));
+
+  kernel.setArg(0, (cl_int)num);
+  kernel.setArg(1, inbuf);
+  kernel.setArg(2, x);
+  kernel.setArg(3, outbuf);
+
+  status = ctx->ocl_cmdq.enqueueNDRangeKernel(kernel, cl::NDRange(0), cl::NDRange(num));
+  OCL_CHECK(status, "Failed to enqueue kernel function!");
+}
+
+
+// out[i] = 1.0f if in[i] >= x else 0.0f
+template<>
+void GE<float, lang::Opencl>(const size_t num, const Block *in, const float x, Block *out, Context *ctx) {
+  cl_int status = CL_SUCCESS;
+
+  std::string kname = "clkernel_ge";
+  auto kernel = ctx->kernels->at(kname);
+
+  cl::Buffer inbuf = *(static_cast<cl::Buffer*>(in->mutable_data()));
+  cl::Buffer outbuf = *(static_cast<cl::Buffer*>(out->mutable_data()));
+
+  kernel.setArg(0, (cl_int)num);
+  kernel.setArg(1, inbuf);
+  kernel.setArg(2, x);
+  kernel.setArg(3, outbuf);
+
+  status = ctx->ocl_cmdq.enqueueNDRangeKernel(kernel, cl::NDRange(0), cl::NDRange(num));
+  OCL_CHECK(status, "Failed to enqueue kernel function!");
+}
+
+
+// out[i] = 1.0f if in[i] > x else 0.0f
+template<>
+void GT<float, lang::Opencl>(const size_t num, const Block *in, const float x, Block *out, Context *ctx) {
+  cl_int status = CL_SUCCESS;
+
+  std::string kname = "clkernel_gt";
+  auto kernel = ctx->kernels->at(kname);
+
+  cl::Buffer inbuf = *(static_cast<cl::Buffer*>(in->mutable_data()));
+  cl::Buffer outbuf = *(static_cast<cl::Buffer*>(out->mutable_data()));
+
+  kernel.setArg(0, (cl_int)num);
+  kernel.setArg(1, inbuf);
+  kernel.setArg(2, x);
+  kernel.setArg(3, outbuf);
+
+  status = ctx->ocl_cmdq.enqueueNDRangeKernel(kernel, cl::NDRange(0), cl::NDRange(num));
+  OCL_CHECK(status, "Failed to enqueue kernel function!");
+}
+
+
+template<>
+void Pow<float, lang::Opencl>(const size_t num, const Block* in, float x, Block* out, Context* ctx) {
+  cl_int status = CL_SUCCESS;
+
+  std::string kname = "clkernel_pow_scalar";
+  auto kernel = ctx->kernels->at(kname);
+
+  cl::Buffer inbuf = *(static_cast<cl::Buffer*>(in->mutable_data()));
+  cl::Buffer outbuf = *(static_cast<cl::Buffer*>(out->mutable_data()));
+
+  kernel.setArg(0, (cl_int)num);
+  kernel.setArg(1, x);
+  kernel.setArg(2, inbuf);
+  kernel.setArg(3, outbuf);
+
+  status = ctx->ocl_cmdq.enqueueNDRangeKernel(kernel, cl::NDRange(0), cl::NDRange(num));
+  OCL_CHECK(status, "Failed to enqueue kernel function!");
+}
+
+
+template<>
+void Pow<float, lang::Opencl>(const size_t num, const Block* in1, const Block* in2, Block* out, Context* ctx) {
+  cl_int status = CL_SUCCESS;
+
+  std::string kname = "clkernel_pow";
+  auto kernel = ctx->kernels->at(kname);
+
+  cl::Buffer in1buf = *(static_cast<cl::Buffer*>(in1->mutable_data()));
+  cl::Buffer in2buf = *(static_cast<cl::Buffer*>(in2->mutable_data()));
+  cl::Buffer outbuf = *(static_cast<cl::Buffer*>(out->mutable_data()));
+
+  kernel.setArg(0, (cl_int)num);
+  kernel.setArg(1, in1buf);
+  kernel.setArg(2, in2buf);
+  kernel.setArg(3, outbuf);
+
+  status = ctx->ocl_cmdq.enqueueNDRangeKernel(kernel, cl::NDRange(0), cl::NDRange(num));
+  OCL_CHECK(status, "Failed to enqueue kernel function!");
+}
+
+
+template<>
+void ReLU<float, lang::Opencl>(const size_t num, const Block* in, Block* out, Context* ctx) {
+  cl_int status = CL_SUCCESS;
+
+  std::string kname = "clkernel_relu";
+  auto kernel = ctx->kernels->at(kname);
+
+  cl::Buffer inbuf = *(static_cast<cl::Buffer*>(in->mutable_data()));
+  cl::Buffer outbuf = *(static_cast<cl::Buffer*>(out->mutable_data()));
+
+  kernel.setArg(0, (cl_int)num);
+  kernel.setArg(1, inbuf);
+  kernel.setArg(2, outbuf);
+
+  status = ctx->ocl_cmdq.enqueueNDRangeKernel(kernel, cl::NDRange(0), cl::NDRange(num));
+  OCL_CHECK(status, "Failed to enqueue kernel function!");
+}
+
+template<>
+void Set<float, lang::Opencl>(const size_t num, const float x, Block* out, Context* ctx) {
+  cl_int status = CL_SUCCESS;
+
+  std::string kname = "clkernel_set";
+  auto kernel = ctx->kernels->at(kname);
+  
+  cl::Buffer outbuf = *(static_cast<cl::Buffer*>(out->mutable_data()));
+
+  kernel.setArg(0, (cl_int)num);
+  kernel.setArg(1, x);
+  kernel.setArg(2, outbuf);
+
+  status = ctx->ocl_cmdq.enqueueNDRangeKernel(kernel, cl::NDRange(0), cl::NDRange(num));
+  OCL_CHECK(status, "Failed to enqueue kernel function!");
+}
+
+
+template<>
+void Sigmoid<float, lang::Opencl>(const size_t num, const Block* in, Block* out, Context* ctx) {
+  // Launches the clkernel_sigmoid kernel with one work-item per element.
+  auto kernel = ctx->kernels->at("clkernel_sigmoid");
+
+  auto src = *(static_cast<cl::Buffer*>(in->mutable_data()));
+  auto dst = *(static_cast<cl::Buffer*>(out->mutable_data()));
+
+  kernel.setArg(0, static_cast<cl_int>(num));
+  kernel.setArg(1, src);
+  kernel.setArg(2, dst);
+
+  cl_int status = ctx->ocl_cmdq.enqueueNDRangeKernel(kernel, cl::NDRange(0), cl::NDRange(num));
+  OCL_CHECK(status, "Failed to enqueue kernel function!");
+}
+
+
+template<>
+void Sign<float, lang::Opencl>(const size_t num, const Block* in, Block* out, Context* ctx) {
+  // Launches the clkernel_sign kernel with one work-item per element.
+  auto kernel = ctx->kernels->at("clkernel_sign");
+
+  auto src = *(static_cast<cl::Buffer*>(in->mutable_data()));
+  auto dst = *(static_cast<cl::Buffer*>(out->mutable_data()));
+
+  kernel.setArg(0, static_cast<cl_int>(num));
+  kernel.setArg(1, src);
+  kernel.setArg(2, dst);
+
+  cl_int status = ctx->ocl_cmdq.enqueueNDRangeKernel(kernel, cl::NDRange(0), cl::NDRange(num));
+  OCL_CHECK(status, "Failed to enqueue kernel function!");
+}
+
+
+template<>
+void Sqrt<float, lang::Opencl>(const size_t num, const Block* in, Block* out, Context* ctx) {
+  // Launches the clkernel_sqrt kernel with one work-item per element.
+  auto kernel = ctx->kernels->at("clkernel_sqrt");
+
+  auto src = *(static_cast<cl::Buffer*>(in->mutable_data()));
+  auto dst = *(static_cast<cl::Buffer*>(out->mutable_data()));
+
+  kernel.setArg(0, static_cast<cl_int>(num));
+  kernel.setArg(1, src);
+  kernel.setArg(2, dst);
+
+  cl_int status = ctx->ocl_cmdq.enqueueNDRangeKernel(kernel, cl::NDRange(0), cl::NDRange(num));
+  OCL_CHECK(status, "Failed to enqueue kernel function!");
+}
+
+
+template<>
+void Square<float, lang::Opencl>(const size_t num, const Block* in, Block* out, Context* ctx) {
+  // Squaring is just the scalar-power kernel with a fixed exponent of 2.
+  Pow<float, lang::Opencl>(num, in, 2.0f, out, ctx);
+}
+
+
+template<>
+void Sub<float, lang::Opencl>(const size_t num, const Block* in, const float x, Block* out, Context* ctx) {
+  // Launches clkernel_subtract_scalar with one work-item per element.
+  auto kernel = ctx->kernels->at("clkernel_subtract_scalar");
+
+  auto src = *(static_cast<cl::Buffer*>(in->mutable_data()));
+  auto dst = *(static_cast<cl::Buffer*>(out->mutable_data()));
+
+  kernel.setArg(0, static_cast<cl_int>(num));
+  kernel.setArg(1, src);
+  kernel.setArg(2, x);
+  kernel.setArg(3, dst);
+
+  cl_int status = ctx->ocl_cmdq.enqueueNDRangeKernel(kernel, cl::NDRange(0), cl::NDRange(num));
+  OCL_CHECK(status, "Failed to enqueue kernel function!");
+}
+
+
+template<>
+void Sub<float, lang::Opencl>(const size_t num, const Block* in1, const Block* in2, Block* out, Context* ctx) {
+  // Element-wise subtraction via the clkernel_subtract device kernel.
+  auto kernel = ctx->kernels->at("clkernel_subtract");
+
+  auto lhs = *(static_cast<cl::Buffer*>(in1->mutable_data()));
+  auto rhs = *(static_cast<cl::Buffer*>(in2->mutable_data()));
+  auto dst = *(static_cast<cl::Buffer*>(out->mutable_data()));
+
+  kernel.setArg(0, static_cast<cl_int>(num));
+  kernel.setArg(1, lhs);
+  kernel.setArg(2, rhs);
+  kernel.setArg(3, dst);
+
+  cl_int status = ctx->ocl_cmdq.enqueueNDRangeKernel(kernel, cl::NDRange(0), cl::NDRange(num));
+  OCL_CHECK(status, "Failed to enqueue kernel function!");
+}
+
+
+template<>
+void Sum<float, lang::Opencl>(const size_t num, const Block* in, float* out, Context* ctx) {
+  // Reduces the `num` floats in `in` on the device (clkernel_reduce) and
+  // copies the single reduced value back into *out.
+  cl_int status = CL_SUCCESS;
+
+  std::string kname = "clkernel_reduce";
+  auto kernel = ctx->kernels->at(kname);
+
+  cl::Buffer inbuf = *(static_cast<cl::Buffer*>(in->mutable_data()));
+
+  size_t size = sizeof(float) * num;
+  cl::Buffer outval(ctx->ocl_ctx, CL_MEM_WRITE_ONLY, size, nullptr, &status);
+  OCL_CHECK(status, "Failed to create buffer!");
+
+  kernel.setArg(0, (cl_int)num);
+  kernel.setArg(1, inbuf);
+  kernel.setArg(2, outval);
+  kernel.setArg(3, cl::Local(size));
+
+  status = ctx->ocl_cmdq.enqueueNDRangeKernel(kernel, cl::NDRange(0), cl::NDRange(num));
+  OCL_CHECK(status, "Failed to enqueue kernel function!");
+
+  // Only element 0 of the result buffer is consumed, so read exactly one
+  // float straight into the caller's output.  (The previous version copied
+  // the whole buffer into a new[]'d array and released it with scalar
+  // delete, which is undefined behavior.)
+  status = ctx->ocl_cmdq.enqueueReadBuffer(outval, CL_TRUE, 0, sizeof(float), out);
+  OCL_CHECK(status, "Failed to read from buffer!");
+}
+
+
+template<>
+void Tanh<float, lang::Opencl>(const size_t num, const Block* in, Block* out, Context* ctx) {
+  // Launches the clkernel_tanh kernel with one work-item per element.
+  auto kernel = ctx->kernels->at("clkernel_tanh");
+
+  auto src = *(static_cast<cl::Buffer*>(in->mutable_data()));
+  auto dst = *(static_cast<cl::Buffer*>(out->mutable_data()));
+
+  kernel.setArg(0, static_cast<cl_int>(num));
+  kernel.setArg(1, src);
+  kernel.setArg(2, dst);
+
+  cl_int status = ctx->ocl_cmdq.enqueueNDRangeKernel(kernel, cl::NDRange(0), cl::NDRange(num));
+  OCL_CHECK(status, "Failed to enqueue kernel function!");
+}
+
+// **************************************
+// Random functions
+// **************************************
+
+/// Seed value required for generating distributions.
+/// NOTE(review): this seed is a compile-time constant and is never advanced
+/// between kernel launches, so repeated calls presumably replay the same
+/// random stream unless the kernels track their own counters -- confirm.
+static unsigned int seed[4] = {0, 32, 42, 888};
+/// Number of generation rounds used in the current algorithm.
+static cl_uint rounds = 8;
+
+template<>
+void Bernoulli<float, lang::Opencl>(const size_t num, const float p, Block* out, Context *ctx) {
+  // Fills `out` with Bernoulli(p) draws using the threefry4x32 PRNG kernel.
+  // NOTE(review): the launch uses num/4 work-items (integer division), so the
+  // trailing num % 4 elements are presumably left unwritten when num is not a
+  // multiple of 4 -- confirm against the kernel source.
+  cl_int status = CL_SUCCESS;
+
+  std::string kname = "PRNG_threefry4x32_bernoulli";
+  auto kernel = ctx->kernels->at(kname);
+  
+  cl::Buffer outbuf = *(static_cast<cl::Buffer*>(out->mutable_data()));
+
+  kernel.setArg(0, outbuf);
+  // Passes the whole 4-element uint array by value (16 bytes); assumes the
+  // kernel parameter is a uint4 -- TODO confirm.
+  kernel.setArg(1, seed);
+  kernel.setArg(2, 0.0f); // inf
+  kernel.setArg(3, 1.0f); // sup
+  kernel.setArg(4, p); // threshold
+  kernel.setArg(5, rounds);
+  kernel.setArg(6, cl_uint(num) / 4);
+  
+  status = ctx->ocl_cmdq.enqueueNDRangeKernel(kernel, cl::NDRange(0), cl::NDRange(num/4));
+  OCL_CHECK(status, "Failed to enqueue kernel function!");
+}
+
+
+template<>
+void Gaussian<float, lang::Opencl>(const size_t num, const float mean, const float std, Block* out, Context *ctx) {
+  // Fills `out` with draws from the threefry4x32 Gaussian PRNG kernel,
+  // launched with num / 4 work-items.
+  auto kernel = ctx->kernels->at("PRNG_threefry4x32_gaussian");
+
+  auto dst = *(static_cast<cl::Buffer*>(out->mutable_data()));
+
+  kernel.setArg(0, dst);
+  kernel.setArg(1, seed);
+  kernel.setArg(2, mean); // E
+  kernel.setArg(3, std);  // V
+  kernel.setArg(4, rounds);
+  kernel.setArg(5, cl_uint(num) / 4);
+
+  cl_int status = ctx->ocl_cmdq.enqueueNDRangeKernel(kernel, cl::NDRange(0), cl::NDRange(num / 4));
+  OCL_CHECK(status, "Failed to enqueue kernel function!");
+}
+
+
+template<>
+void Uniform<float, lang::Opencl>(const size_t num, const float low, const float high, Block* out, Context *ctx) {
+  // Fills `out` with draws from the threefry4x32 uniform PRNG kernel,
+  // launched with num / 4 work-items.  Unlike the sibling PRNG wrappers,
+  // every setArg result is checked individually.
+  auto kernel = ctx->kernels->at("PRNG_threefry4x32_uniform");
+
+  auto dst = *(static_cast<cl::Buffer*>(out->mutable_data()));
+
+  cl_int status = kernel.setArg(0, dst);
+  OCL_CHECK(status, "kernel arg 0");
+  status = kernel.setArg(1, seed);
+  OCL_CHECK(status, "kernel arg 1");
+  status = kernel.setArg(2, low);
+  OCL_CHECK(status, "kernel arg 2");
+  status = kernel.setArg(3, high);
+  OCL_CHECK(status, "kernel arg 3");
+  status = kernel.setArg(4, rounds);
+  OCL_CHECK(status, "kernel arg 4");
+  status = kernel.setArg(5, cl_uint(num) / 4);
+  OCL_CHECK(status, "kernel arg 5");
+
+  status = ctx->ocl_cmdq.enqueueNDRangeKernel(kernel, cl::NDRange(0), cl::NDRange(num / 4));
+  OCL_CHECK(status, "Failed to enqueue kernel function!");
+}
+
+// *********************************************************
+// BLAS functions, ref to http://docs.nvidia.com/cuda/cublas
+// *********************************************************
+
+template<>
+void Amax<float, lang::Opencl>(const size_t num, const Block* in, size_t* out, Context* ctx) {
+  // Reduces `in` with the clkernel_amax kernel and copies the resulting
+  // index into *out.  (Presumably BLAS iamax semantics -- index of the
+  // largest-magnitude element; the kernel defines the exact contract.)
+  cl_int status = CL_SUCCESS;
+
+  std::string kname = "clkernel_amax";
+  auto kernel = ctx->kernels->at(kname);
+
+  cl::Buffer inbuf = *(static_cast<cl::Buffer*>(in->mutable_data()));
+
+  size_t size = sizeof(size_t) * num;
+  cl::Buffer outval(ctx->ocl_ctx, CL_MEM_WRITE_ONLY, size, nullptr, &status);
+  OCL_CHECK(status, "Failed to create buffer!");
+
+  kernel.setArg(0, (cl_int)num);
+  kernel.setArg(1, inbuf);
+  kernel.setArg(2, outval);
+  kernel.setArg(3, cl::Local(size));
+  kernel.setArg(4, cl::Local(sizeof(size_t)));
+
+  status = ctx->ocl_cmdq.enqueueNDRangeKernel(kernel, cl::NDRange(0), cl::NDRange(num));
+  OCL_CHECK(status, "Failed to enqueue kernel function!");
+
+  // Only element 0 is consumed; read exactly one size_t into the caller's
+  // output.  (The previous version copied the whole buffer into a new[]'d
+  // array and freed it with scalar delete -- undefined behavior.)
+  status = ctx->ocl_cmdq.enqueueReadBuffer(outval, CL_TRUE, 0, sizeof(size_t), out);
+  OCL_CHECK(status, "Failed to read from buffer!");
+}
+
+
+template<>
+void Amin<float, lang::Opencl>(const size_t num, const Block* in, size_t* out, Context* ctx) {
+  // Reduces `in` with the clkernel_amin kernel and copies the resulting
+  // index into *out.  (Presumably BLAS iamin semantics; the kernel defines
+  // the exact contract.)
+  cl_int status = CL_SUCCESS;
+
+  std::string kname = "clkernel_amin";
+  auto kernel = ctx->kernels->at(kname);
+
+  cl::Buffer inbuf = *(static_cast<cl::Buffer*>(in->mutable_data()));
+
+  size_t size = sizeof(size_t) * num;
+  cl::Buffer outval(ctx->ocl_ctx, CL_MEM_WRITE_ONLY, size, nullptr, &status);
+  OCL_CHECK(status, "Failed to create buffer!");
+
+  kernel.setArg(0, (cl_int)num);
+  kernel.setArg(1, inbuf);
+  kernel.setArg(2, outval);
+  kernel.setArg(3, cl::Local(size));
+  kernel.setArg(4, cl::Local(sizeof(size_t)));
+
+  status = ctx->ocl_cmdq.enqueueNDRangeKernel(kernel, cl::NDRange(0), cl::NDRange(num));
+  OCL_CHECK(status, "Failed to enqueue kernel function!");
+
+  // Only element 0 is consumed; read exactly one size_t into the caller's
+  // output.  (The previous version copied the whole buffer into a new[]'d
+  // array and freed it with scalar delete -- undefined behavior.)
+  status = ctx->ocl_cmdq.enqueueReadBuffer(outval, CL_TRUE, 0, sizeof(size_t), out);
+  OCL_CHECK(status, "Failed to read from buffer!");
+}
+
+
+template<>
+void Asum<float, lang::Opencl>(const size_t num, const Block* in, float* out, Context* ctx) {
+  // Absolute-sum reduction via the clkernel_asum kernel; the single reduced
+  // value is copied back into *out.
+  cl_int status = CL_SUCCESS;
+
+  std::string kname = "clkernel_asum";
+  auto kernel = ctx->kernels->at(kname);
+
+  cl::Buffer inbuf = *(static_cast<cl::Buffer*>(in->mutable_data()));
+
+  size_t size = sizeof(float) * num;
+  cl::Buffer outval(ctx->ocl_ctx, CL_MEM_WRITE_ONLY, size, nullptr, &status);
+  OCL_CHECK(status, "Failed to create buffer!");
+
+  kernel.setArg(0, (cl_int)num);
+  kernel.setArg(1, inbuf);
+  kernel.setArg(2, outval);
+  kernel.setArg(3, cl::Local(size));
+
+  status = ctx->ocl_cmdq.enqueueNDRangeKernel(kernel, cl::NDRange(0), cl::NDRange(num));
+  OCL_CHECK(status, "Failed to enqueue kernel function!");
+
+  // Only element 0 is consumed; read exactly one float into the caller's
+  // output.  (The previous version copied the whole buffer into a new[]'d
+  // array and freed it with scalar delete -- undefined behavior.)
+  status = ctx->ocl_cmdq.enqueueReadBuffer(outval, CL_TRUE, 0, sizeof(float), out);
+  OCL_CHECK(status, "Failed to read from buffer!");
+}
+
+
+template<>
+void Axpy<float, lang::Opencl>(const size_t num, const float alpha, const Block* in, Block* out, Context* ctx) {
+  // BLAS-style axpy on the device via clkernel_axpy (the kernel defines the
+  // exact arithmetic).
+  auto kernel = ctx->kernels->at("clkernel_axpy");
+
+  auto src = *(static_cast<cl::Buffer*>(in->mutable_data()));
+  auto dst = *(static_cast<cl::Buffer*>(out->mutable_data()));
+
+  kernel.setArg(0, static_cast<cl_int>(num));
+  kernel.setArg(1, alpha);
+  kernel.setArg(2, src);
+  kernel.setArg(3, dst);
+
+  cl_int status = ctx->ocl_cmdq.enqueueNDRangeKernel(kernel, cl::NDRange(0), cl::NDRange(num));
+  OCL_CHECK(status, "Failed to enqueue kernel function!");
+}
+
+
+template<>
+void Nrm2<float, lang::Opencl>(const size_t num, const Block* in, float* out, Context* ctx) {
+  // Norm reduction over `num` floats via the clkernel_nrm2 kernel; the
+  // reduced value is copied back into *out.
+  cl_int status = CL_SUCCESS;
+
+  std::string kname = "clkernel_nrm2";
+  auto kernel = ctx->kernels->at(kname);
+
+  cl::Buffer inbuf = *(static_cast<cl::Buffer*>(in->mutable_data()));
+
+  size_t size = sizeof(float) * num;
+  cl::Buffer outval(ctx->ocl_ctx, CL_MEM_WRITE_ONLY, size, nullptr, &status);
+  OCL_CHECK(status, "Failed to create buffer!");
+
+  kernel.setArg(0, (cl_int)num);
+  kernel.setArg(1, inbuf);
+  kernel.setArg(2, outval);
+  // One float of local scratch per element, matching the other reduction
+  // wrappers in this file.  The previous argument requested
+  // sizeof(float) * pow(2, num) bytes of local memory -- exponential in the
+  // vector length, which overflows/exhausts device local memory for any
+  // realistic `num` and was clearly a typo.
+  kernel.setArg(3, cl::Local(size));
+
+  status = ctx->ocl_cmdq.enqueueNDRangeKernel(kernel, cl::NDRange(0), cl::NDRange(num));
+  OCL_CHECK(status, "Failed to enqueue kernel function!");
+
+  // Only element 0 is consumed; read exactly one float into the caller's
+  // output.  (Also removes the old new[]/scalar-delete mismatch.)
+  status = ctx->ocl_cmdq.enqueueReadBuffer(outval, CL_TRUE, 0, sizeof(float), out);
+  OCL_CHECK(status, "Failed to read from buffer!");
+}
+
+
+template<>
+void Scale<float, lang::Opencl>(const size_t num, const float x, Block* out, Context* ctx) {
+  // In-place scaling of `out` by scalar x via the clkernel_scale kernel.
+  auto kernel = ctx->kernels->at("clkernel_scale");
+
+  auto dst = *(static_cast<cl::Buffer*>(out->mutable_data()));
+
+  kernel.setArg(0, static_cast<cl_int>(num));
+  kernel.setArg(1, x);
+  kernel.setArg(2, dst);
+
+  cl_int status = ctx->ocl_cmdq.enqueueNDRangeKernel(kernel, cl::NDRange(0), cl::NDRange(num));
+  OCL_CHECK(status, "Failed to enqueue kernel function!");
+}
+
+
+template<>
+void Dot<float, lang::Opencl>(const size_t num, const Block *in1, const Block *in2, float *out, Context *ctx) {
+  // Dot-product reduction of in1 and in2 via the clkernel_dot kernel; the
+  // single reduced value is copied back into *out.
+  cl_int status = CL_SUCCESS;
+
+  std::string kname = "clkernel_dot";
+  auto kernel = ctx->kernels->at(kname);
+
+  cl::Buffer in1buf = *(static_cast<cl::Buffer*>(in1->mutable_data()));
+  cl::Buffer in2buf = *(static_cast<cl::Buffer*>(in2->mutable_data()));
+
+  size_t size = sizeof(float) * num;
+  cl::Buffer outval(ctx->ocl_ctx, CL_MEM_WRITE_ONLY, size, nullptr, &status);
+  OCL_CHECK(status, "Failed to create buffer!");
+
+  kernel.setArg(0, (cl_int)num);
+  kernel.setArg(1, in1buf);
+  kernel.setArg(2, in2buf);
+  kernel.setArg(3, outval);
+  kernel.setArg(4, cl::Local(size));
+
+  status = ctx->ocl_cmdq.enqueueNDRangeKernel(kernel, cl::NDRange(0), cl::NDRange(num));
+  OCL_CHECK(status, "Failed to enqueue kernel function!");
+
+  // Only element 0 is consumed; read exactly one float into the caller's
+  // output.  (The previous version copied the whole buffer into a new[]'d
+  // array and freed it with scalar delete -- undefined behavior.)
+  status = ctx->ocl_cmdq.enqueueReadBuffer(outval, CL_TRUE, 0, sizeof(float), out);
+  OCL_CHECK(status, "Failed to read from buffer!");
+}
+
+
+/// Matrix-vector multiply via the clkernel_gemv kernel (presumably BLAS gemv
+/// semantics: out = alpha*A*v + beta*out; the kernel defines the contract).
+/// NOTE(review): the `trans` flag is accepted but never forwarded to the
+/// kernel, so a requested transposed multiply is silently computed as
+/// non-transposed -- confirm whether any caller passes trans=true.
+template<>
+void GEMV<float, lang::Opencl>(bool trans, const size_t m, const size_t n, const float alpha,
+		  const Block *A, const Block *v, const float beta, Block* out, Context* ctx) {
+  cl_int status = CL_SUCCESS;
+
+  std::string kname = "clkernel_gemv";
+  auto kernel = ctx->kernels->at(kname);
+
+  cl::Buffer Abuf = *(static_cast<cl::Buffer*>(A->mutable_data()));
+  cl::Buffer vbuf = *(static_cast<cl::Buffer*>(v->mutable_data()));
+  cl::Buffer outbuf = *(static_cast<cl::Buffer*>(out->mutable_data()));
+
+  kernel.setArg(0, (cl_int)m);
+  kernel.setArg(1, (cl_int)n);
+  kernel.setArg(2, alpha);
+  kernel.setArg(3, Abuf);
+  kernel.setArg(4, vbuf);
+  kernel.setArg(5, beta);
+  kernel.setArg(6, outbuf);
+
+  // 2-D launch: one work-item per (row, column) pair.
+  status = ctx->ocl_cmdq.enqueueNDRangeKernel(kernel, cl::NDRange(0), cl::NDRange(m, n));
+  OCL_CHECK(status, "Failed to enqueue kernel function!");
+}
+
+
+/// Diagonal-matrix multiply: scales M by the vector v expanded to a diagonal
+/// matrix, from the right or the left depending on `side_right`.
+template<>
+void DGMM<float, lang::Opencl>(bool side_right,
+		  const size_t nrow, const size_t ncol,
+		  const Block *M, const Block *v, Block *out, Context *ctx) {
+  cl_int status = CL_SUCCESS;
+
+  cl::Buffer Mbuf = *(static_cast<cl::Buffer*>(M->mutable_data()));
+  cl::Buffer vbuf = *(static_cast<cl::Buffer*>(v->mutable_data()));
+  cl::Buffer outbuf = *(static_cast<cl::Buffer*>(out->mutable_data()));
+
+  // NOTE(review): DiagVec_* is called with vbuf as BOTH source and
+  // destination, and a length-n vector buffer presumably cannot hold an n*n
+  // diagonal matrix -- confirm the buffer sizing and whether the kernel
+  // tolerates in-place expansion.
+  std::string kname;
+  if (side_right) {
+	DiagVec_Right(ncol, vbuf, vbuf, ctx);
+	kname = "clkernel_dgmm_right";
+  } else {
+	DiagVec_Left(nrow, vbuf, vbuf, ctx);
+	kname = "clkernel_dgmm_left";
+  }
+
+  auto kernel = ctx->kernels->at(kname);
+
+  kernel.setArg(0, (cl_int)nrow);
+  kernel.setArg(1, (cl_int)ncol);
+  kernel.setArg(2, Mbuf);
+  kernel.setArg(3, vbuf);
+  kernel.setArg(4, outbuf);
+  // NOTE(review): sizeof(float)*nrow*ncol of local memory can exceed
+  // CL_DEVICE_LOCAL_MEM_SIZE (commonly 32-48 KB) for modest matrices --
+  // confirm the supported problem sizes.
+  kernel.setArg(5, cl::Local(sizeof(float) * nrow * ncol));
+
+  cl::NDRange global(nrow); // Only nrow because current implementation is 1 dimensional
+//  cl::NDRange local();
+
+  status = ctx->ocl_cmdq.enqueueNDRangeKernel(kernel, cl::NDRange(0), global);
+  OCL_CHECK(status, "Failed to enqueue kernel function!");
+}
+
+
+/// General matrix-matrix multiply via the clkernel_gemm kernel (presumably
+/// BLAS gemm semantics: C = alpha*op(A)*op(B) + beta*C).
+template<>
+void GEMM<float, lang::Opencl>(const bool transA, const bool transB,
+		  const size_t nrowA, const size_t ncolB, const size_t ncolA,
+		  const float alpha, const Block *A, const Block *B, const float beta,
+		  Block *C, Context *ctx) {
+  cl_int status = CL_SUCCESS;
+
+  std::string kname = "clkernel_gemm";
+  auto kernel = ctx->kernels->at(kname);
+  
+  cl::Buffer Abuf = *(static_cast<cl::Buffer*>(A->mutable_data()));
+  cl::Buffer Bbuf = *(static_cast<cl::Buffer*>(B->mutable_data()));
+  cl::Buffer Cbuf = *(static_cast<cl::Buffer*>(C->mutable_data()));
+
+  // NOTE(review): both transposes below run with the SAME buffer as source
+  // and destination, which also mutates the caller's input blocks in place --
+  // confirm the transpose kernel tolerates aliasing and that callers do not
+  // reuse A/B afterwards.
+  // If matrix A needs to be transposed, do it.
+  if (transA)
+	Transpose(nrowA, ncolA, Abuf, Abuf, ctx);
+
+  // If vector B needs to be transposed, do it.
+  if (transB)
+	Transpose(nrowA, ncolB, Bbuf, Bbuf, ctx);
+
+  kernel.setArg(0, (cl_int)nrowA);
+  kernel.setArg(1, (cl_int)ncolB);
+  kernel.setArg(2, (cl_int)ncolA);
+  kernel.setArg(3, alpha);
+  kernel.setArg(4, Abuf);
+  kernel.setArg(5, Bbuf);
+  kernel.setArg(6, beta);
+  kernel.setArg(7, Cbuf);
+  // NOTE(review): two local blocks of sizeof(float)*nrowA*ncolB can exceed
+  // CL_DEVICE_LOCAL_MEM_SIZE for modest matrices -- confirm supported sizes.
+  kernel.setArg(8, cl::Local(sizeof(float) * nrowA * ncolB));
+  kernel.setArg(9, cl::Local(sizeof(float) * nrowA * ncolB));
+  
+// TODO: Try to make the work group size a power of 2 given an arbitrary matrix.
+  // NOTE(review): a single work-group of nrowA x ncolB items will exceed
+  // CL_KERNEL_WORK_GROUP_SIZE for anything but tiny matrices -- verify.
+  cl::NDRange global(nrowA, ncolB);
+  cl::NDRange local(nrowA, ncolB);
+  
+  status = ctx->ocl_cmdq.enqueueNDRangeKernel(kernel, cl::NDRange(0), global, local);
+  OCL_CHECK(status, "Failed to enqueue kernel function!");
+}
+
+template <>
+void ComputeCrossEntropy<float, lang::Opencl>(const size_t batchsize, const size_t dim,
+                         const Block *p, const Block *t, Block *loss,
+                         Context *ctx) {
+  // One work-item per batch row; the clkernel_crossentropy kernel reads the
+  // prediction (p) and target (t) buffers and writes into the loss buffer.
+  auto kernel = ctx->kernels->at("clkernel_crossentropy");
+
+  auto pred   = *(static_cast<cl::Buffer*>(p->mutable_data()));
+  auto target = *(static_cast<cl::Buffer*>(t->mutable_data()));
+  auto result = *(static_cast<cl::Buffer*>(loss->mutable_data()));
+
+  kernel.setArg(0, static_cast<cl_uint>(batchsize));
+  kernel.setArg(1, static_cast<cl_uint>(dim));
+  kernel.setArg(2, pred);
+  kernel.setArg(3, target);
+  kernel.setArg(4, result);
+
+  cl_int status = ctx->ocl_cmdq.enqueueNDRangeKernel(kernel, cl::NDRange(0), cl::NDRange(batchsize));
+  OCL_CHECK(status, "Failed to enqueue kernel function!");
+}
+
+template <>
+void SoftmaxCrossEntropyBwd<float, lang::Opencl>(const size_t batchsize, const size_t dim,
+                            const Block *p, const Block *t, Block *grad,
+                            Context *ctx) {
+  // One work-item per batch row; the clkernel_softmaxentropy kernel reads the
+  // prediction (p) and target (t) buffers and writes into the gradient buffer.
+  auto kernel = ctx->kernels->at("clkernel_softmaxentropy");
+
+  auto pred   = *(static_cast<cl::Buffer*>(p->mutable_data()));
+  auto target = *(static_cast<cl::Buffer*>(t->mutable_data()));
+  auto result = *(static_cast<cl::Buffer*>(grad->mutable_data()));
+
+  kernel.setArg(0, static_cast<cl_uint>(batchsize));
+  kernel.setArg(1, static_cast<cl_uint>(dim));
+  kernel.setArg(2, pred);
+  kernel.setArg(3, target);
+  kernel.setArg(4, result);
+
+  cl_int status = ctx->ocl_cmdq.enqueueNDRangeKernel(kernel, cl::NDRange(0), cl::NDRange(batchsize));
+  OCL_CHECK(status, "Failed to enqueue kernel function!");
+}
+
+// **************************************
+// Matrix functions
+// **************************************
+/*
+template<>
+void AddCol<float, lang::Opencl>(const size_t nrow, const size_t ncol, const Block* A, const Block* v, Block* out, Context* ctx) {
+  std::string kname = "clkernel_addcol";
+  auto kernel = ctx->kernels->at(kname);
+  kernel.setArg(0, (cl_int)nrow);
+  kernel.setArg(1, (cl_int)ncol);
+  kernel.setArg(2, static_cast<const float*>(A->mutable_data()));
+  kernel.setArg(3, static_cast<const float*>(v->mutable_data()));
+  kernel.setArg(3, static_cast<float*>(out->mutable_data()));
+
+  ctx->ocl_cmdq.enqueueNDRangeKernel(kernel, cl::NDRange(0), cl::NDRange(nrow, ncol));
+}
+
+template<>
+void AddRow<float, lang::Opencl>(const size_t nrow, const size_t ncol, const Block* A, const Block* v, Block* out, Context* ctx) {
+  std::string kname = "clkernel_addrow";
+  auto kernel = ctx->kernels->at(kname);
+  kernel.setArg(0, (cl_int)nrow);
+  kernel.setArg(1, (cl_int)ncol);
+  kernel.setArg(2, static_cast<const float*>(A->mutable_data()));
+  kernel.setArg(3, static_cast<const float*>(v->mutable_data()));
+  kernel.setArg(3, static_cast<float*>(out->mutable_data()));
+
+  ctx->ocl_cmdq.enqueueNDRangeKernel(kernel, cl::NDRange(0), cl::NDRange(nrow, ncol));
+}
+
+template<>
+void Outer<float, lang::Opencl>(const size_t m, const size_t n, const Block* lhs, const Block* rhs, Block* out, Context* ctx) {
+  std::string kname = "clkernel_outerproduct";
+  auto kernel = ctx->kernels->at(kname);
+  kernel.setArg(0, (cl_int)m);
+  kernel.setArg(1, (cl_int)n);
+  kernel.setArg(2, static_cast<const float*>(lhs->data()));
+  kernel.setArg(3, static_cast<const float*>(rhs->data()));
+  kernel.setArg(4, static_cast<float*>(out->mutable_data()));
+
+  ctx->ocl_cmdq.enqueueNDRangeKernel(kernel, cl::NDRange(0), cl::NDRange(m, n));
+}
+
+template<>
+void SumColumns<float, lang::Opencl>(const size_t nrow, const size_t ncol, const Block* in, Block* out, Context* ctx) {
+  std::string kname = "clkernel_sumcol";
+  auto kernel = ctx->kernels->at(kname);
+  kernel.setArg(0, (cl_int)nrow);
+  kernel.setArg(1, (cl_int)ncol);
+  kernel.setArg(2, static_cast<const float*>(in->mutable_data()));
+  kernel.setArg(3, static_cast<float*>(out->mutable_data()));
+
+  ctx->ocl_cmdq.enqueueNDRangeKernel(kernel, cl::NDRange(0), cl::NDRange(nrow, ncol));
+}*/
+/*
+template<>
+void SumRows<float, lang::Opencl>(const size_t nrow, const size_t ncol, const Block* in, Block* out, Context* ctx) {
+  cl_int status = CL_SUCCESS;
+
+  std::string kname = "clkernel_sumrow";
+  auto kernel = ctx->kernels->at(kname);
+
+  cl::Buffer inbuf = *(static_cast<cl::Buffer*>(in->mutable_data()));
+  cl::Buffer outbuf = *(static_cast<cl::Buffer*>(out->mutable_data()));
+
+  kernel.setArg(0, (cl_int)nrow);
+  kernel.setArg(1, (cl_int)ncol);
+  kernel.setArg(2, inbuf);
+  kernel.setArg(3, outbuf);
+  kernel.setArg(4, cl::Local(sizeof(float) * nrow * ncol));
+
+  status = ctx->ocl_cmdq.enqueueNDRangeKernel(kernel, cl::NDRange(0), cl::NDRange(nrow, ncol));
+}
+*/
+
+
+#define BLOCK_DIM 16
+
+/// Transposes an nrow x ncol matrix on the device using the
+/// clkernel_transpose kernel with a (BLOCK_DIM+1) x BLOCK_DIM local-memory
+/// tile (the +1 column presumably avoids bank conflicts -- confirm against
+/// the kernel source).
+void Transpose(const size_t nrow, const size_t ncol, cl::Buffer& in, cl::Buffer& out, Context* ctx) {
+  cl_int status = CL_SUCCESS;
+
+  std::string kname = "clkernel_transpose";
+  auto kernel = ctx->kernels->at(kname);
+
+  kernel.setArg(0, (cl_uint)nrow);
+  kernel.setArg(1, (cl_uint)ncol);
+  kernel.setArg(2, in);
+  kernel.setArg(3, out);
+  // cl::Local takes a size in BYTES; the previous code omitted
+  // sizeof(float), under-allocating the float tile by a factor of 4.
+  kernel.setArg(4, cl::Local(sizeof(float) * (BLOCK_DIM + 1) * BLOCK_DIM));
+
+  status = ctx->ocl_cmdq.enqueueNDRangeKernel(kernel, cl::NDRange(0), cl::NDRange(nrow, ncol));
+  OCL_CHECK(status, "Failed to enqueue kernel function!");
+}
+
+#undef BLOCK_DIM
+
+
+/// Utility: expands a length-`size` vector into a diagonal matrix via the
+/// clkernel_diagvec_left kernel (only positions where nx == ny carry values;
+/// the kernel defines the exact output layout).
+void DiagVec_Left(const size_t size, cl::Buffer& in, cl::Buffer& out, Context* ctx) {
+  auto kernel = ctx->kernels->at("clkernel_diagvec_left");
+
+  kernel.setArg(0, static_cast<cl_uint>(size));
+  kernel.setArg(1, in);
+  kernel.setArg(2, out);
+
+  cl_int status = ctx->ocl_cmdq.enqueueNDRangeKernel(kernel, cl::NDRange(0), cl::NDRange(size));
+  OCL_CHECK(status, "Failed to enqueue kernel function!");
+}
+
+/// Right-hand counterpart of DiagVec_Left, backed by the
+/// clkernel_diagvec_right kernel.
+void DiagVec_Right(const size_t size, cl::Buffer& in, cl::Buffer& out, Context* ctx) {
+  auto kernel = ctx->kernels->at("clkernel_diagvec_right");
+
+  kernel.setArg(0, static_cast<cl_uint>(size));
+  kernel.setArg(1, in);
+  kernel.setArg(2, out);
+
+  cl_int status = ctx->ocl_cmdq.enqueueNDRangeKernel(kernel, cl::NDRange(0), cl::NDRange(size));
+  OCL_CHECK(status, "Failed to enqueue kernel function!");
+}
+
+} // namespace singa
+
+#endif // USE_OPENCL
+
+#endif  // SINGA_CORE_TENSOR_TENSOR_MATH_OPENCL_H_
diff --git a/src/driver.cc b/src/driver.cc
deleted file mode 100644
index 2e38e53..0000000
--- a/src/driver.cc
+++ /dev/null
@@ -1,402 +0,0 @@
-/************************************************************
-*
-* Licensed to the Apache Software Foundation (ASF) under one
-* or more contributor license agreements.  See the NOTICE file
-* distributed with this work for additional information
-* regarding copyright ownership.  The ASF licenses this file
-* to you under the Apache License, Version 2.0 (the
-* "License"); you may not use this file except in compliance
-* with the License.  You may obtain a copy of the License at
-*
-*   http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an
-* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-* KIND, either express or implied.  See the License for the
-* specific language governing permissions and limitations
-* under the License.
-*
-*************************************************************/
-
-#include "singa/driver.h"
-
-#include <glog/logging.h>
-#include <set>
-#include <string>
-#include <vector>
-#include "singa/comm/socket.h"
-#include "singa/neuralnet/layer.h"
-#include "singa/utils/common.h"
-#include "singa/utils/tinydir.h"
-#include "singa/utils/cluster.h"
-#include "singa/utils/context.h"
-#include "singa/proto/job.pb.h"
-#include "singa/server.h"
-#include "singa/stub.h"
-#include "singa/worker.h"
-
-#include "singa/neuralnet/connection_layer.h"
-#include "singa/neuralnet/input_layer.h"
-#include "singa/neuralnet/loss_layer.h"
-#include "singa/neuralnet/neuron_layer.h"
-#include "singa/neuralnet/output_layer.h"
-
-extern "C" void openblas_set_num_threads(int num);
-
-namespace singa {
-
-void Driver::Init(int argc, char **argv) {
-  // unique job ID generated from singa-run.sh, passed in as "-singa_job <id>"
-  int arg_pos = ArgPos(argc, argv, "-singa_job");
-  job_id_ = (arg_pos != -1) ? atoi(argv[arg_pos + 1]) : -1;
-  // global signa conf passed by singa-run.sh as "-singa_conf <path>"
-  arg_pos = ArgPos(argc, argv, "-singa_conf");
-  if (arg_pos != -1)
-    ReadProtoFromTextFile(argv[arg_pos + 1], &singa_conf_);
-  // set log path
-  if (singa_conf_.has_log_dir())
-    SetupLog(singa_conf_.log_dir(), "driver");
-  // job conf passed by users as "-conf <path>"
-  arg_pos = ArgPos(argc, argv, "-conf");
-  if (arg_pos != -1)
-    ReadProtoFromTextFile(argv[arg_pos + 1], &job_conf_);
-  arg_pos = ArgPos(argc, argv, "-host");
-  if (arg_pos != -1)
-    hostip_ = argv[arg_pos + 1];
-  else
-    hostip_ = "localhost";
-
-  // register layers
-
-  // input and output layers
-  RegisterLayer<RecordInputLayer, int>(kRecordInput);
-  RegisterLayer<CSVInputLayer, int>(kCSVInput);
-  RegisterLayer<ImagePreprocessLayer, int>(kImagePreprocess);
-  RegisterLayer<RecordOutputLayer, int>(kRecordOutput);
-  RegisterLayer<CSVOutputLayer, int>(kCSVOutput);
-  RegisterLayer<CharRNNInputLayer, int>(kCharRNN);
-  RegisterLayer<RNNLabelLayer, int>(kRNNLabel);
-  RegisterLayer<OneHotLayer, int>(kOneHot);
-  RegisterLayer<CharRNNOutputLayer, int>(kCharRNNOutput);
-
-  // connection layers
-  RegisterLayer<BridgeDstLayer, int>(kBridgeDst);
-  RegisterLayer<BridgeSrcLayer, int>(kBridgeSrc);
-  RegisterLayer<ConcateLayer, int>(kConcate);
-  RegisterLayer<SliceLayer, int>(kSlice);
-  RegisterLayer<SplitLayer, int>(kSplit);
-  RegisterLayer<RNNDummyLayer, int>(kRNNDummy);
-
-  RegisterLayer<AccuracyLayer, int>(kAccuracy);
-  RegisterLayer<ArgSortLayer, int>(kArgSort);
-  RegisterLayer<ConvolutionLayer, int>(kConvolution);
-  RegisterLayer<CConvolutionLayer, int>(kCConvolution);
-  RegisterLayer<CPoolingLayer, int>(kCPooling);
-  RegisterLayer<EmbeddingLayer, int>(kEmbedding);
-  RegisterLayer<ActivationLayer, int>(kActivation);
-
-#ifdef USE_CUDNN
-  RegisterLayer<CudnnActivationLayer, int>(kCudnnActivation);
-  RegisterLayer<CudnnConvLayer, int>(kCudnnConv);
-  RegisterLayer<CudnnPoolLayer, int>(kCudnnPool);
-  RegisterLayer<CudnnLRNLayer, int>(kCudnnLRN);
-  RegisterLayer<CudnnSoftmaxLayer, int>(kCudnnSoftmax);
-  RegisterLayer<CudnnSoftmaxLossLayer, int>(kCudnnSoftmaxLoss);
-#if CUDNN_MAJOR == 4
-  RegisterLayer<CudnnBMLayer, int>(kCudnnBM);
-#endif
-#endif
-
-  RegisterLayer<DropoutLayer, int>(kDropout);
-  RegisterLayer<DummyLayer, int>(kDummy);
-  RegisterLayer<EuclideanLossLayer, int>(kEuclideanLoss);
-  RegisterLayer<InnerProductLayer, int>(kInnerProduct);
-  RegisterLayer<LabelLayer, int>(kLabel);
-  RegisterLayer<LRNLayer, int>(kLRN);
-  RegisterLayer<MnistLayer, int>(kMnist);
-  RegisterLayer<PoolingLayer, int>(kPooling);
-  RegisterLayer<RBMHidLayer, int>(kRBMHid);
-  RegisterLayer<RBMVisLayer, int>(kRBMVis);
-  RegisterLayer<RGBImageLayer, int>(kRGBImage);
-  RegisterLayer<ReLULayer, int>(kReLU);
-  RegisterLayer<ShardDataLayer, int>(kShardData);
-  RegisterLayer<SigmoidLayer, int>(kSigmoid);
-  RegisterLayer<SoftmaxLossLayer, int>(kSoftmaxLoss);
-  RegisterLayer<STanhLayer, int>(kSTanh);
-  RegisterLayer<SoftmaxLayer, int>(kSoftmax);
-  RegisterLayer<GRULayer, int>(kGRU);
-  RegisterLayer<BMLayer, int>(kBM);
-
-#ifdef USE_LMDB
-  RegisterLayer<LMDBDataLayer, int>(kLMDBData);
-#endif
-
-  // register updaters
-  RegisterUpdater<AdaGradUpdater>(kAdaGrad);
-  RegisterUpdater<NesterovUpdater>(kNesterov);
-  RegisterUpdater<RMSPropUpdater>(kRMSProp);
-  RegisterUpdater<AdaDeltaUpdater>(kAdaDelta);
-  RegisterUpdater<AdamUpdater>(kAdam);
-  RegisterUpdater<AdamMaxUpdater>(kAdamMax);
-
-  RegisterUpdater<SGDUpdater>(kSGD);
-
-  // register learning rate change methods
-  RegisterLRGenerator<LRGenerator>(kFixed);
-  RegisterLRGenerator<FixedStepLRGen>(kFixedStep);
-  RegisterLRGenerator<StepLRGen>(kStep);
-  RegisterLRGenerator<LinearLRGen>(kLinear);
-  RegisterLRGenerator<ExpLRGen>(kExponential);
-  RegisterLRGenerator<InvLRGen>(kInverse);
-  RegisterLRGenerator<InvTLRGen>(kInverseT);
-
-  // register workers
-  RegisterWorker<BPWorker>(kBP);
-  RegisterWorker<BPTTWorker>(kBPTT);
-  RegisterWorker<CDWorker>(kCD);
-
-  // register params
-  RegisterParam<Param>(0);
-
-  // register param init methods
-  RegisterParamGenerator<ParamGenerator>(kConstant);
-  RegisterParamGenerator<GaussianGen>(kGaussian);
-  RegisterParamGenerator<UniformGen>(kUniform);
-  RegisterParamGenerator<GaussianSqrtFanInGen>(kGaussianSqrtFanIn);
-  RegisterParamGenerator<UniformSqrtFanInGen>(kUniformSqrtFanIn);
-  RegisterParamGenerator<UniformSqrtFanInOutGen>(kUniformSqrtFanInOut);
-}
-
-void Driver::InitLog(char* arg) {
-    google::InitGoogleLogging(arg);
-}
-
-void Driver::Train(bool resume, const std::string str) {
-  JobProto job_conf;
-  job_conf.ParseFromString(str);
-  Train(resume, job_conf);
-}
-
-void Driver::Train(bool resume, const JobProto& job_conf) {
-  if (singa_conf_.has_log_dir())
-    SetupLog(singa_conf_.log_dir(),
-        std::to_string(job_id_) + "-" + job_conf.name());
-  Cluster::Setup(job_id_, singa_conf_, job_conf.cluster());
-  tinydir_dir workspace;
-  if (tinydir_open(&workspace, job_conf.cluster().workspace().c_str()) == -1)
-    LOG(FATAL) << "workspace not exist: " << job_conf.cluster().workspace();
-  if (job_conf.num_openblas_threads() != 1)
-    LOG(WARNING) << "openblas luanches "
-                 << job_conf.num_openblas_threads() << " threads";
-  openblas_set_num_threads(job_conf.num_openblas_threads());
-
-  JobProto job;
-  job.CopyFrom(job_conf);
-  if (resume)
-    SetupForResume(&job);
-  job.set_id(job_id_);
-  Train(job);
-}
-
-void Driver::Test(const std::string str) {
-  JobProto job_conf;
-  job_conf.ParseFromString(str);
-  Test(job_conf);
-}
-
-void Driver::Test(const JobProto& job_conf) {
-  Cluster::Setup(job_id_, singa_conf_, job_conf.cluster());
-  Cluster::Get()->Register(getpid(), "localhost");
-  // TODO(wangwei) extend to a group with multiple workers
-  auto worker = Worker::Create(job_conf.train_one_batch());
-  worker->Setup(0, 0, job_conf, nullptr, nullptr, nullptr);
-  auto net = NeuralNet::Create(job_conf.neuralnet(), kTest, 1);
-  WriteStringToTextFile(Cluster::Get()->vis_folder() + "/test_net.json",
-      net->ToGraph(true).ToJson());
-  vector<string> paths;
-  for (const auto& p : job_conf.checkpoint_path())
-    paths.push_back(p);
-  net->Load(paths);
-  worker->Test(job_conf.test_steps(), kTest,  net);
-}
-
-void Driver::Train(const JobProto& job_conf) {
-  auto cluster = Cluster::Get();
-  int nserver_grps = cluster->nserver_groups();
-  int grp_size = cluster->nworkers_per_group();
-  Stub stub;
-  // no need to create Stub if there is only a single worker without servers,
-  // i.e., the training will be conducted by the single worker.
-  if (grp_size > 1 || nserver_grps > 0) {
-    auto router = new Router();
-    if (cluster->nprocs() > 1) {
-      int binding_port = router->Bind("tcp://" + hostip_ + ":*");
-      cluster->Register(getpid(), hostip_ + ":" + std::to_string(binding_port));
-    } else {
-      cluster->Register(getpid(), hostip_ + ":0");  // fake endpoint
-    }
-    stub.set_router(router);
-  }
-
-  NeuralNet* net = NeuralNet::Create(job_conf.neuralnet(), kTrain, grp_size);
-  WriteStringToTextFile(cluster->vis_folder() + "/train_net.json",
-      net->ToGraph(true).ToJson());
-  const vector<Worker*> workers = CreateWorkers(job_conf, net);
-  const vector<Server*> servers = CreateServers(job_conf, net);
-
-  vector<std::thread> threads;
-  for (auto server : servers)
-    threads.push_back(std::thread(&Server::Run, server));
-  int gpu = 0;
-  auto context = Singleton<Context>::Instance();
-  // CHECK_LE(workers.size(), job_conf.gpu_size());
-  for (auto worker : workers) {
-    threads.push_back(std::thread(&Worker::Run, worker));
-    int device_id  = -1;
-    if (gpu < job_conf.gpu_size()) {
-      device_id = job_conf.gpu(gpu++);
-    }
-    context->SetupDevice(threads.back().get_id(), device_id);
-  }
-  if (grp_size > 1 || nserver_grps > 0) {
-    int nservers_per_grp = cluster->nservers_per_group();
-    int lcm = LeastCommonMultiple(nservers_per_grp, nserver_grps);
-    auto slices = Param::ComputeSlices(lcm, net->params());
-    auto slice2server = PartitionSlices(nservers_per_grp, slices);
-    stub.Run(slice2server, workers, servers);
-  }
-
-  for (auto& thread : threads)
-    thread.join();
-  for (auto server : servers)
-    delete server;
-  delete net;
-  std::set<NeuralNet*> deleted{net, nullptr};
-  for (auto worker : workers) {
-    for (auto ptr : worker->GetNets())
-    if (deleted.find(ptr) == deleted.end()) {
-      delete ptr;
-      deleted.insert(ptr);
-    }
-    delete worker;
-  }
-}
-
-void Driver::SetupForResume(JobProto* job_conf) {
-  tinydir_dir dir;
-  std::string folder = Cluster::Get()->checkpoint_folder();
-  tinydir_open(&dir, folder.c_str());
-  int latest_step = 0;
-  // there would be multi checkpoint files (from diff workers) for one step
-  vector<std::string> ck_files;
-  // iterate all files to get the files for the last checkpoint
-  while (dir.has_next) {
-    tinydir_file file;
-    tinydir_readfile(&dir, &file);
-    tinydir_next(&dir);
-    char* ch = strstr(file.name, "step");
-    if (ch == nullptr) {
-      if (file.name[0] != '.')
-        LOG(INFO) << "Irregular file in checkpoint folder: " << file.name;
-      continue;
-    }
-    LOG(INFO) << "Add checkpoint file for resume: " << ch;
-    int step = atoi(ch+4);
-    if (step == latest_step) {
-      ck_files.push_back(file.name);
-    } else if (step > latest_step) {
-      latest_step = step;
-      ck_files.clear();
-      ck_files.push_back(std::string(file.name));
-    }
-  }
-  if (latest_step > 0) {
-    job_conf->set_step(latest_step);
-    if (!job_conf->has_reset_param_version())
-      job_conf->set_reset_param_version(false);
-    job_conf->clear_checkpoint_path();
-    for (auto ck_file : ck_files)
-      job_conf->add_checkpoint_path(folder + "/" + ck_file);
-  }
-  tinydir_close(&dir);
-}
-
-const vector<Worker*> Driver::CreateWorkers(const JobProto& job_conf,
-    NeuralNet* net) {
-  auto cluster = Cluster::Get();
-  vector<Worker*> workers;
-  if (!cluster->has_worker()) return workers;
-  int wgrp_size = cluster->nworkers_per_group();
-  int nservers_per_grp = cluster->nservers_per_group();
-  int nserver_grps = cluster->nserver_groups();
-  int lcm = LeastCommonMultiple(nserver_grps, nservers_per_grp);
-  const vector<int> rng = cluster->ExecutorRng(cluster->procs_id(),
-      cluster->nworkers_per_group(), cluster->nworkers_per_procs());
-  int gstart = rng[0], gend = rng[1], wstart = rng[2], wend = rng[3];
-  for (int gid = gstart; gid < gend; gid++) {
-    NeuralNet* train_net = nullptr, *test_net = nullptr, *val_net = nullptr;
-    if (gid == gstart) {
-      train_net = net;
-      Param::SliceParams(lcm, train_net->params());
-      // test and validation are performed by the 1st group.
-      if (gid == 0 && job_conf.test_steps() > 0) {
-        test_net = NeuralNet::Create(job_conf.neuralnet(), kTest, 1);
-        test_net->ShareParamsFrom(train_net, false);
-      }
-      if (gid == 0 && job_conf.validate_steps() > 0) {
-        val_net = NeuralNet::Create(job_conf.neuralnet(), kVal, 1);
-        val_net->ShareParamsFrom(train_net, false);
-      }
-    } else {
-      train_net = NeuralNet::Create(job_conf.neuralnet(), kTrain, wgrp_size);
-      if (cluster->share_memory()) {
-        train_net->ShareParamsFrom(net, true);
-      } else {
-        Param::SliceParams(lcm, train_net->params());
-      }
-    }
-    for (int wid = wstart; wid < wend; wid++) {
-      auto *worker = Worker::Create(job_conf.train_one_batch());
-      // TODO(wangwei) extend to test among workers in a grp
-      if (wid == 0)
-        worker->Setup(gid, wid, job_conf, train_net, val_net, test_net);
-      else
-        worker->Setup(gid, wid, job_conf, train_net, nullptr, nullptr);
-      workers.push_back(worker);
-    }
-  }
-  return workers;
-}
-
-const vector<Server*> Driver::CreateServers(const JobProto& job_conf,
-    NeuralNet* net) {
-  auto cluster = Cluster::Get();
-  vector<Server*> servers;
-  if (!cluster->has_server()) return servers;
-  int nservers_per_grp = cluster->nservers_per_group();
-  int nserver_grps = cluster->nserver_groups();
-  int lcm = LeastCommonMultiple(nserver_grps, nservers_per_grp);
-  auto slices = Param::ComputeSlices(lcm, net->params());
-  // partition among server groups, each group maintains one sub-set for sync
-  auto slice2group = PartitionSlices(nserver_grps, slices);
-  // partition within one server group, each server updates for one sub-set
-  auto slice2server = PartitionSlices(nservers_per_grp, slices);
-
-  int server_procs = cluster->procs_id();
-  // if true, server procs (logical) id starts after worker procs
-  if (cluster->server_worker_separate())
-    server_procs -= cluster->nworker_procs();
-  const vector<int> rng = cluster->ExecutorRng(server_procs,
-      cluster->nservers_per_group(), cluster->nservers_per_procs());
-  int gstart = rng[0], gend = rng[1], start = rng[2], end = rng[3];
-  for (int gid = gstart; gid < gend; gid++) {
-    for (int sid = start; sid < end; sid++) {
-      auto server = new Server(gid, sid, job_conf, slice2group, slice2server);
-      servers.push_back(server);
-    }
-  }
-  return servers;
-}
-
-}  // namespace singa
diff --git a/src/io/binfile_reader.cc b/src/io/binfile_reader.cc
new file mode 100644
index 0000000..9b52a5d
--- /dev/null
+++ b/src/io/binfile_reader.cc
@@ -0,0 +1,136 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "singa/io/reader.h"
+#include "singa/utils/logging.h"
+
+namespace singa {
+namespace io {
+bool BinFileReader::Open(const std::string& path) {
+  path_ = path;
+  return OpenFile();
+}
+
+bool BinFileReader::Open(const std::string& path, int capacity) {
+  path_ = path;
+  capacity_ = capacity;
+  return OpenFile();
+}
+
+void BinFileReader::Close() {
+  if (buf_ != nullptr) {
+    delete[] buf_;
+    buf_ = nullptr;
+  }
+  if (fdat_.is_open()) fdat_.close();
+}
+
+bool BinFileReader::Read(std::string* key, std::string* value) {
+  CHECK(fdat_.is_open()) << "File not open!";
+  char magic[4];
+  int smagic = sizeof(magic);
+  if (!PrepareNextField(smagic)) return false;
+  memcpy(magic, buf_ + offset_, smagic);
+  offset_ += smagic;
+
+  if (magic[0] == kMagicWord[0] && magic[1] == kMagicWord[1]) {
+    if (magic[2] != 0 && magic[2] != 1)
+      LOG(FATAL) << "File format error: magic word does not match!";
+    if (magic[2] == 1)
+      if (!ReadField(key)) return false;
+    if (!ReadField(value)) return false;
+  }
+  else {
+    LOG(FATAL) << "File format error: magic word does not match!";
+  }
+  return true;
+}
+
+int BinFileReader::Count() {
+  std::ifstream fin(path_, std::ios::in | std::ios::binary);
+  CHECK(fin.is_open()) << "Cannot create file " << path_;
+  int count = 0;
+  while (true) {
+    size_t len;
+    char magic[4];
+    fin.read(reinterpret_cast<char*>(magic), sizeof(magic));
+    if (!fin.good()) break;
+    if (magic[2] == 1) {
+      fin.read(reinterpret_cast<char*>(&len), sizeof(len));
+      if (!fin.good()) break;
+      fin.seekg(len, std::ios_base::cur);
+      if (!fin.good()) break;
+    }
+    fin.read(reinterpret_cast<char*>(&len), sizeof(len));
+    if (!fin.good()) break;
+    fin.seekg(len, std::ios_base::cur);
+    if (!fin.good()) break;
+    count++;
+  }
+  fin.close();
+  return count;
+}
+
+void BinFileReader::SeekToFirst() {
+  bufsize_ = 0;
+  offset_ = 0;
+  fdat_.clear();
+  fdat_.seekg(0);
+  CHECK(fdat_.is_open()) << "Cannot create file " << path_;
+}
+
+bool BinFileReader::OpenFile() {
+  buf_ = new char[capacity_];
+  fdat_.open(path_, std::ios::in | std::ios::binary);
+  CHECK(fdat_.is_open()) << "Cannot open file " << path_;
+  return fdat_.is_open();
+}
+
+bool BinFileReader::ReadField(std::string* content) {
+  content->clear();
+  int ssize = sizeof(size_t);
+  if (!PrepareNextField(ssize)) return false;
+  int len = *reinterpret_cast<size_t*>(buf_ + offset_);
+  offset_ += ssize;
+  if (!PrepareNextField(len)) return false;
+  content->reserve(len);
+  content->insert(0, buf_ + offset_, len);
+  //for (int i = 0; i < len; ++i) content->push_back(buf_[offset_ + i]);
+  offset_ += len;
+  return true;
+}
+
+// if the buf does not have the next complete field, read data from disk
+bool BinFileReader::PrepareNextField(int size) {
+  if (offset_ + size > bufsize_) {
+    bufsize_ -= offset_;
+    memcpy(buf_, buf_ + offset_, bufsize_);
+    offset_ = 0;
+    if (fdat_.eof()) {
+      return false;
+    } else {
+      fdat_.read(buf_ + bufsize_, capacity_ - bufsize_);
+      bufsize_ += fdat_.gcount();
+      CHECK_LE(size, bufsize_) << "Field size is too large: " << size;
+    }
+  }
+  return true;
+}
+
+}  // namespace io
+}  // namespace singa
diff --git a/src/io/binfile_writer.cc b/src/io/binfile_writer.cc
new file mode 100644
index 0000000..adc910e
--- /dev/null
+++ b/src/io/binfile_writer.cc
@@ -0,0 +1,112 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "singa/io/writer.h"
+#include "singa/utils/logging.h"
+
+namespace singa {
+namespace io {
+bool BinFileWriter::Open(const std::string& path, Mode mode) {
+  path_ = path;
+  mode_ = mode;
+  return OpenFile();
+}
+
+bool BinFileWriter::Open(const std::string& path, Mode mode, int capacity) {
+  CHECK(!fdat_.is_open());
+  path_ = path;
+  mode_ = mode;
+  capacity_ = capacity;
+  return OpenFile();
+}
+
+void BinFileWriter::Close() {
+  Flush();
+  if (buf_ != nullptr) {
+    delete [] buf_;
+    buf_ = nullptr;
+  }
+  if (fdat_.is_open()) fdat_.close();
+}
+
+bool BinFileWriter::Write(const std::string& key, const std::string& value) {
+  CHECK(fdat_.is_open()) << "File not open!";
+  if (value.size() == 0) return false;
+  // magic_word + (key_len + key) + val_len + val
+  char magic[4];
+  int size;
+  memcpy(magic, kMagicWord, sizeof(kMagicWord));
+  magic[3] = 0;
+  if (key.size() == 0) {
+    magic[2] = 0;
+    size = sizeof(magic) + sizeof(size_t) + value.size();
+  } else {
+    magic[2] = 1;
+    size = sizeof(magic) + 2 * sizeof(size_t) + key.size() + value.size();
+  }
+
+  if (bufsize_ + size > capacity_) {
+    fdat_.write(buf_, bufsize_);
+    bufsize_ = 0;
+    CHECK_LE(size, capacity_) << "Tuple size is larger than capacity "
+                              << "Try a larger capacity size";
+  }
+
+  memcpy(buf_ + bufsize_, magic, sizeof(magic));
+  bufsize_ += sizeof(magic);
+  if (key.size() > 0) {
+    *reinterpret_cast<size_t*>(buf_ + bufsize_) = key.size();
+    bufsize_ += sizeof(size_t);
+    std::memcpy(buf_ + bufsize_, key.data(), key.size());
+    bufsize_ += key.size();
+  }
+  *reinterpret_cast<size_t*>(buf_ + bufsize_) = value.size();
+  bufsize_ += sizeof(size_t);
+  std::memcpy(buf_ + bufsize_, value.data(), value.size());
+  bufsize_ += value.size();
+  return true;
+}
+
+void BinFileWriter::Flush() {
+  if (bufsize_ > 0) {
+    fdat_.write(buf_, bufsize_);
+    fdat_.flush();
+    bufsize_ = 0;
+  }
+}
+
+bool BinFileWriter::OpenFile() {
+  CHECK(buf_ == nullptr);
+  buf_ = new char[capacity_];
+  switch (mode_) {
+    case kCreate:
+      fdat_.open(path_, std::ios::binary | std::ios::out | std::ios::trunc);
+      CHECK(fdat_.is_open()) << "Cannot create file " << path_;
+      break;
+    case kAppend:
+      fdat_.open(path_, std::ios::app | std::ios::binary);
+      CHECK(fdat_.is_open()) << "Cannot open file " << path_;
+      break;
+    default:
+      LOG(FATAL) << "unknown mode to open binary file " << mode_;
+      break;
+  }
+  return fdat_.is_open();
+}
+}  // namespace io
+}  // namespace singa
diff --git a/src/io/csv_decoder.cc b/src/io/csv_decoder.cc
new file mode 100644
index 0000000..72b4e10
--- /dev/null
+++ b/src/io/csv_decoder.cc
@@ -0,0 +1,55 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "singa/io/decoder.h"
+#include <string>
+#include <sstream>
+
+const int kMaxCSVBufSize = 40960;
+
+namespace singa {
+
+std::vector<Tensor> CSVDecoder::Decode(std::string value) {
+  std::vector<Tensor> output;
+  std::stringstream ss;
+  ss.str(value);
+  int l = 0;
+  if (has_label_ == true)
+    ss >> l;
+  std::string str;
+  float d[kMaxCSVBufSize];
+  int size = 0;
+  while (std::getline(ss, str, ',')) {
+    float temp;
+    if (std::stringstream(str) >> temp) {
+      CHECK_LE(size, kMaxCSVBufSize - 1);
+      d[size++] = temp;
+    }
+  }
+
+  Tensor data(Shape {static_cast<size_t>(size)}, kFloat32);
+  data.CopyDataFromHostPtr(d, size);
+  output.push_back(data);
+  if (has_label_ == true) {
+    Tensor label(Shape {1}, kInt);
+    label.CopyDataFromHostPtr(&l, 1);
+    output.push_back(label);
+  }
+  return output;
+}
+}  // namespace singa
diff --git a/src/io/csv_encoder.cc b/src/io/csv_encoder.cc
new file mode 100644
index 0000000..6089ab5
--- /dev/null
+++ b/src/io/csv_encoder.cc
@@ -0,0 +1,43 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "singa/io/encoder.h"
+#include <sstream>
+
+namespace singa {
+
+std::string CSVEncoder::Encode(vector<Tensor>& data) {
+  CHECK_GE(data.size(), 1u);
+  size_t size = data[0].Size();
+  const float* value = data[0].data<float>();
+  std::string des = "";
+  if (data.size() == 2) {
+    const float label = (const float)data[1].data<int>()[0];
+    std::ostringstream buff;
+    buff << label;
+    des += buff.str() + ',';
+  }
+  for (size_t i = 0; i < size; i++) {
+    std::ostringstream buff;
+    buff << value[i];
+    if (i == size - 1) des += buff.str();
+    else des += buff.str() + ',';
+  }
+  return des;
+}
+}  // namespace singa
diff --git a/src/io/hdfsfile.cc b/src/io/hdfsfile.cc
deleted file mode 100644
index e093d81..0000000
--- a/src/io/hdfsfile.cc
+++ /dev/null
@@ -1,135 +0,0 @@
-/************************************************************
-*
-* Licensed to the Apache Software Foundation (ASF) under one
-* or more contributor license agreements.  See the NOTICE file
-* distributed with this work for additional information
-* regarding copyright ownership.  The ASF licenses this file
-* to you under the Apache License, Version 2.0 (the
-* "License"); you may not use this file except in compliance
-* with the License.  You may obtain a copy of the License at
-*
-*   http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-
-* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-* KIND, either express or implied.  See the License for the
-* specific language governing permissions and limitations
-* under the License.
-*
-*************************************************************/
-
-#include "singa/io/hdfsfile.h"
-
-#include <glog/logging.h>
-#include <iostream>
-namespace singa {
-namespace io {
-
-HDFSFile::HDFSFile(const std::string& path, Mode mode): path_(path),
-  mode_(mode) {
-  // check that path starts with hdfs://
-  CHECK_EQ(path.find("hdfs://"), 0);
-
-  // extract namenode from path
-  int path_idx = path.find_first_of("/", 7);
-  int colon_idx = path.find_first_of(":", 7);
-  std::string namenode = path.substr(7, colon_idx-7);
-  int port = atoi(path.substr(colon_idx+1, path_idx-colon_idx-1).c_str());
-  std::string filepath = path.substr(path_idx);
-
-  // connect to HDFS
-  fs_ = hdfsConnect(namenode.c_str(), port);
-  CHECK_NOTNULL(fs_);
-
-  if (mode == HDFSFile::kRead) {
-    file_ = hdfsOpenFile(fs_, filepath.c_str(), O_RDONLY, 0, 0, 0);
-  } else {
-    // check if the directory exists, create it if not.
-    int file_idx = path.find_last_of("/");
-    std::string hdfs_directory_path = path.substr(path_idx, file_idx-path_idx);
-    if (hdfsExists(fs_, hdfs_directory_path.c_str()) == -1)
-      CHECK_EQ(hdfsCreateDirectory(fs_, hdfs_directory_path.c_str()), 0);
-    file_ = hdfsOpenFile(fs_, filepath.c_str(), O_WRONLY, 0, 0, 0);
-  }
-
-  CHECK_NOTNULL(file_);
-}
-
-HDFSFile::~HDFSFile() {
-  if (mode_ != HDFSFile::kRead)
-    Flush();
-  hdfsCloseFile(fs_, file_);
-}
-
-#ifdef USE_PROTOBUF
-bool HDFSFile::Next(google::protobuf::Message* val) {
-  // read from file_, then turns it to a message
-  // red size, then content
-  int size;
-  if (hdfsRead(fs_, file_, &size, sizeof(int)) <= 0)
-    return false;
-  char *temp_buf = reinterpret_cast<char*>(malloc(size*sizeof(char)));
-  CHECK(hdfsRead(fs_, file_, temp_buf, size));
-  val->ParseFromArray(temp_buf, size);
-  free(temp_buf);
-  return true;
-}
-
-bool HDFSFile::Insert(const google::protobuf::Message& val) {
-  std::string str;
-  val.SerializeToString(&str);
-  return Insert(str);
-}
-#endif
-
-bool HDFSFile::Next(std::string* val) {
-  char size_buf[sizeof(int)];
-  // a hack to read across blocks. The first read my return in complete data,
-  // so try the second read.
-  int read_size_size = hdfsRead(fs_, file_, size_buf, sizeof(int));
-
-  if (read_size_size == 0)
-    return false;
-
-  if (read_size_size < (static_cast<int>(sizeof(int))))
-    CHECK_EQ(hdfsRead(fs_, file_, size_buf+read_size_size,
-      sizeof(int)-read_size_size),
-      sizeof(int)-read_size_size);
-  int size;
-  memcpy(&size, size_buf, sizeof(int));
-
-  char *temp_buf = reinterpret_cast<char*>(malloc(size*sizeof(char)));
-
-  int read_size = hdfsRead(fs_, file_, temp_buf, size);
-  if (read_size < size)
-    CHECK_EQ(hdfsRead(fs_, file_, temp_buf+read_size, size-read_size),
-      size-read_size);
-  val->clear();
-  val->append(temp_buf, size);
-  free(temp_buf);
-  return true;
-}
-
-// append one record to the end of the file
-bool HDFSFile::Insert(const std::string& val) {
-  CHECK(mode_ != HDFSFile::kRead);
-  // write length, then content
-  int size = val.length();
-  CHECK_EQ(hdfsWrite(fs_, file_, &size, sizeof(int)), sizeof(int));
-  CHECK_EQ(hdfsWrite(fs_, file_, val.c_str(), val.length()), val.length());
-  return true;
-}
-
-void HDFSFile::Seek(int offset) {
-  CHECK_EQ(mode_, kRead);
-  // seek back to the parition offset
-  CHECK_EQ(hdfsSeek(fs_, file_, offset), 0);
-}
-
-void HDFSFile::Flush() {
-  CHECK_EQ(hdfsFlush(fs_, file_), 0);
-}
-
-}  // namespace io
-}  // namespace singa
diff --git a/src/io/hdfsfile_store.cc b/src/io/hdfsfile_store.cc
deleted file mode 100644
index 9464169..0000000
--- a/src/io/hdfsfile_store.cc
+++ /dev/null
@@ -1,75 +0,0 @@
-/************************************************************
-*
-* Licensed to the Apache Software Foundation (ASF) under one
-* or more contributor license agreements.  See the NOTICE file
-* distributed with this work for additional information
-* regarding copyright ownership.  The ASF licenses this file
-* to you under the Apache License, Version 2.0 (the
-* "License"); you may not use this file except in compliance
-* with the License.  You may obtain a copy of the License at
-*
-*   http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an
-* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-* KIND, either express or implied.  See the License for the
-* specific language governing permissions and limitations
-* under the License.
-*
-*************************************************************/
-
-#include <glog/logging.h>
-#include "singa/io/hdfs_store.h"
-
-namespace singa {
-namespace io {
-
-bool HDFSStore::Open(const std::string& source, Mode mode) {
-  CHECK(file_ == nullptr);
-  if (mode == kRead)
-    file_ = new HDFSFile(source, HDFSFile::kRead);
-  else if (mode == kCreate)
-    file_ = new HDFSFile(source, HDFSFile::kCreate);
-  else if (mode == kAppend)
-    file_ = new HDFSFile(source, HDFSFile::kAppend);
-  mode_ = mode;
-  return file_ != nullptr;
-}
-
-void HDFSStore::Close() {
-  if (file_ != nullptr)
-    delete file_;
-  file_ = nullptr;
-}
-
-bool HDFSStore::Read(std::string* key, std::string* value) {
-  CHECK_EQ(mode_, kRead);
-  CHECK(file_ != nullptr);
-  return file_->Next(value);
-}
-
-void HDFSStore::SeekToFirst() {
-  CHECK_EQ(mode_, kRead);
-  CHECK(file_ != nullptr);
-  file_->Seek(0);
-}
-
-void HDFSStore::Seek(int offset) {
-  file_->Seek(offset);
-}
-
-bool HDFSStore::Write(const std::string& key, const std::string& value) {
-  CHECK_NE(mode_, kRead);
-  CHECK(file_ != nullptr);
-  return file_->Insert(value);
-}
-
-void HDFSStore::Flush() {
-  CHECK_NE(mode_, kRead);
-  CHECK(file_!= nullptr);
-  file_->Flush();
-}
-
-}  // namespace io
-}  // namespace singa
diff --git a/src/io/image_transformer.cc b/src/io/image_transformer.cc
new file mode 100644
index 0000000..f233ad3
--- /dev/null
+++ b/src/io/image_transformer.cc
@@ -0,0 +1,356 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "singa/io/transformer.h"
+#include <time.h>
+
+#ifdef USE_OPENCV
+#include <opencv2/highgui/highgui.hpp>
+#include <opencv2/imgproc/imgproc.hpp>
+#endif
+
+namespace singa {
+
+  Tensor ImageTransformer::Apply(int flag, Tensor& input) {
+    CHECK_LE(input.nDim(), 4u);
+    CHECK_GE(input.nDim(), 2u);
+    CHECK_EQ(input.data_type(), kFloat32) << "Data type " << input.data_type()
+      << " is invalid for an raw image";
+    srand(time(NULL));
+    /// TODO
+    /// currently only consider one sample each time
+
+    /// resize image using opencv resize
+    Tensor temp1;
+#ifdef USE_OPENCV
+    temp1 = resize(input, resize_height_, resize_width_, image_dim_order_);
+#else
+    temp1 = input;
+#endif
+
+    /// crop
+    Tensor temp2;
+    size_t height = 0, width = 0;
+    if (input.nDim() >= 3u) {
+      if (image_dim_order_ == "CHW")
+        height = temp1.shape(input.nDim() - 2), width = temp1.shape(input.nDim() - 1);
+      else if (image_dim_order_ == "HWC")
+        height = temp1.shape(input.nDim() - 3), width = temp1.shape(input.nDim() - 2);
+      else
+        LOG(FATAL) << "Unknow dimension order for images " << image_dim_order_
+               << " Only support 'HWC' and 'CHW'";
+    } else /// input is 2D gray image
+      height = temp1.shape(0), width = temp1.shape(1);
+
+    if (crop_shape_.size() == 2) {
+      if (flag == kTrain) { 
+        /// random crop
+        if (crop_shape_[0] > height || crop_shape_[0] > width)
+          LOG(FATAL) << "Crop size larger than the size of raw image";
+        size_t crop_h_offset = rand() % ((height - crop_shape_[0]) / 2), 
+               crop_w_offset = rand() % ((width - crop_shape_[1]) / 2);
+        temp2 = crop(temp1, crop_shape_[0], crop_shape_[1], 
+                  crop_h_offset, crop_w_offset, image_dim_order_);
+      } else if (flag == kEval) {
+        /// central crop
+        size_t crop_h_offset = (height - crop_shape_[0]) / 2,
+               crop_w_offset = (width - crop_shape_[1]) / 2;
+        temp2 = crop(temp1, crop_shape_[0], crop_shape_[1], 
+                  crop_h_offset, crop_w_offset, image_dim_order_); 
+      }
+    }
+    else temp2 = temp1;
+
+    /// mirror
+    Tensor output;
+    if ((flag == kTrain) && (rand() % 2))
+        output = mirror(temp2, true, false, image_dim_order_);
+    else output = temp2;
+    return output;
+  }
+
+#ifdef USE_OPENCV
+  Tensor resize(Tensor& input, const size_t resize_height, 
+               const size_t resize_width, const string& image_dim_order) {
+    CHECK_LE(input.nDim(), 4u);
+    CHECK_GE(input.nDim(), 2u);
+    if (!resize_height || !resize_width) return input;
+    Tensor output;
+    cv::Mat mat;
+    const auto* in = input.data<float>();
+    if (input.nDim() == 4u) {
+      /// TODO
+      /// batch based resize
+      LOG(FATAL) << "Not implemented";
+    } else if (input.nDim() == 3u) {
+      if (image_dim_order == "CHW") {
+        size_t height = input.shape(1), width = input.shape(2),
+               channel = input.shape(0);
+        if (channel == 3u) {
+          mat = cv::Mat(height, width, CV_32FC3, cv::Scalar(0, 0, 0));
+          for (size_t i = 0; i < height; i++)
+            for (size_t j = 0; j < width; j++)
+              for (size_t k = 0; k < channel; k++)
+                mat.at<cv::Vec3f>(i, j)[k] = in[k * height * width + i * width + j];
+        }
+        else if (channel == 1u) {
+          mat = cv::Mat(height, width, CV_32FC1);
+          for (size_t i = 0; i < height; i++)
+            for (size_t j = 0; j < width; j++)
+                mat.at<cv::Vec<float, 1>>(i, j)[0] = in[i * width + j];
+        }
+        else LOG(FATAL) << "Invalid channel size: " << channel;
+      } else if (image_dim_order == "HWC") {
+        size_t height = input.shape(0), width = input.shape(1),
+               channel = input.shape(2);
+        if (channel == 3u) {
+          mat = cv::Mat(height, width, CV_32FC3, cv::Scalar(0, 0, 0));
+          for (size_t i = 0; i < height; i++)
+            for (size_t j = 0; j < width; j++)
+              for (size_t k = 0; k < channel; k++)
+                mat.at<cv::Vec3f>(i, j)[k] =
+                  in[i * width * channel + j * channel + k];
+        } else if (channel == 1u) { /// 2D gray image
+          mat = cv::Mat(height, width, CV_32FC1);
+          for (size_t i = 0; i < height; i++)
+            for (size_t j = 0; j < width; j++)
+              mat.at<cv::Vec<float, 1>>(i, j)[0] = in[i * width + j];
+        } else LOG(FATAL) << "Invalid channel size: " << channel;
+      } else {
+        LOG(FATAL) << "Unknow dimension order for images " << image_dim_order
+                   << " Only support 'HWC' and 'CHW'";
+      }
+    } else { /// 2D gray image
+      size_t height = input.shape(0), width = input.shape(1);
+      mat = cv::Mat(height, width, CV_32FC1);
+      for (size_t i = 0; i < height; i++)
+        for (size_t j = 0; j < width; j++)
+          mat.at<cv::Vec<float, 1>>(i, j)[0] = in[i * width + j];
+    }
+    cv::Size size(resize_width, resize_height);
+    cv::Mat resized;
+    cv::resize(mat, resized, size);
+    CHECK_EQ(resized.size().height, resize_height);
+    CHECK_EQ(resized.size().width, resize_width);
+    size_t new_size = resize_height * resize_width * resized.channels();
+    float* out = new float[new_size];
+    if (input.nDim() == 4u) {
+      /// TODO
+      /// batch based resize
+      LOG(FATAL) << "Not implemented";
+    } else if (input.nDim() == 3u) {
+      if (image_dim_order == "CHW") {
+        size_t height = resize_height, width = resize_width,
+           channel = input.shape(0);
+        if (channel == 3u) {
+          for (size_t i = 0; i < height; i++)
+            for (size_t j = 0; j < width; j++)
+              for (size_t k = 0; k < channel; k++)
+                out[k * height * width + i * width + j] = resized.at<cv::Vec3f>(i, j)[k];
+        } else { /// 2D gray image
+          for (size_t i = 0; i < height; i++)
+            for (size_t j = 0; j < width; j++)
+              out[i * width + j] = resized.at<cv::Vec<float, 1>>(i, j)[0];
+        }
+        Tensor temp(Shape{channel, height, width});
+        temp.CopyDataFromHostPtr<float>(out, new_size);
+        output = temp;
+      } else {
+        size_t height = resize_height, width = resize_width,
+           channel = input.shape(2);
+        if (channel == 3u) {
+          for (size_t i = 0; i < height; i++)
+            for (size_t j = 0; j < width; j++)
+              for (size_t k = 0; k < channel; k++)
+                out[i * width * channel + j * channel + k] = resized.at<cv::Vec3f>(i, j)[k];
+        } else { /// 1 channel
+          for (size_t i = 0; i < height; i++)
+            for (size_t j = 0; j < width; j++)
+              out[i * width + j] = resized.at<cv::Vec<float, 1>>(i, j)[0];
+        }
+        Tensor temp(Shape{height, width, channel}); 
+        temp.CopyDataFromHostPtr<float>(out, new_size);
+        output = temp;
+      }
+    } else { /// 2D gray image
+      size_t height = resize_height, width = resize_width;
+      for (size_t i = 0; i < height; i++)
+        for (size_t j = 0; j < width; j++)
+          out[i * width + j] = resized.at<cv::Vec<float, 1>>(i, j)[0];
+      Tensor temp(Shape{height, width});
+      temp.CopyDataFromHostPtr<float>(out, new_size);
+      output = temp;
+    }
+    delete[] out;
+    return output;
+  }
+#endif
+
+  Tensor crop(Tensor& input, const size_t crop_height, const size_t crop_width, 
+             const size_t crop_h_offset, const size_t crop_w_offset, 
+             const string& image_dim_order) {
+    CHECK_LE(input.nDim(), 4u);
+    CHECK_GE(input.nDim(), 2u);
+
+    Tensor output;
+    const float* in = input.data<float>();
+    size_t out_idx = 0, in_idx = 0;
+    if (input.nDim() == 4u) {
+      /// TODO
+      LOG(FATAL) << "Not implemented";
+    } else if (input.nDim() == 3u) {
+      if (image_dim_order == "CHW") {
+        size_t height = input.shape(1), width = input.shape(2),
+            channel = input.shape(0); 
+        CHECK_LE(crop_height + crop_h_offset, height);
+        CHECK_LE(crop_width + crop_w_offset, width);
+        float* out = new float[crop_height * crop_width * channel];
+        for (size_t c = 0; c < channel; c++) {
+          for (size_t h = 0; h < crop_height; h++) {
+            for (size_t w = 0; w < crop_width; w++) {
+              in_idx = (c * height + crop_h_offset + h) * width + crop_w_offset + w;
+              out_idx = (c * crop_height + h) * crop_width + w;
+              out[out_idx] = in[in_idx];
+            }
+          }
+        }
+        output.Reshape(Shape{channel, crop_height, crop_width});
+        output.CopyDataFromHostPtr<float>(out, crop_height * crop_width * channel);
+        delete[] out;
+      } else if (image_dim_order == "HWC") {
+        size_t height = input.shape(0), width = input.shape(1), 
+               channel = input.shape(2); 
+        CHECK_LE(crop_height + crop_h_offset, height);
+        CHECK_LE(crop_width + crop_w_offset, width);
+        float* out = new float[crop_height * crop_width * channel];
+        for (size_t c = 0; c < channel; c++) {
+          for (size_t h = 0; h < crop_height; h++) {
+            for (size_t w = 0; w < crop_width; w++) {
+              in_idx = ((crop_h_offset + h) * width + crop_w_offset + w) * channel + c;
+              out_idx = (h * crop_width + w) * channel + c;
+              out[out_idx] = in[in_idx];
+            }
+          }
+        }
+        output.Reshape(Shape{crop_height, crop_width, channel});
+        output.CopyDataFromHostPtr<float>(out, crop_height * crop_width * channel);
+        delete[] out;
+      } else {
+        LOG(FATAL) << "Unknow dimension order for images " << image_dim_order
+                   << " Only support 'HWC' and 'CHW'";
+      }
+    } else { /// 2D gray image
+      size_t height = input.shape(0), width = input.shape(1); 
+      CHECK_LE(crop_height + crop_h_offset, height);
+      CHECK_LE(crop_width + crop_w_offset, width);
+      float* out = new float[crop_height * crop_width];
+      for (size_t h = 0; h < crop_height; h++) {
+        for (size_t w = 0; w < crop_width; w++) {
+          in_idx = (crop_h_offset + h) * width + crop_w_offset + w;
+          out_idx = h * crop_width + w;
+          out[out_idx] = in[in_idx];
+        }
+      }
+      output.Reshape(Shape{crop_height, crop_width});
+      output.CopyDataFromHostPtr<float>(out, crop_height * crop_width);
+      delete[] out;
+    }
+    return output;
+  }
+
+  Tensor mirror(Tensor& input, const bool horizontal_mirror,
+             const bool vertical_mirror, const string& image_dim_order) {
+    CHECK_LE(input.nDim(), 4u);
+    CHECK_GE(input.nDim(), 2u);
+    if (!horizontal_mirror && !vertical_mirror) return input;
+
+    Tensor output;
+    const float* in = input.data<float>();
+    size_t out_idx = 0, in_idx = 0;
+    if (input.nDim() == 4u) {
+      /// TODO
+      LOG(FATAL) << "Not implemented";
+    } else if (input.nDim() == 3u) {
+      if (image_dim_order == "CHW") {
+        size_t height = input.shape(1), width = input.shape(2),
+            channel = input.shape(0);
+        float* out = new float[height * width * channel];
+        for (size_t c = 0; c < channel; c++) {
+          for (size_t h = 0; h < height; h++) {
+            for (size_t w = 0; w < width; w++) {
+              in_idx = (c * height + h) * width + w;
+              if (horizontal_mirror && vertical_mirror)
+                out_idx = (c * height + (height - 1 - h)) * width + (width - 1 - w);
+              else if (horizontal_mirror)
+                out_idx = (c * height + h) * width + (width - 1 - w);
+              else /// only do vertical mirror
+                out_idx = (c * height + (height - 1 - h)) * width + w;
+              out[out_idx] = in[in_idx];
+            }
+          }
+        }
+        output.Reshape(Shape{channel, height, width});
+        output.CopyDataFromHostPtr<float>(out, height * width * channel);
+        delete[] out;
+      } else if (image_dim_order == "HWC") {
+        size_t height = input.shape(0), width = input.shape(1),
+            channel = input.shape(2);
+        float* out = new float[height * width * channel];
+        for (size_t c = 0; c < channel; c++) {
+          for (size_t h = 0; h < height; h++) {
+            for (size_t w = 0; w < width; w++) {
+              in_idx = (h * width + w) * channel + c;
+              if (horizontal_mirror && vertical_mirror)
+                out_idx = ((height - 1 - h) * width + (width - 1 - w)) * channel + c;
+              else if (horizontal_mirror)
+                out_idx = (h * width + (width - 1 - w)) * channel + c;
+              else /// only do vertical mirror
+                out_idx = ((height - 1 - h) * width + w) * channel + c;
+              out[out_idx] = in[in_idx];
+            }
+          }
+        }
+        output.Reshape(Shape{height, width, channel});
+        output.CopyDataFromHostPtr<float>(out, height * width * channel);
+        delete[] out;
+      } else {
+        LOG(FATAL) << "Unknow dimension order for images " << image_dim_order
+                   << " Only support 'HWC' and 'CHW'";
+      }
+    } else { /// 2D gray image
+      size_t height = input.shape(0), width = input.shape(1);
+      float* out = new float[height * width];
+      for (size_t h = 0; h < height; h++) {
+        for (size_t w = 0; w < width; w++) {
+          in_idx = h * width + w;
+          if (horizontal_mirror && vertical_mirror)
+            out_idx = (height - 1 - h) * width + (width - 1 - w);
+          else if (horizontal_mirror)
+            out_idx = h * width + (width - 1 - w);
+          else /// only do vertical mirror
+            out_idx = (height - 1 - h) * width + w;
+          out[out_idx] = in[in_idx];
+        }
+      }
+      output.Reshape(Shape{height, width});
+      output.CopyDataFromHostPtr<float>(out, height * width);
+      delete[] out;
+    }
+    return output;
+  }
+} // namespace singa
diff --git a/src/io/jpg_decoder.cc b/src/io/jpg_decoder.cc
new file mode 100644
index 0000000..b778b98
--- /dev/null
+++ b/src/io/jpg_decoder.cc
@@ -0,0 +1,75 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "singa/io/decoder.h"
+
+#ifdef USE_OPENCV
+
+#include <opencv2/highgui/highgui.hpp>
+#include <opencv2/imgproc/imgproc.hpp>
+
+namespace singa {
+
+std::vector<Tensor> JPGDecoder::Decode(std::string value) {
+  std::vector<Tensor> output;
+
+  ImageRecord record;
+  record.ParseFromString(value);
+  std::vector<uchar> pixel(record.pixel().begin(), record.pixel().end());
+
+  // decode image
+  cv::Mat mat = cv::imdecode(cv::Mat(pixel), CV_LOAD_IMAGE_COLOR);
+  size_t height = mat.size().height, width = mat.size().width, channel = mat.channels();
+  Shape shape(record.shape().begin(), record.shape().end());
+  //CHECK_EQ(shape[0], height);
+  //CHECK_EQ(shape[1], width);
+  //CHECK_EQ(shape[2], channel);
+  Tensor image(shape);
+
+  float* data = new float[image.Size()];
+  if (image_dim_order_ == "CHW") {
+    for (size_t i = 0; i < height; i++)
+      for (size_t j = 0; j < width; j++)
+        for (size_t k = 0; k < channel; k++)
+          data[k * height * width + i * width + j] = static_cast<float>(
+              static_cast<int>(mat.at<cv::Vec3b>(i, j)[k]));
+  } else if (image_dim_order_ == "HWC") {
+
+    for (size_t i = 0; i < height; i++)
+      for (size_t j = 0; j < width; j++)
+        for (size_t k = 0; k < channel; k++)
+          data[i * width * channel + j * channel + k] =
+              static_cast<float>(static_cast<int>(mat.at<cv::Vec3b>(i, j)[k]));
+  } else {
+    LOG(FATAL) << "Unknow dimension order for images " << image_dim_order_
+               << " Only support 'HWC' and 'CHW'";
+  }
+  image.CopyDataFromHostPtr<float>(data, image.Size());
+  output.push_back(image);
+  delete[] data;
+
+  if (record.label_size()) {
+    Tensor label(Shape{1}, kInt);
+    int labelid = record.label(0);
+    label.CopyDataFromHostPtr(&labelid, 1);
+    output.push_back(label);
+  }
+  return output;
+}
+}  // namespace singa
+#endif
diff --git a/src/io/jpg_encoder.cc b/src/io/jpg_encoder.cc
new file mode 100644
index 0000000..8335a91
--- /dev/null
+++ b/src/io/jpg_encoder.cc
@@ -0,0 +1,83 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "singa/io/encoder.h"
+
+#ifdef USE_OPENCV
+#include <opencv2/highgui/highgui.hpp>
+#include <opencv2/imgproc/imgproc.hpp>
+
+namespace singa {
+
+std::string JPGEncoder::Encode(vector<Tensor>& data) {
+  // suppose image: image, data[1]: label
+  CHECK_LE(data.size(), 2u);
+  const Tensor& image = data.at(0);
+  CHECK_EQ(image.nDim(), 3u);
+  CHECK_EQ(image.data_type(), kUChar) << "Data type " << image.data_type()
+    << " is invalid for an raw image";
+  const auto* raw = image.data<unsigned char>();
+  cv::Mat mat;
+  if (image_dim_order_ == "HWC") {
+    size_t height = image.shape(0), width = image.shape(1),
+           channel = image.shape(2);
+    mat = cv::Mat(height, width, CV_8UC3, cv::Scalar(0, 0, 0));
+    for (size_t i = 0; i < height; i++)
+      for (size_t j = 0; j < width; j++)
+        for (size_t k = 0; k < channel; k++)
+          mat.at<cv::Vec3b>(i, j)[k] =
+              raw[i * width * channel + j * channel + k];
+  } else if (image_dim_order_ == "CHW") {
+    size_t channel = image.shape(0), height = image.shape(1),
+           width = image.shape(2);
+    mat = cv::Mat(height, width, CV_8UC3, cv::Scalar(0, 0, 0));
+    for (size_t i = 0; i < height; i++)
+      for (size_t j = 0; j < width; j++)
+        for (size_t k = 0; k < channel; k++)
+          mat.at<cv::Vec3b>(i, j)[k] = raw[k * height * width + i * width + j];
+  } else {
+    LOG(FATAL) << "Unknow dimension order for images " << image_dim_order_
+               << " Only support 'HWC' and 'CHW'";
+  }
+
+  // encode image with jpg format
+  std::vector<uchar> buff;
+  std::vector<int> param = std::vector<int>(2);
+  param[0] = CV_IMWRITE_JPEG_QUALITY;
+  param[1] = 100;  // default is 95
+  cv::imencode(".jpg", mat, buff, param);
+  std::string buf(buff.begin(), buff.end());
+
+  std::string output;
+  ImageRecord record;
+  for (size_t i = 0; i < image.nDim(); i++)
+    record.add_shape(image.shape(i));
+  record.set_pixel(buf);
+
+  // suppose each image is attached with at most one label
+  if (data.size() == 2) {
+    const int* label = data[1].data<int>();
+    //CHECK_EQ(label[0], 2);
+    record.add_label(label[0]);
+  }
+
+  record.SerializeToString(&output);
+  return output;
+}
+}  // namespace singa
+#endif  // USE_OPENCV
diff --git a/src/io/kvfile.cc b/src/io/kvfile.cc
deleted file mode 100644
index 3120be1..0000000
--- a/src/io/kvfile.cc
+++ /dev/null
@@ -1,219 +0,0 @@
-/************************************************************
-*
-* Licensed to the Apache Software Foundation (ASF) under one
-* or more contributor license agreements.  See the NOTICE file
-* distributed with this work for additional information
-* regarding copyright ownership.  The ASF licenses this file
-* to you under the Apache License, Version 2.0 (the
-* "License"); you may not use this file except in compliance
-* with the License.  You may obtain a copy of the License at
-*
-*   http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an
-* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-* KIND, either express or implied.  See the License for the
-* specific language governing permissions and limitations
-* under the License.
-*
-*************************************************************/
-
-#include "singa/io/kvfile.h"
-
-#include <glog/logging.h>
-
-namespace singa {
-namespace io {
-
-KVFile::KVFile(const std::string& path, Mode mode, int capacity) :
-path_(path), mode_(mode), capacity_(capacity) {
-  buf_ = new char[capacity];
-  switch (mode) {
-    case KVFile::kRead:
-      fdat_.open(path_, std::ios::in | std::ios::binary);
-      if (!fdat_.is_open()) {
-        // path may be a directory
-        path_ = path + "/shard.dat";
-        fdat_.open(path_, std::ios::in | std::ios::binary);
-      }
-      CHECK(fdat_.is_open()) << "Cannot create file " << path_;
-      break;
-    case KVFile::kCreate:
-      fdat_.open(path_, std::ios::binary | std::ios::out | std::ios::trunc);
-      CHECK(fdat_.is_open()) << "Cannot create file " << path_;
-      break;
-    case KVFile::kAppend:
-      fdat_.open(path_, std::ios::in | std::ios::binary);
-      if (!fdat_.is_open()) {
-        // path may be a directory
-        path_ = path + "/shard.dat";
-        fdat_.open(path_, std::ios::in | std::ios::binary);
-      }
-      CHECK(fdat_.is_open()) << "Cannot open file " << path_;
-      fdat_.close();
-      {
-        int last_tuple = PrepareForAppend(path_);
-        fdat_.open(path_, std::ios::binary | std::ios::out
-            | std::ios::in | std::ios::ate);
-        fdat_.seekp(last_tuple);
-      }
-      break;
-    default:
-      LOG(FATAL) << "unknown model to open KVFile " << mode;
-      break;
-  }
-}
-
-KVFile::~KVFile() {
-  if (mode_ != kRead)
-    Flush();
-  delete[] buf_;
-  fdat_.close();
-}
-#ifdef USE_PROTOBUF
-bool KVFile::Next(std::string* key, google::protobuf::Message* val) {
-  int vallen = Next(key);
-  if (vallen == 0) return false;
-  val->ParseFromArray(buf_ + offset_, vallen);
-  offset_ += vallen;
-  return true;
-}
-
-bool KVFile::Insert(const std::string& key,
-    const google::protobuf::Message& val) {
-  std::string str;
-  val.SerializeToString(&str);
-  return Insert(key, str);
-}
-#endif
-
-bool KVFile::Next(std::string *key, std::string* val) {
-  int vallen = Next(key);
-  if (vallen == 0) return false;
-  val->clear();
-  for (int i = 0; i < vallen; ++i)
-    val->push_back(buf_[offset_ + i]);
-  offset_ += vallen;
-  return true;
-}
-
-// insert one complete tuple
-bool KVFile::Insert(const std::string& key, const std::string& val) {
-  if (keys_.find(key) != keys_.end() || val.size() == 0)
-    return false;
-  int size = key.size() + val.size() + 2*sizeof(size_t);
-  if (bufsize_ + size > capacity_) {
-    fdat_.write(buf_, bufsize_);
-    bufsize_ = 0;
-    CHECK_LE(size, capacity_) << "Tuple size is larger than capacity "
-      << "Try a larger capacity size";
-  }
-  *reinterpret_cast<size_t*>(buf_ + bufsize_) = key.size();
-  bufsize_ += sizeof(size_t);
-  memcpy(buf_ + bufsize_, key.data(), key.size());
-  bufsize_ += key.size();
-  *reinterpret_cast<size_t*>(buf_ + bufsize_) = val.size();
-  bufsize_ += sizeof(size_t);
-  memcpy(buf_ + bufsize_, val.data(), val.size());
-  bufsize_ += val.size();
-  return true;
-}
-
-void KVFile::SeekToFirst() {
-  CHECK_EQ(mode_, kRead);
-  bufsize_ = 0;
-  offset_ = 0;
-  fdat_.clear();
-  fdat_.seekg(0);
-  CHECK(fdat_.is_open()) << "Cannot create file " << path_;
-}
-
-void KVFile::Flush() {
-  fdat_.write(buf_, bufsize_);
-  fdat_.flush();
-  bufsize_ = 0;
-}
-
-int KVFile::Count() {
-  std::ifstream fin(path_, std::ios::in | std::ios::binary);
-  CHECK(fdat_.is_open()) << "Cannot create file " << path_;
-  int count = 0;
-  while (true) {
-    size_t len;
-    fin.read(reinterpret_cast<char*>(&len), sizeof(len));
-    if (!fin.good()) break;
-    fin.seekg(len, std::ios_base::cur);
-    if (!fin.good()) break;
-    fin.read(reinterpret_cast<char*>(&len), sizeof(len));
-    if (!fin.good()) break;
-    fin.seekg(len, std::ios_base::cur);
-    if (!fin.good()) break;
-    count++;
-  }
-  fin.close();
-  return count;
-}
-
-int KVFile::Next(std::string *key) {
-  key->clear();
-  int ssize = sizeof(size_t);
-  if (!PrepareNextField(ssize)) return 0;
-  int keylen = *reinterpret_cast<size_t*>(buf_ + offset_);
-  offset_ += ssize;
-  if (!PrepareNextField(keylen)) return 0;
-  for (int i = 0; i < keylen; ++i)
-    key->push_back(buf_[offset_ + i]);
-  offset_ += keylen;
-  if (!PrepareNextField(ssize)) return 0;
-  int vallen = *reinterpret_cast<size_t*>(buf_ + offset_);
-  offset_ += ssize;
-  if (!PrepareNextField(vallen)) return 0;
-  return vallen;
-}
-
-int KVFile::PrepareForAppend(const std::string& path) {
-  std::ifstream fin(path, std::ios::in | std::ios::binary);
-  if (!fin.is_open()) return 0;
-  int last_tuple_offset = 0;
-  char buf[256];
-  size_t len;
-  while (true) {
-    fin.read(reinterpret_cast<char*>(&len), sizeof(len));
-    if (!fin.good()) break;
-    fin.read(buf, len);
-    buf[len] = '\0';
-    if (!fin.good()) break;
-    fin.read(reinterpret_cast<char*>(&len), sizeof(len));
-    if (!fin.good()) break;
-    fin.seekg(len, std::ios_base::cur);
-    if (!fin.good()) break;
-    keys_.insert(std::string(buf));
-    last_tuple_offset = fin.tellg();
-  }
-  fin.close();
-  return last_tuple_offset;
-}
-
-// if the buf does not have the next complete field, read data from disk
-bool KVFile::PrepareNextField(int size) {
-  if (offset_ + size > bufsize_) {
-    bufsize_ -= offset_;
-    // wangsh: commented, not sure what this check does
-    // CHECK_LE(bufsize_, offset_);
-    for (int i = 0; i < bufsize_; ++i)
-      buf_[i] = buf_[i + offset_];
-    offset_ = 0;
-    if (fdat_.eof()) {
-      return false;
-    } else {
-      fdat_.read(buf_ + bufsize_, capacity_ - bufsize_);
-      bufsize_ += fdat_.gcount();
-      if (size > bufsize_) return false;
-    }
-  }
-  return true;
-}
-
-}  // namespace io
-}  // namespace singa
diff --git a/src/io/kvfile_store.cc b/src/io/kvfile_store.cc
deleted file mode 100644
index a2a40cd..0000000
--- a/src/io/kvfile_store.cc
+++ /dev/null
@@ -1,76 +0,0 @@
-/************************************************************
-*
-* Licensed to the Apache Software Foundation (ASF) under one
-* or more contributor license agreements.  See the NOTICE file
-* distributed with this work for additional information
-* regarding copyright ownership.  The ASF licenses this file
-* to you under the Apache License, Version 2.0 (the
-* "License"); you may not use this file except in compliance
-* with the License.  You may obtain a copy of the License at
-*
-*   http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an
-* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-* KIND, either express or implied.  See the License for the
-* specific language governing permissions and limitations
-* under the License.
-*
-*************************************************************/
-
-#include "singa/io/kvfile_store.h"
-
-#include <glog/logging.h>
-
-namespace singa {
-namespace io {
-
-bool KVFileStore::Open(const std::string& source, Mode mode) {
-  CHECK(file_ == nullptr);
-  if (mode == kRead)
-    file_ = new KVFile(source, KVFile::kRead);
-  else if (mode == kCreate)
-    file_ = new KVFile(source, KVFile::kCreate);
-  else if (mode == kAppend)
-    file_ = new KVFile(source, KVFile::kAppend);
-  mode_ = mode;
-  return file_ != nullptr;
-}
-
-void KVFileStore::Close() {
-  if (file_ != nullptr)
-    delete file_;
-  file_ = nullptr;
-}
-
-bool KVFileStore::Read(std::string* key, std::string* value) {
-  CHECK_EQ(mode_, kRead);
-  CHECK(file_ != nullptr);
-  return file_->Next(key, value);
-}
-
-void KVFileStore::SeekToFirst() {
-  CHECK_EQ(mode_, kRead);
-  CHECK(file_ != nullptr);
-  file_->SeekToFirst();
-}
-
-void KVFileStore::Seek(int offset) {
-  LOG(FATAL) << "Operation not supported.";
-}
-
-bool KVFileStore::Write(const std::string& key, const std::string& value) {
-  CHECK_NE(mode_, kRead);
-  CHECK(file_ != nullptr);
-  return file_->Insert(key, value);
-}
-
-void KVFileStore::Flush() {
-  CHECK_NE(mode_, kRead);
-  CHECK(file_!= nullptr);
-  file_->Flush();
-}
-
-}  // namespace io
-}  // namespace singa
diff --git a/src/io/lmdb_reader.cc b/src/io/lmdb_reader.cc
new file mode 100644
index 0000000..7f78080
--- /dev/null
+++ b/src/io/lmdb_reader.cc
@@ -0,0 +1,118 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "singa/io/reader.h"
+#include "singa/utils/logging.h"
+#ifdef USE_LMDB
+
+namespace singa {
+namespace io {
+bool LMDBReader::Open(const std::string& path) {
+  path_ = path;
+  MDB_CHECK(mdb_env_create(&mdb_env_));
+  int flags = MDB_RDONLY | MDB_NOTLS;
+  int rc = mdb_env_open(mdb_env_, path_.c_str(), flags, 0664);
+#ifndef ALLOW_LMDB_NOLOCK
+  MDB_CHECK(rc);
+#else
+  if (rc == EACCES) {
+    LOG(WARNING) << "Permission denied. Trying with MDB_NOLOCK ...";
+    // Close and re-open environment handle
+    mdb_env_close(mdb_env_);
+    MDB_CHECK(mdb_env_create(&mdb_env_));
+    // Try again with MDB_NOLOCK
+    flags |= MDB_NOLOCK;
+    MDB_CHECK(mdb_env_open(mdb_env_, source.c_str(), flags, 0664));
+  } else {
+    MDB_CHECK(rc);
+  }
+#endif
+  MDB_CHECK(mdb_txn_begin(mdb_env_, NULL, MDB_RDONLY, &mdb_txn_));
+  MDB_CHECK(mdb_dbi_open(mdb_txn_, NULL, 0, &mdb_dbi_));
+  MDB_CHECK(mdb_cursor_open(mdb_txn_, mdb_dbi_, &mdb_cursor_));
+  SeekToFirst();
+  return true;
+}
+
+void LMDBReader::Close() {
+  if (mdb_env_ != nullptr) {
+    mdb_cursor_close(mdb_cursor_);
+    mdb_txn_abort(mdb_txn_);
+    mdb_dbi_close(mdb_env_, mdb_dbi_);
+    mdb_env_close(mdb_env_);
+    mdb_env_ = nullptr;
+    mdb_txn_ = nullptr;
+    mdb_cursor_ = nullptr;
+  }
+}
+
+bool LMDBReader::Read(std::string* key, std::string* value) {
+  if (first_ != true)
+    Seek(MDB_NEXT);
+  if (valid_ == false) return false;
+  *key = string(static_cast<const char*>(mdb_key_.mv_data), mdb_key_.mv_size);
+  *value =
+      string(static_cast<const char*>(mdb_value_.mv_data), mdb_value_.mv_size);
+  first_ = false;
+  return true;
+}
+
+int LMDBReader::Count() {
+  MDB_env* env;
+  MDB_dbi dbi;
+  MDB_txn* txn;
+  MDB_cursor* cursor;
+  int flags = MDB_RDONLY | MDB_NOTLS | MDB_NOLOCK;
+  MDB_CHECK(mdb_env_create(&env));
+  MDB_CHECK(mdb_env_open(env, path_.c_str(), flags, 0664));
+  MDB_CHECK(mdb_txn_begin(env, NULL, MDB_RDONLY, &txn));
+  MDB_CHECK(mdb_dbi_open(txn, NULL, 0, &dbi));
+  MDB_CHECK(mdb_cursor_open(txn, dbi, &cursor));
+  int status = MDB_SUCCESS;
+  int count = 0;
+  MDB_val key, value;
+  while (true) {
+    status = mdb_cursor_get(cursor, &key, &value, MDB_NEXT);
+    if (status == MDB_NOTFOUND) break;
+    count++;
+  }
+  mdb_cursor_close(cursor);
+  mdb_txn_abort(txn);
+  mdb_dbi_close(env, dbi);
+  mdb_env_close(env);
+  return count;
+}
+
+void LMDBReader::SeekToFirst() { Seek(MDB_FIRST); first_ = true; }
+
+void LMDBReader::Seek(MDB_cursor_op op) {
+  int mdb_status = mdb_cursor_get(mdb_cursor_, &mdb_key_, &mdb_value_, op);
+  if (mdb_status == MDB_NOTFOUND) {
+    valid_ = false;
+  } else {
+    MDB_CHECK(mdb_status);
+    valid_ = true;
+  }
+}
+
+inline void LMDBReader::MDB_CHECK(int mdb_status) {
+  CHECK_EQ(mdb_status, MDB_SUCCESS) << mdb_strerror(mdb_status);
+}
+}  // namespace io
+}  // namespace singa
+#endif  // USE_LMDB
diff --git a/src/io/lmdb_writer.cc b/src/io/lmdb_writer.cc
new file mode 100644
index 0000000..e89894b
--- /dev/null
+++ b/src/io/lmdb_writer.cc
@@ -0,0 +1,133 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "singa/io/writer.h"
+#include "singa/utils/logging.h"
+#ifdef USE_LMDB
+
+namespace singa {
+namespace io {
+bool LMDBWriter::Open(const std::string& path, Mode mode) {
+  path_ = path;
+  mode_ = mode;
+  MDB_CHECK(mdb_env_create(&mdb_env_));
+  if (mode_ != kCreate && mode_ != kAppend) {
+    LOG(FATAL) << "unknown mode to open LMDB" << mode_;
+    return false;
+  }
+  if (mode_ == kCreate)
+    // It will fail if there is a dir at "path"
+    CHECK_EQ(mkdir(path.c_str(), 0744), 0) << "mkdir " << path << " failed";
+  int flags = 0;
+  int rc = mdb_env_open(mdb_env_, path.c_str(), flags, 0664);
+#ifndef ALLOW_LMDB_NOLOCK
+  MDB_CHECK(rc);
+#else
+  if (rc == EACCES) {
+    LOG(WARNING) << "Permission denied. Trying with MDB_NOLOCK ...";
+    // Close and re-open environment handle
+    mdb_env_close(mdb_env_);
+    MDB_CHECK(mdb_env_create(&mdb_env_));
+    // Try again with MDB_NOLOCK
+    flags |= MDB_NOLOCK;
+    MDB_CHECK(mdb_env_open(mdb_env_, path.c_str(), flags, 0664));
+  } else
+    MDB_CHECK(rc);
+#endif
+  return true;
+}
+
+void LMDBWriter::Close() {
+  Flush();
+  if (mdb_env_ != nullptr) {
+    mdb_env_close(mdb_env_);
+    mdb_env_ = nullptr;
+  }
+}
+
+bool LMDBWriter::Write(const std::string& key, const std::string& value) {
+  CHECK_NE(key, "") << "Key is an empty string!";
+  keys.push_back(key);
+  values.push_back(value);
+  return true;
+}
+
+// Flush is to "commit to DB"
+void LMDBWriter::Flush() {
+  if (keys.size() == 0) return;
+  MDB_dbi mdb_dbi;
+  MDB_val mdb_key, mdb_data;
+  MDB_txn* mdb_txn;
+
+  // Initialize MDB variables
+  MDB_CHECK(mdb_txn_begin(mdb_env_, NULL, 0, &mdb_txn));
+  MDB_CHECK(mdb_dbi_open(mdb_txn, NULL, 0, &mdb_dbi));
+
+  for (size_t i = 0; i < keys.size(); i++) {
+    mdb_key.mv_size = keys[i].size();
+    mdb_key.mv_data = const_cast<char*>(keys[i].data());
+    mdb_data.mv_size = values[i].size();
+    mdb_data.mv_data = const_cast<char*>(values[i].data());
+
+    // Add data to the transaction
+    int put_rc = mdb_put(mdb_txn, mdb_dbi, &mdb_key, &mdb_data, 0);
+    CHECK_NE(put_rc, MDB_KEYEXIST) << "Key already exist: " << keys[i];
+    if (put_rc == MDB_MAP_FULL) {
+      // Out of memory - double the map size and retry
+      mdb_txn_abort(mdb_txn);
+      mdb_dbi_close(mdb_env_, mdb_dbi);
+      DoubleMapSize();
+      Flush();
+      return;
+    }
+    // May have failed for some other reason
+    MDB_CHECK(put_rc);
+  }
+
+  // Commit the transaction
+  int commit_rc = mdb_txn_commit(mdb_txn);
+  if (commit_rc == MDB_MAP_FULL) {
+    // Out of memory - double the map size and retry
+    mdb_dbi_close(mdb_env_, mdb_dbi);
+    DoubleMapSize();
+    Flush();
+    return;
+  }
+  // May have failed for some other reason
+  MDB_CHECK(commit_rc);
+
+  // Cleanup after successful commit
+  mdb_dbi_close(mdb_env_, mdb_dbi);
+  keys.clear();
+  values.clear();
+}
+
+void LMDBWriter::DoubleMapSize() {
+  struct MDB_envinfo current_info;
+  MDB_CHECK(mdb_env_info(mdb_env_, &current_info));
+  size_t new_size = current_info.me_mapsize * 2;
+  LOG(INFO) << "Doubling LMDB map size to " << (new_size >> 20) << "MB ...";
+  MDB_CHECK(mdb_env_set_mapsize(mdb_env_, new_size));
+}
+
+inline void LMDBWriter::MDB_CHECK(int mdb_status) {
+  CHECK_EQ(mdb_status, MDB_SUCCESS) << mdb_strerror(mdb_status);
+}
+}  // namespace io
+}  // namespace singa
+#endif  // USE_LMDB
diff --git a/src/io/network/endpoint.cc b/src/io/network/endpoint.cc
new file mode 100644
index 0000000..e61acdb
--- /dev/null
+++ b/src/io/network/endpoint.cc
@@ -0,0 +1,831 @@
+/************************************************************
+*
+* Licensed to the Apache Software Foundation (ASF) under one
+* or more contributor license agreements.  See the NOTICE file
+* distributed with this work for additional information
+* regarding copyright ownership.  The ASF licenses this file
+* to you under the Apache License, Version 2.0 (the
+* "License"); you may not use this file except in compliance
+* with the License.  You may obtain a copy of the License at
+*
+*   http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing,
+* software distributed under the License is distributed on an
+* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+* KIND, either express or implied.  See the License for the
+* specific language governing permissions and limitations
+* under the License.
+*
+*************************************************************/
+#include "singa/singa_config.h"
+#ifdef ENABLE_DIST
+
+#include "singa/io/network.h"
+#include "singa/utils/integer.h"
+#include "singa/utils/logging.h"
+
+#include <sys/socket.h>
+#include <netdb.h>
+#include <fcntl.h>
+#include <unistd.h>
+#include <string.h>
+#include <arpa/inet.h>
+
+#include <atomic>
+
+namespace singa {
+
+static void async_ep_cb(struct ev_loop *loop, ev_async *ev, int revent) {
+  reinterpret_cast<NetworkThread *>(ev_userdata(loop))->onNewEp();
+}
+
+static void async_msg_cb(struct ev_loop *loop, ev_async *ev, int revent) {
+  reinterpret_cast<NetworkThread *>(ev_userdata(loop))->onSend();
+}
+
+static void writable_cb(struct ev_loop *loop, ev_io *ev, int revent) {
+  reinterpret_cast<NetworkThread *>(ev_userdata(loop))->onSend(ev->fd);
+}
+
+static void readable_cb(struct ev_loop *loop, ev_io *ev, int revent) {
+  reinterpret_cast<NetworkThread *>(ev_userdata(loop))->onRecv(ev->fd);
+}
+
+static void conn_cb(struct ev_loop *loop, ev_io *ev, int revent) {
+  reinterpret_cast<NetworkThread *>(ev_userdata(loop))->onConnEst(ev->fd);
+}
+
+static void accept_cb(struct ev_loop *loop, ev_io *ev, int revent) {
+  reinterpret_cast<NetworkThread *>(ev_userdata(loop))->onNewConn();
+}
+
+static void timeout_cb(struct ev_loop *loop, ev_timer *ev, int revent) {
+  reinterpret_cast<NetworkThread *>(ev_userdata(loop))->onTimeout(ev);
+}
+
+EndPoint::EndPoint(NetworkThread *t) : thread_(t) {
+  this->timer_.data = reinterpret_cast<void *>(this);
+}
+
+EndPoint::~EndPoint() {
+  while (!recv_.empty()) {
+    delete send_.front();
+    send_.pop();
+  }
+  while (!to_ack_.empty()) {
+    delete send_.front();
+    send_.pop();
+  }
+  while (!send_.empty()) {
+    delete send_.front();
+    send_.pop();
+  }
+}
+
+int EndPoint::send(Message *msg) {
+  CHECK(msg->type_ == MSG_DATA);
+  static std::atomic<uint32_t> id(0);
+  std::unique_lock<std::mutex> lock(this->mtx_);
+
+  if (this->conn_status_ == CONN_ERROR) {
+    LOG(INFO) << "EndPoint " << inet_ntoa(addr_.sin_addr) << " is disconnected";
+    return -1;
+  }
+
+  if (msg->psize_ == 0 && msg->msize_ == 0)
+    // no data to send
+    return 0;
+
+  msg->setId(id++);
+
+  send_.push(new Message(static_cast<Message &&>(*msg)));
+
+  thread_->notify(SIG_MSG);
+  return msg->getSize();
+}
+
+Message *EndPoint::recv() {
+  std::unique_lock<std::mutex> lock(this->mtx_);
+  while (this->recv_.empty() && conn_status_ != CONN_ERROR)
+    this->cv_.wait(lock);
+
+  Message *ret = nullptr;
+  if (!recv_.empty()) {
+    ret = recv_.front();
+    recv_.pop();
+  }
+  return ret;
+}
+
+EndPointFactory::~EndPointFactory() {
+  for (auto &p : ip_ep_map_) {
+    delete p.second;
+  }
+}
+
+EndPoint *EndPointFactory::getOrCreateEp(uint32_t ip) {
+  std::unique_lock<std::mutex> lock(map_mtx_);
+  if (0 == ip_ep_map_.count(ip)) {
+    ip_ep_map_[ip] = new EndPoint(this->thread_);
+  }
+  return ip_ep_map_[ip];
+}
+
+EndPoint *EndPointFactory::getEp(uint32_t ip) {
+  std::unique_lock<std::mutex> lock(map_mtx_);
+  if (0 == ip_ep_map_.count(ip)) {
+    return nullptr;
+  }
+  return ip_ep_map_[ip];
+}
+
+EndPoint *EndPointFactory::getEp(const char *host) {
+  // get the ip address of host
+  struct hostent *he;
+  struct in_addr **list;
+
+  if ((he = gethostbyname(host)) == nullptr) {
+    LOG(INFO) << "Unable to resolve host " << host;
+    return nullptr;
+  }
+
+  list = (struct in_addr **)he->h_addr_list;
+  uint32_t ip = ntohl(list[0]->s_addr);
+
+  EndPoint *ep = nullptr;
+  map_mtx_.lock();
+  if (0 == ip_ep_map_.count(ip)) {
+    ep = new EndPoint(this->thread_);
+    ep->thread_ = this->thread_;
+    ip_ep_map_[ip] = ep;
+
+    // copy the address info
+    bcopy(list[0], &ep->addr_.sin_addr, sizeof(struct in_addr));
+
+    thread_->notify(SIG_EP);
+  }
+  ep = ip_ep_map_[ip];
+  map_mtx_.unlock();
+
+  std::unique_lock<std::mutex> eplock(ep->mtx_);
+  while (ep->conn_status_ == CONN_PENDING || ep->conn_status_ == CONN_INIT) {
+    ep->pending_cnt_++;
+    ep->cv_.wait(eplock);
+    ep->pending_cnt_--;
+  }
+
+  if (ep->conn_status_ == CONN_ERROR) {
+    ep = nullptr;
+  }
+
+  return ep;
+}
+
+void EndPointFactory::getNewEps(std::vector<EndPoint *> &neps) {
+  std::unique_lock<std::mutex> lock(this->map_mtx_);
+  for (auto &p : this->ip_ep_map_) {
+    EndPoint *ep = p.second;
+    std::unique_lock<std::mutex> eplock(ep->mtx_);
+    if (ep->conn_status_ == CONN_INIT) {
+      neps.push_back(ep);
+    }
+  }
+}
+
+NetworkThread::NetworkThread(int port) {
+  this->port_ = port;
+  thread_ = new std::thread([this] { doWork(); });
+  this->epf_ = new EndPointFactory(this);
+}
+
+void NetworkThread::doWork() {
+
+  // prepare event loop
+  if (!(loop_ = ev_default_loop(0))) {
+    // log here
+  }
+
+  ev_async_init(&ep_sig_, async_ep_cb);
+  ev_async_start(loop_, &ep_sig_);
+
+  ev_async_init(&msg_sig_, async_msg_cb);
+  ev_async_start(loop_, &msg_sig_);
+
+  // bind and listen
+  struct sockaddr_in addr;
+  if ((socket_fd_ = socket(AF_INET, SOCK_STREAM, 0)) < 0) {
+    LOG(FATAL) << "Socket Error: " << strerror(errno);
+  }
+
+  bzero(&addr, sizeof(addr));
+  addr.sin_family = AF_INET;
+  addr.sin_port = htons(this->port_);
+  addr.sin_addr.s_addr = INADDR_ANY;
+
+  if (bind(socket_fd_, (struct sockaddr *)&addr, sizeof(addr))) {
+    LOG(FATAL) << "Bind Error: " << strerror(errno);
+  }
+
+  if (listen(socket_fd_, 10)) {
+    LOG(FATAL) << "Listen Error: " << strerror(errno);
+  }
+
+  ev_io_init(&socket_watcher_, accept_cb, socket_fd_, EV_READ);
+  ev_io_start(loop_, &socket_watcher_);
+
+  ev_set_userdata(loop_, this);
+
+  while (1)
+    ev_run(loop_, 0);
+}
+
+void NetworkThread::notify(int signal) {
+  switch (signal) {
+  case SIG_EP:
+    ev_async_send(this->loop_, &this->ep_sig_);
+    break;
+  case SIG_MSG:
+    ev_async_send(this->loop_, &this->msg_sig_);
+    break;
+  default:
+    break;
+  }
+}
+
+void NetworkThread::onNewEp() {
+  std::vector<EndPoint *> neps;
+  this->epf_->getNewEps(neps);
+
+  for (auto &ep : neps) {
+    std::unique_lock<std::mutex> ep_lock(ep->mtx_);
+    int &fd = ep->fd_[0];
+    if (ep->conn_status_ == CONN_INIT) {
+
+      fd = socket(AF_INET, SOCK_STREAM, 0);
+      if (fd < 0) {
+        // resources not available
+        LOG(FATAL) << "Unable to create socket";
+      }
+
+      // set this fd non-blocking
+      fcntl(fd, F_SETFL, fcntl(fd, F_GETFL, 0) | O_NONBLOCK);
+
+      this->fd_ep_map_[fd] = ep;
+
+      // initialize the addess
+      ep->addr_.sin_family = AF_INET;
+      ep->addr_.sin_port = htons(port_);
+      bzero(&(ep->addr_.sin_zero), 8);
+
+      LOG(INFO) << "Connecting to " << inet_ntoa(ep->addr_.sin_addr)
+                << " fd = " << fd;
+      if (connect(fd, (struct sockaddr *)&ep->addr_, sizeof(struct sockaddr))) {
+        LOG(INFO) << "Connect Error: " << strerror(errno);
+        if (errno != EINPROGRESS) {
+          ep->conn_status_ = CONN_ERROR;
+          ep->cv_.notify_all();
+          continue;
+        } else {
+          ep->conn_status_ = CONN_PENDING;
+          ev_io_init(&this->fd_wwatcher_map_[fd], conn_cb, fd, EV_WRITE);
+          ev_io_start(this->loop_, &this->fd_wwatcher_map_[fd]);
+        }
+      } else {
+        afterConnEst(ep, fd, true);
+
+        // connection established immediately
+        // LOG(INFO) << "Connected to " << inet_ntoa(ep->addr_.sin_addr) << " fd
+        // = "<< fd;
+        // ep->conn_status_ = CONN_EST;
+
+        // //ev_io_stop(this->loop_, &this->fd_wwatcher_map_[fd]);
+        // ev_io_init(&fd_wwatcher_map_[fd], writable_cb, fd, EV_WRITE);
+
+        // // poll for new msgs
+        // ev_io_init(&this->fd_rwatcher_map_[fd], readable_cb, fd, EV_READ);
+        // ev_io_start(this->loop_, &this->fd_rwatcher_map_[fd]);
+
+        // asyncSendPendingMsg(ep);
+        // ep->cv_.notify_all();
+      }
+    }
+  }
+}
+
+void NetworkThread::onConnEst(int fd) {
+
+  // EndPoint* ep = epf_->getEp(this->fd_ip_map_[fd]);
+  CHECK(fd_ep_map_.count(fd) > 0);
+  EndPoint *ep = fd_ep_map_.at(fd);
+
+  std::unique_lock<std::mutex> lock(ep->mtx_);
+
+  if (connect(fd, (struct sockaddr *)&ep->addr_, sizeof(struct sockaddr)) < 0 &&
+      errno != EISCONN) {
+    LOG(INFO) << "Unable to connect to " << inet_ntoa(ep->addr_.sin_addr)
+              << ": " << strerror(errno);
+    if (errno == EINPROGRESS) {
+      // continue to watch this socket
+      return;
+    }
+
+    handleConnLost(ep->fd_[0], ep);
+
+    if (ep->conn_status_ == CONN_EST && ep->conn_status_ == CONN_ERROR)
+      ep->cv_.notify_all();
+
+  } else {
+
+    afterConnEst(ep, fd, true);
+
+    // ep->conn_status_ = CONN_EST;
+    //// connect established; poll for new msgs
+    // ev_io_stop(this->loop_, &this->fd_wwatcher_map_[fd]);
+    // ev_io_init(&fd_wwatcher_map_[fd], writable_cb, fd, EV_WRITE);
+
+    // ev_io_init(&this->fd_rwatcher_map_[fd], readable_cb, fd, EV_READ);
+    // ev_io_start(this->loop_, &this->fd_rwatcher_map_[fd]);
+  }
+}
+
+void NetworkThread::onNewConn() {
+  // accept new tcp connection
+  struct sockaddr_in addr;
+  socklen_t len = sizeof(addr);
+  int fd = accept(socket_fd_, (struct sockaddr *)&addr, &len);
+  if (fd < 0) {
+    LOG(INFO) << "Accept Error: " << strerror(errno);
+    return;
+  }
+
+  LOG(INFO) << "Accept a client from " << inet_ntoa(addr.sin_addr)
+            << ", fd = " << fd;
+
+  // set this fd as non-blocking
+  fcntl(fd, F_SETFL, fcntl(fd, F_GETFL, 0) | O_NONBLOCK);
+
+  EndPoint *ep;
+  uint32_t a = ntohl(addr.sin_addr.s_addr);
+
+  ep = epf_->getOrCreateEp(a);
+  std::unique_lock<std::mutex> lock(ep->mtx_);
+
+  // Passive connection
+  afterConnEst(ep, fd, false);
+
+  // record the remote address
+  bcopy(&addr, &ep->addr_, len);
+}
+
+void NetworkThread::onTimeout(struct ev_timer *timer) {
+
+  EndPoint *ep = reinterpret_cast<EndPoint *>(timer->data);
+
+  ev_tstamp timeout = EP_TIMEOUT + ep->last_msg_time_;
+  ev_tstamp now = ev_now(loop_);
+
+  std::unique_lock<std::mutex> lock(ep->mtx_);
+  if (now > timeout) {
+    if (!ep->to_ack_.empty() || !ep->send_.empty()) {
+
+      LOG(INFO) << "EndPoint " << inet_ntoa(ep->addr_.sin_addr) << " timeouts";
+      // we consider this ep has been disconnected
+      for (int i = 0; i < 2; ++i) {
+        int fd = ep->fd_[i];
+        if (fd >= 0)
+          handleConnLost(fd, ep);
+      }
+      return;
+    }
+
+    timer->repeat = EP_TIMEOUT;
+
+  } else {
+    timer->repeat = timeout - now;
+  }
+
+  ev_timer_again(loop_, &ep->timer_);
+}
+
+/**
+ * @brief The processing for a connected socket
+ *
+ * @param ep
+ * @param fd
+ * @param active indicate whethen this socket is locally initiated or not
+ */
+void NetworkThread::afterConnEst(EndPoint *ep, int fd, bool active) {
+
+  if (active)
+    LOG(INFO) << "Connected to " << inet_ntoa(ep->addr_.sin_addr)
+              << ", fd = " << fd;
+
+  int sfd;
+
+  if (active) {
+    ep->fd_[0] = fd;
+    sfd = ep->fd_[1];
+  } else {
+    if (ep->fd_[1] >= 0) {
+      // the previous connection is lost
+      handleConnLost(ep->fd_[1], ep, false);
+    }
+    ep->fd_[1] = fd;
+    sfd = ep->fd_[0];
+  }
+
+  if (sfd == fd) {
+    // this fd is a reuse of a previous socket fd
+    // so we first need to clean the resouce for that fd
+    // we duplicate this fd to let the resouce of the oldf fd can be freed
+    // also indicate there is no need to reconnect
+    fd = dup(fd);
+    handleConnLost(sfd, ep, false);
+  }
+
+  // initialize io watchers and add the read watcher to the ev loop
+  ev_io_init(&fd_rwatcher_map_[fd], readable_cb, fd, EV_READ);
+  ev_io_start(loop_, &fd_rwatcher_map_[fd]);
+
+  // stop watching the writable watcher if necessary
+  if (active)
+    ev_io_stop(loop_, &fd_wwatcher_map_[fd]);
+  ev_io_init(&fd_wwatcher_map_[fd], writable_cb, fd, EV_WRITE);
+
+  ep->last_msg_time_ = ev_now(loop_);
+
+  // see whether there is already a established connection for this fd
+  if (ep->conn_status_ == CONN_EST && sfd >= 0) {
+    // check if fd and sfd are associate with the same socket
+    struct sockaddr_in addr;
+    socklen_t len;
+    if (getsockname(fd, (struct sockaddr *)&addr, &len)) {
+      LOG(INFO) << "Unable to get local socket address: " << strerror(errno);
+    } else {
+      // see whether the local address of fd is the same as the remote side
+      // of sfd, which has already been stored in ep->addr_
+      if (addr.sin_addr.s_addr == ep->addr_.sin_addr.s_addr &&
+          addr.sin_port == ep->addr_.sin_port) {
+        LOG(INFO) << fd << " and " << sfd
+                  << " are associated with the same socket";
+        ep->is_socket_loop_ = true;
+      } else {
+        // this socket is redundant, we close it maunally if the local ip
+        // is smaller than the peer ip
+        if ((addr.sin_addr.s_addr < ep->addr_.sin_addr.s_addr) ||
+            (addr.sin_addr.s_addr == ep->addr_.sin_addr.s_addr &&
+             addr.sin_port < ep->addr_.sin_port))
+          handleConnLost(fd, ep, false);
+      }
+    }
+  } else {
+    ep->pfd_ = fd; // set the primary fd
+    ep->conn_status_ = CONN_EST;
+
+    // start timeout watcher to detect the liveness of EndPoint
+    ev_init(&ep->timer_, timeout_cb);
+    ep->timer_.repeat = EP_TIMEOUT;
+    ev_timer_start(loop_, &ep->timer_);
+    // timeout_cb(loop_, &ep->timer_, EV_TIMER);
+  }
+
+  if (fd == ep->pfd_) {
+    this->asyncSendPendingMsg(ep);
+  }
+
+  fd_ep_map_[fd] = ep;
+
+  // Finally notify all waiting threads
+  // if this connection is initiaed by remote side,
+  // we dont need to notify the waiting thread
+  // later threads wanting to send to this ep, however,
+  // are able to reuse this ep
+  if (active) {
+    ep->cv_.notify_all();
+  }
+}
+
+void NetworkThread::onSend(int fd) {
+  std::vector<int> invalid_fd;
+
+  if (fd == -1) {
+    // LOG(INFO) << "There are " << fd_ip_map_.size() << " connections";
+    // this is a signal of new message to send
+    for (auto &p : fd_ep_map_) {
+      // send message
+      // LOG(INFO) << "Try to send over fd " << p.first;
+      if (asyncSend(p.first) < 0)
+        invalid_fd.push_back(p.first);
+    }
+  } else {
+    if (asyncSend(fd) < 0)
+      invalid_fd.push_back(fd);
+  }
+
+  for (auto &p : invalid_fd) {
+    // EndPoint* ep = epf_->getEp(fd_ip_map_.at(p));
+    EndPoint *ep = fd_ep_map_.at(p);
+    std::unique_lock<std::mutex> lock(ep->mtx_);
+    handleConnLost(p, ep);
+  }
+}
+
+void NetworkThread::asyncSendPendingMsg(EndPoint *ep) {
+  // simply put the pending msgs to the send queue
+
+  LOG(INFO) << "There are " << ep->send_.size() << " to-send msgs, and "
+            << ep->to_ack_.size() << " to-ack msgs";
+
+  if (!ep->to_ack_.empty()) {
+    while (!ep->send_.empty()) {
+      ep->to_ack_.push(ep->send_.front());
+      ep->send_.pop();
+    }
+    std::swap(ep->send_, ep->to_ack_);
+  }
+
+  if (ep->send_.size() > 0) {
+    notify(SIG_MSG);
+  }
+}
+
+/**
+ * @brief non-locking send;
+ *
+ * @param ep
+ *
+ */
+int NetworkThread::asyncSend(int fd) {
+
+  // EndPoint* ep = epf_->getEp(fd_ip_map_[fd]);
+  CHECK(fd_ep_map_.count(fd) > 0);
+  EndPoint *ep = fd_ep_map_.at(fd);
+
+  std::unique_lock<std::mutex> ep_lock(ep->mtx_);
+
+  if (fd != ep->pfd_)
+    // we only send over the primary fd
+    // return -1 to indicate this fd is redundant
+    return ep->is_socket_loop_ ? 0 : -1;
+
+  if (ep->conn_status_ != CONN_EST)
+    // This happens during reconnection
+    goto out;
+
+  while (!ep->send_.empty()) {
+
+    Message &msg = *ep->send_.front();
+    int nbytes;
+
+    while (msg.processed_ < msg.getSize()) {
+      if (msg.type_ == MSG_ACK) {
+        nbytes = write(fd, msg.mdata_ + msg.processed_,
+                       msg.getSize() - msg.processed_);
+      } else
+        nbytes = write(fd, msg.msg_ + msg.processed_,
+                       msg.getSize() - msg.processed_);
+
+      if (nbytes == -1) {
+        if (errno == EWOULDBLOCK) {
+          if (!ev_is_active(&fd_wwatcher_map_[fd]) &&
+              !ev_is_pending(&fd_wwatcher_map_[fd]))
+            ev_io_start(loop_, &fd_wwatcher_map_[fd]);
+          goto out;
+        } else {
+          // this connection is lost; reset the send status
+          // so that next time the whole msg would be sent entirely
+          msg.processed_ = 0;
+          goto err;
+        }
+      } else {
+        ep->last_msg_time_ = ev_now(loop_);
+        msg.processed_ += nbytes;
+      }
+
+      // std::size_t m, p;
+      // uint8_t type;
+      // uint32_t id;
+      // if (msg.msg_) {
+      //    readInteger(msg.msg_, type, id, m, p);
+      //    LOG(INFO) << "Send " << msg.processed_ << " bytes to " <<
+      // inet_ntoa(ep->addr_.sin_addr) << " over fd " << fd << " for the current
+      // DATA MSG " << msg.id_ << ", " << id << ", " << m << ", " << p;
+      //}
+    }
+
+    CHECK(msg.processed_ == msg.getSize());
+
+    if (msg.type_ != MSG_ACK) {
+      LOG(INFO) << "Send a DATA message to " << inet_ntoa(ep->addr_.sin_addr)
+                << " for MSG " << msg.id_ << ", len = " << msg.getSize()
+                << " over fd " << fd;
+      msg.processed_ = 0;
+      ep->to_ack_.push(&msg);
+    } else {
+      // LOG(INFO) << "Send an ACK message to " << inet_ntoa(ep->addr_.sin_addr)
+      // << " for MSG " << msg.id_;
+      delete &msg;
+    }
+
+    ep->send_.pop();
+
+    // for test
+    // if (ep->retry_cnt_ == 0) {
+    //     LOG(INFO) << "Disconnect with Endpoint " <<
+    // inet_ntoa(ep->addr_.sin_addr) << " over fd " << fd;
+    //     close(fd);
+    //     goto err;
+    // }
+  }
+out:
+  if (ep->send_.empty())
+    ev_io_stop(loop_, &this->fd_wwatcher_map_[fd]);
+  return 0;
+err:
+  return -1;
+}
+
+void NetworkThread::onRecv(int fd) {
+
+  Message *m = &pending_msgs_[fd];
+  Message &msg = (*m);
+  int nread;
+  // EndPoint* ep = epf_->getEp(fd_ip_map_[fd]);
+
+  CHECK(fd_ep_map_.count(fd) > 0);
+  EndPoint *ep = fd_ep_map_.at(fd);
+
+  // LOG(INFO) << "Start to read from EndPoint " <<
+  // inet_ntoa(ep->addr_.sin_addr) << " over fd " << fd;
+
+  std::unique_lock<std::mutex> lock(ep->mtx_);
+
+  ep->last_msg_time_ = ev_now(loop_);
+  while (1) {
+    if (msg.processed_ < Message::hsize_) {
+      nread = read(fd, msg.mdata_ + msg.processed_,
+                   Message::hsize_ - msg.processed_);
+
+      if (nread <= 0) {
+        if (errno != EWOULDBLOCK || nread == 0) {
+          // socket error or shuts down
+          if (nread < 0)
+            LOG(INFO) << "Fail to receive from EndPoint "
+                      << inet_ntoa(ep->addr_.sin_addr) << ": "
+                      << strerror(errno);
+          else
+            LOG(INFO) << "Fail to receive from EndPoint "
+                      << inet_ntoa(ep->addr_.sin_addr)
+                      << ": Connection reset by remote side";
+          handleConnLost(fd, ep);
+        }
+        break;
+      }
+
+      msg.processed_ += nread;
+      while (msg.processed_ >= sizeof(msg.type_) + sizeof(msg.id_)) {
+        readInteger(msg.mdata_, msg.type_, msg.id_);
+        if (msg.type_ == MSG_ACK) {
+          LOG(INFO) << "Receive an ACK message from "
+                    << inet_ntoa(ep->addr_.sin_addr) << " for MSG " << msg.id_;
+          while (!ep->to_ack_.empty()) {
+            Message *m = ep->to_ack_.front();
+            if (m->id_ <= msg.id_) {
+              delete m;
+              ep->to_ack_.pop();
+            } else {
+              break;
+            }
+          }
+
+          // reset
+          msg.processed_ -= sizeof(msg.type_) + sizeof(msg.id_);
+          memmove(msg.mdata_, msg.mdata_ + sizeof(msg.type_) + sizeof(msg.id_),
+                  msg.processed_);
+
+        } else
+          break;
+      }
+
+      if (msg.processed_ < Message::hsize_) {
+        continue;
+      }
+
+      // got the whole metadata;
+      readInteger(msg.mdata_, msg.type_, msg.id_, msg.msize_, msg.psize_);
+
+      LOG(INFO) << "Receive a message: id = " << msg.id_
+                << ", msize_ = " << msg.msize_ << ", psize_ = " << msg.psize_
+                << " from " << inet_ntoa(ep->addr_.sin_addr) << " over fd "
+                << fd;
+    }
+
+    // start reading the real data
+    if (msg.msg_ == nullptr) {
+      msg.msg_ = new char[msg.getSize()];
+      memcpy(msg.msg_, msg.mdata_, Message::hsize_);
+    }
+
+    nread = read(fd, msg.msg_ + msg.processed_, msg.getSize() - msg.processed_);
+    if (nread <= 0) {
+      if (errno != EWOULDBLOCK || nread == 0) {
+        // socket error or shuts down
+        if (nread < 0)
+          LOG(INFO) << "Fail to receive from EndPoint "
+                    << inet_ntoa(ep->addr_.sin_addr) << ": " << strerror(errno);
+        else
+          LOG(INFO) << "Fail to receive from EndPoint "
+                    << inet_ntoa(ep->addr_.sin_addr)
+                    << ": Connection reset by remote side";
+        handleConnLost(fd, ep);
+      }
+      break;
+    }
+
+    msg.processed_ += nread;
+
+    // LOG(INFO) << "Receive a message: id = " << msg.id_ << ", msize_ = " <<
+    // msg.msize_ << ", psize_ = " << msg.psize_ << ", processed_ = " <<
+    // msg.processed_ << " from " << inet_ntoa(ep->addr_.sin_addr) << " over fd
+    // " << fd;
+
+    if (msg.processed_ == msg.getSize()) {
+      LOG(INFO) << "Receive a " << msg.processed_ << " bytes DATA message from "
+                << inet_ntoa(ep->addr_.sin_addr) << " with id " << msg.id_;
+      ep->recv_.push(new Message(static_cast<Message &&>(msg)));
+      // notify of waiting thread
+      ep->cv_.notify_one();
+      ep->send_.push(new Message(MSG_ACK, msg.id_));
+      msg.processed_ = 0;
+    }
+  }
+}
+
+/**
+ * @brief clean up for the lost connection; the caller should acquire the lock
+ * for the respective endpoint
+ *
+ * @param fd
+ * @param ep
+ * @param reconn
+ */
+void NetworkThread::handleConnLost(int fd, EndPoint *ep, bool reconn) {
+  CHECK(fd >= 0);
+  LOG(INFO) << "Lost connection to EndPoint " << inet_ntoa(ep->addr_.sin_addr)
+            << ", fd = " << fd;
+
+  this->pending_msgs_.erase(fd);
+  this->fd_ep_map_.erase(fd);
+  ev_io_stop(loop_, &this->fd_wwatcher_map_[fd]);
+  ev_io_stop(loop_, &this->fd_rwatcher_map_[fd]);
+  fd_wwatcher_map_.erase(fd);
+  fd_rwatcher_map_.erase(fd);
+  close(fd);
+
+  if (fd == ep->pfd_) {
+    if (!ep->send_.empty())
+      ep->send_.front()->processed_ = 0;
+  }
+
+  int sfd = (fd == ep->fd_[0]) ? ep->fd_[1] : ep->fd_[0];
+  if (fd == ep->fd_[0])
+    ep->fd_[0] = -1;
+  else
+    ep->fd_[1] = -1;
+
+  if (reconn) {
+    // see if the other fd is alive or not
+    if (sfd < 0) {
+      if (ep->conn_status_ == CONN_EST)
+        ev_timer_stop(loop_, &ep->timer_);
+      if (ep->retry_cnt_ < MAX_RETRY_CNT) {
+        // notify myself for retry
+        ep->retry_cnt_++;
+        ep->conn_status_ = CONN_INIT;
+        LOG(INFO) << "Reconnect to EndPoint " << inet_ntoa(ep->addr_.sin_addr);
+        this->notify(SIG_EP);
+      } else {
+        LOG(INFO) << "Maximum retry count achieved for EndPoint "
+                  << inet_ntoa(ep->addr_.sin_addr);
+        ep->conn_status_ = CONN_ERROR;
+
+        // notify all threads that this ep is no longer connected
+        ep->cv_.notify_all();
+      }
+    } else {
+      if (!ep->is_socket_loop_) {
+        // if there is another working fd, set this fd as primary and
+        // send data over this fd
+        ep->pfd_ = sfd;
+        ep->last_msg_time_ = ev_now(loop_);
+        asyncSendPendingMsg(ep);
+      } else {
+        handleConnLost(sfd, ep);
+      }
+    }
+  }
+}
+}
+
+#endif // ENABLE_DIST
diff --git a/src/io/network/message.cc b/src/io/network/message.cc
new file mode 100644
index 0000000..32f29b7
--- /dev/null
+++ b/src/io/network/message.cc
@@ -0,0 +1,95 @@
+/************************************************************
+*
+* Licensed to the Apache Software Foundation (ASF) under one
+* or more contributor license agreements.  See the NOTICE file
+* distributed with this work for additional information
+* regarding copyright ownership.  The ASF licenses this file
+* to you under the Apache License, Version 2.0 (the
+* "License"); you may not use this file except in compliance
+* with the License.  You may obtain a copy of the License at
+*
+*   http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing,
+* software distributed under the License is distributed on an
+* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+* KIND, either express or implied.  See the License for the
+* specific language governing permissions and limitations
+* under the License.
+*
+*************************************************************/
+#include "singa/singa_config.h"
+#ifdef ENABLE_DIST
+
+#include <cstdlib>
+#include <cstring>
+
+#include <atomic>
+
+#include "singa/io/network.h"
+#include "singa/utils/integer.h"
+
+namespace singa {
+
+Message::Message(Message &&msg) {
+  std::swap(msize_, msg.msize_);
+  std::swap(psize_, msg.psize_);
+  std::swap(msg_, msg.msg_);
+  std::swap(type_, msg.type_);
+  std::swap(id_, msg.id_);
+}
+
+Message::Message(int type, uint32_t ack_msg_id) : type_(type), id_(ack_msg_id) {
+  if (type_ == MSG_ACK)
+    appendInteger(mdata_, type_, id_);
+}
+
+Message::~Message() {
+  if (msg_)
+    free(msg_);
+}
+
+std::size_t Message::getSize() {
+  if (type_ == MSG_ACK)
+    return sizeof(type_) + sizeof(id_);
+  else
+    return this->hsize_ + this->psize_ + this->msize_;
+}
+
+void Message::setId(uint32_t id) {
+  this->id_ = id;
+  appendInteger(msg_, type_, id_);
+}
+
+void Message::setMetadata(const void *buf, int size) {
+  this->msize_ = size;
+  msg_ = (char *)malloc(this->getSize());
+  appendInteger(msg_, type_, id_, msize_, psize_);
+  memcpy(msg_ + hsize_, buf, size);
+}
+
+void Message::setPayload(const void *buf, int size) {
+  this->psize_ = size;
+  msg_ = (char *)realloc(msg_, this->getSize());
+  appendInteger(msg_ + hsize_ - sizeof(psize_), psize_);
+  memcpy(msg_ + hsize_ + msize_, buf, size);
+}
+
+std::size_t Message::getMetadata(void **p) {
+  if (this->msize_ == 0)
+    *p = nullptr;
+  else
+    *p = msg_ + hsize_;
+  return this->msize_;
+}
+
+std::size_t Message::getPayload(void **p) {
+  if (this->psize_ == 0)
+    *p = nullptr;
+  else
+    *p = msg_ + hsize_ + msize_;
+  return this->psize_;
+}
+}
+
+#endif  // ENABLE_DIST
diff --git a/src/io/snapshot.cc b/src/io/snapshot.cc
new file mode 100644
index 0000000..58c7044
--- /dev/null
+++ b/src/io/snapshot.cc
@@ -0,0 +1,106 @@
+/************************************************************
+*
+* Licensed to the Apache Software Foundation (ASF) under one
+* or more contributor license agreements.  See the NOTICE file
+* distributed with this work for additional information
+* regarding copyright ownership.  The ASF licenses this file
+* to you under the Apache License, Version 2.0 (the
+* "License"); you may not use this file except in compliance
+* with the License.  You may obtain a copy of the License at
+*
+*   http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing,
+* software distributed under the License is distributed on an
+* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+* KIND, either express or implied.  See the License for the
+* specific language governing permissions and limitations
+* under the License.
+*
+*************************************************************/
+
+#include "singa/io/snapshot.h"
+
+#include <string>
+#include <unordered_set>
+#include <unordered_map>
+#include <memory>
+#include <utility>
+#include <iostream>
+
+namespace singa {
+Snapshot::Snapshot(const std::string& prefix, Mode mode, int max_param_size /*in MB*/)
+    : prefix_(prefix),
+      mode_(mode),
+      bin_writer_ptr_(mode_ == kWrite ? (new io::BinFileWriter) : nullptr),
+      text_writer_ptr_(mode_ == kWrite ? (new io::TextFileWriter) : nullptr),
+      bin_reader_ptr_(mode_ == kRead ? (new io::BinFileReader) : nullptr) {
+  if (mode_ == kWrite) {
+    bin_writer_ptr_->Open(prefix + ".model", io::kCreate, max_param_size << 20);
+    text_writer_ptr_->Open(prefix + ".desc", io::kCreate);
+  } else if (mode == kRead) {
+    bin_reader_ptr_->Open(prefix + ".model", max_param_size << 20);
+    std::string key, serialized_str;
+    singa::TensorProto tp;
+    while (bin_reader_ptr_->Read(&key, &serialized_str)) {
+      CHECK(param_names_.count(key) == 0);
+      param_names_.insert(key);
+      CHECK(tp.ParseFromString(serialized_str));
+      param_map_[key].FromProto(tp);
+    }
+  } else {
+    LOG(FATAL)
+        << "Mode for snapshot should be Snapshot::kWrite or Snapshot::kRead";
+  }
+}
+
+void Snapshot::Write(const std::string& key, const Tensor& param) {
+  CHECK(mode_ == kWrite);
+  CHECK(param_names_.count(key) == 0);
+  param_names_.insert(key);
+  TensorProto tp;
+  param.ToProto(&tp);
+  std::string serialized_str;
+  CHECK(tp.SerializeToString(&serialized_str));
+  bin_writer_ptr_->Write(key, serialized_str);
+//  bin_writer_ptr_->Flush();
+
+  std::string desc_str = "parameter name: " + key;
+  Shape shape = param.shape();
+  desc_str += "\tdata type: " + std::to_string(param.data_type());
+  desc_str += "\tdim: " + std::to_string(shape.size());
+  desc_str += "\tshape:";
+  for (size_t s : shape) desc_str += " " + std::to_string(s);
+  text_writer_ptr_->Write(key, desc_str);
+ // text_writer_ptr_->Flush();
+}
+
+std::vector<std::pair<std::string, Tensor>> Snapshot::Read() {
+  CHECK(mode_ == kRead);
+  std::vector<std::pair<std::string, Tensor>> ret;
+  for (auto it = param_map_.begin(); it != param_map_.end(); ++it)
+    ret.push_back(*it);
+  return ret;
+}
+
+std::vector<std::pair<std::string, Shape>> Snapshot::ReadShape() {
+  CHECK(mode_ == kRead);
+  std::vector<std::pair<std::string, Shape>> ret;
+  for (auto it = param_map_.begin(); it != param_map_.end(); ++it)
+    ret.push_back(std::make_pair(it->first, it->second.shape()));
+  return ret;
+}
+
+Tensor Snapshot::Read(const std::string& key) {
+  CHECK(mode_ == kRead);
+  CHECK(param_map_.count(key) == 1);
+  return param_map_[key];
+}
+
+Shape Snapshot::ReadShape(const std::string& key) {
+  CHECK(mode_ == kRead);
+  CHECK(param_map_.count(key) == 1);
+  return param_map_[key].shape();
+}
+
+}  //  namespace singa
diff --git a/src/io/store.cc b/src/io/store.cc
deleted file mode 100644
index 4621772..0000000
--- a/src/io/store.cc
+++ /dev/null
@@ -1,70 +0,0 @@
-/************************************************************
-*
-* Licensed to the Apache Software Foundation (ASF) under one
-* or more contributor license agreements.  See the NOTICE file
-* distributed with this work for additional information
-* regarding copyright ownership.  The ASF licenses this file
-* to you under the Apache License, Version 2.0 (the
-* "License"); you may not use this file except in compliance
-* with the License.  You may obtain a copy of the License at
-*
-*   http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an
-* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-* KIND, either express or implied.  See the License for the
-* specific language governing permissions and limitations
-* under the License.
-*
-*************************************************************/
-
-#include "singa/io/store.h"
-#include <glog/logging.h>
-#include "singa/io/kvfile_store.h"
-#include "singa/io/textfile_store.h"
-#ifdef USE_HDFS
-#include "singa/io/hdfs_store.h"
-#endif
-
-namespace singa {
-namespace io {
-
-Store* CreateStore(const std::string& backend) {
-  Store *store = nullptr;
-  if (backend.compare("textfile") == 0) {
-    store = new TextFileStore();
-  } else if (backend.compare("kvfile") == 0) {
-    store = new KVFileStore();
-  }
-
-#ifdef USE_LMDB
-  if (backend == "lmdb") {
-    store = new LMDBStore();
-  }
-#endif
-
-#ifdef USE_OPENCV
-  if (backend == "imagefolder") {
-    store = new ImageFolderStore();
-  }
-#endif
-
-#ifdef USE_HDFS
-  if (backend == "hdfsfile") {
-    store = new HDFSStore();
-  }
-#endif
-
-  CHECK(store) << "Backend type (" << backend << ") not recognized";
-  return store;
-}
-
-Store* OpenStore(const string& backend, const string& path, Mode mode) {
-  auto store = CreateStore(backend);
-  store->Open(path, mode);
-  return store;
-}
-
-}  // namespace io
-}  // namespace singa
diff --git a/src/io/textfile_reader.cc b/src/io/textfile_reader.cc
new file mode 100644
index 0000000..16abc9e
--- /dev/null
+++ b/src/io/textfile_reader.cc
@@ -0,0 +1,69 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "singa/io/reader.h"
+#include "singa/utils/logging.h"
+
+namespace singa {
+namespace io {
+bool TextFileReader::Open(const std::string& path) {
+  path_ = path;
+  fdat_.open(path_, std::ios::in);
+  CHECK(fdat_.is_open()) << "Cannot open file " << path_;
+  return fdat_.is_open();
+}
+
+void TextFileReader::Close() {
+  if (fdat_.is_open()) fdat_.close();
+}
+
+bool TextFileReader::Read(std::string* key, std::string* value) {
+  CHECK(fdat_.is_open()) << "File not open!";
+  key->clear();
+  value->clear();
+  if (!std::getline(fdat_, *value)) {
+    if (fdat_.eof())
+      return false;
+    else
+      LOG(FATAL) << "Error in reading text file";
+  }
+  *key = std::to_string(lineNo_++);
+  return true;
+}
+
+int TextFileReader::Count() {
+  std::ifstream fin(path_, std::ios::in);
+  CHECK(fin.is_open()) << "Cannot open file " << path_;
+  int count = 0;
+  string line;
+  while (!fin.eof()) {
+    std::getline(fin, line);
+    if (line != "") count++;
+  }
+  fin.close();
+  return count;
+}
+
+void TextFileReader::SeekToFirst() {
+  CHECK(fdat_.is_open());
+  lineNo_ = 0;
+  fdat_.clear();
+  fdat_.seekg(0);
+}
+}  // namespace io
+}  // namespace singa
diff --git a/src/io/textfile_store.cc b/src/io/textfile_store.cc
deleted file mode 100644
index 4c2f1b9..0000000
--- a/src/io/textfile_store.cc
+++ /dev/null
@@ -1,89 +0,0 @@
-/************************************************************
-*
-* Licensed to the Apache Software Foundation (ASF) under one
-* or more contributor license agreements.  See the NOTICE file
-* distributed with this work for additional information
-* regarding copyright ownership.  The ASF licenses this file
-* to you under the Apache License, Version 2.0 (the
-* "License"); you may not use this file except in compliance
-* with the License.  You may obtain a copy of the License at
-*
-*   http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an
-* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-* KIND, either express or implied.  See the License for the
-* specific language governing permissions and limitations
-* under the License.
-*
-*************************************************************/
-
-
-#include "singa/io/textfile_store.h"
-#include <glog/logging.h>
-
-namespace singa {
-namespace io {
-
-bool TextFileStore::Open(const std::string& source, Mode mode) {
-  if (mode == kRead)
-    fs_ = new std::fstream(source, std::fstream::in);
-  else if (mode == kCreate)
-    fs_ = new std::fstream(source, std::fstream::out);
-  mode_ = mode;
-  return fs_->is_open();
-}
-
-void TextFileStore::Close() {
-  if (fs_ != nullptr) {
-    if (fs_->is_open()) {
-      if (mode_ != kRead)
-        fs_->flush();
-      fs_->close();
-    }
-    delete fs_;
-    fs_ = nullptr;
-  }
-}
-
-bool TextFileStore::Read(std::string* key, std::string* value) {
-  CHECK_EQ(mode_, kRead);
-  CHECK(fs_ != nullptr);
-  CHECK(value != nullptr);
-  CHECK(key != nullptr);
-  if (!std::getline(*fs_, *value)) {
-    if (fs_->eof())
-      return false;
-    else
-      LOG(FATAL) << "error in reading csv file";
-  }
-  *key = std::to_string(lineNo_++);
-  return true;
-}
-
-void TextFileStore::SeekToFirst() {
-  CHECK_EQ(mode_, kRead);
-  CHECK(fs_ != nullptr);
-  lineNo_ = 0;
-  fs_->clear();
-  fs_->seekg(0);
-}
-
-void TextFileStore::Seek(int offset) {
-}
-
-bool TextFileStore::Write(const std::string& key, const std::string& value) {
-  CHECK_NE(mode_, kRead);
-  CHECK(fs_ != nullptr);
-  // csv store does not write key
-  *fs_ << value << '\n';
-  return true;
-}
-
-void TextFileStore::Flush() {
-  fs_->flush();
-}
-
-}  // namespace io
-}  // namespace singa
diff --git a/src/io/textfile_writer.cc b/src/io/textfile_writer.cc
new file mode 100644
index 0000000..7868b85
--- /dev/null
+++ b/src/io/textfile_writer.cc
@@ -0,0 +1,61 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "singa/io/writer.h"
+#include "singa/utils/logging.h"
+
+namespace singa {
+namespace io {
+bool TextFileWriter::Open(const std::string& path, Mode mode) {
+  CHECK(!fdat_.is_open());
+  path_ = path;
+  mode_ = mode;
+  switch (mode) {
+    case kCreate:
+      fdat_.open(path_, std::ios::out | std::ios::trunc);
+      CHECK(fdat_.is_open()) << "Cannot create file " << path_;
+      break;
+    case kAppend:
+      fdat_.open(path_, std::ios::app);
+      CHECK(fdat_.is_open()) << "Cannot open file " << path_;
+      break;
+    default:
+      LOG(FATAL) << "unknown mode to open text file " << mode;
+      break;
+  }
+  return fdat_.is_open();
+}
+
+void TextFileWriter::Close() {
+  Flush();
+  if (fdat_.is_open()) fdat_.close();
+}
+
+bool TextFileWriter::Write(const std::string& key, const std::string& value) {
+  CHECK(fdat_.is_open()) << "File not open!";
+  if (value.size() == 0) return false;
+  fdat_ << value << std::endl;
+  return true;
+}
+
+void TextFileWriter::Flush() {
+  if (fdat_.is_open())
+    fdat_.flush();
+}
+}  // namespace io
+}  // namespace singa
diff --git a/src/main.cc b/src/main.cc
deleted file mode 100644
index 0ce7d19..0000000
--- a/src/main.cc
+++ /dev/null
@@ -1,79 +0,0 @@
-/************************************************************
-*
-* Licensed to the Apache Software Foundation (ASF) under one
-* or more contributor license agreements.  See the NOTICE file
-* distributed with this work for additional information
-* regarding copyright ownership.  The ASF licenses this file
-* to you under the Apache License, Version 2.0 (the
-* "License"); you may not use this file except in compliance
-* with the License.  You may obtain a copy of the License at
-*
-*   http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an
-* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-* KIND, either express or implied.  See the License for the
-* specific language governing permissions and limitations
-* under the License.
-*
-*************************************************************/
-
-#include <glog/logging.h>
-#include <iostream>
-#include "singa/singa.h"
-
-/**
- * \file main.cc provides an example main function.
- *
- * Like the main func of Hadoop, it prepares the job configuration and submit it
- * to the Driver which starts the training.
- *
- * Users can define their own main func to prepare the job configuration in
- * different ways other than reading it from a configuration file. But the main
- * func must call Driver::Init at the beginning, and pass the job configuration
- * and resume option to the Driver for job submission.
- *
- * Optionally, users can register their own implemented subclasses of Layer,
- * Updater, etc. through the registration function provided by the Driver.
- *
- * Users must pass at least one argument to the singa-run.sh, i.e., the job
- * configuration file which includes the cluster topology setting. Other fields
- * e.g, neuralnet, updater can be configured in main.cc.
- *
- * TODO
- * Add helper functions for users to generate configurations for popular models
- * easily, e.g., MLP(layer1_size, layer2_size, tanh, loss);
- */
-int main(int argc, char **argv) {
-  if (argc < 2) {
-    std::cout << "Args: -conf JOB_CONF [-singa SINGA_CONF] [-job_id JOB_ID] "
-              << " [-resume|-test]\n"
-              << "-resume\t resume training from latest checkpoint files\n"
-              << "-test\t test performance or extract features\n";
-    return 0;
-  }
-
-  // initialize glog before creating the driver
-  google::InitGoogleLogging(argv[0]);
-
-  // must create driver at the beginning and call its Init method.
-  singa::Driver driver;
-  driver.Init(argc, argv);
-
-  // users can register new subclasses of layer, updater, etc.
-
-  // get the job conf, and custmize it if need
-  singa::JobProto jobConf = driver.job_conf();
-
-  if (singa::ArgPos(argc, argv, "-test") != -1) {
-    driver.Test(jobConf);
-  } else {
-    // if -resume in argument list, set resume to true; otherwise false
-    int resume_pos = singa::ArgPos(argc, argv, "-resume");
-    bool resume = (resume_pos != -1);
-    // submit the job for training
-    driver.Train(resume, jobConf);
-  }
-  return 0;
-}
diff --git a/src/model/feed_forward_net.cc b/src/model/feed_forward_net.cc
new file mode 100644
index 0000000..3875430
--- /dev/null
+++ b/src/model/feed_forward_net.cc
@@ -0,0 +1,302 @@
+/************************************************************
+*
+* Licensed to the Apache Software Foundation (ASF) under one
+* or more contributor license agreements.  See the NOTICE file
+* distributed with this work for additional information
+* regarding copyright ownership.  The ASF licenses this file
+* to you under the Apache License, Version 2.0 (the
+* "License"); you may not use this file except in compliance
+* with the License.  You may obtain a copy of the License at
+*
+*   http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing,
+* software distributed under the License is distributed on an
+* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+* KIND, either express or implied.  See the License for the
+* specific language governing permissions and limitations
+* under the License.
+*
+*************************************************************/
+
+#include "singa/model/feed_forward_net.h"
+#include "singa/model/initializer.h"
+#include "singa/utils/logging.h"
+#include "singa/utils/channel.h"
+namespace singa {
+
+FeedForwardNet::~FeedForwardNet() {
+}
+
+std::shared_ptr<Layer> FeedForwardNet::Add(std::shared_ptr<Layer> layer) {
+  layers_.push_back(layer);
+  return layer;
+}
+
+std::shared_ptr<Layer> FeedForwardNet::Add(const LayerConf& conf,
+    const Shape* sample_shape) {
+  std::shared_ptr<Layer> layer(CreateLayer(conf.type()));
+  CHECK(conf.has_name()) << "Must set layer name";
+  if (sample_shape == nullptr)
+    layer->Setup(layers_.back()->GetOutputSampleShape(), conf);
+  else
+    layer->Setup(*sample_shape, conf);
+  Add(layer);
+  LOG(INFO) << layer->name() << VecToStr(layer->GetOutputSampleShape());
+  return layer;
+}
+
+const vector<string> FeedForwardNet::GetParamNames() const {
+  vector<string> names;
+  for (auto layer : layers_)
+    for (const auto name : layer->param_names()) names.push_back(name);
+  return names;
+}
+const vector<Tensor> FeedForwardNet::GetParamValues() const {
+  vector<Tensor> values;
+  for (auto layer : layers_)
+    for (const auto value : layer->param_values()) values.push_back(value);
+  return values;
+}
+
+const vector<ParamSpec> FeedForwardNet::GetParamSpecs() const {
+  vector<ParamSpec> specs;
+  for (auto layer : layers_)
+    for (const auto spec : layer->param_specs()) specs.push_back(spec);
+  return specs;
+}
+
+void FeedForwardNet::Compile(bool shuffle, Optimizer* opt, Loss* loss,
+                             Metric* metric) {
+  std::shared_ptr<Updater> updater = std::make_shared<Updater>(opt);
+  Compile(shuffle, true, updater, loss, metric);
+}
+
+void FeedForwardNet::Compile(bool shuffle, bool to_register,
+                             std::shared_ptr<Updater> updater, Loss* loss,
+                             Metric* metric) {
+  shuffle_ = shuffle;
+  bool train = (updater != nullptr) && (loss != nullptr);
+  bool test = metric != nullptr;
+  CHECK(train || test) << "Must set updater and loss, or set metric";
+  updater_ = updater;
+  loss_ = loss;
+  metric_ = metric;
+  const auto specs = GetParamSpecs();
+  auto params = GetParamValues();
+  CHECK_EQ(specs.size(), params.size());
+  for (size_t k = 0; k < specs.size(); k++) {
+    if (to_register) {
+      updater_->Register(specs[k].name(), specs[k]);
+    }
+    auto init = CreateInitializer(specs[k].filler());
+    init->Fill(params[k]);
+    LOG(INFO) << specs[k].name() << " : " << params[k].L1();
+  }
+}
+
+void FeedForwardNet::ToDevice(std::shared_ptr<Device> device) {
+  for (auto layer : layers_) layer->ToDevice(device);
+  /*
+  opt_->ToDevice(device);
+  loss_->ToDevice(device);
+  metric_->ToDevice(device);
+  */
+}
+
+FeedForwardNet FeedForwardNet::Clone(std::shared_ptr<Device> device) {
+  FeedForwardNet net;
+  /*
+  for (auto layer: layers_)
+    net.layers_.push_back(layer->CloneTo(device));
+  if (opt_ != nullptr)
+    net.opt_ = opt_->CloneTo(device);
+  if (loss_ != nullptr)
+    net.loss_ = loss_.CloneTo(device);
+  if (metric_ != nullptr)
+    net.metric_ = metric_->CloneTo(device);
+  net.shuffle_ = shuffle_;
+  net.device_ = device;
+  net.dtype_ = dtype;
+  */
+  return net;
+}
+
+void FeedForwardNet::AsType(DataType dtype) {
+  LOG(FATAL) << "FeedForwardNet::AsType not implemented";
+}
+
+void FeedForwardNet::Train(size_t batchsize, int nb_epoch, const Tensor& x,
+                           const Tensor& y, float val_split) {
+  CHECK_EQ(x.shape(0), y.shape(0)) << "Diff num of samples in x and y";
+  size_t num_train = x.shape(0) * val_split;
+  if (val_split == 0.0f) {
+    Tensor dummy;
+    Train(batchsize, nb_epoch, x, y, dummy, dummy);
+  } else {
+    const Tensor train_x = CopyRows(x, 0, num_train);
+    const Tensor train_y = CopyRows(y, 0, num_train);
+    const Tensor test_x = CopyRows(x, num_train, x.shape(0));
+    const Tensor test_y = CopyRows(y, num_train, y.shape(0));
+    Train(batchsize, nb_epoch, train_x, train_y, test_x, test_y);
+  }
+}
+
+void FeedForwardNet::Train(size_t batchsize, int nb_epoch, const Tensor& x,
+                           const Tensor& y, const Tensor& val_x,
+                           const Tensor& val_y) {
+  CHECK_EQ(x.shape(0), y.shape(0)) << "Diff num of samples in x and y";
+  int num_extra_samples = x.shape(0) % batchsize;
+  if (num_extra_samples != 0)
+    LOG(WARNING) << "Please set batchsize to make num_total_samples "
+                 << "% batchsize == 0. Otherwise, the last "
+                 << num_extra_samples << " samples would not be used";
+  Channel* train_ch = GetChannel("train_perf");
+  train_ch->EnableDestStderr(true);
+  Channel* val_ch = GetChannel("val_perf");
+  val_ch->EnableDestStderr(true);
+  std::vector<size_t> index;
+  for (size_t i = 0; i < x.shape(0) / batchsize; i++) index.push_back(i);
+  for (int epoch = 0; epoch < nb_epoch; epoch++) {
+    if (shuffle_) std::random_shuffle(index.begin(), index.end());
+    float loss = 0.0f, metric = 0.0f;
+    size_t b = 0;
+    for (; b < x.shape(0) / batchsize; b++) {
+      size_t idx = index[b];
+      const Tensor bx = CopyRows(x, idx * batchsize, (idx + 1) * batchsize);
+      const Tensor by = CopyRows(y, idx * batchsize, (idx + 1) * batchsize);
+      const auto ret = TrainOnBatch(epoch, bx, by);
+      loss += ret.first;
+      metric += ret.second;
+    }
+    if (val_x.Size() == 0) continue;
+    loss /= b;
+    metric /= b;
+    train_ch->Send(
+        "Epoch " + std::to_string(epoch) + ", training loss = " +
+        std::to_string(loss) + ", accuracy = " + std::to_string(metric) +
+        ", lr = " +
+        std::to_string(updater_->GetOptimizer()->GetLearningRate(epoch)));
+    if (val_x.Size() && val_y.Size()) {
+      const auto val_perf = Evaluate(val_x, val_y, batchsize);
+      val_ch->Send("Epoch " + std::to_string(epoch) + ", val loss = " +
+                   std::to_string(Sum(val_perf.first) / val_y.Size()) +
+                   ", metric = " +
+                   std::to_string(Sum(val_perf.second) / val_y.Size()));
+    }
+  }
+}
+
+const std::pair<float, float> FeedForwardNet::TrainOnBatch(int epoch,
+                                                           const Tensor& x,
+                                                           const Tensor& y) {
+  int flag = kTrain;
+  const Tensor fea = Forward(flag, x);
+  float loss = loss_->Evaluate(flag, fea, y);
+  float metric = metric_->Evaluate(fea, y);
+  const Tensor grad = loss_->Backward();
+  auto grads = Backward(kTrain, grad / static_cast<float>(x.shape(0)));
+  auto names = GetParamNames();
+  auto values = GetParamValues();
+  for (size_t k = 0; k < grads.size(); k++) {
+    updater_->Apply(epoch, names[k], grads[k], values.at(k));
+  }
+  return std::make_pair(loss, metric);
+}
+
+const Tensor FeedForwardNet::Forward(int flag, const Tensor& data) {
+  Tensor input = data, output;
+  // LOG(INFO) << data.L1();
+  for (auto layer : layers_) {
+    output = layer->Forward(flag, input);
+    // LOG(INFO) << layer->name() << ": " << output.L2();
+    input = output;
+  }
+  return output;
+}
+
+const vector<Tensor> FeedForwardNet::Backward(int flag, const Tensor& grad) {
+  vector<Tensor> param_grads;
+  std::stack<Tensor> buf;
+  Tensor tmp = grad;
+  for (int i = layers_.size() - 1; i >= 0; i--) {
+    // LOG(INFO) << layers_.at(i)->name() << " : " << tmp.L1();
+    auto ret = layers_.at(i)->Backward(flag, tmp);
+    tmp = ret.first;
+    if (ret.second.size()) {
+      for (int k = ret.second.size() - 1; k >= 0; k--) {
+        buf.push(ret.second[k]);
+        // LOG(INFO) <<  "      " << buf.top().L1();
+      }
+    }
+  }
+  while (!buf.empty()) {
+    param_grads.push_back(buf.top());
+    buf.pop();
+  }
+  return param_grads;
+}
+
+std::pair<Tensor, Tensor> FeedForwardNet::Evaluate(const Tensor& x,
+                                                   const Tensor& y,
+                                                   size_t batchsize) {
+  CHECK_EQ(x.shape(0), y.shape(0)) << "Diff num of samples in x and y";
+  CHECK_GE(x.shape(0), batchsize);
+  int num_extra_samples = x.shape(0) % batchsize;
+  Tensor loss(Shape{x.shape(0)}), metric(Shape{x.shape(0)});
+  for (size_t b = 0; b < x.shape(0) / batchsize; b++) {
+    int start = b * batchsize, end = start + batchsize;
+    const Tensor bx = CopyRows(x, start, end);
+    const Tensor by = CopyRows(y, start, end);
+    const auto ret = EvaluateOnBatch(bx, by);
+    CopyDataToFrom(&loss, ret.first, batchsize, start, 0);
+    CopyDataToFrom(&metric, ret.second, batchsize, start, 0);
+  }
+  {
+    int start = x.shape(0) - batchsize, end = x.shape(0);
+    const Tensor bx = CopyRows(x, start, end);
+    const Tensor by = CopyRows(y, start, end);
+    const auto ret = EvaluateOnBatch(bx, by);
+    int dst_offset = x.shape(0) - num_extra_samples;
+    int src_offset = batchsize - num_extra_samples;
+    CopyDataToFrom(&loss, ret.first, num_extra_samples, dst_offset, src_offset);
+    CopyDataToFrom(&metric, ret.second, num_extra_samples, dst_offset,
+                   src_offset);
+  }
+  return std::make_pair(loss, metric);
+}
+
+std::pair<Tensor, Tensor> FeedForwardNet::EvaluateOnBatch(const Tensor& x,
+                                                          const Tensor& y) {
+  int flag = kEval;
+  const Tensor fea = Forward(flag, x);
+  const Tensor l = loss_->Forward(flag, fea, y);
+  const Tensor m = metric_->Forward(fea, y);
+  return std::make_pair(l, m);
+}
+
+const Tensor FeedForwardNet::Predict(const Tensor& x, size_t batchsize) {
+  CHECK_GE(x.shape(0), batchsize);
+  int num_extra_samples = x.shape(0) % batchsize;
+  const auto outshape = layers_.back()->GetOutputSampleShape();
+  Tensor y(Shape{x.shape(0), Product(outshape)}, x.device());
+  for (size_t b = 0; b < x.shape(0) / batchsize; b++) {
+    int start = b * batchsize, end = start + batchsize;
+    const Tensor bx = CopyRows(x, start, end);
+    CopyDataToFrom(&y, PredictOnBatch(bx), batchsize * y.shape(1),
+                   start * y.shape(1), 0);
+  }
+  if (num_extra_samples > 0) {
+    int start = x.shape(0) - batchsize, end = x.shape(0);
+    const Tensor bx = CopyRows(x, start, end);
+    CopyDataToFrom(&y, PredictOnBatch(bx), num_extra_samples * y.shape(1),
+                   (x.shape(0) - num_extra_samples) * y.shape(1),
+                   (batchsize - num_extra_samples) * y.shape(1));
+  }
+  return y;
+}
+
+const Tensor FeedForwardNet::PredictOnBatch(const Tensor& x) {
+  return Forward(kEval, x);
+}
+}  // namespace singa
diff --git a/src/model/layer/activation.cc b/src/model/layer/activation.cc
new file mode 100644
index 0000000..eb90d87
--- /dev/null
+++ b/src/model/layer/activation.cc
@@ -0,0 +1,87 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "singa/model/layer.h"
+#include "./activation.h"
+#include "singa/utils/string.h"
+namespace singa {
+
+RegisterLayerClass(singa_relu, Activation);
+RegisterLayerClass(singa_sigmoid, Activation);
+RegisterLayerClass(singa_tanh, Activation);
+
+RegisterLayerClass(singacpp_relu, Activation);
+RegisterLayerClass(singacuda_relu, Activation);
+RegisterLayerClass(singacl_relu, Activation);
+RegisterLayerClass(singacpp_sigmoid, Activation);
+RegisterLayerClass(singacuda_sigmoid, Activation);
+RegisterLayerClass(singacl_sigmoid, Activation);
+RegisterLayerClass(singacpp_tanh, Activation);
+RegisterLayerClass(singacuda_tanh, Activation);
+RegisterLayerClass(singacl_tanh, Activation);
+
+void Activation::Setup(const Shape& in_sample, const LayerConf& conf) {
+  Layer::Setup(in_sample, conf);
+  auto pos = conf.type().find_first_of('_');
+  CHECK_NE(pos, string::npos) << "There should be a '_' in the laye type "
+    << conf.type();
+  mode_ = ToLowerCase(conf.type().substr(pos + 1));
+  if (mode_ != "relu" && mode_ != "sigmoid" && mode_ != "tanh")
+    LOG(FATAL) << "Unkown activation type: " << conf.type() << " " << mode_
+      << ". Please use singa_relu, singa_sigmoid, or singa_tanh";
+  if (mode_ == "relu") {
+    neg_slope_ = conf.relu_conf().negative_slope();
+  }
+  out_sample_shape_ = in_sample;
+}
+
+const Tensor Activation::Forward(int flag, const Tensor& input) {
+  Tensor output;
+  if (mode_ == "sigmoid") {
+    output = Sigmoid(input);
+    if (flag & kTrain) buf_.push(output);
+  } else if (mode_ == "tanh") {
+    output = Tanh(input);
+    if (flag & kTrain) buf_.push(output);
+  } else if (mode_ == "relu") {
+    output = ReLU(input);
+    if (flag & kTrain) buf_.push(input);
+  } else
+    LOG(FATAL) << "Unkown activation: " << mode_;
+  return output;
+}
+
+const std::pair<Tensor, vector<Tensor>> Activation::Backward(
+    int flag, const Tensor& grad) {
+  vector<Tensor> param_grad;
+  CHECK(!buf_.empty());
+  // inout means either input or output, but only one is valid for an
+  // activation.
+  Tensor input_grad, inout = buf_.top();
+  buf_.pop();
+  if (mode_ == "sigmoid")
+    input_grad = grad * inout * (inout * (-1.f) + 1.f);
+  else if (mode_ == "tanh")
+    input_grad = grad * (inout * inout * (-1.f) + 1.f);
+  else if (mode_ == "relu")
+    input_grad = grad * (inout > 0.f) + (inout <= 0.f) * neg_slope_;
+  else LOG(FATAL) << "Unkown activation: " << mode_;
+  return std::make_pair(input_grad, param_grad);
+}
+
+}  // namespace singa
diff --git a/src/model/layer/activation.h b/src/model/layer/activation.h
new file mode 100644
index 0000000..7d15979
--- /dev/null
+++ b/src/model/layer/activation.h
@@ -0,0 +1,57 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef SINGA_MODEL_LAYER_ACTIVATION_H_
+#define SINGA_MODEL_LAYER_ACTIVATION_H_
+#include <utility>
+#include <string>
+#include <vector>
+#include "singa/model/layer.h"
+
+namespace singa {
+class Activation : public Layer {
+ public:
+  /// \copydoc Layer::layer_type()
+  // const std::string layer_type() const override { return "Activation"; }
+
+  /// \copydoc Layer::Setup(const LayerConf&);
+  void Setup(const Shape& in_sample, const LayerConf& conf) override;
+  const Shape GetOutputSampleShape() const override {
+    CHECK(out_sample_shape_.size()) << "You may haven't call Setup()";
+    return out_sample_shape_;
+  }
+
+  /// \copydoc Layer::Forward(int flag, const Tensor&)
+  const Tensor Forward(int flag, const Tensor& input) override;
+
+  /// \copydoc Layer::Backward(int, const Tensor&, const Tensor&);
+  const std::pair<Tensor, vector<Tensor>> Backward(int flag,
+                                                   const Tensor& grad) override;
+
+  const std::string Mode() const { return mode_; }
+
+  const float Negative_slope() const { return neg_slope_; }
+
+ protected:
+  std::string mode_;
+  std::stack<Tensor> buf_;
+  Shape out_sample_shape_;
+  float neg_slope_;
+};
+
+}  // namespace singa
+#endif  // SINGA_MODEL_LAYER_ACTIVATION_H_
diff --git a/src/model/layer/batchnorm.cc b/src/model/layer/batchnorm.cc
new file mode 100644
index 0000000..b345c6b
--- /dev/null
+++ b/src/model/layer/batchnorm.cc
@@ -0,0 +1,200 @@
+/*********************************************************
+*
+* Licensed to the Apache Software Foundation (ASF) under one
+* or more contributor license agreements.  See the NOTICE file
+* distributed with this work for additional information
+* regarding copyright ownership.  The ASF licenses this file
+* to you under the Apache License, Version 2.0 (the
+* "License"); you may not use this file except in compliance
+* with the License.  You may obtain a copy of the License at
+*
+*   http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing,
+* software distributed under the License is distributed on an
+* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+* KIND, either express or implied.  See the License for the
+* specific language governing permissions and limitations
+* under the License.
+*
+************************************************************/
+#include "batchnorm.h"
+
+namespace singa {
+RegisterLayerClass(singa_batchnorm, BatchNorm);
+RegisterLayerClass(singacpp_batchnorm, BatchNorm);
+RegisterLayerClass(singacuda_batchnorm, BatchNorm);
+RegisterLayerClass(singacl_batchnorm, BatchNorm);
+void BatchNorm::Setup(const Shape& in_sample, const LayerConf& conf) {
+  Layer::Setup(in_sample, conf);
+  out_sample_shape_ = in_sample;
+  factor_ = conf.batchnorm_conf().factor();
+  channels_ = in_sample.at(0);
+  if (in_sample.size() == 3u)
+    height_ = in_sample.at(1);
+  else
+    height_ = 1;
+  if (in_sample.size() == 3u)
+    width_ = in_sample.at(2);
+  else
+    width_ = 1;
+  if (in_sample.size() == 1u)
+    is_2d_ = true;
+  else
+    is_2d_ = false;
+
+  bnScale_.Reshape(Shape{channels_ * height_ * width_});
+  bnBias_.ResetLike(bnScale_);
+  runningMean_.ResetLike(bnScale_);
+  runningVariance_.ResetLike(bnScale_);
+
+  dbnScale_.ResetLike(bnScale_);
+  dbnBias_.ResetLike(bnBias_);
+  // Push back params into param_values_
+  // Assume the order of param is: bnScale, bnBias, runningMean, runningVariance
+  for (const auto& spec : conf.param()) param_specs_.push_back(spec);
+}
+
+void BatchNorm::ToDevice(std::shared_ptr<Device> device) {
+  bnScale_.ToDevice(device);
+  bnBias_.ToDevice(device);
+  dbnScale_.ToDevice(device);
+  dbnBias_.ToDevice(device);
+  runningMean_.ToDevice(device);
+  runningVariance_.ToDevice(device);
+}
+
+const Tensor BatchNorm::Forward(int flag, const Tensor& input) {
+  Tensor x = input.Clone();
+  x.Reshape(Shape{input.shape(0), input.Size() / input.shape(0)});
+  Tensor output, mean, var, xnorm;
+  output.ResetLike(x);
+
+  if ((flag & kTrain) == kTrain) {
+    mean = Average(x, 0);
+    runningMean_ *= 1.0f - factor_;
+    Axpy(factor_, mean, &runningMean_);
+    xnorm = x.Clone();
+    SubRow(mean, &xnorm);
+    xnorm = Square(xnorm);
+    var = Average(xnorm, 0);
+    runningVariance_ *= 1.0f - factor_;
+    Axpy(factor_, var, &runningVariance_);
+    Tensor tmp = var.Clone();
+    tmp = Sqrt(tmp);
+    tmp += 1e-6f;
+    xnorm = x.Clone();
+    SubRow(mean, &xnorm);
+    DivRow(tmp, &xnorm);
+    output = xnorm.Clone();
+    MultRow(bnScale_, &output);
+    AddRow(bnBias_, &output);
+    buf_.push(x);
+    buf_.push(mean);
+    buf_.push(var);
+    buf_.push(xnorm);
+  } else {
+    xnorm = x.Clone();
+    SubRow(runningMean_, &xnorm);
+    Tensor tmp = runningVariance_.Clone();
+    tmp = Sqrt(tmp);
+    tmp += 1e-6f;
+    DivRow(tmp, &xnorm);
+    output = xnorm.Clone();
+    MultRow(bnScale_, &output);
+    AddRow(bnBias_, &output);
+  }
+
+  if (!is_2d_)
+    output.Reshape(Shape{output.shape(0), channels_, height_, width_});
+  return output;
+}
+
+const std::pair<Tensor, vector<Tensor>> BatchNorm::Backward(
+    int flag, const Tensor& grad) {
+  Tensor dy = grad.Clone();
+  dy.Reshape(Shape{grad.shape(0), grad.Size() / grad.shape(0)});
+  Tensor xnorm = buf_.top();
+  buf_.pop();
+  Tensor var = buf_.top();
+  buf_.pop();
+  Tensor mean = buf_.top();
+  buf_.pop();
+  Tensor input = buf_.top();
+  buf_.pop();
+
+  Tensor dx;
+  vector<Tensor> param_grad;
+
+  if ((flag & kTrain) == kTrain) {
+    // gxnrom
+    Tensor gxnorm = dy.Clone();
+    MultRow(bnScale_, &gxnorm);
+    // gvar
+    Tensor tmp = var.Clone();
+    tmp += 1e-6f;
+    tmp = Pow(var, -1.5f);
+    tmp *= -0.5f;
+
+    Tensor tmpx = input.Clone();
+    SubRow(mean, &tmpx);
+
+    tmpx = tmpx * gxnorm;
+    MultRow(tmp, &tmpx);
+    Tensor gvar;
+    gvar.ResetLike(var);
+    SumRows(tmpx, &gvar);
+    // gmean
+    tmp = var.Clone();
+    tmp += 1e-6f;
+    tmp = Pow(tmp, -0.5f);
+    tmp *= -1.0f;
+    Tensor tmpx_r;
+    tmpx_r.ResetLike(tmp);
+    SumRows(gxnorm, &tmpx_r);
+    Tensor gmean = tmpx_r * tmp;
+
+    tmpx = input.Clone();
+    SubRow(mean, &tmpx);
+    SumRows(tmpx, &tmp);
+    tmp *= -2.0f / input.shape(0);
+    tmp = tmp * gvar;
+    gmean = gmean + tmp;
+    // dx
+    tmp = var.Clone();
+    tmp += 1e-6f;
+    tmp = Pow(tmp, -0.5f);
+    dx = gxnorm.Clone();
+    MultRow(tmp, &dx);
+
+    tmpx = input.Clone();
+    SubRow(mean, &tmpx);
+    tmpx *= 2.0f / input.shape(0);
+    MultRow(gvar, &tmpx);
+    dx = dx + tmpx;
+
+    tmp = gmean.Clone();
+    tmp *= 1.0f / input.shape(0);
+
+    AddRow(tmp, &dx);
+    // dbnScale
+    tmpx = dy * xnorm;
+    SumRows(tmpx, &dbnScale_);
+    // dbnBias
+    SumRows(dy, &dbnBias_);
+    param_grad.push_back(dbnScale_);
+    param_grad.push_back(dbnBias_);
+    Tensor dummy;
+    dummy.ResetLike(runningMean_);
+    dummy.SetValue(.0f);
+    param_grad.push_back(dummy);
+    param_grad.push_back(dummy);
+  } else {
+    LOG(ERROR) << "Do not call backward for evaluation phase";
+  }
+  if (!is_2d_)
+    dx.Reshape(Shape{dx.shape(0), channels_, height_, width_});
+  return std::make_pair(dx, param_grad);
+}
+
+}  // namespace
diff --git a/src/model/layer/batchnorm.h b/src/model/layer/batchnorm.h
new file mode 100644
index 0000000..c2cfde9
--- /dev/null
+++ b/src/model/layer/batchnorm.h
@@ -0,0 +1,90 @@
+/*********************************************************
+*
+* Licensed to the Apache Software Foundation (ASF) under one
+* or more contributor license agreements.  See the NOTICE file
+* distributed with this work for additional information
+* regarding copyright ownership.  The ASF licenses this file
+* to you under the Apache License, Version 2.0 (the
+* "License"); you may not use this file except in compliance
+* with the License.  You may obtain a copy of the License at
+*
+*   http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing,
+* software distributed under the License is distributed on an
+* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+* KIND, either express or implied.  See the License for the
+* specific language governing permissions and limitations
+* under the License.
+*
+************************************************************/
+#ifndef SINGA_MODEL_LAYER_BATCHNORM_H
+#define SINGA_MODEL_LAYER_BATCHNORM_H
+#include "singa/model/layer.h"
+#include "singa/core/common.h"
+#include "singa/proto/core.pb.h"
+#include <stack>
+
+namespace singa {
+class BatchNorm : public Layer {
+ public:
+  /// \copydoc Layer::layer_type()
+  // const std::string layer_type() const override { return "BatchNorm"; }
+
+  /// \copydoc Layer::Setup(const LayerConf&);
+  void Setup(const Shape& in_sample, const LayerConf& conf) override;
+  const Shape GetOutputSampleShape() const override {
+    CHECK(out_sample_shape_.size()) << "You may haven't call Setup()";
+    return out_sample_shape_;
+  }
+
+  const Tensor Forward(int flag, const Tensor& input)
+    override;
+
+  /// \copydoc Layer::Backward(int, const Tensor&, const Tensor&);
+  const std::pair<Tensor, vector<Tensor>> Backward(
+      int flag, const Tensor& grad) override;
+  virtual const std::vector<Tensor> param_values() override {
+    return std::vector<Tensor> { bnScale_, bnBias_, runningMean_,
+                                 runningVariance_ };
+  }
+  const float factor() const { return factor_; }
+  const Tensor& bnScale() const { return bnScale_; }
+  const Tensor& bnBias() const { return bnBias_; }
+  const Tensor& runningMean() const { return runningMean_; }
+  const Tensor& runningVariance() const { return runningVariance_; }
+  const size_t channels() const { return channels_; }
+  const size_t height() const { return height_; }
+  const size_t width() const { return width_; }
+  void set_bnScale(Tensor x) {
+    bnScale_.ResetLike(x);
+    bnScale_.CopyData(x);
+  }
+  void set_bnBias(Tensor x) {
+    bnBias_.ResetLike(x);
+    bnBias_.CopyData(x);
+  }
+  void set_runningMean(Tensor x) {
+    runningMean_.ResetLike(x);
+    runningMean_.CopyData(x);
+  }
+  void set_runningVariance(Tensor x) {
+    runningVariance_.ResetLike(x);
+    runningVariance_.CopyData(x);
+  }
+  virtual void ToDevice(std::shared_ptr<Device> device) override;
+
+ protected:
+  float factor_;
+  size_t channels_, height_, width_;
+  bool is_2d_ = false;
+  Tensor bnScale_, bnBias_;
+  Tensor dbnScale_, dbnBias_;
+  Tensor runningMean_, runningVariance_;
+  // Store intermediate data, i.e., input tensor
+  std::stack<Tensor> buf_;
+  Shape out_sample_shape_;
+}; // class batchnorm
+} // namespace
+
+#endif  // SINGA_MODEL_LAYER_BATCHNORM_H
diff --git a/src/model/layer/convolution.cc b/src/model/layer/convolution.cc
new file mode 100644
index 0000000..52e9d93
--- /dev/null
+++ b/src/model/layer/convolution.cc
@@ -0,0 +1,232 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "./convolution.h"
+#include <vector>
+#include "singa/model/layer.h"
+
+namespace singa {
+using std::vector;
+
+RegisterLayerClass(singacpp_convolution, Convolution);
+void Convolution::Setup(const Shape &in_sample, const LayerConf &conf) {
+  Layer::Setup(in_sample, conf);
+  ConvolutionConf conv_conf = conf.convolution_conf();
+  // kernel_size, pad, and stride are repeated fields.
+  if (conv_conf.kernel_size_size() > 0) {
+    if (conv_conf.kernel_size_size() == 1) {
+      kernel_w_ = kernel_h_ = conv_conf.kernel_size(0);
+    } else {
+      kernel_w_ = conv_conf.kernel_size(0);
+      kernel_h_ = conv_conf.kernel_size(1);
+    }
+  } else {
+    kernel_w_ = conv_conf.kernel_w();
+    kernel_h_ = conv_conf.kernel_h();
+  }
+  CHECK_GT(kernel_w_, 0u);
+  CHECK_GT(kernel_h_, 0u);
+
+  if (conv_conf.pad_size() > 0) {
+    if (conv_conf.pad_size() == 1) {
+      pad_w_ = pad_h_ = conv_conf.pad(0);
+    } else {
+      pad_w_ = conv_conf.pad(0);
+      pad_h_ = conv_conf.pad(1);
+    }
+  } else {
+    pad_w_ = conv_conf.pad_w();
+    pad_h_ = conv_conf.pad_h();
+  }
+  CHECK_GE(pad_w_, 0u);
+  CHECK_GE(pad_h_, 0u);
+
+  if (conv_conf.stride_size() > 0) {
+    if (conv_conf.stride_size() == 1) {
+      stride_w_ = stride_h_ = conv_conf.stride(0);
+    } else {
+      stride_w_ = conv_conf.stride(0);
+      stride_h_ = conv_conf.stride(1);
+    }
+  } else {
+    stride_w_ = conv_conf.stride_w();
+    stride_h_ = conv_conf.stride_h();
+  }
+  CHECK_GT(stride_w_, 0u);
+  CHECK_GE(stride_h_, 0u);  // 0 for 1D conv
+
+  num_filters_ = conv_conf.num_output();
+  bias_term_ = conv_conf.bias_term();
+
+  // Shape of input image
+  CHECK_EQ(in_sample.size(), 3u);
+  channels_ = in_sample.at(0);
+  height_ = in_sample.at(1);
+  width_ = in_sample.at(2);
+
+  conv_height_ = 1;
+  if (stride_h_ > 0)
+    conv_height_ = (height_ + 2 * pad_h_ - kernel_h_) / stride_h_ + 1;
+  conv_width_ = (width_ + 2 * pad_w_ - kernel_w_) / stride_w_ + 1;
+  out_sample_shape_ = vector<size_t>{num_filters_, conv_height_, conv_width_};
+
+  col_height_ = channels_ * kernel_w_ * kernel_h_;
+  col_width_ = conv_height_ * conv_width_;
+
+  // Setup shape of weight_ and bias_
+  weight_.Reshape(Shape{num_filters_, col_height_});
+  bias_.Reshape(Shape{num_filters_});
+  // Assume the order of param is: weight, bias
+  for (const auto &spec : conf.param()) param_specs_.push_back(spec);
+}
+
+/// \copydoc Layer::Forward(int flag, const Tensor&)
+const Tensor Convolution::Forward(int flag, const Tensor &input) {
+  CHECK(buf_.empty());
+  CHECK_EQ(input.device()->lang(), kCpp);
+  CHECK_EQ(input.nDim(), 4u);
+  if (flag & kTrain) buf_.push(input);
+  size_t batchsize = input.shape(0);
+  size_t imagesize = input.Size() / batchsize;
+  DataType dtype = input.data_type();
+  auto dev = input.device();
+  Shape shape{batchsize, num_filters_, conv_height_, conv_width_};
+  Tensor output(shape, dev, dtype);
+  Tensor col_data(Shape{col_height_, col_width_});
+  float *data_col = new float[col_height_ * col_width_];
+  auto in_data = input.data<float>();
+  for (size_t b = 0; b < batchsize; b++) {
+    Im2col(in_data + b * imagesize, channels_, height_, width_, kernel_h_,
+           kernel_w_, pad_h_, pad_w_, stride_h_, stride_w_, data_col);
+    col_data.CopyDataFromHostPtr(data_col, col_height_ * col_width_);
+    Tensor each = Mult(weight_, col_data);
+    if (bias_term_) {
+      AddColumn(bias_, &each);
+    }
+    CopyDataToFrom(&output, each, each.Size(), b * each.Size());
+  }
+  delete[] data_col;
+  return output;
+}
+
+/// \copydoc Layer::Backward(int, const Tensor&, const Tensor&);
+const std::pair<Tensor, vector<Tensor>> Convolution::Backward(
+    int flag, const Tensor &grad) {
+  CHECK_EQ(grad.device()->lang(), kCpp);
+  CHECK_EQ(grad.nDim(), 4u);
+  CHECK(!buf_.empty());
+  Tensor src_data = buf_.top();
+  buf_.pop();
+  vector<Tensor> param_grad;
+  Tensor dx;
+  Tensor db, dw;
+  dx.ResetLike(src_data);
+  db.ResetLike(bias_);
+  dw.ResetLike(weight_);
+  dw.SetValue(0.0f);
+  size_t batchsize = grad.shape(0);
+  size_t imagesize = src_data.Size() / batchsize;
+  if (bias_term_) {
+    Tensor tmp1 =
+        Reshape(grad, Shape{batchsize * num_filters_,
+                            grad.Size() / (batchsize * num_filters_)});
+    Tensor tmp2(Shape{batchsize * num_filters_});
+    SumColumns(tmp1, &tmp2);
+    Tensor tmp3 = Reshape(tmp2, Shape{batchsize, num_filters_});
+    SumRows(tmp3, &db);
+  }
+  auto in_data = src_data.data<float>();
+  Tensor col_data(Shape{col_height_, col_width_});
+  float *data_col = new float[col_height_ * col_width_];
+  float *dx_b = new float[imagesize];
+  for (size_t b = 0; b < batchsize; b++) {
+    Im2col(in_data + b * imagesize, channels_, height_, width_, kernel_h_,
+           kernel_w_, pad_h_, pad_w_, stride_h_, stride_w_, data_col);
+    col_data.CopyDataFromHostPtr(data_col, col_height_ * col_width_);
+    Tensor grad_b(Shape{num_filters_, conv_height_ * conv_width_});
+    CopyDataToFrom(&grad_b, grad, grad_b.Size(), 0, b * grad_b.Size());
+    dw += Mult(grad_b, col_data.T());
+    Tensor dcol_b = Mult(weight_.T(), grad_b);
+    auto dcol_data = dcol_b.data<float>();
+    Col2im(dcol_data, channels_, height_, width_, kernel_h_, kernel_w_, pad_h_,
+           pad_w_, stride_h_, stride_w_, dx_b);
+    dx.CopyDataFromHostPtr(dx_b, imagesize, b * imagesize);
+  }
+  param_grad.push_back(dw);
+  param_grad.push_back(db);
+  delete[] data_col;
+  delete[] dx_b;
+  return std::make_pair(dx, param_grad);
+}
+void Convolution::ToDevice(std::shared_ptr<Device> device) {
+  Layer::ToDevice(device);
+  weight_.ToDevice(device);
+  bias_.ToDevice(device);
+}
+
+void Convolution::Im2col(const float *data_im, const int channels,
+                         const int height, const int width, const int kernel_h,
+                         const int kernel_w, const int pad_h, const int pad_w,
+                         const int stride_h, const int stride_w,
+                         float *data_col) {
+  int height_col = (height + 2 * pad_h - kernel_h) / stride_h + 1;
+  int width_col = (width + 2 * pad_w - kernel_w) / stride_w + 1;
+  int channels_col = channels * kernel_h * kernel_w;
+  for (int c = 0; c < channels_col; ++c) {
+    int w_offset = c % kernel_w;
+    int h_offset = (c / kernel_w) % kernel_h;
+    int c_im = c / kernel_h / kernel_w;
+    for (int h = 0; h < height_col; ++h) {
+      for (int w = 0; w < width_col; ++w) {
+        int h_pad = h * stride_h - pad_h + h_offset;
+        int w_pad = w * stride_w - pad_w + w_offset;
+        if (h_pad >= 0 && h_pad < height && w_pad >= 0 && w_pad < width)
+          data_col[(c * height_col + h) * width_col + w] =
+              data_im[(c_im * height + h_pad) * width + w_pad];
+        else
+          data_col[(c * height_col + h) * width_col + w] = 0;
+      }
+    }
+  }
+}
+
+void Convolution::Col2im(const float *data_col, const int channels,
+                         const int height, const int width, const int patch_h,
+                         const int patch_w, const int pad_h, const int pad_w,
+                         const int stride_h, const int stride_w,
+                         float *data_im) {
+  memset(data_im, 0, height * width * channels * sizeof(float));
+  int height_col = (height + 2 * pad_h - patch_h) / stride_h + 1;
+  int width_col = (width + 2 * pad_w - patch_w) / stride_w + 1;
+  int channels_col = channels * patch_h * patch_w;
+  for (int c = 0; c < channels_col; ++c) {
+    int w_offset = c % patch_w;
+    int h_offset = (c / patch_w) % patch_h;
+    int c_im = c / patch_h / patch_w;
+    for (int h = 0; h < height_col; ++h) {
+      for (int w = 0; w < width_col; ++w) {
+        int h_pad = h * stride_h - pad_h + h_offset;
+        int w_pad = w * stride_w - pad_w + w_offset;
+        if (h_pad >= 0 && h_pad < height && w_pad >= 0 && w_pad < width)
+          data_im[(c_im * height + h_pad) * width + w_pad] +=
+              data_col[(c * height_col + h) * width_col + w];
+      }
+    }
+  }
+}
+}  // namespace singa
diff --git a/src/model/layer/convolution.h b/src/model/layer/convolution.h
new file mode 100644
index 0000000..d85a17b
--- /dev/null
+++ b/src/model/layer/convolution.h
@@ -0,0 +1,98 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef SRC_MODEL_LAYER_CONVOLUTION_H_
+#define SRC_MODEL_LAYER_CONVOLUTION_H_
+#include <stack>
+#include <string>
+#include <utility>
+#include <vector>
+#include "singa/model/layer.h"
+
+namespace singa {
+class Convolution : public Layer {
+ public:
+  /// \copydoc Layer::layer_type()
+  // const std::string layer_type() const override { return "Convolution"; }
+
+  /// \copydoc Layer::Setup(const LayerConf&);
+  void Setup(const vector<size_t>& in_shape, const LayerConf& conf) override;
+  const Shape GetOutputSampleShape() const override {
+    CHECK(out_sample_shape_.size()) << "You may haven't call Setup()";
+    return out_sample_shape_;
+  }
+
+  // void SetupParam(const Tensor &input);
+  /// \copydoc Layer::Forward(int flag, const Tensor&)
+  const Tensor Forward(int flag, const Tensor& input) override;
+
+  /// \copydoc Layer::Backward(int, const Tensor&, const Tensor&);
+  const std::pair<Tensor, vector<Tensor>> Backward(int flag,
+                                                   const Tensor& grad) override;
+
+  void ToDevice(std::shared_ptr<Device> device) override;
+
+  void Im2col(const float* data_im, const int channels, const int height,
+              const int width, const int kernel_h, const int kernel_w,
+              const int pad_h, const int pad_w, const int stride_h,
+              const int stride_w, float* data_col);
+
+  void Col2im(const float* data_col, const int channels, const int height,
+              const int width, const int patch_h, const int patch_w,
+              const int pad_h, const int pad_w, const int stride_h,
+              const int stride_w, float* data_im);
+
+  const std::vector<Tensor> param_values() override {
+    return std::vector<Tensor>{weight_, bias_};
+  }
+
+  size_t kernel_w() const { return kernel_w_; }
+  size_t kernel_h() const { return kernel_h_; }
+  size_t pad_w() const { return pad_w_; }
+  size_t pad_h() const { return pad_h_; }
+  size_t stride_w() const { return stride_w_; }
+  size_t stride_h() const { return stride_h_; }
+  size_t num_filters() const { return num_filters_; }
+  size_t channels() const { return channels_; }
+  size_t height() const { return height_; }
+  size_t width() const { return width_; }
+  bool bias_term() const { return bias_term_; }
+  const Tensor& weight() const { return weight_; }
+  const Tensor& bias() const { return bias_; }
+
+  void set_weight(Tensor w) {
+    weight_.ResetLike(w);
+    weight_.CopyData(w);
+  }
+  void set_bias(Tensor b) {
+    bias_.ResetLike(b);
+    bias_.CopyData(b);
+  }
+
+ protected:
+  size_t kernel_w_, pad_w_, stride_w_;
+  size_t kernel_h_, pad_h_, stride_h_;
+  size_t channels_, height_, width_;
+  size_t col_height_, col_width_, conv_height_, conv_width_, num_filters_;
+  Tensor weight_, bias_;
+  // store intermediate data, i.e., input tensor
+  std::stack<Tensor> buf_;
+  bool bias_term_;
+  vector<size_t> out_sample_shape_;
+};
+}  // namespace singa
+#endif  // SRC_MODEL_LAYER_CONVOLUTION_H_
diff --git a/src/model/layer/cudnn_activation.cc b/src/model/layer/cudnn_activation.cc
new file mode 100644
index 0000000..4ecb375
--- /dev/null
+++ b/src/model/layer/cudnn_activation.cc
@@ -0,0 +1,121 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "singa/singa_config.h"
+#ifdef USE_CUDNN
+#include "./cudnn_activation.h"
+#include <cudnn.h>
+
+#include "./cudnn_utils.h"
+#include "singa/core/common.h"
+#include "singa/utils/logging.h"
+
+namespace singa {
+RegisterLayerClass(cudnn_relu, CudnnActivation);
+RegisterLayerClass(cudnn_sigmoid, CudnnActivation);
+RegisterLayerClass(cudnn_tanh, CudnnActivation);
+CudnnActivation::~CudnnActivation() {
+  if (acti_desc_ != nullptr)
+    CUDNN_CHECK(cudnnDestroyActivationDescriptor(acti_desc_));
+  if (desc_ != nullptr) CUDNN_CHECK(cudnnDestroyTensorDescriptor(desc_));
+}
+
+void CudnnActivation::InitCudnn(size_t size, DataType dtype) {
+  CHECK(!has_init_cudnn_);
+  CUDNN_CHECK(cudnnCreateTensorDescriptor(&desc_));
+  CUDNN_CHECK(cudnnCreateActivationDescriptor(&acti_desc_));
+
+  CUDNN_CHECK(cudnnSetTensor4dDescriptor(
+      desc_, CUDNN_TENSOR_NCHW, GetCudnnDataType(dtype), 1, 1, 1, size));
+
+  if (mode_ == "sigmoid")
+    cudnn_mode_ = CUDNN_ACTIVATION_SIGMOID;
+  else if (mode_ == "tanh")
+    cudnn_mode_ = CUDNN_ACTIVATION_TANH;
+  else if (mode_ == "relu")
+    cudnn_mode_ = CUDNN_ACTIVATION_RELU;
+  else
+    LOG(FATAL) << "Unkown activation: " << mode_;
+
+  CUDNN_CHECK(cudnnSetActivationDescriptor(
+        acti_desc_, cudnn_mode_, CUDNN_PROPAGATE_NAN, 0.0f));
+  has_init_cudnn_ = true;
+}
+
+const Tensor CudnnActivation::Forward(int flag, const Tensor& input) {
+  CHECK(buf_.empty());
+  auto size = input.Size();
+  DataType dtype = input.data_type();
+  if (!has_init_cudnn_) {
+    InitCudnn(size, dtype);
+  }
+  Tensor output;
+  output.ResetLike(input);
+  output.device()->Exec([input, output, this](Context* ctx) {
+    Block* inblock = input.block(), * outblock = output.block();
+    float alpha = 1.0f, beta = 0.0f;
+#if CUDNN_VERSION_MAJOR == 5
+    CUDNN_CHECK(cudnnActivationForward(
+        ctx->cudnn_handle, this->acti_desc_, &alpha, this->desc_,
+        inblock->data(), &beta, this->desc_, outblock->mutable_data()));
+#elif CUDNN_VERSION_MAJOR == 4
+    CUDNN_CHECK(cudnnActivationForward_v4(
+        ctx->cudnn_handle, this->acti_desc_, &alpha, this->desc_,
+        inblock->data(), &beta, this->desc_, outblock->mutable_data()));
+#endif
+  }, {input.block()}, {output.block()});
+  if (flag & kTrain) {
+    if (cudnn_mode_ == CUDNN_ACTIVATION_SIGMOID ||
+        cudnn_mode_ == CUDNN_ACTIVATION_TANH) {
+      buf_.push(output);
+    } else if (cudnn_mode_ == CUDNN_ACTIVATION_RELU) {
+      buf_.push(input);
+    }
+  }
+  return output;
+}
+
+const std::pair<Tensor, vector<Tensor>> CudnnActivation::Backward(
+    int flag, const Tensor& grad) {
+  vector<Tensor> param_grad;
+  Tensor dx;
+  CHECK(!buf_.empty());
+  // inout means either used as input or output, only one is valid for one type
+  // of activation
+  Tensor inout = buf_.top();
+  buf_.pop();
+  dx.ResetLike(grad);
+  dx.device()->Exec([dx, grad, inout, this](Context* ctx) {
+    Block* dyblock = grad.block(), * dxblock = dx.block(),
+           * yblock = inout.block(), * xblock = inout.block();
+    float alpha = 1.0f, beta = 0.0f;
+#if CUDNN_VERSION_MAJOR == 5
+    CUDNN_CHECK(cudnnActivationBackward(
+        ctx->cudnn_handle, this->acti_desc_, &alpha, this->desc_,
+        yblock->data(), this->desc_, dyblock->data(), this->desc_,
+        xblock->data(), &beta, this->desc_, dxblock->mutable_data()));
+#elif CUDNN_VERSION_MAJOR == 4
+    CUDNN_CHECK(cudnnActivationBackward_v4(
+        ctx->cudnn_handle, this->acti_desc_, &alpha, this->desc_, yblock->data(),
+        this->desc_, dyblock->data(), this->desc_, xblock->data(), &beta,
+        this->desc_, dxblock->mutable_data()));
+#endif
+  }, {grad.block(), inout.block()}, {dx.block()});
+  return std::make_pair(dx, param_grad);
+}
+}  // namespace singa
+#endif  // USE_CUDNN
diff --git a/src/model/layer/cudnn_activation.h b/src/model/layer/cudnn_activation.h
new file mode 100644
index 0000000..c69d157
--- /dev/null
+++ b/src/model/layer/cudnn_activation.h
@@ -0,0 +1,59 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef SINGA_MODEL_LAYER_CUDNN_ACTIVATION_H_
+#define SINGA_MODEL_LAYER_CUDNN_ACTIVATION_H_
+#include "singa/singa_config.h"
+#ifdef USE_CUDNN
+#include <cudnn.h>
+#include <utility>
+#include <string>
+#include <vector>
+
+#include "./activation.h"
+#include "singa/core/common.h"
+#include "singa/model/layer.h"
+#include "singa/proto/core.pb.h"
+
+namespace singa {
+class CudnnActivation : public Activation {
+ public:
+  ~CudnnActivation();
+  /// \copydoc Layer::layer_type()
+  // const std::string layer_type() const override { return "CudnnActivation"; }
+
+  const Tensor Forward(int flag, const Tensor& input) override;
+  const std::pair<Tensor, vector<Tensor>> Backward(int flag,
+                                                   const Tensor& grad) override;
+
+
+  const cudnnActivationMode_t CudnnMode() const { return cudnn_mode_; }
+
+ private:
+  /// Init cudnn related data structures.
+  void InitCudnn(size_t size, DataType dtype);
+
+ private:
+  bool has_init_cudnn_ = false;
+  cudnnActivationDescriptor_t acti_desc_ = nullptr;
+  cudnnTensorDescriptor_t desc_ = nullptr;
+  cudnnActivationMode_t cudnn_mode_;
+};
+}  // namespace singa
+#endif  // USE_CUDNN
+#endif  // SINGA_MODEL_LAYER_CUDNN_ACTIVATION_H_
diff --git a/src/model/layer/cudnn_batchnorm.cc b/src/model/layer/cudnn_batchnorm.cc
new file mode 100644
index 0000000..f29679c
--- /dev/null
+++ b/src/model/layer/cudnn_batchnorm.cc
@@ -0,0 +1,230 @@
+/*********************************************************
+*
+* Licensed to the Apache Software Foundation (ASF) under one
+* or more contributor license agreements.  See the NOTICE file
+* distributed with this work for additional information
+* regarding copyright ownership.  The ASF licenses this file
+* to you under the Apache License, Version 2.0 (the
+* "License"); you may not use this file except in compliance
+* with the License.  You may obtain a copy of the License at
+*
+*   http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing,
+* software distributed under the License is distributed on an
+* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+* KIND, either express or implied.  See the License for the
+* specific language governing permissions and limitations
+* under the License.
+*
+************************************************************/
+#include "cudnn_batchnorm.h"
+#ifdef USE_CUDNN
+
+namespace singa {
+
+RegisterLayerClass(cudnn_batchnorm, CudnnBatchNorm);
+CudnnBatchNorm::~CudnnBatchNorm() {
+  if (has_init_cudnn_) {
+    CUDNN_CHECK(cudnnDestroyTensorDescriptor(shape_desc_));
+    CUDNN_CHECK(cudnnDestroyTensorDescriptor(param_desc_));
+  }
+}
+
+void CudnnBatchNorm::ToDevice(std::shared_ptr<Device> device) {
+  BatchNorm::ToDevice(device);
+  resultSaveMean_.ToDevice(device);
+  resultSaveVariance_.ToDevice(device);
+}
+
+void CudnnBatchNorm::Setup(const Shape& in_sample, const LayerConf& conf) {
+  BatchNorm::Setup(in_sample, conf);
+  bnScale_.Reshape(Shape{channels_});
+  bnBias_.Reshape(Shape{channels_});
+  dbnScale_.Reshape(Shape{channels_});
+  dbnBias_.Reshape(Shape{channels_});
+  runningMean_.Reshape(Shape{channels_});
+  runningVariance_.Reshape(Shape{channels_});
+  resultSaveMean_.Reshape(Shape{channels_});
+  resultSaveVariance_.Reshape(Shape{channels_});
+}
+
+void CudnnBatchNorm::InitCudnn(const Shape& shape, DataType dtype) {
+  CHECK(!has_init_cudnn_);
+  mode_ = CUDNN_BATCHNORM_SPATIAL;
+  CUDNN_CHECK(cudnnCreateTensorDescriptor(&shape_desc_));
+  CUDNN_CHECK(cudnnCreateTensorDescriptor(&param_desc_));
+  CHECK_EQ(shape.size(), 4u);
+  CUDNN_CHECK(cudnnSetTensor4dDescriptor(shape_desc_,
+        CUDNN_TENSOR_NCHW,
+        GetCudnnDataType(dtype),
+        shape[0],
+        shape[1],
+        shape[2],
+        shape[3]));
+  CUDNN_CHECK(cudnnSetTensor4dDescriptor(param_desc_,
+        CUDNN_TENSOR_NCHW,
+        GetCudnnDataType(dtype),
+        1,
+        shape[1],
+        1,
+        1));
+  has_init_cudnn_ = true;
+}
+const Tensor CudnnBatchNorm::Forward(int flag, const Tensor& input) {
+  auto shape = input.shape();
+  auto dtype = input.data_type();
+  Tensor output;
+  Tensor x;
+  if(is_2d_)
+    x = Reshape(input, Shape{shape.at(0), shape.at(1), 1, 1});
+  else
+    x = input;
+  shape = x.shape();
+  if (!has_init_cudnn_)
+    InitCudnn(shape, dtype);
+  // TODO(wangji): check device id of input and params
+  output.ResetLike(x);
+  if ((flag & kTrain) == kTrain) {
+    output.device()->Exec(
+        [=](Context* ctx) {
+          Block *inBlock = x.block(), *outBlock = output.block(),
+            *saveMeanBlock = resultSaveMean_.block(),
+            *saveVarBlock = resultSaveVariance_.block(),
+            *runningMeanBlock = runningMean_.block(),
+            *runningVarBlock = runningVariance_.block(),
+            *bnScaleBlock = bnScale_.block(),
+            *bnBiasBlock = bnBias_.block();
+          const float alpha = 1.0f, beta = 0.0f;
+          double epsilon = CUDNN_BN_MIN_EPSILON;
+          CUDNN_CHECK(cudnnBatchNormalizationForwardTraining(
+              ctx->cudnn_handle,
+              this->mode_,
+              &alpha,
+              &beta,
+              shape_desc_,
+              inBlock->data(),
+              shape_desc_,
+              outBlock->mutable_data(),
+              param_desc_,
+              bnScaleBlock->data(),
+              bnBiasBlock->data(),
+              factor_,
+              runningMeanBlock->mutable_data(),
+              runningVarBlock->mutable_data(),
+              epsilon,
+              saveMeanBlock->mutable_data(),
+              saveVarBlock->mutable_data()));
+        },
+        {x.block(),
+         bnScale_.block(),
+         bnBias_.block()},
+        {output.block(),
+         runningMean_.block(),
+         runningVariance_.block(),
+         resultSaveMean_.block(),
+         resultSaveVariance_.block()});
+    buf_.push(x);
+  } else {
+    output.device()->Exec(
+        [=](Context* ctx) {
+          Block *inBlock = x.block(), *outBlock = output.block(),
+            *runningMeanBlock = runningMean_.block(),
+            *runningVarBlock = runningVariance_.block(),
+            *bnScaleBlock = bnScale_.block(),
+            *bnBiasBlock = bnBias_.block();
+          const float alpha = 1.0f, beta = 0.0f;
+          double epsilon = CUDNN_BN_MIN_EPSILON;
+          CUDNN_CHECK(cudnnBatchNormalizationForwardInference(
+              ctx->cudnn_handle,
+              this->mode_,
+              &alpha,
+              &beta,
+              shape_desc_,
+              inBlock->data(),
+              shape_desc_,
+              outBlock->mutable_data(),
+              param_desc_,
+              bnScaleBlock->data(),
+              bnBiasBlock->data(),
+              runningMeanBlock->data(),
+              runningVarBlock->data(),
+              epsilon));
+        },
+        {x.block(),
+         bnScale_.block(),
+         bnBias_.block(),
+         runningMean_.block(),
+         runningVariance_.block()},
+        {output.block()});
+  }
+  if (is_2d_)
+    output.Reshape(Shape{shape.at(0), shape.at(1)});
+  return output;
+}
+
+const std::pair<Tensor, vector<Tensor>> CudnnBatchNorm::Backward(
+    int flag, const Tensor& grad) {
+  vector <Tensor> param_grad;
+  Tensor dx;
+  if ((flag & kTrain) == kTrain) {
+    Tensor x = buf_.top();
+    buf_.pop();
+    dx.ResetLike(grad);
+    dx.device()->Exec(
+        [=](Context* ctx) {
+          Block *dyblock = grad.block(), *dxblock = dx.block(),
+            *xblock = x.block(),
+            *bnScaleBlock = bnScale_.block(),
+            *dbnScaleBlock = dbnScale_.block(),
+            *dbnBiasBlock = dbnBias_.block(),
+            *saveMeanBlock = resultSaveMean_.block(),
+            *saveVarBlock = resultSaveVariance_.block();
+          const float alpha = 1.0f, beta = .0f;
+          double epsilon = CUDNN_BN_MIN_EPSILON;
+          CUDNN_CHECK(cudnnBatchNormalizationBackward(ctx->cudnn_handle,
+              this->mode_,
+              &alpha,
+              &beta,
+              &alpha,
+              &beta,
+              shape_desc_,
+              xblock->data(),
+              shape_desc_,
+              dyblock->data(),
+              shape_desc_,
+              dxblock->mutable_data(),
+              param_desc_,
+              bnScaleBlock->data(),
+              dbnScaleBlock->mutable_data(),
+              dbnBiasBlock->mutable_data(),
+              epsilon,
+              saveMeanBlock->data(),
+              saveVarBlock->data()));
+
+        },
+        {dx.block(),
+         grad.block(),
+         bnScale_.block(),
+         resultSaveMean_.block(),
+         resultSaveVariance_.block()},
+        {dx.block(),
+         dbnScale_.block(),
+         dbnBias_.block()});
+  } else {
+    LOG(ERROR) << "Do not call backward for evaluation phase";
+  }
+  param_grad.push_back(dbnScale_);
+  param_grad.push_back(dbnBias_);
+  Tensor dummy;
+  dummy.ResetLike(dbnScale_);
+  dummy.SetValue(.0f);
+  param_grad.push_back(dummy);
+  param_grad.push_back(dummy);
+  if (is_2d_)
+    dx.Reshape(Shape{dx.shape().at(0), dx.shape().at(1)});
+  return std::make_pair(dx, param_grad);
+}
+}  // namespace singa
+
+#endif  // USE_CUDNN
diff --git a/src/model/layer/cudnn_batchnorm.h b/src/model/layer/cudnn_batchnorm.h
new file mode 100644
index 0000000..c4390a1
--- /dev/null
+++ b/src/model/layer/cudnn_batchnorm.h
@@ -0,0 +1,58 @@
+/*********************************************************
+*
+* Licensed to the Apache Software Foundation (ASF) under one
+* or more contributor license agreements.  See the NOTICE file
+* distributed with this work for additional information
+* regarding copyright ownership.  The ASF licenses this file
+* to you under the Apache License, Version 2.0 (the
+* "License"); you may not use this file except in compliance
+* with the License.  You may obtain a copy of the License at
+*
+*   http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing,
+* software distributed under the License is distributed on an
+* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+* KIND, either express or implied.  See the License for the
+* specific language governing permissions and limitations
+* under the License.
+*
+************************************************************/
+#ifndef SINGA_MODEL_LAYER_CUDNN_BATCHNORM_H
+#define SINGA_MODEL_LAYER_CUDNN_BATCHNORM_H
+#include "singa/singa_config.h"
+#ifdef USE_CUDNN
+
+#include "batchnorm.h"
+#include "cudnn_utils.h"
+
+namespace singa {
+class CudnnBatchNorm : public BatchNorm {
+ public:
+  ~CudnnBatchNorm();
+  /// \copydoc Layer::layer_type()
+  // const std::string layer_type() const override { return "CudnnBatchNorm"; }
+
+  void Setup(const Shape& in_sample, const LayerConf& conf) override;
+
+  const Tensor Forward(int flag, const Tensor& input) override;
+  const std::pair<Tensor, vector<Tensor>> Backward(int flag,
+                                                   const Tensor& grad) override;
+  void ToDevice(std::shared_ptr<Device> device) override;
+
+ private:
+  /// Init cudnn related data structures.
+  void InitCudnn(const Shape& shape, DataType dtype);
+
+ private:
+  bool has_init_cudnn_ = false;
+  cudnnBatchNormMode_t mode_;
+  cudnnLRNDescriptor_t lrn_desc_ = nullptr;
+  cudnnTensorDescriptor_t shape_desc_ = nullptr, param_desc_ = nullptr;
+  Tensor resultSaveMean_, resultSaveVariance_;
+
+};  // class CudnnBatchNorm
+}  // namespace singa
+
+#endif  // USE_CUDNN
+#endif  // SINGA_MODEL_LAYER_CUDNN_BATCHNORM_H
diff --git a/src/model/layer/cudnn_convolution.cc b/src/model/layer/cudnn_convolution.cc
new file mode 100644
index 0000000..ffd2ab7
--- /dev/null
+++ b/src/model/layer/cudnn_convolution.cc
@@ -0,0 +1,251 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "./cudnn_convolution.h"
+#ifdef USE_CUDNN
+#include <cudnn.h>
+#include <chrono>
+#include "./cudnn_utils.h"
+#include "singa/utils/logging.h"
+
+namespace singa {
+RegisterLayerClass(cudnn_convolution, CudnnConvolution);
+CudnnConvolution::~CudnnConvolution() {
+  if (bias_desc_ != nullptr)
+    CUDNN_CHECK(cudnnDestroyTensorDescriptor(bias_desc_));
+  if (filter_desc_ != nullptr)
+    CUDNN_CHECK(cudnnDestroyFilterDescriptor(filter_desc_));
+  if (conv_desc_ != nullptr)
+    CUDNN_CHECK(cudnnDestroyConvolutionDescriptor(conv_desc_));
+  if (x_desc_ != nullptr) CUDNN_CHECK(cudnnDestroyTensorDescriptor(x_desc_));
+  if (y_desc_ != nullptr) CUDNN_CHECK(cudnnDestroyTensorDescriptor(y_desc_));
+}
+
+void CudnnConvolution::Setup(const Shape& in_sample, const LayerConf &conf) {
+  Convolution::Setup(in_sample, conf);
+  ConvolutionConf conv_conf = conf.convolution_conf();
+  // convert MB to bytes
+  workspace_byte_limit_ = conv_conf.workspace_byte_limit() << 20;
+  prefer_ = ToLowerCase(conv_conf.prefer());
+  CHECK(prefer_ == "fastest" || prefer_ == "limited_workspace" ||
+        prefer_ == "no_workspace" || prefer_ == "autotune")
+      << "CudnnConvolution only supports four algorithm preferences: fastest, "
+         "limited_workspace, no_workspace and autotune";
+}
+
+void CudnnConvolution::ToDevice(std::shared_ptr<Device> device) {
+  Convolution::ToDevice(device);
+  workspace_.ToDevice(device);
+}
+
+void CudnnConvolution::InitCudnn(const Tensor &input) {
+  CHECK(!has_init_cudnn_);
+  DataType dtype = input.data_type();
+  auto dev = input.device();
+  Context *ctx = dev->context(0);
+  size_t batchsize = input.shape(0);
+  CUDNN_CHECK(cudnnCreateTensorDescriptor(&x_desc_));
+  CUDNN_CHECK(cudnnCreateTensorDescriptor(&y_desc_));
+  CUDNN_CHECK(cudnnCreateTensorDescriptor(&bias_desc_));
+  CUDNN_CHECK(cudnnCreateFilterDescriptor(&filter_desc_));
+  CUDNN_CHECK(cudnnCreateConvolutionDescriptor(&conv_desc_));
+
+  CUDNN_CHECK(cudnnSetTensor4dDescriptor(x_desc_, CUDNN_TENSOR_NCHW,
+                                         GetCudnnDataType(dtype), batchsize,
+                                         channels_, height_, width_));
+  CUDNN_CHECK(cudnnSetTensor4dDescriptor(
+      y_desc_, CUDNN_TENSOR_NCHW, GetCudnnDataType(dtype), batchsize,
+      num_filters_, conv_height_, conv_width_));
+  if (bias_term_)
+    CUDNN_CHECK(cudnnSetTensor4dDescriptor(bias_desc_, CUDNN_TENSOR_NCHW,
+                                           GetCudnnDataType(dtype), 1,
+                                           num_filters_, 1, 1));
+  CUDNN_CHECK(cudnnSetConvolution2dDescriptor(conv_desc_, pad_h_, pad_w_,
+                                              stride_h_, stride_w_, 1, 1,
+                                              CUDNN_CROSS_CORRELATION));
+#if CUDNN_VERSION_MAJOR == 5
+  CUDNN_CHECK(cudnnSetFilter4dDescriptor(filter_desc_, GetCudnnDataType(dtype),
+                                         CUDNN_TENSOR_NCHW, num_filters_,
+                                         channels_, kernel_h_, kernel_w_));
+#elif CUDNN_VERSION_MAJOR == 4
+  CUDNN_CHECK(cudnnSetFilter4dDescriptor_v4(
+      filter_desc_, GetCudnnDataType(dtype), CUDNN_TENSOR_NCHW, num_filters_,
+      channels_, kernel_h_, kernel_w_));
+#else
+  LOG(FATAL) << "Not supported CUDNN version = " << CUDNN_VERSION_MAJOR;
+#endif
+
+  if (prefer_ == "fastest" || prefer_ == "limited_workspace" ||
+      prefer_ == "no_workspace") {
+    cudnnConvolutionFwdPreference_t fwd_pref;
+    cudnnConvolutionBwdFilterPreference_t bwd_filt_pref;
+    cudnnConvolutionBwdDataPreference_t bwd_data_pref;
+    if (prefer_ == "fastest") {
+      fwd_pref = CUDNN_CONVOLUTION_FWD_PREFER_FASTEST;
+      bwd_filt_pref = CUDNN_CONVOLUTION_BWD_FILTER_PREFER_FASTEST;
+      bwd_data_pref = CUDNN_CONVOLUTION_BWD_DATA_PREFER_FASTEST;
+    } else if (prefer_ == "limited_workspace") {
+      fwd_pref = CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT;
+      bwd_filt_pref = CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT;
+      bwd_data_pref = CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT;
+    } else {
+      fwd_pref = CUDNN_CONVOLUTION_FWD_NO_WORKSPACE;
+      bwd_filt_pref = CUDNN_CONVOLUTION_BWD_FILTER_NO_WORKSPACE;
+      bwd_data_pref = CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT;
+    }
+    CUDNN_CHECK(cudnnGetConvolutionForwardAlgorithm(
+        ctx->cudnn_handle, x_desc_, filter_desc_, conv_desc_, y_desc_, fwd_pref,
+        workspace_byte_limit_, &fp_alg_));
+    CUDNN_CHECK(cudnnGetConvolutionBackwardFilterAlgorithm(
+        ctx->cudnn_handle, x_desc_, y_desc_, conv_desc_, filter_desc_,
+        bwd_filt_pref, workspace_byte_limit_, &bp_filter_alg_));
+    CUDNN_CHECK(cudnnGetConvolutionBackwardDataAlgorithm(
+        ctx->cudnn_handle, filter_desc_, y_desc_, conv_desc_, x_desc_,
+        bwd_data_pref, workspace_byte_limit_, &bp_data_alg_));
+  } else if (prefer_ == "autotune") {
+    const int topk = 1;
+    int num_fp_alg, num_bp_filt_alg, num_bp_data_alg;
+    cudnnConvolutionFwdAlgoPerf_t fp_alg_perf[topk];
+    cudnnConvolutionBwdFilterAlgoPerf_t bp_filt_perf[topk];
+    cudnnConvolutionBwdDataAlgoPerf_t bp_data_perf[topk];
+    CUDNN_CHECK(cudnnFindConvolutionForwardAlgorithm(
+        ctx->cudnn_handle, x_desc_, filter_desc_, conv_desc_, y_desc_, topk,
+        &num_fp_alg, fp_alg_perf));
+    fp_alg_ = fp_alg_perf[0].algo;
+    CUDNN_CHECK(cudnnFindConvolutionBackwardFilterAlgorithm(
+        ctx->cudnn_handle, x_desc_, y_desc_, conv_desc_, filter_desc_, topk,
+        &num_bp_filt_alg, bp_filt_perf));
+    bp_filter_alg_ = bp_filt_perf[0].algo;
+    CUDNN_CHECK(cudnnFindConvolutionBackwardDataAlgorithm(
+        ctx->cudnn_handle, filter_desc_, y_desc_, conv_desc_, x_desc_, topk,
+        &num_bp_data_alg, bp_data_perf));
+    bp_data_alg_ = bp_data_perf[0].algo;
+  } else {
+    LOG(FATAL) << "Preferred algorithm is not available!";
+  }
+
+  size_t fp_byte, bp_data_byte, bp_filter_byte;
+  CUDNN_CHECK(cudnnGetConvolutionForwardWorkspaceSize(
+      ctx->cudnn_handle, x_desc_, filter_desc_, conv_desc_, y_desc_, fp_alg_,
+      &fp_byte));
+  CUDNN_CHECK(cudnnGetConvolutionBackwardDataWorkspaceSize(
+      ctx->cudnn_handle, filter_desc_, y_desc_, conv_desc_, x_desc_,
+      bp_data_alg_, &bp_data_byte));
+  CUDNN_CHECK(cudnnGetConvolutionBackwardFilterWorkspaceSize(
+      ctx->cudnn_handle, x_desc_, y_desc_, conv_desc_, filter_desc_,
+      bp_filter_alg_, &bp_filter_byte));
+  workspace_count_ = std::max(std::max(fp_byte, bp_data_byte), bp_filter_byte) /
+                         sizeof(float) +
+                     1;
+  workspace_ = Tensor(Shape{workspace_count_}, dev, dtype);
+  has_init_cudnn_ = true;
+}
+
+const Tensor CudnnConvolution::Forward(int flag, const Tensor &input) {
+  CHECK(buf_.empty());
+  CHECK_EQ(input.device()->lang(), kCuda);
+  CHECK_EQ(input.nDim(), 4u);
+  if (flag & kTrain) buf_.push(input);  // buffer the input for backward
+  size_t batchsize = input.shape()[0];
+  DataType dtype = input.data_type();
+  auto dev = input.device();
+
+  if (!has_init_cudnn_) InitCudnn(input);
+
+  Shape shape{batchsize, num_filters_, conv_height_, conv_width_};
+  Tensor output(shape, dev, dtype);
+  output.device()->Exec([input, output, this](Context *ctx) {
+    Block *inblock = input.block(), *outblock = output.block(),
+          *wblock = this->weight_.block();
+    float alpha = 1.f, beta = 0.f;
+    cudnnConvolutionForward(ctx->cudnn_handle, &alpha, this->x_desc_,
+                            inblock->data(), this->filter_desc_, wblock->data(),
+                            this->conv_desc_, this->fp_alg_,
+                            this->workspace_.block()->mutable_data(),
+                            this->workspace_count_ * sizeof(float), &beta,
+                            this->y_desc_, outblock->mutable_data());
+  }, {input.block(), weight_.block()}, {output.block()}, workspace_.block());
+
+  if (bias_term_) {
+    output.device()->Exec([output, this](Context *ctx) {
+      float beta = 1.f, alpha = 1.0f;
+      Block *outblock = output.block(), *bblock = this->bias_.block();
+      cudnnAddTensor(ctx->cudnn_handle, &alpha, this->bias_desc_,
+                     bblock->data(), &beta, this->y_desc_,
+                     outblock->mutable_data());
+    }, {output.block(), bias_.block()}, {output.block()});
+  }
+  return output;
+}
+
+const std::pair<Tensor, vector<Tensor>> CudnnConvolution::Backward(
+    int flag, const Tensor &grad) {
+  CHECK(has_init_cudnn_);
+  CHECK_EQ(grad.device()->lang(), kCuda);
+  CHECK_EQ(grad.nDim(), 4u);
+  CHECK(!buf_.empty());
+  Tensor src_data = buf_.top();
+  buf_.pop();
+  vector<Tensor> param_grad;
+  Tensor dx;
+  dx.ResetLike(src_data);
+  Tensor db, dw;
+  db.ResetLike(bias_);
+  dw.ResetLike(weight_);
+
+  // LOG(ERROR) << "backward bias";
+  if (bias_term_) {
+    dx.device()->Exec([grad, db, this](Context *ctx) {
+      Block *dyblock = grad.block(), *dbblock = db.block();
+      float alpha = 1.f, beta = 0.f;
+      cudnnConvolutionBackwardBias(ctx->cudnn_handle, &alpha, this->y_desc_,
+                                   dyblock->data(), &beta, this->bias_desc_,
+                                   dbblock->mutable_data());
+    }, {grad.block()}, {db.block()});
+  }
+  // LOG(ERROR) << "backward w";
+  dx.device()->Exec([grad, dw, src_data, this](Context *ctx) {
+    Block *inblock = src_data.block(), *dyblock = grad.block(),
+          *dwblock = dw.block();
+    float alpha = 1.f, beta = 0.f;
+    cudnnConvolutionBackwardFilter(
+        ctx->cudnn_handle, &alpha, this->x_desc_, inblock->data(),
+        this->y_desc_, dyblock->data(), this->conv_desc_, this->bp_filter_alg_,
+        this->workspace_.block()->mutable_data(),
+        this->workspace_count_ * sizeof(float), &beta, this->filter_desc_,
+        dwblock->mutable_data());
+  }, {grad.block(), src_data.block()}, {dw.block(), workspace_.block()});
+
+  // LOG(ERROR) << "backward src";
+  dx.device()->Exec([dx, grad, this](Context *ctx) {
+    Block *wblock = this->weight_.block(), *dyblock = grad.block(),
+          *dxblock = dx.block();
+    float alpha = 1.f, beta = 0.f;
+    cudnnConvolutionBackwardData(ctx->cudnn_handle, &alpha, this->filter_desc_,
+                                 wblock->data(), this->y_desc_, dyblock->data(),
+                                 this->conv_desc_, this->bp_data_alg_,
+                                 this->workspace_.block()->mutable_data(),
+                                 this->workspace_count_ * sizeof(float), &beta,
+                                 this->x_desc_, dxblock->mutable_data());
+  }, {grad.block(), weight_.block()}, {dx.block(), workspace_.block()});
+  param_grad.push_back(dw);
+  param_grad.push_back(db);
+  return std::make_pair(dx, param_grad);
+}
+
+}  // namespace singa
+#endif  // USE_CUDNN
diff --git a/src/model/layer/cudnn_convolution.h b/src/model/layer/cudnn_convolution.h
new file mode 100644
index 0000000..545fd5c
--- /dev/null
+++ b/src/model/layer/cudnn_convolution.h
@@ -0,0 +1,73 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef SRC_MODEL_LAYER_CUDNN_CONVOLUTION_H_
+#define SRC_MODEL_LAYER_CUDNN_CONVOLUTION_H_
+#include "singa/singa_config.h"
+#ifdef USE_CUDNN
+#include <string>
+#include <utility>
+#include <vector>
+#include "./convolution.h"
+#include "singa/core/common.h"
+#include "singa/model/layer.h"
+#include "singa/proto/core.pb.h"
+#include "singa/utils/string.h"
+
+namespace singa {
+class CudnnConvolution : public Convolution {
+ public:
+  ~CudnnConvolution();
+  /// \copydoc Layer::layer_type()
+  // const std::string layer_type() const override { return "CudnnConvolution";}
+
+  const Tensor Forward(int flag, const Tensor &input) override;
+  const std::pair<Tensor, vector<Tensor>> Backward(int flag,
+                                                   const Tensor &grad) override;
+
+  /// \copydoc Layer::Setup(const LayerConf&);
+  void Setup(const Shape& in_sample, const LayerConf &conf) override;
+
+  void ToDevice(std::shared_ptr<Device> device) override;
+
+  size_t workspace_byte_limit() { return workspace_byte_limit_; }
+  string prefer() { return prefer_; }
+
+ protected:
+  /// Init cudnn related data structures.
+  void InitCudnn(const Tensor& input);
+
+ protected:
+  bool has_init_cudnn_ = false;
+  cudnnTensorDescriptor_t x_desc_ = nullptr;
+  cudnnTensorDescriptor_t y_desc_ = nullptr;
+  cudnnTensorDescriptor_t bias_desc_ = nullptr;
+  cudnnFilterDescriptor_t filter_desc_ = nullptr;
+  cudnnConvolutionDescriptor_t conv_desc_ = nullptr;
+  cudnnConvolutionFwdAlgo_t fp_alg_;
+  cudnnConvolutionBwdFilterAlgo_t bp_filter_alg_;
+  cudnnConvolutionBwdDataAlgo_t bp_data_alg_;
+  size_t workspace_byte_limit_, workspace_count_;
+  Tensor workspace_;
+  string prefer_;
+};
+
+}  // namespace singa
+
+#endif  // USE_CUDNN
+#endif  // SRC_MODEL_LAYER_CUDNN_CONVOLUTION_H_
diff --git a/src/model/layer/cudnn_dropout.cc b/src/model/layer/cudnn_dropout.cc
new file mode 100644
index 0000000..e05a425
--- /dev/null
+++ b/src/model/layer/cudnn_dropout.cc
@@ -0,0 +1,116 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "./cudnn_dropout.h"
+#ifdef USE_CUDNN
+#include <cudnn.h>
+// cudnn dropout is added in cudnn 5
+#if CUDNN_MAJOR >= 5
+
+#include <chrono>
+
+#include "./cudnn_utils.h"
+#include "singa/utils/logging.h"
+
+namespace singa {
+RegisterLayerClass(cudnn_dropout, CudnnDropout);
+CudnnDropout::~CudnnDropout() {
+  if (drop_desc_ != nullptr)
+    CUDNN_CHECK(cudnnDestroyDropoutDescriptor(drop_desc_));
+  if (x_desc_ != nullptr) CUDNN_CHECK(cudnnDestroyTensorDescriptor(x_desc_));
+  if (y_desc_ != nullptr) CUDNN_CHECK(cudnnDestroyTensorDescriptor(y_desc_));
+}
+
+void CudnnDropout::InitCudnn(int size, DataType dtype,
+                             std::shared_ptr<Device> dev, Context* ctx) {
+  CHECK(!has_init_cudnn_);
+  CUDNN_CHECK(cudnnCreateTensorDescriptor(&x_desc_));
+  CUDNN_CHECK(cudnnCreateTensorDescriptor(&y_desc_));
+  CUDNN_CHECK(cudnnCreateDropoutDescriptor(&drop_desc_));
+
+  CUDNN_CHECK(cudnnSetTensor4dDescriptor(
+      x_desc_, CUDNN_TENSOR_NCHW, GetCudnnDataType(dtype), 1, 1, 1, size));
+  CUDNN_CHECK(cudnnSetTensor4dDescriptor(
+      y_desc_, CUDNN_TENSOR_NCHW, GetCudnnDataType(dtype), 1, 1, 1, size));
+
+  cudnnDropoutGetStatesSize(ctx->cudnn_handle, &state_size_);
+  state_ = Tensor(Shape{state_size_}, dev, kChar);
+  cudnnDropoutGetReserveSpaceSize(x_desc_, &reserve_size_);
+  mask_ = Tensor(Shape{reserve_size_}, dev, kChar);
+  // TODO(wangwei) update for async running,
+  // where reserve_size_ may not be available
+  CHECK_EQ(reserve_size_, mask_.MemSize());
+
+  // TODO(wangwei) get seed from ctx or user config?
+  auto seed = std::chrono::system_clock::now().time_since_epoch().count();
+  cudnnSetDropoutDescriptor(drop_desc_, ctx->cudnn_handle, 1 - dropout_ratio_,
+                            state_.block()->mutable_data(), state_size_, seed);
+  has_init_cudnn_ = true;
+}
+
+const Tensor CudnnDropout::Forward(int flag, const Tensor& input) {
+  if (flag & kTrain) {
+    auto size = input.Size();
+    DataType dtype = input.data_type();
+    auto dev = input.device();
+    if (!has_init_cudnn_) {
+      input.device()->Exec([size, dtype, this, dev](Context* ctx) {
+        this->InitCudnn(size, dtype, dev, ctx);
+      }, {}, {this->state_.block()});
+    }
+    Tensor output;
+    output.ResetLike(input);
+    output.device()->Exec([input, output, this](Context* ctx) {
+      Block* inblock = input.block(), * outblock = output.block(),
+             * mblock = mask_.block();
+      cudnnDropoutForward(ctx->cudnn_handle, this->drop_desc_, this->x_desc_,
+                          inblock->data(), this->y_desc_,
+                          outblock->mutable_data(), mblock->mutable_data(),
+                          this->reserve_size_);
+    }, {input.block()}, {output.block(), mask_.block()});
+    return output;
+  } else {
+    return input;
+  }
+}
+
+const std::pair<Tensor, vector<Tensor>> CudnnDropout::Backward(
+    int flag, const Tensor& grad) {
+  vector<Tensor> param_grad;
+  Tensor dx;
+  if (flag & kTrain) {
+    dx.ResetLike(grad);
+    dx.device()->Exec([dx, grad, this](Context* ctx) {
+      Block* dyblock = grad.block(), * dxblock = dx.block(),
+             * mblock = this->mask_.block();
+      cudnnDropoutBackward(ctx->cudnn_handle, this->drop_desc_, this->y_desc_,
+                           dyblock->data(), this->x_desc_,
+                           dxblock->mutable_data(), mblock->mutable_data(),
+                           this->reserve_size_);
+    }, {grad.block(), mask_.block()}, {dx.block()});
+  } else {
+    LOG(ERROR) << "Do not call backward for evaluation phase";
+  }
+  return std::make_pair(dx, param_grad);
+}
+void CudnnDropout::ToDevice(std::shared_ptr<Device> device) {
+  Dropout::ToDevice(device);
+  state_.ToDevice(device);
+}
+}  // namespace singa
+#endif  // CUDNN_MAJOR>=5
+#endif  // USE_CUDNN
diff --git a/src/model/layer/cudnn_dropout.h b/src/model/layer/cudnn_dropout.h
new file mode 100644
index 0000000..c6fc1c9
--- /dev/null
+++ b/src/model/layer/cudnn_dropout.h
@@ -0,0 +1,62 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef SRC_MODEL_LAYER_CUDNN_DROPOUT_H_
+#define SRC_MODEL_LAYER_CUDNN_DROPOUT_H_
+#include "singa/singa_config.h"
+#ifdef USE_CUDNN
+#include <cudnn.h>
+// cudnn dropout is added in cudnn 5
+#if CUDNN_MAJOR >= 5
+#include "./dropout.h"
+
+#include <cudnn.h>
+#include <utility>
+#include <string>
+#include <vector>
+
+#include "singa/model/layer.h"
+
+namespace singa {
+class CudnnDropout : public Dropout {
+ public:
+  ~CudnnDropout();
+  /// \copydoc Layer::layer_type()
+  // const std::string layer_type() const override { return "CudnnDropout"; }
+
+  const Tensor Forward(int flag, const Tensor& input) override;
+  const std::pair<Tensor, vector<Tensor>> Backward(int flag,
+                                                   const Tensor& grad) override;
+
+  void ToDevice(std::shared_ptr<Device> device) override;
+ private:
+  /// Init cudnn related data structures.
+  void InitCudnn(int size, DataType dtype, std::shared_ptr<Device> dev,
+                 Context* ctx);
+
+ private:
+  bool has_init_cudnn_ = false;
+  cudnnDropoutDescriptor_t drop_desc_ = nullptr;
+  cudnnTensorDescriptor_t x_desc_ = nullptr, y_desc_ = nullptr;
+  size_t state_size_, reserve_size_;
+  Tensor state_;
+};
+}  // namespace singa
+#endif  // CUDNN_MAJOR>=5
+#endif  // USE_CUDNN
+#endif  // SRC_MODEL_LAYER_CUDNN_DROPOUT_H_
diff --git a/src/model/layer/cudnn_lrn.cc b/src/model/layer/cudnn_lrn.cc
new file mode 100644
index 0000000..ac7645e
--- /dev/null
+++ b/src/model/layer/cudnn_lrn.cc
@@ -0,0 +1,95 @@
+/*********************************************************
+*
+* Licensed to the Apache Software Foundation (ASF) under one
+* or more contributor license agreements.  See the NOTICE file
+* distributed with this work for additional information
+* regarding copyright ownership.  The ASF licenses this file
+* to you under the Apache License, Version 2.0 (the
+* "License"); you may not use this file except in compliance
+* with the License.  You may obtain a copy of the License at
+*
+*   http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing,
+* software distributed under the License is distributed on an
+* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+* KIND, either express or implied.  See the License for the
+* specific language governing permissions and limitations
+* under the License.
+*
+************************************************************/
+#include "cudnn_lrn.h"
+#ifdef USE_CUDNN
+#include "cudnn_utils.h"
+
+namespace singa {
+RegisterLayerClass(cudnn_lrn, CudnnLRN);
+CudnnLRN::~CudnnLRN() {
+  if (has_init_cudnn_) {
+    CUDNN_CHECK(cudnnDestroyLRNDescriptor(lrn_desc_));
+    CUDNN_CHECK(cudnnDestroyTensorDescriptor(shape_desc_));
+  }
+}
+void CudnnLRN::InitCudnn(const Shape& shape, DataType dtype) {
+  CHECK(!has_init_cudnn_);
+  mode_ = CUDNN_LRN_CROSS_CHANNEL_DIM1;
+  CUDNN_CHECK(cudnnCreateTensorDescriptor(&shape_desc_));
+  CHECK_EQ(shape.size(), 4u);
+  CUDNN_CHECK(cudnnSetTensor4dDescriptor(shape_desc_, CUDNN_TENSOR_NCHW,
+                                         GetCudnnDataType(dtype), shape[0],
+                                         shape[1], shape[2], shape[3]));
+  CUDNN_CHECK(cudnnCreateLRNDescriptor(&lrn_desc_));
+  CUDNN_CHECK(cudnnSetLRNDescriptor(lrn_desc_, local_size_, alpha_, beta_, k_));
+  has_init_cudnn_ = true;
+}
+const Tensor CudnnLRN::Forward(int flag, const Tensor& input) {
+  auto shape = input.shape();
+  auto dtype = input.data_type();
+  if (!has_init_cudnn_) InitCudnn(shape, dtype);
+  Tensor output;
+  output.ResetLike(input);
+  output.device()->Exec([=](Context* ctx) {
+    Block* inblock = input.block(), * outblock = output.block();
+    const float alpha = 1.0f, beta = 0.0f;
+    CUDNN_CHECK(cudnnLRNCrossChannelForward(
+        ctx->cudnn_handle, this->lrn_desc_, this->mode_, &alpha,
+        this->shape_desc_, inblock->data(), &beta, this->shape_desc_,
+        outblock->mutable_data()));
+  }, {input.block()}, {output.block()});
+
+  if (flag & kTrain) {
+    buf_.push(input);
+    buf_.push(output);
+  }
+  return output;
+}
+
+const std::pair<Tensor, vector<Tensor>> CudnnLRN::Backward(int flag,
+                                                           const Tensor& grad) {
+  vector<Tensor> param_grad;
+  Tensor dx;
+  CHECK(!buf_.empty());
+  Tensor output = buf_.top();
+  buf_.pop();
+  Tensor input = buf_.top();
+  buf_.pop();
+  if ((flag & kTrain) == kTrain) {
+    dx.ResetLike(grad);
+    dx.device()->Exec([=](Context* ctx) {
+      Block* dyblock = grad.block(), * dxblock = dx.block();
+      Block* yblock = output.block(), * xblock = input.block();
+      float alpha = 1.0f, beta = 0.0f;
+      CUDNN_CHECK(cudnnLRNCrossChannelBackward(
+          ctx->cudnn_handle, this->lrn_desc_, this->mode_, &alpha,
+          this->shape_desc_, yblock->data(), this->shape_desc_, dyblock->data(),
+          this->shape_desc_, xblock->data(), &beta, this->shape_desc_,
+          dxblock->mutable_data()));
+    }, {output.block(), grad.block(), input.block()}, {dx.block()});
+  } else {
+    LOG(ERROR) << "Do not call backward for evaluation phase";
+  }
+  return std::make_pair(dx, param_grad);
+}
+}  // namespace singa
+
+#endif  // USE_CUDNN
diff --git a/src/model/layer/cudnn_lrn.h b/src/model/layer/cudnn_lrn.h
new file mode 100644
index 0000000..c48571d
--- /dev/null
+++ b/src/model/layer/cudnn_lrn.h
@@ -0,0 +1,54 @@
+/*********************************************************
+*
+* Licensed to the Apache Software Foundation (ASF) under one
+* or more contributor license agreements.  See the NOTICE file
+* distributed with this work for additional information
+* regarding copyright ownership.  The ASF licenses this file
+* to you under the Apache License, Version 2.0 (the
+* "License"); you may not use this file except in compliance
+* with the License.  You may obtain a copy of the License at
+*
+*   http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing,
+* software distributed under the License is distributed on an
+* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+* KIND, either express or implied.  See the License for the
+* specific language governing permissions and limitations
+* under the License.
+*
+************************************************************/
+#ifndef SINGA_MODEL_LAYER_CUDNN_LRN_H_
+#define SINGA_MODEL_LAYER_CUDNN_LRN_H_
+#include "singa/singa_config.h"
+#ifdef USE_CUDNN
+
+#include "lrn.h"
+#include "cudnn_utils.h"
+
+namespace singa {
+class CudnnLRN : public LRN {
+ public:
+  ~CudnnLRN();
+  /// \copydoc Layer::layer_type()
+  // const std::string layer_type() const override { return "CudnnLRN"; }
+
+  const Tensor Forward(int flag, const Tensor& input) override;
+  const std::pair<Tensor, vector<Tensor>> Backward(int flag,
+                                                   const Tensor& grad) override;
+
+ private:
+  /// Init cudnn related data structures.
+  void InitCudnn(const Shape& shape, DataType dtype);
+
+ private:
+  bool has_init_cudnn_ = false;
+  cudnnLRNMode_t mode_;
+  cudnnLRNDescriptor_t lrn_desc_ = nullptr;
+  cudnnTensorDescriptor_t shape_desc_ = nullptr;
+
+};  // class CudnnLRN
+}  // namespace singa
+
+#endif  // USE_CUDNN
+#endif  // SINGA_MODEL_LAYER_CUDNN_LRN_H_
diff --git a/src/model/layer/cudnn_pooling.cc b/src/model/layer/cudnn_pooling.cc
new file mode 100644
index 0000000..895ce3c
--- /dev/null
+++ b/src/model/layer/cudnn_pooling.cc
@@ -0,0 +1,132 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "./cudnn_pooling.h"
+#ifdef USE_CUDNN
+
+#include <cudnn.h>
+#include <chrono>
+
+#include "./cudnn_utils.h"
+#include "singa/utils/logging.h"
+
+namespace singa {
+RegisterLayerClass(cudnn_pooling, CudnnPooling);
+CudnnPooling::~CudnnPooling() {
+  if (pool_desc_ != nullptr)
+    CUDNN_CHECK(cudnnDestroyPoolingDescriptor(pool_desc_));
+  if (x_desc_ != nullptr) CUDNN_CHECK(cudnnDestroyTensorDescriptor(x_desc_));
+  if (y_desc_ != nullptr) CUDNN_CHECK(cudnnDestroyTensorDescriptor(y_desc_));
+}
+
+void CudnnPooling::Setup(const Shape& in_sample, const LayerConf &conf) {
+  Pooling::Setup(in_sample, conf);
+  PoolingConf pool_conf = conf.pooling_conf();
+  if (pool_conf.nan_prop())
+    nan_prop_ = CUDNN_PROPAGATE_NAN;
+  else
+    nan_prop_ = CUDNN_NOT_PROPAGATE_NAN;
+}
+
+void CudnnPooling::InitCudnn(const Tensor &input) {
+  CHECK(!has_init_cudnn_);
+  DataType dtype = input.data_type();
+  size_t batchsize = input.shape(0);
+  CUDNN_CHECK(cudnnCreateTensorDescriptor(&x_desc_));
+  CUDNN_CHECK(cudnnCreateTensorDescriptor(&y_desc_));
+  CUDNN_CHECK(cudnnCreatePoolingDescriptor(&pool_desc_));
+
+  CUDNN_CHECK(cudnnSetTensor4dDescriptor(x_desc_, CUDNN_TENSOR_NCHW,
+                                         GetCudnnDataType(dtype), batchsize,
+                                         channels_, height_, width_));
+  CUDNN_CHECK(cudnnSetTensor4dDescriptor(
+      y_desc_, CUDNN_TENSOR_NCHW, GetCudnnDataType(dtype), batchsize, channels_,
+      pooled_height_, pooled_width_));
+  auto pool_method = CUDNN_POOLING_MAX;
+  if (pool_ == PoolingConf_PoolMethod_MAX)
+    pool_method = CUDNN_POOLING_MAX;
+  else if (pool_ == PoolingConf_PoolMethod_AVE)
+    pool_method = CUDNN_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING;
+  else
+    LOG(FATAL) << "Not implemented!";
+
+#if CUDNN_VERSION_MAJOR == 5
+  CUDNN_CHECK(cudnnSetPooling2dDescriptor(pool_desc_, pool_method, nan_prop_,
+                                          kernel_h_, kernel_w_, pad_h_, pad_w_,
+                                          stride_h_, stride_w_));
+#elif CUDNN_VERSION_MAJOR == 4
+  CUDNN_CHECK(cudnnSetPooling2dDescriptor_v4(pool_desc_, pool_method, nan_prop_,
+                                             kernel_h_, kernel_w_, pad_h_,
+                                             pad_w_, stride_h_, stride_w_));
+#else
+  LOG(FATAL) << "Not supported CUDNN version = " << CUDNN_VERSION_MAJOR;
+#endif
+  has_init_cudnn_ = true;
+}
+
+const Tensor CudnnPooling::Forward(int flag, const Tensor &input) {
+  CHECK(buf_.empty());
+  CHECK_EQ(input.device()->lang(), kCuda);
+  CHECK_EQ(input.nDim(), 4u);
+  size_t batchsize = input.shape(0);
+  DataType dtype = input.data_type();
+  auto dev = input.device();
+  if (!has_init_cudnn_) InitCudnn(input);
+
+  Shape shape{batchsize, channels_, pooled_height_, pooled_width_};
+  Tensor output = Tensor(shape, dev, dtype);
+  output.device()->Exec([input, output, this](Context *ctx) {
+    Block *inblock = input.block(), *outblock = output.block();
+    float alpha = 1.0f, beta = 0.0f;
+    cudnnPoolingForward(ctx->cudnn_handle, this->pool_desc_, &alpha,
+                        this->x_desc_, inblock->data(), &beta, this->y_desc_,
+                        outblock->mutable_data());
+  }, {input.block()}, {output.block()});
+  if (flag & kTrain) {
+    buf_.push(input);
+    buf_.push(output);
+  }
+  return output;
+}
+
+const std::pair<Tensor, vector<Tensor>> CudnnPooling::Backward(
+    int flag, const Tensor &grad) {
+  CHECK_EQ(grad.device()->lang(), kCuda);
+  CHECK_EQ(grad.nDim(), 4u);
+  vector<Tensor> param_grad;
+  CHECK(!buf_.empty());
+  Tensor y = buf_.top();
+  buf_.pop();
+  Tensor x = buf_.top();
+  buf_.pop();
+  Tensor dx;
+  dx.ResetLike(x);
+
+  dx.device()->Exec([dx, grad, x, y, this](Context *ctx) {
+    Block *dyblock = grad.block(), *dxblock = dx.block(), *yblock = y.block(),
+          *xblock = x.block();
+    float alpha = 1.0f, beta = 0.0f;
+    cudnnPoolingBackward(ctx->cudnn_handle, this->pool_desc_, &alpha,
+                         this->y_desc_, yblock->data(), this->y_desc_,
+                         dyblock->data(), this->x_desc_, xblock->data(), &beta,
+                         this->x_desc_, dxblock->mutable_data());
+  }, {grad.block(), y.block(), x.block()}, {dx.block()});
+
+  return std::make_pair(dx, param_grad);
+}
+}  // namespace singa
+#endif  // USE_CUDNN
diff --git a/src/model/layer/cudnn_pooling.h b/src/model/layer/cudnn_pooling.h
new file mode 100644
index 0000000..2080db3
--- /dev/null
+++ b/src/model/layer/cudnn_pooling.h
@@ -0,0 +1,57 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef SRC_MODEL_LAYER_CUDNN_POOLING_H_
+#define SRC_MODEL_LAYER_CUDNN_POOLING_H_
+#include "singa/singa_config.h"
+#ifdef USE_CUDNN
+#include <cudnn.h>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "./pooling.h"
+#include "singa/core/common.h"
+#include "singa/model/layer.h"
+#include "singa/proto/core.pb.h"
+
+namespace singa {
+class CudnnPooling : public Pooling {
+ public:
+  ~CudnnPooling();
+  /// \copydoc Layer::layer_type()
+  // const std::string layer_type() const override { return "CudnnPooling"; }
+
+  void Setup(const Shape& in_sample, const LayerConf &conf) override;
+  const Tensor Forward(int flag, const Tensor &input) override;
+  const std::pair<Tensor, vector<Tensor>> Backward(int flag,
+                                                   const Tensor &grad) override;
+ private:
+  /// Init cudnn related data structures.
+  void InitCudnn(const Tensor& input);
+
+ private:
+  bool has_init_cudnn_ = false;
+  cudnnTensorDescriptor_t x_desc_ = nullptr;
+  cudnnTensorDescriptor_t y_desc_ = nullptr;
+  cudnnPoolingDescriptor_t pool_desc_ = nullptr;
+  cudnnNanPropagation_t nan_prop_;
+};
+}  // namespace singa
+#endif  // USE_CUDNN
+#endif  // SRC_MODEL_LAYER_CUDNN_POOLING_H_
diff --git a/src/model/layer/cudnn_rnn.cc b/src/model/layer/cudnn_rnn.cc
new file mode 100644
index 0000000..0788801
--- /dev/null
+++ b/src/model/layer/cudnn_rnn.cc
@@ -0,0 +1,427 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "./cudnn_rnn.h"
+#ifdef USE_CUDNN
+#include <cudnn.h>
+#if CUDNN_VERSION >= 5005
+#include <chrono>
+#include "./cudnn_utils.h"
+#include "singa/utils/logging.h"
+
+namespace singa {
+RegisterLayerClass(cudnn_rnn, CudnnRNN);
+CudnnRNN::~CudnnRNN() {
+  if (weight_desc_ != nullptr)
+    CUDNN_CHECK(cudnnDestroyFilterDescriptor(weight_desc_));
+  if (dropout_desc_ != nullptr)
+    CUDNN_CHECK(cudnnDestroyDropoutDescriptor(dropout_desc_));
+  if (rnn_desc_ != nullptr)
+    CUDNN_CHECK(cudnnDestroyRNNDescriptor(rnn_desc_));
+  if (hx_desc_ != nullptr)
+    CUDNN_CHECK(cudnnDestroyTensorDescriptor(hx_desc_));
+  if (hy_desc_ != nullptr)
+    CUDNN_CHECK(cudnnDestroyTensorDescriptor(hy_desc_));
+  if (cx_desc_ != nullptr)
+    CUDNN_CHECK(cudnnDestroyTensorDescriptor(cx_desc_));
+  if (cy_desc_ != nullptr)
+    CUDNN_CHECK(cudnnDestroyTensorDescriptor(cy_desc_));
+  if (dhx_desc_ != nullptr)
+    CUDNN_CHECK(cudnnDestroyTensorDescriptor(dhx_desc_));
+  if (dhy_desc_ != nullptr)
+    CUDNN_CHECK(cudnnDestroyTensorDescriptor(dhy_desc_));
+  if (dcx_desc_ != nullptr)
+    CUDNN_CHECK(cudnnDestroyTensorDescriptor(dcx_desc_));
+  if (dcy_desc_ != nullptr)
+    CUDNN_CHECK(cudnnDestroyTensorDescriptor(dcy_desc_));
+  DestroyIODescriptors();
+}
+
+void CudnnRNN::ToDevice(std::shared_ptr<Device> device) {
+  RNN::ToDevice(device);
+  workspace_.ToDevice(device);
+  reserve_space_.ToDevice(device);
+}
+
+void CudnnRNN::DestroyIODescriptors() {
+  if (x_descs_ != nullptr) {
+    for (size_t i = 0; i < seq_length_; i++) {
+      CUDNN_CHECK(cudnnDestroyTensorDescriptor(x_descs_[i]));
+      CUDNN_CHECK(cudnnDestroyTensorDescriptor(dx_descs_[i]));
+    }
+    delete [] x_descs_;
+    delete [] dx_descs_;
+  }
+  if (y_descs_ != nullptr) {
+    for (size_t i = 0; i < seq_length_; i++) {
+      CUDNN_CHECK(cudnnDestroyTensorDescriptor(y_descs_[i]));
+      CUDNN_CHECK(cudnnDestroyTensorDescriptor(dy_descs_[i]));
+    }
+    delete [] y_descs_;
+    delete [] dy_descs_;
+  }
+}
+
+void CudnnRNN::UpdateIODescriptors(size_t len, const vector<Tensor> &inputs) {
+  bool reset = false;
+  if (seq_length_ < len) {
+    DestroyIODescriptors();
+    x_descs_ = new cudnnTensorDescriptor_t[len];
+    dx_descs_ = new cudnnTensorDescriptor_t[len];
+    y_descs_ = new cudnnTensorDescriptor_t[len];
+    dy_descs_ = new cudnnTensorDescriptor_t[len];
+    for (size_t i = 0; i < len; i++) {
+      CUDNN_CHECK(cudnnCreateTensorDescriptor(&x_descs_[i]));
+      CUDNN_CHECK(cudnnCreateTensorDescriptor(&dx_descs_[i]));
+      CUDNN_CHECK(cudnnCreateTensorDescriptor(&y_descs_[i]));
+      CUDNN_CHECK(cudnnCreateTensorDescriptor(&dy_descs_[i]));
+    }
+    reset = true;
+  }
+
+  for (size_t i = 0; i < len; i++) {
+    CHECK_EQ(inputs[i].shape(1), input_size_);
+    if (inputs[i].shape(0) != batch_size_ || reset) {
+      int d[3] = {1, 1, 1}, s[3] = {1, 1, 1};
+      d[0] = static_cast<int>(inputs[i].shape(0));
+      CHECK_GT(d[0], 0);
+      d[1] = static_cast<int>(inputs[i].shape(1));
+      s[0] = d[1] * d[2];
+      s[1] = d[2];
+      CUDNN_CHECK(cudnnSetTensorNdDescriptor(x_descs_[i], dtype_, 3, d, s));
+      CUDNN_CHECK(cudnnSetTensorNdDescriptor(dx_descs_[i], dtype_, 3, d, s));
+
+      d[0] = static_cast<int>(inputs[i].shape(0));
+      d[1] = static_cast<int>(hidden_size_ * num_directions_);
+      s[0] = d[1] * d[2];
+      s[1] = d[2];
+      CUDNN_CHECK(cudnnSetTensorNdDescriptor(y_descs_[i], dtype_, 3, d, s));
+      CUDNN_CHECK(cudnnSetTensorNdDescriptor(dy_descs_[i], dtype_, 3, d, s));
+    }
+  }
+}
+
+// must be called after setting IO descriptors
+void CudnnRNN::SetRNNDescriptor(shared_ptr<Device> dev) {
+  auto ctx = dev->context(0);
+  CUDNN_CHECK(cudnnCreateDropoutDescriptor(&dropout_desc_));
+  size_t state_size;
+  CUDNN_CHECK(cudnnDropoutGetStatesSize(ctx->cudnn_handle, &state_size));
+  dropout_state_ = Tensor(Shape{state_size}, dev, kChar);
+  CUDNN_CHECK(cudnnSetDropoutDescriptor(
+      dropout_desc_, ctx->cudnn_handle, 1 - dropout_,  // keep probability
+      dropout_state_.block()->mutable_data(), state_size, seed_));
+
+  CUDNN_CHECK(cudnnCreateRNNDescriptor(&rnn_desc_));
+  cudnnRNNInputMode_t input_mode = CUDNN_LINEAR_INPUT;
+  if (input_mode_ == "skip")
+    input_mode = CUDNN_SKIP_INPUT;
+
+  cudnnDirectionMode_t direction = CUDNN_UNIDIRECTIONAL;
+  if (direction_ == "bidirectional")
+    direction = CUDNN_BIDIRECTIONAL;
+
+  cudnnRNNMode_t rnn_mode = CUDNN_LSTM;
+  if (rnn_mode_ == "relu")
+    rnn_mode = CUDNN_RNN_RELU;
+  else if (rnn_mode_ == "tanh")
+    rnn_mode = CUDNN_RNN_TANH;
+  else if (rnn_mode_ == "gru")
+    rnn_mode = CUDNN_GRU;
+  CUDNN_CHECK(cudnnSetRNNDescriptor(rnn_desc_, hidden_size_, num_stacks_,
+                                    dropout_desc_, input_mode, direction,
+                                    rnn_mode, dtype_));
+
+  size_t weight_size;
+  CUDNN_CHECK(cudnnGetRNNParamsSize(ctx->cudnn_handle, rnn_desc_, x_descs_[0],
+                                    &weight_size, dtype_));
+  // check the size manually calculated
+  CHECK_EQ(weight_size, weight_.Size() * sizeof(float));
+  int filter_dim[3] = {static_cast<int>(weight_size), 1, 1};
+  CUDNN_CHECK(cudnnCreateFilterDescriptor(&weight_desc_));
+  CUDNN_CHECK(cudnnSetFilterNdDescriptor(weight_desc_, dtype_,
+                                         CUDNN_TENSOR_NCHW, 3, filter_dim));
+}
+
+void CudnnRNN::ResetHiddenAndCellDescriptors(size_t batch_size) {
+  if (batch_size_ == 0) {
+    CUDNN_CHECK(cudnnCreateTensorDescriptor(&cx_desc_));
+    CUDNN_CHECK(cudnnCreateTensorDescriptor(&dcx_desc_));
+    CUDNN_CHECK(cudnnCreateTensorDescriptor(&cy_desc_));
+    CUDNN_CHECK(cudnnCreateTensorDescriptor(&dcy_desc_));
+    CUDNN_CHECK(cudnnCreateTensorDescriptor(&hx_desc_));
+    CUDNN_CHECK(cudnnCreateTensorDescriptor(&dhx_desc_));
+    CUDNN_CHECK(cudnnCreateTensorDescriptor(&hy_desc_));
+    CUDNN_CHECK(cudnnCreateTensorDescriptor(&dhy_desc_));
+  }
+
+  int dim[3] = {1, 1, 1};
+  dim[0] = static_cast<int>(num_stacks_ * num_directions_);
+  dim[1] = static_cast<int>(batch_size);
+  dim[2] = static_cast<int>(hidden_size_);
+  int stride[3] = {1, 1, 1};
+  stride[0] = dim[1] * dim[2];
+  stride[1] = dim[2];
+  CUDNN_CHECK(cudnnSetTensorNdDescriptor(hx_desc_, dtype_, 3, dim, stride));
+  CUDNN_CHECK(cudnnSetTensorNdDescriptor(dhx_desc_, dtype_, 3, dim, stride));
+  CUDNN_CHECK(cudnnSetTensorNdDescriptor(hy_desc_, dtype_, 3, dim, stride));
+  CUDNN_CHECK(cudnnSetTensorNdDescriptor(dhy_desc_, dtype_, 3, dim, stride));
+  CUDNN_CHECK(cudnnSetTensorNdDescriptor(cx_desc_, dtype_, 3, dim, stride));
+  CUDNN_CHECK(cudnnSetTensorNdDescriptor(dcx_desc_, dtype_, 3, dim, stride));
+  CUDNN_CHECK(cudnnSetTensorNdDescriptor(cy_desc_, dtype_, 3, dim, stride));
+  CUDNN_CHECK(cudnnSetTensorNdDescriptor(dcy_desc_, dtype_, 3, dim, stride));
+}
+
+void CudnnRNN::UpdateSpaces(size_t seq_length, shared_ptr<Device> dev) {
+  size_t count;
+  auto ctx = dev->context(0);
+  CUDNN_CHECK(cudnnGetRNNWorkspaceSize(ctx->cudnn_handle, rnn_desc_,
+                                       seq_length, x_descs_, &count));
+  if (workspace_.Size() != count) {
+    workspace_ = Tensor(Shape{count}, dev, kChar);
+    // workspace_.SetValue(0);
+  }
+
+  CUDNN_CHECK(cudnnGetRNNTrainingReserveSize(ctx->cudnn_handle, rnn_desc_,
+                                             seq_length, x_descs_, &count));
+  if (reserve_space_.Size() != count) {
+    reserve_space_ = Tensor(Shape{count}, dev, kChar);
+    // reserve_space_.SetValue(0);
+  }
+}
+
+void CudnnRNN::UpdateStates(size_t num_x, const vector<Tensor> &inputs) {
+  UpdateIODescriptors(num_x, inputs);
+  size_t new_batch_size = inputs.at(0).shape(0);
+  if (batch_size_ != new_batch_size)
+    ResetHiddenAndCellDescriptors(new_batch_size);
+  if (rnn_desc_ == nullptr)
+    SetRNNDescriptor(inputs.at(0).device());
+  UpdateSpaces(num_x, inputs.at(0).device());
+  batch_size_ = new_batch_size;
+  seq_length_ = num_x;
+}
+
+Tensor CudnnRNN::MergeInputs(size_t num, const vector<Tensor> &in) {
+  if (num == 1)
+    return in.at(0);
+  size_t size = 0;
+  for (size_t i = 0; i < num; i++) size += in.at(i).Size();
+  Tensor out(Shape{size}, in.at(0).device(), in.at(0).data_type());
+  for (size_t i = 0, offset = 0; i < num; i++) {
+    CopyDataToFrom(&out, in.at(i), in.at(i).Size(), offset);
+    offset += in.at(i).Size();
+  }
+  return out;
+}
+
+vector<Tensor> CudnnRNN::SplitOutput(size_t num, size_t dim,
+                                     const vector<Tensor> &in,
+                                     const Tensor output) {
+  vector<Tensor> outputs;
+  if (num == 1) {
+    outputs.push_back(Reshape(output, Shape{in.at(0).shape(0), dim}));
+  } else {
+    for (size_t i = 0, offset = 0; offset < output.Size(); i++) {
+      Shape s{in.at(i).shape(0), dim};
+      Tensor out(s, output.device(), output.data_type());
+      CopyDataToFrom(&out, output, out.Size(), 0, offset);
+      outputs.push_back(out);
+      offset += out.Size();
+    }
+    CHECK_EQ(num, outputs.size());
+  }
+  return outputs;
+}
+
+const vector<Tensor> CudnnRNN::Forward(int flag, const vector<Tensor> &inputs) {
+  DataType dtype = inputs.at(0).data_type();
+  auto dev = inputs.at(0).device();
+
+  // copy input data into a block of contiguous memory
+  // hx (and cx) is at the end of inputs
+  CHECK_GT(inputs.size(), 1u + has_cell_);
+  size_t num_x = inputs.size() - has_cell_ - 1;
+  Tensor input = MergeInputs(num_x, inputs);
+  // LOG(INFO) << "input size " << input.Size() << " value " << input.L1();
+
+  if (rnn_desc_ != nullptr)
+    CHECK_EQ(dtype_, GetCudnnDataType(dtype))
+      << "Cannot change cudnn data type during training from " << dtype_
+      << " to " << GetCudnnDataType(dtype);
+  else
+    dtype_ = GetCudnnDataType(dtype);
+
+  UpdateStates(num_x, inputs);
+  // CheckFowardShapes();
+
+  Shape outshape{input.Size() * hidden_size_ / input_size_ * num_directions_};
+  Tensor output(outshape, dev, dtype);
+  // LOG(INFO) << "output size " << output.Size();
+  Tensor hx = inputs.at(num_x);
+  Shape state_shape{num_stacks_ * num_directions_, batch_size_, hidden_size_};
+  Tensor hy(state_shape, dev, dtype);
+  Tensor cy, cx;
+  if (has_cell_) {
+    cx = inputs.at(num_x + 1);
+    cy.ResetLike(hy);
+  }
+
+  // LOG(INFO) << "hidden size " << hy.Size();
+  // LOG(INFO) << "weight size " << weight_.Size() << " value " << weight_.L1();
+  Block *inb = input.block(), *outb = output.block(),
+        *wb = this->weight_.block(), *hxb = hx.block(), *cxb = cx.block(),
+        *hyb = hy.block(), *cyb = cy.block(),
+        *wspace = this->workspace_.block(),
+        *rspace = this->reserve_space_.block();
+  if (flag & kTrain) {
+    dev->Exec(
+        [inb, outb, wb, hxb, cxb, hyb, cyb, wspace, rspace, this](Context *ctx) {
+        // clang-format off
+        cudnnRNNForwardTraining(
+            ctx->cudnn_handle,
+            this->rnn_desc_,
+            this->seq_length_,
+            this->x_descs_, inb->data(),
+            this->hx_desc_, hxb == nullptr ? nullptr : hxb->data(),
+            this->cx_desc_, cxb == nullptr ? nullptr : cxb->data(),
+            this->weight_desc_, wb->data(),
+            this->y_descs_, outb->mutable_data(),
+            this->hy_desc_, hyb->mutable_data(),
+            this->cy_desc_, cyb == nullptr ? nullptr : cyb->mutable_data(),
+            wspace->mutable_data(),
+            this->workspace_.Size(), rspace->mutable_data(),
+            this->reserve_space_.Size());
+        // clang-format on
+        },
+        {inb, wb, hxb, cxb}, {outb, hyb, cyb, wspace, rspace});
+    buf_.push(input);
+    buf_.push(output);
+    buf_.push(hx);
+    buf_.push(cx);
+  } else {
+    dev->Exec([inb, outb, wb, hxb, cxb, hyb, cyb, wspace, this](Context *ctx) {
+      // clang-format off
+      cudnnRNNForwardInference(
+          ctx->cudnn_handle,
+          this->rnn_desc_,
+          this->seq_length_,
+          this->x_descs_, inb->data(),
+          this->hx_desc_, hxb == nullptr ? nullptr : hxb->data(),
+          this->cx_desc_, cxb == nullptr ? nullptr : cxb->data(),
+          this->weight_desc_, wb->data(),
+          this->y_descs_, outb->mutable_data(),
+          this->hy_desc_, hyb->mutable_data(),
+          this->cy_desc_, cyb == nullptr ? nullptr : cyb->mutable_data(),
+          wspace->mutable_data(), this->workspace_.Size());
+      // clang-format on
+    }, {inb, wb, hxb, cxb}, {outb, hyb, cyb, wspace});
+  }
+  auto outputs =
+      SplitOutput(num_x, hidden_size_ * num_directions_, inputs, output);
+  outputs.push_back(hy);
+  if (has_cell_) outputs.push_back(cy);
+  return outputs;
+}
+
+// TODO(wangwei) check Tensor device to be on cuda?
+const std::pair<vector<Tensor>, vector<Tensor>> CudnnRNN::Backward(
+    int flag, const vector<Tensor> &grads) {
+  // dhy (and dcy) is at last
+  const Tensor cx = buf_.top();  // cannot use const Tensor& due to pop()
+  buf_.pop();
+  const Tensor hx = buf_.top();
+  buf_.pop();
+  const Tensor y = buf_.top();
+  buf_.pop();
+  const Tensor x = buf_.top();
+  buf_.pop();
+
+  auto dev = y.device();
+  auto dtype = y.data_type();
+
+  CHECK_GT(grads.size(), 1u + has_cell_);
+  size_t num_dy = grads.size() - has_cell_ - 1;
+  CHECK_EQ(num_dy, seq_length_);
+  const Tensor dy = MergeInputs(num_dy, grads);
+  CHECK_EQ(dy.Size(), y.Size());
+  const Tensor dhy = grads.at(num_dy);
+  Tensor dcy;
+  if (has_cell_)
+    dcy = grads.at(num_dy + 1);
+
+  Shape xshape{y.Size() * input_size_ / hidden_size_ / num_directions_};
+  Tensor dx(xshape, dev, dtype);
+  Tensor dw(weight_.shape(), dev, dtype);
+  Shape state_shape{num_stacks_ * num_directions_, batch_size_, hidden_size_};
+  Tensor dhx(state_shape, dev, dtype);
+  Tensor dcx;
+  if (has_cell_)
+    dcx.ResetLike(dhx);
+  dw.SetValue(0.0f);
+  Block *yb = y.block(), *dyb = dy.block(), *dhyb = dhy.block(),
+        *dcyb = dcy.block(), *xb = x.block(), *cxb = cx.block(),
+        *wb = weight_.block(), *dwb = dw.block(), *hxb = hx.block(),
+        *dxb = dx.block(), *dhxb = dhx.block(), *dcxb = dcx.block(),
+        *wspace = workspace_.block(), *rspace = reserve_space_.block();
+
+  y.device()->Exec(
+      [yb, dyb, dhyb, dcyb, xb, cxb, wb, dwb, hxb, dxb, dhxb, dcxb, wspace,
+       rspace, this](Context *ctx) {
+        // clang-format off
+        cudnnRNNBackwardData(
+            ctx->cudnn_handle,
+            this->rnn_desc_,
+            this->seq_length_,
+            this->y_descs_, yb->data(),
+            this->dy_descs_, dyb->data(),
+            this->dhy_desc_, dhyb == nullptr ? nullptr : dhyb->data(),
+            this->dcy_desc_, dcyb == nullptr ? nullptr : dcyb->data(),
+            this->weight_desc_, wb->data(),
+            this->hx_desc_, hxb == nullptr ? nullptr : hxb->data(),
+            this->cx_desc_, cxb == nullptr ? nullptr : cxb->data(),
+            this->dx_descs_, dxb->mutable_data(),
+            this->dhx_desc_, dhxb->mutable_data(),
+            this->dcx_desc_, dcxb == nullptr ? nullptr : dcxb->mutable_data(),
+            wspace->mutable_data(), this->workspace_.Size(),
+            rspace->mutable_data(), this->reserve_space_.Size());
+        cudnnRNNBackwardWeights(
+            ctx->cudnn_handle,
+            this->rnn_desc_,
+            this->seq_length_,
+            this->x_descs_, xb->data(),
+            this->hx_desc_, hxb == nullptr ? nullptr : hxb->data(),
+            this->y_descs_, yb->data(),
+            wspace->data(), this->workspace_.Size(),
+            this->dweight_desc_, dwb->mutable_data(),
+            rspace->data(), this->reserve_space_.Size());
+        // clang-format on
+      },
+      {yb, dyb, dhyb, dcyb, xb, wb, wspace, rspace},
+      {dxb, dwb, dhxb, dcxb, wspace, rspace});
+
+  vector <Tensor> param_grad{dw};
+  auto data_grads = SplitOutput(num_dy, input_size_, grads, dx);
+  data_grads.push_back(dhx);
+  if (has_cell_)
+    data_grads.push_back(dcx);
+  return std::make_pair(data_grads, param_grad);
+}
+
+}  // namespace singa
+#endif  // CUDNN_VERSION >= 5005
+#endif  // USE_CUDNN
diff --git a/src/model/layer/cudnn_rnn.h b/src/model/layer/cudnn_rnn.h
new file mode 100644
index 0000000..5e642e0
--- /dev/null
+++ b/src/model/layer/cudnn_rnn.h
@@ -0,0 +1,88 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef SRC_MODEL_LAYER_CUDNN_RNN_H_
+#define SRC_MODEL_LAYER_CUDNN_RNN_H_
+#include "singa/singa_config.h"
+#ifdef USE_CUDNN
+#include <cudnn.h>
+#if CUDNN_VERSION >= 5005
+#include <string>
+#include <utility>
+#include <vector>
+#include "./rnn.h"
+#include "singa/core/common.h"
+#include "singa/model/layer.h"
+#include "singa/proto/core.pb.h"
+#include "singa/utils/string.h"
+#include <cudnn.h>
+#include <chrono>
+#include "./cudnn_utils.h"
+#include "singa/utils/logging.h"
+
+namespace singa {
+class CudnnRNN : public RNN {
+ public:
+  ~CudnnRNN();
+  /// \copydoc Layer::layer_type()
+  // const std::string layer_type() const override { return "CudnnRNN"; }
+
+  const vector<Tensor> Forward(int flag, const vector<Tensor>& inputs) override;
+  const std::pair<vector<Tensor>, vector<Tensor>> Backward(
+      int flag, const vector<Tensor>& grads) override;
+
+  void ToDevice(std::shared_ptr<Device> device) override;
+
+  void SetRNNDescriptor(shared_ptr<Device> dev);
+  void ResetHiddenAndCellDescriptors(size_t batch_size);
+  void DestroyIODescriptors();
+  void UpdateIODescriptors(size_t num, const vector<Tensor>& inputs);
+  void UpdateSpaces(size_t num, shared_ptr<Device> dev);
+  void UpdateStates(size_t num, const vector<Tensor>& inputs);
+  Tensor MergeInputs(size_t num, const vector<Tensor>& in);
+  vector<Tensor> SplitOutput(size_t num, size_t dim, const vector<Tensor>& in,
+                             const Tensor output);
+
+ protected:
+  cudnnTensorDescriptor_t* x_descs_ = nullptr;
+  cudnnTensorDescriptor_t* dx_descs_ = nullptr;
+  cudnnTensorDescriptor_t* y_descs_ = nullptr;
+  cudnnTensorDescriptor_t* dy_descs_ = nullptr;
+  cudnnTensorDescriptor_t hx_desc_ = nullptr;
+  cudnnTensorDescriptor_t dhx_desc_ = nullptr;
+  cudnnTensorDescriptor_t cx_desc_ = nullptr;
+  cudnnTensorDescriptor_t dcx_desc_ = nullptr;
+  cudnnTensorDescriptor_t hy_desc_ = nullptr;
+  cudnnTensorDescriptor_t dhy_desc_ = nullptr;
+  cudnnTensorDescriptor_t cy_desc_ = nullptr;
+  cudnnTensorDescriptor_t dcy_desc_ = nullptr;
+  cudnnFilterDescriptor_t weight_desc_ = nullptr;
+  cudnnFilterDescriptor_t dweight_desc_ = nullptr;
+  cudnnRNNDescriptor_t rnn_desc_ = nullptr;
+  cudnnDropoutDescriptor_t dropout_desc_ = nullptr;
+  cudnnDataType_t dtype_ = CUDNN_DATA_FLOAT;
+  Tensor workspace_;
+  Tensor reserve_space_;
+  Tensor dropout_state_;
+};
+
+}  // namespace singa
+
+#endif  // CUDNN_VERSION >= 5005
+#endif  // USE_CUDNN
+#endif  // SRC_MODEL_LAYER_CUDNN_RNN_H_
diff --git a/src/model/layer/cudnn_softmax.cc b/src/model/layer/cudnn_softmax.cc
new file mode 100644
index 0000000..f1a4a5b
--- /dev/null
+++ b/src/model/layer/cudnn_softmax.cc
@@ -0,0 +1,102 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "singa/singa_config.h"
+#include "./cudnn_softmax.h"
+#ifdef USE_CUDNN
+#include <cudnn.h>
+#include "./cudnn_utils.h"
+#include "singa/utils/logging.h"
+namespace singa {
+
+RegisterLayerClass(cudnn_softmax, CudnnSoftmax);
+CudnnSoftmax::~CudnnSoftmax() {
+  if (desc_ != nullptr) CUDNN_CHECK(cudnnDestroyTensorDescriptor(desc_));
+}
+
+void CudnnSoftmax::Setup(const Shape& in_sample, const LayerConf &conf) {
+  Softmax::Setup(in_sample, conf);
+  SoftmaxConf sft_conf = conf.softmax_conf();
+  std::string algorithm = sft_conf.algorithm();
+  CHECK(algorithm == "accurate" || algorithm == "fast" || algorithm == "log")
+    << "CudnnSoftmax only supports three algorithm preferences: "
+    << "accurate, fast and log.";
+  if (algorithm == "accurate")
+    algorithm_ = CUDNN_SOFTMAX_ACCURATE;
+  else if (algorithm == "fast")
+    algorithm_ = CUDNN_SOFTMAX_FAST;
+  else algorithm_ = CUDNN_SOFTMAX_LOG;
+}
+
+void CudnnSoftmax::InitCudnn(Shape shape, DataType dtype) {
+  CHECK(!has_init_cudnn_);
+  CUDNN_CHECK(cudnnCreateTensorDescriptor(&desc_));
+
+  CHECK_LE(shape.size(), 2u)
+    << "Tensor shape should range from 1 to 2D;"
+    << "otherwise, add flatten layer to transform";
+  if (shape.size() == 1u)
+    CUDNN_CHECK(cudnnSetTensor4dDescriptor( desc_,
+      CUDNN_TENSOR_NCHW, GetCudnnDataType(dtype), 1, shape[0], 1, 1));
+  else
+    CUDNN_CHECK(cudnnSetTensor4dDescriptor( desc_, CUDNN_TENSOR_NCHW,
+      GetCudnnDataType(dtype), shape[0], shape[1], 1, 1));
+  has_init_cudnn_ = true;
+}
+
+const Tensor CudnnSoftmax::Forward(int flag, const Tensor& input) {
+  CHECK(buf_.empty());
+  auto shape = input.shape();
+  DataType dtype = input.data_type();
+  if (!has_init_cudnn_) {
+    InitCudnn(shape, dtype);
+  }
+  Tensor output;
+  output.ResetLike(input);
+  output.device()->Exec([input, output, this](Context* ctx) {
+    Block* inblock = input.block(), * outblock = output.block();
+    float alpha = 1.0f, beta = 0.0f;
+    cudnnSoftmaxForward(ctx->cudnn_handle, this->algorithm_,
+                        CUDNN_SOFTMAX_MODE_INSTANCE,
+                        &alpha, this->desc_, inblock->data(), &beta,
+                        this->desc_, outblock->mutable_data());
+  }, {input.block()}, {output.block()});
+  if (flag & kTrain) buf_.push(output);
+  return output;
+}
+
+const std::pair<Tensor, vector<Tensor>> CudnnSoftmax::Backward(
+    int flag, const Tensor& grad) {
+  vector<Tensor> param_grad;
+  CHECK(!buf_.empty());
+  Tensor dx, output = buf_.top();
+  buf_.pop();
+  dx.ResetLike(grad);
+  dx.device()->Exec([dx, grad, output, this](Context* ctx) {
+    Block* dyblock = grad.block(), * dxblock = dx.block(),
+           * yblock = output.block();
+    float alpha = 1.0f, beta = 0.0f;
+    cudnnSoftmaxBackward(ctx->cudnn_handle, this->algorithm_,
+                         CUDNN_SOFTMAX_MODE_INSTANCE,
+                         &alpha, this->desc_, yblock->data(), this->desc_,
+                         dyblock->data(), &beta, this->desc_,
+                         dxblock->mutable_data());
+  }, {grad.block(), output.block()}, {dx.block()});
+  return std::make_pair(dx, param_grad);
+}
+}  // namespace singa
+#endif  // USE_CUDNN
diff --git a/src/model/layer/cudnn_softmax.h b/src/model/layer/cudnn_softmax.h
new file mode 100644
index 0000000..532a643
--- /dev/null
+++ b/src/model/layer/cudnn_softmax.h
@@ -0,0 +1,59 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef SINGA_MODEL_LAYER_CUDNN_SOFTMAX_H_
+#define SINGA_MODEL_LAYER_CUDNN_SOFTMAX_H_
+#ifdef USE_CUDNN
+#include <cudnn.h>
+#include <utility>
+#include <string>
+#include <vector>
+
+#include "./softmax.h"
+#include "singa/core/common.h"
+#include "singa/model/layer.h"
+#include "singa/proto/core.pb.h"
+
+namespace singa {
+class CudnnSoftmax : public Softmax {
+ public:
+  ~CudnnSoftmax();
+  /// \copydoc Layer::layer_type()
+  // const std::string layer_type() const override { return "CudnnSoftmax"; }
+
+  /// \copydoc Layer::Setup(const LayerConf&);
+  void Setup(const Shape& in_sample_shape, const LayerConf &conf) override;
+
+  const Tensor Forward(int flag, const Tensor& input) override;
+  const std::pair<Tensor, vector<Tensor>> Backward(int flag,
+                                                   const Tensor& grad) override;
+
+  const cudnnSoftmaxAlgorithm_t Algorithm() const { return algorithm_; }
+
+ private:
+  /// Init cudnn related data structures.
+  void InitCudnn(Shape shape, DataType dtype);
+
+ private:
+  bool has_init_cudnn_ = false;
+  cudnnTensorDescriptor_t desc_ = nullptr;
+  cudnnSoftmaxAlgorithm_t algorithm_;
+};
+}  // namespace
+#endif  // USE_CUDNN
+#endif  // SINGA_MODEL_LAYER_CUDNN_SOFTMAX_H_
diff --git a/src/model/layer/cudnn_utils.h b/src/model/layer/cudnn_utils.h
new file mode 100644
index 0000000..64ee758
--- /dev/null
+++ b/src/model/layer/cudnn_utils.h
@@ -0,0 +1,86 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef SRC_MODEL_LAYER_CUDNN_UTILS_H_
+#define SRC_MODEL_LAYER_CUDNN_UTILS_H_
+
+#include "singa/singa_config.h"
+#ifdef USE_CUDNN
+
+#include <cudnn.h>
+#include "singa/proto/core.pb.h"
+#include "singa/utils/logging.h"
+namespace singa {
+inline cudnnDataType_t GetCudnnDataType(DataType dtype) {
+  cudnnDataType_t ret = CUDNN_DATA_FLOAT;
+  switch (dtype) {
+    case kFloat32:
+      ret = CUDNN_DATA_FLOAT;
+      break;
+    case kDouble:
+      ret = CUDNN_DATA_DOUBLE;
+      break;
+    case kFloat16:
+      ret = CUDNN_DATA_HALF;
+      break;
+    default:
+      LOG(FATAL) << "The data type " << DataType_Name(dtype)
+                 << " is not support by cudnn";
+  }
+  return ret;
+}
+
+#define CUDNN_CHECK(condition)                                             \
+  do {                                                                     \
+    cudnnStatus_t status = condition;                                      \
+    CHECK_EQ(status, CUDNN_STATUS_SUCCESS) << " "                          \
+                                           << cudnnGetErrorString(status); \
+  } while (0)
+
+/*
+inline const char* cudnnGetErrorString(cudnnStatus_t status) {
+  switch (status) {
+    case CUDNN_STATUS_SUCCESS:
+      return "CUDNN_STATUS_SUCCESS";
+    case CUDNN_STATUS_NOT_INITIALIZED:
+      return "CUDNN_STATUS_NOT_INITIALIZED";
+    case CUDNN_STATUS_ALLOC_FAILED:
+      return "CUDNN_STATUS_ALLOC_FAILED";
+    case CUDNN_STATUS_BAD_PARAM:
+      return "CUDNN_STATUS_BAD_PARAM";
+    case CUDNN_STATUS_INTERNAL_ERROR:
+      return "CUDNN_STATUS_INTERNAL_ERROR";
+    case CUDNN_STATUS_INVALID_VALUE:
+      return "CUDNN_STATUS_INVALID_VALUE";
+    case CUDNN_STATUS_ARCH_MISMATCH:
+      return "CUDNN_STATUS_ARCH_MISMATCH";
+    case CUDNN_STATUS_MAPPING_ERROR:
+      return "CUDNN_STATUS_MAPPING_ERROR";
+    case CUDNN_STATUS_EXECUTION_FAILED:
+      return "CUDNN_STATUS_EXECUTION_FAILED";
+    case CUDNN_STATUS_NOT_SUPPORTED:
+      return "CUDNN_STATUS_NOT_SUPPORTED";
+    case CUDNN_STATUS_LICENSE_ERROR:
+      return "CUDNN_STATUS_LICENSE_ERROR";
+  }
+  return "Unknown cudnn status";
+}
+*/
+
+}  // namespace singa
+#endif  // USE_CUDNN
+#endif  // SRC_MODEL_LAYER_CUDNN_UTILS_H_
diff --git a/src/model/layer/dense.cc b/src/model/layer/dense.cc
new file mode 100644
index 0000000..64e3d86
--- /dev/null
+++ b/src/model/layer/dense.cc
@@ -0,0 +1,94 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "./dense.h"
+#include "singa/model/layer.h"
+#include <vector>
+
+namespace singa {
+using std::vector;
+
+RegisterLayerClass(singa_dense, Dense);
+RegisterLayerClass(singacpp_dense, Dense);
+RegisterLayerClass(singacuda_dense, Dense);
+RegisterLayerClass(singacl_dense, Dense);
+Dense::~Dense() {
+  // delete weight_;
+  // delete bias_;
+}
+void Dense::Setup(const Shape& in_sample, const LayerConf &conf) {
+  Layer::Setup(in_sample, conf);
+  auto dense_conf = conf.dense_conf();
+  CHECK_EQ(in_sample.size(), 1u);
+  vdim_ = in_sample.at(0);
+  hdim_ = dense_conf.num_output();
+  transpose_ = dense_conf.transpose();
+  if (transpose_)  // was {vdim_, hdim} by zhaojing?
+    weight_.Reshape(Shape{hdim_, vdim_});
+  else
+    weight_.Reshape(Shape{vdim_, hdim_});
+  bias_.Reshape(Shape{hdim_});
+  for (auto specs: conf.param())
+    param_specs_.push_back(specs);
+}
+
+/// \copydoc Layer::Forward(int flag, const Tensor&)
+const Tensor Dense::Forward(int flag, const Tensor &input) {
+  CHECK(buf_.empty());
+  Tensor output;
+  CHECK_EQ(input.nDim(), 2u);
+  if (transpose_)  // use the transposed version of weight_ for computing
+    output = Mult(input, weight_.T());
+  else
+    output = Mult(input, weight_);
+  AddRow(bias_, &output);
+  if (flag & kTrain)
+    buf_.push(input);
+  return output;
+}
+
+/// \copydoc Layer::Backward(int, const Tensor&, const Tensor&);
+const std::pair<Tensor, vector<Tensor>> Dense::Backward(int flag,
+                                                        const Tensor &grad) {
+  vector<Tensor> param_grad;
+  CHECK(!buf_.empty());
+  Tensor src_data = buf_.top();
+  buf_.pop();
+  Tensor db, dw, dx;
+  db.ResetLike(bias_);
+  dw.ResetLike(weight_);
+  dx.ResetLike(src_data);
+  SumRows(grad, &db);
+  if (transpose_) {
+    dx = Mult(grad, weight_);
+    dw = Mult(grad.T(), src_data);
+  } else {
+    dx = Mult(grad, weight_.T());
+    dw = Mult(src_data.T(), grad);
+  }
+  param_grad.push_back(dw);
+  param_grad.push_back(db);
+  return std::make_pair(dx, param_grad);
+}
+
+void Dense::ToDevice(std::shared_ptr<Device> device) {
+  Layer::ToDevice(device);
+  weight_.ToDevice(device);
+  bias_.ToDevice(device);
+}
+} // namespace singa
diff --git a/src/model/layer/dense.h b/src/model/layer/dense.h
new file mode 100644
index 0000000..8a149a5
--- /dev/null
+++ b/src/model/layer/dense.h
@@ -0,0 +1,76 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef SRC_MODEL_LAYER_DENSE_H_
+#define SRC_MODEL_LAYER_DENSE_H_
+#include <string>
+#include <utility>
+#include <vector>
+#include <stack>
+#include "singa/model/layer.h"
+
+namespace singa {
+class Dense : public Layer {
+ public:
+  ~Dense();
+  /// \copydoc Layer::layer_type()
+  // const std::string layer_type() const override { return "Dense"; }
+
+  /// \copydoc Layer::Setup(const LayerConf&);
+  void Setup(const Shape& in_sample, const LayerConf& conf) override;
+  const Shape GetOutputSampleShape() const override {
+    CHECK(hdim_) << "You may haven't call Setup()";
+    return vector<size_t>{hdim_};
+  }
+
+  /// \copydoc Layer::Forward(int flag, const Tensor&)
+  const Tensor Forward(int flag, const Tensor& input) override;
+
+  /// \copydoc Layer::Backward(int, const Tensor&, const Tensor&);
+  const std::pair<Tensor, vector<Tensor>> Backward(int flag,
+                                                   const Tensor& grad) override;
+
+  void ToDevice(std::shared_ptr<Device> device) override;
+  const std::vector<Tensor> param_values() override {
+    return std::vector<Tensor>{weight_, bias_};
+  }
+  size_t num_output() const { return hdim_; }
+  size_t num_input() const { return vdim_; }
+  bool transpose() const { return transpose_; }
+  const Tensor& weight() const { return weight_; }
+  const Tensor& bias() const { return bias_; }
+
+  void set_weight(Tensor w) {
+    weight_.ResetLike(w);
+    weight_.CopyData(w);
+  }
+  void set_bias(Tensor b) {
+    bias_.ResetLike(b);
+    bias_.CopyData(b);
+  }
+
+ protected:
+  /// Used in auto-encoder, where the decoder would share its weight matrix from
+  /// the encoder's transposed weight matrix.
+  bool transpose_ = false;
+  size_t vdim_, hdim_;
+  Tensor weight_, bias_;
+  // Tensor data_, grad_;
+  std::stack<Tensor> buf_;
+};
+}  // namespace singa
+#endif  // SRC_MODEL_LAYER_DENSE_H_
diff --git a/src/model/layer/dropout.cc b/src/model/layer/dropout.cc
new file mode 100644
index 0000000..d7397a1
--- /dev/null
+++ b/src/model/layer/dropout.cc
@@ -0,0 +1,65 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "singa/model/layer.h"
+#include "./dropout.h"
+namespace singa {
+
+RegisterLayerClass(singa_dropout, Dropout);
+RegisterLayerClass(singacpp_dropout, Dropout);
+RegisterLayerClass(singacuda_dropout, Dropout);
+RegisterLayerClass(singacl_dropout, Dropout);
+void Dropout::Setup(const Shape& in_sample, const LayerConf& conf) {
+  Layer::Setup(in_sample, conf);
+  dropout_ratio_ = conf.dropout_conf().dropout_ratio();
+  out_sample_shape_= in_sample;
+}
+
+const Tensor Dropout::Forward(int flag, const Tensor& input) {
+  Tensor out;
+  if (flag & kTrain) {
+    mask_.ResetLike(input);
+    // set mask_[i] = 1 with prob 1-dropout_rato_
+    Bernoulli(1.0f - dropout_ratio_, &mask_);
+    mask_ *= 1.0f / (1.0f - dropout_ratio_);
+    out = input * mask_;
+  } else {
+    out = input;
+  }
+  return out;
+}
+
+const std::pair<Tensor, vector<Tensor>> Dropout::Backward(int flag,
+                                                          const Tensor& grad) {
+  vector<Tensor> param_grad;
+  Tensor input_grad;
+  if (flag & kTrain) {
+    // note mask is already scaled by 1/(1-dropout_ratio_)
+    input_grad = grad * mask_;
+  } else {
+    LOG(ERROR) << "Do not call backward for evaluation phase";
+  }
+  return std::make_pair(input_grad, param_grad);
+}
+
+void Dropout::ToDevice(std::shared_ptr<Device> device) {
+  Layer::ToDevice(device);
+  mask_.ToDevice(device);
+}
+
+}  // namespace singa
diff --git a/src/model/layer/dropout.h b/src/model/layer/dropout.h
new file mode 100644
index 0000000..711c86b
--- /dev/null
+++ b/src/model/layer/dropout.h
@@ -0,0 +1,67 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef SRC_MODEL_LAYER_DROPOUT_H_
+#define SRC_MODEL_LAYER_DROPOUT_H_
+#include <utility>
+#include <string>
+#include <vector>
+#include "singa/model/layer.h"
+
+namespace singa {
+class Dropout : public Layer {
+ public:
+  /// \copydoc Layer::layer_type()
+  // const std::string layer_type() const override { return "Dropout"; }
+
+  /// \copydoc Layer::Setup(const LayerConf&);
+  void Setup(const Shape& in_sample, const LayerConf& conf) override;
+  const Shape GetOutputSampleShape() const override {
+    CHECK(out_sample_shape_.size()) << "You may haven't call Setup()";
+    return out_sample_shape_;
+  }
+
+  /// \copydoc Layer::Forward(int flag, const Tensor&)
+  /// if flag is kTrain, then do dropout with given dropout_ratio;
+  /// otherwise if it is kEval, copy input directly to the output
+  /// TODO(wangwei) There are diff implementations, Caffe vs
+  /// <a
+  /// href="https://github.com/nitishsrivastava/deepnet/blob/master/deepnet/fastdropoutnet.py">
+  const Tensor Forward(int flag, const Tensor& input) override;
+
+  /// \copydoc Layer::Backward(int, const Tensor&, const Tensor&);
+  const std::pair<Tensor, vector<Tensor>> Backward(int flag,
+                                                   const Tensor& grad) override;
+
+  void ToDevice(std::shared_ptr<Device> device) override;
+
+  float dropout_ratio() const {
+    return dropout_ratio_;
+  }
+
+  const Tensor& mask() const {
+    return mask_;
+  }
+
+ protected:
+  /// the proability to set each element to 0.
+  float dropout_ratio_;
+  Tensor mask_;
+  vector<size_t> out_sample_shape_;
+};
+}  // namespace singa
+#endif  // SRC_MODEL_LAYER_DROPOUT_H_
diff --git a/src/model/layer/flatten.cc b/src/model/layer/flatten.cc
new file mode 100644
index 0000000..561c310
--- /dev/null
+++ b/src/model/layer/flatten.cc
@@ -0,0 +1,57 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "singa/model/layer.h"
+#include "./flatten.h"
+namespace singa {
+
+RegisterLayerClass(singa_flatten, Flatten);
+RegisterLayerClass(singacpp_flatten, Flatten);
+RegisterLayerClass(singacuda_flatten, Flatten);
+RegisterLayerClass(singacl_flatten, Flatten);
+void Flatten::Setup(const Shape& in_sample, const LayerConf &conf) {
+  Layer::Setup(in_sample, conf);
+  axis_ = conf.flatten_conf().axis();
+  size_t len = 1;
+  if (axis_ > 0)
+    for (size_t i = axis_ - 1; i < in_sample.size(); i++)
+      len *= in_sample.at(i);
+  out_sample_shape_.push_back(len);
+}
+
+const Tensor Flatten::Forward(int flag, const Tensor &input) {
+  Tensor output;
+  input_shape_ = input.shape();
+  if (axis_ == 0)
+    output = Reshape(input, vector<size_t>{input.Size()});
+  else
+    output =
+        Reshape(input, vector<size_t>{input.Size() / out_sample_shape_.at(0),
+                                      out_sample_shape_.at(0)});
+  return output;
+}
+
+const std::pair<Tensor, vector<Tensor> > Flatten::Backward(int flag,
+                                                           const Tensor &grad) {
+  vector<Tensor> param_grad;
+  Tensor input_grad = grad;
+  input_grad.Reshape(input_shape_);
+  return std::make_pair(input_grad, param_grad);
+}
+
+} // namespace singa
diff --git a/src/model/layer/flatten.h b/src/model/layer/flatten.h
new file mode 100644
index 0000000..8bbf481
--- /dev/null
+++ b/src/model/layer/flatten.h
@@ -0,0 +1,56 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef SRC_MODEL_LAYER_FLATTEN_H_
+#define SRC_MODEL_LAYER_FLATTEN_H_
+#include <utility>
+#include <string>
+#include <vector>
+#include "singa/model/layer.h"
+
+namespace singa {
+class Flatten : public Layer {
+ public:
+  /// \copydoc Layer::layer_type();
+  // const std::string layer_type() const override { return "Flatten"; }
+
+  /// \copydoc Layer::Setup(const LayerConf&);
+  void Setup(const Shape& in_sample, const LayerConf& conf) override;
+  const Shape GetOutputSampleShape() const override {
+    CHECK(out_sample_shape_.size()) << "You may haven't call Setup()";
+    return out_sample_shape_;
+  }
+
+  /// \copydoc Layer::Forward(int flag, const Tensor&);
+  const Tensor Forward(int flag, const Tensor &input) override;
+
+  /// \copydoc Layer::Backward(int, const Tensor&, const Tensor&);
+  const std::pair<Tensor, vector<Tensor> > Backward(
+      int flag, const Tensor &grad) override;
+
+  const int Axis() const { return axis_; }
+  const Shape input_shape() const { return input_shape_; }
+
+ protected:
+  /// flatten layer reshape the input to 2D, one from 0 to axis_-1, one from
+  /// axis_ to end.
+  /// if axis_ is 0, reshape the input to 1D.
+  int axis_;
+  Shape input_shape_, out_sample_shape_;
+};
+}      // namespace singa
+#endif // SRC_MODEL_LAYER_FLATTEN_H_
diff --git a/src/model/layer/lrn.cc b/src/model/layer/lrn.cc
new file mode 100644
index 0000000..4fdb5c9
--- /dev/null
+++ b/src/model/layer/lrn.cc
@@ -0,0 +1,151 @@
+/*********************************************************
+*
+* Licensed to the Apache Software Foundation (ASF) under one
+* or more contributor license agreements.  See the NOTICE file
+* distributed with this work for additional information
+* regarding copyright ownership.  The ASF licenses this file
+* to you under the Apache License, Version 2.0 (the
+* "License"); you may not use this file except in compliance
+* with the License.  You may obtain a copy of the License at
+*
+*   http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing,
+* software distributed under the License is distributed on an
+* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+* KIND, either express or implied.  See the License for the
+* specific language governing permissions and limitations
+* under the License.
+*
+************************************************************/
+#include "lrn.h"
+#include <vector>
+
+namespace singa {
+RegisterLayerClass(singa_lrn, LRN);
+RegisterLayerClass(singacpp_lrn, LRN);
+RegisterLayerClass(singacuda_lrn, LRN);
+RegisterLayerClass(singacl_lrn, LRN);
+/// Read the LRN hyper-parameters from conf.lrn_conf(): local_size (number of
+/// channels in the normalisation window), k, alpha and beta.  LRN only
+/// rescales activations, so the output sample shape equals the input's.
+void LRN::Setup(const Shape& in_sample, const LayerConf& conf) {
+  Layer::Setup(in_sample, conf);
+  out_sample_shape_ = in_sample;
+  local_size_ = conf.lrn_conf().local_size();
+  // The window must be odd so it can be centred on the current channel.
+  // NOTE(review): "Localvol" in the message presumably means local_size.
+  CHECK_EQ(local_size_ % 2, 1) << "LRN only supports odd values for Localvol";
+  k_ = conf.lrn_conf().k();
+  alpha_ = conf.lrn_conf().alpha();
+  beta_ = conf.lrn_conf().beta();
+}
+
+/// Cross-channel LRN on a 4D (N, C, H, W) input:
+///   b_i = a_i / (k + alpha * sum_{j in window(i)} a_j^2)^beta
+/// where window(i) spans local_size_ channels centred on channel i, clipped
+/// at the channel boundaries.
+const Tensor LRN::Forward(int flag, const Tensor& input) {
+  Tensor x = input.Clone();
+  // Flatten to (batch, C*H*W) so CopyRows can slice out single images.
+  x.Reshape(Shape{input.shape(0), input.Size() / input.shape(0)});
+  vector<Tensor> channels, images;
+  // for each image
+  for (size_t i = 0; i < input.shape(0); ++i) {
+    Tensor image = CopyRows(x, i, i + 1);
+    // View the image as (C, H*W): one row per channel.
+    image.Reshape(Shape{input.shape(1), input.shape(2) * input.shape(3)});
+    // for each channel of the image
+    channels.clear();
+    for (size_t c = 0; c < input.shape(1); ++c) {
+      // Rows [c - local_size_/2, c + local_size_/2], clipped to [0, C).
+      Tensor window =
+          CopyRows(image, std::max(0, static_cast<int>(c) - local_size_ / 2),
+                   std::min(input.shape(1), c + local_size_ / 2 + 1));
+      window = Square(window);
+
+      // tmp = (k + alpha * sum of squares over the window)^beta, per pixel.
+      Tensor tmp, ch;
+      tmp.Reshape(Shape{input.shape(2) * input.shape(3)});
+      SumRows(window, &tmp);
+
+      tmp *= alpha_;
+      tmp += k_;
+      tmp = Pow(tmp, beta_);
+
+      // Normalise the current channel.
+      ch = CopyRows(image, c, c + 1);
+      ch = ch / tmp;
+      ch.Reshape(Shape{input.shape(2), input.shape(3)});
+      channels.push_back(ch);
+    }
+    Tensor normalized_image = ConcatenateRows(channels);
+    normalized_image.Reshape(
+        Shape{input.shape(1), input.shape(2) * input.shape(3)});
+    images.push_back(normalized_image);
+  }
+  Tensor output = ConcatenateRows(images);
+  output.Reshape(input.shape());
+  // Save the input for Backward.  NOTE(review): pushed unconditionally, but
+  // Backward only pops when kTrain is set, so evaluation-phase calls grow
+  // buf_ without bound -- confirm whether this should be gated on kTrain
+  // (cf. Pooling::Forward, which pushes only under kTrain).
+  buf_.push(input);
+
+  return output;
+}
+
+/// Gradient of LRN w.r.t. its input.  With n_i = k + alpha*sum_w(a_j^2)
+/// (the per-channel window sum, recomputed below), the chain rule gives
+///   dL/da_i = dy_i * n_i^-beta
+///             - 2*alpha*beta * a_i * sum_w(dy_j * a_j * n_j^(-beta-1)).
+/// LRN has no learnable parameters, so the second result is empty.
+const std::pair<Tensor, vector<Tensor>> LRN::Backward(int flag,
+                                                      const Tensor& grad) {
+  Tensor dx;
+  if ((flag & kTrain) == kTrain) {
+    Tensor dy = grad.Clone();
+    dy.Reshape(Shape{grad.shape(0), grad.Size() / grad.shape(0)});
+    // Pop the input tensor saved by Forward.
+    Tensor x = buf_.top();
+    buf_.pop();
+    x.Reshape(dy.shape());
+    vector<Tensor> channels, images;
+    // Pass 1: recompute the normaliser n = k + alpha*sum(a^2) per channel.
+    // for each image
+    for (size_t i = 0; i < grad.shape(0); ++i) {
+      Tensor image = CopyRows(x, i, i + 1);
+      image.Reshape(Shape{grad.shape(1), grad.shape(2) * grad.shape(3)});
+      // for each channel of the image
+      channels.clear();
+      for (size_t c = 0; c < grad.shape(1); ++c) {
+        Tensor window =
+            CopyRows(image, std::max(0, static_cast<int>(c) - local_size_ / 2),
+                     std::min(grad.shape(1), c + local_size_ / 2 + 1));
+        Tensor tmp;
+        tmp.Reshape(Shape{grad.shape(2) * grad.shape(3)});
+        window = Square(window);
+        SumRows(window, &tmp);
+        tmp *= alpha_;
+        tmp += k_;
+        tmp.Reshape(Shape{grad.shape(2), grad.shape(3)});
+        channels.push_back(tmp);
+      }
+      Tensor norm_image = ConcatenateRows(channels);
+      norm_image.Reshape(Shape{grad.shape(1), grad.shape(2) * grad.shape(3)});
+      images.push_back(norm_image);
+    }
+    Tensor norm = ConcatenateRows(images);
+    norm.Reshape(dy.shape());
+    // First term: dy * n^-beta.
+    dx = Pow(norm, -beta_);
+    dx = dx * dy;
+    // tmp = dy * a * n^(-beta-1), the summand of the second term.
+    Tensor tmp = dx * x;
+    tmp = tmp / norm;
+    // Pass 2: sum tmp over each channel's window.
+    images.clear();
+    for (size_t i = 0; i < grad.shape(0); ++i) {
+      Tensor image = CopyRows(tmp, i, i + 1);
+      image.Reshape(Shape{grad.shape(1), grad.shape(2) * grad.shape(3)});
+      // for each channel of the image
+      channels.clear();
+      for (size_t c = 0; c < grad.shape(1); ++c) {
+        Tensor window =
+            CopyRows(image, std::max(0, static_cast<int>(c) - local_size_ / 2),
+                     std::min(grad.shape(1), c + local_size_ / 2 + 1));
+        Tensor tmpr;
+        tmpr.Reshape(Shape{grad.shape(2) * grad.shape(3)});
+        SumRows(window, &tmpr);
+        tmpr.Reshape(Shape{grad.shape(2), grad.shape(3)});
+        channels.push_back(tmpr);
+      }
+      Tensor pooled_image = ConcatenateRows(channels);
+      pooled_image.Reshape(Shape{grad.shape(1), grad.shape(2) * grad.shape(3)});
+      images.push_back(pooled_image);
+    }
+    // Second term: -2*alpha*beta * a * (window sums); add to the first.
+    Tensor tmp2 = ConcatenateRows(images);
+    tmp2 *= (-2.0f * beta_ * alpha_);
+    tmp2 = tmp2 * x;
+    dx = dx + tmp2;
+    dx.Reshape(grad.shape());
+  } else {
+    // NOTE(review): dx is returned default-constructed in this branch.
+    LOG(ERROR) << "Do not call backward for evaluation phase";
+  }
+  vector<Tensor> param_grad;
+  return std::make_pair(dx, param_grad);
+}
+
+}  // namespace
diff --git a/src/model/layer/lrn.h b/src/model/layer/lrn.h
new file mode 100644
index 0000000..57e26ba
--- /dev/null
+++ b/src/model/layer/lrn.h
@@ -0,0 +1,73 @@
+/*********************************************************
+*
+* Licensed to the Apache Software Foundation (ASF) under one
+* or more contributor license agreements.  See the NOTICE file
+* distributed with this work for additional information
+* regarding copyright ownership.  The ASF licenses this file
+* to you under the Apache License, Version 2.0 (the
+* "License"); you may not use this file except in compliance
+* with the License.  You may obtain a copy of the License at
+*
+*   http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing,
+* software distributed under the License is distributed on an
+* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+* KIND, either express or implied.  See the License for the
+* specific language governing permissions and limitations
+* under the License.
+*
+************************************************************/
+#ifndef SINGA_MODEL_LAYER_LRN_H_
+#define SINGA_MODEL_LAYER_LRN_H_
+#include "singa/model/layer.h"
+#include <stack>
+
+namespace singa {
+/// Local Response Normalization layer (cross-channel); see Forward() doc.
+class LRN : public Layer {
+ public:
+  /// \copydoc Layer::layer_type()
+  // const std::string layer_type() const override { return "LRN"; }
+
+  /// \copydoc Layer::Setup(const LayerConf&);
+  void Setup(const Shape& in_sample, const LayerConf& conf) override;
+  /// Output sample shape (same as input); CHECK-fails before Setup() runs.
+  const Shape GetOutputSampleShape() const override {
+    CHECK(out_sample_shape_.size()) << "You may haven't call Setup()";
+    return out_sample_shape_;
+  }
+
+  /**
+   * Local Response Normalization edge
+   *
+   * @f$ b_i=a_i/x_i^beta @f$
+   * @f$x_i=k+alpha*\sum_{j=max(0,i-n/2)}^{min(N,i+n/2)}(a_j)^2 @f$
+   * n is size of local response area.
+   * @f$a_i@f$, the activation (after ReLU) of a neuron convolved with the i-th kernel.
+   * @f$b_i@f$, the neuron after normalization, N is the total num of kernels
+   */
+  const Tensor Forward(int flag, const Tensor& input)
+    override;
+
+  /// \copydoc Layer::Backward(int, const Tensor&, const Tensor&);
+  const std::pair<Tensor, vector<Tensor>> Backward(
+      int flag, const Tensor& grad) override;
+
+  // Hyper-parameter accessors (values read from LayerConf in Setup()).
+  int local_size() const { return local_size_; }
+  float alpha() const { return alpha_; }
+  float beta() const { return beta_; }
+  float k() const { return k_; }
+
+ protected:
+  //!< hyper-parameter: size local response (neighbor) area
+  int local_size_;
+  //!< other hyper-parameters
+  float alpha_, beta_, k_;
+  // store intermediate data, i.e., input tensor
+  // (pushed by Forward, popped by Backward during training)
+  std::stack<Tensor> buf_;
+  Shape out_sample_shape_;
+
+}; // class LRN
+} // namespace
+
+#endif  // SINGA_MODEL_LAYER_LRN_H_
+
diff --git a/src/model/layer/merge.cc b/src/model/layer/merge.cc
new file mode 100644
index 0000000..a517024
--- /dev/null
+++ b/src/model/layer/merge.cc
@@ -0,0 +1,63 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "singa/model/layer.h"
+#include "./merge.h"
+namespace singa {
+
+RegisterLayerClass(singa_merge, Merge);
+RegisterLayerClass(singacpp_merge, Merge);
+RegisterLayerClass(singacuda_merge, Merge);
+RegisterLayerClass(singacl_merge, Merge);
+
+/// All inputs to Merge must share a single sample shape, which is also the
+/// output sample shape; no parameters or configuration fields are read.
+void Merge::Setup(const Shape& in_sample, const LayerConf& conf) {
+  Layer::Setup(in_sample, conf);
+  out_sample_shape_ = in_sample;
+}
+
+/// Element-wise sum of all input tensors; a single input passes through
+/// unchanged.  Records inputs.size() so Backward can fan the gradient back
+/// out to each input.  All inputs must have identical shapes (CHECKed).
+const vector<Tensor> Merge::Forward(int flag, const vector<Tensor>& inputs) {
+  vector<Tensor> outputs;
+  input_size_ = inputs.size();
+  if (inputs.size() == 1u) {
+    outputs = inputs;
+  } else {
+    // Accumulate into a zero tensor shaped like the first input.
+    Tensor sum;
+    sum.ResetLike(inputs.at(0));
+    sum.SetValue(0.0f);
+    for (size_t i = 0; i < inputs.size(); i++) {
+      Tensor temp = inputs.at(i);
+      CHECK_EQ(sum.nDim(), temp.nDim());
+      for (size_t j = 0; j < temp.nDim(); j++)
+        CHECK_EQ(sum.shape(j), temp.shape(j));
+      sum += temp;
+    }
+    outputs.push_back(sum);
+  }
+  return outputs;
+}
+
+/// The gradient of a sum w.r.t. each addend is the output gradient itself,
+/// so the single tensor in 'grads' is replicated once per input (input_size_
+/// was recorded by the preceding Forward).  Merge has no parameters, so the
+/// second result vector is empty.
+const std::pair<vector<Tensor>, vector<Tensor>> Merge::Backward(
+    int flag, const vector<Tensor>& grads) {
+  vector<Tensor> input_grad, param_grad;
+  CHECK_EQ(grads.size(), 1u) << "Merge layer only have one output tensor.";
+  for (size_t i = 0; i < input_size_; i++)
+    input_grad.push_back(grads.at(0));
+  return std::make_pair(input_grad, param_grad);
+}
+
+}  // namespace singa
diff --git a/src/model/layer/merge.h b/src/model/layer/merge.h
new file mode 100644
index 0000000..c709d69
--- /dev/null
+++ b/src/model/layer/merge.h
@@ -0,0 +1,53 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef SRC_MODEL_LAYER_MERGE_H_
+#define SRC_MODEL_LAYER_MERGE_H_
+#include <string>
+#include <utility>
+#include <vector>
+#include "singa/model/layer.h"
+
+namespace singa {
+/// Sum features of all input layers
+class Merge : public Layer {
+ public:
+  // const std::string layer_type() const override { return "Merge"; }
+
+   /// the sample shape of all input tensors should be the same
+   void Setup(const Shape &in_sample, const LayerConf &conf) override;
+   /// Output sample shape (same as input); CHECK-fails before Setup() runs.
+   const Shape GetOutputSampleShape() const override {
+     CHECK(out_sample_shape_.size()) << "You may haven't call Setup()";
+     return out_sample_shape_;
+   }
+   /// Sum all tensors in 'inputs'
+   /// Return a vector including the result of the summation
+   const vector<Tensor> Forward(int flag,
+                                const vector<Tensor> &inputs) override;
+
+   /// 'grads' should include only one tensor
+   /// the first result vector includes the gradients for each input layer
+   /// the second result vector is empty
+   const std::pair<vector<Tensor>, vector<Tensor> >
+   Backward(int flag, const vector<Tensor> &grads) override;
+
+ protected:
+  Shape out_sample_shape_;
+  // Number of inputs seen by the last Forward(); used by Backward to
+  // replicate the output gradient for each input.
+  size_t input_size_ = 1u;
+};
+}  // namespace singa
+#endif  // SRC_MODEL_LAYER_MERGE_H_
diff --git a/src/model/layer/pooling.cc b/src/model/layer/pooling.cc
new file mode 100644
index 0000000..1312776
--- /dev/null
+++ b/src/model/layer/pooling.cc
@@ -0,0 +1,295 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "./pooling.h"
+#include "singa/model/layer.h"
+namespace singa {
+
+RegisterLayerClass(singacpp_pooling, Pooling);
+void Pooling::Setup(const Shape& in_sample, const LayerConf& conf) {
+  Layer::Setup(in_sample, conf);
+  PoolingConf pool_conf = conf.pooling_conf();
+  if (pool_conf.has_kernel_size()) {
+    kernel_w_ = kernel_h_ = pool_conf.kernel_size();
+  } else {
+    kernel_w_ = pool_conf.kernel_w();
+    kernel_h_ = pool_conf.kernel_h();
+  }
+  CHECK_GT(kernel_w_, 0u);
+  CHECK_GT(kernel_h_, 0u);
+
+  if (pool_conf.has_pad()) {
+    pad_w_ = pad_h_ = pool_conf.pad();
+  } else {
+    pad_w_ = pool_conf.pad_w();
+    pad_h_ = pool_conf.pad_h();
+  }
+  CHECK_GE(pad_w_, 0u);
+  CHECK_GE(pad_h_, 0u);
+
+  if (pool_conf.has_stride()) {
+    stride_w_ = stride_h_ = pool_conf.stride();
+  } else {
+    stride_w_ = pool_conf.stride_w();
+    stride_h_ = pool_conf.stride_h();
+  }
+  CHECK_GT(stride_w_, 0u);
+  CHECK_GE(stride_h_, 0u);  // 0 for 1D pooling
+
+  pool_ = pool_conf.pool();
+  CHECK(pool_ == PoolingConf_PoolMethod_AVE ||
+        pool_ == PoolingConf_PoolMethod_MAX ||
+        pool_ == PoolingConf_PoolMethod_STOCHASTIC)
+      << "Padding implemented only for average and max pooling.";
+
+  CHECK_EQ(in_sample.size(), 3u);
+  channels_ = in_sample.at(0);
+  height_ = in_sample.at(1);
+  width_ = in_sample.at(2);
+  pooled_height_ = 1;
+  if (stride_h_ > 0)
+    pooled_height_ =
+        static_cast<size_t>((height_ + 2 * pad_h_ - kernel_h_) / stride_h_) + 1;
+  pooled_width_ =
+      static_cast<size_t>((width_ + 2 * pad_w_ - kernel_w_) / stride_w_) + 1;
+  out_sample_shape_ = vector<size_t>{channels_, pooled_height_, pooled_width_};
+}
+
+/// CPU-only pooling over a 4D (batch, channels, height, width) input.
+/// For MAX pooling during training, the arg-max mask is pushed onto buf_
+/// for use by Backward.
+const Tensor Pooling::Forward(int flag, const Tensor& input) {
+  // A pending mask from a previous training Forward must have been consumed.
+  CHECK(buf_.empty());
+  CHECK_EQ(input.device()->lang(), kCpp);
+  CHECK_EQ(input.nDim(), 4u);
+  size_t batchsize = input.shape(0);
+  DataType dtype = input.data_type();
+  auto dev = input.device();
+  Shape shape{batchsize, channels_, pooled_height_, pooled_width_};
+  Tensor output(shape, dev, dtype);
+  // Compute into host buffers, then copy into the output tensor.
+  float* outptr = new float[output.Size()];
+  auto inptr = input.data<float>();
+  if (pool_ == PoolingConf_PoolMethod_MAX) {
+    Tensor mask;
+    mask.ResetLike(output);
+    float* maskptr = new float[mask.Size()];
+    ForwardMaxPooling(inptr, batchsize, channels_, height_, width_, kernel_h_,
+                      kernel_w_, pad_h_, pad_w_, stride_h_, stride_w_, outptr,
+                      maskptr);
+    mask.CopyDataFromHostPtr(maskptr, mask.Size());
+    // Only training needs the mask; evaluation leaves buf_ empty.
+    if (flag & kTrain) buf_.push(mask);
+    delete[] maskptr;
+  } else if (pool_ == PoolingConf_PoolMethod_AVE)
+    ForwardAvgPooling(inptr, batchsize, channels_, height_, width_, kernel_h_,
+                      kernel_w_, pad_h_, pad_w_, stride_h_, stride_w_, outptr);
+  else
+    LOG(FATAL) << "Unknow pooling method";
+
+  output.CopyDataFromHostPtr(outptr, output.Size());
+  delete[] outptr;
+  return output;
+}
+
+/// CPU-only pooling gradient.  MAX pooling routes each output gradient to
+/// the arg-max input position recorded in the mask pushed by Forward; AVE
+/// pooling spreads it evenly over the window.  Pooling has no parameters,
+/// so param_grad is empty.
+const std::pair<Tensor, vector<Tensor>> Pooling::Backward(int flag,
+                                                          const Tensor& grad) {
+  CHECK_EQ(grad.device()->lang(), kCpp);
+  CHECK_EQ(grad.nDim(), 4u);
+  vector<Tensor> param_grad;
+    size_t batchsize = grad.shape(0);
+  // The input gradient has the pre-pooling spatial size from Setup().
+  Shape shape{batchsize, channels_, height_, width_};
+  auto dev = grad.device();
+  DataType dtype = grad.data_type();
+  Tensor dx(shape, dev, dtype);
+  auto gradptr = grad.data<float>();
+  float* dxptr = new float[dx.Size()];
+  if (pool_ == PoolingConf_PoolMethod_MAX) {
+    // Consume the arg-max mask saved by the training-phase Forward.
+    CHECK(!buf_.empty());
+    Tensor mask = buf_.top();
+    buf_.pop();
+    auto maskptr = mask.data<float>();
+    BackwardMaxPooling(gradptr, maskptr, batchsize, channels_, height_, width_,
+                       kernel_h_, kernel_w_, pad_h_, pad_w_, stride_h_,
+                       stride_w_, dxptr);
+  } else if (pool_ == PoolingConf_PoolMethod_AVE) {
+    BackwardAvgPooling(gradptr, batchsize, channels_, height_, width_,
+                       kernel_h_, kernel_w_, pad_h_, pad_w_, stride_h_,
+                       stride_w_, dxptr);
+  } else {
+    LOG(FATAL) << "Unknow pooling method";
+  }
+
+  dx.CopyDataFromHostPtr(dxptr, dx.Size());
+  delete[] dxptr;
+  return std::make_pair(dx, param_grad);
+}
+
+/// Max pooling over raw NCHW buffers.  For every output cell, writes the
+/// window maximum into 'top' and the flat index (h*width + w) of the winning
+/// input within its channel plane into 'mask' (stored as float), which
+/// BackwardMaxPooling uses to route gradients.
+void Pooling::ForwardMaxPooling(const float* bottom, const int num,
+                                const int channels, const int height,
+                                const int width, const int kernel_h,
+                                const int kernel_w, const int pad_h,
+                                const int pad_w, const int stride_h,
+                                const int stride_w, float* top, float* mask) {
+  int top_height = (height + pad_h * 2 - kernel_h) / stride_h + 1;
+  int top_width = (width + pad_w * 2 - kernel_w) / stride_w + 1;
+  int top_count = num * top_height * top_width * channels;
+  // Initialise so any real input beats the starting value.
+  for (int i = 0; i < top_count; i++) {
+    mask[i] = -1;
+    top[i] = -FLT_MAX;
+  }
+  const int bottom_offset = height * width;
+  const int top_offset = top_height * top_width;
+  // The main loop
+  for (int n = 0; n < num; ++n) {
+    for (int c = 0; c < channels; ++c) {
+      for (int ph = 0; ph < top_height; ++ph) {
+        for (int pw = 0; pw < top_width; ++pw) {
+          // Window in input coordinates, clipped to the valid area
+          // (padded cells are simply never visited).
+          int hstart = ph * stride_h - pad_h;
+          int wstart = pw * stride_w - pad_w;
+          int hend = std::min(hstart + kernel_h, height);
+          int wend = std::min(wstart + kernel_w, width);
+          hstart = std::max(hstart, 0);
+          wstart = std::max(wstart, 0);
+          const int top_index = ph * top_width + pw;
+          for (int h = hstart; h < hend; ++h) {
+            for (int w = wstart; w < wend; ++w) {
+              const int index = h * width + w;
+              if (bottom[index] > top[top_index]) {
+                top[top_index] = bottom[index];
+                mask[top_index] = index;
+              }
+            }
+          }
+        }
+      }
+      // compute offset
+      // Advance all pointers one channel plane; indices above are
+      // plane-relative.
+      bottom += bottom_offset;
+      top += top_offset;
+      mask += top_offset;
+    }
+  }
+}
+
+/// Gradient of max pooling: each output gradient in 'top' is added to the
+/// input position recorded in 'mask' by ForwardMaxPooling.  'bottom' is
+/// zeroed first; overlapping windows accumulate via +=.
+void Pooling::BackwardMaxPooling(const float* top, const float* mask,
+                                 const int num, const int channels,
+                                 const int height, const int width,
+                                 const int kernel_h, const int kernel_w,
+                                 const int pad_h, const int pad_w,
+                                 const int stride_h, const int stride_w,
+                                 float* bottom) {
+  int top_height = (height + pad_h * 2 - kernel_h) / stride_h + 1;
+  int top_width = (width + pad_w * 2 - kernel_w) / stride_w + 1;
+  const int top_offset = top_height * top_width;
+  const int bottom_offset = height * width;
+  memset(bottom, 0, sizeof(float) * num * channels * bottom_offset);
+  for (int n = 0; n < num; ++n) {
+    for (int c = 0; c < channels; ++c) {
+      for (int ph = 0; ph < top_height; ++ph) {
+        for (int pw = 0; pw < top_width; ++pw) {
+          const int top_idx = ph * top_width + pw;
+          // mask stores the flat (h*width + w) index as a float.
+          const int bottom_idx = static_cast<int>(mask[top_idx]);
+          bottom[bottom_idx] += top[top_idx];
+        }
+      }
+      // Advance to the next channel plane.
+      top += top_offset;
+      mask += top_offset;
+      bottom += bottom_offset;
+    }
+  }
+}
+
+/// Average pooling over raw NCHW buffers.  Note the Caffe-style divisor:
+/// pool_size is computed from the window clipped against the PADDED extent
+/// (height+pad_h, width+pad_w) before clamping to the valid area, so border
+/// windows divide by a count that includes padded (zero) cells.
+void Pooling::ForwardAvgPooling(const float* bottom, const int num,
+                                const int channels, const int height,
+                                const int width, const int kernel_h,
+                                const int kernel_w, const int pad_h,
+                                const int pad_w, const int stride_h,
+                                const int stride_w, float* top) {
+  int top_height = (height + pad_h * 2 - kernel_h) / stride_h + 1;
+  int top_width = (width + pad_w * 2 - kernel_w) / stride_w + 1;
+  int top_count = num * top_height * top_width * channels;
+  for (int i = 0; i < top_count; i++) {
+    top[i] = 0;
+  }
+  const int bottom_offset = height * width;
+  const int top_offset = top_height * top_width;
+  // The main loop
+  for (int n = 0; n < num; ++n) {
+    for (int c = 0; c < channels; ++c) {
+      for (int ph = 0; ph < top_height; ++ph) {
+        for (int pw = 0; pw < top_width; ++pw) {
+          int hstart = ph * stride_h - pad_h;
+          int wstart = pw * stride_w - pad_w;
+          int hend = std::min(hstart + kernel_h, height + pad_h);
+          int wend = std::min(wstart + kernel_w, width + pad_w);
+          // Divisor fixed BEFORE clamping to the valid input area.
+          int pool_size = (hend - hstart) * (wend - wstart);
+          hstart = std::max(hstart, 0);
+          wstart = std::max(wstart, 0);
+          hend = std::min(hend, height);
+          wend = std::min(wend, width);
+          const int top_index = ph * top_width + pw;
+          for (int h = hstart; h < hend; ++h) {
+            for (int w = wstart; w < wend; ++w) {
+              const int index = h * width + w;
+              top[top_index] += bottom[index];
+            }
+          }
+          top[top_index] /= pool_size;
+        }
+      }
+      // compute offset
+      bottom += bottom_offset;
+      top += top_offset;
+    }
+  }
+}
+
+/// Gradient of average pooling: each output gradient is spread uniformly
+/// over its window (divided by the same padded-window pool_size used in
+/// ForwardAvgPooling).  'bottom' is zeroed first; overlapping windows
+/// accumulate via +=.
+void Pooling::BackwardAvgPooling(const float* top, const int num,
+                                 const int channels, const int height,
+                                 const int width, const int kernel_h,
+                                 const int kernel_w, const int pad_h,
+                                 const int pad_w, const int stride_h,
+                                 const int stride_w, float* bottom) {
+  int top_height = (height + pad_h * 2 - kernel_h) / stride_h + 1;
+  int top_width = (width + pad_w * 2 - kernel_w) / stride_w + 1;
+  const int top_offset = top_height * top_width;
+  const int bottom_offset = height * width;
+  memset(bottom, 0, sizeof(float) * num * channels * bottom_offset);
+  for (int n = 0; n < num; ++n) {
+    for (int c = 0; c < channels; ++c) {
+      for (int ph = 0; ph < top_height; ++ph) {
+        for (int pw = 0; pw < top_width; ++pw) {
+          int hstart = ph * stride_h - pad_h;
+          int wstart = pw * stride_w - pad_w;
+          int hend = std::min(hstart + kernel_h, height + pad_h);
+          int wend = std::min(wstart + kernel_w, width + pad_w);
+          // Same padded-window divisor as the forward pass.
+          int pool_size = (hend - hstart) * (wend - wstart);
+          hstart = std::max(hstart, 0);
+          wstart = std::max(wstart, 0);
+          hend = std::min(hend, height);
+          wend = std::min(wend, width);
+          const int top_index = ph * top_width + pw;
+          for (int h = hstart; h < hend; ++h) {
+            for (int w = wstart; w < wend; ++w) {
+              const int index = h * width + w;
+              bottom[index] += top[top_index] / pool_size;
+            }
+          }
+        }
+      }
+      // Advance to the next channel plane.
+      top += top_offset;
+      bottom += bottom_offset;
+    }
+  }
+}
+}  // namespace singa
diff --git a/src/model/layer/pooling.h b/src/model/layer/pooling.h
new file mode 100644
index 0000000..f844799
--- /dev/null
+++ b/src/model/layer/pooling.h
@@ -0,0 +1,90 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef SRC_MODEL_LAYER_POOLING_H_
+#define SRC_MODEL_LAYER_POOLING_H_
+#include <cfloat>
+#include <stack>
+#include <string>
+#include <utility>
+#include <vector>
+#include "singa/model/layer.h"
+
+namespace singa {
+/// Max/average pooling layer (CPU implementation; see pooling.cc).
+class Pooling : public Layer {
+ public:
+  /// \copydoc Layer::layer_type()
+  // const std::string layer_type() const override { return "Pooling"; }
+
+  /// \copydoc Layer::Setup(const LayerConf&);
+  void Setup(const Shape& in_sample, const LayerConf& conf) override;
+  /// Output sample shape computed by Setup(); CHECK-fails before Setup().
+  const Shape GetOutputSampleShape() const override {
+    CHECK(out_sample_shape_.size()) << "You may haven't call Setup()";
+    return out_sample_shape_;
+  }
+  /// \copydoc Layer::Forward(int flag, const Tensor&)
+  const Tensor Forward(int flag, const Tensor& input) override;
+
+  /// \copydoc Layer::Backward(int, const Tensor&, const Tensor&);
+  const std::pair<Tensor, vector<Tensor>> Backward(int flag,
+                                                   const Tensor& grad) override;
+
+  /// Raw-buffer max pooling; also fills 'mask' with per-cell arg-max
+  /// indices consumed by BackwardMaxPooling.
+  void ForwardMaxPooling(const float* bottom, const int num, const int channels,
+                         const int height, const int width, const int kernel_h,
+                         const int kernel_w, const int pad_h, const int pad_w,
+                         const int stride_h, const int stride_w, float* top,
+                         float* mask);
+
+  /// Routes each output gradient back to the arg-max input via 'mask'.
+  void BackwardMaxPooling(const float* top, const float* mask, const int num,
+                          const int channels, const int height, const int width,
+                          const int kernel_h, const int kernel_w,
+                          const int pad_h, const int pad_w, const int stride_h,
+                          const int stride_w, float* bottom);
+
+  /// Raw-buffer average pooling (Caffe-style padded-window divisor).
+  void ForwardAvgPooling(const float* bottom, const int num, const int channels,
+                         const int height, const int width, const int kernel_h,
+                         const int kernel_w, const int pad_h, const int pad_w,
+                         const int stride_h, const int stride_w, float* top);
+
+  /// Spreads each output gradient uniformly over its pooling window.
+  void BackwardAvgPooling(const float* top, const int num, const int channels,
+                          const int height, const int width, const int kernel_h,
+                          const int kernel_w, const int pad_h, const int pad_w,
+                          const int stride_h, const int stride_w,
+                          float* bottom);
+
+  // Geometry accessors (values resolved in Setup()).
+  size_t kernel_w() const { return kernel_w_; }
+  size_t kernel_h() const { return kernel_h_; }
+  size_t pad_w() const { return pad_w_; }
+  size_t pad_h() const { return pad_h_; }
+  size_t stride_w() const { return stride_w_; }
+  size_t stride_h() const { return stride_h_; }
+  PoolingConf_PoolMethod pool_method() const { return pool_; }
+  size_t channels() const { return channels_; }
+  size_t height() const { return height_; }
+  size_t width() const { return width_; }
+
+ protected:
+  size_t kernel_w_, pad_w_, stride_w_;
+  size_t kernel_h_, pad_h_, stride_h_;
+  size_t channels_, height_, width_, pooled_height_, pooled_width_;
+  PoolingConf_PoolMethod pool_;
+  // To store the input and output(of forward) tensors
+  // (in pooling.cc only the max-pooling arg-max mask is stored here)
+  std::stack<Tensor> buf_;
+  Shape out_sample_shape_;
+};
+}  // namespace singa
+#endif  // SRC_MODEL_LAYER_POOLING_H_
diff --git a/src/model/layer/prelu.cc b/src/model/layer/prelu.cc
new file mode 100644
index 0000000..e567172
--- /dev/null
+++ b/src/model/layer/prelu.cc
@@ -0,0 +1,149 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "singa/model/layer.h"
+#include "./prelu.h"
+namespace singa {
+
+RegisterLayerClass(singa_prelu, PReLU);
+RegisterLayerClass(singacpp_prelu, PReLU);
+RegisterLayerClass(singacuda_prelu, PReLU);
+RegisterLayerClass(singacl_prelu, PReLU);
+void PReLU::Setup(const Shape& in_sample, const LayerConf &conf) {
+  Layer::Setup(in_sample, conf);
+  out_sample_shape_ = in_sample;
+  channel_shared_ = conf.prelu_conf().channel_shared();
+  format_ = conf.prelu_conf().format();
+  // Push back params into param_values_
+  for (const auto &spec : conf.param()) param_specs_.push_back(spec);
+//  param_values_.push_back(a_);
+}
+
+const Tensor PReLU::Forward(int flag, const Tensor &input) {
+  Tensor output;
+  if (!channel_shared_) {
+    size_t n, c, h, w;
+    Tensor temp = (input <= 0.f);
+    if (temp.nDim() == 4) {
+      if (format_ == "NCHW") {
+        n = temp.shape(0);
+        c = temp.shape(1);
+        h = temp.shape(2);
+        w = temp.shape(3);
+        temp.Reshape(Shape{n * c, h * w});
+        Tensor temp_a(Shape{n, c}, input.device(), input.data_type());
+        Uniform(1.f, 1.f, &temp_a);
+        MultRow(a_, &temp_a);
+        temp_a.Reshape(Shape{n * c});
+        MultColumn(temp_a, &temp);
+      } else if (format_ == "NHWC") {
+        n = temp.shape(0);
+        h = temp.shape(1);
+        w = temp.shape(2);
+        c = temp.shape(3);
+        temp.Reshape(Shape{n * h * w, c});
+        MultRow(a_, &temp);
+      } else {
+        LOG(FATAL) << "Incorrect input format for prelu layer.";
+      }
+    } else {
+      LOG(FATAL) << "Incorrect input format for prelu layer.";
+    }
+    output = input * ((input > 0.f) + temp);
+  } else {
+    // share the first param of Tensor A along all channels
+    LOG(FATAL) << "Not implemented";
+  // TODO(wangwei) cannot access the data in this way. The data could be on GPU.
+    auto a = a_.data<float>()[0];
+    output = input * ((input > 0.f) + (input <= 0.f) * a);
+  }
+  if (flag & kTrain) buf_.push(input);
+  return output;
+}
+
+const std::pair<Tensor, vector<Tensor> > PReLU::Backward(int flag,
+                                                         const Tensor &grad) {
+  vector<Tensor> param_grad;
+  CHECK(!buf_.empty());
+  Tensor input_grad, input = buf_.top();
+  buf_.pop();
+  Tensor da;
+  da.ResetLike(a_);
+  if (!channel_shared_) {
+    size_t n = 0, c = 0, h = 0, w = 0;
+    Tensor temp1 = (input <= 0.f);
+    if (temp1.nDim() == 4) {
+      if (format_ == "NCHW") {
+        n = temp1.shape(0);
+        c = temp1.shape(1);
+        h = temp1.shape(2);
+        w = temp1.shape(3);
+        temp1.Reshape(Shape{n * c, h * w});
+        Tensor temp_a(Shape{n, c}, grad.device(), grad.data_type());
+        Uniform(1.f, 1.f, &temp_a);
+        MultRow(a_, &temp_a);
+        temp_a.Reshape(Shape{n * c});
+        MultColumn(temp_a, &temp1);
+        temp1.Reshape(Shape{n, c, h, w});
+      } else if (format_ == "NHWC") {
+        n = temp1.shape(0);
+        h = temp1.shape(1);
+        w = temp1.shape(2);
+        c = temp1.shape(3);
+        temp1.Reshape(Shape{n * h * w, c});
+        MultRow(a_, &temp1);
+        temp1.Reshape(Shape{n, h, w, c});
+      } else {
+        LOG(FATAL) << "Incorrect input format for prelu layer.";
+      }
+    } else {
+      LOG(FATAL) << "Incorrect input format for prelu layer.";
+    }
+    input_grad = grad * input * ((input > 0.f) + temp1);
+    Tensor temp2 = grad * input * (input <= 0.f);
+    if (format_ == "NCHW") {
+      Tensor temp3(Shape{n * c}, grad.device(), grad.data_type());
+      temp2.Reshape(Shape{n * c, h * w});
+      SumColumns(temp2, &temp3);
+      temp3.Reshape(Shape{n, c});
+      SumRows(temp3, &da);
+    } else if (format_ == "NHWC") {
+      temp2.Reshape(Shape{n * h * w, c});
+      SumRows(temp2, &da);
+    }
+  } else {
+    // share the first param of Tensor A along all channels
+    LOG(FATAL) << "Not Implemented";
+    // TODO(wangwei) cannot access the data in this way. The data could be on GPU.
+    auto a = a_.data<float>()[0];
+    input_grad = grad * input * ((input > 0.f) + (input <= 0.f) * a);
+    Tensor temp = grad * input * (input <= 0.f);
+    float sum = Sum<float>(temp);
+    Uniform(1.f, 1.f, &da);
+    da *= sum;
+  }
+  param_grad.push_back(da);
+  return std::make_pair(input_grad, param_grad);
+}
+
+void PReLU::ToDevice(std::shared_ptr<Device> device) {
+  Layer::ToDevice(device);
+  a_.ToDevice(device);
+}
+
+} // namespace singa
diff --git a/src/model/layer/prelu.h b/src/model/layer/prelu.h
new file mode 100644
index 0000000..3041d1e
--- /dev/null
+++ b/src/model/layer/prelu.h
@@ -0,0 +1,66 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef SINGA_MODEL_LAYER_PRELU_H_
+#define SINGA_MODEL_LAYER_PRELU_H_
+#include <utility>
+#include <string>
+#include <vector>
+#include "singa/model/layer.h"
+#include "singa/singa_config.h"
+
+namespace singa {
+class PReLU : public Layer {
+ public:
+  /// \copydoc Layer::layer_type()
+  //  const std::string layer_type() const override { return "PReLU"; }
+
+
+  /// \copydoc Layer::Setup(const LayerConf&);
+  void Setup(const Shape& in_sample, const LayerConf& conf) override;
+  const Shape GetOutputSampleShape() const override {
+    CHECK(out_sample_shape_.size()) << "You may haven't call Setup()";
+    return out_sample_shape_;
+  }
+
+  /// \copydoc Layer::Forward(int flag, const Tensor&)
+  const Tensor Forward(int flag, const Tensor &input) override;
+
+  /// \copydoc Layer::Backward(int, const Tensor&, const Tensor&);
+  const std::pair<Tensor, vector<Tensor> > Backward(
+      int flag, const Tensor &grad) override;
+
+  void ToDevice(std::shared_ptr<Device> device);
+
+  const bool Channel_shared() const { return channel_shared_; }
+  const Tensor A() const { return a_; }
+  const std::string Format() const { return format_; }
+
+  void Set_a(Tensor a) {
+    a_.ResetLike(a);
+    a_.CopyData(a);
+  }
+
+ protected:
+  bool channel_shared_;
+  std::string format_;  // format_ has two valid values, i.e. NCHW and NHWC
+  Tensor a_;            // shape of a_ is 2D, i.e. (channels, 1)
+  std::stack<Tensor> buf_;
+  Shape out_sample_shape_;
+};
+}  // namespace singa
+#endif  // SINGA_MODEL_LAYER_PRELU_H_
diff --git a/src/model/layer/rnn.cc b/src/model/layer/rnn.cc
new file mode 100644
index 0000000..b811f9d
--- /dev/null
+++ b/src/model/layer/rnn.cc
@@ -0,0 +1,103 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "./rnn.h"
+#include <vector>
+#include "singa/model/layer.h"
+#include "singa/utils/string.h"
+
+namespace singa {
+RegisterLayerClass(singa_rnn, RNN);
+RegisterLayerClass(singacpp_rnn, RNN);
+RegisterLayerClass(singacuda_rnn, RNN);
+RegisterLayerClass(singacl_rnn, RNN);
+void RNN::Setup(const Shape& in_sample, const LayerConf &conf) {
+  Layer::Setup(in_sample, conf);
+
+  RNNConf rnn_conf = conf.rnn_conf();
+  hidden_size_ = rnn_conf.hidden_size();
+  CHECK_GT(hidden_size_, 0u);
+  num_stacks_ = rnn_conf.num_stacks();
+  CHECK_GT(num_stacks_, 0u);
+  input_size_ = Product(in_sample);
+  CHECK_GT(input_size_, 0u);
+  dropout_ = rnn_conf.dropout();  // drop probability
+  CHECK_GE(dropout_, 0);
+
+  input_mode_ = ToLowerCase(rnn_conf.input_mode());
+  CHECK(input_mode_ == "linear" || input_mode_ == "skip")
+      << "Input mode of " << input_mode_ << " is not supported; Please use "
+      << "'linear' and 'skip'";
+
+  direction_ = ToLowerCase(rnn_conf.direction());
+  if (direction_ == "unidirectional")
+    num_directions_ = 1;
+  else if (direction_ == "bidirectional")
+    num_directions_ = 2;
+  else
+    LOG(FATAL) << "Direction of " << direction_
+      << " is not supported; Please use unidirectional or bidirectional";
+
+  rnn_mode_ = ToLowerCase(rnn_conf.rnn_mode());
+  if (rnn_mode_ == "lstm") {
+    has_cell_ = true;
+  } else if (rnn_mode_ !="relu" && rnn_mode_ != "tanh" && rnn_mode_ != "gru") {
+    LOG(FATAL) << "RNN memory unit (mode) of " << rnn_mode_
+      << " is not supported Please use 'relu', 'tanh', 'lstm' and 'gru'";
+  }
+  // mult is the number of parameter sets per memory unit: 1 for relu/tanh,
+  // 4 for lstm and 3 for gru; it is doubled for bidirectional RNNs.
+  int mult = 1;
+  if (rnn_mode_ == "relu" || rnn_mode_ == "tanh")
+    mult *= 1;
+  else if (rnn_mode_ == "lstm")
+    mult *= 4;
+  else if (rnn_mode_ == "gru")
+    mult *= 3;
+  if (direction_ == "bidirectional")
+    mult *= 2;
+
+  size_t weight_size = 0;
+  for (size_t i = 0; i < num_stacks_; i++) {
+    size_t dim = hidden_size_ * (in_sample[0] +  hidden_size_ + 2);
+    if (i > 0)
+      dim = hidden_size_ * (hidden_size_ +  hidden_size_ + 2);
+    weight_size += mult * dim;
+  }
+  weight_.Reshape(Shape{weight_size});
+}
+
+const vector<Tensor> RNN::Forward(int flag, const vector<Tensor>& inputs) {
+  vector<Tensor> data_output;
+  LOG(FATAL) << "CPU RNN is not implemented!";
+  return data_output;
+}
+
+const std::pair<vector<Tensor>, vector<Tensor>> RNN::Backward(int flag,
+    const vector<Tensor>& grads) {
+  vector<Tensor> param_grad;
+  vector<Tensor> data_grad;
+  LOG(FATAL) << "CPU RNN is not implemented!";
+  return std::make_pair(data_grad, param_grad);
+}
+
+void RNN::ToDevice(std::shared_ptr<Device> device) {
+  Layer::ToDevice(device);
+  weight_.ToDevice(device);
+}
+}  // namespace singa
diff --git a/src/model/layer/rnn.h b/src/model/layer/rnn.h
new file mode 100644
index 0000000..3369a00
--- /dev/null
+++ b/src/model/layer/rnn.h
@@ -0,0 +1,96 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef SRC_MODEL_LAYER_RNN_H_
+#define SRC_MODEL_LAYER_RNN_H_
+
+#include <utility>
+#include <string>
+#include <vector>
+#include <stack>
+
+#include "singa/model/layer.h"
+
+namespace singa {
+/// To enable using the same layer multiple times in one iteration in RNN,
+/// the Forward() function pushes the 'input' or 'output' tensors that are
+/// necessary for Backward() onto a stack (buf_). If neither 'input' nor
+/// 'output' is used by Backward(), then do not store them. The Backward()
+/// function pops data from the buf_ stack to compute gradients. Users are
+/// responsible for accumulating the gradients for the same parameters.
+class RNN : public Layer {
+ public:
+  /// \copydoc Layer::layer_type()
+  // const std::string layer_type() const override { return "RNN"; }
+
+  /// Setup the RNN layer.
+  /// in_shape is the shape of a single training instance from one timestep,
+  void Setup(const Shape& in_shape, const LayerConf& conf) override;
+
+  /// The inputs vector includes <x1, ... xn, hx, cx> where xi is the input
+  /// tensor at the i-th time step. hx is used to initialize the hidden tensor,
+  /// which could be a dummy tensor (like Tensor hx;). cx is used to initialize
+  /// the cell tensor, which could be a dummy tensor( like Tensor cx;). For
+  /// dummy tensors, 0's would be used during computation.
+  /// cx is missing for gru/relu/tanh RNNs, and is valid for lstm.
+  /// The dim order of xi is <batch, feature>, and the batchsize of xi must be
+  /// >= that of x(i+1).
+  /// The output vector includes <y1, ... yn, hy, cy> where yi is the output
+  /// tensor at the i-th time step. hy is the final hidden tensor, cy is the
+  /// final cell tensor. cy is missing for gru/relu/tanh RNNs and is valid for
+  /// lstm.
+  const vector<Tensor> Forward(int flag, const vector<Tensor>& inputs) override;
+
+  /// The grads vector includes <dy1, dy2, ... dyn, dhy, dcy>, the symbols are
+  /// similar to those for Forward. dcy is missing for gru/relu/tanh RNNs and is
+  /// valid for lstm.
+  /// The first vector of the output includes <dx1, dx2, ... dxn, dhx, dcx>.
+  /// The second vector of the output includes the gradients of all parameters.
+  const std::pair<vector<Tensor>, vector<Tensor>> Backward(
+      int flag, const vector<Tensor>& grads) override;
+
+  const vector<Tensor> param_values() override {
+    return vector<Tensor>{weight_};
+  }
+
+  void ToDevice(std::shared_ptr<Device> device) override;
+  /// Return the internal state stack, which should be empty at the beginning
+  /// of one iteration.
+  // std::stack<Tensor> states() const { return states_; }
+
+  string input_mode() const { return input_mode_; }
+  string direction() const { return direction_; }
+  string rnn_mode() const { return rnn_mode_; }
+
+ protected:
+  /// Storing input or output from Forward(), which are used in Backward().
+  /// Rules:
+  /// 1. push the 'input' or 'output' into buf_ if the flag of Forward()
+  ///    includes kTrain and 'input' or 'output' is necessary for Backward().
+  /// 2. pop data out in Backward().
+  std::stack<Tensor> buf_;
+  bool has_cell_ = false;
+  size_t num_directions_ = 1;
+  size_t input_size_ = 0, hidden_size_ = 0, num_stacks_ = 0, seq_length_ = 0;
+  size_t batch_size_ = 0;
+  size_t seed_ = 0x1234567;
+  float dropout_ = 0.0f;
+  string input_mode_, direction_, rnn_mode_;
+  Tensor weight_;
+};
+}  // namespace singa
+#endif  // SRC_MODEL_LAYER_RNN_H_
diff --git a/src/model/layer/softmax.cc b/src/model/layer/softmax.cc
new file mode 100644
index 0000000..2cbd264
--- /dev/null
+++ b/src/model/layer/softmax.cc
@@ -0,0 +1,74 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "./softmax.h"
+namespace singa {
+
+RegisterLayerClass(singa_softmax, Softmax);
+RegisterLayerClass(singacpp_softmax, Softmax);
+RegisterLayerClass(singacuda_softmax, Softmax);
+RegisterLayerClass(singacl_softmax, Softmax);
+void Softmax::Setup(const Shape& in_sample, const LayerConf& conf) {
+  Layer::Setup(in_sample, conf);
+  CHECK_EQ(in_sample.size(), 1u);
+  out_sample_shape_ = in_sample;
+}
+
+const Tensor Softmax::Forward(int flag, const Tensor& input) {
+  CHECK_LE(input.nDim(), 2u);
+  Tensor output =  SoftMax(input);
+  if (flag & kTrain)
+    buf_.push(output);
+  return output;
+}
+
+const std::pair<Tensor, vector<Tensor>> Softmax::Backward(int flag,
+                                                          const Tensor& grad) {
+  CHECK_LE(grad.nDim(), 2u);
+  Tensor input_grad = grad.Clone();
+  CHECK(!buf_.empty());
+  Tensor y = buf_.top();
+  buf_.pop();
+  CHECK(y.shape() == input_grad.shape());
+  Tensor sigma = input_grad * y;
+
+  size_t nrow = 1, ncol = grad.Size();
+  if (grad.nDim() > 1) {
+    nrow = grad.shape(0);
+    ncol = grad.shape(1);
+  } else {
+    input_grad.Reshape({nrow, ncol});
+    sigma.Reshape({nrow, ncol});
+  }
+  Tensor sum(Shape{nrow}, grad.device(), grad.data_type());
+  SumColumns(sigma, &sum);
+  // dL / dy_i = grad_i
+  // dy_i / dx_i = y_i - y_i^2, if i == j
+  // dy_i / dx_j = - y_i * y_j, if i != j
+  // dL / dx_i = sum_j((dL / dy_j) * (dy_j / dx_i))
+  // dL / dx_i = y_i * (grad_i - sum), where sum = sum_i(grad_i * y_i);
+  SubColumn(sum, &input_grad);
+  input_grad = input_grad * y;
+  if (grad.nDim() == 1)
+    input_grad.Reshape(Shape{ncol});
+  // Mult(input_grad, y, &input_grad);
+  vector<Tensor> param_grad;
+  return std::make_pair(input_grad, param_grad);
+}
+
+}  // namespace singa
diff --git a/src/model/layer/softmax.h b/src/model/layer/softmax.h
new file mode 100644
index 0000000..cf71587
--- /dev/null
+++ b/src/model/layer/softmax.h
@@ -0,0 +1,48 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef SINGA_MODEL_LAYER_SOFTMAX_H_
+#define SINGA_MODEL_LAYER_SOFTMAX_H_
+#include "singa/model/layer.h"
+#include <stack>
+namespace singa {
+/// Do softmax for 1D or 2D tensors along the last dimension.
+class Softmax : public Layer {
+ public:
+  /// \copydoc Layer::layer_type()
+  // const std::string layer_type() const override { return "Softmax"; }
+
+  /// \copydoc Layer::Setup(const LayerConf&);
+  void Setup(const Shape& in_sample, const LayerConf& conf) override;
+  const Shape GetOutputSampleShape() const override {
+    CHECK(out_sample_shape_.size()) << "You may haven't call Setup()";
+    return out_sample_shape_;
+  }
+
+  /// \copydoc Layer::Forward(int flag, const Tensor&)
+  const Tensor Forward(int flag, const Tensor& input) override;
+
+  /// \copydoc Layer::Backward(int flag, const Tensor&, const Tensor&);
+  const std::pair<Tensor, vector<Tensor>> Backward(int flag,
+                                                   const Tensor& grad) override;
+
+ protected:
+  std::stack<Tensor> buf_;
+  Shape out_sample_shape_;
+};
+}  // namespace singa
+#endif  // SINGA_MODEL_LAYER_SOFTMAX_H_
diff --git a/src/model/layer/split.cc b/src/model/layer/split.cc
new file mode 100644
index 0000000..6b38a2b
--- /dev/null
+++ b/src/model/layer/split.cc
@@ -0,0 +1,53 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "singa/model/layer.h"
+#include "./split.h"
+namespace singa {
+
+RegisterLayerClass(singa_split, Split);
+
+void Split::Setup(const Shape& in_sample, const LayerConf& conf) {
+  Layer::Setup(in_sample, conf);
+  SplitConf split_conf = conf.split_conf();
+  output_size_ = split_conf.output_size();
+  out_sample_shape_ = in_sample;
+}
+
+const vector<Tensor> Split::Forward(int flag, const vector<Tensor>& inputs) {
+  vector<Tensor> outputs;
+  CHECK_EQ(inputs.size(), 1u) << "Split layer only have one input tensor.";
+  for (size_t i = 0; i < output_size_; i++)
+    outputs.push_back(inputs.at(0));
+  return outputs;
+}
+
+const std::pair<vector<Tensor>, vector<Tensor>> Split::Backward(
+    int flag, const vector<Tensor>& grads) {
+  vector<Tensor> input_grad, param_grad;
+  CHECK_EQ(grads.size(), output_size_);
+
+  /// Input_grad is the sum of all the output gradients.
+  Tensor temp = grads.at(0);
+  for (size_t i = 1; i < output_size_; i++)
+    temp += grads.at(i);
+  input_grad.push_back(temp);
+  return std::make_pair(input_grad, param_grad);
+}
+
+}  // namespace singa
diff --git a/src/model/layer/split.h b/src/model/layer/split.h
new file mode 100644
index 0000000..d4fd58a
--- /dev/null
+++ b/src/model/layer/split.h
@@ -0,0 +1,55 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef SRC_MODEL_LAYER_SPLIT_H_
+#define SRC_MODEL_LAYER_SPLIT_H_
+#include <string>
+#include <utility>
+#include <vector>
+#include "singa/model/layer.h"
+
+namespace singa {
+/// Duplicate the input into multiple outputs
+/// need to configure the number of outputs
+class Split : public Layer {
+ public:
+  /// \copydoc Layer::layer_type()
+  // const std::string layer_type() const override { return "Split"; }
+
+  /// \copydoc Layer::Setup(const LayerConf&);
+  void Setup(const Shape& in_sample, const LayerConf& conf) override;
+  const Shape GetOutputSampleShape() const override {
+    CHECK(out_sample_shape_.size()) << "You may haven't call Setup()";
+    return out_sample_shape_;
+  }
+  /// The inputs should have only one Tensor
+  /// The outputs is a set of replicated Tensor
+  const vector<Tensor> Forward(int flag, const vector<Tensor> &inputs) override;
+
+  /// \copydoc Layer::Backward(int, const vector<Tensor>&);
+  const std::pair<vector<Tensor>, vector<Tensor> >
+  Backward(int flag, const vector<Tensor> &grads) override;
+
+  const size_t output_size() const { return output_size_; }
+
+ protected:
+  // Shape of a single output sample, identical to the input sample shape
+  Shape out_sample_shape_;
+  size_t output_size_;
+};
+}  // namespace singa
+#endif  // SRC_MODEL_LAYER_SPLIT_H_
diff --git a/src/model/loss/mse.cc b/src/model/loss/mse.cc
new file mode 100644
index 0000000..6e19059
--- /dev/null
+++ b/src/model/loss/mse.cc
@@ -0,0 +1,42 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "singa/model/loss.h"
+
+namespace singa {
+
+Tensor MSE::Forward(int flag, const Tensor& prediction, const Tensor& target) {
+  CHECK(buf_.empty()) << "Do not call Forward successively for more than twice."
+                      << " The calling pattern is [Forward|Evaluate] Backward";
+  Tensor t = prediction - target;
+  size_t batchsize = 1;
+  if (t.nDim() > 1) batchsize = t.shape().at(0);
+  size_t dim = t.Size() / batchsize;
+  t.Reshape(Shape{batchsize, dim});
+  if (kTrain & flag)
+    buf_.push(t);
+  // TODO(wangwei) use CastType for operator/
+  return Sum(Square(t), 1) * 0.5f;
+}
+
+Tensor MSE::Backward() {
+  Tensor ret = buf_.top();
+  buf_.pop();
+  return ret * (1.0f / ret.shape().at(0));
+}
+}  // namespace singa
diff --git a/src/model/loss/softmax_cross_entropy.cc b/src/model/loss/softmax_cross_entropy.cc
new file mode 100644
index 0000000..3411fbe
--- /dev/null
+++ b/src/model/loss/softmax_cross_entropy.cc
@@ -0,0 +1,56 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <stack>
+#include "singa/model/loss.h"
+
+namespace singa {
+
+Tensor SoftmaxCrossEntropy::Forward(int flag, const Tensor& prediction,
+                                    const Tensor& target) {
+  CHECK(buf_.empty()) << "Do not call Forward successively for more than twice."
+                      << " The calling pattern is [Forward|Evaluate] Backward";
+  size_t batchsize = 1;
+  if (prediction.nDim() > 1) batchsize = prediction.shape().at(0);
+  size_t dim = prediction.Size() / batchsize;
+  const Tensor& input = Reshape(prediction, Shape{batchsize, dim});
+  Tensor prob = SoftMax(input);
+  // LOG(INFO) << "prob: " << prob.L2();
+
+  // buffer intermediate data
+  if (flag & kTrain) {
+    buf_.push(prob);
+    buf_.push(target);
+  }
+  Tensor loss(Shape{batchsize}, prob.device(), prob.data_type());
+
+  ComputeCrossEntropy(prob, target, &loss);
+
+  return loss;
+}
+
+Tensor SoftmaxCrossEntropy::Backward() {
+  const Tensor target = buf_.top();
+  buf_.pop();
+  Tensor prob = buf_.top();
+  buf_.pop();
+  SoftmaxCrossEntropyBwd(target, &prob);
+  return prob;
+}
+}  // namespace singa
+
diff --git a/src/model/metric/accuracy.cc b/src/model/metric/accuracy.cc
new file mode 100644
index 0000000..789e4f6
--- /dev/null
+++ b/src/model/metric/accuracy.cc
@@ -0,0 +1,64 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "singa/model/metric.h"
+#include <algorithm>
+namespace singa {
+
+Tensor Accuracy::Match(const Tensor& predict, const vector<int>& target) {
+  Tensor prediction(predict.shape());
+  prediction.CopyData(predict);
+  size_t batchsize = target.size();
+  size_t nb_classes = prediction.Size() / batchsize;
+  // each row of prediction is the prob distribution for one sample
+  CHECK_EQ(prediction.shape().at(0), batchsize);
+  // TODO(wangwei) CloneToDevice(host);
+  const float* prob = prediction.data<float>();
+  float* score = new float[batchsize];
+  memset(score, 0, batchsize * sizeof(float));
+  for (size_t b = 0; b < batchsize; b++) {
+    vector<std::pair<float, int>> prob_class;
+    for (size_t c = 0; c < nb_classes; c++) {
+      prob_class.push_back(std::make_pair(prob[b * nb_classes + c], c));
+    }
+    std::partial_sort(prob_class.begin(), prob_class.begin() + top_k_,
+                      prob_class.end(), std::greater<std::pair<float, int>>());
+
+    for (size_t k = 0; k < top_k_; k++)
+      if (prob_class.at(k).second == target.at(b)) score[b] = 1;
+  }
+  Tensor ret(Shape{batchsize});
+  ret.CopyDataFromHostPtr(score, batchsize);
+  delete [] score;
+  return ret;
+}
+
+// TODO(wangwei) consider multi-label cases, where target is of shape
+// nb_samples * nb_classes
+Tensor Accuracy::Forward(const Tensor& prediction, const Tensor& t) {
+  Tensor target(t.shape(), t.data_type());
+  target.CopyData(t);
+  vector<int> target_vec;
+  // TODO(wangwei) copy target to host.
+  const int* target_value = target.data<int>();
+  for (size_t i = 0; i < target.Size(); i++)
+    target_vec.push_back(target_value[i]);
+  return Match(prediction, target_vec);
+}
+
+}  // namespace singa
diff --git a/src/model/optimizer/adagrad.cc b/src/model/optimizer/adagrad.cc
new file mode 100644
index 0000000..cdb3fac
--- /dev/null
+++ b/src/model/optimizer/adagrad.cc
@@ -0,0 +1,43 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef SRC_MODEL_OPTIMIZER_ADAGRAD_H_
+#define SRC_MODEL_OPTIMIZER_ADAGRAD_H_
+#include "singa/model/optimizer.h"
+#include <functional>
+namespace singa {
+
+void AdaGrad::Setup(const OptimizerConf& conf) { delta_ = conf.delta(); }
+
+// history += grad*grad;
+// value = value - lr*grad/sqrt(history+delta)
+void AdaGrad::Apply(int step, float lr, const string& name, const Tensor& grad,
+                    Tensor& value) {
+  if (history_gradient_.find(name) == history_gradient_.end()) {
+    history_gradient_[name].ResetLike(value);
+    history_gradient_[name].SetValue(0.0f);
+  }
+  Tensor& history = history_gradient_[name];
+  Tensor tmp = Square(grad);
+  history += tmp;
+  Add(history, delta_, &tmp);
+  Sqrt(tmp, &tmp);
+  Div(grad, tmp, &tmp);
+  Axpy(-lr, tmp, &value);
+}
+}  // namespace singa
+#endif  // SRC_MODEL_OPTIMIZER_ADAGRAD_H_
diff --git a/src/model/optimizer/local_all_reduce.cc b/src/model/optimizer/local_all_reduce.cc
new file mode 100644
index 0000000..ea03e39
--- /dev/null
+++ b/src/model/optimizer/local_all_reduce.cc
@@ -0,0 +1,25 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef SRC_MODEL_OPTIMIZER_LOCAL_ALL_REDUCE_H_
+#define SRC_MODEL_OPTIMIZER_LOCAL_ALL_REDUCE_H_
+#include "singa/model/optimizer.h"
+
+namespace singa {
+}
+
+#endif  // SRC_MODEL_OPTIMIZER_LOCAL_ALL_REDUCE_H_
diff --git a/src/model/optimizer/nesterov.cc b/src/model/optimizer/nesterov.cc
new file mode 100644
index 0000000..051499b
--- /dev/null
+++ b/src/model/optimizer/nesterov.cc
@@ -0,0 +1,51 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef SRC_MODEL_OPTIMIZER_NESTEROV_H_
+#define SRC_MODEL_OPTIMIZER_NESTEROV_H_
+#include "singa/model/optimizer.h"
+#include <functional>
+namespace singa {
+
+void Nesterov::Setup(const OptimizerConf& conf) {
+  float m = conf.momentum();
+  SetMomentumGenerator([m](int step) { return m; });
+}
+
+// tmp = history;
+// history = lr * grad + history * mom
+// tmp = (1+mom) * history - tmp * mom;
+// value = value - tmp;
+void Nesterov::Apply(int step, float lr, const string& name, const Tensor& grad,
+                     Tensor& value) {
+  if (momentum_generator_) {
+    float mom = momentum_generator_(step);
+    if (history_gradient_.find(name) == history_gradient_.end()) {
+      history_gradient_[name].ResetLike(value);
+      history_gradient_[name].SetValue(0.0f);
+    }
+    Tensor& history = history_gradient_[name];
+    Tensor tmp = history.Clone();
+    history *= mom;
+    Axpy(lr, grad, &history);
+    tmp *= -mom;
+    Axpy(1 + mom, history, &tmp);
+    value -= tmp;
+  }
+}
+}  // namespace singa
+#endif  // SRC_MODEL_OPTIMIZER_NESTEROV_H_
diff --git a/src/model/optimizer/optimizer.cc b/src/model/optimizer/optimizer.cc
new file mode 100644
index 0000000..d098249
--- /dev/null
+++ b/src/model/optimizer/optimizer.cc
@@ -0,0 +1,120 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "singa/model/optimizer.h"
+#include "singa/utils/logging.h"
+
+namespace singa {
+
+Optimizer::~Optimizer() {
+  for (auto entry : regularizers_) delete entry.second;
+  for (auto entry : constraints_) delete entry.second;
+  if (constraint_ != nullptr) delete constraint_;
+  if (regularizer_ != nullptr) delete regularizer_;
+}
+void Optimizer::Setup(const OptimizerConf& conf) {
+  if (conf.has_regularizer())
+    regularizer_ = new Regularizer(conf.regularizer());
+  if (conf.has_constraint()) constraint_ = new Constraint(conf.constraint());
+}
+void Optimizer::Register(const string& name, const ParamSpec& specs) {
+  if (specs.has_constraint()) {
+    CHECK(constraints_.find(name) == constraints_.end())
+        << "Parameter with name = " << name << " has already registered";
+    constraints_[name] = new Constraint(specs.constraint());
+  }
+  if (specs.has_regularizer()) {
+    CHECK(regularizers_.find(name) == regularizers_.end())
+        << "Parameter with name = " << name << " has already registered";
+    regularizers_[name] = new Regularizer(specs.regularizer());
+  }
+  if (specs.has_decay_mult()) {
+    CHECK(weight_decay_multplier_.find(name) == weight_decay_multplier_.end())
+        << "Parameter with name = " << name << " has already registered";
+    weight_decay_multplier_[name] = specs.decay_mult();
+  }
+  if (specs.has_lr_mult()) {
+    CHECK(learning_rate_multplier_.find(name) == learning_rate_multplier_.end())
+        << "Parameter with name = " << name << " has already registered";
+    learning_rate_multplier_[name] = specs.lr_mult();
+  }
+  /*
+  if (specs.has_lr_generator()) {
+    LOG(FATAL) << "Not implemented yet";
+  }
+  */
+}
+
+void Optimizer::Apply(int step, const string& name, Tensor& grad,
+                      Tensor& param) {
+  // TODO(wangwei) need to consider the order of constraint and regularizer
+  if (regularizers_.find(name) != regularizers_.end()) {
+    regularizers_.at(name)->Apply(step, param, grad);
+  } else if (regularizer_ != nullptr) {
+    float scale = 1.0f;
+    if (weight_decay_multplier_.find(name) != weight_decay_multplier_.end())
+      scale = weight_decay_multplier_.at(name);
+    regularizer_->Apply(step, param, grad, scale);
+  }
+  if (constraints_.find(name) != constraints_.end())
+    constraints_.at(name)->Apply(step, param, grad);
+  else if (constraint_ != nullptr)
+    constraint_->Apply(step, param, grad);
+  float lr = learning_rate_generator_(step);
+  if (learning_rate_multplier_.find(name) != learning_rate_multplier_.end())
+    lr *= learning_rate_multplier_.at(name);
+  Apply(step, lr, name, grad, param);
+}
+
+void Regularizer::Setup(const RegularizerConf& conf) {
+  type_ = conf.type();
+  coefficient_ = conf.coefficient();
+  if (type_ != "L2" && type_ != "l2") {
+    CHECK(type_ == "NotSet") << "Unknown regularizer type = " << type_;
+  }
+}
+
+void Regularizer::Apply(int step, Tensor& value, Tensor& grad, float scale) {
+  if (type_ == "L2" || type_ == "l2") {
+    Axpy(coefficient_ * scale, value, &grad);
+  } else {
+    CHECK(type_ == "NotSet") << "Unknown regularizer type = " << type_;
+  }
+}
+
+void Regularizer::Apply(int step, const vector<Tensor>& values,
+                        const vector<Tensor>& grads) {
+  LOG(FATAL) << "Not implemented yet";
+}
+
+void Constraint::Setup(const ConstraintConf& conf) {
+  type_ = conf.type();
+  threshold_ = conf.threshold();
+}
+
+void Constraint::Apply(int step, Tensor& value, Tensor& grad) {
+  // TODO(wangwei) implement L2 and hard constraint
+  CHECK(type_ == "NotSet") << "Unknown regularizer type = " << type_;
+}
+
+void Constraint::Apply(int step, const vector<Tensor>& values,
+                       const vector<Tensor>& grads) {
+  LOG(FATAL) << "Not implemented yet";
+}
+
+}  // namespace singa
diff --git a/src/model/optimizer/rmsprop.cc b/src/model/optimizer/rmsprop.cc
new file mode 100644
index 0000000..13e2a75
--- /dev/null
+++ b/src/model/optimizer/rmsprop.cc
@@ -0,0 +1,46 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef SRC_MODEL_OPTIMIZER_RMSPROP_H_
+#define SRC_MODEL_OPTIMIZER_RMSPROP_H_
+#include "singa/model/optimizer.h"
+#include <functional>
+namespace singa {
+
+void RMSProp::Setup(const OptimizerConf& conf) {
+  delta_ = conf.delta();
+  rho_ = conf.rho();
+}
+
+// history = history * rho + grad * grad * (1 - rho)
+// value = value - lr * grad / sqrt(history + delta)
+void RMSProp::Apply(int step, float lr, const string& name, const Tensor& grad,
+                    Tensor& value) {
+  if (history_gradient_.find(name) == history_gradient_.end()) {
+    history_gradient_[name].ResetLike(value);
+    history_gradient_[name].SetValue(0.0f);
+  }
+  Tensor& history = history_gradient_[name];
+  history *= rho_;
+  Tensor tmp = Square(grad);
+  Axpy(1 - rho_, tmp, &history);
+  Sqrt(history + delta_, &tmp);
+  Div(grad, tmp, &tmp);
+  Axpy(-lr, tmp, &value);
+}
+}  // namespace singa
+#endif  // SRC_MODEL_OPTIMIZER_RMSPROP_H_
diff --git a/src/model/optimizer/sgd.cc b/src/model/optimizer/sgd.cc
new file mode 100644
index 0000000..ac453cd
--- /dev/null
+++ b/src/model/optimizer/sgd.cc
@@ -0,0 +1,54 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef SRC_MODEL_OPTIMIZER_SGD_H_
+#define SRC_MODEL_OPTIMIZER_SGD_H_
+#include "singa/model/optimizer.h"
+#include <functional>
+namespace singa {
+
+void SGD::Setup(const OptimizerConf& conf) {
+  Optimizer::Setup(conf);
+  if (conf.has_momentum()) {
+    float m = conf.momentum();
+    SetMomentumGenerator([m](int step) { return m; });
+  }
+}
+
+// history = history * momentum + grad * lr
+// value = value - history
+void SGD::Apply(int step, float lr, const string& name, const Tensor& grad,
+                Tensor& value) {
+  // LOG(INFO) << "param " << name  << " lr = " << lr << " grad = " << grad.L1() << " value = " << value.L1();
+  if (momentum_generator_) {
+    float mom = momentum_generator_(step);
+    if (mom != 0) {
+      if (history_gradient_.find(name) == history_gradient_.end()) {
+        history_gradient_[name].ResetLike(value);
+        history_gradient_[name].SetValue(0.0f);
+      }
+      Tensor& history = history_gradient_[name];
+      history *= mom;
+      Axpy(lr, grad, &history);
+      value -= history;
+      return;
+    }
+  }
+  Axpy(-lr, grad, &value);
+}
+}  // namespace singa
+#endif  // SRC_MODEL_OPTIMIZER_SGD_H_
diff --git a/src/model/rnn.cc b/src/model/rnn.cc
new file mode 100644
index 0000000..d1a7d2c
--- /dev/null
+++ b/src/model/rnn.cc
@@ -0,0 +1,27 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+namespace singa {
+
+
+
+
+
+
+}  /* singa */
diff --git a/src/model/updater/local_updater.cc b/src/model/updater/local_updater.cc
new file mode 100644
index 0000000..c3c6793
--- /dev/null
+++ b/src/model/updater/local_updater.cc
@@ -0,0 +1,77 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "singa/model/updater.h"
+#include <vector>
+
+namespace singa {
+
+void LocalUpdater::Register(const string& name, const ParamSpec& specs) {
+  opt_->Register(name, specs);
+  param_buffer_[name];
+  param_buffer_[name].ToDevice(dev_);
+  sum_[name];
+  sum_[name].ToDevice(dev_);
+  for (int i = 0; i < total_num_; ++i) {
+    grad_buffer_[std::make_pair(i, name)];
+    grad_buffer_[std::make_pair(i, name)].ToDevice(dev_);
+  }
+  dev_index_[name] = 0;
+  to_updater_finished_[name] = 0;
+  mtx_[name];
+}
+
+void LocalUpdater::Apply(int step, const string& name, Tensor& grad,
+                         Tensor& value) {
+  CHECK(param_buffer_.count(name) == 1) << "Parameter " << name
+                                        << " has not been registered before.";
+  int nth = dev_index_[name]++;
+  auto key = std::make_pair(nth, name);
+  if (grad_buffer_[key].Size() != grad.Size()) {
+    grad_buffer_[key].Reshape(grad.shape());
+    grad_buffer_[key].AsType(grad.data_type());
+  }
+  grad_buffer_[key].CopyData(grad);
+
+  std::unique_lock<std::mutex> lock(mtx_[name]);
+  ++to_updater_finished_[name];
+  if (to_updater_finished_[name] != total_num_) {
+    while (to_updater_finished_[name] > 0) {
+      to_updater_all_finished_[name].wait(lock);
+    }
+  } else {
+    if (param_buffer_[name].Size() != value.Size()) {
+      param_buffer_[name].Reshape(value.shape());
+      param_buffer_[name].AsType(value.data_type());
+      param_buffer_[name].CopyData(value);
+      sum_[name].ResetLike(param_buffer_[name]);
+    }
+    sum_[name].SetValue(.0f);
+    for (int i = 0; i < total_num_; ++i)
+      Add(sum_[name], grad_buffer_[std::make_pair(i, name)], &sum_[name]);
+    Div(sum_[name], static_cast<float>(total_num_), &sum_[name]);
+    opt_->Apply(step, name, sum_[name], param_buffer_[name]);
+    to_updater_finished_[name] = 0;
+    dev_index_[name] = 0;
+    to_updater_all_finished_[name].notify_all();
+  }
+  lock.unlock();
+  value.CopyData(param_buffer_[name]);
+}
+
+}  // namespace singa
diff --git a/src/model/updater/updater.cc b/src/model/updater/updater.cc
new file mode 100644
index 0000000..d386d30
--- /dev/null
+++ b/src/model/updater/updater.cc
@@ -0,0 +1,32 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "singa/model/updater.h"
+
+namespace singa {
+
+void Updater::Setup(const OptimizerConf& conf) { opt_->Setup(conf); }
+
+void Updater::Register(const string& name, const ParamSpec& specs) {
+  opt_->Register(name, specs);
+}
+
+void Updater::Apply(int step, const string& name, Tensor& grad, Tensor& value) {
+  opt_->Apply(step, name, grad, value);
+}
+}  // namespace singa
diff --git a/src/neuralnet/connection_layer/bridge.cc b/src/neuralnet/connection_layer/bridge.cc
deleted file mode 100644
index 2cfd55a..0000000
--- a/src/neuralnet/connection_layer/bridge.cc
+++ /dev/null
@@ -1,108 +0,0 @@
-/************************************************************
-*
-* Licensed to the Apache Software Foundation (ASF) under one
-* or more contributor license agreements.  See the NOTICE file
-* distributed with this work for additional information
-* regarding copyright ownership.  The ASF licenses this file
-* to you under the Apache License, Version 2.0 (the
-* "License"); you may not use this file except in compliance
-* with the License.  You may obtain a copy of the License at
-*
-*   http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an
-* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-* KIND, either express or implied.  See the License for the
-* specific language governing permissions and limitations
-* under the License.
-*
-*************************************************************/
-
-#include "singa/neuralnet/connection_layer.h"
-#include "singa/comm/msg.h"
-
-namespace singa {
-
-using std::vector;
-
-void BridgeLayer::MakePaired(Layer* pair, int grp_id, Dealer* dealer,
-    std::unordered_map<std::string, Layer*>* name2bridge) {
-  pair_ = pair;
-  group_id_ = grp_id;
-  dealer_ = dealer;
-  name2bridge_ = name2bridge;
-}
-
-void BridgeLayer::SendBlobs(bool handle_data) {
-  CHECK(dealer_) << "NULL dealer for bridges in worker (" << group_id_
-                 << ", " << partition_id() << ")";
-  Msg *msg = new Msg();
-  msg->set_src(Addr(group_id_, partition_id(), kWorkerLayer));
-  msg->set_dst(Addr(group_id_, pair_->partition_id(), kWorkerLayer));
-  msg->AddFrame(pair_->name().c_str(), pair_->name().length());
-  auto const& blob = handle_data ? data(nullptr) : grad(nullptr);
-  msg->AddFrame(blob.cpu_data(), blob.count() * sizeof(float));
-  dealer_->Send(&msg);
-}
-
-void BridgeLayer::ReceiveBlobs(bool handle_data) {
-  CHECK(dealer_) << "NULL dealer for bridges in worker (" << group_id_
-                 << ", " << partition_id() << ")";
-  while (!ready()) {
-    auto msg = dealer_->Receive();
-    CHECK_EQ(AddrGrp(msg->src()), group_id_);
-    string name(static_cast<char*>(msg->FrameData()), msg->FrameSize());
-    auto receive_layer = name2bridge_->at(name);
-    auto blob = handle_data ? receive_layer->mutable_data(nullptr) :
-                receive_layer -> mutable_grad(nullptr);
-    msg->NextFrame();
-    memcpy(blob->mutable_cpu_data(), msg->FrameData(), msg->FrameSize());
-    dynamic_cast<BridgeLayer*>(receive_layer)->set_ready(true);
-    delete msg;
-  }
-}
-
-void BridgeSrcLayer::Setup(const LayerProto& conf,
-    const vector<Layer*>& srclayers) {
-  CHECK_GE(srclayers.size(), 1);
-  Layer::Setup(conf, srclayers);
-  data_.Reshape(srclayers[0]->data(this).shape());
-  grad_.ReshapeLike(data_);
-  data_.ShareData(srclayers[0]->mutable_data(this), false);
-  grad_.ShareData(srclayers[0]->mutable_grad(this), false);
-}
-
-void BridgeSrcLayer::ComputeFeature(int flag, const vector<Layer*>& srcs) {
-  // send data
-  SendBlobs(true);
-  // reset flag for receiving gradient in compute gradient phase
-  set_ready(false);
-}
-
-void BridgeSrcLayer::ComputeGradient(int flag, const vector<Layer*>& srcs) {
-  // receive gradient
-  ReceiveBlobs(false);
-}
-
-void BridgeDstLayer::Setup(const LayerProto& conf,
-    const vector<Layer*>& srclayers) {
-  CHECK_EQ(srclayers.size(), 1);
-  Layer::Setup(conf, srclayers);
-  data_.Reshape(srclayers[0]->data(this).shape());
-  grad_.ReshapeLike(data_);
-}
-
-void BridgeDstLayer::ComputeFeature(int flag, const vector<Layer*>& srcs) {
-  // receive data
-  ReceiveBlobs(true);
-}
-
-void BridgeDstLayer::ComputeGradient(int flag, const vector<Layer*>& srcs) {
-  // send gradient
-  SendBlobs(false);
-  // reset flag for receiving data in compute feature phase
-  set_ready(false);
-}
-
-}  // namespace singa
diff --git a/src/neuralnet/connection_layer/concate.cc b/src/neuralnet/connection_layer/concate.cc
deleted file mode 100644
index 9d3fd0c..0000000
--- a/src/neuralnet/connection_layer/concate.cc
+++ /dev/null
@@ -1,118 +0,0 @@
-/************************************************************
-*
-* Licensed to the Apache Software Foundation (ASF) under one
-* or more contributor license agreements.  See the NOTICE file
-* distributed with this work for additional information
-* regarding copyright ownership.  The ASF licenses this file
-* to you under the Apache License, Version 2.0 (the
-* "License"); you may not use this file except in compliance
-* with the License.  You may obtain a copy of the License at
-*
-*   http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an
-* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-* KIND, either express or implied.  See the License for the
-* specific language governing permissions and limitations
-* under the License.
-*
-*************************************************************/
-
-#include "singa/neuralnet/connection_layer.h"
-#include "singa/utils/singleton.h"
-#include "singa/utils/context.h"
-
-namespace singa {
-
-void ConcateLayer::Setup(const LayerProto& conf,
-                         const vector<Layer*>& srclayers) {
-  CHECK_GT(srclayers.size(), 1);
-  Layer::Setup(conf, srclayers);
-  vector<int> shape = srclayers[0]->data(this).shape();
-  concate_dim_ = conf.concate_conf().concate_dim();
-  num_concates_ = conf.concate_conf().num_concates();
-  CHECK_GE(concate_dim_, 0);
-  CHECK_LT(concate_dim_, shape.size());
-  CHECK_EQ(num_concates_, srclayers.size());
-  for (size_t i = 1; i < srclayers.size(); i++) {
-    const vector<int>& src_shape = srclayers[i]->data(this).shape();
-    for (size_t j = 0; j < shape.size(); j++)
-      if (static_cast<int>(j) == concate_dim_)
-        shape[j] += src_shape[j];
-      else
-        CHECK_EQ(shape[j], src_shape[j]);
-  }
-  data_.Reshape(shape);
-  grad_.Reshape(shape);
-}
-
-void ConcateLayer::ComputeFeature(int flag, const vector<Layer*>& srclayers) {
-  CHECK_GT(srclayers.size(), 1);
-  CHECK_EQ(num_concates_, srclayers.size());
-  // calculate step for each memcpy
-  int step = srclayers[0]->data(this).shape()[concate_dim_];
-  for (unsigned i = concate_dim_ + 1; i < data_.shape().size(); ++i)
-    step *= data_.shape()[i];
-  int srclayer_offset = 0;
-  int concate_offset = 0;
-  auto context = Singleton<Context>::Instance();
-  int device = context->device_id(std::this_thread::get_id());
-  while (concate_offset < data_.count()) {
-    for (size_t i = 0; i < srclayers.size(); ++i) {
-      if (device < 0) {
-        const float* src = srclayers[i]->data(this).cpu_data()
-          + srclayer_offset;
-        float* dst = data_.mutable_cpu_data() + concate_offset;
-        memcpy(dst, src, step * sizeof(float));
-      } else {
-#ifdef USE_GPU
-        const float* src = srclayers[i]->data(this).gpu_data()
-          + srclayer_offset;
-        float* dst = data_.mutable_gpu_data() + concate_offset;
-        cudaMemcpy(dst, src, step * sizeof(float), cudaMemcpyDefault);
-#else
-        LOG(FATAL) << "GPU is not supported";
-#endif
-      }
-      concate_offset += step;
-    }
-    srclayer_offset += step;
-  }
-}
-
-void ConcateLayer::ComputeGradient(int flag, const vector<Layer*>& srclayers) {
-  CHECK_GT(srclayers.size(), 1);
-  CHECK_EQ(num_concates_, srclayers.size());
-  // calculate step for each memcpy
-  int step = srclayers[0]->grad(this).shape()[concate_dim_];
-  for (unsigned i = concate_dim_ + 1; i < grad_.shape().size(); ++i)
-    step *= grad_.shape()[i];
-  int srclayer_offset = 0;
-  int concate_offset = 0;
-  auto context = Singleton<Context>::Instance();
-  int device = context->device_id(std::this_thread::get_id());
-  while (concate_offset < grad_.count()) {
-    for (size_t i = 0; i < srclayers.size(); ++i) {
-      if (device < 0) {
-        const float* src = grad_.cpu_data() + concate_offset;
-        float* dst = srclayers[i]->mutable_grad(this)->mutable_cpu_data()
-          + srclayer_offset;
-        memcpy(dst, src, step * sizeof(float));
-      } else {
-#ifdef USE_GPU
-        const float* src = grad_.gpu_data() + concate_offset;
-        float* dst = srclayers[i]->mutable_grad(this)->mutable_gpu_data()
-          + srclayer_offset;
-        cudaMemcpy(dst, src, step * sizeof(float), cudaMemcpyDefault);
-#else
-        LOG(FATAL) << "GPU is not supported";
-#endif
-      }
-      concate_offset += step;
-    }
-    srclayer_offset += step;
-  }
-}
-
-}  // namespace singa
diff --git a/src/neuralnet/connection_layer/rnn_dummy.cc b/src/neuralnet/connection_layer/rnn_dummy.cc
deleted file mode 100644
index 865066f..0000000
--- a/src/neuralnet/connection_layer/rnn_dummy.cc
+++ /dev/null
@@ -1,67 +0,0 @@
-/************************************************************
-*
-* Licensed to the Apache Software Foundation (ASF) under one
-* or more contributor license agreements.  See the NOTICE file
-* distributed with this work for additional information
-* regarding copyright ownership.  The ASF licenses this file
-* to you under the Apache License, Version 2.0 (the
-* "License"); you may not use this file except in compliance
-* with the License.  You may obtain a copy of the License at
-*
-*   http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an
-* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-* KIND, either express or implied.  See the License for the
-* specific language governing permissions and limitations
-* under the License.
-*
-*************************************************************/
-
-#include "singa/neuralnet/connection_layer.h"
-#include "singa/utils/math_blob.h"
-
-namespace singa {
-
-void RNNDummyLayer::Setup(const LayerProto& conf,
-                       const vector<Layer*>& srclayers) {
-  Layer::Setup(conf, srclayers);
-  dynamic_src_ = AddPrefixSuffix(unroll_index(), partition_id(),
-      conf.rnn_dummy_conf().dynamic_srclayer());
-  LOG(ERROR) << dynamic_src_;
-  vector<int> shape;
-  for (int s : conf.rnn_dummy_conf().shape())
-    shape.push_back(s);
-  integer_ = conf.rnn_dummy_conf().integer();
-  low_ = conf.rnn_dummy_conf().low();
-  high_ = conf.rnn_dummy_conf().high();
-  // if no src layer, then it will genereate data by itself based on shape
-  // and random range
-  if (srclayers.size() == 0) {
-    CHECK(shape.size());
-    CHECK_NE(low_, high_);
-    data_.Reshape(shape);
-    srclayer_ = nullptr;
-  } else {
-    srclayer_ = srclayers.at(0);
-    data_.ReshapeLike(srclayer_->data(this));
-    data_.ShareData(srclayer_->mutable_data(this), false);
-  }
-}
-
-void RNNDummyLayer::ComputeFeature(int flag, const vector<Layer*>& srclayers) {
-  if (srclayers.size() == 0) {
-    SampleUniform(low_, high_, &data_);
-    if (integer_) {
-      for (int i = 0; i < data_.count(); i ++) {
-        data_.mutable_cpu_data()[i] = floor(data_.cpu_data()[i]);
-      }
-    }
-  } else if (srclayer_ != srclayers.at(0)) {
-    srclayer_ = srclayers.at(0);
-    data_.ShareData(srclayer_->mutable_data(this), false);
-  }
-}
-}  // namespace singa
-
diff --git a/src/neuralnet/connection_layer/slice.cc b/src/neuralnet/connection_layer/slice.cc
deleted file mode 100644
index 3cca3fd..0000000
--- a/src/neuralnet/connection_layer/slice.cc
+++ /dev/null
@@ -1,166 +0,0 @@
-/************************************************************
-*
-* Licensed to the Apache Software Foundation (ASF) under one
-* or more contributor license agreements.  See the NOTICE file
-* distributed with this work for additional information
-* regarding copyright ownership.  The ASF licenses this file
-* to you under the Apache License, Version 2.0 (the
-* "License"); you may not use this file except in compliance
-* with the License.  You may obtain a copy of the License at
-*
-*   http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an
-* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-* KIND, either express or implied.  See the License for the
-* specific language governing permissions and limitations
-* under the License.
-*
-*************************************************************/
-
-#include "singa/neuralnet/connection_layer.h"
-#include "singa/utils/math_blob.h"
-#include "singa/utils/singleton.h"
-#include "singa/utils/context.h"
-
-namespace singa {
-
-using std::vector;
-
-SliceLayer::~SliceLayer() {
-  for (size_t i = 1; i < datavec_.size(); ++i) {
-    if (datavec_[i] != nullptr) delete datavec_[i];
-    if (gradvec_[i] != nullptr) delete gradvec_[i];
-  }
-}
-
-void SliceLayer::Setup(const LayerProto& conf,
-                       const vector<Layer*>& srclayers) {
-  CHECK_EQ(srclayers.size(), 1);
-  Layer::Setup(conf, srclayers);
-  vector<int> shape = srclayers[0]->data(this).shape();
-  slice_dim_ = conf.slice_conf().slice_dim();
-  num_slices_ = conf.slice_conf().num_slices();
-  CHECK_GE(slice_dim_, 0);
-  CHECK_LT(slice_dim_, shape.size());
-  CHECK_GT(num_slices_, 0);
-  // add num_slices-1 more blobs
-  for (int i = 1; i < num_slices_; ++i) {
-    datavec_.push_back(new Blob<float>());
-    gradvec_.push_back(new Blob<float>());
-  }
-  // TODO(wangsh): remove equal-size restrict later
-  CHECK_EQ(shape[slice_dim_] % num_slices_, 0);
-  shape[slice_dim_] /= num_slices_;
-  for (int i = 0; i < num_slices_; ++i) {
-    // if (i == slice_num - 1) shape[slice_dim] += remain;
-    datavec_[i]->Reshape(shape);
-    gradvec_[i]->Reshape(shape);
-  }
-}
-
-void SliceLayer::ComputeFeature(int flag, const vector<Layer*>& srclayers) {
-  CHECK_EQ(srclayers.size(), 1);
-  const Blob<float>& blob = srclayers[0]->data(this);
-  // calculate step for each memcpy
-  int step = datavec_[0]->shape()[slice_dim_];
-  for (unsigned i = slice_dim_ + 1; i < datavec_[0]->shape().size(); ++i)
-    step *= datavec_[0]->shape()[i];
-  int srclayer_offset = 0;
-  int slice_offset = 0;
-  auto context = Singleton<Context>::Instance();
-  int device = context->device_id(std::this_thread::get_id());
-  while (srclayer_offset < blob.count()) {
-    for (int i = 0; i < num_slices_; ++i) {
-      if (device < 0) {
-        const float* src = blob.cpu_data() + srclayer_offset;
-        float* dst = datavec_[i]->mutable_cpu_data() + slice_offset;
-        memcpy(dst, src, step * sizeof(float));
-      } else {
-#ifdef USE_GPU
-        const float* src = blob.gpu_data() + srclayer_offset;
-        float* dst = datavec_[i]->mutable_gpu_data() + slice_offset;
-        cudaMemcpy(dst, src, step * sizeof(float), cudaMemcpyDefault);
-#else
-        LOG(FATAL) << "GPU is not supported";
-#endif
-      }
-      srclayer_offset += step;
-    }
-    slice_offset += step;
-  }
-}
-
-void SliceLayer::ComputeGradient(int flag, const vector<Layer*>& srclayers) {
-  CHECK_EQ(srclayers.size(), 1);
-  Blob<float>* blob = srclayers[0]->mutable_grad(this);
-  // calculate step for each memcpy
-  int step = gradvec_[0]->shape()[slice_dim_];
-  for (size_t i = slice_dim_ + 1; i < gradvec_[0]->shape().size(); ++i)
-    step *= gradvec_[0]->shape()[i];
-  int srclayer_offset = 0;
-  int slice_offset = 0;
-  auto context = Singleton<Context>::Instance();
-  int device = context->device_id(std::this_thread::get_id());
-  while (srclayer_offset < blob->count()) {
-    for (int i = 0; i < num_slices_; ++i) {
-      if (device < 0) {
-        const float* src = gradvec_[i]->cpu_data() + slice_offset;
-        float* dst = blob->mutable_cpu_data() + srclayer_offset;
-        memcpy(dst, src, step * sizeof(float));
-      } else {
-#ifdef USE_GPU
-        const float* src = gradvec_[i]->gpu_data() + slice_offset;
-        float* dst = blob->mutable_gpu_data() + srclayer_offset;
-        cudaMemcpy(dst, src, step * sizeof(float), cudaMemcpyDefault);
-#else
-        LOG(FATAL) << "GPU is not supported";
-#endif
-      }
-      srclayer_offset += step;
-    }
-    slice_offset += step;
-  }
-}
-
-const Blob<float>& SliceLayer::data(const Layer* from) {
-  int idx = from ? layer_idx_.Get(from) : 0;
-  CHECK_LT(idx, num_slices_);
-  return *datavec_[idx];
-}
-
-const Blob<float>& SliceLayer::grad(const Layer* from) {
-  int idx = from ? layer_idx_.Get(from) : 0;
-  CHECK_LT(idx, num_slices_);
-  return *gradvec_[idx];
-}
-
-Blob<float>* SliceLayer::mutable_data(const Layer* from) {
-  CHECK(from);
-  int idx = layer_idx_.Get(from);
-  CHECK_LT(idx, num_slices_);
-  return datavec_[idx];
-}
-
-Blob<float>* SliceLayer::mutable_grad(const Layer* from) {
-  CHECK(from);
-  int idx = layer_idx_.Get(from);
-  CHECK_LT(idx, num_slices_);
-  return gradvec_[idx];
-}
-const std::string SliceLayer::ToString(bool debug, int flag) {
-  if (!debug)
-    return "";
-  string ret = "";
-  if ((flag & kForward) == kForward && data_.count() !=0) {
-    for (unsigned k = 0; k < datavec_.size(); k++)
-      ret += StringPrintf("data-%u :%e ", k, Asum(*datavec_.at(k)));
-  }
-  if ((flag & kBackward) == kBackward && grad_.count() != 0) {
-    for (unsigned k = 0; k < gradvec_.size(); k++)
-    ret += StringPrintf("grad-%u:%e ", k, Asum(*gradvec_.at(k)));
-  }
-  return ret;
-}
-}  // namespace singa
diff --git a/src/neuralnet/connection_layer/split.cc b/src/neuralnet/connection_layer/split.cc
deleted file mode 100644
index e46b902..0000000
--- a/src/neuralnet/connection_layer/split.cc
+++ /dev/null
@@ -1,91 +0,0 @@
-/************************************************************
-*
-* Licensed to the Apache Software Foundation (ASF) under one
-* or more contributor license agreements.  See the NOTICE file
-* distributed with this work for additional information
-* regarding copyright ownership.  The ASF licenses this file
-* to you under the Apache License, Version 2.0 (the
-* "License"); you may not use this file except in compliance
-* with the License.  You may obtain a copy of the License at
-*
-*   http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an
-* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-* KIND, either express or implied.  See the License for the
-* specific language governing permissions and limitations
-* under the License.
-*
-*************************************************************/
-
-#include "singa/neuralnet/connection_layer.h"
-#include "singa/utils/math_blob.h"
-
-namespace singa {
-
-using std::vector;
-
-SplitLayer::~SplitLayer() {
-  for (size_t i = 1; i < gradvec_.size(); ++i) {
-    if (gradvec_[i] != nullptr) delete gradvec_[i];
-  }
-}
-
-void SplitLayer::Setup(const LayerProto& conf,
-                       const vector<Layer*>& srclayers) {
-  CHECK_EQ(srclayers.size(), 1);
-  Layer::Setup(conf, srclayers);
-  data_.Reshape(srclayers[0]->data(this).shape());
-  data_.ShareData(srclayers[0]->mutable_data(this), false);
-  num_splits_ = conf.split_conf().num_splits();
-  CHECK_GT(num_splits_, 0);
-  // add num_splits-1 more grad blobs
-  for (int i = 1; i < num_splits_; ++i) {
-    gradvec_.push_back(new Blob<float>());
-  }
-  for (int i = 0; i < num_splits_; ++i)
-    gradvec_[i]->Reshape(srclayers[0]->data(this).shape());
-}
-
-void SplitLayer::ComputeFeature(int flag, const vector<Layer*>& srclayers) {
-  // data is shared from its source,
-  // nothing to do in compute feature phase
-}
-
-void SplitLayer::ComputeGradient(int flag, const vector<Layer*>& srclayers) {
-  CHECK_EQ(srclayers.size(), 1);
-  // aggregate all gradients to grad_[0]
-  for (int i = 1; i < num_splits_; ++i)
-    AXPY<float>(1.0, *gradvec_[i], gradvec_[0]);
-  // copy grad_[0] to srclayer's grad
-  Copy(*gradvec_[0], srclayers[0]->mutable_grad(this));
-}
-
-const Blob<float>& SplitLayer::grad(const Layer* from) {
-  CHECK(from);
-  int idx = layer_idx_.Get(from);
-  CHECK_LT(idx, num_splits_);
-  return *gradvec_[idx];
-}
-
-Blob<float>* SplitLayer::mutable_grad(const Layer* from) {
-  CHECK(from);
-  int idx = layer_idx_.Get(from);
-  CHECK_LT(idx, num_splits_);
-  return gradvec_[idx];
-}
-const std::string SplitLayer::ToString(bool debug, int flag) {
-  if (!debug)
-    return "";
-  string ret = "";
-  if ((flag & kForward) == kForward && data_.count() !=0) {
-    ret += StringPrintf("data:%13.9f ", Asum(data_));
-  }
-  if ((flag & kBackward) == kBackward && grad_.count() != 0) {
-    for (unsigned k = 0; k < gradvec_.size(); k++)
-    ret += StringPrintf("grad-%u:%13.9f ", k, Asum(*gradvec_.at(k)));
-  }
-  return ret;
-}
-}  // namespace singa
diff --git a/src/neuralnet/input_layer/char_rnn.cc b/src/neuralnet/input_layer/char_rnn.cc
deleted file mode 100644
index 8a56711..0000000
--- a/src/neuralnet/input_layer/char_rnn.cc
+++ /dev/null
@@ -1,93 +0,0 @@
-/************************************************************
-*
-* Licensed to the Apache Software Foundation (ASF) under one
-* or more contributor license agreements.  See the NOTICE file
-* distributed with this work for additional information
-* regarding copyright ownership.  The ASF licenses this file
-* to you under the Apache License, Version 2.0 (the
-* "License"); you may not use this file except in compliance
-* with the License.  You may obtain a copy of the License at
-*
-*   http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an
-* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-* KIND, either express or implied.  See the License for the
-* specific language governing permissions and limitations
-* under the License.
-*
-*************************************************************/
-#include <sstream>
-#include <fstream>
-#include "singa/neuralnet/input_layer.h"
-namespace singa {
-
-void CharRNNInputLayer::Setup(const LayerProto& conf,
-    const vector<Layer*>& srclayers) {
-  InputLayer::Setup(conf, srclayers);
-  batchsize_ = conf.char_rnn_conf().batchsize();
-  path_ = conf.char_rnn_conf().path();
-  vocab_path_ = conf.char_rnn_conf().vocab_path();
-  unroll_len_ = conf.char_rnn_conf().unroll_len();
-  datavec_.clear();
-  // each unroll layer has a input blob
-  for (int i = 0; i <= unroll_len_; i++) {
-    datavec_.push_back(new Blob<float>(batchsize_));
-  }
-}
-
-void CharRNNInputLayer::ComputeFeature(int flag,
-    const vector<Layer*>& srclayers) {
-  if (buf_.size() == 0) {
-
-    // read the vocab
-    {
-      std::ifstream fin;
-      fin.open(vocab_path_);
-      CHECK(fin.is_open()) << "Can't open vocab_path = " << vocab_path_;
-      std::stringstream stream;
-      stream << fin.rdbuf();
-      string vocab = stream.str();
-      LOG(ERROR) << "Vocab_size = " << vocab.length();
-      for (char c : vocab)
-        char2index_[c] = char2index_.size() - 1;
-      fin.close();
-    }
-
-    // read the whole text file
-    {
-      std::ifstream fin;
-      fin.open(path_);
-      CHECK(fin.is_open()) << "Can't open filepath = " << path_;
-      std::stringstream stream;
-      stream << fin.rdbuf();
-      buf_ = stream.str();
-      fin.close();
-    }
-
-    // decide the start pos of each instance in one mini-batch
-    int max_offset = buf_.length() / batchsize_;
-    CHECK_GT(max_offset, unroll_len_);
-    LOG(ERROR) << "Max iteration per epoch = " << max_offset / unroll_len_;
-    for (int i = 0; i < batchsize_; i ++) {
-      start_.push_back(i * max_offset);
-    }
-  }
-
-  for (int l = 0; l < unroll_len_ + 1; l++) {
-    float* ptr = datavec_[l]->mutable_cpu_data();
-    for (int i = 0; i < batchsize_; i++) {
-      ptr[i] = static_cast<float>(char2index_.at(buf_[start_[i] + offset_ + l]));
-    }
-  }
-  offset_ += unroll_len_;
-  if (offset_ >= buf_.length() / batchsize_) {
-//  unsigned seed = std::chrono::system_clock::now().time_since_epoch().count();
-//  std::mt19937 g(seed);
-//  std::shuffle(start_.begin(), start_.end(), g);
-    offset_ = 0;
-    // return -1;
-  }
-}
-}  // namespace singa
diff --git a/src/neuralnet/input_layer/csv.cc b/src/neuralnet/input_layer/csv.cc
deleted file mode 100644
index 53cabff..0000000
--- a/src/neuralnet/input_layer/csv.cc
+++ /dev/null
@@ -1,67 +0,0 @@
-/************************************************************
-*
-* Licensed to the Apache Software Foundation (ASF) under one
-* or more contributor license agreements.  See the NOTICE file
-* distributed with this work for additional information
-* regarding copyright ownership.  The ASF licenses this file
-* to you under the Apache License, Version 2.0 (the
-* "License"); you may not use this file except in compliance
-* with the License.  You may obtain a copy of the License at
-*
-*   http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an
-* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-* KIND, either express or implied.  See the License for the
-* specific language governing permissions and limitations
-* under the License.
-*
-*************************************************************/
-
-#include "singa/neuralnet/input_layer.h"
-#include "singa/utils/tokenizer.h"
-
-namespace singa {
-
-void CSVInputLayer::Setup(const LayerProto& conf,
-    const vector<Layer*>& srclayers) {
-  SingleLabelRecordLayer::Setup(conf, srclayers);
-  sep_ = conf.store_conf().separator();
-}
-
-void CSVInputLayer::LoadRecord(const string& backend,
-    const string&path, Blob<float>* to) {
-  io::Store* store = io::OpenStore(backend, path, io::kRead);
-  string key, val;
-  CHECK(store->Read(&key, &val));
-  float* ptr = to->mutable_cpu_data();
-  Tokenizer t(val, sep_);
-  string x;
-  for (int i = 0; i< to->count(); i++) {
-    t >> x;
-    ptr[i] = stof(x);
-  }
-  CHECK(!t.Valid());
-  delete store;
-}
-
-bool CSVInputLayer::Parse(int k, int flag, const string& key,
-    const string& value) {
-  float* ptr = data_.mutable_cpu_data() + k * data_.count() / batchsize_;
-  Tokenizer t(value, sep_);
-  string x;
-  // parse label if not deploy phase and has_label is set.
-  if ((flag & kDeploy) == 0 && layer_conf_.store_conf().has_label()) {
-    t >> x;
-    aux_data_[k] = stoi(x);
-  }
-  for (int i = 0; i< data_.count() / batchsize_; i++) {
-    t >> x;
-    ptr[i] = stof(x);
-  }
-  CHECK(!t.Valid());
-  return true;
-}
-
-}  // namespace singa
diff --git a/src/neuralnet/input_layer/deprecated.cc b/src/neuralnet/input_layer/deprecated.cc
deleted file mode 100644
index d2901f7..0000000
--- a/src/neuralnet/input_layer/deprecated.cc
+++ /dev/null
@@ -1,373 +0,0 @@
-/************************************************************
-*
-* Licensed to the Apache Software Foundation (ASF) under one
-* or more contributor license agreements.  See the NOTICE file
-* distributed with this work for additional information
-* regarding copyright ownership.  The ASF licenses this file
-* to you under the Apache License, Version 2.0 (the
-* "License"); you may not use this file except in compliance
-* with the License.  You may obtain a copy of the License at
-*
-*   http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an
-* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-* KIND, either express or implied.  See the License for the
-* specific language governing permissions and limitations
-* under the License.
-*
-*************************************************************/
-
-#include <random>
-#include "singa/neuralnet/input_layer.h"
-#include "singa/utils/context.h"
-#include "singa/utils/singleton.h"
-#include "mshadow/tensor.h"
-namespace singa {
-
-using namespace mshadow;
-using mshadow::cpu;
-using mshadow::Shape4;
-using mshadow::Tensor;
-
-using std::string;
-using std::vector;
-
-ShardDataLayer::~ShardDataLayer() {
-  if (shard_ != nullptr)
-    delete shard_;
-  shard_ = nullptr;
-}
-
-void ShardDataLayer::Setup(const LayerProto& proto,
-    const vector<Layer*>& srclayers) {
-  Layer::Setup(proto, srclayers);
-  shard_ = new DataShard(proto.sharddata_conf().path(), DataShard::kRead);
-  string key;
-  shard_->Next(&key, &sample_);
-  delete shard_;
-  shard_ = nullptr;
-  batchsize_ = proto.sharddata_conf().batchsize();
-  if (partition_dim() == 0)
-    batchsize_ /= proto.num_partitions();
-  records_.resize(batchsize_);
-  random_skip_ = proto.sharddata_conf().random_skip();
-}
-
-void ShardDataLayer::ComputeFeature(int flag, const vector<Layer*>& srclayers) {
-  if (shard_ == nullptr)
-    shard_ = new DataShard(layer_conf_.sharddata_conf().path(),
-                           DataShard::kRead);
-  if (random_skip_) {
-    std::uniform_int_distribution<int> distribution(0, random_skip_);
-    auto generator = Singleton<Context>::Instance()->rand_generator();
-    int nskip = distribution(*generator);
-    LOG(INFO) << "Random Skip " << nskip << " records, there are "
-      << shard_->Count() << " records in total";
-    string key;
-    for (int i = 0; i < nskip; i++) {
-      shard_->Next(&key, &sample_);
-    }
-    random_skip_ = 0;
-  }
-  for (auto& record : records_) {
-    string key;
-    if (!shard_->Next(&key, &record)) {
-      shard_->SeekToFirst();
-      CHECK(shard_->Next(&key, &record));
-    }
-  }
-}
-
-/*****************LMDB data layer*******************/
-#ifdef USE_LMDB
-LMDBDataLayer::~LMDBDataLayer() {
-  mdb_cursor_close(mdb_cursor_);
-  mdb_txn_abort(mdb_txn_);
-  mdb_cursor_ = nullptr;
-}
-
-void LMDBDataLayer::Setup(const LayerProto& proto,
-    const vector<Layer*>& srclayers) {
-  Layer::Setup(proto, srclayers);
-  OpenLMDB(proto.lmdbdata_conf().path());
-  CHECK_EQ(mdb_cursor_get(mdb_cursor_, &mdb_key_, &mdb_value_, MDB_NEXT),
-           MDB_SUCCESS);
-  mdb_cursor_close(mdb_cursor_);
-  mdb_txn_abort(mdb_txn_);
-  mdb_cursor_ = nullptr;
-  CaffeDatum datum;
-  datum.ParseFromArray(mdb_value_.mv_data, mdb_value_.mv_size);
-  SingleLabelImageRecord* record = sample_.mutable_image();
-  ConvertCaffeDatumToRecord(datum, record);
-  batchsize_ = proto.lmdbdata_conf().batchsize();
-  if (partition_dim() == 0)
-    batchsize_ /= proto.num_partitions();
-  records_.resize(batchsize_);
-  random_skip_ = proto.lmdbdata_conf().random_skip();
-}
-
-void LMDBDataLayer::OpenLMDB(const std::string& path) {
-  CHECK_EQ(mdb_env_create(&mdb_env_), MDB_SUCCESS) << "mdb_env_create failed";
-  CHECK_EQ(mdb_env_set_mapsize(mdb_env_, 1099511627776), MDB_SUCCESS);  // 1TB
-  CHECK_EQ(mdb_env_open(mdb_env_, path.c_str(),
-           MDB_RDONLY, 0664), MDB_SUCCESS) << "cannot open lmdb " << path;
-  CHECK_EQ(mdb_txn_begin(mdb_env_, NULL, MDB_RDONLY, &mdb_txn_), MDB_SUCCESS)
-      << "mdb_txn_begin failed";
-  CHECK_EQ(mdb_open(mdb_txn_, NULL, 0, &mdb_dbi_), MDB_SUCCESS)
-      << "mdb_open failed";
-  CHECK_EQ(mdb_cursor_open(mdb_txn_, mdb_dbi_, &mdb_cursor_), MDB_SUCCESS)
-      << "mdb_cursor_open failed";
-  LOG(INFO) << "Opening lmdb " << path;
-  CHECK_EQ(mdb_cursor_get(mdb_cursor_, &mdb_key_, &mdb_value_, MDB_FIRST),
-           MDB_SUCCESS) << "mdb_cursor_get failed";
-}
-
-void LMDBDataLayer::ComputeFeature(int flag, const vector<Layer*>& srclayers) {
-  if (mdb_cursor_ == nullptr)
-    OpenLMDB(layer_conf_.lmdbdata_conf().path());
-  if (random_skip_) {
-    std::uniform_int_distribution<int> distribution(0, random_skip_);
-    auto generator =
-     Singleton<Context>::Instance()->rand_generator(std::this_thread::get_id());
-    int nskip = distribution(*generator);
-
-    int n = 0;
-    CHECK_EQ(mdb_cursor_get(mdb_cursor_, &mdb_key_,
-          &mdb_value_, MDB_FIRST), MDB_SUCCESS);
-    while (mdb_cursor_get(mdb_cursor_, &mdb_key_,
-          &mdb_value_, MDB_NEXT) == MDB_SUCCESS)
-      n++;
-    LOG(INFO) << "Random Skip " << nskip << " records of total "
-      << n << "records";
-    // We have reached the end. Restart from the first.
-    CHECK_EQ(mdb_cursor_get(mdb_cursor_, &mdb_key_,
-          &mdb_value_, MDB_FIRST), MDB_SUCCESS);
-    for (int i = 0; i < nskip; i++) {
-      if (mdb_cursor_get(mdb_cursor_, &mdb_key_,
-            &mdb_value_, MDB_NEXT) != MDB_SUCCESS) {
-        // We have reached the end. Restart from the first.
-        DLOG(INFO) << "Restarting data prefetching from start.";
-        CHECK_EQ(mdb_cursor_get(mdb_cursor_, &mdb_key_,
-              &mdb_value_, MDB_FIRST), MDB_SUCCESS);
-      }
-    }
-    random_skip_ = 0;
-  }
-  CaffeDatum datum;
-  for (auto& record : records_) {
-    SingleLabelImageRecord* image = record.mutable_image();
-    CHECK_EQ(mdb_cursor_get(mdb_cursor_, &mdb_key_,
-             &mdb_value_, MDB_GET_CURRENT), MDB_SUCCESS);
-    datum.ParseFromArray(mdb_value_.mv_data, mdb_value_.mv_size);
-    ConvertCaffeDatumToRecord(datum, image);
-    if (mdb_cursor_get(mdb_cursor_, &mdb_key_,
-        &mdb_value_, MDB_NEXT) != MDB_SUCCESS) {
-      // We have reached the end. Restart from the first.
-      DLOG(INFO) << "Restarting data prefetching from start.";
-      CHECK_EQ(mdb_cursor_get(mdb_cursor_, &mdb_key_,
-               &mdb_value_, MDB_FIRST), MDB_SUCCESS);
-    }
-  }
-}
-
-void LMDBDataLayer::ConvertCaffeDatumToRecord(const CaffeDatum& datum,
-                                              SingleLabelImageRecord* record) {
-  record->set_label(datum.label());
-  record->clear_shape();
-  if (datum.has_channels())
-    record->add_shape(datum.channels());
-  if (datum.has_height())
-    record->add_shape(datum.height());
-  if (datum.has_width())
-    record->add_shape(datum.width());
-  if (datum.has_data())
-    record->set_pixel(datum.data());
-  if (datum.float_data_size()) {
-    record->clear_data();
-    for (float x : datum.float_data())
-      record->add_data(x);
-  }
-}
-#endif
-
-/***************Parser layer*******************/
-void ParserLayer::ComputeFeature(int flag, const vector<Layer*>& srclayers) {
-  CHECK_EQ(srclayers.size(), 1);
-  auto datalayer = dynamic_cast<DataLayer*>(*srclayers.begin());
-  ParseRecords(flag, datalayer->records(), &data_);
-}
-
-/**********Mnist Layer************/
-void MnistLayer::ParseRecords(int flag, const vector<Record>& records,
-    Blob<float>* blob) {
-  LOG_IF(ERROR, records.size() == 0) << "Empty records to parse";
-  int ndim = records.at(0).image().shape_size();
-  int inputsize = records.at(0).image().shape(ndim-1);
-  CHECK_EQ(inputsize, blob->shape()[2]);
-
-  float* dptr = blob->mutable_cpu_data();
-  for (const Record& record : records) {
-    const SingleLabelImageRecord& imagerecord = record.image();
-    if (imagerecord.pixel().size()) {
-      string pixel = imagerecord.pixel();
-      for (int i = 0, k = 0; i < inputsize; i++) {
-        for (int j = 0; j < inputsize; j++) {
-          // NOTE!!! must cast pixel to uint8_t then to float!!! waste a lot of
-          // time to debug this
-          float x =  static_cast<float>(static_cast<uint8_t>(pixel[k++]));
-          x = x / norm_a_-norm_b_;
-          *dptr = x;
-          dptr++;
-        }
-      }
-    } else {
-      for (int i = 0, k = 0; i < inputsize; i++) {
-        for (int j = 0; j < inputsize; j++) {
-          *dptr = imagerecord.data(k++) / norm_a_ - norm_b_;
-          dptr++;
-        }
-      }
-    }
-  }
-  CHECK_EQ(dptr, blob->mutable_cpu_data() + blob->count());
-}
-
-void MnistLayer::Setup(const LayerProto& proto,
-    const vector<Layer*>& srclayers) {
-  Layer::Setup(proto, srclayers);
-  CHECK_EQ(srclayers.size(), 1);
-  int batchsize = dynamic_cast<DataLayer*>(srclayers[0])->batchsize();
-  Record sample = dynamic_cast<DataLayer*>(srclayers[0])->sample();
-  norm_a_ = proto.mnist_conf().norm_a();
-  norm_b_ = proto.mnist_conf().norm_b();
-  int ndim = sample.image().shape_size();
-  CHECK_GE(ndim, 2);
-  int s = sample.image().shape(ndim - 1);
-  CHECK_EQ(s, sample.image().shape(ndim - 2));
-  data_.Reshape(vector<int>{batchsize, 1, s, s});
-}
-
-/**********RGB image layer****************/
-void RGBImageLayer::ParseRecords(int flag, const vector<Record>& records,
-    Blob<float>* blob) {
-  const vector<int>& s = blob->shape();
-  Tensor<cpu, 4> images(data_.mutable_cpu_data(),
-      Shape4(s[0], s[1], s[2], s[3]));
-  const SingleLabelImageRecord& r = records.at(0).image();
-  Tensor<cpu, 3> raw_image(Shape3(r.shape(0), r.shape(1), r.shape(2)));
-  AllocSpace(raw_image);
-  Tensor<cpu, 3> croped_image(nullptr, Shape3(s[1], s[2], s[3]));
-  if (cropsize_)
-    AllocSpace(croped_image);
-  int rid = 0;
-  const float* meandptr = mean_.cpu_data();
-
-  std::uniform_int_distribution<int> distribution(0, r.shape(0) - cropsize_);
-  auto generator =
-    Singleton<Context>::Instance()->rand_generator(std::this_thread::get_id());
-  for (const Record& record : records) {
-    auto image = images[rid];
-    bool do_crop = cropsize_> 0 && ((flag & kTrain) == kTrain);
-    bool do_mirror = mirror_
-                    && (distribution(*generator) % 2)
-                    && ((flag & kTrain) == kTrain);
-    float* dptr = nullptr;
-    if (do_crop || do_mirror)
-      dptr = raw_image.dptr;
-    else
-      dptr = image.dptr;
-    if (record.image().pixel().size()) {
-      string pixel = record.image().pixel();
-      for (size_t i = 0; i < pixel.size(); i++)
-        dptr[i] = static_cast<float>(static_cast<uint8_t>(pixel[i]));
-    } else {
-      memcpy(dptr, record.image().data().data(),
-          sizeof(float) * record.image().data_size());
-    }
-    for (int i = 0; i < mean_.count(); i++)
-      dptr[i] -= meandptr[i];
-    if (do_crop) {
-      int hoff = distribution(*generator);
-      int woff = distribution(*generator);
-      Shape<2> cropshape = Shape2(cropsize_, cropsize_);
-      if (do_mirror) {
-        croped_image = expr::crop(raw_image, cropshape, hoff, woff);
-        image = expr::mirror(croped_image);
-      } else {
-        image = expr::crop(raw_image, cropshape, hoff, woff);
-      }
-    } else if (do_mirror) {
-      image = expr::mirror(raw_image);
-    }
-    rid++;
-  }
-  if (scale_)
-    images = images * scale_;
-  FreeSpace(raw_image);
-  if (cropsize_)
-    FreeSpace(croped_image);
-}
-
-void RGBImageLayer::Setup(const LayerProto& proto,
-    const vector<Layer*>& srclayers) {
-  ParserLayer::Setup(proto, srclayers);
-  CHECK_EQ(srclayers.size(), 1);
-  scale_ = proto.rgbimage_conf().scale();
-  cropsize_ = proto.rgbimage_conf().cropsize();
-  mirror_ = proto.rgbimage_conf().mirror();
-  int batchsize = dynamic_cast<DataLayer*>(srclayers[0])->batchsize();
-  Record sample = dynamic_cast<DataLayer*>(srclayers[0])->sample();
-  vector<int> shape;
-  shape.push_back(batchsize);
-  for (int x : sample.image().shape()) {
-    shape.push_back(x);
-  }
-  CHECK_EQ(shape.size(), 4);
-  if (cropsize_) {
-    shape[2] = cropsize_;
-    shape[3] = cropsize_;
-  }
-  data_.Reshape(shape);
-  mean_.Reshape({shape[1], shape[2], shape[3]});
-  if (proto.rgbimage_conf().has_meanfile()) {
-    if (proto.rgbimage_conf().meanfile().find("binaryproto") != string::npos) {
-      CaffeBlob mean;
-      ReadProtoFromBinaryFile(proto.rgbimage_conf().meanfile().c_str(), &mean);
-      CHECK_EQ(mean_.count(), mean.data_size());
-      memcpy(mean_.mutable_cpu_data(), mean.data().data(),
-             sizeof(float)*mean.data_size());
-    } else {
-      SingleLabelImageRecord mean;
-      ReadProtoFromBinaryFile(proto.rgbimage_conf().meanfile().c_str(), &mean);
-      CHECK_EQ(mean_.count(), mean.data_size());
-      memcpy(mean_.mutable_cpu_data(), mean.data().data(),
-             sizeof(float)*mean.data_size());
-    }
-  } else {
-    memset(mean_.mutable_cpu_data(), 0, sizeof(float) * mean_.count());
-  }
-}
-
-/*************Label layer *************/
-
-void LabelLayer::Setup(const LayerProto& proto,
-    const vector<Layer*>& srclayers) {
-  Layer::Setup(proto, srclayers);
-  CHECK_EQ(srclayers.size(), 1);
-  int batchsize = dynamic_cast<DataLayer*>(srclayers[0])->batchsize();
-  data_.Reshape(vector<int>{batchsize});
-}
-
-void LabelLayer::ParseRecords(int flag, const vector<Record>& records,
-    Blob<float>* blob) {
-  int rid = 0;
-  float *label = blob->mutable_cpu_data();
-  for (const Record& record : records) {
-    label[rid++] = record.image().label();
-    // CHECK_LT(record.image().label(),10);
-  }
-  CHECK_EQ(rid, blob->shape()[0]);
-}
-}  // namespace singa
diff --git a/src/neuralnet/input_layer/image_preprocess.cc b/src/neuralnet/input_layer/image_preprocess.cc
deleted file mode 100644
index 6f2e094..0000000
--- a/src/neuralnet/input_layer/image_preprocess.cc
+++ /dev/null
@@ -1,78 +0,0 @@
-/************************************************************
-*
-* Licensed to the Apache Software Foundation (ASF) under one
-* or more contributor license agreements.  See the NOTICE file
-* distributed with this work for additional information
-* regarding copyright ownership.  The ASF licenses this file
-* to you under the Apache License, Version 2.0 (the
-* "License"); you may not use this file except in compliance
-* with the License.  You may obtain a copy of the License at
-*
-*   http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an
-* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-* KIND, either express or implied.  See the License for the
-* specific language governing permissions and limitations
-* under the License.
-*
-*************************************************************/
-
-#include "singa/neuralnet/input_layer.h"
-#include "singa/utils/image_transform.h"
-#include "singa/utils/context.h"
-#include "singa/utils/singleton.h"
-
-namespace singa {
-
-using std::vector;
-
-void ImagePreprocessLayer::Setup(const LayerProto& conf,
-    const vector<Layer*>& srclayers) {
-  CHECK_EQ(srclayers.size(), 1);
-  InputLayer::Setup(conf, srclayers);
-  scale_ = conf.rgbimage_conf().scale();
-  cropsize_ = conf.rgbimage_conf().cropsize();
-  mirror_ = conf.rgbimage_conf().mirror();
-  const auto& src = srclayers.at(0)->data(this);
-  const auto& shape = src.shape();
-  CHECK_EQ(shape.size(), 4);
-  CHECK_EQ(shape.at(2), shape.at(3));
-  if (cropsize_ && (cropsize_ != shape.at(2) || cropsize_ != shape.at(3))) {
-    data_.Reshape(vector<int>{shape.at(0), shape.at(1), cropsize_, cropsize_});
-  } else {
-    data_ = src;
-  }
-}
-
-void ImagePreprocessLayer::ComputeFeature(int flag,
-    const vector<Layer*>& srclayers) {
-  const auto& srcdata = srclayers.at(0)->data(this);
-  int batchsize = srcdata.shape(0), channel = srcdata.shape(1);
-  int height = srcdata.shape(2), width = srcdata.shape(3);
-  int srcimage_size = channel * height * width;
-  int image_size = channel * data_.shape(2) * data_.shape(3);
-  std::uniform_int_distribution<int> rand1(0, height - cropsize_);
-  std::uniform_int_distribution<int> rand2(0, width - cropsize_);
-  auto generator = Singleton<Context>::Instance()->rand_generator();
-
-  const float* srcdptr = srcdata.cpu_data();
-  float* dptr = data_.mutable_cpu_data();
-
-  for (int k = 0; k < batchsize; k++) {
-    int h_offset = 0, w_offset = 0;
-    if (cropsize_> 0 && (flag & kTrain)) {
-      h_offset = rand1(*generator);
-      w_offset = rand2(*generator);
-    }
-    bool do_mirror = mirror_
-                    && (rand1(*generator) % 2)
-                    && (flag & kTrain);
-    ImageTransform(srcdptr + k * srcimage_size, nullptr, do_mirror, cropsize_,
-        cropsize_, h_offset, w_offset, channel, height, width,
-        scale_, dptr + k * image_size);
-  }
-}
-
-}  // namespace singa
diff --git a/src/neuralnet/input_layer/onehot.cc b/src/neuralnet/input_layer/onehot.cc
deleted file mode 100644
index 4b83705..0000000
--- a/src/neuralnet/input_layer/onehot.cc
+++ /dev/null
@@ -1,40 +0,0 @@
-/************************************************************
-*
-* Licensed to the Apache Software Foundation (ASF) under one
-* or more contributor license agreements.  See the NOTICE file
-* distributed with this work for additional information
-* regarding copyright ownership.  The ASF licenses this file
-* to you under the Apache License, Version 2.0 (the
-* "License"); you may not use this file except in compliance
-* with the License.  You may obtain a copy of the License at
-*
-*   http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an
-* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-* KIND, either express or implied.  See the License for the
-* specific language governing permissions and limitations
-* under the License.
-*
-*************************************************************/
-#include "singa/neuralnet/input_layer.h"
-
-namespace singa {
-void OneHotLayer::Setup(const LayerProto& conf,
-    const vector<Layer*>& srclayers) {
-  InputLayer::Setup(conf, srclayers);
-  batchsize_ = srclayers.at(0)->data(unroll_index()).shape(0);
-  dim_ = conf.onehot_conf().vocab_size();
-  data_.Reshape(batchsize_, dim_);
-}
-
-void OneHotLayer::ComputeFeature(int flag, const vector<Layer*>& srclayers) {
-  float* ptr = data_.mutable_cpu_data();
-  memset(ptr, 0, sizeof(float) * data_.count());
-  const float* idx = srclayers[0]->data(unroll_index()).cpu_data();
-  for (int i = 0; i < batchsize_; i++) {
-    ptr[i * dim_ + static_cast<int>(idx[i])] = 1;
-  }
-}
-}  // namespace singa
diff --git a/src/neuralnet/input_layer/record.cc b/src/neuralnet/input_layer/record.cc
deleted file mode 100644
index b14fc80..0000000
--- a/src/neuralnet/input_layer/record.cc
+++ /dev/null
@@ -1,73 +0,0 @@
-/************************************************************
-*
-* Licensed to the Apache Software Foundation (ASF) under one
-* or more contributor license agreements.  See the NOTICE file
-* distributed with this work for additional information
-* regarding copyright ownership.  The ASF licenses this file
-* to you under the Apache License, Version 2.0 (the
-* "License"); you may not use this file except in compliance
-* with the License.  You may obtain a copy of the License at
-*
-*   http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an
-* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-* KIND, either express or implied.  See the License for the
-* specific language governing permissions and limitations
-* under the License.
-*
-*************************************************************/
-
-#include "singa/neuralnet/input_layer.h"
-namespace singa {
-
-using std::string;
-using std::vector;
-
-void RecordInputLayer::Setup(const LayerProto& conf,
-    const vector<Layer*>& srclayers) {
-  SingleLabelRecordLayer::Setup(conf, srclayers);
-  encoded_ = conf.store_conf().encoded();
-}
-
-void RecordInputLayer::LoadRecord(const string& backend,
-    const string& path, Blob<float>* to) {
-  io::Store* store = io::OpenStore(backend, path, io::kRead);
-  string key, val;
-  CHECK(store->Read(&key, &val));
-  RecordProto image;
-  image.ParseFromString(val);
-  CHECK_EQ(to->count(), image.data_size());
-  float* ptr = to->mutable_cpu_data();
-  for (int i = 0; i< to->count(); i++)
-    ptr[i] = image.data(i);
-  delete store;
-}
-
-bool RecordInputLayer::Parse(int k, int flag, const string& key,
-    const string& value) {
-  RecordProto image;
-  image.ParseFromString(value);
-  int size = data_.count() / batchsize_;
-  if (image.data_size()) {
-    CHECK_EQ(size, image.data_size());
-    float* ptr = data_.mutable_cpu_data() + k * size;
-    for (int i = 0; i< size; i++)
-      ptr[i] = image.data(i);
-  } else if (image.pixel().size()) {
-    CHECK_EQ(size, image.pixel().size());
-    float* ptr = data_.mutable_cpu_data() + k * size;
-    string pixel = image.pixel();
-    for (int i = 0; i < size; i++)
-      ptr[i] =  static_cast<float>(static_cast<uint8_t>(pixel[i]));
-  } else {
-    LOG(ERROR) << "not pixel nor pixel";
-  }
-  if ((flag & kDeploy) == 0) {  // deploy mode does not have label
-    aux_data_.at(k) = image.label();
-  }
-  return true;
-}
-
-}  // namespace singa
diff --git a/src/neuralnet/input_layer/rnn_label.cc b/src/neuralnet/input_layer/rnn_label.cc
deleted file mode 100644
index 4924d87..0000000
--- a/src/neuralnet/input_layer/rnn_label.cc
+++ /dev/null
@@ -1,35 +0,0 @@
-/************************************************************
-*
-* Licensed to the Apache Software Foundation (ASF) under one
-* or more contributor license agreements.  See the NOTICE file
-* distributed with this work for additional information
-* regarding copyright ownership.  The ASF licenses this file
-* to you under the Apache License, Version 2.0 (the
-* "License"); you may not use this file except in compliance
-* with the License.  You may obtain a copy of the License at
-*
-*   http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an
-* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-* KIND, either express or implied.  See the License for the
-* specific language governing permissions and limitations
-* under the License.
-*
-*************************************************************/
-
-#include "singa/neuralnet/input_layer.h"
-namespace singa {
-void RNNLabelLayer::Setup(const LayerProto& proto,
-    const vector<Layer*>& srclayers) {
-  InputLayer::Setup(proto, srclayers);
-  aux_data_.resize(srclayers[0]->data(unroll_index() + 1).shape(0));
-}
-void RNNLabelLayer::ComputeFeature(int flag, const vector<Layer*>& srclayers) {
-  const float* input = srclayers[0]->data(unroll_index() + 1).cpu_data();
-  for (unsigned i = 0; i < aux_data_.size(); i++) {
-    aux_data_[i] = static_cast<int>(input[i]);
-  }
-}
-}  // namespace singa
diff --git a/src/neuralnet/input_layer/store.cc b/src/neuralnet/input_layer/store.cc
deleted file mode 100644
index 32f1887..0000000
--- a/src/neuralnet/input_layer/store.cc
+++ /dev/null
@@ -1,162 +0,0 @@
-/************************************************************
-*
-* Licensed to the Apache Software Foundation (ASF) under one
-* or more contributor license agreements.  See the NOTICE file
-* distributed with this work for additional information
-* regarding copyright ownership.  The ASF licenses this file
-* to you under the Apache License, Version 2.0 (the
-* "License"); you may not use this file except in compliance
-* with the License.  You may obtain a copy of the License at
-*
-*   http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an
-* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-* KIND, either express or implied.  See the License for the
-* specific language governing permissions and limitations
-* under the License.
-*
-*************************************************************/
-
-#include "singa/neuralnet/input_layer.h"
-#include "singa/utils/context.h"
-#include "singa/utils/singleton.h"
-namespace singa {
-
-using std::thread;
-
-StoreInputLayer::~StoreInputLayer() {
-  if (thread_ != nullptr) {
-    thread_->join();
-    delete thread_;
-  }
-  if (store_ != nullptr) {
-    delete store_;
-  }
-}
-
-void StoreInputLayer::Setup(const LayerProto& conf,
-    const vector<Layer*>& srclayers) {
-  InputLayer::Setup(conf, srclayers);
-  const auto& batchsize = conf.store_conf().batchsize();
-  CHECK(batchsize.size());
-  if (conf.partition_dim() == 0) {
-    if (batchsize.size() == 1)  // equal partition
-      batchsize_ = batchsize.Get(0) / conf.num_partitions();
-    else  // manual partition
-      batchsize_ = batchsize.Get(conf.partition_id());
-  } else {
-    batchsize_ = conf.store_conf().batchsize(0);
-  }
-
-  vector<int> shape {batchsize_};
-  for (int s : conf.store_conf().shape())
-    shape.push_back(s);
-  data_.Reshape(shape);
-  aux_data_.resize(batchsize_);
-}
-
-void StoreInputLayer::fetch_data() {
-  if (store_ == nullptr) {
-    store_ = io::OpenStore(layer_conf_.store_conf().backend(),
-        layer_conf_.store_conf().path(),
-        io::kRead);
-    if (layer_conf_.store_conf().random_skip() > 0) {
-      std::uniform_int_distribution<int>
-        distribution(0, layer_conf_.store_conf().random_skip());
-      auto generator = Singleton<Context>::Instance()->rand_generator(
-          std::this_thread::get_id());
-      random_skip_ = distribution(*generator);
-    }
-
-    string key, val;
-    while (random_skip_ > 0) {
-      if (!store_->Read(&key, &val)) {
-        store_->SeekToFirst();
-        CHECK(store_->Read(&key, &val));
-      }
-      random_skip_--;
-    }
-    buf_keys_.resize(batchsize_);
-    buf_vals_.resize(batchsize_);
-  }
-  for (int k = 0; k < batchsize_; k++) {
-    if (!store_->Read(&buf_keys_[k], &buf_vals_[k])) {
-      store_->SeekToFirst();
-      CHECK(store_->Read(&buf_keys_[k], &buf_vals_[k]));
-    }
-  }
-}
-
-void StoreInputLayer::ComputeFeature(int flag,
-    const vector<Layer*>& srclayers) {
-
-  // if prefetching, wait for the thread to finish
-  if (layer_conf_.store_conf().prefetching()) {
-    if (thread_ == nullptr) {
-      thread_ = new thread(&StoreInputLayer::fetch_data, this);
-    }
-    thread_->join();
-    delete thread_;
-    thread_ = nullptr;
-  } else {
-    fetch_data();
-  }
-  for (int k = 0; k < batchsize_; k++)
-    Parse(k, flag, buf_keys_[k], buf_vals_[k]);
-  if (layer_conf_.store_conf().prefetching())
-    thread_ = new thread(&StoreInputLayer::fetch_data, this);
-}
-
-void SingleLabelRecordLayer::Setup(const LayerProto& conf,
-    const vector<Layer*>& srclayers) {
-  StoreInputLayer::Setup(conf, srclayers);
-}
-
-void SingleLabelRecordLayer::ComputeFeature(int flag,
-    const vector<Layer*>& srclayers) {
-
-  StoreInputLayer::ComputeFeature(flag, srclayers);
-  auto& store_conf = layer_conf_.store_conf();
-
-  if (store_conf.has_mean_file() && mean_.count() == 0) {
-    mean_.Reshape(vector<int>{data_.count() / batchsize_});
-    LoadRecord(store_conf.backend(), store_conf.mean_file(), &mean_);
-  } else if (store_conf.has_mean_value() && mean_.count() == 0) {
-    mean_.Reshape(vector<int>{data_.count() / batchsize_});
-    for (int i = 0; i < data_.count() / batchsize_; i++)
-      mean_.mutable_cpu_data()[i] = store_conf.mean_value();
-  }
-  if (store_conf.has_std_file() && std_.count() == 0) {
-    std_.Reshape(vector<int>{data_.count() / batchsize_});
-    LoadRecord(store_conf.backend(), store_conf.std_file(), &std_);
-    // TODO(wangwei) check std[i] != 0
-  } else if (store_conf.has_std_value() && std_.count() == 0) {
-    std_.Reshape(vector<int>{data_.count() / batchsize_});
-    CHECK_NE(store_conf.std_value(), 0);
-    for (int i = 0; i < data_.count() / batchsize_; i++)
-      std_.mutable_cpu_data()[i] = store_conf.std_value();
-  }
-
-  if (mean_.count()) {
-    const float* mean = mean_.cpu_data();
-    for (int k = 0; k < batchsize_; k++) {
-      float* dptr = data_.mutable_cpu_data() + k * mean_.count();
-      for (int i = 0; i < mean_.count(); i++) {
-        dptr[i] -= mean[i];
-      }
-    }
-  }
-  if (std_.count()) {
-    const float* std = std_.cpu_data();
-    for (int k = 0; k < batchsize_; k++) {
-      float* dptr = data_.mutable_cpu_data() + k * std_.count();
-      for (int i = 0; i < std_.count(); i++) {
-        dptr[i] /= std[i];
-      }
-    }
-  }
-}
-
-}  // namespace singa
diff --git a/src/neuralnet/layer.cc b/src/neuralnet/layer.cc
deleted file mode 100644
index ef1629f..0000000
--- a/src/neuralnet/layer.cc
+++ /dev/null
@@ -1,82 +0,0 @@
-/************************************************************
-*
-* Licensed to the Apache Software Foundation (ASF) under one
-* or more contributor license agreements.  See the NOTICE file
-* distributed with this work for additional information
-* regarding copyright ownership.  The ASF licenses this file
-* to you under the Apache License, Version 2.0 (the
-* "License"); you may not use this file except in compliance
-* with the License.  You may obtain a copy of the License at
-*
-*   http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an
-* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-* KIND, either express or implied.  See the License for the
-* specific language governing permissions and limitations
-* under the License.
-*
-*************************************************************/
-
-#include "singa/worker.h"
-#include "singa/neuralnet/layer.h"
-#include "singa/neuralnet/input_layer.h"
-#include "singa/neuralnet/neuron_layer.h"
-#include "singa/neuralnet/loss_layer.h"
-
-#include <cblas.h>
-#include <glog/logging.h>
-#include <math.h>
-#include <cfloat>
-#include "singa/utils/factory.h"
-#include "singa/utils/singleton.h"
-#include "singa/utils/math_blob.h"
-
-namespace singa {
-
-using std::string;
-
-void Layer::SetupLayer(Layer* layer, const string str, const vector<Layer*>& srclayers) {
-  LayerProto layer_conf;
-  layer_conf.ParseFromString(str);
-  layer->Setup(layer_conf, srclayers);
-  for (auto param : layer->GetParams())
-      param->InitValues();
-}
-
-Layer* Layer::CreateLayer(const string str) {
-  LayerProto layer_conf;
-  layer_conf.ParseFromString(str);
-  return Layer::Create(layer_conf);
-}
-
-Layer* Layer::Create(const LayerProto& proto) {
-  auto* factory = Singleton<Factory<Layer>>::Instance();
-  Layer* layer = nullptr;
-  if (proto.has_user_type())
-    layer = factory->Create(proto.user_type());
-  else
-    layer = factory->Create(proto.type());
-  return layer;
-}
-
-const std::string Layer::ToString(bool debug, int flag) {
-  if (!debug)
-    return "";
-  string ret = "";
-  if ((flag & kForward) == kForward && data_.count() !=0) {
-    ret += StringPrintf("data:%e ", Asum(data_));
-    for (Param* p : GetParams())
-      ret += StringPrintf("%s:%13.9f ",
-          p->name().c_str(), Asum(p->data()));
-  }
-  if ((flag & kBackward) == kBackward && grad_.count() != 0) {
-    ret += StringPrintf("grad:%e ", Asum(grad_));
-    for (Param* p : GetParams())
-      ret += StringPrintf("%s:%13.9f ",
-          p->name().c_str(), Asum(p->grad()));
-  }
-  return ret;
-}
-}  // namespace singa
diff --git a/src/neuralnet/loss_layer/cudnn_softmaxloss.cc b/src/neuralnet/loss_layer/cudnn_softmaxloss.cc
deleted file mode 100644
index 0d4ba45..0000000
--- a/src/neuralnet/loss_layer/cudnn_softmaxloss.cc
+++ /dev/null
@@ -1,83 +0,0 @@
-/************************************************************
-*
-* Licensed to the Apache Software Foundation (ASF) under one
-* or more contributor license agreements.  See the NOTICE file
-* distributed with this work for additional information
-* regarding copyright ownership.  The ASF licenses this file
-* to you under the Apache License, Version 2.0 (the
-* "License"); you may not use this file except in compliance
-* with the License.  You may obtain a copy of the License at
-*
-*   http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an
-* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-* KIND, either express or implied.  See the License for the
-* specific language governing permissions and limitations
-* under the License.
-*
-*************************************************************/
-
-#include "singa/neuralnet/loss_layer.h"
-#include "singa/utils/blob.h"
-#include "singa/utils/math_blob.h"
-#include "singa/utils/math_kernel.h"
-
-namespace singa {
-void CudnnSoftmaxLossLayer::Setup(const LayerProto& conf,
-    const vector<Layer*>& srclayers) {
-  LossLayer::Setup(conf, srclayers);
-  softmax_.Setup(conf, vector<Layer*> {srclayers.at(0)});
-  data_.Reshape(softmax_.data(this).shape());
-  data_.ShareData(softmax_.mutable_data(this), false);
-  batchsize_ = data_.shape(0);
-  dim_ = data_.count() / batchsize_;
-}
-void CudnnSoftmaxLossLayer::ComputeFeature(int flag,
-    const vector<Layer*>& srclayers) {
-  softmax_.ComputeFeature(flag, srclayers);
-  Blob<int> label(batchsize_);
-  int *labelptr = label.mutable_cpu_data();
-  // aux_data: vector<int>, convert vector to int array.
-  for (int i = 0; i < batchsize_; ++i) {
-    labelptr[i] = srclayers[1]->aux_data(this)[i];
-  }
-
-  Blob<float> loss(batchsize_);
-  singa_gpu_softmaxloss_forward(batchsize_, dim_, data_.gpu_data(),
-      label.gpu_data(), loss.mutable_gpu_data());
-  loss_ += Asum(loss);
-  counter_++;
-}
-
-void CudnnSoftmaxLossLayer::ComputeGradient(int flag,
-    const vector<Layer*>& srclayers) {
-  Blob<float>* gsrcblob = srclayers[0]->mutable_grad(this);
-  Copy(data_, gsrcblob);
-  // gsrcblob->CopyFrom(data_);
-  float* gsrcptr = gsrcblob->mutable_gpu_data();
-
-  Blob<int> label(batchsize_);
-  int *labelptr = label.mutable_cpu_data();
-
-  // aux_data: vector<int>, convert vector to int array.
-  for (int i = 0; i < batchsize_; ++i) {
-    labelptr[i] = srclayers[1]->aux_data(this)[i];
-  }
-
-  singa_gpu_softmaxloss_backward(batchsize_, dim_, 1.0f, label.gpu_data(),
-      gsrcptr);
-  Scale(1.0f / batchsize_, gsrcblob);
-}
-
-const std::string CudnnSoftmaxLossLayer::ToString(bool debug, int flag) {
-  if (debug)
-    return Layer::ToString(debug, flag);
-
-  string disp = "Loss = " + std::to_string(loss_ / counter_);
-  counter_ = 0;
-  loss_ = 0;
-  return disp;
-}
-}  // namespace singa
diff --git a/src/neuralnet/loss_layer/euclidean.cc b/src/neuralnet/loss_layer/euclidean.cc
deleted file mode 100644
index 67c0cd5..0000000
--- a/src/neuralnet/loss_layer/euclidean.cc
+++ /dev/null
@@ -1,80 +0,0 @@
-/************************************************************
-*
-* Licensed to the Apache Software Foundation (ASF) under one
-* or more contributor license agreements.  See the NOTICE file
-* distributed with this work for additional information
-* regarding copyright ownership.  The ASF licenses this file
-* to you under the Apache License, Version 2.0 (the
-* "License"); you may not use this file except in compliance
-* with the License.  You may obtain a copy of the License at
-*
-*   http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an
-* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-* KIND, either express or implied.  See the License for the
-* specific language governing permissions and limitations
-* under the License.
-*
-*************************************************************/
-
-#include <glog/logging.h>
-#include "singa/neuralnet/loss_layer.h"
-#include "mshadow/tensor.h"
-
-namespace singa {
-
-using namespace mshadow;
-using mshadow::cpu;
-
-using mshadow::Shape1;
-using mshadow::Tensor;
-
-using std::vector;
-
-void EuclideanLossLayer::Setup(const LayerProto& conf,
-    const vector<Layer*>& srclayers) {
-  CHECK_EQ(srclayers.size(), 2);
-  Layer::Setup(conf, srclayers);
-}
-
-void EuclideanLossLayer::ComputeFeature(int flag,
-    const vector<Layer*>& srclayers) {
-  int count = srclayers[0]->data(this).count();
-  CHECK_EQ(count, srclayers[1]->data(this).count());
-  const float* reconstruct_dptr = srclayers[0]->data(this).cpu_data();
-  const float* input_dptr = srclayers[1]->data(this).cpu_data();
-  float loss = 0;
-  for (int i = 0; i < count; i++) {
-      loss += (input_dptr[i] - reconstruct_dptr[i]) *
-        (input_dptr[i] - reconstruct_dptr[i]);
-  }
-  loss_ += loss / srclayers[0]->data(this).shape()[0];
-  counter_++;
-}
-
-void EuclideanLossLayer::ComputeGradient(int flag,
-    const vector<Layer*>& srclayers) {
-  int count = srclayers[0]->data(this).count();
-  CHECK_EQ(count, srclayers[1]->data(this).count());
-  const float* reconstruct_dptr = srclayers[0]->data(this).cpu_data();
-  const float* input_dptr = srclayers[1]->data(this).cpu_data();
-  Blob<float>* gsrcblob = srclayers[0]->mutable_grad(this);
-  float* gsrcptr = gsrcblob->mutable_cpu_data();
-  for (int i = 0; i < count; i++) {
-    gsrcptr[i] = reconstruct_dptr[i]-input_dptr[i];
-  }
-  Tensor<cpu, 1> gsrc(gsrcptr, Shape1(gsrcblob->count()));
-  gsrc /= srclayers[0]->data(this).shape()[0];
-}
-const std::string EuclideanLossLayer::ToString(bool debug, int flag) {
-  if (debug)
-    return Layer::ToString(debug, flag);
-
-  string disp = "Loss = " + std::to_string(loss_ / counter_);
-  counter_ = 0;
-  loss_ = 0;
-  return disp;
-}
-}  // namespace singa
diff --git a/src/neuralnet/loss_layer/softmax.cc b/src/neuralnet/loss_layer/softmax.cc
deleted file mode 100644
index 9d0cb1d..0000000
--- a/src/neuralnet/loss_layer/softmax.cc
+++ /dev/null
@@ -1,112 +0,0 @@
-/************************************************************
-*
-* Licensed to the Apache Software Foundation (ASF) under one
-* or more contributor license agreements.  See the NOTICE file
-* distributed with this work for additional information
-* regarding copyright ownership.  The ASF licenses this file
-* to you under the Apache License, Version 2.0 (the
-* "License"); you may not use this file except in compliance
-* with the License.  You may obtain a copy of the License at
-*
-*   http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an
-* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-* KIND, either express or implied.  See the License for the
-* specific language governing permissions and limitations
-* under the License.
-*
-*************************************************************/
-
-
-#include <glog/logging.h>
-#include <algorithm>
-#include "singa/neuralnet/loss_layer.h"
-#include "mshadow/tensor.h"
-#include "singa/utils/math_blob.h"
-
-namespace singa {
-
-using namespace mshadow;
-using mshadow::cpu;
-
-using mshadow::Shape;
-using mshadow::Shape1;
-using mshadow::Shape2;
-using mshadow::Tensor;
-
-using std::vector;
-
-void SoftmaxLossLayer::Setup(const LayerProto& proto,
-    const vector<Layer*>& srclayers) {
-  CHECK_EQ(srclayers.size(), 2);
-  LossLayer::Setup(proto, srclayers);
-  data_.Reshape(srclayers[0]->data(this).shape());
-  batchsize_ = data_.shape()[0];
-  dim_ = data_.count() / batchsize_;
-  topk_ = proto.softmaxloss_conf().topk();
-  scale_ = proto.softmaxloss_conf().scale();
-}
-
-void SoftmaxLossLayer::ComputeFeature(int flag,
-    const vector<Layer*>& srclayers) {
-  Shape<2> s = Shape2(batchsize_, dim_);
-  Tensor<cpu, 2> prob(data_.mutable_cpu_data(), s);
-  Tensor<cpu, 2> src(srclayers[0]->mutable_data(this)->mutable_cpu_data(), s);
-  Softmax(prob, src);
-  const auto& label = srclayers[1]->aux_data(this);
-  const float* probptr = prob.dptr;
-  float loss = 0, precision = 0;
-  for (int n = 0; n < batchsize_; n++) {
-    int ilabel = static_cast<int>(label[n]);
-    //  CHECK_LT(ilabel,10);
-    CHECK_GE(ilabel, 0);
-    float prob_of_truth = probptr[ilabel];
-    loss -= log(std::max(prob_of_truth, FLT_MIN));
-    vector<std::pair<float, int> > probvec;
-    for (int j = 0; j < dim_; ++j) {
-      probvec.push_back(std::make_pair(probptr[j], j));
-    }
-    std::partial_sort(probvec.begin(), probvec.begin() + topk_, probvec.end(),
-                      std::greater<std::pair<float, int> >());
-    // check if true label is in top k predictions
-    for (int k = 0; k < topk_; k++) {
-      if (probvec[k].second == static_cast<int>(label[n])) {
-        precision++;
-        break;
-      }
-    }
-    probptr += dim_;
-  }
-  CHECK_EQ(probptr, prob.dptr + prob.shape.Size());
-  loss_ += loss * scale_ / (1.0f * batchsize_);
-  accuracy_ += precision * scale_ / (1.0f * batchsize_);
-  counter_++;
-}
-
-void SoftmaxLossLayer::ComputeGradient(int flag,
-    const vector<Layer*>& srclayers) {
-  const auto& label = srclayers[1]->aux_data();
-  Blob<float>* gsrcblob = srclayers[0]->mutable_grad(this);
-  Copy(data_, gsrcblob);
-//  gsrcblob->CopyFrom(data_);
-  float* gsrcptr = gsrcblob->mutable_cpu_data();
-  for (int n = 0; n < batchsize_; n++) {
-    gsrcptr[n*dim_ + static_cast<int>(label[n])] -= 1.0f;
-  }
-  Tensor<cpu, 1> gsrc(gsrcptr, Shape1(gsrcblob->count()));
-  gsrc *= scale_ / (1.0f * batchsize_);
-}
-
-const std::string SoftmaxLossLayer::ToString(bool debug, int flag) {
-  if (debug)
-    return Layer::ToString(debug, flag);
-
-  string disp = "Loss = " + std::to_string(loss_ / counter_)
-    + ", accuracy = " + std::to_string(accuracy_ / counter_);
-  counter_ = 0;
-  loss_ = accuracy_ = 0;
-  return disp;
-}
-}  // namespace singa
diff --git a/src/neuralnet/neuralnet.cc b/src/neuralnet/neuralnet.cc
deleted file mode 100644
index b045e06..0000000
--- a/src/neuralnet/neuralnet.cc
+++ /dev/null
@@ -1,644 +0,0 @@
-/************************************************************
-*
-* Licensed to the Apache Software Foundation (ASF) under one
-* or more contributor license agreements.  See the NOTICE file
-* distributed with this work for additional information
-* regarding copyright ownership.  The ASF licenses this file
-* to you under the Apache License, Version 2.0 (the
-* "License"); you may not use this file except in compliance
-* with the License.  You may obtain a copy of the License at
-*
-*   http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an
-* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-* KIND, either express or implied.  See the License for the
-* specific language governing permissions and limitations
-* under the License.
-*
-*************************************************************/
-
-
-#include "singa/neuralnet/neuralnet.h"
-#include <unordered_map>
-#include <algorithm>
-#include <queue>
-#include "singa/utils/singleton.h"
-
-namespace singa {
-
-using std::map;
-using std::string;
-using std::vector;
-
-/**
- * Check user defined net config and make some preprocessing, e.g., assing names
- * to params.
- * TODO(wnagwei) implement the following functions.
- * 1. layer and paramname should not include '@', '+' and '#'. '@<suffix>'
- * is used for identifying layer location/partition. '<prefix>#' is used for
- * identifying the unrolled Param in RNN models.
- * 2. assign names to unnamed Param, e.g., p<param_id>+<layer_name>.
- */
-const NetProto NetConfPreprocess(const NetProto& conf) {
-  /*
-  string param_name = "$";
-  // if user does not name the param, then name it based on layer name.
-  if (param->name() == "") {
-    param->set_name(layer->name() + param_name);
-    param_name += "$";
-  }
-  */
-  NetProto proto = conf;
-  for (int i = 0; i < proto.layer_size(); i++) {
-    if (!proto.layer(i).has_unroll_len())
-      proto.mutable_layer(i)->set_unroll_len(proto.unroll_len());
-  }
-  return proto;
-}
-
-NeuralNet* NeuralNet::Create(const NetProto& net_conf, Phase phase,
-    int npartitions) {
-  const NetProto& full_net_conf = NetConfPreprocess(net_conf);
-  NetProto conf = full_net_conf;
-  conf.clear_layer();
-  // flag=0: neither exclude nor include field appears
-  // flag=1: exclude field appears
-  // flag=2: include field appears
-  int flag = 0;
-  // exclude layers according to phase
-  // exclude field is deprecated
-  // please use include field instead
-  for (const auto& layer : full_net_conf.layer()) {
-    bool include = true;
-    for (auto p : layer.exclude()) {
-      // check whether both exclude and include field
-      // appear in the same .conf file
-      CHECK(flag == 0 || flag == 1) << "Don't use include and exclude together";
-      if (p == phase)
-        include = false;
-      flag = 1;
-    }
-    // neural net only include the specified layer in the include field
-    for (auto p : layer.include()) {
-      CHECK(flag == 0 || flag == 2) << "Don't use include and exclude together";
-      if (p == phase) {
-        include = true;
-        break;
-      }
-      include = false;
-      flag = 2;
-    }
-    if (include == false) continue;
-    LayerProto* layer_conf = conf.add_layer();
-    layer_conf->CopyFrom(layer);
-    // using net partition if layer partition is not set
-    if (!layer_conf->has_partition_dim())
-      layer_conf->set_partition_dim(net_conf.partition_dim());
-  }
-  // LOG(INFO) << "Before unrolling: \n" << conf.DebugString();
-  conf = Unrolling(conf);
-
-  // Copy shared parameters for sharing param conf
-  std::vector<ParamProto*> shares;
-  std::unordered_map<string, ParamProto*> name2param;
-  for (int index = 0; index < conf.layer_size(); index++) {
-    LayerProto* layer = conf.mutable_layer(index);
-    for (int i = 0; i < layer->param_size(); i++) {
-      ParamProto* param = layer->mutable_param(i);
-      CHECK(name2param.find(param->name()) == name2param.end())
-        << "Repeated param = " << param->name();
-      name2param[param->name()] = param;
-      if (param->has_share_from() && param->share_from() != "")
-        shares.push_back(param);
-    }
-  }
-  for (auto param : shares) {
-    const std::string from = param->share_from();
-    const std::string name = param->name();
-    CHECK(name2param.find(from) != name2param.end())
-      << "can't find share_from = " << from;
-    // CopyFrom will overwrite the name and share_from fields
-    param->CopyFrom(*name2param.at(from));
-    param->set_name(name);
-    param->set_share_from(from);
-  }
-  LOG(INFO) << "Initial NeuralNet Config is\n" << conf.DebugString();
-  // TODO(wangwei) create net based on net type, e.g., directed, undirected.
-  return new NeuralNet(conf, npartitions);
-}
-
-const NetProto NeuralNet::Unrolling(const NetProto& net_conf) {
-  // Step 1: Unroll each layer & set parameter sharing
-  NetProto conf;
-
-  std::vector<std::vector<int>> layer_groups;
-  std::unordered_map<string, int> org_layer_names;
-  for (int index = 0; index < net_conf.layer_size(); index ++) {
-    const LayerProto& org_layer = net_conf.layer(index);
-    org_layer_names[org_layer.name()] = index;  // layer_name -> index
-
-    std::vector<int> layer_group;
-    for (int i = 0; i < org_layer.unroll_len(); i ++) {  // unroll
-      LayerProto* unroll_layer = conf.add_layer();
-      unroll_layer->CopyFrom(org_layer);  // create a new layer conf
-      // update layer names
-      std::stringstream sstm;
-      sstm << i << '#' << unroll_layer->name();
-      unroll_layer->set_name(sstm.str());
-      unroll_layer->set_unroll_index(i);
-      // update layer parameter sharing
-      for (int j = 0; j < unroll_layer->param_size(); j ++) {
-        ParamProto* param = unroll_layer->mutable_param(j);
-        if (i > 0) {
-          param->set_share_from("0#" + param->name());
-        }
-        std::stringstream sstm1;
-        sstm1 << i << '#' << param->name();
-        param->set_name(sstm1.str());
-      }
-      // clear unrolling related fields
-      unroll_layer->clear_unroll_len();
-      unroll_layer->clear_unroll_conn_type();
-      unroll_layer->clear_shift();
-      unroll_layer->clear_srclayers();
-
-      layer_group.push_back(conf.layer_size() - 1);
-      // LOG(ERROR) << "unrolling layer " << unroll_layer->name();
-    }
-    layer_groups.push_back(layer_group);
-  }
-  // Step 2: Connect unrolled layers by setting `srclayers`
-  for (int index = 0; index < net_conf.layer_size(); index ++) {
-    const LayerProto& org_layer = net_conf.layer(index);
-    if (org_layer.srclayers_size() == 0)
-      continue;   // no src layer
-    for (int i = 0; i < org_layer.srclayers_size(); i ++) {
-      const string& org_layer_src = org_layer.srclayers(i);
-      singa::UnrollConnType unroll_conn_type = kUnrollOneToOne;
-      if (i < org_layer.unroll_conn_type_size())
-        unroll_conn_type = org_layer.unroll_conn_type(i);
-      unsigned int shift = 0;
-      if (i < org_layer.shift_size())
-        shift = org_layer.shift(i);
-
-      const std::vector<int> unroll_layer_srcs
-        = layer_groups[org_layer_names[org_layer_src]];
-
-      for (unsigned int j = 0; j < layer_groups[index].size(); j ++) {
-        LayerProto* unroll_layer = conf.mutable_layer(layer_groups[index][j]);
-        // Update src layers of `unroll_layer` by considering the types
-        if (unroll_conn_type == kUnrollOneToAll) {
-          for (int unroll_layer_src : unroll_layer_srcs) {
-            unroll_layer->add_srclayers(conf.layer(unroll_layer_src).name());
-          }
-        } else if (unroll_conn_type == kUnrollOneToOne) {
-          if (j < shift) continue;  // no need to connect with the src
-          int unroll_layer_src = unroll_layer_srcs[j - shift];
-          unroll_layer->add_srclayers(conf.layer(unroll_layer_src).name());
-        } else if (unroll_conn_type == kUnrollFirstToLast) {
-          if (j > 0) break;
-          int unroll_layer_src =
-            unroll_layer_srcs[unroll_layer_srcs.size() - 1];
-          unroll_layer->add_srclayers(conf.layer(unroll_layer_src).name());
-        }
-      }
-    }
-
-    // TODO(fanju): add LSTM when it is ready
-    if (org_layer.type() == kGRU) {  // connect GRU layers
-      for (unsigned int j = 1; j < layer_groups[index].size(); j ++) {
-        LayerProto* unroll_layer = conf.mutable_layer(layer_groups[index][j]);
-        string srcname = conf.layer(layer_groups[index][j-1]).name();
-        unroll_layer->add_srclayers(srcname);
-      }
-    }
-  }
-  return conf;
-}
-
-
-NeuralNet::NeuralNet(NetProto netproto, int npartitions) {
-  LOG(INFO) << "Constructing NeuralNet...";
-  auto graph = CreateGraph(netproto, npartitions);
-  CreateNetFromGraph(graph);
-  PrepareDataStructures();
-
-  for (Node* node : graph->nodes())
-    delete static_cast<LayerProto*>(node->proto);
-  delete graph;
-  LOG(INFO) << "NeuralNet Constructed";
-  unroll_len_ = netproto.unroll_len();
-}
-
-NeuralNet::~NeuralNet() {
-  for (auto layer : layers_)
-    delete layer;
-}
-void NeuralNet::Load(const vector<string>& paths) {
-  unordered_map<string, Param*> params;
-  for (auto p : params_) {
-    params[p->name()] = p;
-  }
-  Load(paths, params);
-}
-void NeuralNet::Load(const vector<string>& paths,
-    const unordered_map<string, Param*>& params) {
-  for (const auto path : paths) {
-    LOG(ERROR) << "Load from checkpoint file " << path;
-    BlobProtos bps;
-    // TODO(wangwei) extend to read checkpoint from HDFS
-    ReadProtoFromBinaryFile(path.c_str(), &bps);
-    for (int i = 0; i < bps.name_size(); i++) {
-      if (params.find(bps.name(i)) != params.end()) {
-        // LOG(ERROR) << "Loading param = " << bps.name(i);
-        params.at(bps.name(i))->FromProto(bps.blob(i));
-        params.at(bps.name(i))->set_version(bps.version(i));
-      }
-    }
-  }
-}
-
-void NeuralNet::ShareParamsFrom(NeuralNet* other, bool cpu_only) {
-  for (auto& layer : layers_) {
-    auto otherlayer = other->name2layer(layer->name());
-    if (otherlayer != nullptr) {
-      const auto& otherparams = otherlayer->GetParams();
-      const auto& params = layer->GetParams();
-      CHECK_EQ(params.size(), otherparams.size());
-      for (size_t i = 0; i < params.size(); i++) {
-        params[i]->ShareDataFrom(otherparams[i], cpu_only);
-      }
-    }
-  }
-}
-
-// name of connection layers
-string splitName(const string& layer) { return "split("+layer+")"; }
-string sliceName(const string& layer) { return "slice("+layer+")"; }
-string concateName(const string& layer) { return "concate("+layer+")"; }
-string bridgeName(const string& src, const string& dst) { return src+"->"+dst; }
-string bridgeSrcName(const string& src, const string& dst) {
-  return "bridge_src("+bridgeName(src, dst)+")";
-}
-string bridgeDstName(const string& src, const string& dst) {
-  return "bridge_dst("+bridgeName(src, dst)+")";
-}
-
-ConnectionType dstLayerConnection(const LayerProto& proto) {
-  auto layer = Layer::Create(proto);
-  auto ret = layer->dst_layer_connection();
-  delete layer;
-  return ret;
-}
-
-ConnectionType srcNeuronConnection(const LayerProto& proto) {
-  auto layer = Layer::Create(proto);
-  auto ret = layer->src_neuron_connection(0);
-  delete layer;
-  return ret;
-}
-
-NetProto NeuralNet::AddModelSplitLayers(const NetProto& netproto) {
-  NetProto net_w_split;
-  net_w_split.CopyFrom(netproto);
-  net_w_split.clear_layer();
-  // calculate number of dst-layers for each layer
-  map<string, int> dst_count;
-  for (const LayerProto& layer : netproto.layer())
-    for (const string& src_name : layer.srclayers())
-      ++dst_count[src_name];
-  // tag to add split layer if:
-  // dst_count[] > 1 && dst_layer_connection() = OneToOne
-  for (const LayerProto& layer : netproto.layer())
-    if ((dst_count[layer.name()] > 1 && dstLayerConnection(layer) == kOneToOne))
-        dst_count[layer.name()] = -dst_count[layer.name()];
-  // add orginal layers and adjust srclayers
-  for (const LayerProto& layer : netproto.layer()) {
-    LayerProto* proto = net_w_split.add_layer();
-    proto->CopyFrom(layer);
-    proto->clear_srclayers();
-    for (const string& src_name : layer.srclayers())
-      if (dst_count[src_name] < 0)
-        proto->add_srclayers(splitName(src_name));
-      else
-        proto->add_srclayers(src_name);
-  }
-  // add split layers
-  for (const LayerProto& layer : netproto.layer()) {
-    if (dst_count[layer.name()] < 0) {
-      LayerProto* split_proto = net_w_split.add_layer();
-      split_proto->set_name(splitName(layer.name()));
-      split_proto->set_type(kSplit);
-      split_proto->set_partition_dim(layer.partition_dim());
-      split_proto->add_srclayers(layer.name());
-      split_proto->mutable_split_conf()
-                 ->set_num_splits(-dst_count[layer.name()]);
-    }
-  }
-  // LOG(INFO) << "NeuralNet Config After Model Split is\n"
-  //           << net_w_split.DebugString();
-  return net_w_split;
-}
-
-NetProto NeuralNet::AddPartitionConnectionLayers(const NetProto& netproto,
-                                                 int npartitions) {
-  CHECK_GT(npartitions, 0);
-  NetProto net_w_connection;
-  net_w_connection.CopyFrom(netproto);
-  // if npartitions is 1, no need to add connection layers
-  if (npartitions == 1) return net_w_connection;
-  // add original layers, but remove all edges first
-  net_w_connection.clear_layer();
-  map<string, LayerProto*> name2proto;
-  for (const LayerProto& layer : netproto.layer()) {
-    LayerProto* layer_proto = net_w_connection.add_layer();
-    layer_proto->CopyFrom(layer);
-    layer_proto->clear_srclayers();
-    name2proto[layer_proto->name()] = layer_proto;
-  }
-  /*
-   * Add Slice, Concate, Split Layers for Model Partition
-   *
-   * All cases are as follows:
-   * src_pdim | dst_pdim | connection_type | Action
-   *     0    |     0    |     OneToOne    | Direct Connection
-   *     1    |     1    |     OneToOne    | Direct Connection
-   *     0    |     0    |     OneToAll    | Direct Connection
-   *     1    |     0    |     OneToOne    | Slice -> Concate
-   *     0    |     1    |     OneToOne    | Slice -> Concate
-   *     1    |     0    |     OneToAll    | Slice -> Concate
-   *     0    |     1    |     OneToAll    | Split -> Concate
-   *     1    |     1    |     OneToAll    | Split -> Concate
-   *
-   * Logic:
-   * dst_pdim = 1 && OneToAll ?
-   *   (YES) Split -> Concate
-   *   (NO)  src_pdim = dst_pdim ?
-   *           (YES) Direct Connection
-   *           (NO)  Slice -> Concate
-   */
-  for (const LayerProto& origin_layer : netproto.layer()) {
-    LayerProto* dst_layer = name2proto[origin_layer.name()];
-    int dst_pdim = dst_layer->partition_dim();
-    ConnectionType connection = srcNeuronConnection(*dst_layer);
-    for (const string& src_name : origin_layer.srclayers()) {
-      LayerProto* src_layer = name2proto[src_name];
-      int src_pdim = src_layer->partition_dim();
-      // dst_pdim = 1 && OneToAll ?
-      if (dst_pdim == 1 && connection == kOneToAll) {
-        // add split layer
-        LayerProto* split_layer = net_w_connection.add_layer();
-        split_layer->set_name(splitName(src_layer->name()));
-        split_layer->set_type(kSplit);
-        split_layer->set_partition_dim(src_layer->partition_dim());
-        split_layer->add_srclayers(src_layer->name());
-        split_layer->mutable_split_conf()->set_num_splits(npartitions);
-       // add concate layer
-       LayerProto* concate_layer = net_w_connection.add_layer();
-       concate_layer->set_name(concateName(split_layer->name()));
-       concate_layer->set_type(kConcate);
-       concate_layer->set_partition_dim(dst_layer->partition_dim());
-       // concate on src_pdim
-       concate_layer->mutable_concate_conf()
-         ->set_concate_dim(src_layer->partition_dim());
-       concate_layer->mutable_concate_conf()->set_num_concates(npartitions);
-       concate_layer->add_srclayers(split_layer->name());
-       // connect dst_layer to concate layer
-       dst_layer->add_srclayers(concate_layer->name());
-      } else {
-        // src_pdim = dst_pdim ?
-        if (dst_pdim == src_pdim) {
-          // direct connection
-          dst_layer->add_srclayers(src_layer->name());
-        } else {
-          // add slice layer
-          LayerProto* slice_layer = net_w_connection.add_layer();
-          slice_layer->set_name(sliceName(src_layer->name()));
-          slice_layer->set_type(kSlice);
-          slice_layer->set_partition_dim(src_layer->partition_dim());
-          // slice on dst_pdim
-          slice_layer->mutable_slice_conf()
-            ->set_slice_dim(dst_layer->partition_dim());
-          slice_layer->mutable_slice_conf()->set_num_slices(npartitions);
-          slice_layer->add_srclayers(src_layer->name());
-          // add concate layer
-          LayerProto* concate_layer = net_w_connection.add_layer();
-          concate_layer->set_name(concateName(slice_layer->name()));
-          concate_layer->set_type(kConcate);
-          concate_layer->set_partition_dim(dst_layer->partition_dim());
-          // concate on src_pdim
-          concate_layer->mutable_concate_conf()
-            ->set_concate_dim(src_layer->partition_dim());
-          concate_layer->mutable_concate_conf()->set_num_concates(npartitions);
-          concate_layer->add_srclayers(slice_layer->name());
-          // connect dst_layer to concate layer
-          dst_layer->add_srclayers(concate_layer->name());
-        }
-      }
-    }
-  }
-  LOG(INFO) << "NeuralNet Config After Adding Connection Layers is\n"
-            << net_w_connection.DebugString();
-  return net_w_connection;
-}
-
-Graph* NeuralNet::CreateGraph(const NetProto& netproto, int npartitions) {
-  NetProto net_w_split = AddModelSplitLayers(netproto);
-  NetProto net_w_connection =
-    AddPartitionConnectionLayers(net_w_split, npartitions);
-  // for each original layer proto, create #npartitions of nodes
-  Graph* graph = new Graph();
-  map<string, vector<Node*>> name2nodes;
-  map<string, const LayerProto*> name2proto;
-  for (const LayerProto& layer : net_w_connection.layer()) {
-    vector<Node*> nodes;
-    for (int i = 0; i < npartitions; i++) {
-      LayerProto *proto = new LayerProto(layer);
-      // differentiate partitions
-      string nodename = layer.name() + "@" + std::to_string(i);
-      proto->set_name(nodename);
-      proto->set_type(layer.type());
-      proto->set_partition_dim(layer.partition_dim());
-      proto->set_partition_id(i);
-      proto->set_num_partitions(npartitions);
-      Node* node = graph->AddNode(nodename, layer.name(), i, proto);
-      nodes.push_back(node);
-      // TODO(wangwei) update param name
-    }
-    name2nodes[layer.name()] = nodes;
-    name2proto[layer.name()] = &layer;
-  }
-  // connect layers, add bridge layers if partition id is different
-  for (const LayerProto& origin_layer : net_w_connection.layer()) {
-    vector<Node*> dst_nodes = name2nodes[origin_layer.name()];
-    for (const string& src_name : origin_layer.srclayers()) {
-      vector<Node*> src_nodes = name2nodes[src_name];
-      if (origin_layer.type() != kConcate) {
-        for (size_t i = 0; i < src_nodes.size(); ++i) {
-          CHECK_EQ(src_nodes[i]->partition_id, i);
-          CHECK_EQ(dst_nodes[i]->partition_id, i);
-          graph->AddEdge(src_nodes[i], dst_nodes[i]);
-        }
-      } else {
-        // need to add bridge layers
-        for (size_t i = 0; i < src_nodes.size(); ++i) {
-          CHECK_EQ(src_nodes[i]->partition_id, i);
-          for (size_t j = 0; j < dst_nodes.size(); ++j) {
-            CHECK_EQ(dst_nodes[j]->partition_id, j);
-            if (i == j) {  // in same partition, no bridge needed
-              graph->AddEdge(src_nodes[i], dst_nodes[j]);
-            } else {  // add bridges
-              // bridge src && dst layer
-              LayerProto *proto_bsrc = new LayerProto();
-              LayerProto *proto_bdst = new LayerProto();
-              string bsrc_name = bridgeSrcName(src_nodes[i]->name,
-                                               dst_nodes[j]->name);
-              string bdst_name = bridgeDstName(src_nodes[i]->name,
-                                               dst_nodes[j]->name);
-              proto_bsrc->set_name(bsrc_name);
-              proto_bdst->set_name(bdst_name);
-              proto_bsrc->set_type(kBridgeSrc);
-              proto_bdst->set_type(kBridgeDst);
-              proto_bsrc->set_partition_dim(origin_layer.partition_dim());
-              proto_bdst->set_partition_dim(origin_layer.partition_dim());
-              proto_bsrc->set_partition_id(src_nodes[i]->partition_id);
-              proto_bdst->set_partition_id(dst_nodes[j]->partition_id);
-              proto_bsrc->set_num_partitions(npartitions);
-              proto_bdst->set_num_partitions(npartitions);
-              Node* bsrc_node = graph->AddNode(bsrc_name, bsrc_name, i,
-                                               proto_bsrc);
-              Node* bdst_node = graph->AddNode(bdst_name, bdst_name, j,
-                                               proto_bdst);
-              graph->AddEdge(src_nodes[i], bsrc_node);
-              graph->AddEdge(bsrc_node, bdst_node);
-              graph->AddEdge(bdst_node, dst_nodes[j]);
-            }
-          }
-        }
-      }
-    }
-  }
-  graph->Sort();
-  // DLOG(INFO) << "Pure graph structure\n" << graph->ToJson();
-  return graph;
-}
-
-void NeuralNet::CreateNetFromGraph(Graph* graph) {
-  // create one layer per node
-  for (Node* node : graph->nodes()) {
-    auto proto_ptr = static_cast<LayerProto*>(node->proto);
-    auto layer = Layer::Create(*proto_ptr);
-    layers_.push_back(layer);
-    name2layer_[node->name] = layer;
-  }
-  // connect layers
-  for (Node* node : graph->nodes()) {
-    auto layer = name2layer(node->name);
-    src_map_[layer] = vector<Layer*>{};
-    for (Node* src : node->srcnodes)
-      src_map_[layer].push_back(name2layer(src->name));
-  }
-  // setup layers
-  int paramid = 0;
-  map<string, string> layerinfo;
-  map<string, vector<Layer*>> share_param_layers;
-  for (Node* node : graph->nodes()) {
-    LOG(INFO) << "constructing graph: " << node->name;
-    auto layer = name2layer(node->name);
-    layer->Setup(*(static_cast<LayerProto*>(node->proto)), srclayers(layer));
-    DLOG(INFO) << "constructing graph: " << layer->name();
-    layerinfo[layer->name()] = IntVecToString(layer->data(nullptr).shape());
-    for (auto param : layer->GetParams()) {
-      param->set_id(paramid++);
-    }
-    if (layer->partition_dim() == 0)
-      share_param_layers[node->origin].push_back(layer);
-  }
-  // create map from param name to param ptr
-  std::unordered_map<string, Param*> name2param;
-  for (auto layer : layers_) {
-    for (auto param : layer->GetParams()) {
-      name2param[param->name()] = param;
-    }
-  }
-  for (auto & entry : share_param_layers) {
-    // overwrite entries for replicated params due to layer partition (dim 0).
-    for (auto *param : entry.second.front()->GetParams())
-      name2param.at(param->name()) = param;
-  }
-  // share params based on share_from field
-  for (auto & entry : name2param) {
-    Param* param = entry.second;
-    const string share_from = param->share_from();
-    if (param->share_from() != "") {
-      if (name2param.find(share_from) != name2param.end()) {
-        param->ShareDataFrom(name2param.at(param->share_from()), false);
-      } else {
-        LOG(FATAL) << "No param with the name (share_from) " << share_from;
-      }
-    }
-  }
-
-  // share params due to laye unrolling
-  for (auto & entry : name2param) {
-    Param* param = entry.second;
-    auto pos = param->name().find("#");
-    if (pos != std::string::npos && param->owner() != param->id()) {
-      string from = "0" + param->name().substr(pos);
-      CHECK(name2param.find(from) != name2param.end())
-        << "Can't find owner = " << from << " for param = " << param->name();
-      Param* owner = name2param.at(from);
-      param->ShareFrom(owner);
-    }
-  }
-  // share Params for layers generated (partitioned) from the same origin layer
-  for (auto & entry : share_param_layers) {
-    const auto& owner = entry.second.begin();
-    const auto& owner_params = (*owner)->GetParams();
-    for (auto it = owner + 1; it != entry.second.end(); it++) {
-      auto params = (*it)->GetParams();
-      CHECK_EQ(params.size(), owner_params.size());
-      for (size_t i = 0; i < params.size(); i++)
-        params.at(i)->ShareDataFrom(owner_params.at(i), true);
-    }
-  }
-}
-
-void NeuralNet::PrepareDataStructures() {
-  params_.clear();
-  paramid2param_.clear();
-  name2layer_.clear();
-  for (auto& layer : layers_) {
-    name2layer_[layer->name()] = layer;
-    for (Param* p : layer->GetParams()) {
-      paramid2param_[p->id()] = p;
-      params_.push_back(p);
-    }
-  }
-}
-
-const Graph NeuralNet::ToGraph(bool include_shape) const {
-  Graph g;
-  map<string, string> attrs;
-  attrs["shape"] = "box";
-  vector<string> colors {"black", "red", "yellow", "blue"};
-  for (auto layer : layers_) {
-    LOG_IF(WARNING, layer->partition_id() >= static_cast<int>(colors.size()))
-      << "Too many partitions for displaying";
-    attrs["color"] = colors[layer->partition_id() % colors.size()];
-    if (include_shape) {
-      attrs["label"] = "shape: ";
-      for (const auto& x : layer->data(nullptr).shape())
-        attrs["label"] += std::to_string(x) + " ";
-    }
-    g.AddNode(layer->name(), attrs);
-  }
-
-  for (auto layer : layers_)
-    for (auto src : src_map_.at(layer))
-      g.AddEdge(src->name(), layer->name());
-  return g;
-}
-}  // namespace singa
diff --git a/src/neuralnet/neuron_layer/activation.cc b/src/neuralnet/neuron_layer/activation.cc
deleted file mode 100644
index f75961e..0000000
--- a/src/neuralnet/neuron_layer/activation.cc
+++ /dev/null
@@ -1,87 +0,0 @@
-/************************************************************
-*
-* Licensed to the Apache Software Foundation (ASF) under one
-* or more contributor license agreements.  See the NOTICE file
-* distributed with this work for additional information
-* regarding copyright ownership.  The ASF licenses this file
-* to you under the Apache License, Version 2.0 (the
-* "License"); you may not use this file except in compliance
-* with the License.  You may obtain a copy of the License at
-*
-*   http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an
-* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-* KIND, either express or implied.  See the License for the
-* specific language governing permissions and limitations
-* under the License.
-*
-*************************************************************/
-#include "singa/neuralnet/neuron_layer.h"
-#include "singa/utils/math_blob.h"
-#include "singa/utils/singa_op.h"
-#include "singa/proto/job.pb.h"
-namespace singa {
-
-void ActivationLayer::Setup(const LayerProto& conf,
-    const vector<Layer*>& srclayers) {
-  NeuronLayer::Setup(conf, srclayers);
-  data_.ReshapeLike(srclayers[0]->data(this));
-  grad_.ReshapeLike(data_);
-  if (conf.share_src_blobs()) {
-    data_.ShareData(srclayers[0]->mutable_data(this), false);
-    grad_.ShareData(srclayers[0]->mutable_grad(this), false);
-  }
-}
-void
-ActivationLayer::ComputeFeature(int flag, const vector<Layer*>& srclayers) {
-  switch (layer_conf_.activation_conf().type()) {
-    case RELU:
-      Map<op::Relu<float>, float>(srclayers[0]->data(this), &data_);
-      break;
-    case SIGMOID:
-      Map<op::Sigmoid<float>, float>(srclayers[0]->data(this), &data_);
-      break;
-    case TANH:
-      Map<op::Tanh<float>, float>(srclayers[0]->data(this), &data_);
-      break;
-      /*
-    case ActivationType_STANH:
-      Map<op::STanh<float>, float>(srclayers[0]->data(this), &data_);
-      break;
-      */
-    default:
-      LOG(ERROR) << "Unknow activation type " <<
-        layer_conf_.activation_conf().type();
-  }
-}
-void
-ActivationLayer::ComputeGradient(int flag, const vector<Layer*>& srclayers) {
-  Blob<float> * gsrc = srclayers[0]->mutable_grad(this);
-  switch (layer_conf_.activation_conf().type()) {
-    case RELU:
-      Map<op::ReluGrad<float>, float>(data_, gsrc);
-      Mult(*gsrc, grad_, gsrc);
-      break;
-    case SIGMOID:
-      Map<op::SigmoidGrad<float>, float>(data_, gsrc);
-      Mult(*gsrc, grad_, gsrc);
-      break;
-    case TANH:
-      Map<op::TanhGrad<float>, float>(data_, gsrc);
-      Mult(*gsrc, grad_, gsrc);
-      break;
-      /*
-    case ActivationType_STANH:
-      Map<op::STanhGrad<float>, float>(data_, gsrc);
-      Mult(*gsrc, grad_, gsrc);
-      break;
-      */
-    default:
-      LOG(ERROR) << "Unknow activation type " <<
-        layer_conf_.activation_conf().type();
-  }
-}
-
-}  // namespace singa
diff --git a/src/neuralnet/neuron_layer/bm.cc b/src/neuralnet/neuron_layer/bm.cc
deleted file mode 100644
index 66e303c..0000000
--- a/src/neuralnet/neuron_layer/bm.cc
+++ /dev/null
@@ -1,64 +0,0 @@
-/************************************************************
-*
-* Licensed to the Apache Software Foundation (ASF) under one
-* or more contributor license agreements.  See the NOTICE file
-* distributed with this work for additional information
-* regarding copyright ownership.  The ASF licenses this file
-* to you under the Apache License, Version 2.0 (the
-* "License"); you may not use this file except in compliance
-* with the License.  You may obtain a copy of the License at
-*
-*   http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an
-* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-* KIND, either express or implied.  See the License for the
-* specific language governing permissions and limitations
-* under the License.
-*
-*************************************************************/
-
-#include <glog/logging.h>
-#include "singa/neuralnet/neuron_layer.h"
-#include "singa/utils/singleton.h"
-
-namespace singa {
-
-using std::vector;
-
-void BMLayer::Setup(const LayerProto& conf,
-    const vector<Layer*>& srclayers) {
-  Layer::Setup(conf, srclayers);
-  data_.ReshapeLike(srclayers[0]->data(this));
-  grad_.ReshapeLike(srclayers[0]->grad(this));
-
-  const vector<int>& srcshape = srclayers[0]->data(this).shape();
-
-  batchsize_ = srcshape[0];
-  channels_ = srcshape[1];
-  height_ = srcshape[2];
-  width_ = srcshape[3];
-
-  bnScale_ = Param::Create(conf.param(0));
-  bnScale_->Setup(vector<int>{1, channels_, 1, 1});
-
-  bnBias_ = Param::Create(conf.param(1));
-  bnBias_->Setup(vector<int>{1, channels_, 1, 1});
-
-  resultRunningMean_ = Param::Create(conf.param(2));
-  resultRunningMean_->Setup(vector<int>{1, channels_, 1, 1});
-
-  resultRunningInvVariance_ = Param::Create(conf.param(3));
-  resultRunningInvVariance_->Setup(vector<int>{1, channels_, 1, 1});
-}
-
-void BMLayer::ComputeFeature(int flag, const vector<Layer*>& srclayers) {
-  // Todo
-}
-
-void BMLayer::ComputeGradient(int flag, const vector<Layer*>& srclayers) {
-  // Todo
-}
-
-}  //  namespace singa
diff --git a/src/neuralnet/neuron_layer/convolution.cc b/src/neuralnet/neuron_layer/convolution.cc
deleted file mode 100644
index e77e9ca..0000000
--- a/src/neuralnet/neuron_layer/convolution.cc
+++ /dev/null
@@ -1,192 +0,0 @@
-/************************************************************
-*
-* Licensed to the Apache Software Foundation (ASF) under one
-* or more contributor license agreements.  See the NOTICE file
-* distributed with this work for additional information
-* regarding copyright ownership.  The ASF licenses this file
-* to you under the Apache License, Version 2.0 (the
-* "License"); you may not use this file except in compliance
-* with the License.  You may obtain a copy of the License at
-*
-*   http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an
-* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-* KIND, either express or implied.  See the License for the
-* specific language governing permissions and limitations
-* under the License.
-*
-*************************************************************/
-
-#include <glog/logging.h>
-#include "singa/neuralnet/neuron_layer.h"
-#include "singa/utils/singleton.h"
-
-namespace singa {
-using std::vector;
-
-/************ Implementation for ConvolutionLayer*************************/
-ConvolutionLayer::~ConvolutionLayer() {
-  delete weight_;
-  delete bias_;
-}
-void ConvolutionLayer::Setup(const LayerProto& conf,
-    const vector<Layer*>& srclayers) {
-  CHECK_EQ(srclayers.size(), 1);
-  Layer::Setup(conf, srclayers);
-  ConvolutionProto conv_conf = conf.convolution_conf();
-  if (conv_conf.has_kernel()) {
-    kernel_x_ = kernel_y_ = conv_conf.kernel();
-  } else {
-    kernel_x_ = conv_conf.kernel_x();
-    kernel_y_ = conv_conf.kernel_y();
-  }
-  CHECK_NE(kernel_x_, 0);
-  CHECK_NE(kernel_y_, 0);
-
-  if (conv_conf.has_pad()) {
-    pad_x_ = pad_y_ = conv_conf.pad();
-  } else {
-    pad_x_ = conv_conf.pad_x();
-    pad_y_ = conv_conf.pad_y();
-  }
-
-  if (conv_conf.has_stride()) {
-    stride_x_ = stride_y_ = conv_conf.stride();
-  } else {
-    stride_x_ = conv_conf.stride_x();
-    stride_y_ = conv_conf.stride_y();
-  }
-
-  num_filters_ = conv_conf.num_filters();
-  // partition filters
-  if (partition_dim() > 0)
-    num_filters_ /= srclayers.at(0)->num_partitions();
-
-  const vector<int>& srcshape = srclayers[0]->data(this).shape();
-  batchsize_ = srcshape[0];
-  int dim = srcshape.size();
-  CHECK_GT(dim, 2);
-  width_ = srcshape[dim - 1];
-  height_ = srcshape[dim - 2];
-  if (dim > 3)
-    channels_ = srcshape[dim - 3];
-  else if (dim > 2)
-    channels_ = 1;
-
-  conv_height_ = (height_ + 2 * pad_y_ - kernel_y_) / stride_y_ + 1;
-  conv_width_ = (width_ + 2 * pad_x_ - kernel_x_) / stride_x_ + 1;
-  col_height_ = channels_ * kernel_x_ * kernel_y_;
-  col_width_ = conv_height_ * conv_width_;
-  vector<int> shape{batchsize_, num_filters_, conv_height_, conv_width_};
-  data_.Reshape(shape);
-  grad_.Reshape(shape);
-  col_data_.Reshape(vector<int>{col_height_, col_width_});
-  col_grad_.Reshape(vector<int>{col_height_, col_width_});
-  weight_ = Param::Create(conf.param(0));
-  weight_->Setup(vector<int>{num_filters_, col_height_});
-  if (conf.param_size() > 1) {
-    bias_ = Param::Create(conf.param(1));
-    bias_->Setup(vector<int>{num_filters_});
-  }
-}
-
-// TODO(wangwei) remove mshadow's functions
-void ConvolutionLayer::ComputeFeature(int flag,
-    const vector<Layer*>& srclayers) {
-  auto src = Tensor4(srclayers[0]->mutable_data(this));
-  auto data = Tensor3(&data_);
-  auto col = Tensor2(&col_data_);
-  auto weight = Tensor2(weight_->mutable_data());
-  auto bias = Tensor1(bias_->mutable_data());
-  for (int n = 0; n < batchsize_; n++) {
-    if (pad_x_ > 0)
-      col = expr::unpack_patch2col(pad(src[n], pad_x_), kernel_x_, stride_x_);
-    else
-      col = expr::unpack_patch2col(src[n], kernel_x_, stride_x_);
-    data[n] = dot(weight, col);
-  }
-  data += expr::broadcast<1>(bias, data.shape);
-}
-
-void ConvolutionLayer::ComputeGradient(int flag,
-    const vector<Layer*>& srclayers) {
-  auto src = Tensor4(srclayers[0]->mutable_data(this));
-  auto col = Tensor2(&col_data_);
-  auto weight = Tensor2(weight_->mutable_data());
-  auto grad = Tensor3(&grad_);
-  auto gcol = Tensor2(&col_grad_);
-  auto gweight = Tensor2(weight_->mutable_grad());
-  auto gbias = Tensor1(bias_->mutable_grad());
-  Blob<float>* gsrcblob = srclayers[0]->mutable_grad(this);
-  Tensor<cpu, 4> gsrc(nullptr, Shape4(batchsize_, channels_, height_, width_));
-  if (gsrcblob != nullptr)
-    gsrc.dptr = gsrcblob->mutable_cpu_data();
-  gbias = expr::sumall_except_dim<1>(grad);
-  gweight = 0.0f;
-  Shape<3> padshp(gsrc.shape.SubShape());
-  padshp[0] += 2 * pad_y_;
-  padshp[1] += 2 * pad_x_;
-  Shape<2> imgshp = Shape2(height_, width_);
-  for (int n = 0; n < batchsize_; n++) {
-    if (pad_x_ > 0)
-      col = expr::unpack_patch2col(pad(src[n], pad_x_), kernel_x_, stride_x_);
-    else
-      col = expr::unpack_patch2col(src[n], kernel_x_, stride_x_);
-    gweight += dot(grad[n], col.T());
-    if (gsrcblob != nullptr) {
-      gcol = dot(weight.T(), grad[n]);
-      gsrc[n] = crop(expr::pack_col2patch(gcol, padshp, kernel_x_, stride_x_),
-          imgshp);
-    }
-  }
-}
-
-/******************* Implementation for CConvolutionLayer *********/
-void CConvolutionLayer::ComputeFeature(int flag,
-    const vector<Layer*>& srclayers) {
-  auto src = Tensor4(srclayers[0]->mutable_data(this));
-  auto data = Tensor3(&data_);
-  auto col = Tensor2(&col_data_);
-  auto weight = Tensor2(weight_->mutable_data());
-  auto bias = Tensor1(bias_->mutable_data());
-
-  for (int n = 0; n < batchsize_; n++) {
-    Im2col(src[n].dptr, channels_, height_, width_,
-        kernel_y_, kernel_x_, pad_y_, pad_x_, stride_y_, stride_x_, col.dptr);
-    data[n] = dot(weight, col);
-  }
-  data += expr::broadcast<1>(bias, data.shape);
-}
-
-void CConvolutionLayer::ComputeGradient(int flag,
-    const vector<Layer*>& srclayers) {
-  auto src = Tensor4(srclayers[0]->mutable_data(this));
-  auto col = Tensor2(&col_data_);
-  auto weight = Tensor2(weight_->mutable_data());
-
-  auto grad = Tensor3(&grad_);
-  auto gcol = Tensor2(&col_grad_);
-  auto gweight = Tensor2(weight_->mutable_grad());
-  auto gbias = Tensor1(bias_->mutable_grad());
-  gweight = 0.f;
-  Blob<float>* gsrcblob = srclayers[0]->mutable_grad(this);
-  Tensor<cpu, 4> gsrc(nullptr, Shape4(batchsize_, channels_, height_, width_));
-  if (gsrcblob != nullptr)
-    gsrc.dptr = gsrcblob->mutable_cpu_data();
-  gbias = expr::sumall_except_dim<1>(grad);
-  for (int n = 0; n < batchsize_; n++) {
-    Im2col(src[n].dptr, channels_, height_, width_,
-        kernel_y_, kernel_x_, pad_y_, pad_x_, stride_y_, stride_x_, col.dptr);
-    gweight += dot(grad[n], col.T());
-    if (gsrcblob != nullptr) {
-      gcol = dot(weight.T(), grad[n]);
-      Col2im(gcol.dptr, channels_, height_, width_,
-          kernel_y_, kernel_x_, pad_y_, pad_x_, stride_y_, stride_x_,
-          gsrc[n].dptr);
-    }
-  }
-}
-
-}  // namespace singa
diff --git a/src/neuralnet/neuron_layer/cudnn_activation.cc b/src/neuralnet/neuron_layer/cudnn_activation.cc
deleted file mode 100644
index 12b3d48..0000000
--- a/src/neuralnet/neuron_layer/cudnn_activation.cc
+++ /dev/null
@@ -1,108 +0,0 @@
-/************************************************************
-*
-* Licensed to the Apache Software Foundation (ASF) under one
-* or more contributor license agreements.  See the NOTICE file
-* distributed with this work for additional information
-* regarding copyright ownership.  The ASF licenses this file
-* to you under the Apache License, Version 2.0 (the
-* "License"); you may not use this file except in compliance
-* with the License.  You may obtain a copy of the License at
-*
-*   http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an
-* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-* KIND, either express or implied.  See the License for the
-* specific language governing permissions and limitations
-* under the License.
-*
-*************************************************************/
-
-#include "singa/neuralnet/neuron_layer.h"
-
-namespace singa {
-
-void CudnnActivationLayer::InitCudnn() {
-  CudnnBase::InitCudnn();
-
-  // TODO(wangwei) make the mode case insensitive
-  if (layer_conf_.activation_conf().type() == SIGMOID)
-    mode_ = CUDNN_ACTIVATION_SIGMOID;
-  else if (layer_conf_.activation_conf().type() == TANH)
-    mode_ = CUDNN_ACTIVATION_TANH;
-  else if (layer_conf_.activation_conf().type() == RELU)
-    mode_ = CUDNN_ACTIVATION_RELU;
-  else
-    LOG(FATAL) << "Unkown activation: " << layer_conf_.activation_conf().type();
-
-  const auto& shape = data_.shape();
-  CHECK_GT(shape.size(), 0);
-  // TODO(wangwei) cudnnSetTensorNdDescriptor reports error if nbdim is < 4.
-  const int nbdim = 4;
-  // size of each dimension
-  int* sdim = new int[nbdim];
-  int* stride = new int[nbdim];
-  int i = shape.size() - 1;
-  sdim[i] = shape[i];
-  stride[i] = 1;
-  // LOG(ERROR) << "layer " << name();
-  // LOG(ERROR) << sdim[i] << " " << stride[i];
-  for (--i; i >= 0; i--) {
-    sdim[i] = shape[i];
-    stride[i] = shape[i + 1] * stride[i + 1];
-    // LOG(ERROR) << sdim[i] << " " << stride[i];
-  }
-  // padding sdim and stride to 4 dimensions
-  for (i = shape.size(); i < nbdim; i++) {
-    sdim[i] = 1;
-    stride[i] = 1;
-  }
-  CHECK_CUDNN(cudnnSetTensorNdDescriptor(src_desc_,
-        CUDNN_DATA_FLOAT,
-        nbdim,
-        sdim,
-        stride));
-  CHECK_CUDNN(cudnnSetTensorNdDescriptor(my_desc_,
-        CUDNN_DATA_FLOAT,
-        nbdim,
-        sdim,
-        stride));
-  delete[] sdim;
-  delete[] stride;
-}
-
-void CudnnActivationLayer::ComputeFeature(int flag,
-    const vector<Layer*>& srclayers) {
-  if (!has_init_cudnn_)
-    InitCudnn();
-  float alpha = 1.0f, beta = 0.0f;
-  // currently only consider single src layer
-  CHECK_EQ(srclayers.size(), 1);
-  CHECK_CUDNN(cudnnActivationForward(handle_,
-        mode_,
-        &alpha,
-        src_desc_,
-        srclayers[0]->data(this).gpu_data(),
-        &beta,
-        my_desc_,
-        data_.mutable_gpu_data()));
-}
-
-void CudnnActivationLayer::ComputeGradient(int flag,
-    const vector<Layer*>& srclayers) {
-  float alpha = 1.0f, beta = 0.0f;
-  CHECK_CUDNN(cudnnActivationBackward(handle_,
-        mode_,
-        &alpha,
-        my_desc_,
-        data_.gpu_data(),
-        my_desc_,
-        grad_.gpu_data(),
-        src_desc_,
-        srclayers[0]->data(this).gpu_data(),
-        &beta,
-        src_desc_,
-        srclayers[0]->mutable_grad(this)->mutable_gpu_data()));
-}
-}   // namespace singa
diff --git a/src/neuralnet/neuron_layer/cudnn_bm.cc b/src/neuralnet/neuron_layer/cudnn_bm.cc
deleted file mode 100644
index ca90007..0000000
--- a/src/neuralnet/neuron_layer/cudnn_bm.cc
+++ /dev/null
@@ -1,149 +0,0 @@
-/************************************************************
-*
-* Licensed to the Apache Software Foundation (ASF) under one
-* or more contributor license agreements.  See the NOTICE file
-* distributed with this work for additional information
-* regarding copyright ownership.  The ASF licenses this file
-* to you under the Apache License, Version 2.0 (the
-* "License"); you may not use this file except in compliance
-* with the License.  You may obtain a copy of the License at
-*
-*   http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an
-* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-* KIND, either express or implied.  See the License for the
-* specific language governing permissions and limitations
-* under the License.
-*
-*************************************************************/
-#include "singa/neuralnet/neuron_layer.h"
-
-#if CUDNN_MAJOR == 4
-namespace singa {
-
-CudnnBMLayer::~CudnnBMLayer() {
-  if (has_init_cudnn_) {
-    CHECK_CUDNN(cudnnDestroyTensorDescriptor(bnScaleBiasMeanVar_desc_));
-    CHECK_CUDNN(cudnnDestroyTensorDescriptor(bnScaleBiasDiff_desc_));
-  }
-}
-
-void CudnnBMLayer::InitCudnn() {
-  CudnnBase::InitCudnn();
-
-  CHECK_CUDNN(cudnnCreateTensorDescriptor(&bnScaleBiasMeanVar_desc_));
-  CHECK_CUDNN(cudnnCreateTensorDescriptor(&bnScaleBiasDiff_desc_));
-
-  CHECK_CUDNN(cudnnSetTensor4dDescriptor(src_desc_,
-        CUDNN_TENSOR_NCHW,
-        CUDNN_DATA_FLOAT,
-        batchsize_,
-        channels_,
-        height_,
-        width_));
-  CHECK_CUDNN(cudnnSetTensor4dDescriptor(my_desc_,
-        CUDNN_TENSOR_NCHW,
-        CUDNN_DATA_FLOAT,
-        batchsize_,
-        channels_,
-        height_,
-        width_));
-  CHECK_CUDNN(cudnnSetTensor4dDescriptor(bnScaleBiasMeanVar_desc_,
-        CUDNN_TENSOR_NCHW,
-        CUDNN_DATA_FLOAT,
-        1,
-        channels_,
-        1,
-        1));
-  CHECK_CUDNN(cudnnSetTensor4dDescriptor(bnScaleBiasDiff_desc_,
-        CUDNN_TENSOR_NCHW,
-        CUDNN_DATA_FLOAT,
-        1,
-        channels_,
-        1,
-        1));
-
-  vector<int> shape{1, channels_, 1, 1};
-
-  resultSaveMean_.Reshape(shape);
-  resultSaveInvVariance_.Reshape(shape);
-
-  mode_ = CUDNN_BATCHNORM_SPATIAL;
-}
-
-void CudnnBMLayer::ComputeFeature(int flag,
-    const vector<Layer*>& srclayers) {
-  if (!has_init_cudnn_)
-    InitCudnn();
-
-  const float alpha = 1.0f, beta = 0.0f;
-  double exponentialAverageFactor = 1.0;
-  double epsilon = CUDNN_BN_MIN_EPSILON;
-
-  // check training
-  if ((flag & kTrain) != kTrain) {
-    CHECK_CUDNN(cudnnBatchNormalizationForwardInference(handle_,
-          mode_,
-          &alpha,
-          &beta,
-          src_desc_,
-          srclayers.at(0)->data(this).gpu_data(),
-          my_desc_,
-          data_.mutable_gpu_data(),
-          bnScaleBiasMeanVar_desc_,
-          bnScale_->data().gpu_data(),
-          bnBias_->data().gpu_data(),
-          resultRunningMean_->data().gpu_data(),
-          resultRunningInvVariance_->data().gpu_data(),
-          epsilon));
-  } else {
-    CHECK_CUDNN(cudnnBatchNormalizationForwardTraining(handle_,
-          mode_,
-          &alpha,
-          &beta,
-          src_desc_,
-          srclayers.at(0)->data(this).gpu_data(),
-          my_desc_,
-          data_.mutable_gpu_data(),
-          bnScaleBiasMeanVar_desc_,
-          bnScale_->data().gpu_data(),
-          bnBias_->data().gpu_data(),
-          exponentialAverageFactor,
-          resultRunningMean_->mutable_data()->mutable_gpu_data(),
-          resultRunningInvVariance_->mutable_data()->mutable_gpu_data(),
-          epsilon,
-          resultSaveMean_.mutable_gpu_data(),
-          resultSaveInvVariance_.mutable_gpu_data()));
-  }
-}
-
-void CudnnBMLayer::ComputeGradient(int flag,
-    const vector<Layer*>& srclayers) {
-
-  const float alpha = 1.0f, beta = 0.0f, alphaDiff = 1.0f, betaDiff = 0.0f;
-  double epsilon = CUDNN_BN_MIN_EPSILON;
-
-  CHECK_CUDNN(cudnnBatchNormalizationBackward(handle_,
-      mode_,
-      &alpha,
-      &beta,
-      &alphaDiff,
-      &betaDiff,
-      src_desc_,
-      srclayers.at(0)->data(this).gpu_data(),
-      my_desc_,
-      grad_.gpu_data(),
-      src_desc_,
-      srclayers.at(0)->mutable_grad(this)->mutable_gpu_data(),
-      bnScaleBiasDiff_desc_,
-      bnScale_->data().gpu_data(),
-      bnScale_->mutable_grad()->mutable_gpu_data(),
-      bnBias_->mutable_grad()->mutable_gpu_data(),
-      epsilon,
-      resultSaveMean_.gpu_data(),
-      resultSaveInvVariance_.gpu_data()));
-}
-}  // namespace singa
-#endif
diff --git a/src/neuralnet/neuron_layer/cudnn_convolution.cc b/src/neuralnet/neuron_layer/cudnn_convolution.cc
deleted file mode 100644
index 560ee63..0000000
--- a/src/neuralnet/neuron_layer/cudnn_convolution.cc
+++ /dev/null
@@ -1,221 +0,0 @@
-/************************************************************
-*
-* Licensed to the Apache Software Foundation (ASF) under one
-* or more contributor license agreements.  See the NOTICE file
-* distributed with this work for additional information
-* regarding copyright ownership.  The ASF licenses this file
-* to you under the Apache License, Version 2.0 (the
-* "License"); you may not use this file except in compliance
-* with the License.  You may obtain a copy of the License at
-*
-*   http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an
-* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-* KIND, either express or implied.  See the License for the
-* specific language governing permissions and limitations
-* under the License.
-*
-*************************************************************/
-
-#include "singa/neuralnet/neuron_layer.h"
-
-namespace singa {
-
-CudnnConvLayer::~CudnnConvLayer() {
-  if (has_init_cudnn_) {
-    CHECK_CUDNN(cudnnDestroyTensorDescriptor(bias_desc_));
-    CHECK_CUDNN(cudnnDestroyFilterDescriptor(filter_desc_));
-    CHECK_CUDNN(cudnnDestroyConvolutionDescriptor(conv_desc_));
-  }
-}
-
-void CudnnConvLayer::InitCudnn() {
-  CudnnBase::InitCudnn();
-  // convert MB to bytes
-  workspace_byte_limit_
-    = layer_conf_.convolution_conf().workspace_byte_limit() << 20;
-
-  CHECK_CUDNN(cudnnCreateTensorDescriptor(&bias_desc_));
-  CHECK_CUDNN(cudnnCreateFilterDescriptor(&filter_desc_));
-  CHECK_CUDNN(cudnnCreateConvolutionDescriptor(&conv_desc_));
-
-  CHECK_CUDNN(cudnnSetConvolution2dDescriptor(conv_desc_,
-        pad_y_,
-        pad_x_,
-        stride_y_,
-        stride_x_,
-        1,
-        1,
-        CUDNN_CROSS_CORRELATION));
-  CHECK_CUDNN(cudnnSetFilter4dDescriptor(filter_desc_,
-        CUDNN_DATA_FLOAT,
-        num_filters_,
-        channels_,
-        kernel_y_,
-        kernel_x_));
-  if (bias_) {
-    CHECK_CUDNN(cudnnSetTensor4dDescriptor(bias_desc_,
-          CUDNN_TENSOR_NCHW,
-          CUDNN_DATA_FLOAT,
-          1,
-          num_filters_,
-          1,
-          1));
-  }
-  CHECK_CUDNN(cudnnSetTensor4dDescriptor(src_desc_,
-        CUDNN_TENSOR_NCHW,
-        CUDNN_DATA_FLOAT,
-        batchsize_,
-        channels_,
-        height_,
-        width_));
-  CHECK_CUDNN(cudnnSetTensor4dDescriptor(my_desc_,
-        CUDNN_TENSOR_NCHW,
-        CUDNN_DATA_FLOAT,
-        batchsize_,
-        num_filters_,
-        conv_height_,
-        conv_width_));
-
-  CHECK_CUDNN(cudnnGetConvolutionForwardAlgorithm(handle_,
-        src_desc_,
-        filter_desc_,
-        conv_desc_,
-        my_desc_,
-        CUDNN_CONVOLUTION_FWD_PREFER_FASTEST,
-        workspace_byte_limit_,
-        &fp_alg_));
-
-  CHECK_CUDNN(cudnnGetConvolutionBackwardFilterAlgorithm(handle_,
-        src_desc_,
-        my_desc_,
-        conv_desc_,
-        filter_desc_,
-        CUDNN_CONVOLUTION_BWD_FILTER_PREFER_FASTEST,
-        workspace_byte_limit_,
-        &bp_filter_alg_));
-  CHECK_CUDNN(cudnnGetConvolutionBackwardDataAlgorithm(handle_,
-        filter_desc_,
-        my_desc_,
-        conv_desc_,
-        src_desc_,
-        CUDNN_CONVOLUTION_BWD_DATA_PREFER_FASTEST,
-        workspace_byte_limit_,
-        &bp_data_alg_));
-
-  size_t fp_byte, bp_data_byte, bp_filter_byte;
-  CHECK_CUDNN(cudnnGetConvolutionForwardWorkspaceSize(handle_,
-        src_desc_,
-        filter_desc_,
-        conv_desc_,
-        my_desc_,
-        fp_alg_,
-        &fp_byte));
-  CHECK_CUDNN(cudnnGetConvolutionBackwardDataWorkspaceSize(handle_,
-        filter_desc_,
-        my_desc_,
-        conv_desc_,
-        src_desc_,
-        bp_data_alg_,
-        &bp_data_byte));
-  CHECK_CUDNN(cudnnGetConvolutionBackwardFilterWorkspaceSize(handle_,
-        src_desc_,
-        my_desc_,
-        conv_desc_,
-        filter_desc_,
-        bp_filter_alg_,
-        &bp_filter_byte));
-  workspace_count_ = std::max(std::max(fp_byte, bp_data_byte), bp_filter_byte)
-    / sizeof(float) + 1;
-}
-
-void CudnnConvLayer::ComputeFeature(int flag, const vector<Layer*>& srclayers) {
-  if (!has_init_cudnn_)
-    InitCudnn();
-  float alpha = 1.f, beta = 0.f;
-  Blob<float> workspace(vector<int>{static_cast<int>(workspace_count_)});
-  CHECK_CUDNN(cudnnConvolutionForward(handle_,
-        &alpha,
-        src_desc_,
-        srclayers[0]->data(this).gpu_data(),
-        filter_desc_,
-        weight_->data().gpu_data(),
-        conv_desc_,
-        fp_alg_,
-        workspace.mutable_gpu_data(),
-        workspace_count_ * sizeof(float),
-        &beta,
-        my_desc_,
-        data_.mutable_gpu_data()));
-  if (bias_) {
-    beta = 1.f;
-
-#if CUDNN_MAJOR == 4
-    CHECK_CUDNN(cudnnAddTensor(handle_,
-          &alpha,
-          bias_desc_,
-          bias_->data().gpu_data(),
-          &beta,
-          my_desc_,
-          data_.mutable_gpu_data()));
-#else
-    CHECK_CUDNN(cudnnAddTensor(handle_,
-          CUDNN_ADD_SAME_C,
-          &alpha,
-          bias_desc_,
-          bias_->data().gpu_data(),
-          &beta,
-          my_desc_,
-          data_.mutable_gpu_data()));
-#endif
-  }
-}
-
-void
-CudnnConvLayer::ComputeGradient(int flag, const vector<Layer*>& srclayers) {
-  float alpha = 1.f, beta = 0.f;
-  Blob<float> workspace(vector<int>{static_cast<int>(workspace_count_)});
-  // LOG(ERROR) << "backward bias";
-  if (bias_) {
-    CHECK_CUDNN(cudnnConvolutionBackwardBias(handle_,
-          &alpha,
-          my_desc_,
-          grad_.gpu_data(),
-          &beta,
-          bias_desc_,
-          bias_->mutable_grad()->mutable_gpu_data()));
-  }
-  // LOG(ERROR) << "backward w";
-  CHECK_CUDNN(cudnnConvolutionBackwardFilter_v3(handle_,
-        &alpha,
-        src_desc_,
-        srclayers[0]->data(this).gpu_data(),
-        my_desc_,
-        grad_.gpu_data(),
-        conv_desc_,
-        bp_filter_alg_,
-        workspace.mutable_gpu_data(),
-        workspace_count_ * sizeof(float),
-        &beta,
-        filter_desc_,
-        weight_->mutable_grad()->mutable_gpu_data()));
-  // LOG(ERROR) << "backward src";
-  if (srclayers[0]->mutable_grad(this) != nullptr) {
-    CHECK_CUDNN(cudnnConvolutionBackwardData_v3(handle_,
-          &alpha,
-          filter_desc_,
-          weight_->data().gpu_data(),
-          my_desc_,
-          grad_.gpu_data(),
-          conv_desc_,
-          bp_data_alg_,
-          workspace.mutable_gpu_data(),
-          workspace_count_ * sizeof(float),
-          &beta,
-          src_desc_,
-          srclayers[0]->mutable_grad(this)->mutable_gpu_data()));
-  }
-}
-}  // namespace singa
diff --git a/src/neuralnet/neuron_layer/cudnn_lrn.cc b/src/neuralnet/neuron_layer/cudnn_lrn.cc
deleted file mode 100644
index fb8e476..0000000
--- a/src/neuralnet/neuron_layer/cudnn_lrn.cc
+++ /dev/null
@@ -1,87 +0,0 @@
-/************************************************************
-*
-* Licensed to the Apache Software Foundation (ASF) under one
-* or more contributor license agreements.  See the NOTICE file
-* distributed with this work for additional information
-* regarding copyright ownership.  The ASF licenses this file
-* to you under the Apache License, Version 2.0 (the
-* "License"); you may not use this file except in compliance
-* with the License.  You may obtain a copy of the License at
-*
-*   http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an
-* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-* KIND, either express or implied.  See the License for the
-* specific language governing permissions and limitations
-* under the License.
-*
-*************************************************************/
-
-#include "singa/neuralnet/neuron_layer.h"
-
-namespace singa {
-CudnnLRNLayer::~CudnnLRNLayer() {
-  if (has_init_cudnn_) {
-    cudnnDestroyLRNDescriptor(norm_desc_);
-  }
-}
-
-void CudnnLRNLayer::InitCudnn() {
-  mode_ = CUDNN_LRN_CROSS_CHANNEL_DIM1;
-  CudnnBase::InitCudnn();
-  CHECK_CUDNN(cudnnCreateLRNDescriptor(&norm_desc_));
-  CHECK_CUDNN(cudnnSetLRNDescriptor(norm_desc_,
-        lsize_,
-        alpha_,
-        beta_,
-        knorm_));
-  CHECK_CUDNN(cudnnSetTensor4dDescriptor(src_desc_,
-      CUDNN_TENSOR_NCHW,
-      CUDNN_DATA_FLOAT,
-      batchsize_,
-      channels_,
-      height_,
-      width_));
-  CHECK_CUDNN(cudnnSetTensor4dDescriptor(my_desc_,
-      CUDNN_TENSOR_NCHW,
-      CUDNN_DATA_FLOAT,
-      batchsize_,
-      channels_,
-      height_,
-      width_));
-}
-void CudnnLRNLayer::ComputeFeature(int flag, const vector<Layer*>& srclayers) {
-  if (!has_init_cudnn_)
-    InitCudnn();
-  float alpha = 1.0f, beta = 0.0f;
-  CHECK_CUDNN(cudnnLRNCrossChannelForward(handle_,
-      norm_desc_,
-      mode_,
-      &alpha,
-      src_desc_,
-      srclayers[0]->data(this).gpu_data(),
-      &beta,
-      my_desc_,
-      data_.mutable_gpu_data()));
-}
-void CudnnLRNLayer::ComputeGradient(int flag, const vector<Layer*>& srclayers) {
-  float alpha = 1.0f, beta = 0.0f;
-  CHECK_CUDNN(cudnnLRNCrossChannelBackward(handle_,
-        norm_desc_,
-        mode_,
-        &alpha,
-        my_desc_,
-        data_.gpu_data(),
-        my_desc_,
-        grad_.gpu_data(),
-        src_desc_,
-        srclayers[0]->data(this).gpu_data(),
-        &beta,
-        src_desc_,
-        srclayers[0]->mutable_grad(this)->mutable_gpu_data()));
-}
-
-
-}  // namespace singa
diff --git a/src/neuralnet/neuron_layer/cudnn_pooling.cc b/src/neuralnet/neuron_layer/cudnn_pooling.cc
deleted file mode 100644
index 4c4c038..0000000
--- a/src/neuralnet/neuron_layer/cudnn_pooling.cc
+++ /dev/null
@@ -1,95 +0,0 @@
-/************************************************************
-*
-* Licensed to the Apache Software Foundation (ASF) under one
-* or more contributor license agreements.  See the NOTICE file
-* distributed with this work for additional information
-* regarding copyright ownership.  The ASF licenses this file
-* to you under the Apache License, Version 2.0 (the
-* "License"); you may not use this file except in compliance
-* with the License.  You may obtain a copy of the License at
-*
-*   http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an
-* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-* KIND, either express or implied.  See the License for the
-* specific language governing permissions and limitations
-* under the License.
-*
-*************************************************************/
-
-#include "singa/neuralnet/neuron_layer.h"
-
-namespace singa {
-
-CudnnPoolLayer::~CudnnPoolLayer() {
-  if (has_init_cudnn_) {
-    CHECK_CUDNN(cudnnDestroyPoolingDescriptor(pool_desc_));
-  }
-}
-
-void CudnnPoolLayer::InitCudnn() {
-  CudnnBase::InitCudnn();
-  CHECK_CUDNN(cudnnCreatePoolingDescriptor(&pool_desc_));
-  CHECK_CUDNN(cudnnSetTensor4dDescriptor(src_desc_,
-        CUDNN_TENSOR_NCHW,
-        CUDNN_DATA_FLOAT,
-        batchsize_,
-        channels_,
-        height_,
-        width_));
-  CHECK_CUDNN(cudnnSetTensor4dDescriptor(my_desc_,
-        CUDNN_TENSOR_NCHW,
-        CUDNN_DATA_FLOAT,
-        batchsize_,
-        channels_,
-        pooled_height_,
-        pooled_width_));
-  auto pool_method = CUDNN_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING;
-  if (pool_ == PoolingProto_PoolMethod_MAX)
-    pool_method = CUDNN_POOLING_MAX;
-  CHECK_CUDNN(cudnnSetPooling2dDescriptor(pool_desc_,
-        pool_method,
-        kernel_y_,
-        kernel_x_,
-        pad_y_,
-        pad_x_,
-        stride_y_,
-        stride_x_));
-}
-
-void CudnnPoolLayer::ComputeFeature(int flag, const vector<Layer*>& srclayers) {
-  if (!has_init_cudnn_)
-    InitCudnn();
-  float alpha = 1.0f, beta = 0.0f;
-  // currently only consider single src layer
-  CHECK_EQ(srclayers.size(), 1);
-  CHECK_CUDNN(cudnnPoolingForward(handle_,
-        pool_desc_,
-        &alpha,
-        src_desc_,
-        srclayers[0]->data(this).gpu_data(),
-        &beta,
-        my_desc_,
-        data_.mutable_gpu_data()));
-}
-
-void
-CudnnPoolLayer::ComputeGradient(int flag, const vector<Layer*>& srclayers) {
-  float alpha = 1.0f, beta = 0.0f;
-  CHECK_CUDNN(cudnnPoolingBackward(handle_,
-        pool_desc_,
-        &alpha,
-        my_desc_,
-        data_.gpu_data(),
-        my_desc_,
-        grad_.gpu_data(),
-        src_desc_,
-        srclayers[0]->data(this).gpu_data(),
-        &beta,
-        src_desc_,
-        srclayers[0]->mutable_grad(this)->mutable_gpu_data()));
-}
-}  // namespace singa
-
diff --git a/src/neuralnet/neuron_layer/cudnn_softmax.cc b/src/neuralnet/neuron_layer/cudnn_softmax.cc
deleted file mode 100644
index bf5a8d3..0000000
--- a/src/neuralnet/neuron_layer/cudnn_softmax.cc
+++ /dev/null
@@ -1,76 +0,0 @@
-/************************************************************
-*
-* Licensed to the Apache Software Foundation (ASF) under one
-* or more contributor license agreements.  See the NOTICE file
-* distributed with this work for additional information
-* regarding copyright ownership.  The ASF licenses this file
-* to you under the Apache License, Version 2.0 (the
-* "License"); you may not use this file except in compliance
-* with the License.  You may obtain a copy of the License at
-*
-*   http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an
-* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-* KIND, either express or implied.  See the License for the
-* specific language governing permissions and limitations
-* under the License.
-*
-*************************************************************/
-
-#include "singa/neuralnet/neuron_layer.h"
-
-namespace singa {
-
-void CudnnSoftmaxLayer::InitCudnn() {
-  CudnnBase::InitCudnn();
-  CHECK_CUDNN(cudnnSetTensor4dDescriptor(src_desc_,
-        CUDNN_TENSOR_NCHW,
-        CUDNN_DATA_FLOAT,
-        batchsize_,
-        dim_,
-        1,
-        1));
-  CHECK_CUDNN(cudnnSetTensor4dDescriptor(my_desc_,
-        CUDNN_TENSOR_NCHW,
-        CUDNN_DATA_FLOAT,
-        batchsize_,
-        dim_,
-        1,
-        1));
-}
-
-void CudnnSoftmaxLayer::ComputeFeature(int flag,
-    const vector<Layer*>& srclayers) {
-  if (!has_init_cudnn_)
-    InitCudnn();
-  const float alpha = 1.0f, beta = 0.0f;
-  CHECK_EQ(srclayers.at(0)->data(this).shape().size(), 2);
-  CHECK_CUDNN(cudnnSoftmaxForward(handle_,
-        CUDNN_SOFTMAX_ACCURATE,
-        CUDNN_SOFTMAX_MODE_INSTANCE,
-        &alpha,
-        src_desc_,
-        srclayers.at(0)->data(this).gpu_data(),
-        &beta,
-        my_desc_,
-        data_.mutable_gpu_data()));
-}
-
-void CudnnSoftmaxLayer::ComputeGradient(int flag,
-    const vector<Layer*>& srclayers) {
-  const float alpha = 1.f, beta = 0.f;
-  CHECK_CUDNN(cudnnSoftmaxBackward(handle_,
-        CUDNN_SOFTMAX_ACCURATE,
-        CUDNN_SOFTMAX_MODE_INSTANCE,
-        &alpha,
-        my_desc_,
-        data_.gpu_data(),
-        my_desc_,
-        grad_.gpu_data(),
-        &beta,
-        src_desc_,
-        srclayers.at(0)->mutable_grad(this)->mutable_gpu_data()));
-}
-}  // namespace singa
diff --git a/src/neuralnet/neuron_layer/dropout.cc b/src/neuralnet/neuron_layer/dropout.cc
deleted file mode 100644
index 706b999..0000000
--- a/src/neuralnet/neuron_layer/dropout.cc
+++ /dev/null
@@ -1,62 +0,0 @@
-/************************************************************
-*
-* Licensed to the Apache Software Foundation (ASF) under one
-* or more contributor license agreements.  See the NOTICE file
-* distributed with this work for additional information
-* regarding copyright ownership.  The ASF licenses this file
-* to you under the Apache License, Version 2.0 (the
-* "License"); you may not use this file except in compliance
-* with the License.  You may obtain a copy of the License at
-*
-*   http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an
-* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-* KIND, either express or implied.  See the License for the
-* specific language governing permissions and limitations
-* under the License.
-*
-*************************************************************/
-
-#include <glog/logging.h>
-#include "singa/neuralnet/neuron_layer.h"
-#include "singa/utils/singleton.h"
-#include "singa/utils/singa_op.h"
-#include "singa/utils/math_blob.h"
-
-namespace singa {
-using std::vector;
-
-void DropoutLayer::Setup(const LayerProto& conf,
-    const vector<Layer*>& srclayers) {
-  Layer::Setup(conf, srclayers);
-  data_.ReshapeLike(srclayers[0]->data(this));
-  grad_.ReshapeLike(*srclayers[0]->mutable_grad(this));
-  mask_.Reshape(srclayers[0]->data(this).shape());
-  pdrop_ = conf.dropout_conf().dropout_ratio();
-}
-
-void DropoutLayer::ComputeFeature(int flag, const vector<Layer*>& srclayers) {
-  // check training
-  if ((flag & kTrain) != kTrain) {
-    data_.CopyFrom(srclayers[0]->data(this));
-    return;
-  }
-
-  float pkeep = 1 - pdrop_;
-  Blob<float> rand(data_.count());
-  SampleUniform(0.0f, 1.0f, &rand);
-  Map<op::Threshold<float>, float>(pkeep, rand, &mask_);
-  // scale the mask to avoid scaling in ComputeGradient
-  Scale(1.0f / pkeep, &mask_);
-  Mult(srclayers[0]->data(this), mask_, &data_);
-}
-
-void DropoutLayer::ComputeGradient(int flag, const vector<Layer*>& srclayers)  {
-  Mult(grad_, mask_, srclayers[0]->mutable_grad(this));
-  // no need to mult scale as mask is scaled already.
-}
-
-}  // namespace singa
-
diff --git a/src/neuralnet/neuron_layer/dummy.cc b/src/neuralnet/neuron_layer/dummy.cc
deleted file mode 100644
index 9796407..0000000
--- a/src/neuralnet/neuron_layer/dummy.cc
+++ /dev/null
@@ -1,102 +0,0 @@
-/************************************************************
-*
-* Licensed to the Apache Software Foundation (ASF) under one
-* or more contributor license agreements.  See the NOTICE file
-* distributed with this work for additional information
-* regarding copyright ownership.  The ASF licenses this file
-* to you under the Apache License, Version 2.0 (the
-* "License"); you may not use this file except in compliance
-* with the License.  You may obtain a copy of the License at
-*
-*   http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an
-* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-* KIND, either express or implied.  See the License for the
-* specific language governing permissions and limitations
-* under the License.
-*
-*************************************************************/
-
-#include <glog/logging.h>
-#include "singa/neuralnet/neuron_layer.h"
-#include "singa/utils/math_blob.h"
-#include "singa/utils/context.h"
-#include "singa/utils/singleton.h"
-
-namespace singa {
-
-void DummyLayer::Setup(const std::string str,
-                       const vector<Layer*>& srclayers) {
-  LayerProto conf;
-  conf.ParseFromString(str);
-  DummyLayer::Setup(conf, srclayers);
-}
-
-void DummyLayer::Setup(const LayerProto& proto,
-                       const vector<Layer*>& srclayers) {
-  NeuronLayer::Setup(proto, srclayers);
-  if (proto.dummy_conf().input()) {  // use as input layer
-    CHECK_EQ(srclayers.size(), 0);
-    input_ = true;
-    vector<int> shape;
-    for (int s : proto.dummy_conf().shape()) shape.push_back(s);
-    data_.Reshape(shape);
-    grad_.ReshapeLike(data_);
-  } else {
-    CHECK_EQ(srclayers.size(), 1);
-    data_.ReshapeLike(srclayers[0]->data(this));
-    grad_.ReshapeLike(srclayers[0]->grad(this));
-  }
-  if (proto.dummy_conf().output()) {  // use as output layer
-    output_ = true;
-  }
-}
-
-void DummyLayer::ComputeFeature(int flag, const vector<Layer*>& srclayers) {
-  std::uniform_real_distribution<float> dis(0, 1);
-  auto gen = Singleton<Context>::Instance()->rand_generator();
-  if (input_) {
-    // randomly init data with [0,1] values
-    for (int i = 0; i < data_.count(); ++i)
-      data_.mutable_cpu_data()[i] = dis(*gen);
-  }
-  if (srclayers.size() > 0)
-    Copy(srclayers[0]->data(this), &data_);
-}
-
-void DummyLayer::ComputeGradient(int flag, const vector<Layer*>& srclayers) {
-  std::uniform_real_distribution<float> dis(0, 1);
-  auto gen = Singleton<Context>::Instance()->rand_generator();
-  if (output_) {
-    // randomly init data with [0,1] values
-    for (int i = 0; i < data_.count(); ++i)
-      grad_.mutable_cpu_data()[i] = dis(*gen);
-  }
-  if (srclayers.size() > 0)
-    Copy(grad_, srclayers[0]->mutable_grad(this));
-}
-
-void DummyLayer::Feed(int batchsize, vector<float>& data, vector<int>& aux_data){
-
-    batchsize_ = batchsize;
-    // input data
-    if (data.size() > 0) {
-      int size = data.size();
-      float* ptr = data_.mutable_cpu_data();
-      for (int i = 0; i< size; i++) { 
-          ptr[i] = data.at(i);
-      }
-    }
-    // auxiliary data, e.g., label
-    if (aux_data.size() > 0) {
-      aux_data_.resize(batchsize_);
-      for (int i = 0; i< batchsize_; i++) {
-          aux_data_[i] = static_cast<int>(aux_data.at(i));
-      }
-    }
-    return;
-}
-
-}  // namespace singa
diff --git a/src/neuralnet/neuron_layer/embedding.cc b/src/neuralnet/neuron_layer/embedding.cc
deleted file mode 100644
index c980c54..0000000
--- a/src/neuralnet/neuron_layer/embedding.cc
+++ /dev/null
@@ -1,98 +0,0 @@
-/************************************************************
-*
-* Licensed to the Apache Software Foundation (ASF) under one
-* or more contributor license agreements.  See the NOTICE file
-* distributed with this work for additional information
-* regarding copyright ownership.  The ASF licenses this file
-* to you under the Apache License, Version 2.0 (the
-* "License"); you may not use this file except in compliance
-* with the License.  You may obtain a copy of the License at
-*
-*   http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an
-* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-* KIND, either express or implied.  See the License for the
-* specific language governing permissions and limitations
-* under the License.
-*
-*************************************************************/
-
-#include "singa/neuralnet/neuron_layer.h"
-#include "singa/utils/math_addr.h"
-#include "singa/utils/math_blob.h"
-#include "singa/utils/singleton.h"
-#include "singa/utils/context.h"
-
-namespace singa {
-
-void EmbeddingLayer::Setup(const LayerProto& conf,
-    const vector<Layer*>& srclayers) {
-  NeuronLayer::Setup(conf, srclayers);
-  vocab_size_ = conf.embedding_conf().vocab_size();
-  feature_dim_ = conf.embedding_conf().feature_dim();
-  vocab_ = Param::Create(conf.param(0));
-  vocab_->Setup(vector<int>{vocab_size_, feature_dim_});
-  batchsize_ = srclayers.at(0)->data(unroll_index()).shape(0);
-  data_.Reshape(batchsize_, feature_dim_);
-  grad_.ReshapeLike(data_);
-}
-
-void EmbeddingLayer::ComputeFeature(int flag, const vector<Layer*>& srclayers) {
-  const float* word_idx = srclayers.at(0)->data(unroll_index()).cpu_data();
-  int device = Singleton<Context>::Instance()->device_id();
-  if (device == -1) {
-    const float* src = vocab_->data().cpu_data();
-    float* dst = data_.mutable_cpu_data();
-    for (int i = 0; i < batchsize_; i++) {
-      memcpy(dst + i * feature_dim_,
-          src + static_cast<int>(word_idx[i]) * feature_dim_,
-          feature_dim_ * sizeof(float));
-    }
-  } else {
-#ifdef USE_GPU
-    const float* src = vocab_->data().gpu_data();
-    float* dst = data_.mutable_gpu_data();
-    for (int i = 0; i < batchsize_; i++) {
-      cudaMemcpy(dst + i * feature_dim_,
-          src + static_cast<int>(word_idx[i]) * feature_dim_,
-          feature_dim_ * sizeof(float), cudaMemcpyDefault);
-    }
-#else
-    LOG(FATAL) << "Not implemented";
-#endif
-  }
-}
-
-void EmbeddingLayer::ComputeGradient(int flag,
-    const vector<Layer*>& srclayers) {
-  const float* word_idx = srclayers.at(0)->data(unroll_index()).cpu_data();
-  auto context = Singleton<Context>::Instance();
-  if ((flag & kAggGrad) == 0)
-    Zero(vocab_->mutable_grad());
-
-  if (context->device_id() == -1) {
-    const float* src = grad_.cpu_data();
-    float* dst = vocab_->mutable_grad()->mutable_cpu_data();
-    memset(dst, 0 , sizeof(float) * grad_.count());
-    for (int i = 0; i < batchsize_; i++) {
-      cpu_axpy(feature_dim_, 1.0f, src + i * feature_dim_,
-          dst + static_cast<int>(word_idx[i]) * feature_dim_);
-    }
-  } else {
-#ifdef USE_GPU
-    const float* src = grad_.gpu_data();
-    float* dst = vocab_->mutable_grad()->mutable_gpu_data();
-    for (int i = 0; i < batchsize_; i++) {
-      gpu_axpy(context->cublas_handle(), grad_.count(), 1.0f,
-          src + i * feature_dim_,
-          dst + static_cast<int>(word_idx[i]) * feature_dim_);
-    }
-#else
-    LOG(FATAL) << "Not implemented";
-#endif
-  }
-}
-
-}  // namespace singa
diff --git a/src/neuralnet/neuron_layer/gru.cc b/src/neuralnet/neuron_layer/gru.cc
deleted file mode 100644
index 440da91..0000000
--- a/src/neuralnet/neuron_layer/gru.cc
+++ /dev/null
@@ -1,258 +0,0 @@
-/************************************************************
-*
-* Licensed to the Apache Software Foundation (ASF) under one
-* or more contributor license agreements.  See the NOTICE file
-* distributed with this work for additional information
-* regarding copyright ownership.  The ASF licenses this file
-* to you under the Apache License, Version 2.0 (the
-* "License"); you may not use this file except in compliance
-* with the License.  You may obtain a copy of the License at
-*
-*   http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an
-* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-* KIND, either express or implied.  See the License for the
-* specific language governing permissions and limitations
-* under the License.
-*
-*************************************************************/
-
-#include <glog/logging.h>
-#include "singa/neuralnet/neuron_layer.h"
-#include "singa/utils/singleton.h"
-#include "singa/utils/math_blob.h"
-#include "singa/utils/singa_op.h"
-
-using namespace std;
-
-namespace singa {
-
-using std::vector;
-
-GRULayer::~GRULayer() {
-  delete weight_z_hx_;
-  delete weight_z_hh_;
-  delete bias_z_;
-
-  delete weight_r_hx_;
-  delete weight_r_hh_;
-  delete bias_r_;
-
-  delete weight_c_hx_;
-  delete weight_c_hh_;
-  delete bias_c_;
-
-  delete update_gate_;
-  delete reset_gate_;
-  delete new_memory_;
-  // delete reset_context_;
-}
-
-void GRULayer::Setup(const LayerProto& conf,
-    const vector<Layer*>& srclayers) {
-  Layer::Setup(conf, srclayers);
-  CHECK_LE(srclayers.size(), 2);
-  const auto& src = srclayers[0]->data(this);
-
-  batchsize_ = src.shape()[0];  // size of batch
-  vdim_ = src.count() / (batchsize_);  // dimension of input
-
-  hdim_ = layer_conf_.gru_conf().dim_hidden();  // dimension of hidden state
-
-  data_.Reshape(vector<int>{batchsize_, hdim_});
-  grad_.ReshapeLike(data_);
-  // one for grad from dst GRU, one for grad from upper layer
-  gradvec_.push_back(new Blob<float>(grad_.shape()));
-
-  // Initialize the parameters
-  weight_z_hx_ = Param::Create(conf.param(0));
-  weight_r_hx_ = Param::Create(conf.param(1));
-  weight_c_hx_ = Param::Create(conf.param(2));
-
-  weight_z_hh_ = Param::Create(conf.param(3));
-  weight_r_hh_ = Param::Create(conf.param(4));
-  weight_c_hh_ = Param::Create(conf.param(5));
-
-  if (conf.param_size() > 6) {
-    bias_z_ = Param::Create(conf.param(6));
-    bias_r_ = Param::Create(conf.param(7));
-    bias_c_ = Param::Create(conf.param(8));
-  }
-
-  weight_z_hx_->Setup(vector<int>{hdim_, vdim_});
-  weight_r_hx_->Setup(vector<int>{hdim_, vdim_});
-  weight_c_hx_->Setup(vector<int>{hdim_, vdim_});
-
-  weight_z_hh_->Setup(vector<int>{hdim_, hdim_});
-  weight_r_hh_->Setup(vector<int>{hdim_, hdim_});
-  weight_c_hh_->Setup(vector<int>{hdim_, hdim_});
-
-  if (conf.param_size() > 6) {
-    bias_z_->Setup(vector<int>{hdim_});
-    bias_r_->Setup(vector<int>{hdim_});
-    bias_c_->Setup(vector<int>{hdim_});
-  }
-
-  update_gate_ = new Blob<float>(batchsize_, hdim_);
-  reset_gate_ = new Blob<float>(batchsize_, hdim_);
-  new_memory_ = new Blob<float>(batchsize_, hdim_);
-}
-
-void GRULayer::ComputeFeature(int flag,
-    const vector<Layer*>& srclayers) {
-  CHECK_LE(srclayers.size(), 2);
-
-  // Do transpose
-  Blob<float> *w_z_hx_t = Transpose(weight_z_hx_->data());
-  Blob<float> *w_z_hh_t = Transpose(weight_z_hh_->data());
-  Blob<float> *w_r_hx_t = Transpose(weight_r_hx_->data());
-  Blob<float> *w_r_hh_t = Transpose(weight_r_hh_->data());
-  Blob<float> *w_c_hx_t = Transpose(weight_c_hx_->data());
-  Blob<float> *w_c_hh_t = Transpose(weight_c_hh_->data());
-
-  // Prepare the data input and the context
-  const auto& src = srclayers[0]->data(this);
-  const Blob<float> *context;
-  if (srclayers.size() == 1) {  // only have data input
-    context = new Blob<float>(batchsize_, hdim_);
-  } else {  // have data input & context
-    context = &srclayers[1]->data(this);
-  }
-
-  // Compute the update gate
-  GEMM(1.0f, 0.0f, src, *w_z_hx_t, update_gate_);
-  if (bias_z_ != nullptr)
-    MVAddRow(1.0f, 1.0f, bias_z_->data(), update_gate_);
-  GEMM(1.0f, 1.0f, *context, *w_z_hh_t, update_gate_);
-  Map<op::Sigmoid<float>, float>(*update_gate_, update_gate_);
-  // LOG(ERROR) << "Update Gate: " << update_gate_->cpu_data()[0];
-  // Compute the reset gate
-  GEMM(1.0f, 0.0f, src, *w_r_hx_t, reset_gate_);
-  if (bias_r_ != nullptr)
-    MVAddRow(1.0f, 1.0f, bias_r_->data(), reset_gate_);
-  GEMM(1.0f, 1.0f, *context, *w_r_hh_t, reset_gate_);
-  Map<op::Sigmoid<float>, float>(*reset_gate_, reset_gate_);
-  // LOG(ERROR) << "Reset Gate: " << reset_gate_->cpu_data()[0];
-  // Compute the new memory
-  GEMM(1.0f, 0.0f, *context, *w_c_hh_t, new_memory_);
-  Mult<float>(*reset_gate_, *new_memory_, new_memory_);
-  GEMM(1.0f, 1.0f, src, *w_c_hx_t, new_memory_);
-  if (bias_c_ != nullptr)
-    MVAddRow(1.0f, 1.0f, bias_c_->data(), new_memory_);
-  Map<op::Tanh<float>, float>(*new_memory_, new_memory_);
-
-  Sub(*context, *new_memory_, &data_);
-  Mult(data_, *update_gate_, &data_);
-  Add(data_, *new_memory_, &data_);
-
-  // delete the pointers
-  if (srclayers.size() == 1)
-    delete context;
-
-  delete w_z_hx_t;
-  delete w_z_hh_t;
-  delete w_r_hx_t;
-  delete w_r_hh_t;
-  delete w_c_hx_t;
-  delete w_c_hh_t;
-}
-
-void GRULayer::ComputeGradient(int flag,
-    const vector<Layer*>& srclayers) {
-  CHECK_LE(srclayers.size(), 2);
-  // agg grad from two dst layers, gradvec_[0] is grad_
-  AXPY(1.0f, *gradvec_[1], &grad_);
-  float beta = 1.0f;  // agg param gradients
-
-  Layer* ilayer = srclayers[0];  // input layer
-  Layer* clayer = nullptr;  // context layer
-  // Prepare the data input and the context
-  const Blob<float>& src = ilayer->data(this);
-  const Blob<float> *context;
-  if (srclayers.size() == 1) {  // only have data input
-    context = new Blob<float>(batchsize_, hdim_);
-  } else {  // have data input & context
-    clayer = srclayers[1];
-    context = &(clayer->data(this));
-  }
-
-  // Compute intermediate gradients which are used for other computations
-  Blob<float> dugatedz(batchsize_, hdim_);
-  Map<singa::op::SigmoidGrad<float>, float>(*update_gate_, &dugatedz);
-  Blob<float> drgatedr(batchsize_, hdim_);
-  Map<singa::op::SigmoidGrad<float>, float>(*reset_gate_, &drgatedr);
-  Blob<float> dnewmdc(batchsize_, hdim_);
-  Map<singa::op::TanhGrad<float>, float>(*new_memory_, &dnewmdc);
-
-  Blob<float> dLdz(batchsize_, hdim_);
-  Sub<float>(*context, *new_memory_, &dLdz);
-  Mult<float>(dLdz, grad_, &dLdz);
-  Mult<float>(dLdz, dugatedz, &dLdz);
-
-  Blob<float> dLdc(batchsize_, hdim_);
-  Blob<float> z1(batchsize_, hdim_);
-  z1.SetValue(1.0f);
-  AXPY<float>(-1.0f, *update_gate_, &z1);
-  Mult(grad_, z1, &dLdc);
-  Mult(dLdc, dnewmdc, &dLdc);
-
-  Blob<float> reset_dLdc(batchsize_, hdim_);
-  Mult(dLdc, *reset_gate_, &reset_dLdc);
-
-  Blob<float> dLdr(batchsize_, hdim_);
-  Blob<float> cprev(batchsize_, hdim_);
-  GEMM(1.0f, 0.0f, *context, weight_c_hh_->data().T(), &cprev);
-  Mult(dLdc, cprev, &dLdr);
-  Mult(dLdr, drgatedr, &dLdr);
-
-  // Compute gradients for parameters of update gate
-  Blob<float> *dLdz_t = Transpose(dLdz);
-  GEMM(1.0f, beta, *dLdz_t, src, weight_z_hx_->mutable_grad());
-  GEMM(1.0f, beta, *dLdz_t, *context, weight_z_hh_->mutable_grad());
-  if (bias_z_ != nullptr)
-    MVSumRow<float>(1.0f, beta, dLdz, bias_z_->mutable_grad());
-  delete dLdz_t;
-
-  // Compute gradients for parameters of reset gate
-  Blob<float> *dLdr_t = Transpose(dLdr);
-  GEMM(1.0f, beta, *dLdr_t, src, weight_r_hx_->mutable_grad());
-  GEMM(1.0f, beta, *dLdr_t, *context, weight_r_hh_->mutable_grad());
-  if (bias_r_ != nullptr)
-    MVSumRow(1.0f, beta, dLdr, bias_r_->mutable_grad());
-  delete dLdr_t;
-
-  // Compute gradients for parameters of new memory
-  Blob<float> *dLdc_t = Transpose(dLdc);
-  GEMM(1.0f, beta, *dLdc_t, src, weight_c_hx_->mutable_grad());
-  if (bias_c_ != nullptr)
-    MVSumRow(1.0f, beta, dLdc, bias_c_->mutable_grad());
-  delete dLdc_t;
-
-  Blob<float> *reset_dLdc_t = Transpose(reset_dLdc);
-  GEMM(1.0f, beta, *reset_dLdc_t, *context, weight_c_hh_->mutable_grad());
-  delete reset_dLdc_t;
-
-  // Compute gradients for data input layer
-  if (srclayers[0]->mutable_grad(this) != nullptr) {
-    GEMM(1.0f, 0.0f, dLdc, weight_c_hx_->data(), ilayer->mutable_grad(this));
-    GEMM(1.0f, 1.0f, dLdz, weight_z_hx_->data(), ilayer->mutable_grad(this));
-    GEMM(1.0f, 1.0f, dLdr, weight_r_hx_->data(), ilayer->mutable_grad(this));
-  }
-
-  if (clayer != nullptr && clayer->mutable_grad(this) != nullptr) {
-    // Compute gradients for context layer
-    GEMM(1.0f, 0.0f, reset_dLdc, weight_c_hh_->data(),
-        clayer->mutable_grad(this));
-    GEMM(1.0f, 1.0f, dLdr, weight_r_hh_->data(), clayer->mutable_grad(this));
-    GEMM(1.0f, 1.0f, dLdz, weight_z_hh_->data(), clayer->mutable_grad(this));
-    Add(clayer->grad(this), *update_gate_, clayer->mutable_grad(this));
-    // LOG(ERROR) << "grad to prev gru " << Asum(clayer->grad(this));
-  }
-
-  if (srclayers.size() == 1)
-    delete context;
-}
-
-}  // namespace singa
diff --git a/src/neuralnet/neuron_layer/inner_product.cc b/src/neuralnet/neuron_layer/inner_product.cc
deleted file mode 100644
index a7378a2..0000000
--- a/src/neuralnet/neuron_layer/inner_product.cc
+++ /dev/null
@@ -1,89 +0,0 @@
-/************************************************************
-*
-* Licensed to the Apache Software Foundation (ASF) under one
-* or more contributor license agreements.  See the NOTICE file
-* distributed with this work for additional information
-* regarding copyright ownership.  The ASF licenses this file
-* to you under the Apache License, Version 2.0 (the
-* "License"); you may not use this file except in compliance
-* with the License.  You may obtain a copy of the License at
-*
-*   http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an
-* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-* KIND, either express or implied.  See the License for the
-* specific language governing permissions and limitations
-* under the License.
-*
-*************************************************************/
-
-#include <glog/logging.h>
-#include "singa/neuralnet/neuron_layer.h"
-#include "singa/utils/singleton.h"
-#include "singa/utils/math_blob.h"
-
-namespace singa {
-
-using std::vector;
-
-InnerProductLayer::~InnerProductLayer() {
-  delete weight_;
-  delete bias_;
-}
-
-void InnerProductLayer::Setup(const LayerProto& conf,
-    const vector<Layer*>& srclayers) {
-  Layer::Setup(conf, srclayers);
-  CHECK_EQ(srclayers.size(), 1);
-  const auto& src = srclayers[0]->data(this);
-  batchsize_ = src.shape()[0];
-  vdim_ = src.count() / batchsize_;
-  hdim_ = layer_conf_.innerproduct_conf().num_output();
-  transpose_ = conf.innerproduct_conf().transpose();
-  if (partition_dim() > 0)
-    hdim_ /= srclayers.at(0)->num_partitions();
-  data_.Reshape(vector<int>{batchsize_, hdim_});
-  grad_.ReshapeLike(data_);
-  weight_ = Param::Create(conf.param(0));
-  bias_ = Param::Create(conf.param(1));
-  if (transpose_)
-    weight_->Setup(vector<int>{vdim_, hdim_});
-  else
-    weight_->Setup(vector<int>{hdim_, vdim_});
-  bias_->Setup(vector<int>{hdim_});
-}
-
-void InnerProductLayer::ComputeFeature(int flag,
-    const vector<Layer*>& srclayers) {
-  if (transpose_)
-    MMDot(srclayers[0]->data(this), weight_->data(), &data_);
-  else
-    MMDot(srclayers[0]->data(this), weight_->data().T(), &data_);
-  MVAddRow(bias_->data(), &data_);
-}
-
-void InnerProductLayer::ComputeGradient(int flag,
-    const vector<Layer*>& srclayers) {
-  float beta = 0.0f;
-  if (flag & kAggGrad)
-    beta = 1.0f;
-  MVSumRow(1.0f, beta, grad_, bias_->mutable_grad());
-  if (transpose_)
-    GEMM(1.0f, beta, srclayers[0]->data(this).T(), grad_,
-        weight_->mutable_grad());
-  else
-    GEMM(1.0f, beta, grad_.T(), srclayers[0]->data(this),
-        weight_->mutable_grad());
-
-  if (srclayers[0]->mutable_grad(this) != nullptr) {
-    if (transpose_)
-      MMDot(grad_, weight_->data().T(), srclayers[0]->mutable_grad(this));
-    else
-      MMDot(grad_, weight_->data(), srclayers[0]->mutable_grad(this));
-  }
-  //clee auto w = weight_->mutable_cpu_data();
-  //LOG(ERROR) << srclayers[0]->name() << " " << w[0];
-}
-}  // namespace singa
diff --git a/src/neuralnet/neuron_layer/lrn.cc b/src/neuralnet/neuron_layer/lrn.cc
deleted file mode 100644
index b199b9a..0000000
--- a/src/neuralnet/neuron_layer/lrn.cc
+++ /dev/null
@@ -1,75 +0,0 @@
-/************************************************************
-*
-* Licensed to the Apache Software Foundation (ASF) under one
-* or more contributor license agreements.  See the NOTICE file
-* distributed with this work for additional information
-* regarding copyright ownership.  The ASF licenses this file
-* to you under the Apache License, Version 2.0 (the
-* "License"); you may not use this file except in compliance
-* with the License.  You may obtain a copy of the License at
-*
-*   http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an
-* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-* KIND, either express or implied.  See the License for the
-* specific language governing permissions and limitations
-* under the License.
-*
-*************************************************************/
-
-#include <glog/logging.h>
-#include "singa/neuralnet/neuron_layer.h"
-#include "singa/utils/singleton.h"
-
-
-namespace singa {
-
-using std::vector;
-
-void LRNLayer::Setup(const LayerProto& conf, const vector<Layer*>& srclayers) {
-  Layer::Setup(conf, srclayers);
-  CHECK_EQ(srclayers.size(), 1);
-  lsize_ = conf.lrn_conf().local_size();
-  CHECK_EQ(lsize_ % 2, 1) << "LRN only supports odd values for Localvol";
-  knorm_ = conf.lrn_conf().knorm();
-  alpha_ = conf.lrn_conf().alpha();
-  beta_ = conf.lrn_conf().beta();
-  const vector<int>& s = srclayers[0]->data(this).shape();
-  data_.Reshape(s);
-  grad_.Reshape(s);
-  norm_.Reshape(s);
-  batchsize_ = s[0];
-  channels_ = s[1];
-  height_ = s[2];
-  width_ = s[3];
-}
-
-void LRNLayer::ComputeFeature(int flag, const vector<Layer*>& srclayers) {
-  const float salpha = alpha_ / lsize_;
-  auto src = Tensor4(srclayers[0]->mutable_data(this));
-  auto data = Tensor4(&data_);
-  auto norm = Tensor4(&norm_);
-  // stores normalizer without power
-  norm = expr::chpool<red::sum>(expr::F<op::square>(src), lsize_) * salpha
-    + knorm_;
-  data = src * expr::F<op::power>(norm, -beta_);
-}
-
-void LRNLayer::ComputeGradient(int flag, const vector<Layer*>& srclayers) {
-  const float salpha = alpha_ / lsize_;
-  auto src = Tensor4(srclayers[0]->mutable_data(this));
-  auto norm = Tensor4(&norm_);
-  auto grad = Tensor4(&grad_);
-  auto gsrc = Tensor4(srclayers[0]->mutable_grad(this));
-
-  gsrc = grad * expr::F<op::power>(norm, -beta_);
-  Tensor<cpu, 4> tmp(gsrc.shape);
-  AllocSpace(tmp);
-  tmp = gsrc * src / norm;
-  gsrc += (- 2.0f * beta_ * salpha) * expr::chpool<red::sum>(tmp, lsize_) * src;
-  FreeSpace(tmp);
-}
-
-}  // namespace singa
diff --git a/src/neuralnet/neuron_layer/pooling.cc b/src/neuralnet/neuron_layer/pooling.cc
deleted file mode 100644
index 4eda2e4..0000000
--- a/src/neuralnet/neuron_layer/pooling.cc
+++ /dev/null
@@ -1,146 +0,0 @@
-/************************************************************
-*
-* Licensed to the Apache Software Foundation (ASF) under one
-* or more contributor license agreements.  See the NOTICE file
-* distributed with this work for additional information
-* regarding copyright ownership.  The ASF licenses this file
-* to you under the Apache License, Version 2.0 (the
-* "License"); you may not use this file except in compliance
-* with the License.  You may obtain a copy of the License at
-*
-*   http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an
-* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-* KIND, either express or implied.  See the License for the
-* specific language governing permissions and limitations
-* under the License.
-*
-*************************************************************/
-
-#include <glog/logging.h>
-#include "singa/neuralnet/neuron_layer.h"
-#include "singa/utils/singleton.h"
-
-
-namespace singa {
-
-using std::vector;
-
-/******************** Implementation for PoolingLayer******************/
-void PoolingLayer::Setup(const LayerProto& conf,
-    const vector<Layer*>& srclayers) {
-  Layer::Setup(conf, srclayers);
-  CHECK_EQ(srclayers.size(), 1);
-  PoolingProto pool_conf = conf.pooling_conf();
-  if (pool_conf.has_kernel()) {
-    kernel_x_ = kernel_y_ = pool_conf.kernel();
-  } else {
-    kernel_x_ = pool_conf.kernel_x();
-    kernel_y_ = pool_conf.kernel_y();
-  }
-  CHECK_NE(kernel_x_, 0);
-  CHECK_NE(kernel_y_, 0);
-
-  if (pool_conf.has_pad()) {
-    pad_x_ = pad_y_ = pool_conf.pad();
-  } else {
-    pad_x_ = pool_conf.pad_x();
-    pad_y_ = pool_conf.pad_y();
-  }
-
-  if (pool_conf.has_stride()) {
-    stride_x_ = stride_y_ = pool_conf.stride();
-  } else {
-    stride_x_ = pool_conf.stride_x();
-    stride_y_ = pool_conf.stride_y();
-  }
-
-  pool_ = conf.pooling_conf().pool();
-  CHECK(pool_ == PoolingProto_PoolMethod_AVG
-        || pool_ == PoolingProto_PoolMethod_MAX)
-        << "Padding implemented only for average and max pooling.";
-  const auto& srcshape = srclayers[0]->data(this).shape();
-  int dim = srcshape.size();
-  CHECK_GT(dim, 2);
-  width_ = srcshape[dim - 1];
-  height_ = srcshape[dim - 2];
-  if (dim > 3)
-    channels_ = srcshape[dim-3];
-  else
-    channels_ = 1;
-  batchsize_ = srcshape[0];
-  pooled_height_ = static_cast<int>(
-      (height_ + 2 * pad_y_- kernel_y_) / stride_y_) + 1;
-  pooled_width_ = static_cast<int>(
-      (width_ + 2* pad_x_ - kernel_x_) / stride_x_) + 1;
-  data_.Reshape(vector<int>{batchsize_, channels_, pooled_height_,
-                            pooled_width_});
-  grad_.ReshapeLike(data_);
-}
-
-void PoolingLayer::ComputeFeature(int flag, const vector<Layer*>& srclayers) {
-  auto src = Tensor4(srclayers[0]->mutable_data(this));
-  auto data = Tensor4(&data_);
-  if (pool_ == PoolingProto_PoolMethod_MAX)
-    data = expr::pool<red::maximum>(src, kernel_x_, stride_x_);
-  else if (pool_ == PoolingProto_PoolMethod_AVG)
-    data = expr::pool<red::sum>(src, kernel_x_, stride_x_)
-      * (1.0f / (kernel_x_ * kernel_x_));
-}
-
-/*
- * partition only on num/channel dim
- * assume grad and data have the same paritition
- */
-void PoolingLayer::ComputeGradient(int flag, const vector<Layer*>& srclayers) {
-  auto src = Tensor4(srclayers[0]->mutable_data(this));
-  auto gsrc = Tensor4(srclayers[0]->mutable_grad(this));
-  auto data = Tensor4(&data_);
-  auto grad = Tensor4(&grad_);
-  if (pool_ == PoolingProto_PoolMethod_MAX)
-    gsrc = expr::unpool<red::maximum>(src, data, grad, kernel_x_, stride_x_);
-  else if (pool_ == PoolingProto_PoolMethod_AVG)
-    gsrc = expr::unpool<red::sum>(src, data, grad, kernel_x_, stride_x_)
-           * (1.0f / (kernel_x_ * kernel_x_));
-}
-
-/***************** Implementation of CPoolingLayer ***************/
-
-void CPoolingLayer::Setup(const LayerProto& conf,
-    const vector<Layer*>& srclayers) {
-  PoolingLayer::Setup(conf, srclayers);
-  if (pool_ == PoolingProto_PoolMethod_MAX)
-      mask_.ReshapeLike(data_);
-}
-void CPoolingLayer::ComputeFeature(int flag, const vector<Layer*>& srclayers) {
-  if (pool_ == PoolingProto_PoolMethod_MAX)
-    ForwardMaxPooling(srclayers[0]->mutable_data(this)->mutable_cpu_data(),
-        batchsize_, channels_, height_, width_, kernel_y_, kernel_x_,
-        pad_y_, pad_y_, stride_y_, stride_x_,
-        data_.mutable_cpu_data(), mask_.mutable_cpu_data());
-  else if (pool_ == PoolingProto_PoolMethod_AVG)
-    ForwardAvgPooling(srclayers[0]->mutable_data(this)->mutable_cpu_data(),
-        batchsize_, channels_, height_, width_, kernel_y_, kernel_x_,
-        pad_y_, pad_x_, stride_y_, stride_y_, data_.mutable_cpu_data());
-  else
-    LOG(FATAL) << "unknow pooling method";
-}
-
-void CPoolingLayer::ComputeGradient(int flag, const vector<Layer*>& srclayers) {
-  if (pool_ == PoolingProto_PoolMethod_MAX)
-    BackwardMaxPooling(grad_.cpu_data(), mask_.cpu_data(), batchsize_,
-        channels_, height_, width_, kernel_y_, kernel_x_, pad_y_, pad_x_,
-        stride_y_, stride_y_,
-        srclayers[0]->mutable_grad(this)->mutable_cpu_data());
-  else if (pool_ == PoolingProto_PoolMethod_AVG)
-    BackwardAvgPooling(grad_.cpu_data(), batchsize_,
-        channels_, height_, width_, kernel_y_, kernel_x_, pad_y_, pad_x_,
-        stride_y_, stride_x_,
-        srclayers[0]->mutable_grad(this)->mutable_cpu_data());
-  else
-    LOG(FATAL) << "unknow pooling method";
-}
-
-}  //  namespace singa
diff --git a/src/neuralnet/neuron_layer/rbm.cc b/src/neuralnet/neuron_layer/rbm.cc
deleted file mode 100644
index 67d0922..0000000
--- a/src/neuralnet/neuron_layer/rbm.cc
+++ /dev/null
@@ -1,200 +0,0 @@
-/************************************************************
-*
-* Licensed to the Apache Software Foundation (ASF) under one
-* or more contributor license agreements.  See the NOTICE file
-* distributed with this work for additional information
-* regarding copyright ownership.  The ASF licenses this file
-* to you under the Apache License, Version 2.0 (the
-* "License"); you may not use this file except in compliance
-* with the License.  You may obtain a copy of the License at
-*
-*   http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an
-* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-* KIND, either express or implied.  See the License for the
-* specific language governing permissions and limitations
-* under the License.
-*
-*************************************************************/
-
-#include <glog/logging.h>
-#include "singa/neuralnet/neuron_layer.h"
-#include "singa/utils/singleton.h"
-
-namespace singa {
-
-using std::vector;
-
-/**************** Implementation for RBMLayer********************/
-Blob<float>* RBMLayer::Sample(int flag) {
-  Tensor<cpu, 2> sample, data;
-  if ((flag & kPositive) == kPositive || first_gibbs_) {
-    data = Tensor2(&pos_data_);
-    sample = Tensor2(&pos_sample_);
-  } else {
-    data = Tensor2(&neg_data_);
-    sample = Tensor2(&neg_sample_);
-  }
-  auto random = TSingleton<Random<cpu>>::Instance();
-  if (gaussian_) {
-    random->SampleGaussian(sample, 0.0f, 1.0f);
-    sample += data;
-  } else {
-    random->SampleBinary(sample, data);
-  }
-  return (flag & kPositive) == kPositive || first_gibbs_ ?
-    &pos_sample_ : &neg_sample_;
-}
-void RBMLayer::Setup(const LayerProto& conf, const vector<Layer*>& srclayers) {
-  Layer::Setup(conf, srclayers);
-  hdim_ = conf.rbm_conf().hdim();
-  gaussian_ = conf.rbm_conf().gaussian();
-  first_gibbs_ = true;
-  datavec_.clear();
-  datavec_.push_back(&pos_data_);
-  datavec_.push_back(&neg_data_);
-  datavec_.push_back(&neg_sample_);
-  datavec_.push_back(&pos_sample_);
-  gradvec_.resize(4);
-}
-/**************** Implementation for RBMVisLayer********************/
-RBMVisLayer::~RBMVisLayer() {
-  delete weight_;
-  delete bias_;
-}
-
-void RBMVisLayer::Setup(const LayerProto& conf,
-    const vector<Layer*>& srclayers) {
-  CHECK_EQ(srclayers.size(), 2);
-  RBMLayer::Setup(conf, srclayers);
-  CHECK_EQ(srclayers.size(), 2);
-  hid_layer_ = nullptr;
-  for (auto src : srclayers) {
-    if (typeid(*src) == typeid(RBMHidLayer)) {
-      // note the hid layer has may not been set up.
-      CHECK(hid_layer_ == nullptr);
-      hid_layer_ = dynamic_cast<RBMHidLayer*>(src);
-    }
-  }
-  input_layer_ = srclayers[0] != hid_layer_ ? srclayers[0]: srclayers[1];
-  const auto& src = input_layer_->data(this);
-  batchsize_ = src.shape()[0];
-  pos_data_.ReshapeLike(src);
-  neg_data_.ReshapeLike(pos_data_);
-  neg_sample_.ReshapeLike(pos_data_);
-  vdim_ = src.count() / batchsize_;
-  weight_ = Param::Create(conf.param(0));
-  weight_ ->Setup(vector<int>{hdim_, vdim_});
-  bias_ = Param::Create(conf.param(1));
-  bias_->Setup(vector<int>{vdim_});
-}
-
-void RBMVisLayer::ComputeFeature(int flag, const vector<Layer*>& srclayers) {
-  if ((flag & kPositive) == kPositive) {
-    pos_data_.CopyFrom(input_layer_->data(this), true);
-    first_gibbs_ = true;
-  } else if ((flag & kNegative) == kNegative) {
-    // fetch sampling results from hidden layer
-    auto hid_sample = Tensor2(hid_layer_->Sample(flag));
-    auto data = Tensor2(&neg_data_);
-    auto weight = Tensor2(weight_->mutable_data());
-    auto bias = Tensor1(bias_->mutable_data());
-    data = dot(hid_sample, weight);
-    data += expr::repmat(bias, batchsize_);
-    data = expr::F<op::sigmoid>(data);
-    if ((flag & kTest) == kTest) {
-      const float *dptr = pos_data_.cpu_data(), *rcns = neg_data_.cpu_data();
-      float err = 0.f;
-      for (int i = 0; i < pos_data_.count(); i++) {
-        err += (dptr[i] - rcns[i]) * (dptr[i] - rcns[i]);
-      }
-      error_ += err / batchsize_;
-    }
-    first_gibbs_ = false;
-  }
-  counter_ += 1;
-}
-
-void RBMVisLayer::ComputeGradient(int flag, const vector<Layer*>& srclayers) {
-  auto vis_pos = Tensor2(&pos_data_);
-  auto vis_neg = Tensor2(&neg_data_);
-  auto hid_pos = Tensor2(hid_layer_->mutable_data(0));
-  auto hid_neg = Tensor2(hid_layer_->mutable_data(1));
-
-  auto gbias = Tensor1(bias_->mutable_grad());
-  gbias = expr::sum_rows(vis_neg);
-  gbias -= expr::sum_rows(vis_pos);
-  gbias /= batchsize_;
-
-  auto gweight = Tensor2(weight_->mutable_grad());
-  gweight = dot(hid_neg.T(), vis_neg);
-  gweight -= dot(hid_pos.T(), vis_pos);
-  gweight /= batchsize_;
-}
-const std::string RBMVisLayer::ToString(bool debug, int flag) {
-  if (debug)
-    return Layer::ToString(debug, flag);
-
-  string disp = "Squared Error = " + std::to_string(error_ / counter_);
-  counter_ = 0;
-  error_ = 0;
-  return disp;
-}
-/**************** Implementation for RBMHidLayer********************/
-RBMHidLayer::~RBMHidLayer() {
-  delete weight_;
-  delete bias_;
-}
-
-void RBMHidLayer::Setup(const LayerProto& conf,
-      const vector<Layer*>& srclayers) {
-  RBMLayer::Setup(conf, srclayers);
-  CHECK_EQ(srclayers.size(), 1);
-  const auto& src_data = srclayers[0]->data(0);
-  batchsize_ = src_data.shape()[0];
-  vdim_ = src_data.count() / batchsize_;
-  pos_data_.Reshape(vector<int>{batchsize_, hdim_});
-  neg_data_.ReshapeLike(pos_data_);
-  pos_sample_.ReshapeLike(pos_data_);
-  neg_sample_.ReshapeLike(pos_data_);
-  weight_ = Param::Create(conf.param(0));
-  weight_->Setup(vector<int>{hdim_, vdim_});
-  bias_ = Param::Create(conf.param(1));
-  bias_->Setup(vector<int>{hdim_});
-  vis_layer_ = dynamic_cast<RBMVisLayer*> (srclayers[0]);
-}
-
-void RBMHidLayer::ComputeFeature(int flag, const vector<Layer*>& srclayers) {
-  auto weight = Tensor2(weight_->mutable_data());
-  auto bias = Tensor1(bias_->mutable_data());
-
-  Tensor<cpu, 2> data, src;
-  if ((flag & kPositive) == kPositive) {
-    data = Tensor2(&pos_data_);
-    src = Tensor2(vis_layer_->mutable_data(0));
-    first_gibbs_ = true;
-  } else {
-    data = Tensor2(&neg_data_);
-    // hinton's science paper does not sample the vis layer
-    src = Tensor2(vis_layer_->mutable_data(1));
-    first_gibbs_ = false;
-  }
-  data = dot(src, weight.T());
-  data += expr::repmat(bias, batchsize_);
-
-  if (!gaussian_)
-    data = expr::F<op::sigmoid>(data);
-}
-
-void RBMHidLayer::ComputeGradient(int flag, const vector<Layer*>& srclayers) {
-  auto hid_pos = Tensor2(&pos_data_);
-  auto hid_neg = Tensor2(&neg_data_);
-  auto gbias = Tensor1(bias_->mutable_grad());
-  gbias = expr::sum_rows(hid_neg);
-  gbias -= expr::sum_rows(hid_pos);
-  gbias /= batchsize_;
-}
-
-}  // namespace singa
diff --git a/src/neuralnet/neuron_layer/relu.cc b/src/neuralnet/neuron_layer/relu.cc
deleted file mode 100644
index 5d4d954..0000000
--- a/src/neuralnet/neuron_layer/relu.cc
+++ /dev/null
@@ -1,51 +0,0 @@
-/************************************************************
-*
-* Licensed to the Apache Software Foundation (ASF) under one
-* or more contributor license agreements.  See the NOTICE file
-* distributed with this work for additional information
-* regarding copyright ownership.  The ASF licenses this file
-* to you under the Apache License, Version 2.0 (the
-* "License"); you may not use this file except in compliance
-* with the License.  You may obtain a copy of the License at
-*
-*   http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an
-* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-* KIND, either express or implied.  See the License for the
-* specific language governing permissions and limitations
-* under the License.
-*
-*************************************************************/
-
-#include <glog/logging.h>
-#include "singa/neuralnet/neuron_layer.h"
-#include "singa/utils/singleton.h"
-
-
-namespace singa {
-
-using std::vector;
-
-void ReLULayer::Setup(const LayerProto& conf,
-    const vector<Layer*>& srclayers) {
-  Layer::Setup(conf, srclayers);
-  data_.ReshapeLike(srclayers[0]->data(this));
-  grad_.ReshapeLike(*(srclayers[0]->mutable_grad(this)));
-}
-
-void ReLULayer::ComputeFeature(int flag, const vector<Layer*>& srclayers) {
-  auto data = Tensor1(&data_);
-  auto src = Tensor1(srclayers[0]->mutable_data(this));
-  data = expr::F<op::relu>(src);
-}
-
-void ReLULayer::ComputeGradient(int flag, const vector<Layer*>& srclayers) {
-  auto data = Tensor1(&data_);
-  auto grad = Tensor1(&grad_);
-  auto gsrc = Tensor1(srclayers[0]->mutable_grad(this));
-  gsrc = expr::F<op::relu_grad>(data)*grad;
-}
-
-}  //  namespace singa
diff --git a/src/neuralnet/neuron_layer/sigmoid.cc b/src/neuralnet/neuron_layer/sigmoid.cc
deleted file mode 100644
index 9348011..0000000
--- a/src/neuralnet/neuron_layer/sigmoid.cc
+++ /dev/null
@@ -1,51 +0,0 @@
-/************************************************************
-*
-* Licensed to the Apache Software Foundation (ASF) under one
-* or more contributor license agreements.  See the NOTICE file
-* distributed with this work for additional information
-* regarding copyright ownership.  The ASF licenses this file
-* to you under the Apache License, Version 2.0 (the
-* "License"); you may not use this file except in compliance
-* with the License.  You may obtain a copy of the License at
-*
-*   http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an
-* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-* KIND, either express or implied.  See the License for the
-* specific language governing permissions and limitations
-* under the License.
-*
-*************************************************************/
-
-#include <glog/logging.h>
-#include "singa/neuralnet/neuron_layer.h"
-#include "singa/utils/singleton.h"
-
-
-namespace singa {
-
-using std::vector;
-
-void SigmoidLayer::Setup(const LayerProto& conf,
-    const vector<Layer*>& srclayers) {
-  Layer::Setup(conf, srclayers);
-  data_.ReshapeLike(srclayers[0]->data(this));
-  grad_.ReshapeLike(srclayers[0]->grad(this));
-}
-
-void SigmoidLayer::ComputeFeature(int flag, const vector<Layer*>& srclayers) {
-  auto data = Tensor1(&data_);
-  auto src = Tensor1(srclayers[0]->mutable_data(this));
-  data = expr::F<op::sigmoid>(src);
-}
-
-void SigmoidLayer::ComputeGradient(int flag, const vector<Layer*>& srclayers) {
-  auto data = Tensor1(&data_);
-  auto grad = Tensor1(&grad_);
-  auto gsrc = Tensor1(srclayers[0]->mutable_grad(this));
-  gsrc = expr::F<op::sigmoid_grad>(data) * grad;
-}
-
-}  //  namespace singa
diff --git a/src/neuralnet/neuron_layer/softmax.cc b/src/neuralnet/neuron_layer/softmax.cc
deleted file mode 100644
index 4a09241..0000000
--- a/src/neuralnet/neuron_layer/softmax.cc
+++ /dev/null
@@ -1,70 +0,0 @@
-/************************************************************
-*
-* Licensed to the Apache Software Foundation (ASF) under one
-* or more contributor license agreements.  See the NOTICE file
-* distributed with this work for additional information
-* regarding copyright ownership.  The ASF licenses this file
-* to you under the Apache License, Version 2.0 (the
-* "License"); you may not use this file except in compliance
-* with the License.  You may obtain a copy of the License at
-*
-*   http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an
-* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-* KIND, either express or implied.  See the License for the
-* specific language governing permissions and limitations
-* under the License.
-*
-*************************************************************/
-
-#include "singa/neuralnet/neuron_layer.h"
-
-namespace singa {
-
-using namespace mshadow;
-using mshadow::cpu;
-
-using mshadow::Shape;
-using mshadow::Shape1;
-using mshadow::Shape2;
-using mshadow::Tensor;
-
-using std::vector;
-
-void SoftmaxLayer::Setup(const LayerProto& proto,
-    const vector<Layer*>& srclayers) {
-  CHECK_EQ(srclayers.size(), 1);
-  NeuronLayer::Setup(proto, srclayers);
-  const auto& srcdata = srclayers[0]->data(this);
-  batchsize_ = srcdata.shape()[0];
-  dim_ = srcdata.count() / batchsize_;
-  /*
-  num_softmax_per_instance_ = proto.softmax_conf().num_softmax_per_instance();
-  count_per_softmax_ = srcdata.count() / batchsize_ / num_softmax_per_instance_;
-  */
-  data_.Reshape(batchsize_, dim_);
-  grad_.ReshapeLike(data_);
-}
-
-void SoftmaxLayer::ComputeFeature(int flag,
-    const vector<Layer*>& srclayers) {
-  int dim = data_.count() / batchsize_;
-  Shape<2> s = Shape2(batchsize_, dim);
-  Tensor<cpu, 2> prob(data_.mutable_cpu_data(), s);
-  Tensor<cpu, 2> src(srclayers[0]->mutable_data(this)->mutable_cpu_data(), s);
-  Softmax(prob, src);
-}
-
-void SoftmaxLayer::ComputeGradient(int flag,
-    const vector<Layer*>& srclayers) {
-  int batchsize = data_.shape()[0];
-  LOG(FATAL) << "not implemented";
-  for (int n = 0; n < batchsize; n++) {
-    // TODO(wangwei) finish the code using new math API
-    // gxi=[(gyi+gyi*yi)-\sum_k(gyk*yk)]*yi
-  }
-}
-
-}  // namespace singa
diff --git a/src/neuralnet/neuron_layer/stanh.cc b/src/neuralnet/neuron_layer/stanh.cc
deleted file mode 100644
index 70b9cd1..0000000
--- a/src/neuralnet/neuron_layer/stanh.cc
+++ /dev/null
@@ -1,48 +0,0 @@
-/************************************************************
-*
-* Licensed to the Apache Software Foundation (ASF) under one
-* or more contributor license agreements.  See the NOTICE file
-* distributed with this work for additional information
-* regarding copyright ownership.  The ASF licenses this file
-* to you under the Apache License, Version 2.0 (the
-* "License"); you may not use this file except in compliance
-* with the License.  You may obtain a copy of the License at
-*
-*   http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an
-* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-* KIND, either express or implied.  See the License for the
-* specific language governing permissions and limitations
-* under the License.
-*
-*************************************************************/
-
-#include "singa/neuralnet/neuron_layer.h"
-
-namespace singa {
-
-using std::vector;
-
-void STanhLayer::Setup(const LayerProto& conf,
-    const vector<Layer*>& srclayers) {
-  Layer::Setup(conf, srclayers);
-  data_.ReshapeLike(srclayers[0]->data(this));
-  grad_.ReshapeLike(srclayers[0]->grad(this));
-}
-
-void STanhLayer::ComputeFeature(int flag, const vector<Layer*>& srclayers) {
-  auto data = Tensor1(&data_);
-  auto src = Tensor1(srclayers[0]->mutable_data(this));
-  data = expr::F<op::stanh>(src);
-}
-
-void STanhLayer::ComputeGradient(int flag, const vector<Layer*>& srclayers) {
-  auto data = Tensor1(&data_);
-  auto grad = Tensor1(&grad_);
-  auto gsrc = Tensor1(srclayers[0]->mutable_grad(this));
-  gsrc = expr::F<op::stanh_grad>(data) * grad;
-}
-
-}  // namespace singa
diff --git a/src/neuralnet/output_layer/accuracy.cc b/src/neuralnet/output_layer/accuracy.cc
deleted file mode 100644
index 53a9406..0000000
--- a/src/neuralnet/output_layer/accuracy.cc
+++ /dev/null
@@ -1,61 +0,0 @@
-/************************************************************
-*
-* Licensed to the Apache Software Foundation (ASF) under one
-* or more contributor license agreements.  See the NOTICE file
-* distributed with this work for additional information
-* regarding copyright ownership.  The ASF licenses this file
-* to you under the Apache License, Version 2.0 (the
-* "License"); you may not use this file except in compliance
-* with the License.  You may obtain a copy of the License at
-*
-*   http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an
-* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-* KIND, either express or implied.  See the License for the
-* specific language governing permissions and limitations
-* under the License.
-*
-*************************************************************/
-
-#include <algorithm>
-#include "singa/neuralnet/output_layer.h"
-
-namespace singa {
-
-void AccuracyLayer::Setup(const LayerProto& proto,
-    const vector<Layer*>& srclayers) {
-  CHECK_EQ(srclayers.size(), 2);
-  ArgSortLayer::Setup(proto, vector<Layer*>{srclayers.at(0)});
-}
-
-void AccuracyLayer::ComputeFeature(int flag,
-    const vector<Layer*>& srclayers) {
-  ArgSortLayer::ComputeFeature(flag, vector<Layer*>{srclayers.at(0)});
-  const auto& label = srclayers[1]->aux_data(this);
-  int ncorrect = 0;
-  for (int n = 0; n < batchsize_; n++) {
-    const float* pos = data_.cpu_data() + topk_ * n;
-    // check if true label is in top k predictions
-    for (int k = 0; k < topk_; k++) {
-      if (pos[k] == label[n]) {
-        ncorrect++;
-        break;
-      }
-    }
-  }
-  accuracy_ += ncorrect * 1.0f / batchsize_;
-  counter_++;
-}
-
-const std::string AccuracyLayer::ToString(bool debug, int flag) {
-  if (debug)
-    return Layer::ToString(debug, flag);
-
-  string disp = "accuracy = " + std::to_string(accuracy_ / counter_);
-  counter_ = 0;
-  accuracy_ = 0;
-  return disp;
-}
-}  // namespace singa
diff --git a/src/neuralnet/output_layer/argsort.cc b/src/neuralnet/output_layer/argsort.cc
deleted file mode 100644
index 869bc65..0000000
--- a/src/neuralnet/output_layer/argsort.cc
+++ /dev/null
@@ -1,56 +0,0 @@
-/************************************************************
-*
-* Licensed to the Apache Software Foundation (ASF) under one
-* or more contributor license agreements.  See the NOTICE file
-* distributed with this work for additional information
-* regarding copyright ownership.  The ASF licenses this file
-* to you under the Apache License, Version 2.0 (the
-* "License"); you may not use this file except in compliance
-* with the License.  You may obtain a copy of the License at
-*
-*   http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an
-* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-* KIND, either express or implied.  See the License for the
-* specific language governing permissions and limitations
-* under the License.
-*
-*************************************************************/
-
-#include <algorithm>
-#include "singa/neuralnet/output_layer.h"
-
-namespace singa {
-
-void ArgSortLayer::Setup(const LayerProto& proto,
-    const vector<Layer*>& srclayers) {
-  CHECK_EQ(srclayers.size(), 1);
-  OutputLayer::Setup(proto, srclayers);
-  batchsize_ = srclayers[0]->data(this).shape()[0];
-  dim_ = srclayers[0]->data(this).count() / batchsize_;
-  topk_ = proto.argsort_conf().topk();
-  data_.Reshape(vector<int>{batchsize_, topk_});
-}
-
-void ArgSortLayer::ComputeFeature(int flag,
-    const vector<Layer*>& srclayers) {
-  // TODO(wangwei) check flag to ensure it is not called in training phase
-  const float* srcptr = srclayers.at(0)->data(this).cpu_data();
-  float* ptr = data_.mutable_cpu_data();
-  for (int n = 0; n < batchsize_; n++) {
-    vector<std::pair<float, int> > vec;
-    for (int j = 0; j < dim_; ++j)
-      vec.push_back(std::make_pair(srcptr[j], j));
-    std::partial_sort(vec.begin(), vec.begin() + topk_, vec.end(),
-                      std::greater<std::pair<float, int> >());
-
-    for (int j = 0; j < topk_; ++j)
-      ptr[j] = static_cast<float> (vec.at(j).second);
-    ptr += topk_;
-    srcptr += dim_;
-  }
-}
-
-}  // namespace singa
diff --git a/src/neuralnet/output_layer/char_rnn.cc b/src/neuralnet/output_layer/char_rnn.cc
deleted file mode 100644
index c3f1733..0000000
--- a/src/neuralnet/output_layer/char_rnn.cc
+++ /dev/null
@@ -1,51 +0,0 @@
-/************************************************************
-*
-* Licensed to the Apache Software Foundation (ASF) under one
-* or more contributor license agreements.  See the NOTICE file
-* distributed with this work for additional information
-* regarding copyright ownership.  The ASF licenses this file
-* to you under the Apache License, Version 2.0 (the
-* "License"); you may not use this file except in compliance
-* with the License.  You may obtain a copy of the License at
-*
-*   http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an
-* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-* KIND, either express or implied.  See the License for the
-* specific language governing permissions and limitations
-* under the License.
-*
-*************************************************************/
-
-#include <algorithm>
-#include <iostream>
-#include <fstream>
-#include "singa/neuralnet/output_layer.h"
-
-namespace singa {
-
-void CharRNNOutputLayer::Setup(const LayerProto& proto,
-    const vector<Layer*>& srclayers) {
-  CHECK_EQ(srclayers.size(), 1);
-  OutputLayer::Setup(proto, srclayers);
-  std::ifstream fin;
-  const string path = proto.char_rnn_conf().vocab_path();
-  fin.open(path);
-  CHECK(fin.is_open()) << "Can't open vocab_path = " << path;
-  std::stringstream stream;
-  stream << fin.rdbuf();
-  vocab_ = stream.str();
-  fin.close();
-}
-
-void CharRNNOutputLayer::ComputeFeature(int flag,
-    const vector<Layer*>& srclayers) {
-  const float* dptr =  srclayers[0]->data(this).cpu_data();
-  for (int i = 0; i < srclayers[0]->data(this).shape(0); i++) {
-    std::cout<<vocab_[static_cast<int>(dptr[i])];
-  }
-}
-
-}  // namespace singa;
diff --git a/src/neuralnet/output_layer/csv.cc b/src/neuralnet/output_layer/csv.cc
deleted file mode 100644
index d2512da..0000000
--- a/src/neuralnet/output_layer/csv.cc
+++ /dev/null
@@ -1,59 +0,0 @@
-/************************************************************
-*
-* Licensed to the Apache Software Foundation (ASF) under one
-* or more contributor license agreements.  See the NOTICE file
-* distributed with this work for additional information
-* regarding copyright ownership.  The ASF licenses this file
-* to you under the Apache License, Version 2.0 (the
-* "License"); you may not use this file except in compliance
-* with the License.  You may obtain a copy of the License at
-*
-*   http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an
-* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-* KIND, either express or implied.  See the License for the
-* specific language governing permissions and limitations
-* under the License.
-*
-*************************************************************/
-#include "singa/neuralnet/output_layer.h"
-
-namespace singa {
-
-void CSVOutputLayer::Setup(const LayerProto& conf,
-    const vector<Layer*>& srclayers) {
-  OutputLayer::Setup(conf, srclayers);
-  CHECK_EQ(srclayers.size(), 1);
-}
-
-void CSVOutputLayer::ComputeFeature(int flag, const vector<Layer*>& srclayers) {
-  if (store_ == nullptr) {
-    string backend = "textfile";
-    const auto& conf = layer_conf_.store_conf();
-    if (conf.has_backend())
-      backend = conf.has_backend();
-    store_ = io::OpenStore(backend, conf.path(), io::kCreate);
-  }
-  const auto& data = srclayers.at(0)->data(this);
-  const auto& label = srclayers.at(0)->aux_data();
-  int batchsize = data.shape()[0];
-  CHECK_GT(batchsize, 0);
-  int dim = data.count() / batchsize;
-  if (label.size())
-    CHECK_EQ(label.size(), batchsize);
-  CHECK_GT(dim, 0);
-  for (int k = 0; k < batchsize; k++) {
-    std::ostringstream record;
-    if (label.size())
-      record << std::to_string(label[k]) << ",";
-    auto* dptr = data.cpu_data() + k * dim;
-    for (int i = 0; i < dim - 1; i++)
-      record << std::to_string(dptr[i]) << ",";
-    record << std::to_string(dptr[dim - 1]);
-    store_->Write(std::to_string(inst_++), record.str());
-  }
-  store_->Flush();
-}
-}  // namespace singa
diff --git a/src/neuralnet/output_layer/record.cc b/src/neuralnet/output_layer/record.cc
deleted file mode 100644
index f7b3e01..0000000
--- a/src/neuralnet/output_layer/record.cc
+++ /dev/null
@@ -1,56 +0,0 @@
-/************************************************************
-*
-* Licensed to the Apache Software Foundation (ASF) under one
-* or more contributor license agreements.  See the NOTICE file
-* distributed with this work for additional information
-* regarding copyright ownership.  The ASF licenses this file
-* to you under the Apache License, Version 2.0 (the
-* "License"); you may not use this file except in compliance
-* with the License.  You may obtain a copy of the License at
-*
-*   http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an
-* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-* KIND, either express or implied.  See the License for the
-* specific language governing permissions and limitations
-* under the License.
-*
-*************************************************************/
-#include "singa/neuralnet/output_layer.h"
-#include "singa/proto/common.pb.h"
-namespace singa {
-
-void RecordOutputLayer::Setup(const LayerProto& conf,
-    const vector<Layer*>& srclayers) {
-  OutputLayer::Setup(conf, srclayers);
-  CHECK_EQ(srclayers.size(), 1);
-}
-
-void RecordOutputLayer::ComputeFeature(int flag,
-    const vector<Layer*>& srclayers) {
-  if (store_ == nullptr)
-    store_ = io::OpenStore(layer_conf_.store_conf().backend(),
-        layer_conf_.store_conf().path(), io::kCreate);
-  const auto& data = srclayers.at(0)->data(this);
-  const auto& label = srclayers.at(0)->aux_data();
-  int batchsize = data.shape()[0];
-  CHECK_GT(batchsize, 0);
-  int dim = data.count() / batchsize;
-  if (label.size())
-    CHECK_EQ(label.size(), batchsize);
-  for (int k = 0; k < batchsize; k++) {
-    SingleLabelImageRecord image;
-    if (label.size())
-      image.set_label(label[k]);
-    auto* dptr = data.cpu_data() + k * dim;
-    for (int i = 0; i < dim; i++)
-      image.add_data(dptr[i]);
-    std::string val;
-    image.SerializeToString(&val);
-    store_->Write(std::to_string(inst_++), val);
-  }
-  store_->Flush();
-}
-}  // namespace singa
diff --git a/src/proto/common.proto b/src/proto/common.proto
deleted file mode 100644
index b1ba1b6..0000000
--- a/src/proto/common.proto
+++ /dev/null
@@ -1,114 +0,0 @@
-/************************************************************
-*
-* Licensed to the Apache Software Foundation (ASF) under one
-* or more contributor license agreements.  See the NOTICE file
-* distributed with this work for additional information
-* regarding copyright ownership.  The ASF licenses this file
-* to you under the Apache License, Version 2.0 (the
-* "License"); you may not use this file except in compliance
-* with the License.  You may obtain a copy of the License at
-*
-*   http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an
-* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-* KIND, either express or implied.  See the License for the
-* specific language governing permissions and limitations
-* under the License.
-*
-*************************************************************/
-
-package singa;
-
-enum MsgType {
-  kGet = 0;
-  kPut = 1;
-  kSync = 2;
-  kUpdate = 3;
-  kSyncRequest = 4;
-  kSyncResponse = 5;
-  kStop = 6;
-  kData = 7;
-  kRGet = 8;
-  kRUpdate = 9;
-  kConnect = 10;
-  kMetric = 11;
-};
-
-enum EntityType {
-  kWorkerParam = 0;
-  kWorkerLayer = 1;
-  kServer = 2;
-  kStub = 3;
-  kRuntime = 4;
-};
-
-enum ConnectionType {
-  kOneToOne = 0;
-  kOneToAll = 1;
-  kOneToMany = 2;
-}
-
-// to import caffe's lmdb dataset
-message CaffeDatum {
-  optional int32 channels = 1;
-  optional int32 height = 2;
-  optional int32 width = 3;
-  // the actual image data, in bytes
-  optional bytes data = 4;
-  optional int32 label = 5;
-  // Optionally, the datum could also hold float data.
-  repeated float float_data = 6;
-  // If true data contains an encoded image that need to be decoded
-  optional bool encoded = 7 [default = false];
-}
-
-// to import caffe's blob, e.g., image mean
-message CaffeBlob {
-  optional int32 num = 1 [default = 0];
-  optional int32 channels = 2 [default = 0];
-  optional int32 height = 3 [default = 0];
-  optional int32 width = 4 [default = 0];
-  repeated float data = 5 [packed = true];
-  repeated float diff = 6 [packed = true];
-}
-
-message BlobProto {
-  repeated int32 shape = 1;
-  repeated float data = 2 [packed = true];
-}
-
-message BlobProtos {
-  repeated int32 id = 2;
-  repeated int32 version = 3;
-  repeated string name = 4;
-  repeated BlobProto blob = 5;
-}
-
-message Record {
-  enum Type {
-    // each record contains image raw feature and its label.
-    kSingleLabelImage = 0;
-  }
-  optional Type type = 1 [default = kSingleLabelImage];
-  optional string user_type =2;
-  // configuration for
-  optional RecordProto image = 5;
-
-  extensions 101 to 200;
-}
-
-// rename SingleLabelImageRecord to RecordProto
-message RecordProto {
-  repeated int32 shape = 1;
-  optional int32 label = 2;
-  optional bytes pixel = 3;
-  repeated float data = 4 [packed = true];
-}
-
-message MetricProto {
-  repeated string name = 1;
-  repeated int32 count = 2;
-  repeated float val = 3;
-}
diff --git a/src/proto/core.proto b/src/proto/core.proto
new file mode 100644
index 0000000..c88bee9
--- /dev/null
+++ b/src/proto/core.proto
@@ -0,0 +1,76 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package singa;
+
+// TODO(wangwei) check protobuf version to include the syntax
+//syntax = "proto2";
+
+enum DataType {
+  kFloat32 = 0;
+  kFloat16 = 1;
+  kInt = 2;
+  kChar = 3;
+  kDouble = 4;
+  kUChar = 5;
+  kNumDataType = 6;
+}
+
+enum LangType {
+  kCpp = 0;
+  kCuda = 1;
+  kOpencl = 2;
+  kNumDeviceType = 4;
+}
+
+enum CopyDirection {
+  kHostToHost = 0;
+  kHostToDevice = 1;
+  kDeviceToHost = 2;
+  kDeviceToDevice = 3;
+  kNumDirection = 4;
+}
+
+// configuration for device memory pool
+message MemPoolConf {
+	optional string type = 1 [default = "cnmem"];
+	// allocation size for each device, default is 256 MB
+	optional uint32 init_size = 2 [default = 256];
+  // size limit in MB; report error/warning if this limit is reached.
+  // 0 for unlimited memory, i.e., use as much memory as the device has
+  // not used currently.
+	optional uint32 max_size = 3 [default = 0];
+
+	// memory manager flag for cnmem
+	// flag = 0: default flag
+	// flag = 1: prevent the manager from growing its memory consumption
+	// flag = 2: prevent the manager from stealing memory
+	optional uint32 flag = 11 [default = 0];
+  repeated uint32 device = 12;
+}
+
+// For tensor serialization
+message TensorProto {
+  repeated uint32 shape = 1;
+  optional DataType data_type = 2;
+  optional bool transpose = 3;
+  repeated float float_data = 4;
+  repeated double double_data = 5;
+  repeated int32 int_data = 6;
+  repeated bytes bytes_data = 7;
+}
diff --git a/src/proto/io.proto b/src/proto/io.proto
new file mode 100644
index 0000000..f349f74
--- /dev/null
+++ b/src/proto/io.proto
@@ -0,0 +1,58 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package singa;
+
+
+message EncoderConf {
+  optional string type = 1 [default = "jpg2proto"];
+  optional string image_dim_order = 2 [default = "HWC"];
+}
+
+message DecoderConf {
+  optional string type = 1 [default = "proto2jpg"];
+  optional string image_dim_order = 2 [default = "CHW"];
+  optional bool has_label = 3 [default = true];
+}
+
+message TransformerConf {
+  optional bool featurewise_center = 1 [default = false];
+  optional bool samplewise_center = 2 [default = false];
+  optional bool featurewise_std_norm = 3 [default = false];
+  optional bool samplewise_std_norm = 4 [default = false];
+  optional bool zca_whitening = 5 [default = false];
+  optional int32 rotation_range = 6 [default = 0];
+  /// crop_shape must contain exactly 2 elements, in order:
+  /// crop_height, crop_width.
+  repeated uint32 crop_shape = 7 [packed = true];
+  optional int32 resize_height = 8 [default = 0];
+  optional int32 resize_width = 9 [default = 0];
+  optional bool horizontal_mirror = 10 [default = false];
+  optional bool vertical_mirror = 11 [default = false];
+  /// if input tensor is 4D or 3D,
+  /// supported shape of inputs are "CHW" and "HWC".
+  /// if input tensor is 2D, this field will be ignored.
+  optional string image_dim_order = 12 [default = "CHW"];
+  optional float rescale = 13 [default = 0];
+}
+
+message ImageRecord {
+  repeated int32 shape = 1;
+  repeated int32 label = 2;
+  optional bytes pixel = 3;
+}
diff --git a/src/proto/job.proto b/src/proto/job.proto
deleted file mode 100644
index b4aa971..0000000
--- a/src/proto/job.proto
+++ /dev/null
@@ -1,816 +0,0 @@
-/************************************************************
-*
-* Licensed to the Apache Software Foundation (ASF) under one
-* or more contributor license agreements.  See the NOTICE file
-* distributed with this work for additional information
-* regarding copyright ownership.  The ASF licenses this file
-* to you under the Apache License, Version 2.0 (the
-* "License"); you may not use this file except in compliance
-* with the License.  You may obtain a copy of the License at
-*
-*   http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an
-* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-* KIND, either express or implied.  See the License for the
-* specific language governing permissions and limitations
-* under the License.
-*
-*************************************************************/
-
-package singa;
-
-/*
- * To start a training job, all we need is a JobProto object.
- * It should contain following fields
- *  - Job Name (name)
- *      the name to identify the job
- *  - NeuralNet (neuralnet)
- *      the neural network structure contains a set of layers
- *  - Train One Batch (alg)
- *      the training algorithm
- *  - Updater (updater)
- *      the protocol for updating parameters at server side
- *  - Cluster Topology (cluster)
- *      the distributed topology of workers/servers
- *  - Training Steps (train_steps)
- *      the number of training iteration
- *  All other fields/functions are optional, e.g., test, checkpoint
- */
-
-message JobProto {
-  // job name, e.g., "cifar10-dcnn", "mnist-mlp"
-  optional string name = 1;
-  // neural net consits of a set of connected layers
-  optional NetProto neuralnet = 3;
-  // algorithm for computing gradients over one mini-batch
-  optional AlgProto train_one_batch = 5;
-  // configuration of SGD updater, including learning rate, etc.
-  optional UpdaterProto updater = 7;
-  // cluster toplogy conf
-  optional ClusterProto cluster = 9;
-  // total num of steps for training
-  optional int32 train_steps = 16;
-  // frequency of displaying training info
-  optional int32 disp_freq = 17 [default = 0];
-  // GPU device IDs for use, if fewer than workers per procs, some workers run
-  // on GPU and the rest run on CPU.
-  repeated int32 gpu = 18;
-
-  // frequency of test, e.g., do test every 100 training steps
-  optional int32 test_freq = 20 [default = 0];
-  // total num of steps for testing all test data;
-  // TODO(wangwei): set -1 for test forever
-  optional int32 test_steps =  21 [default = 0];
-  // frequency of validation, e.g., do validation every 100 training steps
-  optional int32 validate_freq = 25 [default = 0];
-  // total num of steps for validating all validation data
-  optional int32 validate_steps = 26 [default = 0];
-  // frequency of checkpoint
-  optional int32 checkpoint_freq = 30 [default = 0];
-
-  // for loading checkpoint files to init parameters
-  repeated string checkpoint_path = 60;
-  // send parameters to servers after training for this num of steps
-  optional int32 warmup_steps = 61 [default = 0];
-  // display debug info
-  optional bool debug = 62 [default = false];
-  // reset the version of params loaded from checkpoint file to step
-  optional bool reset_param_version = 63 [default = true];
-  // set num of threads used by openblas
-  optional int32 num_openblas_threads = 64 [default = 1];
-
-  // start checkpoint after this num steps
-  optional int32 checkpoint_after = 80 [default = 0];
-  // start display after this num steps
-  optional int32 disp_after =  81[default = 0];
-  // start test after this num steps
-  optional int32 test_after = 82 [default = 0];
-  // start validation after this num steps
-  optional int32 validate_after = 83 [default = 0];
-
-  // for internal use
-  // users typically do not touch following fields
-
-  // resume flag
-  optional bool resume = 90 [default = false];
-  // last snapshot step
-  optional int32 step = 91 [default = 0];
-  // job id allocated by zookeeper
-  optional int32 id = 92 [default = -1];
-
-  extensions 101 to 200;
-}
-
-// Protos used by JobProto
-// -----------------------
-
-message AlgProto {
-  // algorithms calculating gradients for one mini-batch/iteration
-  optional AlgType alg = 1 [default = kUserAlg];
-  // user defined algorithm
-  optional string user_alg = 2;
-  // for setting CD fields
-  optional CDProto cd_conf = 10;
-
-  extensions 101 to 200;
-}
-message NetProto {
-  repeated LayerProto layer = 1;
-  // partitioning type for parallelism
-  optional int32 partition_dim = 20 [default = 0];
-  // Each layer corresponds to a group of unrolled layers, used in RNN models
-  repeated LayerGroupProto layer_group = 21;
-  optional int32 unroll_len = 22 [default = 1];
-}
-
-message LayerGroupProto {
-  // name of the layers belong to the same group
-  repeated string layer = 1;
-}
-
-message UpdaterProto {
-  // built-in updater type
-  optional UpdaterType type = 1 [default = kUserUpdater];
-  // user-defned updater type
-  optional string user_type = 2;
-
-  // configuration for RMSProp algorithm
-  optional RMSPropProto rmsprop_conf = 3;
-  // congiguration for AdaDelta algorithm
-  optional AdaDeltaProto adadelta_conf = 4;
-  // congiguration for Adam algorithm
-  optional AdamProto adam_conf = 5;
-  // congiguration for AdamMax algorithm
-  optional AdamMaxProto adammax_conf = 6;
-
-  // learning rate generator
-  optional LRGenProto learning_rate = 11;
-  optional float momentum = 31 [default = 0];
-  optional float weight_decay = 32 [default = 0];
-
-  // used to avoid divide by 0, i.e. x/(y+delta)
-  optional float delta = 35 [default = 0.00000001];
-
-  optional float clip_low = 36 [default = 0];
-  optional float clip_high = 37 [default = 0];
-
-  extensions 101 to 200;
-}
-
-message ClusterProto {
-  optional int32 nworker_groups = 1 [default = 1];
-  optional int32 nserver_groups = 2 [default = 1];
-  optional int32 nworkers_per_group = 3 [default = 1];
-  optional int32 nservers_per_group = 4 [default = 1];
-  optional int32 nworkers_per_procs = 5 [default = 1];
-  optional int32 nservers_per_procs = 6 [default = 1];
-  // local workspace for checkpoint files and vis files
-  //required string workspace = 10;
-  optional string workspace = 10;
-
-  // servers and workers in different processes?
-  optional bool server_worker_separate = 20 [default = false];
-
-  // sync frequency between server groups
-  optional int32 sync_freq = 21 [default = 1];
-
-  // port number used by ZeroMQ
-  optional int32 start_port = 60 [default = 6723];
-  // share memory space between worker groups in one procs
-  optional bool share_memory = 62 [default = true];
-
-  // poll time in milliseconds
-  optional int32 poll_time = 81 [default = 100];
-}
-
-message CDProto {
-  //number of steps for gibbs sampling
-  optional int32 cd_k = 1 [default = 1];
-}
-
-message LayerProto {
-  // the layer name used for identification
-  required string name = 1;
-  // source layer names
-  repeated string srclayers = 3;
-  // parameters, e.g., weight matrix or bias vector
-  repeated ParamProto param = 12;
-  // all layers are included in the net structure for training phase by default.
-  // some layers like data layer for loading test data are not used by training
-  // phase should be removed by setting the exclude field.
-  repeated Phase exclude = 15;
-  // exclude field is deprecated, please use include field instead!!!
-  // some layers like data layer for loading test data are not used by training
-  // in this case, only test phase should be included by setting the include field.
-  repeated Phase include = 14;
-  // type of built-in layer
-  optional LayerType type = 20 [default = kUserLayer];
-  // type of user layer
-  optional string user_type = 21;
-  // share data and grad blob with the single src layer, e.g., relu layer can
-  // share blobs from conv layer. It is useful for saving memory space.
-  optional bool share_src_blobs = 22 [default = false];
-  // for unrolling layers in RNN model
-  optional int32 unroll_len = 23 [default = 1];
-  optional int32 unroll_index = 24 [default = 0];
-  repeated UnrollConnType unroll_conn_type = 25;
-  repeated int32 shift = 26;
-
-  // overrides the partition dimension for neural net
-  optional int32 partition_dim = 60 [default = -1];
-  // names of parameters shared from other layers
-  optional int32 partition_id = 90 [default = 0];
-  // num of partitions for this layer
-  optional int32 num_partitions = 91 [default = 1];
-
-  // layer specific configuration
-  // configuration for input layers, id range [100, 200)
-  optional StoreProto store_conf = 100;
-  optional DataProto lmdbdata_conf = 190;
-  optional MnistProto mnist_conf = 192;
-  optional RGBImageProto rgbimage_conf = 193;
-  optional DataProto sharddata_conf = 194;
-  optional CharRNNProto char_rnn_conf = 195;
-  optional OnehotProto onehot_conf = 196;
-
-  // configuration for neuron layers id range [200, 300)
-  optional ActivationProto activation_conf = 200;
-  optional ConvolutionProto convolution_conf = 201;
-  optional DropoutProto dropout_conf = 203;
-  optional DummyProto dummy_conf = 204;
-  optional InnerProductProto innerproduct_conf = 205;
-  optional LRNProto lrn_conf = 206;
-  optional PoolingProto pooling_conf = 207;
-  optional RBMProto rbm_conf = 209;
-  optional ReLUProto relu_conf = 211;
-  optional SoftmaxProto softmax_conf = 214;
-  optional GRUProto gru_conf = 215;
-  optional EmbeddingProto embedding_conf = 216;
-  optional BMProto bm_conf = 217;
-
-  // configuration for loss layers, id range [300, 400)
-  optional SoftmaxLossProto softmaxloss_conf = 301;
-
-  // configuration for output layers id range [400, 500)
-  optional ArgSortProto argsort_conf = 401;
-
-  // configuration for connection layers, id range [501, )
-  optional ConcateProto concate_conf = 502;
-  optional SliceProto slice_conf = 503;
-  optional SplitProto split_conf = 504;
-  optional RNNDummyProto rnn_dummy_conf = 505;
-
-  extensions 1001 to 1100;
-}
-
-// weight matrix should be defined before bias vector
-// TODO(wangwei): separate conf for diff init method
-message ParamProto {
-  // used for identifying the same params from diff models and display deug info
-  optional string name =  1 [default = ""];
-  // for built-in Param
-  optional ParamType type = 3 [default = kParam];
-  // for user-defined Param
-  optional string user_type = 4;
-
-  optional ParamGenProto init =5;
-    // multiplied on the global learning rate.
-  optional float lr_scale = 15 [default = 1];
-  // multiplied on the global weight decay.
-  optional float wd_scale = 16 [default = 1];
-
-  // name of the owner param from which this param shares the values
-  optional string share_from = 60;
-
-  // used interally
-  optional int32 id = 90;
-  // used internally
-  optional int32 owner = 91 [default = -1];
-  // partition dimension, -1 for no partition
-  optional int32 partition_dim = 92;
-  // usually, the program will infer the param shape
-  repeated int32 shape = 93;
-
-  extensions 101 to 200;
-}
-
-// ---------------------------
-// protos for different layers
-// ---------------------------
-// learning rate generator proto
-message LRGenProto {
-  // user-defined change method
-  optional ChangeMethod type = 1 [default = kUserChange];
-  optional string user_type = 2;
-
-  optional float base_lr = 3 [default = 0.01];
-
-  optional FixedStepProto fixedstep_conf = 40;
-  optional StepProto step_conf = 41;
-  optional LinearProto linear_conf = 42;
-  optional ExponentialProto exponential_conf = 43;
-  optional InverseProto inverse_conf = 44;
-  optional InverseTProto inverset_conf = 45;
-
-  extensions 101 to 200;
-}
-
-message ParamGenProto {
-  optional InitMethod type = 1 [default = kUserInit];
-  optional string user_type =2;
-  // constant init
-  optional float value = 3 [default = 1];
-  // for gaussian sampling
-  optional float mean = 4 [default = 0];
-  optional float std = 5 [default = 1];
-  // for uniform sampling
-  optional float low = 8 [default = -1];
-  optional float high = 9 [default = 1];
-
-  extensions 101 to 200;
-}
-
-enum ActivationType {
-  RELU = 1;
-  SIGMOID = 2;
-  TANH = 3;
-  STANH = 4;
-}
-
-message ActivationProto {
-  optional ActivationType type = 1 [default = RELU];
-}
-
-message OnehotProto {
-  optional int32 vocab_size = 1 [default = 0];
-}
-
-message RGBImageProto {
-  // scale factor for each pixel
-  optional float scale = 1 [default = 1.0];
-  // size after cropping
-  optional int32 cropsize = 2 [default = 0];
-  // mirror the image
-  optional bool mirror = 3 [default = false];
-  // meanfile path
-  optional string meanfile = 4 [default = ""];
-}
-
-message SplitProto {
-  optional int32 num_splits = 1 [default = 1];
-}
-
-message StoreProto {
-  optional string backend = 1;
-  optional string path = 2;
-  optional string separator = 3 [default = ","];
-  optional string mean_file = 4;
-  optional string std_file = 5;
-  optional float mean_value = 6;
-  optional float std_value = 7;
-  repeated int32 batchsize = 8;
-  repeated int32 shape = 9;
-  optional bool encoded = 10 [default = false];
-  optional int32 random_skip = 11 [default = 0];
-  optional bool has_label = 12 [default = true];
-  optional bool prefetching = 13 [default = false];
-}
-
-message CharRNNProto {
-  optional string path = 1;
-  optional string vocab_path = 2;
-  // num of chars to read per instance,  should = NetProto::unroll_len
-  optional int32 unroll_len = 3 [default = 50];
-  optional int32 batchsize = 4 [default = 1];
-}
-
-message EmbeddingProto {
-  optional int32 vocab_size = 1 [default = 0];
-  optional int32 feature_dim = 2 [default = 100];
-
-}
-
-message BMProto {
-}
-
-message SoftmaxLossProto {
-  // computing accuracy against topk results
-  optional int32 topk = 1 [default = 1];
-  // loss scale factor
-  optional float scale = 30 [default = 1];
-}
-
-message ArgSortProto {
-  // keep labels with topk scores
-  optional int32 topk = 1 [default = 1];
-}
-
-message ConcateProto {
-  optional int32 concate_dim = 1 [default = 0];
-  optional int32 num_concates = 2 [default = 1];
-}
-
-message ConvolutionProto {
-  // The number of outputs for the layer
-  optional int32 num_filters = 1;
-  // the kernel height/width
-  optional int32 kernel = 2 [default = 3];
-  // The padding height/width
-  optional int32 pad = 30 [default = 0];
-  // the stride
-  optional int32 stride = 31 [default = 1];
-
-  optional int32 kernel_x = 41 [default = 3];
-  optional int32 kernel_y = 42 [default = 3];
-
-  optional int32 pad_x = 44 [default = 0];
-  optional int32 pad_y = 45 [default = 0];
-
-  optional int32 stride_x = 47 [default = 1];
-  optional int32 stride_y = 48 [default = 1];
-
-  // cudnn workspace size in MB
-  optional int32 workspace_byte_limit = 50 [default = 512];
-}
-
-message DataProto {
-  // path to the data file/folder, absolute or relative to the workspace
-  required string path = 2;
-  // batch size.
-  required int32 batchsize = 4;
-  // skip [0,random_skip] records
-  optional int32 random_skip = 30 [default = 0];
-}
-
-message MnistProto {
-  // normalization x/norm_a
-  required float norm_a = 1 [default = 1];
-  // normalization x-norm_b
-  required float norm_b = 2 [default = 0];
-
-  // elastic distortion
-  optional int32 kernel = 30 [default = 0];
-  optional float sigma = 31 [default = 0];
-  optional float alpha = 32 [default = 0];
-  // rotation or horizontal shearing
-  optional float beta = 33 [default = 0];
-  // scaling
-  optional float gamma = 34 [default = 0];
-  // scale to this size as input for deformation
-  optional int32 resize = 35 [default = 0] ;
-  optional int32 elastic_freq = 36 [default = 0];
-}
-
-message DummyProto {
-  // shape of data and grad blobs
-  optional bool input = 1 [default = false];
-  optional bool output = 2 [default = false];
-  repeated int32 shape = 3;
-}
-
-message RNNDummyProto {
-  optional string dynamic_srclayer = 1;
-  // if shape set, random generate the data blob
-  repeated int32 shape = 2;
-  // if integer is true, generate integer data
-  optional bool integer = 3 [default = false];
-  // range of the random generation
-  optional float low = 4 [default = 0];
-  optional float high = 5 [default = 0];
-}
-
-// Message that stores parameters used by DropoutLayer
-message DropoutProto {
-  // dropout ratio
-  optional float dropout_ratio = 30 [default = 0.5];
-}
-
-message RBMProto {
-  required int32 hdim = 1; // The number of outputs for the layer
-  optional bool bias_term = 2 [default = true]; // whether to have bias terms
-  optional bool gaussian = 3 [default = false]; // use gaussian sampling or not
-}
-
-// Message that stores parameters used by GRULayer
-message GRUProto {
-  // dimension of hidden state for the layer
-  required int32 dim_hidden = 1;
-  // use bias vector or not
-  optional bool bias_term = 2 [default = true];
-}
-
-
-// Message that stores parameters used by InnerProductLayer
-message InnerProductProto {
-  // number of outputs for the layer
-  required int32 num_output = 1;
-  // use bias vector or not
-  optional bool bias_term = 30 [default = true];
-  // transpose or not
-  optional bool transpose = 31 [default = false];
-}
-
-message LRNProto {
-  // local response size
-  required int32 local_size = 1 [default = 5];
-  // scale factor
-  optional float alpha = 31 [default = 1.0];
-  // exponential number
-  optional float beta = 32 [default = 0.75];
-  // offset
-  optional float knorm = 34 [default = 1.0];
-}
-
-message PoolingProto {
-  // The kernel size (square)
-  optional int32 kernel= 1 [default = 3];
-  enum PoolMethod {
-    MAX = 0;
-    AVG = 1;
-  }
-  // The pooling method
-  optional PoolMethod pool = 30 [default = MAX];
-  // The padding size
-  optional uint32 pad = 31 [default = 0];
-  // The stride
-  optional uint32 stride = 32 [default = 2];
-
-  optional int32 kernel_x = 41 [default = 3];
-  optional int32 kernel_y = 42 [default = 3];
-
-  optional int32 pad_x = 44 [default = 0];
-  optional int32 pad_y = 45 [default = 0];
-
-  optional int32 stride_x = 47 [default = 2];
-  optional int32 stride_y = 48 [default = 2];
-}
-
-message ReLUProto {
-  // Ref. Maas, A. L., Hannun, A. Y., & Ng, A. Y. (2013).
-  // Rectifier nonlinearities improve neural network acoustic models.
-  // In ICML Workshop on Deep Learning for Audio, Speech, and Language Processing.
-  optional float negative_slope = 1 [default = 0];
-}
-
-message SliceProto {
-  optional int32 slice_dim = 1 [default = 0];
-  optional int32 num_slices = 2 [default = 1];
-}
-
-message SoftmaxProto {
-  // Can be used to do softmax over each channel of one image by setting it to
-  // be the size of the second dimension (the first dimension is batchsize).
-  optional int32 num_softmax_per_instance = 1 [default = 1];
-}
-
-message RMSPropProto {
- // history=history*rho_+(1-rho_)*(grad*grad_scale);
-  required float rho = 1;
-}
-message AdaDeltaProto {
-  required float rho = 1 [default = 0.9];
-}
-message AdamProto {
-  required float beta1 = 1 [default = 0.9];
-  required float beta2 = 2 [default = 0.999];
-}
-message AdamMaxProto {
-  required float beta1 = 1 [default = 0.9];
-  required float beta2 = 2 [default = 0.999];
-}
-
-message FixedStepProto {
-  repeated int32 step = 28;
-  // lr = step_lr[i] if current step >= step[i]
-  repeated float step_lr = 29;
-}
-
-message StepProto {
-  // lr = base_lr * gamma^(step/change_freq)
-  required float gamma = 35 [default = 1];
-  // lr = base_lr * gamma^(step/change_freq)
-  required int32 change_freq = 40;
-}
-
-message LinearProto {
-  // lr = (1 - step / freq) * base_lr + (step / freq) * final_lr
-  required int32 change_freq= 40;
-  // lr = (1 - step / freq) * base_lr + (step / freq) * final_lr
-  required float final_lr = 39;
-}
-
-message ExponentialProto {
-  // lr = base / 2^(step/change_freq)
-  required int32 change_freq = 40;
-}
-
-message InverseTProto {
-  // lr = base_lr / (1+step/final_lr)
-  required float final_lr = 39;
-}
-message InverseProto {
-  // lr = base_lr*(1+gamma*step)^(-pow)
-  required float gamma = 1 [default = 1];
-  // lr = base_lr*(1+gamma*step)^(-pow)
-  required float pow = 2 [default = 0];
-}
-message UniformProto {
-  optional float low = 1 [default = -1];
-  optional float high = 2 [default = 1];
-}
-message GaussianProto {
-  optional float mean = 1 [default = 0];
-  optional float std = 2 [default = 1];
-}
-
-// --------------
-// All Enum Types
-// --------------
-
-enum AlgType {
-  // Back-propagation algorithm for feed-forward models, e.g., CNN and RNN
-  kBP = 1;
-  // Contrastive Divergence algorithm for RBM, DBM, etc.
-  kCD = 2;
-  // BPTT for training RNN models
-  kBPTT = 3;
-  // For user defined algorithm.
-  kUserAlg = 104;
-}
-
-enum LayerType {
-  /*
-   * Input layers
-   *  - Load records from file, database
-   */
-  kCSVInput = 100;
-  kImagePreprocess = 101;
-  kRecordInput = 103;
-  kLMDBData = 190;  // deprecated
-  kLabel = 191;  // deprecated
-  kMnist = 192;  // deprecated
-  kRGBImage = 193;  // deprecated
-  kShardData = 194;  // deprecated
-  kCharRNN = 195;
-  kRNNLabel = 196;
-  kOneHot = 197;
-
-  /*
-   * Neuron layers
-   *  - Feature transformation
-   */
-  kConvolution = 201;
-  kCConvolution = 202;
-  kDropout = 203;
-  kDummy = 204;
-  kInnerProduct = 205;
-  kLRN = 206;
-  kPooling = 207;
-  kCPooling = 208;
-  kRBMHid = 209;
-  kRBMVis = 210;
-  kReLU = 211;
-  kSTanh = 212;
-  kSigmoid = 213;
-  kSoftmax = 214;
-  kGRU = 215;
-  kEmbedding = 216;
-  kActivation = 217;
-  kBM = 218;
-
-  kCudnnConv = 250;
-  kCudnnPool = 251;
-  kCudnnLRN = 252;
-  kCudnnSoftmax = 253;
-  kCudnnActivation = 254;
-  kCudnnBM = 255;
-
-  /*
-   * Loss layers
-   *  - Compute objective loss
-   */
-  kEuclideanLoss = 300;
-  kSoftmaxLoss = 301;
-  // cudnn v3
-  kCudnnSoftmaxLoss = 350;
-
-  /*
-   * Output layers
-   *  - Write results to file, database
-   */
-  kAccuracy = 400;
-  kArgSort = 401;
-  kCSVOutput = 402;
-  kRecordOutput = 403;
-  kCharRNNOutput = 404;
-
-  /*
-   * Connection layers
-   *  - Connect layers when neural net is partitioned
-   */
-  kBridgeDst = 500;
-  kBridgeSrc = 501;
-  kConcate = 502;
-  kSlice = 503;
-  kSplit = 504;
-  kRNNDummy = 505;
-
-  /*
-   * User defined layer
-   *  - users should configure user_type
-   */
-  kUserLayer = 600;
-}
-
-enum UpdaterType {
-  // noraml SGD with momentum and weight decay
-  kSGD = 1;
-  // adaptive subgradient, http://www.magicbroom.info/Papers/DuchiHaSi10.pdf
-  kAdaGrad = 2;
-  // http://www.cs.toronto.edu/~tijmen/csc321/slides/lecture_slides_lec6.pdf
-  kRMSProp = 3;
-  // Nesterov first optimal gradient method
-  kNesterov = 4;
-  // AdaDelta
-  kAdaDelta = 5;
-  // Adam
-  kAdam = 6;
-  // AdamMax
-  kAdamMax = 7;
-  // For user defined updater
-  kUserUpdater = 105;
-}
-
-enum Phase {
-  kUnknown = 0;
-  kTrain = 1;
-  kVal = 2;
-  kTest= 4;
-  // postivie phase for contrastive divergence algorithm
-  kPositive = 8;
-  // negative phase for contrastive divergence algorithm
-  kNegative = 16;
-  kForward = 32;
-  kBackward = 64;
-  kLoss = 128;
-  kDeploy = 256;
-
-  // used for aggregate parameter gradients when Param is shared
-  kAggGrad = 512;
-}
-
-enum ParamType {
-  // built-in Param
-  kParam = 0;
-  // user-defined Param
-  kUser = 103;
-}
-
-enum ChangeMethod {
-  kFixed = 0;
-  kInverseT = 1;
-  kInverse = 2;
-  kExponential = 3;
-  kLinear = 4;
-  kStep = 5;
-  kFixedStep = 6;
-  // For user defiend change method
-  kUserChange = 100;
-}
-
-enum InitMethod {
-  // fix the values of all parameters  a constant in the value field
-  kConstant = 0;
-  // sample gaussian with std and mean
-  kGaussian = 1;
-  // uniform sampling between low and high
-  kUniform = 2;
-  // from Toronto Convnet, let a=1/sqrt(fan_in), w*=a after generating from
-  // Gaussian distribution
-  kGaussianSqrtFanIn = 4;
-  // from Toronto Convnet, rectified linear activation, let
-  // a=sqrt(3)/sqrt(fan_in), range is [-a, +a]; no need to set value=sqrt(3),
-  // the program will multiply it.
-  kUniformSqrtFanIn = 5;
-  // from Theano MLP tutorial, let a=sqrt(6/(fan_in+fan_out)). for tanh
-  // activation, range is [-a, +a], for sigmoid activation, range is
-  // [-4a, +4a], put the scale factor to value field.
-  // <a href="http://deeplearning.net/tutorial/mlp.html"> Theano MLP</a>
-  kUniformSqrtFanInOut = 6;
-
-  // For user defined init method
-  kUserInit = 101;
-}
-
-enum UnrollConnType {
-  // i-th unrolled layer <- (i - shift)-th src unrolled layer
-  kUnrollOneToOne = 1;
-  // i-th unrolled layer <- all src unrolled layers
-  kUnrollOneToAll = 2;
-  // i-th unrolled layer <- last unrolled src layer
-  kUnrollFirstToLast = 3;
-  // customized connection type defined by src_conn
-  kUnrollCustomized = 4;
-}
diff --git a/src/proto/model.proto b/src/proto/model.proto
new file mode 100644
index 0000000..3df68e2
--- /dev/null
+++ b/src/proto/model.proto
@@ -0,0 +1,956 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package singa;
+
+/// \file layer.proto is adapted from [Caffe](https://github.com/BVLC/caffe/)'s
+/// proto file with commit id c419f8517b1e1b3d7a07fe212fc6c90a70b519ea. We
+/// use caffe's protocol for configuring layer hyper-parameters for easy
+/// transporting Caffe model into SINGA. Specifically, we do the following
+/// changes:
+/// 1. we rename LayerParameter to LayerConf to differentiate model parameters
+/// 2. we rename xxxParameter to xxxConf for fields of LayerParameter
+/// 3. we comment out some fields (using /*...*/) not used in SINGA layer but
+///    reserve their tags.
+/// 4. we add new fields (commented like 'singa field..') to support our own
+///   functionalities.
+/// TODO(wangwei) write a proto converter to automatically load caffe models
+/// using Python (or C++/Java).
+
+// Specifies the shape (dimensions) of a Blob.
+message BlobShape {
+  repeated int64 dim = 1 [packed = true];
+}
+
+message BlobProto {
+  optional BlobShape shape = 7;
+  repeated float data = 5 [packed = true];
+  repeated float diff = 6 [packed = true];
+  repeated double double_data = 8 [packed = true];
+  repeated double double_diff = 9 [packed = true];
+
+  // 4D dimensions -- deprecated.  Use "shape" instead.
+  optional int32 num = 1 [default = 0];
+  optional int32 channels = 2 [default = 0];
+  optional int32 height = 3 [default = 0];
+  optional int32 width = 4 [default = 0];
+}
+
+message FillerConf {
+  // The filler type, case insensitive
+  optional string type = 1 [default = 'constant'];
+  optional float value = 2 [default = 0]; // the value in constant filler
+  optional float min = 3 [default = 0]; // the min value in uniform filler
+  optional float max = 4 [default = 1]; // the max value in uniform filler
+  optional float mean = 5 [default = 0]; // the mean value in Gaussian filler
+  optional float std = 6 [default = 1]; // the std value in Gaussian filler
+  // The expected number of non-zero output weights for a given input in
+  // Gaussian filler -- the default -1 means don't perform sparsification.
+  /* optional int32 sparse = 7 [default = -1]; */
+  // Normalize the filler variance by fan_in, fan_out, or their average.
+  // Applies to 'xavier' and 'msra' fillers.
+  enum VarianceNorm {
+    FAN_IN = 0;
+    FAN_OUT = 1;
+    AVERAGE = 2;
+  }
+  optional VarianceNorm variance_norm = 8 [default = FAN_IN];
+}
+
+/// SINGA message
+message OptimizerConf {
+  // case insensitive
+  optional string type = 1 [default = "sgd"];
+
+  // used by RMSprop and Adadelta
+  optional float rho = 2 [default = 0.95];
+
+  // used by Adam and AdamMax
+  optional float beta_1 = 3 [default = 0.9];
+  optional float beta_2 = 4 [default = 0.999];
+
+  // used by vanilla sgd and nesterov
+  optional float momentum = 5 [default = 0.9];
+
+  // delta is used to avoid dividing zero
+  optional float delta = 6 [default = 1e-8];
+
+  // global regularizer lower priority than ParamSpec regularizer
+  optional RegularizerConf regularizer = 10;
+  // global constraint lower priority than ParamSpec constraint
+  optional ConstraintConf constraint = 11;
+}
+
+message ConstraintConf {
+  // case insensitive to limit the parameter value/gradient scale
+  optional string type = 1 [default = "l2"];
+  // e.g., the threshold for limiting the parameter scale.
+  optional float threshold = 2;
+}
+
+/// SINGA message
+message RegularizerConf {
+  // case insensitive to regularize the parameters, e.g., L2.
+  optional string type = 1 [default = "l2"];
+  // e.g., the weight decay for L2 regularizer
+  optional float coefficient = 2;
+}
+
+// Specifies training parameters (multipliers on global learning constants,
+// and the name and other settings used for weight sharing).
+message ParamSpec {
+  // The names of the parameter blobs -- useful for sharing parameters among
+  // layers, but never required otherwise.  To share a parameter between two
+  // layers, give it a (non-empty) name.
+  optional string name = 1;
+
+  // Whether to require shared weights to have the same shape, or just the same
+  // count -- defaults to STRICT if unspecified.
+  /*
+  optional DimCheckMode share_mode = 2;
+  enum DimCheckMode {
+    // STRICT (default) requires that num, channels, height, width each match.
+    STRICT = 0;
+    // PERMISSIVE requires only the count (num*channels*height*width) to match.
+    PERMISSIVE = 1;
+  }
+  */
+
+  // The multiplier on the global learning rate for this parameter.
+  optional float lr_mult = 3 [default = 1.0];
+
+  // The multiplier on the global weight decay for this parameter.
+  optional float decay_mult = 4 [default = 1.0];
+
+  // SINGA uses this field internally. Users just configure the fillers in
+  // Layer specific conf message as caffe (style).
+  optional FillerConf filler = 20;
+  optional ConstraintConf constraint = 21;
+  optional RegularizerConf regularizer = 22;
+}
+
+enum Phase {
+  kTrain = 4;
+  kEval = 8;
+}
+// NOTE
+// Update the next available ID when you add a new LayerConf field.
+//
+// LayerConf next available layer-specific ID: 139 (last added: tile_param)
+message LayerConf {
+  optional string name = 1; // the layer name
+  optional string type = 2; // the layer type
+  /* repeated string bottom = 3; // the name of each bottom blob */
+  /* repeated string top = 4; // the name of each top blob */
+
+  // The train / test phase for computation.
+  // optional Phase phase = 10;
+
+  // The amount of weight to assign each top blob in the objective.
+  // Each layer assigns a default value, usually of either 0 or 1,
+  // to each top blob.
+  /* repeated float loss_weight = 5; */
+
+  // Specifies training parameters (multipliers on global learning constants,
+  // and the name and other settings used for weight sharing).
+  repeated ParamSpec param = 6;
+
+  // The blobs containing the numeric parameters of the layer.
+  repeated BlobProto blobs = 7;
+
+  // Specifies on which bottoms the backpropagation should be skipped.
+  // The size must be either 0 or equal to the number of bottoms.
+  /* repeated bool propagate_down = 11; */
+
+  // Rules controlling whether and when a layer is included in the network,
+  // based on the current NetState.  You may specify a non-zero number of rules
+  // to include OR exclude, but not both.  If no include or exclude rules are
+  // specified, the layer is always included.  If the current NetState meets
+  // ANY (i.e., one or more) of the specified rules, the layer is
+  // included/excluded.
+  /* repeated NetStateRule include = 8; */
+  /* repeated NetStateRule exclude = 9; */
+
+  // Confs for data pre-processing.
+  /* optional TransformationConf transform_param = 100; */
+
+  // Confs shared by loss layers.
+  /* optional LossConf loss_param = 101; */
+
+  // Layer type-specific parameters.
+  //
+  // Note: certain layers may have more than one computational engine
+  // for their implementation. These layers include an Engine type and
+  // engine parameter for selecting the implementation.
+  // The default for the engine is set by the ENGINE switch at compile-time.
+  //optional AccuracyConf accuracy_conf = 102;
+  optional ArgMaxConf argmax_conf = 103;
+  optional ConcatConf concat_conf = 104;
+  optional ContrastiveLossConf contrastive_loss_conf = 105;
+  optional ConvolutionConf convolution_conf = 106;
+  optional RNNConf rnn_conf = 140;
+  // optional DataConf data_conf = 107;
+  optional DropoutConf dropout_conf = 108;
+  // optional DummyDataConf dummy_data_conf = 109;
+  optional EltwiseConf eltwise_conf = 110;
+  optional EmbedConf embed_conf = 137;
+  optional ExpConf exp_conf = 111;
+  optional FlattenConf flatten_conf = 135;
+  // optional HDF5DataConf hdf5_data_conf = 112;
+  // optional HDF5OutputConf hdf5_output_conf = 113;
+  optional HingeLossConf hinge_loss_conf = 114;
+  // optional ImageDataConf image_data_conf = 115;
+  optional InfogainLossConf infogain_loss_conf = 116;
+  optional InnerProductConf inner_product_conf = 117;
+  optional LogConf log_conf = 134;
+  optional LRNConf lrn_conf = 118;
+  // optional MemoryDataConf memory_data_conf = 119;
+  optional MVNConf mvn_conf = 120;
+  optional PoolingConf pooling_conf = 121;
+  optional PowerConf power_conf = 122;
+  optional PReLUConf prelu_conf = 131;
+  // optional PythonConf python_conf = 130;
+  optional ReductionConf reduction_conf = 136;
+  optional ReLUConf relu_conf = 123;
+  optional ReshapeConf reshape_conf = 133;
+  optional SigmoidConf sigmoid_conf = 124;
+  optional SoftmaxConf softmax_conf = 125;
+  optional SPPConf spp_conf = 132;
+  optional SliceConf slice_conf = 126;
+  optional TanHConf tanh_conf = 127;
+  optional ThresholdConf threshold_conf = 128;
+  optional TileConf tile_conf = 138;
+  //optional WindowDataConf window_data_conf = 129;
+
+  // Used in SINGA
+  optional DenseConf dense_conf = 201;
+  optional MetricConf metric_conf = 200;
+  optional BatchNormConf batchnorm_conf = 202;
+  optional SplitConf split_conf = 203;
+}
+
+// Message that stores hyper-parameters used to apply transformation
+// to the data layer's data
+/*
+message TransformationConf {
+  // For data pre-processing, we can do simple scaling and subtracting the
+  // data mean, if provided. Note that the mean subtraction is always carried
+  // out before scaling.
+  optional float scale = 1 [default = 1];
+  // Specify if we want to randomly mirror data.
+  optional bool mirror = 2 [default = false];
+  // Specify if we would like to randomly crop an image.
+  optional uint32 crop_size = 3 [default = 0];
+  // mean_file and mean_value cannot be specified at the same time
+  optional string mean_file = 4;
+  // if specified can be repeated once (would subtract it from all the channels)
+  // or can be repeated the same number of times as channels
+  // (would subtract them from the corresponding channel)
+  repeated float mean_value = 5;
+  // Force the decoded image to have 3 color channels.
+  optional bool force_color = 6 [default = false];
+  // Force the decoded image to have 1 color channels.
+  optional bool force_gray = 7 [default = false];
+}
+*/
+
+// Message that stores hyper-parameters shared by loss layers
+message LossConf {
+  // If specified, ignore instances with the given label.
+  optional int32 ignore_label = 1;
+  // If true, normalize each batch across all instances (including spatial
+  // dimensions, but not ignored instances); else, divide by batch size only.
+  optional bool normalize = 2 [default = true];
+}
+
+message MetricConf {
+  // When computing accuracy, count as correct by comparing the true label to
+  // the top k scoring classes.  By default, only compare to the top scoring
+  // class (i.e. argmax).
+  optional uint32 top_k = 1 [default = 1];
+
+  // The "label" axis of the prediction blob, whose argmax corresponds to the
+  // predicted label -- may be negative to index from the end (e.g., -1 for the
+  // last axis).  For example, if axis == 1 and the predictions are
+  // (N x C x H x W), the label blob is expected to contain N*H*W ground truth
+  // labels with integer values in {0, 1, ..., C-1}.
+  optional int32 axis = 2 [default = 1];
+
+  // If specified, ignore instances with the given label.
+  optional int32 ignore_label = 3;
+}
+// Messages that store hyper-parameters used by individual layer types follow, in
+// alphabetical order.
+
+
+
+message ArgMaxConf {
+  // If true produce pairs (argmax, maxval)
+  optional bool out_max_val = 1 [default = false];
+  optional uint32 top_k = 2 [default = 1];
+  // The axis along which to maximise -- may be negative to index from the
+  // end (e.g., -1 for the last axis).
+  // By default ArgMaxLayer maximizes over the flattened trailing dimensions
+  // for each index of the first / num dimension.
+  optional int32 axis = 3;
+}
+
+message ConcatConf {
+  // The axis along which to concatenate -- may be negative to index from the
+  // end (e.g., -1 for the last axis).  Other axes must have the
+  // same dimension for all the bottom blobs.
+  // By default, ConcatLayer concatenates blobs along the "channels" axis (1).
+  optional int32 axis = 2 [default = 1];
+
+  // DEPRECATED: alias for "axis" -- does not support negative indexing.
+  optional uint32 concat_dim = 1 [default = 1];
+}
+
+message ContrastiveLossConf {
+  // margin for dissimilar pair
+  optional float margin = 1 [default = 1.0];
+  // The first implementation of this cost did not exactly match the cost of
+  // Hadsell et al 2006 -- using (margin - d^2) instead of (margin - d)^2.
+  // legacy_version = false (the default) uses (margin - d)^2 as proposed in the
+  // Hadsell paper. New models should probably use this version.
+  // legacy_version = true uses (margin - d^2). This is kept to support /
+  // reproduce existing models and results
+  optional bool legacy_version = 2 [default = false];
+}
+
+message ConvolutionConf {
+  optional uint32 num_output = 1; // The number of outputs for the layer
+  optional bool bias_term = 2 [default = true]; // whether to have bias terms
+
+  // Pad, kernel size, and stride are all given as a single value for equal
+  // dimensions in all spatial dimensions, or once per spatial dimension.
+  repeated uint32 pad = 3; // The padding size; defaults to 0
+  repeated uint32 kernel_size = 4; // The kernel size
+  repeated uint32 stride = 6; // The stride; defaults to 1
+
+  // For 2D convolution only, the *_h and *_w versions may also be used to
+  // specify both spatial dimensions.
+  optional uint32 pad_h = 9 [default = 0]; // The padding height (2D only)
+  optional uint32 pad_w = 10 [default = 0]; // The padding width (2D only)
+  optional uint32 kernel_h = 11; // The kernel height (2D only)
+  optional uint32 kernel_w = 12; // The kernel width (2D only)
+  optional uint32 stride_h = 13; // The stride height (2D only)
+  optional uint32 stride_w = 14; // The stride width (2D only)
+
+  // SINGA: not supported.
+  // optional uint32 group = 5 [default = 1]; // The group size for group conv
+
+  optional FillerConf weight_filler = 7; // The filler for the weight
+  optional FillerConf bias_filler = 8; // The filler for the bias
+  enum Engine {
+    DEFAULT = 0;
+    CAFFE = 1;
+    CUDNN = 2;
+  }
+  optional Engine engine = 15 [default = DEFAULT];
+
+  // The axis to interpret as "channels" when performing convolution.
+  // Preceding dimensions are treated as independent inputs;
+  // succeeding dimensions are treated as "spatial".
+  // With (N, C, H, W) inputs, and axis == 1 (the default), we perform
+  // N independent 2D convolutions, sliding C-channel (or (C/g)-channels, for
+  // groups g>1) filters across the spatial axes (H, W) of the input.
+  // With (N, C, D, H, W) inputs, and axis == 1, we perform
+  // N independent 3D convolutions, sliding (C/g)-channels
+  // filters across the spatial axes (D, H, W) of the input.
+  // SINGA: not supported;
+  // optional int32 axis = 16 [default = 1];
+
+  // Whether to force use of the general ND convolution, even if a specific
+  // implementation for blobs of the appropriate number of spatial dimensions
+  // is available. (Currently, there is only a 2D-specific convolution
+  // implementation; for input blobs with num_axes != 2, this option is
+  // ignored and the ND implementation will be used.)
+  // SINGA: not supported;
+  // optional bool force_nd_im2col = 17 [default = false];
+
+
+  // SINGA: added by xiangrui
+  // cudnn workspace size in MB
+  optional int32 workspace_byte_limit = 50 [default = 512];
+  // cudnn algorithm preference
+  // options: "fastest", "limited_workspace", "no_workspace"
+  optional string prefer = 51 [default = "fastest"];
+}
+
+message RNNConf {
+  optional uint32 hidden_size = 1; // The hidden feature size
+  optional uint32 num_stacks = 2; // The number of stacked RNN layers
+  optional float dropout = 3 [default = 0];
+  optional bool remember_state = 4 [default = false];
+  // cudnn inputmode
+  // options: "linear", "skip"
+  optional string input_mode = 7 [default = "linear"];
+  // cudnn direction
+  // options: "unidirectional", "bidirectional"
+  optional string direction = 8 [default = "unidirectional"];
+  // cudnn RNN mode
+  // options: "relu", "tanh", "lstm", "gru"
+  optional string rnn_mode = 9 [default = "relu"];
+}
+
+/*
+message DataConf {
+  enum DB {
+    LEVELDB = 0;
+    LMDB = 1;
+  }
+  // Specify the data source.
+  optional string source = 1;
+  // Specify the batch size.
+  optional uint32 batch_size = 4;
+  // The rand_skip variable is for the data layer to skip a few data points
+  // to avoid all asynchronous sgd clients to start at the same point. The skip
+  // point would be set as rand_skip * rand(0,1). Note that rand_skip should not
+  // be larger than the number of keys in the database.
+  // DEPRECATED. Each solver accesses a different subset of the database.
+  optional uint32 rand_skip = 7 [default = 0];
+  optional DB backend = 8 [default = LEVELDB];
+  // DEPRECATED. See TransformationConf. For data pre-processing, we can do
+  // simple scaling and subtracting the data mean, if provided. Note that the
+  // mean subtraction is always carried out before scaling.
+  optional float scale = 2 [default = 1];
+  optional string mean_file = 3;
+  // DEPRECATED. See TransformationConf. Specify if we would like to randomly
+  // crop an image.
+  optional uint32 crop_size = 5 [default = 0];
+  // DEPRECATED. See TransformationConf. Specify if we want to randomly mirror
+  // data.
+  optional bool mirror = 6 [default = false];
+  // Force the encoded image to have 3 color channels
+  optional bool force_encoded_color = 9 [default = false];
+  // Prefetch queue (Number of batches to prefetch to host memory, increase if
+  // data access bandwidth varies).
+  optional uint32 prefetch = 10 [default = 4];
+}
+*/
+
+message DropoutConf {
+  optional float dropout_ratio = 1 [default = 0.5]; // dropout ratio
+}
+
+// DummyDataLayer fills any number of arbitrarily shaped blobs with random
+// (or constant) data generated by "Fillers" (see "message FillerConf").
+message DummyDataConf {
+  // This layer produces N >= 1 top blobs.  DummyDataConf must specify 1 or N
+  // shape fields, and 0, 1 or N data_fillers.
+  //
+  // If 0 data_fillers are specified, ConstantFiller with a value of 0 is used.
+  // If 1 data_filler is specified, it is applied to all top blobs.  If N are
+  // specified, the ith is applied to the ith top blob.
+  repeated FillerConf data_filler = 1;
+  repeated BlobShape shape = 6;
+
+  // 4D dimensions -- deprecated.  Use "shape" instead.
+  repeated uint32 num = 2;
+  repeated uint32 channels = 3;
+  repeated uint32 height = 4;
+  repeated uint32 width = 5;
+}
+
+message EltwiseConf {
+  enum EltwiseOp {
+    PROD = 0;
+    SUM = 1;
+    MAX = 2;
+  }
+  optional EltwiseOp operation = 1 [default = SUM]; // element-wise operation
+  repeated float coeff = 2; // blob-wise coefficient for SUM operation
+
+  // Whether to use an asymptotically slower (for >2 inputs) but stabler method
+  // of computing the gradient for the PROD operation. (No effect for SUM op.)
+  optional bool stable_prod_grad = 3 [default = true];
+}
+
+// Message that stores hyper-parameters used by EmbedLayer
+message EmbedConf {
+  optional uint32 num_output = 1; // The number of outputs for the layer
+  // The input is given as integers to be interpreted as one-hot
+  // vector indices with dimension num_input.  Hence num_input should be
+  // 1 greater than the maximum possible input value.
+  optional uint32 input_dim = 2;
+
+  optional bool bias_term = 3 [default = true]; // Whether to use a bias term
+  optional FillerConf weight_filler = 4; // The filler for the weight
+  optional FillerConf bias_filler = 5; // The filler for the bias
+
+}
+
+// Message that stores hyper-parameters used by ExpLayer
+message ExpConf {
+  // ExpLayer computes outputs y = base ^ (shift + scale * x), for base > 0.
+  // Or if base is set to the default (-1), base is set to e,
+  // so y = exp(shift + scale * x).
+  optional float base = 1 [default = -1.0];
+  optional float scale = 2 [default = 1.0];
+  optional float shift = 3 [default = 0.0];
+}
+
+/// Message that stores hyper-parameters used by FlattenLayer
+message FlattenConf {
+  // The first axis to flatten: all preceding axes are retained in the output.
+  // May be negative to index from the end (e.g., -1 for the last axis).
+  optional int32 axis = 1 [default = 1];
+
+  // The last axis to flatten: all following axes are retained in the output.
+  // May be negative to index from the end (e.g., the default -1 for the last
+  // axis).
+  optional int32 end_axis = 2 [default = -1];
+}
+
+/*
+// Message that stores hyper-parameters used by HDF5DataLayer
+message HDF5DataConf {
+  // Specify the data source.
+  optional string source = 1;
+  // Specify the batch size.
+  optional uint32 batch_size = 2;
+
+  // Specify whether to shuffle the data.
+  // If shuffle == true, the ordering of the HDF5 files is shuffled,
+  // and the ordering of data within any given HDF5 file is shuffled,
+  // but data between different files are not interleaved; all of a file's
+  // data are output (in a random order) before moving onto another file.
+  optional bool shuffle = 3 [default = false];
+}
+
+message HDF5OutputConf {
+  optional string file_name = 1;
+}
+*/
+
+message HingeLossConf {
+  enum Norm {
+    L1 = 1;
+    L2 = 2;
+  }
+  // Specify the Norm to use L1 or L2
+  optional Norm norm = 1 [default = L1];
+}
+
+/*
+message ImageDataConf {
+  // Specify the data source.
+  optional string source = 1;
+  // Specify the batch size.
+  optional uint32 batch_size = 4 [default = 1];
+  // The rand_skip variable is for the data layer to skip a few data points
+  // to avoid all asynchronous sgd clients to start at the same point. The skip
+  // point would be set as rand_skip * rand(0,1). Note that rand_skip should not
+  // be larger than the number of keys in the database.
+  optional uint32 rand_skip = 7 [default = 0];
+  // Whether or not ImageLayer should shuffle the list of files at every epoch.
+  optional bool shuffle = 8 [default = false];
+  // It will also resize images if new_height or new_width are not zero.
+  optional uint32 new_height = 9 [default = 0];
+  optional uint32 new_width = 10 [default = 0];
+  // Specify if the images are color or gray
+  optional bool is_color = 11 [default = true];
+  // DEPRECATED. See TransformationConf. For data pre-processing, we can do
+  // simple scaling and subtracting the data mean, if provided. Note that the
+  // mean subtraction is always carried out before scaling.
+  optional float scale = 2 [default = 1];
+  optional string mean_file = 3;
+  // DEPRECATED. See TransformationConf. Specify if we would like to randomly
+  // crop an image.
+  optional uint32 crop_size = 5 [default = 0];
+  // DEPRECATED. See TransformationConf. Specify if we want to randomly mirror
+  // data.
+  optional bool mirror = 6 [default = false];
+  optional string root_folder = 12 [default = ""];
+}
+*/
+
+message InfogainLossConf {
+  // Specify the infogain matrix source.
+  optional string source = 1;
+}
+
+message InnerProductConf {
+  optional uint32 num_output = 1; // The number of outputs for the layer
+  optional bool bias_term = 2 [default = true]; // whether to have bias terms
+  optional FillerConf weight_filler = 3; // The filler for the weight
+  optional FillerConf bias_filler = 4; // The filler for the bias
+
+  // The first axis to be lumped into a single inner product computation;
+  // all preceding axes are retained in the output.
+  // May be negative to index from the end (e.g., -1 for the last axis).
+  optional int32 axis = 5 [default = 1];
+}
+
+message DenseConf {
+  optional uint32 num_output = 1; // The number of outputs for the layer
+  optional bool bias_term = 2 [default = true]; // whether to have bias terms
+  optional FillerConf weight_filler = 3; // The filler for the weight
+  optional FillerConf bias_filler = 4; // The filler for the bias
+
+  // The first axis to be lumped into a single inner product computation;
+  // all preceding axes are retained in the output.
+  // May be negative to index from the end (e.g., -1 for the last axis).
+  optional int32 axis = 5 [default = 1];
+
+  optional bool transpose = 21 [default = false]; // whether transpose or not
+}
+
+// Message that stores hyper-parameters used by LogLayer
+message LogConf {
+  // LogLayer computes outputs y = log_base(shift + scale * x), for base > 0.
+  // Or if base is set to the default (-1), base is set to e,
+  // so y = ln(shift + scale * x) = log_e(shift + scale * x)
+  optional float base = 1 [default = -1.0];
+  optional float scale = 2 [default = 1.0];
+  optional float shift = 3 [default = 0.0];
+}
+
+// Message that stores hyper-parameters used by LRNLayer
+message LRNConf {
+  optional uint32 local_size = 1 [default = 5];
+  optional float alpha = 2 [default = 1.];
+  optional float beta = 3 [default = 0.75];
+  enum NormRegion {
+    ACROSS_CHANNELS = 0;
+    WITHIN_CHANNEL = 1;
+  }
+  optional NormRegion norm_region = 4 [default = ACROSS_CHANNELS];
+  optional float k = 5 [default = 1.];
+}
+
+message MemoryDataConf {
+  optional uint32 batch_size = 1;
+  optional uint32 channels = 2;
+  optional uint32 height = 3;
+  optional uint32 width = 4;
+}
+
+message MVNConf {
+  // This parameter can be set to false to normalize mean only
+  optional bool normalize_variance = 1 [default = true];
+
+  // This parameter can be set to true to perform DNN-like MVN
+  optional bool across_channels = 2 [default = false];
+
+  // Epsilon for not dividing by zero while normalizing variance
+  optional float eps = 3 [default = 1e-9];
+}
+
+message PoolingConf {
+  enum PoolMethod {
+    MAX = 0;
+    AVE = 1;
+    STOCHASTIC = 2;
+  }
+  optional PoolMethod pool = 1 [default = MAX]; // The pooling method
+  // Pad, kernel size, and stride are all given as a single value for equal
+  // dimensions in height and width or as Y, X pairs.
+  optional uint32 pad = 4 [default = 0]; // The padding size (equal in Y, X)
+  optional uint32 pad_h = 9 [default = 0]; // The padding height
+  optional uint32 pad_w = 10 [default = 0]; // The padding width
+  optional uint32 kernel_size = 2; // The kernel size (square)
+  optional uint32 kernel_h = 5; // The kernel height
+  optional uint32 kernel_w = 6; // The kernel width
+  optional uint32 stride = 3 [default = 1]; // The stride (equal in Y, X)
+  optional uint32 stride_h = 7; // The stride height
+  optional uint32 stride_w = 8; // The stride width
+  /*
+  enum Engine {
+    DEFAULT = 0;
+    CAFFE = 1;
+    CUDNN = 2;
+  }
+  optional Engine engine = 11 [default = DEFAULT];
+  */
+  // If global_pooling then it will pool over the size of the bottom by doing
+  // kernel_h = bottom->height and kernel_w = bottom->width
+  optional bool global_pooling = 12 [default = false];
+  // whether to propagate nan
+  optional bool nan_prop = 53 [default = false];
+}
+
+message PowerConf {
+  // PowerLayer computes outputs y = (shift + scale * x) ^ power.
+  optional float power = 1 [default = 1.0];
+  optional float scale = 2 [default = 1.0];
+  optional float shift = 3 [default = 0.0];
+}
+/*
+message PythonConf {
+  optional string module = 1;
+  optional string layer = 2;
+  // This value is set to the attribute `param_str` of the `PythonLayer` object
+  // in Python before calling the `setup()` method. This could be a number,
+  // string, dictionary in Python dict format, JSON, etc. You may parse this
+  // string in `setup` method and use it in `forward` and `backward`.
+  optional string param_str = 3 [default = ''];
+  // Whether this PythonLayer is shared among worker solvers during data parallelism.
+  // If true, each worker solver sequentially run forward from this layer.
+  // This value should be set true if you are using it as a data layer.
+  optional bool share_in_parallel = 4 [default = false];
+}
+*/
+
+// Message that stores hyper-parameters used by ReductionLayer
+message ReductionConf {
+  enum ReductionOp {
+    SUM = 1;
+    ASUM = 2;
+    SUMSQ = 3;
+    MEAN = 4;
+  }
+
+  optional ReductionOp operation = 1 [default = SUM]; // reduction operation
+
+  // The first axis to reduce to a scalar -- may be negative to index from the
+  // end (e.g., -1 for the last axis).
+  // (Currently, only reduction along ALL "tail" axes is supported; reduction
+  // of axis M through N, where N < num_axes - 1, is unsupported.)
+  // Suppose we have an n-axis bottom Blob with shape:
+  //     (d0, d1, d2, ..., d(m-1), dm, d(m+1), ..., d(n-1)).
+  // If axis == m, the output Blob will have shape
+  //     (d0, d1, d2, ..., d(m-1)),
+  // and the ReductionOp operation is performed (d0 * d1 * d2 * ... * d(m-1))
+  // times, each including (dm * d(m+1) * ... * d(n-1)) individual data.
+  // If axis == 0 (the default), the output Blob always has the empty shape
+  // (count 1), performing reduction across the entire input --
+  // often useful for creating new loss functions.
+  optional int32 axis = 2 [default = 0];
+
+  optional float coeff = 3 [default = 1.0]; // coefficient for output
+}
+
+// Message that stores hyper-parameters used by ReLULayer
+message ReLUConf {
+  // Allow non-zero slope for negative inputs to speed up optimization
+  // Described in:
+  // Maas, A. L., Hannun, A. Y., & Ng, A. Y. (2013). Rectifier nonlinearities
+  // improve neural network acoustic models. In ICML Workshop on Deep Learning
+  // for Audio, Speech, and Language Processing.
+  optional float negative_slope = 1 [default = 0];
+  /*
+  enum Engine {
+    DEFAULT = 0;
+    CAFFE = 1;
+    CUDNN = 2;
+  }
+  optional Engine engine = 2 [default = DEFAULT];
+  */
+}
+
+// Message that stores hyper-parameters used by ReshapeLayer
+message ReshapeConf {
+  // Specify the output dimensions. If some of the dimensions are set to 0,
+  // the corresponding dimension from the bottom layer is used (unchanged).
+  // Exactly one dimension may be set to -1, in which case its value is
+  // inferred from the count of the bottom blob and the remaining dimensions.
+  // For example, suppose we want to reshape a 2D blob "input" with shape 2 x 8:
+  //
+  //   layer {
+  //     type: "Reshape" bottom: "input" top: "output"
+  //     reshape_param { ... }
+  //   }
+  //
+  // If "input" is 2D with shape 2 x 8, then the following reshape_param
+  // specifications are all equivalent, producing a 3D blob "output" with shape
+  // 2 x 2 x 4:
+  //
+  //   reshape_param { shape { dim:  2  dim: 2  dim:  4 } }
+  //   reshape_param { shape { dim:  0  dim: 2  dim:  4 } }
+  //   reshape_param { shape { dim:  0  dim: 2  dim: -1 } }
+  //   reshape_param { shape { dim: -1  dim: 0  dim:  2 } }
+  //
+  optional BlobShape shape = 1;
+
+  // axis and num_axes control the portion of the bottom blob's shape that are
+  // replaced by (included in) the reshape. By default (axis == 0 and
+  // num_axes == -1), the entire bottom blob shape is included in the reshape,
+  // and hence the shape field must specify the entire output shape.
+  //
+  // axis may be non-zero to retain some portion of the beginning of the input
+  // shape (and may be negative to index from the end; e.g., -1 to begin the
+  // reshape after the last axis, including nothing in the reshape,
+  // -2 to include only the last axis, etc.).
+  //
+  // For example, suppose "input" is a 2D blob with shape 2 x 8.
+  // Then the following ReshapeLayer specifications are all equivalent,
+  // producing a blob "output" with shape 2 x 2 x 4:
+  //
+  //   reshape_param { shape { dim: 2  dim: 2  dim: 4 } }
+  //   reshape_param { shape { dim: 2  dim: 4 } axis:  1 }
+  //   reshape_param { shape { dim: 2  dim: 4 } axis: -3 }
+  //
+  // num_axes specifies the extent of the reshape.
+  // If num_axes >= 0 (and axis >= 0), the reshape will be performed only on
+  // input axes in the range [axis, axis+num_axes].
+  // num_axes may also be -1, the default, to include all remaining axes
+  // (starting from axis).
+  //
+  // For example, suppose "input" is a 2D blob with shape 2 x 8.
+  // Then the following ReshapeLayer specifications are equivalent,
+  // producing a blob "output" with shape 1 x 2 x 8.
+  //
+  //   reshape_param { shape { dim:  1  dim: 2  dim:  8 } }
+  //   reshape_param { shape { dim:  1  dim: 2  }  num_axes: 1 }
+  //   reshape_param { shape { dim:  1  }  num_axes: 0 }
+  //
+  // On the other hand, these would produce output blob shape 2 x 1 x 8:
+  //
+  //   reshape_param { shape { dim: 2  dim: 1  dim: 8  }  }
+  //   reshape_param { shape { dim: 1 }  axis: 1  num_axes: 0 }
+  //
+  optional int32 axis = 2 [default = 0];
+  optional int32 num_axes = 3 [default = -1];
+}
+
+// Message that stores hyper-parameters used by SigmoidLayer
+message SigmoidConf {
+  enum Engine {
+    DEFAULT = 0;
+    CAFFE = 1;
+    CUDNN = 2;
+  }
+  optional Engine engine = 1 [default = DEFAULT];
+}
+
+// Message that stores hyper-parameters used by SliceLayer
+message SliceConf {
+  // The axis along which to slice -- may be negative to index from the end
+  // (e.g., -1 for the last axis).
+  // By default, SliceLayer slices blobs along the "channels" axis (1).
+  optional int32 axis = 3 [default = 1];
+  repeated uint32 slice_point = 2;
+
+  // DEPRECATED: alias for "axis" -- does not support negative indexing.
+  optional uint32 slice_dim = 1 [default = 1];
+}
+
+// Message that stores hyper-parameters used by SoftmaxLayer, SoftmaxWithLossLayer
+message SoftmaxConf {
+  enum Engine {
+    DEFAULT = 0;
+    CAFFE = 1;
+    CUDNN = 2;
+  }
+  optional Engine engine = 1 [default = DEFAULT];
+
+  // The axis along which to perform the softmax -- may be negative to index
+  // from the end (e.g., -1 for the last axis).
+  // Any other axes will be evaluated as independent softmaxes.
+  // optional int32 axis = 2 [default = 1];
+
+  /// The cudnn algorithm preferences
+  /// Options are: accurate, fast and log
+  optional string algorithm = 50 [default = "accurate"];
+}
+
+// Message that stores hyper-parameters used by TanHLayer
+message TanHConf {
+  enum Engine {
+    DEFAULT = 0;
+    CAFFE = 1;
+    CUDNN = 2;
+  }
+  optional Engine engine = 1 [default = DEFAULT];
+}
+
+// Message that stores hyper-parameters used by TileLayer
+message TileConf {
+  // The index of the axis to tile.
+  optional int32 axis = 1 [default = 1];
+
+  // The number of copies (tiles) of the blob to output.
+  optional int32 tiles = 2;
+}
+
+// Message that stores hyper-parameters used by ThresholdLayer
+message ThresholdConf {
+  optional float threshold = 1 [default = 0]; // Strictly positive values
+}
+
+// NOTE: WindowDataConf is inherited from Caffe but not used by SINGA;
+// it is kept commented out only as a reference for importing Caffe models.
+/*
+message WindowDataConf {
+  // Specify the data source.
+  optional string source = 1;
+  // For data pre-processing, we can do simple scaling and subtracting the
+  // data mean, if provided. Note that the mean subtraction is always carried
+  // out before scaling.
+  optional float scale = 2 [default = 1];
+  optional string mean_file = 3;
+  // Specify the batch size.
+  optional uint32 batch_size = 4;
+  // Specify if we would like to randomly crop an image.
+  optional uint32 crop_size = 5 [default = 0];
+  // Specify if we want to randomly mirror data.
+  optional bool mirror = 6 [default = false];
+  // Foreground (object) overlap threshold
+  optional float fg_threshold = 7 [default = 0.5];
+  // Background (non-object) overlap threshold
+  optional float bg_threshold = 8 [default = 0.5];
+  // Fraction of batch that should be foreground objects
+  optional float fg_fraction = 9 [default = 0.25];
+  // Amount of contextual padding to add around a window
+  // (used only by the window_data_layer)
+  optional uint32 context_pad = 10 [default = 0];
+  // Mode for cropping out a detection window
+  // warp: cropped window is warped to a fixed size and aspect ratio
+  // square: the tightest square around the window is cropped
+  optional string crop_mode = 11 [default = "warp"];
+  // cache_images: will load all images in memory for faster access
+  optional bool cache_images = 12 [default = false];
+  // append root_folder to locate images
+  optional string root_folder = 13 [default = ""];
+}
+*/
+
+// Message that stores hyper-parameters used by SPPLayer
+// (spatial pyramid pooling)
+message SPPConf {
+  enum PoolMethod {
+    MAX = 0;
+    AVE = 1;
+    STOCHASTIC = 2;
+  }
+  optional uint32 pyramid_height = 1;
+  optional PoolMethod pool = 2 [default = MAX]; // The pooling method
+  enum Engine {
+    DEFAULT = 0;
+    CAFFE = 1;
+    CUDNN = 2;
+  }
+  // field number 6 kept for wire compatibility with Caffe's SPPParameter
+  optional Engine engine = 6 [default = DEFAULT];
+}
+
+// Message that stores hyper-parameters used by PReLULayer
+message PReLUConf {
+  // Parametric ReLU described in K. He et al, Delving Deep into Rectifiers:
+  // Surpassing Human-Level Performance on ImageNet Classification, 2015.
+
+  // Initial value of a_i. Default is a_i=0.25 for all i.
+  optional FillerConf filler = 1;
+  // Whether or not slope parameters are shared across channels.
+  optional bool channel_shared = 2 [default = false];
+
+  // Tensor layout of the input, e.g. "NCHW".
+  optional string format = 20 [default = "NCHW"];
+}
+
+// Message that stores hyper-parameters used by BatchNormLayer
+message BatchNormConf {
+  // Used in the moving average computation runningMean =
+  // newMean*factor + runningMean*(1-factor).
+  optional double factor = 1 [default = 0.9];
+}
+
+// Message that stores hyper-parameters used by SplitLayer
+message SplitConf {
+  // Indicate the number of outputs
+  optional int32 output_size = 1 [default = 2];
+}
diff --git a/src/python/setup.py.in b/src/python/setup.py.in
new file mode 100644
index 0000000..881cd30
--- /dev/null
+++ b/src/python/setup.py.in
@@ -0,0 +1,98 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+# 
+#     http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# 
+
+# Always prefer setuptools over distutils
+from setuptools import setup
+
+
+setup(
+    name='singa',
+
+    # ${PACKAGE_VERSION} is substituted by CMake (configure_file) when this
+    # setup.py.in template is processed
+    version='${PACKAGE_VERSION}',
+
+    description='A General Deep Learning System',
+
+    url='https://github.com/apache/incubator-singa',
+
+    author='Apache SINGA (incubating)',
+    author_email='dev@singa.incubator.apache.org',
+
+    license='Apache 2',
+
+    classifiers=[
+        #   3 - Alpha
+        #   4 - Beta
+        #   5 - Production/Stable
+        'Development Status :: 3 - Alpha',
+
+        'Intended Audience :: Developers',
+        'Topic :: Deep Learning System ',
+
+        'License :: Apache License',
+
+        # Specify the Python versions you support here. In particular, ensure
+        # that you indicate whether you support Python 2, Python 3 or both.
+        'Programming Language :: Python :: 2',
+        'Programming Language :: Python :: 2.6',
+        'Programming Language :: Python :: 2.7',
+        ],
+
+    keywords='deep learning singa apache',
+
+    packages= ['singa', 'singa.proto'],
+
+    #py_modules=["singa"],
+
+    install_requires=[
+        'numpy>=1.11.0',
+        'protobuf>=2.5.0,<3'
+        ],
+
+    #List additional groups of dependencies here (e.g. development
+    #dependencies). You can install these using the following syntax,
+    #for example:
+    #$ pip install -e .[dev,test]
+    #extras_require={
+    #   'dev': ['check-manifest'],
+    #   'test': ['coverage'],
+    #},
+
+    #If there are data files included in your packages that need to be
+    #installed, specify them here.  If using Python 2.6 or less, then these
+    #have to be included in MANIFEST.in as well.
+
+    # ship the compiled swig extension alongside the python sources
+    package_data={
+        'singa': ['_singa_wrap.so'],
+    },
+
+    #Although 'package_data' is the preferred approach, in some case you may
+    #need to place data files outside of your packages. See:
+    #http://docs.python.org/3.4/distutils/setupscript.html#installing-additional-files # noqa
+    #In this case, 'data_file' will be installed into '<sys.prefix>/my_data'
+    #data_files=[('my_data', ['data/data_file'])],
+
+    #To provide executable scripts, use entry points in preference to the
+    #"scripts" keyword. Entry points provide cross-platform support and allow
+    #pip to create the appropriate form of executable for the target platform.
+
+    # `singa` command line tool -> singa/command.py:main
+    entry_points={
+        'console_scripts': [
+            'singa=singa.command:main',
+        ],
+    },
+)
diff --git a/src/python/singa/__init__.py b/src/python/singa/__init__.py
new file mode 100644
index 0000000..c81c6ef
--- /dev/null
+++ b/src/python/singa/__init__.py
@@ -0,0 +1,19 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+# 
+#     http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# 
+
+
diff --git a/src/python/singa/command.py b/src/python/singa/command.py
new file mode 100644
index 0000000..f14c8c5
--- /dev/null
+++ b/src/python/singa/command.py
@@ -0,0 +1,240 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+# =============================================================================
+
+'''
+This script is the main entrance for user to run singa inside a model workspace
+
+To use this script, users should install these dependencies: flask, pillow and protobuf
+'''
+
+import sys, glob, os, random, shutil, time
+from flask import Flask, request, redirect, url_for
+import numpy as np
+import ConfigParser
+import urllib, traceback
+
+
+from argparse import ArgumentParser
+from argparse import RawDescriptionHelpFormatter
+sys.path.append(os.getcwd())
+
+__all__ = []
+__version__ = 0.1
+__date__ = '2016-07-20'
+__updated__ = '2016-07-20'
+__shortdesc__ = '''
+welcome to singa
+'''
+
+app = Flask(__name__)
+config = ConfigParser.RawConfigParser()
+service = {}
+data_path = "data_"
+parameter_path = "parameter_"
+
+debug = False
+
+class CLIError(Exception):
+    '''Generic exception to raise and log different fatal errors.'''
+    def __init__(self, msg):
+        super(CLIError).__init__(type(self))
+        self.msg = "E: %s" % msg
+    def __str__(self):
+        return self.msg
+    def __unicode__(self):
+        return self.msg
+
+def main(argv=None): # IGNORE:C0111
+    '''Entry point of the `singa` console script.
+
+    Parses the command line and runs SINGA inside the current model
+    workspace in one of three modes: serve (default; starts a Flask HTTP
+    server on the given port), train, or test (not implemented yet).
+
+    Args:
+        argv (list): extra command line arguments; appended to sys.argv
+            when given, otherwise sys.argv is used as-is.
+
+    Returns:
+        0 on success or keyboard interrupt, 2 on error.
+    '''
+
+    # NOTE(review): imported lazily inside main(); presumably so that
+    # importing this module does not require the native singa extension
+    # to be loadable -- confirm
+    from . import device
+
+    if argv is None:
+        argv = sys.argv
+    else:
+        sys.argv.extend(argv)
+
+    program_name = os.path.basename(sys.argv[0])
+    program_version = "v%s" % __version__
+    program_build_date = str(__updated__)
+    program_version_message = '%%(prog)s %s (%s)' % (program_version, program_build_date)
+    program_shortdesc = __shortdesc__
+    program_license = '''%s
+
+  Created by dbsystem group on %s.
+  Copyright 2016 NUS School of Computing. All rights reserved.
+
+  Licensed under the Apache License 2.0
+  http://www.apache.org/licenses/LICENSE-2.0
+
+  Distributed on an "AS IS" basis without warranties
+  or conditions of any kind, either express or implied.
+
+USAGE
+''' % (program_shortdesc, str(__date__))
+
+    global debug
+
+    try:
+        # Setup argument parser
+        parser = ArgumentParser(description=program_license, formatter_class=RawDescriptionHelpFormatter)
+        parser.add_argument("-p", "--port", dest="port", default=5000, help="the port to listen to, default is 5000")
+        parser.add_argument("-param", "--parameter", dest="parameter",  help="the parameter file path to be loaded")
+        parser.add_argument("-D", "--debug", dest="debug", action="store_true", help="whether need to debug")
+        parser.add_argument("-R", "--reload", dest="reload_data", action="store_true", help="whether need to reload data")
+        parser.add_argument("-C", "--cpu", dest="use_cpu", action="store_true", help="Using cpu or not, default is using gpu")
+        parser.add_argument("-m", "--mode", dest="mode", choices=['train','test','serve'], default='serve', help="On Which mode (train,test,serve) to run singa")
+        parser.add_argument('-V', '--version', action='version', version=program_version_message)
+
+        # Process arguments
+        args = parser.parse_args()
+
+        port = args.port
+        parameter_file = args.parameter
+        mode = args.mode
+        need_reload = args.reload_data
+        use_cpu = args.use_cpu
+        debug = args.debug
+
+        #prepare data files: file.cfg in the workspace lists the data and
+        #parameter files to download
+        config.read('file.cfg')
+        file_prepare(need_reload)
+
+
+        # network.py comes from the model workspace; os.getcwd() was added
+        # to sys.path at module import time
+        import network as net
+        model = net.create()
+
+        #load parameter
+        parameter_file=get_parameter(parameter_file)
+
+        if parameter_file:
+            print "load parameter file: %s" % parameter_file
+            model.load(parameter_file)
+
+        if use_cpu:
+            raise CLIError("Currently cpu is not support!")
+        else:
+            print "runing with gpu"
+            d = device.create_cuda_gpu()
+
+        model.to_device(d)
+
+        if mode == "serve":
+            print "runing singa in serve mode, listen to  port: %s " % port
+            global service
+            from serve import Service
+            service =Service(model,d)
+
+            app.debug = debug
+            app.run(host='0.0.0.0', port= port)
+        elif mode == "train":
+            print "runing singa in train mode"
+            global trainer
+            from train import Trainer
+            trainer= Trainer(model,d)
+            if not parameter_file:
+                trainer.initialize()
+            trainer.train()
+        else:
+            # mode == "test" is accepted by the parser but not implemented
+            raise CLIError("Currently only serve mode is surpported!")
+        return 0
+    except KeyboardInterrupt:
+        ### handle keyboard interrupt ###
+        return 0
+    except Exception, e:
+        if debug:
+            traceback.print_exc()
+            raise(e)
+        indent = len(program_name) * " "
+        sys.stderr.write(program_name + ": " + str(e) + "\n")
+        sys.stderr.write(indent + "  for help use --help \n\n")
+        return 2
+
+def file_prepare(reload_data=False):
+    '''
+        download all files and generate data.py
+    '''
+    if not reload_data and os.path.exists("data_.py"):
+        return
+
+    print "download file"
+    #clean data
+    shutil.rmtree("data_.py",ignore_errors=True)
+    shutil.rmtree("data_",ignore_errors=True)
+
+    data_py=open("data_.py",'w')
+    data_py.write("#%s" % "This file is Generated by SINGA, please don't edit\n\n")
+    if config.has_section("data"):
+        file_list = config.items("data")
+        #download files
+        for f in file_list:
+            name,path=download_file(f[0],f[1],data_path)
+            data_py.write("%s=\"%s\"\n" % (name,path))
+
+    data_py.flush()
+    data_py.close()
+
+    if config.has_section("parameter"):
+        parameter_list = config.items("parameter")
+        for p in parameter_list:
+            download_file(p[0],p[1],parameter_path)
+
+def download_file(name,path,dest):
+    '''
+    download one file to dest
+    '''
+    if not os.path.exists(dest):
+        os.makedirs(dest)
+    if (path.startswith('http')):
+        file_name = path.split('/')[-1]
+        target = os.path.join(dest,file_name)
+        urllib.urlretrieve(path,target)
+    return name,target
+
+
+def get_parameter(file_name=None):
+    '''
+    get the paticular file name or get the last parameter file
+    '''
+    if not os.path.exists(parameter_path):
+        os.makedirs(parameter_path)
+        return
+
+    if file_name:
+	return os.path.join(parameter_path,file_name)
+
+    parameter_list = [ os.path.join(parameter_path,f) for f in os.listdir(parameter_path)]
+    if len(parameter_list)==0:
+        return
+    parameter_list.sort()
+
+    return parameter_list[-1]
+
+@app.route("/")
+def index():
+    # simple health-check endpoint for serve mode
+    return "Hello SINGA User!"
+
+@app.route('/predict', methods=['POST'])
+def predict():
+    if request.method == 'POST':
+        try:
+            response=service.serve(request)
+        except Exception as e:
+            return e
+        return response
+    return "error, should be post request"
diff --git a/src/python/singa/device.py b/src/python/singa/device.py
new file mode 100644
index 0000000..2d93823
--- /dev/null
+++ b/src/python/singa/device.py
@@ -0,0 +1,123 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+# =============================================================================
+'''
+This script includes Device class and its subclasses for python users
+to call singa::Device and its methods.
+
+TODO(wangwei) implement py CudaGPU class.
+'''
+
+from . import singa_wrap as singa
+
+
+class Device(object):
+    """ Class and member functions for singa::Device.
+
+    Create Device instances using the CreateXXXDevice.
+    """
+
+    def __init__(self, id, device):
+        """Device constructor given device ID.
+
+        Args:
+            id (int): device ID.
+            device: swig shared_ptr<Device>
+        """
+        self.id = id  # integer device ID
+        self.singa_device = device  # underlying swig-wrapped singa::Device
+
+    def set_rand_seed(self, seed):
+        # seed the random number generator of the underlying C++ device
+        self.singa_device.SetRandSeed(seed)
+
+    def get_host(self):
+        # return the host device of this device (see singa::Device::host)
+        return self.singa_device.host()
+
+    def get_id(self):
+        # return the device ID stored in the C++ device object
+        return self.singa_device.id()
+
+
+def get_num_gpus():
+    '''Return the number of CUDA GPUs reported by singa::Platform.'''
+    return singa.Platform.GetNumGPUs()
+
+
+def get_gpu_ids():
+    '''Return the IDs of the available CUDA GPU cards.'''
+    return singa.Platform.GetGPUIDs()
+
+
+def get_gpu_mem_size(id):
+    '''Return the memory size of the GPU card with the given ID.'''
+    return singa.Platform.GetGPUMemSize(id)
+
+
+def device_query(id, verbose=False):
+    '''Query properties of the given GPU device.
+
+    Args:
+        id (int): GPU card ID.
+        verbose (bool): passed through to singa::Platform::DeviceQuery.
+    '''
+    return singa.Platform.DeviceQuery(id, verbose)
+
+
+def create_cuda_gpus(num):
+    '''Create a list of CudaGPU devices.
+
+    Args:
+        num (int): number of device to create.
+    Returns:
+        a list of swig converted CudaGPU devices.
+    '''
+
+    return singa.Platform.CreateCudaGPUs(num)
+
+
+def create_cuda_gpu():
+    '''Create a single CudaGPU device.
+
+    Returns:
+        a swig converted CudaGPU device.
+    '''
+
+    return singa.Platform.CreateCudaGPUs(1)[0]
+
+
+def create_cuda_gpus_on(device_ids):
+    '''Create a list of CudaGPU devices.
+
+    Args:
+        device_ids (list): a list of GPU card IDs.
+
+    Returns:
+        a list of swig converted CudaGPU devices.
+    '''
+    return singa.Platform.CreateCudaGPUsOn(device_ids)
+
+
+def create_cuda_gpu_on(device_id):
+    '''Create a CudaGPU device on the given device ID.
+
+    Args:
+        device_id (int): GPU card ID.
+
+    Returns:
+        a swig converted CudaGPU device.
+    '''
+    devices = create_cuda_gpus_on([device_id])
+    return devices[0]
+
+
+# the default host device, created once at module import time
+default_device = singa.Platform.GetDefaultDevice()
+
+
+def get_default_device():
+    '''Get the default host device which is a CppCPU device'''
+    return default_device
+
diff --git a/src/python/singa/initializer.py b/src/python/singa/initializer.py
new file mode 100644
index 0000000..fb99663
--- /dev/null
+++ b/src/python/singa/initializer.py
@@ -0,0 +1,122 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+# =============================================================================
+'''Popular initialization methods for parameter values (Tensor objects).
+
+Example usages::
+
+    from singa import tensor
+    from singa import initializer
+
+    x = tensor.Tensor((3, 5))
+    initializer.uniform(x, 3, 5) # use both fan_in and fan_out
+    initializer.uniform(x, 3, 0)  # use only fan_in
+'''
+
+import math
+
+
+def uniform(t, fan_in=0, fan_out=0):
+    '''Initialize the values of the input tensor following a uniform
+    distribution with specific bounds.
+
+    Args:
+        t (Tensor): the parameter tensor, filled in place
+        fan_in(int): for the weight Tensor of a convolution layer,
+            fan_in = nb_channel * kh * kw; for dense layer,
+            fan_in = input_feature_length
+        fan_out(int): for the convolution layer weight Tensor,
+            fan_out = nb_filter * kh * kw; for the weight Tensor of a dense
+            layer, fan_out = output_feature_length
+
+    Ref: [Bengio and Glorot 2010]: Understanding the difficulty of
+    training deep feedforward neural networks.
+
+    '''
+    assert fan_in > 0 or fan_out > 0, \
+        'fan_in and fan_out cannot be 0 at the same time'
+    # when only one of fan_in/fan_out is given, average over that one alone
+    avg = 2
+    if fan_in * fan_out == 0:
+        avg = 1
+    x = math.sqrt(3.0 * avg / (fan_in + fan_out))
+    t.uniform(-x, x)
+
+
+def gaussian(t, fan_in=0, fan_out=0):
+    '''Initialize the values of the input tensor following a Gaussian
+    distribution with specific std.
+
+    Args:
+        t (Tensor): the parameter tensor, filled in place
+        fan_in(int): for the weight Tensor of a convolution layer,
+            fan_in = nb_channel * kh * kw; for dense layer,
+            fan_in = input_feature_length
+        fan_out(int): for the convolution layer weight Tensor,
+            fan_out = nb_filter * kh * kw; for the weight Tensor of a dense
+            layer, fan_out = output_feature_length
+
+    Ref Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun: Delving Deep into
+    Rectifiers: Surpassing Human-Level Performance on ImageNet Classification
+    '''
+    assert fan_in > 0 or fan_out > 0, \
+        'fan_in and fan_out cannot be 0 at the same time'
+    # when only one of fan_in/fan_out is given, average over that one alone
+    avg = 2
+    if fan_in * fan_out == 0:
+        avg = 1
+    std = math.sqrt(2.0 * avg / (fan_in + fan_out))
+    t.gaussian(0, std)
+
+
+def xavier(t):
+    '''Initialize the matrix parameter follow a Uniform distribution from
+    [-sqrt(6/(fan_in + fan_out)), sqrt(6/(fan_in + fan_out))].
+
+    Deprecated. Please use uniform()
+
+    Args:
+        t (Tensor): the parameter tensor; fan_in/fan_out are taken from
+            its first two shape dimensions
+    '''
+
+    scale = math.sqrt(6.0 / (t.shape[0] + t.shape[1]))
+    t.uniform(-scale, scale)
+
+
+def glorot(t):
+    '''Initialize the matrix parameter follow a Gaussian distribution with
+    mean = 0 and std = sqrt(2.0 / (nb_row + nb_col))
+
+    Deprecated. Please use gaussian()
+
+    Args:
+        t (Tensor): the parameter tensor
+    '''
+    scale = math.sqrt(2.0 / (t.shape[0] + t.shape[1]))
+    # sample with unit std then rescale in place
+    t.gaussian(0, 1)
+    t *= scale
+
+
+def msra(t):
+    '''Initialize the matrix parameter follow a Gaussian distribution with
+    mean = 0, std = math.sqrt(2.0 / nb_row).
+
+    Deprecated. Please use gaussian()
+
+    Ref [He, Zhang, Ren and Sun 2015]: Specifically accounts for ReLU
+    nonlinearities.
+
+    Args:
+        t (Tensor): the parameter tensor
+    '''
+    t.gaussian(0, math.sqrt(2.0 / t.shape[0]))
diff --git a/src/python/singa/layer.py b/src/python/singa/layer.py
new file mode 100644
index 0000000..f22b3d1
--- /dev/null
+++ b/src/python/singa/layer.py
@@ -0,0 +1,933 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# =============================================================================
+""" Python layers wrap the C++ layers to provide simpler construction APIs.
+
+Example usages::
+
+    from singa import layer
+    from singa import tensor
+    from singa import device
+    from singa.model_pb2 import kTrain
+
+    layer.engine = 'cudnn'  # to use cudnn layers
+    dev = device.create_cuda_gpu()
+
+    # create a convolution layer
+    conv = layer.Conv2D('conv', 32, 3, 1, pad=1, input_sample_shape=(3, 32, 32))
+    conv.to_device(dev)  # move the layer data onto a CudaGPU device
+    x = tensor.Tensor((3, 32, 32), dev)
+    x.uniform(-1, 1)
+    y = conv.forward(kTrain, x)
+
+    dy = tensor.Tensor()
+    dy.reset_like(y)
+    dy.set_value(0.1)
+    # dp is a list of tensors for parameter gradients
+    dx, dp = conv.backward(kTrain, dy)
+"""
+
+from sets import Set
+from . import singa_wrap
+from .proto import model_pb2
+import tensor
+
+
+engine = 'cudnn'
+'''engine is the prefix of layer identifier.
+
+The value could be one of [**'cudnn', 'singacpp', 'singacuda', 'singacl'**], for
+layers implemented using the cudnn library, Cpp, Cuda and OpenCL respectively.
+For example, CudnnConvolution layer is identified by 'cudnn_convolution';
+'singacpp_convolution' is for Convolution layer;
+Some layers' implementations use only Tensor functions, therefore they are
+transparent to the underlying devices. For these layers, they would have
+multiple identifiers, e.g., singacpp_dropout, singacuda_dropout and
+singacl_dropout are all for the Dropout layer. In addition, it has an extra
+identifier 'singa', i.e. 'singa_dropout' also stands for the Dropout layer.
+
+engine is case insensitive. Each python layer would create the correct specific
+layer using the engine attribute.
+'''
+
+
+class Layer(object):
+    '''Base Python layer class.
+
+    Typically, the life cycle of a layer instance includes:
+        1. construct layer without input_sample_shapes, goto 2;
+           construct layer with input_sample_shapes, goto 3;
+        2. call setup to create the parameters and setup other meta fields
+        3. call forward or access layer members
+        4. call backward and get parameters for update
+
+    Args:
+        name (str): layer name
+    '''
+
+    def __init__(self, name, **kwargs):
+        self.layer = None  # layer converted by swig
+        self.name = name  # TODO(wangwei) duplicate with self.conf.name
+        self.conf = model_pb2.LayerConf()
+        self.conf.name = name
+        self.param_specs = []
+        self.has_setup = False
+
+    def param_names(self):
+        '''
+        Returns:
+            a list of strings, one for the name of one parameter Tensor
+        '''
+        names = []
+        for x in self.param_specs:
+            names.append(x['name'])
+        return names
+
+    def setup(self, in_shapes):
+        '''Call the C++ setup function to create params and set some meta data.
+
+        Args:
+            in_shapes: if the layer accepts a single input Tensor, in_shapes is
+                a single tuple specifying the inpute Tensor shape; if the layer
+                accepts multiple input Tensor (e.g., the concatenation layer),
+                in_shapes is a tuple of tuples, each for one input Tensor
+        '''
+        if self.has_setup:
+            return
+        self.layer.Setup(list(in_shapes),
+                         self.conf.SerializeToString())
+        self.has_setup = True
+
+    def get_output_sample_shape(self):
+        '''Called after setup to get the shape of the output sample(s).
+
+        Returns:
+            a tuple for a single output Tensor or a list of tuples if this layer
+            has multiple outputs
+        '''
+        assert self.has_setup, \
+            'Must call setup() before get_output_sample_shape()'
+        return self.layer.GetOutputSampleShape()
+
+    def param_values(self):
+        '''Return param value tensors.
+
+        Parameter tensors are not stored as layer members because cpp Tensor
+        could be moved onto diff devices due to the change of layer device,
+        which would result in inconsistency.
+
+        Returns:
+            a list of tensors, one for each paramter
+        '''
+        if self.layer is None:
+            return []
+        else:
+            return tensor.from_raw_tensors(self.layer.param_values())
+
+    def forward(self, flag, x):
+        '''Forward propagate through this layer.
+
+        Args:
+            flag (int): kTrain or kEval
+            x (Tensor or list<Tensor>): an input tensor if the layer is
+                connected from a single layer; a list of tensors if the layer
+                is connected from multiple layers.
+
+        Return:
+            a tensor if the layer is connected to a single layer; a list of
+            tensors if the layer is connected to multiple layers;
+        '''
+        assert self.has_setup, 'Must call setup() before forward()'
+        if type(x) == list:
+            xs = []
+            for t in x:
+                x.append(t.singa_tensor)
+        else:
+            assert isinstance(x, tensor.Tensor), \
+                'input must be a Tensor or a list of Tensor'
+            xs = x.singa_tensor
+        y = self.layer.Forward(flag, xs)
+        if type(y) == list:
+            return tensor.from_raw_tensors(y)
+        else:
+            return tensor.from_raw_tensor(y)
+
+    def backward(self, flag, dy):
+        '''Backward propagate gradients through this layer.
+
+        Args:
+            flag (int): for future use.
+            dy (Tensor or list<Tensor>): the gradient tensor(s) y w.r.t the
+                objective loss
+        Return:
+            <dx, <dp1, dp2..>>, dx is a (set of) tensor(s) for the gradient of x
+            , dpi is the gradient of the i-th parameter
+        '''
+        if type(dy) == list:
+            dys = []
+            for t in dy:
+                dys.append(t.singa_tensor)
+        else:
+            assert isinstance(dy, tensor.Tensor), \
+                'the input must be a Tensor or a set of Tensor'
+            dys = dy.singa_tensor
+        ret = self.layer.Backward(flag, dys)
+        if type(ret[0]) == list:
+            dxs = tensor.from_raw_tensors(ret[0])
+        else:
+            dxs = tensor.from_raw_tensor(ret[0])
+        return dxs, tensor.from_raw_tensors(ret[1])
+
+    def to_device(self, device):
+        '''Move layer state tensors onto the given device.
+
+        Args:
+            device: swig converted device, created using singa.device
+        '''
+        if self.layer is not None:
+            self.layer.ToDevice(device)
+
+    def as_type(self, dtype):
+        pass
+
+    def __copy__(self):
+        pass
+
+    def __deepcopy__(self):
+        pass
+
+
+class Conv2D(Layer):
+    """Construct a layer for 2D convolution.
+
+    Args:
+        nb_kernels (int): num of the channels (kernels) of the input Tensor
+        kernel: an integer or a pair of integers for kernel height and width
+        stride: an integer or a pair of integers for stride height and width
+        border_mode (string): padding mode, case in-sensitive,
+            'valid' -> padding is 0 for height and width
+            'same' -> padding is half of the kernel (floor), the kernel must be
+            odd number.
+        cudnn_prefer (string): the preferred algorithm for cudnn convolution
+            which could be 'fatest', 'autotune', 'limited_workspace' and
+            'no_workspace'
+        data_format (string): either 'NCHW' or 'NHWC'
+        use_bias (bool): True or False
+        pad: an integer or a pair of integers for padding height and width
+        W_specs (dict): used to specify the weight matrix specs, fields
+            include,
+            'name' for parameter name
+            'lr_mult' for learning rate multiplier
+            'decay_mult' for weight decay multiplier
+            'init' for init method, which could be 'gaussian', 'uniform',
+            'xavier' and ''
+            'std', 'mean', 'high', 'low' for corresponding init methods
+            TODO(wangwei) 'clamp' for gradient constraint, value is scalar
+            'regularizer' for regularization, currently support 'l2'
+        b_specs (dict): hyper-parameters for bias vector, similar as W_specs
+        name (string): layer name.
+        input_sample_shape: 3d tuple for the shape of the input Tensor
+            without the batchsize, e.g., (channel, height, width) or
+            (height, width, channel)
+    """
+    # NOTE(review): cudnn_prefer is accepted but never written into self.conf
+    # in this constructor — confirm the C++ side applies a default algorithm.
+    def __init__(self, name, nb_kernels, kernel=3, stride=1, border_mode='same',
+                 cudnn_prefer='fatest', data_format='NCHW',
+                 use_bias=True, W_specs=None, b_specs=None,
+                 pad=None, input_sample_shape=None):
+        super(Conv2D, self).__init__(name)
+        assert data_format == 'NCHW', 'Not supported data format: %s ' \
+            'only "NCHW" is enabled currently' % (data_format)
+        conf = self.conf.convolution_conf
+        conf.num_output = nb_kernels
+        # fill the kernel/stride/pad conf fields from scalars, pairs or
+        # border_mode
+        conf = _set_kernel_stride_pad(conf, kernel, stride, border_mode, pad)
+        conf.bias_term = use_bias
+        # TODO(wangwei) enable data format for cpp code
+        # conf.data_format = data_format
+        if W_specs is None:
+            W_specs = {'init': 'xavier'}
+        if b_specs is None:
+            b_specs = {'init': 'constant'}
+        if 'name' not in W_specs:
+            W_specs['name'] = name + '_weight'
+        if 'name' not in b_specs:
+            b_specs['name'] = name + '_bias'
+        wspecs = _construct_param_specs_from_dict(W_specs)
+        self.conf.param.extend([wspecs])
+        self.param_specs.append(wspecs)
+        bspecs = _construct_param_specs_from_dict(b_specs)
+        self.conf.param.extend([bspecs])
+        self.param_specs.append(bspecs)
+
+        # convolution is only implemented by the cudnn and cpp engines
+        _check_engine(engine, ['cudnn', 'singacpp'])
+        self.layer = _create_layer(engine, 'Convolution')
+        if input_sample_shape is not None:
+            self.setup(input_sample_shape)
+
+
+class Conv1D(Conv2D):
+    """Construct a layer for 1D convolution.
+
+    Most of the args are the same as those for Conv2D except the kernel,
+    stride, pad, which is a scalar instead of a tuple.
+    input_sample_shape is a tuple with a single value for the input feature
+    length
+    """
+
+    def __init__(self, name, nb_kernels, kernel=3, stride=1,
+                 border_mode='same', cudnn_prefer='fatest',
+                 use_bias=True, W_specs={'init': 'Xavier'},
+                 b_specs={'init': 'Constant', 'value': 0}, pad=None,
+                 input_sample_shape=None):
+        pad = None
+        if pad is not None:
+            pad = (0, pad)
+        if input_sample_shape is not None:
+            input_sample_shape = (1, 1, input_sample_shape[0])
+        super(Conv1D, self).__init__(name, nb_kernels, (1, kernel), (0, stride),
+                                     border_mode, cudnn_prefer,
+                                     use_bias=use_bias, pad=pad,
+                                     W_specs=W_specs, b_specs=b_specs,
+                                     input_sample_shape=input_sample_shape)
+
+    def get_output_sample_shape(self):
+        shape = self.layer.GetOutputSampleShape()
+        assert len(shape) == 3, 'The output sample shape should be 3D.'\
+            'But the length is %d' % len(shape)
+        return (shape[0], shape[2])
+
+
+class Pooling2D(Layer):
+    '''2D pooling layer providing max/avg pooling.
+
+    All args are the same as those for Conv2D, except the following one
+
+    Args:
+        mode: pooling type, model_pb2.PoolingConf.MAX or
+            model_pb2.PoolingConf.AVE
+
+    '''
+    def __init__(self, name, mode, kernel=3, stride=2, border_mode='same',
+                 pad=None, data_format='NCHW', input_sample_shape=None):
+        super(Pooling2D, self).__init__(name)
+        assert data_format == 'NCHW', 'Not supported data format: %s ' \
+            'only "NCHW" is enabled currently' % (data_format)
+        conf = self.conf.pooling_conf
+        conf = _set_kernel_stride_pad(conf, kernel, stride, border_mode, pad)
+        conf.pool = mode
+        _check_engine(engine, ['cudnn', 'singacpp'])
+        self.layer = _create_layer(engine, 'Pooling')
+        if input_sample_shape is not None:
+            self.setup(input_sample_shape)
+
+
+class MaxPooling2D(Pooling2D):
+
+    def __init__(self, name, kernel=3, stride=2, border_mode='same', pad=None,
+                 data_format='NCHW', input_sample_shape=None):
+        super(MaxPooling2D, self).__init__(name, model_pb2.PoolingConf.MAX,
+                                           kernel, stride, border_mode,
+                                           pad, data_format, input_sample_shape)
+
+
+class AvgPooling2D(Pooling2D):
+
+    def __init__(self, name, kernel=3, stride=2, border_mode='same', pad=None,
+                 data_format='NCHW', input_sample_shape=None):
+        super(AvgPooling2D, self).__init__(name, model_pb2.PoolingConf.AVE,
+                                           kernel, stride, border_mode,
+                                           pad, data_format, input_sample_shape)
+
+
+class MaxPooling1D(MaxPooling2D):
+
+    def __init__(self, name, kernel=3, stride=2, border_mode='same', pad=None,
+                 data_format='NCHW', input_sample_shape=None):
+        """Max pooling for 1D feature.
+
+        Args:
+            input_sample_shape (tuple): 1D tuple for input feature length
+        """
+        pad = None
+        if pad is not None:
+            pad = (0, pad)
+        if input_sample_shape is not None:
+            assert len(input_sample_shape) == 1, \
+                'AvgPooling1D expects input sample to be 1D'
+            input_sample_shape = (1, 1, input_sample_shape[0])
+        else:
+            input_sample_shape = None
+        super(MaxPooling1D, self).__init__(name, (1, kernel), (0, stride),
+                                           border_mode, pad,
+                                           data_format, input_sample_shape)
+
+    def get_output_sample_shape(self):
+        shape = self.layer.GetOutputSampleShape()
+        return (shape[2],)
+
+
+class AvgPooling1D(AvgPooling2D):
+
+    def __init__(self, name, kernel=3, stride=2, border_mode='same', pad=None,
+                 data_format='NCHW', input_sample_shape=None):
+        """input_feature_length is a scalar value"""
+        pad2 = None
+        if pad is not None:
+            pad2 = (pad, 0)
+        if input_sample_shape is not None:
+            assert len(input_sample_shape) == 1, \
+                'AvgPooling1D expects input sample to be 1D'
+            input_sample_shape = (1, 1, input_sample_shape[0])
+        else:
+            input_sample_shape = None
+
+        super(AvgPooling1D, self).__init__(name, (kernel, 1), (0, stride),
+                                           border_mode, pad2,
+                                           data_format, input_sample_shape)
+
+    def get_output_sample_shape(self):
+        shape = self.layer.GetOutputSampleShape()
+        return (shape[2],)
+
+
+class BatchNormalization(Layer):
+    """Batch-normalization.
+
+    Args:
+        momentum (float): for running average mean and variance.
+        beta_specs (dict): dictionary includes the fields for the beta
+            param:
+            'name' for parameter name
+            'lr_mult' for learning rate multiplier
+            'decay_mult' for weight decay multiplier
+            'init' for init method, which could be 'gaussian', 'uniform',
+            'xavier' and ''
+            'std', 'mean', 'high', 'low' for corresponding init methods
+            'clamp' for gradient constraint, value is scalar
+            'regularizer' for regularization, currently support 'l2'
+        gamma_specs (dict): similar to beta_specs, but for the gamma param.
+        name (string): layer name
+        input_sample_shape (tuple): with at least one integer
+    """
+    def __init__(self, name, momentum=0.9,
+                 beta_specs=None, gamma_specs=None, input_sample_shape=None):
+        super(BatchNormalization, self).__init__(name)
+        conf = self.conf.batchnorm_conf
+        conf.factor = momentum
+        if beta_specs is None:
+            beta_specs = {'init': 'Xavier'}
+        if gamma_specs is None:
+            gamma_specs = {'init': 'Xavier'}
+        if 'name' not in beta_specs:
+            beta_specs['name'] = name + '_beta'
+        if 'name' not in gamma_specs:
+            gamma_specs['name'] = name + '_gamma'
+        mean_specs = {'init': 'constant', 'value': 0, 'name': name+'_mean'}
+        var_specs = {'init': 'constant', 'value': 1, 'name': name+'_var'}
+        self.conf.param.extend([_construct_param_specs_from_dict(gamma_specs)])
+        self.conf.param.extend([_construct_param_specs_from_dict(beta_specs)])
+        self.conf.param.extend([_construct_param_specs_from_dict(mean_specs)])
+        self.conf.param.extend([_construct_param_specs_from_dict(var_specs)])
+        self.param_specs.append(_construct_param_specs_from_dict(gamma_specs))
+        self.param_specs.append(_construct_param_specs_from_dict(beta_specs))
+        self.param_specs.append(_construct_param_specs_from_dict(mean_specs))
+        self.param_specs.append(_construct_param_specs_from_dict(var_specs))
+        _check_engine(engine, ['cudnn', 'singa', 'singacpp', 'singacuda',
+                               'singacl'])
+        self.layer = _create_layer(engine, 'BatchNorm')
+        if input_sample_shape is not None:
+            self.setup(input_sample_shape)
+
+
+class LRN(Layer):
+    """Local response normalization.
+
+    Args:
+        size (int): # of channels to be crossed
+            normalization.
+        mode (string): 'cross_channel'
+        input_sample_shape (tuple): 3d tuple, (channel, height, width)
+    """
+
+    def __init__(self, name, size=5, alpha=1, beta=0.75, mode='cross_channel',
+                 k=1, input_sample_shape=None):
+        super(LRN, self).__init__(name)
+        conf = self.conf.lrn_conf
+        conf.local_size = size
+        conf.alpha = alpha
+        conf.beta = beta
+        conf.k = k
+        # TODO(wangwei) enable mode = 'within_channel'
+        assert mode == 'cross_channel', 'only support mode="across_channel"'
+        conf.norm_region = model_pb2.LRNConf.ACROSS_CHANNELS
+        _check_engine(engine, ['cudnn', 'singa', 'singacpp', 'singacuda',
+                               'singacl'])
+        self.layer = _create_layer(engine, 'LRN')
+        if input_sample_shape is not None:
+            self.setup(input_sample_shape)
+
+
+class Dense(Layer):
+    """Apply linear/affine transformation, also called inner-product or
+    fully connected layer.
+
+    Args:
+        num_output (int): output feature length.
+        use_bias (bool): add a bias vector or not to the transformed feature
+        W_specs (dict): specs for the weight matrix
+            'name' for parameter name
+            'lr_mult' for learning rate multiplier
+            'decay_mult' for weight decay multiplier
+            'init' for init method, which could be 'gaussian', 'uniform',
+            'xavier' and ''
+            'std', 'mean', 'high', 'low' for corresponding init methods
+            'clamp' for gradient constraint, value is scalar
+            'regularizer' for regularization, currently support 'l2'
+        b_specs (dict): specs for the bias vector, same fields as W_specs.
+        W_transpose (bool): if true, output=x*W.T+b;
+        input_sample_shape (tuple): input feature length
+    """
+    def __init__(self, name, num_output, use_bias=True,
+                 W_specs=None, b_specs=None,
+                 W_transpose=False, input_sample_shape=None):
+        """See the class docstring for the argument description."""
+        super(Dense, self).__init__(name)
+        conf = self.conf.dense_conf
+        conf.num_output = num_output
+        conf.bias_term = use_bias
+        conf.transpose = W_transpose
+        if W_specs is None:
+            W_specs = {'init': 'xavier'}
+        if b_specs is None:
+            b_specs = {'init': 'constant', 'value': 0}
+        if 'name' not in W_specs:
+            W_specs['name'] = name + '_weight'
+        if 'name' not in b_specs:
+            b_specs['name'] = name + '_bias'
+        wspecs = _construct_param_specs_from_dict(W_specs)
+        bspecs = _construct_param_specs_from_dict(b_specs)
+        self.conf.param.extend([wspecs, bspecs])
+        self.param_specs.extend([wspecs, bspecs])
+        # dense layer is transparent to engine.
+        # 'cudnn' has no dense implementation, so fall back to the cuda one
+        if engine == 'cudnn':
+            self.layer = _create_layer('singacuda', 'Dense')
+        else:
+            self.layer = _create_layer(engine, 'Dense')
+        if input_sample_shape is not None:
+            self.setup(input_sample_shape)
+
+
+class Dropout(Layer):
+    """Droput layer.
+
+    Args:
+        p (float): probability for dropping out the element, i.e., set to 0
+        name (string): layer name
+    """
+
+    def __init__(self, name, p=0.5, input_sample_shape=None):
+        super(Dropout, self).__init__(name)
+        conf = self.conf.dropout_conf
+        conf.dropout_ratio = p
+        # 'cudnn' works for v>=5.0
+        #  if engine.lower() == 'cudnn':
+        #      engine = 'cuda'
+        _check_engine(engine, ['cudnn', 'singa', 'singacpp', 'singacuda',
+                               'singacl'])
+        self.layer = _create_layer(engine, 'Dropout')
+        if input_sample_shape is not None:
+            self.setup(input_sample_shape)
+
+
+class Activation(Layer):
+    """Activation layers.
+
+    Args:
+        name (string): layer name
+        mode (string): 'relu', 'sigmoid', or 'tanh'
+        input_sample_shape (tuple): shape of a single sample
+    """
+    def __init__(self, name, mode='relu', input_sample_shape=None):
+        super(Activation, self).__init__(name)
+        _check_engine(engine, ['cudnn', 'singacpp', 'singacuda', 'singacl'])
+        self.conf.type = (engine + '_' + mode).lower()
+        self.layer = _create_layer(engine, mode)
+        if input_sample_shape is not None:
+            self.setup(input_sample_shape)
+
+
+class Softmax(Layer):
+    """Apply softmax.
+
+    Args:
+        axis (int): reshape the input as a matrix with the dimension
+            [0,axis) as the row, the [axis, -1) as the column.
+        input_sample_shape (tuple): shape of a single sample
+    """
+    def __init__(self, name, axis=1, input_sample_shape=None):
+        super(Softmax, self).__init__(name)
+        # conf = self.conf.softmax_conf
+        # conf.axis = axis
+        _check_engine(engine, ['cudnn', 'singa', 'singacpp', 'singacl',
+                               'singacuda'])
+        self.layer = _create_layer(engine, 'Softmax')
+        if input_sample_shape is not None:
+            self.setup(input_sample_shape)
+
+
+class Flatten(Layer):
+    """Reshape the input tensor into a matrix.
+
+    Args:
+        axis (int): reshape the input as a matrix with the dimension
+            [0,axis) as the row, the [axis, -1) as the column.
+        input_sample_shape (tuple): shape for a single sample
+    """
+    def __init__(self, name, axis=1, input_sample_shape=None):
+        super(Flatten, self).__init__(name)
+        conf = self.conf.flatten_conf
+        conf.axis = axis
+        # fltten layer is transparent to engine
+        if engine == 'cudnn':
+            self.layer = _create_layer('singacuda', 'Flatten')
+        else:
+            self.layer = _create_layer(engine, 'Flatten')
+        if input_sample_shape is not None:
+            self.setup(input_sample_shape)
+
+
+class Merge(Layer):
+    '''Sum all input tensors.
+
+    This is a pure Python layer; it has no underlying C++ layer object.
+
+    Args:
+        input_sample_shape: sample shape of the input. The sample shape of all
+            inputs should be the same.
+    '''
+    def __init__(self, name, input_sample_shape=None):
+        self.in_shape = input_sample_shape
+        self.num_input = 1
+        super(Merge, self).__init__(name)
+
+    def setup(self, in_shape):
+        # nothing to create; just record the sample shape
+        self.in_shape = in_shape
+        self.has_setup = True
+
+    def get_output_sample_shape(self):
+        return self.in_shape
+
+    def forward(self, flag, inputs):
+        '''Element-wise sum of all input tensors; flag is unused.'''
+        assert len(inputs) > 1, 'There must be multiple input tensors'
+        self.num_input = len(inputs)
+        output = tensor.Tensor()
+        output.reset_like(inputs[0])
+        output.set_value(0)
+        for x in inputs:
+            output += x
+        return output
+
+    def backward(self, flag, grad):
+        '''Return the input gradient (in a single-element list) and empty
+        param gradients.
+
+        NOTE(review): the commented-out multiplier below suggests the gradient
+        was meant to be replicated num_input times, one per input — confirm
+        against the callers before changing.
+        '''
+        assert isinstance(grad, tensor.Tensor), 'The input must be Tensor'
+        return [grad], []  # * self.num_input
+
+
+class Split(Layer):
+    '''Replicate the input tensor.
+
+    Args:
+        num_output (int): number of output tensors to generate.
+        input_sample_shape: includes a single integer for the input sample
+            feature size.
+    '''
+    def __init__(self, name, num_output, input_sample_shape=None):
+        self.num_output = num_output
+        self.in_shape = input_sample_shape
+        super(Split, self).__init__(name)
+
+    def setup(self, in_shape):
+        self.in_shape = in_shape
+        self.has_setup = True
+
+    def get_output_sample_shape(self):
+        return self.in_shape
+
+    def forward(self, flag, input):
+        assert isinstance(input, tensor.Tensor), 'The input must be Tensor'
+        outputs = [input] * self.num_output
+        return outputs
+
+    def backward(self, flag, grads):
+        assert len(grads) > 1, 'There must be multiple gradients'
+        dx = tensor.Tensor()
+        dx.reset_like(grads[0])
+        dx.set_value(0)
+        for g in grads:
+            dx += g
+        return dx, []
+
+
+class RNN(Layer):
+    '''Recurrent layer with 4 types of units, namely lstm, gru, tanh and relu.
+
+    Args:
+        hidden_size: hidden feature size, the same for all stacks of layers.
+        rnn_mode: decides the rnn unit, which could be one of 'lstm', 'gru',
+            'tanh' and 'relu', refer to cudnn manual for each mode.
+        num_stacks: num of stacks of rnn layers. It is different to the
+            unrolling seqence length.
+        input_mode: 'linear' convert the input feature x by by a linear
+            transformation to get a feature vector of size hidden_size;
+            'skip' does nothing but requires the input feature size equals
+            hidden_size
+        bidirection: True for bidirectional RNN
+        param_specs: config for initializing the RNN parameters.
+        input_sample_shape: includes a single integer for the input sample
+            feature size.
+    '''
+
+    def __init__(self, name, hidden_size, rnn_mode='lstm', dropout=0.0,
+                 num_stacks=1, input_mode='linear', bidirectional=False,
+                 param_specs=None, input_sample_shape=None):
+        super(RNN, self).__init__(name)
+        conf = self.conf.rnn_conf
+        assert hidden_size > 0, 'Hidden feature size must > 0'
+        conf.hidden_size = hidden_size
+        assert rnn_mode in Set(['lstm', 'gru', 'tanh', 'relu']),  \
+            'rnn mode %s is not available' % (rnn_mode)
+        conf.rnn_mode = rnn_mode
+        conf.num_stacks = num_stacks
+        conf.dropout = dropout
+        conf.input_mode = input_mode
+        conf.direction = 'unidirectional'
+        if bidirectional:
+            conf.direction = 'bidirectional'
+        # currently only has rnn layer implemented using cudnn
+        _check_engine(engine, ['cudnn'])
+        if param_specs is None:
+            # default: a single weight blob initialized uniformly in [0, 1)
+            param_specs = {'name': name + '-weight',
+                           'init': 'uniform', 'low': 0, 'high': 1}
+        self.conf.param.extend([_construct_param_specs_from_dict(param_specs)])
+        self.param_specs.append(_construct_param_specs_from_dict(param_specs))
+
+        # constructed directly rather than via _create_layer since only the
+        # cudnn implementation exists
+        self.layer = singa_wrap.CudnnRNN()
+        if input_sample_shape is not None:
+            self.setup(input_sample_shape)
+
+    def forward(self, flag, inputs):
+        '''Forward inputs through the RNN.
+
+        Args:
+            flag, kTrain or kEval.
+            inputs, <x1, x2,...xn, hx, cx>, where xi is the input tensor for the
+                i-th position, its shape is (batch_size, input_feature_length);
+                the batch_size of xi must >= that of xi+1; hx is the initial
+                hidden state of shape (num_stacks * bidirection?2:1, batch_size,
+                hidden_size). cx is the initial cell state tensor of the same
+                shape as hy. cx is valid for only lstm. For other RNNs there is
+                no cx. Both hx and cx could be dummy tensors without shape and
+                data.
+
+        Returns:
+            <y1, y2, ... yn, hy, cy>, where yi is the output tensor for the i-th
+                position, its shape is (batch_size,
+                hidden_size * bidirection?2:1). hy is the final hidden state
+                tensor. cx is the final cell state tensor. cx is only used for
+                lstm.
+        '''
+        assert self.has_setup, 'Must call setup() before forward()'
+        assert len(inputs) > 1, 'The input to RNN must include at '\
+            'least one input tensor '\
+            'and one hidden state tensor (could be a dummy tensor)'
+        # unwrap the python Tensors into swig tensors for the C++ call
+        tensors = []
+        for t in inputs:
+            assert isinstance(t, tensor.Tensor), \
+                'input must be py Tensor %s' % (type(t))
+            tensors.append(t.singa_tensor)
+        y = self.layer.Forward(flag, tensors)
+        return tensor.from_raw_tensors(y)
+
+    def backward(self, flag, grad):
+        '''Backward gradients through the RNN.
+
+        Args:
+            flag, for future use.
+            grad, <dy1, dy2,...dyn, dhy, dcy>, where dyi is the gradient for the
+            i-th output, its shape is (batch_size, hidden_size*bidirection?2:1);
+                dhy is the gradient for the final hidden state, its shape is
+                (num_stacks * bidirection?2:1, batch_size,
+                hidden_size). dcy is the gradient for the final cell state.
+                cx is valid only for lstm. For other RNNs there is
+                no cx. Both dhy and dcy could be dummy tensors without shape and
+                data.
+
+        Returns:
+            <dx1, dx2, ... dxn, dhx, dcx>, where dxi is the gradient tensor for
+                the i-th input, its shape is (batch_size,
+                input_feature_length). dhx is the gradient for the initial
+                hidden state. dcx is the gradient for the initial cell state,
+                which is valid only for lstm.
+        '''
+        # unwrap the python Tensors into swig tensors for the C++ call
+        tensors = []
+        for t in grad:
+            assert isinstance(t, tensor.Tensor), 'grad must be py Tensor'
+            tensors.append(t.singa_tensor)
+        ret = self.layer.Backward(flag, tensors)
+        return tensor.from_raw_tensors(ret[0]), tensor.from_raw_tensors(ret[1])
+
+
+class LSTM(RNN):
+    '''LSTM layer; a thin wrapper that fixes the RNN mode to 'lstm'.
+
+    See the RNN base class for the meaning of all constructor arguments.
+    '''
+
+    def __init__(self, name, hidden_size, dropout=0.0, num_stacks=1,
+                 input_mode='linear', bidirectional=False,
+                 param_specs=None, input_sample_shape=None):
+        super(LSTM, self).__init__(name, hidden_size,  'lstm',  dropout,
+                                   num_stacks, input_mode, bidirectional,
+                                   param_specs, input_sample_shape)
+
+
+class GRU(RNN):
+    '''GRU layer; a thin wrapper that fixes the RNN mode to 'gru'.
+
+    See the RNN base class for the meaning of all constructor arguments.
+    '''
+
+    def __init__(self, name, hidden_size, dropout=0.0, num_stacks=1,
+                 input_mode='linear', bidirectional=False, param_specs=None,
+                 input_sample_shape=None):
+        super(GRU, self).__init__(name,  hidden_size, 'gru',  dropout,
+                                  num_stacks, input_mode, bidirectional,
+                                  param_specs, input_sample_shape)
+
+
+def _check_engine(engine, allowed_engines):
+    assert engine.lower() in Set(allowed_engines), \
+           '%s is not a supported engine. Pls use one of %s' % \
+           (engine, ', '.join(allowed_engines))
+
+
+def _create_layer(eng, layer):
+    ''' create singa wrap layer.
+
+    Both arguments are case insensitive.
+    Args:
+        eng, implementation engine, either 'singa' or 'cudnn'
+        layer, layer type, e.g., 'convolution', 'pooling'; for activation
+        layers, use the specific activation mode, e.g. 'relu', 'tanh'.
+
+    Returns:
+        the swig layer created for identifier '<eng>_<layer>' (lower-cased)
+    '''
+    # registered layer identifiers take the form '<engine>_<layer>'
+    layer_type = eng + '_' + layer
+    return singa_wrap.CreateLayer(layer_type.lower())
+
+
+def _set_kernel_stride_pad(conf, kernel, stride, border_mode, pad):
+    """Private function called by Convolution2D and Pooling2D.
+
+    Fills kernel_h/w, stride_h/w and pad_h/w on the given layer conf.
+    kernel, stride and pad may each be a single int (applied to both
+    dimensions) or an (h, w) tuple. If pad is None, it is derived from
+    border_mode: 'same' keeps the spatial size (odd kernels only),
+    'valid' uses zero padding.
+    """
+    if isinstance(kernel, tuple):
+        conf.kernel_h = kernel[0]
+        conf.kernel_w = kernel[1]
+    else:
+        conf.kernel_h = kernel
+        conf.kernel_w = kernel
+    if isinstance(stride, tuple):
+        conf.stride_h = stride[0]
+        conf.stride_w = stride[1]
+    else:
+        conf.stride_h = stride
+        conf.stride_w = stride
+    mode = border_mode.lower()
+    if pad is None:
+        # TODO(wangwei) check the border mode
+        if mode == 'same':
+            assert conf.kernel_h % 2 == 1 and conf.kernel_w % 2 == 1, \
+                'Must use odd kernel for mode="same", kernel is (%d, %d)' % (
+                    conf.kernel_h, conf.kernel_w)
+            pad = (conf.kernel_h / 2, conf.kernel_w / 2)
+        elif mode == 'valid':
+            pad = (0, 0)
+        else:
+            assert False, ('Unsupported border_mode: %s. '
+                           'Please use {"valid", "same"}' % border_mode)
+        # NOTE(review): this assert is always true here since both branches
+        # above assign a tuple; it was probably meant to sit outside the
+        # `if pad is None` block to validate caller-supplied pads
+        assert isinstance(pad, tuple), 'pad should be a tuple'
+    if isinstance(pad, tuple):
+        conf.pad_h = pad[0]
+        conf.pad_w = pad[1]
+    else:
+        conf.pad_h = pad
+        conf.pad_w = pad
+    return conf
+
+
+def _construct_param_specs_from_dict(specs):
+    """Convert the param specs from a dict into ParamSpec protobuf object.
+
+    Args:
+        specs (dict): the fields include
+            'name' for parameter name
+            'lr_mult' for learning rate multiplier;
+            'decay_mult' for weight decay multiplier;
+            'init' for init method, which could be 'gaussian', 'uniform',
+            'xavier' and 'msra';
+            'std', 'mean', 'high', 'low' are used by corresponding init methods;
+            'constraint' for gradient constraint, value is a float threshold for
+                clamping the gradient.
+            'regularizer' for regularization, currently support 'l2', value is a
+                float for the coefficient.
+
+    Returns:
+        a ParamSpec object
+    """
+    conf = model_pb2.ParamSpec()
+    if 'name' in specs:
+        conf.name = specs['name']
+    if 'lr_mult' in specs:
+        conf.lr_mult = specs['lr_mult']
+    if 'decay_mult' in specs:
+        conf.decay_mult = specs['decay_mult']
+    if 'init' in specs:
+        # the filler describes how to initialize the parameter values
+        filler = conf.filler
+        filler.type = specs['init'].lower()
+        if specs['init'].lower() == 'uniform':
+            assert 'low' in specs and 'high' in specs, \
+                'low and high are required for "uniform" init method'
+            filler.min = specs['low']
+            filler.max = specs['high']
+        elif specs['init'].lower() == 'gaussian':
+            assert 'mean' in specs and 'std' in specs, \
+                'std and mean are required for "gaussian" init method'
+            filler.mean = specs['mean']
+            filler.std = specs['std']
+        elif specs['init'].lower() == 'constant' and 'value' in specs:
+            filler.value = specs['value']
+    if 'regularizer' in specs:
+        conf.regularizer.coefficient = specs['regularizer']
+    if 'constraint' in specs:
+        conf.constraint.threshold = specs['constraint']
+    return conf
+
+
+def get_layer_list():
+    """ Return a list of strings which include the identifiers (tags) of all
+    layers registered with the C++ layer factory.
+    """
+    return singa_wrap.GetRegisteredLayers()
diff --git a/src/python/singa/loss.py b/src/python/singa/loss.py
new file mode 100644
index 0000000..c88290b
--- /dev/null
+++ b/src/python/singa/loss.py
@@ -0,0 +1,141 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+# =============================================================================
+
+'''
+Loss module includes a set of training loss implementations. Some are converted
+from C++ implementation, and the rest are implemented directly using python
+Tensor.
+
+Example usage::
+
+    from singa import tensor
+    from singa import loss
+    from singa.proto import model_pb2
+
+    x = tensor.Tensor((3, 5))
+    x.uniform(0, 1)  # randomly generate the prediction activation
+    y = tensor.from_numpy(np.array([0, 1, 3], dtype=np.int))  # set the truth
+
+    f = loss.SoftmaxCrossEntropy()
+    l = f.forward(model_pb2.kTrain, x, y)  # l is tensor with 3 loss values
+    g = f.backward()  # g is a tensor containing all gradients of x w.r.t l
+'''
+
+
+from . import singa_wrap as singa
+import tensor
+
+
+class Loss(object):
+    '''Base loss class.
+
+    Subclasses that wrap the C++ loss classes can use the inherited forward,
+    backward, and evaluate functions of this base class. Other subclasses need
+    to override these functions.
+    '''
+
+    def __init__(self):
+        # swig wrapper of the underlying C++ loss object; set by subclasses
+        self.swig_loss = None
+
+    def forward(self, flag, x, y):
+        '''Compute the loss values.
+
+        Args:
+            flag (int): kTrain or kEval. If it is kTrain, then the backward
+                function must be called before calling forward again.
+            x (Tensor): the prediction Tensor
+            y (Tensor): the ground truth Tensor, x.shape[0] must = y.shape[0]
+
+        Returns:
+            a tensor of floats for the loss values, one per sample
+        '''
+        return tensor.from_raw_tensor(
+            self.swig_loss.Forward(flag, x.singa_tensor, y.singa_tensor))
+
+    def backward(self):
+        '''
+        Returns:
+            the grad of x w.r.t. the loss
+        '''
+        return tensor.from_raw_tensor(self.swig_loss.Backward())
+
+    def evaluate(self, flag, x, y):  # TODO(wangwei) remove flag
+        '''
+        Args:
+            flag (int): must be kEval, to be removed
+            x (Tensor): the prediction Tensor
+            y (Tensor): the ground truth Tensor
+
+        Returns:
+            the averaged loss for all samples in x.
+        '''
+        return self.swig_loss.Evaluate(flag, x.singa_tensor, y.singa_tensor)
+
+
+class SoftmaxCrossEntropy(Loss):
+    '''This loss function is a combination of SoftMax and Cross-Entropy loss.
+
+    It converts the inputs via SoftMax function and then
+    computes the cross-entropy loss against the ground truth values.
+    '''
+
+    def __init__(self):
+        # wraps the C++ SoftmaxCrossEntropy implementation
+        self.swig_loss = singa.SoftmaxCrossEntropy()
+
+
+class SquaredError(Loss):
+    '''This loss evaluates the squared error between the prediction and the
+    truth values.
+
+    It is implemented using Python Tensor operations.
+    '''
+    def __init__(self):
+        super(Loss, SquaredError).__init__()
+        self.err = None
+
+    def forward(self, flag, x, y):
+        '''Compute the error as 0.5 * ||x-y||^2.
+
+        Args:
+            flag (int): kTrain or kEval; if kTrain, then the backward must be
+                called before calling forward again.
+            x (Tensor): the prediction Tensor
+            y (Tensor): the truth Tensor, an integer value per sample, whose
+                value is [0, x.shape[1])
+
+        Returns:
+            a Tensor with one error value per sample
+        '''
+        self.err = x - y
+        return 0.5 * tensor.squared(self.err)
+
+    def backward(self):
+        '''Compute the gradient of x w.r.t the error.
+
+        Returns:
+            x - y
+        '''
+        return self.err
+
+    def evaluate(self, flag, x, y):
+        '''Compuate the averaged error.
+
+        Returns:
+            a float value as the averaged error
+        '''
+        return tensor.sum(0.5 * tensor.squared(x - y)) / x.size()
diff --git a/src/python/singa/metric.py b/src/python/singa/metric.py
new file mode 100644
index 0000000..3a5750d
--- /dev/null
+++ b/src/python/singa/metric.py
@@ -0,0 +1,85 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+# =============================================================================
+'''This module includes a set of metric classes for evaluating the model's
+performance. The specific metric classes could be converted from C++
+implementation or implemented directly using Python.
+
+
+Example usage::
+
+    from singa import tensor
+    from singa import metric
+
+    x = tensor.Tensor((3, 5))
+    x.uniform(0, 1)  # randomly generate the prediction activation
+    x = tensor.SoftMax(x)  # normalize the prediction into probabilities
+    y = tensor.from_numpy(np.array([0, 1, 3], dtype=np.int))  # set the truth
+
+    f = metric.Accuracy()
+    acc = f.evaluate(x, y)  # averaged accuracy over all 3 samples in x
+
+'''
+
+from . import singa_wrap as singa
+import tensor
+
+
+class Metric(object):
+    '''Base metric class.
+
+    Subclasses that wrap the C++ metric classes can use the inherited forward,
+    and evaluate functions of this base class. Other subclasses need
+    to override these functions. Users need to feed in the **predictions** and
+    ground truth to get the metric values.
+    '''
+
+    def __init__(self):
+        # swig wrapper of the underlying C++ metric object; set by subclasses
+        self.swig_metric = None
+
+    def forward(self, x, y):
+        '''Compute the metric for each sample.
+
+        Args:
+            x (Tensor): predictions, one row per sample
+            y (Tensor): ground truth values, one row per sample
+
+        Returns:
+            a tensor of floats, one per sample
+        '''
+        return tensor.from_raw_tensor(
+            self.swig_metric.Forward(x.singa_tensor, y.singa_tensor))
+
+    def evaluate(self, x, y):
+        '''Compute the averaged metric over all samples.
+
+        Args:
+            x (Tensor): predictions, one row per sample
+            y (Tensor): ground truth values, one row per sample
+        Returns:
+            a float value for the averaged metric
+        '''
+        return self.swig_metric.Evaluate(x.singa_tensor, y.singa_tensor)
+
+
+class Accuracy(Metric):
+    '''Compute the top one accuracy for single label prediction tasks.
+
+    It calls the C++ functions to do the calculation.
+    '''
+    def __init__(self):
+        # wraps the C++ Accuracy implementation
+        self.swig_metric = singa.Accuracy()
diff --git a/autogen.sh b/src/python/singa/model.py
old mode 100755
new mode 100644
similarity index 91%
rename from autogen.sh
rename to src/python/singa/model.py
index ff8eacb..38d9950
--- a/autogen.sh
+++ b/src/python/singa/model.py
@@ -1,4 +1,3 @@
-#!/bin/sh
 #/**
 # * Licensed to the Apache Software Foundation (ASF) under one
 # * or more contributor license agreements.  See the NOTICE file
@@ -17,7 +16,6 @@
 # * limitations under the License.
 # */
 
-mkdir -p ./config;
-aclocal;
-autoreconf -f -i;
-automake;
+class Model(Object):
+    pass
+
diff --git a/src/python/singa/net.py b/src/python/singa/net.py
new file mode 100644
index 0000000..0026953
--- /dev/null
+++ b/src/python/singa/net.py
@@ -0,0 +1,213 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# =============================================================================
+"""
+Neural net class for constructing the nets using layers and providing access
+functions for net info, e.g., parameters.
+"""
+
+
+from .proto.model_pb2 import kTrain, kEval
+import tensor
+import layer
+import cPickle as pickle
+
+
+class FeedForwardNet(object):
+
+    def __init__(self, loss=None, metric=None):
+        self.loss = loss
+        self.metric = metric
+        self.layers = []
+        self.src_of_layer = {}
+        self.dst_of_layer = None
+        self.ordered_layers = None
+
+    def to_device(self, dev):
+        for lyr in self.layers:
+            lyr.to_device(dev)
+
+    def add(self, lyr, src=None):
+        """Append a layer into the layer list.
+
+        This function will get the sample shape from the last layer to setup
+        the newly added layer. For the first layer, it is setup outside.
+        The calling function should ensure the correctness of the layer order.
+
+        Args:
+            lyr (Layer): the layer to be added
+        """
+        if src is not None:
+            if isinstance(src, layer.Layer):
+                assert src.has_setup is True, 'the source layer must be set up'
+                self.src_of_layer[lyr.name] = [src]
+            else:
+                assert type(src) == list, 'the src must be a list of layers'
+                self.src_of_layer[lyr.name] = src
+                # print 'merge------', len(src)
+        else:
+            assert len(self.layers) > 0 or lyr.has_setup, \
+                'Source layers are needed to set up this layer'
+            if len(self.layers) > 0:
+                self.src_of_layer[lyr.name] = [self.layers[-1]]
+            else:
+                self.src_of_layer[lyr.name] = []
+        if lyr.has_setup is False:
+            # print shape
+            in_shape = self.src_of_layer[lyr.name][0].get_output_sample_shape()
+            lyr.setup(in_shape)
+            print lyr.name, lyr.get_output_sample_shape()
+        self.layers.append(lyr)
+        return lyr
+
+    def param_values(self):
+        values = []
+        layers = self.layers
+        if self.ordered_layers is not None:
+            layers = self.ordered_layers
+        for lyr in layers:
+            values.extend(lyr.param_values())
+        return values
+
+    def param_specs(self):
+        specs = []
+        layers = self.layers
+        if self.ordered_layers is not None:
+            layers = self.ordered_layers
+        for lyr in layers:
+            specs.extend(lyr.param_specs)
+        return specs
+
+    def param_names(self):
+        return [spec.name for spec in self.param_specs()]
+
+    def train(self, x, y):
+        out = self.forward(kTrain, x)
+        l = self.loss.forward(kTrain, out, y)
+        if self.metric is not None:
+            m = self.metric.evaluate(out, y)
+        return self.backward(), (l.l1(), m)
+
+    def evaluate(self, x, y):
+        """Evaluate the loss and metric of the given data"""
+        out = self.forward(kEval, x)
+        l = None
+        m = None
+        assert self.loss is not None or self.metric is not None,\
+            'Cannot do evaluation, as neither loss nor metic is set'
+        if self.loss is not None:
+            l = self.loss.evaluate(kEval, out, y)
+        if self.metric is not None:
+            m = self.metric.evaluate(out, y)
+        return l, m
+
+    def predict(self, x):
+        xx = self.forward(kEval, x)
+        return tensor.softmax(xx)
+
+    def topo_sort(self, cur, src_of_layer, visited=None, order=None):
+        if visited is None:
+            visited = {}
+            for name in src_of_layer.keys():
+                visited[name] = False
+            order = []
+        srcs = src_of_layer[cur.name]
+        for src in srcs:
+            if visited[src.name] is False:
+                visited[src.name] = True
+                self.topo_sort(src, src_of_layer, visited, order)
+        order.append(cur)
+        visited[cur.name] = True
+        return order
+
+    def forward(self, flag, x):
+        # print x.l1()
+        if self.ordered_layers is None:
+            self.ordered_layers = self.topo_sort(self.layers[-1],
+                                                 self.src_of_layer)
+        inputs = [x]
+        output_of_layer = {}
+        for cur in self.ordered_layers:
+            srcs = self.src_of_layer[cur.name]
+            disp_src = cur.name + '<--'
+            for src in srcs:
+                outs = output_of_layer[src.name]
+                if type(outs) == list:
+                    inputs.append(outs[0])
+                else:
+                    inputs.append(outs)
+                disp_src += '+' + src.name
+                # del output_of_layer[src.name]
+            # print disp_src
+            if len(inputs) == 1:
+                inputs = inputs[0]
+            output_of_layer[cur.name] = cur.forward(flag, inputs)
+            inputs = []
+            # print lyr.name, x.l1()
+        # print output_of_layer
+        return output_of_layer[self.ordered_layers[-1].name]
+
+    def backward(self):
+        if self.dst_of_layer is None:
+            self.dst_of_layer = {}
+            for cur in self.layers:
+                self.dst_of_layer[cur.name] = []
+            for cur in self.ordered_layers[1:]:
+                srcs = self.src_of_layer[cur.name]
+                for src in srcs:
+                    self.dst_of_layer[src.name].append(cur)
+        grad = self.loss.backward()
+        if len(grad.shape) > 1:
+            grad /= grad.shape[0]  # average across the batch
+        # print 'grad', grad.l1()
+        grads = [grad]
+        output_of_layer = {}
+        pgrads = []
+        for cur in reversed(self.ordered_layers):
+            for dst in self.dst_of_layer[cur.name]:
+                outputs = output_of_layer[dst.name]
+                if type(outputs) == list:
+                    grads.append(outputs[0])
+                else:
+                    grads.append(outputs)
+                # del output_of_layer[dst.name]
+            if len(grads) == 1:
+                grads = grads[0]
+            outs, _pgrads = cur.backward(kTrain, grads)
+            pgrads.append(_pgrads)
+            output_of_layer[cur.name] = outs
+            grads = []
+
+        ret = []
+        for pgrad in reversed(pgrads):
+            ret.extend(pgrad)
+        return ret
+
+    def save(self, f):
+        """Save model parameters using cpickle"""
+        params = {}
+        for (specs, val) in zip(self.param_specs(), self.param_values()):
+            val.to_host()
+            params[specs.name] = tensor.to_numpy(val)
+        with open(f, 'wb') as fd:
+            pickle.dump(params, fd)
+
+    def load(self, f):
+        """Load model parameters using cpickle"""
+        with open(f, 'rb') as fd:
+            params = pickle.load(fd)
+        for (specs, val) in zip(self.param_specs(), self.param_values()):
+            val.copy_from_numpy(params[specs.name])
diff --git a/src/python/singa/optimizer.py b/src/python/singa/optimizer.py
new file mode 100644
index 0000000..00380e0
--- /dev/null
+++ b/src/python/singa/optimizer.py
@@ -0,0 +1,377 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+# =============================================================================
+'''This module includes a set of optimizers for updating model parameters.
+
+Example usage::
+
+  from singa import optimizer
+  from singa import tensor
+
+  sgd = optimizer.SGD(lr=0.01, momentum=0.9, weight_decay=1e-4)
+  p = tensor.Tensor((3,5))
+  p.uniform(-1, 1)
+  g = tensor.Tensor((3,5))
+  g.gaussian(0, 0.01)
+
+  sgd.apply(1, g, p, 'param')  # use the global lr=0.1 for epoch 1
+  sgd.apply_with_lr(2, 0.03, g, p, 'param')  # use lr=0.03 for epoch 2
+'''
+
+from . import singa_wrap as singa
+import tensor
+from proto import model_pb2
+
+
+class Optimizer(object):
+    '''The base python optimizer class.
+
+    Typically, an optimizer is used as follows:
+
+    1. construct the optimizer
+    2. (optional) register each parameter with its specs.
+    3. use the optimizer to update parameter values given parameter gradients
+       and other optional info
+
+    The subclasses should override the apply_with_lr function to do the real
+    parameter udpate.
+
+    Args:
+        lr (float): a constant for the learning rate, mutually exclusive with
+            'lr_gen'.
+        momentum (float): a constant for the momentum value
+        weight_decay (float): the coefficent for L2 regularizer, which is
+            mutually exclusive with 'regularizer'.
+        lr_gen (function): a function returns the learning rate given
+            the current training step/epoch. It is mutually exclusive with lr.
+            If both are not set, the apply_with_lr function should be used for
+            param updating.
+        regularizer: an instance of Regularizer or RegularizerConf; If set,
+            regularization would be applied in apply_with_lr().
+            Users can also do regularization outside.
+        constraint: an instance of Constraint or ConstraintConf; If set,
+            constraint would be applied inside apply_with_lr(). Users can
+            also do regularization outside.
+    '''
+
+    def __init__(self, lr=None, momentum=None, weight_decay=None, lr_gen=None,
+                 regularizer=None, constraint=None):
+        if lr is not None:
+            assert lr_gen is None, 'Cannot set lr and lr_gen at the same time'
+
+            def lr_gen(epoch):
+                return lr
+        self.lr_gen = lr_gen
+        self.momentum = momentum
+        if weight_decay is not None:
+            assert regularizer is None, \
+                'Cannot set weight_decay and regularizer at the same time'
+            regularizer = L2Regularizer(weight_decay)
+        if regularizer is not None:
+            if isinstance(regularizer, model_pb2.RegularizerConf):
+                self.regularizer = CppRegularizer(regularizer)
+            else:
+                self.regularizer = regularizer
+        else:
+            self.regularizer = None
+        if constraint is not None:
+            if isinstance(constraint, model_pb2.ConstraintConf):
+                self.constraint = CppConstraint(constraint)
+            else:
+                self.constraint = constraint
+        else:
+            self.constraint = None
+        self.regularizers = {}
+        self.constraints = {}
+        self.decay_multiplier = {}
+        self.learning_rate_multiplier = {}
+
+    def register(self, name, specs):
+        '''Register the param specs, including creating regularizer and
+        constraint per param object. Param specific regularizer and constraint
+        have higher priority than the global ones.
+
+        Args:
+            name (str): parameter name
+            specs (ParamSpec): protobuf obj, including regularizer and
+                constraint, multipliers for learning rate and weight decay.
+        '''
+        assert isinstance(specs, model_pb2.ParamSpec), \
+            'specs should be model_pb2.ParamSpec instance'
+        if specs.HasField('regularizer'):
+            self.regularizers[name] = CppRegularizer(specs.regularizer)
+        elif specs.decay_mult != 1:
+            self.regularizers[name] = L2Regularizer(
+                specs.decay_mult * self.regularizer.coefficient)
+
+        if specs.HasField('constraint'):
+            self.constraints[name] = CppConstraint(specs.constraint)
+
+        if specs.lr_mult != 1:
+            self.learning_rate_multiplier[name] = specs.lr_mult
+
+    def apply_regularizer_constraint(self, epoch, value, grad, name=None):
+        '''Apply regularization and constraint if available.
+
+        If there are both global regularizer (constraint) and param specific
+        regularizer (constraint), it would use the param specific one.
+
+        Args:
+            value (Tensor): parameter value Tensor
+            grad (Tensor): parameter gradient Tensor
+            name (string): to get parameter specific regularizer or constraint
+            epoch (int): some regularizer or constraint would use epoch
+
+        Returns:
+            the updated gradient Tensor
+        '''
+        if name is not None and name in self.constraints:
+            self.constraints[name].apply(epoch, value, grad)
+        elif self.constraint is not None:
+            self.constraint.apply(epoch, value, grad)
+
+        if name is not None and name in self.regularizers:
+            self.regularizers[name].apply(epoch, value, grad)
+        elif self.regularizer is not None:
+            self.regularizer.apply(epoch, value, grad)
+        return grad
+
+    def apply_with_lr(self, epoch, lr, grad, value, name=None):
+        '''Do update with given learning rate.
+
+        The subclass optimizer must override this function.
+
+        Args:
+            epoch (int): training epoch (could be iteration or epoch)
+            lr (float): learning rate
+            grad (Tensor): parameter gradient
+            value (Tesnor): parameter value
+            name (string): paramter name to retrieval parameter specific
+                updating rules (including regularizer and constraint)
+
+        Returns:
+            updated parameter value
+        '''
+        assert False, 'This is the base function, pls call the subclass func'
+        return value
+
+    def apply(self, epoch, grad, value, name=None):
+        '''Do update assuming the learning rate generator is set.
+
+        The subclass optimizer does not need to override this function.
+
+        Args:
+            epoch (int): training epoch (could be iteration or epoch)
+            grad (Tensor): parameter gradient
+            value (Tesnor): parameter value
+            name (string): paramter name to retrieval parameter specific
+                updating rules (including regularizer and constraint)
+
+        Return:
+            updated parameter value
+        '''
+        assert self.lr_gen is not None, 'Learning rate generator is not set.'\
+            'Either set the lr_gen in constructor or call apply_with_lr'
+        lr = self.lr_gen(epoch)
+        return self.apply_with_lr(epoch, lr, grad, value, name)
+
+
+class SGD(Optimizer):
+    '''The vanilla Stochastic Gradient Descent algorithm with momentum.
+
+    See the base Optimizer for all arguments.
+    '''
+
+    def __init__(self, lr=None, momentum=None, weight_decay=None, lr_gen=None,
+                 regularizer=None, constraint=None):
+        super(SGD, self).__init__(lr, momentum, weight_decay, lr_gen,
+                                  regularizer, constraint)
+        # configure and create the underlying C++ SGD optimizer
+        conf = model_pb2.OptimizerConf()
+        if self.momentum is not None:
+            conf.momentum = self.momentum
+        conf.type = 'sgd'
+        self.opt = singa.CreateOptimizer('SGD')
+        self.opt.Setup(conf.SerializeToString())
+
+    def apply_with_lr(self, epoch, lr, grad, value, name):
+        '''Update value in-place via the C++ optimizer; see base class.'''
+        self.apply_regularizer_constraint(epoch, value, grad, name)
+        if name is not None and name in self.learning_rate_multiplier:
+            lr = lr * self.learning_rate_multiplier[name]
+        self.opt.Apply(epoch, lr, name, grad.singa_tensor, value.singa_tensor)
+        return value
+
+
+class Nesterov(Optimizer):
+    '''The SGD with Nesterov momentum.
+
+    See the base Optimizer for all arguments.
+    '''
+
+    def __init__(self, lr=None, momentum=0.9, weight_decay=None, lr_gen=None,
+                 regularizer=None, constraint=None):
+        super(Nesterov, self).__init__(lr, momentum, weight_decay, lr_gen,
+                                       regularizer, constraint)
+        # configure and create the underlying C++ Nesterov optimizer
+        conf = model_pb2.OptimizerConf()
+        if self.momentum is not None:
+            conf.momentum = momentum
+        conf.type = 'nesterov'
+        self.opt = singa.CreateOptimizer('Nesterov')
+        self.opt.Setup(conf.SerializeToString())
+
+    def apply_with_lr(self, epoch, lr, grad, value, name):
+        '''Update value in-place via the C++ optimizer; see base class.'''
+        self.apply_regularizer_constraint(epoch, value, grad, name)
+        if name is not None and name in self.learning_rate_multiplier:
+            lr = lr * self.learning_rate_multiplier[name]
+        self.opt.Apply(epoch, lr, name, grad.singa_tensor, value.singa_tensor)
+        return value
+
+
class AdaGrad(Optimizer):
    '''AdaGrad optimizer.

    See the base Optimizer for all constructor args.

    Args:
        epsilon (float): small number for preventing numeric error.
    '''

    def __init__(self, epsilon=1e-8, lr=None, weight_decay=None, lr_gen=None,
                 regularizer=None, constraint=None):
        # bug fix: the original called super(RMSProp, self), i.e. the wrong
        # class.  Also pass None for the momentum slot so weight_decay and
        # the following arguments bind to the correct base-class parameters
        # (SGD/Nesterov pass six positional args: lr, momentum, weight_decay,
        # lr_gen, regularizer, constraint).
        super(AdaGrad, self).__init__(lr, None, weight_decay, lr_gen,
                                      regularizer, constraint)
        conf = model_pb2.OptimizerConf()
        conf.delta = epsilon
        conf.type = 'adagrad'
        self.opt = singa.CreateOptimizer('AdaGrad')
        self.opt.Setup(conf.SerializeToString())

    def apply_with_lr(self, epoch, lr, grad, value, name):
        '''Update value in place using the given learning rate.

        Args:
            epoch (int): training epoch (could be iteration or epoch)
            lr (float): learning rate
            grad (Tensor): parameter gradient
            value (Tensor): parameter value
            name (string): parameter name

        Returns:
            the updated parameter value
        '''
        grad = self.apply_regularizer_constraint(epoch, value, grad, name)
        if name is not None and name in self.learning_rate_multiplier:
            lr = lr * self.learning_rate_multiplier[name]
        self.opt.Apply(epoch, lr, name, grad.singa_tensor, value.singa_tensor)
        return value
+
+
class RMSProp(Optimizer):
    '''RMSProp optimizer.

    See the base Optimizer for all constructor args.

    Args:
        rho (float): float within [0, 1]
        epsilon (float): small value for preventing numeric error
    '''

    def __init__(self, rho=0.9, epsilon=1e-8, lr=None, weight_decay=None,
                 lr_gen=None, regularizer=None, constraint=None):
        # pass None for the momentum slot so weight_decay and the following
        # arguments bind to the correct base-class parameters (see SGD's
        # six-positional-argument call).
        super(RMSProp, self).__init__(lr, None, weight_decay, lr_gen,
                                      regularizer, constraint)
        conf = model_pb2.OptimizerConf()
        conf.rho = rho
        conf.delta = epsilon
        # set the proto type field for consistency with SGD/Nesterov/AdaGrad
        conf.type = 'rmsprop'
        self.opt = singa.CreateOptimizer('RMSProp')
        self.opt.Setup(conf.SerializeToString())

    def apply_with_lr(self, epoch, lr, grad, value, name):
        '''Update value in place using the given learning rate.

        Args:
            epoch (int): training epoch (could be iteration or epoch)
            lr (float): learning rate
            grad (Tensor): parameter gradient
            value (Tensor): parameter value
            name (string): parameter name

        Returns:
            the updated parameter value
        '''
        grad = self.apply_regularizer_constraint(epoch, value, grad, name)
        if name is not None and name in self.learning_rate_multiplier:
            lr = lr * self.learning_rate_multiplier[name]
        self.opt.Apply(epoch, lr, name, grad.singa_tensor, value.singa_tensor)
        return value
+
+
class Regularizer(object):
    '''Base Python regularizer for parameter gradients.'''

    def apply(self, epoch, value, grad):
        '''Apply the regularizer to grad; must be overridden by subclasses.

        Args:
            epoch (int): training epoch
            value (Tensor): parameter value
            grad (Tensor): parameter gradient

        Returns:
            the updated gradient Tensor
        '''
        # consistency fix: subclasses (CppRegularizer, L2Regularizer) and the
        # optimizer's apply_regularizer_constraint all use the signature
        # (epoch, value, grad); the base class previously omitted epoch.
        assert False, 'Not Implemented. Call the subclass function.'
        return grad
+
+
class CppRegularizer(Regularizer):
    '''Wrapper for regularizer implemented using C++.

    Args:
        conf (RegularizerConf): protobuf message for the configuration.
    '''

    def __init__(self, conf):
        # instantiate the C++ regularizer by type and configure it from the
        # serialized proto message
        reg = singa.CreateRegularizer(conf.type)
        reg.Setup(conf.SerializeToString())
        self.reg = reg

    def apply(self, epoch, value, grad):
        '''Delegate to the wrapped C++ regularizer and return grad.'''
        self.reg.Apply(epoch, value.singa_tensor, grad.singa_tensor)
        return grad
+
+
class L2Regularizer(Regularizer):
    '''L2 regularization: adds coefficient * value to the gradient.

    Args:
        coefficient (float): regularization coefficient.
    '''

    def __init__(self, coefficient):
        # stored default; apply() may override it per call
        self.coefficient = coefficient

    def apply(self, epoch, value, grad, coefficient=None):
        '''Apply L2 regularization in place: grad += coefficient * value.

        Args:
            epoch (int): training epoch (unused by this regularizer)
            value (Tensor): parameter value
            grad (Tensor): parameter gradient, updated in place
            coefficient (float, optional): overrides the constructor value

        Returns:
            the updated gradient Tensor
        '''
        if coefficient is None:
            assert self.coefficient is not None, 'Must set the coefficient'
            coefficient = self.coefficient
        # skip the axpy entirely when the coefficient is zero
        if coefficient != 0:
            tensor.axpy(coefficient, value, grad)
        return grad
+
+
class Constraint(object):
    '''Base Python constraint class for parameter gradients.'''

    def apply(self, epoch, value, grad):
        '''No-op in the base class; subclasses override to modify grad.

        Returns:
            the (unmodified) gradient
        '''
        return grad
+
+
class CppConstraint(Constraint):
    '''Wrapper for constraints implemented using C++.

    Args:
        conf (ConstraintConf): protobuf message for the configuration.
    '''

    def __init__(self, conf):
        # instantiate the C++ constraint by type and configure it from the
        # serialized proto message
        cons = singa.CreateConstraint(conf.type)
        cons.Setup(conf.SerializeToString())
        self.constraint = cons

    def apply(self, epoch, value, grad):
        '''Delegate to the wrapped C++ constraint and return grad.'''
        self.constraint.Apply(epoch, value.singa_tensor, grad.singa_tensor)
        return grad
+
+
class L2Constraint(Constraint):
    '''Rescale the gradient so that its L2 norm is at most a given threshold.'''

    def __init__(self, threshold=None):
        self.threshold = threshold

    def apply(self, epoch, value, grad, threshold=None):
        '''Clip the gradient in place to the threshold on its L2 norm.

        Args:
            epoch (int): training epoch (unused by this constraint)
            value (Tensor): parameter value (unused by this constraint)
            grad (Tensor): parameter gradient, rescaled in place
            threshold (float, optional): overrides the constructor value

        Returns:
            the (possibly rescaled) gradient Tensor
        '''
        if threshold is None:
            assert self.threshold is not None, 'Must set the threshold'
            threshold = self.threshold
        nrm = grad.l2()
        # bug fix: only scale down when the norm exceeds the threshold; the
        # previous code rescaled unconditionally, which magnified gradients
        # whose norm was below the threshold and divided by zero when the
        # norm was 0
        if nrm > threshold:
            grad *= threshold / nrm
        return grad
diff --git a/src/python/singa/tensor.py b/src/python/singa/tensor.py
new file mode 100644
index 0000000..f6bca43
--- /dev/null
+++ b/src/python/singa/tensor.py
@@ -0,0 +1,1011 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+# =============================================================================
+"""
+Example usage::
+
+    from singa import tensor
+    from singa import device
+
+    # create a tensor with shape (2,3), default CppCPU device and float32
+    x = tensor.Tensor((2,3))
+    x.set_value(0.4)
+
+    # create a tensor from a numpy array
+    y = tensor.Tensor((3, 3))
+    y.uniform(-1, 1)
+
+    z = mult(x, y)  # gemm -> z of shape (2, 3)
+
+    x += z # element-wise addition
+
+    dev = device.create_cuda_gpu()
+    x.to_device(dev)  # move the data to a gpu device
+
+    r = relu(x)
+
+    r.to_host()  # move the data back to host cpu
+    s = r.to_numpy()  # tensor -> numpy array, r must be on cpu
+
+
+There are two sets of tensor functions,
+
+Tensor member functions
+    which would change the internal state of the Tensor instance.
+Tensor module functions
+    which accept Tensor instances as arguments and return Tensor instances.
+
+Every Tensor instance must be initialized before reading data from it.
+"""
+
+import numpy as np
+from functools import reduce
+from .proto import core_pb2
+from . import singa_wrap as singa
+import device as pydevice
+
+
+class Tensor(object):
+    '''Create a Py Tensor, which wraps a swig converted Tensor from CPP Tensor
+
+    The three arguments are three attributes of the Tensor.
+
+    Args:
+        shape (list<int>): a list of integers for the tensor shape. If shape is
+            not specified, the created tensor is called a dummy tensor.
+        device: a swig converted Device instance using the device moduel . If it
+            is None, then the default host device would be used.
+        dtype: data type. currently, most operations only accept kFloat32.
+    '''
+
+    def __init__(self, shape=None, device=None, dtype=core_pb2.kFloat32):
+        if shape is None:
+            # call constructor of singa::Tensor
+            self.singa_tensor = singa.Tensor()
+            return
+        else:
+            assert isinstance(shape, tuple), 'shape should be tuple'
+            if device is None:
+                device = pydevice.get_default_device()
+                self.singa_tensor = singa.Tensor(list(shape), device, dtype)
+            else:
+                self.singa_tensor = singa.Tensor(list(shape), device, dtype)
+        self.shape = shape
+        self.dtype = dtype
+        self.device = device
+
+    def ndim(self):
+        '''
+        Returns:
+            the number of dimensions of the tensor.
+        '''
+        return self.singa_tensor.nDim()
+
+    def is_transpose(self):
+        '''
+        Returns:
+            True if the internal data is transposed; otherwise False.
+        '''
+        return self.singa_tensor.transpose()
+
+    def size(self):  # TODO(wangwei) compute size
+        '''
+        Returns:
+            the number of elements of the tensor.
+        '''
+        return self.singa_tensor.Size()
+
+    def memsize(self):
+        '''
+        Returns:
+            the number of Bytes allocated for this tensor.
+        '''
+        return self.singa_tensor.MemSize()
+
+    def reshape(self, shape):
+        '''Change the tensor shape.
+
+        Args:
+            shape (list<int>): new shape, which should have the same volumn as
+                the original shape.
+        '''
+        assert product(self.shape) == product(shape), \
+            'product of shape should be equal'
+        self.shape = shape
+        self.singa_tensor.Reshape(list(shape))
+
+    def reset_like(self, t):
+        '''Reset the shape, dtype and device as the given tensor.
+
+        Args:
+            t (Tensor)
+        '''
+        self.singa_tensor.ResetLike(t.singa_tensor)
+        self.shape = t.shape
+        self.device = t.device
+        self.dtype = t.dtype
+
+    '''
+    def as_type(self, dtype):
+        Change the data type.
+
+        Args:
+            dtype:
+        self.singa_tensor.AsType(dtype)
+    '''
+
+    def to_device(self, device):
+        '''Move the tensor data onto a given device.
+
+        Args:
+            device: a swig Device converted from CudaGPU or CppCPU or OpenclGPU
+        '''
+        self.singa_tensor.ToDevice(device)
+        self.device = device
+
+    def to_host(self):
+        '''Move the tensor data onto the default host CppCPU device.
+        '''
+        self.singa_tensor.ToHost()
+        self.device = pydevice.default_device
+
+    def l2(self):
+        '''
+        Returns:
+            the L2 norm.
+        '''
+        return self.singa_tensor.L2()
+
+    def l1(self):
+        '''
+        Returns:
+            the L1 norm.
+        '''
+        return self.singa_tensor.L1()
+
+    def set_value(self, x):
+        '''Set all elements of the tensor to be the give value.
+
+        Args:
+            x (float), a float value to be set to all elements.
+        '''
+        # assert type(x) == float, 'set value only accepts float input'
+        # if isinstance(x, float):
+        self.singa_tensor.floatSetValue(x)
+
+    def copy_from_numpy(self, np_array, offset=0):
+        ''' Copy the data from the numpy array.
+
+        Args:
+            np_array: source numpy array
+            offset (int): destination offset
+        '''
+        assert np_array.size == self.size(), 'tensor shape should be the same'
+        if not np_array.ndim == 1:
+            np_array = np_array.flatten()
+        dt = np_array.dtype
+        if dt == np.float32:
+            self.singa_tensor.floatCopyDataFromHostPtr(np_array)
+        elif dt == np.int or dt == np.int32:
+            self.singa_tensor.intCopyDataFromHostPtr(np_array)
+        else:
+            print 'Not implemented yet for ', dt
+
+    def copy_data(self, t):
+        '''Copy data from other Tensor instance.
+
+        Args:
+            t (Tensor): source Tensor.
+        '''
+        assert isinstance(t, Tensor), 't must be a singa Tensor instance'
+        self.singa_tensor.CopyData(t.singa_tensor)
+
+    def clone(self):
+        '''
+        Returns:
+            a new Tensor which does deep copy of this tensor
+        '''
+        return _call_singa_func(self.singa_tensor.Clone)
+
+    def T(self):
+        ''' shallow copy, negate the transpose field.
+
+        Returns:
+            a new Tensor which shares the underlying data memory (shallow copy)
+            but is marked as a transposed version of this tensor.
+        '''
+        return _call_singa_func(self.singa_tensor.T)
+
+    def copy(self):
+        '''shallow copy calls copy constructor of singa::Tensor
+        '''
+        return _call_singa_func(singa.Tensor, self.singa_tensor)
+
+    def deepcopy(self):
+        '''Same as clone().
+
+        Returns:
+            a new Tensor
+        '''
+        return self.clone()
+
+    def bernoulli(self, p):
+        '''Sample 0/1 for each element according to the given probability.
+
+        Args:
+            p (float): with probability p, each element is sample to 1.
+        '''
+        singa.floatBernoulli(float(p), self.singa_tensor)
+
+    def gaussian(self, mean, std):
+        '''Generate a value for each element following a Gaussian distribution.
+
+        Args:
+            mean (float): mean of the distribution
+            std (float): standard variance of the distribution
+        '''
+        singa.floatGaussian(float(mean), float(std), self.singa_tensor)
+
+    def uniform(self, low, high):
+        '''Generate a value for each element following a uniform distribution.
+
+        Args:
+            low (float): the lower bound
+            high (float): the hight bound
+        '''
+        singa.floatUniform(float(low), float(high), self.singa_tensor)
+
+    def add_column(self, v):
+        '''Add a tensor to each column of this tensor.
+
+        Args:
+            v (Tensor): a Tensor to be added as a column to this tensor.
+        '''
+        singa.AddColumn(v.singa_tensor, self.singa_tensor)
+
+    def add_row(self, v):
+        '''Add a tensor to each row of this tensor.
+
+        Args:
+            v (Tensor): a Tensor to be added as a row to this tensor.
+        '''
+        singa.AddRow(v.singa_tensor, self.singa_tensor)
+
+    def div_column(self, v):
+        '''Divide each column of this tensor by v.
+
+        Args:
+            v (Tensor): 1d tensor of the same length the column of self.
+        '''
+        singa.DivColumn(v.singa_tensor, self.singa_tensor)
+
+    def div_row(self, v):
+        '''Divide each row of this tensor by v.
+
+        Args:
+            v (Tensor): 1d tensor of the same length the row of self.
+        '''
+        singa.DivRow(v.singa_tensor, self.singa_tensor)
+
+    def mult_column(self, v):
+        '''Multiply each column of this tensor by v element-wisely.
+
+        Args:
+            v (Tensor): 1d tensor of the same length the column of self.
+        '''
+        singa.MultColumn(v.singa_tensor, self.singa_tensor)
+
+    def mult_row(self, v):
+        '''Multiply each row of this tensor by v element-wisely.
+
+        Args:
+            v (Tensor): 1d tensor of the same length the row of self.
+        '''
+        singa.MultRow(v.singa_tensor, self.singa_tensor)
+
+    '''
+    python operators (+=, -=, *=, /=) for singa::Tensor unary operators
+    '''
+
+    def __iadd__(self, x):
+        ''' inplace element-wise addition with a tensor or a float value.
+
+        Args:
+            x (float or Tensor):
+        '''
+        if isinstance(x, Tensor):
+            self.singa_tensor += x.singa_tensor
+        else:
+            self.singa_tensor += float(x)
+        return self
+
+    def __isub__(self, x):
+        ''' inplace element-wise subtraction with a tensor or a float value.
+
+        Args:
+            x (float or Tensor):
+        '''
+
+        if isinstance(x, Tensor):
+            self.singa_tensor -= x.singa_tensor
+        else:
+            self.singa_tensor -= float(x)
+        return self
+
+    def __imul__(self, x):
+        ''' inplace element-wise multiplication with a tensor or a float value.
+
+        Args:
+            x (float or Tensor):
+        '''
+        if isinstance(x, Tensor):
+            self.singa_tensor *= x.singa_tensor
+        else:
+            self.singa_tensor *= float(x)
+        return self
+
+    def __idiv__(self, x):
+        ''' inplace element-wise division by a tensor or a float value.
+
+        Args:
+            x (float or Tensor):
+        '''
+        if isinstance(x, Tensor):
+            self.singa_tensor /= x.singa_tensor
+        else:
+            self.singa_tensor /= float(x)
+        return self
+
+    '''
+    python operators (+, -, *, /, <, <=, >, >=) for singa binary operators
+    '''
+
+    def __add__(self, rhs):
+        if isinstance(rhs, Tensor):
+            return _call_singa_func(singa.Add_TT,
+                                    self.singa_tensor, rhs.singa_tensor)
+        else:
+            return _call_singa_func(singa.Add_Tf,
+                                    self.singa_tensor, rhs)
+
+    def __sub__(self, rhs):
+        if isinstance(rhs, Tensor):
+            return _call_singa_func(singa.Sub_TT,
+                                    self.singa_tensor, rhs.singa_tensor)
+        else:
+            return _call_singa_func(singa.Sub_Tf,
+                                    self.singa_tensor, rhs)
+
+    def __mul__(self, rhs):
+        if isinstance(rhs, Tensor):
+            return _call_singa_func(singa.EltwiseMul_TT,
+                                    self.singa_tensor, rhs.singa_tensor)
+        else:
+            return _call_singa_func(singa.EltwiseMul_Tf,
+                                    self.singa_tensor, rhs)
+
+    def __div__(self, rhs):
+        if isinstance(rhs, Tensor):
+            return _call_singa_func(singa.Div_TT,
+                                    self.singa_tensor, rhs.singa_tensor)
+        else:
+            return _call_singa_func(singa.Div_Tf,
+                                    self.singa_tensor, rhs)
+
+    def __lt__(self, rhs):
+        if isinstance(rhs, Tensor):
+            return _call_singa_func(singa.LT_TT, self.singa_tensor,
+                                    rhs.singa_tensor)
+        else:
+            return _call_singa_func(singa.LT_Tf, self.singa_tensor, rhs)
+
+    def __le__(self, rhs):
+        if isinstance(rhs, Tensor):
+            return _call_singa_func(
+                singa.LE_TT,
+                self.singa_tensor,
+                rhs.singa_tensor)
+        else:
+            return _call_singa_func(singa.LE_Tf, self.singa_tensor, rhs)
+
+    def __gt__(self, rhs):
+        if isinstance(rhs, Tensor):
+            return _call_singa_func(
+                singa.GT_TT,
+                self.singa_tensor,
+                rhs.singa_tensor)
+        else:
+            return _call_singa_func(singa.GT_Tf, self.singa_tensor, rhs)
+
+    def __ge__(self, rhs):
+        if isinstance(rhs, Tensor):
+            return _call_singa_func(
+                singa.GE_TT,
+                self.singa_tensor,
+                rhs.singa_tensor)
+        else:
+            return _call_singa_func(singa.GE_Tf, self.singa_tensor, rhs)
+
+
+''' python functions for global functions in Tensor.h
+'''
+
+
def from_raw_tensor(t):
    '''Wrap a swig-converted singa tensor into a Python Tensor instance.'''
    wrapped = Tensor(t.shape(), t.device(), t.data_type())
    wrapped.singa_tensor = t
    return wrapped
+
+
def from_raw_tensors(tt):
    '''Wrap each swig tensor in tt; returns a list of Tensor instances.'''
    return [from_raw_tensor(t) for t in list(tt)]
+
+
def product(shape):
    '''Multiply all entries of the shape.

    Args:
        shape: an iterable of integers.

    Returns:
        the product of the entries; 1 for an empty shape (robustness fix:
        reduce without an initializer raised TypeError on empty input).
    '''
    return reduce(lambda x, y: x * y, shape, 1)
+
+
def sizeof(dtype):
    '''Look up the byte width of a SINGA data type.

    Returns:
        the number of bytes of the given SINGA data type defined in core.proto
    '''
    return singa.SizeOf(dtype)
+
+
def reshape(t, s):
    '''Return a new Tensor holding t's data viewed with shape s.

    Args:
        t (Tensor): the tensor to be changed
        s (list<int>): the new shape, which must have the same volume as the
            old shape.

    Returns:
        the new Tensor
    '''
    return _call_singa_func(singa.Reshape, t.singa_tensor, s)
+
+
def copy_data_to_from(dst, src, size, dst_offset=0, src_offset=0):
    '''Copy elements between two Tensor instances, possibly across devices.

    Args:
        dst (Tensor): destination Tensor
        src (Tensor): source Tensor
        size (int) : number of elements to copy
        dst_offset (int): element offset into dst
        src_offset (int): element offset into src
    '''
    singa.CopyDataToFrom(dst.singa_tensor, src.singa_tensor, size,
                         dst_offset, src_offset)
+
+
def from_numpy(np_array):
    '''Build a Tensor with the shape, dtype and values of a numpy array.

    Args:
        np_array: the source numpy array.

    Returns:
        a Tensor allocated on the default CppCPU device.
    '''
    result = Tensor(np_array.shape)
    result.copy_from_numpy(np_array)
    return result
+
+
+def to_numpy(t):
+    '''Convert the tensor into a numpy array.
+
+    Since numpy array is allocated on CPU devices, the input Tensor instance
+    must be on the default CppCPU device.
+
+    Args:
+        t (Tensor), a Tensor on the default CppCPU device.
+
+    Returns:
+        a numpy array
+    '''
+    assert (t.device.id() == -1) or (t.device is None), \
+        'Please move the tensor onto the default host device'
+
+    if t.dtype == core_pb2.kFloat32:
+        np_array = t.singa_tensor.floatGetValue(int(t.size()))
+    elif t.dtype == core_pb2.kInt:
+        np_array = t.singa_tensor.intGetValue(int(t.size()))
+    else:
+        print 'Not implemented yet for ', t.dtype
+    return np_array.reshape(t.shape)
+
+
def abs(t):
    '''Element-wise absolute value.

    Args:
        t (Tensor): input Tensor

    Returns:
        a new Tensor with y = abs(x) for each element x of t
    '''
    return _call_singa_func(singa.Abs, t.singa_tensor)
+
+
def exp(t):
    '''Element-wise exponential.

    Args:
        t (Tensor): input Tensor

    Returns:
        a new Tensor with y = exp(x) for each element x of t
    '''
    return _call_singa_func(singa.Exp, t.singa_tensor)
+
+
def log(t):
    '''Element-wise natural logarithm.

    Args:
        t (Tensor): input Tensor

    Returns:
        a new Tensor with y = log(x) for each element x of t
    '''
    return _call_singa_func(singa.Log, t.singa_tensor)
+
+
def relu(t):
    '''Element-wise rectified linear unit.

    Args:
        t (Tensor): input Tensor

    Returns:
        a new Tensor with y = x for x > 0 and y = 0 otherwise, for each
        element x of t
    '''
    return _call_singa_func(singa.ReLU, t.singa_tensor)
+
+
def sigmoid(t):
    '''Element-wise logistic sigmoid.

    Args:
        t (Tensor): input Tensor

    Returns:
        a new Tensor with y = sigmoid(x) for each element x of t
    '''
    return _call_singa_func(singa.Sigmoid, t.singa_tensor)
+
+
def square(t):
    '''Element-wise square.

    Args:
        t (Tensor): input Tensor

    Returns:
        a new Tensor with y = x * x for each element x of t
    '''
    return _call_singa_func(singa.Square, t.singa_tensor)
+
+
def tanh(t):
    '''Element-wise hyperbolic tangent.

    Args:
        t (Tensor): input Tensor

    Returns:
        a new Tensor with y = tanh(x) for each element x of t
    '''
    return _call_singa_func(singa.Tanh, t.singa_tensor)
+
+
def sum(t, axis=None):
    '''Sum elements of the input tensor along the given axis.

    Args:
        t (Tensor): input Tensor
        axis (int, optional): if None, sum over all elements; otherwise sum
            along the given axis, e.g. 0 -- sum each column; 1 -- sum each
            row.

    Returns:
        a float value as the sum of all elements, or a new Tensor
    '''
    if axis is not None:
        return _call_singa_func(singa.Sum, t.singa_tensor, axis)
    return singa.floatSum(t.singa_tensor)
+
+
def pow(t, x, out=None):
    '''Element-wise power.

    Args:
        t (Tensor): input tensor
        x (float or Tensor): y[i] = t[i]^x if x is a float value; otherwise,
            y[i] = t[i]^x[i] if x is a tensor.
        out (None or Tensor): if None, a new Tensor is constructed for the
            result; otherwise, the result is written into out.

    Returns:
        the result tensor.
    '''
    if out is not None:
        if isinstance(x, Tensor):
            singa.Pow(t.singa_tensor, x.singa_tensor, out.singa_tensor)
        else:
            singa.Pow_f_out(t.singa_tensor, x, out.singa_tensor)
        return out
    if isinstance(x, Tensor):
        return _call_singa_func(singa.Pow, t.singa_tensor, x.singa_tensor)
    return _call_singa_func(singa.Pow_f, t.singa_tensor, x)
+
+
def average(t, axis=None):
    '''Average elements of the input tensor.

    Args:
        t (Tensor): input Tensor
        axis (int, optional): if None, average all elements; otherwise average
            along the given dimension. 0 for averaging each column; 1 for
            averaging each row.

    Returns:
        a float value if axis is None; otherwise, a new Tensor for the result.
    '''
    # bug fix: the original dispatched on ndim only, so a multi-dimensional
    # tensor with axis=None passed None into singa.Average instead of
    # averaging all elements as documented
    if axis is None or t.ndim() <= 1:
        return singa.floatSum(t.singa_tensor) / t.size()
    else:
        return _call_singa_func(singa.Average, t.singa_tensor, axis)
+
+
def softmax(t, out=None):
    '''Apply SoftMax for each row of the Tensor.

    Args:
        t (Tensor): the input 1d or 2d tensor
        out (Tensor, optional): if not None, it is used to store the result

    Returns:
        the result Tensor
    '''
    if out is not None:
        singa.SoftMax(t.singa_tensor, out.singa_tensor)
        return out
    return _call_singa_func(singa.SoftMax, t.singa_tensor)
+
+
def lt(t, x):
    '''Element-wise comparison for t < x.

    Args:
        t (Tensor): left hand side operand
        x (Tensor or float): right hand side operand

    Returns:
        a Tensor with each element being t[i] < x ? 1.0f:0.0f,
        or t[i] < x[i] ? 1.0f:0.0f
    '''
    return t < x
+
+
def le(t, x):
    '''Element-wise comparison for t <= x.

    Args:
        t (Tensor): left hand side operand
        x (Tensor or float): right hand side operand

    Returns:
        a Tensor with each element being t[i] <= x ? 1.0f:0.0f,
        or t[i] <= x[i] ? 1.0f:0.0f
    '''
    return t <= x
+
+
def gt(t, x):
    '''Element-wise comparison for t > x.

    Args:
        t (Tensor): left hand side operand
        x (Tensor or float): right hand side operand

    Returns:
        a Tensor with each element being t[i] > x ? 1.0f:0.0f,
        or t[i] > x[i] ? 1.0f:0.0f
    '''
    return t > x
+
+
def ge(t, x):
    '''Element-wise comparison for t >= x.

    Args:
        t (Tensor): left hand side operand
        x (Tensor or float): right hand side operand

    Returns:
        a Tensor with each element being t[i] >= x ? 1.0f:0.0f,
        or t[i] >= x[i] ? 1.0f:0.0f
    '''
    return t >= x
+
+
def add(lhs, rhs, ret=None):
    '''Element-wise addition.

    Args:
        lhs (Tensor)
        rhs (Tensor)
        ret (Tensor, optional): if not None, the result is stored in it;
            otherwise, a new Tensor is created for the result.

    Returns:
        the result Tensor
    '''
    if ret is not None:
        if isinstance(rhs, Tensor):
            singa.Add(lhs.singa_tensor, rhs.singa_tensor, ret.singa_tensor)
        else:
            singa.Add_Tf_out(lhs.singa_tensor, rhs, ret.singa_tensor)
        return ret
    # delegate to Tensor.__add__()
    return lhs + rhs
+
+
def sub(lhs, rhs, ret=None):
    '''Element-wise subtraction.

    Args:
        lhs (Tensor)
        rhs (Tensor)
        ret (Tensor, optional): if not None, the result is stored in it;
            otherwise, a new Tensor is created for the result.

    Returns:
        the result Tensor
    '''
    if ret is not None:
        if isinstance(rhs, Tensor):
            singa.Sub(lhs.singa_tensor, rhs.singa_tensor, ret.singa_tensor)
        else:
            singa.Sub_Tf_out(lhs.singa_tensor, rhs, ret.singa_tensor)
        return ret
    # delegate to Tensor.__sub__()
    return lhs - rhs
+
+
def eltwise_mult(lhs, rhs, ret=None):
    '''Element-wise multiplication.

    Args:
        lhs (Tensor)
        rhs (Tensor)
        ret (Tensor, optional): if not None, the result is stored in it;
            otherwise, a new Tensor is created for the result.

    Returns:
        the result Tensor
    '''
    if ret is not None:
        if isinstance(rhs, Tensor):
            singa.EltwiseMult(lhs.singa_tensor, rhs.singa_tensor,
                              ret.singa_tensor)
        else:
            singa.EltwiseMult_Tf_out(lhs.singa_tensor, rhs,
                                     ret.singa_tensor)
        return ret
    # delegate to Tensor.__mul__()
    return lhs * rhs
+
+
def mult(A, B, C=None, alpha=1.0, beta=0.0):
    '''Do matrix-matrix or matrix-vector multiplication.

    Computes C = alpha * A * B + beta * C.

    Args:
        A (Tensor): 2d Tensor
        B (Tensor): If B is a 1d Tensor, GEMV would be invoked for
            matrix-vector multiplication; otherwise GEMM would be invoked.
        C (Tensor, optional): for storing the result; If None, a new Tensor
            would be created.
        alpha (float)
        beta (float)

    Returns:
        the result Tensor
    '''
    if C is not None:
        singa.floatMult(alpha, A.singa_tensor, B.singa_tensor,
                        beta, C.singa_tensor)
        return C
    return _call_singa_func(singa.Mult, A.singa_tensor, B.singa_tensor)
+
+
def div(lhs, rhs, ret=None):
    '''Element-wise division.

    Args:
        lhs (Tensor)
        rhs (Tensor)
        ret (Tensor, optional): if not None, the result is stored in it;
            otherwise, a new Tensor is created for the result.

    Returns:
        the result Tensor
    '''
    if ret is not None:
        if isinstance(rhs, Tensor):
            singa.Div(lhs.singa_tensor, rhs.singa_tensor, ret.singa_tensor)
        else:
            singa.Div_Tf_out(lhs.singa_tensor, rhs, ret.singa_tensor)
        return ret
    # delegate to Tensor.__div__()
    return lhs / rhs
+
+
def axpy(alpha, x, y):
    '''Element-wise operation for y += alpha * x.

    Args:
        alpha (float)
        x (Tensor)
        y (Tensor): updated in place

    Returns:
        y
    '''
    singa.floatAxpy(float(alpha), x.singa_tensor, y.singa_tensor)
    return y
+
+
def bernoulli(p, t):
    '''Fill t with samples from a Bernoulli distribution.

    Args:
        p (float): each element is 1 with probability p; and 0 with 1 - p
        t (Tensor): the results are put into t

    Returns:
        t
    '''
    singa.floatBernoulli(float(p), t.singa_tensor)
    return t
+
+
def gaussian(mean, std, t):
    '''Fill t with samples from a Gaussian distribution.

    Args:
        mean (float): the mean of the Gaussian distribution.
        std (float): the standard variance of the Gaussian distribution.
        t (Tensor): the results are put into t

    Returns:
        t
    '''
    singa.floatGaussian(float(mean), float(std), t.singa_tensor)
    return t
+
+
def uniform(low, high, t):
    '''Fill t with samples from a Uniform distribution.

    Args:
        low (float): the lower bound
        high (float): the higher bound
        t (Tensor): the results are put into t

    Returns:
        t
    '''
    singa.floatUniform(float(low), float(high), t.singa_tensor)
    return t
+
+
def add_column(alpha, v, beta, M):
    '''Add v to each column of M.

    Denote each column of M as m, then m = alpha * v + beta * m.

    Args:
        alpha (float)
        v (Tensor)
        beta (float)
        M (Tensor): 2d tensor, updated in place
    Returns:
        M
    '''
    singa.floatAddColumn(float(alpha), float(beta), v.singa_tensor,
                         M.singa_tensor)
    return M
+
+
def add_row(alpha, v, beta, M):
    '''Add v to each row of M.

    Denote each row of M as m, then m = alpha * v + beta * m.

    Args:
        alpha (float)
        v (Tensor)
        beta (float)
        M (Tensor): 2d tensor, updated in place
    Returns:
        M
    '''
    # cast alpha/beta to float for consistency with add_column, so integer
    # arguments are accepted by the swig binding as well
    singa.floatAddRow(float(alpha), float(beta), v.singa_tensor,
                      M.singa_tensor)
    return M
+
+
def sum_columns(M):
    '''Sum all columns into a single column.

    Args:
        M (Tensor): the input 2d tensor.

    Returns:
        a new Tensor as the resulted column.
    '''
    assert M.ndim() == 2, 'M.nDim() is supposed to be 2'
    col = Tensor((M.shape[0], 1))
    singa.SumColumns(M.singa_tensor, col.singa_tensor)
    return col
+
+
def sum_rows(M):
    '''Sum all rows into a single row.

    Args:
        M (Tensor): the input 2d tensor.

    Returns:
        a new Tensor as the resulted row.
    '''
    assert M.ndim() == 2, 'M.nDim() is supposed to be 2'
    row = Tensor((1, M.shape[1]))
    singa.SumRows(M.singa_tensor, row.singa_tensor)
    return row
+
+
+''' private functions, internally used
+'''
+
+
def _call_singa_func(_singa_func, *args):
    '''Invoke a singa global function that returns a C++ Tensor and wrap the
    result in a new Python Tensor (copying shape, device and dtype over).
    '''
    result = Tensor()
    result.singa_tensor = _singa_func(*args)
    result.shape = tuple(result.singa_tensor.shape())
    result.device = result.singa_tensor.device()
    result.dtype = result.singa_tensor.data_type()
    return result
diff --git a/src/python/singa/utils.py b/src/python/singa/utils.py
new file mode 100644
index 0000000..a192cff
--- /dev/null
+++ b/src/python/singa/utils.py
@@ -0,0 +1,47 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# =============================================================================
+
+import sys
+
+
+def update_progress(progress, info):
+    """Display progress bar and user info.
+
+    Args:
+        progress (float): progress [0, 1], negative for halt, and >=1 for done.
+        info (str): a string for user provided info to be displayed.
+    """
+    barLength = 20  # bar length
+    status = ""
+    if isinstance(progress, int):
+        progress = float(progress)
+    if not isinstance(progress, float):
+        progress = 0
+        status = "error: progress var must be float. "
+    if progress < 0:
+        progress = 0
+        status = "Halt. "
+    if progress >= 1:
+        progress = 1
+        status = "Done. "
+    status = status + info
+    block = int(round(barLength*progress))
+    text = "[{0}] {1:3.1f}% {2}".format("."*block + " "*(barLength-block),
+                                        progress*100, status)
+    sys.stdout.write(text)
+    sys.stdout.write('\b'*(9 + barLength + len(status)))
+    sys.stdout.flush()
diff --git a/src/python/swig/config.i.in b/src/python/swig/config.i.in
new file mode 100644
index 0000000..5743ba3
--- /dev/null
+++ b/src/python/swig/config.i.in
@@ -0,0 +1,4 @@
+// Pass in cmake configurations to swig
+#cmakedefine01 USE_CUDA
+#cmakedefine01 USE_CUDNN
+#cmakedefine CUDNN_VERSION_SWIG ${CUDNN_VERSION_SWIG}
diff --git a/src/python/swig/core_device.i b/src/python/swig/core_device.i
new file mode 100644
index 0000000..b3521be
--- /dev/null
+++ b/src/python/swig/core_device.i
@@ -0,0 +1,69 @@
+/************************************************************
+*
+* Licensed to the Apache Software Foundation (ASF) under one
+* or more contributor license agreements.  See the NOTICE file
+* distributed with this work for additional information
+* regarding copyright ownership.  The ASF licenses this file
+* to you under the Apache License, Version 2.0 (the
+* "License"); you may not use this file except in compliance
+* with the License.  You may obtain a copy of the License at
+*
+*   http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing,
+* software distributed under the License is distributed on an
+* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+* KIND, either express or implied.  See the License for the
+* specific language governing permissions and limitations
+* under the License.
+*
+*************************************************************/
+
+/*interface file for swig */
+
+%module core_device
+%include "std_vector.i"
+%include "std_string.i"
+%include "std_pair.i"
+%include "std_shared_ptr.i"
+
+%{
+#include "singa/core/device.h"
+%}
+
+/* smart pointer to avoid memory leak */
+%shared_ptr(singa::Device);
+
+namespace std{
+%template(sizePair) std::pair<size_t, size_t>;
+%template(vectorPair) std::vector<std::pair<size_t, size_t>>;
+%template(vectorSharedPtr) std::vector<std::shared_ptr<singa::Device>>;
+}
+
+namespace singa{
+
+class Device {
+  public:
+  virtual void SetRandSeed(unsigned seed) = 0;
+  std::shared_ptr<Device> host();
+  int id() const;
+};
+
+class Platform {
+ public:
+#if USE_CUDA
+  static int GetNumGPUs();
+  static const std::vector<int> GetGPUIDs();
+  static const std::pair<size_t, size_t> GetGPUMemSize(const int device);
+  static const std::vector<std::pair<size_t, size_t>> GetGPUMemSize();
+  static const std::string DeviceQuery(int id, bool verbose = false);
+  static const std::vector<std::shared_ptr<Device> >
+  CreateCudaGPUs(const size_t num_devices, size_t init_size = 0);
+  static const std::vector<std::shared_ptr<Device>>
+  CreateCudaGPUsOn(const std::vector<int> &devices, size_t init_size = 0);
+#endif // USE_CUDA
+  static std::shared_ptr<Device> GetDefaultDevice();
+};
+
+}
+
diff --git a/src/python/swig/core_tensor.i b/src/python/swig/core_tensor.i
new file mode 100644
index 0000000..60f8b45
--- /dev/null
+++ b/src/python/swig/core_tensor.i
@@ -0,0 +1,371 @@
+/************************************************************
+*
+* Licensed to the Apache Software Foundation (ASF) under one
+* or more contributor license agreements.  See the NOTICE file
+* distributed with this work for additional information
+* regarding copyright ownership.  The ASF licenses this file
+* to you under the Apache License, Version 2.0 (the
+* "License"); you may not use this file except in compliance
+* with the License.  You may obtain a copy of the License at
+*
+*   http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing,
+* software distributed under the License is distributed on an
+* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+* KIND, either express or implied.  See the License for the
+* specific language governing permissions and limitations
+* under the License.
+*
+*************************************************************/
+
+/*interface file for swig */
+
+%module core_tensor
+%include "std_vector.i"
+%include "std_string.i"
+%include "std_shared_ptr.i"
+
+/*
+%include "carrays.i"
+%array_class(float, floatArray);
+%array_class(int, intArray);
+%array_class(char, charArray);
+%array_class(double, doubleArray);
+*/
+
+%{
+#define SWIG_FILE_WITH_INIT
+#include "singa/core/tensor.h"
+#include "singa/core/device.h"
+#include "singa/proto/core.pb.h"
+#include "singa/proto/model.pb.h"
+using singa::DataType;
+%}
+%shared_ptr(singa::Device)
+
+%include "numpy.i"
+%init %{
+  import_array();
+%}
+%apply (float *IN_ARRAY1, int DIM1) {
+       (const float *src, const size_t num)
+}
+%apply (int *IN_ARRAY1, int DIM1) {
+       (const int *src, const size_t num)
+}
+%apply (float *ARGOUT_ARRAY1, int DIM1) {
+       (float *value, const size_t num)
+}
+%apply (int *ARGOUT_ARRAY1, int DIM1) {
+       (int *value, const size_t num)
+}
+
+%template(Shape) std::vector<size_t>;
+
+namespace singa{
+
+  enum DataType {
+    kFloat32, kFloat16, kInt, kChar, kDouble
+  };
+
+  inline size_t Product(const std::vector<size_t> &shape,
+                        int start = 0, size_t len = 0);
+  inline size_t SizeOf(DataType t);
+
+
+  class Tensor {
+
+   public:
+    Tensor();
+    explicit Tensor(const std::vector<size_t> &shape,
+                    DataType dtype = kFloat32);
+    Tensor(const std::vector<size_t> &shape,
+           std::shared_ptr<singa::Device> dev, DataType dtype = kFloat32);
+    Tensor(const Tensor &from);
+
+    std::shared_ptr<singa::Device> device() const;
+/*
+    template <typename DType> const DType* data() const;
+    %template(floatData) data<float>;
+    %template(intData) data<int>;
+    %template(charData) data<char>;
+    %template(doubleData) data<double>;
+    */
+
+    template <typename SType> void GetValue(SType* value, const size_t num);
+    %template(floatGetValue) GetValue<float>;
+    %template(intGetValue) GetValue<int>;
+
+    const DataType data_type() const;
+    const std::vector<size_t> &shape() const;
+    const size_t shape(size_t idx) const;
+    size_t nDim() const;
+    bool transpose() const;
+    size_t Size() const;
+    size_t MemSize() const;
+    void Reshape(const std::vector<size_t> &shape);
+    void ResetLike(const Tensor &t);
+    void AsType(DataType type);
+    void ToDevice(std::shared_ptr<singa::Device> dev);
+    void ToHost();
+    float L2() const;
+    float L1() const;
+
+    template <typename SType> void SetValue(const SType x);
+    %template(floatSetValue) SetValue<float>;
+    /* TODO(chonho-01) other types */
+    // --- other types
+
+    template <typename DType> void CopyDataFromHostPtr(const DType *src,
+                                                       const size_t num,
+                                                       const size_t offset = 0);
+    %template(floatCopyDataFromHostPtr) CopyDataFromHostPtr<float>;
+    %template(intCopyDataFromHostPtr) CopyDataFromHostPtr<int>;
+    // --- other types
+
+    void CopyData(const Tensor &other);
+    Tensor Clone() const;
+    Tensor T() const;
+
+    /* python has no assignment operator
+    Tensor &operator=(const Tensor &t); */
+    Tensor &operator+=(const Tensor &t);
+    Tensor &operator-=(const Tensor &t);
+    Tensor &operator*=(const Tensor &t);
+    Tensor &operator/=(const Tensor &t);
+
+
+    template <typename DType> Tensor &operator+=(const DType x);
+    %template(iAdd_f) operator+=<float>;
+    // --- other types
+
+    template <typename DType> Tensor &operator-=(DType x);
+    %template(iSub_f) operator-=<float>;
+    // --- other types
+
+    template <typename DType> Tensor &operator*=(DType x);
+    %template(iMul_f) operator*=<float>;
+    // --- other types
+
+    template <typename DType> Tensor &operator/=(DType x);
+    %template(iDiv_f) operator/=<float>;
+    // --- other types
+
+
+    /*TODO(chonho-04)
+    amax
+    amin
+    asum
+    */
+
+
+  };
+
+  void CopyDataToFrom(Tensor *dst, const Tensor &src, size_t num,
+                      size_t src_offset = 0, size_t dst_offset = 0);
+
+  Tensor Reshape(const Tensor &in, const std::vector<size_t> &s);
+
+  Tensor Abs(const Tensor &t);
+  Tensor Exp(const Tensor &t);
+  Tensor Log(const Tensor &t);
+  Tensor ReLU(const Tensor &t);
+  Tensor Sigmoid(const Tensor &t);
+  Tensor Sign(const Tensor &t);
+  Tensor Sqrt(const Tensor &t);
+  Tensor Square(const Tensor &t);
+  Tensor Tanh(const Tensor &t);
+
+  Tensor Sum(const Tensor &t, int axis);
+  template <typename SType> SType Sum(const Tensor &t);
+  %template(floatSum) Sum<float>;
+  // --- other types
+
+  /* TODO(chonho-02)
+     need to implement averaging over all the elements? */
+  Tensor Average(const Tensor &t, int axis);
+  Tensor SoftMax(const Tensor &t);
+
+
+  Tensor Pow(const Tensor &base, const Tensor &exp);
+  void Pow(const Tensor &base, const Tensor &exp, Tensor *out);
+
+  %rename(Pow_f) Pow(const Tensor &in, const float x);
+  template <typename SType>
+  Tensor Pow(const Tensor &in, const SType x);
+  %template(pow_temp) Pow<float>;
+
+  %rename(Pow_f_out) Pow(const Tensor &in, const float x, Tensor *out);
+  template <typename SType>
+  void Pow(const Tensor &in, const SType x, Tensor *out);
+  %template(pow_temp) Pow<float>;
+
+
+  /* rename comparison operators */
+  %rename(LT_Tf) operator<(const Tensor &t, const float x);
+  %rename(LE_Tf) operator<=(const Tensor &t, const float x);
+  %rename(GT_Tf) operator>(const Tensor &t, const float x);
+  %rename(GE_Tf) operator>=(const Tensor &t, const float x);
+  %rename(LT_TT) operator<(const Tensor &lhs, const Tensor &rhs);
+  %rename(LE_TT) operator<=(const Tensor &lhs, const Tensor &rhs);
+  %rename(GT_TT) operator>(const Tensor &lhs, const Tensor &rhs);
+  %rename(GE_TT) operator>=(const Tensor &lhs, const Tensor &rhs);
+
+  Tensor operator<(const Tensor &lhs, const Tensor &rhs);
+  Tensor operator<=(const Tensor &lhs, const Tensor &rhs);
+  Tensor operator>(const Tensor &lhs, const Tensor &rhs);
+  Tensor operator>=(const Tensor &lhs, const Tensor &rhs);
+
+
+  template <typename DType>
+  Tensor operator<(const Tensor &t, const DType x);
+  %template(op) operator< <float>;
+  // --- other types
+
+  template <typename DType>
+  Tensor operator<=(const Tensor &t, const DType x);
+  %template(op) operator<= <float>;
+  // --- other types
+
+  template <typename DType>
+  Tensor operator>(const Tensor &t, const DType x);
+  %template(op) operator> <float>;
+  // --- other types
+
+  template <typename DType>
+  Tensor operator>=(const Tensor &t, const DType x);
+  %template(op) operator>= <float>;
+  // --- other types
+
+  /* NOTE(chonho)
+  no need to include these
+  in python, these can be replaced with comparison operators
+
+  template <typename DType>
+  void LT(const Tensor &t, DType x, Tensor *ret);
+  template <typename DType>
+  void LE(const Tensor &t, DType x, Tensor *ret);
+  template <typename DType>
+  void GT(const Tensor &t, DType x, Tensor *ret);
+  template <typename DType>
+  void GE(const Tensor &t, DType x, Tensor *ret);
+  */
+
+
+  /* ========== Arithmetic operations ========== */
+  %rename(Add_TT) operator+(const Tensor &lhs, const Tensor &rhs);
+  %rename(Sub_TT) operator-(const Tensor &lhs, const Tensor &rhs);
+  %rename(EltwiseMul_TT) operator*(const Tensor &lhs, const Tensor &rhs);
+  %rename(Div_TT) operator/(const Tensor &lhs, const Tensor &rhs);
+  Tensor operator+(const Tensor &lhs, const Tensor &rhs);
+  Tensor operator-(const Tensor &lhs, const Tensor &rhs);
+  Tensor operator*(const Tensor &lhs, const Tensor &rhs);
+  Tensor operator/(const Tensor &lhs, const Tensor &rhs);
+
+  %rename(Add_Tf) operator+(const Tensor &t, float x);
+  template <typename DType>
+  Tensor operator+(const Tensor &t, DType x);
+  %template(op) operator+<float>;
+  // --- other types
+
+  %rename(Sub_Tf) operator-(const Tensor &t, float x);
+  template <typename DType>
+  Tensor operator-(const Tensor &t, DType x);
+  %template(op) operator-<float>;
+  // --- other types
+
+  %rename(EltwiseMul_Tf) operator*(const Tensor &t, float x);
+  template <typename DType>
+  Tensor operator*(const Tensor &t, DType x);
+  %template(op) operator*<float>;
+  // --- other types
+
+  %rename(Div_Tf) operator/(const Tensor &t, float x);
+  template <typename DType>
+  Tensor operator/(const Tensor &t, DType x);
+  %template(op) operator/<float>;
+  // --- other types
+
+  void Add(const Tensor &lhs, const Tensor &rhs, Tensor *ret);
+  void Sub(const Tensor &lhs, const Tensor &rhs, Tensor *ret);
+  void EltwiseMult(const Tensor &lhs, const Tensor &rhs, Tensor *ret);
+  void Div(const Tensor &lhs, const Tensor &rhs, Tensor *ret);
+
+  template <typename DType>
+  void Add(const Tensor &t, DType x, Tensor *ret);
+  %template(Add_Tf_out) Add<float>;
+  // --- other types
+
+  template <typename DType>
+  void Sub(const Tensor &t, DType x, Tensor *ret);
+  %template(Sub_Tf_out) Sub<float>;
+  // --- other types
+
+  template <typename DType>
+  void EltwiseMult(const Tensor &t, DType x, Tensor *ret);
+  %template(EltwiseMult_Tf_out) EltwiseMult<float>;
+  // --- other types
+
+  template <typename DType>
+  void Div(const Tensor &t, DType x, Tensor *ret);
+  %template(Div_Tf_out) Div<float>;
+  // --- other types
+
+
+  /* ========== Random operations ========== */
+  template <typename SType>
+  void Bernoulli(const SType p, Tensor *out);
+  %template(floatBernoulli) Bernoulli<float>;
+  // --- other types
+
+  template <typename SType>
+  void Gaussian(const SType mean, const SType std, Tensor *out);
+  %template(floatGaussian) Gaussian<float>;
+  // --- other types
+
+  template <typename SType>
+  void Uniform(const SType low, const SType high, Tensor *out);
+  %template(floatUniform) Uniform<float>;
+  // --- other types
+
+  /* ========== Blas operations ========== */
+  template <typename SType>
+  void Axpy(SType alpha, const Tensor &in, Tensor *out);
+  %template(floatAxpy) Axpy<float>;
+  // --- other types
+
+  Tensor Mult(const Tensor &A, const Tensor &B);
+  void Mult(const Tensor &A, const Tensor &B, Tensor *C);
+  template <typename SType>
+  void Mult(const SType alpha, const Tensor &A, const Tensor &B,
+            const SType beta, Tensor *C);
+  %template(floatMult) Mult<float>;
+
+  void AddColumn(const Tensor &v, Tensor *M);
+  template <typename SType>
+  void AddColumn(const SType alpha, const SType beta, const Tensor &v,
+                 Tensor *M);
+  %template(floatAddColumn) AddColumn<float>;
+
+  void AddRow(const Tensor &v, Tensor *M);
+  template <typename SType>
+  void AddRow(const SType alpha, const SType beta, const Tensor &v,
+              Tensor *M);
+  %template(floatAddRow) AddRow<float>;
+
+  void DivColumn(const Tensor &v, Tensor *M);
+  void DivRow(const Tensor &v, Tensor *M);
+  void MultColumn(const Tensor &v, Tensor *M);
+  void MultRow(const Tensor &v, Tensor *M);
+  void SubColumn(const Tensor &v, Tensor *M);
+  void SubRow(const Tensor &v, Tensor *M);
+
+  void SumColumns(const Tensor &M, Tensor *v);
+  void SumRows(const Tensor &M, Tensor *v);
+
+  Tensor SoftMax(const Tensor &in);
+  void SoftMax(const Tensor &in, Tensor *out);
+
+}
+
diff --git a/src/python/swig/model_layer.i b/src/python/swig/model_layer.i
new file mode 100644
index 0000000..ae651d5
--- /dev/null
+++ b/src/python/swig/model_layer.i
@@ -0,0 +1,102 @@
+/************************************************************
+*
+* Licensed to the Apache Software Foundation (ASF) under one
+* or more contributor license agreements.  See the NOTICE file
+* distributed with this work for additional information
+* regarding copyright ownership.  The ASF licenses this file
+* to you under the Apache License, Version 2.0 (the
+* "License"); you may not use this file except in compliance
+* with the License.  You may obtain a copy of the License at
+*
+*   http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing,
+* software distributed under the License is distributed on an
+* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+* KIND, either express or implied.  See the License for the
+* specific language governing permissions and limitations
+* under the License.
+*
+*************************************************************/
+
+/*interface file for swig */
+
+%module model_layer
+%include "std_vector.i"
+%include "std_string.i"
+%include "std_pair.i"
+%include "std_shared_ptr.i"
+
+
+%{
+#include "singa/model/layer.h"
+#include "../src/model/layer/rnn.h"
+#include "../src/model/layer/cudnn_rnn.h"
+#include "singa/core/tensor.h"
+#include "singa/proto/model.pb.h"
+#include "singa/singa_config.h"
+using singa::Tensor;
+using singa::ParamSpec;
+using singa::DataType;
+using singa::Device;
+using singa::LayerConf;
+%}
+
+%shared_ptr(singa::Layer)
+%shared_ptr(singa::RNN)
+#if USE_CUDNN
+%shared_ptr(singa::CudnnRNN)
+#endif
+
+namespace std {
+  %template(strVector) vector<string>;
+  %template(paramVector) vector<singa::ParamSpec>;
+  %template(tensorVector) vector<singa::Tensor>;
+  %template(ttvecPair) pair<singa::Tensor, vector<singa::Tensor>>;
+  %template(tvecPair) pair<vector<singa::Tensor>, vector<singa::Tensor>>;
+}
+
+
+namespace singa {
+
+class Layer {
+  public:
+    Layer();
+//      virtual void Setup(const std::vector<vector<size_t>>&, const string&);
+    void Setup(const std::vector<size_t>& in_sample_shape,
+                        const std::string& proto_str);
+    virtual const std::vector<Tensor> param_values();
+    virtual const std::vector<size_t> GetOutputSampleShape() const;
+    virtual void ToDevice(std::shared_ptr<Device> device);
+    virtual void AsType(DataType dtype);
+    virtual const Tensor Forward(int flag, const Tensor& input);
+    virtual const std::vector<Tensor> Forward(
+        int flag, const std::vector<Tensor>& inputs);
+    virtual const std::pair<Tensor, std::vector<Tensor>> Backward(
+        int flag, const Tensor& grad);
+    virtual const std::pair<std::vector<Tensor>, std::vector<Tensor>>
+    Backward(int flag, const vector<Tensor>& grads);
+};
+
+std::shared_ptr<Layer> CreateLayer(const std::string& type);
+const std::vector<std::string> GetRegisteredLayers();
+class RNN : public Layer {
+};
+
+#if USE_CUDA && USE_CUDNN
+#if CUDNN_VERSION_SWIG >= 5005
+class CudnnRNN : public RNN {
+ public:
+ // note: Must use std::vector instead of vector.
+  const std::vector<Tensor> Forward(int flag, const std::vector<Tensor>& inputs) override;
+  const std::pair<std::vector<Tensor>, std::vector<Tensor>> Backward(
+      int flag, const std::vector<Tensor>& grads) override;
+  void ToDevice(std::shared_ptr<Device> device) override;
+    const std::vector<Tensor> param_values() override;
+    const std::vector<size_t> GetOutputSampleShape() const override;
+};
+
+#endif  // CUDNN_VERSION_SWIG >= 5005
+#endif  // USE_CUDA && USE_CUDNN
+}
+
diff --git a/src/python/swig/model_loss.i b/src/python/swig/model_loss.i
new file mode 100644
index 0000000..864ad88
--- /dev/null
+++ b/src/python/swig/model_loss.i
@@ -0,0 +1,62 @@
+/************************************************************
+*
+* Licensed to the Apache Software Foundation (ASF) under one
+* or more contributor license agreements.  See the NOTICE file
+* distributed with this work for additional information
+* regarding copyright ownership.  The ASF licenses this file
+* to you under the Apache License, Version 2.0 (the
+* "License"); you may not use this file except in compliance
+* with the License.  You may obtain a copy of the License at
+*
+*   http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing,
+* software distributed under the License is distributed on an
+* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+* KIND, either express or implied.  See the License for the
+* specific language governing permissions and limitations
+* under the License.
+*
+*************************************************************/
+
+/*interface file for swig */
+
+%module model_loss
+%include "std_string.i"
+%{
+#include "singa/model/loss.h"
+  using singa::Tensor;
+%}
+
+namespace singa {
+class Loss {
+public:
+  Loss() = default;
+  virtual ~Loss() {}
+
+  virtual Tensor Forward(int flag, const Tensor &prediction,
+                         const Tensor &target) = 0;
+
+  float Evaluate(int flag, const Tensor &prediction, const Tensor &target);
+
+  /// Compute the gradients of the loss values w.r.t. the prediction.
+  virtual Tensor Backward() = 0;
+};
+
+class MSE : public Loss {
+public:
+  Tensor Forward(int flag, const Tensor &prediction, const Tensor &target)
+      override;
+
+  Tensor Backward() override;
+};
+
+class SoftmaxCrossEntropy : public Loss {
+public:
+  Tensor Forward(int flag, const Tensor &prediction, const Tensor &target)
+      override;
+
+  Tensor Backward() override;
+};
+
+}
diff --git a/src/proto/singa.proto b/src/python/swig/model_metric.i
similarity index 66%
rename from src/proto/singa.proto
rename to src/python/swig/model_metric.i
index 2fbf2db..9d93cd0 100644
--- a/src/proto/singa.proto
+++ b/src/python/swig/model_metric.i
@@ -19,11 +19,25 @@
 *
 *************************************************************/
 
-package singa;
+/*interface file for swig */
 
-message SingaProto {
-  // ip/hostname:port[,ip/hostname:port]
-  optional string zookeeper_host = 1 [default = "localhost:2181"];
-  // log dir for singa binary and job information(job id, host list, pid list)
-  optional string log_dir = 2 [default = "/tmp/singa-log/"];
+%module model_metric
+%{
+#include "singa/model/metric.h"
+using singa::Tensor;
+%}
+
+namespace singa {
+class Metric {
+ public:
+  Metric() = default;
+  virtual ~Metric() {}
+  virtual Tensor Forward(const Tensor& prediction, const Tensor& target) = 0;
+  float Evaluate(const Tensor& prediction, const Tensor& target);
+};
+class Accuracy : public Metric {
+ public:
+  Tensor Forward(const Tensor& prediction, const Tensor& target);
+};
+
 }
diff --git a/src/python/swig/model_optimizer.i b/src/python/swig/model_optimizer.i
new file mode 100644
index 0000000..78b30b8
--- /dev/null
+++ b/src/python/swig/model_optimizer.i
@@ -0,0 +1,70 @@
+/************************************************************
+*
+* Licensed to the Apache Software Foundation (ASF) under one
+* or more contributor license agreements.  See the NOTICE file
+* distributed with this work for additional information
+* regarding copyright ownership.  The ASF licenses this file
+* to you under the Apache License, Version 2.0 (the
+* "License"); you may not use this file except in compliance
+* with the License.  You may obtain a copy of the License at
+*
+*   http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing,
+* software distributed under the License is distributed on an
+* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+* KIND, either express or implied.  See the License for the
+* specific language governing permissions and limitations
+* under the License.
+*
+*************************************************************/
+
+/*interface file for swig */
+
+%module model_optimizer
+%include "std_vector.i"
+%include "std_string.i"
+%include "std_pair.i"
+%include "std_shared_ptr.i"
+
+%{
+#include "singa/model/optimizer.h"
+#include "singa/proto/model.pb.h"
+using singa::Tensor;
+using singa::ParamSpec;
+using singa::OptimizerConf;
+%}
+
+
+%shared_ptr(singa::Optimizer)
+%shared_ptr(singa::Regularizer)
+%shared_ptr(singa::Constraint)
+
+namespace singa {
+class Optimizer {
+ public:
+  // Optimizer() = default;
+  virtual ~Optimizer() = default;
+  void Setup(const std::string& str);
+  virtual void Apply(int step, float lr, const std::string& name,
+    const Tensor& grad, Tensor& value) = 0;
+};
+inline std::shared_ptr<Optimizer> CreateOptimizer(const std::string& type);
+
+class Constraint {
+ public:
+  Constraint() = default;
+  void Setup(const std::string& conf_str);
+  void Apply(int step, Tensor& grad, Tensor& value);
+};
+
+inline std::shared_ptr<Constraint> CreateConstraint(const std::string& type);
+
+class Regularizer {
+ public:
+  Regularizer() = default;
+  void Setup(const std::string& conf_str);
+  void Apply(int step, Tensor& grad, Tensor& value);
+};
+inline std::shared_ptr<Regularizer> CreateRegularizer(const std::string& type);
+}
diff --git a/src/python/swig/numpy.i b/src/python/swig/numpy.i
new file mode 100644
index 0000000..e58090e
--- /dev/null
+++ b/src/python/swig/numpy.i
@@ -0,0 +1,3119 @@
+/* -*- C -*-  (not really, but good for syntax highlighting) */
+
+/*
+ * Copyright (c) 2005-2015, NumPy Developers.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ *        notice, this list of conditions and the following disclaimer.
+ *
+ *     * Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials provided
+ *        with the distribution.
+ *
+ *     * Neither the name of the NumPy Developers nor the names of any
+ *        contributors may be used to endorse or promote products derived
+ *        from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifdef SWIGPYTHON
+
+%{
+#ifndef SWIG_FILE_WITH_INIT
+#define NO_IMPORT_ARRAY
+#endif
+#include "stdio.h"
+#define NPY_NO_DEPRECATED_API NPY_1_7_API_VERSION
+#include <numpy/arrayobject.h>
+%}
+
+/**********************************************************************/
+
+%fragment("NumPy_Backward_Compatibility", "header")
+{
+%#if NPY_API_VERSION < 0x00000007
+%#define NPY_ARRAY_DEFAULT NPY_DEFAULT
+%#define NPY_ARRAY_FARRAY  NPY_FARRAY
+%#define NPY_FORTRANORDER  NPY_FORTRAN
+%#endif
+}
+
+/**********************************************************************/
+
+/* The following code originally appeared in
+ * enthought/kiva/agg/src/numeric.i written by Eric Jones.  It was
+ * translated from C++ to C by John Hunter.  Bill Spotz has modified
+ * it to fix some minor bugs, upgrade from Numeric to numpy (all
+ * versions), add some comments and functionality, and convert from
+ * direct code insertion to SWIG fragments.
+ */
+
+%fragment("NumPy_Macros", "header")
+{
+/* Macros to extract array attributes.
+ */
+%#if NPY_API_VERSION < 0x00000007
+%#define is_array(a)            ((a) && PyArray_Check((PyArrayObject*)a))
+%#define array_type(a)          (int)(PyArray_TYPE((PyArrayObject*)a))
+%#define array_numdims(a)       (((PyArrayObject*)a)->nd)
+%#define array_dimensions(a)    (((PyArrayObject*)a)->dimensions)
+%#define array_size(a,i)        (((PyArrayObject*)a)->dimensions[i])
+%#define array_strides(a)       (((PyArrayObject*)a)->strides)
+%#define array_stride(a,i)      (((PyArrayObject*)a)->strides[i])
+%#define array_data(a)          (((PyArrayObject*)a)->data)
+%#define array_descr(a)         (((PyArrayObject*)a)->descr)
+%#define array_flags(a)         (((PyArrayObject*)a)->flags)
+%#define array_enableflags(a,f) (((PyArrayObject*)a)->flags) = f
+%#else
+%#define is_array(a)            ((a) && PyArray_Check(a))
+%#define array_type(a)          PyArray_TYPE((PyArrayObject*)a)
+%#define array_numdims(a)       PyArray_NDIM((PyArrayObject*)a)
+%#define array_dimensions(a)    PyArray_DIMS((PyArrayObject*)a)
+%#define array_strides(a)       PyArray_STRIDES((PyArrayObject*)a)
+%#define array_stride(a,i)      PyArray_STRIDE((PyArrayObject*)a,i)
+%#define array_size(a,i)        PyArray_DIM((PyArrayObject*)a,i)
+%#define array_data(a)          PyArray_DATA((PyArrayObject*)a)
+%#define array_descr(a)         PyArray_DESCR((PyArrayObject*)a)
+%#define array_flags(a)         PyArray_FLAGS((PyArrayObject*)a)
+%#define array_enableflags(a,f) PyArray_ENABLEFLAGS((PyArrayObject*)a,f)
+%#endif
+%#define array_is_contiguous(a) (PyArray_ISCONTIGUOUS((PyArrayObject*)a))
+%#define array_is_native(a)     (PyArray_ISNOTSWAPPED((PyArrayObject*)a))
+%#define array_is_fortran(a)    (PyArray_ISFORTRAN((PyArrayObject*)a))
+}
+
+/**********************************************************************/
+
+%fragment("NumPy_Utilities",
+          "header")
+{
+  /* Given a PyObject, return a string describing its type.
+   */
+  const char* pytype_string(PyObject* py_obj)
+  {
+    if (py_obj == NULL          ) return "C NULL value";
+    if (py_obj == Py_None       ) return "Python None" ;
+    if (PyCallable_Check(py_obj)) return "callable"    ;
+    if (PyString_Check(  py_obj)) return "string"      ;
+    if (PyInt_Check(     py_obj)) return "int"         ;
+    if (PyFloat_Check(   py_obj)) return "float"       ;
+    if (PyDict_Check(    py_obj)) return "dict"        ;
+    if (PyList_Check(    py_obj)) return "list"        ;
+    if (PyTuple_Check(   py_obj)) return "tuple"       ;
+%#if PY_MAJOR_VERSION < 3
+    if (PyFile_Check(    py_obj)) return "file"        ;
+    if (PyModule_Check(  py_obj)) return "module"      ;
+    if (PyInstance_Check(py_obj)) return "instance"    ;
+%#endif
+
+    return "unkown type";
+  }
+
+  /* Given a NumPy typecode, return a string describing the type.
+   */
+  const char* typecode_string(int typecode)
+  {
+    static const char* type_names[25] = {"bool",
+                                         "byte",
+                                         "unsigned byte",
+                                         "short",
+                                         "unsigned short",
+                                         "int",
+                                         "unsigned int",
+                                         "long",
+                                         "unsigned long",
+                                         "long long",
+                                         "unsigned long long",
+                                         "float",
+                                         "double",
+                                         "long double",
+                                         "complex float",
+                                         "complex double",
+                                         "complex long double",
+                                         "object",
+                                         "string",
+                                         "unicode",
+                                         "void",
+                                         "ntypes",
+                                         "notype",
+                                         "char",
+                                         "unknown"};
+    return typecode < 24 ? type_names[typecode] : type_names[24];
+  }
+
+  /* Make sure input has correct numpy type.  This now just calls
+     PyArray_EquivTypenums().
+   */
+  int type_match(int actual_type,
+                 int desired_type)
+  {
+    return PyArray_EquivTypenums(actual_type, desired_type);
+  }
+
+%#ifdef SWIGPY_USE_CAPSULE
+  void free_cap(PyObject * cap)
+  {
+    void* array = (void*) PyCapsule_GetPointer(cap,SWIGPY_CAPSULE_NAME);
+    if (array != NULL) free(array);
+  }
+%#endif
+
+
+}
+
+/**********************************************************************/
+
+%fragment("NumPy_Object_to_Array",
+          "header",
+          fragment="NumPy_Backward_Compatibility",
+          fragment="NumPy_Macros",
+          fragment="NumPy_Utilities")
+{
+  /* Given a PyObject pointer, cast it to a PyArrayObject pointer if
+   * legal.  If not, set the python error string appropriately and
+   * return NULL.
+   */
+  PyArrayObject* obj_to_array_no_conversion(PyObject* input,
+                                            int        typecode)
+  {
+    PyArrayObject* ary = NULL;
+    if (is_array(input) && (typecode == NPY_NOTYPE ||
+                            PyArray_EquivTypenums(array_type(input), typecode)))
+    {
+      ary = (PyArrayObject*) input;
+    }
+    else if is_array(input)
+    {
+      const char* desired_type = typecode_string(typecode);
+      const char* actual_type  = typecode_string(array_type(input));
+      PyErr_Format(PyExc_TypeError,
+                   "Array of type '%s' required.  Array of type '%s' given",
+                   desired_type, actual_type);
+      ary = NULL;
+    }
+    else
+    {
+      const char* desired_type = typecode_string(typecode);
+      const char* actual_type  = pytype_string(input);
+      PyErr_Format(PyExc_TypeError,
+                   "Array of type '%s' required.  A '%s' was given",
+                   desired_type,
+                   actual_type);
+      ary = NULL;
+    }
+    return ary;
+  }
+
+  /* Convert the given PyObject to a NumPy array with the given
+   * typecode.  On success, return a valid PyArrayObject* with the
+   * correct type.  On failure, the python error string will be set and
+   * the routine returns NULL.
+   */
+  PyArrayObject* obj_to_array_allow_conversion(PyObject* input,
+                                               int       typecode,
+                                               int*      is_new_object)
+  {
+    PyArrayObject* ary = NULL;
+    PyObject*      py_obj;
+    if (is_array(input) && (typecode == NPY_NOTYPE ||
+                            PyArray_EquivTypenums(array_type(input),typecode)))
+    {
+      ary = (PyArrayObject*) input;
+      *is_new_object = 0;
+    }
+    else
+    {
+      py_obj = PyArray_FROMANY(input, typecode, 0, 0, NPY_ARRAY_DEFAULT);
+      /* If NULL, PyArray_FromObject will have set python error value.*/
+      ary = (PyArrayObject*) py_obj;
+      *is_new_object = 1;
+    }
+    return ary;
+  }
+
+  /* Given a PyArrayObject, check to see if it is contiguous.  If so,
+   * return the input pointer and flag it as not a new object.  If it is
+   * not contiguous, create a new PyArrayObject using the original data,
+   * flag it as a new object and return the pointer.
+   */
+  PyArrayObject* make_contiguous(PyArrayObject* ary,
+                                 int*           is_new_object,
+                                 int            min_dims,
+                                 int            max_dims)
+  {
+    PyArrayObject* result;
+    if (array_is_contiguous(ary))
+    {
+      result = ary;
+      *is_new_object = 0;
+    }
+    else
+    {
+      result = (PyArrayObject*) PyArray_ContiguousFromObject((PyObject*)ary,
+                                                              array_type(ary),
+                                                              min_dims,
+                                                              max_dims);
+      *is_new_object = 1;
+    }
+    return result;
+  }
+
+  /* Given a PyArrayObject, check to see if it is Fortran-contiguous.
+   * If so, return the input pointer, but do not flag it as not a new
+   * object.  If it is not Fortran-contiguous, create a new
+   * PyArrayObject using the original data, flag it as a new object
+   * and return the pointer.
+   */
+  PyArrayObject* make_fortran(PyArrayObject* ary,
+                              int*           is_new_object)
+  {
+    PyArrayObject* result;
+    if (array_is_fortran(ary))
+    {
+      result = ary;
+      *is_new_object = 0;
+    }
+    else
+    {
+      Py_INCREF(array_descr(ary));
+      result = (PyArrayObject*) PyArray_FromArray(ary,
+                                                  array_descr(ary),
+                                                  NPY_FORTRANORDER);
+      *is_new_object = 1;
+    }
+    return result;
+  }
+
+  /* Convert a given PyObject to a contiguous PyArrayObject of the
+   * specified type.  If the input object is not a contiguous
+   * PyArrayObject, a new one will be created and the new object flag
+   * will be set.
+   */
+  PyArrayObject* obj_to_array_contiguous_allow_conversion(PyObject* input,
+                                                          int       typecode,
+                                                          int*      is_new_object)
+  {
+    int is_new1 = 0;
+    int is_new2 = 0;
+    PyArrayObject* ary2;
+    PyArrayObject* ary1 = obj_to_array_allow_conversion(input,
+                                                        typecode,
+                                                        &is_new1);
+    if (ary1)
+    {
+      ary2 = make_contiguous(ary1, &is_new2, 0, 0);
+      if ( is_new1 && is_new2)
+      {
+        Py_DECREF(ary1);
+      }
+      ary1 = ary2;
+    }
+    *is_new_object = is_new1 || is_new2;
+    return ary1;
+  }
+
+  /* Convert a given PyObject to a Fortran-ordered PyArrayObject of the
+   * specified type.  If the input object is not a Fortran-ordered
+   * PyArrayObject, a new one will be created and the new object flag
+   * will be set.
+   */
+  PyArrayObject* obj_to_array_fortran_allow_conversion(PyObject* input,
+                                                       int       typecode,
+                                                       int*      is_new_object)
+  {
+    int is_new1 = 0;
+    int is_new2 = 0;
+    PyArrayObject* ary2;
+    PyArrayObject* ary1 = obj_to_array_allow_conversion(input,
+                                                        typecode,
+                                                        &is_new1);
+    if (ary1)
+    {
+      ary2 = make_fortran(ary1, &is_new2);
+      if (is_new1 && is_new2)
+      {
+        Py_DECREF(ary1);
+      }
+      ary1 = ary2;
+    }
+    *is_new_object = is_new1 || is_new2;
+    return ary1;
+  }
+} /* end fragment */
+
+/**********************************************************************/
+
+%fragment("NumPy_Array_Requirements",
+          "header",
+          fragment="NumPy_Backward_Compatibility",
+          fragment="NumPy_Macros")
+{
+  /* Test whether a python object is contiguous.  If array is
+   * contiguous, return 1.  Otherwise, set the python error string and
+   * return 0.
+   */
+  int require_contiguous(PyArrayObject* ary)
+  {
+    int contiguous = 1;
+    if (!array_is_contiguous(ary))
+    {
+      PyErr_SetString(PyExc_TypeError,
+                      "Array must be contiguous.  A non-contiguous array was given");
+      contiguous = 0;
+    }
+    return contiguous;
+  }
+
+  /* Require that a numpy array is not byte-swapped.  If the array is
+   * not byte-swapped, return 1.  Otherwise, set the python error string
+   * and return 0.
+   */
+  int require_native(PyArrayObject* ary)
+  {
+    int native = 1;
+    if (!array_is_native(ary))
+    {
+      PyErr_SetString(PyExc_TypeError,
+                      "Array must have native byteorder.  "
+                      "A byte-swapped array was given");
+      native = 0;
+    }
+    return native;
+  }
+
+  /* Require the given PyArrayObject to have a specified number of
+   * dimensions.  If the array has the specified number of dimensions,
+   * return 1.  Otherwise, set the python error string and return 0.
+   */
+  int require_dimensions(PyArrayObject* ary,
+                         int            exact_dimensions)
+  {
+    int success = 1;
+    if (array_numdims(ary) != exact_dimensions)
+    {
+      PyErr_Format(PyExc_TypeError,
+                   "Array must have %d dimensions.  Given array has %d dimensions",
+                   exact_dimensions,
+                   array_numdims(ary));
+      success = 0;
+    }
+    return success;
+  }
+
+  /* Require the given PyArrayObject to have one of a list of specified
+   * number of dimensions.  If the array has one of the specified number
+   * of dimensions, return 1.  Otherwise, set the python error string
+   * and return 0.
+   */
+  int require_dimensions_n(PyArrayObject* ary,
+                           int*           exact_dimensions,
+                           int            n)
+  {
+    int success = 0;
+    int i;
+    char dims_str[255] = "";
+    char s[255];
+    for (i = 0; i < n && !success; i++)
+    {
+      if (array_numdims(ary) == exact_dimensions[i])
+      {
+        success = 1;
+      }
+    }
+    if (!success)
+    {
+      for (i = 0; i < n-1; i++)
+      {
+        sprintf(s, "%d, ", exact_dimensions[i]);
+        strcat(dims_str,s);
+      }
+      sprintf(s, " or %d", exact_dimensions[n-1]);
+      strcat(dims_str,s);
+      PyErr_Format(PyExc_TypeError,
+                   "Array must have %s dimensions.  Given array has %d dimensions",
+                   dims_str,
+                   array_numdims(ary));
+    }
+    return success;
+  }
+
+  /* Require the given PyArrayObject to have a specified shape.  If the
+   * array has the specified shape, return 1.  Otherwise, set the python
+   * error string and return 0.
+   */
+  int require_size(PyArrayObject* ary,
+                   npy_intp*      size,
+                   int            n)
+  {
+    int i;
+    int success = 1;
+    int len;
+    char desired_dims[255] = "[";
+    char s[255];
+    char actual_dims[255] = "[";
+    for(i=0; i < n;i++)
+    {
+      if (size[i] != -1 &&  size[i] != array_size(ary,i))
+      {
+        success = 0;
+      }
+    }
+    if (!success)
+    {
+      for (i = 0; i < n; i++)
+      {
+        if (size[i] == -1)
+        {
+          sprintf(s, "*,");
+        }
+        else
+        {
+          sprintf(s, "%ld,", (long int)size[i]);
+        }
+        strcat(desired_dims,s);
+      }
+      len = strlen(desired_dims);
+      desired_dims[len-1] = ']';
+      for (i = 0; i < n; i++)
+      {
+        sprintf(s, "%ld,", (long int)array_size(ary,i));
+        strcat(actual_dims,s);
+      }
+      len = strlen(actual_dims);
+      actual_dims[len-1] = ']';
+      PyErr_Format(PyExc_TypeError,
+                   "Array must have shape of %s.  Given array has shape of %s",
+                   desired_dims,
+                   actual_dims);
+    }
+    return success;
+  }
+
+  /* Require the given PyArrayObject to to be Fortran ordered.  If the
+   * the PyArrayObject is already Fortran ordered, do nothing.  Else,
+   * set the Fortran ordering flag and recompute the strides.
+   */
+  int require_fortran(PyArrayObject* ary)
+  {
+    int success = 1;
+    int nd = array_numdims(ary);
+    int i;
+    npy_intp * strides = array_strides(ary);
+    if (array_is_fortran(ary)) return success;
+    /* Set the Fortran ordered flag */
+    array_enableflags(ary,NPY_ARRAY_FARRAY);
+    /* Recompute the strides */
+    strides[0] = strides[nd-1];
+    for (i=1; i < nd; ++i)
+      strides[i] = strides[i-1] * array_size(ary,i-1);
+    return success;
+  }
+}
+
+/* Combine all NumPy fragments into one for convenience */
+%fragment("NumPy_Fragments",
+          "header",
+          fragment="NumPy_Backward_Compatibility",
+          fragment="NumPy_Macros",
+          fragment="NumPy_Utilities",
+          fragment="NumPy_Object_to_Array",
+          fragment="NumPy_Array_Requirements")
+{
+}
+
+/* End John Hunter translation (with modifications by Bill Spotz)
+ */
+
+/* %numpy_typemaps() macro
+ *
+ * This macro defines a family of 74 typemaps that allow C arguments
+ * of the form
+ *
+ *    1. (DATA_TYPE IN_ARRAY1[ANY])
+ *    2. (DATA_TYPE* IN_ARRAY1, DIM_TYPE DIM1)
+ *    3. (DIM_TYPE DIM1, DATA_TYPE* IN_ARRAY1)
+ *
+ *    4. (DATA_TYPE IN_ARRAY2[ANY][ANY])
+ *    5. (DATA_TYPE* IN_ARRAY2, DIM_TYPE DIM1, DIM_TYPE DIM2)
+ *    6. (DIM_TYPE DIM1, DIM_TYPE DIM2, DATA_TYPE* IN_ARRAY2)
+ *    7. (DATA_TYPE* IN_FARRAY2, DIM_TYPE DIM1, DIM_TYPE DIM2)
+ *    8. (DIM_TYPE DIM1, DIM_TYPE DIM2, DATA_TYPE* IN_FARRAY2)
+ *
+ *    9. (DATA_TYPE IN_ARRAY3[ANY][ANY][ANY])
+ *   10. (DATA_TYPE* IN_ARRAY3, DIM_TYPE DIM1, DIM_TYPE DIM2, DIM_TYPE DIM3)
+ *   11. (DATA_TYPE** IN_ARRAY3, DIM_TYPE DIM1, DIM_TYPE DIM2, DIM_TYPE DIM3)
+ *   12. (DIM_TYPE DIM1, DIM_TYPE DIM2, DIM_TYPE DIM3, DATA_TYPE* IN_ARRAY3)
+ *   13. (DATA_TYPE* IN_FARRAY3, DIM_TYPE DIM1, DIM_TYPE DIM2, DIM_TYPE DIM3)
+ *   14. (DIM_TYPE DIM1, DIM_TYPE DIM2, DIM_TYPE DIM3, DATA_TYPE* IN_FARRAY3)
+ *
+ *   15. (DATA_TYPE IN_ARRAY4[ANY][ANY][ANY][ANY])
+ *   16. (DATA_TYPE* IN_ARRAY4, DIM_TYPE DIM1, DIM_TYPE DIM2, DIM_TYPE DIM3, DIM_TYPE DIM4)
+ *   17. (DATA_TYPE** IN_ARRAY4, DIM_TYPE DIM1, DIM_TYPE DIM2, DIM_TYPE DIM3, DIM_TYPE DIM4)
+ *   18. (DIM_TYPE DIM1, DIM_TYPE DIM2, DIM_TYPE DIM3, , DIM_TYPE DIM4, DATA_TYPE* IN_ARRAY4)
+ *   19. (DATA_TYPE* IN_FARRAY4, DIM_TYPE DIM1, DIM_TYPE DIM2, DIM_TYPE DIM3, DIM_TYPE DIM4)
+ *   20. (DIM_TYPE DIM1, DIM_TYPE DIM2, DIM_TYPE DIM3, DIM_TYPE DIM4, DATA_TYPE* IN_FARRAY4)
+ *
+ *   21. (DATA_TYPE INPLACE_ARRAY1[ANY])
+ *   22. (DATA_TYPE* INPLACE_ARRAY1, DIM_TYPE DIM1)
+ *   23. (DIM_TYPE DIM1, DATA_TYPE* INPLACE_ARRAY1)
+ *
+ *   24. (DATA_TYPE INPLACE_ARRAY2[ANY][ANY])
+ *   25. (DATA_TYPE* INPLACE_ARRAY2, DIM_TYPE DIM1, DIM_TYPE DIM2)
+ *   26. (DIM_TYPE DIM1, DIM_TYPE DIM2, DATA_TYPE* INPLACE_ARRAY2)
+ *   27. (DATA_TYPE* INPLACE_FARRAY2, DIM_TYPE DIM1, DIM_TYPE DIM2)
+ *   28. (DIM_TYPE DIM1, DIM_TYPE DIM2, DATA_TYPE* INPLACE_FARRAY2)
+ *
+ *   29. (DATA_TYPE INPLACE_ARRAY3[ANY][ANY][ANY])
+ *   30. (DATA_TYPE* INPLACE_ARRAY3, DIM_TYPE DIM1, DIM_TYPE DIM2, DIM_TYPE DIM3)
+ *   31. (DATA_TYPE** INPLACE_ARRAY3, DIM_TYPE DIM1, DIM_TYPE DIM2, DIM_TYPE DIM3)
+ *   32. (DIM_TYPE DIM1, DIM_TYPE DIM2, DIM_TYPE DIM3, DATA_TYPE* INPLACE_ARRAY3)
+ *   33. (DATA_TYPE* INPLACE_FARRAY3, DIM_TYPE DIM1, DIM_TYPE DIM2, DIM_TYPE DIM3)
+ *   34. (DIM_TYPE DIM1, DIM_TYPE DIM2, DIM_TYPE DIM3, DATA_TYPE* INPLACE_FARRAY3)
+ *
+ *   35. (DATA_TYPE INPLACE_ARRAY4[ANY][ANY][ANY][ANY])
+ *   36. (DATA_TYPE* INPLACE_ARRAY4, DIM_TYPE DIM1, DIM_TYPE DIM2, DIM_TYPE DIM3, DIM_TYPE DIM4)
+ *   37. (DATA_TYPE** INPLACE_ARRAY4, DIM_TYPE DIM1, DIM_TYPE DIM2, DIM_TYPE DIM3, DIM_TYPE DIM4)
+ *   38. (DIM_TYPE DIM1, DIM_TYPE DIM2, DIM_TYPE DIM3, DIM_TYPE DIM4, DATA_TYPE* INPLACE_ARRAY4)
+ *   39. (DATA_TYPE* INPLACE_FARRAY4, DIM_TYPE DIM1, DIM_TYPE DIM2, DIM_TYPE DIM3, DIM_TYPE DIM4)
+ *   40. (DIM_TYPE DIM1, DIM_TYPE DIM2, DIM_TYPE DIM3, DIM_TYPE DIM4, DATA_TYPE* INPLACE_FARRAY4)
+ *
+ *   41. (DATA_TYPE ARGOUT_ARRAY1[ANY])
+ *   42. (DATA_TYPE* ARGOUT_ARRAY1, DIM_TYPE DIM1)
+ *   43. (DIM_TYPE DIM1, DATA_TYPE* ARGOUT_ARRAY1)
+ *
+ *   44. (DATA_TYPE ARGOUT_ARRAY2[ANY][ANY])
+ *
+ *   45. (DATA_TYPE ARGOUT_ARRAY3[ANY][ANY][ANY])
+ *
+ *   46. (DATA_TYPE ARGOUT_ARRAY4[ANY][ANY][ANY][ANY])
+ *
+ *   47. (DATA_TYPE** ARGOUTVIEW_ARRAY1, DIM_TYPE* DIM1)
+ *   48. (DIM_TYPE* DIM1, DATA_TYPE** ARGOUTVIEW_ARRAY1)
+ *
+ *   49. (DATA_TYPE** ARGOUTVIEW_ARRAY2, DIM_TYPE* DIM1, DIM_TYPE* DIM2)
+ *   50. (DIM_TYPE* DIM1, DIM_TYPE* DIM2, DATA_TYPE** ARGOUTVIEW_ARRAY2)
+ *   51. (DATA_TYPE** ARGOUTVIEW_FARRAY2, DIM_TYPE* DIM1, DIM_TYPE* DIM2)
+ *   52. (DIM_TYPE* DIM1, DIM_TYPE* DIM2, DATA_TYPE** ARGOUTVIEW_FARRAY2)
+ *
+ *   53. (DATA_TYPE** ARGOUTVIEW_ARRAY3, DIM_TYPE* DIM1, DIM_TYPE* DIM2, DIM_TYPE* DIM3)
+ *   54. (DIM_TYPE* DIM1, DIM_TYPE* DIM2, DIM_TYPE* DIM3, DATA_TYPE** ARGOUTVIEW_ARRAY3)
+ *   55. (DATA_TYPE** ARGOUTVIEW_FARRAY3, DIM_TYPE* DIM1, DIM_TYPE* DIM2, DIM_TYPE* DIM3)
+ *   56. (DIM_TYPE* DIM1, DIM_TYPE* DIM2, DIM_TYPE* DIM3, DATA_TYPE** ARGOUTVIEW_FARRAY3)
+ *
+ *   57. (DATA_TYPE** ARGOUTVIEW_ARRAY4, DIM_TYPE* DIM1, DIM_TYPE* DIM2, DIM_TYPE* DIM3, DIM_TYPE* DIM4)
+ *   58. (DIM_TYPE* DIM1, DIM_TYPE* DIM2, DIM_TYPE* DIM3, DIM_TYPE* DIM4, DATA_TYPE** ARGOUTVIEW_ARRAY4)
+ *   59. (DATA_TYPE** ARGOUTVIEW_FARRAY4, DIM_TYPE* DIM1, DIM_TYPE* DIM2, DIM_TYPE* DIM3, DIM_TYPE* DIM4)
+ *   60. (DIM_TYPE* DIM1, DIM_TYPE* DIM2, DIM_TYPE* DIM3, DIM_TYPE* DIM4, DATA_TYPE** ARGOUTVIEW_FARRAY4)
+ *
+ *   61. (DATA_TYPE** ARGOUTVIEWM_ARRAY1, DIM_TYPE* DIM1)
+ *   62. (DIM_TYPE* DIM1, DATA_TYPE** ARGOUTVIEWM_ARRAY1)
+ *
+ *   63. (DATA_TYPE** ARGOUTVIEWM_ARRAY2, DIM_TYPE* DIM1, DIM_TYPE* DIM2)
+ *   64. (DIM_TYPE* DIM1, DIM_TYPE* DIM2, DATA_TYPE** ARGOUTVIEWM_ARRAY2)
+ *   65. (DATA_TYPE** ARGOUTVIEWM_FARRAY2, DIM_TYPE* DIM1, DIM_TYPE* DIM2)
+ *   66. (DIM_TYPE* DIM1, DIM_TYPE* DIM2, DATA_TYPE** ARGOUTVIEWM_FARRAY2)
+ *
+ *   67. (DATA_TYPE** ARGOUTVIEWM_ARRAY3, DIM_TYPE* DIM1, DIM_TYPE* DIM2, DIM_TYPE* DIM3)
+ *   68. (DIM_TYPE* DIM1, DIM_TYPE* DIM2, DIM_TYPE* DIM3, DATA_TYPE** ARGOUTVIEWM_ARRAY3)
+ *   69. (DATA_TYPE** ARGOUTVIEWM_FARRAY3, DIM_TYPE* DIM1, DIM_TYPE* DIM2, DIM_TYPE* DIM3)
+ *   70. (DIM_TYPE* DIM1, DIM_TYPE* DIM2, DIM_TYPE* DIM3, DATA_TYPE** ARGOUTVIEWM_FARRAY3)
+ *
+ *   71. (DATA_TYPE** ARGOUTVIEWM_ARRAY4, DIM_TYPE* DIM1, DIM_TYPE* DIM2, DIM_TYPE* DIM3, DIM_TYPE* DIM4)
+ *   72. (DIM_TYPE* DIM1, DIM_TYPE* DIM2, DIM_TYPE* DIM3, DIM_TYPE* DIM4, DATA_TYPE** ARGOUTVIEWM_ARRAY4)
+ *   73. (DATA_TYPE** ARGOUTVIEWM_FARRAY4, DIM_TYPE* DIM1, DIM_TYPE* DIM2, DIM_TYPE* DIM3, DIM_TYPE* DIM4)
+ *   74. (DIM_TYPE* DIM1, DIM_TYPE* DIM2, DIM_TYPE* DIM3, DIM_TYPE* DIM4, DATA_TYPE** ARGOUTVIEWM_FARRAY4)
+ *
+ * where "DATA_TYPE" is any type supported by the NumPy module, and
+ * "DIM_TYPE" is any int-like type suitable for specifying dimensions.
+ * The difference between "ARRAY" typemaps and "FARRAY" typemaps is
+ * that the "FARRAY" typemaps expect Fortran ordering of
+ * multidimensional arrays.  In python, the dimensions will not need
+ * to be specified (except for the "DATA_TYPE* ARGOUT_ARRAY1"
+ * typemaps).  The IN_ARRAYs can be a numpy array or any sequence that
+ * can be converted to a numpy array of the specified type.  The
+ * INPLACE_ARRAYs must be numpy arrays of the appropriate type.  The
+ * ARGOUT_ARRAYs will be returned as new numpy arrays of the
+ * appropriate type.
+ *
+ * These typemaps can be applied to existing functions using the
+ * %apply directive.  For example:
+ *
+ *     %apply (double* IN_ARRAY1, int DIM1) {(double* series, int length)};
+ *     double prod(double* series, int length);
+ *
+ *     %apply (int DIM1, int DIM2, double* INPLACE_ARRAY2)
+ *           {(int rows, int cols, double* matrix        )};
+ *     void floor(int rows, int cols, double* matrix, double f);
+ *
+ *     %apply (double IN_ARRAY3[ANY][ANY][ANY])
+ *           {(double tensor[2][2][2]         )};
+ *     %apply (double ARGOUT_ARRAY3[ANY][ANY][ANY])
+ *           {(double low[2][2][2]                )};
+ *     %apply (double ARGOUT_ARRAY3[ANY][ANY][ANY])
+ *           {(double upp[2][2][2]                )};
+ *     void luSplit(double tensor[2][2][2],
+ *                  double low[2][2][2],
+ *                  double upp[2][2][2]    );
+ *
+ * or directly with
+ *
+ *     double prod(double* IN_ARRAY1, int DIM1);
+ *
+ *     void floor(int DIM1, int DIM2, double* INPLACE_ARRAY2, double f);
+ *
+ *     void luSplit(double IN_ARRAY3[ANY][ANY][ANY],
+ *                  double ARGOUT_ARRAY3[ANY][ANY][ANY],
+ *                  double ARGOUT_ARRAY3[ANY][ANY][ANY]);
+ */
+
+%define %numpy_typemaps(DATA_TYPE, DATA_TYPECODE, DIM_TYPE)
+
+/************************/
+/* Input Array Typemaps */
+/************************/
+
+/* Typemap suite for (DATA_TYPE IN_ARRAY1[ANY])
+ */
+%typecheck(SWIG_TYPECHECK_DOUBLE_ARRAY,
+           fragment="NumPy_Macros")
+  (DATA_TYPE IN_ARRAY1[ANY])
+{
+  $1 = is_array($input) || PySequence_Check($input);
+}
+%typemap(in,
+         fragment="NumPy_Fragments")
+  (DATA_TYPE IN_ARRAY1[ANY])
+  (PyArrayObject* array=NULL, int is_new_object=0)
+{
+  npy_intp size[1] = { $1_dim0 };
+  array = obj_to_array_contiguous_allow_conversion($input,
+                                                   DATA_TYPECODE,
+                                                   &is_new_object);
+  if (!array || !require_dimensions(array, 1) ||
+      !require_size(array, size, 1)) SWIG_fail;
+  $1 = ($1_ltype) array_data(array);
+}
+%typemap(freearg)
+  (DATA_TYPE IN_ARRAY1[ANY])
+{
+  if (is_new_object$argnum && array$argnum)
+    { Py_DECREF(array$argnum); }
+}
+
+/* Typemap suite for (DATA_TYPE* IN_ARRAY1, DIM_TYPE DIM1)
+ */
+%typecheck(SWIG_TYPECHECK_DOUBLE_ARRAY,
+           fragment="NumPy_Macros")
+  (DATA_TYPE* IN_ARRAY1, DIM_TYPE DIM1)
+{
+  $1 = is_array($input) || PySequence_Check($input);
+}
+%typemap(in,
+         fragment="NumPy_Fragments")
+  (DATA_TYPE* IN_ARRAY1, DIM_TYPE DIM1)
+  (PyArrayObject* array=NULL, int is_new_object=0)
+{
+  npy_intp size[1] = { -1 };
+  array = obj_to_array_contiguous_allow_conversion($input,
+                                                   DATA_TYPECODE,
+                                                   &is_new_object);
+  if (!array || !require_dimensions(array, 1) ||
+      !require_size(array, size, 1)) SWIG_fail;
+  $1 = (DATA_TYPE*) array_data(array);
+  $2 = (DIM_TYPE) array_size(array,0);
+}
+%typemap(freearg)
+  (DATA_TYPE* IN_ARRAY1, DIM_TYPE DIM1)
+{
+  if (is_new_object$argnum && array$argnum)
+    { Py_DECREF(array$argnum); }
+}
+
+/* Typemap suite for (DIM_TYPE DIM1, DATA_TYPE* IN_ARRAY1)
+ */
+%typecheck(SWIG_TYPECHECK_DOUBLE_ARRAY,
+           fragment="NumPy_Macros")
+  (DIM_TYPE DIM1, DATA_TYPE* IN_ARRAY1)
+{
+  $1 = is_array($input) || PySequence_Check($input);
+}
+%typemap(in,
+         fragment="NumPy_Fragments")
+  (DIM_TYPE DIM1, DATA_TYPE* IN_ARRAY1)
+  (PyArrayObject* array=NULL, int is_new_object=0)
+{
+  npy_intp size[1] = {-1};
+  array = obj_to_array_contiguous_allow_conversion($input,
+                                                   DATA_TYPECODE,
+                                                   &is_new_object);
+  if (!array || !require_dimensions(array, 1) ||
+      !require_size(array, size, 1)) SWIG_fail;
+  $1 = (DIM_TYPE) array_size(array,0);
+  $2 = (DATA_TYPE*) array_data(array);
+}
+%typemap(freearg)
+  (DIM_TYPE DIM1, DATA_TYPE* IN_ARRAY1)
+{
+  if (is_new_object$argnum && array$argnum)
+    { Py_DECREF(array$argnum); }
+}
+
+/* Typemap suite for (DATA_TYPE IN_ARRAY2[ANY][ANY])
+ */
+%typecheck(SWIG_TYPECHECK_DOUBLE_ARRAY,
+           fragment="NumPy_Macros")
+  (DATA_TYPE IN_ARRAY2[ANY][ANY])
+{
+  $1 = is_array($input) || PySequence_Check($input);
+}
+%typemap(in,
+         fragment="NumPy_Fragments")
+  (DATA_TYPE IN_ARRAY2[ANY][ANY])
+  (PyArrayObject* array=NULL, int is_new_object=0)
+{
+  npy_intp size[2] = { $1_dim0, $1_dim1 };
+  array = obj_to_array_contiguous_allow_conversion($input,
+                                                   DATA_TYPECODE,
+                                                   &is_new_object);
+  if (!array || !require_dimensions(array, 2) ||
+      !require_size(array, size, 2)) SWIG_fail;
+  $1 = ($1_ltype) array_data(array);
+}
+%typemap(freearg)
+  (DATA_TYPE IN_ARRAY2[ANY][ANY])
+{
+  if (is_new_object$argnum && array$argnum)
+    { Py_DECREF(array$argnum); }
+}
+
+/* Typemap suite for (DATA_TYPE* IN_ARRAY2, DIM_TYPE DIM1, DIM_TYPE DIM2)
+ */
+%typecheck(SWIG_TYPECHECK_DOUBLE_ARRAY,
+           fragment="NumPy_Macros")
+  (DATA_TYPE* IN_ARRAY2, DIM_TYPE DIM1, DIM_TYPE DIM2)
+{
+  $1 = is_array($input) || PySequence_Check($input);
+}
+%typemap(in,
+         fragment="NumPy_Fragments")
+  (DATA_TYPE* IN_ARRAY2, DIM_TYPE DIM1, DIM_TYPE DIM2)
+  (PyArrayObject* array=NULL, int is_new_object=0)
+{
+  npy_intp size[2] = { -1, -1 };
+  array = obj_to_array_contiguous_allow_conversion($input, DATA_TYPECODE,
+                                                   &is_new_object);
+  if (!array || !require_dimensions(array, 2) ||
+      !require_size(array, size, 2)) SWIG_fail;
+  $1 = (DATA_TYPE*) array_data(array);
+  $2 = (DIM_TYPE) array_size(array,0);
+  $3 = (DIM_TYPE) array_size(array,1);
+}
+%typemap(freearg)
+  (DATA_TYPE* IN_ARRAY2, DIM_TYPE DIM1, DIM_TYPE DIM2)
+{
+  if (is_new_object$argnum && array$argnum)
+    { Py_DECREF(array$argnum); }
+}
+
+/* Typemap suite for (DIM_TYPE DIM1, DIM_TYPE DIM2, DATA_TYPE* IN_ARRAY2)
+ */
+%typecheck(SWIG_TYPECHECK_DOUBLE_ARRAY,
+           fragment="NumPy_Macros")
+  (DIM_TYPE DIM1, DIM_TYPE DIM2, DATA_TYPE* IN_ARRAY2)
+{
+  $1 = is_array($input) || PySequence_Check($input);
+}
+%typemap(in,
+         fragment="NumPy_Fragments")
+  (DIM_TYPE DIM1, DIM_TYPE DIM2, DATA_TYPE* IN_ARRAY2)
+  (PyArrayObject* array=NULL, int is_new_object=0)
+{
+  npy_intp size[2] = { -1, -1 };
+  array = obj_to_array_contiguous_allow_conversion($input,
+                                                   DATA_TYPECODE,
+                                                   &is_new_object);
+  if (!array || !require_dimensions(array, 2) ||
+      !require_size(array, size, 2)) SWIG_fail;
+  $1 = (DIM_TYPE) array_size(array,0);
+  $2 = (DIM_TYPE) array_size(array,1);
+  $3 = (DATA_TYPE*) array_data(array);
+}
+%typemap(freearg)
+  (DIM_TYPE DIM1, DIM_TYPE DIM2, DATA_TYPE* IN_ARRAY2)
+{
+  if (is_new_object$argnum && array$argnum)
+    { Py_DECREF(array$argnum); }
+}
+
+/* Typemap suite for (DATA_TYPE* IN_FARRAY2, DIM_TYPE DIM1, DIM_TYPE DIM2)
+ */
+%typecheck(SWIG_TYPECHECK_DOUBLE_ARRAY,
+           fragment="NumPy_Macros")
+  (DATA_TYPE* IN_FARRAY2, DIM_TYPE DIM1, DIM_TYPE DIM2)
+{
+  $1 = is_array($input) || PySequence_Check($input);
+}
+%typemap(in,
+         fragment="NumPy_Fragments")
+  (DATA_TYPE* IN_FARRAY2, DIM_TYPE DIM1, DIM_TYPE DIM2)
+  (PyArrayObject* array=NULL, int is_new_object=0)
+{
+  npy_intp size[2] = { -1, -1 };
+  array = obj_to_array_fortran_allow_conversion($input,
+                                                DATA_TYPECODE,
+                                                &is_new_object);
+  if (!array || !require_dimensions(array, 2) ||
+      !require_size(array, size, 2) || !require_fortran(array)) SWIG_fail;
+  $1 = (DATA_TYPE*) array_data(array);
+  $2 = (DIM_TYPE) array_size(array,0);
+  $3 = (DIM_TYPE) array_size(array,1);
+}
+%typemap(freearg)
+  (DATA_TYPE* IN_FARRAY2, DIM_TYPE DIM1, DIM_TYPE DIM2)
+{
+  if (is_new_object$argnum && array$argnum)
+    { Py_DECREF(array$argnum); }
+}
+
+/* Typemap suite for (DIM_TYPE DIM1, DIM_TYPE DIM2, DATA_TYPE* IN_FARRAY2)
+ */
+%typecheck(SWIG_TYPECHECK_DOUBLE_ARRAY,
+           fragment="NumPy_Macros")
+  (DIM_TYPE DIM1, DIM_TYPE DIM2, DATA_TYPE* IN_FARRAY2)
+{
+  $1 = is_array($input) || PySequence_Check($input);
+}
+%typemap(in,
+         fragment="NumPy_Fragments")
+  (DIM_TYPE DIM1, DIM_TYPE DIM2, DATA_TYPE* IN_FARRAY2)
+  (PyArrayObject* array=NULL, int is_new_object=0)
+{
+  npy_intp size[2] = { -1, -1 };
+  array = obj_to_array_fortran_allow_conversion($input,
+                                                   DATA_TYPECODE,
+                                                   &is_new_object);
+  if (!array || !require_dimensions(array, 2) ||
+      !require_size(array, size, 2) || !require_fortran(array)) SWIG_fail;
+  $1 = (DIM_TYPE) array_size(array,0);
+  $2 = (DIM_TYPE) array_size(array,1);
+  $3 = (DATA_TYPE*) array_data(array);
+}
+%typemap(freearg)
+  (DIM_TYPE DIM1, DIM_TYPE DIM2, DATA_TYPE* IN_FARRAY2)
+{
+  if (is_new_object$argnum && array$argnum)
+    { Py_DECREF(array$argnum); }
+}
+
+/* Typemap suite for (DATA_TYPE IN_ARRAY3[ANY][ANY][ANY])
+ */
+%typecheck(SWIG_TYPECHECK_DOUBLE_ARRAY,
+           fragment="NumPy_Macros")
+  (DATA_TYPE IN_ARRAY3[ANY][ANY][ANY])
+{
+  $1 = is_array($input) || PySequence_Check($input);
+}
+%typemap(in,
+         fragment="NumPy_Fragments")
+  (DATA_TYPE IN_ARRAY3[ANY][ANY][ANY])
+  (PyArrayObject* array=NULL, int is_new_object=0)
+{
+  npy_intp size[3] = { $1_dim0, $1_dim1, $1_dim2 };
+  array = obj_to_array_contiguous_allow_conversion($input,
+                                                   DATA_TYPECODE,
+                                                   &is_new_object);
+  if (!array || !require_dimensions(array, 3) ||
+      !require_size(array, size, 3)) SWIG_fail;
+  $1 = ($1_ltype) array_data(array);
+}
+%typemap(freearg)
+  (DATA_TYPE IN_ARRAY3[ANY][ANY][ANY])
+{
+  if (is_new_object$argnum && array$argnum)
+    { Py_DECREF(array$argnum); }
+}
+
+/* Typemap suite for (DATA_TYPE* IN_ARRAY3, DIM_TYPE DIM1, DIM_TYPE DIM2,
+ *                    DIM_TYPE DIM3)
+ */
+%typecheck(SWIG_TYPECHECK_DOUBLE_ARRAY,
+           fragment="NumPy_Macros")
+  (DATA_TYPE* IN_ARRAY3, DIM_TYPE DIM1, DIM_TYPE DIM2, DIM_TYPE DIM3)
+{
+  $1 = is_array($input) || PySequence_Check($input);
+}
+%typemap(in,
+         fragment="NumPy_Fragments")
+  (DATA_TYPE* IN_ARRAY3, DIM_TYPE DIM1, DIM_TYPE DIM2, DIM_TYPE DIM3)
+  (PyArrayObject* array=NULL, int is_new_object=0)
+{
+  npy_intp size[3] = { -1, -1, -1 };
+  array = obj_to_array_contiguous_allow_conversion($input, DATA_TYPECODE,
+                                                   &is_new_object);
+  if (!array || !require_dimensions(array, 3) ||
+      !require_size(array, size, 3)) SWIG_fail;
+  $1 = (DATA_TYPE*) array_data(array);
+  $2 = (DIM_TYPE) array_size(array,0);
+  $3 = (DIM_TYPE) array_size(array,1);
+  $4 = (DIM_TYPE) array_size(array,2);
+}
+%typemap(freearg)
+  (DATA_TYPE* IN_ARRAY3, DIM_TYPE DIM1, DIM_TYPE DIM2, DIM_TYPE DIM3)
+{
+  if (is_new_object$argnum && array$argnum)
+    { Py_DECREF(array$argnum); }
+}
+
+/* Typemap suite for (DATA_TYPE** IN_ARRAY3, DIM_TYPE DIM1, DIM_TYPE DIM2,
+ *                    DIM_TYPE DIM3)
+ */
+%typecheck(SWIG_TYPECHECK_DOUBLE_ARRAY,
+           fragment="NumPy_Macros")
+  (DATA_TYPE** IN_ARRAY3, DIM_TYPE DIM1, DIM_TYPE DIM2, DIM_TYPE DIM3)
+{
+  /* for now, only concerned with lists */
+  $1 = PySequence_Check($input);
+}
+/* Convert a Python list of objects, each convertible to a 2-D contiguous
+ * array of DATA_TYPECODE and all of identical shape, into a C array of
+ * row pointers plus (list length, dim1, dim2).  Converted copies are
+ * tracked per element and released in the freearg typemap below. */
+%typemap(in,
+         fragment="NumPy_Fragments")
+  (DATA_TYPE** IN_ARRAY3, DIM_TYPE DIM1, DIM_TYPE DIM2, DIM_TYPE DIM3)
+  (DATA_TYPE** array=NULL, PyArrayObject** object_array=NULL, int* is_new_object_array=NULL)
+{
+  npy_intp size[2] = { -1, -1 };
+  PyArrayObject* temp_array;
+  Py_ssize_t i;
+  int is_new_object;
+
+  /* length of the list */
+  /* NOTE(review): PyList_Size requires an actual list, but the typecheck
+   * above accepts any sequence (PySequence_Check) — a tuple would pass
+   * the typecheck and fail here; confirm intended. */
+  $2 = PyList_Size($input);
+
+  /* the arrays */
+  array = (DATA_TYPE **)malloc($2*sizeof(DATA_TYPE *));
+  object_array = (PyArrayObject **)calloc($2,sizeof(PyArrayObject *));
+  is_new_object_array = (int *)calloc($2,sizeof(int));
+
+  if (array == NULL || object_array == NULL || is_new_object_array == NULL)
+  {
+    SWIG_fail;
+  }
+
+  for (i=0; i<$2; i++)
+  {
+    /* NOTE(review): PySequence_GetItem returns a NEW reference that is
+     * never released here or in freearg (freearg only DECREFs converted
+     * copies) — possible per-call leak; confirm against upstream numpy.i. */
+    temp_array = obj_to_array_contiguous_allow_conversion(PySequence_GetItem($input,i), DATA_TYPECODE, &is_new_object);
+
+    /* the new array must be stored so that it can be destroyed in freearg */
+    object_array[i] = temp_array;
+    is_new_object_array[i] = is_new_object;
+
+    if (!temp_array || !require_dimensions(temp_array, 2)) SWIG_fail;
+
+    /* store the size of the first array in the list, then use that for comparison. */
+    if (i == 0)
+    {
+      size[0] = array_size(temp_array,0);
+      size[1] = array_size(temp_array,1);
+    }
+
+    if (!require_size(temp_array, size, 2)) SWIG_fail;
+
+    array[i] = (DATA_TYPE*) array_data(temp_array);
+  }
+
+  $1 = (DATA_TYPE**) array;
+  $3 = (DIM_TYPE) size[0];
+  $4 = (DIM_TYPE) size[1];
+}
+/* Release the pointer tables and any per-element converted copies made by
+ * the in typemap; $2 still holds the list length captured there. */
+%typemap(freearg)
+  (DATA_TYPE** IN_ARRAY3, DIM_TYPE DIM1, DIM_TYPE DIM2, DIM_TYPE DIM3)
+{
+  Py_ssize_t i;
+
+  if (array$argnum!=NULL) free(array$argnum);
+
+  /*freeing the individual arrays if needed */
+  if (object_array$argnum!=NULL)
+  {
+    if (is_new_object_array$argnum!=NULL)
+    {
+      for (i=0; i<$2; i++)
+      {
+        if (object_array$argnum[i] != NULL && is_new_object_array$argnum[i])
+        { Py_DECREF(object_array$argnum[i]); }
+      }
+      free(is_new_object_array$argnum);
+    }
+    free(object_array$argnum);
+  }
+}
+
+/* Typemap suite for (DIM_TYPE DIM1, DIM_TYPE DIM2, DIM_TYPE DIM3,
+ *                    DATA_TYPE* IN_ARRAY3)
+ */
+%typecheck(SWIG_TYPECHECK_DOUBLE_ARRAY,
+           fragment="NumPy_Macros")
+  (DIM_TYPE DIM1, DIM_TYPE DIM2, DIM_TYPE DIM3, DATA_TYPE* IN_ARRAY3)
+{
+  $1 = is_array($input) || PySequence_Check($input);
+}
+%typemap(in,
+         fragment="NumPy_Fragments")
+  (DIM_TYPE DIM1, DIM_TYPE DIM2, DIM_TYPE DIM3, DATA_TYPE* IN_ARRAY3)
+  (PyArrayObject* array=NULL, int is_new_object=0)
+{
+  npy_intp size[3] = { -1, -1, -1 };
+  array = obj_to_array_contiguous_allow_conversion($input, DATA_TYPECODE,
+                                                   &is_new_object);
+  if (!array || !require_dimensions(array, 3) ||
+      !require_size(array, size, 3)) SWIG_fail;
+  $1 = (DIM_TYPE) array_size(array,0);
+  $2 = (DIM_TYPE) array_size(array,1);
+  $3 = (DIM_TYPE) array_size(array,2);
+  $4 = (DATA_TYPE*) array_data(array);
+}
+%typemap(freearg)
+  (DIM_TYPE DIM1, DIM_TYPE DIM2, DIM_TYPE DIM3, DATA_TYPE* IN_ARRAY3)
+{
+  if (is_new_object$argnum && array$argnum)
+    { Py_DECREF(array$argnum); }
+}
+
+/* Typemap suite for (DATA_TYPE* IN_FARRAY3, DIM_TYPE DIM1, DIM_TYPE DIM2,
+ *                    DIM_TYPE DIM3)
+ */
+%typecheck(SWIG_TYPECHECK_DOUBLE_ARRAY,
+           fragment="NumPy_Macros")
+  (DATA_TYPE* IN_FARRAY3, DIM_TYPE DIM1, DIM_TYPE DIM2, DIM_TYPE DIM3)
+{
+  $1 = is_array($input) || PySequence_Check($input);
+}
+/* Convert $input to a 3-D Fortran-ordered array of DATA_TYPECODE and
+ * unpack it as (data pointer, dim1, dim2, dim3).  A converted copy is
+ * tracked via is_new_object and released in the freearg typemap. */
+%typemap(in,
+         fragment="NumPy_Fragments")
+  (DATA_TYPE* IN_FARRAY3, DIM_TYPE DIM1, DIM_TYPE DIM2, DIM_TYPE DIM3)
+  (PyArrayObject* array=NULL, int is_new_object=0)
+{
+  npy_intp size[3] = { -1, -1, -1 };
+  array = obj_to_array_fortran_allow_conversion($input, DATA_TYPECODE,
+                                                &is_new_object);
+  /* BUGFIX: logical || (was bitwise |) so require_fortran() is skipped
+   * once an earlier check has already failed, matching every sibling
+   * typemap in this file. */
+  if (!array || !require_dimensions(array, 3) ||
+      !require_size(array, size, 3) || !require_fortran(array)) SWIG_fail;
+  $1 = (DATA_TYPE*) array_data(array);
+  $2 = (DIM_TYPE) array_size(array,0);
+  $3 = (DIM_TYPE) array_size(array,1);
+  $4 = (DIM_TYPE) array_size(array,2);
+}
+%typemap(freearg)
+  (DATA_TYPE* IN_FARRAY3, DIM_TYPE DIM1, DIM_TYPE DIM2, DIM_TYPE DIM3)
+{
+  if (is_new_object$argnum && array$argnum)
+    { Py_DECREF(array$argnum); }
+}
+
+/* Typemap suite for (DIM_TYPE DIM1, DIM_TYPE DIM2, DIM_TYPE DIM3,
+ *                    DATA_TYPE* IN_FARRAY3)
+ */
+%typecheck(SWIG_TYPECHECK_DOUBLE_ARRAY,
+           fragment="NumPy_Macros")
+  (DIM_TYPE DIM1, DIM_TYPE DIM2, DIM_TYPE DIM3, DATA_TYPE* IN_FARRAY3)
+{
+  $1 = is_array($input) || PySequence_Check($input);
+}
+%typemap(in,
+         fragment="NumPy_Fragments")
+  (DIM_TYPE DIM1, DIM_TYPE DIM2, DIM_TYPE DIM3, DATA_TYPE* IN_FARRAY3)
+  (PyArrayObject* array=NULL, int is_new_object=0)
+{
+  npy_intp size[3] = { -1, -1, -1 };
+  array = obj_to_array_fortran_allow_conversion($input,
+                                                   DATA_TYPECODE,
+                                                   &is_new_object);
+  if (!array || !require_dimensions(array, 3) ||
+      !require_size(array, size, 3) || !require_fortran(array)) SWIG_fail;
+  $1 = (DIM_TYPE) array_size(array,0);
+  $2 = (DIM_TYPE) array_size(array,1);
+  $3 = (DIM_TYPE) array_size(array,2);
+  $4 = (DATA_TYPE*) array_data(array);
+}
+%typemap(freearg)
+  (DIM_TYPE DIM1, DIM_TYPE DIM2, DIM_TYPE DIM3, DATA_TYPE* IN_FARRAY3)
+{
+  if (is_new_object$argnum && array$argnum)
+    { Py_DECREF(array$argnum); }
+}
+
+/* Typemap suite for (DATA_TYPE IN_ARRAY4[ANY][ANY][ANY][ANY])
+ */
+%typecheck(SWIG_TYPECHECK_DOUBLE_ARRAY,
+           fragment="NumPy_Macros")
+  (DATA_TYPE IN_ARRAY4[ANY][ANY][ANY][ANY])
+{
+  $1 = is_array($input) || PySequence_Check($input);
+}
+%typemap(in,
+         fragment="NumPy_Fragments")
+  (DATA_TYPE IN_ARRAY4[ANY][ANY][ANY][ANY])
+  (PyArrayObject* array=NULL, int is_new_object=0)
+{
+  npy_intp size[4] = { $1_dim0, $1_dim1, $1_dim2 , $1_dim3};
+  array = obj_to_array_contiguous_allow_conversion($input, DATA_TYPECODE,
+                                                   &is_new_object);
+  if (!array || !require_dimensions(array, 4) ||
+      !require_size(array, size, 4)) SWIG_fail;
+  $1 = ($1_ltype) array_data(array);
+}
+%typemap(freearg)
+  (DATA_TYPE IN_ARRAY4[ANY][ANY][ANY][ANY])
+{
+  if (is_new_object$argnum && array$argnum)
+    { Py_DECREF(array$argnum); }
+}
+
+/* Typemap suite for (DATA_TYPE* IN_ARRAY4, DIM_TYPE DIM1, DIM_TYPE DIM2,
+ *                    DIM_TYPE DIM3, DIM_TYPE DIM4)
+ */
+%typecheck(SWIG_TYPECHECK_DOUBLE_ARRAY,
+           fragment="NumPy_Macros")
+  (DATA_TYPE* IN_ARRAY4, DIM_TYPE DIM1, DIM_TYPE DIM2, DIM_TYPE DIM3, DIM_TYPE DIM4)
+{
+  $1 = is_array($input) || PySequence_Check($input);
+}
+%typemap(in,
+         fragment="NumPy_Fragments")
+  (DATA_TYPE* IN_ARRAY4, DIM_TYPE DIM1, DIM_TYPE DIM2, DIM_TYPE DIM3, DIM_TYPE DIM4)
+  (PyArrayObject* array=NULL, int is_new_object=0)
+{
+  npy_intp size[4] = { -1, -1, -1, -1 };
+  array = obj_to_array_contiguous_allow_conversion($input, DATA_TYPECODE,
+                                                   &is_new_object);
+  if (!array || !require_dimensions(array, 4) ||
+      !require_size(array, size, 4)) SWIG_fail;
+  $1 = (DATA_TYPE*) array_data(array);
+  $2 = (DIM_TYPE) array_size(array,0);
+  $3 = (DIM_TYPE) array_size(array,1);
+  $4 = (DIM_TYPE) array_size(array,2);
+  $5 = (DIM_TYPE) array_size(array,3);
+}
+%typemap(freearg)
+  (DATA_TYPE* IN_ARRAY4, DIM_TYPE DIM1, DIM_TYPE DIM2, DIM_TYPE DIM3, DIM_TYPE DIM4)
+{
+  if (is_new_object$argnum && array$argnum)
+    { Py_DECREF(array$argnum); }
+}
+
+/* Typemap suite for (DATA_TYPE** IN_ARRAY4, DIM_TYPE DIM1, DIM_TYPE DIM2,
+ *                    DIM_TYPE DIM3, DIM_TYPE DIM4)
+ */
+%typecheck(SWIG_TYPECHECK_DOUBLE_ARRAY,
+           fragment="NumPy_Macros")
+  (DATA_TYPE** IN_ARRAY4, DIM_TYPE DIM1, DIM_TYPE DIM2, DIM_TYPE DIM3, DIM_TYPE DIM4)
+{
+  /* for now, only concerned with lists */
+  $1 = PySequence_Check($input);
+}
+/* Convert a Python list of objects, each convertible to a 3-D contiguous
+ * array of DATA_TYPECODE and all of identical shape, into a C array of
+ * pointers plus (list length, dim1, dim2, dim3).  Converted copies are
+ * tracked per element and released in the freearg typemap below. */
+%typemap(in,
+         fragment="NumPy_Fragments")
+  (DATA_TYPE** IN_ARRAY4, DIM_TYPE DIM1, DIM_TYPE DIM2, DIM_TYPE DIM3, DIM_TYPE DIM4)
+  (DATA_TYPE** array=NULL, PyArrayObject** object_array=NULL, int* is_new_object_array=NULL)
+{
+  npy_intp size[3] = { -1, -1, -1 };
+  PyArrayObject* temp_array;
+  Py_ssize_t i;
+  int is_new_object;
+
+  /* length of the list */
+  /* NOTE(review): PyList_Size requires an actual list, but the typecheck
+   * above accepts any sequence (PySequence_Check); confirm intended. */
+  $2 = PyList_Size($input);
+
+  /* the arrays */
+  array = (DATA_TYPE **)malloc($2*sizeof(DATA_TYPE *));
+  object_array = (PyArrayObject **)calloc($2,sizeof(PyArrayObject *));
+  is_new_object_array = (int *)calloc($2,sizeof(int));
+
+  if (array == NULL || object_array == NULL || is_new_object_array == NULL)
+  {
+    SWIG_fail;
+  }
+
+  for (i=0; i<$2; i++)
+  {
+    /* NOTE(review): PySequence_GetItem returns a NEW reference that is
+     * never released here or in freearg — possible per-call leak; confirm
+     * against upstream numpy.i. */
+    temp_array = obj_to_array_contiguous_allow_conversion(PySequence_GetItem($input,i), DATA_TYPECODE, &is_new_object);
+
+    /* the new array must be stored so that it can be destroyed in freearg */
+    object_array[i] = temp_array;
+    is_new_object_array[i] = is_new_object;
+
+    if (!temp_array || !require_dimensions(temp_array, 3)) SWIG_fail;
+
+    /* store the size of the first array in the list, then use that for comparison. */
+    if (i == 0)
+    {
+      size[0] = array_size(temp_array,0);
+      size[1] = array_size(temp_array,1);
+      size[2] = array_size(temp_array,2);
+    }
+
+    if (!require_size(temp_array, size, 3)) SWIG_fail;
+
+    array[i] = (DATA_TYPE*) array_data(temp_array);
+  }
+
+  $1 = (DATA_TYPE**) array;
+  $3 = (DIM_TYPE) size[0];
+  $4 = (DIM_TYPE) size[1];
+  $5 = (DIM_TYPE) size[2];
+}
+/* Release the pointer tables and any per-element converted copies made by
+ * the in typemap; $2 still holds the list length captured there. */
+%typemap(freearg)
+  (DATA_TYPE** IN_ARRAY4, DIM_TYPE DIM1, DIM_TYPE DIM2, DIM_TYPE DIM3, DIM_TYPE DIM4)
+{
+  Py_ssize_t i;
+
+  if (array$argnum!=NULL) free(array$argnum);
+
+  /*freeing the individual arrays if needed */
+  if (object_array$argnum!=NULL)
+  {
+    if (is_new_object_array$argnum!=NULL)
+    {
+      for (i=0; i<$2; i++)
+      {
+        if (object_array$argnum[i] != NULL && is_new_object_array$argnum[i])
+        { Py_DECREF(object_array$argnum[i]); }
+      }
+      free(is_new_object_array$argnum);
+    }
+    free(object_array$argnum);
+  }
+}
+
+/* Typemap suite for (DIM_TYPE DIM1, DIM_TYPE DIM2, DIM_TYPE DIM3, DIM_TYPE DIM4,
+ *                    DATA_TYPE* IN_ARRAY4)
+ */
+%typecheck(SWIG_TYPECHECK_DOUBLE_ARRAY,
+           fragment="NumPy_Macros")
+  (DIM_TYPE DIM1, DIM_TYPE DIM2, DIM_TYPE DIM3, DIM_TYPE DIM4, DATA_TYPE* IN_ARRAY4)
+{
+  $1 = is_array($input) || PySequence_Check($input);
+}
+%typemap(in,
+         fragment="NumPy_Fragments")
+  (DIM_TYPE DIM1, DIM_TYPE DIM2, DIM_TYPE DIM3, DIM_TYPE DIM4, DATA_TYPE* IN_ARRAY4)
+  (PyArrayObject* array=NULL, int is_new_object=0)
+{
+  npy_intp size[4] = { -1, -1, -1 , -1};
+  array = obj_to_array_contiguous_allow_conversion($input, DATA_TYPECODE,
+                                                   &is_new_object);
+  if (!array || !require_dimensions(array, 4) ||
+      !require_size(array, size, 4)) SWIG_fail;
+  $1 = (DIM_TYPE) array_size(array,0);
+  $2 = (DIM_TYPE) array_size(array,1);
+  $3 = (DIM_TYPE) array_size(array,2);
+  $4 = (DIM_TYPE) array_size(array,3);
+  $5 = (DATA_TYPE*) array_data(array);
+}
+%typemap(freearg)
+  (DIM_TYPE DIM1, DIM_TYPE DIM2, DIM_TYPE DIM3, DIM_TYPE DIM4, DATA_TYPE* IN_ARRAY4)
+{
+  if (is_new_object$argnum && array$argnum)
+    { Py_DECREF(array$argnum); }
+}
+
+/* Typemap suite for (DATA_TYPE* IN_FARRAY4, DIM_TYPE DIM1, DIM_TYPE DIM2,
+ *                    DIM_TYPE DIM3, DIM_TYPE DIM4)
+ */
+%typecheck(SWIG_TYPECHECK_DOUBLE_ARRAY,
+           fragment="NumPy_Macros")
+  (DATA_TYPE* IN_FARRAY4, DIM_TYPE DIM1, DIM_TYPE DIM2, DIM_TYPE DIM3, DIM_TYPE DIM4)
+{
+  $1 = is_array($input) || PySequence_Check($input);
+}
+/* Convert $input to a 4-D Fortran-ordered array of DATA_TYPECODE and
+ * unpack it as (data pointer, dim1..dim4).  A converted copy is tracked
+ * via is_new_object and released in the freearg typemap. */
+%typemap(in,
+         fragment="NumPy_Fragments")
+  (DATA_TYPE* IN_FARRAY4, DIM_TYPE DIM1, DIM_TYPE DIM2, DIM_TYPE DIM3, DIM_TYPE DIM4)
+  (PyArrayObject* array=NULL, int is_new_object=0)
+{
+  npy_intp size[4] = { -1, -1, -1, -1 };
+  array = obj_to_array_fortran_allow_conversion($input, DATA_TYPECODE,
+                                                &is_new_object);
+  /* BUGFIX: logical || (was bitwise |) so require_fortran() is skipped
+   * once an earlier check has already failed, matching every sibling
+   * typemap in this file. */
+  if (!array || !require_dimensions(array, 4) ||
+      !require_size(array, size, 4) || !require_fortran(array)) SWIG_fail;
+  $1 = (DATA_TYPE*) array_data(array);
+  $2 = (DIM_TYPE) array_size(array,0);
+  $3 = (DIM_TYPE) array_size(array,1);
+  $4 = (DIM_TYPE) array_size(array,2);
+  $5 = (DIM_TYPE) array_size(array,3);
+}
+%typemap(freearg)
+  (DATA_TYPE* IN_FARRAY4, DIM_TYPE DIM1, DIM_TYPE DIM2, DIM_TYPE DIM3, DIM_TYPE DIM4)
+{
+  if (is_new_object$argnum && array$argnum)
+    { Py_DECREF(array$argnum); }
+}
+
+/* Typemap suite for (DIM_TYPE DIM1, DIM_TYPE DIM2, DIM_TYPE DIM3, DIM_TYPE DIM4,
+ *                    DATA_TYPE* IN_FARRAY4)
+ */
+%typecheck(SWIG_TYPECHECK_DOUBLE_ARRAY,
+           fragment="NumPy_Macros")
+  (DIM_TYPE DIM1, DIM_TYPE DIM2, DIM_TYPE DIM3, DIM_TYPE DIM4, DATA_TYPE* IN_FARRAY4)
+{
+  $1 = is_array($input) || PySequence_Check($input);
+}
+%typemap(in,
+         fragment="NumPy_Fragments")
+  (DIM_TYPE DIM1, DIM_TYPE DIM2, DIM_TYPE DIM3, DIM_TYPE DIM4, DATA_TYPE* IN_FARRAY4)
+  (PyArrayObject* array=NULL, int is_new_object=0)
+{
+  npy_intp size[4] = { -1, -1, -1 , -1 };
+  array = obj_to_array_fortran_allow_conversion($input, DATA_TYPECODE,
+                                                   &is_new_object);
+  if (!array || !require_dimensions(array, 4) ||
+      !require_size(array, size, 4) || !require_fortran(array)) SWIG_fail;
+  $1 = (DIM_TYPE) array_size(array,0);
+  $2 = (DIM_TYPE) array_size(array,1);
+  $3 = (DIM_TYPE) array_size(array,2);
+  $4 = (DIM_TYPE) array_size(array,3);
+  $5 = (DATA_TYPE*) array_data(array);
+}
+%typemap(freearg)
+  (DIM_TYPE DIM1, DIM_TYPE DIM2, DIM_TYPE DIM3, DIM_TYPE DIM4, DATA_TYPE* IN_FARRAY4)
+{
+  if (is_new_object$argnum && array$argnum)
+    { Py_DECREF(array$argnum); }
+}
+
+/***************************/
+/* In-Place Array Typemaps */
+/***************************/
+
+/* Typemap suite for (DATA_TYPE INPLACE_ARRAY1[ANY])
+ */
+%typecheck(SWIG_TYPECHECK_DOUBLE_ARRAY,
+           fragment="NumPy_Macros")
+  (DATA_TYPE INPLACE_ARRAY1[ANY])
+{
+  $1 = is_array($input) && PyArray_EquivTypenums(array_type($input),
+                                                 DATA_TYPECODE);
+}
+%typemap(in,
+         fragment="NumPy_Fragments")
+  (DATA_TYPE INPLACE_ARRAY1[ANY])
+  (PyArrayObject* array=NULL)
+{
+  npy_intp size[1] = { $1_dim0 };
+  array = obj_to_array_no_conversion($input, DATA_TYPECODE);
+  if (!array || !require_dimensions(array,1) || !require_size(array, size, 1) ||
+      !require_contiguous(array) || !require_native(array)) SWIG_fail;
+  $1 = ($1_ltype) array_data(array);
+}
+
+/* Typemap suite for (DATA_TYPE* INPLACE_ARRAY1, DIM_TYPE DIM1)
+ */
+%typecheck(SWIG_TYPECHECK_DOUBLE_ARRAY,
+           fragment="NumPy_Macros")
+  (DATA_TYPE* INPLACE_ARRAY1, DIM_TYPE DIM1)
+{
+  $1 = is_array($input) && PyArray_EquivTypenums(array_type($input),
+                                                 DATA_TYPECODE);
+}
+/* Borrow $input in place (no conversion, no copy) as a 1-D contiguous
+ * native array; DIM1 is computed as the product of all dimensions. */
+%typemap(in,
+         fragment="NumPy_Fragments")
+  (DATA_TYPE* INPLACE_ARRAY1, DIM_TYPE DIM1)
+  (PyArrayObject* array=NULL, int i=0)
+{
+  /* i is initialized to 0 for consistency with the (DIM1, INPLACE_ARRAY1)
+   * companion typemap; the for-loop below reassigns it regardless. */
+  array = obj_to_array_no_conversion($input, DATA_TYPECODE);
+  if (!array || !require_dimensions(array,1) || !require_contiguous(array)
+      || !require_native(array)) SWIG_fail;
+  $1 = (DATA_TYPE*) array_data(array);
+  $2 = 1;
+  for (i=0; i < array_numdims(array); ++i) $2 *= array_size(array,i);
+}
+
+/* Typemap suite for (DIM_TYPE DIM1, DATA_TYPE* INPLACE_ARRAY1)
+ */
+%typecheck(SWIG_TYPECHECK_DOUBLE_ARRAY,
+           fragment="NumPy_Macros")
+  (DIM_TYPE DIM1, DATA_TYPE* INPLACE_ARRAY1)
+{
+  $1 = is_array($input) && PyArray_EquivTypenums(array_type($input),
+                                                 DATA_TYPECODE);
+}
+%typemap(in,
+         fragment="NumPy_Fragments")
+  (DIM_TYPE DIM1, DATA_TYPE* INPLACE_ARRAY1)
+  (PyArrayObject* array=NULL, int i=0)
+{
+  array = obj_to_array_no_conversion($input, DATA_TYPECODE);
+  if (!array || !require_dimensions(array,1) || !require_contiguous(array)
+      || !require_native(array)) SWIG_fail;
+  $1 = 1;
+  for (i=0; i < array_numdims(array); ++i) $1 *= array_size(array,i);
+  $2 = (DATA_TYPE*) array_data(array);
+}
+
+/* Typemap suite for (DATA_TYPE INPLACE_ARRAY2[ANY][ANY])
+ */
+%typecheck(SWIG_TYPECHECK_DOUBLE_ARRAY,
+           fragment="NumPy_Macros")
+  (DATA_TYPE INPLACE_ARRAY2[ANY][ANY])
+{
+  $1 = is_array($input) && PyArray_EquivTypenums(array_type($input),
+                                                 DATA_TYPECODE);
+}
+%typemap(in,
+         fragment="NumPy_Fragments")
+  (DATA_TYPE INPLACE_ARRAY2[ANY][ANY])
+  (PyArrayObject* array=NULL)
+{
+  npy_intp size[2] = { $1_dim0, $1_dim1 };
+  array = obj_to_array_no_conversion($input, DATA_TYPECODE);
+  if (!array || !require_dimensions(array,2) || !require_size(array, size, 2) ||
+      !require_contiguous(array) || !require_native(array)) SWIG_fail;
+  $1 = ($1_ltype) array_data(array);
+}
+
+/* Typemap suite for (DATA_TYPE* INPLACE_ARRAY2, DIM_TYPE DIM1, DIM_TYPE DIM2)
+ */
+%typecheck(SWIG_TYPECHECK_DOUBLE_ARRAY,
+           fragment="NumPy_Macros")
+  (DATA_TYPE* INPLACE_ARRAY2, DIM_TYPE DIM1, DIM_TYPE DIM2)
+{
+  $1 = is_array($input) && PyArray_EquivTypenums(array_type($input),
+                                                 DATA_TYPECODE);
+}
+%typemap(in,
+         fragment="NumPy_Fragments")
+  (DATA_TYPE* INPLACE_ARRAY2, DIM_TYPE DIM1, DIM_TYPE DIM2)
+  (PyArrayObject* array=NULL)
+{
+  array = obj_to_array_no_conversion($input, DATA_TYPECODE);
+  if (!array || !require_dimensions(array,2) || !require_contiguous(array)
+      || !require_native(array)) SWIG_fail;
+  $1 = (DATA_TYPE*) array_data(array);
+  $2 = (DIM_TYPE) array_size(array,0);
+  $3 = (DIM_TYPE) array_size(array,1);
+}
+
+/* Typemap suite for (DIM_TYPE DIM1, DIM_TYPE DIM2, DATA_TYPE* INPLACE_ARRAY2)
+ */
+%typecheck(SWIG_TYPECHECK_DOUBLE_ARRAY,
+           fragment="NumPy_Macros")
+  (DIM_TYPE DIM1, DIM_TYPE DIM2, DATA_TYPE* INPLACE_ARRAY2)
+{
+  $1 = is_array($input) && PyArray_EquivTypenums(array_type($input),
+                                                 DATA_TYPECODE);
+}
+%typemap(in,
+         fragment="NumPy_Fragments")
+  (DIM_TYPE DIM1, DIM_TYPE DIM2, DATA_TYPE* INPLACE_ARRAY2)
+  (PyArrayObject* array=NULL)
+{
+  array = obj_to_array_no_conversion($input, DATA_TYPECODE);
+  if (!array || !require_dimensions(array,2) || !require_contiguous(array) ||
+      !require_native(array)) SWIG_fail;
+  $1 = (DIM_TYPE) array_size(array,0);
+  $2 = (DIM_TYPE) array_size(array,1);
+  $3 = (DATA_TYPE*) array_data(array);
+}
+
+/* Typemap suite for (DATA_TYPE* INPLACE_FARRAY2, DIM_TYPE DIM1, DIM_TYPE DIM2)
+ */
+%typecheck(SWIG_TYPECHECK_DOUBLE_ARRAY,
+           fragment="NumPy_Macros")
+  (DATA_TYPE* INPLACE_FARRAY2, DIM_TYPE DIM1, DIM_TYPE DIM2)
+{
+  $1 = is_array($input) && PyArray_EquivTypenums(array_type($input),
+                                                 DATA_TYPECODE);
+}
+%typemap(in,
+         fragment="NumPy_Fragments")
+  (DATA_TYPE* INPLACE_FARRAY2, DIM_TYPE DIM1, DIM_TYPE DIM2)
+  (PyArrayObject* array=NULL)
+{
+  array = obj_to_array_no_conversion($input, DATA_TYPECODE);
+  if (!array || !require_dimensions(array,2) || !require_contiguous(array)
+      || !require_native(array) || !require_fortran(array)) SWIG_fail;
+  $1 = (DATA_TYPE*) array_data(array);
+  $2 = (DIM_TYPE) array_size(array,0);
+  $3 = (DIM_TYPE) array_size(array,1);
+}
+
+/* Typemap suite for (DIM_TYPE DIM1, DIM_TYPE DIM2, DATA_TYPE* INPLACE_FARRAY2)
+ */
+%typecheck(SWIG_TYPECHECK_DOUBLE_ARRAY,
+           fragment="NumPy_Macros")
+  (DIM_TYPE DIM1, DIM_TYPE DIM2, DATA_TYPE* INPLACE_FARRAY2)
+{
+  $1 = is_array($input) && PyArray_EquivTypenums(array_type($input),
+                                                 DATA_TYPECODE);
+}
+%typemap(in,
+         fragment="NumPy_Fragments")
+  (DIM_TYPE DIM1, DIM_TYPE DIM2, DATA_TYPE* INPLACE_FARRAY2)
+  (PyArrayObject* array=NULL)
+{
+  array = obj_to_array_no_conversion($input, DATA_TYPECODE);
+  if (!array || !require_dimensions(array,2) || !require_contiguous(array) ||
+      !require_native(array) || !require_fortran(array)) SWIG_fail;
+  $1 = (DIM_TYPE) array_size(array,0);
+  $2 = (DIM_TYPE) array_size(array,1);
+  $3 = (DATA_TYPE*) array_data(array);
+}
+
+/* Typemap suite for (DATA_TYPE INPLACE_ARRAY3[ANY][ANY][ANY])
+ */
+%typecheck(SWIG_TYPECHECK_DOUBLE_ARRAY,
+           fragment="NumPy_Macros")
+  (DATA_TYPE INPLACE_ARRAY3[ANY][ANY][ANY])
+{
+  $1 = is_array($input) && PyArray_EquivTypenums(array_type($input),
+                                                 DATA_TYPECODE);
+}
+%typemap(in,
+         fragment="NumPy_Fragments")
+  (DATA_TYPE INPLACE_ARRAY3[ANY][ANY][ANY])
+  (PyArrayObject* array=NULL)
+{
+  npy_intp size[3] = { $1_dim0, $1_dim1, $1_dim2 };
+  array = obj_to_array_no_conversion($input, DATA_TYPECODE);
+  if (!array || !require_dimensions(array,3) || !require_size(array, size, 3) ||
+      !require_contiguous(array) || !require_native(array)) SWIG_fail;
+  $1 = ($1_ltype) array_data(array);
+}
+
+/* Typemap suite for (DATA_TYPE* INPLACE_ARRAY3, DIM_TYPE DIM1, DIM_TYPE DIM2,
+ *                    DIM_TYPE DIM3)
+ */
+%typecheck(SWIG_TYPECHECK_DOUBLE_ARRAY,
+           fragment="NumPy_Macros")
+  (DATA_TYPE* INPLACE_ARRAY3, DIM_TYPE DIM1, DIM_TYPE DIM2, DIM_TYPE DIM3)
+{
+  $1 = is_array($input) && PyArray_EquivTypenums(array_type($input),
+                                                 DATA_TYPECODE);
+}
+%typemap(in,
+         fragment="NumPy_Fragments")
+  (DATA_TYPE* INPLACE_ARRAY3, DIM_TYPE DIM1, DIM_TYPE DIM2, DIM_TYPE DIM3)
+  (PyArrayObject* array=NULL)
+{
+  array = obj_to_array_no_conversion($input, DATA_TYPECODE);
+  if (!array || !require_dimensions(array,3) || !require_contiguous(array) ||
+      !require_native(array)) SWIG_fail;
+  $1 = (DATA_TYPE*) array_data(array);
+  $2 = (DIM_TYPE) array_size(array,0);
+  $3 = (DIM_TYPE) array_size(array,1);
+  $4 = (DIM_TYPE) array_size(array,2);
+}
+
+/* Typemap suite for (DATA_TYPE** INPLACE_ARRAY3, DIM_TYPE DIM1, DIM_TYPE DIM2,
+ *                    DIM_TYPE DIM3)
+ */
+%typecheck(SWIG_TYPECHECK_DOUBLE_ARRAY,
+           fragment="NumPy_Macros")
+  (DATA_TYPE** INPLACE_ARRAY3, DIM_TYPE DIM1, DIM_TYPE DIM2, DIM_TYPE DIM3)
+{
+  $1 = PySequence_Check($input);
+}
+/* Borrow a Python list of 2-D contiguous native arrays of DATA_TYPECODE
+ * (no conversion, data modified in place), all of identical shape, as a
+ * C array of pointers plus (list length, dim1, dim2).  Only the two
+ * pointer tables are heap-allocated; they are freed in freearg below. */
+%typemap(in,
+         fragment="NumPy_Fragments")
+  (DATA_TYPE** INPLACE_ARRAY3, DIM_TYPE DIM1, DIM_TYPE DIM2, DIM_TYPE DIM3)
+  (DATA_TYPE** array=NULL, PyArrayObject** object_array=NULL)
+{
+  npy_intp size[2] = { -1, -1 };
+  PyArrayObject* temp_array;
+  Py_ssize_t i;
+
+  /* length of the list */
+  /* NOTE(review): PyList_Size requires an actual list, but the typecheck
+   * above accepts any sequence (PySequence_Check); confirm intended. */
+  $2 = PyList_Size($input);
+
+  /* the arrays */
+  array = (DATA_TYPE **)malloc($2*sizeof(DATA_TYPE *));
+  object_array = (PyArrayObject **)calloc($2,sizeof(PyArrayObject *));
+
+  if (array == NULL || object_array == NULL)
+  {
+    SWIG_fail;
+  }
+
+  for (i=0; i<$2; i++)
+  {
+    /* NOTE(review): PySequence_GetItem returns a NEW reference that is
+     * never released (freearg only frees the tables) — possible per-call
+     * leak; confirm against upstream numpy.i. */
+    temp_array = obj_to_array_no_conversion(PySequence_GetItem($input,i), DATA_TYPECODE);
+
+    /* the new array must be stored so that it can be destroyed in freearg */
+    object_array[i] = temp_array;
+
+    if ( !temp_array || !require_dimensions(temp_array, 2) ||
+      !require_contiguous(temp_array) ||
+      !require_native(temp_array) ||
+      !PyArray_EquivTypenums(array_type(temp_array), DATA_TYPECODE)
+    ) SWIG_fail;
+
+    /* store the size of the first array in the list, then use that for comparison. */
+    if (i == 0)
+    {
+      size[0] = array_size(temp_array,0);
+      size[1] = array_size(temp_array,1);
+    }
+
+    if (!require_size(temp_array, size, 2)) SWIG_fail;
+
+    array[i] = (DATA_TYPE*) array_data(temp_array);
+  }
+
+  $1 = (DATA_TYPE**) array;
+  $3 = (DIM_TYPE) size[0];
+  $4 = (DIM_TYPE) size[1];
+}
+/* Free the pointer tables; the borrowed element arrays themselves are not
+ * DECREF'd here (no conversion copies were made by the in typemap). */
+%typemap(freearg)
+  (DATA_TYPE** INPLACE_ARRAY3, DIM_TYPE DIM1, DIM_TYPE DIM2, DIM_TYPE DIM3)
+{
+  if (array$argnum!=NULL) free(array$argnum);
+  if (object_array$argnum!=NULL) free(object_array$argnum);
+}
+
+/* Typemap suite for (DIM_TYPE DIM1, DIM_TYPE DIM2, DIM_TYPE DIM3,
+ *                    DATA_TYPE* INPLACE_ARRAY3)
+ */
+%typecheck(SWIG_TYPECHECK_DOUBLE_ARRAY,
+           fragment="NumPy_Macros")
+  (DIM_TYPE DIM1, DIM_TYPE DIM2, DIM_TYPE DIM3, DATA_TYPE* INPLACE_ARRAY3)
+{
+  $1 = is_array($input) && PyArray_EquivTypenums(array_type($input),
+                                                 DATA_TYPECODE);
+}
+%typemap(in,
+         fragment="NumPy_Fragments")
+  (DIM_TYPE DIM1, DIM_TYPE DIM2, DIM_TYPE DIM3, DATA_TYPE* INPLACE_ARRAY3)
+  (PyArrayObject* array=NULL)
+{
+  array = obj_to_array_no_conversion($input, DATA_TYPECODE);
+  if (!array || !require_dimensions(array,3) || !require_contiguous(array)
+      || !require_native(array)) SWIG_fail;
+  $1 = (DIM_TYPE) array_size(array,0);
+  $2 = (DIM_TYPE) array_size(array,1);
+  $3 = (DIM_TYPE) array_size(array,2);
+  $4 = (DATA_TYPE*) array_data(array);
+}
+
+/* Typemap suite for (DATA_TYPE* INPLACE_FARRAY3, DIM_TYPE DIM1, DIM_TYPE DIM2,
+ *                    DIM_TYPE DIM3)
+ *
+ * In-place 3D Fortran-ordered array: data pointer first, then the three
+ * dimension sizes.  The wrapped function writes directly into the
+ * caller's NumPy array buffer (no conversion/copy is performed).
+ */
+%typecheck(SWIG_TYPECHECK_DOUBLE_ARRAY,
+           fragment="NumPy_Macros")
+  (DATA_TYPE* INPLACE_FARRAY3, DIM_TYPE DIM1, DIM_TYPE DIM2, DIM_TYPE DIM3)
+{
+  /* Overload dispatch: accept only a NumPy array whose dtype is
+   * equivalent to DATA_TYPECODE. */
+  $1 = is_array($input) && PyArray_EquivTypenums(array_type($input),
+                                                 DATA_TYPECODE);
+}
+%typemap(in,
+         fragment="NumPy_Fragments")
+  (DATA_TYPE* INPLACE_FARRAY3, DIM_TYPE DIM1, DIM_TYPE DIM2, DIM_TYPE DIM3)
+  (PyArrayObject* array=NULL)
+{
+  /* The input must already be 3D, contiguous, native-endian and
+   * Fortran-ordered so that writes through $1 alias the Python array. */
+  array = obj_to_array_no_conversion($input, DATA_TYPECODE);
+  if (!array || !require_dimensions(array,3) || !require_contiguous(array) ||
+      !require_native(array) || !require_fortran(array)) SWIG_fail;
+  $1 = (DATA_TYPE*) array_data(array);
+  $2 = (DIM_TYPE) array_size(array,0);
+  $3 = (DIM_TYPE) array_size(array,1);
+  $4 = (DIM_TYPE) array_size(array,2);
+}
+
+/* Typemap suite for (DIM_TYPE DIM1, DIM_TYPE DIM2, DIM_TYPE DIM3,
+ *                    DATA_TYPE* INPLACE_FARRAY3)
+ *
+ * Same as the suite above, for C signatures that take the dimensions
+ * before the data pointer.
+ */
+%typecheck(SWIG_TYPECHECK_DOUBLE_ARRAY,
+           fragment="NumPy_Macros")
+  (DIM_TYPE DIM1, DIM_TYPE DIM2, DIM_TYPE DIM3, DATA_TYPE* INPLACE_FARRAY3)
+{
+  $1 = is_array($input) && PyArray_EquivTypenums(array_type($input),
+                                                 DATA_TYPECODE);
+}
+%typemap(in,
+         fragment="NumPy_Fragments")
+  (DIM_TYPE DIM1, DIM_TYPE DIM2, DIM_TYPE DIM3, DATA_TYPE* INPLACE_FARRAY3)
+  (PyArrayObject* array=NULL)
+{
+  array = obj_to_array_no_conversion($input, DATA_TYPECODE);
+  if (!array || !require_dimensions(array,3) || !require_contiguous(array)
+      || !require_native(array) || !require_fortran(array)) SWIG_fail;
+  $1 = (DIM_TYPE) array_size(array,0);
+  $2 = (DIM_TYPE) array_size(array,1);
+  $3 = (DIM_TYPE) array_size(array,2);
+  $4 = (DATA_TYPE*) array_data(array);
+}
+
+/* Typemap suite for (DATA_TYPE INPLACE_ARRAY4[ANY][ANY][ANY][ANY])
+ *
+ * In-place fixed-size 4D C array: the NumPy input must match the array
+ * extents declared in the C signature exactly.
+ */
+%typecheck(SWIG_TYPECHECK_DOUBLE_ARRAY,
+           fragment="NumPy_Macros")
+  (DATA_TYPE INPLACE_ARRAY4[ANY][ANY][ANY][ANY])
+{
+  $1 = is_array($input) && PyArray_EquivTypenums(array_type($input),
+                                                 DATA_TYPECODE);
+}
+%typemap(in,
+         fragment="NumPy_Fragments")
+  (DATA_TYPE INPLACE_ARRAY4[ANY][ANY][ANY][ANY])
+  (PyArrayObject* array=NULL)
+{
+  /* $1_dimN are the compile-time extents from the C declaration;
+   * require_size() enforces that the Python array matches them. */
+  npy_intp size[4] = { $1_dim0, $1_dim1, $1_dim2 , $1_dim3 };
+  array = obj_to_array_no_conversion($input, DATA_TYPECODE);
+  if (!array || !require_dimensions(array,4) || !require_size(array, size, 4) ||
+      !require_contiguous(array) || !require_native(array)) SWIG_fail;
+  $1 = ($1_ltype) array_data(array);
+}
+
+/* Typemap suite for (DATA_TYPE* INPLACE_ARRAY4, DIM_TYPE DIM1, DIM_TYPE DIM2,
+ *                    DIM_TYPE DIM3, DIM_TYPE DIM4)
+ *
+ * In-place 4D C-ordered array: data pointer first, then the four
+ * dimension sizes taken from the NumPy input.
+ */
+%typecheck(SWIG_TYPECHECK_DOUBLE_ARRAY,
+           fragment="NumPy_Macros")
+  (DATA_TYPE* INPLACE_ARRAY4, DIM_TYPE DIM1, DIM_TYPE DIM2, DIM_TYPE DIM3, DIM_TYPE DIM4)
+{
+  $1 = is_array($input) && PyArray_EquivTypenums(array_type($input),
+                                                 DATA_TYPECODE);
+}
+%typemap(in,
+         fragment="NumPy_Fragments")
+  (DATA_TYPE* INPLACE_ARRAY4, DIM_TYPE DIM1, DIM_TYPE DIM2, DIM_TYPE DIM3, DIM_TYPE DIM4)
+  (PyArrayObject* array=NULL)
+{
+  array = obj_to_array_no_conversion($input, DATA_TYPECODE);
+  if (!array || !require_dimensions(array,4) || !require_contiguous(array) ||
+      !require_native(array)) SWIG_fail;
+  $1 = (DATA_TYPE*) array_data(array);
+  $2 = (DIM_TYPE) array_size(array,0);
+  $3 = (DIM_TYPE) array_size(array,1);
+  $4 = (DIM_TYPE) array_size(array,2);
+  $5 = (DIM_TYPE) array_size(array,3);
+}
+
+/* Typemap suite for (DATA_TYPE** INPLACE_ARRAY4, DIM_TYPE DIM1, DIM_TYPE DIM2,
+ *                    DIM_TYPE DIM3, DIM_TYPE DIM4)
+ *
+ * A "4D" input supplied as a Python sequence of DIM1 equally-shaped 3D
+ * NumPy arrays.  Builds a temporary C array of data pointers (one per
+ * element); the pointer tables are released in the freearg typemap.
+ */
+%typecheck(SWIG_TYPECHECK_DOUBLE_ARRAY,
+           fragment="NumPy_Macros")
+  (DATA_TYPE** INPLACE_ARRAY4, DIM_TYPE DIM1, DIM_TYPE DIM2, DIM_TYPE DIM3, DIM_TYPE DIM4)
+{
+  $1 = PySequence_Check($input);
+}
+%typemap(in,
+         fragment="NumPy_Fragments")
+  (DATA_TYPE** INPLACE_ARRAY4, DIM_TYPE DIM1, DIM_TYPE DIM2, DIM_TYPE DIM3, DIM_TYPE DIM4)
+  (DATA_TYPE** array=NULL, PyArrayObject** object_array=NULL)
+{
+  npy_intp size[3] = { -1, -1, -1 };
+  PyArrayObject* temp_array;
+  Py_ssize_t i;
+
+  /* length of the list */
+  /* NOTE(review): the typecheck accepts any sequence but PyList_Size is
+   * used here; confirm non-list sequences (e.g. tuples) are intended to
+   * fail at this point. */
+  $2 = PyList_Size($input);
+
+  /* the arrays */
+  array = (DATA_TYPE **)malloc($2*sizeof(DATA_TYPE *));
+  object_array = (PyArrayObject **)calloc($2,sizeof(PyArrayObject *));
+
+  if (array == NULL || object_array == NULL)
+  {
+    SWIG_fail;
+  }
+
+  for (i=0; i<$2; i++)
+  {
+    /* NOTE(review): PySequence_GetItem returns a new reference that is
+     * never Py_DECREF'd (freearg only free()s the pointer tables) —
+     * looks like a reference leak; confirm against upstream numpy.i. */
+    temp_array = obj_to_array_no_conversion(PySequence_GetItem($input,i), DATA_TYPECODE);
+
+    /* the new array must be stored so that it can be destroyed in freearg */
+    object_array[i] = temp_array;
+
+    if ( !temp_array || !require_dimensions(temp_array, 3) ||
+      !require_contiguous(temp_array) ||
+      !require_native(temp_array) ||
+      !PyArray_EquivTypenums(array_type(temp_array), DATA_TYPECODE)
+    ) SWIG_fail;
+
+    /* store the size of the first array in the list, then use that for comparison. */
+    if (i == 0)
+    {
+      size[0] = array_size(temp_array,0);
+      size[1] = array_size(temp_array,1);
+      size[2] = array_size(temp_array,2);
+    }
+
+    /* All elements must share the shape of the first element. */
+    if (!require_size(temp_array, size, 3)) SWIG_fail;
+
+    array[i] = (DATA_TYPE*) array_data(temp_array);
+  }
+
+  $1 = (DATA_TYPE**) array;
+  $3 = (DIM_TYPE) size[0];
+  $4 = (DIM_TYPE) size[1];
+  $5 = (DIM_TYPE) size[2];
+}
+%typemap(freearg)
+  (DATA_TYPE** INPLACE_ARRAY4, DIM_TYPE DIM1, DIM_TYPE DIM2, DIM_TYPE DIM3, DIM_TYPE DIM4)
+{
+  /* Release the temporary pointer tables allocated in the in typemap. */
+  if (array$argnum!=NULL) free(array$argnum);
+  if (object_array$argnum!=NULL) free(object_array$argnum);
+}
+
+/* Typemap suite for (DIM_TYPE DIM1, DIM_TYPE DIM2, DIM_TYPE DIM3, DIM_TYPE DIM4,
+ *                    DATA_TYPE* INPLACE_ARRAY4)
+ *
+ * In-place 4D C-ordered array: the four dimensions precede the data
+ * pointer in the C signature.
+ */
+%typecheck(SWIG_TYPECHECK_DOUBLE_ARRAY,
+           fragment="NumPy_Macros")
+  (DIM_TYPE DIM1, DIM_TYPE DIM2, DIM_TYPE DIM3, DIM_TYPE DIM4, DATA_TYPE* INPLACE_ARRAY4)
+{
+  $1 = is_array($input) && PyArray_EquivTypenums(array_type($input),
+                                                 DATA_TYPECODE);
+}
+%typemap(in,
+         fragment="NumPy_Fragments")
+  (DIM_TYPE DIM1, DIM_TYPE DIM2, DIM_TYPE DIM3, DIM_TYPE DIM4, DATA_TYPE* INPLACE_ARRAY4)
+  (PyArrayObject* array=NULL)
+{
+  array = obj_to_array_no_conversion($input, DATA_TYPECODE);
+  if (!array || !require_dimensions(array,4) || !require_contiguous(array)
+      || !require_native(array)) SWIG_fail;
+  $1 = (DIM_TYPE) array_size(array,0);
+  $2 = (DIM_TYPE) array_size(array,1);
+  $3 = (DIM_TYPE) array_size(array,2);
+  $4 = (DIM_TYPE) array_size(array,3);
+  $5 = (DATA_TYPE*) array_data(array);
+}
+
+/* Typemap suite for (DATA_TYPE* INPLACE_FARRAY4, DIM_TYPE DIM1, DIM_TYPE DIM2,
+ *                    DIM_TYPE DIM3, DIM_TYPE DIM4)
+ *
+ * In-place 4D Fortran-ordered array: data pointer first, then the four
+ * dimension sizes.
+ */
+%typecheck(SWIG_TYPECHECK_DOUBLE_ARRAY,
+           fragment="NumPy_Macros")
+  (DATA_TYPE* INPLACE_FARRAY4, DIM_TYPE DIM1, DIM_TYPE DIM2, DIM_TYPE DIM3, DIM_TYPE DIM4)
+{
+  $1 = is_array($input) && PyArray_EquivTypenums(array_type($input),
+                                                 DATA_TYPECODE);
+}
+%typemap(in,
+         fragment="NumPy_Fragments")
+  (DATA_TYPE* INPLACE_FARRAY4, DIM_TYPE DIM1, DIM_TYPE DIM2, DIM_TYPE DIM3, DIM_TYPE DIM4)
+  (PyArrayObject* array=NULL)
+{
+  array = obj_to_array_no_conversion($input, DATA_TYPECODE);
+  if (!array || !require_dimensions(array,4) || !require_contiguous(array) ||
+      !require_native(array) || !require_fortran(array)) SWIG_fail;
+  $1 = (DATA_TYPE*) array_data(array);
+  $2 = (DIM_TYPE) array_size(array,0);
+  $3 = (DIM_TYPE) array_size(array,1);
+  $4 = (DIM_TYPE) array_size(array,2);
+  $5 = (DIM_TYPE) array_size(array,3);
+}
+
+/* Typemap suite for (DIM_TYPE DIM1, DIM_TYPE DIM2, DIM_TYPE DIM3, DIM_TYPE DIM4,
+ *                    DATA_TYPE* INPLACE_FARRAY4)
+ *
+ * In-place 4D Fortran-ordered array: the four dimensions precede the
+ * data pointer.
+ */
+%typecheck(SWIG_TYPECHECK_DOUBLE_ARRAY,
+           fragment="NumPy_Macros")
+  (DIM_TYPE DIM1, DIM_TYPE DIM2, DIM_TYPE DIM3, DIM_TYPE DIM4, DATA_TYPE* INPLACE_FARRAY4)
+{
+  $1 = is_array($input) && PyArray_EquivTypenums(array_type($input),
+                                                 DATA_TYPECODE);
+}
+%typemap(in,
+         fragment="NumPy_Fragments")
+  (DIM_TYPE DIM1, DIM_TYPE DIM2, DIM_TYPE DIM3, DIM_TYPE DIM4, DATA_TYPE* INPLACE_FARRAY4)
+  (PyArrayObject* array=NULL)
+{
+  array = obj_to_array_no_conversion($input, DATA_TYPECODE);
+  if (!array || !require_dimensions(array,4) || !require_contiguous(array)
+      || !require_native(array) || !require_fortran(array)) SWIG_fail;
+  $1 = (DIM_TYPE) array_size(array,0);
+  $2 = (DIM_TYPE) array_size(array,1);
+  $3 = (DIM_TYPE) array_size(array,2);
+  $4 = (DIM_TYPE) array_size(array,3);
+  $5 = (DATA_TYPE*) array_data(array);
+}
+
+/*************************/
+/* Argout Array Typemaps */
+/*************************/
+
+/* Typemap suite for (DATA_TYPE ARGOUT_ARRAY1[ANY])
+ */
+%typemap(in,numinputs=0,
+         fragment="NumPy_Backward_Compatibility,NumPy_Macros")
+  (DATA_TYPE ARGOUT_ARRAY1[ANY])
+  (PyObject* array = NULL)
+{
+  npy_intp dims[1] = { $1_dim0 };
+  array = PyArray_SimpleNew(1, dims, DATA_TYPECODE);
+  if (!array) SWIG_fail;
+  $1 = ($1_ltype) array_data(array);
+}
+%typemap(argout)
+  (DATA_TYPE ARGOUT_ARRAY1[ANY])
+{
+  $result = SWIG_Python_AppendOutput($result,(PyObject*)array$argnum);
+}
+
+/* Typemap suite for (DATA_TYPE* ARGOUT_ARRAY1, DIM_TYPE DIM1)
+ */
+%typemap(in,numinputs=1,
+         fragment="NumPy_Fragments")
+  (DATA_TYPE* ARGOUT_ARRAY1, DIM_TYPE DIM1)
+  (PyObject* array = NULL)
+{
+  npy_intp dims[1];
+  if (!PyInt_Check($input))
+  {
+    const char* typestring = pytype_string($input);
+    PyErr_Format(PyExc_TypeError,
+                 "Int dimension expected.  '%s' given.",
+                 typestring);
+    SWIG_fail;
+  }
+  $2 = (DIM_TYPE) PyInt_AsLong($input);
+  dims[0] = (npy_intp) $2;
+  array = PyArray_SimpleNew(1, dims, DATA_TYPECODE);
+  if (!array) SWIG_fail;
+  $1 = (DATA_TYPE*) array_data(array);
+}
+%typemap(argout)
+  (DATA_TYPE* ARGOUT_ARRAY1, DIM_TYPE DIM1)
+{
+  $result = SWIG_Python_AppendOutput($result,(PyObject*)array$argnum);
+}
+
+/* Typemap suite for (DIM_TYPE DIM1, DATA_TYPE* ARGOUT_ARRAY1)
+ */
+%typemap(in,numinputs=1,
+         fragment="NumPy_Fragments")
+  (DIM_TYPE DIM1, DATA_TYPE* ARGOUT_ARRAY1)
+  (PyObject* array = NULL)
+{
+  npy_intp dims[1];
+  if (!PyInt_Check($input))
+  {
+    const char* typestring = pytype_string($input);
+    PyErr_Format(PyExc_TypeError,
+                 "Int dimension expected.  '%s' given.",
+                 typestring);
+    SWIG_fail;
+  }
+  $1 = (DIM_TYPE) PyInt_AsLong($input);
+  dims[0] = (npy_intp) $1;
+  array = PyArray_SimpleNew(1, dims, DATA_TYPECODE);
+  if (!array) SWIG_fail;
+  $2 = (DATA_TYPE*) array_data(array);
+}
+%typemap(argout)
+  (DIM_TYPE DIM1, DATA_TYPE* ARGOUT_ARRAY1)
+{
+  $result = SWIG_Python_AppendOutput($result,(PyObject*)array$argnum);
+}
+
+/* Typemap suite for (DATA_TYPE ARGOUT_ARRAY2[ANY][ANY])
+ */
+%typemap(in,numinputs=0,
+         fragment="NumPy_Backward_Compatibility,NumPy_Macros")
+  (DATA_TYPE ARGOUT_ARRAY2[ANY][ANY])
+  (PyObject* array = NULL)
+{
+  npy_intp dims[2] = { $1_dim0, $1_dim1 };
+  array = PyArray_SimpleNew(2, dims, DATA_TYPECODE);
+  if (!array) SWIG_fail;
+  $1 = ($1_ltype) array_data(array);
+}
+%typemap(argout)
+  (DATA_TYPE ARGOUT_ARRAY2[ANY][ANY])
+{
+  $result = SWIG_Python_AppendOutput($result,(PyObject*)array$argnum);
+}
+
+/* Typemap suite for (DATA_TYPE ARGOUT_ARRAY3[ANY][ANY][ANY])
+ */
+%typemap(in,numinputs=0,
+         fragment="NumPy_Backward_Compatibility,NumPy_Macros")
+  (DATA_TYPE ARGOUT_ARRAY3[ANY][ANY][ANY])
+  (PyObject* array = NULL)
+{
+  npy_intp dims[3] = { $1_dim0, $1_dim1, $1_dim2 };
+  array = PyArray_SimpleNew(3, dims, DATA_TYPECODE);
+  if (!array) SWIG_fail;
+  $1 = ($1_ltype) array_data(array);
+}
+%typemap(argout)
+  (DATA_TYPE ARGOUT_ARRAY3[ANY][ANY][ANY])
+{
+  $result = SWIG_Python_AppendOutput($result,(PyObject*)array$argnum);
+}
+
+/* Typemap suite for (DATA_TYPE ARGOUT_ARRAY4[ANY][ANY][ANY][ANY])
+ */
+%typemap(in,numinputs=0,
+         fragment="NumPy_Backward_Compatibility,NumPy_Macros")
+  (DATA_TYPE ARGOUT_ARRAY4[ANY][ANY][ANY][ANY])
+  (PyObject* array = NULL)
+{
+  npy_intp dims[4] = { $1_dim0, $1_dim1, $1_dim2, $1_dim3 };
+  array = PyArray_SimpleNew(4, dims, DATA_TYPECODE);
+  if (!array) SWIG_fail;
+  $1 = ($1_ltype) array_data(array);
+}
+%typemap(argout)
+  (DATA_TYPE ARGOUT_ARRAY4[ANY][ANY][ANY][ANY])
+{
+  $result = SWIG_Python_AppendOutput($result,(PyObject*)array$argnum);
+}
+
+/*****************************/
+/* Argoutview Array Typemaps */
+/*****************************/
+
+/* Typemap suite for (DATA_TYPE** ARGOUTVIEW_ARRAY1, DIM_TYPE* DIM1)
+ *
+ * Unmanaged output view: the C function returns a data pointer and size
+ * through its arguments; a NumPy array is wrapped around that memory
+ * WITHOUT taking ownership (the caller must keep the buffer alive).
+ */
+%typemap(in,numinputs=0)
+  (DATA_TYPE** ARGOUTVIEW_ARRAY1, DIM_TYPE* DIM1    )
+  (DATA_TYPE*  data_temp = NULL , DIM_TYPE  dim_temp)
+{
+  $1 = &data_temp;
+  $2 = &dim_temp;
+}
+%typemap(argout,
+         fragment="NumPy_Backward_Compatibility")
+  (DATA_TYPE** ARGOUTVIEW_ARRAY1, DIM_TYPE* DIM1)
+{
+  npy_intp dims[1] = { *$2 };
+  PyObject* obj = PyArray_SimpleNewFromData(1, dims, DATA_TYPECODE, (void*)(*$1));
+  PyArrayObject* array = (PyArrayObject*) obj;
+
+  if (!array) SWIG_fail;
+  $result = SWIG_Python_AppendOutput($result,obj);
+}
+
+/* Typemap suite for (DIM_TYPE* DIM1, DATA_TYPE** ARGOUTVIEW_ARRAY1)
+ *
+ * Same as above, with the size argument before the data pointer.
+ */
+%typemap(in,numinputs=0)
+  (DIM_TYPE* DIM1    , DATA_TYPE** ARGOUTVIEW_ARRAY1)
+  (DIM_TYPE  dim_temp, DATA_TYPE*  data_temp = NULL )
+{
+  $1 = &dim_temp;
+  $2 = &data_temp;
+}
+%typemap(argout,
+         fragment="NumPy_Backward_Compatibility")
+  (DIM_TYPE* DIM1, DATA_TYPE** ARGOUTVIEW_ARRAY1)
+{
+  npy_intp dims[1] = { *$1 };
+  PyObject* obj = PyArray_SimpleNewFromData(1, dims, DATA_TYPECODE, (void*)(*$2));
+  PyArrayObject* array = (PyArrayObject*) obj;
+
+  if (!array) SWIG_fail;
+  $result = SWIG_Python_AppendOutput($result,obj);
+}
+
+/* Typemap suite for (DATA_TYPE** ARGOUTVIEW_ARRAY2, DIM_TYPE* DIM1, DIM_TYPE* DIM2)
+ *
+ * Unmanaged 2D C-ordered output view.
+ */
+%typemap(in,numinputs=0)
+  (DATA_TYPE** ARGOUTVIEW_ARRAY2, DIM_TYPE* DIM1     , DIM_TYPE* DIM2     )
+  (DATA_TYPE*  data_temp = NULL , DIM_TYPE  dim1_temp, DIM_TYPE  dim2_temp)
+{
+  $1 = &data_temp;
+  $2 = &dim1_temp;
+  $3 = &dim2_temp;
+}
+%typemap(argout,
+         fragment="NumPy_Backward_Compatibility")
+  (DATA_TYPE** ARGOUTVIEW_ARRAY2, DIM_TYPE* DIM1, DIM_TYPE* DIM2)
+{
+  npy_intp dims[2] = { *$2, *$3 };
+  PyObject* obj = PyArray_SimpleNewFromData(2, dims, DATA_TYPECODE, (void*)(*$1));
+  PyArrayObject* array = (PyArrayObject*) obj;
+
+  if (!array) SWIG_fail;
+  $result = SWIG_Python_AppendOutput($result,obj);
+}
+
+/* Typemap suite for (DIM_TYPE* DIM1, DIM_TYPE* DIM2, DATA_TYPE** ARGOUTVIEW_ARRAY2)
+ *
+ * Same as above, with the sizes before the data pointer.
+ */
+%typemap(in,numinputs=0)
+  (DIM_TYPE* DIM1     , DIM_TYPE* DIM2     , DATA_TYPE** ARGOUTVIEW_ARRAY2)
+  (DIM_TYPE  dim1_temp, DIM_TYPE  dim2_temp, DATA_TYPE*  data_temp = NULL )
+{
+  $1 = &dim1_temp;
+  $2 = &dim2_temp;
+  $3 = &data_temp;
+}
+%typemap(argout,
+         fragment="NumPy_Backward_Compatibility")
+  (DIM_TYPE* DIM1, DIM_TYPE* DIM2, DATA_TYPE** ARGOUTVIEW_ARRAY2)
+{
+  npy_intp dims[2] = { *$1, *$2 };
+  PyObject* obj = PyArray_SimpleNewFromData(2, dims, DATA_TYPECODE, (void*)(*$3));
+  PyArrayObject* array = (PyArrayObject*) obj;
+
+  if (!array) SWIG_fail;
+  $result = SWIG_Python_AppendOutput($result,obj);
+}
+
+/* Typemap suite for (DATA_TYPE** ARGOUTVIEW_FARRAY2, DIM_TYPE* DIM1, DIM_TYPE* DIM2)
+ *
+ * Unmanaged 2D Fortran-ordered output view (require_fortran adjusts the
+ * array's flags/strides for column-major data).
+ */
+%typemap(in,numinputs=0)
+  (DATA_TYPE** ARGOUTVIEW_FARRAY2, DIM_TYPE* DIM1     , DIM_TYPE* DIM2     )
+  (DATA_TYPE*  data_temp = NULL  , DIM_TYPE  dim1_temp, DIM_TYPE  dim2_temp)
+{
+  $1 = &data_temp;
+  $2 = &dim1_temp;
+  $3 = &dim2_temp;
+}
+%typemap(argout,
+         fragment="NumPy_Backward_Compatibility,NumPy_Array_Requirements")
+  (DATA_TYPE** ARGOUTVIEW_FARRAY2, DIM_TYPE* DIM1, DIM_TYPE* DIM2)
+{
+  npy_intp dims[2] = { *$2, *$3 };
+  PyObject* obj = PyArray_SimpleNewFromData(2, dims, DATA_TYPECODE, (void*)(*$1));
+  PyArrayObject* array = (PyArrayObject*) obj;
+
+  if (!array || !require_fortran(array)) SWIG_fail;
+  $result = SWIG_Python_AppendOutput($result,obj);
+}
+
+/* Typemap suite for (DIM_TYPE* DIM1, DIM_TYPE* DIM2, DATA_TYPE** ARGOUTVIEW_FARRAY2)
+ *
+ * Same as above, with the sizes before the data pointer.
+ */
+%typemap(in,numinputs=0)
+  (DIM_TYPE* DIM1     , DIM_TYPE* DIM2     , DATA_TYPE** ARGOUTVIEW_FARRAY2)
+  (DIM_TYPE  dim1_temp, DIM_TYPE  dim2_temp, DATA_TYPE*  data_temp = NULL  )
+{
+  $1 = &dim1_temp;
+  $2 = &dim2_temp;
+  $3 = &data_temp;
+}
+%typemap(argout,
+         fragment="NumPy_Backward_Compatibility,NumPy_Array_Requirements")
+  (DIM_TYPE* DIM1, DIM_TYPE* DIM2, DATA_TYPE** ARGOUTVIEW_FARRAY2)
+{
+  npy_intp dims[2] = { *$1, *$2 };
+  PyObject* obj = PyArray_SimpleNewFromData(2, dims, DATA_TYPECODE, (void*)(*$3));
+  PyArrayObject* array = (PyArrayObject*) obj;
+
+  if (!array || !require_fortran(array)) SWIG_fail;
+  $result = SWIG_Python_AppendOutput($result,obj);
+}
+
+/* Typemap suite for (DATA_TYPE** ARGOUTVIEW_ARRAY3, DIM_TYPE* DIM1, DIM_TYPE* DIM2,
+                      DIM_TYPE* DIM3)
+ *
+ * Unmanaged 3D C-ordered output view: wraps function-provided memory in
+ * a NumPy array without taking ownership.
+ */
+%typemap(in,numinputs=0)
+  (DATA_TYPE** ARGOUTVIEW_ARRAY3, DIM_TYPE* DIM1    , DIM_TYPE* DIM2    , DIM_TYPE* DIM3    )
+  (DATA_TYPE* data_temp = NULL  , DIM_TYPE dim1_temp, DIM_TYPE dim2_temp, DIM_TYPE dim3_temp)
+{
+  $1 = &data_temp;
+  $2 = &dim1_temp;
+  $3 = &dim2_temp;
+  $4 = &dim3_temp;
+}
+%typemap(argout,
+         fragment="NumPy_Backward_Compatibility")
+  (DATA_TYPE** ARGOUTVIEW_ARRAY3, DIM_TYPE* DIM1, DIM_TYPE* DIM2, DIM_TYPE* DIM3)
+{
+  npy_intp dims[3] = { *$2, *$3, *$4 };
+  PyObject* obj = PyArray_SimpleNewFromData(3, dims, DATA_TYPECODE, (void*)(*$1));
+  PyArrayObject* array = (PyArrayObject*) obj;
+
+  if (!array) SWIG_fail;
+  $result = SWIG_Python_AppendOutput($result,obj);
+}
+
+/* Typemap suite for (DIM_TYPE* DIM1, DIM_TYPE* DIM2, DIM_TYPE* DIM3,
+                      DATA_TYPE** ARGOUTVIEW_ARRAY3)
+ *
+ * Same as above, with the sizes before the data pointer.
+ */
+%typemap(in,numinputs=0)
+  (DIM_TYPE* DIM1, DIM_TYPE* DIM2, DIM_TYPE* DIM3, DATA_TYPE** ARGOUTVIEW_ARRAY3)
+  (DIM_TYPE dim1_temp, DIM_TYPE dim2_temp, DIM_TYPE dim3_temp, DATA_TYPE* data_temp = NULL)
+{
+  $1 = &dim1_temp;
+  $2 = &dim2_temp;
+  $3 = &dim3_temp;
+  $4 = &data_temp;
+}
+%typemap(argout,
+         fragment="NumPy_Backward_Compatibility")
+  (DIM_TYPE* DIM1, DIM_TYPE* DIM2, DIM_TYPE* DIM3, DATA_TYPE** ARGOUTVIEW_ARRAY3)
+{
+  npy_intp dims[3] = { *$1, *$2, *$3 };
+  PyObject* obj = PyArray_SimpleNewFromData(3, dims, DATA_TYPECODE, (void*)(*$4));
+  PyArrayObject* array = (PyArrayObject*) obj;
+
+  if (!array) SWIG_fail;
+  $result = SWIG_Python_AppendOutput($result,obj);
+}
+
+/* Typemap suite for (DATA_TYPE** ARGOUTVIEW_FARRAY3, DIM_TYPE* DIM1, DIM_TYPE* DIM2,
+                      DIM_TYPE* DIM3)
+ *
+ * Unmanaged 3D Fortran-ordered output view.
+ */
+%typemap(in,numinputs=0)
+  (DATA_TYPE** ARGOUTVIEW_FARRAY3, DIM_TYPE* DIM1    , DIM_TYPE* DIM2    , DIM_TYPE* DIM3    )
+  (DATA_TYPE* data_temp = NULL   , DIM_TYPE dim1_temp, DIM_TYPE dim2_temp, DIM_TYPE dim3_temp)
+{
+  $1 = &data_temp;
+  $2 = &dim1_temp;
+  $3 = &dim2_temp;
+  $4 = &dim3_temp;
+}
+%typemap(argout,
+         fragment="NumPy_Backward_Compatibility,NumPy_Array_Requirements")
+  (DATA_TYPE** ARGOUTVIEW_FARRAY3, DIM_TYPE* DIM1, DIM_TYPE* DIM2, DIM_TYPE* DIM3)
+{
+  npy_intp dims[3] = { *$2, *$3, *$4 };
+  PyObject* obj = PyArray_SimpleNewFromData(3, dims, DATA_TYPECODE, (void*)(*$1));
+  PyArrayObject* array = (PyArrayObject*) obj;
+
+  if (!array || !require_fortran(array)) SWIG_fail;
+  $result = SWIG_Python_AppendOutput($result,obj);
+}
+
+/* Typemap suite for (DIM_TYPE* DIM1, DIM_TYPE* DIM2, DIM_TYPE* DIM3,
+                      DATA_TYPE** ARGOUTVIEW_FARRAY3)
+ *
+ * Same as above, with the sizes before the data pointer.
+ */
+%typemap(in,numinputs=0)
+  (DIM_TYPE* DIM1    , DIM_TYPE* DIM2    , DIM_TYPE* DIM3    , DATA_TYPE** ARGOUTVIEW_FARRAY3)
+  (DIM_TYPE dim1_temp, DIM_TYPE dim2_temp, DIM_TYPE dim3_temp, DATA_TYPE* data_temp = NULL   )
+{
+  $1 = &dim1_temp;
+  $2 = &dim2_temp;
+  $3 = &dim3_temp;
+  $4 = &data_temp;
+}
+%typemap(argout,
+         fragment="NumPy_Backward_Compatibility,NumPy_Array_Requirements")
+  (DIM_TYPE* DIM1, DIM_TYPE* DIM2, DIM_TYPE* DIM3, DATA_TYPE** ARGOUTVIEW_FARRAY3)
+{
+  npy_intp dims[3] = { *$1, *$2, *$3 };
+  PyObject* obj = PyArray_SimpleNewFromData(3, dims, DATA_TYPECODE, (void*)(*$4));
+  PyArrayObject* array = (PyArrayObject*) obj;
+
+  if (!array || !require_fortran(array)) SWIG_fail;
+  $result = SWIG_Python_AppendOutput($result,obj);
+}
+
+/* Typemap suite for (DATA_TYPE** ARGOUTVIEW_ARRAY4, DIM_TYPE* DIM1, DIM_TYPE* DIM2,
+                      DIM_TYPE* DIM3, DIM_TYPE* DIM4)
+ *
+ * Unmanaged 4D C-ordered output view: wraps function-provided memory in
+ * a NumPy array without taking ownership.
+ */
+%typemap(in,numinputs=0)
+  (DATA_TYPE** ARGOUTVIEW_ARRAY4, DIM_TYPE* DIM1    , DIM_TYPE* DIM2    , DIM_TYPE* DIM3    , DIM_TYPE* DIM4    )
+  (DATA_TYPE* data_temp = NULL  , DIM_TYPE dim1_temp, DIM_TYPE dim2_temp, DIM_TYPE dim3_temp, DIM_TYPE dim4_temp)
+{
+  $1 = &data_temp;
+  $2 = &dim1_temp;
+  $3 = &dim2_temp;
+  $4 = &dim3_temp;
+  $5 = &dim4_temp;
+}
+%typemap(argout,
+         fragment="NumPy_Backward_Compatibility")
+  (DATA_TYPE** ARGOUTVIEW_ARRAY4, DIM_TYPE* DIM1, DIM_TYPE* DIM2, DIM_TYPE* DIM3, DIM_TYPE* DIM4)
+{
+  npy_intp dims[4] = { *$2, *$3, *$4 , *$5 };
+  PyObject* obj = PyArray_SimpleNewFromData(4, dims, DATA_TYPECODE, (void*)(*$1));
+  PyArrayObject* array = (PyArrayObject*) obj;
+
+  if (!array) SWIG_fail;
+  $result = SWIG_Python_AppendOutput($result,obj);
+}
+
+/* Typemap suite for (DIM_TYPE* DIM1, DIM_TYPE* DIM2, DIM_TYPE* DIM3, DIM_TYPE* DIM4,
+                      DATA_TYPE** ARGOUTVIEW_ARRAY4)
+ *
+ * Same as above, with the sizes before the data pointer.
+ */
+%typemap(in,numinputs=0)
+  (DIM_TYPE* DIM1    , DIM_TYPE* DIM2    , DIM_TYPE* DIM3    , DIM_TYPE* DIM4    , DATA_TYPE** ARGOUTVIEW_ARRAY4)
+  (DIM_TYPE dim1_temp, DIM_TYPE dim2_temp, DIM_TYPE dim3_temp, DIM_TYPE dim4_temp, DATA_TYPE* data_temp = NULL  )
+{
+  $1 = &dim1_temp;
+  $2 = &dim2_temp;
+  $3 = &dim3_temp;
+  $4 = &dim4_temp;
+  $5 = &data_temp;
+}
+%typemap(argout,
+         fragment="NumPy_Backward_Compatibility")
+  (DIM_TYPE* DIM1, DIM_TYPE* DIM2, DIM_TYPE* DIM3, DIM_TYPE* DIM4, DATA_TYPE** ARGOUTVIEW_ARRAY4)
+{
+  npy_intp dims[4] = { *$1, *$2, *$3 , *$4 };
+  PyObject* obj = PyArray_SimpleNewFromData(4, dims, DATA_TYPECODE, (void*)(*$5));
+  PyArrayObject* array = (PyArrayObject*) obj;
+
+  if (!array) SWIG_fail;
+  $result = SWIG_Python_AppendOutput($result,obj);
+}
+
+/* Typemap suite for (DATA_TYPE** ARGOUTVIEW_FARRAY4, DIM_TYPE* DIM1, DIM_TYPE* DIM2,
+                      DIM_TYPE* DIM3, DIM_TYPE* DIM4)
+ *
+ * Unmanaged 4D Fortran-ordered output view.
+ */
+%typemap(in,numinputs=0)
+  (DATA_TYPE** ARGOUTVIEW_FARRAY4, DIM_TYPE* DIM1    , DIM_TYPE* DIM2    , DIM_TYPE* DIM3    , DIM_TYPE* DIM4    )
+  (DATA_TYPE* data_temp = NULL   , DIM_TYPE dim1_temp, DIM_TYPE dim2_temp, DIM_TYPE dim3_temp, DIM_TYPE dim4_temp)
+{
+  $1 = &data_temp;
+  $2 = &dim1_temp;
+  $3 = &dim2_temp;
+  $4 = &dim3_temp;
+  $5 = &dim4_temp;
+}
+%typemap(argout,
+         fragment="NumPy_Backward_Compatibility,NumPy_Array_Requirements")
+  (DATA_TYPE** ARGOUTVIEW_FARRAY4, DIM_TYPE* DIM1, DIM_TYPE* DIM2, DIM_TYPE* DIM3, DIM_TYPE* DIM4)
+{
+  npy_intp dims[4] = { *$2, *$3, *$4 , *$5 };
+  PyObject* obj = PyArray_SimpleNewFromData(4, dims, DATA_TYPECODE, (void*)(*$1));
+  PyArrayObject* array = (PyArrayObject*) obj;
+
+  if (!array || !require_fortran(array)) SWIG_fail;
+  $result = SWIG_Python_AppendOutput($result,obj);
+}
+
+/* Typemap suite for (DIM_TYPE* DIM1, DIM_TYPE* DIM2, DIM_TYPE* DIM3, DIM_TYPE* DIM4,
+                      DATA_TYPE** ARGOUTVIEW_FARRAY4)
+ *
+ * Same as above, with the sizes before the data pointer.
+ */
+%typemap(in,numinputs=0)
+  (DIM_TYPE* DIM1    , DIM_TYPE* DIM2    , DIM_TYPE* DIM3    , DIM_TYPE* DIM4    , DATA_TYPE** ARGOUTVIEW_FARRAY4)
+  (DIM_TYPE dim1_temp, DIM_TYPE dim2_temp, DIM_TYPE dim3_temp, DIM_TYPE dim4_temp, DATA_TYPE* data_temp = NULL   )
+{
+  $1 = &dim1_temp;
+  $2 = &dim2_temp;
+  $3 = &dim3_temp;
+  $4 = &dim4_temp;
+  $5 = &data_temp;
+}
+%typemap(argout,
+         fragment="NumPy_Backward_Compatibility,NumPy_Array_Requirements")
+  (DIM_TYPE* DIM1, DIM_TYPE* DIM2, DIM_TYPE* DIM3, DIM_TYPE* DIM4, DATA_TYPE** ARGOUTVIEW_FARRAY4)
+{
+  npy_intp dims[4] = { *$1, *$2, *$3 , *$4 };
+  PyObject* obj = PyArray_SimpleNewFromData(4, dims, DATA_TYPECODE, (void*)(*$5));
+  PyArrayObject* array = (PyArrayObject*) obj;
+
+  if (!array || !require_fortran(array)) SWIG_fail;
+  $result = SWIG_Python_AppendOutput($result,obj);
+}
+
+/*************************************/
+/* Managed Argoutview Array Typemaps */
+/*************************************/
+
+/* Typemap suite for (DATA_TYPE** ARGOUTVIEWM_ARRAY1, DIM_TYPE* DIM1)
+ *
+ * Managed output view: the C function allocates the buffer; a capsule
+ * holding the data pointer is attached as the array's base object so the
+ * buffer is released when the NumPy array is garbage-collected.
+ */
+%typemap(in,numinputs=0)
+  (DATA_TYPE** ARGOUTVIEWM_ARRAY1, DIM_TYPE* DIM1    )
+  (DATA_TYPE*  data_temp = NULL  , DIM_TYPE  dim_temp)
+{
+  $1 = &data_temp;
+  $2 = &dim_temp;
+}
+%typemap(argout,
+         fragment="NumPy_Backward_Compatibility,NumPy_Utilities")
+  (DATA_TYPE** ARGOUTVIEWM_ARRAY1, DIM_TYPE* DIM1)
+{
+  npy_intp dims[1] = { *$2 };
+  PyObject* obj = PyArray_SimpleNewFromData(1, dims, DATA_TYPECODE, (void*)(*$1));
+  PyArrayObject* array = (PyArrayObject*) obj;
+
+  if (!array) SWIG_fail;
+
+  /* The capsule owns the data pointer (*$1 is the data argument here). */
+%#ifdef SWIGPY_USE_CAPSULE
+    PyObject* cap = PyCapsule_New((void*)(*$1), SWIGPY_CAPSULE_NAME, free_cap);
+%#else
+    PyObject* cap = PyCObject_FromVoidPtr((void*)(*$1), free);
+%#endif
+
+%#if NPY_API_VERSION < 0x00000007
+  PyArray_BASE(array) = cap;
+%#else
+  PyArray_SetBaseObject(array,cap);
+%#endif
+
+  $result = SWIG_Python_AppendOutput($result,obj);
+}
+
+/* Typemap suite for (DIM_TYPE* DIM1, DATA_TYPE** ARGOUTVIEWM_ARRAY1)
+ *
+ * Managed output view with the size argument before the data pointer:
+ * the C function allocates the buffer; a capsule holding the data
+ * pointer is attached as the array's base object so the buffer is
+ * free()d when the NumPy array is garbage-collected.
+ */
+%typemap(in,numinputs=0)
+  (DIM_TYPE* DIM1    , DATA_TYPE** ARGOUTVIEWM_ARRAY1)
+  (DIM_TYPE  dim_temp, DATA_TYPE*  data_temp = NULL  )
+{
+  /* Hand the function temporaries to fill; no Python input is consumed. */
+  $1 = &dim_temp;
+  $2 = &data_temp;
+}
+%typemap(argout,
+         fragment="NumPy_Backward_Compatibility,NumPy_Utilities")
+  (DIM_TYPE* DIM1, DATA_TYPE** ARGOUTVIEWM_ARRAY1)
+{
+  npy_intp dims[1] = { *$1 };
+  PyObject* obj = PyArray_SimpleNewFromData(1, dims, DATA_TYPECODE, (void*)(*$2));
+  PyArrayObject* array = (PyArrayObject*) obj;
+
+  if (!array) SWIG_fail;
+
+  /* FIX: the capsule must own the DATA pointer (*$2), not the dimension
+   * (*$1); wrapping the dimension would later free() a bogus address and
+   * leak the real buffer. */
+%#ifdef SWIGPY_USE_CAPSULE
+    PyObject* cap = PyCapsule_New((void*)(*$2), SWIGPY_CAPSULE_NAME, free_cap);
+%#else
+    PyObject* cap = PyCObject_FromVoidPtr((void*)(*$2), free);
+%#endif
+
+%#if NPY_API_VERSION < 0x00000007
+  PyArray_BASE(array) = cap;
+%#else
+  PyArray_SetBaseObject(array,cap);
+%#endif
+
+  $result = SWIG_Python_AppendOutput($result,obj);
+}
+
+/* Typemap suite for (DATA_TYPE** ARGOUTVIEWM_ARRAY2, DIM_TYPE* DIM1, DIM_TYPE* DIM2)
+ *
+ * Managed 2D C-ordered output view; the capsule attached as base object
+ * frees the function-allocated buffer when the array is collected.
+ */
+%typemap(in,numinputs=0)
+  (DATA_TYPE** ARGOUTVIEWM_ARRAY2, DIM_TYPE* DIM1     , DIM_TYPE* DIM2     )
+  (DATA_TYPE*  data_temp = NULL  , DIM_TYPE  dim1_temp, DIM_TYPE  dim2_temp)
+{
+  $1 = &data_temp;
+  $2 = &dim1_temp;
+  $3 = &dim2_temp;
+}
+%typemap(argout,
+         fragment="NumPy_Backward_Compatibility,NumPy_Utilities")
+  (DATA_TYPE** ARGOUTVIEWM_ARRAY2, DIM_TYPE* DIM1, DIM_TYPE* DIM2)
+{
+  npy_intp dims[2] = { *$2, *$3 };
+  PyObject* obj = PyArray_SimpleNewFromData(2, dims, DATA_TYPECODE, (void*)(*$1));
+  PyArrayObject* array = (PyArrayObject*) obj;
+
+  if (!array) SWIG_fail;
+
+  /* The capsule owns the data pointer (*$1 is the data argument here). */
+%#ifdef SWIGPY_USE_CAPSULE
+    PyObject* cap = PyCapsule_New((void*)(*$1), SWIGPY_CAPSULE_NAME, free_cap);
+%#else
+    PyObject* cap = PyCObject_FromVoidPtr((void*)(*$1), free);
+%#endif
+
+%#if NPY_API_VERSION < 0x00000007
+  PyArray_BASE(array) = cap;
+%#else
+  PyArray_SetBaseObject(array,cap);
+%#endif
+
+  $result = SWIG_Python_AppendOutput($result,obj);
+}
+
+/* Typemap suite for (DIM_TYPE* DIM1, DIM_TYPE* DIM2, DATA_TYPE** ARGOUTVIEWM_ARRAY2)
+ *
+ * Managed 2D C-ordered output view with the sizes before the data
+ * pointer; the capsule attached as base object frees the buffer when
+ * the array is collected.
+ */
+%typemap(in,numinputs=0)
+  (DIM_TYPE* DIM1     , DIM_TYPE* DIM2     , DATA_TYPE** ARGOUTVIEWM_ARRAY2)
+  (DIM_TYPE  dim1_temp, DIM_TYPE  dim2_temp, DATA_TYPE*  data_temp = NULL  )
+{
+  $1 = &dim1_temp;
+  $2 = &dim2_temp;
+  $3 = &data_temp;
+}
+%typemap(argout,
+         fragment="NumPy_Backward_Compatibility,NumPy_Utilities")
+  (DIM_TYPE* DIM1, DIM_TYPE* DIM2, DATA_TYPE** ARGOUTVIEWM_ARRAY2)
+{
+  npy_intp dims[2] = { *$1, *$2 };
+  PyObject* obj = PyArray_SimpleNewFromData(2, dims, DATA_TYPECODE, (void*)(*$3));
+  PyArrayObject* array = (PyArrayObject*) obj;
+
+  if (!array) SWIG_fail;
+
+  /* FIX: the capsule must own the DATA pointer (*$3, matching the
+   * PyArray_SimpleNewFromData call above), not the dimension (*$1). */
+%#ifdef SWIGPY_USE_CAPSULE
+    PyObject* cap = PyCapsule_New((void*)(*$3), SWIGPY_CAPSULE_NAME, free_cap);
+%#else
+    PyObject* cap = PyCObject_FromVoidPtr((void*)(*$3), free);
+%#endif
+
+%#if NPY_API_VERSION < 0x00000007
+  PyArray_BASE(array) = cap;
+%#else
+  PyArray_SetBaseObject(array,cap);
+%#endif
+
+  $result = SWIG_Python_AppendOutput($result,obj);
+}
+
+/* Typemap suite for (DATA_TYPE** ARGOUTVIEWM_FARRAY2, DIM_TYPE* DIM1, DIM_TYPE* DIM2)
+ *
+ * Managed 2D Fortran-ordered output view; require_fortran adapts the
+ * new array for column-major data before it is returned.
+ */
+%typemap(in,numinputs=0)
+  (DATA_TYPE** ARGOUTVIEWM_FARRAY2, DIM_TYPE* DIM1     , DIM_TYPE* DIM2     )
+  (DATA_TYPE*  data_temp = NULL   , DIM_TYPE  dim1_temp, DIM_TYPE  dim2_temp)
+{
+  $1 = &data_temp;
+  $2 = &dim1_temp;
+  $3 = &dim2_temp;
+}
+%typemap(argout,
+         fragment="NumPy_Backward_Compatibility,NumPy_Array_Requirements,NumPy_Utilities")
+  (DATA_TYPE** ARGOUTVIEWM_FARRAY2, DIM_TYPE* DIM1, DIM_TYPE* DIM2)
+{
+  npy_intp dims[2] = { *$2, *$3 };
+  PyObject* obj = PyArray_SimpleNewFromData(2, dims, DATA_TYPECODE, (void*)(*$1));
+  PyArrayObject* array = (PyArrayObject*) obj;
+
+  if (!array || !require_fortran(array)) SWIG_fail;
+
+  /* The capsule owns the data pointer (*$1 is the data argument here). */
+%#ifdef SWIGPY_USE_CAPSULE
+    PyObject* cap = PyCapsule_New((void*)(*$1), SWIGPY_CAPSULE_NAME, free_cap);
+%#else
+    PyObject* cap = PyCObject_FromVoidPtr((void*)(*$1), free);
+%#endif
+
+%#if NPY_API_VERSION < 0x00000007
+  PyArray_BASE(array) = cap;
+%#else
+  PyArray_SetBaseObject(array,cap);
+%#endif
+
+  $result = SWIG_Python_AppendOutput($result,obj);
+}
+
+/* Typemap suite for (DIM_TYPE* DIM1, DIM_TYPE* DIM2, DATA_TYPE** ARGOUTVIEWM_FARRAY2)
+ *
+ * Managed 2D Fortran-ordered output view with the sizes before the data
+ * pointer; the capsule attached as base object frees the buffer when
+ * the array is collected.
+ */
+%typemap(in,numinputs=0)
+  (DIM_TYPE* DIM1     , DIM_TYPE* DIM2     , DATA_TYPE** ARGOUTVIEWM_FARRAY2)
+  (DIM_TYPE  dim1_temp, DIM_TYPE  dim2_temp, DATA_TYPE*  data_temp = NULL   )
+{
+  $1 = &dim1_temp;
+  $2 = &dim2_temp;
+  $3 = &data_temp;
+}
+%typemap(argout,
+         fragment="NumPy_Backward_Compatibility,NumPy_Array_Requirements,NumPy_Utilities")
+  (DIM_TYPE* DIM1, DIM_TYPE* DIM2, DATA_TYPE** ARGOUTVIEWM_FARRAY2)
+{
+  npy_intp dims[2] = { *$1, *$2 };
+  PyObject* obj = PyArray_SimpleNewFromData(2, dims, DATA_TYPECODE, (void*)(*$3));
+  PyArrayObject* array = (PyArrayObject*) obj;
+
+  if (!array || !require_fortran(array)) SWIG_fail;
+
+  /* FIX: the capsule must own the DATA pointer (*$3, matching the
+   * PyArray_SimpleNewFromData call above), not the dimension (*$1). */
+%#ifdef SWIGPY_USE_CAPSULE
+    PyObject* cap = PyCapsule_New((void*)(*$3), SWIGPY_CAPSULE_NAME, free_cap);
+%#else
+    PyObject* cap = PyCObject_FromVoidPtr((void*)(*$3), free);
+%#endif
+
+%#if NPY_API_VERSION < 0x00000007
+  PyArray_BASE(array) = cap;
+%#else
+  PyArray_SetBaseObject(array,cap);
+%#endif
+
+  $result = SWIG_Python_AppendOutput($result,obj);
+}
+
+/* Typemap suite for (DATA_TYPE** ARGOUTVIEWM_ARRAY3, DIM_TYPE* DIM1, DIM_TYPE* DIM2,
+                      DIM_TYPE* DIM3)
+ *
+ * Managed 3D C-ordered output view; the capsule attached as base object
+ * frees the function-allocated buffer when the array is collected.
+ */
+%typemap(in,numinputs=0)
+  (DATA_TYPE** ARGOUTVIEWM_ARRAY3, DIM_TYPE* DIM1    , DIM_TYPE* DIM2    , DIM_TYPE* DIM3    )
+  (DATA_TYPE* data_temp = NULL   , DIM_TYPE dim1_temp, DIM_TYPE dim2_temp, DIM_TYPE dim3_temp)
+{
+  $1 = &data_temp;
+  $2 = &dim1_temp;
+  $3 = &dim2_temp;
+  $4 = &dim3_temp;
+}
+%typemap(argout,
+         fragment="NumPy_Backward_Compatibility,NumPy_Utilities")
+  (DATA_TYPE** ARGOUTVIEWM_ARRAY3, DIM_TYPE* DIM1, DIM_TYPE* DIM2, DIM_TYPE* DIM3)
+{
+  npy_intp dims[3] = { *$2, *$3, *$4 };
+  PyObject* obj = PyArray_SimpleNewFromData(3, dims, DATA_TYPECODE, (void*)(*$1));
+  PyArrayObject* array = (PyArrayObject*) obj;
+
+  if (!array) SWIG_fail;
+
+  /* The capsule owns the data pointer (*$1 is the data argument here). */
+%#ifdef SWIGPY_USE_CAPSULE
+    PyObject* cap = PyCapsule_New((void*)(*$1), SWIGPY_CAPSULE_NAME, free_cap);
+%#else
+    PyObject* cap = PyCObject_FromVoidPtr((void*)(*$1), free);
+%#endif
+
+%#if NPY_API_VERSION < 0x00000007
+  PyArray_BASE(array) = cap;
+%#else
+  PyArray_SetBaseObject(array,cap);
+%#endif
+
+  $result = SWIG_Python_AppendOutput($result,obj);
+}
+
+/* Typemap suite for (DIM_TYPE* DIM1, DIM_TYPE* DIM2, DIM_TYPE* DIM3,
+                      DATA_TYPE** ARGOUTVIEWM_ARRAY3)
+ *
+ * Managed 3D C-ordered output view with the sizes before the data
+ * pointer; the capsule attached as base object frees the buffer when
+ * the array is collected.
+ */
+%typemap(in,numinputs=0)
+  (DIM_TYPE* DIM1    , DIM_TYPE* DIM2    , DIM_TYPE* DIM3    , DATA_TYPE** ARGOUTVIEWM_ARRAY3)
+  (DIM_TYPE dim1_temp, DIM_TYPE dim2_temp, DIM_TYPE dim3_temp, DATA_TYPE* data_temp = NULL   )
+{
+  $1 = &dim1_temp;
+  $2 = &dim2_temp;
+  $3 = &dim3_temp;
+  $4 = &data_temp;
+}
+%typemap(argout,
+         fragment="NumPy_Backward_Compatibility,NumPy_Utilities")
+  (DIM_TYPE* DIM1, DIM_TYPE* DIM2, DIM_TYPE* DIM3, DATA_TYPE** ARGOUTVIEWM_ARRAY3)
+{
+  npy_intp dims[3] = { *$1, *$2, *$3 };
+  PyObject* obj = PyArray_SimpleNewFromData(3, dims, DATA_TYPECODE, (void*)(*$4));
+  PyArrayObject* array = (PyArrayObject*) obj;
+
+  if (!array) SWIG_fail;
+
+  /* FIX: the capsule must own the DATA pointer (*$4, matching the
+   * PyArray_SimpleNewFromData call above), not the dimension (*$1). */
+%#ifdef SWIGPY_USE_CAPSULE
+    PyObject* cap = PyCapsule_New((void*)(*$4), SWIGPY_CAPSULE_NAME, free_cap);
+%#else
+    PyObject* cap = PyCObject_FromVoidPtr((void*)(*$4), free);
+%#endif
+
+%#if NPY_API_VERSION < 0x00000007
+  PyArray_BASE(array) = cap;
+%#else
+  PyArray_SetBaseObject(array,cap);
+%#endif
+
+  $result = SWIG_Python_AppendOutput($result,obj);
+}
+
+/* Typemap suite for (DATA_TYPE** ARGOUTVIEWM_FARRAY3, DIM_TYPE* DIM1, DIM_TYPE* DIM2,
+                      DIM_TYPE* DIM3)
+ *
+ * Managed 3D Fortran-ordered output view; require_fortran adapts the
+ * new array for column-major data before it is returned.
+ */
+%typemap(in,numinputs=0)
+  (DATA_TYPE** ARGOUTVIEWM_FARRAY3, DIM_TYPE* DIM1    , DIM_TYPE* DIM2    , DIM_TYPE* DIM3    )
+  (DATA_TYPE* data_temp = NULL    , DIM_TYPE dim1_temp, DIM_TYPE dim2_temp, DIM_TYPE dim3_temp)
+{
+  $1 = &data_temp;
+  $2 = &dim1_temp;
+  $3 = &dim2_temp;
+  $4 = &dim3_temp;
+}
+%typemap(argout,
+         fragment="NumPy_Backward_Compatibility,NumPy_Array_Requirements,NumPy_Utilities")
+  (DATA_TYPE** ARGOUTVIEWM_FARRAY3, DIM_TYPE* DIM1, DIM_TYPE* DIM2, DIM_TYPE* DIM3)
+{
+  npy_intp dims[3] = { *$2, *$3, *$4 };
+  PyObject* obj = PyArray_SimpleNewFromData(3, dims, DATA_TYPECODE, (void*)(*$1));
+  PyArrayObject* array = (PyArrayObject*) obj;
+
+  if (!array || !require_fortran(array)) SWIG_fail;
+
+  /* The capsule owns the data pointer (*$1 is the data argument here). */
+%#ifdef SWIGPY_USE_CAPSULE
+    PyObject* cap = PyCapsule_New((void*)(*$1), SWIGPY_CAPSULE_NAME, free_cap);
+%#else
+    PyObject* cap = PyCObject_FromVoidPtr((void*)(*$1), free);
+%#endif
+
+%#if NPY_API_VERSION < 0x00000007
+  PyArray_BASE(array) = cap;
+%#else
+  PyArray_SetBaseObject(array,cap);
+%#endif
+
+  $result = SWIG_Python_AppendOutput($result,obj);
+}
+
+/* Typemap suite for (DIM_TYPE* DIM1, DIM_TYPE* DIM2, DIM_TYPE* DIM3,
+                      DATA_TYPE** ARGOUTVIEWM_FARRAY3)
+ */
+%typemap(in,numinputs=0)
+  (DIM_TYPE* DIM1    , DIM_TYPE* DIM2    , DIM_TYPE* DIM3    , DATA_TYPE** ARGOUTVIEWM_FARRAY3)
+  (DIM_TYPE dim1_temp, DIM_TYPE dim2_temp, DIM_TYPE dim3_temp, DATA_TYPE* data_temp = NULL    )
+{
+  $1 = &dim1_temp;
+  $2 = &dim2_temp;
+  $3 = &dim3_temp;
+  $4 = &data_temp;
+}
+%typemap(argout,
+         fragment="NumPy_Backward_Compatibility,NumPy_Array_Requirements,NumPy_Utilities")
+  (DIM_TYPE* DIM1, DIM_TYPE* DIM2, DIM_TYPE* DIM3, DATA_TYPE** ARGOUTVIEWM_FARRAY3)
+{
+  npy_intp dims[3] = { *$1, *$2, *$3 };
+  PyObject* obj = PyArray_SimpleNewFromData(3, dims, DATA_TYPECODE, (void*)(*$4));
+  PyArrayObject* array = (PyArrayObject*) obj;
+
+  if (!array || !require_fortran(array)) SWIG_fail;
+
+%#ifdef SWIGPY_USE_CAPSULE
+    PyObject* cap = PyCapsule_New((void*)(*$1), SWIGPY_CAPSULE_NAME, free_cap);
+%#else
+    PyObject* cap = PyCObject_FromVoidPtr((void*)(*$1), free);
+%#endif
+
+%#if NPY_API_VERSION < 0x00000007
+  PyArray_BASE(array) = cap;
+%#else
+  PyArray_SetBaseObject(array,cap);
+%#endif
+
+  $result = SWIG_Python_AppendOutput($result,obj);
+}
+
+/* Typemap suite for (DATA_TYPE** ARGOUTVIEWM_ARRAY4, DIM_TYPE* DIM1, DIM_TYPE* DIM2,
+                      DIM_TYPE* DIM3, DIM_TYPE* DIM4)
+ */
+%typemap(in,numinputs=0)
+  (DATA_TYPE** ARGOUTVIEWM_ARRAY4, DIM_TYPE* DIM1    , DIM_TYPE* DIM2    , DIM_TYPE* DIM3    , DIM_TYPE* DIM4    )
+  (DATA_TYPE* data_temp = NULL   , DIM_TYPE dim1_temp, DIM_TYPE dim2_temp, DIM_TYPE dim3_temp, DIM_TYPE dim4_temp)
+{
+  $1 = &data_temp;
+  $2 = &dim1_temp;
+  $3 = &dim2_temp;
+  $4 = &dim3_temp;
+  $5 = &dim4_temp;
+}
+%typemap(argout,
+         fragment="NumPy_Backward_Compatibility,NumPy_Utilities")
+  (DATA_TYPE** ARGOUTVIEWM_ARRAY4, DIM_TYPE* DIM1, DIM_TYPE* DIM2, DIM_TYPE* DIM3, DIM_TYPE* DIM4)
+{
+  npy_intp dims[4] = { *$2, *$3, *$4 , *$5 };
+  PyObject* obj = PyArray_SimpleNewFromData(4, dims, DATA_TYPECODE, (void*)(*$1));
+  PyArrayObject* array = (PyArrayObject*) obj;
+
+  if (!array) SWIG_fail;
+
+%#ifdef SWIGPY_USE_CAPSULE
+    PyObject* cap = PyCapsule_New((void*)(*$1), SWIGPY_CAPSULE_NAME, free_cap);
+%#else
+    PyObject* cap = PyCObject_FromVoidPtr((void*)(*$1), free);
+%#endif
+
+%#if NPY_API_VERSION < 0x00000007
+  PyArray_BASE(array) = cap;
+%#else
+  PyArray_SetBaseObject(array,cap);
+%#endif
+
+  $result = SWIG_Python_AppendOutput($result,obj);
+}
+
+/* Typemap suite for (DIM_TYPE* DIM1, DIM_TYPE* DIM2, DIM_TYPE* DIM3, DIM_TYPE* DIM4,
+                      DATA_TYPE** ARGOUTVIEWM_ARRAY4)
+ */
+%typemap(in,numinputs=0)
+  (DIM_TYPE* DIM1    , DIM_TYPE* DIM2    , DIM_TYPE* DIM3    , DIM_TYPE* DIM4    , DATA_TYPE** ARGOUTVIEWM_ARRAY4)
+  (DIM_TYPE dim1_temp, DIM_TYPE dim2_temp, DIM_TYPE dim3_temp, DIM_TYPE dim4_temp, DATA_TYPE* data_temp = NULL   )
+{
+  $1 = &dim1_temp;
+  $2 = &dim2_temp;
+  $3 = &dim3_temp;
+  $4 = &dim4_temp;
+  $5 = &data_temp;
+}
+%typemap(argout,
+         fragment="NumPy_Backward_Compatibility,NumPy_Utilities")
+  (DIM_TYPE* DIM1, DIM_TYPE* DIM2, DIM_TYPE* DIM3, DIM_TYPE* DIM4, DATA_TYPE** ARGOUTVIEWM_ARRAY4)
+{
+  npy_intp dims[4] = { *$1, *$2, *$3 , *$4 };
+  PyObject* obj = PyArray_SimpleNewFromData(4, dims, DATA_TYPECODE, (void*)(*$5));
+  PyArrayObject* array = (PyArrayObject*) obj;
+
+  if (!array) SWIG_fail;
+
+%#ifdef SWIGPY_USE_CAPSULE
+    PyObject* cap = PyCapsule_New((void*)(*$1), SWIGPY_CAPSULE_NAME, free_cap);
+%#else
+    PyObject* cap = PyCObject_FromVoidPtr((void*)(*$1), free);
+%#endif
+
+%#if NPY_API_VERSION < 0x00000007
+  PyArray_BASE(array) = cap;
+%#else
+  PyArray_SetBaseObject(array,cap);
+%#endif
+
+  $result = SWIG_Python_AppendOutput($result,obj);
+}
+
+/* Typemap suite for (DATA_TYPE** ARGOUTVIEWM_FARRAY4, DIM_TYPE* DIM1, DIM_TYPE* DIM2,
+                      DIM_TYPE* DIM3, DIM_TYPE* DIM4)
+ */
+%typemap(in,numinputs=0)
+  (DATA_TYPE** ARGOUTVIEWM_FARRAY4, DIM_TYPE* DIM1    , DIM_TYPE* DIM2    , DIM_TYPE* DIM3    , DIM_TYPE* DIM4    )
+  (DATA_TYPE* data_temp = NULL    , DIM_TYPE dim1_temp, DIM_TYPE dim2_temp, DIM_TYPE dim3_temp, DIM_TYPE dim4_temp)
+{
+  $1 = &data_temp;
+  $2 = &dim1_temp;
+  $3 = &dim2_temp;
+  $4 = &dim3_temp;
+  $5 = &dim4_temp;
+}
+%typemap(argout,
+         fragment="NumPy_Backward_Compatibility,NumPy_Array_Requirements,NumPy_Utilities")
+  (DATA_TYPE** ARGOUTVIEWM_FARRAY4, DIM_TYPE* DIM1, DIM_TYPE* DIM2, DIM_TYPE* DIM3)
+{
+  npy_intp dims[4] = { *$2, *$3, *$4 , *$5 };
+  PyObject* obj = PyArray_SimpleNewFromData(4, dims, DATA_TYPECODE, (void*)(*$1));
+  PyArrayObject* array = (PyArrayObject*) obj;
+
+  if (!array || !require_fortran(array)) SWIG_fail;
+
+%#ifdef SWIGPY_USE_CAPSULE
+    PyObject* cap = PyCapsule_New((void*)(*$1), SWIGPY_CAPSULE_NAME, free_cap);
+%#else
+    PyObject* cap = PyCObject_FromVoidPtr((void*)(*$1), free);
+%#endif
+
+%#if NPY_API_VERSION < 0x00000007
+  PyArray_BASE(array) = cap;
+%#else
+  PyArray_SetBaseObject(array,cap);
+%#endif
+
+  $result = SWIG_Python_AppendOutput($result,obj);
+}
+
+/* Typemap suite for (DIM_TYPE* DIM1, DIM_TYPE* DIM2, DIM_TYPE* DIM3, DIM_TYPE* DIM4,
+                      DATA_TYPE** ARGOUTVIEWM_FARRAY4)
+ */
+%typemap(in,numinputs=0)
+  (DIM_TYPE* DIM1    , DIM_TYPE* DIM2    , DIM_TYPE* DIM3    , DIM_TYPE* DIM4    , DATA_TYPE** ARGOUTVIEWM_FARRAY4)
+  (DIM_TYPE dim1_temp, DIM_TYPE dim2_temp, DIM_TYPE dim3_temp, DIM_TYPE dim4_temp, DATA_TYPE* data_temp = NULL    )
+{
+  $1 = &dim1_temp;
+  $2 = &dim2_temp;
+  $3 = &dim3_temp;
+  $4 = &dim4_temp;
+  $5 = &data_temp;
+}
+%typemap(argout,
+         fragment="NumPy_Backward_Compatibility,NumPy_Array_Requirements,NumPy_Utilities")
+  (DIM_TYPE* DIM1, DIM_TYPE* DIM2, DIM_TYPE* DIM3, DIM_TYPE* DIM4, DATA_TYPE** ARGOUTVIEWM_FARRAY4)
+{
+  npy_intp dims[4] = { *$1, *$2, *$3 , *$4 };
+  PyObject* obj = PyArray_SimpleNewFromData(4, dims, DATA_TYPECODE, (void*)(*$5));
+  PyArrayObject* array = (PyArrayObject*) obj;
+
+  if (!array || !require_fortran(array)) SWIG_fail;
+
+%#ifdef SWIGPY_USE_CAPSULE
+    PyObject* cap = PyCapsule_New((void*)(*$1), SWIGPY_CAPSULE_NAME, free_cap);
+%#else
+    PyObject* cap = PyCObject_FromVoidPtr((void*)(*$1), free);
+%#endif
+
+%#if NPY_API_VERSION < 0x00000007
+  PyArray_BASE(array) = cap;
+%#else
+  PyArray_SetBaseObject(array,cap);
+%#endif
+
+  $result = SWIG_Python_AppendOutput($result,obj);
+}
+
+/* Typemap suite for (DATA_TYPE** ARGOUTVIEWM_ARRAY4, DIM_TYPE* DIM1, DIM_TYPE* DIM2,
+                      DIM_TYPE* DIM3, DIM_TYPE* DIM4)
+ */
+%typemap(in,numinputs=0)
+  (DATA_TYPE** ARGOUTVIEWM_ARRAY4, DIM_TYPE* DIM1    , DIM_TYPE* DIM2    , DIM_TYPE* DIM3    , DIM_TYPE* DIM4    )
+  (DATA_TYPE* data_temp = NULL   , DIM_TYPE dim1_temp, DIM_TYPE dim2_temp, DIM_TYPE dim3_temp, DIM_TYPE dim4_temp)
+{
+  $1 = &data_temp;
+  $2 = &dim1_temp;
+  $3 = &dim2_temp;
+  $4 = &dim3_temp;
+  $5 = &dim4_temp;
+}
+%typemap(argout,
+         fragment="NumPy_Backward_Compatibility,NumPy_Utilities")
+  (DATA_TYPE** ARGOUTVIEWM_ARRAY4, DIM_TYPE* DIM1, DIM_TYPE* DIM2, DIM_TYPE* DIM3, DIM_TYPE* DIM4)
+{
+  npy_intp dims[4] = { *$2, *$3, *$4 , *$5 };
+  PyObject* obj = PyArray_SimpleNewFromData(4, dims, DATA_TYPECODE, (void*)(*$1));
+  PyArrayObject* array = (PyArrayObject*) obj;
+
+  if (!array) SWIG_fail;
+
+%#ifdef SWIGPY_USE_CAPSULE
+    PyObject* cap = PyCapsule_New((void*)(*$1), SWIGPY_CAPSULE_NAME, free_cap);
+%#else
+    PyObject* cap = PyCObject_FromVoidPtr((void*)(*$1), free);
+%#endif
+
+%#if NPY_API_VERSION < 0x00000007
+  PyArray_BASE(array) = cap;
+%#else
+  PyArray_SetBaseObject(array,cap);
+%#endif
+
+  $result = SWIG_Python_AppendOutput($result,obj);
+}
+
+/* Typemap suite for (DIM_TYPE* DIM1, DIM_TYPE* DIM2, DIM_TYPE* DIM3, DIM_TYPE* DIM4,
+                      DATA_TYPE** ARGOUTVIEWM_ARRAY4)
+ */
+%typemap(in,numinputs=0)
+  (DIM_TYPE* DIM1    , DIM_TYPE* DIM2    , DIM_TYPE* DIM3    , DIM_TYPE* DIM4    , DATA_TYPE** ARGOUTVIEWM_ARRAY4)
+  (DIM_TYPE dim1_temp, DIM_TYPE dim2_temp, DIM_TYPE dim3_temp, DIM_TYPE dim4_temp, DATA_TYPE* data_temp = NULL   )
+{
+  $1 = &dim1_temp;
+  $2 = &dim2_temp;
+  $3 = &dim3_temp;
+  $4 = &dim4_temp;
+  $5 = &data_temp;
+}
+%typemap(argout,
+         fragment="NumPy_Backward_Compatibility,NumPy_Utilities")
+  (DIM_TYPE* DIM1, DIM_TYPE* DIM2, DIM_TYPE* DIM3, DIM_TYPE* DIM4, DATA_TYPE** ARGOUTVIEWM_ARRAY4)
+{
+  npy_intp dims[4] = { *$1, *$2, *$3 , *$4 };
+  PyObject* obj = PyArray_SimpleNewFromData(4, dims, DATA_TYPECODE, (void*)(*$5));
+  PyArrayObject* array = (PyArrayObject*) obj;
+
+  if (!array) SWIG_fail;
+
+%#ifdef SWIGPY_USE_CAPSULE
+    PyObject* cap = PyCapsule_New((void*)(*$1), SWIGPY_CAPSULE_NAME, free_cap);
+%#else
+    PyObject* cap = PyCObject_FromVoidPtr((void*)(*$1), free);
+%#endif
+
+%#if NPY_API_VERSION < 0x00000007
+  PyArray_BASE(array) = cap;
+%#else
+  PyArray_SetBaseObject(array,cap);
+%#endif
+
+  $result = SWIG_Python_AppendOutput($result,obj);
+}
+
+/* Typemap suite for (DATA_TYPE** ARGOUTVIEWM_FARRAY4, DIM_TYPE* DIM1, DIM_TYPE* DIM2,
+                      DIM_TYPE* DIM3, DIM_TYPE* DIM4)
+ */
+%typemap(in,numinputs=0)
+  (DATA_TYPE** ARGOUTVIEWM_FARRAY4, DIM_TYPE* DIM1    , DIM_TYPE* DIM2    , DIM_TYPE* DIM3    , DIM_TYPE* DIM4    )
+  (DATA_TYPE* data_temp = NULL    , DIM_TYPE dim1_temp, DIM_TYPE dim2_temp, DIM_TYPE dim3_temp, DIM_TYPE dim4_temp)
+{
+  $1 = &data_temp;
+  $2 = &dim1_temp;
+  $3 = &dim2_temp;
+  $4 = &dim3_temp;
+  $5 = &dim4_temp;
+}
+%typemap(argout,
+         fragment="NumPy_Backward_Compatibility,NumPy_Array_Requirements,NumPy_Utilities")
+  (DATA_TYPE** ARGOUTVIEWM_FARRAY4, DIM_TYPE* DIM1, DIM_TYPE* DIM2, DIM_TYPE* DIM3, DIM_TYPE* DIM4)
+{
+  npy_intp dims[4] = { *$2, *$3, *$4 , *$5 };
+  PyObject* obj = PyArray_SimpleNewFromData(4, dims, DATA_TYPECODE, (void*)(*$1));
+  PyArrayObject* array = (PyArrayObject*) obj;
+
+  if (!array || !require_fortran(array)) SWIG_fail;
+
+%#ifdef SWIGPY_USE_CAPSULE
+    PyObject* cap = PyCapsule_New((void*)(*$1), SWIGPY_CAPSULE_NAME, free_cap);
+%#else
+    PyObject* cap = PyCObject_FromVoidPtr((void*)(*$1), free);
+%#endif
+
+%#if NPY_API_VERSION < 0x00000007
+  PyArray_BASE(array) = cap;
+%#else
+  PyArray_SetBaseObject(array,cap);
+%#endif
+
+  $result = SWIG_Python_AppendOutput($result,obj);
+}
+
+/* Typemap suite for (DIM_TYPE* DIM1, DIM_TYPE* DIM2, DIM_TYPE* DIM3, DIM_TYPE* DIM4,
+                      DATA_TYPE** ARGOUTVIEWM_FARRAY4)
+ */
+%typemap(in,numinputs=0)
+  (DIM_TYPE* DIM1    , DIM_TYPE* DIM2    , DIM_TYPE* DIM3    , DIM_TYPE* DIM4    , DATA_TYPE** ARGOUTVIEWM_FARRAY4)
+  (DIM_TYPE dim1_temp, DIM_TYPE dim2_temp, DIM_TYPE dim3_temp, DIM_TYPE dim4_temp, DATA_TYPE* data_temp = NULL    )
+{
+  $1 = &dim1_temp;
+  $2 = &dim2_temp;
+  $3 = &dim3_temp;
+  $4 = &dim4_temp;
+  $5 = &data_temp;
+}
+%typemap(argout,
+         fragment="NumPy_Backward_Compatibility,NumPy_Array_Requirements,NumPy_Utilities")
+  (DIM_TYPE* DIM1, DIM_TYPE* DIM2, DIM_TYPE* DIM3, DIM_TYPE* DIM4, DATA_TYPE** ARGOUTVIEWM_FARRAY4)
+{
+  npy_intp dims[4] = { *$1, *$2, *$3 , *$4 };
+  PyObject* obj = PyArray_SimpleNewFromData(4, dims, DATA_TYPECODE, (void*)(*$5));
+  PyArrayObject* array = (PyArrayObject*) obj;
+
+  if (!array || !require_fortran(array)) SWIG_fail;
+
+%#ifdef SWIGPY_USE_CAPSULE
+    PyObject* cap = PyCapsule_New((void*)(*$1), SWIGPY_CAPSULE_NAME, free_cap);
+%#else
+    PyObject* cap = PyCObject_FromVoidPtr((void*)(*$1), free);
+%#endif
+
+%#if NPY_API_VERSION < 0x00000007
+  PyArray_BASE(array) = cap;
+%#else
+  PyArray_SetBaseObject(array,cap);
+%#endif
+
+  $result = SWIG_Python_AppendOutput($result,obj);
+}
+
+%enddef    /* %numpy_typemaps() macro */
+/* *************************************************************** */
+
+/* Concrete instances of the %numpy_typemaps() macro: Each invocation
+ * below applies all of the typemaps above to the specified data type.
+ */
+%numpy_typemaps(signed char       , NPY_BYTE     , int)
+%numpy_typemaps(unsigned char     , NPY_UBYTE    , int)
+%numpy_typemaps(short             , NPY_SHORT    , int)
+%numpy_typemaps(unsigned short    , NPY_USHORT   , int)
+%numpy_typemaps(int               , NPY_INT      , int)
+%numpy_typemaps(unsigned int      , NPY_UINT     , int)
+%numpy_typemaps(long              , NPY_LONG     , int)
+%numpy_typemaps(unsigned long     , NPY_ULONG    , int)
+%numpy_typemaps(long long         , NPY_LONGLONG , int)
+%numpy_typemaps(unsigned long long, NPY_ULONGLONG, int)
+%numpy_typemaps(float             , NPY_FLOAT    , int)
+%numpy_typemaps(double            , NPY_DOUBLE   , int)
+
+/* ***************************************************************
+ * The follow macro expansion does not work, because C++ bool is 4
+ * bytes and NPY_BOOL is 1 byte
+ *
+ *    %numpy_typemaps(bool, NPY_BOOL, int)
+ */
+
+/* ***************************************************************
+ * On my Mac, I get the following warning for this macro expansion:
+ * 'swig/python detected a memory leak of type 'long double *', no destructor found.'
+ *
+ *    %numpy_typemaps(long double, NPY_LONGDOUBLE, int)
+ */
+
+/* ***************************************************************
+ * Swig complains about a syntax error for the following macro
+ * expansions:
+ *
+ *    %numpy_typemaps(complex float,  NPY_CFLOAT , int)
+ *
+ *    %numpy_typemaps(complex double, NPY_CDOUBLE, int)
+ *
+ *    %numpy_typemaps(complex long double, NPY_CLONGDOUBLE, int)
+ */
+
+#endif /* SWIGPYTHON */
diff --git a/include/singa/io/imagefolder_store.h b/src/python/swig/singa.i
similarity index 79%
rename from include/singa/io/imagefolder_store.h
rename to src/python/swig/singa.i
index c05d92d..12f46f3 100644
--- a/include/singa/io/imagefolder_store.h
+++ b/src/python/swig/singa.i
@@ -18,4 +18,14 @@
 * under the License.
 *
 *************************************************************/
-// TODO(wangwei) store images in a disk folder
+
+/*interface file for swig */
+
+%module singa_wrap
+%include "config.i"
+%include "core_tensor.i"
+%include "core_device.i"
+%include "model_layer.i"
+%include "model_optimizer.i"
+%include "model_loss.i"
+%include "model_metric.i"
diff --git a/src/server.cc b/src/server.cc
deleted file mode 100644
index 3b72243..0000000
--- a/src/server.cc
+++ /dev/null
@@ -1,259 +0,0 @@
-/************************************************************
-*
-* Licensed to the Apache Software Foundation (ASF) under one
-* or more contributor license agreements.  See the NOTICE file
-* distributed with this work for additional information
-* regarding copyright ownership.  The ASF licenses this file
-* to you under the Apache License, Version 2.0 (the
-* "License"); you may not use this file except in compliance
-* with the License.  You may obtain a copy of the License at
-*
-*   http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an
-* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-* KIND, either express or implied.  See the License for the
-* specific language governing permissions and limitations
-* under the License.
-*
-*************************************************************/
-
-#include "singa/server.h"
-
-#include <thread>
-#include <chrono>
-#include "mshadow/tensor.h"
-#include "singa/proto/common.pb.h"
-#include "singa/utils/param.h"
-#include "singa/utils/singleton.h"
-#include "singa/utils/factory.h"
-#include "singa/utils/cluster.h"
-
-namespace singa {
-
-using namespace mshadow;
-using std::vector;
-
-Server::Server(int group_id, int server_id,
-    const JobProto& job_conf,
-    const vector<int>& slice2group,
-    const vector<int>& slice2server) {
-  grp_id_ = group_id;
-  id_ = server_id;
-  updater_ = Updater::Create(job_conf.updater());
-  slice2group_ = slice2group;
-  slice2server_ = slice2server;
-  dealer_ = new Dealer(Addr(grp_id_, id_, kServer));
-}
-
-Server::~Server() {
-  delete updater_;
-  // free Params (i.e., slices) in server shard
-  for (auto entry : shard_)
-    for (auto param : entry.second->shares)
-      delete param;
-  delete dealer_;
-}
-
-void Stop(void* running) {
-  *static_cast<bool *>(running) = false;
-}
-
-void Server::Run() {
-  LOG(ERROR) << "Server (group = " << grp_id_ <<", id = " << id_ << ") start";
-  auto cluster = Cluster::Get();
-  if (cluster->nserver_groups()) {
-    CHECK_GT(slice2group_.size(), 0);
-    if (cluster->nservers_per_group()) {
-      CHECK_GT(slice2server_.size(), 0);
-    }
-  }
-  n_updates_.resize(slice2group_.size(), 0);
-  n_pending_sync_.resize(slice2group_.size(), 0);
-  last_sync_.resize(slice2group_.size());
-
-  bool running = true;
-  CHECK(cluster->runtime()->WatchSGroup(grp_id_, id_, Stop, &running));
-  // start recv loop and process requests
-  while (running) {
-    // cannot use blocking Receive() here, it will get stuck after workers stop.
-    Msg* msg = dealer_->Receive(cluster->poll_time());
-    if (msg == nullptr)
-      continue;
-    Msg* response = nullptr;
-    int type = msg->type();
-    int slice_id = SliceID(msg->trgt_val());
-    if (type == kPut) {
-      response = HandlePut(&msg);
-    } else if (shard_.find(slice_id) == shard_.end()) {
-      // TODO(wangsh): buffer the msg instead, and process it after the
-      //               corresponding put request is done
-      // delay the processing by re-queue the msg. May sleep for a while?
-      response = msg;
-    } else {
-      switch (type) {
-        case kGet:
-          response = HandleGet(&msg);
-          break;
-        case kUpdate:
-          for (auto reply : HandleUpdate(&msg))
-            dealer_->Send(&reply);
-          break;
-        case kSyncRequest:
-          response = HandleSyncRequest(&msg);
-          break;
-        case kSyncResponse:
-          HandleSyncResponse(&msg);
-          break;
-        default:
-          LOG(ERROR) << "Unknown message type: " << type;
-          break;
-      }
-    }
-    if (response != nullptr)
-      dealer_->Send(&response);
-  }
-
-  // send stop msg to stub
-  Msg* msg = new Msg(Addr(grp_id_, id_, kServer), Addr(-1, -1, kStub));
-  msg->set_type(kStop);
-  dealer_->Send(&msg);
-  std::this_thread::sleep_for(std::chrono::milliseconds(1000));
-  LOG(ERROR) << "Server (group = " << grp_id_ << ", id = " << id_ << ") stops";
-}
-
-Msg* Server::HandlePut(Msg **msg) {
-  int version = (*msg)->trgt_version();
-  int slice_id = SliceID((*msg)->trgt_val());
-  if (shard_.find(slice_id) != shard_.end())
-    LOG(FATAL) << "Param (" << slice_id << ") is put more than once";
-
-  // TODO(wangwei) replace hard coded param type 0
-  auto  param = Singleton<Factory<Param>>::Instance()->Create(0);
-  auto response = param->HandlePutMsg(msg, true);
-  // parse num of shares of this param from a worker group
-  int num_shares = 1;
-  if ((*msg)->NextFrame())
-    (*msg)->ParseFormatFrame("i", &num_shares);
-  DeleteMsg(msg);
-  shard_[slice_id] = new ParamEntry(num_shares, param);
-  // must set version after HandlePutMsg which allocates the memory
-  param->set_version(version);
-  param->set_last_version(version);
-  param->set_id(slice_id);
-  // allocate blob for param sync between groups.
-  if (slice2group_[slice_id] != grp_id_) {
-    last_sync_[slice_id].ReshapeLike(param->data());
-    last_sync_[slice_id].CopyFrom(param->data());
-  }
-  LOG(INFO) << "server (group = " << grp_id_ << ", id = " << id_
-            <<") put slice=" << slice_id << " size=" << param->size();
-  return response;
-}
-
-Msg* Server::HandleGet(Msg **msg) {
-  int val = (*msg)->trgt_val();
-  auto param = shard_.at(SliceID(val))->shares.at(0);
-  // re-queue the request if the param is not updated to the required version
-  if (param->version() < (*msg)->trgt_version()) {
-    return *msg;
-  } else {
-    // LOG(ERROR) << "get " << slice << " from "<<(*msg)->src_first();
-    auto reply = param->HandleGetMsg(msg, false);
-    reply->set_trgt(val, param->version());
-    return reply;
-  }
-}
-
-const vector<Msg*> Server::HandleUpdate(Msg **msg) {
-  vector<Msg*> ret;
-  int sliceid = SliceID((*msg)->trgt_val());
-  auto entry = shard_.at(sliceid);
-  buffer_requests_[sliceid].push_back(*msg);
-  int num_update;
-  (*msg)->LastFrame();
-  (*msg)->ParseFormatFrame("i", &num_update);
-  (*msg)->FirstFrame();
-  entry->num_update += num_update;
-  // LOG(ERROR) << "update "<< sliceid << " from " << AddrGrp((*msg)->src())
-  //            << ", " << num_update << " total " << entry->num_total;
-  // do update until recv gradients from all shares of this param/slice
-  if (entry->num_update >= entry->num_total) {
-    CHECK_EQ(entry->num_update, entry->num_total);
-    auto& request = buffer_requests_.at(sliceid);
-    int step = (*msg)->trgt_version();
-    int trgt_val = (*msg)->trgt_val();
-    auto param = entry->shares.at(0);
-    // extract and aggregate gradients
-    param->ParseUpdateMsgs(request);
-    // DLOG(ERROR) << "update param " << param->id() << " @ step " << step;
-    updater_->Update(step, param, 1.0f / entry->num_total);
-    param->set_version(param->version() + 1);
-    // response to all shares of this param
-    for (auto response : param->GenUpdateResponseMsgs(&request, false)) {
-      response->set_trgt(trgt_val, param->version());
-      ret.push_back(response);
-    }
-    entry->num_update = 0;
-    n_updates_[sliceid]++;
-    // sync with master group after at least sync_freq local updates
-    // the last check is to avoid sending msg to stopped servers
-    // may send the update steps on this server since last sync, i.e.,
-    // version-last_version
-    if (slice2group_[sliceid] != grp_id_
-        && n_updates_[sliceid] >= Cluster::Get()->sync_freq()
-        && n_pending_sync_[sliceid] <= Cluster::Get()->sync_freq()) {
-      auto shape = Shape1(param->size());
-      Tensor<cpu, 1> tmp(last_sync_[sliceid].mutable_cpu_data(), shape);
-      Tensor<cpu, 1> cur(param->mutable_cpu_data(), shape);
-      tmp = cur - tmp;
-      int addr = Addr(slice2group_[sliceid], slice2server_[sliceid], kServer);
-      Msg* sync = new Msg(Addr(grp_id_, id_, kServer), addr);
-      sync->set_type(kSyncRequest);
-      sync->set_trgt(trgt_val, param->version());
-      sync->AddFrame(tmp.dptr, param->size() * sizeof(float));
-      Copy(tmp, cur);
-      ret.push_back(sync);
-      n_updates_[sliceid] = 0;
-      n_pending_sync_[sliceid]++;
-    }
-  }
-  // message already pushed to buffer, just need to reset the pointer
-  *msg = nullptr;
-  return ret;
-}
-
-Msg* Server::HandleSyncRequest(Msg **msg) {
-  Msg* msgg = *msg;
-  int slice = SliceID(msgg->trgt_val());
-  auto param = shard_.at(slice)->shares.at(0);
-  auto shape = Shape1(param->size());
-  CHECK_EQ(msgg->FrameSize(), param->size()*sizeof(float));
-  Tensor<cpu, 1> inc(static_cast<float*>(msgg->FrameData()), shape);
-  Tensor<cpu, 1> cur(param->mutable_cpu_data(), shape);
-  // recv sync msg on the slice I am maintaining
-  cur += inc;
-  msgg->SwapAddr();
-  msgg->set_type(kSyncResponse);
-  // copy the fresh param value into the response msg
-  Copy(inc, cur);
-  return msgg;
-}
-
-// recv sync msg on slice mastered by others
-void Server::HandleSyncResponse(Msg **msg) {
-  Msg* msgg = *msg;
-  int slice = SliceID(msgg->trgt_val());
-  auto param = shard_.at(slice)->shares.at(0);
-  auto shape = Shape1(param->size());
-  Tensor<cpu, 1> prev(last_sync_[param->id()].mutable_cpu_data(), shape);
-  Tensor<cpu, 1> cur(param->mutable_cpu_data(), shape);
-  Tensor<cpu, 1> master(static_cast<float*>(msgg->FrameData()), shape);
-  cur += master - prev;  // cur = master + (cur - prev);
-  Copy(prev, cur);
-  DeleteMsg(msg);
-  n_pending_sync_[slice]--;
-}
-
-}  // namespace singa
diff --git a/src/stub.cc b/src/stub.cc
deleted file mode 100644
index 84c1f8b..0000000
--- a/src/stub.cc
+++ /dev/null
@@ -1,282 +0,0 @@
-/************************************************************
-*
-* Licensed to the Apache Software Foundation (ASF) under one
-* or more contributor license agreements.  See the NOTICE file
-* distributed with this work for additional information
-* regarding copyright ownership.  The ASF licenses this file
-* to you under the Apache License, Version 2.0 (the
-* "License"); you may not use this file except in compliance
-* with the License.  You may obtain a copy of the License at
-*
-*   http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an
-* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-* KIND, either express or implied.  See the License for the
-* specific language governing permissions and limitations
-* under the License.
-*
-*************************************************************/
-
-#include "singa/stub.h"
-
-#include <glog/logging.h>
-#include <unistd.h>
-#include <map>
-#include <thread>
-#include <set>
-#include "singa/proto/common.pb.h"
-#include "singa/utils/cluster.h"
-#include "singa/utils/common.h"
-#include "singa/utils/tinydir.h"
-#include "singa/utils/math_blob.h"
-
-namespace singa {
-
-using std::vector;
-using std::string;
-
-/***********************Stub****************************/
-Stub::~Stub() {
-  delete router_;
-}
-/**
- * Get a hash id for a Param object from a group.
- *
- * Simple multiple group_id with a large prime number 997 (assuming there are
- * no more than 997 worker groups) and plus owner param id.
- */
-inline int Hash(int grp_id, int param_id) {
-  return grp_id * 997 + param_id;
-}
-const std::unordered_map<int, ParamEntry*>  CreateParamShard(
-    const vector<Worker*>& workers) {
-  std::unordered_map<int, ParamEntry*> shard;
-  // grp id -> net
-  std::unordered_map<int, NeuralNet*> grp2net;
-  // grp id -> worker id range
-  std::unordered_map<int, std::pair<int, int>> grp2workers;
-  for (auto worker : workers) {
-    int grp = worker->grp_id(), id = worker->id();
-    if (grp2net.find(grp) == grp2net.end()) {
-      grp2net[grp] = worker->train_net();
-      grp2workers[grp] = std::make_pair(id, id + 1);
-    } else {
-      CHECK_EQ(grp2net[grp], worker->train_net());
-      int start = grp2workers[grp].first, end = grp2workers[grp].second;
-      if (start > id) start = id;
-      if (end < id + 1) end = id + 1;
-      grp2workers[grp] = std::make_pair(start, end);
-    }
-  }
-
-  for (const auto entry : grp2net) {
-    int grp = entry.first;
-    int wstart = grp2workers[grp].first, wend = grp2workers[grp].second;
-    for (auto layer : entry.second->layers()) {
-      if (layer->unroll_index() > 0)
-        continue;
-      int partition = layer->partition_id();
-      bool local =  partition >= wstart && partition < wend;
-      for (auto param : layer->GetParams()) {
-        int hash = Hash(grp, param->owner());
-        if (shard.find(hash) == shard.end())
-          shard[hash] = new ParamEntry();
-        shard[hash]->AddParam(local, param);
-      }
-    }
-  }
-  return shard;
-}
-
-void Stub::Run(const vector<int>& slice2server,
-    const vector<Worker*>& workers, const vector<Server*>& servers) {
-  slice2server_ = slice2server;
-  int nworkers = workers.size(), nservers = servers.size();
-  auto cluster = Cluster::Get();
-  int procs_id = cluster->procs_id();
-  LOG(INFO) << "Stub in process " << procs_id << " starts";
-  auto shard = CreateParamShard(workers);
-  std::map<int, Dealer*> inter_dealers;  // for sending msg to other procs
-  std::queue<Msg*> msg_queue;
-  while (true) {
-    Msg* msg = nullptr;
-    if (msg_queue.empty()) {
-      msg = router_->Receive();
-    } else {
-      msg = msg_queue.front();
-      msg_queue.pop();
-    }
-//    LOG(ERROR) << "stub recv msg " << msg;
-    int type = msg->type(), dst = msg->dst(), flag = AddrType(dst);
-    if (flag == kStub && (AddrProc(dst) == procs_id || AddrGrp(dst) == -1)) {
-      //  the following statements are ordered!
-      if (type == kConnect) {
-        DeleteMsg(&msg);
-      } else if (type == kStop) {
-        int src_flag = AddrType(msg->src());
-        if (src_flag == kServer) nservers--;
-        else if (src_flag == kWorkerParam) nworkers--;
-        DeleteMsg(&msg);
-        if (nworkers == 0 && nservers == 0) break;
-      } else {
-        int grp;
-        int paramid = ParamID(msg->trgt_val());
-        ParamEntry *entry = nullptr;
-        switch (type) {
-          case kUpdate:
-            grp = AddrGrp(msg->src());
-            entry = shard.at(Hash(grp, paramid));
-            for (auto update_msg : HandleUpdateRequest(entry, &msg))
-              msg_queue.push(update_msg);
-            break;
-          case kRUpdate:
-            grp = AddrGrp(msg->dst());
-            entry = shard.at(Hash(grp, paramid));
-            HandleUpdateResponse(entry, &msg);
-            break;
-          case kGet:
-            grp = AddrGrp(msg->src());
-            entry = shard.at(Hash(grp, paramid));
-            for (auto get_msg : HandleGetRequest(entry, &msg))
-              msg_queue.push(get_msg);
-            break;
-          case kRGet:
-            grp = AddrGrp(msg->dst());
-            entry = shard.at(Hash(grp, paramid));
-            HandleGetResponse(entry, &msg);
-            break;
-          case kPut:
-            grp = AddrGrp(msg->src());
-            entry = shard.at(Hash(grp, paramid));
-            for (auto put_msg : HandlePutRequest(entry, &msg))
-              msg_queue.push(put_msg);
-            break;
-          default:
-            LOG(ERROR) << "Unknow message type:" << type;
-            break;
-        }
-      }
-    } else {
-      int dst_procs = AddrProc(dst);
-      if (flag != kStub)
-        dst_procs = cluster->ProcsIDOf(AddrGrp(dst), AddrID(dst), flag);
-      if (dst_procs != procs_id) {
-        if (inter_dealers.find(dst_procs) == inter_dealers.end())
-          inter_dealers[dst_procs] = CreateInterProcsDealer(dst_procs);
-        inter_dealers[dst_procs]->Send(&msg);
-      } else {
-//        LOG(ERROR) << "router send msg " << msg;
-        router_->Send(&msg);
-      }
-    }
-  }
-  LOG(ERROR) << "Stub in process " << procs_id << " stops";
-  for (auto& entry : inter_dealers)
-    delete entry.second;
-}
-
-Dealer* Stub::CreateInterProcsDealer(int dst_procs) {
-  // forward to other procs
-  auto cluster = Cluster::Get();
-  auto dealer = new Dealer(-2);
-  while (cluster->endpoint(dst_procs) == "") {
-    // kCollectSleepTime));
-    std::this_thread::sleep_for(std::chrono::milliseconds(3000));
-    LOG(ERROR) << "waiting for procs " << dst_procs << " to register";
-  }
-  dealer->Connect("tcp://"+cluster->endpoint(dst_procs));
-  return dealer;
-}
-
-void Stub::GenMsgs(int type, int version, ParamEntry* entry, Msg* msg,
-                      vector<Msg*> *ret) {
-  int procs_id = Cluster::Get()->procs_id();
-  int src_grp = AddrGrp(msg->src());
-  int dst_grp = src_grp / Cluster::Get()->nworker_groups_per_server_group();
-  auto param = entry->shares.at(0);
-  for (int idx = 0 ; idx < param->num_slices(); idx++) {
-    int slice_id = param->slice_start() + idx;
-    int server = slice2server_[slice_id];
-    int dst_procs = Cluster::Get()->ProcsIDOf(dst_grp, server, kServer);
-    Msg* new_msg = nullptr;
-    if (type == kPut) {
-      CHECK_GT(entry->num_total, 0);
-      new_msg = param->GenPutMsg(dst_procs != procs_id, idx);
-      new_msg->AddFormatFrame("i", entry->num_total);
-    } else if (type == kGet) {
-      new_msg = param->GenGetMsg(dst_procs != procs_id, idx);
-    } else if (type == kUpdate) {
-      new_msg = param->GenUpdateMsg(dst_procs != procs_id, idx);
-      new_msg->AddFormatFrame("i", entry->num_local);
-    } else {
-      LOG(FATAL) << "Wrong type";
-    }
-    new_msg->set_trgt(ParamTrgt(param->owner(), slice_id), version);
-    new_msg->set_src(Addr(src_grp, procs_id, kStub));
-    new_msg->set_dst(Addr(dst_grp, server, kServer));
-    ret->push_back(new_msg);
-//    LOG(ERROR) << "stub gen msg " << new_msg;
-  }
-}
-
-const vector<Msg*> Stub::HandleGetRequest(ParamEntry* entry, Msg** msg) {
-  vector<Msg*> ret;
-  int version = (*msg)->trgt_version();
-  if (version > entry->next_version) {
-    entry->next_version = version;
-    GenMsgs(kGet, version, entry, *msg, &ret);
-  }
-  DeleteMsg(msg);
-  return ret;
-}
-
-const vector<Msg*> Stub::HandleUpdateRequest(ParamEntry *entry, Msg** msg) {
-  vector<Msg*> ret;
-  entry->num_update++;
-  if (entry->num_update >= entry->num_local) {
-    // average local gradient
-    if (entry->num_local > 1) {
-      auto it = entry->shares.begin();
-      auto sum = it;
-      for (++it; it != entry->shares.end(); it++) {
-        AXPY(1.0f, (*it)->grad(), (*sum)->mutable_grad());
-      }
-    }
-    int step = (*msg)->trgt_version();
-    GenMsgs(kUpdate, step, entry, *msg, &ret);
-    entry->num_update = 0;
-  }
-  DeleteMsg(msg);
-  return ret;
-}
-
-const vector<Msg*> Stub::HandlePutRequest(ParamEntry* entry, Msg** msg) {
-  vector<Msg*> ret;
-  int version = (*msg)->trgt_version();
-  GenMsgs(kPut, version, entry, *msg, &ret);
-  DeleteMsg(msg);
-  return ret;
-}
-
-void Stub::HandleGetResponse(ParamEntry* entry, Msg** msg) {
-  int version = (*msg)->trgt_version();
-  int sliceid = SliceID((*msg)->trgt_val());
-  auto param = entry->shares.at(0);
-  if (param->ParseGetResponseMsg(*msg, sliceid-param->slice_start()))
-    for (auto *p : entry->shares)
-      p->set_version(version);
-  DeleteMsg(msg);
-}
-
-void Stub::HandleUpdateResponse(ParamEntry* entry, Msg** msg) {
-  int version = (*msg)->trgt_version();
-  int sliceid = SliceID((*msg)->trgt_val());
-  auto param = entry->shares.at(0);
-  if (param->ParseUpdateResponseMsg(*msg, sliceid-param->slice_start()))
-    for (auto *p : entry->shares)
-      p->set_version(version);
-  DeleteMsg(msg);
-}
-}  // namespace singa
diff --git a/src/test/test_cluster.cc b/src/test/test_cluster.cc
deleted file mode 100644
index cd57991..0000000
--- a/src/test/test_cluster.cc
+++ /dev/null
@@ -1,143 +0,0 @@
-/************************************************************
-*
-* Licensed to the Apache Software Foundation (ASF) under one
-* or more contributor license agreements.  See the NOTICE file
-* distributed with this work for additional information
-* regarding copyright ownership.  The ASF licenses this file
-* to you under the Apache License, Version 2.0 (the
-* "License"); you may not use this file except in compliance
-* with the License.  You may obtain a copy of the License at
-* 
-*   http://www.apache.org/licenses/LICENSE-2.0
-* 
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an
-* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-* KIND, either express or implied.  See the License for the
-* specific language governing permissions and limitations
-* under the License.
-*
-*************************************************************/
-
-#include "gtest/gtest.h"
-#include "singa/utils/cluster.h"
-
-using namespace singa;
-
-std::string host = "localhost:2181";
-
-void zk_cb(void *contest) {
-  LOG(INFO) << "zk callback: " << static_cast<char *>(contest);
-}
-/*
-TEST(CluserRuntimeTest, GroupManagement) {
-  ClusterRuntime* rt = new ZKClusterRT(host);
-  ASSERT_EQ(rt->Init(), true);
-  ASSERT_EQ(rt->WatchSGroup(1, 1, zk_cb, "test call back"), true);
-  ASSERT_EQ(rt->JoinSGroup(1, 1, 1), true);
-  ASSERT_EQ(rt->JoinSGroup(1, 2, 1), true);
-  ASSERT_EQ(rt->LeaveSGroup(1, 2, 1), true);
-  ASSERT_EQ(rt->LeaveSGroup(1, 1, 1), true);
-  sleep(3);
-  delete rt;
-}
-
-TEST(CluserRuntimeTest, ProcessManagement) {
-  ClusterRuntime* rt = new ZKClusterRT(host);
-  ASSERT_EQ(rt->Init(), true);
-  ASSERT_EQ(rt->RegistProc("1.2.3.4:5"), 0);
-  ASSERT_EQ(rt->RegistProc("1.2.3.4:6"), 1);
-  ASSERT_EQ(rt->RegistProc("1.2.3.4:7"), 2);
-  ASSERT_NE(rt->GetProcHost(0), "");
-  ASSERT_NE(rt->GetProcHost(1), "");
-  ASSERT_NE(rt->GetProcHost(2), "");
-  sleep(3);
-  delete rt;
-}
-
-ClusterProto GenClusterProto(){
-  ClusterProto proto;
-  int nworker=6, nserver=4;
-  proto.set_nworkers(nworker);
-  proto.set_nservers(nserver);
-  proto.set_nworkers_per_group(3);
-  proto.set_nservers_per_group(2);
-  proto.set_nthreads_per_worker(1);
-  proto.set_nthreads_per_server(2);
-
-  proto.set_hostfile(folder+"/hostfile");
-
-  std::ofstream fout(folder+"/hostfile", std::ofstream::out);
-  for(int i=0;i<nworker+nserver;i++){
-    char tmp[20];
-    sprintf(tmp, "awan-0-%02d-0", i);
-    fout<<tmp<<std::endl;
-  }
-  fout.flush();
-  fout.close();
-  return proto;
-}
-
-TEST(ClusterTest, NoServer){
-  ClusterProto proto=GenClusterProto();
-  proto.set_nservers(0);
-  auto cluster=Cluster::Get(proto, 0);
-  ASSERT_EQ(proto.nworkers(),cluster->nworkers());
-  ASSERT_EQ(0, cluster->nservers());
-  ASSERT_EQ(proto.nworkers_per_group(),cluster->nworkers_per_group());
-  ASSERT_EQ(proto.nservers_per_group(),cluster->nservers_per_group());
-  ASSERT_FALSE(cluster->AmIServer());
-  ASSERT_TRUE(cluster->AmIWorker());
-  ASSERT_EQ(0,cluster->group_procs_id());
-  ASSERT_EQ(0,cluster->group_id());
-  ASSERT_EQ(2, cluster->nworker_groups());
-  ASSERT_EQ(0, cluster->nserver_groups());
-  ASSERT_STREQ("awan-0-00-0", cluster->host_addr().c_str());
-
-  cluster=Cluster::Get(proto, 5);
-  ASSERT_EQ(2,cluster->group_procs_id());
-  ASSERT_EQ(1,cluster->group_id());
-  ASSERT_EQ(2, cluster->nworker_groups());
-  ASSERT_EQ(0, cluster->nserver_groups());
-  ASSERT_STREQ("awan-0-05-0", cluster->host_addr().c_str());
-}
-
-TEST(ClusterTest, SingleServerGroup){
-  ClusterProto proto=GenClusterProto();
-  proto.set_nservers(2);
-  auto cluster=Cluster::Get(proto, 3);
-  ASSERT_FALSE(cluster->AmIServer());
-  ASSERT_TRUE(cluster->AmIWorker());
-  ASSERT_EQ(0,cluster->group_procs_id());
-  ASSERT_EQ(1,cluster->group_id());
-  ASSERT_EQ(2, cluster->nworker_groups());
-  ASSERT_EQ(1, cluster->nserver_groups());
-  ASSERT_STREQ("awan-0-03-0", cluster->host_addr().c_str());
-
-  cluster=Cluster::Get(proto, 7);
-  ASSERT_EQ(1,cluster->group_procs_id());
-  ASSERT_EQ(0,cluster->group_id());
-  ASSERT_EQ(2, cluster->nworker_groups());
-  ASSERT_EQ(1, cluster->nserver_groups());
-  ASSERT_STREQ("awan-0-07-0", cluster->host_addr().c_str());
-}
-
-TEST(ClusterTest, MultiServerGroups){
-  ClusterProto proto=GenClusterProto();
-  auto cluster=Cluster::Get(proto, 7);
-  ASSERT_EQ(1,cluster->group_procs_id());
-  ASSERT_EQ(0,cluster->group_id());
-  ASSERT_EQ(2, cluster->nworker_groups());
-  ASSERT_EQ(2, cluster->nserver_groups());
-  ASSERT_STREQ("awan-0-07-0", cluster->host_addr().c_str());
-
-  cluster=Cluster::Get(proto, 8);
-  ASSERT_TRUE(cluster->AmIServer());
-  ASSERT_FALSE(cluster->AmIWorker());
-  ASSERT_EQ(0,cluster->group_procs_id());
-  ASSERT_EQ(1,cluster->group_id());
-  ASSERT_EQ(2, cluster->nworker_groups());
-  ASSERT_EQ(2, cluster->nserver_groups());
-  ASSERT_STREQ("awan-0-08-0", cluster->host_addr().c_str());
-}
-**/
diff --git a/src/test/test_common.cc b/src/test/test_common.cc
deleted file mode 100644
index 4c33eb6..0000000
--- a/src/test/test_common.cc
+++ /dev/null
@@ -1,133 +0,0 @@
-/************************************************************
-*
-* Licensed to the Apache Software Foundation (ASF) under one
-* or more contributor license agreements.  See the NOTICE file
-* distributed with this work for additional information
-* regarding copyright ownership.  The ASF licenses this file
-* to you under the Apache License, Version 2.0 (the
-* "License"); you may not use this file except in compliance
-* with the License.  You may obtain a copy of the License at
-*
-*   http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an
-* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-* KIND, either express or implied.  See the License for the
-* specific language governing permissions and limitations
-* under the License.
-*
-*************************************************************/
-
-#include <string>
-#include <unordered_map>
-#include <vector>
-#include "gtest/gtest.h"
-#include "singa/utils/common.h"
-
-using std::string;
-using std::vector;
-using namespace singa;
-
-TEST(CommonTest, TestIntVecToString) {
-  vector<int> num_vec {2, 3, 5, 7, 11};
-  string str = "(2, 3, 5, 7, 11, )";
-  ASSERT_EQ(str, IntVecToString(num_vec));
-}
-
-TEST(CommonTest, TestStringPrintf) {
-  const char* str_a = "abc";
-  const char* str_b = "edfgh";
-  const char* str_c = " !@#";
-  const char* str_d = "1";
-  const char* str_e = "2";
-  const char* str_f = "3";
-
-  string fmt_a = "%s%s%s";
-  string fmt_b = "[%s] [%s] [%s] ";
-
-  string str_d_a = "abcedfgh !@#";
-  string str_d_b = "[1] [2] [3] ";
-
-  ASSERT_EQ(str_d_a, StringPrintf(fmt_a, str_a, str_b, str_c));
-  ASSERT_EQ(str_d_b, StringPrintf(fmt_b, str_d, str_e, str_f));
-}
-
-TEST(CommonTest, TestGCDLCM) {
-  int a = 2, b = 5, c = 10, d = 15;
-
-  ASSERT_EQ(1, gcd(a, b));
-  ASSERT_EQ(5, gcd(c, d));
-  ASSERT_EQ(10, LeastCommonMultiple(b, c));
-  ASSERT_EQ(30, LeastCommonMultiple(c, d));
-}
-
-TEST(CommonTest, TestMetric) {
-  string str, msg;
-  Metric metric;
-  metric.Add("a", 0.5);
-  metric.Add("b", 0.5);
-  metric.Add("a", 1.5);
-  str = metric.ToLogString();
-  msg = metric.ToString();
-  metric.Reset();
-  metric.ParseFrom(msg);
-  ASSERT_EQ(str, metric.ToLogString());
-}
-
-TEST(CommonTest, TestSlice) {
-  vector<vector<int>> slices_0;
-  vector<int> sizes {14112, 96, 256, 884736, 384};
-  ASSERT_EQ(slices_0, Slice(0, sizes));
-
-  vector<vector<int>> slices_1 {
-    {14112},
-    {96},
-    {256},
-    {884736},
-    {384},
-  };
-
-  vector<vector<int>> slices_2 {
-    {14112},
-    {96},
-    {256},
-    {435328, 449408},
-    {384},
-  };
-
-  vector<vector<int>> slices_4 {
-    {14112},
-    {96},
-    {256},
-    {210432, 224896, 224896, 224512},
-    {384},
-  };
-
-  vector<vector<int>> slices_8 {
-    {14112},
-    {96},
-    {256},
-    {97984, 112448, 112448, 112448, 112448, 112448, 112448, 112064},
-    {384},
-  };
-
-  ASSERT_EQ(slices_1, Slice(1, sizes));
-  ASSERT_EQ(slices_2, Slice(2, sizes));
-  ASSERT_EQ(slices_4, Slice(4, sizes));
-  ASSERT_EQ(slices_8, Slice(8, sizes));
-}
-
-TEST(CommonTest, TestPartitionSlices) {
-  vector<int> slices {
-    97984, 112448, 112448, 112448, 112448, 112448, 112448, 112064
-  };
-  vector<int> box_1 {0, 0, 0, 0, 0, 0, 0, 0};
-  vector<int> box_2 {0, 0, 0, 0, 1, 1, 1, 1};
-  vector<int> box_4 {0, 0, 1, 1, 2, 2, 3, 3};
-  vector<int> box_8 {0, 1, 2, 3, 4, 5, 6, 7};
-  ASSERT_EQ(box_1, PartitionSlices(1, slices));
-  ASSERT_EQ(box_2, PartitionSlices(2, slices));
-  ASSERT_EQ(box_4, PartitionSlices(4, slices));
-  ASSERT_EQ(box_8, PartitionSlices(8, slices));
-}
diff --git a/src/test/test_connection_layers.cc b/src/test/test_connection_layers.cc
deleted file mode 100644
index cd7f5f5..0000000
--- a/src/test/test_connection_layers.cc
+++ /dev/null
@@ -1,459 +0,0 @@
-/************************************************************
-*
-* Licensed to the Apache Software Foundation (ASF) under one
-* or more contributor license agreements.  See the NOTICE file
-* distributed with this work for additional information
-* regarding copyright ownership.  The ASF licenses this file
-* to you under the Apache License, Version 2.0 (the
-* "License"); you may not use this file except in compliance
-* with the License.  You may obtain a copy of the License at
-*
-*   http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an
-* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-* KIND, either express or implied.  See the License for the
-* specific language governing permissions and limitations
-* under the License.
-*
-*************************************************************/
-
-#include <string>
-#include <unordered_map>
-#include <vector>
-#include "gtest/gtest.h"
-#include "singa/comm/msg.h"
-#include "singa/comm/socket.h"
-#include "singa/neuralnet/connection_layer.h"
-#include "singa/neuralnet/neuron_layer.h"
-#include "singa/proto/job.pb.h"
-
-using namespace singa;
-
-const int N = 10;  // size of dim 0
-const int M = 20;  // size of dim 1
-const int K = 5;  // size of partitions
-
-TEST(ConnectionLayerTest, DummyTest) {
-  // use dummy as input layer
-  vector<Layer*> src_in;
-  LayerProto proto_in;
-  proto_in.set_name("dummy_input");
-  proto_in.mutable_dummy_conf()->set_input(true);
-  proto_in.mutable_dummy_conf()->add_shape(N);
-  proto_in.mutable_dummy_conf()->add_shape(M);
-  DummyLayer in;
-  in.Setup(proto_in, src_in);
-  ASSERT_EQ(in.data(nullptr).shape(0), N);
-  ASSERT_EQ(in.data(nullptr).shape(1), M);
-  in.ComputeFeature(0, src_in);
-
-  // use dummy as neuron layer
-  vector<Layer*> src_neu;
-  src_neu.push_back(static_cast<Layer*>(&in));
-  LayerProto proto_neu;
-  proto_neu.set_name("dummy_neuron");
-  proto_neu.mutable_dummy_conf();
-  DummyLayer neu;
-  neu.Setup(proto_neu, src_neu);
-  ASSERT_EQ(neu.data(nullptr).shape(0), N);
-  ASSERT_EQ(neu.data(nullptr).shape(1), M);
-  neu.ComputeFeature(0, src_neu);
-  ASSERT_EQ(in.data(nullptr).count(), neu.data(nullptr).count());
-  for (int i = 0; i < in.data(nullptr).count(); ++i)
-    ASSERT_EQ(in.data(nullptr).cpu_data()[i], neu.data(nullptr).cpu_data()[i]);
-
-  // use dummy as output layer
-  vector<Layer*> src_out;
-  src_out.push_back(static_cast<Layer*>(&neu));
-  LayerProto proto_out;
-  proto_out.set_name("dummy_output");
-  proto_out.mutable_dummy_conf()->set_output(true);
-  DummyLayer out;
-  out.Setup(proto_out, src_out);
-  ASSERT_EQ(out.data(nullptr).shape(0), N);
-  ASSERT_EQ(out.data(nullptr).shape(1), M);
-  out.ComputeFeature(0, src_out);
-  ASSERT_EQ(in.data(nullptr).count(), out.data(nullptr).count());
-  for (int i = 0; i < in.data(nullptr).count(); ++i)
-    ASSERT_EQ(in.data(nullptr).cpu_data()[i], out.data(nullptr).cpu_data()[i]);
-
-  // test for computing gradient
-  out.ComputeGradient(0, src_out);
-  neu.ComputeGradient(0, src_neu);
-  in.ComputeGradient(0, src_in);
-  for (int i = 0; i < in.grad(nullptr).count(); ++i)
-    ASSERT_EQ(in.grad(nullptr).cpu_data()[i], out.grad(nullptr).cpu_data()[i]);
-}
-
-TEST(ConnectionLayerTest, BridgeTest) {
-  // use dummy as input layer
-  vector<Layer*> src_in;
-  LayerProto proto_in;
-  proto_in.set_name("dummy_input");
-  proto_in.mutable_dummy_conf()->set_input(true);
-  proto_in.mutable_dummy_conf()->add_shape(N);
-  proto_in.mutable_dummy_conf()->add_shape(M);
-  DummyLayer in;
-  in.Setup(proto_in, src_in);
-
-  // add src bridge layer
-  vector<Layer*> src_src;
-  src_src.push_back(static_cast<Layer*>(&in));
-  LayerProto proto_src;
-  proto_src.set_name("bridge_src");
-  BridgeSrcLayer src;
-  src.Setup(proto_src, src_src);
-  ASSERT_EQ(src.data(nullptr).shape(0), N);
-  ASSERT_EQ(src.data(nullptr).shape(1), M);
-
-  // add dst bridge layer
-  vector<Layer*> src_dst;
-  src_dst.push_back(static_cast<Layer*>(&src));
-  LayerProto proto_dst;
-  proto_dst.set_name("bridge_dst");
-  BridgeDstLayer dst;
-  dst.Setup(proto_dst, src_dst);
-  ASSERT_EQ(dst.data(nullptr).shape(0), N);
-  ASSERT_EQ(dst.data(nullptr).shape(1), M);
-
-  msgQueues[-1];
-  msgQueues[Addr(0, 0, kWorkerLayer)];
-
-  // bind bridges to socket
-  // Router router(N);
-  Router router;
-  // router.Bind("inproc://router");
-  Dealer dealer(Addr(0, 0, kWorkerLayer));
-  // dealer.Connect("inproc://router");
-  std::unordered_map<std::string, Layer*> name2bridge;
-  name2bridge[src.name()] = &src;
-  name2bridge[dst.name()] = &dst;
-  src.MakePaired(static_cast<Layer*>(&dst), 0, &dealer, &name2bridge);
-  dst.MakePaired(static_cast<Layer*>(&src), 0, &dealer, &name2bridge);
-
-  // use dummy as output layer
-  LayerProto proto_out;
-  vector<Layer*> src_out;
-  src_out.push_back(static_cast<Layer*>(&dst));
-  proto_out.set_name("dummy_output");
-  proto_out.mutable_dummy_conf()->set_output(true);
-  DummyLayer out;
-  out.Setup(proto_out, src_out);
-
-  // test for computing feature
-  in.ComputeFeature(0, src_in);
-  src.ComputeFeature(0, src_src);
-  Msg* msg_data = router.Receive();
-  router.Send(&msg_data);
-  dst.ComputeFeature(0, src_dst);
-  out.ComputeFeature(0, src_out);
-  for (int i = 0; i < in.data(nullptr).count(); ++i)
-    ASSERT_EQ(in.data(nullptr).cpu_data()[i], out.data(nullptr).cpu_data()[i]);
-
-  // test for computing gradient
-  out.ComputeGradient(0, src_out);
-  dst.ComputeGradient(0, src_dst);
-  Msg* msg_grad = router.Receive();
-  router.Send(&msg_grad);
-  src.ComputeGradient(0, src_src);
-  in.ComputeGradient(0, src_in);
-  for (int i = 0; i < in.grad(nullptr).count(); ++i)
-    ASSERT_EQ(in.grad(nullptr).cpu_data()[i], out.grad(nullptr).cpu_data()[i]);
-}
-
-TEST(ConnectionLayerTest, DataSliceTest) {
-  // use dummy as input layer
-  vector<Layer*> src_in;
-  LayerProto proto_in;
-  proto_in.set_name("dummy_input");
-  proto_in.mutable_dummy_conf()->set_input(true);
-  proto_in.mutable_dummy_conf()->add_shape(N);
-  proto_in.mutable_dummy_conf()->add_shape(M);
-  DummyLayer in;
-  in.Setup(proto_in, src_in);
-
-  // add slice layer
-  vector<Layer*> src_slice;
-  src_slice.push_back(static_cast<Layer*>(&in));
-  LayerProto proto_slice;
-  proto_slice.set_name("slice");
-  proto_slice.mutable_slice_conf()->set_slice_dim(0);
-  proto_slice.mutable_slice_conf()->set_num_slices(K);
-  SliceLayer slice;
-  slice.Setup(proto_slice, src_slice);
-  ASSERT_EQ(slice.data(nullptr).shape(0), N / K);
-  ASSERT_EQ(slice.data(nullptr).shape(1), M);
-
-  // use dummy as output layers
-  LayerProto proto_out[K];
-  vector<Layer*> src_out[K];
-  DummyLayer out[K];
-  for (int i = 0; i < K; ++i) {
-    src_out[i].push_back(static_cast<Layer*>(&slice));
-    proto_out[i].set_name("dummy_output_"+std::to_string(i));
-    proto_out[i].set_partition_id(i);
-    proto_out[i].mutable_dummy_conf()->set_output(true);
-    out[i].Setup(proto_out[i], src_out[i]);
-  }
-
-  // test for computing feature
-  in.ComputeFeature(0, src_in);
-  slice.ComputeFeature(0, src_slice);
-  for (int i = 0; i < K; ++i)
-    out[i].ComputeFeature(0, src_out[i]);
-  int step = (N * M) / K;
-  for (int i = 0; i < in.data(nullptr).count(); ++i) {
-    ASSERT_EQ(in.data(nullptr).cpu_data()[i],
-              out[i / step].data(nullptr).cpu_data()[i % step]);
-  }
-
-  // test for computing gradient
-  for (int i = 0; i < K; ++i)
-    out[i].ComputeGradient(0, src_out[i]);
-  slice.ComputeGradient(0, src_slice);
-  in.ComputeGradient(0, src_in);
-  for (int i = 0; i < in.grad(nullptr).count(); ++i) {
-    ASSERT_EQ(in.grad(nullptr).cpu_data()[i],
-              out[i / step].grad(nullptr).cpu_data()[i % step]);
-  }
-}
-
-TEST(ConnectionLayerTest, ModelSliceTest) {
-  // use dummy as input layer
-  vector<Layer*> src_in;
-  LayerProto proto_in;
-  proto_in.set_name("dummy_input");
-  proto_in.mutable_dummy_conf()->set_input(true);
-  proto_in.mutable_dummy_conf()->add_shape(N);
-  proto_in.mutable_dummy_conf()->add_shape(M);
-  DummyLayer in;
-  in.Setup(proto_in, src_in);
-
-  // add slice layer
-  vector<Layer*> src_slice;
-  src_slice.push_back(static_cast<Layer*>(&in));
-  LayerProto proto_slice;
-  proto_slice.set_name("slice");
-  proto_slice.mutable_slice_conf()->set_slice_dim(1);
-  proto_slice.mutable_slice_conf()->set_num_slices(K);
-  SliceLayer slice;
-  slice.Setup(proto_slice, src_slice);
-  ASSERT_EQ(slice.data(nullptr).shape(0), N);
-  ASSERT_EQ(slice.data(nullptr).shape(1), M / K);
-
-  // use dummy as output layers
-  LayerProto proto_out[K];
-  vector<Layer*> src_out[K];
-  DummyLayer out[K];
-  for (int i = 0; i < K; ++i) {
-    src_out[i].push_back(static_cast<Layer*>(&slice));
-    proto_out[i].set_name("dummy_output_"+std::to_string(i));
-    proto_out[i].set_partition_id(i);
-    proto_out[i].mutable_dummy_conf()->set_output(true);
-    out[i].Setup(proto_out[i], src_out[i]);
-  }
-
-  // test for computing feature
-  in.ComputeFeature(0, src_in);
-  slice.ComputeFeature(0, src_slice);
-  for (int i = 0; i < K; ++i)
-    out[i].ComputeFeature(0, src_out[i]);
-  int step = M / K;
-  int offset = 0;
-  for (int i = 0; i < in.data(nullptr).count(); ++i) {
-    if (i && i % M == 0) offset += step;
-    ASSERT_EQ(in.data(nullptr).cpu_data()[i],
-              out[(i / step) % K].data(nullptr).cpu_data()[offset + i % step]);
-  }
-
-  // test for computing gradient
-  for (int i = 0; i < K; ++i)
-    out[i].ComputeGradient(0, src_out[i]);
-  slice.ComputeGradient(0, src_slice);
-  in.ComputeGradient(0, src_in);
-  offset = 0;
-  for (int i = 0; i < in.grad(nullptr).count(); ++i) {
-    if (i && i % M == 0) offset += step;
-    ASSERT_EQ(in.grad(nullptr).cpu_data()[i],
-              out[(i / step) % K].grad(nullptr).cpu_data()[offset + i % step]);
-  }
-}
-
-TEST(ConnectionLayerTest, DataConcateTest) {
-  // use dummy as input layers
-  LayerProto proto_in[K];
-  vector<Layer*> src_in[K];
-  DummyLayer in[K];
-  for (int i = 0; i < K; ++i) {
-    proto_in[i].set_name("dummy_input_"+std::to_string(i));
-    proto_in[i].set_partition_id(i);
-    proto_in[i].mutable_dummy_conf()->set_input(true);
-    proto_in[i].mutable_dummy_conf()->add_shape(N / K);
-    proto_in[i].mutable_dummy_conf()->add_shape(M);
-    in[i].Setup(proto_in[i], src_in[i]);
-  }
-
-  // add concate layer
-  vector<Layer*> src_concate;
-  for (int i = 0; i < K; ++i)
-    src_concate.push_back(static_cast<Layer*>(&in[i]));
-  LayerProto proto_concate;
-  proto_concate.set_name("concate");
-  proto_concate.mutable_concate_conf()->set_concate_dim(0);
-  proto_concate.mutable_concate_conf()->set_num_concates(K);
-  ConcateLayer concate;
-  concate.Setup(proto_concate, src_concate);
-  ASSERT_EQ(concate.data(static_cast<Layer*>(&concate)).shape(0), N);
-  ASSERT_EQ(concate.data(static_cast<Layer*>(&concate)).shape(1), M);
-
-  // use dummy as output layer
-  vector<Layer*> src_out;
-  src_out.push_back(static_cast<Layer*>(&concate));
-  LayerProto proto_out;
-  proto_out.set_name("dummy_output");
-  proto_out.mutable_dummy_conf()->set_output(true);
-  DummyLayer out;
-  out.Setup(proto_out, src_out);
-
-  // test for computing feature
-  for (int i = 0; i < K; ++i)
-    in[i].ComputeFeature(0, src_in[i]);
-  concate.ComputeFeature(0, src_concate);
-  out.ComputeFeature(0, src_out);
-  int step = (N * M) / K;
-  for (int i = 0; i < out.data(nullptr).count(); ++i) {
-    ASSERT_EQ(in[i / step].data(nullptr).cpu_data()[i % step],
-              out.data(nullptr).cpu_data()[i]);
-  }
-
-  // test for computing gradient
-  out.ComputeGradient(0, src_out);
-  concate.ComputeGradient(0, src_concate);
-  for (int i = 0; i < K; ++i)
-    in[i].ComputeGradient(0, src_in[i]);
-  for (int i = 0; i < out.grad(nullptr).count(); ++i) {
-    ASSERT_EQ(in[i / step].grad(nullptr).cpu_data()[i % step],
-              out.grad(nullptr).cpu_data()[i]);
-  }
-}
-
-TEST(ConnectionLayerTest, ModelConcateTest) {
-  // use dummy as input layers
-  LayerProto proto_in[K];
-  vector<Layer*> src_in[K];
-  DummyLayer in[K];
-  for (int i = 0; i < K; ++i) {
-    proto_in[i].set_name("dummy_input_"+std::to_string(i));
-    proto_in[i].set_partition_id(i);
-    proto_in[i].mutable_dummy_conf()->set_input(true);
-    proto_in[i].mutable_dummy_conf()->add_shape(N);
-    proto_in[i].mutable_dummy_conf()->add_shape(M / K);
-    in[i].Setup(proto_in[i], src_in[i]);
-  }
-
-  // add concate layer
-  vector<Layer*> src_concate;
-  for (int i = 0; i < K; ++i)
-    src_concate.push_back(static_cast<Layer*>(&in[i]));
-  LayerProto proto_concate;
-  proto_concate.set_name("concate");
-  proto_concate.mutable_concate_conf()->set_concate_dim(1);
-  proto_concate.mutable_concate_conf()->set_num_concates(K);
-  ConcateLayer concate;
-  concate.Setup(proto_concate, src_concate);
-  ASSERT_EQ(concate.data(static_cast<Layer*>(&concate)).shape(0), N);
-  ASSERT_EQ(concate.data(static_cast<Layer*>(&concate)).shape(1), M);
-
-  // use dummy as output layer
-  vector<Layer*> src_out;
-  src_out.push_back(static_cast<Layer*>(&concate));
-  LayerProto proto_out;
-  proto_out.set_name("dummy_output");
-  proto_out.mutable_dummy_conf()->set_output(true);
-  DummyLayer out;
-  out.Setup(proto_out, src_out);
-
-  // test for computing feature
-  for (int i = 0; i < K; ++i)
-    in[i].ComputeFeature(0, src_in[i]);
-  concate.ComputeFeature(0, src_concate);
-  out.ComputeFeature(0, src_out);
-  int step = M / K;
-  int offset = 0;
-  for (int i = 0; i < out.grad(nullptr).count(); ++i) {
-    if (i && i % M == 0) offset += step;
-    ASSERT_EQ(in[(i / step) % K].data(nullptr).cpu_data()[offset + i % step],
-              out.data(nullptr).cpu_data()[i]);
-  }
-
-  // test for computing gradient
-  out.ComputeGradient(0, src_out);
-  concate.ComputeGradient(0, src_concate);
-  for (int i = 0; i < K; ++i)
-    in[i].ComputeGradient(0, src_in[i]);
-  offset = 0;
-  for (int i = 0; i < out.grad(nullptr).count(); ++i) {
-    if (i && i % M == 0) offset += step;
-    ASSERT_EQ(in[(i / step) % K].grad(nullptr).cpu_data()[offset + i % step],
-              out.grad(nullptr).cpu_data()[i]);
-  }
-}
-
-TEST(ConnectionLayerTest, SplitTest) {
-  // use dummy as input layer
-  vector<Layer*> src_in;
-  LayerProto proto_in;
-  proto_in.set_name("dummy_input");
-  proto_in.mutable_dummy_conf()->set_input(true);
-  proto_in.mutable_dummy_conf()->add_shape(N);
-  proto_in.mutable_dummy_conf()->add_shape(M);
-  DummyLayer in;
-  in.Setup(proto_in, src_in);
-
-  // add split layer
-  vector<Layer*> src_split;
-  src_split.push_back(static_cast<Layer*>(&in));
-  LayerProto proto_split;
-  proto_split.set_name("split");
-  proto_split.mutable_split_conf()->set_num_splits(K);
-  SplitLayer split;
-  split.Setup(proto_split, src_split);
-  ASSERT_EQ(split.data(static_cast<Layer*>(&split)).shape(0), N);
-  ASSERT_EQ(split.data(static_cast<Layer*>(&split)).shape(1), M);
-
-  // use dummy as output layers
-  LayerProto proto_out[K];
-  vector<Layer*> src_out[K];
-  DummyLayer out[K];
-  for (int i = 0; i < K; ++i) {
-    src_out[i].push_back(static_cast<Layer*>(&split));
-    proto_out[i].set_name("dummy_output_"+std::to_string(i));
-    proto_out[i].set_partition_id(i);
-    proto_out[i].mutable_dummy_conf()->set_output(true);
-    out[i].Setup(proto_out[i], src_out[i]);
-  }
-
-  // test for computing feature
-  in.ComputeFeature(0, src_in);
-  split.ComputeFeature(0, src_split);
-  for (int i = 0; i < K; ++i)
-    out[i].ComputeFeature(0, src_out[i]);
-  for (int i = 0; i < in.data(nullptr).count(); ++i) {
-    for (int k = 0; k < K; ++k)
-      ASSERT_EQ(in.data(nullptr).cpu_data()[i],
-                out[k].data(nullptr).cpu_data()[i]);
-  }
-
-  // test for computing gradient
-  for (int i = 0; i < K; ++i)
-    out[i].ComputeGradient(0, src_out[i]);
-  split.ComputeGradient(0, src_split);
-  in.ComputeGradient(0, src_in);
-  for (int i = 0; i < in.grad(nullptr).count(); ++i) {
-    float grad = 0;
-    for (int k = 0; k < K; ++k) grad += out[k].grad(nullptr).cpu_data()[i];
-    ASSERT_EQ(in.grad(nullptr).cpu_data()[i], grad);
-  }
-}
diff --git a/src/test/test_context.cc b/src/test/test_context.cc
deleted file mode 100644
index 70f6d07..0000000
--- a/src/test/test_context.cc
+++ /dev/null
@@ -1,76 +0,0 @@
-/************************************************************
-*
-* Licensed to the Apache Software Foundation (ASF) under one
-* or more contributor license agreements.  See the NOTICE file
-* distributed with this work for additional information
-* regarding copyright ownership.  The ASF licenses this file
-* to you under the Apache License, Version 2.0 (the
-* "License"); you may not use this file except in compliance
-* with the License.  You may obtain a copy of the License at
-*
-*   http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an
-* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-* KIND, either express or implied.  See the License for the
-* specific language governing permissions and limitations
-* under the License.
-*
-*************************************************************/
-
-#include <thread>
-#include "gtest/gtest.h"
-#include "singa/utils/singleton.h"
-#include "singa/utils/context.h"
-#include "singa/utils/cuda_utils.h"
-
-using namespace singa;
-using namespace std;
-
-TEST(ContextTest, TestDevice) {
-  auto context = Singleton<Context>::Instance();
-
-  auto id = std::this_thread::get_id();
-  context->SetupDevice(id, 0);
-  auto device_id = context->device_id(id);
-  ASSERT_EQ(0, device_id);
-}
-
-TEST(ContextTest, TestHandle) {
-  auto context = Singleton<Context>::Instance();
-
-  float cpu_ret = 0.0f;
-  float gpu_ret = 0.0f;
-
-  float A[12];
-  float B[12];
-
-  for (int i = 0; i < 12; i++) {
-    A[i] = i - 1;
-    B[i] = i + 1;
-  }
-
-  float* A_gpu = NULL;
-  float* B_gpu = NULL;
-  context->SetupDevice(std::this_thread::get_id(), 0);
-
-  cudaMalloc(reinterpret_cast<void**>(&A_gpu), 12 * sizeof(float));
-  cudaMalloc(reinterpret_cast<void**>(&B_gpu), 12 * sizeof(float));
-
-  cudaMemcpy(A_gpu, A, 12 * sizeof(float), cudaMemcpyHostToDevice);
-  cudaMemcpy(B_gpu, B, 12 * sizeof(float), cudaMemcpyHostToDevice);
-
-  cublasHandle_t handle = context->cublas_handle(std::this_thread::get_id());
-
-  cublasSdot(handle, 12, A_gpu, 1, B_gpu, 1, &gpu_ret);
-
-  for (int i = 0; i < 12; ++i) {
-    cpu_ret += A[i] * B[i];
-  }
-
-  ASSERT_EQ(gpu_ret, cpu_ret);
-
-  cudaFree(A_gpu);
-  cudaFree(B_gpu);
-}
diff --git a/src/test/test_csv_input_layer.cc b/src/test/test_csv_input_layer.cc
deleted file mode 100644
index 86eaff9..0000000
--- a/src/test/test_csv_input_layer.cc
+++ /dev/null
@@ -1,92 +0,0 @@
-/************************************************************
-*
-* Licensed to the Apache Software Foundation (ASF) under one
-* or more contributor license agreements.  See the NOTICE file
-* distributed with this work for additional information
-* regarding copyright ownership.  The ASF licenses this file
-* to you under the Apache License, Version 2.0 (the
-* "License"); you may not use this file except in compliance
-* with the License.  You may obtain a copy of the License at
-*
-*   http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an
-* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-* KIND, either express or implied.  See the License for the
-* specific language governing permissions and limitations
-* under the License.
-*
-*************************************************************/
-#include <string>
-#include <vector>
-#include <fstream>
-
-#include "gtest/gtest.h"
-#include "singa/neuralnet/input_layer.h"
-#include "singa/proto/job.pb.h"
-
-class CSVInputLayerTest : public ::testing::Test {
- protected:
-  virtual void SetUp() {
-    std::string path ="src/test/test.csv";
-    std::ofstream ofs(path, std::ofstream::out);
-    ASSERT_TRUE(ofs.is_open());
-    ofs << "12,3.2,1,14.1\n";
-    ofs << "2,0.2,0,1.1\n";
-    ofs << "1,2.2,1,4.1\n";
-    ofs.close();
-    auto conf = csv_conf.mutable_store_conf();
-    conf->set_path(path);
-    conf->add_batchsize(2);
-    conf->add_shape(3);
-    conf->set_backend("textfile");
-  }
-  singa::LayerProto csv_conf;
-};
-
-TEST_F(CSVInputLayerTest, Setup) {
-  singa::CSVInputLayer layer;
-  layer.Setup(csv_conf, std::vector<singa::Layer*>{});
-  EXPECT_EQ(2, static_cast<int>(layer.aux_data().size()));
-  EXPECT_EQ(6, layer.data(nullptr).count());
-}
-
-TEST_F(CSVInputLayerTest, ComputeFeature) {
-  singa::CSVInputLayer csv;
-  csv.Setup(csv_conf, std::vector<singa::Layer*>{});
-  csv.ComputeFeature(singa::kTrain, std::vector<singa::Layer*>{});
-
-  EXPECT_EQ(12, csv.aux_data()[0]);
-  EXPECT_EQ(2, csv.aux_data()[1]);
-  auto data = csv.data(nullptr);
-  EXPECT_EQ(3.2f, data.cpu_data()[0]);
-  EXPECT_EQ(14.1f, data.cpu_data()[2]);
-  EXPECT_EQ(0.2f, data.cpu_data()[3]);
-  EXPECT_EQ(1.1f, data.cpu_data()[5]);
-}
-TEST_F(CSVInputLayerTest, ComputeFeatureDeploy) {
-  singa::CSVInputLayer csv;
-  csv_conf.mutable_store_conf()->set_shape(0, 4);
-  csv.Setup(csv_conf, std::vector<singa::Layer*>{});
-  csv.ComputeFeature(singa::kDeploy, std::vector<singa::Layer*>{});
-
-  auto data = csv.data(nullptr);
-  EXPECT_EQ(12.f, data.cpu_data()[0]);
-  EXPECT_EQ(1.f, data.cpu_data()[2]);
-  EXPECT_EQ(14.1f, data.cpu_data()[3]);
-  EXPECT_EQ(0.2f, data.cpu_data()[5]);
-}
-
-TEST_F(CSVInputLayerTest, SeekToFirst) {
-  singa::CSVInputLayer csv;
-  csv.Setup(csv_conf, std::vector<singa::Layer*>{});
-  csv.ComputeFeature(singa::kTrain, std::vector<singa::Layer*>{});
-  csv.ComputeFeature(singa::kTrain, std::vector<singa::Layer*>{});
-
-  auto data = csv.data(nullptr);
-  EXPECT_EQ(2.2f, data.cpu_data()[0]);
-  EXPECT_EQ(4.1f, data.cpu_data()[2]);
-  EXPECT_EQ(3.2f, data.cpu_data()[3]);
-  EXPECT_EQ(14.1f, data.cpu_data()[5]);
-}
diff --git a/src/test/test_gru_layer.cc b/src/test/test_gru_layer.cc
deleted file mode 100644
index e0e381f..0000000
--- a/src/test/test_gru_layer.cc
+++ /dev/null
@@ -1,287 +0,0 @@
-/************************************************************
- *
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- *
- *************************************************************/
-#include <string>
-#include <vector>
-#include <fstream>
-#include <iostream>
-using namespace std;
-
-
-#include "gtest/gtest.h"
-#include "singa/neuralnet/neuron_layer.h"
-#include "singa/neuralnet/input_layer.h"
-#include "singa/driver.h"
-#include "singa/proto/job.pb.h"
-
-using namespace singa;
-
-class GRULayerTest: public ::testing::Test {
- protected:
-  virtual void SetUp() {
-    // Initialize the settings for the first input-layer
-    std::string path1 = "src/test/gru-in-1.csv";  // path of a csv file
-    std::ofstream ofs1(path1, std::ofstream::out);
-    ASSERT_TRUE(ofs1.is_open());
-    ofs1 << "0,0,0,1\n";
-    ofs1 << "0,0,1,0\n";
-    ofs1.close();
-    auto conf1 = in1_conf.mutable_store_conf();
-    conf1->set_path(path1);
-    conf1->add_batchsize(2);
-    conf1->add_shape(4);
-    conf1->set_backend("textfile");
-    conf1->set_has_label(false);
-
-
-    // Initialize the settings for the second input-layer
-    std::string path2 = "src/test/gru-in-2.csv";  // path of a csv file
-    std::ofstream ofs2(path2, std::ofstream::out);
-    ASSERT_TRUE(ofs2.is_open());
-    ofs2 << "0,1,0,0\n";
-    ofs2 << "1,0,0,0\n";
-    ofs2.close();
-    auto conf2 = in2_conf.mutable_store_conf();
-    conf2->set_path(path2);
-
-    conf2->add_batchsize(2);
-    conf2->add_shape(4);
-    conf2->set_backend("textfile");
-    conf2->set_has_label(false);
-
-
-    gru1_conf.mutable_gru_conf() -> set_dim_hidden(2);
-    gru1_conf.mutable_gru_conf() -> set_bias_term(true);
-    for (int i = 0; i < 9; i ++) {
-      gru1_conf.add_param();
-    }
-
-
-    gru1_conf.mutable_param(0)->set_name("wzhx1");
-    gru1_conf.mutable_param(0)->set_type(kParam);
-    gru1_conf.mutable_param(0)->mutable_init()->set_type(kConstant);
-    gru1_conf.mutable_param(0)->mutable_init()->set_value(0.5f);
-
-    gru1_conf.mutable_param(1)->set_name("wrhx1");
-    gru1_conf.mutable_param(1)->set_type(kParam);
-    gru1_conf.mutable_param(1)->mutable_init()->set_type(kConstant);
-    gru1_conf.mutable_param(1)->mutable_init()->set_value(0.5f);
-
-    gru1_conf.mutable_param(2)->set_name("wchx1");
-    gru1_conf.mutable_param(2)->set_type(kParam);
-    gru1_conf.mutable_param(2)->mutable_init()->set_type(kConstant);
-    gru1_conf.mutable_param(2)->mutable_init()->set_value(0.5f);
-
-    gru1_conf.mutable_param(3)->set_name("wzhh1");
-    gru1_conf.mutable_param(3)->set_type(kParam);
-    gru1_conf.mutable_param(3)->mutable_init()->set_type(kConstant);
-    gru1_conf.mutable_param(3)->mutable_init()->set_value(0.5f);
-
-    gru1_conf.mutable_param(4)->set_name("wrhh1");
-    gru1_conf.mutable_param(4)->set_type(kParam);
-    gru1_conf.mutable_param(4)->mutable_init()->set_type(kConstant);
-    gru1_conf.mutable_param(4)->mutable_init()->set_value(0.5f);
-
-    gru1_conf.mutable_param(5)->set_name("wchh1");
-    gru1_conf.mutable_param(5)->set_type(kParam);
-    gru1_conf.mutable_param(5)->mutable_init()->set_type(kConstant);
-    gru1_conf.mutable_param(5)->mutable_init()->set_value(0.5f);
-
-    gru1_conf.mutable_param(6)->set_name("bz1");
-    gru1_conf.mutable_param(6)->set_type(kParam);
-    gru1_conf.mutable_param(6)->mutable_init()->set_type(kConstant);
-    gru1_conf.mutable_param(6)->mutable_init()->set_value(0.5f);
-
-    gru1_conf.mutable_param(7)->set_name("br1");
-    gru1_conf.mutable_param(7)->set_type(kParam);
-    gru1_conf.mutable_param(7)->mutable_init()->set_type(kConstant);
-    gru1_conf.mutable_param(7)->mutable_init()->set_value(0.5f);
-
-    gru1_conf.mutable_param(8)->set_name("bc1");
-    gru1_conf.mutable_param(8)->set_type(kParam);
-    gru1_conf.mutable_param(8)->mutable_init()->set_type(kConstant);
-    gru1_conf.mutable_param(8)->mutable_init()->set_value(0.5f);
-
-    gru2_conf.mutable_gru_conf() -> set_dim_hidden(2);
-    gru2_conf.mutable_gru_conf() -> set_bias_term(true);
-    for (int i = 0; i < 9; i ++) {
-      gru2_conf.add_param();
-    }
-
-    gru2_conf.mutable_param(0)->set_name("wzhx2");
-    gru2_conf.mutable_param(0)->set_type(kParam);
-    gru2_conf.mutable_param(0)->mutable_init()->set_type(kConstant);
-    gru2_conf.mutable_param(0)->mutable_init()->set_value(0.5f);
-
-    gru2_conf.mutable_param(1)->set_name("wrhx2");
-    gru2_conf.mutable_param(1)->set_type(kParam);
-    gru2_conf.mutable_param(1)->mutable_init()->set_type(kConstant);
-    gru2_conf.mutable_param(1)->mutable_init()->set_value(0.5f);
-
-    gru2_conf.mutable_param(2)->set_name("wchx2");
-    gru2_conf.mutable_param(2)->set_type(kParam);
-    gru2_conf.mutable_param(2)->mutable_init()->set_type(kConstant);
-    gru2_conf.mutable_param(2)->mutable_init()->set_value(0.5f);
-
-    gru2_conf.mutable_param(3)->set_name("wzhh2");
-    gru2_conf.mutable_param(3)->set_type(kParam);
-    gru2_conf.mutable_param(3)->mutable_init()->set_type(kConstant);
-    gru2_conf.mutable_param(3)->mutable_init()->set_value(0.5f);
-
-    gru2_conf.mutable_param(4)->set_name("wrhh2");
-    gru2_conf.mutable_param(4)->set_type(kParam);
-    gru2_conf.mutable_param(4)->mutable_init()->set_type(kConstant);
-    gru2_conf.mutable_param(4)->mutable_init()->set_value(0.5f);
-
-    gru2_conf.mutable_param(5)->set_name("wchh2");
-    gru2_conf.mutable_param(5)->set_type(kParam);
-    gru2_conf.mutable_param(5)->mutable_init()->set_type(kConstant);
-    gru2_conf.mutable_param(5)->mutable_init()->set_value(0.5f);
-
-    gru2_conf.mutable_param(6)->set_name("bz2");
-    gru2_conf.mutable_param(6)->set_type(kParam);
-    gru2_conf.mutable_param(6)->mutable_init()->set_type(kConstant);
-    gru2_conf.mutable_param(6)->mutable_init()->set_value(0.5f);
-
-    gru2_conf.mutable_param(7)->set_name("br2");
-    gru2_conf.mutable_param(7)->set_type(kParam);
-    gru2_conf.mutable_param(7)->mutable_init()->set_type(kConstant);
-    gru2_conf.mutable_param(7)->mutable_init()->set_value(0.5f);
-
-    gru2_conf.mutable_param(8)->set_name("bc2");
-    gru2_conf.mutable_param(8)->set_type(kParam);
-    gru2_conf.mutable_param(8)->mutable_init()->set_type(kConstant);
-    gru2_conf.mutable_param(8)->mutable_init()->set_value(0.5f);
-  }
-  singa::LayerProto in1_conf;
-  singa::LayerProto in2_conf;
-  singa::LayerProto gru1_conf;
-  singa::LayerProto gru2_conf;
-};
-
-TEST_F(GRULayerTest, Setup) {
-  singa::Driver driver;
-  // driver.RegisterLayer<GRULayer, int> (kGRU);
-  driver.RegisterParam<Param>(0);
-  driver.RegisterParamGenerator<UniformGen>(kUniform);
-  driver.RegisterParamGenerator<ParamGenerator>(kConstant);
-
-  singa::CSVInputLayer in_layer_1;
-  singa::CSVInputLayer in_layer_2;
-
-  in_layer_1.Setup(in1_conf, std::vector<singa::Layer*> { });
-  EXPECT_EQ(2, static_cast<int>(in_layer_1.aux_data().size()));
-  EXPECT_EQ(8, in_layer_1.data(nullptr).count());
-
-  in_layer_2.Setup(in2_conf, std::vector<singa::Layer*>{ });
-  EXPECT_EQ(2, static_cast<int>(in_layer_2.aux_data().size()));
-  EXPECT_EQ(8, in_layer_2.data(nullptr).count());
-
-  singa::GRULayer gru_layer_1;
-  gru_layer_1.Setup(gru1_conf, std::vector<singa::Layer*>{&in_layer_1});
-  // EXPECT_EQ(2, gru_layer_1.hdim());
-  // EXPECT_EQ(4, gru_layer_1.vdim());
-
-  for (unsigned int i = 0; i < gru_layer_1.GetParams().size(); i ++) {
-    gru_layer_1.GetParams()[i]->InitValues();
-  }
-  EXPECT_EQ (0.5, gru_layer_1.GetParams()[0]->data().cpu_data()[0]);
-  // cout << "gru_layer_1: " << gru_layer_1.GetParams()[0]->data().cpu_data()[0]
-  // << endl;
-
-  singa::GRULayer gru_layer_2;
-  gru_layer_2.Setup(gru2_conf,
-                    std::vector<singa::Layer*>{&in_layer_2, &gru_layer_1});
-  // EXPECT_EQ(2, gru_layer_2.hdim());
-  // EXPECT_EQ(4, gru_layer_2.vdim());
-  for (unsigned int i = 0; i < gru_layer_2.GetParams().size(); i ++) {
-    gru_layer_2.GetParams()[i]->InitValues();
-  }
-  EXPECT_EQ (0.5, gru_layer_2.GetParams()[0]->data().cpu_data()[0]);
-}
-
-
-/*
-TEST_F(GRULayerTest, ComputeFeature) {
-  singa::CSVInputLayer in_layer_1;
-  singa::CSVInputLayer in_layer_2;
-
-  in_layer_1.Setup(in1_conf, std::vector<singa::Layer*> { });
-  in_layer_1.ComputeFeature(singa::kTrain, std::vector<singa::Layer*> { });
-  in_layer_2.Setup(in2_conf, std::vector<singa::Layer*>{ });
-  in_layer_2.ComputeFeature(singa::kTrain, std::vector<singa::Layer*> { });
-
-
-  singa::GRULayer gru_layer_1;
-  gru_layer_1.Setup(gru1_conf, std::vector<singa::Layer*>{&in_layer_1});
-  for (unsigned int i = 0; i < gru_layer_1.GetParams().size(); i ++) {
-    gru_layer_1.GetParams()[i]->InitValues();
-  }
-  gru_layer_1.ComputeFeature(singa::kTrain, std::vector<singa::Layer*>{&in_layer_1});
-  for (int i = 0; i < gru_layer_1.data(nullptr).count(); i ++) {
-    EXPECT_GT(0.000001,abs(0.204824-gru_layer_1.data(nullptr).cpu_data()[i]));
-  }
-
-  singa::GRULayer gru_layer_2;
-  gru_layer_2.Setup(gru2_conf, std::vector<singa::Layer*>{&in_layer_2, &gru_layer_1});
-  for (unsigned int i = 0; i < gru_layer_2.GetParams().size(); i ++) {
-    gru_layer_2.GetParams()[i]->InitValues();
-  }
-  gru_layer_2.ComputeFeature(singa::kTrain, std::vector<singa::Layer*>{&in_layer_2, &gru_layer_1});
-  for (int i = 0; i < gru_layer_2.data(nullptr).count(); i ++) {
-    EXPECT_GT(0.000001,abs(0.346753-gru_layer_2.data(nullptr).cpu_data()[i]));
-  }
-}
-
-TEST_F(GRULayerTest, ComputeGradient) {
-  singa::CSVInputLayer in_layer_1;
-  singa::CSVInputLayer in_layer_2;
-
-  in_layer_1.Setup(in1_conf, std::vector<singa::Layer*> { });
-  in_layer_1.ComputeFeature(singa::kTrain, std::vector<singa::Layer*> { });
-  in_layer_2.Setup(in2_conf, std::vector<singa::Layer*>{ });
-  in_layer_2.ComputeFeature(singa::kTrain, std::vector<singa::Layer*> { });
-
-
-  singa::GRULayer gru_layer_1;
-  gru_layer_1.Setup(gru1_conf, std::vector<singa::Layer*>{&in_layer_1});
-  for (unsigned int i = 0; i < gru_layer_1.GetParams().size(); i ++) {
-    gru_layer_1.GetParams()[i]->InitValues();
-  }
-  gru_layer_1.ComputeFeature(singa::kTrain, std::vector<singa::Layer*>{&in_layer_1});
-
-
-  singa::GRULayer gru_layer_2;
-  gru_layer_2.Setup(gru2_conf, std::vector<singa::Layer*>{&in_layer_2, &gru_layer_1});
-  for (unsigned int i = 0; i < gru_layer_2.GetParams().size(); i ++) {
-    gru_layer_2.GetParams()[i]->InitValues();
-  }
-  gru_layer_2.ComputeFeature(singa::kTrain, std::vector<singa::Layer*>{&in_layer_2, &gru_layer_1});
-
-  // For test purpose, we set dummy values for gru_layer_2.grad_
-  for (int i = 0; i < gru_layer_2.grad(nullptr).count(); i ++) {
-    gru_layer_2.mutable_grad(nullptr)->mutable_cpu_data()[i] = 1.0f;
-  }
-  gru_layer_2.ComputeGradient(singa::kTrain, std::vector<singa::Layer*>{&in_layer_2, &gru_layer_1});
-
-  gru_layer_1.ComputeGradient(singa::kTrain, std::vector<singa::Layer*>{&in_layer_1});
-
-}
-*/
diff --git a/src/test/test_kvfile.cc b/src/test/test_kvfile.cc
deleted file mode 100644
index 5707ca9..0000000
--- a/src/test/test_kvfile.cc
+++ /dev/null
@@ -1,85 +0,0 @@
-/************************************************************
-*
-* Licensed to the Apache Software Foundation (ASF) under one
-* or more contributor license agreements.  See the NOTICE file
-* distributed with this work for additional information
-* regarding copyright ownership.  The ASF licenses this file
-* to you under the Apache License, Version 2.0 (the
-* "License"); you may not use this file except in compliance
-* with the License.  You may obtain a copy of the License at
-*
-*   http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an
-* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-* KIND, either express or implied.  See the License for the
-* specific language governing permissions and limitations
-* under the License.
-*
-*************************************************************/
-
-#include <sys/stat.h>
-
-#include "gtest/gtest.h"
-#include "singa/io/kvfile.h"
-
-std::string key[] = {"firstkey",
-                     "secondkey",
-                     "3key",
-                     "key4",
-                     "key5"};
-std::string tuple[] = {"firsttuple",
-                       "2th-tuple",
-                       "thridtuple",
-                       "tuple4",
-                       "tuple5"};
-namespace singa {
-namespace io {
-TEST(KVFileTest, CreateKVFile) {
-  std::string path = "src/test/kvfile.bin";
-  KVFile kvfile(path, KVFile::kCreate, 50);
-  kvfile.Insert(key[0], tuple[0]);
-  kvfile.Insert(key[1], tuple[1]);
-  kvfile.Insert(key[2], tuple[2]);
-  kvfile.Flush();
-}
-
-TEST(KVFileTest, AppendKVFile) {
-  std::string path = "src/test/kvfile.bin";
-  KVFile kvfile(path, KVFile::kAppend, 50);
-  kvfile.Insert(key[3], tuple[3]);
-  kvfile.Insert(key[4], tuple[4]);
-  kvfile.Flush();
-}
-
-TEST(KVFileTest, CountKVFile) {
-  std::string path = "src/test/kvfile.bin";
-  KVFile kvfile(path, KVFile::kRead, 50);
-  int count = kvfile.Count();
-  ASSERT_EQ(5, count);
-}
-
-TEST(KVFileTest, ReadKVFile) {
-  std::string path = "src/test/kvfile.bin";
-  KVFile kvfile(path, KVFile::kRead, 50);
-  std::string k, t;
-  ASSERT_TRUE(kvfile.Next(&k, &t));
-  ASSERT_STREQ(key[0].c_str(), k.c_str());
-  ASSERT_STREQ(tuple[0].c_str(), t.c_str());
-  ASSERT_TRUE(kvfile.Next(&k, &t));
-  ASSERT_STREQ(key[1].c_str(), k.c_str());
-  ASSERT_STREQ(tuple[1].c_str(), t.c_str());
-  ASSERT_TRUE(kvfile.Next(&k, &t));
-  ASSERT_TRUE(kvfile.Next(&k, &t));
-  ASSERT_TRUE(kvfile.Next(&k, &t));
-  ASSERT_STREQ(key[4].c_str(), k.c_str());
-  ASSERT_STREQ(tuple[4].c_str(), t.c_str());
-  ASSERT_FALSE(kvfile.Next(&k, &t));
-  kvfile.SeekToFirst();
-  ASSERT_TRUE(kvfile.Next(&k, &t));
-  ASSERT_STREQ(key[0].c_str(), k.c_str());
-  ASSERT_STREQ(tuple[0].c_str(), t.c_str());
-}
-}  // namespace io
-}  // namespace singa
diff --git a/src/test/test_math.cc b/src/test/test_math.cc
deleted file mode 100644
index 9830703..0000000
--- a/src/test/test_math.cc
+++ /dev/null
@@ -1,1033 +0,0 @@
-/************************************************************
-*
-* Licensed to the Apache Software Foundation (ASF) under one
-* or more contributor license agreements.  See the NOTICE file
-* distributed with this work for additional information
-* regarding copyright ownership.  The ASF licenses this file
-* to you under the Apache License, Version 2.0 (the
-* "License"); you may not use this file except in compliance
-* with the License.  You may obtain a copy of the License at
-*
-*   http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an
-* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-* KIND, either express or implied.  See the License for the
-* specific language governing permissions and limitations
-* under the License.
-*
-*************************************************************/
-#include <thread>
-#include "gtest/gtest.h"
-#include "singa/utils/blob.h"
-#include "singa/utils/math_blob.h"
-#include "singa/utils/math_addr.h"
-#include "singa/utils/math_kernel.h"
-#include "singa/utils/singa_op.h"
-#include "singa/utils/context.h"
-#include "singa/utils/singleton.h"
-
-#ifdef USE_GPU
-#include <cuda_runtime.h>
-#include <cublas_v2.h>
-#endif
-
-using namespace singa;
-using namespace std;
-
-TEST(MathBlobTest, TestScale) {
-  Blob<float> *A = new Blob<float>(10);
-  Blob<float> *B = new Blob<float>(10);
-  A->SetValue(2);
-  B->SetValue(6);
-  Scale<float>(3.0, A);
-  ASSERT_EQ(A->check_equal(B), true);
-}
-
-TEST(MathBlobTest, TestAXPY) {
-  Blob<float> * A = new Blob<float>(10);
-  Blob<float> * B = new Blob<float>(10);
-  Blob<float> * C = new Blob<float>(10);
-  Blob<float> * D = new Blob<float>(10);
-  A->SetValue(2);
-  B->SetValue(3);
-  C->SetValue(7);
-  D->SetValue(2);
-  AXPY<float>(2.0, *A, B);
-  ASSERT_EQ(B->check_equal(C), true);
-  ASSERT_EQ(A->check_equal(D), true);
-}
-
-TEST(MathBlobTest, TestGEMV) {
-  float A[5][5] = {};
-  float AT[5][5] = {};
-  float B[5] = {};
-  float Res[5] = {};
-  for (int i = 0; i < 5; i++) {
-    for (int j = 0; j < 5; j++) {
-      A[i][j] = i * j + i - j;
-      AT[j][i] = i * j + i - j;
-    }
-    B[i] = 5*i + 3;
-    Res[i] = i;
-  }
-
-  Blob<float> * BlobA = new Blob<float>(5, 5);
-  Blob<float> * BlobAT = new Blob<float>(5, 5);
-  Blob<float> * BlobB = new Blob<float>(5);
-  Blob<float> * BlobAB = new Blob<float>(5);
-  Blob<float> * BlobATB = new Blob<float>(5);
-  Blob<float> * BlobRes = new Blob<float>(5);
-
-  BlobA->set_cpu_data(A[0]);
-  BlobAT->set_cpu_data(AT[0]);
-  BlobAT->set_transpose(true);
-  BlobB->set_cpu_data(B);
-  BlobAB->set_cpu_data(Res);
-  BlobATB->set_cpu_data(Res);
-
-  for (int i = 0; i < 5; i++) {
-    for (int j = 0; j < 5; j++) {
-      Res[i] += 2*A[i][j] * B[j];
-    }
-  }
-
-  BlobRes->set_cpu_data(Res);
-
-  GEMV<float>(2, 1, *BlobA, *BlobB, BlobAB);
-  GEMV<float>(2, 1, *BlobAT, *BlobB, BlobATB);
-
-  ASSERT_EQ(BlobAB->check_equal(BlobRes), true);
-  ASSERT_EQ(BlobATB->check_equal(BlobRes), true);
-}
-
-TEST(MathBlobTest, TestMVDot) {
-  float A[5][5] = {};
-  float AT[5][5] = {};
-  float B[5] = {};
-  float Res[5] = {};
-  for (int i = 0; i < 5; i++) {
-    for (int j = 0; j < 5; j++) {
-      A[i][j] = i * j + i - j;
-      AT[j][i] = i * j + i - j;
-    }
-    B[i] = 5*i -2;
-    Res[i] = 0;
-  }
-
-  Blob<float> * BlobA = new Blob<float>(5, 5);
-  Blob<float> * BlobAT = new Blob<float>(5, 5);
-  Blob<float> * BlobB = new Blob<float>(5);
-  Blob<float> * BlobAB = new Blob<float>(5);
-  Blob<float> * BlobATB = new Blob<float>(5);
-  Blob<float> * BlobRes = new Blob<float>(5);
-
-  BlobA->set_cpu_data(A[0]);
-  BlobAT->set_cpu_data(AT[0]);
-  BlobAT->set_transpose(true);
-  BlobB->set_cpu_data(B);
-  BlobAB->set_cpu_data(Res);
-  BlobATB->set_cpu_data(Res);
-
-  for (int i = 0; i < 5; i++) {
-    for (int j = 0; j < 5; j++) {
-      Res[i] += A[i][j] * B[j];
-    }
-  }
-
-  BlobRes->set_cpu_data(Res);
-
-  MVDot<float>(*BlobA, *BlobB, BlobAB);
-  MVDot<float>(*BlobAT, *BlobB, BlobATB);
-
-  const float * addrRes = BlobAB->cpu_data();
-  for (int i = 0; i < 5; i++) {
-    ASSERT_EQ(addrRes[i], Res[i]);
-  }
-  ASSERT_EQ(BlobAB->check_equal(BlobRes), true);
-  ASSERT_EQ(BlobAB->check_equal(BlobRes), true);
-  ASSERT_EQ(BlobATB->check_equal(BlobRes), true);
-}
-
-TEST(MathBlobTest, TestGEMM) {
-  float A[5][5] = {};
-  float AT[5][5] = {};
-  float B[5][5]= {};
-  float BT[5][5]= {};
-  float Res[5][5]= {};
-  for (int i = 0; i < 5; i++) {
-    for (int j = 0; j < 5; j++) {
-      A[i][j] = i * j + i - j;
-      AT[j][i] = i * j + i - j;
-      B[i][j] = - i * j + i * i - j * j;
-      BT[j][i] = - i * j + i * i - j * j;
-      Res[i][j] = i * j + i * i + j * j;
-    }
-  }
-
-  Blob<float> * BlobA = new Blob<float>(5, 5);
-  BlobA->set_cpu_data(A[0]);
-  Blob<float> * BlobAT = new Blob<float>(5, 5);
-  BlobAT->set_cpu_data(AT[0]);
-  BlobAT->set_transpose(true);
-  Blob<float> * BlobB = new Blob<float>(5, 5);
-  BlobB->set_cpu_data(B[0]);
-  Blob<float> * BlobBT = new Blob<float>(5, 5);
-  BlobBT->set_cpu_data(BT[0]);
-  BlobBT->set_transpose(true);
-  Blob<float> * BlobAB = new Blob<float>(5, 5);
-  BlobAB->set_cpu_data(Res[0]);
-  Blob<float> * BlobABT = new Blob<float>(5, 5);
-  BlobABT->set_cpu_data(Res[0]);
-  Blob<float> * BlobATB = new Blob<float>(5, 5);
-  BlobATB->set_cpu_data(Res[0]);
-  Blob<float> * BlobATBT = new Blob<float>(5, 5);
-  BlobATBT->set_cpu_data(Res[0]);
-
-  for (int i = 0; i < 5; i++) {
-    for (int j = 0; j < 5; j++) {
-      Res[i][j] *= 2;
-      for (int k = 0; k < 5; k++) {
-        Res[i][j] += 3 * A[i][k]*B[k][j];
-      }
-    }
-  }
-
-  Blob<float> * BlobRes = new Blob<float>(5, 5);
-  BlobRes->set_cpu_data(Res[0]);
-
-  GEMM<float>(3, 2, *BlobA, *BlobB, BlobAB);
-  GEMM<float>(3, 2, *BlobA, *BlobBT, BlobABT);
-  GEMM<float>(3, 2, *BlobAT, *BlobB, BlobATB);
-  GEMM<float>(3, 2, *BlobAT, *BlobBT, BlobATBT);
-
-  ASSERT_EQ(BlobAB->check_equal(BlobRes), true);
-  ASSERT_EQ(BlobATB->check_equal(BlobRes), true);
-  ASSERT_EQ(BlobABT->check_equal(BlobRes), true);
-  ASSERT_EQ(BlobATBT->check_equal(BlobRes), true);
-}
-
-TEST(MathBlobTest, TestMMDot) {
-  float A[5][5] = {};
-  float AT[5][5] = {};
-  float B[5][5]= {};
-  float BT[5][5]= {};
-  float Res[5][5]= {};
-  for (int i = 0; i < 5; i++) {
-    for (int j = 0; j < 5; j++) {
-      A[i][j] = i * j + i - j;
-      AT[j][i] = i * j + i - j;
-      B[i][j] = - i * j + i * i - j * j;
-      BT[j][i] = - i * j + i * i - j * j;
-      Res[i][j] = i * j + i * i + j * j;
-    }
-  }
-
-  Blob<float> * BlobA = new Blob<float>(5, 5);
-  BlobA->set_cpu_data(A[0]);
-  Blob<float> * BlobAT = new Blob<float>(5, 5);
-  BlobAT->set_cpu_data(AT[0]);
-  BlobAT->set_transpose(true);
-  Blob<float> * BlobB = new Blob<float>(5, 5);
-  BlobB->set_cpu_data(B[0]);
-  Blob<float> * BlobBT = new Blob<float>(5, 5);
-  BlobBT->set_cpu_data(BT[0]);
-  BlobBT->set_transpose(true);
-  Blob<float> * BlobAB = new Blob<float>(5, 5);
-  BlobAB->set_cpu_data(Res[0]);
-  Blob<float> * BlobABT = new Blob<float>(5, 5);
-  BlobABT->set_cpu_data(Res[0]);
-  Blob<float> * BlobATB = new Blob<float>(5, 5);
-  BlobATB->set_cpu_data(Res[0]);
-  Blob<float> * BlobATBT = new Blob<float>(5, 5);
-  BlobATBT->set_cpu_data(Res[0]);
-
-  for (int i = 0; i < 5; i++) {
-    for (int j = 0; j < 5; j++) {
-      Res[i][j] = 0;
-      for (int k = 0; k < 5; k++) {
-        Res[i][j] += A[i][k]*B[k][j];
-      }
-    }
-  }
-
-  Blob<float> * BlobRes = new Blob<float>(5, 5);
-  BlobRes->set_cpu_data(Res[0]);
-
-  MMDot<float>(*BlobA, *BlobB, BlobAB);
-  MMDot<float>(*BlobA, *BlobBT, BlobABT);
-  MMDot<float>(*BlobAT, *BlobB, BlobATB);
-  MMDot<float>(*BlobAT, *BlobBT, BlobATBT);
-
-  ASSERT_EQ(BlobAB->check_equal(BlobRes), true);
-  ASSERT_EQ(BlobATB->check_equal(BlobRes), true);
-  ASSERT_EQ(BlobABT->check_equal(BlobRes), true);
-  ASSERT_EQ(BlobATBT->check_equal(BlobRes), true);
-}
-
-TEST(MathBlobTest, TestVVDot) {
-  float A[10] = {};
-  float B[10] = {};
-  float prod = 0;
-  for (int i = 0; i < 10; i++) {
-    A[i] = i * i - 5* (i%2);
-    B[i] = 2* i * i - 3* (i%4);
-    prod += A[i] * B[i];
-  }
-
-  Blob<float> * BlobA = new Blob<float>(10);
-  BlobA->set_cpu_data(A);
-  Blob<float> * BlobB = new Blob<float>(10);
-  BlobB->set_cpu_data(B);
-  float blobprod = VVDot<float>(*BlobA, *BlobB);
-  ASSERT_EQ(blobprod, prod);
-}
-
-TEST(MathBlobTest, TestOuterProduct) {
-  float A[10] = {};
-  float B[10] = {};
-  float AB[10][10] = {};
-  for (int i = 0; i < 10; i++) {
-    A[i] = i * i - 5* (i%2);
-    B[i] = 2* i * i - 3* (i%4);
-  }
-  for (int i = 0; i < 10; i++) {
-    for (int j = 0; j < 10; j++) {
-      AB[i][j] = A[i]*B[j];
-    }
-  }
-  Blob<float> * BlobA = new Blob<float>(10);
-  BlobA->set_cpu_data(A);
-  Blob<float> * BlobB = new Blob<float>(10);
-  BlobB->set_cpu_data(B);
-  Blob<float> * BlobAB = new Blob<float>(10, 10);
-  // BlobAB->SetValue(3);
-  Blob<float> * BlobRes = new Blob<float>(10, 10);
-  BlobRes->set_cpu_data(AB[0]);
-  OuterProduct<float>(*BlobA, *BlobB, BlobAB);
-
-  ASSERT_EQ(BlobAB->check_equal(BlobRes), true);
-}
-
-TEST(MathBlobTest, TestMapAB) {
-  float A[10] = {};
-  float Res[10] = {};
-  for (int i = 0; i < 10; i++) {
-    A[i] = i * i - 5* (i%2);
-    Res[i] = A[i] * A[i];
-  }
-  Blob<float> * BlobA = new Blob<float>(10);
-  BlobA->set_cpu_data(A);
-  Blob<float> * BlobB = new Blob<float>(10);
-  Blob<float> * BlobRes = new Blob<float>(10);
-  BlobRes->set_cpu_data(Res);
-  Map<singa::op::Square<float>, float>(*BlobA, BlobB);
-  ASSERT_EQ(BlobB->check_equal(BlobRes), true);
-}
-
-TEST(MathBlobTest, TestMapABC) {
-  float A[10] = {};
-  float B[10] = {};
-  float Res[10] = {};
-  for (int i = 0; i < 10; i++) {
-    A[i] = i * i - 5* (i%2);
-    B[i] = 2* i * i - 3* (i%4);
-    Res[i] = A[i] * B[i];
-  }
-  Blob<float> * BlobA = new Blob<float>(10);
-  BlobA->set_cpu_data(A);
-  Blob<float> * BlobB = new Blob<float>(10);
-  BlobB->set_cpu_data(B);
-  Blob<float> * BlobC = new Blob<float>(10);
-  Blob<float> * BlobRes = new Blob<float>(10);
-  BlobRes->set_cpu_data(Res);
-  Map<singa::op::Mult<float>, float>(*BlobA, *BlobB, BlobC);
-  ASSERT_EQ(BlobC->check_equal(BlobRes), true);
-}
-
-TEST(MathBlobTest, TestCopy) {
-  Blob<float> *BlobA = new Blob<float>(10);
-  Blob<float> *BlobB = new Blob<float>(10);
-  float A[10] = {};
-  for (int i = 0; i < 10; i++) {
-    A[i] = i * i - 5* (i%2);
-  }
-  BlobA->set_cpu_data(A);
-  Copy<float>(*BlobA, BlobB);
-  ASSERT_EQ(BlobA->check_equal(BlobB), true);
-}
-
-TEST(MathBlobTest, TestAdd) {
-  Blob<float> *A = new Blob<float>(10);
-  Blob<float> *B = new Blob<float>(10);
-  Blob<float> *C = new Blob<float>(10);
-  Blob<float> *D = new Blob<float>(10);
-  A->SetValue(5);
-  B->SetValue(6);
-  D->SetValue(11);
-  Add<float>(*A, *B, C);
-  ASSERT_EQ(C->check_equal(D), true);
-}
-
-TEST(MathBlobTest, TestSub) {
-  Blob<float> *A = new Blob<float>(10);
-  Blob<float> *B = new Blob<float>(10);
-  Blob<float> *C = new Blob<float>(10);
-  Blob<float> *D = new Blob<float>(10);
-  A->SetValue(5);
-  B->SetValue(6);
-  D->SetValue(-1);
-  Sub<float>(*A, *B, C);
-  ASSERT_EQ(C->check_equal(D), true);
-}
-
-TEST(MathBlobTest, TestMVAddCol) {
-  Blob<float> *BlobA = new Blob<float>(10);
-  Blob<float> *BlobB = new Blob<float>(10, 10);
-  Blob<float> *BlobBT = new Blob<float>(10, 10);
-  Blob<float> *BlobRes = new Blob<float>(10, 10);
-  Blob<float> *BlobResT = new Blob<float>(10, 10);
-
-  float A[10] = {};
-  float B[10][10] = {};
-  float BT[10][10] = {};
-  for (int i = 0; i < 10; i++) {
-    A[i] = 5*i -2;
-    for (int j = 0; j < 10; j++) {
-      B[i][j] = i * j + i - j;
-      BT[j][i] = i * j + i - j;
-    }
-  }
-
-  BlobA->set_cpu_data(A);
-  BlobB->set_cpu_data(B[0]);
-  BlobBT->set_cpu_data(BT[0]);
-  BlobBT->set_transpose(true);
-
-  for (int i = 0; i < 10; i++) {
-    for (int j = 0; j < 10; j++) {
-      B[i][j] = 2.0 * A[i] + 3.0 * B[i][j];
-      BT[j][i] = 2.0 * A[i] + 3.0 * BT[j][i];
-    }
-  }
-
-  BlobRes->set_cpu_data(B[0]);
-  BlobResT->set_cpu_data(BT[0]);
-  BlobResT->set_transpose(true);
-
-  MVAddCol<float>(2.0, 3.0, *BlobA, BlobB);
-  MVAddCol<float>(2.0, 3.0, *BlobA, BlobBT);
-
-  ASSERT_EQ(BlobB->check_equal(BlobRes), true);
-  ASSERT_EQ(BlobBT->check_equal(BlobResT), true);
-}
-
-TEST(MathBlobTest, TestMVAddRow) {
-  Blob<float> *BlobA = new Blob<float>(10);
-  Blob<float> *BlobB = new Blob<float>(10, 10);
-  Blob<float> *BlobBT = new Blob<float>(10, 10);
-  Blob<float> *BlobRes = new Blob<float>(10, 10);
-  Blob<float> *BlobResT = new Blob<float>(10, 10);
-
-  float A[10] = {};
-  float B[10][10] = {};
-  float BT[10][10] = {};
-  for (int i = 0; i < 10; i++) {
-    A[i] = 5*i -2;
-    for (int j = 0; j < 10; j++) {
-      B[i][j] = i * j + i - j;
-      BT[j][i] = i * j + i - j;
-    }
-  }
-
-  BlobA->set_cpu_data(A);
-  BlobB->set_cpu_data(B[0]);
-  BlobBT->set_cpu_data(BT[0]);
-  BlobBT->set_transpose(true);
-
-  for (int i = 0; i < 10; i++) {
-    for (int j = 0; j < 10; j++) {
-      B[j][i] = 2.0 * A[i] + 3.0 * B[j][i];
-      BT[i][j] = 2.0 * A[i] + 3.0 * BT[i][j];
-    }
-  }
-
-  BlobRes->set_cpu_data(B[0]);
-  BlobResT->set_cpu_data(BT[0]);
-  BlobResT->set_transpose(true);
-
-  MVAddRow<float>(2.0, 3.0, *BlobA, BlobB);
-  MVAddRow<float>(2.0, 3.0, *BlobA, BlobBT);
-
-  ASSERT_EQ(BlobB->check_equal(BlobRes), true);
-  ASSERT_EQ(BlobBT->check_equal(BlobResT), true);
-}
-
-TEST(MathBlobTest, TestRepmatCol) {
-  Blob<float> *BlobA = new Blob<float>(10);
-  Blob<float> *BlobB = new Blob<float>(10, 10);
-  Blob<float> *BlobBT = new Blob<float>(10, 10);
-  Blob<float> *BlobRes = new Blob<float>(10, 10);
-  Blob<float> *BlobResT = new Blob<float>(10, 10);
-
-  float A[10] = {};
-  float B[10][10] = {};
-  float BT[10][10] = {};
-  for (int i = 0; i < 10; i++) {
-    A[i] = 5*i -2;
-    for (int j = 0; j < 10; j++) {
-      B[i][j] = A[i];
-      BT[j][i] = A[i];
-    }
-  }
-
-  BlobA->set_cpu_data(A);
-  BlobBT->set_transpose(true);
-
-  BlobRes->set_cpu_data(B[0]);
-  BlobResT->set_cpu_data(BT[0]);
-  BlobResT->set_transpose(true);
-
-  RepmatCol<float>(*BlobA, BlobB);
-  RepmatCol<float>(*BlobA, BlobBT);
-
-  ASSERT_EQ(BlobB->check_equal(BlobRes), true);
-  ASSERT_EQ(BlobBT->check_equal(BlobResT), true);
-}
-
-TEST(MathBlobTest, TestRepmatRow) {
-  Blob<float> *BlobA = new Blob<float>(10);
-  Blob<float> *BlobB = new Blob<float>(10, 10);
-  Blob<float> *BlobBT = new Blob<float>(10, 10);
-  Blob<float> *BlobRes = new Blob<float>(10, 10);
-  Blob<float> *BlobResT = new Blob<float>(10, 10);
-
-  float A[10] = {};
-  float B[10][10] = {};
-  float BT[10][10] = {};
-  for (int i = 0; i < 10; i++) {
-    A[i] = 5*i -2;
-    for (int j = 0; j < 10; j++) {
-      B[j][i] = A[i];
-      BT[i][j] = A[i];
-    }
-  }
-
-  BlobA->set_cpu_data(A);
-  BlobBT->set_transpose(true);
-
-  BlobRes->set_cpu_data(B[0]);
-  BlobResT->set_cpu_data(BT[0]);
-  BlobResT->set_transpose(true);
-
-  RepmatRow<float>(*BlobA, BlobB);
-  RepmatRow<float>(*BlobA, BlobBT);
-
-  ASSERT_EQ(BlobB->check_equal(BlobRes), true);
-  ASSERT_EQ(BlobBT->check_equal(BlobResT), true);
-}
-
-TEST(MathBlobTest, TestMVSumCol) {
-  Blob<float> *BlobA = new Blob<float>(10);
-  Blob<float> *BlobACopy = new Blob<float>(10);
-  Blob<float> *BlobB = new Blob<float>(10, 10);
-  Blob<float> *BlobBT = new Blob<float>(10, 10);
-  Blob<float> *BlobRes = new Blob<float>(10);
-
-  float A[10] = {};
-  float B[10][10] = {};
-  float BT[10][10] = {};
-  for (int i = 0; i < 10; i++) {
-    A[i] = 5*i -2;
-    for (int j = 0; j < 10; j++) {
-      B[i][j] = i * j + i - j;
-      BT[j][i] = i * j + i - j;
-    }
-  }
-
-  BlobA->set_cpu_data(A);
-  BlobACopy->set_cpu_data(A);
-  BlobB->set_cpu_data(B[0]);
-  BlobBT->set_cpu_data(BT[0]);
-  BlobBT->set_transpose(true);
-
-  for (int i = 0; i < 10; i++) {
-    A[i] *= 2.0;
-    for (int j = 0; j < 10; j++) {
-      A[i] += 3.0 * B[i][j];
-    }
-  }
-  BlobRes->set_cpu_data(A);
-
-  MVSumCol<float>(2.0, 3.0, *BlobB, BlobA);
-  MVSumCol<float>(2.0, 3.0, *BlobBT, BlobACopy);
-
-  ASSERT_EQ(BlobA->check_equal(BlobRes), true);
-  ASSERT_EQ(BlobACopy->check_equal(BlobRes), true);
-}
-
-TEST(MathBlobTest, TestMVSumRow) {
-  Blob<float> *BlobA = new Blob<float>(10);
-  Blob<float> *BlobACopy = new Blob<float>(10);
-  Blob<float> *BlobB = new Blob<float>(10, 10);
-  Blob<float> *BlobBT = new Blob<float>(10, 10);
-  Blob<float> *BlobRes = new Blob<float>(10);
-
-  float A[10] = {};
-  float B[10][10] = {};
-  float BT[10][10] = {};
-  for (int i = 0; i < 10; i++) {
-    A[i] = 5*i -2;
-    for (int j = 0; j < 10; j++) {
-      B[j][i] = i * j + i - j;
-      BT[i][j] = i * j + i - j;
-    }
-  }
-
-  BlobA->set_cpu_data(A);
-  BlobACopy->set_cpu_data(A);
-  BlobB->set_cpu_data(B[0]);
-  BlobBT->set_cpu_data(BT[0]);
-  BlobBT->set_transpose(true);
-
-  for (int i = 0; i < 10; i++) {
-    A[i] *= 2.0;
-    for (int j = 0; j < 10; j++) {
-      A[i] += 3.0 * B[j][i];
-    }
-  }
-  BlobRes->set_cpu_data(A);
-
-  MVSumRow<float>(2.0, 3.0, *BlobB, BlobA);
-  MVSumRow<float>(2.0, 3.0, *BlobBT, BlobACopy);
-
-  ASSERT_EQ(BlobA->check_equal(BlobRes), true);
-  ASSERT_EQ(BlobACopy->check_equal(BlobRes), true);
-}
-
-TEST(MathBlobTest, TestASum) {
-  float A[10] = {};
-  for (int i = 0; i < 10; i++) {
-    A[i] = ((i % 3) -1) * i;
-  }
-
-  Blob<float> *BlobA = new Blob<float>(10);
-  BlobA->set_cpu_data(A);
-
-  float BlobRes = Asum<float>(*BlobA);
-  float res = cblas_sasum(10, A, 1) / 10;
-
-  ASSERT_EQ(BlobRes, res);
-}
-
-TEST(MathTest, TestGemmCPU) {
-  float A[3][2] = {};
-  float B[3][2] = {};
-  float C[2][2] = {};
-  for (int i = 0; i < 3; i++)
-    for (int j = 0; j < 2; j++) {
-      A[i][j] = i+j;
-      B[i][j] = i+j - i*j;
-    }
-  cpu_gemm(A[0], B[0], 2, 2, 3 , 1.0f, 0.0f, true, false, C[0]);
-  float D[2][2] = {};
-  for (int i = 0; i < 2; i++)
-    for (int j = 0; j < 2; j++) {
-      D[i][j] = 0;
-      for (int k = 0; k < 3; k++)
-        D[i][j] += A[k][i]*B[k][j];
-    }
-    for (int i = 0; i < 2; i++)
-      for (int j = 0; j < 2; j++) {
-      ASSERT_EQ(C[i][j], D[i][j]);
-    }
-}
-
-TEST(MathTest, TestGemvCPU) {
-  float A[4][3] = {};
-  float B[4]= {};
-  float C[3] = {};
-  float D[3] = {};
-
-  for (int i = 0; i < 3; i++) {
-    for (int j = 0; j < 4; j++) {
-      A[j][i] = i-j + i*j;
-    }
-  }
-
-  for (int i = 0; i < 4; i++)B[i] = i;
-  for (int i = 0; i < 3; i++)C[i] = 10;
-  cpu_gemv(A[0], B, 4, 3, 1.0f, 1.0f, true, C);
-
-  for (int i = 0; i < 3; i++) {
-    for (int j = 0; j < 4; j++) {
-      D[i] += A[j][i]*B[j];
-    }
-  }
-  for (int i = 0; i < 3; i++) {
-    ASSERT_EQ(C[i], D[i]+10);
-  }
-}
-
-
-/*
-TEST(MathTest, TestAxpyCPU) {
-  float A[4][3] = {};
-  float C[4][3] = {};
-  float B[3][4] = {};
-  float D[3][4] = {};
-
-  for (int i = 0; i < 4; i++) {
-    for (int j = 0; j < 3; j++) {
-      A[i][j] = i-j + i*j;
-      B[j][i] = i-j + i*j;
-      C[i][j] = A[i][j];
-      D[j][i] = B[j][i];
-    }
-  }
-
-  cpu_axpy(A[0], 12, 2.0f, B[0]);
-  for (int i = 0; i < 12; i++) {
-    D[i / 4][i % 4] += 2*C[i / 3][i % 3];
-  }
-
-  for (int i = 0; i < 3; i++) {
-    for (int j = 0; j < 4; j++) {
-      ASSERT_EQ(B[i][j], D[i][j]);
-    }
-  }
-}
-
-TEST(MathTest, TestEopCPU) {
-
-  float A[10] = {};
-  float B[10] = {};
-  float C[10] = {};
-  float O[10] = {};
-
-  for (int i = 0; i < 10; i++) {
-    A[i] = i;
-    B[i] = -i;
-    C[i] = i;
-  }
-  cpu_e_f<singa::op::Set>(5, 15.0f, O, O);
-  for (int i = 0; i < 5; i++) {
-    ASSERT_EQ(O[i]-15,0);
-  }
-  for (int i = 5; i < 10; i++) {
-    ASSERT_EQ(O[i],0);
-  }
-}
-*/
-
-#ifdef USE_GPU
-TEST(MathTest, TestGemmGPU) {
-  float A[3][2] = {};
-  float B[3][2] = {};
-  float C[2][2] = {};
-  for (int i = 0; i < 3; i++) {
-    for (int j = 0; j < 2; j++) {
-      A[i][j] = i+j;
-      B[i][j] = i+j - i*j;
-    }
-  }
-
-  float* A_gpu = NULL;
-  float* B_gpu = NULL;
-  float* C_gpu = NULL;
-
-  cudaMalloc(reinterpret_cast<void**>(&A_gpu), 3*2*sizeof(float));
-  cudaMalloc(reinterpret_cast<void**>(&B_gpu), 3*2*sizeof(float));
-  cudaMalloc(reinterpret_cast<void**>(&C_gpu), 2*2*sizeof(float));
-
-  cudaMemcpy(A_gpu, A, 3*2*sizeof(float), cudaMemcpyHostToDevice);
-  cudaMemcpy(B_gpu, B, 3*2*sizeof(float), cudaMemcpyHostToDevice);
-  auto context = Singleton<Context>::Instance();
-  context->SetupDevice(std::this_thread::get_id(), 0);
-  gpu_gemm<float>(context->cublas_handle(0), A_gpu, B_gpu, 2, 2, 3 , 1, 0, true,
-                  false, C_gpu);
-
-  cudaMemcpy(C, C_gpu, 2*2*sizeof(float), cudaMemcpyDeviceToHost);
-
-  float D[2][2] = {};
-  for (int i = 0; i < 2; i++) {
-    for (int j = 0; j < 2; j++) {
-      D[i][j] = 0;
-      for (int k = 0; k < 3; k++) {
-        D[i][j] += A[k][i]*B[k][j];
-      }
-    }
-  }
-
-  for (int i = 0; i < 2; i++) {
-    for (int j = 0; j < 2; j++) {
-      ASSERT_EQ(C[i][j], D[i][j]);
-    }
-  }
-
-  cudaFree(A_gpu);
-  cudaFree(B_gpu);
-  cudaFree(C_gpu);
-}
-
-
-TEST(MathTest, TestGemvGPU) {
-  float A[4][3] = {};
-  float B[4]= {};
-  float C[3] = {};
-  float D[3] = {};
-
-  for (int i = 0; i < 4; i++) {
-    for (int j = 0; j < 3; j++) {
-      A[i][j] = i-j + i*j;
-    }
-  }
-
-  for (int i = 0; i < 4; i++) B[i] = i;
-  for (int i = 0; i < 3; i++) C[i] = 10;
-
-  float* A_gpu = NULL;
-  float* B_gpu = NULL;
-  float* C_gpu = NULL;
-
-  cudaMalloc(reinterpret_cast<void**>(&A_gpu), 4*3*sizeof(float));
-  cudaMalloc(reinterpret_cast<void**>(&B_gpu), 4*sizeof(float));
-  cudaMalloc(reinterpret_cast<void**>(&C_gpu), 3*sizeof(float));
-
-  cudaMemcpy(A_gpu, A, 4*3*sizeof(float), cudaMemcpyHostToDevice);
-  cudaMemcpy(B_gpu, B, 4*sizeof(float), cudaMemcpyHostToDevice);
-  cudaMemcpy(C_gpu, C, 3*sizeof(float), cudaMemcpyHostToDevice);
-  auto context = Singleton<Context>::Instance();
-  context->SetupDevice(std::this_thread::get_id(), 0);
-  gpu_gemv<float>(context->cublas_handle(0), A_gpu, B_gpu, 4, 3, 1.0f, 1.0f,
-                  true, C_gpu);
-
-  cudaMemcpy(C, C_gpu, 3*sizeof(float), cudaMemcpyDeviceToHost);
-
-  for (int i = 0; i < 3; i++) {
-    for (int j = 0; j < 4; j++) {
-      D[i] += A[j][i]*B[j];
-    }
-  }
-
-  for (int i = 0; i < 3; i++) {
-    ASSERT_EQ(C[i], D[i]+10);
-  }
-
-  cudaFree(A_gpu);
-  cudaFree(B_gpu);
-  cudaFree(C_gpu);
-}
-
-
-/*
-TEST(MathTest, TestAxpyGPU) {
-  float A[4][3] = {};
-  float C[4][3] = {};
-  float B[3][4] = {};
-  float D[3][4] = {};
-
-  for (int i = 0; i < 4; i++)
-  {
-    for (int j = 0; j < 3; j++)
-    {
-      A[i][j] = i-j + i*j;
-      B[j][i] = i-j + i*j;
-      C[i][j] = A[i][j];
-      D[j][i] = B[j][i];
-    }
-  }
-
-  float* A_gpu=NULL;
-  float* B_gpu=NULL;
-
-  cudaMalloc((void**)&A_gpu, 4*3*sizeof(float));
-  cudaMalloc((void**)&B_gpu, 3*4*sizeof(float));
-
-  cudaMemcpy(A_gpu,A,4*3*sizeof(float),cudaMemcpyHostToDevice);
-  cudaMemcpy(B_gpu,B,3*4*sizeof(float),cudaMemcpyHostToDevice);
-
-  gpu_axpy<float>(A_gpu, 12, 2, B_gpu);
-
-  cudaMemcpy(A,A_gpu,4*3*sizeof(float),cudaMemcpyDeviceToHost);
-  cudaMemcpy(B,B_gpu,3*4*sizeof(float),cudaMemcpyDeviceToHost);
-
-  //for (int i = 0; i < 12; i++)D[0][i] += 2*C[0][i];
-
-  for (int i = 0; i < 4; i++)
-  {
-    for (int j = 0; j < 3; j++)
-    {
-      D[i][j] += C[i][j];
-      ASSERT_EQ(B[i][j],D[i][j]);
-    }
-  }
-
-  cudaFree(A_gpu);
-  cudaFree(B_gpu);
-}
-*/
-
-
-TEST(MathTest, TestDotGPU) {
-  float A[12];
-  float B[12];
-  for (int i = 0; i < 12; i++) {
-    A[i] = i - 1;
-    B[i] = i + 1;
-  }
-
-  float* A_gpu = NULL;
-  float* B_gpu = NULL;
-
-  cudaMalloc(reinterpret_cast<void**>(&A_gpu), 12*sizeof(float));
-  cudaMalloc(reinterpret_cast<void**>(&B_gpu), 12*sizeof(float));
-
-  cudaMemcpy(A_gpu, A, 12*sizeof(float), cudaMemcpyHostToDevice);
-  cudaMemcpy(B_gpu, B, 12*sizeof(float), cudaMemcpyHostToDevice);
-  auto context = Singleton<Context>::Instance();
-  context->SetupDevice(std::this_thread::get_id(), 0);
-  float gpu_ret = gpu_dot<float>(context->cublas_handle(0), 12, A_gpu, B_gpu);
-
-  float cpu_ret = 0.0f;
-  for (int i = 0; i < 12; i++) {
-    cpu_ret += A[i] * B[i];
-  }
-
-  ASSERT_EQ(gpu_ret, cpu_ret);
-
-  cudaFree(A_gpu);
-  cudaFree(B_gpu);
-}
-
-TEST(MathTest, TestSingaSumRowGPU) {
-  float A[3][4];
-  float B[4];
-  float C[4];
-
-  for (int i = 0; i < 3; i++) {
-    for (int j = 0; j < 4; j++) {
-      // A[i][j] = i + j;
-      A[i][j] = 1.0f;
-    }
-  }
-
-  for (int i = 0; i < 4; i++) {
-    B[i] = 0.0f;
-    C[i] = 0.0f;
-  }
-
-  float* A_gpu = NULL;
-  float* B_gpu = NULL;
-
-  cudaMalloc(reinterpret_cast<void**>(&A_gpu), 12*sizeof(float));
-  cudaMalloc(reinterpret_cast<void**>(&B_gpu), 4*sizeof(float));
-  cudaMemcpy(A_gpu, A, 12*sizeof(float), cudaMemcpyHostToDevice);
-  singa_gpu_sum_row(A_gpu, B_gpu, 3, 4, 4);
-
-  cudaMemcpy(B, B_gpu, 4*sizeof(float), cudaMemcpyDeviceToHost);
-
-  for (int i = 0; i < 4; i++) {
-    for (int j = 0; j < 3; j++) {
-      C[i] += A[j][i];
-    }
-  }
-
-  for (int i = 0; i < 4; i++) {
-    ASSERT_EQ(B[i], C[i]);
-  }
-
-  cudaFree(A_gpu);
-  cudaFree(B_gpu);
-}
-
-TEST(MathTest, TestSingaAddVecRowGPU) {
-  float A[3][4];
-  float B[4];
-  float C[3][4];
-  float D[3][4];
-
-  for (int i = 0; i < 4; i++) {
-    B[i] = i;
-  }
-
-  for (int i = 0; i < 3; i++) {
-    for (int j = 0; j < 4; j++) {
-      A[i][j] = i + j;
-      D[i][j] = A[i][j] + B[j];
-    }
-  }
-
-  float* A_gpu = NULL;
-  float* B_gpu = NULL;
-  float* C_gpu = NULL;
-
-  cudaMalloc(reinterpret_cast<void**>(&A_gpu), 3*4*sizeof(float));
-  cudaMalloc(reinterpret_cast<void**>(&B_gpu), 4*sizeof(float));
-  cudaMalloc(reinterpret_cast<void**>(&C_gpu), 3*4*sizeof(float));
-  cudaMemcpy(A_gpu, A, 3*4*sizeof(float), cudaMemcpyHostToDevice);
-  cudaMemcpy(B_gpu, B, 4*sizeof(float), cudaMemcpyHostToDevice);
-
-  singa_gpu_add_vec_row(B_gpu, A_gpu, C_gpu, 3, 4, 4);
-
-  cudaMemcpy(C, C_gpu, 3*4*sizeof(float), cudaMemcpyDeviceToHost);
-
-  for (int i = 0; i < 3; i++) {
-    for (int j = 0; j < 4; j++) {
-      ASSERT_EQ(C[i][j], D[i][j]);
-    }
-  }
-
-  cudaFree(A_gpu);
-  cudaFree(B_gpu);
-  cudaFree(C_gpu);
-}
-
-
-TEST(MathTest, TestSingaSetValueGPU) {
-  float A[3][4];
-  float* A_gpu = NULL;
-
-  cudaMalloc(reinterpret_cast<void**>(&A_gpu), 3*4*sizeof(float));
-
-  cudaMemcpy(A_gpu, A, 3*4*sizeof(float), cudaMemcpyHostToDevice);
-
-  singa_gpu_set_value(A_gpu, 4.0, 3*4);
-
-  cudaMemcpy(A, A_gpu, 3*4*sizeof(float), cudaMemcpyDeviceToHost);
-
-  for (int i = 0; i < 3; i++) {
-    for (int j = 0; j < 4; j++) {
-      ASSERT_EQ(A[i][j], 4.0f);
-    }
-  }
-
-  cudaFree(A_gpu);
-}
-
-
-TEST(MathTest, TestEopGPU) {
-  float A[10] = {};
-  float B[10] = {};
-
-  for (int i = 0; i < 10; i++) {
-    A[i] = i;
-    B[i] = -i;
-  }
-
-  float* A_gpu = NULL;
-  float* B_gpu = NULL;
-
-  cudaMalloc(reinterpret_cast<void**>(&A_gpu), 10*sizeof(float));
-  cudaMalloc(reinterpret_cast<void**>(&B_gpu), 10*sizeof(float));
-
-  cudaMemcpy(A_gpu, A, 10*sizeof(float), cudaMemcpyHostToDevice);
-  cudaMemcpy(B_gpu, B, 10*sizeof(float), cudaMemcpyHostToDevice);
-
-  gpu_e_f<singa::op::Sigmoid<float>, float>(10, A_gpu, B_gpu);
-
-  cudaFree(A_gpu);
-  cudaFree(B_gpu);
-}
-#endif  // USE_GPU
diff --git a/src/test/test_msg.cc b/src/test/test_msg.cc
deleted file mode 100644
index db83b1c..0000000
--- a/src/test/test_msg.cc
+++ /dev/null
@@ -1,102 +0,0 @@
-/************************************************************
-*
-* Licensed to the Apache Software Foundation (ASF) under one
-* or more contributor license agreements.  See the NOTICE file
-* distributed with this work for additional information
-* regarding copyright ownership.  The ASF licenses this file
-* to you under the Apache License, Version 2.0 (the
-* "License"); you may not use this file except in compliance
-* with the License.  You may obtain a copy of the License at
-*
-*   http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an
-* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-* KIND, either express or implied.  See the License for the
-* specific language governing permissions and limitations
-* under the License.
-*
-*************************************************************/
-
-#include "gtest/gtest.h"
-#include "singa/comm/msg.h"
-using namespace singa;
-TEST(MsgTest, AddrTest) {
-  int src_grp = 1, src_worker = 2;
-  int dst_grp = 0, dst_server = 1;
-  int src_addr = Addr(src_grp, src_worker, 0);
-  int dst_addr = Addr(dst_grp, dst_server, 1);
-  Msg msg(src_addr, dst_addr);
-  msg.set_trgt(123, -1);
-  ASSERT_EQ(AddrGrp(msg.src()), src_grp);
-  ASSERT_EQ(AddrID(msg.src()), src_worker);
-  ASSERT_EQ(AddrType(msg.src()), 0);
-
-  msg.SwapAddr();
-  ASSERT_EQ(AddrGrp(msg.src()), dst_grp);
-  ASSERT_EQ(AddrID(msg.src()), dst_server);
-  ASSERT_EQ(AddrType(msg.src()), 1);
-  ASSERT_EQ(msg.trgt_val(), 123);
-  ASSERT_EQ(msg.trgt_version(), -1);
-}
-
-TEST(MsgTest, AddFrameTest) {
-  int buf[5] = {1, 2, 3, 4, 5};
-  Msg msg;
-  msg.AddFrame("abcdefg", 7);
-  msg.AddFrame(buf, sizeof(int) * 5);
-
-  msg.FirstFrame();
-  char* str = msg.FrameStr();
-  ASSERT_STREQ(str, "abcdefg");
-  delete str;
-  ASSERT_EQ(msg.NextFrame(), true);
-  int *val = static_cast<int*>(msg.FrameData());
-  ASSERT_EQ(val[3], 4);
-  ASSERT_EQ(msg.NextFrame(), false);
-
-  msg.FirstFrame();
-  str = msg.FrameStr();
-  ASSERT_STREQ(str, "abcdefg");
-  msg.LastFrame();
-  val = static_cast<int*>(msg.FrameData());
-  ASSERT_EQ(val[2], 3);
-}
-
-TEST(MsgTest, AddFormatFrame) {
-  int x = 5;
-  Msg msg;
-  msg.AddFormatFrame("i", 12);
-  msg.AddFormatFrame("f", 10.f);
-  msg.AddFormatFrame("s", "abc");
-  msg.AddFormatFrame("p", &x);
-  msg.AddFormatFrame("isfp", 12, "abc", 10.f, &x);
-
-  msg.FirstFrame();
-  int y;
-  msg.ParseFormatFrame("i", &y);
-  ASSERT_EQ(y, 12);
-  ASSERT_EQ(msg.NextFrame(), true);
-
-  float z;
-  msg.ParseFormatFrame("f", &z);
-  ASSERT_EQ(z, 10.f);
-  ASSERT_EQ(msg.NextFrame(), true);
-
-  char buf[10];
-  msg.ParseFormatFrame("s", buf);
-  ASSERT_STREQ(buf, "abc");
-  ASSERT_EQ(msg.NextFrame(), true);
-
-  int *p;
-  msg.ParseFormatFrame("p", &p);
-  ASSERT_EQ(p, &x);
-  ASSERT_EQ(msg.NextFrame(), true);
-
-  msg.ParseFormatFrame("isfp", &y, buf, &z, &p);
-  ASSERT_EQ(y, 12);
-  ASSERT_STREQ(buf, "abc");
-  ASSERT_EQ(z, 10.f);
-  ASSERT_EQ(p, &x);
-}
diff --git a/src/test/test_neuralnet.cc b/src/test/test_neuralnet.cc
deleted file mode 100644
index 3ab197b..0000000
--- a/src/test/test_neuralnet.cc
+++ /dev/null
@@ -1,116 +0,0 @@
-/************************************************************
-*
-* Licensed to the Apache Software Foundation (ASF) under one
-* or more contributor license agreements.  See the NOTICE file
-* distributed with this work for additional information
-* regarding copyright ownership.  The ASF licenses this file
-* to you under the Apache License, Version 2.0 (the
-* "License"); you may not use this file except in compliance
-* with the License.  You may obtain a copy of the License at
-* 
-*   http://www.apache.org/licenses/LICENSE-2.0
-* 
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an
-* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-* KIND, either express or implied.  See the License for the
-* specific language governing permissions and limitations
-* under the License.
-*
-*************************************************************/
-
-#include "gtest/gtest.h"
-#include "singa/driver.h"
-#include "singa/neuralnet/connection_layer.h"
-#include "singa/neuralnet/neuralnet.h"
-#include "singa/neuralnet/neuron_layer.h"
-
-using namespace singa;
-
-const int N = 10;  // size of dim 0
-const int M = 20;  // size of dim 1
-const int K = 2;  // size of partitions
-
-TEST(NeuralNet, RegisterLayers) {
-  Driver driver;
-  driver.RegisterLayer<DummyLayer, int>(kDummy);
-  driver.RegisterLayer<SliceLayer, int>(kSlice);
-  driver.RegisterLayer<SplitLayer, int>(kSplit);
-  driver.RegisterLayer<ConcateLayer, int>(kConcate);
-  driver.RegisterLayer<BridgeSrcLayer, int>(kBridgeSrc);
-  driver.RegisterLayer<BridgeDstLayer, int>(kBridgeDst);
-}
-
-TEST(NeuralNet, AddModelSplitLayers) {
-  NetProto proto;
-  // use dummy as input layer
-  LayerProto* proto_in = proto.add_layer();
-  proto_in->set_name("dummy_input");
-  proto_in->set_type(kDummy);
-  proto_in->mutable_dummy_conf()->set_input(true);
-  proto_in->mutable_dummy_conf()->add_shape(N);
-  proto_in->mutable_dummy_conf()->add_shape(M);
-  // use 2 dummy neuron layers
-  for (int i = 0; i < 2; ++i) {
-    LayerProto* proto_neuron = proto.add_layer();
-    proto_neuron->set_name("dummy_neuron_" + std::to_string(i));
-    proto_neuron->set_type(kDummy);
-    proto_neuron->add_srclayers("dummy_input");
-  }
-  // use dummy as output layer
-  for (int i = 0; i < 2; ++i) {
-    LayerProto* proto_out = proto.add_layer();
-    proto_out->set_name("dummy_output" + std::to_string(i));
-    proto_out->set_type(kDummy);
-    proto_out->mutable_dummy_conf()->set_output(true);
-    proto_out->add_srclayers("dummy_neuron_" + std::to_string(i));
-  }
-  NeuralNet::Create(proto, kTrain, K);
-}
-
-TEST(NeuralNet, DirectConnection) {
-  NetProto proto;
-  // use dummy as input layer
-  LayerProto* proto_in = proto.add_layer();
-  proto_in->set_name("dummy_input");
-  proto_in->set_type(kDummy);
-  proto_in->mutable_dummy_conf()->set_input(true);
-  proto_in->mutable_dummy_conf()->add_shape(N);
-  proto_in->mutable_dummy_conf()->add_shape(M);
-  // use dummy neuron layer
-  LayerProto* proto_neuron = proto.add_layer();
-  proto_neuron->set_name("dummy_neuron");
-  proto_neuron->set_type(kDummy);
-  proto_neuron->add_srclayers("dummy_input");
-  // use dummy as output layer
-  LayerProto* proto_out = proto.add_layer();
-  proto_out->set_name("dummy_output");
-  proto_out->set_type(kDummy);
-  proto_out->mutable_dummy_conf()->set_output(true);
-  proto_out->add_srclayers("dummy_neuron");
-  NeuralNet::Create(proto, kTrain, K);
-}
-
-TEST(NeuralNet, SliceConcate) {
-  NetProto proto;
-  // use dummy as input layer
-  LayerProto* proto_in = proto.add_layer();
-  proto_in->set_name("dummy_input");
-  proto_in->set_type(kDummy);
-  proto_in->mutable_dummy_conf()->set_input(true);
-  proto_in->mutable_dummy_conf()->add_shape(N);
-  proto_in->mutable_dummy_conf()->add_shape(M);
-  // use dummy neuron layer
-  LayerProto* proto_neuron = proto.add_layer();
-  proto_neuron->set_name("dummy_neuron");
-  proto_neuron->set_type(kDummy);
-  proto_neuron->add_srclayers("dummy_input");
-  // use dummy as output layer
-  LayerProto* proto_out = proto.add_layer();
-  proto_out->set_name("dummy_output");
-  proto_out->set_type(kDummy);
-  proto_out->set_partition_dim(1);
-  proto_out->mutable_dummy_conf()->set_output(true);
-  proto_out->add_srclayers("dummy_neuron");
-  NeuralNet::Create(proto, kTrain, K);
-}
diff --git a/src/test/test_paramslicer.cc b/src/test/test_paramslicer.cc
deleted file mode 100644
index bc7dedd..0000000
--- a/src/test/test_paramslicer.cc
+++ /dev/null
@@ -1,70 +0,0 @@
-/************************************************************
-*
-* Licensed to the Apache Software Foundation (ASF) under one
-* or more contributor license agreements.  See the NOTICE file
-* distributed with this work for additional information
-* regarding copyright ownership.  The ASF licenses this file
-* to you under the Apache License, Version 2.0 (the
-* "License"); you may not use this file except in compliance
-* with the License.  You may obtain a copy of the License at
-* 
-*   http://www.apache.org/licenses/LICENSE-2.0
-* 
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an
-* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-* KIND, either express or implied.  See the License for the
-* specific language governing permissions and limitations
-* under the License.
-*
-*************************************************************/
-
-#include "singa/utils/param.h"
-#include "gtest/gtest.h"
-
-
-using namespace singa;
-
-const int param_size[] = {2400, 32, 25600, 32, 51200, 64, 57600, 10};
-
-/*
-class ParamSlicerTest : public ::testing::Test {
-  public:
-    ParamSlicerTest() {
-      ParamProto proto;
-      int nparams=sizeof(param_size)/sizeof(int);
-      for(int i=0;i<nparams;i++){
-        vector<int> shape{param_size[i]};
-        auto param=std::make_shared<Param>();
-        param->Setup(proto, shape);
-        param->set_id(i);
-        params.push_back(param);
-      }
-    }
-  protected:
-    vector<shared_ptr<Param>> params;
-};
-
-// all params are stored in one box, no need to split
-TEST_F(ParamSlicerTest, OneBox){
-  int nparams=sizeof(param_size)/sizeof(int);
-  ParamSlicer slicer;
-  int num=1;
-  auto slices=slicer.Slice(num, params);
-  ASSERT_EQ(slices.size(),nparams);
-  ASSERT_EQ(slicer.Get(1).size(),1);
-  ASSERT_EQ(slicer.Get(2).size(),1);
-  ASSERT_EQ(slicer.Get(nparams-1).back(), slices.size()-1);
-}
-
-// there are multiple boxes
-TEST_F(ParamSlicerTest, MultipleBox){
-  int nparams=sizeof(param_size)/sizeof(int);
-  ParamSlicer slicer;
-  int num=4;
-  auto slices=slicer.Slice(num, params);
-  ASSERT_EQ(slicer.Get(1).size(),1);
-  ASSERT_EQ(slicer.Get(3).size(),1);
-  ASSERT_EQ(slicer.Get(nparams-1).back(), slices.size()-1);
-}
-*/
diff --git a/src/test/test_record_input_layer.cc b/src/test/test_record_input_layer.cc
deleted file mode 100644
index 64e1ad4..0000000
--- a/src/test/test_record_input_layer.cc
+++ /dev/null
@@ -1,122 +0,0 @@
-/************************************************************
-*
-* Licensed to the Apache Software Foundation (ASF) under one
-* or more contributor license agreements.  See the NOTICE file
-* distributed with this work for additional information
-* regarding copyright ownership.  The ASF licenses this file
-* to you under the Apache License, Version 2.0 (the
-* "License"); you may not use this file except in compliance
-* with the License.  You may obtain a copy of the License at
-*
-*   http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an
-* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-* KIND, either express or implied.  See the License for the
-* specific language governing permissions and limitations
-* under the License.
-*
-*************************************************************/
-#include <string>
-#include <vector>
-
-#include "gtest/gtest.h"
-#include "singa/neuralnet/input_layer.h"
-#include "singa/proto/job.pb.h"
-#include "singa/proto/common.pb.h"
-
-class RecordInputLayerTest : public ::testing::Test {
- protected:
-  virtual void SetUp() {
-    std::string path ="src/test/test.bin";
-    auto* store = singa::io::CreateStore("kvfile");
-    store->Open(path, singa::io::kCreate);
-    {
-    singa::RecordProto image;
-    image.add_data(3.2);
-    image.add_data(1);
-    image.add_data(14.1);
-    image.set_label(12);
-    std::string val;
-    image.SerializeToString(&val);
-    store->Write("0", val);
-    }
-
-    {
-    singa::SingleLabelImageRecord image;
-    image.add_data(0.2);
-    image.add_data(0);
-    image.add_data(1.1);
-    image.set_label(2);
-    std::string val;
-    image.SerializeToString(&val);
-    store->Write("1", val);
-    }
-
-    {
-    singa::SingleLabelImageRecord image;
-    image.add_data(2.2);
-    image.add_data(1);
-    image.add_data(4.1);
-    image.set_label(1);
-    std::string val;
-    image.SerializeToString(&val);
-    store->Write("2", val);
-    }
-    store->Flush();
-    store->Close();
-
-    auto conf = image_conf.mutable_store_conf();
-    conf->set_path(path);
-    conf->add_batchsize(2);
-    conf->add_shape(3);
-    conf->set_backend("kvfile");
-  }
-  singa::LayerProto image_conf;
-};
-
-TEST_F(RecordInputLayerTest, Setup) {
-  singa::RecordInputLayer layer;
-  layer.Setup(image_conf, std::vector<singa::Layer*>{});
-  EXPECT_EQ(2, static_cast<int>(layer.aux_data().size()));
-  EXPECT_EQ(6, layer.data(nullptr).count());
-}
-
-TEST_F(RecordInputLayerTest, ComputeFeature) {
-  singa::RecordInputLayer image;
-  image.Setup(image_conf, std::vector<singa::Layer*>{});
-  image.ComputeFeature(singa::kTrain, std::vector<singa::Layer*>{});
-
-  EXPECT_EQ(12, image.aux_data()[0]);
-  EXPECT_EQ(2, image.aux_data()[1]);
-  auto data = image.data(nullptr);
-  EXPECT_EQ(3.2f, data.cpu_data()[0]);
-  EXPECT_EQ(14.1f, data.cpu_data()[2]);
-  EXPECT_EQ(0.2f, data.cpu_data()[3]);
-  EXPECT_EQ(1.1f, data.cpu_data()[5]);
-}
-TEST_F(RecordInputLayerTest, ComputeFeatureDeploy) {
-  singa::RecordInputLayer image;
-  image.Setup(image_conf, std::vector<singa::Layer*>{});
-  image.ComputeFeature(singa::kDeploy, std::vector<singa::Layer*>{});
-
-  auto data = image.data(nullptr);
-  EXPECT_EQ(3.2f, data.cpu_data()[0]);
-  EXPECT_EQ(14.1f, data.cpu_data()[2]);
-  EXPECT_EQ(0.2f, data.cpu_data()[3]);
-  EXPECT_EQ(1.1f, data.cpu_data()[5]);
-}
-
-TEST_F(RecordInputLayerTest, SeekToFirst) {
-  singa::RecordInputLayer image;
-  image.Setup(image_conf, std::vector<singa::Layer*>{});
-  image.ComputeFeature(singa::kTrain, std::vector<singa::Layer*>{});
-  image.ComputeFeature(singa::kTrain, std::vector<singa::Layer*>{});
-
-  auto data = image.data(nullptr);
-  EXPECT_EQ(2.2f, data.cpu_data()[0]);
-  EXPECT_EQ(4.1f, data.cpu_data()[2]);
-  EXPECT_EQ(3.2f, data.cpu_data()[3]);
-  EXPECT_EQ(14.1f, data.cpu_data()[5]);
-}
diff --git a/src/test/test_store.cc b/src/test/test_store.cc
deleted file mode 100644
index d8a8904..0000000
--- a/src/test/test_store.cc
+++ /dev/null
@@ -1,92 +0,0 @@
-/************************************************************
-*
-* Licensed to the Apache Software Foundation (ASF) under one
-* or more contributor license agreements.  See the NOTICE file
-* distributed with this work for additional information
-* regarding copyright ownership.  The ASF licenses this file
-* to you under the Apache License, Version 2.0 (the
-* "License"); you may not use this file except in compliance
-* with the License.  You may obtain a copy of the License at
-*
-*   http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an
-* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-* KIND, either express or implied.  See the License for the
-* specific language governing permissions and limitations
-* under the License.
-*
-*************************************************************/
-#include <string>
-#include "gtest/gtest.h"
-#include "singa/io/store.h"
-
-TEST(TextFileStore, Open) {
-  auto store = singa::io::CreateStore("textfile");
-  EXPECT_EQ(store->Open("src/test/store.txt", singa::io::kCreate), true);
-  store->Close();
-  EXPECT_EQ(store->Open("src/test/store.txt", singa::io::kRead), true);
-  store->Close();
-}
-
-TEST(TextFileStore, Write) {
-  auto store = singa::io::CreateStore("textfile");
-  store->Open("src/test/store.txt", singa::io::kCreate);
-  store->Write("001", "first tuple");
-  store->Write("002", "second tuple");
-  store->Flush();
-  store->Write("003", "third tuple");
-  store->Close();
-}
-
-TEST(TextFileStore, Read) {
-  auto store = singa::io::CreateStore("textfile");
-  EXPECT_EQ(store->Open("src/test/store.txt", singa::io::kRead), true);
-  std::string key, value;
-  EXPECT_EQ(store->Read(&key, &value), true);
-  EXPECT_EQ(key, "0");
-  EXPECT_EQ(value, "first tuple");
-
-  EXPECT_EQ(store->Read(&key, &value), true);
-  EXPECT_EQ(store->Read(&key, &value), true);
-  EXPECT_EQ(store->Read(&key, &value), false);
-  store->SeekToFirst();
-
-  EXPECT_EQ(store->Read(&key, &value), true);
-  EXPECT_EQ(key, "0");
-  EXPECT_EQ(value, "first tuple");
-}
-TEST(KVFileStore, Open) {
-  auto store = singa::io::CreateStore("kvfile");
-  EXPECT_EQ(store->Open("src/test/store.bin", singa::io::kCreate), true);
-  store->Close();
-  EXPECT_EQ(store->Open("src/test/store.bin", singa::io::kRead), true);
-  store->Close();
-}
-TEST(KVFileStore, Write) {
-  auto store = singa::io::CreateStore("kvfile");
-  store->Open("src/test/store.bin", singa::io::kCreate);
-  store->Write("001", "first tuple");
-  store->Write("002", "second tuple");
-  store->Flush();
-  store->Write("003", "third tuple");
-  store->Close();
-}
-TEST(KVFileStore, Read) {
-  auto store = singa::io::CreateStore("kvfile");
-  store->Open("src/test/store.bin", singa::io::kRead);
-  std::string key, value;
-  EXPECT_EQ(store->Read(&key, &value), true);
-  EXPECT_EQ(key, "001");
-  EXPECT_EQ(value, "first tuple");
-
-  EXPECT_EQ(store->Read(&key, &value), true);
-  EXPECT_EQ(store->Read(&key, &value), true);
-  EXPECT_EQ(store->Read(&key, &value), false);
-  store->SeekToFirst();
-
-  EXPECT_EQ(store->Read(&key, &value), true);
-  EXPECT_EQ(key, "001");
-  EXPECT_EQ(value, "first tuple");
-}
diff --git a/src/test/test_unrolling.cc b/src/test/test_unrolling.cc
deleted file mode 100644
index 7965882..0000000
--- a/src/test/test_unrolling.cc
+++ /dev/null
@@ -1,373 +0,0 @@
-/************************************************************
- *
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- *
- *************************************************************/
-#include <string>
-#include <vector>
-#include <fstream>
-#include <iostream>
-using namespace std;
-
-#include "gtest/gtest.h"
-#include "singa/neuralnet/input_layer.h"
-#include "singa/neuralnet/neuron_layer.h"
-#include "singa/neuralnet/neuralnet.h"
-#include "singa/neuralnet/connection_layer.h"
-#include "singa/driver.h"
-#include "singa/proto/job.pb.h"
-#include "singa/utils/common.h"
-
-using namespace singa;
-
-class UnrollingTest: public ::testing::Test {
-protected:
-	virtual void SetUp() {
-		NetProto* net_conf1 = job_conf1.mutable_neuralnet();
-
-		LayerProto* data_layer1 = net_conf1->add_layer();
-		data_layer1->set_name("data");
-		data_layer1->set_type(kRecordInput);
-
-		LayerProto* embedding_layer1 = net_conf1->add_layer();
-		embedding_layer1->set_name("embedding");
-		embedding_layer1->set_type(kDummy);
-		embedding_layer1->add_srclayers("data");
-		embedding_layer1->set_unroll_len(3);
-		embedding_layer1->add_unroll_conn_type(kUnrollOneToAll);
-
-		LayerProto* gru_layer1 = net_conf1->add_layer();
-		gru_layer1->set_name("gru");
-		gru_layer1->set_type(kGRU);
-		gru_layer1->add_srclayers("embedding");
-		gru_layer1->mutable_gru_conf()->set_dim_hidden(20);
-		gru_layer1->add_param()->set_name("w_z_hx");
-		gru_layer1->add_param()->set_name("w_r_hx");
-		gru_layer1->add_param()->set_name("w_c_hx");
-		gru_layer1->add_param()->set_name("w_z_hh");
-		gru_layer1->add_param()->set_name("w_r_hh");
-		gru_layer1->add_param()->set_name("w_c_hh");
-		gru_layer1->set_unroll_len(3);
-		gru_layer1->add_unroll_conn_type(kUnrollOneToOne);
-
-		LayerProto* out_layer1 = net_conf1->add_layer();
-		out_layer1->set_name("out");
-		out_layer1->set_type(kInnerProduct);
-		out_layer1->add_srclayers("gru");
-		out_layer1->mutable_innerproduct_conf()->set_num_output(100);
-		out_layer1->add_param()->set_name("w");
-		out_layer1->add_param()->set_name("b");
-		out_layer1->set_unroll_len(3);
-		out_layer1->add_unroll_conn_type(kUnrollOneToOne);
-
-		LayerProto* loss_layer1 = net_conf1->add_layer();
-		loss_layer1->set_name("loss");
-		loss_layer1->set_type(kSoftmaxLoss);
-		loss_layer1->add_srclayers("out");
-		loss_layer1->add_srclayers("data");
-		loss_layer1->set_unroll_len(3);
-		loss_layer1->add_unroll_conn_type(kUnrollOneToOne);
-		loss_layer1->add_unroll_conn_type(kUnrollOneToAll);
-
-		/*
-		 * Initialize job conf 2
-		NetProto* net_conf2 = job_conf2.mutable_neuralnet();
-
-		LayerProto* data_layer2 = net_conf2->add_layer();
-		data_layer2->set_name("data");
-		data_layer2->set_type(kRecordInput);
-
-		LayerProto* embedding_layer2 = net_conf2->add_layer();
-		embedding_layer2->set_name("embedding");
-		embedding_layer2->set_type(kDummy);
-		embedding_layer2->add_srclayers("data");
-		embedding_layer2->add_srclayers("softmax");
-		embedding_layer2->set_unroll_len(3);
-		embedding_layer2->add_unroll_conn_type(kUnrollOneToAll);
-		embedding_layer2->add_shift(0);
-		embedding_layer2->add_unroll_conn_type(kUnrollOneToOne);
-		embedding_layer2->add_shift(1);
-
-		LayerProto* gru_layer2 = net_conf2->add_layer();
-		gru_layer2->set_name("gru");
-		gru_layer2->set_type(kGRU);
-		gru_layer2->add_srclayers("embedding");
-		gru_layer2->mutable_gru_conf()->set_dim_hidden(20);
-		gru_layer2->mutable_gru_conf()->set_bias_term(false);
-		gru_layer2->add_param()->set_name("w_z_hx");
-		gru_layer2->add_param()->set_name("w_r_hx");
-		gru_layer2->add_param()->set_name("w_c_hx");
-		gru_layer2->add_param()->set_name("w_z_hh");
-		gru_layer2->add_param()->set_name("w_r_hh");
-		gru_layer2->add_param()->set_name("w_c_hh");
-		gru_layer2->set_unroll_len(3);
-		gru_layer2->add_unroll_conn_type(kUnrollOneToOne);
-		gru_layer2->add_shift(0);
-
-		LayerProto* out_layer2 = net_conf2->add_layer();
-		out_layer2->set_name("out");
-		out_layer2->set_type(kInnerProduct);
-		out_layer2->add_srclayers("gru");
-		out_layer2->mutable_innerproduct_conf()->set_num_output(100);
-		out_layer2->add_param()->set_name("w");
-		out_layer2->add_param()->set_name("b");
-		out_layer2->set_unroll_len(3);
-		out_layer2->add_unroll_conn_type(kUnrollOneToOne);
-		out_layer2->add_shift(0);
-
-		LayerProto* softmax_layer2 = net_conf2->add_layer();
-		softmax_layer2->set_name("softmax");
-		softmax_layer2->set_type(kSoftmax);
-		softmax_layer2->add_srclayers("out");
-		softmax_layer2->set_unroll_len(3);
-		softmax_layer2->add_unroll_conn_type(kUnrollOneToOne);
-		softmax_layer2->add_shift(0);
-
-		LayerProto* loss_layer2 = net_conf2->add_layer();
-		loss_layer2->set_name("loss");
-		loss_layer2->set_type(kSoftmaxLoss);
-		loss_layer2->add_srclayers("softmax");
-		loss_layer2->add_srclayers("data");
-		loss_layer2->set_unroll_len(3);
-		loss_layer2->add_unroll_conn_type(kUnrollOneToOne);
-		loss_layer2->add_shift(0);
-		loss_layer2->add_unroll_conn_type(kUnrollOneToAll);
-		loss_layer2->add_shift(0);
-		 */
-	}
-
-	singa::JobProto job_conf1;
-	singa::JobProto job_conf2;
-};
-
-TEST_F(UnrollingTest, GRULanguageModelTrain) {
-	NetProto net;
-	net.CopyFrom(job_conf1.neuralnet());
-	NetProto unrolled_net = NeuralNet::Unrolling(net);
-	EXPECT_EQ("0#data", unrolled_net.layer(0).name());
-
-	EXPECT_EQ("0#embedding", unrolled_net.layer(1).name());
-	EXPECT_EQ(1, unrolled_net.layer(1).srclayers_size());
-	EXPECT_EQ("0#data", unrolled_net.layer(1).srclayers(0));
-
-	EXPECT_EQ("1#embedding", unrolled_net.layer(2).name());
-	EXPECT_EQ(1, unrolled_net.layer(2).srclayers_size());
-	EXPECT_EQ("0#data", unrolled_net.layer(2).srclayers(0));
-
-	EXPECT_EQ("2#embedding", unrolled_net.layer(3).name());
-	EXPECT_EQ(1, unrolled_net.layer(3).srclayers_size());
-	EXPECT_EQ("0#data", unrolled_net.layer(3).srclayers(0));
-
-	EXPECT_EQ("0#gru", unrolled_net.layer(4).name());
-	EXPECT_EQ(1, unrolled_net.layer(4).srclayers_size());
-	EXPECT_EQ("0#embedding", unrolled_net.layer(4).srclayers(0));
-	EXPECT_EQ("0#w_z_hx", unrolled_net.layer(4).param(0).name());
-	EXPECT_EQ("0#w_r_hx", unrolled_net.layer(4).param(1).name());
-	EXPECT_EQ("0#w_c_hx", unrolled_net.layer(4).param(2).name());
-	EXPECT_EQ("0#w_z_hh", unrolled_net.layer(4).param(3).name());
-	EXPECT_EQ("0#w_r_hh", unrolled_net.layer(4).param(4).name());
-	EXPECT_EQ("0#w_c_hh", unrolled_net.layer(4).param(5).name());
-
-	EXPECT_EQ("1#gru", unrolled_net.layer(5).name());
-	EXPECT_EQ(2, unrolled_net.layer(5).srclayers_size());
-	EXPECT_EQ("1#embedding", unrolled_net.layer(5).srclayers(0));
-	EXPECT_EQ("0#gru", unrolled_net.layer(5).srclayers(1));
-	EXPECT_EQ("1#w_z_hx", unrolled_net.layer(5).param(0).name());
-	EXPECT_EQ("0#w_z_hx", unrolled_net.layer(5).param(0).share_from());
-	EXPECT_EQ("1#w_r_hx", unrolled_net.layer(5).param(1).name());
-	EXPECT_EQ("0#w_r_hx", unrolled_net.layer(5).param(1).share_from());
-	EXPECT_EQ("1#w_c_hx", unrolled_net.layer(5).param(2).name());
-	EXPECT_EQ("0#w_c_hx", unrolled_net.layer(5).param(2).share_from());
-	EXPECT_EQ("1#w_z_hh", unrolled_net.layer(5).param(3).name());
-	EXPECT_EQ("0#w_z_hh", unrolled_net.layer(5).param(3).share_from());
-	EXPECT_EQ("1#w_r_hh", unrolled_net.layer(5).param(4).name());
-	EXPECT_EQ("0#w_r_hh", unrolled_net.layer(5).param(4).share_from());
-	EXPECT_EQ("1#w_c_hh", unrolled_net.layer(5).param(5).name());
-	EXPECT_EQ("0#w_c_hh", unrolled_net.layer(5).param(5).share_from());
-
-	EXPECT_EQ("2#gru", unrolled_net.layer(6).name());
-	EXPECT_EQ(2, unrolled_net.layer(6).srclayers_size());
-	EXPECT_EQ("2#embedding", unrolled_net.layer(6).srclayers(0));
-	EXPECT_EQ("1#gru", unrolled_net.layer(6).srclayers(1));
-	EXPECT_EQ("2#w_z_hx", unrolled_net.layer(6).param(0).name());
-	EXPECT_EQ("0#w_z_hx", unrolled_net.layer(6).param(0).share_from());
-	EXPECT_EQ("2#w_r_hx", unrolled_net.layer(6).param(1).name());
-	EXPECT_EQ("0#w_r_hx", unrolled_net.layer(6).param(1).share_from());
-	EXPECT_EQ("2#w_c_hx", unrolled_net.layer(6).param(2).name());
-	EXPECT_EQ("0#w_c_hx", unrolled_net.layer(6).param(2).share_from());
-	EXPECT_EQ("2#w_z_hh", unrolled_net.layer(6).param(3).name());
-	EXPECT_EQ("0#w_z_hh", unrolled_net.layer(6).param(3).share_from());
-	EXPECT_EQ("2#w_r_hh", unrolled_net.layer(6).param(4).name());
-	EXPECT_EQ("0#w_r_hh", unrolled_net.layer(6).param(4).share_from());
-	EXPECT_EQ("2#w_c_hh", unrolled_net.layer(6).param(5).name());
-	EXPECT_EQ("0#w_c_hh", unrolled_net.layer(6).param(5).share_from());
-
-	EXPECT_EQ("0#out", unrolled_net.layer(7).name());
-	EXPECT_EQ(1, unrolled_net.layer(7).srclayers_size());
-	EXPECT_EQ("0#gru", unrolled_net.layer(7).srclayers(0));
-	EXPECT_EQ("0#w", unrolled_net.layer(7).param(0).name());
-	EXPECT_EQ("0#b", unrolled_net.layer(7).param(1).name());
-
-	EXPECT_EQ("1#out", unrolled_net.layer(8).name());
-	EXPECT_EQ(1, unrolled_net.layer(8).srclayers_size());
-	EXPECT_EQ("1#gru", unrolled_net.layer(8).srclayers(0));
-	EXPECT_EQ("1#w", unrolled_net.layer(8).param(0).name());
-	EXPECT_EQ("0#w", unrolled_net.layer(8).param(0).share_from());
-	EXPECT_EQ("1#b", unrolled_net.layer(8).param(1).name());
-	EXPECT_EQ("0#b", unrolled_net.layer(8).param(1).share_from());
-
-	EXPECT_EQ("2#out", unrolled_net.layer(9).name());
-	EXPECT_EQ(1, unrolled_net.layer(9).srclayers_size());
-	EXPECT_EQ("2#gru", unrolled_net.layer(9).srclayers(0));
-	EXPECT_EQ("2#w", unrolled_net.layer(9).param(0).name());
-	EXPECT_EQ("0#w", unrolled_net.layer(9).param(0).share_from());
-	EXPECT_EQ("2#b", unrolled_net.layer(9).param(1).name());
-	EXPECT_EQ("0#b", unrolled_net.layer(9).param(1).share_from());
-
-	EXPECT_EQ("0#loss", unrolled_net.layer(10).name());
-	EXPECT_EQ(2, unrolled_net.layer(10).srclayers_size());
-	EXPECT_EQ("0#out", unrolled_net.layer(10).srclayers(0));
-	EXPECT_EQ("0#data", unrolled_net.layer(10).srclayers(1));
-
-	EXPECT_EQ("1#loss", unrolled_net.layer(11).name());
-	EXPECT_EQ(2, unrolled_net.layer(11).srclayers_size());
-	EXPECT_EQ("1#out", unrolled_net.layer(11).srclayers(0));
-	EXPECT_EQ("0#data", unrolled_net.layer(11).srclayers(1));
-
-	EXPECT_EQ("2#loss", unrolled_net.layer(12).name());
-	EXPECT_EQ(2, unrolled_net.layer(12).srclayers_size());
-	EXPECT_EQ("2#out", unrolled_net.layer(12).srclayers(0));
-	EXPECT_EQ("0#data", unrolled_net.layer(12).srclayers(1));
-}
-
-/*
-TEST_F(UnrollingTest, GRULanguageModelTest) {
-	NetProto net;
-	net.CopyFrom(job_conf2.neuralnet());
-	NetProto unrolled_net = NeuralNet::Unrolling(net);
-
-	EXPECT_EQ("data", unrolled_net.layer(0).name());
-
-	EXPECT_EQ("0#embedding", unrolled_net.layer(1).name());
-	EXPECT_EQ(1, unrolled_net.layer(1).srclayers_size());
-	EXPECT_EQ("data", unrolled_net.layer(1).srclayers(0));
-
-	EXPECT_EQ("1#embedding", unrolled_net.layer(2).name());
-	EXPECT_EQ(2, unrolled_net.layer(2).srclayers_size());
-	EXPECT_EQ("data", unrolled_net.layer(2).srclayers(0));
-	EXPECT_EQ("0#softmax", unrolled_net.layer(2).srclayers(1));
-
-	EXPECT_EQ("2#embedding", unrolled_net.layer(3).name());
-	EXPECT_EQ(2, unrolled_net.layer(3).srclayers_size());
-	EXPECT_EQ("data", unrolled_net.layer(3).srclayers(0));
-	EXPECT_EQ("1#softmax", unrolled_net.layer(3).srclayers(1));
-
-	EXPECT_EQ("0#gru", unrolled_net.layer(4).name());
-	EXPECT_EQ(1, unrolled_net.layer(4).srclayers_size());
-	EXPECT_EQ("0#embedding", unrolled_net.layer(4).srclayers(0));
-	EXPECT_EQ("w_z_hx", unrolled_net.layer(4).param(0).name());
-	EXPECT_EQ("w_r_hx", unrolled_net.layer(4).param(1).name());
-	EXPECT_EQ("w_c_hx", unrolled_net.layer(4).param(2).name());
-	EXPECT_EQ("w_z_hh", unrolled_net.layer(4).param(3).name());
-	EXPECT_EQ("w_r_hh", unrolled_net.layer(4).param(4).name());
-	EXPECT_EQ("w_c_hh", unrolled_net.layer(4).param(5).name());
-
-	EXPECT_EQ("1#gru", unrolled_net.layer(5).name());
-	EXPECT_EQ(2, unrolled_net.layer(5).srclayers_size());
-	EXPECT_EQ("0#gru", unrolled_net.layer(5).srclayers(0));
-	EXPECT_EQ("1#embedding", unrolled_net.layer(5).srclayers(1));
-	EXPECT_EQ("1#w_z_hx", unrolled_net.layer(5).param(0).name());
-	EXPECT_EQ("w_z_hx", unrolled_net.layer(5).param(0).share_from());
-	EXPECT_EQ("1#w_r_hx", unrolled_net.layer(5).param(1).name());
-	EXPECT_EQ("w_r_hx", unrolled_net.layer(5).param(1).share_from());
-	EXPECT_EQ("1#w_c_hx", unrolled_net.layer(5).param(2).name());
-	EXPECT_EQ("w_c_hx", unrolled_net.layer(5).param(2).share_from());
-	EXPECT_EQ("1#w_z_hh", unrolled_net.layer(5).param(3).name());
-	EXPECT_EQ("w_z_hh", unrolled_net.layer(5).param(3).share_from());
-	EXPECT_EQ("1#w_r_hh", unrolled_net.layer(5).param(4).name());
-	EXPECT_EQ("w_r_hh", unrolled_net.layer(5).param(4).share_from());
-	EXPECT_EQ("1#w_c_hh", unrolled_net.layer(5).param(5).name());
-	EXPECT_EQ("w_c_hh", unrolled_net.layer(5).param(5).share_from());
-
-	EXPECT_EQ("2#gru_2", unrolled_net.layer(6).name());
-	EXPECT_EQ(2, unrolled_net.layer(6).srclayers_size());
-	EXPECT_EQ("1#gru", unrolled_net.layer(6).srclayers(0));
-	EXPECT_EQ("2#embedding", unrolled_net.layer(6).srclayers(1));
-	EXPECT_EQ("2#w_z_hx", unrolled_net.layer(6).param(0).name());
-	EXPECT_EQ("w_z_hx", unrolled_net.layer(6).param(0).share_from());
-	EXPECT_EQ("2#w_r_hx", unrolled_net.layer(6).param(1).name());
-	EXPECT_EQ("w_r_hx", unrolled_net.layer(6).param(1).share_from());
-	EXPECT_EQ("2#w_c_hx", unrolled_net.layer(6).param(2).name());
-	EXPECT_EQ("w_c_hx", unrolled_net.layer(6).param(2).share_from());
-	EXPECT_EQ("2#w_z_hh", unrolled_net.layer(6).param(3).name());
-	EXPECT_EQ("w_z_hh", unrolled_net.layer(6).param(3).share_from());
-	EXPECT_EQ("2#w_r_hh", unrolled_net.layer(6).param(4).name());
-	EXPECT_EQ("w_r_hh", unrolled_net.layer(6).param(4).share_from());
-	EXPECT_EQ("2#w_c_hh", unrolled_net.layer(6).param(5).name());
-	EXPECT_EQ("w_c_hh", unrolled_net.layer(6).param(5).share_from());
-
-	EXPECT_EQ("out_0", unrolled_net.layer(7).name());
-	EXPECT_EQ(1, unrolled_net.layer(7).srclayers_size());
-	EXPECT_EQ("gru_0", unrolled_net.layer(7).srclayers(0));
-	EXPECT_EQ("w", unrolled_net.layer(7).param(0).name());
-	EXPECT_EQ("b", unrolled_net.layer(7).param(1).name());
-
-	EXPECT_EQ("out_1", unrolled_net.layer(8).name());
-	EXPECT_EQ(1, unrolled_net.layer(8).srclayers_size());
-	EXPECT_EQ("gru_1", unrolled_net.layer(8).srclayers(0));
-	EXPECT_EQ("w_1", unrolled_net.layer(8).param(0).name());
-	EXPECT_EQ("w", unrolled_net.layer(8).param(0).share_from());
-	EXPECT_EQ("b_1", unrolled_net.layer(8).param(1).name());
-	EXPECT_EQ("b", unrolled_net.layer(8).param(1).share_from());
-
-	EXPECT_EQ("out_2", unrolled_net.layer(9).name());
-	EXPECT_EQ(1, unrolled_net.layer(9).srclayers_size());
-	EXPECT_EQ("gru_2", unrolled_net.layer(9).srclayers(0));
-	EXPECT_EQ("w_2", unrolled_net.layer(9).param(0).name());
-	EXPECT_EQ("w", unrolled_net.layer(9).param(0).share_from());
-	EXPECT_EQ("b_2", unrolled_net.layer(9).param(1).name());
-	EXPECT_EQ("b", unrolled_net.layer(9).param(1).share_from());
-
-	EXPECT_EQ("softmax_0", unrolled_net.layer(10).name());
-	EXPECT_EQ(1, unrolled_net.layer(10).srclayers_size());
-	EXPECT_EQ("out_0", unrolled_net.layer(10).srclayers(0));
-
-	EXPECT_EQ("softmax_1", unrolled_net.layer(11).name());
-	EXPECT_EQ(1, unrolled_net.layer(11).srclayers_size());
-	EXPECT_EQ("out_1", unrolled_net.layer(11).srclayers(0));
-
-	EXPECT_EQ("softmax_2", unrolled_net.layer(12).name());
-	EXPECT_EQ(1, unrolled_net.layer(12).srclayers_size());
-	EXPECT_EQ("out_2", unrolled_net.layer(12).srclayers(0));
-
-	EXPECT_EQ("loss_0", unrolled_net.layer(13).name());
-	EXPECT_EQ(2, unrolled_net.layer(13).srclayers_size());
-	EXPECT_EQ("softmax_0", unrolled_net.layer(13).srclayers(0));
-	EXPECT_EQ("data", unrolled_net.layer(13).srclayers(1));
-
-	EXPECT_EQ("loss_1", unrolled_net.layer(14).name());
-	EXPECT_EQ(2, unrolled_net.layer(14).srclayers_size());
-	EXPECT_EQ("softmax_1", unrolled_net.layer(14).srclayers(0));
-	EXPECT_EQ("data", unrolled_net.layer(14).srclayers(1));
-
-	EXPECT_EQ("loss_2", unrolled_net.layer(15).name());
-	EXPECT_EQ(2, unrolled_net.layer(15).srclayers_size());
-	EXPECT_EQ("softmax_2", unrolled_net.layer(15).srclayers(0));
-	EXPECT_EQ("data", unrolled_net.layer(15).srclayers(1));
-}
-  */
diff --git a/src/utils/blob.cc b/src/utils/blob.cc
deleted file mode 100644
index bfc36e6..0000000
--- a/src/utils/blob.cc
+++ /dev/null
@@ -1,259 +0,0 @@
-/************************************************************
-*
-* Licensed to the Apache Software Foundation (ASF) under one
-* or more contributor license agreements.  See the NOTICE file
-* distributed with this work for additional information
-* regarding copyright ownership.  The ASF licenses this file
-* to you under the Apache License, Version 2.0 (the
-* "License"); you may not use this file except in compliance
-* with the License.  You may obtain a copy of the License at
-*
-*   http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an
-* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-* KIND, either express or implied.  See the License for the
-* specific language governing permissions and limitations
-* under the License.
-*
-*************************************************************/
-
-/**
- * The code is adapted from Caffe under BSD 2 Clause license.
- *
- * COPYRIGHT
- * All contributions by the University of California:
- * Copyright (c) 2014, The Regents of the University of California (Regents)
- * All rights reserved.
- * All other contributions:
- * Copyright (c) 2014, the respective contributors
- * All rights reserved.
- */
-#include "singa/utils/blob.h"
-
-#include <cblas.h>
-#include <math.h>
-#include <utility>
-
-#define NOT_IMPLEMENTED LOG(FATAL) << "Not implemented function"
-#define NO_GPU LOG(FATAL) << "CPU-only Mode: cannot make GPU call."
-// Instantiate a class with float and double specifications.
-#define INSTANTIATE_CLASS(classname) \
-  template class classname<float>; \
-  template class classname<double>
-// Disable the copy and assignment operator for a class.
-#define DISABLE_COPY_AND_ASSIGN(classname) \
-private:\
-  classname(const classname&);\
-  classname& operator=(const classname&)
-
-#ifndef CPU_ONLY
-#include "singa/utils/cuda_utils.h"
-#endif  // CPU_ONLY
-
-namespace singa {
-
-SyncedMemory::~SyncedMemory() {
-  if (cpu_ptr_ && own_cpu_data_) {
-    FreeHost(cpu_ptr_);
-  }
-#ifndef CPU_ONLY
-  if (gpu_ptr_) {
-    CUDA_CHECK(cudaFree(gpu_ptr_));
-  }
-#endif  // CPU_ONLY
-}
-
-const void* SyncedMemory::cpu_data() {
-  to_cpu();
-  return cpu_ptr_;
-}
-
-const void* SyncedMemory::gpu_data() {
-#ifndef CPU_ONLY
-  to_gpu();
-  return gpu_ptr_;
-#else
-  NO_GPU;
-#endif
-  return nullptr;
-}
-
-void* SyncedMemory::mutable_cpu_data() {
-  to_cpu();
-  head_ = HEAD_AT_CPU;
-  return cpu_ptr_;
-}
-
-void* SyncedMemory::mutable_gpu_data() {
-#ifndef CPU_ONLY
-  to_gpu();
-  head_ = HEAD_AT_GPU;
-  return gpu_ptr_;
-#else
-  NO_GPU;
-#endif
-  return nullptr;
-}
-
-void SyncedMemory::set_cpu_data(void* data) {
-  CHECK(data);
-  if (own_cpu_data_) {
-    FreeHost(cpu_ptr_);
-  }
-  cpu_ptr_ = data;
-  head_ = HEAD_AT_CPU;
-  own_cpu_data_ = false;
-}
-
-void SyncedMemory::to_cpu() {
-  switch (head_) {
-  case UNINITIALIZED:
-    MallocHost(&cpu_ptr_, size_);
-    memset(cpu_ptr_, 0, size_);
-    head_ = HEAD_AT_CPU;
-    own_cpu_data_ = true;
-    break;
-  case HEAD_AT_GPU:
-#ifndef CPU_ONLY
-    if (cpu_ptr_ == NULL) {
-      MallocHost(&cpu_ptr_, size_);
-      own_cpu_data_ = true;
-    }
-    CUDA_CHECK(cudaMemcpy(cpu_ptr_, gpu_ptr_, size_, cudaMemcpyDefault));
-    head_ = SYNCED;
-#else
-    NO_GPU;
-#endif
-    break;
-  case HEAD_AT_CPU:
-  case SYNCED:
-    break;
-  }
-}
-
-void SyncedMemory::to_gpu() {
-#ifndef CPU_ONLY
-  switch (head_) {
-  case UNINITIALIZED:
-    CUDA_CHECK(cudaMalloc(&gpu_ptr_, size_));
-    CUDA_CHECK(cudaMemset(gpu_ptr_, 0, size_));
-    head_ = HEAD_AT_GPU;
-    break;
-  case HEAD_AT_CPU:
-    if (gpu_ptr_ == NULL) {
-      CUDA_CHECK(cudaMalloc(&gpu_ptr_, size_));
-    }
-    CUDA_CHECK(cudaMemcpy(gpu_ptr_, cpu_ptr_, size_, cudaMemcpyDefault));
-    head_ = SYNCED;
-    break;
-  case HEAD_AT_GPU:
-  case SYNCED:
-    break;
-  }
-#else
-  NO_GPU;
-#endif
-}
-
-template <typename Dtype>
-void Blob<Dtype>::Reshape(const std::vector<int>& shape) {
-  shape_ = shape;
-  count_ = shape.size() ? 1 : 0;
-  for (size_t i = 0; i < shape.size(); ++i) {
-    CHECK(shape[i]);
-    count_ *= shape[i];
-  }
-  if (count_ > capacity_) {
-    capacity_ = count_;
-    data_.reset(new SyncedMemory(capacity_ * sizeof(Dtype)));
-  }
-}
-
-template <typename Dtype>
-void Blob<Dtype>::ReshapeLike(const Blob<Dtype>& other) {
-  Reshape(other.shape());
-}
-
-template <typename Dtype>
-void Blob<Dtype>::CopyFrom(const Blob& source) {
-    CopyFrom(source, false);
-}
-
-template <typename Dtype>
-void Blob<Dtype>::CopyFrom(const Blob& source, bool shape_check) {
-  LOG(WARNING) << "Better use Copy(const Blob&, Blob*)";
-  CHECK_EQ(source.count(), count()) << " cp between blobs of diff size";
-
-  if (shape_check &&
-      !std::equal(shape_.begin(), shape_.end(), source.shape_.begin())) {
-      LOG(FATAL) << "Trying to copy blobs of different sizes.";
-  }
-#ifndef CPU_ONLY
-  CUDA_CHECK(cudaMemcpy(static_cast<Dtype*>(data_->mutable_gpu_data()),
-             source.gpu_data(), sizeof(Dtype) * count_, cudaMemcpyDefault));
-#endif
-  memcpy(static_cast<Dtype*>(data_->mutable_cpu_data()), source.cpu_data(),
-         sizeof(Dtype)*count_);
-}
-
-template <typename Dtype>
-void Blob<Dtype>::FromProto(const singa::BlobProto& proto) {
-  std::vector<int> shape;
-  for (int s : proto.shape()) {
-    shape.push_back(s);
-  }
-  int count = count_;
-  Reshape(shape);
-  if (count != count_)
-    LOG(WARNING) << "Blob is reshaped to diff size " << count << ":" << count_;
-  // copy data
-  Dtype* data_vec = mutable_cpu_data();
-  for (int i = 0; i < count_; ++i) {
-    data_vec[i] = proto.data(i);
-  }
-}
-
-template <typename Dtype>
-void Blob<Dtype>::ToProto(singa::BlobProto* proto) const {
-  for (int s : shape_) {
-    proto->add_shape(s);
-  }
-  proto->clear_data();
-  const Dtype* data_vec = cpu_data();
-  for (int i = 0; i < count_; ++i) {
-    proto->add_data(data_vec[i]);
-  }
-}
-
-template <typename Dtype>
-void Blob<Dtype>::SetValue(Dtype v) {
-  Dtype* ptr = mutable_cpu_data();
-  for (int i =0; i < count(); i++)
-    ptr[i] = v;
-}
-template <typename Dtype>
-void Blob<Dtype>::ShareData(Blob* other, bool cpu_only) {
-  CHECK_EQ(count_, other->count());
-  if (cpu_only)
-    data_->set_cpu_data(other->mutable_cpu_data());
-  else
-    data_ = other->data_;
-}
-
-/*
-template <typename Dtype>
-void Blob<Dtype>::Swap(Blob& other) {
-  CHECK_EQ(other.count(), count());
-  CHECK(std::equal(shape_.begin(), shape_.end(), other.shape_.begin()));
-  std::swap(data_, other.data_);
-  std::swap(capacity_, other.capacity_);
-}
-*/
-
-INSTANTIATE_CLASS(Blob);
-template class Blob<int>;
-template class Blob<unsigned int>;
-
-}  // namespace singa
diff --git a/src/utils/channel.cc b/src/utils/channel.cc
new file mode 100644
index 0000000..588a11a
--- /dev/null
+++ b/src/utils/channel.cc
@@ -0,0 +1,104 @@
+/************************************************************
+*
+* Licensed to the Apache Software Foundation (ASF) under one
+* or more contributor license agreements.  See the NOTICE file
+* distributed with this work for additional information
+* regarding copyright ownership.  The ASF licenses this file
+* to you under the Apache License, Version 2.0 (the
+* "License"); you may not use this file except in compliance
+* with the License.  You may obtain a copy of the License at
+*
+*   http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing,
+* software distributed under the License is distributed on an
+* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+* KIND, either express or implied.  See the License for the
+* specific language governing permissions and limitations
+* under the License.
+*
+*************************************************************/
+
+#include "singa/utils/channel.h"
+
+#include "singa/utils/logging.h"
+#include "singa/utils/singleton.h"
+
+namespace singa {
+
+ChannelManager::~ChannelManager() {
+  for (auto it : name2ptr_) {
+    if (it.second != nullptr) delete (it.second);
+  }
+}
+
+void ChannelManager::Init() {
+  // do nothing here
+}
+
+void ChannelManager::SetDefaultDir(const char* dir) {
+  if (dir != nullptr) {
+    dir_ = dir;
+    if (dir[dir_.length() - 1] != '/') dir_ += '/';
+  }
+}
+
+Channel* ChannelManager::GetInstance(const std::string& channel) {
+  // find the channel
+  if (name2ptr_.find(channel) == name2ptr_.end()) {
+    // create new channel
+    Channel* chn = new Channel(channel);
+    chn->SetDestFilePath(dir_ + channel);
+    chn->EnableDestFile(true);
+    name2ptr_[channel] = chn;
+  }
+  return name2ptr_[channel];
+}
+
+Channel::Channel(const std::string& name) { name_ = name; }
+
+Channel::~Channel() {
+  if (os_.is_open()) os_.close();
+}
+
+void Channel::SetDestFilePath(const std::string& file) {
+  // file is append only
+  if (os_.is_open()) os_.close();
+  {
+    std::ifstream fin(file.c_str());
+    if (fin.good())
+      LOG(WARNING) << "Messages will be appended to an existed file: " << file;
+  }
+  os_.open(file.c_str(), std::ios::app);
+  if (os_.is_open() == false)
+    LOG(WARNING) << "Cannot open channel file (" << file << ")";
+}
+
+void Channel::Send(const std::string& message) {
+  if (stderr_) fprintf(stderr, "%s\n", message.c_str());
+  if (file_ && os_.is_open()) os_ << message << "\n";
+  // TODO(wangwei) flush
+}
+
+void Channel::Send(const google::protobuf::Message& message) {
+  if (stderr_) fprintf(stderr, "%s\n", message.DebugString().c_str());
+  if (file_ && os_.is_open()) message.SerializeToOstream(&os_);
+  // TODO(wangwei) flush
+}
+
+void InitChannel(const char* argv) {
+  ChannelManager* mng = Singleton<ChannelManager>().Instance();
+  mng->Init();
+}
+
+void SetChannelDirectory(const char* path) {
+  ChannelManager* mng = Singleton<ChannelManager>().Instance();
+  mng->SetDefaultDir(path);
+}
+
+Channel* GetChannel(const std::string& channel_name) {
+  ChannelManager* mng = Singleton<ChannelManager>().Instance();
+  return mng->GetInstance(channel_name);
+}
+
+}  // namespace singa
diff --git a/src/utils/cluster.cc b/src/utils/cluster.cc
deleted file mode 100644
index a9928eb..0000000
--- a/src/utils/cluster.cc
+++ /dev/null
@@ -1,131 +0,0 @@
-/************************************************************
-*
-* Licensed to the Apache Software Foundation (ASF) under one
-* or more contributor license agreements.  See the NOTICE file
-* distributed with this work for additional information
-* regarding copyright ownership.  The ASF licenses this file
-* to you under the Apache License, Version 2.0 (the
-* "License"); you may not use this file except in compliance
-* with the License.  You may obtain a copy of the License at
-*
-*   http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an
-* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-* KIND, either express or implied.  See the License for the
-* specific language governing permissions and limitations
-* under the License.
-*
-*************************************************************/
-
-#include "singa/utils/cluster.h"
-
-#include <sys/stat.h>
-#include <sys/types.h>
-#include <unistd.h>
-#include <fstream>
-
-namespace singa {
-using std::vector;
-
-Cluster* Cluster::Setup(int job, const SingaProto& singaConf,
-                        const ClusterProto& clusterConf) {
-  Singleton<Cluster>::Instance()->Init(job, singaConf, clusterConf);
-  return Singleton<Cluster>::Instance();
-}
-
-Cluster* Cluster::Get() {
-  if (!Singleton<Cluster>::Instance()->nprocs_) {
-    LOG(ERROR) << "The first call to Get should "
-               << "provide the job conf path";
-  }
-  return Singleton<Cluster>::Instance();
-}
-
-void Cluster::Register(int pid, const std::string& endpoint) {
-  procs_id_ = cluster_rt_->RegistProc(endpoint, pid);
-  CHECK_GE(procs_id_, 0);
-  CHECK_LT(procs_id_, nprocs());
-  LOG(ERROR) << "proc #" << procs_id_ << " -> " << endpoint
-             << " (pid = " << pid << ")";
-}
-
-void Cluster::Init(int job, const SingaProto& singaConf,
-                   const ClusterProto& clusterConf) {
-  cluster_ = clusterConf;
-  singa_ = singaConf;
-  SetupFolders(clusterConf);
-  if (server_worker_separate())
-    nprocs_ = nworker_procs() + nserver_procs();
-  else
-    nprocs_ = std::max(nworker_procs(), nserver_procs());
-
-  // locate the process id of every worker/server
-  int ngrps = cluster_.nworker_groups();
-  int grp_size = cluster_.nworkers_per_group();
-  int procs = 0;
-  for (int i = 0; i < ngrps; ++i) {
-    for (int j = 0; j < grp_size; ++j) {
-      procs = (i * grp_size + j) / cluster_.nworkers_per_procs();
-      procs_ids_[Hash(i, j, kWorkerLayer)] = procs;
-      procs_ids_[Hash(i, j, kWorkerParam)] = procs;
-    }
-  }
-  int offset = cluster_.server_worker_separate() ? procs + 1 : 0;
-  ngrps = cluster_.nserver_groups();
-  grp_size = cluster_.nservers_per_group();
-  for (int i = 0; i < ngrps; ++i) {
-    for (int j = 0; j < grp_size; ++j) {
-      procs_ids_[Hash(i, j, kServer)] =
-          (i * grp_size + j) / cluster_.nservers_per_procs() + offset;
-    }
-  }
-  // cluster_rt_ = new ZKClusterRT(singa_.zookeeper_host(), job);
-  // cluster_rt_ = new SPClusterRT();
-  cluster_rt_ = ClusterRuntime::Create(singa_.zookeeper_host(), job);
-  cluster_rt_->Init();
-}
-
-void Cluster::SetupFolders(const ClusterProto &cluster) {
-  // create visulization folder
-  mkdir(vis_folder().c_str(),  S_IRWXU | S_IRWXG | S_IROTH | S_IXOTH);
-  // create checkpoint folder
-  mkdir(checkpoint_folder().c_str(),  S_IRWXU | S_IRWXG | S_IROTH | S_IXOTH);
-}
-
-const vector<int> Cluster::ExecutorRng(int pid, int grp_size, int procs_size) {
-  int gstart, gend, start, end;
-  if (grp_size >= procs_size) {
-    // all workers in this procs are from the same group
-    gstart = pid * procs_size / grp_size;
-    gend = gstart + 1;
-    start = pid * procs_size % grp_size;
-    end = start + procs_size;
-  } else {
-    // there are multiple (complete) groups in this procs.
-    CHECK_EQ(procs_size % grp_size, 0);
-    int groups_per_procs = procs_size / grp_size;
-    gstart = pid * groups_per_procs;
-    gend = (pid+1) * groups_per_procs;
-    start = 0;
-    end = grp_size;
-  }
-  return vector<int>{gstart, gend, start, end};
-}
-
-int Cluster::Hash(int gid, int id, int flag) {
-  int ret = -1;
-  if (flag == kServer) {
-    ret = kServer * cluster_.nworker_groups()
-      * cluster_.nworkers_per_group()
-      + (cluster_.nserver_groups() + gid)
-      * cluster_.nservers_per_group() + id;
-  } else {
-    ret = (flag * cluster_.nworker_groups() + gid)
-          * cluster_.nworkers_per_group() + id;
-  }
-  return ret;
-}
-
-}  // namespace singa
diff --git a/src/utils/cluster_rt.cc b/src/utils/cluster_rt.cc
deleted file mode 100644
index 9a7b8bd..0000000
--- a/src/utils/cluster_rt.cc
+++ /dev/null
@@ -1,110 +0,0 @@
-/************************************************************
-*
-* Licensed to the Apache Software Foundation (ASF) under one
-* or more contributor license agreements.  See the NOTICE file
-* distributed with this work for additional information
-* regarding copyright ownership.  The ASF licenses this file
-* to you under the Apache License, Version 2.0 (the
-* "License"); you may not use this file except in compliance
-* with the License.  You may obtain a copy of the License at
-*
-*   http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an
-* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-* KIND, either express or implied.  See the License for the
-* specific language governing permissions and limitations
-* under the License.
-*
-*************************************************************/
-
-#include "singa/utils/cluster_rt.h"
-
-#include <glog/logging.h>
-#include <google/protobuf/text_format.h>
-#include <stdlib.h>
-#include <algorithm>
-#include <fstream>
-#include <iostream>
-#include "singa/proto/job.pb.h"
-
-#ifdef USE_ZOOKEEPER
-#include "singa/utils/zk_service.h"
-#endif
-
-using std::string;
-using std::to_string;
-using std::vector;
-
-namespace singa {
-
-ClusterRuntime* ClusterRuntime::Create(const std::string&host, int job_id) {
-#ifdef USE_ZOOKEEPER
-  return new ZKClusterRT(host, job_id);
-#else
-  return new SPClusterRT();
-#endif
-}
-
-SPClusterRT::~SPClusterRT() {
-  // release callback vector
-  for (auto list : grp_callbacks_)
-    for (RTCallback* p : list.second) {
-    delete p;
-  }
-}
-
-bool SPClusterRT::Init() {
-  return true;
-}
-
-int SPClusterRT::RegistProc(const string& host_addr, int pid) {
-  int ret;
-  lock_.lock();
-  proc_list_.push_back(host_addr + std::to_string(pid));
-  ret = proc_list_.size()-1;
-  lock_.unlock();
-  return ret;
-}
-
-string SPClusterRT::GetProcHost(int proc_id) {
-  if (proc_list_.size() < (unsigned)proc_id + 1) return "";
-  return proc_list_[proc_id];
-}
-
-bool SPClusterRT::WatchSGroup(int gid, int sid, rt_callback fn, void* ctx) {
-  // store the callback function and context for later usage
-  RTCallback *cb = new RTCallback;
-  cb->fn = fn;
-  cb->ctx = ctx;
-  lock_.lock();
-  if (grp_callbacks_.count(gid) == 0)
-    grp_callbacks_[gid] = vector<RTCallback*>{};
-  grp_callbacks_[gid].push_back(cb);
-  lock_.unlock();
-  return true;
-}
-
-bool SPClusterRT::JoinSGroup(int gid, int wid, int s_group) {
-  lock_.lock();
-  if (grp_count_.count(gid) == 0)
-    grp_count_[gid] = 0;
-  grp_count_[gid]++;
-  lock_.unlock();
-  return true;
-}
-
-bool SPClusterRT::LeaveSGroup(int gid, int wid, int s_group) {
-  lock_.lock();
-  if (--grp_count_[gid] == 0) {
-      for (RTCallback* cb : grp_callbacks_[gid]) {
-        (*cb->fn)(cb->ctx);
-        cb->fn = nullptr;
-      }
-  }
-  lock_.unlock();
-  return true;
-}
-
-}  // namespace singa
diff --git a/src/utils/common.cc b/src/utils/common.cc
deleted file mode 100644
index bd0fee5..0000000
--- a/src/utils/common.cc
+++ /dev/null
@@ -1,574 +0,0 @@
-/************************************************************
-*
-* Licensed to the Apache Software Foundation (ASF) under one
-* or more contributor license agreements.  See the NOTICE file
-* distributed with this work for additional information
-* regarding copyright ownership.  The ASF licenses this file
-* to you under the Apache License, Version 2.0 (the
-* "License"); you may not use this file except in compliance
-* with the License.  You may obtain a copy of the License at
-*
-*   http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an
-* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-* KIND, either express or implied.  See the License for the
-* specific language governing permissions and limitations
-* under the License.
-*
-*************************************************************/
-
-/**
- * The code is adapted from Caffe under BSD 2 Clause license.
- * All contributions by the University of California:
- * Copyright (c) 2014, The Regents of the University of California (Regents)
- * All rights reserved.
- * All other contributions:
- * Copyright (c) 2014, the respective contributors
- * All rights reserved.
- * Caffe uses a shared copyright model: each contributor holds copyright over
- * their contributions to Caffe. The project versioning records all such
- * contribution and copyright details. If a contributor wants to further mark
- * their specific copyright on a particular contribution, they should indicate
- * their copyright solely in the commit message of the change when it is
- * committed.
- */
-
-#include "singa/utils/common.h"
-
-#include <sys/ioctl.h>
-#include <sys/socket.h>
-#include <sys/stat.h>
-#include <sys/types.h>
-
-#include <netinet/in.h>
-#include <net/if.h>
-#include <arpa/inet.h>
-
-#include <stdarg.h>
-#include <stdio.h>
-#include <time.h>
-#include <unistd.h>
-#include <fcntl.h>
-#include <cfloat>
-
-#include <fstream>
-
-#include <glog/logging.h>
-#include <google/protobuf/io/coded_stream.h>
-#include <google/protobuf/io/zero_copy_stream_impl.h>
-#include <google/protobuf/text_format.h>
-
-namespace singa {
-
-const int kBufLen = 1024;
-
-string IntVecToString(const vector<int>& vec) {
-  string disp = "(";
-  for (int x : vec)
-    disp += std::to_string(x) + ", ";
-  return disp + ")";
-}
-
-/**
- *  * Formatted string.
- *   */
-string VStringPrintf(string fmt, va_list l) {
-  char buffer[4096];
-  vsnprintf(buffer, sizeof(buffer), fmt.c_str(), l);
-  return string(buffer);
-}
-
-/**
- *  * Formatted string.
- *   */
-string StringPrintf(string fmt, ...) {
-  va_list l;
-  va_start(l, fmt);  // fmt.AsString().c_str());
-  string result = VStringPrintf(fmt, l);
-  va_end(l);
-  return result;
-}
-
-int ArgPos(int argc, char** arglist, const char* arg) {
-  for (int i = 0; i < argc; i++) {
-    if (strcmp(arglist[i], arg) == 0) {
-      return i;
-    }
-  }
-  return -1;
-}
-
-void  CreateFolder(const string name) {
-  struct stat buffer;
-  if (stat(name.c_str(), &buffer) != 0) {
-    mkdir(name.c_str(), S_IRWXU | S_IRWXG | S_IROTH | S_IXOTH);
-    CHECK_EQ(stat(name.c_str(), &buffer), 0);
-  }
-}
-
-const vector<vector<int>> Slice(int num, const vector<int>& sizes) {
-  vector<vector<int>> slices;
-  if (num == 0)
-    return slices;
-  int avg = 0;
-  for (int x : sizes)
-      avg += x;
-  avg = avg / num + avg % num;
-  int diff = avg / 10;
-  // DLOG(INFO) << "Slicer, param avg = " << avg << ", diff = " << diff;
-
-  int capacity = avg, nbox = 0;
-  for (int x : sizes) {
-    vector<int> slice;
-    string slicestr = "";
-    while (x > 0) {
-      int size = 0;
-      if (capacity >= x) {
-        capacity -= x;
-        size = x;
-        x = 0;
-      } else if (capacity + diff >= x) {
-        size = x;
-        x = 0;
-        capacity = 0;
-      } else if (capacity >= diff) {
-        x -= capacity;
-        size = capacity;
-        capacity = avg;
-        nbox++;
-      } else {
-        capacity = avg;
-        nbox++;
-      }
-      if (size) {
-        slice.push_back(size);
-        slicestr += ", " + std::to_string(size);
-      }
-    }
-    // DLOG(INFO) << slicestr;
-    slices.push_back(slice);
-  }
-  CHECK_LE(nbox, num);
-  return slices;
-}
-
-const vector<int> PartitionSlices(int num, const vector<int>& slices) {
-  vector<int> slice2box;
-  if (num == 0)
-    return slice2box;
-  int avg = 0;
-  for (int x : slices)
-    avg += x;
-  avg = avg / num + avg % num;
-  int box = avg, boxid = 0, diff = avg / 10;
-  for (auto it = slices.begin(); it != slices.end();) {
-    int x = *it;
-    if (box >= x) {
-      box -= x;
-      slice2box.push_back(boxid);
-      it++;
-    } else if (box + diff >= x) {
-      slice2box.push_back(boxid);
-      it++;
-      box = 0;
-    } else {
-      box = avg;
-      boxid++;
-    }
-  }
-  CHECK_EQ(slice2box.size(), slices.size());
-  int previd = -1;
-  string disp;
-  for (size_t i = 0; i < slice2box.size(); i++) {
-    if (previd != slice2box[i]) {
-      previd = slice2box[i];
-      disp += " box = " +std::to_string(previd) + ":";
-    }
-    disp += " " + std::to_string(slices[i]);
-  }
-  return slice2box;
-}
-
-int gcd(int a, int b) {
-  for (;;) {
-    if (a == 0) return b;
-    b %= a;
-    if (b == 0) return a;
-    a %= b;
-  }
-}
-
-int LeastCommonMultiple(int a, int b) {
-  int temp = gcd(a, b);
-  return temp ? (a / temp * b) : 0;
-}
-
-string GetHostIP() {
-  int fd;
-  struct ifreq ifr;
-  fd = socket(AF_INET, SOCK_DGRAM, 0);
-  /* I want to get an IPv4 IP address */
-  ifr.ifr_addr.sa_family = AF_INET;
-  /* I want IP address attached to "eth0" */
-  strncpy(ifr.ifr_name, "eth0", IFNAMSIZ-1);
-  ioctl(fd, SIOCGIFADDR, &ifr);
-  close(fd);
-  string ip(inet_ntoa(((struct sockaddr_in *)&ifr.ifr_addr)->sin_addr));
-  /* display result */
-  LOG(INFO) << "Host IP = " << ip;
-  return ip;
-}
-
-void SetupLog(const string& log_dir, const string& model) {
-  // TODO(wangwei) check if NFS, then create folder using script, otherwise
-  // may have problems due to multiple processes create the same folder.
-  CreateFolder(log_dir);
-  string warn = log_dir + "/" + model + "-warn-";
-  string info = log_dir + "/" +  model + "-info-";
-  string error = log_dir + "/" +  model + "-error-";
-  string fatal = log_dir + "/" + model + "-fatal-";
-  google::SetLogDestination(google::WARNING, warn.c_str());
-  google::SetLogDestination(google::INFO, info.c_str());
-  google::SetLogDestination(google::ERROR, error.c_str());
-  google::SetLogDestination(google::FATAL, fatal.c_str());
-}
-
-Metric::Metric(const string& str) {
-  ParseFrom(str);
-}
-
-void Metric::Add(const string& name, float value) {
-  Add(name, value, 1);
-}
-void Metric::Add(const string& name, float value, int count) {
-  if (entry_.find(name) == entry_.end()) {
-    entry_[name] = std::make_pair(1, value);
-  } else {
-    auto& e = entry_.at(name);
-    e.first += count;
-    e.second += value;
-  }
-}
-
-void Metric::Reset() {
-  for (auto& e : entry_) {
-    e.second.first = 0;
-    e.second.second = 0;
-  }
-}
-
-string Metric::ToLogString() const {
-  string ret;
-  size_t k = 0;
-  for (auto e : entry_) {
-    ret += e.first + " = ";
-    ret += std::to_string(e.second.second / e.second.first);
-    if (++k < entry_.size())
-      ret += ", ";
-  }
-  return ret;
-}
-
-string Metric::ToString() const {
-  MetricProto proto;
-  for (auto e : entry_) {
-    proto.add_name(e.first);
-    proto.add_count(e.second.first);
-    proto.add_val(e.second.second);
-  }
-  string ret;
-  proto.SerializeToString(&ret);
-  return ret;
-}
-
-void Metric::ParseFrom(const string& msg) {
-  MetricProto proto;
-  proto.ParseFromString(msg);
-  Reset();
-  for (int i = 0; i < proto.name_size(); i++) {
-    entry_[proto.name(i)] = std::make_pair(proto.count(i), proto.val(i));
-  }
-}
-
-
-/*************Below functions are adapted from Caffe ************/
-using google::protobuf::io::CodedInputStream;
-using google::protobuf::io::FileInputStream;
-using google::protobuf::io::FileOutputStream;
-using google::protobuf::io::ZeroCopyInputStream;
-
-
-void Im2col(const float* data_im, const int channels,
-    const int height, const int width, const int kernel_h, const int kernel_w,
-    const int pad_h, const int pad_w, const int stride_h, const int stride_w,
-    float* data_col) {
-  int height_col = (height + 2 * pad_h - kernel_h) / stride_h + 1;
-  int width_col = (width + 2 * pad_w - kernel_w) / stride_w + 1;
-  int channels_col = channels * kernel_h * kernel_w;
-  for (int c = 0; c < channels_col; ++c) {
-    int w_offset = c % kernel_w;
-    int h_offset = (c / kernel_w) % kernel_h;
-    int c_im = c / kernel_h / kernel_w;
-    for (int h = 0; h < height_col; ++h) {
-      for (int w = 0; w < width_col; ++w) {
-        int h_pad = h * stride_h - pad_h + h_offset;
-        int w_pad = w * stride_w - pad_w + w_offset;
-        if (h_pad >= 0 && h_pad < height && w_pad >= 0 && w_pad < width)
-          data_col[(c * height_col + h) * width_col + w] =
-            data_im[(c_im * height + h_pad) * width + w_pad];
-        else
-          data_col[(c * height_col + h) * width_col + w] = 0;
-      }
-    }
-  }
-}
-
-void Col2im(const float* data_col, const int channels,
-    const int height, const int width, const int patch_h, const int patch_w,
-    const int pad_h, const int pad_w, const int stride_h, const int stride_w,
-    float* data_im) {
-  memset(data_im, 0, height * width * channels * sizeof(float));
-  int height_col = (height + 2 * pad_h - patch_h) / stride_h + 1;
-  int width_col = (width + 2 * pad_w - patch_w) / stride_w + 1;
-  int channels_col = channels * patch_h * patch_w;
-  for (int c = 0; c < channels_col; ++c) {
-    int w_offset = c % patch_w;
-    int h_offset = (c / patch_w) % patch_h;
-    int c_im = c / patch_h / patch_w;
-    for (int h = 0; h < height_col; ++h) {
-      for (int w = 0; w < width_col; ++w) {
-        int h_pad = h * stride_h - pad_h + h_offset;
-        int w_pad = w * stride_w - pad_w + w_offset;
-        if (h_pad >= 0 && h_pad < height && w_pad >= 0 && w_pad < width)
-          data_im[(c_im * height + h_pad) * width + w_pad] +=
-            data_col[(c * height_col + h) * width_col + w];
-      }
-    }
-  }
-}
-
-void ForwardMaxPooling(const float* bottom, const int num, const int channels,
-    const int height, const int width, const int kernel_h, const int kernel_w,
-    const int pad_h, const int pad_w, const int stride_h, const int stride_w,
-    float* top, float* mask) {
-  int top_height = (height + pad_h * 2 -kernel_h) / stride_h + 1;
-  int top_width = (width + pad_w * 2 -kernel_w) / stride_w + 1;
-  int top_count = num * top_height * top_width * channels;
-  for (int i = 0; i < top_count; i++) {
-    mask[i] = -1;
-    top[i] = -FLT_MAX;
-  }
-  const int bottom_offset =  height * width;
-  const int top_offset = top_height * top_width;
-  // The main loop
-  for (int n = 0; n < num; ++n) {
-    for (int c = 0; c < channels; ++c) {
-      for (int ph = 0; ph < top_height; ++ph) {
-        for (int pw = 0; pw < top_width; ++pw) {
-          int hstart = ph * stride_h - pad_h;
-          int wstart = pw * stride_w - pad_w;
-          int hend = std::min(hstart + kernel_h, height);
-          int wend = std::min(wstart + kernel_w, width);
-          hstart = std::max(hstart, 0);
-          wstart = std::max(wstart, 0);
-          const int top_index = ph * top_width + pw;
-          for (int h = hstart; h < hend; ++h) {
-            for (int w = wstart; w < wend; ++w) {
-              const int index = h * width + w;
-              if (bottom[index] > top[top_index]) {
-                top[top_index] = bottom[index];
-                mask[top_index] = index;
-              }
-            }
-          }
-        }
-      }
-      // compute offset
-      bottom += bottom_offset;
-      top += top_offset;
-      mask += top_offset;
-    }
-  }
-}
-
-void BackwardMaxPooling(const float* top, const float* mask, const int num,
-    const int channels, const int height, const int width,
-    const int kernel_h, const int kernel_w, const int pad_h, const int pad_w,
-    const int stride_h, const int stride_w,
-    float* bottom) {
-  int top_height = (height + pad_h * 2 -kernel_h) / stride_h + 1;
-  int top_width = (width + pad_w * 2 -kernel_w) / stride_w + 1;
-  const int top_offset = top_height * top_width;
-  const int bottom_offset = height * width;
-  memset(bottom, 0, sizeof(float) * num * channels * bottom_offset);
-  for (int n = 0; n < num; ++n) {
-    for (int c = 0; c < channels; ++c) {
-      for (int ph = 0; ph < top_height; ++ph) {
-        for (int pw = 0; pw < top_width; ++pw) {
-          const int top_idx = ph * top_width + pw;
-          const int bottom_idx = static_cast<int>(mask[top_idx]);
-          bottom[bottom_idx] += top[top_idx];
-        }
-      }
-      top += top_offset;
-      mask += top_offset;
-      bottom += bottom_offset;
-    }
-  }
-}
-
-void ForwardAvgPooling(const float* bottom, const int num, const int channels,
-    const int height, const int width, const int kernel_h, const int kernel_w,
-    const int pad_h, const int pad_w, const int stride_h, const int stride_w,
-    float* top) {
-  int top_height = (height + pad_h * 2 -kernel_h) / stride_h + 1;
-  int top_width = (width + pad_w * 2 -kernel_w) / stride_w + 1;
-  int top_count = num * top_height * top_width * channels;
-  for (int i = 0; i < top_count; i++) {
-    top[i] = 0;
-  }
-  const int bottom_offset =  height * width;
-  const int top_offset = top_height * top_width;
-  // The main loop
-  for (int n = 0; n < num; ++n) {
-    for (int c = 0; c < channels; ++c) {
-      for (int ph = 0; ph < top_height; ++ph) {
-        for (int pw = 0; pw < top_width; ++pw) {
-          int hstart = ph * stride_h - pad_h;
-          int wstart = pw * stride_w - pad_w;
-          int hend = std::min(hstart + kernel_h, height+pad_h);
-          int wend = std::min(wstart + kernel_w, width+pad_w);
-          int pool_size = (hend-hstart) * (wend-wstart);
-          hstart = std::max(hstart, 0);
-          wstart = std::max(wstart, 0);
-          hend = std::min(hend, height);
-          wend = std::min(wend, width);
-          const int top_index = ph * top_width + pw;
-          for (int h = hstart; h < hend; ++h) {
-            for (int w = wstart; w < wend; ++w) {
-              const int index = h * width + w;
-              top[top_index] += bottom[index];
-            }
-          }
-          top[top_index] /= pool_size;
-        }
-      }
-      // compute offset
-      bottom += bottom_offset;
-      top += top_offset;
-    }
-  }
-}
-
-void BackwardAvgPooling(const float* top, const int num, const int channels,
-    const int height, const int width, const int kernel_h, const int kernel_w,
-    const int pad_h, const int pad_w, const int stride_h, const int stride_w,
-    float* bottom) {
-  int top_height = (height + pad_h * 2 -kernel_h) / stride_h + 1;
-  int top_width = (width + pad_w * 2 -kernel_w) / stride_w + 1;
-  const int top_offset = top_height * top_width;
-  const int bottom_offset = height * width;
-  memset(bottom, 0, sizeof(float) * num * channels * bottom_offset);
-  for (int n = 0; n < num; ++n) {
-    for (int c = 0; c < channels; ++c) {
-      for (int ph = 0; ph < top_height; ++ph) {
-        for (int pw = 0; pw < top_width; ++pw) {
-          int hstart = ph * stride_h - pad_h;
-          int wstart = pw * stride_w - pad_w;
-          int hend = std::min(hstart + kernel_h, height+pad_h);
-          int wend = std::min(wstart + kernel_w, width+pad_w);
-          int pool_size = (hend-hstart) * (wend-wstart);
-          hstart = std::max(hstart, 0);
-          wstart = std::max(wstart, 0);
-          hend = std::min(hend, height);
-          wend = std::min(wend, width);
-          const int top_index = ph * top_width + pw;
-          for (int h = hstart; h < hend; ++h) {
-            for (int w = wstart; w < wend; ++w) {
-              const int index = h * width + w;
-              bottom[index] += top[top_index] / pool_size;
-            }
-          }
-        }
-      }
-      top += top_offset;
-      bottom += bottom_offset;
-    }
-  }
-}
-
-void ReadProtoFromTextFile(const char* filename, Message* proto) {
-  int fd = open(filename, O_RDONLY);
-  CHECK_NE(fd, -1) << "File not found: " << filename;
-  FileInputStream* input = new FileInputStream(fd);
-  CHECK(google::protobuf::TextFormat::Parse(input, proto));
-  delete input;
-  close(fd);
-}
-
-void WriteProtoToTextFile(const Message& proto, const char* filename) {
-  int fd = open(filename, O_WRONLY | O_CREAT, 0644);
-  FileOutputStream* output = new FileOutputStream(fd);
-  CHECK(google::protobuf::TextFormat::Print(proto, output));
-  delete output;
-  close(fd);
-}
-
-void ReadProtoFromBinaryFile(const char* filename, Message* proto) {
-  int fd = open(filename, O_RDONLY);
-  CHECK_NE(fd, -1) << "File not found: " << filename;
-  ZeroCopyInputStream* raw_input = new FileInputStream(fd);
-  CodedInputStream* coded_input = new CodedInputStream(raw_input);
-  // upper limit 512MB, warning threshold 256MB
-  coded_input->SetTotalBytesLimit(536870912, 268435456);
-  CHECK(proto->ParseFromCodedStream(coded_input));
-  delete coded_input;
-  delete raw_input;
-  close(fd);
-}
-
-void WriteProtoToBinaryFile(const Message& proto, const char* filename) {
-  int fd = open(filename, O_CREAT|O_WRONLY|O_TRUNC, 0644);
-  CHECK_NE(fd, -1) << "File cannot open: " << filename;
-  CHECK(proto.SerializeToFileDescriptor(fd));
-}
-
-
-
-void WriteStringToTextFile(const string& filename, const string& context) {
-  std::ofstream ofs;
-  ofs.open(filename);
-  CHECK(ofs.is_open()) << "Can't write to file: " << filename;
-  ofs << context;
-  ofs.flush();
-  ofs.close();
-}
-
-
-const vector<std::pair<string, float>> GetMetricFromString(const string& disp) {
-  size_t pos = 0;
-  vector<string> terms;
-  while (pos != string::npos) {
-    auto next = disp.find_first_of(" ,", pos);  // delimiter: space or comma
-    if (next != string::npos) {
-      terms.push_back(disp.substr(pos, next - pos));
-      pos = disp.find_first_not_of(" ,", next + 1);
-    } else {
-      break;
-    }
-  }
-  if (pos != string::npos)
-    terms.push_back(disp.substr(pos));
-  vector<std::pair<string, float>> ret;
-  for (unsigned i = 0; i < terms.size(); i++) {
-    if (terms[i] == "=") {
-      CHECK_GE(i, 1);
-      CHECK_LT(i, terms.size() - 1) << "terms[i] = " << terms[i];
-      ret.push_back(std::make_pair(terms[i-1], std::stof(terms[i + 1])));
-    }
-  }
-  return ret;
-}
-}  // namespace singa
diff --git a/src/utils/graph.cc b/src/utils/graph.cc
deleted file mode 100644
index 4f59635..0000000
--- a/src/utils/graph.cc
+++ /dev/null
@@ -1,273 +0,0 @@
-/************************************************************
-*
-* Licensed to the Apache Software Foundation (ASF) under one
-* or more contributor license agreements.  See the NOTICE file
-* distributed with this work for additional information
-* regarding copyright ownership.  The ASF licenses this file
-* to you under the Apache License, Version 2.0 (the
-* "License"); you may not use this file except in compliance
-* with the License.  You may obtain a copy of the License at
-*
-*   http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an
-* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-* KIND, either express or implied.  See the License for the
-* specific language governing permissions and limitations
-* under the License.
-*
-*************************************************************/
-
-#include "singa/utils/graph.h"
-
-#include <glog/logging.h>
-#include <algorithm>
-#include <queue>
-#include <unordered_set>
-#include "singa/utils/common.h"
-
-namespace singa {
-
-using std::map;
-using std::string;
-using std::vector;
-
-/**************************************************************************
- * Implementation for Node class
- *************************************************************************/
-Node::Node(string name) {
-  this->name = name;
-}
-Node::Node(string name, const std::map<string, string>& attrs) {
-  this->name = name;
-  this->attrs = attrs;
-}
-
-Node::Node(const string& name, const string& origin, int id, void* proto) {
-  this->name = name;
-  this->origin = origin;
-  this->proto = proto;
-  this->partition_id = id;
-}
-
-void Node::AddDstNode(Node* dstnode) {
-  dstnodes.push_back(dstnode);
-}
-
-void Node::AddSrcNode(Node* srcnode) {
-  srcnodes.push_back(srcnode);
-}
-
-void Node::RemoveDstNode(Node* dst) {
-  auto iter = dstnodes.begin();
-  while ((*iter)->name != dst->name && iter != dstnodes.end())
-    iter++;
-  CHECK_STREQ((*iter)->name.c_str(), dst->name.c_str());
-  dstnodes.erase(iter);
-}
-
-void Node::RemoveSrcNode(Node* src) {
-  auto iter = srcnodes.begin();
-  while ((*iter)->name != src->name && iter != srcnodes.end())
-    iter++;
-  CHECK_STREQ((*iter)->name.c_str(), src->name.c_str());
-  srcnodes.erase(iter);
-}
-
-/****************************************************************************
- * Implementation for Graph class
- ****************************************************************************/
-
-Graph::~Graph() {
-  for (Node* node : nodes_)
-    delete node;
-}
-
-Node* Graph::AddNode(const string& name, const string& origin, int id,
-                    void* proto) {
-  Node* node = new Node(name, origin, id, proto);
-  nodes_.push_back(node);
-  CHECK(name2node_.find(node->name) == name2node_.end())
-    << "node " << node->name << " already exists";
-  name2node_[node->name] = node;
-  return node;
-}
-
-Node* Graph::AddNode(const string& name,
-                     const std::map<string, string>& attrs) {
-  Node* node = new Node(name, attrs);
-  nodes_.push_back(node);
-  CHECK(name2node_.find(node->name) == name2node_.end())
-    << "node " << node->name << " already exists";
-  name2node_[node->name] = node;
-  return node;
-}
-
-void Graph::AddEdge(Node* srcnode, Node* dstnode) {
-  srcnode->AddDstNode(dstnode);
-  dstnode->AddSrcNode(srcnode);
-}
-
-void Graph::AddEdge(const string& src, const string& dst) {
-  auto srcnode = name2node_.find(src);
-  CHECK(srcnode != name2node_.end()) << "can't find src node " << src;
-  auto dstnode = name2node_.find(dst);
-  CHECK(dstnode != name2node_.end()) << "can't find dst node " << dst;
-  AddEdge(srcnode->second, dstnode->second);
-}
-void Graph::AddEdge(Node* srcnode, Node* dstnode,
-      const std::map<string, string>& attrs) {
-  AddEdge(srcnode, dstnode);
-  edge_attrs_[GetEdgeName(srcnode->name, dstnode->name)] = attrs;
-}
-void Graph::AddEdge(const string& src, const std::string& dst,
-      const std::map<string, string>& attrs) {
-  AddEdge(src, dst);
-  edge_attrs_[GetEdgeName(src, dst)] = attrs;
-}
-
-void Graph::RemoveEdge(Node* src, Node* dst) {
-  src->RemoveDstNode(dst);
-  dst->RemoveSrcNode(src);
-}
-
-void Graph::RemoveEdge(const string &src, const string& dst) {
-  auto srcnode = name2node_.find(src);
-  CHECK(srcnode != name2node_.end()) << "can't find src node " << src;
-  auto dstnode = name2node_.find(dst);
-  CHECK(dstnode != name2node_.end()) << "can't find dst node " << dst;
-  RemoveEdge(srcnode->second, dstnode->second);
-}
-
-// sort to make `bottom' nodes be placed in the front positions
-void Graph::Sort() {
-  // nodes to be visited
-  std::queue<Node*> visiting_nodes;
-  // visited node set
-  std::unordered_set<Node*> visited_set;
-  // visiting_nodes + visted_set
-  std::unordered_set<Node*> visit_set;;
-  for (auto node : nodes_) {
-    // visit nodes without source nodes firstly
-    if (node->srcnodes.size() == 0) {
-      visiting_nodes.push(node);
-      visit_set.insert(node);
-    }
-  }
-  int n = nodes_.size();
-  nodes_.clear();
-  while (!visiting_nodes.empty()) {
-    auto node = visiting_nodes.front();
-    visiting_nodes.pop();
-    bool visit = true;
-    bool bi_direction = false;
-    // check if a node has a bi-direction edge with its neighbour
-    for (auto src : node->srcnodes)
-      for (auto src_of_src : src->srcnodes)
-        if (strcmp((src_of_src->name).c_str(), (node->name).c_str()) == 0) {
-          bi_direction = true;
-          break;
-        }
-    // check whether its src nodes number greater than 1
-    if (bi_direction && (node->srcnodes).size() > 1) {
-        auto src = node->srcnodes.at(0);
-        if (visited_set.find(src) == visited_set.end()) {
-          visit = false;
-        }
-    } else {
-      for (auto src : node->srcnodes)
-        if (visited_set.find(src) == visited_set.end()) {
-          visit = false;
-          break;
-        }
-    }
-    if (visit) {
-      nodes_.push_back(node);
-      visited_set.insert(node);
-      for (auto dst : node->dstnodes) {
-        // queueing the dst node if it is not queued before
-        if (visit_set.find(dst) == visit_set.end()) {
-          visiting_nodes.push(dst);
-          visit_set.insert(dst);
-        }
-      }
-    } else {
-      visiting_nodes.push(node);
-    }
-  }
-  CHECK_EQ(nodes_.size(), n);
-}
-
-const Graph Graph::Reverse() const {
-  Graph g;
-  for (Node* n : nodes_)
-    g.AddNode(n->name, n->attrs);
-  for (Node* src : nodes_)
-    for (Node* dst : src->dstnodes) {
-      map<string, string> attrs;
-      const string edge = GetEdgeName(src->name, dst->name);
-      if (edge_attrs_.find(edge) != edge_attrs_.end())
-        attrs = edge_attrs_.at(edge);
-      g.AddEdge(dst->name, src->name, attrs);
-    }
-  return g;
-}
-string Graph::ToJson() const {
-  map<string, string> label;
-  return ToJson(label);
-}
-
-
-string Graph::ToJson(const map<string, string>& label) const {
-  string disp = "{\"directed\":1,\n";
-
-  // add nodes
-  disp += "\"nodes\":[\n";
-
-  bool first = true;
-  map<string, int> node_id;
-  int id = 0;
-  for (auto node : nodes_) {
-    string name = node->name;
-    string lbl = name + " -- ";
-    if (label.find(name) != label.end())
-      lbl += label.at(name);
-    if (node->attrs.find("label") != node->attrs.end())
-      lbl += node->attrs.at("label");
-    disp += StringPrintf("%c{\"id\":\"%s\", \"label\":\"%s\"",
-        !first ? ',' : ' ', name.c_str(), lbl.c_str());
-    for (const auto& attr : node->attrs)
-      if (attr.first != "label")
-        disp += StringPrintf(", \"%s\":\"%s\"",
-            attr.first.c_str(), attr.second.c_str());
-    disp += "}\n";
-    first = false;
-    node_id[name] = id++;
-  }
-  disp += "]\n,\n";
-
-  // add edges
-  disp += "\"links\":[\n";
-  first = true;
-  for (auto src : nodes_) {
-    for (auto dst : src->dstnodes) {
-      const string edge_name = GetEdgeName(src->name, dst->name);
-      string lbl = "";
-      if (label.find(edge_name) != label.end())
-        lbl = label.at(edge_name);
-      disp += StringPrintf("%c{\"source\":%d, \"target\":%d, \"label\": \"%s\"",
-          !first ? ',' : ' ', node_id[src->name], node_id[dst->name],
-          lbl.c_str());
-      if (edge_attrs_.find(edge_name) != edge_attrs_.end()) {
-        for (const auto& attr : edge_attrs_.at(edge_name))
-          disp += StringPrintf(", \"%s\":\"%s\"",
-              attr.first.c_str(), attr.second.c_str());
-      }
-      disp += "}\n";
-      first = false;
-    }
-  }
-  return disp + "]}";
-}
-}  // namespace singa
diff --git a/src/utils/image_transform.cc b/src/utils/image_transform.cc
deleted file mode 100644
index 28d5f4c..0000000
--- a/src/utils/image_transform.cc
+++ /dev/null
@@ -1,57 +0,0 @@
-/************************************************************
-*
-* Licensed to the Apache Software Foundation (ASF) under one
-* or more contributor license agreements.  See the NOTICE file
-* distributed with this work for additional information
-* regarding copyright ownership.  The ASF licenses this file
-* to you under the Apache License, Version 2.0 (the
-* "License"); you may not use this file except in compliance
-* with the License.  You may obtain a copy of the License at
-*
-*   http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an
-* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-* KIND, either express or implied.  See the License for the
-* specific language governing permissions and limitations
-* under the License.
-*
-*************************************************************/
-#include "singa/utils/image_transform.h"
-
-namespace singa {
-
-void ImageTransform(const float* in, const float* mean, bool mirror, int h_crop,
-    int w_crop, int h_offset, int w_offset, int channel, int height, int width,
-    float scale, float* out) {
-  if (h_crop == 0) {
-    CHECK_EQ(h_offset, 0);
-    h_crop = height;
-  }
-  if (w_crop ==0) {
-    CHECK_EQ(w_offset, 0);
-    w_crop = width;
-  }
-  CHECK_NE(scale, 0);
-
-  int out_idx = 0, in_idx = 0;
-  for (int c = 0; c < channel; c++) {
-    for (int h = 0; h < h_crop; h++) {
-      for (int w = 0; w < w_crop; w++) {
-        in_idx = (c * height + h_offset + h) * width + w_offset + w;
-        if (mirror) {
-          out_idx = (c * h_crop + h) * w_crop + (w_crop - 1 - w);
-        } else {
-          out_idx = (c * h_crop + h) * w_crop + w;
-        }
-        out[out_idx] = in[in_idx];
-        if (mean != nullptr)
-          out[out_idx] -= mean[in_idx];
-        out[out_idx] *= scale;
-      }
-    }
-  }
-}
-
-}  // namespace singa
diff --git a/src/utils/job_manager.cc b/src/utils/job_manager.cc
deleted file mode 100644
index 2ea5b1b..0000000
--- a/src/utils/job_manager.cc
+++ /dev/null
@@ -1,271 +0,0 @@
-/************************************************************
-*
-* Licensed to the Apache Software Foundation (ASF) under one
-* or more contributor license agreements.  See the NOTICE file
-* distributed with this work for additional information
-* regarding copyright ownership.  The ASF licenses this file
-* to you under the Apache License, Version 2.0 (the
-* "License"); you may not use this file except in compliance
-* with the License.  You may obtain a copy of the License at
-* 
-*   http://www.apache.org/licenses/LICENSE-2.0
-* 
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an
-* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-* KIND, either express or implied.  See the License for the
-* specific language governing permissions and limitations
-* under the License.
-*
-*************************************************************/
-
-#include "singa/utils/job_manager.h"
-
-#include <glog/logging.h>
-#include <google/protobuf/text_format.h>
-#include <stdlib.h>
-#include <algorithm>
-#include <fstream>
-#include <iostream>
-#include "singa/proto/job.pb.h"
-
-using std::string;
-using std::vector;
-
-namespace singa {
-
-JobManager::JobManager(const string& host) {
-  host_ = host;
-}
-
-bool JobManager::Init() {
-#ifdef USE_ZOOKEEPER
-  if (!zk_.Init(host_, timeout_)) return false;
-  if (!zk_.CreateNode(kZKPathSinga.c_str(), nullptr, 0, nullptr))
-    return false;
-  if (!zk_.CreateNode(kZKPathSys.c_str(), nullptr, 0, nullptr))
-    return false;
-  if (!zk_.CreateNode(kZKPathJLock.c_str(), nullptr, 0, nullptr))
-    return false;
-  if (!zk_.CreateNode(kZKPathHostIdx.c_str(), "0", 0, nullptr))
-    return false;
-  if (!zk_.CreateNode(kZKPathApp.c_str(), nullptr, 0, nullptr))
-    return false;
-#endif
-  return true;
-}
-
-bool JobManager::GenerateJobID(int* id) {
-#ifdef USE_ZOOKEEPER
-  char buf[kZKBufSize];
-  string lock = kZKPathJLock + "/lock-";
-  if (!zk_.CreateNode(lock.c_str(), nullptr,
-                        ZOO_EPHEMERAL | ZOO_SEQUENCE, buf)) {
-    return false;
-  }
-  *id = atoi(buf + strlen(buf) - 10);
-#else
-  *id = 0;
-#endif
-  return true;
-}
-
-bool JobManager::GenerateHostList(const char* host_file, const char* job_file,
-                                  vector<string>* list) {
-  int nprocs = 1;
-  list->clear();
-  // compute required #process from job conf
-  if (job_file != nullptr) {
-    ClusterProto cluster;
-    google::protobuf::TextFormat::ParseFromString(ExtractClusterConf(job_file),
-                                                  &cluster);
-    int nworker_procs = cluster.nworker_groups() * cluster.nworkers_per_group()
-                        / cluster.nworkers_per_procs();
-    int nserver_procs = cluster.nserver_groups() * cluster.nservers_per_group()
-                        / cluster.nservers_per_procs();
-    if (cluster.server_worker_separate())
-      nprocs = nworker_procs + nserver_procs;
-    else
-      nprocs = std::max(nworker_procs, nserver_procs);
-  }
-#ifdef USE_ZOOKEEPER
-  // get available host list from global conf
-  std::ifstream hostfile(host_file);
-  if (!hostfile.is_open()) {
-    LOG(FATAL) << "Cannot open file: " << host_file;
-  }
-  vector<string> hosts;
-  string host;
-  while (!hostfile.eof()) {
-    getline(hostfile, host);
-    if (!host.length() || host[0] == '#') continue;
-    hosts.push_back(host);
-  }
-  if (!hosts.size()) {
-    LOG(FATAL) << "Empty host file";
-  }
-  // read next host index
-  char val[kZKBufSize];
-  if (!zk_.GetNode(kZKPathHostIdx.c_str(), val)) return false;
-  int next = atoi(val);
-  // generate host list
-  for (int i = 0; i < nprocs; ++i) {
-    list->push_back(hosts[(next + i) % hosts.size()]);
-  }
-  // write next host index
-  next = (next + nprocs) % hosts.size();
-  snprintf(val, kZKBufSize, "%d", next);
-  if (!zk_.UpdateNode(kZKPathHostIdx.c_str(), val)) return false;
-#else
-  CHECK_EQ(nprocs, 1) << "To run multi-process job, please enable zookeeper";
-  list->push_back("localhost");
-#endif
-  return true;
-}
-
-bool JobManager::ListJobProcs(int job, vector<string>* procs) {
-  procs->clear();
-#ifdef USE_ZOOKEEPER
-  string job_path = GetZKJobWorkspace(job);
-  // check job path
-  if (!zk_.Exist(job_path.c_str())) {
-    LOG(ERROR) << "job " << job << " not exists";
-    return true;
-  }
-  string proc_path = job_path + kZKPathJobProc;
-  vector<string> vt;
-  // check job proc path
-  if (!zk_.GetChild(proc_path.c_str(), &vt)) {
-    return false;
-  }
-  char buf[singa::kZKBufSize];
-  for (string pname : vt) {
-    pname = proc_path + "/" + pname;
-    if (!zk_.GetNode(pname.c_str(), buf)) continue;
-    std::string proc = "";
-    for (int i = 0; buf[i] != '\0'; ++i) {
-      if (buf[i] == ':') {
-        buf[i] = '\0';
-        proc += buf;
-      } else if (buf[i] == '|') {
-        proc += buf + i;
-      }
-    }
-    procs->push_back(proc);
-  }
-  if (!procs->size()) LOG(ERROR) << "job " << job << " not exists";
-#endif
-  return true;
-}
-
-bool JobManager::ListJobs(vector<JobInfo>* jobs) {
-  jobs->clear();
-#ifdef USE_ZOOKEEPER
-  vector<string> vt;
-  // get all children in app path
-  if (!zk_.GetChild(kZKPathApp.c_str(), &vt)) {
-    return false;
-  }
-  std::sort(vt.begin(), vt.end());
-  int size = static_cast<int>(vt.size());
-  vector<string> procs;
-  for (int i = 0; i < size; ++i) {
-    string path = kZKPathApp + "/" + vt[i] + kZKPathJobProc;
-    if (!zk_.GetChild(path.c_str(), &procs)) continue;
-    JobInfo job;
-    string jid = vt[i].substr(vt[i].length()-10);
-    job.id = atoi(jid.c_str());
-    job.procs = procs.size();
-    jobs->push_back(job);
-    // may need to delete it
-    if (!job.procs && (i + kJobsNotRemoved < size))
-        CleanPath(kZKPathApp + "/" + vt[i], true);
-  }
-#else
-  LOG(ERROR) << "Not supported without zookeeper";
-#endif
-  return true;
-}
-
-bool JobManager::Remove(int job) {
-#ifdef USE_ZOOKEEPER
-  string path = GetZKJobWorkspace(job) + kZKPathJobProc;
-  if (zk_.Exist(path.c_str())) {
-    return CleanPath(path.c_str(), false);
-  }
-#else
-  LOG(ERROR) << "Not supported without zookeeper";
-#endif
-  return true;
-}
-
-bool JobManager::RemoveAllJobs() {
-#ifdef USE_ZOOKEEPER
-  if (zk_.Exist(kZKPathApp.c_str())) {
-    return CleanPath(kZKPathApp.c_str(), false);
-  }
-#else
-  LOG(ERROR) << "Not supported without zookeeper";
-#endif
-  return true;
-}
-
-bool JobManager::CleanUp() {
-#ifdef USE_ZOOKEEPER
-  if (zk_.Exist(kZKPathSinga.c_str())) {
-    return CleanPath(kZKPathSinga.c_str(), true);
-  }
-#else
-  LOG(ERROR) << "Not supported without zookeeper";
-#endif
-  return true;
-}
-
-bool JobManager::CleanPath(const string& path, bool remove) {
-#ifdef USE_ZOOKEEPER
-  vector<string> child;
-  if (!zk_.GetChild(path.c_str(), &child)) return false;
-  for (string c : child) {
-    if (!CleanPath(path + "/" + c, true)) return false;
-  }
-  if (remove) return zk_.DeleteNode(path.c_str());
-#else
-  LOG(ERROR) << "Not supported without zookeeper";
-#endif
-  return true;
-}
-
-// extract cluster configuration part from the job config file
-// TODO(wangsh) improve this function to make it robust
-string JobManager::ExtractClusterConf(const char* job_file) {
-  std::ifstream fin(job_file);
-  CHECK(fin.is_open()) << "cannot open job conf file " << job_file;
-  string line;
-  string cluster;
-  bool in_cluster = false;
-  while (!fin.eof()) {
-    std::getline(fin, line);
-    if (in_cluster == false) {
-      size_t pos = line.find("cluster");
-      if (pos == std::string::npos) continue;
-      in_cluster = true;
-      line = line.substr(pos);
-      cluster = "";
-    }
-    if (in_cluster == true) {
-      cluster += line + "\n";
-      if (line.find("}") != std::string::npos)
-        in_cluster = false;
-    }
-  }
-  LOG(INFO) << "cluster configure: " << cluster;
-  size_t s_pos = cluster.find("{");
-  size_t e_pos = cluster.find("}");
-  if (s_pos == std::string::npos || e_pos == std::string::npos) {
-    LOG(FATAL) << "cannot extract valid cluster configuration in file: "
-               << job_file;
-  }
-  return cluster.substr(s_pos + 1, e_pos - s_pos-1);
-}
-
-}  // namespace singa
diff --git a/src/utils/logging.cc b/src/utils/logging.cc
new file mode 100644
index 0000000..3b0916b
--- /dev/null
+++ b/src/utils/logging.cc
@@ -0,0 +1,170 @@
+/************************************************************
+*
+* Licensed to the Apache Software Foundation (ASF) under one
+* or more contributor license agreements.  See the NOTICE file
+* distributed with this work for additional information
+* regarding copyright ownership.  The ASF licenses this file
+* to you under the Apache License, Version 2.0 (the
+* "License"); you may not use this file except in compliance
+* with the License.  You may obtain a copy of the License at
+* 
+*   http://www.apache.org/licenses/LICENSE-2.0
+* 
+* Unless required by applicable law or agreed to in writing,
+* software distributed under the License is distributed on an
+* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+* KIND, either express or implied.  See the License for the
+* specific language governing permissions and limitations
+* under the License.
+*
+*************************************************************/
+
+#include "singa/utils/logging.h"
+
+#include <stdlib.h>
+#include <sys/types.h>
+#include <unistd.h>
+
+namespace singa {
+
+FILE* log_file[NUM_SEVERITIES] = {};
+bool not_log_stderr[NUM_SEVERITIES] = {};
+
+void InitLogging(const char *argv) {
+#ifdef USE_GLOG
+  google::InitGoogleLogging(argv);
+#else
+  LogToStderr();
+#endif
+}
+
+void LogToStderr() {
+#ifdef USE_GLOG
+  google::LogToStderr();
+#else
+  for (int i = 0; i < NUM_SEVERITIES; ++i) {
+    log_file[i] = nullptr;
+    not_log_stderr[i] = false;
+  }
+#endif
+}
+
+void SetStderrLogging(int severity) {
+#ifdef USE_GLOG
+  google::SetStderrLogging(severity);
+#else
+  for (int i = 0; i < NUM_SEVERITIES; ++i) {
+    not_log_stderr[i] = i >= severity ? false : true;
+  }
+#endif
+}
+
+void SetLogDestination(int severity, const char* path) {
+#ifdef USE_GLOG
+  google::SetLogDestination(severity, path);
+#else
+  log_file[severity] = fopen(path, "a");
+  if (severity < ERROR) not_log_stderr[severity] = true;
+#endif
+}
+
+#ifndef USE_GLOG
+namespace logging {
+
+LogMessage::LogMessage(const char* fname, int line, int severity)
+    : fname_(fname), line_(line), severity_(severity) {}
+
+inline pid_t GetPID() { return getpid(); }
+inline pid_t GetTID() { return (pid_t)(uintptr_t)pthread_self(); }
+
+void LogMessage::GenerateLogMessage() {
+  time_t rw_time = time(nullptr);
+  struct tm tm_time;
+  localtime_r(&rw_time, &tm_time);
+  // log to a file
+  for (int i = severity_; i >= 0; --i)
+    if (log_file[i] )
+      DoLogging(log_file[i], tm_time);
+  // log to stderr
+  if (!not_log_stderr[severity_])
+    DoLogging(stderr, tm_time);
+}
+
+void LogMessage::DoLogging(FILE* file, const struct tm& tm_time) {
+  fprintf(file, "[%c d%02d%02d t%02d:%02d:%02d p%05d:%03d %s:%d] %s\n",
+          "IWEF"[severity_],
+          1 + tm_time.tm_mon,
+          tm_time.tm_mday,
+          tm_time.tm_hour,
+          tm_time.tm_min,
+          tm_time.tm_sec,
+          GetPID(),
+          static_cast<unsigned>(GetTID()%1000),
+          fname_,
+          line_,
+          str().c_str());
+}
+
+LogMessage::~LogMessage() { GenerateLogMessage(); }
+
+LogMessageFatal::LogMessageFatal(const char* file, int line)
+  : LogMessage(file, line, FATAL) {}
+LogMessageFatal::~LogMessageFatal() {
+  // abort() ensures we don't return
+  GenerateLogMessage();
+  abort();
+}
+
+template <>
+void MakeCheckOpValueString(std::ostream* os, const char& v) {
+  if (v >= 32 && v <= 126) {
+    (*os) << "'" << v << "'";
+  } else {
+    (*os) << "char value " << (short)v;
+  }
+}
+
+template <>
+void MakeCheckOpValueString(std::ostream* os, const signed char& v) {
+  if (v >= 32 && v <= 126) {
+    (*os) << "'" << v << "'";
+  } else {
+    (*os) << "signed char value " << (short)v;
+  }
+}
+
+template <>
+void MakeCheckOpValueString(std::ostream* os, const unsigned char& v) {
+  if (v >= 32 && v <= 126) {
+    (*os) << "'" << v << "'";
+  } else {
+    (*os) << "unsigned char value " << (unsigned short)v;
+  }
+}
+
+template <>
+void MakeCheckOpValueString(std::ostream* os, const std::nullptr_t& p) {
+    (*os) << "nullptr";
+}
+
+CheckOpMessageBuilder::CheckOpMessageBuilder(const char* exprtext)
+    : stream_(new std::ostringstream) {
+  *stream_ << "Check failed: " << exprtext << " (";
+}
+
+CheckOpMessageBuilder::~CheckOpMessageBuilder() { delete stream_; }
+
+std::ostream* CheckOpMessageBuilder::ForVar2() {
+  *stream_ << " vs. ";
+  return stream_;
+}
+
+string* CheckOpMessageBuilder::NewString() {
+  *stream_ << ")";
+  return new string(stream_->str());
+}
+
+}  // namespace logging
+#endif
+
+}  // namespace singa
diff --git a/src/utils/math_kernel.cu b/src/utils/math_kernel.cu
deleted file mode 100644
index 65d7067..0000000
--- a/src/utils/math_kernel.cu
+++ /dev/null
@@ -1,450 +0,0 @@
-/************************************************************
-*
-* Licensed to the Apache Software Foundation (ASF) under one
-* or more contributor license agreements.  See the NOTICE file
-* distributed with this work for additional information
-* regarding copyright ownership.  The ASF licenses this file
-* to you under the Apache License, Version 2.0 (the
-* "License"); you may not use this file except in compliance
-* with the License.  You may obtain a copy of the License at
-*
-*   http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an
-* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-* KIND, either express or implied.  See the License for the
-* specific language governing permissions and limitations
-* under the License.
-*
-*************************************************************/
-#include <cmath>
-#include <algorithm>
-#include "singa/utils/math_kernel.h"
-#include "mshadow/tensor.h"  // FLT_MIN?
-
-#define CU2DBLOCK_X 32
-#define CU2DBLOCK_Y 32
-
-#define CU1DBLOCK 1024
-#define CU1DBLOCKF 1024.0
-
-// Cuda Kernel Functions
-
-__global__
-void kernel_softmax_loss(const float *prob, const int *label , float *loss,
-    int n, int dim) {
-  int index = blockIdx.x * blockDim.x + threadIdx.x;
-  int num_threads = blockDim.x * gridDim.x;
-  for (; index < n; index += num_threads) {
-    float prob_of_truth = prob[index * dim + label[index]];
-    loss[index] -= log(max(prob_of_truth, FLT_MIN));
-  }
-}
-
-__global__
-void kernel_softmax_gradient(float *grad, const int *label ,
-    int n, int dim, float scale) {
-  int index = blockIdx.x * blockDim.x + threadIdx.x;
-  int num_threads = blockDim.x * gridDim.x;
-  for (; index < n; index += num_threads) {
-    int pos = index * dim + label[index];
-    grad[pos] = (grad[pos] - 1.0f) * scale;
-  }
-}
-
-__global__
-void kernel_sum_vec(float *data, float *sum , int n) {
-  int THREADS = blockDim.x;
-
-  __shared__ float aux[CU1DBLOCK];
-  int steps = (n - 1) / THREADS + 1;
-  aux[threadIdx.x] = data[threadIdx.x];
-
-  for (int i = 1; i < steps; ++i) {
-    if (threadIdx.x + i * THREADS < n) {
-      aux[threadIdx.x] += data[threadIdx.x+i*THREADS];
-    }
-  }
-
-  int total_threads = THREADS;
-  __syncthreads();
-
-  while (total_threads > 1) {
-    int half_point = ((1+total_threads) >> 1);
-    if (threadIdx.x < half_point) {
-      if (threadIdx.x+half_point < total_threads) {
-        aux[threadIdx.x] += aux[threadIdx.x + half_point];
-      }
-    }
-    __syncthreads();
-    total_threads = ((total_threads+1) >> 1);
-  }
-
-  __syncthreads();
-  *sum = aux[0];
-}
-
-__global__
-void kernel_sum_col(const float *src_mat_data,
-    float *dst_vec_data, int rows, int cols, int stride) {
-  int index = blockIdx.x * blockDim.x + threadIdx.x;
-  int num_threads = blockDim.x * gridDim.x;
-  for (; index < rows; index += num_threads) {
-    dst_vec_data[index] = 0.0f;
-    for (int k = 0; k < cols; k++) {
-      dst_vec_data[index] += src_mat_data[index * stride + k];
-    }
-  }
-}
-
-__global__
-void kernel_sum_row(const float *src_mat_data,
-    float *dst_vec_data, int rows, int cols, int stride) {
-  int j = blockIdx.x;
-  int THREADS = blockDim.x;
-  if (j >= cols) {
-    return;
-  }
-
-  __shared__ float aux[CU1DBLOCK];
-  int steps = (rows - 1) / THREADS + 1;
-  aux[threadIdx.x] = src_mat_data[j+threadIdx.x*stride];
-  for (int i = 1; i < steps; ++i) {
-    if (threadIdx.x+i*THREADS < rows) {
-      aux[threadIdx.x] += src_mat_data[j+(threadIdx.x+i*THREADS)*stride];
-    }
-  }
-
-  int total_threads = THREADS;
-  __syncthreads();
-  while (total_threads > 1) {
-    int half_point = ((1+total_threads) >> 1);
-    if (threadIdx.x < half_point) {
-      if (threadIdx.x+half_point < total_threads) {
-        aux[threadIdx.x] += aux[threadIdx.x + half_point];
-      }
-    }
-    __syncthreads();
-    total_threads = ((total_threads+1) >> 1);
-  }
-
-  __syncthreads();
-  dst_vec_data[j] = aux[0];
-}
-
-__global__
-void kernel_add_vec_row(const float *src_vec_data, const float *src_mat_data,
-    float* des_mat_data, int rows, int cols, int stride) {
-  int i = blockIdx.x * blockDim.x + threadIdx.x;
-  int j = blockIdx.y * blockDim.y + threadIdx.y;
-  int num_threads_x = blockDim.x * gridDim.x;
-  int num_threads_y = blockDim.y * gridDim.y;
-  int index = 0;
-  for (; i < cols && j < rows; i += num_threads_x, j += num_threads_y) {
-    index = j * stride + i;
-    des_mat_data[index] = src_mat_data[index] + src_vec_data[i];
-  }
-}
-
-__global__
-void kernel_exp(const float *src_data, float *des_data, int n) {
-  int index = blockIdx.x * blockDim.x + threadIdx.x;
-  int num_threads = blockDim.x * gridDim.x;
-  for (; index < n; index += num_threads) {
-    des_data[index] = exp(src_data[index]);
-  }
-}
-
-__global__
-void kernel_log(const float *src_data, float *des_data, int n) {
-  int index = blockIdx.x * blockDim.x + threadIdx.x;
-  int num_threads = blockDim.x * gridDim.x;
-  for (; index < n; index += num_threads) {
-    des_data[index] = log(src_data[index]);
-  }
-}
-
-__global__
-void kernel_sigmoid(const float *src_data, float *des_data, int n) {
-  int index = blockIdx.x * blockDim.x + threadIdx.x;
-  int num_threads = blockDim.x * gridDim.x;
-  for (; index < n; index += num_threads) {
-    des_data[index] = 1.0f / (1.0f + expf(-src_data[index]));
-  }
-}
-
-__global__
-void kernel_sigmoid_grad(const float *src_data, float *des_data, int n) {
-  int index = blockIdx.x * blockDim.x + threadIdx.x;
-  int num_threads = blockDim.x * gridDim.x;
-  for (; index < n; index += num_threads) {
-    des_data[index] = src_data[index] * (1.0f - src_data[index]);
-  }
-}
-
-__global__
-void kernel_relu(const float *src_data, float *des_data, int n) {
-  int index = blockIdx.x * blockDim.x + threadIdx.x;
-  int num_threads = blockDim.x * gridDim.x;
-  for (; index < n; index += num_threads) {
-    des_data[index] = max(src_data[index], 0.0f);
-  }
-}
-
-__global__
-void kernel_relu_grad(const float *src_data, float *des_data, int n) {
-  int index = blockIdx.x * blockDim.x + threadIdx.x;
-  int num_threads = blockDim.x * gridDim.x;
-  for (; index < n; index += num_threads) {
-    des_data[index] = src_data[index] > 0.0f ? 1.0f : 0.0f;
-  }
-}
-
-__global__
-void kernel_tanh(const float *src_data, float *des_data, int n) {
-  int index = blockIdx.x * blockDim.x + threadIdx.x;
-  int num_threads = blockDim.x * gridDim.x;
-  for (; index < n; index += num_threads) {
-    des_data[index] = tanhf(src_data[index]);
-  }
-}
-
-__global__
-void kernel_tanh_grad(const float *src_data, float *des_data, int n) {
-  int index = blockIdx.x * blockDim.x + threadIdx.x;
-  int num_threads = blockDim.x * gridDim.x;
-  for (; index < n; index += num_threads) {
-    des_data[index] = (1.0f - src_data[index] * src_data[index]);
-  }
-}
-
-__global__
-void kernel_softplus(const float *src_data, float *des_data, int n) {
-  int index = blockIdx.x * blockDim.x + threadIdx.x;
-  int num_threads = blockDim.x * gridDim.x;
-  for (; index < n; index += num_threads) {
-    des_data[index] = logf(1 + expf(src_data[index]));
-  }
-}
-
-__global__
-void kernel_softplus_grad(const float *src_data, float *des_data, int n) {
-  int index = blockIdx.x * blockDim.x + threadIdx.x;
-  int num_threads = blockDim.x * gridDim.x;
-  for (; index < n; index += num_threads) {
-    des_data[index] = 1.0f / (1.0f + expf(-src_data[index]));
-  }
-}
-
-__global__
-void kernel_square(const float *src_data, float *des_data, int n) {
-  int index = blockIdx.x * blockDim.x + threadIdx.x;
-  int num_threads = blockDim.x * gridDim.x;
-  for (; index < n; index += num_threads) {
-    des_data[index] = src_data[index] * src_data[index];
-  }
-}
-
-__global__
-void kernel_square_grad(const float *src_data, float *des_data, int n) {
-  int index = blockIdx.x * blockDim.x + threadIdx.x;
-  int num_threads = blockDim.x * gridDim.x;
-  for (; index < n; index += num_threads) {
-    des_data[index] = 2 * sqrt(src_data[index]);
-  }
-}
-
-__global__
-void kernel_sqrt(const float *src_data, float *des_data, int n) {
-  int index = blockIdx.x * blockDim.x + threadIdx.x;
-  int num_threads = blockDim.x * gridDim.x;
-  for (; index < n; index += num_threads) {
-    des_data[index] = sqrt(src_data[index]);
-  }
-}
-
-__global__
-void kernel_pow(const float *src_data_a, const float *src_data_b,
-    float *des_data, int n) {
-  int index = blockIdx.x * blockDim.x + threadIdx.x;
-  int num_threads = blockDim.x * gridDim.x;
-  for (; index < n; index += num_threads) {
-    des_data[index] = pow(src_data_a[index], src_data_b[index]);
-  }
-}
-
-__global__
-void kernel_mult(const float *src_data_a, const float *src_data_b,
-    float *des_data, int n) {
-  int index = blockIdx.x * blockDim.x + threadIdx.x;
-  int num_threads = blockDim.x * gridDim.x;
-  for (; index < n; index += num_threads) {
-    des_data[index] = src_data_a[index] * src_data_b[index];
-  }
-}
-
-__global__
-void kernel_div(const float *src_data_a, const float *src_data_b,
-    float *des_data, int n) {
-  int index = blockIdx.x * blockDim.x + threadIdx.x;
-  int num_threads = blockDim.x * gridDim.x;
-  for (; index < n; index += num_threads) {
-    des_data[index] = src_data_a[index] / src_data_b[index];
-  }
-}
-
-__global__ static
-void kernel_set_value(float *data, float value, int n) {
-  int index = blockIdx.x * blockDim.x + threadIdx.x;
-  int num_threads = blockDim.x * gridDim.x;
-  for (; index < n; index += num_threads) {
-    data[index] = value;
-  }
-}
-
-__global__
-void kernel_threshold(const float *src_data, float *des_data,
-    float alpha, int n) {
-  int index = blockIdx.x * blockDim.x + threadIdx.x;
-  int num_threads = blockDim.x * gridDim.x;
-  for (; index < n; index += num_threads) {
-    des_data[index] = src_data[index] < alpha ? 1.0f : 0.0f;
-  }
-}
-
-//
-namespace singa {
-
-void singa_gpu_softmaxloss_forward(int n, int dim, const float *prob,
-    const int *label, float *loss) {
-  kernel_softmax_loss<<<ceil(n/CU1DBLOCKF), CU1DBLOCKF>>>(prob, label, loss, n,
-      dim);
-}
-
-void singa_gpu_softmaxloss_backward(int n, int dim, float scale,
-    const int *label, float *grad) {
-  kernel_softmax_gradient<<<ceil(n/CU1DBLOCKF), CU1DBLOCKF>>>(grad, label, n,
-      dim, scale);
-}
-
-void singa_gpu_sum_vec(float *data, float *sum , int n) {
-  int threads_per_block = n > CU1DBLOCK ? CU1DBLOCK : n;
-  //  here, we only need one block
-  int num_blocks = 1;
-
-  kernel_sum_vec<<<num_blocks, threads_per_block>>>(data, sum, n);
-}
-
-void singa_gpu_sum_row(const float *src_mat_data, float *dst_vec_data,
-    int rows, int cols, int stride) {
-  int threads_per_block = rows > CU1DBLOCK ? CU1DBLOCK : rows;
-  int num_blocks = cols;
-
-  kernel_sum_row<<<num_blocks, threads_per_block>>>(src_mat_data,
-      dst_vec_data, rows, cols, stride);
-}
-
-void singa_gpu_sum_col(const float *src_mat_data, float *dst_vec_data,
-    int rows, int cols, int stride) {
-  int threads_per_block = cols > CU1DBLOCK ? CU1DBLOCK : cols;
-  int num_blocks = rows;
-
-  kernel_sum_col<<<num_blocks, threads_per_block>>>(src_mat_data,
-      dst_vec_data, rows, cols, stride);
-}
-
-void singa_gpu_add_vec_row(const float *src_vec_data, const float *src_mat_data,
-    float *des_mat_data , int rows, int cols, int stride) {
-  dim3 threads_per_block(CU2DBLOCK_X, CU2DBLOCK_Y);
-  dim3 num_blocks(cols/threads_per_block.x +
-    (cols%threads_per_block.x == 0 ? 0 : 1),
-    rows/threads_per_block.y + (rows%threads_per_block.y == 0 ? 0 : 1));
-  kernel_add_vec_row<<<num_blocks, threads_per_block>>>
-    (src_vec_data, src_mat_data, des_mat_data, rows, cols, stride);
-}
-
-void singa_gpu_exp(const float *src_data, float *des_data, int n) {
-  kernel_exp<<<ceil(n/CU1DBLOCKF), CU1DBLOCKF>>>(src_data, des_data, n);
-}
-
-void singa_gpu_log(const float *src_data, float *des_data, int n) {
-  kernel_log<<<ceil(n/CU1DBLOCKF), CU1DBLOCKF>>>(src_data, des_data, n);
-}
-
-void singa_gpu_sigmoid(const float *src_data, float *des_data, int n) {
-  kernel_sigmoid<<<ceil(n/CU1DBLOCKF), CU1DBLOCKF>>>(src_data, des_data, n);
-}
-
-void singa_gpu_sigmoid_grad(const float *src_data, float *des_data,
-    int n) {
-  kernel_sigmoid_grad<<<ceil(n/CU1DBLOCKF), CU1DBLOCKF>>>
-    (src_data, des_data, n);
-}
-
-void singa_gpu_relu(const float *src_data, float *des_data, int n) {
-  kernel_relu<<<ceil(n/CU1DBLOCKF), CU1DBLOCKF>>>(src_data, des_data, n);
-}
-
-void singa_gpu_relu_grad(const float *src_data, float *des_data, int n) {
-  kernel_relu_grad<<<ceil(n/CU1DBLOCKF), CU1DBLOCKF>>>(src_data, des_data, n);
-}
-
-void singa_gpu_tanh(const float *src_data, float *des_data, int n) {
-  kernel_tanh<<<ceil(n/CU1DBLOCKF), CU1DBLOCKF>>>(src_data, des_data, n);
-}
-
-void singa_gpu_tanh_grad(const float *src_data, float *des_data, int n) {
-  kernel_tanh_grad<<<ceil(n/CU1DBLOCKF), CU1DBLOCKF>>>(src_data, des_data, n);
-}
-
-void singa_gpu_softplus(const float *src_data, float *des_data, int n) {
-  kernel_softplus<<<ceil(n/CU1DBLOCKF), CU1DBLOCKF>>>(src_data, des_data, n);
-}
-
-void singa_gpu_softplus_grad(const float *src_data, float *des_data, int n) {
-  kernel_softplus_grad<<<ceil(n/CU1DBLOCKF), CU1DBLOCKF>>>
-    (src_data, des_data, n);
-}
-
-void singa_gpu_square(const float *src_data, float *des_data, int n) {
-  kernel_square<<<ceil(n/CU1DBLOCKF), CU1DBLOCKF>>>(src_data, des_data, n);
-}
-
-void singa_gpu_square_grad(const float *src_data, float *des_data, int n) {
-  kernel_square_grad<<<ceil(n/CU1DBLOCKF), CU1DBLOCKF>>>(src_data, des_data, n);
-}
-
-void singa_gpu_sqrt(const float *src_data, float *des_data, int n) {
-  kernel_sqrt<<<ceil(n/CU1DBLOCKF), CU1DBLOCKF>>>(src_data, des_data, n);
-}
-
-void singa_gpu_pow(const float *src_data_a, const float *src_data_b,
-    float *des_data, int n) {
-  kernel_pow<<<ceil(n/CU1DBLOCKF), CU1DBLOCKF>>>
-    (src_data_a, src_data_b, des_data, n);
-}
-
-void singa_gpu_mult(const float *src_data_a, const float *src_data_b,
-    float *des_data, int n) {
-  kernel_mult<<<ceil(n/CU1DBLOCKF), CU1DBLOCKF>>>
-    (src_data_a, src_data_b, des_data, n);
-}
-
-void singa_gpu_div(const float *src_data_a, const float *src_data_b,
-    float *des_data, int n) {
-  kernel_div<<<ceil(n/CU1DBLOCKF), CU1DBLOCKF>>>
-    (src_data_a, src_data_b, des_data, n);
-}
-
-void singa_gpu_set_value(float *data, float value, int n) {
-  kernel_set_value<<<ceil(n/CU1DBLOCKF), CU1DBLOCKF>>>(data, value, n);
-}
-
-void singa_gpu_threshold(const float *src_data, float *des_data,
-    float alpha, int n) {
-  kernel_threshold<<<ceil(n/CU1DBLOCKF), CU1DBLOCKF>>>
-    (src_data, des_data, alpha, n);
-}
-
-}  // namespace singa
diff --git a/src/utils/opencl_utils.cc b/src/utils/opencl_utils.cc
new file mode 100644
index 0000000..e4fe69b
--- /dev/null
+++ b/src/utils/opencl_utils.cc
@@ -0,0 +1,63 @@
+/************************************************************
+*
+* Licensed to the Apache Software Foundation (ASF) under one
+* or more contributor license agreements.  See the NOTICE file
+* distributed with this work for additional information
+* regarding copyright ownership.  The ASF licenses this file
+* to you under the Apache License, Version 2.0 (the
+* "License"); you may not use this file except in compliance
+* with the License.  You may obtain a copy of the License at
+*
+*   http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing,
+* software distributed under the License is distributed on an
+* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+* KIND, either express or implied.  See the License for the
+* specific language governing permissions and limitations
+* under the License.
+*
+*************************************************************/
+
+#include "singa/utils/opencl_utils.h"
+
+#ifdef USE_OPENCL
+
+void PrintDeviceInfo(const cl::Device &dev) {
+  cl_int status = CL_SUCCESS;
+
+  LOG(INFO) << "\tDevice type: " << dev.getInfo<CL_DEVICE_TYPE>(&status);
+  LOG(INFO) << "\tUnified memory: " << dev.getInfo<CL_DEVICE_HOST_UNIFIED_MEMORY>(&status);
+  LOG(INFO) << "\tClock speed (MHz): " << dev.getInfo<CL_DEVICE_MAX_CLOCK_FREQUENCY>(&status);
+  LOG(INFO) << "\tECC memory: " << dev.getInfo<CL_DEVICE_ERROR_CORRECTION_SUPPORT>(&status);
+  LOG(INFO) << "\tLittle endian: " << dev.getInfo<CL_DEVICE_ENDIAN_LITTLE>(&status);
+  LOG(INFO) << "\tCompute units: " << dev.getInfo<CL_DEVICE_MAX_COMPUTE_UNITS>(&status);
+  LOG(INFO) << "\tMax work grp size: " << dev.getInfo<CL_DEVICE_MAX_WORK_GROUP_SIZE>(&status);
+//LOG(INFO) << "\tMax work item size: " << dev.getInfo<CL_DEVICE_MAX_WORK_ITEM_SIZES>(&status);
+  LOG(INFO) << "\tMax item dimension: " << dev.getInfo<CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS>(&status);
+  LOG(INFO) << "\tQueue properties: " << dev.getInfo<CL_DEVICE_QUEUE_PROPERTIES>(&status);
+  LOG(INFO) << "\tExecution capabilities: " << dev.getInfo<CL_DEVICE_EXECUTION_CAPABILITIES>(&status);
+  LOG(INFO) << "\tMax mem alloc size: " << dev.getInfo<CL_DEVICE_MAX_MEM_ALLOC_SIZE>(&status);
+  LOG(INFO) << "\tGlobal mem size: " << dev.getInfo<CL_DEVICE_GLOBAL_MEM_SIZE>(&status);
+  LOG(INFO) << "\tLocal mem size: " << dev.getInfo<CL_DEVICE_LOCAL_MEM_SIZE>(&status);
+  LOG(INFO) << "\n";
+
+  OCL_CHECK(status, "Failed to retrieve device information!");
+}
+
+
+void PrintPlatformInfo(const cl::Platform &p) {
+  cl_int status = CL_SUCCESS;
+
+  LOG(INFO) << "\tName: 	 " << p.getInfo<CL_PLATFORM_NAME>(&status);
+  LOG(INFO) << "\tProfile: " << p.getInfo<CL_PLATFORM_PROFILE>(&status);
+  LOG(INFO) << "\tVersion: " << p.getInfo<CL_PLATFORM_VERSION>(&status);
+  LOG(INFO) << "\tVendor:  " << p.getInfo<CL_PLATFORM_VENDOR>(&status);
+  LOG(INFO) << "\tExtensions: " << p.getInfo<CL_PLATFORM_EXTENSIONS>(&status);
+  LOG(INFO) << "\n";
+
+  OCL_CHECK(status, "Failed to retrieve platform information!");
+}
+
+
+#endif // USE_OPENCL
diff --git a/src/utils/param.cc b/src/utils/param.cc
deleted file mode 100644
index 73d8314..0000000
--- a/src/utils/param.cc
+++ /dev/null
@@ -1,447 +0,0 @@
-/************************************************************
-*
-* Licensed to the Apache Software Foundation (ASF) under one
-* or more contributor license agreements.  See the NOTICE file
-* distributed with this work for additional information
-* regarding copyright ownership.  The ASF licenses this file
-* to you under the Apache License, Version 2.0 (the
-* "License"); you may not use this file except in compliance
-* with the License.  You may obtain a copy of the License at
-*
-*   http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an
-* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-* KIND, either express or implied.  See the License for the
-* specific language governing permissions and limitations
-* under the License.
-*
-*************************************************************/
-
-#include "singa/utils/param.h"
-
-#include <glog/logging.h>
-#include <cmath>
-#include <random>
-#include <unordered_map>
-#include "mshadow/tensor.h"
-#include "singa/utils/factory.h"
-#include "singa/utils/singleton.h"
-#include "singa/utils/common.h"
-
-namespace singa {
-
-using mshadow::cpu;
-using mshadow::Random;
-using mshadow::Shape1;
-using mshadow::Tensor;
-using std::vector;
-using std::string;
-
-ParamGenerator* ParamGenerator::Create(const ParamGenProto& proto) {
-  auto factory = Singleton<Factory<ParamGenerator>>::Instance();
-  ParamGenerator * gen = nullptr;
-  if (proto.has_user_type())
-    gen = factory->Create(proto.user_type());
-  else
-    gen = factory->Create(proto.type());
-  gen->Init(proto);
-  return gen;
-}
-
-void ParamGenerator::Fill(Blob<float>* blob) {
-  Tensor<cpu, 1> data(blob->mutable_cpu_data(), Shape1(blob->count()));
-  data = proto_.value();
-}
-
-void GaussianGen::Fill(Blob<float>* blob) {
-  Tensor<cpu, 1> data(blob->mutable_cpu_data(), Shape1(blob->count()));
-  auto random = TSingleton<Random<cpu>>::Instance();
-  random->SampleGaussian(data, proto_.mean(), proto_.std());
-  if (proto_.value() != 1)
-    data *= proto_.value();
-}
-
-void GaussianSqrtFanInGen::Fill(Blob<float>* blob) {
-  // only valid for param matrix with num of cols as fan in
-  CHECK_EQ(blob->shape().size(), 2);
-  Tensor<cpu, 1> data(blob->mutable_cpu_data(), Shape1(blob->count()));
-  GaussianGen::Fill(blob);
-  data /= sqrt(blob->shape().at(1));
-}
-
-void UniformGen::Fill(Blob<float>* blob) {
-  Tensor<cpu, 1> data(blob->mutable_cpu_data(), Shape1(blob->count()));
-  auto random = TSingleton<Random<cpu>>::Instance();
-  random->SampleUniform(data, proto_.low(), proto_.high());
-  if (proto_.value() != 1)
-    data *= proto_.value();
-}
-
-void UniformSqrtFanInGen::Fill(Blob<float>* blob) {
-  // only valid for param matrix with num of cols as fan in
-  CHECK_EQ(blob->shape().size(), 2);
-  Tensor<cpu, 1> data(blob->mutable_cpu_data(), Shape1(blob->count()));
-  UniformGen::Fill(blob);
-  data /= sqrt(blob->shape().at(1) / 3.0f);
-}
-
-void UniformSqrtFanInOutGen::Fill(Blob<float>* blob) {
-  // only valid for param matrix with num of cols as fan in
-  CHECK_EQ(blob->shape().size(), 2);
-  Tensor<cpu, 1> data(blob->mutable_cpu_data(), Shape1(blob->count()));
-  UniformGen::Fill(blob);
-  data /= sqrt(blob->shape()[0] + blob->shape()[1]);
-}
-
-/****************** Param functions *********************************/
-Param* Param::Create(const ParamProto& proto) {
-  Factory<Param>* factory = Singleton<Factory<Param>>::Instance();
-  Param* p = nullptr;
-  if (proto.has_user_type())
-    p = factory->Create(proto.user_type());
-  else
-    p = factory->Create(proto.type());
-  p->Init(proto);
-  return p;
-}
-
-const vector<int> Param::ComputeSlices(int num, const vector<Param*>& params) {
-  // collect sizes of unique Params
-  std::vector<int> paramsize;
-  for (auto param : params)
-    if (param->id() == param->owner())
-      paramsize.push_back(param->size());
-  // slice into lcm pieces to achieve good load-balance for both intra-group
-  // partition (among servers in a group) and inter-group partition (each group
-  // is assgined a sub-set of slices)
-  auto param_slice = Slice(num, paramsize);
-  vector<int> slices;
-  for (auto const vec : param_slice)
-    for (int len : vec)
-      slices.push_back(len);
-  return slices;
-}
-
-void Param::SliceParams(int num, const vector<Param*>& params) {
-  auto slices = ComputeSlices(num, params);
-  // construct map from Param ID to its slices <slice id, len>
-  std::unordered_map<int, vector<std::pair<int, int>>> paramid2slices;
-  int slice_id = 0;
-  auto it = slices.begin();
-  for (auto param : params) {
-    if (param->id() == param->owner()) {
-      int len = 0;
-      while (len < param->size() && it != slices.end()) {
-        paramid2slices[param->id()].push_back(std::make_pair(slice_id++, *it));
-        len += *it;
-        it++;
-      }
-      CHECK_EQ(param->size(), len) << "length misamtch for ID=" << param->id();
-    }
-  }
-  for (auto param : params) {
-    for (auto entry : paramid2slices[param->owner()]) {
-      param->AddSlice(entry.first, entry.second);
-      LOG(INFO) << "param id " << param->id() << " owner=" << param->owner()
-        << ", slice id = " << entry.first << ", size = " << entry.second;
-    }
-  }
-}
-
-void Param::Setup(const vector<int>& shape) {
-  data_.Reshape(shape);
-  grad_.Reshape(shape);
-  history_.Reshape(shape);
-  update_.Reshape(shape);
-}
-
-void Param::InitValues() {
-  InitValues(0);
-}
-
-void Param::InitValues(int version) {
-  ParamGenerator* gen = ParamGenerator::Create(proto_.init());
-  gen->Fill(&data_);
-  set_version(version);
-}
-
-void Param::ShareDataFrom(Param* other, bool cpu_only) {
-  if (this == other) {
-    LOG(WARNING) << "No need to share Param with itself";
-    return;
-  }
-
-  proto_.set_owner(other->owner());
-  CHECK_EQ(data_.count(), other->data_.count());
-  data_.ShareData(&(other->data_), cpu_only);
-  if (grad_.count() == 0)
-    grad_.Reshape(data_.shape());
-  version_ = other->version_;
-  last_version_ = other->last_version_;
-  slice_start_ = other->slice_start_;
-  num_slices_ = other->num_slices_;
-  slice_offset_ = other->slice_offset_;
-  slice_size_ = other->slice_size_;
-  // change pending list size equal to slice size
-  pending_get_.resize(other->pending_get_.size());
-  pending_update_.resize(other->pending_update_.size());
-}
-
-void Param::ShareFrom(Param* other) {
-  if (this == other) {
-    LOG(WARNING) << "No need to share Param with itself";
-    return;
-  }
-
-  ShareDataFrom(other, false);
-  grad_.ShareData(&(other->grad_), false);
-}
-
-void Param::FromProto(const string str) {
-  BlobProto blob;
-  blob.ParseFromString(str);
-  data_.FromProto(blob);
-}
-
-void Param::FromProto(const BlobProto& blob) {
-  data_.FromProto(blob);
-}
-
-void Param::ToProto(BlobProto* blob) {
-  data_.ToProto(blob);
-}
-
-void Param::AddSlice(int slice_id, int size) {
-  int offset = 0;
-  if (slice_size_.size() > 0) {
-    // must be added in order
-    CHECK_EQ(slice_start_ + num_slices_, slice_id);
-    offset = slice_offset_.back() + slice_size_.back();
-  } else {
-    slice_start_ = slice_id;
-    offset = 0;
-  }
-  slice_offset_.push_back(offset);
-  slice_size_.push_back(size);
-  pending_get_.push_back(false);
-  pending_update_.push_back(false);
-  num_slices_++;
-}
-
-Msg* Param::GenPutMsg(bool copy, int idx) {
-  CHECK_LT(idx, num_slices_);
-  Msg* msg = new Msg();
-  msg->set_type(kPut);
-  const void* ptr = data_.cpu_data() + slice_offset_[idx];
-  const void* p = ptr;
-  if (copy) p = nullptr;
-  msg->AddFormatFrame("iffp", slice_size_[idx], lr_scale(), wd_scale(), p);
-  if (copy) {
-    msg->AddFrame(ptr, slice_size_[idx] * sizeof(float));
-  }
-//  LOG(ERROR) << "gen put msg: " << msg;
-  return msg;
-}
-
-Msg* Param::GenGetMsg(bool copy, int idx) {
-  CHECK_LT(idx, num_slices_);
-  Msg* msg = new Msg();
-  msg->set_type(kGet);
-  msg->AddFormatFrame("ip",  copy, data_.mutable_cpu_data()
-      + slice_offset_[idx]);
-  pending_get_[idx] = true;
-  num_pending_requests_++;
-  return msg;
-}
-
-Msg* Param::GenUpdateMsg(bool copy, int idx) {
-  CHECK_LT(idx, num_slices_);
-  Msg* msg = new Msg();
-  msg->set_type(kUpdate);
-  msg->AddFormatFrame("i", copy);
-  const void* ptr = grad_.cpu_data() + slice_offset_[idx];
-  if (copy) {
-    msg->AddFrame(ptr, slice_size_[idx]*sizeof(float));
-  } else {
-    msg->AddFormatFrame("p", ptr);  // to share values of grad blob
-  }
-
-  pending_update_[idx] = true;
-  num_pending_requests_++;
-  return msg;
-}
-
-Msg* Param::GenSyncMsg(int offset, int size) {
-  Msg* msg = new Msg();
-  msg->set_type(kSyncRequest);
-  msg->set_trgt(ParamTrgt(-1, id()), last_version());
-  // always copy data because syn is between server groups in diff procs
-  msg->AddFrame(mutable_cpu_data(), data_.count()*sizeof(float));
-  return msg;
-}
-
-Msg* Param::HandlePutMsg(Msg** msg, bool reserve) {
-  // TODO(wangsheng) remove the check later
-  CHECK(reserve);
-  int size;
-  float lr, wc;
-  float* ptr;
-//  LOG(ERROR) << "handle put msg:" << *msg;
-  (*msg)->ParseFormatFrame("iffp", &size, &lr, &wc, &ptr);
-  ParamProto proto;
-  proto.set_lr_scale(lr);
-  proto.set_wd_scale(wc);
-  vector<int> shape{size};
-  Init(proto);
-  Setup(shape);
-  if (ptr == nullptr) {
-    CHECK((*msg)->NextFrame());
-    CHECK_EQ(size * sizeof(float), (*msg)->FrameSize());
-    memcpy(mutable_cpu_data(), (*msg)->FrameData(), size * sizeof(float));
-  } else {
-    data_.set_cpu_data(ptr);
-  }
-  if (!reserve) DeleteMsg(msg);
-  return nullptr;
-}
-
-Msg* Param::HandleGetMsg(Msg** msg, bool reserve) {
-  // TODO(wangsheng) remove the check later
-  CHECK(!reserve);
-  int copy;
-  float* ptr;
-  (*msg)->ParseFormatFrame("ip", &copy, &ptr);
-  if (copy) {
-    (*msg)->AddFrame(mutable_cpu_data(), sizeof(float) * size());
-  } else if (ptr != data_.cpu_data()) {
-    // this case reflects following situation:
-    // worker 0 and server are in the same process, while worker 1 is not.
-    // worker 1 "put" data into server, so server need to allocate memory.
-    // then worker 0 "get" data from server, so server need:
-    //  1. copy the data to the worker0 provided space
-    //  2. change its own pointer to that space in order to share memory
-    // in this case, the server always points to last worker's space
-    memcpy(ptr, data_.cpu_data(), sizeof(float) * size());
-    data_.set_cpu_data(ptr);
-  }
-  // else the mem space is shared among all worker and servers
-  Msg* ret = nullptr;
-  if (reserve) {
-    ret = new Msg(**msg);
-  } else {
-    // if not reserve the msg, we reuse it as return value
-    ret = *msg;
-    *msg = nullptr;
-  }
-  ret->SwapAddr();
-  ret->set_type(kRGet);
-  return ret;
-}
-
-void Param::ParseUpdateMsgs(const vector<Msg*>& msgs) {
-  CHECK_GT(msgs.size(), 0);
-  float* server_grad = nullptr;
-  vector<float*> worker_grad;
-  for (auto* msg : msgs) {
-    int copy;
-    msg->ParseFormatFrame("i", &copy);
-    msg->NextFrame();
-    float* ptr = nullptr;
-    if (copy) {
-      ptr = static_cast<float*>(msg->FrameData());
-      CHECK_EQ(size() * sizeof(float), msg->FrameSize());
-    } else {
-      msg->ParseFormatFrame("p", &ptr);
-      server_grad = ptr;
-    }
-    worker_grad.push_back(ptr);
-  }
-  if (server_grad == nullptr)
-    server_grad = worker_grad.at(0);
-  for (float* grad : worker_grad) {
-    if (grad != server_grad) {
-      // TODO(wangsh) think about optimize it later?
-      for (int i = 0; i < size(); i++) {
-        server_grad[i] += grad[i];
-      }
-    }
-  }
-  grad_.set_cpu_data(server_grad);
-}
-
-const vector<Msg*> Param::GenUpdateResponseMsgs(vector<Msg*>* msgs,
-                                                bool reserve) {
-  // TODO(wangsheng) remove the check later
-  CHECK(!reserve);
-  vector<Msg*> ret;
-  for (Msg* msg : *msgs) {
-    Msg* ptr = reserve ? new Msg(*msg) : msg;
-    ptr->FirstFrame();
-    ptr->SwapAddr();
-    ptr->set_type(kRUpdate);
-    int copy;
-    ptr->ParseFormatFrame("i", &copy);
-    if (copy) {
-      ptr->NextFrame();
-      CHECK_EQ(ptr->FrameSize(), sizeof(float) * size());
-      memcpy(ptr->FrameData(), mutable_cpu_data(), ptr->FrameSize());
-    }
-    ret.push_back(ptr);
-  }
-  // if not reserved, we remove all pointers
-  if (!reserve) msgs->clear();
-  return ret;
-}
-
-Msg* Param::HandleSyncMsg(Msg** msg, bool reserve) {
-  // TODO(wangwei) handle it later
-  if (!reserve) DeleteMsg(msg);
-  return nullptr;
-}
-
-int Param::ParseGetResponseMsg(Msg *msg, int slice_idx) {
-  CHECK(pending_get_[slice_idx]) << slice_idx;
-  pending_get_[slice_idx] = false;
-  ParseResponseMsg(msg, slice_idx);
-  return (--num_pending_requests_) % num_slices_ == 0;
-}
-
-int Param::ParseUpdateResponseMsg(Msg *msg, int slice_idx) {
-  CHECK(pending_update_[slice_idx]) << id() << " " << slice_idx;
-  pending_update_[slice_idx] = false;
-  ParseResponseMsg(msg, slice_idx);
-  return (--num_pending_requests_) % num_slices_ == 0;
-}
-
-int Param::ParseSyncResponseMsg(Msg* msg, int slice_idx) {
-  // TODO(wangwei) handle it later
-  return 1;
-}
-
-void Param::ParseResponseMsg(Msg* msg, int slice_idx) {
-  int copy;
-  msg->ParseFormatFrame("i", &copy);
-  msg->NextFrame();
-  if (copy) {
-    CHECK_EQ(msg->FrameSize(), slice_size_[slice_idx] * sizeof(float));
-    memcpy(mutable_cpu_data() + slice_offset_[slice_idx],
-        msg->FrameData(), msg->FrameSize());
-  }
-  // LOG(ERROR)<<"parse response norm "<<data_->asum_data()<<" of "<<id();
-}
-
-/************************ParamEntry***************************/
-ParamEntry::ParamEntry(int total, Param* p) {
-  num_total = total;
-  shares.push_back(p);
-}
-
-void ParamEntry::AddParam(bool local, Param* p) {
-  num_local += local;
-  num_total += 1;
-  if (local) shares.push_back(p);
-}
-
-}  // namespace singa
diff --git a/src/utils/tool.cc b/src/utils/tool.cc
deleted file mode 100644
index 3b1df72..0000000
--- a/src/utils/tool.cc
+++ /dev/null
@@ -1,169 +0,0 @@
-/************************************************************
-*
-* Licensed to the Apache Software Foundation (ASF) under one
-* or more contributor license agreements.  See the NOTICE file
-* distributed with this work for additional information
-* regarding copyright ownership.  The ASF licenses this file
-* to you under the Apache License, Version 2.0 (the
-* "License"); you may not use this file except in compliance
-* with the License.  You may obtain a copy of the License at
-* 
-*   http://www.apache.org/licenses/LICENSE-2.0
-* 
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an
-* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-* KIND, either express or implied.  See the License for the
-* specific language governing permissions and limitations
-* under the License.
-*
-*************************************************************/
-
-#include <glog/logging.h>
-#include <algorithm>
-#include <string>
-#include <vector>
-#include "singa/proto/singa.pb.h"
-#include "singa/utils/common.h"
-#include "singa/utils/job_manager.h"
-
-std::string conf_dir;
-singa::SingaProto global;
-const int SUCCESS = 0;
-const int ARG_ERR = 1;
-const int RUN_ERR = 2;
-
-// show log dir in global config
-int getlogdir() {
-  std::string dir = global.log_dir();
-  while (dir.length() > 1 && dir[dir.length()-1] == '/') dir.pop_back();
-  printf("%s\n", dir.c_str());
-  return SUCCESS;
-}
-
-// generate a unique job id
-int create() {
-  singa::JobManager mngr(global.zookeeper_host());
-  if (!mngr.Init()) return RUN_ERR;
-  int id;
-  if (!mngr.GenerateJobID(&id)) return RUN_ERR;
-  printf("%d\n", id);
-  return SUCCESS;
-}
-
-// generate a host list
-int genhost(char* job_conf) {
-  singa::JobManager mngr(global.zookeeper_host());
-  if (!mngr.Init()) return RUN_ERR;
-  std::vector<std::string> list;
-  if (!mngr.GenerateHostList((conf_dir+"/hostfile").c_str(), job_conf, &list))
-    return RUN_ERR;
-  // output selected hosts
-  for (std::string host : list)
-    printf("%s\n", host.c_str());
-  return SUCCESS;
-}
-
-// list singa jobs (running or all)
-int list(bool all) {
-  singa::JobManager mngr(global.zookeeper_host());
-  if (!mngr.Init()) return RUN_ERR;
-  std::vector<singa::JobInfo> jobs;
-  if (!mngr.ListJobs(&jobs)) return RUN_ERR;
-  printf("JOB ID    |NUM PROCS  \n");
-  printf("----------|-----------\n");
-  for (singa::JobInfo job : jobs) {
-    if (!job.procs && !all) continue;
-    printf("%-10d|%-10d\n", job.id, job.procs);
-  }
-  return SUCCESS;
-}
-
-// view procs of a singa job
-int view(int id) {
-  singa::JobManager mngr(global.zookeeper_host());
-  if (!mngr.Init()) return RUN_ERR;
-  std::vector<std::string> procs;
-  if (!mngr.ListJobProcs(id, &procs)) return RUN_ERR;
-  for (std::string s : procs) {
-    printf("%s\n", s.c_str());
-  }
-  return SUCCESS;
-}
-
-// remove a job path in zookeeper
-int remove(int id) {
-  singa::JobManager mngr(global.zookeeper_host());
-  if (!mngr.Init()) return RUN_ERR;
-  if (!mngr.Remove(id)) return RUN_ERR;
-  return SUCCESS;
-}
-
-// remove all job paths in zookeeper
-int removeall() {
-  singa::JobManager mngr(global.zookeeper_host());
-  if (!mngr.Init()) return RUN_ERR;
-  if (!mngr.RemoveAllJobs()) return RUN_ERR;
-  return SUCCESS;
-}
-
-// clean all singa data in zookeeper
-int cleanup() {
-  singa::JobManager mngr(global.zookeeper_host());
-  if (!mngr.Init()) return RUN_ERR;
-  if (!mngr.CleanUp()) return RUN_ERR;
-  return SUCCESS;
-}
-
-int main(int argc, char **argv) {
-  std::string usage = "Usage: singatool <command> <args>\n"
-      " getlogdir          :  show log dir in global config\n"
-      " create             :  generate a unique job id\n"
-      " genhost <job conf> :  generate a host list\n"
-      " list               :  list running singa jobs\n"
-      " listall            :  list all singa jobs\n"
-      " view <job id>      :  view procs of a singa job\n"
-      " remove <job id>    :  remove a job path in zookeeper\n"
-      " removeall          :  remova all job paths in zookeeper\n"
-      " cleanup            :  clean all singa data in zookeeper\n"
-      "[optional arguments] NOTICE: must put at end of a command\n"
-      " -confdir <dir>     :  path to singa global conf dir";
-
-  // set logging level to ERROR and log to STDERR only
-  google::LogToStderr();
-  google::SetStderrLogging(google::ERROR);
-  google::InitGoogleLogging(argv[0]);
-  // parse -confdir argument
-  int arg_pos = singa::ArgPos(argc, argv, "-confdir");
-  conf_dir = arg_pos == -1 ? "conf" : argv[arg_pos+1];
-  if (arg_pos != -1) argc -= 2;
-  singa::ReadProtoFromTextFile((conf_dir+"/singa.conf").c_str(), &global);
-
-  // stat code: ARG_ERR for wrong argument, RUN_ERR for runtime error
-  int stat = (argc <= 1) ? ARG_ERR : SUCCESS;
-  if (stat == SUCCESS) {
-    if (!strcmp(argv[1], "getlogdir"))
-      stat = getlogdir();
-    else if (!strcmp(argv[1], "create"))
-      stat = create();
-    else if (!strcmp(argv[1], "genhost"))
-      stat = (argc > 2) ? genhost(argv[2]) : genhost(nullptr);
-    else if (!strcmp(argv[1], "list"))
-      stat = list(false);
-    else if (!strcmp(argv[1], "listall"))
-      stat = list(true);
-    else if (!strcmp(argv[1], "view"))
-      stat = (argc > 2) ? view(atoi(argv[2])) : ARG_ERR;
-    else if (!strcmp(argv[1], "remove"))
-      stat = (argc > 2) ? remove(atoi(argv[2])) : ARG_ERR;
-    else if (!strcmp(argv[1], "removeall"))
-      stat = removeall();
-    else if (!strcmp(argv[1], "cleanup"))
-      stat = cleanup();
-    else
-      stat = ARG_ERR;
-  }
-
-  if (stat == ARG_ERR) LOG(ERROR) << usage;
-  return stat;
-}
diff --git a/src/utils/updater.cc b/src/utils/updater.cc
deleted file mode 100644
index a2180d3..0000000
--- a/src/utils/updater.cc
+++ /dev/null
@@ -1,284 +0,0 @@
-/************************************************************
-*
-* Licensed to the Apache Software Foundation (ASF) under one
-* or more contributor license agreements.  See the NOTICE file
-* distributed with this work for additional information
-* regarding copyright ownership.  The ASF licenses this file
-* to you under the Apache License, Version 2.0 (the
-* "License"); you may not use this file except in compliance
-* with the License.  You may obtain a copy of the License at
-*
-*   http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an
-* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-* KIND, either express or implied.  See the License for the
-* specific language governing permissions and limitations
-* under the License.
-*
-*************************************************************/
-
-#include "singa/utils/updater.h"
-
-#include "mshadow/cxxnet_op.h"
-#include "mshadow/tensor.h"
-#include "singa/utils/singleton.h"
-#include "singa/utils/factory.h"
-
-namespace singa {
-
-using mshadow::cpu;
-using mshadow::expr::F;
-using mshadow::op::sqrtop;
-using mshadow::op::square;
-using mshadow::Shape;
-using mshadow::Shape1;
-using mshadow::Tensor;
-using mshadow::TensorContainer;
-
-LRGenerator* LRGenerator::Create(const LRGenProto& proto) {
-  auto factory = Singleton<Factory<LRGenerator>>::Instance();
-  LRGenerator* gen = nullptr;
-  if (proto.has_user_type())
-    gen = factory->Create(proto.user_type());
-  else
-    gen = factory->Create(proto.type());
-  gen->Init(proto);
-  return gen;
-}
-
-float FixedStepLRGen::Get(int step) {
-  if (last_idx_ < proto_.fixedstep_conf().step_size() - 1
-      && step >= proto_.fixedstep_conf().step(last_idx_ + 1)) {
-      last_idx_++;
-    }
-  return proto_.fixedstep_conf().step_lr(last_idx_);
-}
-
-float StepLRGen::Get(int step) {
-  // do not cast int to float
-  int freq = proto_.step_conf().change_freq();
-  float lr = proto_.base_lr() * pow(proto_.step_conf().gamma(), step / freq);
-  // LOG_IF(INFO, step % freq == 0) << "Update learning rate to " << lr
-  //   << " @ step " << step;
-  return lr;
-}
-
-float LinearLRGen::Get(int step) {
-  int freq = proto_.linear_conf().change_freq();
-  float r = step * 1.0 / freq;
-  return (1.0 - r) * proto_.base_lr() + r * proto_.linear_conf().final_lr();
-}
-
-float ExpLRGen::Get(int step) {
-  int freq = proto_.exponential_conf().change_freq();
-  return proto_.base_lr() / pow(2, step * 1. / freq);
-}
-
-float InvLRGen::Get(int step) {
-  return proto_.base_lr() * pow(1.f + proto_.inverse_conf().gamma() * step,
-           - proto_.inverse_conf().pow());
-}
-
-float InvTLRGen::Get(int step) {
-  return proto_.base_lr() / (1 + step * 1. / proto_.inverset_conf().final_lr());
-}
-
-Updater* Updater::Create(const UpdaterProto& proto) {
-  auto factory = Singleton<Factory<Updater>>::Instance();
-  Updater* updater = nullptr;
-  if (proto.has_user_type())
-    updater = factory->Create(proto.user_type());
-  else
-    updater = factory->Create(proto.type());
-  updater->Init(proto);
-  return updater;
-}
-
-/**************** added for Python Binding ***************************/
-Updater* Updater::CreateUpdater(const string str) {
-  UpdaterProto conf;
-  conf.ParseFromString(str);
-  return Updater::Create(conf);
-}
-/***********************Python Binding end**************************/
-
-
-/***********************SGD with momentum******************************/
-void Updater::Init(const UpdaterProto& proto) {
-  momentum_ = proto.momentum();
-  weight_decay_ = proto.weight_decay();
-  lr_gen_ = LRGenerator::Create(proto.learning_rate());
-  clip_low_ = proto.clip_low();
-  clip_high_ = proto.clip_high();
-}
-
-void Updater::Clip(const float low, const float high, Param* param) {
-  Blob<float>* grad = param->mutable_grad();
-  float* ptr = grad->mutable_cpu_data();
-  for (int i = 0; i < grad->count(); i++) {
-    if (ptr[i] > high)
-      ptr[i] = high;
-    else if (ptr[i] < low)
-      ptr[i] = low;
-  }
-}
-
-void SGDUpdater::Update(int step, Param* param, float grad_scale) {
-  if (clip_high_ > clip_low_)
-    Clip(clip_low_, clip_high_, param);
-  Shape<1> s = Shape1(param->size());
-  Tensor<cpu, 1> data(param->mutable_cpu_data(), s);
-  Tensor<cpu, 1> grad(param->mutable_cpu_grad(), s);
-  float lr = lr_gen_->Get(step) * param->lr_scale();
-  float wd = weight_decay_ * param->wd_scale();
-  grad *= grad_scale;
-  if (wd > 0)  // L2 regularization, should be done after timing grad_scale
-    grad += data * wd;
-  if (momentum_ > 0) {
-    Tensor<cpu, 1> history(param->mutable_cpu_history(), s);
-    history = history * momentum_ - lr * grad;
-    data += history;
-  } else {
-    grad *= -lr;
-    data += grad;
-  }
-}
-
-/***********************Nesterov******************************/
-void NesterovUpdater::Update(int step, Param* param, float grad_scale) {
- if (clip_high_ > clip_low_)
-    Clip(clip_low_, clip_high_, param);
-
-  Shape<1> s = Shape1(param->size());
-  Tensor<cpu, 1> data(param->mutable_cpu_data(), s);
-  Tensor<cpu, 1> grad(param->mutable_cpu_grad(), s);
-  Tensor<cpu, 1> history(param->mutable_cpu_history(), s);
-  TensorContainer<cpu, 1> tmp(s);
-  float lr = lr_gen_->Get(step)*param->lr_scale();
-  float wd = weight_decay_*param->wd_scale();
-  grad *= grad_scale;
-  if (wd > 0)  // L2 regularization, should be done after timing grad_scale
-    grad += data * wd;
-  Copy(tmp, history);
-  history = history * momentum_ + lr * grad;
-  tmp = history * (1 + momentum_) - tmp * momentum_;
-  data -= tmp;
-}
-/***********************AdaGrad******************************/
-void AdaGradUpdater::Update(int step, Param* param, float grad_scale) {
-  if (clip_high_ > clip_low_)
-    Clip(clip_low_, clip_high_, param);
-  Shape<1> s = Shape1(param->size());
-  Tensor<cpu, 1> data(param->mutable_cpu_data(), s);
-  Tensor<cpu, 1> grad(param->mutable_cpu_grad(), s);
-  Tensor<cpu, 1> history(param->mutable_cpu_history(), s);
-  float lr = lr_gen_->Get(step)*param->lr_scale();
-  float wd = weight_decay_*param->wd_scale();
-  grad *= grad_scale;
-  if (wd > 0)  //  L2 regularization, should be done after timing grad_scale
-    grad += data * wd;
-  history += F<square>(grad);
-  data -= lr * grad / (F<sqrtop>(history, proto_.delta()));
-}
-
-/***********************RMSProp******************************/
-void RMSPropUpdater::Init(const UpdaterProto& proto) {
-  Updater::Init(proto);
-  rho_ = proto.rmsprop_conf().rho();
-  delta_ = proto.delta();
-}
-
-void RMSPropUpdater::Update(int step, Param* param, float grad_scale) {
- if (clip_high_ > clip_low_)
-    Clip(clip_low_, clip_high_, param);
-
-  Shape<1> s=Shape1(param->size());
-  Tensor<cpu, 1> data(param->mutable_cpu_data(), s);
-  Tensor<cpu, 1> grad(param->mutable_cpu_grad(), s);
-  Tensor<cpu, 1> history(param->mutable_cpu_history(), s);
-  float lr = lr_gen_->Get(step) * param->lr_scale();
-  float wd = weight_decay_ * param->wd_scale();
-  grad *= grad_scale;
-  if (wd > 0)  //  L2 regularization, should be done after timing grad_scale
-    grad += data * wd;
-  history = history * rho_ + (1 - rho_) * F<square>(grad);
-  data -= lr * grad / F<sqrtop>(history, delta_);
-}
-/***********************AdaDelta******************************/
-void AdaDeltaUpdater::Init(const UpdaterProto& proto){
-  Updater::Init(proto);
-  delta_ = proto.delta();
-  rho_=proto.adadelta_conf().rho();
-}
-
-void AdaDeltaUpdater::Update(int step, Param* param, float grad_scale){
-  Shape<1> s=Shape1(param->size());
-  Tensor<cpu, 1> data(param->mutable_cpu_data(), s);
-  Tensor<cpu, 1> grad(param->mutable_cpu_grad(), s);
-  Tensor<cpu, 1> history(param->mutable_cpu_history(), s);
-  Tensor<cpu, 1> update(param->mutable_cpu_update(), s);
-  TensorContainer<cpu, 1> tmp(s);
-  float wd = weight_decay_*param->wd_scale();
-  float lr = lr_gen_->Get(step) * param->lr_scale();
-  grad *= grad_scale;
-  if (wd > 0)  //  L2 regularization, should be done after timing grad_scale
-    grad += data * wd;
-  history = history * rho_ + (1 - rho_) * F<op::square>(grad);
-  tmp = grad * F<op::sqrtop>(update, delta_) / F<op::sqrtop>(history, delta_);
-  update = rho_ * update + (1 - rho_) * F<op::square>(tmp);
-  data -= lr * tmp;
-}
-
-/***********************Adam******************************/
-void AdamUpdater::Init(const UpdaterProto &proto) {
-  Updater::Init(proto);
-  beta1_=proto.adam_conf().beta1();
-  beta2_=proto.adam_conf().beta2();
-  delta_ = proto.delta();
-}
-
-void AdamUpdater::Update(int step, Param* param, float grad_scale) {
-  Shape<1> s=Shape1(param->size());
-  Tensor<cpu, 1> data(param->mutable_cpu_data(), s);
-  Tensor<cpu, 1> grad(param->mutable_cpu_grad(), s);
-  Tensor<cpu, 1> history(param->mutable_cpu_history(), s);
-  Tensor<cpu, 1> update(param->mutable_cpu_update(), s);
-  float wd = weight_decay_*param->wd_scale();
-  float lr = lr_gen_->Get(step) * param->lr_scale();
-  grad *= grad_scale;
-  if (wd > 0)  //  L2 regularization, should be done after timing grad_scale
-    grad += data * wd;
-  history = history * beta1_ + (1 - beta1_) * grad;
-  update = update * beta2_ + (1 - beta2_) * F<op::square>(grad);
-  data -= lr * history / F<op::sqrtop>(update, delta_);
-}
-
-/***********************AdamMax******************************/
-void AdamMaxUpdater::Init(const UpdaterProto &proto) {
-  Updater::Init(proto);
-  beta1_=proto.adammax_conf().beta1();
-  beta2_=proto.adammax_conf().beta2();
-  delta_=proto.delta();
-}
-
-void AdamMaxUpdater::Update(int step, Param* param, float grad_scale) {
-  Shape<1> s=Shape1(param->size());
-  Tensor<cpu, 1> data(param->mutable_cpu_data(), s);
-  Tensor<cpu, 1> grad(param->mutable_cpu_grad(), s);
-  Tensor<cpu, 1> history(param->mutable_cpu_history(), s);
-  Tensor<cpu, 1> update(param->mutable_cpu_update(), s);
-  float wd = weight_decay_*param->wd_scale();
-  float lr = lr_gen_->Get(step) * param->lr_scale();
-  grad *= grad_scale;
-  if (wd > 0)  //  L2 regularization, should be done after timing grad_scale
-    grad += data * wd;
-  history = history * beta1_ + (1 - beta1_) * grad;
-  update = update * beta2_;
-  grad = F<op::abs>(grad);
-  update = F<op::max>(update, grad) + delta_;
-  data -= lr * history / update;
-}
-
-}  // namespace singa
diff --git a/src/utils/zk_service.cc b/src/utils/zk_service.cc
deleted file mode 100644
index 352f6f7..0000000
--- a/src/utils/zk_service.cc
+++ /dev/null
@@ -1,326 +0,0 @@
-/************************************************************
-*
-* Licensed to the Apache Software Foundation (ASF) under one
-* or more contributor license agreements.  See the NOTICE file
-* distributed with this work for additional information
-* regarding copyright ownership.  The ASF licenses this file
-* to you under the Apache License, Version 2.0 (the
-* "License"); you may not use this file except in compliance
-* with the License.  You may obtain a copy of the License at
-* 
-*   http://www.apache.org/licenses/LICENSE-2.0
-* 
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an
-* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-* KIND, either express or implied.  See the License for the
-* specific language governing permissions and limitations
-* under the License.
-*
-*************************************************************/
-
-#include "singa/utils/zk_service.h"
-
-#include <glog/logging.h>
-#include <algorithm>
-
-using std::string;
-using std::to_string;
-using std::vector;
-
-namespace singa {
-
-void ZKService::ChildChanges(zhandle_t *zh, int type, int state,
-                               const char *path, void *watcherCtx) {
-  // check if already callback
-  RTCallback *cb = static_cast<RTCallback*>(watcherCtx);
-  if (cb->fn == nullptr) return;
-  if (type == ZOO_CHILD_EVENT) {
-    struct String_vector child;
-    // check the child list and put another watcher
-    int ret = zoo_wget_children(zh, path, ChildChanges, watcherCtx, &child);
-    if (ret == ZOK) {
-      if (child.count == 0) {
-        LOG(INFO) << "child.count = 0 in path: " << path;
-        // all workers leave, we do callback now
-        (*cb->fn)(cb->ctx);
-        cb->fn = nullptr;
-      }
-    } else {
-      LOG(FATAL) << "Unhandled ZK error code: " << ret
-                 << " (zoo_wget_children " << path << ")";
-    }
-  } else {
-    LOG(FATAL) << "Unhandled callback type code: "<< type;
-  }
-}
-
-ZKService::~ZKService() {
-  // close zookeeper handler
-  zookeeper_close(zkhandle_);
-}
-
-char zk_cxt[] = "ZKClusterRT";
-
-bool ZKService::Init(const string& host, int timeout) {
-  zoo_set_debug_level(ZOO_LOG_LEVEL_ERROR);
-  zkhandle_ = zookeeper_init(host.c_str(), WatcherGlobal, timeout, 0,
-                             static_cast<void *>(zk_cxt), 0);
-  if (zkhandle_ == NULL) {
-    LOG(ERROR) << "Error when connecting to zookeeper servers...";
-    LOG(ERROR) << "Please ensure zookeeper service is up in host(s):";
-    LOG(ERROR) << host.c_str();
-    return false;
-  }
-
-  return true;
-}
-
-bool ZKService::CreateNode(const char* path, const char* val, int flag,
-                               char* output) {
-  CHECK(zkhandle_) << "zk handler not initialized";
-  char buf[kZKBufSize];
-  int ret = 0;
-  // send the zk request
-  for (int i = 0; i < kNumRetry; ++i) {
-    ret = zoo_create(zkhandle_, path, val, val == nullptr ? -1 : strlen(val),
-                     &ZOO_OPEN_ACL_UNSAFE, flag, buf, kZKBufSize);
-    if (ret == ZNONODE) {
-      LOG(WARNING) << "zookeeper parent node of " << path
-                  << " not exist, retry later";
-    } else if (ret == ZCONNECTIONLOSS) {
-      LOG(WARNING) << "zookeeper disconnected, retry later";
-    } else {
-      break;
-    }
-    sleep(kSleepSec);
-  }
-  // copy the node name to output
-  if (output != nullptr && (ret == ZOK || ret == ZNODEEXISTS)) {
-    snprintf(output, kZKBufSize, "%s", buf);
-    // use snprintf instead of strcpy
-    // strcpy(output, buf);
-  }
-  if (ret == ZOK) {
-    LOG(INFO) << "created zookeeper node " << buf
-              << " (" << (val == nullptr ? "NULL" : val) << ")";
-    return true;
-  } else if (ret == ZNODEEXISTS) {
-    LOG(WARNING) << "zookeeper node " << path << " already exists";
-    return true;
-  } else if (ret == ZCONNECTIONLOSS) {
-    LOG(ERROR) << "Cannot connect to zookeeper, "
-               << "please ensure it is running properly...\n"
-               << "If want to use zookeeper in our thirdparty folder, "
-               << "you can start it by:\n"
-               << "$ ./bin/zk-service.sh start";
-    return false;
-  }
-  LOG(FATAL) << "Unhandled ZK error code: " << ret
-             << " (zoo_create " << path << ")";
-  return false;
-}
-
-bool ZKService::DeleteNode(const char* path) {
-  CHECK(zkhandle_) << "zk handler not initialized";
-  int ret = zoo_delete(zkhandle_, path, -1);
-  if (ret == ZOK) {
-    LOG(INFO) << "deleted zookeeper node " << path;
-    return true;
-  } else if (ret == ZNONODE) {
-    LOG(WARNING) << "try to delete an non-existing zookeeper node " << path;
-    return true;
-  }
-  LOG(FATAL) << "Unhandled ZK error code: " << ret
-             << " (zoo_delete " << path << ")";
-  return false;
-}
-
-bool ZKService::Exist(const char* path) {
-  CHECK(zkhandle_) << "zk handler not initialized";
-  struct Stat stat;
-  int ret = zoo_exists(zkhandle_, path, 0, &stat);
-  if (ret == ZOK) return true;
-  else if (ret == ZNONODE) return false;
-  LOG(WARNING) << "Unhandled ZK error code: " << ret << " (zoo_exists)";
-  return false;
-}
-
-bool ZKService::UpdateNode(const char* path, const char* val) {
-  CHECK(zkhandle_) << "zk handler not initialized";
-  // set version = -1, do not check content version
-  int ret = zoo_set(zkhandle_, path, val, strlen(val), -1);
-  if (ret == ZOK) {
-    return true;
-  } else if (ret == ZNONODE) {
-    LOG(ERROR) << "zk node " << path << " does not exist";
-    return false;
-  }
-  LOG(FATAL) << "Unhandled ZK error code: " << ret
-             << " (zoo_get " << path << ")";
-  return false;
-}
-
-bool ZKService::GetNode(const char* path, char* output) {
-  CHECK(zkhandle_) << "zk handler not initialized";
-  struct Stat stat;
-  int val_len = kZKBufSize;
-  int ret = zoo_get(zkhandle_, path, 0, output, &val_len, &stat);
-  if (ret == ZOK) {
-    output[val_len] = '\0';
-    return true;
-  } else if (ret == ZNONODE) {
-    LOG(ERROR) << "zk node " << path << " does not exist";
-    return false;
-  }
-  LOG(FATAL) << "Unhandled ZK error code: " << ret
-             << " (zoo_get " << path << ")";
-  return false;
-}
-
-bool ZKService::GetChild(const char* path, vector<string>* vt) {
-  CHECK(zkhandle_) << "zk handler not initialized";
-  struct String_vector child;
-  int ret = zoo_get_children(zkhandle_, path, 0, &child);
-  if (ret == ZOK) {
-    vt->clear();
-    for (int i = 0; i < child.count; ++i) vt->push_back(child.data[i]);
-    return true;
-  }
-  LOG(FATAL) << "Unhandled ZK error code: " << ret
-             << " (zoo_get_children " << path << ")";
-  return false;
-}
-
-bool ZKService::WGetChild(const char* path, vector<string>* vt,
-                            RTCallback *cb) {
-  CHECK(zkhandle_) << "zk handler not initialized";
-  struct String_vector child;
-  int ret = zoo_wget_children(zkhandle_, path, ChildChanges, cb, &child);
-  if (ret == ZOK) {
-    vt->clear();
-    for (int i = 0; i < child.count; ++i) vt->push_back(child.data[i]);
-    return true;
-  }
-  LOG(FATAL) << "Unhandled ZK error code: " << ret
-             << " (zoo_get_children " << path << ")";
-  return false;
-}
-
-
-void ZKService::WatcherGlobal(zhandle_t * zh, int type, int state,
-                                const char *path, void *watcherCtx) {
-  if (type == ZOO_SESSION_EVENT) {
-    if (state == ZOO_CONNECTED_STATE)
-      LOG(INFO) << "GLOBAL_WATCHER connected to zookeeper successfully!";
-    else if (state == ZOO_EXPIRED_SESSION_STATE)
-      LOG(INFO) << "GLOBAL_WATCHER zookeeper session expired!";
-  }
-}
-
-ZKClusterRT::ZKClusterRT(const string& host, int job_id) {
-  host_ = host;
-  workspace_ = GetZKJobWorkspace(job_id);
-  group_path_ = workspace_ + kZKPathJobGroup;
-  proc_path_ = workspace_ + kZKPathJobProc;
-  proc_lock_path_ = workspace_ + kZKPathJobPLock;
-}
-
-ZKClusterRT::~ZKClusterRT() {
-  // release callback vector
-  for (RTCallback* p : cb_vec_) {
-    delete p;
-  }
-}
-
-bool ZKClusterRT::Init() {
-  if (!zk_.Init(host_, timeout_)) return false;
-  if (!zk_.CreateNode(kZKPathSinga.c_str(), nullptr, 0, nullptr))
-    return false;
-  if (!zk_.CreateNode(kZKPathApp.c_str(), nullptr, 0, nullptr))
-    return false;
-  if (!zk_.CreateNode(workspace_.c_str(), nullptr, 0, nullptr))
-    return false;
-  if (!zk_.CreateNode(group_path_.c_str(), nullptr, 0, nullptr))
-    return false;
-  if (!zk_.CreateNode(proc_path_.c_str(), nullptr, 0, nullptr))
-    return false;
-  if (!zk_.CreateNode(proc_lock_path_.c_str(), nullptr, 0, nullptr))
-    return false;
-  return true;
-}
-
-int ZKClusterRT::RegistProc(const string& host_addr, int pid) {
-  char buf[kZKBufSize];
-  string lock = proc_lock_path_ + "/lock-";
-  if (!zk_.CreateNode(lock.c_str(), nullptr,
-                        ZOO_EPHEMERAL | ZOO_SEQUENCE, buf)) {
-    return -1;
-  }
-  // get all children in lock folder
-  vector<string> vt;
-  if (!zk_.GetChild(proc_lock_path_.c_str(), &vt)) {
-    return -1;
-  }
-  // find own position among all locks
-  int id = -1;
-  std::sort(vt.begin(), vt.end());
-  for (int i = 0; i < static_cast<int>(vt.size()); ++i) {
-    if (proc_lock_path_+"/"+vt[i] == buf) {
-      id = i;
-      break;
-    }
-  }
-  if (id == -1) {
-    LOG(ERROR) << "cannot find own node " << buf;
-    return -1;
-  }
-  // create a new node in proc path
-  string path = proc_path_ + "/proc-" + to_string(id);
-  string content = host_addr + "|" + to_string(pid);
-  if (!zk_.CreateNode(path.c_str(), content.c_str(), ZOO_EPHEMERAL,
-                      nullptr)) {
-    return -1;
-  }
-  return id;
-}
-
-std::string ZKClusterRT::GetProcHost(int proc_id) {
-  char val[kZKBufSize];
-  // construct file name
-  string path = proc_path_ + "/proc-" + to_string(proc_id);
-  if (!zk_.GetNode(path.c_str(), val)) return "";
-  int len = strlen(val) - 1;
-  while (len && val[len] != '|') --len;
-  CHECK(len);
-  val[len] = '\0';
-  return string(val);
-}
-
-bool ZKClusterRT::WatchSGroup(int gid, int sid, rt_callback fn, void *ctx) {
-  CHECK_NOTNULL(fn);
-  string path = groupPath(gid);
-  // create zk node
-  if (!zk_.CreateNode(path.c_str(), nullptr, 0, nullptr)) return false;
-  vector<string> child;
-  // store the callback function and context for later usage
-  RTCallback *cb = new RTCallback;
-  cb->fn = fn;
-  cb->ctx = ctx;
-  cb_vec_.push_back(cb);
-  // start to watch on the zk node, does not care about the first return value
-  return zk_.WGetChild(path.c_str(), &child, cb);
-}
-
-bool ZKClusterRT::JoinSGroup(int gid, int wid, int s_group) {
-  string path = groupPath(s_group) + workerPath(gid, wid);
-  // try to create an ephemeral node under server group path
-  return zk_.CreateNode(path.c_str(), nullptr, ZOO_EPHEMERAL, nullptr);
-}
-
-bool ZKClusterRT::LeaveSGroup(int gid, int wid, int s_group) {
-  string path = groupPath(s_group) + workerPath(gid, wid);
-  return zk_.DeleteNode(path.c_str());
-}
-
-}  // namespace singa
diff --git a/src/worker.cc b/src/worker.cc
deleted file mode 100644
index e92d780..0000000
--- a/src/worker.cc
+++ /dev/null
@@ -1,545 +0,0 @@
-/************************************************************
-*
-* Licensed to the Apache Software Foundation (ASF) under one
-* or more contributor license agreements.  See the NOTICE file
-* distributed with this work for additional information
-* regarding copyright ownership.  The ASF licenses this file
-* to you under the Apache License, Version 2.0 (the
-* "License"); you may not use this file except in compliance
-* with the License.  You may obtain a copy of the License at
-*
-*   http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an
-* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-* KIND, either express or implied.  See the License for the
-* specific language governing permissions and limitations
-* under the License.
-*
-*************************************************************/
-
-#include "singa/worker.h"
-
-#include <glog/logging.h>
-#include <chrono>
-#include <thread>
-#include <typeinfo>
-#include "singa/utils/cluster.h"
-#include "singa/utils/factory.h"
-#include "singa/utils/singleton.h"
-#include "singa/utils/context.h"
-#include "singa/utils/math_blob.h"
-
-namespace singa {
-
-using std::string;
-
-Worker* Worker::CreateWorker(const string str) {
-  AlgProto alg_proto;
-  alg_proto.ParseFromString(str);
-  return Worker::Create(alg_proto);
-}
-
-Worker* Worker::Create(const AlgProto& conf) {
-  auto factory = Singleton<Factory<singa::Worker>>::Instance();
-  Worker* worker = nullptr;
-  if (conf.has_user_alg())
-    worker = factory->Create(conf.user_alg());
-  else
-    worker = factory->Create(conf.alg());
-  return worker;
-}
-
-void Worker::Setup(int grp_id, int id, const JobProto& conf,
-    NeuralNet* train_net, NeuralNet* val_net, NeuralNet* test_net) {
-  grp_id_ = grp_id;
-  id_ = id;
-  job_conf_ = conf;
-  train_net_ = train_net;
-  val_net_ = val_net;
-  test_net_ = test_net;
-  InitSockets(train_net);
-}
-
-Worker::~Worker() {
-  if (dealer_) delete dealer_;
-  if (bridge_dealer_) delete bridge_dealer_;
-}
-
-void Worker::Run() {
-  // setup gpu device
-  auto context = Singleton<Context>::Instance();
-  // TODO(wangwei) -2 for uninitial device; -1 for CPU; >=0 for GPU now.
-  int device = -2;
-  while (device == -2) {
-    device = context->device_id(std::this_thread::get_id());
-    std::this_thread::sleep_for(std::chrono::milliseconds(1000));
-  }
-  LOG(ERROR) << "Worker (group = " << grp_id_ <<", id = " << id_ << ") "
-    << " start on " << (device >= 0 ? "GPU " + std::to_string(device) : "CPU");
-  if (device >= 0)
-    context->ActivateDevice(device);
-
-  auto cluster = Cluster::Get();
-  int svr_grp = grp_id_ / cluster->nworker_groups_per_server_group();
-  CHECK(cluster->runtime()->JoinSGroup(grp_id_, id_, svr_grp));
-  step_ = job_conf_.step();
-  InitSockets(train_net_);
-  InitNetParams(job_conf_, train_net_);
-  while (!StopNow(step_)) {
-    if (ValidateNow(step_) && val_net_ != nullptr) {
-      CollectAll(step_, train_net_);
-      LOG(ERROR) << "Validation @ step " + std::to_string(step_);
-      Test(job_conf_.validate_steps(), kVal, val_net_);
-    }
-    if (TestNow(step_) && test_net_ != nullptr) {
-      CollectAll(step_, train_net_);
-      LOG(ERROR) << "Test @ step " + std::to_string(step_);
-      Test(job_conf_.test_steps(), kTest, test_net_);
-    }
-    if (CheckpointNow(step_) && grp_id_ == 0) {
-      CollectAll(step_, train_net_);
-      Checkpoint(step_, Cluster::Get()->checkpoint_folder(), train_net_);
-      job_conf_.set_step(step_);
-    }
-    TrainOneBatch(step_, train_net_);
-    if (DisplayNow(step_) && grp_id_ == 0 && id_ == 0) {
-      Display(kTrain | kForward | kBackward,
-          "Train @ step " + std::to_string(step_), train_net_);
-    }
-    step_++;
-  }
-
-  // save the model
-  if (grp_id_ == 0)
-    Checkpoint(step_, Cluster::Get()->checkpoint_folder(), train_net_);
-  // clean up
-  cluster->runtime()->LeaveSGroup(grp_id_, id_, svr_grp);
-  // notify the stub on worker stop
-  Msg* msg = new Msg(Addr(grp_id_, id_, kWorkerParam), Addr(-1, -1, kStub));
-  msg->set_type(kStop);
-  dealer_->Send(&msg);  // use param dealer to send the stop msg
-  LOG(ERROR) << "Worker (group = " <<grp_id_ << ", id = " << id_ << ") stops";
-}
-
-void Worker::Test(int steps, Phase phase, NeuralNet* net) {
-  for (int step = 0; step < steps; step++)
-    TestOneBatch(step, phase, net);
-  Display(phase, " ", net);
-}
-
-void Worker::InitSockets(const NeuralNet* net) {
-  dealer_ = new Dealer(Addr(grp_id_, id_, kWorkerParam));
-  for (auto layer : net->layers()) {
-    if (layer->partition_id() == id_) {
-      if (typeid(*layer) == typeid(BridgeDstLayer)
-          || typeid(*layer) == typeid(BridgeSrcLayer)) {
-        bridge_dealer_ = new Dealer(Addr(grp_id_, id_, kWorkerLayer));
-        break;
-      }
-    }
-  }
-  // bind dealer to bridge layers
-  if (bridge_dealer_ != nullptr) {
-    for (auto dst : net->layers()) {
-      if (typeid(*dst) == typeid(BridgeDstLayer)) {
-        auto src = net->srclayers(dst)[0];
-        name2bridge_[src->name()] = src;
-        name2bridge_[dst->name()] = dst;
-        if (src->partition_id() == id_) {
-          dynamic_cast<BridgeLayer*>(src)->MakePaired(dst, grp_id_,
-              bridge_dealer_, &name2bridge_);
-        }
-        if (dst->partition_id() == id_) {
-          dynamic_cast<BridgeLayer*>(dst)->MakePaired(src, grp_id_,
-              bridge_dealer_, &name2bridge_);
-        }
-      }
-    }
-  }
-}
-
-void Worker::InitNetParams(const std::string& folder, vector<Layer*> net) {
-
-    std::unordered_map<string, Param*> name2param;
-    for (auto layer : net) {
-        for (auto param : layer->GetParams()) {
-          // only owners fill the memory of parameter values.
-          //if (param->owner() == param->id()) {
-            CHECK(name2param.find(param->name()) == name2param.end());
-            name2param[param->name()] = param;
-          //}
-        }
-    }
-    vector<string> paths;
-    paths.push_back(folder);
-    NeuralNet::Load(paths, name2param);
-}
-
-void Worker::InitNetParams(const JobProto& job_conf, NeuralNet* net) {
-  // for each server grp, its first subscriber worker grp does the param init
-  if (grp_id_ % Cluster::Get()->nworker_groups_per_server_group() == 0) {
-    // extract params that should be initialized by this worker
-    // must gen a name for each param if the user doesn't config it
-    std::unordered_map<string, Param*> name2param;
-    for (auto layer : net->layers()) {
-      if (layer->partition_id() == id_) {
-        for (auto param : layer->GetParams()) {
-          // only owners fill the memory of parameter values.
-          if (param->owner() == param->id()) {
-            CHECK(name2param.find(param->name()) == name2param.end());
-            name2param[param->name()] = param;
-          }
-        }
-      }
-    }
-    vector<string> paths;
-    for (const auto& p : job_conf_.checkpoint_path())
-      paths.push_back(p);
-    net->Load(paths, name2param);
-    // init other params who do not have checkpoint version
-    for (auto entry : name2param) {
-      if (entry.second->version() > 0) {
-        //  if load from pre-training params, reset version to start step
-        if (job_conf.reset_param_version()) {
-          entry.second->set_version(job_conf.step());
-        }
-      } else {
-        entry.second->InitValues(job_conf.step());
-        if (!job_conf.reset_param_version())
-          LOG(ERROR) << "better reset version of params from checkpoints "
-            << "to the same as other newly initialized params!";
-      }
-    }
-
-    // warmup training before put params to servers
-    // for (; step_ < job_conf.warmup_steps(); step_++)
-    //  TrainOneBatch(step_, net);
-    for (auto layer : net->layers()) {
-      if (layer->partition_id() == id_)
-        for (auto param : layer->GetParams())
-          if (param->owner() == param->id())
-            Put(param->version(), param);
-    }
-  }
-  // wait owners in the same procs init params, then no get requests sent
-  std::this_thread::sleep_for(std::chrono::milliseconds(1000));
-  for (auto layer : net->layers()) {
-    if (layer->partition_id() == id_)
-      for (auto param : layer->GetParams())
-        Get(job_conf.warmup_steps(), param);
-  }
-}
-
-void Worker::Checkpoint(int step, const std::string& folder, vector<Layer*> net) {
-  BlobProtos bps;
-  for (auto layer : net) {
-    //if (layer->partition_id() == id_) {
-      for (auto param : layer->GetParams()) {
-        // only owners fill the memory of parameter values.
-        //if (param->owner() == param->id()) {
-          auto *blob = bps.add_blob();
-          param->ToProto(blob);
-          bps.add_version(param->version());
-          bps.add_name(param->name());
-        //}
-      }
-    //}
-  }
-  char buf[256];
-  snprintf(buf, sizeof(buf), "%s/step%d-worker0", folder.c_str(), step);
-  LOG(INFO) << "checkpoint to " << buf;
-  WriteProtoToBinaryFile(bps, buf);
-}
-
-void Worker::Checkpoint(int step, const std::string& folder, NeuralNet* net) {
-  BlobProtos bps;
-  for (auto layer : net->layers()) {
-    if (layer->partition_id() == id_) {
-      for (auto param : layer->GetParams()) {
-        // only owners fill the memory of parameter values.
-        if (param->owner() == param->id()) {
-          auto *blob = bps.add_blob();
-          param->ToProto(blob);
-          bps.add_version(param->version());
-          bps.add_name(param->name());
-        }
-      }
-    }
-  }
-  char buf[256];
-  snprintf(buf, sizeof(buf), "%s/step%d-worker%d", folder.c_str(), step, id_);
-  LOG(INFO) << "checkpoint to " << buf;
-  WriteProtoToBinaryFile(bps, buf);
-}
-
-int Worker::Put(int step, Param* param) {
-  if (dealer_ == nullptr) {
-    LOG(WARNING) << "Null dealer in worker (" << grp_id_ << ", " << id_ << ")";
-    return 1;
-  }
-  // set Blob head to cpu to avoid calling cudaMemcpy by the stub thread, which
-  // would hang on some machines.
-  param->data().cpu_data();
-  Msg* msg = new Msg(Addr(grp_id_, id_, kWorkerParam), Addr(-1, -1, kStub));
-  msg->set_trgt(ParamTrgt(param->owner(), 0), step);
-  msg->set_type(kPut);
-  dealer_->Send(&msg);
-//  LOG(ERROR) << "worker msg " << msg;
-  return 1;
-}
-
-int Worker::Get(int step, Param* param) {
-  if (param->version() >= step)
-    return 1;
-  if (dealer_ == nullptr) {
-    LOG(WARNING) << "Null dealer in worker (" << grp_id_ << ", " << id_ << ")";
-    return 1;
-  }
-  // set Blob head to cpu to avoid calling cudaMemcpy by the stub thread, which
-  // would hang on some machines.
-  param->mutable_data()->mutable_cpu_data();
-
-  Msg* msg = new Msg(Addr(grp_id_, id_, kWorkerParam), Addr(-1, -1, kStub));
-  msg->set_trgt(ParamTrgt(param->owner(), 0), step);
-  msg->set_type(kGet);
-  dealer_->Send(&msg);
-  return 1;
-}
-
-int Worker::Update(int step, Param* param) {
-  param->set_last_version(param->version());
-  if (dealer_ == nullptr) {
-    LOG(WARNING) << "Null dealer in worker (" << grp_id_ << ", " << id_ << ")";
-    return 1;
-  }
-  // head of data Blob (SyncMem) to cpu, because the stub thread may use
-  // cudaMemcpy copy gradients into msgs. cudaMemcpy hangs when called by the
-  // stub thread on some GPU machines.
-  // TODO(wangwei) fix this issue and remove the following line.
-  // optimize for training with single worker by removing stub and server, and
-  // updating parameters locally inside the worker GPU. Then we do not need to
-  // transfer gradients and parameter values between GPU-CPU.
-  param->grad().cpu_data();
-  // change the head of SyncMem to cpu; otherwise, the updated parameter
-  // values would not be synced to gpu (since the head is at gpu).
-  param->mutable_data()->mutable_cpu_data();
-
-  Msg* msg = new Msg(Addr(grp_id_, id_, kWorkerParam), Addr(-1, -1, kStub));
-  msg->set_trgt(ParamTrgt(param->owner(), 0), step);
-  msg->set_type(kUpdate);
-  dealer_->Send(&msg);
-  return 1;
-}
-
-int Worker::CollectAll(int step, NeuralNet* net) {
-  auto& layers = net->layers();
-  for (auto& layer : layers) {
-    if (layer->partition_id() == id_) {
-      for (Param* p : layer->GetParams()) {
-        Collect(step, p);
-      }
-    }
-  }
-  return 1;
-}
-
-int Worker::Collect(int step, Param* param) {
-  while (param->version() <= param->last_version()) {
-    std::this_thread::sleep_for(std::chrono::milliseconds(kCollectSleepTime));
-    // LOG(ERROR) << "wait  "<< param->id() << " at " << step << " by " <<id_;
-  }
-  return 1;
-}
-
-void Worker::Display(int flag, const std::string& prefix, NeuralNet* net) {
-  for (auto layer : net->layers()) {
-    if (layer->partition_id() == id_) {
-      const string& disp = layer->ToString(false, flag);
-      if (disp.length())
-        LOG(ERROR) << prefix << "  " << disp;
-    }
-  }
-}
-
-/****************************BPWorker**********************************/
-void BPWorker::TrainOneBatch(int step, NeuralNet* net) {
-  Forward(step, kTrain, net);
-  Backward(step, net);
-}
-
-void BPWorker::TestOneBatch(int step, Phase phase, NeuralNet* net) {
-  Forward(step, phase, net);
-}
-
-void BPWorker::Forward(int step, Phase phase, NeuralNet* net) {
-  map<string, string> label;
-  for (auto& layer : net->layers()) {
-    if (layer->partition_id() == id_) {
-      if (phase == kTrain && layer->unroll_index() == 0) {
-        // wait until param is updated
-        for (Param* p : layer->GetParams()) {
-          Collect(step, p);
-        }
-      }
-      // DLOG(ERROR) << "Forward " << layer->name();
-      layer->ComputeFeature(phase | kForward, net->srclayers(layer));
-      if (job_conf_.debug() && DisplayNow(step) && grp_id_ == 0)
-        label[layer->name()] = layer->ToString(true, phase | kForward);
-    }
-  }
-  if (label.size()) {
-    const string path = Cluster::Get()->vis_folder() + "/fp-step"
-      + std::to_string(step) +"-loc" + std::to_string(id_) + ".json";
-    WriteStringToTextFile(path, net->ToGraph(false).ToJson(label));
-  }
-}
-
-void BPWorker::Backward(int step, NeuralNet* net) {
-  map<string, string> label;
-  auto& layers = net->layers();
-  for (auto it = layers.rbegin(); it != layers.rend(); it++) {
-    Layer* layer = *it;
-    if (layer->partition_id() == id_) {
-      layer->ComputeGradient(kTrain | kBackward, net->srclayers(layer));
-      if (job_conf_.debug() && DisplayNow(step) && grp_id_ == 0)
-        label[layer->name()] = layer->ToString(true, kTrain | kBackward);
-      for (Param* p : layer->GetParams())
-        Update(step, p);
-    }
-  }
-  if (label.size()) {
-    const string path = Cluster::Get()->vis_folder() + "/bp-step"
-      + std::to_string(step) + "-loc" + std::to_string(id_) + ".json";
-    WriteStringToTextFile(path, net->ToGraph(false).Reverse().ToJson(label));
-  }
-}
-
-/***************************BPTTWorker*********************************/
-void BPTTWorker::Forward(int step, Phase phase, NeuralNet* net) {
-  map<string, string> label;
-  for (auto& layer : net->layers()) {
-    if (layer->partition_id() == id_) {
-      if (phase == kTrain && layer->unroll_index() == 0) {
-        // wait until param is updated
-        for (Param* p : layer->GetParams()) {
-          Collect(step, p);
-          Zero(p->mutable_grad());
-        }
-      }
-      vector<Layer*> src = net->srclayers(layer);
-      if ((phase & kTest) && typeid(*layer) == typeid(RNNDummyLayer)) {
-        CHECK_LE(src.size(), 1);
-        auto dummy = dynamic_cast<RNNDummyLayer*>(layer);
-        Layer* srclayer = net->name2layer(dummy->srclayer(step));
-        if (step > 0)
-          CHECK(srclayer != nullptr);
-        if (srclayer != nullptr) {
-          src.clear();
-          src.push_back(srclayer);
-        }
-      }
-      // if full state rnn and not the starting of a new passing of the dataset,
-      // feed the hidden state of the last unit to the first unit.
-      if (layer->unroll_index() == 0 && full_state_ && !begin_) {
-        Layer* last = net->last_unroll_layer(layer);
-        CHECK(last != nullptr);
-        if (last != layer || (phase & kTest))
-          src.push_back(last);
-      }
-      // LOG(ERROR) << layer->name() << " forward";
-      // int ret =
-      layer->ComputeFeature(phase | kForward, src);
-      /*
-      if ((phase & Phase::kTrain) && ret == Status::kEnd)
-        begin_ = true;
-      */
-      if (job_conf_.debug() && DisplayNow(step) && grp_id_ == 0)
-        label[layer->name()] = layer->ToString(true, phase | kForward);
-    }
-  }
-  if (label.size()) {
-    const string path = Cluster::Get()->vis_folder() + "/fp-step"
-      + std::to_string(step) +"-loc" + std::to_string(id_) + ".json";
-    WriteStringToTextFile(path, net->ToGraph(false).ToJson(label));
-  }
-}
-
-void BPTTWorker::Backward(int step, NeuralNet* net) {
-  map<string, string> label;
-  auto& layers = net->layers();
-  for (auto it = layers.rbegin(); it != layers.rend(); it++) {
-    Layer* layer = *it;
-    if (layer->partition_id() == id_) {
-      layer->ComputeGradient(kTrain | kBackward | kAggGrad,
-          net->srclayers(layer));
-      // LOG(ERROR) << layer->name() << " backward";
-      if (job_conf_.debug() && DisplayNow(step) && grp_id_ == 0)
-        label[layer->name()] = layer->ToString(true, kTrain | kBackward);
-      // unrolled layers share parameter data and grad, just update the 1st one
-      if (layer->unroll_index() == 0)
-        for (Param* p : layer->GetParams())
-          Update(step, p);
-    }
-  }
-  if (label.size()) {
-    const string path = Cluster::Get()->vis_folder() + "/bp-step"
-      + std::to_string(step) + "-loc" + std::to_string(id_) + ".json";
-    WriteStringToTextFile(path, net->ToGraph(false).Reverse().ToJson(label));
-  }
-}
-void BPTTWorker::Display(int flag, const std::string& prefix, NeuralNet* net) {
-  std::unordered_map<string, float> perf;
-  for (auto layer : net->layers()) {
-    if (layer->partition_id() == id_) {
-      const string& disp = layer->ToString(false, flag);
-      for (const auto& entry : GetMetricFromString(disp))
-        perf[entry.first] += entry.second;
-    }
-  }
-  string disp = prefix + " ";
-  for (const auto& entry : perf)
-    disp += entry.first + " = " + std::to_string(entry.second) + ", ";
-  LOG(ERROR) << disp;
-}
-/****************************CDWorker**********************************/
-void CDWorker::TrainOneBatch(int step, NeuralNet* net) {
-  const auto& layers = net->layers();
-  for (auto* layer : layers) {
-    for (Param* p : layer->GetParams())  // wait until param is updated
-      Collect(step, p);
-    layer->ComputeFeature(kPositive, net->srclayers(layer));
-  }
-  for (auto* layer : layers)
-    if (typeid(*layer) == typeid(RBMVisLayer)
-          || typeid(*layer) == typeid(RBMHidLayer))
-      layer->ComputeFeature(kNegative | kTest, net->srclayers(layer));
-  for (int i = 1; i < job_conf_.train_one_batch().cd_conf().cd_k(); i++) {
-    for (auto* layer : layers) {
-      if (typeid(*layer) == typeid(RBMVisLayer)
-          || typeid(*layer) == typeid(RBMHidLayer))
-      layer->ComputeFeature(kNegative, net->srclayers(layer));
-    }
-  }
-  for (auto* layer : layers) {
-    if (typeid(*layer) == typeid(RBMVisLayer)
-        || typeid(*layer) == typeid(RBMHidLayer)) {
-      layer->ComputeGradient(kTrain, net->srclayers(layer));
-      for (Param* p : layer->GetParams()) {
-        Update(step, p);
-      }
-    }
-  }
-}
-
-void CDWorker::TestOneBatch(int step, Phase phase, NeuralNet* net) {
-  auto& layers = net->layers();
-  for (auto *layer : layers)
-    layer->ComputeFeature(kPositive, net->srclayers(layer));
-  for (auto *layer : layers)
-    if (typeid(*layer) == typeid(RBMVisLayer))
-      layer->ComputeFeature(kNegative | kTest, net->srclayers(layer));
-}
-
-}  // namespace singa
diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt
new file mode 100644
index 0000000..7db784c
--- /dev/null
+++ b/test/CMakeLists.txt
@@ -0,0 +1,47 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+# 
+#     http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# 
+
+INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR})
+INCLUDE_DIRECTORIES(${CMAKE_BINARY_DIR}/include)
+
+IF(ENABLE_DIST)
+  ADD_EXECUTABLE(test_ep "singa/test_ep.cc")
+  ADD_DEPENDENCIES(test_ep singa_io)
+  TARGET_LINK_LIBRARIES(test_ep singa_utils singa_io protobuf ${SINGA_LINKER_LIBS})
+ENDIF()
+
+ADD_LIBRARY(gtest STATIC EXCLUDE_FROM_ALL "gtest/gtest.h" "gtest/gtest-all.cc")
+
+AUX_SOURCE_DIRECTORY(singa singa_test_source)
+LIST(REMOVE_ITEM singa_test_source "singa/test_ep.cc")
+
+IF(NOT USE_OPENCL)
+    MESSAGE(STATUS "Skipping OpenCL tests")
+    LIST(REMOVE_ITEM singa_test_source "singa/test_opencl.cc")
+ENDIF()
+
+
+ADD_EXECUTABLE(test_singa "gtest/gtest_main.cc" ${singa_test_source})
+ADD_DEPENDENCIES(test_singa singa_core singa_utils)
+#MESSAGE(STATUS "link libs" ${singa_linker_libs})
+TARGET_LINK_LIBRARIES(test_singa gtest singa_core singa_utils singa_model
+    singa_io singa_proto protobuf ${SINGA_LINKER_LIBS})
+IF(UNIX AND (NOT APPLE))
+    LIST(APPEND LINK_FLAGS "-pthread")
+ENDIF()
+SET_TARGET_PROPERTIES(test_singa PROPERTIES LINK_FLAGS "${LINK_FLAGS}")
diff --git a/test/gtest/CMakeLists.txt b/test/gtest/CMakeLists.txt
new file mode 100644
index 0000000..5b22dae
--- /dev/null
+++ b/test/gtest/CMakeLists.txt
@@ -0,0 +1,19 @@
+# 
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+# 
+#     http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# 
+
+
diff --git a/include/gtest/gtest-all.cc b/test/gtest/gtest-all.cc
similarity index 100%
rename from include/gtest/gtest-all.cc
rename to test/gtest/gtest-all.cc
diff --git a/include/gtest/gtest.h b/test/gtest/gtest.h
similarity index 100%
rename from include/gtest/gtest.h
rename to test/gtest/gtest.h
diff --git a/include/gtest/gtest_main.cc b/test/gtest/gtest_main.cc
similarity index 100%
rename from include/gtest/gtest_main.cc
rename to test/gtest/gtest_main.cc
diff --git a/test/python/test_layer.py b/test/python/test_layer.py
new file mode 100644
index 0000000..141cf56
--- /dev/null
+++ b/test/python/test_layer.py
@@ -0,0 +1,213 @@
+# 
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+# 
+#     http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# 
+
+import sys
+import os
+import unittest
+import numpy as np
+
+#sys.path.append(os.path.join(os.path.dirname(__file__), '../../build/python'))
+
+from singa import layer
+from singa import device
+from singa import tensor
+from singa.proto import model_pb2
+
+
+def _tuple_to_string(t):
+    lt = [str(x) for x in t]
+    return '(' + ', '.join(lt) + ')'
+
+
+class TestPythonLayer(unittest.TestCase):
+
+    def check_shape(self, actual, expect):
+        self.assertEqual(actual, expect, 'shape mismatch, actual shape is %s'
+                         ' exepcted is %s' % (_tuple_to_string(actual),
+                                              _tuple_to_string(expect))
+                         )
+
+    def setUp(self):
+        layer.engine='singacpp'
+        self.w = {'init': 'Xavier', 'regularizer': 1e-4}
+        self.b = {'init': 'Constant', 'value': 0}
+        self.sample_shape = None
+
+    def test_conv2D_shape(self):
+        in_sample_shape = (3, 224, 224)
+        conv = layer.Conv2D('conv', 64, 3, 1, W_specs=self.w, b_specs=self.b,
+                            input_sample_shape=in_sample_shape)
+        out_sample_shape = conv.get_output_sample_shape()
+        self.check_shape(out_sample_shape, (64, 224, 224))
+
+    def test_conv2D_forward_backward(self):
+        in_sample_shape = (1, 3, 3)
+        conv = layer.Conv2D('conv', 1, 3, 2, W_specs=self.w, b_specs=self.b,
+                            pad=1, input_sample_shape=in_sample_shape)
+        # cuda = device.create_cuda_gpu()
+        # conv.to_device(cuda)
+        params = conv.param_values()
+
+        raw_x = np.arange(9, dtype=np.float32) + 1
+        x = tensor.from_numpy(raw_x)
+        x.reshape((1, 1, 3, 3))
+        w = np.array([1, 1, 0, 0, 0, -1, 0, 1, 0], dtype=np.float32)
+        params[0].copy_from_numpy(w)
+        params[1].set_value(1.0)
+
+        # x.to_device(cuda)
+        y = conv.forward(model_pb2.kTrain, x)
+        # y.to_host()
+        npy = tensor.to_numpy(y).flatten()
+
+        self.assertAlmostEqual(3.0, npy[0])
+        self.assertAlmostEqual(7.0, npy[1])
+        self.assertAlmostEqual(-3.0, npy[2])
+        self.assertAlmostEqual(12.0, npy[3])
+
+        dy = np.asarray([0.1, 0.2, 0.3, 0.4], dtype=np.float32).reshape(y.shape)
+        grad = tensor.from_numpy(dy)
+        # grad.to_device(cuda)
+        (dx, [dw, db]) = conv.backward(model_pb2.kTrain, grad)
+        dx.to_host()
+        dw.to_host()
+        dx = tensor.to_numpy(dx).flatten()
+        dw = tensor.to_numpy(dw).flatten()
+        dy = dy.flatten()
+        self.assertAlmostEquals(dy[0] * w[4], dx[0])
+        self.assertAlmostEquals(dy[0] * w[5] + dy[1] * w[3], dx[1])
+        self.assertAlmostEquals(dy[1] * w[4], dx[2])
+        self.assertAlmostEquals(dy[0] * w[7] + dy[2] * w[1], dx[3])
+        self.assertAlmostEquals(
+            dy[0] *
+            w[8] +
+            dy[1] *
+            w[6] +
+            dy[2] *
+            w[2] +
+            dy[3] *
+            w[0],
+            dx[4])
+        self.assertAlmostEquals(dy[1] * w[7] + dy[3] * w[1], dx[5])
+        self.assertAlmostEquals(dy[2] * w[4], dx[6])
+        self.assertAlmostEquals(dy[2] * w[5] + dy[3] * w[3], dx[7])
+        self.assertAlmostEquals(dy[3] * w[4], dx[8])
+
+        self.assertAlmostEquals(dy[3] * raw_x[4], dw[0])
+        self.assertAlmostEquals(dy[3] * raw_x[5] + dy[2] * raw_x[3], dw[1])
+        self.assertAlmostEquals(dy[2] * raw_x[4], dw[2])
+        self.assertAlmostEquals(dy[1] * raw_x[1] + dy[3] * raw_x[7], dw[3])
+        self.assertAlmostEquals(
+            dy[0] *
+            raw_x[0] +
+            dy[1] *
+            raw_x[2] +
+            dy[2] *
+            raw_x[6] +
+            dy[3] *
+            raw_x[8],
+            dw[4], 5)
+        self.assertAlmostEquals(dy[0] * raw_x[1] + dy[2] * raw_x[7], dw[5])
+        self.assertAlmostEquals(dy[1] * raw_x[4], dw[6])
+        self.assertAlmostEquals(dy[0] * raw_x[3] + dy[1] * raw_x[5], dw[7])
+        self.assertAlmostEquals(dy[0] * raw_x[4], dw[8])
+
+    def test_conv1D(self):
+        in_sample_shape = (224,)
+        conv = layer.Conv1D('conv', 64, 3, 1, W_specs=self.w, b_specs=self.b,
+                            pad=1, input_sample_shape=in_sample_shape)
+        out_sample_shape = conv.get_output_sample_shape()
+        self.check_shape(out_sample_shape, (64, 224,))
+
+    def test_max_pooling2D(self):
+        in_sample_shape = (64, 224, 224)
+        pooling = layer.MaxPooling2D('pool', 3, 2,
+                                     input_sample_shape=in_sample_shape)
+        out_sample_shape = pooling.get_output_sample_shape()
+        self.check_shape(out_sample_shape, (64, 112, 112))
+
+    def test_max_pooling1D(self):
+        in_sample_shape = (224,)
+        pooling = layer.MaxPooling1D('pool', 3, 2,
+                                     input_sample_shape=in_sample_shape)
+        out_sample_shape = pooling.get_output_sample_shape()
+        self.check_shape(out_sample_shape, (112,))
+
+    def test_avg_pooling2D(self):
+        in_sample_shape = (64, 224, 224)
+        pooling = layer.AvgPooling2D('pool', 3, 2,
+                                     input_sample_shape=in_sample_shape)
+        out_sample_shape = pooling.get_output_sample_shape()
+        self.check_shape(out_sample_shape, (64, 112, 112))
+
+    def test_avg_pooling1D(self):
+        in_sample_shape = (224,)
+        pooling = layer.AvgPooling1D('pool', 3, 2,
+                                     input_sample_shape=in_sample_shape)
+        out_sample_shape = pooling.get_output_sample_shape()
+        self.check_shape(out_sample_shape, (112,))
+
+    def test_batch_normalization(self):
+        in_sample_shape = (3, 224, 224)
+        bn = layer.BatchNormalization('bn', input_sample_shape=in_sample_shape)
+        out_sample_shape = bn.get_output_sample_shape()
+        self.check_shape(out_sample_shape, in_sample_shape)
+
+    def test_lrn(self):
+        in_sample_shape = (3, 224, 224)
+        lrn = layer.LRN('lrn', input_sample_shape=in_sample_shape)
+        out_sample_shape = lrn.get_output_sample_shape()
+        self.check_shape(out_sample_shape, in_sample_shape)
+
+    def test_dense(self):
+        dense = layer.Dense('ip', 32, input_sample_shape=(64,))
+        out_sample_shape = dense.get_output_sample_shape()
+        self.check_shape(out_sample_shape, (32,))
+
+    def test_dropout(self):
+        input_sample_shape = (64, 1, 12)
+        dropout = layer.Dropout('drop', input_sample_shape=input_sample_shape)
+        out_sample_shape = dropout.get_output_sample_shape()
+        self.check_shape(out_sample_shape, input_sample_shape)
+
+    def test_activation(self):
+        input_sample_shape = (64, 1, 12)
+        act = layer.Activation('act', input_sample_shape=input_sample_shape)
+        out_sample_shape = act.get_output_sample_shape()
+        self.check_shape(out_sample_shape, input_sample_shape)
+
+    def test_softmax(self):
+        input_sample_shape = (12,)
+        softmax = layer.Softmax('soft', input_sample_shape=input_sample_shape)
+        out_sample_shape = softmax.get_output_sample_shape()
+        self.check_shape(out_sample_shape, input_sample_shape)
+
+    def test_flatten(self):
+        input_sample_shape = (64, 1, 12)
+        flatten = layer.Flatten('flat', input_sample_shape=input_sample_shape)
+        out_sample_shape = flatten.get_output_sample_shape()
+        self.check_shape(out_sample_shape, (64 * 1 * 12, ))
+
+        flatten = layer.Flatten('flat', axis=2,
+                                input_sample_shape=input_sample_shape)
+        out_sample_shape = flatten.get_output_sample_shape()
+        self.check_shape(out_sample_shape, (12,))
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/test/python/test_optimizer.py b/test/python/test_optimizer.py
new file mode 100644
index 0000000..afdf337
--- /dev/null
+++ b/test/python/test_optimizer.py
@@ -0,0 +1,104 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+# =============================================================================
+import sys
+import os
+import unittest
+import numpy as np
+
+sys.path.append(os.path.join(os.path.dirname(__file__), '../../build/python'))
+
+import singa.tensor as tensor
+import singa.optimizer as opt
+import singa.device as device
+
+cuda = device.create_cuda_gpu()
+
+
+class TestOptimizer(unittest.TestCase):
+
+    def setUp(self):
+        self.np_W = np.array([0.1, 0.2, 0.3, 0.4], dtype=np.float32)
+        self.W = tensor.from_numpy(self.np_W)
+        self.np_g = np.array([0.1, 0.3, 0.1, 0.2], dtype=np.float32)
+        self.g = tensor.from_numpy(self.np_g)
+
+    def to_cuda(self):
+        self.W.to_device(cuda)
+        self.g.to_device(cuda)
+
+    def test_sgd(self):
+        lr = 0.1
+        sgd = opt.SGD(lr)
+        sgd.apply(0, self.g, self.W, 'w')
+        w = tensor.to_numpy(self.W)
+        for i in range(self.W.size()):
+            self.assertAlmostEqual(w[i], self.np_W[i] - lr * self.np_g[i])
+
+    def test_sgd_cuda(self):
+        lr = 0.1
+        sgd = opt.SGD(lr)
+        self.to_cuda()
+        sgd.apply(0, self.g, self.W, 'w')
+        self.W.to_host()
+        w = tensor.to_numpy(self.W)
+        for i in range(self.W.size()):
+            self.assertAlmostEqual(w[i], self.np_W[i] - lr * self.np_g[i])
+
+    def test_constraint(self):
+        threshold = 0.02
+        cons = opt.L2Constraint(threshold)
+        cons.apply(0, self.W, self.g)
+        g = tensor.to_numpy(self.g)
+        nrm = np.linalg.norm(self.np_g) / self.np_g.size
+        for i in range(g.size):
+            self.assertAlmostEqual(g[i], self.np_g[i] * threshold / nrm)
+
+    def test_constraint_cuda(self):
+        threshold = 0.02
+        self.to_cuda()
+        cons = opt.L2Constraint(threshold)
+        cons.apply(0, self.W, self.g)
+        self.g.to_host()
+        g = tensor.to_numpy(self.g)
+        nrm = np.linalg.norm(self.np_g) / self.np_g.size
+        for i in range(g.size):
+            self.assertAlmostEqual(g[i], self.np_g[i] * threshold / nrm)
+
+    def test_regularizer(self):
+        coefficient = 0.0001
+        reg = opt.L2Regularizer(coefficient)
+        reg.apply(0, self.W, self.g)
+        g = tensor.to_numpy(self.g)
+        for i in range(g.size):
+            self.assertAlmostEqual(g[i],
+                                   self.np_g[i] + coefficient * self.np_W[i])
+
+    def test_regularizer_cuda(self):
+        coefficient = 0.0001
+        reg = opt.L2Regularizer(coefficient)
+        self.to_cuda()
+        reg.apply(0, self.W, self.g)
+        self.g.to_host()
+        g = tensor.to_numpy(self.g)
+        for i in range(g.size):
+            self.assertAlmostEqual(g[i],
+                                   self.np_g[i] + coefficient * self.np_W[i])
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/test/python/test_tensor.py b/test/python/test_tensor.py
new file mode 100644
index 0000000..2374adc
--- /dev/null
+++ b/test/python/test_tensor.py
@@ -0,0 +1,137 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+# =============================================================================
+
+import sys
+import os
+import math
+import unittest
+
+sys.path.append(os.path.join(os.path.dirname(__file__), '../../build/python'))
+
+
+from singa import tensor
+from singa.proto import core_pb2
+
+
+class TestTensorMethods(unittest.TestCase):
+
+    def setUp(self):
+        self.shape = (2, 3)
+        self.t = tensor.Tensor(self.shape)
+        self.s = tensor.Tensor(self.shape)
+        self.t.set_value(0)
+        self.s.set_value(0)
+
+    def test_tensor_fields(self):
+        t = self.t
+        shape = self.shape
+        self.assertTupleEqual(t.shape, shape)
+        self.assertEqual(t.shape[0], shape[0])
+        self.assertEqual(t.shape[1], shape[1])
+        self.assertEqual(tensor.product(shape), 2*3)
+        self.assertEqual(t.ndim(), 2)
+        self.assertEqual(t.size(), 2*3)
+        self.assertEqual(t.memsize(), 2*3*tensor.sizeof(core_pb2.kFloat32))
+        self.assertFalse(t.is_transpose())
+
+    def test_unary_operators(self):
+        t = self.t
+        self.assertAlmostEqual(tensor.to_numpy(t)[0, 0], 0.0)
+        t += 1.23
+        self.assertAlmostEqual(tensor.to_numpy(t)[0, 0], 1.23)
+        t -= 0.23
+        self.assertAlmostEqual(tensor.to_numpy(t)[0, 0], 1.23-0.23)
+        t *= 2.5
+        self.assertAlmostEqual(tensor.to_numpy(t)[0, 0], (1.23-0.23)*2.5)
+        t /= 2
+        self.assertAlmostEqual(tensor.to_numpy(t)[0, 0], (1.23-0.23)*2.5/2)
+
+    def test_binary_operators(self):
+        t = self.t
+        t += 3.2
+        s = self.s
+        s += 2.1
+        a = t + s
+        self.assertAlmostEqual(tensor.to_numpy(a)[0, 0], 3.2+2.1, 5)
+        a = t - s
+        self.assertAlmostEqual(tensor.to_numpy(a)[0, 0], 3.2-2.1, 5)
+        a = t * s
+        self.assertAlmostEqual(tensor.to_numpy(a)[0, 0], 3.2*2.1, 5)
+        ''' not implemented yet
+        a = t / s
+        self.assertAlmostEqual(tensor.to_numpy(a)[0,0], 3.2/2.1, 5)
+        '''
+
+    def test_comparison_operators(self):
+        t = self.t
+        t += 3.45
+        a = t < 3.45
+        self.assertEqual(tensor.to_numpy(a)[0, 0], 0)
+        a = t <= 3.45
+        self.assertEqual(tensor.to_numpy(a)[0, 0], 1)
+        a = t > 3.45
+        self.assertEqual(tensor.to_numpy(a)[0, 0], 0)
+        a = t >= 3.45
+        self.assertEqual(tensor.to_numpy(a)[0, 0], 1)
+        a = tensor.lt(t, 3.45)
+        self.assertEqual(tensor.to_numpy(a)[0, 0], 0)
+        a = tensor.le(t, 3.45)
+        self.assertEqual(tensor.to_numpy(a)[0, 0], 1)
+        a = tensor.gt(t, 3.45)
+        self.assertEqual(tensor.to_numpy(a)[0, 0], 0)
+        a = tensor.ge(t, 3.45)
+        self.assertEqual(tensor.to_numpy(a)[0, 0], 1)
+
+    def test_tensor_copy(self):
+        t = tensor.Tensor((2, 3))
+        t += 1.23
+        self.assertAlmostEqual(tensor.to_numpy(t)[0, 0], 1.23)
+        tc = t.copy()
+        tdc = t.deepcopy()
+        self.assertAlmostEqual(tensor.to_numpy(tc)[0, 0], 1.23)
+        self.assertAlmostEqual(tensor.to_numpy(tdc)[0, 0], 1.23)
+        t += 1.23
+        self.assertAlmostEqual(tensor.to_numpy(t)[0, 0], 2.46)
+        self.assertAlmostEqual(tensor.to_numpy(tc)[0, 0], 2.46)
+        self.assertAlmostEqual(tensor.to_numpy(tdc)[0, 0], 1.23)
+
+    def test_copy_data(self):
+        t = self.t
+        t += 1.23
+        s = self.s
+        s += 5.43
+        self.assertAlmostEqual(tensor.to_numpy(t)[0, 0], 1.23)
+        tensor.copy_data_to_from(t, s, 2)
+        self.assertAlmostEqual(tensor.to_numpy(t)[0, 0], 5.43, 5)
+        self.assertAlmostEqual(tensor.to_numpy(t)[0, 1], 5.43, 5)
+        self.assertAlmostEqual(tensor.to_numpy(t)[0, 2], 1.23)
+
+    def test_global_method(self):
+        t = self.t
+        t += 12.34
+        a = tensor.log(t)
+        self.assertAlmostEqual(tensor.to_numpy(a)[0, 0], math.log(12.34))
+
+    def test_random(self):
+        x = tensor.Tensor((1000,))
+        x.gaussian(1, 0.01)
+        self.assertAlmostEqual(tensor.average(x), 1, 3)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/src/proto/singa.proto b/test/singa/test_accuracy.cc
similarity index 64%
copy from src/proto/singa.proto
copy to test/singa/test_accuracy.cc
index 2fbf2db..5d337fb 100644
--- a/src/proto/singa.proto
+++ b/test/singa/test_accuracy.cc
@@ -19,11 +19,17 @@
 *
 *************************************************************/
 
-package singa;
+#include "gtest/gtest.h"
+#include "singa/model/metric.h"
 
-message SingaProto {
-  // ip/hostname:port[,ip/hostname:port]
-  optional string zookeeper_host = 1 [default = "localhost:2181"];
-  // log dir for singa binary and job information(job id, host list, pid list)
-  optional string log_dir = 2 [default = "/tmp/singa-log/"];
+TEST(Accuracy, Compute) {
+  singa::Accuracy acc;
+  singa::Tensor p(singa::Shape{2, 3});
+  singa::Tensor t(singa::Shape{2}, singa::kInt);
+  const float pdat[6] = {0.1, 0.3, 0.6, 0.3, 0.2, 0.5};
+  const int tdat[2] = {1, 2};  // one wrong, one correct
+  p.CopyDataFromHostPtr(pdat, sizeof(pdat) / sizeof(float));
+  t.CopyDataFromHostPtr(tdat, sizeof(tdat) / sizeof(int));
+  float a = acc.Evaluate(p, t);
+  EXPECT_FLOAT_EQ(a, 0.5f);
 }
diff --git a/test/singa/test_activation.cc b/test/singa/test_activation.cc
new file mode 100644
index 0000000..bb8ad84
--- /dev/null
+++ b/test/singa/test_activation.cc
@@ -0,0 +1,136 @@
+/************************************************************
+*
+* Licensed to the Apache Software Foundation (ASF) under one
+* or more contributor license agreements.  See the NOTICE file
+* distributed with this work for additional information
+* regarding copyright ownership.  The ASF licenses this file
+* to you under the Apache License, Version 2.0 (the
+* "License"); you may not use this file except in compliance
+* with the License.  You may obtain a copy of the License at
+*
+*   http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing,
+* software distributed under the License is distributed on an
+* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+* KIND, either express or implied.  See the License for the
+* specific language governing permissions and limitations
+* under the License.
+*
+*************************************************************/
+
+#include "../src/model/layer/activation.h"
+#include "gtest/gtest.h"
+#include <math.h> // exp, tanh
+
+using singa::Activation;
+using singa::Shape;
+TEST(Activation, Setup) {
+  Activation acti;
+  // EXPECT_EQ("Activation", acti.layer_type());
+
+  singa::LayerConf conf;
+  conf.set_type("singa_relu");
+  singa::ReLUConf* reluconf = conf.mutable_relu_conf();
+  reluconf->set_negative_slope(0.5);
+
+  acti.Setup(Shape{3}, conf);
+  EXPECT_EQ("relu", acti.Mode());
+  EXPECT_EQ(0.5f, acti.Negative_slope());
+}
+
+TEST(Activation, Forward) {
+  const float x[] = {1.0f, 2.0f, 3.0f, -2.0f, -3.0f, -4.0};
+  size_t n = sizeof(x) / sizeof(float);
+  singa::Tensor in(singa::Shape{n});
+  in.CopyDataFromHostPtr<float>(x, n);
+
+  float neg_slope = 0.5f;
+  std::string types[] = {"singa_sigmoid", "singa_tanh", "singa_relu"};
+  for (int j = 0; j < 3; j++) {
+    Activation acti;
+    singa::LayerConf conf;
+    std::string layertype = types[j];
+    conf.set_type(layertype);
+    if (layertype == "relu") {
+      singa::ReLUConf* reluconf = conf.mutable_relu_conf();
+      reluconf->set_negative_slope(neg_slope);
+    }
+    acti.Setup(Shape{n}, conf);
+
+    singa::Tensor out = acti.Forward(singa::kTrain, in);
+
+    const float* yptr = out.data<float>();
+    EXPECT_EQ(n, out.Size());
+
+    float* y = new float[n];
+    if (acti.Mode() == "sigmoid") {
+      for (size_t i = 0; i < n; i++)
+        y[i] = 1.f / (1.f + exp(-x[i]));
+    }
+    else if (acti.Mode() == "tanh") {
+      for (size_t i = 0; i < n; i++)
+        y[i] = tanh(x[i]);
+    }
+    else if (acti.Mode() == "relu") {
+      for (size_t i = 0; i < n; i++)
+        y[i] = (x[i] >= 0.f) ? x[i] : 0.f;
+    }
+    else
+      LOG(FATAL) << "Unkown activation: " << acti.Mode();
+    EXPECT_FLOAT_EQ(y[0], yptr[0]);
+    EXPECT_FLOAT_EQ(y[4], yptr[4]);
+    EXPECT_FLOAT_EQ(y[5], yptr[5]);
+    delete[] y;
+  }
+}
+
+TEST(Activation, Backward) {
+  const float x[] = {1.0f, 2.0f, 3.0f, -2.0f, -3.0f, -4.0};
+  size_t n = sizeof(x) / sizeof(float);
+  singa::Tensor in(singa::Shape{n});
+  in.CopyDataFromHostPtr<float>(x, n);
+
+  float neg_slope = 0.5f;
+  std::string types[] = {"singa_sigmoid", "singa_tanh", "singa_relu"};
+  for (int j = 0; j < 3; j++) {
+    Activation acti;
+    singa::LayerConf conf;
+    std::string layertype = types[j];
+    conf.set_type(layertype);
+    if (layertype == "relu") {
+      singa::ReLUConf* reluconf = conf.mutable_relu_conf();
+      reluconf->set_negative_slope(neg_slope);
+    }
+    acti.Setup(Shape{n}, conf);
+
+    singa::Tensor out = acti.Forward(singa::kTrain, in);
+    const float* yptr = out.data<float>();
+
+    const float grad[] = {2.0f, -3.0f, 1.0f, 3.0f, -1.0f, -2.0};
+    singa::Tensor out_diff(singa::Shape{n});
+    out_diff.CopyDataFromHostPtr<float>(grad, n);
+    const auto in_diff = acti.Backward(singa::kTrain, out_diff);
+    const float* xptr = in_diff.first.data<float>();
+
+    float* dx = new float[n];
+    if (acti.Mode() == "sigmoid") {
+      for (size_t i = 0; i < n; i++)
+        dx[i] = grad[i] * yptr[i] * (1. - yptr[i]);
+    }
+    else if (acti.Mode() == "tanh") {
+      for (size_t i = 0; i < n; i++)
+        dx[i] = grad[i] * (1 - yptr[i] * yptr[i]);
+    }
+    else if (acti.Mode() == "relu") {
+      for (size_t i = 0; i < n; i++)
+        dx[i] = grad[i] * (x[i] > 0.f) + acti.Negative_slope() * (x[i] <= 0.f);
+    }
+    else
+      LOG(FATAL) << "Unkown activation: " << acti.Mode();
+    EXPECT_FLOAT_EQ(dx[0], xptr[0]);
+    EXPECT_FLOAT_EQ(dx[4], xptr[4]);
+    EXPECT_FLOAT_EQ(dx[5], xptr[5]);
+    delete[] dx;
+  }
+}
diff --git a/test/singa/test_adagrad.cc b/test/singa/test_adagrad.cc
new file mode 100644
index 0000000..f12ec68
--- /dev/null
+++ b/test/singa/test_adagrad.cc
@@ -0,0 +1,96 @@
+/************************************************************
+*
+* Licensed to the Apache Software Foundation (ASF) under one
+* or more contributor license agreements.  See the NOTICE file
+* distributed with this work for additional information
+* regarding copyright ownership.  The ASF licenses this file
+* to you under the Apache License, Version 2.0 (the
+* "License"); you may not use this file except in compliance
+* with the License.  You may obtain a copy of the License at
+*
+*   http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing,
+* software distributed under the License is distributed on an
+* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+* KIND, either express or implied.  See the License for the
+* specific language governing permissions and limitations
+* under the License.
+*
+*************************************************************/
+
+#include "gtest/gtest.h"
+#include "singa/model/optimizer.h"
+#include "singa/singa_config.h"
+#include <cmath>
+
+TEST(AdaGrad, ApplyCPU) {
+  singa::AdaGrad adagrad;
+  float lr = 0.1f;
+  const float v[4] = {0.1, 0.2, 0.3, 0.4};
+  const float g[4] = {0.01, 0.02, 0.03, 0.04};
+
+  singa::Tensor value(singa::Shape{4}), grad(singa::Shape{4});
+  value.CopyDataFromHostPtr(v, 4);
+  grad.CopyDataFromHostPtr(g, 4);
+
+  singa::OptimizerConf conf;
+  adagrad.Setup(conf);
+  adagrad.Apply(0, lr, "xx", grad, value);
+
+  singa::Tensor v1 = value.Clone();
+  const float* newv1 = v1.data<float>();
+  float history[4];
+  for (int i = 0; i < 4; ++i) history[i] = g[i] * g[i];
+  for (int i = 0; i < 4; ++i)
+    EXPECT_NEAR(newv1[i], v[i] - lr * g[i] / sqrt(history[i] + conf.delta()),
+                1e-5);
+
+  grad.CopyDataFromHostPtr(g, 4);
+  adagrad.Apply(1, lr, "xx", grad, value);
+  singa::Tensor v2 = value.Clone();
+  const float* newv2 = v2.data<float>();
+  for (int i = 0; i < 4; ++i) history[i] += g[i] * g[i];
+
+  for (int i = 0; i < 4; ++i)
+    EXPECT_NEAR(newv2[i],
+                newv1[i] - lr * g[i] / sqrt(history[i] + conf.delta()), 1e-5);
+}
+
+#ifdef USE_CUDA
+TEST(AdaGrad, ApplyCUDA) {
+  singa::AdaGrad adagrad;
+  float lr = 0.1f;
+  const float v[4] = {0.1, 0.2, 0.3, 0.4};
+  const float g[4] = {0.01, 0.02, 0.03, 0.04};
+
+  auto dev = std::make_shared<singa::CudaGPU>();
+  singa::Tensor value(singa::Shape{4}, dev), grad(singa::Shape{4}, dev);
+  value.CopyDataFromHostPtr(v, 4);
+  grad.CopyDataFromHostPtr(g, 4);
+
+  singa::OptimizerConf conf;
+  adagrad.Setup(conf);
+  adagrad.Apply(0, lr, "xx", grad, value);
+
+  singa::Tensor v1 = value.Clone();
+  v1.ToHost();
+  const float* newv1 = v1.data<float>();
+  float history[4];
+  for (int i = 0; i < 4; ++i) history[i] = g[i] * g[i];
+  for (int i = 0; i < 4; ++i)
+    EXPECT_NEAR(newv1[i], v[i] - lr * g[i] / sqrt(history[i] + conf.delta()),
+                1e-5);
+
+  grad.CopyDataFromHostPtr(g, 4);
+  adagrad.Apply(1, lr, "xx", grad, value);
+  singa::Tensor v2 = value.Clone();
+  v2.ToHost();
+  const float* newv2 = v2.data<float>();
+  for (int i = 0; i < 4; ++i) history[i] += g[i] * g[i];
+
+  for (int i = 0; i < 4; ++i)
+    EXPECT_FLOAT_EQ(newv2[i],
+                    newv1[i] - lr * g[i] / sqrt(history[i] + conf.delta()));
+}
+#endif
diff --git a/test/singa/test_batchnorm.cc b/test/singa/test_batchnorm.cc
new file mode 100644
index 0000000..a61f6f3
--- /dev/null
+++ b/test/singa/test_batchnorm.cc
@@ -0,0 +1,132 @@
+/*********************************************************
+*
+* Licensed to the Apache Software Foundation (ASF) under one
+* or more contributor license agreements.  See the NOTICE file
+* distributed with this work for additional information
+* regarding copyright ownership.  The ASF licenses this file
+* to you under the Apache License, Version 2.0 (the
+* "License"); you may not use this file except in compliance
+* with the License.  You may obtain a copy of the License at
+*
+*   http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing,
+* software distributed under the License is distributed on an
+* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+* KIND, either express or implied.  See the License for the
+* specific language governing permissions and limitations
+* under the License.
+*
+************************************************************/
+
+#include "../src/model/layer/batchnorm.h"
+#include "gtest/gtest.h"
+#include <iostream>
+
+using namespace singa;
+
+TEST(BatchNorm, Setup) {
+  BatchNorm batchnorm;
+  // EXPECT_EQ("BatchNorm", batchnorm.layer_type());
+
+  singa::LayerConf conf;
+  singa::BatchNormConf *batchnorm_conf = conf.mutable_batchnorm_conf();
+  batchnorm_conf->set_factor(0.01);
+  batchnorm.Setup(Shape{2, 4, 4}, conf);
+
+  EXPECT_FLOAT_EQ(0.01, batchnorm.factor());
+  EXPECT_EQ(2u, batchnorm.channels());
+  EXPECT_EQ(4u, batchnorm.height());
+  EXPECT_EQ(4u, batchnorm.width());
+}
+
+TEST(BatchNorm, Forward) {
+  BatchNorm batchnorm;
+  const float x[] = {1, 2, 3, 4};
+  Tensor in(Shape{2, 1, 2, 1});
+  in.CopyDataFromHostPtr(x, 2 * 1 * 2 * 1);
+  const float alpha_[] = {1, 1};
+  Tensor alpha(Shape{1, 2});
+  alpha.CopyDataFromHostPtr(alpha_, 1 * 2);
+
+  const float beta_[] = {2, 2};
+  Tensor beta(Shape{1, 2});
+  beta.CopyDataFromHostPtr(beta_, 1 * 2);
+  singa::LayerConf conf;
+  singa::BatchNormConf *batchnorm_conf = conf.mutable_batchnorm_conf();
+  batchnorm_conf->set_factor(1);
+  batchnorm.Setup(Shape{1, 2, 1}, conf);
+  batchnorm.set_bnScale(alpha);
+  batchnorm.set_bnBias(beta);
+  batchnorm.set_runningMean(beta);
+  batchnorm.set_runningVariance(beta);
+  Tensor out = batchnorm.Forward(kTrain, in);
+  const float *outptr = out.data<float>();
+  const auto &shape = out.shape();
+  EXPECT_EQ(4u, shape.size());
+  EXPECT_EQ(2u, shape[0]);
+  EXPECT_EQ(1u, shape[1]);
+  EXPECT_EQ(2u, shape[2]);
+  EXPECT_EQ(1u, shape[3]);
+  EXPECT_NEAR(1.0f, outptr[0], 1e-4f);
+  EXPECT_NEAR(1.0f, outptr[1], 1e-4f);
+  EXPECT_NEAR(3.0f, outptr[2], 1e-4f);
+  EXPECT_NEAR(3.0f, outptr[3], 1e-4f);
+}
+
+TEST(BatchNorm, Backward) {
+  BatchNorm batchnorm;
+  const float x[] = {1, 2, 3, 4};
+  Tensor in(Shape{2, 1, 2, 1});
+  in.CopyDataFromHostPtr(x, 2 * 1 * 2 * 1);
+  const float dy[] = {4, 3, 2, 1};
+  Tensor dy_in(Shape{2, 1, 2, 1});
+  dy_in.CopyDataFromHostPtr(dy, 2 * 1 * 2 * 1);
+  const float alpha_[] = {1, 1};
+  Tensor alpha(Shape{1, 2});
+  alpha.CopyDataFromHostPtr(alpha_, 1 * 2);
+
+  const float beta_[] = {0, 0};
+  Tensor beta(Shape{1, 2});
+  beta.CopyDataFromHostPtr(beta_, 1 * 2);
+  singa::LayerConf conf;
+  singa::BatchNormConf *batchnorm_conf = conf.mutable_batchnorm_conf();
+  batchnorm_conf->set_factor(1);
+  batchnorm.Setup(Shape{1, 2, 1}, conf);
+  batchnorm.set_bnScale(alpha);
+  batchnorm.set_bnBias(beta);
+  batchnorm.set_runningMean(beta);
+  batchnorm.set_runningVariance(beta);
+  Tensor out = batchnorm.Forward(kTrain, in);
+  auto ret = batchnorm.Backward(kTrain, dy_in);
+  Tensor dx = ret.first;
+  const auto & shape = dx.shape();
+  EXPECT_EQ(4u, shape.size());
+  EXPECT_EQ(2u, shape[0]);
+  EXPECT_EQ(1u, shape[1]);
+  EXPECT_EQ(2u, shape[2]);
+  EXPECT_EQ(1u, shape[3]);
+  const float *dxptr = ret.first.data<float>();
+  EXPECT_NEAR(.0f, dxptr[0], 1e-4f);
+  EXPECT_NEAR(.0f, dxptr[1], 1e-4f);
+  EXPECT_NEAR(.0f, dxptr[2], 1e-4f);
+  EXPECT_NEAR(.0f, dxptr[3], 1e-4f);
+
+  Tensor dbnScale = ret.second.at(0);
+  const float *dbnScaleptr = dbnScale.data<float>();
+  const auto & dbnScaleShape = dbnScale.shape();
+  EXPECT_EQ(1u, dbnScaleShape.size());
+  EXPECT_EQ(2u, dbnScaleShape[0]);
+
+  EXPECT_NEAR(-2.0f, dbnScaleptr[0], 1e-4f);
+  EXPECT_NEAR(-2.0f, dbnScaleptr[1], 1e-4f);
+
+  Tensor dbnBias = ret.second.at(1);
+  const float *dbnBiasptr = dbnBias.data<float>();
+  const auto & dbnBiasShape = dbnBias.shape();
+  EXPECT_EQ(1u, dbnBiasShape.size());
+  EXPECT_EQ(2u, dbnBiasShape[0]);
+
+  EXPECT_NEAR(6.0f, dbnBiasptr[0], 1e-4f);
+  EXPECT_NEAR(4.0f, dbnBiasptr[1], 1e-4f);
+}
diff --git a/test/singa/test_binfile_rw.cc b/test/singa/test_binfile_rw.cc
new file mode 100644
index 0000000..53c29fa
--- /dev/null
+++ b/test/singa/test_binfile_rw.cc
@@ -0,0 +1,133 @@
+/************************************************************
+*
+* Licensed to the Apache Software Foundation (ASF) under one
+* or more contributor license agreements.  See the NOTICE file
+* distributed with this work for additional information
+* regarding copyright ownership.  The ASF licenses this file
+* to you under the Apache License, Version 2.0 (the
+* "License"); you may not use this file except in compliance
+* with the License.  You may obtain a copy of the License at
+*
+*   http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing,
+* software distributed under the License is distributed on an
+* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+* KIND, either express or implied.  See the License for the
+* specific language governing permissions and limitations
+* under the License.
+*
+*************************************************************/
+
+#include "../include/singa/io/reader.h"
+#include "../include/singa/io/writer.h"
+#include "gtest/gtest.h"
+
+const char* path_bin = "./binfile_test";
+using singa::io::BinFileReader;
+using singa::io::BinFileWriter;
+TEST(BinFileWriter, Create) {
+  BinFileWriter writer;
+  bool ret;
+  ret = writer.Open(path_bin, singa::io::kCreate);
+  EXPECT_EQ(true, ret);
+
+  std::string key = "";
+  std::string value = "\nThis is a test for binfile io.";
+  ret = writer.Write(key, value);
+  EXPECT_EQ(true, ret);
+
+  ret = writer.Write(key, value);
+  EXPECT_EQ(true, ret);
+
+  writer.Flush();
+  writer.Close();
+}
+
+TEST(BinFileWriter, Append) {
+  BinFileWriter writer;
+  bool ret;
+  ret = writer.Open(path_bin, singa::io::kAppend, 20971520);
+  EXPECT_EQ(true, ret);
+
+  std::string key = "1";
+  std::string value = "\nThis is another test for binfile io.";
+  ret = writer.Write(key, value);
+  EXPECT_EQ(true, ret);
+
+  key = "2";
+  value = "\nThis is another test for binfile io.";
+  ret = writer.Write(key, value);
+  EXPECT_EQ(true, ret);
+
+  writer.Flush();
+  writer.Close();
+}
+
+TEST(BinFileReader, Read) {
+  BinFileReader reader;
+  bool ret;
+  ret = reader.Open(path_bin);
+  EXPECT_EQ(true, ret);
+
+  int cnt = reader.Count();
+  EXPECT_EQ(4, cnt);
+
+  std::string key, value;
+  reader.Read(&key, &value);
+  EXPECT_STREQ("", key.c_str());
+  EXPECT_STREQ("\nThis is a test for binfile io.", value.c_str());
+
+  reader.Read(&key, &value);
+  EXPECT_STREQ("", key.c_str());
+  EXPECT_STREQ("\nThis is a test for binfile io.", value.c_str());
+
+  reader.Read(&key, &value);
+  EXPECT_STREQ("1", key.c_str());
+  EXPECT_STREQ("\nThis is another test for binfile io.", value.c_str());
+
+  reader.Read(&key, &value);
+  EXPECT_STREQ("2", key.c_str());
+  EXPECT_STREQ("\nThis is another test for binfile io.", value.c_str());
+
+  reader.Close();
+}
+
+TEST(BinFileReader, SeekToFirst) {
+  BinFileReader reader;
+  bool ret;
+  ret = reader.Open(path_bin);
+  EXPECT_EQ(true, ret);
+
+  int cnt = reader.Count();
+  EXPECT_EQ(4, cnt);
+
+  std::string key, value;
+  reader.Read(&key, &value);
+  EXPECT_STREQ("", key.c_str());
+  EXPECT_STREQ("\nThis is a test for binfile io.", value.c_str());
+
+  reader.Read(&key, &value);
+  EXPECT_STREQ("", key.c_str());
+  EXPECT_STREQ("\nThis is a test for binfile io.", value.c_str());
+
+  reader.SeekToFirst();
+  reader.Read(&key, &value);
+  EXPECT_STREQ("", key.c_str());
+  EXPECT_STREQ("\nThis is a test for binfile io.", value.c_str());
+
+  reader.Read(&key, &value);
+  EXPECT_STREQ("", key.c_str());
+  EXPECT_STREQ("\nThis is a test for binfile io.", value.c_str());
+
+  reader.Read(&key, &value);
+  EXPECT_STREQ("1", key.c_str());
+  EXPECT_STREQ("\nThis is another test for binfile io.", value.c_str());
+
+  reader.Read(&key, &value);
+  EXPECT_STREQ("2", key.c_str());
+  EXPECT_STREQ("\nThis is another test for binfile io.", value.c_str());
+
+  reader.Close();
+  remove(path_bin);
+}
diff --git a/src/proto/singa.proto b/test/singa/test_channel.cc
similarity index 66%
copy from src/proto/singa.proto
copy to test/singa/test_channel.cc
index 2fbf2db..68b0017 100644
--- a/src/proto/singa.proto
+++ b/test/singa/test_channel.cc
@@ -19,11 +19,21 @@
 *
 *************************************************************/
 
-package singa;
+#include "gtest/gtest.h"
+#include "singa/utils/channel.h"
 
-message SingaProto {
-  // ip/hostname:port[,ip/hostname:port]
-  optional string zookeeper_host = 1 [default = "localhost:2181"];
-  // log dir for singa binary and job information(job id, host list, pid list)
-  optional string log_dir = 2 [default = "/tmp/singa-log/"];
+TEST(Channel, InitChannel) {
+  singa::InitChannel("");
+  singa::SetChannelDirectory("/tmp");
+}
+
+TEST(Channel, SendStringToFile) {
+  singa::Channel* chn = singa::GetChannel("test_channel");
+  chn->Send("test to file");
+}
+
+TEST(Channel, SendStringToFileAndStderr) {
+  singa::Channel* chn = singa::GetChannel("test_channel");
+  chn->EnableDestStderr(true);
+  chn->Send("test to both file and stderr");
 }
diff --git a/test/singa/test_convolution.cc b/test/singa/test_convolution.cc
new file mode 100644
index 0000000..4cfb38d
--- /dev/null
+++ b/test/singa/test_convolution.cc
@@ -0,0 +1,208 @@
+/************************************************************
+*
+* Licensed to the Apache Software Foundation (ASF) under one
+* or more contributor license agreements.  See the NOTICE file
+* distributed with this work for additional information
+* regarding copyright ownership.  The ASF licenses this file
+* to you under the Apache License, Version 2.0 (the
+* "License"); you may not use this file except in compliance
+* with the License.  You may obtain a copy of the License at
+*
+*   http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing,
+* software distributed under the License is distributed on an
+* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+* KIND, either express or implied.  See the License for the
+* specific language governing permissions and limitations
+* under the License.
+*
+*************************************************************/
+#include "singa/singa_config.h"
+
+#ifdef USE_CBLAS
+#include "../src/model/layer/convolution.h"
+
+#include "gtest/gtest.h"
+
+using singa::Convolution;
+using singa::Shape;
+TEST(Convolution, Setup) {
+  Convolution conv;
+  // EXPECT_EQ("Convolution", conv.layer_type());
+
+  singa::LayerConf conf;
+  singa::ConvolutionConf *convconf = conf.mutable_convolution_conf();
+  convconf->set_kernel_h(2);
+  convconf->set_kernel_w(2);
+  convconf->set_pad_h(1);
+  convconf->set_pad_w(1);
+  convconf->set_stride_h(1);
+  convconf->set_stride_w(1);
+  convconf->set_num_output(2);
+  convconf->set_bias_term(true);
+  conv.Setup(Shape{1, 3, 3}, conf);
+
+  EXPECT_EQ(2u, conv.kernel_h());
+  EXPECT_EQ(2u, conv.kernel_w());
+  EXPECT_EQ(1u, conv.pad_h());
+  EXPECT_EQ(1u, conv.pad_w());
+  EXPECT_EQ(1u, conv.stride_h());
+  EXPECT_EQ(1u, conv.stride_w());
+  EXPECT_EQ(2u, conv.num_filters());
+  EXPECT_EQ(true, conv.bias_term());
+  EXPECT_EQ(1u, conv.channels());
+  EXPECT_EQ(3u, conv.height());
+  EXPECT_EQ(3u, conv.width());
+}
+
+TEST(Convolution, Forward) {
+  const size_t batchsize = 2, c = 1, h = 3, w = 3;
+  const float x[batchsize * c * h * w] = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f,
+                                          7.0f, 8.0f, 9.0f, 1.0f, 2.0f, 3.0f,
+                                          4.0f, 5.0f, 6.0f, 7.0f, 8.0f, 9.0f};
+  singa::Tensor in(singa::Shape{batchsize, c, h, w});
+  in.CopyDataFromHostPtr(x, batchsize * c * h * w);
+
+  // Set weight and bias manually
+  const size_t num_filters = 1;
+  const size_t col_height = 1 * 3 * 3;  // channels * kernel_w * kernel_h
+  const float we[num_filters * col_height] = {1.0f,  1.0f, 0.0f, 0.0f, 0.0f,
+                                              -1.0f, 0.0f, 1.0f, 0.0f};
+  singa::Tensor weight(singa::Shape{num_filters, col_height});
+  weight.CopyDataFromHostPtr(we, num_filters * col_height);
+  const float b[num_filters] = {1.0f};
+  singa::Tensor bias(singa::Shape{num_filters});
+  bias.CopyDataFromHostPtr(b, num_filters);
+  Convolution conv;
+  conv.set_weight(weight);
+  conv.set_bias(bias);
+
+  singa::LayerConf conf;
+  singa::ConvolutionConf *convconf = conf.mutable_convolution_conf();
+  convconf->set_kernel_h(3);
+  convconf->set_kernel_w(3);
+  convconf->set_pad_h(1);
+  convconf->set_pad_w(1);
+  convconf->set_stride_h(2);
+  convconf->set_stride_w(2);
+  convconf->set_num_output(1);
+  convconf->set_bias_term(true);
+  conv.Setup(Shape{1, 3, 3}, conf);
+
+  // Parameter "flag" does not influence convolution
+  singa::Tensor out1 = conv.Forward(singa::kTrain, in);
+  const float *outptr1 = out1.data<float>();
+  // Input: 3*3; kernel: 3*3; stride: 2*2; padding: 1*1.
+  EXPECT_EQ(8u, out1.Size());
+
+  EXPECT_EQ(3.0f, outptr1[0]);
+  EXPECT_EQ(7.0f, outptr1[1]);
+  EXPECT_EQ(-3.0f, outptr1[2]);
+  EXPECT_EQ(12.0f, outptr1[3]);
+  EXPECT_EQ(3.0f, outptr1[4]);
+  EXPECT_EQ(7.0f, outptr1[5]);
+  EXPECT_EQ(-3.0f, outptr1[6]);
+  EXPECT_EQ(12.0f, outptr1[7]);
+}
+
+TEST(Convolution, Backward) {
+  // src_data
+  const size_t batchsize = 2, c = 1, src_h = 3, src_w = 3;
+  const float x[batchsize * c * src_h * src_w] = {
+      1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f, 9.0f,
+      1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f, 9.0f};
+  singa::Tensor in(singa::Shape{batchsize, c, src_h, src_w});
+  in.CopyDataFromHostPtr(x, batchsize * c * src_h * src_w);
+
+  // Set weight_ and bias_ manually
+  const size_t num_filters = 1;
+  const size_t col_height = 1 * 3 * 3;  // channels * kernel_w * kernel_h
+  const float we[num_filters * col_height] = {1.0f,  1.0f, 0.0f, 0.0f, 0.0f,
+                                              -1.0f, 0.0f, 1.0f, 0.0f};
+  singa::Tensor weight(singa::Shape{num_filters, col_height});
+  weight.CopyDataFromHostPtr(we, num_filters * col_height);
+  const float b[num_filters] = {1.0f};
+  singa::Tensor bias(singa::Shape{num_filters});
+  bias.CopyDataFromHostPtr(b, num_filters);
+  Convolution conv;
+  conv.set_weight(weight);
+  conv.set_bias(bias);
+
+  singa::LayerConf conf;
+  singa::ConvolutionConf *convconf = conf.mutable_convolution_conf();
+  convconf->set_kernel_h(3);
+  convconf->set_kernel_w(3);
+  convconf->set_pad_h(1);
+  convconf->set_pad_w(1);
+  convconf->set_stride_h(2);
+  convconf->set_stride_w(2);
+  convconf->set_num_output(1);
+  convconf->set_bias_term(true);
+  convconf->set_workspace_byte_limit(256);
+  convconf->set_prefer("fastest");
+  conv.Setup(Shape{1, 3, 3}, conf);
+
+  singa::Tensor out1 = conv.Forward(singa::kTrain, in);
+
+  // grad
+  const size_t grad_h = 2, grad_w = 2;
+  const float dy[batchsize * num_filters * grad_h * grad_w] = {
+      0.1f, 0.2f, 0.3f, 0.4f, 0.1f, 0.2f, 0.3f, 0.4f};
+  singa::Tensor grad(singa::Shape{batchsize, num_filters, grad_h, grad_w});
+  grad.CopyDataFromHostPtr(dy, batchsize * num_filters * grad_h * grad_w);
+
+  const auto ret = conv.Backward(singa::kTrain, grad);
+  singa::Tensor in_grad = ret.first;
+  const float *dx = in_grad.data<float>();
+  const float *wptr = we;
+  EXPECT_EQ(18u, in_grad.Size());
+  EXPECT_EQ(dy[0] * wptr[4], dx[0]);
+  EXPECT_EQ(dy[0] * wptr[5] + dy[1] * wptr[3], dx[1]);
+  EXPECT_EQ(dy[1] * wptr[4], dx[2]);
+  EXPECT_EQ(dy[0] * wptr[7] + dy[2] * wptr[1], dx[3]);
+  EXPECT_EQ(
+      dy[0] * wptr[8] + dy[1] * wptr[6] + dy[2] * wptr[2] + dy[3] * wptr[0],
+      dx[4]);
+  EXPECT_EQ(dy[1] * wptr[7] + dy[3] * wptr[1], dx[5]);
+  EXPECT_EQ(dy[2] * wptr[4], dx[6]);
+  EXPECT_EQ(dy[2] * wptr[5] + dy[3] * wptr[3], dx[7]);
+  EXPECT_EQ(dy[3] * wptr[4], dx[8]);
+  EXPECT_EQ(dy[4] * wptr[4], dx[9]);
+  EXPECT_EQ(dy[4] * wptr[5] + dy[1] * wptr[3], dx[10]);
+  EXPECT_EQ(dy[5] * wptr[4], dx[11]);
+  EXPECT_EQ(dy[4] * wptr[7] + dy[2] * wptr[1], dx[12]);
+  EXPECT_EQ(
+      dy[4] * wptr[8] + dy[5] * wptr[6] + dy[6] * wptr[2] + dy[7] * wptr[0],
+      dx[13]);
+  EXPECT_EQ(dy[5] * wptr[7] + dy[7] * wptr[1], dx[14]);
+  EXPECT_EQ(dy[6] * wptr[4], dx[15]);
+  EXPECT_EQ(dy[6] * wptr[5] + dy[7] * wptr[3], dx[16]);
+  EXPECT_EQ(dy[7] * wptr[4], dx[17]);
+
+  singa::Tensor dw = ret.second[0];
+  singa::Tensor db = ret.second[1];
+  const float *dbptr = db.data<float>();
+  EXPECT_FLOAT_EQ(dy[0] + dy[1] + dy[2] + dy[3] + dy[4] + dy[5] + dy[6] + dy[7],
+                  dbptr[0]);
+
+  const float *dwptr = dw.data<float>();
+  EXPECT_EQ(9u, dw.Size());
+  EXPECT_FLOAT_EQ(dy[3] * x[4] + dy[7] * x[13], dwptr[0]);
+  EXPECT_FLOAT_EQ(dy[3] * x[5] + dy[7] * x[14] + dy[2] * x[3] + dy[6] * x[12],
+                  dwptr[1]);
+  EXPECT_FLOAT_EQ(dy[2] * x[4] + dy[6] * x[13], dwptr[2]);
+  EXPECT_FLOAT_EQ(dy[1] * x[1] + dy[5] * x[10] + dy[3] * x[7] + dy[7] * x[16],
+                  dwptr[3]);
+  EXPECT_FLOAT_EQ(dy[0] * x[0] + dy[4] * x[9] + dy[1] * x[2] + dy[5] * x[11] +
+                      dy[2] * x[6] + dy[6] * x[15] + dy[3] * x[8] +
+                      dy[7] * x[17],
+                  dwptr[4]);
+  EXPECT_FLOAT_EQ(dy[0] * x[1] + dy[4] * x[10] + dy[2] * x[7] + dy[6] * x[16],
+                  dwptr[5]);
+  EXPECT_FLOAT_EQ(dy[1] * x[4] + dy[5] * x[13], dwptr[6]);
+  EXPECT_FLOAT_EQ(dy[0] * x[3] + dy[4] * x[12] + dy[1] * x[5] + dy[5] * x[14],
+                  dwptr[7]);
+  EXPECT_FLOAT_EQ(dy[0] * x[4] + dy[4] * x[13], dwptr[8]);
+}
+#endif  // USE_CBLAS
diff --git a/test/singa/test_cpp_cpu.cc b/test/singa/test_cpp_cpu.cc
new file mode 100644
index 0000000..5f3308a
--- /dev/null
+++ b/test/singa/test_cpp_cpu.cc
@@ -0,0 +1,72 @@
+/************************************************************
+*
+* Licensed to the Apache Software Foundation (ASF) under one
+* or more contributor license agreements.  See the NOTICE file
+* distributed with this work for additional information
+* regarding copyright ownership.  The ASF licenses this file
+* to you under the Apache License, Version 2.0 (the
+* "License"); you may not use this file except in compliance
+* with the License.  You may obtain a copy of the License at
+*
+*   http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing,
+* software distributed under the License is distributed on an
+* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+* KIND, either express or implied.  See the License for the
+* specific language governing permissions and limitations
+* under the License.
+*
+*************************************************************/
+
+#include "gtest/gtest.h"
+#include  "singa/core/device.h"
+#include "singa/proto/core.pb.h"
+
+using singa::CppCPU;
+using singa::Block;
+TEST(CppCPU, Constructor) {
+  CppCPU dev;
+  EXPECT_EQ(-1, dev.id());
+}
+
+TEST(CppCPU, MemoryMallocFree) {
+  CppCPU dev;
+  Block* b = dev.NewBlock(4);
+  EXPECT_NE(nullptr, b);
+  EXPECT_EQ(4u, b->size());
+  dev.FreeBlock(b);
+}
+
+TEST(CppCPU, Exec) {
+  CppCPU dev;
+  Block* b = dev.NewBlock(4);
+  int x = 1, y =3, z = 0;
+  dev.Exec([x, y, &z](singa::Context *ctx) {
+      z = x + y;
+      }, {b}, {b}, false);
+  EXPECT_EQ(x + y, z);
+  dev.FreeBlock(b);
+}
+
+TEST(CppCPU, CopyData) {
+  CppCPU dev;
+  Block* b = dev.NewBlock(4);
+  char s[] = {'a', 'b', 'c', 'x'};
+  dev.CopyDataFromHostPtr(b, s, 4);
+  const char* bstr = static_cast<const char*>(b->data());
+  EXPECT_EQ('a', bstr[0]);
+  EXPECT_EQ('b', bstr[1]);
+  EXPECT_EQ('x', bstr[3]);
+
+  Block* c = dev.NewBlock(4);
+  dev.CopyDataToFrom(c, b, 4, singa::kHostToHost, 0, 0);
+  const char* cstr = static_cast<const char*>(c->data());
+
+  EXPECT_EQ('a', cstr[0]);
+  EXPECT_EQ('b', cstr[1]);
+  EXPECT_EQ('x', cstr[3]);
+  dev.FreeBlock(b);
+  dev.FreeBlock(c);
+}
+
diff --git a/test/singa/test_cross_entropy.cc b/test/singa/test_cross_entropy.cc
new file mode 100644
index 0000000..c7fa2fb
--- /dev/null
+++ b/test/singa/test_cross_entropy.cc
@@ -0,0 +1,116 @@
+/************************************************************
+*
+* Licensed to the Apache Software Foundation (ASF) under one
+* or more contributor license agreements.  See the NOTICE file
+* distributed with this work for additional information
+* regarding copyright ownership.  The ASF licenses this file
+* to you under the Apache License, Version 2.0 (the
+* "License"); you may not use this file except in compliance
+* with the License.  You may obtain a copy of the License at
+*
+*   http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing,
+* software distributed under the License is distributed on an
+* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+* KIND, either express or implied.  See the License for the
+* specific language governing permissions and limitations
+* under the License.
+*
+*************************************************************/
+
+#include "gtest/gtest.h"
+#include "singa/core/tensor.h"
+#include "singa/core/device.h"
+#include "singa/model/loss.h"
+#include "singa/singa_config.h"
+
+using singa::Tensor;
+class TestSoftmaxCrossEntropy : public ::testing::Test {
+ protected:
+  virtual void SetUp() {
+    p.Reshape(singa::Shape{2, 4});
+    t.Reshape(singa::Shape{2, 1});
+  }
+  const float pdat[8] = {0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1};
+  const int tdat[2] = {0, 2};
+
+  singa::Tensor p, t;
+};
+
+TEST_F(TestSoftmaxCrossEntropy, CppForward) {
+  p.CopyDataFromHostPtr(pdat, 8);
+  t.AsType(singa::kInt);
+  t.CopyDataFromHostPtr(tdat, 2);
+
+  singa::SoftmaxCrossEntropy cross_entropy;
+  const Tensor& loss = cross_entropy.Forward(singa::kEval, p, t);
+  auto ldat = loss.data<float>();
+
+  const float result_test = -log(0.25);
+  EXPECT_FLOAT_EQ(ldat[0], result_test);
+  EXPECT_FLOAT_EQ(ldat[1], result_test);
+}
+
+TEST_F(TestSoftmaxCrossEntropy, CppBackward) {
+  p.CopyDataFromHostPtr(pdat, 8);
+  t.AsType(singa::kInt);
+  t.CopyDataFromHostPtr(tdat, 2);
+
+  singa::SoftmaxCrossEntropy cross_entropy;
+  cross_entropy.Forward(singa::kTrain, p, t);
+  const Tensor& grad = cross_entropy.Backward();
+
+  auto gdat = grad.data<float>();
+  EXPECT_FLOAT_EQ(gdat[0], -0.75);
+  EXPECT_FLOAT_EQ(gdat[1], 0.25);
+  EXPECT_FLOAT_EQ(gdat[2], 0.25);
+  EXPECT_FLOAT_EQ(gdat[3], 0.25);
+  EXPECT_FLOAT_EQ(gdat[4], 0.25);
+  EXPECT_FLOAT_EQ(gdat[5], 0.25);
+  EXPECT_FLOAT_EQ(gdat[6], -0.75);
+  EXPECT_FLOAT_EQ(gdat[7], 0.25);
+}
+
+#ifdef USE_CUDA
+
+TEST_F(TestSoftmaxCrossEntropy, CudaForward) {
+  singa::SoftmaxCrossEntropy cross_entropy;
+  auto dev = std::make_shared<singa::CudaGPU>();
+  p.ToDevice(dev);
+  t.ToDevice(dev);
+  p.CopyDataFromHostPtr(pdat, 8);
+  t.CopyDataFromHostPtr(tdat, 2);
+
+  Tensor loss = cross_entropy.Forward(singa::kEval, p, t);
+  loss.ToHost();
+  auto ldat = loss.data<float>();
+
+  const float result_test = -log(0.25);
+  EXPECT_FLOAT_EQ(ldat[0], result_test);
+  EXPECT_FLOAT_EQ(ldat[1], result_test);
+}
+
+TEST_F(TestSoftmaxCrossEntropy, CudaBackward) {
+  singa::SoftmaxCrossEntropy cross_entropy;
+  auto dev = std::make_shared<singa::CudaGPU>();
+  p.ToDevice(dev);
+  t.ToDevice(dev);
+  p.CopyDataFromHostPtr(pdat, 8);
+  t.CopyDataFromHostPtr(tdat, 2);
+
+  cross_entropy.Forward(singa::kTrain, p, t);
+  Tensor grad = cross_entropy.Backward();
+
+  grad.ToHost();
+  auto gdat = grad.data<float>();
+  EXPECT_FLOAT_EQ(gdat[0], -0.75);
+  EXPECT_FLOAT_EQ(gdat[1], 0.25);
+  EXPECT_FLOAT_EQ(gdat[2], 0.25);
+  EXPECT_FLOAT_EQ(gdat[3], 0.25);
+  EXPECT_FLOAT_EQ(gdat[4], 0.25);
+  EXPECT_FLOAT_EQ(gdat[5], 0.25);
+  EXPECT_FLOAT_EQ(gdat[6], -0.75);
+  EXPECT_FLOAT_EQ(gdat[7], 0.25);
+}
+#endif  // USE_CUDA
diff --git a/test/singa/test_csv.cc b/test/singa/test_csv.cc
new file mode 100644
index 0000000..d8dbe69
--- /dev/null
+++ b/test/singa/test_csv.cc
@@ -0,0 +1,60 @@
+/************************************************************
+*
+* Licensed to the Apache Software Foundation (ASF) under one
+* or more contributor license agreements.  See the NOTICE file
+* distributed with this work for additional information
+* regarding copyright ownership.  The ASF licenses this file
+* to you under the Apache License, Version 2.0 (the
+* "License"); you may not use this file except in compliance
+* with the License.  You may obtain a copy of the License at
+*
+*   http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing,
+* software distributed under the License is distributed on an
+* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+* KIND, either express or implied.  See the License for the
+* specific language governing permissions and limitations
+* under the License.
+*
+*************************************************************/
+
+#include "singa/io/encoder.h"
+#include "singa/io/decoder.h"
+#include "gtest/gtest.h"
+#include <sstream>
+#include <algorithm>
+
+using singa::Shape;
+using singa::Tensor;
+TEST(CSV, EncoderDecode) {
+  singa::CSVEncoder encoder;
+  singa::CSVDecoder decoder;
+
+  singa::DecoderConf decoder_conf;
+  decoder_conf.set_has_label(true);
+  decoder.Setup(decoder_conf);
+  EXPECT_EQ(true, decoder.has_label());
+
+  float in_data[] = {1.23, 4.5, 5.1, 3.33, 0.44};
+  std::string in_str = "2, 1.23, 4.5, 5.1, 3.33, 0.44";
+  int in_label = 2;
+  size_t size = 5;
+
+  std::vector<Tensor> input;
+  Tensor data(Shape{size}, singa::kFloat32), label(Shape{1}, singa::kInt);
+  data.CopyDataFromHostPtr<float>(in_data, size);
+  label.CopyDataFromHostPtr<int>(&in_label, 1);
+  input.push_back(data);
+  input.push_back(label);
+
+  std::string value = encoder.Encode(input);
+  in_str.erase(std::remove(in_str.begin(), in_str.end(), ' '), in_str.end());
+  EXPECT_EQ(in_str, value);
+
+  std::vector<Tensor> output = decoder.Decode(value);
+  const auto* out_data = output.at(0).data<float>();
+  const auto* out_label = output.at(1).data<int>();
+  for (size_t i = 0; i < size; i++) EXPECT_EQ(in_data[i], out_data[i]);
+  EXPECT_EQ(in_label, out_label[0]);
+}
diff --git a/test/singa/test_cudnn_activation.cc b/test/singa/test_cudnn_activation.cc
new file mode 100644
index 0000000..6a989d1
--- /dev/null
+++ b/test/singa/test_cudnn_activation.cc
@@ -0,0 +1,134 @@
+/************************************************************
+*
+* Licensed to the Apache Software Foundation (ASF) under one
+* or more contributor license agreements.  See the NOTICE file
+* distributed with this work for additional information
+* regarding copyright ownership.  The ASF licenses this file
+* to you under the Apache License, Version 2.0 (the
+* "License"); you may not use this file except in compliance
+* with the License.  You may obtain a copy of the License at
+*
+*   http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing,
+* software distributed under the License is distributed on an
+* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+* KIND, either express or implied.  See the License for the
+* specific language governing permissions and limitations
+* under the License.
+*
+*************************************************************/
+#include "singa/singa_config.h"
+#ifdef USE_CUDNN
+
+#include "singa/proto/core.pb.h"
+#include "../src/model/layer/cudnn_activation.h"
+#include "gtest/gtest.h"
+#include <math.h>  // exp tanh
+#include <cudnn.h>
+
+using singa::CudnnActivation;
+using singa::Shape;
+TEST(CudnnActivation, Setup) {
+  CudnnActivation acti;
+  // EXPECT_EQ("CudnnActivation", acti.layer_type());
+
+  singa::LayerConf conf;
+  conf.set_type("cudnn_relu");
+  singa::ReLUConf* reluconf = conf.mutable_relu_conf();
+  reluconf->set_negative_slope(0.5f);
+
+  acti.Setup(Shape{3}, conf);
+//  EXPECT_EQ(CUDNN_ACTIVATION_RELU, acti.CudnnMode());
+  EXPECT_EQ(0.5f, acti.Negative_slope());
+}
+
+TEST(CudnnActivation, Forward) {
+  const float x[] = {1.0f, 2.0f, 3.0f, -2.0f, -3.0f, -4.0};
+  size_t n = sizeof(x) / sizeof(float);
+  auto cuda = std::make_shared<singa::CudaGPU>();
+  singa::Tensor in(singa::Shape{n}, cuda);
+  in.CopyDataFromHostPtr<float>(x, n);
+
+  float neg_slope = 0.5f;
+  std::string types[] = {"cudnn_sigmoid", "cudnn_tanh", "cudnn_relu"};
+  for (int j = 0; j < 3; j++) {
+    CudnnActivation acti;
+    singa::LayerConf conf;
+    std::string layertype = types[j];
+    conf.set_type(layertype);
+    if (layertype == "relu") {
+      singa::ReLUConf* reluconf = conf.mutable_relu_conf();
+      reluconf->set_negative_slope(neg_slope);
+    }
+    acti.Setup(Shape{n}, conf);
+
+    singa::Tensor out = acti.Forward(singa::kTrain, in);
+    EXPECT_EQ(n, out.Size());
+    out.ToHost();
+    const float* yptr = out.data<float>();
+    float* y = new float[n];
+    if (acti.Mode() == "sigmoid") {
+      for (size_t i = 0; i < n; i++) y[i] = 1.f / (1.f + exp(-x[i]));
+    } else if (acti.Mode() == "tanh") {
+      for (size_t i = 0; i < n; i++) y[i] = tanh(x[i]);
+    } else if (acti.Mode() == "relu") {
+      for (size_t i = 0; i < n; i++) y[i] = (x[i] >= 0.f) ? x[i] : 0.f;
+    } else
+      LOG(FATAL) << "Unknown activation: " << acti.Mode();
+    EXPECT_FLOAT_EQ(y[0], yptr[0]);
+    EXPECT_FLOAT_EQ(y[4], yptr[4]);
+    EXPECT_FLOAT_EQ(y[5], yptr[5]);
+    delete[] y;
+  }
+}
+
+TEST(CudnnActivation, Backward) {
+  const float x[] = {2.0f, 3.0f, 3.0f, 7.f, 0.0f, 5.0, 1.5, 2.5, -2.5, 1.5};
+  size_t n = sizeof(x) / sizeof(float);
+  auto cuda = std::make_shared<singa::CudaGPU>();
+  singa::Tensor in(singa::Shape{n}, cuda);
+  in.CopyDataFromHostPtr<float>(x, n);
+  float neg_slope = 0.5f;
+  std::string types[] = {"cudnn_sigmoid", "cudnn_tanh", "cudnn_relu"};
+  for (int j = 0; j < 3; j++) {
+    CudnnActivation acti;
+    singa::LayerConf conf;
+    std::string layertype = types[j];
+    conf.set_type(layertype);
+    if (layertype == "RELU") {
+      singa::ReLUConf* reluconf = conf.mutable_relu_conf();
+      reluconf->set_negative_slope(neg_slope);
+    }
+    acti.Setup(Shape{n}, conf);
+    singa::Tensor out = acti.Forward(singa::kTrain, in);
+    EXPECT_EQ(n, out.Size());
+    out.ToHost();
+    const float* yptr = out.data<float>();
+
+    const float grad[] = {2.0f, 1.0f, 2.0f, 0.0f, -2.0f,
+                          -1.0, 1.5,  2.5,  -1.5, -2.5};
+    singa::Tensor out_diff(singa::Shape{n}, cuda);
+    out_diff.CopyDataFromHostPtr<float>(grad, n);
+    const auto ret = acti.Backward(singa::kTrain, out_diff);
+    singa::Tensor in_diff = ret.first;
+    in_diff.ToHost();
+    const float* xptr = in_diff.data<float>();
+    float* dx = new float[n];
+    if (acti.Mode() == "sigmoid") {
+      for (size_t i = 0; i < n; i++) dx[i] = grad[i] * yptr[i] * (1. - yptr[i]);
+    } else if (acti.Mode() == "tanh") {
+      for (size_t i = 0; i < n; i++) dx[i] = grad[i] * (1. - yptr[i] * yptr[i]);
+    } else if (acti.Mode() == "relu") {
+      for (size_t i = 0; i < n; i++)
+        dx[i] =
+            grad[i] * (x[i] > 0.f);  //+ acti.Negative_slope() * (x[i] <= 0.f);
+    } else
+      LOG(FATAL) << "Unknown activation: " << acti.Mode();
+    for (size_t i = 0; i < n; i++) {
+      EXPECT_NEAR(dx[i], xptr[i], 1e-7);
+    }
+    delete[] dx;
+  }
+}
+#endif  // USE_CUDNN
diff --git a/test/singa/test_cudnn_batchnorm.cc b/test/singa/test_cudnn_batchnorm.cc
new file mode 100644
index 0000000..b024c19
--- /dev/null
+++ b/test/singa/test_cudnn_batchnorm.cc
@@ -0,0 +1,240 @@
+/*********************************************************
+*
+* Licensed to the Apache Software Foundation (ASF) under one
+* or more contributor license agreements.  See the NOTICE file
+* distributed with this work for additional information
+* regarding copyright ownership.  The ASF licenses this file
+* to you under the Apache License, Version 2.0 (the
+* "License"); you may not use this file except in compliance
+* with the License.  You may obtain a copy of the License at
+*
+*   http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing,
+* software distributed under the License is distributed on an
+* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+* KIND, either express or implied.  See the License for the
+* specific language governing permissions and limitations
+* under the License.
+*
+************************************************************/
+
+#include "../src/model/layer/cudnn_batchnorm.h"
+
+#ifdef USE_CUDNN
+#include "gtest/gtest.h"
+
+using singa::CudnnBatchNorm;
+using singa::Shape;
+TEST(CudnnBatchNorm, Setup) {  // conf fields should be copied into the layer by Setup
+  CudnnBatchNorm batchnorm;
+  // EXPECT_EQ("CudnnBatchNorm", batchnorm.layer_type());
+
+  singa::LayerConf conf;
+  singa::BatchNormConf *batchnorm_conf = conf.mutable_batchnorm_conf();
+  batchnorm_conf->set_factor(0.01);
+  batchnorm.Setup(Shape{2, 4, 4}, conf);  // per-sample shape: 2 channels, 4x4 spatial
+
+  EXPECT_FLOAT_EQ(0.01, batchnorm.factor());
+  EXPECT_EQ(2u, batchnorm.channels());
+  EXPECT_EQ(4u, batchnorm.height());
+  EXPECT_EQ(4u, batchnorm.width());
+}
+
+TEST(CudnnBatchNorm, Forward) {  // compare kTrain forward output against precomputed golden values
+  CudnnBatchNorm batchnorm;
+  const float x[] = {  // 1x2x4x4 NCHW input, flattened row-major
+    0.0736655, 0.0459045, 0.0779517, 0.0771059,
+    0.0586862, 0.0561263, 0.0708457, 0.0977273,
+    0.0405025, -0.170897, 0.0208982, 0.136865,
+    -0.0367905, -0.0618205, -0.0103908, -0.0522777,
+    -0.122161, -0.025427, -0.0718576, -0.185941,
+    0.0166533, 0.178679, -0.0576606, -0.137817,
+    0.150676, 0.153442, -0.0929899, -0.148675,
+    -0.112459, -0.106284, -0.103074, -0.0668811
+  };
+  auto cuda = std::make_shared<singa::CudaGPU>();
+  singa::Tensor in(singa::Shape{1,2,4,4}, cuda);
+  in.CopyDataFromHostPtr(x, 1*2*4*4);
+  const float alpha_[] = {1, 1};  // per-channel scale = 1 so output is pure normalization
+  singa::Tensor alpha(singa::Shape{1,2,1,1}, cuda);
+  alpha.CopyDataFromHostPtr(alpha_, 1*2*1*1);
+
+  const float beta_[] = {0, 0};  // per-channel shift = 0
+  singa::Tensor beta(singa::Shape{1,2,1,1}, cuda);
+  beta.CopyDataFromHostPtr(beta_, 1*2*1*1);
+
+  singa::LayerConf conf;
+  singa::BatchNormConf *batchnorm_conf = conf.mutable_batchnorm_conf();
+  batchnorm_conf->set_factor(0.9);
+  batchnorm.Setup(Shape{2, 4, 4}, conf);
+
+  batchnorm.ToDevice(cuda);
+  batchnorm.set_bnScale(alpha);
+  batchnorm.set_bnBias(beta);
+  batchnorm.set_runningMean(beta);  // running stats initialized to zeros (beta_ is all-zero)
+  batchnorm.set_runningVariance(beta);
+  singa::Tensor out = batchnorm.Forward(singa::kTrain, in);
+  out.ToHost();
+  const float *outptr = out.data<float>();
+  const auto & shape = out.shape();
+  EXPECT_EQ(4u, shape.size());  // output keeps the NCHW input shape
+  EXPECT_EQ(1u, shape[0]);
+  EXPECT_EQ(2u, shape[1]);
+  EXPECT_EQ(4u, shape[2]);
+  EXPECT_EQ(4u, shape[3]);
+  EXPECT_NEAR(0.637092, outptr[0], 1e-4f);  // golden values, tolerance 1e-4
+  EXPECT_NEAR(0.262057, outptr[1], 1e-4f);
+  EXPECT_NEAR(0.694995, outptr[2], 1e-4f);
+  EXPECT_NEAR(0.683569, outptr[3], 1e-4f);
+  EXPECT_NEAR(0.43473, outptr[4], 1e-4f);
+  EXPECT_NEAR(0.400147, outptr[5], 1e-4f);
+  EXPECT_NEAR(0.598998, outptr[6], 1e-4f);
+  EXPECT_NEAR(0.962152, outptr[7], 1e-4f);
+  EXPECT_NEAR(0.189079, outptr[8], 1e-4f);
+  EXPECT_NEAR(-2.6668, outptr[9], 1e-4f);
+  EXPECT_NEAR(-0.0757632, outptr[10], 1e-4f);
+  EXPECT_NEAR(1.49088, outptr[11], 1e-4f);
+  EXPECT_NEAR(-0.855104, outptr[12], 1e-4f);
+  EXPECT_NEAR(-1.19324, outptr[13], 1e-4f);
+  EXPECT_NEAR(-0.498459, outptr[14], 1e-4f);
+  EXPECT_NEAR(-1.06433, outptr[15], 1e-4f);
+  EXPECT_NEAR(-0.696646, outptr[16], 1e-4f);
+  EXPECT_NEAR(0.185125, outptr[17], 1e-4f);
+  EXPECT_NEAR(-0.238109, outptr[18], 1e-4f);
+  EXPECT_NEAR(-1.27803, outptr[19], 1e-4f);
+  EXPECT_NEAR(0.568704, outptr[20], 1e-4f);
+  EXPECT_NEAR(2.04564, outptr[21], 1e-4f);
+  EXPECT_NEAR(-0.108697, outptr[22], 1e-4f);
+  EXPECT_NEAR(-0.839356, outptr[23], 1e-4f);
+  EXPECT_NEAR(1.79038, outptr[24], 1e-4f);
+  EXPECT_NEAR(1.81559, outptr[25], 1e-4f);
+  EXPECT_NEAR(-0.430738, outptr[26], 1e-4f);
+  EXPECT_NEAR(-0.938335, outptr[27], 1e-4f);
+  EXPECT_NEAR(-0.608203, outptr[28], 1e-4f);
+  EXPECT_NEAR(-0.551921, outptr[29], 1e-4f);
+  EXPECT_NEAR(-0.522658, outptr[30], 1e-4f);
+  EXPECT_NEAR(-0.192746, outptr[31], 1e-4f);
+}
+
+TEST(CudnnBatchNorm, Backward) {  // compare dx, d(scale), d(bias) against precomputed golden values
+  CudnnBatchNorm batchnorm;
+  const float x[] = {  // same 1x2x4x4 input as the Forward test
+    0.0736655, 0.0459045, 0.0779517, 0.0771059,
+    0.0586862, 0.0561263, 0.0708457, 0.0977273,
+    0.0405025, -0.170897, 0.0208982, 0.136865,
+    -0.0367905, -0.0618205, -0.0103908, -0.0522777,
+    -0.122161, -0.025427, -0.0718576, -0.185941,
+    0.0166533, 0.178679, -0.0576606, -0.137817,
+    0.150676, 0.153442, -0.0929899, -0.148675,
+    -0.112459, -0.106284, -0.103074, -0.0668811
+  };
+  auto cuda = std::make_shared<singa::CudaGPU>();
+  singa::Tensor x_tensor(singa::Shape{1,2,4,4}, cuda);
+  x_tensor.CopyDataFromHostPtr(x, 1*2*4*4);
+
+  singa::LayerConf conf;
+  singa::BatchNormConf *batchnorm_conf = conf.mutable_batchnorm_conf();
+  batchnorm_conf->set_factor(1);
+  batchnorm.Setup(Shape{2, 4, 4}, conf);
+
+  const float dy[] = {  // gradient w.r.t. the layer output, mostly sparse
+    -0.0064714, 0, 0, 0,
+    0, -0.00297655, -0.0195729, 0,
+    0, 0, 0, 0,
+    0, 0, 0, -0.0032594,
+    0, 0, 0, 0,
+    0, 0, 0.0125562, 0,
+    0.00041933, 0.000386108, -0.0074611, 0.0015929,
+    0.00468428, 0.00735506, -0.00682525, 0.00342023
+  };
+
+  singa::Tensor dy_tensor(singa::Shape{1,2,4,4}, cuda);
+  dy_tensor.CopyDataFromHostPtr(dy, 1*2*4*4);
+  const float alpha_[] = {1, 1};  // NOTE(review): params use shape {2} here vs {1,2,1,1} in the Forward test — confirm both layouts are accepted
+  singa::Tensor alpha(singa::Shape{2}, cuda);
+  alpha.CopyDataFromHostPtr(alpha_, 1*2*1*1);
+
+  const float beta_[] = {0, 0};
+  singa::Tensor beta(singa::Shape{2}, cuda);
+  beta.CopyDataFromHostPtr(beta_, 1*2*1*1);
+
+  const float mean_[] = {0.0123405, -0.0622333};  // NOTE(review): mean/var tensors are built but never passed to the layer (set_runningMean/Variance below use beta) — presumably dead code; confirm
+  singa::Tensor mean(singa::Shape{2}, cuda);
+  mean.CopyDataFromHostPtr(mean_, 1*2*1*1);
+
+  const float var_[] = {15.9948, 8.68198};
+  singa::Tensor var(singa::Shape{2}, cuda);
+  var.CopyDataFromHostPtr(var_, 1*2*1*1);
+
+  batchnorm.ToDevice(cuda);
+  batchnorm.set_bnScale(alpha);
+  batchnorm.set_bnBias(beta);
+  batchnorm.set_runningMean(beta);
+  batchnorm.set_runningVariance(beta);
+  batchnorm.Forward(singa::kTrain, x_tensor);  // forward pass caches state needed by Backward
+  const auto ret = batchnorm.Backward(singa::kTrain, dy_tensor);
+  singa::Tensor dx = ret.first;
+  dx.ToHost();
+  const float *dxptr = dx.data<float>();
+  const auto & shape = dx.shape();
+  EXPECT_EQ(4u, shape.size());  // dx matches the input shape
+  EXPECT_EQ(1u, shape[0]);
+  EXPECT_EQ(2u, shape[1]);
+  EXPECT_EQ(4u, shape[2]);
+  EXPECT_EQ(4u, shape[3]);
+  EXPECT_NEAR(-0.0528703, dxptr[0], 1e-4f);  // golden values, tolerance 1e-4
+  EXPECT_NEAR(0.0302578, dxptr[1], 1e-4f);
+  EXPECT_NEAR(0.0352178, dxptr[2], 1e-4f);
+  EXPECT_NEAR(0.0350869, dxptr[3], 1e-4f);
+  EXPECT_NEAR(0.032236, dxptr[4], 1e-4f);
+  EXPECT_NEAR(-0.00837157, dxptr[5], 1e-4f);
+  EXPECT_NEAR(-0.2303, dxptr[6], 1e-4f);
+  EXPECT_NEAR(0.0382786, dxptr[7], 1e-4f);
+  EXPECT_NEAR(0.0294217, dxptr[8], 1e-4f);
+  EXPECT_NEAR(-0.00329757, dxptr[9], 1e-4f);
+  EXPECT_NEAR(0.0263874, dxptr[10], 1e-4f);
+  EXPECT_NEAR(0.0443361, dxptr[11], 1e-4f);
+  EXPECT_NEAR(0.0174587, dxptr[12], 1e-4f);
+  EXPECT_NEAR(0.0135847, dxptr[13], 1e-4f);
+  EXPECT_NEAR(0.0215447, dxptr[14], 1e-4f);
+  EXPECT_NEAR(-0.0289709, dxptr[15], 1e-4f);
+  EXPECT_NEAR(-0.0100591, dxptr[16], 1e-4f);
+  EXPECT_NEAR(-0.00895677, dxptr[17], 1e-4f);
+  EXPECT_NEAR(-0.00948587, dxptr[18], 1e-4f);
+  EXPECT_NEAR(-0.0107859, dxptr[19], 1e-4f);
+  EXPECT_NEAR(-0.00847725, dxptr[20], 1e-4f);
+  EXPECT_NEAR(-0.0066309, dxptr[21], 1e-4f);
+  EXPECT_NEAR(0.105131, dxptr[22], 1e-4f);
+  EXPECT_NEAR(-0.0102375, dxptr[23], 1e-4f);
+  EXPECT_NEAR(-0.00312763, dxptr[24], 1e-4f);
+  EXPECT_NEAR(-0.00339895, dxptr[25], 1e-4f);
+  EXPECT_NEAR(-0.0777377, dxptr[26], 1e-4f);
+  EXPECT_NEAR(0.00415871, dxptr[27], 1e-4f);
+  EXPECT_NEAR(0.0327506, dxptr[28], 1e-4f);
+  EXPECT_NEAR(0.0571663, dxptr[29], 1e-4f);
+  EXPECT_NEAR(-0.0720566, dxptr[30], 1e-4f);
+  EXPECT_NEAR(0.0217477, dxptr[31], 1e-4f);
+
+  singa::Tensor dbnScale = ret.second.at(0);  // param gradient 0: d(scale), one value per channel
+  dbnScale.ToHost();
+  const float *dbnScaleptr = dbnScale.data<float>();
+  const auto & dbnScaleShape = dbnScale.shape();
+  EXPECT_EQ(1u, dbnScaleShape.size());
+  EXPECT_EQ(2u, dbnScaleShape[0]);
+
+  EXPECT_NEAR(-0.013569f, dbnScaleptr[0], 1e-4f);
+  EXPECT_NEAR(-0.00219431f, dbnScaleptr[1], 1e-4f);
+
+  singa::Tensor dbnBias = ret.second.at(1);  // param gradient 1: d(bias), one value per channel
+  dbnBias.ToHost();
+  const float *dbnBiasptr = dbnBias.data<float>();
+  const auto & dbnBiasShape = dbnBias.shape();
+  EXPECT_EQ(1u, dbnBiasShape.size());
+  EXPECT_EQ(2u, dbnBiasShape[0]);
+
+  EXPECT_NEAR(-0.0322803f, dbnBiasptr[0], 1e-4f);
+  EXPECT_NEAR(0.0161278f, dbnBiasptr[1], 1e-4f);
+}
+
+#endif  //  USE_CUDNN
diff --git a/test/singa/test_cudnn_convolution.cc b/test/singa/test_cudnn_convolution.cc
new file mode 100644
index 0000000..8dbee63
--- /dev/null
+++ b/test/singa/test_cudnn_convolution.cc
@@ -0,0 +1,371 @@
+/************************************************************
+*
+* Licensed to the Apache Software Foundation (ASF) under one
+* or more contributor license agreements.  See the NOTICE file
+* distributed with this work for additional information
+* regarding copyright ownership.  The ASF licenses this file
+* to you under the Apache License, Version 2.0 (the
+* "License"); you may not use this file except in compliance
+* with the License.  You may obtain a copy of the License at
+*
+*   http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing,
+* software distributed under the License is distributed on an
+* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+* KIND, either express or implied.  See the License for the
+* specific language governing permissions and limitations
+* under the License.
+*
+*************************************************************/
+#include "../src/model/layer/cudnn_convolution.h"
+#ifdef USE_CUDNN
+
+#include "gtest/gtest.h"
+
+using singa::CudnnConvolution;
+using singa::Shape;
+TEST(CudnnConvolution, Setup) {  // conf fields should be copied into the layer by Setup
+  CudnnConvolution conv;
+  // EXPECT_EQ("CudnnConvolution", conv.layer_type());
+
+  singa::LayerConf conf;
+  singa::ConvolutionConf *convconf = conf.mutable_convolution_conf();
+  convconf->set_kernel_h(2);
+  convconf->set_kernel_w(2);
+  convconf->set_pad_h(1);
+  convconf->set_pad_w(1);
+  convconf->set_stride_h(1);
+  convconf->set_stride_w(1);
+  convconf->set_num_output(2);
+  convconf->set_bias_term(true);
+  // MB
+  convconf->set_workspace_byte_limit(256);
+  convconf->set_prefer("fastest");
+  conv.Setup(Shape{1, 3, 3}, conf);  // per-sample shape: 1 channel, 3x3 spatial
+
+  EXPECT_EQ(2u, conv.kernel_h());
+  EXPECT_EQ(2u, conv.kernel_w());
+  EXPECT_EQ(1u, conv.pad_h());
+  EXPECT_EQ(1u, conv.pad_w());
+  EXPECT_EQ(1u, conv.stride_h());
+  EXPECT_EQ(1u, conv.stride_w());
+  EXPECT_EQ(2u, conv.num_filters());
+  EXPECT_EQ(true, conv.bias_term());
+  EXPECT_EQ(256u << 20, conv.workspace_byte_limit());  // conf gives MB; layer stores bytes (256 << 20)
+  EXPECT_STREQ("fastest", conv.prefer().c_str());
+  EXPECT_EQ(1u, conv.channels());
+  EXPECT_EQ(3u, conv.height());
+  EXPECT_EQ(3u, conv.width());
+}
+
+TEST(CudnnConvolution, Forward) {  // 3x3 input, 3x3 kernel, stride 2, pad 1 => 2x2 output checked exactly
+  const size_t batchsize = 1, c = 1, h = 3, w = 3;
+  const float x[batchsize * c * h * w] = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f,
+                                          6.0f, 7.0f, 8.0f, 9.0f};
+  auto cuda = std::make_shared<singa::CudaGPU>();
+  singa::Tensor in(singa::Shape{batchsize, c, h, w}, cuda);
+  in.CopyDataFromHostPtr(x, batchsize * c * h * w);
+
+  // Set weight and bias manually
+  const size_t num_filters = 1;
+  const size_t col_height = 1 * 3 * 3; // channels * kernel_w * kernel_h
+  const float we[num_filters * col_height] = {
+      1.0f, 1.0f, 0.0f, 0.0f, 0.0f, -1.0f, 0.0f, 1.0f, 0.0f};
+  singa::Tensor weight(singa::Shape{num_filters, col_height}, cuda);
+  weight.CopyDataFromHostPtr(we, col_height);
+  const float b[num_filters] = {1.0f};
+  singa::Tensor bias(singa::Shape{num_filters}, cuda);
+  bias.CopyDataFromHostPtr(b, num_filters);
+  CudnnConvolution conv;
+  conv.set_weight(weight);
+  conv.set_bias(bias);
+
+  singa::LayerConf conf;
+  singa::ConvolutionConf *convconf = conf.mutable_convolution_conf();
+  convconf->set_kernel_h(3);
+  convconf->set_kernel_w(3);
+  convconf->set_pad_h(1);
+  convconf->set_pad_w(1);
+  convconf->set_stride_h(2);
+  convconf->set_stride_w(2);
+  convconf->set_num_output(1);
+  convconf->set_bias_term(true);
+  // MB
+  convconf->set_workspace_byte_limit(256);
+  convconf->set_prefer("fastest");
+  conv.Setup(Shape{1, 3, 3}, conf);
+
+  // Parameter "flag" does not influence convolution
+  singa::Tensor out1 = conv.Forward(singa::kTrain, in);
+  out1.ToHost();
+  const float *outptr1 = out1.data<float>();
+  // Input: 3*3; kernel: 3*3; stride: 2*2; padding: 1*1.
+  EXPECT_EQ(4u, out1.Size());
+
+  EXPECT_EQ(3.0f, outptr1[0]);  // hand-computed from the weight/bias above (bias adds 1 to each)
+  EXPECT_EQ(7.0f, outptr1[1]);
+  EXPECT_EQ(-3.0f, outptr1[2]);
+  EXPECT_EQ(12.0f, outptr1[3]);
+}
+
+TEST(CudnnConvolution, Backward) {  // check dx, dW, db against hand-derived convolution gradients
+  // src_data
+  const size_t batchsize = 1, c = 1, src_h = 3, src_w = 3;
+  const float x[batchsize * c * src_h * src_w] = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f,
+                                                  6.0f, 7.0f, 8.0f, 9.0f};
+  auto cuda = std::make_shared<singa::CudaGPU>();
+  singa::Tensor in(singa::Shape{batchsize, c, src_h, src_w}, cuda);
+  in.CopyDataFromHostPtr(x, batchsize * c * src_h * src_w);
+
+  // Set weight_ and bias_ manually
+  const size_t num_filters = 1;
+  const size_t col_height = 1 * 3 * 3; // channels * kernel_w * kernel_h
+  const float we[num_filters * col_height] = {
+      1.0f, 1.0f, 0.0f, 0.0f, 0.0f, -1.0f, 0.0f, 1.0f, 0.0f};
+  singa::Tensor weight(singa::Shape{num_filters, col_height},
+                       cuda);
+  weight.CopyDataFromHostPtr(we, col_height);
+  const float b[num_filters] = {1.0f};
+  singa::Tensor bias(singa::Shape{num_filters}, cuda);
+  bias.CopyDataFromHostPtr(b, num_filters);
+  CudnnConvolution conv;
+  conv.set_weight(weight);
+  conv.set_bias(bias);
+
+  singa::LayerConf conf;
+  singa::ConvolutionConf *convconf = conf.mutable_convolution_conf();
+  convconf->set_kernel_h(3);
+  convconf->set_kernel_w(3);
+  convconf->set_pad_h(1);
+  convconf->set_pad_w(1);
+  convconf->set_stride_h(2);
+  convconf->set_stride_w(2);
+  convconf->set_num_output(1);
+  convconf->set_bias_term(true);
+  convconf->set_workspace_byte_limit(256);
+  convconf->set_prefer("fastest");
+  conv.Setup(Shape{1, 3, 3}, conf);
+
+  // Parameter "flag" does not influence convolution
+  singa::Tensor out1 = conv.Forward(singa::kTrain, in);  // forward pass caches the input for Backward
+
+  // grad
+  const size_t grad_h = 2, grad_w = 2;
+  const float dy[batchsize * num_filters * grad_h * grad_w] = {0.1f, 0.2f, 0.3f,
+                                                               0.4f};
+  singa::Tensor grad(singa::Shape{batchsize, num_filters, grad_h, grad_w},
+                     cuda);
+  grad.CopyDataFromHostPtr(dy, batchsize * num_filters * grad_h * grad_w);
+
+  const auto ret = conv.Backward(singa::kTrain, grad);
+  singa::Tensor in_grad = ret.first;
+  in_grad.ToHost();
+  const float *dx = in_grad.data<float>();
+  const float *wptr = we;
+  EXPECT_EQ(9u, in_grad.Size());  // dx has the 3x3 input shape
+  EXPECT_EQ(dy[0] * wptr[4], dx[0]);  // each dx entry = sum of dy * overlapping kernel weights
+  EXPECT_EQ(dy[0] * wptr[5] + dy[1] * wptr[3], dx[1]);
+  EXPECT_EQ(dy[1] * wptr[4], dx[2]);
+  EXPECT_EQ(dy[0] * wptr[7] + dy[2] * wptr[1], dx[3]);
+  EXPECT_EQ(
+      dy[0] * wptr[8] + dy[1] * wptr[6] + dy[2] * wptr[2] + dy[3] * wptr[0],
+      dx[4]);
+  EXPECT_EQ(dy[1] * wptr[7] + dy[3] * wptr[1], dx[5]);
+  EXPECT_EQ(dy[2] * wptr[4], dx[6]);
+  EXPECT_EQ(dy[2] * wptr[5] + dy[3] * wptr[3], dx[7]);
+  EXPECT_EQ(dy[3] * wptr[4], dx[8]);
+
+  singa::Tensor dw = ret.second[0];  // param gradients: [0] = dW, [1] = db
+  singa::Tensor db = ret.second[1];
+  dw.ToHost();
+  db.ToHost();
+  const float *dbptr = db.data<float>();
+  EXPECT_EQ(dy[0] + dy[1] + dy[2] + dy[3], dbptr[0]);  // db = sum of dy over all output positions
+
+  const float *dwptr = dw.data<float>();
+  EXPECT_EQ(9u, dw.Size());
+  EXPECT_EQ(dy[3] * x[4], dwptr[0]);  // each dW entry = sum of dy * overlapping input values
+  EXPECT_EQ(dy[3] * x[5] + dy[2] * x[3], dwptr[1]);
+  EXPECT_EQ(dy[2] * x[4], dwptr[2]);
+  EXPECT_EQ(dy[1] * x[1] + dy[3] * x[7], dwptr[3]);
+  EXPECT_FLOAT_EQ(dy[0] * x[0] + dy[1] * x[2] + dy[2] * x[6] + dy[3] * x[8],
+                  dwptr[4]);
+  EXPECT_EQ(dy[0] * x[1] + dy[2] * x[7], dwptr[5]);
+  EXPECT_EQ(dy[1] * x[4], dwptr[6]);
+  EXPECT_EQ(dy[0] * x[3] + dy[1] * x[5], dwptr[7]);
+  EXPECT_EQ(dy[0] * x[4], dwptr[8]);
+}
+// Tests for prefer=autotune
+TEST(CudnnConvolution_AT, Setup) {  // same as CudnnConvolution.Setup but with prefer="autotune"
+  CudnnConvolution conv;
+  // EXPECT_EQ("CudnnConvolution", conv.layer_type());
+
+  singa::LayerConf conf;
+  singa::ConvolutionConf *convconf = conf.mutable_convolution_conf();
+  convconf->set_kernel_h(2);
+  convconf->set_kernel_w(2);
+  convconf->set_pad_h(1);
+  convconf->set_pad_w(1);
+  convconf->set_stride_h(1);
+  convconf->set_stride_w(1);
+  convconf->set_num_output(2);
+  convconf->set_bias_term(true);
+  // MB
+  convconf->set_workspace_byte_limit(256);
+  convconf->set_prefer("autotune");
+  conv.Setup(Shape{1, 3, 3}, conf);
+
+  EXPECT_EQ(2u, conv.kernel_h());
+  EXPECT_EQ(2u, conv.kernel_w());
+  EXPECT_EQ(1u, conv.pad_h());
+  EXPECT_EQ(1u, conv.pad_w());
+  EXPECT_EQ(1u, conv.stride_h());
+  EXPECT_EQ(1u, conv.stride_w());
+  EXPECT_EQ(2u, conv.num_filters());
+  EXPECT_EQ(true, conv.bias_term());
+  EXPECT_EQ(256u << 20, conv.workspace_byte_limit());  // conf gives MB; layer stores bytes
+  EXPECT_STREQ("autotune", conv.prefer().c_str());
+  EXPECT_EQ(1u, conv.channels());
+  EXPECT_EQ(3u, conv.height());
+  EXPECT_EQ(3u, conv.width());
+}
+
+TEST(CudnnConvolution_AT, Forward) {  // same forward case as CudnnConvolution.Forward, algo picked by autotune
+  const size_t batchsize = 1, c = 1, h = 3, w = 3;
+  const float x[batchsize * c * h * w] = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f,
+                                          6.0f, 7.0f, 8.0f, 9.0f};
+
+  auto cuda = std::make_shared<singa::CudaGPU>();
+  singa::Tensor in(singa::Shape{batchsize, c, h, w}, cuda);
+  in.CopyDataFromHostPtr(x, batchsize * c * h * w);
+
+  // Set weight and bias manually
+  const size_t num_filters = 1;
+  const float we[num_filters * batchsize * h * w] = {  // NOTE(review): sized with batchsize rather than channels c; equal here only because both are 1 — confirm
+      1.0f, 1.0f, 0.0f, 0.0f, 0.0f, -1.0f, 0.0f, 1.0f, 0.0f};
+  singa::Tensor weight(singa::Shape{num_filters, batchsize * h * w}, cuda);
+  weight.CopyDataFromHostPtr(we, batchsize * h * w);
+  const float b[num_filters] = {1.0f};
+  singa::Tensor bias(singa::Shape{num_filters}, cuda);
+  bias.CopyDataFromHostPtr(b, num_filters);
+  CudnnConvolution conv;
+  conv.set_weight(weight);
+  conv.set_bias(bias);
+
+  singa::LayerConf conf;
+  singa::ConvolutionConf *convconf = conf.mutable_convolution_conf();
+  convconf->set_kernel_h(3);
+  convconf->set_kernel_w(3);
+  convconf->set_pad_h(1);
+  convconf->set_pad_w(1);
+  convconf->set_stride_h(2);
+  convconf->set_stride_w(2);
+  convconf->set_num_output(1);
+  convconf->set_bias_term(true);
+  // MB
+  convconf->set_workspace_byte_limit(256);
+  convconf->set_prefer("autotune");
+  conv.Setup(Shape{1, 3, 3}, conf);
+
+  // Parameter "flag" does not influence convolution
+  singa::Tensor out1 = conv.Forward(singa::kTrain, in);
+  out1.ToHost();
+  const float *outptr1 = out1.data<float>();
+  // Input: 3*3; kernel: 3*3; stride: 2*2; padding: 1*1.
+  EXPECT_EQ(4u, out1.Size());
+
+  EXPECT_EQ(3.0f, outptr1[0]);  // identical expectations to the "fastest" variant
+  EXPECT_EQ(7.0f, outptr1[1]);
+  EXPECT_EQ(-3.0f, outptr1[2]);
+  EXPECT_EQ(12.0f, outptr1[3]);
+}
+
+TEST(CudnnConvolution_AT, Backward) {  // same backward case as CudnnConvolution.Backward, algo picked by autotune
+  // src_data
+  const size_t batchsize = 1, c = 1, src_h = 3, src_w = 3;
+  const float x[batchsize * c * src_h * src_w] = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f,
+                                                  6.0f, 7.0f, 8.0f, 9.0f};
+
+  auto cuda = std::make_shared<singa::CudaGPU>();
+  singa::Tensor in(singa::Shape{batchsize, c, src_h, src_w}, cuda);
+  in.CopyDataFromHostPtr(x, batchsize * c * src_h * src_w);
+
+  // Set weight_ and bias_ manually
+  const size_t num_filters = 1;
+  const float we[num_filters * batchsize * src_h * src_w] = {  // NOTE(review): sized with batchsize rather than channels c; equal here only because both are 1 — confirm
+      1.0f, 1.0f, 0.0f, 0.0f, 0.0f, -1.0f, 0.0f, 1.0f, 0.0f};
+  singa::Tensor weight(singa::Shape{num_filters, batchsize * src_h * src_w},
+                       cuda);
+  weight.CopyDataFromHostPtr(we, batchsize * src_h * src_w);
+  const float b[num_filters] = {1.0f};
+  singa::Tensor bias(singa::Shape{num_filters}, cuda);
+  bias.CopyDataFromHostPtr(b, num_filters);
+  CudnnConvolution conv;
+  conv.set_weight(weight);
+  conv.set_bias(bias);
+
+  singa::LayerConf conf;
+  singa::ConvolutionConf *convconf = conf.mutable_convolution_conf();
+  convconf->set_kernel_h(3);
+  convconf->set_kernel_w(3);
+  convconf->set_pad_h(1);
+  convconf->set_pad_w(1);
+  convconf->set_stride_h(2);
+  convconf->set_stride_w(2);
+  convconf->set_num_output(1);
+  convconf->set_bias_term(true);
+  convconf->set_workspace_byte_limit(256);
+  convconf->set_prefer("autotune");
+  conv.Setup(Shape{1, 3, 3}, conf);
+
+  // Parameter "flag" does not influence convolution
+  singa::Tensor out1 = conv.Forward(singa::kTrain, in);  // forward pass caches the input for Backward
+
+  // grad
+  const size_t grad_h = 2, grad_w = 2;
+  const float dy[batchsize * num_filters * grad_h * grad_w] = {0.1f, 0.2f, 0.3f,
+                                                               0.4f};
+  singa::Tensor grad(singa::Shape{batchsize, num_filters, grad_h, grad_w},
+                     cuda);
+  grad.CopyDataFromHostPtr(dy, batchsize * num_filters * grad_h * grad_w);
+
+  const auto ret = conv.Backward(singa::kTrain, grad);
+  singa::Tensor in_grad = ret.first;
+  in_grad.ToHost();
+  const float *dx = in_grad.data<float>();
+  const float *wptr = we;
+  EXPECT_EQ(9u, in_grad.Size());  // dx has the 3x3 input shape
+  EXPECT_EQ(dy[0] * wptr[4], dx[0]);  // each dx entry = sum of dy * overlapping kernel weights
+  EXPECT_EQ(dy[0] * wptr[5] + dy[1] * wptr[3], dx[1]);
+  EXPECT_EQ(dy[1] * wptr[4], dx[2]);
+  EXPECT_EQ(dy[0] * wptr[7] + dy[2] * wptr[1], dx[3]);
+  EXPECT_EQ(
+      dy[0] * wptr[8] + dy[1] * wptr[6] + dy[2] * wptr[2] + dy[3] * wptr[0],
+      dx[4]);
+  EXPECT_EQ(dy[1] * wptr[7] + dy[3] * wptr[1], dx[5]);
+  EXPECT_EQ(dy[2] * wptr[4], dx[6]);
+  EXPECT_EQ(dy[2] * wptr[5] + dy[3] * wptr[3], dx[7]);
+  EXPECT_EQ(dy[3] * wptr[4], dx[8]);
+
+  singa::Tensor dw = ret.second[0];  // param gradients: [0] = dW, [1] = db
+  singa::Tensor db = ret.second[1];
+  dw.ToHost();
+  db.ToHost();
+  const float *dbptr = db.data<float>();
+  EXPECT_EQ(dy[0] + dy[1] + dy[2] + dy[3], dbptr[0]);  // db = sum of dy over all output positions
+
+  const float *dwptr = dw.data<float>();
+  EXPECT_EQ(9u, dw.Size());
+  EXPECT_EQ(dy[3] * x[4], dwptr[0]);  // each dW entry = sum of dy * overlapping input values
+  EXPECT_EQ(dy[3] * x[5] + dy[2] * x[3], dwptr[1]);
+  EXPECT_EQ(dy[2] * x[4], dwptr[2]);
+  EXPECT_EQ(dy[1] * x[1] + dy[3] * x[7], dwptr[3]);
+  EXPECT_FLOAT_EQ(dy[0] * x[0] + dy[1] * x[2] + dy[2] * x[6] + dy[3] * x[8],
+                  dwptr[4]);
+  EXPECT_EQ(dy[0] * x[1] + dy[2] * x[7], dwptr[5]);
+  EXPECT_EQ(dy[1] * x[4], dwptr[6]);
+  EXPECT_EQ(dy[0] * x[3] + dy[1] * x[5], dwptr[7]);
+  EXPECT_EQ(dy[0] * x[4], dwptr[8]);
+}
+#endif  // USE_CUDNN
diff --git a/test/singa/test_cudnn_dropout.cc b/test/singa/test_cudnn_dropout.cc
new file mode 100644
index 0000000..f1b8437
--- /dev/null
+++ b/test/singa/test_cudnn_dropout.cc
@@ -0,0 +1,126 @@
+/************************************************************
+*
+* Licensed to the Apache Software Foundation (ASF) under one
+* or more contributor license agreements.  See the NOTICE file
+* distributed with this work for additional information
+* regarding copyright ownership.  The ASF licenses this file
+* to you under the Apache License, Version 2.0 (the
+* "License"); you may not use this file except in compliance
+* with the License.  You may obtain a copy of the License at
+*
+*   http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing,
+* software distributed under the License is distributed on an
+* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+* KIND, either express or implied.  See the License for the
+* specific language governing permissions and limitations
+* under the License.
+*
+*************************************************************/
+#include "../src/model/layer/cudnn_dropout.h"
+#ifdef USE_CUDNN
+// cudnn dropout is added in cudnn 5
+#if CUDNN_MAJOR >= 5
+
+#include "gtest/gtest.h"
+
+bool inline GetBitValue(const char* x, int pos) {  // read bit `pos` from bitmask x (LSB-first within each byte)
+  const unsigned char BitMask[] = {1, 2, 4, 8, 16, 32, 64, 128};
+  int idx = pos / 8;  // byte containing the bit
+  int offset = pos % 8;  // bit index inside that byte
+  return x[idx] & BitMask[offset];  // nonzero -> true
+}
+
+using singa::CudnnDropout;
+using singa::Shape;
+TEST(CudnnDropout, Setup) {  // dropout_ratio from the conf should be stored by the layer
+  CudnnDropout drop;
+  // EXPECT_EQ("CudnnDropout", drop.layer_type());
+
+  singa::LayerConf conf;
+  singa::DropoutConf* dropconf = conf.mutable_dropout_conf();
+  dropconf->set_dropout_ratio(0.8);
+
+  drop.Setup(Shape{1}, conf);
+  EXPECT_EQ(0.8f, drop.dropout_ratio());
+}
+
+TEST(CudnnDropout, Forward) {  // kTrain: outputs are 0 or scaled input; kEval: identity
+  const float x[] = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f};
+  size_t n = sizeof(x) / sizeof(float);
+  auto cuda = std::make_shared<singa::CudaGPU>();
+  singa::Tensor in(singa::Shape{n}, cuda);
+  in.CopyDataFromHostPtr(x, n);
+
+  float pdrop = 0.5;
+  CudnnDropout drop;
+  singa::LayerConf conf;
+  singa::DropoutConf* dropconf = conf.mutable_dropout_conf();
+  dropconf->set_dropout_ratio(pdrop);
+  drop.Setup(Shape{1}, conf);  // NOTE(review): sample shape {1} while the input below has 8 elements — confirm dropout ignores the setup shape
+
+  singa::Tensor out1 = drop.Forward(singa::kTrain, in);
+
+  singa::Tensor mask(drop.mask().shape(), drop.mask().data_type());  // copy mask to host for inspection
+  mask.CopyData(drop.mask());
+  const char* mptr = mask.data<char>();
+  for (size_t i = 0; i < n; i++)
+    EXPECT_FLOAT_EQ(0, GetBitValue(mptr, i) * (GetBitValue(mptr, i) - 1));  // each mask bit is 0 or 1
+
+  out1.ToHost();
+  const float* outptr1 = out1.data<float>();
+  EXPECT_EQ(n, out1.Size());
+  float scale = 1.0f / (1.0f - pdrop);  // kept units are scaled up (inverted dropout)
+  // the output value should be 0 or the same as the input
+  EXPECT_EQ(0.f, outptr1[0] * (outptr1[0] - scale * x[0]));
+  EXPECT_EQ(0.f, outptr1[1] * (outptr1[1] - scale * x[1]));
+  EXPECT_EQ(0.f, outptr1[7] * (outptr1[7] - scale * x[7]));
+
+  singa::Tensor out2 = drop.Forward(singa::kEval, in);
+  out2.ToHost();
+  EXPECT_EQ(n, out2.Size());
+  const float* outptr2 = out2.data<float>();
+  // the output value should be the same as the input
+  EXPECT_EQ(x[0], outptr2[0]);
+  EXPECT_EQ(x[1], outptr2[1]);
+  EXPECT_EQ(x[7], outptr2[7]);
+}
+
+TEST(CudnnDropout, Backward) {  // dx should equal dy masked and scaled by the forward-pass mask
+  const float x[] = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f};
+  size_t n = sizeof(x) / sizeof(float);
+  auto cuda = std::make_shared<singa::CudaGPU>();
+  singa::Tensor in(singa::Shape{n}, cuda);
+  in.CopyDataFromHostPtr(x, n);
+
+  float pdrop = 0.5;
+  float scale = 1.0f / (1.0f - pdrop);  // inverted-dropout scale for kept units
+
+  CudnnDropout drop;
+  singa::LayerConf conf;
+  singa::DropoutConf* dropconf = conf.mutable_dropout_conf();
+  dropconf->set_dropout_ratio(pdrop);
+  drop.Setup(Shape{1}, conf);
+  singa::Tensor out1 = drop.Forward(singa::kTrain, in);  // forward pass generates the mask used by Backward
+
+  const float dy[] = {4.0f, 5.0f, 6.0f, 7.0f, 8.0f, 1.0f, 2.0f, 3.0f};
+  singa::Tensor grad(singa::Shape{n}, cuda);
+  grad.CopyDataFromHostPtr(dy, n);
+
+  const auto ret = drop.Backward(singa::kTrain, grad);
+  singa::Tensor in_grad = ret.first;
+  in_grad.ToHost();
+  const float* dx = in_grad.data<float>();
+
+  singa::Tensor mask(drop.mask().shape(), drop.mask().data_type());  // copy mask to host for inspection
+  mask.CopyData(drop.mask());
+  const char* mptr = mask.data<char>();
+
+
+  EXPECT_FLOAT_EQ(dx[0], dy[0] * GetBitValue(mptr, 0) * scale);
+  EXPECT_FLOAT_EQ(dx[1], dy[1] * GetBitValue(mptr, 1) * scale);
+  EXPECT_FLOAT_EQ(dx[7], dy[7] * GetBitValue(mptr, 7) * scale);
+}
+#endif  // CUDNN_MAJOR>=5
+#endif  // USE_CUDNN
diff --git a/test/singa/test_cudnn_lrn.cc b/test/singa/test_cudnn_lrn.cc
new file mode 100644
index 0000000..04ca5f2
--- /dev/null
+++ b/test/singa/test_cudnn_lrn.cc
@@ -0,0 +1,203 @@
+/*********************************************************
+*
+* Licensed to the Apache Software Foundation (ASF) under one
+* or more contributor license agreements.  See the NOTICE file
+* distributed with this work for additional information
+* regarding copyright ownership.  The ASF licenses this file
+* to you under the Apache License, Version 2.0 (the
+* "License"); you may not use this file except in compliance
+* with the License.  You may obtain a copy of the License at
+*
+*   http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing,
+* software distributed under the License is distributed on an
+* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+* KIND, either express or implied.  See the License for the
+* specific language governing permissions and limitations
+* under the License.
+*
+************************************************************/
+
+#include "../src/model/layer/cudnn_lrn.h"
+
+#ifdef USE_CUDNN
+// cudnn lrn is added in cudnn 4
+#if CUDNN_VERSION_MAJOR >=4
+#include "gtest/gtest.h"
+
+using singa::CudnnLRN;
+using singa::Shape;
+TEST(CudnnLRN, Setup) {  // LRN conf fields (k, local_size, alpha, beta) should be stored by the layer
+  CudnnLRN lrn;
+  // EXPECT_EQ("CudnnLRN", lrn.layer_type());
+
+  singa::LayerConf conf;
+  singa::LRNConf *lrn_conf = conf.mutable_lrn_conf();
+  lrn_conf->set_k(1.0);
+  lrn_conf->set_local_size(3);
+  lrn_conf->set_alpha(0.1);
+  lrn_conf->set_beta(0.75);
+  lrn.Setup(Shape{1}, conf);
+
+  EXPECT_FLOAT_EQ(1.0, lrn.k());
+  EXPECT_EQ(3, lrn.local_size());
+  EXPECT_FLOAT_EQ(0.1, lrn.alpha());
+  EXPECT_FLOAT_EQ(0.75, lrn.beta());
+}
+
+TEST(CudnnLRN, Forward) {  // compare LRN forward output against precomputed golden values
+  CudnnLRN lrn;
+  const float x[] = {  // 1x2x4x4 NCHW input, flattened row-major
+    0.00658502, -0.0496967, -0.0333733, -0.0263094,
+    -0.044298, 0.0211638, 0.0829358, -0.0172312,
+    -0.0665471, -0.10017, -0.0750333, -0.104551,
+    -0.00981208, -0.0583349, -0.0751652, 0.011747,
+    0.0151165, 0.0304321, 0.0736639, -0.00652653,
+    0.00962833, 0.169646, -0.044588, -0.00244141,
+    0.0597329, -0.0530868, 0.0124246, 0.108429,
+    0.0451175, 0.0247055, 0.0304345, 0.0179575
+  };
+  auto cuda = std::make_shared<singa::CudaGPU>();
+  singa::Tensor in(singa::Shape{1,2,4,4}, cuda);
+  in.CopyDataFromHostPtr(x, 1*2*4*4);
+
+  singa::LayerConf conf;
+  singa::LRNConf *lrn_conf = conf.mutable_lrn_conf();
+  lrn_conf->set_k(1.0);
+  lrn_conf->set_local_size(3);
+  lrn_conf->set_alpha(0.1);
+  lrn_conf->set_beta(0.75);
+  lrn.Setup(Shape{2, 4, 4}, conf);  // per-sample shape: 2 channels, 4x4 spatial
+
+  singa::Tensor out = lrn.Forward(singa::kTrain, in);
+  out.ToHost();
+  const float *outptr = out.data<float>();
+  const auto & shape = out.shape();
+  EXPECT_EQ(4u, shape.size());  // output keeps the NCHW input shape
+  EXPECT_EQ(1u, shape[0]);
+  EXPECT_EQ(2u, shape[1]);
+  EXPECT_EQ(4u, shape[2]);
+  EXPECT_EQ(4u, shape[3]);
+
+  EXPECT_NEAR(0.00658498f, outptr[0], 1e-6f);  // golden values, tolerance 1e-6
+  EXPECT_NEAR(-0.0496925f, outptr[1], 1e-6f);
+  EXPECT_NEAR(-0.0333678f, outptr[2], 1e-6f);
+  EXPECT_NEAR(-0.0263089f, outptr[3], 1e-6f);
+  EXPECT_NEAR(-0.0442958f, outptr[4], 1e-6f);
+  EXPECT_NEAR(0.0211483f, outptr[5], 1e-6f);
+  EXPECT_NEAR(0.0829174f, outptr[6], 1e-6f);
+  EXPECT_NEAR(-0.0172311f, outptr[7], 1e-6f);
+  EXPECT_NEAR(-0.0665338f, outptr[8], 1e-6f);
+  EXPECT_NEAR(-0.100138f, outptr[9], 1e-6f);
+  EXPECT_NEAR(-0.0750224f, outptr[10], 1e-6f);
+  EXPECT_NEAR(-0.104492f, outptr[11], 1e-6f);
+  EXPECT_NEAR(-0.00981155f, outptr[12], 1e-6f);
+  EXPECT_NEAR(-0.058329f, outptr[13], 1e-6f);
+  EXPECT_NEAR(-0.0751528f, outptr[14], 1e-6f);
+  EXPECT_NEAR(0.0117468f, outptr[15], 1e-6f);
+  EXPECT_NEAR(0.0151164f, outptr[16], 1e-6f);
+  EXPECT_NEAR(0.0304296f, outptr[17], 1e-6f);
+  EXPECT_NEAR(0.0736518f, outptr[18], 1e-6f);
+  EXPECT_NEAR(-0.00652641f, outptr[19], 1e-6f);
+  EXPECT_NEAR(0.00962783f, outptr[20], 1e-6f);
+  EXPECT_NEAR(0.169522f, outptr[21], 1e-6f);
+  EXPECT_NEAR(-0.0445781f, outptr[22], 1e-6f);
+  EXPECT_NEAR(-0.00244139f, outptr[23], 1e-6f);
+  EXPECT_NEAR(0.0597209f, outptr[24], 1e-6f);
+  EXPECT_NEAR(-0.0530697f, outptr[25], 1e-6f);
+  EXPECT_NEAR(0.0124228f, outptr[26], 1e-6f);
+  EXPECT_NEAR(0.108367f, outptr[27], 1e-6f);
+  EXPECT_NEAR(0.045115f, outptr[28], 1e-6f);
+  EXPECT_NEAR(0.024703f, outptr[29], 1e-6f);
+  EXPECT_NEAR(0.0304295f, outptr[30], 1e-6f);
+  EXPECT_NEAR(0.0179573f, outptr[31], 1e-6f);
+}
+
+TEST(CudnnLRN, Backward) {  // compare LRN dx against precomputed golden values
+  CudnnLRN lrn;
+
+  const float x[] = {  // same 1x2x4x4 input as the Forward test
+    0.00658502, -0.0496967, -0.0333733, -0.0263094,
+    -0.044298, 0.0211638, 0.0829358, -0.0172312,
+    -0.0665471, -0.10017, -0.0750333, -0.104551,
+    -0.00981208, -0.0583349, -0.0751652, 0.011747,
+    0.0151165, 0.0304321, 0.0736639, -0.00652653,
+    0.00962833, 0.169646, -0.044588, -0.00244141,
+    0.0597329, -0.0530868, 0.0124246, 0.108429,
+    0.0451175, 0.0247055, 0.0304345, 0.0179575
+  };
+  auto cuda = std::make_shared<singa::CudaGPU>();
+  singa::Tensor x_tensor(singa::Shape{1,2,4,4}, cuda);
+  x_tensor.CopyDataFromHostPtr(x, 1*2*4*4);
+
+  const float dy[] = {  // gradient w.r.t. the layer output
+    -0.103178, -0.0326904, 0.293932, 0.355288,
+    -0.0288079, -0.0543308, -0.0668226, 0.0462216,
+    -0.0448064, -0.068982, -0.0509133, -0.0721143,
+    0.0959078, -0.0389037, -0.0510071, -0.178793,
+    0.00428248, -0.001132, -0.19928, 0.011935,
+    0.00622313, 0.143793, 0.0253894, 0.0104906,
+    -0.170673, 0.0283919, 0.00523488, -0.0455003,
+    0.177807, 0.000892812, -0.00113197, 0.00327798
+  };
+
+  singa::Tensor dy_tensor(singa::Shape{1,2,4,4}, cuda);
+  dy_tensor.CopyDataFromHostPtr(dy, 1*2*4*4);
+
+  singa::LayerConf conf;
+  singa::LRNConf *lrn_conf = conf.mutable_lrn_conf();
+  lrn_conf->set_k(1.0);
+  lrn_conf->set_local_size(3);
+  lrn_conf->set_alpha(0.1);
+  lrn_conf->set_beta(0.75);
+  lrn.Setup(Shape{2, 4, 4}, conf);
+
+  lrn.Forward(singa::kTrain, x_tensor);  // forward pass caches state needed by Backward
+  const auto ret = lrn.Backward(singa::kTrain, dy_tensor);
+  singa::Tensor dx = ret.first;
+  dx.ToHost();
+  const float *dxptr = dx.data<float>();
+  const auto & shape = dx.shape();
+  EXPECT_EQ(4u, shape.size());  // dx matches the input shape
+  EXPECT_EQ(1u, shape[0]);
+  EXPECT_EQ(2u, shape[1]);
+  EXPECT_EQ(4u, shape[2]);
+  EXPECT_EQ(4u, shape[3]);
+
+  EXPECT_NEAR(-0.103177, dxptr[0], 1e-6f);  // golden values, tolerance 1e-6
+  EXPECT_NEAR(-0.0326837, dxptr[1], 1e-6f);
+  EXPECT_NEAR(0.293844, dxptr[2], 1e-6f);
+  EXPECT_NEAR(0.355269, dxptr[3], 1e-6f);
+  EXPECT_NEAR(-0.0288034, dxptr[4], 1e-6f);
+  EXPECT_NEAR(-0.0543157, dxptr[5], 1e-6f);
+  EXPECT_NEAR(-0.0667802, dxptr[6], 1e-6f);
+  EXPECT_NEAR(0.0462206, dxptr[7], 1e-6f);
+  EXPECT_NEAR(-0.0448215, dxptr[8], 1e-6f);
+  EXPECT_NEAR(-0.0689328, dxptr[9], 1e-6f);
+  EXPECT_NEAR(-0.0508914, dxptr[10], 1e-6f);
+  EXPECT_NEAR(-0.0720598, dxptr[11], 1e-6f);
+  EXPECT_NEAR(0.0959062, dxptr[12], 1e-6f);
+  EXPECT_NEAR(-0.0388931, dxptr[13], 1e-6f);
+  EXPECT_NEAR(-0.0509844, dxptr[14], 1e-6f);
+  EXPECT_NEAR(-0.17879, dxptr[15], 1e-6f);
+  EXPECT_NEAR(0.00428292, dxptr[16], 1e-6f);
+  EXPECT_NEAR(-0.00113432, dxptr[17], 1e-6f);
+  EXPECT_NEAR(-0.199158, dxptr[18], 1e-6f);
+  EXPECT_NEAR(0.0119317, dxptr[19], 1e-6f);
+  EXPECT_NEAR(0.00622216, dxptr[20], 1e-6f);
+  EXPECT_NEAR(0.143491, dxptr[21], 1e-6f);
+  EXPECT_NEAR(0.0253689, dxptr[22], 1e-6f);
+  EXPECT_NEAR(0.0104904, dxptr[23], 1e-6f);
+  EXPECT_NEAR(-0.170617, dxptr[24], 1e-6f);
+  EXPECT_NEAR(0.0283971, dxptr[25], 1e-6f);
+  EXPECT_NEAR(0.00523171, dxptr[26], 1e-6f);
+  EXPECT_NEAR(-0.0454887, dxptr[27], 1e-6f);
+  EXPECT_NEAR(0.177781, dxptr[28], 1e-6f);
+  EXPECT_NEAR(0.000889893, dxptr[29], 1e-6f);
+  EXPECT_NEAR(-0.00113756, dxptr[30], 1e-6f);
+  EXPECT_NEAR(0.00327978, dxptr[31], 1e-6f);
+}
+
+#endif  //  CUDNN_VERSION_MAJOR >= 4
+#endif  //  USE_CUDNN
diff --git a/test/singa/test_cudnn_pooling.cc b/test/singa/test_cudnn_pooling.cc
new file mode 100644
index 0000000..0e3314e
--- /dev/null
+++ b/test/singa/test_cudnn_pooling.cc
@@ -0,0 +1,131 @@
+/************************************************************
+*
+* Licensed to the Apache Software Foundation (ASF) under one
+* or more contributor license agreements.  See the NOTICE file
+* distributed with this work for additional information
+* regarding copyright ownership.  The ASF licenses this file
+* to you under the Apache License, Version 2.0 (the
+* "License"); you may not use this file except in compliance
+* with the License.  You may obtain a copy of the License at
+*
+*   http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing,
+* software distributed under the License is distributed on an
+* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+* KIND, either express or implied.  See the License for the
+* specific language governing permissions and limitations
+* under the License.
+*
+*************************************************************/
+#include "../src/model/layer/cudnn_pooling.h"
+#ifdef USE_CUDNN
+
+#include "gtest/gtest.h"
+
+using singa::CudnnPooling;
+using singa::Shape;
+TEST(CudnnPooling, Setup) {
+  CudnnPooling pool;
+  //  EXPECT_EQ("CudnnPooling", pool.layer_type());
+
+  singa::LayerConf conf;
+  singa::PoolingConf *poolconf = conf.mutable_pooling_conf();
+  poolconf->set_pool(singa::PoolingConf_PoolMethod_MAX);
+  poolconf->set_kernel_h(1);
+  poolconf->set_kernel_w(2);
+  poolconf->set_pad_h(1);
+  poolconf->set_pad_w(0);
+  poolconf->set_stride_h(2);
+  poolconf->set_stride_w(1);
+  pool.Setup(Shape{1, 3, 3}, conf);
+
+  EXPECT_EQ(singa::PoolingConf_PoolMethod_MAX, pool.pool_method());
+  EXPECT_EQ(1u, pool.kernel_h());
+  EXPECT_EQ(2u, pool.kernel_w());
+  EXPECT_EQ(1u, pool.pad_h());
+  EXPECT_EQ(0u, pool.pad_w());
+  EXPECT_EQ(2u, pool.stride_h());
+  EXPECT_EQ(1u, pool.stride_w());
+  EXPECT_EQ(1u, pool.channels());
+  EXPECT_EQ(3u, pool.height());
+  EXPECT_EQ(3u, pool.width());
+}
+
+TEST(CudnnPooling, Forward) {
+  const size_t batchsize = 1, c = 1, h = 3, w = 3;
+  const float x[batchsize * c * h * w] = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f,
+                                          6.0f, 7.0f, 8.0f, 9.0f};
+  auto cuda = std::make_shared<singa::CudaGPU>();
+  singa::Tensor in(singa::Shape{batchsize, c, h, w}, cuda);
+  in.CopyDataFromHostPtr(x, batchsize * c * h * w);
+
+  CudnnPooling pool;
+  singa::LayerConf conf;
+  singa::PoolingConf *poolconf = conf.mutable_pooling_conf();
+  poolconf->set_pool(singa::PoolingConf_PoolMethod_MAX);
+  poolconf->set_kernel_h(2);
+  poolconf->set_kernel_w(2);
+  poolconf->set_pad_h(0);
+  poolconf->set_pad_w(0);
+  poolconf->set_stride_h(1);
+  poolconf->set_stride_w(1);
+  pool.Setup(Shape{1, 3, 3}, conf);
+
+  // Parameter "flag" does not influence pooling
+  singa::Tensor out1 = pool.Forward(singa::kTrain, in);
+  out1.ToHost();
+  const float *outptr1 = out1.data<float>();
+  // Input: 3*3; kernel: 2*2; stride: 1*1; no padding.
+  EXPECT_EQ(4u, out1.Size());
+  EXPECT_EQ(5.0f, outptr1[0]);
+  EXPECT_EQ(6.0f, outptr1[1]);
+  EXPECT_EQ(8.0f, outptr1[2]);
+  EXPECT_EQ(9.0f, outptr1[3]);
+}
+
+TEST(CudnnPooling, Backward) {
+  // source data: one 3x3 single-channel input feature map
+  const size_t batchsize = 1, c = 1, src_h = 3, src_w = 3;
+  const float x[batchsize * src_h * src_w] = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f,
+                                              6.0f, 7.0f, 8.0f, 9.0f};
+  auto cuda = std::make_shared<singa::CudaGPU>();
+  singa::Tensor in(singa::Shape{batchsize, c, src_h, src_w}, cuda);
+  in.CopyDataFromHostPtr(x, batchsize * c * src_h * src_w);
+
+  CudnnPooling pool;
+  singa::LayerConf conf;
+  singa::PoolingConf *poolconf = conf.mutable_pooling_conf();
+  poolconf->set_pool(singa::PoolingConf_PoolMethod_MAX);
+  poolconf->set_kernel_h(2);
+  poolconf->set_kernel_w(2);
+  poolconf->set_pad_h(0);
+  poolconf->set_pad_w(0);
+  poolconf->set_stride_h(1);
+  poolconf->set_stride_w(1);
+  pool.Setup(Shape{1, 3, 3}, conf);
+
+  singa::Tensor out1 = pool.Forward(singa::kTrain, in);
+
+  // gradient w.r.t. the 2x2 pooled output
+  const size_t grad_h = 2, grad_w = 2;
+  const float dy[batchsize * c * grad_h * grad_w] = {0.1f, 0.2f, 0.3f, 0.4f};
+  singa::Tensor grad(singa::Shape{batchsize, c, grad_h, grad_w}, cuda);
+  grad.CopyDataFromHostPtr(dy, batchsize * c * grad_h * grad_w);
+
+  const auto ret = pool.Backward(singa::kTrain, grad);
+  singa::Tensor in_grad = ret.first;
+  in_grad.ToHost();
+  const float *dx = in_grad.data<float>();
+  EXPECT_EQ(9u, in_grad.Size());
+  EXPECT_EQ(0.0f, dx[0]);
+  EXPECT_EQ(0.0f, dx[1]);
+  EXPECT_EQ(0.0f, dx[2]);
+  EXPECT_EQ(0.0f, dx[3]);
+  EXPECT_EQ(0.1f, dx[4]);
+  EXPECT_EQ(0.2f, dx[5]);
+  EXPECT_EQ(0.0f, dx[6]);
+  EXPECT_EQ(0.3f, dx[7]);
+  EXPECT_EQ(0.4f, dx[8]);
+}
+#endif  // USE_CUDNN
diff --git a/test/singa/test_cudnn_rnn.cc b/test/singa/test_cudnn_rnn.cc
new file mode 100644
index 0000000..07336a2
--- /dev/null
+++ b/test/singa/test_cudnn_rnn.cc
@@ -0,0 +1,181 @@
+/************************************************************
+*
+* Licensed to the Apache Software Foundation (ASF) under one
+* or more contributor license agreements.  See the NOTICE file
+* distributed with this work for additional information
+* regarding copyright ownership.  The ASF licenses this file
+* to you under the Apache License, Version 2.0 (the
+* "License"); you may not use this file except in compliance
+* with the License.  You may obtain a copy of the License at
+*
+*   http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing,
+* software distributed under the License is distributed on an
+* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+* KIND, either express or implied.  See the License for the
+* specific language governing permissions and limitations
+* under the License.
+*
+*************************************************************/
+
+#include "../src/model/layer/cudnn_rnn.h"
+#ifdef USE_CUDNN
+#if CUDNN_VERSION >= 5005
+
+#include "gtest/gtest.h"
+
+using singa::CudnnRNN;
+using singa::Shape;
+using singa::Tensor;
+class TestCudnnRNN : public ::testing::Test {
+  protected:
+    virtual void SetUp() {
+      singa::RNNConf *rnnconf = conf.mutable_rnn_conf();
+      rnnconf->set_hidden_size(hidden_size);
+      rnnconf->set_num_stacks(1);
+      rnnconf->set_dropout(0);
+      rnnconf->set_input_mode("linear");
+      rnnconf->set_direction("unidirectional");
+      rnnconf->set_rnn_mode("tanh");
+    }
+    singa::LayerConf conf;
+    size_t hidden_size = 4;
+};
+
+TEST_F(TestCudnnRNN, Setup) {
+  CudnnRNN rnn;
+  // EXPECT_EQ("CudnnRNN", rnn.layer_type());
+  rnn.Setup(Shape{2}, conf);
+  auto weight = rnn.param_values().at(0);
+  EXPECT_EQ(weight.Size(), hidden_size * (2 + hidden_size + 2));
+}
+
+TEST_F(TestCudnnRNN, Forward) {
+  auto cuda = std::make_shared<singa::CudaGPU>();
+  const size_t seqLength = 4, batchsize = 1, dim = 2;
+  const float x[seqLength * batchsize * dim] = {1.0f, 1.0f, 1.0f, 1.0f, 1.0f,
+                                          1.0f, 1.0f, 1.0f};
+
+  vector<Tensor> inputs;
+  for (size_t i = 0; i < seqLength; i++) {
+    Tensor t(Shape{batchsize, dim}, cuda);
+    t.CopyDataFromHostPtr(x + i * t.Size(), t.Size());
+    inputs.push_back(t);
+  }
+
+  singa::Tensor hx;
+  inputs.push_back(hx);
+
+  CudnnRNN rnn;
+  rnn.Setup(Shape{dim}, conf);
+  rnn.ToDevice(cuda);
+
+  auto weight = rnn.param_values().at(0);
+  size_t weightSize = weight.Size();
+  float we[weightSize];
+  float wvalue = 0.1f;
+  for (size_t i = 0; i < weightSize; i++)
+    we[i] = wvalue;
+  weight.CopyDataFromHostPtr(we, weightSize);
+
+  const auto ret = rnn.Forward(singa::kEval, inputs);
+  EXPECT_EQ(ret.size(), seqLength + 1);
+  vector<float> hxptr(hidden_size, 0.0f);
+  for (size_t i = 0; i < seqLength; i++) {
+    auto y = ret[i];
+    y.ToHost();
+    auto yptr = y.data<float>();
+    vector<float> tmp;
+    for (size_t j = 0; j < hidden_size; j++) {
+      float ty = 0;
+      for (size_t k = 0; k < dim; k++) {
+        ty += x[i * dim + k] * wvalue;
+      }
+      ty += wvalue;
+      for (size_t k = 0; k < hidden_size; k++) {
+        ty += hxptr[k] * wvalue;
+      }
+      ty += wvalue;
+      ty = tanh(ty);
+      EXPECT_NEAR(ty, yptr[j], 1e-4);
+      tmp.push_back(ty);
+    }
+    std::copy(tmp.begin(), tmp.end(), hxptr.begin());
+  }
+}
+
+TEST_F(TestCudnnRNN, Backward) {
+  auto cuda = std::make_shared<singa::CudaGPU>();
+  const size_t seqLength = 4, batchsize = 1, dim = 2;
+  const float x[seqLength * batchsize * dim] = {1.0f, 1.0f, 1.0f, 1.0f, 1.0f,
+                                          1.0f, 1.0f, 1.0f};
+
+  vector<Tensor> inputs;
+  for (size_t i = 0; i < seqLength; i++) {
+    Tensor t(Shape{batchsize, dim}, cuda);
+    t.CopyDataFromHostPtr(x + i * t.Size(), t.Size());
+    inputs.push_back(t);
+  }
+
+  singa::Tensor hx;
+  inputs.push_back(hx);
+
+  CudnnRNN rnn;
+  rnn.Setup(Shape{dim}, conf);
+  rnn.ToDevice(cuda);
+
+  auto weight = rnn.param_values().at(0);
+  size_t weightSize = weight.Size();
+  float we[weightSize];
+  float wvalue = 0.1f;
+  for (size_t i = 0; i < weightSize; i++)
+    we[i] = wvalue;
+  weight.CopyDataFromHostPtr(we, weightSize);
+
+  const auto outs = rnn.Forward(singa::kTrain, inputs);
+
+  float dyptr[seqLength * batchsize * hidden_size];
+  for (size_t i = 0; i < seqLength * batchsize * hidden_size; i++)
+    dyptr[i] = i * 0.1f;
+  vector<Tensor> grads;
+  for (size_t i = 0; i < seqLength; i++) {
+    Tensor dy(Shape{batchsize, hidden_size}, cuda);
+    dy.CopyDataFromHostPtr(dyptr + i * dy.Size(), dy.Size());
+    grads.push_back(dy);
+  }
+  Tensor dhy;
+  grads.push_back(dhy);
+  vector<float> dhyptr(hidden_size, 0.0f);
+  const auto ret = rnn.Backward(singa::kTrain, grads);
+  for (size_t i = seqLength - 1; i > 0 ; i --) {
+    auto dx = ret.first[i];
+    auto y = outs[i].Clone();
+    y.ToHost();
+    dx.ToHost();
+    auto dxptr = dx.data<float>();
+    auto yptr = y.data<float>();
+    for (size_t j = 0; j < hidden_size; j++) {
+      dhyptr[j] += dyptr[i * hidden_size + j];
+      dhyptr[j] *= 1 - yptr[j] * yptr[j];
+    }
+    for (size_t k = 0; k < dim; k++) {
+      float tdx = 0;
+      for (size_t j = 0; j < hidden_size; j++) {
+        tdx += dhyptr[j] * wvalue;
+      }
+      EXPECT_NEAR(tdx, dxptr[k], 1e-4);
+    }
+    vector<float> tmp;
+    for (size_t k = 0; k < hidden_size; k++) {
+      float tdhy = 0;
+      for (size_t j = 0; j < hidden_size; j++) {
+        tdhy += dhyptr[j] * wvalue;
+      }
+      tmp.push_back(tdhy);
+    }
+    std::copy(tmp.begin(), tmp.end(), dhyptr.begin());
+  }
+}
+#endif  // CUDNN_VERSION >= 5005
+#endif  // USE_CUDNN
diff --git a/test/singa/test_cudnn_softmax.cc b/test/singa/test_cudnn_softmax.cc
new file mode 100644
index 0000000..6e0d5ab
--- /dev/null
+++ b/test/singa/test_cudnn_softmax.cc
@@ -0,0 +1,169 @@
+/************************************************************
+*
+* Licensed to the Apache Software Foundation (ASF) under one
+* or more contributor license agreements.  See the NOTICE file
+* distributed with this work for additional information
+* regarding copyright ownership.  The ASF licenses this file
+* to you under the Apache License, Version 2.0 (the
+* "License"); you may not use this file except in compliance
+* with the License.  You may obtain a copy of the License at
+*
+*   http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing,
+* software distributed under the License is distributed on an
+* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+* KIND, either express or implied.  See the License for the
+* specific language governing permissions and limitations
+* under the License.
+*
+*************************************************************/
+#include "singa/singa_config.h"
+#ifdef USE_CUDNN
+
+#include "../src/model/layer/cudnn_softmax.h"
+#include "gtest/gtest.h"
+#include <math.h>  // exp
+#include <cudnn.h>
+
+// TODO(wangwei) add test for matrix input
+using singa::CudnnSoftmax;
+using singa::Shape;
+TEST(CudnnSoftmax, Setup) {
+  CudnnSoftmax sft;
+  // EXPECT_EQ("CudnnSoftmax", sft.layer_type());
+
+  singa::LayerConf conf;
+  singa::SoftmaxConf* softmaxconf = conf.mutable_softmax_conf();
+  softmaxconf->set_algorithm("fast");
+  sft.Setup(Shape{1}, conf);
+  EXPECT_EQ(CUDNN_SOFTMAX_FAST, sft.Algorithm());
+}
+
+TEST(CudnnSoftmax, Forward1D) {
+  const float x[] = {1.f, 2.f, 0.f, -2.f, -3.f, -1.f};
+  size_t n = sizeof(x) / sizeof(float);
+  auto cuda = std::make_shared<singa::CudaGPU>();
+  singa::Shape shape = {n};
+  singa::Tensor in(shape, cuda);
+  in.CopyDataFromHostPtr<float>(x, n);
+
+  CudnnSoftmax sft;
+  singa::LayerConf conf;
+  singa::SoftmaxConf* softmaxconf = conf.mutable_softmax_conf();
+  softmaxconf->set_algorithm("accurate");
+  sft.Setup(Shape{1}, conf);
+  singa::Tensor out = sft.Forward(singa::kTrain, in);
+  out.ToHost();
+  const float* yptr = out.data<float>();
+  EXPECT_EQ(n, out.Size());
+
+  float* y = new float[n];
+  float sigma = 0.f;
+  for (size_t i = 0; i < n; i++) sigma += exp(x[i]);
+  for (size_t i = 0; i < n; i++) y[i] = exp(x[i]) / sigma;
+  for (size_t i = 0; i < n; i++) EXPECT_FLOAT_EQ(y[i], yptr[i]);
+  delete[] y;
+}
+
+TEST(CudnnSoftmax, Backward1D) {
+  const float x[] = {1.f, 2.f, 3.f, -2.f, -3.f, -1.f};
+  size_t n = sizeof(x) / sizeof(float);
+  singa::Shape shape = {n};
+  auto cuda = std::make_shared<singa::CudaGPU>();
+  singa::Tensor in(shape, cuda);
+  in.CopyDataFromHostPtr<float>(x, n);
+
+  CudnnSoftmax sft;
+  singa::LayerConf conf;
+  singa::SoftmaxConf* softmaxconf = conf.mutable_softmax_conf();
+  softmaxconf->set_algorithm("accurate");
+  sft.Setup(Shape{1}, conf);
+
+  singa::Tensor out = sft.Forward(singa::kTrain, in);
+  out.ToHost();
+  const float* yptr = out.data<float>();
+
+  const float grad[] = {2.f, -3.f, 1.f, 3.f, -1.f, -2.f};
+  singa::Tensor out_diff(shape, cuda);
+  out_diff.CopyDataFromHostPtr<float>(grad, n);
+  const auto ret = sft.Backward(singa::kTrain, out_diff);
+  singa::Tensor in_diff = ret.first;
+  in_diff.ToHost();
+  const float* xptr = in_diff.data<float>();
+
+  float* dx = new float[n];
+  float sigma = 0.f;
+  for (size_t i = 0; i < n; i++) sigma += grad[i] * yptr[i];
+  for (size_t i = 0; i < n; i++) dx[i] = (grad[i] - sigma) * yptr[i];
+  for (size_t i = 0; i < n; i++) EXPECT_FLOAT_EQ(dx[i], xptr[i]);
+  delete[] dx;
+}
+
+TEST(CudnnSoftmax, Forward2D) {
+  const float x[] = {1.f, 2.f, 0.f, -2.f, -3.f, -1.f};
+  size_t n = sizeof(x) / sizeof(float);
+  size_t batch = 2, c = 3;
+  singa::Shape shape = {batch, c};
+  auto cuda = std::make_shared<singa::CudaGPU>();
+  singa::Tensor in(shape, cuda);
+  in.CopyDataFromHostPtr<float>(x, n);
+
+  CudnnSoftmax sft;
+  singa::LayerConf conf;
+  singa::SoftmaxConf* softmaxconf = conf.mutable_softmax_conf();
+  softmaxconf->set_algorithm("accurate");
+  sft.Setup(Shape{c}, conf);
+
+  singa::Tensor out = sft.Forward(singa::kTrain, in);
+  out.ToHost();
+  const float* yptr = out.data<float>();
+  EXPECT_EQ(n, out.Size());
+
+  float* y = new float[n];
+  float* sigma = new float[batch];
+  for (size_t i = 0; i < batch; i++) sigma[i] = 0.f;
+  for (size_t i = 0; i < n; i++) sigma[i / c] += exp(x[i]);
+  for (size_t i = 0; i < n; i++) y[i] = exp(x[i]) / sigma[i / c];
+  for (size_t i = 0; i < n; i++) EXPECT_FLOAT_EQ(y[i], yptr[i]);
+  delete[] y;
+  delete[] sigma;
+}
+
+TEST(CudnnSoftmax, Backward2D) {
+  const float x[] = {1.f, 2.f, 3.f, -2.f, -3.f, -1.f};
+  size_t n = sizeof(x) / sizeof(float);
+  size_t batch = 2, c = 3;
+  auto cuda = std::make_shared<singa::CudaGPU>();
+  singa::Shape shape = {batch, c};
+  singa::Tensor in(shape, cuda);
+  in.CopyDataFromHostPtr<float>(x, n);
+
+  CudnnSoftmax sft;
+  singa::LayerConf conf;
+  singa::SoftmaxConf* softmaxconf = conf.mutable_softmax_conf();
+  softmaxconf->set_algorithm("accurate");
+  sft.Setup(Shape{c}, conf);
+
+  singa::Tensor out = sft.Forward(singa::kTrain, in);
+  out.ToHost();
+  const float* yptr = out.data<float>();
+
+  const float grad[] = {2.f, -3.f, 1.f, 3.f, -1.f, -2.f};
+  singa::Tensor out_diff(shape, cuda);
+  out_diff.CopyDataFromHostPtr<float>(grad, n);
+  const auto ret = sft.Backward(singa::kTrain, out_diff);
+  singa::Tensor in_diff = ret.first;
+  in_diff.ToHost();
+  const float* xptr = in_diff.data<float>();
+
+  float* dx = new float[n];
+  float* sigma = new float[batch];
+  for (size_t i = 0; i < batch; i++) sigma[i] = 0.f;
+  for (size_t i = 0; i < n; i++) sigma[i / c] += grad[i] * yptr[i];
+  for (size_t i = 0; i < n; i++) dx[i] = (grad[i] - sigma[i / c]) * yptr[i];
+  for (size_t i = 0; i < n; i++) EXPECT_FLOAT_EQ(dx[i], xptr[i]);
+  delete[] dx;
+  delete[] sigma;
+}
+#endif  // USE_CUDNN
diff --git a/test/singa/test_dense.cc b/test/singa/test_dense.cc
new file mode 100644
index 0000000..0410929
--- /dev/null
+++ b/test/singa/test_dense.cc
@@ -0,0 +1,243 @@
+/************************************************************
+*
+* Licensed to the Apache Software Foundation (ASF) under one
+* or more contributor license agreements.  See the NOTICE file
+* distributed with this work for additional information
+* regarding copyright ownership.  The ASF licenses this file
+* to you under the Apache License, Version 2.0 (the
+* "License"); you may not use this file except in compliance
+* with the License.  You may obtain a copy of the License at
+*
+*   http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing,
+* software distributed under the License is distributed on an
+* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+* KIND, either express or implied.  See the License for the
+* specific language governing permissions and limitations
+* under the License.
+*
+*************************************************************/
+#include "../src/model/layer/dense.h"
+#include "gtest/gtest.h"
+#include "singa/singa_config.h"
+
+using singa::Dense;
+using singa::Shape;
+TEST(Dense, Setup) {
+  Dense dense;
+  // EXPECT_EQ("Dense", dense.layer_type());
+
+  singa::LayerConf conf;
+  singa::DenseConf *denseconf = conf.mutable_dense_conf();
+  denseconf->set_num_output(3);
+  dense.Setup(Shape{2}, conf);
+
+  EXPECT_EQ(3u, dense.num_output());
+  EXPECT_EQ(2u, dense.num_input());
+}
+#ifdef USE_CBLAS
+TEST(Dense, ForwardCpp) {
+  Dense dense;
+
+  singa::LayerConf conf;
+  singa::DenseConf *denseconf = conf.mutable_dense_conf();
+  denseconf->set_num_output(3);
+  denseconf->set_transpose(false);
+  dense.Setup(Shape{2}, conf);
+
+  const size_t batchsize = 3, vdim = 2, hdim = 3;
+  const float x[batchsize * vdim] = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f};
+  singa::Tensor in(singa::Shape{batchsize, vdim});
+  in.CopyDataFromHostPtr(x, batchsize * vdim);
+
+  // set weight
+  const float we[vdim * hdim] = {1.0f, 1.0f, 1.0f, 2.0f, 0.0f, 1.0f};
+  singa::Tensor weight(singa::Shape{vdim, hdim});
+  weight.CopyDataFromHostPtr(we, hdim * vdim);
+
+  const float bia[hdim] = {1.0f, 1.0f, 1.0f};
+  singa::Tensor bias(singa::Shape{hdim});
+  bias.CopyDataFromHostPtr(bia, hdim);
+
+  dense.set_weight(weight);
+  dense.set_bias(bias);
+
+  singa::Tensor out1 = dense.Forward(singa::kTrain, in);
+  const float *outptr1 = out1.data<float>();
+  EXPECT_EQ(9u, out1.Size());
+  for (int i = 0; i < 3; i++)
+    for (int j = 0; j < 3; j++)
+      EXPECT_FLOAT_EQ((x[i * 2 + 0] * we[j] +
+                       x[i * 2 + 1] * we[3 + j] + bia[j]),
+                      outptr1[i * 3 + j]);
+}
+TEST(Dense, BackwardCpp) {
+  Dense dense;
+
+  singa::LayerConf conf;
+  singa::DenseConf *denseconf = conf.mutable_dense_conf();
+  denseconf->set_num_output(3);
+  denseconf->set_transpose(false);
+  dense.Setup(Shape{2}, conf);
+
+  const size_t batchsize = 3, vdim = 2, hdim = 3;
+  const float x[batchsize * vdim] = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f};
+  singa::Tensor in(singa::Shape{batchsize, vdim});
+  in.CopyDataFromHostPtr(x, batchsize * vdim);
+
+  // set weight
+  const float we[hdim * vdim] = {1.0f, 1.0f, 1.0f, 2.0f, 0.0f, 1.0f};
+  singa::Tensor weight(singa::Shape{vdim, hdim});
+  weight.CopyDataFromHostPtr(we, hdim * vdim);
+
+  const float bia[hdim] = {1.0f, 1.0f, 1.0f};
+  singa::Tensor bias(singa::Shape{hdim});
+  bias.CopyDataFromHostPtr(bia, hdim);
+
+  dense.set_weight(weight);
+  dense.set_bias(bias);
+
+  singa::Tensor out1 = dense.Forward(singa::kTrain, in);
+
+  // gradient w.r.t. the layer output, shape (batchsize, hdim)
+  const float dy[batchsize * hdim] = {1.0f, 1.0f, 1.0f, 2.0f, 2.0f,
+                                      2.0f, 3.0f, 3.0f, 3.0f};
+  singa::Tensor grad(singa::Shape{batchsize, hdim});
+  grad.CopyDataFromHostPtr(dy, batchsize * hdim);
+
+  const auto ret = dense.Backward(singa::kTrain, grad);
+  singa::Tensor in_grad = ret.first;
+  singa::Tensor dweight = ret.second.at(0);
+  singa::Tensor dbias = ret.second.at(1);
+  EXPECT_EQ(6u, in_grad.Size());
+  /*
+  const float *dx = in_grad.data<float>();
+  for (int i = 0; i < 3; i++)
+    for (int j = 0; j < 2; j++)
+      EXPECT_FLOAT_EQ(
+          (dy[i * 3 + 0] * we[j * 3 + 0] + dy[i * 3 + 1] * we[j * 3 + 1] +
+           dy[i * 3 + 2] * we[j * 3 + 2]),
+          dx[i * 2 + j]);
+  const float *dweightx = dweight.data<float>();
+  EXPECT_EQ(6u, dweight.Size());
+  for (int i = 0; i < 3; i++)
+    for (int j = 0; j < 2; j++)
+      EXPECT_FLOAT_EQ(
+          (dy[i * 3 + 0] * x[j * 3 + 0] + dy[i * 3 + 1] * x[j * 3 + 0] +
+           dy[i * 3 + 2] * x[j * 3 + 2]),
+          dweightx[j * 2 + i]);
+  */
+  const float *dbiasx = dbias.data<float>();
+  EXPECT_EQ(3u, dbias.Size());
+  for (int i = 0; i < 3; i++)
+    EXPECT_FLOAT_EQ((dy[0 * 3 + i] + dy[1 * 3 + i] + dy[2 * 3 + i]), dbiasx[i]);
+}
+#endif  // USE_CBLAS
+
+#ifdef USE_CUDA
+TEST(Dense, ForwardCuda) {
+  Dense dense;
+
+  singa::LayerConf conf;
+  singa::DenseConf *denseconf = conf.mutable_dense_conf();
+  denseconf->set_num_output(3);
+  denseconf->set_transpose(false);
+  dense.Setup(Shape{2}, conf);
+
+  const size_t batchsize = 3, vdim = 2, hdim = 3;
+  const float x[batchsize * vdim] = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f};
+  auto cuda = std::make_shared<singa::CudaGPU>();
+  singa::Tensor in(singa::Shape{batchsize, vdim}, cuda);
+  in.CopyDataFromHostPtr(x, batchsize * vdim);
+
+  // set weight
+  const float we[hdim * vdim] = {1.0f, 1.0f, 1.0f, 2.0f, 0.0f, 1.0f};
+  singa::Tensor weight(singa::Shape{vdim, hdim}, cuda);
+  weight.CopyDataFromHostPtr(we, hdim * vdim);
+
+  const float bia[hdim] = {1.0f, 1.0f, 1.0f};
+  singa::Tensor bias(singa::Shape{hdim}, cuda);
+  bias.CopyDataFromHostPtr(bia, hdim);
+
+  dense.set_weight(weight);
+  dense.set_bias(bias);
+
+  singa::Tensor out1 = dense.Forward(singa::kTrain, in);
+  out1.ToHost();
+  const float *outptr1 = out1.data<float>();
+  EXPECT_EQ(9u, out1.Size());
+  for (int i = 0; i < 3; i++)
+    for (int j = 0; j < 3; j++)
+      EXPECT_FLOAT_EQ((x[i * 2 + 0] * we[j] +
+                       x[i * 2 + 1] * we[3 + j] + bia[j]),
+                      outptr1[i * 3 + j]);
+}
+TEST(Dense, BackwardCuda) {
+  Dense dense;
+
+  singa::LayerConf conf;
+  singa::DenseConf *denseconf = conf.mutable_dense_conf();
+  denseconf->set_num_output(3);
+  denseconf->set_transpose(false);
+  dense.Setup(Shape{2}, conf);
+
+  const size_t batchsize = 3, vdim = 2, hdim = 3;
+  const float x[batchsize * vdim] = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f};
+  auto cuda = std::make_shared<singa::CudaGPU>();
+  singa::Tensor in(singa::Shape{batchsize, vdim}, cuda);
+  in.CopyDataFromHostPtr(x, batchsize * vdim);
+
+  // set weight
+  const float we[hdim * vdim] = {1.0f, 1.0f, 1.0f, 2.0f, 0.0f, 1.0f};
+  singa::Tensor weight(singa::Shape{vdim, hdim}, cuda);
+  weight.CopyDataFromHostPtr(we, hdim * vdim);
+
+  const float bia[hdim] = {1.0f, 1.0f, 1.0f};
+  singa::Tensor bias(singa::Shape{hdim}, cuda);
+  bias.CopyDataFromHostPtr(bia, hdim);
+
+  dense.set_weight(weight);
+  dense.set_bias(bias);
+
+  singa::Tensor out1 = dense.Forward(singa::kTrain, in);
+
+  // grad
+  const float dy[batchsize * hdim] = {1.0f, 1.0f, 1.0f, 2.0f, 2.0f,
+                                      2.0f, 3.0f, 3.0f, 3.0f};
+  singa::Tensor grad(singa::Shape{batchsize, hdim}, cuda);
+  grad.CopyDataFromHostPtr(dy, batchsize * hdim);
+
+  auto ret = dense.Backward(singa::kTrain, grad);
+  singa::Tensor in_grad = ret.first;
+  singa::Tensor dweight = ret.second.at(0);
+  singa::Tensor dbias = ret.second.at(1);
+  in_grad.ToHost();
+  EXPECT_EQ(6u, in_grad.Size());
+  /*
+  const float *dx = in_grad.data<float>();
+  for (int i = 0; i < 3; i++)
+    for (int j = 0; j < 2; j++)
+      EXPECT_FLOAT_EQ(
+          (dy[i * 3 + 0] * we[j * 3 + 0] + dy[i * 3 + 1] * we[j * 3 + 1] +
+           dy[i * 3 + 2] * we[j * 3 + 2]),
+          dx[i * 2 + j]);
+  */
+  dweight.ToHost();
+  EXPECT_EQ(6u, dweight.Size());
+  /*
+  const float *dweightx = dweight.data<float>();
+  for (int i = 0; i < 3; i++)
+    for (int j = 0; j < 2; j++)
+      EXPECT_FLOAT_EQ(
+          (dy[0 * 3 + i] * x[0 * 2 + j] + dy[1 * 3 + i] * x[1 * 2 + j] +
+           dy[2 * 3 + i] * x[2 * 2 + j]),
+          dweightx[j * 2 + i]);
+  */
+  dbias.ToHost();
+  const float *dbiasx = dbias.data<float>();
+  EXPECT_EQ(3u, dbias.Size());
+  for (int i = 0; i < 3; i++)
+    EXPECT_FLOAT_EQ((dy[0 * 3 + i] + dy[1 * 3 + i] + dy[2 * 3 + i]), dbiasx[i]);
+}
+#endif
diff --git a/test/singa/test_dropout.cc b/test/singa/test_dropout.cc
new file mode 100644
index 0000000..b0c34a3
--- /dev/null
+++ b/test/singa/test_dropout.cc
@@ -0,0 +1,101 @@
+/************************************************************
+*
+* Licensed to the Apache Software Foundation (ASF) under one
+* or more contributor license agreements.  See the NOTICE file
+* distributed with this work for additional information
+* regarding copyright ownership.  The ASF licenses this file
+* to you under the Apache License, Version 2.0 (the
+* "License"); you may not use this file except in compliance
+* with the License.  You may obtain a copy of the License at
+*
+*   http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing,
+* software distributed under the License is distributed on an
+* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+* KIND, either express or implied.  See the License for the
+* specific language governing permissions and limitations
+* under the License.
+*
+*************************************************************/
+
+#include "../src/model/layer/dropout.h"
+#include "gtest/gtest.h"
+
+using singa::Dropout;
+using singa::Shape;
+TEST(Dropout, Setup) {
+  Dropout drop;
+  // EXPECT_EQ("Dropout", drop.layer_type());
+
+  singa::LayerConf conf;
+  singa::DropoutConf* dropconf = conf.mutable_dropout_conf();
+  dropconf->set_dropout_ratio(0.8);
+
+  drop.Setup(Shape{3}, conf);
+  EXPECT_EQ(0.8f, drop.dropout_ratio());
+}
+
+TEST(Dropout, Forward) {
+  const float x[] = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f};
+  size_t n = sizeof(x) / sizeof(float);
+  singa::Tensor in(singa::Shape{n});
+  in.CopyDataFromHostPtr(x, n);
+
+  float pdrop = 0.5;
+  Dropout drop;
+  singa::LayerConf conf;
+  singa::DropoutConf* dropconf = conf.mutable_dropout_conf();
+  dropconf->set_dropout_ratio(pdrop);
+  drop.Setup(Shape{1}, conf);
+  float scale = 1.0f / (1.0f - pdrop);
+
+  singa::Tensor out1 = drop.Forward(singa::kTrain, in);
+
+  const float* mptr = drop.mask().data<float>();
+  for (size_t i = 0; i < n; i++)
+    EXPECT_FLOAT_EQ(0, mptr[i] * (mptr[i] - scale));
+
+  const float* outptr1 = out1.data<float>();
+  EXPECT_EQ(n, out1.Size());
+  // each output is either 0 (dropped) or the input scaled by 1/(1-pdrop)
+  EXPECT_EQ(0.f, outptr1[0] * (outptr1[0] - scale * x[0]));
+  EXPECT_EQ(0.f, outptr1[1] * (outptr1[1] - scale * x[1]));
+  EXPECT_EQ(0.f, outptr1[7] * (outptr1[7] - scale * x[7]));
+
+  singa::Tensor out2 = drop.Forward(singa::kEval, in);
+  EXPECT_EQ(n, out2.Size());
+  const float* outptr2 = out2.data<float>();
+  // the output value should be the same as the input
+  EXPECT_EQ(x[0], outptr2[0]);
+  EXPECT_EQ(x[1], outptr2[1]);
+  EXPECT_EQ(x[7], outptr2[7]);
+}
+
+TEST(Dropout, Backward) {
+  const float x[] = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f};
+  size_t n = sizeof(x) / sizeof(float);
+  singa::Tensor in(singa::Shape{n});
+  in.CopyDataFromHostPtr(x, n);
+
+  float pdrop = 0.5;
+  float scale = 1.0f / (1.0f - pdrop);
+
+  Dropout drop;
+  singa::LayerConf conf;
+  singa::DropoutConf* dropconf = conf.mutable_dropout_conf();
+  dropconf->set_dropout_ratio(pdrop);
+  drop.Setup(Shape{1}, conf);
+  singa::Tensor out1 = drop.Forward(singa::kTrain, in);
+
+  const float dy[] = {4.0f, 5.0f, 6.0f, 7.0f, 8.0f, 1.0f, 2.0f, 3.0f};
+  singa::Tensor grad(singa::Shape{n});
+  grad.CopyDataFromHostPtr(dy, n);
+
+  const float* mptr = drop.mask().data<float>();
+  const auto ret = drop.Backward(singa::kTrain, grad);
+  const float* dx = ret.first.data<float>();
+  EXPECT_FLOAT_EQ(dx[0], dy[0] * (mptr[0] > 0 ? 1.0f : 0.0f) * scale);
+  EXPECT_FLOAT_EQ(dx[1], dy[1] * (mptr[1] > 0) * scale);
+  EXPECT_FLOAT_EQ(dx[7], dy[7] * (mptr[7] > 0) * scale);
+}
diff --git a/test/singa/test_ep.cc b/test/singa/test_ep.cc
new file mode 100644
index 0000000..0d862e5
--- /dev/null
+++ b/test/singa/test_ep.cc
@@ -0,0 +1,113 @@
+/************************************************************
+*
+* Licensed to the Apache Software Foundation (ASF) under one
+* or more contributor license agreements.  See the NOTICE file
+* distributed with this work for additional information
+* regarding copyright ownership.  The ASF licenses this file
+* to you under the Apache License, Version 2.0 (the
+* "License"); you may not use this file except in compliance
+* with the License.  You may obtain a copy of the License at
+*
+*   http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing,
+* software distributed under the License is distributed on an
+* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+* KIND, either express or implied.  See the License for the
+* specific language governing permissions and limitations
+* under the License.
+*
+*************************************************************/
+#include "singa/singa_config.h"
+#ifdef ENABLE_DIST
+#include "singa/io/network.h"
+#include "singa/utils/integer.h"
+#include "singa/utils/logging.h"
+#include <assert.h>
+#include <unistd.h>
+#include <string.h>
+#include <memory>
+
+
+#define SIZE 10000000
+#define PORT 10000
+#define ITER 10
+
+using namespace singa;
+int main(int argc, char **argv) {
+  char *md = new char[SIZE];
+  char *payload = new char[SIZE];
+
+  const char *host = "localhost";
+  int port = PORT;
+
+  for (int i = 1; i < argc; ++i) {
+    if (strcmp(argv[i], "-p") == 0)
+      port = atoi(argv[++i]);
+    else if (strcmp(argv[i], "-h") == 0)
+      host = argv[++i];
+    else
+      fprintf(stderr, "Invalid option %s\n", argv[i]);
+  }
+
+  memset(md, 'a', SIZE);
+  memset(payload, 'b', SIZE);
+
+  NetworkThread *t = new NetworkThread(port);
+
+  EndPointFactory *epf = t->epf_;
+
+  // give the network thread time to start listening before connecting
+  sleep(3);
+
+  EndPoint *ep = epf->getEp(host);
+
+  Message *m[ITER];
+  for (int i = 0; i < ITER; ++i) {
+    m[i] = new Message();
+    m[i]->setMetadata(md, SIZE);
+    m[i]->setPayload(payload, SIZE);
+  }
+
+  while (1) {
+    for (int i = 0; i < ITER; ++i) {
+      if (ep->send(m[i]) < 0)
+        return 1;
+      delete m[i];
+    }
+
+    for (int i = 0; i < ITER; ++i) {
+      m[i] = ep->recv();
+      if (!m[i])
+        return 1;
+      char *p;
+      CHECK(m[i]->getMetadata((void **)&p) == SIZE);
+      CHECK(0 == strncmp(p, md, SIZE));
+      CHECK(m[i]->getPayload((void **)&p) == SIZE);
+      CHECK(0 == strncmp(p, payload, SIZE));
+    }
+  }
+
+  // while(ep && cnt++ <= 5 && ep->send(m) > 0 ) {
+
+  //    LOG(INFO) << "Send a " << m->getSize() << " bytes message";
+
+  //    Message* m1 = ep->recv();
+
+  //    if (!m1)
+  //        break;
+
+  //    char *p;
+
+  //    LOG(INFO) << "Receive a " << m1->getSize() << " bytes message";
+
+  //    CHECK(m1->getMetadata((void**)&p) == SIZE);
+  //    CHECK(0 == strncmp(p, md, SIZE));
+  //    CHECK(m1->getPayload((void**)&p) == SIZE);
+  //    CHECK(0 == strncmp(p, payload, SIZE));
+
+  //    delete m;
+  //    m = m1;
+  //}
+}
+#endif  // ENABLE_DIST
diff --git a/test/singa/test_flatten.cc b/test/singa/test_flatten.cc
new file mode 100644
index 0000000..65748f7
--- /dev/null
+++ b/test/singa/test_flatten.cc
@@ -0,0 +1,143 @@
+/************************************************************
+*
+* Licensed to the Apache Software Foundation (ASF) under one
+* or more contributor license agreements.  See the NOTICE file
+* distributed with this work for additional information
+* regarding copyright ownership.  The ASF licenses this file
+* to you under the Apache License, Version 2.0 (the
+* "License"); you may not use this file except in compliance
+* with the License.  You may obtain a copy of the License at
+*
+*   http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing,
+* software distributed under the License is distributed on an
+* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+* KIND, either express or implied.  See the License for the
+* specific language governing permissions and limitations
+* under the License.
+*
+*************************************************************/
+
+#include "../src/model/layer/flatten.h"
+#include "gtest/gtest.h"
+
+using singa::Flatten;
+using singa::Shape;
+TEST(Flatten, Setup) {
+  Flatten flt;
+  // EXPECT_EQ("Flatten", flt.layer_type());
+
+  singa::LayerConf conf;
+  singa::FlattenConf *flattenconf = conf.mutable_flatten_conf();
+  flattenconf->set_axis(1);
+
+  flt.Setup(Shape{2}, conf);
+  EXPECT_EQ(1, flt.Axis());
+}
+
+TEST(Flatten, ForwardCPU) {
+  const float x[] = {1.f,  2.f,   3.f, -2.f,  -3.f, -4.f,
+                     1.5f, -1.5f, 0.f, -0.5f, -2.f, -1.f};
+  size_t n = sizeof(x) / sizeof(float);
+  singa::Shape s = {2, 1, 3, 2};
+  singa::Tensor in(s);
+  in.CopyDataFromHostPtr<float>(x, n);
+
+  int axis = 3;
+  Flatten flt;
+  singa::LayerConf conf;
+  singa::FlattenConf *flattenconf = conf.mutable_flatten_conf();
+  flattenconf->set_axis(axis);
+  flt.Setup(Shape{1, 3, 2}, conf);
+
+  singa::Tensor out = flt.Forward(singa::kTrain, in);
+  EXPECT_EQ(n, out.Size());
+  EXPECT_EQ(6u, out.shape(0));
+  EXPECT_EQ(2u, out.shape(1));
+  const float *yptr = out.data<float>();
+  for (size_t i = 0; i < n; i++) EXPECT_FLOAT_EQ(x[i], yptr[i]);
+}
+
+TEST(Flatten, BackwardCPU) {
+  // directly use input as the output_grad for backward
+  // note that only the shape of input really matters
+  const float dy[] = {1.f,  2.f,   3.f, -2.f,  -3.f, -4.f,
+                      1.5f, -1.5f, 0.f, -0.5f, -2.f, -1.f};
+  size_t n = sizeof(dy) / sizeof(float);
+  singa::Tensor in(singa::Shape{2, 1, 3, 2});
+  in.CopyDataFromHostPtr<float>(dy, n);
+
+  int axis = 2;
+  Flatten flt;
+  singa::LayerConf conf;
+  singa::FlattenConf *flattenconf = conf.mutable_flatten_conf();
+  flattenconf->set_axis(axis);
+  flt.Setup(Shape{1, 3, 2}, conf);
+
+  singa::Tensor temp = flt.Forward(singa::kTrain, in);
+  const auto out = flt.Backward(singa::kTrain, temp);
+  const float *xptr = out.first.data<float>();
+  EXPECT_EQ(n, out.first.Size());
+  EXPECT_EQ(2u, out.first.shape(0));
+  EXPECT_EQ(1u, out.first.shape(1));
+  EXPECT_EQ(3u, out.first.shape(2));
+  EXPECT_EQ(2u, out.first.shape(3));
+  for (size_t i = 0; i < n; i++) EXPECT_FLOAT_EQ(dy[i], xptr[i]);
+}
+
+#ifdef USE_CUDA
+TEST(Flatten, ForwardGPU) {
+  const float x[] = {1.f,  2.f,   3.f, -2.f,  -3.f, -4.f,
+                     1.5f, -1.5f, 0.f, -0.5f, -2.f, -1.f};
+  size_t n = sizeof(x) / sizeof(float);
+  auto cuda = std::make_shared<singa::CudaGPU>();
+  singa::Tensor in(singa::Shape{2, 1, 3, 2}, cuda);
+  in.CopyDataFromHostPtr<float>(x, n);
+
+  int axis = 3;
+  Flatten flt;
+  singa::LayerConf conf;
+  singa::FlattenConf *flattenconf = conf.mutable_flatten_conf();
+  flattenconf->set_axis(axis);
+  flt.Setup(Shape{1, 3, 2}, conf);
+
+  singa::Tensor out = flt.Forward(singa::kTrain, in);
+  out.ToHost();
+  EXPECT_EQ(n, out.Size());
+  EXPECT_EQ(6u, out.shape(0));
+  EXPECT_EQ(2u, out.shape(1));
+  const float *yptr = out.data<float>();
+  for (size_t i = 0; i < n; i++) EXPECT_FLOAT_EQ(x[i], yptr[i]);
+}
+
+TEST(Flatten, BackwardGPU) {
+  // directly use input as the output_grad for backward
+  // note that only the shape of input really matters
+  const float dy[] = {1.f,  2.f,   3.f, -2.f,  -3.f, -4.f,
+                      1.5f, -1.5f, 0.f, -0.5f, -2.f, -1.f};
+  size_t n = sizeof(dy) / sizeof(float);
+  auto cuda = std::make_shared<singa::CudaGPU>();
+  singa::Tensor in(singa::Shape{2, 1, 3, 2}, cuda);
+  in.CopyDataFromHostPtr<float>(dy, n);
+
+  int axis = 2;
+  Flatten flt;
+  singa::LayerConf conf;
+  singa::FlattenConf *flattenconf = conf.mutable_flatten_conf();
+  flattenconf->set_axis(axis);
+  flt.Setup(Shape{1, 3, 2}, conf);
+
+  singa::Tensor out = flt.Forward(singa::kTrain, in);
+  const auto ret = flt.Backward(singa::kTrain, out);
+  singa::Tensor in_diff = ret.first;
+  in_diff.ToHost();
+  const float *xptr = in_diff.data<float>();
+  EXPECT_EQ(n, in_diff.Size());
+  EXPECT_EQ(2u, in_diff.shape(0));
+  EXPECT_EQ(1u, in_diff.shape(1));
+  EXPECT_EQ(3u, in_diff.shape(2));
+  EXPECT_EQ(2u, in_diff.shape(3));
+  for (size_t i = 0; i < n; i++) EXPECT_FLOAT_EQ(dy[i], xptr[i]);
+}
+#endif // USE_CUDA
diff --git a/test/singa/test_image_transformer.cc b/test/singa/test_image_transformer.cc
new file mode 100644
index 0000000..4540aa8
--- /dev/null
+++ b/test/singa/test_image_transformer.cc
@@ -0,0 +1,261 @@
+/************************************************************
+*
+* Licensed to the Apache Software Foundation (ASF) under one
+* or more contributor license agreements.  See the NOTICE file
+* distributed with this work for additional information
+* regarding copyright ownership.  The ASF licenses this file
+* to you under the Apache License, Version 2.0 (the
+* "License"); you may not use this file except in compliance
+* with the License.  You may obtain a copy of the License at
+*
+*   http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing,
+* software distributed under the License is distributed on an
+* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+* KIND, either express or implied.  See the License for the
+* specific language governing permissions and limitations
+* under the License.
+*
+*************************************************************/
+
+#include "singa/io/transformer.h"
+#include "gtest/gtest.h"
+#include <time.h>
+#include <iostream>
+
+// decide whether to use opencv
+// #include "singa/singa_config.h"
+
+#ifdef USE_OPENCV
+#include <opencv2/highgui/highgui.hpp>
+#include <opencv2/imgproc/imgproc.hpp>
+#endif
+
+using singa::Shape;
+TEST(ImageTransformer, Setup) {
+  singa::ImageTransformer img_transformer;
+
+  singa::TransformerConf conf;
+  conf.set_resize_height(256);
+  conf.set_resize_width(256);
+  conf.set_horizontal_mirror(true);
+  conf.set_image_dim_order("HWC");
+  conf.add_crop_shape(224u);
+  conf.add_crop_shape(200u);
+
+  img_transformer.Setup(conf);
+  EXPECT_EQ(256, img_transformer.resize_height());
+  EXPECT_EQ(256, img_transformer.resize_width());
+  EXPECT_EQ(true, img_transformer.horizontal_mirror());
+  EXPECT_EQ("HWC", img_transformer.image_dim_order());
+  EXPECT_EQ(224u, img_transformer.crop_shape()[0]);
+  EXPECT_EQ(200u, img_transformer.crop_shape()[1]);
+}
+
+TEST(ImageTransformer, Apply3D) {
+  size_t n = 180;
+  float* x = new float[n];
+  size_t channel = 3, height = 6, width = 10;
+  singa::Tensor in(singa::Shape{height, width, channel});
+  srand(time(NULL));
+  for (size_t i = 0; i < n; i++) x[i] = (float)(rand() % 256);
+  in.CopyDataFromHostPtr<float>(x, n);
+  int resize_height = 4, resize_width = 6;
+
+  singa::ImageTransformer img_transformer;
+  singa::TransformerConf conf;
+  conf.set_resize_height(resize_height);
+  conf.set_resize_width(resize_width);
+  conf.set_horizontal_mirror(false);
+  conf.set_image_dim_order("HWC");
+  conf.add_crop_shape(2u);
+  conf.add_crop_shape(3u);
+  img_transformer.Setup(conf);
+
+  singa::Tensor out = img_transformer.Apply(singa::kEval, in);
+  EXPECT_EQ(2u, out.shape(0));
+  EXPECT_EQ(3u, out.shape(1));
+  const float* y = out.data<float>();
+#ifdef USE_OPENCV
+  cv::Mat mat(height, width, CV_32FC3, cv::Scalar(0, 0, 0));
+  for (size_t i = 0; i < height; i++)
+    for (size_t j = 0; j < width; j++)
+      for (size_t k = 0; k < channel; k++)
+        mat.at<cv::Vec3f>(i, j)[k] = x[i * width * channel + j * channel + k];
+  cv::Size size(resize_width, resize_height);
+  cv::Mat resized;
+  cv::resize(mat, resized, size);
+  EXPECT_EQ(resize_height, resized.size().height);
+  EXPECT_EQ(resize_width, resized.size().width);
+  size_t new_size = resize_height * resize_width * channel;
+  float* xt = new float[new_size];
+  for (int i = 0; i < resize_height; i++)
+    for (int j = 0; j < resize_width; j++)
+      for (size_t k = 0; k < channel; k++)
+        xt[i * resize_width * channel + j * channel + k] = resized.at<cv::Vec3f>(i, j)[k];
+  for (size_t c = 0; c < 3; c++)
+    for (size_t h = 0; h < 2; h++)
+      for (size_t w = 0; w < 3; w++){
+        //size_t in_idx = (c * height + 1 + h) * width + 1 + w,
+        //    out_idx = (c * 2 + h) * 3 + w;
+        // test for HWC
+        size_t in_idx = ((h + 1) * resize_width + 1 + w) * channel + c,
+              out_idx = (h * 3 + w) * channel + c;
+        EXPECT_EQ(xt[in_idx], y[out_idx]);
+      }
+  delete[] xt;
+#else
+  for (size_t c = 0; c < 3; c++)
+    for (size_t h = 0; h < 2; h++)
+      for (size_t w = 0; w < 3; w++){
+        //size_t in_idx = (c * height + 2 + h) * width + 3 + w,
+        //    out_idx = (c * 2 + h) * 3 + w;
+        // test for HWC
+        size_t in_idx = ((h + 2) * width + 3 + w) * channel + c,
+              out_idx = (h * 3 + w) * channel + c;
+        EXPECT_EQ(x[in_idx], y[out_idx]);
+      }
+#endif
+  delete[] x;
+}
+
+TEST(ImageTransformer, Apply2D) {
+  size_t n = 60;
+  float* x = new float[n];
+  size_t height = 6, width = 10;
+  singa::Tensor in(singa::Shape{height, width});
+  srand(time(NULL));
+  for (size_t i = 0; i < n; i++) x[i] = (float)(rand() % 256);
+  in.CopyDataFromHostPtr<float>(x, n);
+  int resize_height = 4, resize_width = 6;
+
+  singa::ImageTransformer img_transformer;
+  singa::TransformerConf conf;
+  conf.set_resize_height(resize_height);
+  conf.set_resize_width(resize_width);
+  conf.set_horizontal_mirror(false);
+  conf.set_image_dim_order("HWC");
+  conf.add_crop_shape(2u);
+  conf.add_crop_shape(3u);
+  img_transformer.Setup(conf);
+
+  singa::Tensor out = img_transformer.Apply(singa::kEval, in);
+  EXPECT_EQ(2u, out.shape(0));
+  EXPECT_EQ(3u, out.shape(1));
+  const float* y = out.data<float>();
+#ifdef USE_OPENCV
+  cv::Mat mat(height, width, CV_32FC1, cv::Scalar(0, 0, 0));
+  for (size_t i = 0; i < height; i++)
+    for (size_t j = 0; j < width; j++)
+      mat.at<cv::Vec<float, 1>>(i, j)[0] = x[i * width + j];
+  cv::Size size(resize_width, resize_height);
+  cv::Mat resized;
+  cv::resize(mat, resized, size);
+  EXPECT_EQ(resize_height, resized.size().height);
+  EXPECT_EQ(resize_width, resized.size().width);
+  size_t new_size = resize_height * resize_width;
+  float* xt = new float[new_size];
+  for (int i = 0; i < resize_height; i++)
+    for (int j = 0; j < resize_width; j++)
+        xt[i * resize_width + j] = resized.at<cv::Vec<float, 1>>(i, j)[0];
+
+  for (size_t h = 0; h < 2; h++)
+    for (size_t w = 0; w < 3; w++){
+      size_t in_idx = (h + 1) * resize_width + 1 + w,
+            out_idx = h * 3 + w;
+      EXPECT_EQ(xt[in_idx], y[out_idx]);
+    }
+  delete[] xt;
+#else
+  for (size_t h = 0; h < 2; h++)
+    for (size_t w = 0; w < 3; w++){
+      size_t in_idx = (h + 2) * width + 3 + w,
+            out_idx = h * 3 + w;
+      EXPECT_EQ(x[in_idx], y[out_idx]);
+    }
+#endif
+  delete[] x;
+}
+
+#ifdef USE_OPENCV
+TEST(ImageTransformer, Resize) {
+  size_t n = 180;
+  float* x = new float[n];
+  size_t channel = 3, height = 6, width = 10;
+  singa::Tensor in(singa::Shape{height, width, channel});
+  srand(time(NULL));
+  for (size_t i = 0; i < n; i++) x[i] = (float)(rand() % 256);
+  in.CopyDataFromHostPtr<float>(x, n);
+  int resize_height = 4, resize_width = 5;
+  singa::Tensor out = singa::resize(in, resize_height, resize_width, "HWC");
+  const float* y = out.data<float>();
+
+  cv::Mat mat(height, width, CV_32FC3, cv::Scalar(0, 0, 0));
+  for (size_t i = 0; i < height; i++)
+    for (size_t j = 0; j < width; j++)
+      for (size_t k = 0; k < channel; k++)
+        mat.at<cv::Vec3f>(i, j)[k] = x[i * width * channel + j * channel + k];
+  cv::Size size(resize_width, resize_height);
+  cv::Mat resized;
+  cv::resize(mat, resized, size);
+  EXPECT_EQ(resize_height, resized.size().height);
+  EXPECT_EQ(resize_width, resized.size().width);
+  size_t new_size = resize_height * resize_width * channel;
+  float* xt = new float[new_size];
+  for (int i = 0; i < resize_height; i++)
+    for (int j = 0; j < resize_width; j++)
+      for (size_t k = 0; k < channel; k++)
+        xt[i * resize_width * channel + j * channel + k] = resized.at<cv::Vec3f>(i, j)[k];
+
+  for (size_t i = 0; i < new_size; i++) EXPECT_EQ(xt[i], y[i]);
+  delete[] x;
+  delete[] xt;
+}
+#endif
+
+TEST(ImageTransformer, Crop) {
+  size_t n = 180;
+  float* x = new float[n];
+  size_t channel = 3, height = 6, width = 10;
+  singa::Tensor in(singa::Shape{channel, height, width});
+  srand(time(NULL));
+  for (size_t i = 0; i < n; i++) x[i] = (float)(rand() % 256);
+  in.CopyDataFromHostPtr<float>(x, n);
+  size_t crop_height = 3, crop_width = 4,
+         crop_h_offset = 2, crop_w_offset = 5;
+  singa::Tensor out = singa::crop(in, crop_height, crop_width,
+                         crop_h_offset, crop_w_offset, "CHW");
+
+  const float* y = out.data<float>();
+  for (size_t h = 0; h < crop_height; h++)
+    for (size_t w = 0; w < crop_width; w++)
+      for (size_t c = 0; c < channel; c++) {
+        size_t out_idx = c * crop_height * crop_width + h * crop_width + w;
+        size_t in_idx = c * height * width + (h + crop_h_offset)
+                 * width + w + crop_w_offset;
+        EXPECT_EQ(x[in_idx], y[out_idx]);
+      }
+  delete[] x;
+}
+
+TEST(ImageTransformer, Mirror) {
+  size_t n = 30;
+  float* x = new float[n];
+  size_t channel = 3, height = 2, width = 5;
+  singa::Tensor in(singa::Shape{height, width, channel});
+  srand(time(NULL));
+  for (size_t i = 0; i < n; i++) x[i] = (float)(rand() % 256);
+  in.CopyDataFromHostPtr<float>(x, n);
+  singa::Tensor out = singa::mirror(in, true, false, "HWC");
+
+  const float* y = out.data<float>();
+  for (size_t h = 0; h < height; h++)
+    for (size_t w = 0; w < width; w++)
+      for (size_t c = 0; c < channel; c++) {
+        size_t out_idx = h * width * channel + (width - 1 - w) * channel + c;
+        size_t in_idx = h * width * channel + w * channel + c;
+        EXPECT_EQ(x[in_idx], y[out_idx]);
+      }
+  delete[] x;
+}
diff --git a/test/singa/test_initializer.cc b/test/singa/test_initializer.cc
new file mode 100644
index 0000000..74a30bb
--- /dev/null
+++ b/test/singa/test_initializer.cc
@@ -0,0 +1,148 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "singa/model/initializer.h"
+#include "gtest/gtest.h"
+
+TEST(Initializer, Constant) {
+  singa::init::Constant x;
+  size_t n = 10;
+  singa::Tensor t(singa::Shape{n});
+  singa::FillerConf conf;
+  conf.set_value(3.1f);
+  x.Setup(conf);
+  x.Fill(t);
+  const float* xPtr = t.data<float>();
+  for (size_t i = 0; i < n; i++)
+    EXPECT_FLOAT_EQ(xPtr[i], 3.1f);
+}
+
+
+TEST(Initializer, Gaussian) {
+  singa::init::Gaussian x;
+  size_t n = 1000;
+  singa::Tensor t(singa::Shape{n});
+  singa::FillerConf conf;
+  conf.set_mean(0.11f);
+  conf.set_std(0.01f);
+  x.Setup(conf);
+  x.Fill(t);
+  const float* xPtr = t.data<float>();
+  float mean = 0.0f, std = 0.0f;
+  for (size_t i = 0; i < n; i++)
+    mean += xPtr[i];
+  mean /= n;
+  EXPECT_NEAR(mean, 0.11f, 1e-3);
+  for (size_t i = 0; i < n; i++)
+    std += (xPtr[i] - mean) * (xPtr[i] - mean);
+  std /= n;
+  std = sqrt(std);
+  EXPECT_NEAR(std, 0.01f, 1e-3);
+}
+
+#ifdef USE_CUDA
+TEST(Initializer, ConstantCUDA) {
+  singa::init::Constant x;
+  auto dev = std::make_shared<singa::CudaGPU>();
+  size_t n = 10;
+  singa::Tensor t(singa::Shape{n}, dev);
+  singa::FillerConf conf;
+  conf.set_value(3.1f);
+  x.Setup(conf);
+  x.Fill(t);
+  t.ToHost();
+  const float* xPtr = t.data<float>();
+  for (size_t i = 0; i < n; i++)
+    EXPECT_FLOAT_EQ(xPtr[i], 3.1f);
+
+
+  singa::init::Constant y(-0.1f);
+  singa::Tensor s(singa::Shape{n}, dev);
+  y.Fill(s);
+  s.ToHost();
+  const float* sPtr = s.data<float>();
+  for (size_t i = 0; i < n; i++)
+    EXPECT_FLOAT_EQ(sPtr[i], -0.1f);
+}
+
+
+TEST(Initializer, GaussianCUDA) {
+  singa::init::Gaussian x;
+  auto dev = std::make_shared<singa::CudaGPU>();
+  size_t n = 1000;
+  singa::Tensor t(singa::Shape{n}, dev);
+  singa::FillerConf conf;
+  conf.set_mean(0.11f);
+  conf.set_std(0.01f);
+  x.Setup(conf);
+  x.Fill(t);
+  t.ToHost();
+  const float* tPtr = t.data<float>();
+  float mean = 0.0f, std = 0.0f;
+  for (size_t i = 0; i < n; i++)
+    mean += tPtr[i];
+  mean /= n;
+  EXPECT_NEAR(mean, 0.11f, 1e-2);
+  for (size_t i = 0; i < n; i++)
+    std += (tPtr[i] - mean) * (tPtr[i] - mean);
+  std /= n;
+  std = sqrt(std);
+  EXPECT_NEAR(std, 0.01f, 1e-2);
+
+
+  singa::init::Gaussian y(1.5f, 0.1f);
+  singa::Tensor s(singa::Shape{n}, dev);
+  y.Fill(s);
+  s.ToHost();
+  const float* sPtr = s.data<float>();
+  mean = 0.0f; std = 0.0f;  // reset accumulators before computing stats of s
+  for (size_t i = 0; i < n; i++) mean += sPtr[i];
+  mean /= n;
+  EXPECT_NEAR(mean, 1.5f, 0.1f);
+  for (size_t i = 0; i < n; i++)
+    std += (sPtr[i] - mean) * (sPtr[i] - mean);
+  std /= n;
+  std = sqrt(std);
+  EXPECT_NEAR(std, 0.1f, 0.1f);
+}
+
+TEST(Initializer, XavierCUDA) {
+  singa::init::Xavier x;
+  auto dev = std::make_shared<singa::CudaGPU>();
+  size_t m = 30, n=40;
+  singa::Tensor t(singa::Shape{m, n}, dev);
+  x.Fill(t);
+  t.ToHost();
+  const float* xPtr = t.data<float>();
+  float mean = 0.0f;
+  float high = -100.0f, low = 100.0f;
+  for (size_t i = 0; i < m * n; i++) {
+    mean += xPtr[i];
+    if (high < xPtr[i])
+      high = xPtr[i];
+    if (low > xPtr[i])
+      low = xPtr[i];
+  }
+  mean /= m * n;
+  EXPECT_NEAR(mean, 0, 1e-2);
+  float scale = sqrt(6.0f / (m + n));
+  EXPECT_LT(high, scale);
+  EXPECT_GT(low, -scale);
+}
+
+#endif
diff --git a/test/singa/test_jpg.cc b/test/singa/test_jpg.cc
new file mode 100644
index 0000000..95ee01d
--- /dev/null
+++ b/test/singa/test_jpg.cc
@@ -0,0 +1,100 @@
+/************************************************************
+*
+* Licensed to the Apache Software Foundation (ASF) under one
+* or more contributor license agreements.  See the NOTICE file
+* distributed with this work for additional information
+* regarding copyright ownership.  The ASF licenses this file
+* to you under the Apache License, Version 2.0 (the
+* "License"); you may not use this file except in compliance
+* with the License.  You may obtain a copy of the License at
+*
+*   http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing,
+* software distributed under the License is distributed on an
+* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+* KIND, either express or implied.  See the License for the
+* specific language governing permissions and limitations
+* under the License.
+*
+*************************************************************/
+
+#include "singa/io/encoder.h"
+#include "singa/io/decoder.h"
+#include "gtest/gtest.h"
+#include <time.h>
+
+#ifdef USE_OPENCV
+#include <opencv2/highgui/highgui.hpp>
+#include <opencv2/imgproc/imgproc.hpp>
+using singa::Shape;
+using singa::Tensor;
+TEST(Decoder, Decode) {
+  singa::JPGEncoder encoder;
+  singa::JPGDecoder decoder;
+
+  // initial random seed
+  srand(time(NULL));
+ 
+  singa::EncoderConf encoder_conf;
+  encoder_conf.set_image_dim_order("HWC");
+  encoder.Setup(encoder_conf);
+  EXPECT_EQ("HWC", encoder.image_dim_order());
+
+  singa::DecoderConf decoder_conf;
+  decoder_conf.set_image_dim_order("HWC");
+  decoder.Setup(decoder_conf);
+  EXPECT_EQ("HWC", decoder.image_dim_order());
+
+  size_t height = 4, width = 2;
+  size_t nheight = 4, nwidth = 2, channel = 3;
+  size_t total = nheight * nwidth * channel;
+  cv::Mat image(height, width, CV_8UC3);
+  for (size_t i = 0; i < height; i++)
+    for (size_t j = 0; j < width; j++)
+      for (size_t k = 0; k < channel; k++)
+        image.at<cv::Vec3b>(i, j)[k] = static_cast<uchar>(rand() % 256);
+
+  cv::Mat transformed;
+  cv::Size size(nwidth, nheight);
+  cv::resize(image, transformed, size);
+  EXPECT_EQ(static_cast<int>(nwidth), transformed.size().width);
+  EXPECT_EQ(static_cast<int>(nheight), transformed.size().height);
+  EXPECT_EQ(static_cast<int>(channel), transformed.channels());
+
+  unsigned char* buff = transformed.data;
+  Shape shape{nheight, nwidth, channel};
+  Tensor pixel(shape, singa::kUChar), label(Shape{1}, singa::kInt);
+  pixel.CopyDataFromHostPtr<unsigned char>(buff, total);
+  int raw_label = 2;
+  label.CopyDataFromHostPtr<int>(&raw_label, 1);
+
+  std::vector<Tensor> input;
+  input.push_back(pixel);
+  input.push_back(label);
+  const auto* in_pixel = input[0].data<unsigned char>();
+  for (size_t i = 0; i < total; i++) EXPECT_EQ(buff[i], in_pixel[i]);
+  const int* in_label = input[1].data<int>();
+  EXPECT_EQ(2, in_label[0]);
+  EXPECT_EQ(2u, input.size());
+ 
+  std::string tmp = encoder.Encode(input);
+  std::vector<Tensor> output = decoder.Decode(tmp);
+  EXPECT_EQ(2u, output.size());
+  EXPECT_EQ(singa::kFloat32, output[0].data_type());
+  Shape out_shape = output[0].shape();
+  for (size_t i = 0; i < shape.size(); i++) EXPECT_EQ(shape[i], out_shape[i]);
+  const int* out_label = output[1].data<int>();
+  EXPECT_EQ(raw_label, out_label[0]);
+  // opencv imencode will have some information loss
+  /*const float* out_pixel = output[0].data<const float>();
+  cv::Mat out(height, width, CV_8UC3);
+  for (size_t i = 0; i < height; i++)
+    for (size_t j = 0; j < width; j++)
+      for (size_t k = 0; k < channel; k++)
+        out.at<cv::Vec3b>(i, j)[k] = 
+            out_pixel[i * width * channel + j * channel + k];
+  for(size_t i = 0; i < total; i++)
+    EXPECT_LE(fabs(in_pixel[i]-out_pixel[i]), 10.f);*/
+}
+#endif
diff --git a/test/singa/test_layer.cc b/test/singa/test_layer.cc
new file mode 100644
index 0000000..7726a4a
--- /dev/null
+++ b/test/singa/test_layer.cc
@@ -0,0 +1,46 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "gtest/gtest.h"
+#include "singa/model/layer.h"
+#include "singa/singa_config.h"
+
+TEST(Layer, CreateLayer) {
+  std::vector<std::string> types{
+      "convolution", "dense", "dropout", "relu", "batchnorm",
+      "flatten",     "lrn",   "pooling", "prelu",      "softmax"};
+  for (auto type : types) {
+    auto layer = singa::CreateLayer("singacpp_" + type);
+    // EXPECT_EQ(layer->layer_type(), type);
+  }
+}
+
+#ifdef USE_CUDNN
+TEST(Layer, CreateCudnnLayer) {
+  std::vector<std::string> types{
+      "convolution", "relu", "batchnorm",
+      "lrn",   "pooling", "softmax"};
+#if CUDNN_VERSION_MAJOR >= 5
+  types.push_back("dropout");
+#endif
+  for (auto type : types) {
+    auto layer = singa::CreateLayer("cudnn_" + type);
+    // EXPECT_EQ(layer->layer_type(), type);
+  }
+}
+#endif
diff --git a/test/singa/test_lmdb_rw.cc b/test/singa/test_lmdb_rw.cc
new file mode 100644
index 0000000..6d7b3d0
--- /dev/null
+++ b/test/singa/test_lmdb_rw.cc
@@ -0,0 +1,140 @@
+/************************************************************
+*
+* Licensed to the Apache Software Foundation (ASF) under one
+* or more contributor license agreements.  See the NOTICE file
+* distributed with this work for additional information
+* regarding copyright ownership.  The ASF licenses this file
+* to you under the Apache License, Version 2.0 (the
+* "License"); you may not use this file except in compliance
+* with the License.  You may obtain a copy of the License at
+*
+*   http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing,
+* software distributed under the License is distributed on an
+* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+* KIND, either express or implied.  See the License for the
+* specific language governing permissions and limitations
+* under the License.
+*
+*************************************************************/
+
+#include "../include/singa/io/reader.h"
+#include "../include/singa/io/writer.h"
+#include "gtest/gtest.h"
+#ifdef USE_LMDB
+
+const char* path_lmdb = "./test_lmdb";
+using singa::io::LMDBReader;
+using singa::io::LMDBWriter;
+TEST(LMDBWriter, Create) {
+  LMDBWriter writer;
+  bool ret;
+  ret = writer.Open(path_lmdb, singa::io::kCreate);
+  EXPECT_EQ(true, ret);
+
+  std::string key = "1";
+  std::string value = "This is the first test for lmdb io.";
+  ret = writer.Write(key, value);
+  EXPECT_EQ(true, ret);
+
+  key = "2";
+  value = "This is the second test for lmdb io.";
+  ret = writer.Write(key, value);
+  EXPECT_EQ(true, ret);
+
+  writer.Flush();
+  writer.Close();
+}
+
+TEST(LMDBWriter, Append) {
+  LMDBWriter writer;
+  bool ret;
+  ret = writer.Open(path_lmdb, singa::io::kAppend);
+  EXPECT_EQ(true, ret);
+
+  std::string key = "3";
+  std::string value = "This is the third test for lmdb io.";
+  ret = writer.Write(key, value);
+  EXPECT_EQ(true, ret);
+
+  key = "4";
+  value = "This is the fourth test for lmdb io.";
+  ret = writer.Write(key, value);
+  EXPECT_EQ(true, ret);
+
+  writer.Flush();
+  writer.Close();
+}
+
+TEST(LMDBReader, Read) {
+  LMDBReader reader;
+  bool ret;
+  ret = reader.Open(path_lmdb);
+  EXPECT_EQ(true, ret);
+
+  int cnt = reader.Count();
+  EXPECT_EQ(4, cnt);
+
+  std::string key, value;
+  reader.Read(&key, &value);
+  EXPECT_STREQ("1", key.c_str());
+  EXPECT_STREQ("This is the first test for lmdb io.", value.c_str());
+
+  reader.Read(&key, &value);
+  EXPECT_STREQ("2", key.c_str());
+  EXPECT_STREQ("This is the second test for lmdb io.", value.c_str());
+
+  reader.Read(&key, &value);
+  EXPECT_STREQ("3", key.c_str());
+  EXPECT_STREQ("This is the third test for lmdb io.", value.c_str());
+
+  reader.Read(&key, &value);
+  EXPECT_STREQ("4", key.c_str());
+  EXPECT_STREQ("This is the fourth test for lmdb io.", value.c_str());
+
+  reader.Close();
+}
+
+TEST(LMDBReader, SeekToFirst) {
+  LMDBReader reader;
+  bool ret;
+  ret = reader.Open(path_lmdb);
+  EXPECT_EQ(true, ret);
+
+  int cnt = reader.Count();
+  EXPECT_EQ(4, cnt);
+
+  std::string key, value;
+  reader.Read(&key, &value);
+  EXPECT_STREQ("1", key.c_str());
+  EXPECT_STREQ("This is the first test for lmdb io.", value.c_str());
+
+  reader.Read(&key, &value);
+  EXPECT_STREQ("2", key.c_str());
+  EXPECT_STREQ("This is the second test for lmdb io.", value.c_str());
+
+  reader.SeekToFirst();
+  reader.Read(&key, &value);
+  EXPECT_STREQ("1", key.c_str());
+  EXPECT_STREQ("This is the first test for lmdb io.", value.c_str());
+
+  reader.Read(&key, &value);
+  EXPECT_STREQ("2", key.c_str());
+  EXPECT_STREQ("This is the second test for lmdb io.", value.c_str());
+
+  reader.Read(&key, &value);
+  EXPECT_STREQ("3", key.c_str());
+  EXPECT_STREQ("This is the third test for lmdb io.", value.c_str());
+
+  reader.Read(&key, &value);
+  EXPECT_STREQ("4", key.c_str());
+  EXPECT_STREQ("This is the fourth test for lmdb io.", value.c_str());
+
+  reader.Close();
+
+  remove("./test_lmdb/data.mdb");
+  remove("./test_lmdb/lock.mdb");
+  remove("./test_lmdb");
+}
+#endif  // USE_LMDB
diff --git a/test/singa/test_logging.cc b/test/singa/test_logging.cc
new file mode 100644
index 0000000..619e7e8
--- /dev/null
+++ b/test/singa/test_logging.cc
@@ -0,0 +1,64 @@
+/************************************************************
+*
+* Licensed to the Apache Software Foundation (ASF) under one
+* or more contributor license agreements.  See the NOTICE file
+* distributed with this work for additional information
+* regarding copyright ownership.  The ASF licenses this file
+* to you under the Apache License, Version 2.0 (the
+* "License"); you may not use this file except in compliance
+* with the License.  You may obtain a copy of the License at
+* 
+*   http://www.apache.org/licenses/LICENSE-2.0
+* 
+* Unless required by applicable law or agreed to in writing,
+* software distributed under the License is distributed on an
+* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+* KIND, either express or implied.  See the License for the
+* specific language governing permissions and limitations
+* under the License.
+*
+*************************************************************/
+
+#include "gtest/gtest.h"
+#include "singa/utils/logging.h"
+
+TEST(Logging, InfoLogging) {
+  singa::InitLogging("");
+  int a = 3;
+  CHECK_EQ(a, 3);
+  LOG(INFO) << "test info logging";
+}
+
+TEST(Logging, WarningLogging) {
+  int a = 4;
+  CHECK_EQ(a, 4);
+  LOG(WARNING) << "test warning logging";
+}
+
+TEST(Logging, ErrorLogging) {
+  int a = 5;
+  CHECK_EQ(a, 5);
+  LOG(ERROR) << "test error logging";
+}
+
+TEST(Logging, FatalLogging) {
+  int a = 6;
+  CHECK_EQ(a, 6);
+  // LOG(FATAL) << "test fatal logging";
+}
+
+TEST(Logging, SetLogDestination) {
+  int a = 6;
+  singa::SetLogDestination(singa::WARNING, "/tmp/test.log");
+  CHECK_EQ(a, 6);
+  LOG(WARNING) << "test warning logging to file";
+}
+
+TEST(Logging, StderrLoggingLevel) {
+  int a = 6;
+  singa::SetStderrLogging(singa::WARNING);
+  CHECK_EQ(a, 6);
+  LOG(INFO) << "test info logging to stderr";
+  LOG(WARNING) << "test warning logging to stderr and file";
+  LOG(ERROR) << "test error logging to stderr and file";
+}
diff --git a/test/singa/test_lrn.cc b/test/singa/test_lrn.cc
new file mode 100644
index 0000000..454e1a9
--- /dev/null
+++ b/test/singa/test_lrn.cc
@@ -0,0 +1,116 @@
+/*********************************************************
+*
+* Licensed to the Apache Software Foundation (ASF) under one
+* or more contributor license agreements.  See the NOTICE file
+* distributed with this work for additional information
+* regarding copyright ownership.  The ASF licenses this file
+* to you under the Apache License, Version 2.0 (the
+* "License"); you may not use this file except in compliance
+* with the License.  You may obtain a copy of the License at
+*
+*   http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing,
+* software distributed under the License is distributed on an
+* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+* KIND, either express or implied.  See the License for the
+* specific language governing permissions and limitations
+* under the License.
+*
+************************************************************/
+
+#include "../src/model/layer/lrn.h"
+#include "gtest/gtest.h"
+
+using namespace singa;
+
+TEST(LRN, Setup) {
+  LRN lrn;
+  // EXPECT_EQ("LRN", lrn.layer_type());
+
+  LayerConf conf;
+  LRNConf *lrn_conf = conf.mutable_lrn_conf();
+  lrn_conf->set_k(1.0);
+  lrn_conf->set_local_size(3);
+  lrn_conf->set_alpha(0.1);
+  lrn_conf->set_beta(0.75);
+  lrn.Setup(Shape{1}, conf);
+
+  EXPECT_FLOAT_EQ(1.0, lrn.k());
+  EXPECT_EQ(3, lrn.local_size());
+  EXPECT_FLOAT_EQ(0.1, lrn.alpha());
+  EXPECT_FLOAT_EQ(0.75, lrn.beta());
+}
+
+TEST(LRN, Forward) {
+  LRN lrn;
+  const float x[] = {1, 2, 3, 4, 5, 6, 7, 8};
+  Tensor in(Shape{2, 4, 1, 1});
+  in.CopyDataFromHostPtr(x, 8);
+
+  singa::LayerConf conf;
+  singa::LRNConf *lrn_conf = conf.mutable_lrn_conf();
+  lrn_conf->set_k(1.0);
+  lrn_conf->set_local_size(3);
+  lrn_conf->set_alpha(0.1);
+  lrn_conf->set_beta(0.75);
+  lrn.Setup(Shape{4, 1, 1}, conf);
+
+  Tensor out = lrn.Forward(kTrain, in);
+  const float *outptr = out.data<float>();
+  const auto &shape = out.shape();
+  EXPECT_EQ(4u, shape.size());
+  EXPECT_EQ(2u, shape[0]);
+  EXPECT_EQ(4u, shape[1]);
+  EXPECT_EQ(1u, shape[2]);
+  EXPECT_EQ(1u, shape[3]);
+
+  EXPECT_NEAR(0.737787, outptr[0], 1e-6f);
+  EXPECT_NEAR(1.037221, outptr[1], 1e-6f);
+  EXPECT_NEAR(1.080992, outptr[2], 1e-6f);
+  EXPECT_NEAR(1.563179, outptr[3], 1e-6f);
+  EXPECT_NEAR(1.149545, outptr[4], 1e-6f);
+  EXPECT_NEAR(0.930604, outptr[5], 1e-6f);
+  EXPECT_NEAR(0.879124, outptr[6], 1e-6f);
+  EXPECT_NEAR(1.218038, outptr[7], 1e-6f);
+}
+
+TEST(LRN, Backward) {
+  LRN lrn;
+  const float x[] = {1, 2, 3, 4, 5, 6, 7, 8};
+  Tensor in(Shape{2, 4, 1, 1});
+  in.CopyDataFromHostPtr(x, 8);
+
+  singa::LayerConf conf;
+  singa::LRNConf *lrn_conf = conf.mutable_lrn_conf();
+  lrn_conf->set_k(1.0);
+  lrn_conf->set_local_size(3);
+  lrn_conf->set_alpha(0.1);
+  lrn_conf->set_beta(0.75);
+  lrn.Setup(Shape{4, 1, 1}, conf);
+
+  Tensor out = lrn.Forward(kTrain, in);
+
+  const float dy_arr[] = {8, 7, 6, 5, 4, 3, 2, 1};
+  Tensor dy(Shape{2, 4, 1, 1});
+  dy.CopyDataFromHostPtr(dy_arr, 8);
+
+  const auto ret = lrn.Backward(singa::kTrain, dy);
+  singa::Tensor dx = ret.first;
+  const float *dxptr = dx.data<float>();
+  const auto &shape = dx.shape();
+  EXPECT_EQ(4u, shape.size());
+  EXPECT_EQ(2u, shape[0]);
+  EXPECT_EQ(4u, shape[1]);
+  EXPECT_EQ(1u, shape[2]);
+  EXPECT_EQ(1u, shape[3]);
+
+  EXPECT_NEAR(4.858288752f, dxptr[0], 1e-6f);
+  EXPECT_NEAR(1.04332631f, dxptr[1], 1e-6f);
+  EXPECT_NEAR(-0.952648779f, dxptr[2], 1e-6f);
+  EXPECT_NEAR(-0.38373312f, dxptr[3], 1e-6f);
+  EXPECT_NEAR(0.259424615f, dxptr[4], 1e-6f);
+  EXPECT_NEAR(-0.426475393f, dxptr[5], 1e-6f);
+  EXPECT_NEAR(-0.213195118f, dxptr[6], 1e-6f);
+  EXPECT_NEAR(-0.099276183f, dxptr[7], 1e-6f);
+}
diff --git a/test/singa/test_memory.cc b/test/singa/test_memory.cc
new file mode 100644
index 0000000..33a3747
--- /dev/null
+++ b/test/singa/test_memory.cc
@@ -0,0 +1,99 @@
+/************************************************************
+*
+* Licensed to the Apache Software Foundation (ASF) under one
+* or more contributor license agreements.  See the NOTICE file
+* distributed with this work for additional information
+* regarding copyright ownership.  The ASF licenses this file
+* to you under the Apache License, Version 2.0 (the
+* "License"); you may not use this file except in compliance
+* with the License.  You may obtain a copy of the License at
+*
+*   http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing,
+* software distributed under the License is distributed on an
+* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+* KIND, either express or implied.  See the License for the
+* specific language governing permissions and limitations
+* under the License.
+*
+*************************************************************/
+
+#include "gtest/gtest.h"
+#include "singa/utils/logging.h"
+#include "singa/core/memory.h"
+#include "singa/singa_config.h"
+#include "singa/utils/timer.h"
+#include "singa/utils/cuda_utils.h"
+
+#ifdef USE_CUDA
+/*
+TEST(CnmemPool, PoolInitAll) {
+  singa::CnMemPool pool(1);
+  int nDevices;
+  cudaGetDeviceCount(&nDevices);
+  CHECK_GE(nDevices, 1);
+}
+
+TEST(CnmemPool, UsePool) {
+  singa::CnMemPool pool;
+  pool.InitPool();
+  int numOfTests = 10;
+  int numOfWriteVsRead = 3;
+  int allocSize = 32;
+  for (int i = 0; i < numOfTests; i++) {
+    int** memPtrs = new int* [numOfWriteVsRead];
+    for (int j = 0; j < numOfWriteVsRead; j++) {
+      pool.Malloc((void**)(&memPtrs[j]), allocSize);
+    }
+    pool.Free(memPtrs[0]);
+    delete[] memPtrs;
+  }
+}
+TEST(CudaMemPool, UsePool) {
+  singa::CudaMemPool pool;
+  int numOfTests = 10;
+  int numOfWriteVsRead = 3;
+  int allocSize = 32;
+  for (int i = 0; i < numOfTests; i++) {
+    int** memPtrs = new int* [numOfWriteVsRead];
+    for (int j = 0; j < numOfWriteVsRead; j++) {
+      pool.Malloc((void**)(&memPtrs[j]), allocSize);
+    }
+    pool.Free(memPtrs[0]);
+    delete[] memPtrs;
+  }
+}
+*/
+
+TEST(MemPool, CompareCudaCnmem) {
+  singa::CudaMemPool cudaPool;
+  singa::CnMemPool cnPool;
+
+  int numOfTests = 5000;
+  int allocSize = 32;
+
+  singa::DeviceMemPool* pool = NULL;
+  pool = &cnPool;
+
+  CUDA_CHECK(cudaSetDevice(0));
+  singa::Timer tick;
+  for (int i = 0; i < numOfTests; i++) {
+    int* memPtrs = NULL;
+    pool->Malloc((void**)&memPtrs, allocSize);
+    pool->Free(memPtrs);
+  }
+  tick.Tick();
+  int cn_time = tick.Elapsed();
+
+  pool = &cudaPool;
+  for (int i = 0; i < numOfTests; i++) {
+    int* memPtrs = NULL;
+    pool->Malloc((void**)&memPtrs, allocSize);
+    pool->Free(memPtrs);
+  }
+  tick.Tick();
+  int cuda_time = tick.Elapsed();
+  EXPECT_GE(cuda_time, cn_time);
+}
+#endif  // USE_CUDA
diff --git a/test/singa/test_mse.cc b/test/singa/test_mse.cc
new file mode 100644
index 0000000..a0ab1a1
--- /dev/null
+++ b/test/singa/test_mse.cc
@@ -0,0 +1,109 @@
+/************************************************************
+*
+* Licensed to the Apache Software Foundation (ASF) under one
+* or more contributor license agreements.  See the NOTICE file
+* distributed with this work for additional information
+* regarding copyright ownership.  The ASF licenses this file
+* to you under the Apache License, Version 2.0 (the
+* "License"); you may not use this file except in compliance
+* with the License.  You may obtain a copy of the License at
+*
+*   http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing,
+* software distributed under the License is distributed on an
+* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+* KIND, either express or implied.  See the License for the
+* specific language governing permissions and limitations
+* under the License.
+*
+*************************************************************/
+
+#include "gtest/gtest.h"
+#include "singa/core/tensor.h"
+#include "singa/core/device.h"
+#include "singa/model/loss.h"
+
+using singa::Tensor;
+class TestMSE : public ::testing::Test {
+ protected:
+  virtual void SetUp() {
+    p.Reshape(singa::Shape{2, 3});
+    t.Reshape(singa::Shape{2, 3});
+    p.CopyDataFromHostPtr(pdat, sizeof(pdat) / sizeof(float));
+    t.CopyDataFromHostPtr(tdat, sizeof(pdat) / sizeof(float));
+  }
+  const float pdat[6] = {0.1, 1.1, 2.1, 0.3, 2.2, 1.8};
+  const float tdat[6] = {0.1, 1.1, 2.0, 0.3, 2.2, 1.8};
+
+  singa::Tensor p, t;
+};
+
+#ifdef USE_CBLAS
+TEST_F(TestMSE, CppForward) {
+  singa::MSE mse;
+  const Tensor& loss = mse.Forward(singa::kEval, p, t);
+  auto ldat = loss.data<float>();
+
+  for (size_t i = 0, k = 0; i < loss.Size(); i++) {
+    float l = 0.f;
+    for (size_t j = 0; j < p.Size() / loss.Size(); j++) {
+      l += (pdat[k] - tdat[k]) * (pdat[k] - tdat[k]);
+      k++;
+    }
+    EXPECT_FLOAT_EQ(ldat[i], 0.5 * l);
+  }
+}
+
+TEST_F(TestMSE, CppBackward) {
+  singa::MSE mse;
+  mse.Forward(singa::kTrain, p, t);
+  const Tensor& grad = mse.Backward();
+
+  auto gdat = grad.data<float>();
+
+  for (size_t i = 0; i < grad.Size(); i++)
+    EXPECT_FLOAT_EQ(gdat[i], (1.0f / p.shape().at(0)) * (pdat[i] - tdat[i]));
+}
+#endif
+#ifdef USE_CUDA
+TEST_F(TestMSE, CudaForward) {
+  singa::MSE* mse = new singa::MSE();
+  auto dev = std::make_shared<singa::CudaGPU>();
+  p.ToDevice(dev);
+  t.ToDevice(dev);
+  Tensor loss = mse->Forward(singa::kEval, p, t);
+
+  loss.ToHost();
+  auto ldat = loss.data<float>();
+
+  for (size_t i = 0, k = 0; i < loss.Size(); i++) {
+    float l = 0.f;
+    for (size_t j = 0; j < p.Size() / loss.Size(); j++) {
+      l += (pdat[k] - tdat[k]) * (pdat[k] - tdat[k]);
+      k++;
+    }
+    EXPECT_FLOAT_EQ(ldat[i], 0.5 * l);
+  }
+	p.ToHost();
+	t.ToHost();
+  delete mse;
+}
+
+TEST_F(TestMSE, CudaBackward) {
+  singa::MSE mse;
+  auto dev = std::make_shared<singa::CudaGPU>();
+  p.ToDevice(dev);
+  t.ToDevice(dev);
+  mse.Forward(singa::kTrain, p, t);
+  Tensor grad = mse.Backward();
+  grad.ToHost();
+  auto gdat = grad.data<float>();
+
+  for (size_t i = 0; i < grad.Size(); i++)
+    EXPECT_FLOAT_EQ(gdat[i], (1.0f / p.shape().at(0)) * (pdat[i] - tdat[i]));
+	p.ToHost();
+	t.ToHost();
+
+}
+#endif
diff --git a/test/singa/test_nesterov.cc b/test/singa/test_nesterov.cc
new file mode 100644
index 0000000..7c76784
--- /dev/null
+++ b/test/singa/test_nesterov.cc
@@ -0,0 +1,101 @@
+/************************************************************
+*
+* Licensed to the Apache Software Foundation (ASF) under one
+* or more contributor license agreements.  See the NOTICE file
+* distributed with this work for additional information
+* regarding copyright ownership.  The ASF licenses this file
+* to you under the Apache License, Version 2.0 (the
+* "License"); you may not use this file except in compliance
+* with the License.  You may obtain a copy of the License at
+*
+*   http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing,
+* software distributed under the License is distributed on an
+* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+* KIND, either express or implied.  See the License for the
+* specific language governing permissions and limitations
+* under the License.
+*
+*************************************************************/
+
+#include "gtest/gtest.h"
+#include "singa/model/optimizer.h"
+#include "singa/singa_config.h"
+
+TEST(Nesterov, ApplyCPU) {
+  singa::Nesterov nesterov;
+  float lr = 0.1f;
+  auto func = [](int step) { return step <= 5 ? 0.5f : 0.9f; };
+  nesterov.SetMomentumGenerator(func);
+  const float v[4] = {0.1, 0.2, 0.3, 0.4};
+  const float g[4] = {0.01, 0.02, 0.03, 0.04};
+
+  singa::Tensor value(singa::Shape{4}), grad(singa::Shape{4});
+  value.CopyDataFromHostPtr(v, 4);
+  grad.CopyDataFromHostPtr(g, 4);
+
+  nesterov.Apply(0, lr, "xx", grad, value);
+
+  singa::Tensor v1 = value.Clone();
+  const float* newv1 = v1.data<float>();
+  float history[4], tmp[4];
+  for (int i = 0; i < 4; ++i) {
+    history[i] = g[i] * lr;
+    tmp[i] = history[i] * (1 + func(0));
+  }
+  for (int i = 0; i < 4; ++i) EXPECT_FLOAT_EQ(newv1[i], v[i] - tmp[i]);
+
+  grad.CopyDataFromHostPtr(g, 4);
+  nesterov.Apply(1, lr, "xx", grad, value);
+  singa::Tensor v2 = value.Clone();
+  const float* newv2 = v2.data<float>();
+  for (int i = 0; i < 4; ++i) {
+    tmp[i] = history[i];
+    history[i] = history[i] * func(1) + g[i] * lr;
+    tmp[i] = history[i] * (1 + func(1)) - tmp[i] * func(1);
+  }
+
+  for (int i = 0; i < 4; ++i) EXPECT_FLOAT_EQ(newv2[i], newv1[i] - tmp[i]);
+}
+
+#ifdef USE_CUDA
+TEST(Nesterov, ApplyCUDA) {
+  singa::Nesterov nesterov;
+  float lr = 0.1f;
+  auto func = [](int step) { return step <= 5 ? 0.5f : 0.9f; };
+  nesterov.SetMomentumGenerator(func);
+  const float v[4] = {0.1, 0.2, 0.3, 0.4};
+  const float g[4] = {0.01, 0.02, 0.03, 0.04};
+
+  auto dev = std::make_shared<singa::CudaGPU>();
+  singa::Tensor value(singa::Shape{4}, dev), grad(singa::Shape{4}, dev);
+  value.CopyDataFromHostPtr(v, 4);
+  grad.CopyDataFromHostPtr(g, 4);
+
+  nesterov.Apply(0, lr, "xx", grad, value);
+
+  singa::Tensor v1 = value.Clone();
+  v1.ToHost();
+  const float* newv1 = v1.data<float>();
+  float history[4], tmp[4];
+  for (int i = 0; i < 4; ++i) {
+    history[i] = g[i] * lr;
+    tmp[i] = history[i] * (1 + func(0));
+  }
+  for (int i = 0; i < 4; ++i) EXPECT_FLOAT_EQ(newv1[i], v[i] - tmp[i]);
+
+  grad.CopyDataFromHostPtr(g, 4);
+  nesterov.Apply(1, lr, "xx", grad, value);
+  singa::Tensor v2 = value.Clone();
+  v2.ToHost();
+  const float* newv2 = v2.data<float>();
+  for (int i = 0; i < 4; ++i) {
+    tmp[i] = history[i];
+    history[i] = history[i] * func(1) + g[i] * lr;
+    tmp[i] = history[i] * (1 + func(1)) - tmp[i] * func(1);
+  }
+
+  for (int i = 0; i < 4; ++i) EXPECT_FLOAT_EQ(newv2[i], newv1[i] - tmp[i]);
+}
+#endif
diff --git a/test/singa/test_opencl.cc b/test/singa/test_opencl.cc
new file mode 100644
index 0000000..3ce1889
--- /dev/null
+++ b/test/singa/test_opencl.cc
@@ -0,0 +1,629 @@
+/************************************************************
+*
+* Licensed to the Apache Software Foundation (ASF) under one
+* or more contributor license agreements.  See the NOTICE file
+* distributed with this work for additional information
+* regarding copyright ownership.  The ASF licenses this file
+* to you under the Apache License, Version 2.0 (the
+* "License"); you may not use this file except in compliance
+* with the License.  You may obtain a copy of the License at
+*
+*   http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing,
+* software distributed under the License is distributed on an
+* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+* KIND, either express or implied.  See the License for the
+* specific language governing permissions and limitations
+* under the License.
+*
+*************************************************************/
+
+#include "gtest/gtest.h"
+#include "singa/core/device.h"
+#include "singa/core/tensor.h"
+#include "singa/proto/core.pb.h"
+using singa::CppCPU;
+using singa::Block;
+using singa::Shape;
+using singa::Tensor;
+
+#ifdef USE_OPENCL
+using singa::OpenclDevice;
+class OpenCL_TensorMath : public ::testing::Test {
+protected:
+
+  OpenCL_TensorMath() {
+    for (int i = 0; i < 4; i++) {
+      float4[i] = (float)i;
+      float4zero[i] = 0.0f;
+    }
+
+    for (int i = 0; i < 16; i++) {
+      float16[i] = (float)i;
+      float16zero[i] = 0.0f;
+    }
+
+    auto ocl_dev = std::make_shared<OpenclDevice>();
+
+    tf4in = Tensor(Shape{1, 4}, ocl_dev);
+    tf4in.CopyDataFromHostPtr(float4, 4);
+
+    tf4zin = Tensor(Shape{1, 4}, ocl_dev);
+    tf4zin.CopyDataFromHostPtr(float4zero, 4);
+
+    tf16in = Tensor(Shape{4, 4}, ocl_dev);
+    tf16in.CopyDataFromHostPtr(float16, 16);
+
+    tf16zin = Tensor(Shape{4, 4}, ocl_dev);
+    tf16zin.CopyDataFromHostPtr(float16zero, 16);
+
+    float empty[10000] = {};
+    empty10k = Tensor(Shape{10000}, ocl_dev);
+    empty10k.CopyDataFromHostPtr(empty, 10000);
+  }
+
+  float float4[4];
+  float float4zero[4];
+  float float16[16];
+  float float16zero[16];
+
+  Tensor tf4in, tf16in;
+  Tensor tf4zin, tf16zin;
+  Tensor empty10k;
+};
+
+
+// Makes a float array and fills it with increasing values from 0.
+float* MakeMatrix(const int size) {
+  float* mat = new float[size];
+  for (int i = 0; i < size; i++)
+    mat[i] = i;
+  return mat;
+}
+
+
+TEST(OpenclDevice, Constructor) {
+  OpenclDevice dev;
+  EXPECT_EQ(0, dev.id());
+}
+
+
+TEST(OpenclDevice, MemoryAllocFree) {
+  OpenclDevice dev;
+  Block* b = dev.NewBlock(4);
+  EXPECT_NE(nullptr, b);
+  EXPECT_EQ(4u, b->size());
+  dev.FreeBlock(b);
+}
+
+// Tests for integrity of one round of data transfer to an OpenCL device and back.
+TEST(OpenclDevice, CopyDataToFrom) {
+  OpenclDevice dev;
+  CppCPU host;
+
+  Block* a = host.NewBlock(4);
+  Block* b = dev.NewBlock(4);
+  Block* c = host.NewBlock(4);
+
+  // Allocate the Block object on the host.
+  char s[] = {'a', 'b', 'c', 'x'};
+  host.CopyDataFromHostPtr(a, s, 4);
+
+  // Copy back and forth.
+  dev.CopyDataToFrom(b, a, 4, singa::kHostToDevice);
+  dev.CopyDataToFrom(c, b, 4, singa::kDeviceToHost);
+
+  const char* astr = static_cast<const char*>(c->data());
+  EXPECT_EQ('a', astr[0]);
+  EXPECT_EQ('b', astr[1]);
+  EXPECT_EQ('x', astr[3]);
+}
+
+
+TEST(OpenclDevice, DuplicateDataOnDevice) {
+  OpenclDevice dev;
+  CppCPU host;
+
+  Block* a = host.NewBlock(4);
+  Block* b = dev.NewBlock(4);
+  Block* c = dev.NewBlock(4);
+  Block* d = host.NewBlock(4);
+
+  // Allocate the Block object on the host.
+  char s[] = {'a', 'b', 'c', 'x'};
+  host.CopyDataFromHostPtr(a, s, 4);
+
+  // Copy to device and duplicate.
+  dev.CopyDataToFrom(b, a, 4, singa::kHostToDevice);
+  dev.CopyDataToFrom(c, b, 4, singa::kDeviceToDevice);
+  dev.CopyDataToFrom(d, c, 4, singa::kDeviceToHost);
+
+  const char* astr = static_cast<const char*>(d->data());
+  EXPECT_EQ('a', astr[0]);
+  EXPECT_EQ('b', astr[1]);
+  EXPECT_EQ('x', astr[3]);
+}
+
+// Tensor tests, uses OpenCL_TensorMath class defined above.
+
+TEST_F(OpenCL_TensorMath, CopyDataToDevice) {
+  tf4in.ToHost();
+  const float* out = tf4in.data<float>();
+
+  EXPECT_EQ(1.0f, out[1]);
+  EXPECT_EQ(3.0f, out[3]);
+}
+
+
+TEST_F(OpenCL_TensorMath, MemberAbs) {
+  tf4in = Abs(tf4in);
+
+  tf4in.ToHost();
+  const float* out = tf4in.data<float>();
+
+  EXPECT_EQ(0.0f, out[0]);
+  EXPECT_EQ(1.0f, out[1]);
+  EXPECT_EQ(2.0f, out[2]);
+  EXPECT_EQ(3.0f, out[3]);
+}
+
+
+TEST_F(OpenCL_TensorMath, MemberExp) {
+  tf4in = Exp(tf4in);
+
+  tf4in.ToHost();
+  const float* out = tf4in.data<float>();
+
+  EXPECT_NEAR(exp(0.0f), out[0], 1e-5);
+  EXPECT_NEAR(exp(1.0f), out[1], 1e-5);
+  EXPECT_NEAR(exp(2.0f), out[2], 1e-5);
+  EXPECT_NEAR(exp(3.0f), out[3], 1e-5);
+}
+
+
+TEST_F(OpenCL_TensorMath, MemberLog) {
+  tf4in = Log(tf4in);
+
+  tf4in.ToHost();
+  const float* out = tf4in.data<float>();
+
+//  EXPECT_NEAR(log(0.0f), out[0], 1e-5); // Evaluates to neg infinity.
+  EXPECT_NEAR(log(1.0f), out[1], 1e-5);
+  EXPECT_NEAR(log(2.0f), out[2], 1e-5);
+  EXPECT_NEAR(log(3.0f), out[3], 1e-5);
+}
+
+
+TEST_F(OpenCL_TensorMath, MemberReLU) {
+  tf4in -= 2.0f;
+  Tensor result = ReLU(tf4in);
+
+  result.ToHost();
+  const float* out = result.data<float>();
+
+  EXPECT_NEAR(0.0f, out[0], 1e-5);
+  EXPECT_NEAR(0.0f, out[1], 1e-5);
+  EXPECT_NEAR(0.0f, out[2], 1e-5);
+  EXPECT_NEAR(1.0f, out[3], 1e-5);
+}
+
+
+TEST_F(OpenCL_TensorMath, MemberSigmoid) {
+  tf4in = Sigmoid(tf4in);
+
+  tf4in.ToHost();
+  const float* out = tf4in.data<float>();
+
+  EXPECT_NEAR(1.0f / (1.0f + exp(-0.0f)), out[0], 1e-5);
+  EXPECT_NEAR(1.0f / (1.0f + exp(-1.0f)), out[1], 1e-5);
+  EXPECT_NEAR(1.0f / (1.0f + exp(-2.0f)), out[2], 1e-5);
+}
+
+
+TEST_F(OpenCL_TensorMath, MemberSign) {
+  tf4in -= 1.0f;
+
+  tf4in.ToHost();
+  const float* out = tf4in.data<float>();
+
+  EXPECT_NEAR(-1.0f, out[0], 1e-5);
+  EXPECT_NEAR(0.0f, out[1], 1e-5);
+  EXPECT_NEAR(1.0f, out[2], 1e-5);
+  EXPECT_NEAR(2.0f, out[3], 1e-5);
+}
+
+
+TEST_F(OpenCL_TensorMath, MemberSqrt) {
+  tf4in = Sqrt(tf4in);
+
+  tf4in.ToHost();
+  const float* out = tf4in.data<float>();
+
+  EXPECT_NEAR(0.0f, out[0], 1e-5);
+  EXPECT_NEAR(1.0f, out[1], 1e-5);
+  EXPECT_NEAR(sqrt(2.0f), out[2], 1e-5);
+  EXPECT_NEAR(sqrt(3.0f), out[3], 1e-5);
+}
+
+
+TEST_F(OpenCL_TensorMath, MemberSquare) {
+  tf4in = Square(tf4in);
+
+  tf4in.ToHost();
+  const float* out = tf4in.data<float>();
+
+  EXPECT_NEAR(0.0f, out[0], 1e-5);
+  EXPECT_NEAR(1.0f, out[1], 1e-5);
+  EXPECT_NEAR(4.0f, out[2], 1e-5);
+  EXPECT_NEAR(9.0f, out[3], 1e-5);
+}
+
+
+TEST_F(OpenCL_TensorMath, MemberTanh) {
+  tf4in = Tanh(tf4in);
+
+  tf4in.ToHost();
+  const float* out = tf4in.data<float>();
+
+  EXPECT_NEAR(0.0f, out[0], 1e-5);
+  EXPECT_NEAR(tanh(1.0f), out[1], 1e-5);
+  EXPECT_NEAR(tanh(2.0f), out[2], 1e-5);
+  EXPECT_NEAR(tanh(3.0f), out[3], 1e-5);
+}
+
+
+TEST_F(OpenCL_TensorMath, Sum) {
+  Tensor result = Sum(tf4in, 0);
+
+  result.ToHost();
+  const float* out = result.data<float>();
+
+  EXPECT_NEAR(0.0f, out[0], 1e-5);
+  EXPECT_NEAR(1.0f, out[1], 1e-5);
+  EXPECT_NEAR(2.0f, out[2], 1e-5);
+  EXPECT_NEAR(3.0f, out[3], 1e-5);
+}
+
+TEST_F(OpenCL_TensorMath, MemberLT) {
+  Tensor result = tf4in < 2.0f;
+
+  result.ToHost();
+  const float* out = result.data<float>();
+
+  EXPECT_FLOAT_EQ(1.0f, out[0]);
+  EXPECT_FLOAT_EQ(1.0f, out[1]);
+  EXPECT_FLOAT_EQ(0.0f, out[2]);
+  EXPECT_FLOAT_EQ(0.0f, out[3]);
+}
+
+
+TEST_F(OpenCL_TensorMath, MemberLE) {
+  Tensor result = tf4in <= 2.0f;
+
+  result.ToHost();
+  const float* out = result.data<float>();
+
+  EXPECT_FLOAT_EQ(1.0f, out[0]);
+  EXPECT_FLOAT_EQ(1.0f, out[1]);
+  EXPECT_FLOAT_EQ(1.0f, out[2]);
+  EXPECT_FLOAT_EQ(0.0f, out[3]);
+}
+
+
+TEST_F(OpenCL_TensorMath, MemberGT) {
+  Tensor result = tf4in > 2.0f;
+
+  result.ToHost();
+  const float* out = result.data<float>();
+
+  EXPECT_FLOAT_EQ(0.0f, out[0]);
+  EXPECT_FLOAT_EQ(0.0f, out[1]);
+  EXPECT_FLOAT_EQ(0.0f, out[2]);
+  EXPECT_FLOAT_EQ(1.0f, out[3]);
+}
+
+
+TEST_F(OpenCL_TensorMath, MemberGE) {
+  Tensor result = tf4in >= 2.0f;
+
+  result.ToHost();
+  const float* out = result.data<float>();
+
+  EXPECT_FLOAT_EQ(0.0f, out[0]);
+  EXPECT_FLOAT_EQ(0.0f, out[1]);
+  EXPECT_FLOAT_EQ(1.0f, out[2]);
+  EXPECT_FLOAT_EQ(1.0f, out[3]);
+}
+
+
+TEST_F(OpenCL_TensorMath, MemberPow) {
+  Tensor result = Pow(tf4in, 2.0f);
+
+  result.ToHost();
+  const float* out = result.data<float>();
+
+  EXPECT_FLOAT_EQ(0.0f, out[0]);
+  EXPECT_FLOAT_EQ(1.0f, out[1]);
+  EXPECT_FLOAT_EQ(4.0f, out[2]);
+  EXPECT_FLOAT_EQ(9.0f, out[3]);
+
+  result = Pow(tf4in, tf4in);
+
+  result.ToHost();
+  const float* out1 = result.data<float>();
+
+  EXPECT_FLOAT_EQ(1.0f, out1[0]); // 0 ^ 0 is 1, apparently.
+  EXPECT_FLOAT_EQ(1.0f, out1[1]);
+  EXPECT_FLOAT_EQ(4.0f, out1[2]);
+  EXPECT_FLOAT_EQ(27.0f, out1[3]);
+}
+
+
+TEST_F(OpenCL_TensorMath, MemberSub) {
+  Tensor result = tf4in - tf4zin;
+
+  result.ToHost();
+  const float* out = result.data<float>();
+
+  EXPECT_FLOAT_EQ(0.0f, out[0]);
+  EXPECT_FLOAT_EQ(1.0f, out[1]);
+  EXPECT_FLOAT_EQ(2.0f, out[2]);
+  EXPECT_FLOAT_EQ(3.0f, out[3]);
+
+  result = tf4in - 0.0f;
+
+  result.ToHost();
+  const float* out1 = result.data<float>();
+
+  EXPECT_FLOAT_EQ(0.0f, out1[0]);
+  EXPECT_FLOAT_EQ(1.0f, out1[1]);
+  EXPECT_FLOAT_EQ(2.0f, out1[2]);
+  EXPECT_FLOAT_EQ(3.0f, out1[3]);
+}
+
+
+TEST_F(OpenCL_TensorMath, MemberEltwiseMult) {
+  Tensor result = tf4in * tf4zin;
+
+  result.ToHost();
+  const float* out = result.data<float>();
+
+  EXPECT_FLOAT_EQ(0.0f, out[0]);
+  EXPECT_FLOAT_EQ(0.0f, out[1]);
+  EXPECT_FLOAT_EQ(0.0f, out[2]);
+  EXPECT_FLOAT_EQ(0.0f, out[3]);
+
+  result = tf4in * 10.0f;
+
+  result.ToHost();
+  const float* out1 = result.data<float>();
+
+  EXPECT_FLOAT_EQ(0.0f, out1[0]);
+  EXPECT_FLOAT_EQ(10.0f, out1[1]);
+  EXPECT_FLOAT_EQ(20.0f, out1[2]);
+  EXPECT_FLOAT_EQ(30.0f, out1[3]);
+}
+
+
+TEST_F(OpenCL_TensorMath, MemberDiv) {
+  Tensor result = tf4in / tf4in;
+
+  result.ToHost();
+  const float* out = result.data<float>();
+
+//  EXPECT_FLOAT_EQ(0.0f, out[0]); // Divide by zero.
+  EXPECT_FLOAT_EQ(1.0f, out[1]);
+  EXPECT_FLOAT_EQ(1.0f, out[2]);
+  EXPECT_FLOAT_EQ(1.0f, out[3]);
+
+  result = tf4in / 10.0f;
+
+  result.ToHost();
+  const float* out1 = result.data<float>();
+
+  EXPECT_FLOAT_EQ(0.0f, out1[0]);
+  EXPECT_FLOAT_EQ(0.1f, out1[1]);
+  EXPECT_FLOAT_EQ(0.2f, out1[2]);
+  EXPECT_FLOAT_EQ(0.3f, out1[3]);
+
+  result = Div(10.0f, tf4in);
+
+  result.ToHost();
+  const float* out2 = result.data<float>();
+
+//  EXPECT_FLOAT_EQ(0.0f, out[0]); // Divide by 0.
+  EXPECT_FLOAT_EQ(10.0f, out2[1]);
+  EXPECT_FLOAT_EQ(5.0f, out2[2]);
+  EXPECT_NEAR((10.0f / 3.0f), out2[3], 1e-5);
+}
+
+// **************************************
+// Random functions
+// **************************************
+
+TEST_F(OpenCL_TensorMath, Bernoulli) {
+  const float p = 0.3f;
+
+  Bernoulli(p, &empty10k);
+
+  empty10k.ToHost();
+  const float* out = empty10k.data<float>();
+
+  float sum = 0.0f;
+  for (int i = 0; i < 10000; i++) sum += out[i];
+
+  float mean = sum / 10000;
+
+  EXPECT_NEAR(mean, p, 1e-2);
+
+  sum = 0.0f;
+  for (int i = 0; i < 10000; i++) sum += (out[i] - mean) * (out[i] - mean);
+  float variance = sum / 9999;
+
+  EXPECT_NEAR(variance, p * (1 - p), 1e-2);
+}
+
+
+TEST_F(OpenCL_TensorMath, Gaussian) {
+  Gaussian(0.0f, 1.0f, &empty10k);
+
+  empty10k.ToHost();
+  const float* out = empty10k.data<float>();
+
+  float sum = 0.0f;
+  for (int i = 0; i < 10000; i++) sum += out[i];
+  float mean = sum / 10000;
+
+  EXPECT_NEAR(mean, 0.0f, 1e-2);
+
+  sum = 0.0f;
+  for (int i = 0; i < 10000; i++) sum += (out[i] - mean) * (out[i] - mean);
+  float variance = sum / 9999;
+
+  EXPECT_NEAR(variance, 1.0f, 1e-2);
+}
+
+
+TEST_F(OpenCL_TensorMath, Uniform) {
+  Uniform(0.1f, 0.2f, &empty10k);
+
+  empty10k.ToHost();
+  const float* out = empty10k.data<float>();
+
+  float sum = 0.0f;
+  for (int i = 0; i < 10000; i++) sum += out[i];
+  float mean = sum / 10000;
+
+  EXPECT_NEAR(mean, 0.15f, 1e-2);
+
+  sum = 0.0f;
+  for (int i = 0; i < 10000; i++) sum += (out[i] - mean) * (out[i] - mean);
+  float variance = sum / 9999;
+
+  EXPECT_NEAR(variance, 0.01f, 1e-2);
+}
+
+// *********************************************************
+// BLAS functions, ref to http://docs.nvidia.com/cuda/cublas
+// *********************************************************
+
+
+TEST_F(OpenCL_TensorMath, EltwiseAdd) {
+  Tensor result = tf4in + tf4in;
+
+  result.ToHost();
+  const float* out = result.data<float>();
+
+  EXPECT_EQ(0.0f, out[0]);
+  EXPECT_EQ(2.0f, out[1]);
+  EXPECT_EQ(4.0f, out[2]);
+  EXPECT_EQ(6.0f, out[3]);
+
+  result = tf4in + tf4zin;
+
+  result.ToHost();
+  const float* out1 = result.data<float>();
+
+  EXPECT_EQ(0.0f, out1[0]);
+  EXPECT_EQ(1.0f, out1[1]);
+  EXPECT_EQ(2.0f, out1[2]);
+  EXPECT_EQ(3.0f, out1[3]);
+
+  result = Tensor(tf4in.shape(), tf4in.device(), tf4in.data_type());
+  Add(tf4in, tf4in, &result);
+
+  result.ToHost();
+  const float* out2 = result.data<float>();
+
+  EXPECT_EQ(0.0f, out2[0]);
+  EXPECT_EQ(2.0f, out2[1]);
+  EXPECT_EQ(4.0f, out2[2]);
+  EXPECT_EQ(6.0f, out2[3]);
+
+  result = tf4in + 1.0f;
+
+  result.ToHost();
+  const float* out3 = result.data<float>();
+
+  EXPECT_EQ(1.0f, out3[0]);
+  EXPECT_EQ(2.0f, out3[1]);
+  EXPECT_EQ(3.0f, out3[2]);
+  EXPECT_EQ(4.0f, out3[3]);
+}
+
+
+TEST_F(OpenCL_TensorMath, SetValue) {
+  const float one_third = 1.0f / 3.0f;
+  empty10k.SetValue(one_third);
+
+  empty10k.ToHost();
+  const float* out = empty10k.data<float>();
+
+  EXPECT_EQ(one_third, out[0]);
+  EXPECT_EQ(one_third, out[1]);
+  EXPECT_EQ(one_third, out[1024]);
+  EXPECT_EQ(one_third, out[4096]);
+  EXPECT_EQ(one_third, out[9998]);
+  EXPECT_EQ(one_third, out[9999]);
+}
+
+
+TEST_F(OpenCL_TensorMath, Axpy) {
+  Axpy(10.0f, tf4in, &tf4in);
+
+  tf4in.ToHost();
+  const float* out = tf4in.data<float>();
+
+  EXPECT_EQ(0.0f, out[0]);  // 0 * 10 + 0 = 0
+  EXPECT_EQ(11.0f, out[1]); // 1 * 10 + 1 = 11
+  EXPECT_EQ(22.0f, out[2]); // 2 * 10 + 2 = 22
+  EXPECT_EQ(33.0f, out[3]); // 3 * 10 + 3 = 33
+}
+
+TEST_F(OpenCL_TensorMath, Mult) {
+  Tensor result = Mult(tf4in, tf4zin.T()); // Multiply with zero.
+
+  result.ToHost();
+  const float* out = result.data<float>();
+
+  EXPECT_EQ(0.0f, out[0]); // 1x4 * 4x1 = 1x1.
+
+  result = Mult(tf4in, tf4in.T());
+
+  result.ToHost();
+  const float* out0 = result.data<float>();
+
+  EXPECT_EQ(14.0f, out0[0]); // 1x4 * 4x1 = 1x1.
+
+  tf16zin.SetValue(10.0f); // Multiply with 10.0.
+  result = Mult(tf16in, tf16zin); // 4x4 * 4x4 = 4x4.
+
+  result.ToHost();
+  const float* out1 = result.data<float>();
+  EXPECT_EQ(240.0f, out1[0]);
+  EXPECT_EQ(280.0f, out1[1]);
+  EXPECT_EQ(320.0f, out1[2]);
+  EXPECT_EQ(360.0f, out1[3]);
+
+  EXPECT_EQ(240.0f, out1[4]);
+  EXPECT_EQ(280.0f, out1[5]);
+  EXPECT_EQ(320.0f, out1[6]);
+  EXPECT_EQ(360.0f, out1[7]);
+
+  EXPECT_EQ(240.0f, out1[8]);
+  EXPECT_EQ(280.0f, out1[9]);
+  EXPECT_EQ(320.0f, out1[10]);
+  EXPECT_EQ(360.0f, out1[11]);
+
+  EXPECT_EQ(240.0f, out1[12]);
+  EXPECT_EQ(280.0f, out1[13]);
+  EXPECT_EQ(320.0f, out1[14]);
+  EXPECT_EQ(360.0f, out1[15]);
+}
+
+
+
+// TODO: ComputeCrossEntropy, SoftmaxCrossEntropy
+//
+#endif  // USE_OPENCL
diff --git a/test/singa/test_platform.cc b/test/singa/test_platform.cc
new file mode 100644
index 0000000..f50c978
--- /dev/null
+++ b/test/singa/test_platform.cc
@@ -0,0 +1,97 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+#include "gtest/gtest.h"
+#include "singa/core/device.h"
+#include "singa/core/tensor.h"
+
+#ifdef USE_CUDA
+using singa::Platform;
+TEST(Platform, NumGPUs) {
+  int n = Platform::GetNumGPUs();
+  EXPECT_GE(n, 0);
+  EXPECT_LE(n, 32);
+}
+
+TEST(Platform, QueryMem) {
+  size_t n = Platform::GetNumGPUs();
+  auto ids = Platform::GetGPUIDs();
+  EXPECT_EQ(ids.size(), n);
+  auto mem = Platform::GetGPUMemSize();
+  for (auto x : mem)
+    EXPECT_GT(x.second, x.first);
+}
+
+TEST(Platform, CreateDevice) {
+  auto dev = Platform::CreateCudaGPUs(1).at(0);
+  size_t size[] = { 128, 256, 3, 24 };
+  {
+    auto ptr = dev->NewBlock(size[0]);
+    auto allocated = dev->GetAllocatedMem();
+    EXPECT_LE(size[0], allocated);
+    dev->FreeBlock(ptr);
+    allocated = dev->GetAllocatedMem();
+  }
+  {
+    auto ptr0 = dev->NewBlock(size[0]);
+    auto ptr1 = dev->NewBlock(size[1]);
+    auto ptr2 = dev->NewBlock(size[2]);
+    auto allocated = dev->GetAllocatedMem();
+    EXPECT_LE(size[0] + size[1] + size[2], allocated);
+    auto ptr3 = dev->NewBlock(size[3]);
+    allocated = dev->GetAllocatedMem();
+    EXPECT_LE(size[0] + size[1] + size[2] + size[3], allocated);
+    dev->FreeBlock(ptr0);
+    dev->FreeBlock(ptr1);
+    dev->FreeBlock(ptr2);
+//    allocated = dev->GetAllocatedMem();
+//    EXPECT_EQ(size[3], allocated);
+    dev->FreeBlock(ptr3);
+//    allocated = dev->GetAllocatedMem();
+//    EXPECT_EQ(0, allocated);
+  }
+}
+
+TEST(Platform, CreateMultDevice) {
+  int n = Platform::GetNumGPUs();
+  auto devs = Platform::CreateCudaGPUs(n);
+  for (auto dev : devs) {
+    auto b = dev->NewBlock(32);
+    EXPECT_LE(32u, dev->GetAllocatedMem());
+    dev->FreeBlock(b);
+  }
+}
+
+TEST(Platform, CreatTensor) {
+  auto cuda = Platform::CreateCudaGPUs(1)[0];
+  singa::Tensor t(singa::Shape{2,3,4}, cuda);
+  t.SetValue(2.1f);
+  t.ToHost();
+  auto tPtr = t.data<float>();
+  for (size_t i = 0; i < t.Size(); i++)
+    EXPECT_FLOAT_EQ(tPtr[i], 2.1f);
+  t.ToDevice(cuda);
+  t = t * 3.0f;
+  t.ToHost();
+  tPtr = t.data<float>();
+  for (size_t i = 0; i < t.Size(); i++)
+    EXPECT_FLOAT_EQ(tPtr[i], 2.1f * 3.0f);
+}
+#endif
+
diff --git a/test/singa/test_pooling.cc b/test/singa/test_pooling.cc
new file mode 100644
index 0000000..7ba56d1
--- /dev/null
+++ b/test/singa/test_pooling.cc
@@ -0,0 +1,141 @@
+/************************************************************
+*
+* Licensed to the Apache Software Foundation (ASF) under one
+* or more contributor license agreements.  See the NOTICE file
+* distributed with this work for additional information
+* regarding copyright ownership.  The ASF licenses this file
+* to you under the Apache License, Version 2.0 (the
+* "License"); you may not use this file except in compliance
+* with the License.  You may obtain a copy of the License at
+*
+*   http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing,
+* software distributed under the License is distributed on an
+* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+* KIND, either express or implied.  See the License for the
+* specific language governing permissions and limitations
+* under the License.
+*
+*************************************************************/
+#include "../src/model/layer/pooling.h"
+
+#include "gtest/gtest.h"
+
+using singa::Pooling;
+using singa::Shape;
+TEST(Pooling, Setup) {
+  Pooling pool;
+  //  EXPECT_EQ("Pooling", pool.layer_type());
+
+  singa::LayerConf conf;
+  singa::PoolingConf *poolconf = conf.mutable_pooling_conf();
+  poolconf->set_pool(singa::PoolingConf_PoolMethod_MAX);
+  poolconf->set_kernel_h(1);
+  poolconf->set_kernel_w(2);
+  poolconf->set_pad_h(1);
+  poolconf->set_pad_w(0);
+  poolconf->set_stride_h(2);
+  poolconf->set_stride_w(1);
+  pool.Setup(Shape{1, 3, 3}, conf);
+
+  EXPECT_EQ(singa::PoolingConf_PoolMethod_MAX, pool.pool_method());
+  EXPECT_EQ(1u, pool.kernel_h());
+  EXPECT_EQ(2u, pool.kernel_w());
+  EXPECT_EQ(1u, pool.pad_h());
+  EXPECT_EQ(0u, pool.pad_w());
+  EXPECT_EQ(2u, pool.stride_h());
+  EXPECT_EQ(1u, pool.stride_w());
+  EXPECT_EQ(1u, pool.channels());
+  EXPECT_EQ(3u, pool.height());
+  EXPECT_EQ(3u, pool.width());
+}
+
+TEST(Pooling, Forward) {
+  const size_t batchsize = 2, c = 1, h = 3, w = 3;
+  const float x[batchsize * c * h * w] = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f,
+                                          7.0f, 8.0f, 9.0f, 1.0f, 2.0f, 3.0f,
+                                          4.0f, 5.0f, 6.0f, 7.0f, 8.0f, 9.0f};
+  singa::Tensor in(singa::Shape{batchsize, c, h, w});
+  in.CopyDataFromHostPtr(x, batchsize * c * h * w);
+
+  Pooling pool;
+  singa::LayerConf conf;
+  singa::PoolingConf *poolconf = conf.mutable_pooling_conf();
+  poolconf->set_pool(singa::PoolingConf_PoolMethod_MAX);
+  poolconf->set_kernel_h(2);
+  poolconf->set_kernel_w(2);
+  poolconf->set_pad_h(0);
+  poolconf->set_pad_w(0);
+  poolconf->set_stride_h(1);
+  poolconf->set_stride_w(1);
+  pool.Setup(Shape{1, 3, 3}, conf);
+
+  // Parameter "flag" does not influence pooling
+  singa::Tensor out1 = pool.Forward(singa::kTrain, in);
+  const float *outptr1 = out1.data<float>();
+  // Input: 3*3; kernel: 2*2; stride: 1*1; no padding.
+  EXPECT_EQ(8u, out1.Size());
+  EXPECT_EQ(5.0f, outptr1[0]);
+  EXPECT_EQ(6.0f, outptr1[1]);
+  EXPECT_EQ(8.0f, outptr1[2]);
+  EXPECT_EQ(9.0f, outptr1[3]);
+  EXPECT_EQ(5.0f, outptr1[4]);
+  EXPECT_EQ(6.0f, outptr1[5]);
+  EXPECT_EQ(8.0f, outptr1[6]);
+  EXPECT_EQ(9.0f, outptr1[7]);
+}
+
+TEST(Pooling, Backward) {
+  // src_data
+  const size_t batchsize = 2, c = 1, src_h = 3, src_w = 3;
+  const float x[batchsize * c * src_h * src_w] = {
+      1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f, 9.0f,
+      1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f, 9.0f};
+  singa::Tensor in(singa::Shape{batchsize, c, src_h, src_w});
+  in.CopyDataFromHostPtr(x, batchsize * c * src_h * src_w);
+
+  Pooling pool;
+  singa::LayerConf conf;
+  singa::PoolingConf *poolconf = conf.mutable_pooling_conf();
+  poolconf->set_pool(singa::PoolingConf_PoolMethod_MAX);
+  poolconf->set_kernel_h(2);
+  poolconf->set_kernel_w(2);
+  poolconf->set_pad_h(0);
+  poolconf->set_pad_w(0);
+  poolconf->set_stride_h(1);
+  poolconf->set_stride_w(1);
+  pool.Setup(Shape{1, 3, 3}, conf);
+
+  singa::Tensor out1 = pool.Forward(singa::kTrain, in);
+
+  // grad
+  const size_t grad_h = 2, grad_w = 2;
+  const float dy[batchsize * c * grad_h * grad_w] = {0.1f, 0.2f, 0.3f, 0.4f,
+                                                     0.1f, 0.2f, 0.3f, 0.4f};
+  singa::Tensor grad(singa::Shape{batchsize, c, grad_h, grad_w});
+  grad.CopyDataFromHostPtr(dy, batchsize * c * grad_h * grad_w);
+
+  const auto ret = pool.Backward(singa::kTrain, grad);
+  singa::Tensor in_grad = ret.first;
+  const float *dx = in_grad.data<float>();
+  EXPECT_EQ(18u, in_grad.Size());
+  EXPECT_EQ(0.0f, dx[0]);
+  EXPECT_EQ(0.0f, dx[1]);
+  EXPECT_EQ(0.0f, dx[2]);
+  EXPECT_EQ(0.0f, dx[3]);
+  EXPECT_EQ(0.1f, dx[4]);
+  EXPECT_EQ(0.2f, dx[5]);
+  EXPECT_EQ(0.0f, dx[6]);
+  EXPECT_EQ(0.3f, dx[7]);
+  EXPECT_EQ(0.4f, dx[8]);
+  EXPECT_EQ(0.0f, dx[9]);
+  EXPECT_EQ(0.0f, dx[10]);
+  EXPECT_EQ(0.0f, dx[11]);
+  EXPECT_EQ(0.0f, dx[12]);
+  EXPECT_EQ(0.1f, dx[13]);
+  EXPECT_EQ(0.2f, dx[14]);
+  EXPECT_EQ(0.0f, dx[15]);
+  EXPECT_EQ(0.3f, dx[16]);
+  EXPECT_EQ(0.4f, dx[17]);
+}
diff --git a/test/singa/test_prelu.cc b/test/singa/test_prelu.cc
new file mode 100644
index 0000000..77b4b74
--- /dev/null
+++ b/test/singa/test_prelu.cc
@@ -0,0 +1,249 @@
+/************************************************************
+*
+* Licensed to the Apache Software Foundation (ASF) under one
+* or more contributor license agreements.  See the NOTICE file
+* distributed with this work for additional information
+* regarding copyright ownership.  The ASF licenses this file
+* to you under the Apache License, Version 2.0 (the
+* "License"); you may not use this file except in compliance
+* with the License.  You may obtain a copy of the License at
+*
+*   http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing,
+* software distributed under the License is distributed on an
+* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+* KIND, either express or implied.  See the License for the
+* specific language governing permissions and limitations
+* under the License.
+*
+*************************************************************/
+
+#include "../src/model/layer/prelu.h"
+#include "gtest/gtest.h"
+#include "singa/singa_config.h"
+
+using singa::PReLU;
+using singa::Shape;
+TEST(PReLU, Setup) {
+  PReLU prelu;
+  // EXPECT_EQ("PReLU", prelu.layer_type());
+
+  singa::LayerConf conf;
+  singa::PReLUConf *preluconf = conf.mutable_prelu_conf();
+  preluconf->set_channel_shared(true);
+  preluconf->set_format("NHWC");
+
+  prelu.Setup(Shape{4}, conf);
+  EXPECT_EQ(true, prelu.Channel_shared());
+  EXPECT_EQ("NHWC", prelu.Format());
+}
+
+TEST(PReLU, ForwardCPU) {
+  const float x[] = {1.f,  2.f, 3.f,  -2.f, -3.f, -1.f,
+                     -1.f, 2.f, -1.f, -2.f, -2.f, -1.f};
+  size_t n = sizeof(x) / sizeof(float);
+  size_t batchsize = 2, c = 3, h = 2, w = 1;
+  singa::Tensor in(singa::Shape{batchsize, h, w, c});
+  in.CopyDataFromHostPtr<float>(x, n);
+
+  PReLU prelu;
+  singa::LayerConf conf;
+  singa::PReLUConf *preluconf = conf.mutable_prelu_conf();
+  preluconf->set_channel_shared(false);
+  preluconf->set_format("NHWC");
+  prelu.Setup(Shape{h, w, c}, conf);
+
+  const float neg_slope[] = {0.25f, 0.5f, 0.75f};
+  singa::Tensor a(singa::Shape{c});
+  a.CopyDataFromHostPtr<float>(neg_slope, c);
+  prelu.Set_a(a);
+
+  singa::Tensor out = prelu.Forward(singa::kTrain, in);
+  const float *yptr = out.data<float>();
+  EXPECT_EQ(n, out.Size());
+
+  float *y = new float[n];
+  size_t div_factor = prelu.Channel_shared() ? c : 1;
+  if (prelu.Format() == "NCHW") {
+    for (size_t i = 0; i < n; i++) {
+      size_t pos = i / (h * w) % c / div_factor;
+      y[i] = std::max(x[i], 0.f) + neg_slope[pos] * std::min(x[i], 0.f);
+    }
+  } else if (prelu.Format() == "NHWC") {
+    for (size_t i = 0; i < n; i++) {
+      size_t pos = i % c / div_factor;
+      y[i] = std::max(x[i], 0.f) + neg_slope[pos] * std::min(x[i], 0.f);
+    }
+  }
+  for (size_t i = 0; i < n; i++) EXPECT_FLOAT_EQ(y[i], yptr[i]);
+  delete[] y;
+}
+
+TEST(PReLU, BackwardCPU) {
+  const float x[] = {1.f,  2.f, 3.f,  -2.f, -3.f, -1.f,
+                     -1.f, 2.f, -1.f, -2.f, -2.f, -1.f};
+  size_t n = sizeof(x) / sizeof(float);
+  size_t batchsize = 2, c = 3, h = 2, w = 1;
+  singa::Tensor in(singa::Shape{batchsize, c, h, w});
+  in.CopyDataFromHostPtr<float>(x, n);
+
+  PReLU prelu;
+  singa::LayerConf conf;
+  singa::PReLUConf *preluconf = conf.mutable_prelu_conf();
+  preluconf->set_channel_shared(false);
+  preluconf->set_format("NCHW");
+  prelu.Setup(Shape{c, h, w}, conf);
+
+  const float neg_slope[] = {0.25f, 0.5f, 0.75f};
+  singa::Tensor a(singa::Shape{c});
+  a.CopyDataFromHostPtr<float>(neg_slope, c);
+  prelu.Set_a(a);
+
+  singa::Tensor out = prelu.Forward(singa::kTrain, in);
+
+  const float grad[] = {1.f, 2.f,  -2.f, -1.f, -1.f, -3.f,
+                        2.f, -2.f, 1.f,  1.f,  -2.f, 0.f};
+  singa::Tensor out_diff(singa::Shape{batchsize, c, h, w});
+  out_diff.CopyDataFromHostPtr<float>(grad, n);
+  const auto ret = prelu.Backward(singa::kTrain, out_diff);
+  const float *xptr = ret.first.data<float>();
+  const float *aptr = ret.second.at(0).data<float>();
+  float *dx = new float[n];
+  size_t div_factor = prelu.Channel_shared() ? c : 1;
+  size_t params = prelu.Channel_shared() ? 1 : c;
+  float da[] = {0.f, 0.f, 0.f};
+  if (prelu.Format() == "NCHW") {
+    for (size_t i = 0; i < n; i++) {
+      size_t pos = i / (h * w) % c / div_factor;
+      dx[i] = grad[i] *
+              (std::max(x[i], 0.f) + neg_slope[pos] * std::min(x[i], 0.f));
+    }
+    for (size_t i = 0; i < n; i++) {
+      size_t pos = i / (h * w) % c / div_factor;
+      da[pos] += grad[i] * std::min(x[i], 0.f);
+    }
+  } else if (prelu.Format() == "NHWC") {
+    for (size_t i = 0; i < n; i++) {
+      size_t pos = i % c / div_factor;
+      dx[i] = grad[i] *
+              (std::max(x[i], 0.f) + neg_slope[pos] * std::min(x[i], 0.f));
+    }
+    for (size_t i = 0; i < n; i++) {
+      size_t pos = i % c / div_factor;
+      da[pos] += grad[i] * std::min(x[i], 0.f);
+    }
+  }
+  for (size_t i = 0; i < n; i++) EXPECT_FLOAT_EQ(dx[i], xptr[i]);
+  for (size_t i = 0; i < params; i++) EXPECT_FLOAT_EQ(da[i], aptr[i]);
+  delete[] dx;
+}
+
+#ifdef USE_CUDA
+TEST(PReLU, ForwardGPU) {
+  const float x[] = {1.f,  2.f, 3.f,  -2.f, -3.f, -1.f,
+                         -1.f, 2.f, -1.f, -2.f, -2.f, -1.f};
+  size_t n = sizeof(x) / sizeof(float);
+  size_t batchsize = 2, c = 3, h = 2, w = 1;
+  auto cuda = std::make_shared<singa::CudaGPU>();
+  singa::Tensor in(singa::Shape{batchsize, h, w, c}, cuda);
+  in.CopyDataFromHostPtr<float>(x, n);
+
+  PReLU prelu;
+  singa::LayerConf conf;
+  singa::PReLUConf *preluconf = conf.mutable_prelu_conf();
+  preluconf->set_channel_shared(false);
+  preluconf->set_format("NHWC");
+  prelu.Setup(Shape{h, w, c}, conf);
+
+  const float neg_slope[] = {0.25f, 0.5f, 0.75f};
+  singa::Tensor a(singa::Shape{c}, cuda);
+  a.CopyDataFromHostPtr<float>(neg_slope, c);
+  prelu.Set_a(a);
+
+  singa::Tensor out = prelu.Forward(singa::kTrain, in);
+  out.ToHost();
+  const float *yptr = out.data<float>();
+  EXPECT_EQ(n, out.Size());
+
+  float *y = new float[n];
+  size_t div_factor = prelu.Channel_shared() ? c : 1;
+  if (prelu.Format() == "NCHW") {
+    for (size_t i = 0; i < n; i++) {
+      size_t pos = i / (h * w) % c / div_factor;
+      y[i] = std::max(x[i], 0.f) + neg_slope[pos] * std::min(x[i], 0.f);
+    }
+  } else if (prelu.Format() == "NHWC") {
+    for (size_t i = 0; i < n; i++) {
+      size_t pos = i % c / div_factor;
+      y[i] = std::max(x[i], 0.f) + neg_slope[pos] * std::min(x[i], 0.f);
+    }
+  }
+  for (size_t i = 0; i < n; i++) EXPECT_FLOAT_EQ(y[i], yptr[i]);
+  delete[] y;
+}
+
+TEST(PReLU, BackwardGPU) {
+  const float x[] = {1.f,  2.f, 3.f,  -2.f, -3.f, -1.f,
+                           -1.f, 2.f, -1.f, -2.f, -2.f, -1.f};
+  size_t n = sizeof(x) / sizeof(float);
+  size_t batchsize = 2, c = 3, h = 2, w = 1;
+  auto cuda = std::make_shared<singa::CudaGPU>();
+  singa::Tensor in(singa::Shape{batchsize, c, h, w}, cuda);
+  in.CopyDataFromHostPtr<float>(x, n);
+
+  PReLU prelu;
+  singa::LayerConf conf;
+  singa::PReLUConf *preluconf = conf.mutable_prelu_conf();
+  preluconf->set_channel_shared(false);
+  preluconf->set_format("NCHW");
+  prelu.Setup(Shape{c, h, w}, conf);
+
+  const float neg_slope[] = {0.25f, 0.5f, 0.75f};
+  singa::Tensor a(singa::Shape{c}, cuda);
+  a.CopyDataFromHostPtr<float>(neg_slope, c);
+  prelu.Set_a(a);
+
+  singa::Tensor out = prelu.Forward(singa::kTrain, in);
+  const float grad[] = {1.f, 2.f,  -2.f, -1.f, -1.f, -3.f,
+                          2.f, -2.f, 1.f,  1.f,  -2.f, 0.f};
+  singa::Tensor out_diff(singa::Shape{batchsize, c, h, w}, cuda);
+  out_diff.CopyDataFromHostPtr<float>(grad, n);
+  const auto ret = prelu.Backward(singa::kTrain, out_diff);
+
+  singa::Tensor in_diff = ret.first;
+  in_diff.ToHost();
+  const float *xptr = in_diff.data<float>();
+  singa::Tensor a_diff = ret.second.at(0);
+  a_diff.ToHost();
+  const float *aptr = a_diff.data<float>();
+  float *dx = new float[n];
+  size_t div_factor = prelu.Channel_shared() ? c : 1;
+  size_t params = prelu.Channel_shared() ? 1 : c;
+  float da[] = {0.f, 0.f, 0.f};
+  if (prelu.Format() == "NCHW") {
+    for (size_t i = 0; i < n; i++) {
+      size_t pos = i / (h * w) % c / div_factor;
+      dx[i] = grad[i] *
+                (std::max(x[i], 0.f) + neg_slope[pos] * std::min(x[i], 0.f));
+    }
+    for (size_t i = 0; i < n; i++) {
+      size_t pos = i / (h * w) % c / div_factor;
+      da[pos] += grad[i] * std::min(x[i], 0.f);
+    }
+  } else if (prelu.Format() == "NHWC") {
+    for (size_t i = 0; i < n; i++) {
+      size_t pos = i % c / div_factor;
+      dx[i] = grad[i] *
+        (std::max(x[i], 0.f) + neg_slope[pos] * std::min(x[i], 0.f));
+  }
+    for (size_t i = 0; i < n; i++) {
+      size_t pos = i % c / div_factor;
+      da[pos] += grad[i] * std::min(x[i], 0.f);
+    }
+  }
+  for (size_t i = 0; i < n; i++) EXPECT_FLOAT_EQ(dx[i], xptr[i]);
+  for (size_t i = 0; i < params; i++) EXPECT_FLOAT_EQ(da[i], aptr[i]);
+  delete[] dx;
+}
+#endif
diff --git a/test/singa/test_rmsprop.cc b/test/singa/test_rmsprop.cc
new file mode 100644
index 0000000..d259592
--- /dev/null
+++ b/test/singa/test_rmsprop.cc
@@ -0,0 +1,105 @@
+/************************************************************
+*
+* Licensed to the Apache Software Foundation (ASF) under one
+* or more contributor license agreements.  See the NOTICE file
+* distributed with this work for additional information
+* regarding copyright ownership.  The ASF licenses this file
+* to you under the Apache License, Version 2.0 (the
+* "License"); you may not use this file except in compliance
+* with the License.  You may obtain a copy of the License at
+*
+*   http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing,
+* software distributed under the License is distributed on an
+* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+* KIND, either express or implied.  See the License for the
+* specific language governing permissions and limitations
+* under the License.
+*
+*************************************************************/
+
+#include "gtest/gtest.h"
+#include "singa/model/optimizer.h"
+#include <cmath>
+
+TEST(RMSProp, ApplyCPU) {
+  singa::RMSProp rmsprop;
+  float lr = 0.1f;
+  float rho = 0.9;
+  const float v[4] = {0.1, 0.2, 0.3, 0.4};
+  const float g[4] = {0.01, 0.02, 0.03, 0.04};
+
+  singa::OptimizerConf conf;
+  conf.set_rho(rho);
+  conf.set_delta(1E-8);
+
+  singa::Tensor value(singa::Shape{4}), grad(singa::Shape{4});
+  value.CopyDataFromHostPtr(v, 4);
+  grad.CopyDataFromHostPtr(g, 4);
+
+  rmsprop.Setup(conf);
+  rmsprop.Apply(0, lr, "xx", grad, value);
+
+  singa::Tensor v1 = value.Clone();
+  const float* newv1 = v1.data<float>();
+  float history[4];
+  for (int i = 0; i < 4; ++i) history[i] = g[i] * g[i] * (1 - rho);
+  for (int i = 0; i < 4; ++i)
+    EXPECT_NEAR(newv1[i], v[i] - g[i] * lr / sqrt(history[i] + (float)1E-8),
+                1e-5);
+
+  grad.CopyDataFromHostPtr(g, 4);
+  rmsprop.Apply(1, lr, "xx", grad, value);
+  singa::Tensor v2 = value.Clone();
+  const float* newv2 = v2.data<float>();
+  for (int i = 0; i < 4; ++i)
+    history[i] = history[i] * rho + g[i] * g[i] * (1 - rho);
+
+  for (int i = 0; i < 4; ++i)
+    EXPECT_NEAR(newv2[i], newv1[i] - lr * g[i] / sqrt(history[i] + (float)1E-8),
+                1e-5);
+}
+
+#ifdef USE_CUDA
+TEST(RMSProp, ApplyCUDA) {
+  singa::RMSProp rmsprop;
+  float lr = 0.1f;
+  float rho = 0.02;
+  const float v[4] = {0.1, 0.2, 0.3, 0.4};
+  const float g[4] = {0.01, 0.02, 0.03, 0.04};
+
+  singa::OptimizerConf conf;
+  conf.set_rho(rho);
+  conf.set_delta(1e-8);
+
+  auto dev = std::make_shared<singa::CudaGPU>();
+  singa::Tensor value(singa::Shape{4}, dev), grad(singa::Shape{4}, dev);
+  value.CopyDataFromHostPtr(v, 4);
+  grad.CopyDataFromHostPtr(g, 4);
+
+  rmsprop.Setup(conf);
+  rmsprop.Apply(0, lr, "xx", grad, value);
+
+  singa::Tensor v1 = value.Clone();
+  v1.ToHost();
+  const float* newv1 = v1.data<float>();
+  float history[4];
+  for (int i = 0; i < 4; ++i) history[i] = g[i] * g[i] * (1 - rho);
+  for (int i = 0; i < 4; ++i)
+    EXPECT_NEAR(newv1[i], v[i] - lr * g[i] / sqrt(history[i] + conf.delta()),
+                1e-5);
+
+  grad.CopyDataFromHostPtr(g, 4);
+  rmsprop.Apply(1, lr, "xx", grad, value);
+  singa::Tensor v2 = value.Clone();
+  v2.ToHost();
+  const float* newv2 = v2.data<float>();
+  for (int i = 0; i < 4; ++i)
+    history[i] = history[i] * rho + g[i] * g[i] * (1 - rho);
+
+  for (int i = 0; i < 4; ++i)
+    EXPECT_NEAR(newv2[i],
+                newv1[i] - lr * g[i] / sqrt(history[i] + conf.delta()), 1e-5);
+}
+#endif
diff --git a/test/singa/test_sgd.cc b/test/singa/test_sgd.cc
new file mode 100644
index 0000000..e6ed9bf
--- /dev/null
+++ b/test/singa/test_sgd.cc
@@ -0,0 +1,150 @@
+/************************************************************
+*
+* Licensed to the Apache Software Foundation (ASF) under one
+* or more contributor license agreements.  See the NOTICE file
+* distributed with this work for additional information
+* regarding copyright ownership.  The ASF licenses this file
+* to you under the Apache License, Version 2.0 (the
+* "License"); you may not use this file except in compliance
+* with the License.  You may obtain a copy of the License at
+*
+*   http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing,
+* software distributed under the License is distributed on an
+* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+* KIND, either express or implied.  See the License for the
+* specific language governing permissions and limitations
+* under the License.
+*
+*************************************************************/
+
+#include "gtest/gtest.h"
+#include "singa/model/optimizer.h"
+#include "singa/singa_config.h"
+
+TEST(SGD, ApplyWithoutMomentum) {
+  singa::SGD sgd;
+  const float v[4] = {0.1, 0.2, 0.3, 0.4};
+  const float g[4] = {0.1, 0.1, 0.1, 0.1};
+
+  singa::Tensor value(singa::Shape{4}), grad(singa::Shape{4});
+  value.CopyDataFromHostPtr(v, 4);
+  grad.CopyDataFromHostPtr(g, 4);
+
+  float lr = 0.1f;
+  sgd.Apply(0, lr, "xx", grad, value);
+
+  singa::Tensor v1 = value.Clone();
+  const float* newv1 = v1.data<float>();
+  for (int i = 0; i < 4; i++) {
+    EXPECT_FLOAT_EQ(newv1[i], v[i] - g[i] * lr);
+  }
+
+
+  lr /= 2;
+  grad.CopyDataFromHostPtr(g, 4);
+  sgd.Apply(1, lr, "xx", grad, value);
+  singa::Tensor v2 = value.Clone();
+  const float* newv2 = v2.data<float>();
+  for (int i = 0; i < 4; i++) {
+    EXPECT_FLOAT_EQ(newv2[i], newv1[i] - g[i] * lr);
+  }
+}
+
+
+TEST(SGD, ApplyWithMomentum) {
+  singa::SGD sgd;
+  float lr = 0.1f;
+  auto func = [](int step) { return step <=5 ? 0.5f: 0.9f;};
+  sgd.SetMomentumGenerator(func);
+  const float v[4] = {0.1, 0.2, 0.3, 0.4};
+  const float g[4] = {0.01, 0.02, 0.03, 0.04};
+
+  singa::Tensor value(singa::Shape{4}), grad(singa::Shape{4});
+  value.CopyDataFromHostPtr(v, 4);
+  grad.CopyDataFromHostPtr(g, 4);
+
+  sgd.Apply(0, lr, "xx", grad, value);
+
+  singa::Tensor v1 = value.Clone();
+  const float* newv1 = v1.data<float>();
+  for (int i = 0; i < 4; i++) {
+    EXPECT_FLOAT_EQ(newv1[i], v[i] - g[i] * lr);
+  }
+
+  grad.CopyDataFromHostPtr(g, 4);
+  sgd.Apply(1, lr, "xx", grad, value);
+  singa::Tensor v2 = value.Clone();
+  const float* newv2 = v2.data<float>();
+  for (int i = 0; i < 4; i++) {
+    EXPECT_FLOAT_EQ(newv2[i], newv1[i] - (g[i] * lr + g[i] * lr * func(1)));
+  }
+}
+
+#ifdef USE_CUDA
+TEST(SGD, ApplyWithoutMomentumCuda) {
+  singa::SGD sgd;
+  const float v[4] = {0.1, 0.2, 0.3, 0.4};
+  const float g[4] = {0.1, 0.1, 0.1, 0.1};
+
+	auto dev = std::make_shared<singa::CudaGPU>();
+  singa::Tensor value(singa::Shape{4}, dev), grad(singa::Shape{4}, dev);
+  value.CopyDataFromHostPtr(v, 4);
+  grad.CopyDataFromHostPtr(g, 4);
+
+  float lr = 0.1f;
+  sgd.Apply(0, lr, "xx", grad, value);
+
+  singa::Tensor v1 = value.Clone();
+  v1.ToHost();
+  const float* newv1 = v1.data<float>();
+  for (int i = 0; i < 4; i++) {
+    EXPECT_FLOAT_EQ(newv1[i], v[i] - g[i] * lr);
+  }
+
+
+  lr /= 2;
+  grad.CopyDataFromHostPtr(g, 4);
+  sgd.Apply(1, lr, "xx", grad, value);
+  singa::Tensor v2 = value.Clone();
+  v2.ToHost();
+  const float* newv2 = v2.data<float>();
+  for (int i = 0; i < 4; i++) {
+    EXPECT_FLOAT_EQ(newv2[i], newv1[i] - g[i] * lr);
+  }
+}
+
+
+TEST(SGD, ApplyWithMomentumCuda) {
+  singa::SGD sgd;
+  float lr = 0.1f;
+  auto func = [](int step) { return step <=5 ? 0.5f: 0.9f;};
+  sgd.SetMomentumGenerator(func);
+  const float v[4] = {0.1, 0.2, 0.3, 0.4};
+  const float g[4] = {0.01, 0.02, 0.03, 0.04};
+
+	auto dev = std::make_shared<singa::CudaGPU>();
+  singa::Tensor value(singa::Shape{4}, dev), grad(singa::Shape{4}, dev);
+  value.CopyDataFromHostPtr(v, 4);
+  grad.CopyDataFromHostPtr(g, 4);
+
+  sgd.Apply(0, lr, "xx", grad, value);
+
+  singa::Tensor v1 = value.Clone();
+  v1.ToHost();
+  const float* newv1 = v1.data<float>();
+  for (int i = 0; i < 4; i++) {
+    EXPECT_FLOAT_EQ(newv1[i], v[i] - g[i] * lr);
+  }
+
+  grad.CopyDataFromHostPtr(g, 4);
+  sgd.Apply(1, lr, "xx", grad, value);
+  singa::Tensor v2 = value.Clone();
+  v2.ToHost();
+  const float* newv2 = v2.data<float>();
+  for (int i = 0; i < 4; i++) {
+    EXPECT_FLOAT_EQ(newv2[i], newv1[i] - (g[i] * lr + g[i] * lr * func(1)));
+  }
+}
+#endif
diff --git a/test/singa/test_snapshot.cc b/test/singa/test_snapshot.cc
new file mode 100644
index 0000000..33b57b9
--- /dev/null
+++ b/test/singa/test_snapshot.cc
@@ -0,0 +1,123 @@
+/************************************************************
+*
+* Licensed to the Apache Software Foundation (ASF) under one
+* or more contributor license agreements.  See the NOTICE file
+* distributed with this work for additional information
+* regarding copyright ownership.  The ASF licenses this file
+* to you under the Apache License, Version 2.0 (the
+* "License"); you may not use this file except in compliance
+* with the License.  You may obtain a copy of the License at
+*
+*   http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing,
+* software distributed under the License is distributed on an
+* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+* KIND, either express or implied.  See the License for the
+* specific language governing permissions and limitations
+* under the License.
+*
+*************************************************************/
+
+#include "gtest/gtest.h"
+#include "singa/io/snapshot.h"
+#include "singa/io/reader.h"
+#include "singa/core/tensor.h"
+
+#include <string>
+#include <fstream>
+
+const std::string prefix = "./snapshot_test";
+const float param_1_data[] = {0.1, 0.2, 0.3, 0.4};
+const float param_2_data[] = {0.2, 0.1, 0.4, 0.3};
+const std::string desc_1 =
+    "parameter name: Param_1\tdata type: 0\tdim: 1\tshape: 4";
+const std::string desc_2 =
+    "parameter name: Param_2\tdata type: 0\tdim: 2\tshape: 2 2";
+const int int_data[] = {1, 3, 5, 7};
+const double double_data[] = {0.2, 0.4, 0.6, 0.8};
+
+TEST(Snapshot, WriteTest) {
+  singa::Snapshot snapshot(prefix, singa::Snapshot::kWrite);
+  singa::Tensor param_1(singa::Shape{4}), param_2(singa::Shape{2, 2});
+  param_1.CopyDataFromHostPtr(param_1_data, 4);
+  param_2.CopyDataFromHostPtr(param_2_data, 4);
+  snapshot.Write("Param_1", param_1);
+  snapshot.Write("Param_2", param_2);
+}
+
+TEST(Snapshot, ReadTest) {
+  singa::Snapshot snapshot(prefix, singa::Snapshot::kRead);
+  singa::Tensor param_1, param_2;
+  singa::Shape shape1, shape2;
+  shape1 = snapshot.ReadShape("Param_1");
+  EXPECT_EQ(shape1.size(), 1u);
+  EXPECT_EQ(shape1[0], 4u);
+  shape2 = snapshot.ReadShape("Param_2");
+  EXPECT_EQ(shape2.size(), 2u);
+  EXPECT_EQ(shape2[0], 2u);
+  EXPECT_EQ(shape2[1], 2u);
+  param_1 = snapshot.Read("Param_1");
+  const float* data_1 = param_1.data<float>();
+  for (size_t i = 0; i < singa::Product(shape1); ++i)
+    EXPECT_FLOAT_EQ(data_1[i], param_1_data[i]);
+  param_2 = snapshot.Read("Param_2");
+  const float* data_2 = param_2.data<float>();
+  for (size_t i = 0; i < singa::Product(shape2); ++i)
+    EXPECT_FLOAT_EQ(data_2[i], param_2_data[i]);
+  std::ifstream desc_file(prefix + ".desc");
+  std::string line;
+  getline(desc_file, line);
+  EXPECT_EQ(line, desc_1);
+  getline(desc_file, line);
+  EXPECT_EQ(line, desc_2);
+}
+
+TEST(Snapshot, ReadIntTest) {
+  {
+    singa::Snapshot int_snapshot_write(prefix + ".int",
+                                       singa::Snapshot::kWrite);
+    singa::Tensor int_param(singa::Shape{4});
+    int_param.AsType(singa::kInt);
+    int_param.CopyDataFromHostPtr(int_data, 4);
+    int_snapshot_write.Write("IntParam", int_param);
+  }
+
+  {
+    singa::Snapshot int_snapshot_read(prefix + ".int", singa::Snapshot::kRead);
+    singa::Shape shape;
+    shape = int_snapshot_read.ReadShape("IntParam");
+    EXPECT_EQ(shape.size(), 1u);
+    EXPECT_EQ(shape[0], 4u);
+    singa::Tensor int_param = int_snapshot_read.Read("IntParam");
+    const int* param_data = int_param.data<int>();
+    for (size_t i = 0; i < singa::Product(shape); ++i)
+      EXPECT_EQ(param_data[i], int_data[i]);
+  }
+}
+
+/*
+TEST(Snapshot, ReadDoubleTest) {
+  {
+    singa::Snapshot double_snapshot_write(prefix + ".double",
+                                          singa::Snapshot::kWrite);
+    singa::Tensor double_param(singa::Shape{4});
+    double_param.AsType(singa::kDouble);
+    double_param.CopyDataFromHostPtr(double_data, 4);
+    double_snapshot_write.Write("DoubleParam", double_param);
+  }
+
+  {
+    singa::Snapshot double_snapshot_read(prefix + ".double",
+                                         singa::Snapshot::kRead);
+    singa::Shape shape;
+    shape = double_snapshot_read.ReadShape("DoubleParam");
+    EXPECT_EQ(shape.size(), 1u);
+    EXPECT_EQ(shape[0], 4u);
+    singa::Tensor double_param = double_snapshot_read.Read("DoubleParam");
+    const double* param_data = double_param.data<double>();
+    for (size_t i = 0; i < singa::Product(shape); ++i)
+      EXPECT_EQ(param_data[i], double_data[i]);
+  }
+}
+*/
diff --git a/test/singa/test_softmax.cc b/test/singa/test_softmax.cc
new file mode 100644
index 0000000..8064b80
--- /dev/null
+++ b/test/singa/test_softmax.cc
@@ -0,0 +1,103 @@
+/************************************************************
+*
+* Licensed to the Apache Software Foundation (ASF) under one
+* or more contributor license agreements.  See the NOTICE file
+* distributed with this work for additional information
+* regarding copyright ownership.  The ASF licenses this file
+* to you under the Apache License, Version 2.0 (the
+* "License"); you may not use this file except in compliance
+* with the License.  You may obtain a copy of the License at
+*
+*   http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing,
+* software distributed under the License is distributed on an
+* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+* KIND, either express or implied.  See the License for the
+* specific language governing permissions and limitations
+* under the License.
+*
+*************************************************************/
+
+#include "../src/model/layer/softmax.h"
+#include "gtest/gtest.h"
+#include <math.h> // exp
+
+using singa::Softmax;
+using singa::Shape;
+TEST(Softmax, Setup) {
+  Softmax sft;
+  // EXPECT_EQ("Softmax", sft.layer_type());
+
+  singa::LayerConf conf;
+  sft.Setup(Shape{3}, conf);
+}
+
+#ifdef USE_CBLAS
+TEST(Softmax, Forward) {
+  const float x[] = {1.0f, 2.0f, 0.0f, -2.0f, -3.0f, -1.0};
+  size_t row = 2;
+  size_t col = 3;
+  size_t n = row * col;
+  singa::Tensor in(singa::Shape{row, col});
+  in.CopyDataFromHostPtr<float>(x, row * col);
+
+  Softmax sft;
+  singa::LayerConf conf;
+  sft.Setup(Shape{col}, conf);
+
+  singa::Tensor out = sft.Forward(singa::kTrain, in);
+  const float* yptr = out.data<float>();
+  EXPECT_EQ(n, out.Size());
+
+  float* sigma = new float[row];
+  for (size_t i = 0; i < row; i++)
+    sigma[i] = 0.f;
+  for (size_t i = 0; i < n; i++)
+    sigma[i / col] += exp(x[i]);
+  //EXPECT_EQ(0, sigma[1]);
+  for (size_t i = 0; i < row; i++)
+    for (size_t j = 0; j < col; j++) {
+      EXPECT_FLOAT_EQ(yptr[i * col + j], exp(x[i * col + j]) / sigma[i]);
+    }
+  delete[] sigma;
+}
+
+TEST(Softmax, Backward) {
+  const float x[] = {1.0f, 2.0f, 0.0f, -2.0f, -3.0f, -1.0};
+  size_t n = sizeof(x) / sizeof(float);
+  size_t row = 2;
+  size_t col = 3;
+  singa::Tensor in(singa::Shape{row, col});
+  in.CopyDataFromHostPtr<float>(x, n);
+
+  Softmax sft;
+  singa::LayerConf conf;
+  sft.Setup(Shape{col}, conf);
+  singa::Tensor out = sft.Forward(singa::kTrain, in);
+  const float* yptr = out.data<float>();
+
+  const float grad[] = {2.0f, -3.0f, 1.0f, 3.0f, -1.0f, -2.0};
+  singa::Tensor out_diff(singa::Shape{row, col});
+  out_diff.CopyDataFromHostPtr<float>(grad, n);
+  const auto in_diff = sft.Backward(singa::kTrain, out_diff);
+  const float* xptr = in_diff.first.data<float>();
+
+  float* dx = new float[n];
+  float* sigma = new float[row];
+  for (size_t i = 0; i < row; i++)
+    sigma[i] = 0.f;
+  for (size_t i = 0; i < n; i++)
+    sigma[i / col] += grad[i] * yptr[i];
+  // EXPECT_EQ(0, sigma[0]);
+  // EXPECT_EQ(0, sigma[1]);
+  for (size_t i = 0; i < row; i++)
+    for (size_t j = 0; j < col; j++)
+      dx[i * col + j] = (grad[i * col + j] - sigma[i]) * yptr[i * col +j];
+  EXPECT_FLOAT_EQ(dx[0], xptr[0]);
+  EXPECT_FLOAT_EQ(dx[4], xptr[4]);
+  EXPECT_FLOAT_EQ(dx[5], xptr[5]);
+  delete[] dx;
+  delete[] sigma;
+}
+#endif
diff --git a/test/singa/test_tensor.cc b/test/singa/test_tensor.cc
new file mode 100644
index 0000000..316b996
--- /dev/null
+++ b/test/singa/test_tensor.cc
@@ -0,0 +1,131 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "gtest/gtest.h"
+#include "singa/core/tensor.h"
+using singa::Tensor;
+using singa::Shape;
+using singa::Device;
+
+TEST(TensorTest, TestConstructor) {
+  singa::Tensor float_t(singa::Shape{2,3});
+  EXPECT_EQ(6u, float_t.Size());
+  EXPECT_EQ(sizeof(float) * 6, float_t.MemSize());
+  EXPECT_EQ(singa::kFloat32, float_t.data_type());
+  auto s = float_t.shape();
+  EXPECT_EQ(s[0], 2u);
+  EXPECT_EQ(s[1], 3u);
+
+  EXPECT_NE(float_t.device(), nullptr);
+
+  singa::Tensor float16_t(Shape{2,3}, singa::kFloat16);
+  EXPECT_EQ(singa::kFloat16, float16_t.data_type());
+  EXPECT_EQ(6u, float16_t.Size());
+  EXPECT_EQ(12u, float16_t.block()->size());
+
+  singa::Tensor x(float16_t);
+  EXPECT_EQ(float16_t.Size(), x.Size());
+  EXPECT_EQ(float16_t.block(), x.block());
+  EXPECT_EQ(float16_t.data_type(), x.data_type());
+  EXPECT_EQ(float16_t.device(), x.device());
+
+  singa::Tensor y = float16_t;
+  EXPECT_EQ(float16_t.Size(), x.Size());
+  EXPECT_EQ(float16_t.block(), x.block());
+  EXPECT_EQ(float16_t.data_type(), x.data_type());
+  EXPECT_EQ(float16_t.device(), x.device());
+}
+
+TEST(TensorClass, Reshape) {
+  Tensor t;
+  t.Reshape(Shape{2,3});
+  EXPECT_TRUE((Shape{2,3} == t.shape()));
+
+  t.Reshape(Shape{3,3, 4});
+  EXPECT_TRUE((Shape{3,3, 4} == t.shape()));
+
+  t.Reshape(Shape{12});
+  EXPECT_TRUE((Shape{12} == t.shape()));
+
+  Tensor o;
+  EXPECT_TRUE(o.shape() != t.shape());
+  o.Reshape(Shape{3, 3});
+  EXPECT_TRUE(o.shape() != t.shape());
+}
+
+TEST(TensorClass, AsType) {
+  Tensor t;
+  EXPECT_EQ(singa::kFloat32, t.data_type());
+  t.AsType(singa::kFloat16);
+  EXPECT_EQ(singa::kFloat16, t.data_type());
+}
+
+TEST(TensorClass, ToDevice) {
+  Tensor t(Shape{2,3});
+  EXPECT_EQ(singa::defaultDevice, t.device());
+  auto dev = std::make_shared<singa::CppCPU>();
+  t.ToDevice(dev);
+  EXPECT_NE(singa::defaultDevice, t.device());
+}
+
+TEST(TensorClass, CopyDataFromHostPtr) {
+  float data[] = {1.0f, 2.0f, 3.0f};
+  Tensor t(Shape{3});
+  t.CopyDataFromHostPtr(data, 3);
+  const float* dptr = static_cast<const float*>(t.block()->data());
+  EXPECT_FLOAT_EQ(1.0f, dptr[0]);
+  EXPECT_FLOAT_EQ(2.0f, dptr[1]);
+  EXPECT_FLOAT_EQ(3.0f, dptr[2]);
+}
+
+TEST(TensorClass, CopyData) {
+  float data[] = {1.0f, 2.0f, 3.0f};
+  Tensor t(Shape{3});
+  t.CopyDataFromHostPtr(data, 3);
+
+  Tensor o(Shape{3});
+  o.CopyData(t);
+  const float* dptr = static_cast<const float*>(o.block()->data());
+  EXPECT_FLOAT_EQ(1.0f, dptr[0]);
+  EXPECT_FLOAT_EQ(2.0f, dptr[1]);
+  EXPECT_FLOAT_EQ(3.0f, dptr[2]);
+}
+
+TEST(TensorClass, Clone) {
+  float data[] = {1.0f, 2.0f, 3.0f};
+  Tensor t(Shape{3});
+  t.CopyDataFromHostPtr(data, 3);
+
+  Tensor o = t.Clone();
+  const float* dptr = static_cast<const float*>(o.block()->data());
+  EXPECT_FLOAT_EQ(1.0f, dptr[0]);
+  EXPECT_FLOAT_EQ(2.0f, dptr[1]);
+  EXPECT_FLOAT_EQ(3.0f, dptr[2]);
+}
+
+TEST(TensorClass, T) {
+  Tensor t(Shape{2,3});
+  EXPECT_FALSE(t.transpose());
+  Tensor o = t.T();
+  EXPECT_EQ(true, o.transpose());
+  EXPECT_EQ(t.block(), o.block());
+  EXPECT_EQ(t.data_type(), o.data_type());
+  EXPECT_EQ(t.shape()[0],  o.shape()[1]);
+  EXPECT_EQ(t.shape()[1],  o.shape()[0]);
+}
+
diff --git a/test/singa/test_tensor_math.cc b/test/singa/test_tensor_math.cc
new file mode 100644
index 0000000..267905d
--- /dev/null
+++ b/test/singa/test_tensor_math.cc
@@ -0,0 +1,901 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "gtest/gtest.h"
+#include "singa/core/tensor.h"
+using singa::Tensor;
+using singa::Shape;
+using singa::Device;
+
+class TestTensorMath : public ::testing::Test {
+ protected:
+  virtual void SetUp() {
+    a.Reshape(singa::Shape{6});
+    b.Reshape(singa::Shape{6});
+    c.Reshape(singa::Shape{6, 1});
+    d.Reshape(singa::Shape{3, 2});
+    e.Reshape(singa::Shape{3, 2});
+
+    a.CopyDataFromHostPtr<float>(dat1, 6);
+    b.CopyDataFromHostPtr<float>(dat2, 6);
+    e.CopyDataFromHostPtr<float>(dat1, 6);
+  }
+  Tensor a, b, c, d, e;
+  const float dat1[6] = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f};
+  const float dat2[6] = {1.1f, 2.1f, 3.1f, 4.1f, 5.1f, 6.1f};
+};
+
+TEST_F(TestTensorMath, MemberAbs) {
+  Tensor aa = a.Clone();
+  Tensor bb = b.Clone();
+  Tensor cc = aa - bb;
+  const float *dptr = cc.data<float>();
+  EXPECT_NEAR(-0.1, dptr[0], 1e-5);
+  EXPECT_NEAR(-0.1, dptr[1], 1e-5);
+  EXPECT_NEAR(-0.1, dptr[2], 1e-5);
+
+  Tensor p = Abs(cc);
+  const float *dptr1 = p.data<float>();
+  EXPECT_NEAR(0.1, dptr1[0], 1e-5);
+  EXPECT_NEAR(0.1, dptr1[1], 1e-5);
+  EXPECT_NEAR(0.1, dptr1[2], 1e-5);
+}
+
+TEST_F(TestTensorMath, MemberExp) {
+  Tensor p = Exp(a);
+  const float *dptr1 = p.data<float>();
+  EXPECT_NEAR(exp(1.0f), dptr1[0], 1e-5);
+  EXPECT_NEAR(exp(2.0f), dptr1[1], 1e-5);
+  EXPECT_NEAR(exp(3.0f), dptr1[2], 1e-5);
+}
+
+TEST_F(TestTensorMath, MemberLog) {
+  Tensor p = Log(a);
+  const float *dptr1 = p.data<float>();
+  EXPECT_NEAR(log(1.0f), dptr1[0], 1e-5);
+  EXPECT_NEAR(log(2.0f), dptr1[1], 1e-5);
+  EXPECT_NEAR(log(3.0f), dptr1[2], 1e-5);
+}
+
+TEST_F(TestTensorMath, MemberReLU) {
+  Tensor aa = a.Clone();
+  Tensor cc = aa - 2.0f;
+  const float *dptr = cc.data<float>();
+  EXPECT_NEAR(-1.0f, dptr[0], 1e-5);
+  EXPECT_NEAR(0.0f, dptr[1], 1e-5);
+  EXPECT_NEAR(1.0f, dptr[2], 1e-5);
+
+  Tensor p = ReLU(cc);
+  const float *dptr1 = p.data<float>();
+  EXPECT_NEAR(0.0f, dptr1[0], 1e-5);
+  EXPECT_NEAR(0.0f, dptr1[1], 1e-5);
+  EXPECT_NEAR(1.0f, dptr1[2], 1e-5);
+}
+
+TEST_F(TestTensorMath, MemberSigmoid) {
+  Tensor p = Sigmoid(a);
+  const float *dptr1 = p.data<float>();
+  EXPECT_NEAR(1.0f / (1.0f + exp(-1.0f)), dptr1[0], 1e-5);
+  EXPECT_NEAR(1.0f / (1.0f + exp(-2.0f)), dptr1[1], 1e-5);
+  EXPECT_NEAR(1.0f / (1.0f + exp(-3.0f)), dptr1[2], 1e-5);
+}
+
+TEST_F(TestTensorMath, MemberSign) {
+  Tensor aa = a.Clone();
+  Tensor cc = aa - 2.0f;
+  const float *dptr = cc.data<float>();
+  EXPECT_NEAR(-1.0f, dptr[0], 1e-5);
+  EXPECT_NEAR(0.0f, dptr[1], 1e-5);
+  EXPECT_NEAR(1.0f, dptr[2], 1e-5);
+
+  Tensor p = Sign(cc);
+  const float *dptr1 = p.data<float>();
+  EXPECT_EQ(0.0f, dptr1[0]);
+  EXPECT_EQ(0.0f, dptr1[1]);
+  EXPECT_EQ(1.0f, dptr1[2]);
+}
+
+TEST_F(TestTensorMath, MemberSqrt) {
+  Tensor p = Sqrt(a);
+  const float *dptr1 = p.data<float>();
+  EXPECT_NEAR(sqrt(1.0), dptr1[0], 1e-5);
+  EXPECT_NEAR(sqrt(2.0), dptr1[1], 1e-5);
+  EXPECT_NEAR(sqrt(3.0), dptr1[2], 1e-5);
+}
+
+TEST_F(TestTensorMath, MemberSquare) {
+  Tensor p = Square(a);
+  const float *dptr1 = p.data<float>();
+  EXPECT_NEAR(1.0, dptr1[0], 1e-5);
+  EXPECT_NEAR(4.0, dptr1[1], 1e-5);
+  EXPECT_NEAR(9.0, dptr1[2], 1e-5);
+}
+
+TEST_F(TestTensorMath, MemberTanh) {
+  Tensor p = Tanh(a);
+  const float *dptr1 = p.data<float>();
+  EXPECT_NEAR(tanh(1.0), dptr1[0], 1e-5);
+  EXPECT_NEAR(tanh(2.0), dptr1[1], 1e-5);
+  EXPECT_NEAR(tanh(3.0), dptr1[2], 1e-5);
+}
+
+TEST_F(TestTensorMath, Sum) {
+  Tensor p1 = Sum(e, 0);
+  const float *dptr1 = p1.data<float>();
+  EXPECT_FLOAT_EQ(9.0f, dptr1[0]);
+  EXPECT_FLOAT_EQ(12.0f, dptr1[1]);
+
+  Tensor p2(Shape{3, 1});
+  p2 = Sum(e, 1);
+  const float *dptr2 = p2.data<float>();
+  EXPECT_FLOAT_EQ(3.0f, dptr2[0]);
+  EXPECT_FLOAT_EQ(7.0f, dptr2[1]);
+  EXPECT_FLOAT_EQ(11.0f, dptr2[2]);
+}
+
+TEST_F(TestTensorMath, SoftMax) {
+  Tensor p1 = SoftMax(Reshape(e, Shape{1, 6}));
+  const float *dptr1 = p1.data<float>();
+  float sum = 0;
+  for (int i = 0; i < 6; i++) sum += exp(i + 1);
+  EXPECT_NEAR(exp(1) / sum, dptr1[0], 1e-5);
+  EXPECT_NEAR(exp(3) / sum, dptr1[2], 1e-5);
+  EXPECT_NEAR(exp(5) / sum, dptr1[4], 1e-5);
+  EXPECT_NEAR(exp(2) / sum, dptr1[1], 1e-5);
+  EXPECT_NEAR(exp(4) / sum, dptr1[3], 1e-5);
+  EXPECT_NEAR(exp(6) / sum, dptr1[5], 1e-5);
+
+  Tensor p2 = SoftMax(e);
+  const float *dptr2 = p2.data<float>();
+  EXPECT_NEAR(exp(1) / (exp(1) + exp(2)), dptr2[0], 1e-5);
+  EXPECT_NEAR(exp(2) / (exp(1) + exp(2)), dptr2[1], 1e-5);
+}
+
+TEST_F(TestTensorMath, MemberLT) {
+  Tensor p1 = a < 2.0f;
+  const float *dptr1 = p1.data<float>();
+  EXPECT_FLOAT_EQ(1.0f, dptr1[0]);
+  EXPECT_FLOAT_EQ(0.0f, dptr1[1]);
+  EXPECT_FLOAT_EQ(0.0f, dptr1[2]);
+}
+
+TEST_F(TestTensorMath, MemberLE) {
+  Tensor p1 = a <= 2.0f;
+  const float *dptr1 = p1.data<float>();
+  EXPECT_FLOAT_EQ(1.0f, dptr1[0]);
+  EXPECT_FLOAT_EQ(1.0f, dptr1[1]);
+  EXPECT_FLOAT_EQ(0.0f, dptr1[2]);
+}
+
+TEST_F(TestTensorMath, MemberGT) {
+  Tensor p1 = a > 2.0f;
+  const float *dptr1 = p1.data<float>();
+  EXPECT_FLOAT_EQ(0.0f, dptr1[0]);
+  EXPECT_FLOAT_EQ(0.0f, dptr1[1]);
+  EXPECT_FLOAT_EQ(1.0f, dptr1[2]);
+}
+
+TEST_F(TestTensorMath, MemberGE) {
+  Tensor p1 = a >= 2.0f;
+  const float *dptr1 = p1.data<float>();
+  EXPECT_FLOAT_EQ(0.0f, dptr1[0]);
+  EXPECT_FLOAT_EQ(1.0f, dptr1[1]);
+  EXPECT_FLOAT_EQ(1.0f, dptr1[2]);
+}
+
+TEST_F(TestTensorMath, MemberPow) {
+  Tensor p1 = Pow(b, 3.0f);
+  const float *dptr1 = p1.data<float>();
+  EXPECT_FLOAT_EQ(pow(1.1f, 3.0f), dptr1[0]);
+  EXPECT_FLOAT_EQ(pow(2.1f, 3.0f), dptr1[1]);
+  EXPECT_FLOAT_EQ(pow(3.1f, 3.0f), dptr1[2]);
+
+  // TODO(Yuchen): check pow(tensor a, tensor b) and add testcase after the
+  // function is complete
+  // Tensor p2 = Pow(a,b);
+  // const float *dptr2 = p2.data<float>();
+  // EXPECT_FLOAT_EQ(pow(1.0f,1.1f), dptr2[0]);
+  // EXPECT_FLOAT_EQ(pow(2.0f,2.1f), dptr2[1]);
+  // EXPECT_FLOAT_EQ(pow(3.0f,3.1f), dptr2[2]);
+}
+
+TEST_F(TestTensorMath, MemberSub) {
+  Tensor p1 = a - b;
+  const float *dptr1 = p1.data<float>();
+  EXPECT_NEAR(-0.1, dptr1[0], 1e-5);
+  EXPECT_NEAR(-0.1, dptr1[1], 1e-5);
+  EXPECT_NEAR(-0.1, dptr1[2], 1e-5);
+}
+
+TEST_F(TestTensorMath, MemberEltwiseMult) {
+  Tensor p1 = a * b;
+  const float *dptr1 = p1.data<float>();
+  EXPECT_NEAR(1.0 * 1.1, dptr1[0], 1e-5);
+  EXPECT_NEAR(2.0 * 2.1, dptr1[1], 1e-5);
+  EXPECT_NEAR(3.0 * 3.1, dptr1[2], 1e-5);
+}
+
+TEST_F(TestTensorMath, MemberDiv) {
+  Tensor p1 = a / b;
+  const float *dptr1 = p1.data<float>();
+  EXPECT_NEAR(1.0 / 1.1, dptr1[0], 1e-5);
+  EXPECT_NEAR(2.0 / 2.1, dptr1[1], 1e-5);
+  EXPECT_NEAR(3.0 / 3.1, dptr1[2], 1e-5);
+
+  Tensor p2 = Div(10.0f, b);
+  const float *dptr2 = p2.data<float>();
+  EXPECT_NEAR(10.0 / 1.1, dptr2[0], 1e-5);
+  EXPECT_NEAR(10.0 / 2.1, dptr2[1], 1e-5);
+  EXPECT_NEAR(10.0 / 3.1, dptr2[2], 1e-5);
+
+  Tensor p3 = a / 8.0f;
+  const float *dptr3 = p3.data<float>();
+  EXPECT_NEAR(1.0 / 8.0, dptr3[0], 1e-5);
+  EXPECT_NEAR(2.0 / 8.0, dptr3[1], 1e-5);
+  EXPECT_NEAR(3.0 / 8.0, dptr3[2], 1e-5);
+}
+
+TEST_F(TestTensorMath, MemberBernoulli) {
+  Tensor p1(Shape{10000});
+  Bernoulli(0.3f, &p1);
+  const float *dptr1 = p1.data<float>();
+  float sum = 0;
+  for (int i = 0; i < 10000; i++) sum += dptr1[i];
+  float mean = sum / 10000;
+  EXPECT_NEAR(mean, 0.3f, 1e-2);
+
+  sum = 0;
+  for (int i = 0; i < 10000; i++) sum += (dptr1[i] - mean) * (dptr1[i] - mean);
+  float variance = sum / 9999;
+  EXPECT_NEAR(variance, 0.3 * 0.7, 1e-2);
+}
+
+TEST_F(TestTensorMath, MemberUniform) {
+  Tensor p1(Shape{10000});
+  Uniform(0.1f, 0.2f, &p1);
+  const float *dptr1 = p1.data<float>();
+  float sum = 0;
+  for (int i = 0; i < 10000; i++) sum += dptr1[i];
+  float mean = sum / 10000;
+  EXPECT_NEAR(mean, 0.15f, 1e-3);
+
+  sum = 0;
+  for (int i = 0; i < 10000; i++) sum += (dptr1[i] - mean) * (dptr1[i] - mean);
+  float variance = sum / 9999;
+  EXPECT_NEAR(variance, 0.01f / 12, 1e-3);
+}
+
+TEST_F(TestTensorMath, MemberGaussian) {
+  Tensor p1(Shape{50000});
+  Gaussian(0.0f, 1.0f, &p1);
+  const float *dptr1 = p1.data<float>();
+  float sum = 0;
+  for (int i = 0; i < 50000; i++) sum += dptr1[i];
+  float mean = sum / 50000;
+  EXPECT_NEAR(mean, 0.0, 1e-2);
+
+  sum = 0;
+  for (int i = 0; i < 50000; i++) sum += (dptr1[i] - mean) * (dptr1[i] - mean);
+  float variance = sum / 49999;
+  EXPECT_NEAR(variance, 1.0, 1e-2);
+}
+
+TEST_F(TestTensorMath, MemberAddTensor) {
+  Tensor aa = a.Clone();
+  aa += a;
+  const float *dptr = aa.data<float>();
+  EXPECT_FLOAT_EQ(2.0f, dptr[0]);
+  EXPECT_FLOAT_EQ(4.0f, dptr[1]);
+  EXPECT_FLOAT_EQ(6.0f, dptr[2]);
+
+  // check p is initialized to 0
+  Tensor p(Shape{6});
+  p += aa;
+  const float *dptr1 = p.data<float>();
+  EXPECT_FLOAT_EQ(2.0f, dptr1[0]);
+  EXPECT_FLOAT_EQ(4.0f, dptr1[1]);
+  EXPECT_FLOAT_EQ(6.0f, dptr1[2]);
+
+  a += b;
+  const float *dptr2 = a.data<float>();
+  EXPECT_FLOAT_EQ(2.1f, dptr2[0]);
+  EXPECT_FLOAT_EQ(4.1f, dptr2[1]);
+  EXPECT_FLOAT_EQ(6.1f, dptr2[2]);
+  EXPECT_FLOAT_EQ(12.1f, dptr2[5]);
+}
+
+TEST_F(TestTensorMath, AddTensors) {
+  Tensor ret(a.shape(), a.device(), a.data_type());
+  Add(a, b, &ret);
+  const float *dptr = ret.data<float>();
+  EXPECT_FLOAT_EQ(2.1f, dptr[0]);
+  EXPECT_FLOAT_EQ(4.1f, dptr[1]);
+  EXPECT_FLOAT_EQ(6.1f, dptr[2]);
+  EXPECT_FLOAT_EQ(12.1f, dptr[5]);
+
+  const Tensor d = a + b;
+  const float *dptr2 = d.data<float>();
+  EXPECT_FLOAT_EQ(2.1f, dptr2[0]);
+  EXPECT_FLOAT_EQ(4.1f, dptr2[1]);
+  EXPECT_FLOAT_EQ(6.1f, dptr2[2]);
+  EXPECT_FLOAT_EQ(12.1f, dptr2[5]);
+
+  Add(a, b, &a);
+  const float *dptr1 = a.data<float>();
+  EXPECT_FLOAT_EQ(2.1f, dptr1[0]);
+  EXPECT_FLOAT_EQ(4.1f, dptr1[1]);
+  EXPECT_FLOAT_EQ(6.1f, dptr1[2]);
+  EXPECT_FLOAT_EQ(12.1f, dptr1[5]);
+}
+
+TEST_F(TestTensorMath, SetValue) {
+  Tensor t(Shape{4});
+  t.SetValue(0.3f);
+  const float *ptr = t.data<float>();
+  for (int i = 0; i < 4; i++) EXPECT_FLOAT_EQ(ptr[i], 0.3f);
+}
+
+TEST_F(TestTensorMath, Reshape) {
+  Tensor t(Shape{4});
+  t.SetValue(0.3f);
+  Tensor p = Reshape(t, Shape{4, 1});
+  const float *ptr = t.data<float>();
+  EXPECT_EQ(p.shape(0), 4u);
+  EXPECT_EQ(p.shape(1), 1u);
+  for (int i = 0; i < 4; i++) EXPECT_FLOAT_EQ(ptr[i], 0.3f);
+}
+#ifdef USE_CBLAS
+TEST_F(TestTensorMath, L2Cpp) {
+  float l2 = a.L2();
+  float target = 0.0f;
+  for (size_t i = 0; i < a.Size(); i++) target += dat1[i] * dat1[i];
+  EXPECT_FLOAT_EQ(l2, sqrt(target) / a.Size());
+}
+TEST_F(TestTensorMath, MultCpp) {
+  const float x[4] = {1.0f, 2.0f, 3.0f, 4.0f};
+  Tensor t(Shape{2, 2});
+  t.CopyDataFromHostPtr(x, 4);
+  d.CopyDataFromHostPtr(dat1, 6);
+  Tensor C = Mult(d, t);
+  const float *xptr = C.data<float>();
+  for (int i = 0; i < 3; i++) {
+    for (int j = 0; j < 2; j++) {
+      float tmp = 0;
+      for (int k = 0; k < 2; k++) {
+        tmp += dat1[i * 2 + k] * x[k * 2 + j];
+      }
+      EXPECT_FLOAT_EQ(xptr[i * 2 + j], tmp);
+    }
+  }
+  const float y[8] = {1.0f, 2.0f, 3.0f, 4.0f, 1.1f, 2.1f, 3.1f, 4.1f};
+  Tensor s(Shape{4, 2});
+  s.CopyDataFromHostPtr(y, 8);
+  const float *sPtr = s.data<float>();
+  for (int i = 0; i < 8; i++) EXPECT_FLOAT_EQ(sPtr[i], y[i]);
+  Tensor D = Mult(d, s.T());
+  const float *DPtr = D.data<float>();
+  for (int i = 0; i < 3; i++) {
+    for (int j = 0; j < 4; j++) {
+      float tmp = 0;
+      for (int k = 0; k < 2; k++) {
+        tmp += dat1[i * 2 + k] * y[j * 2 + k];
+      }
+      EXPECT_FLOAT_EQ(DPtr[i * 4 + j], tmp);
+    }
+  }
+  Tensor p(Shape{4, 1});
+  p.CopyDataFromHostPtr(x, 4);
+  Tensor q(Shape{1, 4});
+  q.SetValue(1.0f);
+  Tensor o(Shape{4, 4});
+
+  Mult(p, q, &o);
+  const float *oPtr = o.data<float>();
+  for (int i = 0; i < 4; i++) {
+    for (int j = 0; j < 4; j++) {
+      EXPECT_FLOAT_EQ(oPtr[i * 4 + j], x[i]);
+    }
+  }
+}
+
+TEST_F(TestTensorMath, AddColumnCpp) {
+  const float x[3] = {1.0f, 2.0f, 3.0f};
+  Tensor t(Shape{3});
+  t.CopyDataFromHostPtr(x, 3);
+  d.CopyDataFromHostPtr(dat1, 6);
+  AddColumn(t, &d);
+  const float *xptr = d.data<float>();
+  for (int i = 0; i < 3; i++) {
+    for (int j = 0; j < 2; j++) {
+      EXPECT_FLOAT_EQ(xptr[i * 2 + j], dat1[i * 2 + j] + x[i]);
+    }
+  }
+}
+TEST_F(TestTensorMath, SubColumnCpp) {
+  const float x[3] = {1.0f, 2.0f, 3.0f};
+  Tensor t(Shape{3});
+  t.CopyDataFromHostPtr(x, 3);
+  d.CopyDataFromHostPtr(dat1, 6);
+  SubColumn(t, &d);
+  const float *xptr = d.data<float>();
+  for (int i = 0; i < 3; i++) {
+    for (int j = 0; j < 2; j++) {
+      EXPECT_FLOAT_EQ(xptr[i * 2 + j], dat1[i * 2 + j] - x[i]);
+    }
+  }
+}
+
+TEST_F(TestTensorMath, DivColumnCpp) {
+  const float x[3] = {1.0f, 2.0f, 3.0f};
+  Tensor t(Shape{3});
+  t.CopyDataFromHostPtr(x, 3);
+  d.CopyDataFromHostPtr(dat1, 6);
+  DivColumn(t, &d);
+  const float *xptr = d.data<float>();
+  for (int i = 0; i < 3; i++) {
+    for (int j = 0; j < 2; j++) {
+      EXPECT_FLOAT_EQ(xptr[i * 2 + j], dat1[i * 2 + j] / x[i]);
+    }
+  }
+}
+
+TEST_F(TestTensorMath, AddRowCpp) {
+  const float x[2] = {1.1f, 2.1f};
+  Tensor t(Shape{2});
+  t.CopyDataFromHostPtr(x, 2);
+  d.CopyDataFromHostPtr(dat1, 6);
+  AddRow(t, &d);
+  const float *xptr = d.data<float>();
+  for (int i = 0; i < 3; i++) {
+    for (int j = 0; j < 2; j++) {
+      EXPECT_FLOAT_EQ(xptr[i * 2 + j], dat1[i * 2 + j] + x[j]);
+    }
+  }
+}
+
+TEST_F(TestTensorMath, SubRowCpp) {
+  const float x[2] = {1.1f, 2.1f};
+  Tensor t(Shape{2});
+  t.CopyDataFromHostPtr(x, 2);
+  d.CopyDataFromHostPtr(dat1, 6);
+  SubRow(t, &d);
+  const float *xptr = d.data<float>();
+  for (int i = 0; i < 3; i++) {
+    for (int j = 0; j < 2; j++) {
+      EXPECT_FLOAT_EQ(xptr[i * 2 + j], dat1[i * 2 + j] - x[j]);
+    }
+  }
+}
+
+TEST_F(TestTensorMath, MultRowCpp) {
+  const float x[2] = {1.1f, 2.1f};
+  Tensor t(Shape{2});
+  t.CopyDataFromHostPtr(x, 2);
+  d.CopyDataFromHostPtr(dat1, 6);
+  MultRow(t, &d);
+  const float *xptr = d.data<float>();
+  for (int i = 0; i < 3; i++) {
+    for (int j = 0; j < 2; j++) {
+      EXPECT_FLOAT_EQ(xptr[i * 2 + j], dat1[i * 2 + j] * x[j]);
+    }
+  }
+}
+
+TEST_F(TestTensorMath, SumRowsCpp) {
+  Tensor t(Shape{2});
+  float dat[6];
+  for (int i = 0; i < 6; i ++)
+    dat[i] = (float)rand()/(float)(RAND_MAX/ 10);
+  d.CopyDataFromHostPtr(dat, 6);
+  SumRows(d, &t);
+  const float *tptr = t.data<float>();
+  for (int i = 0; i < 2; i++) {
+    float tmp = 0;
+    for (int j = 0; j < 3; j++) {
+      tmp += dat[j * 2 + i];
+    }
+    EXPECT_FLOAT_EQ(tptr[i], tmp);
+  }
+}
+
+TEST_F(TestTensorMath, SumColumnsCpp) {
+  Tensor t(Shape{3});
+  d.CopyDataFromHostPtr(dat1, 6);
+  SumColumns(d, &t);
+  const float *tptr = t.data<float>();
+  for (int i = 0; i < 3; i++) {
+    float tmp = 0;
+    for (int j = 0; j < 2; j++) {
+      tmp += dat1[i * 2 + j];
+    }
+    EXPECT_FLOAT_EQ(tptr[i], tmp);
+  }
+}
+#endif
+#ifdef USE_CUDA
+TEST_F(TestTensorMath, L2Cuda) {
+  auto dev = std::make_shared<singa::CudaGPU>();
+  Tensor t(Shape{3, 2}, dev);
+  t.CopyDataFromHostPtr(dat1, 6);
+  float l2 = t.L2();
+  float target = 0.0f;
+  for (size_t i = 0; i < t.Size(); i++) target += dat1[i] * dat1[i];
+  EXPECT_FLOAT_EQ(l2, sqrt(target) / t.Size());
+}
+TEST_F(TestTensorMath, MultCuda) {
+  const float x[4] = {1.0f, 2.0f, 3.0f, 4.0f};
+  auto dev = std::make_shared<singa::CudaGPU>();
+  Tensor t(Shape{2, 2}, dev);
+  t.CopyDataFromHostPtr(x, 4);
+  d.ToDevice(dev);
+  d.CopyDataFromHostPtr(dat1, 6);
+  Tensor C = Mult(d, t);
+  C.ToHost();
+  const float *xptr = C.data<float>();
+  for (int i = 0; i < 3; i++) {
+    for (int j = 0; j < 2; j++) {
+      float tmp = 0;
+      for (int k = 0; k < 2; k++) {
+        tmp += dat1[i * 2 + k] * x[k * 2 + j];
+      }
+      EXPECT_FLOAT_EQ(xptr[i * 2 + j], tmp);
+    }
+  }
+
+  const float y[8] = {1.0f, 2.0f, 3.0f, 4.0f, 1.1f, 2.1f, 3.1f, 4.1f};
+  Tensor s(Shape{4, 2}, dev);
+  s.CopyDataFromHostPtr(y, 8);
+  Tensor D = Mult(d, s.T());
+  D.ToHost();
+  const float *DPtr = D.data<float>();
+  for (int i = 0; i < 3; i++) {
+    for (int j = 0; j < 4; j++) {
+      float tmp = 0;
+      for (int k = 0; k < 2; k++) {
+        tmp += dat1[i * 2 + k] * y[j * 2 + k];
+      }
+      EXPECT_FLOAT_EQ(DPtr[i * 4 + j], tmp);
+    }
+  }
+  Tensor p(Shape{4, 1}, dev);
+  p.CopyDataFromHostPtr(x, 4);
+  Tensor q(Shape{1, 4}, dev);
+  q.SetValue(1.0f);
+  Tensor o(Shape{4, 4}, dev);
+
+  Mult(p, q, &o);
+  o.ToHost();
+  const float *oPtr = o.data<float>();
+  for (int i = 0; i < 4; i++) {
+    for (int j = 0; j < 4; j++) {
+      EXPECT_FLOAT_EQ(oPtr[i * 4 + j], x[i]);
+    }
+  }
+  d.ToHost();
+  p.ToHost();
+}
+
+TEST_F(TestTensorMath, AddColumnCuda) {
+  const float x[3] = {1.0f, 2.0f, 3.0f};
+  auto dev = std::make_shared<singa::CudaGPU>();
+  Tensor t(Shape{3}, dev);
+  t.CopyDataFromHostPtr(x, 3);
+  d.CopyDataFromHostPtr(dat1, 6);
+  d.ToDevice(dev);
+  AddColumn(t, &d);
+  d.ToHost();
+  const float *xptr = d.data<float>();
+  for (int i = 0; i < 3; i++) {
+    for (int j = 0; j < 2; j++) {
+      EXPECT_FLOAT_EQ(xptr[i * 2 + j], dat1[i * 2 + j] + x[i]);
+    }
+  }
+}
+
+TEST_F(TestTensorMath, SubColumnCuda) {
+  const float x[3] = {1.0f, 2.0f, 3.0f};
+  auto dev = std::make_shared<singa::CudaGPU>();
+  Tensor t(Shape{3}, dev);
+  t.CopyDataFromHostPtr(x, 3);
+  d.CopyDataFromHostPtr(dat1, 6);
+  d.ToDevice(dev);
+  SubColumn(t, &d);
+  d.ToHost();
+  const float *xptr = d.data<float>();
+  for (int i = 0; i < 3; i++) {
+    for (int j = 0; j < 2; j++) {
+      EXPECT_FLOAT_EQ(xptr[i * 2 + j], dat1[i * 2 + j] - x[i]);
+    }
+  }
+}
+#endif
+TEST_F(TestTensorMath, MultColumnCpp) {
+  const float x[3] = {1.0f, 2.0f, 3.0f};
+  Tensor t(Shape{3});
+  t.CopyDataFromHostPtr(x, 3);
+  d.CopyDataFromHostPtr(dat1, 6);
+  MultColumn(t, &d);
+  const float *xptr = d.data<float>();
+  for (int i = 0; i < 3; i++) {
+    for (int j = 0; j < 2; j++) {
+      EXPECT_FLOAT_EQ(xptr[i * 2 + j], dat1[i * 2 + j] * x[i]);
+    }
+  }
+}
+#ifdef USE_CUDA
+TEST_F(TestTensorMath, MultColumnCuda) {
+  const float x[3] = {1.0f, 2.0f, 3.0f};
+  auto dev = std::make_shared<singa::CudaGPU>();
+  Tensor t(Shape{3}, dev);
+  t.CopyDataFromHostPtr(x, 3);
+  d.CopyDataFromHostPtr(dat1, 6);
+  d.ToDevice(dev);
+  MultColumn(t, &d);
+  d.ToHost();
+  const float *xptr = d.data<float>();
+  for (int i = 0; i < 3; i++) {
+    for (int j = 0; j < 2; j++) {
+      EXPECT_FLOAT_EQ(xptr[i * 2 + j], dat1[i * 2 + j] * x[i]);
+    }
+  }
+}
+TEST_F(TestTensorMath, DivColumnCuda) {
+  const float x[3] = {1.0f, 2.0f, 3.0f};
+  auto dev = std::make_shared<singa::CudaGPU>();
+  Tensor t(Shape{3}, dev);
+  t.CopyDataFromHostPtr(x, 3);
+  d.CopyDataFromHostPtr(dat1, 6);
+  d.ToDevice(dev);
+  DivColumn(t, &d);
+  d.ToHost();
+  const float *xptr = d.data<float>();
+  for (int i = 0; i < 3; i++) {
+    for (int j = 0; j < 2; j++) {
+      EXPECT_FLOAT_EQ(xptr[i * 2 + j], dat1[i * 2 + j] / x[i]);
+    }
+  }
+}
+TEST_F(TestTensorMath, AddRowCuda) {
+  const float x[2] = {1.1f, 2.1f};
+  auto dev = std::make_shared<singa::CudaGPU>();
+  Tensor t(Shape{2}, dev);
+  t.CopyDataFromHostPtr(x, 2);
+  d.CopyDataFromHostPtr(dat1, 6);
+  d.ToDevice(dev);
+  AddRow(t, &d);
+  d.ToHost();
+  const float *xptr = d.data<float>();
+  for (int i = 0; i < 3; i++) {
+    for (int j = 0; j < 2; j++) {
+      EXPECT_FLOAT_EQ(xptr[i * 2 + j], dat1[i * 2 + j] + x[j]);
+    }
+  }
+}
+TEST_F(TestTensorMath, SubRowCuda) {
+  const float x[2] = {1.1f, 2.1f};
+  auto dev = std::make_shared<singa::CudaGPU>();
+  Tensor t(Shape{2}, dev);
+  t.CopyDataFromHostPtr(x, 2);
+  d.CopyDataFromHostPtr(dat1, 6);
+  d.ToDevice(dev);
+  SubRow(t, &d);
+  d.ToHost();
+  const float *xptr = d.data<float>();
+  for (int i = 0; i < 3; i++) {
+    for (int j = 0; j < 2; j++) {
+      EXPECT_FLOAT_EQ(xptr[i * 2 + j], dat1[i * 2 + j] - x[j]);
+    }
+  }
+}
+TEST_F(TestTensorMath, MultRowCuda) {
+  const float x[2] = {1.1f, 2.1f};
+  auto dev = std::make_shared<singa::CudaGPU>();
+  Tensor t(Shape{2}, dev);
+  t.CopyDataFromHostPtr(x, 2);
+  d.CopyDataFromHostPtr(dat1, 6);
+  d.ToDevice(dev);
+  MultRow(t, &d);
+  d.ToHost();
+  const float *xptr = d.data<float>();
+  for (int i = 0; i < 3; i++) {
+    for (int j = 0; j < 2; j++) {
+      EXPECT_FLOAT_EQ(xptr[i * 2 + j], dat1[i * 2 + j] * x[j]);
+    }
+  }
+}
+#endif
+TEST_F(TestTensorMath, DivRowCpp) {
+  const float x[2] = {1.1f, 2.1f};
+  Tensor t(Shape{2});
+  t.CopyDataFromHostPtr(x, 2);
+  d.CopyDataFromHostPtr(dat1, 6);
+  DivRow(t, &d);
+  const float *xptr = d.data<float>();
+  for (int i = 0; i < 3; i++) {
+    for (int j = 0; j < 2; j++) {
+      EXPECT_FLOAT_EQ(xptr[i * 2 + j], dat1[i * 2 + j] / x[j]);
+    }
+  }
+}
+#ifdef USE_CUDA
+TEST_F(TestTensorMath, DivRowCuda) {
+  const float x[2] = {1.1f, 2.1f};
+  auto dev = std::make_shared<singa::CudaGPU>();
+  Tensor t(Shape{2}, dev);
+  t.CopyDataFromHostPtr(x, 2);
+  d.CopyDataFromHostPtr(dat1, 6);
+  d.ToDevice(dev);
+  DivRow(t, &d);
+  d.ToHost();
+  const float *xptr = d.data<float>();
+  for (int i = 0; i < 3; i++) {
+    for (int j = 0; j < 2; j++) {
+      EXPECT_FLOAT_EQ(xptr[i * 2 + j], dat1[i * 2 + j] / x[j]);
+    }
+  }
+}
+TEST_F(TestTensorMath, SumRowsCuda) {
+  auto dev = std::make_shared<singa::CudaGPU>();
+  Tensor t(Shape{2}, dev);
+  d.CopyDataFromHostPtr(dat1, 6);
+  d.ToDevice(dev);
+  SumRows(d, &t);
+  t.ToHost();
+  const float *tptr = t.data<float>();
+  for (int i = 0; i < 2; i++) {
+    float tmp = 0;
+    for (int j = 0; j < 3; j++) {
+      tmp += dat1[j * 2 + i];
+    }
+    EXPECT_FLOAT_EQ(tptr[i], tmp);
+  }
+  d.ToHost();
+}
+TEST_F(TestTensorMath, SumColumnCuda) {
+  auto dev = std::make_shared<singa::CudaGPU>();
+  Tensor t(Shape{3}, dev);
+  d.CopyDataFromHostPtr(dat1, 6);
+  d.ToDevice(dev);
+  SumColumns(d, &t);
+  t.ToHost();
+  const float *tptr = t.data<float>();
+  for (int i = 0; i < 3; i++) {
+    float tmp = 0;
+    for (int j = 0; j < 2; j++) {
+      tmp += dat1[i * 2 + j];
+    }
+    EXPECT_FLOAT_EQ(tptr[i], tmp);
+  }
+  d.ToHost();
+}
+
+#endif
+
+TEST_F(TestTensorMath, ConcatenateRowsCpp) {
+  d.CopyDataFromHostPtr<float>(dat1, 6);
+  e.CopyDataFromHostPtr<float>(dat2, 6);
+  const auto ret = singa::ConcatenateRows(vector<Tensor>{d, e});
+  EXPECT_EQ(ret.shape(0), d.shape(0) + e.shape(0));
+  EXPECT_EQ(ret.shape(1), d.shape(1));
+  const float *retPtr = ret.data<float>();
+  for (int i = 0; i < 6; i++) EXPECT_FLOAT_EQ(retPtr[i], dat1[i]);
+  for (int i = 0; i < 6; i++) EXPECT_FLOAT_EQ(retPtr[i + 6], dat2[i]);
+}
+
+TEST_F(TestTensorMath, ConcatenateColumnsCpp) {
+  d.CopyDataFromHostPtr<float>(dat1, 6);
+  e.CopyDataFromHostPtr<float>(dat2, 6);
+  const auto ret = singa::ConcatenateColumns(vector<Tensor>{d, e});
+  EXPECT_EQ(ret.shape(0), d.shape(0));
+  EXPECT_EQ(ret.shape(1), d.shape(1) + e.shape(1));
+
+  const float *retPtr = ret.data<float>();
+  for (int i = 0; i < 3; i++) {
+    for (int j = 0; j < 2; j++)
+      EXPECT_FLOAT_EQ(retPtr[i * 4 + j], dat1[i * 2 + j]);
+    for (int j = 0; j < 2; j++)
+      EXPECT_FLOAT_EQ(retPtr[i * 4 + 2 + j], dat2[i * 2 + j]);
+  }
+}
+
+TEST_F(TestTensorMath, CopyRowsCpp) {
+  const auto ret = singa::CopyRows(e, 1, 2);
+  EXPECT_EQ(ret.shape(0), 1u);
+  EXPECT_EQ(ret.shape(1), e.shape(1));
+  const float *retPtr = ret.data<float>();
+  for (size_t i = 0; i < ret.Size(); i++)
+    EXPECT_FLOAT_EQ(retPtr[i], dat1[1 * 2 + i]);
+}
+
+TEST_F(TestTensorMath, CopyColumnsCpp) {
+  a.Reshape(Shape{2, 3});
+  const auto ret = singa::CopyColumns(a, 1, 3);
+  EXPECT_EQ(ret.shape(0), a.shape(0));
+  EXPECT_EQ(ret.shape(1), 2u);
+  const float *retPtr = ret.data<float>();
+  for (size_t i = 0; i < ret.shape(0); i++)
+    for (size_t j = 0; j < ret.shape(1); j++)
+      EXPECT_FLOAT_EQ(retPtr[i * ret.shape(1) + j],
+                      dat1[i * a.shape(1) + j + 1]);
+}
+
+#ifdef USE_CUDA
+
+TEST_F(TestTensorMath, ConcatenateRowsCuda) {
+  auto dev = std::make_shared<singa::CudaGPU>();
+  d.ToDevice(dev);
+  e.ToDevice(dev);
+  d.CopyDataFromHostPtr<float>(dat1, 6);
+  e.CopyDataFromHostPtr<float>(dat2, 6);
+  auto ret = singa::ConcatenateRows(vector<Tensor>{d, e});
+  EXPECT_EQ(ret.shape(0), d.shape(0) + e.shape(0));
+  EXPECT_EQ(ret.shape(1), d.shape(1));
+  ret.ToHost();
+  const float *retPtr = ret.data<float>();
+  for (int i = 0; i < 6; i++) EXPECT_FLOAT_EQ(retPtr[i], dat1[i]);
+  for (int i = 0; i < 6; i++) EXPECT_FLOAT_EQ(retPtr[i + 6], dat2[i]);
+}
+
+TEST_F(TestTensorMath, ConcatenateColumnsCuda) {
+  auto dev = std::make_shared<singa::CudaGPU>();
+  d.ToDevice(dev);
+  e.ToDevice(dev);
+  d.CopyDataFromHostPtr<float>(dat1, 6);
+  e.CopyDataFromHostPtr<float>(dat2, 6);
+  auto ret = singa::ConcatenateColumns(vector<Tensor>{d, e});
+  ret.ToHost();
+  EXPECT_EQ(ret.shape(0), d.shape(0));
+  EXPECT_EQ(ret.shape(1), d.shape(1) + e.shape(1));
+
+  const float *retPtr = ret.data<float>();
+  for (int i = 0; i < 3; i++) {
+    for (int j = 0; j < 2; j++)
+      EXPECT_FLOAT_EQ(retPtr[i * 4 + j], dat1[i * 2 + j]);
+    for (int j = 0; j < 2; j++)
+      EXPECT_FLOAT_EQ(retPtr[i * 4 + 2 + j], dat2[i * 2 + j]);
+  }
+}
+
+TEST_F(TestTensorMath, CopyRowsCuda) {
+  auto dev = std::make_shared<singa::CudaGPU>();
+  e.ToDevice(dev);
+  auto ret = singa::CopyRows(e, 1, 2);
+  ret.ToHost();
+  EXPECT_EQ(ret.shape(0), 1u);
+  EXPECT_EQ(ret.shape(1), e.shape(1));
+  const float *retPtr = ret.data<float>();
+  for (size_t i = 0; i < ret.Size(); i++)
+    EXPECT_FLOAT_EQ(retPtr[i], dat1[1 * 2 + i]);
+}
+
+TEST_F(TestTensorMath, CopyColumnsCuda) {
+  auto dev = std::make_shared<singa::CudaGPU>();
+  a.Reshape(Shape{2, 3});
+  a.ToDevice(dev);
+  auto ret = singa::CopyColumns(a, 1, 3);
+  EXPECT_EQ(ret.shape(0), a.shape(0));
+  EXPECT_EQ(ret.shape(1), 2u);
+  ret.ToHost();
+  const float *retPtr = ret.data<float>();
+  for (size_t i = 0; i < ret.shape(0); i++)
+    for (size_t j = 0; j < ret.shape(1); j++)
+      EXPECT_FLOAT_EQ(retPtr[i * ret.shape(1) + j],
+                      dat1[i * a.shape(1) + j + 1]);
+}
+
+#endif
diff --git a/test/singa/test_textfile_rw.cc b/test/singa/test_textfile_rw.cc
new file mode 100644
index 0000000..c436478
--- /dev/null
+++ b/test/singa/test_textfile_rw.cc
@@ -0,0 +1,133 @@
+/************************************************************
+*
+* Licensed to the Apache Software Foundation (ASF) under one
+* or more contributor license agreements.  See the NOTICE file
+* distributed with this work for additional information
+* regarding copyright ownership.  The ASF licenses this file
+* to you under the Apache License, Version 2.0 (the
+* "License"); you may not use this file except in compliance
+* with the License.  You may obtain a copy of the License at
+*
+*   http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing,
+* software distributed under the License is distributed on an
+* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+* KIND, either express or implied.  See the License for the
+* specific language governing permissions and limitations
+* under the License.
+*
+*************************************************************/
+
+#include "../include/singa/io/reader.h"
+#include "../include/singa/io/writer.h"
+#include "gtest/gtest.h"
+
+const char* path_csv = "./textfile_test.csv";
+using singa::io::TextFileReader;
+using singa::io::TextFileWriter;
+TEST(TextFileWriter, Create) {
+  TextFileWriter writer;
+  bool ret;
+  ret = writer.Open(path_csv, singa::io::kCreate);
+  EXPECT_EQ(true, ret);
+
+  std::string key = "";
+  std::string value = "This is a test for binfile io.";
+  ret = writer.Write(key, value);
+  EXPECT_EQ(true, ret);
+
+  ret = writer.Write(key, value);
+  EXPECT_EQ(true, ret);
+
+  writer.Flush();
+  writer.Close();
+}
+
+TEST(TextFileWriter, Append) {
+  TextFileWriter writer;
+  bool ret;
+  ret = writer.Open(path_csv, singa::io::kAppend);
+  EXPECT_EQ(true, ret);
+
+  std::string key = "1";
+  std::string value = "This is another test for binfile io.";
+  ret = writer.Write(key, value);
+  EXPECT_EQ(true, ret);
+
+  key = "2";
+  value = "This is another test for binfile io.";
+  ret = writer.Write(key, value);
+  EXPECT_EQ(true, ret);
+
+  writer.Flush();
+  writer.Close();
+}
+
+TEST(TextFileReader, Read) {
+  TextFileReader reader;
+  bool ret;
+  ret = reader.Open(path_csv);
+  EXPECT_EQ(true, ret);
+
+  int cnt = reader.Count();
+  EXPECT_EQ(4, cnt);
+
+  std::string key, value;
+  reader.Read(&key, &value);
+  EXPECT_STREQ("0", key.c_str());
+  EXPECT_STREQ("This is a test for binfile io.", value.c_str());
+
+  reader.Read(&key, &value);
+  EXPECT_STREQ("1", key.c_str());
+  EXPECT_STREQ("This is a test for binfile io.", value.c_str());
+
+  reader.Read(&key, &value);
+  EXPECT_STREQ("2", key.c_str());
+  EXPECT_STREQ("This is another test for binfile io.", value.c_str());
+
+  reader.Read(&key, &value);
+  EXPECT_STREQ("3", key.c_str());
+  EXPECT_STREQ("This is another test for binfile io.", value.c_str());
+
+  reader.Close();
+}
+
+TEST(TextFileReader, SeekToFirst) {
+  TextFileReader reader;
+  bool ret;
+  ret = reader.Open(path_csv);
+  EXPECT_EQ(true, ret);
+
+  int cnt = reader.Count();
+  EXPECT_EQ(4, cnt);
+
+  std::string key, value;
+  reader.Read(&key, &value);
+  EXPECT_STREQ("0", key.c_str());
+  EXPECT_STREQ("This is a test for binfile io.", value.c_str());
+
+  reader.Read(&key, &value);
+  EXPECT_STREQ("1", key.c_str());
+  EXPECT_STREQ("This is a test for binfile io.", value.c_str());
+
+  reader.SeekToFirst();
+  reader.Read(&key, &value);
+  EXPECT_STREQ("0", key.c_str());
+  EXPECT_STREQ("This is a test for binfile io.", value.c_str());
+
+  reader.Read(&key, &value);
+  EXPECT_STREQ("1", key.c_str());
+  EXPECT_STREQ("This is a test for binfile io.", value.c_str());
+
+  reader.Read(&key, &value);
+  EXPECT_STREQ("2", key.c_str());
+  EXPECT_STREQ("This is another test for binfile io.", value.c_str());
+
+  reader.Read(&key, &value);
+  EXPECT_STREQ("3", key.c_str());
+  EXPECT_STREQ("This is another test for binfile io.", value.c_str());
+
+  reader.Close();
+  remove(path_csv);
+}
diff --git a/test/singa/test_timer.cc b/test/singa/test_timer.cc
new file mode 100644
index 0000000..76d3597
--- /dev/null
+++ b/test/singa/test_timer.cc
@@ -0,0 +1,30 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "gtest/gtest.h"
+#include "singa/utils/timer.h"
+
+#include <chrono>
+#include <thread>
+
+TEST(TimerTest, TestTick) {
+  singa::Timer t;
+  std::this_thread::sleep_for(std::chrono::milliseconds(1000));
+  int time = t.Elapsed<singa::Timer::Milliseconds>();
+  EXPECT_GE(time, 1000);
+}
diff --git a/tool/python/README.md b/tool/python/README.md
deleted file mode 100644
index 8c90cfe..0000000
--- a/tool/python/README.md
+++ /dev/null
@@ -1,375 +0,0 @@
-# Python Binding
-
----
-
-Python binding provides APIs for configuring a training job following
-[keras](http://keras.io/), including the configuration of neural net, training
-algorithm, etc.  It replaces the configuration file (e.g., *job.conf*) in
-protobuf format, which is typically long and error-prone to prepare. In later
-version, we will add python functions to interact with the layer and neural net
-objects, which would enable users to train and debug their models
-interactively.
-
-Here is the layout of python related code,
-
-    SINGAROOT/tool/python
-    |-- pb2 (has job_pb2.py)
-    |-- singa
-        |-- model.py
-        |-- layer.py
-        |-- parameter.py
-        |-- initialization.py
-        |-- utils
-            |-- utility.py
-            |-- message.py
-    |-- examples
-        |-- cifar10_cnn.py, mnist_mlp.py, , mnist_rbm1.py, mnist_ae.py, etc.
-        |-- datasets
-            |-- cifar10.py
-            |-- mnist.py
-
-## Compiling and running instructions
-
-In order to use the Python APIs, users need to add the following arguments when compiling
-SINGA,
-
-    ./configure --enable-python --with-python=PYTHON_DIR
-    make
-
-where PYTHON_DIR has Python.h
-
-
-The training program is launched by
-
-    bin/singa-run.sh -exec <user_main.py>
-
-where user_main.py creates the JobProto object and passes it to Driver::Train to
-start the training.
-
-For example,
-
-    cd SINGAROOT
-    bin/singa-run.sh -exec tool/python/examples/cifar10_cnn.py
-
-
-
-## Examples
-
-
-### MLP Example
-
-This example uses python APIs to configure and train a MLP model over the MNIST
-dataset. The configuration content is the same as that written in *SINGAROOT/examples/mnist/job.conf*.
-
-```
-X_train, X_test, workspace = mnist.load_data()
-
-m = Sequential('mlp', sys.argv)
-
-m.add(Dense(2500, init='uniform', activation='tanh'))
-m.add(Dense(2000, init='uniform', activation='tanh'))
-m.add(Dense(1500, init='uniform', activation='tanh'))
-m.add(Dense(1000, init='uniform', activation='tanh'))
-m.add(Dense(500,  init='uniform', activation='tanh'))
-m.add(Dense(10, init='uniform', activation='softmax'))
-
-sgd = SGD(lr=0.001, lr_type='step')
-topo = Cluster(workspace)
-m.compile(loss='categorical_crossentropy', optimizer=sgd, cluster=topo)
-m.fit(X_train, nb_epoch=1000, with_test=True)
-result = m.evaluate(X_test, batch_size=100, test_steps=10, test_freq=60)
-```
-
-### CNN Example
-
-This example uses python APIs to configure and train a CNN model over the Cifar10
-dataset. The configuration content is the same as that written in *SINGAROOT/examples/cifar10/job.conf*.
-
-
-```
-X_train, X_test, workspace = cifar10.load_data()
-
-m = Sequential('cnn', sys.argv)
-
-m.add(Convolution2D(32, 5, 1, 2, w_std=0.0001, b_lr=2))
-m.add(MaxPooling2D(pool_size=(3,3), stride=2))
-m.add(Activation('relu'))
-m.add(LRN2D(3, alpha=0.00005, beta=0.75))
-
-m.add(Convolution2D(32, 5, 1, 2, b_lr=2))
-m.add(Activation('relu'))
-m.add(AvgPooling2D(pool_size=(3,3), stride=2))
-m.add(LRN2D(3, alpha=0.00005, beta=0.75))
-
-m.add(Convolution2D(64, 5, 1, 2))
-m.add(Activation('relu'))
-m.add(AvgPooling2D(pool_size=(3,3), stride=2))
-
-m.add(Dense(10, w_wd=250, b_lr=2, b_wd=0, activation='softmax'))
-
-sgd = SGD(decay=0.004, lr_type='manual', step=(0,60000,65000), step_lr=(0.001,0.0001,0.00001))
-topo = Cluster(workspace)
-m.compile(updater=sgd, cluster=topo)
-m.fit(X_train, nb_epoch=1000, with_test=True)
-result = m.evaluate(X_test, 1000, test_steps=30, test_freq=300)
-```
-
-
-### RBM Example
-
-This example uses python APIs to configure and train a RBM model over the MNIST
-dataset. The configuration content is the same as that written in *SINGAROOT/examples/rbm*.conf*.
-
-```
-rbmid = 3
-X_train, X_test, workspace = mnist.load_data(nb_rbm=rbmid)
-m = Energy('rbm'+str(rbmid), sys.argv)
-
-out_dim = [1000, 500, 250]
-m.add(RBM(out_dim, w_std=0.1, b_wd=0))
-
-sgd = SGD(lr=0.1, decay=0.0002, momentum=0.8)
-topo = Cluster(workspace)
-m.compile(optimizer=sgd, cluster=topo)
-m.fit(X_train, alg='cd', nb_epoch=6000)
-```
-
-### AutoEncoder Example
-This example uses python APIs to configure and train an autoencoder model over
-the MNIST dataset. The configuration content is the same as that written in
-*SINGAROOT/examples/autoencoder.conf*.
-
-
-```
-rbmid = 4
-X_train, X_test, workspace = mnist.load_data(nb_rbm=rbmid+1)
-m = Sequential('autoencoder', sys.argv)
-
-hid_dim = [1000, 500, 250, 30]
-m.add(Autoencoder(hid_dim, out_dim=784, activation='sigmoid', param_share=True))
-
-agd = AdaGrad(lr=0.01)
-topo = Cluster(workspace)
-m.compile(loss='mean_squared_error', optimizer=agd, cluster=topo)
-m.fit(X_train, alg='bp', nb_epoch=12200)
-```
-
-### To run SINGA on GPU
-
-Users need to set a list of gpu ids to `device` field in fit() or evaluate().
-The number of GPUs must be the same to the number of workers configured for
-cluster topology.
-
-
-```
-gpu_id = [0]
-m.fit(X_train, nb_epoch=100, with_test=True, device=gpu_id)
-```
-
-### TIPS
-
-Hidden layers for MLP can be configured as
-
-```
-for n in [2500, 2000, 1500, 1000, 500]:
-  m.add(Dense(n, init='uniform', activation='tanh'))
-m.add(Dense(10, init='uniform', activation='softmax'))
-```
-
-Activation layer can be specified separately
-
-```
-m.add(Dense(2500, init='uniform'))
-m.add(Activation('tanh'))
-```
-
-Users can explicitly specify hyper-parameters of weight and bias
-
-```
-par = Parameter(init='uniform', scale=0.05)
-m.add(Dense(2500, w_param=par, b_param=par, activation='tanh'))
-m.add(Dense(2000, w_param=par, b_param=par, activation='tanh'))
-m.add(Dense(1500, w_param=par, b_param=par, activation='tanh'))
-m.add(Dense(1000, w_param=par, b_param=par, activation='tanh'))
-m.add(Dense(500, w_param=par, b_param=par, activation='tanh'))
-m.add(Dense(10, w_param=par, b_param=par, activation='softmax'))
-```
-
-
-```
-parw = Parameter(init='gauss', std=0.0001)
-parb = Parameter(init='const', value=0)
-m.add(Convolution(32, 5, 1, 2, w_param=parw, b_param=parb, b_lr=2))
-m.add(MaxPooling2D(pool_size(3,3), stride=2))
-m.add(Activation('relu'))
-m.add(LRN2D(3, alpha=0.00005, beta=0.75))
-
-parw.update(std=0.01)
-m.add(Convolution(32, 5, 1, 2, w_param=parw, b_param=parb))
-m.add(Activation('relu'))
-m.add(AvgPooling2D(pool_size(3,3), stride=2))
-m.add(LRN2D(3, alpha=0.00005, beta=0.75))
-
-m.add(Convolution(64, 5, 1, 2, w_param=parw, b_param=parb, b_lr=1))
-m.add(Activation('relu'))
-m.add(AvgPooling2D(pool_size(3,3), stride=2))
-
-m.add(Dense(10, w_param=parw, w_wd=250, b_param=parb, b_lr=2, b_wd=0, activation='softmax'))
-```
-
-
-Data can be added in this way,
-
-```
-X_train, X_test = mnist.load_data()  // parameter values are set in load_data()
-m.fit(X_train, ...)                  // Data layer for training is added
-m.evaluate(X_test, ...)              // Data layer for testing is added
-```
-or this way,
-
-```
-X_train, X_test = mnist.load_data()  // parameter values are set in load_data()
-m.add(X_train)                       // explicitly add Data layer
-m.add(X_test)                        // explicitly add Data layer
-```
-
-
-```
-store = Store(path='train.bin', batch_size=64, ...)        // parameter values are set explicitly
-m.add(Data(load='recordinput', phase='train', conf=store)) // Data layer is added
-store = Store(path='test.bin', batch_size=100, ...)        // parameter values are set explicitly
-m.add(Data(load='recordinput', phase='test', conf=store))  // Data layer is added
-```
-
-
-### Cases to run SINGA
-
-(1) Run SINGA for training
-
-```
-m.fit(X_train, nb_epoch=1000)
-```
-
-(2) Run SINGA for training and validation
-
-```
-m.fit(X_train, validate_data=X_valid, nb_epoch=1000)
-```
-
-(3) Run SINGA for test while training
-
-```
-m.fit(X_train, nb_epoch=1000, with_test=True)
-result = m.evaluate(X_test, batch_size=100, test_steps=100)
-```
-
-(4) Run SINGA for test only
-Assume a checkpoint exists after training
-
-```
-result = m.evaluate(X_test, batch_size=100, checkpoint_path=workspace+'/checkpoint/step100-worker0')
-```
-
-
-## Implementation Details
-
-### Layer class (inherited)
-
-* Data
-* Dense
-* Activation
-* Convolution2D
-* MaxPooling2D
-* AvgPooling2D
-* LRN2D
-* Dropout
-* RBM
-* Autoencoder
-
-### Model class
-
-Model class has `jobconf` (JobProto) and `layers` (layer list)
-
-Methods in Model class
-
-* add
-	* add Layer into Model
-	* 2 subclasses: Sequential model and Energy model
-
-* compile
-	* set Updater (i.e., optimizer) and Cluster (i.e., topology) components
-
-* fit
-	* set Training data and parameter values for the training
-		* (optional) set Validatiaon data and parameter values
-	* set Train_one_batch component
-	* specify `with_test` field if a user wants to run SINGA with test data simultaneously.
-	* [TODO] recieve train/validation results, e.g., accuracy, loss, ppl, etc.
-
-* evaluate
-	* set Testing data and parameter values for the testing
-	* specify `checkpoint_path` field if a user want to run SINGA only for testing.
-	* [TODO] recieve test results, e.g., accuracy, loss, ppl, etc.
-
-### Results
-
-fit() and evaluate() return train/test results, a dictionary containing
-
-* [key]: step number
-* [value]: a list of dictionay
-	* 'acc' for accuracy
-	* 'loss' for loss
-	* 'ppl' for ppl
-	* 'se' for squred error
-
-
-### Parameter class
-
-Users need to set parameter and initial values. For example,
-
-* Parameter (fields in Param proto)
-	* lr = (float) // learning rate multiplier, used to scale the learning rate when updating parameters.
-	* wd = (float) // weight decay multiplier, used to scale the weight decay when updating parameters.
-
-* Parameter initialization (fields in ParamGen proto)
-	* init = (string) // one of the types, 'uniform', 'constant', 'gaussian'
-	* high = (float)  // for 'uniform'
-	* low = (float)   // for 'uniform'
-	* value = (float) // for 'constant'
-	* mean = (float)  // for 'gaussian'
-	* std = (float)   // for 'gaussian'
-
-* Weight (`w_param`) is 'gaussian' with mean=0, std=0.01 at default
-
-* Bias (`b_param`) is 'constant' with value=0 at default
-
-* How to update the parameter fields
-	* for updating Weight, put `w_` in front of field name
-	* for updating Bias, put `b_` in front of field name
-
-Several ways to set Parameter values
-
-```
-parw = Parameter(lr=2, wd=10, init='gaussian', std=0.1)
-parb = Parameter(lr=1, wd=0, init='constant', value=0)
-m.add(Convolution2D(10, w_param=parw, b_param=parb, ...)
-```
-
-```
-m.add(Dense(10, w_mean=1, w_std=0.1, w_lr=2, w_wd=10, ...)
-```
-
-```
-parw = Parameter(init='constant', mean=0)
-m.add(Dense(10, w_param=parw, w_lr=1, w_wd=1, b_value=1, ...)
-```
-
-### Other classes
-
-* Store
-* Algorithm
-* Updater
-* SGD
-* AdaGrad
-* Cluster
-
diff --git a/tool/python/examples/__init__.py b/tool/python/examples/__init__.py
deleted file mode 100644
index a796a7a..0000000
--- a/tool/python/examples/__init__.py
+++ /dev/null
@@ -1,22 +0,0 @@
-#/************************************************************
-#*
-#* Licensed to the Apache Software Foundation (ASF) under one
-#* or more contributor license agreements.  See the NOTICE file
-#* distributed with this work for additional information
-#* regarding copyright ownership.  The ASF licenses this file
-#* to you under the Apache License, Version 2.0 (the
-#* "License"); you may not use this file except in compliance
-#* with the License.  You may obtain a copy of the License at
-#*
-#*   http://www.apache.org/licenses/LICENSE-2.0
-#*
-#* Unless required by applicable law or agreed to in writing,
-#* software distributed under the License is distributed on an
-#* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-#* KIND, either express or implied.  See the License for the
-#* specific language governing permissions and limitations
-#* under the License.
-#*
-#*************************************************************/
-
-
diff --git a/tool/python/examples/cifar10_cnn.py b/tool/python/examples/cifar10_cnn.py
deleted file mode 100755
index 8d4e778..0000000
--- a/tool/python/examples/cifar10_cnn.py
+++ /dev/null
@@ -1,55 +0,0 @@
-#!/usr/bin/env python
-#/************************************************************
-#*
-#* Licensed to the Apache Software Foundation (ASF) under one
-#* or more contributor license agreements.  See the NOTICE file
-#* distributed with this work for additional information
-#* regarding copyright ownership.  The ASF licenses this file
-#* to you under the Apache License, Version 2.0 (the
-#* "License"); you may not use this file except in compliance
-#* with the License.  You may obtain a copy of the License at
-#*
-#*   http://www.apache.org/licenses/LICENSE-2.0
-#*
-#* Unless required by applicable law or agreed to in writing,
-#* software distributed under the License is distributed on an
-#* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-#* KIND, either express or implied.  See the License for the
-#* specific language governing permissions and limitations
-#* under the License.
-#*
-#*************************************************************/
-
-
-
-import sys, os
-sys.path.append(os.path.join(os.path.dirname(__file__),'..'))
-from singa.model import *
-from examples.datasets import cifar10
-
-X_train, X_test, workspace = cifar10.load_data()
-
-m = Sequential('cifar10-cnn', sys.argv)
-
-m.add(Convolution2D(32, 5, 1, 2, w_std=0.0001, b_lr=2))
-m.add(MaxPooling2D(pool_size=(3,3), stride=2))
-m.add(Activation('relu'))
-m.add(LRN2D(3, alpha=0.00005, beta=0.75))
-
-m.add(Convolution2D(32, 5, 1, 2, b_lr=2))
-m.add(Activation('relu'))
-m.add(AvgPooling2D(pool_size=(3,3), stride=2))
-m.add(LRN2D(3, alpha=0.00005, beta=0.75))
-
-m.add(Convolution2D(64, 5, 1, 2))
-m.add(Activation('relu'))
-m.add(AvgPooling2D(pool_size=(3,3), stride=2))
-
-m.add(Dense(10, w_wd=250, b_lr=2, b_wd=0, activation='softmax'))
-
-sgd = SGD(decay=0.004, momentum=0.9, lr_type='manual', step=(0,60000,65000), step_lr=(0.001,0.0001,0.00001))
-topo = Cluster(workspace)
-m.compile(loss='categorical_crossentropy', optimizer=sgd, cluster=topo)
-m.fit(X_train, nb_epoch=1000, with_test=True)
-result = m.evaluate(X_test, test_steps=100, test_freq=300)
-
diff --git a/tool/python/examples/cifar10_cnn_cudnn.py b/tool/python/examples/cifar10_cnn_cudnn.py
deleted file mode 100755
index e243834..0000000
--- a/tool/python/examples/cifar10_cnn_cudnn.py
+++ /dev/null
@@ -1,57 +0,0 @@
-#!/usr/bin/env python
-
-#/************************************************************
-#*
-#* Licensed to the Apache Software Foundation (ASF) under one
-#* or more contributor license agreements.  See the NOTICE file
-#* distributed with this work for additional information
-#* regarding copyright ownership.  The ASF licenses this file
-#* to you under the Apache License, Version 2.0 (the
-#* "License"); you may not use this file except in compliance
-#* with the License.  You may obtain a copy of the License at
-#*
-#*   http://www.apache.org/licenses/LICENSE-2.0
-#*
-#* Unless required by applicable law or agreed to in writing,
-#* software distributed under the License is distributed on an
-#* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-#* KIND, either express or implied.  See the License for the
-#* specific language governing permissions and limitations
-#* under the License.
-#*
-#*************************************************************/
-
-
-import sys, os
-sys.path.append(os.path.join(os.path.dirname(__file__),'..'))
-from singa.model import *
-from examples.datasets import cifar10
-
-X_train, X_test, workspace = cifar10.load_data()
-
-m = Sequential('cifar10-cnn', sys.argv)
-
-m.add(Convolution2D(32, 5, 1, 2, w_std=0.0001, b_lr=2))
-m.add(MaxPooling2D(pool_size=(3,3), stride=2))
-m.add(Activation('relu'))
-m.add(LRN2D(3, alpha=0.00005, beta=0.75))
-
-m.add(Convolution2D(32, 5, 1, 2, b_lr=2))
-m.add(Activation('relu'))
-m.add(AvgPooling2D(pool_size=(3,3), stride=2))
-m.add(LRN2D(3, alpha=0.00005, beta=0.75))
-
-m.add(Convolution2D(64, 5, 1, 2))
-m.add(Activation('relu'))
-m.add(AvgPooling2D(pool_size=(3,3), stride=2))
-
-m.add(Dense(10, w_wd=250, b_lr=2, b_wd=0, activation='softmax'))
-
-sgd = SGD(decay=0.004, momentum=0.9, lr_type='manual', step=(0,60000,65000), step_lr=(0.001,0.0001,0.00001))
-topo = Cluster(workspace)
-m.compile(loss='categorical_crossentropy', optimizer=sgd, cluster=topo)
-
-gpu_id = [0]
-m.fit(X_train, nb_epoch=7000, with_test=True, device=gpu_id)
-result = m.evaluate(X_test, test_steps=100, test_freq=1000)
-
diff --git a/tool/python/examples/cifar10_cnn_parameter.py b/tool/python/examples/cifar10_cnn_parameter.py
deleted file mode 100755
index c5470b6..0000000
--- a/tool/python/examples/cifar10_cnn_parameter.py
+++ /dev/null
@@ -1,57 +0,0 @@
-#!/usr/bin/env python
-#/************************************************************
-#*
-#* Licensed to the Apache Software Foundation (ASF) under one
-#* or more contributor license agreements.  See the NOTICE file
-#* distributed with this work for additional information
-#* regarding copyright ownership.  The ASF licenses this file
-#* to you under the Apache License, Version 2.0 (the
-#* "License"); you may not use this file except in compliance
-#* with the License.  You may obtain a copy of the License at
-#*
-#*   http://www.apache.org/licenses/LICENSE-2.0
-#*
-#* Unless required by applicable law or agreed to in writing,
-#* software distributed under the License is distributed on an
-#* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-#* KIND, either express or implied.  See the License for the
-#* specific language governing permissions and limitations
-#* under the License.
-#*
-#*************************************************************/
-
-
-import sys, os
-sys.path.append(os.path.join(os.path.dirname(__file__),'..'))
-from singa.model import *
-from examples.datasets import cifar10
-
-X_train, X_test, workspace = cifar10.load_data()
-
-m = Sequential('cifar10-cnn', sys.argv)
-
-parw = Parameter(init='gaussian', std=0.0001)
-parb = Parameter(init='constant')
-m.add(Convolution2D(32, 5, 1, 2, w_param=parw, b_param=parb, b_lr=2))
-m.add(MaxPooling2D(pool_size=(3,3), stride=2))
-m.add(Activation('relu'))
-m.add(LRN2D(3, alpha=0.00005, beta=0.75))
-
-parw.update(std=0.01)
-m.add(Convolution2D(32, 5, 1, 2, w_param=parw, b_param=parb))
-m.add(Activation('relu'))
-m.add(AvgPooling2D(pool_size=(3,3), stride=2))
-m.add(LRN2D(3, alpha=0.00005, beta=0.75))
-
-m.add(Convolution2D(64, 5, 1, 2, w_param=parw, b_param=parb, b_lr=1))
-m.add(Activation('relu'))
-m.add(AvgPooling2D(pool_size=(3,3), stride=2))
-
-m.add(Dense(10, w_param=parw, w_wd=250, b_param=parb, b_lr=2, b_wd=0, activation='softmax'))
-
-sgd = SGD(decay=0.004, lr_type='manual', step=(0,60000,65000), step_lr=(0.001,0.0001,0.00001))
-topo = Cluster(workspace)
-m.compile(loss='categorical_crossentropy', optimizer=sgd, cluster=topo)
-m.fit(X_train, nb_epoch=100, with_test=True)
-result = m.evaluate(X_test, test_steps=10, test_freq=300)
-
diff --git a/tool/python/examples/datasets/__init__.py b/tool/python/examples/datasets/__init__.py
deleted file mode 100644
index a796a7a..0000000
--- a/tool/python/examples/datasets/__init__.py
+++ /dev/null
@@ -1,22 +0,0 @@
-#/************************************************************
-#*
-#* Licensed to the Apache Software Foundation (ASF) under one
-#* or more contributor license agreements.  See the NOTICE file
-#* distributed with this work for additional information
-#* regarding copyright ownership.  The ASF licenses this file
-#* to you under the Apache License, Version 2.0 (the
-#* "License"); you may not use this file except in compliance
-#* with the License.  You may obtain a copy of the License at
-#*
-#*   http://www.apache.org/licenses/LICENSE-2.0
-#*
-#* Unless required by applicable law or agreed to in writing,
-#* software distributed under the License is distributed on an
-#* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-#* KIND, either express or implied.  See the License for the
-#* specific language governing permissions and limitations
-#* under the License.
-#*
-#*************************************************************/
-
-
diff --git a/tool/python/examples/datasets/cifar10.py b/tool/python/examples/datasets/cifar10.py
deleted file mode 100644
index ef5136f..0000000
--- a/tool/python/examples/datasets/cifar10.py
+++ /dev/null
@@ -1,57 +0,0 @@
-#!/usr/bin/env python
-
-#/************************************************************
-#*
-#* Licensed to the Apache Software Foundation (ASF) under one
-#* or more contributor license agreements.  See the NOTICE file
-#* distributed with this work for additional information
-#* regarding copyright ownership.  The ASF licenses this file
-#* to you under the Apache License, Version 2.0 (the
-#* "License"); you may not use this file except in compliance
-#* with the License.  You may obtain a copy of the License at
-#*
-#*   http://www.apache.org/licenses/LICENSE-2.0
-#*
-#* Unless required by applicable law or agreed to in writing,
-#* software distributed under the License is distributed on an
-#* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-#* KIND, either express or implied.  See the License for the
-#* specific language governing permissions and limitations
-#* under the License.
-#*
-#*************************************************************/
-
-
-from singa.model import *
-
-def load_data(
-         workspace = None,
-         backend = 'kvfile',
-         batchsize = 64,
-         random = 5000,
-         shape = (3, 32, 32),
-         std = 127.5,
-         mean = 127.5
-      ):
-
-  # using cifar10 dataset
-  data_dir = 'examples/cifar10'
-  path_train = data_dir + '/train_data.bin'
-  path_test  = data_dir + '/test_data.bin'
-  path_mean  = data_dir + '/image_mean.bin'
-  if workspace == None: workspace = data_dir
-
-  store = Store(path=path_train, mean_file=path_mean, backend=backend,
-              random_skip=random, batchsize=batchsize,
-              shape=shape)
-
-  data_train = Data(load='recordinput', phase='train', conf=store)
-
-  store = Store(path=path_test, mean_file=path_mean, backend=backend,
-              batchsize=batchsize,
-              shape=shape)
-
-  data_test = Data(load='recordinput', phase='test', conf=store)
-
-  return data_train, data_test, workspace
-
diff --git a/tool/python/examples/datasets/mnist.py b/tool/python/examples/datasets/mnist.py
deleted file mode 100644
index 0f75393..0000000
--- a/tool/python/examples/datasets/mnist.py
+++ /dev/null
@@ -1,55 +0,0 @@
-#!/usr/bin/env python
-
-#/************************************************************
-#*
-#* Licensed to the Apache Software Foundation (ASF) under one
-#* or more contributor license agreements.  See the NOTICE file
-#* distributed with this work for additional information
-#* regarding copyright ownership.  The ASF licenses this file
-#* to you under the Apache License, Version 2.0 (the
-#* "License"); you may not use this file except in compliance
-#* with the License.  You may obtain a copy of the License at
-#*
-#*   http://www.apache.org/licenses/LICENSE-2.0
-#*
-#* Unless required by applicable law or agreed to in writing,
-#* software distributed under the License is distributed on an
-#* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-#* KIND, either express or implied.  See the License for the
-#* specific language governing permissions and limitations
-#* under the License.
-#*
-#*************************************************************/
-
-
-from singa.model import *
-
-def load_data(
-     workspace = None,
-     backend = 'kvfile',
-     nb_rbm = 0,  # the number of layers for RBM and Autoencoder
-     checkpoint_steps = 0,
-     **pvalues
-   ):
-
-  # using mnist dataset
-  data_dir = 'examples/mnist'
-  path_train = data_dir + '/train_data.bin'
-  path_test  = data_dir + '/test_data.bin'
-  if workspace == None: workspace = data_dir
-
-  # checkpoint path to load
-  checkpoint_list = None
-  if checkpoint_steps > 0:
-    workerid = 0
-    checkpoint_list = []
-    for i in range(nb_rbm-1, 0, -1):
-      checkpoint_list.append('examples/rbm/rbm{0}/checkpoint/step{1}-worker{2}'.format(str(i),checkpoint_steps,workerid))
-
-  store = Store(path=path_train, backend=backend, **pvalues)
-  data_train = Data(load='recordinput', phase='train', conf=store, checkpoint=checkpoint_list)
-
-  store = Store(path=path_test, backend=backend, **pvalues)
-  data_test = Data(load='recordinput', phase='test', conf=store)
-
-  return data_train, data_test, workspace
diff --git a/tool/python/examples/mnist_ae.py b/tool/python/examples/mnist_ae.py
deleted file mode 100755
index 888f288..0000000
--- a/tool/python/examples/mnist_ae.py
+++ /dev/null
@@ -1,48 +0,0 @@
-#!/usr/bin/env python
-#/************************************************************
-#*
-#* Licensed to the Apache Software Foundation (ASF) under one
-#* or more contributor license agreements.  See the NOTICE file
-#* distributed with this work for additional information
-#* regarding copyright ownership.  The ASF licenses this file
-#* to you under the Apache License, Version 2.0 (the
-#* "License"); you may not use this file except in compliance
-#* with the License.  You may obtain a copy of the License at
-#*
-#*   http://www.apache.org/licenses/LICENSE-2.0
-#*
-#* Unless required by applicable law or agreed to in writing,
-#* software distributed under the License is distributed on an
-#* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-#* KIND, either express or implied.  See the License for the
-#* specific language governing permissions and limitations
-#* under the License.
-#*
-#*************************************************************/
-
-
-import sys, os
-sys.path.append(os.path.join(os.path.dirname(__file__),'..'))
-from singa.model import *
-from examples.datasets import mnist
-
-# Sample parameter values for Autoencoder example
-rbmid = 4
-pvalues = {'batchsize' : 100, 'shape' : 784, 'std_value' : 255}
-X_train, X_test, workspace = mnist.load_data(
-            workspace = 'examples/rbm/autoencoder',
-            nb_rbm = rbmid+1,
-            checkpoint_steps = 6000,
-            **pvalues)
-
-m = Sequential('autoencoder', sys.argv)
-
-hid_dim = [1000, 500, 250, 30]
-m.add(Autoencoder(hid_dim, out_dim=784, activation='sigmoid', param_share=True))
-
-agd = AdaGrad(lr=0.01)
-topo = Cluster(workspace)
-m.compile(loss='mean_squared_error', optimizer=agd, cluster=topo)
-m.fit(X_train, alg='bp', nb_epoch=12200, with_test=True)
-result = m.evaluate(X_test, test_steps=100, test_freq=1000)
-
diff --git a/tool/python/examples/mnist_mlp.py b/tool/python/examples/mnist_mlp.py
deleted file mode 100755
index 10cd15e..0000000
--- a/tool/python/examples/mnist_mlp.py
+++ /dev/null
@@ -1,55 +0,0 @@
-#!/usr/bin/env python
-#/************************************************************
-#*
-#* Licensed to the Apache Software Foundation (ASF) under one
-#* or more contributor license agreements.  See the NOTICE file
-#* distributed with this work for additional information
-#* regarding copyright ownership.  The ASF licenses this file
-#* to you under the Apache License, Version 2.0 (the
-#* "License"); you may not use this file except in compliance
-#* with the License.  You may obtain a copy of the License at
-#*
-#*   http://www.apache.org/licenses/LICENSE-2.0
-#*
-#* Unless required by applicable law or agreed to in writing,
-#* software distributed under the License is distributed on an
-#* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-#* KIND, either express or implied.  See the License for the
-#* specific language governing permissions and limitations
-#* under the License.
-#*
-#*************************************************************/
-
-
-import sys, os
-sys.path.append(os.path.join(os.path.dirname(__file__),'..'))
-from singa.model import *
-from examples.datasets import mnist
-
-# Sample parameter values for Mnist MLP example
-pvalues = {'batchsize' : 64, 'shape' : 784, 'random_skip' : 5000,
-           'std_value' : 127.5, 'mean_value' : 127.5}
-X_train, X_test, workspace = mnist.load_data(**pvalues)
-
-m = Sequential('mlp', argv=sys.argv)
-
-''' Weight and Bias are initialized by
-    uniform distribution with scale=0.05 at default
-'''
-m.add(Dense(2500, init='uniform', activation='tanh'))
-m.add(Dense(2000, init='uniform', activation='tanh'))
-m.add(Dense(1500, init='uniform', activation='tanh'))
-m.add(Dense(1000, init='uniform', activation='tanh'))
-m.add(Dense(500,  init='uniform', activation='tanh'))
-m.add(Dense(10, init='uniform', activation='softmax'))
-
-sgd = SGD(lr=0.001, lr_type='step')
-topo = Cluster(workspace)
-m.compile(loss='categorical_crossentropy', optimizer=sgd, cluster=topo)
-
-m.fit(X_train, nb_epoch=100, with_test=True)
-result = m.evaluate(X_test, batch_size=100, test_steps=10)
-
-#e.g., display result
-#for k, v in sorted(result.items(), key=lambda x: x[0]):
-#  print k, v
diff --git a/tool/python/examples/mnist_mlp_parameter.py b/tool/python/examples/mnist_mlp_parameter.py
deleted file mode 100755
index 9080451..0000000
--- a/tool/python/examples/mnist_mlp_parameter.py
+++ /dev/null
@@ -1,50 +0,0 @@
-#!/usr/bin/env python
-#/************************************************************
-#*
-#* Licensed to the Apache Software Foundation (ASF) under one
-#* or more contributor license agreements.  See the NOTICE file
-#* distributed with this work for additional information
-#* regarding copyright ownership.  The ASF licenses this file
-#* to you under the Apache License, Version 2.0 (the
-#* "License"); you may not use this file except in compliance
-#* with the License.  You may obtain a copy of the License at
-#*
-#*   http://www.apache.org/licenses/LICENSE-2.0
-#*
-#* Unless required by applicable law or agreed to in writing,
-#* software distributed under the License is distributed on an
-#* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-#* KIND, either express or implied.  See the License for the
-#* specific language governing permissions and limitations
-#* under the License.
-#*
-#*************************************************************/
-
-
-import sys, os
-sys.path.append(os.path.join(os.path.dirname(__file__),'..'))
-from singa.model import *
-from singa.datasets import mnist
-
-# Sample parameter values for Mnist MLP example
-pvalues = {'batchsize' : 64, 'shape' : 784,
-           'random_skip' : 5000,
-           'std_value' : 127.5, 'mean_value' : 127.5}
-X_train, X_test, workspace = mnist.load_data(**pvalues)
-
-m = Sequential('mlp', argv=sys.argv)
-
-par = Parameter(init='uniform', scale=0.05)
-m.add(Dense(2500, w_param=par, b_param=par, activation='tanh'))
-m.add(Dense(2000, w_param=par, b_param=par, activation='tanh'))
-m.add(Dense(1500, w_param=par, b_param=par, activation='tanh'))
-m.add(Dense(1000, w_param=par, b_param=par, activation='tanh'))
-m.add(Dense(500, w_param=par, b_param=par, activation='tanh'))
-m.add(Dense(10, w_param=par, b_param=par, activation='softmax'))
-
-sgd = SGD(lr=0.001, lr_type='step')
-topo = Cluster(workspace)
-m.compile(loss='categorical_crossentropy', optimizer=sgd, cluster=topo)
-
-m.fit(X_train, nb_epoch=100, with_test=True)
-result = m.evaluate(X_test, batch_size=100, test_steps=10)
diff --git a/tool/python/examples/mnist_mlp_test.py b/tool/python/examples/mnist_mlp_test.py
deleted file mode 100755
index ee4e4aa..0000000
--- a/tool/python/examples/mnist_mlp_test.py
+++ /dev/null
@@ -1,52 +0,0 @@
-#!/usr/bin/env python
-#/************************************************************
-#*
-#* Licensed to the Apache Software Foundation (ASF) under one
-#* or more contributor license agreements.  See the NOTICE file
-#* distributed with this work for additional information
-#* regarding copyright ownership.  The ASF licenses this file
-#* to you under the Apache License, Version 2.0 (the
-#* "License"); you may not use this file except in compliance
-#* with the License.  You may obtain a copy of the License at
-#*
-#*   http://www.apache.org/licenses/LICENSE-2.0
-#*
-#* Unless required by applicable law or agreed to in writing,
-#* software distributed under the License is distributed on an
-#* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-#* KIND, either express or implied.  See the License for the
-#* specific language governing permissions and limitations
-#* under the License.
-#*
-#*************************************************************/
-
-
-import sys, os
-sys.path.append(os.path.join(os.path.dirname(__file__),'..'))
-from singa.model import *
-from examples.datasets import mnist
-
-# Sample parameter values for Mnist MLP example
-pvalues = {'batchsize' : 64, 'shape' : 784,
-           'std_value' : 127.5, 'mean_value' : 127.5}
-X_train, X_test, workspace = mnist.load_data(**pvalues)
-
-m = Sequential('mlp', argv=sys.argv)
-
-m.add(Dense(2500, init='uniform', activation='tanh'))
-m.add(Dense(2000, init='uniform', activation='tanh'))
-m.add(Dense(1500, init='uniform', activation='tanh'))
-m.add(Dense(1000, init='uniform', activation='tanh'))
-m.add(Dense(500,  init='uniform', activation='tanh'))
-m.add(Dense(10, init='uniform', activation='softmax'))
-
-sgd = SGD(lr=0.001, lr_type='step')
-topo = Cluster(workspace)
-m.compile(loss='categorical_crossentropy', optimizer=sgd, cluster=topo)
-
-''' For doing test only, normally users sets checkpoint path
-    e.g., assume that checkpoint exists by
-          m.fit(X_train, nb_epoch=100, checkpoint_freq=100)
-'''
-path = workspace+'/checkpoint/step100-worker0'
-result = m.evaluate(X_test, batch_size=100, test_steps=100, checkpoint_path=path)
diff --git a/tool/python/examples/mnist_rbm1.py b/tool/python/examples/mnist_rbm1.py
deleted file mode 100755
index 5f22d52..0000000
--- a/tool/python/examples/mnist_rbm1.py
+++ /dev/null
@@ -1,46 +0,0 @@
-#!/usr/bin/env python
-#/************************************************************
-#*
-#* Licensed to the Apache Software Foundation (ASF) under one
-#* or more contributor license agreements.  See the NOTICE file
-#* distributed with this work for additional information
-#* regarding copyright ownership.  The ASF licenses this file
-#* to you under the Apache License, Version 2.0 (the
-#* "License"); you may not use this file except in compliance
-#* with the License.  You may obtain a copy of the License at
-#*
-#*   http://www.apache.org/licenses/LICENSE-2.0
-#*
-#* Unless required by applicable law or agreed to in writing,
-#* software distributed under the License is distributed on an
-#* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-#* KIND, either express or implied.  See the License for the
-#* specific language governing permissions and limitations
-#* under the License.
-#*
-#*************************************************************/
-
-
-import sys, os
-sys.path.append(os.path.join(os.path.dirname(__file__),'..'))
-from singa.model import *
-from examples.datasets import mnist
-
-rbmid = 1
-pvalues = {'batchsize' : 100, 'shape' : 784, 'std_value' : 255}
-X_train, X_test, workspace = mnist.load_data(
-            workspace = 'examples/rbm/rbm1',
-            nb_rbm = rbmid,
-            checkpoint_steps = 6000,
-            **pvalues)
-
-m = Energy('rbm'+str(rbmid), sys.argv)
-
-m.add(RBM(1000, w_std=0.1, b_wd=0))
-
-sgd = SGD(lr=0.1, decay=0.0002, momentum=0.8)
-topo = Cluster(workspace)
-m.compile(optimizer=sgd, cluster=topo)
-m.fit(X_train, alg='cd', nb_epoch=6000)
-#result = m.evaluate(X_test, test_steps=100, test_freq=500)
-
diff --git a/tool/python/examples/mnist_rbm2.py b/tool/python/examples/mnist_rbm2.py
deleted file mode 100755
index 1544f14..0000000
--- a/tool/python/examples/mnist_rbm2.py
+++ /dev/null
@@ -1,47 +0,0 @@
-#!/usr/bin/env python
-#/************************************************************
-#*
-#* Licensed to the Apache Software Foundation (ASF) under one
-#* or more contributor license agreements.  See the NOTICE file
-#* distributed with this work for additional information
-#* regarding copyright ownership.  The ASF licenses this file
-#* to you under the Apache License, Version 2.0 (the
-#* "License"); you may not use this file except in compliance
-#* with the License.  You may obtain a copy of the License at
-#*
-#*   http://www.apache.org/licenses/LICENSE-2.0
-#*
-#* Unless required by applicable law or agreed to in writing,
-#* software distributed under the License is distributed on an
-#* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-#* KIND, either express or implied.  See the License for the
-#* specific language governing permissions and limitations
-#* under the License.
-#*
-#*************************************************************/
-
-
-import sys, os
-sys.path.append(os.path.join(os.path.dirname(__file__),'..'))
-from singa.model import *
-from examples.datasets import mnist
-
-rbmid = 2
-pvalues = {'batchsize' : 100, 'shape' : 784, 'std_value' : 255}
-X_train, X_test, workspace = mnist.load_data(
-            workspace = 'examples/rbm/rbm2',
-            nb_rbm = rbmid,
-            checkpoint_steps = 6000,
-            **pvalues)
-
-m = Energy('rbm'+str(rbmid), sys.argv)
-
-out_dim = [1000, 500]
-m.add(RBM(out_dim, w_std=0.1, b_wd=0))
-
-sgd = SGD(lr=0.1, decay=0.0002, momentum=0.8)
-topo = Cluster(workspace)
-m.compile(optimizer=sgd, cluster=topo)
-m.fit(X_train, alg='cd', nb_epoch=6000)
-#result = m.evaluate(X_test, test_steps=100, test_freq=500)
-
diff --git a/tool/python/examples/mnist_rbm3.py b/tool/python/examples/mnist_rbm3.py
deleted file mode 100755
index 3a6348d..0000000
--- a/tool/python/examples/mnist_rbm3.py
+++ /dev/null
@@ -1,47 +0,0 @@
-#!/usr/bin/env python
-#/************************************************************
-#*
-#* Licensed to the Apache Software Foundation (ASF) under one
-#* or more contributor license agreements.  See the NOTICE file
-#* distributed with this work for additional information
-#* regarding copyright ownership.  The ASF licenses this file
-#* to you under the Apache License, Version 2.0 (the
-#* "License"); you may not use this file except in compliance
-#* with the License.  You may obtain a copy of the License at
-#*
-#*   http://www.apache.org/licenses/LICENSE-2.0
-#*
-#* Unless required by applicable law or agreed to in writing,
-#* software distributed under the License is distributed on an
-#* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-#* KIND, either express or implied.  See the License for the
-#* specific language governing permissions and limitations
-#* under the License.
-#*
-#*************************************************************/
-
-
-import sys, os
-sys.path.append(os.path.join(os.path.dirname(__file__),'..'))
-from singa.model import *
-from examples.datasets import mnist
-
-rbmid = 3
-pvalues = {'batchsize' : 100, 'shape' : 784, 'std_value' : 255}
-X_train, X_test, workspace = mnist.load_data(
-            workspace = 'examples/rbm/rbm3',
-            nb_rbm = rbmid,
-            checkpoint_steps = 6000,
-            **pvalues)
-
-m = Energy('rbm'+str(rbmid), sys.argv)
-
-out_dim = [1000, 500, 250]
-m.add(RBM(out_dim, w_std=0.1, b_wd=0))
-
-sgd = SGD(lr=0.1, decay=0.0002, momentum=0.8)
-topo = Cluster(workspace)
-m.compile(optimizer=sgd, cluster=topo)
-m.fit(X_train, alg='cd', nb_epoch=6000)
-#result = m.evaluate(X_test, test_steps=100, test_freq=500)
-
diff --git a/tool/python/examples/mnist_rbm4.py b/tool/python/examples/mnist_rbm4.py
deleted file mode 100755
index 8343b4f..0000000
--- a/tool/python/examples/mnist_rbm4.py
+++ /dev/null
@@ -1,47 +0,0 @@
-#!/usr/bin/env python
-#/************************************************************
-#*
-#* Licensed to the Apache Software Foundation (ASF) under one
-#* or more contributor license agreements.  See the NOTICE file
-#* distributed with this work for additional information
-#* regarding copyright ownership.  The ASF licenses this file
-#* to you under the Apache License, Version 2.0 (the
-#* "License"); you may not use this file except in compliance
-#* with the License.  You may obtain a copy of the License at
-#*
-#*   http://www.apache.org/licenses/LICENSE-2.0
-#*
-#* Unless required by applicable law or agreed to in writing,
-#* software distributed under the License is distributed on an
-#* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-#* KIND, either express or implied.  See the License for the
-#* specific language governing permissions and limitations
-#* under the License.
-#*
-#*************************************************************/
-
-
-import sys, os
-sys.path.append(os.path.join(os.path.dirname(__file__),'..'))
-from singa.model import *
-from examples.datasets import mnist
-
-rbmid = 4
-pvalues = {'batchsize' : 100, 'shape' : 784, 'std_value' : 255}
-X_train, X_test, workspace = mnist.load_data(
-            workspace = 'examples/rbm/rbm'+str(rbmid),
-            nb_rbm = rbmid,
-            checkpoint_steps = 6000,
-            **pvalues)
-
-m = Energy('rbm'+str(rbmid), sys.argv)
-
-out_dim = [1000, 500, 250, 30]
-m.add(RBM(out_dim, sampling='gaussian', w_std=0.1, b_wd=0))
-
-sgd = SGD(lr=0.001, decay=0.0002, momentum=0.8)
-topo = Cluster(workspace)
-m.compile(optimizer=sgd, cluster=topo)
-m.fit(X_train, alg='cd', nb_epoch=6000)
-#result = m.evaluate(X_test, test_steps=100, test_freq=500)
-
diff --git a/tool/python/examples/train_cifar10.py b/tool/python/examples/train_cifar10.py
deleted file mode 100755
index e8ac973..0000000
--- a/tool/python/examples/train_cifar10.py
+++ /dev/null
@@ -1,142 +0,0 @@
-#!/usr/bin/env python
-
-#/************************************************************
-#*
-#* Licensed to the Apache Software Foundation (ASF) under one
-#* or more contributor license agreements.  See the NOTICE file
-#* distributed with this work for additional information
-#* regarding copyright ownership.  The ASF licenses this file
-#* to you under the Apache License, Version 2.0 (the
-#* "License"); you may not use this file except in compliance
-#* with the License.  You may obtain a copy of the License at
-#*
-#*   http://www.apache.org/licenses/LICENSE-2.0
-#*
-#* Unless required by applicable law or agreed to in writing,
-#* software distributed under the License is distributed on an
-#* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-#* KIND, either express or implied.  See the License for the
-#* specific language governing permissions and limitations
-#* under the License.
-#*
-#*************************************************************/
-
-'''
-Example script of CNN model for CIFAR10 dataset
-'''
-import os, sys
-import numpy as np
-
-current_path_ = os.path.dirname(__file__)
-singa_root_ = os.path.abspath(os.path.join(current_path_,'../../..'))
-sys.path.append(os.path.join(singa_root_,'tool','python'))
-
-from singa.driver import Driver
-from singa.layer import *
-from singa.model import *
-
-
-'''
-CIFAR10 dataset can be downloaded at [https://www.cs.toronto.edu/~kriz/cifar.html]
-- please specify dataset_dir
-'''
-dataset_dir_ = singa_root_ + "/tool/python/examples/datasets/cifar-10-batches-py"
-mean_image = None
-
-def unpickle(file):
-    ''' This method loads dataset provided at CIFAR10 website
-        See [https://www.cs.toronto.edu/~kriz/cifar.html] for more details
-    '''
-    import cPickle
-    fo = open(file, 'rb')
-    dict = cPickle.load(fo)
-    fo.close()
-    return dict
-
-def compute_mean_image():
-    ''' This is a sample script to cmopute the average image
-        of all samples in 5 dataset of cifar10
-    '''
-    mean = None
-    nb_samples_total = 0
-    for did in range(1,6):
-        fname_train_data = dataset_dir_ + "/data_batch_{}".format(did)
-        cifar10 = unpickle(fname_train_data)
-        image = cifar10['data'].astype(dtype=np.uint8)
-        if did > 1:
-            image = np.vstack((image, image))
-    return np.average(image, axis=0)
-
-def load_dataset(did=1):
-    ''' CIFAR10 dataset includes
-        5 binary dataset, each contains 10000 images
-        1 row (1 image) includes 1 label & 3072 pixels
-        3072 pixels are  3 channels of a 32x32 image
-    '''
-    assert mean_image is not None, 'mean_image is required'
-    print '[Load CIFAR10 dataset {}]'.format(did)
-    fname_train_data = dataset_dir_ + "/data_batch_{}".format(did)
-    cifar10 = unpickle(fname_train_data)
-    image = cifar10['data'].astype(dtype=np.uint8)
-    image = image - mean_image
-    print '  image x:', image.shape
-    label = np.asarray(cifar10['labels'], dtype=np.uint8)
-    label = label.reshape(label.size, 1)
-    print '  label y:', label.shape
-    return image, label
-
-#-------------------------------------------------------------------
-mean_image = compute_mean_image()
-# mean_image = np.fromfile('tool/python/examples/datasets/cifar10_mean_image')
-
-print '[Layer registration/declaration]'
-d = Driver()
-d.Init(sys.argv)
-
-input = ImageInput(32, 32, 3) # image width, height, channel
-label = LabelInput()
-
-nn = []
-nn.append(input)
-nn.append(Convolution2D(32, 5, 1, 2, w_std=0.0001, b_lr=2))
-nn.append(MaxPooling2D(pool_size=(3,3), stride=2))
-nn.append(Activation('relu'))
-nn.append(LRN2D(3, alpha=0.00005, beta=0.75))
-nn.append(Convolution2D(32, 5, 1, 2, b_lr=2))
-nn.append(Activation('relu'))
-nn.append(AvgPooling2D(pool_size=(3,3), stride=2))
-nn.append(LRN2D(3, alpha=0.00005, beta=0.75))
-nn.append(Convolution2D(64, 5, 1, 2))
-nn.append(Activation('relu'))
-nn.append(AvgPooling2D(pool_size=(3,3), stride=2))
-nn.append(Dense(10, w_wd=250, b_lr=2, b_wd=0))
-loss = Loss('softmaxloss')
-
-# updater
-sgd = SGD(decay=0.004, momentum=0.9, lr_type='manual', step=(0,60000,65000), step_lr=(0.001,0.0001,0.00001))
-
-#-------------------------------------------------------------------
-batchsize = 100
-disp_freq = 50
-train_step = 1000
-
-print '[Start training]'
-for dataset_id in range(train_step / batchsize):
-
-    x, y = load_dataset(dataset_id%5+1)
-
-    for i in range(x.shape[0] / batchsize):
-        xb, yb = x[i*batchsize:(i+1)*batchsize,:], y[i*batchsize:(i+1)*batchsize,:]
-        nn[0].Feed(xb)
-        label.Feed(yb)
-        for h in range(1, len(nn)):
-            nn[h].ComputeFeature(nn[h-1])
-        loss.ComputeFeature(nn[-1], label)
-        if (i+1)%disp_freq == 0:
-            print '  Step {:>3}: '.format(i+1 + dataset_id*(x.shape[0]/batchsize)),
-            loss.display()
-
-        loss.ComputeGradient()
-        for h in range(len(nn)-1, 0, -1):
-            nn[h].ComputeGradient()
-            sgd.Update(i+1, nn[h])
diff --git a/tool/python/examples/train_mnist.py b/tool/python/examples/train_mnist.py
deleted file mode 100755
index b8e6217..0000000
--- a/tool/python/examples/train_mnist.py
+++ /dev/null
@@ -1,117 +0,0 @@
-#!/usr/bin/env python
-
-#/************************************************************
-#*
-#* Licensed to the Apache Software Foundation (ASF) under one
-#* or more contributor license agreements.  See the NOTICE file
-#* distributed with this work for additional information
-#* regarding copyright ownership.  The ASF licenses this file
-#* to you under the Apache License, Version 2.0 (the
-#* "License"); you may not use this file except in compliance
-#* with the License.  You may obtain a copy of the License at
-#*
-#*   http://www.apache.org/licenses/LICENSE-2.0
-#*
-#* Unless required by applicable law or agreed to in writing,
-#* software distributed under the License is distributed on an
-#* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-#* KIND, either express or implied.  See the License for the
-#* specific language governing permissions and limitations
-#* under the License.
-#*
-#*************************************************************/
-
-'''
-Example script of MLP model for MNIST dataset
-'''
-import os, sys
-import numpy as np
-
-current_path_ = os.path.dirname(__file__)
-singa_root_=os.path.abspath(os.path.join(current_path_,'../../..'))
-sys.path.append(os.path.join(singa_root_,'tool','python'))
-
-from singa.driver import Driver
-from singa.layer import *
-from singa.model import *
-
-def swap32(x):
-    return (((x << 24) & 0xFF000000) |
-            ((x <<  8) & 0x00FF0000) |
-            ((x >>  8) & 0x0000FF00) |
-            ((x >> 24) & 0x000000FF))
-
-def load_dataset():
-    ''' MNIST dataset
-        train-images: 4 int32 headers & int8 pixels
-        train-labels: 2 int32 headers & int8 labels
-    '''
-    print '[Load MNIST dataset]'
-    fname_train_image = "examples/mnist/train-images-idx3-ubyte"
-    fname_train_label = "examples/mnist/train-labels-idx1-ubyte"
-    nb_header = [4, 2]
-
-    info = swap32(np.fromfile(fname_train_image, dtype=np.uint32, count=nb_header[0]))
-    nb_samples = info[1] 
-    shape = (info[2],info[3])
-    
-    x = np.fromfile(fname_train_image, dtype=np.uint8)
-    x = x[np.dtype(np.int32).itemsize*nb_header[0]:] # skip header
-    x = x.reshape(nb_samples, shape[0]*shape[1]) 
-    print '   data x:', x.shape
-    y = np.fromfile(fname_train_label, dtype=np.uint8)
-    y = y[np.dtype(np.int32).itemsize*nb_header[1]:] # skip header
-    y = y.reshape(nb_samples, 1) 
-    print '  label y:', y.shape
-
-    return x, y
-
-#-------------------------------------------------------------------
-print '[Layer registration/declaration]'
-d = Driver()
-d.Init(sys.argv)
-
-input = ImageInput(28, 28)
-label = LabelInput()
-
-nn = []
-nn.append(input)
-nn.append(Dense(2500, init='uniform'))
-nn.append(Activation('stanh'))
-nn.append(Dense(2000, init='uniform'))
-nn.append(Activation('stanh'))
-nn.append(Dense(1500, init='uniform'))
-nn.append(Activation('stanh'))
-nn.append(Dense(1000, init='uniform'))
-nn.append(Activation('stanh'))
-nn.append(Dense(500, init='uniform'))
-nn.append(Activation('stanh'))
-nn.append(Dense(10, init='uniform'))
-loss = Loss('softmaxloss')
-
-# updater
-sgd = SGD(lr=0.001, lr_type='step')
-
-#-------------------------------------------------------------------
-batchsize = 64 
-disp_freq = 10
-
-x, y = load_dataset()
-
-print '[Start training]'
-for i in range(x.shape[0] / batchsize):
-    xb, yb = x[i*batchsize:(i+1)*batchsize,:], y[i*batchsize:(i+1)*batchsize,:]
-    nn[0].Feed(xb)
-    label.Feed(yb)
-    for h in range(1, len(nn)):
-        nn[h].ComputeFeature(nn[h-1])
-    loss.ComputeFeature(nn[-1], label)
-    if (i+1)%disp_freq == 0:
-        print '  Step {:>3}: '.format(i+1),
-        loss.display()
-
-    loss.ComputeGradient()
-    for h in range(len(nn)-1, 0, -1):
-        nn[h].ComputeGradient()
-        sgd.Update(i+1, nn[h])
-
diff --git a/tool/python/singa.py b/tool/python/singa.py
deleted file mode 100755
index e44e94d..0000000
--- a/tool/python/singa.py
+++ /dev/null
@@ -1,46 +0,0 @@
-#!/usr/bin/env python
-
-#/************************************************************
-#*
-#* Licensed to the Apache Software Foundation (ASF) under one
-#* or more contributor license agreements.  See the NOTICE file
-#* distributed with this work for additional information
-#* regarding copyright ownership.  The ASF licenses this file
-#* to you under the Apache License, Version 2.0 (the
-#* "License"); you may not use this file except in compliance
-#* with the License.  You may obtain a copy of the License at
-#*
-#*   http://www.apache.org/licenses/LICENSE-2.0
-#*
-#* Unless required by applicable law or agreed to in writing,
-#* software distributed under the License is distributed on an
-#* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-#* KIND, either express or implied.  See the License for the
-#* specific language governing permissions and limitations
-#* under the License.
-#*
-#*************************************************************/
-
-import os
-import sys
-import string
-import pb2.job_pb2 as job_pb2
-import singa.driver as driver
-from google.protobuf.text_format import Merge
-
-if __name__ == '__main__':
-    """Invoke the training program using this python script.
-    ./bin/singa-run.sh -exec tool/python/singa.py -conf examples/cifar10/job.conf
-    """
- 
-    i = sys.argv.index('-conf')
-    s = open(sys.argv[i+1], 'r').read()
-    s = str(s)
-    j = job_pb2.JobProto()
-    Merge(s, j)
-    b = j.SerializeToString()
-    d = driver.Driver()
-    d.InitLog(sys.argv[0])
-    d.Init(sys.argv)
-    d.Train(False, b)
-    #d.Test(b)
diff --git a/tool/python/singa/__init__.py b/tool/python/singa/__init__.py
deleted file mode 100644
index a796a7a..0000000
--- a/tool/python/singa/__init__.py
+++ /dev/null
@@ -1,22 +0,0 @@
-#/************************************************************
-#*
-#* Licensed to the Apache Software Foundation (ASF) under one
-#* or more contributor license agreements.  See the NOTICE file
-#* distributed with this work for additional information
-#* regarding copyright ownership.  The ASF licenses this file
-#* to you under the Apache License, Version 2.0 (the
-#* "License"); you may not use this file except in compliance
-#* with the License.  You may obtain a copy of the License at
-#*
-#*   http://www.apache.org/licenses/LICENSE-2.0
-#*
-#* Unless required by applicable law or agreed to in writing,
-#* software distributed under the License is distributed on an
-#* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-#* KIND, either express or implied.  See the License for the
-#* specific language governing permissions and limitations
-#* under the License.
-#*
-#*************************************************************/
-
-
diff --git a/tool/python/singa/driver.i b/tool/python/singa/driver.i
deleted file mode 100644
index 63f2287..0000000
--- a/tool/python/singa/driver.i
+++ /dev/null
@@ -1,117 +0,0 @@
-/************************************************************
-*
-* Licensed to the Apache Software Foundation (ASF) under one
-* or more contributor license agreements.  See the NOTICE file
-* distributed with this work for additional information
-* regarding copyright ownership.  The ASF licenses this file
-* to you under the Apache License, Version 2.0 (the
-* "License"); you may not use this file except in compliance
-* with the License.  You may obtain a copy of the License at
-*
-*   http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an
-* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-* KIND, either express or implied.  See the License for the
-* specific language governing permissions and limitations
-* under the License.
-*
-*************************************************************/
-
-/*interface file for swig */
-
-%module driver
-%include "std_vector.i"
-%include "std_string.i"
-%include "argcargv.i"
-%include "carrays.i"
-%array_class(float, floatArray);
-
-%apply (int ARGC, char **ARGV) { (int argc, char **argv)  }
-%{
-#include "singa/driver.h"
-#include "singa/worker.h"
-#include "singa/neuralnet/layer.h"
-#include "singa/neuralnet/neuron_layer.h"
-#include "singa/neuralnet/loss_layer.h"
-#include "singa/utils/blob.h"
-#include "singa/utils/param.h"
-#include "singa/utils/updater.h"
-#include "singa/proto/job.pb.h"
-#include "singa/proto/common.pb.h"
-%}
-
-namespace std {
-  %template(strVector) vector<string>;
-  %template(intVector) vector<int>;
-  %template(floatVector) vector<float>;
-  %template(layerVector) vector<singa::Layer*>;
-  %template(paramVector) vector<singa::Param*>;
-}
-
-namespace singa{
-  class Driver{
-    public:
-    void Train(bool resume, const std::string job_conf);
-    void Init(int argc, char **argv);
-    void InitLog(char* arg);
-    void Test(const std::string job_conf);
-  };
-
-  %nodefault Worker;
-  class Worker{
-    public:
-      static singa::Worker* CreateWorker(const std::string str);
-      void InitNetParams(const std::string& folder, std::vector<singa::Layer*> net);
-      void Checkpoint(int step, const std::string& folder, std::vector<singa::Layer*> net);
-  };
-    
-  class DummyLayer{
-    public:
-      void Setup(const std::string str, const std::vector<singa::Layer*>& srclayers);
-      void Feed(int batchsize, std::vector<float>& data, std::vector<int>& aux_data);
-      singa::Layer* ToLayer();
-  };
-
-  %nodefault Layer;
-  class Layer{
-    public:
-      static singa::Layer* CreateLayer(const std::string str);
-      static void SetupLayer(singa::Layer* layer, const std::string str, const std::vector<singa::Layer*>& srclayers);
-      virtual void ComputeFeature(int flag, const std::vector<singa::Layer*>& srclayers); 
-      virtual void ComputeGradient(int flag, const std::vector<singa::Layer*>& srclayers);
-      virtual const singa::Blob<float>& data(const singa::Layer* from); 
-      virtual const std::vector<singa::Param*> GetParams();
-      virtual const std::string ToString(bool debug, int flag);
-      void SetParams(std::vector<singa::Param*> params);
-  };
-
-  %nodefault Updater;
-  class Updater{
-    public:
-      static singa::Updater* CreateUpdater(const std::string str);
-      virtual void Update(int step, singa::Param* param, float grad_scale);
-  };
-
-  template <typename Dtype>
-  class Blob{
-    public:
-      inline int count();
-      inline const std::vector<int>& shape();
-      inline Dtype* mutable_cpu_data(); 
-      inline const Dtype* cpu_data();
-  };
-
-  class Param{
-    public:
-      inline int size();
-      inline const std::vector<int>& shape();
-      inline float* mutable_cpu_data();
-      void FromProto(const std::string str);
-      /*void ToProto(singa::BlobProto* blob); 
-      */
-  };
-
-  %template(floatBlob) Blob<float>;
-}
diff --git a/tool/python/singa/initializations.py b/tool/python/singa/initializations.py
deleted file mode 100644
index f016f1f..0000000
--- a/tool/python/singa/initializations.py
+++ /dev/null
@@ -1,67 +0,0 @@
-#!/usr/bin/env python
-
-#/************************************************************
-#*
-#* Licensed to the Apache Software Foundation (ASF) under one
-#* or more contributor license agreements.  See the NOTICE file
-#* distributed with this work for additional information
-#* regarding copyright ownership.  The ASF licenses this file
-#* to you under the Apache License, Version 2.0 (the
-#* "License"); you may not use this file except in compliance
-#* with the License.  You may obtain a copy of the License at
-#*
-#*   http://www.apache.org/licenses/LICENSE-2.0
-#*
-#* Unless required by applicable law or agreed to in writing,
-#* software distributed under the License is distributed on an
-#* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-#* KIND, either express or implied.  See the License for the
-#* specific language governing permissions and limitations
-#* under the License.
-#*
-#*************************************************************/
-
-'''
-This module pre-defines initial value for fields
-'''
-
-def get_init_values(identifier, **kwargs):
-    '''
-    This method returns field, a set of key-value pairs, that
-    key is specified by identifier and values are initialized.
-    '''
-
-    field = {}
-
-    if identifier == 'none':
-        return
-
-    if identifier == 'uniform':
-        scale = kwargs['scale'] if 'scale' in kwargs else 0.05
-        names = ['low', 'high']
-        values = [-scale, scale]
-
-    elif identifier == 'constant':
-        names = ['value']
-        values = [0]
-
-    elif identifier == 'gaussian':
-        names = ['mean', 'std']
-        values = [0, 0.01]
-
-    elif identifier == 'conv2d':
-        names = ['stride', 'pad']
-        values = [1, 0]
-
-    elif identifier == 'lrn2d':
-        names = ['alpha', 'beta', 'knorm']
-        values = [1, 0.75, 1]
-
-    elif identifier == 'dropout':
-        names = ['ratio']
-        values = [0.5]
-
-    for i in range(len(names)):
-        field[names[i]] = kwargs[names[i]] if names[i] in kwargs else values[i]
-
-    return field
diff --git a/tool/python/singa/layer.py b/tool/python/singa/layer.py
deleted file mode 100644
index c9a992d..0000000
--- a/tool/python/singa/layer.py
+++ /dev/null
@@ -1,693 +0,0 @@
-#!/usr/bin/env python
-
-#/************************************************************
-#*
-#* Licensed to the Apache Software Foundation (ASF) under one
-#* or more contributor license agreements.  See the NOTICE file
-#* distributed with this work for additional information
-#* regarding copyright ownership.  The ASF licenses this file
-#* to you under the Apache License, Version 2.0 (the
-#* "License"); you may not use this file except in compliance
-#* with the License.  You may obtain a copy of the License at
-#*
-#*   http://www.apache.org/licenses/LICENSE-2.0
-#*
-#* Unless required by applicable law or agreed to in writing,
-#* software distributed under the License is distributed on an
-#* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-#* KIND, either express or implied.  See the License for the
-#* specific language governing permissions and limitations
-#* under the License.
-#*
-#*************************************************************/
-
-'''
-This script includes Layer class and its subclasses that
-users can configure different types of layers for their model.
-'''
-import numpy as np
-from singa.parameter import Parameter, set_param_field
-from singa.initializations import get_init_values
-from singa.utils.utility import setval, generate_name
-from singa.utils.message import *
-from google.protobuf import text_format
-
-from singa.driver import Layer as SingaLayer, Updater as SingaUpdater,\
-                         intVector, floatVector, layerVector,\
-                         paramVector, floatArray_frompointer, DummyLayer
-
-class Layer(object):
-
-    singaupdater = None
-
-    def __init__(self, **kwargs):
-        '''
-        **kwargs (KEY=VALUE)
-          partition_dim = (int)  // partition dimension for net
-        '''
-
-        self.layer = Message('Layer', **kwargs).proto
-        # required field
-        if not 'name' in kwargs:
-            setval(self.layer, name=generate_name('layer', 1))
-
-        # layer connectivity is set in Model.build()
-        self.is_datalayer = False
-        self.singalayer = None
-        self.srclayers = []
-
-        # set src for Rafiki
-        if 'src' in kwargs:
-            self.src = kwargs['src']
-        else:
-            self.src = None
-
-    def setup(self, srclys):
-        ''' Create singa::Layer and store srclayers
-        '''
-        if self.singalayer == None:
-            self.singalayer = SingaLayer.CreateLayer(
-                                    self.layer.SerializeToString())
-            self.singaSrclayerVector = layerVector(len(srclys))
-            for i in range(len(srclys)):
-                self.srclayers.append(srclys[i])
-                self.singaSrclayerVector[i] = srclys[i].get_singalayer()
-            # set up the layer
-            SingaLayer.SetupLayer(self.singalayer,
-                                  self.layer.SerializeToString(),
-                                  self.singaSrclayerVector)
-
-    def ComputeFeature(self, *srclys):
-        ''' The method creates and sets up singa::Layer
-            and maintains its source layers
-            then call ComputeFeature for data transformation.
-
-            *srclys = (list)  // a list of source layers
-        '''
-        # create singa::Layer and store srclayers
-        if self.singalayer == None:
-            if self.src != None:
-                srclys = self.src
-            self.singalayer = SingaLayer.CreateLayer(
-                                    self.layer.SerializeToString())
-            self.singaSrclayerVector = layerVector(len(srclys))
-            for i in range(len(srclys)):
-                self.srclayers.append(srclys[i])
-                self.singaSrclayerVector[i] = srclys[i].get_singalayer()
-            # set up the layer
-            SingaLayer.SetupLayer(self.singalayer,
-                                  self.layer.SerializeToString(),
-                                  self.singaSrclayerVector)
-
-        self.singalayer.ComputeFeature(1, self.singaSrclayerVector)
-
-    def ComputeGradient(self):
-        ''' The method creates singa::Updater
-            and calls ComputeGradient for gradient computation
-            then updates the parameters.
-        '''
-        # call ComputeGradient of Singa
-        self.singalayer.ComputeGradient(1, self.singaSrclayerVector)
-
-    def UpdateParams(self, step, upd):
-        ''' The method updates parameter values
-        '''
-        # update parameters
-        singaParams = self.singalayer.GetParams()
-        for par in singaParams:
-            upd.singaupdater.Update(step, par, 1.0)
-
-    def GetParams(self):
-        ''' The method gets parameter values
-            singaParams[0] for weight
-            singaParams[1] for bias
-        '''
-        singaParams = self.singalayer.GetParams()
-        assert len(singaParams) == 2, 'weight and bias'
-        # for weight
-        weight_array = floatArray_frompointer(singaParams[0].mutable_cpu_data())
-        weight = [weight_array[i] for i in range(singaParams[0].size())]
-        weight = np.array(weight).reshape(singaParams[0].shape())
-        # for bias
-        bias_array = floatArray_frompointer(singaParams[1].mutable_cpu_data())
-        bias = [bias_array[i] for i in range(singaParams[1].size())]
-        bias = np.array(bias).reshape(singaParams[1].shape()[0], 1)
-
-        return weight, bias
-
-    def SetParams(self, *params):
-        ''' The method sets parameter values
-            params[0] for weight
-            params[1] for bias
-        '''
-        singaParams = self.singalayer.GetParams()
-        import pb2.common_pb2 as cm
-        for k in range(len(params)):
-            bp = cm.BlobProto()
-            bp.shape.append(int(params[k].shape[0]))
-            bp.shape.append(int(params[k].shape[1]))
-            for i in range(params[k].shape[0]):
-                for j in range(params[k].shape[1]):
-                    bp.data.append(params[k][i, j])
-            singaParams[k].FromProto(bp.SerializeToString())
-
-    def GetData(self):
-        ''' The method gets layer data values
-        '''
-        blobptr = self.singalayer.data(self.singalayer)
-        data_array = floatArray_frompointer(blobptr.mutable_cpu_data())
-        data = [data_array[i] for i in range(blobptr.count())]
-        return data
-
-    def display(self):
-        debug, flag = False, 0
-        print self.singalayer.ToString(debug, flag)
-
-    def get_singalayer(self):
-        return self.singalayer
-
-
-class Dummy(object):
-
-    def __init__(self, **kwargs):
-        ''' Dummy layer is used for data layer to feed/fetch input data
-            or label information
-        '''
-        self.is_datalayer = True
-        self.srclayers = None
-        self.singalayer = None
-
-        # create layer proto for Dummy layer
-        kwargs = {'name':'dummy', 'type':kDummy}
-        self.layer = Message('Layer', **kwargs).proto
-
-    def setup(self, data_shape):
-        ''' Create and Setup singa Dummy layer
-            called by load_model_parameter
-        '''
-        if self.singalayer == None:
-            setval(self.layer.dummy_conf, input=True)
-            setval(self.layer.dummy_conf, shape=data_shape)
-            self.singalayer = DummyLayer()
-            self.singalayer.Setup(self.layer.SerializeToString(),
-                                  layerVector(0))
-
-    def Feed(self, shape, data, aux_data):
-        ''' Create and Setup singa::DummyLayer for input data
-            Insert data using Feed()
-        '''
-        batchsize = shape[0]
-        hdim = reduce(lambda x, y: x*y, shape[1:])
-        datasize = batchsize * hdim
-
-        # create and setup the dummy layer
-        if self.singalayer == None:
-            self.setup(shape)
-
-        if data is not None:
-            data = data.astype(np.float)
-            dataVector = floatVector(datasize)
-            for i in range(batchsize):
-                for j in range(hdim):
-                    dataVector[i*hdim+j] = data[i, j]
-            labelVector = intVector(0)
-
-        if aux_data is not None:
-            aux_data = aux_data.astype(np.int)
-            labelVector = intVector(datasize)
-            for i in range(batchsize):
-                labelVector[i] = aux_data[i, 0]
-            dataVector = floatVector(0)
-
-        self.singalayer.Feed(batchsize, dataVector, labelVector)
-
-    def get_singalayer(self):
-        return self.singalayer.ToLayer()
-
-class ImageInput(Dummy):
-    ''' This class is used to feed image data
-    '''
-    def __init__(self, width=None, height=None, nb_channel=1):
-        super(ImageInput, self).__init__()
-        self.width = width
-        self.height = height
-        self.nb_channel = nb_channel
-
-    def Feed(self, image_data):
-        batchsize = image_data.shape[0]
-        if self.width == None or self.height == None:
-            hdim = image_data.shape[1]
-            imgsize = int(np.sqrt(hdim/self.nb_channel))
-        shape = [batchsize, self.nb_channel, self.width, self.height]
-        Dummy.Feed(self, shape, image_data, None)
-
-class LabelInput(Dummy):
-    ''' This class is used to feed label data
-    '''
-    def __init__(self):
-        super(LabelInput, self).__init__()
-
-    def Feed(self, label_data):
-        Dummy.Feed(self, label_data.shape, None, label_data)
-
-
-class Data(Layer):
-
-    def __init__(self, load, phase='train', checkpoint=None,
-                 conf=None, **kwargs):
-        '''
-        required
-          load       = (string)  // type of data
-        optional
-          phase      = (string)  // phase of data layer
-          checkpoint = (string)  // checkpoint path
-          conf       = (Store)   // Store object
-          **kwargs (KEY=VALUE)
-            partition_dim = (int)  // partition dimension for net
-        '''
-
-        assert load != None, 'data type should be specified'
-        if load == 'kData':
-            super(Data, self).__init__(name=generate_name('data'),
-                                       user_type=load, **kwargs)
-        else:
-            self.layer_type = enumLayerType(load)
-            super(Data, self).__init__(name=generate_name('data'),
-                                       type=self.layer_type, **kwargs)
-        self.is_datalayer = True
-
-        # include/exclude
-        setval(self.layer, include=enumPhase(phase))
-        #setval(self.layer, exclude=kTest if phase=='train' else kTrain)
-
-        if conf == None:
-            if load == 'kData':
-                setval(self.layer.Extensions[data_conf], **kwargs)
-            else:
-                setval(self.layer.store_conf, **kwargs)
-        else:
-            setval(self.layer, store_conf=conf.proto)
-
-        self.checkpoint = checkpoint # checkpoint for training data
-
-
-class Convolution2D(Layer):
-
-    def __init__(self, nb_filter=0, kernel=0, stride=1, pad=0,
-                 init=None, w_param=None, b_param=None,
-                 activation=None, **kwargs):
-        '''
-        required
-          nb_filter = (int)        // the number of filters
-          kernel    = (int/tuple)  // the size of filter
-        optional
-          stride    = (int/tuple)  // the size of stride
-          pad       = (int/tuple)  // the size of padding
-          init      = (string)     // 'uniform', 'gaussian', 'constant'
-          w_param   = (Parameter)  // Parameter object for weight
-          b_param   = (Parameter)  // Parameter object for bias
-          **kwargs (KEY=VALUE)
-            w_lr = (float) // learning rate multiplier for weight, used to
-                           // scale the learning rate when updating parameters.
-            w_wd = (float) // weight decay multiplier for weight, used to
-                           // scale the weight decay when updating parameters.
-            b_lr = (float) // learning rate multiplier for bias
-            b_wd = (float) // weight decay multiplier for bias
-        '''
-
-        assert nb_filter > 0, 'nb_filter should be set as positive int'
-        super(Convolution2D, self).__init__(name=generate_name('conv', 1),
-                                            type=kCConvolution, **kwargs)
-        fields = {"num_filters":nb_filter}
-        # for kernel
-        if type(kernel) == int:
-            fields['kernel'] = kernel
-        else:
-            fields['kernel_x'] = kernel[0]
-            fields['kernel_y'] = kernel[1]
-        # for stride
-        if type(stride) == int:
-            fields['stride'] = stride
-        else:
-            fields['stride_x'] = stride[0]
-            fields['stride_y'] = stride[1]
-        # for pad
-        if type(pad) == int:
-            fields['pad'] = pad
-        else:
-            fields['pad_x'] = pad[0]
-            fields['pad_y'] = pad[1]
-
-        setval(self.layer.convolution_conf, **fields)
-
-        # parameter w
-        if w_param == None:
-            self.init = 'gaussian' if init == None else init
-            w_param = Parameter(init=self.init)
-        set_param_field(w_param.param, 'w', True, **kwargs)
-        setval(self.layer, param=w_param.param)
-
-        # parameter b
-        if b_param == None:
-            self.init = 'constant' if init == None else init
-            b_param = Parameter(init=self.init) # default: constant
-        set_param_field(b_param.param, 'b', True, **kwargs)
-        setval(self.layer, param=b_param.param)
-
-        # following layers: e.g., activation, dropout, etc.
-        if activation:
-            self.mask = Activation(activation=activation).layer
-
-
-class MaxPooling2D(Layer):
-
-    def __init__(self, pool_size=None,
-                 stride=1, ignore_border=True, **kwargs):
-        '''
-        Max Pooling layer
-
-        required
-          pool_size     = (int|tuple) // the size for pooling
-        optional
-          stride        = (int)       // the size of striding
-          ignore_border = (bool)      // flag for padding
-          **kwargs                    // fields for Layer class
-        '''
-
-        assert pool_size != None, 'pool_size is required'
-        if type(pool_size) == int:
-            pool_size = (pool_size, pool_size)
-        assert type(pool_size) == tuple and pool_size[0] == pool_size[1], \
-               'currently pool size should be square in Singa'
-        super(MaxPooling2D, self).__init__(name=generate_name('pool'),
-                                           type=kCPooling, **kwargs)
-        fields = {'pool' : PoolingProto().MAX,
-                  'kernel' : pool_size[0],
-                  'stride' : stride,
-                  'pad' : 0 if ignore_border else 1}
-        setval(self.layer.pooling_conf, **fields)
-
-class AvgPooling2D(Layer):
-
-    def __init__(self, pool_size=None,
-                 stride=1, ignore_border=True, **kwargs):
-        '''
-        required
-          pool_size     = (int|tuple) // size for pooling
-        optional
-          stride        = (int)       // size of striding
-          ignore_border = (bool)      // flag for padding
-          **kwargs                    // fields for Layer class
-        '''
-
-        assert pool_size != None, 'pool_size is required'
-        if type(pool_size) == int:
-            pool_size = (pool_size, pool_size)
-        assert type(pool_size) == tuple and pool_size[0] == pool_size[1], \
-               'currently pool size should be square in Singa'
-        super(AvgPooling2D, self).__init__(name=generate_name('pool'),
-                                           type=kCPooling, **kwargs)
-        self.layer.pooling_conf.pool = PoolingProto().AVG
-        fields = {'pool' : PoolingProto().AVG,
-                  'kernel' : pool_size[0],
-                  'stride' : stride,
-                  'pad' : 0 if ignore_border else 1}
-        setval(self.layer.pooling_conf, **fields)
-
-class LRN2D(Layer):
-
-    def __init__(self, size=0, **kwargs):
-        '''
-        required
-          size = (int)  // local size
-        '''
-
-        super(LRN2D, self).__init__(name=generate_name('norm'), type=kLRN, **kwargs)
-        # required
-        assert size != 0, 'local size should be set'
-        self.layer.lrn_conf.local_size = size
-        init_values = get_init_values('lrn2d', **kwargs)
-        setval(self.layer.lrn_conf, **init_values)
-
-class Loss(Layer):
-
-    def __init__(self, lossname, topk=1, **kwargs):
-        '''
-        required
-          lossname = (string) // softmaxloss, euclideanloss
-        '''
-        self.layer_type = enumLayerType(lossname)
-        super(Loss, self).__init__(name=generate_name(lossname),
-                                         type=self.layer_type, **kwargs)
-        if lossname == 'softmaxloss':
-            self.layer.softmaxloss_conf.topk = topk
-
-class Activation(Layer):
-
-    def __init__(self, activation='stanh', **kwargs):
-        '''
-        required
-          activation = (string) // relu, sigmoid, tanh, stanh, softmax.
-        '''
-        if activation == 'tanh':
-            print 'Warning: Tanh layer is not supported for CPU'
-
-        self.name = activation
-        self.layer_type = kActivation
-        if activation == 'stanh':
-            self.layer_type = kSTanh
-        elif activation == 'softmax':
-            self.layer_type = kSoftmax
-        super(Activation, self).__init__(name=generate_name(self.name),
-                                         type=self.layer_type, **kwargs)
-        if activation == 'relu':
-            self.layer.activation_conf.type = RELU
-        elif activation == 'sigmoid':
-            self.layer.activation_conf.type = SIGMOID
-        elif activation == 'tanh':
-            self.layer.activation_conf.type = TANH # for GPU
-        #elif activation == 'stanh':
-        #    self.layer.activation_conf.type = STANH
-
-
-class Dropout(Layer):
-
-    def __init__(self, ratio=0.5):
-        '''
-        required
-          ratio = (float) // ratio of drop out nodes
-        '''
-
-        self.name = 'dropout'
-        self.layer_type = enumLayerType(self.name)
-        super(Dropout, self).__init__(name=generate_name(self.name),
-                                      type=self.layer_type, **kwargs)
-        self.layer.dropout_conf.dropout_ratio = ratio
-
-class Accuracy(Layer):
-
-    def __init__(self, **kwargs):
-        '''
-        '''
-
-        self.name = 'accuracy'
-        self.layer_type = enumLayerType(self.name)
-        super(Accuracy, self).__init__(name=generate_name(self.name),
-                                       type=self.layer_type, **kwargs)
-
-class RGB(Layer):
-
-    def __init__(self, meanfile=None, **kwargs):
-        '''
-        required
-          meanfile = (string) // path to meanfile (depreciated)
-        '''
-
-        assert meanfile != None, 'meanfile should be specified'
-        self.name = 'rgb'
-        self.layer_type = kRGBImage
-        super(RGB, self).__init__(name=generate_name(self.name),
-                                  type=self.layer_type)
-        self.layer.rgbimage_conf.meanfile = meanfile
-
-class Dense(Layer):
-
-    def __init__(self, output_dim=0, activation=None,
-                 init=None, w_param=None, b_param=None, input_dim=None,
-                 **kwargs):
-        '''
-        required
-          output_dim = (int)
-        optional
-          activation = (string)
-          init       = (string)     // 'uniform', 'gaussian', 'constant'
-          w_param    = (Parameter)  // Parameter object for weight
-          b_param    = (Parameter)  // Parameter object for bias
-          **kwargs
-            w_lr = (float) // learning rate multiplier for weight, used to
-                           // scale the learning rate when updating parameters.
-            w_wd = (float) // weight decay multiplier for weight, used to
-                           // scale the weight decay when updating parameters.
-            b_lr = (float) // learning rate multiplier for bias
-            b_wd = (float) // weight decay multiplier for bias
-        '''
-        # required
-        assert output_dim > 0, 'output_dim should be set'
-        super(Dense, self).__init__(type=kInnerProduct, **kwargs)
-        self.layer.innerproduct_conf.num_output = output_dim
-        if 'transpose' in kwargs:
-            self.layer.innerproduct_conf.transpose = kwargs['transpose']
-
-        # parameter w (default: gaussian)
-        if w_param == None:
-            self.init = 'gaussian' if init == None else init
-            w_param = Parameter(init=self.init)
-        set_param_field(w_param.param, 'w', False, **kwargs)
-        setval(self.layer, param=w_param.param)
-
-        # parameter b (default: constant)
-        if b_param == None:
-            self.init = 'constant' if init == None else init
-            b_param = Parameter(init=self.init)
-        set_param_field(b_param.param, 'b', False, **kwargs)
-        setval(self.layer, param=b_param.param)
-
-        # following layers: e.g., activation, dropout, etc.
-        if activation:
-            self.mask = Activation(activation=activation).layer
-
-
-''' Classes to deal with multiple layers
-'''
-class Autoencoder(object):
-
-    def __init__(self, hid_dim=None, out_dim=0,
-                 activation=None, param_share=True):
-        '''
-        Generate a set of layers (like MLP) for encoder and decoder
-        The layers are expanded and added in Sequential.add()
-
-        required
-          hid_dim     = (int/list) // the number of nodes in hidden layers
-          out_dim     = (int)      // the number of nodes in the top layer
-        optional
-          activation  = (string)
-          param_share = (bool)     // to share params in encoder and decoder
-        '''
-
-        # required
-        assert out_dim > 0, 'out_dim should be set'
-        self.out_dim = out_dim
-        assert hid_dim != None, 'hid_dim should be set'
-        self.hid_dim = [hid_dim] if type(hid_dim) == int else hid_dim
-
-        self.layer_type = 'AutoEncoder'
-        self.activation = activation
-        self.param_share = param_share
-
-class RBM(Layer):
-
-    def __init__(self, out_dim=None, w_param=None, b_param=None,
-                 sampling=None, **kwargs):
-        '''
-        Generate a set of layers (like MLP) according to the number of elements
-          in out_dim, and on top of it, two layers RBMVis and RBMHid with
-          bidirectional connection
-        The layers are expanded and added in Energy.add()
-
-        required
-          out_dim  = (int) or (int list) // the number of hidden nodes
-        optional
-          w_param  = (Parameter)  // Parameter object for weight
-          b_param  = (Parameter)  // Parameter object for bias
-          sampling = (string)
-        '''
-
-        assert out_dim > 0, 'out_dim should be set'
-        self.out_dim = [out_dim] if type(out_dim) == int else out_dim
-
-        self.name = kwargs['name'] if 'name' in kwargs else 'RBMVis'
-        self.layer_type = kwargs['type'] if 'type' in kwargs else kRBMVis
-        super(RBM, self).__init__(name=generate_name(self.name,
-                                                     withnumber=False),
-                                  type=self.layer_type, **kwargs)
-        setval(self.layer.rbm_conf, hdim=self.out_dim[-1])
-        if self.layer_type == kRBMHid and sampling != None:
-            if sampling == 'gaussian':
-                setval(self.layer.rbm_conf, gaussian=True)
-
-        # parameter w
-        if w_param == None:
-            w_param = Parameter(init='gaussian', **kwargs)
-            set_param_field(w_param.param, 'w', withnumber=False,
-                            level=len(self.out_dim), **kwargs)
-        else:
-            if self.layer_type == kRBMHid:
-                del kwargs['name']
-            else:
-                set_param_field(w_param.param, 'w', withnumber=False,
-        	  	        level=len(self.out_dim), **kwargs)
-        setval(self.layer, param=w_param.param)
-
-        # parameter b
-        if b_param == None:
-            b_param = Parameter(init='constant', **kwargs)
-            set_param_field(b_param.param, 'b', withnumber=False,
-        		    level=len(self.out_dim), **kwargs)
-        else:
-            if self.layer_type == kRBMHid:
-                pass
-            else:
-                set_param_field(b_param.param, 'b', withnumber=False,
-        		        level=len(self.out_dim), **kwargs)
-        setval(self.layer, param=b_param.param)
-
-        if self.layer_type == kRBMVis:
-            wname = w_param.param.name
-            parw = Parameter(name=wname+"_", init='none', share_from=wname)
-            bname = b_param.param.name
-            parb = Parameter(name=bname+"2", wd=0, init='constant')
-            self.bidirect = RBM(self.out_dim, name='RBMHid', type=kRBMHid,
-                         w_param=parw, b_param=parb, sampling=sampling).layer
-
-class Embedding(Layer):
-
-    def __init__(self, in_dim, out_dim, w_param=None, **kwargs):
-
-        super(Embedding, self).__init__(name=generate_name('embedding', 1),
-                                        user_type='kEmbedding')
-        fields = {'vocab_size': in_dim,
-                  'word_dim': out_dim}
-        setval(self.layer.Extensions[embedding_conf], **fields)
-        if w_param == None:
-            # default: uniform
-            w_param = Parameter(name=generate_name('w'), init=init)
-        else:
-            set_param_field(w_param.param, 'w', True, **kwargs)
-        setval(self.layer, param=w_param.param)
-
-class RNNLM(Layer):
-
-    def __init__(self, dim, w_param=None, **kwargs):
-
-        super(RNNLM, self).__init__(name=generate_name('hidden', 1),
-                                    user_type='kHidden')
-        if w_param == None:
-            # default: uniform
-            w_param = Parameter(name=generate_name('w'), init=init)
-        else:
-            set_param_field(w_param.param, 'w', True, **kwargs)
-        setval(self.layer, param=w_param.param)
-
-class UserLossRNNLM(Layer):
-
-    def __init__(self, **kwargs):
-
-        super(UserLossRNNLM, self).__init__(name=generate_name('loss', 1),
-                                            user_type='kLoss')
-        self.layer.Extensions[loss_conf].nclass = kwargs['nclass']
-        self.layer.Extensions[loss_conf].vocab_size = kwargs['vocab_size']
-        setval(self.layer, param=Parameter(name=generate_name('w'),
-                                           init='uniform', scale=0.3).param)
-        setval(self.layer, param=Parameter(name=generate_name('w', 1),
-                                           init='uniform', scale=0.3).param)
diff --git a/tool/python/singa/model.py b/tool/python/singa/model.py
deleted file mode 100644
index 4a6a688..0000000
--- a/tool/python/singa/model.py
+++ /dev/null
@@ -1,716 +0,0 @@
-#!/usr/bin/env python
-
-#/************************************************************
-#*
-#* Licensed to the Apache Software Foundation (ASF) under one
-#* or more contributor license agreements.  See the NOTICE file
-#* distributed with this work for additional information
-#* regarding copyright ownership.  The ASF licenses this file
-#* to you under the Apache License, Version 2.0 (the
-#* "License"); you may not use this file except in compliance
-#* with the License.  You may obtain a copy of the License at
-#*
-#*   http://www.apache.org/licenses/LICENSE-2.0
-#*
-#* Unless required by applicable law or agreed to in writing,
-#* software distributed under the License is distributed on an
-#* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-#* KIND, either express or implied.  See the License for the
-#* specific language governing permissions and limitations
-#* under the License.
-#*
-#*************************************************************/
-
-'''
-This script includes Model class and its subclasses that
-users can configure model parameter.
-'''
-
-import sys, re, subprocess
-from singa.layer import *
-from singa.utils.utility import *
-from singa.utils.message import *
-from google.protobuf import text_format
-
-from singa.driver import Updater as SingaUpdater
-
-class Model(object):
-    ''' Configure model parameter
-        - add(): add layer
-        - compile(): specify Updater and Cluster protos
-        - build(): construct a model (i.e., NetProto)
-        - fit(): run singa for training
-        - evaluate(): run singa for testing
-    '''
-
-    def __init__(self, name='my model', argv=None, label=False):
-        '''
-        optional
-          name  = (string) // name of model/job
-          argv             // pass sys.argv to source
-          label = (bool)   // exist label layer (depreciated)
-        '''
-        self.jobconf = Message('Job', name=name).proto
-        self.layers = []
-        self.label = label
-        self.argv = argv
-        self.result = None
-        self.last_checkpoint_path = None
-        self.cudnn = False
-        self.accuracy = False
-
-    def add(self, layer):
-        '''
-        add layer
-        '''
-        pass
-
-    def exist_datalayer(self, phase):
-        '''
-        check if data layer exists
-        '''
-        for ly in self.layers:
-            if enumPhase(phase) in ly.layer.include:
-                return True
-        return False
-
-    def compile(self, optimizer=None, cluster=None,
-                      loss=None, topk=1, **kwargs):
-        '''
-        required
-          optimizer = (Updater) // updater settings, e.g., SGD
-          cluster   = (Cluster) // cluster settings
-        optional
-          loss      = (string)  // name of loss function type
-          topk      = (int)     // nb of results considered to compute accuracy
-        '''
-        assert optimizer != None, 'optimizer (Updater component) should be set'
-        assert cluster != None, 'cluster (Cluster component) should be set'
-        setval(self.jobconf, updater=optimizer.proto)
-        setval(self.jobconf, cluster=cluster.proto)
-
-        # take care of loss function layer
-        if loss == None:
-            print 'loss layer is not set'
-        else:
-            if hasattr(self.layers[-1], 'mask'):
-                ly = self.layers[-1].mask
-            else:
-                ly = self.layers[-1].layer
-
-            # take care of the last layer
-            if ly.type == enumLayerType('softmax'):
-                # revise the last layer
-                if loss == 'categorical_crossentropy':
-                    setval(ly, type=enumLayerType('softmaxloss'))
-                    setval(ly.softmaxloss_conf, topk=topk)
-                elif loss == 'mean_squared_error':
-                    setval(ly, type=enumLayerType('euclideanloss'))
-            else:
-                # add new layer
-                if loss == 'categorical_crossentropy':
-                    self.add(Loss('softmaxloss', topk=topk))
-                elif loss == 'mean_squared_error':
-                    self.add(Loss('euclideanloss'))
-                elif loss == 'user_loss_rnnlm': # user-defined loss layer
-                    self.add(UserLossRNNLM(nclass=kwargs['nclass'],
-                                           vocab_size=kwargs['in_dim']))
-
-    def build(self):
-        '''
-        construct neuralnet proto
-        '''
-        net = NetProto()
-        slyname = self.layers[0].layer.name
-        for i in range(len(self.layers)):
-            ly = net.layer.add()
-            ly.CopyFrom(self.layers[i].layer)
-            lastly = ly
-            if self.layers[i].is_datalayer == True:
-                continue
-            getattr(ly, 'srclayers').append(slyname)
-            slyname = ly.name
-            if hasattr(self.layers[i], 'mask'):
-                mly = net.layer.add()
-                mly.CopyFrom(self.layers[i].mask)
-                getattr(mly, 'srclayers').append(slyname)
-                slyname = mly.name
-                lastly = mly
-            if hasattr(self.layers[i], 'bidirect'):
-                bly = net.layer.add()
-                bly.CopyFrom(self.layers[i].bidirect)
-                getattr(bly, 'srclayers').append(slyname)
-
-        # deal with label layer (depreciated)
-        if self.label == True:
-            label_layer = Layer(name='label', type=kLabel)
-            ly = net.layer.add()
-            ly.CopyFrom(label_layer.layer)
-            getattr(ly, 'srclayers').append(self.layers[0].layer.name)
-            getattr(lastly, 'srclayers').append(label_layer.layer.name)
-        else:
-            if lastly.name == 'RBMVis':
-                getattr(lastly, 'srclayers').append(bly.name)
-            else:
-                getattr(lastly, 'srclayers').append(self.layers[0].layer.name)
-
-        if self.accuracy == True:
-            smly = net.layer.add()
-            smly.CopyFrom(Layer(name='softmax', type=kSoftmax).layer)
-            setval(smly, include=kTest)
-            getattr(smly, 'srclayers').append(self.layers[-1].layer.name)
-            aly = net.layer.add()
-            aly.CopyFrom(Accuracy().layer)
-            setval(aly, include=kTest)
-            getattr(aly, 'srclayers').append('softmax')
-            getattr(aly, 'srclayers').append(self.layers[0].layer.name)
-
-        # use of cudnn
-        if self.cudnn == True:
-            self.set_cudnn_layer_type(net)
-
-        setval(self.jobconf, neuralnet=net)
-
-    def fit(self, data=None, alg='bp', nb_epoch=0,
-            with_test=False, execpath='', device=None, **fields):
-        '''
-        required
-          data        = (Data)     // Data class object for training data
-          alg         = (string)   // algorithm, e.g., 'bp', 'cd'
-          nb_epoch    = (int)      // the number of training steps
-        optional
-          with_test   = (bool)     // flag if singa runs for test data
-          execpath    = (string)   // path to user own singa (executable file)
-          device      = (int/list) // a list of gpu ids
-          **fields (KEY=VALUE)
-            batch_size       = (int)    // batch size for training data
-            train_steps      = (int)    // nb of steps for training, i.e., epoch
-            disp_freq        = (int)    // frequency to display training info
-            disp_after       = (int)    // display after this number
-            validate_data    = (Data)   // valid data, specified in load_data()
-            validate_freq    = (int)    // frequency of validation
-            validate_steps   = (int)    // total number of steps for validation
-            validate_after   = (int)    // start validation after this number
-            checkpoint_path  = (string) // path to checkpoint file
-            checkpoint_freq  = (int)    // frequency for checkpoint
-            checkpoint_after = (int)    // start checkpointing after this number
-        '''
-        assert data != None, 'Training data shold be set'
-        assert nb_epoch > 0, 'Training steps shold be set'
-
-        if 'batch_size' in fields:  # if new value is set, replace it
-            setval(data.layer.store_conf, batchsize=fields['batch_size'])
-
-        # insert layer for training
-        if self.exist_datalayer('train') == False:
-            self.layers.insert(0, data)
-        setval(self.jobconf, train_steps=nb_epoch)
-        setval(self.jobconf, disp_freq=nb_epoch/10)
-        if 'disp_freq' in fields:
-            setval(self.jobconf, disp_freq=fields['disp_freq'])
-
-        if 'validate_data' in fields:
-            self.layers.insert(1, fields['validate_data'])
-            setval(self.jobconf, validate_freq=nb_epoch/10)
-
-        setval(self.jobconf, **fields)
-
-        # loading checkpoint if it is set
-        if data.checkpoint != None:
-            setval(self.jobconf, checkpoint_path=data.checkpoint)
-
-        # save model parameter (i.e., checkpoint_path)
-        setval(self.jobconf, checkpoint_freq=nb_epoch)
-        self.last_checkpoint_path = '{0}/step{1}-worker0'.format(
-                         self.jobconf.cluster.workspace, nb_epoch)
-
-        # set Train_one_batch component, using backprogapation at default
-        setval(self.jobconf,
-               train_one_batch=Algorithm(type=enumAlgType(alg)).proto)
-
-        # use of cudnn
-        if device != None:
-            setval(self.jobconf, gpu=device)
-            self.cudnn = True
-
-        # start to run singa for training
-        if with_test == False:
-            self.build()  # construct Nneuralnet Component
-            #self.display()
-            return SingaRun(jobproto=self.jobconf,
-                            argv=self.argv, execpath=execpath)
-        else:
-            # run singa in evaluate() with test data
-            pass
-
-    def evaluate(self, data=None, alg='bp',
-                 checkpoint_path=None, execpath='',
-                 device=None, show_acc=False, **fields):
-        '''
-        required
-          data = (Data)   // Data class object for testing data
-        optional
-          alg             = (string)   // algorithm type, (bp at default)
-          checkpoint_path = (list)     // checkpoint path
-          execpaths       = (string)   // path to user's own executable
-          device          = (int/list) // a list of gpu ids
-          show_acc        = (bool)     // compute and the accuacy
-          **fields (KEY=VALUE)
-            batch_size   = (int)  // batch size for testing data
-            test_freq    = (int)  // frequency of testing
-            test_steps   = (int)  // total number of steps for testing
-            test_after   = (int)  // start testing after this number of steps
-        '''
-        assert data != None, 'Testing data should be set'
-        is_testonly = False
-
-        if 'batch_size' in fields:  # if new value is set, replace it
-            setval(data.layer.store_conf, batchsize=fields['batch_size'])
-
-        # insert layer for testing
-        if self.exist_datalayer('test') == False:
-            self.layers.insert(0, data)
-
-        # loading checkpoint if singa runs only for testing
-        if self.exist_datalayer('train') == False:
-            is_testonly = True
-            if checkpoint_path == None:
-                print 'checkpoint_path has not been specified'
-            else:
-                setval(self.jobconf, checkpoint_path=checkpoint_path)
-
-        steps = fields['test_steps'] if 'test_steps' in fields else 10
-        setval(self.jobconf, test_steps=steps)
-        setval(self.jobconf, **fields)
-
-        # set Train_one_batch component, using backprogapation at default
-        setval(self.jobconf,
-               train_one_batch=Algorithm(type=enumAlgType(alg)).proto)
-
-        # use of cudnn
-        if device != None:
-            setval(self.jobconf, gpu=device)
-            self.cudnn = True
-
-        # set True if showing the accuracy
-        self.accuracy = show_acc
-
-        self.build()  # construct Nneuralnet Component
-
-        #--- generate job.conf file for debug purpose
-        #filename = 'job.conf'
-        #with open(filename, 'w') as f:
-        #  f.write(text_format.MessageToString(self.jobconf.cluster))
-        #self.display()
-
-        #--- run singa ---
-        return SingaRun(jobproto=self.jobconf,
-                        argv=self.argv, execpath=execpath, testmode=is_testonly)
-        #return SingaRun_script(filename=filename, execpath=execpath)
-
-
-    def display(self):
-        ''' print out job proto
-        '''
-        print text_format.MessageToString(self.jobconf)
-
-    def set_cudnn_layer_type(self, net):
-        ''' convert LayerType to CdunnLayerType
-        '''
-        for i in range(len(net.layer)):
-            ly_type = net.layer[i].type
-            cudnn_ly_type = ly_type
-            if ly_type == kCConvolution: cudnn_ly_type = kCudnnConv
-            elif ly_type == kCPooling: cudnn_ly_type = kCudnnPool
-            elif ly_type == kLRN: cudnn_ly_type = kCudnnLRN
-            elif ly_type == kSoftmax: cudnn_ly_type = kCudnnSoftmax
-            elif ly_type == kSoftmaxLoss: cudnn_ly_type = kCudnnSoftmaxLoss
-            elif ly_type == kActivation:
-                cudnn_ly_type = kCudnnActivation
-            elif ly_type == kSTanh:
-                print 'Error report: STanh layer is not supported for GPU'
-            '''
-            elif ly_type == kReLU:
-                cudnn_ly_type = kCudnnActivation
-                net.layer[i].activation_conf.type = RELU
-            elif ly_type == kSigmoid:
-                cudnn_ly_type = kCudnnActivation
-                net.layer[i].activation_conf.type = SIGMOID
-            elif ly_type == kTanh:
-                cudnn_ly_type = kCudnnActivation
-                net.layer[i].activation_conf.type = TANH
-            '''
-            #elif ly_type == kSTanh:
-            #    print 'Error report: STanh layer is not supported for GPU'
-                #cudnn_ly_type = kCudnnActivation
-                #net.layer[i].activation_conf.type = STANH
-            net.layer[i].type = cudnn_ly_type
-
-    def show(self):
-        for ly in self.jobconf.neuralnet.layer:
-            print layer(ly.name)
-
-    def layer_by_id(self, k):
-        return self.jobconf.neuralnet.layer[k]
-
-    def layer_by_name(self, name):
-        return self.layers[k]
-
-    def size(self):
-        return len(self.jobconf.neuralnet.layer)
-
-class Energy(Model):
-    ''' energy model
-    '''
-
-    def __init__(self, name='my model', argv=[], label=False):
-        super(Energy, self).__init__(name=name, argv=argv, label=label)
-
-    def add(self, layer):
-        if hasattr(layer, 'layer_type'):
-            if layer.layer_type == kRBMVis:
-                dim = 0
-                for i in range(1, len(layer.out_dim)):
-                    parw = Parameter(name='w', init='none', level=i)
-                    parb = Parameter(name='b', init='none', level=i)
-                    dim = layer.out_dim[i-1]
-                    self.layers.append(Dense(dim, w_param=parw, b_param=parb,
-                                             activation='sigmoid'))
-                self.layers.append(layer)
-
-class Sequential(Model):
-    ''' sequential model
-    '''
-
-    def __init__(self, name='my model', argv=[], label=False):
-        super(Sequential, self).__init__(name=name, argv=argv, label=label)
-
-    def add(self, layer):
-        if hasattr(layer, 'layer_type'):
-            if layer.layer_type == 'AutoEncoder':
-                dim = 0
-                if layer.param_share == True:
-                    # Encoding
-                    for i in range(1, len(layer.hid_dim)+1):
-                        parw = Parameter(name='w',
-                                         init='none', level=i)
-                        parb = Parameter(name='b',
-                                         init='none', level=i)
-                        dim = layer.hid_dim[i-1]
-                        if i == len(layer.hid_dim): activation = None
-                        else: activation = layer.activation
-                        self.layers.append(Dense(dim,
-                                                 w_param=parw, b_param=parb,
-                                                 activation=activation))
-                    # Decoding
-                    for i in range(len(layer.hid_dim), 0, -1):
-                        parw = Parameter(name=generate_name('w', 2),
-                                         init='none')
-                        parb = Parameter(name=generate_name('b', 2),
-                                         init='none')
-                        setval(parw.param, share_from='w'+str(i))
-                        setval(parb.param, name='b'+str(i))
-                        if i == 1: dim = layer.out_dim
-                        else: dim = layer.hid_dim[i-2]
-                        self.layers.append(Dense(dim,
-                                                 w_param=parw, b_param=parb,
-                                                 activation=layer.activation,
-                                                 transpose=True))
-                else:
-                    # MLP
-                    for i in range(1, len(layer.hid_dim)+2):
-                        parw = Parameter(name='w',
-                                         init='none', level=i)
-                        parb = Parameter(name='b',
-                                         init='none', level=i)
-                        if i == len(layer.hid_dim)+1: dim = layer.out_dim
-                        else: dim = layer.hid_dim[i-1]
-                        self.layers.append(Dense(dim,
-                                                 w_param=parw, b_param=parb,
-                                                 activation=layer.activation))
-            else:
-                self.layers.append(layer)
-        else:
-            self.layers.append(layer)
-
-
-class Store(object):
-
-    def __init__(self, **kwargs):
-        '''
-        **kwargs
-            path       = (string)  // path to dataset
-            backend    = (string)  //
-            batch_size = (int)     // batch size of dataset
-            shape      = (int)     //
-        '''
-        self.proto = Message('Store', **kwargs).proto
-
-class Algorithm(object):
-
-    def __init__(self, type=enumAlgType('bp'), **kwargs):
-        '''
-        type = (string)  // type of algorithm, bp at default
-        '''
-        alg = Message('Alg', alg=type, **kwargs).proto
-        if type == enumAlgType('cd'):
-            setval(alg.cd_conf, **kwargs)
-        self.proto = alg
-
-class Updater(object):
-
-    def __init__(self, upd_type, lr, lr_type,
-                 decay, momentum,
-                 step, step_lr, **fields):
-        '''
-        required
-          upd_type = (enum)   // enum type of updater
-          lr       = (float)  // base learning rate
-        optional
-          lr_type  = (string) // type of the learning rate (Fixed at default)
-        '''
-        upd = Message('Updater', type=upd_type, **fields).proto
-        setval(upd.learning_rate, base_lr=lr)
-        if decay > 0:
-            setval(upd, weight_decay=decay)
-        if momentum > 0:
-            setval(upd, momentum=momentum)
-
-        if lr_type == None or lr_type == "fixed":
-            setval(upd.learning_rate, type=kFixed)
-        elif lr_type == 'step':
-            cp = Message('Step', change_freq=60, gamma=0.997)
-            setval(upd.learning_rate, type=kStep, step_conf=cp.proto)
-        elif lr_type == 'manual':
-            cp = Message('FixedStep', step=step, step_lr=step_lr)
-            setval(upd.learning_rate, type=kFixedStep, fixedstep_conf=cp.proto)
-        elif lr_type == 'linear':
-            cp = Message('Linear', change_freq=10, final_lr=0.1)
-            setval(upd.learning_rate, type=kLinear, linear_conf=cp.proto)
-
-        self.proto = upd
-        self.singaupdater = None
-
-    def Update(self, step, layer):
-        ''' This method updates parameters of layer
-            step = (int)  // training step, i.e., param version
-        '''
-        if self.singaupdater == None:
-            self.singaupdater = SingaUpdater.CreateUpdater(
-                                  self.proto.SerializeToString())
-
-        # update parameters
-        singaParams = layer.singalayer.GetParams()
-        for par in singaParams:
-            self.singaupdater.Update(step, par, 1.0)
-    
-
-class SGD(Updater):
-
-    def __init__(self, lr=0.01, lr_type=None,
-                 decay=0, momentum=0,
-                 step=(0), step_lr=(0.01), **fields):
-        '''
-        required
-           lr       = (float)      // base learning rate
-        optional
-           lr_type  = (string)     // type of learning rate, 'Fixed' at default
-           decay    = (float)      // weight decay
-           momentum = (float)      // momentum
-           step     = (int/list)   // steps
-           step_lr  = (float/list) // learning rate after the steps
-           **fields (KEY=VALUE)
-        '''
-        assert lr
-        super(SGD, self).__init__(upd_type=kSGD,
-                                  lr=lr, lr_type=lr_type,
-                                  decay=decay, momentum=momentum,
-                                  step=step, step_lr=step_lr, **fields)
-
-class AdaGrad(Updater):
-
-    def __init__(self, lr=0.01, lr_type=None,
-                 decay=0, momentum=0,
-                 step=(0), step_lr=(0.01), **fields):
-        '''
-        required
-           lr       = (float)      // base learning rate
-        optional
-           lr_type  = (string)     // type of learning rate, 'Fixed' at default
-           decay    = (float)      // weight decay
-           momentum = (float)      // momentum
-           step     = (int/list)   // steps
-           step_lr  = (float/list) // learning rate after the steps
-           **fields (KEY=VALUE)
-        '''
-        assert lr
-        super(AdaGrad, self).__init__(upd_type=kAdaGrad,
-                                  lr=lr, lr_type=lr_type,
-                                  decay=decay, momentum=momentum,
-                                  step=step, step_lr=step_lr, **fields)
-
-class Cluster(object):
-    """ Specify the cluster topology, e.g., number of workers/servers.
-
-    Currently we need to create this object in the .py file and also provide a
-    cluster configuration file to the command line. TODO(wangwei) update SINGA
-    code to eliminate the requirement of the cluster configuration file for
-    training on a single node or the cluster object in the pyfile for training
-    in a cluster.
-    """
-
-    def __init__(self, workspace=None,
-                 nworker_groups=1, nserver_groups=1,
-                 nworkers_per_group=1, nservers_per_group=1,
-                 nworkers_per_procs=1, nservers_per_procs=1,
-                 **fields):
-        '''
-        required
-          workspace = (string) // workspace path
-        optional
-          nworker_groups     = (int)
-          nserver_groups     = (int)
-          nworkers_per_group = (int)
-          nservers_per_group = (int)
-          nworkers_per_procs = (int)
-          nservers_per_procs = (int)
-          **fields
-            server_worker_separate = (bool)
-        '''
-        assert workspace != None, 'need to set workspace'
-        self.proto = Message('Cluster', workspace=workspace).proto
-        # optional
-        self.proto.nworker_groups = nworker_groups
-        self.proto.nserver_groups = nserver_groups
-        self.proto.nworkers_per_group = nworkers_per_group
-        self.proto.nservers_per_group = nservers_per_group
-        self.proto.nworkers_per_procs = nworkers_per_procs
-        self.proto.nservers_per_procs = nservers_per_procs
-        # other fields
-        setval(self.proto, **fields)
-
-
-def StoreResults(lines):
-    """ Parsing metrics from each line in the log file.
-
-    TODO(wangwei) format the log string to make them uniform for easy parsing
-    Another approach is creating a protobuf message for metrics, which can be
-    used for dumping metrics to string and loading perf string back to messages.
-    """
-
-    resultDic = {}
-    for line in lines:
-        line = re.findall(r'[\w|*.*]+', line)
-        if 'Train' in line:
-            step = line[line.index('step')+1]
-            if 'accuracy' in line:
-                resultDic.setdefault(step, {})['acc'] \
-                                             = line[line.index('accuracy')+1]
-            if 'loss' in line:
-                resultDic.setdefault(step, {})['loss'] \
-                                             = line[line.index('loss')+1]
-            if 'ppl' in line:
-                resultDic.setdefault(step, {})['ppl'] \
-                                             = line[line.index('ppl')+1]
-            if 'Squared' in line:
-                resultDic.setdefault(step, {})['se'] \
-                                             = line[line.index('Squared')+2]
-    return resultDic
-
-def SingaRun(jobproto='', argv=None, execpath='', testmode=False):
-    """
-    Run Singa and receive the training/test results.
-    """
-
-    import singa.driver as driver
-    d = driver.Driver()
-    d.InitLog(argv[0])
-    d.Init(argv)
-    if testmode == True:
-        d.Test(jobproto.SerializeToString())
-    else:
-        d.Train(False, jobproto.SerializeToString())
-
-    # Get the performance from the latest log file.
-    # TODO(wangwei) the log file would be overwritten by other running instance
-    # of the same program, e.g., lt-singa
-    logfile = '/tmp/singa-log/{0}.ERROR'.format(argv[0].split('/')[-1])
-    fin = open(logfile, 'r')
-    result = StoreResults(fin.readlines())
-
-    return result
-
-def SingaRun_script(filename='', execpath=''):
-    """
-    Deprecated.
-    Generate the job conf file and run the shell command.
-    """
-    SINGAROOT = '../../../'
-    conf = 'examples/' + filename
-    if execpath == '':
-        cmd = SINGAROOT+'bin/singa-run.sh ' \
-            + '-conf %s ' % conf
-    else:
-        cmd = SINGAROOT+'bin/singa-run.sh ' \
-            + '-conf %s ' % conf \
-            + '-exec %s ' % execpath
-
-    procs = subprocess.Popen(cmd.strip().split(' '),
-                             stdout=subprocess.PIPE,
-                             stderr=subprocess.STDOUT)
-
-    resultDic = {}
-    outputlines = iter(procs.stdout.readline, '')
-    resultDic = StoreResults(outputlines)
-
-    #TODO better format to store the result??
-    return resultDic
-
-def load_model_parameter(fin, neuralnet, batchsize=1, data_shape=None):
-    """
-    this method loads model parameter
-    """
-    hly_idx = 0
-    for i in range(len(neuralnet)): 
-        if neuralnet[i].is_datalayer:
-            if data_shape == None:
-                shape = neuralnet[i].shape
-                shape[0] = batchsize
-                neuralnet[i].setup(shape)
-            else:
-                neuralnet[i].setup(data_shape)
-        else:
-            hly_idx = i
-            break
-
-    net = layerVector(len(neuralnet)-hly_idx)
-    for i in range(hly_idx, len(neuralnet)): 
-        if neuralnet[i].src==None:
-            neuralnet[i].setup(neuralnet[i-1])
-        else:
-            neuralnet[i].setup(neuralnet[i].src)
-        net[i-hly_idx] = neuralnet[i].singalayer
-
-    from singa.driver import Worker
-    alg = Algorithm(type=enumAlgType('bp')).proto
-    w = Worker.CreateWorker(alg.SerializeToString())
-    w.InitNetParams(fin, net)
-
-def save_model_parameter(step, fout, neuralnet):
-    """
-    this method saves model parameter
-    """
-    hly_idx = 0
-    for i in range(len(neuralnet)): 
-        if not neuralnet[i].is_datalayer:
-            hly_idx = i
-            break
-
-    from singa.driver import Worker
-    net = layerVector(len(neuralnet)-hly_idx)
-    for i in range(hly_idx, len(neuralnet)): 
-        net[i-hly_idx] = neuralnet[i].singalayer
-    alg = Algorithm(type=enumAlgType('bp')).proto
-    w = Worker.CreateWorker(alg.SerializeToString())
-    w.Checkpoint(step, fout, net)
-
diff --git a/tool/python/singa/parameter.py b/tool/python/singa/parameter.py
deleted file mode 100644
index 14ad852..0000000
--- a/tool/python/singa/parameter.py
+++ /dev/null
@@ -1,140 +0,0 @@
-#!/usr/bin/env python
-
-#/************************************************************
-#*
-#* Licensed to the Apache Software Foundation (ASF) under one
-#* or more contributor license agreements.  See the NOTICE file
-#* distributed with this work for additional information
-#* regarding copyright ownership.  The ASF licenses this file
-#* to you under the Apache License, Version 2.0 (the
-#* "License"); you may not use this file except in compliance
-#* with the License.  You may obtain a copy of the License at
-#*
-#*   http://www.apache.org/licenses/LICENSE-2.0
-#*
-#* Unless required by applicable law or agreed to in writing,
-#* software distributed under the License is distributed on an
-#* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-#* KIND, either express or implied.  See the License for the
-#* specific language governing permissions and limitations
-#* under the License.
-#*
-#*************************************************************/
-
-'''
-This script includes Parameter class and a method, named set_param_field
-that users can configure Param and ParamGen protos.
-'''
-
-from singa.initializations import get_init_values
-from singa.utils.utility import setval, generate_name
-from singa.utils.message import *
-from google.protobuf import text_format
-
-
-class Parameter(object):
-
-    def __init__(self, **kwargs):
-        '''
-	optional
-	  **kwargs
-	    name  = (string) // parameter name
-	    lr    = (float)  // learning rate multiplier
-	    wd    = (float)  // weight decay multiplier
-	    init  = (string) // init type {'constant','uniform','gaussian'}
-	    value = (int)    // value for 'constant'
-	    scale = (float)  // [low=-scale, high=scale] for 'uniform'
-	    low   = (float)  // low value   for 'uniform'
-	    high  = (float)  // high value  for 'uniform'
-	    mean  = (float)  // mean for 'gaussian'
-	    std   = (float)  // std  for 'gaussian'
-	'''
-        fields = {'lr_scale' : kwargs['lr'] if 'lr' in kwargs else 1,
-                  'wd_scale' : kwargs['wd'] if 'wd' in kwargs else 1
-                 }
-        self.param = Message('Param', **fields).proto
-
-        if not 'name' in kwargs:
-            setval(self.param, name=generate_name('param', 1))
-        else:
-            pname = kwargs['name']
-            # parameter name for RBM
-            if 'level' in kwargs:
-                pname += str(kwargs['level'])
-                if pname[0] == 'b':
-                    pname += '2'
-            setval(self.param, name=pname)
-
-        if 'share_from' in kwargs:
-            setval(self.param, share_from=kwargs['share_from'])
-
-        if 'init' in kwargs:
-            init_values = get_init_values(kwargs['init'], **kwargs)
-            if not kwargs['init'] == 'none':
-                pgen = Message('ParamGen', type=enumInitMethod(kwargs['init']),
-                               **init_values)
-                del kwargs['init']
-                setval(self.param, init=pgen.proto)
-        else: # default: uniform
-            pgen = Message('ParamGen', type=enumInitMethod('uniform'))
-            setval(self.param, init=pgen.proto)
-
-    def update(self, **fields):
-        setval(self.param, **fields)
-        setval(self.param.init, **fields)
-
-
-def set_param_field(param, pname, changename=False, withnumber=True, **kwargs):
-    '''
-      param      = (ParamProto)
-      pname      = (string)     // 'w' for wiehgt, or 'b' for bias
-      changename = (bool)       // update parameter name if True
-      withnumber = (bool)       // add layer number if True
-      **kwargs
-        w_lr = (float) // learning rate multiplier for weight, used to
-                       // scale the learning rate when updating parameters.
-        w_wd = (float) // weight decay multiplier for weight, used to
-                       // scale the weight decay when updating parameters.
-        b_lr = (float) // learning rate multiplier for bias 
-        b_wd = (float) // weight decay multiplier for bias
-    '''
-    assert pname == 'w' or pname == 'b', 'pname should be w or b'
-
-    lr_ = param.lr_scale
-    wd_ = param.wd_scale
-    initkv = {}
-
-    if pname == 'w':
-        if 'w_lr' in kwargs:
-            lr_ = kwargs['w_lr']
-            del kwargs['w_lr']
-        if 'w_wd' in kwargs:
-            wd_ = kwargs['w_wd']
-            del kwargs['w_wd']
-        for key, val in kwargs.items():
-            if key.startswith('w_'):
-                initkv[key[2:]] = val
-
-    elif pname == 'b':
-        if 'b_lr' in kwargs:
-            lr_ = kwargs['b_lr']
-            del kwargs['b_lr']
-        if 'b_wd' in kwargs:
-            wd_ = kwargs['b_wd']
-            del kwargs['b_wd']
-        for key, val in kwargs.items():
-            if key.startswith('b_'):
-                initkv[key[2:]] = val
-
-    field = {'lr_scale' : lr_, 'wd_scale' : wd_}
-
-    # Set/update parameter fields
-    if param.name.startswith('param') or changename == True:
-        if 'level' in kwargs:  # parameter name for RBM
-            pname += str(kwargs['level'])
-        setval(param, name=generate_name(pname, withnumber=withnumber), **field)
-    else:
-        setval(param, **field)
-
-    # Set/update parameter init fields
-    setval(param.init, **initkv)
diff --git a/tool/python/singa/utils/__init__.py b/tool/python/singa/utils/__init__.py
deleted file mode 100644
index a796a7a..0000000
--- a/tool/python/singa/utils/__init__.py
+++ /dev/null
@@ -1,22 +0,0 @@
-#/************************************************************
-#*
-#* Licensed to the Apache Software Foundation (ASF) under one
-#* or more contributor license agreements.  See the NOTICE file
-#* distributed with this work for additional information
-#* regarding copyright ownership.  The ASF licenses this file
-#* to you under the Apache License, Version 2.0 (the
-#* "License"); you may not use this file except in compliance
-#* with the License.  You may obtain a copy of the License at
-#*
-#*   http://www.apache.org/licenses/LICENSE-2.0
-#*
-#* Unless required by applicable law or agreed to in writing,
-#* software distributed under the License is distributed on an
-#* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-#* KIND, either express or implied.  See the License for the
-#* specific language governing permissions and limitations
-#* under the License.
-#*
-#*************************************************************/
-
-
diff --git a/tool/python/singa/utils/message.py b/tool/python/singa/utils/message.py
deleted file mode 100644
index bfa9ef2..0000000
--- a/tool/python/singa/utils/message.py
+++ /dev/null
@@ -1,80 +0,0 @@
-#!/usr/bin/env python
-
-#/************************************************************
-#*
-#* Licensed to the Apache Software Foundation (ASF) under one
-#* or more contributor license agreements.  See the NOTICE file
-#* distributed with this work for additional information
-#* regarding copyright ownership.  The ASF licenses this file
-#* to you under the Apache License, Version 2.0 (the
-#* "License"); you may not use this file except in compliance
-#* with the License.  You may obtain a copy of the License at
-#*
-#*   http://www.apache.org/licenses/LICENSE-2.0
-#*
-#* Unless required by applicable law or agreed to in writing,
-#* software distributed under the License is distributed on an
-#* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-#* KIND, either express or implied.  See the License for the
-#* specific language governing permissions and limitations
-#* under the License.
-#*
-#*************************************************************/
-
-import sys, os
-from utility import *
-sys.path.append(os.path.join(os.path.dirname(__file__), '../../pb2'))
-
-'''
-This script reads proto files in ../../pb2, generated by proto buffer compiler.
- - Message class creates an object for proto and sets initial vlaues for
-   the fields, specified by kwargs
- - make_function method generates a method named enumInitMethod that returns
-   enum values of given enum type, defined in the proto files
-'''
-
-MODULE_LIST = []
-
-# import all modules in dir singa_root/tool/python/pb2
-# except common, singa, and __init__
-for f in os.listdir(os.path.join(os.path.dirname(__file__), '../../pb2')):
-    if (f.endswith(".pyc")):
-        continue
-    if(f == "__init__.py" or f == "common_pb2.py" or f == "singa_pb2.py"):
-        continue
-    module_name = f.split('.')[0]
-    module_obj = __import__(module_name)
-    MODULE_LIST.append(module_obj)
-    for func_name in dir(module_obj):
-        if not func_name.startswith("__"):
-            globals()[func_name] = getattr(module_obj, func_name)
-
-class Message(object):
-    def __init__(self, protoname, **kwargs):
-        for module_obj in MODULE_LIST:
-            if hasattr(module_obj, protoname+"Proto"):
-                class_ = getattr(module_obj, protoname+"Proto")
-                self.proto = class_()
-                return setval(self.proto, **kwargs)
-        raise Exception('invalid protoname')
-
-enumDict_ = dict()
-
-#get all enum type list in the modules
-for module_obj in MODULE_LIST:
-    for enumtype in module_obj.DESCRIPTOR.enum_types_by_name:
-        tempDict = enumDict_[enumtype] = dict()
-        for name in getattr(module_obj, enumtype).DESCRIPTOR.values_by_name:
-            tempDict[name[1:].lower()] = getattr(module_obj, name)
-
-def make_function(enumtype):
-    def _function(key):
-        return enumDict_[enumtype][key]
-    return _function
-
-current_module = sys.modules[__name__]
-
-#def all the enumtypes
-for module_obj in MODULE_LIST:
-    for enumtype in module_obj.DESCRIPTOR.enum_types_by_name:
-        setattr(current_module, "enum"+enumtype, make_function(enumtype))
diff --git a/tool/python/singa/utils/utility.py b/tool/python/singa/utils/utility.py
deleted file mode 100644
index b88720c..0000000
--- a/tool/python/singa/utils/utility.py
+++ /dev/null
@@ -1,86 +0,0 @@
-#!/usr/bin/env python
-
-#/************************************************************
-#*
-#* Licensed to the Apache Software Foundation (ASF) under one
-#* or more contributor license agreements.  See the NOTICE file
-#* distributed with this work for additional information
-#* regarding copyright ownership.  The ASF licenses this file
-#* to you under the Apache License, Version 2.0 (the
-#* "License"); you may not use this file except in compliance
-#* with the License.  You may obtain a copy of the License at
-#*
-#*   http://www.apache.org/licenses/LICENSE-2.0
-#*
-#* Unless required by applicable law or agreed to in writing,
-#* software distributed under the License is distributed on an
-#* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-#* KIND, either express or implied.  See the License for the
-#* specific language governing permissions and limitations
-#* under the License.
-#*
-#*************************************************************/
-
-'''
-This script includes methods to
-(1) generate name of layer, parameter, etc.
-(2) set field values for proto.
-(3) swap bits
-'''
-
-LAYERID = 0
-PARAMID = 0
-
-def generate_name(label, option=0, withnumber=True):
-    ''' This method returns name of layer or parameter with unique id.
-        option: 1 to increase id number
-        withnumber: True to concatenate number to name
-    '''
-
-    global LAYERID, PARAMID
-    num = LAYERID
-    if label == 'layer':
-        if option == 1: LAYERID += 1
-        num = LAYERID
-    elif label == 'param':
-        if option == 1: PARAMID += 1
-        num = PARAMID
-    else:
-        if option == 1: LAYERID += 1
-        num = LAYERID
-        if option == 2:
-            num = LAYERID+1
-
-    if withnumber == False:
-        return '{0}'.format(label)
-
-    return '{0}{1}'.format(label, num)
-
-def setval(proto, **kwargs):
-    ''' This method sets field values for give proto.
-    '''
-
-    for key, val in kwargs.items():
-        #print 'kv: ', k, ', ', v
-        if hasattr(proto, key):
-            flabel = proto.DESCRIPTOR.fields_by_name[key].label
-            ftype = proto.DESCRIPTOR.fields_by_name[key].type
-
-            fattr = getattr(proto, key)
-            if flabel == 3: # repeated field
-                if ftype == 11: # message type
-                    fattr = fattr.add()
-                    fattr.MergeFrom(val)
-                else:
-                    if type(val) == list or type(val) == tuple:
-                        for i in range(len(val)):
-                            fattr.append(val[i])
-                    else:
-                        fattr.append(val)
-            else:
-                if ftype == 11: # message type
-                    fattr = getattr(proto, key)
-                    fattr.MergeFrom(val)
-                else:
-                    setattr(proto, key, val)
-