Install NCCL in the manylinux2014-cuda10.2 Dockerfile to compile SINGA with NCCL

Update version in CMakeLists.txt and setup.py to 3.2.0
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 1d8201f..a722ec9 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -29,10 +29,10 @@
 #string(REGEX REPLACE "^[0-9]+\\.[0-9]+\\.([0-9]+).*" "\\1" VERSION_PATCH "${VERSION}")
 
 
-SET(PACKAGE_VERSION 3.1.0) # ${VERSION})
-SET(VERSION 3.1.0)
+SET(PACKAGE_VERSION 3.2.0) # ${VERSION})
+SET(VERSION 3.2.0)
 SET(SINGA_MAJOR_VERSION 3)
-SET(SINGA_MINOR_VERSION 1)
+SET(SINGA_MINOR_VERSION 2)
 SET(SINGA_PATCH_VERSION 0)
 #SET(SINGA_MAJOR_VERSION ${VERSION_MAJOR})  # 0 -
 #SET(SINGA_MINOR_VERSION ${VERSION_MINOR})  # 0 - 9
diff --git a/setup.py b/setup.py
index a4dc17b..4e9a644 100644
--- a/setup.py
+++ b/setup.py
@@ -83,7 +83,7 @@
 from datetime import date
 
 # stable version
-VERSION = '3.1.0.rc1'
+VERSION = '3.2.0'
 # get the git hash
 # git_hash = subprocess.check_output(["git", "describe"]).strip().split('-')[-1][1:]
 # comment the next line to build wheel for stable version
@@ -422,6 +422,7 @@
         'numpy >=1.16,<2.0',  #1.16
         'onnx==1.6',
         'deprecated',
+        'pytest',
         'unittest-xml-reporting',
         'future',
         'pillow',
diff --git a/test/python/test_tensor.py b/test/python/test_tensor.py
index 5335412..f7044f1 100644
--- a/test/python/test_tensor.py
+++ b/test/python/test_tensor.py
@@ -550,7 +550,7 @@
         scalar = random.random() * 100
         y = x + scalar
         self.assertEqual(y.dtype, tensor.float32)
-        np.testing.assert_array_almost_equal(tensor.to_numpy(y), x_val + scalar)
+        np.testing.assert_array_almost_equal(tensor.to_numpy(y), x_val + scalar, 5)
 
     @unittest.skipIf(not singa_api.USE_CUDA, 'CUDA is not enabled')
     def test_kint_float_gpu(self):
diff --git a/tool/docker/devel/centos6/cuda10/Dockerfile.manylinux2014 b/tool/docker/devel/centos6/cuda10/Dockerfile.manylinux2014
index d3aeaff..80ca788 100644
--- a/tool/docker/devel/centos6/cuda10/Dockerfile.manylinux2014
+++ b/tool/docker/devel/centos6/cuda10/Dockerfile.manylinux2014
@@ -119,6 +119,14 @@
     rm cudnn-10.2-linux-x64-v7.6.5.32.tgz && \
     ldconfig
 
+# install nccl for distributed training
+RUN git clone https://github.com/NVIDIA/nccl.git $HOME/nccl \
+    && cd $HOME/nccl \
+    && git checkout v2.4.8-1 \
+    && make BUILDDIR=/usr/local/ -j$(nproc) src.build \
+    && rm -rf /usr/local/obj \
+    && rm -rf $HOME/nccl
+
 # install cnmem to /usr/local/include  /usr/local/lib
 RUN git clone https://github.com/NVIDIA/cnmem.git cnmem \
     && cd cnmem && mkdir build && cd build && cmake .. && make && make install && cd ../.. && rm -rf cnmem