add benchmark.py for cifar_distributed_cnn
diff --git a/examples/cifar_distributed_cnn/benchmark.py b/examples/cifar_distributed_cnn/benchmark.py
new file mode 100644
index 0000000..6f9ef52
--- /dev/null
+++ b/examples/cifar_distributed_cnn/benchmark.py
@@ -0,0 +1,122 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+
+# the code is modified from
+# https://github.com/pytorch/vision/blob/master/torchvision/models/resnet.py
+
+from singa import opt
+# import opt
+from singa import device
+from singa import tensor
+
+import argparse
+import time
+import numpy as np
+from tqdm import trange
+
+
+def train_resnet(DIST=True, graph=True, sequential=False, verbosity=0,
+                 niters=100, batch_size=32):
+    """Time ResNet-50 training on one random batch; rank 0 prints results.
+
+    Args:
+        DIST: wrap the optimizer in ``opt.DistOpt`` when equal to True.
+        graph: forwarded to ``model.compile`` as ``use_graph``.
+        sequential: ignored; forced to True when DIST, otherwise False.
+        verbosity: forwarded to ``dev.SetVerbosity``.
+        niters: timed training iterations; ValueError if below 1.
+        batch_size: samples per iteration per rank; ValueError if below 1.
+    """
+    if niters < 1 or batch_size < 1:
+        raise ValueError("niters and batch_size must be >= 1")
+    sgd = opt.SGD(lr=0.1, momentum=0.9, weight_decay=1e-5)
+    IMG_SIZE = 224
+
+    # Sequential has better throughput for distributed runs in this version.
+    world_size, local_rank, global_rank, sequential = 1, 0, 0, False
+    if DIST == True:  # explicit: a truthy string such as 'False' stays off
+        sgd = opt.DistOpt(sgd)
+        world_size = sgd.world_size
+        local_rank = sgd.local_rank
+        global_rank = sgd.global_rank
+        sequential = True
+
+    dev = device.create_cuda_gpu_on(local_rank)
+    tx = tensor.Tensor((batch_size, 3, IMG_SIZE, IMG_SIZE), dev)
+    ty = tensor.Tensor((batch_size,), dev, tensor.int32)
+    x = np.random.randn(batch_size, 3, IMG_SIZE, IMG_SIZE).astype(np.float32)
+    y = np.random.randint(0, 1000, batch_size, dtype=np.int32)
+    tx.copy_from_numpy(x)
+    ty.copy_from_numpy(y)
+    dev.SetVerbosity(verbosity)
+    dev.SetSkipIteration(5)
+
+    # Construct the model
+    from model import resnet
+    model = resnet.resnet50(num_channels=3, num_classes=1000)
+    model.train()
+    model.set_optimizer(sgd)
+    model.compile([tx], is_train=True, use_graph=graph, sequential=sequential)
+
+    # Train model; dev.Sync() is called on both sides of the timed loop.
+    dev.Sync()
+    start = time.time()
+    with trange(niters) as t:
+        for _ in t:
+            model(tx, ty, dist_option='fp32', spars=None)
+    dev.Sync()
+    end = time.time()
+    titer = (end - start) / float(niters)
+    throughput = float(niters * batch_size * world_size) / (end - start)
+    if global_rank == 0:
+        print("\nThroughput = {} per second".format(throughput), flush=True)
+        print("TotalTime={}".format(end - start), flush=True)
+        print("Total={}".format(titer), flush=True)  # seconds per iteration
+        dev.PrintTimeProfiling()
+
+
+if __name__ == "__main__":
+    # Command-line entry point: parse the flags and run the benchmark once.
+    parser = argparse.ArgumentParser(
+        description='Throughput test using Resnet 50')
+    parser.add_argument('--dist',
+                        '--enable-dist',
+                        default=False,  # real bool: the str 'False' is truthy
+                        action='store_true',
+                        help='enable distributed training',
+                        dest='DIST')
+    parser.add_argument('--no-graph',
+                        '--disable-graph',
+                        default=True,  # real bool, flipped by store_false
+                        action='store_false',
+                        help='disable graph',
+                        dest='graph')
+    parser.add_argument('--verbosity',
+                        '--log-verbosity',
+                        default=0,
+                        type=int,
+                        help='logging verbosity',
+                        dest='verbosity')
+
+    args = parser.parse_args()
+
+    train_resnet(DIST=args.DIST,
+                 graph=args.graph,
+                 sequential=False,
+                 verbosity=args.verbosity)