/************************************************************
*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*
*************************************************************/
#include "singa/singa_config.h"
#ifdef USE_OPENCV
#include <cmath>
#include "./ilsvrc12.h"
#include "singa/io/snapshot.h"
#include "singa/model/feed_forward_net.h"
#include "singa/model/initializer.h"
#include "singa/model/metric.h"
#include "singa/model/optimizer.h"
#include "singa/utils/channel.h"
#include "singa/utils/string.h"
#include "singa/utils/timer.h"
namespace singa {
// Prefix for layer types; currently supports 'cudnn' (GPU) and
// 'singacpp' (CPU).
const std::string engine = "cudnn";
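// Build the LayerConf for a convolution layer: nb_filter output channels, a
// square kernel with the given stride and zero padding, Gaussian-initialized
// weights (stddev = std), and a constant bias that gets a doubled learning
// rate and no weight decay.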
LayerConf GenConvConf(string name, int nb_filter, int kernel, int stride,
int pad, float std, float bias = .0f) {
LayerConf conf;
conf.set_name(name);
conf.set_type(engine + "_convolution");
ConvolutionConf *conv = conf.mutable_convolution_conf();
conv->set_num_output(nb_filter);
conv->add_kernel_size(kernel);
conv->add_stride(stride);
conv->add_pad(pad);
conv->set_bias_term(true);
ParamSpec *wspec = conf.add_param();
wspec->set_name(name + "_weight");
auto wfill = wspec->mutable_filler();
wfill->set_type("Gaussian");
wfill->set_std(std);
ParamSpec *bspec = conf.add_param();
bspec->set_name(name + "_bias");
bspec->set_lr_mult(2);
bspec->set_decay_mult(0);
auto bfill = bspec->mutable_filler();
bfill->set_value(bias);
return conf;
}
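// Build the LayerConf for a pooling layer: max pooling when max_pool is
// true, average pooling otherwise.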
LayerConf GenPoolingConf(string name, bool max_pool, int kernel, int stride,
int pad) {
LayerConf conf;
conf.set_name(name);
conf.set_type(engine + "_pooling");
PoolingConf *pool = conf.mutable_pooling_conf();
pool->set_kernel_size(kernel);
pool->set_stride(stride);
pool->set_pad(pad);
if (!max_pool) pool->set_pool(PoolingConf_PoolMethod_AVE);
return conf;
}
LayerConf GenReLUConf(string name) {
LayerConf conf;
conf.set_name(name);
conf.set_type(engine + "_relu");
return conf;
}
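// Build the LayerConf for a fully connected (dense) layer; `wd` is the
// weight-decay multiplier for the weight matrix, and the bias again uses a
// doubled learning rate and no weight decay.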
LayerConf GenDenseConf(string name, int num_output, float std, float wd,
float bias = .0f) {
LayerConf conf;
conf.set_name(name);
conf.set_type("singa_dense");
DenseConf *dense = conf.mutable_dense_conf();
dense->set_num_output(num_output);
ParamSpec *wspec = conf.add_param();
wspec->set_name(name + "_weight");
wspec->set_decay_mult(wd);
auto wfill = wspec->mutable_filler();
wfill->set_type("Gaussian");
wfill->set_std(std);
ParamSpec *bspec = conf.add_param();
bspec->set_name(name + "_bias");
bspec->set_lr_mult(2);
bspec->set_decay_mult(0);
auto bfill = bspec->mutable_filler();
bfill->set_value(bias);
return conf;
}
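// Build the LayerConf for a local response normalization (LRN) layer with
// the AlexNet settings: local_size 5, alpha 1e-4, beta 0.75.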
LayerConf GenLRNConf(string name) {
LayerConf conf;
conf.set_name(name);
conf.set_type(engine + "_lrn");
LRNConf *lrn = conf.mutable_lrn_conf();
lrn->set_local_size(5);
lrn->set_alpha(1e-04);
lrn->set_beta(0.75);
return conf;
}
LayerConf GenFlattenConf(string name) {
LayerConf conf;
conf.set_name(name);
conf.set_type("singa_flatten");
return conf;
}
LayerConf GenDropoutConf(string name, float dropout_ratio) {
LayerConf conf;
conf.set_name(name);
conf.set_type(engine + "_dropout");
DropoutConf *dropout = conf.mutable_dropout_conf();
dropout->set_dropout_ratio(dropout_ratio);
return conf;
}
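// Assemble the AlexNet feed-forward net: five convolution layers
// (conv1-conv5, with ReLU, max pooling and LRN as in the original paper)
// followed by three fully connected layers (ip6/ip7 with dropout, and ip8
// producing the 1000 ILSVRC class scores). Input is a 3 x 227 x 227 crop.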
FeedForwardNet CreateNet() {
FeedForwardNet net;
Shape s{3, 227, 227};
net.Add(GenConvConf("conv1", 96, 11, 4, 0, 0.01), &s);
net.Add(GenReLUConf("relu1"));
net.Add(GenPoolingConf("pool1", true, 3, 2, 0));
net.Add(GenLRNConf("lrn1"));
net.Add(GenConvConf("conv2", 256, 5, 1, 2, 0.01, 1.0));
net.Add(GenReLUConf("relu2"));
net.Add(GenPoolingConf("pool2", true, 3, 2, 0));
net.Add(GenLRNConf("lrn2"));
net.Add(GenConvConf("conv3", 384, 3, 1, 1, 0.01));
net.Add(GenReLUConf("relu3"));
net.Add(GenConvConf("conv4", 384, 3, 1, 1, 0.01, 1.0));
net.Add(GenReLUConf("relu4"));
net.Add(GenConvConf("conv5", 256, 3, 1, 1, 0.01, 1.0));
net.Add(GenReLUConf("relu5"));
net.Add(GenPoolingConf("pool5", true, 3, 2, 0));
net.Add(GenFlattenConf("flat"));
net.Add(GenDenseConf("ip6", 4096, 0.005, 1, 1.0));
net.Add(GenReLUConf("relu6"));
net.Add(GenDropoutConf("drop6", 0.5));
net.Add(GenDenseConf("ip7", 4096, 0.005, 1, 1.0));
net.Add(GenReLUConf("relu7"));
net.Add(GenDropoutConf("drop7", 0.5));
net.Add(GenDenseConf("ip8", 1000, 0.01, 1));
return net;
}
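// Train for one epoch over train1.bin ... train<num_train_files>.bin.
// Loading and training are overlapped: while the net trains on the current
// batch (train_x/train_y on the device), a background thread prefetches the
// next batch into prefetch_x/prefetch_y on the host. Progress (averaged
// loss/accuracy and timing) is reported every pfreq batches.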
void TrainOneEpoch(FeedForwardNet &net, ILSVRC &data,
std::shared_ptr<Device> device, int epoch, string bin_folder,
size_t num_train_files, size_t batchsize, float lr,
Channel *train_ch, size_t pfreq, int nthreads) {
float loss = 0.0f, metric = 0.0f;
float load_time = 0.0f, train_time = 0.0f;
size_t b = 0;
size_t n_read;
Timer timer, ttr;
  Tensor prefetch_x(Shape{batchsize, 3, kCropSize, kCropSize});
  Tensor prefetch_y(Shape{batchsize}, kInt);
string binfile = bin_folder + "/train1.bin";
timer.Tick();
data.LoadData(kTrain, binfile, batchsize, &prefetch_x, &prefetch_y, &n_read,
nthreads);
load_time += timer.Elapsed();
CHECK_EQ(n_read, batchsize);
Tensor train_x(prefetch_x.shape(), device);
Tensor train_y(prefetch_y.shape(), device, kInt);
std::thread th;
for (size_t fno = 1; fno <= num_train_files; fno++) {
binfile = bin_folder + "/train" + std::to_string(fno) + ".bin";
while (true) {
if (th.joinable()) {
th.join();
load_time += timer.Elapsed();
// LOG(INFO) << "num of samples: " << n_read;
if (n_read < batchsize) {
if (n_read > 0) {
LOG(WARNING) << "Pls set batchsize to make num_total_samples "
<< "% batchsize == 0. Otherwise, the last " << n_read
<< " samples would not be used";
}
break;
}
}
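      // Copy the prefetched batch to the device (skipped if the last read
      // was short).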
if (n_read == batchsize) {
train_x.CopyData(prefetch_x);
train_y.CopyData(prefetch_y);
}
timer.Tick();
th = data.AsyncLoadData(kTrain, binfile, batchsize, &prefetch_x,
&prefetch_y, &n_read, nthreads);
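      // If the prefetch buffers held only the short tail of the previous
      // file, no batch was copied above, so skip training this iteration.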
if (n_read < batchsize) continue;
CHECK_EQ(train_x.shape(0), train_y.shape(0));
ttr.Tick();
auto ret = net.TrainOnBatch(epoch, train_x, train_y);
train_time += ttr.Elapsed();
loss += ret.first;
metric += ret.second;
b++;
      // Report averaged loss/accuracy and timing every pfreq batches, then
      // reset the accumulators.
      if (b % pfreq == 0) {
        train_ch->Send(
            "Epoch " + std::to_string(epoch) + ", training loss = " +
            std::to_string(loss / b) + ", accuracy = " +
            std::to_string(metric / b) + ", lr = " + std::to_string(lr) +
            ", time of loading " + std::to_string(batchsize) + " images = " +
            std::to_string(load_time / b) +
            " ms, time of training (batchsize = " + std::to_string(batchsize) +
            ") = " + std::to_string(train_time / b) + " ms.");
        loss = 0.0f;
        metric = 0.0f;
        load_time = 0.0f;
        train_time = 0.0f;
        b = 0;
      }
    }
  }
}
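// Evaluate the net over test.bin with the same asynchronous loading pattern
// as TrainOneEpoch; reports the loss and accuracy averaged over all
// num_test_images test images.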
void TestOneEpoch(FeedForwardNet &net, ILSVRC &data,
std::shared_ptr<Device> device, int epoch, string bin_folder,
size_t num_test_images, size_t batchsize, Channel *val_ch,
int nthreads) {
float loss = 0.0f, metric = 0.0f;
float load_time = 0.0f, eval_time = 0.0f;
size_t n_read;
string binfile = bin_folder + "/test.bin";
Timer timer, tte;
Tensor prefetch_x, prefetch_y;
timer.Tick();
data.LoadData(kEval, binfile, batchsize, &prefetch_x, &prefetch_y, &n_read,
nthreads);
load_time += timer.Elapsed();
Tensor test_x(prefetch_x.shape(), device);
Tensor test_y(prefetch_y.shape(), device, kInt);
  int remain = static_cast<int>(num_test_images) - static_cast<int>(n_read);
CHECK_EQ(n_read, batchsize);
std::thread th;
while (true) {
if (th.joinable()) {
th.join();
load_time += timer.Elapsed();
remain -= n_read;
if (remain < 0) break;
if (n_read < batchsize) break;
}
test_x.CopyData(prefetch_x);
test_y.CopyData(prefetch_y);
timer.Tick();
th = data.AsyncLoadData(kEval, binfile, batchsize, &prefetch_x, &prefetch_y,
&n_read, nthreads);
CHECK_EQ(test_x.shape(0), test_y.shape(0));
tte.Tick();
auto ret = net.EvaluateOnBatch(test_x, test_y);
eval_time += tte.Elapsed();
ret.first.ToHost();
ret.second.ToHost();
loss += Sum(ret.first);
metric += Sum(ret.second);
}
loss /= num_test_images;
metric /= num_test_images;
val_ch->Send("Epoch " + std::to_string(epoch) + ", val loss = " +
std::to_string(loss) + ", accuracy = " + std::to_string(metric) +
", time of loading " + std::to_string(num_test_images) +
" images = " + std::to_string(load_time) +
" ms, time of evaluating " + std::to_string(num_test_images) +
" images = " + std::to_string(eval_time) + " ms.");
}
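// Write every model parameter (copied to the host) into a snapshot file
// named by `prefix`.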
void Checkpoint(FeedForwardNet &net, string prefix) {
Snapshot snapshot(prefix, Snapshot::kWrite, 200);
auto names = net.GetParamNames();
auto values = net.GetParamValues();
for (size_t k = 0; k < names.size(); k++) {
values.at(k).ToHost();
snapshot.Write(names.at(k), values.at(k));
}
LOG(INFO) << "Write snapshot into " << prefix;
}
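// End-to-end training driver: builds AlexNet on GPU 0 and trains it with
// SGD (momentum 0.9, weight decay 5e-4, learning rate decayed 10x every 20
// epochs), checkpointing every 10 epochs and evaluating after each epoch.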
void Train(int num_epoch, float lr, size_t batchsize, size_t train_file_size,
string bin_folder, size_t num_train_images, size_t num_test_images,
size_t pfreq, int nthreads) {
ILSVRC data;
data.ReadMean(bin_folder + "/mean.bin");
auto net = CreateNet();
auto cuda = std::make_shared<CudaGPU>(0);
net.ToDevice(cuda);
SGD sgd;
OptimizerConf opt_conf;
opt_conf.set_momentum(0.9);
auto reg = opt_conf.mutable_regularizer();
reg->set_coefficient(0.0005);
sgd.Setup(opt_conf);
sgd.SetLearningRateGenerator(
[lr](int epoch) { return lr * std::pow(0.1, epoch / 20); });
SoftmaxCrossEntropy loss;
Accuracy acc;
net.Compile(true, &sgd, &loss, &acc);
Channel *train_ch = GetChannel("train_perf");
train_ch->EnableDestStderr(true);
Channel *val_ch = GetChannel("val_perf");
val_ch->EnableDestStderr(true);
size_t num_train_files = num_train_images / train_file_size +
(num_train_images % train_file_size ? 1 : 0);
for (int epoch = 0; epoch < num_epoch; epoch++) {
float epoch_lr = sgd.GetLearningRate(epoch);
TrainOneEpoch(net, data, cuda, epoch, bin_folder, num_train_files,
batchsize, epoch_lr, train_ch, pfreq, nthreads);
if (epoch % 10 == 0 && epoch > 0) {
string prefix = "snapshot_epoch" + std::to_string(epoch);
Checkpoint(net, prefix);
}
TestOneEpoch(net, data, cuda, epoch, bin_folder, num_test_images, batchsize,
val_ch, nthreads);
}
}
}
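// Parse command-line arguments (run with -h for the full list) and start
// training; see Train() above for the meaning of each parameter.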
int main(int argc, char **argv) {
singa::InitChannel(nullptr);
int pos = singa::ArgPos(argc, argv, "-h");
if (pos != -1) {
std::cout << "Usage:\n"
<< "\t-epoch <int>: number of epoch to be trained, default is 90;\n"
<< "\t-lr <float>: base learning rate;\n"
<< "\t-batchsize <int>: batchsize, it should be changed regarding "
"to your memory;\n"
<< "\t-filesize <int>: number of training images that stores in "
"each binary file;\n"
<< "\t-ntrain <int>: number of training images;\n"
<< "\t-ntest <int>: number of test images;\n"
<< "\t-data <folder>: the folder which stores the binary files;\n"
<< "\t-pfreq <int>: the frequency(in batch) of printing current "
"model status(loss and accuracy);\n"
<< "\t-nthreads <int>`: the number of threads to load data which "
"feed to the model.\n";
return 0;
}
pos = singa::ArgPos(argc, argv, "-epoch");
int nEpoch = 90;
if (pos != -1) nEpoch = atoi(argv[pos + 1]);
pos = singa::ArgPos(argc, argv, "-lr");
float lr = 0.01;
if (pos != -1) lr = atof(argv[pos + 1]);
pos = singa::ArgPos(argc, argv, "-batchsize");
int batchsize = 256;
  if (pos != -1) batchsize = atoi(argv[pos + 1]);
pos = singa::ArgPos(argc, argv, "-filesize");
size_t train_file_size = 1280;
if (pos != -1) train_file_size = atoi(argv[pos + 1]);
pos = singa::ArgPos(argc, argv, "-ntrain");
size_t num_train_images = 1281167;
if (pos != -1) num_train_images = atoi(argv[pos + 1]);
pos = singa::ArgPos(argc, argv, "-ntest");
size_t num_test_images = 50000;
if (pos != -1) num_test_images = atoi(argv[pos + 1]);
pos = singa::ArgPos(argc, argv, "-data");
string bin_folder = "imagenet_data";
if (pos != -1) bin_folder = argv[pos + 1];
pos = singa::ArgPos(argc, argv, "-pfreq");
size_t pfreq = 100;
if (pos != -1) pfreq = atoi(argv[pos + 1]);
pos = singa::ArgPos(argc, argv, "-nthreads");
int nthreads = 12;
if (pos != -1) nthreads = atoi(argv[pos + 1]);
LOG(INFO) << "Start training";
singa::Train(nEpoch, lr, batchsize, train_file_size, bin_folder,
num_train_images, num_test_images, pfreq, nthreads);
LOG(INFO) << "End training";
}
#endif