examples/model_selection/Trails/internal/ml/model_selection/src/eva_engine/phase2/algo/trainer.py - singa - Git at Google

 #
 # Licensed to the Apache Software Foundation (ASF) under one
 # or more contributor license agreements.  See the NOTICE file
 # distributed with this work for additional information
 # regarding copyright ownership.  The ASF licenses this file
 # to you under the Apache License, Version 2.0 (the
 # "License"); you may not use this file except in compliance
 # with the License.  You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #

 import torch
 import torch.nn as nn
 from torch import optim
 from torch.utils.data import DataLoader
 from src.tools import utils

 from singa import singa_wrap as singa
 from singa import device as singa_device
 from singa import tensor
 from singa import opt
 from singa import autograd
 from singa.opt import Optimizer
 from singa.opt import DecayScheduler
 from singa.opt import Constant
 import numpy as np
 import time
 import argparse
 from PIL import Image
 import json

 np_dtype = {"float16": np.float16, "float32": np.float32}

 # singa_dtype = {"float16": tensor.float16, "float32": tensor.float32}
 singa_dtype = {"float32": tensor.float32}


 ### MSOptimizer
 class MSOptimizer(Optimizer):
     def __call__(self, loss):
         pn_p_g_list = self.call_with_returns(loss)
         # print ("optimizer1 before self.step()")
         # print ("optimizer1 before print len(pn_p_g_list): \n", len(pn_p_g_list))
         self.step()
         # print ("optimizer1 after print len(pn_p_g_list): \n", len(pn_p_g_list))
         # print ("optimizer1 after self.step()")
         return pn_p_g_list

     def call_with_returns(self, loss):
         # print ("call_with_returns before apply loss.data: \n", loss.data)
         pn_p_g_list = []
         for p, g in autograd.backward(loss):
             if p.name is None:
                 p.name = id(p)
             self.apply(p.name, p, g)
             # print ("call with returns")
             # print ("p.name: \n", p.name)
             # print ("p.data: \n", p.data)
             # print ("g.data: \n", g.data)
             pn_p_g_list.append([p.name, p, g])  # need iterables
         # print ("call_with_returns after apply loss.data: \n", loss.data)
         return pn_p_g_list


 # MSSGD -- actually no change of code
 class MSSGD(MSOptimizer):
     """Implements stochastic gradient descent (optionally with momentum).

     Nesterov momentum is based on the formula from `On the importance of initialization and momentum in deep learning`__.

     Args:
         lr(float): learning rate
         momentum(float, optional): momentum factor(default: 0)
         weight_decay(float, optional): weight decay(L2 penalty)(default: 0)
         dampening(float, optional): dampening for momentum(default: 0)
         nesterov(bool, optional): enables Nesterov momentum(default: False)

     Typical usage example:
         >> > from singa import opt
         >> > optimizer = opt.SGD(lr=0.1, momentum=0.9)
         >> > optimizer.update()

     __ http: // www.cs.toronto.edu / %7Ehinton / absps / momentum.pdf

     .. note::
         The implementation of SGD with Momentum / Nesterov subtly differs from
         Sutskever et. al. and implementations in some other frameworks.

         Considering the specific case of Momentum, the update can be written as

         .. math::
                   v = \rho * v + g \\
                   p = p - lr * v

         where p, g, v and: math: `\rho` denote the parameters, gradient,
         velocity, and momentum respectively.

         This is in contrast to Sutskever et. al. and
         other frameworks which employ an update of the form

         .. math::
              v = \rho * v + lr * g \\
              p = p - v

         The Nesterov version is analogously modified.
     """

     def __init__(self,
                  lr=0.1,
                  momentum=0,
                  dampening=0,
                  weight_decay=0,
                  nesterov=False,
                  dtype=tensor.float32):
         super(MSSGD, self).__init__(lr)

         # init momentum
         if type(momentum) == float or type(momentum) == int:
             if momentum < 0.0:
                 raise ValueError("Invalid momentum value: {}".format(momentum))
             self.momentum = Constant(momentum)
         elif isinstance(momentum, DecayScheduler):
             self.momentum = momentum
             momentum = momentum.init_value
         else:
             raise TypeError("Wrong momentum type")
         # self.dtype = dtype
         # self.mom_value = self.momentum(self.step_counter).as_type(self.dtype)
         self.mom_value = self.momentum(self.step_counter)

         # init dampening
         if type(dampening) == float or type(dampening) == int:
             self.dampening = Constant(dampening)
         elif isinstance(dampening, DecayScheduler):
             self.dampening = dampening
             dampening = dampening.init_value
         else:
             raise TypeError("Wrong dampening type")
         # self.dam_value = self.dampening(self.step_counter).as_type(self.dtype)
         self.dam_value = self.dampening(self.step_counter)

         # init weight_decay
         if type(weight_decay) == float or type(weight_decay) == int:
             if weight_decay < 0.0:
                 raise ValueError(
                     "Invalid weight_decay value: {}".format(weight_decay))
             self.weight_decay = Constant(weight_decay)
         elif isinstance(weight_decay, DecayScheduler):
             self.weight_decay = weight_decay
         else:
             raise TypeError("Wrong weight_decay type")
         # self.decay_value = self.weight_decay(self.step_counter).as_type(self.dtype)
         self.decay_value = self.weight_decay(self.step_counter)

         # init other params
         self.nesterov = nesterov
         self.moments = dict()

         # check value
         if nesterov and (momentum <= 0 or dampening != 0):
             raise ValueError(
                 "Nesterov momentum requires a momentum and zero dampening")

     def apply(self, param_name, param_value, param_grad):
         """Performs a single optimization step.

         Args:
                 param_name(String): the name of the param
                 param_value(Tensor): param values to be update in-place
                 grad(Tensor): param gradients; the values may be updated
                         in this function; cannot use it anymore
         """
         assert param_value.shape == param_grad.shape, ("shape mismatch",
                                                        param_value.shape,
                                                        param_grad.shape)
         self.device_check(param_value, self.step_counter, self.lr_value,
                           self.mom_value, self.dam_value, self.decay_value)

         # derive dtype from input
         # assert param_value.dtype == self.dtype

         # TODO add branch operator
         # if self.decay_value != 0:
         if self.weight_decay.init_value != 0:
             singa.Axpy(self.decay_value.data, param_value.data, param_grad.data)

         if self.momentum.init_value != 0:
             if param_name not in self.moments:
                 flag = param_value.device.graph_enabled()
                 param_value.device.EnableGraph(False)
                 self.moments[param_name] = tensor.zeros_like(param_value)
                 param_value.device.EnableGraph(flag)

             buf = self.moments[param_name]
             buf *= self.mom_value
             alpha = 1.0 - self.dam_value
             singa.Axpy(alpha.data, param_grad.data, buf.data)

             if self.nesterov:
                 singa.Axpy(self.mom_value.data, buf.data, param_grad.data)
             else:
                 param_grad = buf

         minus_lr = 0.0 - self.lr_value
         singa.Axpy(minus_lr.data, param_grad.data, param_value.data)

     def step(self):
         # increment step counter, lr and moment
         # print ("before super step")
         super().step()
         # print ("after super step")
         # print ("before custiomized step")
         # mom_value = self.momentum(self.step_counter).as_type(self.dtype)
         # dam_value = self.dampening(self.step_counter).as_type(self.dtype)
         # decay_value = self.weight_decay(self.step_counter).as_type(self.dtype)
         mom_value = self.momentum(self.step_counter)
         dam_value = self.dampening(self.step_counter)
         decay_value = self.weight_decay(self.step_counter)
         self.mom_value.copy_from(mom_value)
         self.dam_value.copy_from(dam_value)
         self.decay_value.copy_from(decay_value)
         # print ("after customized step")

     def get_states(self):
         states = super().get_states()
         if self.mom_value > 0:
             states[
                 'moments'] = self.moments  # a dict for 1st order moments tensors
         return states

     def set_states(self, states):
         super().set_states(states)
         if 'moments' in states:
             self.moments = states['moments']
             self.mom_value = self.momentum(self.step_counter)


 # Data augmentation
 def augmentation(x, batch_size):
     xpad = np.pad(x, [[0, 0], [0, 0], [4, 4], [4, 4]], 'symmetric')
     for data_num in range(0, batch_size):
         offset = np.random.randint(8, size=2)
         x[data_num, :, :, :] = xpad[data_num, :,
                                offset[0]:offset[0] + x.shape[2],
                                offset[1]:offset[1] + x.shape[2]]
         if_flip = np.random.randint(2)
         if (if_flip):
             x[data_num, :, :, :] = x[data_num, :, :, ::-1]
     return x


 # Calculate accuracy
 def accuracy(pred, target):
     # y is network output to be compared with ground truth (int)
     y = np.argmax(pred, axis=1)
     # print ("in accuracy y shape: ", y.shape)
     # print ("in accuracy target shape: ", target.shape)
     a = y == target
     correct = np.array(a, "int").sum()
     return correct


 # Data partition according to the rank
 def partition(global_rank, world_size, train_x, train_y, val_x, val_y):
     # Partition training data
     data_per_rank = train_x.shape[0] // world_size
     idx_start = global_rank * data_per_rank
     idx_end = (global_rank + 1) * data_per_rank
     train_x = train_x[idx_start:idx_end]
     train_y = train_y[idx_start:idx_end]

     # Partition evaluation data
     data_per_rank = val_x.shape[0] // world_size
     idx_start = global_rank * data_per_rank
     idx_end = (global_rank + 1) * data_per_rank
     val_x = val_x[idx_start:idx_end]
     val_y = val_y[idx_start:idx_end]
     return train_x, train_y, val_x, val_y


 # Function to all reduce NUMPY accuracy and loss from multiple devices
 def reduce_variable(variable, dist_opt, reducer):
     reducer.copy_from_numpy(variable)
     dist_opt.all_reduce(reducer.data)
     dist_opt.wait()
     output = tensor.to_numpy(reducer)
     return output


 def resize_dataset(x, image_size):
     num_data = x.shape[0]
     dim = x.shape[1]
     X = np.zeros(shape=(num_data, dim, image_size, image_size),
                  dtype=np.float32)
     for n in range(0, num_data):
         for d in range(0, dim):
             X[n, d, :, :] = np.array(Image.fromarray(x[n, d, :, :]).resize(
                 (image_size, image_size), Image.BILINEAR),
                 dtype=np.float32)
     return X


 class ModelTrainer:

     @classmethod
     def fully_train_arch(cls,
                          model,
                          use_test_acc: bool,
                          epoch_num,
                          train_loader: DataLoader,
                          val_loader: DataLoader,
                          test_loader: DataLoader,
                          args,
                          logger=None
                          ) -> (float, float, dict):
         """
         Args:
             model:
             use_test_acc:
             epoch_num: how many epoch, set by scheduler
             train_loader:
             val_loader:
             test_loader:
             args:
         Returns:
         """

         if logger is None:
             from src.logger import logger
             logger = logger
         logger.info(f'begin to train, batch size = {args.batch_size}')
         start_time, best_valid_auc = time.time(), 0.

         num_labels = args.num_labels
         lr = args.lr
         iter_per_epoch = args.iter_per_epoch
         # report_freq = args.report_freq
         # given_patience = args.patience

         # assign new values
         args.epoch_num = epoch_num

         # for multiple classification
         # opt_metric = nn.CrossEntropyLoss(reduction='mean').to(device)
         # this is only sutiable when output is dimension 1,
         # opt_metric = nn.BCEWithLogitsLoss(reduction='mean').to(device)

         # optimizer
         # optimizer = optim.Adam(model.parameters(), lr=lr)
         # scheduler = optim.lr_scheduler.CosineAnnealingLR(
         #     optimizer,
         #     T_max=epoch_num,  # Maximum number of iterations.
         #     eta_min=1e-4)  # Minimum learning rate.
         precision = 'float32'
         mssgd = MSSGD(lr=args.lr, momentum=0.9, weight_decay=1e-4, dtype=singa_dtype[precision])
         device_id = 0
         max_epoch = epoch_num
         # model = arch
         graph = True
         verbosity = 0
         dist_option = 'plain'
         spars = None
         global_rank = 0
         world_size = 1
         # gradient clipping, set the gradient value to be -1 - 1
         # for p in model.parameters():
         #     p.register_hook(lambda grad: torch.clamp(grad, -1., 1.))

         # training params
         # device = args.device
         if args.device == 'cpu':
             dev = singa_device.get_default_device()
         else:  # GPU
             dev = singa_device.create_cuda_gpu_on(args.local_rank)  # need to change to CPU device for CPU-only machines
         dev.SetRandSeed(0)

         # For distributed training, sequential has better performance
         if hasattr(mssgd, "communicator"):
             DIST = True
             sequential = True
         else:
             DIST = False
             sequential = False

         info_dic = {}
         valid_auc = -1
         valid_loss = 0

         ### singa data
         tx = tensor.Tensor((args.batch_size, args.nfeat), dev, singa_dtype[precision])
         ty = tensor.Tensor((args.batch_size,), dev, tensor.int32)
         ### singa data

         model.set_optimizer(mssgd)
         model.compile([tx], is_train=True, use_graph=graph, sequential=sequential)
         dev.SetVerbosity(verbosity)

         # synflow_flag = False ### just change the model to the absolute value
         # for epoch in range(epoch_num):
         # logger.info(f'Epoch [{epoch:3d}/{epoch_num:3d}]')
         # train and eval
         # print("begin to train...")
         # logger.info(f"Begin to train.....")
         # train_auc, train_loss = ModelTrainer.run(logger,
         #                                          epoch, iter_per_epoch, model, train_loader, opt_metric, args,
         #                                          optimizer=optimizer, namespace='train')
         # scheduler.step()
         # logger.info(f"Begin to evaluate on valid.....")
         # print("begin to evaluate...")
         # valid_auc, valid_loss = ModelTrainer.run(logger,
         #                                          epoch, iter_per_epoch, model, val_loader,
         #                                          opt_metric, args, namespace='val')

         # if use_test_acc:
         #     logger.info(f"Begin to evaluate on test.....")
         #     test_auc, test_loss = ModelTrainer.run(logger,
         #                                            epoch, iter_per_epoch, model, test_loader,
         #                                            opt_metric, args, namespace='test')
         # else:
         #     test_auc = -1

         # info_dic[epoch] = {
         #     "train_auc": train_auc,
         #     "valid_auc": valid_auc,
         #     "train_loss": train_loss,
         #     "valid_loss": valid_loss,
         #     "train_val_total_time": time.time() - start_time}

         # record best auc and save checkpoint
         # if valid_auc >= best_valid_auc:
         #     best_valid_auc, best_test_auc = valid_auc, test_auc
         #     logger.info(f'best valid auc: valid {valid_auc:.4f}, test {test_auc:.4f}')
         # else:
         #     logger.info(f'valid {valid_auc:.4f}, test {test_auc:.4f}')

         # Training and evaluation loop
         for epoch in range(max_epoch):
             start_time = time.time()
             logger.info(f'Epoch [{epoch:3d}/{epoch_num:3d}]')
             # np.random.shuffle(idx)

             if global_rank == 0:
                 print('Starting Epoch %d:' % (epoch))
                 logger.info('Starting Epoch %d:' % (epoch))

             # Training phase
             train_correct = np.zeros(shape=[1], dtype=np.float32)
             test_correct = np.zeros(shape=[1], dtype=np.float32)
             train_loss = np.zeros(shape=[1], dtype=np.float32)

             model.train()
             # print ("num_train_batch: \n", num_train_batch)
             # print ()
             batch_idx = 0
             # for b in range(num_train_batch):
             for batch_idx, batch in enumerate(train_loader, start=1):
                 if batch_idx % 50 == 0:
                     print("trainer.py train batch_idx: \n", batch_idx)
                     logger.info("trainer.py train batch_idx: \n", batch_idx)
                 # Generate the batch data in this iteration
                 # x = train_x[idx[b * batch_size:(b + 1) * batch_size]]
                 # if model.dimension == 4:
                 #     x = augmentation(x, batch_size)
                 #     if (image_size != model.input_size):
                 #         x = resize_dataset(x, model.input_size)
                 # x = x.astype(np_dtype[precision])
                 # y = train_y[idx[b * batch_size:(b + 1) * batch_size]]

                 y = batch['y'].cpu().numpy()
                 batch['id'] = batch['id'].cpu().numpy().astype(int)
                 # batch['value'] = batch['value'].to(args.device)
                 x = np.zeros((batch['id'].shape[0], args.nfeat), dtype=np.float32)
                 # print ("target shape: ", target.shape)
                 # print ("target: ", target)
                 # print ("batch['id'] shape: ", batch['id'].shape)
                 # print ("batch['id']: ", batch['id'])
                 # print ("batch['value'] shape: ", batch['value'].shape)
                 # print ("batch['value']: ", batch['value'])
                 # print ("batch['id'].cpu().numpy().astype(int): \n", batch['id'].cpu().numpy().astype(int))
                 for i in range(batch['id'].shape[0]):
                     x[i][batch['id'][i]] = (np.float32)(1.0)
                 x = x.astype(dtype=np.float32)
                 y = y.astype(dtype=np.int32)

                 if x.shape[0] != args.batch_size:  # last batch not processing
                     continue

                 synflow_flag = False
                 # Train the model
                 # if True: # normal train steps
                 # Copy the patch data into input tensors
                 # print ("normal train steps\n")
                 # print ("x.astype(np.float32): \n", x.astype(np.float32))
                 # print ("y: \n", y)
                 tx = tensor.Tensor(x.shape, dev, singa_dtype[precision])
                 ty = tensor.Tensor((y.shape[0],), dev, tensor.int32)
                 tx.copy_from_numpy(x)  # dtype=np.float32
                 # print ("tx: \n", tx)
                 ty.copy_from_numpy(y)
                 # print ("ty: \n", ty)
                 # print ("normal before model(tx, ty, synflow_flag, dist_option, spars)")
                 # print ("train_cnn tx: \n", tx)
                 # print ("train_cnn ty: \n", ty)
                 # print ("trainer.py train before model forward ...")
                 # print ("model: ", model)
                 pn_p_g_list, out, loss = model(tx, ty, dist_option, spars, synflow_flag)
                 # print ("trainer.py train normal after model(tx, ty, synflow_flag, dist_option, spars)")
                 # print ("trainer.py train tx shape: ", tx.shape)
                 # print ("trainer.py train ty shape: ", ty.shape)
                 # print ("trainer.py train out.shape: ", out.shape)
                 # print ("trainer.py train out: ", out)
                 # print ("trainer.py train y shape: ", y.shape)
                 train_correct += accuracy(tensor.to_numpy(out), y)
                 train_loss += tensor.to_numpy(loss)[0]

             if DIST:
                 # Reduce the evaluation accuracy and loss from multiple devices
                 reducer = tensor.Tensor((1,), dev, tensor.float32)
                 train_correct = reduce_variable(train_correct, mssgd, reducer)
                 train_loss = reduce_variable(train_loss, mssgd, reducer)

             if global_rank == 0:
                 print('Training loss = %f, training accuracy = %f' %
                       (train_loss, train_correct /
                        (batch_idx * args.batch_size * world_size)),
                       flush=True)
                 print("train total batch_idx: ", batch_idx)

                 logger.info('Training loss = %f, training accuracy = %f' %
                       (train_loss, train_correct /
                        (batch_idx * args.batch_size * world_size)))

                 logger.info("train total batch_idx: ", batch_idx)
                 train_metric = train_correct / (batch_idx * args.batch_size * world_size)

             # Evaluation phase
             model.eval()
             batch_idx = 0
             # for b in range(num_val_batch):
             # print ("evaluation begins")
             for batch_idx, batch in enumerate(test_loader, start=1):
                 # print ("trainer.py test batch_idx: \n", batch_idx)
                 # x = val_x[b * batch_size:(b + 1) * batch_size]
                 # if model.dimension == 4:
                 #     if (image_size != model.input_size):
                 #         x = resize_dataset(x, model.input_size)
                 # x = x.astype(np_dtype[precision])
                 # y = val_y[b * batch_size:(b + 1) * batch_size]
                 # batch['value'] = batch['value'].cpu().numpy().astype(np_dtype[precision])
                 # x = batch['value'].cpu().numpy().astype(np_dtype[precision])

                 y = batch['y'].cpu().numpy()
                 batch['id'] = batch['id'].cpu().numpy().astype(int)
                 # batch['value'] = batch['value'].to(args.device)
                 x = np.zeros((batch['id'].shape[0], args.nfeat), dtype=np.float32)
                 # print ("target shape: ", target.shape)
                 # print ("target: ", target)
                 # print ("batch['id'] shape: ", batch['id'].shape)
                 # print ("batch['id']: ", batch['id'])
                 # print ("batch['value'] shape: ", batch['value'].shape)
                 # print ("batch['value']: ", batch['value'])
                 # print ("batch['id'].cpu().numpy().astype(int): \n", batch['id'].cpu().numpy().astype(int))
                 for i in range(batch['id'].shape[0]):
                     x[i][batch['id'][i]] = (np.float32)(1.0)
                 # print ("x[1]: \n", x[1])
                 x = x.astype(dtype=np.float32)
                 y = y.astype(dtype=np.int32)

                 if x.shape[0] != (args.batch_size * 8):  # last batch not processing
                     # print ("trainer.py test batch_idx: ", batch_idx)
                     # print ("trainer.py test x.shape: ", x.shape)
                     continue

                 tx = tensor.Tensor(x.shape, dev, singa_dtype[precision])
                 ty = tensor.Tensor((y.shape[0],), dev, tensor.int32)
                 tx.copy_from_numpy(x)
                 ty.copy_from_numpy(y)
                 # print ("trainer.py test tx shape: ", tx.shape)
                 out_test = model(tx)
                 # print ("trainer.py test out_test shape: ", out_test.shape)
                 # print ("trainer.py test y shape: ", y.shape)
                 # print ("trainer.py out_test: ", out_test)
                 # print ("trainer.py y: ", y)
                 test_correct += accuracy(tensor.to_numpy(out_test), y)
                 # print ("test_correct: ", test_correct)

             if DIST:
                 # Reduce the evaulation accuracy from multiple devices
                 test_correct = reduce_variable(test_correct, mssgd, reducer)

             # Output the evaluation accuracy
             if global_rank == 0:
                 print('Evaluation accuracy = %f, Elapsed Time = %fs' %
                       (test_correct / (batch_idx * args.batch_size * 8 * world_size),
                        time.time() - start_time),
                       flush=True)

                 logger.info('Evaluation accuracy = %f, Elapsed Time = %fs' %
                       (test_correct / (batch_idx * args.batch_size * 8 * world_size),
                        time.time() - start_time))
                 # print ("test all batch_idx: ", batch_idx)
                 test_metric = test_correct / (batch_idx * args.batch_size * 8 * world_size)

             info_dic[epoch] = {
                 "train_metric": str(train_metric[0]),
                 "test_metric": str(test_metric[0]),
                 "train_loss": str(train_loss[0]),
                 # "valid_loss": valid_loss,
                 "train_test_total_time": str(time.time() - start_time)}

         dev.PrintTimeProfiling()

         # return valid_auc, time.time() - start_time, info_dic
         print("info_dic: ", info_dic)
         logger.info("info_dic: ", info_dic)

         logger.info(json.dumps(info_dic))

         test_metric = train_metric
         return test_metric, time.time() - start_time, info_dic

     @classmethod
     def fully_train_arch_origin(cls,
                                 model: nn.Module,
                                 use_test_acc: bool,
                                 epoch_num,
                                 train_loader: DataLoader,
                                 val_loader: DataLoader,
                                 test_loader: DataLoader,
                                 args,
                                 logger=None
                                 ) -> (float, float, dict):
         """
         Args:
             model:
             use_test_acc:
             epoch_num: how many epoch, set by scheduler
             train_loader:
             val_loader:
             test_loader:
             args:
         Returns:
         """

         if logger is None:
             from src.logger import logger
             logger = logger

         start_time, best_valid_auc = time.time(), 0.

         # training params
         device = args.device
         num_labels = args.num_labels
         lr = args.lr
         iter_per_epoch = args.iter_per_epoch
         # report_freq = args.report_freq
         # given_patience = args.patience

         # assign new values
         args.epoch_num = epoch_num

         # for multiple classification
         opt_metric = nn.CrossEntropyLoss(reduction='mean').to(device)
         # this is only sutiable when output is dimension 1,
         # opt_metric = nn.BCEWithLogitsLoss(reduction='mean').to(device)

         # optimizer
         optimizer = optim.Adam(model.parameters(), lr=lr)
         scheduler = optim.lr_scheduler.CosineAnnealingLR(
             optimizer,
             T_max=epoch_num,  # Maximum number of iterations.
             eta_min=1e-4)  # Minimum learning rate.

         # gradient clipping, set the gradient value to be -1 - 1
         for p in model.parameters():
             p.register_hook(lambda grad: torch.clamp(grad, -1., 1.))

         info_dic = {}
         valid_auc = -1
         valid_loss = 0
         for epoch in range(epoch_num):
             logger.info(f'Epoch [{epoch:3d}/{epoch_num:3d}]')
             # train and eval
             # print("begin to train...")
             logger.info(f"Begin to train.....")
             train_auc, train_loss = ModelTrainer.run(logger,
                                                      epoch, iter_per_epoch, model, train_loader, opt_metric, args,
                                                      optimizer=optimizer, namespace='train')
             scheduler.step()
             logger.info(f"Begin to evaluate on valid.....")
             # print("begin to evaluate...")
             valid_auc, valid_loss = ModelTrainer.run(logger,
                                                      epoch, iter_per_epoch, model, val_loader,
                                                      opt_metric, args, namespace='val')

             if use_test_acc:
                 logger.info(f"Begin to evaluate on test.....")
                 test_auc, test_loss = ModelTrainer.run(logger,
                                                        epoch, iter_per_epoch, model, test_loader,
                                                        opt_metric, args, namespace='test')
             else:
                 test_auc = -1

             info_dic[epoch] = {
                 "train_auc": train_auc,
                 "valid_auc": valid_auc,
                 "train_loss": train_loss,
                 "valid_loss": valid_loss,
                 "train_val_total_time": time.time() - start_time}

             # record best auc and save checkpoint
             if valid_auc >= best_valid_auc:
                 best_valid_auc, best_test_auc = valid_auc, test_auc
                 logger.info(f'best valid auc: valid {valid_auc:.4f}, test {test_auc:.4f}')
             else:
                 logger.info(f'valid {valid_auc:.4f}, test {test_auc:.4f}')

         return valid_auc, time.time() - start_time, info_dic

     @classmethod
     def fully_evaluate_arch(cls,
                             model: nn.Module,
                             use_test_acc: bool,
                             epoch_num,
                             val_loader: DataLoader,
                             test_loader: DataLoader,
                             args,
                             logger=None,
                             ) -> (float, float, dict):
         """
         Args:
             model:
             use_test_acc:
             epoch_num: how many epoch, set by scheduler
             val_loader:
             test_loader:
             args:
         Returns:
         """

         if logger is None:
             from src.logger import logger
             logger = logger

         start_time, best_valid_auc = time.time(), 0.

         device = args.device
         iter_per_epoch = args.iter_per_epoch
         args.epoch_num = epoch_num
         opt_metric = nn.CrossEntropyLoss(reduction='mean').to(device)

         info_dic = {}
         valid_auc = -1
         valid_loss = 0
         for epoch in range(epoch_num):
             logger.info(f'Epoch [{epoch:3d}/{epoch_num:3d}]')
             # print("begin to evaluate...")
             valid_auc, valid_loss = ModelTrainer.run(logger,
                                                      epoch, iter_per_epoch, model, val_loader,
                                                      opt_metric, args, namespace='val')

             if use_test_acc:
                 test_auc, test_loss = ModelTrainer.run(logger,
                                                        epoch, iter_per_epoch, model, test_loader,
                                                        opt_metric, args, namespace='test')
             else:
                 test_auc = -1

             # record best auc and save checkpoint
             if valid_auc >= best_valid_auc:
                 best_valid_auc, best_test_auc = valid_auc, test_auc
                 logger.info(f'best valid auc: valid {valid_auc:.4f}, test {test_auc:.4f}')
             else:
                 logger.info(f'valid {valid_auc:.4f}, test {test_auc:.4f}')

         return valid_auc, time.time() - start_time, info_dic

     #  train one epoch of train/val/test
     @classmethod
     def run(cls, logger, epoch, iter_per_epoch, model, data_loader, opt_metric, args, optimizer=None,
             namespace='train'):
         if optimizer:
             model.train()
         else:
             model.eval()

         time_avg, timestamp = utils.AvgrageMeter(), time.time()
         loss_avg, auc_avg = utils.AvgrageMeter(), utils.AvgrageMeter()

         batch_idx = 0
         for batch_idx, batch in enumerate(data_loader):
             # if suer set this, then only train fix number of iteras
             # stop training current epoch for evaluation
             if namespace == 'train' and iter_per_epoch is not None and batch_idx >= iter_per_epoch:
                 logger.info(f"Traing Iteration {batch_idx} > iter_per_epoch = {iter_per_epoch}, breakout")
                 break

             target = batch['y'].type(torch.LongTensor).to(args.device)
             batch['id'] = batch['id'].to(args.device)
             batch['value'] = batch['value'].to(args.device)

             if namespace == 'train':
                 y = model(batch)
                 loss = opt_metric(y, target)

                 optimizer.zero_grad()
                 loss.backward()
                 optimizer.step()
             else:
                 with torch.no_grad():
                     y = model(batch)
                     loss = opt_metric(y, target)

             # for multiple classification
             auc = utils.roc_auc_compute_fn(torch.nn.functional.softmax(y, dim=1)[:, 1], target)
             # for binary classification
             # auc = utils.roc_auc_compute_fn(y, target)
             loss_avg.update(loss.item(), target.size(0))
             auc_avg.update(auc, target.size(0))

             time_avg.update(time.time() - timestamp)
             timestamp = time.time()
             if batch_idx % args.report_freq == 0:
                 logger.info(f'Epoch [{epoch:3d}/{args.epoch_num}][{batch_idx:3d}/{len(data_loader)}]\t'
                             f'{time_avg.val:.3f} ({time_avg.avg:.3f}) AUC {auc_avg.val:4f} ({auc_avg.avg:4f}) '
                             f'Loss {loss_avg.val:8.4f} ({loss_avg.avg:8.4f})')

                 # print(f'Epoch [{epoch:3d}/{args.epoch_num}][{batch_idx:3d}/{len(data_loader)}]\t'
                 #       f'{time_avg.val:.3f} ({time_avg.avg:.3f}) AUC {auc_avg.val:4f} ({auc_avg.avg:4f}) '
                 #       f'Loss {loss_avg.val:8.4f} ({loss_avg.avg:8.4f})')

         # record the last epoch information
         logger.info(f'Epoch [{epoch:3d}/{args.epoch_num}][{batch_idx:3d}/{len(data_loader)}]\t'
                     f'{time_avg.val:.3f} ({time_avg.avg:.3f}) AUC {auc_avg.val:4f} ({auc_avg.avg:4f}) '
                     f'Loss {loss_avg.val:8.4f} ({loss_avg.avg:8.4f})')

         # print(f'Epoch [{epoch:3d}/{args.epoch_num}][{batch_idx:3d}/{len(data_loader)}]\t'
         #       f'{time_avg.val:.3f} ({time_avg.avg:.3f}) AUC {auc_avg.val:4f} ({auc_avg.avg:4f}) '
         #       f'Loss {loss_avg.val:8.4f} ({loss_avg.avg:8.4f})')

         logger.info(f'{namespace}\tTime {utils.timeSince(s=time_avg.sum):>12s} '
                     f'AUC {auc_avg.avg:8.4f} Loss {loss_avg.avg:8.4f}')

         # print(f'{namespace}\tTime {utils.timeSince(s=time_avg.sum):>12s} '
         #       f'AUC {auc_avg.avg:8.4f} Loss {loss_avg.avg:8.4f}')

         return auc_avg.avg, loss_avg.avg