| # |
| # Licensed to the Apache Software Foundation (ASF) under one |
| # or more contributor license agreements. See the NOTICE file |
| # distributed with this work for additional information |
| # regarding copyright ownership. The ASF licenses this file |
| # to you under the Apache License, Version 2.0 (the |
| # "License"); you may not use this file except in compliance |
| # with the License. You may obtain a copy of the License at |
| # |
| # http://www.apache.org/licenses/LICENSE-2.0 |
| # |
| # Unless required by applicable law or agreed to in writing, software |
| # distributed under the License is distributed on an "AS IS" BASIS, |
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| # See the License for the specific language governing permissions and |
| # limitations under the License. |
| # |
| |
| from src.eva_engine.phase1.algo.alg_base import Evaluator |
| from src.common.constant import Config |
| |
| from singa import singa_wrap as singa |
| from singa import device as singa_device |
| from singa import tensor |
| from singa import opt |
| from singa import autograd |
| from singa.opt import Optimizer |
| from singa.opt import DecayScheduler |
| from singa.opt import Constant |
| import numpy as np |
| import time |
| import argparse |
| from PIL import Image |
| from numpy import linalg as LA |
| |
| np_dtype = {"float16": np.float16, "float32": np.float32} |
| |
| # singa_dtype = {"float16": tensor.float16, "float32": tensor.float32} |
| singa_dtype = {"float32": tensor.float32} |
| |
| ### MSOptimizer: optimizer wrapper that, besides applying the update step, |
| ### returns the [param_name, param, grad] triples produced by the backward pass. |
| class MSOptimizer(Optimizer): |
| def __call__(self, loss): |
| pn_p_g_list = self.call_with_returns(loss) |
| self.step() |
| return pn_p_g_list |
| |
| def call_with_returns(self, loss): |
| pn_p_g_list = [] |
| for p, g in autograd.backward(loss): |
| if p.name is None: |
| p.name = id(p) |
| self.apply(p.name, p, g) |
| pn_p_g_list.append([p.name, p, g]) # collect [name, param, grad] triples |
| return pn_p_g_list |
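| # Usage sketch (illustrative): given a SINGA loss tensor from a forward pass, |
| # pn_p_g_list = optimizer(loss) # applies the update and returns [[name, param, grad], ...] |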
| |
| # MSSGD -- same update rule as singa.opt.SGD; only the MSOptimizer return behaviour differs |
| class MSSGD(MSOptimizer): |
| """Implements stochastic gradient descent (optionally with momentum). |
| |
| Nesterov momentum is based on the formula from `On the importance of initialization and momentum in deep learning`__. |
| |
| Args: |
| lr(float): learning rate |
| momentum(float, optional): momentum factor(default: 0) |
| weight_decay(float, optional): weight decay(L2 penalty)(default: 0) |
| dampening(float, optional): dampening for momentum(default: 0) |
| nesterov(bool, optional): enables Nesterov momentum(default: False) |
| |
| Typical usage example: |
| >>> from singa import opt |
| >>> optimizer = opt.SGD(lr=0.1, momentum=0.9) |
| >>> optimizer.update() |
| |
| __ http://www.cs.toronto.edu/%7Ehinton/absps/momentum.pdf |
| |
| .. note:: |
| The implementation of SGD with Momentum / Nesterov subtly differs from |
| Sutskever et al. and implementations in some other frameworks. |
| |
| Considering the specific case of Momentum, the update can be written as |
| |
| .. math:: |
| v = \rho * v + g \\ |
| p = p - lr * v |
| |
| where p, g, v and :math:`\rho` denote the parameters, gradient, |
| velocity, and momentum respectively. |
| |
| This is in contrast to Sutskever et al. and |
| other frameworks which employ an update of the form |
| |
| .. math:: |
| v = \rho * v + lr * g \\ |
| p = p - v |
| |
| The Nesterov version is analogously modified. |
| """ |
| |
| def __init__(self, |
| lr=0.1, |
| momentum=0, |
| dampening=0, |
| weight_decay=0, |
| nesterov=False, |
| dtype=tensor.float32): |
| super(MSSGD, self).__init__(lr) |
| |
| # init momentum |
| if type(momentum) == float or type(momentum) == int: |
| if momentum < 0.0: |
| raise ValueError("Invalid momentum value: {}".format(momentum)) |
| self.momentum = Constant(momentum) |
| elif isinstance(momentum, DecayScheduler): |
| self.momentum = momentum |
| momentum = momentum.init_value |
| else: |
| raise TypeError("Wrong momentum type") |
| # self.dtype = dtype |
| # self.mom_value = self.momentum(self.step_counter).as_type(self.dtype) |
| self.mom_value = self.momentum(self.step_counter) |
| |
| # init dampening |
| if type(dampening) == float or type(dampening) == int: |
| self.dampening = Constant(dampening) |
| elif isinstance(dampening, DecayScheduler): |
| self.dampening = dampening |
| dampening = dampening.init_value |
| else: |
| raise TypeError("Wrong dampening type") |
| # self.dam_value = self.dampening(self.step_counter).as_type(self.dtype) |
| self.dam_value = self.dampening(self.step_counter) |
| |
| # init weight_decay |
| if type(weight_decay) == float or type(weight_decay) == int: |
| if weight_decay < 0.0: |
| raise ValueError( |
| "Invalid weight_decay value: {}".format(weight_decay)) |
| self.weight_decay = Constant(weight_decay) |
| elif isinstance(weight_decay, DecayScheduler): |
| self.weight_decay = weight_decay |
| else: |
| raise TypeError("Wrong weight_decay type") |
| # self.decay_value = self.weight_decay(self.step_counter).as_type(self.dtype) |
| self.decay_value = self.weight_decay(self.step_counter) |
| |
| # init other params |
| self.nesterov = nesterov |
| self.moments = dict() |
| |
| # check value |
| if nesterov and (momentum <= 0 or dampening != 0): |
| raise ValueError( |
| "Nesterov momentum requires a momentum and zero dampening") |
| |
| def apply(self, param_name, param_value, param_grad): |
| """Performs a single optimization step. |
| |
| Args: |
| param_name(String): the name of the param |
| param_value(Tensor): param values to be updated in-place |
| param_grad(Tensor): param gradients; the values may be modified |
| in this function and must not be used afterwards |
| """ |
| assert param_value.shape == param_grad.shape, ("shape mismatch", |
| param_value.shape, |
| param_grad.shape) |
| self.device_check(param_value, self.step_counter, self.lr_value, |
| self.mom_value, self.dam_value, self.decay_value) |
| |
| # derive dtype from input |
| # assert param_value.dtype == self.dtype |
| |
| # TODO add branch operator |
| # if self.decay_value != 0: |
| if self.weight_decay.init_value != 0: |
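| # singa.Axpy(a, x, y) computes y += a * x, so this adds the L2 term: grad += weight_decay * param |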
| singa.Axpy(self.decay_value.data, param_value.data, param_grad.data) |
| |
| if self.momentum.init_value != 0: |
| if param_name not in self.moments: |
| flag = param_value.device.graph_enabled() |
| param_value.device.EnableGraph(False) |
| self.moments[param_name] = tensor.zeros_like(param_value) |
| param_value.device.EnableGraph(flag) |
| |
| buf = self.moments[param_name] |
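| # momentum buffer update: v = momentum * v + (1 - dampening) * grad |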
| buf *= self.mom_value |
| alpha = 1.0 - self.dam_value |
| singa.Axpy(alpha.data, param_grad.data, buf.data) |
| |
| if self.nesterov: |
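| # Nesterov look-ahead: grad = grad + momentum * v |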
| singa.Axpy(self.mom_value.data, buf.data, param_grad.data) |
| else: |
| param_grad = buf |
| |
| # parameter update: param = param - lr * grad (Axpy with a = -lr) |
| minus_lr = 0.0 - self.lr_value |
| singa.Axpy(minus_lr.data, param_grad.data, param_value.data) |
| |
| def step(self): |
| # increment the step counter and refresh lr, momentum, dampening and weight decay |
| super().step() |
| # mom_value = self.momentum(self.step_counter).as_type(self.dtype) |
| # dam_value = self.dampening(self.step_counter).as_type(self.dtype) |
| # decay_value = self.weight_decay(self.step_counter).as_type(self.dtype) |
| mom_value = self.momentum(self.step_counter) |
| dam_value = self.dampening(self.step_counter) |
| decay_value = self.weight_decay(self.step_counter) |
| self.mom_value.copy_from(mom_value) |
| self.dam_value.copy_from(dam_value) |
| self.decay_value.copy_from(decay_value) |
| |
| def get_states(self): |
| states = super().get_states() |
| if self.mom_value > 0: |
| states['moments'] = self.moments # a dict of the 1st-order moment tensors |
| return states |
| |
| def set_states(self, states): |
| super().set_states(states) |
| if 'moments' in states: |
| self.moments = states['moments'] |
| self.mom_value = self.momentum(self.step_counter) |
| |
| # Data augmentation |
| def augmentation(x, batch_size): |
| # pad by 4 pixels, take a random crop and randomly flip horizontally (NCHW input) |
| xpad = np.pad(x, [[0, 0], [0, 0], [4, 4], [4, 4]], 'symmetric') |
| for data_num in range(0, batch_size): |
| offset = np.random.randint(8, size=2) |
| x[data_num, :, :, :] = xpad[data_num, :, |
| offset[0]:offset[0] + x.shape[2], |
| offset[1]:offset[1] + x.shape[3]] |
| if_flip = np.random.randint(2) |
| if if_flip: |
| x[data_num, :, :, :] = x[data_num, :, :, ::-1] |
| return x |
| |
| |
| # Calculate accuracy |
| def accuracy(pred, target): |
| # y is network output to be compared with ground truth (int) |
| y = np.argmax(pred, axis=1) |
| a = y == target |
| correct = np.array(a, "int").sum() |
| return correct |
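| # Example: accuracy(np.array([[0.1, 0.9], [0.8, 0.2]]), np.array([1, 1])) returns 1, |
| # i.e. the number of correct predictions rather than a ratio. |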
| |
| |
| # Data partition according to the rank |
| def partition(global_rank, world_size, train_x, train_y, val_x, val_y): |
| # Partition training data |
| data_per_rank = train_x.shape[0] // world_size |
| idx_start = global_rank * data_per_rank |
| idx_end = (global_rank + 1) * data_per_rank |
| train_x = train_x[idx_start:idx_end] |
| train_y = train_y[idx_start:idx_end] |
| |
| # Partition evaluation data |
| data_per_rank = val_x.shape[0] // world_size |
| idx_start = global_rank * data_per_rank |
| idx_end = (global_rank + 1) * data_per_rank |
| val_x = val_x[idx_start:idx_end] |
| val_y = val_y[idx_start:idx_end] |
| return train_x, train_y, val_x, val_y |
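| # Example: with world_size=2, global_rank 0 receives the first half of the train/val arrays |
| # and global_rank 1 the second half; any remainder from the integer division is dropped. |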
| |
| |
| # Function to all reduce NUMPY accuracy and loss from multiple devices |
| def reduce_variable(variable, dist_opt, reducer): |
| reducer.copy_from_numpy(variable) |
| dist_opt.all_reduce(reducer.data) |
| dist_opt.wait() |
| output = tensor.to_numpy(reducer) |
| return output |
| |
| |
| def resize_dataset(x, image_size): |
| num_data = x.shape[0] |
| dim = x.shape[1] |
| X = np.zeros(shape=(num_data, dim, image_size, image_size), |
| dtype=np.float32) |
| for n in range(0, num_data): |
| for d in range(0, dim): |
| X[n, d, :, :] = np.array(Image.fromarray(x[n, d, :, :]).resize( |
| (image_size, image_size), Image.BILINEAR), |
| dtype=np.float32) |
| return X |
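| # Example: resize_dataset(train_x, 224) bilinearly resizes every channel of every image in |
| # train_x (N, C, H, W) and returns a float32 array of shape (N, C, 224, 224). |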
| |
| import torch # torch tensors are only used as the input format of batch_data / batch_labels |
| class SynFlowEvaluator(Evaluator): |
| |
| def __init__(self): |
| super().__init__() |
| |
| def evaluate(self, arch, device, batch_data: object, batch_labels: torch.Tensor, space_name: str) -> float: |
| """ |
| This is an implementation of the paper |
| "Pruning neural networks without any data by iteratively conserving synaptic flow". |
| The score takes 5 steps: |
| 1. For each layer, for each parameter w, calculate the absolute value |w| |
| 2. Use a single all-ones vector with dim = [1, c, h, w] to run a forward pass; |
| since only Linear and Conv2d operations are considered, the forward output is the product of [ |w_l| for l in L ] |
| 3. New loss function R = sum(output), and then run backward |
| 4. For each layer l, calculate S_l = Hadamard product( dR/dw, w ), i.e. (S_l)_ij = (dR/dw)_ij * w_ij |
| 5. score = sum( [ S_l for l in layers ] ) |
| Comments: |
| 1. this is data-agnostic |
| 2. it only computes on a single (all-ones) example |
| """ |
| |
| ### singa configs |
| mssgd = MSSGD(lr=0.005, momentum=0.9, weight_decay=1e-5, dtype=singa_dtype['float32']) |
| device_id = 0 |
| max_epoch = 1 |
| model = arch |
| graph = True |
| verbosity = 0 |
| dist_option = 'plain' |
| spars = None |
| precision = 'float32' |
| global_rank = 0 |
| world_size = 1 |
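| # Minimal NumPy illustration of the SynFlow score described in the docstring above |
| # (hypothetical single linear layer, for intuition only; the actual computation below |
| # goes through the SINGA graph): |
| # W = np.abs(W0) # step 1: absolute parameter values, shape (d, k) |
| # R = (np.ones((1, d)) @ W).sum() # steps 2-3: all-ones input, loss R = sum(output) |
| # dR_dW = np.ones((d, k)) # gradient of R w.r.t. W in this toy case |
| # score = np.abs(W * dR_dW).sum() # steps 4-5: sum of the Hadamard product entries |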
| |
| ### singa setups |
| if device == 'cpu': |
| dev = singa_device.get_default_device() |
| else: # GPU |
| dev = singa_device.create_cuda_gpu_on(device_id) # need to change to CPU device for CPU-only machines |
| dev.SetRandSeed(0) |
| np.random.seed(0) |
| |
| # For distributed training, sequential has better performance |
| if hasattr(mssgd, "communicator"): |
| DIST = True |
| sequential = True |
| else: |
| DIST = False |
| sequential = False |
| |
| model.train() |
| |
| ### process batch_data |
| x = batch_data.cpu().numpy() # Size([1, 100]) and all ones |
| x = x.astype(np_dtype[precision]) |
| y = np.ones(x.shape[0], dtype=np.int32) |
| if model.dimension == 2: # input data dimension; only 2-D (batch, feature) inputs are handled here |
| tx = tensor.Tensor(x.shape, dev, singa_dtype[precision]) |
| ty = tensor.Tensor((x.shape[0],), dev, tensor.int32) |
| |
| model.set_optimizer(mssgd) |
| model.compile([tx], is_train=True, use_graph=graph, sequential=sequential) |
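| # model.compile initializes the parameters from the sample input [tx] and, when use_graph is |
| # True, builds the static computational graph (an assumption about the API based on its use here) |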
| dev.SetVerbosity(verbosity) |
| |
| |
| # 1. Convert params to their absolute values. |
| synflow_flag = True ### run the forward pass with the model switched to absolute parameter values |
| tx.copy_from_numpy(x) # dtype=np.float32 |
| ty.copy_from_numpy(y) |
| pn_p_g_list, out, loss = model(tx, ty, dist_option, spars, synflow_flag) |
| # all params turned to positive |
| for pn_p_g_item in pn_p_g_list: |
| pn_p_g_item[1] = tensor.abs(pn_p_g_item[1]) # replace the list entry with the element-wise |param| |
| |
| # 2. Compute gradients with one dummy input (an all-ones vector of dimension [1, c, h, w]) |
| # 3. R = sum(output) |
| # 4. Select the gradients that we want to use for search/prune |
| # 5. Sum over all parameters' results to get the final score, i.e. |
| # score = sum([grad.sum() for grad in grads_abs]) |
| |
| synflow_flag = True |
| ### step 1: all-ones input |
| # copy the batch into the input tensors; x is already all ones, i.e. np.ones(x.shape, dtype=np.float32) |
| tx.copy_from_numpy(x) |
| ty.copy_from_numpy(y) |
| ### step 2: all weights turned to positive (done above) |
| ### step 3: new loss (done) |
| pn_p_g_list, out, loss = model(tx, ty, dist_option, spars, synflow_flag) |
| ### step 4: calculate the multiplication of weights and gradients |
| score = 0.0 |
| for pn_p_g_item in pn_p_g_list: |
| if len(pn_p_g_item[1].shape) == 2: # 2-D params are the "weight" matrices |
| # accumulate sum_ij | w_ij * (dR/dw)_ij | for this layer |
| score += np.sum(np.absolute(tensor.to_numpy(pn_p_g_item[1]) * tensor.to_numpy(pn_p_g_item[2]))) |
| print("score:", score) |
| |
| return score |