#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
from src.eva_engine.phase1.algo.alg_base import Evaluator
from src.common.constant import Config
from singa import singa_wrap as singa
from singa import device as singa_device
from singa import tensor
from singa import opt
from singa import autograd
from singa.opt import Optimizer
from singa.opt import DecayScheduler
from singa.opt import Constant
import numpy as np
import time
import argparse
from PIL import Image
from numpy import linalg as LA
import torch
np_dtype = {"float16": np.float16, "float32": np.float32}
# singa_dtype = {"float16": tensor.float16, "float32": tensor.float32}
singa_dtype = {"float32": tensor.float32}

### MSOptimizer
class MSOptimizer(Optimizer):

    def __call__(self, loss):
        pn_p_g_list = self.call_with_returns(loss)
        # print ("optimizer1 before self.step()")
        # print ("optimizer1 before print len(pn_p_g_list): \n", len(pn_p_g_list))
        self.step()
        # print ("optimizer1 after print len(pn_p_g_list): \n", len(pn_p_g_list))
        # print ("optimizer1 after self.step()")
        return pn_p_g_list

    def call_with_returns(self, loss):
        # print ("call_with_returns before apply loss.data: \n", loss.data)
        pn_p_g_list = []
        for p, g in autograd.backward(loss):
            if p.name is None:
                p.name = id(p)
            self.apply(p.name, p, g)
            # print ("call with returns")
            # print ("p.name: \n", p.name)
            # print ("p.data: \n", p.data)
            # print ("g.data: \n", g.data)
            pn_p_g_list.append([p.name, p, g])  # need iterables
        # print ("call_with_returns after apply loss.data: \n", loss.data)
        return pn_p_g_list
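
# Note: MSOptimizer extends the base Optimizer so that each call returns the
# [param_name, param, grad] triples gathered during the backward pass; the
# SynFlow evaluator below uses these triples to score parameters against
# their gradients after the update step.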

# MSSGD -- functionally the same as singa.opt.SGD; it only inherits from
# MSOptimizer so that each update also returns the [name, param, grad] triples.
class MSSGD(MSOptimizer):
    r"""Implements stochastic gradient descent (optionally with momentum).

    Nesterov momentum is based on the formula from
    `On the importance of initialization and momentum in deep learning`__.

    Args:
        lr(float): learning rate
        momentum(float, optional): momentum factor (default: 0)
        weight_decay(float, optional): weight decay (L2 penalty) (default: 0)
        dampening(float, optional): dampening for momentum (default: 0)
        nesterov(bool, optional): enables Nesterov momentum (default: False)

    Typical usage example:
        >>> optimizer = MSSGD(lr=0.1, momentum=0.9)
        >>> optimizer.update()

    __ http://www.cs.toronto.edu/%7Ehinton/absps/momentum.pdf

    .. note::
        The implementation of SGD with Momentum/Nesterov subtly differs from
        Sutskever et al. and implementations in some other frameworks.
        Considering the specific case of Momentum, the update can be written as

        .. math::
            v = \rho * v + g \\
            p = p - lr * v

        where p, g, v and :math:`\rho` denote the parameters, gradient,
        velocity, and momentum respectively.

        This is in contrast to Sutskever et al. and
        other frameworks which employ an update of the form

        .. math::
            v = \rho * v + lr * g \\
            p = p - v

        The Nesterov version is analogously modified.
    """

    def __init__(self,
                 lr=0.1,
                 momentum=0,
                 dampening=0,
                 weight_decay=0,
                 nesterov=False,
                 dtype=tensor.float32):
        super(MSSGD, self).__init__(lr)

        # init momentum
        if type(momentum) == float or type(momentum) == int:
            if momentum < 0.0:
                raise ValueError("Invalid momentum value: {}".format(momentum))
            self.momentum = Constant(momentum)
        elif isinstance(momentum, DecayScheduler):
            self.momentum = momentum
            momentum = momentum.init_value
        else:
            raise TypeError("Wrong momentum type")
        # self.dtype = dtype
        # self.mom_value = self.momentum(self.step_counter).as_type(self.dtype)
        self.mom_value = self.momentum(self.step_counter)

        # init dampening
        if type(dampening) == float or type(dampening) == int:
            self.dampening = Constant(dampening)
        elif isinstance(dampening, DecayScheduler):
            self.dampening = dampening
            dampening = dampening.init_value
        else:
            raise TypeError("Wrong dampening type")
        # self.dam_value = self.dampening(self.step_counter).as_type(self.dtype)
        self.dam_value = self.dampening(self.step_counter)

        # init weight_decay
        if type(weight_decay) == float or type(weight_decay) == int:
            if weight_decay < 0.0:
                raise ValueError(
                    "Invalid weight_decay value: {}".format(weight_decay))
            self.weight_decay = Constant(weight_decay)
        elif isinstance(weight_decay, DecayScheduler):
            self.weight_decay = weight_decay
        else:
            raise TypeError("Wrong weight_decay type")
        # self.decay_value = self.weight_decay(self.step_counter).as_type(self.dtype)
        self.decay_value = self.weight_decay(self.step_counter)

        # init other params
        self.nesterov = nesterov
        self.moments = dict()

        # check value
        if nesterov and (momentum <= 0 or dampening != 0):
            raise ValueError(
                "Nesterov momentum requires a momentum and zero dampening")

    def apply(self, param_name, param_value, param_grad):
        """Performs a single optimization step.

        Args:
            param_name(String): the name of the param
            param_value(Tensor): param values to be updated in-place
            param_grad(Tensor): param gradients; the values may be updated
                in this function; do not use them again afterwards
        """
        assert param_value.shape == param_grad.shape, ("shape mismatch",
                                                       param_value.shape,
                                                       param_grad.shape)
        self.device_check(param_value, self.step_counter, self.lr_value,
                          self.mom_value, self.dam_value, self.decay_value)

        # derive dtype from input
        # assert param_value.dtype == self.dtype

        # TODO add branch operator
        # if self.decay_value != 0:
        if self.weight_decay.init_value != 0:
            singa.Axpy(self.decay_value.data, param_value.data, param_grad.data)

        if self.momentum.init_value != 0:
            if param_name not in self.moments:
                flag = param_value.device.graph_enabled()
                param_value.device.EnableGraph(False)
                self.moments[param_name] = tensor.zeros_like(param_value)
                param_value.device.EnableGraph(flag)

            buf = self.moments[param_name]
            buf *= self.mom_value
            alpha = 1.0 - self.dam_value
            singa.Axpy(alpha.data, param_grad.data, buf.data)

            if self.nesterov:
                singa.Axpy(self.mom_value.data, buf.data, param_grad.data)
            else:
                param_grad = buf

        minus_lr = 0.0 - self.lr_value
        singa.Axpy(minus_lr.data, param_grad.data, param_value.data)

    def step(self):
        # increment step counter, lr and moment
        # print ("before super step")
        super().step()
        # print ("after super step")
        # print ("before customized step")

        # mom_value = self.momentum(self.step_counter).as_type(self.dtype)
        # dam_value = self.dampening(self.step_counter).as_type(self.dtype)
        # decay_value = self.weight_decay(self.step_counter).as_type(self.dtype)
        mom_value = self.momentum(self.step_counter)
        dam_value = self.dampening(self.step_counter)
        decay_value = self.weight_decay(self.step_counter)

        self.mom_value.copy_from(mom_value)
        self.dam_value.copy_from(dam_value)
        self.decay_value.copy_from(decay_value)
        # print ("after customized step")

    def get_states(self):
        states = super().get_states()
        if self.mom_value > 0:
            states['moments'] = self.moments  # a dict for 1st order moments tensors
        return states

    def set_states(self, states):
        super().set_states(states)
        if 'moments' in states:
            self.moments = states['moments']
            self.mom_value = self.momentum(self.step_counter)
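
# A minimal usage sketch for MSSGD (illustrative only; `model`, `tx`, `ty` are
# placeholders for a compiled SINGA model and its input/label tensors):
#
#   mssgd = MSSGD(lr=0.005, momentum=0.9, weight_decay=1e-5, dtype=singa_dtype['float32'])
#   model.set_optimizer(mssgd)
#   pn_p_g_list, out, loss = model(tx, ty)  # forward + backward; MSOptimizer returns the triples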

# Data augmentation
def augmentation(x, batch_size):
    xpad = np.pad(x, [[0, 0], [0, 0], [4, 4], [4, 4]], 'symmetric')
    for data_num in range(0, batch_size):
        offset = np.random.randint(8, size=2)
        x[data_num, :, :, :] = xpad[data_num, :,
                                    offset[0]:offset[0] + x.shape[2],
                                    offset[1]:offset[1] + x.shape[2]]
        if_flip = np.random.randint(2)
        if (if_flip):
            x[data_num, :, :, :] = x[data_num, :, :, ::-1]
    return x

# Calculate accuracy
def accuracy(pred, target):
    # y is network output to be compared with ground truth (int)
    y = np.argmax(pred, axis=1)
    a = y == target
    correct = np.array(a, "int").sum()
    return correct

# Data partition according to the rank
def partition(global_rank, world_size, train_x, train_y, val_x, val_y):
    # Partition training data
    data_per_rank = train_x.shape[0] // world_size
    idx_start = global_rank * data_per_rank
    idx_end = (global_rank + 1) * data_per_rank
    train_x = train_x[idx_start:idx_end]
    train_y = train_y[idx_start:idx_end]

    # Partition evaluation data
    data_per_rank = val_x.shape[0] // world_size
    idx_start = global_rank * data_per_rank
    idx_end = (global_rank + 1) * data_per_rank
    val_x = val_x[idx_start:idx_end]
    val_y = val_y[idx_start:idx_end]
    return train_x, train_y, val_x, val_y

# Function to all-reduce NUMPY accuracy and loss from multiple devices
def reduce_variable(variable, dist_opt, reducer):
    reducer.copy_from_numpy(variable)
    dist_opt.all_reduce(reducer.data)
    dist_opt.wait()
    output = tensor.to_numpy(reducer)
    return output

def resize_dataset(x, image_size):
    num_data = x.shape[0]
    dim = x.shape[1]
    X = np.zeros(shape=(num_data, dim, image_size, image_size),
                 dtype=np.float32)
    for n in range(0, num_data):
        for d in range(0, dim):
            X[n, d, :, :] = np.array(Image.fromarray(x[n, d, :, :]).resize(
                (image_size, image_size), Image.BILINEAR),
                                     dtype=np.float32)
    return X

class SynFlowEvaluator(Evaluator):

    def __init__(self):
        super().__init__()

    def evaluate(self, arch, device, batch_data: object, batch_labels: torch.Tensor, space_name: str) -> float:
"""
This is implementation of paper
"Pruning neural networks without any data by iteratively conserving synaptic flow"
The score takes 5 steps:
1. For each layer, for each parameter, calculate the absolute value |0|
2. Use a single all-one-vector with dim = [1, c, h, w] to run a forward,
Since only consider linear and Con2d operation, the forward output is multiple( [ |0l| for l in L] )
3. New loss function R = sum(output), and then run backward
4. for each layer, calculate Sl = Hadamard product( df/dw, w), where Sij=aij×bij
5. score = sum( [ Sl for l in layers ] )
Comments:
1. this is data-Agnostic
2. only compute on a single example
"""

        ### singa configs
        mssgd = MSSGD(lr=0.005, momentum=0.9, weight_decay=1e-5, dtype=singa_dtype['float32'])
        device_id = 0
        max_epoch = 1
        model = arch
        graph = True
        verbosity = 0
        dist_option = 'plain'
        spars = None
        precision = 'float32'
        global_rank = 0
        world_size = 1

        ### singa setups
        # print ("device: \n", device)
        if device == 'cpu':
            dev = singa_device.get_default_device()
        else:  # GPU
            dev = singa_device.create_cuda_gpu_on(device_id)  # need to change to CPU device for CPU-only machines
        dev.SetRandSeed(0)
        np.random.seed(0)

        # For distributed training, sequential has better performance
        if hasattr(mssgd, "communicator"):
            DIST = True
            sequential = True
        else:
            DIST = False
            sequential = False

        model.train()

        ### process batch_data
        x = batch_data.cpu().numpy()  # Size([1, 100]) and all ones
        x = x.astype(np_dtype[precision])
        y = np.ones(x.shape[0], dtype=np.int32)
        if model.dimension == 2:  # input data dimension
            tx = tensor.Tensor(x.shape, dev, singa_dtype[precision])
            ty = tensor.Tensor((x.shape[0],), dev, tensor.int32)

        model.set_optimizer(mssgd)
        model.compile([tx], is_train=True, use_graph=graph, sequential=sequential)
        dev.SetVerbosity(verbosity)

        # 1. Convert params to their abs.
        synflow_flag = True  ### just change the model to the absolute value
        tx.copy_from_numpy(x)  # dtype=np.float32
        ty.copy_from_numpy(y)
        # print ("before model forward ...")
        pn_p_g_list, out, loss = model(tx, ty, dist_option, spars, synflow_flag)
        # print ("---------------------------------------")
        # print ("before absolute prune_synflow !!!nemb input vector!!! tensor.to_numpy(loss)[0]: ", tensor.to_numpy(loss)[0])
        # print ("before absolute prune_synflow !!!nemb input vector!!! tensor.to_numpy(loss): ", tensor.to_numpy(loss))
        # train_correct += accuracy(tensor.to_numpy(out), y)
        # train_loss += tensor.to_numpy(loss)[0]

        # all params turned to positive
        for pn_p_g_item in pn_p_g_list:
            # print ("absolute value parameter name: \n", pn_p_g_item[0])
            param_np = tensor.to_numpy(pn_p_g_item[1])
            # print ("param_np shape: \n", param_np.shape)
            # print ("param_np sqrt norm: \n", np.sqrt(LA.norm(param_np)/param_np.size))
            # print ("before abs np.min(tensor.to_numpy(pn_p_g_item[1])): \n", np.min(tensor.to_numpy(pn_p_g_item[1])))
            pn_p_g_item[1] = tensor.abs(pn_p_g_item[1])  # tensor actually ..
            # print ("after abs np.min(tensor.to_numpy(pn_p_g_item[1])): \n", np.min(tensor.to_numpy(pn_p_g_item[1])))
            # print ("after abs pn_p_g_item[1][0]: \n", pn_p_g_item[1][0])

        # 2. Compute gradients with input of one dummy example ( 1-vector with dimension [1, c, h, w] )
        # 3. R = sum(output)
        # 4. Select the gradients that we want to use for search/prune
        # 5. Sum over all parameters' results to get the final score.
        # score = sum([grad.sum() for grad in grads_abs])
        # print ("calculate synflow")
        synflow_flag = True

        ### step 1: all one input
        # Copy the patch data into input tensors
        # tx.copy_from_numpy(np.ones(x.shape, dtype=np.float32))
        tx.copy_from_numpy(x)  # dtype=np.float32 # actually it is all ones ... --> np.ones(x.shape, dtype=np.float32)
        ty.copy_from_numpy(y)

        ### step 2: all weights turned to positive (done)
        ### step 3: new loss (done)
        # print ("before model forward ...")
        pn_p_g_list, out, loss = model(tx, ty, dist_option, spars, synflow_flag)
        # print ("prune_synflow !!!nemb input vector!!! synflow step tensor.to_numpy(loss)[0]: ", tensor.to_numpy(loss)[0])

        ### step 4: calculate the multiplication of weights
        score = 0.0
        for pn_p_g_item in pn_p_g_list:
            # print ("calculate weight param * grad parameter name: \n", pn_p_g_item[0])
            if len(pn_p_g_item[1].shape) == 2:  # param_value.data is "weight"
                # print ("pn_p_g_item[1].shape: \n", pn_p_g_item[1].shape)
                # print ("tensor.to_numpy(pn_p_g_item[1][0]): ", tensor.to_numpy(pn_p_g_item[1][0]))
                # print ("calculate synflow parameter name: \n", pn_p_g_item[0])
                # print ("should be positive np.min(tensor.to_numpy(pn_p_g_item[1])): ", np.min(tensor.to_numpy(pn_p_g_item[1])))
                # print ("weight should be positive tensor.to_numpy(pn_p_g_item[1][0])[0, :10]: ", tensor.to_numpy(pn_p_g_item[1][0])[0, :10])
                # print ("gradients tensor.to_numpy(pn_p_g_item[2][0])[0, :10]: ", tensor.to_numpy(pn_p_g_item[2][0])[0, :10])
                # print ()
                score += np.sum(np.absolute(tensor.to_numpy(pn_p_g_item[1]) * tensor.to_numpy(pn_p_g_item[2])))
        # print ("layer_hidden_list: \n", layer_hidden_list)
        # print ("prune_synflow !!!one-hot input vector!!! absolute step tensor.to_numpy(loss)[0]: ", tensor.to_numpy(loss)[0])
        print("score: \n", score)
        return score
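
# Illustrative usage sketch (hypothetical wiring; `arch` must be a SINGA model that
# exposes `dimension`, `set_optimizer`, `compile` and the synflow-aware forward used above;
# the space name is a placeholder):
#
#   evaluator = SynFlowEvaluator()
#   all_ones = torch.ones(1, 100)                      # dummy all-ones batch, shape [1, 100]
#   score = evaluator.evaluate(arch, 'cpu', all_ones, torch.ones(1), space_name='mlp')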