#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
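"""Query APIs over pre-computed MLP ground truth on tabular benchmarks.

This module hard-codes the paths of the training / scoring result JSON files
(Frappe, UCI Diabetes, Criteo) under ``base_dir`` and exposes the per-dataset
GTMLP class for querying validation AUC, scores, ranks, and timings.
"""
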
import os
from src.common.constant import Config
from src.tools.compute import generate_global_rank
from src.tools.io_tools import read_json
# data root: taken from the "base_dir" environment variable, defaulting to the current working directory
base_dir = os.environ.get("base_dir", os.getcwd())
print("base_dir is {}".format(base_dir))
# todo: move all of these paths to a config file
# MLP-related ground-truth training results
mlp_train_frappe = os.path.join(base_dir, "tab_data/frappe/all_train_baseline_frappe.json")
mlp_train_uci_diabetes = os.path.join(base_dir, "tab_data/uci_diabetes/all_train_baseline_uci_160k_40epoch.json")
mlp_train_criteo = os.path.join(base_dir, "tab_data/criteo/all_train_baseline_criteo.json")
# score result
mlp_score_frappe = os.path.join(base_dir, "tab_data/frappe/score_frappe_batch_size_32_local_finish_all_models.json")
# mlp_score_frappe = os.path.join(base_dir, "tab_data/frappe/score_frappe_batch_size_32_nawot_synflow.json")
mlp_score_uci = os.path.join(base_dir, "tab_data/uci_diabetes/score_uci_diabetes_batch_size_32_all_metrics.json")
mlp_score_criteo = os.path.join(base_dir, "tab_data/criteo/score_criteo_batch_size_32.json")
# 0.8028456677612497
# todo: these paths are for debugging expressFlow only
exp_mlp_score_frappe = os.path.join(base_dir, "micro_sensitivity/3_batch_size/4/score_mlp_sp_frappe_batch_size_32_cpu.json")
exp_mlp_score_uci = os.path.join(base_dir, "micro_sensitivity/3_batch_size/4/score_mlp_sp_uci_diabetes_batch_size_32_cpu.json")
exp_mlp_score_criteo = os.path.join(base_dir, "micro_sensitivity/3_batch_size/4/score_mlp_sp_criteo_batch_size_32_cpu.json")
# todo: here we use weight sharing.
mlp_score_frappe_weight_share = os.path.join(base_dir, "tab_data/weight_share_nas_frappe.json")
# pre-computed timing results: time to score one model and to train one epoch, per device and dataset
score_one_model_time_dict = {
    "cpu": {
        Config.Frappe: 0.0211558125,
        Config.UCIDataset: 0.015039052631578948,
        Config.Criteo: 0.6824370454545454
    },
    "gpu": {
        Config.Frappe: 0.013744457142857143,
        Config.UCIDataset: 0.008209692307692308,
        Config.Criteo: 0.6095493157894737
    }
}

train_one_epoch_time_dict = {
    "cpu": {
        Config.Frappe: 5.122203075885773,
        Config.UCIDataset: 4.16297769,
        Config.Criteo: 422
    },
    "gpu": {
        Config.Frappe: 2.8,
        Config.UCIDataset: 1.4,
        Config.Criteo: 125
    }
}

class GTMLP:
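    """Query API over pre-computed MLP ground truth (training AUCs, TFMEM scores, timings).

    Instances are cached per dataset (see __new__): the first construction for a dataset
    loads the corresponding training and score JSON files and builds the global rank.
    """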
    _instances = {}
    # these algorithms are combined into the new TFMEM (used for the global rank)
    default_alg_name_list = ["nas_wot", "synflow"]
    device = "cpu"
    def __new__(cls, dataset: str):
        # per-dataset singleton: load the ground-truth files only once per dataset
        if dataset not in cls._instances:
            instance = super(GTMLP, cls).__new__(cls)
            instance.dataset = dataset
            if dataset == Config.Frappe:
                instance.mlp_train_path = mlp_train_frappe
                instance.mlp_score_path = mlp_score_frappe
                instance.mlp_score_path_expressflow = exp_mlp_score_frappe
                instance.mlp_score_path_weight_share = mlp_score_frappe_weight_share
            elif dataset == Config.Criteo:
                instance.mlp_train_path = mlp_train_criteo
                instance.mlp_score_path = mlp_score_criteo
                instance.mlp_score_path_expressflow = exp_mlp_score_criteo
                instance.mlp_score_path_weight_share = "./not_exist"
            elif dataset == Config.UCIDataset:
                instance.mlp_train_path = mlp_train_uci_diabetes
                instance.mlp_score_path = mlp_score_uci
                instance.mlp_score_path_expressflow = exp_mlp_score_uci
                instance.mlp_score_path_weight_share = "./not_exist"
            instance.mlp_train = read_json(instance.mlp_train_path)
            instance.mlp_score = read_json(instance.mlp_score_path)
            # todo: merge the expressFlow scores into the main score dict; remove later
            mlp_score_expressflow = read_json(instance.mlp_score_path_expressflow)
            for arch_id in mlp_score_expressflow:
                if arch_id in instance.mlp_score:
                    instance.mlp_score[arch_id].update(mlp_score_expressflow[arch_id])
            # merge the weight-sharing results under the "weight_share" key
            mlp_score_weight_share = read_json(instance.mlp_score_path_weight_share)
            for arch_id in mlp_score_weight_share:
                if arch_id in instance.mlp_score:
                    instance.mlp_score[arch_id].update({"weight_share": mlp_score_weight_share[arch_id]})
            instance.mlp_global_rank = generate_global_rank(
                instance.mlp_score, instance.default_alg_name_list)
            cls._instances[dataset] = instance
        return cls._instances[dataset]

    def get_all_trained_model_ids(self):
        return list(self.mlp_train[self.dataset].keys())

    def get_all_scored_model_ids(self):
        return list(self.mlp_score.keys())

    def get_score_one_model_time(self, device: str):
        _score_time_per_model = score_one_model_time_dict[device].get(self.dataset)
        if _score_time_per_model is None:
            raise NotImplementedError
        return _score_time_per_model

    def get_train_one_epoch_time(self, device: str):
        _train_time_per_epoch = train_one_epoch_time_dict[device].get(self.dataset)
        if _train_time_per_epoch is None:
            raise NotImplementedError
        return _train_time_per_epoch

    def get_valid_auc(self, arch_id: str, epoch_num: int):
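        """Look up the ground-truth validation AUC and recorded time usage for one architecture.

        epoch_num is clamped to a dataset-specific maximum; time_usage is taken from the
        recorded "train_val_total_time" field rather than the GPU-based estimate.
        """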
        # todo: due to heavy job contention on the server, the recorded time usage may not be valid.
        # fallback estimate assuming training on GPU; overwritten below by the recorded time.
        # guard against epoch_num=None, which the per-dataset branches below accept.
        if epoch_num is not None:
            time_usage = (int(epoch_num) + 1) * self.get_train_one_epoch_time("gpu")
        if self.dataset == Config.Frappe:
            if epoch_num is None or epoch_num >= 13:
                epoch_num = 13
            t_acc = self.mlp_train[self.dataset][arch_id][str(epoch_num)]["valid_auc"]
            time_usage = self.mlp_train[self.dataset][arch_id][str(epoch_num)]["train_val_total_time"]
            return t_acc, time_usage
        elif self.dataset == Config.Criteo:
            if epoch_num is None or epoch_num >= 10:
                epoch_num = 9
            t_acc = self.mlp_train[self.dataset][arch_id][str(epoch_num)]["valid_auc"]
            time_usage = self.mlp_train[self.dataset][arch_id][str(epoch_num)]["train_val_total_time"]
            return t_acc, time_usage
        elif self.dataset == Config.UCIDataset:
            if epoch_num is None or epoch_num >= 40:
                epoch_num = 39
            t_acc = self.mlp_train[self.dataset][arch_id][str(0)]["valid_auc"]
            time_usage = self.mlp_train[self.dataset][arch_id][str(epoch_num)]["train_val_total_time"]
            return t_acc, time_usage
        else:
            raise NotImplementedError

    def api_get_score(self, arch_id: str) -> dict:
        score_dic = self.mlp_score[arch_id]
        return score_dic

    def get_global_rank_score(self, arch_id):
        return self.mlp_global_rank[arch_id]
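

# Minimal usage sketch: assumes the Frappe ground-truth JSON files referenced above
# exist under base_dir; the epoch number (5) and the choice of the first arch id
# are placeholders for illustration only.
if __name__ == "__main__":
    gt_api = GTMLP(Config.Frappe)
    print("scored models: {}".format(len(gt_api.get_all_scored_model_ids())))
    print("time to score one model on cpu: {}".format(gt_api.get_score_one_model_time("cpu")))
    first_arch = gt_api.get_all_trained_model_ids()[0]
    auc, time_used = gt_api.get_valid_auc(first_arch, 5)
    print("arch {} reaches valid_auc {} with time usage {}".format(first_arch, auc, time_used))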