examples/hfl/src/bank.py - singa - Git at Google

 #
 # Licensed to the Apache Software Foundation (ASF) under one
 # or more contributor license agreements.  See the NOTICE file
 # distributed with this work for additional information
 # regarding copyright ownership.  The ASF licenses this file
 # to you under the Apache License, Version 2.0 (the
 # "License"); you may not use this file except in compliance
 # with the License.  You may obtain a copy of the License at
 #
 #   http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing,
 # software distributed under the License is distributed on an
 # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 # KIND, either express or implied.  See the License for the
 # specific language governing permissions and limitations
 # under the License.
 #

 # https://github.com/zhengzangw/Fed-SINGA/blob/main/src/client/data/bank.py

 import pandas as pd
 import numpy as np
 import sys
 from pandas.api.types import is_numeric_dtype
 from sklearn.model_selection import train_test_split
 from sklearn.utils import shuffle


 def encode(df):
     """One-hot encode every non-numeric column of ``df``.

     Columns for which ``is_numeric_dtype`` is true are passed through
     unchanged; every other column is expanded with ``pd.get_dummies`` using
     the column name as prefix. Output columns follow the input column order.

     Args:
         df: Input ``pandas.DataFrame``.

     Returns:
         A new ``pandas.DataFrame``; an empty one when ``df`` has no columns.
     """
     pieces = []
     for col in df.columns.values:
         if is_numeric_dtype(df[col]):
             pieces.append(df[col])
         else:
             pieces.append(pd.get_dummies(df[col], prefix=col))

     if not pieces:
         # pd.concat raises ValueError when given nothing to concatenate.
         return pd.DataFrame()

     # Concatenate once: growing a frame inside the loop re-copies all
     # previously added columns on every iteration.
     return pd.concat(pieces, axis=1)


 def load(device_id):
     """Load the train/validation shards of one client as NumPy arrays.

     Reads ``data/bank_train_<device_id>.csv`` and
     ``data/bank_test_<device_id>.csv``, separates the ``y`` column from the
     remaining columns, and rescales the features with ``normalize``.

     Args:
         device_id: Identifier inserted into the file names via ``str()``.

     Returns:
         Tuple ``(train_x, train_y, val_x, val_y, num_classes)``. Features
         are handed to ``normalize`` as float32 arrays, labels are int32
         arrays and ``num_classes`` is always 2.
     """
     suffix = str(device_id) + ".csv"
     train_frame = pd.read_csv("data/bank_train_" + suffix, sep=',')
     test_frame = pd.read_csv("data/bank_test_" + suffix, sep=',')

     # Everything except the 'y' column is a feature.
     raw_train_x, raw_train_y = train_frame.drop(['y'], axis=1), train_frame['y']
     raw_val_x, raw_val_y = test_frame.drop(['y'], axis=1), test_frame['y']

     features_train = np.array(raw_train_x, dtype=np.float32)
     features_val = np.array(raw_val_x, dtype=np.float32)
     labels_train = np.array(raw_train_y, dtype=np.int32)
     labels_val = np.array(raw_val_y, dtype=np.int32)

     features_train, features_val = normalize(features_train, features_val)

     return features_train, labels_train, features_val, labels_val, 2


 def normalize(X_train, X_test):
     """Min-max scale both feature sets with a scaler fitted on ``X_train``.

     Args:
         X_train: Training features; the scaler is fitted on these.
         X_test: Held-out features, transformed with the already fitted scaler.

     Returns:
         Tuple ``(X_train_scaled, X_test_scaled)``.
     """
     from sklearn.preprocessing import MinMaxScaler

     # Fit on the training data only, then apply the same mapping to both.
     fitted = MinMaxScaler().fit(X_train)
     return fitted.transform(X_train), fitted.transform(X_test)


 def split(num):
     """Encode the raw bank CSV and shard it across ``num`` clients.

     Reads ``../data/bank-additional-full.csv`` (``;``-separated), maps the
     ``y`` column to 1 for ``'yes'`` and 0 otherwise, one-hot encodes the
     frame with ``encode``, shuffles it and holds out 20% as the test set.
     The full train/test sets are written to ``data/bank_train_.csv`` and
     ``data/bank_test_.csv``; client ``i`` gets ``data/bank_train_<i>.csv``
     and ``data/bank_test_<i>.csv``. Shards are equally sized, so up to
     ``num - 1`` trailing rows of each set are not assigned to any client.

     Args:
         num: Number of clients; must be a positive integer.

     Raises:
         ValueError: If ``num`` is smaller than 1.
     """
     import os

     # Fail before any file is written instead of dividing by zero later.
     if num < 1:
         raise ValueError("num must be a positive integer, got %r" % (num,))

     filepath = "../data/bank-additional-full.csv"
     df = pd.read_csv(filepath, sep=';')
     df['y'] = (df['y'] == 'yes').astype(int)
     data = encode(df)
     data = shuffle(data)
     train, test = train_test_split(data, test_size=0.2)

     # DataFrame.to_csv does not create missing parent directories.
     os.makedirs("data", exist_ok=True)
     train.to_csv("data/bank_train_.csv", index=False)
     test.to_csv("data/bank_test_.csv", index=False)

     train_per_client = len(train) // num
     test_per_client = len(test) // num

     print("train_per_client:", train_per_client)
     print("test_per_client:", test_per_client)
     for i in range(num):
         # .iloc makes the positional (not label-based) row slicing explicit.
         sub_train = train.iloc[i * train_per_client:(i + 1) * train_per_client]
         sub_test = test.iloc[i * test_per_client:(i + 1) * test_per_client]
         sub_train.to_csv("data/bank_train_" + str(i) + ".csv", index=False)
         sub_test.to_csv("data/bank_test_" + str(i) + ".csv", index=False)

 if __name__ == "__main__":
     # The first command-line argument is the number of client shards.
     num_clients = int(sys.argv[1])
     split(num_clients)
	#
	# Licensed to the Apache Software Foundation (ASF) under one
	# or more contributor license agreements. See the NOTICE file
	# distributed with this work for additional information
	# regarding copyright ownership. The ASF licenses this file
	# to you under the Apache License, Version 2.0 (the
	# "License"); you may not use this file except in compliance
	# with the License. You may obtain a copy of the License at
	#
	# http://www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing,
	# software distributed under the License is distributed on an
	# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
	# KIND, either express or implied. See the License for the
	# specific language governing permissions and limitations
	# under the License.
	#

	# https://github.com/zhengzangw/Fed-SINGA/blob/main/src/client/data/bank.py

	import pandas as pd
	import numpy as np
	import sys
	from pandas.api.types import is_numeric_dtype
	from sklearn.model_selection import train_test_split
	from sklearn.utils import shuffle


	def encode(df):
	    """One-hot encode every non-numeric column of ``df``.

	    Columns for which ``is_numeric_dtype`` is true are passed through
	    unchanged; every other column is expanded with ``pd.get_dummies`` using
	    the column name as prefix. Output columns follow the input column order.

	    Args:
	        df: Input ``pandas.DataFrame``.

	    Returns:
	        A new ``pandas.DataFrame``; an empty one when ``df`` has no columns.
	    """
	    pieces = []
	    for col in df.columns.values:
	        if is_numeric_dtype(df[col]):
	            pieces.append(df[col])
	        else:
	            pieces.append(pd.get_dummies(df[col], prefix=col))

	    if not pieces:
	        # pd.concat raises ValueError when given nothing to concatenate.
	        return pd.DataFrame()

	    # Concatenate once: growing a frame inside the loop re-copies all
	    # previously added columns on every iteration.
	    return pd.concat(pieces, axis=1)


	def load(device_id):
	    """Load the train/validation shards of one client as NumPy arrays.

	    Reads ``data/bank_train_<device_id>.csv`` and
	    ``data/bank_test_<device_id>.csv``, separates the ``y`` column from the
	    remaining columns, and rescales the features with ``normalize``.

	    Args:
	        device_id: Identifier inserted into the file names via ``str()``.

	    Returns:
	        Tuple ``(train_x, train_y, val_x, val_y, num_classes)``. Features
	        are handed to ``normalize`` as float32 arrays, labels are int32
	        arrays and ``num_classes`` is always 2.
	    """
	    fn_train = "data/bank_train_" + str(device_id) + ".csv"
	    fn_test = "data/bank_test_" + str(device_id) + ".csv"

	    train = pd.read_csv(fn_train, sep=',')
	    test = pd.read_csv(fn_test, sep=',')

	    # Everything except the 'y' column is a feature.
	    train_x = train.drop(['y'], axis=1)
	    train_y = train['y']
	    val_x = test.drop(['y'], axis=1)
	    val_y = test['y']

	    train_x = np.array(train_x, dtype=np.float32)
	    val_x = np.array(val_x, dtype=np.float32)
	    train_y = np.array(train_y, dtype=np.int32)
	    val_y = np.array(val_y, dtype=np.int32)

	    train_x, val_x = normalize(train_x, val_x)
	    num_classes = 2

	    return train_x, train_y, val_x, val_y, num_classes


	def normalize(X_train, X_test):
	    """Min-max scale both feature sets with a scaler fitted on ``X_train``.

	    Args:
	        X_train: Training features; the scaler is fitted on these.
	        X_test: Held-out features, transformed with the already fitted scaler.

	    Returns:
	        Tuple ``(X_train_scaled, X_test_scaled)``.
	    """
	    from sklearn.preprocessing import MinMaxScaler

	    scaler = MinMaxScaler()
	    # Fit on the training data only; the test data reuses that mapping.
	    X_train_scaled = scaler.fit_transform(X_train)
	    X_test_scaled = scaler.transform(X_test)
	    return X_train_scaled, X_test_scaled


	def split(num):
	    """Encode the raw bank CSV and shard it across ``num`` clients.

	    Reads ``../data/bank-additional-full.csv`` (``;``-separated), maps the
	    ``y`` column to 1 for ``'yes'`` and 0 otherwise, one-hot encodes the
	    frame with ``encode``, shuffles it and holds out 20% as the test set.
	    The full train/test sets are written to ``data/bank_train_.csv`` and
	    ``data/bank_test_.csv``; client ``i`` gets ``data/bank_train_<i>.csv``
	    and ``data/bank_test_<i>.csv``. Shards are equally sized, so up to
	    ``num - 1`` trailing rows of each set are not assigned to any client.

	    Args:
	        num: Number of clients; must be a positive integer.

	    Raises:
	        ValueError: If ``num`` is smaller than 1.
	    """
	    import os

	    # Fail before any file is written instead of dividing by zero later.
	    if num < 1:
	        raise ValueError("num must be a positive integer, got %r" % (num,))

	    filepath = "../data/bank-additional-full.csv"
	    df = pd.read_csv(filepath, sep=';')
	    df['y'] = (df['y'] == 'yes').astype(int)
	    data = encode(df)
	    data = shuffle(data)
	    train, test = train_test_split(data, test_size=0.2)

	    # DataFrame.to_csv does not create missing parent directories.
	    os.makedirs("data", exist_ok=True)
	    train.to_csv("data/bank_train_.csv", index=False)
	    test.to_csv("data/bank_test_.csv", index=False)

	    train_per_client = len(train) // num
	    test_per_client = len(test) // num

	    print("train_per_client:", train_per_client)
	    print("test_per_client:", test_per_client)
	    for i in range(num):
	        # .iloc makes the positional (not label-based) row slicing explicit.
	        sub_train = train.iloc[i * train_per_client:(i + 1) * train_per_client]
	        sub_test = test.iloc[i * test_per_client:(i + 1) * test_per_client]
	        sub_train.to_csv("data/bank_train_" + str(i) + ".csv", index=False)
	        sub_test.to_csv("data/bank_test_" + str(i) + ".csv", index=False)


	if __name__ == "__main__":
	    # The first command-line argument is the number of client shards.
	    split(int(sys.argv[1]))