mlscorecard/models.py - fineract-credit-scorecard - Git at Google

 #
 # Licensed to the Apache Software Foundation (ASF) under one or more
 # contributor license agreements.  See the NOTICE file distributed with
 # this work for additional information regarding copyright ownership.
 # The ASF licenses this file to You under the Apache License, Version 2.0
 # (the "License"); you may not use this file except in compliance with
 # the License.  You may obtain a copy of the License at
 #
 #    http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # =============================================================================
 """
 Credit Risk Models

 The module contains model definitions of various tested models for credit
 assessment
 """

 import numpy as np
 import pandas as pd

 from sklearn.preprocessing import LabelEncoder
 from sklearn.preprocessing import StandardScaler
 from sklearn.ensemble import RandomForestClassifier
 from sklearn.model_selection import train_test_split
 from sklearn.model_selection import GridSearchCV, ShuffleSplit
 from sklearn.metrics import confusion_matrix, accuracy_score, f1_score, recall_score, precision_score

 class Model(object):
     """
     Basic Scorecard Model

     Wraps a scikit-learn classifier in a ``GridSearchCV`` hyper-parameter
     search and exposes train / predict / evaluate helpers.

     Warning: This class should not be used directly. Use derived classes
     instead.

     Parameters
     ----------
     classifier: class or estimator instance, default: None
         sklearn classifier. A class is instantiated with its default
         arguments before being handed to the grid search.

     test_size: float, default: 0.2
         fraction of the dataset to use as test set. Stored on the
         instance; this class does not otherwise use it.

     n_splits: int, default: 1
         number of shuffle-split iterations used for cross validation.

     random_state: int, default: None
         random state. Stored on the instance; the cross-validation
         splitter built here always uses a fixed seed of 0.

     params: dict, default: None
         parameter grid explored by the grid search.
     """

     def __init__(self,
                  classifier=None,
                  test_size=0.2,
                  n_splits=1,
                  random_state=None,
                  params=None):

         self.classifier = classifier
         self.params = params
         self.random_state = random_state
         self.test_size = test_size
         self.n_splits = n_splits

         # GridSearchCV clones its estimator, which only works on estimator
         # *instances*; build one when a class was supplied.
         if isinstance(classifier, type):
             estimator = classifier()
         else:
             estimator = classifier

         # The cross-validation split is fixed (20% held out, seed 0)
         # independently of ``test_size`` / ``random_state``.
         self.model = GridSearchCV(estimator=estimator,
                                   param_grid=params,
                                   cv=ShuffleSplit(test_size=0.20,
                                                   n_splits=n_splits,
                                                   random_state=0))

     def _classifier_name(self):
         """Return a printable name for the configured classifier."""
         clf = self.classifier
         if clf is None:
             return "None"
         if isinstance(clf, type):
             return clf.__name__
         return type(clf).__name__

     def __str__(self):
         return f"""
         Model Object
         ----------------------------------------------------------------

         Classifier: {self._classifier_name()}
         Test Size: {self.test_size}
         Random State: {self.random_state}
         Number of Splits: {self.n_splits}
         Parameter Grid: {self.params}

         {self.model}
         """

     def train(self, x_train, y_train):
         """
         Train scorecard model

         Runs the grid search on the training data and keeps the fitted
         search object in ``self.model``.

         Args:
             x_train:
                 array of training parameters
             y_train:
                 training labels (pandas dataframe / series or any
                 array-like); flattened to one dimension before fitting
         """

         # Flatten a single-column frame or array into a 1-D label vector.
         labels = np.asarray(y_train).ravel()
         self.model = self.model.fit(x_train, labels)

     def predict(self, data):
         """
         Predict scorecard model

         Args:
             data: array
                 Data to perform prediction on.

         Returns:
             Whatever the underlying grid search's ``predict`` returns
             for ``data``.
         """

         return self.model.predict(data)

     def accuracy(self, x_test, y_test):
         """
         Compute scorecard model accuracy

         Args:
             x_test: array
                 The test parameters.
             y_test: array
                 The labels

         Returns:
             The number of correctly classified samples. This is a count,
             not a fraction, because ``normalize=False`` is passed to
             ``accuracy_score``; use ``metrics`` for the fraction.
         """

         y_pred = self.predict(x_test)
         return accuracy_score(y_test, y_pred, normalize=False)

     def metrics(self, x_test, y_test):
         """
         Compute scorecard model metrics

         Args:
             x_test: array
                 The test parameters.
             y_test: array
                 The labels

         Returns:
             dict with the keys ``accuracy`` (fraction of correct
             predictions), ``f1_score``, ``recall_score`` and
             ``precision_score`` (all three macro-averaged).
         """

         y_pred = self.predict(x_test)
         return {"accuracy" : accuracy_score(y_test, y_pred, normalize=True),
                 "f1_score" : f1_score(y_test, y_pred, average="macro"),
                 "recall_score" : recall_score(y_test, y_pred, average="macro"),
                 "precision_score": precision_score(y_test, y_pred, average="macro")}


 class RandomForest(Model):
     """
     Model to predict credit risk using Random Forest Classifier

     Parameters
     ----------
     test_size: float, default: 0.2
         fraction of the dataset to use as test set.

     n_splits: int, default: 1
         number of splits.

     random_state: int, default: 0
         random state, used to seed the train / test split.

     params: dict, default: None
         model optimisation parameters. When omitted, the grid
         {'n_estimators' : [20, 30, 40], 'random_state' : [0]} is used.

     Attributes
     ----------
     classifier: object
         sklearn classifier class, always RandomForestClassifier.
     """

     def __init__(self,
                  test_size=0.2,
                  n_splits=1,
                  random_state=0,
                  params=None):
         if params is None:
             # Build the default grid per instance so that instances never
             # share (and cannot accidentally mutate) one dict object.
             params = {'n_estimators' : [20, 30, 40], 'random_state' : [0]}

         self.classifier = RandomForestClassifier
         super(RandomForest, self).__init__(self.classifier,
                                            test_size,
                                            n_splits,
                                            random_state,
                                            params)

     def preprocessing(self, data):
         """
         Preprocess [German](https://raw.githubusercontent.com/humbletechy/Assign/master/datasets_9109_12699_german_credit_data.csv) dataset

         Drops the account columns and every row with missing values,
         label-encodes the text columns, splits the rows into train and
         test sets and standardises the features. The last column of the
         frame is used as the target.

         Parameters
         ----------
         data: DataFrame
             Pandas dataframe containing German dataset.

         Returns
         -------
         tuple
             ``(x_train, x_test, y_train, y_test)``. The feature sets are
             the standardised outputs of ``StandardScaler``; ``y_train``
             is a single-column DataFrame and ``y_test`` a Series of
             integer labels.
         """

         # Drop savings account and checkings account columns as they contain a lot
         # of NaN values and may not always be available in real life scenarios
         # (hence errors='ignore': a frame without them is accepted as well).
         data = data.drop(columns=['Saving accounts', 'Checking account'],
                          errors='ignore')

         # Every column that pandas does not consider numeric is treated as
         # categorical text.
         numeric = set(data._get_numeric_data().columns)
         categorical = [col for col in data.columns if col not in numeric]

         # Explicit copy so the column assignments below operate on a frame
         # owned by this method.
         data = data.dropna().copy()

         # Encode text columns to number values
         le = LabelEncoder()
         for category in categorical:
             data[category] = le.fit_transform(data[category])

         # Get Training parameters
         target_col = data.columns[-1]
         x = data.drop(columns=[target_col])
         y = data[target_col].astype('int')

         x_train, x_test, y_train, y_test = train_test_split(
             x, y,
             test_size=self.test_size,
             random_state=self.random_state)
         y_train = pd.DataFrame(y_train)

         # Features are standardised exactly once, with statistics learnt
         # from the training split only. No separate z-scoring of the whole
         # frame is done beforehand: for the features this scaler would
         # cancel it out, and it would distort a numeric target column
         # before the integer cast above.
         sc = StandardScaler()
         x_train = sc.fit_transform(x_train)
         x_test = sc.transform(x_test)

         return (x_train, x_test, y_train, y_test)
	#
	# Licensed to the Apache Software Foundation (ASF) under one or more
	# contributor license agreements. See the NOTICE file distributed with
	# this work for additional information regarding copyright ownership.
	# The ASF licenses this file to You under the Apache License, Version 2.0
	# (the "License"); you may not use this file except in compliance with
	# the License. You may obtain a copy of the License at
	#
	# http://www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing, software
	# distributed under the License is distributed on an "AS IS" BASIS,
	# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	# See the License for the specific language governing permissions and
	# limitations under the License.
	# =============================================================================
	"""
	Credit Risk Models

	The module contains model definitions of various tested models for credit
	assessment
	"""

	import numpy as np
	import pandas as pd

	from sklearn.preprocessing import LabelEncoder
	from sklearn.preprocessing import StandardScaler
	from sklearn.ensemble import RandomForestClassifier
	from sklearn.model_selection import train_test_split
	from sklearn.model_selection import GridSearchCV, ShuffleSplit
	from sklearn.metrics import confusion_matrix, accuracy_score, f1_score, recall_score, precision_score

	class Model(object):
	    """
	    Basic Scorecard Model

	    Wraps a scikit-learn classifier in a ``GridSearchCV`` hyper-parameter
	    search and exposes train / predict / evaluate helpers.

	    Warning: This class should not be used directly. Use derived classes
	    instead.

	    Parameters
	    ----------
	    classifier: class or estimator instance, default: None
	        sklearn classifier. A class is instantiated with its default
	        arguments before being handed to the grid search.

	    test_size: float, default: 0.2
	        fraction of the dataset to use as test set. Stored on the
	        instance; this class does not otherwise use it.

	    n_splits: int, default: 1
	        number of shuffle-split iterations used for cross validation.

	    random_state: int, default: None
	        random state. Stored on the instance; the cross-validation
	        splitter built here always uses a fixed seed of 0.

	    params: dict, default: None
	        parameter grid explored by the grid search.
	    """

	    def __init__(self,
	                 classifier=None,
	                 test_size=0.2,
	                 n_splits=1,
	                 random_state=None,
	                 params=None):

	        self.classifier = classifier
	        self.params = params
	        self.random_state = random_state
	        self.test_size = test_size
	        self.n_splits = n_splits

	        # GridSearchCV clones its estimator, which only works on estimator
	        # *instances*; build one when a class was supplied.
	        if isinstance(classifier, type):
	            estimator = classifier()
	        else:
	            estimator = classifier

	        # The cross-validation split is fixed (20% held out, seed 0)
	        # independently of ``test_size`` / ``random_state``.
	        self.model = GridSearchCV(estimator=estimator,
	                                  param_grid=params,
	                                  cv=ShuffleSplit(test_size=0.20,
	                                                  n_splits=n_splits,
	                                                  random_state=0))

	    def _classifier_name(self):
	        """Return a printable name for the configured classifier."""
	        clf = self.classifier
	        if clf is None:
	            return "None"
	        if isinstance(clf, type):
	            return clf.__name__
	        return type(clf).__name__

	    def __str__(self):
	        return f"""
	        Model Object
	        ----------------------------------------------------------------

	        Classifier: {self._classifier_name()}
	        Test Size: {self.test_size}
	        Random State: {self.random_state}
	        Number of Splits: {self.n_splits}
	        Parameter Grid: {self.params}

	        {self.model}
	        """

	    def train(self, x_train, y_train):
	        """
	        Train scorecard model

	        Runs the grid search on the training data and keeps the fitted
	        search object in ``self.model``.

	        Args:
	            x_train:
	                array of training parameters
	            y_train:
	                training labels (pandas dataframe / series or any
	                array-like); flattened to one dimension before fitting
	        """

	        # Flatten a single-column frame or array into a 1-D label vector.
	        labels = np.asarray(y_train).ravel()
	        self.model = self.model.fit(x_train, labels)

	    def predict(self, data):
	        """
	        Predict scorecard model

	        Args:
	            data: array
	                Data to perform prediction on.

	        Returns:
	            Whatever the underlying grid search's ``predict`` returns
	            for ``data``.
	        """

	        return self.model.predict(data)

	    def accuracy(self, x_test, y_test):
	        """
	        Compute scorecard model accuracy

	        Args:
	            x_test: array
	                The test parameters.
	            y_test: array
	                The labels

	        Returns:
	            The number of correctly classified samples. This is a count,
	            not a fraction, because ``normalize=False`` is passed to
	            ``accuracy_score``; use ``metrics`` for the fraction.
	        """

	        y_pred = self.predict(x_test)
	        return accuracy_score(y_test, y_pred, normalize=False)

	    def metrics(self, x_test, y_test):
	        """
	        Compute scorecard model metrics

	        Args:
	            x_test: array
	                The test parameters.
	            y_test: array
	                The labels

	        Returns:
	            dict with the keys ``accuracy`` (fraction of correct
	            predictions), ``f1_score``, ``recall_score`` and
	            ``precision_score`` (all three macro-averaged).
	        """

	        y_pred = self.predict(x_test)
	        return {"accuracy" : accuracy_score(y_test, y_pred, normalize=True),
	                "f1_score" : f1_score(y_test, y_pred, average="macro"),
	                "recall_score" : recall_score(y_test, y_pred, average="macro"),
	                "precision_score": precision_score(y_test, y_pred, average="macro")}


	class RandomForest(Model):
	    """
	    Model to predict credit risk using Random Forest Classifier

	    Parameters
	    ----------
	    test_size: float, default: 0.2
	        fraction of the dataset to use as test set.

	    n_splits: int, default: 1
	        number of splits.

	    random_state: int, default: 0
	        random state, used to seed the train / test split.

	    params: dict, default: None
	        model optimisation parameters. When omitted, the grid
	        {'n_estimators' : [20, 30, 40], 'random_state' : [0]} is used.

	    Attributes
	    ----------
	    classifier: object
	        sklearn classifier class, always RandomForestClassifier.
	    """

	    def __init__(self,
	                 test_size=0.2,
	                 n_splits=1,
	                 random_state=0,
	                 params=None):
	        if params is None:
	            # Build the default grid per instance so that instances never
	            # share (and cannot accidentally mutate) one dict object.
	            params = {'n_estimators' : [20, 30, 40], 'random_state' : [0]}

	        self.classifier = RandomForestClassifier
	        super(RandomForest, self).__init__(self.classifier,
	                                           test_size,
	                                           n_splits,
	                                           random_state,
	                                           params)

	    def preprocessing(self, data):
	        """
	        Preprocess [German](https://raw.githubusercontent.com/humbletechy/Assign/master/datasets_9109_12699_german_credit_data.csv) dataset

	        Drops the account columns and every row with missing values,
	        label-encodes the text columns, splits the rows into train and
	        test sets and standardises the features. The last column of the
	        frame is used as the target.

	        Parameters
	        ----------
	        data: DataFrame
	            Pandas dataframe containing German dataset.

	        Returns
	        -------
	        tuple
	            ``(x_train, x_test, y_train, y_test)``. The feature sets are
	            the standardised outputs of ``StandardScaler``; ``y_train``
	            is a single-column DataFrame and ``y_test`` a Series of
	            integer labels.
	        """

	        # Drop savings account and checkings account columns as they contain a lot
	        # of NaN values and may not always be available in real life scenarios
	        # (hence errors='ignore': a frame without them is accepted as well).
	        data = data.drop(columns=['Saving accounts', 'Checking account'],
	                         errors='ignore')

	        # Every column that pandas does not consider numeric is treated as
	        # categorical text.
	        numeric = set(data._get_numeric_data().columns)
	        categorical = [col for col in data.columns if col not in numeric]

	        # Explicit copy so the column assignments below operate on a frame
	        # owned by this method.
	        data = data.dropna().copy()

	        # Encode text columns to number values
	        le = LabelEncoder()
	        for category in categorical:
	            data[category] = le.fit_transform(data[category])

	        # Get Training parameters
	        target_col = data.columns[-1]
	        x = data.drop(columns=[target_col])
	        y = data[target_col].astype('int')

	        x_train, x_test, y_train, y_test = train_test_split(
	            x, y,
	            test_size=self.test_size,
	            random_state=self.random_state)
	        y_train = pd.DataFrame(y_train)

	        # Features are standardised exactly once, with statistics learnt
	        # from the training split only. No separate z-scoring of the whole
	        # frame is done beforehand: for the features this scaler would
	        # cancel it out, and it would distort a numeric target column
	        # before the integer cast above.
	        sc = StandardScaler()
	        x_train = sc.fit_transform(x_train)
	        x_test = sc.transform(x_test)

	        return (x_train, x_test, y_train, y_test)