blob: 0eef2377bad3dcbfa0f74e4712443853b4b3679b [file] [log] [blame]
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# =============================================================================
"""
Credit Risk Models
The module contains model definitions of various tested models for credit
assessment
"""
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV, ShuffleSplit
from sklearn.metrics import confusion_matrix, accuracy_score, f1_score, recall_score, precision_score
class Model(object):
"""
Basic Scorecard Model
Warning: This class should not be used directly. Use derived classes
instead.
"""
def __init__(self,
classifier=None,
test_size=2.0,
n_splits=1,
random_state=None,
params=None):
self.classifier = classifier
self.params = params
self.random_state = random_state
self.test_size = test_size
self.n_splits = n_splits
self.model = GridSearchCV(estimator=classifier,
param_grid=params,
cv=ShuffleSplit(test_size=0.20,
n_splits=n_splits,
random_state=0))
def __str__(self):
return f"""
Model Object
----------------------------------------------------------------
Classifier: {self.classifier().__class__.__name__}
Test Size: {self.test_size}
Random State: {self.random_state}
Number of Splits: {self.n_splits}
Parameter Grid: {self.params}
{self.model}
"""
def train(self, x_train, y_train):
"""
Train scorecard model
Args:
x_train:
array of training parameters
y_train:
pandas dataframe with training labels
"""
self.model = self.model.fit(x_train, y_train.values.ravel())
def predict(self, data):
"""
Predict scorecard model
Args:
data: array
Data to perform prediction on.
"""
return self.model.predict(data)
def accuracy(self, x_test, y_test):
"""
Compute scorecard model accuracy
Args:
x_test: array
The test parameters.
y_test: array
The labels
"""
y_pred = self.predict(x_test)
return accuracy_score(y_test, y_pred, normalize=False)
def metrics(self, x_test, y_test):
"""
Comput scorecard model metrics
Args:
x_test: array
The test parameters.
y_test: array
The labels
"""
y_pred = self.predict(x_test)
cm = confusion_matrix(y_pred, y_test)
accuracy = accuracy_score(y_test, y_pred, normalize=True)
f1 = f1_score(self.y_test, y_pred, average="macro")
recall = recall_score(y_test, y_pred, average="macro")
precision = precision_score(y_test, y_pred, average="macro")
return {"accuracy" : accuracy,
"f1_score" : f1,
"recall_score" : recall,
"precision_score": precision}
class RandomForest(Model):
"""
Model to predict credit risk using Random Forest Classifier
Parameters
----------
classifier: object, default: RandomForestClassifier
sklearn classifier class.
test_size: float, default: 0.2
fraction of the dataset to use as test set.
n_splits: int, default: 1
number of splits.
random_state: int, default: 0
random state.
params: dict: default: {'n_estimators' : [20, 30, 40], 'random_state' : [0]}
model optimisation parameters
"""
def __init__(self,
test_size=2.0,
n_splits=1,
random_state=0,
params={'n_estimators' : [20, 30, 40], 'random_state' : [0]}):
self.classifier = RandomForestClassifier
super(RandomForest, self).__init__(self.classifier,
test_size,
n_splits,
random_state,
params)
def preprocessing(self, data):
"""
Preprocess [German](https://raw.githubusercontent.com/humbletechy/Assign/master/datasets_9109_12699_german_credit_data.csv) dataset
Parameters
----------
data: DataFrame
Pandas dataframe containing German dataset.
"""
# Drop savings account and checkings account columns as they contain a lot
# of NaN values and may not always be available in real life scenarios
data = data.drop(columns = ['Saving accounts', 'Checking account'])
cols = data.columns
num_cols = data._get_numeric_data().columns
categorical = list(set(cols) - set(num_cols))
le = LabelEncoder()
data = data.dropna()
# Encode text columns to number values
for category in categorical:
data[category] = le.fit_transform(data[category])
for col in data.columns:
if(col not in categorical):
data[col] = (data[col].astype('float') - np.mean(data[col].astype('float')))/np.std(data[col].astype('float'))
# Get Training parameters
target_col = data.columns[-1]
x = data.drop(columns=target_col, axis=1)
y = data[target_col].astype('int')
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = self.test_size)
x_train = pd.DataFrame(x_train)
y_train = pd.DataFrame(y_train)
sc = StandardScaler()
x_train = sc.fit_transform(x_train)
x_test = sc.transform(x_test)
return (x_train, x_test, y_train, y_test)