Delete mlscorecard package
diff --git a/mlscorecard/models.py b/mlscorecard/models.py
deleted file mode 100644
index 0eef237..0000000
--- a/mlscorecard/models.py
+++ /dev/null
@@ -1,212 +0,0 @@
-#
-# Licensed to the Apache Software Foundation (ASF) under one or more
-# contributor license agreements. See the NOTICE file distributed with
-# this work for additional information regarding copyright ownership.
-# The ASF licenses this file to You under the Apache License, Version 2.0
-# (the "License"); you may not use this file except in compliance with
-# the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# =============================================================================
-"""
-Credit Risk Models
-
-The module contains model definitions of various tested models for credit
-assessment
-"""
-
-import numpy as np
-import pandas as pd
-
-from sklearn.preprocessing import LabelEncoder
-from sklearn.preprocessing import StandardScaler
-from sklearn.ensemble import RandomForestClassifier
-from sklearn.model_selection import train_test_split
-from sklearn.model_selection import GridSearchCV, ShuffleSplit
-from sklearn.metrics import confusion_matrix, accuracy_score, f1_score, recall_score, precision_score
-
-class Model(object):
- """
- Basic Scorecard Model
-
- Warning: This class should not be used directly. Use derived classes
- instead.
- """
-
- def __init__(self,
- classifier=None,
- test_size=2.0,
- n_splits=1,
- random_state=None,
- params=None):
-
- self.classifier = classifier
- self.params = params
- self.random_state = random_state
- self.test_size = test_size
- self.n_splits = n_splits
-
- self.model = GridSearchCV(estimator=classifier,
- param_grid=params,
- cv=ShuffleSplit(test_size=0.20,
- n_splits=n_splits,
- random_state=0))
-
- def __str__(self):
- return f"""
- Model Object
- ----------------------------------------------------------------
-
- Classifier: {self.classifier().__class__.__name__}
- Test Size: {self.test_size}
- Random State: {self.random_state}
- Number of Splits: {self.n_splits}
- Parameter Grid: {self.params}
-
- {self.model}
- """
-
- def train(self, x_train, y_train):
- """
- Train scorecard model
-
- Args:
- x_train:
- array of training parameters
- y_train:
- pandas dataframe with training labels
- """
-
- self.model = self.model.fit(x_train, y_train.values.ravel())
-
- def predict(self, data):
- """
- Predict scorecard model
-
- Args:
- data: array
- Data to perform prediction on.
- """
-
- return self.model.predict(data)
-
- def accuracy(self, x_test, y_test):
- """
- Compute scorecard model accuracy
-
- Args:
- x_test: array
- The test parameters.
- y_test: array
- The labels
- """
-
- y_pred = self.predict(x_test)
- return accuracy_score(y_test, y_pred, normalize=False)
-
- def metrics(self, x_test, y_test):
- """
- Comput scorecard model metrics
-
- Args:
- x_test: array
- The test parameters.
- y_test: array
- The labels
- """
-
- y_pred = self.predict(x_test)
- cm = confusion_matrix(y_pred, y_test)
- accuracy = accuracy_score(y_test, y_pred, normalize=True)
- f1 = f1_score(self.y_test, y_pred, average="macro")
- recall = recall_score(y_test, y_pred, average="macro")
- precision = precision_score(y_test, y_pred, average="macro")
- return {"accuracy" : accuracy,
- "f1_score" : f1,
- "recall_score" : recall,
- "precision_score": precision}
-
-
-class RandomForest(Model):
- """
- Model to predict credit risk using Random Forest Classifier
-
- Parameters
- ----------
- classifier: object, default: RandomForestClassifier
- sklearn classifier class.
-
- test_size: float, default: 0.2
- fraction of the dataset to use as test set.
-
- n_splits: int, default: 1
- number of splits.
-
- random_state: int, default: 0
- random state.
-
- params: dict: default: {'n_estimators' : [20, 30, 40], 'random_state' : [0]}
- model optimisation parameters
- """
-
- def __init__(self,
- test_size=2.0,
- n_splits=1,
- random_state=0,
- params={'n_estimators' : [20, 30, 40], 'random_state' : [0]}):
- self.classifier = RandomForestClassifier
- super(RandomForest, self).__init__(self.classifier,
- test_size,
- n_splits,
- random_state,
- params)
-
- def preprocessing(self, data):
- """
- Preprocess [German](https://raw.githubusercontent.com/humbletechy/Assign/master/datasets_9109_12699_german_credit_data.csv) dataset
-
- Parameters
- ----------
- data: DataFrame
- Pandas dataframe containing German dataset.
- """
-
- # Drop savings account and checkings account columns as they contain a lot
- # of NaN values and may not always be available in real life scenarios
- data = data.drop(columns = ['Saving accounts', 'Checking account'])
-
- cols = data.columns
- num_cols = data._get_numeric_data().columns
- categorical = list(set(cols) - set(num_cols))
-
- le = LabelEncoder()
- data = data.dropna()
- # Encode text columns to number values
- for category in categorical:
- data[category] = le.fit_transform(data[category])
-
- for col in data.columns:
- if(col not in categorical):
- data[col] = (data[col].astype('float') - np.mean(data[col].astype('float')))/np.std(data[col].astype('float'))
-
- # Get Training parameters
- target_col = data.columns[-1]
- x = data.drop(columns=target_col, axis=1)
- y = data[target_col].astype('int')
-
-
- x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = self.test_size)
- x_train = pd.DataFrame(x_train)
- y_train = pd.DataFrame(y_train)
-
- sc = StandardScaler()
- x_train = sc.fit_transform(x_train)
- x_test = sc.transform(x_test)
-
- return (x_train, x_test, y_train, y_test)