# blob: 40ebf452be120f8d61d6d3cd5c94cbc8aba970ff [file] [log] [blame]
# Licensed to Apache Software Foundation (ASF) under one or more contributor
# license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright
# ownership. Apache Software Foundation (ASF) licenses this file to you under
# the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
import pickle
import numpy as np
from autosklearn.classification import AutoSklearnClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OrdinalEncoder
from automl.metrics import eval_classification_metrics
from automl.mod.tool import BasePredictor, Tool
from automl.params import Params
class AutoSklearn(Tool):
    """AutoML tool backed by auto-sklearn.

    Training yields a two-step scikit-learn ``Pipeline``: an
    ``OrdinalEncoder`` (step 0) followed by a fitted
    ``AutoSklearnClassifier`` (step 1). ``eval`` relies on that step order.
    """

    # Presumably the file name the pickled pipeline is stored under.
    model_path = "autosklearn.pkl"

    # Conda environment specification shipped alongside the model.
    conda_env = {
        "dependencies": [
            "python=3.8.2",
            "pip",
            {
                "pip": [
                    "mlflow",
                    # NOTE(review): "8.0." looks like a truncated version
                    # pin -- confirm the intended click release.
                    "click==8.0.",
                    "scikit-learn==0.24.2",
                    "boto3==1.22.2",
                    "pandas>=1.0.0",
                    "setuptools<59.6.0",
                    "auto-sklearn==0.14.6",
                    "flaml==1.0.1",
                ],
            },
        ],
        "name": "MLflow-AutoML",
    }

    @staticmethod
    def train_automl(train_x, train_y, other_params=None, **kwargs):
        """Fit the ordinal encoder and an auto-sklearn classifier.

        Args:
            train_x: Training features. Must expose ``dtypes`` whose items
                have a ``name`` (presumably a pandas ``DataFrame``).
            train_y: Training targets, passed to the classifier's ``fit``.
            other_params: Forwarded to ``Params`` as ``param_str``.
            **kwargs: Forwarded to ``Params``.

        Returns:
            A ``Pipeline`` whose steps are the fitted encoder and the fitted
            ``AutoSklearnClassifier``, in that order.
        """
        params = Params(param_str=other_params, **kwargs)
        print(params)

        encoder = OrdinalEncoder(
            handle_unknown="use_encoded_value", unknown_value=np.nan
        )
        pipeline = Pipeline(steps=[("oridinal_encoder", encoder)])

        # Read the column kinds from the original input before ``train_x``
        # is replaced by the encoder output.
        feat_type = [
            "Categorical" if dtype.name in {"object", "category"} else "Numerical"
            for dtype in train_x.dtypes
        ]

        encoded_x = pipeline.fit_transform(train_x)
        classifier = AutoSklearnClassifier(**params.input_params)
        classifier.fit(encoded_x, train_y, feat_type=feat_type)

        # The classifier is appended only after it has been fitted, so the
        # ``fit_transform`` call above ran the encoder alone.
        pipeline.steps.append(("classifier", classifier))
        return pipeline

    @staticmethod
    def eval(pipeline: Pipeline, test_x, test_y, task="classification"):
        """Evaluate a pipeline produced by ``train_automl``.

        Args:
            pipeline: Pipeline with the encoder at step 0 and the model at
                step 1.
            test_x: Raw (un-encoded) test features.
            test_y: Ground-truth targets.
            task: ``"classification"`` uses ``eval_classification_metrics``;
                any other value is delegated to ``Tool.eval_automl``.

        Returns:
            Whatever the selected metrics function returns.
        """
        encoder = pipeline.steps[0][1]
        classifier = pipeline.steps[1][1]
        encoded_x = encoder.transform(test_x)

        if task == "classification":
            y_pred = classifier.predict(encoded_x)
            return eval_classification_metrics(test_y, y_pred)

        # Zero-argument ``super()`` cannot be used inside a staticmethod and
        # the previous code referenced an undefined ``automl`` name, so this
        # branch always raised. Delegate to the parent class explicitly.
        # NOTE(review): assumes ``Tool.eval_automl(model, x, y)`` exists with
        # this signature and accepts encoded features -- confirm.
        return Tool.eval_automl(classifier, encoded_x, test_y)

    @staticmethod
    def save_automl(classifier: AutoSklearnClassifier, save_path: str):
        """Pickle ``classifier`` to ``save_path``.

        Despite the parameter name and annotation, ``Predictor.load_automl``
        reads ``.steps`` from the unpickled object, so the value saved here
        should be the ``Pipeline`` returned by ``train_automl``.

        Args:
            classifier: Object to serialize.
            save_path: Destination file path; opened in binary write mode.
        """
        with open(save_path, "wb") as w_f:
            pickle.dump(classifier, w_f)
class Predictor(BasePredictor):
    """Serve predictions from a pickled encoder + auto-sklearn pipeline."""

    def load_automl(self, model_path):
        """Load the pickled pipeline and expose its two steps.

        Sets ``self.pipeline``, ``self.oridinal_encoder`` (step 0) and
        ``self.automl`` (step 1).

        Args:
            model_path: Path of the pickle file to read.
        """
        # NOTE: unpickling can execute arbitrary code; only load model files
        # that come from a trusted source.
        with open(model_path, "rb") as r_f:
            # The loaded object is indexed through ``.steps`` below, i.e. it
            # is the whole Pipeline rather than a bare classifier.
            self.pipeline: Pipeline = pickle.load(r_f)
        self.oridinal_encoder = self.pipeline.steps[0][1]
        self.automl = self.pipeline.steps[1][1]

    def predict(self, inputs):
        """Predict for ``inputs`` using the loaded model.

        Args:
            inputs: Raw (un-encoded) feature rows.

        Returns:
            For an ``AutoSklearnClassifier``, the list built by
            ``predict_classification``; otherwise the model's own
            ``predict`` output.
        """
        if isinstance(self.automl, AutoSklearnClassifier):
            return self.predict_classification(inputs)
        # NOTE(review): this path feeds raw inputs to the model without the
        # ordinal encoder, unlike the classification path -- confirm that is
        # intended for non-classifier models.
        return self.automl.predict(inputs)

    def predict_classification(self, inputs):
        """Return the most likely label and its probability for each row.

        Args:
            inputs: Raw feature rows; they are ordinal-encoded here.

        Returns:
            A list of ``{"label": ..., "confidence": float}`` dicts, one per
            input row.
        """
        encoded = self.oridinal_encoder.transform(inputs)
        # ``load_automl`` stores the model as ``self.automl``; the former
        # ``self.classifier`` attribute was never assigned in this class.
        pred_proba = self.automl.predict_proba(encoded)

        # Pick, per row, the highest-probability class index and its score.
        label_indexes = pred_proba.argmax(axis=1)
        probs = pred_proba[np.arange(pred_proba.shape[0]), label_indexes]

        # Map the class indexes back to the original target labels.
        target_validator = self.automl.automl_.InputValidator.target_validator
        labels = target_validator.inverse_transform(label_indexes)

        return [
            {"label": label, "confidence": float(prob)}
            for label, prob in zip(labels, probs)
        ]