Project-AutoML/automl/mod/mod_autosklearn.py - dolphinscheduler-mlflow - Git at Google

 # Licensed to Apache Software Foundation (ASF) under one or more contributor
 # license agreements. See the NOTICE file distributed with
 # this work for additional information regarding copyright
 # ownership. Apache Software Foundation (ASF) licenses this file to you under
 # the Apache License, Version 2.0 (the "License"); you may
 # not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing,
 # software distributed under the License is distributed on an
 # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 # KIND, either express or implied.  See the License for the
 # specific language governing permissions and limitations
 # under the License.

 import pickle

 import numpy as np
 from autosklearn.classification import AutoSklearnClassifier
 from sklearn.pipeline import Pipeline
 from sklearn.preprocessing import OrdinalEncoder

 from automl.metrics import eval_classification_metrics
 from automl.mod.tool import BasePredictor, Tool
 from automl.params import Params


 class AutoSklearn(Tool):
     """Train, evaluate and persist an auto-sklearn classification pipeline.

     ``train_automl`` returns a scikit-learn ``Pipeline`` whose first step is
     an ``OrdinalEncoder`` and whose second step is the fitted
     ``AutoSklearnClassifier``; ``eval`` relies on that step order.
     """

     # NOTE(review): presumably the file name the pickled pipeline is stored
     # under; it is not referenced inside this class.
     model_path = "autosklearn.pkl"

     # Conda environment description listing the runtime dependencies.
     # NOTE(review): "click==8.0." ends with a dot and is not a complete
     # version pin -- confirm the intended click version.
     conda_env = {
         "dependencies": [
             "python=3.8.2",
             "pip",
             {
                 "pip": [
                     "mlflow",
                     "click==8.0.",
                     "scikit-learn==0.24.2",
                     "boto3==1.22.2",
                     "pandas>=1.0.0",
                     "setuptools<59.6.0",
                     "auto-sklearn==0.14.6",
                     "flaml==1.0.1",
                 ],
             },
         ],
         "name": "MLflow-AutoML",
     }

     @staticmethod
     def train_automl(train_x, train_y, other_params=None, **kwargs):
         """Fit the ordinal encoder and an ``AutoSklearnClassifier``.

         Args:
             train_x: Training features; must expose ``dtypes`` whose entries
                 have a ``name`` (assumed to be a pandas DataFrame -- TODO
                 confirm).
             train_y: Training targets handed to ``AutoSklearnClassifier.fit``.
             other_params: Forwarded to ``Params`` as ``param_str``.
             **kwargs: Extra keyword arguments forwarded to ``Params``.

         Returns:
             Pipeline: steps ``"oridinal_encoder"`` and ``"classifier"``, both
             already fitted.
         """
         params = Params(param_str=other_params, **kwargs)
         print(params)

         encoder = OrdinalEncoder(
             unknown_value=np.nan, handle_unknown="use_encoded_value"
         )
         pipeline = Pipeline(steps=[("oridinal_encoder", encoder)])

         # Read the column kinds from the original dtypes, before train_x is
         # replaced by the encoder output below.
         feat_type = [
             "Categorical" if dtype.name in {"object", "category"} else "Numerical"
             for dtype in train_x.dtypes
         ]
         train_x = pipeline.fit_transform(train_x)

         classifier = AutoSklearnClassifier(**params.input_params)
         classifier.fit(train_x, train_y, feat_type=feat_type)

         # The classifier is fitted on its own (it needs feat_type) and is
         # attached to the pipeline afterwards.
         pipeline.steps.append(("classifier", classifier))
         return pipeline

     @staticmethod
     def eval(pipeline: Pipeline, test_x, test_y, task="classification"):
         """Compute evaluation metrics for a pipeline built by ``train_automl``.

         Args:
             pipeline: Pipeline whose step 0 is the encoder and step 1 the
                 fitted model.
             test_x: Raw (un-encoded) test features.
             test_y: Ground-truth targets.
             task: ``"classification"`` uses ``eval_classification_metrics``;
                 any other value delegates to ``Tool.eval_automl``.

         Returns:
             Whatever the selected metrics function returns.
         """
         oridinal_encoder = pipeline.steps[0][1]
         classifier = pipeline.steps[1][1]
         test_x = oridinal_encoder.transform(test_x)

         if task == "classification":
             y_pred = classifier.predict(test_x)
             return eval_classification_metrics(test_y, y_pred)

         # Zero-argument super() cannot be used inside a staticmethod and the
         # original referenced an undefined name ``automl``; call the base
         # implementation explicitly with the fitted model instead.
         # NOTE(review): assumes Tool.eval_automl(automl, test_x, test_y)
         # exists with this signature, as the original call implied.
         return Tool.eval_automl(classifier, test_x, test_y)

     @staticmethod
     def save_automl(classifier: AutoSklearnClassifier, save_path: str):
         """Pickle ``classifier`` to ``save_path``.

         NOTE(review): ``Predictor.load_automl`` reads ``.steps`` from the
         unpickled object, so callers presumably pass the whole pipeline here
         despite the parameter name and annotation -- confirm against callers.
         """
         with open(save_path, "wb") as w_f:
             pickle.dump(classifier, w_f)


 class Predictor(BasePredictor):
     """Serve predictions from a pickled encoder + auto-sklearn pipeline."""

     def load_automl(self, model_path):
         """Load the pickled pipeline and expose its two steps.

         Sets ``self.pipeline``, ``self.oridinal_encoder`` (step 0) and
         ``self.automl`` (step 1).

         Args:
             model_path: Path of the pickle file to read.
         """
         # NOTE: pickle.load can execute arbitrary code contained in the file;
         # only load model files that come from a trusted location.
         with open(model_path, "rb") as r_f:
             self.pipeline: Pipeline = pickle.load(r_f)
         self.oridinal_encoder = self.pipeline.steps[0][1]
         self.automl = self.pipeline.steps[1][1]

     def predict(self, inputs):
         """Predict for ``inputs`` using the loaded model.

         Returns:
             For an ``AutoSklearnClassifier`` the list built by
             ``predict_classification``; otherwise the raw result of
             ``self.automl.predict``.
         """
         if isinstance(self.automl, AutoSklearnClassifier):
             return self.predict_classification(inputs)
         # NOTE(review): this fallback feeds the inputs to the model without
         # applying the ordinal encoder -- confirm that is intended.
         return self.automl.predict(inputs)

     def predict_classification(self, inputs):
         """Return the most probable label and its probability per input row.

         Returns:
             list[dict]: one ``{"label": ..., "confidence": float}`` entry per
             row of ``inputs``.
         """
         encoded = self.oridinal_encoder.transform(inputs)

         # load_automl stores the model as ``self.automl``; the original read
         # ``self.classifier``, which is never assigned in this class.
         pred_proba = self.automl.predict_proba(encoded)
         label_indexes = pred_proba.argmax(axis=1)
         # Pick, for every row, the probability of its arg-max class.
         probs = pred_proba[np.arange(pred_proba.shape[0]), label_indexes]
         # Map class indexes back to the original target labels.
         target_validator = self.automl.automl_.InputValidator.target_validator
         labels = target_validator.inverse_transform(label_indexes)
         return [
             {"label": label, "confidence": float(pro)}
             for label, pro in zip(labels, probs)
         ]
	# Licensed to Apache Software Foundation (ASF) under one or more contributor
	# license agreements. See the NOTICE file distributed with
	# this work for additional information regarding copyright
	# ownership. Apache Software Foundation (ASF) licenses this file to you under
	# the Apache License, Version 2.0 (the "License"); you may
	# not use this file except in compliance with the License.
	# You may obtain a copy of the License at
	#
	# http://www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing,
	# software distributed under the License is distributed on an
	# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
	# KIND, either express or implied. See the License for the
	# specific language governing permissions and limitations
	# under the License.

	import pickle

	import numpy as np
	from autosklearn.classification import AutoSklearnClassifier
	from sklearn.pipeline import Pipeline
	from sklearn.preprocessing import OrdinalEncoder

	from automl.metrics import eval_classification_metrics
	from automl.mod.tool import BasePredictor, Tool
	from automl.params import Params


	class AutoSklearn(Tool):
	    """Train, evaluate and persist an auto-sklearn classification pipeline.

	    ``train_automl`` returns a scikit-learn ``Pipeline`` whose first step is
	    an ``OrdinalEncoder`` and whose second step is the fitted
	    ``AutoSklearnClassifier``; ``eval`` relies on that step order.
	    """

	    # NOTE(review): presumably the file name the pickled pipeline is stored
	    # under; it is not referenced inside this class.
	    model_path = "autosklearn.pkl"

	    # Conda environment description listing the runtime dependencies.
	    # NOTE(review): "click==8.0." ends with a dot and is not a complete
	    # version pin -- confirm the intended click version.
	    conda_env = {
	        "dependencies": [
	            "python=3.8.2",
	            "pip",
	            {
	                "pip": [
	                    "mlflow",
	                    "click==8.0.",
	                    "scikit-learn==0.24.2",
	                    "boto3==1.22.2",
	                    "pandas>=1.0.0",
	                    "setuptools<59.6.0",
	                    "auto-sklearn==0.14.6",
	                    "flaml==1.0.1",
	                ],
	            },
	        ],
	        "name": "MLflow-AutoML",
	    }

	    @staticmethod
	    def train_automl(train_x, train_y, other_params=None, **kwargs):
	        """Fit the ordinal encoder and an ``AutoSklearnClassifier``.

	        Args:
	            train_x: Training features; must expose ``dtypes`` whose entries
	                have a ``name`` (assumed to be a pandas DataFrame -- TODO
	                confirm).
	            train_y: Training targets handed to ``AutoSklearnClassifier.fit``.
	            other_params: Forwarded to ``Params`` as ``param_str``.
	            **kwargs: Extra keyword arguments forwarded to ``Params``.

	        Returns:
	            Pipeline: steps ``"oridinal_encoder"`` and ``"classifier"``, both
	            already fitted.
	        """
	        params = Params(param_str=other_params, **kwargs)
	        print(params)

	        encoder = OrdinalEncoder(
	            unknown_value=np.nan, handle_unknown="use_encoded_value"
	        )
	        pipeline = Pipeline(steps=[("oridinal_encoder", encoder)])

	        # Read the column kinds from the original dtypes, before train_x is
	        # replaced by the encoder output below.
	        feat_type = [
	            "Categorical" if dtype.name in {"object", "category"} else "Numerical"
	            for dtype in train_x.dtypes
	        ]
	        train_x = pipeline.fit_transform(train_x)

	        classifier = AutoSklearnClassifier(**params.input_params)
	        classifier.fit(train_x, train_y, feat_type=feat_type)

	        # The classifier is fitted on its own (it needs feat_type) and is
	        # attached to the pipeline afterwards.
	        pipeline.steps.append(("classifier", classifier))
	        return pipeline

	    @staticmethod
	    def eval(pipeline: Pipeline, test_x, test_y, task="classification"):
	        """Compute evaluation metrics for a pipeline built by ``train_automl``.

	        Args:
	            pipeline: Pipeline whose step 0 is the encoder and step 1 the
	                fitted model.
	            test_x: Raw (un-encoded) test features.
	            test_y: Ground-truth targets.
	            task: ``"classification"`` uses ``eval_classification_metrics``;
	                any other value delegates to ``Tool.eval_automl``.

	        Returns:
	            Whatever the selected metrics function returns.
	        """
	        oridinal_encoder = pipeline.steps[0][1]
	        classifier = pipeline.steps[1][1]
	        test_x = oridinal_encoder.transform(test_x)

	        if task == "classification":
	            y_pred = classifier.predict(test_x)
	            return eval_classification_metrics(test_y, y_pred)

	        # Zero-argument super() cannot be used inside a staticmethod and the
	        # original referenced an undefined name ``automl``; call the base
	        # implementation explicitly with the fitted model instead.
	        # NOTE(review): assumes Tool.eval_automl(automl, test_x, test_y)
	        # exists with this signature, as the original call implied.
	        return Tool.eval_automl(classifier, test_x, test_y)

	    @staticmethod
	    def save_automl(classifier: AutoSklearnClassifier, save_path: str):
	        """Pickle ``classifier`` to ``save_path``.

	        NOTE(review): ``Predictor.load_automl`` reads ``.steps`` from the
	        unpickled object, so callers presumably pass the whole pipeline here
	        despite the parameter name and annotation -- confirm against callers.
	        """
	        with open(save_path, "wb") as w_f:
	            pickle.dump(classifier, w_f)


	class Predictor(BasePredictor):
	    """Serve predictions from a pickled encoder + auto-sklearn pipeline."""

	    def load_automl(self, model_path):
	        """Load the pickled pipeline and expose its two steps.

	        Sets ``self.pipeline``, ``self.oridinal_encoder`` (step 0) and
	        ``self.automl`` (step 1).

	        Args:
	            model_path: Path of the pickle file to read.
	        """
	        # NOTE: pickle.load can execute arbitrary code contained in the file;
	        # only load model files that come from a trusted location.
	        with open(model_path, "rb") as r_f:
	            self.pipeline: Pipeline = pickle.load(r_f)
	        self.oridinal_encoder = self.pipeline.steps[0][1]
	        self.automl = self.pipeline.steps[1][1]

	    def predict(self, inputs):
	        """Predict for ``inputs`` using the loaded model.

	        Returns:
	            For an ``AutoSklearnClassifier`` the list built by
	            ``predict_classification``; otherwise the raw result of
	            ``self.automl.predict``.
	        """
	        if isinstance(self.automl, AutoSklearnClassifier):
	            return self.predict_classification(inputs)
	        # NOTE(review): this fallback feeds the inputs to the model without
	        # applying the ordinal encoder -- confirm that is intended.
	        return self.automl.predict(inputs)

	    def predict_classification(self, inputs):
	        """Return the most probable label and its probability per input row.

	        Returns:
	            list[dict]: one ``{"label": ..., "confidence": float}`` entry per
	            row of ``inputs``.
	        """
	        encoded = self.oridinal_encoder.transform(inputs)

	        # load_automl stores the model as ``self.automl``; the original read
	        # ``self.classifier``, which is never assigned in this class.
	        pred_proba = self.automl.predict_proba(encoded)
	        label_indexes = pred_proba.argmax(axis=1)
	        # Pick, for every row, the probability of its arg-max class.
	        probs = pred_proba[np.arange(pred_proba.shape[0]), label_indexes]
	        # Map class indexes back to the original target labels.
	        target_validator = self.automl.automl_.InputValidator.target_validator
	        labels = target_validator.inverse_transform(label_indexes)
	        return [
	            {"label": label, "confidence": float(pro)}
	            for label, pro in zip(labels, probs)
	        ]