blob: 6f8a801aed2987ba69ccf74f2ae40be290e34504 [file] [log] [blame]
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Iris H2O Example"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Acquisitor and Cleaner"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"marvin_cell": "acquisitor"
},
"outputs": [],
"source": [
"import pandas as pd\n",
"from marvin_python_toolbox.common.data import MarvinData\n",
"\n",
"file_path = MarvinData.download_file(url=\"https://s3.amazonaws.com/marvin-engines-data/Iris.csv\")\n",
"\n",
"iris = pd.read_csv(file_path)\n",
"\n",
"marvin_initial_dataset = iris"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Training Preparator"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"marvin_cell": "tpreparator"
},
"outputs": [],
"source": [
"from sklearn.model_selection import train_test_split\n",
"from sklearn import model_selection\n",
"\n",
"X_train, X_test= train_test_split(marvin_initial_dataset, random_state=1,test_size=0.3)\n",
"\n",
"marvin_dataset = {'train_X': X_train, 'test_X': X_test}\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Trainer"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"marvin_cell": "trainer",
"scrolled": true
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Checking whether there is an H2O instance running at http://localhost:54321 . connected.\n"
]
},
{
"data": {
"text/html": [
"<div style=\"overflow:auto\"><table style=\"width:50%\"><tr><td>H2O cluster uptime:</td>\n",
"<td>1 day 0 hours 48 mins</td></tr>\n",
"<tr><td>H2O cluster timezone:</td>\n",
"<td>America/Sao_Paulo</td></tr>\n",
"<tr><td>H2O data parsing timezone:</td>\n",
"<td>UTC</td></tr>\n",
"<tr><td>H2O cluster version:</td>\n",
"<td>3.26.0.3</td></tr>\n",
"<tr><td>H2O cluster version age:</td>\n",
"<td>20 days </td></tr>\n",
"<tr><td>H2O cluster name:</td>\n",
"<td>H2O_from_python_fernandozagatti_7kn410</td></tr>\n",
"<tr><td>H2O cluster total nodes:</td>\n",
"<td>1</td></tr>\n",
"<tr><td>H2O cluster free memory:</td>\n",
"<td>3.791 Gb</td></tr>\n",
"<tr><td>H2O cluster total cores:</td>\n",
"<td>4</td></tr>\n",
"<tr><td>H2O cluster allowed cores:</td>\n",
"<td>4</td></tr>\n",
"<tr><td>H2O cluster status:</td>\n",
"<td>locked, healthy</td></tr>\n",
"<tr><td>H2O connection url:</td>\n",
"<td>http://localhost:54321</td></tr>\n",
"<tr><td>H2O connection proxy:</td>\n",
"<td>None</td></tr>\n",
"<tr><td>H2O internal security:</td>\n",
"<td>False</td></tr>\n",
"<tr><td>H2O API Extensions:</td>\n",
"<td>Amazon S3, XGBoost, Algos, AutoML, Core V3, Core V4</td></tr>\n",
"<tr><td>Python version:</td>\n",
"<td>3.7.3 final</td></tr></table></div>"
],
"text/plain": [
"-------------------------- ---------------------------------------------------\n",
"H2O cluster uptime: 1 day 0 hours 48 mins\n",
"H2O cluster timezone: America/Sao_Paulo\n",
"H2O data parsing timezone: UTC\n",
"H2O cluster version: 3.26.0.3\n",
"H2O cluster version age: 20 days\n",
"H2O cluster name: H2O_from_python_fernandozagatti_7kn410\n",
"H2O cluster total nodes: 1\n",
"H2O cluster free memory: 3.791 Gb\n",
"H2O cluster total cores: 4\n",
"H2O cluster allowed cores: 4\n",
"H2O cluster status: locked, healthy\n",
"H2O connection url: http://localhost:54321\n",
"H2O connection proxy:\n",
"H2O internal security: False\n",
"H2O API Extensions: Amazon S3, XGBoost, Algos, AutoML, Core V3, Core V4\n",
"Python version: 3.7.3 final\n",
"-------------------------- ---------------------------------------------------"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Parse progress: |█████████████████████████████████████████████████████████| 100%\n",
"Parse progress: |█████████████████████████████████████████████████████████| 100%\n",
"AutoML progress: |████████████████████████████████████████████████████████| 100%\n"
]
}
],
"source": [
"import h2o\n",
"from h2o.automl import H2OAutoML\n",
"\n",
"h2o.init()\n",
"\n",
"train_X_frame = h2o.H2OFrame.from_python(marvin_dataset['train_X'])\n",
"test_X_frame = h2o.H2OFrame.from_python(marvin_dataset['test_X'])\n",
"\n",
"x = train_X_frame.columns\n",
"y = 'Species'\n",
"x.remove(y)\n",
"\n",
"automl = H2OAutoML(max_models=20, seed=1)\n",
"automl.train(x=x, \n",
" y=y, \n",
" training_frame=train_X_frame)\n",
"\n",
"marvin_model = automl"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Metrics Evaluator"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"marvin_cell": "evaluator",
"scrolled": false
},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/home/fernandozagatti/.virtualenvs/iris-h2o-automl-env/lib/python3.7/site-packages/pandas/core/frame.py:4102: SettingWithCopyWarning: \n",
"A value is trying to be set on a copy of a slice from a DataFrame\n",
"\n",
"See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
" errors=errors,\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Parse progress: |█████████████████████████████████████████████████████████| 100%\n",
"gbm prediction progress: |████████████████████████████████████████████████| 100%\n"
]
}
],
"source": [
"import h2o\n",
"from sklearn import metrics\n",
"\n",
"#h2o.init()\n",
"\n",
"y_test = marvin_dataset['test_X']['Species']\n",
"marvin_dataset['test_X'].drop(columns='Species',inplace= True)\n",
"\n",
"teste = h2o.H2OFrame.from_python(marvin_dataset['test_X'])\n",
"preds = marvin_model.predict(teste).as_data_frame()['predict'].values\n",
"marvin_metrics = metrics.accuracy_score(y_test, preds)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Prediction Preparator"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {
"marvin_cell": "ppreparator",
"scrolled": true
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Parse progress: |█████████████████████████████████████████████████████████| 100%\n"
]
}
],
"source": [
"import h2o\n",
"import pandas as pd\n",
"\n",
"input_message = {'SepalLengthCm': [input_message[0]], 'SepalWidthCm': [input_message[1]],\n",
" 'PetalLengthCm': [input_message[2]], 'PetalWidthCm': [input_message[3]]}\n",
"input_message = pd.DataFrame(data=input_message)\n",
"input_message = h2o.H2OFrame.from_python(input_message)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Predictor"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {
"marvin_cell": "predictor"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"gbm prediction progress: |████████████████████████████████████████████████| 100%\n"
]
}
],
"source": [
"final_prediction = marvin_model.predict(input_message).as_data_frame().values[0][0]"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Iris-versicolor\n"
]
}
],
"source": [
"print(final_prediction)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.3"
}
},
"nbformat": 4,
"nbformat_minor": 1
}