# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
# This example provides an end-to-end pipeline for a common Kaggle competition,
# including common utilities such as k-fold cross-validation and data
# pre-processing.
#
# Specifically, it uses the `House Prices: Advanced Regression Techniques`
# challenge as a case study.
#
# The link to the problem on Kaggle:
# https://www.kaggle.com/c/house-prices-advanced-regression-techniques
import pandas as pd
from mxnet import autograd
from mxnet import gluon
from mxnet import np
# After logging in at www.kaggle.com, the training and test data sets can be downloaded at:
# https://www.kaggle.com/c/house-prices-advanced-regression-techniques/download/train.csv
# https://www.kaggle.com/c/house-prices-advanced-regression-techniques/download/test.csv
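# Alternatively, assuming the Kaggle CLI is installed and configured, the same
# files can be fetched with:
#   kaggle competitions download -c house-prices-advanced-regression-techniques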
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")
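# The column slice 'MSSubClass':'SaleCondition' keeps every feature column
# while dropping the Id column and the SalePrice label.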
all_X = pd.concat((train.loc[:, 'MSSubClass':'SaleCondition'],
test.loc[:, 'MSSubClass':'SaleCondition']))
# Get all the numerical features and apply standardization.
numeric_feas = all_X.dtypes[all_X.dtypes != "object"].index
all_X[numeric_feas] = all_X[numeric_feas].apply(
    lambda x: (x - x.mean()) / x.std())
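# Optional sanity check (safe to delete): each standardized numeric column
# should now have mean ~0; pandas skips the remaining N/A entries, which are
# filled below, and all-constant columns (std 0) yield NaN means.
assert (all_X[numeric_feas].mean().abs().fillna(0.0) < 1e-6).all()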
# Convert categorical feature values to numerical (including N/A).
all_X = pd.get_dummies(all_X, dummy_na=True)
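# For illustration, get_dummies turns a categorical column such as MSZoning
# (values 'RL', 'RM', ...) into one 0/1 indicator column per value, plus one
# for N/A, so missing categories carry information rather than being dropped.
print(f"Feature matrix after one-hot encoding: {all_X.shape}")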
# Fill the remaining N/A entries (all in numeric columns) with the column
# mean, which is ~0 after standardization.
all_X = all_X.fillna(all_X.mean())
num_train = train.shape[0]
# Convert the data to ndarrays to feed into gluon. (DataFrame.as_matrix() was
# removed in pandas 1.0; .values is the equivalent.)
X_train = all_X[:num_train].values
X_test = all_X[num_train:].values
y_train = train.SalePrice.values
X_train = np.array(X_train)
y_train = np.array(y_train)
y_train = y_train.reshape((num_train, 1))
X_test = np.array(X_test)
square_loss = gluon.loss.L2Loss()
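# gluon's L2Loss computes (prediction - label)^2 / 2 per example, which is why
# get_rmse_log below multiplies the summed loss by 2.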
def get_rmse_log(net, X_train, y_train):
"""Gets root mse between the logarithms of the prediction and the truth."""
num_train = X_train.shape[0]
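    # Clip predictions to [1, inf) so that taking the logarithm is safe.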
clipped_preds = np.clip(net(X_train), 1, float('inf'))
return np.sqrt(2 * np.sum(square_loss(
np.log(clipped_preds), np.log(y_train))).item() / num_train)
def get_net():
"""Gets a neural network. Better results are obtained with modifications."""
net = gluon.nn.Sequential()
net.add(gluon.nn.Dense(50, activation="relu"))
net.add(gluon.nn.Dense(1))
net.initialize()
return net
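# A hedged variation (not used by the pipeline below, and untuned): a wider
# hidden layer plus dropout is a common next step to try on this data set.
def get_net_deeper():
    """A hypothetical alternative architecture; validate before trusting it."""
    net = gluon.nn.Sequential()
    net.add(gluon.nn.Dense(128, activation="relu"))
    net.add(gluon.nn.Dropout(0.2))
    net.add(gluon.nn.Dense(1))
    net.initialize()
    return net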
def train_model(net, X_train, y_train, epochs, verbose_epoch, learning_rate,
                weight_decay, batch_size):
    """Trains the model and returns the final training RMSE-log."""
dataset_train = gluon.data.ArrayDataset(X_train, y_train)
data_iter_train = gluon.data.DataLoader(dataset_train, batch_size,
shuffle=True)
trainer = gluon.Trainer(net.collect_params(), 'adam',
{'learning_rate': learning_rate,
'wd': weight_decay})
net.initialize(force_reinit=True)
for epoch in range(epochs):
for data, label in data_iter_train:
with autograd.record():
output = net(data)
loss = square_loss(output, label)
loss.backward()
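            # step(batch_size) normalizes the accumulated gradient by 1/batch_size.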
trainer.step(batch_size)
avg_loss = get_rmse_log(net, X_train, y_train)
if epoch > verbose_epoch:
print(f"Epoch {epoch}, train loss: {avg_loss}")
return avg_loss
def k_fold_cross_valid(k, epochs, verbose_epoch, X_train, y_train,
learning_rate, weight_decay, batch_size):
"""Conducts k-fold cross validation for the model."""
assert k > 1
fold_size = X_train.shape[0] // k
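    # Note: integer division means the last X_train.shape[0] % k examples are
    # not used in any fold.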
train_loss_sum = 0.0
test_loss_sum = 0.0
for test_idx in range(k):
        X_val_test = X_train[test_idx * fold_size:
                             (test_idx + 1) * fold_size, :]
y_val_test = y_train[test_idx * fold_size: (test_idx + 1) * fold_size]
val_train_defined = False
for i in range(k):
if i != test_idx:
X_cur_fold = X_train[i * fold_size: (i + 1) * fold_size, :]
y_cur_fold = y_train[i * fold_size: (i + 1) * fold_size]
if not val_train_defined:
X_val_train = X_cur_fold
y_val_train = y_cur_fold
val_train_defined = True
else:
X_val_train = np.concatenate([X_val_train, X_cur_fold], axis=0)
y_val_train = np.concatenate([y_val_train, y_cur_fold], axis=0)
net = get_net()
        train_loss = train_model(net, X_val_train, y_val_train, epochs,
                                 verbose_epoch, learning_rate, weight_decay,
                                 batch_size)
train_loss_sum += train_loss
test_loss = get_rmse_log(net, X_val_test, y_val_test)
print(f"Test loss: {test_loss}")
test_loss_sum += test_loss
return train_loss_sum / k, test_loss_sum / k
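# A hedged alternative: sklearn.model_selection.KFold produces shuffled index
# splits and uses every example, at the cost of a scikit-learn dependency.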
# The hyperparameters below are a starting point; better results are obtained
# with modifications, and they can be fine-tuned with k-fold cross-validation.
k = 5
epochs = 100
verbose_epoch = 95
learning_rate = 0.3
weight_decay = 100
batch_size = 100
train_loss, test_loss = \
k_fold_cross_valid(k, epochs, verbose_epoch, X_train, y_train,
learning_rate, weight_decay, batch_size)
print(f"{k}-fold validation: Avg train loss: {train_loss}, Avg test loss: {test_loss}")
def learn(epochs, verbose_epoch, X_train, y_train, X_test, test, learning_rate,
          weight_decay, batch_size):
    """Trains the model on all training data and predicts on the test set."""
    net = get_net()
    train_model(net, X_train, y_train, epochs, verbose_epoch, learning_rate,
                weight_decay, batch_size)
    preds = net(X_test).asnumpy()
    test['SalePrice'] = pd.Series(preds.reshape(-1))
submission = pd.concat([test['Id'], test['SalePrice']], axis=1)
submission.to_csv('submission.csv', index=False)
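# Run the full pipeline: train on all training data, predict on the test set,
# and write submission.csv (columns Id and SalePrice) for upload to Kaggle.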
learn(epochs, verbose_epoch, X_train, y_train, X_test, test, learning_rate,
      weight_decay, batch_size)