#-------------------------------------------------------------
#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
#
#-------------------------------------------------------------
source("nn/layers/affine.dml") as affine
source("nn/layers/cross_entropy_loss.dml") as cross_entropy_loss
source("nn/layers/dropout.dml") as dropout
source("nn/layers/relu.dml") as relu
source("nn/layers/softmax.dml")as softmax
source("nn/optim/adam.dml") as adam
source("scripts/staging/entity-resolution/primitives/evaluation.dml") as evaluation
# Implements the neural network for Sherlock: A Deep Learning Approach to Semantic Data Type Detection.
#
# [Hulsebos, Madelon, et al. "Sherlock: A deep learning approach to semantic data type detection."
# Proceedings of the 25th ACM SIGKDD International Conference on Knowledge Discovery & Data Mining.
# 2019.]
#
# Trains a softmax classifier with two hidden layers.
# ---------------------------------------------------------------------------------------------
# NAME                  TYPE    DEFAULT  MEANING
# ---------------------------------------------------------------------------------------------
# X_train               Matrix  ---      input data matrix, of shape (N, D)
# y_train               Matrix  ---      target matrix, of shape (N, K)
# hidden_layer_neurons  int     ---      number of neurons per hidden layer
# ---------------------------------------------------------------------------------------------
# W1, W2, W3            Matrix           weight (parameter) matrices of the three affine layers
# b1, b2, b3            Matrix           bias vectors of the three affine layers
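#
# Example (hypothetical call; X_train and y_train are assumed to be prepared
# upstream, with y_train one-hot encoded over the 78 Sherlock types):
#   [W1, b1, W2, b2, W3, b3] = train(X_train, y_train, 256)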
train = function(matrix[double] X_train, matrix[double] y_train, int hidden_layer_neurons)
return (matrix[double] W1, matrix[double] b1, matrix[double] W2, matrix[double] b2, matrix[double] W3, matrix[double] b3) {
# Derive input dimensions
N = nrow(X_train) # num examples
D = ncol(X_train) # num features
t = 78 # num target cols (Sherlock's 78 semantic types)
print("Training with " + N + " rows, " + D + " cols of data")
# Create network:
# batch -> affine1 -> relu1 -> dropout1 -> affine2 -> relu2 -> affine3 -> softmax
H1 = hidden_layer_neurons # number of neurons in 1st hidden layer
H2 = hidden_layer_neurons # number of neurons in 2nd hidden layer
p = 0.3 # dropout probability
[W1, b1] = affine::init(D, H1, -1)
[W2, b2] = affine::init(H1, H2, -1)
[W3, b3] = affine::init(H2, t, -1)
# Initialize Adam hyperparameters
initial_lr = 0.0001 # learning rate
decay = 0.0001 # learning rate decay constant
beta1 = 0.9 # exponential decay rate for the 1st moment estimates, in [0, 1)
beta2 = 0.999 # exponential decay rate for the 2nd moment estimates, in [0, 1)
epsilon = 0.00000001 # numerical stability constant
adam_t = 0 # timestep of the Adam updates
# Initialize Adam optimizer state
[mW1, vW1] = adam::init(W1); [mb1, vb1] = adam::init(b1)
[mW2, vW2] = adam::init(W2); [mb2, vb2] = adam::init(b2)
[mW3, vW3] = adam::init(W3); [mb3, vb3] = adam::init(b3)
# Optimize
print("Starting optimization")
batch_size = 256
epochs = 100
iters = ceil(N / batch_size)
lr = initial_lr
print("init lr: " + initial_lr + " decay: " + decay + "iters: " + iters)
for (e in 1:epochs) {
for(i in 1:iters){
# Get next batch
beg = ((i-1) * batch_size) %% N + 1
end = min(N, beg + batch_size - 1)
X_batch = X_train[beg:end,]
y_batch = y_train[beg:end,]
# Compute forward pass
## layer 1:
out1 = affine::forward(X_batch, W1, b1)
outr1 = relu::forward(out1)
[outd1, maskd1] = dropout::forward(outr1, p, -1)
## layer 2:
out2 = affine::forward(outd1, W2, b2)
outr2 = relu::forward(out2)
## layer 3:
out3 = affine::forward(outr2, W3, b3)
probs = softmax::forward(out3)
if (i==1) {
# Compute loss
loss = cross_entropy_loss::forward(probs, y_batch)
accuracy = mean(rowIndexMax(probs) == rowIndexMax(y_batch))
# Compute loss & accuracy on the full training set (used in place of a held-out validation set)
probs_val = predict(X_train, W1, b1, W2, b2, W3, b3)
loss_val = cross_entropy_loss::forward(probs_val, y_train)
accuracy_val = mean(rowIndexMax(probs_val) == rowIndexMax(y_train))
# Output results
print("Epoch: " + e + ", Iter: " + i + ", Train Loss: " + loss + ", Train Accuracy: "
+ accuracy + ", Val Loss: " + loss_val + ", Val Accuracy: " + accuracy_val)
}
# Compute backward pass
## loss:
dprobs = cross_entropy_loss::backward(probs, y_batch)
## layer 3:
dout3 = softmax::backward(dprobs, out3)
[doutr2, dW3, db3] = affine::backward(dout3, outr2, W3, b3)
## layer 2:
dout2 = relu::backward(doutr2, out2)
[doutd1, dW2, db2] = affine::backward(dout2, outd1, W2, b2)
## layer 1:
doutr1 = dropout::backward(doutd1, outr1, p, maskd1)
dout1 = relu::backward(doutr1, out1)
[dX_batch, dW1, db1] = affine::backward(dout1, X_batch, W1, b1)
# Optimize with Adam
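# The calls below apply the standard Adam update (as implemented in nn/optim/adam.dml):
#   m = beta1*m + (1-beta1)*dX;  v = beta2*v + (1-beta2)*dX^2
#   X = X - lr * (m / (1-beta1^t)) / (sqrt(v / (1-beta2^t)) + epsilon)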
[W1, mW1, vW1] = adam::update(W1, dW1, lr, beta1, beta2, epsilon, adam_t, mW1, vW1)
[b1, mb1, vb1] = adam::update(b1, db1, lr, beta1, beta2, epsilon, adam_t, mb1, vb1)
[W2, mW2, vW2] = adam::update(W2, dW2, lr, beta1, beta2, epsilon, adam_t, mW2, vW2)
[b2, mb2, vb2] = adam::update(b2, db2, lr, beta1, beta2, epsilon, adam_t, mb2, vb2)
[W3, mW3, vW3] = adam::update(W3, dW3, lr, beta1, beta2, epsilon, adam_t, mW3, vW3)
[b3, mb3, vb3] = adam::update(b3, db3, lr, beta1, beta2, epsilon, adam_t, mb3, vb3)
adam_t = adam_t + 1 # advance the Adam timestep once per mini-batch update
}
# Anneal the learning rate with inverse time decay
lr = initial_lr * (1 / (1 + decay * e))
}
}
# Computes the class probability predictions of the softmax classifier.
# ---------------------------------------------------------------------------------------------
# NAME        TYPE    DEFAULT  MEANING
# ---------------------------------------------------------------------------------------------
# test_val    Matrix  ---      input data matrix, of shape (N, D)
# W1, W2, W3  Matrix  ---      weight (parameter) matrices of the three affine layers
# b1, b2, b3  Matrix  ---      bias vectors of the three affine layers
# ---------------------------------------------------------------------------------------------
# probs       Matrix           class probabilities, of shape (N, K)
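#
# Example (hypothetical call, reusing the parameters returned by train):
#   probs = predict(X_test, W1, b1, W2, b2, W3, b3)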
predict = function(matrix[double] test_val,
matrix[double] W1, matrix[double] b1,
matrix[double] W2, matrix[double] b2,
matrix[double] W3, matrix[double] b3)
return (matrix[double] probs) {
N = nrow(test_val)
K = ncol(W3) # num classes
# Network:
# batch -> affine1 -> relu1 -> affine2 -> relu2 -> affine3 -> softmax
# Compute predictions over mini-batches
probs = matrix(0, rows=N, cols=K)
batch_size = 128
iters = ceil(N / batch_size)
for(i in 1:iters) {
# Get next batch
beg = ((i-1) * batch_size) %% N + 1
end = min(N, beg + batch_size - 1)
X_batch = test_val[beg:end,]
# Compute forward pass
## layer 1:
out1 = affine::forward(X_batch, W1, b1)
outr1 = relu::forward(out1)
## layer 2:
out2 = affine::forward(outr1, W2, b2)
outr2 = relu::forward(out2)
## layer 3:
out3 = affine::forward(outr2, W3, b3)
probs_batch = softmax::forward(out3)
# Store predictions
probs[beg:end,] = probs_batch
}
}
# Evaluates the performance of the network.
# ---------------------------------------------------------------------------------------------
# NAME       TYPE    DEFAULT  MEANING
# ---------------------------------------------------------------------------------------------
# probs      Matrix  ---      class probabilities, of shape (N, K)
# Y          Matrix  ---      target matrix, of shape (N, K) (one-hot encoded)
# ---------------------------------------------------------------------------------------------
# loss       double           scalar cross-entropy loss
# accuracy   double           scalar classification accuracy
# f1         double           scalar F1 score
# precision  double           scalar precision
# recall     double           scalar recall
eval = function(matrix[double] probs, matrix[double] Y)
return (double loss, double accuracy, double f1, double precision, double recall) {
# Compute loss & accuracy
loss = cross_entropy_loss::forward(probs, Y)
correct_pred = rowIndexMax(probs) == rowIndexMax(Y)
accuracy = mean(correct_pred)
# Compute F1 score, precision & recall
rows = nrow(Y)
cols = ncol(Y)
# One-hot encode the predicted class of each row
predBooleanMatrix = table(seq(1, rows), rowIndexMax(probs), rows, cols)
[f1, precision, recall] = evaluation::f1(predBooleanMatrix, Y)
}
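#
# Example end-to-end usage (a hypothetical sketch; loading the data and one-hot
# encoding the labels are assumed to happen upstream):
#   [W1, b1, W2, b2, W3, b3] = train(X_train, y_train, 256)
#   probs = predict(X_test, W1, b1, W2, b2, W3, b3)
#   [loss, accuracy, f1, precision, recall] = eval(probs, y_test)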