#-------------------------------------------------------------
#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
#
#-------------------------------------------------------------
source("nn/layers/affine.dml") as affine
source("nn/layers/cross_entropy_loss.dml") as cross_entropy_loss
source("nn/layers/dropout.dml") as dropout
source("nn/layers/relu.dml") as relu
source("nn/layers/softmax.dml") as softmax
source("nn/optim/adam.dml") as adam
source("scripts/staging/entity-resolution/primitives/evaluation.dml") as evaluation

# Implements the neural network used in Sherlock: A Deep Learning Approach to Semantic Data Type Detection
#
# [Hulsebos, Madelon, et al. "Sherlock: A deep learning approach to semantic data type detection."
# Proceedings of the 25th ACM SIGKDD International Conference on Knowledge Discovery & Data Mining.
# 2019.]

# Trains a softmax classifier with two hidden layers.
# ---------------------------------------------------------------------------------------------
# NAME                  TYPE    DEFAULT  MEANING
# ---------------------------------------------------------------------------------------------
# X_train               Matrix  ---      input data matrix, of shape (N, D)
# y_train               Matrix  ---      target matrix, of shape (N, K)
# hidden_layer_neurons  Integer ---      number of neurons per hidden layer (H)
# ---------------------------------------------------------------------------------------------
# W1, W2, W3            Matrix           weight (parameter) matrices, of shapes (D, H), (H, H), (H, K)
# b1, b2, b3            Matrix           bias vectors, of shapes (1, H), (1, H), (1, K)
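#
# A minimal usage sketch (the read paths, CSV format, and the choice of 256 hidden neurons
# are illustrative assumptions, not part of this script):
#   X_train = read("path/to/X_train.csv", format="csv")
#   y_train = read("path/to/y_train.csv", format="csv")
#   [W1, b1, W2, b2, W3, b3] = train(X_train, y_train, 256)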

train = function(matrix[double] X_train, matrix[double] y_train, int hidden_layer_neurons)
    return (matrix[double] W1, matrix[double] b1, matrix[double] W2, matrix[double] b2, matrix[double] W3, matrix[double] b3) {

  # Get input dimensions
  N = nrow(X_train)  # num examples
  D = ncol(X_train)  # num features
  t = ncol(y_train)  # num target columns (78 semantic types in the Sherlock benchmark)
  print("Training with " + N + " rows, " + D + " cols of data")
  # Create network:
  # batch -> affine1 -> relu1 -> dropout1 -> affine2 -> relu2 -> affine3 -> softmax
  H1 = hidden_layer_neurons  # number of neurons in 1st hidden layer
  H2 = hidden_layer_neurons  # number of neurons in 2nd hidden layer
  p = 0.3  # dropout probability
  [W1, b1] = affine::init(D, H1, -1)
  [W2, b2] = affine::init(H1, H2, -1)
  [W3, b3] = affine::init(H2, t, -1)

  # Initialize Adam parameters
  initial_lr = 0.0001  # initial learning rate
  decay = 0.0001  # learning rate decay constant (inverse-time decay)

  beta1 = 0.9  # [0, 1)
  beta2 = 0.999  # [0, 1)
  epsilon = 0.00000001
  adam_t = 0  # timestep passed to the Adam update
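  # For reference, adam::update applies the standard Adam rule (Kingma & Ba, 2015), roughly:
  #   m = beta1*m + (1-beta1)*dX;  v = beta2*v + (1-beta2)*dX^2
  #   X = X - lr * m_hat / (sqrt(v_hat) + epsilon), with bias-corrected m_hat and v_hat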

  # Initialize Adam optimizer state
  [mW1, vW1] = adam::init(W1);  [mb1, vb1] = adam::init(b1)
  [mW2, vW2] = adam::init(W2);  [mb2, vb2] = adam::init(b2)
  [mW3, vW3] = adam::init(W3);  [mb3, vb3] = adam::init(b3)

  # Optimize
  print("Starting optimization")
  batch_size = 256  # mini-batch size
  epochs = 100
  iters = ceil(N / batch_size)
  lr = initial_lr
  print("init lr: " + initial_lr + " decay: " + decay + " iters: " + iters)
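  # Example of the batching arithmetic (numbers are illustrative): with N = 1000 and
  # batch_size = 256, iters = ceil(1000 / 256) = 4, and the batches cover rows
  # 1-256, 257-512, 513-768 and 769-1000 (so the last batch holds the remaining 232 rows).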
  for (e in 1:epochs) {
    for (i in 1:iters) {
      # Get next batch
      beg = ((i-1) * batch_size) %% N + 1
      end = min(N, beg + batch_size - 1)
      X_batch = X_train[beg:end,]
      y_batch = y_train[beg:end,]

      # Compute forward pass
      ## layer 1:
      out1 = affine::forward(X_batch, W1, b1)
      outr1 = relu::forward(out1)
      [outd1, maskd1] = dropout::forward(outr1, p, -1)
      ## layer 2:
      out2 = affine::forward(outd1, W2, b2)
      outr2 = relu::forward(out2)
      ## layer 3:
      out3 = affine::forward(outr2, W3, b3)
      probs = softmax::forward(out3)

      if (i == 1) {
        # Compute loss & accuracy on the current batch
        loss = cross_entropy_loss::forward(probs, y_batch)
        accuracy = mean(rowIndexMax(probs) == rowIndexMax(y_batch))

        # Compute loss & accuracy on the full training set
        # (no held-out validation set is used here)
        probs_val = predict(X_train, W1, b1, W2, b2, W3, b3)
        loss_val = cross_entropy_loss::forward(probs_val, y_train)
        accuracy_val = mean(rowIndexMax(probs_val) == rowIndexMax(y_train))

        # Output results
        print("Epoch: " + e + ", Iter: " + i + ", Train Loss: " + loss + ", Train Accuracy: "
              + accuracy + ", Val Loss: " + loss_val + ", Val Accuracy: " + accuracy_val)
      }

      # Compute backward pass
      ## loss:
      dprobs = cross_entropy_loss::backward(probs, y_batch)
      ## layer 3:
      dout3 = softmax::backward(dprobs, out3)
      [doutr2, dW3, db3] = affine::backward(dout3, outr2, W3, b3)
      ## layer 2:
      dout2 = relu::backward(doutr2, out2)
      [doutd1, dW2, db2] = affine::backward(dout2, outd1, W2, b2)
      ## layer 1:
      doutr1 = dropout::backward(doutd1, outr1, p, maskd1)
      dout1 = relu::backward(doutr1, out1)
      [dX_batch, dW1, db1] = affine::backward(dout1, X_batch, W1, b1)

      # Optimize with Adam
      [W1, mW1, vW1] = adam::update(W1, dW1, lr, beta1, beta2, epsilon, adam_t, mW1, vW1)
      [b1, mb1, vb1] = adam::update(b1, db1, lr, beta1, beta2, epsilon, adam_t, mb1, vb1)
      [W2, mW2, vW2] = adam::update(W2, dW2, lr, beta1, beta2, epsilon, adam_t, mW2, vW2)
      [b2, mb2, vb2] = adam::update(b2, db2, lr, beta1, beta2, epsilon, adam_t, mb2, vb2)
      [W3, mW3, vW3] = adam::update(W3, dW3, lr, beta1, beta2, epsilon, adam_t, mW3, vW3)
      [b3, mb3, vb3] = adam::update(b3, db3, lr, beta1, beta2, epsilon, adam_t, mb3, vb3)
    }

    # Increment the Adam timestep and decay the learning rate (inverse-time decay)
    adam_t = adam_t + 1
    lr = initial_lr * (1 / (1 + decay * e))
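    # Example: with decay = 0.0001 the rate decreases only slightly,
    # e.g. to 0.0001 / (1 + 0.0001 * 100) ~ 0.000099 after 100 epochs.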
  }
}

# Computes the class probability predictions of the softmax classifier.
# ---------------------------------------------------------------------------------------------
# NAME      TYPE    DEFAULT  MEANING
# ---------------------------------------------------------------------------------------------
# test_val  Matrix  ---      input data matrix, of shape (N, D)
# W1-W3     Matrix  ---      weight (parameter) matrices, of shapes (D, H), (H, H), (H, K)
# b1-b3     Matrix  ---      bias vectors, of shapes (1, H), (1, H), (1, K)
# ---------------------------------------------------------------------------------------------
# probs     Matrix           class probabilities, of shape (N, K)
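#
# A minimal usage sketch (assumes the weights returned by train() above; the test matrix
# name is illustrative):
#   probs = predict(X_test, W1, b1, W2, b2, W3, b3)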
| |
| predict = function(matrix[double] test_val, |
| matrix[double] W1, matrix[double] b1, |
| matrix[double] W2, matrix[double] b2, |
| matrix[double] W3, matrix[double] b3) |
| return (matrix[double] probs) { |
| |
| N = nrow(test_val) |
| K = ncol(W3) # num features |
| |
| # Network: |
| # batch -> affine1 -> relu1 -> affine2 -> relu2 -> affine3 -> softmax |
| # Compute predictions over mini-batches |
| |
| probs = matrix(0, rows=N, cols=K) |
| |
| batch_size = 128 |
| iters = ceil(N / batch_size) |
| for(i in 1:iters) { |
| # Get next batch |
| beg = ((i-1) * batch_size) %% N + 1 |
| end = min(N, beg + batch_size - 1) |
| end = min(end, N) |
| X_batch = test_val[beg:end,] |
| |
| # Compute forward pass |
| ## layer 1: |
| out1 = affine::forward(X_batch, W1, b1) |
| outr1 = relu::forward(out1) |
| ## layer 2: |
| out2 = affine::forward(outr1, W2, b2) |
| outr2 = relu::forward(out2) |
| ## layer 3: |
| out3 = affine::forward(outr2, W3, b3) |
| probs_batch = softmax::forward(out3) |
| |
| # Store predictions |
| probs[beg:end,] = probs_batch |
| } |
| } |

# Evaluates the performance of the network.
# ---------------------------------------------------------------------------------------------
# NAME       TYPE    DEFAULT  MEANING
# ---------------------------------------------------------------------------------------------
# probs      Matrix  ---      class probabilities, of shape (N, K)
# Y          Matrix  ---      target matrix, of shape (N, K) (one-hot encoded)
# ---------------------------------------------------------------------------------------------
# loss       Double           scalar loss
# accuracy   Double           scalar accuracy
# f1         Double           scalar F1 score
# precision  Double           scalar precision
# recall     Double           scalar recall
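#
# A minimal usage sketch, chaining predict() and eval() (variable names are illustrative):
#   probs = predict(X_test, W1, b1, W2, b2, W3, b3)
#   [loss, accuracy, f1, precision, recall] = eval(probs, y_test)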

eval = function(matrix[double] probs, matrix[double] Y)
    return (double loss, double accuracy, double f1, double precision, double recall) {
  # Compute loss & accuracy
  loss = cross_entropy_loss::forward(probs, Y)
  correct_pred = rowIndexMax(probs) == rowIndexMax(Y)
  accuracy = mean(correct_pred)

  # Compute F1 score, precision & recall from one-hot encoded predictions
  rows = nrow(Y)
  cols = ncol(Y)
  predBooleanMatrix = table(seq(1, rows), rowIndexMax(probs[, 1:cols]), rows, cols)
  [f1, precision, recall] = evaluation::f1(predBooleanMatrix, Y)
}