scripts/builtin/sherlock.dml - systemds - Git at Google

 #-------------------------------------------------------------
 #
 # Licensed to the Apache Software Foundation (ASF) under one
 # or more contributor license agreements.  See the NOTICE file
 # distributed with this work for additional information
 # regarding copyright ownership.  The ASF licenses this file
 # to you under the Apache License, Version 2.0 (the
 # "License"); you may not use this file except in compliance
 # with the License.  You may obtain a copy of the License at
 #
 #   http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing,
 # software distributed under the License is distributed on an
 # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 # KIND, either express or implied.  See the License for the
 # specific language governing permissions and limitations
 # under the License.
 #
 #-------------------------------------------------------------
 source("scripts/builtin/sherlockNet.dml") as sherlockNet

 # Implements training phase of Sherlock: A Deep Learning Approach to Semantic Data Type Detection
 #
 # [Hulsebos, Madelon, et al. "Sherlock: A deep learning approach to semantic data type detection."
 # Proceedings of the 25th ACM SIGKDD International Conference on Knowledge Discovery & Data Mining.
 # 2019.]

 # Split feature matrix into four different feature categories and train neural networks on the
 # respective single features. Then combine all trained features to train final neural network.
 # ---------------------------------------------------------------------------------------------
 # NAME         TYPE      DEFAULT  MEANING
 # ---------------------------------------------------------------------------------------------
 # X_train      Matrix    ---      matrix of feature vectors
 # y_train      Matrix    ---      matrix Y of class labels of semantic data type
 # ---------------------------------------------------------------------------------------------
 # cW           Matrix    ---      weights (parameters) matrices for character distributions
 # cb           Matrix    ---      biases vectors for character distributions
 # wW           Matrix    ---      weights (parameters) matrices for word embeddings
 # wb           Matrix    ---      biases vectors for word embeddings
 # pW           Matrix    ---      weights (parameters) matrices for paragraph vectors
 # pb           Matrix    ---      biases vectors for paragraph vectors
 # sW           Matrix    ---      weights (parameters) matrices for global statistics
 # sb           Matrix    ---      biases vectors for global statistics
 # fW           Matrix    ---      weights (parameters) matrices for  combining all trained features (final)
 # fb           Matrix    ---      biases vectors for combining all trained features (final)
 # ---------------------------------------------------------------------------------------------

 m_sherlock = function(matrix[double] X_train, matrix[double] y_train)
       return (matrix[double] cW1, matrix[double] cb1,
             matrix[double] cW2, matrix[double] cb2,
             matrix[double] cW3, matrix[double] cb3,
             matrix[double] wW1, matrix[double] wb1,
             matrix[double] wW2, matrix[double] wb2,
             matrix[double] wW3, matrix[double] wb3,
             matrix[double] pW1, matrix[double] pb1,
             matrix[double] pW2, matrix[double] pb2,
             matrix[double] pW3, matrix[double] pb3,
             matrix[double] sW1, matrix[double] sb1,
             matrix[double] sW2, matrix[double] sb2,
             matrix[double] sW3, matrix[double] sb3,
             matrix[double] fW1, matrix[double] fb1,
             matrix[double] fW2, matrix[double] fb2,
             matrix[double] fW3, matrix[double] fb3) {
   # Trains one sub-network per feature category (c = character distributions,
   # w = word embeddings, p = paragraph vectors, s = global statistics) and then a
   # final network (f) on the column-wise concatenation of the four sub-network predictions.

   # The hard-coded column ranges below reach up to column 1588; report a too-narrow
   # input explicitly instead of failing inside the first out-of-range slice.
   if (ncol(X_train) < 1588) {
     stop("sherlock: X_train must have at least 1588 feature columns, but has " + ncol(X_train))
   }

   # Slice every feature category once and reuse it for both training and prediction.
   X_char = X_train[, 224:1183]
   X_word = cbind(X_train[, 13:212], X_train[, 1188])
   X_par  = X_train[, 1189:1588]
   X_stat = cbind(X_train[, 1:12], X_train[, 213:223], X_train[, 1184:1187])

   # Train the per-category networks. The third argument is handed to sherlockNet::train
   # as is (presumably the hidden layer size - confirm against sherlockNet.dml).
   [cW1, cb1, cW2, cb2, cW3, cb3] = sherlockNet::train(X_char, y_train, 300)
   [wW1, wb1, wW2, wb2, wW3, wb3] = sherlockNet::train(X_word, y_train, 200)
   [pW1, pb1, pW2, pb2, pW3, pb3] = sherlockNet::train(X_par, y_train, 400)
   [sW1, sb1, sW2, sb2, sW3, sb3] = sherlockNet::train(X_stat, y_train, 80)

   # Score the training data with each trained per-category network.
   cprobs = sherlockNet::predict(X_char, cW1, cb1, cW2, cb2, cW3, cb3)
   wprobs = sherlockNet::predict(X_word, wW1, wb1, wW2, wb2, wW3, wb3)
   pprobs = sherlockNet::predict(X_par, pW1, pb1, pW2, pb2, pW3, pb3)
   sprobs = sherlockNet::predict(X_stat, sW1, sb1, sW2, sb2, sW3, sb3)

   # Train the final network on the concatenated per-category predictions.
   first_predictions = cbind(cprobs, wprobs, pprobs, sprobs)
   [fW1, fb1, fW2, fb2, fW3, fb3] = sherlockNet::train(first_predictions, y_train, 500)
 }

 # ---------------------------------------------------------------------------------------------
 # methods to load and transform the data in order to pass valid input to sherlock function
 # ---------------------------------------------------------------------------------------------
 # Function to transform processed X values from the original sherlock project to
 # valid input for sherlock X input matrices.

 # ---------------------------------------------------------------------------------------------
 # NAME         TYPE      DEFAULT  MEANING
 # ---------------------------------------------------------------------------------------------
 # data         Frame     ---      Frame of strings with processed X values from original sherlock project
 # ---------------------------------------------------------------------------------------------
 # m_data       Matrix    ---      transformed matrix containing X values

 transform_values = function(frame[string] data) return (matrix[double] m_data) {
   # Converts the string frame into a numeric matrix: the first column is dropped
   # (presumably the exported row index - confirm against the input files) and the
   # textual booleans are rewritten as digits before the cast.
   features = data[, 2:ncol(data)]

   # "True" -> "1" first, then "False" -> "0", cell by cell
   without_true = map(features, "d -> d.replace(\"True\",\"1\")")
   numeric_text = map(without_true, "d -> d.replace(\"False\",\"0\")")

   m_data = as.matrix(numeric_text)
 }

 # ---------------------------------------------------------------------------------------------
 # Function to encode the string labels from the original sherlock project (y data)
 # to a numerical representation.
 # ---------------------------------------------------------------------------------------------

 # NAME            TYPE      DEFAULT  MEANING
 # ---------------------------------------------------------------------------------------------
 # data            Frame     ---      String labels from original sherlock project
 # transform_spec  String    ---      Transformation specification to encode label column to numerical representation
 #                                       example: "{ "recode":[ "label" ] }"
 # ---------------------------------------------------------------------------------------------
 # ground_truth    Matrix    ---      matrix containing ground truth in numerical representation
 # meta_data       Frame     ---      Frame containing meta data of transformation encoding

 transform_encode_labels = function(frame[string] data, string transform_spec) return (matrix[double] ground_truth , frame[string] meta_data) {
   # Encodes all columns after the first one with transformencode (using transform_spec)
   # and one-hot encodes the first encoded column into a rows x 78 indicator matrix.
   # meta_data is the transformencode metadata, returned unchanged.
   num_classes = 78
   rows = nrow(data)

   # replace label with number
   [m_data, meta_data] = transformencode(target=data[, 2:ncol(data)], spec=transform_spec)
   labels = m_data[, 1]

   # Every label must address one of the num_classes columns. NaN fails both comparisons,
   # so missing codes are caught here as well.
   num_valid = sum((labels >= 1) * (labels <= num_classes))
   if (num_valid != rows) {
     stop("transform_encode_labels: encoded labels must lie in 1.." + num_classes)
   }

   # Vectorized one-hot encoding: cell (i, labels[i]) is set to 1 for every row i,
   # replacing the row-by-row left-indexing loop.
   ground_truth = table(seq(1, rows), labels, rows, num_classes)
 }

 # ---------------------------------------------------------------------------------------------
 # Use the encoding from transform_encode_labels() to transform y labels to a
 # numerical representation.
 # ---------------------------------------------------------------------------------------------

 # NAME            TYPE      DEFAULT  MEANING
 # ---------------------------------------------------------------------------------------------
 # data            Frame     ---      String labels from original sherlock project
 # meta_data       Frame     ---      Frame containing meta data of transformation encoding
 # transform_spec  String    ---      Transformation specification to encode label column to numerical representation
 #                                       example: "{ "recode":[ "label" ] }"
 # ---------------------------------------------------------------------------------------------
 # ground_truth    Matrix    ---      matrix containing ground truth in numerical representation

 transform_apply_labels = function(frame[string] data, frame[string] meta_data, string transform_spec) return (matrix[double] ground_truth) {
   # Applies an existing encoding (transform_spec + meta_data) to all columns after the
   # first one and one-hot encodes the first encoded column into a rows x 78 matrix.
   num_classes = 78
   rows = nrow(data)

   # replace label with number; the first column of data is not passed to transformapply
   m_data = transformapply(target=data[, 2:ncol(data)],
                   spec=transform_spec,
                   meta=meta_data)
   labels = m_data[, 1]

   # Every label must address one of the num_classes columns. NaN fails both comparisons,
   # so labels that could not be encoded are caught here as well.
   num_valid = sum((labels >= 1) * (labels <= num_classes))
   if (num_valid != rows) {
     stop("transform_apply_labels: encoded labels must lie in 1.." + num_classes)
   }

   # Vectorized one-hot encoding: cell (i, labels[i]) is set to 1 for every row i,
   # replacing the row-by-row left-indexing loop.
   ground_truth = table(seq(1, rows), labels, rows, num_classes)
 }

 # ---------------------------------------------------------------------------------------------
 # Transform y labels to ground truth
 # ---------------------------------------------------------------------------------------------

 # NAME         TYPE      DEFAULT  MEANING
 # ---------------------------------------------------------------------------------------------
 # data         Matrix    ---      matrix with labels from original sherlock project
 # ---------------------------------------------------------------------------------------------
 # ground_truth Matrix    ---      matrix containing ground truth in numerical representation

 transform_labels_to_ground_truth = function(matrix[double] data)
   return(matrix[double] ground_truth) {
   # One-hot encodes the first column of data into a rows x 78 indicator matrix:
   # row i gets a 1 in the column given by data[i,1] and 0 everywhere else.
   # Assumes the labels are integer class ids - TODO confirm with callers.
   num_classes = 78
   rows = nrow(data)
   labels = data[, 1]

   # Every label must address one of the num_classes columns (NaN fails both comparisons).
   num_valid = sum((labels >= 1) * (labels <= num_classes))
   if (num_valid != rows) {
     stop("transform_labels_to_ground_truth: labels must lie in 1.." + num_classes)
   }

   # Vectorized replacement for the row-by-row left-indexing loop.
   ground_truth = table(seq(1, rows), labels, rows, num_classes)
 }
	#-------------------------------------------------------------
	#
	# Licensed to the Apache Software Foundation (ASF) under one
	# or more contributor license agreements. See the NOTICE file
	# distributed with this work for additional information
	# regarding copyright ownership. The ASF licenses this file
	# to you under the Apache License, Version 2.0 (the
	# "License"); you may not use this file except in compliance
	# with the License. You may obtain a copy of the License at
	#
	# http://www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing,
	# software distributed under the License is distributed on an
	# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
	# KIND, either express or implied. See the License for the
	# specific language governing permissions and limitations
	# under the License.
	#
	#-------------------------------------------------------------
	source("scripts/builtin/sherlockNet.dml") as sherlockNet

	# Implements training phase of Sherlock: A Deep Learning Approach to Semantic Data Type Detection
	#
	# [Hulsebos, Madelon, et al. "Sherlock: A deep learning approach to semantic data type detection."
	# Proceedings of the 25th ACM SIGKDD International Conference on Knowledge Discovery & Data Mining.
	# 2019.]

	# Split feature matrix into four different feature categories and train neural networks on the
	# respective single features. Then combine all trained features to train final neural network.
	# ---------------------------------------------------------------------------------------------
	# NAME TYPE DEFAULT MEANING
	# ---------------------------------------------------------------------------------------------
	# X_train Matrix --- matrix of feature vectors
	# y_train Matrix --- matrix Y of class labels of semantic data type
	# ---------------------------------------------------------------------------------------------
	# cW Matrix --- weights (parameters) matrices for character distributions
	# cb Matrix --- biases vectors for character distributions
	# wW Matrix --- weights (parameters) matrices for word embeddings
	# wb Matrix --- biases vectors for word embeddings
	# pW Matrix --- weights (parameters) matrices for paragraph vectors
	# pb Matrix --- biases vectors for paragraph vectors
	# sW Matrix --- weights (parameters) matrices for global statistics
	# sb Matrix --- biases vectors for global statistics
	# fW Matrix --- weights (parameters) matrices for combining all trained features (final)
	# fb Matrix --- biases vectors for combining all trained features (final)
	# ---------------------------------------------------------------------------------------------

	m_sherlock = function(matrix[double] X_train, matrix[double] y_train)
	return (matrix[double] cW1, matrix[double] cb1,
	matrix[double] cW2, matrix[double] cb2,
	matrix[double] cW3, matrix[double] cb3,
	matrix[double] wW1, matrix[double] wb1,
	matrix[double] wW2, matrix[double] wb2,
	matrix[double] wW3, matrix[double] wb3,
	matrix[double] pW1, matrix[double] pb1,
	matrix[double] pW2, matrix[double] pb2,
	matrix[double] pW3, matrix[double] pb3,
	matrix[double] sW1, matrix[double] sb1,
	matrix[double] sW2, matrix[double] sb2,
	matrix[double] sW3, matrix[double] sb3,
	matrix[double] fW1, matrix[double] fb1,
	matrix[double] fW2, matrix[double] fb2,
	matrix[double] fW3, matrix[double] fb3) {
	  # Trains one sub-network per feature category (c = character distributions,
	  # w = word embeddings, p = paragraph vectors, s = global statistics) and then a
	  # final network (f) on the column-wise concatenation of the four sub-network predictions.

	  # The hard-coded column ranges below reach up to column 1588; report a too-narrow
	  # input explicitly instead of failing inside the first out-of-range slice.
	  if (ncol(X_train) < 1588) {
	    stop("sherlock: X_train must have at least 1588 feature columns, but has " + ncol(X_train))
	  }

	  # Slice every feature category once and reuse it for both training and prediction.
	  X_char = X_train[, 224:1183]
	  X_word = cbind(X_train[, 13:212], X_train[, 1188])
	  X_par  = X_train[, 1189:1588]
	  X_stat = cbind(X_train[, 1:12], X_train[, 213:223], X_train[, 1184:1187])

	  # Train the per-category networks. The third argument is handed to sherlockNet::train
	  # as is (presumably the hidden layer size - confirm against sherlockNet.dml).
	  [cW1, cb1, cW2, cb2, cW3, cb3] = sherlockNet::train(X_char, y_train, 300)
	  [wW1, wb1, wW2, wb2, wW3, wb3] = sherlockNet::train(X_word, y_train, 200)
	  [pW1, pb1, pW2, pb2, pW3, pb3] = sherlockNet::train(X_par, y_train, 400)
	  [sW1, sb1, sW2, sb2, sW3, sb3] = sherlockNet::train(X_stat, y_train, 80)

	  # Score the training data with each trained per-category network.
	  cprobs = sherlockNet::predict(X_char, cW1, cb1, cW2, cb2, cW3, cb3)
	  wprobs = sherlockNet::predict(X_word, wW1, wb1, wW2, wb2, wW3, wb3)
	  pprobs = sherlockNet::predict(X_par, pW1, pb1, pW2, pb2, pW3, pb3)
	  sprobs = sherlockNet::predict(X_stat, sW1, sb1, sW2, sb2, sW3, sb3)

	  # Train the final network on the concatenated per-category predictions.
	  first_predictions = cbind(cprobs, wprobs, pprobs, sprobs)
	  [fW1, fb1, fW2, fb2, fW3, fb3] = sherlockNet::train(first_predictions, y_train, 500)
	}

	# ---------------------------------------------------------------------------------------------
	# methods to load and transform the data in order to pass valid input to sherlock function
	# ---------------------------------------------------------------------------------------------
	# Function to transform processed X values from the original sherlock project to
	# valid input for sherlock X input matrices.

	# ---------------------------------------------------------------------------------------------
	# NAME TYPE DEFAULT MEANING
	# ---------------------------------------------------------------------------------------------
	# data Frame --- Frame of strings with processed X values from original sherlock project
	# ---------------------------------------------------------------------------------------------
	# m_data Matrix --- transformed matrix containing X values

	transform_values = function(frame[string] data) return (matrix[double] m_data) {
	  # Converts the string frame into a numeric matrix: the first column is dropped
	  # (presumably the exported row index - confirm against the input files) and the
	  # textual booleans are rewritten as digits before the cast.
	  features = data[, 2:ncol(data)]

	  # "True" -> "1" first, then "False" -> "0", cell by cell
	  without_true = map(features, "d -> d.replace(\"True\",\"1\")")
	  numeric_text = map(without_true, "d -> d.replace(\"False\",\"0\")")

	  m_data = as.matrix(numeric_text)
	}

	# ---------------------------------------------------------------------------------------------
	# Function to encode the string labels from the original sherlock project (y data)
	# to a numerical representation.
	# ---------------------------------------------------------------------------------------------

	# NAME TYPE DEFAULT MEANING
	# ---------------------------------------------------------------------------------------------
	# data Frame --- String labels from original sherlock project
	# transform_spec String --- Transformation specification to encode label column to numerical representation
	# example: "{ "recode":[ "label" ] }"
	# ---------------------------------------------------------------------------------------------
	# ground_truth Matrix --- matrix containing ground truth in numerical representation
	# meta_data Frame --- Frame containing meta data of transformation encoding

	transform_encode_labels = function(frame[string] data, string transform_spec) return (matrix[double] ground_truth , frame[string] meta_data) {
	  # Encodes all columns after the first one with transformencode (using transform_spec)
	  # and one-hot encodes the first encoded column into a rows x 78 indicator matrix.
	  # meta_data is the transformencode metadata, returned unchanged.
	  num_classes = 78
	  rows = nrow(data)

	  # replace label with number
	  [m_data, meta_data] = transformencode(target=data[, 2:ncol(data)], spec=transform_spec)
	  labels = m_data[, 1]

	  # Every label must address one of the num_classes columns. NaN fails both comparisons,
	  # so missing codes are caught here as well.
	  num_valid = sum((labels >= 1) * (labels <= num_classes))
	  if (num_valid != rows) {
	    stop("transform_encode_labels: encoded labels must lie in 1.." + num_classes)
	  }

	  # Vectorized one-hot encoding: cell (i, labels[i]) is set to 1 for every row i,
	  # replacing the row-by-row left-indexing loop.
	  ground_truth = table(seq(1, rows), labels, rows, num_classes)
	}

	# ---------------------------------------------------------------------------------------------
	# Use the encoding from transform_encode_labels() to transform y labels to a
	# numerical representation.
	# ---------------------------------------------------------------------------------------------

	# NAME TYPE DEFAULT MEANING
	# ---------------------------------------------------------------------------------------------
	# data Frame --- String labels from original sherlock project
	# meta_data Frame --- Frame containing meta data of transformation encoding
	# transform_spec String --- Transformation specification to encode label column to numerical representation
	# example: "{ "recode":[ "label" ] }"
	# ---------------------------------------------------------------------------------------------
	# ground_truth Matrix --- matrix containing ground truth in numerical representation

	transform_apply_labels = function(frame[string] data, frame[string] meta_data, string transform_spec) return (matrix[double] ground_truth) {
	  # Applies an existing encoding (transform_spec + meta_data) to all columns after the
	  # first one and one-hot encodes the first encoded column into a rows x 78 matrix.
	  num_classes = 78
	  rows = nrow(data)

	  # replace label with number; the first column of data is not passed to transformapply
	  m_data = transformapply(target=data[, 2:ncol(data)],
	                  spec=transform_spec,
	                  meta=meta_data)
	  labels = m_data[, 1]

	  # Every label must address one of the num_classes columns. NaN fails both comparisons,
	  # so labels that could not be encoded are caught here as well.
	  num_valid = sum((labels >= 1) * (labels <= num_classes))
	  if (num_valid != rows) {
	    stop("transform_apply_labels: encoded labels must lie in 1.." + num_classes)
	  }

	  # Vectorized one-hot encoding: cell (i, labels[i]) is set to 1 for every row i,
	  # replacing the row-by-row left-indexing loop.
	  ground_truth = table(seq(1, rows), labels, rows, num_classes)
	}

	# ---------------------------------------------------------------------------------------------
	# Transform y labels to ground truth
	# ---------------------------------------------------------------------------------------------

	# NAME TYPE DEFAULT MEANING
	# ---------------------------------------------------------------------------------------------
	# data Matrix --- matrix with labels from original sherlock project
	# ---------------------------------------------------------------------------------------------
	# ground_truth Matrix --- matrix containing ground truth in numerical representation

	transform_labels_to_ground_truth = function(matrix[double] data)
	return(matrix[double] ground_truth) {
	  # One-hot encodes the first column of data into a rows x 78 indicator matrix:
	  # row i gets a 1 in the column given by data[i,1] and 0 everywhere else.
	  # Assumes the labels are integer class ids - TODO confirm with callers.
	  num_classes = 78
	  rows = nrow(data)
	  labels = data[, 1]

	  # Every label must address one of the num_classes columns (NaN fails both comparisons).
	  num_valid = sum((labels >= 1) * (labels <= num_classes))
	  if (num_valid != rows) {
	    stop("transform_labels_to_ground_truth: labels must lie in 1.." + num_classes)
	  }

	  # Vectorized replacement for the row-by-row left-indexing loop.
	  ground_truth = table(seq(1, rows), labels, rows, num_classes)
	}