blob: 53af20abe0a1bc75e9768b4b5d8eaed49a9e97ff [file] [log] [blame]
#-------------------------------------------------------------
#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
#
#-------------------------------------------------------------
source("scripts/builtin/sherlockNet.dml") as sherlockNet
# Implements training phase of Sherlock: A Deep Learning Approach to Semantic Data Type Detection
#
# [Hulsebos, Madelon, et al. "Sherlock: A deep learning approach to semantic data type detection."
# Proceedings of the 25th ACM SIGKDD International Conference on Knowledge Discovery & Data Mining.
# 2019.]
# Split feature matrix into four different feature categories and train neural networks on the
# respective single features. Then combine all trained features to train final neural network.
# ---------------------------------------------------------------------------------------------
# NAME TYPE DEFAULT MEANING
# ---------------------------------------------------------------------------------------------
# X_train Matrix --- matrix of feature vectors
# y_train Matrix --- matrix Y of class labels of semantic data type
# ---------------------------------------------------------------------------------------------
# cW Matrix --- weights (parameters) matrices for character distributions
# cb Matrix --- biases vectors for character distributions
# wW Matrix --- weights (parameters) matrices for word embeddings
# wb Matrix --- biases vectors for word embeddings
# pW Matrix --- weights (parameters) matrices for paragraph vectors
# pb Matrix --- biases vectors for paragraph vectors
# sW Matrix --- weights (parameters) matrices for global statistics
# sb Matrix --- biases vectors for global statistics
# fW Matrix --- weights (parameters) matrices for combining all trained features (final)
# fb Matrix --- biases vectors for combining all trained features (final)
# ---------------------------------------------------------------------------------------------
m_sherlock = function(matrix[double] X_train, matrix[double] y_train)
  return (matrix[double] cW1, matrix[double] cb1,
          matrix[double] cW2, matrix[double] cb2,
          matrix[double] cW3, matrix[double] cb3,
          matrix[double] wW1, matrix[double] wb1,
          matrix[double] wW2, matrix[double] wb2,
          matrix[double] wW3, matrix[double] wb3,
          matrix[double] pW1, matrix[double] pb1,
          matrix[double] pW2, matrix[double] pb2,
          matrix[double] pW3, matrix[double] pb3,
          matrix[double] sW1, matrix[double] sb1,
          matrix[double] sW2, matrix[double] sb2,
          matrix[double] sW3, matrix[double] sb3,
          matrix[double] fW1, matrix[double] fb1,
          matrix[double] fW2, matrix[double] fb2,
          matrix[double] fW3, matrix[double] fb3) {
  # Trains one sub-network per feature category on its slice of X_train, then
  # trains a final network on the concatenated predictions of the sub-networks.
  #
  # Column layout of X_train used below (X_train needs at least 1588 columns):
  #   character distributions : 224:1183
  #   word embeddings         : 13:212 and 1188
  #   paragraph vectors       : 1189:1588
  #   global statistics       : 1:12, 213:223 and 1184:1187
  #
  # Each slice is built exactly once so that training and prediction are
  # guaranteed to see the same columns.
  X_char = X_train[, 224:1183]
  X_word = cbind(X_train[, 13:212], X_train[, 1188])
  X_par  = X_train[, 1189:1588]
  X_stat = cbind(X_train[, 1:12], X_train[, 213:223], X_train[, 1184:1187])

  # train the feature categories separately
  # NOTE(review): the third argument (300/200/400/80/500) is passed through to
  # sherlockNet::train unchanged; presumably a layer size - confirm there.
  [cW1, cb1, cW2, cb2, cW3, cb3] = sherlockNet::train(X_char, y_train, 300)
  [wW1, wb1, wW2, wb2, wW3, wb3] = sherlockNet::train(X_word, y_train, 200)
  [pW1, pb1, pW2, pb2, pW3, pb3] = sherlockNet::train(X_par, y_train, 400)
  [sW1, sb1, sW2, sb2, sW3, sb3] = sherlockNet::train(X_stat, y_train, 80)

  # run every trained sub-network on its own training features
  cprobs = sherlockNet::predict(X_char, cW1, cb1, cW2, cb2, cW3, cb3)
  wprobs = sherlockNet::predict(X_word, wW1, wb1, wW2, wb2, wW3, wb3)
  pprobs = sherlockNet::predict(X_par, pW1, pb1, pW2, pb2, pW3, pb3)
  sprobs = sherlockNet::predict(X_stat, sW1, sb1, sW2, sb2, sW3, sb3)

  # use the sub-network outputs as the input features of the final network
  first_predictions = cbind(cprobs, wprobs, pprobs, sprobs)
  [fW1, fb1, fW2, fb2, fW3, fb3] = sherlockNet::train(first_predictions, y_train, 500)
}
# ---------------------------------------------------------------------------------------------
# methods to load and transform the data in order to pass valid input to sherlock function
# ---------------------------------------------------------------------------------------------
# Function to transform processed X values from the original sherlock project to
# valid input for sherlock X input matrices.
# ---------------------------------------------------------------------------------------------
# NAME TYPE DEFAULT MEANING
# ---------------------------------------------------------------------------------------------
# data Frame[String] --- frame with processed X values from original sherlock project
# ---------------------------------------------------------------------------------------------
# m_data Matrix --- transformed matrix containing X values
transform_values = function(frame[string] data) return (matrix[double] m_data) {
  # Converts the string frame of processed X values into a numeric matrix.
  n_rows = nrow(data)
  n_cols = ncol(data)

  # keep everything except the first column
  # (presumably the index column of the original export - confirm with caller)
  values = data[1:n_rows, 2:n_cols]

  # rewrite boolean literals as 1/0 so that every cell can be cast to a number
  values = map(values, "d -> d.replace(\"True\",\"1\")")
  values = map(values, "d -> d.replace(\"False\",\"0\")")

  m_data = as.matrix(values)
}
# ---------------------------------------------------------------------------------------------
# Function to encode the string labels from the original sherlock project (y data)
# to a numerical representation.
# ---------------------------------------------------------------------------------------------
# NAME TYPE DEFAULT MEANING
# ---------------------------------------------------------------------------------------------
# data Frame[String] --- frame with string labels from original sherlock project
# transform_spec String --- Transformation specification to encode label column to numerical representation
# example: "{ "recode":[ "label" ] }"
# ---------------------------------------------------------------------------------------------
# ground_truth Matrix --- matrix containing ground truth in numerical representation
# meta_data Frame[String] --- frame containing meta data of transformation encoding
transform_encode_labels = function(frame[string] data, string transform_spec) return (matrix[double] ground_truth , frame[string] meta_data) {
  # Encodes the label column(s) of data with transformencode and returns the
  # first encoded column as a (rows x 78) one-hot matrix, together with the
  # encoding meta data.
  rows = nrow(data)
  cols = ncol(data)

  # replace label with number; the first column of data is not passed on
  [m_data, meta_data] = transformencode(target=data[1:rows,2:cols],
    spec=transform_spec)
  labels = m_data[,1]

  # ground_truth has a fixed width of 78 classes; fail loudly on larger codes
  # instead of silently dropping them from the one-hot matrix
  if (max(labels) > 78) {
    stop("transform_encode_labels: encoded label exceeds the supported 78 classes")
  }

  # vectorized one-hot encoding: cell (i, labels[i]) is set to 1
  ground_truth = table(seq(1, rows), labels, rows, 78)
}
# ---------------------------------------------------------------------------------------------
# Use the encoding from transform_encode_labels() to transform y labels to a
# numerical representation.
# ---------------------------------------------------------------------------------------------
# NAME TYPE DEFAULT MEANING
# ---------------------------------------------------------------------------------------------
# data Frame[String] --- frame with string labels from original sherlock project
# meta_data Frame[String] --- frame containing meta data of transformation encoding
# transform_spec String --- Transformation specification to encode label column to numerical representation
# example: "{ "recode":[ "label" ] }"
# ---------------------------------------------------------------------------------------------
# ground_truth Matrix --- matrix containing ground truth in numerical representation
transform_apply_labels = function(frame[string] data, frame[string] meta_data, string transform_spec) return (matrix[double] ground_truth) {
  # Applies an existing encoding (meta_data) to the label column(s) of data and
  # returns the first encoded column as a (rows x 78) one-hot matrix.
  rows = nrow(data)
  cols = ncol(data)

  # replace label with number; the first column of data is not passed on
  m_data = transformapply(target=data[1:rows,2:cols],
    spec=transform_spec,
    meta=meta_data)
  labels = m_data[,1]

  # ground_truth has a fixed width of 78 classes; fail loudly on larger codes
  # instead of silently dropping them from the one-hot matrix
  if (max(labels) > 78) {
    stop("transform_apply_labels: encoded label exceeds the supported 78 classes")
  }

  # vectorized one-hot encoding: cell (i, labels[i]) is set to 1
  ground_truth = table(seq(1, rows), labels, rows, 78)
}
# ---------------------------------------------------------------------------------------------
# Transform y labels to ground truth
# ---------------------------------------------------------------------------------------------
# NAME TYPE DEFAULT MEANING
# ---------------------------------------------------------------------------------------------
# data Matrix --- matrix with labels from original sherlock project
# ---------------------------------------------------------------------------------------------
# ground_truth Matrix --- matrix containing ground truth in numerical representation
transform_labels_to_ground_truth = function(matrix[double] data)
  return(matrix[double] ground_truth) {
  # Converts the numeric class labels in the first column of data into a
  # (rows x 78) one-hot matrix.
  rows = nrow(data)
  labels = data[,1]

  # ground_truth has a fixed width of 78 classes; fail loudly on larger labels
  # instead of silently dropping them from the one-hot matrix
  if (max(labels) > 78) {
    stop("transform_labels_to_ground_truth: label exceeds the supported 78 classes")
  }

  # vectorized one-hot encoding: cell (i, labels[i]) is set to 1
  ground_truth = table(seq(1, rows), labels, rows, 78)
}