# scripts/builtin/glove.dml - systemds - Git at Google

 #-------------------------------------------------------------
 #
 # Licensed to the Apache Software Foundation (ASF) under one
 # or more contributor license agreements.  See the NOTICE file
 # distributed with this work for additional information
 # regarding copyright ownership.  The ASF licenses this file
 # to you under the Apache License, Version 2.0 (the
 # "License"); you may not use this file except in compliance
 # with the License.  You may obtain a copy of the License at
 #
 #   http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing,
 # software distributed under the License is distributed on an
 # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 # KIND, either express or implied.  See the License for the
 # specific language governing permissions and limitations
 # under the License.
 #-------------------------------------------------------------


 # Computes the vector embeddings for words in a large text corpus.
 #
 # INPUT:
 # --------------------------------------------------------------------------------
 # input                 1D input corpus in CSV format.
 # seed                  Random seed for reproducibility.
 # vector_size           Dimensionality of word vectors, V.
 # eta                   Learning rate for optimization, recommended value: 0.05.
 # alpha                 Weighting function parameter, recommended value: 0.75.
 # x_max                 Maximum co-occurrence value as per the GloVe paper: 100.
 # tol                   Tolerance on the change in loss between iterations; training stops once the change falls below it (to avoid overfitting), recommended value: 1e-4.
 # iterations            Total number of training iterations.
 # print_loss_it         Interval (in iterations) for printing the loss.
 # maxTokens             Maximum number of tokens per text entry.
 # windowSize            Context window size.
 # distanceWeighting     Whether to apply distance-based weighting.
 # symmetric             Determines if the matrix is symmetric (TRUE) or asymmetric (FALSE).
 # ------------------------------------------------------------------------------
 #
 # OUTPUT:
 # ------------------------------------------------------------------------------
 # G                     The word indices and their word vectors, of shape (N, V). Each represented as a vector, of shape (1,V)
 # ------------------------------------------------------------------------------


 # End-to-end entry point: derives the co-occurrence statistics from the raw
 # corpus and then trains the embeddings on them. Both steps are delegated;
 # this function only wires the arguments through.
 f_glove = function(Frame[Unknown] input, int seed, int vector_size,
     double alpha, double eta, double x_max, double tol,
     int iterations, int print_loss_it,
     Int maxTokens, Int windowSize,
     Boolean distanceWeighting, Boolean symmetric)
     return (frame[Unknown] G) {

     # Step 1: co-occurrence counts plus the index frame describing them
     # (cooccurrenceMatrix is defined outside this script).
     [counts, vocabulary] = cooccurrenceMatrix(input, maxTokens, windowSize, distanceWeighting, symmetric);

     # Step 2: fit the word vectors on those counts; the result is returned as-is.
     G = gloveWithCoocMatrix(counts, vocabulary, seed, vector_size, alpha, eta, x_max, tol, iterations, print_loss_it);
 }


 # Prepares the two constant matrices used by the training loop.
 #
 # Inputs:
 #  - cooc_matrix: co-occurrence counts.
 #  - x_max:       count at which the weight saturates at 1.
 #  - alpha:       exponent of the weighting function.
 #
 # Outputs:
 #  - weights:         min(1, (min(X, x_max) / x_max) ^ alpha), cell-wise.
 #  - log_cooc_matrix: natural log of the counts; cells with a count <= 0 are set to 0.
 init = function(matrix[double] cooc_matrix, double x_max, double alpha)
   return(matrix[double] weights, matrix[double] log_cooc_matrix){
   # clip the counts at x_max before scaling so the ratio never exceeds 1
   bounded = pmin(cooc_matrix, x_max);
   weights = pmin(1, (bounded / x_max) ^ alpha);
   # single-argument log() is the natural logarithm; this avoids the rounding
   # error of passing a truncated decimal approximation of e as the base
   log_cooc_matrix = ifelse(cooc_matrix > 0, log(cooc_matrix), 0);
 }

 gloveWithCoocMatrix = function(matrix[double] cooc_matrix, frame[Unknown] cooc_index, int seed, int vector_size, double alpha, double eta, double x_max, double tol, int iterations,int print_loss_it)
     return (frame[Unknown] G){
     /*
      * Computes the vector embeddings for words by analyzing their co-occurrence statistics in a large text corpus.
      *
      * Inputs:
      *  - cooc_matrix: Precomputed co-occurrence matrix of shape (N, N).
      *  - cooc_index:  Index file mapping words to their positions in the co-occurrence matrix.
      *                 The second column should contain the word list in the same order as the matrix.
      *  - seed: Random seed for reproducibility (seed .. seed+3 are used for the four parameter blocks).
      *  - vector_size: Dimensionality of word vectors, V.
      *  - eta: Learning rate for optimization, recommended value: 0.05.
      *  - alpha: Weighting function parameter, recommended value: 0.75.
      *  - x_max: Maximum co-occurrence value as per the GloVe paper: 100.
      *  - tol: Training stops once the absolute change in loss between two
      *         consecutive passes drops below this value, recommended value: 1e-4.
      *  - iterations: Maximum number of parameter updates.
      *  - print_loss_it: Interval (in iterations) for printing the loss.
      *
      * Outputs:
      *  - G: frame whose first column is the second column of cooc_index and whose
      *       remaining V columns hold the word vectors (W + C), one row per word.
      *       If no update is performed (e.g. iterations <= 0), the vectors are the
      *       random initialization.
      */

     vocab_size = nrow(cooc_matrix);

     # word vectors (W), context vectors (C) and their biases (bw, bc):
     # rand(min=0, max=1) values shifted by -0.5 and scaled by 1/vector_size
     W = (rand(rows=vocab_size, cols=vector_size, min=0, max=1, seed=seed)-0.5)/vector_size;
     C = (rand(rows=vocab_size, cols=vector_size, min=0, max=1, seed=seed+1)-0.5)/vector_size;
     bw = (rand(rows=vocab_size, cols=1, min=0, max=1, seed=seed+2)-0.5)/vector_size;
     bc = (rand(rows=vocab_size, cols=1, min=0, max=1, seed=seed+3)-0.5)/vector_size;
     [weights, log_cooc_matrix] = init(cooc_matrix, x_max, alpha);

     # running sums of squared gradients (AdaGrad-style accumulators), one per parameter block
     momentum_W = 1e-8 + 0.1 * matrix(1, nrow(W), ncol(W));
     momentum_C = 1e-8 + 0.1 * matrix(1, nrow(C), ncol(C));
     momentum_bw = 1e-8 + 0.1 * matrix(1, nrow(bw), ncol(bw));
     momentum_bc = 1e-8 + 0.1 * matrix(1, nrow(bc), ncol(bc));

     error = 0;
     iter = 0;
     # initialized up front so the summary below is well-defined even if the
     # loop exits before performing a single update
     final_iter = 0;
     previous_error = 1e10;
     conti = TRUE;

     while (conti) {

         # compute predictions for all co-occurring word pairs at once
         predictions = W %*% t(C) + bw + t(bc);
         diffs = predictions - log_cooc_matrix;

         # weighted squared-error loss of the current parameters
         error = sum(0.5 * (weights * (diffs ^ 2)));
         iter = iter + 1;

         # keep training only while the loss still changes by at least tol
         # and the update budget is not exhausted
         if ((abs(previous_error - error) >= tol) & (iter <= iterations)) {

             # compute gradients (only needed when an update is actually applied)
             weighted_diffs = weights * diffs;
             wgrad = weighted_diffs %*% C;
             cgrad = t(weighted_diffs) %*% W;
             bwgrad = rowSums(weighted_diffs);
             bcgrad = matrix(colSums(weighted_diffs), nrow(bc), ncol(bc));

             # accumulate squared gradients, then take the scaled step
             momentum_W = momentum_W + (wgrad ^ 2);
             momentum_C = momentum_C + (cgrad ^ 2);
             momentum_bw = momentum_bw + (bwgrad ^ 2);
             momentum_bc = momentum_bc + (bcgrad ^ 2);

             W = W - (eta * wgrad / (sqrt(momentum_W) + 1e-8));
             C = C - (eta * cgrad / (sqrt(momentum_C) + 1e-8));
             bw = bw - (eta * bwgrad / (sqrt(momentum_bw) + 1e-8));
             bc = bc - (eta * bcgrad / (sqrt(momentum_bc) + 1e-8));

             previous_error = error;
             final_iter = iter;
         } else {
             conti = FALSE;
         }

         # print the loss whenever iter is a multiple of print_loss_it
         if (iter - floor(iter / print_loss_it) * print_loss_it == 0) {
             print("iteration: " + iter + " error: " + error);
         }
     }

     print("Given " + iterations + " iterations, " + "stopped (or converged) at the " + final_iter + " iteration / error: " + error);

     # final embedding = word vector + context vector; built after the loop so
     # it always exists, then prefixed with the word column of the index
     G = cbind(cooc_index[,2], as.frame(W + C));
 }
	#-------------------------------------------------------------
	#
	# Licensed to the Apache Software Foundation (ASF) under one
	# or more contributor license agreements. See the NOTICE file
	# distributed with this work for additional information
	# regarding copyright ownership. The ASF licenses this file
	# to you under the Apache License, Version 2.0 (the
	# "License"); you may not use this file except in compliance
	# with the License. You may obtain a copy of the License at
	#
	# http://www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing,
	# software distributed under the License is distributed on an
	# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
	# KIND, either express or implied. See the License for the
	# specific language governing permissions and limitations
	# under the License.
	#-------------------------------------------------------------


	# Computes the vector embeddings for words in a large text corpus.
	#
	# INPUT:
	# --------------------------------------------------------------------------------
	# input 1D input corpus in CSV format.
	# seed Random seed for reproducibility.
	# vector_size Dimensionality of word vectors, V.
	# eta Learning rate for optimization, recommended value: 0.05.
	# alpha Weighting function parameter, recommended value: 0.75.
	# x_max Maximum co-occurrence value as per the GloVe paper: 100.
	# tol Tolerance on the change in loss between iterations; training stops once the change falls below it (to avoid overfitting), recommended value: 1e-4.
	# iterations Total number of training iterations.
	# print_loss_it Interval (in iterations) for printing the loss.
	# maxTokens Maximum number of tokens per text entry.
	# windowSize Context window size.
	# distanceWeighting Whether to apply distance-based weighting.
	# symmetric Determines if the matrix is symmetric (TRUE) or asymmetric (FALSE).
	# ------------------------------------------------------------------------------
	#
	# OUTPUT:
	# ------------------------------------------------------------------------------
	# G The word indices and their word vectors, of shape (N, V). Each represented as a vector, of shape (1,V)
	# ------------------------------------------------------------------------------


	# End-to-end entry point: derives the co-occurrence statistics from the raw
	# corpus and then trains the embeddings on them. Both steps are delegated;
	# this function only wires the arguments through.
	f_glove = function(Frame[Unknown] input, int seed, int vector_size,
		double alpha, double eta, double x_max, double tol,
		int iterations, int print_loss_it,
		Int maxTokens, Int windowSize,
		Boolean distanceWeighting, Boolean symmetric)
		return (frame[Unknown] G) {

		# Step 1: co-occurrence counts plus the index frame describing them
		# (cooccurrenceMatrix is defined outside this script).
		[counts, vocabulary] = cooccurrenceMatrix(input, maxTokens, windowSize, distanceWeighting, symmetric);

		# Step 2: fit the word vectors on those counts; the result is returned as-is.
		G = gloveWithCoocMatrix(counts, vocabulary, seed, vector_size, alpha, eta, x_max, tol, iterations, print_loss_it);
	}


	# Prepares the two constant matrices used by the training loop.
	#
	# Inputs:
	# - cooc_matrix: co-occurrence counts.
	# - x_max: count at which the weight saturates at 1.
	# - alpha: exponent of the weighting function.
	#
	# Outputs:
	# - weights: min(1, (min(X, x_max) / x_max) ^ alpha), cell-wise.
	# - log_cooc_matrix: natural log of the counts; cells with a count <= 0 are set to 0.
	init = function(matrix[double] cooc_matrix, double x_max, double alpha)
		return(matrix[double] weights, matrix[double] log_cooc_matrix){
		# clip the counts at x_max before scaling so the ratio never exceeds 1
		bounded = pmin(cooc_matrix, x_max);
		weights = pmin(1, (bounded / x_max) ^ alpha);
		# single-argument log() is the natural logarithm; this avoids the rounding
		# error of passing a truncated decimal approximation of e as the base
		log_cooc_matrix = ifelse(cooc_matrix > 0, log(cooc_matrix), 0);
	}

	gloveWithCoocMatrix = function(matrix[double] cooc_matrix, frame[Unknown] cooc_index, int seed, int vector_size, double alpha, double eta, double x_max, double tol, int iterations,int print_loss_it)
		return (frame[Unknown] G){
		/*
		* Computes the vector embeddings for words by analyzing their co-occurrence statistics in a large text corpus.
		*
		* Inputs:
		* - cooc_matrix: Precomputed co-occurrence matrix of shape (N, N).
		* - cooc_index: Index file mapping words to their positions in the co-occurrence matrix.
		*   The second column should contain the word list in the same order as the matrix.
		* - seed: Random seed for reproducibility (seed .. seed+3 are used for the four parameter blocks).
		* - vector_size: Dimensionality of word vectors, V.
		* - eta: Learning rate for optimization, recommended value: 0.05.
		* - alpha: Weighting function parameter, recommended value: 0.75.
		* - x_max: Maximum co-occurrence value as per the GloVe paper: 100.
		* - tol: Training stops once the absolute change in loss between two
		*   consecutive passes drops below this value, recommended value: 1e-4.
		* - iterations: Maximum number of parameter updates.
		* - print_loss_it: Interval (in iterations) for printing the loss.
		*
		* Outputs:
		* - G: frame whose first column is the second column of cooc_index and whose
		*   remaining V columns hold the word vectors (W + C), one row per word.
		*   If no update is performed (e.g. iterations <= 0), the vectors are the
		*   random initialization.
		*/

		vocab_size = nrow(cooc_matrix);

		# word vectors (W), context vectors (C) and their biases (bw, bc):
		# rand(min=0, max=1) values shifted by -0.5 and scaled by 1/vector_size
		W = (rand(rows=vocab_size, cols=vector_size, min=0, max=1, seed=seed)-0.5)/vector_size;
		C = (rand(rows=vocab_size, cols=vector_size, min=0, max=1, seed=seed+1)-0.5)/vector_size;
		bw = (rand(rows=vocab_size, cols=1, min=0, max=1, seed=seed+2)-0.5)/vector_size;
		bc = (rand(rows=vocab_size, cols=1, min=0, max=1, seed=seed+3)-0.5)/vector_size;
		[weights, log_cooc_matrix] = init(cooc_matrix, x_max, alpha);

		# running sums of squared gradients (AdaGrad-style accumulators), one per parameter block
		momentum_W = 1e-8 + 0.1 * matrix(1, nrow(W), ncol(W));
		momentum_C = 1e-8 + 0.1 * matrix(1, nrow(C), ncol(C));
		momentum_bw = 1e-8 + 0.1 * matrix(1, nrow(bw), ncol(bw));
		momentum_bc = 1e-8 + 0.1 * matrix(1, nrow(bc), ncol(bc));

		error = 0;
		iter = 0;
		# initialized up front so the summary below is well-defined even if the
		# loop exits before performing a single update
		final_iter = 0;
		previous_error = 1e10;
		conti = TRUE;

		while (conti) {

			# compute predictions for all co-occurring word pairs at once
			predictions = W %*% t(C) + bw + t(bc);
			diffs = predictions - log_cooc_matrix;

			# weighted squared-error loss of the current parameters
			error = sum(0.5 * (weights * (diffs ^ 2)));
			iter = iter + 1;

			# keep training only while the loss still changes by at least tol
			# and the update budget is not exhausted
			if ((abs(previous_error - error) >= tol) & (iter <= iterations)) {

				# compute gradients (only needed when an update is actually applied)
				weighted_diffs = weights * diffs;
				wgrad = weighted_diffs %*% C;
				cgrad = t(weighted_diffs) %*% W;
				bwgrad = rowSums(weighted_diffs);
				bcgrad = matrix(colSums(weighted_diffs), nrow(bc), ncol(bc));

				# accumulate squared gradients, then take the scaled step
				momentum_W = momentum_W + (wgrad ^ 2);
				momentum_C = momentum_C + (cgrad ^ 2);
				momentum_bw = momentum_bw + (bwgrad ^ 2);
				momentum_bc = momentum_bc + (bcgrad ^ 2);

				W = W - (eta * wgrad / (sqrt(momentum_W) + 1e-8));
				C = C - (eta * cgrad / (sqrt(momentum_C) + 1e-8));
				bw = bw - (eta * bwgrad / (sqrt(momentum_bw) + 1e-8));
				bc = bc - (eta * bcgrad / (sqrt(momentum_bc) + 1e-8));

				previous_error = error;
				final_iter = iter;
			} else {
				conti = FALSE;
			}

			# print the loss whenever iter is a multiple of print_loss_it
			if (iter - floor(iter / print_loss_it) * print_loss_it == 0) {
				print("iteration: " + iter + " error: " + error);
			}
		}

		print("Given " + iterations + " iterations, " + "stopped (or converged) at the " + final_iter + " iteration / error: " + error);

		# final embedding = word vector + context vector; built after the loop so
		# it always exists, then prefixed with the word column of the index
		G = cbind(cooc_index[,2], as.frame(W + C));
	}