blob: 9acf52975c65b1e22aec4b1ceb359974a5211c52 [file] [log] [blame]
#-------------------------------------------------------------
#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
#-------------------------------------------------------------
# Computes the vector embeddings for words in a large text corpus.
#
# INPUT:
# --------------------------------------------------------------------------------
# input 1D input corpus in CSV format.
# seed Random seed for reproducibility.
# vector_size Dimensionality of word vectors, V.
# eta Learning rate for optimization, recommended value: 0.05.
# alpha Weighting function parameter, recommended value: 0.75.
# x_max Maximum co-occurrence value as per the GloVe paper: 100.
# tol Convergence tolerance: training stops once the loss changes by less than tol between two iterations, recommended value: 1e-4.
# iterations Total number of training iterations.
# print_loss_it Interval (in iterations) for printing the loss.
# maxTokens Maximum number of tokens per text entry.
# windowSize Context window size.
# distanceWeighting Whether to apply distance-based weighting.
# symmetric Determines if the matrix is symmetric (TRUE) or asymmetric (FALSE).
# ------------------------------------------------------------------------------
#
# OUTPUT:
# ------------------------------------------------------------------------------
# G Frame with one row per word: the word itself followed by its word vector of length V, i.e. shape (N, V+1).
# ------------------------------------------------------------------------------
f_glove = function(Frame[Unknown] input, int seed, int vector_size, double alpha, double eta,
    double x_max, double tol, int iterations, int print_loss_it, Int maxTokens, Int windowSize,
    Boolean distanceWeighting, Boolean symmetric)
  return (frame[Unknown] G)
{
  # Step 1: turn the raw corpus into a co-occurrence matrix plus the index
  # frame that accompanies it.
  [counts, vocab] = cooccurrenceMatrix(input, maxTokens, windowSize, distanceWeighting, symmetric);

  # Step 2: train the embeddings on those counts; the result is returned as-is.
  G = gloveWithCoocMatrix(counts, vocab, seed, vector_size, alpha, eta, x_max, tol,
    iterations, print_loss_it);
}
# Prepares the two constant matrices used by the training loop.
#
# Inputs:
#  - cooc_matrix: co-occurrence counts.
#  - x_max:       cap applied to the counts before weighting.
#  - alpha:       exponent of the weighting function.
#
# Outputs:
#  - weights:         min(1, (min(count, x_max) / x_max) ^ alpha), element-wise.
#  - log_cooc_matrix: natural logarithm of every positive count, 0 elsewhere.
init = function(matrix[double] cooc_matrix, double x_max, double alpha)
  return(matrix[double] weights, matrix[double] log_cooc_matrix){
  # Cap the counts at x_max so that the ratio below never exceeds 1.
  bounded = pmin(cooc_matrix, x_max);
  weights = pmin(1, (bounded / x_max) ^ alpha);
  # Single-argument log() is the natural logarithm; it replaces the former
  # log(X, 2.718281828), whose truncated base skewed every target slightly.
  # Non-positive counts are mapped to 0 instead of -Inf/NaN.
  log_cooc_matrix = ifelse(cooc_matrix > 0, log(cooc_matrix), 0);
}
gloveWithCoocMatrix = function(matrix[double] cooc_matrix, frame[Unknown] cooc_index, int seed, int vector_size, double alpha, double eta, double x_max, double tol, int iterations,int print_loss_it)
  return (frame[Unknown] G){
  /*
   * Computes the vector embeddings for words by analyzing their co-occurrence statistics in a large text corpus.
   *
   * Inputs:
   *  - cooc_matrix: Precomputed co-occurrence matrix of shape (N, N).
   *  - cooc_index: Index file mapping words to their positions in the co-occurrence matrix.
   *      The second column should contain the word list in the same order as the matrix.
   *  - seed: Random seed for reproducibility.
   *  - vector_size: Dimensionality of word vectors, V.
   *  - eta: Learning rate for optimization, recommended value: 0.05.
   *  - alpha: Weighting function parameter, recommended value: 0.75.
   *  - x_max: Maximum co-occurrence value as per the GloVe paper: 100.
   *  - tol: Convergence tolerance; training stops once the loss changes by less
   *      than tol between two iterations, recommended value: 1e-4.
   *  - iterations: Maximum number of update steps. With iterations <= 0 no
   *      update is performed and the initial random vectors are returned.
   *  - print_loss_it: Interval (in iterations) for printing the loss.
   *
   * Outputs:
   *  - G: frame with one row per word: the word (second column of cooc_index)
   *      followed by its word vector of length V.
   */
  vocab_size = nrow(cooc_matrix);

  # Word vectors (W), context vectors (C) and their biases, drawn uniformly
  # from [-0.5, 0.5) / vector_size with distinct seeds.
  W = (rand(rows=vocab_size, cols=vector_size, min=0, max=1, seed=seed)-0.5)/vector_size;
  C = (rand(rows=vocab_size, cols=vector_size, min=0, max=1, seed=seed+1)-0.5)/vector_size;
  bw = (rand(rows=vocab_size, cols=1, min=0, max=1, seed=seed+2)-0.5)/vector_size;
  bc = (rand(rows=vocab_size, cols=1, min=0, max=1, seed=seed+3)-0.5)/vector_size;

  [weights, log_cooc_matrix] = init(cooc_matrix, x_max, alpha);

  # Despite the name, these hold the running sums of squared gradients that
  # scale each step (AdaGrad-style), not a momentum term.
  momentum_W = 1e-8 + 0.1 * matrix(1, nrow(W), ncol(W));
  momentum_C = 1e-8 + 0.1 * matrix(1, nrow(C), ncol(C));
  momentum_bw = 1e-8 + 0.1 * matrix(1, nrow(bw), ncol(bw));
  momentum_bc = 1e-8 + 0.1 * matrix(1, nrow(bc), ncol(bc));

  # Defined before the loop so that the final report and the returned frame
  # are valid even when not a single update step is taken.
  embeddings = W + C;
  final_iter = 0;

  error = 0;
  iter = 0;
  previous_error = 1e10;
  conti = TRUE;

  while (conti) {
    # compute predictions for all co-occurring word pairs at once
    predictions = W %*% t(C) + bw + t(bc);
    diffs = predictions - log_cooc_matrix;
    weighted_diffs = weights * diffs;

    # compute gradients
    wgrad = weighted_diffs %*% C;
    cgrad = t(weighted_diffs) %*% W;
    bwgrad = rowSums(weighted_diffs);
    bcgrad = matrix(colSums(weighted_diffs), nrow(bc), ncol(bc));

    # weighted least-squares loss of the current parameters
    error = sum(0.5 * (weights * (diffs ^ 2)));
    iter = iter + 1;

    # Keep training only while the loss still moves by at least tol and the
    # iteration budget is not exhausted.
    if (abs(previous_error - error) >= tol & iter <= iterations) {
      # accumulate squared gradients, then take the scaled step
      momentum_W = momentum_W + (wgrad ^ 2);
      momentum_C = momentum_C + (cgrad ^ 2);
      momentum_bw = momentum_bw + (bwgrad ^ 2);
      momentum_bc = momentum_bc + (bcgrad ^ 2);

      W = W - (eta * wgrad / (sqrt(momentum_W) + 1e-8));
      C = C - (eta * cgrad / (sqrt(momentum_C) + 1e-8));
      bw = bw - (eta * bwgrad / (sqrt(momentum_bw) + 1e-8));
      bc = bc - (eta * bcgrad / (sqrt(momentum_bc) + 1e-8));

      # the embedding of a word is the sum of its word and context vectors
      embeddings = W + C;

      previous_error = error;
      final_iter = iter;
    } else {
      conti = FALSE;
    }

    # print the loss whenever iter is a multiple of print_loss_it
    if (iter - floor(iter / print_loss_it) * print_loss_it == 0) {
      print("iteration: " + iter + " error: " + error);
    }
  }

  print("Given " + iterations + " iterations, " + "stopped (or converged) at the " + final_iter + " iteration / error: " + error);

  # add the word index to the word vectors
  G = cbind(cooc_index[,2], as.frame(embeddings));
}