blob: 86b8b9ca16988717005b0881a996e2ec896444ca [file] [log] [blame]
#-------------------------------------------------------------
#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
#
#-------------------------------------------------------------
#
# The implementation is based on
# https://github.com/stanfordnlp/GloVe/blob/master/src/cooccur.c
#
#-------------------------------------------------------------
## Cleans and processes text data by removing punctuation, converting it to lowercase, and reformatting.
## Adds an index column to the result.
# INPUT:
# ------------------------------------------------------------------------------
# S (Frame[Unknown]): 1D input data frame containing text data.
# ------------------------------------------------------------------------------
# OUTPUT:
# ------------------------------------------------------------------------------
# result (Frame[Unknown]): Processed text data with an index column.
# ------------------------------------------------------------------------------
processText = function(Frame[Unknown] S) return (Frame[Unknown] result){
  # Trace message emitted on every call.
  print("processText");
  # Clean the first column of S in three ordered passes:
  #   1) drop every literal period,
  #   2) turn whatever the second pattern matches into a single space,
  #   3) lowercase what is left.
  # The order is kept as-is; each pass consumes the output of the previous one.
  withoutPeriods = map(S[,1], "x -> x.replaceAll(\"[.]\", \"\")");
  lettersOnly = map(withoutPeriods, "x -> x.replaceAll(\"[^a-zA-Z\\s]\", \" \")");
  cleaned = map(lettersOnly, "x -> x.toLowerCase()");
  # Running row number 1..nrow(S), used as the index column of the result.
  rowIds = as.frame(seq(1, nrow(S), 1));
  result = cbind(rowIds, cleaned);
}
## Tokenizes text data and retrieves word positions.
# INPUT:
# ------------------------------------------------------------------------------
# S (Frame[Unknown]): 2D input text data with an index column.
# maxTokens (Int): Maximum number of tokens per text entry.
# ------------------------------------------------------------------------------
# OUTPUT:
# ------------------------------------------------------------------------------
# result (Frame[Unknown]): Tokenized words.
# docID (Matrix[double]): Document ID matrix corresponding to tokens.
# ------------------------------------------------------------------------------
getWordPosition = function(Frame[Unknown] S, Int maxTokens) return (Frame[Unknown] result, Matrix[double] docID){
  # Trace message emitted on every call.
  print("getWordPosition");
  # Tokenizer spec: "split" algorithm with "position" output, column 1 of S as
  # the id column, column 2 as the text to tokenize, no alphabetical sorting.
  tokenSpec = "{\"algo\": \"split\", \"out\": \"position\",\"out_params\": {\"sort_alpha\": false},\"id_cols\": [1],\"tokenize_col\": 2}";
  tokens = tokenize(target=S, spec=tokenSpec, max_tokens=maxTokens);
  # Column 1 of the tokenizer output is returned as the numeric document id,
  # column 3 as the token itself.
  # NOTE(review): column 2 presumably holds the token position - confirm
  # against the tokenize documentation.
  docID = as.matrix(tokens[,1]);
  result = tokens[,3];
}
## Encodes words into a numerical matrix format, retrieves the vocabulary size, and maps word indices.
## Uses transformencode() to recode strings and find each unique string position in the co-occurrence matrix.
# INPUT:
# ------------------------------------------------------------------------------
# S (Frame[Unknown]): 1D frame of tokenized word positions.
# ------------------------------------------------------------------------------
# OUTPUT:
# ------------------------------------------------------------------------------
# recodedWordPosition (Matrix[double]): Encoded word positions as a numerical matrix.
# tableSize (Int): Number of distinct words in the input text (co-occurrence matrix size).
# column (Frame[Unknown]): Mapping of word indices to distinct words in the co-occurrence matrix.
# ------------------------------------------------------------------------------
getRecodedMatrix = function(Frame[Unknown] S) return (Matrix[double] recodedWordPosition, Int tableSize, Frame[Unknown] column){
  # Trace message emitted on every call.
  print("getRecodedMatrix");
  # Recode column 1 of S; recodeMap is the metadata frame returned next to the
  # encoded matrix.
  [recodedWordPosition, recodeMap] = transformencode(target=S, spec="{ids:true,recode:[1]}");
  # Split every metadata entry into its word part and its integer code part.
  words = map(recodeMap[,1], "s -> UtilFunctions.splitRecodeEntry(s)[0]");
  codes = map(recodeMap[,1], "s -> Integer.valueOf(UtilFunctions.splitRecodeEntry(s)[1])");
  # One row per distinct word -> size of the co-occurrence table.
  tableSize = nrow(words);
  # Start from the unsorted (code, word) pairs and then rewrite them in place.
  column = cbind(codes, words);
  # Row permutation that lists the codes in ascending order.
  ascOrder = order(target=as.matrix(codes), by=1, decreasing=FALSE, index.return=TRUE);
  #TODO vectorize via order of frames
  for(k in 1:nrow(ascOrder)){
    src = as.integer(as.scalar(ascOrder[k,1]));
    # The word goes to the row whose number equals its own code ...
    column[as.integer(as.scalar(codes[src])), 2] = words[src];
    # ... and the k-th smallest code goes to row k of the code column.
    column[k, 1] = as.scalar(codes[src]);
  }
}
## Iterates over the recoded word positions to construct a co-occurrence matrix.
# INPUT:
# ------------------------------------------------------------------------------
# recodedWordPosition (Matrix[double]): 2D matrix of recoded word positions with text IDs.
# tableSize (Int): Size of the vocabulary (number of unique words).
# distanceWeighting (Boolean): Flag to apply distance weighting to co-occurrence counts.
# symmetric (Boolean): If TRUE, both left and right context words are counted; if FALSE, only the left context is counted.
# windowSize (Int): Context window size.
# ------------------------------------------------------------------------------
# OUTPUT:
# ------------------------------------------------------------------------------
# coocMatrix (Matrix[double]): Final word-word co-occurrence matrix.
# ------------------------------------------------------------------------------
createCoocMatrix = function(
    Matrix[double] recodedWordPosition,
    Int tableSize,
    boolean distanceWeighting,
    boolean symmetric,
    Int windowSize)
  return (Matrix[double] coocMatrix)
{
  # Builds a tableSize x tableSize count matrix from a two-column input:
  # column 1 = document id, column 2 = recoded word index, one row per token.
  # For every token, each neighbour within windowSize rows and in the same
  # document adds `increase` to coocMatrix[word, neighbour]. The left context
  # is always counted; the right context only when symmetric is TRUE.
  print("Processing word cooccurrence...");
  coocMatrix = matrix(0, tableSize, tableSize);
  # Loop-invariant: number of token rows.
  numTokens = nrow(recodedWordPosition);

  # DML picks a step of -1 for a from:to range with from > to, so
  # `1:windowSize` with windowSize < 1 would still iterate (j = 1, 0, ...),
  # counting a word with itself and dividing by zero. Skip the scan instead and
  # return the all-zero matrix.
  if (windowSize >= 1) {
    #TODO vectorize loop
    for (i in 1:numTokens) {
      docId = as.integer(as.scalar(recodedWordPosition[i,1]));
      wordIndex = as.integer(as.scalar(recodedWordPosition[i,2]));
      if (wordIndex != 0) { # This check is due to wrong result of the transformencode when running jvm test.
        for (j in 1:windowSize) {
          # The weight depends only on the distance j, so compute it once for
          # both directions.
          increase = ifelse(distanceWeighting, 1.0 / j, 1.0);

          # Check left context
          if (i-j > 0) {
            if (docId == as.integer(as.scalar(recodedWordPosition[i-j, 1]))) {
              neighbourWordIndex = as.integer(as.scalar(recodedWordPosition[i-j,2]));
              # A neighbour may carry the same invalid code 0 that is filtered
              # for the centre word; indexing column 0 would be out of bounds.
              if (neighbourWordIndex != 0) {
                coocMatrix[wordIndex, neighbourWordIndex] = coocMatrix[wordIndex, neighbourWordIndex] + increase;
              }
            }
          }

          # Check right context if symmetric
          if (symmetric == TRUE) {
            if (i+j <= numTokens) {
              if (docId == as.integer(as.scalar(recodedWordPosition[i+j, 1]))) {
                neighbourWordIndex = as.integer(as.scalar(recodedWordPosition[i+j,2]));
                if (neighbourWordIndex != 0) {
                  coocMatrix[wordIndex, neighbourWordIndex] = coocMatrix[wordIndex, neighbourWordIndex] + increase;
                }
              }
            }
          }
        }
      }
    }
  }
  print("Word-word cooccurrence matrix computation completed.");
}
## Main function to process text data to construct a word-word co-occurrence matrix.
# INPUT:
# ------------------------------------------------------------------------------
# input (Frame[Unknown]): 1D input corpus in CSV format.
# maxTokens (Int): Maximum number of tokens per text entry.
# windowSize (Int): Context window size.
# distanceWeighting (Boolean): Whether to apply distance-based weighting.
# symmetric (Boolean): If TRUE, both left and right context words are counted; if FALSE, only the left context is counted.
# ------------------------------------------------------------------------------
# OUTPUT:
# ------------------------------------------------------------------------------
# coocMatrix (Matrix[double]): The computed co-occurrence matrix.
# column (Frame[Unknown]): Word-index mapping for the co-occurrence matrix.
# ------------------------------------------------------------------------------
f_cooccurrenceMatrix = function(
    Frame[Unknown] input,
    Int maxTokens,
    Int windowSize,
    Boolean distanceWeighting,
    Boolean symmetric) return (Matrix[Double] coocMatrix, Frame[Unknown] column){
  # Pipeline: clean text -> tokenize -> recode tokens -> count co-occurrences.
  cleanedText = processText(input);
  [tokens, docIds] = getWordPosition(cleanedText, maxTokens);
  # `column` (the index-to-word mapping) is returned to the caller unchanged.
  [wordCodes, vocabSize, column] = getRecodedMatrix(tokens);
  # createCoocMatrix reads column 1 as the document id and column 2 as the
  # recoded word index.
  docWordPairs = cbind(docIds, wordCodes);
  coocMatrix = createCoocMatrix(docWordPairs, vocabSize, distanceWeighting, symmetric, windowSize);
}