# scripts/builtin/cooccurrenceMatrix.dml - systemds - Git at Google

 #-------------------------------------------------------------
 #
 # Licensed to the Apache Software Foundation (ASF) under one
 # or more contributor license agreements.  See the NOTICE file
 # distributed with this work for additional information
 # regarding copyright ownership.  The ASF licenses this file
 # to you under the Apache License, Version 2.0 (the
 # "License"); you may not use this file except in compliance
 # with the License.  You may obtain a copy of the License at
 #
 #   http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing,
 # software distributed under the License is distributed on an
 # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 # KIND, either express or implied.  See the License for the
 # specific language governing permissions and limitations
 # under the License.
 #
 #-------------------------------------------------------------
 #
 # The implementation is based on
 # https://github.com/stanfordnlp/GloVe/blob/master/src/cooccur.c
 #
 #-------------------------------------------------------------

 ## Cleans and processes text data by removing punctuation, converting it to lowercase, and reformatting.
 ## Adds an index column to the result.
 # INPUT:
 # ------------------------------------------------------------------------------
 # S     (Frame[Unknown]): 1D input data frame containing text data.
 # ------------------------------------------------------------------------------
 # OUTPUT:
 # ------------------------------------------------------------------------------
 # result    (Frame[Unknown]): Processed text data with an index column.
 # ------------------------------------------------------------------------------
 processText = function(Frame[Unknown] S) return (Frame[Unknown] result){
     print("processText");
     # Clean the first column in three passes: drop periods, blank out every
     # remaining character matched by the second pattern, then lowercase.
     noPeriods = map(S[,1], "x -> x.replaceAll(\"[.]\", \"\")");
     lettersOnly = map(noPeriods, "x -> x.replaceAll(\"[^a-zA-Z\\s]\", \" \")");
     cleaned = map(lettersOnly, "x -> x.toLowerCase()");
     # Prepend a running row number 1..nrow(S) as the index column.
     rowIds = as.frame(seq(1, nrow(S), 1));
     result = cbind(rowIds, cleaned);
 }

 ## Tokenizes text data and retrieves word positions.
 # INPUT:
 # ------------------------------------------------------------------------------
 # S           (Frame[Unknown]): 2D input text data with an index column.
 # maxTokens   (Int): Maximum number of tokens per text entry.
 # ------------------------------------------------------------------------------
 # OUTPUT:
 # ------------------------------------------------------------------------------
 # result  (Frame[Unknown]): Tokenized words.
 # docID   (Matrix[double]): Document ID matrix corresponding to tokens.
 # ------------------------------------------------------------------------------
 getWordPosition = function(Frame[Unknown] S, Int maxTokens) return (Frame[Unknown] result, Matrix[double] docID){
     print("getWordPosition");
     # Tokenizer spec: column 1 is the id column, column 2 is the text to split,
     # and the output format is "position".
     tokenSpec = "{\"algo\": \"split\", \"out\": \"position\",\"out_params\": {\"sort_alpha\": false},\"id_cols\": [1],\"tokenize_col\": 2}";
     tokens = tokenize(target=S, spec=tokenSpec, max_tokens=maxTokens);
     # Column 1 of the tokenizer output is returned as the document ids,
     # column 3 as the tokenized words.
     docID = as.matrix(tokens[,1]);
     result = tokens[,3];
 }

 ## Encodes words into a numerical matrix format, retrieves the vocabulary size, and maps word indices.
 ## Uses transformencode() to recode strings and find each unique string position in the co-occurrence matrix.
 # INPUT:
 # ------------------------------------------------------------------------------
 # S     (Frame[Unknown]): 1D frame of tokenized word positions.
 # ------------------------------------------------------------------------------
 # OUTPUT:
 # ------------------------------------------------------------------------------
 # recodedWordPosition   (Matrix[double]): Encoded word positions as a numerical matrix.
 # tableSize            (Int): Number of distinct words in the input text (co-occurrence matrix size).
 # column               (Frame[Unknown]): Mapping of word indices to distinct words in the co-occurrence matrix.
 # ------------------------------------------------------------------------------
 getRecodedMatrix = function(Frame[Unknown] S) return (Matrix[double] recodedWordPosition, Int tableSize, Frame[Unknown] column){
     print("getRecodedMatrix");
     [recodedWordPosition, M] = transformencode(target=S, spec="{ids:true,recode:[1]}");

     # Split every entry of the first metadata column into its two parts:
     # part [0] is kept as the word, part [1] is parsed as its integer code.
     words = map(M[,1], "s -> UtilFunctions.splitRecodeEntry(s)[0]");
     codes = map(M[,1], "s -> Integer.valueOf(UtilFunctions.splitRecodeEntry(s)[1])");
     tableSize = nrow(words);

     # Start from the (code, word) pairs in metadata order, then rewrite them
     # row by row following the ascending order of the codes.
     column = cbind(codes, words);
     ranking = order(target=as.matrix(codes), by=1, decreasing=FALSE, index.return=TRUE);

     #TODO vectorize via order of frames
     for(k in 1:nrow(ranking)){
         src = as.integer(as.scalar(ranking[k,1]));
         # Place the word in the row given by its own code ...
         column[as.integer(as.scalar(codes[src])), 2] = words[src];
         # ... and store the k-th smallest code in row k.
         column[k, 1] = as.scalar(codes[src]);
     }
 }

 ## Iterates over the recoded word positions to construct a co-occurrence matrix.
 # INPUT:
 # ------------------------------------------------------------------------------
 # recodedWordPosition (Matrix[double]): 2D matrix of recoded word positions with text IDs.
 # tableSize          (Int): Size of the vocabulary (number of unique words).
 # distanceWeighting  (Boolean): Flag to apply distance weighting to co-occurrence counts.
 # symmetric          (Boolean): Determines if the matrix is symmetric (TRUE) or asymmetric (FALSE).
 # windowSize        (Int): Context window size.
 # ------------------------------------------------------------------------------
 # OUTPUT:
 # ------------------------------------------------------------------------------
 # coocMatrix (Matrix[double]): Final word-word co-occurrence matrix.
 # ------------------------------------------------------------------------------
 createCoocMatrix = function(
     Matrix[double] recodedWordPosition,
     Int tableSize,
     boolean distanceWeighting,
     boolean symmetric,
     Int windowSize)
 return (Matrix[double] coocMatrix)
 {
     # Builds a tableSize x tableSize matrix of co-occurrence counts.
     # Column 1 of recodedWordPosition is read as the document id and column 2 as
     # the recoded word index of each token. For every token, the up to windowSize
     # preceding tokens of the same document are counted; the following tokens are
     # counted as well when symmetric is TRUE. With distanceWeighting each hit at
     # distance j contributes 1/j instead of 1.
     print("Processing word cooccurrence...");
     coocMatrix = matrix(0, tableSize, tableSize);
     numTokens = nrow(recodedWordPosition);

     # A window smaller than 1 has no context positions. Skipping the scan here
     # keeps the range 1:windowSize from being walked for such values.
     if (windowSize >= 1) {
         #TODO vectorize loop
         for (i in 1:numTokens) {
             docId = as.integer(as.scalar(recodedWordPosition[i,1]));
             wordIndex = as.integer(as.scalar(recodedWordPosition[i,2]));
             if(wordIndex != 0){# This check is due to wrong result of the transformencode when running jvm test.
                 for (j in 1:windowSize) {
                     # The weight depends only on the distance j, so compute it once per offset.
                     increase = ifelse(distanceWeighting, 1.0 / j, 1.0);
                     # Check left context
                     if (i-j > 0) {
                         if(docId == as.integer(as.scalar(recodedWordPosition[i-j, 1])))
                         {
                             neighbourWordIndex = as.integer(as.scalar(recodedWordPosition[i-j,2]));
                             # Index 0 is not a valid cell of coocMatrix; skip such neighbours
                             # just like a focus word with index 0 is skipped above.
                             if (neighbourWordIndex != 0) {
                                 coocMatrix[wordIndex, neighbourWordIndex] = coocMatrix[wordIndex, neighbourWordIndex] + increase;
                             }
                         }
                     }
                     # Check right context if symmetric
                     if(symmetric == TRUE){
                         if (i+j < numTokens + 1) {
                             if(docId == as.integer(as.scalar(recodedWordPosition[i+j, 1])))
                             {
                                 neighbourWordIndex = as.integer(as.scalar(recodedWordPosition[i+j,2]));
                                 if (neighbourWordIndex != 0) {
                                     coocMatrix[wordIndex, neighbourWordIndex] = coocMatrix[wordIndex, neighbourWordIndex] + increase;
                                 }
                             }
                         }
                     }
                 }
             }
         }
     }
     print("Word-word cooccurrence matrix computation completed.");
 }

 ## Main function to process text data to construct a word-word co-occurrence matrix.
 # INPUT:
 # ------------------------------------------------------------------------------
 # input            (Frame[Unknown]): 1D input corpus in CSV format.
 # maxTokens        (Int): Maximum number of tokens per text entry.
 # windowSize       (Int): Context window size.
 # distanceWeighting (Boolean): Whether to apply distance-based weighting.
 # symmetric        (Boolean): Determines if the matrix is symmetric (TRUE) or asymmetric (FALSE).
 # ------------------------------------------------------------------------------
 # OUTPUT:
 # ------------------------------------------------------------------------------
 # coocMatrix (Matrix[double]): The computed co-occurrence matrix.
 # column     (Frame[Unknown]): Word-index mapping for the co-occurrence matrix.
 # ------------------------------------------------------------------------------
 f_cooccurrenceMatrix = function(
     Frame[Unknown] input,
     Int maxTokens,
     Int windowSize,
     Boolean distanceWeighting,
     Boolean symmetric) return (Matrix[Double] coocMatrix, Frame[Unknown] column){

     # Stage 1: clean the raw text and attach a row index.
     cleanedText = processText(input);
     # Stage 2: tokenize, keeping the words and the document id of every token.
     [tokens, docIds] = getWordPosition(cleanedText, maxTokens);
     # Stage 3: recode the words to integers and obtain the vocabulary mapping.
     [tokenCodes, vocabSize, column] = getRecodedMatrix(tokens);
     # Stage 4: count co-occurrences over the (document id, word code) pairs.
     positions = cbind(docIds, tokenCodes);
     coocMatrix = createCoocMatrix(positions, vocabSize, distanceWeighting, symmetric, windowSize);
 }
	#-------------------------------------------------------------
	#
	# Licensed to the Apache Software Foundation (ASF) under one
	# or more contributor license agreements. See the NOTICE file
	# distributed with this work for additional information
	# regarding copyright ownership. The ASF licenses this file
	# to you under the Apache License, Version 2.0 (the
	# "License"); you may not use this file except in compliance
	# with the License. You may obtain a copy of the License at
	#
	# http://www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing,
	# software distributed under the License is distributed on an
	# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
	# KIND, either express or implied. See the License for the
	# specific language governing permissions and limitations
	# under the License.
	#
	#-------------------------------------------------------------
	#
	# The implementation is based on
	# https://github.com/stanfordnlp/GloVe/blob/master/src/cooccur.c
	#
	#-------------------------------------------------------------

	## Cleans and processes text data by removing punctuation, converting it to lowercase, and reformatting.
	## Adds an index column to the result.
	# INPUT:
	# ------------------------------------------------------------------------------
	# S (Frame[Unknown]): 1D input data frame containing text data.
	# ------------------------------------------------------------------------------
	# OUTPUT:
	# ------------------------------------------------------------------------------
	# result (Frame[Unknown]): Processed text data with an index column.
	# ------------------------------------------------------------------------------
	processText = function(Frame[Unknown] S) return (Frame[Unknown] result){
	    print("processText");
	    # Clean the first column in three passes: drop periods, blank out every
	    # remaining character matched by the second pattern, then lowercase.
	    noPeriods = map(S[,1], "x -> x.replaceAll(\"[.]\", \"\")");
	    lettersOnly = map(noPeriods, "x -> x.replaceAll(\"[^a-zA-Z\\s]\", \" \")");
	    cleaned = map(lettersOnly, "x -> x.toLowerCase()");
	    # Prepend a running row number 1..nrow(S) as the index column.
	    rowIds = as.frame(seq(1, nrow(S), 1));
	    result = cbind(rowIds, cleaned);
	}

	## Tokenizes text data and retrieves word positions.
	# INPUT:
	# ------------------------------------------------------------------------------
	# S (Frame[Unknown]): 2D input text data with an index column.
	# maxTokens (Int): Maximum number of tokens per text entry.
	# ------------------------------------------------------------------------------
	# OUTPUT:
	# ------------------------------------------------------------------------------
	# result (Frame[Unknown]): Tokenized words.
	# docID (Matrix[double]): Document ID matrix corresponding to tokens.
	# ------------------------------------------------------------------------------
	getWordPosition = function(Frame[Unknown] S, Int maxTokens) return (Frame[Unknown] result, Matrix[double] docID){
	    print("getWordPosition");
	    # Tokenizer spec: column 1 is the id column, column 2 is the text to split,
	    # and the output format is "position".
	    tokenSpec = "{\"algo\": \"split\", \"out\": \"position\",\"out_params\": {\"sort_alpha\": false},\"id_cols\": [1],\"tokenize_col\": 2}";
	    tokens = tokenize(target=S, spec=tokenSpec, max_tokens=maxTokens);
	    # Column 1 of the tokenizer output is returned as the document ids,
	    # column 3 as the tokenized words.
	    docID = as.matrix(tokens[,1]);
	    result = tokens[,3];
	}

	## Encodes words into a numerical matrix format, retrieves the vocabulary size, and maps word indices.
	## Uses transformencode() to recode strings and find each unique string position in the co-occurrence matrix.
	# INPUT:
	# ------------------------------------------------------------------------------
	# S (Frame[Unknown]): 1D frame of tokenized word positions.
	# ------------------------------------------------------------------------------
	# OUTPUT:
	# ------------------------------------------------------------------------------
	# recodedWordPosition (Matrix[double]): Encoded word positions as a numerical matrix.
	# tableSize (Int): Number of distinct words in the input text (co-occurrence matrix size).
	# column (Frame[Unknown]): Mapping of word indices to distinct words in the co-occurrence matrix.
	# ------------------------------------------------------------------------------
	getRecodedMatrix = function(Frame[Unknown] S) return (Matrix[double] recodedWordPosition, Int tableSize, Frame[Unknown] column){
	    print("getRecodedMatrix");
	    [recodedWordPosition, M] = transformencode(target=S, spec="{ids:true,recode:[1]}");

	    # Split every entry of the first metadata column into its two parts:
	    # part [0] is kept as the word, part [1] is parsed as its integer code.
	    words = map(M[,1], "s -> UtilFunctions.splitRecodeEntry(s)[0]");
	    codes = map(M[,1], "s -> Integer.valueOf(UtilFunctions.splitRecodeEntry(s)[1])");
	    tableSize = nrow(words);

	    # Start from the (code, word) pairs in metadata order, then rewrite them
	    # row by row following the ascending order of the codes.
	    column = cbind(codes, words);
	    ranking = order(target=as.matrix(codes), by=1, decreasing=FALSE, index.return=TRUE);

	    #TODO vectorize via order of frames
	    for(k in 1:nrow(ranking)){
	        src = as.integer(as.scalar(ranking[k,1]));
	        # Place the word in the row given by its own code ...
	        column[as.integer(as.scalar(codes[src])), 2] = words[src];
	        # ... and store the k-th smallest code in row k.
	        column[k, 1] = as.scalar(codes[src]);
	    }
	}

	## Iterates over the recoded word positions to construct a co-occurrence matrix.
	# INPUT:
	# ------------------------------------------------------------------------------
	# recodedWordPosition (Matrix[double]): 2D matrix of recoded word positions with text IDs.
	# tableSize (Int): Size of the vocabulary (number of unique words).
	# distanceWeighting (Boolean): Flag to apply distance weighting to co-occurrence counts.
	# symmetric (Boolean): Determines if the matrix is symmetric (TRUE) or asymmetric (FALSE).
	# windowSize (Int): Context window size.
	# ------------------------------------------------------------------------------
	# OUTPUT:
	# ------------------------------------------------------------------------------
	# coocMatrix (Matrix[double]): Final word-word co-occurrence matrix.
	# ------------------------------------------------------------------------------
	createCoocMatrix = function(
	    Matrix[double] recodedWordPosition,
	    Int tableSize,
	    boolean distanceWeighting,
	    boolean symmetric,
	    Int windowSize)
	return (Matrix[double] coocMatrix)
	{
	    # Builds a tableSize x tableSize matrix of co-occurrence counts.
	    # Column 1 of recodedWordPosition is read as the document id and column 2 as
	    # the recoded word index of each token. For every token, the up to windowSize
	    # preceding tokens of the same document are counted; the following tokens are
	    # counted as well when symmetric is TRUE. With distanceWeighting each hit at
	    # distance j contributes 1/j instead of 1.
	    print("Processing word cooccurrence...");
	    coocMatrix = matrix(0, tableSize, tableSize);
	    numTokens = nrow(recodedWordPosition);

	    # A window smaller than 1 has no context positions. Skipping the scan here
	    # keeps the range 1:windowSize from being walked for such values.
	    if (windowSize >= 1) {
	        #TODO vectorize loop
	        for (i in 1:numTokens) {
	            docId = as.integer(as.scalar(recodedWordPosition[i,1]));
	            wordIndex = as.integer(as.scalar(recodedWordPosition[i,2]));
	            if(wordIndex != 0){# This check is due to wrong result of the transformencode when running jvm test.
	                for (j in 1:windowSize) {
	                    # The weight depends only on the distance j, so compute it once per offset.
	                    increase = ifelse(distanceWeighting, 1.0 / j, 1.0);
	                    # Check left context
	                    if (i-j > 0) {
	                        if(docId == as.integer(as.scalar(recodedWordPosition[i-j, 1])))
	                        {
	                            neighbourWordIndex = as.integer(as.scalar(recodedWordPosition[i-j,2]));
	                            # Index 0 is not a valid cell of coocMatrix; skip such neighbours
	                            # just like a focus word with index 0 is skipped above.
	                            if (neighbourWordIndex != 0) {
	                                coocMatrix[wordIndex, neighbourWordIndex] = coocMatrix[wordIndex, neighbourWordIndex] + increase;
	                            }
	                        }
	                    }
	                    # Check right context if symmetric
	                    if(symmetric == TRUE){
	                        if (i+j < numTokens + 1) {
	                            if(docId == as.integer(as.scalar(recodedWordPosition[i+j, 1])))
	                            {
	                                neighbourWordIndex = as.integer(as.scalar(recodedWordPosition[i+j,2]));
	                                if (neighbourWordIndex != 0) {
	                                    coocMatrix[wordIndex, neighbourWordIndex] = coocMatrix[wordIndex, neighbourWordIndex] + increase;
	                                }
	                            }
	                        }
	                    }
	                }
	            }
	        }
	    }
	    print("Word-word cooccurrence matrix computation completed.");
	}

	## Main function to process text data to construct a word-word co-occurrence matrix.
	# INPUT:
	# ------------------------------------------------------------------------------
	# input (Frame[Unknown]): 1D input corpus in CSV format.
	# maxTokens (Int): Maximum number of tokens per text entry.
	# windowSize (Int): Context window size.
	# distanceWeighting (Boolean): Whether to apply distance-based weighting.
	# symmetric (Boolean): Determines if the matrix is symmetric (TRUE) or asymmetric (FALSE).
	# ------------------------------------------------------------------------------
	# OUTPUT:
	# ------------------------------------------------------------------------------
	# coocMatrix (Matrix[double]): The computed co-occurrence matrix.
	# column (Frame[Unknown]): Word-index mapping for the co-occurrence matrix.
	# ------------------------------------------------------------------------------
	f_cooccurrenceMatrix = function(
	    Frame[Unknown] input,
	    Int maxTokens,
	    Int windowSize,
	    Boolean distanceWeighting,
	    Boolean symmetric) return (Matrix[Double] coocMatrix, Frame[Unknown] column){

	    # Stage 1: clean the raw text and attach a row index.
	    cleanedText = processText(input);
	    # Stage 2: tokenize, keeping the words and the document id of every token.
	    [tokens, docIds] = getWordPosition(cleanedText, maxTokens);
	    # Stage 3: recode the words to integers and obtain the vocabulary mapping.
	    [tokenCodes, vocabSize, column] = getRecodedMatrix(tokens);
	    # Stage 4: count co-occurrences over the (document id, word code) pairs.
	    positions = cbind(docIds, tokenCodes);
	    coocMatrix = createCoocMatrix(positions, vocabSize, distanceWeighting, symmetric, windowSize);
	}