| #------------------------------------------------------------- |
| # |
| # Licensed to the Apache Software Foundation (ASF) under one |
| # or more contributor license agreements. See the NOTICE file |
| # distributed with this work for additional information |
| # regarding copyright ownership. The ASF licenses this file |
| # to you under the Apache License, Version 2.0 (the |
| # "License"); you may not use this file except in compliance |
| # with the License. You may obtain a copy of the License at |
| # |
| # http://www.apache.org/licenses/LICENSE-2.0 |
| # |
| # Unless required by applicable law or agreed to in writing, |
| # software distributed under the License is distributed on an |
| # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| # KIND, either express or implied. See the License for the |
| # specific language governing permissions and limitations |
| # under the License. |
| # |
| #------------------------------------------------------------- |
| # |
| # The implementation is based on |
| # https://github.com/stanfordnlp/GloVe/blob/master/src/cooccur.c |
| # |
| #------------------------------------------------------------- |
| |
| ## Cleans and processes text data by removing punctuation, converting it to lowercase, and reformatting. |
| ## Adds an index column to the result. |
| # INPUT: |
| # ------------------------------------------------------------------------------ |
| # S (Frame[Unknown]): 1D input data frame containing text data. |
| # ------------------------------------------------------------------------------ |
| # OUTPUT: |
| # ------------------------------------------------------------------------------ |
| # result (Frame[Unknown]): Processed text data with an index column. |
| # ------------------------------------------------------------------------------ |
| processText = function(Frame[Unknown] S) return (Frame[Unknown] result){ |
|   # Progress marker printed on every call. |
|   print("processText"); |
| |
|   # Three passes over the first column of S: |
|   #   1. delete every period, |
|   #   2. replace each remaining character that is neither an ASCII letter |
|   #      nor whitespace with a single blank, |
|   #   3. lowercase the text. |
|   noPeriods = map(S[,1], "x -> x.replaceAll(\"[.]\", \"\")"); |
|   lettersOnly = map(noPeriods, "x -> x.replaceAll(\"[^a-zA-Z\\s]\", \" \")"); |
|   lowered = map(lettersOnly, "x -> x.toLowerCase()"); |
| |
|   # Index column: running row numbers 1..nrow(S), placed before the text. |
|   rowIds = as.frame(seq(1, nrow(S), 1)); |
|   result = cbind(rowIds, lowered); |
| } |
| |
| ## Tokenizes text data and retrieves word positions. |
| # INPUT: |
| # ------------------------------------------------------------------------------ |
| # S (Frame[Unknown]): 2D input text data with an index column. |
| # maxTokens (Int): Maximum number of tokens per text entry. |
| # ------------------------------------------------------------------------------ |
| # OUTPUT: |
| # ------------------------------------------------------------------------------ |
| # result (Frame[Unknown]): Tokenized words. |
| # docID (Matrix[double]): Document ID matrix corresponding to tokens. |
| # ------------------------------------------------------------------------------ |
| getWordPosition = function(Frame[Unknown] S, Int maxTokens) return (Frame[Unknown] result, Matrix[double] docID){ |
|   # Progress marker printed on every call. |
|   print("getWordPosition"); |
| |
|   # Tokenizer spec: "split" algorithm with "position" output, no alphabetical |
|   # sorting, column 1 of S used as id column and column 2 as the text to tokenize. |
|   tokenizerSpec = "{\"algo\": \"split\", \"out\": \"position\",\"out_params\": {\"sort_alpha\": false},\"id_cols\": [1],\"tokenize_col\": 2}"; |
|   tokens = tokenize(target=S, spec=tokenizerSpec, max_tokens=maxTokens); |
| |
|   # Column 1 of the tokenizer output is returned as the document id matrix, |
|   # column 3 as the token frame. |
|   docID = as.matrix(tokens[,1]); |
|   result = tokens[,3]; |
| } |
| |
| ## Encodes words into a numerical matrix format, retrieves the vocabulary size, and maps word indices. |
| ## Uses transformencode() to recode strings and find each unique string position in the co-occurrence matrix. |
| # INPUT: |
| # ------------------------------------------------------------------------------ |
| # S (Frame[Unknown]): 1D frame of tokenized word positions. |
| # ------------------------------------------------------------------------------ |
| # OUTPUT: |
| # ------------------------------------------------------------------------------ |
| # recodedWordPosition (Matrix[double]): Encoded word positions as a numerical matrix. |
| # tableSize (Int): Number of distinct words in the input text (co-occurrence matrix size). |
| # column (Frame[Unknown]): Mapping of word indices to distinct words in the co-occurrence matrix. |
| # ------------------------------------------------------------------------------ |
| getRecodedMatrix = function(Frame[Unknown] S) return (Matrix[double] recodedWordPosition, Int tableSize, Frame[Unknown] column){ |
|   # Progress marker printed on every call. |
|   print("getRecodedMatrix"); |
| |
|   # Recode column 1 of S; M receives the recode metadata produced alongside. |
|   [recodedWordPosition, M] = transformencode(target=S, spec="{ids:true,recode:[1]}"); |
| |
|   # Split every metadata entry into its two parts: element [0] is kept as the |
|   # word, element [1] is parsed as the integer code. |
|   words = map(M[,1], "s -> UtilFunctions.splitRecodeEntry(s)[0]"); |
|   codes = map(M[,1], "s -> Integer.valueOf(UtilFunctions.splitRecodeEntry(s)[1])"); |
| |
|   # One metadata row per distinct word. |
|   tableSize = nrow(words); |
| |
|   # Start from (code, word) in metadata order, then rewrite it row by row below. |
|   column = cbind(codes, words); |
|   rankOrder = order(target=as.matrix(codes), by=1, decreasing=FALSE, index.return=TRUE); |
| |
|   #TODO vectorize via order of frames |
|   for(k in 1:nrow(rankOrder)){ |
|     # Metadata row holding the k-th smallest code. |
|     src = as.integer(as.scalar(rankOrder[k,1])); |
|     code = as.scalar(codes[src]); |
|     # Place the word in the row addressed by its own code ... |
|     column[as.integer(code), 2] = words[src]; |
|     # ... and store the codes in ascending order in the first column. |
|     column[k, 1] = code; |
|   } |
| } |
| |
| ## Iterates over the recoded word positions to construct a co-occurrence matrix. |
| # INPUT: |
| # ------------------------------------------------------------------------------ |
| # recodedWordPosition (Matrix[double]): 2D matrix of recoded word positions with text IDs. |
| # tableSize (Int): Size of the vocabulary (number of unique words). |
| # distanceWeighting (Boolean): Flag to apply distance weighting to co-occurrence counts. |
| # symmetric (Boolean): If TRUE, both left and right context words are counted; if FALSE, only the left context is counted. |
| # windowSize (Int): Context window size. |
| # ------------------------------------------------------------------------------ |
| # OUTPUT: |
| # ------------------------------------------------------------------------------ |
| # coocMatrix (Matrix[double]): Final word-word co-occurrence matrix. |
| # ------------------------------------------------------------------------------ |
| createCoocMatrix = function( |
|     Matrix[double] recodedWordPosition, |
|     Int tableSize, |
|     boolean distanceWeighting, |
|     boolean symmetric, |
|     Int windowSize) |
|   return (Matrix[double] coocMatrix) |
| { |
|   # Builds a tableSize x tableSize count matrix from a two-column input: |
|   # column 1 = document id, column 2 = recoded word index, one row per token. |
|   # For every token, each neighbour at distance 1..windowSize that belongs to |
|   # the same document adds 1 (or 1/distance when distanceWeighting is set) to |
|   # coocMatrix[word, neighbour]. Left neighbours are always counted, right |
|   # neighbours only when symmetric is TRUE. |
|   print("Processing word cooccurrence..."); |
|   coocMatrix = matrix(0, tableSize, tableSize); |
|   numPositions = nrow(recodedWordPosition); |
| |
|   # Skip the scan entirely when there is nothing to iterate, so that the |
|   # 1:numPositions and 1:windowSize ranges below never run with an upper |
|   # bound smaller than 1. The result is then the all-zero matrix. |
|   if (numPositions > 0 & windowSize > 0) { |
|     #TODO vectorize loop |
|     for (i in 1:numPositions) { |
|       docId = as.integer(as.scalar(recodedWordPosition[i,1])); |
|       wordIndex = as.integer(as.scalar(recodedWordPosition[i,2])); |
|       if (wordIndex != 0) { # This check is due to wrong result of the transformencode when running jvm test. |
|         for (j in 1:windowSize) { |
|           # Contribution of a neighbour at distance j; identical for both sides. |
|           increase = ifelse(distanceWeighting, 1.0 / j, 1.0); |
| |
|           # Check left context |
|           if (i-j > 0) { |
|             if (docId == as.integer(as.scalar(recodedWordPosition[i-j, 1]))) { |
|               neighbourWordIndex = as.integer(as.scalar(recodedWordPosition[i-j,2])); |
|               # A neighbour recoded to 0 is not a valid column of the 1-based |
|               # matrix; skip it exactly like a 0 centre word. |
|               if (neighbourWordIndex != 0) { |
|                 coocMatrix[wordIndex, neighbourWordIndex] = coocMatrix[wordIndex, neighbourWordIndex] + increase; |
|               } |
|             } |
|           } |
| |
|           # Check right context if symmetric |
|           if (symmetric == TRUE) { |
|             if (i+j <= numPositions) { |
|               if (docId == as.integer(as.scalar(recodedWordPosition[i+j, 1]))) { |
|                 neighbourWordIndex = as.integer(as.scalar(recodedWordPosition[i+j,2])); |
|                 # Same guard as for the left context. |
|                 if (neighbourWordIndex != 0) { |
|                   coocMatrix[wordIndex, neighbourWordIndex] = coocMatrix[wordIndex, neighbourWordIndex] + increase; |
|                 } |
|               } |
|             } |
|           } |
|         } |
|       } |
|     } |
|   } |
|   print("Word-word cooccurrence matrix computation completed."); |
| } |
| |
| ## Main function to process text data to construct a word-word co-occurrence matrix. |
| # INPUT: |
| # ------------------------------------------------------------------------------ |
| # input (Frame[Unknown]): 1D input corpus in CSV format. |
| # maxTokens (Int): Maximum number of tokens per text entry. |
| # windowSize (Int): Context window size. |
| # distanceWeighting (Boolean): Whether to apply distance-based weighting. |
| # symmetric (Boolean): If TRUE, both left and right context words are counted; if FALSE, only the left context is counted. |
| # ------------------------------------------------------------------------------ |
| # OUTPUT: |
| # ------------------------------------------------------------------------------ |
| # coocMatrix (Matrix[double]): The computed co-occurrence matrix. |
| # column (Frame[Unknown]): Word-index mapping for the co-occurrence matrix. |
| # ------------------------------------------------------------------------------ |
| f_cooccurrenceMatrix = function( |
|     Frame[Unknown] input, |
|     Int maxTokens, |
|     Int windowSize, |
|     Boolean distanceWeighting, |
|     Boolean symmetric) return (Matrix[Double] coocMatrix, Frame[Unknown] column){ |
| |
|   # 1. Clean the raw text and attach a row index. |
|   cleanedText = processText(input); |
| |
|   # 2. Tokenize; keep the tokens and the document id of every token. |
|   [tokens, docIds] = getWordPosition(cleanedText, maxTokens); |
| |
|   # 3. Recode tokens to integers; also yields vocabulary size and the index/word mapping. |
|   [encodedTokens, vocabSize, column] = getRecodedMatrix(tokens); |
| |
|   # 4. Count co-occurrences over (document id, word index) rows. |
|   positions = cbind(docIds, encodedTokens); |
|   coocMatrix = createCoocMatrix(positions, vocabSize, distanceWeighting, symmetric, windowSize); |
| } |