| #------------------------------------------------------------- |
| # |
| # Licensed to the Apache Software Foundation (ASF) under one |
| # or more contributor license agreements. See the NOTICE file |
| # distributed with this work for additional information |
| # regarding copyright ownership. The ASF licenses this file |
| # to you under the Apache License, Version 2.0 (the |
| # "License"); you may not use this file except in compliance |
| # with the License. You may obtain a copy of the License at |
| # |
| # http://www.apache.org/licenses/LICENSE-2.0 |
| # |
| # Unless required by applicable law or agreed to in writing, |
| # software distributed under the License is distributed on an |
| # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| # KIND, either express or implied. See the License for the |
| # specific language governing permissions and limitations |
| # under the License. |
| # |
| #------------------------------------------------------------- |
| |
| # Implements builtin for finding functional dependencies |
| # |
| # INPUT PARAMETERS: |
| # --------------------------------------------------------------------------------------------- |
| # NAME TYPE DEFAULT MEANING |
| # --------------------------------------------------------------------------------------------- |
| # X Double -- Input Matrix X, encoded Matrix if data is categorical |
| # Mask Double -- A row vector for interested features i.e. Mask =[1, 0, 1] |
| # will exclude the second column from processing |
| # threshold Double -- threshold value in interval [0, 1] for robust FDs |
| # --------------------------------------------------------------------------------------------- |
| |
| |
| #Output(s) |
| # --------------------------------------------------------------------------------------------- |
| # NAME TYPE DEFAULT MEANING |
| # --------------------------------------------------------------------------------------------- |
| # FD Double --- matrix of functional dependencies |
| |
| m_discoverFD = function(Matrix[Double] X, Matrix[Double] Mask, Double threshold) |
| return(Matrix[Double] FD) |
| { |
| if( threshold < 0 | threshold > 1 ) |
| stop("Stopping due to invalid input, threshold required in interval [0, 1] found "+threshold) |
| |
| if(ncol(X) != ncol(Mask)) |
| stop("Stopping due to dimension mismatch in Matrix and it's Mask") |
| |
| if( nrow(Mask) > 1 ) |
| stop("Stopping due to invalid input, Mask required n * 1 found n * "+nrow(Mask)) |
| |
| # feature pruning using mask (keep interested features only for FD discovery) |
| X = removeEmpty(target = X, margin = "cols", select=Mask) |
| |
| # allocate output and working sets |
| n = nrow(X) |
| d = ncol(X) |
| FD = matrix(0, d, d) |
| cm = matrix(0, 1, d) |
| |
| # num distinct per column |
| parfor(i in 1:d) |
| cm[1,i] = colDistinct(X[,i]) |
| |
| # add know functional dependencies |
| FD = FD + (cm == 1) # constant columns determined by all columns |
| FD = FD + (t(cm) == n) # unique columns determine all columns |
| FD = FD != 0 # to keep the count consistent |
| |
| # sort num distinct and enumerate only upper triangle |
| cm2 = order(target=t(cm), decreasing=TRUE, index.return=TRUE) |
| parfor(i in 1 : d, check=0) { |
| index_i = as.scalar(cm2[i,1]) |
| ndX = as.scalar(cm[1,index_i]) |
| if( ndX!=1 & ndX!=n) { |
| Xi = X[,index_i]; |
| k = ifelse(threshold < 1, 1, (i+1)); # enumerate only upper triangle if threshold = 1 |
| parfor(j in k:d , check=0) { |
| if(j != i) { |
| index_j = as.scalar(cm2[j,1]) |
| [A_determines_B, ratio] = isFD(Xi, X[,index_j], ndX); |
| if(A_determines_B | ratio >= threshold) |
| FD[index_i, index_j] = ratio; # matrix of robust FD with their ratios |
| } |
| } |
| } |
| } |
| } |
| |
| isFD = function(Matrix[Double] X, Matrix[Double] Y, Integer ndX) |
| return(Boolean A_determines_B, Double ratio) |
| { |
| ctab = table(X, Y) |
| rowSumTable = rowSums(ctab != 0) # X values -> num Y values |
| A_determines_B = (sum(rowSumTable==1) == ndX); |
| # robust functional dependency ratio (1.0 if A -> B) |
| ratio = sum(rowMaxs(ctab)) / nrow(X) |
| } |
| |
| colDistinct = function(Matrix[Double] X) |
| return(Double distinctItems) |
| { |
| distinctItems = sum(table(X, 1) != 0) |
| } |