blob: da6d3f6e01c619c7084d8e2d2095c827ad29d4e6 [file] [log] [blame]
#-------------------------------------------------------------
#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
#
#-------------------------------------------------------------
#
# This function considers some constraints indicating statements that can NOT happen in the data (denial constraints).
#
#
# INPUT PARAMETERS:
# ---------------------------------------------------------------------------------------------
# NAME TYPE DEFAULT MEANING
# ---------------------------------------------------------------------------------------------
# dataFrame Frame --- frame whose columns represent the variables of the data and whose rows correspond to different tuples or instances.
# The first row must contain the column names. Recommended to have a first column indexing the instances from 1 to N (N=number of instances).
# constraintsFrame Frame --- frame with fixed columns and each row representing one constraint.
# 1. idx: (double) index of the constraint, from 1 to M (number of constraints)
# 2. constraint.type: (string) The constraints can be of 3 different kinds:
# - variableCompare: for each instance, it will compare the values of two variables (with a relation <, > or =).
# - valueCompare: for each instance, it will compare a fixed value and a variable value (with a relation <, > or =).
# - instanceCompare: for every couple of instances, it will compare the relation between two variables,
# ie if the value of the variable 1 in instance 1 is lower/higher than the value of variable 1 in instance 2,
# then the value of variable 2 in instance 1 can't be lower/higher than the value of variable 2 in instance 2.
# 3. group.by: (boolean) if TRUE only one group of data (defined by a variable option) will be considered for the constraint.
# 4. group.variable: (string, only if group.by TRUE) name of the variable (column in dataFrame) that will divide our data in groups.
# 5. group.option: (only if group.by TRUE) option of the group.variable that defines the group to consider.
# 6. variable1: (string) first variable to compare (name of column in dataFrame).
# 7. relation: (string) can be <, > or = in the case of variableCompare and valueCompare, and <>, <<, >< or >> (written without spaces)
# in the case of instanceCompare
# 8. variable2: (string) second variable to compare (name of column in dataFrame) or fixed value for the case of valueCompare.
#
# -----------------------
# EXAMPLE:
# dataFrame:
#
# rank discipline yrs.since.phd yrs.service sex salary
# 1 Prof B 19 18 Male 139750
# 2 Prof B 20 16 Male 173200
# 3 AsstProf B 3 3 Male 79750.56
# 4 Prof B 45 39 Male 115000
# 5 Prof B 40 40 Male 141500
# 6 AssocProf B 6 6 Male 97000
# 7 Prof B 30 23 Male 175000
# 8 Prof B 45 45 Male 147765
# 9 Prof B 21 20 Male 119250
# 10 Prof B 18 18 Female 129000
# 11 AssocProf B 12 8 Male 119800
# 12 AsstProf B 7 2 Male 79800
# 13 AsstProf B 1 1 Male 77700
#
# constraintsFrame:
#
# idx constraint.type group.by group.variable group.option variable1 relation variable2
# 1 variableCompare FALSE yrs.since.phd < yrs.service
# 2 instanceCompare TRUE rank Prof yrs.service >< salary
# 3 valueCompare FALSE salary = 78182
# 4 variableCompare TRUE discipline B yrs.service > yrs.since.phd
#
#
# Example: explanation of constraint 2 --> it can't happen that one professor of rank Prof has more years of service than other, but lower salary.
#
#----------------------------------
# OUTPUT PARAMETERS:
#----------------------------------
# ---------------------------------------------------------------------------------------------
# NAME TYPE DEFAULT MEANING
# --------------------------------------------------------------------------------
# WrongInstances Matrix Double Matrix of 2 columns.
# - First column shows the indexes of dataFrame that are wrong.
# - Second column shows the index of the denial constraint that is fulfilled
# If there are no wrong instances to show (0 constraints fulfilled) --> WrongInstances=matrix(0,1,2)
#
# Checks every row of dataFrame against the denial constraints in constraintsFrame.
#
# dataFrame         frame whose first row holds the column names; rows 2..N are the instances.
# constraintsFrame  frame whose first row holds the column names; rows 2..M are the constraints, with the
#                   columns (idx, constraint.type, group.by, group.variable, group.option, variable1,
#                   relation, variable2) read by position 1..8.
# WrongInstances    2-column matrix: (1) index of the offending instance, (2) index of the constraint it
#                   matches. Sorted by instance index. A single all-zero row if nothing matched.
#
# A constraint whose variable1, variable2 or group.variable is not a column of dataFrame is reported
# with a printed message and skipped.
s_denialConstraints = function(Frame[Unknown] dataFrame, Frame[Unknown] constraintsFrame)
return(Matrix[double] WrongInstances)
{
  print("DENIAL CONSTRAINTS");
  N = nrow(dataFrame);        # rows in data frame (row 1 is the header)
  M = nrow(constraintsFrame); # rows in constraints frame (row 1 is the header)
  WrongInstances = matrix(0,rows=N*M,cols=2)
  flag=0                      # number of rows of WrongInstances filled so far
  colName = dataFrame[1,]     # header row, used to resolve variable names to column indexes
  for(iConstraint in 2:M) { # loop starts in 2 because 1 is the name of the columns, not a constraint
    var1 = as.scalar(constraintsFrame[iConstraint,6]) # variable 1 of the constraint
    isCol1 = map(colName, "x->x.equals(\""+var1+"\")") # find the column of dataFrame corresponding to var1
    rel = as.scalar(constraintsFrame[iConstraint,7]) # relation of the constraint
    colIdx1=0
    for(iLog in 1:ncol(colName)){
      if(as.scalar(isCol1[1,iLog])=="true"){
        colIdx1=iLog # number (index) of the column of dataFrame corresponding to var1
      }
    }
    if (colIdx1==0){
      print('Variable 1 for constraint ' + toString(iConstraint-1) + " not found in dataFrame")
    }

    # DEFINE IF THE CONSTRAINT IS RESTRICTED TO A GROUP OF DATA:
    # IsGroupInstanceM is an N x 1 mask: non-zero for the rows the constraint applies to.
    if(as.scalar(constraintsFrame[iConstraint,3])=="TRUE" & colIdx1!=0) {
      varToGroup = as.scalar(constraintsFrame[iConstraint,4]) # variable that will divide our data in groups
      isColToGroup = map(colName, "x->x.equals(\""+varToGroup+"\")") # find the column of dataFrame corresponding to varToGroup
      # reset for every constraint so a stale index from a previous constraint is never reused
      colIdxToGroup=0
      for(iLog in 1:ncol(colName)){
        if(as.scalar(isColToGroup[1,iLog])=="true"){
          colIdxToGroup=iLog
        }
      }
      if (colIdxToGroup==0){
        print('Group variable for constraint ' + toString(iConstraint-1) + " not found in dataFrame")
        # every branch below requires colIdx1!=0, so clearing it skips this constraint
        colIdx1=0
      } else {
        groupInstances = dataFrame[,colIdxToGroup]
        groupOption = as.scalar(constraintsFrame[iConstraint,5]) # option of the group.variable that defines the group to consider
        IsGroupInstance= map(groupInstances, "x->x.equals(\""+groupOption+"\")") # find the instances with varToGroup = groupOption
        IsGroupInstanceM = matrix(0, nrow(IsGroupInstance), 1)
        for(h in 1:nrow(IsGroupInstance)){
          IsGroupInstanceM[h,1] = ifelse(as.scalar(IsGroupInstance[h,1]) == "true",TRUE,FALSE)
        }
      }
    } else if (colIdx1!=0){
      # no grouping: every row takes part in the constraint
      IsGroupInstanceM = matrix(1, N, 1)
    }

    # NOTE(review): all value comparisons below go through as.integer, so values are compared as
    # integers rather than as doubles — confirm this is intended for non-integer columns.

    # CONSTRAINT TO COMPARE VARIABLES OF THE SAME INSTANCE:
    if(as.scalar(constraintsFrame[iConstraint,2])=="variableCompare" & colIdx1!=0){
      var2 = as.scalar(constraintsFrame[iConstraint,8]) # variable 2 of the constraint
      isCol2 = map(colName, "x->x.equals(\""+var2+"\")")
      # reset the index (not the lookup result) so the "not found" check below is meaningful
      colIdx2=0
      for(iLog in 1:ncol(colName)){
        if(as.scalar(isCol2[1,iLog])=="true"){
          colIdx2=iLog
        }
      }
      if (colIdx2==0){
        print('Variable 2 for constraint ' + toString(iConstraint-1) + " not found in dataFrame")
      }
      if(rel=="<" & colIdx2!=0){
        for(iInstance in 2:N){ # loop starts in 2 because 1 is the name of the columns, not an instance
          value1 = as.scalar(dataFrame[iInstance,colIdx1]) # value 1 to compare in the constraint
          value2 = as.scalar(dataFrame[iInstance,colIdx2]) # value 2 to compare in the constraint
          if(as.integer(value1)<as.integer(value2) & as.scalar(IsGroupInstanceM[iInstance,1])){
            flag = flag+1
            WrongInstances[flag,1] = iInstance-1
            WrongInstances[flag,2] = iConstraint-1
          }
        }
      } else if(rel==">" & colIdx2!=0){
        for(iInstance in 2:N){
          value1 = as.scalar(dataFrame[iInstance,colIdx1]) # value 1 to compare in the constraint
          value2 = as.scalar(dataFrame[iInstance,colIdx2]) # value 2 to compare in the constraint
          if(as.integer(value1)>as.integer(value2) & as.scalar(IsGroupInstanceM[iInstance,1])){
            flag = flag+1
            WrongInstances[flag,1] = iInstance-1
            WrongInstances[flag,2] = iConstraint-1
          }
        }
      } else if(rel=="=" & colIdx2!=0){
        for(iInstance in 2:N){
          value1 = as.scalar(dataFrame[iInstance,colIdx1]) # value 1 to compare in the constraint
          value2 = as.scalar(dataFrame[iInstance,colIdx2]) # value 2 to compare in the constraint
          if(as.integer(value1)==as.integer(value2) & as.scalar(IsGroupInstanceM[iInstance,1])){
            flag = flag+1
            WrongInstances[flag,1] = iInstance-1
            WrongInstances[flag,2] = iConstraint-1
          }
        }
      }

    # CONSTRAINT TO COMPARE A VALUE AND A VARIABLE FOR EACH INSTANCE
    } else if(as.scalar(constraintsFrame[iConstraint,2])=="valueCompare" & colIdx1!=0){
      value2 = as.scalar(constraintsFrame[iConstraint,8]) # value 2 to compare in the constraint
      if(rel=="<"){
        for(iInstance in 2:N){
          value1 = as.scalar(dataFrame[iInstance,colIdx1]) # value 1 to compare in the constraint
          if(as.integer(value1)<as.integer(value2) & as.scalar(IsGroupInstanceM[iInstance,1])){
            flag = flag+1
            WrongInstances[flag,1] = iInstance-1
            WrongInstances[flag,2] = iConstraint-1
          }
        }
      } else if(rel==">"){
        for(iInstance in 2:N){
          value1 = as.scalar(dataFrame[iInstance,colIdx1]) # value 1 to compare in the constraint
          if(as.integer(value1)>as.integer(value2) & as.scalar(IsGroupInstanceM[iInstance,1])){
            flag = flag+1
            WrongInstances[flag,1] = iInstance-1
            WrongInstances[flag,2] = iConstraint-1
          }
        }
      } else if(rel=="="){
        for(iInstance in 2:N){
          value1 = as.scalar(dataFrame[iInstance,colIdx1]) # value 1 to compare in the constraint
          if(as.integer(value1)==as.integer(value2) & as.scalar(IsGroupInstanceM[iInstance,1])){
            flag = flag+1
            WrongInstances[flag,1] = iInstance-1
            WrongInstances[flag,2] = iConstraint-1
          }
        }
      }

    # CONSTRAINT TO COMPARE THE RELATION BETWEEN VARIABLES FOR DIFFERENT INSTANCES
    } else if(as.scalar(constraintsFrame[iConstraint,2])=="instanceCompare" & colIdx1!=0){
      var2 = as.scalar(constraintsFrame[iConstraint,8]) # variable 2 of the constraint
      isCol2 = map(colName, "x->x.equals(\""+var2+"\")")
      colIdx2=0
      for(iLog in 1:ncol(colName)){
        if(as.scalar(isCol2[1,iLog])=="true"){
          colIdx2=iLog
        }
      }
      if (colIdx2==0){
        print('Variable 2 for constraint ' + toString(iConstraint-1) + " not found in dataFrame")
      } else {
        # Collect the rows selected by the group mask. Columns filled here:
        # (1) value of the first column of dataFrame (used as instance index), (2) variable 1, (3) variable 2.
        # The matrix is allocated with a 4th column that is never written.
        DataMatrix = matrix(0,cols=4,rows=N-1)
        flag3=0 # number of rows of DataMatrix filled so far
        for(iInstance in 2:N){
          if(as.scalar(IsGroupInstanceM[iInstance,1])){
            flag3=flag3+1
            DataMatrix[flag3,1] = as.matrix(dataFrame[iInstance,1]) # InstanceIdx
            DataMatrix[flag3,2] = as.matrix(dataFrame[iInstance,colIdx1])
            DataMatrix[flag3,3] = as.matrix(dataFrame[iInstance,colIdx2])
          }
        }
        # with fewer than two selected rows there is no pair of instances to compare,
        # and the 1:flag3 slice / 2:flag3 loop below would not be valid ranges
        if (flag3 > 1){
          DataMatrix=DataMatrix[1:flag3,]
          # order the matrix according to the values of variable 1, decreasing or increasing depending on the first part of the relation(> or <):
          if(rel=="<>" | rel=="<<"){
            DataMatrixOrdered = order(target=DataMatrix,by=2,decreasing=FALSE,index.return=FALSE)
          } else if(rel==">>" | rel=="><"){
            DataMatrixOrdered = order(target=DataMatrix,by=2,decreasing=TRUE,index.return=FALSE)
          }
          # define groups of rows in the way that every group has the same value for variable 1 (second column of DataMatrixOrdered):
          idxToGroup=matrix(0,flag3,1) # entry 1 stays 0; later entries are rows where variable 1 changes
          flag2=1                      # number of entries of idxToGroup in use
          for(iRow in 2:flag3){
            if((as.scalar(DataMatrixOrdered[iRow,2])-as.scalar(DataMatrixOrdered[iRow-1,2]))!=0){ # there is a change of group
              flag2=flag2+1
              idxToGroup[flag2,1]=iRow # vector with the row indexes where there is a change of group
            }
          }
          idxToGroup=idxToGroup[1:flag2,1]
          # NOTE(review): idxToGroup stores the first row of each new group, so the slices idx1+1:idx2
          # below also contain that first row of the following group, and rows after the last stored
          # boundary are never inspected — confirm this is the intended grouping.
          # The loop reads three consecutive boundaries, so it needs at least three of them.
          if (flag2 > 2){
            # loop over the groups and see if they fulfill the constraint (compare every group with the next one):
            for (iGroup in 1:(flag2-2)){
              idx1 = as.scalar(idxToGroup[iGroup,1])
              idx2 = as.scalar(idxToGroup[iGroup+1,1])
              idx3 = as.scalar(idxToGroup[iGroup+2,1])
              if(rel=="<<" | rel=="><"){
                G1 = DataMatrixOrdered[idx1+1:idx2,] # first group
                G2 = DataMatrixOrdered[idx2+1:idx3,] # second group
                M1 = min(G1[,3])
                M2 = max(G2[,3])
                if(M1<M2){
                  # flag the rows of both slices whose variable 2 lies on the wrong side of the other slice's extreme
                  for(iNumber in 1:nrow(G1)){
                    if(as.integer(as.scalar(G1[iNumber,3]))<M2){
                      flag = flag+1
                      WrongInstances[flag,1] = as.scalar(G1[iNumber,1])
                      WrongInstances[flag,2] = iConstraint-1
                    }
                  }
                  for(iNumber in 1:nrow(G2)){
                    if(as.integer(as.scalar(G2[iNumber,3]))>M1){
                      flag = flag+1
                      WrongInstances[flag,1] = as.scalar(G2[iNumber,1])
                      WrongInstances[flag,2] = iConstraint-1
                    }
                  }
                }
              } else if(rel=="<>" | rel==">>"){
                G1 = DataMatrixOrdered[idx1+1:idx2,] # first group
                G2 = DataMatrixOrdered[idx2+1:idx3,] # second group
                M1 = max(G1[,3])
                M2 = min(G2[,3])
                if(M1>M2){
                  for(iNumber in 1:nrow(G1)){
                    if(as.integer(as.scalar(G1[iNumber,3]))>as.integer(M2)){
                      flag = flag+1
                      WrongInstances[flag,1] = as.scalar(G1[iNumber,1])
                      WrongInstances[flag,2] = iConstraint-1
                    }
                  }
                  for(iNumber in 1:nrow(G2)){
                    if(as.integer(as.scalar(G2[iNumber,3]))<as.integer(M1)){
                      flag = flag+1
                      WrongInstances[flag,1] = as.scalar(G2[iNumber,1])
                      WrongInstances[flag,2] = iConstraint-1
                    }
                  }
                }
              }
            }
          }
        }
      }
    }
  }
  if (flag==0){
    # keep one all-zero row so the output is never empty
    flag=1
    print("0 constraints are fulfilled")
  }
  # Define the final output: drop the unused preallocated rows and sort by instance index
  WrongInstances=WrongInstances[1:flag,]
  WrongInstances = order(target=WrongInstances,by=1,decreasing=FALSE,index.return=FALSE)
}