blob: 4dea80474012c8621c56f57866159b9b6213a66a [file] [log] [blame]
#-------------------------------------------------------------
#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
#
#-------------------------------------------------------------
# This builtin function implements incSliceLine, a linear-algebra-based
# ML model debugging technique for finding the top-k data slices where
# a trained models performs significantly worse than on the overall
# dataset. IncSliceLine is designed for scenarios in which training data is updated incrementally.
# For a detailed description of the SliceLine algorithm and experimental results, see:
# Svetlana Sagadeeva, Matthias Boehm: SliceLine: Fast, Linear-Algebra-based Slice Finding for ML Model Debugging.(SIGMOD 2021)
#
# INPUT:
# ---------------------------------------------------------------------------------------
# addedX Feature matrix of added tuples in recoded/binned representation
# oldX All-comprising feature matrix of previous runs (except for current run) in recoded/binned representation
# oldE All-comprising error vector of trained model for old tuples
# addedE Error vector of trained model for added tuples
# indicesRemoved Indices of tuples that were removed from the previous dataset (oldX)
# k Number of subsets required
# maxL maximum level L (conjunctions of L predicates), 0 unlimited
# minSup minimum support (min number of rows per slice)
# alpha weight [0,1]: 0 only size, 1 only error
# tpEval flag for task-parallel slice evaluation,
# otherwise data-parallel
# tpBlksz block size for task-parallel execution (num slices)
# selFeat flag for removing one-hot-encoded features that don't satisfy
# the initial minimum-support constraint and/or have zero error
# verbose flag for verbose debug output
# params list of parameters to ensure consistent parameters through all runs (for incremental updates)
# prevFoffb previous feature offsets (for incremental updates)
# prevFoffe previous feature offsets (for incremental updates)
# prevLattice previous lattice (for incremental updates)
# metaPrevLattice previous meta information for lattice encoding (for incremental updates)
# prevStats previous statistics whole lattice (for incremental updates)
# prevTK previous top-k slices (for incremental updates)
# prevTKC previous top-k scores (for incremental updates)
# encodeLat flag for encoding output lattice for less memory consumption
# pruningStrat pruning strategy: 0 all pruning, 1 all exact pruning,
# 2 only score pruning, 3 only max score pruning,
# 4 only size pruning, 5 no pruning
# ---------------------------------------------------------------------------------------
#
# OUTPUT:
# -----------------------------------------------------------------------------------------
# TK top-k slices (k x ncol(totalX) if successful)
# TKC score, size, error of slices (k x 3)
# D debug matrix, populated with enumeration stats if verbose
# L lattice matrix
# metaLattice meta information for lattice encoding
# Stats statistics matrix for all slices in L
# Xout feature matrix consisting of oldX, addedX and without removedX for next run
# eOut error vector consisting of oldE, addedE and without removedE for next run
# foffb feature offsets for next run
# foffe feature offsets for next run
# params list of parameters for next run
# -----------------------------------------------------------------------------------------
m_incSliceLine = function(
Matrix[Double] addedX, Matrix[Double] oldX = matrix(0, 0, 0),
Matrix[Double] oldE = matrix(0, 0, 0), Matrix[Double] addedE,
Int k = 4, Int maxL = 0, Int minSup = 32, Double alpha = 0.5,
Boolean tpEval = TRUE, Int tpBlksz = 16, Boolean selFeat = FALSE,
Matrix[Double] indicesRemoved = matrix(0,0,0),
Boolean verbose = FALSE, list[unknown] params = list(),
Matrix[Double] prevFoffb = matrix(0,0,0), Matrix[Double] prevFoffe = matrix(0,0,0),
list[unknown] prevLattice = list(), list[unknown] metaPrevLattice = list(),
list[unknown] prevStats = list(), Matrix[Double] prevTK = matrix(0,0,0),
Matrix[Double] prevTKC = matrix(0,0,0), Boolean encodeLat = TRUE,
Int pruningStrat = 1)
return(
Matrix[Double] TK, Matrix[Double] TKC, Matrix[Double] D,
list[unknown] L, list[unknown] metaLattice,
list[unknown] Stats, Matrix[Double] Xout, Matrix[Double] eOut,
Matrix[Double] foffb, Matrix[Double] foffe, list[unknown] params)
{
# for incremental updates a params list storing previous parameters is required
# to ensure consistent parameters over all runs
# the params list is automatically generated from the first run's params and only needs to be passed on.
if(length(prevLattice) > 0 &
((length(prevStats) == 0 | length(params) == 0 | nrow(prevFoffb) == 0 | nrow(prevFoffe) == 0 |
nrow(oldX) == 0 | nrow(oldE) == 0) | (encodeLat & length(metaPrevLattice) == 0)))
{
stop("incSliceLine: wrong or no parameters provided for incremental update. Please provide all parameters.\n"
+ " -- necessary parameters: params, prevLattice, prevStats, prevFoffb, prevFoffe, prevTK, prevTKC, oldX, oldE. \n"
+ " -- see documentation for more details.");
}
enableIncScorePruning = (pruningStrat <= 2);
enableIncSizePruning = (pruningStrat <= 1 | pruningStrat == 4);
enableIncMaxScorePruning = (pruningStrat <= 1 | pruningStrat == 3);
enableIncApproxPruning = (pruningStrat == 0);
t1 = time();
# store params for next run
[params, k, maxL, minSup, alpha, tpEval, tpBlksz, selFeat, encodeLat] = storeParams(k, maxL, minSup, alpha, tpEval, tpBlksz, selFeat, encodeLat, params);
# init debug matrix: levelID, enumerated S, valid S, TKmax, TKmin
D = matrix(0, 0, 5);
# combine old and added feature matrices and error vectors
if(nrow(oldX) == 0 | nrow(oldE) == 0) {
oldX = matrix(0,0,ncol(addedX));
oldE = matrix(0,0,ncol(addedE));
}
## remove all rows from oldX and oldE that are in indicesRemoved
removedX = matrix(0,0,ncol(oldX));
removedE = matrix(0,0,1);
if(length(indicesRemoved) > 0 & nrow(oldX) > 0){
[oldX, removedX] = removeRowsByIndices(oldX, indicesRemoved);
[oldE, removedE] = removeRowsByIndices(oldE, indicesRemoved);
if(verbose){
print("incSliceLine: removed "+nrow(indicesRemoved)+" tuples.");
}
}
totalX = rbind(oldX, addedX);
totalE = rbind(oldE, addedE);
# prepare output error vector for next run
eOut = totalE;
# compute number of tuples m and number of features n
m = nrow(totalX);
n = ncol(totalX);
# prepare offset vectors and one-hot encoded totalX
fdom = colMaxs(totalX);
foffb = t(cumsum(t(fdom))) - fdom;
foffe = t(cumsum(t(fdom)));
rix = matrix(seq(1,m)%*%matrix(1,1,n), m*n, 1)
cix = matrix(totalX + foffb, m*n, 1);
X2 = table(rix, cix, 1, m, as.scalar(foffe[,n]), FALSE); #one-hot encoded
#inititalize booleans
differentOffsets = TRUE;
if(nrow(prevFoffb) > 0){
differentOffsets = (!((sum(prevFoffb == foffb) == (ncol(foffb) * nrow(foffb)))
& (sum(prevFoffe == foffe) == (ncol(foffe) * nrow(foffe))) ));
}
# combining of addedX and oldX
addedX2 = X2[(nrow(oldX)+1):nrow(X2),];
# One-hot encoding of tuples that were removed from oldX
# combining of addedX and oldX in changedX2 facilitates simple determination of unchanged slices for pruning
changedX2 = addedX2;
removedX2 = removedX;
if(nrow(removedX) > 0){
removedX2 = oneHotEncodeUsingOffsets(removedX, foffb, foffe);
changedX2 = rbind(addedX2, removedX2);
} else {
removedX2 = matrix(0, 0, ncol(addedX2));
}
changedE = rbind(addedE, removedE);
# initialize statistics and basic slices
n2 = ncol(X2); # one-hot encoded features
eAvgOld = sum(oldE) / max(1, nrow(oldX)); # average error
eAvgNew = sum(addedE) / nrow(addedX);
eAvg = sum(totalE) / m; # average error
# prepare previous top-K set (one hot encoding, score computation)
prevTK2 = matrix(0, 0, ncol(X2));
prevTKC2 = matrix(0, 0, 4);
minsc = -Inf
if( nrow(prevTK) > 0 & enableIncScorePruning ) {
prevTK2 = oneHotEncodeUsingOffsets(prevTK, foffb, foffe);
[minsc, prevTKC2] = computeLowestPrevTK(prevTK2, X2, totalE, eAvg, alpha)
}
# create and score basic slices (conjunctions of 1 feature)
maxsc = getMaxScoreAllFeatures(nrow(X2), ncol(X2), prevLattice, metaPrevLattice,
prevStats, encodeLat, differentOffsets, alpha, eAvg, prevFoffb, prevFoffe, foffb, foffe);
maxscub = getMaxChangedScoreAllFeatures(nrow(X2), ncol(X2), addedX2, removedX2,
addedE, removedE, prevLattice, metaPrevLattice, prevStats, encodeLat, differentOffsets,
alpha, eAvg, minSup, prevFoffb, prevFoffe, foffb, foffe, enableIncApproxPruning);
[S, R, SPr, RPr, selCols] = createAndScoreBasicSlicesInc(X2, changedX2, prevTK2, totalE, changedE,
eAvg, eAvgOld, eAvgNew, minSup, alpha, minsc, maxsc, maxscub, verbose, enableIncScorePruning, enableIncMaxScorePruning, enableIncApproxPruning);
# initialize lattice and statistics for incremental updates
L1 = rbind(S, SPr);
metaLattice = list();
if( encodeLat ) {
[L1, M] = transformSlicesToIDs(L1, foffb, foffe);
metaLattice = append(metaLattice, M);
}
L = list(L1);
Stats1 = rbind(R, RPr);
Stats = list(Stats1);
# initialize top-k
[TK, TKC] = maintainTopKInc(S, R, prevTK2, prevTKC2, k, minSup, foffb, foffe);
if( verbose ) {
[maxsc2, minsc2] = analyzeTopKInc(TKC);
print("incSliceLine: initial top-K: count="+nrow(TK)+", max="+maxsc2+", min="+minsc2+" (time="+(time()-t1)+")")
D = rbind(D, t(as.matrix(list(1, n2, nrow(S), maxsc2, minsc2))));
}
# reduce dataset to relevant attributes (minSup, err>0), S reduced on-the-fly
if( selFeat ){
X2 = removeEmpty(target=X2, margin="cols", select=t(selCols));
changedX2 = removeEmpty(target=changedX2, margin="cols", select=t(selCols));
}
# lattice enumeration w/ size/error pruning, one iteration per level
# termination condition (max #feature levels)
maxL = ifelse(maxL<=0, n, maxL)
level = 1;
while( nrow(S) > 0 & sum(S) > 0 & level < n & level < maxL ) {
level = level + 1;
# enumerate candidate join pairs, incl size/error pruning
nrS = nrow(S);
S = getPairedCandidatesInc(S, R, TKC, level, eAvg, minSup, alpha, n2, foffb, foffe);
S2 = S;
# load one hot encoded previous lattice for the current level
prevLattice2 = matrix(0,0,0);
if(enableIncSizePruning){
prevLattice2 = preparePrevLattice(prevLattice, metaPrevLattice, prevFoffb,
prevFoffe, foffb, foffe, level, encodeLat, differentOffsets)
}
prevLattice1 = prevLattice2;
if(selFeat){
if(length(prevLattice2)>0 & enableIncSizePruning){
prevLattice2 = removeEmpty(target=prevLattice2, margin="cols", select=t(selCols));
}
S2 = removeEmpty(target=S, margin="cols", select=t(selCols));
}
if(verbose) {
print("\nincSliceLine: level "+level+":")
print(" -- generated paired slice candidates: "+nrS+" -> "+nrow(S));
}
# prune unchanged slices with slice size < minSup
SPr = matrix(0,0, ncol(S));
RPr = matrix(0,0, 4);
if(level <= length(prevStats) & enableIncSizePruning){
npairs = nrow(S);
[S, S2, SPr, RPr] = pruneUnchangedSlices(S, S2, prevLattice1, prevLattice2, prevStats, changedX2, minSup, verbose, level);
if(verbose) {
print(" -- dropping "+(npairs-nrow(S))+"/"+npairs+" unaffected paired slice candidates ");
}
}
# prepare and store output lattice for next run
L1 = rbind(S,SPr);
if ( encodeLat ) {
[L1, M] = transformSlicesToIDs(L1, foffb, foffe);
metaLattice = append(metaLattice, M);
}
L = append(L, L1);
if( nrow(S) > 0 ) {
# extract and evaluate candidate slices
if( tpEval ) { # task-parallel
# hybrid task-parallel w/ 1 matrix-matrix for blocks of 16 matrix-vector
R = matrix(0, nrow(S), 4)
parfor( i in 1:ceil(nrow(S)/tpBlksz), check=0 ) {
beg = (i-1)*tpBlksz + 1;
end = min(i*tpBlksz, nrow(R));
R[beg:end,] = evalSliceInc(X2, totalE, eAvg, t(S2[beg:end,]), level, alpha);
}
}
else { # data-parallel
R = evalSliceInc(X2, totalE, eAvg, t(S2), level, alpha);
}
# update output statistics
Rrep = rbind(R, RPr);
Stats = append(Stats, Rrep);
# maintain top-k after evaluation
[TK, TKC] = maintainTopKInc(S, R, TK, TKC, k, minSup, foffb, foffe);
if(verbose) {
[maxsc2, minsc2] = analyzeTopKInc(TKC);
valid = as.integer(sum(R[,2]>0 & R[,4]>=minSup));
print(" -- valid slices after eval: "+valid+"/"+nrow(S));
print(" -- top-K: count="+nrow(TK)+", max="+maxsc2+", min="+minsc2);
print(" -- (time="+(time()-t1)+")")
D = rbind(D, t(as.matrix(list(level, nrow(S), valid, maxsc2, minsc2))));
}
}
}
TK = decodeOneHot(TK, foffb, foffe);
# prepare output feature matrix for next run
Xout = totalX;
if( verbose ) {
print("incSliceLine: terminated at level "+level+":\n"
+ toString(TK) + "\n" + toString(TKC));
}
}
createAndScoreBasicSlicesInc = function(Matrix[Double] X2, Matrix[Double] X2p,
Matrix[Double] prevTK2, Matrix[Double] e, Matrix[Double] ep,
Double eAvg, Double eAvgOld, Double eAvgNew, Double minSup, Double alpha,
Double minsc, Matrix[Double] maxsc, Matrix[Double] maxscub, Boolean verbose,
Boolean enableIncScorePruning, Boolean enableIncMaxScorePruning, Boolean enableIncApproxPruning)
return(Matrix[Double] S, Matrix[Double] R, Matrix[Double] SPr, Matrix[Double] RPr, Matrix[Double] selCols)
{
n2 = ncol(X2);
cCnts = t(colSums(X2)); # column counts
ncCnts = t(colSums(X2p)); # column counts of modifications
tkCnts = t(colSums(prevTK2)); # column counts previous top-K
err = t(t(e) %*% X2); # total error vector
merr = t(colMaxs(X2 * e)); # maximum error vector
# a) min-support pruning
selCols = (cCnts >= minSup & err > 0);
if( verbose ) {
drop = n2 - as.integer(sum(selCols));
print("incSliceLine: dropping "+drop+"/"+n2+" features below minSup = "+minSup+".");
}
# b) unchanged pruning
# (valid to prune feature if its previous max score was negative or below minsc)
selCols2 = selCols;
if( enableIncMaxScorePruning ) {
selCols2 = selCols & (ncCnts > 0 | maxsc > max(0, minsc));
}
if( verbose ) {
n = as.integer(sum(selCols));
drop = as.integer(sum(selCols) - sum(selCols2));
print("incSliceLine: dropping "+drop+"/"+n+" unaffected features.");
}
if( enableIncApproxPruning ) {
# c) max score changed pruning
n = as.integer(sum(selCols2));
if( enableIncApproxPruning ) {
selCols2 = selCols2 & (maxscub >= max(0, minsc) | maxscub==-Inf);
}
if( verbose ) {
drop = as.integer(n - sum(selCols2));
print("incSliceLine: dropping "+drop+"/"+n+" insufficiently affected features.");
}
}
# working set of active slices (#attr x #slices) and top k
attr = removeEmpty(target=seq(1,n2), margin="rows", select=selCols2);
ss = removeEmpty(target=cCnts, margin="rows", select=selCols2);
se = removeEmpty(target=err, margin="rows", select=selCols2);
sm = removeEmpty(target=merr, margin="rows", select=selCols2);
tkCnts = removeEmpty(target=merr, margin="rows", select=selCols2);
S = table(seq(1,nrow(attr)), attr, nrow(attr), n2);
# score 1-slices and create initial top-k
sc = scoreInc(ss, se, eAvg, alpha, nrow(X2));
R = cbind(sc, se, sm, ss);
SPr = matrix(0,0, n2);
RPr = matrix(0,0, 4);
# store all pruned slices for incremental updates
if(sum(!selCols2) != 0){
attrPr = removeEmpty(target=seq(1,n2), margin="rows", select=!selCols2);
ssPr = removeEmpty(target=cCnts, margin="rows", select=!selCols2);
sePr = removeEmpty(target=err, margin="rows", select=!selCols2);
smPr = removeEmpty(target=merr, margin="rows", select=!selCols2);
SPr = table(seq(1,nrow(attrPr)), attrPr, nrow(attrPr), n2);
# scores are currently not used for pruning
# in case of future use, set scores to Inf so a slice that was not scored
# in this run can be identified and does not get pruned based on score in the next run
scPr = matrix(Inf, nrow(SPr), 1);
RPr = cbind(scPr, sePr, smPr, ssPr);
}
# d) score pruning
# compute upper bound scores for all remaining slices
if(minsc > -Inf & enableIncScorePruning) {
ubSc = scoreUBInc(ss, se, sm, eAvg, minSup, alpha, nrow(X2));
selCols3 = (ubSc > max(0, minsc));
# store all pruned slices for incremental updates
if(sum(!selCols3) != 0){
Rremoved = removeEmpty(target=R, margin="rows", select=!selCols3);
Sremoved = removeEmpty(target=S, margin="rows", select=!selCols3);
RPr = rbind(RPr, Rremoved);
SPr = rbind(SPr, Sremoved);
}
S = removeEmpty(target=S, margin="rows", select=selCols3);
R = removeEmpty(target=R, margin="rows", select=selCols3);
if( verbose ) {
n = as.integer(sum(selCols2));
drop = as.integer(sum(selCols2) - sum(selCols3));
print("incSliceLine: dropping "+drop+"/"+n+" features below minSore = "+minsc+".");
}
cix = removeEmpty(target=attr, margin="rows", select=selCols3);
if(sum(cix) != 0) {
selCols = table(cix, 1, n2, 1);
}else {
selCols = matrix(0, n2, 1);
}
}
else {
selCols = selCols2;
}
}
scoreInc = function(Matrix[Double] ss, Matrix[Double] se, Double eAvg, Double alpha, Integer n)
return(Matrix[Double] sc)
{
sc = alpha * ((se/ss) / eAvg - 1) - (1-alpha) * (n/ss - 1);
sc = replace(target=sc, pattern=NaN, replacement=-Inf);
}
scoreUBInc = function(Matrix[Double] ss, Matrix[Double] se, Matrix[Double] sm,
Double eAvg, Integer minSup, Double alpha, Integer n)
return(Matrix[Double] sc)
{
# Initial upper bound equation (with minSup and ss in pos/neg terms)
# sc = alpha * ((se/minSup) / eAvg - 1) - (1-alpha) * (n/ss - 1);
# Since sc is either monotonically increasing or decreasing, we
# probe interesting points of sc in the interval [minSup, ss],
# and compute the maximum to serve as the upper bound
s = cbind(matrix(minSup,nrow(ss),1), max(se/sm,minSup), ss)
sc = rowMaxs(alpha * ((min(s*sm,se)/s) / eAvg - 1) - (1-alpha) * (1/s*n - 1));
sc = replace(target=sc, pattern=NaN, replacement=-Inf);
}
maintainTopKInc = function(Matrix[Double] S, Matrix[Double] R,
Matrix[Double] TK, Matrix[Double] TKC, Integer k,
Integer minSup, Matrix[Double] foffb, Matrix[Double] foffe)
return(Matrix[Double] TK, Matrix[Double] TKC)
{
# prune invalid minSup and scores
I = (R[,1] > 0) & (R[,4] >= minSup);
if( sum(I)!=0 ) {
S = removeEmpty(target=S, margin="rows", select=I);
R = removeEmpty(target=R, margin="rows", select=I);
# evaluated candidates and previous top-k
slices = rbind(TK, S);
scores = rbind(TKC, R);
# extract top-k w/ deduplication due to previous TK
# (sort by score and ID for robustness)
[IDs, M] = transformSlicesToIDs(slices, foffb, foffe);
oby = matrix("1 5", rows=1, cols=2);
IX = order(target=cbind(scores,IDs), by=oby, decreasing=TRUE, index.return=TRUE);
IX = IX[1:min(2*k,nrow(IX)),];
P = table(seq(1,nrow(IX)), IX, nrow(IX), nrow(slices));
TK = P %*% slices;
TKC = P %*% scores;
if( nrow(TK) >= 2 ) {
I2 = rbind(rowSums(TK[1:(nrow(TK)-1),]==TK[2:nrow(TK),])!=ncol(TK), as.matrix(1))
TK = removeEmpty(target=TK, margin="rows", select=I2);
TKC = removeEmpty(target=TKC, margin="rows", select=I2);
TK = TK[1:min(k,nrow(TK)),];
TKC = TKC[1:min(k,nrow(TKC)),];
}
}
}
analyzeTopKInc = function(Matrix[Double] TKC) return(Double maxsc, Double minsc) {
maxsc = -Inf;
minsc = -Inf;
if( nrow(TKC)>0 ) {
maxsc = as.scalar(TKC[1,1]);
minsc = as.scalar(TKC[nrow(TKC),1]);
}
}
getPairedCandidatesInc = function(Matrix[Double] S,
Matrix[Double] R, Matrix[Double] TKC, Integer level,
Double eAvg, Integer minSup, Double alpha, Integer n2,
Matrix[Double] foffb, Matrix[Double] foffe)
return(Matrix[Double] P)
{
# prune invalid slices (possible without affecting overall
# pruning effectiveness due to handling of missing parents)
pI = (R[,4] >= minSup & R[,2] > 0);
S = removeEmpty(target=S, margin="rows", select=pI)
R = removeEmpty(target=R, margin="rows", select=pI)
# join compatible slices (without self)
join = S %*% t(S) == (level-2);
I = upper.tri(target=join, diag=FALSE, values=TRUE);
# pair construction
nr = nrow(I); nc = ncol(I);
rix = matrix(I * seq(1,nr), nr*nc, 1);
cix = matrix(I * t(seq(1,nc)), nr*nc, 1);
rix = removeEmpty(target=rix, margin="rows");
cix = removeEmpty(target=cix, margin="rows");
P = matrix(0,0,ncol(S))
if( sum(rix)!=0 ) {
P1 = table(seq(1,nrow(rix)), rix, nrow(rix), nrow(S));
P2 = table(seq(1,nrow(cix)), cix, nrow(rix), nrow(S));
P12 = P1 + P2; # combined slice
P = (P1 %*% S + P2 %*% S) != 0;
se = min(P1 %*% R[,2], P2 %*% R[,2])
sm = min(P1 %*% R[,3], P2 %*% R[,3])
ss = min(P1 %*% R[,4], P2 %*% R[,4])
# prune invalid self joins (>1 bit per feature)
I = matrix(1, nrow(P), 1);
for( j in 1:ncol(foffb) ) {
beg = as.scalar(foffb[1,j])+1;
end = as.scalar(foffe[1,j]);
I = I & (rowSums(P[,beg:end]) <= 1);
}
P12 = removeEmpty(target=P12, margin="rows", select=I)
P = removeEmpty(target=P, margin="rows", select=I);
ss = removeEmpty(target=ss, margin="rows", select=I);
se = removeEmpty(target=se, margin="rows", select=I);
sm = removeEmpty(target=sm, margin="rows", select=I);
# prepare IDs for deduplication and pruning
[ID, M] = transformSlicesToIDs(P, foffb, foffe);
# size pruning, with rowMin-rowMax transform
# to avoid densification (ignored zeros)
map = table(ID, seq(1,nrow(P)), max(ID), nrow(P))
ubSizes = 1/rowMaxs(map * (1/t(ss)));
ubSizes = replace(target=ubSizes, pattern=Inf, replacement=0);
fSizes = (ubSizes >= minSup)
# error pruning
ubError = 1/rowMaxs(map * (1/t(se)));
ubError = replace(target=ubError, pattern=Inf, replacement=0);
ubMError = 1/rowMaxs(map * (1/t(sm)));
ubMError = replace(target=ubMError, pattern=Inf, replacement=0);
ubScores = scoreUBInc(ubSizes, ubError, ubMError, eAvg, minSup, alpha, n2);
[maxsc, minsc] = analyzeTopKInc(TKC);
# score pruning
fScores = (ubScores > minsc & ubScores > 0)
# missing parents pruning
numParents = rowSums((map %*% P12) != 0)
fParents = (numParents == level);
# apply all pruning
fall = (fSizes & fScores & fParents );
# deduplication of join outputs
Dedup = removeEmpty(target=map, margin="rows", select=fall) != 0
#P = (Dedup %*% P) != 0, replaced by below (easier sparsity propagation)
DeI = table(rowIndexMax(Dedup), 1, nrow(P), 1);
P = removeEmpty(target=P, margin="rows", select=DeI);
}
}
evalSliceInc = function(Matrix[Double] X, Matrix[Double] e, Double eAvg,
Matrix[Double] tS, Integer l, Double alpha)
return(Matrix[Double] R)
{
# compute slice sizes for the slices that are new.
I = (X %*% tS) == l; # slice indicator
ss = t(colSums(I)); # absolute slice size (nnz)
se = t(t(e) %*% I); # absolute slice error
sm = t(colMaxs(I * e)); # maximum tuple error in slice
# score of relative error and relative size
sc = scoreInc(ss, se, eAvg, alpha, nrow(X));
R = cbind(sc, se, sm, ss);
}
decodeOneHot = function(Matrix[Double] M, Matrix[Double] foffb, Matrix[Double] foffe)
return(Matrix[Double] M)
{
R = matrix(1, nrow(M), ncol(foffb));
if( nrow(M) > 0 ) {
parfor( j in 1:ncol(foffb) ) {
beg = as.scalar(foffb[1,j])+1;
end = as.scalar(foffe[1,j]);
I = rowSums(M[,beg:end]) * rowIndexMax(M[,beg:end]);
R[, j] = I;
}
}
M = R;
}
# function to oneHotEncode but with predefined feature offsets, to facilitate the same encoding for different datasets
# note: this version needs to deal with zeros while the preprocessing encoding does not
oneHotEncodeUsingOffsets = function(Matrix[Double] A, Matrix[Double] foffb, Matrix[Double] foffe)
return(Matrix[Double] A2)
{
m = nrow(A);
n = ncol(A);
if( m > 0 ) {
rix = matrix(seq(1,m)%*%matrix(1,1,n), m*n, 1)
cix = matrix((A!=0)*(A + foffb), m*n, 1);
if( sum(cix!=0) < m*n ) {
ix = cix != 0;
rix = removeEmpty(target=rix, margin="rows", select=ix);
cix = removeEmpty(target=cix, margin="rows", select=ix);
}
if (sum(rix) == 0 | sum(cix) == 0) {
A2 = matrix(0, 0, as.scalar(foffe[, n]));
} else {
# actual one-hot encoding
A2 = table(rix, cix, 1, m, as.scalar(foffe[, n]), FALSE);
}
} else {
A2 = matrix(0, 0, as.scalar(foffe[, n]));
}
}
# store parameters for next run and overwrite params if provided
storeParams = function(Integer k, Integer maxL, Integer minSup, Double alpha, Boolean tpEval,
Integer tpBlksz, Boolean selFeat, Boolean encodeLat, list[unknown] params)
return(list[unknown] params, Integer k, Integer maxL, Integer minSup,
Double alpha, Boolean tpEval, Integer tpBlksz, Boolean selFeat, Boolean encodeLat)
{
if(length(params) == 0) {
params = list(as.double(k), as.double(maxL), as.double(minSup),
alpha, as.double(tpEval), as.double(tpBlksz), as.double(selFeat), as.double(encodeLat));
} else {
k = as.scalar(params[1]);
maxL = as.scalar(params[2]);
minSup = as.scalar(params[3]);
alpha = as.scalar(params[4]);
tpEval = as.boolean(as.scalar(params[5]));
tpBlksz = as.scalar(params[6]);
selFeat = as.boolean(as.scalar(params[7]));
encodeLat = as.boolean(as.scalar(params[8]));
}
}
computeLowestPrevTK = function(Matrix[Double] prevTK2, Matrix[Double] X2,
Matrix[Double] totalE, Double eAvg, Double alpha)
return(Double minsc, Matrix[Double] R)
{
# extract and evaluate candidate slices
l = t(rowSums(prevTK2));
# compute slice stats of curSlice within whole feature matrix X2.
I = (X2 %*% t(prevTK2)) == l; # slice indicator
ss = t(colSums(I)); # absolute slice size (nnz)
se = t(t(totalE) %*% I); # absolute slice error
sm = t(colMaxs(I * totalE)); # maximum tuple error in slice
# score slice and if applicable set min score for pruning
sc = scoreInc(ss, se, eAvg, alpha, nrow(X2));
sc = replace(target=sc, pattern=-Inf, replacement=0)
R = cbind(sc, se, sm, ss);
minsc = min(sc);
}
pruneUnchangedSlices = function(Matrix[Double] S, Matrix[Double] S2, Matrix[Double] prevLattice, Matrix[Double] prevLattice2, list[unknown] prevStats, Matrix[Double] changedX2, Int minSup, Boolean verbose, Integer level)
return(Matrix[Double] S, Matrix[Double] S2, Matrix[Double] SPr, Matrix[Double] RPr)
{
SPr = matrix(0,0, ncol(S));
RPr = matrix(0,0, 4);
unchangedS = prevLattice2;
unchangedR = as.matrix(prevStats[level])
# a) determine unchagned slices
# only computing unchanged slices for levels 2 and above,
# Imat has a 1 where a slice in changedX2 belongs to a slice in prevLatAtLevel
if(nrow(S2) > 0 & sum(S2) > 0 & nrow(unchangedS) > 0) {
# b) select unchanged slices that can be pruned
I = t(colSums((changedX2 %*% t(unchangedS)) == level)) # change pushdown
+ unchangedR[,4] < minSup; # minSup pushdown
unchangedS2 = removeEmpty(target=unchangedS, margin="rows", select=I);
if(sum(I) > 0){
SPr = removeEmpty(target=prevLattice, margin="rows", select=I);
RPr = removeEmpty(target=unchangedR, margin="rows", select=I);
}
# c) select only rows that cannot be pruned
selCols = !rowSums((S2 %*% t(unchangedS2)) == level);
if(nrow(unchangedS) > 0 & sum(selCols) < nrow(S) ){
S2 = removeEmpty(target=S2, margin="rows", select=selCols);
S = removeEmpty(target=S, margin="rows", select=selCols);
}
}
}
transformSlicesToIDs = function(Matrix[Double] S, Matrix[Double] foffb, Matrix[Double] foffe)
return(Matrix[Double] ID, frame[unknown] M)
{
if(nrow(S) > 0){
ID = matrix(0, nrow(S), 1);
dom = foffe-foffb+1;
parfor( j in 1:ncol(dom) ) {
beg = as.scalar(foffb[1,j])+1;
end = as.scalar(foffe[1,j]);
I = rowIndexMax(S[,beg:end]) * rowMaxs(S[,beg:end]);
prod = 1;
if(j<ncol(dom)) {
prod = prod(dom[1,(j+1):ncol(dom)])
}
ID += I * prod;
}
# ID transformation to avoid exceeding INT_MAX and
# and to void creating huge sparse intermediates
[ID, M] = transformencode(target=as.frame(ID), spec="{ids:true,recode:[1]}")
} else {
ID = matrix(0, 0, 1);
M = as.frame(matrix(0, 0, 0));
}
}
# Function to decode IDs back into slices using domain size scaling reversal
transformIDsToSlices = function(Matrix[Double] ID, Matrix[Double] foffb, Matrix[Double] foffe, frame[unknown] M)
return(Matrix[Double] S)
{
if(nrow(ID) > 0){
ID = transformdecode(target=ID, spec="{ids:true,recode:[1]}", meta=M);
ID = as.matrix(ID);
dom = foffe-foffb+1;
numSlices = nrow(ID);
numFeatures = ncol(foffb);
S = matrix(0, numSlices, numFeatures);
for (i in 1:numSlices) {
remainingID = as.scalar(ID[i, 1]);
for (j in 1:numFeatures) {
domSize = as.scalar(dom[1, numFeatures - j +1]);
value = remainingID %% domSize;
S[i, numFeatures - j +1] = value;
remainingID = floor(remainingID/domSize);
}
}
} else {
S = matrix(0, 0, 0);
}
}
# get the maximum reached scores from a previous run for every involved feature
# (this method is robust for different encodings, and fewer statistics R if S
# was not evaluated as well as Infinity, and rescales according to |X| and eAvg)
getMaxScoreAllFeatures = function(Int numRows, Int numFeatures, List[Unknown] prevLattice,
List[Unknown] metaPrevLattice, List[Unknown] prevStats, Boolean encodeLat,
Boolean differentOffsets, Double alpha, Double eAvg,
Matrix[Double] prevFoffb, Matrix[Double] prevFoffe, Matrix[Double] foffb, Matrix[Double] foffe)
return(Matrix[Double] maxsc)
{
maxsc = matrix(-Inf, numFeatures, 1);
if( length(prevLattice) > 0 ) {
for(level in 1:length(prevLattice)) {
S = as.matrix(prevLattice[level]);
if( encodeLat ) {
metaAtLevel = as.frame(metaPrevLattice[level]);
prevLatticeDec = transformIDsToSlices(S, prevFoffb, prevFoffe, metaAtLevel);
if( nrow(S) > 0 )
S = oneHotEncodeUsingOffsets(prevLatticeDec, foffb, foffe);
else
S = matrix(0, 1, numFeatures);
}
else if(differentOffsets) {
prevLatticeDec = decodeOneHot(S, prevFoffb, prevFoffe);
S = oneHotEncodeUsingOffsets(prevLatticeDec, foffb, foffe);
}
if( length(prevStats) >= level ) {
R = as.matrix(prevStats[level]);
# rescaling of scores according to new size of X and eAvg
sc = scoreInc(R[,4], R[,2], eAvg, alpha, numRows);
# robustness for S * sc creating NaNs because 0 * -Inf = NaN
sc = replace(target=sc, pattern=-Inf, replacement=0);
maxsc = max(maxsc, t(colMaxs(S*sc)));
}
}
}
}
getMaxChangedScoreAllFeatures = function(Int numRows, Int numFeatures, Matrix[Double] addedX2,
Matrix[Double] removedX2, Matrix[Double] addedE, Matrix[Double] removedE,
List[Unknown] prevLattice, List[Unknown] metaPrevLattice, List[Unknown] prevStats,
Boolean encodeLat, Boolean differentOffsets, Double alpha, Double eAvg, Double minSup,
Matrix[Double] prevFoffb, Matrix[Double] prevFoffe, Matrix[Double] foffb, Matrix[Double] foffe,
Boolean enableIncApproxPruning)
return(Matrix[Double] maxscub)
{
maxscub = matrix(-Inf, numFeatures, 1);
if( length(prevLattice) > 0 & nrow(addedX2) < 0.05*numRows & enableIncApproxPruning ) {
# compute upper bounds per feature for added subset
ss = t(colSums(addedX2));
se = t(t(addedE) %*% addedX2);
sm = t(colMaxs(addedX2 * addedE))
maxscub = scoreUBInc(ss, se, sm, eAvg, minSup, alpha, numRows);
# compute high-probability upper-bound scores per feature
for(level in 1:length(prevLattice)) {
if( length(prevStats) >= level ) {
S = as.matrix(prevLattice[level]);
if( encodeLat ) {
metaAtLevel = as.frame(metaPrevLattice[level]);
prevLatticeDec = transformIDsToSlices(S, prevFoffb, prevFoffe, metaAtLevel);
if( nrow(S) > 0 )
S = oneHotEncodeUsingOffsets(prevLatticeDec, foffb, foffe);
else
S = matrix(0, 1, numFeatures);
}
else if(differentOffsets) {
prevLatticeDec = decodeOneHot(S, prevFoffb, prevFoffe);
S = oneHotEncodeUsingOffsets(prevLatticeDec, foffb, foffe);
}
R = as.matrix(prevStats[level]); #(sc,se,sm,ss)
# compute exact deltas/slice scores, and update max scores per feature despite the
# exact deltas we need to consider upper bounds because new slices, previously not
# considered, might appear (only high probability, for hard guarantees we would
# need to consider previously pruned slices too)
dssp = rowSums((S%*%t(addedX2))==level); # size delta plus
dssm = rowSums((S%*%t(removedX2))==level); # size delta minus
dsep = rowSums((S%*%t(addedX2)==level)*t(addedE)); # error delta plus
dsem = rowSums((S%*%t(removedX2)==level)*t(removedE)); # error delta minus
# compute interesting extreme points
sc1 = scoreInc(R[,4] - dssm, R[,2] + dsep, eAvg, alpha, numRows);
sc2 = scoreInc(R[,4], R[,2] + dsep, eAvg, alpha, numRows);
sc3 = scoreInc(R[,4] + dssp, R[,2] + dsep, eAvg, alpha, numRows);
sc4 = scoreInc(R[,4] + dssp - dssm, R[,2] + dsep - dsem, eAvg, alpha, numRows);
sc = max(sc1, sc2, sc3, sc4);
# robustness for S * sc creating NaNs because 0 * -Inf = NaN
sc = replace(target=sc, pattern=-Inf, replacement=0);
maxscub = max(maxscub, t(colMaxs(S*sc)));
}
}
}
}
preparePrevLattice = function(list[unknown] prevLattice, list[unknown] metaPrevLattice,
Matrix[Double] prevFoffb, Matrix[Double] prevFoffe, Matrix[Double] foffb,
Matrix[Double] foffe, Integer level, Boolean encodeLat, Boolean differentOffsets)
return (Matrix[Double] prevLattice2) {
prevLattice2 = matrix(0,0,0);
if( length(prevLattice) >= level ) {
prevLattice2 = as.matrix(prevLattice[level]);
if(nrow(prevLattice2) > 0){
if(encodeLat) {
metaAtLevel = as.frame(metaPrevLattice[level]);
prevLatticeDec = transformIDsToSlices(prevLattice2, prevFoffb, prevFoffe, metaAtLevel);
prevLattice2 = oneHotEncodeUsingOffsets(prevLatticeDec, foffb, foffe);
# in case the offsets in the previous run were different the lattice needs to be adjusted
} else if(differentOffsets) {
prevLatticeDec = decodeOneHot(prevLattice2, prevFoffb, prevFoffe);
prevLattice2 = oneHotEncodeUsingOffsets(prevLatticeDec, foffb, foffe);
}
}
}
}
# Function to remove rows from matrix M based on a list of indices
removeRowsByIndices = function(Matrix[Double] M, Matrix[Double] indices)
return (Matrix[Double] remain, Matrix[Double] removed)
{
P1 = table(seq(1,nrow(indices)), indices, nrow(indices), nrow(M))
removed = P1 %*% M;
CIX = removeEmpty(target=(table(indices,1,nrow(M),1)==0)*seq(1,nrow(M)), margin="rows")
P2 = table(seq(1, nrow(CIX)), CIX, nrow(CIX), nrow(M))
remain = P2 %*% M;
}