blob: 731f3f970c6f092734228343ceb0ece5f3de845c [file] [log] [blame]
#-------------------------------------------------------------
#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
#
#-------------------------------------------------------------
#-------------------------------------------------------------
# X Input matrix (integer encoded [1..v])
# e error vector (classification accuracy, l2 norm, etc)
# k top-K subsets / slices
# maxL maximum level L (conjunctions of L predicates), 0 unlimited
# minSup minimum support (min number of rows per slice)
# alpha weight [0,1]: 0 only size, 1 only error
# tpEval flag for task-parallel slice evaluation,
# otherwise data-parallel
# tpBlksz block size for task-parallel execution (num slices)
# verbose flag for verbose debug output
# ------------------------------------------------------------
# TK top-k slices (k x ncol(X) if successful)
# TKC score, size, error of slices (k x 3)
# ------------------------------------------------------------
m_slicefinder = function(Matrix[Double] X, Matrix[Double] e,
Integer k = 4, Integer maxL = 0, Integer minSup = 32, Double alpha = 0.5,
Boolean tpEval = TRUE, Integer tpBlksz = 16, Boolean verbose = FALSE)
return(Matrix[Double] TK, Matrix[Double] TKC)
{
m = nrow(X);
n = ncol(X);
# prepare offset vectors and one-hot encoded X
fdom = colMaxs(X);
foffb = t(cumsum(t(fdom))) - fdom;
foffe = t(cumsum(t(fdom)))
rix = matrix(seq(1,m)%*%matrix(1,1,n), m*n, 1)
cix = matrix(X + foffb, m*n, 1);
X2 = table(rix, cix); #one-hot encoded
# initialize statistics and basic slices
n2 = ncol(X2); # one-hot encoded features
eAvg = sum(e) / m; # average error
[S, R] = createAndScoreBasicSlices(X2, e, eAvg, minSup, alpha, verbose);
# initialize top-k
[TK, TKC] = maintainTopK(S, R, matrix(0,0,n2), matrix(0,0,4), k, minSup);
if( verbose ) {
[maxsc, minsc] = analyzeTopK(TKC);
print("SliceFinder: initial top-K: count="+nrow(TK)+", max="+maxsc+", min="+minsc)
}
# lattice enumeration w/ size/error pruning, one iteration per level
# termination condition (max #feature levels)
maxL = ifelse(maxL<=0, n, maxL)
level = 1;
while( nrow(S) > 0 & sum(S) > 0 & level < n & level < maxL ) {
level = level + 1;
# enumerate candidate join pairs, incl size/error pruning
nrS = nrow(S);
S = getPairedCandidates(S, R, TK, TKC, k, level, eAvg, minSup, alpha, n2, foffb, foffe);
if(verbose) {
print("\nSliceFinder: level "+level+":")
print(" -- generated paired slice candidates: "+nrS+" -> "+nrow(S));
}
# extract and evaluate candidate slices
if( tpEval ) { # task-parallel
# hybrid task-parallel w/ 1 matrix-matrix for blocks of 16 matrix-vector
R = matrix(0, nrow(S), 4)
parfor( i in 1:ceil(nrow(S)/tpBlksz), check=0 ) {
beg = (i-1)*tpBlksz + 1;
end = min(i*tpBlksz, nrow(R));
R[beg:end,] = evalSlice(X2, e, eAvg, t(S[beg:end,]), level, alpha);
}
}
else { # data-parallel
R = evalSlice(X2, e, eAvg, t(S), level, alpha);
}
# maintain top-k after evaluation
[TK, TKC] = maintainTopK(S, R, TK, TKC, k, minSup);
if(verbose) {
[maxsc, minsc] = analyzeTopK(TKC);
valid = as.integer(sum(R[,4]>=minSup));
print(" -- valid slices after eval: "+valid+"/"+nrow(S));
print(" -- top-K: count="+nrow(TK)+", max="+maxsc+", min="+minsc);
}
}
TK = decodeTopK(TK, foffb, foffe);
if( verbose ) {
print("SliceFinder: terminated at level "+level+":\n"
+ toString(TK) + "\n" + toString(TKC));
}
}
createAndScoreBasicSlices = function(Matrix[Double] X2, Matrix[Double] e,
Double eAvg, Double minSup, Double alpha, Boolean verbose)
return(Matrix[Double] S, Matrix[Double] R)
{
n2 = ncol(X2);
cCnts = t(colSums(X2)); # column counts
err = t(t(e) %*% X2); # total error vector
merr = t(colMaxs(X2 * e)); # maximum error vector
if( verbose ) {
drop = as.integer(sum(cCnts < minSup));
print("SliceFinder: dropping "+drop+"/"+n2+" features below minSup = "+minSup+".");
}
# working set of active slices (#attr x #slices) and top k
selCols = (cCnts >= minSup);
attr = removeEmpty(target=seq(1,n2), margin="rows", select=selCols);
ss = removeEmpty(target=cCnts, margin="rows", select=selCols);
se = removeEmpty(target=err, margin="rows", select=selCols);
sm = removeEmpty(target=merr, margin="rows", select=selCols);
S = table(seq(1,nrow(attr)), attr, nrow(attr), n2);
# score 1-slices and create initial top-k
sc = score(ss, se, eAvg, alpha, nrow(X2));
R = cbind(sc, se, sm, ss);
}
score = function(Matrix[Double] ss, Matrix[Double] se, Double eAvg, Double alpha, Integer n)
return(Matrix[Double] sc)
{
sc = alpha * ((se/ss) / eAvg - 1) - (1-alpha) * (n/ss - 1);
sc = replace(target=sc, pattern=NaN, replacement=-Inf);
}
scoreUB = function(Matrix[Double] ss, Matrix[Double] se, Matrix[Double] sm,
Double eAvg, Integer minSup, Double alpha, Integer n)
return(Matrix[Double] sc)
{
# Initial upper bound equation (with minSup and ss in pos/neg terms)
# sc = alpha * ((se/minSup) / eAvg - 1) - (1-alpha) * (n/ss - 1);
# Since sc is either monotonically increasing or decreasing, we
# probe interesting points of sc in the interval [minSup, ss],
# and compute the maximum to serve as the upper bound
s = cbind(matrix(minSup,nrow(ss),1), max(se/sm,minSup), ss)
sc = rowMaxs(alpha * ((min(s*sm,se)/s) / eAvg - 1) - (1-alpha) * (1/s*n - 1));
sc = replace(target=sc, pattern=NaN, replacement=-Inf);
}
maintainTopK = function(Matrix[Double] S, Matrix[Double] R,
Matrix[Double] TK, Matrix[Double] TKC, Integer k, Integer minSup)
return(Matrix[Double] TK, Matrix[Double] TKC)
{
# prune invalid minSup and scores
I = (R[,1] > 0) & (R[,4] >= minSup);
if( sum(I)!=0 ) {
S = removeEmpty(target=S, margin="rows", select=I);
R = removeEmpty(target=R, margin="rows", select=I);
# evaluated candidated and previous top-k
slices = rbind(TK, S);
scores = rbind(TKC, R);
# extract top-k
IX = order(target=scores, by=1, decreasing=TRUE, index.return=TRUE);
IX = IX[1:min(k,nrow(IX)),];
P = table(seq(1,nrow(IX)), IX, nrow(IX), nrow(slices));
TK = P %*% slices;
TKC = P %*% scores;
}
}
analyzeTopK = function(Matrix[Double] TKC) return(Double maxsc, Double minsc) {
maxsc = -Inf;
minsc = -Inf;
if( nrow(TKC)>0 ) {
maxsc = as.scalar(TKC[1,1]);
minsc = as.scalar(TKC[nrow(TKC),1]);
}
}
getPairedCandidates = function(Matrix[Double] S, Matrix[Double] R,
Matrix[Double] TK, Matrix[Double] TKC, Integer k, Integer level,
Double eAvg, Integer minSup, Double alpha, Integer n2,
Matrix[Double] foffb, Matrix[Double] foffe)
return(Matrix[Double] P)
{
# prune invalid slices (possible without affecting overall
# pruning effectiveness due to handling of missing parents)
pI = (R[,4] >= minSup & R[,2] > 0);
S = removeEmpty(target=S, margin="rows", select=pI)
R = removeEmpty(target=R, margin="rows", select=pI)
# join compatible slices (without self)
join = S %*% t(S) == (level-2)
I = upper.tri(target=join, diag=FALSE, values=TRUE);
# pair construction
nr = nrow(I); nc = ncol(I);
rix = matrix(I * seq(1,nr), nr*nc, 1);
cix = matrix(I * t(seq(1,nc)), nr*nc, 1);
rix = removeEmpty(target=rix, margin="rows");
cix = removeEmpty(target=cix, margin="rows");
P = matrix(0,0,ncol(S))
if( sum(rix)!=0 ) {
P1 = table(seq(1,nrow(rix)), rix, nrow(rix), nrow(S));
P2 = table(seq(1,nrow(cix)), cix, nrow(rix), nrow(S));
P12 = P1 + P2; # combined slice
P = (P1 %*% S + P2 %*% S) != 0;
se = min(P1 %*% R[,2], P2 %*% R[,2])
sm = min(P1 %*% R[,3], P2 %*% R[,3])
ss = min(P1 %*% R[,4], P2 %*% R[,4])
# prune invalid self joins (>1 bit per feature)
I = matrix(1, nrow(P), 1);
for( j in 1:ncol(foffb) ) {
beg = as.scalar(foffb[1,j])+1;
end = as.scalar(foffe[1,j]);
I = I & (rowSums(P[,beg:end]) <= 1);
}
P12 = removeEmpty(target=P12, margin="rows", select=I)
P = removeEmpty(target=P, margin="rows", select=I);
ss = removeEmpty(target=ss, margin="rows", select=I);
se = removeEmpty(target=se, margin="rows", select=I);
sm = removeEmpty(target=sm, margin="rows", select=I);
# prepare IDs for deduplication and pruning
ID = matrix(0, nrow(P), 1);
dom = foffe-foffb+1;
for( j in 1:ncol(dom) ) {
beg = as.scalar(foffb[1,j])+1;
end = as.scalar(foffe[1,j]);
I = rowIndexMax(P[,beg:end]) * rowMaxs(P[,beg:end]);
prod = 1;
if(j<ncol(dom))
prod = prod(dom[1,(j+1):ncol(dom)])
ID = ID + I * prod;
}
# ID transformation to avoid exceeding INT_MAX and
# and to void creating huge sparse intermediates
[ID, M] = transformencode(target=as.frame(ID), spec="{ids:true,recode:[1]}")
# size pruning, with rowMin-rowMax transform
# to avoid densification (ignored zeros)
map = table(ID, seq(1,nrow(P)), max(ID), nrow(P))
ubSizes = 1/rowMaxs(map * (1/t(ss)));
ubSizes = replace(target=ubSizes, pattern=Inf, replacement=0);
fSizes = (ubSizes >= minSup)
# error pruning
ubError = 1/rowMaxs(map * (1/t(se)));
ubError = replace(target=ubError, pattern=Inf, replacement=0);
ubMError = 1/rowMaxs(map * (1/t(sm)));
ubMError = replace(target=ubMError, pattern=Inf, replacement=0);
ubScores = scoreUB(ubSizes, ubError, ubMError, eAvg, minSup, alpha, n2);
[maxsc, minsc] = analyzeTopK(TKC);
fScores = (ubScores > minsc & ubScores > 0)
# missing parents pruning
numParents = rowSums((map %*% P12) != 0)
fParents = (numParents == level);
# apply all pruning
map = map * (fSizes & fScores & fParents);
# deduplication of join outputs
Dedup = removeEmpty(target=map, margin="rows") != 0
P = (Dedup %*% P) != 0
}
}
evalSlice = function(Matrix[Double] X, Matrix[Double] e, Double eAvg,
Matrix[Double] tS, Integer l, Double alpha)
return(Matrix[Double] R)
{
I = (X %*% tS) == l; # slice indicator
ss = t(colSums(I)); # absolute slice size (nnz)
se = t(t(e) %*% I); # absolute slice error
sm = t(colMaxs(I * e)); # maximum tuple error in slice
# score of relative error and relative size
sc = score(ss, se, eAvg, alpha, nrow(X));
R = cbind(sc, se, sm, ss);
}
decodeTopK = function(Matrix[Double] TK, Matrix[Double] foffb, Matrix[Double] foffe)
return(Matrix[Double] TK)
{
R = matrix(1, nrow(TK), ncol(foffb));
if( nrow(TK) > 0 ) {
parfor( j in 1:ncol(foffb) ) {
beg = as.scalar(foffb[1,j])+1;
end = as.scalar(foffe[1,j]);
I = rowSums(TK[,beg:end]) * rowIndexMax(TK[,beg:end]);
R[, j] = I;
}
}
TK = R;
}