blob: 2c9efb7898cf5ee7995b1f3c25747a10eb733555 [file] [log] [blame]
#-------------------------------------------------------------
#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
#
#-------------------------------------------------------------
#-------------------------------------------------------------
# X Input matrix (integer encoded [1..v])
# e error vector (classification accuracy, l2 norm, etc)
# k top-K subsets / slices
# minSup minimum support (min number of rows per slice)
# alpha weight [0,1]: 0 only size, 1 only error
# dpEval flag for data-parallel slice evaluation,
# otherwise task-parallel
# verbose flag for verbose debug output
# ------------------------------------------------------------
# TK top-k slices (k x ncol(X) if successful)
# TKC score, size, error of slices (k x 3)
# ------------------------------------------------------------
slicing = function(Matrix[Double] X, Matrix[Double] e, Integer k = 4, Integer minSup = 32, Double alpha = 0.5, Boolean dpEval = FALSE, Boolean verbose = FALSE)
return(Matrix[Double] TK, Matrix[Double] TKC)
{
m = nrow(X);
n = ncol(X);
# prepare offset vectors and one-hot encoded X
fdom = colMaxs(X);
foffb = t(cumsum(t(fdom))) - fdom;
foffe = t(cumsum(t(fdom)))
rix = matrix(seq(1,m)%*%matrix(1,1,n), m*n, 1)
cix = matrix(X + foffb, m*n, 1);
X2 = table(rix, cix); #one-hot encoded
# initialize statistics and basic slices
n2 = ncol(X2); # one-hot encoded features
eAvg = sum(e) / m; # average error
[S, R] = createAndScoreBasicSlices(X2, e, eAvg, minSup, alpha, verbose);
# initialize top-k
[TK, TKC] = maintainTopK(S, R, matrix(0, 0, n2), matrix(0, 0, 3), k, minSup);
if( verbose ) {
[maxsc, minsc] = analyzeTopK(TKC);
print("SliceFinder: initial top-K: count="+nrow(TK)+", max="+maxsc+", min="+minsc)
}
# lattice enumeration w/ size/error pruning, one iteration per level
# termination condition (max #feature levels)
level = 1;
while( nrow(S) > 0 & sum(S) > 0 & level < n ) {
level = level + 1;
# enumerate candidate join pairs, incl size/error pruning
nrS = nrow(S);
S = getPairedCandidates(S, R, TK, TKC, k, level, eAvg, minSup, alpha, n2, foffb, foffe);
if(verbose) {
print("\nSliceFinder: level "+level+":")
print(" -- generated paired slice candidates: "+nrS+" -> "+nrow(S));
}
# extract and evaluate candidate slices
if( dpEval ) { #data-parallel
R = evalSlice(X2, e, eAvg, t(S), level, alpha);
}
else { # task-parallel
R = matrix(0, nrow(S), 3)
parfor( i in 1:nrow(S) )
R[i,] = evalSlice(X2, e, eAvg, t(S[i,]), level, alpha);
}
# maintain top-k after evaluation
[TK, TKC] = maintainTopK(S, R, TK, TKC, k, minSup);
if(verbose) {
[maxsc, minsc] = analyzeTopK(TKC);
valid = as.integer(sum(R[,3]>=minSup));
print(" -- valid slices after eval: "+valid+"/"+nrow(S));
print(" -- top-K: count="+nrow(TK)+", max="+maxsc+", min="+minsc);
}
}
TK = decodeTopK(TK, foffb, foffe);
if( verbose ) {
print("SliceFinder: terminated at level "+level+":\n"
+ toString(TK) + "\n" + toString(TKC));
}
}
createAndScoreBasicSlices = function(Matrix[Double] X2, Matrix[Double] e, Double eAvg,
Double minSup, Double alpha, Boolean verbose)
return(Matrix[Double] S, Matrix[Double] R)
{
n2 = ncol(X2);
cCnts = t(colSums(X2)); # column counts
err = t(t(e) %*% X2) # total error vector
if( verbose ) {
drop = as.integer(sum(cCnts < minSup));
print("SliceFinder: dropping "+drop+"/"+n2+" features below minSup = "+minSup+".");
}
# working set of active slices (#attr x #slices) and top k
selCols = (cCnts >= minSup);
attr = removeEmpty(target=seq(1,n2), margin="rows", select=selCols);
ss = removeEmpty(target=cCnts, margin="rows", select=selCols);
se = removeEmpty(target=err, margin="rows", select=selCols);
S = table(seq(1,nrow(attr)), attr, nrow(attr), n2);
# score 1-slices and create initial top-k
sc = score(ss, se, eAvg, alpha, nrow(X2));
R = cbind(sc, se, ss);
}
score = function(Matrix[Double] ss, Matrix[Double] se, Double eAvg, Double alpha, Integer n)
return(Matrix[Double] sc)
{
sc = alpha * ((se/ss) / eAvg - 1) - (1-alpha) * (n/ss - 1);
}
scoreUB = function(Matrix[Double] ss, Matrix[Double] se, Double eAvg, Integer minSup, Double alpha, Integer n)
return(Matrix[Double] sc)
{
sc = alpha * ((se/minSup) / eAvg - 1) - (1-alpha) * (n/ss - 1);
}
maintainTopK = function(Matrix[Double] S, Matrix[Double] R, Matrix[Double] TK, Matrix[Double] TKC, Integer k, Integer minSup)
return(Matrix[Double] TK, Matrix[Double] TKC)
{
# prune invalid minSup and scores
I = (R[,1] > 0) & (R[,3] >= minSup);
if( sum(I)!=0 ) {
S = removeEmpty(target=S, margin="rows", select=I);
R = removeEmpty(target=R, margin="rows", select=I);
# evaluated candidated and previous top-k
slices = rbind(TK, S);
scores = rbind(TKC, R);
# extract top-k
IX = order(target=scores, by=1, decreasing=TRUE, index.return=TRUE);
IX = IX[1:min(k,nrow(IX)),];
P = table(seq(1,nrow(IX)), IX, nrow(IX), nrow(slices));
TK = P %*% slices;
TKC = P %*% scores;
}
}
analyzeTopK = function(Matrix[Double] TKC) return(Double maxsc, Double minsc) {
maxsc = -Inf;
minsc = -Inf;
if( nrow(TKC)>0 ) {
maxsc = as.scalar(TKC[1,1]);
minsc = as.scalar(TKC[nrow(TKC),1]);
}
}
getPairedCandidates = function(Matrix[Double] S, Matrix[Double] R, Matrix[Double] TK,
Matrix[Double] TKC, Integer k, Integer level, Double eAvg, Integer minSup, Double alpha,
Integer n2, Matrix[Double] foffb, Matrix[Double] foffe)
return(Matrix[Double] P)
{
# join compatible slices (without self)
join = S %*% t(S) == (level-2)
I = upper.tri(target=join, diag=FALSE);
# pair construction
nr = nrow(I); nc = ncol(I);
rix = matrix(I * seq(1,nr), nr*nc, 1);
cix = matrix(I * t(seq(1,nc)), nr*nc, 1);
rix = removeEmpty(target=rix, margin="rows");
cix = removeEmpty(target=cix, margin="rows");
P = matrix(0,0,ncol(S))
if( sum(rix)!=0 ) {
P1 = table(seq(1,nrow(rix)), rix, nrow(rix), nrow(S));
P2 = table(seq(1,nrow(cix)), cix, nrow(rix), nrow(S));
P12 = (P1 + P2);
P = ((P1 %*% S) + (P2 %*% S)) != 0;
ss = min(P1 %*% R[,3], P2 %*% R[,3])
se = min(P1 %*% R[,2], P2 %*% R[,2])
# prune invalid self joins (>1 bit per feature)
I = matrix(1, nrow(P), 1);
for( j in 1:ncol(foffb) ) {
beg = as.scalar(foffb[1,j])+1;
end = as.scalar(foffe[1,j]);
I = I & (rowSums(P[,beg:end]) <= 1);
}
P = removeEmpty(target=P, margin="rows", select=I);
P12 = removeEmpty(target=P12, margin="rows", select=I)
ss = removeEmpty(target=ss, margin="rows", select=I);
se = removeEmpty(target=se, margin="rows", select=I);
# prepare IDs for deduplication and pruning
ID = matrix(0, nrow(P), 1);
dom = foffe-foffb+1;
for( j in 1:ncol(dom) ) {
beg = as.scalar(foffb[1,j])+1;
end = as.scalar(foffe[1,j]);
I = rowIndexMax(P[,beg:end]) * rowMaxs(P[,beg:end]);
prod = 1;
if(j<ncol(dom))
prod = prod(dom[1,(j+1):ncol(dom)])
ID = ID + I * prod;
}
# size pruning, with rowMin-rowMax transform
# to avoid densification (ignored zeros)
map = table(ID,seq(1,nrow(P)))
ubSizes = 1/rowMaxs(map * (1/t(ss)));
ubSizes = replace(target=ubSizes, pattern=Inf, replacement=0);
fSizes = (ubSizes >= minSup)
# error pruning
ubError = 1/rowMaxs(map * (1/t(se)));
ubError = replace(target=ubError, pattern=Inf, replacement=0);
ubScores = scoreUB(ubSizes, ubError, eAvg, minSup, alpha, n2);
[maxsc, minsc] = analyzeTopK(TKC);
fScores = (ubScores > minsc & ubScores > 0)
# missing parents pruning
numParents = rowSums((map %*% P12) != 0)
fParents = (numParents == level);
# apply all pruning
map = map * (fSizes & fScores & fParents);
# deduplication of join outputs
Dedup = removeEmpty(target=map, margin="rows") != 0
P = (Dedup %*% P) != 0
}
}
evalSlice = function(Matrix[Double] X, Matrix[Double] e, Double eAvg, Matrix[Double] tS, Integer l, Double alpha)
return(Matrix[Double] R)
{
I = (X %*% tS) == l; # slice indicator
ss = t(colSums(I)); # absolute slice size (nnz)
se = t(t(e) %*% I); # absolute slice error
# score of relative error and relative size
sc = score(ss, se, eAvg, alpha, nrow(X));
R = cbind(sc, se, ss);
}
decodeTopK = function(Matrix[Double] TK, Matrix[Double] foffb, Matrix[Double] foffe)
return(Matrix[Double] TK)
{
R = matrix(1, nrow(TK), ncol(foffb));
if( nrow(TK) > 0 ) {
parfor( j in 1:ncol(foffb) ) {
beg = as.scalar(foffb[1,j])+1;
end = as.scalar(foffe[1,j]);
I = rowSums(TK[,beg:end]) * rowIndexMax(TK[,beg:end]);
R[, j] = I;
}
}
TK = R;
}
Forig = read("./Salaries.csv", data_type="frame", format="csv", header=TRUE);
F = Forig[,1:ncol(Forig)-1];
y = as.matrix(Forig[,ncol(Forig)]);
# data preparation
jspec= "{ ids:true, recode:[1,2,3,6], bin:[{id:4, method:equi-width, numbins:14},{id:5, method:equi-width, numbins:12}]}"
[X,M] = transformencode(target=F, spec=jspec);
X = X[,2:ncol(X)]
#X = rbind(X,X)
#X = cbind(X,X)
#y = rbind(y,y)
# learn model
B = lm(X=X, y=y, verbose=FALSE);
yhat = X %*% B;
e = (y-yhat)^2;
# call slice finding
[S,C] = slicing(X=X, e=e, k=10, alpha=0.95, minSup=4, verbose=TRUE);