| #------------------------------------------------------------- |
| # |
| # Licensed to the Apache Software Foundation (ASF) under one |
| # or more contributor license agreements. See the NOTICE file |
| # distributed with this work for additional information |
| # regarding copyright ownership. The ASF licenses this file |
| # to you under the Apache License, Version 2.0 (the |
| # "License"); you may not use this file except in compliance |
| # with the License. You may obtain a copy of the License at |
| # |
| # http://www.apache.org/licenses/LICENSE-2.0 |
| # |
| # Unless required by applicable law or agreed to in writing, |
| # software distributed under the License is distributed on an |
| # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| # KIND, either express or implied. See the License for the |
| # specific language governing permissions and limitations |
| # under the License. |
| # |
| #------------------------------------------------------------- |
| |
| #------------------------------------------------------------- |
| # X Input matrix (integer encoded [1..v]) |
| # e error vector (classification accuracy, l2 norm, etc) |
| # k top-K subsets / slices |
| # maxL maximum level L (conjunctions of L predicates), 0 unlimited |
| # minSup minimum support (min number of rows per slice) |
| # alpha weight [0,1]: 0 only size, 1 only error |
| # dpEval flag for data-parallel slice evaluation, |
| # otherwise task-parallel |
| # verbose flag for verbose debug output |
| # ------------------------------------------------------------ |
| # TK top-k slices (k x ncol(X) if successful) |
| # TKC score, size, error of slices (k x 3) |
| # ------------------------------------------------------------ |
| |
| m_slicefinder = function(Matrix[Double] X, Matrix[Double] e, |
| Integer k = 4, Integer maxL = 0, Integer minSup = 32, Double alpha = 0.5, |
| Boolean dpEval = FALSE, Boolean verbose = FALSE) |
| return(Matrix[Double] TK, Matrix[Double] TKC) |
| { |
| m = nrow(X); |
| n = ncol(X); |
| |
| # prepare offset vectors and one-hot encoded X |
| fdom = colMaxs(X); |
| foffb = t(cumsum(t(fdom))) - fdom; |
| foffe = t(cumsum(t(fdom))) |
| rix = matrix(seq(1,m)%*%matrix(1,1,n), m*n, 1) |
| cix = matrix(X + foffb, m*n, 1); |
| X2 = table(rix, cix); #one-hot encoded |
| |
| # initialize statistics and basic slices |
| n2 = ncol(X2); # one-hot encoded features |
| eAvg = sum(e) / m; # average error |
| [S, R] = createAndScoreBasicSlices(X2, e, eAvg, minSup, alpha, verbose); |
| |
| # initialize top-k |
| [TK, TKC] = maintainTopK(S, R, matrix(0,0,n2), matrix(0,0,4), k, minSup); |
| |
| if( verbose ) { |
| [maxsc, minsc] = analyzeTopK(TKC); |
| print("SliceFinder: initial top-K: count="+nrow(TK)+", max="+maxsc+", min="+minsc) |
| } |
| |
| # lattice enumeration w/ size/error pruning, one iteration per level |
| # termination condition (max #feature levels) |
| maxL = ifelse(maxL<=0, n, maxL) |
| level = 1; |
| while( nrow(S) > 0 & sum(S) > 0 & level < n & level < maxL ) { |
| level = level + 1; |
| |
| # enumerate candidate join pairs, incl size/error pruning |
| nrS = nrow(S); |
| S = getPairedCandidates(S, R, TK, TKC, k, level, eAvg, minSup, alpha, n2, foffb, foffe); |
| |
| if(verbose) { |
| print("\nSliceFinder: level "+level+":") |
| print(" -- generated paired slice candidates: "+nrS+" -> "+nrow(S)); |
| } |
| |
| # extract and evaluate candidate slices |
| if( dpEval ) { #data-parallel |
| R = evalSlice(X2, e, eAvg, t(S), level, alpha); |
| } |
| else { # task-parallel |
| R = matrix(0, nrow(S), 4) |
| parfor( i in 1:nrow(S) ) |
| R[i,] = evalSlice(X2, e, eAvg, t(S[i,]), level, alpha); |
| } |
| |
| # maintain top-k after evaluation |
| [TK, TKC] = maintainTopK(S, R, TK, TKC, k, minSup); |
| |
| if(verbose) { |
| [maxsc, minsc] = analyzeTopK(TKC); |
| valid = as.integer(sum(R[,4]>=minSup)); |
| print(" -- valid slices after eval: "+valid+"/"+nrow(S)); |
| print(" -- top-K: count="+nrow(TK)+", max="+maxsc+", min="+minsc); |
| } |
| } |
| |
| TK = decodeTopK(TK, foffb, foffe); |
| |
| if( verbose ) { |
| print("SliceFinder: terminated at level "+level+":\n" |
| + toString(TK) + "\n" + toString(TKC)); |
| } |
| } |
| |
| createAndScoreBasicSlices = function(Matrix[Double] X2, Matrix[Double] e, |
| Double eAvg, Double minSup, Double alpha, Boolean verbose) |
| return(Matrix[Double] S, Matrix[Double] R) |
| { |
| n2 = ncol(X2); |
| cCnts = t(colSums(X2)); # column counts |
| err = t(t(e) %*% X2); # total error vector |
| merr = t(colMaxs(X2 * e)); # maximum error vector |
| |
| if( verbose ) { |
| drop = as.integer(sum(cCnts < minSup)); |
| print("SliceFinder: dropping "+drop+"/"+n2+" features below minSup = "+minSup+"."); |
| } |
| |
| # working set of active slices (#attr x #slices) and top k |
| selCols = (cCnts >= minSup); |
| attr = removeEmpty(target=seq(1,n2), margin="rows", select=selCols); |
| ss = removeEmpty(target=cCnts, margin="rows", select=selCols); |
| se = removeEmpty(target=err, margin="rows", select=selCols); |
| sm = removeEmpty(target=merr, margin="rows", select=selCols); |
| S = table(seq(1,nrow(attr)), attr, nrow(attr), n2); |
| |
| # score 1-slices and create initial top-k |
| sc = score(ss, se, eAvg, alpha, nrow(X2)); |
| R = cbind(sc, se, sm, ss); |
| } |
| |
| score = function(Matrix[Double] ss, Matrix[Double] se, Double eAvg, Double alpha, Integer n) |
| return(Matrix[Double] sc) |
| { |
| sc = alpha * ((se/ss) / eAvg - 1) - (1-alpha) * (n/ss - 1); |
| sc = replace(target=sc, pattern=NaN, replacement=-Inf); |
| } |
| |
| scoreUB = function(Matrix[Double] ss, Matrix[Double] se, Matrix[Double] sm, |
| Double eAvg, Integer minSup, Double alpha, Integer n) |
| return(Matrix[Double] sc) |
| { |
| # Initial upper bound equation (with minSup and ss in pos/neg terms) |
| # sc = alpha * ((se/minSup) / eAvg - 1) - (1-alpha) * (n/ss - 1); |
| |
| # Since sc is either monotonically increasing or decreasing, we |
| # probe interesting points of sc in the interval [minSup, ss], |
| # and compute the maximum to serve as the upper bound |
| s = cbind(matrix(minSup,nrow(ss),1), max(se/sm,minSup), ss) |
| sc = rowMaxs(alpha * ((min(s*sm,se)/s) / eAvg - 1) - (1-alpha) * (1/s*n - 1)); |
| sc = replace(target=sc, pattern=NaN, replacement=-Inf); |
| } |
| |
| |
| maintainTopK = function(Matrix[Double] S, Matrix[Double] R, |
| Matrix[Double] TK, Matrix[Double] TKC, Integer k, Integer minSup) |
| return(Matrix[Double] TK, Matrix[Double] TKC) |
| { |
| # prune invalid minSup and scores |
| I = (R[,1] > 0) & (R[,4] >= minSup); |
| |
| if( sum(I)!=0 ) { |
| S = removeEmpty(target=S, margin="rows", select=I); |
| R = removeEmpty(target=R, margin="rows", select=I); |
| |
| # evaluated candidated and previous top-k |
| slices = rbind(TK, S); |
| scores = rbind(TKC, R); |
| |
| # extract top-k |
| IX = order(target=scores, by=1, decreasing=TRUE, index.return=TRUE); |
| IX = IX[1:min(k,nrow(IX)),]; |
| P = table(seq(1,nrow(IX)), IX, nrow(IX), nrow(slices)); |
| TK = P %*% slices; |
| TKC = P %*% scores; |
| } |
| } |
| |
| analyzeTopK = function(Matrix[Double] TKC) return(Double maxsc, Double minsc) { |
| maxsc = -Inf; |
| minsc = -Inf; |
| if( nrow(TKC)>0 ) { |
| maxsc = as.scalar(TKC[1,1]); |
| minsc = as.scalar(TKC[nrow(TKC),1]); |
| } |
| } |
| |
| getPairedCandidates = function(Matrix[Double] S, Matrix[Double] R, |
| Matrix[Double] TK, Matrix[Double] TKC, Integer k, Integer level, |
| Double eAvg, Integer minSup, Double alpha, Integer n2, |
| Matrix[Double] foffb, Matrix[Double] foffe) |
| return(Matrix[Double] P) |
| { |
| # prune invalid slices (possible without affecting overall |
| # pruning effectiveness due to handling of missing parents) |
| pI = (R[,4] >= minSup); |
| S = removeEmpty(target=S, margin="rows", select=pI) |
| R = removeEmpty(target=R, margin="rows", select=pI) |
| |
| # join compatible slices (without self) |
| join = S %*% t(S) == (level-2) |
| I = upper.tri(target=join, diag=FALSE, values=TRUE); |
| |
| # pair construction |
| nr = nrow(I); nc = ncol(I); |
| rix = matrix(I * seq(1,nr), nr*nc, 1); |
| cix = matrix(I * t(seq(1,nc)), nr*nc, 1); |
| rix = removeEmpty(target=rix, margin="rows"); |
| cix = removeEmpty(target=cix, margin="rows"); |
| |
| P = matrix(0,0,ncol(S)) |
| if( sum(rix)!=0 ) { |
| P1 = table(seq(1,nrow(rix)), rix, nrow(rix), nrow(S)); |
| P2 = table(seq(1,nrow(cix)), cix, nrow(rix), nrow(S)); |
| P12 = P1 + P2; # combined slice |
| P = (P12 %*% S) != 0; |
| se = min(P1 %*% R[,2], P2 %*% R[,2]) |
| sm = min(P1 %*% R[,3], P2 %*% R[,3]) |
| ss = min(P1 %*% R[,4], P2 %*% R[,4]) |
| |
| # prune invalid self joins (>1 bit per feature) |
| I = matrix(1, nrow(P), 1); |
| for( j in 1:ncol(foffb) ) { |
| beg = as.scalar(foffb[1,j])+1; |
| end = as.scalar(foffe[1,j]); |
| I = I & (rowSums(P[,beg:end]) <= 1); |
| } |
| P12 = removeEmpty(target=P12, margin="rows", select=I) |
| P = removeEmpty(target=P, margin="rows", select=I); |
| ss = removeEmpty(target=ss, margin="rows", select=I); |
| se = removeEmpty(target=se, margin="rows", select=I); |
| sm = removeEmpty(target=sm, margin="rows", select=I); |
| |
| # prepare IDs for deduplication and pruning |
| ID = matrix(0, nrow(P), 1); |
| dom = foffe-foffb+1; |
| for( j in 1:ncol(dom) ) { |
| beg = as.scalar(foffb[1,j])+1; |
| end = as.scalar(foffe[1,j]); |
| I = rowIndexMax(P[,beg:end]) * rowMaxs(P[,beg:end]); |
| prod = 1; |
| if(j<ncol(dom)) |
| prod = prod(dom[1,(j+1):ncol(dom)]) |
| ID = ID + I * prod; |
| } |
| |
| # ID transformation to avoid exceeding INT_MAX and |
| # and to void creating huge sparse intermediates |
| [ID, M] = transformencode(target=as.frame(ID), spec="{ids:true,recode:[1]}") |
| |
| # size pruning, with rowMin-rowMax transform |
| # to avoid densification (ignored zeros) |
| map = table(ID, seq(1,nrow(P)), max(ID), nrow(P)) |
| ubSizes = 1/rowMaxs(map * (1/t(ss))); |
| ubSizes = replace(target=ubSizes, pattern=Inf, replacement=0); |
| fSizes = (ubSizes >= minSup) |
| |
| # error pruning |
| ubError = 1/rowMaxs(map * (1/t(se))); |
| ubError = replace(target=ubError, pattern=Inf, replacement=0); |
| ubMError = 1/rowMaxs(map * (1/t(sm))); |
| ubMError = replace(target=ubMError, pattern=Inf, replacement=0); |
| ubScores = scoreUB(ubSizes, ubError, ubMError, eAvg, minSup, alpha, n2); |
| [maxsc, minsc] = analyzeTopK(TKC); |
| fScores = (ubScores > minsc & ubScores > 0) |
| |
| # missing parents pruning |
| numParents = rowSums((map %*% P12) != 0) |
| fParents = (numParents == level); |
| |
| # apply all pruning |
| map = map * (fSizes & fScores & fParents); |
| |
| # deduplication of join outputs |
| Dedup = removeEmpty(target=map, margin="rows") != 0 |
| P = (Dedup %*% P) != 0 |
| } |
| } |
| |
| evalSlice = function(Matrix[Double] X, Matrix[Double] e, Double eAvg, |
| Matrix[Double] tS, Integer l, Double alpha) |
| return(Matrix[Double] R) |
| { |
| I = (X %*% tS) == l; # slice indicator |
| ss = t(colSums(I)); # absolute slice size (nnz) |
| se = t(t(e) %*% I); # absolute slice error |
| sm = t(colMaxs(I * e)); # maximum tuple error in slice |
| |
| # score of relative error and relative size |
| sc = score(ss, se, eAvg, alpha, nrow(X)); |
| R = cbind(sc, se, sm, ss); |
| } |
| |
| decodeTopK = function(Matrix[Double] TK, Matrix[Double] foffb, Matrix[Double] foffe) |
| return(Matrix[Double] TK) |
| { |
| R = matrix(1, nrow(TK), ncol(foffb)); |
| if( nrow(TK) > 0 ) { |
| parfor( j in 1:ncol(foffb) ) { |
| beg = as.scalar(foffb[1,j])+1; |
| end = as.scalar(foffe[1,j]); |
| I = rowSums(TK[,beg:end]) * rowIndexMax(TK[,beg:end]); |
| R[, j] = I; |
| } |
| } |
| TK = R; |
| } |