| #------------------------------------------------------------- |
| # |
| # Licensed to the Apache Software Foundation (ASF) under one |
| # or more contributor license agreements. See the NOTICE file |
| # distributed with this work for additional information |
| # regarding copyright ownership. The ASF licenses this file |
| # to you under the Apache License, Version 2.0 (the |
| # "License"); you may not use this file except in compliance |
| # with the License. You may obtain a copy of the License at |
| # |
| # http://www.apache.org/licenses/LICENSE-2.0 |
| # |
| # Unless required by applicable law or agreed to in writing, |
| # software distributed under the License is distributed on an |
| # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| # KIND, either express or implied. See the License for the |
| # specific language governing permissions and limitations |
| # under the License. |
| # |
| #------------------------------------------------------------- |
| |
| #------------------------------------------------------------- |
| # X Input matrix (integer encoded [1..v]) |
| # e error vector (classification accuracy, l2 norm, etc) |
| # k top-K subsets / slices |
| # minSup minimum support (min number of rows per slice) |
| # w weight [0,1]: 0 only size, 1 only error |
| # ------------------------------------------------------------ |
| # TK top-k slices (k x ncol(X) if successful) |
| # TKC score, size, error of slices (k x 3) |
| # ------------------------------------------------------------ |
| |
| slicing = function(Matrix[Double] X, Matrix[Double] e, Integer k = 4, Integer minSup = 4, Double w = 0.5, Boolean verbose = FALSE) |
| return(Matrix[Double] TK, Matrix[Double] TKC) |
| { |
| m = nrow(X); |
| n = ncol(X); |
| |
| # prepare offset vectors and one-hot encoded X |
| fdom = colMaxs(X); |
| foffb = t(cumsum(t(fdom))) - fdom; |
| foffe = t(cumsum(t(fdom))) |
| rix = matrix(seq(1,m)%*%matrix(1,1,n), m*n, 1) |
| cix = matrix(X + foffb, m*n, 1); |
| X2 = table(rix, cix); #one-hot encoded |
| |
| # initialize interesting statistics |
| n2 = ncol(X2); # one-hot encoded features |
| eAvg = sum(e) / m; # average error |
| CID = seq(1,n2); # column IDs |
| cCnts = t(colSums(X2)); # column counts |
| err = t(t(e) %*% X2) # total error vector |
| |
| if( verbose ) { |
| drop = as.integer(sum(cCnts < minSup)); |
| print("SliceFinder: dropping "+drop+"/"+n2+" features below minSup = "+minSup+"."); |
| } |
| |
| # working set of active slices (#attr x #slices) and top k |
| selCols = (cCnts >= minSup); |
| attr = removeEmpty(target=CID, margin="rows", select=selCols); |
| cCnts = removeEmpty(target=cCnts, margin="rows", select=selCols); |
| err = removeEmpty(target=err, margin="rows", select=selCols); |
| S = table(seq(1,nrow(attr)), attr, nrow(attr), n2); |
| continue = ncol(S) > 0 & sum(S) > 0; |
| level = 1; |
| |
| # score 1-slices and create initial top-k |
| R = scoreSlices(cCnts, err, m, eAvg, w); |
| |
| [TK, TKC] = maintainTopK(S, R, matrix(0, 0, n2), matrix(0, 0, 3), k, minSup); |
| |
| if( verbose ) { |
| [maxsc, minsc] = analyzeTopK(TKC); |
| print("SliceFinder: initial top-K: count="+nrow(TK)+", max="+maxsc+", min="+minsc) |
| } |
| |
| # lattice enumeration w/ size/error pruning, one iteration per level |
| while( continue ) { |
| level = level + 1; |
| |
| # enumerate candidate join pairs, incl size/error pruning |
| enumC = getPairedCandidates(S, R, TK, TKC, k, level, minSup, foffb, foffe); |
| |
| if(verbose) { |
| print("SliceFinder: level "+level+":") |
| print(" -- generated paired slice candidates: "+nrow(S)+" -> "+nrow(enumC)); |
| } |
| |
| # extract and evaluate candidate slices |
| # note: this could be done as a single matrix multiply, but to avoid |
| # large intermediates we use repeated matrix-vector multiplication |
| R = matrix(0, nrow(enumC), 3) |
| parfor( i in 1:nrow(enumC) ) |
| R[i,] = evalSlice(X2, e, t(enumC[i,]), level, w); |
| |
| # maintain top-k after evaluation |
| [TK, TKC] = maintainTopK(S, R, TK, TKC, k, minSup); |
| |
| # prune slices after evaluation and new top-K |
| # TODO evaluate if useful -> more pruning if at least 1 below threhsold? |
| [S, R] = pruneSlices(enumC, R, TK, TKC, k, minSup); |
| #S = enumC; |
| |
| if(verbose) { |
| [maxsc, minsc] = analyzeTopK(TKC); |
| print(" -- after eval and pruning: "+nrow(S)); |
| print(" -- top-K: count="+nrow(TK)+", max="+maxsc+", min="+minsc); |
| } |
| |
| # termination condition (max #feature levels) |
| continue = ncol(S) > 0 & sum(S) > 0 & level < n; |
| } |
| |
| if( verbose ) { |
| print(sum(TK)); |
| print("SliceFinder: terminated at level "+level+":\n" |
| + toString(TK) + "\n" + toString(TKC)); |
| } |
| } |
| |
| maintainTopK = function(Matrix[Double] S, Matrix[Double] R, Matrix[Double] TK, Matrix[Double] TKC, Integer k, Integer minSup) |
| return(Matrix[Double] TK, Matrix[Double] TKC) |
| { |
| # prune invalid minSup and scores |
| I = (R[,1] > 1) & (R[,3] >= minSup); |
| |
| if( sum(I)!=0 ) { |
| S = removeEmpty(target=S, margin="rows", select=I); |
| R = removeEmpty(target=R, margin="rows", select=I); |
| |
| # evaluated candidated and previous top-k |
| slices = rbind(TK, S); |
| scores = rbind(TKC, R); |
| |
| # extract top-k |
| IX = order(target=scores, by=1, decreasing=TRUE, index.return=TRUE); |
| IX = IX[1:min(k,nrow(IX)),]; |
| P = table(seq(1,nrow(IX)), IX, nrow(IX), nrow(slices)); |
| TK = P %*% slices; |
| TKC = P %*% scores; |
| } |
| } |
| |
| analyzeTopK = function(Matrix[Double] TKC) return(Double maxsc, Double minsc) { |
| maxsc = ifelse(nrow(TKC)>0, as.scalar(TKC[1,1]), NaN); |
| minsc = ifelse(nrow(TKC)>0, as.scalar(TKC[nrow(TKC),1]), NaN); |
| } |
| |
| |
| scoreSlices = function(Matrix[Double] ss, Matrix[Double] se, Integer m, Double eAvg, Double w) |
| return(Matrix[Double] C) |
| { |
| sc = w * (se/ss / eAvg) + (1-w) * ss/m; |
| C = cbind(sc, se, ss); |
| } |
| |
| evalSlice = function(Matrix[Double] X, Matrix[Double] e, Matrix[Double] s, Integer l, Double w = 0.5) |
| return(Matrix[Double] C) |
| { |
| I = (X %*% s) == l; # slice indicator |
| ss = sum(I); # absolute slice size (nnz) |
| se = as.scalar(t(I) %*% e); # absolute slice error |
| |
| # score of relative error and relative size |
| sc = w * (se/ss / sum(e)/nrow(X)) + (1-w) * ss/nrow(X); |
| C = t(as.matrix(list(sc, se, ss))); |
| } |
| |
| getPairedCandidates = function(Matrix[Double] S, Matrix[Double] R, Matrix[Double] TK, Matrix[Double] TKC, Integer k, Integer level, Integer minSup, Matrix[Double] foffb, Matrix[Double] foffe) |
| return(Matrix[Double] P) |
| { |
| while(FALSE){} |
| # join compatible slices (without self) |
| join = S %*% t(S) == (level-2) |
| |
| # pruning by size (at least one below threshold) |
| vsize = outer(R[,3], t(R[,3]), "min") >= minSup; |
| I = join * vsize; |
| I = upper.tri(target=I, diag=FALSE); |
| |
| # pair construction |
| nr = nrow(I); nc = ncol(I); |
| rix = matrix(I * seq(1,nr), nr*nc, 1); |
| cix = matrix(I * t(seq(1,nc)), nr*nc, 1); |
| rix = removeEmpty(target=rix, margin="rows"); |
| cix = removeEmpty(target=cix, margin="rows"); |
| P1 = table(seq(1,nrow(rix)), rix, nrow(rix), nrow(S)); |
| P2 = table(seq(1,nrow(cix)), cix, nrow(rix), nrow(S)); |
| P = ((P1 %*% S) + (P2 %*% S)) != 0; |
| |
| # prune invalid self joins (>1 bit per feature) |
| I = matrix(1, nrow(P), 1); |
| for( j in 1:ncol(foffb) ) { |
| beg = as.scalar(foffb[1,j])+1; |
| end = as.scalar(foffe[1,j]); |
| I = I & (rowSums(P[,beg:end]) <= 1); |
| } |
| P = removeEmpty(target=P, margin="rows", select=I); |
| |
| # deduplication |
| # TODO additional size pruning given dedup mapping |
| ID = matrix(1, nrow(P), 1); |
| dom = foffe-foffb; |
| for( j in 1:ncol(dom) ) { |
| beg = as.scalar(foffb[1,j])+1; |
| end = as.scalar(foffe[1,j]); |
| I = rowIndexMax(P[,beg:end]); |
| prod = 1; |
| if(j<ncol(dom)) |
| prod = prod(dom[1,(j+1):ncol(dom)]) |
| ID = ID + I * prod; |
| } |
| Dedup = removeEmpty(target=table(ID,seq(1,nrow(P))), margin="rows") != 0 |
| P = Dedup %*% P |
| } |
| |
| pruneSlices = function(Matrix[Double] S, Matrix[Double] R, Matrix[Double] TK, Matrix[Double] TKC, Integer k, Integer minSup) |
| return(Matrix[Double] S, Matrix[Double] R) |
| { |
| I = R[,3] >= minSup; |
| S = removeEmpty(target=S, margin="rows", select=I); |
| R = removeEmpty(target=R, margin="rows", select=I); |
| } |
| |
| Forig = read("./Salaries.csv", data_type="frame", format="csv", header=TRUE); |
| |
| F = Forig[,1:ncol(Forig)-1]; |
| y = as.matrix(Forig[,ncol(Forig)]); |
| |
| # data preparation |
| jspec= "{ ids:true, recode:[1,2,3,6], bin:[{id:4, method:equi-width, numbins:14},{id:5, method:equi-width, numbins:12}]}" |
| [X,M] = transformencode(target=F, spec=jspec); |
| X = X[,2:ncol(X)] |
| |
| # learn model |
| B = lm(X=X, y=y, verbose=FALSE); |
| yhat = X %*% B; |
| e = (y-yhat)^2; |
| |
| # call slice finding |
| [S,C] = slicing(X=X, e=e, k=4, w=0.5, minSup=4, verbose=TRUE); |