| /* |
| * Licensed to the Apache Software Foundation (ASF) under one |
| * or more contributor license agreements. See the NOTICE file |
| * distributed with this work for additional information |
| * regarding copyright ownership. The ASF licenses this file |
| * to you under the Apache License, Version 2.0 (the |
| * "License"); you may not use this file except in compliance |
| * with the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, |
| * software distributed under the License is distributed on an |
| * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| * KIND, either express or implied. See the License for the |
| * specific language governing permissions and limitations |
| * under the License. |
| */ |
| |
| package org.apache.sysds.runtime.codegen; |
| |
| import java.util.ArrayList; |
| import java.util.Arrays; |
| import java.util.List; |
| import java.util.concurrent.Callable; |
| import java.util.concurrent.ExecutorService; |
| import java.util.concurrent.Future; |
| |
| import org.apache.sysds.hops.OptimizerUtils; |
| import org.apache.sysds.runtime.DMLRuntimeException; |
| import org.apache.sysds.runtime.data.DenseBlock; |
| import org.apache.sysds.runtime.data.SparseBlock; |
| import org.apache.sysds.runtime.instructions.cp.DoubleObject; |
| import org.apache.sysds.runtime.instructions.cp.ScalarObject; |
| import org.apache.sysds.runtime.matrix.data.MatrixBlock; |
| import org.apache.sysds.runtime.util.CommonThreadPool; |
| import org.apache.sysds.runtime.util.UtilFunctions; |
| |
| public abstract class SpoofOuterProduct extends SpoofOperator |
| { |
| private static final long serialVersionUID = 2948612259863710279L; |
| |
| private static final int L2_CACHESIZE = 256 * 1024; //256KB (common size) |
| |
| public enum OutProdType { |
| LEFT_OUTER_PRODUCT, |
| RIGHT_OUTER_PRODUCT, |
| CELLWISE_OUTER_PRODUCT, // (e.g., X*log(sigmoid(-(U%*%t(V))))) ) |
| AGG_OUTER_PRODUCT // (e.g.,sum(X*log(U%*%t(V)+eps))) ) |
| } |
| |
| protected OutProdType _outerProductType; |
| |
| public SpoofOuterProduct(OutProdType type) { |
| setOuterProdType(type); |
| } |
| |
| public void setOuterProdType(OutProdType type) { |
| _outerProductType = type; |
| } |
| |
| public OutProdType getOuterProdType() { |
| return _outerProductType; |
| } |
| |
| @Override |
| public String getSpoofType() { |
| return "OP" + getClass().getName().split("\\.")[1]; |
| } |
| |
| @Override |
| public ScalarObject execute(ArrayList<MatrixBlock> inputs, ArrayList<ScalarObject> scalarObjects) |
| { |
| //sanity check |
| if( inputs==null || inputs.size() < 3 ) |
| throw new RuntimeException("Invalid input arguments."); |
| if( inputs.get(0).isEmptyBlock(false) ) |
| return new DoubleObject(0); |
| |
| //input preparation |
| DenseBlock[] ab = getDenseMatrices(prepInputMatrices(inputs, 1, 2, true, false)); |
| SideInput[] b = prepInputMatrices(inputs, 3, false); |
| double[] scalars = prepInputScalars(scalarObjects); |
| |
| //core sequential execute |
| final int m = inputs.get(0).getNumRows(); |
| final int n = inputs.get(0).getNumColumns(); |
| final int k = inputs.get(1).getNumColumns(); // rank |
| |
| MatrixBlock a = inputs.get(0); |
| MatrixBlock out = new MatrixBlock(1, 1, false); |
| out.allocateDenseBlock(); |
| |
| if( !a.isInSparseFormat() ) |
| executeCellwiseDense(a.getDenseBlock(), ab[0], ab[1], b, scalars, out.getDenseBlock(), m, n, k, _outerProductType, 0, m, 0, n); |
| else |
| executeCellwiseSparse(a.getSparseBlock(), ab[0], ab[1], b, scalars, out, m, n, k, a.getNonZeros(), _outerProductType, 0, m, 0, n); |
| return new DoubleObject(out.getDenseBlock().get(0, 0)); |
| } |
| |
| @Override |
| public ScalarObject execute(ArrayList<MatrixBlock> inputs, ArrayList<ScalarObject> scalarObjects, int numThreads) |
| { |
| //sanity check |
| if( inputs==null || inputs.size() < 3 ) |
| throw new RuntimeException("Invalid input arguments."); |
| if( inputs.get(0).isEmptyBlock(false) ) |
| return new DoubleObject(0); |
| |
| if( 2*inputs.get(0).getNonZeros()*inputs.get(1).getNumColumns() < PAR_MINFLOP_THRESHOLD ) |
| return execute(inputs, scalarObjects); //sequential |
| |
| //input preparation |
| DenseBlock[] ab = getDenseMatrices(prepInputMatrices(inputs, 1, 2, true, false)); |
| SideInput[] b = prepInputMatrices(inputs, 3, false); |
| double[] scalars = prepInputScalars(scalarObjects); |
| |
| //core sequential execute |
| final int m = inputs.get(0).getNumRows(); |
| final int n = inputs.get(0).getNumColumns(); |
| final int k = inputs.get(1).getNumColumns(); // rank |
| final long nnz = inputs.get(0).getNonZeros(); |
| double sum = 0; |
| |
| try |
| { |
| ExecutorService pool = CommonThreadPool.get(k); |
| ArrayList<ParOuterProdAggTask> tasks = new ArrayList<>(); |
| int numThreads2 = getPreferredNumberOfTasks(m, n, nnz, k, numThreads); |
| int blklen = (int)(Math.ceil((double)m/numThreads2)); |
| for( int i=0; i<numThreads2 & i*blklen<m; i++ ) |
| tasks.add(new ParOuterProdAggTask(inputs.get(0), ab[0], ab[1], b, scalars, |
| m, n, k, _outerProductType, i*blklen, Math.min((i+1)*blklen,m), 0, n)); |
| //execute tasks |
| List<Future<Double>> taskret = pool.invokeAll(tasks); |
| pool.shutdown(); |
| for( Future<Double> task : taskret ) |
| sum += task.get(); |
| } |
| catch (Exception e) { |
| throw new DMLRuntimeException(e); |
| } |
| |
| return new DoubleObject(sum); |
| } |
| |
| @Override |
| public MatrixBlock execute(ArrayList<MatrixBlock> inputs, ArrayList<ScalarObject> scalarObjects, MatrixBlock out) |
| { |
| //sanity check |
| if( inputs==null || inputs.size() < 3 || out==null ) |
| throw new RuntimeException("Invalid input arguments."); |
| |
| //check empty result |
| if( (_outerProductType == OutProdType.LEFT_OUTER_PRODUCT && inputs.get(1).isEmptyBlock(false)) //U is empty |
| || (_outerProductType == OutProdType.RIGHT_OUTER_PRODUCT && inputs.get(2).isEmptyBlock(false)) //V is empty |
| || inputs.get(0).isEmptyBlock(false) ) { //X is empty |
| out.examSparsity(); //turn empty dense into sparse |
| return out; |
| } |
| |
| //input preparation and result allocation (Allocate the output that is set by Sigma2CPInstruction) |
| if(_outerProductType == OutProdType.CELLWISE_OUTER_PRODUCT) { |
| //assign it to the time and sparse representation of the major input matrix |
| out.reset(inputs.get(0).getNumRows(), inputs.get(0).getNumColumns(), inputs.get(0).isInSparseFormat()); |
| } |
| else { |
| //if left outerproduct gives a value of k*n instead of n*k, change it back to n*k and then transpose the output |
| if(_outerProductType == OutProdType.LEFT_OUTER_PRODUCT ) |
| out.reset(inputs.get(0).getNumColumns(), inputs.get(1).getNumColumns(), false); // n*k |
| else if(_outerProductType == OutProdType.RIGHT_OUTER_PRODUCT ) |
| out.reset(inputs.get(0).getNumRows(), inputs.get(1).getNumColumns(), false); // m*k |
| } |
| |
| //check for empty inputs; otherwise allocate result |
| if( inputs.get(0).isEmptyBlock(false) ) |
| return out; |
| out.allocateBlock(); |
| |
| //input preparation |
| DenseBlock[] ab = getDenseMatrices(prepInputMatrices(inputs, 1, 2, true, false)); |
| SideInput[] b = prepInputMatrices(inputs, 3, false); |
| double[] scalars = prepInputScalars(scalarObjects); |
| |
| //core sequential execute |
| final int m = inputs.get(0).getNumRows(); |
| final int n = inputs.get(0).getNumColumns(); |
| final int k = inputs.get(1).getNumColumns(); // rank |
| |
| MatrixBlock a = inputs.get(0); |
| |
| switch(_outerProductType) { |
| case LEFT_OUTER_PRODUCT: |
| case RIGHT_OUTER_PRODUCT: |
| if( !a.isInSparseFormat() ) |
| executeDense(a.getDenseBlock(), ab[0], ab[1], b, scalars, out.getDenseBlock(), m, n, k, _outerProductType, 0, m, 0, n); |
| else |
| executeSparse(a.getSparseBlock(), ab[0], ab[1], b, scalars, out.getDenseBlock(), m, n, k, a.getNonZeros(), _outerProductType, 0, m, 0, n); |
| break; |
| |
| case CELLWISE_OUTER_PRODUCT: |
| if( !a.isInSparseFormat() ) |
| executeCellwiseDense(a.getDenseBlock(), ab[0], ab[1], b, scalars, out.getDenseBlock(), m, n, k, _outerProductType, 0, m, 0, n); |
| else |
| executeCellwiseSparse(a.getSparseBlock(), ab[0], ab[1], b, scalars, out, m, n, k, a.getNonZeros(), _outerProductType, 0, m, 0, n); |
| break; |
| |
| case AGG_OUTER_PRODUCT: |
| throw new DMLRuntimeException("Wrong codepath for aggregate outer product."); |
| } |
| |
| //post-processing |
| out.recomputeNonZeros(); |
| out.examSparsity(); |
| return out; |
| } |
| |
| @Override |
| public MatrixBlock execute(ArrayList<MatrixBlock> inputs, ArrayList<ScalarObject> scalarObjects, MatrixBlock out, int numThreads) |
| { |
| //sanity check |
| if( inputs==null || inputs.size() < 3 || out==null ) |
| throw new RuntimeException("Invalid input arguments."); |
| |
| //check empty result |
| if( (_outerProductType == OutProdType.LEFT_OUTER_PRODUCT && inputs.get(1).isEmptyBlock(false)) //U is empty |
| || (_outerProductType == OutProdType.RIGHT_OUTER_PRODUCT && inputs.get(2).isEmptyBlock(false)) //V is empty |
| || inputs.get(0).isEmptyBlock(false) ) { //X is empty |
| out.examSparsity(); //turn empty dense into sparse |
| return out; |
| } |
| |
| //input preparation and result allocation (Allocate the output that is set by Sigma2CPInstruction) |
| if(_outerProductType == OutProdType.CELLWISE_OUTER_PRODUCT) |
| { |
| //assign it to the time and sparse representation of the major input matrix |
| out.reset(inputs.get(0).getNumRows(), inputs.get(0).getNumColumns(), inputs.get(0).isInSparseFormat()); |
| out.allocateBlock(); |
| } |
| else |
| { |
| //if left outerproduct gives a value of k*n instead of n*k, change it back to n*k and then transpose the output |
| if( _outerProductType == OutProdType.LEFT_OUTER_PRODUCT ) |
| out.reset(inputs.get(0).getNumColumns(),inputs.get(1).getNumColumns(), false); // n*k |
| else if( _outerProductType == OutProdType.RIGHT_OUTER_PRODUCT ) |
| out.reset(inputs.get(0).getNumRows(),inputs.get(1).getNumColumns(), false); // m*k |
| out.allocateDenseBlock(); |
| } |
| |
| if( 2*inputs.get(0).getNonZeros()*inputs.get(1).getNumColumns() < PAR_MINFLOP_THRESHOLD ) |
| return execute(inputs, scalarObjects, out); //sequential |
| |
| //input preparation |
| DenseBlock[] ab = getDenseMatrices(prepInputMatrices(inputs, 1, 2, true, false)); |
| SideInput[] b = prepInputMatrices(inputs, 3, false); |
| double[] scalars = prepInputScalars(scalarObjects); |
| |
| //core sequential execute |
| final int m = inputs.get(0).getNumRows(); |
| final int n = inputs.get(0).getNumColumns(); |
| final int k = inputs.get(1).getNumColumns(); // rank |
| final long nnz = inputs.get(0).getNonZeros(); |
| |
| MatrixBlock a = inputs.get(0); |
| |
| try |
| { |
| ExecutorService pool = CommonThreadPool.get(numThreads); |
| ArrayList<ParExecTask> tasks = new ArrayList<>(); |
| //create tasks (for wdivmm-left, parallelization over columns; |
| //for wdivmm-right, parallelization over rows; both ensure disjoint results) |
| |
| if( _outerProductType == OutProdType.LEFT_OUTER_PRODUCT ) { |
| //parallelize over column partitions |
| int blklen = (int)(Math.ceil((double)n/numThreads)); |
| for( int j=0; j<numThreads & j*blklen<n; j++ ) |
| tasks.add(new ParExecTask(a, ab[0], ab[1], b, scalars, out, m, n, k, |
| _outerProductType, 0, m, j*blklen, Math.min((j+1)*blklen, n))); |
| } |
| else { //right or cell-wise |
| //parallelize over row partitions |
| int numThreads2 = getPreferredNumberOfTasks(m, n, nnz, k, numThreads); |
| int blklen = (int)(Math.ceil((double)m/numThreads2)); |
| for( int i=0; i<numThreads2 & i*blklen<m; i++ ) |
| tasks.add(new ParExecTask(a, ab[0], ab[1], b, scalars, out, m, n, k, |
| _outerProductType, i*blklen, Math.min((i+1)*blklen,m), 0, n)); |
| } |
| List<Future<Long>> taskret = pool.invokeAll(tasks); |
| pool.shutdown(); |
| for( Future<Long> task : taskret ) |
| out.setNonZeros(out.getNonZeros() + task.get()); |
| } |
| catch (Exception e) { |
| throw new DMLRuntimeException(e); |
| } |
| out.examSparsity(); |
| return out; |
| } |
| |
| private static int getPreferredNumberOfTasks(int m, int n, long nnz, int rank, int k) { |
| //compute number of tasks nk in range [k, 8k] |
| int base = (int) Math.min(Math.min(8*k, m/32), |
| Math.ceil((double)2*nnz*rank/PAR_MINFLOP_THRESHOLD)); |
| return UtilFunctions.roundToNext(base, k); |
| } |
| |
| private void executeDense(DenseBlock a, DenseBlock u, DenseBlock v, SideInput[] b, double[] scalars, |
| DenseBlock c, int m, int n, int k, OutProdType type, int rl, int ru, int cl, int cu ) |
| { |
| //approach: iterate over non-zeros of w, selective mm computation |
| //cache-conscious blocking: due to blocksize constraint (default 1000), |
| //a blocksize of 16 allows to fit blocks of UV into L2 cache (256KB) |
| |
| //NOTE: we don't create sparse side inputs w/ row-major cursors because |
| //cache blocking would lead to non-sequential access |
| |
| final int blocksizeIJ = 16; //u/v block (max at typical L2 size) |
| int cix = 0; |
| //blocked execution |
| for( int bi = rl; bi < ru; bi+=blocksizeIJ ) |
| for( int bj = cl, bimin = Math.min(ru, bi+blocksizeIJ); bj < cu; bj+=blocksizeIJ ) |
| { |
| int bjmin = Math.min(cu, bj+blocksizeIJ); |
| |
| //core computation |
| for( int i=bi; i<bimin; i++ ) { |
| double[] avals = a.values(i); |
| double[] uvals = u.values(i); |
| int aix = a.pos(i), uix = u.pos(i); |
| for( int j=bj; j<bjmin; j++) |
| if( avals[aix+j] != 0 ) { |
| int vix = v.pos(j); |
| cix = (type == OutProdType.LEFT_OUTER_PRODUCT) ? vix : uix; |
| genexecDense( avals[aix+j], uvals, uix, v.values(j), vix, |
| b, scalars, c.values(j), cix, m, n, k, i, j); |
| } |
| } |
| } |
| } |
| |
| private void executeCellwiseDense(DenseBlock a, DenseBlock u, DenseBlock v, SideInput[] b, double[] scalars, |
| DenseBlock c, int m, int n, int k, OutProdType type, int rl, int ru, int cl, int cu ) |
| { |
| //approach: iterate over non-zeros of w, selective mm computation |
| //cache-conscious blocking: due to blocksize constraint (default 1000), |
| //a blocksize of 16 allows to fit blocks of UV into L2 cache (256KB) |
| |
| //NOTE: we don't create sparse side inputs w/ row-major cursors because |
| //cache blocking would lead to non-sequential access |
| |
| final int blocksizeIJ = 16; //u/v block (max at typical L2 size) |
| //blocked execution |
| double sum = 0; |
| for( int bi = rl; bi < ru; bi+=blocksizeIJ ) |
| for( int bj = cl, bimin = Math.min(ru, bi+blocksizeIJ); bj < cu; bj+=blocksizeIJ ) |
| { |
| int bjmin = Math.min(cu, bj+blocksizeIJ); |
| |
| //core computation |
| for( int i=bi; i<bimin; i++ ) { |
| double[] avals = a.values(i); |
| double[] uvals = u.values(i); |
| int aix = a.pos(i), uix = u.pos(i); |
| if(type == OutProdType.CELLWISE_OUTER_PRODUCT) { |
| double[] cvals = c.values(i); |
| for( int j=bj; j<bjmin; j++) |
| if( avals[aix+j] != 0 ) |
| cvals[aix+j] = genexecCellwise( avals[aix+j], uvals, uix, |
| v.values(j), v.pos(j), b, scalars, m, n, k, i, j ); |
| } |
| else { |
| for( int j=bj; j<bjmin; j++) |
| if( avals[aix+j] != 0 ) |
| sum += genexecCellwise( avals[aix+j], uvals, uix, |
| v.values(j), v.pos(j), b, scalars, m, n, k, i, j); |
| |
| } |
| } |
| } |
| if( type != OutProdType.CELLWISE_OUTER_PRODUCT ) |
| c.set(0, 0, sum); |
| } |
| |
| private void executeSparse(SparseBlock sblock, DenseBlock u, DenseBlock v, SideInput[] b, double[] scalars, |
| DenseBlock c, int m, int n, int k, long nnz, OutProdType type, int rl, int ru, int cl, int cu) |
| { |
| boolean left = (_outerProductType== OutProdType.LEFT_OUTER_PRODUCT); |
| |
| //approach: iterate over non-zeros of w, selective mm computation |
| //blocked over ij, while maintaining front of column indexes, where the |
| //blocksize is chosen such that we reuse each Ui/Vj vector on average 8 times, |
| //with custom blocksizeJ for wdivmm_left to avoid LLC misses on output. |
| final int blocksizeI = (int) (8L*m*n/nnz); |
| |
| if( OptimizerUtils.getSparsity(m, n, nnz) < MatrixBlock.ULTRA_SPARSITY_TURN_POINT ) //ultra-sparse |
| { |
| //for ultra-sparse matrices, we do not allocate the index array because |
| //its allocation and maintenance can dominate the total runtime. |
| SideInput[] lb = createSparseSideInputs(b); |
| |
| //core wdivmm block matrix mult |
| for( int i=rl; i<ru; i++ ) { |
| if( sblock.isEmpty(i) ) continue; |
| |
| int wpos = sblock.pos(i); |
| int wlen = sblock.size(i); |
| int[] wix = sblock.indexes(i); |
| double[] wvals = sblock.values(i); |
| double[] uvals = u.values(i); |
| int uix = u.pos(i); |
| |
| int index = (cl==0||sblock.isEmpty(i)) ? 0 : sblock.posFIndexGTE(i,cl); |
| index = wpos + ((index>=0) ? index : n); |
| for( ; index<wpos+wlen && wix[index]<cu; index++ ) { |
| int jix = wix[index]; |
| genexecDense(wvals[index], uvals, uix, v.values(jix), v.pos(jix), lb, scalars, |
| c.values(jix), (left ? v.pos(jix) : uix), m, n, k, i, wix[index]); |
| } |
| } |
| } |
| else //sparse |
| { |
| //NOTE: we don't create sparse side inputs w/ row-major cursors because |
| //cache blocking would lead to non-sequential access |
| |
| final int blocksizeJ = left ? Math.max(8,Math.min(L2_CACHESIZE/(k*8), blocksizeI)) : blocksizeI; |
| int[] curk = new int[Math.min(blocksizeI,ru-rl)]; |
| |
| for( int bi = rl; bi < ru; bi+=blocksizeI ) |
| { |
| int bimin = Math.min(ru, bi+blocksizeI); |
| //prepare starting indexes for block row |
| for( int i=bi; i<bimin; i++ ) { |
| int index = (cl==0||sblock.isEmpty(i)) ? 0 : sblock.posFIndexGTE(i,cl); |
| curk[i-bi] = (index>=0) ? index : n; |
| } |
| |
| //blocked execution over column blocks |
| for( int bj = cl; bj < cu; bj+=blocksizeJ ) |
| { |
| int bjmin = Math.min(cu, bj+blocksizeJ); |
| //core wdivmm block matrix mult |
| for( int i=bi; i<bimin; i++ ) { |
| if( sblock.isEmpty(i) ) continue; |
| |
| int wpos = sblock.pos(i); |
| int wlen = sblock.size(i); |
| int[] wix = sblock.indexes(i); |
| double[] wvals = sblock.values(i); |
| double[] uvals = u.values(i); |
| int uix = u.pos(i); |
| |
| int index = wpos + curk[i-bi]; |
| for( ; index<wpos+wlen && wix[index]<bjmin; index++ ) { |
| int jix = wix[index]; |
| genexecDense(wvals[index], uvals, uix, v.values(jix), v.pos(jix), b, scalars, |
| c.values(jix), (left ? wix[index]*k : uix), m, n, k, i, wix[index]); |
| } |
| curk[i-bi] = index - wpos; |
| } |
| } |
| } |
| } |
| } |
| |
| private void executeCellwiseSparse(SparseBlock sblock, DenseBlock u, DenseBlock v, SideInput[] b, double[] scalars, |
| MatrixBlock out, int m, int n, int k, long nnz, OutProdType type, int rl, int ru, int cl, int cu ) |
| { |
| //NOTE: we don't create sparse side inputs w/ row-major cursors because |
| //cache blocking would lead to non-sequential access |
| |
| final int blocksizeIJ = (int) (8L*m*n/nnz); |
| int[] curk = new int[Math.min(blocksizeIJ, ru-rl)]; |
| |
| if( !out.isInSparseFormat() ) //DENSE |
| { |
| DenseBlock c = out.getDenseBlock(); |
| double tmp = 0; |
| for( int bi=rl; bi<ru; bi+=blocksizeIJ ) { |
| int bimin = Math.min(ru, bi+blocksizeIJ); |
| //prepare starting indexes for block row |
| Arrays.fill(curk, 0); |
| //blocked execution over column blocks |
| for( int bj=0; bj<n; bj+=blocksizeIJ ) { |
| int bjmin = Math.min(n, bj+blocksizeIJ); |
| for( int i=bi; i<bimin; i++ ) { |
| if( sblock.isEmpty(i) ) continue; |
| int wpos = sblock.pos(i); |
| int wlen = sblock.size(i); |
| int[] wix = sblock.indexes(i); |
| double[] wvals = sblock.values(i); |
| double[] cvals = c.values(i); |
| double[] uvals = u.values(i); |
| int uix = u.pos(i); |
| int index = wpos + curk[i-bi]; |
| if( type == OutProdType.CELLWISE_OUTER_PRODUCT ) |
| for( ; index<wpos+wlen && wix[index]<bjmin; index++ ) { |
| int jix = wix[index]; |
| cvals[jix] = genexecCellwise( wvals[index], |
| uvals, uix, v.values(jix), v.pos(jix), b, scalars, m, n, k, i, wix[index] ); |
| } |
| else |
| for( ; index<wpos+wlen && wix[index]<bjmin; index++ ) { |
| int jix = wix[index]; |
| tmp += genexecCellwise( wvals[index], |
| uvals, uix, v.values(jix), v.pos(jix), b, scalars, m, n, k, i, wix[index]); |
| } |
| curk[i-bi] = index - wpos; |
| } |
| } |
| } |
| if( type != OutProdType.CELLWISE_OUTER_PRODUCT ) |
| c.set(0, 0, tmp); |
| } |
| else //SPARSE |
| { |
| SparseBlock c = out.getSparseBlock(); |
| for( int bi=rl; bi<ru; bi+=blocksizeIJ ) { |
| int bimin = Math.min(ru, bi+blocksizeIJ); |
| //prepare starting indexes for block row |
| Arrays.fill(curk, 0); |
| //blocked execution over column blocks |
| for( int bj=0; bj<n; bj+=blocksizeIJ ) { |
| int bjmin = Math.min(n, bj+blocksizeIJ); |
| for( int i=bi; i<bimin; i++ ) { |
| if( sblock.isEmpty(i) ) continue; |
| int wpos = sblock.pos(i); |
| int wlen = sblock.size(i); |
| int[] wix = sblock.indexes(i); |
| double[] wval = sblock.values(i); |
| double[] uvals = u.values(i); |
| int uix = u.pos(i); |
| int index = wpos + curk[i-bi]; |
| for( ; index<wpos+wlen && wix[index]<bjmin; index++ ) { |
| int jix = wix[index]; |
| c.append(i, wix[index], genexecCellwise( wval[index], uvals, uix, |
| v.values(jix), v.pos(jix), b, scalars, m, n, k, i, wix[index] )); |
| } |
| curk[i-bi] = index - wpos; |
| } |
| } |
| } |
| } |
| } |
| |
| protected abstract void genexecDense( double a, double[] u, int ui, double[] v, int vi, SideInput[] b, |
| double[] scalars, double[] c, int ci, int m, int n, int k, int rowIndex, int colIndex); |
| |
| protected abstract double genexecCellwise( double a, double[] u, int ui, double[] v, int vi, SideInput[] b, |
| double[] scalars, int m, int n, int k, int rowIndex, int colIndex); |
| |
| private class ParExecTask implements Callable<Long> |
| { |
| private final MatrixBlock _a; |
| private final DenseBlock _u; |
| private final DenseBlock _v; |
| private final SideInput[] _b; |
| private final double[] _scalars; |
| private final MatrixBlock _c; |
| private final int _clen; |
| private final int _rlen; |
| private final int _k; |
| private final OutProdType _type; |
| private final int _rl; |
| private final int _ru; |
| private final int _cl; |
| private final int _cu; |
| |
| protected ParExecTask( MatrixBlock a, DenseBlock u, DenseBlock v, SideInput[] b, double[] scalars , MatrixBlock c, int m, int n, int k, OutProdType type, int rl, int ru, int cl, int cu ) { |
| _a = a; |
| _u = u; |
| _v = v; |
| _b = b; |
| _c = c; |
| _scalars = scalars; |
| _rlen = m; |
| _clen = n; |
| _k = k; |
| _type = type; |
| _rl = rl; |
| _ru = ru; |
| _cl = cl; |
| _cu = cu; |
| } |
| |
| @Override |
| public Long call() { |
| switch(_type) |
| { |
| case LEFT_OUTER_PRODUCT: |
| case RIGHT_OUTER_PRODUCT: |
| if( !_a.isInSparseFormat() ) |
| executeDense(_a.getDenseBlock(), _u, _v, _b, _scalars, _c.getDenseBlock(), _rlen, _clen, _k, _type, _rl, _ru, _cl, _cu); |
| else |
| executeSparse(_a.getSparseBlock(), _u, _v, _b, _scalars, _c.getDenseBlock(), _rlen, _clen, _k, _a.getNonZeros(), _type, _rl, _ru, _cl, _cu); |
| break; |
| case CELLWISE_OUTER_PRODUCT: |
| if( !_c.isInSparseFormat() ) |
| executeCellwiseDense(_a.getDenseBlock(), _u, _v, _b, _scalars, _c.getDenseBlock(), _rlen, _clen, _k, _type, _rl, _ru, _cl, _cu); |
| else |
| executeCellwiseSparse(_a.getSparseBlock(), _u, _v, _b, _scalars, _c, _rlen, _clen, _k, _a.getNonZeros(), _type, _rl, _ru, _cl, _cu); |
| break; |
| case AGG_OUTER_PRODUCT: |
| throw new DMLRuntimeException("Wrong codepath for aggregate outer product."); |
| } |
| |
| boolean left = (_outerProductType == OutProdType.LEFT_OUTER_PRODUCT); |
| int rl = left ? _cl : _rl; |
| int ru = left ? _cu : _ru; |
| return _c.recomputeNonZeros(rl, ru-1, 0, _c.getNumColumns()-1); |
| } |
| } |
| |
| private class ParOuterProdAggTask implements Callable<Double> |
| { |
| private final MatrixBlock _a; |
| private final DenseBlock _u; |
| private final DenseBlock _v; |
| private final SideInput[] _b; |
| private final double[] _scalars; |
| private final int _rlen; |
| private final int _clen; |
| private final int _k; |
| private final OutProdType _type; |
| private final int _rl; |
| private final int _ru; |
| private final int _cl; |
| private final int _cu; |
| |
| protected ParOuterProdAggTask( MatrixBlock a, DenseBlock u, DenseBlock v, SideInput[] b, double[] scalars, int m, int n, int k, OutProdType type, int rl, int ru, int cl, int cu ) { |
| _a = a; |
| _u = u; |
| _v = v; |
| _b = b; |
| _scalars = scalars; |
| _rlen = m; |
| _clen = n; |
| _k = k; |
| _type = type; |
| _rl = rl; |
| _ru = ru; |
| _cl = cl; |
| _cu = cu; |
| } |
| |
| @Override |
| public Double call() { |
| MatrixBlock out = new MatrixBlock(1, 1, false); |
| out.allocateDenseBlock(); |
| if( !_a.isInSparseFormat() ) |
| executeCellwiseDense(_a.getDenseBlock(), _u, _v, _b, _scalars, out.getDenseBlock(), _rlen, _clen, _k, _type, _rl, _ru, _cl, _cu); |
| else |
| executeCellwiseSparse(_a.getSparseBlock(), _u, _v, _b, _scalars, out, _rlen, _clen, _k, _a.getNonZeros(), _type, _rl, _ru, _cl, _cu); |
| return out.quickGetValue(0, 0); |
| } |
| } |
| } |