blob: 37ed289862dec6a4c00fbd24fa4a696501b33011 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.sysds.runtime.compress.colgroup.dictionary;
import java.io.DataInput;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Map;
import org.apache.commons.lang.NotImplementedException;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.sysds.runtime.compress.DMLCompressionException;
import org.apache.sysds.runtime.compress.bitmap.ABitmap;
import org.apache.sysds.runtime.compress.bitmap.Bitmap;
import org.apache.sysds.runtime.compress.bitmap.MultiColBitmap;
import org.apache.sysds.runtime.compress.colgroup.AColGroup.CompressionType;
import org.apache.sysds.runtime.compress.colgroup.AColGroupCompressed;
import org.apache.sysds.runtime.compress.colgroup.ColGroupEmpty;
import org.apache.sysds.runtime.compress.colgroup.IContainADictionary;
import org.apache.sysds.runtime.compress.colgroup.IContainDefaultTuple;
import org.apache.sysds.runtime.compress.lib.CLALibCombineGroups;
import org.apache.sysds.runtime.compress.utils.ACount.DArrCounts;
import org.apache.sysds.runtime.compress.utils.DblArrayCountHashMap;
import org.apache.sysds.runtime.compress.utils.DoubleCountHashMap;
import org.apache.sysds.runtime.data.SparseBlock;
import org.apache.sysds.runtime.data.SparseRowVector;
import org.apache.sysds.runtime.matrix.data.MatrixBlock;
public interface DictionaryFactory {
static final Log LOG = LogFactory.getLog(DictionaryFactory.class.getName());
public enum Type {
FP64_DICT, MATRIX_BLOCK_DICT, INT8_DICT, IDENTITY, IDENTITY_SLICE,
}
public static ADictionary read(DataInput in) throws IOException {
Type type = Type.values()[in.readByte()];
switch(type) {
case FP64_DICT:
return Dictionary.read(in);
case MATRIX_BLOCK_DICT:
return MatrixBlockDictionary.read(in);
case INT8_DICT:
return QDictionary.read(in);
default:
throw new DMLCompressionException("Unsupported type of dictionary : " + type);
}
}
public static long getInMemorySize(int nrValues, int nrColumns, double tupleSparsity, boolean lossy) {
if(lossy)
return QDictionary.getInMemorySize(nrValues * nrColumns);
else if(nrColumns > 1 && tupleSparsity < 0.4)
return MatrixBlockDictionary.getInMemorySize(nrValues, nrColumns, tupleSparsity);
else
return Dictionary.getInMemorySize(nrValues * nrColumns);
}
public static ADictionary create(DblArrayCountHashMap map, int nCols, boolean addZeroTuple, double sparsity) {
try {
final ArrayList<DArrCounts> vals = map.extractValues();
final int nVals = vals.size();
final int nTuplesOut = nVals + (addZeroTuple ? 1 : 0);
if(sparsity < 0.4) {
final MatrixBlock retB = new MatrixBlock(nTuplesOut, nCols, true);
retB.allocateSparseRowsBlock();
final SparseBlock sb = retB.getSparseBlock();
for(int i = 0; i < nVals; i++) {
final DArrCounts dac = vals.get(i);
final double[] dv = dac.key.getData();
for(int k = 0; k < dv.length; k++)
sb.append(dac.id, k, dv[k]);
}
retB.recomputeNonZeros();
retB.examSparsity(true);
return MatrixBlockDictionary.create(retB);
}
else {
final double[] resValues = new double[(nTuplesOut) * nCols];
for(int i = 0; i < nVals; i++) {
final DArrCounts dac = vals.get(i);
System.arraycopy(dac.key.getData(), 0, resValues, dac.id * nCols, nCols);
}
return Dictionary.create(resValues);
}
}
catch(Exception e) {
LOG.error("Failed to create dictionary: ", e);
return null;
}
}
public static ADictionary create(ABitmap ubm) {
return create(ubm, 1.0);
}
public static ADictionary create(ABitmap ubm, double sparsity, boolean withZeroTuple) {
return (withZeroTuple) ? createWithAppendedZeroTuple(ubm, sparsity) : create(ubm, sparsity);
}
public static ADictionary create(ABitmap ubm, double sparsity) {
final int nCol = ubm.getNumColumns();
if(ubm instanceof Bitmap)
return Dictionary.create(((Bitmap) ubm).getValues());
else if(sparsity < 0.4 && nCol > 4 && ubm instanceof MultiColBitmap) {
final MultiColBitmap mcbm = (MultiColBitmap) ubm;
final MatrixBlock m = new MatrixBlock(ubm.getNumValues(), nCol, true);
m.allocateSparseRowsBlock();
final SparseBlock sb = m.getSparseBlock();
final int nVals = ubm.getNumValues();
for(int i = 0; i < nVals; i++) {
final double[] tuple = mcbm.getValues(i);
for(int col = 0; col < nCol; col++)
sb.append(i, col, tuple[col]);
}
m.recomputeNonZeros();
m.examSparsity(true);
return MatrixBlockDictionary.create(m);
}
else if(ubm instanceof MultiColBitmap) {
MultiColBitmap mcbm = (MultiColBitmap) ubm;
final int nVals = ubm.getNumValues();
double[] resValues = new double[nVals * nCol];
for(int i = 0; i < nVals; i++)
System.arraycopy(mcbm.getValues(i), 0, resValues, i * nCol, nCol);
return Dictionary.create(resValues);
}
throw new NotImplementedException("Not implemented creation of bitmap type : " + ubm.getClass().getSimpleName());
}
public static ADictionary create(ABitmap ubm, int defaultIndex, double[] defaultTuple, double sparsity,
boolean addZero) {
final int nCol = ubm.getNumColumns();
final int nVal = ubm.getNumValues() - (addZero ? 0 : 1);
if(nCol > 4 && sparsity < 0.4) {
final MultiColBitmap mcbm = (MultiColBitmap) ubm; // always multi column
final MatrixBlock m = new MatrixBlock(nVal, nCol, true);
m.allocateSparseRowsBlock();
final SparseBlock sb = m.getSparseBlock();
for(int i = 0; i < defaultIndex; i++)
sb.set(i, new SparseRowVector(mcbm.getValues(i)), false);
// copy default
System.arraycopy(mcbm.getValues(defaultIndex), 0, defaultTuple, 0, nCol);
for(int i = defaultIndex; i < ubm.getNumValues() - 1; i++)
sb.set(i, new SparseRowVector(mcbm.getValues(i + 1)), false);
m.recomputeNonZeros();
m.examSparsity(true);
return MatrixBlockDictionary.create(m);
}
else {
double[] dict = new double[nCol * nVal];
if(ubm instanceof Bitmap) {
final double[] bmv = ((Bitmap) ubm).getValues();
System.arraycopy(bmv, 0, dict, 0, defaultIndex);
defaultTuple[0] = bmv[defaultIndex];
System.arraycopy(bmv, defaultIndex + 1, dict, defaultIndex, bmv.length - defaultIndex - 1);
}
else if(ubm instanceof MultiColBitmap) {
final MultiColBitmap mcbm = (MultiColBitmap) ubm;
for(int i = 0; i < defaultIndex; i++)
System.arraycopy(mcbm.getValues(i), 0, dict, i * nCol, nCol);
System.arraycopy(mcbm.getValues(defaultIndex), 0, defaultTuple, 0, nCol);
for(int i = defaultIndex; i < ubm.getNumValues() - 1; i++)
System.arraycopy(mcbm.getValues(i + 1), 0, dict, i * nCol, nCol);
}
else
throw new NotImplementedException("not supported ABitmap of type:" + ubm.getClass().getSimpleName());
return Dictionary.create(dict);
}
}
public static ADictionary createWithAppendedZeroTuple(ABitmap ubm, double sparsity) {
final int nVals = ubm.getNumValues();
final int nRows = nVals + 1;
final int nCols = ubm.getNumColumns();
if(ubm instanceof Bitmap) {
final double[] resValues = new double[nRows];
final double[] from = ((Bitmap) ubm).getValues();
System.arraycopy(from, 0, resValues, 0, from.length);
return Dictionary.create(resValues);
}
final MultiColBitmap mcbm = (MultiColBitmap) ubm;
if(sparsity < 0.4 && nCols > 4) {
final MatrixBlock m = new MatrixBlock(nRows, nCols, true);
m.allocateSparseRowsBlock();
final SparseBlock sb = m.getSparseBlock();
for(int i = 0; i < nVals; i++) {
final double[] tuple = mcbm.getValues(i);
for(int col = 0; col < nCols; col++)
sb.append(i, col, tuple[col]);
}
m.recomputeNonZeros();
m.examSparsity(true);
return MatrixBlockDictionary.create(m);
}
final double[] resValues = new double[nRows * nCols];
for(int i = 0; i < nVals; i++)
System.arraycopy(mcbm.getValues(i), 0, resValues, i * nCols, nCols);
return Dictionary.create(resValues);
}
public static ADictionary create(DoubleCountHashMap map) {
final double[] resValues = map.getDictionary();
return Dictionary.create(resValues);
}
public static ADictionary combineDictionaries(AColGroupCompressed a, AColGroupCompressed b) {
return combineDictionaries(a, b, null);
}
public static ADictionary combineDictionaries(AColGroupCompressed a, AColGroupCompressed b,
Map<Integer, Integer> filter) {
if(a instanceof ColGroupEmpty && b instanceof ColGroupEmpty)
return null; // null return is handled elsewhere.
CompressionType ac = a.getCompType();
CompressionType bc = b.getCompType();
boolean ae = a instanceof IContainADictionary;
boolean be = b instanceof IContainADictionary;
if(ae && be) {
ADictionary ad = ((IContainADictionary) a).getDictionary();
ADictionary bd = ((IContainADictionary) b).getDictionary();
if(ac.isConst()) {
if(bc.isConst()) {
return new Dictionary(CLALibCombineGroups.constructDefaultTuple(a, b));
}
else if(bc.isDense()) {
final double[] at = ((IContainDefaultTuple) a).getDefaultTuple();
return combineConstSparseSparseRet(at, bd, b.getNumCols());
}
}
else if(ac.isDense()) {
if(bc.isConst()) {
final double[] bt = ((IContainDefaultTuple) b).getDefaultTuple();
return combineSparseConstSparseRet(ad, a.getNumCols(), bt);
}
else if(bc.isDense())
return combineFullDictionaries(ad, a.getNumCols(), bd, b.getNumCols(), filter);
else if(bc.isSDC()) {
double[] tuple = ((IContainDefaultTuple) b).getDefaultTuple();
return combineSDCRight(ad, a.getNumCols(), bd, tuple, filter);
}
}
else if(ac.isSDC()) {
if(bc.isSDC()) {
final double[] at = ((IContainDefaultTuple) a).getDefaultTuple();
final double[] bt = ((IContainDefaultTuple) b).getDefaultTuple();
return combineSDC(ad, at, bd, bt, filter);
}
}
}
throw new NotImplementedException("Not supporting combining dense: " + a + " " + b);
}
/**
* Combine the dictionaries assuming a sparse combination where each dictionary can be a SDC containing a default
* element that have to be introduced into the combined dictionary.
*
* @param a A Dictionary can be SDC or const
* @param b A Dictionary can be Const or SDC.
* @return The combined dictionary
*/
public static ADictionary combineDictionariesSparse(AColGroupCompressed a, AColGroupCompressed b) {
CompressionType ac = a.getCompType();
CompressionType bc = b.getCompType();
if(ac.isSDC()) {
ADictionary ad = ((IContainADictionary) a).getDictionary();
if(bc.isConst()) {
double[] bt = ((IContainDefaultTuple) b).getDefaultTuple();
return combineSparseConstSparseRet(ad, a.getNumCols(), bt);
}
else if(bc.isSDC()) {
ADictionary bd = ((IContainADictionary) b).getDictionary();
if(a.sameIndexStructure(b)) {
return ad.cbind(bd, b.getNumCols());
}
// real combine extract default and combine like dense but with default before.
}
}
else if(ac.isConst()) {
double[] at = ((IContainDefaultTuple) a).getDefaultTuple();
if(bc.isSDC()) {
ADictionary bd = ((IContainADictionary) b).getDictionary();
return combineConstSparseSparseRet(at, bd, b.getNumCols());
}
}
throw new NotImplementedException("Not supporting combining dense: " + a + " " + b);
}
/**
* Combine the dictionaries as if the dictionaries contain the full spectrum of the combined data.
*
* @param a Left side dictionary
* @param nca Number of columns left dictionary
* @param b Right side dictionary
* @param ncb Number of columns right dictionary
* @return A combined dictionary
*/
public static ADictionary combineFullDictionaries(ADictionary a, int nca, ADictionary b, int ncb) {
return combineFullDictionaries(a, nca, b, ncb, null);
}
/**
* Combine the dictionaries as if the dictionaries only contain the values in the specified filter.
*
* @param a Left side dictionary
* @param nca Number of columns left dictionary
* @param b Right side dictionary
* @param ncb Number of columns right dictionary
* @param filter The mapping filter to not include all possible combinations in the output, this filter is allowed to
* be null, that means the output is defaulting back to a full combine
* @return A combined dictionary
*/
public static ADictionary combineFullDictionaries(ADictionary a, int nca, ADictionary b, int ncb,
Map<Integer, Integer> filter) {
final int ra = a.getNumberOfValues(nca);
final int rb = b.getNumberOfValues(ncb);
MatrixBlock ma = a.getMBDict(nca).getMatrixBlock();
MatrixBlock mb = b.getMBDict(ncb).getMatrixBlock();
if(ra == 1 && rb == 1)
return new MatrixBlockDictionary(ma.append(mb));
MatrixBlock out = new MatrixBlock(filter != null ? filter.size() : ra * rb, nca + ncb, false);
out.allocateBlock();
if(filter != null) {
for(int r : filter.keySet()) {
int o = filter.get(r);
int ia = r % ra;
int ib = r / ra;
for(int c = 0; c < nca; c++)
out.quickSetValue(o, c, ma.quickGetValue(ia, c));
for(int c = 0; c < ncb; c++)
out.quickSetValue(o, c + nca, mb.quickGetValue(ib, c));
}
}
else {
for(int r = 0; r < out.getNumRows(); r++) {
int ia = r % ra;
int ib = r / ra;
for(int c = 0; c < nca; c++)
out.quickSetValue(r, c, ma.quickGetValue(ia, c));
for(int c = 0; c < ncb; c++)
out.quickSetValue(r, c + nca, mb.quickGetValue(ib, c));
}
}
return new MatrixBlockDictionary(out);
}
public static ADictionary combineSDCRight(ADictionary a, int nca, ADictionary b, double[] tub) {
return combineSDCRight(a, nca, b, tub, null);
}
public static ADictionary combineSDCRight(ADictionary a, int nca, ADictionary b, double[] tub,
Map<Integer, Integer> filter) {
final int ncb = tub.length;
final int ra = a.getNumberOfValues(nca);
final int rb = b.getNumberOfValues(ncb);
MatrixBlock ma = a.getMBDict(nca).getMatrixBlock();
MatrixBlock mb = b.getMBDict(ncb).getMatrixBlock();
MatrixBlock out = new MatrixBlock(ra * (rb + 1), nca + ncb, false);
out.allocateBlock();
for(int r = 0; r < ra; r++) {
for(int c = 0; c < nca; c++)
out.quickSetValue(r, c, ma.quickGetValue(r, c));
for(int c = 0; c < ncb; c++)
out.quickSetValue(0, c + nca, tub[c]);
}
for(int r = ra; r < out.getNumRows(); r++) {
int ia = r % ra;
int ib = r / ra - 1;
for(int c = 0; c < nca; c++) // all good.
out.quickSetValue(r, c, ma.quickGetValue(ia, c));
for(int c = 0; c < ncb; c++)
out.quickSetValue(r, c + nca, mb.quickGetValue(ib, c));
}
return new MatrixBlockDictionary(out);
}
public static ADictionary combineSDC(ADictionary a, double[] tua, ADictionary b, double[] tub) {
return combineSDC(a, tua, b, tub, null);
}
public static ADictionary combineSDC(ADictionary a, double[] tua, ADictionary b, double[] tub,
Map<Integer, Integer> filter) {
if(filter != null)
throw new NotImplementedException();
final int nca = tua.length;
final int ncb = tub.length;
final int ra = a.getNumberOfValues(nca);
final int rb = b.getNumberOfValues(ncb);
MatrixBlock ma = a.getMBDict(nca).getMatrixBlock();
MatrixBlock mb = b.getMBDict(ncb).getMatrixBlock();
MatrixBlock out = new MatrixBlock((ra + 1) * (rb + 1), nca + ncb, false);
out.allocateBlock();
// 0 row both default tuples
for(int c = 0; c < nca; c++)
out.quickSetValue(0, c, tua[c]);
for(int c = 0; c < ncb; c++)
out.quickSetValue(0, c + nca, tub[c]);
// default case for b and all cases for a.
for(int r = 1; r < ra + 1; r++) {
for(int c = 0; c < nca; c++)
out.quickSetValue(r, c, ma.quickGetValue(r - 1, c));
for(int c = 0; c < ncb; c++)
out.quickSetValue(r, c + nca, tub[c]);
}
for(int r = ra + 1; r < out.getNumRows(); r++) {
int ia = r % (ra + 1) - 1;
int ib = r / (ra + 1) - 1;
if(ia == -1)
for(int c = 0; c < nca; c++)
out.quickSetValue(r, c, tua[c]);
else
for(int c = 0; c < nca; c++)
out.quickSetValue(r, c, ma.quickGetValue(ia, c));
for(int c = 0; c < ncb; c++) // all good here.
out.quickSetValue(r, c + nca, mb.quickGetValue(ib, c));
}
return new MatrixBlockDictionary(out);
}
public static ADictionary combineSparseConstSparseRet(ADictionary a, int nca, double[] tub) {
final int ncb = tub.length;
final int ra = a.getNumberOfValues(nca);
MatrixBlock ma = a.getMBDict(nca).getMatrixBlock();
MatrixBlock out = new MatrixBlock(ra, nca + ncb, false);
out.allocateBlock();
// default case for b and all cases for a.
for(int r = 0; r < ra; r++) {
for(int c = 0; c < nca; c++)
out.quickSetValue(r, c, ma.quickGetValue(r, c));
for(int c = 0; c < ncb; c++)
out.quickSetValue(r, c + nca, tub[c]);
}
return new MatrixBlockDictionary(out);
}
public static ADictionary combineConstSparseSparseRet(double[] tua, ADictionary b, int ncb) {
final int nca = tua.length;
final int rb = b.getNumberOfValues(ncb);
MatrixBlock mb = b.getMBDict(ncb).getMatrixBlock();
MatrixBlock out = new MatrixBlock(rb, nca + ncb, false);
out.allocateBlock();
// default case for b and all cases for a.
for(int r = 0; r < rb; r++) {
for(int c = 0; c < nca; c++)
out.quickSetValue(r, c, tua[c]);
for(int c = 0; c < ncb; c++)
out.quickSetValue(r, c + nca, mb.quickGetValue(r, c));
}
return new MatrixBlockDictionary(out);
}
}