blob: d8ab0f0f7c3718eb6d5de8afd21cef1e2d5add01 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.sysds.runtime.compress.estim.encoding;
import java.util.Arrays;
import org.apache.commons.lang3.NotImplementedException;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.sysds.runtime.DMLRuntimeException;
import org.apache.sysds.runtime.compress.colgroup.ColGroupConst;
import org.apache.sysds.runtime.compress.colgroup.ColGroupEmpty;
import org.apache.sysds.runtime.compress.colgroup.indexes.IColIndex;
import org.apache.sysds.runtime.compress.colgroup.mapping.AMapToData;
import org.apache.sysds.runtime.compress.colgroup.mapping.MapToFactory;
import org.apache.sysds.runtime.compress.colgroup.offset.AOffset;
import org.apache.sysds.runtime.compress.colgroup.offset.OffsetFactory;
import org.apache.sysds.runtime.compress.readers.ReaderColumnSelection;
import org.apache.sysds.runtime.compress.utils.DblArray;
import org.apache.sysds.runtime.compress.utils.DblArrayCountHashMap;
import org.apache.sysds.runtime.compress.utils.DoubleCountHashMap;
import org.apache.sysds.runtime.compress.utils.IntArrayList;
import org.apache.sysds.runtime.data.DenseBlock;
import org.apache.sysds.runtime.data.SparseBlock;
import org.apache.sysds.runtime.matrix.data.MatrixBlock;
public interface EncodingFactory {
public static final Log LOG = LogFactory.getLog(EncodingFactory.class.getName());
/**
 * Encode a list of columns together from the input matrix, as if they were cocoded.
 *
 * @param m          The matrix input to encode
 * @param transposed If the matrix is transposed in memory
 * @param rowCols    The list of columns to encode
 * @return An encoded format of the information of the columns
 */
public static IEncode createFromMatrixBlock(MatrixBlock m, boolean transposed, IColIndex rowCols) {
	// An empty matrix short-circuits to the trivial empty encoding.
	if(m.isEmpty())
		return new EmptyEncoding();
	// A single column takes the specialized single-column path, otherwise a
	// multi-column reader extracts the row tuples.
	final boolean singleColumn = rowCols.size() == 1;
	return singleColumn ? createFromMatrixBlock(m, transposed, rowCols.get(0)) : createWithReader(m, rowCols,
		transposed);
}
/**
 * Encode a full delta representation of the matrix input taking all rows into account.
 *
 * Note the input matrix should not be delta encoded, but instead delta-encoded on the fly while processing,
 * enforcing that we do not allocate more memory.
 *
 * @param m          The input matrix that is not delta encoded and should not be modified
 * @param transposed If the input matrix is transposed.
 * @param rowCols    The list of columns to encode
 * @return A delta encoded encoding.
 */
public static IEncode createFromMatrixBlockDelta(MatrixBlock m, boolean transposed, IColIndex rowCols) {
	// Delta encoding is not supported yet; fail fast rather than produce a wrong encoding.
	throw new NotImplementedException();
	// Intended implementation (kept for reference): delegate with the full row count as sample size.
	// final int sampleSize = transposed ? m.getNumColumns() : m.getNumRows();
	// return createFromMatrixBlockDelta(m, transposed, rowCols, sampleSize);
}
/**
 * Encode a delta representation of the matrix input taking the first "sampleSize" rows into account.
 *
 * Note the input matrix should not be delta encoded, but instead delta-encoded on the fly while processing,
 * enforcing that we do not allocate more memory.
 *
 * @param m          Input matrix that is not delta encoded and should not be modified
 * @param transposed If the input matrix is transposed.
 * @param rowCols    The list of columns to encode
 * @param sampleSize The number of rows to consider for the delta encoding (from the beginning)
 * @return A delta encoded encoding.
 */
public static IEncode createFromMatrixBlockDelta(MatrixBlock m, boolean transposed, IColIndex rowCols,
	int sampleSize) {
	// Delta encoding is not supported yet; fail fast.
	throw new NotImplementedException();
}
/**
 * Create an encoding of a single specific column inside the matrix input.
 *
 * @param m          The matrix to encode a column from
 * @param transposed If the matrix is in transposed format
 * @param rowCol     The column index to encode
 * @return An encoded format of the information of this column
 */
public static IEncode createFromMatrixBlock(MatrixBlock m, boolean transposed, int rowCol) {
	// Empty input needs no scanning at all.
	if(m.isEmpty())
		return new EmptyEncoding();
	// Dispatch on orientation and physical layout of the block.
	final boolean sparse = m.isInSparseFormat();
	if(transposed)
		return sparse ? createFromSparseTransposed(m, rowCol) : createFromDenseTransposed(m, rowCol);
	return sparse ? createFromSparse(m, rowCol) : createFromDense(m, rowCol);
}
/**
 * Create an encoding for a constant column group.
 *
 * @param c The constant column group (only used for overload dispatch)
 * @return A constant encoding
 */
public static IEncode create(ColGroupConst c) {
	// NOTE(review): the row count is passed as -1 here; presumably ConstEncoding
	// ignores it for constant groups -- confirm against ConstEncoding's constructor.
	return new ConstEncoding(-1);
}
/**
 * Create an encoding for an empty column group.
 *
 * @param c The empty column group (only used for overload dispatch)
 * @return An empty encoding
 */
public static IEncode create(ColGroupEmpty c) {
	return new EmptyEncoding();
}
/**
 * Create a dense encoding wrapping an existing mapping.
 *
 * @param d The mapping from rows to value ids
 * @return A dense encoding
 */
public static IEncode create(AMapToData d) {
	return new DenseEncoding(d);
}
/**
 * Create a sparse encoding from an existing mapping and offset structure.
 *
 * @param d    The mapping of the contained (non-default) cells to value ids
 * @param i    The offsets locating those cells
 * @param nRow The total number of rows encoded
 * @return A sparse encoding
 */
public static IEncode create(AMapToData d, AOffset i, int nRow) {
	return new SparseEncoding(d, i, nRow);
}
/**
 * Encode one physical row of a transposed dense matrix (i.e. one logical column).
 *
 * @param m   The transposed dense matrix to read from
 * @param row The physical row (logical column) to encode
 * @return The encoding of this row
 */
private static IEncode createFromDenseTransposed(MatrixBlock m, int row) {
	final DenseBlock blk = m.getDenseBlock();
	if(!blk.isContiguous())
		throw new NotImplementedException("Not Implemented non contiguous dense matrix encoding for sample");
	final DoubleCountHashMap counts = new DoubleCountHashMap();
	final int start = blk.pos(row);
	final int nCol = m.getNumColumns();
	final int stop = start + nCol;
	final double[] values = blk.values(row);
	// Pass 1: count the occurrences of every distinct value (sequential scan).
	for(int ix = start; ix < stop; ix++)
		counts.increment(values[ix]);
	final int nUnique = counts.size();
	if(nUnique == 1)
		return new ConstEncoding(m.getNumColumns());
	if(nUnique == 0)
		return new EmptyEncoding();
	if(counts.getOrDefault(0.0, -1) > nCol / 4) {
		// Sparse output: more than a quarter of the cells are zero.
		// Re-assign ids skipping zero, then collect only the non-zero cells.
		counts.replaceWithUIDsNoZero();
		final int nZero = counts.get(0.0);
		final int nVal = nCol - nZero;
		final IntArrayList offsets = new IntArrayList(nVal);
		final AMapToData data = MapToFactory.create(nVal, nUnique - 1);
		int out = 0;
		for(int ix = start, r = 0; ix < stop; ix++, r++) {
			final double v = values[ix];
			if(v != 0) {
				offsets.appendValue(r);
				data.set(out++, counts.getId(v));
			}
		}
		// Sanity check: the zero count from pass 1 must match what pass 2 observed.
		if(out != nVal)
			throw new RuntimeException("Did not find equal number of elements " + out + " vs " + nVal);
		final AOffset off = OffsetFactory.createOffset(offsets);
		return new SparseEncoding(data, off, nCol);
	}
	// Dense output: pass 2 maps every cell to its value id.
	final AMapToData data = MapToFactory.create(nCol, nUnique);
	for(int ix = start, r = 0; ix < stop; ix++, r++)
		data.set(r, counts.getId(values[ix]));
	return new DenseEncoding(data);
}
/**
 * Encode one physical row of a transposed sparse matrix (i.e. one logical column).
 *
 * @param m   The transposed sparse matrix to read from
 * @param row The physical row (logical column) to encode
 * @return The encoding of this row
 */
private static IEncode createFromSparseTransposed(MatrixBlock m, int row) {
	final DoubleCountHashMap counts = new DoubleCountHashMap(16);
	final SparseBlock sb = m.getSparseBlock();
	if(sb.isEmpty(row))
		return new EmptyEncoding();
	final int pos = sb.pos(row);
	final int len = sb.size(row) + pos;
	final double[] vals = sb.values(row);
	final int[] idx = sb.indexes(row);
	// Pass 1 over the non-zeros: count occurrences of each distinct value.
	for(int k = pos; k < len; k++)
		counts.increment(vals[k]);
	final int nUnique = counts.size();
	final int nCol = m.getNumColumns();
	if(nUnique == 0)
		return new EmptyEncoding();
	if(len - pos > nCol / 4) {
		// Dense result: this row is dense even though the overall matrix is sparse.
		// Only reserve an extra id for zero if the row is not entirely non-zero.
		final int correct = (len - pos == m.getNumColumns()) ? 0 : 1;
		final AMapToData data = MapToFactory.create(nCol, nUnique + correct);
		// The map is zero-initialized, so only non-zero cells need writing; when
		// correct == 1, id 0 implicitly represents the zero value.
		for(int k = pos; k < len; k++)
			data.set(idx[k], counts.getId(vals[k]) + correct);
		return new DenseEncoding(data);
	}
	// Sparse result: pass 2 assigns each non-zero its value id ...
	final AMapToData data = MapToFactory.create(len - pos, nUnique);
	for(int k = pos, j = 0; k < len; k++, j++)
		data.set(j, counts.getId(vals[k]));
	// ... and pass 3 turns the non-zero indexes into an offset structure that
	// identifies which cells are zero and which are not.
	final AOffset off = OffsetFactory.createOffset(idx, pos, len);
	return new SparseEncoding(data, off, m.getNumColumns());
}
/**
 * Encode one column of a non-transposed dense matrix.
 *
 * @param m   The dense matrix to read from
 * @param col The column index to encode
 * @return The encoding of this column
 */
private static IEncode createFromDense(MatrixBlock m, int col) {
	final DenseBlock blk = m.getDenseBlock();
	if(!blk.isContiguous())
		throw new NotImplementedException("Not Implemented non contiguous dense matrix encoding for sample");
	final DoubleCountHashMap counts = new DoubleCountHashMap(16);
	final int nCol = m.getNumColumns();
	final int nRow = m.getNumRows();
	final int start = col;
	final int stop = start + nRow * nCol;
	final double[] values = m.getDenseBlockValues();
	// Pass 1: stride down the rows of this column counting distinct values.
	for(int ix = start; ix < stop; ix += nCol)
		counts.increment(values[ix]);
	final int nUnique = counts.size();
	if(nUnique == 1)
		return new ConstEncoding(m.getNumColumns());
	if(counts.getOrDefault(0.0, -1) > nRow / 4) {
		// Sparse output: more than a quarter of the rows are zero.
		// Re-assign ids skipping zero, then collect only the non-zero rows.
		counts.replaceWithUIDsNoZero();
		final int nZero = counts.get(0.0);
		final int nVal = m.getNumRows() - nZero;
		final IntArrayList offsets = new IntArrayList(nVal);
		final AMapToData data = MapToFactory.create(nVal, nUnique - 1);
		int out = 0;
		for(int ix = start, r = 0; ix < stop; ix += nCol, r++) {
			final double v = values[ix];
			if(v != 0) {
				offsets.appendValue(r);
				data.set(out++, counts.getId(v));
			}
		}
		// Sanity check: zero count from pass 1 must agree with pass 2.
		if(out != nVal)
			throw new DMLRuntimeException("Invalid number of zero.");
		final AOffset off = OffsetFactory.createOffset(offsets);
		return new SparseEncoding(data, off, nRow);
	}
	// Dense output: pass 2 assigns each row its value id.
	final AMapToData data = MapToFactory.create(nRow, nUnique);
	for(int ix = start, r = 0; ix < stop; ix += nCol, r++)
		data.set(r, counts.getId(values[ix]));
	return new DenseEncoding(data);
}
/**
 * Encode one column of a non-transposed sparse matrix.
 *
 * Both passes binary-search every non-empty row for the target column, which is
 * slow but correct (noted in the original as a known performance hit).
 *
 * @param m   The sparse matrix to read from
 * @param col The column index to encode
 * @return The encoding of this column
 */
private static IEncode createFromSparse(MatrixBlock m, int col) {
	final DoubleCountHashMap map = new DoubleCountHashMap(16);
	final SparseBlock sb = m.getSparseBlock();
	// NOTE(review): Math.min caps the initial capacity guess at 4; Math.max was
	// likely intended for pre-sizing. Harmless (only affects reallocation), so kept.
	final double guessedNumberOfNonZero = Math.min(4, Math.ceil(m.getNumRows() * m.getSparsity()));
	final IntArrayList offsets = new IntArrayList((int) guessedNumberOfNonZero);
	// Pass 1 over non-zero values: count distinct values and collect row offsets.
	for(int r = 0; r < m.getNumRows(); r++) {
		if(sb.isEmpty(r))
			continue;
		final int apos = sb.pos(r);
		final int alen = sb.size(r) + apos;
		final int[] aix = sb.indexes(r);
		final int index = Arrays.binarySearch(aix, apos, alen, col);
		if(index >= 0) {
			offsets.appendValue(r);
			map.increment(sb.values(r)[index]);
		}
	}
	if(offsets.size() == 0)
		return new EmptyEncoding();
	final int nUnique = map.size();
	final AMapToData d = MapToFactory.create(offsets.size(), nUnique);
	// Pass 2: assign each collected non-zero its value id.
	// (Fixed: the original re-checked "index >= 0" twice in this loop; the inner
	// duplicate check was redundant and has been removed.)
	for(int off = 0, r = 0; off < offsets.size(); r++) {
		if(sb.isEmpty(r))
			continue;
		final int apos = sb.pos(r);
		final int alen = sb.size(r) + apos;
		final int[] aix = sb.indexes(r);
		final int index = Arrays.binarySearch(aix, apos, alen, col);
		if(index >= 0)
			d.set(off++, map.getId(sb.values(r)[index]));
	}
	// Pass 3: turn the collected row offsets into an offset structure marking
	// which cells are zero and which are not.
	final AOffset o = OffsetFactory.createOffset(offsets);
	return new SparseEncoding(d, o, m.getNumRows());
}
/**
 * Encode multiple columns together by reading the row tuples through a column reader.
 *
 * @param m          The matrix to read from
 * @param rowCols    The columns to encode together
 * @param transposed If the matrix is transposed in memory
 * @return The combined encoding of the given columns
 */
private static IEncode createWithReader(MatrixBlock m, IColIndex rowCols, boolean transposed) {
	final ReaderColumnSelection reader = ReaderColumnSelection.createReader(m, rowCols, transposed);
	final int nRows = transposed ? m.getNumColumns() : m.getNumRows();
	final DblArrayCountHashMap map = new DblArrayCountHashMap();
	final IntArrayList offsets = new IntArrayList();
	// Pass 1: count distinct row tuples and remember which rows the reader returned.
	for(DblArray tuple = reader.nextRow(); tuple != null; tuple = reader.nextRow()) {
		map.increment(tuple);
		offsets.appendValue(reader.getCurrentRowIndex());
	}
	if(offsets.size() == 0)
		return new EmptyEncoding();
	if(map.size() == 1 && offsets.size() == nRows)
		return new ConstEncoding(nRows);
	if(offsets.size() < nRows / 4)
		// Fewer than a quarter of the rows are non-empty -> sparse encoding.
		return createWithReaderSparse(m, map, rowCols, offsets, nRows, transposed);
	// Dense encoding; a zero bucket is needed iff some rows were skipped as empty.
	return createWithReaderDense(m, map, rowCols, nRows, transposed, offsets.size() < nRows);
}
/**
 * Second pass for the dense multi-column case: materialize the row-to-id mapping.
 *
 * @param m          The matrix to read from
 * @param map        The tuple count map built in the first pass
 * @param rowCols    The columns being encoded
 * @param nRows      The number of rows to encode
 * @param transposed If the matrix is transposed in memory
 * @param zero       Whether some rows were empty, requiring an extra id for the zero tuple
 * @return A dense encoding of the columns
 */
private static IEncode createWithReaderDense(MatrixBlock m, DblArrayCountHashMap map, IColIndex rowCols, int nRows,
	boolean transposed, boolean zero) {
	final int nUnique = map.size() + (zero ? 1 : 0);
	final ReaderColumnSelection reader = ReaderColumnSelection.createReader(m, rowCols, transposed);
	final AMapToData data = MapToFactory.create(nRows, nUnique);
	// When empty rows exist, id 0 is reserved for them (the map is zero-initialized),
	// so all real tuple ids are shifted up by one.
	final int shift = zero ? 1 : 0;
	for(DblArray tuple = reader.nextRow(); tuple != null; tuple = reader.nextRow())
		data.set(reader.getCurrentRowIndex(), map.getId(tuple) + shift);
	return new DenseEncoding(data);
}
/**
 * Second pass for the sparse multi-column case: map each non-empty row to its tuple id.
 *
 * @param m          The matrix to read from
 * @param map        The tuple count map built in the first pass
 * @param rowCols    The columns being encoded
 * @param offsets    The row indexes of the non-empty rows, collected in the first pass
 * @param nRows      The total number of rows encoded
 * @param transposed If the matrix is transposed in memory
 * @return A sparse encoding of the columns
 */
private static IEncode createWithReaderSparse(MatrixBlock m, DblArrayCountHashMap map, IColIndex rowCols,
	IntArrayList offsets, int nRows, boolean transposed) {
	final ReaderColumnSelection reader = ReaderColumnSelection.createReader(m, rowCols, transposed);
	final AMapToData data = MapToFactory.create(offsets.size(), map.size());
	// Pass 2 over the non-empty rows only: record each tuple's id in order.
	int i = 0;
	for(DblArray tuple = reader.nextRow(); tuple != null; tuple = reader.nextRow())
		data.set(i++, map.getId(tuple));
	final AOffset off = OffsetFactory.createOffset(offsets);
	return new SparseEncoding(data, off, nRows);
}
/**
 * Create a sparse encoding directly from its constituent parts.
 *
 * @param map   The mapping of the contained (non-default) cells to value ids
 * @param off   The offsets locating those cells
 * @param nRows The total number of rows encoded
 * @return A sparse encoding
 */
public static SparseEncoding createSparse(AMapToData map, AOffset off, int nRows) {
	return new SparseEncoding(map, off, nRows);
}
}