src/main/java/org/apache/sysds/runtime/compress/colgroup/AColGroup.java - systemds - Git at Google

 /*
  * Licensed to the Apache Software Foundation (ASF) under one
  * or more contributor license agreements.  See the NOTICE file
  * distributed with this work for additional information
  * regarding copyright ownership.  The ASF licenses this file
  * to you under the Apache License, Version 2.0 (the
  * "License"); you may not use this file except in compliance
  * with the License.  You may obtain a copy of the License at
  *
  *   http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing,
  * software distributed under the License is distributed on an
  * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
  * KIND, either express or implied.  See the License for the
  * specific language governing permissions and limitations
  * under the License.
  */

 package org.apache.sysds.runtime.compress.colgroup;

 import java.io.DataOutput;
 import java.io.IOException;
 import java.io.Serializable;
 import java.util.Collection;

 import org.apache.commons.lang.NotImplementedException;
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
 import org.apache.sysds.runtime.compress.colgroup.indexes.IColIndex;
 import org.apache.sysds.runtime.compress.colgroup.indexes.IColIndex.SliceResult;
 import org.apache.sysds.runtime.compress.colgroup.scheme.ICLAScheme;
 import org.apache.sysds.runtime.compress.cost.ComputationCostEstimator;
 import org.apache.sysds.runtime.compress.estim.CompressedSizeInfoColGroup;
 import org.apache.sysds.runtime.compress.estim.encoding.IEncode;
 import org.apache.sysds.runtime.compress.lib.CLALibCombineGroups;
 import org.apache.sysds.runtime.data.DenseBlock;
 import org.apache.sysds.runtime.data.SparseBlock;
 import org.apache.sysds.runtime.functionobjects.Plus;
 import org.apache.sysds.runtime.instructions.cp.CM_COV_Object;
 import org.apache.sysds.runtime.matrix.data.MatrixBlock;
 import org.apache.sysds.runtime.matrix.operators.AggregateUnaryOperator;
 import org.apache.sysds.runtime.matrix.operators.BinaryOperator;
 import org.apache.sysds.runtime.matrix.operators.CMOperator;
 import org.apache.sysds.runtime.matrix.operators.ScalarOperator;
 import org.apache.sysds.runtime.matrix.operators.UnaryOperator;

 /**
  * Abstract Class that is the lowest class type for the Compression framework.
  *
  * AColGroup store information about a number of columns.
  *
  */
 public abstract class AColGroup implements Serializable {
 	protected static final Log LOG = LogFactory.getLog(AColGroup.class.getName());
 	private static final long serialVersionUID = -1318908671481L;

 	/** Public super types of compression ColGroups supported */
 	public static enum CompressionType {
 		UNCOMPRESSED, RLE, OLE, DDC, CONST, EMPTY, SDC, SDCFOR, DDCFOR, DeltaDDC, LinearFunctional;

 		public boolean isDense() {
 			return this == DDC || this == CONST || this == DDCFOR || this == DDCFOR;
 		}

 		public boolean isConst() {
 			return this == CONST || this == EMPTY;
 		}

 		public boolean isSDC() {
 			return this == SDC;
 		}
 	}

 	/**
 	 * Concrete ColGroupType
 	 *
 	 * Protected such that outside the ColGroup package it should be unknown which specific subtype is used.
 	 */
 	protected static enum ColGroupType {
 		UNCOMPRESSED, RLE, OLE, DDC, CONST, EMPTY, SDC, SDCSingle, SDCSingleZeros, SDCZeros, SDCFOR, DDCFOR, DeltaDDC,
 		LinearFunctional;
 	}

 	/** The ColGroup indexes contained in the ColGroup */
 	protected final IColIndex _colIndexes;

 	/**
 	 * Main constructor.
 	 *
 	 * @param colIndices offsets of the columns in the matrix block that make up the group
 	 */
 	protected AColGroup(IColIndex colIndices) {
 		_colIndexes = colIndices;
 	}

 	/**
 	 * Obtain the offsets of the columns in the matrix block that make up the group
 	 *
 	 * @return offsets of the columns in the matrix block that make up the group
 	 */
 	public final IColIndex getColIndices() {
 		return _colIndexes;
 	}

 	/**
 	 * Obtain the number of columns in this column group.
 	 *
 	 * @return number of columns in this column group
 	 */
 	public final int getNumCols() {
 		return _colIndexes.size();
 	}

 	/**
 	 * Shift all column indexes contained by an offset.
 	 *
 	 * This is used for rbind to combine compressed matrices.
 	 *
 	 * Since column indexes are reused between operations, we allocate a new list here to be safe
 	 *
 	 * @param offset The offset to move all columns
 	 * @return A new column group object with the shifted columns
 	 */
 	public final AColGroup shiftColIndices(int offset) {
 		return copyAndSet(_colIndexes.shift(offset));
 	}

 	/**
 	 * Copy the content of the column group with pointers to the previous content but with new column given Note this
 	 * method does not verify if the colIndexes specified are valid and correct dimensions for the underlying column
 	 * groups.
 	 *
 	 * @param colIndexes the new indexes to use in the copy
 	 * @return a new object with pointers to underlying data.
 	 */
 	public abstract AColGroup copyAndSet(IColIndex colIndexes);

 	/**
 	 * Get the upper bound estimate of in memory allocation for the column group.
 	 *
 	 * @return an upper bound on the number of bytes used to store this ColGroup in memory.
 	 */
 	public long estimateInMemorySize() {
 		long size = 16; // object header
 		size += _colIndexes.estimateInMemorySize();
 		return size;
 	}

 	/**
 	 * Decompress a range of rows into a sparse block
 	 *
 	 * Note that this is using append, so the sparse column indexes need to be sorted afterwards.
 	 *
 	 * @param sb Sparse Target block
 	 * @param rl Row to start at
 	 * @param ru Row to end at
 	 */
 	public final void decompressToSparseBlock(SparseBlock sb, int rl, int ru) {
 		decompressToSparseBlock(sb, rl, ru, 0, 0);
 	}

 	/**
 	 * Decompress a range of rows into a dense block
 	 *
 	 * @param db Sparse Target block
 	 * @param rl Row to start at
 	 * @param ru Row to end at
 	 */
 	public final void decompressToDenseBlock(DenseBlock db, int rl, int ru) {
 		decompressToDenseBlock(db, rl, ru, 0, 0);
 	}

 	/**
 	 * Serializes column group to data output.
 	 *
 	 * @param out data output
 	 * @throws IOException if IOException occurs
 	 */
 	protected void write(DataOutput out) throws IOException {
 		out.writeByte(getColGroupType().ordinal());
 		_colIndexes.write(out);
 	}

 	/**
 	 * Returns the exact serialized size of column group. This can be used for example for buffer preallocation.
 	 *
 	 * @return exact serialized size for column group
 	 */
 	public long getExactSizeOnDisk() {
 		long ret = 0;
 		ret += 1; // type info (colGroup ordinal)
 		ret += _colIndexes.getExactSizeOnDisk();
 		return ret;
 	}

 	/**
 	 * Slice out the columns within the range of cl and cu to remove the dictionary values related to these columns. If
 	 * the ColGroup slicing from does not contain any columns within the range null is returned.
 	 *
 	 * @param cl The lower bound of the columns to select
 	 * @param cu The upper bound of the columns to select (not inclusive).
 	 * @return A cloned Column Group, with a copied pointer to the old column groups index structure, but reduced
 	 *         dictionary and _columnIndexes correctly aligned with the expected sliced compressed matrix.
 	 */
 	public final AColGroup sliceColumns(int cl, int cu) {
 		AColGroup ret = (cu - cl == 1) ? sliceColumn(cl) : sliceMultiColumns(cl, cu);
 		return ret;
 	}

 	/**
 	 * Slice out a single column from the column group.
 	 *
 	 * @param col The column to slice, the column could potentially not be inside the column group
 	 * @return A new column group that is a single column, if the column requested is not in this column group null is
 	 *         returned.
 	 */
 	public final AColGroup sliceColumn(int col) {
 		int idx = _colIndexes.findIndex(col);
 		if(idx >= 0)
 			return sliceSingleColumn(idx);
 		else
 			return null;
 	}

 	/**
 	 * Slice out multiple columns within the interval between the given indexes.
 	 *
 	 * @param cl The lower column index to slice from
 	 * @param cu The upper column index to slice to, (not included)
 	 * @return A column group of this containing the columns specified, returns null if the columns specified is not
 	 *         contained in the column group
 	 */
 	protected final AColGroup sliceMultiColumns(int cl, int cu) {
 		SliceResult sr = _colIndexes.slice(cl, cu);
 		if(sr.ret != null)
 			return sliceMultiColumns(sr.idStart, sr.idEnd, sr.ret);
 		else
 			return null;
 	}

 	/**
 	 * Compute the column sum of the given list of groups
 	 *
 	 * @param groups The Groups to sum
 	 * @param res    The result to put the values into
 	 * @param nRows  The number of rows in the groups
 	 * @return The given res list, where the sum of the column groups is added
 	 */
 	public static double[] colSum(Collection<AColGroup> groups, double[] res, int nRows) {
 		for(AColGroup g : groups)
 			g.computeColSums(res, nRows);
 		return res;
 	}

 	/**
 	 * Get the value at a global row/column position.
 	 *
 	 * In general this performs since a binary search of colIndexes is performed for each lookup.
 	 *
 	 * @param r row
 	 * @param c column
 	 * @return value at the row/column position
 	 */
 	public double get(int r, int c) {
 		final int colIdx = _colIndexes.findIndex(c);
 		if(colIdx < 0)
 			return 0;
 		else
 			return getIdx(r, colIdx);
 	}

 	/**
 	 * Get the value at a colGroup specific row/column index position.
 	 *
 	 * @param r      row
 	 * @param colIdx column index in the _colIndexes.
 	 * @return value at the row/column index position
 	 */
 	public abstract double getIdx(int r, int colIdx);

 	/**
 	 * Obtain number of distinct tuples in contained sets of values associated with this column group.
 	 *
 	 * If the column group is uncompressed the number or rows is returned.
 	 *
 	 * @return the number of distinct sets of values associated with the bitmaps in this column group
 	 */
 	public abstract int getNumValues();

 	/**
 	 * Obtain the compression type.
 	 *
 	 * @return How the elements of the column group are compressed.
 	 */
 	public abstract CompressionType getCompType();

 	/**
 	 * Internally get the specific type of ColGroup, this could be extracted from the object but that does not allow for
 	 * nice switches in the code.
 	 *
 	 * @return ColGroupType of the object.
 	 */
 	protected abstract ColGroupType getColGroupType();

 	/**
 	 * Decompress into the DenseBlock. (no NNZ handling)
 	 *
 	 * @param db   Target DenseBlock
 	 * @param rl   Row to start decompression from
 	 * @param ru   Row to end decompression at
 	 * @param offR Row offset into the target to decompress
 	 * @param offC Column offset into the target to decompress
 	 */
 	public abstract void decompressToDenseBlock(DenseBlock db, int rl, int ru, int offR, int offC);

 	/**
 	 * Decompress into the SparseBlock. (no NNZ handling)
 	 *
 	 * Note this method is allowing to calls to append since it is assumed that the sparse column indexes are sorted
 	 * afterwards
 	 *
 	 * @param sb   Target SparseBlock
 	 * @param rl   Row to start decompression from
 	 * @param ru   Row to end decompression at
 	 * @param offR Row offset into the target to decompress
 	 * @param offC Column offset into the target to decompress
 	 */
 	public abstract void decompressToSparseBlock(SparseBlock sb, int rl, int ru, int offR, int offC);

 	/**
 	 * Right matrix multiplication with this column group.
 	 *
 	 * This method can return null, meaning that the output overlapping group would have been empty.
 	 *
 	 * @param right The MatrixBlock on the right of this matrix multiplication
 	 * @return The new Column Group or null that is the result of the matrix multiplication.
 	 */
 	public final AColGroup rightMultByMatrix(MatrixBlock right) {
 		return rightMultByMatrix(right, null);
 	}

 	/**
 	 * Right matrix multiplication with this column group.
 	 *
 	 * This method can return null, meaning that the output overlapping group would have been empty.
 	 *
 	 * @param right   The MatrixBlock on the right of this matrix multiplication
 	 * @param allCols A pre-materialized list of all col indexes, that can be shared across all column groups if use
 	 *                full, can be set to null.
 	 * @return The new Column Group or null that is the result of the matrix multiplication.
 	 */
 	public abstract AColGroup rightMultByMatrix(MatrixBlock right, IColIndex allCols);

 	/**
 	 * Do a transposed self matrix multiplication on the left side t(x) %*% x. but only with this column group.
 	 *
 	 * This gives better performance since there is no need to iterate through all the rows of the matrix, but the
 	 * execution can be limited to its number of distinct values.
 	 *
 	 * Note it only calculate the upper triangle
 	 *
 	 * @param ret   The return matrix block [numColumns x numColumns]
 	 * @param nRows The number of rows in the column group
 	 */
 	public abstract void tsmm(MatrixBlock ret, int nRows);

 	/**
 	 * Left multiply with this column group.
 	 *
 	 * @param matrix The matrix to multiply with on the left
 	 * @param result The result to output the values into, always dense for the purpose of the column groups
 	 *               parallelizing
 	 * @param rl     The row to begin the multiplication from on the lhs matrix
 	 * @param ru     The row to end the multiplication at on the lhs matrix
 	 * @param cl     The column to begin the multiplication from on the lhs matrix
 	 * @param cu     The column to end the multiplication at on the lhs matrix
 	 */
 	public abstract void leftMultByMatrixNoPreAgg(MatrixBlock matrix, MatrixBlock result, int rl, int ru, int cl,
 		int cu);

 	/**
 	 * Left side matrix multiplication with a column group that is transposed.
 	 *
 	 * @param lhs    The left hand side Column group to multiply with, the left hand side should be considered
 	 *               transposed. Also it should be guaranteed that this column group is not empty.
 	 * @param result The result matrix to insert the result of the multiplication into
 	 * @param nRows  Number of rows in the lhs colGroup
 	 */
 	public abstract void leftMultByAColGroup(AColGroup lhs, MatrixBlock result, int nRows);

 	/**
 	 * Matrix multiply with this other column group, but:
 	 *
 	 * 1. Only output upper triangle values.
 	 *
 	 * 2. Multiply both ways with "this" being on the left and on the right.
 	 *
 	 * It should be guaranteed that the input is not the same as the caller of the method.
 	 *
 	 * The second step is achievable by treating the initial multiplied matrix, and adding its values to the correct
 	 * locations in the output.
 	 *
 	 * @param other  The other Column group to multiply with
 	 * @param result The result matrix to put the results into
 	 */
 	public abstract void tsmmAColGroup(AColGroup other, MatrixBlock result);

 	/**
 	 * Perform the specified scalar operation directly on the compressed column group, without decompressing individual
 	 * cells if possible.
 	 *
 	 * @param op operation to perform
 	 * @return version of this column group with the operation applied
 	 */
 	public abstract AColGroup scalarOperation(ScalarOperator op);

 	/**
 	 * Perform a binary row operation.
 	 *
 	 * @param op        The operation to execute
 	 * @param v         The vector of values to apply the values contained should be at least the length of the highest
 	 *                  value in the column index
 	 * @param isRowSafe True if the binary op is applied to an entire zero row and all results are zero
 	 * @return A updated column group with the new values.
 	 */
 	public abstract AColGroup binaryRowOpLeft(BinaryOperator op, double[] v, boolean isRowSafe);

 	/**
 	 * Short hand add operator call on column group to add a row vector to the column group
 	 *
 	 * @param v The vector to add
 	 * @return A new column group where the vector is added.
 	 */
 	public AColGroup addVector(double[] v) {
 		return binaryRowOpRight(new BinaryOperator(Plus.getPlusFnObject(), 1), v, false);
 	}

 	/**
 	 * Perform a binary row operation.
 	 *
 	 * @param op        The operation to execute
 	 * @param v         The vector of values to apply the values contained should be at least the length of the highest
 	 *                  value in the column index
 	 * @param isRowSafe True if the binary op is applied to an entire zero row and all results are zero
 	 * @return A updated column group with the new values.
 	 */
 	public abstract AColGroup binaryRowOpRight(BinaryOperator op, double[] v, boolean isRowSafe);

 	/**
 	 * Unary Aggregate operator, since aggregate operators require new object output, the output becomes an uncompressed
 	 * matrix.
 	 *
 	 * The range of rl to ru only applies to row aggregates. (ReduceCol)
 	 *
 	 * @param op    The operator used
 	 * @param c     The output matrix block
 	 * @param nRows The total number of rows in the Column Group
 	 * @param rl    The starting row to do aggregation from
 	 * @param ru    The last row to do aggregation to (not included)
 	 */
 	public abstract void unaryAggregateOperations(AggregateUnaryOperator op, double[] c, int nRows, int rl, int ru);

 	/**
 	 * Slice out column at specific index of this column group.
 	 *
 	 * It is guaranteed that the column to slice is contained in this columnGroup.
 	 *
 	 * @param idx The column index to slice out.
 	 * @return A new column group containing the columns inside. (never null)
 	 */
 	protected abstract AColGroup sliceSingleColumn(int idx);

 	/**
 	 * Slice range of columns inside this column group.
 	 *
 	 * It is guaranteed that the columns to slice is contained in this columnGroup.
 	 *
 	 * @param idStart    The column index to start at
 	 * @param idEnd      The column index to end at (not included)
 	 * @param outputCols The output columns to extract materialized for ease of implementation
 	 * @return The sliced ColGroup from this. (never null)
 	 */
 	protected abstract AColGroup sliceMultiColumns(int idStart, int idEnd, IColIndex outputCols);

 	/**
 	 * Slice range of rows out of the column group and return a new column group only containing the row segment.
 	 *
 	 * Note that this slice should maintain pointers back to the original dictionaries and only modify index structures.
 	 *
 	 * @param rl The row to start at
 	 * @param ru The row to end at (not included)
 	 * @return A new column group containing the specified row range.
 	 */
 	public abstract AColGroup sliceRows(int rl, int ru);

 	/**
 	 * Short hand method for getting minimum value contained in this column group.
 	 *
 	 * @return The minimum value contained in this ColumnGroup
 	 */
 	public abstract double getMin();

 	/**
 	 * Short hand method for getting maximum value contained in this column group.
 	 *
 	 * @return The maximum value contained in this ColumnGroup
 	 */
 	public abstract double getMax();

 	/**
 	 * Short hand method for getting the sum of this column group
 	 *
 	 * @param nRows The number of rows in the column group
 	 * @return The sum of this column group
 	 */
 	public abstract double getSum(int nRows);

 	/**
 	 * Detect if the column group contains a specific value.
 	 *
 	 * @param pattern The value to look for.
 	 * @return boolean saying true if the value is contained.
 	 */
 	public abstract boolean containsValue(double pattern);

 	/**
 	 * Get the number of nonZeros contained in this column group.
 	 *
 	 * @param nRows The number of rows in the column group, this is used for groups that does not contain information
 	 *              about how many rows they have.
 	 * @return The nnz.
 	 */
 	public abstract long getNumberNonZeros(int nRows);

 	/**
 	 * Make a copy of the column group values, and replace all values that match pattern with replacement value.
 	 *
 	 * @param pattern The value to look for
 	 * @param replace The value to replace the other value with
 	 * @return A new Column Group, reusing the index structure but with new values.
 	 */
 	public abstract AColGroup replace(double pattern, double replace);

 	/**
 	 * Compute the column sum
 	 *
 	 * @param c     The array to add the column sum to.
 	 * @param nRows The number of rows in the column group.
 	 */
 	public abstract void computeColSums(double[] c, int nRows);

 	/**
 	 * Central Moment instruction executed on a column group.
 	 *
 	 * @param op    The Operator to use.
 	 * @param nRows The number of rows contained in the ColumnGroup.
 	 * @return A Central Moment object.
 	 */
 	public abstract CM_COV_Object centralMoment(CMOperator op, int nRows);

 	/**
 	 * Expand the column group to multiple columns. (one hot encode the column group)
 	 *
 	 * @param max    The number of columns to expand to and cutoff values at.
 	 * @param ignore If zero and negative values should be ignored.
 	 * @param cast   If the double values contained should be cast to whole numbers.
 	 * @param nRows  The number of rows in the column group.
 	 * @return A new column group containing max number of columns.
 	 */
 	public abstract AColGroup rexpandCols(int max, boolean ignore, boolean cast, int nRows);

 	/**
 	 * Get the computation cost associated with this column group.
 	 *
 	 * @param e     The computation cost estimator
 	 * @param nRows the number of rows in the column group
 	 * @return The cost of this column group
 	 */
 	public abstract double getCost(ComputationCostEstimator e, int nRows);

 	/**
 	 * Perform unary operation on the column group and return a new column group
 	 *
 	 * @param op The operation to perform
 	 * @return The new column group
 	 */
 	public abstract AColGroup unaryOperation(UnaryOperator op);

 	/**
 	 * Get if the group is only containing zero
 	 *
 	 * @return true if empty
 	 */
 	public abstract boolean isEmpty();

 	/**
 	 * Append the other column group to this column group. This method tries to combine them to return a new column group
 	 * containing both. In some cases it is possible in reasonable time, in others it is not.
 	 *
 	 * The result is first this column group followed by the other column group in higher row values.
 	 *
 	 * If it is not possible or very inefficient null is returned.
 	 *
 	 * @param g The other column group
 	 * @return A combined column group or null
 	 */
 	public abstract AColGroup append(AColGroup g);

 	/**
 	 * Append all column groups in the list provided together in one go allocating the output once.
 	 *
 	 * If it is not possible or very inefficient null is returned.
 	 *
 	 * @param groups The groups to combine.
 	 * @return A combined column group or null
 	 */
 	public static AColGroup appendN(AColGroup[] groups) {
 		return groups[0].appendNInternal(groups);
 	}

 	/**
 	 * Append all column groups in the list provided together with this.
 	 *
 	 * A Important detail is the first entry in the group == this, and should not be appended twice.
 	 *
 	 * If it is not possible or very inefficient null is returned.
 	 *
 	 * @param groups The groups to combine.
 	 * @return A combined column group or null
 	 */
 	protected abstract AColGroup appendNInternal(AColGroup[] groups);

 	/**
 	 * Get the compression scheme for this column group to enable compression of other data.
 	 *
 	 * @return The compression scheme of this column group
 	 */
 	public abstract ICLAScheme getCompressionScheme();

 	/**
 	 * Clear variables that can be recomputed from the allocation of this column group.
 	 */
 	public void clear() {
 		// do nothing
 	}

 	/**
 	 * Recompress this column group into a new column group.
 	 *
 	 * @return A new or the same column group depending on optimization goal.
 	 */
 	public abstract AColGroup recompress();

 	/**
 	 * Recompress this column group into a new column group of the given type.
 	 *
 	 * @param ct The compressionType that the column group should morph into
 	 * @return A new column group
 	 */
 	public AColGroup morph(CompressionType ct) {
 		throw new NotImplementedException();
 	}

 	/**
 	 * Get the compression info for this column group.
 	 *
 	 * @param nRow The number of rows in this column group.
 	 * @return The compression info for this group.
 	 */
 	public abstract CompressedSizeInfoColGroup getCompressionInfo(int nRow);

 	/**
 	 * Combine this column group with another
 	 *
 	 * @param other The other column group to combine with.
 	 * @return A combined representation as a column group.
 	 */
 	public AColGroup combine(AColGroup other) {
 		return CLALibCombineGroups.combine(this, other);
 	}

 	/**
 	 * Get encoding of this column group.
 	 *
 	 * @return The encoding of the index structure.
 	 */
 	public IEncode getEncoding() {
 		throw new NotImplementedException();
 	}

 	public AColGroup sortColumnIndexes() {
 		if(_colIndexes.isSorted())
 			return this;
 		else {
 			int[] reorderingIndex = _colIndexes.getReorderingIndex();
 			IColIndex ni = _colIndexes.sort();
 			return fixColIndexes(ni, reorderingIndex);
 		}
 	}

 	protected abstract AColGroup fixColIndexes(IColIndex newColIndex, int[] reordering);

 	@Override
 	public String toString() {
 		StringBuilder sb = new StringBuilder();
 		sb.append(String.format("%15s", "Type: "));
 		sb.append(this.getClass().getSimpleName());
 		sb.append(String.format("\n%15s", "Columns: "));
 		sb.append(_colIndexes);
 		return sb.toString();
 	}
 }