[SYSTEMDS-373] Fix error in new compression

Remove the Quan ColGroup, made obsolete by the new Dictionary technique.

- Fix Quantile and CentralMoment for DDC ColGroups
- Compression tests: remove redundant tests in Par Compressed Matrix and
  remove dependency on AutomatedTestBase
- Compare matrix now only counts the first 50 discrepancies
- Rounding improvement in Quantized representation (see the sketch after
  this list)
- Commented out some debugging code
- Fix count distinct compressed test: it works, but is not comparable
  to non-lossy count distinct, since the number of distinct values
  can both grow and shrink
- Update Dictionary to avoid copying an already-copied dictionary for
  lossy dictionaries on scalar operations
- Add Dictionary constructor to all ColGroups
- QDictionary: optimize the scalar operator
- Improve scalar operations
- Fix leftMultByRowVector OLE in case of only one value!
- Improve ColSum
- Improvements in performance of unary aggregates
- ColGroupValue fix for single-column ColGroups
- Disallow ColGroup construction with a double array
- Dictionary documentation
- Fixed docs for ColGroupValue
- Lossy Bitmap fix and improved performance
- Revert DMLScript entry file
- Call BitmapLossy for 8-bit bitmaps
- DDC rework: move duplicate code from DDC1 and DDC2 up into DDC;
  furthermore, general fixes and optimizations in tests and
  all ColGroups for better lossy handling
- DDC docs fix
- Common sum and colSum code path without performance degradation
- Fix bug in Unary aggregate where output from uncompressed ColGroup was
  empty
- Move SkipList boolean to CompressionSettings
- Dictionary: correct Override keyword
- ADictionary, not IDictionary
- In sum-all-rows to double, the Dictionary always reuses the double array
- Allow more compression tests to execute
- Parallel scalar operations
- Shallow serialize
- Rename AbstractBitmap to ABitmap
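
A minimal sketch of the 8-bit quantization applied by the lossy bitmap
(mirroring make8BitLossy/scaleValues in BitmapEncoder.java below; the class
name is hypothetical, for illustration only):

    // Sketch of the 8-bit lossy quantization: scale maps the value range
    // onto [-127, 127], and each value is rounded to its nearest level.
    public class Quantize8BitSketch {
        public static void main(String[] args) {
            double[] fp = {0.90, 0.91, 5.0}; // sorted dictionary values
            double min = fp[0], max = fp[fp.length - 1]; // as in Stats
            double scale = Math.max(Math.abs(min), max) / (double) Byte.MAX_VALUE;
            byte[] res = new byte[fp.length];
            for(int i = 0; i < fp.length; i++)
                res[i] = (byte) Math.round(fp[i] / scale);
            // 0.90 and 0.91 both quantize to 23, so their offset lists get
            // merged and the number of distinct values can shrink.
            System.out.println(java.util.Arrays.toString(res)); // [23, 23, 127]
        }
    }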
diff --git a/dev/Tasks-obsolete.txt b/dev/Tasks-obsolete.txt
index aa5afc7..9f157fb 100644
--- a/dev/Tasks-obsolete.txt
+++ b/dev/Tasks-obsolete.txt
@@ -322,6 +322,7 @@
 SYSTEMDS-370 Lossy Compression Blocks
  * 371 ColGroup Quantization                                          OK (Naive Q8)
  * 372 ColGroup Base Data change (from Double to Naive Q8)            OK
+ * 373 Fix error in new compression                                   OK
 
 SYSTEMDS-380 Memory Footprint
  * 381 Matrix Block Memory footprint update
diff --git a/pom.xml b/pom.xml
index 0fa1a1b..5df2fe6 100644
--- a/pom.xml
+++ b/pom.xml
@@ -520,6 +520,7 @@
 								<exclude>.classpath</exclude>
 								<exclude>.project</exclude>
 								<exclude>src/main/python/docs/build/**/*</exclude>
+								<exclude>src/main/python/docs/source/_build/**</exclude>
 								<exclude>docs/api/**/*</exclude>
 								<exclude>docs/_site/**/*</exclude>
 								<exclude>docs/site/run_issues.md</exclude>
diff --git a/src/main/java/org/apache/sysds/runtime/compress/AbstractCompressedMatrixBlock.java b/src/main/java/org/apache/sysds/runtime/compress/AbstractCompressedMatrixBlock.java
index 3277ae4..01b5b9b 100644
--- a/src/main/java/org/apache/sysds/runtime/compress/AbstractCompressedMatrixBlock.java
+++ b/src/main/java/org/apache/sysds/runtime/compress/AbstractCompressedMatrixBlock.java
@@ -251,14 +251,21 @@
 			return super.cmOperations(op);
 		ColGroup grp = _colGroups.get(0);
 		MatrixBlock vals = grp.getValuesAsBlock();
-		if(grp.getIfCountsType()){
-			MatrixBlock counts = ColGroupValue.getCountsAsBlock(grp.getCounts());
-			return vals.cmOperations(op, counts);
+		if(grp instanceof ColGroupValue){
+			int[] counts = ((ColGroupValue) grp).getCounts();
+			return vals.cmOperations(op, getCountsAsBlock(  counts));
 		}else{
 			return vals.cmOperations(op);
 		}
 	}
 
+	private static MatrixBlock getCountsAsBlock(int[] counts) {
+		MatrixBlock ret = new MatrixBlock(counts.length, 1, false);
+		for(int i = 0; i < counts.length; i++)
+			ret.quickSetValue(i, 0, counts[i]);
+		return ret;
+	}
+
 	@Override
 	public CM_COV_Object cmOperations(CMOperator op, MatrixBlock weights) {
 		printDecompressWarning("cmOperations");
@@ -296,12 +303,12 @@
 		if(grp.getIfCountsType() != true)
 			return grp.getValuesAsBlock().sortOperations(right, result);
 
-		if(right == null) {
+		if(right == null && grp instanceof ColGroupValue) {
 			MatrixBlock vals = grp.getValuesAsBlock();
-			int[] counts = grp.getCounts();
+			int[] counts = ((ColGroupValue)grp).getCounts();
 			double[] data = (vals.getDenseBlock() != null) ? vals.getDenseBlockValues() : null;
 			SortUtils.sortByValue(0, vals.getNumRows(), data, counts);
-			MatrixBlock counts2 = ColGroupValue.getCountsAsBlock(counts);
+			MatrixBlock counts2 = getCountsAsBlock(counts);
 			return vals.sortOperations(counts2, result);
 		}
 		else
@@ -504,12 +511,12 @@
 
 	@Override
 	public boolean isShallowSerialize() {
-		return false;
+		return true;
 	}
 
 	@Override
 	public boolean isShallowSerialize(boolean inclConvert) {
-		return false;
+		return true;
 	}
 
 	@Override
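
A note on the new getCountsAsBlock helper above: the counts act as weights,
so a moment computed over the small dictionary matches the moment of the full
column. A minimal standalone illustration (values and counts are made up):

    // Weighted mean over the dictionary equals the mean of the decompressed
    // column, because counts[i] records how often vals[i] occurs.
    public class WeightedMomentSketch {
        public static void main(String[] args) {
            double[] vals = {1.0, 3.0};
            int[] counts = {2, 2}; // the column was [1, 1, 3, 3]
            double sum = 0;
            long n = 0;
            for(int i = 0; i < vals.length; i++) {
                sum += vals[i] * counts[i];
                n += counts[i];
            }
            System.out.println(sum / n); // 2.0, same as the uncompressed mean
        }
    }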
diff --git a/src/main/java/org/apache/sysds/runtime/compress/BitmapEncoder.java b/src/main/java/org/apache/sysds/runtime/compress/BitmapEncoder.java
index a7f3f74..e9be28a 100644
--- a/src/main/java/org/apache/sysds/runtime/compress/BitmapEncoder.java
+++ b/src/main/java/org/apache/sysds/runtime/compress/BitmapEncoder.java
@@ -19,12 +19,27 @@
 
 package org.apache.sysds.runtime.compress;
 
-import org.apache.sysds.runtime.compress.utils.AbstractBitmap;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.HashMap;
+import java.util.Iterator;
+import java.util.LinkedHashMap;
+import java.util.LinkedList;
+import java.util.List;
+import java.util.Map;
+import java.util.Map.Entry;
+import java.util.Queue;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.sysds.runtime.compress.utils.ABitmap;
 import org.apache.sysds.runtime.compress.utils.Bitmap;
 import org.apache.sysds.runtime.compress.utils.BitmapLossy;
 import org.apache.sysds.runtime.compress.utils.DblArray;
 import org.apache.sysds.runtime.compress.utils.DblArrayIntListHashMap;
+import org.apache.sysds.runtime.compress.utils.DblArrayIntListHashMap.DArrayIListEntry;
 import org.apache.sysds.runtime.compress.utils.DoubleIntListHashMap;
+import org.apache.sysds.runtime.compress.utils.DoubleIntListHashMap.DIListEntry;
 import org.apache.sysds.runtime.compress.utils.IntArrayList;
 import org.apache.sysds.runtime.data.SparseBlock;
 import org.apache.sysds.runtime.matrix.data.MatrixBlock;
@@ -34,6 +49,7 @@
  */
 public class BitmapEncoder {
 
+	private static final Log LOG = LogFactory.getLog(BitmapEncoder.class.getName());
 
 	/**
 	 * Generate uncompressed bitmaps for a set of columns in an uncompressed matrix block.
@@ -43,7 +59,7 @@
 	 * @param compSettings The compression settings used for the compression.
 	 * @return uncompressed bitmap representation of the columns
 	 */
-	public static AbstractBitmap extractBitmap(int[] colIndices, MatrixBlock rawBlock,
+	public static ABitmap extractBitmap(int[] colIndices, MatrixBlock rawBlock,
 		CompressionSettings compSettings) {
 		// note: no sparse column selection reader because low potential
 		// single column selection
@@ -62,7 +78,7 @@
 			res = extractBitmap(colIndices, rawBlock, reader);
 		}
 		if(compSettings.lossy) {
-			return BitmapLossy.makeBitmapLossy(res);
+			return makeBitmapLossy(res);
 		}
 		else {
 			return res;
@@ -70,7 +86,9 @@
 	}
 
 	/**
-	 * Extract Bitmap from a single column. It will always skip all zero values. It also counts the instances of zero.
+	 * Extract Bitmap from a single column.
+	 * 
+	 * It counts the instances of zero, but skips storing the values.
 	 * 
 	 * @param colIndex     The index of the column
 	 * @param rawBlock     The Raw matrix block (that can be transposed)
@@ -94,37 +112,15 @@
 				int[] aix = a.indexes(colIndex);
 				double[] avals = a.values(colIndex);
 
-				// IntArrayList lstPtr0 = new IntArrayList(); // for 0 values
-				// int last = -1;
-				// iterate over non-zero entries but fill in zeros
 				for(int j = apos; j < apos + alen; j++) {
-					// fill in zero values
-					// if(!skipZeros)
-					// for(int k = last + 1; k < aix[j]; k++)
-					// lstPtr0.appendValue(k);
-					// handle non-zero value
 					IntArrayList lstPtr = distinctVals.get(avals[j]);
 					if(lstPtr == null) {
 						lstPtr = new IntArrayList();
 						distinctVals.appendValue(avals[j], lstPtr);
 					}
 					lstPtr.appendValue(aix[j]);
-					// last = aix[j];
 				}
-				// fill in remaining zero values
-				// if(!skipZeros) {
-				// for(int k = last + 1; k < m; k++)
-				// lstPtr0.appendValue(k);
-				// if(lstPtr0.size() > 0)
-				// distinctVals.appendValue(0, lstPtr0);
-				// }
 			}
-			// else if(!skipZeros) { // full 0 column
-			// IntArrayList lstPtr = new IntArrayList();
-			// for(int i = 0; i < m; i++)
-			// lstPtr.appendValue(i);
-			// distinctVals.appendValue(0, lstPtr);
-			// }
 		}
 		else // GENERAL CASE
 		{
@@ -145,9 +141,19 @@
 			}
 		}
 
-		return Bitmap.makeBitmap(distinctVals, numZeros);
+		return makeBitmap(distinctVals, numZeros);
 	}
 
+	/**
+	 * Extract Bitmap from multiple columns together.
+	 * 
+	 * It counts the instances of rows containing only zero values; non-zero rows may still contain individual zeros.
+	 * 
+	 * @param colIndices The column indexes to extract the multi-column bitmap from.
+	 * @param rawBlock   The raw block to extract from
+	 * @param rowReader  A Reader for the columns selected.
+	 * @return The Bitmap
+	 */
 	private static Bitmap extractBitmap(int[] colIndices, MatrixBlock rawBlock, ReaderColumnSelection rowReader) {
 		// probe map for distinct items (for value or value groups)
 		DblArrayIntListHashMap distinctVals = new DblArrayIntListHashMap();
@@ -168,7 +174,277 @@
 			lstPtr.appendValue(rowReader.getCurrentRowIndex());
 		}
 
-		return Bitmap.makeBitmap(distinctVals, colIndices.length, zero);
+		return makeBitmap(distinctVals, colIndices.length, zero);
 	}
 
+	/**
+	 * Make the multi column Bitmap.
+	 * 
+	 * @param distinctVals The distinct values found in the selected columns.
+	 * @param numColumns   Number of columns
+	 * @param numZeros     Number of zero rows, i.e. rows containing only zero values.
+	 * @return The Bitmap.
+	 */
+	private static Bitmap makeBitmap(DblArrayIntListHashMap distinctVals, int numColumns, int numZeros) {
+		// added for one pass bitmap construction
+		// Convert inputs to arrays
+		int numVals = distinctVals.size();
+		int numCols = numColumns;
+		double[] values = new double[numVals * numCols];
+		IntArrayList[] offsetsLists = new IntArrayList[numVals];
+		int bitmapIx = 0;
+		for(DArrayIListEntry val : distinctVals.extractValues()) {
+			System.arraycopy(val.key.getData(), 0, values, bitmapIx * numCols, numCols);
+			offsetsLists[bitmapIx++] = val.value;
+		}
+		return new Bitmap(numCols, offsetsLists, numZeros, values);
+	}
+
+	/**
+	 * Make single column bitmap.
+	 * 
+	 * @param distinctVals Distinct values contained in the bitmap, mapping to offsets for locations in the matrix.
+	 * @param numZeros     Number of zero values in the matrix
+	 * @return The single column Bitmap.
+	 */
+	private static Bitmap makeBitmap(DoubleIntListHashMap distinctVals, int numZeros) {
+		// added for one pass bitmap construction
+		// Convert inputs to arrays
+		int numVals = distinctVals.size();
+		double[] values = new double[numVals];
+		IntArrayList[] offsetsLists = new IntArrayList[numVals];
+		int bitmapIx = 0;
+		for(DIListEntry val : distinctVals.extractValues()) {
+			values[bitmapIx] = val.key;
+			offsetsLists[bitmapIx++] = val.value;
+		}
+		return new Bitmap(1, offsetsLists, numZeros, values);
+	}
+
+	/**
+	 * Given a Bitmap, try to make a lossy version of the same bitmap.
+	 * 
+	 * @param ubm The Uncompressed version of the bitmap.
+	 * @return A bitmap.
+	 */
+	private static ABitmap makeBitmapLossy(Bitmap ubm) {
+		final double[] fp = ubm.getValues();
+		if(fp.length == 0) {
+			return ubm;
+		}
+		Stats stats = new Stats(fp);
+		// TODO make better decisions than just an 8 bit encoding.
+		if(Double.isInfinite(stats.max) || Double.isInfinite(stats.min)) {
+			LOG.warn("Defaulting to incompressible colGroup");
+			return ubm;
+		}
+		else {
+			return make8BitLossy(ubm, stats);
+		}
+	}
+
+	/**
+	 * Make the specific 8 bit encoding version of a bitmap.
+	 * 
+	 * @param ubm   The uncompressed Bitmap.
+	 * @param stats The statistics associated with the bitmap.
+	 * @return a lossy bitmap.
+	 */
+	private static BitmapLossy make8BitLossy(Bitmap ubm, Stats stats) {
+		final double[] fp = ubm.getValues();
+		int numCols = ubm.getNumColumns();
+		double scale = Math.max(Math.abs(stats.min), stats.max) / (double) Byte.MAX_VALUE;
+		byte[] scaledValues = scaleValues(fp, scale);
+		if(numCols == 1) {
+			return makeBitmapLossySingleCol(ubm, scaledValues, scale);
+		}
+		else {
+			return makeBitmapLossyMultiCol(ubm, scaledValues, scale);
+		}
+	}
+
+	/**
+	 * Make Single column lossy bitmap.
+	 * 
+	 * This method merges the previous offset lists together to reduce the size.
+	 * 
+	 * @param ubm          The original uncompressed bitmap.
+	 * @param scaledValues The scaled values to map into.
+	 * @param scale        The scale in use.
+	 * @return The Lossy bitmap.
+	 */
+	private static BitmapLossy makeBitmapLossySingleCol(Bitmap ubm, byte[] scaledValues, double scale) {
+
+		// Using Linked Hashmap to preserve the sorted order.
+		Map<Byte, Queue<IntArrayList>> values = new LinkedHashMap<>();
+		Map<Byte, Integer> lengths = new HashMap<>();
+
+		IntArrayList[] fullSizeOffsetsLists = ubm.getOffsetList();
+		int numZeroGroups = ubm.getZeroCounts();
+
+		for(int idx = 0; idx < scaledValues.length; idx++) {
+			if(scaledValues[idx] != 0) { // Throw away zero values.
+				if(values.containsKey(scaledValues[idx])) {
+					values.get(scaledValues[idx]).add(fullSizeOffsetsLists[idx]);
+					lengths.put(scaledValues[idx], lengths.get(scaledValues[idx]) + fullSizeOffsetsLists[idx].size());
+				}
+				else {
+					Queue<IntArrayList> offsets = new LinkedList<IntArrayList>();
+					offsets.add(fullSizeOffsetsLists[idx]);
+					values.put(scaledValues[idx], offsets);
+					lengths.put(scaledValues[idx], fullSizeOffsetsLists[idx].size());
+				}
+			}
+			else {
+				numZeroGroups++;
+			}
+		}
+		byte[] scaledValuesReduced = new byte[values.keySet().size()];
+		IntArrayList[] newOffsetsLists = new IntArrayList[values.keySet().size()];
+		Iterator<Entry<Byte, Queue<IntArrayList>>> x = values.entrySet().iterator();
+		int idx = 0;
+		while(x.hasNext()) {
+			Entry<Byte, Queue<IntArrayList>> ent = x.next();
+			scaledValuesReduced[idx] = ent.getKey().byteValue();
+			Queue<IntArrayList> q = ent.getValue();
+			if(q.size() == 1) {
+				newOffsetsLists[idx] = q.remove();
+			}
+			else {
+				newOffsetsLists[idx] = mergeOffsets(q, new int[lengths.get(ent.getKey())]);
+			}
+			idx++;
+		}
+		return new BitmapLossy(ubm.getNumColumns(), newOffsetsLists, numZeroGroups, scaledValuesReduced, scale);
+	}
+
+	/**
+	 * Multi column instance of makeBitmapLossySingleCol
+	 * 
+	 * @param ubm          The original uncompressed bitmap.
+	 * @param scaledValues The scaled values to map into.
+	 * @param scale        The scale in use.
+	 * @return The Lossy bitmap.
+	 */
+	private static BitmapLossy makeBitmapLossyMultiCol(Bitmap ubm, byte[] scaledValues, double scale) {
+		int numColumns = ubm.getNumColumns();
+		Map<List<Byte>, Queue<IntArrayList>> values = new HashMap<>();
+		Map<List<Byte>, Integer> lengths = new HashMap<>();
+		IntArrayList[] fullSizeOffsetsLists = ubm.getOffsetList();
+		int numZeroGroups = ubm.getZeroCounts();
+		boolean allZero = true;
+		for(int idx = 0; idx < scaledValues.length; idx += numColumns) {
+			List<Byte> array = new ArrayList<>();
+			for(int off = 0; off < numColumns; off++) {
+				allZero = scaledValues[idx + off] == 0 && allZero;
+				array.add(scaledValues[idx + off]);
+			}
+
+			numZeroGroups += allZero ? 1 : 0;
+			if(!allZero) {
+				if(values.containsKey(array)) {
+					values.get(array).add(fullSizeOffsetsLists[idx / numColumns]);
+					lengths.put(array, lengths.get(array) + fullSizeOffsetsLists[idx / numColumns].size());
+				}
+				else {
+					Queue<IntArrayList> offsets = new LinkedList<IntArrayList>();
+					offsets.add(fullSizeOffsetsLists[idx / numColumns]);
+					values.put(array, offsets);
+					lengths.put(array, fullSizeOffsetsLists[idx / numColumns].size());
+				}
+			}
+			allZero = true;
+		}
+
+		byte[] scaledValuesReduced = new byte[values.keySet().size() * numColumns];
+		IntArrayList[] newOffsetsLists = new IntArrayList[values.keySet().size()];
+		Iterator<Entry<List<Byte>, Queue<IntArrayList>>> x = values.entrySet().iterator();
+		int idx = 0;
+		while(x.hasNext()) {
+			Entry<List<Byte>, Queue<IntArrayList>> ent = x.next();
+			List<Byte> key = ent.getKey();
+			int row = idx * numColumns;
+			for(int off = 0; off < numColumns; off++) {
+				scaledValuesReduced[row + off] = key.get(off);
+			}
+			Queue<IntArrayList> q = ent.getValue();
+			newOffsetsLists[idx] = mergeOffsets(q, new int[lengths.get(key)]);
+			idx++;
+		}
+
+		return new BitmapLossy(ubm.getNumColumns(), newOffsetsLists, numZeroGroups, scaledValuesReduced, scale);
+	}
+
+	/**
+	 * Merge method to join together offset lists.
+	 * 
+	 * @param offsets The offsets to join
+	 * @param res     The result int array to put the values into. This has to be allocated to the joined size of all
+	 *                the input offsetLists
+	 * @return The merged offsetList.
+	 */
+	private static IntArrayList mergeOffsets(Queue<IntArrayList> offsets, int[] res) {
+		int indexStart = 0;
+		while(!offsets.isEmpty()) {
+			IntArrayList h = offsets.remove();
+			int[] v = h.extractValues();
+			for(int i = 0; i < h.size(); i++) {
+				res[indexStart++] = v[i];
+			}
+		}
+		Arrays.sort(res);
+		return new IntArrayList(res);
+	}
+
+	/**
+	 * Utility method to scale all the values in the array to byte range
+	 * 
+	 * @param fp    double array to scale
+	 * @param scale the scale to apply
+	 * @return the scaled values in byte
+	 */
+	private static byte[] scaleValues(double[] fp, double scale) {
+		byte[] res = new byte[fp.length];
+		for(int idx = 0; idx < fp.length; idx++) {
+			res[idx] = (byte) (Math.round(fp[idx] / scale));
+		}
+		return res;
+	}
+
+
+	/**
+	 * Statistics class to analyse what compression plan to use.
+	 */
+	private static class Stats {
+		protected double max;
+		protected double min;
+		protected double minDelta;
+		protected double maxDelta;
+		protected boolean sameDelta;
+
+		public Stats(double[] fp) {
+			max = fp[fp.length - 1];
+			min = fp[0];
+			minDelta = Double.POSITIVE_INFINITY;
+			maxDelta = Double.NEGATIVE_INFINITY;
+			sameDelta = true;
+			if(fp.length > 1) {
+
+				double delta = fp[0] - fp[1];
+				for(int i = 0; i < fp.length - 1; i++) {
+					double ndelta = fp[i] - fp[i + 1];
+					if(delta < minDelta) {
+						minDelta = delta;
+					}
+					if(delta > maxDelta) {
+						maxDelta = delta;
+					}
+					// flag deltas as unequal when they differ by more than a relative tolerance
+					if(sameDelta && Math.abs(delta - ndelta) > Math.abs(delta) * 0.00000001) {
+						sameDelta = false;
+					}
+					delta = ndelta;
+				}
+			}
+		}
+	}
 }
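
For reference, the offset-list merge performed by mergeOffsets above behaves
like this standalone sketch (hypothetical class, plain int arrays in place of
IntArrayList):

    import java.util.Arrays;

    // Sketch of mergeOffsets: concatenate the offset lists of values that
    // collide after quantization, then sort to restore ascending row order.
    public class OffsetMergeSketch {
        static int[] merge(int[][] offsetLists, int totalLen) {
            int[] res = new int[totalLen];
            int pos = 0;
            for(int[] lst : offsetLists)
                for(int v : lst)
                    res[pos++] = v;
            Arrays.sort(res); // OLE/RLE encoders require sorted offsets
            return res;
        }

        public static void main(String[] args) {
            // rows of 0.90 and rows of 0.91, which both quantize to 23
            int[][] lists = {{1, 7, 9}, {0, 4}};
            System.out.println(Arrays.toString(merge(lists, 5))); // [0, 1, 4, 7, 9]
        }
    }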
diff --git a/src/main/java/org/apache/sysds/runtime/compress/CompressedMatrixBlock.java b/src/main/java/org/apache/sysds/runtime/compress/CompressedMatrixBlock.java
index 2913e99..d7a99bc 100644
--- a/src/main/java/org/apache/sysds/runtime/compress/CompressedMatrixBlock.java
+++ b/src/main/java/org/apache/sysds/runtime/compress/CompressedMatrixBlock.java
@@ -36,6 +36,7 @@
 import org.apache.commons.lang.NotImplementedException;
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
+import org.apache.sysds.hops.OptimizerUtils;
 import org.apache.sysds.lops.MMTSJ.MMTSJType;
 import org.apache.sysds.lops.MapMultChain.ChainType;
 import org.apache.sysds.runtime.DMLRuntimeException;
@@ -82,9 +83,6 @@
 	/** Threshold for when to parallelize the aggregation functions. */
 	private static final long MIN_PAR_AGG_THRESHOLD = 8 * 1024 * 1024; // 8MB
 
-	protected boolean _lossy;
-	protected boolean _sharedDDC1Dict = false;
-
 	/**
 	 * Constructor for building an empty Compressed Matrix block object.
 	 * 
@@ -122,10 +120,6 @@
 		nonZeros = that.getNonZeros();
 	}
 
-	// public CompressionStatistics getCompressionStatistics() {
-	// return _stats;
-	// }
-
 	public boolean isSingleUncompressedGroup() {
 		return(_colGroups != null && _colGroups.size() == 1 &&
 			_colGroups.get(0).getCompType() == CompressionType.UNCOMPRESSED);
@@ -227,18 +221,6 @@
 		for(ColGroup grp : _colGroups)
 			total += grp.estimateInMemorySize();
 
-		// Correction for shared DDC1 dictionary
-		// TODO Fix DDC Sharing.
-		if(_sharedDDC1Dict) {
-			boolean seenDDC1 = false;
-			for(ColGroup grp : _colGroups)
-				if(grp.getNumCols() == 1 && grp instanceof ColGroupDDC1) {
-					ColGroupDDC1 grpDDC = (ColGroupDDC1) grp;
-					if(seenDDC1)
-						total -= grpDDC.getDictionarySize();
-					seenDDC1 = true;
-				}
-		}
 		return total;
 	}
 
@@ -277,11 +259,9 @@
 	@Override
 	public long getExactSizeOnDisk() {
 		// header information
-		long ret = 22;
+		long ret = 20;
 		for(ColGroup grp : _colGroups) {
 			ret += 1; // type info
-			// TODO: Handle shared dictionary
-
 			ret += grp.getExactSizeOnDisk();
 		}
 		return ret;
@@ -293,9 +273,7 @@
 		rlen = in.readInt();
 		clen = in.readInt();
 		nonZeros = in.readLong();
-		_sharedDDC1Dict = in.readBoolean();
-		_lossy = in.readBoolean();
-		_colGroups = ColGroupIO.readGroups(in, _sharedDDC1Dict);
+		_colGroups = ColGroupIO.readGroups(in);
 	}
 
 	@Override
@@ -304,9 +282,7 @@
 		out.writeInt(rlen);
 		out.writeInt(clen);
 		out.writeLong(nonZeros);
-		out.writeBoolean(_sharedDDC1Dict);
-		out.writeBoolean(_lossy);
-		ColGroupIO.writeGroups(out, _sharedDDC1Dict, _colGroups);
+		ColGroupIO.writeGroups(out, _colGroups);
 	}
 
 	/**
@@ -356,9 +332,6 @@
 		return rnnz;
 	}
 
-	//////////////////////////////////////////
-	// Operations (overwrite existing ops for seamless integration)
-
 	@Override
 	public MatrixBlock scalarOperations(ScalarOperator sop, MatrixValue result) {
 
@@ -371,16 +344,72 @@
 			ret.reset(rlen, clen);
 		}
 
-		// Apply the operation recursively to each of the column groups.
-		// Most implementations will only modify metadata.
-		ArrayList<ColGroup> newColGroups = new ArrayList<>();
-		for(ColGroup grp : _colGroups) {
-			newColGroups.add(grp.scalarOperation(sop));
+		int threads = OptimizerUtils.getConstrainedNumThreads(_colGroups.size());
+
+		if(threads > 1) {
+			ExecutorService pool = CommonThreadPool.get(sop.getNumThreads());
+			ArrayList<ScalarTask> tasks = new ArrayList<>();
+
+			ArrayList<ColGroup> small = new ArrayList<>();
+
+			for(ColGroup grp : _colGroups) {
+				if(grp instanceof ColGroupUncompressed) {
+					ArrayList<ColGroup> uc = new ArrayList<>();
+					uc.add(grp);
+					tasks.add(new ScalarTask(uc, sop));
+				}
+				else {
+					int nv = ((ColGroupValue) grp).getNumValues();
+					if(nv < 256) {
+						small.add(grp);
+					}
+					else {
+						ArrayList<ColGroup> large = new ArrayList<>();
+						large.add(grp);
+						tasks.add(new ScalarTask(large, sop));
+
+					}
+				}
+				if(small.size() > 10) {
+					tasks.add(new ScalarTask(small, sop));
+					small = new ArrayList<>();
+				}
+			}
+			if(small.size() > 0) {
+				tasks.add(new ScalarTask(small, sop));
+			}
+			try {
+				List<Future<List<ColGroup>>> rtasks = pool.invokeAll(tasks);
+				pool.shutdown();
+
+				ArrayList<ColGroup> newColGroups = new ArrayList<>();
+				for(Future<List<ColGroup>> f : rtasks) {
+					for(ColGroup x : f.get()) {
+						newColGroups.add(x);
+					}
+				}
+				ret._colGroups = newColGroups;
+				ret.setNonZeros(rlen * clen);
+			}
+			catch(InterruptedException | ExecutionException e) {
+				LOG.fatal("Parallel scalar operation exception: " + e.getMessage(), e);
+				throw new DMLRuntimeException(e);
+			}
 		}
-		ret._colGroups = newColGroups;
-		ret.setNonZeros(rlen * clen);
+		else {
+
+			// Apply the operation to each of the column groups.
+			// Most implementations will only modify metadata.
+			ArrayList<ColGroup> newColGroups = new ArrayList<>();
+			for(ColGroup grp : _colGroups) {
+				newColGroups.add(grp.scalarOperation(sop));
+			}
+			ret._colGroups = newColGroups;
+			ret.setNonZeros(rlen * clen);
+		}
 
 		return ret;
+
 	}
 
 	@Override
@@ -515,10 +544,10 @@
 
 		// Should not happen that it is a single uncompressed group.
 		// multi-threaded MM of single uncompressed ColGroup
-		if(isSingleUncompressedGroup()) {
-			MatrixBlock tmp = ((ColGroupUncompressed) _colGroups.get(0)).getData();
-			return tmp.aggregateBinaryOperations(this == m1 ? tmp : m1, this == m2 ? tmp : m2, ret, op);
-		}
+		// if(isSingleUncompressedGroup()) {
+		// MatrixBlock tmp = ((ColGroupUncompressed) _colGroups.get(0)).getData();
+		// return tmp.aggregateBinaryOperations(this == m1 ? tmp : m1, this == m2 ? tmp : m2, ret, op);
+		// }
 
 		Timing time = LOG.isDebugEnabled() ? new Timing(true) : null;
 
@@ -534,6 +563,7 @@
 
 		// compute matrix mult
 		if(m1.getNumRows() > 1 && m2.getNumColumns() == 1) { // MV right
+			LOG.debug("Matrix Vector !");
 			CompressedMatrixBlock cmb = (CompressedMatrixBlock) m1;
 			if(op.getNumThreads() > 1)
 				cmb.rightMultByVector(m2, ret, op.getNumThreads());
@@ -541,6 +571,7 @@
 				cmb.rightMultByVector(m2, ret);
 		}
 		else if(m1.getNumRows() == 1 && m2.getNumColumns() > 1) { // MV left
+			LOG.debug("Vector Matrix");
 			if(op.getNumThreads() > 1)
 				leftMultByVectorTranspose(_colGroups, m1, ret, false, op.getNumThreads());
 			else
@@ -548,6 +579,7 @@
 		}
 		else { // MM
 				// prepare the other input (including decompression if necessary)
+			LOG.debug("Matrix Matrix");
 			boolean right = (m1 == this);
 			MatrixBlock that = right ? m2 : m1;
 			that = that instanceof CompressedMatrixBlock ? ((CompressedMatrixBlock) that).decompress() : that;
@@ -562,6 +594,7 @@
 			MatrixBlock tmpIn = new MatrixBlock(1, that.getNumColumns(), false).allocateBlock();
 			MatrixBlock tmpOut = new MatrixBlock(right ? rl : 1, right ? 1 : cl, false).allocateBlock();
 			if(right) { // MM right
+				LOG.debug("MM right");
 				for(int i = 0; i < that.getNumRows(); i++) { // on transpose
 					tmpIn = that.slice(i, i, 0, that.getNumColumns() - 1, tmpIn);
 					MatrixBlock tmpIn2 = LibMatrixReorg.transpose(tmpIn, // meta data op
@@ -575,6 +608,8 @@
 				}
 			}
 			else { // MM left
+
+				LOG.debug("MM left");
 				for(int i = 0; i < that.getNumRows(); i++) {
 					tmpIn = that.slice(i, i, 0, that.getNumColumns() - 1, tmpIn);
 					if(op.getNumThreads() > 1)
@@ -604,8 +639,6 @@
 			throw new NotImplementedException("Unary aggregate " + op.aggOp.increOp.fn + " not supported yet.");
 		}
 
-		Timing time = LOG.isDebugEnabled() ? new Timing(true) : null;
-
 		// prepare output dimensions
 		CellIndex tempCellIndex = new CellIndex(-1, -1);
 		op.indexFn.computeDimension(rlen, clen, tempCellIndex);
@@ -637,11 +670,21 @@
 		MatrixBlock ret = (MatrixBlock) result;
 		ret.allocateDenseBlock();
 
-		// special handling init value for rowmins/rowmax
-		if(op.indexFn instanceof ReduceCol && op.aggOp.increOp.fn instanceof Builtin) {
-			double val = (((Builtin) op.aggOp.increOp.fn)
-				.getBuiltinCode() == BuiltinCode.MAX) ? Double.NEGATIVE_INFINITY : Double.POSITIVE_INFINITY;
-			ret.getDenseBlock().set(val);
+		if(op.aggOp.increOp.fn instanceof Builtin) {
+			Double val = null;
+			switch(((Builtin) op.aggOp.increOp.fn).getBuiltinCode()) {
+				case MAX:
+					val = Double.NEGATIVE_INFINITY;
+					break;
+				case MIN:
+					val = Double.POSITIVE_INFINITY;
+					break;
+				default:
+					break;
+			}
+			if(val != null) {
+				ret.getDenseBlock().set(val);
+			}
 		}
 
 		// core unary aggregate
@@ -690,15 +733,16 @@
 					}
 				}
 			}
-			catch(Exception ex) {
-				throw new DMLRuntimeException(ex);
+			catch(InterruptedException | ExecutionException e) {
+				LOG.fatal("UnaryAggregate Exception: " + e.getMessage(), e);
+				throw new DMLRuntimeException(e);
 			}
 		}
 		else {
 			// process UC column group
 			for(ColGroup grp : _colGroups)
 				if(grp instanceof ColGroupUncompressed)
-					grp.unaryAggregateOperations(op, ret);
+					((ColGroupUncompressed) grp).unaryAggregateOperations(op, ret);
 
 			// process OLE/RLE column groups
 			aggregateUnaryOperations(op, _colGroups, ret, 0, rlen);
@@ -722,9 +766,6 @@
 		// post-processing
 		ret.recomputeNonZeros();
 
-		if(LOG.isDebugEnabled())
-			LOG.debug("Compressed uagg k=" + op.getNumThreads() + " in " + time.stop());
-
 		return ret;
 	}
 
@@ -743,7 +784,6 @@
 		// && ColGroupOffset.ALLOW_CACHE_CONSCIOUS_ROWSUMS && ru - rl > CompressionSettings.BITMAP_BLOCK_SZ;
 
 		// process cache-conscious DDC1 groups (adds to output)
-		// TODO: Fix such that is is able to sharing even if ColGroupDDC2
 		// if(cacheDDC1) {
 		// ArrayList<ColGroupDDC1> tmp = new ArrayList<>();
 		// for(ColGroup grp : groups)
@@ -756,9 +796,15 @@
 
 		// process remaining groups (adds to output)
 		// note: UC group never passed into this function
+		double[] c = ret.getDenseBlockValues();
+		if(c == null) {
+			c = ret.getSparseBlock().values(0);
+			// throw new RuntimeException("aggregateUnaryOperation failed to materialize matrix data");
+		}
 		for(ColGroup grp : groups)
 			if(!(grp instanceof ColGroupUncompressed) && !(cacheDDC1 && grp instanceof ColGroupDDC1))
-				((ColGroup) grp).unaryAggregateOperations(op, ret, rl, ru);
+				grp.unaryAggregateOperations(op, c, rl, ru);
+		// LOG.debug(Arrays.toString(c));
 	}
 
 	@Override
@@ -827,7 +873,6 @@
 				pool.shutdown();
 			}
 			catch(InterruptedException | ExecutionException e) {
-				LOG.error(e.getMessage());
 				throw new DMLRuntimeException(e);
 			}
 
@@ -871,11 +916,11 @@
 
 		// multi-threaded execution of all groups
 		try {
-			ColGroupUncompressed uc = getUncompressedColGroup();
+			// ColGroupUncompressed uc = getUncompressedColGroup();
 
 			// compute uncompressed column group in parallel
-			if(uc != null)
-				uc.rightMultByVector(vector, result, k);
+			// if(uc != null)
+			// uc.rightMultByVector(vector, result, k);
 
 			// compute remaining compressed column groups in parallel
 			ExecutorService pool = CommonThreadPool.get(k);
@@ -894,42 +939,43 @@
 				lnnz += tmp.get();
 			result.setNonZeros(lnnz);
 		}
-		catch(Exception ex) {
-			throw new DMLRuntimeException(ex);
+		catch(InterruptedException | ExecutionException e) {
+			LOG.fatal(e);
+			throw new DMLRuntimeException(e);
 		}
 	}
 
 	private static void rightMultByVector(List<ColGroup> groups, MatrixBlock vect, MatrixBlock ret, int rl, int ru) {
 		ColGroupValue.setupThreadLocalMemory(getMaxNumValues(groups));
 
-		boolean cacheDDC1 = ru - rl > CompressionSettings.BITMAP_BLOCK_SZ * 2;
+		// boolean cacheDDC1 = ru - rl > CompressionSettings.BITMAP_BLOCK_SZ * 2;
 
 		// process uncompressed column group (overwrites output)
 		// if(inclUC) {
-		// for(ColGroup grp : groups)
-		// if(grp instanceof ColGroupUncompressed)
-		// grp.rightMultByVector(vect, ret, rl, ru);
-		// }
+		for(ColGroup grp : groups) {
+			if(grp instanceof ColGroupUncompressed)
+				((ColGroupUncompressed) grp).rightMultByVector(vect, ret, rl, ru);
+		}
 
 		// process cache-conscious DDC1 groups (adds to output)
 
-		if(cacheDDC1) {
-			ArrayList<ColGroupDDC1> tmp = new ArrayList<>();
-			for(ColGroup grp : groups)
-				if(grp instanceof ColGroupDDC1)
-					tmp.add((ColGroupDDC1) grp);
-			if(!tmp.isEmpty())
-				ColGroupDDC1.rightMultByVector(tmp.toArray(new ColGroupDDC1[0]), vect, ret, rl, ru);
-		}
+		// if(cacheDDC1) {
+		// ArrayList<ColGroupDDC1> tmp = new ArrayList<>();
+		// for(ColGroup grp : groups)
+		// if(grp instanceof ColGroupDDC1)
+		// tmp.add((ColGroupDDC1) grp);
+		// if(!tmp.isEmpty())
+		// ColGroupDDC1.rightMultByVector(tmp.toArray(new ColGroupDDC1[0]), vect, ret, rl, ru);
+		// }
 		// process remaining groups (adds to output)
-
+		double[] values = ret.getDenseBlockValues();
 		for(ColGroup grp : groups) {
-			if(!(cacheDDC1 && grp instanceof ColGroupDDC1)) {
-
-				grp.rightMultByVector(vect, ret, rl, ru);
-
+			if(!(grp instanceof ColGroupUncompressed)) {
+				// if(!(cacheDDC1 && grp instanceof ColGroupDDC1)) {
+				grp.rightMultByVector(vect, values, rl, ru);
 			}
 		}
+		// LOG.warn(Arrays.toString(values));
 
 		ColGroupValue.cleanupThreadLocalMemory();
 
@@ -1006,23 +1052,28 @@
 		// multi-threaded execution
 		try {
 			// compute uncompressed column group in parallel
-			ColGroupUncompressed uc = getUncompressedColGroup();
-			if(uc != null)
-				uc.leftMultByRowVector(rowVector, result, k);
+			// ColGroupUncompressed uc = getUncompressedColGroup();
+			// if(uc != null)
+			// uc.leftMultByRowVector(rowVector, result, k);
 
 			// compute remaining compressed column groups in parallel
-			ExecutorService pool = CommonThreadPool.get(Math.min(colGroups.size() - ((uc != null) ? 1 : 0), k));
-			ArrayList<ColGroup>[] grpParts = createStaticTaskPartitioning(4 * k, false);
+			ExecutorService pool = CommonThreadPool.get(Math.min(colGroups.size(), k));
+			ArrayList<ColGroup>[] grpParts = createStaticTaskPartitioning(4 * k, true);
 			ArrayList<LeftMatrixMultTask> tasks = new ArrayList<>();
 			for(ArrayList<ColGroup> groups : grpParts)
 				tasks.add(new LeftMatrixMultTask(groups, rowVector, result));
-			List<Future<Object>> ret = pool.invokeAll(tasks);
+			List<Future<Object>> ret;
+
+			ret = pool.invokeAll(tasks);
+
 			pool.shutdown();
 			for(Future<Object> tmp : ret)
-				tmp.get(); // error handling
+				tmp.get();
+
 		}
-		catch(Exception ex) {
-			throw new DMLRuntimeException(ex);
+		catch(InterruptedException | ExecutionException e) {
+			e.printStackTrace();
+			throw new DMLRuntimeException(e);
 		}
 
 		// post-processing
@@ -1123,13 +1174,6 @@
 		return null;
 	}
 
-	// private static boolean containsUncompressedColGroup(List<ColGroup> groups) {
-	// 	for(ColGroup grp : groups)
-	// 		if(grp instanceof ColGroupUncompressed)
-	// 			return true;
-	// 	return false;
-	// }
-
 	private static class LeftMatrixMultTask implements Callable<Object> {
 		private final ArrayList<ColGroup> _groups;
 		private final MatrixBlock _vect;
@@ -1144,13 +1188,17 @@
 		@Override
 		public Object call() {
 			// setup memory pool for reuse
-			ColGroupValue.setupThreadLocalMemory(getMaxNumValues(_groups));
+			try {
+				ColGroupValue.setupThreadLocalMemory(getMaxNumValues(_groups));
+				// delegate matrix-vector operation to each column group
+				for(ColGroup grp : _groups)
+					grp.leftMultByRowVector(_vect, _ret);
 
-			// delegate matrix-vector operation to each column group
-			for(ColGroup grp : _groups)
-				grp.leftMultByRowVector(_vect, _ret);
-
-			ColGroupValue.cleanupThreadLocalMemory();
+				ColGroupValue.cleanupThreadLocalMemory();
+			}
+			catch(Exception e) {
+				throw new DMLRuntimeException(e);
+			}
 			return null;
 		}
 	}
@@ -1172,8 +1220,14 @@
 
 		@Override
 		public Long call() {
-			rightMultByVector(_groups, _vect, _ret, _rl, _ru);
-			return _ret.recomputeNonZeros(_rl, _ru - 1, 0, 0);
+			try {
+				rightMultByVector(_groups, _vect, _ret, _rl, _ru);
+				return _ret.recomputeNonZeros(_rl, _ru - 1, 0, 0);
+			}
+			catch(Exception e) {
+				LOG.fatal(e);
+				throw new DMLRuntimeException(e);
+			}
 		}
 	}
 
@@ -1271,6 +1325,25 @@
 		}
 	}
 
+	private static class ScalarTask implements Callable<List<ColGroup>> {
+		private final List<ColGroup> _colGroups;
+		private final ScalarOperator _sop;
+
+		protected ScalarTask(List<ColGroup> colGroups, ScalarOperator sop) {
+			_colGroups = colGroups;
+			_sop = sop;
+		}
+
+		@Override
+		public List<ColGroup> call() {
+			List<ColGroup> res = new ArrayList<ColGroup>();
+			for(ColGroup x : _colGroups) {
+				res.add(x.scalarOperation(_sop));
+			}
+			return res;
+		}
+	}
+
 	/**
 	 * Calculates the Aligned block size if the block is a certain length.
 	 * 
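
The parallel scalarOperations above batches cheap column groups and isolates
expensive ones. A condensed, standalone sketch of that partitioning policy
(thresholds 256 and 10 taken from the patch; group cost reduced to a plain int):

    import java.util.ArrayList;
    import java.util.List;

    // Sketch of the scalarOperations task partitioning: groups with few
    // distinct values are batched into shared tasks, large groups each get
    // a dedicated task, mirroring the thresholds used in the patch.
    public class ScalarPartitionSketch {
        static List<List<Integer>> partition(List<Integer> numValuesPerGroup) {
            List<List<Integer>> tasks = new ArrayList<>();
            List<Integer> small = new ArrayList<>();
            for(Integer nv : numValuesPerGroup) {
                if(nv < 256)
                    small.add(nv); // cheap: batch together
                else
                    tasks.add(List.of(nv)); // expensive: dedicated task
                if(small.size() > 10) { // flush a full batch
                    tasks.add(small);
                    small = new ArrayList<>();
                }
            }
            if(!small.isEmpty())
                tasks.add(small);
            return tasks;
        }
    }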
diff --git a/src/main/java/org/apache/sysds/runtime/compress/CompressedMatrixBlockFactory.java b/src/main/java/org/apache/sysds/runtime/compress/CompressedMatrixBlockFactory.java
index 0cbd8af..fde7769 100644
--- a/src/main/java/org/apache/sysds/runtime/compress/CompressedMatrixBlockFactory.java
+++ b/src/main/java/org/apache/sysds/runtime/compress/CompressedMatrixBlockFactory.java
@@ -19,10 +19,7 @@
 
 package org.apache.sysds.runtime.compress;
 
-import java.util.HashMap;
-import java.util.HashSet;
 import java.util.List;
-import java.util.Map.Entry;
 
 import org.apache.commons.lang3.tuple.ImmutablePair;
 import org.apache.commons.lang3.tuple.Pair;
@@ -31,11 +28,7 @@
 import org.apache.sysds.runtime.DMLRuntimeException;
 import org.apache.sysds.runtime.compress.cocode.PlanningCoCoder;
 import org.apache.sysds.runtime.compress.colgroup.ColGroup;
-import org.apache.sysds.runtime.compress.colgroup.ColGroup.CompressionType;
-import org.apache.sysds.runtime.compress.colgroup.ColGroupDDC1;
 import org.apache.sysds.runtime.compress.colgroup.ColGroupFactory;
-import org.apache.sysds.runtime.compress.colgroup.Dictionary;
-import org.apache.sysds.runtime.compress.colgroup.DictionaryShared;
 import org.apache.sysds.runtime.compress.estim.CompressedSizeEstimator;
 import org.apache.sysds.runtime.compress.estim.CompressedSizeEstimatorFactory;
 import org.apache.sysds.runtime.compress.estim.CompressedSizeInfo;
@@ -155,13 +148,13 @@
 
 		// --------------------------------------------------
 		// PHASE 4: Best-effort dictionary sharing for DDC1 single-col groups
-		Dictionary dict = (!(compSettings.validCompressions.contains(CompressionType.DDC)) ||
-			!(compSettings.allowSharedDDCDictionary)) ? null : createSharedDDC1Dictionary(colGroupList);
-		if(dict != null) {
-			applySharedDDC1Dictionary(colGroupList, dict);
-			res._sharedDDC1Dict = true;
-		}
-		_stats.setNextTimePhase(time.stop());
+		// Dictionary dict = (!(compSettings.validCompressions.contains(CompressionType.DDC)) ||
+		// 	!(compSettings.allowSharedDDCDictionary)) ? null : createSharedDDC1Dictionary(colGroupList);
+		// if(dict != null) {
+		// 	applySharedDDC1Dictionary(colGroupList, dict);
+		// 	res._sharedDDC1Dict = true;
+		// }
+		// _stats.setNextTimePhase(time.stop());
 		if(LOG.isDebugEnabled()) {
 			LOG.debug("--compression phase 4: " + _stats.getLastTimePhase());
 		}
@@ -193,8 +186,6 @@
 		LOG.debug("--compressed size: " + _stats.size);
 		LOG.debug("--compression ratio: " + _stats.ratio);
 
-		res._lossy = compSettings.lossy;
-
 		return new ImmutablePair<>(res, _stats);
 		// --------------------------------------------------
 	}
@@ -205,60 +196,60 @@
 	 * @param colGroups The List of all ColGroups.
 	 * @return the shared value list for the DDC ColGroups.
 	 */
-	private static Dictionary createSharedDDC1Dictionary(List<ColGroup> colGroups) {
-		// create joint dictionary
-		HashSet<Double> vals = new HashSet<>();
-		HashMap<Integer, Double> mins = new HashMap<>();
-		HashMap<Integer, Double> maxs = new HashMap<>();
-		int numDDC1 = 0;
-		for(final ColGroup grp : colGroups)
-			if(grp.getNumCols() == 1 && grp instanceof ColGroupDDC1) {
-				final ColGroupDDC1 grpDDC1 = (ColGroupDDC1) grp;
-				final double[] values = grpDDC1.getValues();
-				double min = Double.POSITIVE_INFINITY;
-				double max = Double.NEGATIVE_INFINITY;
-				for(int i = 0; i < values.length; i++) {
-					vals.add(values[i]);
-					min = Math.min(min, values[i]);
-					max = Math.max(max, values[i]);
-				}
-				mins.put(grpDDC1.getColIndex(0), min);
-				maxs.put(grpDDC1.getColIndex(0), max);
-				numDDC1++;
-			}
+	// private static Dictionary createSharedDDC1Dictionary(List<ColGroup> colGroups) {
+	// 	// create joint dictionary
+	// 	HashSet<Double> vals = new HashSet<>();
+	// 	HashMap<Integer, Double> mins = new HashMap<>();
+	// 	HashMap<Integer, Double> maxs = new HashMap<>();
+	// 	int numDDC1 = 0;
+	// 	for(final ColGroup grp : colGroups)
+	// 		if(grp.getNumCols() == 1 && grp instanceof ColGroupDDC1) {
+	// 			final ColGroupDDC1 grpDDC1 = (ColGroupDDC1) grp;
+	// 			final double[] values = grpDDC1.getValues();
+	// 			double min = Double.POSITIVE_INFINITY;
+	// 			double max = Double.NEGATIVE_INFINITY;
+	// 			for(int i = 0; i < values.length; i++) {
+	// 				vals.add(values[i]);
+	// 				min = Math.min(min, values[i]);
+	// 				max = Math.max(max, values[i]);
+	// 			}
+	// 			mins.put(grpDDC1.getColIndex(0), min);
+	// 			maxs.put(grpDDC1.getColIndex(0), max);
+	// 			numDDC1++;
+	// 		}
 
-		// abort shared dictionary creation if empty or too large
-		int maxSize = vals.contains(0d) ? 256 : 255;
-		if(numDDC1 < 2 || vals.size() > maxSize)
-			return null;
+	// 	// abort shared dictionary creation if empty or too large
+	// 	int maxSize = vals.contains(0d) ? 256 : 255;
+	// 	if(numDDC1 < 2 || vals.size() > maxSize)
+	// 		return null;
 
-		// build consolidated shared dictionary
-		double[] values = vals.stream().mapToDouble(Double::doubleValue).toArray();
-		int[] colIndexes = new int[numDDC1];
-		double[] extrema = new double[2 * numDDC1];
-		int pos = 0;
-		for(Entry<Integer, Double> e : mins.entrySet()) {
-			colIndexes[pos] = e.getKey();
-			extrema[2 * pos] = e.getValue();
-			extrema[2 * pos + 1] = maxs.get(e.getKey());
-			pos++;
-		}
-		return new DictionaryShared(values, colIndexes, extrema);
-	}
+	// 	// build consolidated shared dictionary
+	// 	double[] values = vals.stream().mapToDouble(Double::doubleValue).toArray();
+	// 	int[] colIndexes = new int[numDDC1];
+	// 	double[] extrema = new double[2 * numDDC1];
+	// 	int pos = 0;
+	// 	for(Entry<Integer, Double> e : mins.entrySet()) {
+	// 		colIndexes[pos] = e.getKey();
+	// 		extrema[2 * pos] = e.getValue();
+	// 		extrema[2 * pos + 1] = maxs.get(e.getKey());
+	// 		pos++;
+	// 	}
+	// 	return new DictionaryShared(values, colIndexes, extrema);
+	// }
 
-	private static void applySharedDDC1Dictionary(List<ColGroup> colGroups, Dictionary dict) {
-		// create joint mapping table
-		HashMap<Double, Integer> map = new HashMap<>();
-		double[] values = dict.getValues();
-		for(int i = 0; i < values.length; i++)
-			map.put(values[i], i);
+	// private static void applySharedDDC1Dictionary(List<ColGroup> colGroups, Dictionary dict) {
+	// 	// create joint mapping table
+	// 	HashMap<Double, Integer> map = new HashMap<>();
+	// 	double[] values = dict.getValues();
+	// 	for(int i = 0; i < values.length; i++)
+	// 		map.put(values[i], i);
 
-		// recode data of all relevant DDC1 groups
-		for(ColGroup grp : colGroups)
-			if(grp.getNumCols() == 1 && grp instanceof ColGroupDDC1) {
-				ColGroupDDC1 grpDDC1 = (ColGroupDDC1) grp;
-				grpDDC1.recodeData(map);
-				grpDDC1.setDictionary(dict);
-			}
-	}
+	// 	// recode data of all relevant DDC1 groups
+	// 	for(ColGroup grp : colGroups)
+	// 		if(grp.getNumCols() == 1 && grp instanceof ColGroupDDC1) {
+	// 			ColGroupDDC1 grpDDC1 = (ColGroupDDC1) grp;
+	// 			grpDDC1.recodeData(map);
+	// 			grpDDC1.setDictionary(dict);
+	// 		}
+	// }
 }
diff --git a/src/main/java/org/apache/sysds/runtime/compress/CompressionSettings.java b/src/main/java/org/apache/sysds/runtime/compress/CompressionSettings.java
index 0e0a017..901e883 100644
--- a/src/main/java/org/apache/sysds/runtime/compress/CompressionSettings.java
+++ b/src/main/java/org/apache/sysds/runtime/compress/CompressionSettings.java
@@ -19,7 +19,7 @@
 
 package org.apache.sysds.runtime.compress;
 
-import java.util.Set;
+import java.util.EnumSet;
 
 import org.apache.sysds.runtime.compress.cocode.PlanningCoCoder.PartitionerType;
 import org.apache.sysds.runtime.compress.colgroup.ColGroup.CompressionType;
@@ -46,12 +46,8 @@
 	 */
 	public final double samplingRatio;
 
-	/**
-	 * Share DDC Dictionaries between ColGroups.
-	 * 
-	 * TODO Fix The DDC dictionary sharing.
-	 */
-	public final boolean allowSharedDDCDictionary;
+	/** Share DDC Dictionaries between ColGroups. */
+	public final boolean allowSharedDictionary;
 
 	/**
 	 * Transpose input matrix, to optimize performance, this reallocate the matrix to a more cache conscious allocation
@@ -59,6 +55,12 @@
 	 */
 	public final boolean transposeInput;
 
+	/**
+	 * Boolean specifying if OLE and RLE should construct a skip list to enable skipping large numbers of rows.
+	 * (Optimization)
+	 */
+	public final boolean skipList;
+
 	/** If the seed is -1 then the system used system millisecond time and class hash for seeding. */
 	public final int seed;
 
@@ -78,14 +80,16 @@
 	 * Valid Compressions List, containing the ColGroup CompressionTypes that are allowed to be used for the compression
 	 * Default is to always allow the Uncompressed ColGroup.
 	 */
-	public final Set<CompressionType> validCompressions;
+	public final EnumSet<CompressionType> validCompressions;
 
-	protected CompressionSettings(double samplingRatio, boolean allowSharedDDCDictionary, boolean transposeInput,
-		int seed, boolean investigateEstimate, boolean lossy, Set<CompressionType> validCompressions,
-		boolean sortValuesByLength, PartitionerType columnPartitioner, int maxStaticColGroupCoCode) {
+	protected CompressionSettings(double samplingRatio, boolean allowSharedDictionary, boolean transposeInput,
+		boolean skipList, int seed, boolean investigateEstimate, boolean lossy,
+		EnumSet<CompressionType> validCompressions, boolean sortValuesByLength, PartitionerType columnPartitioner,
+		int maxStaticColGroupCoCode) {
 		this.samplingRatio = samplingRatio;
-		this.allowSharedDDCDictionary = allowSharedDDCDictionary;
+		this.allowSharedDictionary = allowSharedDictionary;
 		this.transposeInput = transposeInput;
+		this.skipList = skipList;
 		this.seed = seed;
 		this.investigateEstimate = investigateEstimate;
 		this.validCompressions = validCompressions;
@@ -100,7 +104,7 @@
 		StringBuilder sb = new StringBuilder();
 		sb.append("\n" + super.toString());
 		sb.append("\n Valid Compressions: " + validCompressions);
-		sb.append("\n DDC1 share dict: " + allowSharedDDCDictionary);
+		sb.append("\n DDC1 share dict: " + allowSharedDictionary);
 		sb.append("\n Partitioner: " + columnPartitioner);
 		sb.append("\n Lossy: " + lossy);
 		// If needed for debugging add more fields to the printing.
diff --git a/src/main/java/org/apache/sysds/runtime/compress/CompressionSettingsBuilder.java b/src/main/java/org/apache/sysds/runtime/compress/CompressionSettingsBuilder.java
index 1abe605..02620d0 100644
--- a/src/main/java/org/apache/sysds/runtime/compress/CompressionSettingsBuilder.java
+++ b/src/main/java/org/apache/sysds/runtime/compress/CompressionSettingsBuilder.java
@@ -31,8 +31,9 @@
  */
 public class CompressionSettingsBuilder {
 	private double samplingRatio = 1.0;
-	private boolean allowSharedDDCDictionary = false;
+	private boolean allowSharedDictionary = false;
 	private boolean transposeInput = true;
+	private boolean skipList = true;
 	private int seed = -1;
 	private boolean investigateEstimate = false;
 	private boolean lossy = false;
@@ -46,8 +47,9 @@
 		DMLConfig conf = ConfigurationManager.getDMLConfig();
 		this.lossy = conf.getBooleanValue(DMLConfig.COMPRESSED_LOSSY);
 		this.validCompressions = EnumSet.of(CompressionType.UNCOMPRESSED);
-		String[] validCompressionsString = conf.getTextValue(DMLConfig.COMPRESSED_VALID_COMPRESSIONS).split(",");;
-		for(String comp:  validCompressionsString){
+		String[] validCompressionsString = conf.getTextValue(DMLConfig.COMPRESSED_VALID_COMPRESSIONS).split(",");
+		for(String comp : validCompressionsString) {
 			validCompressions.add(CompressionType.valueOf(comp));
 		}
 	}
@@ -60,7 +62,7 @@
 	 */
 	public CompressionSettingsBuilder copySettings(CompressionSettings that) {
 		this.samplingRatio = that.samplingRatio;
-		this.allowSharedDDCDictionary = that.allowSharedDDCDictionary;
+		this.allowSharedDictionary = that.allowSharedDictionary;
 		this.transposeInput = that.transposeInput;
 		this.seed = that.seed;
 		this.investigateEstimate = that.investigateEstimate;
@@ -105,11 +107,11 @@
 	/**
 	 * Allow the Dictionaries to be shared between different column groups.
 	 * 
-	 * @param allowSharedDDCDictionary A boolean specifying if the dictionary can be shared between column groups.
+	 * @param allowSharedDictionary A boolean specifying if the dictionary can be shared between column groups.
 	 * @return The CompressionSettingsBuilder
 	 */
-	public CompressionSettingsBuilder setAllowSharedDDCDictionary(boolean allowSharedDDCDictionary) {
-		this.allowSharedDDCDictionary = allowSharedDDCDictionary;
+	public CompressionSettingsBuilder setAllowSharedDictionary(boolean allowSharedDictionary) {
+		this.allowSharedDictionary = allowSharedDictionary;
 		return this;
 	}
 
@@ -126,6 +128,18 @@
 	}
 
 	/**
+	 * Specify if the offset list encodings should utilize skip lists. This increases the compressed size but improves
+	 * performance of the offset encodings, OLE and RLE.
+	 * 
+	 * @param skipList a boolean specifying if the skiplist function is enabled
+	 * @return The CompressionSettingsBuilder
+	 */
+	public CompressionSettingsBuilder setSkipList(boolean skipList) {
+		this.skipList = skipList;
+		return this;
+	}
+
+	/**
 	 * Set the seed for the compression operation.
 	 * 
 	 * @param seed The seed used in sampling the matrix and general operations in the compression.
@@ -212,8 +226,8 @@
 	 * @return The CompressionSettings
 	 */
 	public CompressionSettings create() {
-		return new CompressionSettings(samplingRatio, allowSharedDDCDictionary, transposeInput, seed,
+		return new CompressionSettings(samplingRatio, allowSharedDictionary, transposeInput, skipList, seed,
 			investigateEstimate, lossy, validCompressions, sortValuesByLength, columnPartitioner,
 			maxStaticColGroupCoCode);
 	}
-}
\ No newline at end of file
+}
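
With the renamed builder method, constructing settings now reads as follows
(a usage sketch; only methods visible in this patch are shown):

    CompressionSettings cs = new CompressionSettingsBuilder()
        .setAllowSharedDictionary(false) // renamed from setAllowSharedDDCDictionary
        .setSkipList(true)               // new: skip lists for OLE/RLE offset encodings
        .setSeed(42)
        .create();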
diff --git a/src/main/java/org/apache/sysds/runtime/compress/ReaderColumnSelection.java b/src/main/java/org/apache/sysds/runtime/compress/ReaderColumnSelection.java
index 60d9c5b..79547b8 100644
--- a/src/main/java/org/apache/sysds/runtime/compress/ReaderColumnSelection.java
+++ b/src/main/java/org/apache/sysds/runtime/compress/ReaderColumnSelection.java
@@ -21,14 +21,13 @@
 
 import org.apache.sysds.runtime.compress.utils.DblArray;
 
-/**
- * Base class for all column selection readers.
- */
+/** Base class for all column selection readers. */
 public abstract class ReaderColumnSelection {
 	protected int[] _colIndexes = null;
 	protected int _numRows = -1;
 	protected int _lastRow = -1;
-	// protected boolean _skipZeros = false;
+
+	private DblArray nonZeroReturn;
 
 	protected CompressionSettings _compSettings;
 
@@ -36,7 +35,6 @@
 		_colIndexes = colIndexes;
 		_numRows = numRows;
 		_lastRow = -1;
-		// _skipZeros = skipZeros;
 		_compSettings = compSettings;
 	}
 
@@ -45,7 +43,13 @@
 	 * 
 	 * @return next row
 	 */
-	public abstract DblArray nextRow();
+	public DblArray nextRow() {
+		while((nonZeroReturn = getNextRow()) != null && DblArray.isZero(nonZeroReturn)) {
+		}
+		return nonZeroReturn;
+	}
+
+	protected abstract DblArray getNextRow();
 
 	public int getCurrentRowIndex() {
 		return _lastRow;
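
The refactor above turns nextRow into a template method: the base class loops
getNextRow and skips all-zero rows, so subclasses only produce rows. A
hypothetical minimal subclass for illustration:

    // Hypothetical reader over an in-memory double[][], showing that a
    // subclass now only implements getNextRow; zero skipping is inherited.
    class ArrayReader extends ReaderColumnSelection {
        private final double[][] _rows;

        protected ArrayReader(double[][] rows, int[] colIndexes, CompressionSettings cs) {
            super(colIndexes, rows.length, cs);
            _rows = rows;
        }

        @Override
        protected DblArray getNextRow() {
            if(_lastRow == _numRows - 1)
                return null;
            _lastRow++;
            return new DblArray(_rows[_lastRow]);
        }
    }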
diff --git a/src/main/java/org/apache/sysds/runtime/compress/ReaderColumnSelectionDense.java b/src/main/java/org/apache/sysds/runtime/compress/ReaderColumnSelectionDense.java
index cae285f..d07b863 100644
--- a/src/main/java/org/apache/sysds/runtime/compress/ReaderColumnSelectionDense.java
+++ b/src/main/java/org/apache/sysds/runtime/compress/ReaderColumnSelectionDense.java
@@ -25,8 +25,6 @@
 public class ReaderColumnSelectionDense extends ReaderColumnSelection {
 	protected MatrixBlock _data;
 
-	// reusable return
-	private DblArray nonZeroReturn;
 	private DblArray reusableReturn;
 	private double[] reusableArr;
 
@@ -37,19 +35,8 @@
 		reusableReturn = new DblArray(reusableArr);
 	}
 
-	@Override
-	public DblArray nextRow() {
-		// if(_skipZeros) {
-		while((nonZeroReturn = getNextRow()) != null && DblArray.isZero(nonZeroReturn)) {
-		}
-		return nonZeroReturn;
-		// }
-		// else {
-		// return getNextRow();
-		// }
-	}
 
-	private DblArray getNextRow() {
+	protected DblArray getNextRow() {
 		if(_lastRow == _numRows - 1)
 			return null;
 		_lastRow++;
diff --git a/src/main/java/org/apache/sysds/runtime/compress/ReaderColumnSelectionDenseSample.java b/src/main/java/org/apache/sysds/runtime/compress/ReaderColumnSelectionDenseSample.java
index 2ab76ce..bb314f2 100644
--- a/src/main/java/org/apache/sysds/runtime/compress/ReaderColumnSelectionDenseSample.java
+++ b/src/main/java/org/apache/sysds/runtime/compress/ReaderColumnSelectionDenseSample.java
@@ -22,10 +22,7 @@
 import org.apache.sysds.runtime.compress.utils.DblArray;
 import org.apache.sysds.runtime.matrix.data.MatrixBlock;
 
-/**
- * 
- * considers only a subset of row indexes
- */
+/** considers only a subset of row indexes */
 public class ReaderColumnSelectionDenseSample extends ReaderColumnSelection {
 	protected MatrixBlock _data;
 
@@ -33,32 +30,19 @@
 	private int lastIndex = -1;
 
 	// reusable return
-	private DblArray nonZeroReturn;
 	private DblArray reusableReturn;
 	private double[] reusableArr;
 
 	public ReaderColumnSelectionDenseSample(MatrixBlock data, int[] colIndexes, int[] sampleIndexes,
-		 CompressionSettings compSettings) {
-		super(colIndexes, -1,  compSettings);
+		CompressionSettings compSettings) {
+		super(colIndexes, -1, compSettings);
 		_data = data;
 		_sampleIndexes = sampleIndexes;
 		reusableArr = new double[colIndexes.length];
 		reusableReturn = new DblArray(reusableArr);
 	}
 
-	@Override
-	public DblArray nextRow() {
-		// if(_skipZeros) {
-			while((nonZeroReturn = getNextRow()) != null && DblArray.isZero(nonZeroReturn)) {
-			}
-			return nonZeroReturn;
-		// }
-		// else {
-			// return getNextRow();
-		// }
-	}
-
-	private DblArray getNextRow() {
+	protected DblArray getNextRow() {
 		if(lastIndex == _sampleIndexes.length - 1)
 			return null;
 		lastIndex++;
@@ -73,9 +57,4 @@
 	public int getCurrentRowIndex() {
 		return _sampleIndexes[lastIndex];
 	}
-
-	@Override
-	public void reset() {
-		lastIndex = -1;
-	}
 }
diff --git a/src/main/java/org/apache/sysds/runtime/compress/ReaderColumnSelectionSparse.java b/src/main/java/org/apache/sysds/runtime/compress/ReaderColumnSelectionSparse.java
index ddf124c..606c58a 100644
--- a/src/main/java/org/apache/sysds/runtime/compress/ReaderColumnSelectionSparse.java
+++ b/src/main/java/org/apache/sysds/runtime/compress/ReaderColumnSelectionSparse.java
@@ -32,8 +32,6 @@
  * zero-value in a sparse matrix like any other value.
  */
 public class ReaderColumnSelectionSparse extends ReaderColumnSelection {
-	private final DblArray ZERO_DBL_ARRAY;
-	private DblArray nonZeroReturn;
 
 	// reusable return
 	private DblArray reusableReturn;
@@ -45,9 +43,8 @@
 
 	public ReaderColumnSelectionSparse(MatrixBlock data, int[] colIndexes, CompressionSettings compSettings) {
 		super(colIndexes, compSettings.transposeInput ? data.getNumColumns() : data.getNumRows(), compSettings);
-		ZERO_DBL_ARRAY = new DblArray(new double[colIndexes.length], true);
 		reusableArr = new double[colIndexes.length];
-		reusableReturn = new DblArray(reusableArr);
+		reusableReturn = null;
 
 		if(!_compSettings.transposeInput) {
 			throw new RuntimeException("SparseColumnSelectionReader should not be used without transposed input.");
@@ -60,19 +57,7 @@
 				sparseCols[i] = data.getSparseBlock().get(colIndexes[i]);
 	}
 
-	@Override
-	public DblArray nextRow() {
-		// if(_skipZeros) {
-			while((nonZeroReturn = getNextRow()) != null && nonZeroReturn == ZERO_DBL_ARRAY) {
-			}
-			return nonZeroReturn;
-		// }
-		// else {
-			// return getNextRow();
-		// }
-	}
-
-	private DblArray getNextRow() {
+	protected DblArray getNextRow() {
 		if(_lastRow == _numRows - 1)
 			return null;
 		_lastRow++;
@@ -98,6 +83,6 @@
 				zeroResult = false;
 			}
 
-		return zeroResult ? ZERO_DBL_ARRAY : reusableReturn;
+		return zeroResult ? null : reusableReturn;
 	}
 }
diff --git a/src/main/java/org/apache/sysds/runtime/compress/colgroup/ADictionary.java b/src/main/java/org/apache/sysds/runtime/compress/colgroup/ADictionary.java
new file mode 100644
index 0000000..efbb40e
--- /dev/null
+++ b/src/main/java/org/apache/sysds/runtime/compress/colgroup/ADictionary.java
@@ -0,0 +1,199 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.sysds.runtime.compress.colgroup;
+
+import java.io.DataInput;
+import java.io.DataOutput;
+import java.io.IOException;
+
+import org.apache.sysds.runtime.functionobjects.Builtin;
+import org.apache.sysds.runtime.functionobjects.KahanFunction;
+import org.apache.sysds.runtime.instructions.cp.KahanObject;
+import org.apache.sysds.runtime.matrix.operators.ScalarOperator;
+
+/**
+ * This dictionary class aims to encapsulate the storage and operations over unique floating point values of a column
+ * group.
+ */
+public abstract class ADictionary {
+
+	/**
+	 * Get all the values contained in the dictionary as a linearized double array.
+	 * 
+	 * @return linearized double array
+	 */
+	public abstract double[] getValues();
+
+	/**
+	 * Get a specific value contained in the dictionary at the given index.
+	 * 
+	 * @param i The index to extract the value from
+	 * @return The value contained at the index
+	 */
+	public abstract double getValue(int i);
+
+	/**
+	 * Determines if the dictionary contains a zero tuple, meaning a value tuple where all column values are zero. This
+	 * is useful to find out if the dictionary is used in a dense context, which allows some operations to be optimized.
+	 * 
+	 * @param ncol The number of columns in the dictionary.
+	 * @return The index at which the zero tuple is located, or -1 if no zero tuple exists.
+	 */
+	public abstract int hasZeroTuple(int ncol);
+
+	/**
+	 * Returns the memory usage of the dictionary.
+	 * 
+	 * @return a long value in number of bytes for the dictionary.
+	 */
+	public abstract long getInMemorySize();
+
+	/**
+	 * Aggregate all the contained values, useful in value only computations where the operation is iterating through
+	 * all values contained in the dictionary.
+	 * 
+	 * @param init The initial value; in cases such as max, this could be -infinity
+	 * @param fn   The Function to apply to values
+	 * @return The aggregated value as a double.
+	 */
+	public abstract double aggregate(double init, Builtin fn);
+
+	/**
+	 * Returns the number of values contained in the dictionary, i.e., the length of the linearized value array.
+	 * 
+	 * @return the number of stored values.
+	 */
+	public abstract int getValuesLength();
+
+	/**
+	 * Applies the scalar operation on the dictionary. Note that this operation modifies the underlying data, and
+	 * normally requires a copy of the original dictionary to preserve the old values.
+	 * 
+	 * @param op The operator to apply to the dictionary values.
+	 * @return this dictionary with modified values.
+	 */
+	public abstract ADictionary apply(ScalarOperator op);
+
+	/**
+	 * Applies the scalar operation on the dictionary and appends a new value tuple. The returned dictionary contains a
+	 * new instance of the underlying data, so the previous object is not modified.
+	 * 
+	 * @param op      The operator to apply to the dictionary values.
+	 * @param newVal  The value to append to the dictionary.
+	 * @param numCols The number of columns stored in the dictionary.
+	 * @return Another dictionary with modified values.
+	 */
+	public abstract ADictionary applyScalarOp(ScalarOperator op, double newVal, int numCols);
+
+	/**
+	 * Returns a deep clone of the dictionary.
+	 * 
+	 * @return A deep clone of this dictionary.
+	 */
+	public abstract ADictionary clone();
+
+	/**
+	 * Aggregates the columns into the target double array provided.
+	 * 
+	 * @param c          The target double array; it spans the full number of columns, therefore the colIndexes of
+	 *                   this specific dictionary are needed.
+	 * @param fn         The function to apply to individual columns
+	 * @param colIndexes The mapping to the target columns from the individual columns
+	 */
+	public void aggregateCols(double[] c, Builtin fn, int[] colIndexes) {
+		int ncol = colIndexes.length;
+		int vlen = getValuesLength() / ncol;
+		for(int k = 0; k < vlen; k++)
+			for(int j = 0, valOff = k * ncol; j < ncol; j++)
+				c[colIndexes[j]] = fn.execute(c[colIndexes[j]], getValue(valOff + j));
+	}
+
+	/**
+	 * The read function to instantiate the dictionary.
+	 * 
+	 * @param in    The data input source to read the stored dictionary from
+	 * @param lossy Boolean specifying if the dictionary stored was lossy.
+	 * @return The concrete dictionary.
+	 * @throws IOException if the reading source throws it.
+	 */
+	public static ADictionary read(DataInput in, boolean lossy) throws IOException {
+		return lossy ? QDictionary.read(in) : Dictionary.read(in);
+	}
+
+	/**
+	 * Write the dictionary to a DataOutput.
+	 * 
+	 * @param out the output sink to write the dictionary to.
+	 * @throws IOException if the sink fails.
+	 */
+	public abstract void write(DataOutput out) throws IOException;
+
+	/**
+	 * Calculate the space consumption if the dictionary is stored on disk.
+	 * 
+	 * @return the long count of bytes to store the dictionary.
+	 */
+	public abstract long getExactSizeOnDisk();
+
+	/**
+	 * Get the number of value tuples, given that the column group has ncol columns.
+	 * 
+	 * @param ncol The number of columns in the ColumnGroup.
+	 * @return the number of value tuples contained in the dictionary.
+	 */
+	public abstract int getNumberOfValues(int ncol);
+
+	/**
+	 * Pre-aggregates each value tuple in the dictionary to a single double value.
+	 * 
+	 * Note that if the number of columns is one, the dictionary's actual values are simply returned.
+	 * 
+	 * @param kplus     The function to apply to each value in the rows
+	 * @param kbuff     The buffer to use to aggregate the value.
+	 * @param nrColumns The number of columns in the ColGroup to know how to get the values from the dictionary.
+	 * @return a double array containing the row sums from this dictionary.
+	 */
+	protected abstract double[] sumAllRowsToDouble(KahanFunction kplus, KahanObject kbuff, int nrColumns);
+
+	/**
+	 * Sum the values at a specific row.
+	 * 
+	 * @param k         The row index to sum
+	 * @param kplus     The operator to use
+	 * @param kbuff     The buffer to aggregate inside.
+	 * @param nrColumns The number of columns
+	 * @return The sum of the row.
+	 */
+	protected abstract double sumRow(int k, KahanFunction kplus, KahanObject kbuff, int nrColumns);
+
+	/**
+	 * Aggregates the column sums into the target array, scaling each value tuple by its occurrence count.
+	 * 
+	 * @param c          The target column sum array, indexed by global column index.
+	 * @param counts     The count of occurrences of each value tuple.
+	 * @param colIndexes The mapping from local to global column indexes.
+	 * @param kplus      The Kahan plus function to aggregate with.
+	 */
+	protected abstract void colSum(double[] c, int[] counts, int[] colIndexes, KahanFunction kplus);
+
+	/**
+	 * Sums all values in the dictionary, scaling each value tuple by its occurrence count.
+	 * 
+	 * @param counts The count of occurrences of each value tuple.
+	 * @param ncol   The number of columns.
+	 * @param kplus  The Kahan plus function to aggregate with.
+	 * @return The sum.
+	 */
+	protected abstract double sum(int[] counts, int ncol, KahanFunction kplus);
+}
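
ADictionary stores value tuples back to back in one linearized array, so tuple k of an ncol-column group occupies positions [k * ncol, (k + 1) * ncol). A self-contained illustration of the aggregateCols traversal over that layout, using plain arrays and java.util.function instead of the SystemDS Builtin object:

    import java.util.function.DoubleBinaryOperator;

    public class DictAggregateDemo {
    	// Mirrors ADictionary.aggregateCols: fold every tuple entry into the
    	// output slot of its column; colIndexes maps local to global columns.
    	static void aggregateCols(double[] dict, int[] colIndexes, double[] c,
    		DoubleBinaryOperator fn) {
    		int ncol = colIndexes.length;
    		int vlen = dict.length / ncol; // number of value tuples
    		for(int k = 0; k < vlen; k++)
    			for(int j = 0, valOff = k * ncol; j < ncol; j++)
    				c[colIndexes[j]] = fn.applyAsDouble(c[colIndexes[j]], dict[valOff + j]);
    	}

    	public static void main(String[] args) {
    		// two columns (global indexes 1 and 3), three value tuples
    		double[] dict = {1, 2, 4, 5, 7, 8};
    		double[] c = new double[4];
    		java.util.Arrays.fill(c, Double.NEGATIVE_INFINITY);
    		aggregateCols(dict, new int[] {1, 3}, c, Math::max);
    		System.out.println(java.util.Arrays.toString(c));
    		// -> [-Infinity, 7.0, -Infinity, 8.0]
    	}
    }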
diff --git a/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroup.java b/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroup.java
index 582e769..a3f6781 100644
--- a/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroup.java
+++ b/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroup.java
@@ -44,14 +44,14 @@
 	/**
 	 * Public Group types supported
 	 * 
-	 * Note For instance DDC is called DDC not DDC1, or DDC2 which is a specific subtype of the DDC.
+	 * Note that, for instance, DDC is reported as DDC, not DDC1 or DDC2, which are specific subtypes of DDC. That
+	 * differentiation is hidden from the user.
+	 * 
+	 * Includes UNCOMPRESSED for sparse/dense representation, RLE for run-length encoding, OLE for offset-list encoding,
+	 * and DDC for dense dictionary encoding.
 	 */
 	public enum CompressionType {
-		UNCOMPRESSED, // uncompressed sparse/dense
-		RLE, // RLE bitmap
-		OLE, // OLE bitmap
-		DDC, // Dictionary encoding
-		QUAN, // Quantize the double values to short
+		UNCOMPRESSED, RLE, OLE, DDC,
 	}
 
 	/**
@@ -60,30 +60,25 @@
 	 * Protected such that outside the ColGroup package it should be unknown which specific subtype is used.
 	 */
 	protected enum ColGroupType {
-		UNCOMPRESSED, // uncompressed sparse/dense
-		RLE, // RLE bitmap
-		OLE, // OLE bitmap
-		DDC1, // DDC Small Dictionary
-		DDC2, // DDC Large Dictionary
-		QUAN8S, // Qunatized Value.
+		UNCOMPRESSED, RLE, OLE, DDC1, DDC2,
 	}
 
 	/** The 0-offset column indexes contained in the ColGroup */
 	protected int[] _colIndexes;
 
-	/**
-	 * ColGroup Implementation Contains zero values NOTE This variable is moved here becuse that reduce the Object size
-	 * by 8
-	 */
-	protected boolean _zeros;
-	protected boolean _lossy;
-
 	/** Number of rows in the matrix, for use by child classes. */
 	protected int _numRows;
 
 	/**
-	 * Empty constructor, used for serializing into an empty new object of ColGroup.
+	 * True if the ColGroup contains zero rows. Note this does not mean it contains a zero value. If false, the stored
+	 * values fill the entire ColGroup, making it a dense representation that can be leveraged in operations.
 	 */
+	protected boolean _zeros;
+
+	/** boolean specifying if the column group is encoded lossy */
+	protected boolean _lossy;
+
+	/** Empty constructor, used for serializing into an empty new object of ColGroup. */
 	protected ColGroup() {
 		this._colIndexes = null;
 		this._numRows = -1;
@@ -208,35 +203,13 @@
 	public abstract void write(DataOutput out) throws IOException;
 
 	/**
-	 * Serializes column group to data output.
-	 * 
-	 * @param out      data output
-	 * @param skipDict skip shared dictionary
-	 * @throws IOException if IOException occurs
-	 */
-	public void write(DataOutput out, boolean skipDict) throws IOException {
-		write(out); // skipDict ignored by default
-	}
-
-	/**
-	 * Deserializes column group from data input.
+	 * Deserialize column group from data input.
 	 * 
 	 * @param in data input
 	 * @throws IOException if IOException occurs
 	 */
 	public abstract void readFields(DataInput in) throws IOException;
 
-	// /**
-	//  * Deserializes column group from data input.
-	//  * 
-	//  * @param in       data input
-	//  * @param skipDict skip shared dictionary
-	//  * @throws IOException if IOException occurs
-	//  */
-	// public void readFields(DataInput in, boolean skipDict) throws IOException {
-	// 	readFields(in); // skipDict ignored by default
-	// }
-
 	/**
 	 * Returns the exact serialized size of column group. This can be used for example for buffer preallocation.
 	 * 
@@ -254,14 +227,6 @@
 	public abstract double get(int r, int c);
 
 	/**
-	 * Multiply the slice of the matrix that this column group represents by a vector on the right. Get the number of
-	 * values. contained inside the ColGroup.
-	 * 
-	 * @return value at the row/column position
-	 */
-	// public abstract long getValuesSize();
-
-	/**
 	 * Get all the values in the colGroup. Note that this is only the stored values not the way they are stored. Making
 	 * the output a list of values used in that colGroup not the actual full column.
 	 * 
@@ -286,31 +251,14 @@
 	public abstract boolean getIfCountsType();
 
 	/**
-	 * Returns the counts of values inside the MatrixBlock returned in getValuesAsBlock Throws an exception if the
-	 * getIfCountsType is false
-	 * 
-	 * @return the count of each value in the MatrixBlock.
-	 */
-	public abstract int[] getCounts();
-
-	/**
-	 * Returns the counts of values inside the MatrixBlock returned in getValuesAsBlock Throws an exception if the
-	 * getIfCountsType is false
-	 * 
-	 * @param includeZero Boolean to specify if zero should be included in the count.
-	 * @return the count of each value in the MatrixBlock.
-	 */
-	// public abstract int[] getCounts(boolean includeZero);
-
-	/**
 	 * Multiply the slice of the matrix that this column group represents by a vector on the right.
 	 * 
 	 * @param vector vector to multiply by (tall vector)
-	 * @param result accumulator for holding the result
+	 * @param c      accumulator for holding the result
 	 * @param rl     row lower
 	 * @param ru     row upper
 	 */
-	public abstract void rightMultByVector(MatrixBlock vector, MatrixBlock result, int rl, int ru);
+	public abstract void rightMultByVector(MatrixBlock vector, double[] c, int rl, int ru);
 
 	/**
 	 * Multiply the slice of the matrix that this column group represents by a row vector on the left (the original
@@ -321,9 +269,6 @@
 	 */
 	public abstract void leftMultByRowVector(MatrixBlock vector, MatrixBlock result);
 
-	// additional vector-matrix multiplication to avoid DDC uncompression
-	// public abstract void leftMultByRowVector(ColGroupDDC vector, MatrixBlock result);
-
 	/**
 	 * Perform the specified scalar operation directly on the compressed column group, without decompressing individual
 	 * cells if possible.
@@ -337,21 +282,21 @@
 	 * Unary Aggregate operator, since aggregate operators require new object output, the output becomes an uncompressed
 	 * matrix.
 	 * 
-	 * @param op     The operator used
-	 * @param result Rhe output matrix block.
+	 * @param op The operator used
+	 * @param c  The output matrix block.
 	 */
-	public abstract void unaryAggregateOperations(AggregateUnaryOperator op, MatrixBlock result);
+	public abstract void unaryAggregateOperations(AggregateUnaryOperator op, double[] c);
 
 	/**
 	 * Unary Aggregate operator, since aggregate operators require new object output, the output becomes an uncompressed
 	 * matrix.
 	 * 
-	 * @param op     The operator used
-	 * @param result The output matrix block.
-	 * @param rl     The Starting Row to do aggregation from
-	 * @param ru     The last Row to do aggregation to (not included)
+	 * @param op The operator used
+	 * @param c  The output matrix block.
+	 * @param rl The starting row to aggregate from
+	 * @param ru The last row to aggregate to (not included)
 	 */
-	public abstract void unaryAggregateOperations(AggregateUnaryOperator op, MatrixBlock result, int rl, int ru);
+	public abstract void unaryAggregateOperations(AggregateUnaryOperator op, double[] c, int rl, int ru);
 
 	/**
 	 * Create a column group iterator for a row index range.
@@ -393,6 +338,7 @@
 
 	/**
 	 * Is Lossy
+	 * 
 	 * @return returns if the ColGroup is compressed in a lossy manner.
 	 */
 	public abstract boolean isLossy();
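
The switch from MatrixBlock outputs to raw double[] buffers with an explicit [rl, ru) row range is what enables the parallel aggregates and multiplies mentioned in the commit message: each thread gets the same preallocated output and a disjoint row slice. A sketch of that dispatch pattern under an illustrative Group interface (not the real ColGroup API):

    import java.util.ArrayList;
    import java.util.List;
    import java.util.concurrent.Callable;
    import java.util.concurrent.ExecutorService;
    import java.util.concurrent.Executors;

    public class RowRangeDispatchDemo {
    	interface Group {
    		void rightMultByVector(double[] vector, double[] c, int rl, int ru);
    	}

    	static void parallelRightMult(List<Group> groups, double[] vector, double[] c,
    		int nRows, int k) throws Exception {
    		ExecutorService pool = Executors.newFixedThreadPool(k);
    		List<Callable<Object>> tasks = new ArrayList<>();
    		int blk = (nRows + k - 1) / k;
    		for(int rl = 0; rl < nRows; rl += blk) {
    			final int lo = rl, hi = Math.min(rl + blk, nRows);
    			tasks.add(() -> {
    				// disjoint row ranges write to disjoint slices of c: no locking needed
    				for(Group g : groups)
    					g.rightMultByVector(vector, c, lo, hi);
    				return null;
    			});
    		}
    		pool.invokeAll(tasks);
    		pool.shutdown();
    	}
    }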
diff --git a/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupConverter.java b/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupConverter.java
index a3b434a..9e7d68e 100644
--- a/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupConverter.java
+++ b/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupConverter.java
@@ -43,21 +43,21 @@
 		// create copy of column group
 		if(group instanceof ColGroupUncompressed) {
 			ColGroupUncompressed in = (ColGroupUncompressed) group;
-			ret = new ColGroupUncompressed(colIndices, in.getNumRows(), in.getData());
+			ret = new ColGroupUncompressed(colIndices, in._numRows, in.getData());
 		}
 		else if(group instanceof ColGroupRLE) {
 			ColGroupRLE in = (ColGroupRLE) group;
-			ret = new ColGroupRLE(colIndices, in.getNumRows(), in.hasZeros(), in.getValues(), in.getBitmaps(),
+			ret = new ColGroupRLE(colIndices, in._numRows, in.hasZeros(), in._dict, in.getBitmaps(),
 				in.getBitmapOffsets());
 		}
 		else if(group instanceof ColGroupOLE) {
 			ColGroupOLE in = (ColGroupOLE) group;
-			ret = new ColGroupOLE(colIndices, in.getNumRows(), in.hasZeros(), in.getValues(), in.getBitmaps(),
+			ret = new ColGroupOLE(colIndices, in._numRows, in.hasZeros(), in._dict, in.getBitmaps(),
 				in.getBitmapOffsets());
 		}
 		else if(group instanceof ColGroupDDC1) {
 			ColGroupDDC1 in = (ColGroupDDC1) group;
-			ret = new ColGroupDDC1(colIndices, in.getNumRows(), in.getValues(), in.getData());
+			ret = new ColGroupDDC1(colIndices, in._numRows, in._dict, in.getData(), in._zeros);
 		}
 		else {
 			throw new RuntimeException("Using '" + group.getClass() + "' instance of ColGroup not fully supported");
diff --git a/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupDDC.java b/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupDDC.java
index 993cff7..5116c74 100644
--- a/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupDDC.java
+++ b/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupDDC.java
@@ -22,14 +22,11 @@
 import java.util.Arrays;
 import java.util.Iterator;
 
-import org.apache.commons.lang.NotImplementedException;
 import org.apache.sysds.runtime.compress.CompressionSettings;
-import org.apache.sysds.runtime.compress.utils.AbstractBitmap;
-import org.apache.sysds.runtime.data.DenseBlock;
+import org.apache.sysds.runtime.compress.utils.ABitmap;
 import org.apache.sysds.runtime.functionobjects.Builtin;
 import org.apache.sysds.runtime.functionobjects.KahanFunction;
 import org.apache.sysds.runtime.functionobjects.KahanPlus;
-import org.apache.sysds.runtime.functionobjects.KahanPlusSq;
 import org.apache.sysds.runtime.instructions.cp.KahanObject;
 import org.apache.sysds.runtime.matrix.data.IJV;
 import org.apache.sysds.runtime.matrix.data.MatrixBlock;
@@ -37,47 +34,40 @@
 /**
  * Class to encapsulate information about a column group that is encoded with dense dictionary encoding (DDC).
  * 
- * NOTE: zero values are included at position 0 in the value dictionary, which simplifies various operations such as
- * counting the number of non-zeros.
  */
 public abstract class ColGroupDDC extends ColGroupValue {
 	private static final long serialVersionUID = -3204391646123465004L;
 
-	@Override
+	protected ColGroupDDC() {
+		super();
+	}
+
+	protected ColGroupDDC(int[] colIndices, int numRows, ABitmap ubm, CompressionSettings cs) {
+		super(colIndices, numRows, ubm, cs);
+	}
+
+	protected ColGroupDDC(int[] colIndices, int numRows, ADictionary dict) {
+		super(colIndices, numRows, dict);
+	}
+
+	@Override
 	public CompressionType getCompType() {
 		return CompressionType.DDC;
 	}
 
-	public ColGroupDDC() {
-		super();
-	}
-
-	protected ColGroupDDC(int[] colIndices, int numRows, AbstractBitmap ubm, CompressionSettings cs) {
-		super(colIndices, numRows, ubm, cs);
-	}
-
-	protected ColGroupDDC(int[] colIndices, int numRows, double[] values) {
-		super(colIndices, numRows, values);
-	}
-
 	@Override
 	public void decompressToBlock(MatrixBlock target, int rl, int ru) {
-		double[] dictionary = getValues();
-		for(int i = rl; i < ru; i++) {
-			for(int colIx = 0; colIx < _colIndexes.length; colIx++) {
-				int col = _colIndexes[colIx];
-				double cellVal = getData(i, colIx, dictionary);
-				target.quickSetValue(i, col, cellVal);
-			}
-		}
+		int ncol = getNumCols();
+		double[] values = getValues();
+		for(int i = rl; i < ru; i++)
+			for(int j = 0; j < ncol; j++)
+				target.appendValue(i, _colIndexes[j], getData(i, j, values));
 	}
 
 	@Override
 	public void decompressToBlock(MatrixBlock target, int[] colIndexTargets) {
-		int nrow = getNumRows();
 		int ncol = getNumCols();
 		double[] dictionary = getValues();
-		for(int i = 0; i < nrow; i++) {
+		for(int i = 0; i < _numRows; i++) {
 			for(int colIx = 0; colIx < ncol; colIx++) {
 				int origMatrixColIx = getColIndex(colIx);
 				int col = colIndexTargets[origMatrixColIx];
@@ -89,12 +79,20 @@
 
 	@Override
 	public void decompressToBlock(MatrixBlock target, int colpos) {
-		throw new NotImplementedException("Old Function Not In use");
-		// int nrow = getNumRows();
-		// for(int i = 0; i < nrow; i++) {
-		// double cellVal = getData(i, colpos);
-		// target.quickSetValue(i, 0, cellVal);
-		// }
+		int ncol = getNumCols();
+		final int numVals = getNumValues();
+		double[] c = target.getDenseBlockValues();
+		double[] values = getValues();
+		int nnz = 0;
+		for(int i = 0; i < _numRows; i++) {
+			int index = getIndex(i);
+			// an index equal to the number of value tuples marks the implicit zero tuple
+			if(index != numVals) {
+				nnz += ((c[i] = values[index * ncol + colpos]) != 0) ? 1 : 0;
+			}
+			else {
+				c[i] = 0.0;
+			}
+		}
+		target.setNonZeros(nnz);
 	}
 
 	@Override
@@ -105,205 +103,180 @@
 			throw new RuntimeException("Column index " + c + " not in DDC group.");
 
 		// get value
-		return _dict.getValue(getIndex(r, ix));
+		int code = getIndex(r);
+		// a code equal to the number of value tuples marks the implicit zero tuple
+		if(code != getNumValues()) {
+			return _dict.getValue(code * getNumCols() + ix);
+		}
+		else {
+			return 0.0;
+		}
 	}
 
 	@Override
 	public void countNonZerosPerRow(int[] rnnz, int rl, int ru) {
 		int ncol = getNumCols();
+		final int numVals = getNumValues();
 		for(int i = rl; i < ru; i++) {
 			int lnnz = 0;
-			for(int colIx = 0; colIx < ncol; colIx++)
-				lnnz += (_dict.getValue(getIndex(i, colIx)) != 0) ? 1 : 0;
+			int code = getIndex(i);
+			// a code equal to the number of value tuples marks the implicit zero tuple
+			if(code != numVals) {
+				for(int colIx = 0; colIx < ncol; colIx++)
+					lnnz += (_dict.getValue(code * ncol + colIx) != 0) ? 1 : 0;
+			}
 			rnnz[i - rl] += lnnz;
 		}
 	}
 
 	@Override
-	protected void computeSum(MatrixBlock result, KahanFunction kplus) {
-		final int ncol = getNumCols();
-		final int numVals = getNumValues();
-
-		// if(numVals < MAX_TMP_VALS) {
-		// iterative over codes and count per code
-
-		final int[] counts = getCounts();
-		if(_dict instanceof QDictionary && !(kplus instanceof KahanPlusSq)) {
-			final QDictionary values = ((QDictionary) _dict);
-			long sum = 0;
-			for(int k = 0, valOff = 0; k < numVals; k++, valOff += ncol) {
-				int cntk = counts[k];
-				for(int j = 0; j < ncol; j++)
-					sum += values.getValueByte(valOff + j) * cntk;
-			}
-			result.quickSetValue(0, 0, result.quickGetValue(0, 0) + sum * values._scale);
-			result.quickSetValue(0, 1, 0);
-		}
-		else {
-			double[] values = getValues();
-			// post-scaling of pre-aggregate with distinct values
-			KahanObject kbuff = new KahanObject(result.quickGetValue(0, 0), result.quickGetValue(0, 1));
-			for(int k = 0, valOff = 0; k < numVals; k++, valOff += ncol) {
-				int cntk = counts[k];
-				for(int j = 0; j < ncol; j++)
-					kplus.execute3(kbuff, values[valOff + j], cntk);
-			}
-			result.quickSetValue(0, 0, kbuff._sum);
-			result.quickSetValue(0, 1, kbuff._correction);
-		}
+	protected void computeSum(double[] c, KahanFunction kplus) {
+		c[0] += _dict.sum(getCounts(), _colIndexes.length, kplus);
 	}
 
-	protected void computeColSums(MatrixBlock result, KahanFunction kplus) {
-		int nrow = getNumRows();
-		int ncol = getNumCols();
-		double[] values = _dict.getValues();
-
-		KahanObject[] kbuff = new KahanObject[getNumCols()];
-		for(int j = 0; j < ncol; j++)
-			kbuff[j] = new KahanObject(result.quickGetValue(0, _colIndexes[j]),
-				result.quickGetValue(1, _colIndexes[j]));
-
-		for(int i = 0; i < nrow; i++) {
-			int rowIndex = getIndex(i);
-			for(int j = 0; j < ncol; j++)
-				kplus.execute2(kbuff[j], values[rowIndex + j]);
-		}
-
-		for(int j = 0; j < ncol; j++) {
-			result.quickSetValue(0, _colIndexes[j], kbuff[j]._sum);
-			result.quickSetValue(1, _colIndexes[j], kbuff[j]._correction);
-		}
-	}
-
-	// protected void computeRowSums(MatrixBlock result, KahanFunction kplus, int rl, int ru) {
-	// int ncol = getNumCols();
-	// KahanObject kbuff = new KahanObject(0, 0);
-	// double[] values = getValues();
-	// for(int i = rl; i < ru; i++) {
-	// kbuff.set(result.quickGetValue(i, 0), result.quickGetValue(i, 1));
-	// int rowIndex = getIndex(i);
-	// for(int j = 0; j < ncol; j++)
-	// kplus.execute2(kbuff, values[rowIndex + j]);
-	// result.quickSetValue(i, 0, kbuff._sum);
-	// result.quickSetValue(i, 1, kbuff._correction);
-	// }
-	// }
-
 	@Override
-	protected void computeRowSums(MatrixBlock result, KahanFunction kplus, int rl, int ru) {
-		// note: due to corrections the output might be a large dense block
-		DenseBlock c = result.getDenseBlock();
+	protected void computeColSums(double[] c, KahanFunction kplus) {
+		_dict.colSum(c, getCounts(), _colIndexes, kplus);
+	}
 
-		if(_dict instanceof QDictionary && !(kplus instanceof KahanPlusSq)) {
-			final QDictionary qDict = ((QDictionary) _dict);
-			if(_colIndexes.length == 1) {
-				byte[] vals = qDict._values;
-				for(int i = rl; i < ru; i++) {
-					double[] cvals = c.values(i);
-					int cix = c.pos(i);
-					cvals[cix] = cvals[cix] + vals[getIndex(i)] * qDict._scale;
-				}
-			}
-			else {
-				short[] vals = qDict.sumAllRowsToShort(_colIndexes.length);
-				for(int i = rl; i < ru; i++) {
-					double[] cvals = c.values(i);
-					int cix = c.pos(i);
-					cvals[cix] = cvals[cix] + vals[getIndex(i)] * qDict._scale;
-				}
-			}
-		}
-		else {
-			KahanObject kbuff = new KahanObject(0, 0);
-			KahanPlus kplus2 = KahanPlus.getKahanPlusFnObject();
-			// pre-aggregate nnz per value tuple
-			double[] vals = _dict.sumAllRowsToDouble(kplus, kbuff, _colIndexes.length, false);
+	@Override
+	protected void computeRowSums(double[] c, KahanFunction kplus, int rl, int ru) {
+		final int numVals = getNumValues();
+		KahanObject kbuff = new KahanObject(0, 0);
+		KahanPlus kplus2 = KahanPlus.getKahanPlusFnObject();
+		// pre-aggregate the sum of each distinct value tuple
+		double[] vals = _dict.sumAllRowsToDouble(kplus, kbuff, _colIndexes.length);
 
-			// scan data and add to result (use kahan plus not general KahanFunction
-			// for correctness in case of sqk+)
-			for(int i = rl; i < ru; i++) {
-				double[] cvals = c.values(i);
-				int cix = c.pos(i);
-				kbuff.set(cvals[cix], cvals[cix + 1]);
-				kplus2.execute2(kbuff, vals[getIndex(i)]);
-				cvals[cix] = kbuff._sum;
-				cvals[cix + 1] = kbuff._correction;
+		for(int rix = rl; rix < ru; rix++) {
+			int index = getIndex(rix);
+			if(index != numVals) {
+				setandExecute(c, kbuff, kplus2, vals[index], rix * 2);
 			}
-
 		}
 	}
 
-	protected void computeRowMxx(MatrixBlock result, Builtin builtin, int rl, int ru) {
-		double[] c = result.getDenseBlockValues();
+	@Override
+	protected void computeRowMxx(double[] c, Builtin builtin, int rl, int ru) {
+		final int numVals = getNumValues();
 		int ncol = getNumCols();
 		double[] dictionary = getValues();
 
 		for(int i = rl; i < ru; i++) {
 			int rowIndex = getIndex(i);
-			for(int j = 0; j < ncol; j++)
-				c[i] = builtin.execute(c[i], dictionary[rowIndex + j]);
+			if(rowIndex != numVals) {
+				for(int j = 0; j < ncol; j++)
+					c[i] = builtin.execute(c[i], dictionary[rowIndex * ncol + j]);
+			}
+			else {
+				c[i] = builtin.execute(c[i], 0.0);
+			}
 		}
 	}
 
 	protected final void postScaling(double[] vals, double[] c) {
 		final int ncol = getNumCols();
 		final int numVals = getNumValues();
-		double[] values = getValues();
 
-		for(int k = 0, valOff = 0; k < numVals; k++, valOff += ncol) {
-			double aval = vals[k];
+		if(_dict instanceof QDictionary) {
+			QDictionary d = (QDictionary) _dict;
+			byte[] values = d.getValuesByte();
+			for(int k = 0, valOff = 0; k < numVals; k++, valOff += ncol) {
+				double aval = vals[k];
+				// all codes in vals map to stored tuples; the zero marker never reaches postScaling
+				for(int j = 0; j < ncol; j++) {
+					int colIx = _colIndexes[j];
+					c[colIx] += aval * values[valOff + j];
+				}
+			}
 			for(int j = 0; j < ncol; j++) {
 				int colIx = _colIndexes[j];
-				c[colIx] += aval * values[valOff + j];
+				c[colIx] = c[colIx] * d._scale;
+			}
+		}
+		else {
+			double[] values = getValues();
+			for(int k = 0, valOff = 0; k < numVals; k++, valOff += ncol) {
+				double aval = vals[k];
+				for(int j = 0; j < ncol; j++) {
+					int colIx = _colIndexes[j];
+					c[colIx] += aval * values[valOff + j];
+				}
 			}
 		}
 	}
 
-	/**
-	 * Generic get index in dictionary for value at row position.
-	 * 
-	 * @param r row position to get dictionary index for.
-	 * @return The dictionary index
-	 */
-	protected abstract int getIndex(int r);
+	@Override
+	public int[] getCounts(int[] counts) {
+		return getCounts(0, _numRows, counts);
+	}
 
-	/**
-	 * Generic get index in dictionary for value at row, col position. If used consider changing to getIndex and
-	 * precalculate offset to row
-	 * 
-	 * @param r     The row to find
-	 * @param colIx the col index to find
-	 * @return the index in the dictionary containing the specified value
-	 */
-	protected abstract int getIndex(int r, int colIx);
+	@Override
+	public int[] getCounts(int rl, int ru, int[] counts) {
+		for(int i = rl; i < ru; i++) {
+			int index = getIndex(i);
+			counts[index]++;
+		}
+		return counts;
+	}
 
-	/**
-	 * Generic get value for byte-length-agnostic access to first column.
-	 * 
-	 * @param r global row index
-	 * @return value
-	 */
-	protected abstract double getData(int r, double[] dictionary);
+	@Override
+	public void rightMultByVector(MatrixBlock vector, double[] c, int rl, int ru) {
+		double[] b = ColGroupConverter.getDenseVector(vector);
+		final int numCols = getNumCols();
+		final int numVals = getNumValues();
 
-	/**
-	 * Generic get value for byte-length-agnostic access.
-	 * 
-	 * @param r          global row index
-	 * @param colIx      local column index
-	 * @param dictionary The values contained in the column groups dictionary
-	 * @return value
-	 */
-	protected abstract double getData(int r, int colIx, double[] dictionary);
+		// prepare reduced rhs w/ relevant values
+		double[] sb = new double[numCols];
+		for(int j = 0; j < numCols; j++) {
+			sb[j] = b[_colIndexes[j]];
+		}
 
-	/**
-	 * Generic set value for byte-length-agnostic write of encoded value.
-	 * 
-	 * @param r    global row index
-	 * @param code encoded value
-	 */
-	protected abstract void setData(int r, int code);
+		// pre-aggregate all distinct values
+		double[] vals = preaggValues(numVals, sb);
 
-	protected abstract int getCode(int r);
+		// iterate over codes and add to output
+		for(int i = rl; i < ru; i++) {
+			int index = getIndex(i);
+			if(index != numVals) { // the zero marker tuple contributes nothing to the product, so skip it
+				c[i] += vals[index];
+			}
+		}
+	}
+
+	@Override
+	public void leftMultByRowVector(MatrixBlock vector, MatrixBlock result) {
+		double[] a = ColGroupConverter.getDenseVector(vector);
+		double[] c = result.getDenseBlockValues();
+		final int numVals = getNumValues();
+
+		if(8 * numVals < _numRows) {
+			// iterate over codes and pre-aggregate inputs per code
+			// temporary array also avoids false sharing in multi-threaded environments
+			double[] vals = allocDVector(numVals, true);
+			for(int i = 0; i < _numRows; i++) {
+				int index = getIndex(i);
+				if(index != numVals) { // the zero marker tuple contributes nothing, so skip it
+					vals[index] += a[i];
+				}
+			}
+			postScaling(vals, c);
+		}
+		else {
+			// iterate over codes, compute all, and add to the result
+			double[] values = getValues();
+			for(int i = 0; i < _numRows; i++) {
+				double aval = a[i];
+				if(aval != 0)
+					for(int j = 0, valOff = getIndex(i) * _colIndexes.length; j < _colIndexes.length; j++)
+						c[_colIndexes[j]] += aval * values[valOff + j];
+			}
+		}
+	}
 
 	@Override
 	public Iterator<IJV> getIterator(int rl, int ru, boolean inclZeros, boolean rowMajor) {
@@ -365,22 +338,65 @@
 			// do nothing
 		}
 
-		@Override
 		public void next(double[] buff, int rowIx, int segIx, boolean last) {
 			// copy entire value tuple to output row
 			final int clen = getNumCols();
-			final int off = getCode(rowIx) * clen;
+			final int off = getIndex(rowIx) * clen;
 			final double[] values = getValues();
 			for(int j = 0; j < clen; j++)
 				buff[_colIndexes[j]] = values[off + j];
 		}
 	}
 
-	@Override
 	public String toString() {
 		StringBuilder sb = new StringBuilder();
 		sb.append(super.toString());
 		return sb.toString();
 	}
 
+	/**
+	 * Generic get index in dictionary for value at row position.
+	 * 
+	 * @param r row position to get dictionary index for.
+	 * @return The dictionary index
+	 */
+	protected abstract int getIndex(int r);
+
+	/**
+	 * Generic get index in dictionary for value at row, col position. If used frequently, consider changing to
+	 * getIndex(r) and precalculating the row offset.
+	 * 
+	 * @param r     The row to find
+	 * @param colIx the col index to find
+	 * @return the index in the dictionary containing the specified value
+	 */
+	protected abstract int getIndex(int r, int colIx);
+
+	/**
+	 * Generic get value for byte-length-agnostic access to first column.
+	 * 
+	 * @param r      Global row index
+	 * @param values The values contained in the column groups dictionary
+	 * @return value
+	 */
+	protected abstract double getData(int r, double[] values);
+
+	/**
+	 * Generic get value for byte-length-agnostic access.
+	 * 
+	 * @param r      Global row index
+	 * @param colIx  Local column index
+	 * @param values The values contained in the column groups dictionary
+	 * @return value
+	 */
+	protected abstract double getData(int r, int colIx, double[] values);
+
+	/**
+	 * Generic set value for byte-length-agnostic write of encoded value.
+	 * 
+	 * @param r    global row index
+	 * @param code encoded value
+	 */
+	protected abstract void setData(int r, int code);
+
 }
diff --git a/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupDDC1.java b/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupDDC1.java
index e4c579f..3e14601 100644
--- a/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupDDC1.java
+++ b/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupDDC1.java
@@ -26,8 +26,7 @@
 import java.util.HashMap;
 
 import org.apache.sysds.runtime.compress.CompressionSettings;
-import org.apache.sysds.runtime.compress.utils.AbstractBitmap;
-import org.apache.sysds.runtime.matrix.data.MatrixBlock;
+import org.apache.sysds.runtime.compress.utils.ABitmap;
 import org.apache.sysds.runtime.matrix.operators.ScalarOperator;
 
 /**
@@ -43,22 +42,23 @@
 		super();
 	}
 
-	protected ColGroupDDC1(int[] colIndices, int numRows, AbstractBitmap ubm, CompressionSettings cs) {
+	protected ColGroupDDC1(int[] colIndices, int numRows, ABitmap ubm, CompressionSettings cs) {
 		super(colIndices, numRows, ubm, cs);
 
 		int numVals = ubm.getNumValues();
 		int numCols = ubm.getNumColumns();
 
 		_data = new byte[numRows];
-
 		// materialize zero values, if necessary
 		if(ubm.getNumOffsets() < (long) numRows * numCols) {
 			int zeroIx = containsAllZeroValue();
 			if(zeroIx < 0) {
+				// Use the dictionary length (numVals) as the zero tuple index. This makes lookups slightly
+				// slower, but removes the need to allocate a new Dictionary.
 				zeroIx = numVals;
-				_dict = IDictionary.materializeZeroValue(_dict, numCols);
 			}
 			Arrays.fill(_data, (byte) zeroIx);
+			_zeros = true;
 		}
 
 		// iterate over values and write dictionary codes
@@ -70,10 +70,10 @@
 		}
 	}
 
-	// Internal Constructor, to be used when copying a DDC Colgroup, and for scalar operations
-	protected ColGroupDDC1(int[] colIndices, int numRows, double[] values, byte[] data) {
-		super(colIndices, numRows, values);
+	protected ColGroupDDC1(int[] colIndices, int numRows, ADictionary dict, byte[] data, boolean zeros) {
+		super(colIndices, numRows, dict);
 		_data = data;
+		_zeros = zeros;
 	}
 
 	@Override
@@ -84,8 +84,6 @@
 	/**
 	 * Getter method to get the data contained in the DDC ColGroup.
 	 * 
-	 * Not safe if modifications is made to the byte list.
-	 * 
 	 * @return The contained data
 	 */
 	public byte[] getData() {
@@ -103,13 +101,15 @@
 	}
 
 	@Override
-	protected double getData(int r, double[] dictionary) {
-		return dictionary[_data[r] & 0xFF];
+	protected double getData(int r, double[] values) {
+		int index = (_data[r] & 0xFF);
+		return (index == values.length) ? 0.0 : values[index];
 	}
 
 	@Override
 	protected double getData(int r, int colIx, double[] values) {
-		return values[(_data[r] & 0xFF) * getNumCols() + colIx];
+		int index = (_data[r] & 0xFF) * getNumCols() + colIx;
+		return (index >= values.length) ? 0.0 : values[index]; // indexes past the dictionary map to the zero tuple
 	}
 
 	@Override
@@ -117,11 +117,6 @@
 		_data[r] = (byte) code;
 	}
 
-	@Override
-	protected int getCode(int r) {
-		return(_data[r] & 0xFF);
-	}
-
 	public void recodeData(HashMap<Double, Integer> map) {
 		// prepare translation table
 		final int numVals = getNumValues();
@@ -139,7 +134,6 @@
 	public void write(DataOutput out) throws IOException {
 		super.write(out);
 		// write data
-		// out.writeInt(_numRows);
 		for(int i = 0; i < _numRows; i++)
 			out.writeByte(_data[i]);
 	}
@@ -156,9 +150,7 @@
 	@Override
 	public long getExactSizeOnDisk() {
 		long ret = super.getExactSizeOnDisk();
-		// data
 		ret += _data.length;
-
 		return ret;
 	}
 
@@ -168,199 +160,14 @@
 	}
 
 	@Override
-	public void decompressToBlock(MatrixBlock target, int rl, int ru) {
-		int ncol = getNumCols();
-		double[] values = getValues();
-		for(int i = rl; i < ru; i++)
-			for(int j = 0; j < ncol; j++)
-				target.appendValue(i, _colIndexes[j], values[(_data[i] & 0xFF) * ncol + j]);
-		// note: append ok because final sort per row
-	}
-
-	@Override
-	public void decompressToBlock(MatrixBlock target, int colpos) {
-		int nrow = getNumRows();
-		int ncol = getNumCols();
-		double[] c = target.getDenseBlockValues();
-		double[] values = getValues();
-		int nnz = 0;
-		for(int i = 0; i < nrow; i++)
-			nnz += ((c[i] = values[(_data[i] & 0xFF) * ncol + colpos]) != 0) ? 1 : 0;
-		target.setNonZeros(nnz);
-	}
-
-	@Override
-	public int[] getCounts(int[] counts) {
-		return getCounts(0, getNumRows(), counts);
-	}
-
-	@Override
-	public int[] getCounts(int rl, int ru, int[] counts) {
-		final int numVals = getNumValues();
-		Arrays.fill(counts, 0, numVals, 0);
-		for(int i = rl; i < ru; i++)
-			counts[_data[i] & 0xFF]++;
-		return counts;
-	}
-
-	@Override
-	public void countNonZerosPerRow(int[] rnnz, int rl, int ru) {
-		final int ncol = getNumCols();
-		final int numVals = getNumValues();
-		final double[] values = getValues();
-
-		// pre-aggregate nnz per value tuple
-		int[] counts = new int[numVals];
-		for(int k = 0, valOff = 0; k < numVals; k++, valOff += ncol)
-			for(int j = 0; j < ncol; j++)
-				counts[k] += (values[valOff + j] != 0) ? 1 : 0;
-
-		// scan data and add counts to output rows
-		for(int i = rl; i < ru; i++)
-			rnnz[i - rl] += counts[_data[i] & 0xFF];
-	}
-
-	@Override
-	public void rightMultByVector(MatrixBlock vector, MatrixBlock result, int rl, int ru) {
-		double[] b = ColGroupConverter.getDenseVector(vector);
-		double[] c = result.getDenseBlockValues();
-		final int numCols = getNumCols();
-		final int numVals = getNumValues();
-
-		// prepare reduced rhs w/ relevant values
-		double[] sb = new double[numCols];
-		for(int j = 0; j < numCols; j++) {
-			sb[j] = b[_colIndexes[j]];
-		}
-
-		// pre-aggregate all distinct values (guaranteed <=255)
-		double[] vals = preaggValues(numVals, sb);
-
-		// iterative over codes and add to output
-		for(int i = rl; i < ru; i++) {
-			c[i] += vals[_data[i] & 0xFF];
-		}
-	}
-
-	public static void rightMultByVector(ColGroupDDC1[] grps, MatrixBlock vector, MatrixBlock result, int rl, int ru) {
-		double[] b = ColGroupConverter.getDenseVector(vector);
-		double[] c = result.getDenseBlockValues();
-
-		// prepare distinct values once
-		double[][] vals = new double[grps.length][];
-		for(int i = 0; i < grps.length; i++) {
-			// prepare reduced rhs w/ relevant values
-			double[] sb = new double[grps[i].getNumCols()];
-			for(int j = 0; j < sb.length; j++) {
-				sb[j] = b[grps[i]._colIndexes[j]];
-			}
-			// pre-aggregate all distinct values (guaranteed <=255)
-			vals[i] = grps[i].preaggValues(grps[i].getNumValues(), sb, true);
-		}
-
-		// cache-conscious matrix-vector multiplication
-		// iterative over codes of all groups and add to output
-		int blksz = 2048; // 16KB
-		for(int bi = rl; bi < ru; bi += blksz)
-			for(int j = 0; j < grps.length; j++)
-				for(int i = bi; i < Math.min(bi + blksz, ru); i++)
-					c[i] += vals[j][grps[j]._data[i] & 0xFF];
-	}
-
-	@Override
-	public void leftMultByRowVector(MatrixBlock vector, MatrixBlock result) {
-		double[] a = ColGroupConverter.getDenseVector(vector);
-		double[] c = result.getDenseBlockValues();
-		// final int nrow = getNumRows();
-		final int numVals = getNumValues();
-
-		// iterative over codes and pre-aggregate inputs per code (guaranteed <=255)
-		// temporary array also avoids false sharing in multi-threaded environments
-		double[] vals = allocDVector(numVals, true);
-		for(int i = 0; i < _numRows; i++) {
-			int index = getIndex(i);
-			vals[index] += a[i];
-		}
-
-		// post-scaling of pre-aggregate with distinct values
-		postScaling(vals, c);
-	}
-
-	// @Override
-	// public void leftMultByRowVector(ColGroupDDC a, MatrixBlock result) {
-	// 	double[] c = result.getDenseBlockValues();
-	// 	final int nrow = getNumRows();
-	// 	final int numVals = getNumValues();
-	// 	// final double[] dictionary = getValues();
-
-	// 	// iterative over codes and pre-aggregate inputs per code (guaranteed <=255)
-	// 	// temporary array also avoids false sharing in multi-threaded environments
-	// 	double[] vals = allocDVector(numVals, true);
-	// 	double[] aDict = a.getValues();
-	// 	for(int i = 0; i < nrow; i++) {
-	// 		int rowIdA = a.getIndex(i);
-	// 		int rowIdThis = getIndex(i);
-	// 		vals[rowIdThis] += aDict[rowIdA];
-	// 	}
-	// 	// vals[_data[i] & 0xFF] += a.getData(i, dictionary);
-
-	// 	// post-scaling of pre-aggregate with distinct values
-	// 	postScaling(vals, c);
-	// }
-
-
-	// public static void computeRowSums(ColGroupDDC1[] grps, MatrixBlock result, KahanFunction kplus, int rl, int ru) {
-	// 	// note: due to corrections the output might be a large dense block
-	// 	DenseBlock c = result.getDenseBlock();
-
-	// 	if(grps[0]._dict instanceof QDictionary && !(kplus instanceof KahanPlusSq)) {
-
-
-	// 		return; // early return if needed.
-	// 	}
-
-	// 	KahanObject kbuff = new KahanObject(0, 0);
-	// 	KahanPlus kplus2 = KahanPlus.getKahanPlusFnObject();
-
-	// 	// prepare distinct values once
-	// 	double[][] vals = new double[grps.length][];
-	// 	for(int i = 0; i < grps.length; i++) {
-	// 		// pre-aggregate all distinct values (guaranteed <=255)
-	// 		vals[i] = grps[i].sumAllValues(kplus, kbuff);
-	// 	}
-
-	// 	// cache-conscious row sums operations
-	// 	// iterative over codes of all groups and add to output
-	// 	// (use kahan plus not general KahanFunction for correctness in case of sqk+)
-	// 	int blksz = 1024; // 16KB
-	// 	double[] tmpAgg = new double[blksz];
-	// 	for(int bi = rl; bi < ru; bi += blksz) {
-	// 		Arrays.fill(tmpAgg, 0);
-	// 		// aggregate all groups
-	// 		for(int j = 0; j < grps.length; j++) {
-	// 			double[] valsj = vals[j];
-	// 			byte[] dataj = grps[j]._data;
-	// 			for(int i = bi; i < Math.min(bi + blksz, ru); i++)
-	// 				tmpAgg[i - bi] += valsj[dataj[i] & 0xFF];
-	// 		}
-	// 		// add partial results of all ddc groups
-	// 		for(int i = bi; i < Math.min(bi + blksz, ru); i++) {
-	// 			double[] cvals = c.values(i);
-	// 			int cix = c.pos(i);
-	// 			kbuff.set(cvals[cix], cvals[cix + 1]);
-	// 			kplus2.execute2(kbuff, tmpAgg[i - bi]);
-	// 			cvals[cix] = kbuff._sum;
-	// 			cvals[cix + 1] = kbuff._correction;
-	// 		}
-	// 	}
-
-	// }
-
-	@Override
 	public ColGroup scalarOperation(ScalarOperator op) {
-		// fast path: sparse-safe and -unsafe operations
-		// as zero are represented, it is sufficient to simply apply the scalar op
-		return new ColGroupDDC1(_colIndexes, _numRows, applyScalarOp(op), _data);
+		double val0 = op.executeScalar(0);
+		if(op.sparseSafe || val0 == 0 || !_zeros) {
+			return new ColGroupDDC1(_colIndexes, _numRows, applyScalarOp(op), _data, _zeros);
+		}
+		else {
+			return new ColGroupDDC1(_colIndexes, _numRows, applyScalarOp(op, val0, _colIndexes.length), _data, false);
+		}
 	}
 
 	@Override
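
The rewritten scalarOperation distinguishes sparse-safe operators (op(0) == 0, so the zero marker stays valid) from sparse-unsafe ones, where applyScalarOp(op, val0, numCols) must turn the marker code into a real tuple holding op(0). A standalone sketch of that dictionary extension, with a hypothetical helper over plain arrays:

    import java.util.function.DoubleUnaryOperator;

    public class SparseUnsafeScalarDemo {
    	// Apply op to every dictionary value and append one tuple of op(0) values,
    	// so the former zero-marker code (numVals) now resolves to a stored tuple.
    	static double[] applyScalarOp(double[] dict, int ncol, DoubleUnaryOperator op) {
    		double[] ret = new double[dict.length + ncol];
    		for(int i = 0; i < dict.length; i++)
    			ret[i] = op.applyAsDouble(dict[i]);
    		double val0 = op.applyAsDouble(0);
    		for(int j = 0; j < ncol; j++)
    			ret[dict.length + j] = val0;
    		return ret;
    	}

    	public static void main(String[] args) {
    		double[] dict = {3, 4}; // one two-column tuple
    		double[] ext = applyScalarOp(dict, 2, v -> v + 1); // sparse-unsafe: 0 + 1 != 0
    		System.out.println(java.util.Arrays.toString(ext)); // -> [4.0, 5.0, 1.0, 1.0]
    	}
    }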
diff --git a/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupDDC2.java b/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupDDC2.java
index b3d9fc7..6cdb7d4 100644
--- a/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupDDC2.java
+++ b/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupDDC2.java
@@ -25,8 +25,7 @@
 import java.util.Arrays;
 
 import org.apache.sysds.runtime.compress.CompressionSettings;
-import org.apache.sysds.runtime.compress.utils.AbstractBitmap;
-import org.apache.sysds.runtime.matrix.data.MatrixBlock;
+import org.apache.sysds.runtime.compress.utils.ABitmap;
 import org.apache.sysds.runtime.matrix.operators.ScalarOperator;
 
 /**
@@ -42,7 +41,7 @@
 		super();
 	}
 
-	protected ColGroupDDC2(int[] colIndices, int numRows, AbstractBitmap ubm, CompressionSettings cs) {
+	protected ColGroupDDC2(int[] colIndices, int numRows, ABitmap ubm, CompressionSettings cs) {
 		super(colIndices, numRows, ubm, cs);
 		_data = new char[numRows];
 
@@ -54,9 +53,9 @@
 			int zeroIx = containsAllZeroValue();
 			if(zeroIx < 0) {
 				zeroIx = numVals;
-				_dict = IDictionary.materializeZeroValue(_dict, numCols);
 			}
 			Arrays.fill(_data, (char) zeroIx);
+			_zeros = true;
 		}
 
 		// iterate over values and write dictionary codes
@@ -68,10 +67,10 @@
 		}
 	}
 
-	// Internal Constructor, to be used when copying a DDC Colgroup, and for scalar operations
-	protected ColGroupDDC2(int[] colIndices, int numRows, double[] values, char[] data) {
-		super(colIndices, numRows, values);
+	protected ColGroupDDC2(int[] colIndices, int numRows, ADictionary dict, char[] data, boolean zeros) {
+		super(colIndices, numRows, dict);
 		_data = data;
+		_zeros = zeros;
 	}
 
 	@Override
@@ -82,23 +81,20 @@
 	/**
 	 * Getter method to get the data contained in the DDC ColGroup.
 	 * 
-	 * Not safe if modifications is made to the byte list.
-	 * 
 	 * @return The contained data
 	 */
-
 	public char[] getData() {
 		return _data;
 	}
 
 	@Override
-	protected int getIndex(int r){
+	protected int getIndex(int r) {
 		return _data[r];
 	}
-	
+
 	@Override
-	protected int getIndex(int r, int colIx){
-		return _data[r]  * getNumCols() + colIx;
+	protected int getIndex(int r, int colIx) {
+		return _data[r] * getNumCols() + colIx;
 	}
 
 	@Override
@@ -107,7 +103,7 @@
 	}
 
 	@Override
-	protected double getData(int r, int colIx,  double[] dictionary) {
+	protected double getData(int r, int colIx, double[] dictionary) {
 		return _dict.getValue(_data[r] * getNumCols() + colIx);
 	}
 
@@ -117,11 +113,6 @@
 	}
 
 	@Override
-	protected int getCode(int r) {
-		return _data[r];
-	}
-
-	@Override
 	public void write(DataOutput out) throws IOException {
 		super.write(out);
 		// write data
@@ -155,170 +146,14 @@
 	}
 
 	@Override
-	public void decompressToBlock(MatrixBlock target, int rl, int ru) {
-		int ncol = getNumCols();
-		double[] values = getValues();
-		for(int i = rl; i < ru; i++)
-			for(int j = 0; j < ncol; j++)
-				target.appendValue(i, _colIndexes[j], values[_data[i] * ncol + j]);
-		// note: append ok because final sort per row
-	}
-
-	@Override
-	public void decompressToBlock(MatrixBlock target, int colpos) {
-		int nrow = getNumRows();
-		int ncol = getNumCols();
-		double[] c = target.getDenseBlockValues();
-		double[] values = getValues();
-		int nnz = 0;
-		for(int i = 0; i < nrow; i++)
-			nnz += ((c[i] = values[_data[i] * ncol + colpos]) != 0) ? 1 : 0;
-		target.setNonZeros(nnz);
-	}
-
-	@Override
-	public int[] getCounts(int[] counts) {
-		return getCounts(0, getNumRows(), counts);
-	}
-
-	@Override
-	public int[] getCounts(int rl, int ru, int[] counts) {
-		final int numVals = getNumValues();
-		Arrays.fill(counts, 0, numVals, 0);
-		for(int i = rl; i < ru; i++)
-			counts[_data[i]]++;
-		return counts;
-	}
-
-	@Override
-	public void countNonZerosPerRow(int[] rnnz, int rl, int ru) {
-		final int ncol = getNumCols();
-		final int numVals = getNumValues();
-		final double[] values = getValues();
-
-		// pre-aggregate nnz per value tuple
-		int[] counts = new int[numVals];
-		for(int k = 0, valOff = 0; k < numVals; k++, valOff += ncol)
-			for(int j = 0; j < ncol; j++)
-				counts[k] += (values[valOff + j] != 0) ? 1 : 0;
-
-		// scan data and add counts to output rows
-		for(int i = rl; i < ru; i++)
-			rnnz[i - rl] += counts[_data[i]];
-	}
-
-	@Override
-	public void rightMultByVector(MatrixBlock vector, MatrixBlock result, int rl, int ru) {
-		double[] b = ColGroupConverter.getDenseVector(vector);
-		double[] c = result.getDenseBlockValues();
-		final int numCols = getNumCols();
-		final int numVals = getNumValues();
-
-		// prepare reduced rhs w/ relevant values
-		double[] sb = new double[numCols];
-		for(int j = 0; j < numCols; j++) {
-			sb[j] = b[_colIndexes[j]];
-		}
-
-		// pre-aggregate all distinct values
-		double[] vals = preaggValues(numVals, sb);
-
-		// iterative over codes and add to output
-		for(int i = rl; i < ru; i++)
-			c[i] += vals[_data[i]];
-	}
-
-	@Override
-	public void leftMultByRowVector(MatrixBlock vector, MatrixBlock result) {
-		double[] a = ColGroupConverter.getDenseVector(vector);
-		double[] c = result.getDenseBlockValues();
-		final int nrow = getNumRows();
-		final int ncol = getNumCols();
-		final int numVals = getNumValues();
-
-		if(8 * numVals < getNumRows()) {
-			// iterative over codes and pre-aggregate inputs per code
-			// temporary array also avoids false sharing in multi-threaded environments
-			double[] vals = allocDVector(numVals, true);
-			for(int i = 0; i < nrow; i++) {
-				vals[_data[i]] += a[i];
-			}
-
-			// post-scaling of pre-aggregate with distinct values
-			postScaling(vals, c);
-		}
-		else // general case
-		{
-			// iterate over codes, compute all, and add to the result
-			double[] values = getValues();
-			for(int i = 0; i < nrow; i++) {
-				double aval = a[i];
-				if(aval != 0)
-					for(int j = 0, valOff = _data[i] * ncol; j < ncol; j++)
-						c[_colIndexes[j]] += aval * values[valOff + j];
-			}
-		}
-	}
-
-	// @Override
-	// public void leftMultByRowVector(ColGroupDDC a, MatrixBlock result) {
-	// 	double[] c = result.getDenseBlockValues();
-	// 	final int nrow = getNumRows();
-	// 	final int ncol = getNumCols();
-	// 	final int numVals = getNumValues();
-	// 	final double[] dictionary = getValues();
-
-	// 	if(8 * numVals < getNumRows()) {
-	// 		// iterative over codes and pre-aggregate inputs per code
-	// 		// temporary array also avoids false sharing in multi-threaded environments
-	// 		double[] vals = allocDVector(numVals, true);
-	// 		for(int i = 0; i < nrow; i++) {
-	// 			vals[_data[i]] += a.getData(i, dictionary);
-	// 		}
-
-	// 		// post-scaling of pre-aggregate with distinct values
-	// 		postScaling(vals, c);
-	// 	}
-	// 	else // general case
-	// 	{
-	// 		// iterate over codes, compute all, and add to the result
-	// 		double[] values = getValues();
-	// 		for(int i = 0; i < nrow; i++) {
-	// 			double aval = a.getData(i, 0, dictionary);
-	// 			if(aval != 0)
-	// 				for(int j = 0, valOff = _data[i] * ncol; j < ncol; j++)
-	// 					c[_colIndexes[j]] += aval * values[valOff + j];
-	// 		}
-	// 	}
-	// }
-
-	// @Override
-	// protected void computeRowSums(MatrixBlock result, KahanFunction kplus, int rl, int ru) {
-	// 	// note: due to corrections the output might be a large dense block
-	// 	DenseBlock c = result.getDenseBlock();
-	// 	KahanObject kbuff = new KahanObject(0, 0);
-	// 	KahanPlus kplus2 = KahanPlus.getKahanPlusFnObject();
-
-	// 	// pre-aggregate nnz per value tuple
-	// 	double[] vals = sumAllValues(kplus, kbuff, false);
-
-	// 	// scan data and add to result (use kahan plus not general KahanFunction
-	// 	// for correctness in case of sqk+)
-	// 	for(int i = rl; i < ru; i++) {
-	// 		double[] cvals = c.values(i);
-	// 		int cix = c.pos(i);
-	// 		kbuff.set(cvals[cix], cvals[cix + 1]);
-	// 		kplus2.execute2(kbuff, vals[_data[i]]);
-	// 		cvals[cix] = kbuff._sum;
-	// 		cvals[cix + 1] = kbuff._correction;
-	// 	}
-	// }
-
-	@Override
 	public ColGroup scalarOperation(ScalarOperator op) {
-		// fast path: sparse-safe and -unsafe operations
-		// as zero are represented, it is sufficient to simply apply the scalar op
-		return new ColGroupDDC2(_colIndexes, _numRows, applyScalarOp(op), _data);
+		double val0 = op.executeScalar(0);
+		if(op.sparseSafe || val0 == 0 || !_zeros) {
+			return new ColGroupDDC2(_colIndexes, _numRows, applyScalarOp(op), _data, _zeros);
+		}
+		else {
+			return new ColGroupDDC2(_colIndexes, _numRows, applyScalarOp(op, val0, _colIndexes.length), _data, false);
+		}
 	}
 
 	@Override
diff --git a/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupFactory.java b/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupFactory.java
index 3472c1d..8b55c9c 100644
--- a/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupFactory.java
+++ b/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupFactory.java
@@ -37,7 +37,7 @@
 import org.apache.sysds.runtime.compress.estim.CompressedSizeEstimator;
 import org.apache.sysds.runtime.compress.estim.CompressedSizeEstimatorExact;
 import org.apache.sysds.runtime.compress.estim.CompressedSizeInfoColGroup;
-import org.apache.sysds.runtime.compress.utils.AbstractBitmap;
+import org.apache.sysds.runtime.compress.utils.ABitmap;
 import org.apache.sysds.runtime.matrix.data.MatrixBlock;
 import org.apache.sysds.runtime.util.CommonThreadPool;
 
@@ -54,7 +54,7 @@
 	 * @param compRatios   The previously computed Compression ratings of individual col indexes.
 	 * @param groups       The column groups to consider compressing together.
 	 * @param compSettings The compression settings to construct the compression based on.
-	 * @param k            The number of parallelism used.
+	 * @param k            The degree of parallelism used.
 	 * @return A Resulting array of ColGroups, containing the compressed information from the input matrix block.
 	 */
 	public static ColGroup[] compressColGroups(MatrixBlock in, HashMap<Integer, Double> compRatios, List<int[]> groups,
@@ -146,7 +146,7 @@
 		CompressedSizeInfoColGroup sizeInfo;
 		// The compression type is decided based on a full bitmap since it
 		// will be reused for the actual compression step.
-		AbstractBitmap ubm = null;
+		ABitmap ubm = null;
 		PriorityQueue<CompressedColumn> compRatioPQ = CompressedColumn.makePriorityQue(compRatios, colIndexes);
 
 		// Switching to exact estimator here, when doing the actual compression.
@@ -215,7 +215,7 @@
 	 * @param rawMatrixBlock The copy of the original input (maybe transposed) MatrixBlock
 	 * @return A Compressed ColGroup
 	 */
-	public static ColGroup compress(int[] colIndexes, int rlen, AbstractBitmap ubm, CompressionType compType,
+	public static ColGroup compress(int[] colIndexes, int rlen, ABitmap ubm, CompressionType compType,
 		CompressionSettings cs, MatrixBlock rawMatrixBlock) {
 		switch(compType) {
 			case DDC:
@@ -231,8 +231,6 @@
 				return new ColGroupOLE(colIndexes, rlen, ubm, cs);
 			case UNCOMPRESSED:
 				return new ColGroupUncompressed(colIndexes, rawMatrixBlock, cs);
-			// case QUAN:
-				// return new ColGroupQuan(colIndexes, rlen, ubm);
 			default:
 				throw new DMLCompressionException("Not implemented ColGroup Type compressed in factory.");
 		}
diff --git a/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupIO.java b/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupIO.java
index 03e78d7..eb6e1a4 100644
--- a/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupIO.java
+++ b/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupIO.java
@@ -41,21 +41,20 @@
 	/**
 	 * Read groups from a file. Note that the information about how many groups to read is already in the file.
 	 * 
-	 * @param in              The Data input object to read from.
-	 * @param _sharedDDC1Dict Boolean flag to specify if the DCC should share dictionary.
+	 * @param in The Data input object to read from.
 	 * @return Return a List containing the ColGroups from the DataInput.
 	 * @throws IOException Throws IO Exception if the input source fails to read.
 	 */
-	public static List<ColGroup> readGroups(DataInput in, boolean _sharedDDC1Dict) throws IOException {
+	public static List<ColGroup> readGroups(DataInput in) throws IOException {
 
 		// Read in how many colGroups there are
 		int nColGroups = in.readInt();
 		LOG.debug("reading " + nColGroups + " ColGroups");
 		// Allocate that amount into an ArrayList
 		List<ColGroup> _colGroups = new ArrayList<>(nColGroups);
-		// double[] sharedDict = null;
 
 		// Read each ColGroup one at a time.
 		for(int i = 0; i < nColGroups; i++) {
 			ColGroupType ctype = ColGroupType.values()[in.readByte()];
 			LOG.debug(ctype);
@@ -78,9 +77,6 @@
 				case DDC2:
 					grp = new ColGroupDDC2();
 					break;
-				// case QUAN8S:
-					// grp = new ColGroupQuan();
-					// break;
 				default:
 					throw new DMLRuntimeException("Unsupported ColGroup Type used:  " + ctype);
 			}
@@ -89,14 +85,6 @@
 			// and numCols evaluated in DDC1 because numCols not available yet
 			grp.readFields(in);
 
-			// use shared DDC1 dictionary if applicable
-			// if(_sharedDDC1Dict && grp.getNumCols() == 1 && grp instanceof ColGroupDDC1) {
-			// 	if(sharedDict == null)
-			// 		sharedDict = ((ColGroupValue) grp).getValues();
-			// 	else
-			// 		((ColGroupValue) grp).setValues(sharedDict);
-			// }
-
 			_colGroups.add(grp);
 		}
 
@@ -106,24 +94,17 @@
 	/**
 	 * Writes the ColGroups out to the DataOutput.
 	 * 
-	 * @param out             The DataOutput the ColGroups are written to
-	 * @param _sharedDDC1Dict Boolean flag specifying if the DDC share dictionaries.
-	 * @param _colGroups      List of the ColGroups to write to file.
+	 * @param out       The DataOutput the ColGroups are written to
+	 * @param colGroups List of the ColGroups to write to file.
 	 * @throws IOException Thrown if the output stream cannot be written.
 	 */
-	public static void writeGroups(DataOutput out, boolean _sharedDDC1Dict, List<ColGroup> _colGroups)
-		throws IOException {
-		// Write out how many ColGroups we save.
-		out.writeInt(_colGroups.size());
+	public static void writeGroups(DataOutput out, List<ColGroup> colGroups) throws IOException {
+		// Write out how many ColGroups to save.
+		out.writeInt(colGroups.size());
 
-		// boolean skipDict = false;
-		for(ColGroup grp : _colGroups) {
-			// TODO save DDC Dict sharing smarter.
-			// boolean shared = false;// (grp instanceof ColGroupDDC1 && _sharedDDC1Dict && grp.getNumCols() == 1);
+		for(ColGroup grp : colGroups) {
 			out.writeByte(grp.getColGroupType().ordinal());
-			// grp.write(out, skipDict & shared);
 			grp.write(out);
-			// skipDict |= shared;
 		}
 	}
 }
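
With the shared-dictionary flag gone, the stream layout is fully self-describing: an int group count, then per group one type byte followed by the group's own fields. A round-trip sketch against the new signatures (stream setup assumed for illustration):

	// Round-trip sketch: serialize a List<ColGroup> and read it back.
	// Assumes java.io.ByteArrayOutputStream/DataOutputStream and friends.
	ByteArrayOutputStream bos = new ByteArrayOutputStream();
	ColGroupIO.writeGroups(new DataOutputStream(bos), colGroups);
	DataInput in = new DataInputStream(new ByteArrayInputStream(bos.toByteArray()));
	List<ColGroup> restored = ColGroupIO.readGroups(in);
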
diff --git a/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupOLE.java b/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupOLE.java
index aa3d871..8dda7f7 100644
--- a/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupOLE.java
+++ b/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupOLE.java
@@ -25,16 +25,12 @@
 import java.util.Arrays;
 import java.util.Iterator;
 
-import org.apache.commons.logging.Log;
-import org.apache.commons.logging.LogFactory;
 import org.apache.sysds.runtime.compress.CompressionSettings;
-import org.apache.sysds.runtime.compress.utils.AbstractBitmap;
+import org.apache.sysds.runtime.compress.utils.ABitmap;
 import org.apache.sysds.runtime.compress.utils.LinearAlgebraUtils;
-import org.apache.sysds.runtime.data.DenseBlock;
 import org.apache.sysds.runtime.functionobjects.Builtin;
 import org.apache.sysds.runtime.functionobjects.KahanFunction;
 import org.apache.sysds.runtime.functionobjects.KahanPlus;
-import org.apache.sysds.runtime.functionobjects.KahanPlusSq;
 import org.apache.sysds.runtime.instructions.cp.KahanObject;
 import org.apache.sysds.runtime.matrix.data.MatrixBlock;
 import org.apache.sysds.runtime.matrix.operators.ScalarOperator;
@@ -46,8 +42,6 @@
 public class ColGroupOLE extends ColGroupOffset {
 	private static final long serialVersionUID = -9157676271360528008L;
 
-	private static final Log LOG = LogFactory.getLog(ColGroupOLE.class.getName());
-
 	protected int[] _skipList;
 
 	protected ColGroupOLE() {
@@ -62,7 +56,7 @@
 	 * @param ubm        Uncompressed bitmap representation of the block
 	 * @param cs         The Compression settings used for compression
 	 */
-	protected ColGroupOLE(int[] colIndices, int numRows, AbstractBitmap ubm, CompressionSettings cs) {
+	protected ColGroupOLE(int[] colIndices, int numRows, ABitmap ubm, CompressionSettings cs) {
 		super(colIndices, numRows, ubm, cs);
 
 		// compress the bitmaps
@@ -78,11 +72,11 @@
 		createCompressedBitmaps(numVals, totalLen, lbitmaps);
 
 		_skipList = null;
-		if(CREATE_SKIP_LIST && numRows > 2 * CompressionSettings.BITMAP_BLOCK_SZ) {
+		if(cs.skipList && numRows > 2 * CompressionSettings.BITMAP_BLOCK_SZ) {
 			_skipList = new int[numVals];
 			int blksz = CompressionSettings.BITMAP_BLOCK_SZ;
-			int rl = (getNumRows() / 2 / blksz) * blksz;
+			int rl = (_numRows / 2 / blksz) * blksz;
 			for(int k = 0; k < numVals; k++) {
 				int boff = _ptr[k];
 				int blen = len(k);
@@ -94,15 +88,11 @@
 			}
 		}
 
-		// debug output
-		double ucSize = MatrixBlock.estimateSizeDenseInMemory(numRows, colIndices.length);
-		if(estimateInMemorySize() > ucSize)
-			LOG.warn("OLE group larger than UC dense: " + estimateInMemorySize() + " " + ucSize);
 	}
 
-	protected ColGroupOLE(int[] colIndices, int numRows, boolean zeros, double[] values, char[] bitmaps,
+	protected ColGroupOLE(int[] colIndices, int numRows, boolean zeros, ADictionary dict, char[] bitmaps,
 		int[] bitmapOffs) {
-		super(colIndices, numRows, zeros, values);
+		super(colIndices, numRows, zeros, dict);
 		_data = bitmaps;
 		_ptr = bitmapOffs;
 	}
@@ -158,7 +148,6 @@
 			final int blksz = CompressionSettings.BITMAP_BLOCK_SZ;
 			final int numCols = getNumCols();
 			final int numVals = getNumValues();
-			final int n = getNumRows();
 			final double[] values = getValues();
 
 			// cache blocking config and position array
@@ -170,7 +159,7 @@
 				cix[j] = colixTargets[_colIndexes[j]];
 
 			// cache conscious append via horizontal scans
-			for(int bi = 0; bi < n; bi += blksz) {
+			for(int bi = 0; bi < _numRows; bi += blksz) {
 				for(int k = 0, off = 0; k < numVals; k++, off += numCols) {
 					int boff = _ptr[k];
 					int blen = len(k);
@@ -198,7 +187,6 @@
 		final int blksz = CompressionSettings.BITMAP_BLOCK_SZ;
 		final int numCols = getNumCols();
 		final int numVals = getNumValues();
-		final int n = getNumRows();
 		double[] c = target.getDenseBlockValues();
 		final double[] values = getValues();
 
@@ -207,8 +195,8 @@
 
 		// cache conscious append via horizontal scans
 		int nnz = 0;
-		for(int bi = 0; bi < n; bi += blksz) {
-			Arrays.fill(c, bi, Math.min(bi + blksz, n), 0);
+		for(int bi = 0; bi < _numRows; bi += blksz) {
+			Arrays.fill(c, bi, Math.min(bi + blksz, _numRows), 0);
 			for(int k = 0, off = 0; k < numVals; k++, off += numCols) {
 				int boff = _ptr[k];
 				int blen = len(k);
@@ -230,16 +218,22 @@
 	@Override
 	public int[] getCounts(int[] counts) {
 		final int numVals = getNumValues();
-		Arrays.fill(counts, 0, numVals, 0);
+		int sum = 0;
 		for(int k = 0; k < numVals; k++) {
 			int boff = _ptr[k];
 			int blen = len(k);
 			// iterate over bitmap blocks and count partial lengths
 			int count = 0;
-			for(int bix = 0; bix < blen; bix += _data[boff + bix] + 1)
+			for(int bix = 0; bix < blen; bix += _data[boff + bix] + 1) {
 				count += _data[boff + bix];
+			}
+			sum += count;
 			counts[k] = count;
 		}
+		if(_zeros) {
+			counts[counts.length - 1] = _numRows * _colIndexes.length - sum;
+		}
 		return counts;
 	}
 
@@ -247,7 +241,8 @@
 	public int[] getCounts(int rl, int ru, int[] counts) {
 		final int blksz = CompressionSettings.BITMAP_BLOCK_SZ;
 		final int numVals = getNumValues();
-		Arrays.fill(counts, 0, numVals, 0);
+		int sum = 0;
 		for(int k = 0; k < numVals; k++) {
 			int boff = _ptr[k];
 			int blen = len(k);
@@ -255,8 +250,12 @@
 			int count = 0;
 			for(int off = rl; bix < blen && off < ru; bix += _data[boff + bix] + 1, off += blksz)
 				count += _data[boff + bix];
+			sum += count;
 			counts[k] = count;
 		}
+		if(_zeros) {
+			counts[counts.length - 1] = (ru - rl) * _colIndexes.length - sum;
+		}
 		return counts;
 	}
 
@@ -274,7 +273,7 @@
 
 		// fast path: sparse-safe operations
 		// Note that bitmaps don't change and are shallow-copied
-		if(op.sparseSafe || val0 == 0) {
+		if(op.sparseSafe || val0 == 0 || !_zeros) {
 			return new ColGroupOLE(_colIndexes, _numRows, _zeros, applyScalarOp(op), _data, _ptr);
 		}
 
@@ -283,23 +282,22 @@
 		boolean[] lind = computeZeroIndicatorVector();
 		int[] loff = computeOffsets(lind);
 		if(loff.length == 0) { // empty offset list: go back to fast path
-			return new ColGroupOLE(_colIndexes, _numRows, true, applyScalarOp(op), _data, _ptr);
+			return new ColGroupOLE(_colIndexes, _numRows, false, applyScalarOp(op), _data, _ptr);
 		}
 
-		double[] rvalues = applyScalarOp(op, val0, getNumCols());
+		ADictionary rvalues = applyScalarOp(op, val0, getNumCols());
 		char[] lbitmap = genOffsetBitmap(loff, loff.length);
 		char[] rbitmaps = Arrays.copyOf(_data, _data.length + lbitmap.length);
 		System.arraycopy(lbitmap, 0, rbitmaps, _data.length, lbitmap.length);
 		int[] rbitmapOffs = Arrays.copyOf(_ptr, _ptr.length + 1);
 		rbitmapOffs[rbitmapOffs.length - 1] = rbitmaps.length;
 
-		return new ColGroupOLE(_colIndexes, _numRows, loff.length < _numRows, rvalues, rbitmaps, rbitmapOffs);
+		return new ColGroupOLE(_colIndexes, _numRows, false, rvalues, rbitmaps, rbitmapOffs);
 	}
 
 	@Override
-	public void rightMultByVector(MatrixBlock vector, MatrixBlock result, int rl, int ru) {
+	public void rightMultByVector(MatrixBlock vector, double[] c, int rl, int ru) {
 		double[] b = ColGroupConverter.getDenseVector(vector);
-		double[] c = result.getDenseBlockValues();
 		final int blksz = CompressionSettings.BITMAP_BLOCK_SZ;
 		final int numCols = getNumCols();
 		final int numVals = getNumValues();
@@ -353,7 +351,7 @@
 				// prepare value-to-add for entire value bitmap
 				int boff = _ptr[k];
 				int blen = len(k);
-				double val = sumValues(k, sb);
+				double val = sumValues(k, sb, _dict.getValues());
 
 				// iterate over bitmap blocks and add values
 				if(val != 0) {
@@ -387,10 +385,9 @@
 		final int blksz = CompressionSettings.BITMAP_BLOCK_SZ;
 		final int numCols = getNumCols();
 		final int numVals = getNumValues();
-		final int n = getNumRows();
 		final double[] values = getValues();
 
-		if(numVals > 1 && _numRows > blksz) {
+		if(numVals >= 1 && _numRows > blksz) {
 			// cache blocking config (see matrix-vector mult for explanation)
 			final int blksz2 = 2 * CompressionSettings.BITMAP_BLOCK_SZ;
 
@@ -401,8 +398,8 @@
 			double[] cvals = allocDVector(numVals, true);
 
 			// step 2: cache conscious matrix-vector via horizontal scans
-			for(int ai = 0; ai < n; ai += blksz2) {
-				int aimax = Math.min(ai + blksz2, n);
+			for(int ai = 0; ai < _numRows; ai += blksz2) {
+				int aimax = Math.min(ai + blksz2, _numRows);
 
 				// horizontal segment scan, incl pos maintenance
 				for(int k = 0; k < numVals; k++) {
@@ -449,87 +446,13 @@
 		}
 	}
 
-	// @Override
-	// public void leftMultByRowVector(ColGroupDDC a, MatrixBlock result) {
-	// 	// note: this method is only applicable for numrows < blocksize
-	// 	double[] c = result.getDenseBlockValues();
-	// 	final int numCols = getNumCols();
-	// 	final int numVals = getNumValues();
-	// 	final double[] values = getValues();
-	// 	final double[] aValues = a.getValues();
-
-	// 	// iterate over all values and their bitmaps
-	// 	for(int k = 0, valOff = 0; k < numVals; k++, valOff += numCols) {
-	// 		int boff = _ptr[k];
-
-	// 		// iterate over bitmap blocks and add partial results
-	// 		double vsum = 0;
-	// 		for(int j = boff + 1; j < boff + 1 + _data[boff]; j++)
-	// 			vsum += aValues[a.getIndex(_data[j])];
-
-	// 		// scale partial results by values and write results
-	// 		for(int j = 0; j < numCols; j++)
-	// 			c[_colIndexes[j]] += vsum * values[valOff + j];
-	// 	}
-	// }
-
 	@Override
-	protected final void computeSum(MatrixBlock result, KahanFunction kplus) {
-
-		// iterate over all values and their bitmaps
-		final int numVals = getNumValues();
-		final int numCols = getNumCols();
-
-		if(_dict instanceof QDictionary && !(kplus instanceof KahanPlusSq)) {
-			final QDictionary values = ((QDictionary) _dict);
-			long sum = 0;
-			for(int k = 0; k < numVals; k++) {
-				int boff = _ptr[k];
-				int blen = len(k);
-				int valOff = k * numCols;
-
-				// iterate over bitmap blocks and count partial lengths
-				int count = 0;
-				for(int bix = 0; bix < blen; bix += _data[boff + bix] + 1)
-					count += _data[boff + bix];
-
-				// scale counts by all values
-				for(int j = 0; j < numCols; j++)
-					sum += values.getValueByte(valOff + j) * count;
-			}
-			result.quickSetValue(0, 0, result.quickGetValue(0, 0) + sum * values._scale);
-			result.quickSetValue(0, 1, 0);
-		}
-		else {
-			KahanObject kbuff = new KahanObject(result.quickGetValue(0, 0), result.quickGetValue(0, 1));
-
-			final double[] values = getValues();
-
-			for(int k = 0; k < numVals; k++) {
-				int boff = _ptr[k];
-				int blen = len(k);
-				int valOff = k * numCols;
-
-				// iterate over bitmap blocks and count partial lengths
-				int count = 0;
-				for(int bix = 0; bix < blen; bix += _data[boff + bix] + 1)
-					count += _data[boff + bix];
-
-				// scale counts by all values
-				for(int j = 0; j < numCols; j++)
-					kplus.execute3(kbuff, values[valOff + j], count);
-			}
-
-			result.quickSetValue(0, 0, kbuff._sum);
-			result.quickSetValue(0, 1, kbuff._correction);
-		}
+	protected final void computeSum(double[] c, KahanFunction kplus) {
+		c[0] += _dict.sum(getCounts(), _colIndexes.length, kplus);
 	}
 
 	@Override
-	protected final void computeRowSums(MatrixBlock result, KahanFunction kplus, int rl, int ru) {
-		// note: due to corrections the output might be a large dense block
-		DenseBlock c = result.getDenseBlock();
-
+	protected final void computeRowSums(double[] c, KahanFunction kplus, int rl, int ru) {
 		KahanObject kbuff = new KahanObject(0, 0);
 		KahanPlus kplus2 = KahanPlus.getKahanPlusFnObject();
 		final int blksz = CompressionSettings.BITMAP_BLOCK_SZ;
@@ -540,7 +463,7 @@
 
 			// step 1: prepare position and value arrays
 			int[] apos = skipScan(numVals, rl);
-			double[] aval = _dict.sumAllRowsToDouble(kplus, kbuff, _colIndexes.length,false);
+			double[] aval = _dict.sumAllRowsToDouble(kplus, kbuff, _colIndexes.length);
 
 			// step 2: cache conscious row sums via horizontal scans
 			for(int bi = rl; bi < ru; bi += blksz2) {
@@ -561,14 +484,7 @@
 						// compute partial results
 						for(int i = 0; i < len; i++) {
 							int rix = ii + _data[pos + i];
-							double[] cvals = c.values(rix);
-							int cix = c.pos(rix);
-
-							kbuff.set(cvals[cix], cvals[cix + 1]);
-							kplus2.execute2(kbuff, val);
-							cvals[cix] = kbuff._sum;
-							cvals[cix + 1] = kbuff._correction;
-
+							setandExecute(c, kbuff, kplus2, val, rix * 2);
 						}
 						bix += len + 1;
 					}
@@ -577,9 +493,7 @@
 				}
 			}
 		}
-		else
-
-		{
+		else {
 			// iterate over all values and their bitmaps
 			for(int k = 0; k < numVals; k++) {
 				// prepare value-to-add for entire value bitmap
@@ -595,12 +509,7 @@
 						slen = _data[boff + bix];
 						for(int i = 1; i <= slen; i++) {
 							int rix = off + _data[boff + bix + i];
-							double[] cvals = c.values(rix);
-							int cix = c.pos(rix);
-							kbuff.set(cvals[cix], cvals[cix + 1]);
-							kplus2.execute2(kbuff, val);
-							cvals[cix] = kbuff._sum;
-							cvals[cix + 1] = kbuff._correction;
+							setandExecute(c, kbuff, kplus2, val, rix * 2);
 						}
 					}
 				}
@@ -609,47 +518,24 @@
 	}
 
 	@Override
-	protected final void computeColSums(MatrixBlock result, KahanFunction kplus) {
-		KahanObject kbuff = new KahanObject(0, 0);
-
-		// iterate over all values and their bitmaps
-		final int numVals = getNumValues();
-		final int numCols = getNumCols();
-		final double[] values = getValues();
-
-		for(int k = 0; k < numVals; k++) {
-			int boff = _ptr[k];
-			int blen = len(k);
-			int valOff = k * numCols;
-
-			// iterate over bitmap blocks and count partial lengths
-			int count = 0;
-			for(int bix = 0; bix < blen; bix += _data[boff + bix] + 1)
-				count += _data[boff + bix];
-
-			// scale counts by all values
-			for(int j = 0; j < numCols; j++) {
-				kbuff.set(result.quickGetValue(0, _colIndexes[j]), result.quickGetValue(1, _colIndexes[j]));
-				kplus.execute3(kbuff, values[valOff + j], count);
-				result.quickSetValue(0, _colIndexes[j], kbuff._sum);
-				result.quickSetValue(1, _colIndexes[j], kbuff._correction);
-			}
-		}
+	protected final void computeColSums(double[] c, KahanFunction kplus) {
+		_dict.colSum(c, getCounts(), _colIndexes, kplus);
 	}
 
 	@Override
-	protected final void computeRowMxx(MatrixBlock result, Builtin builtin, int rl, int ru) {
+	protected final void computeRowMxx(double[] c, Builtin builtin, int rl, int ru) {
 		// NOTE: zeros handled once for all column groups outside
 		final int blksz = CompressionSettings.BITMAP_BLOCK_SZ;
 		final int numVals = getNumValues();
-		double[] c = result.getDenseBlockValues();
+		final double[] values = getValues();
 
 		// iterate over all values and their bitmaps
 		for(int k = 0; k < numVals; k++) {
 			// prepare value-to-add for entire value bitmap
 			int boff = _ptr[k];
 			int blen = len(k);
-			double val = mxxValues(k, builtin);
+			double val = mxxValues(k, builtin, values);
 
 			// iterate over bitmap blocks and add values
 			int slen;
@@ -748,7 +634,7 @@
 		final int blksz = CompressionSettings.BITMAP_BLOCK_SZ;
 
 		if(rl > 0) { // rl aligned with blksz
-			int rskip = (getNumRows() / 2 / blksz) * blksz;
+			int rskip = (_numRows / 2 / blksz) * blksz;
 
 			for(int k = 0; k < numVals; k++) {
 				int boff = _ptr[k];
@@ -769,7 +655,7 @@
 		final int blksz = CompressionSettings.BITMAP_BLOCK_SZ;
 
 		if(rl > 0) { // rl aligned with blksz
-			int rskip = (getNumRows() / 2 / blksz) * blksz;
+			int rskip = (_numRows / 2 / blksz) * blksz;
 			int boff = _ptr[k];
 			int blen = len(k);
 			int start = (rl >= rskip) ? rskip : 0;
@@ -827,7 +713,7 @@
 
 	@Override
 	public Iterator<Integer> getIterator(int k) {
-		return new OLEValueIterator(k, 0, getNumRows());
+		return new OLEValueIterator(k, 0, _numRows);
 	}
 
 	@Override
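
The reworked getCounts no longer pre-fills the output; it accumulates a running sum and, when the group contains zero tuples, patches the uncovered remainder into the last slot. A sketch of the resulting invariant (the allocation convention for the zeros slot is assumed from the code above):

	// Invariant sketch: after getCounts, the entries account for every cell of
	// the group, with the final slot absorbing the zero tuples when _zeros is set.
	int[] counts = grp.getCounts(new int[numSlots]); // numSlots assumed to include the zeros slot
	long total = 0;
	for(int c : counts)
		total += c;
	assert total == (long) numRows * grp.getNumCols(); // mirrors _numRows * _colIndexes.length - sum
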
diff --git a/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupOffset.java b/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupOffset.java
index 5cd85a7..912910a 100644
--- a/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupOffset.java
+++ b/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupOffset.java
@@ -27,7 +27,7 @@
 import java.util.Iterator;
 
 import org.apache.sysds.runtime.compress.CompressionSettings;
-import org.apache.sysds.runtime.compress.utils.AbstractBitmap;
+import org.apache.sysds.runtime.compress.utils.ABitmap;
 import org.apache.sysds.runtime.compress.utils.LinearAlgebraUtils;
 import org.apache.sysds.runtime.functionobjects.Builtin;
 import org.apache.sysds.runtime.functionobjects.Builtin.BuiltinCode;
@@ -44,13 +44,12 @@
 public abstract class ColGroupOffset extends ColGroupValue {
 	private static final long serialVersionUID = -1635828933479403125L;
 
-	protected static final boolean CREATE_SKIP_LIST = true;
-
 	/** Bitmaps, one per uncompressed value tuple in {@link #_dict}. */
-	protected int[] _ptr; // bitmap offsets per value
-	protected char[] _data; // linearized bitmaps (variable length)
+	protected int[] _ptr;
+	/** Linearized bitmaps (variable lengths) */
+	protected char[] _data;
 
-	public ColGroupOffset() {
+	protected ColGroupOffset() {
 		super();
 	}
 
@@ -62,20 +61,12 @@
 	 * @param ubm        Uncompressed bitmap representation of the block
 	 * @param cs         The Compression settings used for compression
 	 */
-	public ColGroupOffset(int[] colIndices, int numRows, AbstractBitmap ubm, CompressionSettings cs) {
+	protected ColGroupOffset(int[] colIndices, int numRows, ABitmap ubm, CompressionSettings cs) {
 		super(colIndices, numRows, ubm, cs);
 	}
 
-	/**
-	 * Constructor for subclass methods that need to create shallow copies
-	 * 
-	 * @param colIndices raw column index information
-	 * @param numRows    number of rows in the block
-	 * @param zeros      indicator if column group contains zero values
-	 * @param values     set of distinct values for the block (associated bitmaps are kept in the subclass)
-	 */
-	protected ColGroupOffset(int[] colIndices, int numRows, boolean zeros, double[] values) {
-		super(colIndices, numRows, values);
+	protected ColGroupOffset(int[] colIndices, int numRows, boolean zeros, ADictionary dict) {
+		super(colIndices, numRows, dict);
 		_zeros = zeros;
 	}
 
@@ -213,10 +204,9 @@
 			LinearAlgebraUtils.vectMultiplyAdd(b[i], values, c, off, 0, numVals);
 	}
 
-	protected final double mxxValues(int bitmapIx, Builtin builtin) {
+	protected final double mxxValues(int bitmapIx, Builtin builtin, double[] values) {
 		final int numCols = getNumCols();
 		final int valOff = bitmapIx * numCols;
-		final double[] values = getValues();
 		double val = (builtin.getBuiltinCode() == BuiltinCode.MAX) ?
 			Double.NEGATIVE_INFINITY : Double.POSITIVE_INFINITY;
 		for(int i = 0; i < numCols; i++)
@@ -303,6 +293,18 @@
 
 	protected abstract boolean[] computeZeroIndicatorVector();
 
 	@Override
 	public Iterator<IJV> getIterator(int rl, int ru, boolean inclZeros, boolean rowMajor) {
 		if(rowMajor)
@@ -491,7 +493,7 @@
 		public IJV next() {
 			if(!hasNext())
 				throw new RuntimeException("No more offset entries.");
-			_ret.set(_rpos, _colIndexes[_cpos], (_vpos < 0) ? 0 : getValue(_vpos, _cpos));
+			_ret.set(_rpos, _colIndexes[_cpos], (_vpos < 0) ? 0 : _dict.getValue(_vpos * getNumCols() + _cpos));
 			getNextValue();
 			return _ret;
 		}
@@ -515,7 +517,7 @@
 					return;
 				_cpos++;
 			}
-			while(!_inclZeros && (_vpos < 0 || getValue(_vpos, _cpos) == 0));
+			while(!_inclZeros && (_vpos < 0 || _dict.getValue(_vpos * getNumCols() + _cpos) == 0));
 		}
 	}
 }
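
mxxValues now takes the materialized dictionary values as a parameter, so callers hoist the materialization (and, for lossy dictionaries, the scaling) out of the per-tuple loop. The caller pattern, as used by computeRowMxx in the OLE and RLE groups:

	// Caller pattern: materialize the dictionary once, reuse for all tuples.
	final double[] values = getValues();
	for(int k = 0; k < getNumValues(); k++) {
		double val = mxxValues(k, builtin, values);
		// ... scan bitmap k and fold val into the row results
	}
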
diff --git a/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupQuan.java b/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupQuan.java
deleted file mode 100644
index 7805921..0000000
--- a/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupQuan.java
+++ /dev/null
@@ -1,513 +0,0 @@
-// /*
-//  * Licensed to the Apache Software Foundation (ASF) under one
-//  * or more contributor license agreements.  See the NOTICE file
-//  * distributed with this work for additional information
-//  * regarding copyright ownership.  The ASF licenses this file
-//  * to you under the Apache License, Version 2.0 (the
-//  * "License"); you may not use this file except in compliance
-//  * with the License.  You may obtain a copy of the License at
-//  *
-//  *   http://www.apache.org/licenses/LICENSE-2.0
-//  *
-//  * Unless required by applicable law or agreed to in writing,
-//  * software distributed under the License is distributed on an
-//  * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-//  * KIND, either express or implied.  See the License for the
-//  * specific language governing permissions and limitations
-//  * under the License.
-//  */
-
-// package org.apache.sysds.runtime.compress.colgroup;
-
-// import java.io.DataInput;
-// import java.io.DataOutput;
-// import java.io.IOException;
-// import java.util.Arrays;
-// import java.util.Iterator;
-
-// import org.apache.commons.lang.NotImplementedException;
-// import org.apache.sysds.runtime.DMLCompressionException;
-// import org.apache.sysds.runtime.DMLScriptException;
-// import org.apache.sysds.runtime.compress.utils.AbstractBitmap;
-// import org.apache.sysds.runtime.compress.utils.BitmapLossy;
-// import org.apache.sysds.runtime.functionobjects.Builtin;
-// import org.apache.sysds.runtime.functionobjects.Builtin.BuiltinCode;
-// import org.apache.sysds.runtime.functionobjects.KahanPlus;
-// import org.apache.sysds.runtime.functionobjects.KahanPlusSq;
-// import org.apache.sysds.runtime.functionobjects.ReduceAll;
-// import org.apache.sysds.runtime.functionobjects.ReduceCol;
-// import org.apache.sysds.runtime.functionobjects.ReduceRow;
-// import org.apache.sysds.runtime.matrix.data.IJV;
-// import org.apache.sysds.runtime.matrix.data.MatrixBlock;
-// import org.apache.sysds.runtime.matrix.operators.AggregateUnaryOperator;
-// import org.apache.sysds.runtime.matrix.operators.ScalarOperator;
-
-// public class ColGroupQuan extends ColGroup {
-
-// 	private static final long serialVersionUID = -9157476271360522008L;
-
-// 	protected QDictionary _values;
-
-// 	protected ColGroupQuan() {
-// 		super();
-// 	}
-
-// 	protected ColGroupQuan(int[] colIndexes, int numRows, AbstractBitmap ubm) {
-// 		// throw new NotImplementedException();
-// 		super(colIndexes, numRows);
-// 		byte[] lossyValues = ((BitmapLossy)ubm).getValues();
-// 		byte[] values = new byte[numRows * colIndexes.length];
-// 		for(int i = 0; i < lossyValues.length; i++) {
-// 			int[] runs = ubm.getOffsetsList(i).extractValues();
-// 			byte curV = lossyValues[i];
-
-// 			for(int j = 0; j < ubm.getOffsetsList(i).size(); j++) {
-// 				values[runs[j]] = curV;
-// 			}
-// 		}
-
-// 		_values = new QDictionary(values, ((BitmapLossy)ubm).getScale());
-// 	}
-
-// 	protected ColGroupQuan(int[] colIndexes, int numRows, QDictionary values) {
-// 		super(colIndexes, numRows);
-// 		_values = values;
-// 	}
-
-// 	@Override
-// 	public boolean getIfCountsType() {
-// 		return false;
-// 	}
-
-// 	private ColGroupQuan(int[] colIndexes, QDictionary values) {
-// 		super(colIndexes, values.getValuesLength() / colIndexes.length);
-// 		this._values = values;
-// 	}
-
-// 	@Override
-// 	public CompressionType getCompType() {
-// 		return CompressionType.QUAN;
-// 	}
-
-// 	@Override
-// 	protected ColGroupType getColGroupType() {
-// 		return ColGroupType.QUAN8S;
-// 	}
-
-// 	@Override
-// 	public void decompressToBlock(MatrixBlock target, int rl, int ru) {
-// 		if(_values == null || _values.getValuesLength()   == 0) {
-// 			return;
-// 		}
-// 		// TODO Fix Loop to not multiply
-// 		for(int row = rl; row < ru; row++) {
-// 			for(int colIx = 0; colIx < _colIndexes.length; colIx++) {
-// 				int col = _colIndexes[colIx];
-// 				target.quickSetValue(row, col, _values.getValue(row * colIx + row));
-// 			}
-// 		}
-// 	}
-
-// 	@Override
-// 	public void decompressToBlock(MatrixBlock target, int[] colIndexTargets) {
-// 		if(_values == null || _values.getValuesLength() == 0) {
-// 			return;
-// 		}
-// 		for(int row = 0; row < _numRows; row++) {
-// 			for(int colIx = 0; colIx < _colIndexes.length; colIx++) {
-// 				int col = _colIndexes[colIx];
-// 				target.quickSetValue(row, col, _values.getValue(row * colIx + row));
-// 			}
-// 		}
-// 	}
-
-// 	@Override
-// 	public void decompressToBlock(MatrixBlock target, int colpos) {
-// 		if(_values == null || _values.getValuesLength()  == 0)
-// 			return;
-
-// 		double[] c = target.getDenseBlockValues();
-// 		int nnz = 0;
-// 		// TODO Fix for multi col group
-// 		for(int row = 0; row < _numRows; row++) {
-// 			double val = _values.getValue(row);
-// 			if(val != 0) {
-// 				nnz++;
-// 			}
-// 			c[row] = val;
-// 		}
-// 		target.setNonZeros(nnz);
-// 	}
-
-// 	@Override
-// 	public void write(DataOutput out) throws IOException {
-
-// 		out.writeInt(_numRows);
-// 		out.writeInt(_colIndexes.length);
-
-// 		for(int i = 0; i < _colIndexes.length; i++)
-// 			out.writeInt(_colIndexes[i]);
-
-// 		for(int i = 0; i < _values.getValuesLength() ; i++)
-// 			out.writeByte(_values.getValueByte(i));
-
-// 		out.writeDouble(_values.getScale());
-// 	}
-
-// 	@Override
-// 	public void readFields(DataInput in) throws IOException {
-// 		_numRows = in.readInt();
-// 		int numCols = in.readInt();
-
-// 		_colIndexes = new int[numCols];
-// 		for(int i = 0; i < _colIndexes.length; i++)
-// 			_colIndexes[i] = in.readInt();
-
-// 		byte[] values = new byte[_numRows * numCols];
-// 		for(int i = 0; i < values.length; i++)
-// 			values[i] = in.readByte();
-
-// 		double scale = in.readDouble();
-
-// 		_values = new QDictionary(values, scale);
-// 	}
-
-// 	@Override
-// 	public long getExactSizeOnDisk() {
-// 		long ret = 8; // header
-// 		ret += 8; // Object header of QDictionary
-// 		ret += 4 * _colIndexes.length;
-// 		ret += _values.getValuesLength() ;
-// 		ret += 8; // scale value
-// 		return ret;
-// 	}
-
-// 	@Override
-// 	public double get(int r, int c) {
-// 		int colIx = Arrays.binarySearch(_colIndexes, c);
-// 		return _values.getValue(r * colIx + r);
-// 	}
-
-// 	@Override
-// 	public void rightMultByVector(MatrixBlock vector, MatrixBlock result, int rl, int ru) {
-
-// 		double[] b = ColGroupConverter.getDenseVector(vector);
-// 		double[] c = result.getDenseBlockValues();
-
-// 		if(_colIndexes.length == 1) {
-// 			double r = b[_colIndexes[0]] * _values.getScale();
-// 			for(int row = rl; row < ru; row++) {
-// 				c[row] += _values.getValueByte(row) * r;
-// 			}
-// 		}
-// 		else {
-
-// 			// prepare reduced rhs w/ relevant values
-// 			double[] sb = new double[_colIndexes.length];
-// 			for(int j = 0; j < _colIndexes.length; j++) {
-// 				sb[j] = b[_colIndexes[j]];
-// 			}
-
-// 			int colIx = 0;
-// 			for(int off = 0; off < _values.getValuesLength() ; off += _numRows) {
-// 				double r = _values.getScale() * sb[colIx];
-// 				for(int row = rl; row < ru; row++) {
-// 					c[row] += _values.getValueByte(off + row) * r;
-// 				}
-// 				colIx++;
-// 			}
-// 		}
-// 	}
-
-// 	@Override
-// 	public void leftMultByRowVector(MatrixBlock vector, MatrixBlock result) {
-// 		double[] a = ColGroupConverter.getDenseVector(vector);
-// 		double[] c = result.getDenseBlockValues();
-
-// 		for(int row = 0; row < _numRows; row++) {
-// 			double val = _values.getValue(row);
-// 			for(int col = 0; col < _colIndexes.length; col++) {
-// 				double value = val * a[row * col + row];
-// 				c[_colIndexes[col]] += value;
-// 			}
-// 		}
-
-// 	}
-
-// 	@Override
-// 	public void leftMultByRowVector(ColGroupDDC vector, MatrixBlock result) {
-// 		throw new NotImplementedException();
-// 	}
-
-// 	@Override
-// 	public ColGroup scalarOperation(ScalarOperator op) {
-// 		QDictionary res = _values.apply(op);
-// 		return new ColGroupQuan(_colIndexes, res);
-// 	}
-
-// 	@Override
-// 	public void unaryAggregateOperations(AggregateUnaryOperator op, MatrixBlock result) {
-// 		unaryAggregateOperations(op, result, 0, getNumRows());
-// 	}
-
-// 	@Override
-// 	public long estimateInMemorySize() {
-// 		return ColGroupSizes.estimateInMemorySizeQuan(getNumRows(), getNumCols());
-// 	}
-
-// 	@Override
-// 	public void unaryAggregateOperations(AggregateUnaryOperator op, MatrixBlock result, int rl, int ru) {
-
-// 		if(op.aggOp.increOp.fn instanceof KahanPlus) {
-
-// 			// Not using KahnObject because we already lost some of that precision anyway in
-// 			// quantization.
-// 			if(op.indexFn instanceof ReduceAll)
-// 				computeSum(result);
-// 			else if(op.indexFn instanceof ReduceCol)
-// 				computeRowSums(result, rl, ru);
-// 			else if(op.indexFn instanceof ReduceRow)
-// 				computeColSums(result);
-// 		}
-// 		else if(op.aggOp.increOp.fn instanceof KahanPlusSq) {
-// 			if(op.indexFn instanceof ReduceAll)
-// 				computeSumSq(result);
-// 			else if(op.indexFn instanceof ReduceCol)
-// 				computeRowSumsSq(result, rl, ru);
-// 			else if(op.indexFn instanceof ReduceRow)
-// 				computeColSumsSq(result);
-// 		}
-// 		else if(op.aggOp.increOp.fn instanceof Builtin &&
-// 			(((Builtin) op.aggOp.increOp.fn).getBuiltinCode() == BuiltinCode.MAX ||
-// 				((Builtin) op.aggOp.increOp.fn).getBuiltinCode() == BuiltinCode.MIN)) {
-// 			Builtin builtin = (Builtin) op.aggOp.increOp.fn;
-// 			// min and max (reduceall/reducerow over tuples only)
-
-// 			if(op.indexFn instanceof ReduceAll)
-// 				computeMxx(result, builtin, _zeros);
-// 			else if(op.indexFn instanceof ReduceCol)
-// 				computeRowMxx(result, builtin, rl, ru);
-// 			else if(op.indexFn instanceof ReduceRow)
-// 				computeColMxx(result, builtin, _zeros);
-// 		}
-// 		else {
-// 			throw new DMLScriptException("Unknown UnaryAggregate operator on CompressedMatrixBlock");
-// 		}
-// 	}
-
-// 	protected void computeSum(MatrixBlock result) {
-// 		long sum = 0L;
-// 		for(int i = 0; i < _values.length(); i++) {
-// 			sum += _values.getValueByte(i);
-// 		}
-// 		result.quickSetValue(0, 0, result.getValue(0, 0) + (double) sum * _values.getScale());
-// 	}
-
-// 	protected void computeSumSq(MatrixBlock result) {
-
-// 		double sumsq = 0;
-// 		for(int i = 0; i < _values.length(); i++) {
-// 			double v = _values.getValue(i);
-// 			sumsq += v * v;
-// 		}
-// 		result.quickSetValue(0, 0, result.getValue(0, 0) + sumsq);
-// 	}
-
-// 	protected void computeRowSums(MatrixBlock result, int rl, int ru) {
-// 		if(_colIndexes.length < 256) {
-// 			short[] rowSums = new short[ru - rl];
-// 			for(int row = rl; row < ru; row++) {
-// 				for(int colIx = 0; colIx < _colIndexes.length; colIx++) {
-// 					rowSums[row - rl] += _values.getValueByte(row * colIx + row);
-// 				}
-// 			}
-// 			for(int row = rl; row < ru; row++) {
-// 				result.quickSetValue(row, 0, result.getValue(row, 0) + rowSums[row - rl] * _values.getScale());
-// 			}
-// 		}
-// 		else {
-// 			throw new NotImplementedException("Not Implemented number of columns in ColGroupQuan row sum");
-// 		}
-// 	}
-
-// 	protected void computeRowSumsSq(MatrixBlock result, int rl, int ru) {
-// 		// TODO FIX Loop Index calculation!
-// 		if(_colIndexes.length < 256) {
-// 			float[] rowSumSq = new float[ru - rl];
-// 			for(int row = rl; row < ru; row++) {
-// 				for(int colIx = 0; colIx < _colIndexes.length; colIx++) {
-// 					double v = _values.getValue(row * colIx + row);
-// 					rowSumSq[row - rl] += v * v;
-// 				}
-// 			}
-
-// 			for(int row = rl; row < ru; row++) {
-// 				result.quickSetValue(row, 0, result.getValue(row, 0) + rowSumSq[row - rl]);
-// 			}
-
-// 		}
-// 		else {
-// 			throw new NotImplementedException("Not Implemented number of columns in ColGroupQuan row sum");
-// 		}
-// 	}
-
-// 	protected void computeColSums(MatrixBlock result) {
-// 		// TODO AVOID division
-// 		if(_numRows < 256) {
-// 			short[] colSums = new short[_colIndexes.length];
-// 			for(int i = 0; i < _values.length(); i++) {
-// 				colSums[i / _numRows] += _values.getValueByte(i);
-// 			}
-
-// 			for(int col = 0; col < _colIndexes.length; col++) {
-// 				result.quickSetValue(0, _colIndexes[col], colSums[col] * _values.getScale());
-// 			}
-// 		}
-// 		else if(_numRows < 16777216) { // (Int max + 1) / (short max + 1)
-// 			int[] colSums = new int[_colIndexes.length];
-// 			for(int i = 0; i < _values.length(); i++) {
-// 				colSums[i / _numRows] += _values.getValueByte(i);
-// 			}
-
-// 			for(int col = 0; col < _colIndexes.length; col++) {
-// 				result.quickSetValue(0, _colIndexes[col], colSums[col] * _values.getScale());
-// 			}
-// 		}
-// 		else {
-// 			double[] colSums = new double[_colIndexes.length];
-// 			for(int i = 0; i < _values.length(); i++) {
-// 				colSums[i / _numRows] += _values.getValueByte(i);
-// 			}
-
-// 			for(int col = 0; col < _colIndexes.length; col++) {
-// 				result.quickSetValue(0, _colIndexes[col], colSums[col] * _values.getScale());
-// 			}
-// 		}
-// 	}
-
-// 	protected void computeColSumsSq(MatrixBlock result) {
-
-// 		// TODO Avoid Division!
-// 		double[] sumsq = new double[_colIndexes.length];
-// 		for(int i = 0; i < _values.length(); i++) {
-// 			double v = _values.getValue(i);
-// 			sumsq[i / _numRows] += v * v;
-// 		}
-
-// 		for(int col = 0; col < _colIndexes.length; col++) {
-// 			result.quickSetValue(0, _colIndexes[col], sumsq[col]);
-// 		}
-
-// 	}
-
-// 	protected void computeRowMxx(MatrixBlock result, Builtin builtin, int rl, int ru) {
-// 		double[] c = result.getDenseBlockValues();
-// 		// TODO: Fix Loop!
-// 		for(int row = rl; row < ru; row++) {
-// 			for(int colIx = 0; colIx < _colIndexes.length; colIx++) {
-
-// 				double v = _values.getValue(row * colIx + row);
-// 				// System.out.println(v);
-// 				c[row] = builtin.execute(c[row], v);
-// 			}
-// 		}
-
-// 	}
-
-// 	protected void computeMxx(MatrixBlock result, Builtin builtin, boolean zeros) {
-
-// 		double res = 0;
-// 		for(int i = 0; i < _values.length(); i++) {
-// 			res = builtin.execute(res, _values.getValue(i));
-// 		}
-// 		result.quickSetValue(0, 0, res);
-// 	}
-
-// 	protected void computeColMxx(MatrixBlock result, Builtin builtin, boolean zeros) {
-// 		double[] colRes = new double[_colIndexes.length];
-// 		// TODO FIX INDEX CALCULATION / loop
-// 		for(int i = 0; i < _values.length(); i++) {
-// 			colRes[i / _numRows] = builtin.execute(colRes[i / _numRows], _values.getValue(i));
-// 		}
-
-// 		for(int col = 0; col < _colIndexes.length; col++) {
-// 			result.quickSetValue(0, _colIndexes[col], colRes[col]);
-// 		}
-// 	}
-
-// 	@Override
-// 	public Iterator<IJV> getIterator(int rl, int ru, boolean inclZeros, boolean rowMajor) {
-// 		return new QuanValueIterator();
-// 	}
-
-// 	private class QuanValueIterator implements Iterator<IJV> {
-
-// 		@Override
-// 		public boolean hasNext() {
-// 			throw new NotImplementedException("Not Implemented");
-// 		}
-
-// 		@Override
-// 		public IJV next() {
-// 			throw new NotImplementedException("Not Implemented");
-// 		}
-
-// 	}
-
-// 	@Override
-// 	public ColGroupRowIterator getRowIterator(int rl, int ru) {
-
-// 		return new QuanRowIterator();
-// 	}
-
-// 	private class QuanRowIterator extends ColGroupRowIterator {
-
-// 		@Override
-// 		public void next(double[] buff, int rowIx, int segIx, boolean last) {
-// 			throw new NotImplementedException("Not Implemented");
-// 		}
-
-// 	}
-
-// 	@Override
-// 	public void countNonZerosPerRow(int[] rnnz, int rl, int ru) {
-
-// 		for(int row = rl; row < ru; row++) {
-// 			int lnnz = 0;
-// 			for(int colIx = 0; colIx < _colIndexes.length; colIx++) {
-// 				lnnz += (_values.getValue(row * colIx + row) != 0) ? 1 : 0;
-// 			}
-// 			rnnz[row - rl] += lnnz;
-// 		}
-// 	}
-
-// 	@Override
-// 	public MatrixBlock getValuesAsBlock() {
-// 		MatrixBlock target = new MatrixBlock(_numRows, _colIndexes.length, 0.0);
-// 		decompressToBlock(target, _colIndexes);
-// 		return target;
-// 	}
-
-// 	@Override
-// 	public int[] getCounts() {
-// 		throw new DMLCompressionException(
-// 			"Invalid function call, the counts in Uncompressed Col Group is always 1 for each value");
-// 	}
-
-// 	@Override
-// 	public int[] getCounts(boolean includeZero) {
-// 		throw new DMLCompressionException(
-// 			"Invalid function call, the counts in Uncompressed Col Group is always 1 for each value");
-// 	}
-
-// 	@Override
-// 	public double[] getValues() {
-// 		return _values.getValues();
-// 	}
-
-// 	@Override
-// 	public boolean isLossy() {
-// 		return true;
-// 	}
-
-// }
\ No newline at end of file
diff --git a/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupRLE.java b/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupRLE.java
index 802dee4..315e3e7 100644
--- a/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupRLE.java
+++ b/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupRLE.java
@@ -24,16 +24,12 @@
 import java.util.Iterator;
 import java.util.List;
 
-import org.apache.commons.logging.Log;
-import org.apache.commons.logging.LogFactory;
 import org.apache.sysds.runtime.compress.CompressionSettings;
-import org.apache.sysds.runtime.compress.utils.AbstractBitmap;
+import org.apache.sysds.runtime.compress.utils.ABitmap;
 import org.apache.sysds.runtime.compress.utils.LinearAlgebraUtils;
-import org.apache.sysds.runtime.data.DenseBlock;
 import org.apache.sysds.runtime.functionobjects.Builtin;
 import org.apache.sysds.runtime.functionobjects.KahanFunction;
 import org.apache.sysds.runtime.functionobjects.KahanPlus;
-import org.apache.sysds.runtime.functionobjects.KahanPlusSq;
 import org.apache.sysds.runtime.instructions.cp.KahanObject;
 import org.apache.sysds.runtime.matrix.data.MatrixBlock;
 import org.apache.sysds.runtime.matrix.data.Pair;
@@ -43,8 +39,6 @@
 public class ColGroupRLE extends ColGroupOffset {
 	private static final long serialVersionUID = 7450232907594748177L;
 
-	private static final Log LOG = LogFactory.getLog(ColGroupRLE.class.getName());
-
 	protected ColGroupRLE() {
 		super();
 	}
@@ -57,7 +51,7 @@
 	 * @param ubm        Uncompressed bitmap representation of the block
 	 * @param cs         The Compression settings used for compression
 	 */
-	protected ColGroupRLE(int[] colIndices, int numRows, AbstractBitmap ubm, CompressionSettings cs) {
+	protected ColGroupRLE(int[] colIndices, int numRows, ABitmap ubm, CompressionSettings cs) {
 		super(colIndices, numRows, ubm, cs);
 
 		// compress the bitmaps
@@ -72,16 +66,11 @@
 		// compact bitmaps to linearized representation
 		createCompressedBitmaps(numVals, totalLen, lbitmaps);
 
-		// debug output
-		double ucSize = ColGroupSizes.estimateInMemorySizeUncompressed(numRows, colIndices.length, 1.0);
-		if(estimateInMemorySize() > ucSize)
-			LOG.warn(String
-				.format("RLE group larger than UC dense: %8d Uncompressed: %8d", estimateInMemorySize(), (int) ucSize));
 	}
 
-	protected ColGroupRLE(int[] colIndices, int numRows, boolean zeros, double[] values, char[] bitmaps,
+	protected ColGroupRLE(int[] colIndices, int numRows, boolean zeros, ADictionary dict, char[] bitmaps,
 		int[] bitmapOffs) {
-		super(colIndices, numRows, zeros, values);
+		super(colIndices, numRows, zeros, dict);
 		_data = bitmaps;
 		_ptr = bitmapOffs;
 	}
@@ -142,7 +131,6 @@
 			final int blksz = 128 * 1024;
 			final int numCols = getNumCols();
 			final int numVals = getNumValues();
-			final int n = getNumRows();
 			final double[] values = getValues();
 
 			// position and start offset arrays
@@ -155,8 +143,8 @@
 				cix[j] = colixTargets[_colIndexes[j]];
 
 			// cache conscious append via horizontal scans
-			for(int bi = 0; bi < n; bi += blksz) {
-				int bimax = Math.min(bi + blksz, n);
+			for(int bi = 0; bi < _numRows; bi += blksz) {
+				int bimax = Math.min(bi + blksz, _numRows);
 				for(int k = 0, off = 0; k < numVals; k++, off += numCols) {
 					int boff = _ptr[k];
 					int blen = len(k);
@@ -189,7 +177,6 @@
 		final int blksz = 128 * 1024;
 		final int numCols = getNumCols();
 		final int numVals = getNumValues();
-		final int n = getNumRows();
 		double[] c = target.getDenseBlockValues();
 		final double[] values = getValues();
 
@@ -199,8 +186,8 @@
 
 		// cache conscious append via horizontal scans
 		int nnz = 0;
-		for(int bi = 0; bi < n; bi += blksz) {
-			int bimax = Math.min(bi + blksz, n);
+		for(int bi = 0; bi < _numRows; bi += blksz) {
+			int bimax = Math.min(bi + blksz, _numRows);
 			Arrays.fill(c, bi, bimax, 0);
 			for(int k = 0, off = 0; k < numVals; k++, off += numCols) {
 				int boff = _ptr[k];
@@ -226,7 +213,7 @@
 	@Override
 	public int[] getCounts(int[] counts) {
 		final int numVals = getNumValues();
-		Arrays.fill(counts, 0, numVals, 0);
+		int sum = 0;
 		for(int k = 0; k < numVals; k++) {
 			int boff = _ptr[k];
 			int blen = len(k);
@@ -237,15 +224,19 @@
 				curRunEnd = curRunStartOff + _data[boff + bix + 1];
 				count += curRunEnd - curRunStartOff;
 			}
+			sum += count;
 			counts[k] = count;
 		}
+		if(_zeros) {
+			counts[counts.length - 1] = _numRows * _colIndexes.length - sum;
+		}
 		return counts;
 	}
 
 	@Override
 	public int[] getCounts(int rl, int ru, int[] counts) {
 		final int numVals = getNumValues();
-		Arrays.fill(counts, 0, numVals, 0);
+		int sum = 0;
 		for(int k = 0; k < numVals; k++) {
 			int boff = _ptr[k];
 			int blen = len(k);
@@ -259,15 +250,23 @@
 				curRunEnd = curRunStartOff + _data[boff + bix + 1];
 				count += Math.min(curRunEnd, ru) - curRunStartOff;
 			}
+			sum += count;
 			counts[k] = count;
 		}
+		if(_zeros) {
+			counts[counts.length - 1] = (ru - rl) * _colIndexes.length - sum;
+		}
 		return counts;
 	}
 
 	@Override
-	public void rightMultByVector(MatrixBlock vector, MatrixBlock result, int rl, int ru) {
+	public void rightMultByVector(MatrixBlock vector, double[] c, int rl, int ru) {
 		double[] b = ColGroupConverter.getDenseVector(vector);
-		double[] c = result.getDenseBlockValues();
+		if(c == null) {
+			throw new NullPointerException("Result vector not available");
+		}
 		final int numCols = getNumCols();
 		final int numVals = getNumValues();
 
@@ -277,7 +276,7 @@
 			sb[j] = b[_colIndexes[j]];
 		}
 
-		if(numVals > 1 && _numRows > CompressionSettings.BITMAP_BLOCK_SZ) {
+		if(numVals >= 1 && _numRows > CompressionSettings.BITMAP_BLOCK_SZ) {
 			// L3 cache alignment, see comment rightMultByVector OLE column group
 			// core difference of RLE to OLE is that runs are not segment alignment,
 			// which requires care of handling runs crossing cache-buckets
@@ -306,10 +305,10 @@
 					while(bix < blen) {
 						int lstart = _data[boff + bix];
 						int llen = _data[boff + bix + 1];
-						LinearAlgebraUtils.vectAdd(val,
-							c,
-							Math.max(bi, start + lstart),
-							Math.min(start + lstart + llen, bimax) - Math.max(bi, start + lstart));
+						int len = Math.min(start + lstart + llen, bimax) - Math.max(bi, start + lstart);
+						if(len > 0) {
+							LinearAlgebraUtils.vectAdd(val, c, Math.max(bi, start + lstart), len);
+						}
 						if(start + lstart + llen >= bimax)
 							break;
 						start += lstart + llen;
@@ -325,7 +324,7 @@
 			for(int k = 0; k < numVals; k++) {
 				int boff = _ptr[k];
 				int blen = len(k);
-				double val = sumValues(k, sb);
+				double val = sumValues(k, sb, _dict.getValues());
 				int bix = 0;
 				int start = 0;
 
@@ -364,10 +363,9 @@
 		double[] c = result.getDenseBlockValues();
 		final int numCols = getNumCols();
 		final int numVals = getNumValues();
-		final int n = getNumRows();
 		final double[] values = getValues();
 
-		if(numVals > 1 && _numRows > CompressionSettings.BITMAP_BLOCK_SZ) {
+		if(numVals >= 1 && _numRows > CompressionSettings.BITMAP_BLOCK_SZ) {
 			final int blksz = 2 * CompressionSettings.BITMAP_BLOCK_SZ;
 
 			// step 1: prepare position and value arrays
@@ -378,8 +376,8 @@
 			double[] cvals = allocDVector(numVals, true);
 
 			// step 2: cache conscious matrix-vector via horizontal scans
-			for(int ai = 0; ai < n; ai += blksz) {
-				int aimax = Math.min(ai + blksz, n);
+			for(int ai = 0; ai < _numRows; ai += blksz) {
+				int aimax = Math.min(ai + blksz, _numRows);
 
 				// horizontal scan, incl pos maintenance
 				for(int k = 0; k < numVals; k++) {
@@ -430,44 +428,13 @@
 		}
 	}
 
-	// @Override
-	// public void leftMultByRowVector(ColGroupDDC a, MatrixBlock result) {
-	// 	// note: this method is only applicable for numrows < blocksize
-	// 	double[] c = result.getDenseBlockValues();
-	// 	final int numCols = getNumCols();
-	// 	final int numVals = getNumValues();
-	// 	final double[] values = getValues();
-	// 	final double[] aValues = a.getValues();
-
-	// 	// iterate over all values and their bitmaps
-	// 	for(int k = 0, valOff = 0; k < numVals; k++, valOff += numCols) {
-	// 		int boff = _ptr[k];
-	// 		int blen = len(k);
-
-	// 		double vsum = 0;
-	// 		int curRunEnd = 0;
-	// 		for(int bix = 0; bix < blen; bix += 2) {
-	// 			int curRunStartOff = curRunEnd + _data[boff + bix];
-	// 			int curRunLen = _data[boff + bix + 1];
-	// 			for(int i = curRunStartOff; i < curRunStartOff + curRunLen; i++) {
-	// 				vsum += aValues[a.getIndex(_data[i])];
-	// 			}
-	// 			curRunEnd = curRunStartOff + curRunLen;
-	// 		}
-
-	// 		// scale partial results by values and write results
-	// 		for(int j = 0; j < numCols; j++)
-	// 			c[_colIndexes[j]] += vsum * values[valOff + j];
-	// 	}
-	// }
-
 	@Override
 	public ColGroup scalarOperation(ScalarOperator op) {
 		double val0 = op.executeScalar(0);
 
 		// fast path: sparse-safe operations
 		// Note that bitmaps don't change and are shallow-copied
-		if(op.sparseSafe || val0 == 0) {
+		if(op.sparseSafe || val0 == 0 || !_zeros) {
 			return new ColGroupRLE(_colIndexes, _numRows, _zeros, applyScalarOp(op), _data, _ptr);
 		}
 
@@ -476,79 +443,32 @@
 		boolean[] lind = computeZeroIndicatorVector();
 		int[] loff = computeOffsets(lind);
 		if(loff.length == 0) { // empty offset list: go back to fast path
-			return new ColGroupRLE(_colIndexes, _numRows, true, applyScalarOp(op), _data, _ptr);
+			return new ColGroupRLE(_colIndexes, _numRows, false, applyScalarOp(op), _data, _ptr);
 		}
 
-		double[] rvalues = applyScalarOp(op, val0, getNumCols());
+		ADictionary rvalues = applyScalarOp(op, val0, getNumCols());
 		char[] lbitmap = genRLEBitmap(loff, loff.length);
 		char[] rbitmaps = Arrays.copyOf(_data, _data.length + lbitmap.length);
 		System.arraycopy(lbitmap, 0, rbitmaps, _data.length, lbitmap.length);
 		int[] rbitmapOffs = Arrays.copyOf(_ptr, _ptr.length + 1);
 		rbitmapOffs[rbitmapOffs.length - 1] = rbitmaps.length;
 
-		return new ColGroupRLE(_colIndexes, _numRows, loff.length < _numRows, rvalues, rbitmaps, rbitmapOffs);
+		return new ColGroupRLE(_colIndexes, _numRows, false, rvalues, rbitmaps, rbitmapOffs);
 	}
 
 	@Override
-	protected final void computeSum(MatrixBlock result, KahanFunction kplus) {
-
-		final int numCols = getNumCols();
-		final int numVals = getNumValues();
-
-		if(_dict instanceof QDictionary && !(kplus instanceof KahanPlusSq)) {
-			final QDictionary values = ((QDictionary) _dict);
-			long sum = 0;
-			for(int k = 0; k < numVals; k++) {
-				int count = getCountValue(k);
-				int valOff = k * _colIndexes.length;
-				// scale counts by all values
-				for(int j = 0; j < numCols; j++)
-					sum += values.getValueByte(valOff + j) * count;
-			}
-			result.quickSetValue(0, 0, result.quickGetValue(0, 0) + sum * values._scale);
-			result.quickSetValue(0, 1, 0);
-		}
-		else {
-			KahanObject kbuff = new KahanObject(result.quickGetValue(0, 0), result.quickGetValue(0, 1));
-
-			final double[] values = getValues();
-			for(int k = 0; k < numVals; k++) {
-				int count = getCountValue(k);
-				int valOff = k * _colIndexes.length;
-				// scale counts by all values
-				for(int j = 0; j < numCols; j++)
-					kplus.execute3(kbuff, values[valOff + j], count);
-			}
-
-			result.quickSetValue(0, 0, kbuff._sum);
-			result.quickSetValue(0, 1, kbuff._correction);
-		}
-
-	}
-
-	private int getCountValue(int k) {
-		int boff = _ptr[k];
-		int blen = len(k);
-		int curRunEnd = 0;
-		int count = 0;
-		for(int bix = 0; bix < blen; bix += 2) {
-			int curRunStartOff = curRunEnd + _data[boff + bix];
-			curRunEnd = curRunStartOff + _data[boff + bix + 1];
-			count += curRunEnd - curRunStartOff;
-		}
-		return count;
+	protected final void computeSum(double[] c, KahanFunction kplus) {
+		c[0] += _dict.sum(getCounts(), _colIndexes.length, kplus);
 	}
 
 	@Override
-	protected final void computeRowSums(MatrixBlock result, KahanFunction kplus, int rl, int ru) {
-		// note: due to corrections the output might be a large dense block
-		DenseBlock c = result.getDenseBlock();
+	protected final void computeRowSums(double[] c, KahanFunction kplus, int rl, int ru) {
 		KahanObject kbuff = new KahanObject(0, 0);
 		KahanPlus kplus2 = KahanPlus.getKahanPlusFnObject();
 
 		final int numVals = getNumValues();
 
-		if( numVals > 1 && _numRows > CompressionSettings.BITMAP_BLOCK_SZ) {
+		if(numVals > 1 && _numRows > CompressionSettings.BITMAP_BLOCK_SZ) {
 			final int blksz = CompressionSettings.BITMAP_BLOCK_SZ;
 
 			// step 1: prepare position and value arrays
@@ -556,7 +476,7 @@
 			// current pos / values per RLE list
 			int[] astart = new int[numVals];
 			int[] apos = skipScan(numVals, rl, astart);
-			double[] aval = _dict.sumAllRowsToDouble(kplus, kbuff, _colIndexes.length,false);
+			double[] aval = _dict.sumAllRowsToDouble(kplus, kbuff, _colIndexes.length);
 
 			// step 2: cache conscious matrix-vector via horizontal scans
 			for(int bi = rl; bi < ru; bi += blksz) {
@@ -577,12 +497,7 @@
 						int from = Math.max(bi, start + lstart);
 						int to = Math.min(start + lstart + llen, bimax);
 						for(int rix = from; rix < to; rix++) {
-							double[] cvals = c.values(rix);
-							int cix = c.pos(rix);
-							kbuff.set(cvals[cix], cvals[cix + 1]);
-							kplus2.execute2(kbuff, val);
-							cvals[cix] = kbuff._sum;
-							cvals[cix + 1] = kbuff._correction;
+							setandExecute(c, kbuff, kplus2, val, rix);
 						}
 						if(start + lstart + llen >= bimax)
 							break;
@@ -610,12 +525,7 @@
 						curRunStartOff = curRunEnd + _data[boff + bix];
 						curRunEnd = curRunStartOff + _data[boff + bix + 1];
 						for(int rix = curRunStartOff; rix < curRunEnd && rix < ru; rix++) {
-							double[] cvals = c.values(rix);
-							int cix = c.pos(rix);
-							kbuff.set(cvals[cix], cvals[cix + 1]);
-							kplus2.execute2(kbuff, val);
-							cvals[cix] = kbuff._sum;
-							cvals[cix + 1] = kbuff._correction;
+							setandExecute(c, kbuff, kplus2, val, rix * 2);
 						}
 					}
 				}
@@ -624,45 +534,21 @@
 	}
 
 	@Override
-	protected final void computeColSums(MatrixBlock result, KahanFunction kplus) {
-		KahanObject kbuff = new KahanObject(0, 0);
+	protected final void computeColSums(double[] c, KahanFunction kplus) {
+		_dict.colSum(c, getCounts(), _colIndexes, kplus);
+	}
 
-		final int numCols = getNumCols();
+	@Override
+	protected final void computeRowMxx(double[] c, Builtin builtin, int rl, int ru) {
+		// NOTE: zeros handled once for all column groups outside
 		final int numVals = getNumValues();
 		final double[] values = getValues();
 
 		for(int k = 0; k < numVals; k++) {
 			int boff = _ptr[k];
 			int blen = len(k);
-			int valOff = k * numCols;
-			int curRunEnd = 0;
-			int count = 0;
-			for(int bix = 0; bix < blen; bix += 2) {
-				int curRunStartOff = curRunEnd + _data[boff + bix];
-				curRunEnd = curRunStartOff + _data[boff + bix + 1];
-				count += curRunEnd - curRunStartOff;
-			}
-
-			// scale counts by all values
-			for(int j = 0; j < numCols; j++) {
-				kbuff.set(result.quickGetValue(0, _colIndexes[j]), result.quickGetValue(1, _colIndexes[j]));
-				kplus.execute3(kbuff, values[valOff + j], count);
-				result.quickSetValue(0, _colIndexes[j], kbuff._sum);
-				result.quickSetValue(1, _colIndexes[j], kbuff._correction);
-			}
-		}
-	}
-
-	@Override
-	protected final void computeRowMxx(MatrixBlock result, Builtin builtin, int rl, int ru) {
-		// NOTE: zeros handled once for all column groups outside
-		final int numVals = getNumValues();
-		double[] c = result.getDenseBlockValues();
-
-		for(int k = 0; k < numVals; k++) {
-			int boff = _ptr[k];
-			int blen = len(k);
-			double val = mxxValues(k, builtin);
+			double val = mxxValues(k, builtin, values);
 
 			Pair<Integer, Integer> tmp = skipScanVal(k, rl);
 			int bix = tmp.getKey();
@@ -788,7 +674,7 @@
 
 	@Override
 	public Iterator<Integer> getIterator(int k) {
-		return new RLEValueIterator(k, 0, getNumRows());
+		return new RLEValueIterator(k, 0, _numRows);
 	}
 
 	@Override
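
For reference, the RLE physical layout that the loops above decode: each value's bitmap is a sequence of (start-offset, run-length) char pairs, with start offsets relative to the previous run's end. A decoding sketch mirroring the getCounts traversal (illustrative only):

	// Decoding sketch: expand value k's runs into absolute row indexes.
	List<Integer> rows = new ArrayList<>();
	int boff = _ptr[k], blen = len(k);
	int curRunEnd = 0;
	for(int bix = 0; bix < blen; bix += 2) {
		int start = curRunEnd + _data[boff + bix]; // offset from previous run end
		curRunEnd = start + _data[boff + bix + 1]; // exclusive end of this run
		for(int rix = start; rix < curRunEnd; rix++)
			rows.add(rix); // rows where tuple k occurs
	}
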
diff --git a/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupSizes.java b/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupSizes.java
index 5b9c9cc..709975f 100644
--- a/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupSizes.java
+++ b/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupSizes.java
@@ -21,7 +21,6 @@
 
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
-import org.apache.sysds.runtime.DMLCompressionException;
 import org.apache.sysds.runtime.compress.CompressionSettings;
 import org.apache.sysds.runtime.matrix.data.MatrixBlock;
 import org.apache.sysds.utils.MemoryEstimates;
@@ -52,6 +51,9 @@
 
 	public static long estimateInMemorySizeDDC(int nrCols, int uniqueVals, boolean lossy) {
 		long size = estimateInMemorySizeGroupValue(nrCols, uniqueVals, lossy);
 		return size;
 	}
 
@@ -112,13 +114,4 @@
 		return size;
 	}
 
-	public static long estimateInMemorySizeQuan(int nrRows, int nrColumns){
-		long size = estimateInMemorySizeGroup(nrColumns);
-		if(nrRows < 0 || nrColumns < 0){
-			throw new DMLCompressionException("Invalid number of rows and columns");
-		}
-		size += 8; // scale value
-		size += MemoryEstimates.byteArrayCost(nrRows*nrColumns);
-		return size;
-	}
 }
\ No newline at end of file
diff --git a/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupUncompressed.java b/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupUncompressed.java
index fb9ca41..db6d4d0 100644
--- a/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupUncompressed.java
+++ b/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupUncompressed.java
@@ -54,7 +54,7 @@
 	 */
 	private MatrixBlock _data;
 
-	public ColGroupUncompressed() {
+	protected ColGroupUncompressed() {
 		super();
 	}
 
@@ -71,7 +71,7 @@
 	 * @param compSettings   The Settings for how to compress this block, Here using information about the raw block if
 	 *                       it is transposed.
 	 */
-	public ColGroupUncompressed(int[] colIndicesList, MatrixBlock rawBlock, CompressionSettings compSettings) {
+	protected ColGroupUncompressed(int[] colIndicesList, MatrixBlock rawBlock, CompressionSettings compSettings) {
 		super(colIndicesList, compSettings.transposeInput ? rawBlock.getNumColumns() : rawBlock.getNumRows());
 
 		// prepare meta data
@@ -116,7 +116,7 @@
 	 * 
 	 * @param groupsToDecompress compressed columns to subsume. Must contain at least one element.
 	 */
-	public ColGroupUncompressed(List<ColGroup> groupsToDecompress) {
+	protected ColGroupUncompressed(List<ColGroup> groupsToDecompress) {
 		super(mergeColIndices(groupsToDecompress), groupsToDecompress.get(0)._numRows);
 
 		// Invert the list of column indices
@@ -141,7 +141,7 @@
 	 * @param numRows    number of rows in the column, for passing to the superclass
 	 * @param data       matrix block
 	 */
-	public ColGroupUncompressed(int[] colIndices, int numRows, MatrixBlock data) {
+	protected ColGroupUncompressed(int[] colIndices, int numRows, MatrixBlock data) {
 		super(colIndices, numRows);
 		_data = data;
 	}
@@ -254,6 +254,10 @@
 	}
 
 	@Override
+	public void rightMultByVector(MatrixBlock vector, double[] c, int rl, int ru) {
+		throw new NotImplementedException("Should not be called use other matrix function");
+	}
+
 	public void rightMultByVector(MatrixBlock vector, MatrixBlock result, int rl, int ru) {
 		// Pull out the relevant rows of the vector
 		int clen = _colIndexes.length;
@@ -266,22 +270,7 @@
 		shortVector.recomputeNonZeros();
 
 		// Multiply the selected columns by the appropriate parts of the vector
-		LibMatrixMult.matrixMult(_data, shortVector, result, rl, ru);
-	}
-
-	public void rightMultByVector(MatrixBlock vector, MatrixBlock result, int k) {
-		// Pull out the relevant rows of the vector
-		int clen = _colIndexes.length;
-
-		MatrixBlock shortVector = new MatrixBlock(clen, 1, false);
-		shortVector.allocateDenseBlock();
-		double[] b = shortVector.getDenseBlockValues();
-		for(int colIx = 0; colIx < clen; colIx++)
-			b[colIx] = vector.quickGetValue(_colIndexes[colIx], 0);
-		shortVector.recomputeNonZeros();
-
-		// Multiply the selected columns by the appropriate parts of the vector
-		LibMatrixMult.matrixMult(_data, shortVector, result, k);
+		LibMatrixMult.matrixMult(_data, shortVector, result, rl, ru);
 	}
 
 	@Override
@@ -324,11 +313,14 @@
 		return new ColGroupUncompressed(getColIndices(), _data.getNumRows(), retContent);
 	}
 
-	@Override
+	public void unaryAggregateOperations(AggregateUnaryOperator op, double[] ret) {
+		throw new NotImplementedException("Should not be called");
+	}
+
 	public void unaryAggregateOperations(AggregateUnaryOperator op, MatrixBlock ret) {
 		// execute unary aggregate operations
 		LibMatrixAgg.aggregateUnaryMatrix(_data, ret, op);
-
+		ret = ret.allocateBlock();
 		// shift result into correct column indexes
 		if(op.indexFn instanceof ReduceRow) {
 			// shift partial results, incl corrections
@@ -347,7 +339,7 @@
 	}
 
 	@Override
-	public void unaryAggregateOperations(AggregateUnaryOperator op, MatrixBlock result, int rl, int ru) {
+	public void unaryAggregateOperations(AggregateUnaryOperator op, double[] result, int rl, int ru) {
 		throw new NotImplementedException("Unimplemented Specific Sub ColGroup Aggregation Operation");
 	}
 
@@ -492,12 +484,6 @@
 	}
 
 	@Override
-	public int[] getCounts() {
-		throw new DMLCompressionException(
-			"Invalid function call, the counts in Uncompressed Col Group is always 1 for each value");
-	}
-
-	@Override
 	public double[] getValues() {
 		if(_data.isInSparseFormat()) {
 			return _data.getSparseBlock().values(0);
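
rightMultByVector above multiplies only the columns this group owns, so it first gathers the
matching vector entries into a short dense vector before handing off to LibMatrixMult. A minimal
sketch of that gather step, over a plain double[] for clarity (the helper is illustrative):

    // Gather the entries of a dense column vector that correspond to this
    // group's column indexes, as done before the matrix multiply above.
    static double[] sliceVector(double[] vector, int[] colIndexes) {
        double[] b = new double[colIndexes.length];
        for(int i = 0; i < colIndexes.length; i++)
            b[i] = vector[colIndexes[i]];
        return b;
    }
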
diff --git a/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupValue.java b/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupValue.java
index 06e205f..3b184a2 100644
--- a/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupValue.java
+++ b/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupValue.java
@@ -26,7 +26,7 @@
 
 import org.apache.sysds.runtime.DMLScriptException;
 import org.apache.sysds.runtime.compress.CompressionSettings;
-import org.apache.sysds.runtime.compress.utils.AbstractBitmap;
+import org.apache.sysds.runtime.compress.utils.ABitmap;
 import org.apache.sysds.runtime.compress.utils.Bitmap;
 import org.apache.sysds.runtime.compress.utils.BitmapLossy;
 import org.apache.sysds.runtime.functionobjects.Builtin;
@@ -37,20 +37,20 @@
 import org.apache.sysds.runtime.functionobjects.ReduceAll;
 import org.apache.sysds.runtime.functionobjects.ReduceCol;
 import org.apache.sysds.runtime.functionobjects.ReduceRow;
+import org.apache.sysds.runtime.instructions.cp.KahanObject;
 import org.apache.sysds.runtime.matrix.data.MatrixBlock;
 import org.apache.sysds.runtime.matrix.data.Pair;
 import org.apache.sysds.runtime.matrix.operators.AggregateUnaryOperator;
 import org.apache.sysds.runtime.matrix.operators.ScalarOperator;
-import org.apache.sysds.utils.MemoryEstimates;
 
 /**
- * Base class for column groups encoded with value dictionary.
+ * Base class for column groups encoded with a value dictionary. This includes column groups such as DDC, OLE, and RLE.
  * 
  */
 public abstract class ColGroupValue extends ColGroup {
 	private static final long serialVersionUID = 3786247536054353658L;
 
-	// thread-local pairs of reusable temporary vectors for positions and values
+	/** thread-local pairs of reusable temporary vectors for positions and values */
 	private static ThreadLocal<Pair<int[], double[]>> memPool = new ThreadLocal<Pair<int[], double[]>>() {
 		@Override
 		protected Pair<int[], double[]> initialValue() {
@@ -58,22 +58,23 @@
 		}
 	};
 
-	/** Distinct values associated with individual bitmaps. */
-	protected IDictionary _dict;
+	/** Distinct value tuples associated with individual bitmaps. */
+	protected ADictionary _dict;
 
-	public ColGroupValue() {
+	protected ColGroupValue() {
 		super();
 	}
 
 	/**
-	 * Stores the headers for the individual bitmaps.
+	 * Main constructor for ColGroupValue. Extracts and stores the dictionary used by the different types of
+	 * ColGroup.
 	 * 
 	 * @param colIndices indices (within the block) of the columns included in this column
 	 * @param numRows    total number of rows in the parent block
 	 * @param ubm        Uncompressed bitmap representation of the block
 	 * @param cs         The Compression settings used for compression
 	 */
-	public ColGroupValue(int[] colIndices, int numRows, AbstractBitmap ubm, CompressionSettings cs) {
+	protected ColGroupValue(int[] colIndices, int numRows, ABitmap ubm, CompressionSettings cs) {
 		super(colIndices, numRows);
 		_lossy = false;
 		_zeros = ubm.containsZero();
@@ -91,26 +92,11 @@
 				_lossy = true;
 				break;
 		}
-		// extract and store distinct values (bitmaps handled by subclasses)
-		// _dict = new Dictionary(ubm.getValues());
 	}
 
-	/**
-	 * Constructor for subclass methods that need to create shallow copies
-	 * 
-	 * @param colIndices raw column index information
-	 * @param numRows    number of rows in the block
-	 * @param values     set of distinct values for the block (associated bitmaps are kept in the subclass)
-	 */
-	protected ColGroupValue(int[] colIndices, int numRows, double[] values) {
+	protected ColGroupValue(int[] colIndices, int numRows, ADictionary dict) {
 		super(colIndices, numRows);
-		_dict = new Dictionary(values);
-	}
-
-	public long getDictionarySize() {
-		// NOTE: this estimate needs to be consistent with the estimate above,
-		// so for now we use the (incorrect) double array size, not the dictionary size
-		return (_dict != null) ? MemoryEstimates.doubleArrayCost(_dict.getValues().length) : 0;
+		_dict = dict;
 	}
 
 	/**
@@ -122,26 +108,16 @@
 		return _dict.getNumberOfValues(_colIndexes.length);
 	}
 
+	@Override
 	public double[] getValues() {
 		return _dict.getValues();
 	}
 
-	public void setValues(double[] values) {
-		_dict = new Dictionary(values);
-	}
-
-	public double getValue(int k, int col) {
-		return _dict.getValues()[k * getNumCols() + col];
-	}
-
-	public void setDictionary(Dictionary dict) {
-		_dict = dict;
-	}
-
 	@Override
 	public MatrixBlock getValuesAsBlock() {
 		final double[] values = getValues();
 		int vlen = values.length;
+
 		int rlen = _zeros ? vlen + 1 : vlen;
 		MatrixBlock ret = new MatrixBlock(rlen, 1, false);
 		for(int i = 0; i < vlen; i++)
@@ -149,21 +125,45 @@
 		return ret;
 	}
 
+	/**
+	 * Returns the counts of the values inside the MatrixBlock returned by getValuesAsBlock. Throws an exception if
+	 * getIfCountsType returns false.
+	 * 
+	 * The returned counts always include the count of zeros if any are contained, even if the zeros are not
+	 * materialized.
+	 *
+	 * @return the count of each value in the MatrixBlock.
+	 */
 	public final int[] getCounts() {
-		int[] tmp = new int[getNumValues()];
-		tmp = getCounts(tmp);
-		if(_zeros && this instanceof ColGroupOffset) {
-			tmp = Arrays.copyOf(tmp, tmp.length + 1);
-			int sum = Arrays.stream(tmp).sum();
-			tmp[tmp.length - 1] = getNumRows() - sum;
+		int[] tmp;
+		if(_zeros) {
+			tmp = allocIVector(getNumValues() + 1, true);
 		}
-		return tmp;
+		else {
+			tmp = allocIVector(getNumValues(), true);
+		}
+		return getCounts(tmp);
 	}
 
-	public abstract int[] getCounts(int[] out);
-
+	/**
+	 * Returns the counts of the values inside the MatrixBlock returned by getValuesAsBlock. Throws an exception if
+	 * getIfCountsType returns false.
+	 * 
+	 * The returned counts always include the count of zeros if any are contained, even if the zeros are not
+	 * materialized.
+	 *
+	 * @param rl the lower index of the interval of rows queried
+	 * @param ru the upper boundary of the interval of rows queried
+	 * @return the count of each value in the MatrixBlock.
+	 */
 	public final int[] getCounts(int rl, int ru) {
-		int[] tmp = new int[getNumValues()];
+		int[] tmp;
+		if(_zeros) {
+			tmp = allocIVector(getNumValues() + 1, true);
+		}
+		else {
+			tmp = allocIVector(getNumValues(), true);
+		}
 		return getCounts(rl, ru, tmp);
 	}
 
@@ -171,83 +171,13 @@
 		return true;
 	}
 
-	public abstract int[] getCounts(int rl, int ru, int[] out);
-
-	public MatrixBlock getCountsAsBlock() {
-		return getCountsAsBlock(getCounts());
-	}
-
-	public static MatrixBlock getCountsAsBlock(int[] counts) {
-		MatrixBlock ret = new MatrixBlock(counts.length, 1, false);
-		for(int i = 0; i < counts.length; i++)
-			ret.quickSetValue(i, 0, counts[i]);
-		return ret;
-	}
-
 	protected int containsAllZeroValue() {
 		return _dict.hasZeroTuple(_colIndexes.length);
 	}
 
-	// protected final double[] sumAllValues(KahanFunction kplus, KahanObject kbuff) {
-	// return sumAllValues(kplus, kbuff, true);
-	// }
-
-	// protected final double[] sumAllValues(KahanFunction kplus, KahanObject kbuff, boolean allocNew) {
-	// // quick path: sum
-	// if(getNumCols() > 1 && _dict instanceof QDictionary && kplus instanceof KahanPlus){
-	// return sumAllValuesQToDouble();
-	// }
-	// else if(getNumCols() == 1 && kplus instanceof KahanPlus)
-	// return _dict.getValues(); // shallow copy of values
-
-	// // pre-aggregate value tuple
-	// final int numVals = getNumValues();
-	// double[] ret = allocNew ? new double[numVals] : allocDVector(numVals, false);
-	// for(int k = 0; k < numVals; k++)
-	// ret[k] = sumValues(k, kplus, kbuff);
-
-	// return ret;
-	// }
-
-	// /**
-	// * Method for summing all value tuples in the dictionary.
-	// *
-	// * This method assumes two things
-	// *
-	// * 1. That you dont call it if the number of columns in this ColGroup is 1. (then use
-	// ((QDictionary)_dict)._values)
-	// * 2. That it is not used for anything else than KahnPlus.
-	// * @return an short array of the sum of each row in the quantized array.
-	// */
-	// protected final short[] sumAllValuesQ(){
-	// final byte[] values = ((QDictionary)_dict)._values;
-	// short[] res = new short[getNumValues()];
-
-	// for(int i = 0, off = 0; off< values.length; i++, off += _colIndexes.length){
-	// for( int j = 0 ; j < _colIndexes.length; j++){
-	// res[i] += values[off + j];
-	// }
-	// }
-	// return res;
-	// }
-
-	// protected static final double[] sumAllValuesQToDouble(QDictionary dict, int nrCol){
-	// final byte[] values = dict._values;
-	// double[] res = new double[dict.getNumberOfValues()];
-
-	// for(int i = 0, off = 0; off< values.length; i++, off += _colIndexes.length){
-	// for( int j = 0 ; j < _colIndexes.length; j++){
-	// res[i] += values[off + j];
-	// }
-	// res[i] = res[i] * dict._scale;
-	// }
-	// return res;
-	// }
-
-	protected final double sumValues(int valIx, double[] b) {
+	protected final double sumValues(int valIx, double[] b, double[] values) {
 		final int numCols = getNumCols();
 		final int valOff = valIx * numCols;
-		final double[] values = _dict.getValues();
 		double val = 0;
 		for(int i = 0; i < numCols; i++)
 			val += values[valOff + i] * b[i];
@@ -260,97 +190,89 @@
 
 	protected final double[] preaggValues(int numVals, double[] b, boolean allocNew) {
 		double[] ret = allocNew ? new double[numVals] : allocDVector(numVals, false);
+		final double[] values = _dict.getValues();
 		for(int k = 0; k < numVals; k++)
-			ret[k] = sumValues(k, b);
+			ret[k] = sumValues(k, b, values);
 
 		return ret;
 	}
 
 	/**
+	 * Compute the max (or an equivalent builtin aggregate) over all values.
+	 * 
 	 * NOTE: Shared across OLE/RLE/DDC because value-only computation.
 	 * 
-	 * @param result  output matrix block
+	 * @param c       output array to aggregate into
 	 * @param builtin function object
 	 */
-	protected void computeMxx(MatrixBlock result, Builtin builtin) {
-		// init and 0-value handling
-		double val = (builtin
-			.getBuiltinCode() == BuiltinCode.MAX) ? Double.NEGATIVE_INFINITY : Double.POSITIVE_INFINITY;
-		if(_zeros)
-			val = builtin.execute(val, 0);
-
-		// iterate over all values only
-		val = _dict.aggregate(val, builtin);
-
-		// compute new partial aggregate
-		val = builtin.execute(val, result.quickGetValue(0, 0));
-		result.quickSetValue(0, 0, val);
-	}
-
-	/**
-	 * NOTE: Shared across OLE/RLE/DDC because value-only computation.
-	 * 
-	 * @param result  output matrix block
-	 * @param builtin function object
-	 */
-	protected void computeColMxx(MatrixBlock result, Builtin builtin) {
-		final int numCols = getNumCols();
-
-		// init and 0-value handling
-		double[] vals = new double[numCols];
-
-		// TODO fix edge cases in colMax. Since currently we rely on looking at rows in dict to specify if we start with
-		// zeros or not
-		if(!_zeros && _dict.getValuesLength() / numCols == getNumRows()) {
-			Arrays.fill(vals,
-				(builtin.getBuiltinCode() == BuiltinCode.MAX) ? Double.NEGATIVE_INFINITY : Double.POSITIVE_INFINITY);
+	protected void computeMxx(double[] c, Builtin builtin) {
+		if(_zeros) {
+			c[0] = builtin.execute(c[0], 0);
 		}
-
-		// iterate over all values only
-		vals = _dict.aggregateCols(vals, builtin, _colIndexes);
-		// copy results to output
-		for(int j = 0; j < numCols; j++)
-			result.quickSetValue(0, _colIndexes[j], vals[j]);
+		c[0] = _dict.aggregate(c[0], builtin);
 	}
 
 	/**
-	 * Method for use by subclasses. Applies a scalar operation to the value metadata stored in the superclass.
+	 * Compute the column-wise max (or an equivalent builtin aggregate).
+	 * 
+	 * NOTE: Shared across OLE/RLE/DDC because the computation is value-only.
+	 * 
+	 * @param c       output array to aggregate into
+	 * @param builtin function object
+	 */
+	protected void computeColMxx(double[] c, Builtin builtin) {
+		if(_zeros) {
+			for(int x = 0; x < _colIndexes.length; x++) {
+				c[_colIndexes[x]] = builtin.execute(c[_colIndexes[x]], 0);
+			}
+		}
+		_dict.aggregateCols(c, builtin, _colIndexes);
+	}
+
+	/**
+	 * Method for use by subclasses. Applies a scalar operation to the value metadata stored in the dictionary.
 	 * 
 	 * @param op scalar operation to perform
 	 * @return transformed copy of value metadata for this column group
 	 */
-	protected double[] applyScalarOp(ScalarOperator op) {
-		return _dict.clone().apply(op).getValues();
+	protected ADictionary applyScalarOp(ScalarOperator op) {
+		return _dict.clone().apply(op);
 	}
 
-	protected double[] applyScalarOp(ScalarOperator op, double newVal, int numCols) {
-		double[] values = _dict.getValues(); // allocate new array just once
-		Dictionary tmp = new Dictionary(Arrays.copyOf(values, values.length + numCols));
-		double[] ret = tmp.apply(op).getValues();
-
-		// add new value to the end
-		Arrays.fill(ret, values.length, values.length + numCols, newVal);
-		return ret;
+	/**
+	 * Method for use by subclasses. Applies a scalar operation to the value metadata stored in the dictionary. This
+	 * specific method is used in cases where a new entry is to be added to the dictionary.
+	 * 
+	 * This method should only be called if newVal is not 0, and newVal should already have the operator applied.
+	 * 
+	 * @param op      The Operator to apply to the underlying data.
+	 * @param newVal  The new value to append to the underlying data.
+	 * @param numCols The number of columns in the ColGroup, to specify how many copies of the newVal should be
+	 *                appended.
+	 * @return The new Dictionary containing the values.
+	 */
+	protected ADictionary applyScalarOp(ScalarOperator op, double newVal, int numCols) {
+		return _dict.applyScalarOp(op, newVal, numCols);
 	}
 
 	@Override
-	public void unaryAggregateOperations(AggregateUnaryOperator op, MatrixBlock result) {
-		unaryAggregateOperations(op, result, 0, getNumRows());
+	public void unaryAggregateOperations(AggregateUnaryOperator op, double[] c) {
+		unaryAggregateOperations(op, c, 0, _numRows);
 	}
 
 	@Override
-	public void unaryAggregateOperations(AggregateUnaryOperator op, MatrixBlock result, int rl, int ru) {
+	public void unaryAggregateOperations(AggregateUnaryOperator op, double[] c, int rl, int ru) {
 		// sum and sumsq (reduceall/reducerow over tuples and counts)
 		if(op.aggOp.increOp.fn instanceof KahanPlus || op.aggOp.increOp.fn instanceof KahanPlusSq) {
 			KahanFunction kplus = (op.aggOp.increOp.fn instanceof KahanPlus) ? KahanPlus
 				.getKahanPlusFnObject() : KahanPlusSq.getKahanPlusSqFnObject();
 
 			if(op.indexFn instanceof ReduceAll)
-				computeSum(result, kplus);
+				computeSum(c, kplus);
 			else if(op.indexFn instanceof ReduceCol)
-				computeRowSums(result, kplus, rl, ru);
+				computeRowSums(c, kplus, rl, ru);
 			else if(op.indexFn instanceof ReduceRow)
-				computeColSums(result, kplus);
+				computeColSums(c, kplus);
 		}
 		// min and max (reduceall/reducerow over tuples only)
 		else if(op.aggOp.increOp.fn instanceof Builtin &&
@@ -359,26 +281,23 @@
 			Builtin builtin = (Builtin) op.aggOp.increOp.fn;
 
 			if(op.indexFn instanceof ReduceAll)
-				computeMxx(result, builtin);
+				computeMxx(c, builtin);
 			else if(op.indexFn instanceof ReduceCol)
-				computeRowMxx(result, builtin, rl, ru);
+				computeRowMxx(c, builtin, rl, ru);
 			else if(op.indexFn instanceof ReduceRow)
-				computeColMxx(result, builtin);
+				computeColMxx(c, builtin);
 		}
 		else {
 			throw new DMLScriptException("Unknown UnaryAggregate operator on CompressedMatrixBlock");
 		}
 	}
 
-	protected abstract void computeSum(MatrixBlock result, KahanFunction kplus);
-
-	protected abstract void computeRowSums(MatrixBlock result, KahanFunction kplus, int rl, int ru);
-
-	protected abstract void computeColSums(MatrixBlock result, KahanFunction kplus);
-
-	protected abstract void computeRowMxx(MatrixBlock result, Builtin builtin, int rl, int ru);
-
-	// dynamic memory management
+	protected void setandExecute(double[] c, KahanObject kbuff, KahanPlus kplus2, double val, int rix) {
+		kbuff.set(c[rix], c[rix + 1]);
+		kplus2.execute2(kbuff, val);
+		c[rix] = kbuff._sum;
+		c[rix + 1] = kbuff._correction;
+	}
 
 	public static void setupThreadLocalMemory(int len) {
 		Pair<int[], double[]> p = new Pair<>();
@@ -447,7 +366,7 @@
 		for(int i = 0; i < numCols; i++)
 			_colIndexes[i] = in.readInt();
 
-		_dict = IDictionary.read(in, _lossy);
+		_dict = ADictionary.read(in, _lossy);
 
 	}
 
@@ -472,7 +391,7 @@
 		long ret = 0; // header
 		ret += 4; // num rows int
 		ret += 4; // num cols int
-		ret += 1; // Zeros boolean
+		ret += 1; // zeros boolean
 		ret += 1; // lossy boolean
 		// col indices
 		ret += 4 * _colIndexes.length;
@@ -481,4 +400,16 @@
 		return ret;
 	}
 
+	public abstract int[] getCounts(int[] out);
+
+	public abstract int[] getCounts(int rl, int ru, int[] out);
+
+	protected abstract void computeSum(double[] c, KahanFunction kplus);
+
+	protected abstract void computeRowSums(double[] c, KahanFunction kplus, int rl, int ru);
+
+	protected abstract void computeColSums(double[] c, KahanFunction kplus);
+
+	protected abstract void computeRowMxx(double[] c, Builtin builtin, int rl, int ru);
+
 }
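
The abstract computeSum/computeRowSums/computeColSums methods declared above all exploit the same
property: an aggregate over a value-encoded group needs only the dictionary tuples and their
per-tuple counts, never the individual rows. A minimal sketch for a single-column group (a
standalone method under that assumption, not the class API):

    // Sum a single-column value group from its dictionary and counts, the
    // idea behind computeSum in the OLE/RLE/DDC subclasses. A zero tuple
    // contributes nothing, so an extra trailing zero count is harmless.
    static double sumFromDictionary(double[] dictValues, int[] counts) {
        double sum = 0;
        for(int k = 0; k < dictValues.length; k++)
            sum += dictValues[k] * counts[k];
        return sum;
    }
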
diff --git a/src/main/java/org/apache/sysds/runtime/compress/colgroup/Dictionary.java b/src/main/java/org/apache/sysds/runtime/compress/colgroup/Dictionary.java
index c6a2e53..7d43c97 100644
--- a/src/main/java/org/apache/sysds/runtime/compress/colgroup/Dictionary.java
+++ b/src/main/java/org/apache/sysds/runtime/compress/colgroup/Dictionary.java
@@ -36,38 +36,36 @@
  * group. The primary reason for its introduction was to provide an entry point for specialization such as shared
  * dictionaries, which require additional information.
  */
-public class Dictionary extends IDictionary {
+public class Dictionary extends ADictionary {
 
-	// Linearized row major.
-	// v11 v12
-	// v21 v22
-	// ||
-	// \/
-	// v11 v12 v21 v22
-	protected final double[] _values;
+	private final double[] _values;
 
 	public Dictionary(double[] values) {
 		_values = values;
 	}
 
+	@Override
 	public double[] getValues() {
 		return _values;
 	}
 
+	@Override
 	public double getValue(int i) {
 		return _values[i];
 	}
 
+	@Override
 	public long getInMemorySize() {
 		// object + values array + double
 		return getInMemorySize(_values.length);
 	}
 
-	public static long getInMemorySize(int valuesCount) {
+	protected static long getInMemorySize(int valuesCount) {
 		// object + values array
 		return 16 + MemoryEstimates.doubleArrayCost(valuesCount);
 	}
 
+	@Override
 	public int hasZeroTuple(int ncol) {
 		int len = _values.length / ncol;
 		for(int i = 0, off = 0; i < len; i++, off += ncol) {
@@ -80,6 +78,7 @@
 		return -1;
 	}
 
+	@Override
 	public double aggregate(double init, Builtin fn) {
 		// full aggregate can disregard tuple boundaries
 		int len = _values.length;
@@ -89,16 +88,28 @@
 		return ret;
 	}
 
-	public IDictionary apply(ScalarOperator op) {
+	@Override
+	public Dictionary apply(ScalarOperator op) {
 		// in-place modification of the dictionary
 		int len = _values.length;
 		for(int i = 0; i < len; i++)
 			_values[i] = op.executeScalar(_values[i]);
-		return this; // fluent API
+		return this;
 	}
 
 	@Override
-	public IDictionary clone() {
+	public Dictionary applyScalarOp(ScalarOperator op, double newVal, int numCols) {
+		// allocate new array just once because we need to add the newVal.
+		double[] values = Arrays.copyOf(_values, _values.length + numCols);
+		for(int i = 0; i < _values.length; i++) {
+			values[i] = op.executeScalar(values[i]);
+		}
+		Arrays.fill(values, _values.length, _values.length + numCols, newVal);
+		return new Dictionary(values);
+	}
+
+	@Override
+	public Dictionary clone() {
 		return new Dictionary(_values.clone());
 	}
 
@@ -128,22 +139,19 @@
 		return 4 + 8 * _values.length;
 	}
 
-	public static Dictionary materializeZeroValueFull(Dictionary OldDictionary, int numCols) {
-		return new Dictionary(Arrays.copyOf(OldDictionary._values, OldDictionary._values.length + numCols));
-	}
-
+	@Override
 	public int getNumberOfValues(int ncol) {
 		return _values.length / ncol;
 	}
 
 	@Override
-	protected double[] sumAllRowsToDouble(KahanFunction kplus, KahanObject kbuff, int nrColumns, boolean allocNew) {
+	protected double[] sumAllRowsToDouble(KahanFunction kplus, KahanObject kbuff, int nrColumns) {
 		if(nrColumns == 1 && kplus instanceof KahanPlus)
 			return getValues(); // shallow copy of values
 
 		// pre-aggregate value tuple
 		final int numVals = _values.length / nrColumns;
-		double[] ret = allocNew ? new double[numVals] : ColGroupValue.allocDVector(numVals, false);
+		double[] ret = ColGroupValue.allocDVector(numVals, false);
 		for(int k = 0; k < numVals; k++) {
 			ret[k] = sumRow(k, kplus, kbuff, nrColumns);
 		}
@@ -160,4 +168,31 @@
 		return kbuff._sum;
 	}
 
+	@Override
+	protected void colSum(double[] c, int[] counts, int[] colIndexes, KahanFunction kplus) {
+		KahanObject kbuff = new KahanObject(0, 0);
+		for(int k = 0, valOff = 0; valOff < _values.length; k++, valOff += colIndexes.length) { // one iteration per value tuple
+			int cntk = counts[k];
+			for(int j = 0; j < colIndexes.length; j++) {
+				kbuff.set(c[colIndexes[j]], c[colIndexes[j] + colIndexes.length]);
+				kplus.execute3(kbuff, getValue(valOff + j), cntk);
+				c[colIndexes[j]] = kbuff._sum;
+				c[colIndexes[j] + colIndexes.length] = kbuff._correction;
+			}
+		}
+	}
+
+	@Override
+	protected double sum(int[] counts, int ncol, KahanFunction kplus) {
+		KahanObject kbuff = new KahanObject(0, 0);
+		for(int k = 0, valOff = 0; valOff < _values.length; k++, valOff += ncol) { // one iteration per value tuple
+			int cntk = counts[k];
+			for(int j = 0; j < ncol; j++) {
+				kplus.execute3(kbuff, getValue(valOff + j), cntk);
+			}
+		}
+		return kbuff._sum;
+	}
 }
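
Both sum and colSum above delegate the compensated accumulation to kplus.execute3, which adds a
value scaled by its count while tracking the lost low-order bits. A self-contained sketch of that
compensated update, independent of the KahanObject API (a two-element array stands in for the
sum/correction pair):

    // Kahan-compensated accumulation: add val * count to sumAndCorr[0]
    // while keeping the rounding error in sumAndCorr[1], mirroring the
    // behavior assumed of kplus.execute3(kbuff, val, count).
    static void kahanAdd(double[] sumAndCorr, double val, int count) {
        double toAdd = val * count;
        double y = toAdd - sumAndCorr[1];        // apply the running correction
        double t = sumAndCorr[0] + y;            // new (rounded) sum
        sumAndCorr[1] = (t - sumAndCorr[0]) - y; // recover the rounding error
        sumAndCorr[0] = t;
    }
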
diff --git a/src/main/java/org/apache/sysds/runtime/compress/colgroup/DictionaryShared.java b/src/main/java/org/apache/sysds/runtime/compress/colgroup/DictionaryShared.java
deleted file mode 100644
index 62a6a18..0000000
--- a/src/main/java/org/apache/sysds/runtime/compress/colgroup/DictionaryShared.java
+++ /dev/null
@@ -1,79 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package org.apache.sysds.runtime.compress.colgroup;
-
-import org.apache.commons.lang3.ArrayUtils;
-import org.apache.sysds.runtime.functionobjects.Builtin;
-import org.apache.sysds.runtime.functionobjects.Builtin.BuiltinCode;
-import org.apache.sysds.utils.MemoryEstimates;
-
-/**
- * This dictionary class aims to encapsulate the storage and operations over
- * unique floating point values of a column group. The primary reason for its
- * introduction was to provide an entry point for specialization such as shared
- * dictionaries, which require additional information.
- */
-public class DictionaryShared extends Dictionary {
-	protected final int[] _colIndexes;
-	// linearized <min/max> <min/max> of 
-	// column groups that share the dictionary
-	protected final double[] _extrema;
-	
-	public DictionaryShared(double[] values, int[] colIndexes, double[] extrema) {
-		super(values);
-		_colIndexes = colIndexes;
-		_extrema = extrema;
-	}
-	
-	@Override
-	public long getInMemorySize() {
-		return super.getInMemorySize()
-			+ MemoryEstimates.intArrayCost(_colIndexes.length)
-			+ MemoryEstimates.doubleArrayCost(_extrema.length);
-	}
-	
-	@Override
-	public double aggregate(double init, Builtin fn) {
-		//full aggregate directly over extreme values
-		int len = _extrema.length;
-		int off = fn.getBuiltinCode() == BuiltinCode.MIN ? 0 : 1;
-		double ret = init;
-		for(int i = off; i < len; i+=2)
-			ret = fn.execute(ret, _extrema[i]);
-		return ret;
-	}
-	
-	public double[] aggregateCols(double[] init, Builtin fn, int[] cols) {
-		int ncol = cols.length;
-		double[] ret = init;
-		int off = fn.getBuiltinCode() == BuiltinCode.MIN ? 0 : 1;
-		for(int i=0; i<ncol; i++) {
-			int pos = ArrayUtils.indexOf(_colIndexes, cols[i]);
-			ret[i] = fn.execute(ret[i], _extrema[2*pos+off]);
-		}
-		return ret;
-	}
-	
-	@Override
-	public DictionaryShared clone() {
-		return new DictionaryShared(
-			getValues().clone(), _colIndexes.clone(), _extrema.clone());
-	}
-}
diff --git a/src/main/java/org/apache/sysds/runtime/compress/colgroup/IDictionary.java b/src/main/java/org/apache/sysds/runtime/compress/colgroup/IDictionary.java
deleted file mode 100644
index 72e577b..0000000
--- a/src/main/java/org/apache/sysds/runtime/compress/colgroup/IDictionary.java
+++ /dev/null
@@ -1,90 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package org.apache.sysds.runtime.compress.colgroup;
-
-import java.io.DataInput;
-import java.io.DataOutput;
-import java.io.IOException;
-
-import org.apache.sysds.runtime.functionobjects.Builtin;
-import org.apache.sysds.runtime.functionobjects.KahanFunction;
-import org.apache.sysds.runtime.instructions.cp.KahanObject;
-import org.apache.sysds.runtime.matrix.operators.ScalarOperator;
-
-
-/**
- * This dictionary class aims to encapsulate the storage and operations over unique floating point values of a column
- * group. The primary reason for its introduction was to provide an entry point for specialization such as shared
- * dictionaries, which require additional information.
- */
-public abstract class IDictionary {
-
-	public abstract double[] getValues();
-
-	public abstract double getValue(int i);
-
-	public abstract int hasZeroTuple(int ncol);
-
-	public abstract long getInMemorySize();
-
-	public abstract double aggregate(double init, Builtin fn);
-
-	public abstract int getValuesLength();
-
-	public abstract IDictionary apply(ScalarOperator op);
-
-	public abstract IDictionary clone();
-
-	public double[] aggregateCols(double[] init, Builtin fn, int[] cols) {
-		int ncol = cols.length;
-		int vlen = getValuesLength() / ncol;
-		double[] ret = init;
-		for(int k = 0; k < vlen; k++)
-			for(int j = 0, valOff = k * ncol; j < ncol; j++)
-				ret[j] = fn.execute(ret[j], getValue(valOff + j));
-		return ret;
-	}
-
-	public static IDictionary read(DataInput in, boolean lossy) throws IOException {
-		return lossy ? QDictionary.read(in) : Dictionary.read(in);
-	}
-
-	public abstract void write(DataOutput out) throws IOException;
-
-	public abstract long getExactSizeOnDisk();
-
-	/**
-	 * Get the number of values given that the column group has n columns
-	 * @param ncol The number of Columns in the ColumnGroup.
-	 */
-	public abstract int getNumberOfValues(int ncol);
-
-	public static IDictionary materializeZeroValue(IDictionary OldDictionary, int numCols){
-		if(OldDictionary instanceof QDictionary){
-			return QDictionary.materializeZeroValueLossy((QDictionary)OldDictionary, numCols);
-		} else{
-			return Dictionary.materializeZeroValueFull((Dictionary)OldDictionary, numCols);
-		}
-	}
-
-	protected abstract double[] sumAllRowsToDouble(KahanFunction kplus, KahanObject kbuff, int nrColumns,  boolean allocNew);
-
-	protected abstract double sumRow(int k, KahanFunction kplus, KahanObject kbuff, int nrColumns);
-}
diff --git a/src/main/java/org/apache/sysds/runtime/compress/colgroup/QDictionary.java b/src/main/java/org/apache/sysds/runtime/compress/colgroup/QDictionary.java
index 34bc934..9cccf11 100644
--- a/src/main/java/org/apache/sysds/runtime/compress/colgroup/QDictionary.java
+++ b/src/main/java/org/apache/sysds/runtime/compress/colgroup/QDictionary.java
@@ -30,7 +30,9 @@
 import org.apache.sysds.runtime.functionobjects.Builtin;
 import org.apache.sysds.runtime.functionobjects.KahanFunction;
 import org.apache.sysds.runtime.functionobjects.KahanPlus;
+import org.apache.sysds.runtime.functionobjects.KahanPlusSq;
 import org.apache.sysds.runtime.functionobjects.Multiply;
+import org.apache.sysds.runtime.functionobjects.Plus;
 import org.apache.sysds.runtime.instructions.cp.KahanObject;
 import org.apache.sysds.runtime.matrix.operators.ScalarOperator;
 import org.apache.sysds.utils.MemoryEstimates;
@@ -40,43 +42,49 @@
  * group. The primary reason for its introduction was to provide an entry point for specialization such as shared
  * dictionaries, which require additional information.
  */
-public class QDictionary extends IDictionary {
+public class QDictionary extends ADictionary {
 
 	protected static final Log LOG = LogFactory.getLog(QDictionary.class.getName());
-	protected final double _scale;
-	protected final byte[] _values;
+	protected double _scale;
+	protected byte[] _values;
 
 	public QDictionary(BitmapLossy bm) {
 		_values = bm.getValues();
 		_scale = bm.getScale();
 	}
 
-	public QDictionary(byte[] values, double scale) {
+	private QDictionary(byte[] values, double scale) {
 		_values = values;
 		_scale = scale;
 	}
 
+	@Override
 	public double[] getValues() {
-		LOG.warn("Decompressing Quantized Representation");
 		double[] res = new double[_values.length];
 		for(int i = 0; i < _values.length; i++) {
-			res[i] = _values[i] * _scale;
+			res[i] = getValue(i);
 		}
 		return res;
 	}
 
+	@Override
 	public double getValue(int i) {
-		return _values[i] * _scale;
+		return (i == _values.length) ? 0.0 : _values[i] * _scale;
 	}
 
 	public byte getValueByte(int i) {
 		return _values[i];
 	}
 
+	public byte[] getValuesByte() {
+		return _values;
+	}
+
 	public double getScale() {
 		return _scale;
 	}
 
+	@Override
 	public long getInMemorySize() {
 		// object + values array + double
 		return getInMemorySize(_values.length);
@@ -87,6 +95,7 @@
 		return 16 + MemoryEstimates.byteArrayCost(valuesCount) + 8;
 	}
 
+	@Override
 	public int hasZeroTuple(int ncol) {
 		int len = _values.length / ncol;
 		for(int i = 0, off = 0; i < len; i++, off += ncol) {
@@ -99,6 +108,7 @@
 		return -1;
 	}
 
+	@Override
 	public double aggregate(double init, Builtin fn) {
 		// full aggregate can disregard tuple boundaries
 		int len = _values.length;
@@ -108,28 +118,63 @@
 		return ret;
 	}
 
+	@Override
 	public QDictionary apply(ScalarOperator op) {
 
 		if(op.fn instanceof Multiply) {
-			return new QDictionary(_values, op.executeScalar(_scale));
+			_scale = op.executeScalar(_scale);
+			return this;
 		}
-		double[] temp = new double[_values.length];
-		double max = op.executeScalar((double) _values[0] * _scale);
-		temp[0] = max;
-		for(int i = 1; i < _values.length; i++) {
-			temp[i] = op.executeScalar((double) _values[i] * _scale);
+		else if(op.fn instanceof Plus) {
+			// TODO: find more operations where the largest and smallest inputs produce the largest or
+			// smallest output of the operation
+			double max = Math.max(Math.abs(op.executeScalar(-127 * _scale)), Math.abs(op.executeScalar(127 * _scale)));
+			double oldScale = _scale;
+			_scale = max / 127.0;
+
+			for(int i = 0; i < _values.length; i++) {
+				_values[i] = (byte) Math.round(op.executeScalar(_values[i] * oldScale) / _scale);
+			}
+		}
+		else {
+			double[] temp = new double[_values.length];
+			temp[0] = op.executeScalar(getValue(0));
+			double max = Math.abs(temp[0]); // track magnitude; the result may be negative
+			for(int i = 1; i < _values.length; i++) {
+				temp[i] = op.executeScalar(getValue(i));
+				double absTemp = Math.abs(temp[i]);
+				if(absTemp > max) {
+					max = absTemp;
+				}
+			}
+			_scale = max / (double) (Byte.MAX_VALUE);
+			for(int i = 0; i < _values.length; i++) {
+				_values[i] = (byte) Math.round(temp[i] / _scale);
+			}
+		}
+
+		return this;
+	}
+
+	@Override
+	public QDictionary applyScalarOp(ScalarOperator op, double newVal, int numCols) {
+		double[] temp = getValues();
+		double max = Math.abs(newVal); // track magnitude; newVal may be negative
+		for(int i = 0; i < _values.length; i++) {
+			temp[i] = op.executeScalar(temp[i]);
 			double absTemp = Math.abs(temp[i]);
 			if(absTemp > max) {
 				max = absTemp;
 			}
 		}
-		byte[] newValues = new byte[_values.length];
-		double newScale = max / (double) (Byte.MAX_VALUE);
+		double scale = max / (double) (Byte.MAX_VALUE);
+		byte[] res = new byte[_values.length + numCols];
 		for(int i = 0; i < _values.length; i++) {
-			newValues[i] = (byte) ((double) temp[i] / newScale);
+			res[i] = (byte) Math.round(temp[i] / scale);
 		}
-
-		return new QDictionary(newValues, newScale);
+		Arrays.fill(res, _values.length, _values.length + numCols, (byte) Math.round(newVal / scale));
+		return new QDictionary(res, scale);
 	}
 
 	@Override
@@ -138,7 +183,7 @@
 	}
 
 	@Override
-	public IDictionary clone() {
+	public QDictionary clone() {
 		return new QDictionary(_values.clone(), _scale);
 	}
 
@@ -162,35 +207,21 @@
 
 	@Override
 	public long getExactSizeOnDisk() {
-		return 8 + 4 + _values.length + 10000;
+		return 8 + 4 + _values.length;
 	}
 
-	public static QDictionary materializeZeroValueLossy(QDictionary OldDictionary, int numCols) {
-		return new QDictionary(Arrays.copyOf(OldDictionary._values, OldDictionary._values.length + numCols),
-			OldDictionary._scale);
-	}
-
+	@Override
 	public int getNumberOfValues(int nCol) {
 		return _values.length / nCol;
 	}
 
-	public short[] sumAllRowsToShort(int nCol) {
-		short[] res = new short[getNumberOfValues(nCol)];
-		for(int i = 0, off = 0; off < _values.length; i++, off += nCol) {
-			for(int j = 0; j < nCol; j++) {
-				res[i] += _values[off + j];
-			}
-		}
-		return res;
-	}
-
 	@Override
-	protected double[] sumAllRowsToDouble(KahanFunction kplus, KahanObject kbuff, int nrColumns, boolean allocNew) {
+	protected double[] sumAllRowsToDouble(KahanFunction kplus, KahanObject kbuff, int nrColumns) {
 		if(nrColumns == 1 && kplus instanceof KahanPlus)
 			return getValues(); // shallow copy of values
 
 		final int numVals = _values.length / nrColumns;
-		double[] ret = allocNew ? new double[numVals] : ColGroupValue.allocDVector(numVals, false);
+		double[] ret = ColGroupValue.allocDVector(numVals, false);
 		for(int k = 0; k < numVals; k++) {
 			ret[k] = sumRow(k, kplus, kbuff, nrColumns);
 		}
@@ -201,16 +232,70 @@
 	@Override
 	protected double sumRow(int k, KahanFunction kplus, KahanObject kbuff, int nrColumns) {
 		int valOff = k * nrColumns;
-		if(kplus instanceof KahanPlus){
+		if(kplus instanceof KahanPlus) {
 			short res = 0;
-			for (int i = 0; i < nrColumns; i++){
+			for(int i = 0; i < nrColumns; i++) {
 				res += _values[valOff + i];
 			}
 			return res * _scale;
-		} else{
+		}
+		else {
 			kbuff.set(0, 0);
 			for(int i = 0; i < nrColumns; i++)
-				kplus.execute2(kbuff, _values[valOff + i] *_scale);
+				kplus.execute2(kbuff, _values[valOff + i] * _scale);
+			return kbuff._sum;
+		}
+	}
+
+	@Override
+	protected void colSum(double[] c, int[] counts, int[] colIndexes, KahanFunction kplus) {
+
+		if(!(kplus instanceof KahanPlusSq)) {
+			int[] sum = new int[colIndexes.length];
+			for(int k = 0, valOff = 0; valOff < _values.length; k++, valOff += colIndexes.length) { // one iteration per value tuple
+				int cntk = counts[k];
+				for(int j = 0; j < colIndexes.length; j++) {
+					sum[j] += cntk * getValueByte(valOff + j);
+				}
+			}
+			for(int j = 0; j < colIndexes.length; j++) {
+				c[colIndexes[j]] = c[colIndexes[j]] + sum[j] * _scale;
+			}
+		}
+		else {
+			KahanObject kbuff = new KahanObject(0, 0);
+			for(int k = 0, valOff = 0; valOff < _values.length; k++, valOff += colIndexes.length) { // one iteration per value tuple
+				int cntk = counts[k];
+				for(int j = 0; j < colIndexes.length; j++) {
+					kbuff.set(c[colIndexes[j]], c[colIndexes[j] + colIndexes.length]);
+					kplus.execute3(kbuff, getValue(valOff + j), cntk);
+					c[colIndexes[j]] = kbuff._sum;
+					c[colIndexes[j] + colIndexes.length] = kbuff._correction;
+				}
+			}
+		}
+	}
+
+	@Override
+	protected double sum(int[] counts, int ncol, KahanFunction kplus) {
+		if(!(kplus instanceof KahanPlusSq)) {
+			int sum = 0;
+			for(int k = 0, valOff = 0; valOff < _values.length; k++, valOff += ncol) { // one iteration per value tuple
+				int cntk = counts[k];
+				for(int j = 0; j < ncol; j++) {
+					sum += cntk * getValueByte(valOff + j);
+				}
+			}
+			return sum * _scale;
+		}
+		else {
+			KahanObject kbuff = new KahanObject(0, 0);
+			for(int k = 0, valOff = 0; valOff < _values.length; k++, valOff += ncol) { // one iteration per value tuple
+				int cntk = counts[k];
+				for(int j = 0; j < ncol; j++) {
+					kplus.execute3(kbuff, getValue(valOff + j), cntk);
+				}
+			}
 			return kbuff._sum;
 		}
 	}
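
The scalar operators above all preserve the QDictionary invariant that a stored byte times the
shared scale reproduces the value: a multiply only touches the scale, while plus and the general
case requantize against the new maximum magnitude, now with Math.round instead of truncation. A
minimal standalone sketch of that quantization step:

    // Quantize doubles into the Q8 scheme assumed by QDictionary: bytes in
    // [-127, 127] plus one shared scale, with value ~= byte * scale.
    static byte[] quantize(double[] values, double[] scaleOut) {
        double max = 0;
        for(double v : values)
            max = Math.max(max, Math.abs(v));
        double scale = (max == 0) ? 1 : max / 127.0;     // 127 == Byte.MAX_VALUE
        byte[] q = new byte[values.length];
        for(int i = 0; i < values.length; i++)
            q[i] = (byte) Math.round(values[i] / scale); // rounding, as in the fix above
        scaleOut[0] = scale;
        return q;
    }
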
diff --git a/src/main/java/org/apache/sysds/runtime/compress/estim/CompressedSizeEstimator.java b/src/main/java/org/apache/sysds/runtime/compress/estim/CompressedSizeEstimator.java
index 509b340..f7675ae 100644
--- a/src/main/java/org/apache/sysds/runtime/compress/estim/CompressedSizeEstimator.java
+++ b/src/main/java/org/apache/sysds/runtime/compress/estim/CompressedSizeEstimator.java
@@ -32,7 +32,7 @@
 import org.apache.sysds.runtime.compress.CompressionSettings;
 import org.apache.sysds.runtime.compress.colgroup.ColGroup.CompressionType;
 import org.apache.sysds.runtime.compress.colgroup.ColGroupSizes;
-import org.apache.sysds.runtime.compress.utils.AbstractBitmap;
+import org.apache.sysds.runtime.compress.utils.ABitmap;
 import org.apache.sysds.runtime.matrix.data.MatrixBlock;
 import org.apache.sysds.runtime.util.CommonThreadPool;
 
@@ -145,17 +145,13 @@
 	 * @param ubm the UncompressedBitmap, either extracted from a sample or from the entire dataset
 	 * @return The size factors estimated from the Bit Map.
 	 */
-	public EstimationFactors estimateCompressedColGroupSize(AbstractBitmap ubm) {
+	public EstimationFactors estimateCompressedColGroupSize(ABitmap ubm) {
 		return EstimationFactors.computeSizeEstimationFactors(ubm,
 			_compSettings.validCompressions.contains(CompressionType.RLE),
 			_numRows,
 			ubm.getNumColumns());
 	}
 
-	// ------------------------------------------------
-	// PARALLEL CODE
-	// ------------------------------------------------
-
 	private CompressedSizeInfoColGroup[] CompressedSizeInfoColGroup(int clen) {
 		CompressedSizeInfoColGroup[] ret = new CompressedSizeInfoColGroup[clen];
 		for(int col = 0; col < clen; col++)
@@ -196,12 +192,6 @@
 		}
 	}
 
-	// ------------------------------------------------
-	// PARALLEL CODE END
-	// ------------------------------------------------
-
-	// UTIL
-
 	private int[] makeColIndexes() {
 		int[] colIndexes = new int[_numCols];
 		for(int i = 0; i < _numCols; i++) {
diff --git a/src/main/java/org/apache/sysds/runtime/compress/estim/CompressedSizeEstimatorExact.java b/src/main/java/org/apache/sysds/runtime/compress/estim/CompressedSizeEstimatorExact.java
index 3003936..00bc011 100644
--- a/src/main/java/org/apache/sysds/runtime/compress/estim/CompressedSizeEstimatorExact.java
+++ b/src/main/java/org/apache/sysds/runtime/compress/estim/CompressedSizeEstimatorExact.java
@@ -21,7 +21,7 @@
 
 import org.apache.sysds.runtime.compress.BitmapEncoder;
 import org.apache.sysds.runtime.compress.CompressionSettings;
-import org.apache.sysds.runtime.compress.utils.AbstractBitmap;
+import org.apache.sysds.runtime.compress.utils.ABitmap;
 import org.apache.sysds.runtime.matrix.data.MatrixBlock;
 
 /**
@@ -35,7 +35,7 @@
 
 	@Override
 	public CompressedSizeInfoColGroup estimateCompressedColGroupSize(int[] colIndexes) {
-		AbstractBitmap entireBitMap = BitmapEncoder.extractBitmap(colIndexes, _data, _compSettings);
+		ABitmap entireBitMap = BitmapEncoder.extractBitmap(colIndexes, _data, _compSettings);
 		return new CompressedSizeInfoColGroup(estimateCompressedColGroupSize(entireBitMap),
 			_compSettings.validCompressions);
 	}
diff --git a/src/main/java/org/apache/sysds/runtime/compress/estim/CompressedSizeEstimatorSample.java b/src/main/java/org/apache/sysds/runtime/compress/estim/CompressedSizeEstimatorSample.java
index adbf086..017abb1 100644
--- a/src/main/java/org/apache/sysds/runtime/compress/estim/CompressedSizeEstimatorSample.java
+++ b/src/main/java/org/apache/sysds/runtime/compress/estim/CompressedSizeEstimatorSample.java
@@ -24,8 +24,8 @@
 import org.apache.sysds.runtime.compress.BitmapEncoder;
 import org.apache.sysds.runtime.compress.CompressionSettings;
 import org.apache.sysds.runtime.compress.estim.sample.HassAndStokes;
-import org.apache.sysds.runtime.compress.utils.AbstractBitmap;
-import org.apache.sysds.runtime.compress.utils.AbstractBitmap.BitmapType;
+import org.apache.sysds.runtime.compress.utils.ABitmap;
+import org.apache.sysds.runtime.compress.utils.ABitmap.BitmapType;
 import org.apache.sysds.runtime.matrix.data.MatrixBlock;
 import org.apache.sysds.runtime.util.UtilFunctions;
 
@@ -63,7 +63,7 @@
 		int[] sampleRows = _sampleRows;
 
 		// extract statistics from sample
-		AbstractBitmap ubm = BitmapEncoder.extractBitmap(colIndexes, _data, _compSettings);
+		ABitmap ubm = BitmapEncoder.extractBitmap(colIndexes, _data, _compSettings);
 		EstimationFactors fact = EstimationFactors.computeSizeEstimationFactors(ubm, false, _numRows, numCols);
 
 		// estimate number of distinct values (incl fixes for anomalies w/ large sample fraction)
@@ -99,12 +99,12 @@
 		return new CompressedSizeInfoColGroup(totalFacts, _compSettings.validCompressions);
 	}
 
-	private static int getNumDistinctValues(AbstractBitmap ubm, int numRows, int[] sampleRows,
+	private static int getNumDistinctValues(ABitmap ubm, int numRows, int[] sampleRows,
 		HashMap<Integer, Double> solveCache) {
 		return HassAndStokes.haasAndStokes(ubm, numRows, sampleRows.length, solveCache);
 	}
 
-	private static int getNumRuns(AbstractBitmap ubm, int sampleSize, int totalNumRows, int[] sampleRows) {
+	private static int getNumRuns(ABitmap ubm, int sampleSize, int totalNumRows, int[] sampleRows) {
 		int numVals = ubm.getNumValues();
 		double numRuns = 0;
 		for(int vi = 0; vi < numVals; vi++) {
diff --git a/src/main/java/org/apache/sysds/runtime/compress/estim/CompressedSizeInfo.java b/src/main/java/org/apache/sysds/runtime/compress/estim/CompressedSizeInfo.java
index 62d3c5e..c920772 100644
--- a/src/main/java/org/apache/sysds/runtime/compress/estim/CompressedSizeInfo.java
+++ b/src/main/java/org/apache/sysds/runtime/compress/estim/CompressedSizeInfo.java
@@ -50,6 +50,7 @@
 
 	/**
 	 * Method for returning the calculated memory usage from this specific compression plan.
+	 * 
 	 * @return The in memory estimate as a long counting bytes.
 	 */
 	public long memoryEstimate() {
@@ -63,5 +64,4 @@
 		return est;
 	}
 
-	
 }
diff --git a/src/main/java/org/apache/sysds/runtime/compress/estim/CompressedSizeInfoColGroup.java b/src/main/java/org/apache/sysds/runtime/compress/estim/CompressedSizeInfoColGroup.java
index 2ba2f3e..cf89e8c 100644
--- a/src/main/java/org/apache/sysds/runtime/compress/estim/CompressedSizeInfoColGroup.java
+++ b/src/main/java/org/apache/sysds/runtime/compress/estim/CompressedSizeInfoColGroup.java
@@ -99,16 +99,10 @@
 		switch(ct) {
 			case DDC:
 				if(fact.numVals < 256) {
-					size = ColGroupSizes.estimateInMemorySizeDDC1(fact.numCols,
-						fact.numVals + (fact.containsZero ? 1 : 0),
-						fact.numRows,
-						fact.lossy);
+					size = ColGroupSizes.estimateInMemorySizeDDC1(fact.numCols, fact.numVals, fact.numRows, fact.lossy);
 				}
 				else {
-					size = ColGroupSizes.estimateInMemorySizeDDC2(fact.numCols,
-						fact.numVals + (fact.containsZero ? 1 : 0),
-						fact.numRows,
-						fact.lossy);
+					size = ColGroupSizes.estimateInMemorySizeDDC2(fact.numCols, fact.numVals, fact.numRows, fact.lossy);
 				}
 				break;
 			case RLE:
@@ -124,9 +118,6 @@
 					fact.numCols,
 					((double) fact.numVals / (fact.numRows * fact.numCols)));
 				break;
-			case QUAN:
-				size = ColGroupSizes.estimateInMemorySizeQuan(fact.numRows, fact.numCols);
-				break;
 			default:
 				throw new NotImplementedException("The col compression Type is not yet supported");
 		}
diff --git a/src/main/java/org/apache/sysds/runtime/compress/estim/EstimationFactors.java b/src/main/java/org/apache/sysds/runtime/compress/estim/EstimationFactors.java
index c5db40c..ef37d85 100644
--- a/src/main/java/org/apache/sysds/runtime/compress/estim/EstimationFactors.java
+++ b/src/main/java/org/apache/sysds/runtime/compress/estim/EstimationFactors.java
@@ -22,8 +22,8 @@
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
 import org.apache.sysds.runtime.compress.CompressionSettings;
-import org.apache.sysds.runtime.compress.utils.AbstractBitmap;
-import org.apache.sysds.runtime.compress.utils.AbstractBitmap.BitmapType;
+import org.apache.sysds.runtime.compress.utils.ABitmap;
+import org.apache.sysds.runtime.compress.utils.ABitmap.BitmapType;
 
 /**
  * Compressed Size Estimation factors. Contains meta information used to estimate the compression sizes of given columns
@@ -39,7 +39,7 @@
 	protected final int numVals; // Number of unique values in the compressed group
 	/** The number of offsets, to tuples of values in the column groups */
 	protected final int numOffs;
-	/** The Number of runs, of consecutive equal numbers, used primarily in RLE*/
+	/** The number of runs of consecutive equal values, used primarily in RLE */
 	protected final int numRuns;
 	/** The number of values in the collection that are not zero. Also referred to as singletons */
 	protected final int numSingle;
@@ -60,11 +60,11 @@
 		LOG.debug(this);
 	}
 
-	protected static EstimationFactors computeSizeEstimationFactors(AbstractBitmap ubm, boolean inclRLE, int numRows,
+	protected static EstimationFactors computeSizeEstimationFactors(ABitmap ubm, boolean inclRLE, int numRows,
 		int numCols) {
 		int numVals = ubm.getNumValues();
 		boolean containsZero = ubm.containsZero();
-		
+
 		int numRuns = 0;
 		int numOffs = 0;
 		int numSingle = 0;
@@ -79,7 +79,7 @@
 			if(inclRLE) {
 				int[] list = ubm.getOffsetsList(i).extractValues();
 				int lastOff = -2;
-				numRuns += list[listSize - 1] / (CompressionSettings.BITMAP_BLOCK_SZ- 1);
+				numRuns += list[listSize - 1] / (CompressionSettings.BITMAP_BLOCK_SZ - 1);
 				for(int j = 0; j < listSize; j++) {
 					if(list[j] != lastOff + 1) {
 						numRuns++;
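
The loop above estimates RLE runs by scanning each value's sorted offset list and opening a new
run whenever an offset is not adjacent to its predecessor. A minimal sketch of that core step,
ignoring the BITMAP_BLOCK_SZ boundary correction applied above:

    // Count maximal runs of consecutive row offsets in a sorted list,
    // the core of the numRuns estimation.
    static int countRuns(int[] sortedOffsets) {
        int runs = 0;
        int lastOff = -2;       // sentinel: no offset is adjacent to it
        for(int off : sortedOffsets) {
            if(off != lastOff + 1)
                runs++;         // gap found, a new run starts
            lastOff = off;
        }
        return runs;
    }
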
diff --git a/src/main/java/org/apache/sysds/runtime/compress/estim/sample/FrequencyCount.java b/src/main/java/org/apache/sysds/runtime/compress/estim/sample/FrequencyCount.java
index 3568683..3badbe3 100644
--- a/src/main/java/org/apache/sysds/runtime/compress/estim/sample/FrequencyCount.java
+++ b/src/main/java/org/apache/sysds/runtime/compress/estim/sample/FrequencyCount.java
@@ -19,7 +19,7 @@
 
 package org.apache.sysds.runtime.compress.estim.sample;
 
-import org.apache.sysds.runtime.compress.utils.AbstractBitmap;
+import org.apache.sysds.runtime.compress.utils.ABitmap;
 
 public class FrequencyCount {
 
@@ -30,7 +30,7 @@
 	 * @param ubm uncompressed bitmap
 	 * @return frequency counts
 	 */
-	protected static int[] get(AbstractBitmap ubm) {
+	protected static int[] get(ABitmap ubm) {
 		// determine max frequency
 		int numVals = ubm.getNumValues();
 		int maxCount = 0;
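
FrequencyCount.get turns the per-value offset counts of the bitmap into a frequency-of-frequencies
histogram, which HassAndStokes.haasAndStokes consumes next. A minimal sketch, assuming every
distinct value occurs at least once in the sample:

    // Build the frequency-of-frequencies histogram: entry f-1 holds how many
    // distinct values occur exactly f times (requires valueCounts[i] >= 1).
    static int[] frequencyHistogram(int[] valueCounts) {
        int maxCount = 0;
        for(int c : valueCounts)
            maxCount = Math.max(maxCount, c);
        int[] freq = new int[maxCount];
        for(int c : valueCounts)
            freq[c - 1]++;
        return freq;
    }
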
diff --git a/src/main/java/org/apache/sysds/runtime/compress/estim/sample/HassAndStokes.java b/src/main/java/org/apache/sysds/runtime/compress/estim/sample/HassAndStokes.java
index ff33809..b745d4e 100644
--- a/src/main/java/org/apache/sysds/runtime/compress/estim/sample/HassAndStokes.java
+++ b/src/main/java/org/apache/sysds/runtime/compress/estim/sample/HassAndStokes.java
@@ -23,7 +23,7 @@
 
 import org.apache.commons.math3.analysis.UnivariateFunction;
 import org.apache.commons.math3.analysis.solvers.UnivariateSolverUtils;
-import org.apache.sysds.runtime.compress.utils.AbstractBitmap;
+import org.apache.sysds.runtime.compress.utils.ABitmap;
 
 public class HassAndStokes {
 
@@ -46,7 +46,7 @@
 	 * @param solveCache A Hashmap containing information for getDuj2aEstimate
 	 * @return An estimation of distinct elements in the population.
 	 */
-	public static int haasAndStokes(AbstractBitmap ubm, int nRows, int sampleSize,
+	public static int haasAndStokes(ABitmap ubm, int nRows, int sampleSize,
 		HashMap<Integer, Double> solveCache) {
 		// obtain value and frequency histograms
 		int numVals = ubm.getNumValues();
diff --git a/src/main/java/org/apache/sysds/runtime/compress/utils/AbstractBitmap.java b/src/main/java/org/apache/sysds/runtime/compress/utils/ABitmap.java
similarity index 70%
rename from src/main/java/org/apache/sysds/runtime/compress/utils/AbstractBitmap.java
rename to src/main/java/org/apache/sysds/runtime/compress/utils/ABitmap.java
index c7cc8ee..abc745d 100644
--- a/src/main/java/org/apache/sysds/runtime/compress/utils/AbstractBitmap.java
+++ b/src/main/java/org/apache/sysds/runtime/compress/utils/ABitmap.java
@@ -24,48 +24,48 @@
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
 
-public abstract class AbstractBitmap {
-    protected static final Log LOG = LogFactory.getLog(AbstractBitmap.class.getName());
+public abstract class ABitmap {
+	protected static final Log LOG = LogFactory.getLog(ABitmap.class.getName());
 
 	public enum BitmapType {
-		Lossy,
-		Full
+		Lossy, Full
 	}
-    protected final int _numCols;
-    
-    /** Bitmaps (as lists of offsets) for each of the values. */
-    protected IntArrayList[] _offsetsLists;
 
-    /** int specifying the number of zero value groups contained in the rows. */
+	protected final int _numCols;
+
+	/** Bitmaps (as lists of offsets) for each of the values. */
+	protected IntArrayList[] _offsetsLists;
+
+	/** The number of zero value groups contained in the rows. */
 	protected final int _numZeros;
 
-    public AbstractBitmap(int numCols, IntArrayList[] offsetsLists, int numZeroGroups){
+	public ABitmap(int numCols, IntArrayList[] offsetsLists, int numZeroGroups) {
 		_numCols = numCols;
-        _numZeros = numZeroGroups;
-        _offsetsLists = offsetsLists;
-    }
+		_numZeros = numZeroGroups;
+		_offsetsLists = offsetsLists;
+	}
 
 	public int getNumColumns() {
 		return _numCols;
-    }
-    
-    /**
+	}
+
+	/**
 	 * Obtain number of distinct value groups in the column. this number is also the number of bitmaps, since there is
 	 * one bitmap per value
 	 * 
 	 * @return number of distinct value groups in the column;
 	 */
 	public abstract int getNumValues();
-    
-    
-    public IntArrayList[] getOffsetList() {
+
+	public IntArrayList[] getOffsetList() {
 		return _offsetsLists;
-    }
+	}
+
 	public IntArrayList getOffsetsList(int idx) {
 		return _offsetsLists[idx];
-    }
-    
-    public long getNumOffsets() {
+	}
+
+	public long getNumOffsets() {
 		long ret = 0;
 		for(IntArrayList offlist : _offsetsLists)
 			ret += offlist.size();
@@ -74,22 +74,21 @@
 
 	public int getNumOffsets(int ix) {
 		return _offsetsLists[ix].size();
-    }
-    
+	}
 
-    public abstract void sortValuesByFrequency();
+	public abstract void sortValuesByFrequency();
 
-    public boolean containsZero() {
+	public boolean containsZero() {
 		return _numZeros > 0;
-    }
-    
-    public int getZeroCounts() {
+	}
+
+	public int getZeroCounts() {
 		return _numZeros;
-    }
-	
+	}
+
 	public abstract BitmapType getType();
 
-    @Override
+	@Override
 	public String toString() {
 		StringBuilder sb = new StringBuilder();
 		sb.append(super.toString());
diff --git a/src/main/java/org/apache/sysds/runtime/compress/utils/Bitmap.java b/src/main/java/org/apache/sysds/runtime/compress/utils/Bitmap.java
index 2aba804..6a921ee 100644
--- a/src/main/java/org/apache/sysds/runtime/compress/utils/Bitmap.java
+++ b/src/main/java/org/apache/sysds/runtime/compress/utils/Bitmap.java
@@ -22,14 +22,12 @@
 import java.util.Arrays;
 
 import org.apache.commons.lang.ArrayUtils;
-import org.apache.sysds.runtime.compress.utils.DblArrayIntListHashMap.DArrayIListEntry;
-import org.apache.sysds.runtime.compress.utils.DoubleIntListHashMap.DIListEntry;
 import org.apache.sysds.runtime.util.SortUtils;
 
 /**
  * Uncompressed representation of one or more columns in bitmap format.
  */
-public final class Bitmap extends AbstractBitmap {
+public final class Bitmap extends ABitmap {
 
 	/**
 	 * Distinct values that appear in the column. Linearized as value groups <v11 v12> <v21 v22>.
@@ -41,35 +39,6 @@
 		_values = values;
 	}
 
-	public static Bitmap makeBitmap(DblArrayIntListHashMap distinctVals, int numColumns, int numZeros) {
-		// added for one pass bitmap construction
-		// Convert inputs to arrays
-		int numVals = distinctVals.size();
-		int numCols = numColumns;
-		double[] values = new double[numVals * numCols];
-		IntArrayList[] offsetsLists = new IntArrayList[numVals];
-		int bitmapIx = 0;
-		for(DArrayIListEntry val : distinctVals.extractValues()) {
-			System.arraycopy(val.key.getData(), 0, values, bitmapIx * numCols, numCols);
-			offsetsLists[bitmapIx++] = val.value;
-		}
-		return new Bitmap(numCols, offsetsLists, numZeros, values);
-	}
-
-	public static Bitmap makeBitmap(DoubleIntListHashMap distinctVals, int numZeros) {
-		// added for one pass bitmap construction
-		// Convert inputs to arrays
-		int numVals = distinctVals.size();
-		double[] values = new double[numVals];
-		IntArrayList[] offsetsLists = new IntArrayList[numVals];
-		int bitmapIx = 0;
-		for(DIListEntry val : distinctVals.extractValues()) {
-			values[bitmapIx] = val.key;
-			offsetsLists[bitmapIx++] = val.value;
-		}
-		return new Bitmap(1, offsetsLists, numZeros, values);
-	}
-
 	/**
 	 * Get all values without unnecessary allocations and copies.
 	 * 
diff --git a/src/main/java/org/apache/sysds/runtime/compress/utils/BitmapLossy.java b/src/main/java/org/apache/sysds/runtime/compress/utils/BitmapLossy.java
index 9037c00..7f86794 100644
--- a/src/main/java/org/apache/sysds/runtime/compress/utils/BitmapLossy.java
+++ b/src/main/java/org/apache/sysds/runtime/compress/utils/BitmapLossy.java
@@ -19,23 +19,14 @@
 
 package org.apache.sysds.runtime.compress.utils;
 
-import java.util.ArrayList;
 import java.util.Arrays;
-import java.util.DoubleSummaryStatistics;
-import java.util.HashMap;
-import java.util.Iterator;
-import java.util.LinkedList;
-import java.util.List;
-import java.util.Map;
-import java.util.Map.Entry;
-import java.util.Queue;
 
 import org.apache.commons.lang.NotImplementedException;
 
 /**
  * Uncompressed but quantized representation of the contained data.
  */
-public final class BitmapLossy extends AbstractBitmap {
+public final class BitmapLossy extends ABitmap {
 
 	/**
 	 * Distinct values that appear in the column. Linearized as value groups <v11 v12> <v21 v22>.
@@ -49,137 +40,6 @@
 		_scale = scale;
 	}
 
-	public static AbstractBitmap makeBitmapLossy(Bitmap ubm) {
-		int numCols = ubm.getNumColumns();
-		double[] fp = ubm.getValues();
-		double scale = getScale(fp);
-		if(Double.isNaN(scale)) {
-			LOG.warn("Defaulting to incompressable colGroup");
-			return ubm;
-		}
-		else {
-			byte[] scaledValues = scaleValues(fp, scale);
-			if(numCols == 1) {
-				return makeBitmapLossySingleCol(ubm, scaledValues, scale);
-			}
-			else {
-				return makeBitmapLossyMultiCol(ubm, scaledValues, scale);
-			}
-		}
-
-	}
-
-	private static AbstractBitmap makeBitmapLossySingleCol(Bitmap ubm, byte[] scaledValues, double scale) {
-
-		Map<Byte, Queue<IntArrayList>> values = new HashMap<>();
-		IntArrayList[] fullSizeOffsetsLists = ubm.getOffsetList();
-		int numZeroGroups = ubm.getZeroCounts();
-		for(int idx = 0; idx < scaledValues.length; idx++) {
-			if(scaledValues[idx] != 0) { // Throw away zero values.
-				if(values.containsKey(scaledValues[idx])) {
-					values.get(scaledValues[idx]).add(fullSizeOffsetsLists[idx]);
-				}
-				else {
-					Queue<IntArrayList> offsets = new LinkedList<IntArrayList>();
-					offsets.add(fullSizeOffsetsLists[idx]);
-					values.put(scaledValues[idx], offsets);
-				}
-			}
-			else {
-				numZeroGroups++;
-			}
-		}
-		byte[] scaledValuesReduced = new byte[values.keySet().size()];
-		IntArrayList[] newOffsetsLists = new IntArrayList[values.keySet().size()];
-		Iterator<Entry<Byte, Queue<IntArrayList>>> x = values.entrySet().iterator();
-		int idx = 0;
-		while(x.hasNext()) {
-			Entry<Byte, Queue<IntArrayList>> ent = x.next();
-			scaledValuesReduced[idx] = ent.getKey().byteValue();
-			newOffsetsLists[idx] = mergeOffsets(ent.getValue());
-			idx++;
-		}
-		return new BitmapLossy(ubm.getNumColumns(), newOffsetsLists, numZeroGroups, scaledValuesReduced, scale);
-	}
-
-	private static AbstractBitmap makeBitmapLossyMultiCol(Bitmap ubm, byte[] scaledValues, double scale) {
-		int numColumns = ubm.getNumColumns();
-		Map<List<Byte>, Queue<IntArrayList>> values = new HashMap<>();
-		IntArrayList[] fullSizeOffsetsLists = ubm.getOffsetList();
-		int numZeroGroups = ubm.getZeroCounts();
-		boolean allZero = true;
-		for(int idx = 0; idx < scaledValues.length; idx += numColumns) {
-			List<Byte> array = new ArrayList<>();
-			for(int off = 0; off < numColumns; off++) {
-				allZero = scaledValues[idx + off] == 0 && allZero;
-				array.add(scaledValues[idx + off]);
-			}
-			
-			numZeroGroups += allZero ? 1 : 0;
-			if(!allZero) {
-				if(values.containsKey(array)) {
-					values.get(array).add(fullSizeOffsetsLists[idx / numColumns]);
-				}
-				else {
-					Queue<IntArrayList> offsets = new LinkedList<IntArrayList>();
-					offsets.add(fullSizeOffsetsLists[idx / numColumns]);
-					values.put(array, offsets);
-				}
-				// LOG.error(array);
-			}
-			allZero = true;
-		}
-		// LOG.error(array);
-		// LOG.error(values);
-
-
-		byte[] scaledValuesReduced = new byte[values.keySet().size() * numColumns];
-		IntArrayList[] newOffsetsLists = new IntArrayList[values.keySet().size()];
-		Iterator<Entry<List<Byte>, Queue<IntArrayList>>> x = values.entrySet().iterator();
-		int idx = 0;
-		while(x.hasNext()) {
-			Entry<List<Byte>, Queue<IntArrayList>> ent = x.next();
-			List<Byte> key = ent.getKey();
-			int row = idx * numColumns;
-			for(int off = 0; off < numColumns; off++) {
-				scaledValuesReduced[row + off] = key.get(off);
-			}
-			newOffsetsLists[idx] = mergeOffsets(ent.getValue());
-			idx++;
-		}
-		// LOG.error(Arrays.toString(scaledValuesReduced));
-		// try {
-		// 	Thread.sleep(1000);
-		// }
-		// catch(InterruptedException e) {
-		// 	// TODO Auto-generated catch block
-		// 	e.printStackTrace();
-		// }
-		return new BitmapLossy(ubm.getNumColumns(), newOffsetsLists, numZeroGroups, scaledValuesReduced, scale);
-	}
-
-	/**
-	 * Get the scale for the given double array.
-	 * 
-	 * @param fp A array of double values
-	 * @return a scale to scale to range [-127, 127]
-	 */
-	public static double getScale(double[] fp) {
-		DoubleSummaryStatistics stat = Arrays.stream(fp).summaryStatistics();
-		double max = Math.abs(Math.max(stat.getMax(), Math.abs(stat.getMin())));
-		double scale;
-		if(Double.isInfinite(max)) {
-			LOG.warn("Invalid Column, can't quantize Infinite value.");
-			return Double.NaN;
-		}
-		else if(max == 0) { // The column group is filled with 0.
-			scale = 1;
-		}
-		else {
-			scale = max / (double) (Byte.MAX_VALUE);
-		}
-		return scale;
-	}
 
 	/**
 	 * Get all values without unnecessary allocations and copies.
@@ -231,11 +91,15 @@
 
 	@Override
 	public void sortValuesByFrequency() {
-		// TODO Auto-generated method stub
 		throw new NotImplementedException("Not Implemented Sorting of Lossy Bit Map");
 	}
 
 	@Override
+	public BitmapType getType() {
+		return BitmapType.Lossy;
+	}
+
+	@Override
 	public String toString() {
 		StringBuilder sb = new StringBuilder();
 		sb.append(super.toString());
@@ -246,60 +110,4 @@
 		return sb.toString();
 	}
 
-	// UTIL FUNCTIONS
-
-	private static IntArrayList mergeOffsets(Queue<IntArrayList> offsets) {
-		if(offsets.size() == 1) {
-			return offsets.remove();
-		}
-		else {
-			IntArrayList h = offsets.remove();
-			IntArrayList t = offsets.remove();
-			IntArrayList n = mergeOffsets(h, t);
-			offsets.add(n);
-			return mergeOffsets(offsets);
-		}
-	}
-
-	private static IntArrayList mergeOffsets(IntArrayList h, IntArrayList t) {
-		int lhsSize = h.size(); // Size left
-		int rhsSize = t.size(); // Size right
-		int[] res = new int[lhsSize + rhsSize]; // Result array.
-		int[] lhs = h.extractValues(); // Left hand side values
-		int[] rhs = t.extractValues(); // Right hand side values
-		int lhsP = 0; // Left hand side pointer
-		int rhsP = 0; // Right hand side pointer
-		int p = 0; // Pointer in array.
-		while(lhsP < lhsSize || rhsP < rhsSize) {
-			if(lhsP < lhsSize && (rhsP == rhsSize || lhs[lhsP] < rhs[rhsP])) {
-				res[p++] = lhs[lhsP++];
-			}
-			else {
-				res[p++] = rhs[rhsP++];
-			}
-		}
-		return new IntArrayList(res);
-	}
-
-	@Override
-	public BitmapType getType() {
-		return BitmapType.Lossy;
-	}
-
-	/**
-	 * Utility method to scale all the values in the array to byte range
-	 * 
-	 * TODO make scaling parallel since each scaling is independent.
-	 * 
-	 * @param fp    doulbe array to scale
-	 * @param scale the scale to apply
-	 * @return the scaled values in byte
-	 */
-	public static byte[] scaleValues(double[] fp, double scale) {
-		byte[] res = new byte[fp.length];
-		for(int idx = 0; idx < fp.length; idx++) {
-			res[idx] = (byte) (fp[idx] / scale);
-		}
-		return res;
-	}
 }
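
The removed getScale/scaleValues helpers implemented a symmetric 8-bit quantization; a minimal standalone sketch of the same scheme (the free-standing method name is illustrative, and the handling of infinite inputs is omitted):

    static byte[] quantize(double[] fp) {
        double maxAbs = 0;
        for(double v : fp)
            maxAbs = Math.max(maxAbs, Math.abs(v));
        // an all-zero input keeps scale 1 to avoid division by zero
        double scale = (maxAbs == 0) ? 1 : maxAbs / Byte.MAX_VALUE;
        byte[] res = new byte[fp.length];
        for(int i = 0; i < fp.length; i++)
            res[i] = (byte) (fp[i] / scale); // maps values onto [-127, 127]
        return res;
    }
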
diff --git a/src/main/java/org/apache/sysds/runtime/compress/utils/DblArrayIntListHashMap.java b/src/main/java/org/apache/sysds/runtime/compress/utils/DblArrayIntListHashMap.java
index 24bde6d..aced358 100644
--- a/src/main/java/org/apache/sysds/runtime/compress/utils/DblArrayIntListHashMap.java
+++ b/src/main/java/org/apache/sysds/runtime/compress/utils/DblArrayIntListHashMap.java
@@ -20,16 +20,21 @@
 package org.apache.sysds.runtime.compress.utils;
 
 import java.util.ArrayList;
+import java.util.Collections;
+import java.util.Comparator;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
 
 /**
  * This class provides a memory-efficient replacement for {@code HashMap<DblArray,IntArrayList>} for restricted use
  * cases.
  * 
- * TODO: Fix allocation of size such that it contains some amount of overhead from the start, to enable hashmap
- * performance.
  */
 public class DblArrayIntListHashMap extends CustomHashMap {
 
+	protected static final Log LOG = LogFactory.getLog(DblArrayIntListHashMap.class.getName());
+
 	private DArrayIListEntry[] _data = null;
 
 	public DblArrayIntListHashMap() {
@@ -92,7 +97,9 @@
 				ret.add(e);
 			}
 		}
+		Collections.sort(ret);
 
+		// LOG.info(ret); // debug logging of the sorted entries, disabled
 		return ret;
 	}
 
@@ -132,7 +139,7 @@
 		return h & (length - 1);
 	}
 
-	public class DArrayIListEntry {
+	public class DArrayIListEntry implements Comparator<DArrayIListEntry>, Comparable<DArrayIListEntry> {
 		public DblArray key;
 		public IntArrayList value;
 		public DArrayIListEntry next;
@@ -142,5 +149,30 @@
 			value = evalue;
 			next = null;
 		}
+
+		@Override
+		public int compare(DArrayIListEntry o1, DArrayIListEntry o2) {
+			double[] o1d = o1.key.getData();
+			double[] o2d = o2.key.getData();
+			for(int i = 0; i < o1d.length && i < o2d.length; i++) {
+				if(o1d[i] > o2d[i]) {
+					return 1;
+				}
+				else if(o1d[i] < o2d[i]) {
+					return -1;
+				}
+			}
+			// equal prefix: order by key length, returning 0 for identical keys so the
+			// comparator satisfies the compareTo contract (otherwise Collections.sort
+			// can throw "Comparison method violates its general contract!")
+			return Integer.compare(o1d.length, o2d.length);
+		}
+
+		@Override
+		public int compareTo(DArrayIListEntry o) {
+			return compare(this, o);
+		}
 	}
 }
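
The ordering added above is a lexicographic comparison over the entry keys, which makes the sorted extraction deterministic; a standalone equivalent over plain double[] keys (illustrative helper, incorporating the contract fix noted in the comment):

    static int compareKeys(double[] a, double[] b) {
        for(int i = 0; i < a.length && i < b.length; i++) {
            int c = Double.compare(a[i], b[i]);
            if(c != 0)
                return c;
        }
        // equal prefix: shorter keys sort first, identical keys compare as 0
        return Integer.compare(a.length, b.length);
    }
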
diff --git a/src/main/java/org/apache/sysds/runtime/compress/utils/DoubleIntListHashMap.java b/src/main/java/org/apache/sysds/runtime/compress/utils/DoubleIntListHashMap.java
index 5236196..2a4d5f1 100644
--- a/src/main/java/org/apache/sysds/runtime/compress/utils/DoubleIntListHashMap.java
+++ b/src/main/java/org/apache/sysds/runtime/compress/utils/DoubleIntListHashMap.java
@@ -20,6 +20,8 @@
 package org.apache.sysds.runtime.compress.utils;
 
 import java.util.ArrayList;
+import java.util.Collections;
+import java.util.Comparator;
 
 /**
  * This class provides a memory-efficient replacement for {@code HashMap<Double,IntArrayList>} for restricted use cases.
@@ -91,6 +93,7 @@
 				ret.add(e);
 			}
 		}
+		Collections.sort(ret);
 
 		return ret;
 	}
@@ -133,7 +136,7 @@
 		return h & (length - 1);
 	}
 
-	public class DIListEntry {
+	public class DIListEntry implements Comparator<DIListEntry>, Comparable<DIListEntry> {
 		public double key = Double.MAX_VALUE;
 		public IntArrayList value = null;
 		public DIListEntry next = null;
@@ -143,5 +146,16 @@
 			value = evalue;
 			next = null;
 		}
+
+		@Override
+		public int compareTo(DIListEntry o) {
+			return compare(this, o);
+		}
+
+		@Override
+		public int compare(DIListEntry arg0, DIListEntry arg1) {
+			return Double.compare(arg0.key, arg1.key);
+		}
+
 	}
 }
diff --git a/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixCountDistinct.java b/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixCountDistinct.java
index c078e36..e582ee9 100644
--- a/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixCountDistinct.java
+++ b/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixCountDistinct.java
@@ -115,7 +115,7 @@
 		Set<Double> distinct = new HashSet<>();
 		double[] data;
 		long nonZeros = in.getNonZeros();
-		if(nonZeros < in.getNumColumns() * in.getNumRows()){
+		if(nonZeros < in.getNumColumns() * in.getNumRows()) {
 			distinct.add(0d);
 		}
 		if(in.sparseBlock == null && in.denseBlock == null) {
@@ -126,7 +126,7 @@
 		}
 		else if(in.sparseBlock != null) {
 			SparseBlock sb = in.sparseBlock;
-			
+
 			if(in.sparseBlock.isContiguous()) {
 				data = sb.values(0);
 				countDistinctValuesNaive(data, distinct);
diff --git a/src/main/java/org/apache/sysds/runtime/matrix/operators/LeftScalarOperator.java b/src/main/java/org/apache/sysds/runtime/matrix/operators/LeftScalarOperator.java
index 6e2e78e..7a40a3f 100644
--- a/src/main/java/org/apache/sysds/runtime/matrix/operators/LeftScalarOperator.java
+++ b/src/main/java/org/apache/sysds/runtime/matrix/operators/LeftScalarOperator.java
@@ -38,14 +38,24 @@
 	private static final long serialVersionUID = 2360577666575746424L;
 	
 	public LeftScalarOperator(ValueFunction p, double cst) {
+		this(p, cst, 1);
+	}
+
+	public LeftScalarOperator(ValueFunction p, double cst, int numThreads) {
 		super(p, cst, (p instanceof GreaterThan && cst<=0)
 			|| (p instanceof GreaterThanEquals && cst<0)
 			|| (p instanceof LessThan && cst>=0)
 			|| (p instanceof LessThanEquals && cst>0)
 			|| (Builtin.isBuiltinCode(p, BuiltinCode.MAX) && cst<=0)
-			|| (Builtin.isBuiltinCode(p, BuiltinCode.MIN) && cst>=0));
+			|| (Builtin.isBuiltinCode(p, BuiltinCode.MIN) && cst>=0),
+			numThreads);
 	}
-	
+
+	@Override
+	public ScalarOperator setConstant(double cst, int numThreads) {
+		return new LeftScalarOperator(fn, cst, numThreads);
+	}
+
 	@Override
 	public ScalarOperator setConstant(double cst) {
 		return new LeftScalarOperator(fn, cst);
diff --git a/src/main/java/org/apache/sysds/runtime/matrix/operators/RightScalarOperator.java b/src/main/java/org/apache/sysds/runtime/matrix/operators/RightScalarOperator.java
index 9d5c12e..a55ed66 100644
--- a/src/main/java/org/apache/sysds/runtime/matrix/operators/RightScalarOperator.java
+++ b/src/main/java/org/apache/sysds/runtime/matrix/operators/RightScalarOperator.java
@@ -39,6 +39,10 @@
 	private static final long serialVersionUID = 5148300801904349919L;
 	
 	public RightScalarOperator(ValueFunction p, double cst) {
+		this(p, cst, 1);
+	}
+
+	public RightScalarOperator(ValueFunction p, double cst, int numThreads) {
 		super(p, cst, (p instanceof GreaterThan && cst>=0)
 			|| (p instanceof GreaterThanEquals && cst>0)
 			|| (p instanceof LessThan && cst<=0)
@@ -46,13 +50,19 @@
 			|| (p instanceof Divide && cst!=0)
 			|| (p instanceof Power && cst!=0)
 			|| (Builtin.isBuiltinCode(p, BuiltinCode.MAX) && cst<=0)
-			|| (Builtin.isBuiltinCode(p, BuiltinCode.MIN) && cst>=0));
+			|| (Builtin.isBuiltinCode(p, BuiltinCode.MIN) && cst>=0), 
+			numThreads);
 	}
 
 	@Override
 	public ScalarOperator setConstant(double cst) {
 		return new RightScalarOperator(fn, cst);
 	}
+
+	@Override
+	public ScalarOperator setConstant(double cst, int numThreads) {
+		return new RightScalarOperator(fn, cst, numThreads);
+	}
 	
 	@Override
 	public double executeScalar(double in) {
diff --git a/src/main/java/org/apache/sysds/runtime/matrix/operators/ScalarOperator.java b/src/main/java/org/apache/sysds/runtime/matrix/operators/ScalarOperator.java
index a395397..8f27209 100644
--- a/src/main/java/org/apache/sysds/runtime/matrix/operators/ScalarOperator.java
+++ b/src/main/java/org/apache/sysds/runtime/matrix/operators/ScalarOperator.java
@@ -44,12 +44,17 @@
 
 	public final ValueFunction fn;
 	protected final double _constant;
+	private final int k; // number of threads
 	
 	public ScalarOperator(ValueFunction p, double cst) {
 		this(p, cst, false);
 	}
 	
 	protected ScalarOperator(ValueFunction p, double cst, boolean altSparseSafe) {
+		this(p, cst, altSparseSafe, 1);
+	}
+
+	protected ScalarOperator(ValueFunction p, double cst, boolean altSparseSafe, int numThreads) {
 		super( isSparseSafeStatic(p) || altSparseSafe
 				|| (p instanceof NotEquals && cst==0)
 				|| (p instanceof Equals && cst!=0)
@@ -58,6 +63,7 @@
 				|| (p instanceof Builtin && ((Builtin)p).getBuiltinCode()==BuiltinCode.MIN && cst>=0));
 		fn = p;
 		_constant = cst;
+		k = numThreads;
 	}
 	
 	public double getConstant() {
@@ -66,6 +72,8 @@
 	
 	public abstract ScalarOperator setConstant(double cst);
 	
+	public abstract ScalarOperator setConstant(double cst, int numThreads);
+
 	/**
 	 * Apply the scalar operator over a given input value.
 	 * 
@@ -86,4 +94,8 @@
 			|| fn instanceof Builtin && ((Builtin)fn).getBuiltinCode()==BuiltinCode.LOG_NZ)
 			|| fn instanceof BitwShiftL || fn instanceof BitwShiftR;
 	}
+
+	public int getNumThreads() {
+		return k;
+	}
 }
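
A minimal usage sketch of the new thread-aware constructors and the setConstant overload (the constant and thread count are illustrative):

    ScalarOperator sop = new RightScalarOperator(Multiply.getMultiplyFnObject(), 7, 16);
    int k = sop.getNumThreads();                 // 16
    ScalarOperator sop2 = sop.setConstant(3, k); // new constant, same parallelism
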
diff --git a/src/main/java/org/apache/sysds/runtime/util/DataConverter.java b/src/main/java/org/apache/sysds/runtime/util/DataConverter.java
index fe72ebc..da2578a 100644
--- a/src/main/java/org/apache/sysds/runtime/util/DataConverter.java
+++ b/src/main/java/org/apache/sysds/runtime/util/DataConverter.java
@@ -1350,13 +1350,6 @@
 		return ret;
 	}
 
-	public static double[] toDouble(byte[] data) {
-		double[] ret = new double[data.length];
-		for(int i=0; i<data.length; i++)
-			ret[i] = data[i];
-		return ret;
-	}
-	
 	public static double[] toDouble(BitSet data, int len) {
 		double[] ret = new double[len];
 		for(int i=0; i<len; i++)
diff --git a/src/test/java/org/apache/sysds/test/TestUtils.java b/src/test/java/org/apache/sysds/test/TestUtils.java
index 5dd924e..2123e8a 100644
--- a/src/test/java/org/apache/sysds/test/TestUtils.java
+++ b/src/test/java/org/apache/sysds/test/TestUtils.java
@@ -725,33 +725,38 @@
 			new double[][]{actualMatrix}, 1, expectedMatrix.length, epsilon);
 	}
 	
-	/**
-	 * Compares two matrices in array format.
-	 * 
-	 * @param expectedMatrix expected values
-	 * @param actualMatrix actual values
-	 * @param rows number of rows
-	 * @param cols number of columns
-	 * @param epsilon tolerance for value comparison
-	 */
+	
 	public static void compareMatrices(double[][] expectedMatrix, double[][] actualMatrix, int rows, int cols,
-			double epsilon) {
+		double epsilon) {
+		compareMatrices(expectedMatrix, actualMatrix, rows, cols, epsilon, "");
+	}
+
+	public static void compareMatrices(double[][] expectedMatrix, double[][] actualMatrix, int rows, int cols,
+			double epsilon, String message) {
 		int countErrors = 0;
-		for (int i = 0; i < rows; i++) {
-			for (int j = 0; j < cols; j++) {
+		for (int i = 0; i < rows && countErrors < 50; i++) {
+			for (int j = 0; j < cols && countErrors < 50; j++) {
 				if (!compareCellValue(expectedMatrix[i][j], actualMatrix[i][j], epsilon, false)) {
-					System.out.println(expectedMatrix[i][j] +" vs actual: "+actualMatrix[i][j]+" at "+i+" "+j);
+					message += ("\n " +expectedMatrix[i][j] +" vs actual: "+actualMatrix[i][j]+" at "+i+" "+j);
 					countErrors++;
 				}
 			}
 		}
-		assertTrue("" + countErrors + " values are not in equal", countErrors == 0);
+		if(countErrors == 50){
+			assertTrue(message+" \n At least 50 values are not equal using epsilon " + epsilon, countErrors == 0);
+		}else{
+			assertTrue(message+" \n" + countErrors + " values are not equal using epsilon " + epsilon, countErrors == 0);
+		}
 	}
 
 	public static void compareMatrices(double[][] expectedMatrix, double[][] actualMatrix, double epsilon){
-		assertTrue("The number of columns in the matrixes should be equal", expectedMatrix.length == actualMatrix.length);
-		assertTrue("The number of rows in the matrixes should be equal", expectedMatrix[0].length == actualMatrix[0].length);
-		compareMatrices(expectedMatrix, actualMatrix, expectedMatrix.length, expectedMatrix[0].length, epsilon);
+		compareMatrices(expectedMatrix, actualMatrix, epsilon, "");
+	}
+
+	public static void compareMatrices(double[][] expectedMatrix, double[][] actualMatrix, double epsilon, String message){
+		assertTrue(message+"\n The number of columns in the matrixes should be equal", expectedMatrix.length == actualMatrix.length);
+		assertTrue(message+"\n The number of rows in the matrixes should be equal", expectedMatrix[0].length == actualMatrix[0].length);
+		compareMatrices(expectedMatrix, actualMatrix, expectedMatrix.length, expectedMatrix[0].length, epsilon, message);
 	}
 	
 	public static void compareFrames(String[][] expectedFrame, String[][] actualFrame, int rows, int cols ) {
@@ -799,20 +804,24 @@
 		int countErrors = 0;
 		long sumDistance = 0;
 		long distance;
-		for (int i = 0; i < rows; i++) {
-			for (int j = 0; j < cols; j++) {
+		for (int i = 0; i < rows && countErrors < 50; i++) {
+			for (int j = 0; j < cols && countErrors < 50; j++) {
 				distance = compareScalarBits(expectedMatrix[i][j], actualMatrix[i][j]);
 				sumDistance += distance;
 				if(distance > maxUnitsOfLeastPrecision){
-					System.out.println(expectedMatrix[i][j] +" vs actual: "+actualMatrix[i][j]+" at "+i+" "+j + " Distance in bits: " + distance);
+					message += ("\n " + expectedMatrix[i][j] +" vs actual: "+actualMatrix[i][j]+" at "+i+" "+j + " Distance in bits: " + distance);
 					countErrors++;
 				}
 			}
 		}
-		long avgDistance = sumDistance / (rows * cols);
-		assertTrue(message + "\n" + countErrors + " values are not in equal", countErrors == 0);
-		assertTrue(message + "\nThe avg distance in bits: "+ avgDistance +" was higher than max: " + maxAvgDistance,
-			avgDistance <= maxAvgDistance);
+		if(countErrors == 50){
+			assertTrue(message + "\n At least 50 values are not equal", countErrors == 0);
+		}else{
+			long avgDistance = sumDistance / (rows * cols);
+			assertTrue(message + "\n" + countErrors + " values are not equal", countErrors == 0);
+			assertTrue(message + "\nThe avg distance in bits: "+ avgDistance +" was higher than max: " + maxAvgDistance,
+				avgDistance <= maxAvgDistance);
+		}
 	}
 
 	/**
@@ -868,7 +877,7 @@
 					distance = getPercentDistance(expectedMatrix[i][j], actualMatrix[i][j], ignoreZero);
 					sumPercentDistance += distance;
 					if(distance < percentDistanceAllowed){
-						System.out.println(expectedMatrix[i][j] +" vs actual: "+actualMatrix[i][j]+" at "+i+" "+j + " Distance in percent " + distance);
+						message += ("\n " + expectedMatrix[i][j] +" vs actual: "+actualMatrix[i][j]+" at "+i+" "+j + " Distance in percent " + distance);
 						countErrors++;
 					}
 				}
@@ -901,6 +910,9 @@
 	 * @return The distance in bits (units of least precision)
 	 */
 	public static long compareScalarBits(double d1, double d2) {
+		
+		// assertTrue("Both values should be positive or negative",(d1 >= 0 && d2 >= 0) || (d2 <= 0 && d1 <= 0));
+		
 		long expectedBits = Double.doubleToLongBits(d1) < 0 ? 0x8000000000000000L - Double.doubleToLongBits(d1) : Double.doubleToLongBits(d1);
 		long actualBits = Double.doubleToLongBits(d2) < 0 ? 0x8000000000000000L - Double.doubleToLongBits(d2) : Double.doubleToLongBits(d2);
 		long difference = expectedBits > actualBits ? expectedBits - actualBits : actualBits - expectedBits;
@@ -910,6 +922,8 @@
 	public static boolean compareScalarBits(double d1, double d2, long maxUnitsOfLeastPrecision) {
 		if (Double.isNaN(d1) || Double.isNaN(d2))
 			return false;
+
+		// assertTrue("Both values should be positive or negative",(d1 >= 0 && d2 >= 0) || (d2 <= 0 && d1 <= 0));
 		long expectedBits = Double.doubleToLongBits(d1) < 0 ? 0x8000000000000000L - Double.doubleToLongBits(d1) : Double.doubleToLongBits(d1);
 		long actualBits = Double.doubleToLongBits(d2) < 0 ? 0x8000000000000000L - Double.doubleToLongBits(d2) : Double.doubleToLongBits(d2);
 		long difference = expectedBits > actualBits ? expectedBits - actualBits : actualBits - expectedBits;
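
The comparison above measures distance in units of least precision: the long bit patterns of the two doubles are remapped so that their integer ordering is monotone in the double value, making adjacent doubles differ by exactly 1. A standalone sketch with an illustrative helper name:

    static long ulpDistance(double d1, double d2) {
        long a = Double.doubleToLongBits(d1);
        long b = Double.doubleToLongBits(d2);
        // remap negative doubles so that a larger long means a larger double
        if(a < 0) a = 0x8000000000000000L - a;
        if(b < 0) b = 0x8000000000000000L - b;
        return a > b ? a - b : b - a;
    }
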
diff --git a/src/test/java/org/apache/sysds/test/component/compress/AbstractCompressedUnaryTests.java b/src/test/java/org/apache/sysds/test/component/compress/AbstractCompressedUnaryTests.java
index 3717be7..560d234 100644
--- a/src/test/java/org/apache/sysds/test/component/compress/AbstractCompressedUnaryTests.java
+++ b/src/test/java/org/apache/sysds/test/component/compress/AbstractCompressedUnaryTests.java
@@ -171,25 +171,25 @@
 			assertTrue("dim 2 is equal in non compressed res", d1[0].length == dim2);
 			assertTrue("dim 2 is equal in compressed res", d2[0].length == dim2);
 
+			String css = compressionSettings.toString();
 			if(compressionSettings.lossy) {
 				if(aggType == AggType.COLSUMS) {
-					TestUtils.compareMatrices(d1, d2, lossyTolerance * 30 * dim2);
+					TestUtils.compareMatrices(d1, d2, lossyTolerance * 150 * cols, css);
 				}
 				else if(aggType == AggType.ROWSUMS) {
-					TestUtils.compareMatrices(d1, d2, lossyTolerance * 16 * dim1);
+					TestUtils.compareMatrices(d1, d2, lossyTolerance * 16 * rows, css);
+				}
+				else if(aggType == AggType.SUM) {
+					TestUtils.compareMatrices(d1, d2, lossyTolerance * 10 * cols * rows, css);
+				}
 				else {
 					boolean ignoreZero = true;
-					TestUtils.compareMatricesPercentageDistance(d1,
-						d2,
-						0.1,
-						0.9,
-						compressionSettings.toString(),
-						ignoreZero);
+					TestUtils.compareMatricesPercentageDistance(d1, d2, 0.1, 0.9, css, ignoreZero);
 				}
 			}
 			else {
-				TestUtils.compareMatricesBitAvgDistance(d1, d2, 2048, 20, compressionSettings.toString());
+				TestUtils.compareMatricesBitAvgDistance(d1, d2, 2048, 30, css);
 			}
 		}
 		catch(NotImplementedException e) {
diff --git a/src/test/java/org/apache/sysds/test/component/compress/CompressedMatrixTest.java b/src/test/java/org/apache/sysds/test/component/compress/CompressedMatrixTest.java
index ff09d45..ba83baa 100644
--- a/src/test/java/org/apache/sysds/test/component/compress/CompressedMatrixTest.java
+++ b/src/test/java/org/apache/sysds/test/component/compress/CompressedMatrixTest.java
@@ -27,14 +27,11 @@
 import java.io.DataInputStream;
 import java.io.DataOutputStream;
 
-import org.apache.sysds.lops.MMTSJ.MMTSJType;
-import org.apache.sysds.lops.MapMultChain.ChainType;
 import org.apache.sysds.runtime.compress.CompressedMatrixBlock;
 import org.apache.sysds.runtime.compress.CompressionSettings;
 import org.apache.sysds.runtime.compress.CompressionStatistics;
 import org.apache.sysds.runtime.compress.colgroup.ColGroup;
 import org.apache.sysds.runtime.functionobjects.Multiply;
-import org.apache.sysds.runtime.functionobjects.Plus;
 import org.apache.sysds.runtime.matrix.data.LibMatrixCountDistinct;
 import org.apache.sysds.runtime.matrix.data.MatrixBlock;
 import org.apache.sysds.runtime.matrix.operators.AggregateUnaryOperator;
@@ -123,131 +120,14 @@
 	}
 
 	@Test
-	public void testMatrixMultChain() {
-		try {
-			if(!(cmb instanceof CompressedMatrixBlock))
-				return; // Input was not compressed then just pass test
-
-			MatrixBlock vector1 = DataConverter
-				.convertToMatrixBlock(TestUtils.generateTestMatrix(cols, 1, 0.5, 1.5, 1.0, 3));
-
-			// ChainType ctype = ChainType.XtwXv;
-			// Linear regression .
-			for(ChainType ctype : new ChainType[] {ChainType.XtwXv, ChainType.XtXv,
-				// ChainType.XtXvy
-			}) {
-
-				MatrixBlock vector2 = (ctype == ChainType.XtwXv) ? DataConverter
-					.convertToMatrixBlock(TestUtils.generateTestMatrix(rows, 1, 0.5, 1.5, 1.0, 3)) : null;
-
-				// matrix-vector uncompressed
-				MatrixBlock ret1 = mb.chainMatrixMultOperations(vector1, vector2, new MatrixBlock(), ctype);
-
-				// matrix-vector compressed
-				MatrixBlock ret2 = cmb.chainMatrixMultOperations(vector1, vector2, new MatrixBlock(), ctype);
-
-				// compare result with input
-				double[][] d1 = DataConverter.convertToDoubleMatrix(ret1);
-				double[][] d2 = DataConverter.convertToDoubleMatrix(ret2);
-
-				if(compressionSettings.lossy) {
-					// TODO Make actual calculation to know the tolerance
-					// double scaledTolerance = lossyTolerance * d1.length * d1.length * 1.5;
-					// if(ctype == ChainType.XtwXv){
-					// scaledTolerance *= d1.length * d1.length * 0.5;
-					// }
-					// TestUtils.compareMatrices(d1, d2, d1.length, d1[0].length, scaledTolerance );
-					TestUtils.compareMatricesPercentageDistance(d1, d2, 0.95, 0.95, compressionSettings.toString());
-				}
-				else {
-					TestUtils.compareMatricesBitAvgDistance(d1, d2, 512, 350, compressionSettings.toString());
-				}
-			}
-		}
-		catch(Exception e) {
-			e.printStackTrace();
-			throw new RuntimeException(this.toString() + "\n" + e.getMessage(), e);
-		}
-	}
-
-	@Test
-	public void testTransposeSelfMatrixMult() {
-		try {
-			if(!(cmb instanceof CompressedMatrixBlock))
-				return; // Input was not compressed then just pass test
-			// ChainType ctype = ChainType.XtwXv;
-			for(MMTSJType mType : new MMTSJType[] {MMTSJType.LEFT,
-				// MMTSJType.RIGHT
-			}) {
-				// matrix-vector uncompressed
-				MatrixBlock ret1 = mb.transposeSelfMatrixMultOperations(new MatrixBlock(), mType);
-
-				// matrix-vector compressed
-				MatrixBlock ret2 = cmb.transposeSelfMatrixMultOperations(new MatrixBlock(), mType);
-
-				// compare result with input
-				double[][] d1 = DataConverter.convertToDoubleMatrix(ret1);
-				double[][] d2 = DataConverter.convertToDoubleMatrix(ret2);
-				if(compressionSettings.lossy) {
-					/**
-					 * Probably one of the worst thing you can do to increase the amount the values are estimated wrong
-					 */
-					TestUtils.compareMatricesPercentageDistance(d1, d2, 0.0, 0.8, compressionSettings.toString());
-				}
-				else {
-					TestUtils.compareMatricesBitAvgDistance(d1, d2, 2048, 20, compressionSettings.toString());
-				}
-			}
-		}
-		catch(Exception e) {
-			e.printStackTrace();
-			throw new RuntimeException(this.toString() + "\n" + e.getMessage(), e);
-		}
-	}
-
-	@Test
-	public void testScalarOperationsSparseUnsafe() {
-		try {
-			if(!(cmb instanceof CompressedMatrixBlock))
-				return; // Input was not compressed then just pass test
-
-			double addValue = 1000;
-			// matrix-scalar uncompressed
-			ScalarOperator sop = new RightScalarOperator(Plus.getPlusFnObject(), addValue);
-			MatrixBlock ret1 = mb.scalarOperations(sop, new MatrixBlock());
-
-			// matrix-scalar compressed
-			MatrixBlock ret2 = cmb.scalarOperations(sop, new MatrixBlock());
-			if(ret2 instanceof CompressedMatrixBlock)
-				ret2 = ((CompressedMatrixBlock) ret2).decompress();
-
-			// compare result with input
-			double[][] d1 = DataConverter.convertToDoubleMatrix(ret1);
-			double[][] d2 = DataConverter.convertToDoubleMatrix(ret2);
-
-			if(compressionSettings.lossy) {
-				double modifiedTolerance = Math.max(TestConstants.getMaxRangeValue(valRange) + addValue,
-					Math.abs(TestConstants.getMinRangeValue(valRange) + addValue)) * 2 / 127.0;
-				TestUtils.compareMatrices(d1, d2, modifiedTolerance);
-			}
-			else {
-				TestUtils.compareMatricesBitAvgDistance(d1, d2, 150, 1, compressionSettings.toString());
-			}
-		}
-		catch(Exception e) {
-			e.printStackTrace();
-			throw new RuntimeException(this.toString() + "\n" + e.getMessage(), e);
-		}
-	}
-
-	@Test
 	public void testScalarOperations() {
 		try {
 			if(!(cmb instanceof CompressedMatrixBlock))
 				return; // Input was not compressed then just pass test
 
+			double mult = 7;
 			// matrix-scalar uncompressed
-			ScalarOperator sop = new RightScalarOperator(Multiply.getMultiplyFnObject(), 7);
+			ScalarOperator sop = new RightScalarOperator(Multiply.getMultiplyFnObject(), mult, _k);
 			MatrixBlock ret1 = mb.scalarOperations(sop, new MatrixBlock());
 
 			// matrix-scalar compressed
@@ -259,8 +139,8 @@
 			double[][] d1 = DataConverter.convertToDoubleMatrix(ret1);
 			double[][] d2 = DataConverter.convertToDoubleMatrix(ret2);
 			if(compressionSettings.lossy) {
-				double modifiedTolerance = lossyTolerance * 7;
-				TestUtils.compareMatrices(d1, d2, modifiedTolerance);
+				double modifiedTolerance = lossyTolerance * mult + lossyTolerance * 0.00001;
+				TestUtils.compareMatrices(d1, d2, modifiedTolerance, compressionSettings.toString());
 			}
 			else {
 				TestUtils.compareMatricesBitAvgDistance(d1, d2, 150, 1, compressionSettings.toString());
@@ -277,20 +157,18 @@
 		try {
 			if(!(cmb instanceof CompressedMatrixBlock))
 				return; // Input was not compressed then just pass test
-			// compare result with input
 
-			// matrix-scalar uncompressed
 			CountDistinctOperator op = new CountDistinctOperator(CountDistinctTypes.COUNT);
 			int ret1 = LibMatrixCountDistinct.estimateDistinctValues(mb, op);
-			// matrix-scalar compressed
 			int ret2 = LibMatrixCountDistinct.estimateDistinctValues(cmb, op);
-
-			// assertTrue(compressionSettings.toString(), ret1 == ret2);
 			String base = compressionSettings.toString() + "\n";
 			if(compressionSettings.lossy) {
-				// The number of distinct values should be significantly lower in lossy mode.
-				assertTrue(base + "estimate is less than actual", ret1 >= ret2);
-				assertTrue(base + "estimate is greater than 0", 0 < ret2);
+				// One might expect the number of distinct values to be the same or lower in lossy mode:
+				// assertTrue(base + "lossy distinct count " + ret2 + " is less than full " + ret1, ret1 >= ret2);
+
+				// That assumption does not hold: with multiple different scales the distinct count
+				// can grow as well as shrink (see the numeric sketch after this hunk), so only a
+				// non-empty result is verified.
+				assertTrue(base + "lossy distinct count " + ret2 + " is greater than 0", 0 < ret2);
 			}
 			else {
 				assertEquals(base, ret1, ret2);
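
A numeric sketch (values illustrative) of why the lossy distinct count can move in either direction under the 8-bit scheme:

    double scale = 127.0 / 127.0;   // a column group with maxAbs = 127 gets scale 1.0
    byte a = (byte) (1.00 / scale); // 1
    byte b = (byte) (1.01 / scale); // 1 -> two distinct inputs collapse, count shrinks
    // conversely, 1.0 in a group with scale 0.5 decodes as 2 * 0.5 = 1.0, while in a
    // group with scale 0.3 it decodes as 3 * 0.3 = 0.9, so one input value can
    // surface as two distinct decoded values, and the count grows
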
@@ -368,7 +246,7 @@
 			long colsEstimate = cStat.estimatedSizeCols;
 			long actualSize = cStat.size;
 			long originalSize = cStat.originalSize;
-			int allowedTolerance = 0;
+			int allowedTolerance = 4096;
 
 			if(compressionSettings.samplingRatio < 1.0) {
 				allowedTolerance = sampleTolerance;
@@ -384,7 +262,7 @@
 			builder.append("\n\tcol groups types: " + cStat.getGroupsTypesString());
 			builder.append("\n\tcol groups sizes: " + cStat.getGroupsSizesString());
 			builder.append("\n\t" + this.toString());
-			boolean res = actualSize - colsEstimate <= allowedTolerance;
+			boolean res = Math.abs(actualSize - colsEstimate) <= allowedTolerance;
 			assertTrue(builder.toString(), res);
 		}
 		catch(Exception e) {
@@ -417,7 +295,7 @@
 			// it treats the object hierarchy as a tree and not a graph
 			assertTrue(builder.toString(),
 				actualSize <= originalSize &&
-					(compressionSettings.allowSharedDDCDictionary || actualSize == JolEstimatedSize));
+					(compressionSettings.allowSharedDictionary || actualSize == JolEstimatedSize));
 		}
 		catch(Exception e) {
 			e.printStackTrace();
diff --git a/src/test/java/org/apache/sysds/test/component/compress/CompressedTestBase.java b/src/test/java/org/apache/sysds/test/component/compress/CompressedTestBase.java
index 57d60d4..2440c2d 100644
--- a/src/test/java/org/apache/sysds/test/component/compress/CompressedTestBase.java
+++ b/src/test/java/org/apache/sysds/test/component/compress/CompressedTestBase.java
@@ -28,6 +28,7 @@
 import org.apache.commons.lang3.tuple.Pair;
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
+import org.apache.sysds.lops.MMTSJ.MMTSJType;
 import org.apache.sysds.lops.MapMultChain.ChainType;
 import org.apache.sysds.runtime.compress.CompressedMatrixBlock;
 import org.apache.sysds.runtime.compress.CompressedMatrixBlockFactory;
@@ -37,9 +38,12 @@
 import org.apache.sysds.runtime.compress.colgroup.ColGroup.CompressionType;
 import org.apache.sysds.runtime.functionobjects.Multiply;
 import org.apache.sysds.runtime.functionobjects.Plus;
+import org.apache.sysds.runtime.instructions.InstructionUtils;
 import org.apache.sysds.runtime.matrix.data.MatrixBlock;
 import org.apache.sysds.runtime.matrix.operators.AggregateBinaryOperator;
 import org.apache.sysds.runtime.matrix.operators.AggregateOperator;
+import org.apache.sysds.runtime.matrix.operators.RightScalarOperator;
+import org.apache.sysds.runtime.matrix.operators.ScalarOperator;
 import org.apache.sysds.runtime.util.DataConverter;
 import org.apache.sysds.test.TestUtils;
 import org.apache.sysds.test.component.compress.TestConstants.MatrixTypology;
@@ -51,10 +55,11 @@
 
 public abstract class CompressedTestBase extends TestBase {
 	protected static final Log LOG = LogFactory.getLog(CompressedTestBase.class.getName());
+
 	protected static SparsityType[] usedSparsityTypes = new SparsityType[] { // Sparsity 0.9, 0.1, 0.01 and 0.0
-		// SparsityType.FULL,
-		SparsityType.DENSE,
-		SparsityType.SPARSE, 
+		SparsityType.FULL,
+		// SparsityType.DENSE,
+		SparsityType.SPARSE,
 		// SparsityType.ULTRA_SPARSE,
 		// SparsityType.EMPTY
 	};
@@ -63,50 +68,66 @@
 		// ValueType.RAND,
 		// ValueType.CONST,
 		ValueType.RAND_ROUND,
-		//  ValueType.OLE_COMPRESSIBLE,
+		ValueType.OLE_COMPRESSIBLE,
 		// ValueType.RLE_COMPRESSIBLE,
 	};
 
 	protected static ValueRange[] usedValueRanges = new ValueRange[] {
-		// ValueRange.SMALL, 
-		ValueRange.LARGE,
+		ValueRange.SMALL, 
+		// ValueRange.LARGE,
 		// ValueRange.BYTE
 	};
 
 	private static final int compressionSeed = 7;
 
 	protected static CompressionSettings[] usedCompressionSettings = new CompressionSettings[] {
-		// new CompressionSettingsBuilder().setSamplingRatio(0.1).setAllowSharedDDCDictionary(false)
-		// .setSeed(compressionSeed).setValidCompressions(EnumSet.of(CompressionType.DDC)).setInvestigateEstimate(true).create(),
-		new CompressionSettingsBuilder().setSamplingRatio(0.1)//.setAllowSharedDDCDictionary(true)
-			.setSeed(compressionSeed).setValidCompressions(EnumSet.of(CompressionType.DDC)).setInvestigateEstimate(true)
-			.create(),
+		// CLA TESTS!
+
+		new CompressionSettingsBuilder().setSamplingRatio(0.1).setSeed(compressionSeed)
+			.setValidCompressions(EnumSet.of(CompressionType.DDC)).setInvestigateEstimate(true).create(),
 		new CompressionSettingsBuilder().setSamplingRatio(0.1).setSeed(compressionSeed)
 			.setValidCompressions(EnumSet.of(CompressionType.OLE)).setInvestigateEstimate(true).create(),
 		new CompressionSettingsBuilder().setSamplingRatio(0.1).setSeed(compressionSeed)
 			.setValidCompressions(EnumSet.of(CompressionType.RLE)).setInvestigateEstimate(true).create(),
 		new CompressionSettingsBuilder().setSamplingRatio(0.1).setSeed(compressionSeed).setInvestigateEstimate(true)
 			.create(),
+		new CompressionSettingsBuilder().setSamplingRatio(1.0).setSeed(compressionSeed).setInvestigateEstimate(true)
+			.setAllowSharedDictionary(false).setmaxStaticColGroupCoCode(1).create(),
+
+		// LOSSY TESTS!
+
+		new CompressionSettingsBuilder().setSamplingRatio(0.1).setSeed(compressionSeed)
+			.setValidCompressions(EnumSet.of(CompressionType.DDC)).setInvestigateEstimate(true).setLossy(true).create(),
+		new CompressionSettingsBuilder().setSamplingRatio(0.1).setSeed(compressionSeed)
+			.setValidCompressions(EnumSet.of(CompressionType.OLE)).setInvestigateEstimate(true).setLossy(true).create(),
+		new CompressionSettingsBuilder().setSamplingRatio(0.1).setSeed(compressionSeed)
+			.setValidCompressions(EnumSet.of(CompressionType.RLE)).setInvestigateEstimate(true).setLossy(true).create(),
+
+		new CompressionSettingsBuilder().setSamplingRatio(0.1).setSeed(compressionSeed).setInvestigateEstimate(true)
+			.create(),
+
 		// new CompressionSettingsBuilder().setSamplingRatio(1.0).setSeed(compressionSeed).setInvestigateEstimate(true)
-		// .addValidCompression(CompressionType.QUAN).create(),
-		new CompressionSettingsBuilder().setSamplingRatio(1.0).setSeed(compressionSeed).setInvestigateEstimate(true)
-		.setAllowSharedDDCDictionary(false).setmaxStaticColGroupCoCode(1).create(),
-		new CompressionSettingsBuilder().setSamplingRatio(1.0).setSeed(compressionSeed).setInvestigateEstimate(true)
-		.setAllowSharedDDCDictionary(false).setmaxStaticColGroupCoCode(1).setLossy(true).create(),
+		// .setAllowSharedDictionary(false).setmaxStaticColGroupCoCode(1).setLossy(true).create(),
+
+		// COCODING TESTS!!
+
 		// new CompressionSettingsBuilder().setSamplingRatio(1.0).setSeed(compressionSeed).setInvestigateEstimate(true)
 		// .setAllowSharedDDCDictionary(false).setmaxStaticColGroupCoCode(20).create(),
 		// new CompressionSettingsBuilder().setSamplingRatio(1.0).setSeed(compressionSeed).setInvestigateEstimate(true)
 		// .setAllowSharedDDCDictionary(false).setmaxStaticColGroupCoCode(20).setLossy(true).create()
+
+		// SHARED DICTIONARY TESTS!!
+
 	};
 
 	protected static MatrixTypology[] usedMatrixTypology = new MatrixTypology[] { // Selected Matrix Types
-		// MatrixTypology.SMALL,
+		MatrixTypology.SMALL,
 		// MatrixTypology.FEW_COL,
 		// MatrixTypology.FEW_ROW,
-		MatrixTypology.LARGE,
-		// MatrixTypology.SINGLE_COL,
+		// MatrixTypology.LARGE,
+		MatrixTypology.SINGLE_COL,
 		// MatrixTypology.SINGLE_ROW,
-		// MatrixTypology.L_ROWS,
+		MatrixTypology.L_ROWS,
 		// MatrixTypology.XL_ROWS,
 	};
 
@@ -118,10 +139,10 @@
 	protected MatrixBlock cmbDeCompressed;
 	protected double[][] deCompressed;
 
-	/** Method returning the number of threads used for the operation */
+	/** number of threads used for the operation */
 	protected final int _k;
 
-	protected int sampleTolerance = 1024;
+	protected int sampleTolerance = 4096 * 4;
 
 	protected double lossyTolerance;
 
@@ -138,7 +159,7 @@
 			cmb = pair.getLeft();
 			cmbStats = pair.getRight();
 			if(cmb instanceof CompressedMatrixBlock) {
-				cmbDeCompressed = ((CompressedMatrixBlock) cmb).decompress();
+				cmbDeCompressed = ((CompressedMatrixBlock) cmb).decompress(_k);
 				if(cmbDeCompressed != null) {
 
 					deCompressed = DataConverter.convertToDoubleMatrix(cmbDeCompressed);
@@ -162,8 +183,9 @@
 		 * Tolerance for encoding values is the maximum value in the dataset divided by the number of distinct values
 		 * representable in a single Byte (since we encode our quantization in a Byte)
 		 */
-		lossyTolerance = (double) Math.max(TestConstants.getMaxRangeValue(valueRange),
-			Math.abs(TestConstants.getMinRangeValue(valueRange))) / 127.0;
+		lossyTolerance = (double) (Math.max(TestConstants.getMaxRangeValue(valueRange),
+			Math.abs(TestConstants.getMinRangeValue(valueRange)))) * (1.0 / 127.0) / 2.0;
+		// LOG.debug("TOLERANCE IN TEST:" + lossyTolerance);
 
 	}
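
A worked example of the halved tolerance, assuming round-to-nearest quantization and a hypothetical value range:

    double maxAbs = 500.0;              // e.g. values in [-500, 500]
    double step = maxAbs / 127.0;       // ~3.937 between quantization levels
    double lossyTolerance = step / 2.0; // ~1.969, matching the formula above
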
 
@@ -198,7 +220,7 @@
 				// Assert.assertTrue("Compression Failed \n" + this.toString(), false);
 			}
 			if(compressionSettings.lossy) {
-				TestUtils.compareMatrices(input, deCompressed, lossyTolerance);
+				TestUtils.compareMatrices(input, deCompressed, lossyTolerance, compressionSettings.toString() + "\n");
 			}
 			else {
 				TestUtils.compareMatricesBitAvgDistance(input, deCompressed, 0, 0, compressionSettings.toString());
@@ -238,7 +260,7 @@
 				return; // Input was not compressed then just pass test
 
 			MatrixBlock vector1 = DataConverter
-				.convertToMatrixBlock(TestUtils.generateTestMatrix(cols, 1, 0.5, 1.5, 1.0, 3));
+				.convertToMatrixBlock(TestUtils.generateTestMatrix(cols, 1, 0.9, 1.1, 1.0, 3));
 
 			// ChainType ctype = ChainType.XtwXv;
 			// Linear regression .
@@ -247,7 +269,7 @@
 			}) {
 
 				MatrixBlock vector2 = (ctype == ChainType.XtwXv) ? DataConverter
-					.convertToMatrixBlock(TestUtils.generateTestMatrix(rows, 1, 0.5, 1.5, 1.0, 3)) : null;
+					.convertToMatrixBlock(TestUtils.generateTestMatrix(rows, 1, 0.9, 1.1, 1.0, 3)) : null;
 
 				// matrix-vector uncompressed
 				MatrixBlock ret1 = mb.chainMatrixMultOperations(vector1, vector2, new MatrixBlock(), ctype, _k);
@@ -310,7 +332,7 @@
 			// Make Operator // matrix-vector uncompressed
 			// AggregateBinaryOperator abop = InstructionUtils.getMatMultOperator(_k);
 			AggregateOperator aop = new AggregateOperator(0, Plus.getPlusFnObject());
-			AggregateBinaryOperator abop = new AggregateBinaryOperator(Multiply.getMultiplyFnObject(), aop);
+			AggregateBinaryOperator abop = new AggregateBinaryOperator(Multiply.getMultiplyFnObject(), aop, _k);
 
 			// matrix-vector uncompressed
 			MatrixBlock ret1 = mb.aggregateBinaryOperations(mb, vector, new MatrixBlock(), abop);
@@ -336,4 +358,109 @@
 			throw new RuntimeException(this.toString() + "\n" + e.getMessage(), e);
 		}
 	}
+
+	@Test
+	public void testVectorMatrixMult() {
+		try {
+			if(!(cmb instanceof CompressedMatrixBlock))
+				return; // Input was not compressed then just pass test
+
+			MatrixBlock vector = DataConverter
+				.convertToMatrixBlock(TestUtils.generateTestMatrix(1, rows, 0.9, 1.5, 1.0, 3));
+
+			// Make Operator
+			AggregateBinaryOperator abop = InstructionUtils.getMatMultOperator(_k);
+
+			// vector-matrix uncompressed
+			MatrixBlock ret1 = mb.aggregateBinaryOperations(vector, mb, new MatrixBlock(), abop);
+
+			// vector-matrix compressed
+			MatrixBlock ret2 = cmb.aggregateBinaryOperations(vector, cmb, new MatrixBlock(), abop);
+
+			// compare result with input
+			double[][] d1 = DataConverter.convertToDoubleMatrix(ret1);
+			double[][] d2 = DataConverter.convertToDoubleMatrix(ret2);
+			if(compressionSettings.lossy) {
+				TestUtils.compareMatricesPercentageDistance(d1, d2, 0.35, 0.96, compressionSettings.toString());
+			}
+			else {
+				TestUtils.compareMatricesBitAvgDistance(d1, d2, 10000, 500, compressionSettings.toString());
+			}
+		}
+		catch(Exception e) {
+			e.printStackTrace();
+			throw new RuntimeException(this.toString() + "\n" + e.getMessage(), e);
+		}
+	}
+
+	@Test
+	public void testTransposeSelfMatrixMult() {
+		try {
+			if(!(cmb instanceof CompressedMatrixBlock))
+				return; // Input was not compressed then just pass test
+			for(MMTSJType mType : new MMTSJType[] {MMTSJType.LEFT,
+				// MMTSJType.RIGHT
+			}) {
+				// matrix-vector uncompressed
+				MatrixBlock ret1 = mb.transposeSelfMatrixMultOperations(new MatrixBlock(), mType, _k);
+
+				// matrix-vector compressed
+				MatrixBlock ret2 = cmb.transposeSelfMatrixMultOperations(new MatrixBlock(), mType, _k);
+
+				// compare result with input
+				double[][] d1 = DataConverter.convertToDoubleMatrix(ret1);
+				double[][] d2 = DataConverter.convertToDoubleMatrix(ret2);
+				// The compressed values are likely off by some amount, so compare loosely.
+				if(compressionSettings.lossy) {
+					// Transpose-self multiplication compounds the quantization error,
+					// so the lossy comparison has to be very permissive.
+					TestUtils.compareMatricesPercentageDistance(d1, d2, 0.0, 0.8, compressionSettings.toString());
+				}
+				else {
+					TestUtils.compareMatricesBitAvgDistance(d1, d2, 2048, 64, compressionSettings.toString());
+				}
+			}
+		}
+		catch(Exception e) {
+			e.printStackTrace();
+			throw new RuntimeException(this.toString() + "\n" + e.getMessage(), e);
+		}
+	}
+
+	@Test
+	public void testScalarOperationsSparseUnsafe() {
+		try {
+			if(!(cmb instanceof CompressedMatrixBlock))
+				return; // Input was not compressed then just pass test
+
+			double addValue = 1000;
+			// matrix-scalar uncompressed
+			ScalarOperator sop = new RightScalarOperator(Plus.getPlusFnObject(), addValue);
+			MatrixBlock ret1 = mb.scalarOperations(sop, new MatrixBlock());
+
+			// matrix-scalar compressed
+			MatrixBlock ret2 = cmb.scalarOperations(sop, new MatrixBlock());
+			if(ret2 instanceof CompressedMatrixBlock)
+				ret2 = ((CompressedMatrixBlock) ret2).decompress();
+
+			// compare result with input
+			double[][] d1 = DataConverter.convertToDoubleMatrix(ret1);
+			double[][] d2 = DataConverter.convertToDoubleMatrix(ret2);
+
+			if(compressionSettings.lossy) {
+				double modifiedTolerance = Math.max(TestConstants.getMaxRangeValue(valRange) + addValue,
+					Math.abs(TestConstants.getMinRangeValue(valRange) + addValue)) * 2 / 127.0;
+				TestUtils.compareMatrices(d1, d2, modifiedTolerance);
+			}
+			else {
+				TestUtils.compareMatricesBitAvgDistance(d1, d2, 150, 1, compressionSettings.toString());
+			}
+		}
+		catch(Exception e) {
+			e.printStackTrace();
+			throw new RuntimeException(this.toString() + "\n" + e.getMessage(), e);
+		}
+	}
 }
diff --git a/src/test/java/org/apache/sysds/test/component/compress/CompressedVectorTest.java b/src/test/java/org/apache/sysds/test/component/compress/CompressedVectorTest.java
index 0f42ac4..d407602 100644
--- a/src/test/java/org/apache/sysds/test/component/compress/CompressedVectorTest.java
+++ b/src/test/java/org/apache/sysds/test/component/compress/CompressedVectorTest.java
@@ -19,6 +19,8 @@
 
 package org.apache.sysds.test.component.compress;
 
+import static org.junit.Assert.assertTrue;
+
 import java.util.ArrayList;
 import java.util.Collection;
 
@@ -33,7 +35,6 @@
 import org.apache.sysds.test.component.compress.TestConstants.SparsityType;
 import org.apache.sysds.test.component.compress.TestConstants.ValueRange;
 import org.apache.sysds.test.component.compress.TestConstants.ValueType;
-import org.junit.Ignore;
 import org.junit.Test;
 import org.junit.runner.RunWith;
 import org.junit.runners.Parameterized;
@@ -49,7 +50,7 @@
 		// MatrixTypology.SINGLE_COL_L
 	};
 
-	protected int getK(){
+	protected int getK() {
 		return _k;
 	}
 
@@ -75,8 +76,6 @@
 		super(sparType, valType, valRange, compSettings, matrixTypology, 1);
 	}
 
-
-	@Ignore
 	@Test
 	public void testCentralMoment() throws Exception {
 		// TODO: Make Central Moment Test work on Multi dimensional Matrix
@@ -93,10 +92,14 @@
 			// quantile compressed
 			double ret2 = cmb.cmOperations(cm).getRequiredResult(opType);
 
-			if (compressionSettings.lossy) {
-				TestUtils.compareCellValue(ret1, ret2, lossyTolerance, false);
-			} else {
-				TestUtils.compareScalarBitsJUnit(ret1, ret2, 64);
+			if(compressionSettings.lossy) {
+				double tol = lossyTolerance * 10;
+				assertTrue(
+					this.toString() + ": values uncompressed: " + ret1 + " vs compressed: " + ret2 + " tolerance " + tol,
+					TestUtils.compareCellValue(ret1, ret2, tol, false));
+			}
+			else {
+				assertTrue(this.toString(), TestUtils.compareScalarBits(ret1, ret2, 64));
 			}
 		}
 		catch(Exception e) {
@@ -105,7 +108,6 @@
 		}
 	}
 
-	@Ignore
 	@Test
 	public void testQuantile() {
 		try {
@@ -117,10 +119,11 @@
 			MatrixBlock tmp2 = cmb.sortOperations(null, new MatrixBlock());
 			double ret2 = tmp2.pickValue(0.95);
 
-			if (compressionSettings.lossy) {
+			if(compressionSettings.lossy) {
 				TestUtils.compareCellValue(ret1, ret2, lossyTolerance, false);
-			} else {
-				TestUtils.compareScalarBitsJUnit(ret1, ret2, 64);
+			}
+			else {
+				assertTrue(this.toString(), TestUtils.compareScalarBits(ret1, ret2, 64));
 			}
 		}
 		catch(Exception e) {
diff --git a/src/test/java/org/apache/sysds/test/component/compress/CompressibleInputGenerator.java b/src/test/java/org/apache/sysds/test/component/compress/CompressibleInputGenerator.java
index be78e2f..811a85e 100644
--- a/src/test/java/org/apache/sysds/test/component/compress/CompressibleInputGenerator.java
+++ b/src/test/java/org/apache/sysds/test/component/compress/CompressibleInputGenerator.java
@@ -19,12 +19,18 @@
 
 package org.apache.sysds.test.component.compress;
 
+import static org.junit.Assert.assertTrue;
+
 import java.util.ArrayList;
+import java.util.Arrays;
 import java.util.Collections;
+import java.util.DoubleSummaryStatistics;
 import java.util.List;
 import java.util.Random;
 
 import org.apache.commons.lang3.NotImplementedException;
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
 import org.apache.sysds.runtime.compress.colgroup.ColGroup.CompressionType;
 import org.apache.sysds.runtime.matrix.data.MatrixBlock;
 import org.apache.sysds.runtime.util.DataConverter;
@@ -35,6 +41,7 @@
  * 
  */
 public class CompressibleInputGenerator {
+	protected static final Log LOG = LogFactory.getLog(CompressibleInputGenerator.class.getName());
 
 	public static MatrixBlock getInput(int rows, int cols, CompressionType ct, int nrUnique, double sparsity,
 		int seed) {
@@ -61,7 +68,15 @@
 			default:
 				throw new NotImplementedException("Not implemented generator.");
 		}
+		for(double[] x : output) {
 
+			DoubleSummaryStatistics stat = Arrays.stream(x).summaryStatistics();
+			double maxV = stat.getMax();
+			double minV = stat.getMin();
+			// LOG.debug("MAX: " + maxV + " - MIN:" + minV);
+			assertTrue("generated value " + maxV + " is above the maximum " + max, maxV <= max);
+			assertTrue("generated value " + minV + " is below the minimum " + min, minV >= min);
+		}
 		return output;
 	}
 
@@ -139,32 +154,45 @@
 
 		// Generate the first column.
 		for(int x = 0; x < rows; x++) {
-			if(r.nextDouble() < sparsity) {
-				if(transpose) {
-					matrix[x][0] = values.get(r.nextInt(nrUnique));
-				}
-				else {
-					matrix[0][x] = values.get(r.nextInt(nrUnique));
-				}
+			if(transpose) {
+				matrix[x][0] = values.get(r.nextInt(nrUnique));
+				// LOG.debug(matrix[x][0]);
+			}
+			else {
+				matrix[0][x] = values.get(r.nextInt(nrUnique));
 			}
 		}
 
 		for(int y = 1; y < cols; y++) {
 			for(int x = 0; x < rows; x++) {
-				// if(r.nextDouble() < sparsity) {
-				if(transpose) {
-					if(matrix[x][0] != 0) {
-						matrix[x][y] = (matrix[x][0] * y + y) % (max - min) + min;
+				if(r.nextDouble() < sparsity) {
+					if(transpose) {
+						double off = (double) y;
+						int v = (int) (matrix[x][0] * off);
+						matrix[x][y] = Math.abs((v) % ((int) (max - min))) + min;
+					}
+					else {
+						double off = (double) y;
+						int v = (int) (matrix[0][x] * off);
+						matrix[y][x] = Math.abs((v) % ((int) (max - min))) + min;
+						// matrix[y][x] = ((int) (matrix[0][x] * y + y)) % ((int) (max - min)) + min;
 					}
 				}
-				else {
-					if(matrix[0][x] != 0) {
-						matrix[y][x] = (matrix[0][x] * y + y) % (max - min) + min;
-					}
-				}
-				// }
 			}
 		}
+
+		for(int x = 0; x < rows; x++) {
+			if(r.nextDouble() > sparsity) {
+				if(transpose) {
+					matrix[x][0] = 0.0;
+					// LOG.debug(matrix[x][0]);
+				}
+				else {
+					matrix[0][x] = 0.0;
+				}
+			}
+		}
+
 		return matrix;
 	}
 
@@ -186,9 +214,10 @@
 	private static List<Double> getNRandomValues(int nrUnique, Random r, int max, int min) {
 		List<Double> values = new ArrayList<>();
 		for(int i = 0; i < nrUnique; i++) {
-			double v = (r.nextDouble() * (double) (max - min)) + (double) min;
-			values.add(Math.floor(v));
+			values.add((r.nextDouble() * (max - min)) + min);
+			// previously the values were floored: values.add(Math.floor(v));
 		}
+		// LOG.debug(values);
 		return values;
 	}
 }
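
The derived columns above stay inside the requested range via an absolute modulo on the first-column value; a standalone sketch of the mapping (helper name and free-standing form are illustrative):

    static double deriveCell(double base, int y, int min, int max) {
        int v = (int) (base * y);               // first-column value scaled by column index
        return Math.abs(v % (max - min)) + min; // folded into [min, max)
    }
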
diff --git a/src/test/java/org/apache/sysds/test/component/compress/ParCompressedMatrixTest.java b/src/test/java/org/apache/sysds/test/component/compress/ParCompressedMatrixTest.java
index 23fd604..91cc710 100644
--- a/src/test/java/org/apache/sysds/test/component/compress/ParCompressedMatrixTest.java
+++ b/src/test/java/org/apache/sysds/test/component/compress/ParCompressedMatrixTest.java
@@ -19,126 +19,23 @@
 
 package org.apache.sysds.test.component.compress;
 
-import org.apache.sysds.lops.MMTSJ.MMTSJType;
-import org.apache.sysds.runtime.compress.CompressedMatrixBlock;
 import org.apache.sysds.runtime.compress.CompressionSettings;
 import org.apache.sysds.runtime.controlprogram.parfor.stat.InfrastructureAnalyzer;
-import org.apache.sysds.runtime.instructions.InstructionUtils;
-import org.apache.sysds.runtime.matrix.data.MatrixBlock;
-import org.apache.sysds.runtime.matrix.operators.AggregateBinaryOperator;
 import org.apache.sysds.runtime.matrix.operators.AggregateUnaryOperator;
-import org.apache.sysds.runtime.util.DataConverter;
-import org.apache.sysds.test.TestUtils;
 import org.apache.sysds.test.component.compress.TestConstants.MatrixTypology;
 import org.apache.sysds.test.component.compress.TestConstants.SparsityType;
 import org.apache.sysds.test.component.compress.TestConstants.ValueRange;
 import org.apache.sysds.test.component.compress.TestConstants.ValueType;
-import org.junit.Test;
 import org.junit.runner.RunWith;
 import org.junit.runners.Parameterized;
 
 @RunWith(value = Parameterized.class)
 public class ParCompressedMatrixTest extends AbstractCompressedUnaryTests {
 
-
 	public ParCompressedMatrixTest(SparsityType sparType, ValueType valType, ValueRange valRange,
 		CompressionSettings compressionSettings, MatrixTypology matrixTypology) {
-		super(sparType, valType, valRange, compressionSettings, matrixTypology, InfrastructureAnalyzer.getLocalParallelism());
-	}
-
-	@Test
-	public void testGetValue() {
-
-		try {
-			if(!(cmb instanceof CompressedMatrixBlock))
-				return; // Input was not compressed then just pass test
-
-			for(int i = 0; i < rows; i++)
-				for(int j = 0; j < cols; j++) {
-					double ulaVal = input[i][j];
-					double claVal = cmb.getValue(i, j); // calls quickGetValue internally
-					if(compressionSettings.lossy) {
-						TestUtils.compareCellValue(ulaVal, claVal, lossyTolerance, false);
-					}
-					else {
-						TestUtils.compareScalarBitsJUnit(ulaVal, claVal, 0); // Should be exactly same value
-					}
-				}
-		}
-		catch(Exception e) {
-			e.printStackTrace();
-			throw new RuntimeException(this.toString() + "\n" + e.getMessage(), e);
-		}
-	}
-
-	@Test
-	public void testTransposeSelfMatrixMult() {
-		try {
-			if(!(cmb instanceof CompressedMatrixBlock))
-				return; // Input was not compressed then just pass test
-			// ChainType ctype = ChainType.XtwXv;
-			for(MMTSJType mType : new MMTSJType[] {MMTSJType.LEFT,
-				// MMTSJType.RIGHT
-			}) {
-				// matrix-vector uncompressed
-				MatrixBlock ret1 = mb.transposeSelfMatrixMultOperations(new MatrixBlock(), mType, _k);
-
-				// matrix-vector compressed
-				MatrixBlock ret2 = cmb.transposeSelfMatrixMultOperations(new MatrixBlock(), mType, _k);
-
-				// compare result with input
-				double[][] d1 = DataConverter.convertToDoubleMatrix(ret1);
-				double[][] d2 = DataConverter.convertToDoubleMatrix(ret2);
-				// High probability that The value is off by some amount
-				if(compressionSettings.lossy) {
-					/**
-					 * Probably the worst thing you can do to increase the amount the values are estimated wrong
-					 */
-					TestUtils.compareMatricesPercentageDistance(d1, d2, 0.0, 0.8, compressionSettings.toString());
-				}
-				else {
-					TestUtils.compareMatricesBitAvgDistance(d1, d2, 2048, 20, compressionSettings.toString());
-				}
-			}
-		}
-		catch(Exception e) {
-			e.printStackTrace();
-			throw new RuntimeException(this.toString() + "\n" + e.getMessage(), e);
-		}
-	}
-
-	@Test
-	public void testVectorMatrixMult() {
-		try {
-			if(!(cmb instanceof CompressedMatrixBlock))
-				return; // Input was not compressed then just pass test
-
-			MatrixBlock vector = DataConverter
-				.convertToMatrixBlock(TestUtils.generateTestMatrix(1, rows, 1, 1, 1.0, 3));
-
-			// Make Operator
-			AggregateBinaryOperator abop = InstructionUtils.getMatMultOperator(_k);
-
-			// vector-matrix uncompressed
-			MatrixBlock ret1 = mb.aggregateBinaryOperations(vector, mb, new MatrixBlock(), abop);
-
-			// vector-matrix compressed
-			MatrixBlock ret2 = cmb.aggregateBinaryOperations(vector, cmb, new MatrixBlock(), abop);
-
-			// compare result with input
-			double[][] d1 = DataConverter.convertToDoubleMatrix(ret1);
-			double[][] d2 = DataConverter.convertToDoubleMatrix(ret2);
-			if(compressionSettings.lossy) {
-				TestUtils.compareMatricesPercentageDistance(d1, d2, 0.35, 0.96, compressionSettings.toString());
-			}
-			else {
-				TestUtils.compareMatricesBitAvgDistance(d1, d2, 10000, 500, compressionSettings.toString());
-			}
-		}
-		catch(Exception e) {
-			e.printStackTrace();
-			throw new RuntimeException(this.toString() + "\n" + e.getMessage(), e);
-		}
+		super(sparType, valType, valRange, compressionSettings, matrixTypology,
+			InfrastructureAnalyzer.getLocalParallelism());
 	}
 
 	@Override
diff --git a/src/test/java/org/apache/sysds/test/component/compress/TestBase.java b/src/test/java/org/apache/sysds/test/component/compress/TestBase.java
index c6ab8d9..f31ad48 100644
--- a/src/test/java/org/apache/sysds/test/component/compress/TestBase.java
+++ b/src/test/java/org/apache/sysds/test/component/compress/TestBase.java
@@ -26,14 +26,13 @@
 import org.apache.sysds.runtime.compress.colgroup.ColGroup.CompressionType;
 import org.apache.sysds.runtime.matrix.data.MatrixBlock;
 import org.apache.sysds.runtime.util.DataConverter;
-import org.apache.sysds.test.AutomatedTestBase;
 import org.apache.sysds.test.TestUtils;
 import org.apache.sysds.test.component.compress.TestConstants.MatrixTypology;
 import org.apache.sysds.test.component.compress.TestConstants.SparsityType;
 import org.apache.sysds.test.component.compress.TestConstants.ValueRange;
 import org.apache.sysds.test.component.compress.TestConstants.ValueType;
 
-public class TestBase extends AutomatedTestBase {
+public class TestBase {
 
 	protected ValueType valType;
 	protected ValueRange valRange;
@@ -53,7 +52,7 @@
 	protected MatrixBlock mb;
 
 	public TestBase(SparsityType sparType, ValueType valType, ValueRange valueRange,
-			CompressionSettings compressionSettings, MatrixTypology MatrixTypology) {
+		CompressionSettings compressionSettings, MatrixTypology MatrixTypology) {
 
 		this.sparsity = TestConstants.getSparsityValue(sparType);
 		this.rows = TestConstants.getNumberOfRows(MatrixTypology);
@@ -63,7 +62,7 @@
 		this.min = TestConstants.getMinRangeValue(valueRange);
 
 		try {
-			switch (valType) {
+			switch(valType) {
 				case CONST:
 					this.min = this.max;
 					// Do not break; fall through to the RAND case below.
@@ -75,23 +74,36 @@
 					break;
 				case OLE_COMPRESSIBLE:
 					// Note the CompressibleInputGenerator generates an already transposed input
-					// normally, therefore last
-					// argument is true, to build a "normal" matrix.
-					this.input = CompressibleInputGenerator.getInputDoubleMatrix(rows, cols, CompressionType.OLE,
-							(max - min) / 10, max, min, sparsity, 7, true);
+					// normally; therefore the last argument is true, to build a non-transposed matrix.
+					this.input = CompressibleInputGenerator.getInputDoubleMatrix(rows,
+						cols,
+						CompressionType.OLE,
+						(max - min) / 10,
+						max,
+						min,
+						sparsity,
+						7,
+						true);
 					break;
 				case RLE_COMPRESSIBLE:
-					this.input = CompressibleInputGenerator.getInputDoubleMatrix(rows, cols, CompressionType.RLE,
-							(max - min) / 10, max, min, sparsity, 7, true);
+					this.input = CompressibleInputGenerator.getInputDoubleMatrix(rows,
+						cols,
+						CompressionType.RLE,
+						(max - min) / 10,
+						max,
+						min,
+						sparsity,
+						7,
+						true);
 					break;
 				default:
 					throw new NotImplementedException("Not Implemented Test Value type input generator");
 			}
 
-		} catch (Exception e) {
+		}
+		catch(Exception e) {
 			e.printStackTrace();
 			assertTrue("Error in construction of input Test Base", false);
-			// TODO: handle exception
 		}
 
 		this.valRange = valueRange;
@@ -102,14 +114,6 @@
 	}
 
 	@Override
-	public void setUp() {
-	}
-
-	@Override
-	public void tearDown() {
-	}
-
-	@Override
 	public String toString() {
 		StringBuilder builder = new StringBuilder();
 
@@ -122,7 +126,6 @@
 		builder.append(String.format("%6s%12s", "Min:", min));
 		builder.append(String.format("%6s%12s", "Max:", max));
 		builder.append(String.format("%6s%5s", "Spar:", sparsity));
-
 		builder.append(String.format("%6s%8s", "CP:", compressionSettings));
 
 		return builder.toString();
diff --git a/src/test/java/org/apache/sysds/test/component/compress/TestConstants.java b/src/test/java/org/apache/sysds/test/component/compress/TestConstants.java
index 83c2f37..a74c138 100644
--- a/src/test/java/org/apache/sysds/test/component/compress/TestConstants.java
+++ b/src/test/java/org/apache/sysds/test/component/compress/TestConstants.java
@@ -28,8 +28,8 @@
 	private static final int cols[] = {20, 20, 13, 998, 321, 1, 8, 10, 1};
 	private static final double[] sparsityValues = {0.9, 0.1, 0.01, 0.0, 1.0};
 
-	private static final int[] mins = {-10, -2147};
-	private static final int[] maxs = {10, 2147};
+	private static final int[] mins = {-10, -127 * 2};
+	private static final int[] maxs = {10, 127};
 
 	public enum SparsityType {
 		DENSE, SPARSE, ULTRA_SPARSE, EMPTY, FULL
@@ -51,17 +51,14 @@
 		SINGLE_ROW, // Single Row with some columns
 		SINGLE_COL, // Single Column with some rows
 		L_ROWS, // Many Rows
-		XL_ROWS, // A LOT of rows. 
+		XL_ROWS, // A LOT of rows.
 		SINGLE_COL_L, // Single Column large.
 	}
 
 	public enum ValueRange {
-		SMALL, 
-		LARGE,
-		BYTE
+		SMALL, LARGE, BYTE
 	}
 
-
 	public static double getSparsityValue(SparsityType sparsityType) {
 		switch(sparsityType) {
 			case DENSE:
@@ -75,7 +72,7 @@
 			case FULL:
 				return sparsityValues[4];
 			default:
-				throw new RuntimeException("Invalid Sparsity type"); 
+				throw new RuntimeException("Invalid Sparsity type");
 		}
 	}
 
@@ -88,7 +85,7 @@
 			case BYTE:
 				return -127;
 			default:
-			throw new RuntimeException("Invalid range value enum type"); 
+				throw new RuntimeException("Invalid range value enum type");
 		}
 	}
 
@@ -101,7 +98,7 @@
 			case BYTE:
 				return 127;
 			default:
-				throw new RuntimeException("Invalid range value enum type"); 
+				throw new RuntimeException("Invalid range value enum type");
 		}
 	}
 
@@ -126,7 +123,7 @@
 			case SINGLE_COL_L:
 				return rows[8];
 			default:
-				throw new RuntimeException("Invalid matrix enum type"); 
+				throw new RuntimeException("Invalid matrix enum type");
 		}
 	}
 
@@ -151,7 +148,7 @@
 			case SINGLE_COL_L:
 				return cols[8];
 			default:
-				throw new RuntimeException("Invalid matrix enum type"); 
+				throw new RuntimeException("Invalid matrix enum type");
 		}
 	}
 }
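The value ranges above shrink from +/-2147 to [-254, 127], and the BYTE range stays at +/-127, exactly what a signed byte can hold; that matters for the lossy 8-bit paths these constants feed. As a rough illustration only, assuming a naive max-magnitude scaling scheme (this is not the project's actual QDictionary logic), quantizing doubles into byte codes could look like:

public class Q8Sketch {
	// Assumed naive 8-bit quantization: scale by the largest magnitude so
	// every code lands in [-127, 127]; decode is simply code * scale.
	static byte[] quantize(double[] values) {
		double maxAbs = 0;
		for (double v : values)
			maxAbs = Math.max(maxAbs, Math.abs(v));
		double scale = (maxAbs == 0) ? 1 : maxAbs / 127.0;
		byte[] codes = new byte[values.length];
		for (int i = 0; i < values.length; i++)
			codes[i] = (byte) Math.round(values[i] / scale);
		return codes;
	}
}

Integer inputs already inside [-127, 127] survive such a scheme with bounded rounding error, which is why byte-sized test ranges are the natural choice for the lossy cases.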
diff --git a/src/test/java/org/apache/sysds/test/component/compress/colgroup/JolEstimateDDCTest.java b/src/test/java/org/apache/sysds/test/component/compress/colgroup/JolEstimateDDCTest.java
index e36da12..6973d3f 100644
--- a/src/test/java/org/apache/sysds/test/component/compress/colgroup/JolEstimateDDCTest.java
+++ b/src/test/java/org/apache/sysds/test/component/compress/colgroup/JolEstimateDDCTest.java
@@ -44,8 +44,8 @@
 		// DDC is different in that it is a dense compression
 		// that also encodes 0 values the same as all the other values.
 
-		mb = DataConverter.convertToMatrixBlock(new double[][] {{0}});
-		tests.add(new Object[] {mb, 8});
+		// mb = DataConverter.convertToMatrixBlock(new double[][] {{0}});
+		// tests.add(new Object[] {mb, 0});
 
 		mb = DataConverter.convertToMatrixBlock(new double[][] {{1}});
 		tests.add(new Object[] {mb, 0});
@@ -71,34 +71,34 @@
 		// Random sparse: very big, because 0 is materialized.
 		mb = DataConverter
 			.convertToMatrixBlock(TestUtils.round(TestUtils.generateTestMatrix(1, 4000, 0, 254, 0.01, 7)));
-		tests.add(new Object[] {mb, 16});
+		tests.add(new Object[] {mb, 0});
 		mb = DataConverter
 			.convertToMatrixBlock(TestUtils.round(TestUtils.generateTestMatrix(1, 8000, 0, 254, 0.01, 7)));
-		tests.add(new Object[] {mb, 8});
+		tests.add(new Object[] {mb, 0});
 		mb = DataConverter
 			.convertToMatrixBlock(TestUtils.round(TestUtils.generateTestMatrix(1, 16000, 0, 254, 0.01, 7)));
-		tests.add(new Object[] {mb, 8});
+		tests.add(new Object[] {mb, 0});
 
 		mb = DataConverter
 			.convertToMatrixBlock(TestUtils.round(TestUtils.generateTestMatrix(1, 4000, 0, 254, 0.001, 7)));
-		tests.add(new Object[] {mb, 8});
+		tests.add(new Object[] {mb, 0});
 		mb = DataConverter
 			.convertToMatrixBlock(TestUtils.round(TestUtils.generateTestMatrix(1, 8000, 0, 254, 0.001, 7)));
-		tests.add(new Object[] {mb, 8});
+		tests.add(new Object[] {mb, 0});
 		mb = DataConverter
 			.convertToMatrixBlock(TestUtils.round(TestUtils.generateTestMatrix(1, 16000, 0, 254, 0.001, 7)));
-		tests.add(new Object[] {mb, 8});
+		tests.add(new Object[] {mb, 0});
 
 		// DDC2 instances need more than 255 unique values
 
 		mb = DataConverter.convertToMatrixBlock(TestUtils.round(TestUtils.generateTestMatrix(1, 4000, 0, 512, 0.7, 7)));
-		tests.add(new Object[] {mb, 8});
+		tests.add(new Object[] {mb, 0});
 		mb = DataConverter
 			.convertToMatrixBlock(TestUtils.round(TestUtils.generateTestMatrix(1, 8000, 0, 1024, 0.7, 7)));
-		tests.add(new Object[] {mb, 8});
+		tests.add(new Object[] {mb, 0});
 		mb = DataConverter
 			.convertToMatrixBlock(TestUtils.round(TestUtils.generateTestMatrix(1, 16000, 0, 2048, 0.7, 7)));
-		tests.add(new Object[] {mb, 8});
+		tests.add(new Object[] {mb, 0});
 
 		return tests;
 	}
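For orientation on the expected sizes above: DDC, as the comment at the top of this test notes, encodes every row densely, zeros included, as an index into a shared dictionary, so sparse inputs inflate the distinct-value count because 0 is materialized. A toy version of that layout (deliberately simplified, not the actual ColGroupDDC):

import java.util.ArrayList;
import java.util.List;

public class DdcSketch {
	final List<Double> dict = new ArrayList<>(); // distinct values, 0 included
	final int[] codes;                           // one dictionary index per row

	DdcSketch(double[] column) {
		codes = new int[column.length];
		for (int i = 0; i < column.length; i++) {
			int idx = dict.indexOf(column[i]); // autoboxed linear lookup, fine for a sketch
			if (idx < 0) {
				idx = dict.size();
				dict.add(column[i]);
			}
			codes[i] = idx;
		}
	}
}

With at most 255 distinct values each code fits a single byte (the DDC1 case); the last three matrices above exceed that on purpose to force the two-byte DDC2 layout.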
diff --git a/src/test/java/org/apache/sysds/test/component/compress/colgroup/JolEstimateOLETest.java b/src/test/java/org/apache/sysds/test/component/compress/colgroup/JolEstimateOLETest.java
index 4d20eef..16a4248 100644
--- a/src/test/java/org/apache/sysds/test/component/compress/colgroup/JolEstimateOLETest.java
+++ b/src/test/java/org/apache/sysds/test/component/compress/colgroup/JolEstimateOLETest.java
@@ -40,89 +40,89 @@
 
 		MatrixBlock mb;
 		// base tests
-		mb = DataConverter.convertToMatrixBlock(new double[][] { { 1 } });
-		tests.add(new Object[] { mb,  0 });
-		mb = DataConverter.convertToMatrixBlock(new double[][] { { 0 } });
-		tests.add(new Object[] { mb,  0 });
-		mb = DataConverter.convertToMatrixBlock(new double[][] { { 0, 0, 0, 0, 0 } });
-		tests.add(new Object[] { mb,  0 });
+		mb = DataConverter.convertToMatrixBlock(new double[][] {{1}});
+		tests.add(new Object[] {mb, 0});
+		mb = DataConverter.convertToMatrixBlock(new double[][] {{0}});
+		tests.add(new Object[] {mb, 0});
+		mb = DataConverter.convertToMatrixBlock(new double[][] {{0, 0, 0, 0, 0}});
+		tests.add(new Object[] {mb, 0});
 
 		// The size of the compression increases with repeated values.
-		mb = DataConverter.convertToMatrixBlock(new double[][] { { 0, 0, 0, 0, 5, 0 } });
-		tests.add(new Object[] { mb,  0 });
-		mb = DataConverter.convertToMatrixBlock(new double[][] { { 0, 0, 0, 0, 5, 5, 0 } });
-		tests.add(new Object[] { mb,  0 });
-		mb = DataConverter.convertToMatrixBlock(new double[][] { { 0, 0, 0, 0, 5, 5, 5, 0 } });
-		tests.add(new Object[] { mb,  0 });
-		mb = DataConverter.convertToMatrixBlock(new double[][] { { 0, 0, 0, 0, 5, 5, 5, 5, 5, 5 } });
-		tests.add(new Object[] { mb,  0 });
+		mb = DataConverter.convertToMatrixBlock(new double[][] {{0, 0, 0, 0, 5, 0}});
+		tests.add(new Object[] {mb, 0});
+		mb = DataConverter.convertToMatrixBlock(new double[][] {{0, 0, 0, 0, 5, 5, 0}});
+		tests.add(new Object[] {mb, 0});
+		mb = DataConverter.convertToMatrixBlock(new double[][] {{0, 0, 0, 0, 5, 5, 5, 0}});
+		tests.add(new Object[] {mb, 0});
+		mb = DataConverter.convertToMatrixBlock(new double[][] {{0, 0, 0, 0, 5, 5, 5, 5, 5, 5}});
+		tests.add(new Object[] {mb, 0});
 
 		// all values grow by 1 if a new value is introduced
-		mb = DataConverter.convertToMatrixBlock(new double[][] { { 0, 0, 0, 0, 5, 7, 0 } });
-		tests.add(new Object[] { mb,  0 });
-		mb = DataConverter.convertToMatrixBlock(new double[][] { { 0, 0, 0, 0, 5, 2, 1, 0 } });
-		tests.add(new Object[] { mb,  0 });
-		mb = DataConverter.convertToMatrixBlock(new double[][] { { 0, 0, 0, 0, 5, 2, 1, 3, 6, 7 } });
-		tests.add(new Object[] { mb,  0 });
+		mb = DataConverter.convertToMatrixBlock(new double[][] {{0, 0, 0, 0, 5, 7, 0}});
+		tests.add(new Object[] {mb, 0});
+		mb = DataConverter.convertToMatrixBlock(new double[][] {{0, 0, 0, 0, 5, 2, 1, 0}});
+		tests.add(new Object[] {mb, 0});
+		mb = DataConverter.convertToMatrixBlock(new double[][] {{0, 0, 0, 0, 5, 2, 1, 3, 6, 7}});
+		tests.add(new Object[] {mb, 0});
 
 		// Dense random: horrible compression at full precision
 		mb = DataConverter.convertToMatrixBlock(TestUtils.generateTestMatrix(1, 100, 0, 100, 1.0, 7));
-		tests.add(new Object[] { mb,  0 });
+		tests.add(new Object[] {mb, 0});
 		mb = DataConverter.convertToMatrixBlock(TestUtils.generateTestMatrix(1, 1000, 0, 100, 1.0, 7));
-		tests.add(new Object[] { mb,  0 });
+		tests.add(new Object[] {mb, 0});
 		mb = DataConverter.convertToMatrixBlock(TestUtils.generateTestMatrix(1, 10000, 0, 100, 1.0, 7));
-		tests.add(new Object[] { mb,  0 });
+		tests.add(new Object[] {mb, 0});
 
 		// Random rounded numbers dense
 		mb = DataConverter.convertToMatrixBlock(TestUtils.round(TestUtils.generateTestMatrix(1, 1523, 0, 99, 1.0, 7)));
-		tests.add(new Object[] { mb,  0 });
+		tests.add(new Object[] {mb, 0});
 		mb = DataConverter.convertToMatrixBlock(TestUtils.round(TestUtils.generateTestMatrix(1, 4000, 0, 255, 1.0, 7)));
-		tests.add(new Object[] { mb,  0 });
+		tests.add(new Object[] {mb, 0});
 
 		// Sparse rounded numbers
 		mb = DataConverter.convertToMatrixBlock(TestUtils.round(TestUtils.generateTestMatrix(1, 1523, 0, 99, 0.1, 7)));
-		tests.add(new Object[] { mb,  0 });
+		tests.add(new Object[] {mb, 0});
 		mb = DataConverter
-				.convertToMatrixBlock(TestUtils.round(TestUtils.generateTestMatrix(1, 1621, 0, 99, 0.1, 142)));
-		tests.add(new Object[] { mb,  0 });
+			.convertToMatrixBlock(TestUtils.round(TestUtils.generateTestMatrix(1, 1621, 0, 99, 0.1, 142)));
+		tests.add(new Object[] {mb, 0});
 		mb = DataConverter
-				.convertToMatrixBlock(TestUtils.round(TestUtils.generateTestMatrix(1, 2321, 0, 99, 0.1, 512)));
-		tests.add(new Object[] { mb,  0 });
+			.convertToMatrixBlock(TestUtils.round(TestUtils.generateTestMatrix(1, 2321, 0, 99, 0.1, 512)));
+		tests.add(new Object[] {mb, 0});
 		mb = DataConverter.convertToMatrixBlock(TestUtils.round(TestUtils.generateTestMatrix(1, 4000, 0, 255, 0.1, 7)));
-		tests.add(new Object[] { mb,  0 });
+		tests.add(new Object[] {mb, 0});
 
 		mb = DataConverter.convertToMatrixBlock(TestUtils.round(TestUtils.generateTestMatrix(1, 1523, 0, 99, 0.5, 7)));
-		tests.add(new Object[] { mb,  0 });
+		tests.add(new Object[] {mb, 0});
 		mb = DataConverter
-				.convertToMatrixBlock(TestUtils.round(TestUtils.generateTestMatrix(1, 1621, 0, 99, 0.5, 142)));
-		tests.add(new Object[] { mb,  0 });
+			.convertToMatrixBlock(TestUtils.round(TestUtils.generateTestMatrix(1, 1621, 0, 99, 0.5, 142)));
+		tests.add(new Object[] {mb, 0});
 		mb = DataConverter
-				.convertToMatrixBlock(TestUtils.round(TestUtils.generateTestMatrix(1, 2321, 0, 99, 0.5, 512)));
-		tests.add(new Object[] { mb,  0 });
+			.convertToMatrixBlock(TestUtils.round(TestUtils.generateTestMatrix(1, 2321, 0, 99, 0.5, 512)));
+		tests.add(new Object[] {mb, 0});
 		mb = DataConverter.convertToMatrixBlock(TestUtils.round(TestUtils.generateTestMatrix(1, 4000, 0, 255, 0.5, 7)));
-		tests.add(new Object[] { mb,  0 });
+		tests.add(new Object[] {mb, 0});
 
 		// Example from the paper
-		mb = DataConverter.convertToMatrixBlock(
-				new double[][] { { 7, 3, 7, 7, 3, 7, 3, 3, 7, 3 }, { 6, 4, 6, 5, 4, 5, 4, 4, 6, 4 } });
-		tests.add(new Object[] { mb,  0 });
+		mb = DataConverter
+			.convertToMatrixBlock(new double[][] {{7, 3, 7, 7, 3, 7, 3, 3, 7, 3}, {6, 4, 6, 5, 4, 5, 4, 4, 6, 4}});
+		tests.add(new Object[] {mb, 0});
 
 		// Dream Inputs
-		int[] cols = new int[] { 2, 6, 111 };
-		int[] rows = new int[] { 10, 121, 513 };
-		int[] unique = new int[] { 3, 5 };
-		for (int y : cols) {
-			for (int x : rows) {
-				for (int u : unique) {
+		int[] cols = new int[] {2, 6, 111};
+		int[] rows = new int[] {10, 121, 513};
+		int[] unique = new int[] {3, 5};
+		for(int y : cols) {
+			for(int x : rows) {
+				for(int u : unique) {
 					mb = CompressibleInputGenerator.getInput(x, y, CompressionType.OLE, u, 1.0, 5);
-					tests.add(new Object[] { mb,  0 });
+					tests.add(new Object[] {mb, 0});
 				}
 			}
 		}
 
 		// Sparse test.
 		mb = CompressibleInputGenerator.getInput(571, 1, CompressionType.OLE, 40, 0.6, 5);
-		tests.add(new Object[] { mb,  0 });
+		tests.add(new Object[] {mb, 0});
 
 		return tests;
 	}
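The size remarks in this test ("the size of the compression increases with repeated values", "all values grow by 1 if a new value is introduced") follow directly from the offset-list layout: each distinct nonzero value owns a list of the row positions where it occurs, so a repeat lengthens one existing list while a new distinct value adds a whole new list plus a dictionary entry. A toy rendition of that idea (the real ColGroupOLE stores these offsets in a far more compact segmented form):

import java.util.ArrayList;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;

public class OleSketch {
	// Map each distinct nonzero value to the row indexes where it occurs.
	static Map<Double, List<Integer>> encode(double[] column) {
		Map<Double, List<Integer>> offsets = new LinkedHashMap<>();
		for (int i = 0; i < column.length; i++)
			if (column[i] != 0)
				offsets.computeIfAbsent(column[i], k -> new ArrayList<>()).add(i);
		return offsets;
	}
}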
diff --git a/src/test/java/org/apache/sysds/test/component/compress/colgroup/JolEstimateTest.java b/src/test/java/org/apache/sysds/test/component/compress/colgroup/JolEstimateTest.java
index 25db7d5..6e085d4 100644
--- a/src/test/java/org/apache/sysds/test/component/compress/colgroup/JolEstimateTest.java
+++ b/src/test/java/org/apache/sysds/test/component/compress/colgroup/JolEstimateTest.java
@@ -32,7 +32,7 @@
 import org.apache.sysds.runtime.compress.estim.CompressedSizeEstimator;
 import org.apache.sysds.runtime.compress.estim.CompressedSizeEstimatorFactory;
 import org.apache.sysds.runtime.compress.estim.CompressedSizeInfoColGroup;
-import org.apache.sysds.runtime.compress.utils.AbstractBitmap;
+import org.apache.sysds.runtime.compress.utils.ABitmap;
 import org.apache.sysds.runtime.matrix.data.MatrixBlock;
 import org.junit.Test;
 import org.junit.runner.RunWith;
@@ -72,9 +72,9 @@
 			colIndexes[x] = x;
 		}
 		try {
-			AbstractBitmap ubm = BitmapEncoder.extractBitmap(colIndexes, mbt, cs);
+			ABitmap ubm = BitmapEncoder.extractBitmap(colIndexes, mbt, cs);
 			cg = ColGroupFactory.compress(colIndexes, mbt.getNumColumns(), ubm, getCT(), cs, mbt);
-			AbstractBitmap ubml = BitmapEncoder.extractBitmap(colIndexes, mbt, csl);
+			ABitmap ubml = BitmapEncoder.extractBitmap(colIndexes, mbt, csl);
 			cgl = ColGroupFactory.compress(colIndexes, mbt.getNumColumns(), ubml, getCT(), csl, mbt);
 
 		}