blob: 776dd50360d8e364a9dcf7e012fdb7f297c6c764 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.sysds.runtime.compress.cocode;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.stream.Collectors;
import org.apache.commons.lang.ArrayUtils;
import org.apache.sysds.runtime.compress.CompressionSettings;
import org.apache.sysds.runtime.compress.cocode.PlanningCoCoder.GroupableColInfo;
import org.apache.sysds.runtime.compress.utils.IntArrayList;
import org.apache.sysds.runtime.util.SortUtils;
/**
* Column group partitioning with bin packing heuristic.
*/
public class ColumnGroupPartitionerBinPacking extends ColumnGroupPartitioner {
private static final boolean FIRST_FIT_DEC = true;
private static final int MAX_COL_FIRST_FIT = 16384;
private static final int MAX_COL_PER_GROUP = 1024;
// we use a constant partition size (independent of the number of rows
// in order to ensure constant compression speed independent of blocking)
public static double BIN_CAPACITY = 0.000032; // higher values, more grouping
@Override
public List<int[]> partitionColumns(List<Integer> groupCols, HashMap<Integer, GroupableColInfo> groupColsInfo,
CompressionSettings cs) {
// obtain column weights
int[] items = new int[groupCols.size()];
double[] itemWeights = new double[groupCols.size()];
for(int i = 0; i < groupCols.size(); i++) {
int col = groupCols.get(i);
items[i] = col;
itemWeights[i] = groupColsInfo.get(col).cardRatio;
}
// run first fit heuristic over sequences of at most MAX_COL_FIRST_FIT
// items to ensure robustness for matrices with many columns due to O(n^2)
List<IntArrayList> bins = new ArrayList<>();
for(int i = 0; i < items.length; i += MAX_COL_FIRST_FIT) {
// extract sequence of items and item weights
int iu = Math.min(i + MAX_COL_FIRST_FIT, items.length);
int[] litems = Arrays.copyOfRange(items, i, iu);
double[] litemWeights = Arrays.copyOfRange(itemWeights, i, iu);
// sort items (first fit decreasing)
if(FIRST_FIT_DEC) {
SortUtils.sortByValue(0, litems.length, litemWeights, litems);
ArrayUtils.reverse(litems);
ArrayUtils.reverse(litemWeights);
}
// partition columns via bin packing
bins.addAll(packFirstFit(litems, litemWeights));
}
// extract native int arrays for individual bins
return bins.stream().map(b -> b.extractValues(true)).collect(Collectors.toList());
}
/**
* NOTE: upper bound is 17/10 OPT
*
* @param items the items in terms of columns
* @param itemWeights the weights of the items
* @return
*/
private static List<IntArrayList> packFirstFit(int[] items, double[] itemWeights) {
List<IntArrayList> bins = new ArrayList<>();
double[] binWeights = new double[16];
for(int i = 0; i < items.length; i++) {
// add to existing bin
boolean assigned = false;
for(int j = 0; j < bins.size(); j++) {
double newBinWeight = binWeights[j] - itemWeights[i];
if(newBinWeight >= 0 && bins.get(j).size() < MAX_COL_PER_GROUP - 1) {
bins.get(j).appendValue(items[i]);
binWeights[j] = newBinWeight;
assigned = true;
break;
}
}
// create new bin at end of list
if(!assigned) {
if(bins.size() == binWeights.length)
binWeights = Arrays.copyOf(binWeights, 2 * binWeights.length);
bins.add(new IntArrayList(items[i]));
binWeights[bins.size() - 1] = BIN_CAPACITY - itemWeights[i];
}
}
return bins;
}
}