/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.sysds.runtime.compress.colgroup.offset;
import java.io.DataInput;
import java.io.IOException;
import java.util.Arrays;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.sysds.runtime.compress.DMLCompressionException;
import org.apache.sysds.runtime.compress.utils.IntArrayList;
public final class OffsetFactory {
static final Log LOG = LogFactory.getLog(OffsetFactory.class.getName());
private OffsetFactory() {
// Empty private constructor.
}
/** The specific underlying types of offsets. */
public enum OFF_TYPE {
BYTE, CHAR
}
/** Specialized types of underlying offsets. */
public enum OFF_TYPE_SPECIALIZATIONS {
BYTE, CHAR, SINGLE_OFFSET, TWO_OFFSET, EMPTY
}
/**
* Main factory pattern creator for Offsets.
*
* Note this creator is unsafe: it is assumed that the input index list only contains sequential, non-duplicate,
* incrementing values.
*
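* A minimal usage sketch (the concrete values are illustrative only):
*
* <pre>
* {@code
* int[] indexes = new int[] {1, 4, 7, 100};
* AOffset off = OffsetFactory.createOffset(indexes);
* }
* </pre>
*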
* @param indexes List of indexes, assumed to be sorted and without duplicates
* @return AOffset object containing offsets to the next value.
*/
public static AOffset createOffset(int[] indexes) {
return createOffset(indexes, 0, indexes.length);
}
/**
* Create the offsets based on our primitive IntArrayList.
*
* Note this creator is unsafe: it is assumed that the input index list only contains sequential, non-duplicate,
* incrementing values.
*
* @param indexes The list of indexes, assumed to be sorted and without duplicates
* @return AOffset object containing offsets to the next value.
*/
public static AOffset createOffset(IntArrayList indexes) {
return createOffset(indexes.extractValues(), 0, indexes.size());
}
/**
* Try to create a specific type of offset.
*
* @param indexes The list of indexes, assumed to be sorted and without duplicates
* @param type The type requested.
* @return The return offset
*/
public static AOffset createOffset(int[] indexes, OFF_TYPE type) {
return createOffset(indexes, 0, indexes.length, type);
}
/**
* Create an Offset based on a subset of the given indexes.
*
* This is useful if the input is created from a CSR matrix, since it allows us to reuse the shared indexes[] of the
* entire CSR representation instead of reallocating it.
*
* Note this creator is unsafe: it is assumed that the input index list only contains sequential, non-duplicate,
* incrementing values.
*
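* A minimal sketch of the subset use case, with illustrative values (not taken from a real CSR block):
*
* <pre>
* {@code
* // shared column-index array of a CSR block; a single row covers positions [3, 6)
* int[] csrIndexes = new int[] {0, 3, 9, 1, 2, 8};
* AOffset off = OffsetFactory.createOffset(csrIndexes, 3, 6);
* }
* </pre>
*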
* @param indexes The indexes from which to take the offsets.
* @param apos The starting position (inclusive) in the indexes.
* @param alen The end position (exclusive) in the indexes.
* @return A new Offset.
*/
public static AOffset createOffset(int[] indexes, int apos, int alen) {
try {
if(indexes == null)
throw new DMLCompressionException("Invalid null indexes input");
final int endLength = alen - apos - 1;
if(endLength < 0)
return new OffsetEmpty();
else if(indexes[apos] < 0)
throw new DMLCompressionException("Invalid negative offset");
else if(endLength == 0) // means size of 1 since we store the first offset outside the list
return new OffsetSingle(indexes[apos]);
else if(endLength == 1)
return new OffsetTwo(indexes[apos], indexes[apos + 1]);
final int minValue = indexes[apos];
final int maxValue = indexes[alen - 1];
final int range = maxValue - minValue;
// -1 because one index is skipped: the first offset is stored separately as an int.
final int correctionByte = correctionByte(range, endLength);
final int correctionChar = correctionChar(range, endLength);
final long byteSize = OffsetByte.estimateInMemorySize(endLength + correctionByte);
final long charSize = OffsetChar.estimateInMemorySize(endLength + correctionChar);
if(byteSize < charSize)
return createByte(indexes, apos, alen);
else
return createChar(indexes, apos, alen);
}
catch(Exception e) {
if(indexes == null)
throw e;
for(int i = apos + 1; i < alen; i++) {
if(indexes[i] <= indexes[i - 1]) {
String message = "Invalid input to create offset, all values should be continuously increasing.\n";
message += "Index " + (i - 1) + " and Index " + i + " are wrong with values: " + indexes[i - 1] + " and "
+ indexes[i];
throw new DMLCompressionException(message, e);
}
}
throw new DMLCompressionException(
"Failed to create offset with input:" + Arrays.toString(indexes) + " Apos: " + apos + " Alen: " + alen, e);
}
}
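/**
 * Create an Offset of the requested underlying type from a subset of the given indexes.
 *
 * Note this creator is unsafe as well: the indexes in the range [apos, alen) are assumed to be sorted and without
 * duplicates.
 *
 * @param indexes The indexes from which to take the offsets.
 * @param apos The starting position (inclusive) in the indexes.
 * @param alen The end position (exclusive) in the indexes.
 * @param type The requested underlying offset type.
 * @return A new Offset (very short inputs fall back to the specialized empty, single, or two offset classes).
 */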
public static AOffset createOffset(int[] indexes, int apos, int alen, OFF_TYPE type) {
final int indexesLength = alen - apos;
if(indexesLength <= 0)
return new OffsetEmpty();
else if(indexesLength == 1)
return new OffsetSingle(indexes[apos]);
else if(indexesLength == 2)
return new OffsetTwo(indexes[apos], indexes[apos + 1]);
else if(type == OFF_TYPE.BYTE)
return createByte(indexes, apos, alen);
else
return createChar(indexes, apos, alen);
}
/**
* Read in AOffset from the DataInput.
*
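* A minimal round-trip sketch; the write call is the assumed serialization counterpart on AOffset and is not
* defined in this class:
*
* <pre>
* {@code
* // out: a DataOutput, in: a DataInput over the same bytes
* AOffset off = OffsetFactory.createOffset(new int[] {1, 4, 7});
* off.write(out); // assumed serializer writing the type byte and payload
* AOffset back = OffsetFactory.readIn(in);
* }
* </pre>
*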
* @param in DataInput to read from
* @return The AOffset data instance
* @throws IOException If the DataInput fails reading in the variables
*/
public static AOffset readIn(DataInput in) throws IOException {
OFF_TYPE_SPECIALIZATIONS t = OFF_TYPE_SPECIALIZATIONS.values()[in.readByte()];
switch(t) {
case EMPTY:
return OffsetEmpty.readFields(in);
case SINGLE_OFFSET:
return OffsetSingle.readFields(in);
case TWO_OFFSET:
return OffsetTwo.readFields(in);
case BYTE:
return OffsetByte.readFields(in);
case CHAR:
default:
return OffsetChar.readFields(in);
}
}
/**
* The average-difference estimate assumes the offsets are evenly distributed. This means that if we have 1000 rows
* and 100 offsets, the distance between elements is assumed to be 10 on average.
*
* An optional TODO is to add some extra size when the average distance is close to the maximum value the offset
* lists can encode; this would increase the estimated size and better approximate the real compressed size. It
* would also handle edge cases better.
*
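* For example, with 1000 rows and 100 offsets the average distance is 10, which fits in a byte, so the byte-based
* estimate is returned; with 100000 rows and 100 offsets the average distance is 1000, so the char-based estimate
* is returned.
*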
* @param size The estimated number of offsets
* @param nRows The number of rows.
* @return The estimated size of an offset given the number of offsets and rows.
*/
public static long estimateInMemorySize(int size, int nRows) {
if(size == 0) // If this is the case, then the compression results in constant col groups
return OffsetEmpty.estimateInMemorySize();
else if(size == 1)
return OffsetSingle.estimateInMemorySize();
else if(size == 2)
return OffsetTwo.estimateInMemorySize();
final int avgDiff = nRows / size;
if(avgDiff < 256) {
final int correctionByte = correctionByte(nRows, size);
return OffsetByte.estimateInMemorySize(size - 1 + correctionByte);
}
else {
final int correctionChar = correctionChar(nRows, size);
return OffsetChar.estimateInMemorySize(size - 1 + correctionChar);
}
}
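/**
 * Heuristic correction term for the estimated length of a byte-encoded offset list: roughly the number of extra
 * entries needed when the row range cannot be covered by {@code size} gaps of at most 256 each.
 *
 * @param nRows The number of rows (total range of the offsets).
 * @param size The number of offsets.
 * @return The estimated number of additional byte entries.
 */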
public static int correctionByte(int nRows, int size) {
return Math.max((nRows - size * 256), 0) / 256;
}
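/**
 * Heuristic correction term for the estimated length of a char-encoded offset list, analogous to
 * {@link #correctionByte(int, int)} but with gaps of at most {@code Character.MAX_VALUE}.
 *
 * @param nRows The number of rows (total range of the offsets).
 * @param size The number of offsets.
 * @return The estimated number of additional char entries.
 */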
public static int correctionChar(int nRows, int size) {
return Math.max((nRows - size * Character.MAX_VALUE), 0) / Character.MAX_VALUE;
}
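/**
 * Construct a byte-encoded offset list. The first and last offsets are kept separately; every other offset is
 * encoded as the distance to its predecessor, where a gap larger than OffsetByte.maxV leaves zero-valued skip
 * entries in front of a final non-zero entry.
 */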
private static AOffset createByte(int[] indexes, int apos, int alen) {
final int indexesLength = alen - apos;
int endSize = 0;
int offsetToFirst = indexes[apos];
int offsetToLast = indexes[alen - 1];
int ov = offsetToFirst;
// find the size of the array
for(int i = apos + 1; i < alen; i++) {
final int nv = indexes[i];
endSize += 1 + (nv - ov - 1) / OffsetByte.maxV;
ov = nv;
}
boolean noZero = endSize == indexesLength - 1;
byte[] offsets = new byte[endSize];
ov = offsetToFirst;
int p = 0;
// populate the array
for(int i = apos + 1; i < alen; i++) {
final int nv = indexes[i];
final int offsetSize = nv - ov;
if(offsetSize <= 0)
throw new DMLCompressionException("Invalid offset");
final int div = offsetSize / OffsetByte.maxV;
final int mod = offsetSize % OffsetByte.maxV;
if(mod == 0) {
p += div - 1; // skip values
offsets[p++] = (byte) OffsetByte.maxV;
}
else {
p += div; // skip values
offsets[p++] = (byte) (mod);
}
ov = nv;
}
boolean noOverHalf = getNoOverHalf(offsets);
return new OffsetByte(offsets, offsetToFirst, offsetToLast, indexesLength, noOverHalf, noZero);
}
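/** Construct a char-encoded offset list, analogous to createByte but with steps of up to OffsetChar.maxV. */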
private static AOffset createChar(int[] indexes, int apos, int alen) {
int endSize = 0;
int offsetToFirst = indexes[apos];
int offsetToLast = indexes[alen - 1];
int ov = offsetToFirst;
for(int i = apos + 1; i < alen; i++) {
final int nv = indexes[i];
endSize += 1 + (nv - ov - 1) / OffsetChar.maxV;
ov = nv;
}
boolean noZero = endSize == alen - apos - 1;
char[] offsets = new char[endSize];
ov = offsetToFirst;
int p = 0;
for(int i = apos + 1; i < alen; i++) {
final int nv = indexes[i];
final int offsetSize = (nv - ov);
if(offsetSize <= 0)
throw new DMLCompressionException("Invalid offset");
final int div = offsetSize / OffsetChar.maxV;
final int mod = offsetSize % OffsetChar.maxV;
if(mod == 0) {
p += div - 1; // skip values
offsets[p++] = (char) OffsetChar.maxV;
}
else {
p += div; // skip values
offsets[p++] = (char) (mod);
}
ov = nv;
}
return new OffsetChar(offsets, offsetToFirst, offsetToLast, noZero);
}
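/**
 * Returns true if every encoded byte lies in the range [1, 127], i.e. there are no zero skip entries and no
 * values that are negative when interpreted as a signed byte.
 */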
protected static boolean getNoOverHalf(byte[] off) {
boolean noOverHalf = true;
for(byte b : off)
if(b < 1) {
noOverHalf = false;
break;
}
return noOverHalf;
}
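/** Returns true if the encoded byte offsets contain no zero (skip) entries. */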
protected static boolean getNoZero(byte[] off) {
boolean noZero = true;
for(byte b : off)
if(b == 0) {
noZero = false;
break;
}
return noZero;
}
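/** Returns true if the encoded char offsets contain no zero (skip) entries. */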
protected static boolean getNoZero(char[] off) {
boolean noZero = true;
for(char b : off)
if(b == 0) {
noZero = false;
break;
}
return noZero;
}
}