blob: b55d664c91e09897056ff622af2e2c61683b003e [file] [log] [blame]
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package parquet
import (
"encoding/binary"
"reflect"
"strings"
"time"
"unsafe"
"github.com/apache/arrow/go/arrow"
format "github.com/apache/arrow/go/parquet/internal/gen-go/parquet"
)
const (
julianUnixEpoch int64 = 2440588
nanosPerDay int64 = 3600 * 24 * 1000 * 1000 * 1000
// Int96SizeBytes is the number of bytes that make up an Int96
Int96SizeBytes int = 12
)
var (
// Int96Traits provides information about the Int96 type
Int96Traits int96Traits
// ByteArrayTraits provides information about the ByteArray type, which is just an []byte
ByteArrayTraits byteArrayTraits
// FixedLenByteArrayTraits provides information about the FixedLenByteArray type which is just an []byte
FixedLenByteArrayTraits fixedLenByteArrayTraits
// ByteArraySizeBytes is the number of bytes returned by reflect.TypeOf(ByteArray{}).Size()
ByteArraySizeBytes int = int(reflect.TypeOf(ByteArray{}).Size())
// FixedLenByteArraySizeBytes is the number of bytes returned by reflect.TypeOf(FixedLenByteArray{}).Size()
FixedLenByteArraySizeBytes int = int(reflect.TypeOf(FixedLenByteArray{}).Size())
)
// NewInt96 creates a new Int96 from the given 3 uint32 values.
func NewInt96(v [3]uint32) (out Int96) {
binary.LittleEndian.PutUint32(out[0:], v[0])
binary.LittleEndian.PutUint32(out[4:], v[1])
binary.LittleEndian.PutUint32(out[8:], v[2])
return
}
// Int96 is a 12 byte integer value utilized for representing timestamps as a 64 bit integer and a 32 bit
// integer.
type Int96 [12]byte
// SetNanoSeconds sets the Nanosecond field of the Int96 timestamp to the provided value
func (i96 *Int96) SetNanoSeconds(nanos int64) {
binary.LittleEndian.PutUint64(i96[:8], uint64(nanos))
}
// String provides the string representation as a timestamp via converting to a time.Time
// and then calling String
func (i96 Int96) String() string {
return i96.ToTime().String()
}
// ToTime returns a go time.Time object that represents the same time instant as the given Int96 value
func (i96 Int96) ToTime() time.Time {
nanos := binary.LittleEndian.Uint64(i96[:8])
jdays := binary.LittleEndian.Uint32(i96[8:])
nanos = (uint64(jdays)-uint64(julianUnixEpoch))*uint64(nanosPerDay) + nanos
t := time.Unix(0, int64(nanos))
return t.UTC()
}
type int96Traits struct{}
func (int96Traits) BytesRequired(n int) int { return Int96SizeBytes * n }
func (int96Traits) CastFromBytes(b []byte) []Int96 {
h := (*reflect.SliceHeader)(unsafe.Pointer(&b))
var res []Int96
s := (*reflect.SliceHeader)(unsafe.Pointer(&res))
s.Data = h.Data
s.Len = h.Len / Int96SizeBytes
s.Cap = h.Cap / Int96SizeBytes
return res
}
func (int96Traits) CastToBytes(b []Int96) []byte {
h := (*reflect.SliceHeader)(unsafe.Pointer(&b))
var res []byte
s := (*reflect.SliceHeader)(unsafe.Pointer(&res))
s.Data = h.Data
s.Len = h.Len * Int96SizeBytes
s.Cap = h.Cap * Int96SizeBytes
return res
}
// ByteArray is a type to be utilized for representing the Parquet ByteArray physical type, represented as a byte slice
type ByteArray []byte
// Len returns the current length of the ByteArray, equivalent to len(bytearray)
func (b ByteArray) Len() int {
return len(b)
}
// String returns a string representation of the ByteArray
func (b ByteArray) String() string {
return *(*string)(unsafe.Pointer(&b))
}
type byteArrayTraits struct{}
func (byteArrayTraits) BytesRequired(n int) int {
return ByteArraySizeBytes * n
}
func (byteArrayTraits) CastFromBytes(b []byte) []ByteArray {
h := (*reflect.SliceHeader)(unsafe.Pointer(&b))
var res []ByteArray
s := (*reflect.SliceHeader)(unsafe.Pointer(&res))
s.Data = h.Data
s.Len = h.Len / ByteArraySizeBytes
s.Cap = h.Cap / ByteArraySizeBytes
return res
}
// FixedLenByteArray is a go type to represent a FixedLengthByteArray as a byte slice
type FixedLenByteArray []byte
// Len returns the current length of this FixedLengthByteArray, equivalent to len(fixedlenbytearray)
func (b FixedLenByteArray) Len() int {
return len(b)
}
// String returns a string representation of the FixedLenByteArray
func (b FixedLenByteArray) String() string {
return *(*string)(unsafe.Pointer(&b))
}
type fixedLenByteArrayTraits struct{}
func (fixedLenByteArrayTraits) BytesRequired(n int) int {
return FixedLenByteArraySizeBytes * n
}
func (fixedLenByteArrayTraits) CastFromBytes(b []byte) []FixedLenByteArray {
h := (*reflect.SliceHeader)(unsafe.Pointer(&b))
var res []FixedLenByteArray
s := (*reflect.SliceHeader)(unsafe.Pointer(&res))
s.Data = h.Data
s.Len = h.Len / FixedLenByteArraySizeBytes
s.Cap = h.Cap / FixedLenByteArraySizeBytes
return res
}
// Creating our own enums allows avoiding the transitive dependency on the
// compiled thrift definitions in the public API, allowing us to not export
// the entire Thrift definitions, while making everything a simple cast between.
//
// It also let's us add special values like NONE to distinguish between values
// that are set or not set
type (
// Type is the physical type as in parquet.thrift
Type format.Type
// Cipher is the parquet Cipher Algorithms
Cipher int
// ColumnOrder is the Column Order from the parquet.thrift
ColumnOrder *format.ColumnOrder
// Version is the parquet version type
Version int8
// DataPageVersion is the version of the Parquet Data Pages
DataPageVersion int8
// Encoding is the parquet Encoding type
Encoding format.Encoding
// Repetition is the underlying parquet field repetition type as in parquet.thrift
Repetition format.FieldRepetitionType
// ColumnPath is the path from the root of the schema to a given column
ColumnPath []string
)
func (c ColumnPath) String() string {
if c == nil {
return ""
}
return strings.Join(c, ".")
}
// Extend creates a new ColumnPath from an existing one, with the new ColumnPath having s appended to the end.
func (c ColumnPath) Extend(s string) ColumnPath {
p := make([]string, len(c), len(c)+1)
copy(p, c)
return append(p, s)
}
// ColumnPathFromString constructs a ColumnPath from a dot separated string
func ColumnPathFromString(s string) ColumnPath {
return strings.Split(s, ".")
}
// constants for choosing the Aes Algorithm to use for encryption/decryption
const (
AesGcm Cipher = iota
AesCtr
)
// Constants for the parquet Version
const (
V1 Version = 1
V2 Version = 2
)
// constants for the parquet DataPage Version to use
const (
DataPageV1 DataPageVersion = iota
DataPageV2
)
func (e Encoding) String() string {
return format.Encoding(e).String()
}
var (
// Types contains constants for the Physical Types that are used in the Parquet Spec
//
// They can be specified when needed as such: `parquet.Types.Int32` etc. The values
// all correspond to the values in parquet.thrift
Types = struct {
Boolean Type
Int32 Type
Int64 Type
Int96 Type
Float Type
Double Type
ByteArray Type
FixedLenByteArray Type
// this only exists as a convienence so we can denote it when necessary
// nearly all functions that take a parquet.Type will error/panic if given
// Undefined
Undefined Type
}{
Boolean: Type(format.Type_BOOLEAN),
Int32: Type(format.Type_INT32),
Int64: Type(format.Type_INT64),
Int96: Type(format.Type_INT96),
Float: Type(format.Type_FLOAT),
Double: Type(format.Type_DOUBLE),
ByteArray: Type(format.Type_BYTE_ARRAY),
FixedLenByteArray: Type(format.Type_FIXED_LEN_BYTE_ARRAY),
Undefined: Type(format.Type_FIXED_LEN_BYTE_ARRAY + 1),
}
// Encodings contains constants for the encoding types of the column data
//
// The values used all correspond to the values in parquet.thrift for the
// corresponding encoding type.
Encodings = struct {
Plain Encoding
PlainDict Encoding
RLE Encoding
RLEDict Encoding
BitPacked Encoding // deprecated, not implemented
DeltaByteArray Encoding
DeltaBinaryPacked Encoding
DeltaLengthByteArray Encoding
}{
Plain: Encoding(format.Encoding_PLAIN),
PlainDict: Encoding(format.Encoding_PLAIN_DICTIONARY),
RLE: Encoding(format.Encoding_RLE),
RLEDict: Encoding(format.Encoding_RLE_DICTIONARY),
BitPacked: Encoding(format.Encoding_BIT_PACKED),
DeltaByteArray: Encoding(format.Encoding_DELTA_BYTE_ARRAY),
DeltaBinaryPacked: Encoding(format.Encoding_DELTA_BINARY_PACKED),
DeltaLengthByteArray: Encoding(format.Encoding_DELTA_LENGTH_BYTE_ARRAY),
}
// ColumnOrders contains constants for the Column Ordering fields
ColumnOrders = struct {
Undefined ColumnOrder
TypeDefinedOrder ColumnOrder
}{
Undefined: format.NewColumnOrder(),
TypeDefinedOrder: &format.ColumnOrder{TYPE_ORDER: format.NewTypeDefinedOrder()},
}
// DefaultColumnOrder is to use TypeDefinedOrder
DefaultColumnOrder = ColumnOrders.TypeDefinedOrder
// Repetitions contains the constants for Field Repetition Types
Repetitions = struct {
Required Repetition
Optional Repetition
Repeated Repetition
Undefined Repetition // convenience value
}{
Required: Repetition(format.FieldRepetitionType_REQUIRED),
Optional: Repetition(format.FieldRepetitionType_OPTIONAL),
Repeated: Repetition(format.FieldRepetitionType_REPEATED),
Undefined: Repetition(format.FieldRepetitionType_REPEATED + 1),
}
)
func (t Type) String() string {
switch t {
case Types.Undefined:
return "UNDEFINED"
default:
return format.Type(t).String()
}
}
func (r Repetition) String() string {
return strings.ToLower(format.FieldRepetitionType(r).String())
}
// ByteSize returns the number of bytes required to store a single value of
// the given parquet.Type in memory.
func (t Type) ByteSize() int {
switch t {
case Types.Boolean:
return 1
case Types.Int32:
return arrow.Int32SizeBytes
case Types.Int64:
return arrow.Int64SizeBytes
case Types.Int96:
return Int96SizeBytes
case Types.Float:
return arrow.Float32SizeBytes
case Types.Double:
return arrow.Float64SizeBytes
case Types.ByteArray:
return ByteArraySizeBytes
case Types.FixedLenByteArray:
return FixedLenByteArraySizeBytes
}
panic("no bytesize info for type")
}