blob: 9b666864c1bc3516606984d1124aec7e01f75f45 [file] [log] [blame]
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package gen
import (
"github.com/apache/arrow/go/v6/arrow"
"github.com/apache/arrow/go/v6/arrow/array"
"github.com/apache/arrow/go/v6/arrow/bitutil"
"github.com/apache/arrow/go/v6/arrow/memory"
"golang.org/x/exp/rand"
"gonum.org/v1/gonum/stat/distuv"
)
// RandomArrayGenerator is a struct used for constructing Random Arrow arrays
// for use with testing.
//
// The generator methods in this file increment extra and seed a fresh source
// with seed+extra each time they need random values, so the values produced
// depend on the order in which the methods are called.
type RandomArrayGenerator struct {
// seed is the base seed passed to NewRandomArrayGenerator.
seed uint64
// extra is a counter that is incremented before each new random source is
// created; it is added to seed to derive that source's seed.
extra uint64
// src is the random source created from seed by NewRandomArrayGenerator.
src rand.Source
// seedRand is a *rand.Rand wrapping src, created by NewRandomArrayGenerator.
seedRand *rand.Rand
// mem is the allocator used for the buffers and builders the generator creates.
mem memory.Allocator
}
// NewRandomArrayGenerator returns a generator whose random output is derived
// from seed and whose buffers are allocated from mem.
func NewRandomArrayGenerator(seed uint64, mem memory.Allocator) RandomArrayGenerator {
	source := rand.NewSource(seed)
	return RandomArrayGenerator{
		seed:     seed,
		extra:    0,
		src:      source,
		seedRand: rand.New(source),
		mem:      mem,
	}
}
// GenerateBitmap fills the first n bits of buffer with random bits and returns
// how many of them were left unset. prob is the probability that any given bit
// stays 0; a bit is set to 1 with probability 1-prob.
//
// Bits are only ever set, never cleared, so buffer is expected to be zero
// initialized by the caller.
func (r *RandomArrayGenerator) GenerateBitmap(buffer []byte, n int64, prob float64) int64 {
	r.extra++
	// A Bernoulli draw yields 1 with probability P, so P is the chance of
	// setting a bit, i.e. the complement of prob.
	bernoulli := distuv.Bernoulli{
		P:   1 - prob,
		Src: rand.NewSource(r.seed + r.extra),
	}

	var unset int64
	for i := 0; int64(i) < n; i++ {
		if bernoulli.Rand() == 0 {
			unset++
			continue
		}
		bitutil.SetBit(buffer, i)
	}
	return unset
}
// Boolean generates a random boolean array with size elements. prob is the
// probability that a value bit is left unset (false) and nullProb is the
// probability that a validity bit is left unset (null); both are forwarded to
// GenerateBitmap.
func (r *RandomArrayGenerator) Boolean(size int64, prob, nullProb float64) array.Interface {
	nbytes := int(bitutil.BytesForBits(size))

	// Validity bitmap: every bit left unset marks a null slot.
	validity := memory.NewResizableBuffer(r.mem)
	validity.Resize(nbytes)
	defer validity.Release()
	nulls := r.GenerateBitmap(validity.Bytes(), size, nullProb)

	// Value bitmap: same length, drawn from an independent source.
	values := memory.NewResizableBuffer(r.mem)
	values.Resize(nbytes)
	defer values.Release()
	r.GenerateBitmap(values.Bytes(), size, prob)

	data := array.NewData(arrow.FixedWidthTypes.Boolean, int(size),
		[]*memory.Buffer{validity, values}, nil, int(nulls), 0)
	defer data.Release()
	return array.NewBooleanData(data)
}
// baseGenPrimitive prepares the two buffers used by the fixed-width
// generators: a validity bitmap for size slots, filled by GenerateBitmap with
// prob as the probability of a slot being null, and a values buffer of
// size*byteWidth bytes that is left for the caller to fill.
//
// It returns the buffers (validity first, values second) together with the
// number of null slots. The callers in this file release both buffers.
func (r *RandomArrayGenerator) baseGenPrimitive(size int64, prob float64, byteWidth int) ([]*memory.Buffer, int64) {
	validity := memory.NewResizableBuffer(r.mem)
	validity.Resize(int(bitutil.BytesForBits(size)))
	nulls := r.GenerateBitmap(validity.Bytes(), size, prob)

	values := memory.NewResizableBuffer(r.mem)
	values.Resize(int(size) * byteWidth)

	return []*memory.Buffer{validity, values}, nulls
}
// Int8 generates an int8 array with size elements whose values are
// pseudo-random numbers in the inclusive range [min, max]. prob is the
// probability that a slot is null. Every slot, null or not, receives a value.
//
// If size > 0 and max < min the call panics inside rand.Intn.
func (r *RandomArrayGenerator) Int8(size int64, min, max int8, prob float64) array.Interface {
	buffers, nullcount := r.baseGenPrimitive(size, prob, arrow.Int8SizeBytes)
	for _, b := range buffers {
		defer b.Release()
	}

	r.extra++
	dist := rand.New(rand.NewSource(r.seed + r.extra))
	out := arrow.Int8Traits.CastFromBytes(buffers[1].Bytes())

	// Widen to int before doing arithmetic: the number of values in
	// [min, max] can be as large as 256, which does not fit in an int8.
	span := int(max) - int(min) + 1
	for i := int64(0); i < size; i++ {
		out[i] = int8(int(min) + dist.Intn(span))
	}

	data := array.NewData(arrow.PrimitiveTypes.Int8, int(size), buffers, nil, int(nullcount), 0)
	defer data.Release()
	return array.NewInt8Data(data)
}
// Uint8 generates a uint8 array with size elements whose values are
// pseudo-random numbers in the inclusive range [min, max]. prob is the
// probability that a slot is null. Every slot, null or not, receives a value.
//
// If size > 0 and max < min the call panics inside rand.Intn.
func (r *RandomArrayGenerator) Uint8(size int64, min, max uint8, prob float64) array.Interface {
	buffers, nullcount := r.baseGenPrimitive(size, prob, arrow.Uint8SizeBytes)
	for _, b := range buffers {
		defer b.Release()
	}

	r.extra++
	dist := rand.New(rand.NewSource(r.seed + r.extra))
	out := arrow.Uint8Traits.CastFromBytes(buffers[1].Bytes())

	// Widen to int before doing arithmetic: for the full range [0, 255] the
	// span is 256, which would wrap to 0 if computed in uint8.
	span := int(max) - int(min) + 1
	for i := int64(0); i < size; i++ {
		out[i] = uint8(dist.Intn(span)) + min
	}

	data := array.NewData(arrow.PrimitiveTypes.Uint8, int(size), buffers, nil, int(nullcount), 0)
	defer data.Release()
	return array.NewUint8Data(data)
}
// Int16 generates an int16 array with size elements whose values are
// pseudo-random numbers in the inclusive range [min, max]. prob is the
// probability that a slot is null. Every slot, null or not, receives a value.
//
// If size > 0 and max < min the call panics inside rand.Intn.
func (r *RandomArrayGenerator) Int16(size int64, min, max int16, prob float64) array.Interface {
	buffers, nullcount := r.baseGenPrimitive(size, prob, arrow.Int16SizeBytes)
	for _, b := range buffers {
		defer b.Release()
	}

	r.extra++
	dist := rand.New(rand.NewSource(r.seed + r.extra))
	out := arrow.Int16Traits.CastFromBytes(buffers[1].Bytes())

	// Widen to int before doing arithmetic: the number of values in
	// [min, max] can be as large as 65536, which does not fit in an int16.
	span := int(max) - int(min) + 1
	for i := int64(0); i < size; i++ {
		out[i] = int16(int(min) + dist.Intn(span))
	}

	data := array.NewData(arrow.PrimitiveTypes.Int16, int(size), buffers, nil, int(nullcount), 0)
	defer data.Release()
	return array.NewInt16Data(data)
}
// Uint16 generates a uint16 array with size elements whose values are
// pseudo-random numbers in the inclusive range [min, max]. prob is the
// probability that a slot is null. Every slot, null or not, receives a value.
//
// If size > 0 and max < min the call panics inside rand.Intn.
func (r *RandomArrayGenerator) Uint16(size int64, min, max uint16, prob float64) array.Interface {
	buffers, nullcount := r.baseGenPrimitive(size, prob, arrow.Uint16SizeBytes)
	for _, b := range buffers {
		defer b.Release()
	}

	r.extra++
	dist := rand.New(rand.NewSource(r.seed + r.extra))
	out := arrow.Uint16Traits.CastFromBytes(buffers[1].Bytes())

	// Widen to int before doing arithmetic: for the full range [0, 65535]
	// the span is 65536, which would wrap to 0 if computed in uint16.
	span := int(max) - int(min) + 1
	for i := int64(0); i < size; i++ {
		out[i] = uint16(dist.Intn(span)) + min
	}

	data := array.NewData(arrow.PrimitiveTypes.Uint16, int(size), buffers, nil, int(nullcount), 0)
	defer data.Release()
	return array.NewUint16Data(data)
}
// Int32 generates an int32 array with size elements whose values are
// pseudo-random numbers in the inclusive range [min, max]. prob is the
// probability that a slot is null. Every slot, null or not, receives a value.
//
// If size > 0 and max < min the call panics inside rand.Int63n.
func (r *RandomArrayGenerator) Int32(size int64, min, max int32, prob float64) array.Interface {
	buffers, nullcount := r.baseGenPrimitive(size, prob, arrow.Int32SizeBytes)
	for _, b := range buffers {
		defer b.Release()
	}

	r.extra++
	dist := rand.New(rand.NewSource(r.seed + r.extra))
	out := arrow.Int32Traits.CastFromBytes(buffers[1].Bytes())

	// Widen to int64 before doing arithmetic: the number of values in
	// [min, max] can be as large as 1<<32, which overflows int32.
	span := int64(max) - int64(min) + 1
	for i := int64(0); i < size; i++ {
		out[i] = int32(int64(min) + dist.Int63n(span))
	}

	data := array.NewData(arrow.PrimitiveTypes.Int32, int(size), buffers, nil, int(nullcount), 0)
	defer data.Release()
	return array.NewInt32Data(data)
}
// Uint32 generates a uint32 array with size elements whose values are
// pseudo-random numbers in the inclusive range [min, max]. prob is the
// probability that a slot is null. Every slot, null or not, receives a value.
//
// Callers must pass min <= max; the result for max < min is not meaningful.
func (r *RandomArrayGenerator) Uint32(size int64, min, max uint32, prob float64) array.Interface {
	buffers, nullcount := r.baseGenPrimitive(size, prob, arrow.Uint32SizeBytes)
	for _, b := range buffers {
		defer b.Release()
	}

	r.extra++
	dist := rand.New(rand.NewSource(r.seed + r.extra))
	out := arrow.Uint32Traits.CastFromBytes(buffers[1].Bytes())

	// Widen to uint64 before doing arithmetic: for the full uint32 range the
	// span is 1<<32, which would wrap to 0 if computed in uint32 and make
	// Uint64n panic.
	span := uint64(max) - uint64(min) + 1
	for i := int64(0); i < size; i++ {
		out[i] = uint32(dist.Uint64n(span)) + min
	}

	data := array.NewData(arrow.PrimitiveTypes.Uint32, int(size), buffers, nil, int(nullcount), 0)
	defer data.Release()
	return array.NewUint32Data(data)
}
// Int64 generates an int64 array with size elements whose values are
// pseudo-random numbers in the inclusive range [min, max]. prob is the
// probability that a slot is null. Every slot, null or not, receives a value.
//
// It panics if size > 0 and max < min.
func (r *RandomArrayGenerator) Int64(size int64, min, max int64, prob float64) array.Interface {
	buffers, nullcount := r.baseGenPrimitive(size, prob, arrow.Int64SizeBytes)
	for _, b := range buffers {
		defer b.Release()
	}

	r.extra++
	dist := rand.New(rand.NewSource(r.seed + r.extra))
	out := arrow.Int64Traits.CastFromBytes(buffers[1].Bytes())

	// The unsigned arithmetic below cannot detect an inverted range on its
	// own, so reject it explicitly (an empty array never draws a value and is
	// therefore allowed through).
	if size > 0 && max < min {
		panic("gen: Int64 requires min <= max")
	}

	// max-min+1 can exceed math.MaxInt64, so compute the span in uint64 where
	// two's-complement subtraction yields the exact distance. It wraps to 0
	// only when the range covers every int64 value.
	span := uint64(max) - uint64(min) + 1
	for i := int64(0); i < size; i++ {
		var offset uint64
		if span == 0 {
			offset = dist.Uint64()
		} else {
			offset = dist.Uint64n(span)
		}
		// Adding in uint64 and converting back wraps to min+offset.
		out[i] = int64(uint64(min) + offset)
	}

	data := array.NewData(arrow.PrimitiveTypes.Int64, int(size), buffers, nil, int(nullcount), 0)
	defer data.Release()
	return array.NewInt64Data(data)
}
// Uint64 generates a uint64 array with size elements whose values are
// pseudo-random numbers in the inclusive range [min, max]. prob is the
// probability that a slot is null. Every slot, null or not, receives a value.
//
// Callers must pass min <= max; the result for max < min is not meaningful.
func (r *RandomArrayGenerator) Uint64(size int64, min, max uint64, prob float64) array.Interface {
	buffers, nullcount := r.baseGenPrimitive(size, prob, arrow.Uint64SizeBytes)
	for _, b := range buffers {
		defer b.Release()
	}

	r.extra++
	dist := rand.New(rand.NewSource(r.seed + r.extra))
	out := arrow.Uint64Traits.CastFromBytes(buffers[1].Bytes())

	// For the full uint64 range the span 1<<64 is not representable and
	// max-min+1 wraps to 0, which Uint64n rejects; draw raw 64-bit values in
	// that case instead.
	fullRange := min == 0 && max == ^uint64(0)
	span := max - min + 1
	for i := int64(0); i < size; i++ {
		if fullRange {
			out[i] = dist.Uint64()
			continue
		}
		out[i] = dist.Uint64n(span) + min
	}

	data := array.NewData(arrow.PrimitiveTypes.Uint64, int(size), buffers, nil, int(nullcount), 0)
	defer data.Release()
	return array.NewUint64Data(data)
}
// Float32 generates a float32 array with size elements. prob is the
// probability that a slot is null. Every slot, null or not, receives a value
// computed as min + u*(max+1-min), where u is drawn from rand.Float32.
//
// NOTE(review): because of the "+1" in the span, generated values can exceed
// max; this looks carried over from the integer generators - confirm whether
// it is intended.
func (r *RandomArrayGenerator) Float32(size int64, min, max float32, prob float64) array.Interface {
	buffers, nullcount := r.baseGenPrimitive(size, prob, arrow.Float32SizeBytes)
	validity, values := buffers[0], buffers[1]
	defer validity.Release()
	defer values.Release()

	r.extra++
	rng := rand.New(rand.NewSource(r.seed + r.extra))

	span := max + 1 - min
	out := arrow.Float32Traits.CastFromBytes(values.Bytes())
	for i := int64(0); i < size; i++ {
		out[i] = min + rng.Float32()*span
	}

	data := array.NewData(arrow.PrimitiveTypes.Float32, int(size), buffers, nil, int(nullcount), 0)
	defer data.Release()
	return array.NewFloat32Data(data)
}
// Float64 generates a float64 array with size elements whose values are
// pseudo-random numbers between min and max, computed as min + u*(max-min)
// with u drawn from rand.Float64. prob is the probability that a slot is
// null. Every slot, null or not, receives a value.
func (r *RandomArrayGenerator) Float64(size int64, min, max float64, prob float64) array.Interface {
	buffers, nullcount := r.baseGenPrimitive(size, prob, arrow.Float64SizeBytes)
	for _, b := range buffers {
		defer b.Release()
	}

	r.extra++
	dist := rand.New(rand.NewSource(r.seed + r.extra))
	out := arrow.Float64Traits.CastFromBytes(buffers[1].Bytes())

	// Map the unit-interval draw onto the requested bounds. The previous
	// formula, NormFloat64() + (max - min), produced a normal sample centred
	// on max-min, so neither min nor max bounded the result.
	span := max - min
	for i := int64(0); i < size; i++ {
		out[i] = min + dist.Float64()*span
	}

	data := array.NewData(arrow.PrimitiveTypes.Float64, int(size), buffers, nil, int(nullcount), 0)
	defer data.Release()
	return array.NewFloat64Data(data)
}
// String generates a string array with size elements. Each element's length
// is taken from a random Int32 array generated over [minLength, maxLength]
// with nullprob as its null probability; slots that are null in that array
// become null strings. Every byte of a non-null string is drawn from the
// inclusive byte range 'A' through 'z'.
func (r *RandomArrayGenerator) String(size int64, minLength, maxLength int, nullprob float64) array.Interface {
	lengths := r.Int32(size, int32(minLength), int32(maxLength), nullprob).(*array.Int32)
	defer lengths.Release()

	bldr := array.NewStringBuilder(r.mem)
	defer bldr.Release()

	r.extra++
	rng := rand.New(rand.NewSource(r.seed + r.extra))

	const alphabet = int32('z') - int32('A') + 1

	// One scratch buffer is reused for every string; string(word) copies the
	// bytes out before the next iteration overwrites them.
	scratch := make([]byte, 0, maxLength)
	for i, count := 0, lengths.Len(); i < count; i++ {
		if !lengths.IsValid(i) {
			bldr.AppendNull()
			continue
		}

		word := scratch[:lengths.Value(i)]
		for j := range word {
			word[j] = uint8(rng.Int31n(alphabet) + int32('A'))
		}
		bldr.Append(string(word))
	}

	return bldr.NewArray()
}