| // Licensed to the Apache Software Foundation (ASF) under one |
| // or more contributor license agreements. See the NOTICE file |
| // distributed with this work for additional information |
| // regarding copyright ownership. The ASF licenses this file |
| // to you under the Apache License, Version 2.0 (the |
| // "License"); you may not use this file except in compliance |
| // with the License. You may obtain a copy of the License at |
| // |
| // http://www.apache.org/licenses/LICENSE-2.0 |
| // |
| // Unless required by applicable law or agreed to in writing, software |
| // distributed under the License is distributed on an "AS IS" BASIS, |
| // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| // See the License for the specific language governing permissions and |
| // limitations under the License. |
| |
| package encoding |
| |
| import ( |
| "encoding/binary" |
| "unsafe" |
| |
| "github.com/apache/arrow/go/v6/arrow" |
| "github.com/apache/arrow/go/v6/parquet" |
| "github.com/apache/arrow/go/v6/parquet/internal/utils" |
| ) |
| |
| // PlainByteArrayEncoder encodes byte arrays according to the spec for Plain encoding |
| // by encoding the length as a int32 followed by the bytes of the value. |
| type PlainByteArrayEncoder struct { |
| encoder |
| |
| bitSetReader utils.SetBitRunReader |
| } |
| |
| // PutByteArray writes out the 4 bytes for the length followed by the data |
| func (enc *PlainByteArrayEncoder) PutByteArray(val parquet.ByteArray) { |
| inc := val.Len() + arrow.Uint32SizeBytes |
| enc.sink.Reserve(inc) |
| vlen := utils.ToLEUint32(uint32(val.Len())) |
| enc.sink.UnsafeWrite((*(*[4]byte)(unsafe.Pointer(&vlen)))[:]) |
| enc.sink.UnsafeWrite(val) |
| } |
| |
| // Put writes out all of the values in this slice to the encoding sink |
| func (enc *PlainByteArrayEncoder) Put(in []parquet.ByteArray) { |
| for _, val := range in { |
| enc.PutByteArray(val) |
| } |
| } |
| |
| // PutSpaced uses the bitmap of validBits to leave out anything that is null according |
| // to the bitmap. |
| // |
| // If validBits is nil, this is equivalent to calling Put |
| func (enc *PlainByteArrayEncoder) PutSpaced(in []parquet.ByteArray, validBits []byte, validBitsOffset int64) { |
| if validBits != nil { |
| if enc.bitSetReader == nil { |
| enc.bitSetReader = utils.NewSetBitRunReader(validBits, validBitsOffset, int64(len(in))) |
| } else { |
| enc.bitSetReader.Reset(validBits, validBitsOffset, int64(len(in))) |
| } |
| |
| for { |
| run := enc.bitSetReader.NextRun() |
| if run.Length == 0 { |
| break |
| } |
| enc.Put(in[int(run.Pos):int(run.Pos+run.Length)]) |
| } |
| } else { |
| enc.Put(in) |
| } |
| } |
| |
| // Type returns parquet.Types.ByteArray for the bytearray encoder |
| func (PlainByteArrayEncoder) Type() parquet.Type { |
| return parquet.Types.ByteArray |
| } |
| |
| // WriteDict writes the dictionary out to the provided slice, out should be |
| // at least DictEncodedSize() bytes |
| func (enc *DictByteArrayEncoder) WriteDict(out []byte) { |
| enc.memo.(BinaryMemoTable).VisitValues(0, func(v []byte) { |
| binary.LittleEndian.PutUint32(out, uint32(len(v))) |
| out = out[arrow.Uint32SizeBytes:] |
| copy(out, v) |
| out = out[len(v):] |
| }) |
| } |
| |
| // PutByteArray adds a single byte array to buffer, updating the dictionary |
| // and encoded size if it's a new value |
| func (enc *DictByteArrayEncoder) PutByteArray(in parquet.ByteArray) { |
| if in == nil { |
| in = empty[:] |
| } |
| memoIdx, found, err := enc.memo.GetOrInsert(in) |
| if err != nil { |
| panic(err) |
| } |
| if !found { |
| enc.dictEncodedSize += in.Len() + arrow.Uint32SizeBytes |
| } |
| enc.addIndex(memoIdx) |
| } |
| |
| // Put takes a slice of ByteArrays to add and encode. |
| func (enc *DictByteArrayEncoder) Put(in []parquet.ByteArray) { |
| for _, val := range in { |
| enc.PutByteArray(val) |
| } |
| } |
| |
| // PutSpaced like with the non-dict encoder leaves out the values where the validBits bitmap is 0 |
| func (enc *DictByteArrayEncoder) PutSpaced(in []parquet.ByteArray, validBits []byte, validBitsOffset int64) { |
| utils.VisitSetBitRuns(validBits, validBitsOffset, int64(len(in)), func(pos, length int64) error { |
| for i := int64(0); i < length; i++ { |
| enc.PutByteArray(in[i+pos]) |
| } |
| return nil |
| }) |
| } |