blob: 2dca427e7c6aa3383f46c524d6a0e0c2f55dc2fb [file] [log] [blame]
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
using Test, Arrow, Tables, Dates, PooledArrays, TimeZones, UUIDs, CategoricalArrays
include(joinpath(dirname(pathof(Arrow)), "../test/testtables.jl"))
include(joinpath(dirname(pathof(Arrow)), "../test/integrationtest.jl"))
include(joinpath(dirname(pathof(Arrow)), "../test/dates.jl"))
struct CustomStruct
x::Int
y::Float64
z::String
end
@testset "Arrow" begin
@testset "table roundtrips" begin
for case in testtables
testtable(case...)
end
end # @testset "table roundtrips"
@testset "arrow json integration tests" begin
for file in readdir(joinpath(dirname(pathof(Arrow)), "../test/arrowjson"))
jsonfile = joinpath(joinpath(dirname(pathof(Arrow)), "../test/arrowjson"), file)
println("integration test for $jsonfile")
df = ArrowJSON.parsefile(jsonfile);
io = IOBuffer()
Arrow.write(io, df)
seekstart(io)
tbl = Arrow.Table(io; convert=false);
@test isequal(df, tbl)
end
end # @testset "arrow json integration tests"
@testset "misc" begin
# multiple record batches
t = Tables.partitioner(((col1=Union{Int64, Missing}[1,2,3,4,5,6,7,8,9,missing],), (col1=Union{Int64, Missing}[missing,11],)))
io = Arrow.tobuffer(t)
tt = Arrow.Table(io)
@test length(tt) == 1
@test isequal(tt.col1, vcat([1,2,3,4,5,6,7,8,9,missing], [missing,11]))
@test eltype(tt.col1) === Union{Int64, Missing}
# Arrow.Stream
seekstart(io)
str = Arrow.Stream(io)
state = iterate(str)
@test state !== nothing
tt, st = state
@test length(tt) == 1
@test isequal(tt.col1, [1,2,3,4,5,6,7,8,9,missing])
state = iterate(str, st)
@test state !== nothing
tt, st = state
@test length(tt) == 1
@test isequal(tt.col1, [missing,11])
@test iterate(str, st) === nothing
@test isequal(collect(str)[1].col1, [1,2,3,4,5,6,7,8,9,missing])
@test isequal(collect(str)[2].col1, [missing,11])
# dictionary batch isDelta
t = (
col1=Int64[1,2,3,4],
col2=Union{String, Missing}["hey", "there", "sailor", missing],
col3=NamedTuple{(:a, :b), Tuple{Int64, Union{Missing, NamedTuple{(:c,), Tuple{String}}}}}[(a=Int64(1), b=missing), (a=Int64(1), b=missing), (a=Int64(3), b=(c="sailor",)), (a=Int64(4), b=(c="jo-bob",))]
)
t2 = (
col1=Int64[1,2,5,6],
col2=Union{String, Missing}["hey", "there", "sailor2", missing],
col3=NamedTuple{(:a, :b), Tuple{Int64, Union{Missing, NamedTuple{(:c,), Tuple{String}}}}}[(a=Int64(1), b=missing), (a=Int64(1), b=missing), (a=Int64(5), b=(c="sailor2",)), (a=Int64(4), b=(c="jo-bob",))]
)
tt = Tables.partitioner((t, t2))
tt = Arrow.Table(Arrow.tobuffer(tt; dictencode=true, dictencodenested=true))
@test tt.col1 == [1,2,3,4,1,2,5,6]
@test isequal(tt.col2, ["hey", "there", "sailor", missing, "hey", "there", "sailor2", missing])
@test isequal(tt.col3, vcat(NamedTuple{(:a, :b), Tuple{Int64, Union{Missing, NamedTuple{(:c,), Tuple{String}}}}}[(a=Int64(1), b=missing), (a=Int64(1), b=missing), (a=Int64(3), b=(c="sailor",)), (a=Int64(4), b=(c="jo-bob",))], NamedTuple{(:a, :b), Tuple{Int64, Union{Missing, NamedTuple{(:c,), Tuple{String}}}}}[(a=Int64(1), b=missing), (a=Int64(1), b=missing), (a=Int64(5), b=(c="sailor2",)), (a=Int64(4), b=(c="jo-bob",))]))
t = (col1=Int64[1,2,3,4,5,6,7,8,9,10],)
meta = Dict("key1" => "value1", "key2" => "value2")
Arrow.setmetadata!(t, meta)
meta2 = Dict("colkey1" => "colvalue1", "colkey2" => "colvalue2")
Arrow.setmetadata!(t.col1, meta2)
tt = Arrow.Table(Arrow.tobuffer(t))
@test length(tt) == length(t)
@test tt.col1 == t.col1
@test eltype(tt.col1) === Int64
@test Arrow.getmetadata(tt) == meta
@test Arrow.getmetadata(tt.col1) == meta2
# custom compressors
lz4 = Arrow.CodecLz4.LZ4FrameCompressor(; compressionlevel=8)
Arrow.CodecLz4.TranscodingStreams.initialize(lz4)
t = (col1=Int64[1,2,3,4,5,6,7,8,9,10],)
tt = Arrow.Table(Arrow.tobuffer(t; compress=lz4))
@test length(tt) == length(t)
@test all(isequal.(values(t), values(tt)))
zstd = Arrow.CodecZstd.ZstdCompressor(; level=8)
Arrow.CodecZstd.TranscodingStreams.initialize(zstd)
t = (col1=Int64[1,2,3,4,5,6,7,8,9,10],)
tt = Arrow.Table(Arrow.tobuffer(t; compress=zstd))
@test length(tt) == length(t)
@test all(isequal.(values(t), values(tt)))
# custom alignment
t = (col1=Int64[1,2,3,4,5,6,7,8,9,10],)
tt = Arrow.Table(Arrow.tobuffer(t; alignment=64))
@test length(tt) == length(t)
@test all(isequal.(values(t), values(tt)))
# 53
s = "a" ^ 100
t = (a=[SubString(s, 1:10), SubString(s, 11:20)],)
tt = Arrow.Table(Arrow.tobuffer(t))
@test tt.a == ["aaaaaaaaaa", "aaaaaaaaaa"]
# 49
@test_throws ArgumentError Arrow.Table("file_that_doesnt_exist")
# 52
t = (a=Arrow.DictEncode(string.(1:129)),)
tt = Arrow.Table(Arrow.tobuffer(t))
# 60: unequal column lengths
io = IOBuffer()
@test_throws ArgumentError Arrow.write(io, (a = Int[], b = ["asd"], c=collect(1:100)))
# nullability of custom extension types
t = (a=['a', missing],)
tt = Arrow.Table(Arrow.tobuffer(t))
@test isequal(tt.a, ['a', missing])
# automatic custom struct serialization/deserialization
t = (col1=[CustomStruct(1, 2.3, "hey"), CustomStruct(4, 5.6, "there")],)
tt = Arrow.Table(Arrow.tobuffer(t))
@test length(tt) == length(t)
@test all(isequal.(values(t), values(tt)))
# 76
t = (col1=NamedTuple{(:a,),Tuple{Union{Int,String}}}[(a=1,), (a="x",)],)
tt = Arrow.Table(Arrow.tobuffer(t))
@test length(tt) == length(t)
@test all(isequal.(values(t), values(tt)))
# 89 etc. - test deprecation paths for old UUID autoconversion + UUID FixedSizeListType overloads
u = 0x6036fcbd20664bd8a65cdfa25434513f
@test Arrow.ArrowTypes.arrowconvert(UUID, (value=u,)) === UUID(u)
@test Arrow.ArrowTypes.arrowconvert(UUID, u) === UUID(u)
@test Arrow.ArrowTypes.gettype(UUID) == UInt8
@test Arrow.ArrowTypes.getsize(UUID) == 16
# 98
t = (a = [Nanosecond(0), Nanosecond(1)], b = [uuid4(), uuid4()], c = [missing, Nanosecond(1)])
tt = Arrow.Table(Arrow.tobuffer(t))
@test copy(tt.a) isa Vector{Nanosecond}
@test copy(tt.b) isa Vector{UUID}
@test copy(tt.c) isa Vector{Union{Missing,Nanosecond}}
# copy on DictEncoding w/ missing values
x = PooledArray(["hey", missing])
x2 = Arrow.toarrowvector(x)
@test isequal(copy(x2), x)
# some dict encoding coverage
# signed indices for DictEncodedType #112 #113 #114
av = Arrow.toarrowvector(PooledArray(repeat(["a", "b"], inner = 5)))
@test isa(first(av.indices), Signed)
av = Arrow.toarrowvector(CategoricalArray(repeat(["a", "b"], inner = 5)))
@test isa(first(av.indices), Signed)
av = Arrow.toarrowvector(CategoricalArray(["a", "bb", missing]))
@test isa(first(av.indices), Signed)
@test length(av) == 3
@test eltype(av) == Union{String, Missing}
av = Arrow.toarrowvector(CategoricalArray(["a", "bb", "ccc"]))
@test isa(first(av.indices), Signed)
@test length(av) == 3
@test eltype(av) == String
# 121
a = PooledArray(repeat(string.('S', 1:130), inner=5), compress=true)
@test eltype(a.refs) == UInt8
av = Arrow.toarrowvector(a)
@test eltype(av.indices) == Int16
# 123
t = (x = collect(zip(rand(10), rand(10))),)
tt = Arrow.Table(Arrow.tobuffer(t))
@test tt.x == t.x
# 144
t = Tables.partitioner(((a=Arrow.DictEncode([1,2,3]),), (a=Arrow.DictEncode(fill(1, 129)),)))
tt = Arrow.Table(Arrow.tobuffer(t))
@test length(tt.a) == 132
# 126
t = Tables.partitioner(
(
(a=Arrow.toarrowvector(PooledArray([1,2,3 ])),),
(a=Arrow.toarrowvector(PooledArray([1,2,3,4])),),
(a=Arrow.toarrowvector(PooledArray([1,2,3,4,5])),),
)
)
tt = Arrow.Table(Arrow.tobuffer(t))
@test length(tt.a) == 12
@test tt.a == [1, 2, 3, 1, 2, 3, 4, 1, 2, 3, 4, 5]
t = Tables.partitioner(
(
(a=Arrow.toarrowvector(PooledArray([1,2,3 ], signed=true, compress=true)),),
(a=Arrow.toarrowvector(PooledArray(collect(1:129))),),
)
)
io = IOBuffer()
@test_throws ErrorException Arrow.write(io, t)
# 150
struct Baz{T}
x::Type{T}
end
Arrow.ArrowTypes.registertype!(Baz, NamedTuple{(:x,),Tuple{String}})
Arrow.ArrowTypes.arrowconvert(::Type{NamedTuple{(:x,),Tuple{String}}}, b::Baz) = (x=string(b.x),)
Baz(s::String) = s === "Int64" ? Baz(Int64) : Baz(DataType)
Arrow.ArrowTypes.arrownameof(::Type{<:Baz}) = "JuliaLang.Baz"
t = Arrow.Table(Arrow.tobuffer((x=[Baz(Int64)],)))
@test t.x[1] === Baz(Int64)
struct Foo{T}
x::T
end
Arrow.ArrowTypes.registertype!(Foo, NamedTuple{(:x,),Tuple{String}})
Arrow.ArrowTypes.arrowconvert(::Type{NamedTuple{(:x,),Tuple{String}}}, f::Foo) = (x=string(f.x),)
t = Arrow.Table(Arrow.tobuffer((x=[Foo(1)],)))
@test t.x[1] === Foo(1)
Arrow.ArrowTypes.arrownameof(::Type{<:Foo}) = "JuliaLang.Foo"
t = Arrow.Table(Arrow.tobuffer((x=[Foo(1)],)))
@test t.x[1] === Foo("1")
end # @testset "misc"
end