blob: f9dc540b3b1add3855ee50c5912018a71a7829a8 [file] [log] [blame]
-module(hyper_test).
-include_lib("triq/include/triq.hrl").
-include_lib("eunit/include/eunit.hrl").
-record(hyper, {p, registers}). % copy of #hyper in hyper.erl
hyper_test_() ->
RunProp = fun (P) ->
{timeout, 600,
fun () ->
?assert(triq:check(P))
end}
end,
{foreach, fun () -> ok end, fun (_) -> ok end,
[
?_test(is_hyper_t()),
?_test(basic_t()),
?_test(serialization_t()),
?_test(reduce_precision_t()),
{timeout, 60, ?_test(backend_t())},
?_test(encoding_t()),
?_test(register_sum_t()),
{timeout, 30, ?_test(error_range_t())},
?_test(many_union_t()),
?_test(union_t()),
?_test(union_mixed_precision_t()),
?_test(small_big_union_t()),
?_test(intersect_card_t()),
?_test(bad_serialization_t()),
{"Union property with hyper_carry", RunProp(prop_union(hyper_carray))},
{"Union property with hyper_binary", RunProp(prop_union(hyper_binary))},
{"Union property with hyper_array", RunProp(prop_union(hyper_array))},
%{"Union property with hyper_bisect", RunProp(prop_union(hyper_bisect))},
{"Union property with hyper_gb", RunProp(prop_union(hyper_gb))},
RunProp(prop_set()),
RunProp(prop_serialize())
]}.
is_hyper_t() ->
?assert(hyper:is_hyper(hyper:new(4, hyper_binary))),
?assertNot(hyper:is_hyper(foo)).
basic_t() ->
[?assertEqual(1, trunc(
hyper:card(
hyper:insert(<<"1">>, hyper:new(4, Mod)))))
|| Mod <- backends()].
serialization_t() ->
Mod = hyper_binary,
Hyper = hyper:compact(hyper:insert_many(generate_unique(10), hyper:new(5, Mod))),
?assertEqual(trunc(hyper:card(Hyper)),
trunc(
hyper:card(
hyper:from_json(
hyper:to_json(Hyper), Mod)))),
?assertEqual(Hyper#hyper.p, (hyper:from_json(
hyper:to_json(Hyper), Mod))#hyper.p).
reduce_precision_t() ->
random:seed(1, 2, 3),
Card = 1000,
Values = generate_unique(Card),
[begin
HighRes = hyper:insert_many(Values, hyper:new(16, Mod)),
lists:foreach(
fun (P) ->
Estimate = hyper:card(hyper:reduce_precision(P, HighRes)),
% accept error rate for one precision step less
M = trunc(math:pow(2, P-1)),
Error = 1.04 / math:sqrt(M),
?assert(abs(Estimate - Card) < Card * Error)
end, lists:seq(4, 15))
%end || Mod <- backend()].
end || Mod <- [hyper_binary]].
backend_t() ->
Values = generate_unique(10000),
P = 9,
M = trunc(math:pow(2, P)),
Gb = hyper:compact(hyper:insert_many(Values, hyper:new(P, hyper_gb))),
Array = hyper:compact(hyper:insert_many(Values, hyper:new(P, hyper_array))),
%Bisect = hyper:compact(hyper:insert_many(Values, hyper:new(P, hyper_bisect))),
Binary = hyper:compact(hyper:insert_many(Values, hyper:new(P, hyper_binary))),
Carray = hyper:compact(hyper:insert_many(Values, hyper:new(P, hyper_carray))),
{hyper_gb , GbRegisters} = Gb#hyper.registers,
{hyper_array , ArrayRegisters} = Array#hyper.registers,
%{hyper_bisect, BisectRegisters} = Bisect#hyper.registers,
{hyper_binary, BinaryRegisters} = Binary#hyper.registers,
{hyper_carray, CarrayRegisters} = Carray#hyper.registers,
ExpectedRegisters = lists:foldl(
fun (Value, Registers) ->
Hash = crypto:hash(sha, Value),
<<Index:P, RegisterValue:P/bitstring,
_/bitstring>> = Hash,
ZeroCount = hyper:run_of_zeroes(RegisterValue)
+ 1,
case dict:find(Index, Registers) of
{ok, R} when R > ZeroCount ->
Registers;
_ ->
dict:store(Index, ZeroCount, Registers)
end
end, dict:new(), Values),
ExpectedBytes = iolist_to_binary(
[begin
case dict:find(I, ExpectedRegisters) of
{ok, V} ->
<<V:8/integer>>;
error ->
<<0>>
end
end || I <- lists:seq(0, M-1)]),
?assertEqual(ExpectedBytes, hyper_gb:encode_registers(GbRegisters)),
?assertEqual(ExpectedBytes, hyper_array:encode_registers(ArrayRegisters)),
%?assertEqual(ExpectedBytes, hyper_bisect:encode_registers(BisectRegisters)),
?assertEqual(ExpectedBytes, hyper_binary:encode_registers(BinaryRegisters)),
?assertEqual(ExpectedBytes, hyper_carray:encode_registers(CarrayRegisters)),
?assertEqual(hyper:card(Gb),
hyper:card(hyper:from_json(hyper:to_json(Array), hyper_gb))),
?assertEqual(Array, hyper:from_json(hyper:to_json(Array), hyper_array)),
%?assertEqual(Bisect, hyper:from_json(hyper:to_json(Array), hyper_bisect)),
?assertEqual(Binary, hyper:from_json(hyper:to_json(Binary), hyper_binary)),
% TODO this test fails on R20 with a reference mismatch
% ?assertEqual(Carray, hyper:from_json(hyper:to_json(Carray), hyper_carray)),
?assertEqual(hyper:to_json(Gb), hyper:to_json(Array)),
%?assertEqual(hyper:to_json(Gb), hyper:to_json(Bisect)),
?assertEqual(hyper:to_json(Gb), hyper:to_json(Binary)),
?assertEqual(hyper:to_json(Gb), hyper:to_json(Carray)),
?assertEqual(hyper:card(Gb), hyper:card(Array)),
%?assertEqual(hyper:card(Gb), hyper:card(Bisect)),
?assertEqual(hyper:card(Gb), hyper:card(Binary)),
?assertEqual(hyper:card(Gb), hyper:card(Carray)).
encoding_t() ->
[begin
P = 15,
M = trunc(math:pow(2, P)),
Hyper = hyper:insert_many(generate_unique(1000), hyper:new(P, Mod)),
?assertEqual(trunc(hyper:card(Hyper)),
trunc(hyper:card(hyper:from_json(hyper:to_json(Hyper), Mod)))),
{Struct} = hyper:to_json(Hyper),
Serialized = zlib:gunzip(
base64:decode(
proplists:get_value(<<"registers">>, Struct))),
WithPadding = <<Serialized/binary, 0>>,
B = Mod:encode_registers(Mod:decode_registers(Serialized, P)),
BWithPadding = Mod:encode_registers(Mod:decode_registers(WithPadding, P)),
?assertEqual(M, byte_size(B)),
?assertEqual(M, byte_size(BWithPadding)),
?assertEqual(B, BWithPadding)
end || Mod <- backends()].
register_sum_t() ->
Mods = backends(),
P = 4,
M = trunc(math:pow(2, P)),
SetRegisters = [1, 5, 10],
RegisterValue = 3,
ExpectedSum =
(math:pow(2, -0) * M)
- (math:pow(2, -0) * length(SetRegisters))
+ (math:pow(2, -RegisterValue) * length(SetRegisters)),
[begin
Registers = lists:foldl(fun (I, Acc) ->
Mod:set(I, RegisterValue, Acc)
end, Mod:new(P), SetRegisters),
Compact = Mod:compact(Registers),
?assertEqual({Mod, ExpectedSum}, {Mod, Mod:register_sum(Compact)})
end || Mod <- Mods].
error_range_t() ->
Mods = backends(),
Run = fun (Cardinality, P, Mod) ->
lists:foldl(fun (V, H) ->
hyper:insert(V, H)
end, hyper:new(P, Mod), generate_unique(Cardinality))
end,
ExpectedError = 0.02,
P = 14,
random:seed(1, 2, 3),
[begin
Estimate = trunc(hyper:card(Run(Card, P, Mod))),
?assert(abs(Estimate - Card) < Card * ExpectedError)
end || Card <- lists:seq(1000, 50000, 5000),
Mod <- Mods].
many_union_t() ->
random:seed(1, 2, 3),
Card = 100,
NumSets = 3,
[begin
M = trunc(math:pow(2, P)),
Error = 1.04 / math:sqrt(M),
Sets = [sets:from_list(generate_unique(Card))
|| _ <- lists:seq(1, NumSets)],
Filters = [hyper:insert_many(sets:to_list(S), hyper:new(P, Mod))
|| S <- Sets],
ExpectedFilter = hyper:compact(
hyper:insert_many(
lists:flatten([sets:to_list(S) || S <- Sets]),
hyper:new(P, Mod))),
H = hyper:union(Filters),
{Mod, ExpectedRegisters} = ExpectedFilter#hyper.registers,
{Mod, ActualRegisters} = H#hyper.registers,
?assertEqual(Mod:encode_registers(ExpectedRegisters),
Mod:encode_registers(ActualRegisters)),
Delta = abs(sets:size(sets:union(Sets)) - hyper:card(hyper:union(Filters))),
case Delta < (Card * NumSets * Error) of
true ->
ok;
false ->
error_logger:info_msg("too high error, expected ~.2f%, actual ~.2f%~n"
"~p, p = ~p, card = ~p",
[Error, Delta / (Card * NumSets), Mod, P, Card]),
?assert(false)
end
end || Mod <- [hyper_binary_rle],
P <- [15]].
union_t() ->
random:seed(1, 2, 3),
Mod = hyper_binary_rle,
LeftDistinct = sets:from_list(generate_unique(100)),
RightDistinct = sets:from_list(generate_unique(50)
++ lists:sublist(sets:to_list(LeftDistinct),
50)),
LeftHyper = hyper:insert_many(sets:to_list(LeftDistinct),
hyper:new(13, Mod)),
RightHyper = hyper:insert_many(sets:to_list(RightDistinct),
hyper:new(13, Mod)),
UnionHyper = hyper:union([LeftHyper, RightHyper]),
Intersection = hyper:card(LeftHyper)
+ hyper:card(RightHyper) - hyper:card(UnionHyper),
?assert(abs(hyper:card(UnionHyper) -
sets:size(sets:union(LeftDistinct, RightDistinct)))
< 200),
?assert(abs(Intersection - sets:size(
sets:intersection(LeftDistinct, RightDistinct)))
< 200).
union_mixed_precision_t() ->
[?assertEqual(4, trunc(
hyper:card(
hyper:union([
hyper:insert(<<"1">>, hyper:new(4, Mod)),
hyper:insert(<<"2">>, hyper:new(6, Mod)),
hyper:insert(<<"3">>, hyper:new(8, Mod)),
hyper:insert(<<"4">>, hyper:new(16, Mod))
]))))
%|| Mod <- backends()].
|| Mod <- [hyper_binary]].
small_big_union_t() ->
random:seed(1, 2, 3),
SmallCard = 100,
BigCard = 15000, % switches to dense at 10922 items
SmallSet = sets:from_list(generate_unique(SmallCard)),
BigSet = sets:from_list(generate_unique(BigCard)),
SmallHyper = hyper:insert_many(sets:to_list(SmallSet),
hyper:new(15)),
BigHyper = hyper:insert_many(sets:to_list(BigSet),
hyper:new(15)),
%?assertMatch({hyper_bisect, {sparse, _, _, _}}, SmallHyper#hyper.registers),
%?assertMatch({hyper_bisect, {dense, _}}, BigHyper#hyper.registers),
UnionHyper = hyper:union(SmallHyper, BigHyper),
TrueUnion = sets:size(sets:union(SmallSet, BigSet)),
?assert(abs(hyper:card(UnionHyper) - TrueUnion) < TrueUnion * 0.01).
intersect_card_t() ->
random:seed(1, 2, 3),
LeftDistinct = sets:from_list(generate_unique(10000)),
RightDistinct = sets:from_list(generate_unique(5000)
++ lists:sublist(sets:to_list(LeftDistinct),
5000)),
LeftHyper = hyper:insert_many(sets:to_list(LeftDistinct), hyper:new(13)),
RightHyper = hyper:insert_many(sets:to_list(RightDistinct), hyper:new(13)),
IntersectCard = hyper:intersect_card(LeftHyper, RightHyper),
?assert(IntersectCard =< hyper:card(hyper:union(LeftHyper, RightHyper))),
%% NOTE: we can't really say much about the error here,
%% so just pick something and see if the intersection makes sense
Error = 0.05,
?assert((abs(5000 - IntersectCard) / 5000) =< Error).
bad_serialization_t() ->
[begin
P = 15,
M = trunc(math:pow(2, P)),
{ok, WithNewlines} = file:read_file("../test/filter.txt"),
Raw = case zlib:gunzip(
base64:decode(
binary:replace(WithNewlines, <<"\n">>, <<>>))) of
<<RawBytes:M/binary>> ->
RawBytes;
<<RawBytes:M/binary, 0>> ->
%% TODO: test padding
RawBytes
end,
H = hyper:from_json({[{<<"p">>, 15},
{<<"registers">>, base64:encode(zlib:gzip(Raw))}]},
Mod),
{Mod, Registers} = H#hyper.registers,
Encoded = Mod:encode_registers(Registers),
?assertEqual(size(Raw), size(Encoded)),
lists:foreach(fun (I) ->
?assertEqual(binary:at(Raw, I),
binary:at(Encoded, I))
end, lists:seq(0, size(Encoded) - 1)),
?assertEqual(Raw, Mod:encode_registers(Registers)),
?assertEqual({[{<<"p">>, 15},
{<<"registers">>, base64:encode(zlib:gzip(Raw))}]},
hyper:to_json(H))
end || Mod <- backends()].
%%
%% PROPERTIES
%%
backends() ->
[hyper_gb, hyper_array, hyper_binary, hyper_carray].
gen_values() ->
?SIZED(Size, gen_values(Size)).
gen_values(0) ->
[<<(random:uniform(100000000000000)):64/integer>>];
gen_values(Size) ->
[<<(random:uniform(100000000000000)):64/integer>> | gen_values(Size-1)].
%%gen_values(0) ->
%% [non_empty(binary())];
%%gen_values(Size) ->
%% [non_empty(binary()) | gen_values(Size-1)].
gen_filters(Values) ->
?LET(NumFilters, choose(2, 10),
gen_filters(Values, length(Values) div NumFilters, NumFilters)).
gen_filters(Values, Size, 0) ->
[Values];
gen_filters(Values, Size, NumFilters) ->
case split(Size, Values) of
{[], _} ->
[];
{Filter, Rest} ->
[Filter | gen_filters(Rest, Size, NumFilters-1)]
end.
split(N, []) -> {[], []};
split(N, L) when length(L) < N -> {L, []};
split(N, L) -> lists:split(N, L).
gen_getset(P) ->
?SIZED(Size, gen_getset(Size, P)).
gen_getset(0, _P) ->
[];
gen_getset(Size, P) ->
M = trunc(math:pow(2, P)),
?LET({I, V}, {choose(0, M-1), choose(1, 6)},
[{I, V} | gen_getset(Size-1, P)]).
prop_set() ->
?FORALL(
{Mod, P}, {oneof(backends()), choose(4, 16)},
?FORALL(
Values, gen_getset(P),
begin
R = lists:foldl(
fun ({Index, ZeroCount}, Register) ->
Mod:set(Index, ZeroCount, Register)
end, Mod:new(P), Values),
Max = lists:foldl(fun ({I, V}, Acc) ->
case dict:find(I, Acc) of
{ok, OtherV} when OtherV >= V ->
Acc;
_ ->
dict:store(I, V, Acc)
end
end, dict:new(), Values),
Expected = lists:map(fun (I) ->
case dict:find(I, Max) of
{ok, V} ->
<<V:8/integer>>;
error ->
<<0>>
end
end, lists:seq(0, trunc(math:pow(2, P)) - 1)),
case Mod:encode_registers(Mod:compact(R))
=:= iolist_to_binary(Expected) of
true ->
true;
false ->
%% error_logger:info_msg("values~n~p~n"
%% "encoded~n~p~n"
%% "expected~n~p~n",
%% [Values,
%% Mod:encode_registers(R),
%% iolist_to_binary(Expected)]),
false
end
end)).
prop_serialize() ->
?FORALL(
{LeftMod, RightMod, P, Values}, {oneof(backends()),
oneof(backends()),
choose(4, 16),
gen_values()},
begin
Left = hyper:compact(
hyper:insert_many(Values, hyper:new(P, LeftMod))),
Right = hyper:compact(
hyper:insert_many(Values, hyper:new(P, RightMod))),
hyper:to_json(Left) =:= hyper:to_json(Right)
end).
prop_union(Mod) ->
?FORALL(
{P, NumFilters, Values}, {choose(4, 16), choose(2, 5), gen_values()},
begin
Filters = lists:map(fun (Vs) ->
hyper:insert_many(Vs, hyper:new(P, Mod))
end, partition(NumFilters, Values)),
Filter = hyper:insert_many(Values, hyper:new(P, Mod)),
Union = hyper:union(Filters),
hyper:card(Filter) =:= hyper:card(Union)
end).
%%
%% HELPERS
%%
generate_unique(N) ->
generate_unique(lists:usort(random_bytes(N)), N).
generate_unique(L, N) ->
case length(L) of
N ->
L;
Less ->
generate_unique(lists:usort(random_bytes(N - Less) ++ L), N)
end.
random_bytes(N) ->
random_bytes([], N).
random_bytes(Acc, 0) -> Acc;
random_bytes(Acc, N) ->
Int = random:uniform(100000000000000),
random_bytes([<<Int:64/integer>> | Acc], N-1).
%% Lifted from stdlib2, https://github.com/cannedprimates/stdlib2
partition(N, Xs)
when is_integer(N)
, N > 0
, is_list(Xs) ->
Len = length(Xs),
case {Len > 0, Len > N} of
{true, true } -> [take(N, Xs)|partition(N, drop(N, Xs))];
{true, false} -> [Xs];
{false, false} -> []
end.
take(N, _) when N =< 0 -> [];
take(_, []) -> [];
take(N, [X|Xs]) -> [X|take(N - 1, Xs)].
drop(N, Xs) when N =< 0 -> Xs;
drop(_, []) -> [];
drop(N, [_|Xs]) -> drop(N - 1, Xs).