blob: b63e0115274506dbb20430398bcc89d596c14b7a [file] [log] [blame]
% Licensed under the Apache License, Version 2.0 (the "License"); you may not
% use this file except in compliance with the License. You may obtain a copy of
% the License at
%
% http://www.apache.org/licenses/LICENSE-2.0
%
% Unless required by applicable law or agreed to in writing, software
% distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
% WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
% License for the specific language governing permissions and limitations under
% the License.
-module(json_stream_parse).
-export([events/2, to_ejson/1, collect_object/2]).
% True when X is one of the whitespace characters skipped between tokens:
% space, tab, newline or carriage return. `$\ ` (dollar, backslash, space) is
% the character literal for a space.
-define(IS_WS(X), (X == $\ orelse X == $\t orelse X == $\n orelse X == $\r)).
% True when X is `}`, `]` or `,`. Together with whitespace, the number
% tokenizers use these characters to detect the end of a number.
-define(IS_DELIM(X), (X == $} orelse X == $] orelse X == $,)).
% True when X is an ASCII decimal digit ($0..$9).
-define(IS_DIGIT(X), (X >= $0 andalso X =< $9)).
% Parses the json into events.
%
% The DataFun param is a function that produces the data for parsing. When
% called it must yield a tuple, or the atom done. The first element in the
% tuple is the data itself, and the second element is a function to be called
% next to get the next chunk of data in the stream.
%
% The EventFun is called every time a json element is parsed. It must produce
% a new function to be called for the next event.
%
% Events happen each time a new element in the json string is parsed.
% For simple value types, the data itself is returned:
% Strings
% Integers
% Floats
% true
% false
% null
%
% For arrays, the start of the array is signaled by the event array_start
% atom. The end is signaled by array_end. The events before the end are the
% values, or nested values.
%
% For objects, the start of the object is signaled by the event object_start
% atom. The end is signaled by object_end. Each key is signaled by
% {key, KeyString}, and the following event is the value, or start of the
% value (array_start, object_start).
%
% Entry point: normalizes the input into a data fun and starts parsing one
% JSON value with an empty buffer.
%   * a list is converted to a binary first;
%   * a binary is wrapped in a data fun that yields it once and then `done`;
%   * anything else is used as the data fun itself.
% Returns whatever parse_one/3 returns for that stream.
events(Data, EventFun) when is_list(Data) ->
    events(list_to_binary(Data), EventFun);
events(Data, EventFun) when is_binary(Data) ->
    Exhausted = fun() -> done end,
    SingleChunk = fun() -> {Data, Exhausted} end,
    events(SingleChunk, EventFun);
events(DataFun, EventFun) ->
    parse_one(DataFun, EventFun, <<>>).
% Converts the JSON directly to the erlang representation of Json
% Parses DF (anything accepted by events/2) and returns the EJSON term.
% Every event is accumulated by collect_events/2; the reversed event list is
% then folded into a term by make_ejson/2.
to_ejson(DF) ->
    Collector = fun(Ev) -> collect_events(Ev, []) end,
    {_, FinalCollector, _} = events(DF, Collector),
    % Sending get_results makes the collector hand back what it gathered.
    RevEvents = FinalCollector(get_results),
    [[EJson]] = make_ejson(RevEvents, [[]]),
    EJson.
% This function is used to return complete objects while parsing streams.
%
% Return this function from inside an event function right after getting an
% object_start event. It then collects the remaining events for that object
% and converts it to the erlang representation of Json.
%
% It then calls your ReturnControl function with the erlang object. Your
% return control function then should yield another event function.
%
% This example stream parses an array of objects, calling
% fun do_something_with_the_object/1 for each object.
%
% ev_array(array_start) ->
% fun(Ev) -> ev_object_loop(Ev) end.
%
% ev_object_loop(object_start) ->
% fun(Ev) ->
% json_stream_parse:collect_object(Ev,
% fun(Obj) ->
% do_something_with_the_object(Obj),
% fun(Ev2) -> ev_object_loop(Ev2) end
% end)
% end;
% ev_object_loop(array_end) ->
% ok.
%
% % invoke the parse
% main() ->
% ...
% events(Data, fun(Ev) -> ev_array(Ev) end).
% Starts collecting the events of an object whose object_start event has
% already been seen: nesting depth 0, with object_start as the only event
% recorded so far.
collect_object(Ev, ReturnControl) ->
    InitialDepth = 0,
    SeenEvents = [object_start],
    collect_object(Ev, InitialDepth, ReturnControl, SeenEvents).
% internal methods
% Parses exactly one JSON value from the stream.
% DF is the data fun, EF the current event fun and Acc the unread buffer.
% Returns `none` when the stream holds no further token, otherwise
% {DF2, EF2, Rest} with the updated data fun, event fun and unread data.
% Throws {parse_error, unexpected_token} when the next token cannot start a
% value.
parse_one(DF, EF, Acc) ->
    case toke(DF, Acc) of
        none ->
            none;
        {"{", DF2, Rest} ->
            InObject = EF(object_start),
            {DF3, AfterMembers, Rest2} = parse_object(DF2, InObject, Rest),
            {DF3, AfterMembers(object_end), Rest2};
        {"[", DF2, Rest} ->
            InArray = EF(array_start),
            {DF3, AfterItems, Rest2} = parse_array(DF2, InArray, Rest),
            {DF3, AfterItems(array_end), Rest2};
        % Numbers, the atoms true/false/null and strings (binaries) are
        % reported to the event fun as they are.
        {Scalar, DF2, Rest} when is_number(Scalar); is_atom(Scalar); is_binary(Scalar) ->
            {DF2, EF(Scalar), Rest};
        {_Other, _, _} ->
            err(unexpected_token)
    end.
% Like parse_one/3, but a missing value (`none`) is turned into a
% {parse_error, Error} throw.
must_parse_one(DF, EF, Acc, Error) ->
    case parse_one(DF, EF, Acc) of
        Parsed when Parsed =/= none ->
            Parsed;
        none ->
            err(Error)
    end.
% Like toke/2, but running out of input (`none`) is turned into a
% {parse_error, Error} throw.
must_toke(DF, Data, Error) ->
    Token = toke(DF, Data),
    if
        Token =:= none -> err(Error);
        true -> Token
    end.
% Reads the next token from the buffer, pulling more data from DF when the
% buffer is empty.
% Returns `none` when the stream is exhausted, otherwise {Token, DF2, Rest}
% where Token is one of:
%   * a one-character string for punctuation: "{" "}" "[" "]" "," ":"
%   * a binary for a JSON string
%   * an integer or float for a number
%   * the atoms true, false or null
% Throws {parse_error, bad_token} on anything else and
% {parse_error, expected_number} when `-` is not followed by a digit.
toke(DF, <<>>) ->
    case DF() of
        {Chunk, NextDF} ->
            toke(NextDF, Chunk);
        done ->
            none
    end;
toke(DF, <<Ch, Tail/binary>>) when ?IS_WS(Ch) ->
    toke(DF, Tail);
toke(DF, <<Ch, Tail/binary>>)
        when Ch =:= ${ orelse Ch =:= $} orelse
             Ch =:= $[ orelse Ch =:= $] orelse
             Ch =:= $, orelse Ch =:= $: ->
    % Punctuation is returned as a one-character string.
    {[Ch], DF, Tail};
toke(DF, <<$", Tail/binary>>) ->
    toke_string(DF, Tail, []);
toke(DF, <<$-, Tail/binary>>) ->
    % Make sure at least one byte follows the sign before inspecting it.
    {<<First, _/binary>> = Digits, NextDF} = must_df(DF, 1, Tail, expected_number),
    if
        ?IS_DIGIT(First) ->
            toke_number_leading(NextDF, Digits, "-");
        true ->
            err(expected_number)
    end;
toke(DF, <<First, _/binary>> = Digits) when ?IS_DIGIT(First) ->
    toke_number_leading(DF, Digits, []);
toke(DF, <<$t, Tail/binary>>) ->
    {Unread, NextDF} = must_match(<<"rue">>, DF, Tail),
    {true, NextDF, Unread};
toke(DF, <<$f, Tail/binary>>) ->
    {Unread, NextDF} = must_match(<<"alse">>, DF, Tail),
    {false, NextDF, Unread};
toke(DF, <<$n, Tail/binary>>) ->
    {Unread, NextDF} = must_match(<<"ull">>, DF, Tail),
    {null, NextDF, Unread};
toke(_, _) ->
    err(bad_token).
% Requires the stream to continue with the bytes of Pattern (a binary).
% Returns {Remainder, DF2} with the data following the pattern; throws
% {parse_error, bad_token} when the input ends early or differs.
must_match(Pattern, DF, Data) ->
    Len = byte_size(Pattern),
    % Buffer at least Len bytes so the comparison can be done in one match.
    {Buffered, DF2} = must_df(DF, Len, Data, bad_token),
    case Buffered of
        <<Pattern:Len/binary, Remainder/binary>> ->
            {Remainder, DF2};
        _ ->
            err(bad_token)
    end.
% Fetches the next chunk from the data fun.
% Returns {Data, DF2}; throws {parse_error, Error} when the stream is done.
must_df(DF, Error) ->
    case DF() of
        {_Data, _DF2} = Chunk ->
            Chunk;
        done ->
            err(Error)
    end.
% Keeps pulling chunks from the data fun and appending them to Acc until Acc
% holds at least NeedLen bytes.
% Returns {Acc2, DF2}; throws {parse_error, Error} when the stream ends first.
must_df(DF, NeedLen, Acc, _Error) when byte_size(Acc) >= NeedLen ->
    {Acc, DF};
must_df(DF, NeedLen, Acc, Error) ->
    case DF() of
        {Data, DF2} ->
            must_df(DF2, NeedLen, <<Acc/binary, Data/binary>>, Error);
        done ->
            err(Error)
    end.
% Parses the members of an object; the opening `{` has already been consumed.
% Emits {key, Key} followed by the events of the member's value for each
% member. Returns {DF2, EF2, Rest} once the closing `}` has been consumed
% (the caller emits object_end).
% Throws parse errors unterminated_object, expected_colon, expected_value or
% unexpected_token.
parse_object(DF, EF, Acc) ->
    case must_toke(DF, Acc, unterminated_object) of
        {Key, DF2, Rest} when is_binary(Key) ->
            % The key event is emitted before the colon is looked at.
            parse_object_member(DF2, EF({key, Key}), Rest);
        {"}", DF2, Rest} ->
            {DF2, EF, Rest};
        {_, _, _} ->
            err(unexpected_token)
    end.

% Expects `:` followed by a value, then continues with the separator.
parse_object_member(DF, EF, Acc) ->
    case must_toke(DF, Acc, unterminated_object) of
        {":", DF2, Rest} ->
            {DF3, EF2, Rest2} = must_parse_one(DF2, EF, Rest, expected_value),
            parse_object_separator(DF3, EF2, Rest2);
        _NotColon ->
            err(expected_colon)
    end.

% After a member: `,` continues with the next member, `}` ends the object.
parse_object_separator(DF, EF, Acc) ->
    case must_toke(DF, Acc, unterminated_object) of
        {",", DF2, Rest} ->
            parse_object(DF2, EF, Rest);
        {"}", DF2, Rest} ->
            {DF2, EF, Rest};
        {_, _, _} ->
            err(unexpected_token)
    end.
% Handles what follows an array element: `,` continues with the next element,
% `]` ends the array and returns {DF2, EF, Rest}.
% Throws {parse_error, unterminated_array} when the input ends and
% {parse_error, unexpected_token} on any other token.
parse_array0(DF, EF, Acc) ->
    case must_toke(DF, Acc, unterminated_array) of
        {",", DF2, Rest} ->
            parse_array(DF2, EF, Rest);
        {"]", DF2, Rest} ->
            {DF2, EF, Rest};
        _ ->
            err(unexpected_token)
    end.
% Parses the next array element (or the closing `]`); the opening `[` has
% already been consumed. Emits the element's events and hands over to
% parse_array0/3 for the separator.
% Returns {DF2, EF2, Rest} once the closing `]` has been consumed (the caller
% emits array_end).
% Throws {parse_error, unterminated_array} when the input ends and
% {parse_error, unexpected_token} on a token that cannot start an element.
parse_array(DF, EF, Acc) ->
    case must_toke(DF, Acc, unterminated_array) of
        {"{", DF2, Rest} ->
            InObject = EF(object_start),
            {DF3, AfterMembers, Rest2} = parse_object(DF2, InObject, Rest),
            parse_array0(DF3, AfterMembers(object_end), Rest2);
        {"[", DF2, Rest} ->
            InArray = EF(array_start),
            {DF3, AfterItems, Rest2} = parse_array(DF2, InArray, Rest),
            parse_array0(DF3, AfterItems(array_end), Rest2);
        {"]", DF2, Rest} ->
            {DF2, EF, Rest};
        % Numbers, true/false/null and strings are reported as they are.
        {Scalar, DF2, Rest} when is_number(Scalar); is_atom(Scalar); is_binary(Scalar) ->
            parse_array0(DF2, EF(Scalar), Rest);
        {_, _, _} ->
            err(unexpected_token)
    end.
% Tokenizes the body of a JSON string; the opening quote has already been
% consumed by toke/2.
%   DF  - data fun used to fetch more input when the buffer runs out
%   Acc - bytes of the string decoded so far, in reverse order
% Returns {StringBinary, DF2, Rest} once the closing quote is consumed.
% Throws {parse_error, Reason} where Reason is one of:
%   unterminated_string - the input ended inside the string
%   bad_escape          - a backslash followed by an unknown character
%   missing_hex         - fewer than four characters after \u
%   invalid_hex         - a \u escape containing a non-hex character
%   invalid_utf_char    - \uFFFE, \uFFFF, or a surrogate half that is not
%                         part of a well-formed high/low pair
toke_string(DF, <<>>, Acc) ->
    {Data, DF2} = must_df(DF, unterminated_string),
    toke_string(DF2, Data, Acc);
toke_string(DF, <<$\\,$",Rest/binary>>, Acc) ->
    toke_string(DF, Rest, [$" | Acc]);
toke_string(DF, <<$\\,$\\,Rest/binary>>, Acc) ->
    toke_string(DF, Rest, [$\\ | Acc]);
toke_string(DF, <<$\\,$/,Rest/binary>>, Acc) ->
    toke_string(DF, Rest, [$/ | Acc]);
toke_string(DF, <<$\\,$b,Rest/binary>>, Acc) ->
    toke_string(DF, Rest, [$\b | Acc]);
toke_string(DF, <<$\\,$f,Rest/binary>>, Acc) ->
    toke_string(DF, Rest, [$\f | Acc]);
toke_string(DF, <<$\\,$n,Rest/binary>>, Acc) ->
    toke_string(DF, Rest, [$\n | Acc]);
toke_string(DF, <<$\\,$r,Rest/binary>>, Acc) ->
    toke_string(DF, Rest, [$\r | Acc]);
toke_string(DF, <<$\\,$t,Rest/binary>>, Acc) ->
    toke_string(DF, Rest, [$\t | Acc]);
toke_string(DF, <<$\\,$u,Rest/binary>>, Acc) ->
    {<<A,B,C,D,Data/binary>>, DF2} = must_df(DF,4,Rest,missing_hex),
    case toke_string_hex4(A, B, C, D) of
        High when High >= 16#D800, High =< 16#DBFF ->
            % High surrogate: only meaningful together with the low
            % surrogate escape that must follow it.
            toke_string_low_surrogate(DF2, Data, High, Acc);
        Low when Low >= 16#DC00, Low =< 16#DFFF ->
            % A low surrogate without a preceding high surrogate.
            err(invalid_utf_char);
        NonChar when NonChar == 16#FFFF orelse NonChar == 16#FFFE ->
            err(invalid_utf_char);
        UTFChar ->
            Chars = xmerl_ucs:to_utf8(UTFChar),
            toke_string(DF2, Data, lists:reverse(Chars) ++ Acc)
    end;
toke_string(DF, <<$\\>>, Acc) ->
    % The escape is split across chunks: fetch more and retry with the
    % backslash put back in front.
    {Data, DF2} = must_df(DF, unterminated_string),
    toke_string(DF2, <<$\\,Data/binary>>, Acc);
toke_string(_DF, <<$\\, _/binary>>, _Acc) ->
    err(bad_escape);
toke_string(DF, <<$", Rest/binary>>, Acc) ->
    {list_to_binary(lists:reverse(Acc)), DF, Rest};
toke_string(DF, <<C, Rest/binary>>, Acc) ->
    toke_string(DF, Rest, [C | Acc]).

% Reads the \uXXXX low surrogate that must follow the high surrogate High,
% combines both halves into one code point and continues the string with its
% UTF-8 bytes.
toke_string_low_surrogate(DF, Data, High, Acc) ->
    case must_df(DF, 2, Data, invalid_utf_char) of
        {<<$\\,$u,Rest/binary>>, DF2} ->
            {<<A,B,C,D,Rest2/binary>>, DF3} = must_df(DF2,4,Rest,missing_hex),
            case toke_string_hex4(A, B, C, D) of
                Low when Low >= 16#DC00, Low =< 16#DFFF ->
                    CodePoint = 16#10000
                        + ((High - 16#D800) bsl 10)
                        + (Low - 16#DC00),
                    Chars = xmerl_ucs:to_utf8(CodePoint),
                    toke_string(DF3, Rest2, lists:reverse(Chars) ++ Acc);
                _NotLowSurrogate ->
                    err(invalid_utf_char)
            end;
        {_, _} ->
            err(invalid_utf_char)
    end.

% Converts four hex digit characters to an integer in 0..16#FFFF.
toke_string_hex4(A, B, C, D) ->
    (toke_string_hex_digit(A) bsl 12)
        bor (toke_string_hex_digit(B) bsl 8)
        bor (toke_string_hex_digit(C) bsl 4)
        bor toke_string_hex_digit(D).

% Value of a single hex digit character; anything else (including a sign
% character) is a parse error.
toke_string_hex_digit(Ch) when Ch >= $0, Ch =< $9 -> Ch - $0;
toke_string_hex_digit(Ch) when Ch >= $a, Ch =< $f -> Ch - $a + 10;
toke_string_hex_digit(Ch) when Ch >= $A, Ch =< $F -> Ch - $A + 10;
toke_string_hex_digit(_) -> err(invalid_hex).
% Tokenizes the integer part of a number. Acc holds the characters seen so
% far in reverse order (an optional "-" followed by digits).
% Returns {Integer, DF2, Rest} when the number ends at whitespace, a
% delimiter or the end of the stream; switches to the fraction or exponent
% tokenizer on `.`, `e` or `E`.
% Throws {parse_error, unexpected_character_in_number} on any other byte.
toke_number_leading(DF, <<D, Tail/binary>>, Acc) when ?IS_DIGIT(D) ->
    toke_number_leading(DF, Tail, [D | Acc]);
toke_number_leading(DF, <<Ch, _/binary>> = Unread, Acc)
        when ?IS_WS(Ch) orelse ?IS_DELIM(Ch) ->
    % The terminating character is left in the buffer for the caller.
    Int = list_to_integer(lists:reverse(Acc)),
    {Int, DF, Unread};
toke_number_leading(DF, <<>>, Acc) ->
    case DF() of
        {Chunk, NextDF} ->
            toke_number_leading(NextDF, Chunk, Acc);
        done ->
            Int = list_to_integer(lists:reverse(Acc)),
            {Int, fun() -> done end, <<>>}
    end;
toke_number_leading(DF, <<$., Tail/binary>>, Acc) ->
    toke_number_trailing(DF, Tail, [$. | Acc]);
toke_number_leading(DF, <<E, Tail/binary>>, Acc) when E =:= $e orelse E =:= $E ->
    % An exponent without a fraction: ".0" is inserted in front of the "e"
    % before the exponent tokenizer converts the text with list_to_float/1.
    toke_number_exponent(DF, Tail, "e0." ++ Acc);
toke_number_leading(_, _, _) ->
    err(unexpected_character_in_number).
% Tokenizes the fraction part of a number; the `.` has already been consumed
% and pushed onto Acc. Acc holds the characters seen so far in reverse order,
% so its head is either `.` (no fraction digit yet) or a digit.
% Returns {Float, DF2, Rest} when the number ends at whitespace, a delimiter
% or the end of the stream; switches to the exponent tokenizer on `e` / `E`.
% Throws {parse_error, unexpected_character_in_number} on an unexpected byte,
% including a terminator directly after the `.`, and
% {parse_error, incomplete_number} when the stream ends directly after the
% `.`. Both cases used to reach list_to_float/1 with text such as "1." and
% fail with badarg instead of a parse error.
toke_number_trailing(DF, <<Digit,Rest/binary>>, Acc)
        when ?IS_DIGIT(Digit) ->
    toke_number_trailing(DF, Rest, [Digit | Acc]);
toke_number_trailing(DF, <<C,_/binary>>=Rest, [Last|_]=Acc)
        when (?IS_WS(C) orelse ?IS_DELIM(C)) andalso Last /= $. ->
    {list_to_float(lists:reverse(Acc)), DF, Rest};
toke_number_trailing(DF, <<>>, Acc) ->
    case DF() of
        done ->
            case Acc of
                [$. | _] ->
                    % "1." at the end of the input: no fraction digits.
                    err(incomplete_number);
                _ ->
                    {list_to_float(lists:reverse(Acc)), fun() -> done end, <<>>}
            end;
        {Data, DF2} ->
            toke_number_trailing(DF2, Data, Acc)
    end;
toke_number_trailing(DF, <<"e", Rest/binary>>, [C|_]=Acc) when C /= $. ->
    toke_number_exponent(DF, Rest, [$e|Acc]);
toke_number_trailing(DF, <<"E", Rest/binary>>, [C|_]=Acc) when C /= $. ->
    toke_number_exponent(DF, Rest, [$e|Acc]);
toke_number_trailing(_, _, _) ->
    err(unexpected_character_in_number).
% Tokenizes the exponent part of a number; the `e` has already been consumed
% and pushed onto Acc. Acc holds the characters seen so far in reverse order,
% so its head is `e`, a sign, or an exponent digit.
% Returns {Float, DF2, Rest} when the number ends at whitespace, a delimiter
% or the end of the stream.
% Throws {parse_error, Reason} where Reason is one of:
%   unexpected_character_in_number - an unexpected byte, including a
%                                    terminator before any exponent digit
%   incomplete_number              - the stream ended before any exponent
%                                    digit
%   number_out_of_range            - list_to_float/1 rejected the text (for
%                                    example an exponent too large for a float)
% These cases used to surface as a badarg error from list_to_float/1.
toke_number_exponent(DF, <<Digit,Rest/binary>>, Acc) when ?IS_DIGIT(Digit) ->
    toke_number_exponent(DF, Rest, [Digit | Acc]);
toke_number_exponent(DF, <<Sign,Rest/binary>>, [$e|_]=Acc)
        when Sign == $+ orelse Sign == $- ->
    % A sign is only accepted directly after the `e`.
    toke_number_exponent(DF, Rest, [Sign | Acc]);
toke_number_exponent(DF, <<C,_/binary>>=Rest, [Last|_]=Acc)
        when (?IS_WS(C) orelse ?IS_DELIM(C)) andalso ?IS_DIGIT(Last) ->
    {toke_number_exponent_float(Acc), DF, Rest};
toke_number_exponent(DF, <<>>, Acc) ->
    case DF() of
        done ->
            case Acc of
                [Last | _] when ?IS_DIGIT(Last) ->
                    {toke_number_exponent_float(Acc), fun() -> done end, <<>>};
                _ ->
                    % "1e" or "1e+" at the end of the input.
                    err(incomplete_number)
            end;
        {Data, DF2} ->
            toke_number_exponent(DF2, Data, Acc)
    end;
toke_number_exponent(_, _, _) ->
    err(unexpected_character_in_number).

% Converts the reversed character list to a float, reporting text that
% list_to_float/1 cannot represent as a parse error.
toke_number_exponent_float(Acc) ->
    try
        list_to_float(lists:reverse(Acc))
    catch
        error:badarg ->
            err(number_out_of_range)
    end.
% Aborts parsing by throwing {parse_error, Error}; never returns.
err(Error) ->
    Reason = {parse_error, Error},
    throw(Reason).
% Builds EJSON from a list of parse events given in REVERSE order (last event
% first), using Stack as a stack of value lists under construction.
% Because the events are reversed, an *_end event opens a new value list and
% the matching *_start event closes it and pushes the finished container
% onto the list below. Objects become {Members}, where each member is a
% {Key, Value} pair; arrays become lists.
% Returns the final stack.
make_ejson(RevEvs, Stack) ->
    lists:foldl(fun make_ejson_step/2, Stack, RevEvs).

% Applies a single event to the stack.
make_ejson_step(array_start, [Items, Outer | Deeper]) ->
    [[Items | Outer] | Deeper];
make_ejson_step(array_end, Stack) ->
    [[] | Stack];
make_ejson_step(object_start, [Members, Outer | Deeper]) ->
    [[{Members} | Outer] | Deeper];
make_ejson_step(object_end, Stack) ->
    [[] | Stack];
make_ejson_step({key, Name}, [[Val | Members] | Deeper]) ->
    % The value was seen just before its key, so it sits at the head.
    [[{Name, Val} | Members] | Deeper];
make_ejson_step(Scalar, [Values | Deeper]) ->
    [[Scalar | Values] | Deeper].
% Event fun that records every event it is given. Each call returns a fun
% that accepts the next event; passing get_results returns the recorded
% events, most recent first.
collect_events(get_results, Acc) ->
    Acc;
collect_events(Ev, Acc) ->
    Seen = [Ev | Acc],
    fun(NextEv) -> collect_events(NextEv, Seen) end.
% Event fun that gathers the events of one object.
%   NestCount - number of nested objects currently open inside it
%   Acc       - events seen so far, most recent first
% When the object_end for the outermost object arrives (NestCount is 0) the
% events are converted with make_ejson/2 and ReturnControl is called with the
% resulting object; its result becomes the next event fun. Until then every
% event is recorded and a fun awaiting the next event is returned.
collect_object(object_end, 0, ReturnControl, Acc) ->
    [[Obj]] = make_ejson([object_end | Acc], [[]]),
    ReturnControl(Obj);
collect_object(Ev, NestCount, ReturnControl, Acc) ->
    % Only object boundaries change the nesting depth.
    Depth =
        case Ev of
            object_end -> NestCount - 1;
            object_start -> NestCount + 1;
            _ -> NestCount
        end,
    Seen = [Ev | Acc],
    fun(NextEv) -> collect_object(NextEv, Depth, ReturnControl, Seen) end.