%%% @author Fred Hebert <mononcqc@ferd.ca>
%%%  [http://ferd.ca/]
%%% @doc Recon, as a module, provides access to the high-level functionality
%%% contained in the Recon application.
%%%
%%% It has functions in five main categories:
%%%
%%% <dl>
%%%     <dt>1. State information</dt>
%%%     <dd>Process information is everything that has to do with the
%%%         general state of the node. Functions such as {@link info/1}
%%%         and {@link info/3} are wrappers to provide more details than
%%%         `erlang:process_info/2', while providing it in a production-safe
%%%         manner.</dd>
%%%     <dd>{@link proc_count/2} and {@link proc_window/3} are to be used
%%%         when you require information about processes in a larger sense:
%%%         biggest consumers of given process information (say memory or
%%%         reductions), either absolutely or over a sliding time window,
%%%         respectively.</dd>
%%%     <dd>{@link bin_leak/1} is a function that can be used to try and
%%%         see if your Erlang node is leaking refc binaries. See the function
%%%         itself for more details.</dd>
%%%     <dd>Functions to access node statistics, in a manner somewhat similar
%%%         to what <a href="https://github.com/ferd/vmstats">vmstats</a>
%%%         provides as a library. There are 3 of them:
%%%         {@link node_stats_print/2}, which displays them,
%%%         {@link node_stats_list/2}, which returns them in a list, and
%%%         {@link node_stats/4}, which provides a fold-like interface
%%%         for stats gathering.</dd>
%%%
%%%     <dt>2. OTP tools</dt>
%%%     <dd>This category provides tools to interact with pieces of OTP
%%%         more easily. At this point, the only function included is
%%%         {@link get_state/1}, which works as a wrapper around
%%%         `sys:get_state/1' in R16B01, and provides the required
%%%         functionality for older versions of Erlang.</dd>
%%%
%%%     <dt>3. Code Handling</dt>
%%%     <dd>Specific functions are in `recon' for the sole purpose
%%%         of interacting with source and compiled code.
%%%         {@link remote_load/1} and {@link remote_load/2} will allow
%%%         to take a local module, and load it remotely (in a diskless
%%%         manner) on another Erlang node you're connected to.</dd>
%%%     <dd>{@link source/1} allows to print the source of a loaded module,
%%%         in case it's not available in the currently running node.</dd>
%%%
%%%     <dt>4. Ports and Sockets</dt>
%%%     <dd>To make it simpler to debug some network-related issues,
%%%         recon contains functions to deal with Erlang ports (raw, file
%%%         handles, or inet). Functions {@link tcp/0}, {@link udp/0},
%%%         {@link sctp/0}, {@link files/0}, and {@link port_types/0} will
%%%         list all the Erlang ports of a given type. The latter function
%%%         prints counts of all individual types.</dd>
%%%     <dd>Finally, the functions {@link inet_count/2} and {@link inet_window/3}
%%%         provide the absolute or sliding window functionality of
%%%         {@link proc_count/2} and {@link proc_count/3} to inet ports
%%%         and connections currently on the node.</dd>
%%%
%%%     <dt>5. RPC</dt>
%%%     <dd>These are wrappers to make RPC work simpler with clusters of
%%%         Erlang nodes. Default RPC mechanisms (from the `rpc' module)
%%%         make it somewhat painful to call shell-defined funs over node
%%%         boundaries. The functions {@link rpc/1}, {@link rpc/2}, and
%%%         {@link rpc/3} will do it with a simpler interface.</dd>
%%%     <dd>Additionally, when you're running diagnostic code on remote
%%%         nodes and want to know which node evaluated what result, using
%%%         {@link named_rpc/1}, {@link named_rpc/2}, and {@link named_rpc/3}
%%%         will wrap the results in a tuple that tells you which node it's
%%%         coming from, making it easier to identify bad nodes.</dd>
%%% </dl>
%%% @end
-module(recon).
-export([info/1,info/3,
         proc_count/2, proc_window/3,
         bin_leak/1,
         node_stats_print/2, node_stats_list/2, node_stats/4]).
-export([get_state/1]).
-export([remote_load/1, remote_load/2,
         source/1]).
-export([tcp/0, udp/0, sctp/0, files/0, port_types/0,
         inet_count/2, inet_window/3]).
-export([rpc/1, rpc/2, rpc/3,
         named_rpc/1, named_rpc/2, named_rpc/3]).

%%%%%%%%%%%%%
%%% TYPES %%%
%%%%%%%%%%%%%
-type proc_attrs() :: {pid(),
                       Attr::_,
                       [Name::atom()
                       |{current_function, mfa()}
                       |{initial_call, mfa()}, ...]}.
-type inet_attrs() :: {port(),
                       Attr::_,
                       [{atom(), term()}]}.

-type pid_term() :: pid() | atom() | string()
                  | {global, term()} | {via, module(), term()}
                  | {non_neg_integer(), non_neg_integer(), non_neg_integer()}.

-export_type([proc_attrs/0, inet_attrs/0, pid_term/0]).
%%%%%%%%%%%%%%%%%%
%%% PUBLIC API %%%
%%%%%%%%%%%%%%%%%%

%%% Process Info %%%

%% @doc Equivalent to `info(<A.B.C>)' where `A', `B', and `C' are integers part
%% of a pid
-spec info(N,N,N) -> [{atom(), [{atom(),term()}]},...] when
      N :: non_neg_integer().
info(A,B,C) -> info(recon_lib:triple_to_pid(A,B,C)).

%% @doc Allows to be similar to `erlang:process_info/1', but excludes fields
%% such as the mailbox, which have a tendency to grow and be unsafe when called
%% in production systems. Also includes a few more fields than what is usually
%% given (`monitors', `monitored_by', etc.), and separates the fields in a more
%% readable format based on the type of information contained.
%%
%% Moreover, it will fetch and read information on local processes that were
%% registered locally (an atom), globally (`{global, Name}'), or through
%% another registry supported in the `{via, Module, Name}' syntax (must have a
%% `Module:whereis_name/1' function). Pids can also be passed in as a string
%% (`"<0.39.0>"') or a triple (`{0,39,0}') and will be converted to be used.
-spec info(pid_term()) -> [{Type, [{Key, Value}]},...] when
      Type :: meta | signals | location | memory | work,
      Key :: registered_name | dictionary | group_leader | status
           | links | monitors | monitored_by | trap_exit | initial_call
           | current_stacktrace | memory | message_queue_len | heap_size
           | total_heap_size | garbage_collection | reductions,
      Value :: term().
info(PidTerm) ->
    Pid = recon_lib:term_to_pid(PidTerm),
    Info = fun(List) -> erlang:process_info(Pid, List) end,
    [{meta, Info([registered_name, dictionary, group_leader, status])},
     {signals, Info([links, monitors, monitored_by, trap_exit])},
     {location, Info([initial_call, current_stacktrace])},
     {memory, Info([memory, message_queue_len, heap_size, total_heap_size,
                    garbage_collection])},
     {work, Info([reductions])}].

%% @doc Fetches a given attribute from all processes and returns
%% the biggest `Num' consumers.
%% @todo Implement this function so it only stores `Num' entries in
%% memory at any given time, instead of as many as there are
%% processes.
-spec proc_count(AttributeName, Num) -> [proc_attrs()] when
      AttributeName :: atom(),
      Num :: non_neg_integer().
proc_count(AttrName, Num) ->
    lists:sublist(lists:usort(
        fun({_,A,_},{_,B,_}) -> A > B end,
        recon_lib:proc_attrs(AttrName)
    ), Num).

%% @doc Fetches a given attribute from all processes and returns
%% the biggest entries, over a sliding time window.
%%
%% This function is particularly useful when processes on the node
%% are mostly short-lived, usually too short to inspect through other
%% tools, in order to figure out what kind of processes are eating
%% through a lot resources on a given node.
%%
%% It is important to see this function as a snapshot over a sliding
%% window. A program's timeline during sampling might look like this:
%%
%%  `--w---- [Sample1] ---x-------------y----- [Sample2] ---z--->'
%%
%% Some processes will live between `w' and die at `x', some between `y' and
%% `z', and some between `x' and `y'. These samples will not be too significant
%% as they're incomplete. If the majority of your processes run between a time
%% interval `x'...`y' (in absolute terms), you should make sure that your
%% sampling time is smaller than this so that for many processes, their
%% lifetime spans the equivalent of `w' and `z'. Not doing this can skew the
%% results: long-lived processes, that have 10 times the time to accumulate
%% data (say reductions) will look like bottlenecks when they're not one.
%%
%% Warning: this function depends on data gathered at two snapshots, and then
%% building a dictionary with entries to differentiate them. This can take a
%% heavy toll on memory when you have many dozens of thousands of processes.
-spec proc_window(AttributeName, Num, Milliseconds) -> [proc_attrs()] when
      AttributeName :: atom(),
      Num :: non_neg_integer(),
      Milliseconds :: pos_integer().
proc_window(AttrName, Num, Time) ->
    Sample = fun() -> recon_lib:proc_attrs(AttrName) end,
    {First,Last} = recon_lib:sample(Time, Sample),
    lists:sublist(lists:usort(
        fun({_,A,_},{_,B,_}) -> A > B end,
        recon_lib:sliding_window(First, Last)
    ), Num).

%% @doc Refc binaries can be leaking when barely-busy processes route them
%% around and do little else, or when extremely busy processes reach a stable
%% amount of memory allocated and do the vast majority of their work with refc
%% binaries. When this happens, it may take a very long while before references
%% get deallocated and refc binaries get to be garbage collected, leading to
%% Out Of Memory crashes.
%% This function fetches the number of refc binary references in each process
%% of the node, garbage collects them, and compares the resulting number of
%% references in each of them. The function then returns the `N' processes
%% that freed the biggest amount of binaries, potentially highlighting leaks.
%% 
%% See <a href="http://www.erlang.org/doc/efficiency_guide/binaryhandling.html#id65722">The efficiency guide</a>
%% for more details on refc binaries
-spec bin_leak(pos_integer()) -> [proc_attrs()].
bin_leak(N) ->
    lists:sublist(
        lists:usort(
            fun({K1,V1,_},{K2,V2,_}) -> {V1,K1} =< {V2,K2} end,
            [try
                {ok, {_,Pre,Id}} = recon_lib:proc_attrs(binary, Pid),
                erlang:garbage_collect(Pid),
                {ok, {_,Post,_}} = recon_lib:proc_attrs(binary, Pid),
                {Pid, length(Post)-length(Pre), Id}
            catch
                _:_ -> {Pid, 0}
            end || Pid <- processes()]),
        N).

%% @doc Shorthand for `node_stats(N, Interval, fun(X,_) -> io:format("~p~n",[X]) end, nostate)'.
-spec node_stats_print(Repeat, Interval) -> term() when
      Repeat :: non_neg_integer(),
      Interval :: pos_integer().
node_stats_print(N, Interval) ->
    node_stats(N, Interval, fun(X, _) -> io:format("~p~n",[X]) end, ok).

%% @doc Shorthand for `node_stats(N, Interval, fun(X,Acc) -> [X|Acc] end, [])'
%% with the results reversed to be in the right temporal order.
-spec node_stats_list(Repeat, Interval) -> [Stats] when
      Repeat :: non_neg_integer(),
      Interval :: pos_integer(),
      Stats :: {[Absolutes::{atom(),term()}],
                [Increments::{atom(),term()}]}.
node_stats_list(N, Interval) ->
    lists:reverse(node_stats(N, Interval, fun(X,Acc) -> [X|Acc] end, [])).

%% @doc Gathers statistics `N' time, waiting `Interval' milliseconds between
%% each run, and accumulates results using a folding function `FoldFun'.
%% The function will gather statistics in two forms: Absolutes and Increments.
%%
%% Absolutes are values that keep changing with time, and are useful to know
%% about as a datapoint: process count, size of the run queue, error_logger
%% queue length, and the memory of the node (total, processes, atoms, binaries,
%% and ets tables).
%%
%% Increments are values that are mostly useful when compared to a previous
%% one to have an idea what they're doing, because otherwise they'd never
%% stop increasing: bytes in and out of the node, number of garbage colelctor
%% runs, words of memory that were garbage collected, and the global reductions
%% count for the node.
-spec node_stats(N, Interval, FoldFun, Acc) -> Acc when
      N :: non_neg_integer(),
      Interval :: pos_integer(),
      FoldFun :: fun((Stats, Acc) -> Acc),
      Acc :: term(),
      Stats :: {[Absolutes::{atom(),term()}],
                [Increments::{atom(),term()}]}.
node_stats(N, Interval, FoldFun, Init) ->
    %% Stats is an ugly fun, but it does its thing.
    Stats = fun({{OldIn,OldOut},{OldGCs,OldWords,_}}) ->
        %% Absolutes
        ProcC = erlang:system_info(process_count),
        RunQ = erlang:statistics(run_queue),
        {_,LogQ} = process_info(whereis(error_logger),  message_queue_len),
        %% Mem (Absolutes)
        Mem = erlang:memory(),
        Tot = proplists:get_value(total, Mem),
        ProcM = proplists:get_value(processes_used,Mem),
        Atom = proplists:get_value(atom_used,Mem),
        Bin = proplists:get_value(binary, Mem),
        Ets = proplists:get_value(ets, Mem),
        %% Incremental
        {{input,In},{output,Out}} = erlang:statistics(io),
        GC={GCs,Words,_} = erlang:statistics(garbage_collection),
        BytesIn = In-OldIn,
        BytesOut = Out-OldOut,
        GCCount = GCs-OldGCs,
        GCWords = Words-OldWords,
        {_, Reds} = erlang:statistics(reductions),
         %% Stats Results
        {{[{process_count,ProcC}, {run_queue,RunQ},
           {error_logger_queue_len,LogQ}, {memory_total,Tot},
           {memory_procs,ProcM}, {memory_atoms,Atom},
           {memory_bin,Bin}, {memory_ets,Ets}],
          [{bytes_in,BytesIn}, {bytes_out,BytesOut},
           {gc_count,GCCount}, {gc_words_reclaimed,GCWords},
           {reductions,Reds}]},
         %% New State
         {{In,Out}, GC}}
    end,
    {{input,In},{output,Out}} = erlang:statistics(io),
    Gc = erlang:statistics(garbage_collection),
    recon_lib:time_fold(N, Interval, Stats, {{In,Out}, Gc}, FoldFun, Init).

%%% OTP & Manipulations %%%

%% @doc Fetch the internal state of an OTP process.
%% Calls `sys:get_state/1' directly in R16B01+, and fetches
%% it dynamically on older versions of OTP.
-spec get_state(pid_term()) -> term().
get_state(PidTerm) ->
    Proc = recon_lib:term_to_pid(PidTerm),
    try
        sys:get_state(Proc)
    catch
        error:undef ->
            case sys:get_status(Proc) of
                {status,_Pid,{module,gen_server},Data} ->
                    {data, Props} = lists:last(lists:nth(5, Data)),
                    proplists:get_value("State", Props);
                {status,_Pod,{module,gen_fsm},Data} ->
                    {data, Props} = lists:last(lists:nth(5, Data)),
                    proplists:get_value("StateData", Props)
            end
    end.

%%% Code & Stuff %%%

%% @doc Equivalent to `remote_load(nodes(), Mod)'.
-spec remote_load(module()) -> term().
remote_load(Mod) -> remote_load(nodes(), Mod).

%% @doc Loads one or more modules remotely, in a diskless manner.  Allows to
%% share code loaded locally with a remote node that doesn't have it
-spec remote_load(Nodes, module()) -> term() when
      Nodes :: [node(),...] | node().
remote_load(Nodes=[_|_], Mod) when is_atom(Mod) ->
    {Mod, Bin, File} = code:get_object_code(Mod), 
    rpc:multicall(Nodes, code, load_binary, [Mod, File, Bin]);
remote_load(Nodes=[_|_], Modules) when is_list(Modules) ->
    [remote_load(Nodes, Mod) || Mod <- Modules];
remote_load(Node, Mod) ->
    remote_load([Node], Mod).

%% @doc Obtain the source code of a module compiled with `debug_info'.
%% The returned list sadly does not allow to format the types and typed
%% records the way they look in the original module, but instead goes to
%% an intermediary form used in the AST. They will still be placed
%% in the right module attributes, however.
%% @todo Figure out a way to pretty-print typespecs and records.
-spec source(module()) -> iolist().
source(Module) ->
    Path = code:which(Module),
    {ok,{_,[{abstract_code,{_,AC}}]}} = beam_lib:chunks(Path, [abstract_code]),
    erl_prettypr:format(erl_syntax:form_list(AC)).

%%% Ports Info %%%

%% @doc returns a list of all TCP ports (the data type) open on the node.
-spec tcp() -> [port()].
tcp() -> recon_lib:port_list(name, "tcp_inet").

%% @doc returns a list of all UDP ports (the data type) open on the node.
-spec udp() -> [port()].
udp() -> recon_lib:port_list(name, "udp_inet").

%% @doc returns a list of all SCTP ports (the data type) open on the node.
-spec sctp() -> [port()].
sctp() -> recon_lib:port_list(name, "sctp_inet").

%% @doc returns a list of all file handles open on the node.
-spec files() -> [port()].
files() -> recon_lib:port_list(name, "efile").

%% @doc Shows a list of all different ports on the node with their respective
%% types.
-spec port_types() -> [{pos_integer(),Type::string()}].
port_types() ->
    lists:usort(
        %% sorts by biggest count, smallest type
        fun({KA,VA}, {KB,VB}) -> {VA,KB} > {VB,KA} end,
        recon_lib:count([Name || {_, Name} <- recon_lib:port_list(name)])
    ).

%% @doc Fetches a given attribute from all inet ports (TCP, UDP, SCTP)
%% and returns the biggest `Num' consumers.
%%
%% The values to be used can be the number of octets (bytes) sent, received,
%% or both (`send_oct', `recv_oct', `oct', respectively), or the number
%% of packets sent, received, or both (`send_cnt', `recv_cnt', `cnt',
%% respectively). Individual absolute values for each metric will be returned
%% in the 3rd position of the resulting tuple.
%%
%% @todo Implement this function so it only stores `Num' entries in
%% memory at any given time, instead of as many as there are
%% processes.
-spec inet_count(AttributeName, Num) -> [inet_attrs()] when
      AttributeName :: 'recv_cnt' | 'recv_oct' | 'send_cnt' | 'send_oct'
                     | 'cnt' | 'oct',
      Num :: non_neg_integer().
inet_count(Attr, Num) ->
    lists:sublist(lists:usort(
        fun({_,A,_},{_,B,_}) -> A > B end,
        recon_lib:inet_attrs(Attr)
    ), Num).

%% @doc Fetches a given attribute from all inet ports (TCP, UDP, SCTP)
%% and returns the biggest entries, over a sliding time window.
%%
%% Warning: this function depends on data gathered at two snapshots, and then
%% building a dictionary with entries to differentiate them. This can take a
%% heavy toll on memory when you have many dozens of thousands of ports open.
%%
%% The values to be used can be the number of octets (bytes) sent, received,
%% or both (`send_oct', `recv_oct', `oct', respectively), or the number
%% of packets sent, received, or both (`send_cnt', `recv_cnt', `cnt',
%% respectively). Individual absolute values for each metric will be returned
%% in the 3rd position of the resulting tuple.
-spec inet_window(AttributeName, Num, Milliseconds) -> [inet_attrs()] when
      AttributeName :: 'recv_cnt' | 'recv_oct' | 'send_cnt' | 'send_oct'
                     | 'cnt' | 'oct',
      Num :: non_neg_integer(),
      Milliseconds :: pos_integer().
inet_window(Attr, Num, Time) when is_atom(Attr) ->
    Sample = fun() -> recon_lib:inet_attrs(Attr) end,
    {First,Last} = recon_lib:sample(Time, Sample),
    lists:sublist(lists:usort(
        fun({_,A,_},{_,B,_}) -> A > B end,
        recon_lib:sliding_window(First, Last)
    ), Num).


%%% RPC Utils %%%

%% @doc Shorthand for `rpc([node()|nodes()], Fun)'.
-spec rpc(fun(() -> term())) -> {[Success::_],[Fail::_]}.
rpc(Fun) ->
    rpc([node()|nodes()], Fun).

%% @doc Shorthand for `rpc(Nodes, Fun, infinity)'.
-spec rpc(node()|[node(),...], fun(() -> term())) -> {[Success::_],[Fail::_]}.
rpc(Nodes, Fun) ->
    rpc(Nodes, Fun, infinity).

%% @doc Runs an arbitrary fun (of arity 0) over one or more nodes.
-spec rpc(node()|[node(),...], fun(() -> term()), timeout()) -> {[Success::_],[Fail::_]}.
rpc(Nodes=[_|_], Fun, Timeout) when is_function(Fun,0) ->
    rpc:multicall(Nodes, erlang, apply, [Fun,[]], Timeout);
rpc(Node, Fun, Timeout) when is_atom(Node) ->
    rpc([Node], Fun, Timeout).

%% @doc Shorthand for `named_rpc([node()|nodes()], Fun)'.
-spec named_rpc(fun(() -> term())) -> {[Success::_],[Fail::_]}.
named_rpc(Fun) ->
    named_rpc([node()|nodes()], Fun).

%% @doc Shorthand for `named_rpc(Nodes, Fun, infinity)'.
-spec named_rpc(node()|[node(),...], fun(() -> term())) -> {[Success::_],[Fail::_]}.
named_rpc(Nodes, Fun) ->
    named_rpc(Nodes, Fun, infinity).

%% @doc Runs an arbitrary fun (of arity 0) over one or more nodes, and returns the
%% name of the node that computed a given result along with it, in a tuple.
-spec named_rpc(node()|[node(),...], fun(() -> term()), timeout()) -> {[Success::_],[Fail::_]}.
named_rpc(Nodes=[_|_], Fun, Timeout) when is_function(Fun,0) ->
    rpc:multicall(Nodes, erlang, apply, [fun() -> {node(),Fun()} end,[]], Timeout);
named_rpc(Node, Fun, Timeout) when is_atom(Node) ->
    named_rpc([Node], Fun, Timeout).

