# julia/src/model.jl - mxnet - Git at Google

 # Licensed to the Apache Software Foundation (ASF) under one
 # or more contributor license agreements.  See the NOTICE file
 # distributed with this work for additional information
 # regarding copyright ownership.  The ASF licenses this file
 # to you under the Apache License, Version 2.0 (the
 # "License"); you may not use this file except in compliance
 # with the License.  You may obtain a copy of the License at
 #
 #   http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing,
 # software distributed under the License is distributed on an
 # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 # KIND, either express or implied.  See the License for the
 # specific language governing permissions and limitations
 # under the License.

 """
     AbstractModel

 The abstract super type of all models in MXNet.jl. Concrete model types, such as
 `FeedForward`, are declared as subtypes of it.
 """
 abstract type AbstractModel end

 """
     FeedForward

 The feedforward model provides a convenient interface to train and predict on
 feedforward architectures like multi-layer MLP, ConvNets, etc. There is no
 explicit handling of the *time index*, but it is relatively easy to implement
 unrolled RNN / LSTM under this framework (*TODO*: add example). For models
 that handle sequential data explicitly, please use *TODO*...

 Only `arch` and `ctx` are set by the inner constructors; `arg_params`, `aux_params`
 and `pred_exec` stay undefined until they are assigned later (for example by
 `init_model` and by the predictor setup).
 """
 mutable struct FeedForward <: AbstractModel
   # symbolic description of the network
   arch        :: SymbolicNode
   # devices the model computes on
   ctx         :: Vector{Context}

   # parameters and auxiliary states, keyed by name
   arg_params  :: Dict{Symbol}
   aux_params  :: Dict{Symbol}

   # executor used for prediction; undefined or `nothing` until it is set up
   pred_exec   :: Union{Executor,Cvoid}

   # leave the rest fields undefined
   FeedForward(arch::SymbolicNode, ctx::Vector{Context}) = new(arch, ctx)
   # a single context is wrapped into a one-element vector
   FeedForward(arch::SymbolicNode, ctx::Context) = new(arch, [ctx])
 end

 """
     _split_inputs(batch_size::Int, n_split::Int)

 Split `batch_size` samples into `n_split` contiguous pieces for data parallelization.

 Returns a vector of length `n_split`, with each entry a `UnitRange{Int}` indicating the
 slice index for that piece. The pieces differ in length by at most one; when
 `batch_size` is not divisible by `n_split`, the leading pieces get the extra samples.

 Throws an `AssertionError` unless `1 <= n_split <= batch_size`.
 """
 function _split_inputs(batch_size::Int, n_split::Int)
   @assert(n_split >= 1)
   @assert(batch_size >= n_split)

   # exact integer arithmetic, no round trip through floating point
   per_split, extra = divrem(batch_size, n_split)

   slices = Vector{UnitRange{Int}}(undef, n_split)
   start  = 1
   for i in 1:n_split
     # the first `extra` pieces are one sample longer
     len       = per_split + (i <= extra ? 1 : 0)
     slices[i] = start:(start + len - 1)
     start    += len
   end
   return slices
 end

 """
     FeedForward(arch::SymbolicNode; context = [cpu()])

 Build a `FeedForward` model for the network `arch`.

 # Arguments:
 * `arch`: the architecture of the network constructed using the symbolic API.
 * `context`: keyword argument, the devices on which this model should do computation.
           It could be a single `Context` or a list of `Context` objects. In the latter
           case, data parallelization will be used for training. If no context is
           provided, the default context `cpu()` will be used.
 """
 function FeedForward(arch::SymbolicNode; context::Union{Context,Vector{Context}} = [cpu()])
   return FeedForward(arch, context)
 end

 """
     init_model(self, initializer; overwrite=false, input_shapes...)

 Initialize the weights in the model.

 This method will be called automatically when training a model. So there is usually no
 need to call this method unless one needs to inspect a model with only randomly initialized
 weights.

 # Arguments:
 * `self::FeedForward`: the model to be initialized.
 * `initializer::AbstractInitializer`: an initializer describing how the weights should be initialized.
 * `overwrite::Bool`: keyword argument, force initialization even when weights already exists.
 * `input_shapes`: the shape of all data and label inputs to this model, given as keyword arguments.
                   For example, `data=(28,28,1,100), label=(100,)`.
 """
 function init_model(self::FeedForward, initializer::AbstractInitializer; overwrite::Bool=false, input_shapes...)
   # every argument of the network: data, labels and learnable parameters
   arg_names   = list_arguments(self.arch)
   input_names = [entry[1] for entry in input_shapes]

   # whatever is not fed as an input is a parameter
   param_names = setdiff(arg_names, input_names)
   aux_names   = list_auxiliary_states(self.arch)

   arg_shapes, _, aux_shapes = infer_shape(self.arch; input_shapes...)

   # make sure there is a dictionary to look existing weights up in
   isdefined(self, :arg_params) || (self.arg_params = Dict{Symbol, NDArray}())
   isdefined(self, :aux_params) || (self.aux_params = Dict{Symbol, NDArray}())

   arg_params = Dict{Symbol,NDArray}()
   aux_params = Dict{Symbol,NDArray}()

   # re-use existing parameter arrays of matching shape, allocate the rest
   for (name, shape) in zip(arg_names, arg_shapes)
     name in param_names || continue
     if haskey(self.arg_params, name)
       existing = self.arg_params[name]
       if shape == size(existing)
         arg_params[name] = existing
         continue
       end
       @warn("Shape mismatch for $name. Overwriting with new one.")
       # dropping the stale entry marks the new array as needing initialization
       delete!(self.arg_params, name)
     end
     arg_params[name] = NDArray(undef, shape)
   end

   # same policy for the auxiliary states
   for (name, shape) in zip(aux_names, aux_shapes)
     if haskey(self.aux_params, name)
       existing = self.aux_params[name]
       if shape == size(existing)
         aux_params[name] = existing
         continue
       end
       @warn("Shape mismatch for $name. Overwriting with new one.")
       delete!(self.aux_params, name)
     end
     aux_params[name] = NDArray(undef, shape)
   end

   # initialize what is new, or everything when `overwrite` is requested
   for (name, array) in arg_params
     (overwrite || !haskey(self.arg_params, name)) && init(initializer, name, array)
   end
   for (name, array) in aux_params
     (overwrite || !haskey(self.aux_params, name)) && init(initializer, name, array)
   end

   self.arg_params = arg_params
   self.aux_params = aux_params

   return (arg_names, param_names, aux_names)
 end

 # Bind (or re-use) the executor stored in `self.pred_exec` for the given data shapes.
 # A new executor is bound when none exists yet or when `overwrite` is set; otherwise
 # the requested shapes are checked against the existing executor.
 function _setup_predictor(self::FeedForward, overwrite::Bool=false; verbosity::Integer = 1, data_shapes...)
   rebind = overwrite || !isdefined(self, :pred_exec) || isa(self.pred_exec, Cvoid)

   if !rebind
     # make sure the new setup is compatible with the existing one
     for (d_name, d_shape) in data_shapes
       @assert(d_shape == size(self.pred_exec.arg_dict[d_name]),
               "Shape of $d_name mismatch with existing predictor, use overwrite=true overwrite existing predictor")
     end
     return nothing
   end

   if !(isdefined(self, :arg_params) && isdefined(self, :aux_params))
     @assert(false, "Model weights not defined, please init or train the model, or load from file")
   end

   # the predictor use only the first device
   device = self.ctx[1]
   self.pred_exec = simple_bind(self.arch, device; grad_req=GRAD_NOP, data_shapes...)
   dbg_str = mx.debug_str(self.pred_exec)
   if verbosity >= 1
     @info(string("TempSpace: ", split(dbg_str, ['\n'])[end-2]..., " on ", device))
   end
   return copy_params_from(self.pred_exec, self.arg_params, self.aux_params)
 end

 """
     predict(self, data; overwrite=true, callback=nothing, verbosity=1)

 Predict using an existing model. The model should be already initialized, or trained or loaded from
 a checkpoint. There is an overloaded function that allows to pass the callback as the first argument,
 so it is possible to do

 ```julia
 predict(model, data) do batch_output
   # consume or write batch_output to file
 end
 ```

 # Arguments:
 * `self::FeedForward`:  the model.
 * `data::AbstractDataProvider`: the data to perform prediction on.
 * `overwrite::Bool`: an `Executor` is initialized the first time predict is called. The memory
                      allocation of the `Executor` depends on the mini-batch size of the test
                      data provider. If you call predict twice with data provider of the same batch-size,
                      then the executor can potentially be re-used. So, if `overwrite` is false,
                      we will try to re-use, and raise an error if batch-size changed. If `overwrite`
                      is true (the default), a new `Executor` will be created to replace the old one.
 * `callback`: a function or `nothing` (the default). When given, it is called after every
           mini-batch with the outputs of the executor (the single output itself when the
           model has exactly one output), and nothing is accumulated.
 * `verbosity::Integer`: Determines the verbosity of the print messages. Higher numbers
           lead to more verbose printing. Acceptable values are
           - `0`: Do not print anything during prediction
           - `1`: Print allocation information during prediction

 # Returns
 `nothing` when a callback is given or the model has no outputs. Otherwise the outputs of
 all mini-batches concatenated along the last dimension: a single array when the model
 has one output, a vector of arrays otherwise.

 !!! note
     Prediction is computationally much less costly than training, so the bottleneck sometimes becomes the IO
     for copying mini-batches of data. Since there is no concern about convergence in prediction, it is better
     to set the mini-batch size as large as possible (limited by your device memory) if prediction speed is a
     concern.

     For the same reason, currently prediction will only use the first device even if multiple devices are
     provided to construct the model.

 !!! note
     If you perform further training after prediction, the weights are not automatically synchronized if
     `overwrite` is set to false and the old predictor is re-used. In this case
     setting `overwrite` to true (the default) will re-initialize the predictor the next time you call
     predict and synchronize the weights again.

 See also [`train`](@ref), [`fit`](@ref), [`init_model`](@ref), and [`load_checkpoint`](@ref)
 """
 function predict(callback::Function, self::FeedForward, data::AbstractDataProvider;
                  overwrite::Bool = true, verbosity::Integer = 1)
   predict(self, data; overwrite = overwrite, callback=callback, verbosity = verbosity)
 end
 function predict(self::FeedForward, data::AbstractDataProvider;
                  overwrite::Bool = true, callback::Union{Function,Cvoid}=nothing, verbosity::Integer = 1)
   data_shapes = provide_data(data)
   data_names  = [x[1] for x in data_shapes]
   _setup_predictor(self, overwrite; verbosity = verbosity, data_shapes...)

   # input arrays of the executor that each mini-batch is loaded into
   data_arrays = [self.pred_exec.arg_dict[name] for name in data_names]
   # one list of per-batch results for every output of the network
   output_list = [Array{MX_float}[] for i=1:length(self.pred_exec.outputs)]
   for batch in eachbatch(data)
     load_data!(data, batch, data_arrays)
     forward(self.pred_exec, is_train=false)
     if callback === nothing
       # no callback, accumulate the data and return at the end
       for (o_list, o_nd) in zip(output_list, self.pred_exec.outputs)
         # keep only the samples that really belong to this batch
         push!(o_list, copy(slice(o_nd, 1:count_samples(data, batch))))
       end
     else
       outputs = self.pred_exec.outputs
       if length(outputs) == 1
         outputs = outputs[1]
       end
       callback(outputs)
     end
   end

   if callback !== nothing
     # callback exists, do not accumulate data
     return nothing
   end

   if isempty(output_list)
     # maybe model does not have outputs
     return nothing
   end
   if isempty(output_list[1])
     # maybe no output because data is empty
     return length(output_list) == 1 ? output_list[1] : output_list
   end

   # concatenate along mini-batches
   output_arrays = [cat(x..., dims = ndims(x[1])) for x in output_list]
   if length(output_arrays) == 1
     # only 1 output, return it directly, instead of a list
     output_arrays = output_arrays[1]
   end
   return output_arrays
 end

 # Initialize the model weights, taking the input shapes from the data provider.
 function _init_model(self::FeedForward, data::AbstractDataProvider,
                      initializer::AbstractInitializer, overwrite::Bool)
   # data shapes first, then label shapes, all forwarded as keyword arguments
   input_shapes = [provide_data(data)..., provide_label(data)...]
   return init_model(self, initializer; overwrite = overwrite, input_shapes...)
 end

 # Create the key-value store for `kv_type`, or return `nothing` when a single device
 # is used and no "dist" store type is requested. For `:local`, the concrete store
 # type is picked from the size of the largest parameter.
 function _create_kvstore(kv_type::Symbol, num_device::Int, arg_params::Dict{Symbol}, verbosity::Int)
   if num_device == 1 && !occursin(r"dist", string(kv_type))
     return nothing
   end

   if kv_type == :local
     largest = maximum([prod(size(param)) for (k,param) in arg_params])
     kv_type = largest < 1024 * 1024 * 16 ? :local_update_cpu : :local_allreduce_cpu
     verbosity >= 2 && @info("Auto-select kvstore type = $kv_type")
   end
   return KVStore(kv_type)
 end

 # Options accepted by `fit`, together with their defaults; `fit` builds an instance
 # from its keyword arguments via `TrainingOptions(; kwargs...)`.
 @defstruct TrainingOptions (
   initializer :: AbstractInitializer = UniformInitializer(0.01),
   n_epoch     :: Int = 10,
   eval_data   :: Union{Cvoid,AbstractDataProvider} = nothing,
   eval_metric :: AbstractEvalMetric = Accuracy(),
   kvstore     :: Union{Symbol,KVStore} = :local,
   force_init  :: Bool = false,
   callbacks   :: Vector{AbstractCallback} = AbstractCallback[],
   verbosity   :: Int = 3,
   η_decay     :: Symbol = :epoch,
 )

 # Call every callback that is an instance of `type_filter` and collect the results;
 # callbacks of any other type contribute `nothing`.
 function _invoke_callbacks(m::FeedForward, callbacks::Vector{AbstractCallback},
                            state::OptimizationState, type_filter::Type;
                            metric = Vector{Tuple{Symbol,Real}}())
   map(callbacks) do cb
     if !(cb isa type_filter)
       nothing
     elseif type_filter == AbstractEpochCallback
       # epoch callback have extra access to the model object
       cb(m, state, metric)
     else
       cb(state)
     end
   end
 end

 """
     train(model :: FeedForward, ...)

 Alias to [`fit`](@ref); all positional and keyword arguments are forwarded unchanged.
 """
 function train(m::FeedForward, opt::AbstractOptimizer, data::AbstractDataProvider; kw...)
   return fit(m, opt, data; kw...)
 end

 """
     fit(model::FeedForward, optimizer, data; kwargs...)

 Train the `model` on `data` with the `optimizer`. Returns `nothing`; the trained
 weights are copied back into `model.arg_params` and `model.aux_params`.

 * `model::FeedForward`: the model to be trained.
 * `optimizer::AbstractOptimizer`: the optimization algorithm to use.
 * `data::AbstractDataProvider`: the training data provider.
 * `n_epoch::Int`: default 10, the number of full data-passes to run.
 * `eval_data::AbstractDataProvider`: keyword argument, default `nothing`. The data provider for
           the validation set. Its batch size must equal the training batch size. The
           validation pass runs the network with `is_train=false`.
 * `eval_metric::AbstractEvalMetric`: keyword argument, default [`Accuracy()`](@ref). The metric used
           to evaluate the training performance. If `eval_data` is provided, the same metric is also
           calculated on the validation set.
 * `kvstore`: keyword argument, a `KVStore` or a `Symbol`, default `:local`. The key-value store
           used to synchronize gradients and parameters when multiple devices are used for training.
 * `initializer::AbstractInitializer`: keyword argument, default `UniformInitializer(0.01)`.
 * `force_init::Bool`: keyword argument, default false. By default, the random initialization using the
           provided `initializer` will be skipped if the model weights already exists, maybe from a previous
           call to [`train`](@ref) or an explicit call to [`init_model`](@ref) or [`load_checkpoint`](@ref). When
           this option is set, it will always do random initialization at the beginning of training.
 * `callbacks::Vector{AbstractCallback}`: keyword argument, default `[]`. Callbacks to be invoked at each epoch or mini-batch,
           see `AbstractCallback`.
 * `verbosity::Int`: Determines the verbosity of the print messages. Higher numbers
           lead to more verbose printing. Acceptable values are
           - `0`: Do not print anything during training
           - `1`: Print starting and final messages
           - `2`: Print one time messages and a message at the start of each epoch
           - `3`: Print a summary of the training and validation accuracy for each epoch
 * `η_decay::Symbol`: `:epoch` or `:batch`, decay learning rate on epoch or batch.
 """
 function fit(self::FeedForward, optimizer::AbstractOptimizer, data::AbstractDataProvider;
              kwargs...)
   opts = TrainingOptions(; kwargs...)

   opts.verbosity >= 1 && @info("Start training on $(self.ctx)")

   batch_size  = get_batch_size(data)
   num_dev     = length(self.ctx)
   # one slice of every mini-batch per device
   slices      = _split_inputs(batch_size, num_dev)

   # initialize parameters
   opts.verbosity >= 2 && @info("Initializing parameters...")
   arg_names, param_names, aux_names = _init_model(self, data, opts.initializer, opts.force_init)

   # setup kvstore
   kvstore = opts.kvstore
   if isa(kvstore, Symbol)
     opts.verbosity >= 2 && @info("Creating KVStore...")
     kvstore = _create_kvstore(kvstore, length(self.ctx), self.arg_params, opts.verbosity)
   end

   # without a kvstore, or with a local_allreduce one, the update is done locally
   update_on_kvstore = true
   if isa(kvstore, Cvoid) || occursin(r"local_allreduce", string(get_type(kvstore)))
     update_on_kvstore = false
   end

   # get grad attribute to allow for freezing
   freeze_names = Symbol[]
   for (attr, value) in list_all_attr(self.arch)
     sattr = string(attr)
     if endswith(sattr, "grad") && value == "freeze"
       # drop the last five characters of the attribute name to get the parameter name
       push!(freeze_names, Symbol(sattr[1:end-5]))
     end
   end
   # Needs to correspond to the correct id in the update loop layer idx=1:length(param_names).
   freeze_idx = filter(i -> in(param_names[i], freeze_names), 1:length(param_names))

   # Setup grad_req as a dictionary
   grad_req = Dict{Symbol,GRAD_REQ}()
   for param in param_names
     if in(param, freeze_names)
       grad_req[param] = GRAD_NOP
     else
       grad_req[param] = GRAD_WRITE
     end
   end

   # bind one executor per device; the last dimension of every input shape is
   # replaced by the length of that device's slice
   train_execs = Array{Executor}(undef, num_dev)
   for i = 1:num_dev
     data_shapes = Dict(map((x) -> x[1] => tuple(x[2][1:end-1]...,length(slices[i])), provide_data(data)))
     label_shapes = Dict(map((x) -> x[1] => tuple(x[2][1:end-1]...,length(slices[i])), provide_label(data)))
     train_execs[i] = simple_bind(self.arch, self.ctx[i]; grad_req=grad_req, data_shapes..., label_shapes...)
     dbg_str = mx.debug_str(train_execs[i])
     opts.verbosity >= 2 && @info(string("TempSpace: ", split(dbg_str, ['\n'])[end-2]..., " on ", self.ctx[i]))

     copy_params_from(train_execs[i], self.arg_params, self.aux_params)
   end

   # set up input data structures
   data_names   = [x[1] for x in provide_data(data)]
   label_names  = [x[1] for x in provide_label(data)]

   data_arrays  = [SlicedNDArray[(slices[i], exec.arg_dict[name]) for (i,exec) in enumerate(train_execs)]
                   for name in data_names]
   label_arrays = [SlicedNDArray[(slices[i], exec.arg_dict[name]) for (i,exec) in enumerate(train_execs)]
                   for name in label_names]

   # positions of the parameters within the full argument list
   param_idx    = filter(i -> in(arg_names[i], param_names), 1:length(arg_names))

   param_arrays = [NDArray[exec.arg_arrays[i] for exec in train_execs] for i in param_idx]
   grad_arrays  = [NDArray[exec.grad_arrays[i] for exec in train_execs] for i in param_idx]
   aux_arrays   = [NDArray[exec.aux_arrays[i] for exec in train_execs] for i = 1:length(aux_names)]

   op_state = OptimizationState(batch_size)
   # set up the gradient rescaling if user not set
   iszero(optimizer.scale) && (optimizer.scale = 1 / batch_size)

   if !update_on_kvstore
     updater = getupdater(optimizer)
   end

   if !isa(kvstore, Cvoid)
     if update_on_kvstore
       set_optimizer(kvstore, optimizer)
     end

     opts.verbosity >= 2 && @info("Initializing KVStore...")
     # init kv with gradients
     for idx = 1:length(param_arrays)
       param_on_devs = param_arrays[idx]

       init!(kvstore, idx, self.arg_params[param_names[idx]])

       if update_on_kvstore
         # pull weights back
         pull!(kvstore, idx, param_on_devs, priority=-idx)
       end
     end
   end

   # set up output and labels in CPU for evaluation metric
   output_shapes = [tuple(size(x)[1:end-1]...,batch_size) for x in train_execs[1].outputs]
   cpu_dev = Context(CPU)
   cpu_output_arrays = [NDArray(undef, shape, ctx = cpu_dev) for shape in output_shapes]
   cpu_label_arrays  = [NDArray(undef, shape, ctx = cpu_dev) for (name,shape) in provide_label(data)]

   # invoke callbacks on epoch 0
   _invoke_callbacks(self, opts.callbacks, op_state, AbstractEpochCallback)

   opts.verbosity >= 2 && @info("Start training...")
   for i_epoch = 1:opts.n_epoch
     time_start = time()
     reset!(opts.eval_metric)

     op_state.curr_epoch = i_epoch
     op_state.curr_batch = 0

     # invoke callbacks on iteration 0
     _invoke_callbacks(self, opts.callbacks, op_state, AbstractBatchCallback)

     for batch in eachbatch(data)
       load_data!(data, batch, data_arrays)
       load_label!(data, batch, label_arrays)

       # forward and backward
       for (texec, islice) in zip(train_execs, slices)
         forward(texec, is_train=true)

         # copy outputs into cpu ndarray, for evaluation metric
         for (cpu_out, dev_out) in zip(cpu_output_arrays, texec.outputs)
           copy!(slice(cpu_out, islice), dev_out)
         end

         backward(texec)
       end

       op_state.curr_iter  += 1
       op_state.curr_batch += 1

       # update parameters
       for idx = 1:length(param_names)
         if in(idx, freeze_idx)
           continue # Skip parameter update entirely
         end

         # gradient synchronization
         if !isa(kvstore, Cvoid)
           # push gradient, priority is negative index
           push!(kvstore, idx, grad_arrays[idx], priority=-idx)
           if update_on_kvstore
             # pull back the weights
             pull!(kvstore, idx, param_arrays[idx], priority=-idx)
           else
             # pull back the sum-ed gradients, to the same locations
             pull!(kvstore, idx, grad_arrays[idx], priority=-idx)
           end
         end

         if !update_on_kvstore
           # manual updating
           for i_dev = 1:num_dev
             # create a fake index, so that the updater create states
             # for different param AND different devices, TODO(mli)
             # use a better solution later
             fake_idx = idx * num_dev + i_dev
             updater(fake_idx, grad_arrays[idx][i_dev], param_arrays[idx][i_dev])
           end
         end
       end

       # trigger learning rate decay
       opts.η_decay == :batch && update!(optimizer.η_sched)

       # invoke callbacks after finishing each iteration
       _invoke_callbacks(self, opts.callbacks, op_state, AbstractBatchCallback)

       # update evaluation metric on training set
       load_label!(data, batch, cpu_label_arrays)
       update!(opts.eval_metric, cpu_label_arrays, cpu_output_arrays)
     end # end of one epoch

     time_stop = time()
     metric = get(opts.eval_metric)
     opts.verbosity >= 2 && @info(format("== Epoch {1:0>3d}/{2:0>3d} ==========", i_epoch, opts.n_epoch))
     if opts.verbosity >= 3
         @info("## Training summary")
         for (name, value) in metric
             @info(format("{1:>18s} = {2:.4f}", string(name), value))
         end
         @info(format("{1:>18s} = {2:.4f} seconds", "time", time_stop-time_start))
     end

     # evaluation on validation set
     if !isa(opts.eval_data, Cvoid)
       # because we are re-using the memory allocated for the training network,
       # the batch_size of the validation dataset must be the same as the training
       # batch_size
       @assert(get_batch_size(opts.eval_data) == batch_size)

       reset!(opts.eval_metric)
       for batch in eachbatch(opts.eval_data)
         load_data!(opts.eval_data, batch, data_arrays)

         # forward only, in inference mode: validation must not run the network
         # in training mode
         for (texec, islice) in zip(train_execs, slices)
           forward(texec, is_train=false)

           # copy outputs into cpu ndarray, for evaluation metric
           for (cpu_out, dev_out) in zip(cpu_output_arrays, texec.outputs)
             copy!(slice(cpu_out, islice), dev_out)
           end
         end
         load_label!(opts.eval_data, batch, cpu_label_arrays)
         update!(opts.eval_metric, cpu_label_arrays, cpu_output_arrays)
       end

       if opts.verbosity >= 3
           @info("## Validation summary")
           for (name, value) in get(opts.eval_metric)
             @info(format("{1:>18s} = {2:.4f}", string(name), value))
           end
       end
     end

     # weights are synchronized back after the last epoch, and after every epoch
     # when an epoch callback is registered
     if i_epoch == opts.n_epoch || any(x->isa(x, AbstractEpochCallback), opts.callbacks)
       # copy data back to cpu
       for (name, weights) in zip(param_names, param_arrays)
         # average parameters across devices
         weight = +([copy(w, cpu()) for w in weights]...) / length(weights)
         copy!(self.arg_params[name], weight)
       end
       for (name, aux_devs) in zip(aux_names, aux_arrays)
         aux_avg = +([copy(aux, cpu()) for aux in aux_devs]...) / length(aux_devs)
         copy!(self.aux_params[name], aux_avg)
       end
     end

     # trigger learning rate decay
     opts.η_decay == :epoch && update!(optimizer.η_sched)

     # `metric` is the training metric of this epoch, captured before validation
     _invoke_callbacks(self, opts.callbacks, op_state, AbstractEpochCallback; metric=metric)
   end # end of all epochs

   opts.verbosity >= 1 && @info("Finish training on $(self.ctx)")
   nothing
 end

 # Save the architecture and weights of `self`, tagged with the current epoch of `state`.
 function save_checkpoint(self::FeedForward, prefix::AbstractString, state::OptimizationState)
   epoch = state.curr_epoch
   return save_checkpoint(self.arch, self.arg_params, self.aux_params, prefix, epoch)
 end

 # Write a checkpoint: the symbol goes to "<prefix>-symbol.json" and all parameters go
 # to "<prefix>-<epoch, 4 digits>.params". Parameter names are stored with an "arg:"
 # prefix and auxiliary state names with an "aux:" prefix.
 function save_checkpoint(sym::SymbolicNode, arg_params::Dict{Symbol},
                          aux_params::Dict{Symbol}, prefix::AbstractString, epoch::Int)
   save("$prefix-symbol.json", sym)
   save_dict = Dict{Symbol,NDArray}(
     Symbol("arg:$(x[1])") => x[2] for x in arg_params
   )
   # `map` is not defined on dictionaries, so the aux entries are added one by one
   for (name, value) in aux_params
     save_dict[Symbol("aux:$name")] = value
   end
   save_filename = format("{1}-{2:04d}.params", prefix, epoch)
   save(save_filename, save_dict)
   @info("Saved checkpoint to '$save_filename'")
 end

 # Load the symbol from "<prefix>-symbol.json" and the parameters from
 # "<prefix>-<epoch, 4 digits>.params". Returns `(arch, arg_params, aux_params)`;
 # entries stored as "arg:<name>" go to `arg_params`, all others to `aux_params`.
 function load_checkpoint(prefix::AbstractString, epoch::Int)
   arch       = load("$prefix-symbol.json", SymbolicNode)
   saved_dict = load(format("{1}-{2:04d}.params", prefix, epoch), NDArray)
   arg_params = Dict{Symbol,Any}()
   aux_params = Dict{Symbol,Any}()
   for (k,v) in saved_dict
     # split at the first ':' only, so names that contain ':' themselves survive
     tp, name = split(string(k), ':', limit = 2)
     name = Symbol(name)
     if tp == "arg"
       arg_params[name] = v
     else
       aux_params[name] = v
     end
   end

   return (arch, arg_params, aux_params)
 end

 """
     load_checkpoint(prefix, epoch, ::mx.FeedForward; context)

 Load a mx.FeedForward model from the checkpoint *prefix*, *epoch* and optionally provide a context.

 When `context` is not given (or is `nothing`), the model is created on the default
 context of the `FeedForward` constructor. Returns the model with `arg_params` and
 `aux_params` set from the checkpoint.
 """
 function load_checkpoint(prefix::AbstractString, epoch::Int, ::Type{FeedForward}; context = nothing)
   arch, arg_params, aux_params = load_checkpoint(prefix, epoch)
   # the constructor keyword does not accept `nothing`, so only forward a real context
   model = context === nothing ? FeedForward(arch) : FeedForward(arch, context = context)
   model.arg_params = arg_params
   model.aux_params = aux_params
   return model
 end

 # Load the weights of a checkpoint into an existing model and return the model.
 # Existing weights are kept when `overwrite` is false; unless `allow_different_arch`
 # is set, the checkpoint architecture must serialize to the same JSON as `self.arch`.
 function load_checkpoint(self::FeedForward, prefix::AbstractString, epoch::Int;
                          overwrite::Bool = true, allow_different_arch::Bool = false)
   has_weights = isdefined(self, :arg_params) && isdefined(self, :aux_params)
   if has_weights && !overwrite
     @info("model weights already exists, skip loading... (call with overwrite=true if needed)")
     return self
   end

   arch, arg_params, aux_params = load_checkpoint(prefix, epoch)
   # TODO: is there better way to compare two symbols
   allow_different_arch ||
     @assert(to_json(self.arch) == to_json(arch), "Cannot load from a checkpoint with different network architecture")

   self.arg_params = arg_params
   self.aux_params = aux_params
   return self
 end