Correctly import Caffe BatchNorm (#6176)

* Correctly import Caffe BatchNorm

* Compensate for cudnn epsilon shift by changing the variance

cuDNN requires the BatchNorm variance eps to be greater than 1e-05 (CUDNN_BN_MIN_EPSILON). Before this commit, eps values were clipped to 1.1e-05, thus introducing a small numerical discrepancy in evaluation.

This discrepancy is avoided here by compensating for this shift in the actual variance value.

* Improved epsilon shift compensation and comments
diff --git a/tools/caffe_converter/convert_model.py b/tools/caffe_converter/convert_model.py
index 83b1c24..e612ea5 100644
--- a/tools/caffe_converter/convert_model.py
+++ b/tools/caffe_converter/convert_model.py
@@ -43,6 +43,7 @@
 
     layers, names = caffe_parser.read_caffemodel(prototxt_fname, caffemodel_fname)
     layer_iter = caffe_parser.layer_iter(layers, names)
+    layers_proto = caffe_parser.get_layers(caffe_parser.read_prototxt(prototxt_fname))
 
     for layer_name, layer_type, layer_blobs in layer_iter:
         if layer_type == 'Convolution' or layer_type == 'InnerProduct' or layer_type == 4 or layer_type == 14 \
@@ -120,18 +121,26 @@
             bn_name = layer_name
             mean = layer_blobs[0].data
             var = layer_blobs[1].data
-            moving_average_factor = layer_blobs[2].data
+            rescale_factor = layer_blobs[2].data
+            if rescale_factor != 0:
+                rescale_factor = 1 / rescale_factor
             mean_name = '{}_moving_mean'.format(bn_name)
             var_name = '{}_moving_var'.format(bn_name)
-            maf_name = '{}_momentum'.format(bn_name)
             mean = mean.reshape(aux_shape_dic[mean_name])
             var = var.reshape(aux_shape_dic[var_name])
             aux_params[mean_name] = mx.nd.zeros(mean.shape)
             aux_params[var_name] = mx.nd.zeros(var.shape)
-            arg_params[maf_name] = mx.nd.zeros(moving_average_factor.shape)
-            aux_params[mean_name][:] = mean
-            aux_params[var_name][:] = var
-            arg_params[maf_name][:] = moving_average_factor
+            # Get the original epsilon
+            for idx, layer in enumerate(layers_proto):
+                if layer.name == bn_name:
+                    bn_index = idx
+            eps_caffe = layers_proto[bn_index].batch_norm_param.eps
+            # Compensate for the epsilon shift performed in convert_symbol
+            eps_symbol = float( sym.attr_dict()[bn_name + '_moving_mean']['eps'] )
+            eps_correction = eps_caffe - eps_symbol
+            # Fill parameters
+            aux_params[mean_name][:] = mean * rescale_factor
+            aux_params[var_name][:] = var * rescale_factor + eps_correction
             assert var.flags['C_CONTIGUOUS'] is True
             assert mean.flags['C_CONTIGUOUS'] is True
             print ('converting batchnorm layer, mean shape = {}, var shape = {}'.format(mean.shape, var.shape))
diff --git a/tools/caffe_converter/convert_symbol.py b/tools/caffe_converter/convert_symbol.py
index db105b7..55808c3 100644
--- a/tools/caffe_converter/convert_symbol.py
+++ b/tools/caffe_converter/convert_symbol.py
@@ -155,7 +155,13 @@
         if layer[i].type == 'BatchNorm':
             type_string = 'mx.symbol.BatchNorm'
             param = layer[i].batch_norm_param
-            param_string = 'use_global_stats=%s, fix_gamma=False' % param.use_global_stats
+            # CuDNN requires eps to be greater than 1e-05
+            # We compensate for this change in convert_model
+            epsilon = param.eps
+            if(epsilon <= 1e-05):
+                epsilon = 1e-04
+            param_string = 'use_global_stats=%s, fix_gamma=False, eps=%f' % (
+                param.use_global_stats, epsilon)
             need_flatten[name] = need_flatten[mapping[layer[i].bottom[0]]]
         if layer[i].type == 'Scale':
             assert layer[i-1].type == 'BatchNorm'