CLIMATE-658 - Restructure evaluation results

- the evaluation results are now provided as a list of masked arrays (one per metric, with the target datasets stacked along the first axis); see the sketch below
- update _run_no_subregion_evaluation, _run_unary_metric_evaluation, and the subregion variants in evaluation.py; add the convert_evaluation_result and convert_unary_evaluation_result helpers
- update test_evaluation.py for the new result shapes
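
For illustration, a minimal sketch with hypothetical data (not part of this patch) of the non-subregion layout that convert_evaluation_result produces; the names raw, stacked, and the 3-target / 2-metric sizes are made up for the example:

    import numpy.ma as ma

    # Hypothetical raw output: 3 target datasets x 2 metrics, each metric
    # returning a 2x2 masked field (stand-in for metric.run(ref, target)).
    raw = [[ma.ones((2, 2)) * imodel for _ in range(2)] for imodel in range(3)]

    # Restructured form: one masked array per metric, with the targets
    # stacked along axis 0, so results[imetric].shape == (3, 2, 2).
    results = []
    for imetric in range(2):
        stacked = ma.zeros((3, 2, 2))
        for imodel in range(3):
            stacked[imodel, :] = raw[imodel][imetric]
        results.append(stacked)

    print(len(results), results[0].shape)   # 2 (3, 2, 2)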
diff --git a/examples/knmi_to_cru31_full_bias.py b/examples/knmi_to_cru31_full_bias.py
index c6dac47..a241442 100644
--- a/examples/knmi_to_cru31_full_bias.py
+++ b/examples/knmi_to_cru31_full_bias.py
@@ -28,7 +28,6 @@
 import ocw.evaluation as evaluation
 import ocw.metrics as metrics
 import ocw.plotter as plotter
-
 # File URL leader
 FILE_LEADER = "http://zipper.jpl.nasa.gov/dist/"
 # This way we can easily adjust the time span of the retrievals
diff --git a/ocw/evaluation.py b/ocw/evaluation.py
index 89e34b4..ff9ad33 100644
--- a/ocw/evaluation.py
+++ b/ocw/evaluation.py
@@ -25,6 +25,8 @@
 from dataset import Dataset, Bounds
 import ocw.dataset_processor as DSP
 
+import numpy.ma as ma
+
 logger = logging.getLogger(__name__)
 
 class Evaluation(object):
@@ -285,7 +287,7 @@
 
                     run_result = metric.run(new_ref, new_tar)
                     results[-1][-1].append(run_result)
-        return results
+        return convert_evaluation_result(results, subregion=True)
 
     def _run_no_subregion_evaluation(self):
         results = []
@@ -294,7 +296,7 @@
             for metric in self.metrics:
                 run_result = metric.run(self.ref_dataset, target)
                 results[-1].append(run_result)
-        return results
+        return convert_evaluation_result(results)
 
     def _run_unary_metric_evaluation(self):
         unary_results = []
@@ -306,7 +308,7 @@
 
             for target in self.target_datasets:
                 unary_results[-1].append(metric.run(target))
-        return unary_results
+        return convert_unary_evaluation_result(unary_results)
 
     def _run_subregion_unary_evaluation(self):
         unary_results = []
@@ -330,7 +332,7 @@
                 for t in range(len(self.target_datasets)):
                     unary_results[-1][-1].append(metric.run(new_targets[t][i]))
 
-        return unary_results
+        return convert_unary_evaluation_result(unary_results, subregion=True)
 
     def __str__(self):
         formatted_repr = (
@@ -349,3 +351,65 @@
             str(self.subregions)
         )
 
+def convert_evaluation_result(evaluation_result, subregion = False):
+    if not subregion:
+        nmodel = len(evaluation_result)
+        nmetric = len(evaluation_result[0])
+        results = [] 
+        for imetric in range(nmetric):
+            result_shape = list(evaluation_result[0][imetric].shape)
+            result_shape.insert(0, nmodel)
+            result = ma.zeros(result_shape)
+            for imodel in range(nmodel):
+                result[imodel,:] = evaluation_result[imodel][imetric]
+            results.append(result)
+        return results
+    else:
+        nmodel = len(evaluation_result)
+        nmetric = len(evaluation_result[0])
+        nsubregion = len(evaluation_result[0][0])
+
+        results = []
+        for isubregion in range(nsubregion):
+            subregion_results = []
+            for imetric in range(nmetric):
+                result_shape = list(evaluation_result[0][imetric][isubregion].shape)
+                result_shape.insert(0, nmodel)
+                result = ma.zeros(result_shape)
+                for imodel in range(nmodel):
+                    result[imodel,:] = evaluation_result[imodel][imetric][isubregion]
+                subregion_results.append(result)
+            results.append(subregion_results)
+        return results
+             
+def convert_unary_evaluation_result(evaluation_result, subregion = False):
+    if not subregion:
+        nmetric = len(evaluation_result)
+        nmodel = len(evaluation_result[0])
+        results = []
+        for imetric in range(nmetric):
+            result_shape = list(evaluation_result[imetric][0].shape)
+            result_shape.insert(0, nmodel)
+            result = ma.zeros(result_shape)
+            for imodel in range(nmodel):
+                result[imodel,:] = evaluation_result[imetric][imodel]
+            results.append(result)
+        return results
+    else:
+        nmetric = len(evaluation_result)
+        nsubregion = len(evaluation_result[0])
+        nmodel = len(evaluation_result[0][0])
+
+        results = []
+        for isubregion in range(nsubregion):
+            subregion_results = []
+            for imetric in range(nmetric):
+                result_shape = list(evaluation_result[imetric][isubregion][0].shape)
+                result_shape.insert(0, nmodel)
+                result = ma.zeros(result_shape)
+                for imodel in range(nmodel):
+                    result[imodel,:] = evaluation_result[imetric][isubregion][imodel]
+                subregion_results.append(result)
+            results.append(subregion_results)
+        return results
+
diff --git a/ocw/tests/test_evaluation.py b/ocw/tests/test_evaluation.py
index 4a02714..2d48e72 100644
--- a/ocw/tests/test_evaluation.py
+++ b/ocw/tests/test_evaluation.py
@@ -128,40 +128,29 @@
     def test_result_shape(self):
         bias_eval = Evaluation(
             self.test_dataset,
-            [self.another_test_dataset, self.another_test_dataset],
-            [Bias()]
+            [self.another_test_dataset, self.another_test_dataset, self.another_test_dataset],
+            [Bias(), Bias()]
         )
         bias_eval.run()
 
         # Expected result shape is
-        # [
-        #   [
-        #       bias.run(reference, target1)
-        #   ],
-        #   [
-        #       bias.run(reference, target2)
-        #   ]
-        # ]
+        # [bias, bias] where bias.shape[0] = number of target datasets
         self.assertTrue(len(bias_eval.results) == 2)
-        self.assertTrue(len(bias_eval.results[0]) == 1)
-        self.assertTrue(len(bias_eval.results[1]) == 1)
+        self.assertTrue(bias_eval.results[0].shape[0] == 3)
 
     def test_unary_result_shape(self):
         new_eval = Evaluation(
             self.test_dataset,
-            [self.another_test_dataset, self.another_test_dataset],
+            [self.another_test_dataset, self.another_test_dataset, self.another_test_dataset, self.another_test_dataset],
             [TemporalStdDev()]
         )
         new_eval.run()
 
         # Expected result shape is
-        # [
-        #   temporalstddev.run(reference),
-        #   temporalstddev.run(target1),
-        #   temporalstddev.run(target2)
-        # ]
+        # [stddev] where stddev.shape[0] = number of datasets (reference + targets)
+        
         self.assertTrue(len(new_eval.unary_results) == 1)
-        self.assertTrue(len(new_eval.unary_results[0]) == 3)
+        self.assertTrue(new_eval.unary_results[0].shape[0] == 5)
 
     def test_subregion_result_shape(self):
         bound = Bounds(
@@ -179,26 +168,14 @@
 
         # Expected result shape is
         # [
-        #   [
         #       [   # Subregions cause this extra layer
-        #           bias.run(reference, target1)
+        #           masked array of shape (number of targets,) + bias.run(reference, target1).shape
         #       ]
         #   ],
-        #   [
-        #       [
-        #           bias.run(reference, target2)
-        #       ]
-        #   ]
-        # ]
-        self.assertTrue(len(bias_eval.results) == 2)
-
+        self.assertTrue(len(bias_eval.results) == 1)
         self.assertTrue(len(bias_eval.results[0]) == 1)
-        self.assertTrue(type(bias_eval.results[0]) == type([]))
-        self.assertTrue(len(bias_eval.results[1]) == 1)
-        self.assertTrue(type(bias_eval.results[1]) == type([]))
-
-        self.assertTrue(len(bias_eval.results[0][0]) == 1)
-        self.assertTrue(len(bias_eval.results[1][0]) == 1)
+        self.assertTrue(bias_eval.results[0][0].shape[0] == 2)
+        self.assertTrue(type(bias_eval.results) == type([]))
 
     def test_subregion_unary_result_shape(self):
         bound = Bounds(
@@ -209,27 +186,21 @@
         new_eval = Evaluation(
             self.test_dataset,
             [self.another_test_dataset, self.another_test_dataset],
-            [TemporalStdDev()],
-            [bound]
+            [TemporalStdDev(), TemporalStdDev()],
+            [bound, bound, bound, bound, bound]
         )
         new_eval.run()
 
         # Expected result shape is
         # [
-        #   [   
         #       [   # Subregions cause this extra layer
-        #           temporalstddev.run(reference),
-        #           temporalstddev.run(target1),
-        #           temporalstddev.run(target2)
+        #           masked array of shape (3,) + temporalstddev.run(reference).shape
         #       ]
-        #   ]
         # ]
-        self.assertTrue(len(new_eval.unary_results) == 1)
+        self.assertTrue(len(new_eval.unary_results) == 5)  # number of subregions
+        self.assertTrue(len(new_eval.unary_results[0]) == 2) # number of metrics
         self.assertTrue(type(new_eval.unary_results) == type([]))
-
-        self.assertTrue(len(new_eval.unary_results[0]) == 1)
-
-        self.assertTrue(len(new_eval.unary_results[0][0]) == 3)
+        self.assertTrue(new_eval.unary_results[0][0].shape[0] == 3) # number of datasets (reference + targets)
 
 
 if __name__  == '__main__':