CLIMATE-658 - Restructure evaluation results

- Evaluation results are now provided as a list of masked arrays.
- Update _run_no_subregion_evaluation and _run_unary_metric_evaluation in evaluation.py.
- Update test_evaluation.
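
An illustrative sketch (not part of this patch) of how a caller might consume the
restructured results. Evaluation, run(), and results come from ocw.evaluation; the
input file paths and the exact nesting of results below are assumptions made for
illustration only:

    from ocw.data_source.local import load_file
    from ocw.evaluation import Evaluation
    import ocw.metrics as metrics

    # Hypothetical input files and variable name, for illustration only
    ref_dataset = load_file('/data/obs.nc', 'tas')
    model_dataset = load_file('/data/model.nc', 'tas')

    evaluation = Evaluation(ref_dataset, [model_dataset],
                            [metrics.Bias(), metrics.SpatialPatternTaylorDiagram()])
    evaluation.run()

    # Each leaf of evaluation.results is now a numpy masked array;
    # nesting by target dataset and then by metric is assumed here.
    for target_results in evaluation.results:
        for metric_result in target_results:
            print(type(metric_result))
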
diff --git a/ocw/data_source/local.py b/ocw/data_source/local.py
index 474dffb..c0d4b07 100644
--- a/ocw/data_source/local.py
+++ b/ocw/data_source/local.py
@@ -21,6 +21,7 @@
 from glob import glob
 import re
 import string
+import os
 
 from ocw.dataset import Dataset
 import ocw.utils as utils
@@ -268,3 +269,68 @@
 
     return Dataset(lats, lons, times, values, variable=variable_name,
                    units=variable_unit, name=name, origin=origin)
+
+def load_multiple_files(file_path,
+                        filename_pattern,
+                        variable_name,
+                        dataset_name='ref',
+                        variable_unit=None,
+                        lat_name=None,
+                        lon_name=None,
+                        time_name=None):
+    ''' Load multiple NetCDF files matching common filename patterns and return a list of OCW Dataset objects.
+
+    :param file_path: directory name where the NetCDF files to load are stored.
+    :type file_path: :mod:`string`
+    :param filename_pattern: A list of common filename patterns (shell globs).
+    :type filename_pattern: :class:`list` of :mod:`string`
+    :param variable_name: The variable name to load from the NetCDF files.
+    :type variable_name: :mod:`string`
+    :param dataset_name: (Optional) The dataset name to use when a single file is read.
+    :type dataset_name: :mod:`string`
+    :param variable_unit: (Optional) The variable unit to load from the NetCDF files.
+    :type variable_unit: :mod:`string`
+    :param lat_name: (Optional) The latitude variable name to extract from the
+        dataset.
+    :type lat_name: :mod:`string`
+    :param lon_name: (Optional) The longitude variable name to extract from the
+        dataset.
+    :type lon_name: :mod:`string`
+    :param time_name: (Optional) The time variable name to extract from the
+        dataset.
+    :type time_name: :mod:`string`
+    :returns: A list of OCW Dataset objects.
+    :rtype: :class:`list` of :class:`dataset.Dataset`
+    '''
+
+    data_filenames = []
+    for pattern in filename_pattern:
+        data_filenames.extend(glob(os.path.join(file_path, pattern)))
+    data_filenames.sort()
+
+    # number of files
+    ndata = len(data_filenames)
+    if ndata == 1:
+        data_name = [dataset_name]
+    else:
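+        # Derive a unique short name for each dataset by stripping the common
+        # filename prefix and suffix shared by all matched paths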
+        data_name = []
+        data_filenames_reversed = []
+        for element in data_filenames:
+            data_filenames_reversed.append(element[::-1])
+        prefix = os.path.commonprefix(data_filenames)
+        postfix = os.path.commonprefix(data_filenames_reversed)[::-1]
+        for element in data_filenames:
+            data_name.append(element.replace(prefix, '').replace(postfix, ''))
+
+    datasets = []
+    for ifile, filename in enumerate(data_filenames):
+        datasets.append(load_file(filename, variable_name, variable_unit, name=data_name[ifile],
+                        lat_name=lat_name, lon_name=lon_name, time_name=time_name))
+
+    return datasets
diff --git a/ocw/dataset_processor.py b/ocw/dataset_processor.py
index acd2ad4..d7b1a3f 100755
--- a/ocw/dataset_processor.py
+++ b/ocw/dataset_processor.py
@@ -500,10 +500,10 @@
     :returns: A Dataset with values converted to new units.
     :rtype: :class:`dataset.Dataset`
     '''
-    waterFluxVariables = ['pr', 'evspsbl', 'mrro', 'swe']
+    water_flux_variables = ['pr', 'prec', 'evspsbl', 'mrro', 'swe']
     variable = dataset.variable.lower()
 
-    if any(subString in variable for subString in waterFluxVariables):
+    if any(sub_string in variable for sub_string in water_flux_variables):
         dataset_units = dataset.units.lower()
         if variable in 'swe':
             if any(unit in dataset_units for unit in ['m', 'meter']):
@@ -516,6 +516,29 @@
                 dataset.units = 'mm/day'
 
     return dataset
+
+def temperature_unit_conversion(dataset):
+    ''' Convert temperature units as necessary
+
+    Automatically convert Celsius to Kelvin in the given dataset.
+
+    :param dataset: The dataset for which units should be updated.
+    :type dataset: :class:`dataset.Dataset`
+
+    :returns: The dataset with (potentially) updated units.
+    :rtype: :class:`dataset.Dataset`
+    '''
+    temperature_variables = ['temp', 'tas', 'tasmax', 'tasmin', 'T']
+    variable = dataset.variable.lower()
+
+    if any(sub_string in variable for sub_string in temperature_variables):
+        dataset_units = dataset.units.lower()
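+        # Only a bare 'C'/'c' unit string is treated as degrees Celsius here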
+        if dataset_units == 'c':
+            dataset.values = 273.15 + dataset.values
+            dataset.units = 'K'
+
+    return dataset
+
 def variable_unit_conversion(dataset):
     ''' Convert water flux or temperature variables units as necessary
     
@@ -529,28 +552,9 @@
     :rtype: :class:`dataset.Dataset`
     '''
 
-    water_flux_variables = ['pr', 'prec','evspsbl', 'mrro', 'swe']
-    temperature_variables = ['temp','tas','tasmax','taxmin','T']
-    variable = dataset.variable.lower()
-
-    if any(subString in variable for subString in water_flux_variables):
-        dataset_units = dataset.units.lower()
-        if variable in 'swe':
-            if any(unit in dataset_units for unit in ['m', 'meter']):
-                dataset.values = 1.e3 * dataset.values
-                dataset.units = 'km'
-        else:
-            if any(unit in dataset_units
-                for unit in ['kg m-2 s-1', 'mm s-1', 'mm/sec']):
-                    dataset.values = 86400. * dataset.values
-                    dataset.units = 'mm/day'
-
-    if any(subString in variable for subString in temperature_variables):
-        dataset_units = dataset.units.lower()
-        if dataset_units == 'c':
-            dataset.values = 273.15+dataset.values
-            dataset.units = 'K'
-
+    dataset = water_flux_unit_conversion(dataset)
+    dataset = temperature_unit_conversion(dataset)
+
     return dataset
 
 def _rcmes_normalize_datetimes(datetimes, timestep):
diff --git a/ocw/metrics.py b/ocw/metrics.py
index 9b9aad6..62f4e3d 100644
--- a/ocw/metrics.py
+++ b/ocw/metrics.py
@@ -87,7 +87,28 @@
         :returns: The difference between the reference and target datasets.
         :rtype: :class:`numpy.ndarray`
         '''
-        return target_dataset.values - ref_dataset.values  
+        return calc_bias(target_dataset.values, ref_dataset.values)
+
+class SpatialPatternTaylorDiagram(BinaryMetric):
+    ''' Calculate the target-to-reference ratio of the spatial standard deviations
+    and the spatial pattern correlation.'''
+
+    def run(self, ref_dataset, target_dataset):
+        '''Calculate the two statistics used to plot a Taylor diagram comparing spatial patterns.
+
+        .. note::
+           Overrides BinaryMetric.run()
+
+        :param ref_dataset: The reference dataset to use in this metric run.
+        :type ref_dataset: :class:`dataset.Dataset`
+
+        :param target_dataset: The target dataset to evaluate against the
+            reference dataset in this metric run.
+        :type target_dataset: :class:`dataset.Dataset`
+
+        :returns: The standard deviation ratio and the pattern correlation coefficient.
+        :rtype: :class:`numpy.ma.core.MaskedArray` of two floats
+        '''
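+        # Two values: std(target)/std(reference) and the spatial pattern
+        # correlation, packed into one masked array for Taylor diagram plotting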
+        return ma.array([calc_stddev_ratio(target_dataset.values, ref_dataset.values),
+                         calc_correlation(target_dataset.values, ref_dataset.values)])
 
 
 class TemporalStdDev(UnaryMetric):
@@ -106,7 +127,7 @@
         :returns: The temporal standard deviation of the target dataset
         :rtype: :class:`ndarray`
         '''
-        return ma.std(target_dataset.values, axis=0, ddof=1)
+        return calc_stddev(target_dataset.values, axis=0)
 
 
 class StdDevRatio(BinaryMetric):
@@ -127,7 +148,8 @@
 
         :returns: The standard deviation ratio of the reference and target
         '''
-        return ma.std(target_dataset.values)/ma.std(ref_dataset.values)
+        return calc_stddev_ratio(target_dataset.values, ref_dataset.values)
 
 
 class PatternCorrelation(BinaryMetric):
@@ -151,7 +173,8 @@
         # stats.pearsonr returns correlation_coefficient, 2-tailed p-value
         # We only care about the correlation coefficient
         # Docs at http://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.pearsonr.html
-        return mstats.pearsonr(ref_dataset.values.flatten(), target_dataset.values.flatten())[0]
+
+        return calc_correlation(target_dataset.values, ref_dataset.values)
 
 
 class TemporalCorrelation(BinaryMetric):
@@ -179,23 +202,18 @@
         '''
         num_times, num_lats, num_lons = reference_dataset.values.shape
         coefficients = ma.zeros([num_lats, num_lons])
-        levels = ma.zeros([num_lats, num_lons])
         for i in numpy.arange(num_lats):
             for j in numpy.arange(num_lons):
-                coefficients[i, j], levels[i, j] = (
-                    mstats.pearsonr(
-                        reference_dataset.values[:, i, j],
-                        target_dataset.values[:, i, j]
-                    )
-                )
-                levels[i, j] = 1 - levels[i, j]
-        return coefficients, levels 
+                coefficients[i, j] = calc_correlation(
+                        target_dataset.values[:, i, j],
+                        reference_dataset.values[:, i, j])
+        return coefficients
 
 
 class TemporalMeanBias(BinaryMetric):
     '''Calculate the bias averaged over time.'''
 
-    def run(self, ref_dataset, target_dataset, absolute=False):
+    def run(self, ref_dataset, target_dataset):
         '''Calculate the bias averaged over time.
 
         .. note::
@@ -211,37 +229,7 @@
         :returns: The mean bias between a reference and target dataset over time.
         '''
 
-        diff = target_dataset.values - ref_dataset.values 
-        if absolute:
-            diff = abs(diff)
-        mean_bias = ma.mean(diff, axis=0)
-
-        return mean_bias
-
-
-class SpatialMeanOfTemporalMeanBias(BinaryMetric):
-    '''Calculate the bias averaged over time and domain.'''
-
-    def run(self, reference_dataset, target_dataset):
-        '''Calculate the bias averaged over time and domain.
-
-        .. note::
-           Overrides BinaryMetric.run()
-
-        :param reference_dataset: The reference dataset to use in this metric
-            run
-        :type reference_dataset: :class:`dataset.Dataset`
-
-        :param target_dataset: The target dataset to evaluate against the
-            reference dataset in this metric run
-        :type target_dataset: :class:`dataset.Dataset`
-
-        :returns: The bias averaged over time and domain
-        '''
-
-        bias = target_dataset.values - reference_dataset.values 
-        return ma.mean(bias)
-
+        return calc_bias(target_dataset.values, ref_dataset.values, average_over_time=True)
+
 
 class RMSError(BinaryMetric):
     '''Calculate the Root Mean Square Difference (RMS Error), with the mean
@@ -265,6 +253,96 @@
         :returns: The RMS error, with the mean calculated over time and space
         '''
 
-        sqdiff = (reference_dataset.values - target_dataset.values) ** 2
-        return (ma.mean(sqdiff))**0.5
+        return calc_rmse(target_dataset.values, reference_dataset.values)
 
+def calc_bias(target_array, reference_array, average_over_time=False):
+    ''' Calculate the difference between two arrays.
+
+    :param target_array: The array to be evaluated, e.g. model output.
+    :type target_array: :class:`numpy.ma.core.MaskedArray`
+
+    :param reference_array: The reference array, e.g. observations.
+    :type reference_array: :class:`numpy.ma.core.MaskedArray`
+
+    :param average_over_time: If True, the calculated bias is averaged along axis=0 (time).
+    :type average_over_time: :class:`bool`
+
+    :returns: The bias of the target array relative to the reference array.
+    :rtype: :class:`numpy.ma.core.MaskedArray`
+    '''
+
+    bias = target_array - reference_array
+    if average_over_time:
+        return ma.average(bias, axis=0)
+    else:
+        return bias
+
+def calc_stddev(array, axis=None):
+    ''' Calculate the sample standard deviation of an array along the given axis.
+
+    :param array: The array for which the sample standard deviation is calculated.
+    :type array: :class:`numpy.ma.core.MaskedArray`
+
+    :param axis: (Optional) The axis along which the sample standard deviation is computed.
+    :type axis: :class:`int`
+
+    :returns: The sample standard deviation of the array.
+    :rtype: :class:`numpy.ma.core.MaskedArray`
+    '''
+
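+    # ddof=1 yields the sample (N-1 denominator) standard deviation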
+    if isinstance(axis, int):
+        return ma.std(array, axis=axis, ddof=1)
+    else:
+        return ma.std(array, ddof=1)
+
+
+def calc_stddev_ratio(target_array, reference_array):
+    ''' Calculate the ratio of the standard deviations of two arrays.
+
+    :param target_array: The array to be evaluated, e.g. model output.
+    :type target_array: :class:`numpy.ma.core.MaskedArray`
+
+    :param reference_array: The reference array, e.g. observations.
+    :type reference_array: :class:`numpy.ma.core.MaskedArray`
+
+    :returns: (standard deviation of target_array) / (standard deviation of reference_array).
+    :rtype: :class:`float`
+    '''
+
+    return calc_stddev(target_array)/calc_stddev(reference_array)
+
+def calc_correlation(target_array, reference_array):
+    '''Calculate the correlation coefficient between two arrays.
+
+    :param target_array: The array to be evaluated, e.g. model output.
+    :type target_array: :class:`numpy.ma.core.MaskedArray`
+
+    :param reference_array: The reference array, e.g. observations.
+    :type reference_array: :class:`numpy.ma.core.MaskedArray`
+
+    :returns: Pearson's correlation coefficient between the two input arrays.
+    :rtype: :class:`float`
+    '''
+
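+    # pearsonr expects 1-D input, so both arrays are flattened first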
+    return mstats.pearsonr(reference_array.flatten(), target_array.flatten())[0]
+       
+def calc_rmse(target_array, reference_array):
+    ''' Calculate the root mean square error (RMSE) between two arrays.
+
+    :param target_array: The array to be evaluated, e.g. model output.
+    :type target_array: :class:`numpy.ma.core.MaskedArray`
+
+    :param reference_array: The reference array, e.g. observations.
+    :type reference_array: :class:`numpy.ma.core.MaskedArray`
+
+    :returns: The root mean square error.
+    :rtype: :class:`float`
+    '''
+
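+    # RMSE: the square root of the mean of the squared differences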
+    return (ma.mean((calc_bias(target_array, reference_array))**2))**0.5
diff --git a/ocw/tests/test_metrics.py b/ocw/tests/test_metrics.py
index 8edeaff..facf1d3 100644
--- a/ocw/tests/test_metrics.py
+++ b/ocw/tests/test_metrics.py
@@ -24,6 +24,7 @@
 import ocw.metrics as metrics
 
 import numpy as np
+import numpy.ma as ma
 import numpy.testing as npt
 
 class TestBias(unittest.TestCase):
@@ -56,6 +57,30 @@
         expected_result.fill(-300)
         np.testing.assert_array_equal(self.bias.run(self.target_dataset, self.reference_dataset), expected_result)
 
+class TestSpatialPatternTaylorDiagram(unittest.TestCase):
+    '''Test the metrics.SpatialPatternTaylorDiagram metric.'''
+    def setUp(self):
+        self.taylor_diagram = metrics.SpatialPatternTaylorDiagram()
+        self.ref_dataset = Dataset(
+            np.array([1., 1., 1., 1., 1.]),
+            np.array([1., 1., 1., 1., 1.]),
+            np.array([dt.datetime(2000, x, 1) for x in range(1, 13)]),
+            # Reshaped array of 300 values in steps of 5
+            np.arange(0, 1500, 5).reshape(12, 5, 5),
+            'ds1'
+        )
+
+        self.tar_dataset = Dataset(
+            np.array([1., 1., 1., 1., 1.]),
+            np.array([1., 1., 1., 1., 1.]),
+            np.array([dt.datetime(2000, x, 1) for x in range(1, 13)]),
+            # Reshaped array of 300 values in steps of 2
+            np.arange(0, 600, 2).reshape(12, 5, 5),
+            'ds2'
+        )
+
+    def test_function_run(self):
+        np.testing.assert_array_equal(
+            self.taylor_diagram.run(self.ref_dataset, self.tar_dataset),
+            ma.array([0.4, 1.0]))
+
 
 class TestTemporalStdDev(unittest.TestCase):
     '''Test the metrics.TemporalStdDev metric.'''
@@ -102,7 +127,7 @@
         )
 
     def test_function_run(self):
-        self.assertTrue(self.std_dev_ratio.run(self.ref_dataset, self.tar_dataset), 2.5)
+        self.assertTrue(self.std_dev_ratio.run(self.ref_dataset, self.tar_dataset), 0.4)
 
 
 class TestPatternCorrelation(unittest.TestCase):
@@ -161,22 +186,18 @@
 
     def test_identical_inputs(self):
         expected = np.ones(25).reshape(5, 5)
-        tc, cl = self.metric.run(self.ref_dataset, self.ref_dataset)
+        tc = self.metric.run(self.ref_dataset, self.ref_dataset)
         np.testing.assert_array_equal(tc, expected)
-        np.testing.assert_array_equal(cl, expected)
 
     def test_positive_correlation(self):
         expected = np.ones(25).reshape(5, 5)
-        tc, cl = self.metric.run(self.ref_dataset, self.tgt_dataset_inc)
+        tc = self.metric.run(self.ref_dataset, self.tgt_dataset_inc)
         np.testing.assert_array_equal(tc, expected)
-        np.testing.assert_array_equal(cl, expected)
 
     def test_negative_correlation(self):
         expected_tc = np.array([-1] * 25).reshape(5, 5)
-        expected_cl = np.ones(25).reshape(5, 5)
-        tc, cl = self.metric.run(self.ref_dataset, self.tgt_dataset_dec)
+        tc = self.metric.run(self.ref_dataset, self.tgt_dataset_dec)
         np.testing.assert_array_equal(tc, expected_tc)
-        np.testing.assert_array_equal(cl, expected_cl)
 
 
 class TestTemporalMeanBias(unittest.TestCase):
@@ -208,42 +229,6 @@
         expected_result.fill(-300)
         np.testing.assert_array_equal(self.mean_bias.run(self.target_dataset,self.reference_dataset), expected_result)
 
-    def test_function_run_abs(self):
-        '''Test mean bias function between reference dataset and target dataset with abs as True.'''
-        expected_result = np.zeros((5, 5), dtype=np.int)
-        expected_result.fill(300)
-        np.testing.assert_array_equal(self.mean_bias.run(self.reference_dataset, self.target_dataset, True), expected_result)
-
-
-class TestSpatialMeanOfTemporalMeanBias(unittest.TestCase):
-    '''Test the metrics.SpatialMeanOfTemporalMeanBias metric.'''
-    def setUp(self):
-        # Set metric.
-        self.metric = metrics.SpatialMeanOfTemporalMeanBias()
-        # Initialize reference dataset.
-        self.ref_lats = np.array([10, 20, 30, 40, 50])
-        self.ref_lons = np.array([5, 15, 25, 35, 45])
-        self.ref_times = np.array([dt.datetime(2000, x, 1)
-                                   for x in range(1, 13)])
-        self.ref_values = np.array(range(300)).reshape(12, 5, 5)
-        self.ref_variable = "ref"
-        self.ref_dataset = Dataset(self.ref_lats, self.ref_lons,
-            self.ref_times, self.ref_values, self.ref_variable)
-        # Initialize target dataset.
-        self.tgt_lats = np.array([10, 20, 30, 40, 50])
-        self.tgt_lons = np.array([5, 15, 25, 35, 45])
-        self.tgt_times = np.array([dt.datetime(2000, x, 1)
-                                   for x in range(1, 13)])
-        self.tgt_values = np.array(range(299, -1, -1)).reshape(12, 5, 5)
-        self.tgt_variable = "tgt"
-        self.tgt_dataset = Dataset(self.tgt_lats, self.tgt_lons,
-            self.tgt_times, self.tgt_values, self.tgt_variable)
-
-    def test_function_run(self):
-        result = self.metric.run(self.ref_dataset, self.tgt_dataset)
-        self.assertEqual(result, 0.0)
-
-
 class TestRMSError(unittest.TestCase):
     '''Test the metrics.RMSError metric.'''
     def setUp(self):