Updates to DatasetLoader
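This change collapses DatasetLoader's reference/target split into one flat interface: the constructor now takes any number of loader-configuration dictionaries, each selecting its loader function through an optional 'loader_name' key ('local' by default, which now maps to local.load_multiple_files), and every loaded dataset lands in the single list loader.datasets. A minimal sketch of the new call pattern, reusing the RCMED ids from the docstring below with a hypothetical local file path and variable name:

    >>> from ocw.dataset_loader import DatasetLoader
    >>> obs = {'loader_name': 'rcmed', 'name': 'cru',
    ...        'dataset_id': 10, 'parameter_id': 34}
    >>> models = {'file_path': './data/model_precip.nc',  # hypothetical path
    ...           'variable_name': 'pr'}  # no 'loader_name', so 'local' is used
    >>> loader = DatasetLoader(obs, models)
    >>> loader.load_datasets()
    >>> all_datasets = loader.datasets  # one flat list; no reference/target split

Custom loaders still plug in through add_source_loader; see the sketch after the diff.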
diff --git a/RCMES/run_RCMES.py b/RCMES/run_RCMES.py
index cd69bc4..3cc2245 100644
--- a/RCMES/run_RCMES.py
+++ b/RCMES/run_RCMES.py
@@ -39,7 +39,7 @@
from metrics_and_plots import *
-import ssl
+import ssl
if hasattr(ssl, '_create_unverified_context'):
ssl._create_default_https_context = ssl._create_unverified_context
@@ -107,12 +107,13 @@
boundary_check_model = True
if 'GCM_data' in model_data_info.keys():
if model_data_info['GCM_data']:
- boundary_check_model = False
+ boundary_check_model = False
print 'Loading model datasets:\n',model_data_info
if model_data_info['data_source'] == 'local':
- model_datasets, model_names = local.load_multiple_files(file_path = model_data_info['path'],
- variable_name =model_data_info['variable'],
- lat_name=model_lat_name, lon_name=model_lon_name)
+ model_datasets = local.load_multiple_files(file_path=model_data_info['path'],
+ variable_name =model_data_info['variable'],
+ lat_name=model_lat_name, lon_name=model_lon_name)
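+ # load_multiple_files now returns only the Dataset list; names are taken from each Dataset below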
+ model_names = [dataset.name for dataset in model_datasets]
elif model_data_info['data_source'] == 'ESGF':
md = esgf.load_dataset(dataset_id=model_data_info['dataset_id'],
variable=model_data_info['variable'],
@@ -166,7 +167,7 @@
# generate grid points for regridding
if config['regrid']['regrid_on_reference']:
new_lat = ref_dataset.lats
- new_lon = ref_dataset.lons
+ new_lon = ref_dataset.lons
else:
delta_lat = config['regrid']['regrid_dlat']
delta_lon = config['regrid']['regrid_dlon']
@@ -178,7 +179,7 @@
# number of models
nmodel = len(model_datasets)
print 'Dataset loading completed'
-print 'Observation data:', ref_name
+print 'Observation data:', ref_name
print 'Number of model datasets:',nmodel
for model_name in model_names:
print model_name
@@ -200,7 +201,7 @@
ref_dataset = dsp.variable_unit_conversion(ref_dataset)
for idata,dataset in enumerate(model_datasets):
model_datasets[idata] = dsp.variable_unit_conversion(dataset)
-
+
print 'Generating multi-model ensemble'
if len(model_datasets) >= 2.:
@@ -217,8 +218,8 @@
print 'Calculating spatial averages and standard deviations of ',str(nsubregion),' subregions'
- ref_subregion_mean, ref_subregion_std, subregion_array = utils.calc_subregion_area_mean_and_std([ref_dataset], subregions)
- model_subregion_mean, model_subregion_std, subregion_array = utils.calc_subregion_area_mean_and_std(model_datasets, subregions)
+ ref_subregion_mean, ref_subregion_std, subregion_array = utils.calc_subregion_area_mean_and_std([ref_dataset], subregions)
+ model_subregion_mean, model_subregion_std, subregion_array = utils.calc_subregion_area_mean_and_std(model_datasets, subregions)
""" Step 7: Write a netCDF file """
workdir = config['workdir']
@@ -231,7 +232,7 @@
if config['use_subregions']:
dsp.write_netcdf_multiple_datasets_with_subregions(ref_dataset, ref_name, model_datasets, model_names,
path=workdir+config['output_netcdf_filename'],
- subregions=subregions, subregion_array = subregion_array,
+ subregions=subregions, subregion_array = subregion_array,
ref_subregion_mean=ref_subregion_mean, ref_subregion_std=ref_subregion_std,
model_subregion_mean=model_subregion_mean, model_subregion_std=model_subregion_std)
else:
@@ -279,5 +280,3 @@
file_name)
else:
print 'please check the currently supported metrics'
-
-
diff --git a/ocw/data_source/local.py b/ocw/data_source/local.py
index 35041ac..98de937 100644
--- a/ocw/data_source/local.py
+++ b/ocw/data_source/local.py
@@ -350,7 +350,7 @@
datasets.append(load_file(filename, variable_name, variable_unit, name=data_name[ifile],
lat_name=lat_name, lon_name=lon_name, time_name=time_name))
- return datasets, data_name
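+ # Per-file names are no longer returned; callers should use dataset.name on each element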
+ return datasets
def load_WRF_2d_files_RAIN(file_path=None,
filename_pattern=None,
diff --git a/ocw/dataset_loader.py b/ocw/dataset_loader.py
index 8ee1b93..3c8b95f 100644
--- a/ocw/dataset_loader.py
+++ b/ocw/dataset_loader.py
@@ -27,20 +27,19 @@
class DatasetLoader:
- '''Generate OCW Dataset objects from a variety of sources.'''
+ '''Generate a list of OCW Dataset objects from a variety of sources.'''
- def __init__(self, reference, targets):
- '''Generate OCW Dataset objects from a variety of sources.
+ def __init__(self, *loader_opts):
+ '''Generate a list of OCW Dataset objects from a variety of sources.
Each argument can be information for a dataset in dictionary
form. For example:
``
- >>> reference = {'data_source':'rcmed', 'name':'cru', 'dataset_id':10,
- 'parameter_id':34}
- >>> targets = {'data_source':'local_multiple',
- 'path':'./data/CORDEX-Africa_data/AFRICA*pr.nc',
- 'variable':'pr'}
- >>> loader = DatasetLoader(reference, targets)
+ >>> loader_opt1 = {'loader_name': 'rcmed', 'name': 'cru',
+ 'dataset_id': 10, 'parameter_id': 34}
+ >>> loader_opt2 = {'file_path': './data/TRMM_v7_3B43_1980-2010.nc',
+ 'variable_name': 'pcp'}
+ >>> loader = DatasetLoader(loader_opt1, loader_opt2)
``
Or more conveniently if the loader configuration is defined in a
@@ -48,67 +47,57 @@
``
>>> import yaml
>>> config = yaml.load(open(config_file))
- >>> loader = DatasetLoader(**config['datasets'])
+ >>> obs_loader_config = config['datasets']['reference']
+ >>> loader = DatasetLoader(*obs_loader_config)
``
- As shown in the first example, the dictionary for each keyword argument
- should contain a data source and parameters specific to the loader for
- that data source. Once the configuration is entered, the datasets may be
- loaded using:
+ As shown in the first example, the dictionary for each argument should
+ contain a loader name and parameters specific to the particular loader.
+ Once the configuration is entered, the datasets may be loaded using:
``
>>> loader.load_datasets()
- >>> target_datasets = loader.target_datasets
+ >>> obs_datasets = loader.datasets
``
- If ``reference`` is entered as a keyword argument, then it may be
- accesed from:
- ``
- >>> reference_dataset = loader.reference_dataset
- ``
-
- Additionally, each dataset must have a ``data_source`` keyword. This may
+ Additionally, each dataset configuration may include a ``loader_name`` key (defaulting to ``'local'``). This may
be one of the following:
- * ``'local'`` - A single dataset file in a local directory
+ * ``'local'`` - One or multiple dataset files in a local directory
* ``'local_split'`` - A single dataset split across multiple files in a
local directory
- * ``'local_multiple'`` - Multiple datasets in a local directory
* ``'esgf'`` - Download the dataset from the Earth System Grid
Federation
* ``'rcmed'`` - Download the dataset from the Regional Climate Model
Evaluation System Database
* ``'dap'`` - Download the dataset from an OPeNDAP URL
- Users who wish to download datasets from sources not described above
+ Users who wish to load datasets using a loader not described above
may define their own custom dataset loader function and incorporate it
as follows:
- >>> loader.add_source_loader('my_source_name', my_loader_func)
+ >>> loader.add_source_loader('my_loader_name', my_loader_func)
- :param reference: The reference dataset loader configuration.
- :type reference: :mod:`dict`
-
- :param targets: The target dataset loader configurations.
- :type targets: :mod:`dict` or list of mod:`dict`
+ :param loader_opts: Dictionaries containing each dataset loader
+ configuration, representing the keyword arguments of
+ the loader function specified by an additional key
+ called 'loader_name'. If not specified by the user,
+ this defaults to ``'local'``.
+ :type loader_opts: :class:`dict`
:raises KeyError: If an invalid argument is passed to a data source
loader function.
'''
# Reference dataset config
- self.set_reference(**reference)
-
- # Target dataset(s) config
- self.set_targets(targets)
+ self.set_loader_opts(*loader_opts)
# Default loaders
self._source_loaders = {
- 'local': local.load_file,
+ 'local': local.load_multiple_files,
'local_split': local.load_dataset_from_multiple_netcdf_files,
- 'local_multiple': local.load_multiple_files,
'esgf': esgf.load_dataset,
'rcmed': rcmed.parameter_dataset,
'dap': dap.load
}
- def add_source_loader(self, source_name, loader_func):
+ def add_source_loader(self, loader_name, loader_func):
'''
Add a custom source loader.
@@ -119,89 +108,72 @@
return an OCW Dataset object.
:type loader_func: :class:`callable`
'''
- self._source_loaders[source_name] = loader_func
+ self._source_loaders[loader_name] = loader_func
- def add_target(self, **kwargs):
+ def add_loader_opts(self, *loader_opts):
'''
- A convenient means of adding a target dataset to the loader.
- :raises KeyError: If data_source is not specified.
- '''
- if 'data_source' not in kwargs:
- raise KeyError('Dataset configuration must contain a data_source.')
- self._target_config.append(kwargs)
+ A convenient means of adding loader options for each dataset to the
+ loader. If 'loader_name' is not given in a configuration dictionary, then
+ 'local' is used by default.
- def add_targets(self, targets):
+ :param loader_opts: Dictionaries containing each dataset loader
+ configuration, representing the keyword arguments of
+ the loader function specified by an additional key
+ called 'loader_name'. If not specified by the user,
+ this defaults to ``'local'``.
+ :type loader_opts: :class:`dict`
'''
- A convenient means of adding multiple target datasets to the loader.
+ for opt in loader_opts:
+ if 'loader_name' not in opt:
+ opt['loader_name'] = 'local'
+ self._config.extend(loader_opts)
- :param targets: List of loader configurations for each target
- :type targets: List of :mod:`dict`
-
- :raises KeyError: If data_source is not specified.
+ def set_loader_opts(self, *loader_opts):
'''
- for target_config in targets:
- self.add_target(**target_config)
+ Reset the dataset loader config.
- def set_targets(self, targets):
+ :param loader_opts: Dictionaries containing each dataset loader
+ configuration, representing the keyword arguments of
+ the loader function specified by an additional key
+ called 'loader_name'. If not specified by the user,
+ this defaults to ``'local'``.
+ :type loader_opts: :class:`dict`
'''
- Reset the target dataset config.
-
- :param targets: List of loader configurations for each target
- :type targets: List of :mod:`dict`
-
- :raises KeyError: If data_source is not specified.
- '''
- # This check allows for the user to enter targets as one block or
- # as a list of separate blocks in their config files
- if not isinstance(targets, list):
- targets = [targets]
- self._target_config = []
- self.add_targets(targets)
-
- def set_reference(self, **kwargs):
- '''
- Reset the reference dataset config.
- :raises KeyError: If data_source is not specified.
- '''
- if 'data_source' not in kwargs:
- raise KeyError('Dataset configuration must contain a data_source.')
- self._reference_config = kwargs
+ self._config = []
+ self.add_loader_opts(*loader_opts)
def load_datasets(self):
'''
Loads the datasets from the given loader configurations.
'''
- # Load the reference dataset
- self.reference_dataset = self._load(**self._reference_config)
-
# Ensure output is clear if loading is performed more than once to
# prevent duplicates.
- self.target_datasets = []
+ self.datasets = []
# Load the datasets
- for loader_params in self._target_config:
- output = self._load(**loader_params)
+ for loader_opt in self._config:
+ output = self._load(**loader_opt)
# Need to account for the fact that some loaders return lists
# of OCW Dataset objects instead of just one
if isinstance(output, list):
- self.target_datasets.extend(output)
+ self.datasets.extend(output)
else:
- self.target_datasets.append(output)
+ self.datasets.append(output)
def _load(self, **kwargs):
'''
Generic dataset loading method.
'''
- # Extract the data source
- data_source = kwargs.pop('data_source')
+ # Extract the loader name
+ loader_name = kwargs.pop('loader_name')
# Find the correct loader function for the given data source
- loader_func = self._source_loaders[data_source]
+ loader_func = self._source_loaders[loader_name]
# The remaining kwargs should be specific to the loader
output = loader_func(**kwargs)
- # Preserve data_source info for later use
- kwargs['data_source'] = data_source
+ # Preserve loader_name info for later use
+ kwargs['loader_name'] = loader_name
return output
diff --git a/ocw/tests/test_dataset_loader.py b/ocw/tests/test_dataset_loader.py
index 2d192c1..b3c613b 100644
--- a/ocw/tests/test_dataset_loader.py
+++ b/ocw/tests/test_dataset_loader.py
@@ -17,7 +17,6 @@
import unittest
import os
-import copy
import netCDF4
import numpy as np
from ocw.dataset import Dataset
@@ -37,13 +36,8 @@
self.values2 = self.values + 1
# Set up config
- self.reference_config = {'data_source': 'local',
- 'file_path': self.file_path,
- 'variable_name': 'value'}
- self.target_config = copy.deepcopy(self.reference_config)
- self.no_data_source_config = {'file_path': self.file_path,
- 'variable_name': 'value'}
- self.new_data_source_config = {'data_source': 'foo',
+ self.config = {'file_path': self.file_path, 'variable_name': 'value'}
+ self.new_data_source_config = {'loader_name': 'foo',
'lats': self.latitudes,
'lons': self.longitudes,
'times': self.times,
@@ -53,77 +47,45 @@
def tearDown(self):
os.remove(self.file_path)
- def testInputHasDataSource(self):
- '''
- Make sure input data source is specified for each dataset to be loaded
- '''
- with self.assertRaises(KeyError):
- self.loader = DatasetLoader(self.reference_config,
- self.no_data_source_config)
-
- def testReferenceHasDataSource(self):
- '''
- Make sure ref data source is specified for each dataset to be loaded
- '''
- with self.assertRaises(KeyError):
- self.loader = DatasetLoader(self.reference_config,
- self.target_config)
- self.loader.set_reference(**self.no_data_source_config)
-
- def testTargetHasDataSource(self):
- '''
- Make sure target data source is specified for each dataset to be loaded
- '''
- with self.assertRaises(KeyError):
- self.loader = DatasetLoader(self.reference_config,
- self.target_config)
- self.loader.add_target(**self.no_data_source_config)
-
def testNewDataSource(self):
'''
Ensures that custom data source loaders can be added
'''
- self.loader = DatasetLoader(self.new_data_source_config,
- self.target_config)
+ self.loader = DatasetLoader(self.new_data_source_config)
- # Here the the data_source "foo" represents the Dataset constructor
+ # Here the data_source "foo" represents the Dataset constructor
self.loader.add_source_loader('foo', build_dataset)
self.loader.load_datasets()
- self.assertEqual(self.loader.reference_dataset.origin['source'],
- 'foo')
- np.testing.assert_array_equal(self.loader.reference_dataset.values,
+ self.assertEqual(self.loader.datasets[0].origin['source'], 'foo')
+ np.testing.assert_array_equal(self.loader.datasets[0].values,
self.values2)
def testExistingDataSource(self):
'''
Ensures that existing data source loaders can be added
'''
- self.loader = DatasetLoader(self.reference_config,
- self.target_config)
+ self.loader = DatasetLoader(self.config)
self.loader.load_datasets()
- self.assertEqual(self.loader.reference_dataset.origin['source'],
- 'local')
- np.testing.assert_array_equal(self.loader.reference_dataset.values,
+ self.assertEqual(self.loader.datasets[0].origin['source'], 'local')
+ np.testing.assert_array_equal(self.loader.datasets[0].values,
self.values)
- def testMultipleTargets(self):
+ def testMultipleDataSources(self):
'''
- Test for when multiple target dataset configs are specified
+ Test for when multiple dataset configs are specified
'''
- self.loader = DatasetLoader(self.reference_config,
- [self.target_config,
- self.new_data_source_config])
+ self.loader = DatasetLoader(self.config, self.new_data_source_config)
- # Here the the data_source "foo" represents the Dataset constructor
+ # Here the data_source "foo" represents the Dataset constructor
self.loader.add_source_loader('foo', build_dataset)
self.loader.load_datasets()
- self.assertEqual(self.loader.target_datasets[0].origin['source'],
+ self.assertEqual(self.loader.datasets[0].origin['source'],
'local')
- self.assertEqual(self.loader.target_datasets[1].origin['source'],
+ self.assertEqual(self.loader.datasets[1].origin['source'],
'foo')
- np.testing.assert_array_equal(self.loader.target_datasets[0].values,
+ np.testing.assert_array_equal(self.loader.datasets[0].values,
self.values)
- np.testing.assert_array_equal(self.loader.target_datasets[1].values,
+ np.testing.assert_array_equal(self.loader.datasets[1].values,
self.values2)
def build_dataset(*args, **kwargs):
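For custom sources, note that add_source_loader keeps its shape: it maps a name onto any callable that returns a Dataset (or a list of Datasets), and every key in the configuration dictionary other than 'loader_name' is forwarded to that callable as a keyword argument. A minimal sketch, assuming the usual Dataset(lats, lons, times, values, ..., origin=...) constructor and hypothetical names (my_source, load_my_data):

    >>> import datetime
    >>> import numpy as np
    >>> from ocw.dataset import Dataset
    >>> from ocw.dataset_loader import DatasetLoader
    >>> lats = np.arange(-10., 10.)
    >>> lons = np.arange(-20., 20.)
    >>> times = np.array([datetime.datetime(2000, m, 1) for m in range(1, 13)])
    >>> values = np.zeros((len(times), len(lats), len(lons)))
    >>> def load_my_data(lats=None, lons=None, times=None, values=None):
    ...     # Everything except 'loader_name' arrives here as kwargs.
    ...     return Dataset(lats, lons, times, values,
    ...                    origin={'source': 'my_source'})
    >>> loader = DatasetLoader({'loader_name': 'my_source', 'lats': lats,
    ...                         'lons': lons, 'times': times, 'values': values})
    >>> loader.add_source_loader('my_source', load_my_data)
    >>> loader.load_datasets()
    >>> loader.datasets[0].origin['source']
    'my_source'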