Updates to DatasetLoader
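This change collapses DatasetLoader's reference/target split into one flat interface: the constructor now takes any number of loader-configuration dictionaries, each selecting its loader function through an optional 'loader_name' key ('local' by default, which now maps to local.load_multiple_files), and every loaded dataset lands in the single list loader.datasets. A minimal sketch of the new call pattern, reusing the RCMED ids from the docstring below with a hypothetical local file path and variable name:

    >>> from ocw.dataset_loader import DatasetLoader
    >>> obs = {'loader_name': 'rcmed', 'name': 'cru',
    ...        'dataset_id': 10, 'parameter_id': 34}
    >>> models = {'file_path': './data/model_precip.nc',  # hypothetical path
    ...           'variable_name': 'pr'}  # no 'loader_name', so 'local' is used
    >>> loader = DatasetLoader(obs, models)
    >>> loader.load_datasets()
    >>> all_datasets = loader.datasets  # one flat list; no reference/target split

Custom loaders still plug in through add_source_loader; see the sketch after the diff.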
diff --git a/RCMES/run_RCMES.py b/RCMES/run_RCMES.py
index cd69bc4..3cc2245 100644
--- a/RCMES/run_RCMES.py
+++ b/RCMES/run_RCMES.py
@@ -39,7 +39,7 @@
from metrics_and_plots import *
-import ssl
+import ssl
if hasattr(ssl, '_create_unverified_context'):
ssl._create_default_https_context = ssl._create_unverified_context
@@ -107,12 +107,13 @@
boundary_check_model = True
if 'GCM_data' in model_data_info.keys():
if model_data_info['GCM_data']:
- boundary_check_model = False
+ boundary_check_model = False
print 'Loading model datasets:\n',model_data_info
if model_data_info['data_source'] == 'local':
- model_datasets, model_names = local.load_multiple_files(file_path = model_data_info['path'],
- variable_name =model_data_info['variable'],
- lat_name=model_lat_name, lon_name=model_lon_name)
+ model_datasets = local.load_multiple_files(file_path=model_data_info['path'],
+ variable_name =model_data_info['variable'],
+ lat_name=model_lat_name, lon_name=model_lon_name)
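+ # load_multiple_files now returns only the Dataset list; names are taken from each Dataset below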
+ model_names = [dataset.name for dataset in model_datasets]
elif model_data_info['data_source'] == 'ESGF':
md = esgf.load_dataset(dataset_id=model_data_info['dataset_id'],
variable=model_data_info['variable'],
@@ -166,7 +167,7 @@
# generate grid points for regridding
if config['regrid']['regrid_on_reference']:
new_lat = ref_dataset.lats
- new_lon = ref_dataset.lons
+ new_lon = ref_dataset.lons
else:
delta_lat = config['regrid']['regrid_dlat']
delta_lon = config['regrid']['regrid_dlon']
@@ -178,7 +179,7 @@
# number of models
nmodel = len(model_datasets)
print 'Dataset loading completed'
-print 'Observation data:', ref_name
+print 'Observation data:', ref_name
print 'Number of model datasets:',nmodel
for model_name in model_names:
print model_name
@@ -200,7 +201,7 @@
ref_dataset = dsp.variable_unit_conversion(ref_dataset)
for idata,dataset in enumerate(model_datasets):
model_datasets[idata] = dsp.variable_unit_conversion(dataset)
-
+
print 'Generating multi-model ensemble'
if len(model_datasets) >= 2.:
@@ -217,8 +218,8 @@
print 'Calculating spatial averages and standard deviations of ',str(nsubregion),' subregions'
- ref_subregion_mean, ref_subregion_std, subregion_array = utils.calc_subregion_area_mean_and_std([ref_dataset], subregions)
- model_subregion_mean, model_subregion_std, subregion_array = utils.calc_subregion_area_mean_and_std(model_datasets, subregions)
+ ref_subregion_mean, ref_subregion_std, subregion_array = utils.calc_subregion_area_mean_and_std([ref_dataset], subregions)
+ model_subregion_mean, model_subregion_std, subregion_array = utils.calc_subregion_area_mean_and_std(model_datasets, subregions)
""" Step 7: Write a netCDF file """
workdir = config['workdir']
@@ -231,7 +232,7 @@
if config['use_subregions']:
dsp.write_netcdf_multiple_datasets_with_subregions(ref_dataset, ref_name, model_datasets, model_names,
path=workdir+config['output_netcdf_filename'],
- subregions=subregions, subregion_array = subregion_array,
+ subregions=subregions, subregion_array = subregion_array,
ref_subregion_mean=ref_subregion_mean, ref_subregion_std=ref_subregion_std,
model_subregion_mean=model_subregion_mean, model_subregion_std=model_subregion_std)
else:
@@ -279,5 +280,3 @@
file_name)
else:
print 'please check the currently supported metrics'
-
-
diff --git a/ocw/data_source/local.py b/ocw/data_source/local.py
index 35041ac..98de937 100644
--- a/ocw/data_source/local.py
+++ b/ocw/data_source/local.py
@@ -350,7 +350,7 @@
datasets.append(load_file(filename, variable_name, variable_unit, name=data_name[ifile],
lat_name=lat_name, lon_name=lon_name, time_name=time_name))
- return datasets, data_name
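+ # Per-file names are no longer returned; callers should use dataset.name on each element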
+ return datasets
def load_WRF_2d_files_RAIN(file_path=None,
filename_pattern=None,
diff --git a/ocw/dataset_loader.py b/ocw/dataset_loader.py
index 8ee1b93..3c8b95f 100644
--- a/ocw/dataset_loader.py
+++ b/ocw/dataset_loader.py
@@ -27,20 +27,19 @@
class DatasetLoader:
- '''Generate OCW Dataset objects from a variety of sources.'''
+ '''Generate a list of OCW Dataset objects from a variety of sources.'''
- def __init__(self, reference, targets):
- '''Generate OCW Dataset objects from a variety of sources.
+ def __init__(self, *loader_opts):
+ '''Generate a list of OCW Dataset objects from a variety of sources.
Each argument can be information for a dataset in dictionary
form. For example:
``
- >>> reference = {'data_source':'rcmed', 'name':'cru', 'dataset_id':10,
- 'parameter_id':34}
- >>> targets = {'data_source':'local_multiple',
- 'path':'./data/CORDEX-Africa_data/AFRICA*pr.nc',
- 'variable':'pr'}
- >>> loader = DatasetLoader(reference, targets)
+ >>> loader_opt1 = {'loader_name': 'rcmed', 'name': 'cru',
+ 'dataset_id': 10, 'parameter_id': 34}
+ >>> loader_opt2 = {'file_path': './data/TRMM_v7_3B43_1980-2010.nc',
+ 'variable_name': 'pcp'}
+ >>> loader = DatasetLoader(loader_opt1, loader_opt2)
``
Or more conveniently if the loader configuration is defined in a
@@ -48,67 +47,57 @@
``
>>> import yaml
>>> config = yaml.load(open(config_file))
- >>> loader = DatasetLoader(**config['datasets'])
+ >>> obs_loader_config = config['datasets']['reference']
+ >>> loader = DatasetLoader(*obs_loader_config)
``
- As shown in the first example, the dictionary for each keyword argument
- should contain a data source and parameters specific to the loader for
- that data source. Once the configuration is entered, the datasets may be
- loaded using:
+ As shown in the first example, the dictionary for each argument should
+ contain a loader name and parameters specific to the particular loader.
+ Once the configuration is entered, the datasets may be loaded using:
``
>>> loader.load_datasets()
- >>> target_datasets = loader.target_datasets
+ >>> obs_datasets = loader.datasets
``
- If ``reference`` is entered as a keyword argument, then it may be
- accesed from:
- ``
- >>> reference_dataset = loader.reference_dataset
- ``
-
- Additionally, each dataset must have a ``data_source`` keyword. This may
+ Additionally, each dataset configuration may include a ``loader_name`` key (defaulting to ``'local'``). This may
be one of the following:
- * ``'local'`` - A single dataset file in a local directory
+ * ``'local'`` - One or multiple dataset files in a local directory
* ``'local_split'`` - A single dataset split across multiple files in a
local directory
- * ``'local_multiple'`` - Multiple datasets in a local directory
* ``'esgf'`` - Download the dataset from the Earth System Grid
Federation
* ``'rcmed'`` - Download the dataset from the Regional Climate Model
Evaluation System Database
* ``'dap'`` - Download the dataset from an OPeNDAP URL
- Users who wish to download datasets from sources not described above
+ Users who wish to load datasets using a loader not described above
may define their own custom dataset loader function and incorporate it
as follows:
- >>> loader.add_source_loader('my_source_name', my_loader_func)
+ >>> loader.add_source_loader('my_loader_name', my_loader_func)
- :param reference: The reference dataset loader configuration.
- :type reference: :mod:`dict`
-
- :param targets: The target dataset loader configurations.
- :type targets: :mod:`dict` or list of mod:`dict`
+ :param loader_opts: Dictionaries containing each dataset loader
+ configuration, representing the keyword arguments of
+ the loader function specified by an additional key
+ called 'loader_name'. If not specified by the user,
+ this defaults to ``'local'``.
+ :type loader_opts: :class:`dict`
:raises KeyError: If an invalid argument is passed to a data source
loader function.
'''
# Reference dataset config
- self.set_reference(**reference)
-
- # Target dataset(s) config
- self.set_targets(targets)
+ self.set_loader_opts(*loader_opts)
# Default loaders
self._source_loaders = {
- 'local': local.load_file,
+ 'local': local.load_multiple_files,
'local_split': local.load_dataset_from_multiple_netcdf_files,
- 'local_multiple': local.load_multiple_files,
'esgf': esgf.load_dataset,
'rcmed': rcmed.parameter_dataset,
'dap': dap.load
}
- def add_source_loader(self, source_name, loader_func):
+ def add_source_loader(self, loader_name, loader_func):
'''
Add a custom source loader.
@@ -119,89 +108,72 @@
return an OCW Dataset object.
:type loader_func: :class:`callable`
'''
- self._source_loaders[source_name] = loader_func
+ self._source_loaders[loader_name] = loader_func
- def add_target(self, **kwargs):
+ def add_loader_opts(self, *loader_opts):
'''
- A convenient means of adding a target dataset to the loader.
- :raises KeyError: If data_source is not specified.
- '''
- if 'data_source' not in kwargs:
- raise KeyError('Dataset configuration must contain a data_source.')
- self._target_config.append(kwargs)
+ A convenient means of adding loader options for each dataset to the
+ loader. If 'loader_name' is not given in a configuration dictionary, then
+ 'local' is used by default.
- def add_targets(self, targets):
+ :param loader_opts: Dictionaries containing each dataset loader
+ configuration, representing the keyword arguments of
+ the loader function specified by an additional key
+ called 'loader_name'. If not specified by the user,
+ this defaults to ``'local'``.
+ :type loader_opts: :class:`dict`
'''
- A convenient means of adding multiple target datasets to the loader.
+ for opt in loader_opts:
+ if 'loader_name' not in opt:
+ opt['loader_name'] = 'local'
+ self._config.extend(loader_opts)
- :param targets: List of loader configurations for each target
- :type targets: List of :mod:`dict`
-
- :raises KeyError: If data_source is not specified.
+ def set_loader_opts(self, *loader_opts):
'''
- for target_config in targets:
- self.add_target(**target_config)
+ Reset the dataset loader config.
- def set_targets(self, targets):
+ :param loader_opts: Dictionaries containing each dataset loader
+ configuration, representing the keyword arguments of
+ the loader function specified by an additional key
+ called 'loader_name'. If not specified by the user,
+ this defaults to ``'local'``.
+ :type loader_opts: :class:`dict`
'''
- Reset the target dataset config.
-
- :param targets: List of loader configurations for each target
- :type targets: List of :mod:`dict`
-
- :raises KeyError: If data_source is not specified.
- '''
- # This check allows for the user to enter targets as one block or
- # as a list of separate blocks in their config files
- if not isinstance(targets, list):
- targets = [targets]
- self._target_config = []
- self.add_targets(targets)
-
- def set_reference(self, **kwargs):
- '''
- Reset the reference dataset config.
- :raises KeyError: If data_source is not specified.
- '''
- if 'data_source' not in kwargs:
- raise KeyError('Dataset configuration must contain a data_source.')
- self._reference_config = kwargs
+ self._config = []
+ self.add_loader_opts(*loader_opts)
def load_datasets(self):
'''
Loads the datasets from the given loader configurations.
'''
- # Load the reference dataset
- self.reference_dataset = self._load(**self._reference_config)
-
# Ensure output is clear if loading is performed more than once to
# prevent duplicates.
- self.target_datasets = []
+ self.datasets = []
# Load the datasets
- for loader_params in self._target_config:
- output = self._load(**loader_params)
+ for loader_opt in self._config:
+ output = self._load(**loader_opt)
# Need to account for the fact that some loaders return lists
# of OCW Dataset objects instead of just one
if isinstance(output, list):
- self.target_datasets.extend(output)
+ self.datasets.extend(output)
else:
- self.target_datasets.append(output)
+ self.datasets.append(output)
def _load(self, **kwargs):
'''
Generic dataset loading method.
'''
- # Extract the data source
- data_source = kwargs.pop('data_source')
+ # Extract the loader name
+ loader_name = kwargs.pop('loader_name')
# Find the correct loader function for the given data source
- loader_func = self._source_loaders[data_source]
+ loader_func = self._source_loaders[loader_name]
# The remaining kwargs should be specific to the loader
output = loader_func(**kwargs)
- # Preserve data_source info for later use
- kwargs['data_source'] = data_source
+ # Preserve loader_name info for later use
+ kwargs['loader_name'] = loader_name
return output
diff --git a/ocw/tests/test_dataset_loader.py b/ocw/tests/test_dataset_loader.py
index 2d192c1..b3c613b 100644
--- a/ocw/tests/test_dataset_loader.py
+++ b/ocw/tests/test_dataset_loader.py
@@ -17,7 +17,6 @@
import unittest
import os
-import copy
import netCDF4
import numpy as np
from ocw.dataset import Dataset
@@ -37,13 +36,8 @@
self.values2 = self.values + 1
# Set up config
- self.reference_config = {'data_source': 'local',
- 'file_path': self.file_path,
- 'variable_name': 'value'}
- self.target_config = copy.deepcopy(self.reference_config)
- self.no_data_source_config = {'file_path': self.file_path,
- 'variable_name': 'value'}
- self.new_data_source_config = {'data_source': 'foo',
+ self.config = {'file_path': self.file_path, 'variable_name': 'value'}
+ self.new_data_source_config = {'loader_name': 'foo',
'lats': self.latitudes,
'lons': self.longitudes,
'times': self.times,
@@ -53,77 +47,45 @@
def tearDown(self):
os.remove(self.file_path)
- def testInputHasDataSource(self):
- '''
- Make sure input data source is specified for each dataset to be loaded
- '''
- with self.assertRaises(KeyError):
- self.loader = DatasetLoader(self.reference_config,
- self.no_data_source_config)
-
- def testReferenceHasDataSource(self):
- '''
- Make sure ref data source is specified for each dataset to be loaded
- '''
- with self.assertRaises(KeyError):
- self.loader = DatasetLoader(self.reference_config,
- self.target_config)
- self.loader.set_reference(**self.no_data_source_config)
-
- def testTargetHasDataSource(self):
- '''
- Make sure target data source is specified for each dataset to be loaded
- '''
- with self.assertRaises(KeyError):
- self.loader = DatasetLoader(self.reference_config,
- self.target_config)
- self.loader.add_target(**self.no_data_source_config)
-
def testNewDataSource(self):
'''
Ensures that custom data source loaders can be added
'''
- self.loader = DatasetLoader(self.new_data_source_config,
- self.target_config)
+ self.loader = DatasetLoader(self.new_data_source_config)
- # Here the the data_source "foo" represents the Dataset constructor
+ # Here the data_source "foo" represents the Dataset constructor
self.loader.add_source_loader('foo', build_dataset)
self.loader.load_datasets()
- self.assertEqual(self.loader.reference_dataset.origin['source'],
- 'foo')
- np.testing.assert_array_equal(self.loader.reference_dataset.values,
+ self.assertEqual(self.loader.datasets[0].origin['source'], 'foo')
+ np.testing.assert_array_equal(self.loader.datasets[0].values,
self.values2)
def testExistingDataSource(self):
'''
Ensures that existing data source loaders can be added
'''
- self.loader = DatasetLoader(self.reference_config,
- self.target_config)
+ self.loader = DatasetLoader(self.config)
self.loader.load_datasets()
- self.assertEqual(self.loader.reference_dataset.origin['source'],
- 'local')
- np.testing.assert_array_equal(self.loader.reference_dataset.values,
+ self.assertEqual(self.loader.datasets[0].origin['source'], 'local')
+ np.testing.assert_array_equal(self.loader.datasets[0].values,
self.values)
- def testMultipleTargets(self):
+ def testMultipleDataSources(self):
'''
- Test for when multiple target dataset configs are specified
+ Test for when multiple dataset configs are specified
'''
- self.loader = DatasetLoader(self.reference_config,
- [self.target_config,
- self.new_data_source_config])
+ self.loader = DatasetLoader(self.config, self.new_data_source_config)
- # Here the the data_source "foo" represents the Dataset constructor
+ # Here the data_source "foo" represents the Dataset constructor
self.loader.add_source_loader('foo', build_dataset)
self.loader.load_datasets()
- self.assertEqual(self.loader.target_datasets[0].origin['source'],
+ self.assertEqual(self.loader.datasets[0].origin['source'],
'local')
- self.assertEqual(self.loader.target_datasets[1].origin['source'],
+ self.assertEqual(self.loader.datasets[1].origin['source'],
'foo')
- np.testing.assert_array_equal(self.loader.target_datasets[0].values,
+ np.testing.assert_array_equal(self.loader.datasets[0].values,
self.values)
- np.testing.assert_array_equal(self.loader.target_datasets[1].values,
+ np.testing.assert_array_equal(self.loader.datasets[1].values,
self.values2)
def build_dataset(*args, **kwargs):
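For custom sources, note that add_source_loader keeps its shape: it maps a name onto any callable that returns a Dataset (or a list of Datasets), and every key in the configuration dictionary other than 'loader_name' is forwarded to that callable as a keyword argument. A minimal sketch, assuming the usual Dataset(lats, lons, times, values, ..., origin=...) constructor and hypothetical names (my_source, load_my_data):

    >>> import datetime
    >>> import numpy as np
    >>> from ocw.dataset import Dataset
    >>> from ocw.dataset_loader import DatasetLoader
    >>> lats = np.arange(-10., 10.)
    >>> lons = np.arange(-20., 20.)
    >>> times = np.array([datetime.datetime(2000, m, 1) for m in range(1, 13)])
    >>> values = np.zeros((len(times), len(lats), len(lons)))
    >>> def load_my_data(lats=None, lons=None, times=None, values=None):
    ...     # Everything except 'loader_name' arrives here as kwargs.
    ...     return Dataset(lats, lons, times, values,
    ...                    origin={'source': 'my_source'})
    >>> loader = DatasetLoader({'loader_name': 'my_source', 'lats': lats,
    ...                         'lons': lons, 'times': times, 'values': values})
    >>> loader.add_source_loader('my_source', load_my_data)
    >>> loader.load_datasets()
    >>> loader.datasets[0].origin['source']
    'my_source'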