Merge branch 'CLIMATE-825'
diff --git a/docs/source/index.rst b/docs/source/index.rst
index 1485385..2834ee6 100644
--- a/docs/source/index.rst
+++ b/docs/source/index.rst
@@ -13,6 +13,7 @@
ocw/overview
ocw/dataset
+ ocw/dataset_loader
ocw/dataset_processor
ocw/evaluation
ocw/metrics
@@ -33,4 +34,3 @@
* :ref:`genindex`
* :ref:`modindex`
* :ref:`search`
-
diff --git a/docs/source/ocw/dataset_loader.rst b/docs/source/ocw/dataset_loader.rst
new file mode 100644
index 0000000..833b7f9
--- /dev/null
+++ b/docs/source/ocw/dataset_loader.rst
@@ -0,0 +1,5 @@
+Dataset Loader Module
+*********************
+
+.. automodule:: dataset_loader
+ :members:
diff --git a/ocw/dataset_loader.py b/ocw/dataset_loader.py
new file mode 100644
index 0000000..8ee1b93
--- /dev/null
+++ b/ocw/dataset_loader.py
@@ -0,0 +1,207 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+'''
+Classes:
+ DatasetLoader - Generate OCW Dataset objects from a variety of sources.
+'''
+
+import ocw.data_source.local as local
+import ocw.data_source.esgf as esgf
+import ocw.data_source.rcmed as rcmed
+import ocw.data_source.dap as dap
+
+
+class DatasetLoader:
+ '''Generate OCW Dataset objects from a variety of sources.'''
+
+ def __init__(self, reference, targets):
+ '''Generate OCW Dataset objects from a variety of sources.
+
+ Each keyword argument can be information for a dataset in dictionary
+ form. For example:
+ ``
+ >>> reference = {'data_source':'rcmed', 'name':'cru', 'dataset_id':10,
+ 'parameter_id':34}
+ >>> targets = {'data_source':'local_multiple',
+ 'path':'./data/CORDEX-Africa_data/AFRICA*pr.nc',
+ 'variable':'pr'}
+ >>> loader = DatasetLoader(reference, targets)
+ ``
+
+ Or more conveniently if the loader configuration is defined in a
+ yaml file named config_file (see RCMES examples):
+ ``
+ >>> import yaml
+ >>> config = yaml.load(open(config_file))
+ >>> loader = DatasetLoader(**config['datasets'])
+ ``
+
+ As shown in the first example, the dictionary for each keyword argument
+ should contain a data source and parameters specific to the loader for
+ that data source. Once the configuration is entered, the datasets may be
+ loaded using:
+ ``
+ >>> loader.load_datasets()
+ >>> target_datasets = loader.target_datasets
+ ``
+
+ If ``reference`` is entered as a keyword argument, then it may be
+ accessed from:
+ ``
+ >>> reference_dataset = loader.reference_dataset
+ ``
+
+ Additionally, each dataset must have a ``data_source`` keyword. This may
+ be one of the following:
+ * ``'local'`` - A single dataset file in a local directory
+ * ``'local_split'`` - A single dataset split across multiple files in a
+ local directory
+ * ``'local_multiple'`` - Multiple datasets in a local directory
+ * ``'esgf'`` - Download the dataset from the Earth System Grid
+ Federation
+ * ``'rcmed'`` - Download the dataset from the Regional Climate Model
+ Evaluation System Database
+ * ``'dap'`` - Download the dataset from an OPeNDAP URL
+
+ Users who wish to download datasets from sources not described above
+ may define their own custom dataset loader function and incorporate it
+ as follows:
+ >>> loader.add_source_loader('my_source_name', my_loader_func)
+
+ :param reference: The reference dataset loader configuration.
+ :type reference: :mod:`dict`
+
+ :param targets: The target dataset loader configurations.
+ :type targets: :mod:`dict` or list of :mod:`dict`
+
+ :raises KeyError: If an invalid argument is passed to a data source
+ loader function.
+ '''
+ # Reference dataset config
+ self.set_reference(**reference)
+
+ # Target dataset(s) config
+ self.set_targets(targets)
+
+ # Default loaders
+ self._source_loaders = {
+ 'local': local.load_file,
+ 'local_split': local.load_dataset_from_multiple_netcdf_files,
+ 'local_multiple': local.load_multiple_files,
+ 'esgf': esgf.load_dataset,
+ 'rcmed': rcmed.parameter_dataset,
+ 'dap': dap.load
+ }
+
+ def add_source_loader(self, source_name, loader_func):
+ '''
+ Add a custom source loader.
+
+ :param source_name: The name of the data source.
+ :type source_name: :mod:`string`
+
+ :param loader_func: Reference to a custom defined function. This should
+ return an OCW Dataset object.
+ :type loader_func: :class:`callable`
+ '''
+ self._source_loaders[source_name] = loader_func
+
+ def add_target(self, **kwargs):
+ '''
+ A convenient means of adding a target dataset to the loader.
+ :raises KeyError: If data_source is not specified.
+ '''
+ if 'data_source' not in kwargs:
+ raise KeyError('Dataset configuration must contain a data_source.')
+ self._target_config.append(kwargs)
+
+ def add_targets(self, targets):
+ '''
+ A convenient means of adding multiple target datasets to the loader.
+
+ :param targets: List of loader configurations for each target
+ :type targets: List of :mod:`dict`
+
+ :raises KeyError: If data_source is not specified.
+ '''
+ for target_config in targets:
+ self.add_target(**target_config)
+
+ def set_targets(self, targets):
+ '''
+ Reset the target dataset config.
+
+ :param targets: List of loader configurations for each target
+ :type targets: List of :mod:`dict`
+
+ :raises KeyError: If data_source is not specified.
+ '''
+ # This check allows for the user to enter targets as one block or
+ # as a list of separate blocks in their config files
+ if not isinstance(targets, list):
+ targets = [targets]
+ self._target_config = []
+ self.add_targets(targets)
+
+ def set_reference(self, **kwargs):
+ '''
+ Reset the reference dataset config.
+ :raises KeyError: If data_source is not specified.
+ '''
+ if 'data_source' not in kwargs:
+ raise KeyError('Dataset configuration must contain a data_source.')
+ self._reference_config = kwargs
+
+ def load_datasets(self):
+ '''
+ Loads the datasets from the given loader configurations.
+ '''
+ # Load the reference dataset
+ self.reference_dataset = self._load(**self._reference_config)
+
+ # Ensure output is clear if loading is performed more than once to
+ # prevent duplicates.
+ self.target_datasets = []
+
+ # Load the target datasets
+ for loader_params in self._target_config:
+ output = self._load(**loader_params)
+
+ # Need to account for the fact that some loaders return lists
+ # of OCW Dataset objects instead of just one
+ if isinstance(output, list):
+ self.target_datasets.extend(output)
+ else:
+ self.target_datasets.append(output)
+
+ def _load(self, **kwargs):
+ '''
+ Generic dataset loading method.
+ '''
+ # Extract the data source
+ data_source = kwargs.pop('data_source')
+
+ # Find the correct loader function for the given data source
+ loader_func = self._source_loaders[data_source]
+
+ # The remaining kwargs should be specific to the loader
+ output = loader_func(**kwargs)
+
+ # NOTE(review): kwargs is a fresh per-call dict built by **kwargs, so restoring data_source here has no effect visible to callers — confirm intent
+ kwargs['data_source'] = data_source
+ return output
diff --git a/ocw/tests/test_dataset_loader.py b/ocw/tests/test_dataset_loader.py
new file mode 100644
index 0000000..2d192c1
--- /dev/null
+++ b/ocw/tests/test_dataset_loader.py
@@ -0,0 +1,185 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+import unittest
+import os
+import copy
+import netCDF4
+import numpy as np
+from ocw.dataset import Dataset
+from ocw.dataset_loader import DatasetLoader
+
+class TestDatasetLoader(unittest.TestCase):
+ def setUp(self):
+ # Read netCDF file
+ self.file_path = create_netcdf_object()
+ self.netCDF_file = netCDF4.Dataset(self.file_path, 'r')
+ self.latitudes = self.netCDF_file.variables['latitude'][:]
+ self.longitudes = self.netCDF_file.variables['longitude'][:]
+ self.times = self.netCDF_file.variables['time'][:]
+ self.alt_lats = self.netCDF_file.variables['alt_lat'][:]
+ self.alt_lons = self.netCDF_file.variables['alt_lon'][:]
+ self.values = self.netCDF_file.variables['value'][:]
+ self.values2 = self.values + 1
+
+ # Set up config
+ self.reference_config = {'data_source': 'local',
+ 'file_path': self.file_path,
+ 'variable_name': 'value'}
+ self.target_config = copy.deepcopy(self.reference_config)
+ self.no_data_source_config = {'file_path': self.file_path,
+ 'variable_name': 'value'}
+ self.new_data_source_config = {'data_source': 'foo',
+ 'lats': self.latitudes,
+ 'lons': self.longitudes,
+ 'times': self.times,
+ 'values': self.values2,
+ 'variable': 'value'}
+
+ def tearDown(self):
+ os.remove(self.file_path)
+
+ def testInputHasDataSource(self):
+ '''
+ Make sure input data source is specified for each dataset to be loaded
+ '''
+ with self.assertRaises(KeyError):
+ self.loader = DatasetLoader(self.reference_config,
+ self.no_data_source_config)
+
+ def testReferenceHasDataSource(self):
+ '''
+ Make sure ref data source is specified for each dataset to be loaded
+ '''
+ with self.assertRaises(KeyError):
+ self.loader = DatasetLoader(self.reference_config,
+ self.target_config)
+ self.loader.set_reference(**self.no_data_source_config)
+
+ def testTargetHasDataSource(self):
+ '''
+ Make sure target data source is specified for each dataset to be loaded
+ '''
+ with self.assertRaises(KeyError):
+ self.loader = DatasetLoader(self.reference_config,
+ self.target_config)
+ self.loader.add_target(**self.no_data_source_config)
+
+ def testNewDataSource(self):
+ '''
+ Ensures that custom data source loaders can be added
+ '''
+ self.loader = DatasetLoader(self.new_data_source_config,
+ self.target_config)
+
+ # Here the data_source "foo" represents the Dataset constructor
+ self.loader.add_source_loader('foo', build_dataset)
+ self.loader.load_datasets()
+ self.assertEqual(self.loader.reference_dataset.origin['source'],
+ 'foo')
+ np.testing.assert_array_equal(self.loader.reference_dataset.values,
+ self.values2)
+
+ def testExistingDataSource(self):
+ '''
+ Ensures that existing data source loaders can be added
+ '''
+ self.loader = DatasetLoader(self.reference_config,
+ self.target_config)
+ self.loader.load_datasets()
+ self.assertEqual(self.loader.reference_dataset.origin['source'],
+ 'local')
+ np.testing.assert_array_equal(self.loader.reference_dataset.values,
+ self.values)
+
+ def testMultipleTargets(self):
+ '''
+ Test for when multiple target dataset configs are specified
+ '''
+ self.loader = DatasetLoader(self.reference_config,
+ [self.target_config,
+ self.new_data_source_config])
+
+ # Here the data_source "foo" represents the Dataset constructor
+ self.loader.add_source_loader('foo', build_dataset)
+ self.loader.load_datasets()
+ self.assertEqual(self.loader.target_datasets[0].origin['source'],
+ 'local')
+ self.assertEqual(self.loader.target_datasets[1].origin['source'],
+ 'foo')
+ np.testing.assert_array_equal(self.loader.target_datasets[0].values,
+ self.values)
+ np.testing.assert_array_equal(self.loader.target_datasets[1].values,
+ self.values2)
+
+def build_dataset(*args, **kwargs):
+ '''
+ Wrapper to Dataset constructor from fictitious 'foo' data_source.
+ '''
+ origin = {'source': 'foo'}
+ return Dataset(*args, origin=origin, **kwargs)
+
+def create_netcdf_object():
+ # To create the temporary netCDF file
+ file_path = '/tmp/temporaryNetcdf.nc'
+ netCDF_file = netCDF4.Dataset(file_path, 'w', format='NETCDF4')
+ # To create dimensions
+ netCDF_file.createDimension('lat_dim', 5)
+ netCDF_file.createDimension('lon_dim', 5)
+ netCDF_file.createDimension('time_dim', 3)
+ # To create variables
+ latitudes = netCDF_file.createVariable('latitude', 'd', ('lat_dim',))
+ longitudes = netCDF_file.createVariable('longitude', 'd', ('lon_dim',))
+ times = netCDF_file.createVariable('time', 'd', ('time_dim',))
+ # unusual variable names to test optional arguments for Dataset constructor
+ alt_lats = netCDF_file.createVariable('alt_lat', 'd', ('lat_dim',))
+ alt_lons = netCDF_file.createVariable('alt_lon', 'd', ('lon_dim',))
+ alt_times = netCDF_file.createVariable('alt_time', 'd', ('time_dim',))
+ values = netCDF_file.createVariable('value', 'd',
+ ('time_dim',
+ 'lat_dim',
+ 'lon_dim')
+ )
+
+ # Create latitudes and longitudes with five values each
+ latitudes_data = np.arange(5.)
+ longitudes_data = np.arange(150., 155.)
+ # Three months of data.
+ times_data = np.arange(3)
+ # Create 75 values
+ values_data = np.array([i for i in range(75)])
+ # Reshape values to 3D array (time, lats, lons)
+ values_data = values_data.reshape(len(times_data), len(latitudes_data),
+ len(longitudes_data))
+
+ # Ingest values to netCDF file
+ latitudes[:] = latitudes_data
+ longitudes[:] = longitudes_data
+ times[:] = times_data
+ alt_lats[:] = latitudes_data + 10
+ alt_lons[:] = longitudes_data - 10
+ alt_times[:] = times_data
+ values[:] = values_data
+ # Assign time info to time variable
+ netCDF_file.variables['time'].units = 'months since 2001-01-01 00:00:00'
+ netCDF_file.variables['alt_time'].units = 'months since 2001-04-01 00:00:00'
+ netCDF_file.variables['value'].units = 'foo_units'
+ netCDF_file.close()
+ return file_path
+
+if __name__ == '__main__':
+ unittest.main()