ocw/dataset_loader.py - climate - Git at Google

 # Licensed to the Apache Software Foundation (ASF) under one
 # or more contributor license agreements.  See the NOTICE file
 # distributed with this work for additional information
 # regarding copyright ownership.  The ASF licenses this file
 # to you under the Apache License, Version 2.0 (the
 # "License"); you may not use this file except in compliance
 # with the License.  You may obtain a copy of the License at
 #
 #    http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing,
 # software distributed under the License is distributed on an
 # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 # KIND, either express or implied.  See the License for the
 # specific language governing permissions and limitations
 # under the License.

 '''
 Classes:
     DatasetLoader - Generate OCW Dataset objects from a variety of sources.
 '''

 import ocw.data_source.local as local
 import ocw.data_source.rcmed as rcmed
 import ocw.data_source.podaac_datasource as podaac
 import warnings

 class DatasetLoader:
     '''Generate a list of OCW Dataset objects from a variety of sources.'''

     def __init__(self, *loader_opts):
         '''Generate a list of OCW Dataset objects from a variety of sources.

         Each keyword argument can be information for a dataset in dictionary
         form. For example:

         >>> loader_opt1 = {'loader_name': 'rcmed', 'name': 'cru',
                            'dataset_id': 10, 'parameter_id': 34}
         >>> loader_opt2 = {'path': './data/TRMM_v7_3B43_1980-2010.nc,
                            'variable': 'pcp'}
         >>> loader = DatasetLoader(loader_opt1, loader_opt2)

         Or more conveniently if the loader configuration is defined in a
         yaml file named config_file (see RCMES examples):

         >>> import yaml
         >>> config = yaml.load(open(config_file))
         >>> obs_loader_config = config['datasets']['reference']
         >>> loader = DatasetLoader(*obs_loader_config)

         As shown in the first example, the dictionary for each argument should
         contain a loader name and parameters specific to the particular loader.
         Once the configuration is entered, the datasets may be loaded using:

         >>> loader.load_datasets()
         >>> obs_datasets = loader.datasets

         Additionally, each dataset must have a ``loader_name`` keyword. This may
         be one of the following:

         * ``'local'`` - One or multiple dataset files in a local directory
         * ``'local_split'`` - A single dataset split accross multiple files in a
                               local directory
         * ``'esgf'`` - Download the dataset from the Earth System Grid
                        Federation
         * ``'rcmed'`` - Download the dataset from the Regional Climate Model
                         Evaluation System Database
         * ``'dap'`` - Download the dataset from an OPeNDAP URL
         * ``'podaac'`` - Download the dataset from Physical Oceanography
                         Distributed Active Archive Center

         Users who wish to load datasets from loaders not described above may
         define their own custom dataset loader function and incorporate it as
         follows:

         >>> loader.add_source_loader('my_loader_name', my_loader_func)

         :param loader_opts: Dictionaries containing the each dataset loader
                             configuration, representing the keyword arguments of
                             the loader function specified by an additional key
                             called 'loader_name'. If not specified by the user,
                             this defaults to local.
         :type loader_opts: :class:`dict`

         :raises KeyError: If an invalid argument is passed to a data source
                             loader function.
         '''
         # dataset loader config
         self.set_loader_opts(*loader_opts)

         # Default loaders
         self._source_loaders = {
             'local': local.load_multiple_files,
             'local_split': local.load_dataset_from_multiple_netcdf_files,
             'rcmed': rcmed.parameter_dataset,
             'podaac': podaac.extract_l4_granule
         }

         # Exclude esgf and dap for python 3 until they are compatible
         try:
             import ocw.data_source.esgf as esgf
             import ocw.data_source.dap as dap
             self._source_loaders['dap'] = dap.load
             self._source_loaders['esgf'] = esgf.load_dataset
         except ImportError:
             warnings.warn('dap and esgf loaders missing. If these are needed, '
                           'fallback to python 2.7.x.')

     def add_source_loader(self, loader_name, loader_func):
         '''
         Add a custom source loader.

         :param loader_name: The name of the data source.
         :type loader_name: :mod:`string`

         :param loader_func: Reference to a custom defined function. This should
                             return an OCW Dataset object, and have an origin which satisfies
                             origin['source'] == loader_name.
         :type loader_func: :class:`callable`
         '''
         self._source_loaders[loader_name] = loader_func

     def add_loader_opts(self, *loader_opts):
         '''
         A convenient means of adding loader options for each dataset to the
         loader.

         :param loader_opts: Dictionaries containing the each dataset loader
                             configuration, representing the keyword arguments of
                             the loader function specified by an additional key
                             called 'loader_name'. If not specified by the user,
                             this defaults to local.
         :type loader_opts: :mod:`dict`
         '''
         for opt in loader_opts:
             if 'loader_name' not in opt:
                 opt['loader_name'] = 'local'
         self._config.extend(loader_opts)

     def set_loader_opts(self, *loader_opts):
         '''
         Reset the dataset loader config.

         :param loader_opts: Dictionaries containing the each dataset loader
                             configuration, representing the keyword arguments of
                             the loader function specified by an additional key
                             called 'loader_name'. If not specified by the user,
                             this defaults to local.
         :type loader_opts: :mod:`dict`
         '''
         self._config = []
         self.add_loader_opts(*loader_opts)

     def load_datasets(self):
         '''
         Loads the datasets from the given loader configurations.
         '''
         # Ensure output is clear if loading is performed more than once to
         # prevent duplicates.
         self.datasets = []

         # Load the datasets
         for loader_opt in self._config:
             output = self._load(**loader_opt)

             # Need to account for the fact that some loaders return lists
             # of OCW Dataset objects instead of just one
             if isinstance(output, list):
                 self.datasets.extend(output)
             else:
                 self.datasets.append(output)

     def _load(self, **kwargs):
         '''
         Generic dataset loading method.
         '''
         # Extract the loader name
         loader_name = kwargs.pop('loader_name')

         # Find the correct loader function for the given data source
         loader_func = self._source_loaders[loader_name]

         # The remaining kwargs should be specific to the loader
         output = loader_func(**kwargs)

         # Preserve loader_name info for later use
         kwargs['loader_name'] = loader_name
         return output
	# Licensed to the Apache Software Foundation (ASF) under one
	# or more contributor license agreements. See the NOTICE file
	# distributed with this work for additional information
	# regarding copyright ownership. The ASF licenses this file
	# to you under the Apache License, Version 2.0 (the
	# "License"); you may not use this file except in compliance
	# with the License. You may obtain a copy of the License at
	#
	# http://www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing,
	# software distributed under the License is distributed on an
	# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
	# KIND, either express or implied. See the License for the
	# specific language governing permissions and limitations
	# under the License.

	'''
	Classes:
	DatasetLoader - Generate OCW Dataset objects from a variety of sources.
	'''

	import ocw.data_source.local as local
	import ocw.data_source.rcmed as rcmed
	import ocw.data_source.podaac_datasource as podaac
	import warnings

	class DatasetLoader:
	'''Generate a list of OCW Dataset objects from a variety of sources.'''

	def __init__(self, *loader_opts):
	'''Generate a list of OCW Dataset objects from a variety of sources.

	Each keyword argument can be information for a dataset in dictionary
	form. For example:

	>>> loader_opt1 = {'loader_name': 'rcmed', 'name': 'cru',
	'dataset_id': 10, 'parameter_id': 34}
	>>> loader_opt2 = {'path': './data/TRMM_v7_3B43_1980-2010.nc,
	'variable': 'pcp'}
	>>> loader = DatasetLoader(loader_opt1, loader_opt2)

	Or more conveniently if the loader configuration is defined in a
	yaml file named config_file (see RCMES examples):

	>>> import yaml
	>>> config = yaml.load(open(config_file))
	>>> obs_loader_config = config['datasets']['reference']
	>>> loader = DatasetLoader(*obs_loader_config)

	As shown in the first example, the dictionary for each argument should
	contain a loader name and parameters specific to the particular loader.
	Once the configuration is entered, the datasets may be loaded using:

	>>> loader.load_datasets()
	>>> obs_datasets = loader.datasets

	Additionally, each dataset must have a ``loader_name`` keyword. This may
	be one of the following:

	* ``'local'`` - One or multiple dataset files in a local directory
	* ``'local_split'`` - A single dataset split accross multiple files in a
	local directory
	* ``'esgf'`` - Download the dataset from the Earth System Grid
	Federation
	* ``'rcmed'`` - Download the dataset from the Regional Climate Model
	Evaluation System Database
	* ``'dap'`` - Download the dataset from an OPeNDAP URL
	* ``'podaac'`` - Download the dataset from Physical Oceanography
	Distributed Active Archive Center

	Users who wish to load datasets from loaders not described above may
	define their own custom dataset loader function and incorporate it as
	follows:

	>>> loader.add_source_loader('my_loader_name', my_loader_func)

	:param loader_opts: Dictionaries containing the each dataset loader
	configuration, representing the keyword arguments of
	the loader function specified by an additional key
	called 'loader_name'. If not specified by the user,
	this defaults to local.
	:type loader_opts: :class:`dict`

	:raises KeyError: If an invalid argument is passed to a data source
	loader function.
	'''
	# dataset loader config
	self.set_loader_opts(*loader_opts)

	# Default loaders
	self._source_loaders = {
	'local': local.load_multiple_files,
	'local_split': local.load_dataset_from_multiple_netcdf_files,
	'rcmed': rcmed.parameter_dataset,
	'podaac': podaac.extract_l4_granule
	}

	# Exclude esgf and dap for python 3 until they are compatible
	try:
	import ocw.data_source.esgf as esgf
	import ocw.data_source.dap as dap
	self._source_loaders['dap'] = dap.load
	self._source_loaders['esgf'] = esgf.load_dataset
	except ImportError:
	warnings.warn('dap and esgf loaders missing. If these are needed, '
	'fallback to python 2.7.x.')

	def add_source_loader(self, loader_name, loader_func):
	'''
	Add a custom source loader.

	:param loader_name: The name of the data source.
	:type loader_name: :mod:`string`

	:param loader_func: Reference to a custom defined function. This should
	return an OCW Dataset object, and have an origin which satisfies
	origin['source'] == loader_name.
	:type loader_func: :class:`callable`
	'''
	self._source_loaders[loader_name] = loader_func

	def add_loader_opts(self, *loader_opts):
	'''
	A convenient means of adding loader options for each dataset to the
	loader.

	:param loader_opts: Dictionaries containing the each dataset loader
	configuration, representing the keyword arguments of
	the loader function specified by an additional key
	called 'loader_name'. If not specified by the user,
	this defaults to local.
	:type loader_opts: :mod:`dict`
	'''
	for opt in loader_opts:
	if 'loader_name' not in opt:
	opt['loader_name'] = 'local'
	self._config.extend(loader_opts)

	def set_loader_opts(self, *loader_opts):
	'''
	Reset the dataset loader config.

	:param loader_opts: Dictionaries containing the each dataset loader
	configuration, representing the keyword arguments of
	the loader function specified by an additional key
	called 'loader_name'. If not specified by the user,
	this defaults to local.
	:type loader_opts: :mod:`dict`
	'''
	self._config = []
	self.add_loader_opts(*loader_opts)

	def load_datasets(self):
	'''
	Loads the datasets from the given loader configurations.
	'''
	# Ensure output is clear if loading is performed more than once to
	# prevent duplicates.
	self.datasets = []

	# Load the datasets
	for loader_opt in self._config:
	output = self._load(**loader_opt)

	# Need to account for the fact that some loaders return lists
	# of OCW Dataset objects instead of just one
	if isinstance(output, list):
	self.datasets.extend(output)
	else:
	self.datasets.append(output)

	def _load(self, **kwargs):
	'''
	Generic dataset loading method.
	'''
	# Extract the loader name
	loader_name = kwargs.pop('loader_name')

	# Find the correct loader function for the given data source
	loader_func = self._source_loaders[loader_name]

	# The remaining kwargs should be specific to the loader
	output = loader_func(**kwargs)

	# Preserve loader_name info for later use
	kwargs['loader_name'] = loader_name
	return output