# ocw/data_source/esgf.py - climate - Git at Google

 #
 # Licensed to the Apache Software Foundation (ASF) under one
 # or more contributor license agreements.  See the NOTICE file
 # distributed with this work for additional information
 # regarding copyright ownership.  The ASF licenses this file
 # to you under the Apache License, Version 2.0 (the
 # "License"); you may not use this file except in compliance
 # with the License.  You may obtain a copy of the License at
 #
 #    http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing,
 # software distributed under the License is distributed on an
 # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 # KIND, either express or implied.  See the License for the
 # specific language governing permissions and limitations
 # under the License.
 #
 """
 A set of functions to wrap downloading ESGF datasets into an OCW dataset object.

 *** Note *** The ESGF data source requires that the user have certain credentials downloaded from
 ESGF. The current version of the module should download these automatically.  Older versions of
 the library will not download them. The workaround is to use the WGET script from ESGF to download
 a test dataset, which fetches the credentials. The data source should then work as expected.

 """
 import os
 import sys

 import requests
 from bs4 import BeautifulSoup

 import ocw.data_source.local as local
 from ocw.esgf.constants import DEFAULT_ESGF_SEARCH
 from ocw.esgf.download import download
 from ocw.esgf.logon import logon

 if sys.version_info[0] >= 3:
     from urllib.error import HTTPError
 else:
     # Not Python 3 - today, it is most likely to be Python 2
     # But note that this might need an update when Python 4
     # might be around one day
     from urllib2 import HTTPError


 def load_dataset(dataset_id,
                  variable_name,
                  esgf_username,
                  esgf_password,
                  search_url=DEFAULT_ESGF_SEARCH,
                  elevation_index=0,
                  name='',
                  save_path='/tmp'):
     """ Load an ESGF dataset.

     The search node is queried for the files belonging to ``dataset_id``,
     every file is downloaded into ``save_path`` and then loaded with
     :func:`ocw.data_source.local.load_file`.

     :param dataset_id: The ESGF ID of the dataset to load.
     :type dataset_id: :mod:`string`

     :param variable_name: The variable to load.
     :type variable_name: :mod:`string`

     :param esgf_username: ESGF OpenID value to use for authentication.
     :type esgf_username: :mod:`string`

     :param esgf_password: ESGF Password to use for authentication.
     :type esgf_password: :mod:`string`

     :param search_url: (Optional) The ESGF node to use for searching. Defaults
         to ``DEFAULT_ESGF_SEARCH``.
     :type search_url: :mod:`string`

     :param elevation_index: (Optional) The elevation level to strip out when
         loading the dataset using ocw.data_source.local.
     :type elevation_index: :class:`int`

     :param name: (Optional) A name for the loaded dataset.
     :type name: :mod:`string`

     :param save_path: (Optional) Path to where downloaded files should be saved.
     :type save_path: :mod:`string`

     :returns: A :class:`list` of :class:`dataset.Dataset` containing the
         requested dataset. If the dataset is stored in multiple files each will
         be loaded into a separate :class:`dataset.Dataset`.

     :raises ValueError: If no files can be found for the supplied ID and
         variable, or if the ESGF login credentials are rejected.
     """
     # Materialise the (url, variable) pairs: they are walked twice below and
     # ``zip`` only yields a one-shot iterator on Python 3.
     download_data = list(_get_file_download_data(url=search_url,
                                                  dataset_id=dataset_id,
                                                  variable=variable_name))

     file_urls = [url for url, _ in download_data]
     if file_urls:
         # _download_files logs on every time it is called, so hand it the
         # whole batch instead of authenticating once per file.
         _download_files(file_urls, esgf_username, esgf_password,
                         download_directory=save_path)

     datasets = []
     for url, var in download_data:
         # The downloaded file is looked up under the last path segment of
         # its URL inside save_path.
         file_save_path = os.path.join(save_path, url.split('/')[-1])

         dataset = local.load_file(file_save_path, var, name=name,
                                   elevation_index=elevation_index)

         # Build a fresh dict per dataset so that editing one dataset's
         # origin does not silently change the others.
         dataset.origin = {
             'source': 'esgf',
             'dataset_id': dataset_id,
             'variable': variable_name
         }
         datasets.append(dataset)

     return datasets


 def _get_file_download_data(dataset_id, variable, url=DEFAULT_ESGF_SEARCH):
     """ Look up the downloadable files of an ESGF dataset.

     Sends a ``type=File`` search request to ``url`` and parses the response.

     :param dataset_id: The ESGF ID of the dataset to search for.
     :type dataset_id: :mod:`string`

     :param variable: The variable the files must contain.
     :type variable: :mod:`string`

     :param url: (Optional) The ESGF search endpoint to query.
     :type url: :mod:`string`

     :returns: A :class:`list` of ``(download_url, variable_name)`` tuples,
         one per file reported by the search.

     :raises ValueError: If the search reports that no files were found.
     """
     # Let requests build and percent-encode the query string rather than
     # splicing raw values into the URL with str.format.
     query = [('type', 'File'),
              ('dataset_id', dataset_id),
              ('variable', variable)]
     raw_data = requests.get(url, params=query)
     xml = BeautifulSoup(raw_data.content, "html.parser")

     result = xml.response.result

     # The attribute value is a string such as "0"; any non-empty string is
     # truthy, so it has to be compared as a number.
     if int(result['numfound']) == 0:
         err = "esgf.load_dataset: No files found for specified dataset."
         raise ValueError(err)

     # Split out URLs for dataset download along with variable names for each
     # of those files.
     url_groups = result.find_all('arr', {'name': 'url'})
     variable_groups = result.find_all('arr', {'name': 'variable'})

     # Each url entry is a '|'-separated string; only the part before the
     # first '|' is the download URL.
     urls = [group.find_all('str')[0].string.split('|')[0]
             for group in url_groups]
     variables = [group.find_all('str')[0].string
                  for group in variable_groups]

     # Return a real list so the pairs can be iterated more than once on
     # Python 3 as well as Python 2.
     return list(zip(urls, variables))


 def _download_files(file_urls, username, password, download_directory='/tmp'):
     """ Log on to ESGF and download every URL in ``file_urls``.

     :param file_urls: The URLs of the files to download.
     :param username: ESGF OpenID value passed to ``logon``.
     :param password: ESGF password passed to ``logon``.
     :param download_directory: (Optional) Directory handed to ``download``
         as the destination for each file.

     :raises ValueError: If ``logon`` fails with an ``HTTPError``.
     """
     try:
         logon(username, password)
     except HTTPError:
         # Report a rejected login as a ValueError to the caller.
         message = 'esgf._download_files: Invalid login credentials'
         raise ValueError(message)

     for file_url in file_urls:
         download(file_url, toDirectory=download_directory)
	#
	# Licensed to the Apache Software Foundation (ASF) under one
	# or more contributor license agreements. See the NOTICE file
	# distributed with this work for additional information
	# regarding copyright ownership. The ASF licenses this file
	# to you under the Apache License, Version 2.0 (the
	# "License"); you may not use this file except in compliance
	# with the License. You may obtain a copy of the License at
	#
	# http://www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing,
	# software distributed under the License is distributed on an
	# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
	# KIND, either express or implied. See the License for the
	# specific language governing permissions and limitations
	# under the License.
	#
	"""
	A set of functions to wrap downloading ESGF datasets into an OCW dataset object.

	*** Note *** The ESGF data source requires that the user have certain credentials downloaded from
	ESGF. The current version of the module should download these automatically. Older versions of
	the library will not download them. The workaround is to use the WGET script from ESGF to download
	a test dataset, which fetches the credentials. The data source should then work as expected.

	"""
	import os
	import sys

	import requests
	from bs4 import BeautifulSoup

	import ocw.data_source.local as local
	from ocw.esgf.constants import DEFAULT_ESGF_SEARCH
	from ocw.esgf.download import download
	from ocw.esgf.logon import logon

	# Pick the HTTPError class that matches the running interpreter.
	if sys.version_info[0] >= 3:
	    from urllib.error import HTTPError
	else:
	    # Not Python 3 - today, it is most likely to be Python 2
	    # But note that this might need an update when Python 4
	    # might be around one day
	    from urllib2 import HTTPError


	def load_dataset(dataset_id,
	                 variable_name,
	                 esgf_username,
	                 esgf_password,
	                 search_url=DEFAULT_ESGF_SEARCH,
	                 elevation_index=0,
	                 name='',
	                 save_path='/tmp'):
	    """ Load an ESGF dataset.

	    The search node is queried for the files belonging to ``dataset_id``,
	    every file is downloaded into ``save_path`` and then loaded with
	    :func:`ocw.data_source.local.load_file`.

	    :param dataset_id: The ESGF ID of the dataset to load.
	    :type dataset_id: :mod:`string`

	    :param variable_name: The variable to load.
	    :type variable_name: :mod:`string`

	    :param esgf_username: ESGF OpenID value to use for authentication.
	    :type esgf_username: :mod:`string`

	    :param esgf_password: ESGF Password to use for authentication.
	    :type esgf_password: :mod:`string`

	    :param search_url: (Optional) The ESGF node to use for searching. Defaults
	        to ``DEFAULT_ESGF_SEARCH``.
	    :type search_url: :mod:`string`

	    :param elevation_index: (Optional) The elevation level to strip out when
	        loading the dataset using ocw.data_source.local.
	    :type elevation_index: :class:`int`

	    :param name: (Optional) A name for the loaded dataset.
	    :type name: :mod:`string`

	    :param save_path: (Optional) Path to where downloaded files should be saved.
	    :type save_path: :mod:`string`

	    :returns: A :class:`list` of :class:`dataset.Dataset` containing the
	        requested dataset. If the dataset is stored in multiple files each will
	        be loaded into a separate :class:`dataset.Dataset`.

	    :raises ValueError: If no files can be found for the supplied ID and
	        variable, or if the ESGF login credentials are rejected.
	    """
	    # Materialise the (url, variable) pairs: they are walked twice below and
	    # ``zip`` only yields a one-shot iterator on Python 3.
	    download_data = list(_get_file_download_data(url=search_url,
	                                                 dataset_id=dataset_id,
	                                                 variable=variable_name))

	    file_urls = [url for url, _ in download_data]
	    if file_urls:
	        # _download_files logs on every time it is called, so hand it the
	        # whole batch instead of authenticating once per file.
	        _download_files(file_urls, esgf_username, esgf_password,
	                        download_directory=save_path)

	    datasets = []
	    for url, var in download_data:
	        # The downloaded file is looked up under the last path segment of
	        # its URL inside save_path.
	        file_save_path = os.path.join(save_path, url.split('/')[-1])

	        dataset = local.load_file(file_save_path, var, name=name,
	                                  elevation_index=elevation_index)

	        # Build a fresh dict per dataset so that editing one dataset's
	        # origin does not silently change the others.
	        dataset.origin = {
	            'source': 'esgf',
	            'dataset_id': dataset_id,
	            'variable': variable_name
	        }
	        datasets.append(dataset)

	    return datasets


	def _get_file_download_data(dataset_id, variable, url=DEFAULT_ESGF_SEARCH):
	    """ Look up the downloadable files of an ESGF dataset.

	    Sends a ``type=File`` search request to ``url`` and parses the response.

	    :param dataset_id: The ESGF ID of the dataset to search for.
	    :type dataset_id: :mod:`string`

	    :param variable: The variable the files must contain.
	    :type variable: :mod:`string`

	    :param url: (Optional) The ESGF search endpoint to query.
	    :type url: :mod:`string`

	    :returns: A :class:`list` of ``(download_url, variable_name)`` tuples,
	        one per file reported by the search.

	    :raises ValueError: If the search reports that no files were found.
	    """
	    # Let requests build and percent-encode the query string rather than
	    # splicing raw values into the URL with str.format.
	    query = [('type', 'File'),
	             ('dataset_id', dataset_id),
	             ('variable', variable)]
	    raw_data = requests.get(url, params=query)
	    xml = BeautifulSoup(raw_data.content, "html.parser")

	    result = xml.response.result

	    # The attribute value is a string such as "0"; any non-empty string is
	    # truthy, so it has to be compared as a number.
	    if int(result['numfound']) == 0:
	        err = "esgf.load_dataset: No files found for specified dataset."
	        raise ValueError(err)

	    # Split out URLs for dataset download along with variable names for each
	    # of those files.
	    url_groups = result.find_all('arr', {'name': 'url'})
	    variable_groups = result.find_all('arr', {'name': 'variable'})

	    # Each url entry is a '|'-separated string; only the part before the
	    # first '|' is the download URL. The separator is a bare pipe, not
	    # the two-character sequence backslash + pipe.
	    urls = [group.find_all('str')[0].string.split('|')[0]
	            for group in url_groups]
	    variables = [group.find_all('str')[0].string
	                 for group in variable_groups]

	    # Return a real list so the pairs can be iterated more than once on
	    # Python 3 as well as Python 2.
	    return list(zip(urls, variables))


	def _download_files(file_urls, username, password, download_directory='/tmp'):
	    """ Log on to ESGF and download every URL in ``file_urls``.

	    :param file_urls: The URLs of the files to download.
	    :param username: ESGF OpenID value passed to ``logon``.
	    :param password: ESGF password passed to ``logon``.
	    :param download_directory: (Optional) Directory handed to ``download``
	        as the destination for each file.

	    :raises ValueError: If ``logon`` fails with an ``HTTPError``.
	    """
	    try:
	        logon(username, password)
	    except HTTPError:
	        # Report a rejected login as a ValueError to the caller.
	        raise ValueError('esgf._download_files: Invalid login credentials')

	    for url in file_urls:
	        download(url, toDirectory=download_directory)