ocw/data_source/esgf.py - climate - Git at Google

 #
 # Licensed to the Apache Software Foundation (ASF) under one
 # or more contributor license agreements.  See the NOTICE file
 # distributed with this work for additional information
 # regarding copyright ownership.  The ASF licenses this file
 # to you under the Apache License, Version 2.0 (the
 # "License"); you may not use this file except in compliance
 # with the License.  You may obtain a copy of the License at
 #
 #    http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing,
 # software distributed under the License is distributed on an
 # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 # KIND, either express or implied.  See the License for the
 # specific language governing permissions and limitations
 # under the License.
 #

 import urllib2

 from ocw.esgf.constants import DEFAULT_ESGF_SEARCH
 from ocw.esgf.download import download
 from ocw.esgf.logon2 import logon2
 from ocw.esgf.search import SearchClient
 import ocw.data_source.local as local

 from bs4 import BeautifulSoup
 import requests

 def load_dataset(dataset_id,
                  variable,
                  esgf_username,
                  esgf_password,
                  search_url=DEFAULT_ESGF_SEARCH,
                  elevation_index=0,
                  name='',
                  **additional_constraints):
     ''' Load an ESGF dataset.

     :param dataset_id: The ESGF ID of the dataset to load.
     :type dataset_id: :mod:`string`

     :param variable: The variable to load.
     :type variable: :mod:`string`

     :param esgf_username: ESGF OpenID value to use for authentication.
     :type esgf_username: :mod:`string`

     :param esgf_password: ESGF Password to use for authentication.
     :type esgf_password: :mod:`string`

     :param search_url: (Optional) The ESGF node to use for searching. Defaults
         to the Jet Propulsion Laboratory node.
     :type search_url: :mod:`string`

     :param elevation_index: (Optional) The elevation level to strip out when
         loading the dataset using ocw.data_source.local.
     :type elevation_index: :class:`int`

     :param name: (Optional) A name for the loaded dataset.
     :type name: :mod:`string`

     :param additional_constraints: (Optional) Additional key,value pairs to
         pass as constraints to the search wrapper. These can be anything found
         on the ESGF metadata page for a dataset.

     :returns: A :class:`list` of :class:`dataset.Dataset` contained the
         requested dataset. If the dataset is stored in multiple files each will
         be loaded into a separate :class:`dataset.Dataset`.

     :raises ValueError: If no dataset can be found for the supplied ID and
         variable, or if the requested dataset is a multi-file dataset.
     '''
     download_data = _get_file_download_data(url=search_url,
                                             dataset_id=dataset_id,
                                             variable=variable)

     datasets = []
     for url, var in download_data:
         _download_files([url], esgf_username, esgf_password)
         datasets.append(local.load_file('/tmp/' + url.split('/')[-1],
                                         var,
                                         name=name,
                                         elevation_index=elevation_index))

     return datasets

 def _get_file_download_data(dataset_id, variable, url=DEFAULT_ESGF_SEARCH):
     ''''''
     url += '?distrib=false&type=File&dataset_id={}&variable={}'
     url = url.format(dataset_id, variable)

     r = requests.get(url)
     xml = BeautifulSoup(r.content)

     dont_have_results = not bool(xml.response.result['numfound'])

     if dont_have_results:
         err = "esgf.load_dataset: No files found for specified dataset."
         raise ValueError(err)

     # Split out URLs for dataset download along with variable names for each
     # of those files.
     url_groups = xml.response.result.findAll('arr', {'name': 'url'})
     variable_groups = xml.response.result.findAll('arr', {'name': 'variable'})

     urls = [group.findAll('str')[0].string.split('|')[0]
             for group in url_groups]
     variables = [group.findAll('str')[0].string
                  for group in variable_groups]

     return zip(urls, variables)

 def _download_files(file_urls, username, password, download_directory='/tmp'):
     ''''''
     try:
         logon2(username, password)
     except urllib2.HTTPError:
         raise ValueError('esgf._download_files: Invalid login credentials')

     for url in file_urls:
         download(url, toDirectory=download_directory)
	#
	# Licensed to the Apache Software Foundation (ASF) under one
	# or more contributor license agreements. See the NOTICE file
	# distributed with this work for additional information
	# regarding copyright ownership. The ASF licenses this file
	# to you under the Apache License, Version 2.0 (the
	# "License"); you may not use this file except in compliance
	# with the License. You may obtain a copy of the License at
	#
	# http://www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing,
	# software distributed under the License is distributed on an
	# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
	# KIND, either express or implied. See the License for the
	# specific language governing permissions and limitations
	# under the License.
	#

	import urllib2

	from ocw.esgf.constants import DEFAULT_ESGF_SEARCH
	from ocw.esgf.download import download
	from ocw.esgf.logon2 import logon2
	from ocw.esgf.search import SearchClient
	import ocw.data_source.local as local

	from bs4 import BeautifulSoup
	import requests

	def load_dataset(dataset_id,
	variable,
	esgf_username,
	esgf_password,
	search_url=DEFAULT_ESGF_SEARCH,
	elevation_index=0,
	name='',
	**additional_constraints):
	''' Load an ESGF dataset.

	:param dataset_id: The ESGF ID of the dataset to load.
	:type dataset_id: :mod:`string`

	:param variable: The variable to load.
	:type variable: :mod:`string`

	:param esgf_username: ESGF OpenID value to use for authentication.
	:type esgf_username: :mod:`string`

	:param esgf_password: ESGF Password to use for authentication.
	:type esgf_password: :mod:`string`

	:param search_url: (Optional) The ESGF node to use for searching. Defaults
	to the Jet Propulsion Laboratory node.
	:type search_url: :mod:`string`

	:param elevation_index: (Optional) The elevation level to strip out when
	loading the dataset using ocw.data_source.local.
	:type elevation_index: :class:`int`

	:param name: (Optional) A name for the loaded dataset.
	:type name: :mod:`string`

	:param additional_constraints: (Optional) Additional key,value pairs to
	pass as constraints to the search wrapper. These can be anything found
	on the ESGF metadata page for a dataset.

	:returns: A :class:`list` of :class:`dataset.Dataset` contained the
	requested dataset. If the dataset is stored in multiple files each will
	be loaded into a separate :class:`dataset.Dataset`.

	:raises ValueError: If no dataset can be found for the supplied ID and
	variable, or if the requested dataset is a multi-file dataset.
	'''
	download_data = _get_file_download_data(url=search_url,
	dataset_id=dataset_id,
	variable=variable)

	datasets = []
	for url, var in download_data:
	_download_files([url], esgf_username, esgf_password)
	datasets.append(local.load_file('/tmp/' + url.split('/')[-1],
	var,
	name=name,
	elevation_index=elevation_index))

	return datasets

	def _get_file_download_data(dataset_id, variable, url=DEFAULT_ESGF_SEARCH):
	''''''
	url += '?distrib=false&type=File&dataset_id={}&variable={}'
	url = url.format(dataset_id, variable)

	r = requests.get(url)
	xml = BeautifulSoup(r.content)

	dont_have_results = not bool(xml.response.result['numfound'])

	if dont_have_results:
	err = "esgf.load_dataset: No files found for specified dataset."
	raise ValueError(err)

	# Split out URLs for dataset download along with variable names for each
	# of those files.
	url_groups = xml.response.result.findAll('arr', {'name': 'url'})
	variable_groups = xml.response.result.findAll('arr', {'name': 'variable'})

	urls = [group.findAll('str')[0].string.split('\|')[0]
	for group in url_groups]
	variables = [group.findAll('str')[0].string
	for group in variable_groups]

	return zip(urls, variables)

	def _download_files(file_urls, username, password, download_directory='/tmp'):
	''''''
	try:
	logon2(username, password)
	except urllib2.HTTPError:
	raise ValueError('esgf._download_files: Invalid login credentials')

	for url in file_urls:
	download(url, toDirectory=download_directory)