blob: 6b2f042f48d76340810b56180e4469ea3356cd2f [file] [log] [blame]
#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
#
"""
A set of functions to wrap downloading ESGF datasets into an OCW dataset object.
*** Note *** The ESGF data source requires that the user have certain credentials downloaded from
ESGF. The current version of the module should download these automatically. Older versions of
the library will not download them. The workaround is to use the WGET script from ESGF to download
a test dataset, which fetches the credentials. The data source should then work as expected.
"""
import os
import sys
import requests
from bs4 import BeautifulSoup
import ocw.data_source.local as local
from ocw.esgf.constants import DEFAULT_ESGF_SEARCH
from ocw.esgf.download import download
from ocw.esgf.logon import logon
if sys.version_info[0] >= 3:
from urllib.error import HTTPError
else:
# Not Python 3 - today, it is most likely to be Python 2
# But note that this might need an update when Python 4
# might be around one day
from urllib2 import HTTPError
def load_dataset(dataset_id,
                 variable_name,
                 esgf_username,
                 esgf_password,
                 search_url=DEFAULT_ESGF_SEARCH,
                 elevation_index=0,
                 name='',
                 save_path='/tmp'):
    """ Load an ESGF dataset.

    Every file that the ESGF search reports for the dataset/variable pair is
    downloaded into ``save_path`` and then loaded with
    ``ocw.data_source.local.load_file``.

    :param dataset_id: The ESGF ID of the dataset to load.
    :type dataset_id: :mod:`string`

    :param variable_name: The variable to load.
    :type variable_name: :mod:`string`

    :param esgf_username: ESGF OpenID value to use for authentication.
    :type esgf_username: :mod:`string`

    :param esgf_password: ESGF Password to use for authentication.
    :type esgf_password: :mod:`string`

    :param search_url: (Optional) The ESGF node to use for searching. Defaults
        to ``DEFAULT_ESGF_SEARCH``.
    :type search_url: :mod:`string`

    :param elevation_index: (Optional) The elevation level to strip out when
        loading the dataset using ocw.data_source.local.
    :type elevation_index: :class:`int`

    :param name: (Optional) A name for the loaded dataset.
    :type name: :mod:`string`

    :param save_path: (Optional) Path to where downloaded files should be saved.
    :type save_path: :mod:`string`

    :returns: A :class:`list` of :class:`dataset.Dataset` containing the
        requested dataset. If the dataset is stored in multiple files each
        will be loaded into a separate :class:`dataset.Dataset`.

    :raises ValueError: If no files can be found for the supplied ID and
        variable, or if the ESGF login is rejected.
    """
    download_data = _get_file_download_data(url=search_url,
                                            dataset_id=dataset_id,
                                            variable=variable_name)

    datasets = []
    for url, var in download_data:
        _download_files([url], esgf_username, esgf_password,
                        download_directory=save_path)

        # The local file name is taken to be the last path component of the
        # URL (assumes download() stores the file under that name).
        file_save_path = os.path.join(save_path, url.split('/')[-1])
        dataset = local.load_file(file_save_path, var, name=name,
                                  elevation_index=elevation_index)

        # Each dataset gets its own origin dict; sharing a single dict would
        # let a change made through one dataset leak into all the others.
        dataset.origin = {
            'source': 'esgf',
            'dataset_id': dataset_id,
            'variable': variable_name
        }
        datasets.append(dataset)

    return datasets
def _get_file_download_data(dataset_id, variable, url=DEFAULT_ESGF_SEARCH):
    """ Query an ESGF search node for the files that make up a dataset.

    :param dataset_id: The ESGF ID of the dataset to look up.
    :type dataset_id: :mod:`string`

    :param variable: The variable the returned files must contain.
    :type variable: :mod:`string`

    :param url: (Optional) The ESGF search endpoint to query. Defaults to
        ``DEFAULT_ESGF_SEARCH``.
    :type url: :mod:`string`

    :returns: A :class:`list` of ``(download_url, variable_name)`` tuples,
        pairing the ``url`` and ``variable`` groups of the search response in
        document order.

    :raises ValueError: If the search reports zero matching files.
    """
    # Let requests build the query string so that the values are URL-encoded
    # and braces in the search URL cannot confuse string formatting.
    query = [
        ('type', 'File'),
        ('dataset_id', dataset_id),
        ('variable', variable),
    ]
    raw_data = requests.get(url, params=query)
    xml = BeautifulSoup(raw_data.content, "html.parser")
    result = xml.response.result

    # The attribute is looked up in lower case because html.parser lower-cases
    # attribute names. Its value is text, so it has to be converted to a
    # number: bool("0") is True and would never report an empty result.
    if int(result['numfound']) == 0:
        err = "esgf.load_dataset: No files found for specified dataset."
        raise ValueError(err)

    # Split out URLs for dataset download along with variable names for each
    # of those files.
    url_groups = result.findAll('arr', {'name': 'url'})
    variable_groups = result.findAll('arr', {'name': 'variable'})

    # Only the part of the first entry before the first '|' is the URL.
    urls = [group.findAll('str')[0].string.split('|')[0]
            for group in url_groups]
    variables = [group.findAll('str')[0].string
                 for group in variable_groups]

    # Materialise the pairs so the result is a list on Python 2 and 3 alike.
    return list(zip(urls, variables))
def _download_files(file_urls, username, password, download_directory='/tmp'):
    """ Log on to ESGF and download the given files.

    :param file_urls: The URLs of the files to download.
    :type file_urls: :class:`list` of :mod:`string`

    :param username: Username handed to ``logon``.
    :type username: :mod:`string`

    :param password: Password handed to ``logon``.
    :type password: :mod:`string`

    :param download_directory: (Optional) Directory handed to ``download``
        as the destination for every file.
    :type download_directory: :mod:`string`

    :raises ValueError: If ``logon`` fails with an ``HTTPError``.
    """
    try:
        logon(username, password)
    except HTTPError:
        raise ValueError('esgf._download_files: Invalid login credentials')
    else:
        # Only reached when the logon succeeded.
        for file_url in file_urls:
            download(file_url, toDirectory=download_directory)