ocw/data_source/local.py - climate - Git at Google

 # Licensed to the Apache Software Foundation (ASF) under one
 # or more contributor license agreements.  See the NOTICE file
 # distributed with this work for additional information
 # regarding copyright ownership.  The ASF licenses this file
 # to you under the Apache License, Version 2.0 (the
 # "License"); you may not use this file except in compliance
 # with the License.  You may obtain a copy of the License at
 #
 #    http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing,
 # software distributed under the License is distributed on an
 # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 # KIND, either express or implied.  See the License for the
 # specific language governing permissions and limitations
 # under the License.

 import calendar
 from datetime import timedelta ,datetime
 from time import strptime
 from glob import glob
 import re
 import string
 import os

 from ocw.dataset import Dataset
 import ocw.utils as utils

 import netCDF4
 import numpy
 import numpy.ma as ma

 LAT_NAMES = ['x', 'rlat', 'rlats', 'lat', 'lats', 'latitude', 'latitudes']
 LON_NAMES = ['y', 'rlon', 'rlons', 'lon', 'lons', 'longitude', 'longitudes']
 TIME_NAMES = ['time', 'times', 'date', 'dates', 'julian']


 def _get_netcdf_variable_name(valid_var_names, netcdf, netcdf_var):
     ''' Determine if one of a set of variable names are in a NetCDF Dataset.

     Looks for an occurrence of a valid_var_name in the NetCDF variable data.
     This is useful for automatically determining the names of the lat, lon,
     and time variable names inside of a dataset object.

     :param valid_var_names: The possible variable names to search for in
         the netCDF object.
     :type valid_var_names: List of Strings
     :param netcdf: The netCDF Dataset object in which to check for
         valid_var_names.
     :type netcdf: netcdf4.Dataset
     :param netcdf_var: The relevant variable name to search over in the
         netcdf object. This is used to narrow down the search for valid
         variable names by first checking the desired variable's dimension
         values for one or more of the valid variable names.

     :returns: The variable from valid_var_names that it locates in
         the netCDF object.

     :raises ValueError: When unable to locate a single matching variable
         name in the NetCDF Dataset from the supplied list of valid variable
         names.
     '''

     # Check for valid variable names in netCDF variable dimensions
     dimensions = netcdf.variables[netcdf_var].dimensions
     dims_lower = [dim.encode().lower() for dim in dimensions]

     intersect = set(valid_var_names).intersection(dims_lower)

     if len(intersect) == 1:
         # Retrieve the name of the dimension where we found the matching
         # variable name
         index = dims_lower.index(intersect.pop())
         dimension_name = dimensions[index].encode()

         # Locate all of the variables that share the dimension that we matched
         # earlier. If the dimension's name matches then that variable is
         # potentially what we want to return to the user.
         possible_vars = []
         for var in netcdf.variables.keys():
             var_dimensions = netcdf.variables[var].dimensions

             # Skip any dimensions are > 1D
             if len(var_dimensions) != 1:
                 continue

             if var_dimensions[0].encode() == dimension_name:
                 possible_vars.append(var)

         # If there are multiple variables with matching dimension names then we
         # aren't able to determining the correct variable name using the
         # variable dimensions. We need to try a different approach. Otherwise,
         # we're done!
         if len(possible_vars) == 1:
             return possible_vars[0]

     # Check for valid variable names in netCDF variable names
     variables = netcdf.variables.keys()
     vars_lower = [var.encode().lower() for var in variables]

     intersect = set(valid_var_names).intersection(vars_lower)

     if len(intersect) == 1:
         index = vars_lower.index(intersect.pop())
         return variables[index]

     # If we couldn't locate a single matching valid variable then we're unable
     # to automatically determine the variable names for the user.
     error = (
         "Unable to locate a single matching variable name from the "
         "supplied list of valid variable names. "
     )
     raise ValueError(error)

 def load_WRF_2d_files(file_path,
                       filename_pattern,
                       variable_name,
                       name=''):
     ''' Load multiple WRF (or nuWRF) original output files containing 2D fields such as precipitation and surface variables into a Dataset.
     The dataset can be spatially subset.
     :param file_path: Directory to the NetCDF file to load.
     :type file_path: :mod:`string`
     :param filename_pattern: Path to the NetCDF file to load.
     :type filename_pattern: :list:`string`
     :param variable_name: The variable name to load from the NetCDF file.
     :type variable_name: :mod:`string`
     :param name: (Optional) A name for the loaded dataset.
     :type name: :mod:`string`
     :returns: An OCW Dataset object with the requested variable's data from
         the NetCDF file.
     :rtype: :class:`dataset.Dataset`
     :raises ValueError:
     '''

     WRF_files = []
     for pattern in filename_pattern:
         WRF_files.extend(glob(file_path + pattern))
     WRF_files.sort()

     file_object_first = netCDF4.Dataset(WRF_files[0])
     lats = file_object_first.variables['XLAT'][0,:]
     lons = file_object_first.variables['XLONG'][0,:]

     times = []
     for ifile, file in enumerate(WRF_files):
         file_object = netCDF4.Dataset(file)
         time_struct_parsed = strptime(file[-19:],"%Y-%m-%d_%H:%M:%S")
         for ihour in numpy.arange(24):
             times.append(datetime(*time_struct_parsed[:6]) + timedelta(hours=ihour))
         values0= file_object.variables[variable_name][:]
         if ifile == 0:
             values = file_object.variables[variable_name][:]
         else:
             values = numpy.concatenate((values, file_object.variables[variable_name][:]))
         file_object.close()
     times = numpy.array(times)
     return Dataset(lats, lons, times, values, variable_name, name=name)

 def load_file(file_path,
               variable_name,
               variable_unit = None,
               elevation_index=0,
               name='',
               lat_name=None,
               lon_name=None,
               time_name=None):
     ''' Load a NetCDF file into a Dataset.

     :param file_path: Path to the NetCDF file to load.
     :type file_path: :mod:`string`

     :param variable_name: The variable name to load from the NetCDF file.
     :type variable_name: :mod:`string`

     :param variable_unit: (Optional) The variable unit to load from the NetCDF file.
     :type variable_unit: :mod:`string`

     :param elevation_index: (Optional) The elevation index for which data should
         be returned. Climate data is often times 4 dimensional data. Some
         datasets will have readins at different height/elevation levels. OCW
         expects 3D data so a single layer needs to be stripped out when loading.
         By default, the first elevation layer is used. If desired you may
         specify the elevation value to use.
     :type elevation_index: :class:`int`

     :param name: (Optional) A name for the loaded dataset.
     :type name: :mod:`string`

     :param lat_name: (Optional) The latitude variable name to extract from the
         dataset.
     :type lat_name: :mod:`string`

     :param lon_name: (Optional) The longitude variable name to extract from the
         dataset.
     :type lon_name: :mod:`string`

     :param time_name: (Optional) The time variable name to extract from the
         dataset.
     :type time_name: :mod:`string`

     :returns: An OCW Dataset object with the requested variable's data from
         the NetCDF file.
     :rtype: :class:`dataset.Dataset`

     :raises ValueError: When the specified file path cannot be loaded by ndfCDF4
         or when the lat/lon/time variable name cannot be determined
         automatically.
     '''

     try:
         netcdf = netCDF4.Dataset(file_path, mode='r')
     except RuntimeError:
         err = "Dataset filepath is invalid. Please ensure it is correct."
         raise ValueError(err)
     except:
         err = (
             "The given file cannot be loaded. Please ensure that it is a valid "
             "NetCDF file. If problems persist, report them to the project's "
             "mailing list."
         )
         raise ValueError(err)

     if not lat_name:
         lat_name = _get_netcdf_variable_name(LAT_NAMES, netcdf, variable_name)
     if not lon_name:
         lon_name = _get_netcdf_variable_name(LON_NAMES, netcdf, variable_name)
     if not time_name:
         time_name = _get_netcdf_variable_name(TIME_NAMES, netcdf, variable_name)

     lats = netcdf.variables[lat_name][:]
     lons = netcdf.variables[lon_name][:]
     time_raw_values = netcdf.variables[time_name][:]
     times = utils.decode_time_values(netcdf, time_name)
     times = numpy.array(times)
     values = ma.array(netcdf.variables[variable_name][:])
     variable_unit = netcdf.variables[variable_name].units

     # If the values are 4D then we need to strip out the elevation index
     if len(values.shape) == 4:
         # Determine the set of possible elevation dimension names excluding
         # the list of names that are used for the lat, lon, and time values.
         dims = netcdf.variables[variable_name].dimensions
         dimension_names = [dim_name.encode() for dim_name in dims]
         lat_lon_time_var_names = [lat_name, lon_name, time_name]

         elev_names = set(dimension_names) - set(lat_lon_time_var_names)

         # Grab the index value for the elevation values
         level_index = dimension_names.index(elev_names.pop())

         # Strip out the elevation values so we're left with a 3D array.
         if level_index == 0:
             values = values [elevation_index,:,:,:]
         elif level_index == 1:
             values = values [:,elevation_index,:,:]
         elif level_index == 2:
             values = values [:,:,elevation_index,:]
         else:
             values = values [:,:,:,elevation_index]

     origin = {
         'source': 'local',
         'path': file_path,
         'lat_name': lat_name,
         'lon_name': lon_name,
         'time_name': time_name
     }
     if elevation_index != 0: origin['elevation_index'] = elevation_index

     return Dataset(lats, lons, times, values, variable=variable_name,
                    units=variable_unit, name=name, origin=origin)

 def load_multiple_files(file_path,
                         filename_pattern,
                         variable_name,
                         dataset_name='ref',
                         variable_unit=None,
                         lat_name=None,
                         lon_name=None,
                         time_name=None):
     ''' load multiple netcdf files with common filename pattern and return an array of OCW datasets

     :param file_path: directory name where the NetCDF files to load are stored.
     :type file_path: :mod:`string`
     :param filename_pattern: common file name patterns
     :type filename_pattern: :list:`string`
     :param dataset_name: a name of dataset when reading a single file
     :type dataset_name: :mod:'string'
     :param variable_name: The variable name to load from the NetCDF file.
     :type variable_name: :mod:`string`
     :param variable_unit: (Optional) The variable unit to load from the NetCDF file.
     :type variable_unit: :mod:`string`
     :param elevation_index: (Optional) The elevation index for which data should
         be returned. Climate data is often times 4 dimensional data. Some
         datasets will have readins at different height/elevation levels. OCW
         expects 3D data so a single layer needs to be stripped out when loading.
         By default, the first elevation layer is used. If desired you may
         specify the elevation value to use.
     :param lat_name: (Optional) The latitude variable name to extract from the
         dataset.
     :type lat_name: :mod:`string`
     :param lon_name: (Optional) The longitude variable name to extract from the
         dataset.
     :type lon_name: :mod:`string`
     :param time_name: (Optional) The time variable name to extract from the
         dataset.
     :type time_name: :mod:`string`
     :returns: An array of OCW Dataset objects, an array of dataset names
     :rtype: :class:`list`
     '''

     data_filenames = []
     for pattern in filename_pattern:
         data_filenames.extend(glob(file_path + pattern))
     data_filenames.sort()

     # number of files
     ndata = len(data_filenames)
     if ndata == 1:
         data_name = [dataset_name]
     else:
         data_name = []
         data_filenames_reversed = []
         for element in data_filenames:
             data_filenames_reversed.append(element[::-1])
         prefix = os.path.commonprefix(data_filenames)
         postfix = os.path.commonprefix(data_filenames_reversed)[::-1]
         for element in data_filenames:
             data_name.append(element.replace(prefix,'').replace(postfix,''))

     datasets = []
     for ifile,filename in enumerate(data_filenames):
         datasets.append(load_file(filename, variable_name, variable_unit, name=data_name[ifile],
                         lat_name=lat_name, lon_name=lon_name, time_name=time_name))

     return datasets
	# Licensed to the Apache Software Foundation (ASF) under one
	# or more contributor license agreements. See the NOTICE file
	# distributed with this work for additional information
	# regarding copyright ownership. The ASF licenses this file
	# to you under the Apache License, Version 2.0 (the
	# "License"); you may not use this file except in compliance
	# with the License. You may obtain a copy of the License at
	#
	# http://www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing,
	# software distributed under the License is distributed on an
	# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
	# KIND, either express or implied. See the License for the
	# specific language governing permissions and limitations
	# under the License.

	import calendar
	from datetime import timedelta ,datetime
	from time import strptime
	from glob import glob
	import re
	import string
	import os

	from ocw.dataset import Dataset
	import ocw.utils as utils

	import netCDF4
	import numpy
	import numpy.ma as ma

	LAT_NAMES = ['x', 'rlat', 'rlats', 'lat', 'lats', 'latitude', 'latitudes']
	LON_NAMES = ['y', 'rlon', 'rlons', 'lon', 'lons', 'longitude', 'longitudes']
	TIME_NAMES = ['time', 'times', 'date', 'dates', 'julian']


	def _get_netcdf_variable_name(valid_var_names, netcdf, netcdf_var):
	''' Determine if one of a set of variable names are in a NetCDF Dataset.

	Looks for an occurrence of a valid_var_name in the NetCDF variable data.
	This is useful for automatically determining the names of the lat, lon,
	and time variable names inside of a dataset object.

	:param valid_var_names: The possible variable names to search for in
	the netCDF object.
	:type valid_var_names: List of Strings
	:param netcdf: The netCDF Dataset object in which to check for
	valid_var_names.
	:type netcdf: netcdf4.Dataset
	:param netcdf_var: The relevant variable name to search over in the
	netcdf object. This is used to narrow down the search for valid
	variable names by first checking the desired variable's dimension
	values for one or more of the valid variable names.

	:returns: The variable from valid_var_names that it locates in
	the netCDF object.

	:raises ValueError: When unable to locate a single matching variable
	name in the NetCDF Dataset from the supplied list of valid variable
	names.
	'''

	# Check for valid variable names in netCDF variable dimensions
	dimensions = netcdf.variables[netcdf_var].dimensions
	dims_lower = [dim.encode().lower() for dim in dimensions]

	intersect = set(valid_var_names).intersection(dims_lower)

	if len(intersect) == 1:
	# Retrieve the name of the dimension where we found the matching
	# variable name
	index = dims_lower.index(intersect.pop())
	dimension_name = dimensions[index].encode()

	# Locate all of the variables that share the dimension that we matched
	# earlier. If the dimension's name matches then that variable is
	# potentially what we want to return to the user.
	possible_vars = []
	for var in netcdf.variables.keys():
	var_dimensions = netcdf.variables[var].dimensions

	# Skip any dimensions are > 1D
	if len(var_dimensions) != 1:
	continue

	if var_dimensions[0].encode() == dimension_name:
	possible_vars.append(var)

	# If there are multiple variables with matching dimension names then we
	# aren't able to determining the correct variable name using the
	# variable dimensions. We need to try a different approach. Otherwise,
	# we're done!
	if len(possible_vars) == 1:
	return possible_vars[0]

	# Check for valid variable names in netCDF variable names
	variables = netcdf.variables.keys()
	vars_lower = [var.encode().lower() for var in variables]

	intersect = set(valid_var_names).intersection(vars_lower)

	if len(intersect) == 1:
	index = vars_lower.index(intersect.pop())
	return variables[index]

	# If we couldn't locate a single matching valid variable then we're unable
	# to automatically determine the variable names for the user.
	error = (
	"Unable to locate a single matching variable name from the "
	"supplied list of valid variable names. "
	)
	raise ValueError(error)

	def load_WRF_2d_files(file_path,
	filename_pattern,
	variable_name,
	name=''):
	''' Load multiple WRF (or nuWRF) original output files containing 2D fields such as precipitation and surface variables into a Dataset.
	The dataset can be spatially subset.
	:param file_path: Directory to the NetCDF file to load.
	:type file_path: :mod:`string`
	:param filename_pattern: Path to the NetCDF file to load.
	:type filename_pattern: :list:`string`
	:param variable_name: The variable name to load from the NetCDF file.
	:type variable_name: :mod:`string`
	:param name: (Optional) A name for the loaded dataset.
	:type name: :mod:`string`
	:returns: An OCW Dataset object with the requested variable's data from
	the NetCDF file.
	:rtype: :class:`dataset.Dataset`
	:raises ValueError:
	'''

	WRF_files = []
	for pattern in filename_pattern:
	WRF_files.extend(glob(file_path + pattern))
	WRF_files.sort()

	file_object_first = netCDF4.Dataset(WRF_files[0])
	lats = file_object_first.variables['XLAT'][0,:]
	lons = file_object_first.variables['XLONG'][0,:]

	times = []
	for ifile, file in enumerate(WRF_files):
	file_object = netCDF4.Dataset(file)
	time_struct_parsed = strptime(file[-19:],"%Y-%m-%d_%H:%M:%S")
	for ihour in numpy.arange(24):
	times.append(datetime(*time_struct_parsed[:6]) + timedelta(hours=ihour))
	values0= file_object.variables[variable_name][:]
	if ifile == 0:
	values = file_object.variables[variable_name][:]
	else:
	values = numpy.concatenate((values, file_object.variables[variable_name][:]))
	file_object.close()
	times = numpy.array(times)
	return Dataset(lats, lons, times, values, variable_name, name=name)

	def load_file(file_path,
	variable_name,
	variable_unit = None,
	elevation_index=0,
	name='',
	lat_name=None,
	lon_name=None,
	time_name=None):
	''' Load a NetCDF file into a Dataset.

	:param file_path: Path to the NetCDF file to load.
	:type file_path: :mod:`string`

	:param variable_name: The variable name to load from the NetCDF file.
	:type variable_name: :mod:`string`

	:param variable_unit: (Optional) The variable unit to load from the NetCDF file.
	:type variable_unit: :mod:`string`

	:param elevation_index: (Optional) The elevation index for which data should
	be returned. Climate data is often times 4 dimensional data. Some
	datasets will have readins at different height/elevation levels. OCW
	expects 3D data so a single layer needs to be stripped out when loading.
	By default, the first elevation layer is used. If desired you may
	specify the elevation value to use.
	:type elevation_index: :class:`int`

	:param name: (Optional) A name for the loaded dataset.
	:type name: :mod:`string`

	:param lat_name: (Optional) The latitude variable name to extract from the
	dataset.
	:type lat_name: :mod:`string`

	:param lon_name: (Optional) The longitude variable name to extract from the
	dataset.
	:type lon_name: :mod:`string`

	:param time_name: (Optional) The time variable name to extract from the
	dataset.
	:type time_name: :mod:`string`

	:returns: An OCW Dataset object with the requested variable's data from
	the NetCDF file.
	:rtype: :class:`dataset.Dataset`

	:raises ValueError: When the specified file path cannot be loaded by ndfCDF4
	or when the lat/lon/time variable name cannot be determined
	automatically.
	'''

	try:
	netcdf = netCDF4.Dataset(file_path, mode='r')
	except RuntimeError:
	err = "Dataset filepath is invalid. Please ensure it is correct."
	raise ValueError(err)
	except:
	err = (
	"The given file cannot be loaded. Please ensure that it is a valid "
	"NetCDF file. If problems persist, report them to the project's "
	"mailing list."
	)
	raise ValueError(err)

	if not lat_name:
	lat_name = _get_netcdf_variable_name(LAT_NAMES, netcdf, variable_name)
	if not lon_name:
	lon_name = _get_netcdf_variable_name(LON_NAMES, netcdf, variable_name)
	if not time_name:
	time_name = _get_netcdf_variable_name(TIME_NAMES, netcdf, variable_name)

	lats = netcdf.variables[lat_name][:]
	lons = netcdf.variables[lon_name][:]
	time_raw_values = netcdf.variables[time_name][:]
	times = utils.decode_time_values(netcdf, time_name)
	times = numpy.array(times)
	values = ma.array(netcdf.variables[variable_name][:])
	variable_unit = netcdf.variables[variable_name].units

	# If the values are 4D then we need to strip out the elevation index
	if len(values.shape) == 4:
	# Determine the set of possible elevation dimension names excluding
	# the list of names that are used for the lat, lon, and time values.
	dims = netcdf.variables[variable_name].dimensions
	dimension_names = [dim_name.encode() for dim_name in dims]
	lat_lon_time_var_names = [lat_name, lon_name, time_name]

	elev_names = set(dimension_names) - set(lat_lon_time_var_names)

	# Grab the index value for the elevation values
	level_index = dimension_names.index(elev_names.pop())

	# Strip out the elevation values so we're left with a 3D array.
	if level_index == 0:
	values = values [elevation_index,:,:,:]
	elif level_index == 1:
	values = values [:,elevation_index,:,:]
	elif level_index == 2:
	values = values [:,:,elevation_index,:]
	else:
	values = values [:,:,:,elevation_index]

	origin = {
	'source': 'local',
	'path': file_path,
	'lat_name': lat_name,
	'lon_name': lon_name,
	'time_name': time_name
	}
	if elevation_index != 0: origin['elevation_index'] = elevation_index

	return Dataset(lats, lons, times, values, variable=variable_name,
	units=variable_unit, name=name, origin=origin)

	def load_multiple_files(file_path,
	filename_pattern,
	variable_name,
	dataset_name='ref',
	variable_unit=None,
	lat_name=None,
	lon_name=None,
	time_name=None):
	''' load multiple netcdf files with common filename pattern and return an array of OCW datasets

	:param file_path: directory name where the NetCDF files to load are stored.
	:type file_path: :mod:`string`
	:param filename_pattern: common file name patterns
	:type filename_pattern: :list:`string`
	:param dataset_name: a name of dataset when reading a single file
	:type dataset_name: :mod:'string'
	:param variable_name: The variable name to load from the NetCDF file.
	:type variable_name: :mod:`string`
	:param variable_unit: (Optional) The variable unit to load from the NetCDF file.
	:type variable_unit: :mod:`string`
	:param elevation_index: (Optional) The elevation index for which data should
	be returned. Climate data is often times 4 dimensional data. Some
	datasets will have readins at different height/elevation levels. OCW
	expects 3D data so a single layer needs to be stripped out when loading.
	By default, the first elevation layer is used. If desired you may
	specify the elevation value to use.
	:param lat_name: (Optional) The latitude variable name to extract from the
	dataset.
	:type lat_name: :mod:`string`
	:param lon_name: (Optional) The longitude variable name to extract from the
	dataset.
	:type lon_name: :mod:`string`
	:param time_name: (Optional) The time variable name to extract from the
	dataset.
	:type time_name: :mod:`string`
	:returns: An array of OCW Dataset objects, an array of dataset names
	:rtype: :class:`list`
	'''

	data_filenames = []
	for pattern in filename_pattern:
	data_filenames.extend(glob(file_path + pattern))
	data_filenames.sort()

	# number of files
	ndata = len(data_filenames)
	if ndata == 1:
	data_name = [dataset_name]
	else:
	data_name = []
	data_filenames_reversed = []
	for element in data_filenames:
	data_filenames_reversed.append(element[::-1])
	prefix = os.path.commonprefix(data_filenames)
	postfix = os.path.commonprefix(data_filenames_reversed)[::-1]
	for element in data_filenames:
	data_name.append(element.replace(prefix,'').replace(postfix,''))

	datasets = []
	for ifile,filename in enumerate(data_filenames):
	datasets.append(load_file(filename, variable_name, variable_unit, name=data_name[ifile],
	lat_name=lat_name, lon_name=lon_name, time_name=time_name))

	return datasets