# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from ocw import dataset as ds
import datetime
import numpy as np
import numpy.ma as ma
import scipy.interpolate
import scipy.ndimage
from scipy.ndimage import map_coordinates
import netCDF4
import logging
logger = logging.getLogger(__name__)
def temporal_subset(month_start, month_end, target_dataset, average_each_year=False):
    """ Temporally subset data given a month index.

    :param month_start: An integer for the beginning month (Jan=1).
    :type month_start: :class:`int`
    :param month_end: An integer for the ending month (Jan=1).
    :type month_end: :class:`int`
    :param target_dataset: Dataset object that needs temporal subsetting.
    :type target_dataset: Open Climate Workbench Dataset Object
    :param average_each_year: If True, the output dataset is averaged for
        each year.
    :type average_each_year: :class:`bool`

    :returns: A temporally subset OCW Dataset.
    :rtype: Open Climate Workbench Dataset Object
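
    Example (a minimal sketch; ``my_dataset`` stands in for a monthly OCW
    Dataset and is not defined in this module)::

        # December-February of each year, averaged into one field per winter
        winters = temporal_subset(12, 2, my_dataset, average_each_year=True)
    """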
    if month_start > month_end:
        month_index = range(month_start, 13)
        month_index.extend(range(1, month_end + 1))
    else:
        month_index = range(month_start, month_end + 1)
dates = target_dataset.times
months = np.array([d.month for d in dates])
time_index = []
for m_value in month_index:
time_index = np.append(time_index, np.where(months == m_value)[0])
if m_value == month_index[0]:
time_index_first = np.min(np.where(months == m_value)[0])
if m_value == month_index[-1]:
time_index_last = np.max(np.where(months == m_value)[0])
    # np.append on an empty list yields floats, so cast back to ints
    # before using the result as an index array.
    time_index = np.sort(time_index).astype(int)
    time_index = time_index[np.where((time_index >= time_index_first) &
                                     (time_index <= time_index_last))]
    time_index = list(time_index)
    new_dataset = ds.Dataset(target_dataset.lats,
                             target_dataset.lons,
                             target_dataset.times[time_index],
                             target_dataset.values[time_index, :],
                             variable=target_dataset.variable,
                             units=target_dataset.units,
                             name=target_dataset.name)
if average_each_year:
nmonth = len(month_index)
ntime = new_dataset.times.size
nyear = ntime/nmonth
averaged_time = []
ny, nx = target_dataset.values.shape[1:]
        averaged_values = ma.zeros([nyear, ny, nx])
for iyear in np.arange(nyear):
# centered time index of the season between month_start and month_end in each year
center_index = int(nmonth/2)+iyear*nmonth
            if nmonth == 1:
                center_index = iyear
            averaged_time.append(new_dataset.times[center_index])
            averaged_values[iyear, :] = ma.average(
                new_dataset.values[nmonth * iyear:nmonth * iyear + nmonth, :], axis=0)
        new_dataset = ds.Dataset(target_dataset.lats,
                                 target_dataset.lons,
                                 np.array(averaged_time),
                                 averaged_values,
                                 variable=target_dataset.variable,
                                 units=target_dataset.units,
                                 name=target_dataset.name)
return new_dataset
def temporal_rebin(target_dataset, temporal_resolution):
    """ Rebin a Dataset to a new temporal resolution.

    :param target_dataset: Dataset object that needs to be temporally rebinned.
    :type target_dataset: :class:`dataset.Dataset`
    :param temporal_resolution: The new temporal bin size.
    :type temporal_resolution: :class:`datetime.timedelta`

    :returns: A new temporally rebinned Dataset.
    :rtype: :class:`dataset.Dataset`
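
    Example (a minimal sketch; ``daily_dataset`` stands in for a daily OCW
    Dataset and is not defined in this module)::

        import datetime
        # A bin size between 2 and 31 days selects monthly averaging below
        monthly = temporal_rebin(daily_dataset, datetime.timedelta(days=30))
    """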
# Decode the temporal resolution into a string format that
# _rcmes_calc_average_on_new_time_unit_K() can understand
day_count = temporal_resolution.days
time_unit = None
if day_count == 1:
time_unit = 'daily'
elif day_count > 1 and day_count <= 31:
time_unit = 'monthly'
    elif day_count > 31 and day_count <= 366:
        time_unit = 'annual'
    else:
        time_unit = 'full'
masked_values = target_dataset.values.view(ma.MaskedArray)
binned_values, binned_dates = _rcmes_calc_average_on_new_time_unit_K(masked_values, target_dataset.times, time_unit)
binned_dates = np.array(binned_dates)
    new_dataset = ds.Dataset(target_dataset.lats,
                             target_dataset.lons,
                             binned_dates,
                             binned_values,
                             variable=target_dataset.variable,
                             units=target_dataset.units,
                             name=target_dataset.name)
return new_dataset
def spatial_regrid(target_dataset, new_latitudes, new_longitudes):
    """ Regrid a Dataset using the new latitudes and longitudes.

    :param target_dataset: Dataset object that needs to be spatially regridded.
    :type target_dataset: :class:`dataset.Dataset`
    :param new_latitudes: Array of latitudes.
    :type new_latitudes: :class:`numpy.ndarray`
    :param new_longitudes: Array of longitudes.
    :type new_longitudes: :class:`numpy.ndarray`

    :returns: A new spatially regridded Dataset.
    :rtype: :class:`dataset.Dataset`
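
    Example (a minimal sketch; ``coarse_dataset`` stands in for an OCW
    Dataset and is not defined in this module)::

        import numpy as np
        # Interpolate onto a regular 1-degree grid
        fine = spatial_regrid(coarse_dataset,
                              np.arange(-89.5, 90, 1.0),
                              np.arange(-179.5, 180, 1.0))
    """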
# Make masked array of shape (times, new_latitudes,new_longitudes)
    new_values = ma.zeros([len(target_dataset.times),
                           len(new_latitudes),
                           len(new_longitudes)])
# Create grids of the given lats and lons for the underlying API
# NOTE: np.meshgrid() requires inputs (x, y) and returns data
# of shape(y|lat|rows, x|lon|columns). So we pass in lons, lats
# and get back data.shape(lats, lons)
lons, lats = np.meshgrid(target_dataset.lons, target_dataset.lats)
new_lons, new_lats = np.meshgrid(new_longitudes, new_latitudes)
# Convert all lats and lons into Numpy Masked Arrays
lats = ma.array(lats)
lons = ma.array(lons)
new_lats = ma.array(new_lats)
new_lons = ma.array(new_lons)
target_values = ma.array(target_dataset.values)
# Call _rcmes_spatial_regrid on each time slice
for i in range(len(target_dataset.times)):
        new_values[i] = _rcmes_spatial_regrid(target_values[i],
                                              lats,
                                              lons,
                                              new_lats,
                                              new_lons)

    # This will call down to the _congrid() function; the lat and lon
    # axes will be adjusted with the time axis being held constant.

    # Create a new Dataset Object to return using new data
    regridded_dataset = ds.Dataset(new_latitudes,
                                   new_longitudes,
                                   target_dataset.times,
                                   new_values,
                                   variable=target_dataset.variable,
                                   units=target_dataset.units,
                                   name=target_dataset.name)
return regridded_dataset
def ensemble(datasets):
    """
    Generate a single dataset which is the mean of the input datasets.

    An ensemble dataset combines the input datasets, assuming they all have
    the same shape, dimensions, and units.

    :param datasets: Datasets from which to compose the ensemble dataset.
        All Datasets must be the same shape.
    :type datasets: :class:`list` of :class:`dataset.Dataset`

    :returns: New Dataset with a name of 'Dataset Ensemble'.
    :rtype: :class:`dataset.Dataset`
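
    Example (a minimal sketch; ``model_a`` and ``model_b`` stand in for
    same-shape OCW Datasets and are not defined in this module)::

        mean_dataset = ensemble([model_a, model_b])
    """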
    _check_dataset_shapes(datasets)
    dataset_values = [dataset.values for dataset in datasets]
ensemble_values = ma.mean(dataset_values, axis=0)
# Build new dataset object from the input datasets and the ensemble values and return it
    ensemble_dataset = ds.Dataset(datasets[0].lats,
                                  datasets[0].lons,
                                  datasets[0].times,
                                  ensemble_values,
                                  units=datasets[0].units,
                                  name="Dataset Ensemble")
return ensemble_dataset
def subset(subregion, target_dataset, subregion_name=None):
    '''Subset the given dataset with subregion information.

    :param subregion: The Bounds with which to subset the target Dataset.
    :type subregion: :class:`dataset.Bounds`
    :param target_dataset: The Dataset object to subset.
    :type target_dataset: :class:`dataset.Dataset`
    :param subregion_name: The name for the subset Dataset.
    :type subregion_name: :mod:`string`

    :returns: The subset Dataset object.
    :rtype: :class:`dataset.Dataset`

    :raises: ValueError
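
    Example (a minimal sketch; ``my_dataset`` stands in for an OCW Dataset
    whose domain covers the requested box and is not defined in this
    module; the Bounds argument order follows :class:`dataset.Bounds`)::

        from ocw.dataset import Bounds
        conus = subset(Bounds(25.0, 50.0, -125.0, -65.0),
                       my_dataset, subregion_name='CONUS')
    '''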
if not subregion.start:
subregion.start = target_dataset.times[0]
subregion.end = target_dataset.times[-1]
# Ensure that the subregion information is well formed
_are_bounds_contained_by_dataset(subregion, target_dataset)
# Get subregion indices into subregion data
dataset_slices = _get_subregion_slice_indices(subregion, target_dataset)
    if not subregion_name:
        subregion_name = target_dataset.name
# Slice the values array with our calculated slice indices
    if target_dataset.values.ndim == 2:
        subset_values = target_dataset.values[
            dataset_slices["lat_start"]:dataset_slices["lat_end"] + 1,
            dataset_slices["lon_start"]:dataset_slices["lon_end"] + 1]
    elif target_dataset.values.ndim == 3:
        subset_values = target_dataset.values[
            dataset_slices["time_start"]:dataset_slices["time_end"] + 1,
            dataset_slices["lat_start"]:dataset_slices["lat_end"] + 1,
            dataset_slices["lon_start"]:dataset_slices["lon_end"] + 1]
# Build new dataset with subset information
    return ds.Dataset(
        # Slice the lats array with our calculated slice indices
        target_dataset.lats[dataset_slices["lat_start"]:
                            dataset_slices["lat_end"] + 1],
        # Slice the lons array with our calculated slice indices
        target_dataset.lons[dataset_slices["lon_start"]:
                            dataset_slices["lon_end"] + 1],
        # Slice the times array with our calculated slice indices
        target_dataset.times[dataset_slices["time_start"]:
                             dataset_slices["time_end"] + 1],
        # Slice the values array with our calculated slice indices
        subset_values,
        variable=target_dataset.variable,
        units=target_dataset.units,
        name=subregion_name
    )
def safe_subset(subregion, target_dataset, subregion_name=None):
    '''Safely subset the given dataset with subregion information.

    A standard subset requires that the provided subregion be entirely
    contained within the dataset's bounds. `safe_subset` returns the overlap
    of the subregion and dataset without raising an error.

    :param subregion: The Bounds with which to subset the target Dataset.
    :type subregion: :class:`dataset.Bounds`
    :param target_dataset: The Dataset object to subset.
    :type target_dataset: :class:`dataset.Dataset`
    :param subregion_name: The name for the subset Dataset.
    :type subregion_name: :mod:`string`

    :returns: The subset Dataset object.
    :rtype: :class:`dataset.Dataset`
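
    Example (a minimal sketch; ``wide_bounds`` and ``my_dataset`` stand in
    for a Bounds and a Dataset and are not defined in this module; bounds
    that spill past the dataset's edges are clipped rather than rejected)::

        clipped = safe_subset(wide_bounds, my_dataset)
    '''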
lat_min, lat_max, lon_min, lon_max = target_dataset.spatial_boundaries()
start, end = target_dataset.time_range()
if subregion.lat_min < lat_min:
subregion.lat_min = lat_min
if subregion.lat_max > lat_max:
subregion.lat_max = lat_max
if subregion.lon_min < lon_min:
subregion.lon_min = lon_min
if subregion.lon_max > lon_max:
subregion.lon_max = lon_max
if subregion.start:
if subregion.start < start:
subregion.start = start
if subregion.end:
if subregion.end > end:
subregion.end = end
return subset(subregion, target_dataset, subregion_name)
def normalize_dataset_datetimes(dataset, timestep):
    ''' Normalize Dataset datetime values.

    Force daily data to an hour time value of 00:00:00.
    Force monthly data to the first of the month at midnight.

    :param dataset: The Dataset which will have its time values normalized.
    :type dataset: :class:`dataset.Dataset`
    :param timestep: The timestep of the Dataset's values. Either 'daily' or
        'monthly'.
    :type timestep: :mod:`string`

    :returns: A new Dataset with normalized datetime values.
    :rtype: :class:`dataset.Dataset`
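
    Example (a minimal sketch; ``my_dataset`` stands in for a monthly OCW
    Dataset and is not defined in this module)::

        # Snap mid-month timestamps such as 2000-01-15 back to 2000-01-01
        normalized = normalize_dataset_datetimes(my_dataset, 'monthly')
    '''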
new_times = _rcmes_normalize_datetimes(dataset.times, timestep)
    return ds.Dataset(
        dataset.lats,
        dataset.lons,
        np.array(new_times),
        dataset.values,
        variable=dataset.variable,
        units=dataset.units,
        name=dataset.name
    )
def write_netcdf(dataset, path, compress=True):
    ''' Write a dataset to a NetCDF file.

    :param dataset: The dataset to write.
    :type dataset: :class:`dataset.Dataset`
    :param path: The output file path.
    :type path: :mod:`string`
    :param compress: (optional) Compress the output variables with zlib.
    :type compress: :class:`bool`
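
    Example (a minimal sketch; ``my_dataset`` stands in for an OCW Dataset
    and is not defined in this module)::

        write_netcdf(my_dataset, '/tmp/my_dataset.nc')
    '''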
out_file = netCDF4.Dataset(path, 'w', format='NETCDF4')
    # Set attribute lengths
lat_len = len(dataset.lats)
lon_len = len(dataset.lons)
time_len = len(dataset.times)
# Create attribute dimensions
lat_dim = out_file.createDimension('lat', lat_len)
lon_dim = out_file.createDimension('lon', lon_len)
time_dim = out_file.createDimension('time', time_len)
# Create variables
lats = out_file.createVariable('lat', 'f8', ('lat',), zlib=compress)
lons = out_file.createVariable('lon', 'f8', ('lon',), zlib=compress)
times = out_file.createVariable('time', 'f8', ('time',), zlib=compress)
var_name = dataset.variable if dataset.variable else 'var'
    values = out_file.createVariable(var_name,
                                     'f8',
                                     ('time', 'lat', 'lon'),
                                     zlib=compress)
# Set the time variable units
# We don't deal with hourly/minutely/anything-less-than-a-day data so
# we can safely stick with a 'days since' offset here. Note that the
# NetCDF4 helper date2num doesn't support 'months' or 'years' instead
# of days.
times.units = "days since %s" % dataset.times[0]
# Store the dataset's values
lats[:] = dataset.lats
lons[:] = dataset.lons
times[:] = netCDF4.date2num(dataset.times, times.units)
    values[:] = dataset.values
    values.units = dataset.units

    out_file.close()
def write_netcdf_multiple_datasets_with_subregions(ref_dataset, ref_name,
                                                   model_dataset_array, model_names,
                                                   path,
                                                   subregions=None, subregion_array=None,
                                                   ref_subregion_mean=None, ref_subregion_std=None,
                                                   model_subregion_mean=None, model_subregion_std=None):
    # Write multiple reference and model datasets and their subregional
    # means and standard deviations to a NetCDF file at 'path'.
    # :To be updated
out_file = netCDF4.Dataset(path, 'w', format='NETCDF4')
dataset = ref_dataset
    # Set attribute lengths
nobs = 1
nmodel = len(model_dataset_array)
lat_len = len(dataset.lats)
lon_len = len(dataset.lons)
time_len = len(dataset.times)
    if subregions is not None:
nsubregion = len(subregions)
# Create attribute dimensions
lat_dim = out_file.createDimension('y', lat_len)
lon_dim = out_file.createDimension('x', lon_len)
time_dim = out_file.createDimension('time', time_len)
# Create variables and store the values
lats = out_file.createVariable('lat', 'f8', ('y'))
lats[:] = dataset.lats
lons = out_file.createVariable('lon', 'f8', ('x'))
lons[:] = dataset.lons
times = out_file.createVariable('time', 'f8', ('time',))
times.units = "days since %s" % dataset.times[0]
times[:] = netCDF4.date2num(dataset.times, times.units)
#mask_array = np.zeros([time_len, lat_len, lon_len])
#for iobs in np.arange(nobs):
# index = np.where(ref_dataset_array[iobs].values.mask[:] == True)
# mask_array[index] = 1
out_file.createVariable(ref_name, 'f8', ('time','y','x'))
out_file.variables[ref_name][:] = ref_dataset.values
out_file.variables[ref_name].units = ref_dataset.units
for imodel in np.arange(nmodel):
out_file.createVariable(model_names[imodel], 'f8', ('time','y','x'))
#out_file.variables[model_names[imodel]][:] = ma.array(model_dataset_array[imodel].values, mask = mask_array)
out_file.variables[model_names[imodel]][:] = model_dataset_array[imodel].values
out_file.variables[model_names[imodel]].units = model_dataset_array[imodel].units
    if subregions is not None:
out_file.createVariable('subregion_array', 'i4', ('y','x'))
out_file.variables['subregion_array'][:] = subregion_array[:]
nsubregion = len(subregions)
out_file.createDimension('nsubregion', nsubregion)
out_file.createDimension('nobs', nobs)
out_file.createDimension('nmodel', nmodel)
out_file.createVariable('obs_subregion_mean', 'f8', ('nobs','time','nsubregion'))
out_file.variables['obs_subregion_mean'][:] = ref_subregion_mean[:]
out_file.createVariable('obs_subregion_std', 'f8', ('nobs','time','nsubregion'))
out_file.variables['obs_subregion_std'][:] = ref_subregion_std[:]
out_file.createVariable('model_subregion_mean', 'f8', ('nmodel','time','nsubregion'))
out_file.variables['model_subregion_mean'][:] = model_subregion_mean[:]
out_file.createVariable('model_subregion_std', 'f8', ('nmodel','time','nsubregion'))
        out_file.variables['model_subregion_std'][:] = model_subregion_std[:]

    out_file.close()
def water_flux_unit_conversion(dataset):
    ''' Convert water flux variable units as necessary.

    Convert water flux variables from full SI units to more common units.

    :param dataset: The dataset to convert.
    :type dataset: :class:`dataset.Dataset`

    :returns: A Dataset with values converted to new units.
    :rtype: :class:`dataset.Dataset`
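
    Example (a minimal sketch; ``precip`` stands in for a Dataset whose
    variable is 'pr' with units 'kg m-2 s-1' and is not defined in this
    module)::

        precip = water_flux_unit_conversion(precip)
        # precip.units is now 'mm/day'
    '''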
water_flux_variables = ['pr', 'prec','evspsbl', 'mrro', 'swe']
variable = dataset.variable.lower()
if any(sub_string in variable for sub_string in water_flux_variables):
dataset_units = dataset.units.lower()
        if variable in 'swe':
            # 1 m of snow-water equivalent is 1000 mm
            if any(unit in dataset_units for unit in ['m', 'meter']):
                dataset.values = 1.e3 * dataset.values
                dataset.units = 'mm'
        else:
            if any(unit in dataset_units
                   for unit in ['kg m-2 s-1', 'mm s-1', 'mm/sec']):
                dataset.values = 86400. * dataset.values
                dataset.units = 'mm/day'
return dataset
def temperature_unit_conversion(dataset):
    ''' Convert temperature units as necessary.

    Automatically convert Celsius to Kelvin in the given dataset.

    :param dataset: The dataset for which units should be updated.
    :type dataset: :class:`dataset.Dataset`

    :returns: The dataset with (potentially) updated units.
    :rtype: :class:`dataset.Dataset`
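
    Example (a minimal sketch; ``temps`` stands in for a Dataset whose
    variable is 'tas' with units 'C' and is not defined in this module)::

        temps = temperature_unit_conversion(temps)
        # temps.units is now 'K'
    '''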
    temperature_variables = ['temp', 'tas', 'tasmax', 'tasmin', 'T']
variable = dataset.variable.lower()
if any(sub_string in variable for sub_string in temperature_variables):
dataset_units = dataset.units.lower()
if dataset_units == 'c':
dataset.values = 273.15 + dataset.values
dataset.units = 'K'
return dataset
def variable_unit_conversion(dataset):
    ''' Convert water flux or temperature variable units as necessary.

    For water flux variables, convert from full SI units to more common units.
    For temperature, convert Celsius to Kelvin.

    :param dataset: The dataset to convert.
    :type dataset: :class:`dataset.Dataset`

    :returns: A Dataset with values converted to new units.
    :rtype: :class:`dataset.Dataset`
    '''
dataset = water_flux_unit_conversion(dataset)
dataset = temperature_unit_conversion(dataset)
return dataset
def _rcmes_normalize_datetimes(datetimes, timestep):
    """ Normalize Dataset datetime values.

    Force daily data to an hour time value of 00:00:00.
    Force monthly data to the first of the month at midnight.

    :param datetimes: The datetimes to normalize.
    :type datetimes: List of `datetime` values.
    :param timestep: The flag for how to normalize the datetimes.
    :type timestep: String
    """
normalDatetimes = []
if timestep.lower() == 'monthly':
for inputDatetime in datetimes:
            if inputDatetime.day != 1:
                # Clean the inputDatetime
                inputDatetimeString = inputDatetime.strftime('%Y%m%d')
                normalInputDatetimeString = inputDatetimeString[:6] + '01'
                inputDatetime = datetime.datetime.strptime(normalInputDatetimeString, '%Y%m%d')

            normalDatetimes.append(inputDatetime)
elif timestep.lower() == 'daily':
for inputDatetime in datetimes:
if inputDatetime.hour != 0 or inputDatetime.minute != 0 or inputDatetime.second != 0:
datetimeString = inputDatetime.strftime('%Y%m%d%H%M%S')
normalDatetimeString = datetimeString[:8] + '000000'
                inputDatetime = datetime.datetime.strptime(normalDatetimeString, '%Y%m%d%H%M%S')

            normalDatetimes.append(inputDatetime)
return normalDatetimes
def mask_missing_data(dataset_array):
    ''' Check missing values in observation and model datasets.

    If any dataset in dataset_array has a missing value at a grid point,
    the values at that grid point in all the other datasets are masked.

    :param dataset_array: An array of OCW datasets.
    '''
mask_array = np.zeros(dataset_array[0].values.shape)
for dataset in dataset_array:
index = np.where(dataset.values.mask == True)
if index[0].size >0:
mask_array[index] = 1
    masked_array = []
    for dataset in dataset_array:
        dataset.values = ma.array(dataset.values, mask=mask_array)
        masked_array.append(dataset)
    return [masked_dataset for masked_dataset in masked_array]
def _rcmes_spatial_regrid(spatial_values, lat, lon, lat2, lon2, order=1):
    """ Spatial regrid from one set of lat,lon values onto a new set (lat2,lon2).

    :param spatial_values: Values in a spatial grid that need to be regridded.
    :type spatial_values: 2d masked numpy array. shape (latitude, longitude)
    :param lat: Grid of latitude values which map to the spatial values.
    :type lat: 2d numpy array. shape (latitudes, longitudes)
    :param lon: Grid of longitude values which map to the spatial values.
    :type lon: 2d numpy array. shape (latitudes, longitudes)
    :param lat2: Grid of NEW latitude values to regrid the spatial_values onto.
    :type lat2: 2d numpy array. shape (latitudes, longitudes)
    :param lon2: Grid of NEW longitude values to regrid the spatial_values onto.
    :type lon2: 2d numpy array. shape (latitudes, longitudes)
    :param order: [optional] Interpolation order flag. 1=bi-linear, 3=cubic spline.
    :type order: Integer

    :returns: spatial_values regridded onto the new (lat2, lon2) grid.
    :rtype: 2d masked numpy array with shape (len(lat2), len(lon2))
    """
nlat = spatial_values.shape[0]
nlon = spatial_values.shape[1]
#print nlat, nlon, "lats, lons - incoming dataset"
nlat2 = lat2.shape[0]
nlon2 = lon2.shape[1]
#print nlat2, nlon2, "NEW lats, lons - for the new grid output"
# To make our lives easier down the road, let's
# turn these into arrays of x & y coords
loni = lon2.ravel()
lati = lat2.ravel()
loni = loni.copy() # NB. it won't run unless you do this...
lati = lati.copy()
# Now, we'll set points outside the boundaries to lie along an edge
loni[loni > lon.max()] = lon.max()
loni[loni < lon.min()] = lon.min()
# To deal with the "hard" break, we'll have to treat y differently,
# so we're just setting the min here...
lati[lati > lat.max()] = lat.max()
lati[lati < lat.min()] = lat.min()
# We need to convert these to (float) indicies
# (xi should range from 0 to (nx - 1), etc)
loni = (nlon - 1) * (loni - lon.min()) / (lon.max() - lon.min())
# Deal with the "hard" break in the y-direction
lati = (nlat - 1) * (lati - lat.min()) / (lat.max() - lat.min())
    """
    TODO: Review this docstring and see if it still holds true.
    NOTE: This function doesn't use MDI currently. These are legacy comments.

    Notes on dealing with MDI when regridding data.
    Method adopted here:
        Use bilinear interpolation of data by default (but the user can
        specify another order using order=... in the call).
        Perform bilinear interpolation of the data, and of the mask.
        To be conservative, a new grid point which contained some missing
        data on the old grid is set to missing data; this is achieved by
        looking for any non-zero interpolated mask values.
        To avoid bilinear interpolation producing strong gradients leading
        into the MDI, set values at MDI points to the mean data value so
        little gradient is visible = not ideal, but acceptable for now.

    Set values in MDI so that they are similar to the surroundings and don't
    produce large gradients when interpolating.
    Preserve the MDI mask by only changing the data part of the masked array
    object.
    """
for shift in (-1, 1):
for axis in (0, 1):
q_shifted = np.roll(spatial_values, shift=shift, axis=axis)
            idx = ~q_shifted.mask * spatial_values.mask
            spatial_values.data[idx] = q_shifted[idx]
# Now we actually interpolate
# map_coordinates does cubic interpolation by default,
# use "order=1" to preform bilinear interpolation instead...
regridded_values = map_coordinates(spatial_values, [lati, loni], order=order)
regridded_values = regridded_values.reshape([nlat2, nlon2])
# Set values to missing data outside of original domain
regridded_values = ma.masked_array(regridded_values, mask=np.logical_or(np.logical_or(lat2 >= lat.max(),
lat2 <= lat.min()),
np.logical_or(lon2 <= lon.min(),
lon2 >= lon.max())))
# Make second map using nearest neighbour interpolation -use this to determine locations with MDI and mask these
qmdi = np.zeros_like(spatial_values)
qmdi[spatial_values.mask == True] = 1.
qmdi[spatial_values.mask == False] = 0.
qmdi_r = map_coordinates(qmdi, [lati, loni], order=order)
qmdi_r = qmdi_r.reshape([nlat2, nlon2])
mdimask = (qmdi_r != 0.0)
# Combine missing data mask, with outside domain mask define above.
regridded_values.mask = np.logical_or(mdimask, regridded_values.mask)
return regridded_values
def _rcmes_create_mask_using_threshold(masked_array, threshold=0.5):
    '''Mask an array if the proportion of values missing data is above a threshold.

    For each value along an axis, if the proportion of steps that are missing
    data is above ``threshold`` then the value is marked as missing data.

    .. note:: The 0th axis is currently always used.

    :param masked_array: Masked array of data.
    :type masked_array: Numpy Masked Array
    :param threshold: (optional) Threshold proportion above which a value is
        marked as missing data.
    :type threshold: Float

    :returns: A Numpy array describing the mask for masked_array.
    '''
    # try/except used as some model files don't have a full mask, but a
    # single bool. The except catches this situation and deals with it
    # appropriately.
    try:
        nT = masked_array.mask.shape[0]

        # For each pixel, count how many times are masked.
        nMasked = masked_array.mask[:, :, :].sum(axis=0)

        # Define the new mask as when a pixel has over the defined threshold
        # ratio of masked data, e.g. if the threshold is 0.75 and there are
        # 10 times, a pixel is masked when more than 7 of those times are
        # masked.
        mymask = nMasked > (nT * threshold)
    except (AttributeError, IndexError):
        mymask = np.zeros_like(masked_array.data[0, :, :])

    return mymask
def _rcmes_calc_average_on_new_time_unit_K(data, dates, unit):
    """ Rebin a 3d array and list of dates using the provided unit parameter.

    :param data: Input data that needs to be averaged.
    :type data: 3D masked numpy array of shape (times, lats, lons)
    :param dates: List of dates that correspond to the given data values.
    :type dates: Python datetime objects
    :param unit: Time unit to average the data into.
    :type unit: String matching one of these values: full | annual | monthly | daily
    :returns: meanstorem, newTimesList
    :rtype: 3D numpy masked array the same shape as the input array, and a
        list of python datetime objects
    """
# Check if the user-selected temporal grid is valid. If not, EXIT
    acceptable = (unit == 'full') | (unit == 'annual') | (unit == 'monthly') | (unit == 'daily')
    if not acceptable:
        print 'Error: unknown unit type selected for time averaging: EXIT'
        return -1, -1
# Calculate arrays of: annual timeseries: year (2007,2007),
# monthly time series: year-month (200701,200702),
# daily timeseries: year-month-day (20070101,20070102)
# depending on user-selected averaging period.
# Year list
if unit=='annual':
timeunits = np.array([int(d.strftime("%Y")) for d in dates])
unique_times = np.unique(timeunits)
# YearMonth format list
if unit=='monthly':
timeunits = np.array([int(d.strftime("%Y%m")) for d in dates])
unique_times = np.unique(timeunits)
# YearMonthDay format list
if unit=='daily':
timeunits = np.array([int(d.strftime("%Y%m%d")) for d in dates])
unique_times = np.unique(timeunits)
# TODO: add pentad setting using Julian days?
# Full list: a special case
if unit == 'full':
# Calculating means data over the entire time range: i.e., annual-mean climatology
timeunits = []
for i in np.arange(len(dates)):
timeunits.append(999) # i.e. we just want the same value for all times.
timeunits = np.array(timeunits, dtype=int)
unique_times = np.unique(timeunits)
# empty list to store new times
newTimesList = []
# Decide whether or not you need to do any time averaging.
# i.e. if data are already on required time unit then just pass data through and
# calculate and return representative datetimes.
processing_required = True
if len(timeunits)==(len(unique_times)):
processing_required = False
# 1D data arrays, i.e. time series
    if data.ndim == 1:
        # Create array to store the resulting data
        meanstore = np.zeros(len(unique_times))

        # Calculate the means across each unique time unit
        i = 0
        for myunit in unique_times:
            if processing_required:
                datam = ma.masked_array(data, timeunits != myunit)
                meanstore[i] = ma.average(datam)

            # construct new times list
            yyyy, mm, dd = _create_new_year_month_day(myunit, dates)
            newTimesList.append(datetime.datetime(yyyy, mm, dd, 0, 0, 0, 0))
            i = i + 1
# 3D data arrays
if data.ndim==3:
# Create array to store the resulting data
        meanstore = np.zeros([len(unique_times), data.shape[1], data.shape[2]])

        # Calculate the means across each unique time unit
        i = 0
        datamask_store = []
        for myunit in unique_times:
            if processing_required:
                mask = np.zeros_like(data)
                mask[timeunits != myunit, :, :] = 1.0

                # Calculate missing data mask within each time unit...
                datamask_at_this_timeunit = np.zeros_like(data)
                datamask_at_this_timeunit[:] = _rcmes_create_mask_using_threshold(
                    data[timeunits == myunit, :, :], threshold=0.75)
                # Store results for masking later
                datamask_store.append(datamask_at_this_timeunit[0])

                # Calculate means for each pixel in this time unit,
                # ignoring missing data (using a masked array).
                datam = ma.masked_array(data, np.logical_or(mask, datamask_at_this_timeunit))
                meanstore[i, :, :] = ma.average(datam, axis=0)

            # construct new times list
            yyyy, mm, dd = _create_new_year_month_day(myunit, dates)
            newTimesList.append(datetime.datetime(yyyy, mm, dd, 0, 0, 0, 0))
            i += 1
if not processing_required:
meanstorem = data
if processing_required:
# Create masked array (using missing data mask defined above)
datamask_store = np.array(datamask_store)
meanstorem = ma.masked_array(meanstore, datamask_store)
return meanstorem, newTimesList
def _create_new_year_month_day(time_unit, dates):
smyunit = str(time_unit)
if len(smyunit)==4: # YYYY
yyyy = int(smyunit[0:4])
mm = 1
dd = 1
elif len(smyunit)==6: # YYYYMM
yyyy = int(smyunit[0:4])
mm = int(smyunit[4:6])
dd = 1
elif len(smyunit)==8: # YYYYMMDD
yyyy = int(smyunit[0:4])
mm = int(smyunit[4:6])
dd = int(smyunit[6:8])
elif len(smyunit)==3: # Full time range
# Need to set an appropriate time representing the mid-point of the entire time span
dt = dates[-1]-dates[0]
halfway = dates[0]+(dt/2)
yyyy = int(halfway.year)
mm = int(halfway.month)
dd = int(
return (yyyy, mm, dd)
def _congrid(a, newdims, method='linear', centre=False, minusone=False):
    '''
    This function is from http://wiki.scipy.org/Cookbook/Rebinning - Example 3.
    It has been refactored and changed a bit, but the original functionality
    has been preserved.

    Arbitrary resampling of a source array to new dimension sizes.
    Currently only supports maintaining the same number of dimensions.
    To use 1-D arrays, first promote them to shape (x,1).

    Uses the same parameters and creates the same co-ordinate lookup points
    as IDL's congrid routine, which apparently originally came from a VAX/VMS
    routine of the same name.

    method:
    neighbour - closest value from original data
    nearest and linear - uses n x 1-D interpolations using
                         scipy.interpolate.interp1d
    (see Numerical Recipes for validity of use of n 1-D interpolations)
    spline - uses ndimage.map_coordinates

    centre:
    True - interpolation points are at the centres of the bins
    False - points are at the front edge of the bin

    minusone:
    For example, inarray.shape = (i,j) & new dimensions = (x,y)
    False - inarray is resampled by factors of (i/x) * (j/y)
    True - inarray is resampled by (i-1)/(x-1) * (j-1)/(y-1)
    This prevents extrapolation one element beyond the bounds of the input
    array.
    '''
if not a.dtype in [np.float64, np.float32]:
a = np.cast[float](a)
# this will merely take the True/False input and convert it to an array(1) or array(0)
m1 = np.cast[int](minusone)
# this also casts the True False input into a floating point number of 0.5 or 0.0
ofs = np.cast[int](centre) * 0.5
old = np.array( a.shape )
ndims = len( a.shape )
if len( newdims ) != ndims:
print "[congrid] dimensions error. " \
"This routine currently only supports " \
"rebinning to the same number of dimensions."
return None
newdims = np.asarray( newdims, dtype=float )
dimlist = []
if method == 'neighbour':
newa = _congrid_neighbor(a, newdims, m1, ofs)
elif method in ['nearest','linear']:
# calculate new dims
for i in range( ndims ):
base = np.arange( newdims[i] )
dimlist.append( (old[i] - m1) / (newdims[i] - m1) \
* (base + ofs) - ofs )
# specify old dims
olddims = [np.arange(i, dtype = np.float) for i in list( a.shape )]
# first interpolation - for ndims = any
mint = scipy.interpolate.interp1d( olddims[-1], a, kind=method )
newa = mint( dimlist[-1] )
trorder = [ndims - 1] + range( ndims - 1 )
for i in range( ndims - 2, -1, -1 ):
newa = newa.transpose( trorder )
mint = scipy.interpolate.interp1d( olddims[i], newa, kind=method )
newa = mint( dimlist[i] )
if ndims > 1:
# need one more transpose to return to original dimensions
newa = newa.transpose( trorder )
return newa
elif method in ['spline']:
oslices = [ slice(0, j) for j in old ]
oldcoords = np.ogrid[oslices]
nslices = [ slice(0, j) for j in list(newdims) ]
newcoords = np.mgrid[nslices]
        newcoords_dims = range(np.rank(newcoords))
        # make first index last
        newcoords_dims.append(newcoords_dims.pop(0))
        newcoords_tr = newcoords.transpose(newcoords_dims)
# makes a view that affects newcoords
newcoords_tr += ofs
deltas = (np.asarray(old) - m1) / (newdims - m1)
newcoords_tr *= deltas
newcoords_tr -= ofs
newa = scipy.ndimage.map_coordinates(a, newcoords)
return newa
print "Congrid error: Unrecognized interpolation type.\n", \
"Currently only \'neighbour\', \'nearest\',\'linear\',", \
"and \'spline\' are supported."
return None
def _check_dataset_shapes(datasets):
    """ Raise a ValueError if the datasets are not all the same shape.

    :param datasets: OCW Datasets to check for a consistent shape.
    :type datasets: List of OCW Dataset Objects

    :raises: ValueError
    """
    dataset_shape = None
    for dataset in datasets:
        if dataset_shape is None:
            dataset_shape = dataset.values.shape
        else:
            if dataset.values.shape != dataset_shape:
                msg = "%s != %s" % (dataset.values.shape, dataset_shape)
                raise ValueError("Input datasets must be the same shape for an ensemble :: " + msg)
def _congrid_neighbor(values, new_dims, minus_one, offset):
    """ Use the nearest neighbor to create a new array.

    :param values: Array of values that need to be interpolated.
    :type values: Numpy ndarray
    :param new_dims: The new dimension sizes to resample the array to.
    :type new_dims: sequence of :class:`int`
    :param minus_one: 0 or 1; the _congrid ``minusone`` flag cast to an int.
    :type minus_one: :class:`int`
    :param offset: 0.0 or 0.5; the _congrid ``centre`` flag as a bin offset.
    :type offset: :class:`float`

    :returns: A new array resampled to ``new_dims`` with nearest-neighbor lookups.
    :rtype: Numpy ndarray
    """
ndims = len( values.shape )
dimlist = []
old_dims = np.array( values.shape )
for i in range( ndims ):
base = np.indices(new_dims)[i]
dimlist.append( (old_dims[i] - minus_one) / (new_dims[i] - minus_one) \
* (base + offset) - offset )
cd = np.array( dimlist ).round().astype(int)
new_values = values[list( cd )]
return new_values
def _are_bounds_contained_by_dataset(bounds, dataset):
    '''Check if a Dataset fully contains a bounds.

    :param bounds: The Bounds object to check.
    :type bounds: Bounds
    :param dataset: The Dataset that should fully contain the Bounds.
    :type dataset: Dataset

    :returns: True if the Bounds are contained by the Dataset; raises
        a ValueError otherwise.
    '''
lat_min, lat_max, lon_min, lon_max = dataset.spatial_boundaries()
start, end = dataset.time_range()
errors = []
    # TODO: THIS IS TERRIBLY inefficient and we need to use a geometry lib instead in the future
    if not lat_min <= bounds.lat_min <= lat_max:
        error = ("bounds.lat_min: %s is not between lat_min: %s and lat_max: %s" %
                 (bounds.lat_min, lat_min, lat_max))
        errors.append(error)
    if not lat_min <= bounds.lat_max <= lat_max:
        error = ("bounds.lat_max: %s is not between lat_min: %s and lat_max: %s" %
                 (bounds.lat_max, lat_min, lat_max))
        errors.append(error)
    if not lon_min <= bounds.lon_min <= lon_max:
        error = ("bounds.lon_min: %s is not between lon_min: %s and lon_max: %s" %
                 (bounds.lon_min, lon_min, lon_max))
        errors.append(error)
    if not lon_min <= bounds.lon_max <= lon_max:
        error = ("bounds.lon_max: %s is not between lon_min: %s and lon_max: %s" %
                 (bounds.lon_max, lon_min, lon_max))
        errors.append(error)
    if not start <= bounds.start <= end:
        error = ("bounds.start: %s is not between start: %s and end: %s" %
                 (bounds.start, start, end))
        errors.append(error)
    if not start <= bounds.end <= end:
        error = ("bounds.end: %s is not between start: %s and end: %s" %
                 (bounds.end, start, end))
        errors.append(error)
if len(errors) == 0:
return True
error_message = '\n'.join(errors)
raise ValueError(error_message)
def _get_subregion_slice_indices(subregion, target_dataset):
    '''Get the indices for slicing Dataset values to generate the subregion.

    :param subregion: The Bounds that specify the subset of the Dataset
        that should be extracted.
    :type subregion: Bounds
    :param target_dataset: The Dataset to subset.
    :type target_dataset: Dataset

    :returns: The indices to slice the Dataset's arrays, as a dictionary.
    '''
latStart = min(np.nonzero(target_dataset.lats >= subregion.lat_min)[0])
latEnd = max(np.nonzero(target_dataset.lats <= subregion.lat_max)[0])
lonStart = min(np.nonzero(target_dataset.lons >= subregion.lon_min)[0])
lonEnd = max(np.nonzero(target_dataset.lons <= subregion.lon_max)[0])
timeStart = min(np.nonzero(target_dataset.times >= subregion.start)[0])
timeEnd = max(np.nonzero(target_dataset.times <= subregion.end)[0])
return {
"lat_start" : latStart,
"lat_end" : latEnd,
"lon_start" : lonStart,
"lon_end" : lonEnd,
"time_start" : timeStart,
"time_end" : timeEnd