"""Prepare Datasets both model and observations for analysis using metrics"""
import numpy as np
import numpy.ma as ma
import sys, os
import Nio
from storage import db, files
import process
from utils import misc
# TODO: swap gridBox for Domain
def prep_data(settings, obsDatasetList, gridBox, modelList):
"""
returns numOBSs, numMDLs, nT, ngrdY, ngrdX, Times, lons, lats, obsData, modelData, obsList, mdlName
numOBSs - number of observational datasets. Consider using len(obsDatasetList) instead
numMDLs - number of models used. Consider using len(modelList) instead
nT - number of time levels after temporal regridding. Consider using the length of the time axis for a given dataset
ngrdY - size of the Y-axis grid after spatial regridding
ngrdX - size of the X-axis grid after spatial regridding
Times - list of python datetime objects that represent the time levels used in further calculations
lons - 2D array (ngrdY x ngrdX) of longitudes on the common analysis grid
lats - 2D array (ngrdY x ngrdX) of latitudes on the common analysis grid
obsData - masked array (numOBSs x nT x ngrdY x ngrdX) of regridded observation data
modelData - masked array (numMDLs x nT x ngrdY x ngrdX) of regridded model data
obsList - list of observation dataset names (with 'ENS-OBS' appended when an obs ensemble is computed)
mdlName - list of model names derived from the model file names
"""
# TODO: Stop the object Deserialization and work on refactoring the core code here
cachedir = settings.cacheDir
workdir = settings.workDir
# Use list comprehensions to deconstruct obsDatasetList
# e.g. ['TRMM_pr_mon', 'CRU3.1_pr']: each entry is dataset name + '_' + parameter name. The 'CRU*' entries need a unit conversion later.
# The plan here is to use the obsDatasetList, which contains a 'longname' key we can use.
obsList = [str(x['longname']) for x in obsDatasetList]
# Also using the obsDatasetList with a key of ['dataset_id']
obsDatasetId = [str(x['dataset_id']) for x in obsDatasetList]
# obsDatasetList ['parameter_id'] list
obsParameterId = [str(x['parameter_id']) for x in obsDatasetList]
obsTimestep = [str(x['timestep']) for x in obsDatasetList]
mdlList = [model.filename for model in modelList]
# Since all of the model objects in modelList share the same variable names and precip flag,
# pull these from modelList[0] for now
modelVarName = modelList[0].varName
precipFlag = modelList[0].precipFlag
modelTimeVarName = modelList[0].timeVariable
modelLatVarName = modelList[0].latVariable
modelLonVarName = modelList[0].lonVariable
regridOption = settings.spatialGrid
timeRegridOption = settings.temporalGrid
"""
Routine to read-in and re-grid both obs and mdl datasets.
Processes both single and multiple files of obs and mdl or combinations in a general way.
i) retrieve observations from the database
ii) load in model data
iii) temporal regridding
iv) spatial regridding
v) area-averaging
Input:
cachedir - string describing directory path
workdir - string describing directory path
obsList - list of strings describing the observation datasets
obsDatasetId - list of int, db dataset ids
obsParameterId - list of int, db parameter ids
latMin, latMax, lonMin, lonMax, dLat, dLon, naLats, naLons: define the evaluation/analysis domain/grid system
latMin - float
latMax - float
lonMin - float
lonMax - float
dLat - float
dLon - float
naLats - integer
naLons - integer
mdlList - list of strings describing model file names + paths
modelVarName - string describing name of variable to evaluate (as written in model file)
precipFlag - bool (is this precipitation data? True/False)
modelTimeVarName - string describing name of time variable in model file
modelLatVarName - string describing name of latitude variable in model file
modelLonVarName - string describing name of longitude variable in model file
regridOption - string: 'obs'|'model'|'user'
timeRegridOption - string: 'full'|'annual'|'monthly'|'daily'
maskOption - Boolean
# TODO: This isn't true in the current codebase.
Instead the SubRegions are being used. You can see that these values are not
used in the code; they are not even passed in to the function.
maskLatMin - float (only used if maskOption=1)
maskLatMax - float (only used if maskOption=1)
maskLonMin - float (only used if maskOption=1)
maskLonMax - float (only used if maskOption=1)
Output: regridded and temporally averaged obs and model data on the common analysis grid (see the return values listed above)
Jinwon Kim, 7/11/2012
"""
# check the number of obs & model data files
numOBSs = len(obsList)
numMDLs = len(mdlList)
# assign parameters that must be preserved throughout the process
print 'start & end time = ', settings.startDate, settings.endDate
yymm0 = settings.startDate.strftime("%Y%m")
yymm1 = settings.endDate.strftime("%Y%m")
print 'start & end eval period = ', yymm0, yymm1
#TODO: Wrap in try except blocks instead
if numMDLs < 1:
print 'No input model data file. EXIT'
sys.exit()
if numOBSs < 1:
print 'No input observation data file. EXIT'
sys.exit()
## Part 1: retrieve observation data from the database and regrid them
## NB. automatically uses local cache if already retrieved.
# preparation for spatial re-gridding: define the size of horizontal array of the target interpolation grid system (ngrdX and ngrdY)
print 'regridOption in prep_data= ', regridOption
if regridOption == 'model':
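# use the native grid of the first model file as the common analysis grid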
ifile = mdlList[0]
typeF = 'nc'
lats, lons, mTimes = files.read_lolaT_from_file(ifile, modelLatVarName, modelLonVarName, modelTimeVarName, typeF)
modelObject = modelList[0]
latMin = modelObject.latMin
latMax = modelObject.latMax
lonMin = modelObject.lonMin
lonMax = modelObject.lonMax
elif regridOption == 'user':
# Use the GridBox Object
latMin = gridBox.latMin
latMax = gridBox.latMax
lonMin = gridBox.lonMin
lonMax = gridBox.lonMax
naLats = gridBox.latCount
naLons = gridBox.lonCount
dLat = gridBox.latStep
dLon = gridBox.lonStep
lat = np.arange(naLats) * dLat + latMin
lon = np.arange(naLons) * dLon + lonMin
lons, lats = np.meshgrid(lon, lat)
lon = 0.
lat = 0.
else:
print "INVALID REGRID OPTION USED"
sys.exit()
ngrdY = lats.shape[0]
ngrdX = lats.shape[1]
regObsData = []
for n in np.arange(numOBSs):
# spatial regridding
oLats, oLons, _, oTimes, oData = db.extractData(obsDatasetId[n],
obsParameterId[n],
latMin, latMax,
lonMin, lonMax,
settings.startDate, settings.endDate,
cachedir, obsTimestep[n])
# TODO: modify this if block with new metadata usage.
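# Scale the CRU values by 86400, presumably converting a per-second precipitation rate to a per-day
# amount (cf. the KG M-2 S-1 -> mm/day conversion applied to the model data below); the source units
# are an assumption here, not read from the dataset metadata.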
if precipFlag and obsList[n][0:3] == 'CRU':
oData = 86400.0 * oData
nstOBSs = oData.shape[0] # note that the length of obs data can vary for different obs intervals (e.g., daily vs. monthly)
print 'Regrid OBS dataset onto the ', regridOption, ' grid system: ngrdY, ngrdX, nstOBSs= ', ngrdY, ngrdX, nstOBSs
print 'For dataset: %s' % obsList[n]
tmpOBS = ma.zeros((nstOBSs, ngrdY, ngrdX))
print 'tmpOBS shape = ', tmpOBS.shape
for t in np.arange(nstOBSs):
tmpOBS[t, :, :] = process.do_regrid(oData[t, :, :], oLats, oLons, lats, lons)
# TODO: Not sure this is needed with Python Garbage Collector
# The used memory should be freed when the objects are no longer referenced. If this continues to be an issue we may need to look
# at using generators where possible.
oLats = 0.0
oLons = 0.0 # release the memory occupied by the temporary variables oLats and oLons.
# temporally regrid the spatially regridded obs data
oData, newObsTimes = process.calc_average_on_new_time_unit_K(tmpOBS, oTimes, unit=timeRegridOption)
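# oData now holds one time level per aggregation period (e.g. one per month for 'monthly'),
# and newObsTimes holds the matching datetime values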
tmpOBS = 0.0
# check the consistency of temporally regridded obs data
if n == 0:
oldObsTimes = newObsTimes
else:
if oldObsTimes != newObsTimes:
print 'temporally regridded obs data time levels do not match at ', n - 1, n
print '%s Time through Loop' % (n + 1)
print 'oldObsTimes Count: %s' % len(oldObsTimes)
print 'newObsTimes Count: %s' % len(newObsTimes)
# TODO: We need to handle these cases using Try Except Blocks or insert a sys.exit if appropriate
sys.exit()
else:
oldObsTimes = newObsTimes
# if everything's fine, append the spatially and temporally regridded data to the obs data list (regObsData)
regObsData.append(oData)
""" all obs datasets have been read-in and regridded. convert the regridded obs data from 'list' to 'array'
also finalize 'obsTimes', the time coordinate values of the regridded obs data.
NOTE: using 'list_to_array' assigns values to the missing points; this has become a problem in handling the CRU data.
this problem disappears by using 'ma.array'."""
obsData = ma.array(regObsData)
obsTimes = newObsTimes
regObsData = 0
oldObsTimes = 0
nT = len(obsTimes)
# TODO: Refactor this into a function within the toolkit module
# compute the simple multi-obs ensemble if multiple obs are used
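# the ensemble is the unweighted mean over all obs datasets; it is appended as one extra entry
# (and labeled 'ENS-OBS' below) so downstream metrics can treat it like any other obs dataset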
if numOBSs > 1:
print 'numOBSs = ', numOBSs
oData = obsData
print 'oData shape = ', oData.shape
obsData = ma.zeros((numOBSs + 1, nT, ngrdY, ngrdX))
print 'obsData shape = ', obsData.shape
avg = ma.zeros((nT, ngrdY, ngrdX))
for i in np.arange(numOBSs):
obsData[i, :, :, :] = oData[i, :, :, :]
avg[:, :, :] = avg[:, :, :] + oData[i, :, :, :]
avg = avg / float(numOBSs)
obsData[numOBSs, :, :, :] = avg[:, :, :] # store the model-ensemble data
numOBSs = numOBSs + 1 # update the number of obs data to include the model ensemble
obsList.append('ENS-OBS')
print 'OBS regridded: ', obsData.shape
## Part 2: load in and regrid model data from file(s)
## NOTE: the two parameters, numMDLs and numMOmx, are defined to represent the number of models (w/ all 240 mos) &
## the total number of months, respectively, in later multi-model calculations.
typeF = 'nc'
regridMdlData = []
# extract the model names and store them in the list 'mdlName'
mdlName = []
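# Derive a short display name for each model by stripping the longest common prefix (typically the
# shared directory path) and the longest common suffix (typically the shared file-name ending) from
# the file names, e.g. (hypothetical) '/data/wrf_prec.nc' and '/data/ens_prec.nc' -> 'wrf' and 'ens'.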
mdlListReversed = []
if len(mdlList) > 1:
for element in mdlList:
mdlListReversed.append(element[::-1])
prefix = os.path.commonprefix(mdlList)
postfix = os.path.commonprefix(mdlListReversed)[::-1]
for element in mdlList:
mdlName.append(element.replace(prefix, '').replace(postfix, ''))
else:
mdlName.append('model')
for n in np.arange(numMDLs):
# read model grid info, then model data
ifile = mdlList[n]
print 'ifile= ', ifile
modelLats, modelLons, mTimes = files.read_lolaT_from_file(ifile, modelLatVarName, modelLonVarName, modelTimeVarName, typeF)
mTime, mdlDat, mvUnit = files.read_data_from_one_file(ifile, modelVarName, modelTimeVarName, modelLats, typeF)
mdlT = []
mStep = len(mTimes)
for i in np.arange(mStep):
mdlT.append(mTimes[i].strftime("%Y%m"))
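# keep only the time steps inside the evaluation period; comparing the 'YYYYMM' strings works because
# the zero-padded format sorts lexicographically in time order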
wh = (np.array(mdlT) >= yymm0) & (np.array(mdlT) <= yymm1)
modelTimes = list(np.array(mTimes)[wh])
mData = mdlDat[wh, :, :]
# determine the dimension size from the model time and latitude data.
nT = len(modelTimes)
# UNUSED VARIABLES - WILL DELETE AFTER TESTING
# nmdlY=modelLats.shape[0]
# nmdlX=modelLats.shape[1]
#print 'nT, ngrdY, ngrdX = ',nT,ngrdY, ngrdX,min(modelTimes),max(modelTimes)
print ' The shape of model data to be processed= ', mData.shape, ' for the period ', min(modelTimes), max(modelTimes)
# spatial regridding of the model data
tmpMDL = ma.zeros((nT, ngrdY, ngrdX))
if regridOption != 'model':
for t in np.arange(nT):
tmpMDL[t, :, :] = process.do_regrid(mData[t, :, :], modelLats, modelLons, lats, lons)
else:
tmpMDL = mData
# temporally regrid the model data
mData, newMdlTimes = process.regrid_in_time(tmpMDL, modelTimes, unit=timeRegridOption)
tmpMDL = 0.0
# check data consistency for all models
if n == 0:
oldMdlTimes = newMdlTimes
else:
if oldMdlTimes != newMdlTimes:
print 'temporally regridded mdl data time levels do not match at ', n - 1, n
print len(oldMdlTimes), len(newMdlTimes)
sys.exit()
else:
oldMdlTimes = newMdlTimes
# if everything's fine, append the spatially and temporally regridded data to the model data list (regridMdlData)
regridMdlData.append(mData)
modelData = ma.array(regridMdlData)
modelTimes = newMdlTimes
regridMdlData = 0
oldMdlTimes = 0
newMdlTimes = 0
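# for liquid water, 1 kg m-2 equals a 1 mm layer, so a KG M-2 S-1 rate times 86400 s/day gives mm/day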
if (precipFlag == True) & (mvUnit == 'KG M-2 S-1'):
print 'convert model variable unit from mm/s to mm/day'
modelData = 86400.*modelData
# check consistency between the time levels of the model and obs data
# this is the final update of time levels: 'Times' and 'nT'
if obsTimes != modelTimes:
print 'time levels of the obs and model data are not consistent. EXIT'
print 'obsTimes'
print obsTimes
print 'modelTimes'
print modelTimes
sys.exit()
# 'Times = modelTimes = obsTimes' has been established and modelTimes and obsTimes will not be used hereafter. (de-allocated)
Times = modelTimes
nT = len(modelTimes)
modelTimes = 0
obsTimes = 0
print 'Reading and regridding model data are completed'
print 'numMDLs, modelData.shape= ', numMDLs, modelData.shape
# TODO: Do we need to make this a user supplied flag, or do we just create an ensemble ALWAYS
# TODO: Add in Kyo's code here as well
# TODO: Commented out until I can talk with Jinwon about this
# compute the simple multi-model ensemble if multiple models are evaluated
if numMDLs > 1:
mdlData = modelData
modelData = ma.zeros((numMDLs + 1, nT, ngrdY, ngrdX))
avg = ma.zeros((nT, ngrdY, ngrdX))
for i in np.arange(numMDLs):
modelData[i, :, :, :] = mdlData[i, :, :, :]
avg[:, :, :] = avg[:, :, :] + mdlData[i, :, :, :]
avg = avg / float(numMDLs)
modelData[numMDLs, :, :, :] = avg[:, :, :] # store the model-ensemble data
# THESE ARE NOT USED. WILL DELETE AFTER TESTING
# i0mdl=0
# i1mdl=numMDLs
numMDLs = numMDLs + 1 # update the number of models to include the model ensemble
mdlName.append('ENS-MODEL') # keep mdlName in step with the extra ensemble slice in modelData
print 'Eval mdl-mean timeseries for the obs periods: modelData.shape= ',modelData.shape
# GOODALE: This ensemble code should be refactored into the process.py module since its purpose is data processing
# Processing complete
print 'data_prep is completed: both obs and mdl data are re-gridded to a common analysis grid'
return numOBSs, numMDLs, nT, ngrdY, ngrdX, Times, lons, lats, obsData, modelData, obsList, mdlName
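# Illustrative call (the settings/obs/grid/model objects named here are hypothetical):
# (numOBSs, numMDLs, nT, ngrdY, ngrdX, Times, lons, lats,
#  obsData, modelData, obsList, mdlName) = prep_data(mySettings, myObsList, myGridBox, myModels)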