"""Prepare Datasets both model and observations for analysis using metrics"""
import numpy as np
import numpy.ma as ma
import sys, os
import Nio
from storage import db, files
import process
from utils import misc
# TODO: swap gridBox for Domain
def prep_data(settings, obsDatasetList, gridBox, modelList):
"""
returns numOBSs, numMDLs, nT, ngrdY, ngrdX, Times, lons, lats, obsData, modelData, obsList, mdlName
numOBSs - number of observational datasets. Consider using len(obsDatasetList) instead
numMDLs - number of models used. Consider using len(modelList) instead
nT - number of time levels after temporal regridding. Consider using the length of the time axis for a given dataset
ngrdY - size of the Y-axis grid after spatial regridding
ngrdX - size of the X-axis grid after spatial regridding
Times - list of python datetime objects that represent the time levels used in further calculations
lons - 2D array (ngrdY x ngrdX) of longitudes on the common analysis grid
lats - 2D array (ngrdY x ngrdX) of latitudes on the common analysis grid
obsData - masked array (numOBSs x nT x ngrdY x ngrdX) of regridded observation data
modelData - masked array (numMDLs x nT x ngrdY x ngrdX) of regridded model data
obsList - list of observation dataset names (with 'ENS-OBS' appended when an obs ensemble is computed)
mdlName - list of model names derived from the model file names
"""
# TODO: Stop the object Deserialization and work on refactoring the core code here
cachedir = settings.cacheDir
workdir = settings.workDir
# Use list comprehensions to deconstruct obsDatasetList
# e.g. ['TRMM_pr_mon', 'CRU3.1_pr']: each entry is dataset name + '_' + parameter name. The 'CRU*' entries need a unit conversion later.
# The plan here is to use the obsDatasetList, which contains a 'longname' key we can use.
obsList = [str(x['longname']) for x in obsDatasetList]
# Also using the obsDatasetList with a key of ['dataset_id']
obsDatasetId = [str(x['dataset_id']) for x in obsDatasetList]
# obsDatasetList ['parameter_id'] list
obsParameterId = [str(x['parameter_id']) for x in obsDatasetList]
obsTimestep = [str(x['timestep']) for x in obsDatasetList]
mdlList = [model.filename for model in modelList]
# Since all of the model objects in modelList share the same variable names and precip flag,
# pull these from modelList[0] for now
modelVarName = modelList[0].varName
precipFlag = modelList[0].precipFlag
modelTimeVarName = modelList[0].timeVariable
modelLatVarName = modelList[0].latVariable
modelLonVarName = modelList[0].lonVariable
regridOption = settings.spatialGrid
timeRegridOption = settings.temporalGrid
"""
Routine to read-in and re-grid both obs and mdl datasets.
Processes both single and multiple files of obs and mdl or combinations in a general way.
i) retrieve observations from the database
ii) load in model data
iii) temporal regridding
iv) spatial regridding
v) area-averaging
Input:
cachedir - string describing directory path
workdir - string describing directory path
obsList - list of strings describing the observation datasets
obsDatasetId - list of int, db dataset ids
obsParameterId - list of int, db parameter ids
latMin, latMax, lonMin, lonMax, dLat, dLon, naLats, naLons: define the evaluation/analysis domain/grid system
latMin - float
latMax - float
lonMin - float
lonMax - float
dLat - float
dLon - float
naLats - integer
naLons - integer
mdlList - list of strings describing model file names + paths
modelVarName - string describing name of variable to evaluate (as written in model file)
precipFlag - bool (is this precipitation data? True/False)
modelTimeVarName - string describing name of time variable in model file
modelLatVarName - string describing name of latitude variable in model file
modelLonVarName - string describing name of longitude variable in model file
regridOption - string: 'obs'|'model'|'user'
timeRegridOption - string: 'full'|'annual'|'monthly'|'daily'
maskOption - Boolean
# TODO: This isn't true in the current codebase.
Instead the SubRegions are being used. You can see that these values are not
used in the code; they are not even passed in to the function.
maskLatMin - float (only used if maskOption=1)
maskLatMax - float (only used if maskOption=1)
maskLonMin - float (only used if maskOption=1)
maskLonMax - float (only used if maskOption=1)
Output: regridded and temporally averaged obs and model data on the common analysis grid (see the return values listed above)
Jinwon Kim, 7/11/2012
"""
# check the number of obs & model data files
numOBSs = len(obsList)
numMDLs = len(mdlList)
# assign parameters that must be preserved throughout the process
print 'start & end time = ', settings.startDate, settings.endDate
yymm0 = settings.startDate.strftime("%Y%m")
yymm1 = settings.endDate.strftime("%Y%m")
print 'start & end eval period = ', yymm0, yymm1
#TODO: Wrap in try except blocks instead
if numMDLs < 1:
print 'No input model data file. EXIT'
sys.exit()
if numOBSs < 1:
print 'No input observation data file. EXIT'
sys.exit()
## Part 1: retrieve observation data from the database and regrid them
## NB. automatically uses local cache if already retrieved.
# preparation for spatial re-gridding: define the size of horizontal array of the target interpolation grid system (ngrdX and ngrdY)
print 'regridOption in prep_data= ', regridOption
if regridOption == 'model':
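# use the native grid of the first model file as the common analysis grid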
ifile = mdlList[0]
typeF = 'nc'
lats, lons, mTimes = files.read_lolaT_from_file(ifile, modelLatVarName, modelLonVarName, modelTimeVarName, typeF)
modelObject = modelList[0]
latMin = modelObject.latMin
latMax = modelObject.latMax
lonMin = modelObject.lonMin
lonMax = modelObject.lonMax
elif regridOption == 'user':
# Use the GridBox Object
latMin = gridBox.latMin
latMax = gridBox.latMax
lonMin = gridBox.lonMin
lonMax = gridBox.lonMax
naLats = gridBox.latCount
naLons = gridBox.lonCount
dLat = gridBox.latStep
dLon = gridBox.lonStep
lat = np.arange(naLats) * dLat + latMin
lon = np.arange(naLons) * dLon + lonMin
lons, lats = np.meshgrid(lon, lat)
lon = 0.
lat = 0.
else:
print "INVALID REGRID OPTION USED"
sys.exit()
ngrdY = lats.shape[0]
ngrdX = lats.shape[1]
regObsData = []
for n in np.arange(numOBSs):
# spatial regridding
oLats, oLons, _, oTimes, oData = db.extractData(obsDatasetId[n],
obsParameterId[n],
latMin, latMax,
lonMin, lonMax,
settings.startDate, settings.endDate,
cachedir, obsTimestep[n])
# TODO: modify this if block with new metadata usage.
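# Scale the CRU values by 86400, presumably converting a per-second precipitation rate to a per-day
# amount (cf. the KG M-2 S-1 -> mm/day conversion applied to the model data below); the source units
# are an assumption here, not read from the dataset metadata.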
if precipFlag and obsList[n][0:3] == 'CRU':
oData = 86400.0 * oData
nstOBSs = oData.shape[0] # note that the length of obs data can vary for different obs intervals (e.g., daily vs. monthly)
print 'Regrid OBS dataset onto the ', regridOption, ' grid system: ngrdY, ngrdX, nstOBSs= ', ngrdY, ngrdX, nstOBSs
print 'For dataset: %s' % obsList[n]
tmpOBS = ma.zeros((nstOBSs, ngrdY, ngrdX))
print 'tmpOBS shape = ', tmpOBS.shape
for t in np.arange(nstOBSs):
tmpOBS[t, :, :] = process.do_regrid(oData[t, :, :], oLats, oLons, lats, lons)
# TODO: Not sure this is needed with Python Garbage Collector
# The used memory should be freed when the objects are no longer referenced. If this continues to be an issue we may need to look
# at using generators where possible.
oLats = 0.0
oLons = 0.0 # release the memory occupied by the temporary variables oLats and oLons.
# temporally regrid the spatially regridded obs data
oData, newObsTimes = process.calc_average_on_new_time_unit_K(tmpOBS, oTimes, unit=timeRegridOption)
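# oData now holds one time level per aggregation period (e.g. one per month for 'monthly'),
# and newObsTimes holds the matching datetime values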
tmpOBS = 0.0
# check the consistency of temporally regridded obs data
if n == 0:
oldObsTimes = newObsTimes
else:
if oldObsTimes != newObsTimes:
print 'temporally regridded obs data time levels do not match at ', n - 1, n
print '%s Time through Loop' % (n + 1)
print 'oldObsTimes Count: %s' % len(oldObsTimes)
print 'newObsTimes Count: %s' % len(newObsTimes)
# TODO: We need to handle these cases using Try Except Blocks or insert a sys.exit if appropriate
sys.exit()
else:
oldObsTimes = newObsTimes
# if everything's fine, append the spatially and temporally regridded data to the obs data list (regObsData)
regObsData.append(oData)
""" all obs datasets have been read-in and regridded. convert the regridded obs data from 'list' to 'array'
also finalize 'obsTimes', the time coordinate values of the regridded obs data.
NOTE: using 'list_to_array' assigns values to the missing points; this has become a problem in handling the CRU data.
this problem disappears by using 'ma.array'."""
obsData = ma.array(regObsData)
obsTimes = newObsTimes
regObsData = 0
oldObsTimes = 0
nT = len(obsTimes)
# TODO: Refactor this into a function within the toolkit module
# compute the simple multi-obs ensemble if multiple obs are used
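# the ensemble is the unweighted mean over all obs datasets; it is appended as one extra entry
# (and labeled 'ENS-OBS' below) so downstream metrics can treat it like any other obs dataset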
if numOBSs > 1:
print 'numOBSs = ', numOBSs
oData = obsData
print 'oData shape = ', oData.shape
obsData = ma.zeros((numOBSs + 1, nT, ngrdY, ngrdX))
print 'obsData shape = ', obsData.shape
avg = ma.zeros((nT, ngrdY, ngrdX))
for i in np.arange(numOBSs):
obsData[i, :, :, :] = oData[i, :, :, :]
avg[:, :, :] = avg[:, :, :] + oData[i, :, :, :]
avg = avg / float(numOBSs)
obsData[numOBSs, :, :, :] = avg[:, :, :] # store the model-ensemble data
numOBSs = numOBSs + 1 # update the number of obs data to include the model ensemble
obsList.append('ENS-OBS')
print 'OBS regridded: ', obsData.shape
## Part 2: load in and regrid model data from file(s)
## NOTE: the two parameters, numMDLs and numMOmx, are defined to represent the number of models (w/ all 240 mos) &
## the total number of months, respectively, in later multi-model calculations.
typeF = 'nc'
regridMdlData = []
# extract the model names and store them in the list 'mdlName'
mdlName = []
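# Derive a short display name for each model by stripping the longest common prefix (typically the
# shared directory path) and the longest common suffix (typically the shared file-name ending) from
# the file names, e.g. (hypothetical) '/data/wrf_prec.nc' and '/data/ens_prec.nc' -> 'wrf' and 'ens'.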
mdlListReversed = []
if len(mdlList) > 1:
for element in mdlList:
mdlListReversed.append(element[::-1])
prefix = os.path.commonprefix(mdlList)
postfix = os.path.commonprefix(mdlListReversed)[::-1]
for element in mdlList:
mdlName.append(element.replace(prefix, '').replace(postfix, ''))
else:
mdlName.append('model')
for n in np.arange(numMDLs):
# read model grid info, then model data
ifile = mdlList[n]
print 'ifile= ', ifile
modelLats, modelLons, mTimes = files.read_lolaT_from_file(ifile, modelLatVarName, modelLonVarName, modelTimeVarName, typeF)
mTime, mdlDat, mvUnit = files.read_data_from_one_file(ifile, modelVarName, modelTimeVarName, modelLats, typeF)
mdlT = []
mStep = len(mTimes)
for i in np.arange(mStep):
mdlT.append(mTimes[i].strftime("%Y%m"))
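# keep only the time steps inside the evaluation period; comparing the 'YYYYMM' strings works because
# the zero-padded format sorts lexicographically in time order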
wh = (np.array(mdlT) >= yymm0) & (np.array(mdlT) <= yymm1)
modelTimes = list(np.array(mTimes)[wh])
mData = mdlDat[wh, :, :]
# determine the dimension size from the model time and latitude data.
nT = len(modelTimes)
# UNUSED VARIABLES - WILL DELETE AFTER TESTING
# nmdlY=modelLats.shape[0]
# nmdlX=modelLats.shape[1]
#print 'nT, ngrdY, ngrdX = ',nT,ngrdY, ngrdX,min(modelTimes),max(modelTimes)
print ' The shape of model data to be processed= ', mData.shape, ' for the period ', min(modelTimes), max(modelTimes)
# spatial regridding of the model data
tmpMDL = ma.zeros((nT, ngrdY, ngrdX))
if regridOption != 'model':
for t in np.arange(nT):
tmpMDL[t, :, :] = process.do_regrid(mData[t, :, :], modelLats, modelLons, lats, lons)
else:
tmpMDL = mData
# temporally regrid the model data
mData, newMdlTimes = process.regrid_in_time(tmpMDL, modelTimes, unit=timeRegridOption)
tmpMDL = 0.0
# check data consistency for all models
if n == 0:
oldMdlTimes = newMdlTimes
else:
if oldMdlTimes != newMdlTimes:
print 'temporally regridded mdl data time levels do not match at ', n - 1, n
print len(oldMdlTimes), len(newMdlTimes)
sys.exit()
else:
oldMdlTimes = newMdlTimes
# if everything's fine, append the spatially and temporally regridded data to the model data list (regridMdlData)
regridMdlData.append(mData)
modelData = ma.array(regridMdlData)
modelTimes = newMdlTimes
regridMdlData = 0
oldMdlTimes = 0
newMdlTimes = 0
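# for liquid water, 1 kg m-2 equals a 1 mm layer, so a KG M-2 S-1 rate times 86400 s/day gives mm/day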
if (precipFlag == True) & (mvUnit == 'KG M-2 S-1'):
print 'convert model variable unit from mm/s to mm/day'
modelData = 86400.*modelData
# check consistency between the time levels of the model and obs data
# this is the final update of time levels: 'Times' and 'nT'
if obsTimes != modelTimes:
print 'time levels of the obs and model data are not consistent. EXIT'
print 'obsTimes'
print obsTimes
print 'modelTimes'
print modelTimes
sys.exit()
# 'Times = modelTimes = obsTimes' has been established and modelTimes and obsTimes will not be used hereafter. (de-allocated)
Times = modelTimes
nT = len(modelTimes)
modelTimes = 0
obsTimes = 0
print 'Reading and regridding model data are completed'
print 'numMDLs, modelData.shape= ', numMDLs, modelData.shape
# TODO: Do we need to make this a user supplied flag, or do we just create an ensemble ALWAYS
# TODO: Add in Kyo's code here as well
# TODO: Commented out until I can talk with Jinwon about this
# compute the simple multi-model ensemble if multiple models are evaluated
if numMDLs > 1:
mdlData = modelData
modelData = ma.zeros((numMDLs + 1, nT, ngrdY, ngrdX))
avg = ma.zeros((nT, ngrdY, ngrdX))
for i in np.arange(numMDLs):
modelData[i, :, :, :] = mdlData[i, :, :, :]
avg[:, :, :] = avg[:, :, :] + mdlData[i, :, :, :]
avg = avg / float(numMDLs)
modelData[numMDLs, :, :, :] = avg[:, :, :] # store the model-ensemble data
# THESE ARE NOT USED. WILL DELETE AFTER TESTING
# i0mdl=0
# i1mdl=numMDLs
numMDLs = numMDLs + 1 # update the number of models to include the model ensemble
mdlName.append('ENS-MODEL') # keep mdlName in step with the extra ensemble slice in modelData
print 'Eval mdl-mean timeseries for the obs periods: modelData.shape= ',modelData.shape
# GOODALE: This ensemble code should be refactored into the process.py module since its purpose is data processing
# Processing complete
print 'data_prep is completed: both obs and mdl data are re-gridded to a common analysis grid'
return numOBSs, numMDLs, nT, ngrdY, ngrdX, Times, lons, lats, obsData, modelData, obsList, mdlName
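# Illustrative call (the settings/obs/grid/model objects named here are hypothetical):
# (numOBSs, numMDLs, nT, ngrdY, ngrdX, Times, lons, lats,
#  obsData, modelData, obsList, mdlName) = prep_data(mySettings, myObsList, myGridBox, myModels)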