src/main/python/rcmes/storage/files.py - climate - Git at Google

 #
 #  Licensed to the Apache Software Foundation (ASF) under one or more
 #  contributor license agreements.  See the NOTICE file distributed with
 #  this work for additional information regarding copyright ownership.
 #  The ASF licenses this file to You under the Apache License, Version 2.0
 #  (the "License"); you may not use this file except in compliance with
 #  the License.  You may obtain a copy of the License at
 #
 #      http://www.apache.org/licenses/LICENSE-2.0
 #
 #  Unless required by applicable law or agreed to in writing, software
 #  distributed under the License is distributed on an "AS IS" BASIS,
 #  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 #  See the License for the specific language governing permissions and
 #  limitations under the License.
 #
 """
 Module for handling data input files.  Requires netCDF and Numpy be
 installed.

 This module can easily open NetCDF, HDF and Grib files.  Search the netCDF4
 documentation for a complete list of supported formats.
 """

 from os import path
 import netCDF4
 import numpy as np
 import numpy.ma as ma
 import sys

 from toolkit import process
 from utils import fortranfile
 from utils import misc


 VARIABLE_NAMES = {'time': ['time', 'times', 'date', 'dates', 'julian'],
                   'latitude': ['latitude', 'lat', 'lats', 'latitudes'],
                   'longitude': ['longitude', 'lon', 'lons', 'longitudes']
                   }


 def findunique(seq):
     keys = {}
     for e in seq:
         keys[e] = 1
     return keys.keys()

 def getVariableByType(filename, variableType):
     """
     Function that will try to return the variable from a file based on a provided
     parameter type.

     Input::
         filename - the file to inspect
         variableType - time | latitude | longitude

     Output::
         variable name OR list of all variables in the file if a single variable
         name match cannot be found.
     """
     try:
         f = netCDF4.Dataset(filename, mode='r')
     except:
         print "netCDF4Error:", sys.exc_info()[0]
         raise

     variableKeys = f.variables.keys()
     f.close()
     variableKeys = [variable.encode().lower() for variable in variableKeys]
     variableMatch = VARIABLE_NAMES[variableType]

     commonVariables = list(set(variableKeys).intersection(variableMatch))

     if len(commonVariables) == 1:
         return str(commonVariables[0])

     else:
         return variableKeys

 def getVariableRange(filename, variableName):
     """
     Function to return the min and max values of the given variable in
     the supplied filename.

     Input::
         filename - absolute path to a file
         variableName - variable whose min and max values should be returned

     Output::
         variableRange - tuple of order (variableMin, variableMax)
     """
     try:
         f = netCDF4.Dataset(filename, mode='r')
     except:
         print "netCDF4Error:", sys.exc_info()[0]
         raise

     varArray = f.variables[variableName][:]
     return (varArray.min(), varArray.max())


 def read_data_from_file_list(filelist, myvar, timeVarName, latVarName, lonVarName):
     '''
     Read in data from a list of model files into a single data structure

     Input:
        filelist - list of filenames (including path)
        myvar    - string containing name of variable to load in (as it appears in file)
     Output:
        lat, lon - 2D array of latitude and longitude values
        timestore    - list of times
        t2store  - numpy array containing data from all files

      NB. originally written specific for WRF netCDF output files
          modified to make more general (Feb 2011)

       Peter Lean July 2010
     '''

     filelist.sort()
     filename = filelist[0]
     # Crash nicely if 'filelist' is zero length
     """TODO:  Throw Error instead via try Except"""
     if len(filelist) == 0:
         print 'Error: no files have been passed to read_data_from_file_list()'
         sys.exit()

     # Open the first file in the list to:
     #    i) read in lats, lons
     #    ii) find out how many timesteps in the file
     #        (assume same ntimes in each file in list)
     #     -allows you to create an empty array to store variable data for all times
     tmp = netCDF4.Dataset(filename, mode='r')
     latsraw = tmp.variables[latVarName][:]
     lonsraw = tmp.variables[lonVarName][:]
     if(latsraw.ndim == 1):
         lon, lat = np.meshgrid(lonsraw, latsraw)
     if(latsraw.ndim == 2):
         lon = lonsraw
         lat = latsraw

     timesraw = tmp.variables[timeVarName]
     ntimes = len(timesraw)

     print 'Lats and lons read in for first file in filelist'

     # Create a single empty masked array to store model data from all files
     t2store = ma.zeros((ntimes * len(filelist), len(lat[:, 0]), len(lon[0, :])))
     timestore = ma.zeros((ntimes * len(filelist)))

     # Now load in the data for real
     #  NB. no need to reload in the latitudes and longitudes -assume invariant
     i = 0
     timesaccu = 0 # a counter for number of times stored so far in t2store
     #  NB. this method allows for missing times in data files
     #      as no assumption made that same number of times in each file...


     for i, ifile in enumerate(filelist):

         #print 'Loading data from file: ',filelist[i]
         f = netCDF4.Dataset(ifile, mode='r')
         t2raw = ma.array(f.variables[myvar][:])
         timesraw = f.variables[timeVarName]
         time = timesraw[:]
         ntimes = len(time)
         print 'file= ', i, 'ntimes= ', ntimes, filelist[i]
         print 't2raw shape: ', t2raw.shape

         # Flatten dimensions which needn't exist, i.e. level
         #   e.g. if for single level then often data have 4 dimensions, when 3 dimensions will do.
         #  Code requires data to have dimensions, (time,lat,lon)
         #    i.e. remove level dimensions
         # Remove 1d axis from the t2raw array
         # Example: t2raw.shape == (365, 180, 360 1) <maps to (time, lat, lon, height)>
         # After the squeeze you will be left with (365, 180, 360) instead
         t2tmp = t2raw.squeeze()
         # Nb. if this happens to be data for a single time only, then we just flattened it by accident
         #     lets put it back...
         if t2tmp.ndim == 2:
             t2tmp = np.expand_dims(t2tmp, 0)

         t2store[timesaccu + np.arange(ntimes), :, :] = t2tmp[:, :, :]
         timestore[timesaccu + np.arange(ntimes)] = time
         timesaccu += ntimes
         f.close()

     print 'Data read in successfully with dimensions: ', t2store.shape

     # TODO: search for duplicated entries (same time) and remove duplicates.
     # Check to see if number of unique times == number of times, if so then no problem

     if(len(np.unique(timestore)) != len(np.where(timestore != 0)[0].view())):
         print 'WARNING: Possible duplicated times'

     # Decode model times into python datetime objects. Note: timestore becomes a list (no more an array) here
     timestore, _ = process.getModelTimes(filename, timeVarName)

     # Make sure latlon grid is monotonically increasing and that the domains
     # are correct
     lat, lon, t2store = checkLatLon(lat, lon, t2store)
     data_dict = {'lats': lat, 'lons': lon, 'times': timestore, 'data': t2store}
     return data_dict

 def select_var_from_file(myfile, fmt='not set'):
     '''
      Routine to act as user interface to allow users to select variable of interest from a file.

       Input:
          myfile - filename
          fmt - (optional) specify fileformat for netCDF4 if filename suffix is non-standard

       Output:
          myvar - variable name in file

         Peter Lean  September 2010
     '''

     print fmt

     if fmt == 'not set':
         f = netCDF4.Dataset(myfile, mode='r')

     if fmt != 'not set':
         f = netCDF4.Dataset(myfile, mode='r', format=fmt)

     keylist = [key.encode().lower() for key in f.variables.keys()]

     i = 0
     for v in keylist:
         print '[', i, '] ', f.variables[v].long_name, ' (', v, ')'
         i += 1

     user_selection = raw_input('Please select variable : [0 -' + str(i - 1) + ']  ')

     myvar = keylist[int(user_selection)]

     return myvar

 def select_var_from_wrf_file(myfile):
     '''
      Routine to act as user interface to allow users to select variable of interest from a wrf netCDF file.

       Input:
          myfile - filename

       Output:
          mywrfvar - variable name in wrf file

         Peter Lean  September 2010
     '''

     f = netCDF4.Dataset(myfile, mode='r', format='NETCDF4')
     keylist = [key.encode().lower() for key in f.variables.keys()]

     i = 0
     for v in keylist:
         try:
             print '[', i, '] ', f.variables[v].description, ' (', v, ')'
         except:
             print ''

         i += 1

     user_selection = raw_input('Please select WRF variable : [0 -' + str(i - 1) + ']  ')

     mywrfvar = keylist[int(user_selection)]

     return mywrfvar

 def read_data_from_one_file(ifile, myvar, latVarName, lonVarName, timeVarName, file_type):
     """
     Purpose::
         Read in data from one file at a time

     Input::
         filelist - list of filenames (including path)
         myvar - string containing name of variable to load in (as it appears in file)s
         lonVarName - name of the Longitude Variable
         timeVarName - name of the Time Variable
         fileType - type of file we are trying to parse

      Output::
         lat, lon - 2D arrays of latitude and longitude values
         times - list of times
         t2store - numpy array containing data from the file for the requested variable
         varUnit - units for the variable given by t2store
     """
     f = netCDF4.Dataset(ifile, mode='r')
     try:
         varUnit = f.variables[myvar].units.encode().upper()
     except:
         varUnit = raw_input('Enter the model variable unit: \n> ').upper()
     t2raw = ma.array(f.variables[myvar][:])
     t2tmp = t2raw.squeeze()
     if t2tmp.ndim == 2:
         t2tmp = np.expand_dims(t2tmp, 0)

     lonsraw = f.variables[lonVarName][:]
     latsraw = f.variables[latVarName][:]
     if(latsraw.ndim == 1):
         lon, lat = np.meshgrid(lonsraw, latsraw)
     if(latsraw.ndim == 2):
         lon = lonsraw
         lat = latsraw

     f.close()
     print '  success read_data_from_one_file: VarName=', myvar, ' Shape(Full)= ', t2tmp.shape, ' Unit= ', varUnit
     timestore = process.decode_model_timesK(ifile, timeVarName, file_type)

     # Make sure latlon grid is monotonically increasing and that the domains
     # are correct
     lat, lon, t2store = checkLatLon(lat, lon, t2tmp)
     return lat, lon, timestore, t2store, varUnit

 def findTimeVariable(filename):
     """
      Function to find what the time variable is called in a model file.
         Input::
             filename - file to crack open and check for a time variable
         Output::
             timeName - name of the input file's time variable
             variableNameList - list of variable names from the input filename
     """
     try:
         f = netCDF4.Dataset(filename, mode='r')
     except:
         print("Unable to open '%s' to try and read the Time variable" % filename)
         raise

     variableNameList = [variable.encode() for variable in f.variables.keys()]
     # convert all variable names into lower case
     varNameListLowerCase = [x.lower() for x in variableNameList]

     # Use "set" types for finding common variable name from in the file and from the list of possibilities
     possibleTimeNames = set(['time', 'times', 'date', 'dates', 'julian'])

     # Use the sets to find the intersection where variable names are in possibleNames
     timeNameSet = possibleTimeNames.intersection(varNameListLowerCase)

     if len(timeNameSet) == 0:
         print("Unable to autodetect the Time Variable Name in the '%s'" % filename)
         timeName = misc.askUserForVariableName(variableNameList, targetName ="Time")

     else:
         timeName = timeNameSet.pop()

     return timeName, variableNameList


 def findLatLonVarFromFile(filename):
     """
     Function to find what the latitude and longitude variables are called in a model file.

     Input::
         -filename
     Output::
         -latVarName
         -lonVarName
         -latMin
         -latMax
         -lonMin
         -lonMax
     """
     try:
         f = netCDF4.Dataset(filename, mode='r')
     except:
         print("Unable to open '%s' to try and read the Latitude and Longitude variables" % filename)
         raise

     variableNameList = [variable.encode() for variable in f.variables.keys()]
     # convert all variable names into lower case
     varNameListLowerCase = [x.lower() for x in variableNameList]

     # Use "set" types for finding common variable name from in the file and from the list of possibilities
     possibleLatNames = set(['latitude', 'lat', 'lats', 'latitudes'])
     possibleLonNames = set(['longitude', 'lon', 'lons', 'longitudes'])

     # Use the sets to find the intersection where variable names are in possibleNames
     latNameSet = possibleLatNames.intersection(varNameListLowerCase)
     lonNameSet = possibleLonNames.intersection(varNameListLowerCase)

     if len(latNameSet) == 0 or len(lonNameSet) == 0:
         print("Unable to autodetect Latitude and/or Longitude values in the file")
         latName = misc.askUserForVariableName(variableNameList, targetName ="Latitude")
         lonName = misc.askUserForVariableName(variableNameList, targetName ="Longitude")

     else:
         latName = latNameSet.pop()
         lonName = lonNameSet.pop()

     lats = np.array(f.variables[latName][:])
     lons = np.array(f.variables[lonName][:])

     latMin = lats.min()
     latMax = lats.max()

     # Convert the lons from 0:360 into -180:180
     lons[lons > 180] = lons[lons > 180] - 360.
     lonMin = lons.min()
     lonMax = lons.max()

     return latName, lonName, latMin, latMax, lonMin, lonMax


 def read_data_from_file_list_K(filelist, myvar, timeVarName, latVarName, lonVarName, file_type):
     ##################################################################################
     # Read in data from a list of model files into a single data structure
     # Input:   filelist - list of filenames (including path)
     #          myvar    - string containing name of variable to load in (as it appears in file)
     # Output:  lat, lon - 2D array of latitude and longitude values
     #          times    - list of times
     #          t2store  - numpy array containing data from all files
     # Modified from read_data_from_file_list to read data from multiple models into a 4-D array
     # 1. The code now processes model data that completely covers the 20-yr period. Thus,
     #    all model data must have the same time levels (ntimes). Unlike in the oroginal, ntimes
     #    is fixed here.
     # 2. Because one of the model data exceeds 240 mos (243 mos), the model data must be
     #    truncated to the 240 mons using the ntimes determined from the first file.
     ##################################################################################
     filelist.sort()
     nfiles = len(filelist)
     # Crash nicely if 'filelist' is zero length
     if nfiles == 0:
         print 'Error: no files have been passed to read_data_from_file_list(): Exit'
         sys.exit()

     # Open the first file in the list to:
     #    i)  read in lats, lons
     #    ii) find out how many timesteps in the file (assume same ntimes in each file in list)
     #     -allows you to create an empty array to store variable data for all times
     tmp = netCDF4.Dataset(filelist[0], mode='r', format=file_type)
     latsraw = tmp.variables[latVarName][:]
     lonsraw = tmp.variables[lonVarName][:]
     timesraw = tmp.variables[timeVarName]

     if(latsraw.ndim == 1):
         lon, lat = np.meshgrid(lonsraw, latsraw)

     elif(latsraw.ndim == 2):
         lon = lonsraw
         lat = latsraw
     ntimes = len(timesraw); nygrd = len(lat[:, 0]); nxgrd = len(lon[0, :])

     print 'Lats and lons read in for first file in filelist'

     # Create a single empty masked array to store model data from all files
     #t2store = ma.zeros((ntimes*nfiles,nygrd,nxgrd))
     t2store = ma.zeros((nfiles, ntimes, nygrd, nxgrd))
     #timestore=ma.zeros((ntimes*nfiles))

     ## Now load in the data for real
     ##  NB. no need to reload in the latitudes and longitudes -assume invariant
     #timesaccu=0 # a counter for number of times stored so far in t2store
     #  NB. this method allows for missing times in data files
     #      as no assumption made that same number of times in each file...

     for i, ifile in enumerate(filelist):
         #print 'Loading data from file: ',filelist[i]
         f = netCDF4.Dataset(ifile, mode='r')
         t2raw = ma.array(f.variables[myvar][:])
         timesraw = f.variables[timeVarName]
         #ntimes=len(time)
         #print 'file= ',i,'ntimes= ',ntimes,filelist[i]
         ## Flatten dimensions which needn't exist, i.e. level
         ##   e.g. if for single level then often data have 4 dimensions, when 3 dimensions will do.
         ##  Code requires data to have dimensions, (time,lat,lon)
         ##    i.e. remove level dimensions
         t2tmp = t2raw.squeeze()
         ## Nb. if data happen to be for a single time, we flattened it by accident; lets put it back...
         if t2tmp.ndim == 2:
             t2tmp = np.expand_dims(t2tmp, 0)
         #t2store[timesaccu+np.arange(ntimes),:,:]=t2tmp[0:ntimes,:,:]
         t2store[i, 0:ntimes, :, :] = t2tmp[0:ntimes, :, :]
         #timestore[timesaccu+np.arange(ntimes)]=time
         #timesaccu=timesaccu+ntimes
         f.close()

     print 'Data read in successfully with dimensions: ', t2store.shape

     # Decode model times into python datetime objects. Note: timestore becomes a list (no more an array) here
     ifile = filelist[0]
     timestore, _ = process.getModelTimes(ifile, timeVarName)

     # Make sure latlon grid is monotonically increasing and that the domains
     # are correct
     lat, lon, t2store = checkLatLon(lat, lon, t2store)
     return lat, lon, timestore, t2store

 def find_latlon_ranges(filelist, lat_var_name, lon_var_name):
     # Function to return the latitude and longitude ranges of the data in a file,
     # given the identifying variable names.
     #
     #    Input:
     #            filelist - list of filenames (data is read in from first file only)
     #            lat_var_name - variable name of the 'latitude' variable
     #            lon_var_name - variable name of the 'longitude' variable
     #
     #    Output:
     #            latMin, latMax, lonMin, lonMax - self explanatory
     #
     #                    Peter Lean      March 2011

     filename = filelist[0]

     try:
         f = netCDF4.Dataset(filename, mode='r')

         lats = f.variables[lat_var_name][:]
         latMin = lats.min()
         latMax = lats.max()

         lons = f.variables[lon_var_name][:]
         lons[lons > 180] = lons[lons > 180] - 360.
         lonMin = lons.min()
         lonMax = lons.max()

         return latMin, latMax, lonMin, lonMax

     except:
         print 'Error: there was a problem with finding the latitude and longitude ranges in the file'
         print '       Please check that you specified the filename, and variable names correctly.'

         sys.exit()

 def writeBN_lola(fileName, lons, lats):
     # write a binary data file that include longitude (1-d) and latitude (1-d) values

     F = fortranfile.FortranFile(fileName, mode='w')
     ngrdY = lons.shape[0]; ngrdX = lons.shape[1]
     tmpDat = ma.zeros(ngrdX); tmpDat[:] = lons[0, :]; F.writeReals(tmpDat)
     tmpDat = ma.zeros(ngrdY); tmpDat[:] = lats[:, 0]; F.writeReals(tmpDat)
     # release temporary arrays
     tmpDat = 0
     F.close()

 def writeBNdata(fileName, numOBSs, numMDLs, nT, ngrdX, ngrdY, numSubRgn, obsData, mdlData, obsRgnAvg, mdlRgnAvg):
     #(fileName,maskOption,numOBSs,numMDLs,nT,ngrdX,ngrdY,numSubRgn,obsData,mdlData,obsRgnAvg,mdlRgnAvg):
     # write spatially- and regionally regridded data into a binary data file
     missing = -1.e26
     F = fortranfile.FortranFile(fileName, mode='w')
     # construct a data array to replace mask flag with a missing value (missing=-1.e12) for printing
     data = ma.zeros((nT, ngrdY, ngrdX))
     for m in np.arange(numOBSs):
         data[:, :, :] = obsData[m, :, :, :]; msk = data.mask
         for n in np.arange(nT):
             for j in np.arange(ngrdY):
                 for i in np.arange(ngrdX):
                     if msk[n, j, i]: data[n, j, i] = missing

         # write observed data. allowed to write only one row at a time
         tmpDat = ma.zeros(ngrdX)
         for n in np.arange(nT):
             for j in np.arange(ngrdY):
                 tmpDat[:] = data[n, j, :]
                 F.writeReals(tmpDat)

     # write model data (dep. on the number of models).
     for m in np.arange(numMDLs):
         data[:, :, :] = mdlData[m, :, :, :]
         msk = data.mask
         for n in np.arange(nT):
             for j in np.arange(ngrdY):
                 for i in np.arange(ngrdX):
                     if msk[n, j, i]:
                         data[n, j, i] = missing

         for n in np.arange(nT):
             for j in np.arange(ngrdY):
                 tmpDat[:] = data[n, j, :]
                 F.writeReals(tmpDat)

     data = 0     # release the array allocated for data
     # write data in subregions
     if numSubRgn > 0:
         print 'Also included are the time series of the means over ', numSubRgn, ' areas from obs and model data'
         tmpDat = ma.zeros(nT); print numSubRgn
         for m in np.arange(numOBSs):
             for n in np.arange(numSubRgn):
                 tmpDat[:] = obsRgnAvg[m, n, :]
                 F.writeReals(tmpDat)
         for m in np.arange(numMDLs):
             for n in np.arange(numSubRgn):
                 tmpDat[:] = mdlRgnAvg[m, n, :]
                 F.writeReals(tmpDat)
     tmpDat = 0     # release the array allocated for tmpDat
     F.close()

 def writeNCfile(fileName, numSubRgn, lons, lats, obsData, mdlData, obsRgnAvg, mdlRgnAvg, obsList, mdlList, subRegions):
     # write an output file of variables up to 3 dimensions
     # fileName: the name of the output data file
     # numSubRgn  : the number of subregions
     # lons[ngrdX]: longitude
     # lats[ngrdY]: latitudes
     # obsData[nT,ngrdY,ngrdX]: the obs time series of the entire model domain
     # mdlData[numMDLs,nT,ngrdY,ngrdX]: the mdltime series of the entire model domain
     # obsRgnAvg[numSubRgn,nT]: the obs time series for the all subregions
     # mdlRgnAvg[numMDLs,numSubRgn,nT]: the mdl time series for the all subregions
     dimO = obsData.shape[0]      # the number of obs data
     dimM = mdlData.shape[0]      # the number of mdl data
     dimT = mdlData.shape[1]      # the number of time levels
     dimY = mdlData.shape[2]      # y-dimension
     dimX = mdlData.shape[3]      # x-dimension
     dimR = obsRgnAvg.shape[1]    # the number of subregions
     f = netCDF4.Dataset(fileName, mode='w', format='NETCDF4')
     print mdlRgnAvg.shape, dimM, dimR, dimT
     #create global attributes
     f.description = ''
     # create dimensions
     print 'Creating Dimensions within the NetCDF Object...'
     f.createDimension('unity', 1)
     f.createDimension('time', dimT)
     f.createDimension('west_east', dimX)
     f.createDimension('south_north', dimY)
     f.createDimension('obs', dimO)
     f.createDimension('models', dimM)

     # create the variable (real*4) to be written in the file
     print 'Creating Variables...'
     f.createVariable('lon', 'd', ('south_north', 'west_east'))
     f.createVariable('lat', 'd', ('south_north', 'west_east'))
     f.createVariable('oDat', 'd', ('obs', 'time', 'south_north', 'west_east'))
     f.createVariable('mDat', 'd', ('models', 'time', 'south_north', 'west_east'))

     if subRegions:
         f.createDimension('regions', dimR)
         f.createVariable('oRgn', 'd', ('obs', 'regions', 'time'))
         f.createVariable('mRgn', 'd', ('models', 'regions', 'time'))
         f.variables['oRgn'].varAttName = 'Observation time series: Subregions'
         f.variables['mRgn'].varAttName = 'Model time series: Subregions'

     loadDataIntoNetCDF(f, obsList, obsData, 'Observation')
     print 'Loaded the Observations into the NetCDF'

     loadDataIntoNetCDF(f, mdlList, mdlData, 'Model')

     # create attributes and units for the variable
     print 'Creating Attributes and Units...'
     f.variables['lon'].varAttName = 'Longitudes'
     f.variables['lon'].varUnit = 'degrees East'
     f.variables['lat'].varAttName = 'Latitudes'
     f.variables['lat'].varUnit = 'degrees North'
     f.variables['oDat'].varAttName = 'Observation time series: entire domain'
     f.variables['mDat'].varAttName = 'Model time series: entire domain'

     # assign the values to the variable and write it
     f.variables['lon'][:] = lons[:]
     f.variables['lat'][:] = lats[:]
     if subRegions:
         f.variables['oRgn'][:] = obsRgnAvg[:]
         f.variables['mRgn'][:] = mdlRgnAvg[:]

     f.close()

 def writeNCfile1(fileName, numDatasets, numObs, numMdl, numSubRgn, lons, lats, allData, dataRgn, dataList, subRegions):
     # write an output file of variables up to 3 dimensions
     # fileName: the name of the output data file
     # numDatasets: the total number of datasets (numObs + numMdl)
     # numObs     : the number of obs datasets (including the obs ensemble, if exists)
     # numMdl     : the number of mdl datasets (including the mdl ensemble, if exists)
     # numSubRgn  : the number of subregions
     # lons[ngrdX]: longitude
     # lats[ngrdY]: latitudes
     # allData[numDatasets,nT,ngrdY,ngrdX]: the combined obs and model time series for the entire model domain (t,y,x)
     # dataRgn[numDatasets,numSubRgn,nT]: the obs time series for the all subregions
     # dataList   : the names of datasets
     # subRegions : sub-region info
     dimN = allData.shape[0]      # the number of all (obs + model) datasets
     dimT = allData.shape[1]      # the number of time levels
     dimY = allData.shape[2]      # y-dimension
     dimX = allData.shape[3]      # x-dimension
     dimR = dataRgn.shape[1]      # the number of subregions
     f = netCDF4.Dataset(fileName, mode='w', format='NETCDF4')
     print dataRgn.shape, dimN, dimR, dimT
     #create global attributes
     f.description = ''
     # create dimensions
     print 'Creating Dimensions within the NetCDF Object...'
     f.createDimension('unity', 1)
     f.createDimension('time', dimT)
     f.createDimension('west_east', dimX)
     f.createDimension('south_north', dimY)
     f.createDimension('data', dimN)
     # create the variable (real*4) to be written in the file
     print 'Creating Variables...'
     f.createVariable('lon', 'd', ('south_north', 'west_east'))
     f.createVariable('lat', 'd', ('south_north', 'west_east'))
     f.createVariable('dGrd', 'd', ('data', 'time', 'south_north', 'west_east'))

     if subRegions:
         f.createDimension('regions', dimR)
         f.createVariable('dRgn', 'd', ('data', 'regions', 'time'))
         f.variables['dRgn'].varAttName = 'Sub-region mean time series'

     # load the gridded data into the output file
     loadDataIntoNetCDF1(f, dataList, allData)
     print 'Loaded the gridded into the NetCDF'

     # create attributes and units for the variable
     print 'Creating Attributes and Units...'
     f.variables['lon'].varAttName = 'Longitudes'
     f.variables['lon'].varUnit = 'degrees East'
     f.variables['lat'].varAttName = 'Latitudes'
     f.variables['lat'].varUnit = 'degrees North'
     f.variables['dGrd'].varAttName = 'Gridded time series'

     # assign the values to the variable and write it
     f.variables['lon'][:] = lons[:]
     f.variables['lat'][:] = lats[:]
     if subRegions:
         f.variables['dRgn'][:] = dataRgn[:]

     f.close()

 def loadDataIntoNetCDF(fileObject, datasets, dataArray, dataType):
     """
     Input::
         fileObject - netCDF4 file object data will be loaded into
         datasets - List of dataset names
         dataArray - Multi-dimensional array of data to be loaded into the NetCDF file
         dataType - String with value of either 'Model' or 'Observation'
     Output::
         No return value.  netCDF4 file object is updated in place
     """
     datasetCount = 0
     for datasetCount, dataset in enumerate(datasets):
         if dataType.lower() == 'observation':
             datasetName = dataset.replace(' ','')
         elif dataType.lower() == 'model':
             datasetName = path.splitext(path.basename(dataset))[0]
         print "Creating variable %s" % datasetName
         fileObject.createVariable(datasetName, 'd', ('time', 'south_north', 'west_east'))
         fileObject.variables[datasetName].varAttName = 'Obseration time series: entire domain'
         print 'Loading values into %s' % datasetName
         fileObject.variables[datasetName][:] = dataArray[datasetCount,:,:,:]

 def loadDataIntoNetCDF1(fileObject, datasets, dataArray):
     """
     Input::
         fileObject - netCDF4 file object data will be loaded into
         datasets - List of dataset names
         dataArray - Multi-dimensional array of data to be loaded into the NetCDF file
     Output::
         No return value.  netCDF4 file object is updated in place
     """
     datasetCount = 0
     for datasetCount, dataset in enumerate(datasets):
         datasetName = path.splitext(path.basename(dataset))[0]
         print "Creating variable %s" % datasetName
         fileObject.createVariable(datasetName, 'd', ('time', 'south_north', 'west_east'))
         fileObject.variables[datasetName].varAttName = 'Gridded time series: entire domain'
         print 'Loading values into %s' % datasetName
         fileObject.variables[datasetName][:] = dataArray[datasetCount,:,:,:]

 def checkLatLon(latsin, lonsin, datain):
     """
     Purpose::
         Checks whether latitudes and longitudes are monotonically increasing
         within the domains [-90, 90) and [-180, 180) respectively, and rearranges the input data
         accordingly if they are not.

     Input::
         latsin - Array of latitudes read from a raw netcdf file
         lonsin - Array of longitudes read from a raw netcdf file
         datain  - Array of data values read from a raw netcdf file.
                    The shape is assumed to be (..., nLat, nLon).

     Output::
         latsout - 2D array of (rearranged) latitudes
         lonsout - 2D array of (rearranged) longitudes
         dataout - Array of (rearranged) data
     """
     # Avoid unnecessary shifting if all lons are higher than 180
     if lonsin.min() > 180:
         lonsin -= 360

     # Make sure lats and lons are monotonically increasing
     latsDecreasing = np.diff(latsin[:, 0]) < 0
     lonsDecreasing = np.diff(lonsin[0]) < 0

     # If all values are decreasing then they just need to be reversed
     latsReversed, lonsReversed = latsDecreasing.all(), lonsDecreasing.all()

     # If the lat values are unsorted then raise an exception
     if not latsReversed and latsDecreasing.any():
         raise ValueError('Latitudes must be monotonically increasing.')

     # Perform same checks now for lons
     if not lonsReversed and lonsDecreasing.any():
         raise ValueError('Longitudes must be monotonically increasing.')

     # Also check if lons go from [0, 360), and convert to [-180, 180)
     # if necessary
     lonsShifted = lonsin.max() > 180
     latsout, lonsout, dataout = latsin[:], lonsin[:], datain[:]
     # Now correct data if latlon grid needs to be shifted
     if latsReversed:
         latsout = latsout[::-1]
         dataout = dataout[..., ::-1, :]

     if lonsReversed:
         lonsout = lonsout[..., ::-1]
         dataout = dataout[..., ::-1]

     if lonsShifted:
         lat1d = latsout[:, 0]
         dataout, lon1d = shiftgrid(180, dataout, lonsout[0], start=False)
         lonsout, latsout = np.meshgrid(lon1d, lat1d)

     return latsout, lonsout, dataout

 def shiftgrid(lon0, datain, lonsin, start= True, cyclic=360.0):
     """
     Purpose::
         Shift global lat/lon grid east or west. This function is taken directly
         from the (unreleased) basemap 1.0.7 source code as version 1.0.6 does not
         currently support arrays with more than two dimensions.
         https://github.com/matplotlib/basemap

     Input::
         lon0 - starting longitude for shifted grid (ending longitude if start=False).
                lon0 must be on input grid (within the range of lonsin).
         datain - original data with longitude the right-most dimension.
         lonsin - original longitudes.
         start  - if True, lon0 represents the starting longitude of the new grid.
                  if False, lon0 is the ending longitude. Default True.
         cyclic - width of periodic domain (default 360)

     Output::
         dataout - data on shifted grid
         lonsout - lons on shifted grid
     """
     if np.fabs(lonsin[-1]-lonsin[0]-cyclic) > 1.e-4:
         # Use all data instead of raise ValueError, 'cyclic point not included'
         start_idx = 0
     else:
         # If cyclic, remove the duplicate point
         start_idx = 1
     if lon0 < lonsin[0] or lon0 > lonsin[-1]:
         raise ValueError('lon0 outside of range of lonsin')
     i0 = np.argmin(np.fabs(lonsin-lon0))
     i0_shift = len(lonsin)-i0
     if ma.isMA(datain):
         dataout  = ma.zeros(datain.shape,datain.dtype)
     else:
         dataout  = np.zeros(datain.shape,datain.dtype)
     if ma.isMA(lonsin):
         lonsout = ma.zeros(lonsin.shape,lonsin.dtype)
     else:
         lonsout = np.zeros(lonsin.shape,lonsin.dtype)
     if start:
         lonsout[0:i0_shift] = lonsin[i0:]
     else:
         lonsout[0:i0_shift] = lonsin[i0:]-cyclic
     dataout[...,0:i0_shift] = datain[...,i0:]
     if start:
         lonsout[i0_shift:] = lonsin[start_idx:i0+start_idx]+cyclic
     else:
         lonsout[i0_shift:] = lonsin[start_idx:i0+start_idx]
     dataout[...,i0_shift:] = datain[...,start_idx:i0+start_idx]
     return dataout,lonsout