"""Collection of functions used to interface with the database and to create netCDF file
"""
import os
import urllib2
import re
import numpy as np
import numpy.ma as ma
import json
import Nio
from classes import RCMED
from toolkit import process
from datetime import timedelta, datetime


def reorderXYT(lons, lats, times, values):
    """
    Re-order the values array (and the corresponding lats and lons) such that,
    when reshaped, everything is where it should be, as the DB doesn't
    necessarily return everything in order.
    """
    order = np.lexsort((lons, lats, times))
    sortedValues = np.zeros_like(values)
    sortedLats = np.zeros_like(lats)
    sortedLons = np.zeros_like(lons)
    for counter, i in enumerate(order):
        sortedValues[counter] = values[i]
        sortedLats[counter] = lats[i]
        sortedLons[counter] = lons[i]
return sortedValues, sortedLats, sortedLons
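
# Illustrative example of reorderXYT (the values are hypothetical): two points
# arriving in DB order with equal times and lats but lons [10, 5]. lexsort
# orders by time, then lat, then lon, so the lon=5 point comes first:
#   reorderXYT(np.array([10, 5]), np.array([0, 0]), np.array([0, 0]),
#              np.array([1.0, 2.0]))
#   -> (array([2., 1.]), array([0, 0]), array([5, 10]))
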
def findUnique(seq, idfun=None):
"""
Function to find unique values (used in construction of unique datetime list)
NB. order preserving
Input: seq - a list of randomly ordered values
Output: result - list of ordered values
"""
if idfun is None:
def idfun(x):
return x
    seen = {}
    result = []
for item in seq:
marker = idfun(item)
# in old Python versions:
# if seen.has_key(marker)
# but in new ones:
if marker in seen: continue
seen[marker] = 1
result.append(item)
return result
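
# Illustrative example: findUnique drops duplicates while preserving
# first-seen order, e.g.
#   findUnique([3, 1, 3, 2, 1]) -> [3, 1, 2]
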
def extractData(datasetID, paramID, latMin, latMax, lonMin, lonMax, startTime, endTime, cachedir, timestep):
"""
    Main function to extract data from the DB into numpy masked arrays.
    Input::
        datasetID, paramID: required identifiers of data in the database
        latMin, latMax, lonMin, lonMax: location range to extract data for
        startTime, endTime: python datetime objects describing the required time range
        cachedir: directory path used to store temporary cache files
        timestep: "daily" | "monthly" so we can be sure to query the RCMED properly
    Output::
        uniqueLatitudes, uniqueLongitudes: 1d numpy arrays of latitude and longitude grid values
        uniqueLevels: 1d numpy array of vertical level values
        timesUnique: list of python datetime objects describing times of returned data
        mdata: masked numpy array of data values
    """
url = RCMED.jplUrl(datasetID, paramID, latMin, latMax, lonMin, lonMax, startTime, endTime, cachedir, timestep)
    # Get the parameter's information from the parameter table
    database, timestep, realm, instrument, start_date, end_date, unit = get_param_info(url)
    # Build the directory hierarchy (inside the cachedir folder) for the cache file
    name = []
    # activity is a fixed value
    activity = "obs4cmip5"
    name.append(activity)
    # product is a fixed value
    product = "observations"
    name.append(product)
    # realm, variable, frequency and instrument come from the parameter table
    name.append(realm)
    variable = database
    name.append(variable)
    frequency = timestep
    name.append(frequency)
    data_structure = "grid"
    name.append(data_structure)
    institution = "NASA"
    name.append(institution)
    project = "RCMES"
    name.append(project)
    name.append(instrument)
    version = "v1"
    name.append(version)
    # Walk down the directory hierarchy, creating each level if it does not exist yet
    os.chdir(cachedir)
    path = os.getcwd()
    for n in name:
        path = os.path.join(path, n)
        if not os.path.exists(path):
            os.mkdir(path)
        os.chdir(path)
    # Establish the netCDF file name
    processing_level = 'L3'
    processing_version = "processing_version"  # the processing version is still unknown
    start_date = startTime.strftime("%Y%m%d")
    end_date = endTime.strftime("%Y%m%d")
    netCD_fileName = '_'.join([variable, project, processing_level, processing_version,
                               str(latMin), str(latMax), str(lonMin), str(lonMax),
                               start_date, end_date]) + '.nc'
    # If a cached netCDF file already exists, use it
    if os.path.exists(os.path.join(path, netCD_fileName)):
        latitudes, longitudes, uniqueLevels, timesUnique, mdata = read_netcdf(os.path.join(path, netCD_fileName), timestep)
    # If the netCDF file does not exist, create one
    else:
        latitudes, longitudes, uniqueLevels, timesUnique, mdata = create_netCDF(url, database, latMin, latMax, lonMin, lonMax, startTime, endTime, unit, os.path.join(path, netCD_fileName), timestep)
return latitudes, longitudes, uniqueLevels, timesUnique, mdata
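
# Example call (a sketch only -- the dataset/parameter IDs and cache path are
# hypothetical; real values come from the RCMED parameter table):
#   lats, lons, levs, times, mdata = extractData(
#       3, 36, 25.0, 45.5, -15.0, 30.0,
#       datetime(2000, 1, 1), datetime(2005, 12, 31),
#       '/tmp/rcmes_cache', 'monthly')
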
def get_param_info(url):
    '''
    Get the parameter's general information from the parameter table for a given URL.
    '''
    url = url + "&info=yes"
    result = urllib2.urlopen(url)
    datastring = result.read()
    datastring = json.loads(datastring)
    database = datastring["database"]
    timestep = datastring["timestep"]
    realm = datastring["realm"]
    instrument = datastring["instrument"]
    start_date = datastring["start_date"]
    end_date = datastring["end_date"]
    unit = datastring["units"]
    return database, timestep, realm, instrument, start_date, end_date, unit
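
# The info endpoint is expected to return a JSON object with at least the keys
# read above; an illustrative (not real) payload:
#   {"database": "TRMM", "timestep": "monthly", "realm": "atmos",
#    "instrument": "TRMM", "start_date": "1998-01-01", "end_date": "2010-12-31",
#    "units": "mm/day"}
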
def create_netCDF(url, database, latMin, latMax, lonMin, lonMax, startTime, endTime, unit, netCD_fileName, timestep):
    print 'Starting retrieval from DB (this may take several minutes)'
    result = urllib2.urlopen(url)
    datastring = result.read()
    d = re.search('data: \r\n', datastring)
    data = datastring[d.end():]
    # Split the returned payload into a list of datapoint rows
    data = data.split('\r\n')
latitudes = []
longitudes = []
levels = []
values = []
timestamps = []
    # Parse each datapoint row into the series of lists
    for i in range(len(data) - 1):  # the last row is empty, hence len(data) - 1
        row = data[i].split(',')
        latitudes.append(np.float32(row[0]))
        longitudes.append(np.float32(row[1]))
        levels.append(np.float32(row[2]))
        # timestamps are strings so we will leave them alone for now
        timestamps.append(row[3])
        values.append(np.float32(row[4]))
    # Sort a *copy* of the timestamps to find the start and end times;
    # sorting the original list in place would break the row correspondence
    # between timestamps and latitudes/longitudes/values
    time_label_list = sorted(timestamps)
    start_time_label = time_label_list[0]
    end_time_label = time_label_list[-1]
    hours = []
    timeFormat = "%Y-%m-%d %H:%M:%S"
    base_date = datetime.strptime(start_time_label, timeFormat)
    # Convert each timestamp to hours since the base date
    for t in timestamps:
        date = datetime.strptime(t, timeFormat)
        dif = date - base_date
        hours.append(dif.days * 24 + dif.seconds // 3600)
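    # e.g. with base_date 2000-01-01 00:00:00, a timestamp of
    # 2000-01-03 12:00:00 yields 60 hours (illustrative values)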
    # Generate the netCDF file in the cache directory
    print "Generating netCDF file in the cache directory...."
    netcdf = Nio.open_file(netCD_fileName, 'w')
    string = ("The netCDF file for parameter: " + database + ", latMin: " + str(latMin) +
              ", latMax: " + str(latMax) + ", lonMin: " + str(lonMin) + ", lonMax: " + str(lonMax) +
              ", startTime: " + str(start_time_label) + " and endTime: " + str(end_time_label) + ".")
netcdf.globalAttName = str(string)
netcdf.create_dimension('dim', len(latitudes))
latitude = netcdf.create_variable('lat', 'd', ('dim',))
longitude = netcdf.create_variable('lon', 'd', ('dim',))
level = netcdf.create_variable('lev', 'd', ('dim',))
time = netcdf.create_variable('time', 'd', ('dim',))
value = netcdf.create_variable('value', 'd', ('dim',))
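    # NB: all variables share the single flat 'dim' dimension -- the cache file
    # stores the raw point list returned by the DB, and read_netcdf() later
    # reshapes it onto the (time, lat, lon, level) grid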
netcdf.variables['lat'].varAttName = 'latitude'
netcdf.variables['lat'].units = 'degrees_north'
netcdf.variables['lon'].varAttName = 'longitude'
netcdf.variables['lon'].units = 'degrees_east'
netcdf.variables['time'].varAttName = 'time'
netcdf.variables['time'].units = 'hours since ' + str(start_time_label)
netcdf.variables['value'].varAttName = 'value'
netcdf.variables['value'].units = str(unit)
netcdf.variables['lev'].varAttName = 'level'
netcdf.variables['lev'].units = 'hPa'
    latitude[:] = latitudes[:]
    longitude[:] = longitudes[:]
    level[:] = levels[:]
    time[:] = hours[:]
    value[:] = values[:]
netcdf.close()
print "Data stored as netCDF file (cache file)"
latitudes, longitudes, uniqueLevels, timesUnique, mdata = read_netcdf(netCD_fileName,timestep)
return latitudes, longitudes, uniqueLevels, timesUnique, mdata


def read_netcdf(netCD_fileName, timestep):
    # Read the cached netCDF file back into arrays
    print 'Retrieving data from cache (netCDF file)'
    netcdf = Nio.open_file(netCD_fileName, 'r')
    # Get all data from the netCDF file
latitudes = netcdf.variables['lat'][:]
longitudes = netcdf.variables['lon'][:]
levels = netcdf.variables['lev'][:]
hours = netcdf.variables['time'][:]
values = netcdf.variables['value'][:]
    # Recover the base date from the time units attribute ("hours since YYYY-MM-DD HH:MM:SS")
    time_unit = netcdf.variables['time'].units
    time_unit = time_unit.split(' ')
    base_date = time_unit[2] + " " + time_unit[3]
netcdf.close()
    timeFormat = "%Y-%m-%d %H:%M:%S"
    # Time in the netCDF file is stored as hours since the base date,
    # so convert it back into date strings
    times = []
    # Convert the base date to a python datetime object
    dt = datetime.strptime(base_date, timeFormat)
    for t in range(len(hours)):
        add = dt + timedelta(hours=float(hours[t]))
        times.append(add.strftime(timeFormat))
# Make arrays of unique latitudes, longitudes, levels and times
uniqueLatitudes = np.unique(latitudes)
uniqueLongitudes = np.unique(longitudes)
uniqueLevels = np.unique(levels)
uniqueTimestamps = np.unique(times)
    # Count the unique values along each dimension (needed for reshaping)
uniqueLongitudeCount = len(uniqueLongitudes)
uniqueLatitudeCount = len(uniqueLatitudes)
uniqueLevelCount = len(uniqueLevels)
uniqueTimeCount = len(uniqueTimestamps)
values, latitudes, longitudes = reorderXYT(longitudes, latitudes, times, values)
    # Convert the unique time strings into a sorted list of python datetime objects
timesUnique = [datetime.strptime(t, timeFormat) for t in uniqueTimestamps]
timesUnique.sort()
timesUnique = process.normalizeDatetimes(timesUnique, timestep)
# Reshape arrays
latitudes = latitudes.reshape(uniqueTimeCount, uniqueLatitudeCount, uniqueLongitudeCount, uniqueLevelCount)
longitudes = longitudes.reshape(uniqueTimeCount, uniqueLatitudeCount, uniqueLongitudeCount, uniqueLevelCount)
levels = np.array(levels).reshape(uniqueTimeCount, uniqueLatitudeCount, uniqueLongitudeCount, uniqueLevelCount)
values = values.reshape(uniqueTimeCount, uniqueLatitudeCount, uniqueLongitudeCount, uniqueLevelCount)
# Flatten dimension if only single level
if uniqueLevelCount == 1:
values = values[:, :, :, 0]
latitudes = latitudes[0, :, :, 0]
longitudes = longitudes[0, :, :, 0]
    # Create masked array to deal with missing values
    # - these make functions like values.mean(), values.max() etc. ignore missing values
    mdi = -9999  # TODO: extract this value from the DB retrieval metadata
mdata = ma.masked_array(values, mask=(values == mdi))
return latitudes, longitudes, uniqueLevels, timesUnique, mdata
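
# Example of working with the returned masked array (a sketch; the cache file
# name is hypothetical):
#   lats, lons, levs, times, mdata = read_netcdf('/tmp/rcmes_cache/pcp.nc', 'monthly')
#   mdata.mean()  # masked statistics ignore the -9999 missing values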