"""Collection of functions used to interface with the database and to create netCDF file
"""
import os
import urllib2
import re
import numpy as np
import numpy.ma as ma
import json
import Nio
from classes import RCMED
from toolkit import process
from datetime import timedelta, datetime


def reorderXYT(lons, lats, times, values):
    """
    Re-order the values array (and the corresponding lats and lons) such that,
    when reshaped, everything is where it should be, as the DB doesn't
    necessarily return everything in order.
    """
    order = np.lexsort((lons, lats, times))
    sortedValues = np.zeros_like(values)
    sortedLats = np.zeros_like(lats)
    sortedLons = np.zeros_like(lons)
    for counter, i in enumerate(order):
        sortedValues[counter] = values[i]
        sortedLats[counter] = lats[i]
        sortedLons[counter] = lons[i]
return sortedValues, sortedLats, sortedLons
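
# Illustrative example of reorderXYT (the values are hypothetical): two points
# arriving in DB order with equal times and lats but lons [10, 5]. lexsort
# orders by time, then lat, then lon, so the lon=5 point comes first:
#   reorderXYT(np.array([10, 5]), np.array([0, 0]), np.array([0, 0]),
#              np.array([1.0, 2.0]))
#   -> (array([2., 1.]), array([0, 0]), array([5, 10]))
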
def findUnique(seq, idfun=None):
"""
Function to find unique values (used in construction of unique datetime list)
NB. order preserving
Input: seq - a list of randomly ordered values
Output: result - list of ordered values
"""
if idfun is None:
def idfun(x):
return x
    seen = {}
    result = []
for item in seq:
marker = idfun(item)
# in old Python versions:
# if seen.has_key(marker)
# but in new ones:
if marker in seen: continue
seen[marker] = 1
result.append(item)
return result
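
# Illustrative example: findUnique drops duplicates while preserving
# first-seen order, e.g.
#   findUnique([3, 1, 3, 2, 1]) -> [3, 1, 2]
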
def extractData(datasetID, paramID, latMin, latMax, lonMin, lonMax, startTime, endTime, cachedir, timestep):
"""
    Main function to extract data from the DB into numpy masked arrays.
    Input::
        datasetID, paramID: required identifiers of data in the database
        latMin, latMax, lonMin, lonMax: location range to extract data for
        startTime, endTime: python datetime objects describing the required time range
        cachedir: directory path used to store temporary cache files
        timestep: "daily" | "monthly" so we can be sure to query the RCMED properly
    Output::
        uniqueLatitudes, uniqueLongitudes: 1d numpy arrays of latitude and longitude grid values
        uniqueLevels: 1d numpy array of vertical level values
        timesUnique: list of python datetime objects describing times of returned data
        mdata: masked numpy array of data values
    """
url = RCMED.jplUrl(datasetID, paramID, latMin, latMax, lonMin, lonMax, startTime, endTime, cachedir, timestep)
    # Get the parameter's information from the parameter table
    database, timestep, realm, instrument, start_date, end_date, unit = get_param_info(url)
    # Build the directory hierarchy (inside the cachedir folder) for the cache file
    name = []
    # activity is a fixed value
    activity = "obs4cmip5"
    name.append(activity)
    # product is a fixed value
    product = "observations"
    name.append(product)
    # realm, variable, frequency and instrument come from the parameter table
    name.append(realm)
    variable = database
    name.append(variable)
    frequency = timestep
    name.append(frequency)
    data_structure = "grid"
    name.append(data_structure)
    institution = "NASA"
    name.append(institution)
    project = "RCMES"
    name.append(project)
    name.append(instrument)
    version = "v1"
    name.append(version)
    # Walk down the directory hierarchy, creating each level if it does not exist yet
    os.chdir(cachedir)
    path = os.getcwd()
    for n in name:
        path = os.path.join(path, n)
        if not os.path.exists(path):
            os.mkdir(path)
        os.chdir(path)
    # Establish the netCDF file name
    processing_level = 'L3'
    processing_version = "processing_version"  # the processing version is still unknown
    start_date = startTime.strftime("%Y%m%d")
    end_date = endTime.strftime("%Y%m%d")
    netCD_fileName = '_'.join([variable, project, processing_level, processing_version,
                               str(latMin), str(latMax), str(lonMin), str(lonMax),
                               start_date, end_date]) + '.nc'
    # If a cached netCDF file already exists, use it
    if os.path.exists(os.path.join(path, netCD_fileName)):
        latitudes, longitudes, uniqueLevels, timesUnique, mdata = read_netcdf(os.path.join(path, netCD_fileName), timestep)
    # If the netCDF file does not exist, create one
    else:
        latitudes, longitudes, uniqueLevels, timesUnique, mdata = create_netCDF(url, database, latMin, latMax, lonMin, lonMax, startTime, endTime, unit, os.path.join(path, netCD_fileName), timestep)
return latitudes, longitudes, uniqueLevels, timesUnique, mdata
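
# Example call (a sketch only -- the dataset/parameter IDs and cache path are
# hypothetical; real values come from the RCMED parameter table):
#   lats, lons, levs, times, mdata = extractData(
#       3, 36, 25.0, 45.5, -15.0, 30.0,
#       datetime(2000, 1, 1), datetime(2005, 12, 31),
#       '/tmp/rcmes_cache', 'monthly')
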
def get_param_info(url):
    '''
    Get the parameter's general information from the parameter table for a given URL.
    '''
    url = url + "&info=yes"
    result = urllib2.urlopen(url)
    datastring = result.read()
    datastring = json.loads(datastring)
    database = datastring["database"]
    timestep = datastring["timestep"]
    realm = datastring["realm"]
    instrument = datastring["instrument"]
    start_date = datastring["start_date"]
    end_date = datastring["end_date"]
    unit = datastring["units"]
    return database, timestep, realm, instrument, start_date, end_date, unit
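
# The info endpoint is expected to return a JSON object with at least the keys
# read above; an illustrative (not real) payload:
#   {"database": "TRMM", "timestep": "monthly", "realm": "atmos",
#    "instrument": "TRMM", "start_date": "1998-01-01", "end_date": "2010-12-31",
#    "units": "mm/day"}
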
def create_netCDF(url, database, latMin, latMax, lonMin, lonMax, startTime, endTime, unit, netCD_fileName, timestep):
    print 'Starting retrieval from DB (this may take several minutes)'
    result = urllib2.urlopen(url)
    datastring = result.read()
    d = re.search('data: \r\n', datastring)
    data = datastring[d.end():]
    # Split the returned payload into a list of datapoint rows
    data = data.split('\r\n')
latitudes = []
longitudes = []
levels = []
values = []
timestamps = []
    # Parse each datapoint row into the series of lists
    for i in range(len(data) - 1):  # the last row is empty, hence len(data) - 1
        row = data[i].split(',')
        latitudes.append(np.float32(row[0]))
        longitudes.append(np.float32(row[1]))
        levels.append(np.float32(row[2]))
        # timestamps are strings so we will leave them alone for now
        timestamps.append(row[3])
        values.append(np.float32(row[4]))
    # Sort a *copy* of the timestamps to find the start and end times;
    # sorting the original list in place would break the row correspondence
    # between timestamps and latitudes/longitudes/values
    time_label_list = sorted(timestamps)
    start_time_label = time_label_list[0]
    end_time_label = time_label_list[-1]
    hours = []
    timeFormat = "%Y-%m-%d %H:%M:%S"
    base_date = datetime.strptime(start_time_label, timeFormat)
    # Convert each timestamp to hours since the base date
    for t in timestamps:
        date = datetime.strptime(t, timeFormat)
        dif = date - base_date
        hours.append(dif.days * 24 + dif.seconds // 3600)
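    # e.g. with base_date 2000-01-01 00:00:00, a timestamp of
    # 2000-01-03 12:00:00 yields 60 hours (illustrative values)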
    # Generate the netCDF file in the cache directory
    print "Generating netCDF file in the cache directory...."
    netcdf = Nio.open_file(netCD_fileName, 'w')
    string = ("The netCDF file for parameter: " + database + ", latMin: " + str(latMin) +
              ", latMax: " + str(latMax) + ", lonMin: " + str(lonMin) + ", lonMax: " + str(lonMax) +
              ", startTime: " + str(start_time_label) + " and endTime: " + str(end_time_label) + ".")
netcdf.globalAttName = str(string)
netcdf.create_dimension('dim', len(latitudes))
latitude = netcdf.create_variable('lat', 'd', ('dim',))
longitude = netcdf.create_variable('lon', 'd', ('dim',))
level = netcdf.create_variable('lev', 'd', ('dim',))
time = netcdf.create_variable('time', 'd', ('dim',))
value = netcdf.create_variable('value', 'd', ('dim',))
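    # NB: all variables share the single flat 'dim' dimension -- the cache file
    # stores the raw point list returned by the DB, and read_netcdf() later
    # reshapes it onto the (time, lat, lon, level) grid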
netcdf.variables['lat'].varAttName = 'latitude'
netcdf.variables['lat'].units = 'degrees_north'
netcdf.variables['lon'].varAttName = 'longitude'
netcdf.variables['lon'].units = 'degrees_east'
netcdf.variables['time'].varAttName = 'time'
netcdf.variables['time'].units = 'hours since ' + str(start_time_label)
netcdf.variables['value'].varAttName = 'value'
netcdf.variables['value'].units = str(unit)
netcdf.variables['lev'].varAttName = 'level'
netcdf.variables['lev'].units = 'hPa'
    latitude[:] = latitudes[:]
    longitude[:] = longitudes[:]
    level[:] = levels[:]
    time[:] = hours[:]
    value[:] = values[:]
netcdf.close()
print "Data stored as netCDF file (cache file)"
latitudes, longitudes, uniqueLevels, timesUnique, mdata = read_netcdf(netCD_fileName,timestep)
return latitudes, longitudes, uniqueLevels, timesUnique, mdata


def read_netcdf(netCD_fileName, timestep):
    # Read the cached netCDF file back into arrays
    print 'Retrieving data from cache (netCDF file)'
    netcdf = Nio.open_file(netCD_fileName, 'r')
    # Get all data from the netCDF file
latitudes = netcdf.variables['lat'][:]
longitudes = netcdf.variables['lon'][:]
levels = netcdf.variables['lev'][:]
hours = netcdf.variables['time'][:]
values = netcdf.variables['value'][:]
    # Recover the base date from the time units attribute ("hours since YYYY-MM-DD HH:MM:SS")
    time_unit = netcdf.variables['time'].units
    time_unit = time_unit.split(' ')
    base_date = time_unit[2] + " " + time_unit[3]
netcdf.close()
    timeFormat = "%Y-%m-%d %H:%M:%S"
    # Time in the netCDF file is stored as hours since the base date,
    # so convert it back into date strings
    times = []
    # Convert the base date to a python datetime object
    dt = datetime.strptime(base_date, timeFormat)
    for t in range(len(hours)):
        add = dt + timedelta(hours=float(hours[t]))
        times.append(add.strftime(timeFormat))
# Make arrays of unique latitudes, longitudes, levels and times
uniqueLatitudes = np.unique(latitudes)
uniqueLongitudes = np.unique(longitudes)
uniqueLevels = np.unique(levels)
uniqueTimestamps = np.unique(times)
    # Count the unique values along each dimension (needed for reshaping)
uniqueLongitudeCount = len(uniqueLongitudes)
uniqueLatitudeCount = len(uniqueLatitudes)
uniqueLevelCount = len(uniqueLevels)
uniqueTimeCount = len(uniqueTimestamps)
values, latitudes, longitudes = reorderXYT(longitudes, latitudes, times, values)
    # Convert the unique time strings into a sorted list of python datetime objects
timesUnique = [datetime.strptime(t, timeFormat) for t in uniqueTimestamps]
timesUnique.sort()
timesUnique = process.normalizeDatetimes(timesUnique, timestep)
# Reshape arrays
latitudes = latitudes.reshape(uniqueTimeCount, uniqueLatitudeCount, uniqueLongitudeCount, uniqueLevelCount)
longitudes = longitudes.reshape(uniqueTimeCount, uniqueLatitudeCount, uniqueLongitudeCount, uniqueLevelCount)
levels = np.array(levels).reshape(uniqueTimeCount, uniqueLatitudeCount, uniqueLongitudeCount, uniqueLevelCount)
values = values.reshape(uniqueTimeCount, uniqueLatitudeCount, uniqueLongitudeCount, uniqueLevelCount)
# Flatten dimension if only single level
if uniqueLevelCount == 1:
values = values[:, :, :, 0]
latitudes = latitudes[0, :, :, 0]
longitudes = longitudes[0, :, :, 0]
    # Create masked array to deal with missing values
    # - these make functions like values.mean(), values.max() etc. ignore missing values
    mdi = -9999  # TODO: extract this value from the DB retrieval metadata
mdata = ma.masked_array(values, mask=(values == mdi))
return latitudes, longitudes, uniqueLevels, timesUnique, mdata
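
# Example of working with the returned masked array (a sketch; the cache file
# name is hypothetical):
#   lats, lons, levs, times, mdata = read_netcdf('/tmp/rcmes_cache/pcp.nc', 'monthly')
#   mdata.mean()  # masked statistics ignore the -9999 missing values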