tools/cdms/cdms_reader.py - sdap-nexus - Git at Google

 # Licensed to the Apache Software Foundation (ASF) under one or more
 # contributor license agreements.  See the NOTICE file distributed with
 # this work for additional information regarding copyright ownership.
 # The ASF licenses this file to You under the Apache License, Version 2.0
 # (the "License"); you may not use this file except in compliance with
 # the License.  You may obtain a copy of the License at
 #
 #   http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.

 import argparse
 import string
 from netCDF4 import Dataset, num2date
 import sys
 import datetime
 import csv
 from collections import OrderedDict
 import logging


 #TODO: Get rid of numpy errors?
 #TODO: Update big SDAP README

 LOGGER =  logging.getLogger("cdms_reader")

 def assemble_matches(filename):
     """
     Read a CDMS netCDF file and return a list of matches.

     Parameters
     ----------
     filename : str
         The CDMS netCDF file name.

     Returns
     -------
     matches : list
         List of matches, one per entry of the MatchedRecords dimension.
         Each list element is an OrderedDict keyed by netCDF group name.
         For match m, netCDF group GROUP, and group variable VARIABLE:
         matches[m][GROUP]['matchID']: MatchedRecords dimension ID for the match
         matches[m][GROUP]['GROUPID']: GROUP dim dimension ID for the record
         matches[m][GROUP][VARIABLE]: variable value
         matches[m][GROUP]['datetime']: num2date() conversion of the
             group's 'time' value using the 'time' variable's units

     Raises
     ------
     AssertionError
         If the number of groups differs from the MatchedGroups dimension.
     OSError
         If the netCDF file cannot be read (logged, then re-raised).
     """
     try:
         with Dataset(filename, 'r') as cdms_nc:
             # Explicit raise instead of an assert statement so the check
             # still runs under "python -O"; the exception type is unchanged.
             if len(cdms_nc.groups) != cdms_nc.dimensions['MatchedGroups'].size:
                 raise AssertionError(
                     "Number of groups isn't the same as MatchedGroups dimension.")

             matches = []
             matched_records = cdms_nc.dimensions['MatchedRecords'].size

             # Loop through the match IDs to assemble matches
             for match in range(matched_records):
                 match_dict = OrderedDict()
                 # Grab the data from each platform (group) in the match
                 for group_num, group in enumerate(cdms_nc.groups):
                     group_data = cdms_nc.groups[group]
                     match_dict[group] = OrderedDict()
                     match_dict[group]['matchID'] = match
                     # matchIDs[match][group_num] is the record index inside
                     # this group for the current match
                     ID = cdms_nc.variables['matchIDs'][match][group_num]
                     match_dict[group][group + 'ID'] = ID
                     for var in group_data.variables.keys():
                         match_dict[group][var] = group_data[var][ID]

                     # Create a datetime field from the timestamp
                     match_dict[group]['datetime'] = num2date(
                         match_dict[group]['time'], group_data['time'].units)
                 LOGGER.info(match_dict)
                 matches.append(match_dict)

             return matches
     except OSError:
         # IOError is an alias of OSError on Python 3; a bare raise keeps
         # the original traceback.
         LOGGER.exception("Error reading netCDF file %s", filename)
         raise

 def assemble_matches_by_primary(filename):
     """
     Read a CDMS netCDF file and return a list of matches, in which secondary data
     points are grouped together by their primary data point match.

     This function returns matches in a different order than the 'assemble_matches' function.
     In this function, all secondary data is associated with its primary match without the need
     to access multiple matches.

     Parameters
     ----------
     filename : str
         The CDMS netCDF file name.

     Returns
     -------
     matches : list
         One OrderedDict per record of the PrimaryData 'dim' dimension,
         indexed by primary record ID. Primary records that never appear
         in matchIDs keep an empty OrderedDict. Only the first group
         (treated as primary) and the second group (treated as secondary)
         are read. For primary ID p, group GROUP and group variable VARIABLE:

         matches[p][GROUP]['matchID']: list of MatchedRecords IDs for the match
         matches[p][GROUP]['GROUPID']: GROUP dim dimension ID(s) for the record
         matches[p][GROUP][VARIABLE]: variable value for the primary group;
             a list with one value per match for the secondary group
         matches[p][GROUP]['datetime']: num2date() conversion of 'time'
             (a list with one entry per match for the secondary group)

         ex. To access the first secondary time value available for a given match:
             matches[p]['SecondaryData']['time'][0]

     Raises
     ------
     AssertionError
         If the number of groups differs from the MatchedGroups dimension.
     OSError
         If the netCDF file cannot be read (logged, then re-raised).
     """
     try:
         with Dataset(filename, 'r') as cdms_nc:
             # Explicit raise instead of an assert statement so the check
             # still runs under "python -O"; the exception type is unchanged.
             if len(cdms_nc.groups) != cdms_nc.dimensions['MatchedGroups'].size:
                 raise AssertionError(
                     "Number of groups isn't the same as MatchedGroups dimension.")

             matched_records = cdms_nc.dimensions['MatchedRecords'].size
             primary_matches = cdms_nc.groups['PrimaryData'].dimensions['dim'].size
             # One independent dict per primary record. "[OrderedDict()] * n"
             # would make every slot reference the same dict object.
             matches = [OrderedDict() for _ in range(primary_matches)]

             for match in range(matched_records):
                 PID = int(cdms_nc.variables['matchIDs'][match][0])
                 entry = matches[PID]

                 for group_num, group in enumerate(cdms_nc.groups):
                     group_data = cdms_nc.groups[group]

                     if group_num == 0:  # primary
                         if group not in entry:  # initialization
                             entry[group] = OrderedDict()
                             entry[group]['matchID'] = []
                         record = entry[group]

                         record['matchID'].append(match)
                         ID = cdms_nc.variables['matchIDs'][match][group_num]
                         record[group + 'ID'] = ID

                         for var in group_data.variables.keys():
                             record[var] = group_data[var][ID]

                         record['datetime'] = num2date(
                             record['time'], group_data['time'].units)

                     elif group_num == 1:  # secondary
                         if group not in entry:  # initialization
                             entry[group] = OrderedDict()
                             entry[group]['matchID'] = []
                             entry[group][group + 'ID'] = []
                             entry[group]['datetime'] = []
                         record = entry[group]

                         record['matchID'].append(match)
                         ID = cdms_nc.variables['matchIDs'][match][group_num]
                         record[group + 'ID'].append(ID)

                         for var in group_data.variables.keys():
                             if var not in record:
                                 record[var] = []
                             record[var].append(group_data[var][ID])

                         # Convert only the time value just appended.
                         # Converting the whole list and taking element 0
                         # would repeat the first secondary's datetime for
                         # every later match of the same primary.
                         dt = num2date(record['time'][-1:],
                                       group_data['time'].units)
                         record['datetime'].append(dt[0])

             return matches
     except OSError:
         # IOError is an alias of OSError on Python 3; a bare raise keeps
         # the original traceback.
         LOGGER.exception("Error reading netCDF file %s", filename)
         raise

 def matches_to_csv(matches, csvfile):
     """
     Write the CDMS matches to a CSV file. Include a header of column names
     which are based on the group and variable names from the netCDF file.

     Parameters
     ----------
     matches : list
         The list of dictionaries containing the CDMS matches as returned from
         assemble_matches. An empty list produces an empty file.
     csvfile : str
         The name of the CSV output file.

     Raises
     ------
     OSError
         If the CSV file cannot be written (logged, then re-raised).
     """
     # Create a header for the CSV. Column names are GROUP_VARIABLE or
     # GROUP_GROUPID, taken from the first match; with no matches there is
     # nothing to derive a header from.
     header = []
     if matches:
         for group, data in matches[0].items():
             header.extend(group + "_" + name for name in data.keys())

     try:
         # newline='' lets the csv module control line endings, as its
         # documentation requires; otherwise rows gain an extra "\r" on
         # platforms that translate "\n" on write.
         with open(csvfile, 'w', newline='') as output_file:
             csv_writer = csv.writer(output_file)
             if matches:
                 csv_writer.writerow(header)
             for match in matches:
                 row = []
                 for data in match.values():
                     row.extend(data.values())
                 csv_writer.writerow(row)
     except OSError:
         # IOError is an alias of OSError on Python 3.
         LOGGER.exception("Error writing CSV file %s", csvfile)
         raise

 def get_globals(filename):
     """
     Write the CDMS global attributes to a text file. Additionally,
     within the file there will be a description of where all the different
     outputs go and how to best utilize this program.

     The text file is written next to the input, with the input's
     extension replaced by '.txt'.

     Parameters
     ----------
     filename : str
         The name of the original '.nc' input file.

     Raises
     ------
     OSError
         If the netCDF file cannot be read or the text file cannot be
         written (logged, then re-raised).
     """
     # Local import: the module's import block does not bring in os.
     import os

     description = (
         "README / cdms_reader.py Program Use and Description:\n"
         "\nThe cdms_reader.py program reads a CDMS netCDF (a NETCDF file with a matchIDs variable)\n"
         "file into memory, assembles a list of matches of primary and secondary data\n"
         "and optionally\n"
         "output the matches to a CSV file. Each matched pair contains one primary\n"
         "data record and one secondary data record.\n"
         "\nBelow, this file wil list the global attributes of the .nc (NETCDF) file.\n"
         "If you wish to see a full dump of the data from the .nc file,\n"
         "please utilize the ncdump command from NETCDF (or look at the CSV file).\n"
     )

     # Swap only the final extension. str.replace(".nc", ".txt") rewrites
     # every ".nc" in the path and, when there is none, returns the input
     # path itself, which would then be truncated by open(..., "w").
     root, extension = os.path.splitext(filename)
     if extension == ".txt":
         # Input already ends in .txt: append instead so it is not clobbered.
         root = filename
     txtName = root + ".txt"

     try:
         with Dataset(filename, "r", format="NETCDF4") as ncFile:
             with open(txtName, "w") as txt:
                 txt.write(description)
                 # End the header line so the first attribute starts on its
                 # own line like every other attribute.
                 txt.write("\nGlobal Attributes:\n")
                 for x in ncFile.ncattrs():
                     txt.write(f'\t :{x} = "{ncFile.getncattr(x)}" ;\n')
     except OSError:
         # IOError is an alias of OSError on Python 3.
         LOGGER.exception("Error reading netCDF file %s", filename)
         print("Error reading file!")
         raise

 def create_logs(user_option, logName):
     """
     Configure where the CDMS log information goes: a log file, stdout,
     or nowhere.

     Parameters
     ----------
     user_option : str
         The value of the log command-line option:
         'N' configures no logging and prints a note,
         '1' sends log records to stdout,
         anything else sends log records to the file logName; a value
         other than 'Y' additionally prints a bad-usage notice.
     logName : str
         The name of the log file to write to when logging to a file.
     """
     if user_option == 'N':
         print("** Note: No log was created **")
         return

     log_to_stdout = user_option == '1'
     if log_to_stdout:
         # prints the log contents to stdout
         destination = logging.StreamHandler(sys.stdout)
     else:
         # prints log to a .log file
         destination = logging.FileHandler(logName)

     logging.basicConfig(format='%(asctime)s %(levelname)-8s %(message)s',
                         level=logging.INFO,
                         datefmt='%Y-%m-%d %H:%M:%S',
                         handlers=[destination])

     # The integer 1 is accepted silently here, exactly as before, even
     # though the command line only ever supplies strings.
     if not log_to_stdout and user_option not in (1, 'Y'):
         print(f"** Bad usage of log option. Log will print to {logName} **")


 if __name__ == '__main__':
     # Execution:
     #     python cdms_reader.py filename
     #     OR
     #     python3 cdms_reader.py filename
     #     OR
     #     python3 cdms_reader.py filename -c -g
     #     OR
     #     python3 cdms_reader.py filename --csv --meta
     #
     # Note (For Help Try):
     #     python3 cdms_reader.py -h
     #     OR
     #     python3 cdms_reader.py --help

     # Local import: the module's import block does not bring in os.
     import os

     u0 = '\n%(prog)s -h OR --help \n'
     u1 = '%(prog)s filename -c -g\n%(prog)s filename --csv --meta\n'
     u2 ='Use -l OR -l1 to modify destination of logs'
     p = argparse.ArgumentParser(usage= u0 + u1 + u2)

     #below block is to customize user options
     p.add_argument('filename', help='CDMS netCDF file to read')
     p.add_argument('-c', '--csv', nargs='?', const= 'Y', default='N',
      help='Use -c or --csv to retrieve CSV output')
     p.add_argument('-g', '--meta', nargs='?', const='Y', default='N',
      help='Use -g or --meta to retrieve global attributes / metadata')
     p.add_argument('-l', '--log', nargs='?', const='N', default='Y',
      help='Use -l or --log to AVOID creating log files, OR use -l1 to print to stdout/console')

     #arguments are processed by the next line
     args = p.parse_args()

     # Build output names from the input path minus its final extension.
     # str.replace(".nc", ...) returns the input path unchanged when it
     # contains no ".nc", so the log handler would append to, and the CSV
     # writer would overwrite, the input file itself.
     output_base, input_extension = os.path.splitext(args.filename)
     if input_extension in ('.log', '.csv'):
         # Keep the full name so no output path can equal the input path.
         output_base = args.filename

     logName = output_base + ".log"
     create_logs(args.log, logName)

     cdms_matches = assemble_matches(args.filename)

     if args.csv == 'Y':
         matches_to_csv(cdms_matches, output_base + ".csv")

     if args.meta == 'Y':
         get_globals(args.filename)
	# Licensed to the Apache Software Foundation (ASF) under one or more
	# contributor license agreements. See the NOTICE file distributed with
	# this work for additional information regarding copyright ownership.
	# The ASF licenses this file to You under the Apache License, Version 2.0
	# (the "License"); you may not use this file except in compliance with
	# the License. You may obtain a copy of the License at
	#
	# http://www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing, software
	# distributed under the License is distributed on an "AS IS" BASIS,
	# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	# See the License for the specific language governing permissions and
	# limitations under the License.

	import argparse
	import string
	from netCDF4 import Dataset, num2date
	import sys
	import datetime
	import csv
	from collections import OrderedDict
	import logging


	#TODO: Get rid of numpy errors?
	#TODO: Update big SDAP README

	LOGGER = logging.getLogger("cdms_reader")

	def assemble_matches(filename):
	    """
	    Read a CDMS netCDF file and return a list of matches.

	    Parameters
	    ----------
	    filename : str
	        The CDMS netCDF file name.

	    Returns
	    -------
	    matches : list
	        List of matches, one per entry of the MatchedRecords dimension.
	        Each list element is an OrderedDict keyed by netCDF group name.
	        For match m, netCDF group GROUP, and group variable VARIABLE:
	        matches[m][GROUP]['matchID']: MatchedRecords dimension ID for the match
	        matches[m][GROUP]['GROUPID']: GROUP dim dimension ID for the record
	        matches[m][GROUP][VARIABLE]: variable value
	        matches[m][GROUP]['datetime']: num2date() conversion of the
	            group's 'time' value using the 'time' variable's units

	    Raises
	    ------
	    AssertionError
	        If the number of groups differs from the MatchedGroups dimension.
	    OSError
	        If the netCDF file cannot be read (logged, then re-raised).
	    """
	    try:
	        with Dataset(filename, 'r') as cdms_nc:
	            # Explicit raise instead of an assert statement so the check
	            # still runs under "python -O"; the exception type is unchanged.
	            if len(cdms_nc.groups) != cdms_nc.dimensions['MatchedGroups'].size:
	                raise AssertionError(
	                    "Number of groups isn't the same as MatchedGroups dimension.")

	            matches = []
	            matched_records = cdms_nc.dimensions['MatchedRecords'].size

	            # Loop through the match IDs to assemble matches
	            for match in range(matched_records):
	                match_dict = OrderedDict()
	                # Grab the data from each platform (group) in the match
	                for group_num, group in enumerate(cdms_nc.groups):
	                    group_data = cdms_nc.groups[group]
	                    match_dict[group] = OrderedDict()
	                    match_dict[group]['matchID'] = match
	                    # matchIDs[match][group_num] is the record index inside
	                    # this group for the current match
	                    ID = cdms_nc.variables['matchIDs'][match][group_num]
	                    match_dict[group][group + 'ID'] = ID
	                    for var in group_data.variables.keys():
	                        match_dict[group][var] = group_data[var][ID]

	                    # Create a datetime field from the timestamp
	                    match_dict[group]['datetime'] = num2date(
	                        match_dict[group]['time'], group_data['time'].units)
	                LOGGER.info(match_dict)
	                matches.append(match_dict)

	            return matches
	    except OSError:
	        # IOError is an alias of OSError on Python 3; a bare raise keeps
	        # the original traceback.
	        LOGGER.exception("Error reading netCDF file %s", filename)
	        raise

	def assemble_matches_by_primary(filename):
	    """
	    Read a CDMS netCDF file and return a list of matches, in which secondary data
	    points are grouped together by their primary data point match.

	    This function returns matches in a different order than the 'assemble_matches' function.
	    In this function, all secondary data is associated with its primary match without the need
	    to access multiple matches.

	    Parameters
	    ----------
	    filename : str
	        The CDMS netCDF file name.

	    Returns
	    -------
	    matches : list
	        One OrderedDict per record of the PrimaryData 'dim' dimension,
	        indexed by primary record ID. Primary records that never appear
	        in matchIDs keep an empty OrderedDict. Only the first group
	        (treated as primary) and the second group (treated as secondary)
	        are read. For primary ID p, group GROUP and group variable VARIABLE:

	        matches[p][GROUP]['matchID']: list of MatchedRecords IDs for the match
	        matches[p][GROUP]['GROUPID']: GROUP dim dimension ID(s) for the record
	        matches[p][GROUP][VARIABLE]: variable value for the primary group;
	            a list with one value per match for the secondary group
	        matches[p][GROUP]['datetime']: num2date() conversion of 'time'
	            (a list with one entry per match for the secondary group)

	        ex. To access the first secondary time value available for a given match:
	            matches[p]['SecondaryData']['time'][0]

	    Raises
	    ------
	    AssertionError
	        If the number of groups differs from the MatchedGroups dimension.
	    OSError
	        If the netCDF file cannot be read (logged, then re-raised).
	    """
	    try:
	        with Dataset(filename, 'r') as cdms_nc:
	            # Explicit raise instead of an assert statement so the check
	            # still runs under "python -O"; the exception type is unchanged.
	            if len(cdms_nc.groups) != cdms_nc.dimensions['MatchedGroups'].size:
	                raise AssertionError(
	                    "Number of groups isn't the same as MatchedGroups dimension.")

	            matched_records = cdms_nc.dimensions['MatchedRecords'].size
	            primary_matches = cdms_nc.groups['PrimaryData'].dimensions['dim'].size
	            # One independent dict per primary record. "[OrderedDict()] * n"
	            # would make every slot reference the same dict object.
	            matches = [OrderedDict() for _ in range(primary_matches)]

	            for match in range(matched_records):
	                PID = int(cdms_nc.variables['matchIDs'][match][0])
	                entry = matches[PID]

	                for group_num, group in enumerate(cdms_nc.groups):
	                    group_data = cdms_nc.groups[group]

	                    if group_num == 0:  # primary
	                        if group not in entry:  # initialization
	                            entry[group] = OrderedDict()
	                            entry[group]['matchID'] = []
	                        record = entry[group]

	                        record['matchID'].append(match)
	                        ID = cdms_nc.variables['matchIDs'][match][group_num]
	                        record[group + 'ID'] = ID

	                        for var in group_data.variables.keys():
	                            record[var] = group_data[var][ID]

	                        record['datetime'] = num2date(
	                            record['time'], group_data['time'].units)

	                    elif group_num == 1:  # secondary
	                        if group not in entry:  # initialization
	                            entry[group] = OrderedDict()
	                            entry[group]['matchID'] = []
	                            entry[group][group + 'ID'] = []
	                            entry[group]['datetime'] = []
	                        record = entry[group]

	                        record['matchID'].append(match)
	                        ID = cdms_nc.variables['matchIDs'][match][group_num]
	                        record[group + 'ID'].append(ID)

	                        for var in group_data.variables.keys():
	                            if var not in record:
	                                record[var] = []
	                            record[var].append(group_data[var][ID])

	                        # Convert only the time value just appended.
	                        # Converting the whole list and taking element 0
	                        # would repeat the first secondary's datetime for
	                        # every later match of the same primary.
	                        dt = num2date(record['time'][-1:],
	                                      group_data['time'].units)
	                        record['datetime'].append(dt[0])

	            return matches
	    except OSError:
	        # IOError is an alias of OSError on Python 3; a bare raise keeps
	        # the original traceback.
	        LOGGER.exception("Error reading netCDF file %s", filename)
	        raise

	def matches_to_csv(matches, csvfile):
	    """
	    Write the CDMS matches to a CSV file. Include a header of column names
	    which are based on the group and variable names from the netCDF file.

	    Parameters
	    ----------
	    matches : list
	        The list of dictionaries containing the CDMS matches as returned from
	        assemble_matches. An empty list produces an empty file.
	    csvfile : str
	        The name of the CSV output file.

	    Raises
	    ------
	    OSError
	        If the CSV file cannot be written (logged, then re-raised).
	    """
	    # Create a header for the CSV. Column names are GROUP_VARIABLE or
	    # GROUP_GROUPID, taken from the first match; with no matches there is
	    # nothing to derive a header from.
	    header = []
	    if matches:
	        for group, data in matches[0].items():
	            header.extend(group + "_" + name for name in data.keys())

	    try:
	        # newline='' lets the csv module control line endings, as its
	        # documentation requires; otherwise rows gain an extra "\r" on
	        # platforms that translate "\n" on write.
	        with open(csvfile, 'w', newline='') as output_file:
	            csv_writer = csv.writer(output_file)
	            if matches:
	                csv_writer.writerow(header)
	            for match in matches:
	                row = []
	                for data in match.values():
	                    row.extend(data.values())
	                csv_writer.writerow(row)
	    except OSError:
	        # IOError is an alias of OSError on Python 3.
	        LOGGER.exception("Error writing CSV file %s", csvfile)
	        raise

	def get_globals(filename):
	    """
	    Write the CDMS global attributes to a text file. Additionally,
	    within the file there will be a description of where all the different
	    outputs go and how to best utilize this program.

	    The text file is written next to the input, with the input's
	    extension replaced by '.txt'.

	    Parameters
	    ----------
	    filename : str
	        The name of the original '.nc' input file.

	    Raises
	    ------
	    OSError
	        If the netCDF file cannot be read or the text file cannot be
	        written (logged, then re-raised).
	    """
	    # Local import: the module's import block does not bring in os.
	    import os

	    description = (
	        "README / cdms_reader.py Program Use and Description:\n"
	        "\nThe cdms_reader.py program reads a CDMS netCDF (a NETCDF file with a matchIDs variable)\n"
	        "file into memory, assembles a list of matches of primary and secondary data\n"
	        "and optionally\n"
	        "output the matches to a CSV file. Each matched pair contains one primary\n"
	        "data record and one secondary data record.\n"
	        "\nBelow, this file wil list the global attributes of the .nc (NETCDF) file.\n"
	        "If you wish to see a full dump of the data from the .nc file,\n"
	        "please utilize the ncdump command from NETCDF (or look at the CSV file).\n"
	    )

	    # Swap only the final extension. str.replace(".nc", ".txt") rewrites
	    # every ".nc" in the path and, when there is none, returns the input
	    # path itself, which would then be truncated by open(..., "w").
	    root, extension = os.path.splitext(filename)
	    if extension == ".txt":
	        # Input already ends in .txt: append instead so it is not clobbered.
	        root = filename
	    txtName = root + ".txt"

	    try:
	        with Dataset(filename, "r", format="NETCDF4") as ncFile:
	            with open(txtName, "w") as txt:
	                txt.write(description)
	                # End the header line so the first attribute starts on its
	                # own line like every other attribute.
	                txt.write("\nGlobal Attributes:\n")
	                for x in ncFile.ncattrs():
	                    txt.write(f'\t :{x} = "{ncFile.getncattr(x)}" ;\n')
	    except OSError:
	        # IOError is an alias of OSError on Python 3.
	        LOGGER.exception("Error reading netCDF file %s", filename)
	        print("Error reading file!")
	        raise

	def create_logs(user_option, logName):
	    """
	    Configure where the CDMS log information goes: a log file, stdout,
	    or nowhere.

	    Parameters
	    ----------
	    user_option : str
	        The value of the log command-line option:
	        'N' configures no logging and prints a note,
	        '1' sends log records to stdout,
	        anything else sends log records to the file logName; a value
	        other than 'Y' additionally prints a bad-usage notice.
	    logName : str
	        The name of the log file to write to when logging to a file.
	    """
	    if user_option == 'N':
	        print(" Note: No log was created ")
	        return

	    log_to_stdout = user_option == '1'
	    if log_to_stdout:
	        # prints the log contents to stdout
	        destination = logging.StreamHandler(sys.stdout)
	    else:
	        # prints log to a .log file
	        destination = logging.FileHandler(logName)

	    logging.basicConfig(format='%(asctime)s %(levelname)-8s %(message)s',
	                        level=logging.INFO,
	                        datefmt='%Y-%m-%d %H:%M:%S',
	                        handlers=[destination])

	    # The integer 1 is accepted silently here, exactly as before, even
	    # though the command line only ever supplies strings.
	    if not log_to_stdout and user_option not in (1, 'Y'):
	        print(f" Bad usage of log option. Log will print to {logName} ")




	if __name__ == '__main__':
	    # Execution:
	    #     python cdms_reader.py filename
	    #     OR
	    #     python3 cdms_reader.py filename
	    #     OR
	    #     python3 cdms_reader.py filename -c -g
	    #     OR
	    #     python3 cdms_reader.py filename --csv --meta
	    #
	    # Note (For Help Try):
	    #     python3 cdms_reader.py -h
	    #     OR
	    #     python3 cdms_reader.py --help

	    # Local import: the module's import block does not bring in os.
	    import os

	    u0 = '\n%(prog)s -h OR --help \n'
	    u1 = '%(prog)s filename -c -g\n%(prog)s filename --csv --meta\n'
	    u2 ='Use -l OR -l1 to modify destination of logs'
	    p = argparse.ArgumentParser(usage= u0 + u1 + u2)

	    #below block is to customize user options
	    p.add_argument('filename', help='CDMS netCDF file to read')
	    p.add_argument('-c', '--csv', nargs='?', const= 'Y', default='N',
	     help='Use -c or --csv to retrieve CSV output')
	    p.add_argument('-g', '--meta', nargs='?', const='Y', default='N',
	     help='Use -g or --meta to retrieve global attributes / metadata')
	    p.add_argument('-l', '--log', nargs='?', const='N', default='Y',
	     help='Use -l or --log to AVOID creating log files, OR use -l1 to print to stdout/console')

	    #arguments are processed by the next line
	    args = p.parse_args()

	    # Build output names from the input path minus its final extension.
	    # str.replace(".nc", ...) returns the input path unchanged when it
	    # contains no ".nc", so the log handler would append to, and the CSV
	    # writer would overwrite, the input file itself.
	    output_base, input_extension = os.path.splitext(args.filename)
	    if input_extension in ('.log', '.csv'):
	        # Keep the full name so no output path can equal the input path.
	        output_base = args.filename

	    logName = output_base + ".log"
	    create_logs(args.log, logName)

	    cdms_matches = assemble_matches(args.filename)

	    if args.csv == 'Y':
	        matches_to_csv(cdms_matches, output_base + ".csv")

	    if args.meta == 'Y':
	        get_globals(args.filename)