ocw_config_runner/configuration_parsing.py - climate - Git at Google

 # Licensed to the Apache Software Foundation (ASF) under one
 # or more contributor license agreements.  See the NOTICE file
 # distributed with this work for additional information
 # regarding copyright ownership.  The ASF licenses this file
 # to you under the Apache License, Version 2.0 (the
 # "License"); you may not use this file except in compliance
 # with the License.  You may obtain a copy of the License at
 #
 #    http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing,
 # software distributed under the License is distributed on an
 # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 # KIND, either express or implied.  See the License for the
 # specific language governing permissions and limitations
 # under the License.

 import logging
 import re
 import sys

 import ocw.metrics as metrics

 import yaml

 # Apply the default root-logger configuration, then create the module-level
 # logger that the validation functions below report their findings through.
 logging.basicConfig()
 logger = logging.getLogger(__name__)

 def is_config_valid(config_data):
     """Decide whether parsed configuration data can drive an evaluation.

     The minimal-content check runs first; the well-formedness check is only
     attempted once the minimal check has passed.

     :param config_data: Dictionary of the data parsed from the supplied YAML
         configuration file.
     :type config_data: :func:`dict`

     :returns: True if both checks pass, False otherwise. An error is logged
         for the first check that fails.
     """
     checks = (
         (_valid_minimal_config,
          'Insufficient configuration file data for an evaluation'),
         (_config_is_well_formed, 'Configuration data is not well formed'),
     )
     for check, failure_message in checks:
         if not check(config_data):
             logger.error(failure_message)
             return False

     return True

 def _valid_minimal_config(config_data):
     """Check that the config holds the minimum needed to run an evaluation.

     The config must have a ``datasets`` and a ``metrics`` section. When a
     built-in unary metric is requested, ``datasets`` must list a reference
     or targets; when a built-in binary metric is requested, it must list
     both.

     :param config_data: Dictionary of the data parsed from the supplied YAML
         configuration file.
     :type config_data: :func:`dict`

     :returns: True if the minimum requirements are met, False otherwise. The
         first unmet requirement is logged as an error.
     """
     top_level_keys = config_data.keys()

     if 'datasets' not in top_level_keys:
         logger.error('No datasets specified in configuration data.')
         return False

     if 'metrics' not in top_level_keys:
         logger.error('No metrics specified in configuration data.')
         return False

     requested_metrics = config_data['metrics']

     if _contains_unary_metrics(requested_metrics):
         dataset_keys = config_data['datasets'].keys()
         # A unary metric can run on either kind of dataset.
         if not ('reference' in dataset_keys or 'targets' in dataset_keys):
             logger.error(
                 'Unary metric in configuration data requires either a reference '
                 'or target dataset to be present for evaluation. Please ensure '
                 'that your config is well formed.'
             )
             return False

     if _contains_binary_metrics(requested_metrics):
         dataset_keys = config_data['datasets'].keys()
         # A binary metric needs both kinds of dataset.
         if not ('reference' in dataset_keys and 'targets' in dataset_keys):
             logger.error(
                 'Binary metric in configuration requires both a reference '
                 'and target dataset to be present for evaluation. Please ensure '
                 'that your config is well formed.'
             )
             return False

     return True

 def _config_is_well_formed(config_data):
     """Check the structure of each section present in the configuration.

     Every section is inspected even after a problem has been found, so all
     problems are logged in one pass rather than only the first.

     :param config_data: Dictionary of the data parsed from the supplied YAML
         configuration file. The ``datasets`` and ``metrics`` keys are read
         unconditionally and must be present.
     :type config_data: :func:`dict`

     :returns: True if no problem was found, False otherwise.
     """
     is_well_formed = True
     datasets = config_data['datasets']

     if 'reference' in datasets:
         if not _valid_dataset_config_data(datasets['reference']):
             is_well_formed = False

     if 'targets' in datasets:
         targets = datasets['targets']
         if not isinstance(targets, list):
             err = (
                 'Expected to find list of target datasets but instead found '
                 'object of type {}'
             ).format(type(targets))
             logger.error(err)
             is_well_formed = False
         else:
             for target in targets:
                 if not _valid_dataset_config_data(target):
                     is_well_formed = False

     # NOTE(review): a name missing from the built-in metrics marks the config
     # as not well formed even though the message allows for user defined
     # metrics -- confirm this is intended.
     available_metrics = _fetch_built_in_metrics()
     for metric in config_data['metrics']:
         if metric not in available_metrics:
             warning = (
                 'Unable to locate metric name {} in built-in metrics. If this '
                 'is not a user defined metric then please check for potential '
                 'misspellings.'
             ).format(metric)
             # Logger.warning is the supported spelling; Logger.warn is a
             # deprecated alias.
             logger.warning(warning)
             is_well_formed = False

     if 'subregions' in config_data:
         for subregion in config_data['subregions']:
             if not _valid_subregion_config_data(subregion):
                 is_well_formed = False

     if 'plots' in config_data:
         for plot in config_data['plots']:
             if not _valid_plot_config_data(plot):
                 is_well_formed = False
             # Plots of these types need subregion information, so the config
             # must supply at least one subregion.
             elif plot['type'] in ['subregion', 'portrait']:
                 if ('subregions' not in config_data or
                         len(config_data['subregions']) < 1):
                     logger.error(
                         'Plot config that requires subregion information is present '
                         'in a config file without adequate subregion information '
                         'provided. Please ensure that you have properly supplied 1 or '
                         'more subregion config values.'
                     )
                     is_well_formed = False

     return is_well_formed

 def _contains_unary_metrics(config_metric_data):
     """Tell whether any requested metric is a built-in unary metric.

     Names are compared against the class names of the direct subclasses of
     ``metrics.UnaryMetric``.

     :param config_metric_data: Metric names listed in the configuration.

     :returns: True if at least one name matches, False otherwise.
     """
     unary_names = [
         klass.__name__ for klass in metrics.UnaryMetric.__subclasses__()
     ]
     for requested in config_metric_data:
         if requested in unary_names:
             return True
     return False

 def _contains_binary_metrics(config_metric_data):
     """Tell whether any requested metric is a built-in binary metric.

     Names are compared against the class names of the direct subclasses of
     ``metrics.BinaryMetric``.

     :param config_metric_data: Metric names listed in the configuration.

     :returns: True if at least one name matches, False otherwise.
     """
     binary_names = [
         klass.__name__ for klass in metrics.BinaryMetric.__subclasses__()
     ]
     for requested in config_metric_data:
         if requested in binary_names:
             return True
     return False

 def _fetch_built_in_metrics():
     """Collect the class names of the built-in metrics.

     :returns: List of names: the direct subclasses of ``metrics.UnaryMetric``
         first, followed by those of ``metrics.BinaryMetric``.
     """
     names = []
     for base in (metrics.UnaryMetric, metrics.BinaryMetric):
         names.extend(sub.__name__ for sub in base.__subclasses__())
     return names

 def _valid_dataset_config_data(dataset_config_data):
     """Check that a dataset entry names a known source and its required keys.

     Supported ``data_source`` values are ``local``, ``rcmed``, ``esgf`` and
     ``dap``; each has its own set of required keys. A ``local`` entry whose
     ``file_count`` is greater than 1 must also provide ``file_glob_pattern``.

     :param dataset_config_data: A single dataset entry from the configuration.
     :type dataset_config_data: :func:`dict`

     :returns: True if the entry is valid, False otherwise. The reason for a
         rejection is logged as an error.
     """
     try:
         data_source = dataset_config_data['data_source']
     except (KeyError, TypeError):
         # TypeError: the entry is not a mapping at all (e.g. None, a bare
         # string or a list), so it cannot carry a data_source either.
         logger.error('Dataset does not contain a data_source attribute.')
         return False

     if data_source == 'local':
         required_keys = {'data_source', 'file_count', 'path', 'variable'}
     elif data_source == 'rcmed':
         required_keys = {
             'dataset_id',
             'parameter_id',
             'min_lat',
             'max_lat',
             'min_lon',
             'max_lon',
             'start_time',
             'end_time',
         }
     elif data_source == 'esgf':
         required_keys = {
             'data_source',
             'dataset_id',
             'variable',
             'esgf_username',
             'esgf_password',
         }
     elif data_source == 'dap':
         required_keys = {'url', 'variable'}
     else:
         logger.error('Dataset does not contain a valid data_source location.')
         return False

     missing_keys = required_keys - set(dataset_config_data.keys())
     if missing_keys:
         # Sorted so the message is stable regardless of set ordering.
         logger.error(
             'Dataset does not contain required keys. '
             'The following keys are missing: {}'.format(
                 ', '.join(sorted(missing_keys))
             )
         )
         return False

     if data_source == 'local' and dataset_config_data['file_count'] > 1:
         # If the dataset is a multi-file dataset then we need to make sure
         # that the file glob pattern is included.
         if 'file_glob_pattern' not in dataset_config_data:
             logger.error(
                 'Multi-file local dataset is missing key: file_glob_pattern'
             )
             return False

     return True

 def _valid_plot_config_data(plot_config_data):
     """Check that a plot entry has a known type and that type's required keys.

     Supported ``type`` values are ``contour``, ``taylor``, ``subregion``,
     ``time_series`` and ``portrait``.

     :param plot_config_data: A single plot entry from the configuration.
     :type plot_config_data: :func:`dict`

     :returns: True if the entry is valid, False otherwise. The reason for a
         rejection is logged as an error.
     """
     try:
         plot_type = plot_config_data['type']
     except (KeyError, TypeError):
         # TypeError: the entry is not a mapping at all (e.g. None, a bare
         # string or a list), so it cannot carry a type either.
         logger.error('Plot config does not include a type attribute.')
         return False

     # Keys each supported plot type must provide. Matching uses == rather
     # than a dict lookup so an unhashable type value is simply "invalid".
     required_by_type = (
         ('contour', ['results_indices', 'lats', 'lons', 'output_name']),
         ('taylor', [
             'stddev_results_indices',
             'pattern_corr_results_indices',
             'output_name',
         ]),
         ('subregion', ['lats', 'lons', 'output_name']),
         ('time_series', ['temporal_boundaries']),
         ('portrait', ['metric_index', 'output_name']),
     )
     for known_type, key_names in required_by_type:
         if plot_type == known_type:
             required_keys = set(key_names)
             break
     else:
         logger.error('Invalid plot type specified.')
         return False

     missing_keys = required_keys - set(plot_config_data.keys())
     if missing_keys:
         # Sorted so the message is stable regardless of set ordering.
         logger.error(
             'Plot config does not contain required keys. '
             'The following keys are missing: {}'.format(
                 ', '.join(sorted(missing_keys))
             )
         )
         return False

     return True

 def _valid_subregion_config_data(subregion_config_data):
     """Check that a subregion entry is a list of exactly four items.

     :param subregion_config_data: A single subregion entry from the
         configuration, expected in the form
         ``[lat_min, lat_max, lon_min, lon_max]``.

     :returns: True if the entry is a list of length 4, False otherwise. A
         rejection is logged as an error.
     """
     # The length is only inspected once the entry is known to be a list.
     is_bounding_box = (
         type(subregion_config_data) is list
         and len(subregion_config_data) == 4
     )
     if not is_bounding_box:
         logger.error(
             'Subregions should be passed as a list of lists where '
             'each sub-list contains a bounding box of the form: '
             '[lat_min, lat_max, lon_min, lon_max].'
         )
     return is_bounding_box
	# Licensed to the Apache Software Foundation (ASF) under one
	# or more contributor license agreements. See the NOTICE file
	# distributed with this work for additional information
	# regarding copyright ownership. The ASF licenses this file
	# to you under the Apache License, Version 2.0 (the
	# "License"); you may not use this file except in compliance
	# with the License. You may obtain a copy of the License at
	#
	# http://www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing,
	# software distributed under the License is distributed on an
	# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
	# KIND, either express or implied. See the License for the
	# specific language governing permissions and limitations
	# under the License.

	import logging
	import re
	import sys

	import ocw.metrics as metrics

	import yaml

	# Apply the default root-logger configuration, then create the module-level
	# logger that the validation functions below report their findings through.
	logging.basicConfig()
	logger = logging.getLogger(__name__)

	def is_config_valid(config_data):
	    """Decide whether parsed configuration data can drive an evaluation.

	    The minimal-content check runs first; the well-formedness check is only
	    attempted once the minimal check has passed.

	    :param config_data: Dictionary of the data parsed from the supplied YAML
	        configuration file.
	    :type config_data: :func:`dict`

	    :returns: True if both checks pass, False otherwise. An error is logged
	        for the first check that fails.
	    """
	    checks = (
	        (_valid_minimal_config,
	         'Insufficient configuration file data for an evaluation'),
	        (_config_is_well_formed, 'Configuration data is not well formed'),
	    )
	    for check, failure_message in checks:
	        if not check(config_data):
	            logger.error(failure_message)
	            return False

	    return True

	def _valid_minimal_config(config_data):
	    """Check that the config holds the minimum needed to run an evaluation.

	    The config must have a ``datasets`` and a ``metrics`` section. When a
	    built-in unary metric is requested, ``datasets`` must list a reference
	    or targets; when a built-in binary metric is requested, it must list
	    both.

	    :param config_data: Dictionary of the data parsed from the supplied YAML
	        configuration file.
	    :type config_data: :func:`dict`

	    :returns: True if the minimum requirements are met, False otherwise. The
	        first unmet requirement is logged as an error.
	    """
	    top_level_keys = config_data.keys()

	    if 'datasets' not in top_level_keys:
	        logger.error('No datasets specified in configuration data.')
	        return False

	    if 'metrics' not in top_level_keys:
	        logger.error('No metrics specified in configuration data.')
	        return False

	    requested_metrics = config_data['metrics']

	    if _contains_unary_metrics(requested_metrics):
	        dataset_keys = config_data['datasets'].keys()
	        # A unary metric can run on either kind of dataset.
	        if not ('reference' in dataset_keys or 'targets' in dataset_keys):
	            logger.error(
	                'Unary metric in configuration data requires either a reference '
	                'or target dataset to be present for evaluation. Please ensure '
	                'that your config is well formed.'
	            )
	            return False

	    if _contains_binary_metrics(requested_metrics):
	        dataset_keys = config_data['datasets'].keys()
	        # A binary metric needs both kinds of dataset.
	        if not ('reference' in dataset_keys and 'targets' in dataset_keys):
	            logger.error(
	                'Binary metric in configuration requires both a reference '
	                'and target dataset to be present for evaluation. Please ensure '
	                'that your config is well formed.'
	            )
	            return False

	    return True

	def _config_is_well_formed(config_data):
	    """Check the structure of each section present in the configuration.

	    Every section is inspected even after a problem has been found, so all
	    problems are logged in one pass rather than only the first.

	    :param config_data: Dictionary of the data parsed from the supplied YAML
	        configuration file. The ``datasets`` and ``metrics`` keys are read
	        unconditionally and must be present.
	    :type config_data: :func:`dict`

	    :returns: True if no problem was found, False otherwise.
	    """
	    is_well_formed = True
	    datasets = config_data['datasets']

	    if 'reference' in datasets:
	        if not _valid_dataset_config_data(datasets['reference']):
	            is_well_formed = False

	    if 'targets' in datasets:
	        targets = datasets['targets']
	        if not isinstance(targets, list):
	            err = (
	                'Expected to find list of target datasets but instead found '
	                'object of type {}'
	            ).format(type(targets))
	            logger.error(err)
	            is_well_formed = False
	        else:
	            for target in targets:
	                if not _valid_dataset_config_data(target):
	                    is_well_formed = False

	    # NOTE(review): a name missing from the built-in metrics marks the config
	    # as not well formed even though the message allows for user defined
	    # metrics -- confirm this is intended.
	    available_metrics = _fetch_built_in_metrics()
	    for metric in config_data['metrics']:
	        if metric not in available_metrics:
	            warning = (
	                'Unable to locate metric name {} in built-in metrics. If this '
	                'is not a user defined metric then please check for potential '
	                'misspellings.'
	            ).format(metric)
	            # Logger.warning is the supported spelling; Logger.warn is a
	            # deprecated alias.
	            logger.warning(warning)
	            is_well_formed = False

	    if 'subregions' in config_data:
	        for subregion in config_data['subregions']:
	            if not _valid_subregion_config_data(subregion):
	                is_well_formed = False

	    if 'plots' in config_data:
	        for plot in config_data['plots']:
	            if not _valid_plot_config_data(plot):
	                is_well_formed = False
	            # Plots of these types need subregion information, so the config
	            # must supply at least one subregion.
	            elif plot['type'] in ['subregion', 'portrait']:
	                if ('subregions' not in config_data or
	                        len(config_data['subregions']) < 1):
	                    logger.error(
	                        'Plot config that requires subregion information is present '
	                        'in a config file without adequate subregion information '
	                        'provided. Please ensure that you have properly supplied 1 or '
	                        'more subregion config values.'
	                    )
	                    is_well_formed = False

	    return is_well_formed

	def _contains_unary_metrics(config_metric_data):
	    """Tell whether any requested metric is a built-in unary metric.

	    Names are compared against the class names of the direct subclasses of
	    ``metrics.UnaryMetric``.

	    :param config_metric_data: Metric names listed in the configuration.

	    :returns: True if at least one name matches, False otherwise.
	    """
	    unary_names = [
	        klass.__name__ for klass in metrics.UnaryMetric.__subclasses__()
	    ]
	    for requested in config_metric_data:
	        if requested in unary_names:
	            return True
	    return False

	def _contains_binary_metrics(config_metric_data):
	    """Tell whether any requested metric is a built-in binary metric.

	    Names are compared against the class names of the direct subclasses of
	    ``metrics.BinaryMetric``.

	    :param config_metric_data: Metric names listed in the configuration.

	    :returns: True if at least one name matches, False otherwise.
	    """
	    binary_names = [
	        klass.__name__ for klass in metrics.BinaryMetric.__subclasses__()
	    ]
	    for requested in config_metric_data:
	        if requested in binary_names:
	            return True
	    return False

	def _fetch_built_in_metrics():
	    """Collect the class names of the built-in metrics.

	    :returns: List of names: the direct subclasses of ``metrics.UnaryMetric``
	        first, followed by those of ``metrics.BinaryMetric``.
	    """
	    names = []
	    for base in (metrics.UnaryMetric, metrics.BinaryMetric):
	        names.extend(sub.__name__ for sub in base.__subclasses__())
	    return names

	def _valid_dataset_config_data(dataset_config_data):
	    """Check that a dataset entry names a known source and its required keys.

	    Supported ``data_source`` values are ``local``, ``rcmed``, ``esgf`` and
	    ``dap``; each has its own set of required keys. A ``local`` entry whose
	    ``file_count`` is greater than 1 must also provide ``file_glob_pattern``.

	    :param dataset_config_data: A single dataset entry from the configuration.
	    :type dataset_config_data: :func:`dict`

	    :returns: True if the entry is valid, False otherwise. The reason for a
	        rejection is logged as an error.
	    """
	    try:
	        data_source = dataset_config_data['data_source']
	    except (KeyError, TypeError):
	        # TypeError: the entry is not a mapping at all (e.g. None, a bare
	        # string or a list), so it cannot carry a data_source either.
	        logger.error('Dataset does not contain a data_source attribute.')
	        return False

	    if data_source == 'local':
	        required_keys = {'data_source', 'file_count', 'path', 'variable'}
	    elif data_source == 'rcmed':
	        required_keys = {
	            'dataset_id',
	            'parameter_id',
	            'min_lat',
	            'max_lat',
	            'min_lon',
	            'max_lon',
	            'start_time',
	            'end_time',
	        }
	    elif data_source == 'esgf':
	        required_keys = {
	            'data_source',
	            'dataset_id',
	            'variable',
	            'esgf_username',
	            'esgf_password',
	        }
	    elif data_source == 'dap':
	        required_keys = {'url', 'variable'}
	    else:
	        logger.error('Dataset does not contain a valid data_source location.')
	        return False

	    missing_keys = required_keys - set(dataset_config_data.keys())
	    if missing_keys:
	        # Sorted so the message is stable regardless of set ordering.
	        logger.error(
	            'Dataset does not contain required keys. '
	            'The following keys are missing: {}'.format(
	                ', '.join(sorted(missing_keys))
	            )
	        )
	        return False

	    if data_source == 'local' and dataset_config_data['file_count'] > 1:
	        # If the dataset is a multi-file dataset then we need to make sure
	        # that the file glob pattern is included.
	        if 'file_glob_pattern' not in dataset_config_data:
	            logger.error(
	                'Multi-file local dataset is missing key: file_glob_pattern'
	            )
	            return False

	    return True

	def _valid_plot_config_data(plot_config_data):
	    """Check that a plot entry has a known type and that type's required keys.

	    Supported ``type`` values are ``contour``, ``taylor``, ``subregion``,
	    ``time_series`` and ``portrait``.

	    :param plot_config_data: A single plot entry from the configuration.
	    :type plot_config_data: :func:`dict`

	    :returns: True if the entry is valid, False otherwise. The reason for a
	        rejection is logged as an error.
	    """
	    try:
	        plot_type = plot_config_data['type']
	    except (KeyError, TypeError):
	        # TypeError: the entry is not a mapping at all (e.g. None, a bare
	        # string or a list), so it cannot carry a type either.
	        logger.error('Plot config does not include a type attribute.')
	        return False

	    # Keys each supported plot type must provide. Matching uses == rather
	    # than a dict lookup so an unhashable type value is simply "invalid".
	    required_by_type = (
	        ('contour', ['results_indices', 'lats', 'lons', 'output_name']),
	        ('taylor', [
	            'stddev_results_indices',
	            'pattern_corr_results_indices',
	            'output_name',
	        ]),
	        ('subregion', ['lats', 'lons', 'output_name']),
	        ('time_series', ['temporal_boundaries']),
	        ('portrait', ['metric_index', 'output_name']),
	    )
	    for known_type, key_names in required_by_type:
	        if plot_type == known_type:
	            required_keys = set(key_names)
	            break
	    else:
	        logger.error('Invalid plot type specified.')
	        return False

	    missing_keys = required_keys - set(plot_config_data.keys())
	    if missing_keys:
	        # Sorted so the message is stable regardless of set ordering.
	        logger.error(
	            'Plot config does not contain required keys. '
	            'The following keys are missing: {}'.format(
	                ', '.join(sorted(missing_keys))
	            )
	        )
	        return False

	    return True

	def _valid_subregion_config_data(subregion_config_data):
	    """Check that a subregion entry is a list of exactly four items.

	    :param subregion_config_data: A single subregion entry from the
	        configuration, expected in the form
	        ``[lat_min, lat_max, lon_min, lon_max]``.

	    :returns: True if the entry is a list of length 4, False otherwise. A
	        rejection is logged as an error.
	    """
	    # The length is only inspected once the entry is known to be a list.
	    is_bounding_box = (
	        type(subregion_config_data) is list
	        and len(subregion_config_data) == 4
	    )
	    if not is_bounding_box:
	        logger.error(
	            'Subregions should be passed as a list of lists where '
	            'each sub-list contains a bounding box of the form: '
	            '[lat_min, lat_max, lon_min, lon_max].'
	        )
	    return is_bounding_box