product_tag: 'dlpm'
pipeline_tag: '111021_no_residency_no_mapping' # IMPORTANT: The pipeline tag has to be changed before each run to prevent record duplication.
factdata_table_name: 'factdata_test_02112022' # factdata_10202021 #factdata_hq_09222020
log:
level: 'warn' # log level for spark and app
pipeline:
config_table: '{product_tag}_{pipeline_tag}_config'
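# Illustration (assumption, not read by the pipeline): names such as
# '{product_tag}_{pipeline_tag}_config' are templates; with the tags above they are
# assumed to resolve to e.g. 'dlpm_111021_no_residency_no_mapping_config'.
# A minimal Python sketch of that substitution:
#   template = '{product_tag}_{pipeline_tag}_config'
#   name = template.format(product_tag='dlpm',
#                          pipeline_tag='111021_no_residency_no_mapping')
#   # -> 'dlpm_111021_no_residency_no_mapping_config'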
rti_transform: # This transforms request-based factdata into impression-based factdata by filling in the empty slots
default_hour: 7
default_price_cat: '1'
day_step: 2
start_day: '2020-01-01'
end_day: '2020-01-02'
new_bucket_size: 2
input_table: 'factdata_request_01012022'
# output_table is factdata_table_name
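# Illustration (assumption, not part of the config): with day_step=2 the transform is
# presumably applied over [start_day, end_day] in 2-day chunks, using default_hour and
# default_price_cat for the missing slots. A hedged sketch of that chunking:
#   from datetime import date, timedelta
#   start, end, step = date(2020, 1, 1), date(2020, 1, 2), 2
#   d = start
#   while d <= end:
#       print(d, min(d + timedelta(days=step - 1), end))  # one chunk, here 2020-01-01..2020-01-02
#       d += timedelta(days=step)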
filter: # This is for data filtering on si and region
percentile: 10 # Filters out traffic lower than 1/10 of the average traffic
output_table_name: '{product_tag}_{pipeline_tag}_tmp_area_map'
init_start_bucket: 0
bucket_size: 1000
bucket_step: 100
new_bucket_size: 10
condition: ''
new_si_list: ['a47eavw7ex',
'66bcd2720e5011e79bc8fa163e05184e',
'x0ej5xhk60kjwq',
'l03493p0r3',
'7b0d7b55ab0c11e68b7900163e3e481d',
'b6le0s4qo8',
'e351de37263311e6af7500163e291137',
'a290af82884e11e5bdec00163e291137',
'68bcd2720e5011e79bc8fa163e05184e',
'f1iprgyl13',
'w9fmyd5r0i',
'w3wx3nv9ow5i97',
'd971z9825e',
'l2d4ec6csv',
'z041bf6g4s',
'71bcd2720e5011e79bc8fa163e05184e',
'5cd1c663263511e6af7500163e291137',
'x2fpfbm8rt',
'd9jucwkpr3',
'k4werqx13k',
'j1430itab9wj3b',
'a8syykhszz',
's4z85pd1h8',
'17dd6d8098bf11e5bdec00163e291137',
'd4d7362e879511e5bdec00163e291137']
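# Illustration (assumption): init_start_bucket=0 with bucket_size=1000 and
# bucket_step=100 suggests the filter walks buckets 0..999 in batches of 100, roughly:
#   batches = [(b, b + 100) for b in range(0, 1000, 100)]  # [(0, 100), (100, 200), ..., (900, 1000)]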
time_series: # This is done on whole bucketized data
input_table_name: '{product_tag}_{pipeline_tag}_tmp_area_map'
conditions: []
yesterday: "2021-07-21" # data is used for training from -<prepare_past_days> to -1(yesterday)
prepare_past_days: 82 # this should be equal to duration.tfrecorder_reader
bucket_size: 10 # maximum number of buckets to process starting from 0
bucket_step: 1 # size of bucket batch that is processed in one iteration
output_table_name: '{product_tag}_{pipeline_tag}_tmp_ts' # name of the Hive table that keeps the cleansed and normalized data before it is written into tfrecords; overwrites the existing table
outlier_table: '{product_tag}_{pipeline_tag}_tmp_outlier'
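# Worked example (assuming the window ends at `yesterday` inclusive): with
# yesterday="2021-07-21" and prepare_past_days=82, training data covers
# 2021-05-01 .. 2021-07-21 (82 days):
#   from datetime import datetime, timedelta
#   yesterday = datetime.strptime('2021-07-21', '%Y-%m-%d')
#   first_day = yesterday - timedelta(days=82 - 1)  # -> 2021-05-01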
uckey_clustering: # This is done on the whole data, not sliced by buckets
pre_cluster_table_name: '{product_tag}_{pipeline_tag}_tmp_pre_cluster_test_12212021'
create_pre_cluster_table: True
output_table_name: '{product_tag}_{pipeline_tag}_tmp_cluster_test_12212021'
cluster_size:
number_of_virtual_clusters: 1000
cluster_dense_num_ratio_cap: 0.01
datapoints_min_th: 0.12 #was [0.15]
datapoints_th_uckeys: 0.12
datapoints_th_clusters: 0.5
popularity_norm: 0.01
popularity_th: 4
median_popularity_of_dense: 1856.2833251953125 # median impressions of dense (sparse=False) uckeys; calculated once
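# Note (assumption about how the thresholds are used): uckeys whose datapoint ratio
# falls below datapoints_th_uckeys or whose popularity falls below popularity_th are
# presumably treated as sparse and grouped into one of number_of_virtual_clusters
# virtual clusters, while dense uckeys keep their own series. One plausible assignment
# sketch (not necessarily the project's method):
#   import hashlib
#   uckey = 'example-uckey'  # hypothetical key, for illustration only
#   cluster_id = int(hashlib.md5(uckey.encode()).hexdigest(), 16) % 1000  # one of the 1000 virtual clusters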
normalization: # This is done on the whole data, not sliced by buckets
output_table_name: '{product_tag}_{pipeline_tag}_trainready_test_12212021'
columns: {
'price_cat':['1','2','3'],
'a': ['','1','2','3','4','5','6'],
'g':['','g_f','g_m','g_x'],
't':['UNKNOWN','3G','4G','WIFI','2G'],
'si':[
'a47eavw7ex',
'66bcd2720e5011e79bc8fa163e05184e',
'x0ej5xhk60kjwq',
'l03493p0r3',
'7b0d7b55ab0c11e68b7900163e3e481d',
'b6le0s4qo8',
'e351de37263311e6af7500163e291137',
'a290af82884e11e5bdec00163e291137',
'68bcd2720e5011e79bc8fa163e05184e',
'f1iprgyl13',
'w9fmyd5r0i',
'w3wx3nv9ow5i97',
'd971z9825e',
'l2d4ec6csv',
'z041bf6g4s',
'71bcd2720e5011e79bc8fa163e05184e',
'5cd1c663263511e6af7500163e291137',
'x2fpfbm8rt',
'd9jucwkpr3',
'k4werqx13k',
'j1430itab9wj3b',
'a8syykhszz',
's4z85pd1h8',
'17dd6d8098bf11e5bdec00163e291137',
'd4d7362e879511e5bdec00163e291137']
}
holidays: ['2019-11-09', '2019-11-10', '2019-11-11', '2019-11-25', '2019-11-26', '2019-11-27','2019-11-28', '2019-12-24','2019-12-25', '2019-12-26','2019-12-31', '2020-01-01', '2020-01-02', '2020-01-19','2020-01-20', '2020-01-21', '2020-01-22', '2020-01-23', '2020-01-24', '2020-01-25', '2020-02-08']
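# Illustration (assumption): the `columns` map is read as category vocabularies used
# to index/one-hot encode the categorical features, e.g. for the connection type 't':
#   t_vocab = ['UNKNOWN', '3G', '4G', 'WIFI', '2G']
#   onehot = [1 if v == 'WIFI' else 0 for v in t_vocab]  # -> [0, 0, 0, 1, 0]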
tfrecords:
tfrecords_hdfs_path: 'factdata.tfrecord.{pipeline_tag}' # HDFS location of the tfrecords; overwrites the existing files
tf_statistics_path: './tf_statistics_{pipeline_tag}.pkl'
distribution:
output_table_name: '{product_tag}_{pipeline_tag}_tmp_distribution_test_12212021'
output_detail_table_name: '{product_tag}_{pipeline_tag}_tmp_distribution_detail_test_12212021'
tfrecorder_reader:
tfrecords_local_path: './factdata.tfrecord.{pipeline_tag}' # local path of the tfrecords; overwrites the existing files
data_dir: 'data/vars'
valid_threshold: 0.0 # default=0.0, type=float, help="Series minimal length threshold (pct of data length)"
add_days: 0 # default=64, type=int, help="Add N days in a future for prediction"
start: '' # help="Effective start date. Data before the start is dropped"
end: '' # help="Effective end date. Data past the end is dropped"
corr_backoffset: 0 # default=0, type=int, help="Offset for correlation calculation"
batch_size: 155000 # batch size of examples in the tfrecord
duration: 82 # time series length; this has to be less than or equal to prepare_past_days
tf_statistics_path: './tf_statistics_{pipeline_tag}.pkl'
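# Consistency check implied by the comment on duration (values taken from this file):
#   duration, prepare_past_days = 82, 82
#   assert duration <= prepare_past_days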
trainer:
name: 's32' # default='s32', help='Model name to identify different logs/checkpoints'
hparam_set: 's32' # default='s32', help="Hyperparameters set to use (see hparams.py for available sets)"
n_models: 1 # default=1, type=int, help="Jointly train n models with different seeds"
multi_gpu: false # default=False, action='store_true', help="Use multiple GPUs for multi-model training, one GPU per model"
seed: 5 # default=5, type=int, help="Random seed"
logdir: 'data/logs' # default='data/logs', help="Directory for summary logs"
max_epoch: 250 # type=int, default=100, help="Max number of epochs"
patience: 2 # type=int, default=2, help="Early stopping: stop after N epochs without improvement. Requires do_eval=True"
train_sampling: 1.0 # type=float, default=1.0, help="Sample this percent of data for training"
eval_sampling: 1.0 # type=float, default=1.0, help="Sample this percent of data for evaluation"
eval_memsize: 15 # type=int, default=5, help="Approximate amount of available memory on GPU, used for calculation of optimal evaluation batch size"
gpu: 0 # default=0, type=int, help='GPU instance to use'
gpu_allow_growth: false # default=False, action='store_true', help='Allow to gradually increase GPU memory usage instead of grabbing all available memory at start'
save_best_model: false # default=False, action='store_true', help='Save best model during training. Requires do_eval=True'
forward_split: false # default=True, dest='forward_split', action='store_false', help='Use walk-forward split for model evaluation. Requires do_eval=True'
side_split: false # default=False, action='store_true', help='Use side split for model evaluation. Requires do_eval=True'
do_eval: false # default=True, dest='do_eval', action='store_false', help="Don't evaluate model quality during training"
write_summaries: true # default=True, dest='write_summaries', action='store_false', help="Don't Write Tensorflow summaries"
verbose: false # default=False, action='store_true', help='Print additional information during graph construction'
asgd_decay: 0.99 # type=float, help="EMA decay for averaged SGD. ASGD is not used if this is not set"
tqdm: true # default=True, dest='tqdm', action='store_false', help="Don't use tqdm for status display during training"
max_steps: 20000 # type=int, help="Stop training after max steps"
save_from_step: 100 # type=int, help="Save model on each evaluation (10 evals per epoch), starting from this step"
predict_window: 10 # default=3, type=int, help="Number of days to predict"
back_offset: 0 # don't change it.
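# Note: per the flag descriptions above, patience, save_best_model, forward_split and
# side_split all require do_eval=True; with do_eval: false here they have no effect.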
save_model:
table: '{product_tag}_{pipeline_tag}_model_stat_test_12212021'
data_dir: data/vars
ckpt_dir: data/cpt/s32
saved_dir: data/vars
model_version: 'version_{pipeline_tag}'
model_name: 'model_{product_tag}_{pipeline_tag}_test_12212021'
train_window: 60 # Should be the same as the value in hparams
elastic_search:
es_host: "10.213.37.41"
es_port: 9200
es_index: 'model_stats'
es_type: 'stat'
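# Illustration (assumption, hypothetical client code): model statistics are presumably
# pushed to the index above with an Elasticsearch client along these lines (older
# elasticsearch-py style; the document fields are made up for the sketch):
#   from elasticsearch import Elasticsearch
#   es = Elasticsearch([{'host': '10.213.37.41', 'port': 9200}])
#   es.index(index='model_stats', doc_type='stat',
#            body={'model_name': 'example', 'version': 'example'})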