| product_tag: 'dlpm' |
| pipeline_tag: '111021_no_residency_no_mapping' # IMPORTANT: The pipeline tag has to be changed before each run to prevent record duplication. |
| factdata_table_name: 'factdata_test_02112022' # factdata_10202021 #factdata_hq_09222020 |
| |
| log: |
| level: 'warn' # log level for spark and app |
| |
| pipeline: |
| config_table: '{product_tag}_{pipeline_tag}_config' |
| |
| rti_transform: # This is to transform request-based factdata to impression-based by filling out empty places |
| default_hour: 7 |
| default_price_cat: '1' |
| day_step: 2 |
| start_day: '2020-01-01' |
| end_day: '2020-01-02' |
| new_bucket_size: 2 |
| input_table: 'factdata_request_01012022' |
| # output_table is factdata_table_name |
| |
| filter: # This is for data filtering - si and region |
| percentile: 10 # This is for filtering traffic less than 1/10 of average traffic |
| output_table_name: '{product_tag}_{pipeline_tag}_tmp_area_map' |
| init_start_bucket: 0 |
| bucket_size: 1000 |
| bucket_step: 100 |
| new_bucket_size: 10 |
| condition: '' |
| new_si_list: ['a47eavw7ex', |
| '66bcd2720e5011e79bc8fa163e05184e', |
| 'x0ej5xhk60kjwq', |
| 'l03493p0r3', |
| '7b0d7b55ab0c11e68b7900163e3e481d', |
| 'b6le0s4qo8', |
| 'e351de37263311e6af7500163e291137', |
| 'a290af82884e11e5bdec00163e291137', |
| '68bcd2720e5011e79bc8fa163e05184e', |
| 'f1iprgyl13', |
| 'w9fmyd5r0i', |
| 'w3wx3nv9ow5i97', |
| 'd971z9825e', |
| 'l2d4ec6csv', |
| 'z041bf6g4s', |
| '71bcd2720e5011e79bc8fa163e05184e', |
| '5cd1c663263511e6af7500163e291137', |
| 'x2fpfbm8rt', |
| 'd9jucwkpr3', |
| 'k4werqx13k', |
| 'j1430itab9wj3b', |
| 'a8syykhszz', |
| 's4z85pd1h8', |
| '17dd6d8098bf11e5bdec00163e291137', |
| 'd4d7362e879511e5bdec00163e291137'] |
| |
| time_series: # This is done on whole bucketized data |
| input_table_name: '{product_tag}_{pipeline_tag}_tmp_area_map' |
| conditions: [] |
| yesterday: "2021-07-21" # data is used for training from -<prepare_past_days> to -1(yesterday) |
| prepare_past_days: 82 # this should be equal to tfrecorder_reader.duration |
| bucket_size: 10 # maximum number of buckets to process starting from 0 |
| bucket_step: 1 # size of bucket batch that is processed in one iteration |
| output_table_name: '{product_tag}_{pipeline_tag}_tmp_ts' # name of the hive table that keeps cleansed and normalized data before writing it into tfrecords, over-writes the existing table |
| outlier_table: '{product_tag}_{pipeline_tag}_tmp_outlier' |
| uckey_clustering: # This is done on whole data, not slicing on buckets |
| pre_cluster_table_name: '{product_tag}_{pipeline_tag}_tmp_pre_cluster_test_12212021' |
| create_pre_cluster_table: True |
| output_table_name: '{product_tag}_{pipeline_tag}_tmp_cluster_test_12212021' |
| cluster_size: |
| number_of_virtual_clusters: 1000 |
| cluster_dense_num_ratio_cap: 0.01 |
| datapoints_min_th: 0.12 #was [0.15] |
| datapoints_th_uckeys: 0.12 |
| datapoints_th_clusters: 0.5 |
| popularity_norm: 0.01 |
| popularity_th: 4 |
| median_popularity_of_dense: 1856.2833251953125 # median imp of sparse=False, calculate once |
| normalization: # This is done on whole data, not slicing on buckets |
| output_table_name: '{product_tag}_{pipeline_tag}_trainready_test_12212021' |
| columns: { |
| 'price_cat':['1','2','3'], |
| 'a': ['','1','2','3','4','5','6'], |
| 'g':['','g_f','g_m','g_x'], |
| 't':['UNKNOWN','3G','4G','WIFI','2G'], |
| 'si':[ |
| 'a47eavw7ex', |
| '66bcd2720e5011e79bc8fa163e05184e', |
| 'x0ej5xhk60kjwq', |
| 'l03493p0r3', |
| '7b0d7b55ab0c11e68b7900163e3e481d', |
| 'b6le0s4qo8', |
| 'e351de37263311e6af7500163e291137', |
| 'a290af82884e11e5bdec00163e291137', |
| '68bcd2720e5011e79bc8fa163e05184e', |
| 'f1iprgyl13', |
| 'w9fmyd5r0i', |
| 'w3wx3nv9ow5i97', |
| 'd971z9825e', |
| 'l2d4ec6csv', |
| 'z041bf6g4s', |
| '71bcd2720e5011e79bc8fa163e05184e', |
| '5cd1c663263511e6af7500163e291137', |
| 'x2fpfbm8rt', |
| 'd9jucwkpr3', |
| 'k4werqx13k', |
| 'j1430itab9wj3b', |
| 'a8syykhszz', |
| 's4z85pd1h8', |
| '17dd6d8098bf11e5bdec00163e291137', |
| 'd4d7362e879511e5bdec00163e291137'] |
| } |
| holidays: ['2019-11-09', '2019-11-10', '2019-11-11', '2019-11-25', '2019-11-26', '2019-11-27','2019-11-28', '2019-12-24','2019-12-25', '2019-12-26','2019-12-31', '2020-01-01', '2020-01-02', '2020-01-19','2020-01-20', '2020-01-21', '2020-01-22', '2020-01-23', '2020-01-24', '2020-01-25', '2020-02-08'] |
| tfrecords: |
| tfrecords_hdfs_path: 'factdata.tfrecord.{pipeline_tag}' # this is the HDFS location for tfrecords; overwrites the existing files |
| tf_statistics_path: './tf_statistics_{pipeline_tag}.pkl' |
| distribution: |
| output_table_name: '{product_tag}_{pipeline_tag}_tmp_distribution_test_12212021' |
| output_detail_table_name: '{product_tag}_{pipeline_tag}_tmp_distribution_detail_test_12212021' |
| |
| tfrecorder_reader: |
| tfrecords_local_path: './factdata.tfrecord.{pipeline_tag}' # this is the local path for tfrecords; overwrites the existing files |
| data_dir: 'data/vars' |
| valid_threshold: 0.0 # default=0.0, type=float, help="Series minimal length threshold (pct of data length)" |
| add_days: 0 # default=64, type=int, help="Add N days in a future for prediction" |
| start: '' # help="Effective start date. Data before the start is dropped" |
| end: '' # help="Effective end date. Data past the end is dropped" |
| corr_backoffset: 0 # default=0, type=int, help="Offset for correlation calculation" |
| batch_size: 155000 # batch size of examples in tfrecord |
| duration: 82 # time series length. This has to be less than or equal to prepare_past_days |
| tf_statistics_path: './tf_statistics_{pipeline_tag}.pkl' |
| |
| trainer: |
| name: 's32' # default='s32', help='Model name to identify different logs/checkpoints' |
| hparam_set: 's32' # default='s32', help="Hyperparameters set to use (see hparams.py for available sets)" |
| n_models: 1 # default=1, type=int, help="Jointly train n models with different seeds" |
| multi_gpu: false # default=False, action='store_true', help="Use multiple GPUs for multi-model training, one GPU per model" |
| seed: 5 # default=5, type=int, help="Random seed" |
| logdir: 'data/logs' # default='data/logs', help="Directory for summary logs" |
| max_epoch: 250 # type=int, default=100, help="Max number of epochs" |
| patience: 2 # type=int, default=2, help="Early stopping: stop after N epochs without improvement. Requires do_eval=True" |
| train_sampling: 1.0 # type=float, default=1.0, help="Sample this percent of data for training" |
| eval_sampling: 1.0 # type=float, default=1.0, help="Sample this percent of data for evaluation" |
| eval_memsize: 15 # type=int, default=5, help="Approximate amount of available memory on GPU, used for calculation of optimal evaluation batch size" |
| gpu: 0 # default=0, type=int, help='GPU instance to use' |
| gpu_allow_growth: false # default=False, action='store_true', help='Allow to gradually increase GPU memory usage instead of grabbing all available memory at start' |
| save_best_model: false # default=False, action='store_true', help='Save best model during training. Requires do_eval=True' |
| forward_split: false # default=True, dest='forward_split', action='store_false', help='Use walk-forward split for model evaluation. Requires do_eval=True' |
| side_split: false # default=False, action='store_true', help='Use side split for model evaluation. Requires do_eval=True' |
| do_eval: false # default=True, dest='do_eval', action='store_false', help="Don't evaluate model quality during training" |
| write_summaries: true # default=True, dest='write_summaries', action='store_false', help="Don't Write Tensorflow summaries" |
| verbose: false # default=False, action='store_true', help='Print additional information during graph construction' |
| asgd_decay: 0.99 # type=float, help="EMA decay for averaged SGD. Not use ASGD if not set" |
| tqdm: true # default=True, dest='tqdm', action='store_false', help="Don't use tqdm for status display during training" |
| max_steps: 20000 # type=int, help="Stop training after max steps" |
| save_from_step: 100 # type=int, help="Save model on each evaluation (10 evals per epoch), starting from this step" |
| predict_window: 10 # default=3, type=int, help="Number of days to predict" |
| back_offset: 0 # don't change it. |
| |
| save_model: |
| table: '{product_tag}_{pipeline_tag}_model_stat_test_12212021' |
| data_dir: data/vars |
| ckpt_dir: data/cpt/s32 |
| saved_dir: data/vars |
| model_version: 'version_{pipeline_tag}' |
| model_name: 'model_{product_tag}_{pipeline_tag}_test_12212021' |
| train_window: 60 # Should be same as the one in hparams |
| |
| elastic_search: |
| es_host: "10.213.37.41" |
| es_port: 9200 |
| es_index: 'model_stats' |
| es_type: 'stat' |