product_tag: 'dlpm'
pipeline_tag: '111021_no_residency_no_mapping' # IMPORTANT: The pipeline tag has to be changed before each run to prevent record duplication.
factdata_table_name: 'factdata_test_02112022' # factdata_10202021 #factdata_hq_09222020
log:
level: 'warn' # log level for spark and app
pipeline:
config_table: '{product_tag}_{pipeline_tag}_config'
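# Illustration (assumption, not read by the pipeline): names such as
# '{product_tag}_{pipeline_tag}_config' are templates; with the tags above they are
# assumed to resolve to e.g. 'dlpm_111021_no_residency_no_mapping_config'.
# A minimal Python sketch of that substitution:
#   template = '{product_tag}_{pipeline_tag}_config'
#   name = template.format(product_tag='dlpm',
#                          pipeline_tag='111021_no_residency_no_mapping')
#   # -> 'dlpm_111021_no_residency_no_mapping_config'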
rti_transform: # This transforms request-based factdata into impression-based factdata by filling in the empty slots
default_hour: 7
default_price_cat: '1'
day_step: 2
start_day: '2020-01-01'
end_day: '2020-01-02'
new_bucket_size: 2
input_table: 'factdata_request_01012022'
# output_table is factdata_table_name
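# Illustration (assumption, not part of the config): with day_step=2 the transform is
# presumably applied over [start_day, end_day] in 2-day chunks, using default_hour and
# default_price_cat for the missing slots. A hedged sketch of that chunking:
#   from datetime import date, timedelta
#   start, end, step = date(2020, 1, 1), date(2020, 1, 2), 2
#   d = start
#   while d <= end:
#       print(d, min(d + timedelta(days=step - 1), end))  # one chunk, here 2020-01-01..2020-01-02
#       d += timedelta(days=step)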
filter: # This is for data filtering on si and region
percentile: 10 # Filters out traffic lower than 1/10 of the average traffic
output_table_name: '{product_tag}_{pipeline_tag}_tmp_area_map'
init_start_bucket: 0
bucket_size: 1000
bucket_step: 100
new_bucket_size: 10
condition: ''
new_si_list: ['a47eavw7ex',
'66bcd2720e5011e79bc8fa163e05184e',
'x0ej5xhk60kjwq',
'l03493p0r3',
'7b0d7b55ab0c11e68b7900163e3e481d',
'b6le0s4qo8',
'e351de37263311e6af7500163e291137',
'a290af82884e11e5bdec00163e291137',
'68bcd2720e5011e79bc8fa163e05184e',
'f1iprgyl13',
'w9fmyd5r0i',
'w3wx3nv9ow5i97',
'd971z9825e',
'l2d4ec6csv',
'z041bf6g4s',
'71bcd2720e5011e79bc8fa163e05184e',
'5cd1c663263511e6af7500163e291137',
'x2fpfbm8rt',
'd9jucwkpr3',
'k4werqx13k',
'j1430itab9wj3b',
'a8syykhszz',
's4z85pd1h8',
'17dd6d8098bf11e5bdec00163e291137',
'd4d7362e879511e5bdec00163e291137']
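# Illustration (assumption): init_start_bucket=0 with bucket_size=1000 and
# bucket_step=100 suggests the filter walks buckets 0..999 in batches of 100, roughly:
#   batches = [(b, b + 100) for b in range(0, 1000, 100)]  # [(0, 100), (100, 200), ..., (900, 1000)]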
time_series: # This is done on whole bucketized data
input_table_name: '{product_tag}_{pipeline_tag}_tmp_area_map'
conditions: []
yesterday: "2021-07-21" # data is used for training from -<prepare_past_days> to -1(yesterday)
prepare_past_days: 82 # this should be equal to duration.tfrecorder_reader
bucket_size: 10 # maximum number of buckets to process starting from 0
bucket_step: 1 # size of bucket batch that is processed in one iteration
output_table_name: '{product_tag}_{pipeline_tag}_tmp_ts' # name of the Hive table that keeps the cleansed and normalized data before it is written into tfrecords; overwrites the existing table
outlier_table: '{product_tag}_{pipeline_tag}_tmp_outlier'
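# Worked example (assuming the window ends at `yesterday` inclusive): with
# yesterday="2021-07-21" and prepare_past_days=82, training data covers
# 2021-05-01 .. 2021-07-21 (82 days):
#   from datetime import datetime, timedelta
#   yesterday = datetime.strptime('2021-07-21', '%Y-%m-%d')
#   first_day = yesterday - timedelta(days=82 - 1)  # -> 2021-05-01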
uckey_clustering: # This is done on the whole data, not sliced by buckets
pre_cluster_table_name: '{product_tag}_{pipeline_tag}_tmp_pre_cluster_test_12212021'
create_pre_cluster_table: True
output_table_name: '{product_tag}_{pipeline_tag}_tmp_cluster_test_12212021'
cluster_size:
number_of_virtual_clusters: 1000
cluster_dense_num_ratio_cap: 0.01
datapoints_min_th: 0.12 #was [0.15]
datapoints_th_uckeys: 0.12
datapoints_th_clusters: 0.5
popularity_norm: 0.01
popularity_th: 4
median_popularity_of_dense: 1856.2833251953125 # median impressions of dense (sparse=False) uckeys; calculated once
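# Note (assumption about how the thresholds are used): uckeys whose datapoint ratio
# falls below datapoints_th_uckeys or whose popularity falls below popularity_th are
# presumably treated as sparse and grouped into one of number_of_virtual_clusters
# virtual clusters, while dense uckeys keep their own series. One plausible assignment
# sketch (not necessarily the project's method):
#   import hashlib
#   uckey = 'example-uckey'  # hypothetical key, for illustration only
#   cluster_id = int(hashlib.md5(uckey.encode()).hexdigest(), 16) % 1000  # one of the 1000 virtual clusters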
normalization: # This is done on the whole data, not sliced by buckets
output_table_name: '{product_tag}_{pipeline_tag}_trainready_test_12212021'
columns: {
'price_cat':['1','2','3'],
'a': ['','1','2','3','4','5','6'],
'g':['','g_f','g_m','g_x'],
't':['UNKNOWN','3G','4G','WIFI','2G'],
'si':[
'a47eavw7ex',
'66bcd2720e5011e79bc8fa163e05184e',
'x0ej5xhk60kjwq',
'l03493p0r3',
'7b0d7b55ab0c11e68b7900163e3e481d',
'b6le0s4qo8',
'e351de37263311e6af7500163e291137',
'a290af82884e11e5bdec00163e291137',
'68bcd2720e5011e79bc8fa163e05184e',
'f1iprgyl13',
'w9fmyd5r0i',
'w3wx3nv9ow5i97',
'd971z9825e',
'l2d4ec6csv',
'z041bf6g4s',
'71bcd2720e5011e79bc8fa163e05184e',
'5cd1c663263511e6af7500163e291137',
'x2fpfbm8rt',
'd9jucwkpr3',
'k4werqx13k',
'j1430itab9wj3b',
'a8syykhszz',
's4z85pd1h8',
'17dd6d8098bf11e5bdec00163e291137',
'd4d7362e879511e5bdec00163e291137']
}
holidays: ['2019-11-09', '2019-11-10', '2019-11-11', '2019-11-25', '2019-11-26', '2019-11-27','2019-11-28', '2019-12-24','2019-12-25', '2019-12-26','2019-12-31', '2020-01-01', '2020-01-02', '2020-01-19','2020-01-20', '2020-01-21', '2020-01-22', '2020-01-23', '2020-01-24', '2020-01-25', '2020-02-08']
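# Illustration (assumption): the `columns` map is read as category vocabularies used
# to index/one-hot encode the categorical features, e.g. for the connection type 't':
#   t_vocab = ['UNKNOWN', '3G', '4G', 'WIFI', '2G']
#   onehot = [1 if v == 'WIFI' else 0 for v in t_vocab]  # -> [0, 0, 0, 1, 0]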
tfrecords:
tfrecords_hdfs_path: 'factdata.tfrecord.{pipeline_tag}' # HDFS location of the tfrecords; overwrites the existing files
tf_statistics_path: './tf_statistics_{pipeline_tag}.pkl'
distribution:
output_table_name: '{product_tag}_{pipeline_tag}_tmp_distribution_test_12212021'
output_detail_table_name: '{product_tag}_{pipeline_tag}_tmp_distribution_detail_test_12212021'
tfrecorder_reader:
tfrecords_local_path: './factdata.tfrecord.{pipeline_tag}' # local path of the tfrecords; overwrites the existing files
data_dir: 'data/vars'
valid_threshold: 0.0 # default=0.0, type=float, help="Series minimal length threshold (pct of data length)"
add_days: 0 # default=64, type=int, help="Add N days in a future for prediction"
start: '' # help="Effective start date. Data before the start is dropped"
end: '' # help="Effective end date. Data past the end is dropped"
corr_backoffset: 0 # default=0, type=int, help="Offset for correlation calculation"
batch_size: 155000 # batch size of examples in the tfrecord
duration: 82 # time series length; this has to be less than or equal to prepare_past_days
tf_statistics_path: './tf_statistics_{pipeline_tag}.pkl'
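# Consistency check implied by the comment on duration (values taken from this file):
#   duration, prepare_past_days = 82, 82
#   assert duration <= prepare_past_days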
trainer:
name: 's32' # default='s32', help='Model name to identify different logs/checkpoints'
hparam_set: 's32' # default='s32', help="Hyperparameters set to use (see hparams.py for available sets)"
n_models: 1 # default=1, type=int, help="Jointly train n models with different seeds"
multi_gpu: false # default=False, action='store_true', help="Use multiple GPUs for multi-model training, one GPU per model"
seed: 5 # default=5, type=int, help="Random seed"
logdir: 'data/logs' # default='data/logs', help="Directory for summary logs"
max_epoch: 250 # type=int, default=100, help="Max number of epochs"
patience: 2 # type=int, default=2, help="Early stopping: stop after N epochs without improvement. Requires do_eval=True"
train_sampling: 1.0 # type=float, default=1.0, help="Sample this percent of data for training"
eval_sampling: 1.0 # type=float, default=1.0, help="Sample this percent of data for evaluation"
eval_memsize: 15 # type=int, default=5, help="Approximate amount of available memory on GPU, used for calculation of optimal evaluation batch size"
gpu: 0 # default=0, type=int, help='GPU instance to use'
gpu_allow_growth: false # default=False, action='store_true', help='Allow to gradually increase GPU memory usage instead of grabbing all available memory at start'
save_best_model: false # default=False, action='store_true', help='Save best model during training. Requires do_eval=True'
forward_split: false # default=True, dest='forward_split', action='store_false', help='Use walk-forward split for model evaluation. Requires do_eval=True'
side_split: false # default=False, action='store_true', help='Use side split for model evaluation. Requires do_eval=True'
do_eval: false # default=True, dest='do_eval', action='store_false', help="Don't evaluate model quality during training"
write_summaries: true # default=True, dest='write_summaries', action='store_false', help="Don't Write Tensorflow summaries"
verbose: false # default=False, action='store_true', help='Print additional information during graph construction'
asgd_decay: 0.99 # type=float, help="EMA decay for averaged SGD. ASGD is not used if this is not set"
tqdm: true # default=True, dest='tqdm', action='store_false', help="Don't use tqdm for status display during training"
max_steps: 20000 # type=int, help="Stop training after max steps"
save_from_step: 100 # type=int, help="Save model on each evaluation (10 evals per epoch), starting from this step"
predict_window: 10 # default=3, type=int, help="Number of days to predict"
back_offset: 0 # don't change it.
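# Note: per the flag descriptions above, patience, save_best_model, forward_split and
# side_split all require do_eval=True; with do_eval: false here they have no effect.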
save_model:
table: '{product_tag}_{pipeline_tag}_model_stat_test_12212021'
data_dir: data/vars
ckpt_dir: data/cpt/s32
saved_dir: data/vars
model_version: 'version_{pipeline_tag}'
model_name: 'model_{product_tag}_{pipeline_tag}_test_12212021'
train_window: 60 # Should be the same as the value in hparams
elastic_search:
es_host: "10.213.37.41"
es_port: 9200
es_index: 'model_stats'
es_type: 'stat'
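# Illustration (assumption, hypothetical client code): model statistics are presumably
# pushed to the index above with an Elasticsearch client along these lines (older
# elasticsearch-py style; the document fields are made up for the sketch):
#   from elasticsearch import Elasticsearch
#   es = Elasticsearch([{'host': '10.213.37.41', 'port': 9200}])
#   es.index(index='model_stats', doc_type='stat',
#            body={'model_name': 'example', 'version': 'example'})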