{
  "name": "Batch-Preprocessing-Example",
  "process.type": "batch",
  "data.sources": [
    {
      "name": "source_1",
      "baseline": true,
      "connector": {
        "type": "file",
        "config": {
          "format": "text",
          "paths": [
            "measure/src/test/resources/users_info_src.csv"
          ]
        },
        "pre.proc": [
          "select split(value, ',') as part from this",
          "select cast(part[0] as long) as user_id, part[1] as first_name, part[2] as last_name, part[3] as address, part[4] as email, part[5] as phone, part[6] as post_code from this"
        ]
      }
    },
    {
      "name": "source_2",
      "baseline": true,
      "connector": {
        "type": "file",
        "config": {
          "format": "text",
          "paths": [
            "measure/src/test/resources/duplicates_users_info_src.csv"
          ]
        },
        "pre.proc": [
          "select split(value, ',') as part from this",
          "select cast(part[0] as int) as id, part[1] as name, part[2] as gender from this"
        ]
      }
    }
  ],
  "measures": [
    {
      "name": "completeness_measure",
      "type": "completeness",
      "data.source": "source_1",
      "config": {
        "expr": "post_code is null OR address RLIKE '\\\\d+$'"
      },
      "out": [
        {
          "type": "metric",
          "name": "comp_metric",
          "flatten": "map"
        },
        {
          "type": "record",
          "name": "comp_records"
        }
      ]
    },
    {
      "name": "profiling_measure",
      "type": "profiling",
      "data.source": "source_1",
      "config": {
        "expr": "user_id, post_code",
        "approx.distinct.count": true,
        "round.scale": 2
      },
      "out": [
        {
          "type": "metric",
          "name": "prof_metric",
          "flatten": "map"
        }
      ]
    },
    {
      "name": "query_measure",
      "type": "sparkSQL",
      "data.source": "source_1",
      "config": {
        "expr": "select * from source_1",
        "bad.record.definition": "post_code is null"
      },
      "out": [
        {
          "type": "metric",
          "name": "sql_metric",
          "flatten": "map"
        },
        {
          "type": "record",
          "name": "sql_records"
        }
      ]
    },
    {
      "name": "duplication_measure",
      "type": "duplication",
      "data.source": "source_2",
      "config": {
        "expr": "name",
        "bad.record.definition": "duplicate"
      },
      "out": [
        {
          "type": "metric",
          "name": "duplication_metric",
          "flatten": "map"
        },
        {
          "type": "record",
          "name": "duplication_records"
        }
      ]
    }
  ],
  "sinks": [
    "consoleSink"
  ]
}