{
  "name": "Batch-Preprocessing-Example",
  "process.type": "batch",
  "data.sources": [
    {
      "name": "crime_report_source",
      "baseline": true,
      "connector": {
        "type": "file",
        "config": {
          "format": "text",
          "paths": [
            "measure/src/main/resources/crime_report.csv"
          ]
        },
        "pre.proc": [
          "select split(value, ',') as part from this",
          "select part[0] as date_time, part[1] as incident, part[2] as address, part[3] as city, part[4] as zipcode from this",
          "select cast(date_time as timestamp) as date_time, incident, address, city, cast(zipcode as int) as zipcode from this"
        ]
      }
    },
    {
      "name": "crime_report_truth",
      "baseline": true,
      "connector": {
        "type": "file",
        "config": {
          "format": "csv",
          "options": {
            "header": "true"
          },
          "paths": [
            "measure/src/main/resources/crime_report_truth.csv"
          ]
        }
      }
    }
  ],
  "measures": [
    {
      "name": "completeness_measure",
      "type": "completeness",
      "data.source": "crime_report_source",
      "config": {
        "expr": "zipcode is null OR city is null"
      },
      "out": [
        {
          "type": "metric",
          "name": "comp_metric",
          "flatten": "map"
        },
        {
          "type": "record",
          "name": "comp_records"
        }
      ]
    },
    {
      "name": "profiling_measure",
      "type": "profiling",
      "data.source": "crime_report_source",
      "config": {
        "approx.distinct.count": true,
        "round.scale": 2
      },
      "out": [
        {
          "type": "metric",
          "name": "prof_metric",
          "flatten": "map"
        }
      ]
    },
    {
      "name": "query_measure",
      "type": "sparkSQL",
      "data.source": "crime_report_source",
      "config": {
        "expr": "select * from crime_report_source",
        "bad.record.definition": "zipcode is null and city is null"
      },
      "out": [
        {
          "type": "metric",
          "name": "sql_metric",
          "flatten": "map"
        },
        {
          "type": "record",
          "name": "sql_records"
        }
      ]
    },
    {
      "name": "duplication_measure",
      "type": "duplication",
      "data.source": "crime_report_source",
      "config": {
        "bad.record.definition": "duplicate"
      },
      "out": [
        {
          "type": "metric",
          "name": "duplication_metric",
          "flatten": "map"
        },
        {
          "type": "record",
          "name": "duplication_records"
        }
      ]
    },
    {
      "name": "accuracy_measure",
      "type": "accuracy",
      "data.source": "crime_report_source",
      "config": {
        "target.source": "crime_report_truth",
        "expr": [
          {
            "source.col": "city",
            "target.col": "city_name"
          }
        ]
      },
      "out": [
        {
          "type": "metric",
          "name": "accuracy_metric",
          "flatten": "map"
        },
        {
          "type": "record",
          "name": "accuracy_records"
        }
      ]
    }
  ],
  "sinks": [
    "consoleSink"
  ]
}