{
"name": "Batch-Preprocessing-Example",
"process.type": "batch",
"data.sources": [
{
"name": "crime_report_source",
"baseline": true,
"connector": {
"type": "file",
"config": {
"format": "text",
"paths": [
"measure/src/main/resources/crime_report.csv"
]
},
"pre.proc": [
"select split(value, ',') as part from this",
"select part[0] as date_time, part[1] as incident, part[2] as address, part[3] as city, part[4] as zipcode from this",
"select cast(date_time as timestamp) as date_time, incident, address, city, cast(zipcode as int) as zipcode from this"
]
}
},
{
"name": "crime_report_truth",
"baseline": true,
"connector": {
"type": "file",
"config": {
"format": "csv",
"options": {
"header": "true"
},
"paths": [
"measure/src/main/resources/crime_report_truth.csv"
]
}
}
}
],
"measures": [
{
"name": "completeness_measure",
"type": "completeness",
"data.source": "crime_report_source",
"config": {
"expr": "zipcode is null OR city is null"
},
"out": [
{
"type": "metric",
"name": "comp_metric",
"flatten": "map"
},
{
"type": "record",
"name": "comp_records"
}
]
},
{
"name": "profiling_measure",
"type": "profiling",
"data.source": "crime_report_source",
"config": {
"approx.distinct.count": true,
"round.scale": 2
},
"out": [
{
"type": "metric",
"name": "prof_metric",
"flatten": "map"
}
]
},
{
"name": "query_measure",
"type": "sparkSQL",
"data.source": "crime_report_source",
"config": {
"expr": "select * from crime_report_source",
"bad.record.definition": "zipcode is null and city is null"
},
"out": [
{
"type": "metric",
"name": "sql_metric",
"flatten": "map"
},
{
"type": "record",
"name": "sql_records"
}
]
},
{
"name": "duplication_measure",
"type": "duplication",
"data.source": "crime_report_source",
"config": {
"bad.record.definition": "duplicate"
},
"out": [
{
"type": "metric",
"name": "duplication_metric",
"flatten": "map"
},
{
"type": "record",
"name": "duplication_records"
}
]
},
{
"name": "accuracy_measure",
"type": "accuracy",
"data.source": "crime_report_source",
"config": {
"target.source": "crime_report_truth",
"expr": [
{
"source.col": "city",
"target.col": "city_name"
}
]
},
"out": [
{
"type": "metric",
"name": "accuracy_metric",
"flatten": "map"
},
{
"type": "record",
"name": "accuracy_records"
}
]
}
],
"sinks": [
"consoleSink"
]
}