{
"name": "Batch-Preprocessing-Example",
"process.type": "batch",
"data.sources": [
{
"name": "crime_report_source",
"baseline": true,
"connector": {
"type": "file",
"config": {
"format": "text",
"paths": [
"measure/src/main/resources/crime_report.csv"
]
},
"pre.proc": [
"select split(value, ',') as part from this",
"select part[0] as date_time, part[1] as incident, part[2] as address, part[3] as city, part[4] as zipcode from this",
"select cast(date_time as timestamp) as date_time, incident, address, city, cast(zipcode as int) as zipcode from this"
]
}
},
{
"name": "crime_report_truth",
"baseline": true,
"connector": {
"type": "file",
"config": {
"format": "csv",
"options": {
"header": "true"
},
"paths": [
"measure/src/main/resources/crime_report_truth.csv"
]
}
}
}
],
"measures": [
{
"name": "completeness_measure",
"type": "completeness",
"data.source": "crime_report_source",
"config": {
"expr": "zipcode is null OR city is null"
},
"out": [
{
"type": "metric",
"name": "comp_metric",
"flatten": "map"
},
{
"type": "record",
"name": "comp_records"
}
]
},
{
"name": "profiling_measure",
"type": "profiling",
"data.source": "crime_report_source",
"config": {
"approx.distinct.count": true,
"round.scale": 2
},
"out": [
{
"type": "metric",
"name": "prof_metric",
"flatten": "map"
}
]
},
{
"name": "spark_sql_measure",
"type": "sparkSQL",
"data.source": "crime_report_source",
"config": {
"expr": "SELECT t.*, sq.zip IS NULL AS __measure_spark_sql_measure FROM crime_report_source AS t LEFT OUTER JOIN (SELECT zipcode as zip, COUNT(DISTINCT city) AS city_count FROM crime_report_source GROUP BY zipcode having city_count = 1) as sq ON sq.zip=t.zipcode",
"bad.record.definition": "__measure_spark_sql_measure"
},
"out": [
{
"type": "metric",
"name": "spark_sql_metric",
"flatten": "map"
},
{
"type": "record",
"name": "spark_sql_records"
}
]
},
{
"name": "duplication_measure",
"type": "duplication",
"data.source": "crime_report_source",
"config": {
"bad.record.definition": "duplicate"
},
"out": [
{
"type": "metric",
"name": "duplication_metric",
"flatten": "map"
},
{
"type": "record",
"name": "duplication_records"
}
]
},
{
"name": "accuracy_measure",
"type": "accuracy",
"data.source": "crime_report_source",
"config": {
"ref.source": "crime_report_truth",
"expr": [
{
"source.col": "city",
"ref.col": "city_name"
}
]
},
"out": [
{
"type": "metric",
"name": "accuracy_metric",
"flatten": "map"
},
{
"type": "record",
"name": "accuracy_records"
}
]
}
],
"sinks": [
"consoleSink"
]
}