{
"name": "Batch-Preprocessing-Example",
"process.type": "batch",
"data.sources": [
{
"name": "source_1",
"baseline": true,
"connector": {
"type": "file",
"config": {
"format": "text",
"paths": [
"measure/src/test/resources/users_info_src.csv"
]
},
"pre.proc": [
"select split(value, ',') as part from this",
"select cast(part[0] as long) as user_id, part[1] as first_name, part[2] as last_name, part[3] as address, part[4] as email, part[5] as phone, part[6] as post_code from this"
]
}
},
{
"name": "source_2",
"baseline": true,
"connector": {
"type": "file",
"config": {
"format": "text",
"paths": [
"measure/src/test/resources/duplicates_users_info_src.csv"
]
},
"pre.proc": [
"select split(value, ',') as part from this",
"select cast(part[0] as int) as id, part[1] as name, part[2] as gender from this"
]
}
}
],
"measures": [
{
"name": "completeness_measure",
"type": "completeness",
"data.source": "source_1",
"config": {
"expr": "post_code is null OR address RLIKE '\\\\d+$'"
},
"out": [
{
"type": "metric",
"name": "comp_metric",
"flatten": "map"
},
{
"type": "record",
"name": "comp_records"
}
]
},
{
"name": "profiling_measure",
"type": "profiling",
"data.source": "source_1",
"config": {
"expr": "user_id, post_code",
"approx.distinct.count": true,
"round.scale": 2
},
"out": [
{
"type": "metric",
"name": "prof_metric",
"flatten": "map"
}
]
},
{
"name": "query_measure",
"type": "sparkSQL",
"data.source": "source_1",
"config": {
"expr": "select * from source_1",
"bad.record.definition": "post_code is null"
},
"out": [
{
"type": "metric",
"name": "sql_metric",
"flatten": "map"
},
{
"type": "record",
"name": "sql_records"
}
]
},
{
"name": "duplication_measure",
"type": "duplication",
"data.source": "source_2",
"config": {
"expr": "name",
"bad.record.definition": "duplicate"
},
"out": [
{
"type": "metric",
"name": "duplication_metric",
"flatten": "map"
},
{
"type": "record",
"name": "duplication_records"
}
]
}
],
"sinks": [
"consoleSink"
]
}