[GRIFFIN-358] Changed 'target' to 'ref' to clarify terminology
diff --git a/measure/src/main/resources/config-batch-preproc.json b/measure/src/main/resources/config-batch-preproc.json
index b40f57a..510ad7c 100644
--- a/measure/src/main/resources/config-batch-preproc.json
+++ b/measure/src/main/resources/config-batch-preproc.json
@@ -117,11 +117,11 @@
"type": "accuracy",
"data.source": "crime_report_source",
"config": {
- "target.source": "crime_report_truth",
+ "ref.source": "crime_report_truth",
"expr": [
{
"source.col": "city",
- "target.col": "city_name"
+ "ref.col": "city_name"
}
]
},
diff --git a/measure/src/main/scala/org/apache/griffin/measure/execution/impl/AccuracyMeasure.scala b/measure/src/main/scala/org/apache/griffin/measure/execution/impl/AccuracyMeasure.scala
index 2f1cdf3..b2da9ca 100644
--- a/measure/src/main/scala/org/apache/griffin/measure/execution/impl/AccuracyMeasure.scala
+++ b/measure/src/main/scala/org/apache/griffin/measure/execution/impl/AccuracyMeasure.scala
@@ -29,7 +29,7 @@
case class AccuracyMeasure(measureParam: MeasureParam) extends Measure {
- case class AccuracyExpr(sourceCol: String, targetCol: String)
+ case class AccuracyExpr(sourceCol: String, refCol: String)
import AccuracyMeasure._
import Measure._
@@ -38,7 +38,7 @@
override val supportsMetricWrite: Boolean = true
- val targetSource: String = getFromConfig[String](TargetSourceStr, null)
+ val refSource: String = getFromConfig[String](ReferenceSourceStr, null)
val exprOpt: Option[Seq[Map[String, String]]] =
Option(getFromConfig[Seq[Map[String, String]]](Expression, null))
@@ -50,33 +50,33 @@
val dataSource = addColumnPrefix(originalSource, SourcePrefixStr)
- val targetDataSource =
+ val refDataSource =
addColumnPrefix(
- sparkSession.read.table(targetSource).drop(ConstantColumns.tmst),
- TargetPrefixStr)
+ sparkSession.read.table(refSource).drop(ConstantColumns.tmst),
+ ReferencePrefixStr)
val accuracyExprs = exprOpt.get
.map(toAccuracyExpr)
.distinct
.map(x =>
- AccuracyExpr(s"$SourcePrefixStr${x.sourceCol}", s"$TargetPrefixStr${x.targetCol}"))
+ AccuracyExpr(s"$SourcePrefixStr${x.sourceCol}", s"$ReferencePrefixStr${x.refCol}"))
val joinExpr =
accuracyExprs
- .map(e => col(e.sourceCol) === col(e.targetCol))
+ .map(e => col(e.sourceCol) === col(e.refCol))
.reduce(_ and _)
val indicatorExpr =
accuracyExprs
.map(e =>
- coalesce(col(e.sourceCol), emptyCol) notEqual coalesce(col(e.targetCol), emptyCol))
+ coalesce(col(e.sourceCol), emptyCol) notEqual coalesce(col(e.refCol), emptyCol))
.reduce(_ or _)
val nullExpr = accuracyExprs.map(e => col(e.sourceCol).isNull).reduce(_ or _)
val recordsDf = removeColumnPrefix(
dataSource
- .join(targetDataSource, joinExpr, "left")
+ .join(refDataSource, joinExpr, "left")
.withColumn(valueColumn, when(indicatorExpr or nullExpr, 1).otherwise(0)),
SourcePrefixStr)
.select((originalCols :+ valueColumn).map(col): _*)
@@ -99,45 +99,45 @@
assert(exprOpt.get.flatten.nonEmpty, s"'$Expression' must not be empty or of invalid type.")
assert(
- !StringUtil.isNullOrEmpty(targetSource),
- s"'$TargetSourceStr' must not be null, empty or of invalid type.")
+ !StringUtil.isNullOrEmpty(refSource),
+ s"'$ReferenceSourceStr' must not be null, empty or of invalid type.")
datasetValidations()
}
private def toAccuracyExpr(map: Map[String, String]): AccuracyExpr = {
assert(map.contains(SourceColStr), s"'$SourceColStr' must be defined.")
- assert(map.contains(TargetColStr), s"'$TargetColStr' must be defined.")
+ assert(map.contains(ReferenceColStr), s"'$ReferenceColStr' must be defined.")
- AccuracyExpr(map(SourceColStr), map(TargetColStr))
+ AccuracyExpr(map(SourceColStr), map(ReferenceColStr))
}
private def datasetValidations(): Unit = {
val sparkSession = SparkSession.getDefaultSession.get
assert(
- sparkSession.catalog.tableExists(targetSource),
- s"Target source with name '$targetSource' does not exist.")
+ sparkSession.catalog.tableExists(refSource),
+ s"Reference source with name '$refSource' does not exist.")
val datasourceName = measureParam.getDataSource
val dataSourceCols =
sparkSession.read.table(datasourceName).columns.map(_.toLowerCase(Locale.ROOT)).toSet
- val targetDataSourceCols =
- sparkSession.read.table(targetSource).columns.map(_.toLowerCase(Locale.ROOT)).toSet
+ val refDataSourceCols =
+ sparkSession.read.table(refSource).columns.map(_.toLowerCase(Locale.ROOT)).toSet
val accuracyExpr = exprOpt.get.map(toAccuracyExpr).distinct
- val (forDataSource, forTarget) =
+ val (forDataSource, forRefDataSource) =
accuracyExpr
.map(
e =>
(
(e.sourceCol, dataSourceCols.contains(e.sourceCol)),
- (e.targetCol, targetDataSourceCols.contains(e.targetCol))))
+ (e.refCol, refDataSourceCols.contains(e.refCol))))
.unzip
val invalidColsDataSource = forDataSource.filterNot(_._2)
- val invalidColsTarget = forTarget.filterNot(_._2)
+ val invalidColsRefSource = forRefDataSource.filterNot(_._2)
assert(
invalidColsDataSource.isEmpty,
@@ -145,9 +145,9 @@
s"do not exist in data set with name '$datasourceName'")
assert(
- invalidColsTarget.isEmpty,
- s"Column(s) [${invalidColsTarget.map(_._1).mkString(", ")}] " +
- s"do not exist in target data set with name '$targetSource'")
+ invalidColsRefSource.isEmpty,
+ s"Column(s) [${invalidColsRefSource.map(_._1).mkString(", ")}] " +
+ s"do not exist in reference data set with name '$refSource'")
}
private def addColumnPrefix(dataFrame: DataFrame, prefix: String): DataFrame = {
@@ -163,11 +163,11 @@
object AccuracyMeasure {
final val SourcePrefixStr: String = "__source_"
- final val TargetPrefixStr: String = "__target_"
+ final val ReferencePrefixStr: String = "__ref_"
- final val TargetSourceStr: String = "target.source"
+ final val ReferenceSourceStr: String = "ref.source"
final val SourceColStr: String = "source.col"
- final val TargetColStr: String = "target.col"
+ final val ReferenceColStr: String = "ref.col"
final val AccurateStr: String = "accurate"
final val InAccurateStr: String = "inaccurate"
diff --git a/measure/src/test/resources/_accuracy-batch-griffindsl.json b/measure/src/test/resources/_accuracy-batch-griffindsl.json
index 9f5892b..2831044 100644
--- a/measure/src/test/resources/_accuracy-batch-griffindsl.json
+++ b/measure/src/test/resources/_accuracy-batch-griffindsl.json
@@ -33,35 +33,35 @@
"type": "accuracy",
"data.source": "target",
"config": {
- "target.source": "source",
+ "ref.source": "source",
"expr": [
{
"source.col": "user_id",
- "target.col": "user_id"
+ "ref.col": "user_id"
},
{
"source.col": "first_name",
- "target.col": "first_name"
+ "ref.col": "first_name"
},
{
"source.col": "last_name",
- "target.col": "last_name"
+ "ref.col": "last_name"
},
{
"source.col": "address",
- "target.col": "address"
+ "ref.col": "address"
},
{
"source.col": "email",
- "target.col": "email"
+ "ref.col": "email"
},
{
"source.col": "phone",
- "target.col": "phone"
+ "ref.col": "phone"
},
{
"source.col": "post_code",
- "target.col": "post_code"
+ "ref.col": "post_code"
}
]
},
diff --git a/measure/src/test/scala/org/apache/griffin/measure/execution/impl/AccuracyMeasureTest.scala b/measure/src/test/scala/org/apache/griffin/measure/execution/impl/AccuracyMeasureTest.scala
index e18efaf..0e3f06f 100644
--- a/measure/src/test/scala/org/apache/griffin/measure/execution/impl/AccuracyMeasureTest.scala
+++ b/measure/src/test/scala/org/apache/griffin/measure/execution/impl/AccuracyMeasureTest.scala
@@ -33,8 +33,8 @@
"Accuracy",
"source",
Map(
- Expression -> Seq(Map(SourceColStr -> "gender", TargetColStr -> "gender")),
- TargetSourceStr -> "target"))
+ Expression -> Seq(Map(SourceColStr -> "gender", ReferenceColStr -> "gender")),
+ ReferenceSourceStr -> "reference"))
}
"AccuracyMeasure" should "validate expression config" in {
@@ -69,48 +69,49 @@
// Invalid Expr
assertThrows[AssertionError] {
AccuracyMeasure(
- param.copy(config = Map(Expression -> Seq(Map("a" -> "b")), TargetSourceStr -> "target")))
+ param.copy(config =
+ Map(Expression -> Seq(Map("a" -> "b")), ReferenceSourceStr -> "reference")))
}
- // Invalid Expr as target.col is missing
+ // Invalid Expr as ref.col is missing
assertThrows[AssertionError] {
AccuracyMeasure(
param.copy(config =
- Map(Expression -> Seq(Map(SourceColStr -> "b")), TargetSourceStr -> "target")))
+ Map(Expression -> Seq(Map(SourceColStr -> "b")), ReferenceSourceStr -> "reference")))
}
// Invalid Expr as source.col is missing
assertThrows[AssertionError] {
AccuracyMeasure(
param.copy(config =
- Map(Expression -> Seq(Map(TargetColStr -> "b")), TargetSourceStr -> "target")))
+ Map(Expression -> Seq(Map(ReferenceColStr -> "b")), ReferenceSourceStr -> "reference")))
}
- // Validations for Target source
+ // Validations for Reference source
// Empty
assertThrows[AssertionError] {
AccuracyMeasure(
param.copy(config =
- Map(Expression -> Seq(Map("a" -> "b")), TargetSourceStr -> StringUtils.EMPTY)))
+ Map(Expression -> Seq(Map("a" -> "b")), ReferenceSourceStr -> StringUtils.EMPTY)))
}
// Incorrect Type
assertThrows[AssertionError] {
AccuracyMeasure(
- param.copy(config = Map(Expression -> Seq(Map("a" -> "b")), TargetSourceStr -> 2331)))
+ param.copy(config = Map(Expression -> Seq(Map("a" -> "b")), ReferenceSourceStr -> 2331)))
}
// Null
assertThrows[AssertionError] {
AccuracyMeasure(
- param.copy(config = Map(Expression -> Seq(Map("a" -> "b")), TargetSourceStr -> null)))
+ param.copy(config = Map(Expression -> Seq(Map("a" -> "b")), ReferenceSourceStr -> null)))
}
- // Invalid target
+ // Invalid Reference
assertThrows[AssertionError] {
AccuracyMeasure(
- param.copy(config = Map(Expression -> Seq(Map("a" -> "b")), TargetSourceStr -> "jj")))
+ param.copy(config = Map(Expression -> Seq(Map("a" -> "b")), ReferenceSourceStr -> "jj")))
}
}
diff --git a/measure/src/test/scala/org/apache/griffin/measure/execution/impl/MeasureTest.scala b/measure/src/test/scala/org/apache/griffin/measure/execution/impl/MeasureTest.scala
index 992a549..91857d6 100644
--- a/measure/src/test/scala/org/apache/griffin/measure/execution/impl/MeasureTest.scala
+++ b/measure/src/test/scala/org/apache/griffin/measure/execution/impl/MeasureTest.scala
@@ -30,13 +30,13 @@
trait MeasureTest extends SparkSuiteBase with Matchers {
var sourceSchema: StructType = _
- var targetSchema: StructType = _
+ var referenceSchema: StructType = _
var recordDfSchema: StructType = _
var metricDfSchema: StructType = _
var context: DQContext = _
var source: DataFrame = _
- var target: DataFrame = _
+ var reference: DataFrame = _
override def beforeAll(): Unit = {
super.beforeAll()
@@ -48,7 +48,7 @@
sourceSchema =
new StructType().add("id", "integer").add("name", "string").add("gender", "string")
- targetSchema = new StructType().add("gender", "string")
+ referenceSchema = new StructType().add("gender", "string")
recordDfSchema = sourceSchema.add(Status, "string", nullable = false)
metricDfSchema = new StructType()
@@ -67,10 +67,10 @@
Row(5, null, null)))(RowEncoder(sourceSchema))
.cache()
- target = spark.createDataset(Seq(Row("Male")))(RowEncoder(targetSchema)).cache()
+ reference = spark.createDataset(Seq(Row("Male")))(RowEncoder(referenceSchema)).cache()
source.createOrReplaceTempView("source")
- target.createOrReplaceTempView("target")
+ reference.createOrReplaceTempView("reference")
}
}