[GRIFFIN-358] Added sampling option to ProfilingMeasure
diff --git a/griffin-doc/measure/measure-configuration-guide/profiling.md b/griffin-doc/measure/measure-configuration-guide/profiling.md
index 76ec9cc..c0d69d3 100644
--- a/griffin-doc/measure/measure-configuration-guide/profiling.md
+++ b/griffin-doc/measure/measure-configuration-guide/profiling.md
@@ -103,13 +103,19 @@
data set will be profiled.
- `approx.distinct.count`: The value for this key is boolean. If this is `true`, the distinct counts will be
- approximated to allow up to 5% error. Approximate counts are usually faster by are less accurate. If this is set
+ approximated to allow up to 5% error. Approximate counts are usually faster but are less accurate. If this is set
to `false`, then the counts will be 100% accurate.
- `round.scale`: Several resultant metrics of profiling measure are floating-point numbers. This key controls to extent
to which these floating-point numbers are rounded. For example, if `round.scale = 2` then all floating-point metric
values will be rounded to 2 decimal places.
+- `dataset.sample`: The value of this key determines what fraction of the data is to be profiled. The decimal value
+ must lie in the range [0.0, 1.0], where 0.0 means the whole dataset will be skipped and 1.0 means the whole dataset
+ will be profiled. An intermediate value, say 0.5, randomly selects approximately 50% of the dataset rows (without
+ replacement) and profiles only those rows. This option is useful when the dataset to be profiled is large and an
+ approximate profile is sufficient. Defaults to `1.0`.
+
### Outputs
Unlike other measures, Profiling does not produce record outputs. Thus, only metric outputs must be configured.
diff --git a/measure/src/main/resources/config-batch-all-measures.json b/measure/src/main/resources/config-batch-all-measures.json
index 4c83f45..8917ccf 100644
--- a/measure/src/main/resources/config-batch-all-measures.json
+++ b/measure/src/main/resources/config-batch-all-measures.json
@@ -63,7 +63,8 @@
"data.source": "crime_report_source",
"config": {
"approx.distinct.count": true,
- "round.scale": 2
+ "round.scale": 2,
+ "dataset.sample": 0.45
},
"out": [
{
diff --git a/measure/src/main/scala/org/apache/griffin/measure/execution/impl/ProfilingMeasure.scala b/measure/src/main/scala/org/apache/griffin/measure/execution/impl/ProfilingMeasure.scala
index 01b0a65..7a3c49a 100644
--- a/measure/src/main/scala/org/apache/griffin/measure/execution/impl/ProfilingMeasure.scala
+++ b/measure/src/main/scala/org/apache/griffin/measure/execution/impl/ProfilingMeasure.scala
@@ -81,6 +81,16 @@
val roundScale: Int = getFromConfig[java.lang.Integer](RoundScaleStr, 3)
/**
+ * The value of this key determines what fraction of the data is to be profiled. The decimal value
+ * must lie in the range [0.0, 1.0], where 0.0 means the whole dataset will be skipped and 1.0 means the
+ * whole dataset will be profiled. An intermediate value, say 0.5, randomly selects approximately 50% of
+ * the dataset rows (without replacement) and profiles only those rows.
+ *
+ * This option is useful when the dataset to be profiled is large and an approximate profile is sufficient.
+ */
+ val dataSetSample: Double = getFromConfig[java.lang.Double](DataSetSampleStr, 1.0)
+
+ /**
* Several resultant metrics of profiling measure are floating-point numbers. This key controls to extent
* to which these floating-point numbers are rounded. For example, if `round.scale = 2` then all
* floating-point metric values will be rounded to 2 decimal places.
@@ -109,7 +119,8 @@
* @return tuple of records dataframe and metric dataframe
*/
override def impl(): (DataFrame, DataFrame) = {
- val input = sparkSession.read.table(measureParam.getDataSource)
+ info(s"Selecting random ${dataSetSample * 100}% of the rows for profiling.")
+ val input = sparkSession.read.table(measureParam.getDataSource).sample(dataSetSample)
val profilingColNames = exprOpt
.getOrElse(input.columns.mkString(","))
.split(",")
@@ -167,6 +178,7 @@
/**
* Options Keys
*/
+ final val DataSetSampleStr: String = "dataset.sample"
final val RoundScaleStr: String = "round.scale"
final val ApproxDistinctCountStr: String = "approx.distinct.count"