[GRIFFIN-358] Added sampling option to ProfilingMeasure

commit: 0a08b859ea5e467fd2ed15a3c3e638624451f0ec [log] [tgz]
author: chitralverma <chitralverma@gmail.com> Fri Jun 11 11:25:42 2021 +0530
committer: chitralverma <chitralverma@gmail.com> Fri Jun 11 11:25:42 2021 +0530
tree: 3aa7776a376c390ef4fb036942a685d7fec41e44
parent: 3586a30caec02eaa248964d839a604cef1e0adfd [diff]
diff --git a/griffin-doc/measure/measure-configuration-guide/profiling.md b/griffin-doc/measure/measure-configuration-guide/profiling.md
index 76ec9cc..c0d69d3 100644
--- a/griffin-doc/measure/measure-configuration-guide/profiling.md
+++ b/griffin-doc/measure/measure-configuration-guide/profiling.md

@@ -103,13 +103,19 @@
   data set will be profiled.
 
 - `approx.distinct.count`: The value for this key is boolean. If this is `true`, the distinct counts will be
-  approximated to allow up to 5% error. Approximate counts are usually faster by are less accurate. If this is set
+  approximated to allow up to 5% error. Approximate counts are usually faster but are less accurate. If this is set
   to `false`, then the counts will be 100% accurate.
 
 - `round.scale`: Several resultant metrics of profiling measure are floating-point numbers. This key controls the extent
   to which these floating-point numbers are rounded. For example, if `round.scale = 2` then all floating-point metric
   values will be rounded to 2 decimal places.
 
+- `dataset.sample`: The value of this key determines what fraction of the data is to be profiled. The decimal value
+  belongs to the range [0.0, 1.0], where 0.0 means the whole dataset will be skipped and 1.0 means the whole dataset will
+  be profiled. An intermediate value, say 0.5, will take a random sample of approximately 50% of the dataset rows
+  (without replacement) and perform profiling on it. This option can be used when the dataset to be profiled is large,
+  and an approximate profile is needed.
+
 ### Outputs
 
 Unlike other measures, Profiling does not produce record outputs. Thus, only metric outputs must be configured.

diff --git a/measure/src/main/resources/config-batch-all-measures.json b/measure/src/main/resources/config-batch-all-measures.json
index 4c83f45..8917ccf 100644
--- a/measure/src/main/resources/config-batch-all-measures.json
+++ b/measure/src/main/resources/config-batch-all-measures.json

@@ -63,7 +63,8 @@
       "data.source": "crime_report_source",
       "config": {
         "approx.distinct.count": true,
-        "round.scale": 2
+        "round.scale": 2,
+        "dataset.sample": 0.45
       },
       "out": [
         {

diff --git a/measure/src/main/scala/org/apache/griffin/measure/execution/impl/ProfilingMeasure.scala b/measure/src/main/scala/org/apache/griffin/measure/execution/impl/ProfilingMeasure.scala
index 01b0a65..7a3c49a 100644
--- a/measure/src/main/scala/org/apache/griffin/measure/execution/impl/ProfilingMeasure.scala
+++ b/measure/src/main/scala/org/apache/griffin/measure/execution/impl/ProfilingMeasure.scala

@@ -81,6 +81,16 @@
   val roundScale: Int = getFromConfig[java.lang.Integer](RoundScaleStr, 3)
 
   /**
+   * The value of this key determines what fraction of the data is to be profiled. The decimal value
+   * belongs to the range [0.0, 1.0], where 0.0 means the whole dataset will be skipped and 1.0 means
+   * the whole dataset will be profiled. An intermediate value, say 0.5, will take a random sample of
+   * approximately 50% of the dataset rows (without replacement) and perform profiling on it.
+   *
+   * This option can be used when the dataset to be profiled is large, and an approximate profile is needed.
+   */
+  val dataSetSample: Double = getFromConfig[java.lang.Double](DataSetSampleStr, 1.0)
+
+  /**
    * Several resultant metrics of profiling measure are floating-point numbers. This key controls the extent
    * to which these floating-point numbers are rounded. For example, if `round.scale = 2` then all
    * floating-point metric values will be rounded to 2 decimal places.
@@ -109,7 +119,8 @@
    *  @return tuple of records dataframe and metric dataframe
    */
   override def impl(): (DataFrame, DataFrame) = {
-    val input = sparkSession.read.table(measureParam.getDataSource)
+    info(s"Selecting random ${dataSetSample * 100}% of the rows for profiling.")
+    val input = sparkSession.read.table(measureParam.getDataSource).sample(dataSetSample)
     val profilingColNames = exprOpt
       .getOrElse(input.columns.mkString(","))
       .split(",")
@@ -167,6 +178,7 @@
   /**
    * Options Keys
    */
+  final val DataSetSampleStr: String = "dataset.sample"
   final val RoundScaleStr: String = "round.scale"
   final val ApproxDistinctCountStr: String = "approx.distinct.count"
commit	0a08b859ea5e467fd2ed15a3c3e638624451f0ec	[log] [tgz]
author	chitralverma <chitralverma@gmail.com>	Fri Jun 11 11:25:42 2021 +0530
committer	chitralverma <chitralverma@gmail.com>	Fri Jun 11 11:25:42 2021 +0530
tree	3aa7776a376c390ef4fb036942a685d7fec41e44
parent	3586a30caec02eaa248964d839a604cef1e0adfd [diff]