DRILL-8433: Add Percent Change UDF to Drill (#2801)
diff --git a/contrib/udfs/README.md b/contrib/udfs/README.md
index 23ece7d..844341c 100644
--- a/contrib/udfs/README.md
+++ b/contrib/udfs/README.md
@@ -1,6 +1,6 @@
# Drill User Defined Functions
-This `README` documents functions which users have submitted to Apache Drill.
+This `README` documents functions which users have submitted to Apache Drill.
## Geospatial Functions
@@ -201,7 +201,7 @@
### Time Bucket Functions
These functions are useful for doing time series analysis by grouping the data into arbitrary intervals. See: https://blog.timescale.com/blog/simplified-time-series-analytics
--using-the-time_bucket-function/ for more examples.
+-using-the-time_bucket-function/ for more examples.
There are two versions of the function:
* `time_bucket(<timestamp>, <interval>)`
@@ -225,7 +225,7 @@
This function is based on Niels Basjes Java library for parsing user agent strings which is available here: <https://github.com/nielsbasjes/yauaa>.
### Basic usage
-The function `parse_user_agent()` takes a user agent string as an argument and returns a map of the available fields. Note that not every field will be present in every user agent string.
+The function `parse_user_agent()` takes a user agent string as an argument and returns a map of the available fields. Note that not every field will be present in every user agent string.
The basic function signature looks like this
@@ -243,7 +243,7 @@
### Analyzing the User-Agent
```
-SELECT parse_user_agent( columns[0] ) as ua
+SELECT parse_user_agent( columns[0] ) as ua
FROM dfs.`/tmp/data/drill-httpd/ua.csv`;
```
The query above returns:
@@ -287,7 +287,7 @@
```sql
SELECT parse_user_agent( `user_agent`, 'AgentName` ) as AgentName ...
```
-which will just return the requested field. If the user agent string is empty, all fields will have the value of `Hacker`.
+which will just return the requested field. If the user agent string is empty, all fields will have the value of `Hacker`.
### Analyzing the User-Agent Client Hints
@@ -380,7 +380,7 @@
The improvement after adding the Client Hints is evident.
## Map Schema Function
-This function allows you to drill down into the schema of maps. The REST API and JDBC interfaces will only return `MAP`, `LIST` for the MAP, however, it is not possible to get
+This function allows you to drill down into the schema of maps. The REST API and JDBC interfaces will only return `MAP`, `LIST` for the MAP, however, it is not possible to get
the schema of the inner map. The function `getMapSchema(<MAP>)` will return a `MAP` of the fields and datatypes.
### Example Usage
@@ -419,7 +419,7 @@
# Threat Hunting Functions
These functions are useful for doing threat hunting with Apache Drill. These were inspired by huntlib.[1]
-The functions are:
+The functions are:
* `punctuation_pattern(<string>)`: Extracts the pattern of punctuation in text.
* `entropy(<string>)`: This function calculates the Shannon Entropy of a given string of text.
* `entropyPerByte(<string>)`: This function calculates the Shannon Entropy of a given string of text, normed for the string length.
@@ -431,7 +431,9 @@
* `width_bucket(value, min, max, buckets)`: Useful for crafting histograms and understanding distributions of continuous variables.
* `kendall_correlation(col1, col2)`: Calculates the kendall correlation coefficient of two columns within a dataset.
* `regr_slope(x,y)`: Determines the slope of the least-squares-fit linear equation
-* `regr_intercept(x,y)`: Computes the y-intercept of the least-squares-fit linear equation
+* `regr_intercept(x,y)`: Computes the y-intercept of they least-squares-fit linear equation
+* `percent_change(x,y)`: Computes the percent change between `x` and `y`. Handles nulls and
+ zero values.
[1]: https://github.com/target/huntlib
@@ -443,4 +445,4 @@
* `getHostName(<IP address>)`: Returns the host name associated with an IP address.
* `getHostAddress(<host>)`: Returns an IP address associated with a host name.
* `dnsLookup(<host>, [<Resolver>])`: Performs a DNS lookup on a given host. You can optionally provide a resolver. Possible resolver values are: `cloudflare`, `cloudflare_secondary`, `google`, `google_secondary`, `verisign`, `verisign_secondary`, `yandex`, `yandex_secondary`.
-* `whois(<host>, [<Resolver>])`: Performs a whois lookup on the given host name. You can optionally provide a resolver URL. Note that not all providers allow bulk automated whois lookups, so please follow the terms fo service for your provider.
\ No newline at end of file
+* `whois(<host>, [<Resolver>])`: Performs a whois lookup on the given host name. You can optionally provide a resolver URL. Note that not all providers allow bulk automated whois lookups, so please follow the terms fo service for your provider.
diff --git a/contrib/udfs/src/main/java/org/apache/drill/exec/udfs/DistributionFunctions.java b/contrib/udfs/src/main/java/org/apache/drill/exec/udfs/DistributionFunctions.java
index 0b4b623..fd18fd4 100644
--- a/contrib/udfs/src/main/java/org/apache/drill/exec/udfs/DistributionFunctions.java
+++ b/contrib/udfs/src/main/java/org/apache/drill/exec/udfs/DistributionFunctions.java
@@ -327,4 +327,40 @@
ss_xy.value = 0;
}
}
+
+ /**
+ * This UDF calculates the percent change between two numeric columns.
+ */
+ @FunctionTemplate(names = {"percentChange", "percent_change"},
+ scope = FunctionScope.SIMPLE,
+ nulls = NullHandling.NULL_IF_NULL)
+ public static class PercentChangeFunction implements DrillSimpleFunc {
+ @Param
+ Float8Holder oldHolder;
+
+ @Param
+ Float8Holder newHolder;
+
+ @Output
+ Float8Holder resultHolder;
+
+ @Override
+ public void setup() {
+ // No op
+ }
+
+ @Override
+ public void eval() {
+ double v1 = oldHolder.value;
+ double v2 = newHolder.value;
+
+ if (v2 == 0) {
+ resultHolder.value = 0;
+ } else if (v1 == 0) {
+ // No op
+ } else {
+ resultHolder.value = (v2 - v1) * 100.0 / v1;
+ }
+ }
+ }
}
diff --git a/contrib/udfs/src/test/java/org/apache/drill/exec/udfs/TestDistributionFunctions.java b/contrib/udfs/src/test/java/org/apache/drill/exec/udfs/TestDistributionFunctions.java
index b7b8b53..34f9ee0 100644
--- a/contrib/udfs/src/test/java/org/apache/drill/exec/udfs/TestDistributionFunctions.java
+++ b/contrib/udfs/src/test/java/org/apache/drill/exec/udfs/TestDistributionFunctions.java
@@ -105,4 +105,63 @@
.baselineValues(1400.2322223740048)
.go();
}
+
+ @Test
+ public void testPercentChange() throws Exception {
+ String query = "SELECT percent_change(100, 150) as delta";
+ testBuilder()
+ .sqlQuery(query)
+ .unOrdered()
+ .baselineColumns("delta")
+ .baselineValues(50.0)
+ .go();
+
+ query = "SELECT percent_change(200, 100) as delta";
+ testBuilder()
+ .sqlQuery(query)
+ .unOrdered()
+ .baselineColumns("delta")
+ .baselineValues(-50.0)
+ .go();
+
+ query = "SELECT percent_change(200, null) as delta";
+ testBuilder()
+ .sqlQuery(query)
+ .unOrdered()
+ .baselineColumns("delta")
+ .baselineValues(null)
+ .go();
+
+ query = "SELECT percent_change(null, null) as delta";
+ testBuilder()
+ .sqlQuery(query)
+ .unOrdered()
+ .baselineColumns("delta")
+ .baselineValues(null)
+ .go();
+
+ query = "SELECT percent_change(null, 200) as delta";
+ testBuilder()
+ .sqlQuery(query)
+ .unOrdered()
+ .baselineColumns("delta")
+ .baselineValues(null)
+ .go();
+
+ query = "SELECT percent_change(200, 0) as delta";
+ testBuilder()
+ .sqlQuery(query)
+ .unOrdered()
+ .baselineColumns("delta")
+ .baselineValues(0.0)
+ .go();
+
+ query = "SELECT percent_change(0, 34) as delta";
+ testBuilder()
+ .sqlQuery(query)
+ .unOrdered()
+ .baselineColumns("delta")
+ .baselineValues(0.0)
+ .go();
+ }
}