| <!DOCTYPE html><html><head><title>R: Collection functions for Column operations</title> |
| <meta http-equiv="Content-Type" content="text/html; charset=utf-8" /> |
| <meta name="viewport" content="width=device-width, initial-scale=1.0, user-scalable=yes" /> |
| <link rel="stylesheet" href="https://cdn.jsdelivr.net/npm/katex@0.15.3/dist/katex.min.css"> |
| <script type="text/javascript"> |
| const macros = { "\\R": "\\textsf{R}", "\\code": "\\texttt"}; |
| // Renders every element carrying the 'reqn' class as math with KaTeX. |
| // Invoked from the onload handler of the deferred KaTeX <script> tag below. |
| function processMathHTML() { |
| var l = document.getElementsByClassName('reqn'); |
| // Each element's own text is the input and the element itself is the render target; |
| // throwOnError: false and the `macros` table defined above are passed as options. |
| for (let e of l) { katex.render(e.textContent, e, { throwOnError: false, macros }); } |
| return; |
| }</script> |
| <script defer src="https://cdn.jsdelivr.net/npm/katex@0.15.3/dist/katex.min.js" |
| onload="processMathHTML();"></script> |
| <link rel="stylesheet" type="text/css" href="R.css" /> |
| |
| <link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/highlight.js/8.3/styles/github.min.css"> |
| <script src="https://cdnjs.cloudflare.com/ajax/libs/highlight.js/8.3/highlight.min.js"></script> |
| <script src="https://cdnjs.cloudflare.com/ajax/libs/highlight.js/8.3/languages/r.min.js"></script> |
| <script>hljs.initHighlightingOnLoad();</script> |
| </head><body><div class="container"> |
| |
| <table style="width: 100%;"><tr><td>column_collection_functions {SparkR}</td><td style="text-align: right;">R Documentation</td></tr></table> |
| |
| <h2>Collection functions for Column operations</h2> |
| |
| <h3>Description</h3> |
| |
| <p>Collection functions defined for <code>Column</code>. |
| </p> |
| |
| |
| <h3>Usage</h3> |
| |
| <pre><code class='language-R'>array_aggregate(x, initialValue, merge, ...) |
| |
| array_contains(x, value) |
| |
| array_distinct(x) |
| |
| array_except(x, y) |
| |
| array_exists(x, f) |
| |
| array_forall(x, f) |
| |
| array_filter(x, f) |
| |
| array_intersect(x, y) |
| |
| array_join(x, delimiter, ...) |
| |
| array_max(x) |
| |
| array_min(x) |
| |
| array_position(x, value) |
| |
| array_remove(x, value) |
| |
| array_repeat(x, count) |
| |
| array_sort(x) |
| |
| array_transform(x, f) |
| |
| arrays_overlap(x, y) |
| |
| array_union(x, y) |
| |
| arrays_zip(x, ...) |
| |
| arrays_zip_with(x, y, f) |
| |
| concat(x, ...) |
| |
| element_at(x, extraction) |
| |
| explode(x) |
| |
| explode_outer(x) |
| |
| flatten(x) |
| |
| from_json(x, schema, ...) |
| |
| from_csv(x, schema, ...) |
| |
| map_concat(x, ...) |
| |
| map_entries(x) |
| |
| map_filter(x, f) |
| |
| map_from_arrays(x, y) |
| |
| map_from_entries(x) |
| |
| map_keys(x) |
| |
| map_values(x) |
| |
| map_zip_with(x, y, f) |
| |
| posexplode(x) |
| |
| posexplode_outer(x) |
| |
| reverse(x) |
| |
| schema_of_csv(x, ...) |
| |
| schema_of_json(x, ...) |
| |
| shuffle(x) |
| |
| size(x) |
| |
| slice(x, start, length) |
| |
| sort_array(x, asc = TRUE) |
| |
| transform_keys(x, f) |
| |
| transform_values(x, f) |
| |
| to_json(x, ...) |
| |
| to_csv(x, ...) |
| |
| ## S4 method for signature 'Column' |
| reverse(x) |
| |
| ## S4 method for signature 'Column' |
| to_json(x, ...) |
| |
| ## S4 method for signature 'Column' |
| to_csv(x, ...) |
| |
| ## S4 method for signature 'Column' |
| concat(x, ...) |
| |
| ## S4 method for signature 'Column,characterOrstructTypeOrColumn' |
| from_json(x, schema, as.json.array = FALSE, ...) |
| |
| ## S4 method for signature 'characterOrColumn' |
| schema_of_json(x, ...) |
| |
| ## S4 method for signature 'Column,characterOrstructTypeOrColumn' |
| from_csv(x, schema, ...) |
| |
| ## S4 method for signature 'characterOrColumn' |
| schema_of_csv(x, ...) |
| |
| ## S4 method for signature 'characterOrColumn,Column,'function'' |
| array_aggregate(x, initialValue, merge, finish = NULL) |
| |
| ## S4 method for signature 'Column' |
| array_contains(x, value) |
| |
| ## S4 method for signature 'Column' |
| array_distinct(x) |
| |
| ## S4 method for signature 'Column,Column' |
| array_except(x, y) |
| |
| ## S4 method for signature 'characterOrColumn,'function'' |
| array_exists(x, f) |
| |
| ## S4 method for signature 'characterOrColumn,'function'' |
| array_filter(x, f) |
| |
| ## S4 method for signature 'characterOrColumn,'function'' |
| array_forall(x, f) |
| |
| ## S4 method for signature 'Column,Column' |
| array_intersect(x, y) |
| |
| ## S4 method for signature 'Column,character' |
| array_join(x, delimiter, nullReplacement = NULL) |
| |
| ## S4 method for signature 'Column' |
| array_max(x) |
| |
| ## S4 method for signature 'Column' |
| array_min(x) |
| |
| ## S4 method for signature 'Column' |
| array_position(x, value) |
| |
| ## S4 method for signature 'Column' |
| array_remove(x, value) |
| |
| ## S4 method for signature 'Column,numericOrColumn' |
| array_repeat(x, count) |
| |
| ## S4 method for signature 'Column' |
| array_sort(x) |
| |
| ## S4 method for signature 'characterOrColumn,'function'' |
| array_transform(x, f) |
| |
| ## S4 method for signature 'Column,Column' |
| arrays_overlap(x, y) |
| |
| ## S4 method for signature 'Column,Column' |
| array_union(x, y) |
| |
| ## S4 method for signature 'Column' |
| arrays_zip(x, ...) |
| |
| ## S4 method for signature 'characterOrColumn,characterOrColumn,'function'' |
| arrays_zip_with(x, y, f) |
| |
| ## S4 method for signature 'Column' |
| shuffle(x) |
| |
| ## S4 method for signature 'Column' |
| flatten(x) |
| |
| ## S4 method for signature 'Column' |
| map_concat(x, ...) |
| |
| ## S4 method for signature 'Column' |
| map_entries(x) |
| |
| ## S4 method for signature 'characterOrColumn,'function'' |
| map_filter(x, f) |
| |
| ## S4 method for signature 'Column,Column' |
| map_from_arrays(x, y) |
| |
| ## S4 method for signature 'Column' |
| map_from_entries(x) |
| |
| ## S4 method for signature 'Column' |
| map_keys(x) |
| |
| ## S4 method for signature 'characterOrColumn,'function'' |
| transform_keys(x, f) |
| |
| ## S4 method for signature 'characterOrColumn,'function'' |
| transform_values(x, f) |
| |
| ## S4 method for signature 'Column' |
| map_values(x) |
| |
| ## S4 method for signature 'characterOrColumn,characterOrColumn,'function'' |
| map_zip_with(x, y, f) |
| |
| ## S4 method for signature 'Column' |
| element_at(x, extraction) |
| |
| ## S4 method for signature 'Column' |
| explode(x) |
| |
| ## S4 method for signature 'Column' |
| size(x) |
| |
| ## S4 method for signature 'Column' |
| slice(x, start, length) |
| |
| ## S4 method for signature 'Column' |
| sort_array(x, asc = TRUE) |
| |
| ## S4 method for signature 'Column' |
| posexplode(x) |
| |
| ## S4 method for signature 'Column' |
| explode_outer(x) |
| |
| ## S4 method for signature 'Column' |
| posexplode_outer(x) |
| </code></pre> |
| |
| |
| <h3>Arguments</h3> |
| |
| <table> |
| <tr style="vertical-align: top;"><td><code>x</code></td> |
| <td> |
| <p>Column to compute on. Note the difference in the following methods: |
| </p> |
| |
| <ul> |
| <li> <p><code>to_json</code>: it is the column containing the struct, array of the structs, |
| the map or array of maps. |
| </p> |
| </li> |
| <li> <p><code>to_csv</code>: it is the column containing the struct. |
| </p> |
| </li> |
| <li> <p><code>from_json</code>: it is the column containing the JSON string. |
| </p> |
| </li> |
| <li> <p><code>from_csv</code>: it is the column containing the CSV string. |
| </p> |
| </li></ul> |
| </td></tr> |
| <tr style="vertical-align: top;"><td><code>initialValue</code></td> |
| <td> |
| <p>a <code>Column</code> used as the initial value in <code>array_aggregate</code></p> |
| </td></tr> |
| <tr style="vertical-align: top;"><td><code>merge</code></td> |
| <td> |
| <p>a binary <code>function</code> <code>(Column, Column) -> Column</code> |
| used in <code>array_aggregate</code> to merge values (the second argument) |
| into accumulator (the first argument).</p> |
| </td></tr> |
| <tr style="vertical-align: top;"><td><code>...</code></td> |
| <td> |
| <p>additional argument(s). |
| </p> |
| |
| <ul> |
| <li> <p><code>to_json</code>, <code>from_json</code> and <code>schema_of_json</code>: this contains |
| additional named properties to control how it is converted and accepts the |
| same options as the JSON data source. |
| You can find the JSON-specific options for reading/writing JSON files in |
| <a href="https://spark.apache.org/docs/latest/sql-data-sources-json.html#data-source-option">Data Source Option</a> |
| in the version you use. |
| </p> |
| </li> |
| <li> <p><code>to_json</code>: it supports the "pretty" option which enables pretty |
| JSON generation. |
| </p> |
| </li> |
| <li> <p><code>to_csv</code>, <code>from_csv</code> and <code>schema_of_csv</code>: this contains |
| additional named properties to control how it is converted and accepts the |
| same options as the CSV data source. |
| You can find the CSV-specific options for reading/writing CSV files in |
| <a href="https://spark.apache.org/docs/latest/sql-data-sources-csv.html#data-source-option">Data Source Option</a> |
| in the version you use. |
| </p> |
| </li> |
| <li> <p><code>arrays_zip</code>, this contains additional Columns of arrays to be merged. |
| </p> |
| </li> |
| <li> <p><code>map_concat</code>, this contains additional Columns of maps to be unioned. |
| </p> |
| </li></ul> |
| </td></tr> |
| <tr style="vertical-align: top;"><td><code>value</code></td> |
| <td> |
| <p>A value to compute on. |
| </p> |
| |
| <ul> |
| <li> <p><code>array_contains</code>: a value to be checked if contained in the column. |
| </p> |
| </li> |
| <li> <p><code>array_position</code>: a value to locate in the given array. |
| </p> |
| </li> |
| <li> <p><code>array_remove</code>: a value to remove in the given array. |
| </p> |
| </li></ul> |
| </td></tr> |
| <tr style="vertical-align: top;"><td><code>y</code></td> |
| <td> |
| <p>Column to compute on.</p> |
| </td></tr> |
| <tr style="vertical-align: top;"><td><code>f</code></td> |
| <td> |
| <p>a <code>function</code> mapping from <code>Column(s)</code> to <code>Column</code>. |
| </p> |
| |
| <ul> |
| <li> <p><code>array_exists</code> the Boolean unary <code>function</code> used as the predicate. |
| </p> |
| </li> |
| <li> <p><code>array_filter</code> the Boolean <code>function</code> used to filter the data. |
| Either unary or binary. In the latter case the second argument |
| is the index in the array (0-based). |
| </p> |
| </li> |
| <li> <p><code>array_forall</code> the Boolean unary <code>function</code> used to filter the data. |
| </p> |
| </li> |
| <li> <p><code>array_transform</code> a <code>function</code> used to transform the data. |
| Either unary or binary. In the latter case the second argument |
| is the index in the array (0-based). |
| </p> |
| </li> |
| <li> <p><code>arrays_zip_with</code> a binary <code>function</code> used to merge the paired elements of the two arrays. |
| </p> |
| </li> |
| <li> <p><code>map_zip_with</code> a <code>function</code> used to merge the two maps key-wise. |
| </p> |
| </li> |
| <li> <p><code>map_filter</code> the Boolean binary <code>function</code> used to filter the data. |
| The first argument is the key, the second argument is the value. |
| </p> |
| </li> |
| <li> <p><code>transform_keys</code> a binary <code>function</code> |
| used to transform the data. The first argument is the key, the second argument |
| is the value. |
| </p> |
| </li> |
| <li> <p><code>transform_values</code> a binary <code>function</code> |
| used to transform the data. The first argument is the key, the second argument |
| is the value. |
| </p> |
| </li></ul> |
| </td></tr> |
| <tr style="vertical-align: top;"><td><code>delimiter</code></td> |
| <td> |
| <p>a character string that is used to concatenate the elements of column.</p> |
| </td></tr> |
| <tr style="vertical-align: top;"><td><code>count</code></td> |
| <td> |
| <p>a Column or constant determining the number of repetitions.</p> |
| </td></tr> |
| <tr style="vertical-align: top;"><td><code>extraction</code></td> |
| <td> |
| <p>index to check for in array or key to check for in map</p> |
| </td></tr> |
| <tr style="vertical-align: top;"><td><code>schema</code></td> |
| <td> |
| |
| <ul> |
| <li> <p><code>from_json</code>: a structType object to use as the schema |
| when parsing the JSON string. Since Spark 2.3, the DDL-formatted string is |
| also supported for the schema. Since Spark 3.0, <code>schema_of_json</code> or |
| the DDL-formatted string literal can also be accepted. |
| </p> |
| </li> |
| <li> <p><code>from_csv</code>: a structType object, DDL-formatted string or <code>schema_of_csv</code> |
| </p> |
| </li></ul> |
| </td></tr> |
| <tr style="vertical-align: top;"><td><code>start</code></td> |
| <td> |
| <p>the starting index</p> |
| </td></tr> |
| <tr style="vertical-align: top;"><td><code>length</code></td> |
| <td> |
| <p>the length of the slice</p> |
| </td></tr> |
| <tr style="vertical-align: top;"><td><code>asc</code></td> |
| <td> |
| <p>a logical flag indicating the sorting order. |
| TRUE, sorting is in ascending order. |
| FALSE, sorting is in descending order.</p> |
| </td></tr> |
| <tr style="vertical-align: top;"><td><code>as.json.array</code></td> |
| <td> |
| <p>indicating if input string is JSON array of objects or a single object.</p> |
| </td></tr> |
| <tr style="vertical-align: top;"><td><code>finish</code></td> |
| <td> |
| <p>a unary <code>function</code> <code>(Column) -> Column</code> used to |
| apply final transformation on the accumulated data in <code>array_aggregate</code>.</p> |
| </td></tr> |
| <tr style="vertical-align: top;"><td><code>nullReplacement</code></td> |
| <td> |
| <p>an optional character string that is used to replace the Null values.</p> |
| </td></tr> |
| </table> |
| |
| |
| <h3>Details</h3> |
| |
| <p><code>reverse</code>: Returns a reversed string or an array with reverse order of elements. |
| </p> |
| <p><code>to_json</code>: Converts a column containing a <code>structType</code>, a <code>mapType</code> |
| or an <code>arrayType</code> into a Column of JSON string. |
| Resolving the Column can fail if an unsupported type is encountered. |
| </p> |
| <p><code>to_csv</code>: Converts a column containing a <code>structType</code> into a Column of CSV string. |
| Resolving the Column can fail if an unsupported type is encountered. |
| </p> |
| <p><code>concat</code>: Concatenates multiple input columns together into a single column. |
| The function works with strings, binary and compatible array columns. |
| </p> |
| <p><code>from_json</code>: Parses a column containing a JSON string into a Column of <code>structType</code> |
| with the specified <code>schema</code> or array of <code>structType</code> if <code>as.json.array</code> is set |
| to <code>TRUE</code>. If the string is unparseable, the Column will contain the value NA. |
| </p> |
| <p><code>schema_of_json</code>: Parses a JSON string and infers its schema in DDL format. |
| </p> |
| <p><code>from_csv</code>: Parses a column containing a CSV string into a Column of <code>structType</code> |
| with the specified <code>schema</code>. |
| If the string is unparseable, the Column will contain the value NA. |
| </p> |
| <p><code>schema_of_csv</code>: Parses a CSV string and infers its schema in DDL format. |
| </p> |
| <p><code>array_aggregate</code> Applies a binary operator to an initial state |
| and all elements in the array, and reduces this to a single state. |
| The final state is converted into the final result by applying |
| a finish function. |
| </p> |
| <p><code>array_contains</code>: Returns null if the array is null, true if the array contains |
| the value, and false otherwise. |
| </p> |
| <p><code>array_distinct</code>: Removes duplicate values from the array. |
| </p> |
| <p><code>array_except</code>: Returns an array of the elements in the first array but not in the second |
| array, without duplicates. The order of elements in the result is not determined. |
| </p> |
| <p><code>array_exists</code> Returns whether a predicate holds for one or more elements in the array. |
| </p> |
| <p><code>array_filter</code> Returns an array of elements for which a predicate holds in a given array. |
| </p> |
| <p><code>array_forall</code> Returns whether a predicate holds for every element in the array. |
| </p> |
| <p><code>array_intersect</code>: Returns an array of the elements in the intersection of the given two |
| arrays, without duplicates. |
| </p> |
| <p><code>array_join</code>: Concatenates the elements of column using the delimiter. |
| Null values are replaced with nullReplacement if set, otherwise they are ignored. |
| </p> |
| <p><code>array_max</code>: Returns the maximum value of the array. |
| </p> |
| <p><code>array_min</code>: Returns the minimum value of the array. |
| </p> |
| <p><code>array_position</code>: Locates the position of the first occurrence of the given value |
| in the given array. Returns NA if either of the arguments is NA. |
| Note: The position is not zero based, but 1 based index. Returns 0 if the given |
| value could not be found in the array. |
| </p> |
| <p><code>array_remove</code>: Removes all elements equal to <code>value</code> from the given array. |
| </p> |
| <p><code>array_repeat</code>: Creates an array containing <code>x</code> repeated the number of times |
| given by <code>count</code>. |
| </p> |
| <p><code>array_sort</code>: Sorts the input array in ascending order. The elements of the input array |
| must be orderable. NA elements will be placed at the end of the returned array. |
| </p> |
| <p><code>array_transform</code> Returns an array of elements after applying |
| a transformation to each element in the input array. |
| </p> |
| <p><code>arrays_overlap</code>: Returns true if the input arrays have at least one non-null element in |
| common. If not and both arrays are non-empty and any of them contains a null, it returns null. |
| It returns false otherwise. |
| </p> |
| <p><code>array_union</code>: Returns an array of the elements in the union of the given two arrays, |
| without duplicates. |
| </p> |
| <p><code>arrays_zip</code>: Returns a merged array of structs in which the N-th struct contains all N-th |
| values of input arrays. |
| </p> |
| <p><code>arrays_zip_with</code> Merge two given arrays, element-wise, into a single array |
| using a function. If one array is shorter, nulls are appended at the end |
| to match the length of the longer array, before applying the function. |
| </p> |
| <p><code>shuffle</code>: Returns a random permutation of the given array. |
| </p> |
| <p><code>flatten</code>: Creates a single array from an array of arrays. |
| If a structure of nested arrays is deeper than two levels, only one level of nesting is removed. |
| </p> |
| <p><code>map_concat</code>: Returns the union of all the given maps. |
| </p> |
| <p><code>map_entries</code>: Returns an unordered array of all entries in the given map. |
| </p> |
| <p><code>map_filter</code> Returns a map whose key-value pairs satisfy a predicate. |
| </p> |
| <p><code>map_from_arrays</code>: Creates a new map column. The array in the first column is used for |
| keys. The array in the second column is used for values. All elements in the array for key |
| should not be null. |
| </p> |
| <p><code>map_from_entries</code>: Returns a map created from the given array of entries. |
| </p> |
| <p><code>map_keys</code>: Returns an unordered array containing the keys of the map. |
| </p> |
| <p><code>transform_keys</code> Applies a function to every key-value pair in a map and returns |
| a map with the results of those applications as the new keys for the pairs. |
| </p> |
| <p><code>transform_values</code> Applies a function to every key-value pair in a map and returns |
| a map with the results of those applications as the new values for the pairs. |
| </p> |
| <p><code>map_values</code>: Returns an unordered array containing the values of the map. |
| </p> |
| <p><code>map_zip_with</code> Merge two given maps, key-wise, into a single map using a function. |
| </p> |
| <p><code>element_at</code>: Returns element of array at given index in <code>extraction</code> if |
| <code>x</code> is array. Returns value for the given key in <code>extraction</code> if <code>x</code> is map. |
| Note: The position is not zero based, but 1 based index. |
| </p> |
| <p><code>explode</code>: Creates a new row for each element in the given array or map column. |
| Uses the default column name <code>col</code> for elements in the array and |
| <code>key</code> and <code>value</code> for elements in the map unless specified otherwise. |
| </p> |
| <p><code>size</code>: Returns length of array or map. |
| </p> |
| <p><code>slice</code>: Returns an array containing all the elements in x from the index start |
| (array indices start at 1, or from the end if start is negative) with the specified length. |
| </p> |
| <p><code>sort_array</code>: Sorts the input array in ascending or descending order according to |
| the natural ordering of the array elements. NA elements will be placed at the beginning of |
| the returned array in ascending order or at the end of the returned array in descending order. |
| </p> |
| <p><code>posexplode</code>: Creates a new row for each element with position in the given array |
| or map column. Uses the default column name <code>pos</code> for position, and <code>col</code> |
| for elements in the array and <code>key</code> and <code>value</code> for elements in the map |
| unless specified otherwise. |
| </p> |
| <p><code>explode_outer</code>: Creates a new row for each element in the given array or map column. |
| Unlike <code>explode</code>, if the array/map is <code>null</code> or empty |
| then <code>null</code> is produced. |
| Uses the default column name <code>col</code> for elements in the array and |
| <code>key</code> and <code>value</code> for elements in the map unless specified otherwise. |
| </p> |
| <p><code>posexplode_outer</code>: Creates a new row for each element with position in the given |
| array or map column. Unlike <code>posexplode</code>, if the array/map is <code>null</code> or empty |
| then the row (<code>null</code>, <code>null</code>) is produced. |
| Uses the default column name <code>pos</code> for position, and <code>col</code> |
| for elements in the array and <code>key</code> and <code>value</code> for elements in the map |
| unless specified otherwise. |
| </p> |
| |
| |
| <h3>Note</h3> |
| |
| <p>reverse since 1.5.0 |
| </p> |
| <p>to_json since 2.2.0 |
| </p> |
| <p>to_csv since 3.0.0 |
| </p> |
| <p>concat since 1.5.0 |
| </p> |
| <p>from_json since 2.2.0 |
| </p> |
| <p>schema_of_json since 3.0.0 |
| </p> |
| <p>from_csv since 3.0.0 |
| </p> |
| <p>schema_of_csv since 3.0.0 |
| </p> |
| <p>array_aggregate since 3.1.0 |
| </p> |
| <p>array_contains since 1.6.0 |
| </p> |
| <p>array_distinct since 2.4.0 |
| </p> |
| <p>array_except since 2.4.0 |
| </p> |
| <p>array_exists since 3.1.0 |
| </p> |
| <p>array_filter since 3.1.0 |
| </p> |
| <p>array_forall since 3.1.0 |
| </p> |
| <p>array_intersect since 2.4.0 |
| </p> |
| <p>array_join since 2.4.0 |
| </p> |
| <p>array_max since 2.4.0 |
| </p> |
| <p>array_min since 2.4.0 |
| </p> |
| <p>array_position since 2.4.0 |
| </p> |
| <p>array_remove since 2.4.0 |
| </p> |
| <p>array_repeat since 2.4.0 |
| </p> |
| <p>array_sort since 2.4.0 |
| </p> |
| <p>array_transform since 3.1.0 |
| </p> |
| <p>arrays_overlap since 2.4.0 |
| </p> |
| <p>array_union since 2.4.0 |
| </p> |
| <p>arrays_zip since 2.4.0 |
| </p> |
| <p>arrays_zip_with since 3.1.0 |
| </p> |
| <p>shuffle since 2.4.0 |
| </p> |
| <p>flatten since 2.4.0 |
| </p> |
| <p>map_concat since 3.0.0 |
| </p> |
| <p>map_entries since 3.0.0 |
| </p> |
| <p>map_filter since 3.1.0 |
| </p> |
| <p>map_from_arrays since 2.4.0 |
| </p> |
| <p>map_from_entries since 3.0.0 |
| </p> |
| <p>map_keys since 2.3.0 |
| </p> |
| <p>transform_keys since 3.1.0 |
| </p> |
| <p>transform_values since 3.1.0 |
| </p> |
| <p>map_values since 2.3.0 |
| </p> |
| <p>map_zip_with since 3.1.0 |
| </p> |
| <p>element_at since 2.4.0 |
| </p> |
| <p>explode since 1.5.0 |
| </p> |
| <p>size since 1.5.0 |
| </p> |
| <p>slice since 2.4.0 |
| </p> |
| <p>sort_array since 1.6.0 |
| </p> |
| <p>posexplode since 2.1.0 |
| </p> |
| <p>explode_outer since 2.3.0 |
| </p> |
| <p>posexplode_outer since 2.3.0 |
| </p> |
| |
| |
| <h3>Examples</h3> |
| |
| <pre><code class="language-r">## Not run: |
| ##D # Dataframe used throughout this doc |
| ##D df <- createDataFrame(cbind(model = rownames(mtcars), mtcars)) |
| ##D tmp <- mutate(df, v1 = create_array(df$mpg, df$cyl, df$hp)) |
| ##D head(select(tmp, array_contains(tmp$v1, 21), size(tmp$v1), shuffle(tmp$v1))) |
| ##D head(select(tmp, array_max(tmp$v1), array_min(tmp$v1), array_distinct(tmp$v1))) |
| ##D head(select(tmp, array_position(tmp$v1, 21), array_repeat(df$mpg, 3), array_sort(tmp$v1))) |
| ##D head(select(tmp, reverse(tmp$v1), array_remove(tmp$v1, 21))) |
| ##D head(select(tmp, array_transform("v1", function(x) x * 10))) |
| ##D head(select(tmp, array_exists("v1", function(x) x > 120))) |
| ##D head(select(tmp, array_forall("v1", function(x) x >= 8.0))) |
| ##D head(select(tmp, array_filter("v1", function(x) x < 10))) |
| ##D head(select(tmp, array_aggregate("v1", lit(0), function(acc, y) acc + y))) |
| ##D head(select( |
| ##D tmp, |
| ##D array_aggregate("v1", lit(0), function(acc, y) acc + y, function(acc) acc / 10))) |
| ##D tmp2 <- mutate(tmp, v2 = explode(tmp$v1)) |
| ##D head(tmp2) |
| ##D head(select(tmp, posexplode(tmp$v1))) |
| ##D head(select(tmp, slice(tmp$v1, 2L, 2L))) |
| ##D head(select(tmp, sort_array(tmp$v1))) |
| ##D head(select(tmp, sort_array(tmp$v1, asc = FALSE))) |
| ##D tmp3 <- mutate(df, v3 = create_map(df$model, df$cyl)) |
| ##D head(select(tmp3, map_entries(tmp3$v3), map_keys(tmp3$v3), map_values(tmp3$v3))) |
| ##D head(select(tmp3, element_at(tmp3$v3, "Valiant"), map_concat(tmp3$v3, tmp3$v3))) |
| ##D head(select(tmp3, transform_keys("v3", function(k, v) upper(k)))) |
| ##D head(select(tmp3, transform_values("v3", function(k, v) v * 10))) |
| ##D head(select(tmp3, map_filter("v3", function(k, v) v < 42))) |
| ##D tmp4 <- mutate(df, v4 = create_array(df$mpg, df$cyl), v5 = create_array(df$cyl, df$hp)) |
| ##D head(select(tmp4, concat(tmp4$v4, tmp4$v5), arrays_overlap(tmp4$v4, tmp4$v5))) |
| ##D head(select(tmp4, array_except(tmp4$v4, tmp4$v5), array_intersect(tmp4$v4, tmp4$v5))) |
| ##D head(select(tmp4, array_union(tmp4$v4, tmp4$v5))) |
| ##D head(select(tmp4, arrays_zip(tmp4$v4, tmp4$v5))) |
| ##D head(select(tmp, concat(df$mpg, df$cyl, df$hp))) |
| ##D head(select(tmp4, arrays_zip_with(tmp4$v4, tmp4$v5, function(x, y) x * y))) |
| ##D tmp5 <- mutate(df, v6 = create_array(df$model, df$model)) |
| ##D head(select(tmp5, array_join(tmp5$v6, "#"), array_join(tmp5$v6, "#", "NULL"))) |
| ##D tmp6 <- mutate(df, v7 = create_array(create_array(df$model, df$model))) |
| ##D head(select(tmp6, flatten(tmp6$v7))) |
| ##D tmp7 <- mutate(df, v8 = create_array(df$model, df$cyl), v9 = create_array(df$model, df$hp)) |
| ##D head(select(tmp7, arrays_zip_with("v8", "v9", function(x, y) (x * y) %% 3))) |
| ##D head(select(tmp7, map_from_arrays(tmp7$v8, tmp7$v9))) |
| ##D tmp8 <- mutate(df, v10 = create_array(struct(df$model, df$cyl))) |
| ##D head(select(tmp8, map_from_entries(tmp8$v10))) |
| ## End(Not run) |
| |
| ## Not run: |
| ##D # Converts a struct into a JSON object |
| ##D df2 <- sql("SELECT named_struct('date', cast('2000-01-01' as date)) as d") |
| ##D select(df2, to_json(df2$d, dateFormat = 'dd/MM/yyyy')) |
| ##D |
| ##D # Converts an array of structs into a JSON array |
| ##D df2 <- sql("SELECT array(named_struct('name', 'Bob'), named_struct('name', 'Alice')) as people") |
| ##D df2 <- mutate(df2, people_json = to_json(df2$people)) |
| ##D |
| ##D # Converts a map into a JSON object |
| ##D df2 <- sql("SELECT map('name', 'Bob') as people") |
| ##D df2 <- mutate(df2, people_json = to_json(df2$people)) |
| ##D |
| ##D # Converts an array of maps into a JSON array |
| ##D df2 <- sql("SELECT array(map('name', 'Bob'), map('name', 'Alice')) as people") |
| ##D df2 <- mutate(df2, people_json = to_json(df2$people)) |
| ##D |
| ##D # Converts a map into a pretty JSON object |
| ##D df2 <- sql("SELECT map('name', 'Bob') as people") |
| ##D df2 <- mutate(df2, people_json = to_json(df2$people, pretty = TRUE)) |
| ## End(Not run) |
| |
| ## Not run: |
| ##D # Converts a struct into a CSV string |
| ##D df2 <- sql("SELECT named_struct('date', cast('2000-01-01' as date)) as d") |
| ##D select(df2, to_csv(df2$d, dateFormat = 'dd/MM/yyyy')) |
| ## End(Not run) |
| |
| ## Not run: |
| ##D df2 <- sql("SELECT named_struct('date', cast('2000-01-01' as date)) as d") |
| ##D df2 <- mutate(df2, d2 = to_json(df2$d, dateFormat = 'dd/MM/yyyy')) |
| ##D schema <- structType(structField("date", "string")) |
| ##D head(select(df2, from_json(df2$d2, schema, dateFormat = 'dd/MM/yyyy'))) |
| ##D df2 <- sql("SELECT named_struct('name', 'Bob') as people") |
| ##D df2 <- mutate(df2, people_json = to_json(df2$people)) |
| ##D schema <- structType(structField("name", "string")) |
| ##D head(select(df2, from_json(df2$people_json, schema))) |
| ##D head(select(df2, from_json(df2$people_json, "name STRING"))) |
| ##D head(select(df2, from_json(df2$people_json, schema_of_json(head(df2)$people_json)))) |
| ## End(Not run) |
| |
| ## Not run: |
| ##D json <- "{\"name\":\"Bob\"}" |
| ##D df <- sql("SELECT * FROM range(1)") |
| ##D head(select(df, schema_of_json(json))) |
| ## End(Not run) |
| |
| ## Not run: |
| ##D csv <- "Amsterdam,2018" |
| ##D df <- sql(paste0("SELECT '", csv, "' as csv")) |
| ##D schema <- "city STRING, year INT" |
| ##D head(select(df, from_csv(df$csv, schema))) |
| ##D head(select(df, from_csv(df$csv, structType(schema)))) |
| ##D head(select(df, from_csv(df$csv, schema_of_csv(csv)))) |
| ## End(Not run) |
| |
| ## Not run: |
| ##D csv <- "Amsterdam,2018" |
| ##D df <- sql("SELECT * FROM range(1)") |
| ##D head(select(df, schema_of_csv(csv))) |
| ## End(Not run) |
| |
| ## Not run: |
| ##D df2 <- createDataFrame(data.frame( |
| ##D id = c(1, 2, 3), text = c("a,b,c", NA, "d,e") |
| ##D )) |
| ##D |
| ##D head(select(df2, df2$id, explode_outer(split_string(df2$text, ",")))) |
| ##D head(select(df2, df2$id, posexplode_outer(split_string(df2$text, ",")))) |
| ## End(Not run) |
| </code></pre> |
| |
| |
| <hr /><div style="text-align: center;">[Package <em>SparkR</em> version 3.2.3 <a href="00Index.html">Index</a>]</div> |
| </div> |
| </body></html> |