| <!DOCTYPE html><html><head><title>R: String functions for Column operations</title> |
| <meta http-equiv="Content-Type" content="text/html; charset=utf-8" /> |
| <meta name="viewport" content="width=device-width, initial-scale=1.0, user-scalable=yes" /> |
| <link rel="stylesheet" href="https://cdn.jsdelivr.net/npm/katex@0.15.3/dist/katex.min.css"> |
| <script type="text/javascript"> |
| const macros = { "\\R": "\\textsf{R}", "\\code": "\\texttt"}; |
| function processMathHTML() { |
| var l = document.getElementsByClassName('reqn'); |
| for (let e of l) { katex.render(e.textContent, e, { throwOnError: false, macros }); } |
| return; |
| }</script> |
| <script defer src="https://cdn.jsdelivr.net/npm/katex@0.15.3/dist/katex.min.js" |
| onload="processMathHTML();"></script> |
| <link rel="stylesheet" type="text/css" href="R.css" /> |
| |
| <link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/highlight.js/8.3/styles/github.min.css"> |
| <script src="https://cdnjs.cloudflare.com/ajax/libs/highlight.js/8.3/highlight.min.js"></script> |
| <script src="https://cdnjs.cloudflare.com/ajax/libs/highlight.js/8.3/languages/r.min.js"></script> |
| <script>hljs.initHighlightingOnLoad();</script> |
| </head><body><div class="container"> |
| |
| <table style="width: 100%;"><tr><td>column_string_functions {SparkR}</td><td style="text-align: right;">R Documentation</td></tr></table> |
| |
| <h2>String functions for Column operations</h2> |
| |
| <h3>Description</h3> |
| |
| <p>String functions defined for <code>Column</code>. |
| </p> |
| |
| |
| <h3>Usage</h3> |
| |
| <pre><code class='language-R'>ascii(x) |
| |
| base64(x) |
| |
| concat_ws(sep, x, ...) |
| |
| decode(x, charset) |
| |
| encode(x, charset) |
| |
| format_number(y, x) |
| |
| format_string(format, x, ...) |
| |
| initcap(x) |
| |
| instr(y, x) |
| |
| levenshtein(y, x) |
| |
| locate(substr, str, ...) |
| |
| lower(x) |
| |
| lpad(x, len, pad) |
| |
| ltrim(x, trimString) |
| |
| overlay(x, replace, pos, ...) |
| |
| regexp_extract(x, pattern, idx) |
| |
| regexp_replace(x, pattern, replacement) |
| |
| repeat_string(x, n) |
| |
| rpad(x, len, pad) |
| |
| rtrim(x, trimString) |
| |
| split_string(x, pattern, ...) |
| |
| soundex(x) |
| |
| substring_index(x, delim, count) |
| |
| translate(x, matchingString, replaceString) |
| |
| trim(x, trimString) |
| |
| unbase64(x) |
| |
| upper(x) |
| |
| ## S4 method for signature 'Column' |
| ascii(x) |
| |
| ## S4 method for signature 'Column' |
| base64(x) |
| |
| ## S4 method for signature 'Column,character' |
| decode(x, charset) |
| |
| ## S4 method for signature 'Column,character' |
| encode(x, charset) |
| |
| ## S4 method for signature 'Column' |
| initcap(x) |
| |
| ## S4 method for signature 'Column' |
| length(x) |
| |
| ## S4 method for signature 'Column' |
| lower(x) |
| |
| ## S4 method for signature 'Column,missing' |
| ltrim(x, trimString) |
| |
| ## S4 method for signature 'Column,character' |
| ltrim(x, trimString) |
| |
| ## S4 method for signature 'Column,Column,numericOrColumn' |
| overlay(x, replace, pos, len = -1) |
| |
| ## S4 method for signature 'Column,missing' |
| rtrim(x, trimString) |
| |
| ## S4 method for signature 'Column,character' |
| rtrim(x, trimString) |
| |
| ## S4 method for signature 'Column' |
| soundex(x) |
| |
| ## S4 method for signature 'Column,missing' |
| trim(x, trimString) |
| |
| ## S4 method for signature 'Column,character' |
| trim(x, trimString) |
| |
| ## S4 method for signature 'Column' |
| unbase64(x) |
| |
| ## S4 method for signature 'Column' |
| upper(x) |
| |
| ## S4 method for signature 'Column' |
| levenshtein(y, x) |
| |
| ## S4 method for signature 'Column,character' |
| instr(y, x) |
| |
| ## S4 method for signature 'Column,numeric' |
| format_number(y, x) |
| |
| ## S4 method for signature 'character,Column' |
| concat_ws(sep, x, ...) |
| |
| ## S4 method for signature 'character,Column' |
| format_string(format, x, ...) |
| |
| ## S4 method for signature 'character,Column' |
| locate(substr, str, pos = 1) |
| |
| ## S4 method for signature 'Column,numeric,character' |
| lpad(x, len, pad) |
| |
| ## S4 method for signature 'Column,character,numeric' |
| regexp_extract(x, pattern, idx) |
| |
| ## S4 method for signature 'Column,character,character' |
| regexp_replace(x, pattern, replacement) |
| |
| ## S4 method for signature 'Column,numeric,character' |
| rpad(x, len, pad) |
| |
| ## S4 method for signature 'Column,character,numeric' |
| substring_index(x, delim, count) |
| |
| ## S4 method for signature 'Column,character,character' |
| translate(x, matchingString, replaceString) |
| |
| ## S4 method for signature 'Column,character' |
| split_string(x, pattern, limit = -1) |
| |
| ## S4 method for signature 'Column,numeric' |
| repeat_string(x, n) |
| </code></pre> |
| |
| |
| <h3>Arguments</h3> |
| |
| <table> |
| <tr style="vertical-align: top;"><td><code>x</code></td> |
| <td> |
| <p>Column to compute on except in the following methods: |
| </p> |
| |
| <ul> |
| <li> <p><code>instr</code>: <code>character</code>, the substring to check. See 'Details'. |
| </p> |
| </li> |
| <li> <p><code>format_number</code>: <code>numeric</code>, the number of decimal place to |
| format to. See 'Details'. |
| </p> |
| </li></ul> |
| </td></tr> |
| <tr style="vertical-align: top;"><td><code>sep</code></td> |
| <td> |
| <p>separator to use.</p> |
| </td></tr> |
| <tr style="vertical-align: top;"><td><code>...</code></td> |
| <td> |
| <p>additional Columns.</p> |
| </td></tr> |
| <tr style="vertical-align: top;"><td><code>charset</code></td> |
| <td> |
| <p>character set to use (one of "US-ASCII", "ISO-8859-1", "UTF-8", "UTF-16BE", |
| "UTF-16LE", "UTF-16").</p> |
| </td></tr> |
| <tr style="vertical-align: top;"><td><code>y</code></td> |
| <td> |
| <p>Column to compute on.</p> |
| </td></tr> |
| <tr style="vertical-align: top;"><td><code>format</code></td> |
| <td> |
| <p>a character object of format strings.</p> |
| </td></tr> |
| <tr style="vertical-align: top;"><td><code>substr</code></td> |
| <td> |
| <p>a character string to be matched.</p> |
| </td></tr> |
| <tr style="vertical-align: top;"><td><code>str</code></td> |
| <td> |
| <p>a Column where matches are sought for each entry.</p> |
| </td></tr> |
| <tr style="vertical-align: top;"><td><code>len</code></td> |
| <td> |
| <p>In </p> |
| |
| <ul> |
| <li> <p><code>lpad</code> the maximum length of each output result. |
| </p> |
| </li> |
| <li> <p><code>overlay</code> a number of bytes to replace. |
| </p> |
| </li></ul> |
| </td></tr> |
| <tr style="vertical-align: top;"><td><code>pad</code></td> |
| <td> |
| <p>a character string to be padded with.</p> |
| </td></tr> |
| <tr style="vertical-align: top;"><td><code>trimString</code></td> |
| <td> |
| <p>a character string to trim with</p> |
| </td></tr> |
| <tr style="vertical-align: top;"><td><code>replace</code></td> |
| <td> |
| <p>a Column with replacement.</p> |
| </td></tr> |
| <tr style="vertical-align: top;"><td><code>pos</code></td> |
| <td> |
| <p>In </p> |
| |
| <ul> |
| <li> <p><code>locate</code>: a start position of search. |
| </p> |
| </li> |
| <li> <p><code>overlay</code>: a start position for replacement. |
| </p> |
| </li></ul> |
| </td></tr> |
| <tr style="vertical-align: top;"><td><code>pattern</code></td> |
| <td> |
| <p>a regular expression.</p> |
| </td></tr> |
| <tr style="vertical-align: top;"><td><code>idx</code></td> |
| <td> |
| <p>a group index.</p> |
| </td></tr> |
| <tr style="vertical-align: top;"><td><code>replacement</code></td> |
| <td> |
| <p>a character string that a matched <code>pattern</code> is replaced with.</p> |
| </td></tr> |
| <tr style="vertical-align: top;"><td><code>n</code></td> |
| <td> |
| <p>number of repetitions.</p> |
| </td></tr> |
| <tr style="vertical-align: top;"><td><code>delim</code></td> |
| <td> |
| <p>a delimiter string.</p> |
| </td></tr> |
| <tr style="vertical-align: top;"><td><code>count</code></td> |
| <td> |
| <p>number of occurrences of <code>delim</code> before the substring is returned. |
| A positive number means counting from the left, while negative means |
| counting from the right.</p> |
| </td></tr> |
| <tr style="vertical-align: top;"><td><code>matchingString</code></td> |
| <td> |
| <p>a source string where each character will be translated.</p> |
| </td></tr> |
| <tr style="vertical-align: top;"><td><code>replaceString</code></td> |
| <td> |
| <p>a target string where each <code>matchingString</code> character will |
| be replaced by the character in <code>replaceString</code> |
| at the same location, if any.</p> |
| </td></tr> |
| <tr style="vertical-align: top;"><td><code>limit</code></td> |
| <td> |
| <p>determines the length of the returned array. |
| </p> |
| |
| <ul> |
| <li> <p><code>limit > 0</code>: length of the array will be at most <code>limit</code> |
| </p> |
| </li> |
| <li> <p><code>limit <= 0</code>: the returned array can have any length |
| </p> |
| </li></ul> |
| </td></tr> |
| </table> |
| |
| |
| <h3>Details</h3> |
| |
| <p><code>ascii</code>: Computes the numeric value of the first character of the string column, |
| and returns the result as an int column. |
| </p> |
| <p><code>base64</code>: Computes the BASE64 encoding of a binary column and returns it as |
| a string column. This is the reverse of unbase64. |
| </p> |
| <p><code>decode</code>: Computes the first argument into a string from a binary using the provided |
| character set. |
| </p> |
| <p><code>encode</code>: Computes the first argument into a binary from a string using the provided |
| character set. |
| </p> |
| <p><code>initcap</code>: Returns a new string column by converting the first letter of |
| each word to uppercase. Words are delimited by whitespace. For example, "hello world" |
| will become "Hello World". |
| </p> |
| <p><code>length</code>: Computes the character length of a string data or number of bytes |
| of a binary data. The length of string data includes the trailing spaces. |
| The length of binary data includes binary zeros. |
| </p> |
| <p><code>lower</code>: Converts a string column to lower case. |
| </p> |
| <p><code>ltrim</code>: Trims the spaces from left end for the specified string value. Optionally a |
| <code>trimString</code> can be specified. |
| </p> |
| <p><code>overlay</code>: Overlay the specified portion of <code>x</code> with <code>replace</code>, |
| starting from byte position <code>pos</code> of <code>src</code> and proceeding for |
| <code>len</code> bytes. |
| </p> |
| <p><code>rtrim</code>: Trims the spaces from right end for the specified string value. Optionally a |
| <code>trimString</code> can be specified. |
| </p> |
| <p><code>soundex</code>: Returns the soundex code for the specified expression. |
| </p> |
| <p><code>trim</code>: Trims the spaces from both ends for the specified string column. Optionally a |
| <code>trimString</code> can be specified. |
| </p> |
| <p><code>unbase64</code>: Decodes a BASE64 encoded string column and returns it as a binary column. |
| This is the reverse of base64. |
| </p> |
| <p><code>upper</code>: Converts a string column to upper case. |
| </p> |
| <p><code>levenshtein</code>: Computes the Levenshtein distance of the two given string columns. |
| </p> |
| <p><code>instr</code>: Locates the position of the first occurrence of a substring (<code>x</code>) |
| in the given string column (<code>y</code>). Returns null if either of the arguments are null. |
| Note: The position is not zero based, but 1 based index. Returns 0 if the substring |
| could not be found in the string column. |
| </p> |
| <p><code>format_number</code>: Formats numeric column <code>y</code> to a format like '#,###,###.##', |
| rounded to <code>x</code> decimal places with HALF_EVEN round mode, and returns the result |
| as a string column. |
| If <code>x</code> is 0, the result has no decimal point or fractional part. |
| If <code>x</code> < 0, the result will be null. |
| </p> |
| <p><code>concat_ws</code>: Concatenates multiple input string columns together into a single |
| string column, using the given separator. |
| </p> |
| <p><code>format_string</code>: Formats the arguments in printf-style and returns the result |
| as a string column. |
| </p> |
| <p><code>locate</code>: Locates the position of the first occurrence of substr. |
| Note: The position is not zero based, but 1 based index. Returns 0 if substr |
| could not be found in str. |
| </p> |
| <p><code>lpad</code>: Left-padded with pad to a length of len. |
| </p> |
| <p><code>regexp_extract</code>: Extracts a specific <code>idx</code> group identified by a Java regex, |
| from the specified string column. If the regex did not match, or the specified group did |
| not match, an empty string is returned. |
| </p> |
| <p><code>regexp_replace</code>: Replaces all substrings of the specified string value that |
| match regexp with rep. |
| </p> |
| <p><code>rpad</code>: Right-padded with pad to a length of len. |
| </p> |
| <p><code>substring_index</code>: Returns the substring from string (<code>x</code>) before <code>count</code> |
| occurrences of the delimiter (<code>delim</code>). If <code>count</code> is positive, everything the left of |
| the final delimiter (counting from left) is returned. If <code>count</code> is negative, every to the |
| right of the final delimiter (counting from the right) is returned. <code>substring_index</code> |
| performs a case-sensitive match when searching for the delimiter. |
| </p> |
| <p><code>translate</code>: Translates any character in the src by a character in replaceString. |
| The characters in replaceString is corresponding to the characters in matchingString. |
| The translate will happen when any character in the string matching with the character |
| in the matchingString. |
| </p> |
| <p><code>split_string</code>: Splits string on regular expression. |
| Equivalent to <code>split</code> SQL function. Optionally a |
| <code>limit</code> can be specified |
| </p> |
| <p><code>repeat_string</code>: Repeats string n times. |
| Equivalent to <code>repeat</code> SQL function. |
| </p> |
| |
| |
| <h3>Note</h3> |
| |
| <p>ascii since 1.5.0 |
| </p> |
| <p>base64 since 1.5.0 |
| </p> |
| <p>decode since 1.6.0 |
| </p> |
| <p>encode since 1.6.0 |
| </p> |
| <p>initcap since 1.5.0 |
| </p> |
| <p>length since 1.5.0 |
| </p> |
| <p>lower since 1.4.0 |
| </p> |
| <p>ltrim since 1.5.0 |
| </p> |
| <p>ltrim(Column, character) since 2.3.0 |
| </p> |
| <p>overlay since 3.0.0 |
| </p> |
| <p>rtrim since 1.5.0 |
| </p> |
| <p>rtrim(Column, character) since 2.3.0 |
| </p> |
| <p>soundex since 1.5.0 |
| </p> |
| <p>trim since 1.5.0 |
| </p> |
| <p>trim(Column, character) since 2.3.0 |
| </p> |
| <p>unbase64 since 1.5.0 |
| </p> |
| <p>upper since 1.4.0 |
| </p> |
| <p>levenshtein since 1.5.0 |
| </p> |
| <p>instr since 1.5.0 |
| </p> |
| <p>format_number since 1.5.0 |
| </p> |
| <p>concat_ws since 1.5.0 |
| </p> |
| <p>format_string since 1.5.0 |
| </p> |
| <p>locate since 1.5.0 |
| </p> |
| <p>lpad since 1.5.0 |
| </p> |
| <p>regexp_extract since 1.5.0 |
| </p> |
| <p>regexp_replace since 1.5.0 |
| </p> |
| <p>rpad since 1.5.0 |
| </p> |
| <p>substring_index since 1.5.0 |
| </p> |
| <p>translate since 1.5.0 |
| </p> |
| <p>split_string 2.3.0 |
| </p> |
| <p>repeat_string since 2.3.0 |
| </p> |
| |
| |
| <h3>Examples</h3> |
| |
| <pre><code class="r">## Not run: |
| ##D # Dataframe used throughout this doc |
| ##D df <- createDataFrame(as.data.frame(Titanic, stringsAsFactors = FALSE)) |
| ## End(Not run) |
| |
| ## Not run: |
| ##D head(select(df, ascii(df$Class), ascii(df$Sex))) |
| ## End(Not run) |
| |
| ## Not run: |
| ##D tmp <- mutate(df, s1 = encode(df$Class, "UTF-8")) |
| ##D str(tmp) |
| ##D tmp2 <- mutate(tmp, s2 = base64(tmp$s1), s3 = decode(tmp$s1, "UTF-8"), |
| ##D s4 = soundex(tmp$Sex)) |
| ##D head(tmp2) |
| ##D head(select(tmp2, unbase64(tmp2$s2))) |
| ## End(Not run) |
| |
| ## Not run: |
| ##D tmp <- mutate(df, sex_lower = lower(df$Sex), age_upper = upper(df$age), |
| ##D sex_age = concat_ws(" ", lower(df$sex), lower(df$age))) |
| ##D head(tmp) |
| ##D tmp2 <- mutate(tmp, s1 = initcap(tmp$sex_lower), s2 = initcap(tmp$sex_age), |
| ##D s3 = reverse(df$Sex)) |
| ##D head(tmp2) |
| ## End(Not run) |
| |
| ## Not run: |
| ##D tmp <- mutate(df, SexLpad = lpad(df$Sex, 6, " "), SexRpad = rpad(df$Sex, 7, " ")) |
| ##D head(select(tmp, length(tmp$Sex), length(tmp$SexLpad), length(tmp$SexRpad))) |
| ##D tmp2 <- mutate(tmp, SexLtrim = ltrim(tmp$SexLpad), SexRtrim = rtrim(tmp$SexRpad), |
| ##D SexTrim = trim(tmp$SexLpad)) |
| ##D head(select(tmp2, length(tmp2$Sex), length(tmp2$SexLtrim), |
| ##D length(tmp2$SexRtrim), length(tmp2$SexTrim))) |
| ##D |
| ##D tmp <- mutate(df, SexLpad = lpad(df$Sex, 6, "xx"), SexRpad = rpad(df$Sex, 7, "xx")) |
| ##D head(tmp) |
| ## End(Not run) |
| |
| ## Not run: |
| ##D tmp <- mutate(df, d1 = levenshtein(df$Class, df$Sex), |
| ##D d2 = levenshtein(df$Age, df$Sex), |
| ##D d3 = levenshtein(df$Age, df$Age)) |
| ##D head(tmp) |
| ## End(Not run) |
| |
| ## Not run: |
| ##D tmp <- mutate(df, s1 = instr(df$Sex, "m"), s2 = instr(df$Sex, "M"), |
| ##D s3 = locate("m", df$Sex), s4 = locate("m", df$Sex, pos = 4)) |
| ##D head(tmp) |
| ## End(Not run) |
| |
| ## Not run: |
| ##D tmp <- mutate(df, v1 = df$Freq/3) |
| ##D head(select(tmp, format_number(tmp$v1, 0), format_number(tmp$v1, 2), |
| ##D format_string("%4.2f %s", tmp$v1, tmp$Sex)), 10) |
| ## End(Not run) |
| |
| ## Not run: |
| ##D # concatenate strings |
| ##D tmp <- mutate(df, s1 = concat_ws("_", df$Class, df$Sex), |
| ##D s2 = concat_ws("+", df$Class, df$Sex, df$Age, df$Survived)) |
| ##D head(tmp) |
| ## End(Not run) |
| |
| ## Not run: |
| ##D tmp <- mutate(df, s1 = regexp_extract(df$Class, "(\\d+)\\w+", 1), |
| ##D s2 = regexp_extract(df$Sex, "^(\\w)\\w+", 1), |
| ##D s3 = regexp_replace(df$Class, "\\D+", ""), |
| ##D s4 = substring_index(df$Sex, "a", 1), |
| ##D s5 = substring_index(df$Sex, "a", -1), |
| ##D s6 = translate(df$Sex, "ale", ""), |
| ##D s7 = translate(df$Sex, "a", "-")) |
| ##D head(tmp) |
| ## End(Not run) |
| |
| ## Not run: |
| ##D head(select(df, split_string(df$Class, "\\d", 2))) |
| ##D head(select(df, split_string(df$Sex, "a"))) |
| ##D head(select(df, split_string(df$Class, "\\d"))) |
| ##D # This is equivalent to the following SQL expression |
| ##D head(selectExpr(df, "split(Class, '\\\\d')")) |
| ## End(Not run) |
| |
| ## Not run: |
| ##D head(select(df, repeat_string(df$Class, 3))) |
| ##D # This is equivalent to the following SQL expression |
| ##D head(selectExpr(df, "repeat(Class, 3)")) |
| ## End(Not run) |
| </code></pre> |
| |
| |
| <hr /><div style="text-align: center;">[Package <em>SparkR</em> version 3.2.2 <a href="00Index.html">Index</a>]</div> |
| </div> |
| </body></html> |