blob: f03e155d31e38b453b14acacb9654dcb138688eb [file] [log] [blame]
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd"><html xmlns="http://www.w3.org/1999/xhtml"><head><title>R: String functions for Column operations</title>
<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
<link rel="stylesheet" type="text/css" href="R.css" />
<link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/highlight.js/8.3/styles/github.min.css">
<script src="https://cdnjs.cloudflare.com/ajax/libs/highlight.js/8.3/highlight.min.js"></script>
<script src="https://cdnjs.cloudflare.com/ajax/libs/highlight.js/8.3/languages/r.min.js"></script>
<script>hljs.initHighlightingOnLoad();</script>
</head><body>
<table width="100%" summary="page for column_string_functions {SparkR}"><tr><td>column_string_functions {SparkR}</td><td style="text-align: right;">R Documentation</td></tr></table>
<h2>String functions for Column operations</h2>
<h3>Description</h3>
<p>String functions defined for <code>Column</code>.
</p>
<h3>Usage</h3>
<pre>
ascii(x)
base64(x)
concat_ws(sep, x, ...)
decode(x, charset)
encode(x, charset)
format_number(y, x)
format_string(format, x, ...)
initcap(x)
instr(y, x)
levenshtein(y, x)
locate(substr, str, ...)
lower(x)
lpad(x, len, pad)
ltrim(x, trimString)
regexp_extract(x, pattern, idx)
regexp_replace(x, pattern, replacement)
repeat_string(x, n)
rpad(x, len, pad)
rtrim(x, trimString)
split_string(x, pattern)
soundex(x)
substring_index(x, delim, count)
translate(x, matchingString, replaceString)
trim(x, trimString)
unbase64(x)
upper(x)
## S4 method for signature 'Column'
ascii(x)
## S4 method for signature 'Column'
base64(x)
## S4 method for signature 'Column,character'
decode(x, charset)
## S4 method for signature 'Column,character'
encode(x, charset)
## S4 method for signature 'Column'
initcap(x)
## S4 method for signature 'Column'
length(x)
## S4 method for signature 'Column'
lower(x)
## S4 method for signature 'Column,missing'
ltrim(x, trimString)
## S4 method for signature 'Column,character'
ltrim(x, trimString)
## S4 method for signature 'Column,missing'
rtrim(x, trimString)
## S4 method for signature 'Column,character'
rtrim(x, trimString)
## S4 method for signature 'Column'
soundex(x)
## S4 method for signature 'Column,missing'
trim(x, trimString)
## S4 method for signature 'Column,character'
trim(x, trimString)
## S4 method for signature 'Column'
unbase64(x)
## S4 method for signature 'Column'
upper(x)
## S4 method for signature 'Column'
levenshtein(y, x)
## S4 method for signature 'Column,character'
instr(y, x)
## S4 method for signature 'Column,numeric'
format_number(y, x)
## S4 method for signature 'character,Column'
concat_ws(sep, x, ...)
## S4 method for signature 'character,Column'
format_string(format, x, ...)
## S4 method for signature 'character,Column'
locate(substr, str, pos = 1)
## S4 method for signature 'Column,numeric,character'
lpad(x, len, pad)
## S4 method for signature 'Column,character,numeric'
regexp_extract(x, pattern, idx)
## S4 method for signature 'Column,character,character'
regexp_replace(x, pattern, replacement)
## S4 method for signature 'Column,numeric,character'
rpad(x, len, pad)
## S4 method for signature 'Column,character,numeric'
substring_index(x, delim, count)
## S4 method for signature 'Column,character,character'
translate(x, matchingString, replaceString)
## S4 method for signature 'Column,character'
split_string(x, pattern)
## S4 method for signature 'Column,numeric'
repeat_string(x, n)
</pre>
<h3>Arguments</h3>
<table summary="R argblock">
<tr valign="top"><td><code>x</code></td>
<td>
<p>Column to compute on except in the following methods:
</p>
<ul>
<li> <p><code>instr</code>: <code>character</code>, the substring to check. See 'Details'.
</p>
</li>
<li> <p><code>format_number</code>: <code>numeric</code>, the number of decimal place to
format to. See 'Details'.
</p>
</li></ul>
</td></tr>
<tr valign="top"><td><code>sep</code></td>
<td>
<p>separator to use.</p>
</td></tr>
<tr valign="top"><td><code>...</code></td>
<td>
<p>additional Columns.</p>
</td></tr>
<tr valign="top"><td><code>charset</code></td>
<td>
<p>character set to use (one of &quot;US-ASCII&quot;, &quot;ISO-8859-1&quot;, &quot;UTF-8&quot;, &quot;UTF-16BE&quot;,
&quot;UTF-16LE&quot;, &quot;UTF-16&quot;).</p>
</td></tr>
<tr valign="top"><td><code>y</code></td>
<td>
<p>Column to compute on.</p>
</td></tr>
<tr valign="top"><td><code>format</code></td>
<td>
<p>a character object of format strings.</p>
</td></tr>
<tr valign="top"><td><code>substr</code></td>
<td>
<p>a character string to be matched.</p>
</td></tr>
<tr valign="top"><td><code>str</code></td>
<td>
<p>a Column where matches are sought for each entry.</p>
</td></tr>
<tr valign="top"><td><code>len</code></td>
<td>
<p>maximum length of each output result.</p>
</td></tr>
<tr valign="top"><td><code>pad</code></td>
<td>
<p>a character string to be padded with.</p>
</td></tr>
<tr valign="top"><td><code>trimString</code></td>
<td>
<p>a character string to trim with</p>
</td></tr>
<tr valign="top"><td><code>pattern</code></td>
<td>
<p>a regular expression.</p>
</td></tr>
<tr valign="top"><td><code>idx</code></td>
<td>
<p>a group index.</p>
</td></tr>
<tr valign="top"><td><code>replacement</code></td>
<td>
<p>a character string that a matched <code>pattern</code> is replaced with.</p>
</td></tr>
<tr valign="top"><td><code>n</code></td>
<td>
<p>number of repetitions.</p>
</td></tr>
<tr valign="top"><td><code>delim</code></td>
<td>
<p>a delimiter string.</p>
</td></tr>
<tr valign="top"><td><code>count</code></td>
<td>
<p>number of occurrences of <code>delim</code> before the substring is returned.
A positive number means counting from the left, while negative means
counting from the right.</p>
</td></tr>
<tr valign="top"><td><code>matchingString</code></td>
<td>
<p>a source string where each character will be translated.</p>
</td></tr>
<tr valign="top"><td><code>replaceString</code></td>
<td>
<p>a target string where each <code>matchingString</code> character will
be replaced by the character in <code>replaceString</code>
at the same location, if any.</p>
</td></tr>
<tr valign="top"><td><code>pos</code></td>
<td>
<p>start position of search.</p>
</td></tr>
</table>
<h3>Details</h3>
<p><code>ascii</code>: Computes the numeric value of the first character of the string column,
and returns the result as an int column.
</p>
<p><code>base64</code>: Computes the BASE64 encoding of a binary column and returns it as
a string column. This is the reverse of unbase64.
</p>
<p><code>decode</code>: Computes the first argument into a string from a binary using the provided
character set.
</p>
<p><code>encode</code>: Computes the first argument into a binary from a string using the provided
character set.
</p>
<p><code>initcap</code>: Returns a new string column by converting the first letter of
each word to uppercase. Words are delimited by whitespace. For example, &quot;hello world&quot;
will become &quot;Hello World&quot;.
</p>
<p><code>length</code>: Computes the character length of a string data or number of bytes
of a binary data. The length of string data includes the trailing spaces.
The length of binary data includes binary zeros.
</p>
<p><code>lower</code>: Converts a string column to lower case.
</p>
<p><code>ltrim</code>: Trims the spaces from left end for the specified string value. Optionally a
<code>trimString</code> can be specified.
</p>
<p><code>rtrim</code>: Trims the spaces from right end for the specified string value. Optionally a
<code>trimString</code> can be specified.
</p>
<p><code>soundex</code>: Returns the soundex code for the specified expression.
</p>
<p><code>trim</code>: Trims the spaces from both ends for the specified string column. Optionally a
<code>trimString</code> can be specified.
</p>
<p><code>unbase64</code>: Decodes a BASE64 encoded string column and returns it as a binary column.
This is the reverse of base64.
</p>
<p><code>upper</code>: Converts a string column to upper case.
</p>
<p><code>levenshtein</code>: Computes the Levenshtein distance of the two given string columns.
</p>
<p><code>instr</code>: Locates the position of the first occurrence of a substring (<code>x</code>)
in the given string column (<code>y</code>). Returns null if either of the arguments are null.
Note: The position is not zero based, but 1 based index. Returns 0 if the substring
could not be found in the string column.
</p>
<p><code>format_number</code>: Formats numeric column <code>y</code> to a format like '#,###,###.##',
rounded to <code>x</code> decimal places with HALF_EVEN round mode, and returns the result
as a string column.
If <code>x</code> is 0, the result has no decimal point or fractional part.
If <code>x</code> &lt; 0, the result will be null.
</p>
<p><code>concat_ws</code>: Concatenates multiple input string columns together into a single
string column, using the given separator.
</p>
<p><code>format_string</code>: Formats the arguments in printf-style and returns the result
as a string column.
</p>
<p><code>locate</code>: Locates the position of the first occurrence of substr.
Note: The position is not zero based, but 1 based index. Returns 0 if substr
could not be found in str.
</p>
<p><code>lpad</code>: Left-padded with pad to a length of len.
</p>
<p><code>regexp_extract</code>: Extracts a specific <code>idx</code> group identified by a Java regex,
from the specified string column. If the regex did not match, or the specified group did
not match, an empty string is returned.
</p>
<p><code>regexp_replace</code>: Replaces all substrings of the specified string value that
match regexp with rep.
</p>
<p><code>rpad</code>: Right-padded with pad to a length of len.
</p>
<p><code>substring_index</code>: Returns the substring from string (<code>x</code>) before <code>count</code>
occurrences of the delimiter (<code>delim</code>). If <code>count</code> is positive, everything the left of
the final delimiter (counting from left) is returned. If <code>count</code> is negative, every to the
right of the final delimiter (counting from the right) is returned. <code>substring_index</code>
performs a case-sensitive match when searching for the delimiter.
</p>
<p><code>translate</code>: Translates any character in the src by a character in replaceString.
The characters in replaceString is corresponding to the characters in matchingString.
The translate will happen when any character in the string matching with the character
in the matchingString.
</p>
<p><code>split_string</code>: Splits string on regular expression.
Equivalent to <code>split</code> SQL function.
</p>
<p><code>repeat_string</code>: Repeats string n times.
Equivalent to <code>repeat</code> SQL function.
</p>
<h3>Note</h3>
<p>ascii since 1.5.0
</p>
<p>base64 since 1.5.0
</p>
<p>decode since 1.6.0
</p>
<p>encode since 1.6.0
</p>
<p>initcap since 1.5.0
</p>
<p>length since 1.5.0
</p>
<p>lower since 1.4.0
</p>
<p>ltrim since 1.5.0
</p>
<p>ltrim(Column, character) since 2.3.0
</p>
<p>rtrim since 1.5.0
</p>
<p>rtrim(Column, character) since 2.3.0
</p>
<p>soundex since 1.5.0
</p>
<p>trim since 1.5.0
</p>
<p>trim(Column, character) since 2.3.0
</p>
<p>unbase64 since 1.5.0
</p>
<p>upper since 1.4.0
</p>
<p>levenshtein since 1.5.0
</p>
<p>instr since 1.5.0
</p>
<p>format_number since 1.5.0
</p>
<p>concat_ws since 1.5.0
</p>
<p>format_string since 1.5.0
</p>
<p>locate since 1.5.0
</p>
<p>lpad since 1.5.0
</p>
<p>regexp_extract since 1.5.0
</p>
<p>regexp_replace since 1.5.0
</p>
<p>rpad since 1.5.0
</p>
<p>substring_index since 1.5.0
</p>
<p>translate since 1.5.0
</p>
<p>split_string 2.3.0
</p>
<p>repeat_string since 2.3.0
</p>
<h3>Examples</h3>
<pre><code class="r">## Not run:
##D # Dataframe used throughout this doc
##D df &lt;- createDataFrame(as.data.frame(Titanic, stringsAsFactors = FALSE))
## End(Not run)
## Not run:
##D head(select(df, ascii(df$Class), ascii(df$Sex)))
## End(Not run)
## Not run:
##D tmp &lt;- mutate(df, s1 = encode(df$Class, &quot;UTF-8&quot;))
##D str(tmp)
##D tmp2 &lt;- mutate(tmp, s2 = base64(tmp$s1), s3 = decode(tmp$s1, &quot;UTF-8&quot;),
##D s4 = soundex(tmp$Sex))
##D head(tmp2)
##D head(select(tmp2, unbase64(tmp2$s2)))
## End(Not run)
## Not run:
##D tmp &lt;- mutate(df, sex_lower = lower(df$Sex), age_upper = upper(df$age),
##D sex_age = concat_ws(&quot; &quot;, lower(df$sex), lower(df$age)))
##D head(tmp)
##D tmp2 &lt;- mutate(tmp, s1 = initcap(tmp$sex_lower), s2 = initcap(tmp$sex_age),
##D s3 = reverse(df$Sex))
##D head(tmp2)
## End(Not run)
## Not run:
##D tmp &lt;- mutate(df, SexLpad = lpad(df$Sex, 6, &quot; &quot;), SexRpad = rpad(df$Sex, 7, &quot; &quot;))
##D head(select(tmp, length(tmp$Sex), length(tmp$SexLpad), length(tmp$SexRpad)))
##D tmp2 &lt;- mutate(tmp, SexLtrim = ltrim(tmp$SexLpad), SexRtrim = rtrim(tmp$SexRpad),
##D SexTrim = trim(tmp$SexLpad))
##D head(select(tmp2, length(tmp2$Sex), length(tmp2$SexLtrim),
##D length(tmp2$SexRtrim), length(tmp2$SexTrim)))
##D
##D tmp &lt;- mutate(df, SexLpad = lpad(df$Sex, 6, &quot;xx&quot;), SexRpad = rpad(df$Sex, 7, &quot;xx&quot;))
##D head(tmp)
## End(Not run)
## Not run:
##D tmp &lt;- mutate(df, d1 = levenshtein(df$Class, df$Sex),
##D d2 = levenshtein(df$Age, df$Sex),
##D d3 = levenshtein(df$Age, df$Age))
##D head(tmp)
## End(Not run)
## Not run:
##D tmp &lt;- mutate(df, s1 = instr(df$Sex, &quot;m&quot;), s2 = instr(df$Sex, &quot;M&quot;),
##D s3 = locate(&quot;m&quot;, df$Sex), s4 = locate(&quot;m&quot;, df$Sex, pos = 4))
##D head(tmp)
## End(Not run)
## Not run:
##D tmp &lt;- mutate(df, v1 = df$Freq/3)
##D head(select(tmp, format_number(tmp$v1, 0), format_number(tmp$v1, 2),
##D format_string(&quot;%4.2f %s&quot;, tmp$v1, tmp$Sex)), 10)
## End(Not run)
## Not run:
##D # concatenate strings
##D tmp &lt;- mutate(df, s1 = concat_ws(&quot;_&quot;, df$Class, df$Sex),
##D s2 = concat_ws(&quot;+&quot;, df$Class, df$Sex, df$Age, df$Survived))
##D head(tmp)
## End(Not run)
## Not run:
##D tmp &lt;- mutate(df, s1 = regexp_extract(df$Class, &quot;(\\d+)\\w+&quot;, 1),
##D s2 = regexp_extract(df$Sex, &quot;^(\\w)\\w+&quot;, 1),
##D s3 = regexp_replace(df$Class, &quot;\\D+&quot;, &quot;&quot;),
##D s4 = substring_index(df$Sex, &quot;a&quot;, 1),
##D s5 = substring_index(df$Sex, &quot;a&quot;, -1),
##D s6 = translate(df$Sex, &quot;ale&quot;, &quot;&quot;),
##D s7 = translate(df$Sex, &quot;a&quot;, &quot;-&quot;))
##D head(tmp)
## End(Not run)
## Not run:
##D head(select(df, split_string(df$Sex, &quot;a&quot;)))
##D head(select(df, split_string(df$Class, &quot;\\d&quot;)))
##D # This is equivalent to the following SQL expression
##D head(selectExpr(df, &quot;split(Class, &#39;\\\\d&#39;)&quot;))
## End(Not run)
## Not run:
##D head(select(df, repeat_string(df$Class, 3)))
##D # This is equivalent to the following SQL expression
##D head(selectExpr(df, &quot;repeat(Class, 3)&quot;))
## End(Not run)
</code></pre>
<hr /><div style="text-align: center;">[Package <em>SparkR</em> version 2.4.7 <a href="00Index.html">Index</a>]</div>
</body></html>