blob: c07734e273051422ae6c45f45f6a74767261d571 [file] [log] [blame]
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import itertools
import os
import re
from collections import namedtuple
# To avoid adding a new direct dependency, we import markdown from within mkdocs.
from mkdocs.structure.pages import markdown
from pyspark.java_gateway import launch_gateway
# Immutable record holding the documentation fields of one SQL function.
ExpressionInfo = namedtuple(
    "ExpressionInfo", ["name", "usage", "examples", "group"])

# Function groups that this script generates documentation for; expression
# infos whose group is not listed here are ignored.
groups = {
    "agg_funcs",
    "array_funcs",
    "datetime_funcs",
    "json_funcs",
    "map_funcs",
    "window_funcs",
}
def _list_grouped_function_infos(jvm):
    """
    Lists the built-in function infos through the JVM and buckets them by group.

    Only infos whose group is contained in `groups` are kept. The `_FUNC_`
    placeholder in the usage (when a usage is present) and in the examples is
    replaced with the function name.

    Returns a list of `(group, infos)` tuples ordered by group, where `infos`
    is the list of `ExpressionInfo` of that group ordered by function name.
    """
    sql_utils = jvm.org.apache.spark.sql.api.python.PythonSQLUtils
    by_group = {}
    for jinfo in sql_utils.listBuiltinFunctionInfos():
        if jinfo.getGroup() not in groups:
            continue
        func_name = jinfo.getName()
        func_usage = jinfo.getUsage()
        if func_usage is not None:
            func_usage = func_usage.replace("_FUNC_", func_name)
        func_examples = jinfo.getExamples().replace("_FUNC_", func_name)
        wrapped = ExpressionInfo(
            name=func_name,
            usage=func_usage,
            examples=func_examples,
            group=jinfo.getGroup())
        by_group.setdefault(wrapped.group, []).append(wrapped)

    # Emit the groups in sorted order, each with its members sorted by name.
    grouped = []
    for group_name in sorted(by_group):
        members = sorted(by_group[group_name], key=lambda x: x.name)
        grouped.append((group_name, members))
    return grouped
# TODO(SPARK-31499): Needs to add a column to describe arguments and their types
def _make_pretty_usage(infos):
    """
    Makes the usage description pretty and returns a formatted HTML table.

    `infos` is an iterable of objects exposing `name` and `usage` attributes.
    Each `(signature, description)` pair found in a usage string becomes one
    table row. Entries whose `usage` is None contribute no rows.

    Expected input:

        func(*) - ...

        func(expr[, expr...]) - ...

    Expected output:
    <table class="table">
      <thead>
        <tr>
          <th style="width:25%">Function</th>
          <th>Description</th>
        </tr>
      </thead>
      <tbody>
        <tr>
          <td>func(*)</td>
          <td>...</td>
        </tr>
        <tr>
          <td>func(expr[, expr...])</td>
          <td>...</td>
        </tr>
      </tbody>
      ...
    </table>
    """
    result = [
        "<table class=\"table\">",
        " <thead>",
        " <tr>",
        " <th style=\"width:25%\">Function</th>",
        " <th>Description</th>",
        " </tr>",
        " </thead>",
        " <tbody>",
    ]

    for info in infos:
        # The usage may be missing; there is nothing to render in that case.
        if info.usage is None:
            continue

        # Extracts (signature, description) pairs from `info.usage`.
        # Expected formats are as follows;
        #  - `_FUNC_(...) - description`, or
        #  - `_FUNC_ - description`
        # The name is escaped so that names containing regex metacharacters
        # (e.g. `*` or `+`) are matched literally instead of breaking the
        # pattern.
        pattern = r"(%s.*) - " % re.escape(info.name)
        # With one capturing group, `re.split` yields
        # [prefix, sig1, desc1, sig2, desc2, ...]; the prefix is dropped and
        # the remainder is consumed two items at a time.
        usages = iter(re.split(pattern, info.usage.strip())[1:])
        for (sig, description) in zip(usages, usages):
            result.append(" <tr>")
            result.append(" <td>%s</td>" % sig)
            result.append(" <td>%s</td>" % description.strip())
            result.append(" </tr>")

    result.append(" </tbody>")
    result.append("</table>\n")
    return "\n".join(result)
def _make_pretty_examples(jspark, infos):
    """
    Makes the examples description pretty and returns a formatted string if `infos`
    has any `examples` starting with the example prefix. Otherwise, returns None.

    Every example query line is executed through `jspark.sql(...)` and the
    query followed by its shown result is appended to the output, which is
    finally rendered as a highlighted SQL code block via markdown.

    Expected input:

        Examples:
          > SELECT func(col)...;
           ...
          > SELECT func(col)...;
           ...

    Expected output:
    <div class="codehilite"><pre><span></span>
      <span class="c1">-- func</span>
      <span class="k">SELECT</span>
      ...
    </pre></div>
    """
    query_prefix = " > "
    sections = []
    for info in infos:
        if not info.examples.startswith("\n Examples:"):
            continue

        output = ["-- %s" % info.name]
        for line in info.examples.split("\n"):
            if not line.startswith(query_prefix):
                continue
            # Drop the prompt as a prefix. `str.lstrip(" > ")` would treat the
            # argument as a set of characters and could also eat leading `>`
            # characters that belong to the query itself.
            query = line[len(query_prefix):].lstrip(" ")
            print(" %s" % query)
            query_output = jspark.sql(query).showString(20, 20, False)
            output.append(query)
            output.append(query_output)
        sections.append("\n" + "\n".join(output))

    if not sections:
        # No function in this group ships examples.
        return None

    pretty_output = "".join(sections)
    return markdown.markdown(
        "```sql%s```" % pretty_output, extensions=['codehilite', 'fenced_code'])
def generate_functions_table_html(jvm, html_output_dir):
    """
    Writes one HTML usage table per function group under `html_output_dir`.

    The function infos are listed through `jvm`, and each group is written to
    `generated-<group>-table.html`, where underscores in the group name are
    replaced with hyphens.

    Expected output:
    <table class="table">
      <thead>
        <tr>
          <th style="width:25%">Function</th>
          <th>Description</th>
        </tr>
      </thead>
      <tbody>
        <tr>
          <td>func(*)</td>
          <td>...</td>
        </tr>
        <tr>
          <td>func(expr[, expr...])</td>
          <td>...</td>
        </tr>
      </tbody>
      ...
    </table>
    """
    for group_name, group_infos in _list_grouped_function_infos(jvm):
        table_markup = _make_pretty_usage(group_infos)
        file_suffix = group_name.replace("_", "-")
        target_path = "%s/generated-%s-table.html" % (html_output_dir, file_suffix)
        with open(target_path, 'w') as out_file:
            out_file.write(table_markup)
def generate_functions_examples_html(jvm, jspark, html_output_dir):
    """
    Writes one HTML examples file per function group under `html_output_dir`.

    The function infos are listed through `jvm` and their example queries are
    run with `jspark`. Each group that has examples is written to
    `generated-<group>-examples.html`, where underscores in the group name are
    replaced with hyphens; groups without examples produce no file.

    Expected output:
    <div class="codehilite"><pre><span></span>
      <span class="c1">-- func</span>
      <span class="k">SELECT</span>
      ...
    </pre></div>
    """
    print("Running SQL examples to generate formatted output.")
    for group_name, group_infos in _list_grouped_function_infos(jvm):
        rendered = _make_pretty_examples(jspark, group_infos)
        if rendered is None:
            continue
        file_suffix = group_name.replace("_", "-")
        target_path = "%s/generated-%s-examples.html" % (html_output_dir, file_suffix)
        with open(target_path, 'w') as out_file:
            out_file.write(rendered)
if __name__ == "__main__":
    # Start a JVM gateway and obtain (or create) a Spark session on the JVM side.
    jvm = launch_gateway().jvm
    jspark = jvm.org.apache.spark.sql.SparkSession.builder().getOrCreate()
    jspark.sparkContext().setLogLevel("ERROR")  # Make it less noisy.

    # The directory two levels above this file is treated as the Spark root.
    # `__file__` may be a bare relative filename when the script is launched
    # from its own directory, in which case the nested `dirname` calls would
    # both return "" and the output would land in the wrong place; resolve it
    # to an absolute path first.
    spark_root_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
    html_output_dir = os.path.join(spark_root_dir, "docs")

    generate_functions_table_html(jvm, html_output_dir)
    generate_functions_examples_html(jvm, jspark, html_output_dir)