| # |
| # Licensed to the Apache Software Foundation (ASF) under one or more |
| # contributor license agreements. See the NOTICE file distributed with |
| # this work for additional information regarding copyright ownership. |
| # The ASF licenses this file to You under the Apache License, Version 2.0 |
| # (the "License"); you may not use this file except in compliance with |
| # the License. You may obtain a copy of the License at |
| # |
| # http://www.apache.org/licenses/LICENSE-2.0 |
| # |
| # Unless required by applicable law or agreed to in writing, software |
| # distributed under the License is distributed on an "AS IS" BASIS, |
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| # See the License for the specific language governing permissions and |
| # limitations under the License. |
| # |
| |
| import itertools |
| import os |
| import re |
| from collections import namedtuple |
| |
| # To avoid adding a new direct dependency, we import markdown from within mkdocs. |
| from mkdocs.structure.pages import markdown |
| |
| from pyspark.java_gateway import launch_gateway |
| |
| |
# Lightweight wrapper for the fields this script reads off the JVM-side
# ExpressionInfo objects.
ExpressionInfo = namedtuple("ExpressionInfo", ["name", "usage", "examples", "group"])

# Only built-in functions belonging to these groups get documentation generated.
groups = {
    "agg_funcs",
    "array_funcs",
    "datetime_funcs",
    "json_funcs",
    "map_funcs",
    "window_funcs",
}
| |
| |
def _list_grouped_function_infos(jvm):
    """
    Returns a list of function information grouped by each group value via JVM.
    Sorts wrapped expression infos in each group by name and returns them.
    """

    py_sql_utils = jvm.org.apache.spark.sql.api.python.PythonSQLUtils
    wrapped_infos = []

    for jinfo in py_sql_utils.listBuiltinFunctionInfos():
        # Skip functions that do not belong to one of the documented groups.
        if jinfo.getGroup() not in groups:
            continue
        func_name = jinfo.getName()
        raw_usage = jinfo.getUsage()
        # `_FUNC_` is a placeholder the JVM side uses for the function name.
        expanded_usage = raw_usage.replace("_FUNC_", func_name) if raw_usage is not None \
            else raw_usage
        wrapped_infos.append(ExpressionInfo(
            name=func_name,
            usage=expanded_usage,
            examples=jinfo.getExamples().replace("_FUNC_", func_name),
            group=jinfo.getGroup()))

    # Group expression infos by their group value (groupby needs pre-sorted input),
    # then sort the infos within each group by function name.
    def by_group(info):
        return info.group

    grouped = itertools.groupby(sorted(wrapped_infos, key=by_group), key=by_group)
    return [(group, sorted(members, key=lambda info: info.name))
            for group, members in grouped]
| |
| |
| # TODO(SPARK-31499): Needs to add a column to describe arguments and their types |
| def _make_pretty_usage(infos): |
| """ |
| Makes the usage description pretty and returns a formatted string. |
| |
| Expected input: |
| |
| func(*) - ... |
| |
| func(expr[, expr...]) - ... |
| |
| Expected output: |
| <table class="table"> |
| <thead> |
| <tr> |
| <th style="width:25%">Function</th> |
| <th>Description</th> |
| </tr> |
| </thead> |
| <tbody> |
| <tr> |
| <td>func(*)</td> |
| <td>...</td> |
| </tr> |
| <tr> |
| <td>func(expr[, expr...])</td> |
| <td>...</td> |
| </tr> |
| </tbody> |
| ... |
| </table> |
| |
| """ |
| |
| result = [] |
| result.append("<table class=\"table\">") |
| result.append(" <thead>") |
| result.append(" <tr>") |
| result.append(" <th style=\"width:25%\">Function</th>") |
| result.append(" <th>Description</th>") |
| result.append(" </tr>") |
| result.append(" </thead>") |
| result.append(" <tbody>") |
| |
| for info in infos: |
| # Extracts (signature, description) pairs from `info.usage`. |
| # Expected formats are as follows; |
| # - `_FUNC_(...) - description`, or |
| # - `_FUNC_ - description` |
| usages = iter(re.split(r"(%s.*) - " % info.name, info.usage.strip())[1:]) |
| for (sig, description) in zip(usages, usages): |
| result.append(" <tr>") |
| result.append(" <td>%s</td>" % sig) |
| result.append(" <td>%s</td>" % description.strip()) |
| result.append(" </tr>") |
| |
| result.append(" </tbody>") |
| result.append("</table>\n") |
| return "\n".join(result) |
| |
| |
def _make_pretty_examples(jspark, infos):
    """
    Makes the examples description pretty and returns a formatted string if `infos`
    has any `examples` starting with the example prefix. Otherwise, returns None.

    Expected input:

        Examples:
          > SELECT func(col)...;
           ...
          > SELECT func(col)...;
           ...

    Expected output:
        <div class="codehilite"><pre><span></span>
          <span class="c1">-- func</span>
          <span class="k">SELECT</span>
          ...
        </pre></div>

    """

    pretty_output = ""
    for info in infos:
        if info.examples.startswith("\n    Examples:"):
            output = []
            output.append("-- %s" % info.name)
            query_examples = filter(lambda x: x.startswith("      > "), info.examples.split("\n"))
            for query_example in query_examples:
                # NOTE: `str.lstrip("      > ")` would strip a *character set*
                # (spaces and '>'), which can eat leading characters of the
                # query itself; strip the exact `> ` prefix instead.
                query = query_example.lstrip()[len("> "):]
                print("    %s" % query)
                # Runs the example query on the JVM-side SparkSession and
                # captures the textual result table (up to 20 rows, 20 chars).
                query_output = jspark.sql(query).showString(20, 20, False)
                output.append(query)
                output.append(query_output)
            pretty_output += "\n" + "\n".join(output)
    if pretty_output != "":
        # Renders the collected SQL snippets as highlighted HTML; returns None
        # implicitly when no function had runnable examples.
        return markdown.markdown(
            "```sql%s```" % pretty_output, extensions=['codehilite', 'fenced_code'])
| |
| |
def generate_functions_table_html(jvm, html_output_dir):
    """
    Generates a HTML file after listing the function information. The output file
    is created under `html_output_dir`.

    Expected output:

        <table class="table">
          <thead>
            <tr>
              <th style="width:25%">Function</th>
              <th>Description</th>
            </tr>
          </thead>
          <tbody>
            <tr>
              <td>func(*)</td>
              <td>...</td>
            </tr>
            <tr>
              <td>func(expr[, expr...])</td>
              <td>...</td>
            </tr>
          </tbody>
          ...
        </table>

    """
    for group_key, func_infos in _list_grouped_function_infos(jvm):
        # One output file per group, e.g. `generated-agg-funcs-table.html`.
        table_file = "%s/generated-%s-table.html" % (
            html_output_dir, group_key.replace("_", "-"))
        with open(table_file, 'w') as table_html:
            table_html.write(_make_pretty_usage(func_infos))
| |
| |
def generate_functions_examples_html(jvm, jspark, html_output_dir):
    """
    Generates a HTML file after listing and executing the function information.
    The output file is created under `html_output_dir`.

    Expected output:

        <div class="codehilite"><pre><span></span>
          <span class="c1">-- func</span>
          <span class="k">SELECT</span>
          ...
        </pre></div>

    """
    print("Running SQL examples to generate formatted output.")
    for group_key, func_infos in _list_grouped_function_infos(jvm):
        examples = _make_pretty_examples(jspark, func_infos)
        # Groups whose functions have no runnable examples produce no file.
        if examples is None:
            continue
        examples_file = "%s/generated-%s-examples.html" % (
            html_output_dir, group_key.replace("_", "-"))
        with open(examples_file, 'w') as examples_html:
            examples_html.write(examples)
| |
| |
if __name__ == "__main__":
    # Start a Py4J gateway to the JVM and obtain a JVM-side SparkSession;
    # all SQL examples are executed through it.
    jvm = launch_gateway().jvm
    jspark = jvm.org.apache.spark.sql.SparkSession.builder().getOrCreate()
    jspark.sparkContext().setLogLevel("ERROR")  # Make it less noisy.
    # Output goes into the `docs` directory next to this script's parent
    # directory (presumably the Spark repository root — TODO confirm layout).
    spark_root_dir = os.path.dirname(os.path.dirname(__file__))
    html_output_dir = os.path.join(spark_root_dir, "docs")
    generate_functions_table_html(jvm, html_output_dir)
    generate_functions_examples_html(jvm, jspark, html_output_dir)