#!/usr/bin/env python3
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# This script generates the llms.txt file for the Apache Spark documentation.
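#
# Example usage (a sketch; the script file name is an assumption here, while the
# flags match the argparse options defined in main() below):
#   python3 generate_llms_txt.py
#   python3 generate_llms_txt.py --version 4.0.0 --output llms.txt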
import argparse
import sys
from pathlib import Path


def generate_llms_txt(docs_path: Path, output_path: Path, version: str = "latest") -> None:
"""
Generate the llms.txt file for Apache Spark documentation with hardcoded categories.
"""
content = []
content.append("# Apache Spark")
content.append("")
content.append(
"> Apache Sparkā„¢ is a unified analytics engine for large-scale data processing. "
"It provides high-level APIs in Java, Scala, Python, and R, and an optimized engine "
"that supports general execution graphs. It also supports a rich set of higher-level "
"tools including Spark SQL for SQL and structured data processing, MLlib for machine "
"learning, GraphX for graph processing, and Structured Streaming for incremental "
"computation and stream processing."
)
content.append("")
doc_home_url = f"https://spark.apache.org/docs/{version}/"
content.append(f"Documentation home: {doc_home_url}")
content.append("")
content.append("## Programming Guides")
content.append("")
programming_guides = [
("Quick Start", f"https://spark.apache.org/docs/{version}/quick-start.html"),
(
"RDD Programming Guide",
f"https://spark.apache.org/docs/{version}/rdd-programming-guide.html",
),
(
"Spark SQL, Datasets, and DataFrames",
f"https://spark.apache.org/docs/{version}/sql-programming-guide.html",
),
(
"Structured Streaming",
f"https://spark.apache.org/docs/{version}/structured-streaming-programming-guide.html",
),
(
"Spark Streaming",
f"https://spark.apache.org/docs/{version}/streaming-programming-guide.html",
),
("MLlib", f"https://spark.apache.org/docs/{version}/ml-guide.html"),
("GraphX", f"https://spark.apache.org/docs/{version}/graphx-programming-guide.html"),
("SparkR", f"https://spark.apache.org/docs/{version}/sparkr.html"),
(
"PySpark",
f"https://spark.apache.org/docs/{version}/api/python/getting_started/index.html",
),
(
"Spark SQL CLI",
f"https://spark.apache.org/docs/{version}/"
f"sql-distributed-sql-engine-spark-sql-cli.html",
),
]
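    # Each entry renders as a markdown list item, e.g.
    # "- [Quick Start](https://spark.apache.org/docs/latest/quick-start.html)".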
for title, url in programming_guides:
content.append(f"- [{title}]({url})")
content.append("")
content.append("## API Docs")
content.append("")
# TODO: Update API docs to point to their own llms.txt files once available
# e.g., https://spark.apache.org/docs/{version}/api/python/llms.txt
api_docs = [
("Spark Python API", f"https://spark.apache.org/docs/{version}/api/python/index.html"),
(
"Spark Scala API",
f"https://spark.apache.org/docs/{version}/api/scala/org/apache/spark/index.html",
),
("Spark Java API", f"https://spark.apache.org/docs/{version}/api/java/index.html"),
("Spark R API", f"https://spark.apache.org/docs/{version}/api/R/index.html"),
(
"Spark SQL Built-in Functions",
f"https://spark.apache.org/docs/{version}/api/sql/index.html",
),
]
for title, url in api_docs:
content.append(f"- [{title}]({url})")
content.append("")
content.append("## Deployment Guides")
content.append("")
deployment_guides = [
("Cluster Overview", f"https://spark.apache.org/docs/{version}/cluster-overview.html"),
(
"Submitting Applications",
f"https://spark.apache.org/docs/{version}/submitting-applications.html",
),
(
"Standalone Deploy Mode",
f"https://spark.apache.org/docs/{version}/spark-standalone.html",
),
("YARN", f"https://spark.apache.org/docs/{version}/running-on-yarn.html"),
("Kubernetes", f"https://spark.apache.org/docs/{version}/running-on-kubernetes.html"),
]
for title, url in deployment_guides:
content.append(f"- [{title}]({url})")
content.append("")
content.append("## Other Documents")
content.append("")
other_docs = [
("Configuration", f"https://spark.apache.org/docs/{version}/configuration.html"),
("Monitoring", f"https://spark.apache.org/docs/{version}/monitoring.html"),
("Web UI", f"https://spark.apache.org/docs/{version}/web-ui.html"),
("Tuning Guide", f"https://spark.apache.org/docs/{version}/tuning.html"),
("Job Scheduling", f"https://spark.apache.org/docs/{version}/job-scheduling.html"),
("Security", f"https://spark.apache.org/docs/{version}/security.html"),
(
"Hardware Provisioning",
f"https://spark.apache.org/docs/{version}/hardware-provisioning.html",
),
(
"Cloud Infrastructures",
f"https://spark.apache.org/docs/{version}/cloud-integration.html",
),
("Migration Guide", f"https://spark.apache.org/docs/{version}/migration-guide.html"),
("Building Spark", f"https://spark.apache.org/docs/{version}/building-spark.html"),
]
for title, url in other_docs:
content.append(f"- [{title}]({url})")
content.append("")
content.append("## External Resources")
content.append("")
content.append("- [Apache Spark Home](https://spark.apache.org/)")
content.append("- [Downloads](https://spark.apache.org/downloads.html)")
content.append("- [GitHub Repository](https://github.com/apache/spark)")
content.append("- [Issue Tracker (JIRA)](https://issues.apache.org/jira/projects/SPARK)")
content.append("- [Mailing Lists](https://spark.apache.org/mailing-lists.html)")
content.append("- [Community](https://spark.apache.org/community.html)")
content.append("- [Contributing](https://spark.apache.org/contributing.html)")
content.append("")
with open(output_path, "w", encoding="utf-8") as f:
f.write("\n".join(content))
print(f"Generated {output_path}")
total_docs = len(programming_guides) + len(api_docs) + len(deployment_guides) + len(other_docs)
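    # Keep this in sync with the number of "## ..." section headings emitted above.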
sections_count = 5
print(f"Total documentation pages indexed: {total_docs}")
print(f"Sections: {sections_count}")
def main():
parser = argparse.ArgumentParser(
description="Generate llms.txt file for Apache Spark documentation"
)
parser.add_argument(
"--docs-path", type=str, default="docs", help="Path to the docs directory (default: docs)"
)
parser.add_argument(
"--output", type=str, default="llms.txt", help="Output file path (default: llms.txt)"
)
parser.add_argument(
"--version",
type=str,
default="latest",
help="Spark documentation version (default: latest)",
)
args = parser.parse_args()
# Convert to Path objects
script_dir = Path(__file__).parent
project_root = script_dir.parent.parent # Go up two levels from dev/create-release/
docs_path = project_root / args.docs_path
output_path = project_root / args.output
# Check if docs directory exists
if not docs_path.exists():
print(f"Error: Documentation directory '{docs_path}' does not exist")
sys.exit(1)
# Generate the llms.txt file
    generate_llms_txt(docs_path, output_path, args.version)


if __name__ == "__main__":
main()