dev/generate_srs_registry.py - spark - Git at Google

 #!/usr/bin/env python3

 #
 # Licensed to the Apache Software Foundation (ASF) under one or more
 # contributor license agreements.  See the NOTICE file distributed with
 # this work for additional information regarding copyright ownership.
 # The ASF licenses this file to You under the Apache License, Version 2.0
 # (the "License"); you may not use this file except in compliance with
 # the License.  You may obtain a copy of the License at
 #
 #    http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #

 """
 Generate the Spatial Reference System (SRS) registry for Apache Spark.

 Downloads CRS definitions from the PROJ (Cartographic Projections and
 Coordinate Transformations Library) GitHub repository and generates a CSV
 file used by Spark for geospatial types on both the JVM and Python sides.

 PROJ is a C/C++ library (https://proj.org/) that maintains the authoritative
 EPSG and ESRI CRS databases. This script extracts SRID metadata from PROJ's
 SQL source files, which contain easily parseable plain-text SRS information.

 The script produces entries from the following PROJ SQL files:
   - geodetic_crs.sql     (EPSG geodetic CRS: geographic, geocentric, etc.)
   - projected_crs.sql    (EPSG projected CRS)
   - compound_crs.sql     (EPSG compound CRS)
   - vertical_crs.sql     (EPSG vertical CRS)
   - engineering_crs.sql  (EPSG engineering CRS)
   - esri.sql             (ESRI geodetic, projected, compound, vertical, engineering CRS)

 Prerequisites:
     Python 3.9+ (no third-party packages required)

 Usage:
     # Generate from the default PROJ version:
     python dev/generate_srs_registry.py

     # Generate from a specific PROJ version:
     python dev/generate_srs_registry.py --proj-version 9.7.1

     # Verify the generated files:
     wc -l sql/api/src/main/resources/org/apache/spark/sql/srs_registry.csv
     wc -l python/pyspark/sql/srs_registry.csv

 Upgrade workflow:
     1. Update `DEFAULT_PROJ_VERSION` to the new PROJ release tag.
     2. Run this script using `python dev/generate_srs_registry.py`.
     3. Review the diff to see which SRIDs were added or removed.
 """

 import argparse
 import csv
 import io
 import os
 import re
 import sys
 import urllib.request

 # Default PROJ version to download SQL files from.
 DEFAULT_PROJ_VERSION = "9.7.1"
 # PLEASE ENSURE THIS IS UPDATED TO A VALID PROJ VERSION TAG WHEN UPGRADING!

 # Default timeout (in seconds) for downloading SQL files from GitHub.
 DEFAULT_DOWNLOAD_TIMEOUT_SECS = 30

 # URL template for raw SQL files from the PROJ GitHub repository.
 PROJ_RAW_URL = "https://raw.githubusercontent.com/OSGeo/PROJ/{version}/data/sql/{filename}"

 # PROJ SQL files to download. EPSG CRS definitions are spread across
 # dedicated files, while ESRI definitions are all in a single file.
 PROJ_SQL_FILES = [
     "geodetic_crs.sql",
     "projected_crs.sql",
     "compound_crs.sql",
     "vertical_crs.sql",
     "engineering_crs.sql",
     "esri.sql",
 ]

 # Output paths for the generated CSV, relative to the Spark repo root.
 JAVA_RESOURCE_PATH = os.path.join(
     "sql", "api", "src", "main", "resources", "org", "apache", "spark", "sql", "srs_registry.csv"
 )
 PYTHON_RESOURCE_PATH = os.path.join("python", "pyspark", "sql", "srs_registry.csv")


 def download_sql(version, filename, timeout=DEFAULT_DOWNLOAD_TIMEOUT_SECS):
     """Download a SQL file from the PROJ GitHub repository at a pinned version tag."""
     url = PROJ_RAW_URL.format(version=version, filename=filename)
     print(f"  Downloading {url}")
     try:
         with urllib.request.urlopen(url, timeout=timeout) as response:
             return response.read().decode("utf-8")
     except urllib.error.URLError as e:
         print(f"ERROR: Failed to download {url}: {e}", file=sys.stderr)
         if "CERTIFICATE_VERIFY_FAILED" in str(e):
             print(
                 "Hint: Run 'Install Certificates.command' from your Python "
                 "installation, or set the SSL_CERT_FILE environment variable.",
                 file=sys.stderr,
             )
         print(f"Check that PROJ version '{version}' exists as a GitHub tag.", file=sys.stderr)
         sys.exit(1)


 def parse_sql_values(values_str):
     """
     Parse the comma-separated fields inside a SQL VALUES(...) clause.

     Handles SQL-quoted strings (single quotes with '' escape for literal
     apostrophes) and unquoted NULL / integer literals.

     Returns a list of string values, with NULL represented as None.
     """
     fields = []
     i = 0
     n = len(values_str)
     while i < n:
         if values_str[i] in (" ", "\t"):
             i += 1
             continue
         if values_str[i] == "'":
             # Quoted string: scan until closing quote ('' is an escaped quote).
             i += 1
             buf = []
             while i < n:
                 if values_str[i] == "'" and i + 1 < n and values_str[i + 1] == "'":
                     buf.append("'")
                     i += 2
                 elif values_str[i] == "'":
                     i += 1
                     break
                 else:
                     buf.append(values_str[i])
                     i += 1
             fields.append("".join(buf))
         elif values_str[i : i + 4].upper() == "NULL":
             fields.append(None)
             i += 4
         else:
             # Unquoted literal (integer, etc.)
             j = i
             while j < n and values_str[j] not in (",", ")"):
                 j += 1
             fields.append(values_str[i:j].strip())
             i = j
         # Skip comma separator.
         while i < n and values_str[i] in (",", " ", "\t"):
             if values_str[i] == ",":
                 i += 1
                 break
             i += 1
     return fields


 def parse_geodetic_crs(sql_content):
     """
     Parse geodetic_crs INSERT statements from SQL content.

     The `type` field (position 4) determines whether the CRS is geographic:
         'geographic 2D', 'geographic 3D' -> geographic
         'geocentric', 'other'            -> non-geographic

     Deprecated entries are included for completeness.
     Entries with non-numeric codes are skipped.

     Returns a list of (srid, string_id, is_geographic) tuples.
     """
     results = []
     pattern = re.compile(r'INSERT INTO "geodetic_crs" VALUES\((.+)\);', re.IGNORECASE)
     for line in sql_content.splitlines():
         match = pattern.search(line)
         if not match:
             continue
         fields = parse_sql_values(match.group(1))
         if len(fields) < 5:
             continue
         auth_name = fields[0]
         code = fields[1]
         crs_type = fields[4]
         try:
             srid = int(code)
         except (ValueError, TypeError):
             continue
         is_geographic = crs_type is not None and crs_type.startswith("geographic")
         string_id = f"{auth_name}:{code}"
         results.append((srid, string_id, is_geographic))
     return results


 def parse_simple_crs(sql_content, table_name):
     """
     Parse INSERT statements for CRS tables that are always non-geographic.

     Works for projected_crs, compound_crs, vertical_crs, and engineering_crs tables.
     Fields used: auth_name (0), code (1).

     Deprecated entries are included for completeness.
     Entries with non-numeric codes are skipped.

     Returns a list of (srid, string_id, is_geographic=False) tuples.
     """
     results = []
     pattern = re.compile(rf'INSERT INTO "{table_name}" VALUES\((.+)\);', re.IGNORECASE)
     for line in sql_content.splitlines():
         match = pattern.search(line)
         if not match:
             continue
         fields = parse_sql_values(match.group(1))
         if len(fields) < 2:
             continue
         auth_name = fields[0]
         code = fields[1]
         try:
             srid = int(code)
         except (ValueError, TypeError):
             continue
         string_id = f"{auth_name}:{code}"
         results.append((srid, string_id, False))
     return results


 def parse_all_crs_from_sql(sql_content):
     """
     Parse all CRS types (geodetic, projected, compound, vertical, engineering)
     from a single SQL file. Used for multi-table files like esri.sql.
     """
     entries = []
     entries.extend(parse_geodetic_crs(sql_content))
     for table in ["projected_crs", "compound_crs", "vertical_crs", "engineering_crs"]:
         entries.extend(parse_simple_crs(sql_content, table))
     return entries


 def write_csv(entries, proj_version, output_path):
     """Write SRS entries to a CSV file with a metadata header."""
     os.makedirs(os.path.dirname(output_path), exist_ok=True)
     buf = io.StringIO()
     writer = csv.writer(buf)
     writer.writerow(["srid", "string_id", "is_geographic"])
     for srid, string_id, is_geographic in sorted(entries):
         writer.writerow([srid, string_id, str(is_geographic).lower()])
     csv_data = buf.getvalue().replace("\r\n", "\n")
     with open(output_path, "w", encoding="utf-8") as f:
         f.write(
             f"# Generated by dev/generate_srs_registry.py from PROJ {proj_version}\n"
             f"# Source: https://github.com/OSGeo/PROJ/tree/{proj_version}/data/sql\n"
             f"# Do not edit manually. Re-run the script to regenerate.\n"
         )
         f.write(csv_data)


 def main():
     parser = argparse.ArgumentParser(
         description="Generate the SRS registry for Apache Spark from PROJ data."
     )
     parser.add_argument(
         "--proj-version",
         default=DEFAULT_PROJ_VERSION,
         help=f"PROJ release tag to download from (default: {DEFAULT_PROJ_VERSION})",
     )
     parser.add_argument(
         "--repo-root",
         default=None,
         help="Path to the Spark repository root (auto-detected if not set)",
     )
     parser.add_argument(
         "--timeout",
         type=int,
         default=DEFAULT_DOWNLOAD_TIMEOUT_SECS,
         help=f"Download timeout in seconds (default: {DEFAULT_DOWNLOAD_TIMEOUT_SECS})",
     )
     args = parser.parse_args()

     # Auto-detect repo root: this script lives in dev/ under the repo root.
     if args.repo_root:
         repo_root = args.repo_root
     else:
         repo_root = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))

     print(f"Spark repo root: {repo_root}")
     print(f"PROJ version: {args.proj_version}")
     print()

     # Download PROJ SQL files.
     print("Downloading PROJ SQL files...")
     sql_files = {}
     for filename in PROJ_SQL_FILES:
         sql_files[filename] = download_sql(args.proj_version, filename, args.timeout)
     print()

     # Parse CRS entries from EPSG-specific files.
     print("Parsing CRS entries...")
     all_entries = []

     geodetic = parse_geodetic_crs(sql_files["geodetic_crs.sql"])
     print(f"  geodetic_crs.sql:  {len(geodetic)} entries")
     all_entries.extend(geodetic)

     projected = parse_simple_crs(sql_files["projected_crs.sql"], "projected_crs")
     print(f"  projected_crs.sql: {len(projected)} entries")
     all_entries.extend(projected)

     compound = parse_simple_crs(sql_files["compound_crs.sql"], "compound_crs")
     print(f"  compound_crs.sql:  {len(compound)} entries")
     all_entries.extend(compound)

     vertical = parse_simple_crs(sql_files["vertical_crs.sql"], "vertical_crs")
     print(f"  vertical_crs.sql:  {len(vertical)} entries")
     all_entries.extend(vertical)

     engineering = parse_simple_crs(sql_files["engineering_crs.sql"], "engineering_crs")
     print(f"  engineering_crs.sql: {len(engineering)} entries")
     all_entries.extend(engineering)

     # Parse ESRI entries from the combined esri.sql file.
     esri = parse_all_crs_from_sql(sql_files["esri.sql"])
     print(f"  esri.sql:          {len(esri)} entries")
     all_entries.extend(esri)

     print()

     # Deduplicate: when the same SRID appears in multiple tables or authorities,
     # keep the first occurrence. Since EPSG files are parsed before ESRI, this
     # gives EPSG precedence over ESRI for conflicting SRIDs.
     seen = set()
     deduped = []
     duplicates = 0
     for entry in all_entries:
         if entry[0] not in seen:
             deduped.append(entry)
             seen.add(entry[0])
         else:
             duplicates += 1
     if duplicates:
         print(f"  Removed {duplicates} duplicate SRID(s)")
     all_entries = deduped

     # Count entries by authority.
     authority_counts = {}
     for _, string_id, _ in all_entries:
         auth = string_id.split(":")[0]
         authority_counts[auth] = authority_counts.get(auth, 0) + 1

     n_geographic = sum(1 for _, _, g in all_entries if g)
     n_nongeographic = len(all_entries) - n_geographic
     print()
     print(
         f"  Total: {len(all_entries)} entries "
         f"({n_geographic} geographic, {n_nongeographic} non-geographic)"
     )
     print("  Breakdown by authority:")
     for auth in sorted(authority_counts):
         print(f"    {auth}: {authority_counts[auth]}")
     print()

     # Write CSV to both Java and Python resource directories.
     java_path = os.path.join(repo_root, JAVA_RESOURCE_PATH)
     python_path = os.path.join(repo_root, PYTHON_RESOURCE_PATH)

     print("Writing CSV files...")
     write_csv(all_entries, args.proj_version, java_path)
     print(f"  {java_path}")
     write_csv(all_entries, args.proj_version, python_path)
     print(f"  {python_path}")
     print()

     print("Done. Verify with:")
     print(f"  wc -l {JAVA_RESOURCE_PATH}")
     print(f"  git diff {JAVA_RESOURCE_PATH}")


 if __name__ == "__main__":
     main()
	#!/usr/bin/env python3

	#
	# Licensed to the Apache Software Foundation (ASF) under one or more
	# contributor license agreements. See the NOTICE file distributed with
	# this work for additional information regarding copyright ownership.
	# The ASF licenses this file to You under the Apache License, Version 2.0
	# (the "License"); you may not use this file except in compliance with
	# the License. You may obtain a copy of the License at
	#
	# http://www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing, software
	# distributed under the License is distributed on an "AS IS" BASIS,
	# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	# See the License for the specific language governing permissions and
	# limitations under the License.
	#

	"""
	Generate the Spatial Reference System (SRS) registry for Apache Spark.

	Downloads CRS definitions from the PROJ (Cartographic Projections and
	Coordinate Transformations Library) GitHub repository and generates a CSV
	file used by Spark for geospatial types on both the JVM and Python sides.

	PROJ is a C/C++ library (https://proj.org/) that maintains the authoritative
	EPSG and ESRI CRS databases. This script extracts SRID metadata from PROJ's
	SQL source files, which contain easily parseable plain-text SRS information.

	The script produces entries from the following PROJ SQL files:
	- geodetic_crs.sql (EPSG geodetic CRS: geographic, geocentric, etc.)
	- projected_crs.sql (EPSG projected CRS)
	- compound_crs.sql (EPSG compound CRS)
	- vertical_crs.sql (EPSG vertical CRS)
	- engineering_crs.sql (EPSG engineering CRS)
	- esri.sql (ESRI geodetic, projected, compound, vertical, engineering CRS)

	Prerequisites:
	Python 3.9+ (no third-party packages required)

	Usage:
	# Generate from the default PROJ version:
	python dev/generate_srs_registry.py

	# Generate from a specific PROJ version:
	python dev/generate_srs_registry.py --proj-version 9.7.1

	# Verify the generated files:
	wc -l sql/api/src/main/resources/org/apache/spark/sql/srs_registry.csv
	wc -l python/pyspark/sql/srs_registry.csv

	Upgrade workflow:
	1. Update `DEFAULT_PROJ_VERSION` to the new PROJ release tag.
	2. Run this script using `python dev/generate_srs_registry.py`.
	3. Review the diff to see which SRIDs were added or removed.
	"""

	import argparse
	import csv
	import io
	import os
	import re
	import sys
	import urllib.request

	# Default PROJ version to download SQL files from.
	DEFAULT_PROJ_VERSION = "9.7.1"
	# PLEASE ENSURE THIS IS UPDATED TO A VALID PROJ VERSION TAG WHEN UPGRADING!

	# Default timeout (in seconds) for downloading SQL files from GitHub.
	DEFAULT_DOWNLOAD_TIMEOUT_SECS = 30

	# URL template for raw SQL files from the PROJ GitHub repository.
	PROJ_RAW_URL = "https://raw.githubusercontent.com/OSGeo/PROJ/{version}/data/sql/{filename}"

	# PROJ SQL files to download. EPSG CRS definitions are spread across
	# dedicated files, while ESRI definitions are all in a single file.
	PROJ_SQL_FILES = [
	"geodetic_crs.sql",
	"projected_crs.sql",
	"compound_crs.sql",
	"vertical_crs.sql",
	"engineering_crs.sql",
	"esri.sql",
	]

	# Output paths for the generated CSV, relative to the Spark repo root.
	JAVA_RESOURCE_PATH = os.path.join(
	"sql", "api", "src", "main", "resources", "org", "apache", "spark", "sql", "srs_registry.csv"
	)
	PYTHON_RESOURCE_PATH = os.path.join("python", "pyspark", "sql", "srs_registry.csv")


	def download_sql(version, filename, timeout=DEFAULT_DOWNLOAD_TIMEOUT_SECS):
	"""Download a SQL file from the PROJ GitHub repository at a pinned version tag."""
	url = PROJ_RAW_URL.format(version=version, filename=filename)
	print(f" Downloading {url}")
	try:
	with urllib.request.urlopen(url, timeout=timeout) as response:
	return response.read().decode("utf-8")
	except urllib.error.URLError as e:
	print(f"ERROR: Failed to download {url}: {e}", file=sys.stderr)
	if "CERTIFICATE_VERIFY_FAILED" in str(e):
	print(
	"Hint: Run 'Install Certificates.command' from your Python "
	"installation, or set the SSL_CERT_FILE environment variable.",
	file=sys.stderr,
	)
	print(f"Check that PROJ version '{version}' exists as a GitHub tag.", file=sys.stderr)
	sys.exit(1)


	def parse_sql_values(values_str):
	"""
	Parse the comma-separated fields inside a SQL VALUES(...) clause.

	Handles SQL-quoted strings (single quotes with '' escape for literal
	apostrophes) and unquoted NULL / integer literals.

	Returns a list of string values, with NULL represented as None.
	"""
	fields = []
	i = 0
	n = len(values_str)
	while i < n:
	if values_str[i] in (" ", "\t"):
	i += 1
	continue
	if values_str[i] == "'":
	# Quoted string: scan until closing quote ('' is an escaped quote).
	i += 1
	buf = []
	while i < n:
	if values_str[i] == "'" and i + 1 < n and values_str[i + 1] == "'":
	buf.append("'")
	i += 2
	elif values_str[i] == "'":
	i += 1
	break
	else:
	buf.append(values_str[i])
	i += 1
	fields.append("".join(buf))
	elif values_str[i : i + 4].upper() == "NULL":
	fields.append(None)
	i += 4
	else:
	# Unquoted literal (integer, etc.)
	j = i
	while j < n and values_str[j] not in (",", ")"):
	j += 1
	fields.append(values_str[i:j].strip())
	i = j
	# Skip comma separator.
	while i < n and values_str[i] in (",", " ", "\t"):
	if values_str[i] == ",":
	i += 1
	break
	i += 1
	return fields


	def parse_geodetic_crs(sql_content):
	"""
	Parse geodetic_crs INSERT statements from SQL content.

	The `type` field (position 4) determines whether the CRS is geographic:
	'geographic 2D', 'geographic 3D' -> geographic
	'geocentric', 'other' -> non-geographic

	Deprecated entries are included for completeness.
	Entries with non-numeric codes are skipped.

	Returns a list of (srid, string_id, is_geographic) tuples.
	"""
	results = []
	pattern = re.compile(r'INSERT INTO "geodetic_crs" VALUES\((.+)\);', re.IGNORECASE)
	for line in sql_content.splitlines():
	match = pattern.search(line)
	if not match:
	continue
	fields = parse_sql_values(match.group(1))
	if len(fields) < 5:
	continue
	auth_name = fields[0]
	code = fields[1]
	crs_type = fields[4]
	try:
	srid = int(code)
	except (ValueError, TypeError):
	continue
	is_geographic = crs_type is not None and crs_type.startswith("geographic")
	string_id = f"{auth_name}:{code}"
	results.append((srid, string_id, is_geographic))
	return results


	def parse_simple_crs(sql_content, table_name):
	"""
	Parse INSERT statements for CRS tables that are always non-geographic.

	Works for projected_crs, compound_crs, vertical_crs, and engineering_crs tables.
	Fields used: auth_name (0), code (1).

	Deprecated entries are included for completeness.
	Entries with non-numeric codes are skipped.

	Returns a list of (srid, string_id, is_geographic=False) tuples.
	"""
	results = []
	pattern = re.compile(rf'INSERT INTO "{table_name}" VALUES\((.+)\);', re.IGNORECASE)
	for line in sql_content.splitlines():
	match = pattern.search(line)
	if not match:
	continue
	fields = parse_sql_values(match.group(1))
	if len(fields) < 2:
	continue
	auth_name = fields[0]
	code = fields[1]
	try:
	srid = int(code)
	except (ValueError, TypeError):
	continue
	string_id = f"{auth_name}:{code}"
	results.append((srid, string_id, False))
	return results


	def parse_all_crs_from_sql(sql_content):
	"""
	Parse all CRS types (geodetic, projected, compound, vertical, engineering)
	from a single SQL file. Used for multi-table files like esri.sql.
	"""
	entries = []
	entries.extend(parse_geodetic_crs(sql_content))
	for table in ["projected_crs", "compound_crs", "vertical_crs", "engineering_crs"]:
	entries.extend(parse_simple_crs(sql_content, table))
	return entries


	def write_csv(entries, proj_version, output_path):
	"""Write SRS entries to a CSV file with a metadata header."""
	os.makedirs(os.path.dirname(output_path), exist_ok=True)
	buf = io.StringIO()
	writer = csv.writer(buf)
	writer.writerow(["srid", "string_id", "is_geographic"])
	for srid, string_id, is_geographic in sorted(entries):
	writer.writerow([srid, string_id, str(is_geographic).lower()])
	csv_data = buf.getvalue().replace("\r\n", "\n")
	with open(output_path, "w", encoding="utf-8") as f:
	f.write(
	f"# Generated by dev/generate_srs_registry.py from PROJ {proj_version}\n"
	f"# Source: https://github.com/OSGeo/PROJ/tree/{proj_version}/data/sql\n"
	f"# Do not edit manually. Re-run the script to regenerate.\n"
	)
	f.write(csv_data)


	def main():
	parser = argparse.ArgumentParser(
	description="Generate the SRS registry for Apache Spark from PROJ data."
	)
	parser.add_argument(
	"--proj-version",
	default=DEFAULT_PROJ_VERSION,
	help=f"PROJ release tag to download from (default: {DEFAULT_PROJ_VERSION})",
	)
	parser.add_argument(
	"--repo-root",
	default=None,
	help="Path to the Spark repository root (auto-detected if not set)",
	)
	parser.add_argument(
	"--timeout",
	type=int,
	default=DEFAULT_DOWNLOAD_TIMEOUT_SECS,
	help=f"Download timeout in seconds (default: {DEFAULT_DOWNLOAD_TIMEOUT_SECS})",
	)
	args = parser.parse_args()

	# Auto-detect repo root: this script lives in dev/ under the repo root.
	if args.repo_root:
	repo_root = args.repo_root
	else:
	repo_root = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))

	print(f"Spark repo root: {repo_root}")
	print(f"PROJ version: {args.proj_version}")
	print()

	# Download PROJ SQL files.
	print("Downloading PROJ SQL files...")
	sql_files = {}
	for filename in PROJ_SQL_FILES:
	sql_files[filename] = download_sql(args.proj_version, filename, args.timeout)
	print()

	# Parse CRS entries from EPSG-specific files.
	print("Parsing CRS entries...")
	all_entries = []

	geodetic = parse_geodetic_crs(sql_files["geodetic_crs.sql"])
	print(f" geodetic_crs.sql: {len(geodetic)} entries")
	all_entries.extend(geodetic)

	projected = parse_simple_crs(sql_files["projected_crs.sql"], "projected_crs")
	print(f" projected_crs.sql: {len(projected)} entries")
	all_entries.extend(projected)

	compound = parse_simple_crs(sql_files["compound_crs.sql"], "compound_crs")
	print(f" compound_crs.sql: {len(compound)} entries")
	all_entries.extend(compound)

	vertical = parse_simple_crs(sql_files["vertical_crs.sql"], "vertical_crs")
	print(f" vertical_crs.sql: {len(vertical)} entries")
	all_entries.extend(vertical)

	engineering = parse_simple_crs(sql_files["engineering_crs.sql"], "engineering_crs")
	print(f" engineering_crs.sql: {len(engineering)} entries")
	all_entries.extend(engineering)

	# Parse ESRI entries from the combined esri.sql file.
	esri = parse_all_crs_from_sql(sql_files["esri.sql"])
	print(f" esri.sql: {len(esri)} entries")
	all_entries.extend(esri)

	print()

	# Deduplicate: when the same SRID appears in multiple tables or authorities,
	# keep the first occurrence. Since EPSG files are parsed before ESRI, this
	# gives EPSG precedence over ESRI for conflicting SRIDs.
	seen = set()
	deduped = []
	duplicates = 0
	for entry in all_entries:
	if entry[0] not in seen:
	deduped.append(entry)
	seen.add(entry[0])
	else:
	duplicates += 1
	if duplicates:
	print(f" Removed {duplicates} duplicate SRID(s)")
	all_entries = deduped

	# Count entries by authority.
	authority_counts = {}
	for _, string_id, _ in all_entries:
	auth = string_id.split(":")[0]
	authority_counts[auth] = authority_counts.get(auth, 0) + 1

	n_geographic = sum(1 for _, _, g in all_entries if g)
	n_nongeographic = len(all_entries) - n_geographic
	print()
	print(
	f" Total: {len(all_entries)} entries "
	f"({n_geographic} geographic, {n_nongeographic} non-geographic)"
	)
	print(" Breakdown by authority:")
	for auth in sorted(authority_counts):
	print(f" {auth}: {authority_counts[auth]}")
	print()

	# Write CSV to both Java and Python resource directories.
	java_path = os.path.join(repo_root, JAVA_RESOURCE_PATH)
	python_path = os.path.join(repo_root, PYTHON_RESOURCE_PATH)

	print("Writing CSV files...")
	write_csv(all_entries, args.proj_version, java_path)
	print(f" {java_path}")
	write_csv(all_entries, args.proj_version, python_path)
	print(f" {python_path}")
	print()

	print("Done. Verify with:")
	print(f" wc -l {JAVA_RESOURCE_PATH}")
	print(f" git diff {JAVA_RESOURCE_PATH}")


	if __name__ == "__main__":
	main()