blob: 2bdbe50da12b4d2f1bd2963dde9e881d1f68bc9c [file] [log] [blame]
#!/usr/bin/env python3
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
"""Merge incremental provider data into an existing registry dataset.

During incremental builds, extract_metadata.py produces JSON containing
only the updated provider(s). This script merges those partial results
into the full dataset previously downloaded from S3, so that the Eleventy
build still sees all providers.

Usage:
    python dev/registry/merge_registry_data.py \
        --existing-providers /tmp/existing-registry/providers.json \
        --existing-modules /tmp/existing-registry/modules.json \
        --new-providers dev/registry/providers.json \
        --new-modules dev/registry/modules.json \
        --output dev/registry/
"""
from __future__ import annotations
import argparse
import json
from pathlib import Path
from registry_contract_models import validate_modules_catalog, validate_providers_catalog
def _load_catalog_entries(path: Path, key: str, *, optional: bool) -> list[dict]:
    """Return the list stored under ``key`` in the JSON document at ``path``.

    When ``optional`` is true and ``path`` does not exist, an empty list is
    returned; otherwise a missing file raises ``FileNotFoundError``.
    """
    if optional and not path.exists():
        return []
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's locale encoding.
    return json.loads(path.read_text(encoding="utf-8"))[key]


def merge(
    existing_providers_path: Path,
    existing_modules_path: Path,
    new_providers_path: Path,
    new_modules_path: Path,
    output_dir: Path,
) -> None:
    """Replace entries for updated providers and write merged JSON files.

    Providers in the new file replace existing providers with the same
    ``id``; every existing module whose ``provider_id`` matches a replaced
    provider is dropped and the new modules are appended. The merged
    catalogs are passed through ``validate_providers_catalog`` and
    ``validate_modules_catalog`` and the returned payloads are written to
    ``providers.json`` and ``modules.json`` inside ``output_dir``.

    Args:
        existing_providers_path: Previously published providers catalog.
            Treated as empty when the file does not exist.
        existing_modules_path: Previously published modules catalog.
            Treated as empty when the file does not exist.
        new_providers_path: Catalog holding the updated provider(s). This
            file is required.
        new_modules_path: Catalog holding the modules of the updated
            provider(s). Treated as empty when the file does not exist.
        output_dir: Directory receiving the merged files; created
            (including parents) if needed.

    Raises:
        FileNotFoundError: If ``new_providers_path`` does not exist.
        json.JSONDecodeError: If any input file is not valid JSON.
        KeyError: If a catalog lacks its top-level ``providers``/``modules``
            key, or an entry lacks ``id``, ``name`` or ``provider_id``.
    """
    existing_providers = _load_catalog_entries(existing_providers_path, "providers", optional=True)
    new_providers = _load_catalog_entries(new_providers_path, "providers", optional=False)
    existing_modules = _load_catalog_entries(existing_modules_path, "modules", optional=True)
    new_modules = _load_catalog_entries(new_modules_path, "modules", optional=True)

    # IDs being replaced
    new_ids = {p["id"] for p in new_providers}

    # Merge providers: keep existing (except those being replaced), add new
    merged_providers = [p for p in existing_providers if p["id"] not in new_ids]
    merged_providers.extend(new_providers)
    merged_providers.sort(key=lambda p: p["name"].lower())

    # Merge modules: keep existing (except for replaced providers), add new
    merged_modules = [m for m in existing_modules if m["provider_id"] not in new_ids]
    merged_modules.extend(new_modules)

    # Sort modules by provider's last_updated date (newest first). The values
    # are compared as plain strings; a missing or null date is normalised to
    # "" so it sorts last instead of raising TypeError against real dates.
    date_by_provider = {p["id"]: p.get("last_updated") or "" for p in merged_providers}
    merged_modules.sort(key=lambda m: date_by_provider.get(m["provider_id"], ""), reverse=True)

    # Validate before touching the output directory so a rejected catalog
    # does not leave partially written files behind.
    providers_payload = validate_providers_catalog({"providers": merged_providers})
    modules_payload = validate_modules_catalog({"modules": merged_modules})

    # Write merged output
    output_dir.mkdir(parents=True, exist_ok=True)
    (output_dir / "providers.json").write_text(
        json.dumps(providers_payload, indent=2) + "\n", encoding="utf-8"
    )
    (output_dir / "modules.json").write_text(
        json.dumps(modules_payload, indent=2) + "\n", encoding="utf-8"
    )

    print(f"Merged {len(new_ids)} updated provider(s) into {len(merged_providers)} total providers")
    print(f"Total modules: {len(merged_modules)}")
def main() -> None:
    """Parse the command-line flags and run the merge with them."""
    cli = argparse.ArgumentParser(description="Merge incremental registry data into existing dataset")
    # Every flag is a mandatory filesystem path, so they share one definition.
    for flag in (
        "--existing-providers",
        "--existing-modules",
        "--new-providers",
        "--new-modules",
        "--output",
    ):
        cli.add_argument(flag, required=True, type=Path)
    opts = cli.parse_args()

    merge(
        existing_providers_path=opts.existing_providers,
        existing_modules_path=opts.existing_modules,
        new_providers_path=opts.new_providers,
        new_modules_path=opts.new_modules,
        output_dir=opts.output,
    )
# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()