#!/usr/bin/env python3
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Migration script to convert Apache Solr's legacy CHANGES.txt file format
to the new logchange YAML-based format.
This script parses the monolithic CHANGES.txt file and generates individual
YAML files for each changelog entry, organized by version.
"""
import os
import re
import sys
import json
import yaml
import html
from pathlib import Path
from dataclasses import dataclass, asdict, field
from typing import List, Optional, Tuple
class ChangeType:
"""Mapping of CHANGES.txt section headings to logchange types."""
# Section headings that should be skipped entirely (no entries created)
SKIP_SECTIONS = {
"Versions of Major Components",
"Detailed Change List",
"Upgrading from Solr any prior release",
"Upgrading from previous Solr versions",
"System Requirements",
"Lucene Information",
"Status",
}
# Maps various section heading patterns to logchange types
HEADING_MAP = {
# New Features / Additions
"New Features": "added",
"Features": "added",
"New Functionality": "added",
# Improvements / Changes
"Improvements": "changed",
"Enhancements": "changed",
"Changes": "changed",
"Improvements / Changes": "changed",
# Performance / Optimizations
"Optimizations": "changed",
"Performance": "changed",
"Optimization": "changed",
# Bug Fixes
"Bug Fixes": "fixed",
"Bug Fix": "fixed",
"Bugs": "fixed",
# Deprecations
"Deprecations": "deprecated",
"Deprecation": "deprecated",
"Deprecation Notices": "deprecated",
"Deprecation Removals": "removed", # This is more about removals but was in Deprecations section
# Removed / Removed Features
"Removed": "removed",
"Removal": "removed",
"Removed Features": "removed",
"Removals": "removed",
# Security
"Security": "security",
"Security Fixes": "security",
# Dependency Upgrades
"Dependency Upgrades": "dependency_update",
"Dependency Updates": "dependency_update",
"Dependency Upgrade": "dependency_update",
"Dependencies": "dependency_update",
# Build / Infrastructure
"Build": "other",
"Build Changes": "other",
"Build Fixes": "other",
# Upgrade Notes - special category
"Upgrade Notes": "upgrade_notes",
# Other
"Other Changes": "other",
"Other": "other",
"Miscellaneous": "other",
"Docker": "other",
"Ref Guide": "other",
"Documentation": "other",
}
@staticmethod
def get_type(heading: str) -> str:
"""Map a section heading to a logchange type."""
heading_normalized = heading.strip()
if heading_normalized in ChangeType.HEADING_MAP:
return ChangeType.HEADING_MAP[heading_normalized]
# Fallback: try case-insensitive matching
for key, value in ChangeType.HEADING_MAP.items():
if key.lower() == heading_normalized.lower():
return value
# Default to "other" if no match found
print(f"Warning: Unknown section heading '{heading}', defaulting to 'other'", file=sys.stderr)
return "other"
@dataclass
class Author:
"""Represents a changelog entry author/contributor."""
name: str
nick: Optional[str] = None
url: Optional[str] = None
def to_dict(self):
"""Convert to dictionary, excluding None values."""
result = {"name": self.name}
if self.nick:
result["nick"] = self.nick
if self.url:
result["url"] = self.url
return result
@dataclass
class Link:
"""Represents a link (JIRA issue or GitHub PR)."""
name: str
url: str
def to_dict(self):
"""Convert to dictionary."""
return {"name": self.name, "url": self.url}
@dataclass
class ChangeEntry:
"""Represents a single changelog entry."""
title: str
change_type: str
authors: List[Author] = field(default_factory=list)
links: List[Link] = field(default_factory=list)
def to_dict(self):
"""Convert to dictionary for YAML serialization."""
return {
"title": self.title,
"type": self.change_type,
"authors": [author.to_dict() for author in self.authors],
"links": [link.to_dict() for link in self.links],
}
class AuthorParser:
"""Parses author/contributor information from entry text."""
# Pattern to match a TRAILING author list at the end of an entry: (Author1, Author2 via Committer)
# The parenthesized group must sit at the very end of a line, optionally followed by punctuation.
# Because entries may contain other parenthesized phrases, parse_authors() collects all matches
# and treats only the last (rightmost) one as the author attribution.
AUTHOR_PATTERN = re.compile(r'\s+\(([^()]+)\)\s*[.,]?\s*$', re.MULTILINE)
# Pattern to detect JIRA/GitHub issue references (should be extracted as links, not authors)
# Matches: SOLR-65, LUCENE-123, INFRA-456, PR#789, PR-789, GITHUB#123
ISSUE_PATTERN = re.compile(r'^(?:SOLR|LUCENE|INFRA)-\d+$|^PR[#-]\d+$|^GITHUB#\d+$')
@staticmethod
def parse_authors(entry_text: str) -> Tuple[str, List[Author]]:
"""
Extract authors from entry text.
Returns:
Tuple of (cleaned_text, list_of_authors)
Patterns handled:
- (Author Name)
- (Author1, Author2)
- (Author Name via CommitterName)
- (Author1 via Committer1, Author2 via Committer2)
Only matches author attribution at the END of the entry text,
not in the middle of descriptions like (aka Standalone)
Note: JIRA/GitHub issue IDs found in the author section are NOT added as authors,
but are preserved in the returned text so IssueExtractor can process them as links.
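Example (illustrative):
parse_authors("Fix the widget (Jane Doe via John Smith)")
-> ("Fix the widget", [Author(name="Jane Doe"), Author(name="John Smith")])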
"""
# Find ALL matches and use the LAST one (rightmost)
# This ensures we get the actual author attribution, not mid-text parentheses
matches = list(AuthorParser.AUTHOR_PATTERN.finditer(entry_text))
if not matches:
return entry_text, []
# Use the last match (rightmost)
match = matches[-1]
author_text = match.group(1)
# Include the space before the parenthesis in what we remove
cleaned_text = entry_text[:match.start()].rstrip()
authors = []
found_issues = [] # Track JIRA issues found in author section
# Split by comma and slash, which are both used as delimiters in author sections
# Patterns handled:
# - "Author1, Author2" (comma delimiter)
# - "Author1 / Author2" (slash delimiter)
# - "Author1, Issue1 / Author2" (mixed delimiters)
# Also aware of "via" keyword: "Author via Committer"
segments = [seg.strip() for seg in re.split(r'[,/]', author_text)]
for segment in segments:
segment = segment.strip()
if not segment:
continue
# Check if this is a JIRA/GitHub issue reference
if AuthorParser.ISSUE_PATTERN.match(segment):
# Don't add as author, but remember to add it back to text for IssueExtractor
found_issues.append(segment)
continue
# Handle "via" prefix (standalone or after author name)
if segment.startswith('via '):
# Malformed segment: a standalone "via Committer" (a stray comma split "Author via Committer")
# Extract just the committer name
committer_name = segment[4:].strip() # Remove "via " prefix
if committer_name and not AuthorParser.ISSUE_PATTERN.match(committer_name):
authors.append(Author(name=committer_name))
elif ' via ' in segment:
# Format: "Author via Committer"
parts = segment.split(' via ')
author_name = parts[0].strip()
committer_name = parts[1].strip() if len(parts) > 1 else ""
# Add author if not an issue ID
if author_name and not AuthorParser.ISSUE_PATTERN.match(author_name):
authors.append(Author(name=author_name))
# Also add committer (the part after "via") as an author
if committer_name and not AuthorParser.ISSUE_PATTERN.match(committer_name):
authors.append(Author(name=committer_name))
else:
# Just an author name (if not an issue ID)
if not AuthorParser.ISSUE_PATTERN.match(segment):
authors.append(Author(name=segment))
# Add found issues back to the cleaned text so IssueExtractor can find them
if found_issues:
cleaned_text = cleaned_text + " " + " ".join(found_issues)
return cleaned_text, authors
class IssueExtractor:
"""Extracts issue/PR references from entry text."""
JIRA_ISSUE_PATTERN = re.compile(r'(?:SOLR|LUCENE|INFRA)-(\d+)')
GITHUB_PR_PATTERN = re.compile(r'(?:GitHub\s*)?#(\d+)')
@staticmethod
def extract_issues(entry_text: str) -> List[Link]:
"""Extract JIRA and GitHub issue references."""
links = []
seen_issues = set() # Track seen issues to avoid duplicates
# Extract SOLR, LUCENE, INFRA issues
for match in IssueExtractor.JIRA_ISSUE_PATTERN.finditer(entry_text):
issue_id = match.group(0) # Full "SOLR-12345" or "LUCENE-12345" format
if issue_id not in seen_issues:
url = f"https://issues.apache.org/jira/browse/{issue_id}"
links.append(Link(name=issue_id, url=url))
seen_issues.add(issue_id)
# Extract GitHub PRs in multiple formats:
# "PR#3758", "PR-2475", "GITHUB#3666"
github_patterns = [
(r'PR[#-](\d+)', 'PR#'), # PR#1234 or PR-1234
(r'GITHUB#(\d+)', 'GITHUB#'), # GITHUB#3666
]
for pattern_str, prefix in github_patterns:
pattern = re.compile(pattern_str)
for match in pattern.finditer(entry_text):
pr_num = match.group(1)
pr_name = f"{prefix}{pr_num}"
if pr_name not in seen_issues:
url = f"https://github.com/apache/solr/pull/{pr_num}"
links.append(Link(name=pr_name, url=url))
seen_issues.add(pr_name)
return links
class SlugGenerator:
"""Generates slug-style filenames for YAML files."""
# Characters that are unsafe in filenames on various filesystems
# Avoid: < > : " / \ | ? * and control characters
# Note: # is safe on most filesystems
UNSAFE_CHARS_PATTERN = re.compile(r'[<>:"/\\|?*\x00-\x1f]+')
@staticmethod
def generate_slug(issue_id: str, title: str) -> str:
"""
Generate a slug from issue ID and title.
Format: "<issue id> <short slug>", e.g. "ISSUE-12345 short slug" (a synthetic ID is used for entries without one).
Slug components are separated by spaces rather than dashes: spaces read better, preserve word
boundaries, and avoid unnecessary character substitutions.
Uses the actual issue ID without forcing SOLR- prefix
Ensures filesystem-safe filenames and respects word boundaries
Whitespace is preserved as spaces (not converted to dashes)
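Example (illustrative):
generate_slug("SOLR-12345", "Fix the Widget: handle null values")
-> "SOLR-12345 fix the widget handle null values"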
"""
# Sanitize issue_id to remove unsafe characters (preserve case and # for readability)
base_issue = SlugGenerator._sanitize_issue_id(issue_id)
# Create slug from title: lowercase, preserve spaces, replace only unsafe chars with dash
title_slug = SlugGenerator._sanitize_filename_part(title)
# Limit to reasonable length while respecting word boundaries
# Target max length: 50 chars for slug (leaving room for base_issue and space)
if len(title_slug) > 50:
# Find last word/space boundary within 50 chars
truncated = title_slug[:50]
# Find the last space within the limit
last_space = truncated.rfind(' ')
if last_space > 20: # Keep at least 20 chars to avoid too-short slugs
title_slug = truncated[:last_space]
else:
# If no good space boundary, try to find a dash (from unsafe chars)
last_dash = truncated.rfind('-')
if last_dash > 20:
title_slug = truncated[:last_dash]
else:
# If no good boundary, use hard limit and clean up
title_slug = truncated.rstrip(' -')
return f"{base_issue} {title_slug}"
@staticmethod
def _sanitize_issue_id(issue_id: str) -> str:
"""
Sanitize issue ID while preserving uppercase letters and # for readability.
Examples: SOLR-12345, LUCENE-1234, PR#3758, GITHUB#2408, v9.8.0-entry-001
"""
# Replace unsafe characters with dash (preserving case)
sanitized = SlugGenerator.UNSAFE_CHARS_PATTERN.sub('-', issue_id)
# Replace remaining unsafe characters (but keep letters/numbers/dash/hash/dot)
sanitized = re.sub(r'[^a-zA-Z0-9.#-]+', '-', sanitized)
# Replace multiple consecutive dashes with single dash
sanitized = re.sub(r'-+', '-', sanitized)
# Strip leading/trailing dashes
sanitized = sanitized.strip('-')
return sanitized
@staticmethod
def _sanitize_filename_part(text: str) -> str:
"""
Sanitize text for use in filenames.
- Convert to lowercase
- Remove quotes, colons, backticks
- Replace other unsafe characters with dashes
- Convert any whitespace to single space
- Strip leading/trailing spaces and dashes
"""
# Convert to lowercase
text = text.lower()
# Normalize all whitespace to single spaces
text = re.sub(r'\s+', ' ', text)
# Remove quotes, colons, backticks entirely (don't replace with dash)
text = re.sub(r'["\':´`]', '', text)
# Replace other unsafe characters (from UNSAFE_CHARS_PATTERN) with dash
# This covers: < > " / \ | ? * and control characters
# Note: we already removed quotes and colons above
text = SlugGenerator.UNSAFE_CHARS_PATTERN.sub('-', text)
# Replace other non-alphanumeric (except space, dash, and dot) with dash
text = re.sub(r'[^a-z0-9\s.\-]+', '-', text)
# Replace multiple consecutive dashes with single dash (but preserve spaces)
text = re.sub(r'-+', '-', text)
# Remove trailing dashes before we clean up space-dash sequences
text = text.rstrip('-')
# Handle " -" and "- " sequences: collapse to single space
text = re.sub(r'\s*-\s*', ' ', text)
# Replace multiple consecutive spaces with single space
text = re.sub(r'\s+', ' ', text)
# Strip leading/trailing spaces
text = text.strip(' ')
return text
class VersionSection:
"""Represents all entries for a specific version."""
def __init__(self, version: str):
self.version = version
self.entries: List[ChangeEntry] = []
def add_entry(self, entry: ChangeEntry):
"""Add an entry to this version."""
self.entries.append(entry)
def get_directory_name(self) -> str:
"""Get the directory name for this version (e.g., 'v10.0.0')."""
return f"v{self.version}"
class ChangesParser:
"""Main parser for CHANGES.txt file."""
# Pattern to match version headers: ================== 10.0.0 ==================
# Also supports pre-release versions: 4.0.0-ALPHA, 4.0.0-BETA, 4.0.0-RC1, etc.
VERSION_HEADER_PATTERN = re.compile(r'=+\s+([\d.]+(?:-[A-Za-z0-9]+)?)\s+=+')
# Pattern to match section headers: "Section Name" followed by dashes
# Matches patterns like "New Features\n---------------------"
SECTION_HEADER_PATTERN = re.compile(r'^([A-Za-z][A-Za-z0-9\s/&-]*?)\n\s*-+\s*$', re.MULTILINE)
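# Expected CHANGES.txt shape (illustrative):
#   ==================  9.9.0 ==================
#   New Features
#   ---------------------
#   * SOLR-12345: Add a new widget (Jane Doe)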
def __init__(self, changes_file_path: str):
self.changes_file_path = changes_file_path
self.versions: List[VersionSection] = []
def parse(self):
"""Parse the CHANGES.txt file."""
with open(self.changes_file_path, 'r', encoding='utf-8') as f:
content = f.read()
# Split into version sections
version_matches = list(self.VERSION_HEADER_PATTERN.finditer(content))
for i, version_match in enumerate(version_matches):
version = version_match.group(1)
start_pos = version_match.end()
# Find the end of this version section (start of next version or EOF)
if i + 1 < len(version_matches):
end_pos = version_matches[i + 1].start()
else:
end_pos = len(content)
version_content = content[start_pos:end_pos]
version_section = self._parse_version_section(version, version_content)
self.versions.append(version_section)
def _parse_version_section(self, version: str, content: str) -> VersionSection:
"""Parse all entries within a single version section."""
version_section = VersionSection(version)
# Split into subsections (New Features, Bug Fixes, etc.)
section_matches = list(self.SECTION_HEADER_PATTERN.finditer(content))
for i, section_match in enumerate(section_matches):
section_name = section_match.group(1)
# Skip sections that should not be migrated
if section_name in ChangeType.SKIP_SECTIONS:
continue
section_type = ChangeType.get_type(section_name)
start_pos = section_match.end()
# Find the end of this section (start of next section or EOF)
if i + 1 < len(section_matches):
end_pos = section_matches[i + 1].start()
else:
end_pos = len(content)
section_content = content[start_pos:end_pos]
# Parse entries in this section
entries = self._parse_entries(section_content, section_type)
for entry in entries:
version_section.add_entry(entry)
return version_section
def _parse_entries(self, section_content: str, change_type: str) -> List[ChangeEntry]:
"""Parse individual entries within a section.
Handles both:
- Bulleted entries: * text
- Numbered entries: 1. text, 2. text, etc. (older format)
"""
entries = []
# First try to split by bulleted entries (* prefix)
bulleted_pattern = re.compile(r'^\*\s+', re.MULTILINE)
bulleted_entries = bulleted_pattern.split(section_content)
if len(bulleted_entries) > 1:
# Has bulleted entries
for entry_text in bulleted_entries[1:]: # Skip the preamble before the first bullet
entry_text = entry_text.strip()
if not entry_text or entry_text == "(No changes)":
continue
entry = self._parse_single_entry(entry_text, change_type)
if entry:
entries.append(entry)
else:
# No bulleted entries, try numbered entries (old format: "1. text", "2. text", etc.)
numbered_pattern = re.compile(r'^\s{0,2}\d+\.\s+', re.MULTILINE)
if numbered_pattern.search(section_content):
# Has numbered entries
numbered_entries = numbered_pattern.split(section_content)
for entry_text in numbered_entries[1:]: # Skip the preamble before the first numbered item
entry_text = entry_text.strip()
if not entry_text:
continue
entry = self._parse_single_entry(entry_text, change_type)
if entry:
entries.append(entry)
else:
# No standard entries found, try as paragraph
entry_text = section_content.strip()
if entry_text and entry_text != "(No changes)":
entry = self._parse_single_entry(entry_text, change_type)
if entry:
entries.append(entry)
return entries
def _parse_single_entry(self, entry_text: str, change_type: str) -> Optional[ChangeEntry]:
"""Parse a single entry into a ChangeEntry object."""
# Extract authors
description, authors = AuthorParser.parse_authors(entry_text)
# Extract issues/PRs
links = IssueExtractor.extract_issues(description)
# Remove all issue/PR IDs from the description text
# Handle multiple formats of issue references at the beginning:
# 1. Remove leading issues with mixed projects: "LUCENE-3323,SOLR-2659,LUCENE-3329,SOLR-2666: description"
description = re.sub(r'^(?:(?:SOLR|LUCENE|INFRA)-\d+(?:\s*[,:]?\s*)?)+:\s*', '', description)
# 2. Remove SOLR-specific issues: "SOLR-12345: description" or "SOLR-12345, SOLR-12346: description"
description = re.sub(r'^(?:SOLR-\d+(?:\s*,\s*SOLR-\d+)*\s*[:,]?\s*)+', '', description)
# 3. Remove PR references: "PR#123: description" or "GITHUB#456: description"
description = re.sub(r'^(?:(?:PR|GITHUB)#\d+(?:\s*,\s*(?:PR|GITHUB)#\d+)*\s*[:,]?\s*)+', '', description)
# 4. Remove parenthesized issue lists at start: "(SOLR-123, SOLR-456)"
description = re.sub(r'^\s*\((?:SOLR-\d+(?:\s*,\s*)?)+\)\s*', '', description)
description = re.sub(r'^\s*\((?:(?:SOLR|LUCENE|INFRA)-\d+(?:\s*,\s*)?)+\)\s*', '', description)
# 5. Remove any remaining leading issue references
description = re.sub(r'^[\s,;]*(?:SOLR-\d+|LUCENE-\d+|INFRA-\d+|PR#\d+|GITHUB#\d+)[\s,:;]*', '', description)
while re.match(r'^[\s,;]*(?:SOLR-\d+|LUCENE-\d+|INFRA-\d+|PR#\d+|GITHUB#\d+)', description):
description = re.sub(r'^[\s,;]*(?:SOLR-\d+|LUCENE-\d+|INFRA-\d+|PR#\d+|GITHUB#\d+)[\s,:;]*', '', description)
description = description.strip()
# Normalize whitespace: collapse multiple newlines/spaces into single spaces
# This joins multi-line formatted text into a single coherent paragraph
description = re.sub(r'\s+', ' ', description)
# Escape HTML angle brackets to prevent markdown rendering issues
# Only escape < and > to avoid breaking markdown links and quotes
description = description.replace('<', '&lt;').replace('>', '&gt;')
if not description:
return None
return ChangeEntry(
title=description,
change_type=change_type,
authors=authors,
links=links,
)
class YamlWriter:
"""Writes ChangeEntry objects to YAML files."""
@staticmethod
def write_entry(entry: ChangeEntry, slug: str, output_dir: Path):
"""Write a single entry to a YAML file."""
# Ensure output directory exists
output_dir.mkdir(parents=True, exist_ok=True)
filename = f"{slug}.yml"
filepath = output_dir / filename
# Convert entry to dictionary and write as YAML
entry_dict = entry.to_dict()
with open(filepath, 'w', encoding='utf-8') as f:
# Dump with options chosen for readable, stable output
yaml.dump(
entry_dict,
f,
default_flow_style=False,
sort_keys=False,
allow_unicode=True,
width=80 # Line width for better readability
)
return filepath
class ReleaseDate:
"""Fetches and manages release dates from Apache projects JSON."""
@staticmethod
def fetch_release_dates_and_latest() -> tuple:
"""
Fetch release dates from Apache projects JSON and identify latest version.
Returns:
Tuple of (version_dates_dict, latest_version_string)
Example: ({'9.9.0': '2025-07-24', ...}, '9.9.0')
"""
import urllib.request
from packaging import version as pkg_version
version_dates = {}
latest_version = None
latest_version_obj = None
url = "https://projects.apache.org/json/projects/solr.json"
try:
response = urllib.request.urlopen(url, timeout=10)
data = json.loads(response.read().decode('utf-8'))
releases = data.get('release', [])
for release in releases:
ver = release.get('revision')
created = release.get('created')
if ver and created:
version_dates[ver] = created
# Track the latest (highest) version
try:
ver_obj = pkg_version.parse(ver)
if latest_version_obj is None or ver_obj > latest_version_obj:
latest_version_obj = ver_obj
latest_version = ver
except Exception:
# Skip invalid version strings
pass
except Exception as e:
print(f"Warning: Could not fetch release dates: {e}", file=sys.stderr)
return version_dates, latest_version
class VersionWriter:
"""Handles version enumeration, comparison, and release-date.txt writing."""
def __init__(self, changes_file_path: str, changelog_dir: str):
self.changes_file_path = changes_file_path
self.changelog_dir = Path(changelog_dir)
self.parser = ChangesParser(changes_file_path)
# Fetch release dates from Apache projects JSON
version_dates_raw, _ = ReleaseDate.fetch_release_dates_and_latest()
# Normalize version keys for consistent lookup (e.g., "3.1" -> "3.1.0")
self.version_dates = {}
for version, date in version_dates_raw.items():
normalized = self._normalize_version(version)
# Keep the first occurrence (most canonical form)
if normalized not in self.version_dates:
self.version_dates[normalized] = date
def run(self):
"""Execute version comparison and release-date.txt writing."""
print("Parsing CHANGES.txt for versions...")
self.parser.parse()
# Extract versions from CHANGES.txt
changes_versions = set(vs.version for vs in self.parser.versions)
print(f"Found {len(changes_versions)} versions in CHANGES.txt")
# Get existing version folders
existing_folders = self.get_existing_version_folders()
print(f"Found {len(existing_folders)} existing version folders in changelog/")
# Get versions from solr.json (which is what ReleaseDate fetches)
solr_json_versions = set(self.version_dates.keys())
print(f"Found {len(solr_json_versions)} versions in solr.json\n")
# Build normalized version mappings for matching (supports semver like 3.1 == 3.1.0)
changes_normalized = {self._normalize_version(v): v for v in changes_versions}
existing_normalized = {self._normalize_version(v): v for v in existing_folders}
solr_normalized = {self._normalize_version(v): v for v in solr_json_versions}
# Combine all normalized versions
all_normalized = sorted(set(changes_normalized.keys()) | set(solr_normalized.keys()) | set(existing_normalized.keys()),
key=self._version_sort_key)
# Print comparison report
self._print_comparison_report(all_normalized, changes_normalized, solr_normalized, existing_normalized)
# Write release-date.txt for existing folders
self._write_release_dates(existing_normalized)
def get_existing_version_folders(self) -> set:
"""Get all existing vX.Y.Z folders in changelog/."""
if not self.changelog_dir.exists():
return set()
folders = set()
for item in self.changelog_dir.iterdir():
if item.is_dir() and item.name.startswith('v') and item.name[1:].replace('.', '').isdigit():
# Extract version without 'v' prefix
version = item.name[1:]
folders.add(version)
return folders
@staticmethod
def _normalize_version(version: str) -> str:
"""
Normalize incomplete version strings to X.Y.Z format.
Complete versions (3+ numeric parts) are left unchanged.
Incomplete versions are padded with zeros.
Pre-release versions (e.g., 4.0.0-ALPHA) are handled correctly.
Supports semantic versioning where "3.1" matches "3.1.0".
But keeps distinct versions separate: 3.6.0, 3.6.1, 3.6.2 are NOT normalized to the same value.
Examples:
- "3.1" -> "3.1.0" (2 parts, pad to 3)
- "3" -> "3.0.0" (1 part, pad to 3)
- "3.1.0" -> "3.1.0" (3 parts, unchanged)
- "3.6.1" -> "3.6.1" (3 parts, unchanged)
- "3.6.2" -> "3.6.2" (3 parts, unchanged - NOT collapsed!)
- "4.0.0-ALPHA" -> "4.0.0-ALPHA" (pre-release, unchanged)
- "4.0-ALPHA" -> "4.0.0-ALPHA" (incomplete pre-release, pad to 3 numeric parts)
- "4.0.0-ALPHA.0" -> "4.0.0-ALPHA" (remove spurious .0 from pre-release)
- "3.1.0.0" -> "3.1.0.0" (4 parts, unchanged)
"""
# Check if this is a pre-release version (contains dash)
if '-' in version:
# Split on the dash to separate numeric version from pre-release identifier
base_version, prerelease = version.split('-', 1)
base_parts = base_version.split('.')
# Pad the base version to 3 parts
while len(base_parts) < 3:
base_parts.append('0')
# Take only the first 3 numeric parts, then rejoin with the pre-release identifier;
# padding never touches the identifier, so "4.0-ALPHA" cannot become "4.0-ALPHA.0"
normalized_base = '.'.join(base_parts[:3])
return f"{normalized_base}-{prerelease}"
else:
# Non-pre-release version - use original logic
parts = version.split('.')
# If already 3+ parts, return as-is (complete version)
if len(parts) >= 3:
return version
# If less than 3 parts, pad with zeros to make it 3 parts
while len(parts) < 3:
parts.append('0')
return '.'.join(parts)
def _version_sort_key(self, version: str) -> tuple:
"""Convert version string to sortable tuple for proper ordering."""
try:
from packaging import version as pkg_version
return (pkg_version.parse(version),)
except Exception:
return (version,)
def _print_comparison_report(self, all_normalized_versions: list, changes_normalized: dict,
solr_normalized: dict, existing_normalized: dict):
"""
Print a comparison report of versions across sources.
Args:
all_normalized_versions: List of normalized versions to display
changes_normalized: Dict mapping normalized version -> original version from CHANGES.txt
solr_normalized: Dict mapping normalized version -> original version from solr.json
existing_normalized: Dict mapping normalized version -> original version from folders
"""
print("=" * 100)
print(f"{'Normalized':<15} | {'CHANGES.txt':<15} | {'solr.json':<15} | {'Folder':<15} | {'Release Date':<20}")
print("-" * 100)
for norm_version in all_normalized_versions:
in_changes = "✓" if norm_version in changes_normalized else " "
in_solr_json = "✓" if norm_version in solr_normalized else " "
has_folder = "✓" if norm_version in existing_normalized else " "
# Get original version strings for display
orig_changes = changes_normalized.get(norm_version, "")
orig_solr = solr_normalized.get(norm_version, "")
orig_folder = existing_normalized.get(norm_version, "")
# Get release date using normalized version (all version_dates keys are normalized)
release_date = self.version_dates.get(norm_version, "(no date)")
# Format original versions as "orig" if different from normalized
changes_str = f"{orig_changes}" if orig_changes and orig_changes != norm_version else ""
solr_str = f"{orig_solr}" if orig_solr and orig_solr != norm_version else ""
folder_str = f"{orig_folder}" if orig_folder and orig_folder != norm_version else ""
print(f"{norm_version:<15} | {in_changes} {changes_str:<13} | {in_solr_json} {solr_str:<13} | {has_folder} {folder_str:<13} | {release_date:<20}")
print("=" * 100)
def _write_release_dates(self, existing_normalized: dict):
"""
Write release-date.txt files for existing version folders that don't have them.
Args:
existing_normalized: Dict mapping normalized version -> original folder version string
"""
written_count = 0
skipped_count = 0
print("\nWriting release-date.txt files:")
for norm_version in sorted(existing_normalized.keys(), key=self._version_sort_key):
orig_folder_version = existing_normalized[norm_version]
version_dir = self.changelog_dir / f"v{orig_folder_version}"
release_date_file = version_dir / "release-date.txt"
# Get release date using normalized version (all version_dates keys are normalized)
release_date = self.version_dates.get(norm_version)
if release_date:
if release_date_file.exists():
existing_content = release_date_file.read_text().strip()
if existing_content == release_date:
print(f" ✓ {orig_folder_version}: already has release-date.txt")
else:
print(f" âš  {orig_folder_version}: already has release-date.txt with different date ({existing_content})")
skipped_count += 1
else:
with open(release_date_file, 'w', encoding='utf-8') as f:
f.write(release_date + '\n')
version_display = f"{orig_folder_version} (normalized: {norm_version})" if orig_folder_version != norm_version else orig_folder_version
print(f" ✓ {version_display}: wrote release-date.txt ({release_date})")
written_count += 1
else:
version_display = f"{orig_folder_version} (normalized: {norm_version})" if orig_folder_version != norm_version else orig_folder_version
print(f" âš  {version_display}: no date found in solr.json")
skipped_count += 1
print(f"\nSummary: {written_count} files written, {skipped_count} skipped/existing")
class MigrationRunner:
"""Orchestrates the complete migration process."""
def __init__(self, changes_file_path: str, output_base_dir: str, last_released_version: Optional[str] = None):
self.changes_file_path = changes_file_path
self.output_base_dir = Path(output_base_dir)
self.parser = ChangesParser(changes_file_path)
# Fetch release dates and latest version
self.version_dates, detected_latest = ReleaseDate.fetch_release_dates_and_latest()
# Use provided version or detected latest
self.last_released_version = last_released_version or detected_latest
if self.last_released_version:
print(f"Latest released version: {self.last_released_version}", file=sys.stderr)
self.stats = {
'versions_processed': 0,
'entries_migrated': 0,
'entries_skipped': 0,
'files_created': 0,
'release_dates_written': 0,
'unreleased_entries': 0,
}
def run(self):
"""Execute the migration."""
print(f"Parsing CHANGES.txt from: {self.changes_file_path}")
self.parser.parse()
print(f"Found {len(self.parser.versions)} versions")
for version_section in self.parser.versions:
self._process_version(version_section)
self._print_summary()
def _process_version(self, version_section: VersionSection):
"""Process all entries for a single version."""
from packaging import version as pkg_version
# Determine if this version should go to unreleased folder
is_unreleased = False
if self.last_released_version:
try:
current_ver = pkg_version.parse(version_section.version)
latest_ver = pkg_version.parse(self.last_released_version)
is_unreleased = current_ver > latest_ver
except Exception:
# If parsing fails, treat as unreleased (conservative approach)
is_unreleased = True
# Route to appropriate directory
if is_unreleased:
version_dir = self.output_base_dir / "unreleased"
print(f"\nProcessing version {version_section.version} (unreleased):")
self.stats['unreleased_entries'] += len(version_section.entries)
else:
version_dir = self.output_base_dir / version_section.get_directory_name()
print(f"\nProcessing version {version_section.version}:")
print(f" Found {len(version_section.entries)} entries")
# Write release-date.txt if we have a date for this version (only for released versions)
if not is_unreleased and version_section.version in self.version_dates:
release_date = self.version_dates[version_section.version]
release_date_file = version_dir / "release-date.txt"
version_dir.mkdir(parents=True, exist_ok=True)
with open(release_date_file, 'w', encoding='utf-8') as f:
f.write(release_date + '\n')
self.stats['release_dates_written'] += 1
print(f" Release date: {release_date}")
entry_counter = 0 # For entries without explicit issue IDs
for entry in version_section.entries:
# Find primary issue ID from links
issue_id = None
for link in entry.links:
if link.name.startswith('SOLR-'):
issue_id = link.name
break
if not issue_id:
# If no SOLR issue found, try to use other JIRA/PR formats
for link in entry.links:
if link.name.startswith(('LUCENE-', 'INFRA-', 'PR#', 'GITHUB#')):
issue_id = link.name
break
if not issue_id:
# No standard issue/PR found, generate a synthetic ID
# Use format: unknown-001, unknown-002, etc.
entry_counter += 1
synthetic_id = f"unknown-{entry_counter:03d}"
issue_id = synthetic_id
# Generate slug and write YAML
slug = SlugGenerator.generate_slug(issue_id, entry.title)
filepath = YamlWriter.write_entry(entry, slug, version_dir)
print(f" ✓ {slug}.yml")
self.stats['entries_migrated'] += 1
self.stats['files_created'] += 1
self.stats['versions_processed'] += 1
def _print_summary(self):
"""Print migration summary."""
print("\n" + "="*60)
print("Migration Summary:")
print(f" Versions processed: {self.stats['versions_processed']}")
print(f" Entries migrated: {self.stats['entries_migrated']}")
print(f" Entries skipped: {self.stats['entries_skipped']}")
print(f" Files created: {self.stats['files_created']}")
print(f" Release dates written: {self.stats['release_dates_written']}")
if self.stats['unreleased_entries'] > 0:
print(f" Unreleased entries: {self.stats['unreleased_entries']}")
print("="*60)
class StdinProcessor:
"""Process individual changelog entries from stdin and output YAML to stdout."""
@staticmethod
def process():
"""
Read from stdin, parse individual changelog entries, and output YAML.
Ignores headers and nested structure.
Outputs YAML entries separated by '----' lines.
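Example (illustrative):
echo "* SOLR-12345: Fix the widget (Jane Doe)" | python migrate_changes.py -
emits a YAML entry with title "Fix the widget", type "other",
author "Jane Doe" and a link to SOLR-12345.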
"""
import sys
# Read all lines from stdin
lines = sys.stdin.readlines()
entries_yaml = []
i = 0
while i < len(lines):
line = lines[i]
# Skip empty lines and header lines (lines with only dashes or equals)
if not line.strip() or re.match(r'^[-=\s]+$', line):
i += 1
continue
# Check if this line starts a changelog entry (bullet point)
if line.strip().startswith('*') or line.strip().startswith('-'):
# Collect the full entry (may span multiple lines)
entry_text = line.strip()[1:].strip() # Remove bullet and leading spaces
# Continue reading continuation lines
i += 1
while i < len(lines):
next_line = lines[i]
# If the next line is another entry or empty, stop collecting
if (next_line.strip().startswith('*') or
next_line.strip().startswith('-') or
re.match(r'^[-=\s]+$', next_line) or
not next_line.strip()):
break
# Add to entry text
entry_text += ' ' + next_line.strip()
i += 1
# Parse the entry to a ChangeEntry
entry = EntryParser.parse_entry_line(entry_text)
if entry:
# Serialize to YAML
yaml_dict = {
'title': entry.title,
'type': entry.change_type,
}
if entry.authors:
yaml_dict['authors'] = [{'name': a.name} for a in entry.authors]
if entry.links:
yaml_dict['links'] = [
{'name': link.name, 'url': link.url}
for link in entry.links
]
yaml_str = yaml.dump(yaml_dict, default_flow_style=False, sort_keys=False, allow_unicode=True)
entries_yaml.append(yaml_str.rstrip())
else:
i += 1
# Output entries separated by YAML separators
for i, yaml_entry in enumerate(entries_yaml):
if i > 0:
print('----')
print(yaml_entry, end='')
if yaml_entry and not yaml_entry.endswith('\n'):
print()
class EntryParser:
"""Parse a single changelog entry line."""
@staticmethod
def parse_entry_line(text: str) -> Optional[ChangeEntry]:
"""
Parse a single changelog entry line.
Format: [ISSUE-ID: ]description (author1, author2 via committer)
"""
if not text.strip():
return None
# Extract issue links
links = IssueExtractor.extract_issues(text)
# Remove issue IDs from text
for link in links:
# Remove markdown link format [ID](url)
text = re.sub(rf'\[{re.escape(link.name)}\]\([^)]+\)', '', text)
# Remove plain text issue IDs
text = re.sub(rf'{re.escape(link.name)}\s*:?\s*', '', text)
text = text.strip()
# Extract authors
text, authors = AuthorParser.parse_authors(text)
text = text.strip()
# Escape HTML angle brackets
text = text.replace('<', '&lt;').replace('>', '&gt;')
if not text:
return None
# Default to 'other' type
change_type = 'other'
return ChangeEntry(
title=text,
change_type=change_type,
authors=authors,
links=links,
)
def main():
"""Main entry point."""
import argparse
parser = argparse.ArgumentParser(
description="Migrate Apache Solr CHANGES.txt to logchange YAML format"
)
parser.add_argument(
"changes_file",
help="Path to the CHANGES.txt file to migrate. Use '-' to read individual changelog entries from stdin and output YAML to stdout"
)
parser.add_argument(
"-o", "--output-dir",
default="changelog",
help="Directory to write changelog/ structure (default: ./changelog)"
)
parser.add_argument(
"--last-released",
help="Last released version (e.g., 9.9.0). Versions newer than this go to unreleased/. "
"If not specified, fetches from Apache projects JSON."
)
parser.add_argument(
"--write-versions",
action="store_true",
help="Parse CHANGES.txt to enumerate versions, compare with solr.json, and write release-date.txt files to existing changelog folders"
)
args = parser.parse_args()
# Handle stdin/stdout mode
if args.changes_file == '-':
StdinProcessor.process()
return
if not os.path.exists(args.changes_file):
print(f"Error: CHANGES.txt file not found: {args.changes_file}", file=sys.stderr)
sys.exit(1)
# Handle --write-versions mode
if args.write_versions:
writer = VersionWriter(args.changes_file, args.output_dir)
writer.run()
return
# Standard migration mode
runner = MigrationRunner(args.changes_file, args.output_dir, args.last_released)
runner.run()
if __name__ == "__main__":
main()