blob: e7ab1cb1ce0e0812f9e2c098e98d21ff1fa48841 [file] [log] [blame]
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Simple script that converts old refguide page names as of 8.11.1 to the new Antora URLs from 9.0
See input files in folder gen-refguide-redirects/
The old-guide.txt is the plain .adoc names from an 'ls | grep adoc' in old ref-guide src folder
The new-guide.txt is the output from this command from the new repo in the 'modules' folder:
find . | grep adoc | sed 's/\/pages//g' | sed 's/^.\///g'
The mappings.csv comes from the explicit page renamings sourced from spreadsheet
https://docs.google.com/spreadsheets/d/1mwxSpn5Ky7-P4DLFrJGel2h7Il4muTlHmAA-AuRY1rs/edit#gid=982988701
"""
import os
import sys
from pprint import pprint
sys.path.append(os.path.dirname(__file__))
import argparse
def read_config():
parser = argparse.ArgumentParser(description='Convert old refguide page names to new')
parser.add_argument('--old', required=True, help='Old pagenames file, one .adoc filename per line')
parser.add_argument('--new', required=True, help='New pagenames file, one .adoc filename per line')
parser.add_argument('--mapping', required=True, help='Semicolon separated from-to file names (adoc)')
parser.add_argument('--htaccess', action='store_true', default=False, help='Output as htaccess rules')
newconf = parser.parse_args()
return newconf
def out(text):
global conf
if not conf.htaccess:
print(text)
def lines_from_file(filename):
with open(filename, 'r') as fp:
lines = []
for line in fp.readlines():
if line.startswith("#") or len(line.strip()) == 0:
continue
lines.append(line.replace(".adoc", ".html").strip())
return lines
def main():
global conf
conf = read_config()
new = {}
name_map = {}
out("Reading config")
old = lines_from_file(conf.old)
for line in lines_from_file(conf.new):
(path, file) = line.split("/")
new[file] = line
for line in lines_from_file(conf.mapping):
(frm, to) = line.split(";")
name_map[frm] = to
# Files in src/old-pages as of 2022-02-04
old_pages = ["configuration-apis.html", "configuration-guide.html", "controlling-results.html", "deployment-guide.html", "enhancing-queries.html", "field-types.html", "fields-and-schema-design.html", "getting-started.html", "indexing-data-operations.html", "installation-deployment.html", "monitoring-solr.html", "query-guide.html", "scaling-solr.html", "schema-indexing-guide.html", "solr-concepts.html", "solr-schema.html", "solrcloud-clusters.html", "user-managed-clusters.html"]
result = {}
old_guide = []
failed = {}
regex_new = {}
out("Converting...")
for frm in old:
if frm in new:
(subpath, name) = new[frm].split("/")
if subpath not in regex_new:
regex_new[subpath] = []
regex_new[subpath].append(name.split(".html")[0])
elif frm in name_map:
new_name = name_map[frm]
new_name_without_anchor = new_name
anchor = ""
anchor_index = new_name.find("#")
if anchor_index > 0:
new_name_without_anchor = new_name[:anchor_index]
anchor = new_name[anchor_index:]
if new_name_without_anchor.startswith("https://"):
result[frm] = new_name
elif new_name_without_anchor in new:
result[frm] = new[new_name_without_anchor] + anchor
elif new_name_without_anchor.startswith("/guide/"):
result[frm] = new_name[7:]
elif new_name_without_anchor == "_8_11":
old_guide.append(frm.split(".html")[0])
else:
failed[frm] = "Mapped value %s not in new guide" % new_name_without_anchor
elif frm in old_pages:
failed[frm] = "Not yet mapped (in src/old-pages)"
else:
failed[frm] = "404"
if conf.htaccess:
print("# Existing pages moved to sub path in the 9.0 guide")
for key in regex_new:
print("RedirectMatch 301 ^/guide/(%s)\.html /guide/solr/latest/%s/$1.html" % ("|".join(regex_new[key]), key))
print("# Page renames between 8.x and 9.0")
print("RewriteRule ^guide/9_0/solr-tutorial.html /guide/solr/latest/getting-started/solr-tutorial.html [R=301,NE,L]")
for key in result:
if result[key].startswith("https://"):
print("RewriteRule ^guide/%s %s [R=301,NE,L]" % (key, result[key]))
else:
print("RewriteRule ^guide/%s /guide/solr/latest/%s [R=301,NE,L]" % (key, result[key]))
print("# Removed pages redirected to latest 8.x guide")
old_version_pages_regex = "(%s)\.html" % "|".join(old_guide)
print("RedirectMatch 301 ^/guide/%s /guide/8_11/$1.html" % old_version_pages_regex)
print("# Paths we could not map")
for key in failed:
print("# %s: %s" % (key, failed[key]))
print("""
# Do not index old reference guide pages on search engines, except for pages that don't exist in 9+
<If "%%{REQUEST_URI} =~ m#/guide/(6|7|8)_.*# && %%{REQUEST_URI} !~ m#/guide/8_11/%s$#">
Header set X-Robots-Tag "noindex,nofollow,noarchive"
</If>""" % old_version_pages_regex)
else:
out("Regex mappings:")
pprint(regex_new)
out("Rename mappings:")
pprint(result)
out("Old refGuide mappings:")
pprint(old_guide)
out("Failed mappings:")
pprint(failed)
if __name__ == '__main__':
try:
main()
except KeyboardInterrupt:
print('\nReceived Ctrl-C, exiting early')