blob: 67ed2b14249be9bb5fddae7e91370819077dd83e [file] [log] [blame]
#!/usr/bin/python
# Usage: scripts/synthesize_entries.py \
# third_party/effective_tld_names/effective_tld_names.dat
#
# Author: Jeff Kaufman (jefftk@google.com)
#
# There are two unusual cases that the C code for the domain registry provider
# can't handle properly:
#
# 1. list contains both *.c and a.b.c
# 2. list contains a.b.c.d and c.d but not b.c.d
#
# We can fix this by synthesizing entries for b.c in case 1 and b.c.d in case
# 2. The first fix is completely ok, just redundant, but the second case means
# categorizing foo.b.c.d as being under public suffix b.c.d instead of the
# technically correct c.d. This is rare and doesn't seem to be a problem for
# existing entries (like amazonaws.com), so for now we'll just work around the
# broken code with slightly messy data.
#
# See https://code.google.com/p/domain-registry-provider/issues/detail?id=3 for
# more discussion.
import sys
from collections import defaultdict
def _read_entries(fname):
    """Parse the suffix list at fname.

    Returns a tuple (entries, needs_newline): entries is the set of rules
    with comments and surrounding whitespace removed, and needs_newline is
    True when the file is non-empty and its last line is not terminated by
    a newline.
    """
    entries = set()
    last_raw = ""
    with open(fname) as inf:
        for raw in inf:
            last_raw = raw
            rule = raw.split("//")[0].strip()  # remove comments and whitespace
            if rule:
                entries.add(rule)
    # Only the final line of a file can be missing its newline.
    needs_newline = bool(last_raw) and not last_raw.endswith("\n")
    return entries, needs_newline


def _find_missing_entries(entries):
    """Return {new_entry: [comments], ...} for entries to be synthesized."""
    # Allow multiple comments per entry, in case we want the same entry
    # multiple times for different reasons.
    new_entries = defaultdict(list)  # {new_entry: [comments], ...}
    for entry in entries:
        if entry.count(".") < 2:
            continue
        leaf, candidate_entry = entry.split(".", 1)
        if leaf.startswith("!") or leaf == "*":
            # Exception and wildcard rules never trigger synthesis.
            continue
        if candidate_entry in entries:
            continue
        parent = candidate_entry.split(".", 1)[1]
        if "*.%s" % parent in entries or parent in entries:
            # If entry is 'a.b.c' and '*.c' is also an entry, we need to
            # synthesize a new entry 'b.c'.
            #
            # If entry is a.b.c and c is also an entry but b.c is not, we
            # need to synthesize a new entry b.c. This one isn't
            # technically legal, but it's better than getting it completely
            # wrong which is what the C code would otherwise do with this
            # case.
            new_entries[candidate_entry].append("For %s" % entry)
    return new_entries


def start(effective_tld_names_fname):
    """Append synthesized entries to the suffix list file, in place.

    Reads the rules in effective_tld_names_fname, works out which
    intermediate entries have to be synthesized, and appends them to the
    same file inside a BEGIN/END comment-delimited section. Each synthesized
    entry is preceded by "For <entry>" comments naming the rules that
    required it. The file is left untouched when nothing needs to be
    synthesized, so running the script a second time is a no-op.

    Args:
      effective_tld_names_fname: path of the suffix list file to read and
        append to.

    Returns:
      None.
    """
    entries, needs_newline = _read_entries(effective_tld_names_fname)
    new_entries = _find_missing_entries(entries)
    if not new_entries:
        return

    category = "DOMAIN REGISTRY PROVIDER SYNTHESIZED DOMAINS"
    out_lines = []
    if needs_newline:
        # Terminate the unterminated last rule so the BEGIN marker does not
        # end up on the same line as that rule.
        out_lines.append("\n")
    out_lines.append("// ===BEGIN %s===\n" % category)
    out_lines.append("// synthesized by scripts/synthesize_entries.py\n")
    # Sort entries and their comments so the output is deterministic.
    for new_entry in sorted(new_entries):
        for comment in sorted(new_entries[new_entry]):
            out_lines.append("// %s\n" % comment)
        out_lines.append("%s\n" % new_entry)
    out_lines.append("// ===END %s===\n" % category)

    with open(effective_tld_names_fname, "a") as outf:
        outf.writelines(out_lines)
if __name__ == "__main__":
    # Exactly one argument is expected: the path of the suffix list file to
    # update in place. Print a usage message instead of an opaque TypeError
    # traceback when the argument count is wrong.
    if len(sys.argv) != 2:
        sys.exit("Usage: %s path/to/effective_tld_names.dat" % sys.argv[0])
    start(sys.argv[1])