/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.analysis.standard;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.io.Writer;
import java.net.URL;
import java.net.URLConnection;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.text.DateFormat;
import java.util.ArrayList;
import java.util.Comparator;
import java.util.Date;
import java.util.List;
import java.util.Locale;
import java.util.SortedMap;
import java.util.SortedSet;
import java.util.TimeZone;
import java.util.TreeMap;
import java.util.TreeSet;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Generates a file containing JFlex macros to accept valid ASCII TLDs (top level domains), for
 * inclusion in JFlex grammars that can accept domain names.
 *
 * <p>The IANA Root Zone Database is queried via HTTP from URL cmdline arg #0, the response is
 * parsed, and the results are written out to a file containing a JFlex macro that will accept all
 * valid ASCII-only TLDs, including punycode forms of internationalized TLDs (output file cmdline
 * arg #1).
 */
public class GenerateJflexTLDMacros {

  public static void main(String... args) throws Exception {
    if (args.length != 2 || args[0].equals("--help") || args[0].equals("-help")) {
      System.err.println("Cmd line params:");
      System.err.println(
          "\tjava " + GenerateJflexTLDMacros.class.getName() + "<ZoneFileURL> <JFlexOutputFile>");
      System.exit(1);
    }
    new GenerateJflexTLDMacros(args[0], args[1]).execute();
  }

  private static final String NL = System.getProperty("line.separator");

  private static final String APACHE_LICENSE =
      "/*"
          + NL
          + " * Licensed to the Apache Software Foundation (ASF) under one or more"
          + NL
          + " * contributor license agreements.  See the NOTICE file distributed with"
          + NL
          + " * this work for additional information regarding copyright ownership."
          + NL
          + " * The ASF licenses this file to You under the Apache License, Version 2.0"
          + NL
          + " * (the \"License\"); you may not use this file except in compliance with"
          + NL
          + " * the License.  You may obtain a copy of the License at"
          + NL
          + " *"
          + NL
          + " *     http://www.apache.org/licenses/LICENSE-2.0"
          + NL
          + " *"
          + NL
          + " * Unless required by applicable law or agreed to in writing, software"
          + NL
          + " * distributed under the License is distributed on an \"AS IS\" BASIS,"
          + NL
          + " * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied."
          + NL
          + " * See the License for the specific language governing permissions and"
          + NL
          + " * limitations under the License."
          + NL
          + " */"
          + NL;

  private static final Pattern TLD_PATTERN_1 = Pattern.compile("([-A-Za-z0-9]+)\\.\\s+NS\\s+.*");
  private static final Pattern TLD_PATTERN_2 =
      Pattern.compile("([-A-Za-z0-9]+)\\.\\s+\\d+\\s+IN\\s+NS\\s+.*");
  private final URL tldFileURL;
  private long tldFileLastModified = -1L;
  private final Path outputFile;
  private final SortedMap<String, Boolean> processedTLDsLongestFirst =
      new TreeMap<>(
          Comparator.comparing(String::length).reversed().thenComparing(String::compareTo));
  private final List<SortedSet<String>> TLDsBySuffixLength =
      new ArrayList<>(); // list position indicates suffix length

  public GenerateJflexTLDMacros(String tldFileURL, String outputFile) throws Exception {
    this.tldFileURL = new URL(tldFileURL);
    this.outputFile = Paths.get(outputFile);
  }

  /**
   * Downloads the IANA Root Zone Database, extracts the ASCII TLDs, then writes a set of JFlex
   * macros accepting any of them case-insensitively out to the specified output file.
   *
   * @throws IOException if there is a problem either downloading the database or writing out the
   *     output file.
   */
  public void execute() throws IOException {
    getIANARootZoneDatabase();
    partitionTLDprefixesBySuffixLength();
    writeOutput();
    System.out.println("Wrote TLD macros to '" + outputFile + "':");
    int totalDomains = 0;
    for (int suffixLength = 0; suffixLength < TLDsBySuffixLength.size(); ++suffixLength) {
      int domainsAtThisSuffixLength = TLDsBySuffixLength.get(suffixLength).size();
      totalDomains += domainsAtThisSuffixLength;
      System.out.printf(
          Locale.ROOT, "%30s: %4d TLDs%n", getMacroName(suffixLength), domainsAtThisSuffixLength);
    }
    System.out.printf(Locale.ROOT, "%30s: %4d TLDs%n", "Total", totalDomains);
  }

  /**
   * Downloads the IANA Root Zone Database.
   *
   * @throws java.io.IOException if there is a problem downloading the database
   */
  private void getIANARootZoneDatabase() throws IOException {
    final URLConnection connection = tldFileURL.openConnection();
    connection.setUseCaches(false);
    connection.addRequestProperty("Cache-Control", "no-cache");
    connection.connect();
    tldFileLastModified = connection.getLastModified();
    try (BufferedReader reader =
        new BufferedReader(
            new InputStreamReader(connection.getInputStream(), StandardCharsets.US_ASCII))) {
      String line;
      while (null != (line = reader.readLine())) {
        Matcher matcher = TLD_PATTERN_1.matcher(line);
        if (matcher.matches()) {
          // System.out.println("Found: " + matcher.group(1).toLowerCase(Locale.ROOT));
          processedTLDsLongestFirst.put(matcher.group(1).toLowerCase(Locale.ROOT), Boolean.FALSE);
        } else {
          matcher = TLD_PATTERN_2.matcher(line);
          if (matcher.matches()) {
            // System.out.println("Found: " + matcher.group(1).toLowerCase(Locale.ROOT));
            processedTLDsLongestFirst.put(matcher.group(1).toLowerCase(Locale.ROOT), Boolean.FALSE);
          }
        }
      }
    }
    System.out.println(
        "Found "
            + processedTLDsLongestFirst.size()
            + " TLDs in IANA Root Zone Database at "
            + tldFileURL);
  }

  /**
   * Partition TLDs by whether they are prefixes of other TLDs and then by suffix length. We only
   * care about TLDs that are prefixes and are exactly one character shorter than another TLD. See
   * LUCENE-8278 and LUCENE-5391.
   */
  private void partitionTLDprefixesBySuffixLength() {
    TLDsBySuffixLength.add(new TreeSet<>()); // initialize set for zero-suffix TLDs
    for (SortedMap.Entry<String, Boolean> entry : processedTLDsLongestFirst.entrySet()) {
      String TLD = entry.getKey();
      if (entry.getValue()) {
        // System.out.println("Skipping already processed: " + TLD);
        continue;
      }
      // System.out.println("Adding zero-suffix TLD: " + TLD);
      TLDsBySuffixLength.get(0).add(TLD);
      for (int suffixLength = 1; (TLD.length() - suffixLength) >= 2; ++suffixLength) {
        String TLDprefix = TLD.substring(0, TLD.length() - suffixLength);
        if (false == processedTLDsLongestFirst.containsKey(TLDprefix)) {
          // System.out.println("Ignoring non-TLD prefix: " + TLDprefix);
          break; // shorter prefixes can be ignored
        }
        if (processedTLDsLongestFirst.get(TLDprefix)) {
          // System.out.println("Skipping already processed prefix: " + TLDprefix);
          break; // shorter prefixes have already been processed
        }

        processedTLDsLongestFirst.put(TLDprefix, true); // mark as processed
        if (TLDsBySuffixLength.size() == suffixLength) TLDsBySuffixLength.add(new TreeSet<>());
        SortedSet<String> TLDbucket = TLDsBySuffixLength.get(suffixLength);
        TLDbucket.add(TLDprefix);
        // System.out.println("Adding TLD prefix of " + TLD + " with suffix length " + suffixLength
        // + ": " + TLDprefix);
      }
    }
  }

  /**
   * Writes a file containing a JFlex macro that will accept any of the given TLDs
   * case-insensitively.
   */
  private void writeOutput() throws IOException {
    final DateFormat dateFormat =
        DateFormat.getDateTimeInstance(DateFormat.FULL, DateFormat.FULL, Locale.ROOT);
    dateFormat.setTimeZone(TimeZone.getTimeZone("UTC"));
    try (Writer writer =
        new OutputStreamWriter(Files.newOutputStream(outputFile), StandardCharsets.UTF_8)) {
      writer.write(APACHE_LICENSE);
      writer.write("// Generated from IANA Root Zone Database <");
      writer.write(tldFileURL.toString());
      writer.write(">");
      writer.write(NL);
      if (tldFileLastModified > 0L) {
        writer.write("// file version from ");
        writer.write(dateFormat.format(tldFileLastModified));
        writer.write(NL);
      }
      writer.write("// generated on ");
      writer.write(dateFormat.format(new Date()));
      writer.write(NL);
      writer.write("// by ");
      writer.write(this.getClass().getName());
      writer.write(NL);
      writer.write(NL);

      for (int i = 0; i < TLDsBySuffixLength.size(); ++i) {
        String macroName = getMacroName(i);
        writer.write("// LUCENE-8278: ");
        if (i == 0) {
          writer.write(
              "None of the TLDs in {"
                  + macroName
                  + "} is a 1-character-shorter prefix of another TLD");
        } else {
          writer.write("Each TLD in {" + macroName + "} is a prefix of another TLD by");
          writer.write(" " + i + " character");
          if (i > 1) {
            writer.write("s");
          }
        }
        writer.write(NL);
        writeTLDmacro(writer, macroName, TLDsBySuffixLength.get(i));
      }
    }
  }

  private String getMacroName(int suffixLength) {
    return "ASCIITLD" + (suffixLength > 0 ? "prefix_" + suffixLength + "CharSuffix" : "");
  }

  private void writeTLDmacro(Writer writer, String macroName, SortedSet<String> TLDs)
      throws IOException {
    writer.write(macroName);
    writer.write(" = \".\" (");
    writer.write(NL);

    boolean isFirst = true;
    for (String TLD : TLDs) {
      writer.write("\t");
      if (isFirst) {
        isFirst = false;
        writer.write("  ");
      } else {
        writer.write("| ");
      }
      writer.write(getCaseInsensitiveRegex(TLD));
      writer.write(NL);
    }
    writer.write("\t) \".\"?   // Accept trailing root (empty) domain");
    writer.write(NL);
    writer.write(NL);
  }

  /**
   * Returns a regex that will accept the given ASCII TLD case-insensitively.
   *
   * @param ASCIITLD The ASCII TLD to generate a regex for
   * @return a regex that will accept the given ASCII TLD case-insensitively
   */
  private String getCaseInsensitiveRegex(String ASCIITLD) {
    StringBuilder builder = new StringBuilder();
    for (int pos = 0; pos < ASCIITLD.length(); ++pos) {
      char ch = ASCIITLD.charAt(pos);
      if (Character.isDigit(ch) || ch == '-') {
        builder.append(ch);
      } else {
        builder.append("[").append(ch).append(Character.toUpperCase(ch)).append("]");
      }
    }
    return builder.toString();
  }
}
