#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
"""
Combine a set of Joshua configuration and resources into a portable
directory tree.
"""
from __future__ import print_function
import argparse
from collections import namedtuple
import logging
import os
import shutil
import signal
import stat
from subprocess import CalledProcessError, Popen, PIPE
import sys

# Sample invocation shown to the user: the argument parser's error handler
# prints this text right after the generated help message.
EXAMPLE = r"""
Example invocation:

$JOSHUA/scripts/language-pack/copy_model.py \
  --force \
  --verbose \
  --copy-config-options \
    '-top-n 1 -output-format %S -mark-oovs false' \
  /path/to/origin/directory/test/model/joshua.config \
  /path/to/destination/directory

Note: The options included in the value string for the --copy-config-options
argument can either be Joshua options or options for the
$JOSHUA/scripts/copy-config.pl script. 
"""

# Root of the Joshua installation, read from the JOSHUA environment variable.
# It is None when the variable is unset.
JOSHUA_PATH = os.environ.get('JOSHUA')

# Default preprocessing scripts offered by the --normalizer / --tokenizer
# options.  os.path.join() rejects None, so an unset JOSHUA falls back to ''
# here; otherwise the module would crash at import time, before the
# "JOSHUA must be defined" check in the script entry point could report the
# problem.
default_normalizer = os.path.join(JOSHUA_PATH or '', "scripts/preparation/normalize.pl")
default_tokenizer = os.path.join(JOSHUA_PATH or '', "scripts/preparation/tokenize.pl")

# A config line whose first token is one of these is treated as naming a path.
FILE_TYPE_TOKENS = ['lm', 'tm']
# Option flags whose following token is a path.
FILE_TYPE_OPTIONS = ['-path', '-lm_file']

# Name of the configuration file written at the top of the bundle.
OUTPUT_CONFIG_FILE_NAME = 'joshua.config'

def bundle_runner_text(mem):
    """
    Return the text of the bash launcher script placed in a bundle.

    The value of mem is substituted for the single %s placeholder, where
    it becomes the script's default Java memory setting (the generated
    script still lets the user override it with "-m").
    """
    template = """#!/bin/bash
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Joshua decoder invocation script.
# 
# This script takes care of passing arguments to Java and to the
# Joshua decoder. It changes to the current directory so that paths in 
# the config file are relative to the current directory. Usage:
#
# joshua [-m memory] [Joshua arguments]
#
# The default amount of memory is 4gb.

NUM_ARGS=0
E_OPTERROR=1

## memory usage; default is 4 GB
mem=%s

if [[ $1 == "-m" ]]; then
    mem=$2
    shift
    shift
fi

set -u

bundledir=$(dirname $0)

exec java -mx${mem} \\
    -Dfile.encoding=utf8 \\
    -Djava.library.path=$bundledir/lib \\
    -cp $bundledir/target/joshua-*-jar-with-dependencies.jar \\
    org.apache.joshua.decoder.JoshuaDecoder -c $bundledir/joshua.config -v 0 "$@"
"""
    # Only the "mem=" line carries a placeholder.
    return template % mem


# A config line split around its first '#': the text before it (config) and
# the text after it (comment).
LineParts = namedtuple('LineParts', 'config comment')


class PathException(Exception):
    """
    Error involving a specified path.

    The first constructor argument is also stored on a ``message``
    attribute.  Code in this file reads ``e.message``, which Python 2
    exceptions provide implicitly but Python 3 exceptions do not, so it
    is set explicitly to behave the same on both.
    """

    def __init__(self, *args):
        super(PathException, self).__init__(*args)
        self.message = args[0] if args else ''


class PackingError(Exception):
    """Raised to signal that packing a grammar failed."""


def error_quit(message):
    """
    Log message at ERROR level, then terminate the process with exit
    status 2.
    """
    logging.error(message)
    # Same effect as sys.exit(2), which raises SystemExit(2).
    raise SystemExit(2)


def extract_line_parts(line):
    """
    Split a config line at its first '#' character.

    Returns a LineParts whose config field is the text before the '#'
    and whose comment field is the text after it.  When the line has no
    '#', config is the whole line and comment is the empty string.
    """
    before_hash, _, after_hash = line.partition('#')
    return LineParts(before_hash, after_hash)


def filter_through_copy_config_script(config_text, copy_configs):
    """
    Run the config_text through the 'copy-config.pl' script, applying
    the copy_configs options.

    config_text is fed to the script's standard input and the script's
    standard output is returned as text.

    Raises CalledProcessError (carrying the exit status, the command and
    the captured stderr as its output) if the script exits with a
    nonzero status.  The command and stderr text are also logged at
    ERROR level, since the exception's own string only reports the
    command and exit status.
    """
    cmd = os.path.join(JOSHUA_PATH, "scripts/copy-config.pl") + ' ' + copy_configs
    logging.info(
        'Running the copy-config.pl script with the command: ' + cmd
    )
    # shell=True is kept: copy_configs is one operator-supplied option
    # string that is handed to the shell for word splitting.
    # stderr is piped so a failure can actually be reported, and
    # universal_newlines=True puts the pipes in text mode so that str can
    # be written and read on both Python 2 and Python 3.
    p = Popen(cmd, shell=True, stdin=PIPE, stdout=PIPE, stderr=PIPE,
              universal_newlines=True)
    result, err = p.communicate(config_text)
    if p.returncode != 0:
        logging.error(
            'Encountered an error running the copy-config.pl script.\n'
            '  command: %s\n'
            '  error: %s'
            % (cmd, err or '')
        )
        # CalledProcessError takes (returncode, cmd, output); passing just
        # a message string raises TypeError instead of the intended error.
        raise CalledProcessError(p.returncode, cmd, err)
    if err:
        # Keep warnings from a successful run visible, as they were when
        # the script inherited this process's stderr.
        sys.stderr.write(err)
    return result


def line_specifies_path(line):
    """
    Tell whether a joshua.config line names a file or directory path.

    Only the part of the line before any '#' is examined.  The line
    counts as specifying a path when its first token is one of
    FILE_TYPE_TOKENS, or when any of its tokens is one of
    FILE_TYPE_OPTIONS.

    >>> line_specifies_path('tm = thrax glue -1 1/data/tune/grammar.glue')
    True
    >>> line_specifies_path('tm = moses -owner pt -maxspan 0 -path phrase-table.packed -max-source-len 5')
    True
    >>> line_specifies_path('tm = moses pt 0 phrase-table.packed')
    True
    >>> line_specifies_path('feature-function = WordPenalty')
    False
    >>> line_specifies_path('feature_function = Distortion')
    False
    >>> line_specifies_path('feature-function = StateMinimizingLanguageModel -lm_type kenlm -lm_order 5 -lm_file expts/systems/es-en/1/lm.kenlm')
    True
    >>> line_specifies_path('# Foo')
    False
    """
    tokens = extract_line_parts(line).config.split()

    # Blank lines and comment-only lines have no tokens at all.
    if not tokens:
        return False

    leading_keyword_matches = tokens[0] in FILE_TYPE_TOKENS
    has_path_option = any(option in tokens for option in FILE_TYPE_OPTIONS)
    return leading_keyword_matches or has_path_option


def validate_path(path):
    """
    Check that the specified path exists.

    Returns None when it does; raises PathException naming the path
    when it does not.
    """
    if os.path.exists(path):
        return

    message = 'The path "%s" does not exist. Cannot proceed.' % path
    raise PathException(message)


def parse_path(config_line):
    """
    Given a Joshua config line with no comments, return a path specified
    by the config.

    If the line contains one of the FILE_TYPE_OPTIONS flags, the token
    right after the first such flag (in FILE_TYPE_OPTIONS order) is the
    path.  Otherwise the final token of the line is assumed to be the
    path.

    Raises PathException if the line has no tokens, or if the path
    option is the last token and so has no value.  Both cases would
    otherwise surface as a bare IndexError.

    >>> parse_path('tm = moses -owner pt -maxspan 0 -path phrase-table.packed -max-source-len 5')
    'phrase-table.packed'
    >>> parse_path('tm = moses pt 0 phrase-table.packed')
    'phrase-table.packed'
    """
    config_tokens = config_line.split()
    if not config_tokens:
        raise PathException(
            'No path could be found in an empty configuration line.'
        )

    for path_opt in FILE_TYPE_OPTIONS:
        if path_opt in config_tokens:
            path_index = config_tokens.index(path_opt) + 1
            if path_index >= len(config_tokens):
                raise PathException(
                    'The option "%s" is not followed by a path in the '
                    'line: %s' % (path_opt, config_line.strip())
                )
            return config_tokens[path_index]

    # No explicit path option: the path is the final token.
    return config_tokens[-1]


# How many times each file/dir name has been passed to get_unique_dest().
duplicate_name_counts = {}


def get_unique_dest(name):
    """
    Return a destination name for name that has not been handed out
    before.

    The first request for a name returns it unchanged.  Each later
    request for the same name inserts the running count before the
    extension, e.g. 'lm.kenlm' -> 'lm.2.kenlm' -> 'lm.3.kenlm'.  The
    counts are kept in the module-level duplicate_name_counts dict.
    """
    occurrences = duplicate_name_counts.get(name, 0) + 1
    duplicate_name_counts[name] = occurrences

    if occurrences <= 1:
        return name

    stem, suffix = os.path.splitext(name)
    return "{0}.{1}{2}".format(stem, occurrences, suffix)


def recursive_copy(src, dest, symlink = False):
    """
    Copy the src file or recursively copy the directory rooted at src to
    dest.

    If symlink is true, nothing is copied; dest is created as a symbolic
    link to src instead.  Directory copies preserve symbolic links found
    inside the tree rather than following them.
    """
    if symlink:
        # A relative link target is resolved relative to the directory
        # holding the link, not the current working directory, so a
        # relative src would produce a dangling link.  Link to the
        # absolute location instead.
        os.symlink(os.path.abspath(src), dest)
    elif os.path.isdir(src):
        shutil.copytree(src, dest, symlinks=True)
    else:
        shutil.copy(src, dest)


def process_line_containing_path(line, dest_dir, symlink, absolute):
    """
    The line has already been determined to contain a path, so generate
    an operation tuple for copying (or symlinking) that path into the
    bundle, and rewrite the config line to point at the bundled copy.

    Arguments:
      line      the joshua.config line, possibly with a trailing comment
      dest_dir  root directory of the bundle; files go in its 'model'
                subdirectory
      symlink   passed through to recursive_copy in the operation tuple
      absolute  if true, the rewritten line uses the absolute path of
                the bundled file; otherwise it uses 'model/<name>'

    Returns a (line, operation) pair, where operation is
      (recursive_copy, (source, destination, symlink), 'logging message')

    Raises PathException if the source path does not exist.
    """
    #####################
    # Get the source path

    logging.debug('Looking for a path in the line:\n    %s' % line)
    line_parts = extract_line_parts(line)

    src_path = parse_path(line_parts.config)
    logging.debug('* Found path "%s"' % src_path)

    # Normalizing tidies the path (e.g. './lm' -> 'lm', 'dir/' -> 'dir');
    # it does not make a relative path absolute.
    full_src_path = os.path.normpath(src_path)

    #####################################
    # Determine a unique destination path

    # Take the name from the normalized path: the basename of a path with
    # a trailing slash ('grammar.packed/') would otherwise be empty.
    src_name = os.path.basename(full_src_path)
    dest_name = get_unique_dest(src_name)

    ###############################################################
    # Generate an operation tuple to copy the source into dest_dir

    validate_path(full_src_path)

    dest_path = os.path.join(dest_dir, 'model', dest_name)
    operation = (
        recursive_copy, (full_src_path, dest_path, symlink),
        'Making a copy of {0} at {1}'.format(full_src_path, dest_path)
    )

    ########################
    # Update the config line
    if absolute:
        # dest_dir may itself be relative, so resolve it; otherwise the
        # "absolute" path written to the config would still be relative.
        config_path = os.path.abspath(dest_path)
    else:
        config_path = os.path.join('model', dest_name)

    # NOTE(review): str.replace rewrites every occurrence of src_path in
    # the config text, not only the path token -- confirm no config uses
    # a path that also appears inside another token.
    updated_config = line_parts.config.replace(src_path, config_path)
    if line_parts.comment:
        line = '#'.join([updated_config, line_parts.comment])
    else:
        line = updated_config

    return line, operation


class _PackGrammarPath(str):
    """
    A str subclass used as a marker while parsing command-line
    arguments: it tags a grammar path that is to be packed, as opposed
    to one that is simply copied.
    """


def handle_args(clargs):
    """
    Parse the command-line arguments given in clargs and return the
    resulting argparse namespace.

    On a usage error the message is logged, the help text and the
    EXAMPLE invocation are printed, and the process exits with status 2.
    """
    class BundleArgumentParser(argparse.ArgumentParser):
        """ArgumentParser that reports errors with full help and an example."""
        def error(self, message):
            logging.error('ERROR: %s\n' % message)
            self.print_help()
            print(EXAMPLE)
            sys.exit(2)

    # Each entry is (names or flags, add_argument keyword settings), listed
    # in the order the arguments are registered with the parser.
    argument_specs = [
        (('config',), {
            'type': argparse.FileType('r'),
            'help': 'path to the origin configuration file. e.g. '
                    '/path/to/tune/dir/joshua.config.final',
        }),
        (('dest_dir',), {
            'help': 'destination directory, which should not already exist. But if '
                    'it does, it will be removed if -f is used.',
        }),
        (('-f', '--force'), {
            'action': 'store_true',
            'help': 'extant destination directory will be overwritten',
        }),
        (('-o', '--copy-config-options'), {
            'default': '-top-n 0 -output-format %S -mark-oovs false',
            'help': 'optional additional or replacement configuration options for '
                    'Joshua, all surrounded by one pair of quotes. Defaults to '
                    ' \'-top-n 0 -output-format %%S -mark-oovs false\'',
        }),
        (('-m', '--mem'), {
            'default': '4g',
            'help': 'default amount of memory for Joshua. Defaults to 4g',
        }),
        (('-v', '--verbose'), {
            'action': 'store_true',
            'help': 'print informational messages',
        }),
        (('--no-comments',), {
            'dest': 'suppress_comments',
            'action': 'store_true',
            'help': 'delete comments and multiple consecutive empty lines',
        }),
        (('--symlink',), {
            'dest': 'symlink',
            'action': 'store_true',
            'help': 'symlink (where possible) to TM and LM files, instead of copying them',
        }),
        (('--absolute',), {
            'dest': 'absolute',
            'action': 'store_true',
            'default': False,
            'help': 'Use absolute instead of relative paths for model file locations',
        }),
        (('--source',), {
            'dest': 'source',
            'help': 'Source language two-character code (ISO 639-1)',
        }),
        (('--normalizer',), {
            'default': default_normalizer,
            'help': 'source sentence normalizer that was applied to the model',
        }),
        (('--tokenizer',), {
            'default': default_tokenizer,
            'help': 'source sentence tokenizer that was applied to the model',
        }),
        (('-T',), {
            'dest': 'tmpdir',
            'default': '/tmp',
            'help': 'temp directory',
        }),
    ]

    parser = BundleArgumentParser(
        description='create a Joshua configuration bundle from '
                    'an existing configuration and set of files'
    )
    for names, settings in argument_specs:
        parser.add_argument(*names, **settings)

    return parser.parse_args(clargs)


def write_string_to_file(path, text):
    """
    Create (or overwrite) the file at path so that it contains exactly
    the given text.
    """
    with open(path, 'w') as out_file:
        out_file.write(text)


def collect_operations(opts):
    """
    Produce a list of operations to take.

    Nothing is written to the destination while collecting; the
    returned list is meant to be run afterwards.  The configuration
    file is read here, and filtered through copy-config.pl when
    opts.copy_config_options is non-empty.

    Each element in the operations list is in the format:
      (function, (arguments,), 'logging message')

    Raises Exception if the destination directory exists and
    opts.force is not set, and PathException (prefixed with the config
    file name and line number) if a config line names a path that
    cannot be used.
    """
    operations = []

    #######################
    # Destination directory
    if os.path.exists(opts.dest_dir):
        if not opts.force:
            raise Exception(
                'ERROR: The destination directory exists: "%s"\n'
                'Use -f or --force option to overwrite the directory.'
                % opts.dest_dir
            )
        operations.append(
            (shutil.rmtree, (opts.dest_dir,),
             'Forcing deletion of existing destination directory "%s"'
             % opts.dest_dir)
        )

    # Creating the 'model' subdirectory also creates dest_dir itself.
    operations.append(
        (os.makedirs, (os.path.join(opts.dest_dir, 'model'),),
         'Creating destination directory "%s"' % opts.dest_dir)
    )

    ##########################
    # Input joshua.config file
    config_text = opts.config.read()
    if opts.copy_config_options:
        config_text = filter_through_copy_config_script(
            config_text,
            opts.copy_config_options
        )

    ###############
    # Files to copy
    # Parse the joshua.config and collect copy operations
    result_config_lines = []
    for line_num, line in enumerate(config_text.split('\n'), 1):
        if line_specifies_path(line):
            try:
                line, operation = process_line_containing_path(
                    line, opts.dest_dir, opts.symlink, opts.absolute
                )
            except PathException as e:
                # Prepend the file name and line number to the error.
                # str(e) is used because exceptions have no 'message'
                # attribute on Python 3.
                message = (
                    'ERROR: Configuration file "{0}" line {1}: {2}'
                    .format(opts.config.name, line_num, str(e))
                )
                located_error = PathException(message)
                # Set explicitly so handlers that read e.message work on
                # both Python 2 and Python 3.
                located_error.message = message
                raise located_error
            operations.append(operation)
        result_config_lines.append(line)

    ###########################
    # Output joshua.config file
    # Create the Joshua configuration file for the package
    config_path = os.path.join(opts.dest_dir, OUTPUT_CONFIG_FILE_NAME)
    text = '\n'.join(result_config_lines) + '\n'
    operations.append(
        (write_string_to_file, (config_path, text),
         'Writing the updated joshua.config to %s' % config_path)
    )

    ######################
    # Bundle runner script
    # Write the script that runs Joshua using the configuration and
    # resources in the bundle, then make it readable and executable
    # (but not writable) for user, group and others.
    runner_path = os.path.join(opts.dest_dir, 'joshua')
    operations.append(
        (write_string_to_file, (runner_path, bundle_runner_text(opts.mem)),
         'Writing the bundle runner file "%s"' % runner_path)
    )
    mode = (stat.S_IREAD | stat.S_IRGRP | stat.S_IROTH |
            stat.S_IEXEC | stat.S_IXGRP | stat.S_IXOTH)
    operations.append(
        (os.chmod, (runner_path, mode),
         'Making the bundle runner file executable')
    )

    return operations


def execute_operations(operations):
    """
    Run each operation in order.

    Every operation is a (function, arguments, message) triple: the
    message is logged at INFO level, then the function is called with
    the unpacked arguments.
    """
    for operation in operations:
        action, arguments, description = operation
        logging.info(description)
        action(*arguments)


def main(argv):
    """
    Entry point: parse argv (including the program name at index 0),
    configure logging, then collect and execute the bundling operations.

    The parsed options are also stored in the module-level global opts.
    Any exception raised while collecting or executing operations is
    logged and ends the process through error_quit.
    """
    global opts
    opts = handle_args(argv[1:])

    logging.basicConfig(
        level=logging.DEBUG if opts.verbose else logging.WARNING,
        format='* %(message)s'
    )

    try:
        operations = collect_operations(opts)
        execute_operations(operations)
    except Exception as e:
        # Exceptions have no 'message' attribute on Python 3, and on
        # Python 2 it is empty for exceptions built from several
        # arguments, so fall back to the exception's string form.
        error_quit(getattr(e, 'message', None) or str(e))


if __name__ == "__main__":
    # Checked explicitly rather than with assert: assert statements are
    # removed when Python runs with -O, which would skip this validation.
    if not JOSHUA_PATH:
        error_quit('ERROR: The JOSHUA environment variable must be defined.')

    main(sys.argv)
