/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package opennlp.tools.coref.resolver;

import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Set;
import java.util.regex.Pattern;

import opennlp.tools.coref.DiscourseEntity;
import opennlp.tools.coref.mention.MentionContext;
import opennlp.tools.coref.mention.Parse;
import opennlp.tools.coref.sim.GenderEnum;
import opennlp.tools.coref.sim.NumberEnum;
import opennlp.tools.coref.sim.TestSimilarityModel;

/**
 * This class provides a set of utilities for turning mentions into normalized strings and features.
 * <p>
 * All members are static. The pronoun, honorific and designator patterns are written for English
 * text, so every lower-casing operation in this class uses {@link Locale#ENGLISH} rather than the
 * JVM default locale; this keeps the generated feature strings identical on every platform.
 */
public class ResolverUtils {

  private static final Pattern ENDS_WITH_PERIOD = Pattern.compile("\\.$");
  private static final Pattern INITIAL_CAPS = Pattern.compile("^[A-Z]");

  /** Regular expression for English singular third person pronouns. */
  public static final Pattern singularThirdPersonPronounPattern = Pattern.compile(
      "^(he|she|it|him|her|his|hers|its|himself|herself|itself)$", Pattern.CASE_INSENSITIVE);
  /** Regular expression for English plural third person pronouns. */
  public static final Pattern pluralThirdPersonPronounPattern = Pattern.compile(
      "^(they|their|theirs|them|themselves)$", Pattern.CASE_INSENSITIVE);
  /**
   * Regular expression for English speech pronouns.
   * The alternation previously listed "you" twice; the duplicate is replaced by "yours".
   */
  public static final Pattern speechPronounPattern = Pattern.compile(
      "^(I|me|my|you|your|yours|we|us|our|ours)$", Pattern.CASE_INSENSITIVE);
  /** Regular expression for English female pronouns. */
  public static final Pattern femalePronounPattern = Pattern.compile(
      "^(she|her|hers|herself)$", Pattern.CASE_INSENSITIVE);
  /** Regular expression for English neuter pronouns. */
  public static final Pattern neuterPronounPattern = Pattern.compile(
      "^(it|its|itself)$", Pattern.CASE_INSENSITIVE);
  /** Regular expression for English first person pronouns. */
  public static final Pattern firstPersonPronounPattern = Pattern.compile(
      "^(I|me|my|we|our|us|ours)$", Pattern.CASE_INSENSITIVE);
  /** Regular expression for English second person pronouns. */
  public static final Pattern secondPersonPronounPattern = Pattern.compile(
      "^(you|your|yours)$", Pattern.CASE_INSENSITIVE);
  /** Regular expression for English third person pronouns. */
  public static final Pattern thirdPersonPronounPattern = Pattern.compile(
      "^(he|she|it|him|her|his|hers|its|himself|herself|itself|they|their|theirs|them|themselves)$",
      Pattern.CASE_INSENSITIVE);
  /** Regular expression for English singular pronouns. */
  public static final Pattern singularPronounPattern = Pattern.compile(
      "^(I|me|my|he|she|it|him|her|his|hers|its|himself|herself|itself)$", Pattern.CASE_INSENSITIVE);
  /** Regular expression for English plural pronouns. */
  public static final Pattern pluralPronounPattern = Pattern.compile(
      "^(we|us|our|ours|they|their|theirs|them|themselves)$", Pattern.CASE_INSENSITIVE);
  /** Regular expression for English male pronouns. */
  public static final Pattern malePronounPattern = Pattern.compile(
      "^(he|him|his|himself)$", Pattern.CASE_INSENSITIVE);
  /** Regular expression for English honorifics. */
  public static final Pattern honorificsPattern = Pattern.compile(
      "[A-Z][a-z]+\\.$|^[A-Z][b-df-hj-np-tv-xz]+$");
  /** Regular expression for English corporate designators. */
  public static final Pattern designatorsPattern = Pattern.compile(
      "[a-z]\\.$|^[A-Z][b-df-hj-np-tv-xz]+$|^Co(rp)?$");

  private static final String NUM_COMPATIBLE = "num.compatible";
  private static final String NUM_INCOMPATIBLE = "num.incompatible";
  private static final String NUM_UNKNOWN = "num.unknown";

  private static final String GEN_COMPATIBLE = "gen.compatible";
  private static final String GEN_INCOMPATIBLE = "gen.incompatible";
  private static final String GEN_UNKNOWN = "gen.unknown";

  private static final String SIM_COMPATIBLE = "sim.compatible";
  private static final String SIM_INCOMPATIBLE = "sim.incompatible";
  private static final String SIM_UNKNOWN = "sim.unknown";

  /** Similarity score above which a mention and an entity are reported as semantically compatible. */
  private static final double MIN_SIM_PROB = 0.60;

  /**
   * Returns a list of features based on the surrounding context of the specified mention.
   * <p>
   * Two features (syntactic type and surface form) are produced for each of the previous token,
   * the next token and the next basal token. A missing previous token is reported as {@code BOS},
   * a missing next token as {@code EOS}.
   *
   * @param mention The mention whose surrounding context the features model.
   * @return a list of features based on the surrounding context of the specified mention
   */
  public static List<String> getContextFeatures(MentionContext mention) {
    List<String> features = new ArrayList<String>();
    addTokenFeatures(features, mention.getPreviousToken(), "pt=", "pw=", "BOS");
    addTokenFeatures(features, mention.getNextToken(), "nt=", "nw=", "EOS");
    addTokenFeatures(features, mention.getNextTokenBasal(), "bnt=", "bnw=", "EOS");
    return features;
  }

  /**
   * Appends the type and word feature for a context token, or the boundary marker for both
   * features when the token is absent.
   */
  private static void addTokenFeatures(List<String> features, Parse token, String typeKey,
      String wordKey, String boundary) {
    if (token != null) {
      features.add(typeKey + token.getSyntacticType());
      features.add(wordKey + token.toString());
    }
    else {
      features.add(typeKey + boundary);
      features.add(wordKey + boundary);
    }
  }

  /**
   * Returns a list of word features for the specified token.
   * <p>
   * The first feature combines the lower-cased word with its tag, the second contains the tag
   * only. Both carry the suffix {@code ,endWithPeriod} when the word ends with a period.
   *
   * @param token The token for which features are to be computed.
   * @return a list of word features for the specified token.
   */
  public static List<String> getWordFeatures(Parse token) {
    String word = token.toString().toLowerCase(Locale.ENGLISH);
    String periodSuffix = ENDS_WITH_PERIOD.matcher(word).find() ? ",endWithPeriod" : "";
    String tokTag = token.getSyntacticType();
    List<String> wordFeatures = new ArrayList<String>();
    wordFeatures.add("w=" + word + ",t=" + tokTag + periodSuffix);
    wordFeatures.add("t=" + tokTag + periodSuffix);
    return wordFeatures;
  }

  /**
   * Returns the set of lower-cased tokens which precede the head token.
   *
   * @param tokens The tokens of a mention.
   * @param headIndex The index of the head token; only tokens before this index are collected.
   * @return the set of lower-cased modifier tokens.
   */
  public static Set<String> constructModifierSet(Parse[] tokens, int headIndex) {
    Set<String> modSet = new HashSet<String>();
    for (int ti = 0; ti < headIndex; ti++) {
      modSet.add(tokens[ti].toString().toLowerCase(Locale.ENGLISH));
    }
    return modSet;
  }

  /**
   * Returns the space-separated tokens of the specified mention with every token tagged
   * {@code DT} removed.
   *
   * @param ec The mention.
   * @return the mention string without determiners.
   */
  public static String excludedDeterminerMentionString(MentionContext ec) {
    StringBuilder sb = new StringBuilder();
    boolean first = true;
    Parse[] mtokens = ec.getTokenParses();
    for (int ti = 0; ti < mtokens.length; ti++) {
      Parse token = mtokens[ti];
      if (token.getSyntacticType().equals("DT")) {
        continue;
      }
      if (!first) {
        sb.append(" ");
      }
      sb.append(token.toString());
      first = false;
    }
    return sb.toString();
  }

  /**
   * Returns the space-separated tokens of the specified mention with every token which fully
   * matches {@link #honorificsPattern} removed.
   *
   * @param ec The mention.
   * @return the mention string without honorifics.
   */
  public static String excludedHonorificMentionString(MentionContext ec) {
    StringBuilder sb = new StringBuilder();
    boolean first = true;
    Object[] mtokens = ec.getTokens();
    for (int ti = 0; ti < mtokens.length; ti++) {
      String token = mtokens[ti].toString();
      if (honorificsPattern.matcher(token).matches()) {
        continue;
      }
      if (!first) {
        sb.append(" ");
      }
      sb.append(token);
      first = false;
    }
    return sb.toString();
  }

  /**
   * Returns the space-separated tokens of the specified mention with the tokens "the", "The"
   * and "THE" removed.
   *
   * @param ec The mention.
   * @return the mention string without the article "the".
   */
  public static String excludedTheMentionString(MentionContext ec) {
    StringBuilder sb = new StringBuilder();
    boolean first = true;
    Object[] mtokens = ec.getTokens();
    for (int ti = 0; ti < mtokens.length; ti++) {
      String token = mtokens[ti].toString();
      if (token.equals("the") || token.equals("The") || token.equals("THE")) {
        continue;
      }
      if (!first) {
        sb.append(" ");
      }
      sb.append(token);
      first = false;
    }
    return sb.toString();
  }

  /**
   * Returns a feature naming the strictest form of string equality which holds between the two
   * mentions. The comparisons are tried in this order: full string, string without honorifics,
   * string without "the", string without determiners.
   *
   * @param ec The first mention.
   * @param xec The second mention.
   * @return the name of the first comparison which succeeds, or {@code null} if none does.
   */
  public static String getExactMatchFeature(MentionContext ec, MentionContext xec) {
    if (mentionString(ec).equals(mentionString(xec))) {
      return "exactMatch";
    }
    if (excludedHonorificMentionString(ec).equals(excludedHonorificMentionString(xec))) {
      return "exactMatchNoHonor";
    }
    if (excludedTheMentionString(ec).equals(excludedTheMentionString(xec))) {
      return "exactMatchNoThe";
    }
    if (excludedDeterminerMentionString(ec).equals(excludedDeterminerMentionString(xec))) {
      return "exactMatchNoDT";
    }
    return null;
  }

  /**
   * Returns string-match features for the specified mention and entity.
   * <p>
   * Every mention of the entity is compared with the specified mention. Per-mention features
   * (exact match, coordination mix, substring, shared head word, mismatching modifier words) are
   * collected in a set so that each is emitted once; the summary features {@code sameHead},
   * {@code modsMatch}/{@code nonTheModsMatch}/{@code modsMisMatch} and {@code titleMatch} are
   * appended afterwards.
   *
   * @param mention The mention.
   * @param entity The entity.
   * @return list of string-match features for the specified mention and entity.
   */
  public static List<String> getStringMatchFeatures(MentionContext mention, DiscourseEntity entity) {
    boolean sameHead = false;
    boolean modsMatch = false;
    boolean titleMatch = false;
    boolean nonTheModsMatch = false;
    Parse[] mtokens = mention.getTokenParses();
    Set<String> ecModSet = constructModifierSet(mtokens, mention.getHeadTokenIndex());
    String mentionHeadString = mention.getHeadTokenText().toLowerCase(Locale.ENGLISH);
    Set<String> featureSet = new HashSet<String>();
    for (Iterator<MentionContext> ei = entity.getMentions(); ei.hasNext();) {
      MentionContext entityMention = ei.next();
      String exactMatchFeature = getExactMatchFeature(entityMention, mention);
      if (exactMatchFeature != null) {
        featureSet.add(exactMatchFeature);
      }
      else if (entityMention.getParse().isCoordinatedNounPhrase()
          && !mention.getParse().isCoordinatedNounPhrase()) {
        featureSet.add("cmix");
      }
      else {
        String mentionStrip = stripNp(mention);
        String entityMentionStrip = stripNp(entityMention);
        if (mentionStrip != null && entityMentionStrip != null
            && isSubstring(mentionStrip, entityMentionStrip)) {
          featureSet.add("substring");
        }
      }
      Parse[] xtoks = entityMention.getTokenParses();
      int headIndex = entityMention.getHeadTokenIndex();
      // Head tags are deliberately not compared here, so that e.g. an NN head can match an NNP head.
      String entityMentionHeadString = entityMention.getHeadTokenText().toLowerCase(Locale.ENGLISH);
      // model lexical similarity
      if (mentionHeadString.equals(entityMentionHeadString)) {
        sameHead = true;
        featureSet.add("hds=" + mentionHeadString);
        // only check modifiers if no earlier entity mention already matched all of them
        if (!modsMatch || !nonTheModsMatch) {
          modsMatch = true;
          nonTheModsMatch = true;
          Set<String> entityMentionModifierSet = constructModifierSet(xtoks, headIndex);
          for (String mw : ecModSet) {
            if (!entityMentionModifierSet.contains(mw)) {
              modsMatch = false;
              if (!mw.equals("the")) {
                nonTheModsMatch = false;
                featureSet.add("mmw=" + mw);
              }
            }
          }
        }
      }
      // the mention's head occurring among the entity mention's descriptor tokens counts as a title match
      Set<String> descModSet = constructModifierSet(xtoks, entityMention.getNonDescriptorStart());
      if (descModSet.contains(mentionHeadString)) {
        titleMatch = true;
      }
    }
    List<String> features = new ArrayList<String>(featureSet);
    if (sameHead) {
      features.add("sameHead");
      if (modsMatch) {
        features.add("modsMatch");
      }
      else if (nonTheModsMatch) {
        features.add("nonTheModsMatch");
      }
      else {
        features.add("modsMisMatch");
      }
    }
    if (titleMatch) {
      features.add("titleMatch");
    }
    return features;
  }

  /**
   * Returns whether the first string occurs in the second string at space-delimited boundaries.
   * Only the first occurrence found by {@link String#indexOf(String)} is examined.
   *
   * @param ecStrip The string to look for.
   * @param xecStrip The string to search in.
   * @return true if the first occurrence of {@code ecStrip} in {@code xecStrip} is preceded by
   *     the start of the string or a space, and followed by the end of the string or a space.
   */
  public static boolean isSubstring(String ecStrip, String xecStrip) {
    int io = xecStrip.indexOf(ecStrip);
    if (io == -1) {
      return false;
    }
    // check boundaries
    boolean startsOnBoundary = io == 0 || xecStrip.charAt(io - 1) == ' ';
    int end = io + ecStrip.length();
    boolean endsOnBoundary = end == xecStrip.length() || xecStrip.charAt(end) == ' ';
    return startsOnBoundary && endsOnBoundary;
  }

  /**
   * Returns the tokens of the specified mention joined by single spaces.
   *
   * @param ec The mention.
   * @return the mention string; the empty string if the mention has no tokens.
   */
  public static String mentionString(MentionContext ec) {
    Object[] mtokens = ec.getTokens();
    StringBuilder sb = new StringBuilder();
    for (int ti = 0; ti < mtokens.length; ti++) {
      if (ti > 0) {
        sb.append(" ");
      }
      sb.append(mtokens[ti].toString());
    }
    return sb.toString();
  }

  /**
   * Returns a string for the specified mention with punctuation, honorifics,
   * designators, and determiners removed.
   *
   * @param mention The mention to be stripped.
   *
   * @return a normalized string representation of the specified mention, or {@code null}
   *     if no tokens remain after stripping.
   */
  public static String stripNp(MentionContext mention) {
    int start = mention.getNonDescriptorStart(); //start after descriptors
    Parse[] mtokens = mention.getTokenParses();
    int end = mention.getHeadTokenIndex() + 1;
    if (start == end) {
      return null;
    }
    //strip determiners
    if (mtokens[start].getSyntacticType().equals("DT")) {
      start++;
    }
    if (start == end) {
      return null;
    }
    //get to first NNP
    while (start < end && !mtokens[start].getSyntacticType().startsWith("NNP")) {
      start++;
    }
    if (start == end) {
      return null;
    }
    if (start + 1 != end) { // don't do this on head words, to keep "U.S."
      //strip off honorifics in beginning
      if (honorificsPattern.matcher(mtokens[start].toString()).find()) {
        start++;
      }
      if (start == end) {
        return null;
      }
      //strip off designators on the end
      // NOTE(review): the pattern is tested on the last token of the mention, while 'end' is
      // relative to the head token; these differ when the head is not the last token - confirm.
      if (designatorsPattern.matcher(mtokens[mtokens.length - 1].toString()).find()) {
        end--;
      }
    }
    if (start == end) {
      return null;
    }
    StringBuilder strip = new StringBuilder();
    for (int i = start; i < end; i++) {
      strip.append(mtokens[i].toString()).append(' ');
    }
    return strip.toString().trim();
  }

  /**
   * Returns the first mention of the specified entity whose head token is tagged as a proper
   * noun ({@code NNP*}) or whose head token text starts with an upper-case ASCII letter.
   *
   * @param de The entity.
   * @return the first such mention, or {@code null} if the entity has none.
   */
  public static MentionContext getProperNounExtent(DiscourseEntity de) {
    for (Iterator<MentionContext> ei = de.getMentions(); ei.hasNext();) {
      MentionContext xec = ei.next();
      boolean properTag = xec.getHeadTokenTag().startsWith("NNP");
      if (properTag || INITIAL_CAPS.matcher(xec.getHeadTokenText()).find()) {
        return xec;
      }
    }
    return null;
  }

  /**
   * Returns a map describing the specified pronoun. The key "gender" maps to "male", "female"
   * or "neuter" and the key "number" maps to "singular" or "plural"; a key is absent when the
   * pronoun matches none of the corresponding patterns. Person is not recorded.
   */
  private static Map<String, String> getPronounFeatureMap(String pronoun) {
    Map<String, String> pronounMap = new HashMap<String, String>();
    if (malePronounPattern.matcher(pronoun).matches()) {
      pronounMap.put("gender", "male");
    }
    else if (femalePronounPattern.matcher(pronoun).matches()) {
      pronounMap.put("gender", "female");
    }
    else if (neuterPronounPattern.matcher(pronoun).matches()) {
      pronounMap.put("gender", "neuter");
    }
    if (singularPronounPattern.matcher(pronoun).matches()) {
      pronounMap.put("number", "singular");
    }
    else if (pluralPronounPattern.matcher(pronoun).matches()) {
      pronounMap.put("number", "plural");
    }
    return pronounMap;
  }

  /**
   * Returns features indicating whether the specified mention is compatible with the pronouns
   * of the specified entity.
   * <p>
   * Features are only produced when the head of the mention is tagged {@code PRP*}. A pronoun of
   * the entity is compatible when it equals the mention's head (ignoring case) or when every
   * gender/number attribute of the mention's pronoun is present with the same value on it; it is
   * incompatible when any shared attribute has a different value. The scan stops at the first
   * identical pronoun.
   *
   * @param mention The mention.
   * @param entity The entity.
   * @return list of features indicating whether the specified mention is compatible with the pronouns
   * of the specified entity.
   */
  public static List<String> getPronounMatchFeatures(MentionContext mention, DiscourseEntity entity) {
    boolean foundCompatiblePronoun = false;
    boolean foundIncompatiblePronoun = false;
    if (mention.getHeadTokenTag().startsWith("PRP")) {
      String mentionPronoun = mention.getHeadTokenText();
      Map<String, String> pronounMap = getPronounFeatureMap(mentionPronoun);
      for (Iterator<MentionContext> mi = entity.getMentions(); mi.hasNext();) {
        MentionContext candidateMention = mi.next();
        if (!candidateMention.getHeadTokenTag().startsWith("PRP")) {
          continue;
        }
        String candidatePronoun = candidateMention.getHeadTokenText();
        if (mentionPronoun.equalsIgnoreCase(candidatePronoun)) {
          foundCompatiblePronoun = true;
          break;
        }
        Map<String, String> candidatePronounMap = getPronounFeatureMap(candidatePronoun);
        boolean allKeysMatch = true;
        for (Map.Entry<String, String> attribute : pronounMap.entrySet()) {
          String cfv = candidatePronounMap.get(attribute.getKey());
          if (cfv == null) {
            // attribute unknown for the candidate: neither compatible nor incompatible
            allKeysMatch = false;
          }
          else if (!attribute.getValue().equals(cfv)) {
            foundIncompatiblePronoun = true;
            allKeysMatch = false;
          }
        }
        if (allKeysMatch) {
          foundCompatiblePronoun = true;
        }
      }
    }
    List<String> pronounFeatures = new ArrayList<String>();
    if (foundCompatiblePronoun) {
      pronounFeatures.add("compatiblePronoun");
    }
    if (foundIncompatiblePronoun) {
      pronounFeatures.add("incompatiblePronoun");
    }
    return pronounFeatures;
  }

  /**
   * Returns distance features for the specified mention and entity.
   * <p>
   * Distances are measured against the entity's last extent: {@code de} is the difference of the
   * noun phrase document indexes, {@code ds} the difference of the sentence numbers and
   * {@code hd} a Hobbs-style distance derived from the two.
   *
   * @param mention The mention.
   * @param entity The entity.
   * @return list of distance features for the specified mention and entity.
   */
  public static List<String> getDistanceFeatures(MentionContext mention, DiscourseEntity entity) {
    MentionContext cec = entity.getLastExtent();
    int entityDistance = mention.getNounPhraseDocumentIndex() - cec.getNounPhraseDocumentIndex();
    int sentenceDistance = mention.getSentenceNumber() - cec.getSentenceNumber();
    int hobbsEntityDistance;
    if (sentenceDistance == 0) {
      hobbsEntityDistance = cec.getNounPhraseSentenceIndex();
    }
    else {
      // entityDistance - (noun phrases after the extent in its sentence)
      //                + (noun phrases before the extent in its sentence)
      hobbsEntityDistance = entityDistance + (2 * cec.getNounPhraseSentenceIndex())
          - cec.getMaxNounPhraseSentenceIndex();
    }
    List<String> features = new ArrayList<String>();
    features.add("hd=" + hobbsEntityDistance);
    features.add("de=" + entityDistance);
    features.add("ds=" + sentenceDistance);
    return features;
  }

  /**
   * Returns whether the specified token is a definite article.
   * <p>
   * The token counts as definite when it is "the", "these" or "those" (ignoring case) or when
   * its tag is {@code PRP$}. The original condition tested "these" twice; the duplicate is
   * replaced by "those".
   *
   * @param tok The token.
   * @param tag The pos-tag for the specified token.
   * @return whether the specified token is a definite article.
   */
  public static boolean definiteArticle(String tok, String tag) {
    String lower = tok.toLowerCase(Locale.ENGLISH);
    return lower.equals("the") || lower.equals("these") || lower.equals("those")
        || tag.equals("PRP$");
  }

  /**
   * Returns a feature indicating whether the number of the specified mention agrees with the
   * number of the specified entity.
   *
   * @param ec The mention.
   * @param de The entity.
   * @return "num.unknown" if either number is unknown, "num.compatible" if both are equal,
   *     "num.incompatible" otherwise.
   */
  public static String getNumberCompatibilityFeature(MentionContext ec, DiscourseEntity de) {
    NumberEnum en = de.getNumber();
    NumberEnum mn = ec.getNumber();
    if (en == NumberEnum.UNKNOWN || mn == NumberEnum.UNKNOWN) {
      return NUM_UNKNOWN;
    }
    return mn == en ? NUM_COMPATIBLE : NUM_INCOMPATIBLE;
  }

  /**
   * Returns features indicating whether the specified mention and the specified entity are compatible.
   * <p>
   * The list always contains the semantic, gender and number compatibility features, in that
   * order. "all.compatible" is added when all three report compatibility; otherwise
   * "some.incompatible" is added when at least one reports incompatibility.
   *
   * @param mention The mention.
   * @param entity The entity.
   * @param simModel The similarity model used for the semantic compatibility feature; may be
   *     {@code null}, in which case semantic compatibility is reported as unknown.
   * @return list of features indicating whether the specified mention and the specified entity are compatible.
   */
  public static List<String> getCompatibilityFeatures(MentionContext mention, DiscourseEntity entity,
      TestSimilarityModel simModel) {
    List<String> compatFeatures = new ArrayList<String>();
    String semCompatible = getSemanticCompatibilityFeature(mention, entity, simModel);
    compatFeatures.add(semCompatible);
    String genCompatible = getGenderCompatibilityFeature(mention, entity);
    compatFeatures.add(genCompatible);
    String numCompatible = getNumberCompatibilityFeature(mention, entity);
    compatFeatures.add(numCompatible);
    if (semCompatible.equals(SIM_COMPATIBLE) && genCompatible.equals(GEN_COMPATIBLE)
        && numCompatible.equals(NUM_COMPATIBLE)) {
      compatFeatures.add("all.compatible");
    }
    else if (semCompatible.equals(SIM_INCOMPATIBLE) || genCompatible.equals(GEN_INCOMPATIBLE)
        || numCompatible.equals(NUM_INCOMPATIBLE)) {
      compatFeatures.add("some.incompatible");
    }
    return compatFeatures;
  }

  /**
   * Returns a feature indicating whether the gender of the specified mention agrees with the
   * gender of the specified entity.
   *
   * @param ec The mention.
   * @param de The entity.
   * @return "gen.unknown" if either gender is unknown, "gen.compatible" if both are equal,
   *     "gen.incompatible" otherwise.
   */
  public static String getGenderCompatibilityFeature(MentionContext ec, DiscourseEntity de) {
    GenderEnum eg = de.getGender();
    GenderEnum mg = ec.getGender();
    if (eg == GenderEnum.UNKNOWN || mg == GenderEnum.UNKNOWN) {
      return GEN_UNKNOWN;
    }
    return mg == eg ? GEN_COMPATIBLE : GEN_INCOMPATIBLE;
  }

  /**
   * Returns a feature indicating whether the specified mention is semantically compatible with
   * the specified entity.
   * <p>
   * The best score which the model assigns to the mention paired with any mention of the entity
   * is compared with {@code MIN_SIM_PROB}: a score above it is compatible, a score above
   * {@code 1 - MIN_SIM_PROB} is unknown and anything else is incompatible. An entity without
   * mentions therefore yields incompatible.
   *
   * @param ec The mention.
   * @param de The entity.
   * @param simModel The similarity model; if {@code null} a message is written to
   *     {@code System.err} and "sim.unknown" is returned.
   * @return "sim.compatible", "sim.unknown" or "sim.incompatible".
   */
  public static String getSemanticCompatibilityFeature(MentionContext ec, DiscourseEntity de,
      TestSimilarityModel simModel) {
    if (simModel == null) {
      System.err.println("MaxentResolver: Uninitialized Semantic Model");
      return SIM_UNKNOWN;
    }
    double best = 0;
    for (Iterator<MentionContext> xi = de.getMentions(); xi.hasNext();) {
      double sim = simModel.compatible(ec, xi.next());
      if (sim > best) {
        best = sim;
      }
    }
    if (best > MIN_SIM_PROB) {
      return SIM_COMPATIBLE;
    }
    if (best > (1 - MIN_SIM_PROB)) {
      return SIM_UNKNOWN;
    }
    return SIM_INCOMPATIBLE;
  }

  /**
   * Returns a feature for the number of mentions of the specified entity. Counts of five and
   * above share the single feature "mc=5+".
   *
   * @param de The entity.
   * @return the mention count feature.
   */
  public static String getMentionCountFeature(DiscourseEntity de) {
    int mentionCount = de.getNumMentions();
    if (mentionCount >= 5) {
      return "mc=5+";
    }
    return "mc=" + mentionCount;
  }

  /**
   * Returns a string representing the gender of the specified pronoun.
   *
   * @param pronoun An English pronoun.
   * @return "m" for male, "f" for female, "n" for neuter and "u" if the pronoun matches none
   *     of the gender patterns.
   */
  public static String getPronounGender(String pronoun) {
    if (malePronounPattern.matcher(pronoun).matches()) {
      return "m";
    }
    if (femalePronounPattern.matcher(pronoun).matches()) {
      return "f";
    }
    if (neuterPronounPattern.matcher(pronoun).matches()) {
      return "n";
    }
    return "u";
  }

}
