/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package opennlp.tools.similarity.apps;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.logging.Logger;

import opennlp.tools.similarity.apps.utils.PageFetcher;
import opennlp.tools.similarity.apps.utils.StringDistanceMeasurer;
import opennlp.tools.similarity.apps.utils.Utils;
import opennlp.tools.textsimilarity.ParseTreeChunk;
import opennlp.tools.textsimilarity.ParseTreeChunkListScorer;
import opennlp.tools.textsimilarity.SentencePairMatchResult;
import opennlp.tools.textsimilarity.chunker2matcher.ParserChunker2MatcherProcessor;

import org.apache.commons.lang.StringUtils;

/*
 * This class does content generation by using web mining and syntactic generalization to get sentences from the web,
 * and then converts and combines them into a form intended to be readable by humans.
 * 
 * These are examples of generated articles, given the article title
 * http://www.allvoices.com/contributed-news/9423860/content/81937916-ichie-sings-jazz-blues-contemporary-tunes
 * http://www.allvoices.com/contributed-news/9415063-britney-spears-femme-fatale-in-north-sf-bay-area
 * 
 */

public class RelatedSentenceFinder {
  // class-wide logger, registered under the fully qualified class name
  private static Logger LOG = Logger.getLogger(
      "opennlp.tools.similarity.apps.RelatedSentenceFinder");

  // used to download the web page behind a search hit
  PageFetcher pFetcher = new PageFetcher();

  // turns a syntactic match result into a numeric score
  private ParseTreeChunkListScorer parseTreeChunkListScorer = new ParseTreeChunkListScorer();

  // used to render syntactic match results as text for console output
  private ParseTreeChunk parseTreeChunk = new ParseTreeChunk();

  // string similarity between the seed sentence and a mined sentence
  static StringDistanceMeasurer STRING_DISTANCE_MEASURER = new StringDistanceMeasurer();

  // verbs whose presence is taken as a hint that a sentence is an opinion,
  // which makes the sentence more appropriate (some entries are repeated)
  static List<String> MENTAL_VERBS = new ArrayList<String>(Arrays.asList(
      "want", "know", "believe", "appeal", "ask",
      "accept", "agree", "allow", "appeal", "ask", "assume", "believe",
      "check", "confirm", "convince", "deny", "disagree", "explain",
      "ignore", "inform", "remind", "request", "suggest", "suppose",
      "think", "threaten", "try", "understand"));

  // not referenced by the code of this class that is visible here
  private static final int MAX_FRAGMENT_SENTS = 10;

  /**
   * Creates a finder. The constructor itself does nothing; the instance
   * fields are set up by their own initializers.
   */
  public RelatedSentenceFinder() {
  }

  /**
   * Shortcut variant: runs one web search for the given expression and hands
   * back the search hits as they are, without mining or relevance checks.
   *
   * @param word
   *          search expression, passed to the query runner unchanged
   * @param sents
   *          not used by this method
   * @return the result of the query runner for {@code word}
   * @throws Exception
   *           any exception raised while running the search
   */
  public List<HitBase> findRelatedOpinionsForSentenceFastAndDummy(String word,
      List<String> sents) throws Exception {
    return new BingWebQueryRunner().runSearch(word);
  }

  /**
   * Forms search queries from the given sentence, runs each of them, and
   * augments every usable search hit with mined sentences that passed the
   * relevance check. Duplicate fragments are removed before returning.
   *
   * @param sentence
   *          seed sentence the queries are built from
   * @param sents
   *          other sentences of the seed text, forwarded to the relevance
   *          check
   * @return the processed search hits
   * @throws Exception
   *           any exception raised while running a search
   */
  public List<HitBase> findRelatedOpinionsForSentence(String sentence,
      List<String> sents) throws Exception {
    System.out.println(" \n\n=== Sentence  = " + sentence);
    List<String> queries = buildSearchEngineQueryFromSentence(sentence);
    BingWebQueryRunner searchRunner = new BingWebQueryRunner();

    List<HitBase> minedHits = new ArrayList<HitBase>();
    for (String query : queries) {
      System.out.println("\nquery = " + query);
      List<HitBase> found = searchRunner.runSearch(query);
      if (found == null) {
        continue;
      }
      for (HitBase hit : found) {
        // a hit without snippet text gives nothing to mine
        if (hit.getAbstractText() == null) {
          continue;
        }
        // exclude pdf
        if (hit.getUrl().indexOf(".pdf") > 0) {
          continue;
        }
        minedHits.add(augmentWithMinedSentencesAndVerifyRelevance(hit,
            sentence, sents));
      }
    }

    return removeDuplicatesFromResultantHits(minedHits);
  }

  /**
   * Main content generation function. Takes a seed such as the name of a
   * person, rock group, or other entity, searches the web for the seed
   * combined with each of the frequent performing verbs, and collects text
   * fragments mined from the search hits.
   *
   * @param sentence
   *          entity name (seed) to write about
   * @return list of search hits which carry the approved (in terms of
   *         relevance) mined sentences, as well as the original search result
   *         data such as doc titles, abstracts, and urls
   * @throws Exception
   *           any exception raised while running a search
   */
  public List<HitBase> generateContentAbout(String sentence) throws Exception {
    System.out.println(" \n=== Entity to write about = " + sentence);

    BingWebQueryRunner searchRunner = new BingWebQueryRunner();
    List<HitBase> minedHits = new ArrayList<HitBase>();
    for (String verbAddition : StoryDiscourseNavigator.frequentPerformingVerbs) {
      String query = sentence + " " + verbAddition;
      List<HitBase> found = searchRunner.runSearch(query);
      if (found == null) {
        continue;
      }
      for (HitBase hit : found) {
        // a hit without snippet text gives nothing to mine
        if (hit.getAbstractText() == null) {
          continue;
        }
        // exclude pdf
        if (hit.getUrl().indexOf(".pdf") > 0) {
          continue;
        }
        minedHits.add(augmentWithMinedSentencesAndVerifyRelevance(hit,
            sentence, null));
      }
    }

    return removeDuplicatesFromResultantHits(minedHits);
  }

  /**
   * Takes a sentence and extracts noun phrases and entity names to form search
   * queries for finding relevant sentences on the web, which are then subject
   * to relevance assessment by Similarity. Search queries should not be too
   * general (irrelevant search results) or too specific (too few search
   * results).
   * <p>
   * First pass: a phrase is used only if it yields 2 to 5 keywords, and
   * phrases with fewer than 4 keywords are used only when no keyword is
   * entirely lower case. If the first pass yields nothing, every phrase with
   * at least 2 keywords is used. The sentence itself is always appended as the
   * last query.
   *
   * @param sentence
   *          input sentence to form queries
   * @return list of search expressions; the last one is {@code sentence}
   */
  public static List<String> buildSearchEngineQueryFromSentence(String sentence) {
    ParserChunker2MatcherProcessor pos = ParserChunker2MatcherProcessor
        .getInstance();

    // only the first group of phrases is used as the source of keywords
    List<ParseTreeChunk> nPhrases = pos
        .formGroupedPhrasesFromChunksForSentence(sentence).get(0);

    List<String> queryArrayStr = new ArrayList<String>();
    for (ParseTreeChunk ch : nPhrases) {
      String keywords = joinNounAndAdjectiveLemmas(ch);
      int len = keywords.split(" ").length;
      if (len < 2 || len > 5) {
        continue;
      }
      // if only two or three words, then it has to be a person name, title
      // or geo location, so an all-lower-case word disqualifies the phrase
      if (len < 4 && containsAllLowerCaseWord(keywords)) {
        continue;
      }
      queryArrayStr.add(toAllTermsRequiredQuery(keywords));
    }

    if (queryArrayStr.isEmpty()) {
      // release constraints on NP down to 2 keywords
      for (ParseTreeChunk ch : nPhrases) {
        String keywords = joinNounAndAdjectiveLemmas(ch);
        if (keywords.split(" ").length < 2) {
          continue;
        }
        queryArrayStr.add(toAllTermsRequiredQuery(keywords));
      }
    }

    queryArrayStr = removeDuplicatesFromQueries(queryArrayStr);
    queryArrayStr.add(sentence);

    return queryArrayStr;
  }

  // Joins, with single spaces, the lemmas of the chunk whose POS tag starts
  // with "N" or "J"; the result is trimmed.
  private static String joinNounAndAdjectiveLemmas(ParseTreeChunk chunk) {
    List<String> lemmas = chunk.getLemmas();
    List<String> posTags = chunk.getPOSs();
    StringBuilder keywords = new StringBuilder();
    int size = lemmas.size();
    for (int i = 0; i < size; i++) {
      String tag = posTags.get(i);
      if (tag.startsWith("N") || tag.startsWith("J")) {
        keywords.append(lemmas.get(i)).append(' ');
      }
    }
    return keywords.toString().trim();
  }

  // True when at least one space-separated word equals its own lower-case
  // form, i.e. contains no upper-case character.
  private static boolean containsAllLowerCaseWord(String keywords) {
    for (String w : keywords.split(" ")) {
      if (w.toLowerCase().equals(w)) {
        return true;
      }
    }
    return false;
  }

  // Prefixes every keyword with '+', e.g. "Albert Einstein" becomes
  // " +Albert +Einstein".
  private static String toAllTermsRequiredQuery(String keywords) {
    return " +" + keywords.trim().replace(" ", " +");
  }

  /**
   * Removes near-duplicates from a list of sentences (search queries, search
   * result abstracts, or titles) so that repetitive searches and later dupe
   * cleaning are avoided. Two entries are considered duplicates when their
   * string similarity exceeds 0.8; of such a pair the later list member is
   * dropped. Empty or null entries are never compared and are always kept.
   * <p>
   * If the comparison fails with an exception, the problem is logged and an
   * un-deduplicated copy of the input is returned, so that no entry is lost.
   *
   * @param hits
   *          list of sentences; may be null
   * @return new, modifiable list of sentences where dupes are removed; empty
   *         when {@code hits} is null
   */
  public static List<String> removeDuplicatesFromQueries(List<String> hits) {
    List<String> hitsDedup = new ArrayList<String>();
    if (hits == null || hits.isEmpty()) {
      return hitsDedup;
    }

    StringDistanceMeasurer meas = new StringDistanceMeasurer();
    double dupeThresh = 0.8; // if more similar, then considered dupes; was 0.7
    int size = hits.size();
    boolean[] isDuplicate = new boolean[size];
    int firstDuplicate = -1; // index of the first entry found to be a dupe

    try {
      for (int i = 0; i < size; i++) {
        String title1 = hits.get(i);
        if (StringUtils.isEmpty(title1)) {
          continue;
        }
        for (int j = i + 1; j < size; j++) {
          // already marked for removal, measuring again cannot change that
          if (isDuplicate[j]) {
            continue;
          }
          String title2 = hits.get(j);
          if (StringUtils.isEmpty(title2)) {
            continue;
          }
          if (meas.measureStringDistance(title1, title2) > dupeThresh) {
            // dupes found, later list member to be deleted
            isDuplicate[j] = true;
            if (firstDuplicate < 0) {
              firstDuplicate = j;
            }
          }
        }
      }
    } catch (Exception e) {
      LOG.log(java.util.logging.Level.SEVERE,
          "Problem removing duplicates from query list", e);
      // best effort: keep all entries rather than dropping every one of them
      return new ArrayList<String>(hits);
    }

    for (int i = 0; i < size; i++) {
      if (!isDuplicate[i]) {
        hitsDedup.add(hits.get(i));
      }
    }

    if (firstDuplicate >= 0) {
      LOG.info("Removed duplicates from formed query, including "
          + hits.get(firstDuplicate));
    }

    return hitsDedup;
  }

  /**
   * Removes duplicate fragments from search results. For every pair of hits,
   * each fragment of the later hit whose result text is more similar than 0.7
   * to the result text of some fragment of the earlier hit is removed from
   * the later hit. Fragments with empty result text are never compared and
   * are kept. Hits whose fragment list is null are skipped.
   * <p>
   * The hits are modified in place (their fragment lists are replaced); the
   * list itself keeps all its hits.
   *
   * @param hits
   *          list of search result objects
   * @return the same list of search result objects, with duplicate fragments
   *         removed
   */
  public static List<HitBase> removeDuplicatesFromResultantHits(
      List<HitBase> hits) {
    StringDistanceMeasurer meas = new StringDistanceMeasurer();
    double dupeThresh = 0.7; // if more similar, then considered dupes; was 0.8
    try {
      for (int i = 0; i < hits.size(); i++) {
        for (int j = i + 1; j < hits.size(); j++) {
          HitBase hit2 = hits.get(j);
          List<Fragment> fragmList1 = hits.get(i).getFragments();
          List<Fragment> fragmList2 = hit2.getFragments();
          if (fragmList1 == null || fragmList2 == null) {
            continue; // nothing to compare for this pair
          }

          // fragments of the later hit which are not dupes of the earlier hit
          List<Fragment> fragmList2Results = new ArrayList<Fragment>(
              fragmList2.size());
          for (Fragment f2 : fragmList2) {
            String sf2 = f2.getResultText();
            boolean duplicate = false;
            if (!StringUtils.isEmpty(sf2)) {
              for (Fragment f1 : fragmList1) {
                String sf1 = f1.getResultText();
                if (StringUtils.isEmpty(sf1)) {
                  continue;
                }
                if (meas.measureStringDistance(sf1, sf2) > dupeThresh) {
                  duplicate = true;
                  LOG.info("Removed duplicates from formed fragments list: "
                      + sf2);
                  break; // one match is enough to drop the fragment
                }
              }
            }
            if (!duplicate) {
              fragmList2Results.add(f2);
            }
          }

          hit2.setFragments(fragmList2Results);
        }
      }
    } catch (Exception e) {
      LOG.log(java.util.logging.Level.SEVERE,
          "Problem removing duplicates from list of fragment", e);
    }
    return hits;
  }

  /**
   * Takes a single search result for an entity which is the subject of the
   * essay to be written and forms essay sentences from the title, abstract,
   * and possibly the original page.
   * <p>
   * The abstract (snippet) is split into fragments. When the snippet contains
   * "..." the page behind the hit is downloaded and split into sentences, and
   * for each elided fragment the best matching full sentence of the page is
   * looked up. Every candidate sentence is then scored against the seed
   * (syntactic match, string similarity, presence of a mental verb) and, if
   * accepted, added to the hit as a {@code Fragment}.
   *
   * @param item
   *          search result; its original sentences, page content (when
   *          downloaded) and fragments are set by this method
   * @param originalSentence
   *          seed for the essay to be written
   * @param sentsAll
   *          list of other sentences in the seed if it is multi-sentence; may
   *          be null
   * @return the same search result object; when downloading or splitting the
   *         page fails it is returned without its fragments being set
   */
  public HitBase augmentWithMinedSentencesAndVerifyRelevance(HitBase item,
      String originalSentence, List<String> sentsAll) {
    if (sentsAll == null)
      sentsAll = new ArrayList<String>();
    // put orig sentence in structure
    List<String> origs = new ArrayList<String>();
    origs.add(originalSentence);
    item.setOriginalSentences(origs);
    String title = item.getTitle().replace("<b>", " ").replace("</b>", " ")
        .replace("  ", " ").replace("  ", " ");
    // generation results for this sentence
    List<Fragment> result = new ArrayList<Fragment>();
    // form plain text from snippet
    String snapshot = item.getAbstractText().replace("<b>", " ")
        .replace("</b>", " ").replace("  ", " ").replace("  ", " ");

    ParserChunker2MatcherProcessor sm = ParserChunker2MatcherProcessor
        .getInstance();
    // fix a template expression which can be substituted by original if
    // relevant
    String snapshotMarked = snapshot.replace("...",
        " _should_find_orig_ . _should_find_orig_");
    String[] fragments = sm.splitSentences(snapshotMarked);
    List<String> allFragms = new ArrayList<String>();
    allFragms.addAll(Arrays.asList(fragments));

    // sentences of the downloaded page; stays null if no page was needed or
    // the page was too short
    String[] sents = null;
    String downloadedPage;
    try {
      // lengths differ only if the snippet contained "...", i.e. some of its
      // sentences are cut and the full text has to come from the page
      if (snapshotMarked.length() != snapshot.length()) {
        downloadedPage = pFetcher.fetchPage(item.getUrl());
        if (downloadedPage != null && downloadedPage.length() > 100) {
          item.setPageContent(downloadedPage);
          String pageContent = Utils.fullStripHTML(item.getPageContent());
          pageContent = GeneratedSentenceProcessor
              .normalizeForSentenceSplitting(pageContent);
          // sometimes html breaks are converted into ' ' (two spaces), so we
          // need to put '.'
          pageContent = pageContent.trim().replaceAll("  [A-Z]", ". $0")
              .replace("..", ".").replace(". . .", " ").trim();
          // split the page text (not the snippet) so that original sentences
          // can be found in it
          sents = sm.splitSentences(pageContent);
          sents = cleanListOfSents(sents);
        }
      }
    } catch (Exception e) {
      System.err
          .println("Problem downloading  the page and splitting into sentences");
      return item;
    }

    for (String fragment : allFragms) {
      String followSent = null;
      if (fragment.length() < 50)
        continue;
      String pageSentence = "";
      // try to find original sentence from webpage
      if (fragment.indexOf("_should_find_orig_") > -1 && sents != null
          && sents.length > 0) {
        try {
          String[] mainAndFollowSent = getFullOriginalSentenceFromWebpageBySnippetFragment(
              fragment.replace("_should_find_orig_", ""), sents);
          if (mainAndFollowSent != null) {
            pageSentence = mainAndFollowSent[0];
            followSent = mainAndFollowSent[1];
          } else {
            // fragment too short to be looked up in the page
            pageSentence = null;
          }
        } catch (Exception e) {
          e.printStackTrace();
        }
      } else {
        // or get original snippet
        pageSentence = fragment;
      }
      // String is immutable, so the result of replace() has to be kept
      if (pageSentence != null)
        pageSentence = pageSentence.replace("_should_find_orig_", "");

      // resultant sentence SHOULD NOT be longer than four times the size of
      // snippet fragment (was 2.0, but since snippet sentences are rather
      // short now...)
      if (pageSentence != null
          && (float) pageSentence.length() / (float) fragment.length() < 4.0) {
        try { // get score from syntactic match between sentence in
              // original text and mined sentence
          double measScore = 0.0, syntScore = 0.0, mentalScore = 0.0;

          SentencePairMatchResult matchRes = sm.assessRelevance(pageSentence
              + " " + title, originalSentence);
          List<List<ParseTreeChunk>> match = matchRes.getMatchResult();
          if (!matchRes.isVerbExists() || matchRes.isImperativeVerb()) {
            System.out
                .println("Rejected Sentence : No verb OR Yes imperative verb :"
                    + pageSentence);
            continue;
          }

          syntScore = parseTreeChunkListScorer
              .getParseTreeChunkListScore(match);
          System.out.println(parseTreeChunk.listToString(match) + " "
              + syntScore + "\n pre-processed sent = '" + pageSentence);

          if (syntScore < 1.5) { // trying other sents
            for (String currSent : sentsAll) {
              if (currSent.startsWith(originalSentence))
                continue;
              match = sm.assessRelevance(currSent, pageSentence)
                  .getMatchResult();
              double syntScoreCurr = parseTreeChunkListScorer
                  .getParseTreeChunkListScore(match);
              if (syntScoreCurr > syntScore) {
                syntScore = syntScoreCurr;
              }
            }
            if (syntScore > 1.5) {
              System.out.println("Got match with other sent: "
                  + parseTreeChunk.listToString(match) + " " + syntScore);
            }
          }

          measScore = STRING_DISTANCE_MEASURER.measureStringDistance(
              originalSentence, pageSentence);

          // now possibly increase score by finding mental verbs
          // indicating opinions
          // NOTE(review): because of the break the score never exceeds 0.3,
          // so the 'mentalScore > 0.5' test below cannot hold on its own; the
          // mental score only contributes to the fragment's total score
          for (String s : MENTAL_VERBS) {
            if (pageSentence.indexOf(s) > -1) {
              mentalScore += 0.3;
              break;
            }
          }

          if ((syntScore > 1.5 || measScore > 0.5 || mentalScore > 0.5)
              && measScore < 0.8 && pageSentence.length() > 40) // >70
          {
            String pageSentenceProc = GeneratedSentenceProcessor
                .acceptableMinedSentence(pageSentence);
            if (pageSentenceProc != null) {
              pageSentenceProc = GeneratedSentenceProcessor
                  .processSentence(pageSentenceProc);
              if (followSent != null) {
                pageSentenceProc += " "
                    + GeneratedSentenceProcessor.processSentence(followSent);
              }

              pageSentenceProc = Utils.convertToASCII(pageSentenceProc);
              // total score also grows with the length of the sentence
              Fragment f = new Fragment(pageSentenceProc, syntScore + measScore
                  + mentalScore + (double) pageSentenceProc.length()
                  / (double) 50);
              f.setSourceURL(item.getUrl());
              f.fragment = fragment;
              result.add(f);
              System.out.println("Accepted sentence: " + pageSentenceProc
                  + "| with title= " + title);
              System.out.println("For fragment = " + fragment);
            } else
              System.out
                  .println("Rejected sentence due to wrong area at webpage: "
                      + pageSentence);
          } else
            System.out.println("Rejected sentence due to low score: "
                + pageSentence);
        } catch (Throwable t) {
          // a failure on one fragment must not stop processing of the others
          t.printStackTrace();
        }
      }
    }
    item.setFragments(result);
    return item;
  }

  /**
   * Filters an array of sentences, keeping only those whose trimmed length is
   * at least 30 characters. Null entries are dropped. The kept sentences are
   * returned untrimmed and in their original order.
   *
   * @param sents
   *          sentences to filter; may be null
   * @return new array with the kept sentences; empty when {@code sents} is
   *         null or nothing qualifies
   */
  public static String[] cleanListOfSents(String[] sents) {
    if (sents == null) {
      return new String[0];
    }
    List<String> sentsClean = new ArrayList<String>(sents.length);
    for (String s : sents) {
      if (s != null && s.trim().length() >= 30) {
        sentsClean.add(s);
      }
    }
    return sentsClean.toArray(new String[sentsClean.size()]);
  }

  /**
   * Given a fragment from a snippet, finds the original sentence at a webpage
   * by optimizing the alignment score: the sentence with the highest string
   * similarity to the fragment wins, provided the similarity exceeds 0.4.
   * Sentences which are null or shorter than 30 characters are ignored.
   *
   * @param fragment
   *          snippet fragment to look up
   * @param sents
   *          sentences of the webpage
   * @return null when the trimmed fragment is shorter than 15 characters;
   *         otherwise a two-element array: [0] the best matching sentence or
   *         null if none qualifies, [1] the sentence which directly follows
   *         the best one if it is longer than 60 characters, else null
   */
  public static String[] getFullOriginalSentenceFromWebpageBySnippetFragment(
      String fragment, String[] sents) {
    if (fragment.trim().length() < 15)
      return null;

    StringDistanceMeasurer meas = new StringDistanceMeasurer();
    double dist = 0.0;
    String result = null, followSent = null;
    for (int i = 0; i < sents.length; i++) {
      String s = sents[i];
      if (s == null || s.length() < 30)
        continue;
      double distCurr = meas.measureStringDistance(s, fragment);
      if (distCurr > dist && distCurr > 0.4) {
        result = s;
        dist = distCurr;
        // the follow-up sentence belongs to the current best sentence only,
        // so forget the one remembered for a previous candidate
        followSent = null;
        if (i < sents.length - 1) {
          String next = sents[i + 1];
          if (next != null && next.length() > 60) {
            followSent = next;
          }
        }
      }
    }
    return new String[] { result, followSent };
  }

  /**
   * Given a fragment from a snippet, finds the original sentence at a webpage
   * by optimizing the alignment score: first the sentence with the highest
   * string similarity to the fragment is determined, then it is accepted if
   * that similarity exceeds 0.4. Sentences which are null or shorter than 30
   * characters are ignored.
   *
   * @param fragment
   *          snippet fragment to look up
   * @param sents
   *          sentences of the webpage
   * @return null when the trimmed fragment is shorter than 15 characters;
   *         otherwise a two-element array: [0] the best matching sentence or
   *         null if none qualifies, [1] the sentence which directly follows
   *         the best one if it is longer than 60 characters, else null
   */
  public static String[] getBestFullOriginalSentenceFromWebpageBySnippetFragment(
      String fragment, String[] sents) {
    if (fragment.trim().length() < 15)
      return null;

    int bestSentIndex = -1;
    StringDistanceMeasurer meas = new StringDistanceMeasurer();
    // the highest similarity is searched for, so start from the bottom
    double distBest = 0.0;
    String result = null, followSent = null;
    for (int i = 0; i < sents.length; i++) {
      String s = sents[i];
      if (s == null || s.length() < 30)
        continue;
      double distCurr = meas.measureStringDistance(s, fragment);
      if (distCurr > distBest) {
        distBest = distCurr;
        bestSentIndex = i;
      }
    }

    // bestSentIndex stays -1 when no sentence was similar at all
    if (bestSentIndex >= 0 && distBest > 0.4) {
      result = sents[bestSentIndex];

      if (bestSentIndex < sents.length - 1) {
        String next = sents[bestSentIndex + 1];
        if (next != null && next.length() > 60) {
          followSent = next;
        }
      }
    }

    return new String[] { result, followSent };
  }

  /**
   * Demo entry point: generates content about a fixed seed and prints the
   * resulting hits to the console.
   *
   * @param args
   *          not used
   */
  public static void main(String[] args) {
    // seed for content generation; replace it with one of the alternatives
    // below to generate content for an event description:
    // "Britney Spears - The Femme Fatale Tour"
    // "Rush Time Machine"
    // "Blue Man Group"
    // "Belly Dance With Zaharah"
    // "Hollander Musicology Lecture: Danielle Fosler-Lussier, Guest Lecturer"
    // "Jazz Master and arguably the most famous jazz musician alive, trumpeter Wynton Marsalis"
    String seed = "Albert Einstein";

    RelatedSentenceFinder finder = new RelatedSentenceFinder();
    try {
      List<HitBase> generated = finder.generateContentAbout(seed);
      System.out.println(HitBase.toString(generated));
      System.out.println(HitBase.toResultantString(generated));
      // WordFileGenerator.createWordDoc("Essey about Albert Einstein",
      // generated.get(0).getTitle(), generated);
    } catch (Exception e) {
      e.printStackTrace();
    }
  }

}