/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.nutch.parse.headings;

import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.hadoop.conf.Configuration;
import org.apache.nutch.parse.HTMLMetaTags;
import org.apache.nutch.parse.Parse;
import org.apache.nutch.parse.HtmlParseFilter;
import org.apache.nutch.parse.ParseResult;
import org.apache.nutch.protocol.Content;
import org.apache.nutch.util.NodeWalker;
import org.w3c.dom.DocumentFragment;
import org.w3c.dom.Node;

/**
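 * HtmlParseFilter that extracts the text of the heading elements named in the
 * <code>headings</code> configuration property (e.g. h1, h2) from the DOM and
 * adds it to the parse metadata.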
 */
public class HeadingsParseFilter implements HtmlParseFilter {

  /**
   * Pattern used to strip surpluss whitespace
   */
  protected static Pattern whitespacePattern = Pattern.compile("\\s+");

  private Configuration conf;
  private String[] headings;
  private boolean multiValued = false;

  public ParseResult filter(Content content, ParseResult parseResult,
      HTMLMetaTags metaTags, DocumentFragment doc) {
    Parse parse = parseResult.get(content.getUrl());

    for (int i = 0; headings != null && i < headings.length; i++) {
      List<String> discoveredHeadings = getElement(doc, headings[i]);

      if (discoveredHeadings.size() > 0) {
        for (String heading : discoveredHeadings) {
          if (heading != null) {
            heading = heading.trim();

            if (heading.length() > 0) {
              parse.getData().getParseMeta().add(headings[i], heading);
            }
          }
        }
      }
    }

    return parseResult;
  }

  public void setConf(Configuration conf) {
    this.conf = conf;

    headings = conf.getStrings("headings");
    multiValued = conf.getBoolean("headings.multivalued", false);
  }

  public Configuration getConf() {
    return this.conf;
  }

  /**
   * Finds the specified element and returns its value
   * @param doc the input {@link org.w3c.dom.DocumentFragment} to process
   * @param element the element to find in the DocumentFragment
   * @return a {@link java.util.List} containing headings
   */
  protected List<String> getElement(DocumentFragment doc, String element) {
    List<String> headings = new ArrayList<>();
    NodeWalker walker = new NodeWalker(doc);

    while (walker.hasNext()) {
      Node currentNode = walker.nextNode();

      if (currentNode.getNodeType() == Node.ELEMENT_NODE) {
        if (element.equalsIgnoreCase(currentNode.getNodeName())) {
          headings.add(getNodeValue(currentNode));

          // Check for multiValued here, if disabled we don't need
          // to discover more headings.
          if (!multiValued) {
            break;
          }
        }
      }
    }

    return headings;
  }

  /**
   * Returns the text value of the specified Node and child nodes
   * @param node the input {@link Node} to extract a value(s) for
   * @return the whitespace-stripped String node value(s)
   */
  protected static String getNodeValue(Node node) {
    StringBuilder buffer = new StringBuilder();
    NodeWalker walker = new NodeWalker(node);

    while (walker.hasNext()) {
      final Node n = walker.nextNode();

      if (n.getNodeType() == Node.TEXT_NODE) {
        buffer.append(n.getNodeValue());
      }
    }

    // Return with stripped surplus whitespace
    Matcher matcher = whitespacePattern.matcher(buffer.toString().trim());
    return matcher.replaceAll(" ").trim();
  }
}
