/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *  http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.any23.extractor.html;

import org.apache.any23.extractor.ExtractionContext;
import org.apache.any23.extractor.ExtractionException;
import org.apache.any23.extractor.ExtractionParameters;
import org.apache.any23.extractor.ExtractionResult;
import org.apache.any23.extractor.Extractor;
import org.apache.any23.extractor.ExtractorDescription;
import org.apache.any23.rdf.RDFUtils;
import org.apache.any23.vocab.SINDICE;
import org.eclipse.rdf4j.model.IRI;
import org.eclipse.rdf4j.model.impl.SimpleValueFactory;
import org.w3c.dom.Document;
import org.w3c.dom.NamedNodeMap;
import org.w3c.dom.Node;

import java.io.IOException;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;

/**
 * This extractor represents the <i>HTML META</i> tag values according to the
 * <a href="http://www.w3.org/TR/html401/struct/global.html#h-7.4.4">HTML4 specification</a>.
 *
 * <p>
 * Every <code>/HTML/HEAD/META</code> element that has a <code>content</code> attribute and at least one of
 * <code>name</code> / <code>http-equiv</code> is written as a single triple: the subject is the document IRI, the
 * predicate is derived from the <code>http-equiv</code> attribute if present (otherwise from <code>name</code>) and
 * the object is a literal holding the <code>content</code> value, language-tagged when a language is known.
 * </p>
 *
 * <p>
 * The extractor keeps no per-document state in instance fields: profile, document language and LINK-defined
 * prefixes are recomputed on every {@link #run} invocation, so prefixes found in one document never influence the
 * triples produced for another one.
 * </p>
 *
 * @author Davide Palmisano ( dpalmisano@gmail.com )
 */
public class HTMLMetaExtractor implements Extractor.TagSoupDOMExtractor {

    private static final SINDICE vSINDICE = SINDICE.getInstance();

    private static final SimpleValueFactory VALUE_FACTORY = SimpleValueFactory.getInstance();

    /**
     * {@inheritDoc}
     *
     * <p>
     * Predicates that cannot be resolved through a LINK-defined prefix are built by appending the attribute value to
     * the <code>profile</code> attribute of the root element, or to the {@link SINDICE} namespace when no profile is
     * declared.
     * </p>
     */
    @Override
    public void run(ExtractionParameters extractionParameters, ExtractionContext extractionContext, Document in,
            ExtractionResult out) throws IOException, ExtractionException {
        final IRI profile = extractProfile(in);
        final String documentLang = getDocumentLanguage(in);
        // Built afresh for each document so that prefixes cannot leak between successive runs.
        final Map<String, IRI> prefixes = extractLinkDefinedPrefixes(in);

        final String baseProfile = profile != null ? profile.toString() : vSINDICE.NS;
        final IRI documentIRI = extractionContext.getDocumentIRI();

        for (Meta meta : extractMetaElement(in, baseProfile, prefixes)) {
            // A language declared on the element itself wins over the document-wide one.
            final String lang = meta.getLang() != null ? meta.getLang() : documentLang;
            final IRI predicate = meta.isPragmaDirective() ? meta.getHttpEquiv() : meta.getName();
            writeLiteral(out, documentIRI, predicate, meta.getContent(), lang);
        }
    }

    /**
     * Writes a single triple whose object is a literal, language-tagged only when a language is available.
     *
     * @param out
     *            the result the triple is written to.
     * @param subject
     *            the triple subject.
     * @param predicate
     *            the triple predicate.
     * @param content
     *            the lexical value of the literal.
     * @param lang
     *            the language tag, or <code>null</code> to emit a plain literal.
     */
    private static void writeLiteral(ExtractionResult out, IRI subject, IRI predicate, String content, String lang) {
        if (lang != null) {
            out.writeTriple(subject, predicate, VALUE_FACTORY.createLiteral(content, lang));
        } else {
            out.writeTriple(subject, predicate, VALUE_FACTORY.createLiteral(content));
        }
    }

    /**
     * Returns the {@link Document} language if declared, <code>null</code> otherwise.
     *
     * @param in
     *            an instance of {@link Document}.
     *
     * @return the value of the <code>lang</code> attribute of the root element; <code>null</code> when the lookup
     *         yields an empty string.
     */
    private String getDocumentLanguage(Document in) {
        final String lang = DomUtils.find(in, "string(/HTML/@lang)");
        return "".equals(lang) ? null : lang;
    }

    /**
     * Returns the profile declared on the root element as an {@link IRI}.
     *
     * @param in
     *            an instance of {@link Document}.
     *
     * @return the value of the <code>profile</code> attribute of the root element as an IRI; <code>null</code> when
     *         the lookup yields an empty string.
     */
    private IRI extractProfile(Document in) {
        final String profile = DomUtils.find(in, "string(/HTML/@profile)");
        return "".equals(profile) ? null : VALUE_FACTORY.createIRI(profile);
    }

    /**
     * It extracts prefixes defined in the <i>LINK</i> meta tags.
     *
     * <p>
     * A <code>/HTML/HEAD/LINK</code> element contributes a prefix when it has both a <code>rel</code> and a
     * <code>href</code> attribute and the <code>href</code> value is accepted by
     * {@link RDFUtils#isAbsoluteIRI(String)}. When the same <code>rel</code> value occurs more than once, the last
     * occurrence wins.
     * </p>
     *
     * @param in
     *            the document to scan.
     *
     * @return a new map from <code>rel</code> value to the IRI built from the matching <code>href</code>; never
     *         <code>null</code>.
     */
    private Map<String, IRI> extractLinkDefinedPrefixes(Document in) {
        final Map<String, IRI> prefixes = new HashMap<>();
        for (Node linkNode : DomUtils.findAll(in, "/HTML/HEAD/LINK")) {
            final NamedNodeMap attributes = linkNode.getAttributes();
            final String rel = textOrNull(attributes.getNamedItem("rel"));
            final String href = textOrNull(attributes.getNamedItem("href"));
            if (rel != null && href != null && RDFUtils.isAbsoluteIRI(href)) {
                prefixes.put(rel, VALUE_FACTORY.createIRI(href));
            }
        }
        return prefixes;
    }

    /**
     * Returns the text content of an attribute node, tolerating a missing attribute.
     *
     * @param attribute
     *            the attribute node, possibly <code>null</code>.
     *
     * @return the node text content, or <code>null</code> when the node itself is <code>null</code>.
     */
    private static String textOrNull(Node attribute) {
        return attribute == null ? null : attribute.getTextContent();
    }

    /**
     * Collects the usable <i>META</i> elements found under <code>/HTML/HEAD</code>.
     *
     * <p>
     * Elements lacking a <code>content</code> attribute, or lacking both <code>name</code> and
     * <code>http-equiv</code> (as HTML5 allows), are skipped. When both <code>name</code> and <code>http-equiv</code>
     * are present the element is handled as a pragma directive and <code>name</code> is ignored.
     * </p>
     *
     * @param in
     *            the document to scan.
     * @param baseProfile
     *            namespace prepended to attribute values that do not match a known prefix.
     * @param prefixes
     *            the LINK-defined prefixes of the document.
     *
     * @return the extracted elements; two elements are considered the same when they share the same XPath.
     */
    private Set<Meta> extractMetaElement(Document in, String baseProfile, Map<String, IRI> prefixes) {
        final Set<Meta> result = new HashSet<>();
        for (Node metaNode : DomUtils.findAll(in, "/HTML/HEAD/META")) {
            final NamedNodeMap attributes = metaNode.getAttributes();
            final Node nameAttribute = attributes.getNamedItem("name");
            final Node httpEquivAttribute = attributes.getNamedItem("http-equiv");
            final Node contentAttribute = attributes.getNamedItem("content");
            if (contentAttribute == null || (nameAttribute == null && httpEquivAttribute == null)) {
                continue;
            }

            final String content = contentAttribute.getTextContent();
            final String xpath = DomUtils.getXPathForNode(metaNode);
            if (httpEquivAttribute != null) {
                final IRI httpEquiv = toPredicate(httpEquivAttribute.getTextContent(), baseProfile, prefixes);
                result.add(new Meta(xpath, content, httpEquiv));
            } else {
                final IRI name = toPredicate(nameAttribute.getTextContent(), baseProfile, prefixes);
                result.add(new Meta(xpath, name, content));
            }
        }
        return result;
    }

    /**
     * Turns a <code>name</code> / <code>http-equiv</code> attribute value into a predicate IRI.
     *
     * @param value
     *            the raw attribute value.
     * @param baseProfile
     *            namespace used when the value does not match a known prefix.
     * @param prefixes
     *            the LINK-defined prefixes of the document.
     *
     * @return the prefix-expanded IRI when the value matches a known prefix, <code>baseProfile + value</code>
     *         otherwise.
     */
    private IRI toPredicate(String value, String baseProfile, Map<String, IRI> prefixes) {
        final IRI expanded = getPrefixIfExists(value, prefixes);
        return expanded != null ? expanded : VALUE_FACTORY.createIRI(baseProfile + value);
    }

    /**
     * Expands a value of the form <code>prefix.local</code> using the given prefixes.
     *
     * @param name
     *            the raw attribute value.
     * @param prefixes
     *            the LINK-defined prefixes of the document.
     *
     * @return the IRI obtained by appending <code>local</code> to the IRI registered for <code>prefix</code>, or
     *         <code>null</code> when the value does not split into exactly two dot-separated parts or the prefix is
     *         unknown.
     */
    private IRI getPrefixIfExists(String name, Map<String, IRI> prefixes) {
        final String[] split = name.split("\\.");
        if (split.length != 2) {
            return null;
        }
        final IRI namespace = prefixes.get(split[0]);
        return namespace == null ? null : VALUE_FACTORY.createIRI(namespace + split[1]);
    }

    @Override
    public ExtractorDescription getDescription() {
        return HTMLMetaExtractorFactory.getDescriptionInstance();
    }

    /**
     * Immutable description of one <i>META</i> element. Identity ({@link #equals(Object)} / {@link #hashCode()}) is
     * based on the element XPath only.
     */
    private static class Meta {

        private final String xpath;

        /** Predicate derived from the <code>name</code> attribute; <code>null</code> for pragma directives. */
        private final IRI name;

        /** Predicate derived from the <code>http-equiv</code> attribute; <code>null</code> for named elements. */
        private final IRI httpEquiv;

        /**
         * Language of the element, or <code>null</code> when unknown. NOTE: the enclosing extractor currently only
         * uses the constructors without a language, so this is <code>null</code> for every extracted element.
         */
        private final String lang;

        private final String content;

        /** <code>true</code> when the element was built from an <code>http-equiv</code> attribute. */
        private final boolean pragmaDirective;

        /**
         * Creates a pragma directive (<code>http-equiv</code>) element without a language.
         */
        public Meta(String xpath, String content, IRI httpEquiv) {
            this(xpath, content, httpEquiv, null);
        }

        /**
         * Creates a pragma directive (<code>http-equiv</code>) element.
         */
        public Meta(String xpath, String content, IRI httpEquiv, String lang) {
            this.xpath = xpath;
            this.name = null;
            this.httpEquiv = httpEquiv;
            this.lang = lang;
            this.content = content;
            this.pragmaDirective = true;
        }

        /**
         * Creates a named (<code>name</code>) element without a language.
         */
        public Meta(String xpath, IRI name, String content) {
            this(xpath, name, content, null);
        }

        /**
         * Creates a named (<code>name</code>) element.
         */
        public Meta(String xpath, IRI name, String content, String lang) {
            this.xpath = xpath;
            this.name = name;
            this.httpEquiv = null;
            this.lang = lang;
            this.content = content;
            this.pragmaDirective = false;
        }

        public boolean isPragmaDirective() {
            return pragmaDirective;
        }

        public IRI getHttpEquiv() {
            return httpEquiv;
        }

        public IRI getName() {
            return name;
        }

        public String getLang() {
            return lang;
        }

        public String getContent() {
            return content;
        }

        @Override
        public boolean equals(Object o) {
            if (this == o) {
                return true;
            }
            if (o == null || getClass() != o.getClass()) {
                return false;
            }
            final Meta other = (Meta) o;
            return xpath != null ? xpath.equals(other.xpath) : other.xpath == null;
        }

        @Override
        public int hashCode() {
            return xpath != null ? xpath.hashCode() : 0;
        }
    }

}
