--[[
 Licensed to the Apache Software Foundation (ASF) under one or more
 contributor license agreements.  See the NOTICE file distributed with
 this work for additional information regarding copyright ownership.
 The ASF licenses this file to You under the Apache License, Version 2.0
 (the "License"); you may not use this file except in compliance with
 the License.  You may obtain a copy of the License at

     http://www.apache.org/licenses/LICENSE-2.0

 Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
]]--

-- This is stats.lua - the main stats/ML rendering script for the web.

local JSON = require 'cjson'
local elastic = require 'lib/elastic'
local user = require 'lib/user'
local aaa = require 'lib/aaa'
local config = require 'lib/config'
local cross = require 'lib/cross'

-- Number of days per month for a non-leap year, indexed by month number
-- (1 = January). February's extra leap day is added by the caller.
-- NOTE(review): the table holds 14 entries; slots 13 and 14 do not map to any
-- calendar month and are retained only so existing lookups behave unchanged.
local days = {
    31, 28, 31,   -- Jan, Feb, Mar
    30, 31, 30,   -- Apr, May, Jun
    31, 31, 30,   -- Jul, Aug, Sep
    31, 30, 31,   -- Oct, Nov, Dec
    30, 31        -- surplus entries (see note above)
}

--- Recursively order a thread tree so that, at every level, the entries in
-- `children` are sorted by descending `epoch` (newest first).
-- Nodes without a `children` table are left untouched.
-- @param thread table  a thread node, optionally carrying a `children` list
function sortEmail(thread)
    local kids = thread.children
    if not kids or type(kids) ~= "table" then
        return
    end

    local function newestFirst(a, b)
        return a.epoch > b.epoch
    end
    table.sort(kids, newestFirst)

    -- Descend into each child so nested replies get the same ordering.
    for _, child in pairs(kids) do
        sortEmail(child)
    end
end

--- Tell whether a year is a leap year under the Gregorian rules:
-- divisible by 400, or divisible by 4 but not by 100.
-- Always returns a real boolean (the earlier version fell off the end and
-- returned nil for years not divisible by 4).
-- @param year number  the calendar year
-- @return boolean     true for leap years, false otherwise
function leapYear(year)
    if year % 400 == 0 then
        return true
    end
    if year % 100 == 0 then
        return false
    end
    return year % 4 == 0
end


--- Request handler for the stats endpoint.
-- Reads the query arguments (`list`, `domain`, and optionally `q`, `d`,
-- `dfrom`/`dto`, `s`/`e`, `header_from|subject|body|to`, `quick`), runs a
-- series of searches through elastic.raw, groups the matching emails into
-- threads and writes a single JSON document to the response.
-- When `list` or `domain` is missing an empty JSON object is written instead.
-- @param r  the request object (provides parseargs, puts, clock, md5, ivm_*, ...)
-- @return cross.OK
function handle(r)
    cross.contentType(r, "application/json")
    local t = {}                -- per-stage timings, returned as listdata.debug
    local now = r:clock()
    local tnow = now
    local get = r:parseargs()
    
    -- statsOnly: Whether to only send statistical info (for n-grams etc), and not the
    -- thread struct and message bodies
    local statsOnly = get.quick
    if not get.list or not get.domain then
        r:puts("{}")
        return cross.OK
    end
    local qs = "*"              -- positive query string
    local nqs = ""              -- negative (must_not) query string
    local dd = "lte=1M"         -- default for get.d when none was supplied
    local maxresults = config.maxResults or 5000
    local account = user.get(r)
    local rights = nil          -- fetched lazily on the first private email
    if get.d and tonumber(get.d) and tonumber(get.d) > 0 then
        dd = tonumber(get.d)
    end
    if get.q and #get.q > 0 then
        local x = {}
        local nx = {}
        local words = {}
        
        -- first, grab all "foo bar" quotes (optionally negated as -"foo bar")
        -- and cut them out of the query in the same pass. Doing it in one
        -- gsub avoids building a pattern from user input, which could fail
        -- on magic characters such as '(' or '['.
        local rest = get.q:gsub('%-?"[^"]+"', function(phrase)
            table.insert(words, phrase)
            return ""
        end)
        
        -- then remaining single words
        for word in rest:gmatch("(%S+)") do
            table.insert(words, word)
        end
        
        for _, v in ipairs({'from','subject','body'}) do
            local y = {}    -- required terms for this field
            local z = {}    -- excluded terms for this field
            
            for _, word in pairs(words) do
                local preface = ""
                if word:match("^%-") then
                    preface = "-"
                    word = word:sub(2)
                end
                -- extra parentheses drop gsub's second return value (the count)
                local term = ("%s:\"%s\""):format(v, r:escape_html( (word:gsub("[()\"]+", "")) ))
                if preface == "" then
                    table.insert(y, term)
                else
                    table.insert(z, term)
                end
            end
            if #y > 0 then
                table.insert(x, "(" .. table.concat(y, " AND ") .. ")")
            end
            if #z > 0 then
                table.insert(nx, "(" .. table.concat(z, " OR ") .. ")")
            end
        end
        qs = table.concat(x, " OR ")
        if qs == "" then
            qs = "*"
        end
        nqs = table.concat(nx, " OR ")
        r:err(qs)
    end
    
    local listraw = "<" .. get.list .. "." .. get.domain .. ">"
    local listdata = {
        name = get.list,
        domain = get.domain
    }

    -- Per-header filters (header_from, header_subject, ...) are ANDed onto the query
    local hfilters = {}
    for _, v in ipairs({'from','subject','body', 'to'}) do
        local word = get['header_' .. v]
        if word then
            table.insert(hfilters, ("(%s:\"%s\")"):format(v, r:escape_html( (word:gsub("[()\"]+", "")) )))
        end
    end
    if #hfilters > 0 then
        if #qs > 0 and qs ~= "*" then
            qs = qs .. " AND (" .. table.concat(hfilters, " AND ") .. ")"
        else
            qs = table.concat(hfilters, " AND ")
        end
    end
    
    -- Debug time point 1
    table.insert(t, r:clock() - tnow)
    tnow = r:clock()
    
    -- Date range: defaults to the last month, then overridden by whichever
    -- of dfrom/dto, d=lte|gte|dfr|dto and s/e were supplied (later ones win).
    local daterange = {gt = "now-1M", lte = "now+1d" }
    if get.dfrom and get.dto then
        local ef = tonumber(get.dfrom:match("(%d+)$")) or 0
        local et = tonumber(get.dto:match("^(%d+)")) or 0
        if ef > 0 and et > 0 then
            if et > ef then
                et = ef
            end
            daterange = {
                gte = "now-" .. ef .. "d",
                lte = "now-" .. (ef-et) .. "d"
            }
        end
    end
    if not get.d then
        get.d = dd
    end
    if get.d and get.d:match("lte=.+") then
        local lte = get.d:match("lte=([0-9]+[wMyd])")
        if lte then
            daterange.lte = "now+1d"
            daterange.gte = "now-" .. lte
            daterange.gt = nil
        end
    end
    if get.d and get.d:match("gte=.+") then
        local gte = get.d:match("gte=([0-9]+[wMyd])")
        if gte then
            daterange.gte = nil
            daterange.gt = nil
            daterange.lte = "now-" .. gte
        end
    end
    if get.d and get.d:match("dfr=.+") then
        local y,m,d = get.d:match("dfr=(%d+)%-(%d+)%-(%d+)")
        if y and m and d then
            daterange.gte = ("%04u/%02u/%02u 00:00:00"):format(y,m,d)
            daterange.gt = nil
        end
    end
    if get.d and get.d:match("dto=.+") then
        local y,m,d = get.d:match("dto=(%d+)%-(%d+)%-(%d+)")
        if y and m and d then
            daterange.lte = ("%04u/%02u/%02u 23:59:59"):format(y,m,d)
            daterange.gt = nil
        end
    end
    if get.s and get.e then
        local em = tonumber(get.e:match("(%d+)$"))
        local ey = tonumber(get.e:match("^(%d+)"))
        -- Only honour s/e when the end value yields a usable year and a
        -- month in 1..12; otherwise keep the range computed above instead
        -- of failing on a nil day count.
        if ey and em and em >= 1 and em <= 12 then
            local ec = days[em]
            if em == 2 and leapYear(ey) then
                ec = ec + 1
            end
            daterange = {        
                gte = get.s:gsub("%-","/").."/01 00:00:00",
                lte = get.e:gsub("%-","/").."/" .. ec .. " 23:59:59",
            }
        end
    end
    
    -- List selector: exact list by default, wildcard when list or domain is "*"
    local wc = false
    local sterm = {
                    term = {
                        list_raw = listraw
                    }
                }
    if get.list == "*" then
        wc = true
        sterm = {
                    wildcard = {
                        list = "*." .. get.domain
                    }
                }
        maxresults = config.maxResults or 5000
    end
    if get.domain == "*" then
        wc = true
        sterm = {
                    wildcard = {
                        list = "*"
                    }
                }
        maxresults = config.maxResults or 5000
    end
    
    local top10 = {}
    local allparts = 0
    if config.slow_count then
        -- Debug time point 2
        table.insert(t, r:clock() - tnow)
        tnow = r:clock()
        
        -- Top senders via a terms aggregation on from_raw
        local doc = elastic.raw {
            aggs = {
                from = {
                    terms = {
                        field = "from_raw",
                        size = 10
                    }
                }
            },
            
            query = {
                
                bool = {
                    must = {
                        
                        {
                            range = {
                                date = daterange
                            }
                        },
                        {
                            query_string = {
                                default_field = "subject",
                                query = qs
                            }
                        },
                        sterm
                        
                },
                    must_not = {
                        {
                            query_string = {
                                default_field = "subject",
                                query = nqs
                            }
                        }
                }}
                
            }
        }
        
    
        -- Debug time point 3
        table.insert(t, r:clock() - tnow)
        tnow = r:clock()
        
        for _, bucket in pairs (doc.aggregations.from.buckets) do
            local eml = bucket.key:match("<(.-)>") or bucket.key:match("%S+@%S+") or "unknown"
            local gravatar = r:md5(eml)
            -- fall back to "unknown" (as the per-email path below does) so a
            -- key with neither "<...>" nor an address cannot yield a nil name
            local name = bucket.key:match("([^<]+)%s*<.->") or bucket.key:match("%S+@%S+") or "unknown"
            name = name:gsub("\"", "")
            table.insert(top10, {
                id = bucket.key,
                email = eml,
                gravatar = gravatar,
                name = name,
                count = bucket.doc_count
            })
        end
    end
    
    -- Debug time point 4
    table.insert(t, r:clock() - tnow)
    tnow = r:clock()
    local cloud = nil
    if config.wordcloud and not statsOnly then
        cloud = {}
        -- Word cloud!
        local doc = elastic.raw {
            aggregations = {
                subdoc = {
                    filter = {
                       limit = {value = 100} -- Max 100 x N documents used for this, otherwise it's too slow
                   },
                    aggregations = {
                        cloud = {
                            significant_terms =  {
                                field =  "subject",
                                size = 10,
                                chi_square = {}
                            }
                        }
                    }
                }
            
        }, 
            query = {
                
                bool = {
                    must = {
                        {
                            range = {
                                date = daterange
                            }
                        }, 
                        {
                            query_string = {
                                default_field = "subject",
                                query = qs
                            }
                        },
                        sterm, {
                            term = {
                                private = false
                            }
                        }
                        
                }}
                
            }
        }
        
        for _, bucket in pairs (doc.aggregations.subdoc.cloud.buckets) do
            cloud[bucket.key] = bucket.doc_count
        end
    end
    -- Debug time point 5
    table.insert(t, r:clock() - tnow)
    tnow = r:clock()
    
    -- Get years active (first year); cached via r:ivm_* under a key that
    -- changes every 600 seconds
    local nowish = math.floor(os.time()/600)
    local firstYear = r:ivm_get("firstYear:" .. nowish .. ":" ..get.list .. "@" .. get.domain)
    if (not firstYear or firstYear == "") and not statsOnly then
        local doc = elastic.raw {
            query = {
                bool = {
                    must = {
                        {
                        range = {
                            date = {
                                gt = "1970/01/01 00:00:00",
                            }
                        }
                    },sterm
                }}
            },
            
            sort = {
                {
                    epoch = {
                        order = "asc"
                    }
                }  
            },
            size = 1
        }
        firstYear = tonumber(os.date("%Y", doc.hits.hits[1] and doc.hits.hits[1]._source.epoch or os.time()))
        r:ivm_set("firstYear:" .. nowish .. ":" .. get.list .. "@" .. get.domain, firstYear)
    end
    
    -- Get years active (last year)
    local lastYear = r:ivm_get("lastYear:" .. nowish .. ":" ..get.list .. "@" .. get.domain)
    if (not lastYear or lastYear == "")  and not statsOnly then
        local doc = elastic.raw {
    
            query = {
                bool = {
                    must = {
                        {
                        range = {
                            date = {
                                gt = "1970/01/01 00:00:00",
                            }
                        }
                    },sterm
                }}
            },
            
            sort = {
                {
                    epoch = {
                        order = "desc"
                    }
                }  
            },
            size = 1
        }
        lastYear = tonumber(os.date("%Y", doc.hits.hits[1] and doc.hits.hits[1]._source.epoch or os.time()))
        r:ivm_set("lastYear:"  .. nowish .. ":" ..get.list .. "@" .. get.domain, lastYear)
    end
    
    
    -- Debug time point 6
    table.insert(t, r:clock() - tnow)
    tnow = r:clock()

    -- Get threads
    local threads = {}      -- top-level thread nodes
    local emails = {}       -- thread nodes keyed by message-id (or by subject)
    local emails_full = {}
    local emls = {}         -- flat list of emails returned to the client
    local senders = {}      -- per-sender tallies when slow_count is off
    local doc = elastic.raw {
        _source = {'message-id','in-reply-to','from','subject','epoch','references','list_raw', 'private', 'attachments', 'body'},
        query = {
            bool = {
                must = {
                    {
                        range = {
                            date = daterange
                        }
                    },
                    sterm,
                    {
                        query_string = {
                            default_field = "subject",
                            query = qs
                        }
                    }
            },
                must_not = {
                    {
                        query_string = {
                            default_field = "subject",
                            query = nqs
                        }
                    }
            }}
        },
        
        sort = {
            {
                epoch = {
                    order = "desc"
                }
            }  
        },
        size = maxresults
    }
    local h = #doc.hits.hits
    
    -- Debug time point 7
    table.insert(t, r:clock() - tnow)
    tnow = r:clock()
    
    -- Sometimes ES screws up, so let's sort for it!
    table.sort (doc.hits.hits, function (k1, k2) return k1._source.epoch > k2._source.epoch end )
    
    -- Walk the hits from oldest to newest so parents are seen before replies
    for k = #doc.hits.hits, 1, -1 do
        local v = doc.hits.hits[k]
        local email = v._source
        local canUse = true
        if email.private then
            if account and not rights then
                rights = aaa.rights(r, account)
            end
            canUse = false
            if account then
                -- Private mail is usable only if one of the account's rights
                -- is "*" or equals the domain part of list_raw
                local lid = email.list_raw:match("<[^.]+%.(.-)>")
                for _, right in pairs(rights or {}) do
                    if right == "*" or right == lid then
                        canUse = true
                        break
                    end
                end
            end
        end
        if canUse then
            
            if not config.slow_count then
                local eml = email.from:match("<(.-)>") or email.from:match("%S+@%S+") or "unknown"
                local gravatar = r:md5(eml)
                local name = email.from:match("([^<]+)%s*<.->") or email.from:match("%S+@%S+") or "unknown"
                email.gravatar = gravatar
                name = name:gsub("\"", ""):gsub("%s+$", "")
                local eid = ("%s <%s>"):format(name, eml)
                senders[eid] = senders[eid] or {
                    email = eml,
                    gravatar = gravatar,
                    name = name,
                    count = 0
                }
                senders[eid].count = senders[eid].count + 1
            end
            local mid = email['message-id']
            local irt = email['in-reply-to']
            -- Truthy only when the message carries a non-empty in-reply-to.
            -- Computed once here so the nil case is handled the same way
            -- everywhere below.
            local hasIrt = irt and irt ~= JSON.null and #irt > 0
            email.id = v._id
            email.irt = irt
            emails[mid] = {
                tid = v._id,
                nest = 1,
                epoch = email.epoch,
                children = {
                    
                }
            }
            
            if not hasIrt then
                irt = ""
            end
            if not emails[irt] and email.references and email.references ~= JSON.null then
                for ref in email.references:gmatch("([^%s]+)") do
                    if emails[ref] then
                        irt = ref
                        break
                    end
                end
            end
            
            if not irt or irt == JSON.null or #irt == 0 then
                irt = email.subject:gsub("^[a-zA-Z]+:%s+", "")
            end
            
            -- If we can't match by in-reply-to or references, match/group by subject, ignoring Re:/Fwd:/etc
            if not emails[irt] then
                irt = email.subject:gsub("^[a-zA-Z]+:%s+", "")
                while irt:match("^[a-zA-Z]+:%s+") do
                    irt = irt:gsub("^[a-zA-Z]+:%s+", "")
                end
            end
            if emails[irt] then
                -- Attach to the known parent, capping nesting depth at 50
                if emails[irt].nest < 50 then
                    emails[mid].nest = emails[irt].nest + 1
                    table.insert(emails[irt].children, emails[mid])
                end
            else
                if hasIrt then
                    -- Reply to something we have not seen: start a thread
                    -- keyed by the stripped subject with this email as child
                    emails[irt] = {
                        children = {
                            emails[mid]
                        },
                        nest = 1,
                        epoch = email.epoch,
                        tid = v._id
                    }
                    emails[mid].nest = emails[irt].nest + 1
                    table.insert(threads, emails[irt])
                else
                    table.insert(threads, emails[mid])
                end
                if not statsOnly then
                    threads[#threads].body = #email.body < 300 and email.body or email.body:sub(1,300) .. "..."
                end
            end
            email.references = nil
            email.to = nil
            email['in-reply-to'] = nil
            if not account and config.antispam then
                email.from = email.from:gsub("(%S+)@(%S+)", function(a,b) return a:sub(1,2) .. "..." .. "@" .. b end)
            end
            if email.attachments then
                email.attachments = #email.attachments
            else
                email.attachments = 0
            end
            email.body = nil
            if not statsOnly then
                table.insert(emls, email)
            else
                table.insert(emls, {epoch= email.epoch})
            end
        elseif config.slow_count then
            -- Unusable (private) email: take it back out of the aggregated counts
            for _, part in pairs(top10) do
                local eml = email.from:match("<(.-)>") or email.from:match("%S+@%S+") or "unknown"
                if part.email == eml then
                    part.count = part.count - 1
                end
            end
        end
    end
    
    if not config.slow_count and not statsOnly then
        local stable = {}
        for _, v in pairs(senders) do
            table.insert(stable, v)
        end
        table.sort(stable, function(a,b) return a.count > b.count end )
        allparts = #stable
        for k, v in pairs(stable) do
            if k <= 10 then
                table.insert(top10, v)
            else
                break
            end
        end
    end
    
    -- Drop participants whose count fell to zero or below. The list is
    -- rebuilt rather than nil-ing slots in place, so the array that gets
    -- JSON-encoded has no holes.
    local kept = {}
    for _, v in ipairs(top10) do
        if v.count > 0 then
            table.insert(kept, v)
        end
    end
    top10 = kept
    
    -- anonymize emails if not logged in - anti-spam!
    if not account and config.antispam then
        for k, v in pairs(top10) do
            top10[k].email = top10[k].email:gsub("(%S+)@(%S+)", function(a,b) return a:sub(1,2) .. "..." .. "@" .. b end)
        end
    end
    
    -- Debug time point 8
    table.insert(t, r:clock() - tnow)
    tnow = r:clock()
    -- NOTE(review): `threads` is a plain list without a `children` field, so
    -- this call appears to return without sorting anything - confirm whether
    -- per-thread sorting was intended before changing it (it would alter the
    -- order of the emitted thread_struct).
    sortEmail(threads)
    
    -- Debug time point 9
    table.insert(t, r:clock() - tnow)
    tnow = r:clock()
    if JSON.encode_max_depth then
        JSON.encode_max_depth(500)
    end
    listdata.max = maxresults
    listdata.using_wc = wc
    listdata.no_threads = #threads
    if not statsOnly then
        listdata.thread_struct = threads
    end
    listdata.firstYear = firstYear
    listdata.lastYear = lastYear
    listdata.list = listraw:gsub("^([^.]+)%.", "%1@"):gsub("[<>]+", "")
    listdata.emails = emls
    listdata.hits = h
    listdata.searchlist = listraw
    listdata.participants = top10
    listdata.cloud = cloud
    listdata.took = r:clock() - now
    listdata.numparts = allparts
    
    
    -- Debug time point 10
    table.insert(t, r:clock() - tnow)
    tnow = r:clock()
    
    listdata.debug = t
    
    r:puts(JSON.encode(listdata))
    
    return cross.OK
end

-- Hand handle() to the cross library's start routine.
-- NOTE(review): presumably this wires handle() up as the request entry point
-- for the hosting environment - confirm in lib/cross.
cross.start(handle)
