-- blob: 3e82bdbdbc7f145093ac4ec2ce682f5b8efcdecd
--[[
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
]]--
-- This is stats.lua - the main stats/ML rendering script for the web.
local JSON = require 'cjson'
local elastic = require 'lib/elastic'
local user = require 'lib/user'
local aaa = require 'lib/aaa'
local config = require 'lib/config'
local cross = require 'lib/cross'
local utils = require 'lib/utils'
local BODY_MAXLEN = config.stats_maxBody or 200
-- words to exclude from word cloud:
local EXCLUDE = config.stats_wordExclude or ".|..|..."
-- Recursively order a thread tree so that, at every level, child replies
-- are listed newest-first (descending epoch). Mutates the tree in place.
local function sortEmail(thread)
    local kids = thread.children
    -- type() check alone suffices: nil is not "table"
    if type(kids) == "table" then
        table.sort(kids, function(a, b) return a.epoch > b.epoch end)
        for _, child in pairs(kids) do
            sortEmail(child)
        end
    end
end
-- findSubject: match an email with an earlier one with the same topic
-- used for orphaned emails
-- findSubject: match an email with an earlier one carrying the same topic.
-- Used to attach orphaned emails (no usable in-reply-to/references) to an
-- existing thread.
--   gblob:   map of message-id/subject keys -> thread nodes
--   blob:    flat list of emails processed so far (subject, epoch, message-id)
--   subject: subject to look up; one leading "Re: "/"Fwd: "-style prefix is stripped
--   epoch:   only emails strictly older than this are considered
--   maxAge:  optional window in days; older candidates are skipped
-- Returns the matching thread node from gblob, or nil.
local function findSubject(gblob, blob, subject, epoch, maxAge)
    -- BUGFIX: was "^[A-Za-z]:%s+", which only stripped single-letter
    -- prefixes ("R: ") and missed "Re: "/"Fwd: "; every other prefix strip
    -- in this file uses the multi-letter form "^[a-zA-Z]+:%s+".
    local subj = subject:gsub("^[A-Za-z]+:%s+", "")
    for k, v in pairs(blob) do
        if v.subject and v.subject == subj and v.epoch < epoch and (not maxAge or (maxAge and v.epoch >= (epoch - (maxAge*86400)))) then
            local mid = v['message-id']
            if gblob[mid] then
                return gblob[mid]
            end
        end
    end
    return nil
end
-- extract canonical email name from from field
-- Pull a human-readable sender name out of a raw From: header.
-- Fallback order: display-name before "<addr>", then a bare email
-- address, then a name in parentheses, then the literal "unknown".
-- Double quotes are dropped and trailing whitespace trimmed.
local function extractCanonName(from)
    local name = from:match("([^<]+)%s*<.->")
    if not name then
        name = from:match("%S+@%S+")
    end
    if not name then
        name = from:match("%((.-)%)")
    end
    if not name then
        name = "unknown"
    end
    name = name:gsub("\"", "")
    -- NB: like the original, this propagates gsub's extra return value
    -- (the substitution count); single-value callers truncate it.
    return name:gsub("%s+$", "")
end
-- handle(r): main mod_lua request handler. Builds a JSON statistics view for
-- one mailing list (or a wildcard selection): matching email summaries,
-- thread structure, top-10 participants, a subject word cloud and the
-- first/last activity span. Query parameters are documented inline below.
-- Returns cross.OK after writing the JSON body to the response.
function handle(r)
cross.contentType(r, "application/json; charset=UTF-8")
local DEBUG = config.debug or false
-- t collects per-phase timings (only when DEBUG); START/tnow are clock marks
local t = {}
local START = DEBUG and r:clock() or nil
local tnow = START
local get = r:parseargs()
-- statsOnly: Whether to only send statistical info (for n-grams etc), and not the
-- thread struct and message bodies
-- Param: quick
local statsOnly = get.quick
-- Param: list=<listname> or '*' (required)
-- Param: domain=<domain> or '*' (required)
if not get.list or not get.domain then
r:puts("{}")
return cross.OK
end
-- emailsOnly: return email summaries only, not derived data:
-- i.e. omit thread_struct, top 10 participants and word-cloud
local emailsOnly = get.emailsOnly
local qs = "*" -- positive query
local nqs = "" -- negative query
local dd = "lte=1M"
local maxresults = config.maxResults or 5000
local account = user.get(r)
-- Param: d=nnnnn (numeric)
-- does not appear to be supported below
if get.d and tonumber(get.d) and tonumber(get.d) > 0 then
dd = tonumber(get.d)
end
-- Param: q=query
-- Builds qs (must-match) and nqs (must-not-match) query_string fragments
-- by expanding each search word over the from/subject/body fields.
if get.q and #get.q > 0 then
local x = {}
local nx = {}
-- '+' in a form-decoded query means space; a bare '+' is a literal (not a
-- quantifier) in Lua patterns, so this gsub is safe
local q = get.q:gsub("+", " ") -- apache quirk?
for k, v in pairs({'from','subject','body'}) do
local y = {}
local z = {}
local words = {}
-- first, grab all "foo bar" quotes
-- NOTE(review): the capture includes the surrounding quotes, so the
-- removal below looks for ""foo bar"" and likely never matches --
-- phrase words are then also re-added individually. Verify intent.
for lword in q:gmatch([[("[^"]+")]]) do
table.insert(words, lword)
end
-- then cut them out of the query
for _, word in pairs(words) do
q = q:gsub('"' .. word:gsub('[.%-%%%?%+]', "%%%1") .. '"', "")
end
-- then remaining single words
for word in q:gmatch("(%S+)") do
table.insert(words, word)
end
for _, word in pairs(words) do
-- a leading '-' marks a negated term (goes into the must_not list)
local preface = ""
if word:match("^%-") then
preface = "-"
word = word:sub(2)
end
if preface == "" then
table.insert(y, ("%s:\"%s\""):format(v, r:escape_html( word:gsub("[()\"]+", "") )))
else
table.insert(z, ("%s:\"%s\""):format(v, r:escape_html( word:gsub("[()\"]+", "") )))
end
end
if #y > 0 then
table.insert(x, "(" .. table.concat(y, " AND ") .. ")")
end
if #z > 0 then
table.insert(nx, "(" .. table.concat(z, " OR ") .. ")")
end
end
qs = table.concat(x, " OR ")
if qs == "" then
qs = "*"
end
nqs = table.concat(nx, " OR ")
end
-- list_raw is stored as "<list.domain>" in the index
local listraw = "<" .. get.list .. "." .. get.domain .. ">"
local listdata = {
name = get.list,
domain = get.domain
}
local z = {}
-- Param: header_from=
-- Param: header_subject=
-- Param: header_body=
-- Param: header_to=
-- Exact header filters; ANDed onto the free-text query built above.
for k, v in pairs({'from','subject','body', 'to'}) do
if get['header_' .. v] then
local word = get['header_' .. v]
table.insert(z, ("(%s:\"%s\")"):format(v, r:escape_html( word:gsub("[()\"]+", "") )))
end
end
if #z > 0 then
if #qs > 0 and qs ~= "*" then
qs = qs .. " AND (" .. table.concat(z, " AND ") .. ")"
else
qs = table.concat(z, " AND ")
end
end
-- Debug time point 1
if DEBUG then
table.insert(t, r:clock() - tnow)
tnow = r:clock()
end
-- Default window: the past month up to tomorrow; refined by the params below
local daterange = {gt = "now-1M", lte = "now+1d" }
-- Param: dfrom=.*ddd (days ago to start)
-- Param: dto=dddd.* (total days to match)
-- Must both be present
if get.dfrom and get.dto then
local ef = tonumber(get.dfrom:match("(%d+)$")) or 0
local et = tonumber(get.dto:match("^(%d+)")) or 0
if ef > 0 and et > 0 then
-- clamp the span so it never extends past "now"
if et > ef then
et = ef
end
daterange = {
gte = "now-" .. ef .. "d",
lte = "now-" .. (ef-et) .. "d"
}
end
end
if not get.d then
get.d = dd
end
-- d=YYYY-mm translates into s+e being equal to d
-- Param: d=yyyy-mm
if not (get.s and get.e) and get.d and get.d:match("^%d+%-%d+$") then
get.s = get.d
get.e = get.d
end
-- Param: d=.*lte=n[wMyd].* (less than or equal to n days/weeks/etc ago)
-- from now-nP to now+1d (P=period)
if get.d then
local lte = get.d:match("lte=([0-9]+[wMyd])")
if lte then
daterange.lte = "now+1d"
daterange.gte = "now-" .. lte
daterange.gt = nil
end
end
-- Param: d=.*gte=n[wMyd].* (greater than or equal to n days/weeks/etc ago)
-- before now-nP (P=period)
if get.d then
local gte = get.d:match("gte=([0-9]+[wMyd])")
if gte then
daterange.gte = nil
daterange.gt = nil
daterange.lte = "now-" .. gte
end
end
-- Param: d=.*dfr=yyyy-mm-dd.* (dates from)
-- start date for search
if get.d then
local y,m,d = get.d:match("dfr=(%d+)%-(%d+)%-(%d+)")
if y and m and d then
daterange.gte = ("%04u/%02u/%02u 00:00:00"):format(y,m,d)
daterange.gt = nil
end
end
-- Param: d=.*dto=yyyy-mm-dd.* (dates up to)
-- end date for search
if get.d then
local y,m,d = get.d:match("dto=(%d+)%-(%d+)%-(%d+)")
if y and m and d then
daterange.lte = ("%04u/%02u/%02u 23:59:59"):format(y,m,d)
daterange.gt = nil
end
end
-- Param: s=yyyy-m[m]
-- Param: e=yyyy-m[m]
-- Explicit month span; overrides everything derived above.
if get.s and get.e then
local em = tonumber(get.e:match("%-(%d%d?)$"))
local ey = tonumber(get.e:match("^(%d%d%d%d)"))
local ec = utils.lastDayOfMonth(ey, em)
daterange = {
gte = get.s:gsub("%-","/").."/01 00:00:00",
lte = get.e:gsub("%-","/").."/" .. ec .. " 23:59:59",
}
end
local wc = false -- wildcard?
local sterm = { -- which lists to process
term = {
list_raw = listraw
}
}
-- list=* : any list under the given domain (regexp on list_raw)
if get.list == "*" then
wc = true
sterm = {
regexp = {
list_raw = "\\<[^.]+\\." .. get.domain .. "\\>"
}
}
end
-- domain=* : any list anywhere (takes precedence over list=* if both set)
if get.domain == "*" then
wc = true
sterm = {
wildcard = {
list = "*"
}
}
end
--[[ ============ Completed all parameter parsing, now process the data ============ ]]
-- Check for changes?
-- Param: since=epoch (optional, defaults to now)
-- Cheap freshness probe: if nothing matching arrived after 'since',
-- short-circuit with {changed:false} and skip all the heavy work.
if get.since then
local epoch = tonumber(get.since) or os.time()
local doc = elastic.raw {
_source = {'message-id'},
query = {
bool = {
must = {
{
range = {
epoch = {
gt = epoch
}
}
},
{
range = {
date = daterange
}
},
sterm,
{
query_string = {
default_field = "subject",
query = qs
}
}
},
must_not = {
{
query_string = {
default_field = "subject",
query = nqs
}
}
}
}
},
size = 1
}
if #doc.hits.hits == 0 then
r:puts(JSON.encode{
changed = false,
took = DEBUG and (r:clock() - START) or nil
})
return cross.OK
end
end
-- Debug time point 2
if DEBUG then
table.insert(t, r:clock() - tnow)
tnow = r:clock()
end
local cloud = nil
if config.wordcloud and not statsOnly and not emailsOnly then
cloud = {}
-- Word cloud!
-- Top subject terms via a terms aggregation, public emails only;
-- terminate_after caps the per-shard document scan at 100.
local doc = elastic.raw {
size = 0, -- we don't need the hits themselves
terminate_after = 100,
aggs = {
cloud = {
terms = {
field = "subject",
size = 10,
exclude = EXCLUDE
}
}
},
query = {
bool = {
must = {
{
range = {
date = daterange
}
},
{
query_string = {
default_field = "subject",
query = qs
}
},
sterm,
{
term = {
private = false
}
}
},
must_not = {
{
query_string = {
default_field = "subject",
query = nqs
}
}
}
}
}
}
for x,y in pairs (doc.aggregations.cloud.buckets) do
cloud[y.key] = y.doc_count
end
end
-- Debug time point 3
if DEBUG then
table.insert(t, r:clock() - tnow)
tnow = r:clock()
end
-- Get years active
-- Cached per 10-minute bucket (os.time()/600) and per list in the
-- inter-VM store, since the min/max aggregation below is expensive.
local NOWISH = math.floor(os.time()/600)
local DATESPAN_KEY = "dateSpan:" .. NOWISH .. ":" .. get.list .. "@" .. get.domain
local datespan = JSON.decode(r:ivm_get(DATESPAN_KEY) or "{}")
if not (datespan.pubfirst and datespan.publast) then
local doc = elastic.raw {
size = 0,
query = {
bool = {
must = {
{
range = {
epoch = {
gt = 0
}
}
},
sterm
}
}
},
aggs = {
lists = {
terms = {
field = "list_raw",
size = utils.MAX_LIST_COUNT
},
aggs = {
private = {
terms = {
field = "private",
size = 2
},
aggs = {
first = {
min = {
field = "epoch"
}
},
last = {
max = {
field = "epoch"
}
}
}
}
}
}
}
}
datespan = {}
datespan.pubfirst = nil
datespan.publast = nil
-- find public min and max (buckets will be empty if there are no matching lists)
for _, list in pairs(doc.aggregations.lists.buckets) do
for _, private in pairs(list.private.buckets) do
if private.key_as_string == "false" then
if (datespan.publast == nil) or (private.last.value > datespan.publast) then datespan.publast = private.last.value end
if (datespan.pubfirst == nil) or (private.first.value < datespan.pubfirst) then datespan.pubfirst = private.first.value end
end
end
end
if datespan.publast == nil then -- did not find any values
local NOW = os.time()
datespan.publast = NOW
datespan.pubfirst = NOW
end
-- find private min and max and store them if they could change the public ones
-- store the list entries under the 'private' key to make them easier to process
for _, list in pairs(doc.aggregations.lists.buckets) do
for _, private in pairs(list.private.buckets) do
if private.key_as_string == "true" then
local prvlast = private.last.value
if prvlast > datespan.publast then
datespan.private = datespan.private or {}
datespan.private[list.key] = datespan.private[list.key] or {}
datespan.private[list.key].last = prvlast
end
local prvfirst = private.first.value
if prvfirst < datespan.pubfirst then
datespan.private = datespan.private or {}
datespan.private[list.key] = datespan.private[list.key] or {}
datespan.private[list.key].first = prvfirst
end
end
end
end
r:ivm_set(DATESPAN_KEY, JSON.encode(datespan))
end
-- process the raw list data:
-- get the first and last dates, adjusting as necessary to allow for private lists
-- (private spans only widen the range for users who can access those lists)
local first = datespan.pubfirst
local last = datespan.publast
for lid, prvdates in pairs(datespan.private or {}) do
if aaa.canAccessList(r, lid, account) then
if prvdates.first and prvdates.first < first then first = prvdates.first end
if prvdates.last and prvdates.last > last then last = prvdates.last end
end
end
-- extract years and months for response (using UTC, i.e. leading !)
datespan.firstYear = tonumber(os.date("!%Y", first))
datespan.firstMonth = tonumber(os.date("!%m", first))
datespan.lastYear = tonumber(os.date("!%Y", last))
datespan.lastMonth = tonumber(os.date("!%m", last))
-- Debug time point 4
if DEBUG then
table.insert(t, r:clock() - tnow)
tnow = r:clock()
end
-- Get threads
local threads = {} -- top-level thread nodes
local emails = {}  -- message-id/subject key -> thread node
local emls = {}    -- flat list of email summaries for the response
local senders = {} -- canonical sender id -> participant stats
local dhh = {}     -- raw ES hits
-- construct thread query
local squery = {
_source = {'message-id','in-reply-to','from','subject','epoch','references','list_raw', 'private', 'attachments', 'body'},
query = {
bool = {
must = {
{
range = {
date = daterange
}
},
sterm,
{
query_string = {
default_field = "subject",
query = qs
}
}
},
must_not = {
{
query_string = {
default_field = "subject",
query = nqs
}
}
}
}
},
sort = {
{
epoch = {
order = "desc"
}
}
},
size = maxresults
}
-- If max results limit is beyond the limit, we have to do a scroll to fetch it.
if maxresults > elastic.MAX_RESULT_WINDOW then
squery.size = elastic.MAX_RESULT_WINDOW -- limit the maximum batch sizes
local js, sid = elastic.scroll(squery)
while js and js.hits and js.hits.hits and #js.hits.hits > 0 do -- scroll as long as we get new results
for k, v in pairs(js.hits.hits) do
table.insert(dhh, v)
end
js, sid = elastic.scroll(sid)
end
elastic.clear_scroll(sid) -- we're done with the sid, release it
-- otherwise, we can just do a standard raw query
else
local doc = elastic.raw(squery)
dhh = doc.hits.hits
end
-- Debug time point 5
if DEBUG then
table.insert(t, r:clock() - tnow)
tnow = r:clock()
end
-- Hits are sorted newest-first; walk them backwards (oldest first) so a
-- parent is registered in 'emails' before any of its replies arrive.
for k = #dhh, 1, -1 do
local v = dhh[k]
local email = v._source
if aaa.canAccessDoc(r, email, account) then
local eml = utils.extractCanonEmail(email.from)
local gravatar = r:md5(eml:lower())
email.gravatar = gravatar
local name = extractCanonName(email.from)
local eid = ("%s <%s>"):format(name, eml)
-- tally per-sender counts for the top-10 participants list
if not statsOnly and not emailsOnly then
senders[eid] = senders[eid] or {
email = eml,
gravatar = gravatar,
name = name,
count = 0
}
senders[eid].count = senders[eid].count + 1
end
local mid = email['message-id']
local irt = email['in-reply-to']
email.id = v._id
email.irt = irt
-- only calculate threads if necessary
if not emailsOnly then
emails[mid] = {
tid = v._id,
nest = 1,
epoch = email.epoch,
children = {
}
}
if not irt or irt == JSON.null or #irt == 0 then
irt = ""
end
-- no direct in-reply-to match: scan References for any known ancestor
if not emails[irt] and email.references and email.references ~= JSON.null then
for ref in email.references:gmatch("([^%s]+)") do
if emails[ref] then
irt = ref
break
end
end
end
if not irt or irt == JSON.null or #irt == 0 then
irt = email.subject:gsub("^[a-zA-Z]+:%s+", "")
end
-- If we can't match by in-reply-to or references, match/group by subject, ignoring Re:/Fwd:/etc
if not emails[irt] then
irt = email.subject:gsub("^[a-zA-Z]+:%s+", "")
-- strip stacked prefixes like "Re: Fwd: Re: "
while irt:match("^[a-zA-Z]+:%s+") do
irt = irt:gsub("^[a-zA-Z]+:%s+", "")
end
end
local refpoint = irt or email['references'] or ""
local point = emails[irt] or (#refpoint > 0 and findSubject(emails, emls, irt, email.epoch))
-- Try a little harder??
if not point and email.subject:match("^[A-Za-z]+:%s+") then -- if this is a 'Re:' or 'Aw:' or 'Fwd:', try to find parent anyway
point = findSubject(emails, emls, irt, email.epoch, 30) -- at most, go back 30 days. if not, then they don't belong together...I guess
end
if point then
-- cap nesting at 50 to keep pathological threads bounded
if point.nest < 50 then
point.nest = point.nest + 1
table.insert(point.children, emails[mid])
end
else
-- no parent found: start a new top-level thread (keyed by normalized
-- subject when available, so later replies can find it)
if (irt ~= JSON.null and #irt > 0) then
emails[irt] = {
children = {
emails[mid]
},
nest = 1,
epoch = email.epoch,
tid = v._id
}
emails[mid].nest = emails[irt].nest + 1
table.insert(threads, emails[irt])
else
table.insert(threads, emails[mid])
end
end
-- needed for emailsOnly
email.references = nil
end
-- strip fields not needed in the response payload
email.to = nil
-- Duplicate of email.irt
email['in-reply-to'] = nil
-- anonymize sender addresses for anonymous visitors (anti-spam)
if not account and config.antispam then
email.from = email.from:gsub("(%S+)@(%S+)", function(a,b) return a:sub(1,2) .. "..." .. "@" .. b end)
end
if email.attachments then
email.attachments = #email.attachments
else
email.attachments = 0
end
-- truncate body previews to BODY_MAXLEN characters
email.body = #email.body < BODY_MAXLEN and email.body or email.body:sub(1, BODY_MAXLEN) .. "..."
if not statsOnly then
table.insert(emls, email)
else
table.insert(emls, {epoch= email.epoch})
end
end
end
local allparts = 0 -- number of participants
local top10 = {}
if not statsOnly and not emailsOnly then
local stable = {}
for k, v in pairs(senders) do
table.insert(stable, v)
end
table.sort(stable, function(a,b) return a.count > b.count end )
allparts = #stable
for k, v in pairs(stable) do
if k <= 10 then
table.insert(top10, v)
else
break
end
end
end
-- anonymize emails if not logged in - anti-spam!
if not account and config.antispam then
for k, v in pairs(top10) do
top10[k].email = top10[k].email:gsub("(%S+)@(%S+)", function(a,b) return a:sub(1,2) .. "..." .. "@" .. b end)
end
end
-- Debug time point 6
if DEBUG then
table.insert(t, r:clock() - tnow)
tnow = r:clock()
end
sortEmail(threads)
-- Debug time point 7
if DEBUG then
table.insert(t, r:clock() - tnow)
tnow = r:clock()
end
-- deep thread trees can exceed cjson's default encode depth
if JSON.encode_max_depth then
-- TODO see #439
JSON.encode_max_depth(1000)
end
-- assemble the JSON response
listdata.max = maxresults
listdata.using_wc = wc
listdata.no_threads = #threads
if not statsOnly and not emailsOnly then
listdata.thread_struct = threads
end
listdata.firstYear = datespan.firstYear
listdata.lastYear = datespan.lastYear
listdata.firstMonth = datespan.firstMonth
listdata.lastMonth = datespan.lastMonth
-- turn "<list.domain>" back into "list@domain" for display
listdata.list = listraw:gsub("^([^.]+)%.", "%1@"):gsub("[<>]+", "")
listdata.emails = emls
listdata.hits = #emls
listdata.searchlist = listraw
listdata.participants = top10
listdata.cloud = cloud
if DEBUG then
listdata.took = r:clock() - START
listdata.debug = t
end
listdata.numparts = allparts
listdata.unixtime = os.time()
r:puts(JSON.encode(listdata))
return cross.OK
end
-- Hand the handler to the compatibility layer; presumably registers/invokes
-- handle() for the current server environment -- see lib/cross for details.
cross.start(handle)