Move download stats to a plugin, add support for monthly-rounded stats
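
The Elasticsearch query logic moves out of the downloads endpoint into a
reusable plugin module, server/app/plugins/downloads.py, so that scheduled
tasks and other endpoints can call generate_stats() directly. Durations may
now also be given as Elasticsearch month math, which rounds to whole calendar
months; an optional "datadir" config entry prepares the ground for persisting
monthly reports.

Illustrative usage, as a sketch only: the import path assumes the server
package layout, and the calls must run inside the app's event loop.

    from app.plugins import downloads

    # Last 7 whole days, same behaviour as before
    weekly = await downloads.generate_stats("httpd", 7)

    # The previous calendar month, via ES date-math rounding
    monthly = await downloads.generate_stats("httpd", "now-1M/M")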
diff --git a/server/app/endpoints/downloads.py b/server/app/endpoints/downloads.py
index 692f920..0286bdf 100644
--- a/server/app/endpoints/downloads.py
+++ b/server/app/endpoints/downloads.py
@@ -20,259 +20,14 @@
"""Handler for download stats - ported from https://github.com/apache/infrastructure-dlstats"""
import quart
-from ..lib import middleware, config, asfuid
+from ..lib import middleware, asfuid
-import elasticsearch
-import elasticsearch_dsl
-from .. import plugins
-import re
-import time
-import ua_parser.user_agent_parser
-
-MAX_HITS = 60 # Max number of artifacts to track in a single search
-MAX_HITS_UA = 60 # Max number of user agents to collate
-DOWNLOADS_CACHE_ITEMS = 200 # Keep the 200 latest search results in cache. 200 results is ~50MB
-DOWNLOADS_CACHE_TTL = 7200 # Only cache items for 2 hours
-
-INTERNAL_AGENTS = {
- "Windows Package Manager": ("winget-cli", "Microsoft-Delivery-Optimization", "WindowsPackageManager", "Microsoft BITS",),
- "NSIS (plugin)": ("NSIS_Inetc", ),
- "Transmission": ("Transmission/", ),
- "Free Download Manager": ("FDM", ),
- "Patch My PC Client": ("Patch My PC Publishing Service", ),
- "Artifactory": ("Artifactory", ),
- "Scoop/Shovel": ("Scoop/", "Shovel/", ),
- "BigFix": ("BigFix", ),
-}
-
-# Different indices have different field names, account for it here:
-FIELD_NAMES = {
- "fastly": { # the index prefix
- "geo_country": "geo_country_code",
- "bytes": "bytes",
- "vhost": "vhost",
- "uri": "url",
- "timestamp": "timestamp",
- "_vhost_": "dlcdn.apache.org", # This is a variable field value, not a name
- "request_method": "request",
- "useragent": "request_user_agent",
- },
- "loggy": { # the index prefix
- "geo_country": "geo_country",
- "bytes": "bytes",
- "vhost": "vhost",
- "uri": "uri",
- "timestamp": "@timestamp",
- "_vhost_": "downloads.apache.org", # This is a variable field value, not a name
- "request_method": "request_method",
- "useragent": "useragent",
- },
-}
-
-dataurl = "http://localhost:9200"
-if hasattr(config.reporting, "downloads"): # If prod...
- dataurl = config.reporting.downloads["dataurl"]
-
-es_client = elasticsearch.AsyncElasticsearch(hosts=[dataurl], timeout=45)
-
-# WARNING: whilst operations on lists are generally thread-safe, this cache is not,
-# because updating the cache requires several operations which are not currently protected by a lock.
-# However, it appears that access to instances of this code are single-threaded by hypercorn,
-# so the lack of thread safety should not be a problem.
-downloads_data_cache = []
-
-async def make_query(provider, field_names, project, duration, filters, max_hits=MAX_HITS, max_ua=MAX_HITS_UA, downscaled=False):
- q = elasticsearch_dsl.Search(using=es_client)
- q = q.filter("range", **{field_names["timestamp"]: {"gte": f"now-{duration}d"}})
- q = q.filter("match", **{field_names["request_method"]: "GET"})
- q = q.filter("range", bytes={"gt": 5000}) # this filters out hashes and (most?) sigs
- q = q.filter("prefix", **{field_names["uri"] + ".keyword": f"/{project}/"})
- q = q.filter("match", **{field_names["vhost"]: field_names["_vhost_"]})
-
- # Various standard filters for weeding out bogus requests
- if "empty_ua" in filters: # Empty User-Agent header, usually automation gone wrong
- q = q.exclude("terms", **{field_names["useragent"]+".keyword": ["", "-"]})
- # TODO: Make this not extremely slow. For now, we'll filter in post.
- #if "no_query" in filters: # Don't show results with query strings in them
- # q = q.exclude("wildcard", **{field_names["uri"]+".keyword": "*="})
-
- # Bucket sorting by most downloaded items
- main_bucket = q.aggs.bucket(
- "most_downloads", elasticsearch_dsl.A("terms", field=f"{field_names['uri']}.keyword", size=max_hits)
- )
- main_bucket.metric("useragents", "terms", field=field_names["useragent"]+".keyword", size=max_ua)
- main_bucket.bucket("per_day", "date_histogram", interval="day", field=field_names["timestamp"]
- ).metric(
- "bytes_sum", "sum", field=field_names["bytes"]
- ).metric(
- "unique_ips", "cardinality", field="client_ip.keyword"
- ).metric(
- "cca2", "terms", field=field_names["geo_country"] + ".keyword"
- )
-
- # Bucket sorting by most bytes downloaded (may differ from most downloads top 60!)
- main_bucket = q.aggs.bucket(
- "most_traffic", elasticsearch_dsl.A("terms", field=f"{field_names['uri']}.keyword", size=max_hits, order={"bytes_sum": "desc"})
- )
- main_bucket.metric("useragents", "terms", field=field_names["useragent"]+".keyword", size=max_ua)
- main_bucket.metric(
- "bytes_sum", "sum", field=field_names["bytes"]
- ).bucket("per_day", "date_histogram", interval="day", field=field_names["timestamp"]
- ).metric(
- "bytes_sum", "sum", field=field_names["bytes"]
- ).metric(
- "unique_ips", "cardinality", field="client_ip.keyword"
- ).metric(
- "cca2", "terms", field=field_names["geo_country"] + ".keyword"
- )
- try:
- resp = await es_client.search(index=f"{provider}-*", body=q.to_dict(), size=0, timeout="60s")
- if downscaled and resp:
- resp["downscaled"] = True
- return resp
- except elasticsearch.TransportError as e:
- # If too many buckets for us to handle, downscale the UA search
- if isinstance(e.info, dict) and 'too_many_buckets_exception' in e.info["error"].get("caused_by", {}).get("type", ""):
- max_ua = int(max_ua*0.67)
- max_hits = int(max_hits*0.67)
- print(f"Too many buckets for {project}, downscaling query by 33%")
- if max_ua > 2:
- return await make_query(provider, field_names, project, duration, filters, max_hits, max_ua, True)
- return {"downscaled": downscaled}
-
-
+from ..plugins import downloads
+
+
@asfuid.session_required
async def process(form_data):
project = form_data.get("project", "httpd") # Project/podling to fetch stats for
-    duration = form_data.get("duration", 7) # Timespan to search (in whole days)
+    duration = form_data.get("duration", 7) # Timespan to search: whole days, or ES month math like now-1M/M
filters = form_data.get("filters", "empty_ua,no_query") # Various search filters
- if isinstance(duration, str):
- if duration.endswith("d"):
- duration = duration[:-1]
- try:
- duration = int(duration)
- except ValueError:
- return {"success": False, "message": "Invalid duration window! Please specify a whole number of days"}
-
- downloaded_artifacts = {}
-
- # Check if we have a cached result
- cache_found = False
- # TODO: the cache key needs to take account of form_data filters as they affect the content
- cache_key = f"{project}-{duration}"
- cache_timeout_ts = time.time() - DOWNLOADS_CACHE_TTL
- for item in downloads_data_cache: # (cache_key, cache_ts, cache_data)
- if item[0] == cache_key and item[1] >= cache_timeout_ts:
- cache_found = True
- downloaded_artifacts = item[2]
- break
-
- if not cache_found:
- downscaled = False
- for provider, field_names in FIELD_NAMES.items():
- resp = await make_query(provider, field_names, project, duration, filters)
- if "aggregations" not in resp: # Skip this provider if no data is available
- continue
- if resp.get("downscaled"): # Too many damn buckets
- downscaled = True
- for methodology in (
- "most_downloads",
- "most_traffic",
- ):
- for entry in resp["aggregations"][methodology]["buckets"]:
- # url, shortened = /incubator/ponymail/foo.tar.gz -> foo.tar.gz
- url = re.sub(r"/+", "/", entry["key"]).replace(f"/{project}/", "", 1)
- # TODO: Address in OpenSearch later on...
- if "no_query" in filters and "?" in url:
- continue
- if "." not in url or url.endswith("/") or url.endswith("KEYS"): # Never count KEYS or non-files
- continue
- if url not in downloaded_artifacts:
- downloaded_artifacts[url] = {
- "bytes": 0,
- "hits": 0,
- "hits_unique": 0,
- "cca2": {},
- "daily_stats": {},
- "useragents": {},
- }
- no_bytes = 0
- no_hits = 0
- no_hits_unique = 0
- cca2_hits = {}
- daily_data = []
-
- # User Agent (Browser + OS) summation
- uas = {}
- for uaentry in entry["useragents"]["buckets"]:
- ua_agent = uaentry["key"] # the full agent string
- # NOTE: ua_parser will set OS and UA Family to "Other" when it doesn't recognize the UA string.
- ua = ua_parser.user_agent_parser.Parse(ua_agent)
- ua_os_family = ua.get("os", {}).get("family", "Unknown")
- # If OS is "Other", we'll adjust it to "Unknown" ourselves.
- if ua_os_family == "Other":
- ua_os_family = "Unknown"
- # UA family will typically be "Other" when unknown to the parser, we'll address this below.
- # If the family is empty, we'll also set to Other and adjust later on.
- ua_agent_family = ua.get("user_agent", {}).get("family", "Other")
- # Adjust for various package managers we know of
- if ua_agent_family == "Other":
- for ia_key, ia_names in INTERNAL_AGENTS.items():
- if any(x in ua_agent for x in ia_names):
- ua_agent_family = ia_key
- break
- # If we still don't know what this is, mark as "Unknown", to distinguish from the combined "Other" chart group.
- if ua_agent_family == "Other":
- ua_agent_family = "Unknown"
- ua_key = ua_os_family + " / " + ua_agent_family
- uas[ua_key] = uas.get(ua_key, 0) + uaentry["doc_count"]
- for key, val in uas.items():
- # There will be duplicate entries here, so we are going to go for the highest count found for each URL
- downloaded_artifacts[url]["useragents"][key] = max(downloaded_artifacts[url]["useragents"].get(key, 0), val)
-
- for daily_entry in entry["per_day"]["buckets"]:
- day_ts = int(daily_entry["key"] / 1000)
- nb_daily = int(daily_entry["bytes_sum"]["value"])
- nh_daily = int(daily_entry["doc_count"])
- no_bytes += nb_daily
-
- visits_unique = int(daily_entry["unique_ips"]["value"])
- no_hits += nh_daily
- no_hits_unique += visits_unique
-
- for ccaentry in daily_entry["cca2"]["buckets"]:
- cca2 = ccaentry["key"]
- cca2_count = ccaentry["doc_count"]
- if cca2 and cca2 != "-":
- cca2_hits[cca2] = cca2_hits.get(cca2, 0) + cca2_count
- daily_data.append([day_ts, nh_daily, visits_unique, nb_daily])
-
- # The prevailing agg (most hits or most traffic) wins
- if no_bytes > downloaded_artifacts[url]["bytes"]:
- downloaded_artifacts[url]["bytes"] += no_bytes
- downloaded_artifacts[url]["daily_stats"] = daily_data
- if no_hits > downloaded_artifacts[url]["hits"]:
- downloaded_artifacts[url]["hits"] += no_hits
- downloaded_artifacts[url]["daily_stats"] = daily_data
- if no_hits_unique > downloaded_artifacts[url]["hits_unique"]:
- downloaded_artifacts[url]["hits_unique"] += no_hits_unique
- if sum([x for x in cca2_hits.values()]) > sum([x for x in downloaded_artifacts[url]["cca2"].values()]):
- downloaded_artifacts[url]["cca2"] = cca2_hits
- # Ensure all entries are properly marked if query was downscaled
- if downscaled:
- for key, val in downloaded_artifacts.items():
- val["downscaled"] = True
-
- # Set cache data and cull old cache list if needed
- new_cache_list = [item for item in downloads_data_cache if item[1] >= cache_timeout_ts]
- downloads_data_cache.clear()
- downloads_data_cache.extend(new_cache_list)
- # Make sure there is room to add another entry
- # entries are added in date order, so [0] is the oldest
- while len(downloads_data_cache) >= DOWNLOADS_CACHE_ITEMS:
- del downloads_data_cache[0]
- downloads_data_cache.append((cache_key, time.time(), downloaded_artifacts))
-
- return downloaded_artifacts or {"success": False, "message": "No results found"}
+ return await downloads.generate_stats(project, duration, filters)
quart.current_app.add_url_rule(
@@ -283,4 +38,4 @@
view_func=middleware.glued(process),
)
-plugins.root.register(slug="downloads", title="Download Statistics", icon="bi-cloud-download", private=True)
diff --git a/server/app/plugins/__init__.py b/server/app/plugins/__init__.py
index 0961277..060ffc2 100644
--- a/server/app/plugins/__init__.py
+++ b/server/app/plugins/__init__.py
@@ -51,3 +51,4 @@
from . import jirastats
from . import uptime
from . import mailstats
+from . import downloads
diff --git a/server/app/plugins/downloads.py b/server/app/plugins/downloads.py
new file mode 100644
index 0000000..916ef73
--- /dev/null
+++ b/server/app/plugins/downloads.py
@@ -0,0 +1,290 @@
+#!/usr/bin/env python3
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""ASF Infrastructure Reporting Dashboard - Download Statistics Tasks"""
+import asyncio
+from ..lib import middleware, config, asfuid
+import elasticsearch
+import elasticsearch_dsl
+from .. import plugins
+import re
+import time
+import os
+import ua_parser.user_agent_parser
+import aiohttp
+import json
+import datetime
+
+DEFAULT_PROJECTS_LIST = "https://whimsy.apache.org/public/public_ldap_projects.json"
+MAX_HITS = 60 # Max number of artifacts to track in a single search
+MAX_HITS_UA = 60 # Max number of user agents to collate
+DOWNLOADS_CACHE_ITEMS = 200 # Keep the 200 latest search results in cache. 200 results is ~50MB
+DOWNLOADS_CACHE_TTL = 7200 # Only cache items for 2 hours
+PERSISTENT_REPORTS_BACKFILL_MONTHS = 6 # Try to backfill download reports six months back if possible
+
+INTERNAL_AGENTS = {
+ "Windows Package Manager": ("winget-cli", "Microsoft-Delivery-Optimization", "WindowsPackageManager", "Microsoft BITS",),
+ "NSIS (plugin)": ("NSIS_Inetc", ),
+ "Transmission": ("Transmission/", ),
+ "Free Download Manager": ("FDM", ),
+ "Patch My PC Client": ("Patch My PC Publishing Service", ),
+ "Artifactory": ("Artifactory", ),
+ "Scoop/Shovel": ("Scoop/", "Shovel/", ),
+ "BigFix": ("BigFix", ),
+}
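+# (Matching is by substring: e.g. a UA string containing "Scoop/" is grouped
+# under the "Scoop/Shovel" family.)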
+
+# Different indices have different field names, account for it here:
+FIELD_NAMES = {
+ "fastly": { # the index prefix
+ "geo_country": "geo_country_code",
+ "bytes": "bytes",
+ "vhost": "vhost",
+ "uri": "url",
+ "timestamp": "timestamp",
+ "_vhost_": "dlcdn.apache.org", # This is a variable field value, not a name
+ "request_method": "request",
+ "useragent": "request_user_agent",
+ },
+ "loggy": { # the index prefix
+ "geo_country": "geo_country",
+ "bytes": "bytes",
+ "vhost": "vhost",
+ "uri": "uri",
+ "timestamp": "@timestamp",
+ "_vhost_": "downloads.apache.org", # This is a variable field value, not a name
+ "request_method": "request_method",
+ "useragent": "useragent",
+ },
+}
+
+dataurl = "http://localhost:9200"
+datadir = None # Where to store persistent data
+if hasattr(config.reporting, "downloads"): # If prod...
+ dataurl = config.reporting.downloads["dataurl"]
+ datadir = config.reporting.downloads.get("datadir")
+
+es_client = elasticsearch.AsyncElasticsearch(hosts=[dataurl], timeout=45)
+
+if datadir:
+ if not os.path.exists(datadir):
+ print(f"Setting up persistent data dir for download stats: {datadir}")
+ try:
+ os.mkdir(datadir)
+        except OSError as e:
+            print(f"Could not set up data directory {datadir} ({e}), will not store persistent download stats!")
+            datadir = None
+
+# WARNING: whilst operations on lists are generally thread-safe, this cache is not,
+# because updating the cache requires several operations which are not currently protected by a lock.
+# However, it appears that access to instances of this code are single-threaded by hypercorn,
+# so the lack of thread safety should not be a problem.
+downloads_data_cache = []
+
+async def make_query(provider, field_names, project, duration, filters, max_hits=MAX_HITS, max_ua=MAX_HITS_UA, downscaled=False):
+ q = elasticsearch_dsl.Search(using=es_client)
+    if isinstance(duration, str) and "-" in duration: # Whole-month date math, e.g. now-1M/M for the previous calendar month
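+        # Elasticsearch "/M" rounding: "gte" rounds down to the start of the
+        # month and "lte" rounds up to its end, so using the same expression
+        # for both bounds selects exactly one whole calendar month.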
+ q = q.filter("range", **{field_names["timestamp"]: {"gte": duration, "lte": duration}})
+ else:
+ q = q.filter("range", **{field_names["timestamp"]: {"gte": f"now-{duration}d"}})
+ q = q.filter("match", **{field_names["request_method"]: "GET"})
+ q = q.filter("range", bytes={"gt": 5000}) # this filters out hashes and (most?) sigs
+ q = q.filter("prefix", **{field_names["uri"] + ".keyword": f"/{project}/"})
+ q = q.filter("match", **{field_names["vhost"]: field_names["_vhost_"]})
+
+ # Various standard filters for weeding out bogus requests
+ if "empty_ua" in filters: # Empty User-Agent header, usually automation gone wrong
+ q = q.exclude("terms", **{field_names["useragent"]+".keyword": ["", "-"]})
+ # TODO: Make this not extremely slow. For now, we'll filter in post.
+ #if "no_query" in filters: # Don't show results with query strings in them
+ # q = q.exclude("wildcard", **{field_names["uri"]+".keyword": "*="})
+
+ # Bucket sorting by most downloaded items
+ main_bucket = q.aggs.bucket(
+ "most_downloads", elasticsearch_dsl.A("terms", field=f"{field_names['uri']}.keyword", size=max_hits)
+ )
+ main_bucket.metric("useragents", "terms", field=field_names["useragent"]+".keyword", size=max_ua)
+ main_bucket.bucket("per_day", "date_histogram", interval="day", field=field_names["timestamp"]
+ ).metric(
+ "bytes_sum", "sum", field=field_names["bytes"]
+ ).metric(
+ "unique_ips", "cardinality", field="client_ip.keyword"
+ ).metric(
+ "cca2", "terms", field=field_names["geo_country"] + ".keyword"
+ )
+
+ # Bucket sorting by most bytes downloaded (may differ from most downloads top 60!)
+ main_bucket = q.aggs.bucket(
+ "most_traffic", elasticsearch_dsl.A("terms", field=f"{field_names['uri']}.keyword", size=max_hits, order={"bytes_sum": "desc"})
+ )
+ main_bucket.metric("useragents", "terms", field=field_names["useragent"]+".keyword", size=max_ua)
+ main_bucket.metric(
+ "bytes_sum", "sum", field=field_names["bytes"]
+ ).bucket("per_day", "date_histogram", interval="day", field=field_names["timestamp"]
+ ).metric(
+ "bytes_sum", "sum", field=field_names["bytes"]
+ ).metric(
+ "unique_ips", "cardinality", field="client_ip.keyword"
+ ).metric(
+ "cca2", "terms", field=field_names["geo_country"] + ".keyword"
+ )
+ try:
+ resp = await es_client.search(index=f"{provider}-*", body=q.to_dict(), size=0, timeout="60s")
+ if downscaled and resp:
+ resp["downscaled"] = True
+ return resp
+ except elasticsearch.TransportError as e:
+        # If the search produced too many buckets for the cluster to handle, downscale it
+        if isinstance(e.info, dict) and 'too_many_buckets_exception' in e.info.get("error", {}).get("caused_by", {}).get("type", ""):
+ max_ua = int(max_ua*0.67)
+ max_hits = int(max_hits*0.67)
+ print(f"Too many buckets for {project}, downscaling query by 33%")
+ if max_ua > 2:
+ return await make_query(provider, field_names, project, duration, filters, max_hits, max_ua, True)
+ return {"downscaled": downscaled}
+
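+# A sketch of calling make_query directly, e.g. against the fastly index for
+# the last 7 days; on success the response carries the aggregations built above:
+#
+#   resp = await make_query("fastly", FIELD_NAMES["fastly"], "httpd", 7, "empty_ua")
+#   buckets = resp["aggregations"]["most_downloads"]["buckets"]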
+
+async def generate_stats(project: str, duration, filters: str = "empty_ua,no_query"):
+    """Collates download stats for a project. `duration` is either a whole number
+    of days (int, or str optionally suffixed with "d"), or an Elasticsearch month
+    expression such as "now-1M/M" for one whole calendar month."""
+    if isinstance(duration, str) and "-" not in duration: # Date math (e.g. now-1M/M) is passed through as-is
+ if duration.endswith("d"):
+ duration = duration[:-1]
+ try:
+ duration = int(duration)
+ except ValueError:
+ return {"success": False, "message": "Invalid duration window! Please specify a whole number of days"}
+
+ downloaded_artifacts = {}
+
+ # Check if we have a cached result
+ cache_found = False
+ # TODO: the cache key needs to take account of form_data filters as they affect the content
+ cache_key = f"{project}-{duration}"
+ cache_timeout_ts = time.time() - DOWNLOADS_CACHE_TTL
+ for item in downloads_data_cache: # (cache_key, cache_ts, cache_data)
+ if item[0] == cache_key and item[1] >= cache_timeout_ts:
+ cache_found = True
+ downloaded_artifacts = item[2]
+ break
+
+ if not cache_found:
+ downscaled = False
+ for provider, field_names in FIELD_NAMES.items():
+ resp = await make_query(provider, field_names, project, duration, filters)
+ if "aggregations" not in resp: # Skip this provider if no data is available
+ continue
+ if resp.get("downscaled"): # Too many damn buckets
+ downscaled = True
+ for methodology in (
+ "most_downloads",
+ "most_traffic",
+ ):
+ for entry in resp["aggregations"][methodology]["buckets"]:
+ # url, shortened = /incubator/ponymail/foo.tar.gz -> foo.tar.gz
+ url = re.sub(r"/+", "/", entry["key"]).replace(f"/{project}/", "", 1)
+ # TODO: Address in OpenSearch later on...
+ if "no_query" in filters and "?" in url:
+ continue
+ if "." not in url or url.endswith("/") or url.endswith("KEYS"): # Never count KEYS or non-files
+ continue
+ if url not in downloaded_artifacts:
+ downloaded_artifacts[url] = {
+ "bytes": 0,
+ "hits": 0,
+ "hits_unique": 0,
+ "cca2": {},
+ "daily_stats": {},
+ "useragents": {},
+ }
+ no_bytes = 0
+ no_hits = 0
+ no_hits_unique = 0
+ cca2_hits = {}
+ daily_data = []
+
+ # User Agent (Browser + OS) summation
+ uas = {}
+ for uaentry in entry["useragents"]["buckets"]:
+ ua_agent = uaentry["key"] # the full agent string
+ # NOTE: ua_parser will set OS and UA Family to "Other" when it doesn't recognize the UA string.
+ ua = ua_parser.user_agent_parser.Parse(ua_agent)
+ ua_os_family = ua.get("os", {}).get("family", "Unknown")
+ # If OS is "Other", we'll adjust it to "Unknown" ourselves.
+ if ua_os_family == "Other":
+ ua_os_family = "Unknown"
+ # UA family will typically be "Other" when unknown to the parser, we'll address this below.
+ # If the family is empty, we'll also set to Other and adjust later on.
+ ua_agent_family = ua.get("user_agent", {}).get("family", "Other")
+ # Adjust for various package managers we know of
+ if ua_agent_family == "Other":
+ for ia_key, ia_names in INTERNAL_AGENTS.items():
+ if any(x in ua_agent for x in ia_names):
+ ua_agent_family = ia_key
+ break
+ # If we still don't know what this is, mark as "Unknown", to distinguish from the combined "Other" chart group.
+ if ua_agent_family == "Other":
+ ua_agent_family = "Unknown"
+ ua_key = ua_os_family + " / " + ua_agent_family
+ uas[ua_key] = uas.get(ua_key, 0) + uaentry["doc_count"]
+ for key, val in uas.items():
+ # There will be duplicate entries here, so we are going to go for the highest count found for each URL
+ downloaded_artifacts[url]["useragents"][key] = max(downloaded_artifacts[url]["useragents"].get(key, 0), val)
+
+ for daily_entry in entry["per_day"]["buckets"]:
+ day_ts = int(daily_entry["key"] / 1000)
+ nb_daily = int(daily_entry["bytes_sum"]["value"])
+ nh_daily = int(daily_entry["doc_count"])
+ no_bytes += nb_daily
+
+ visits_unique = int(daily_entry["unique_ips"]["value"])
+ no_hits += nh_daily
+ no_hits_unique += visits_unique
+
+ for ccaentry in daily_entry["cca2"]["buckets"]:
+ cca2 = ccaentry["key"]
+ cca2_count = ccaentry["doc_count"]
+ if cca2 and cca2 != "-":
+ cca2_hits[cca2] = cca2_hits.get(cca2, 0) + cca2_count
+ daily_data.append([day_ts, nh_daily, visits_unique, nb_daily])
+
+                    # The prevailing agg (most hits or most traffic) wins: an agg's
+                    # counts are only folded in when they exceed the running totals
+                    # recorded for this URL so far
+ if no_bytes > downloaded_artifacts[url]["bytes"]:
+ downloaded_artifacts[url]["bytes"] += no_bytes
+ downloaded_artifacts[url]["daily_stats"] = daily_data
+ if no_hits > downloaded_artifacts[url]["hits"]:
+ downloaded_artifacts[url]["hits"] += no_hits
+ downloaded_artifacts[url]["daily_stats"] = daily_data
+ if no_hits_unique > downloaded_artifacts[url]["hits_unique"]:
+ downloaded_artifacts[url]["hits_unique"] += no_hits_unique
+                    if sum(cca2_hits.values()) > sum(downloaded_artifacts[url]["cca2"].values()):
+ downloaded_artifacts[url]["cca2"] = cca2_hits
+ # Ensure all entries are properly marked if query was downscaled
+ if downscaled:
+ for key, val in downloaded_artifacts.items():
+ val["downscaled"] = True
+
+ # Set cache data and cull old cache list if needed
+ new_cache_list = [item for item in downloads_data_cache if item[1] >= cache_timeout_ts]
+ downloads_data_cache.clear()
+ downloads_data_cache.extend(new_cache_list)
+ # Make sure there is room to add another entry
+ # entries are added in date order, so [0] is the oldest
+ while len(downloads_data_cache) >= DOWNLOADS_CACHE_ITEMS:
+ del downloads_data_cache[0]
+ downloads_data_cache.append((cache_key, time.time(), downloaded_artifacts))
+
+ return downloaded_artifacts or {"success": False, "message": "No results found"}
+
+plugins.root.register(slug="downloads", title="Download Statistics", icon="bi-cloud-download", private=True)