blob: b3c3825eaafeb584bc23c1226e11b272763ce232 [file] [log] [blame]
--[[
This is a Lua CGI script that uses LibEZT to produce templated mirror content.
It uses the output from the Apache GeoIP module to choose the appropriate mirror(s).
The script supports the following optional URL parameters:
cca2 :
override the country code
preferred/Preferred :
sets the preferred server if available, otherwise it is chosen at random
as_json/asjson :
don't process the template, but return the mirror data as JSON
action=download together with filename :
generate a redirect to the file on the preferred mirror
archive_aware :
sets ARCHIVE_AWARE = true (unless the value is '0') ; ARCHIVE_AWARE is already initialised to true below, so this currently has no effect
This script is dist, attic and archive aware.
If the target's first path-component (TLP) has a corresponding
file /var/www/attic.apache.org/projects/TLP.html,
then the script redirects to the matching page on attic.apache.org.
If the final target is not in dist, it uses template archive.html ;
if ARCHIVE_AWARE, it looks up the target on archive.apache.org with
a HEAD request. The lookups are cached for CACHE_TIMEOUT seconds.
]]
-- Revision of this file; the keyword string below is expanded automatically by SVN.
local VERSION = string.match("$Revision: 1889116 $", "(%d+)")

--- Report the SVN revision of this script.
-- @return the revision digits, as a string
function version()
    return VERSION
end
-- Runtime switches for archive lookups, cache lifetime and lookup logging.
local ARCHIVE_AWARE = true -- do archive.a.o lookup with HEAD requests
-- Freshness window in seconds for cached lookups and rendered templates (get_page uses twice this value).
local CACHE_TIMEOUT = 3600 -- should be 0 in test ; 3600 in production
-- NOTE(review): Lua treats the number 0 as truthy, so a plain `if LOG_LOOKUPS then` is always true;
-- code reading this flag has to compare it against 0 explicitly.
local LOG_LOOKUPS = 0 -- should be 1 in test ; 0 in production
local JSON = require 'JSON'
local ezt = require 'libezt'
local posix = require 'posix'
local SOCK = require 'socket'
local HTTP = require 'socket.http'
local HTTPS = require 'ssl.https'
-- Set 5 second max timeout for http(s) lookups
HTTP.TIMEOUT = 5
HTTPS.TIMEOUT = 5
-- Mirror list file parsed by get_mirrors()
local mirror_file = "/www/www.apache.org/mirrors/mirrors.list"
local MAXAGE = 24*3600 -- max mirror age, in seconds
-- Base URIs and directories used for attic, dist, archive and download checks
local ATTIC_URI = 'http://attic.apache.org/projects/'
local ATTIC_DIR = '/var/www/attic.apache.org/projects/'
local DIST_DIR = '/var/www/www.apache.org/content/dist/' -- not referenced elsewhere in the code shown here
local ARCH_URI = 'https://archive.apache.org/dist/'
local DOWN_URI = 'https://downloads.apache.org/'
-- Template pages and the statistics/log directory, all below DYN_DIR
local DYN_DIR = '/var/www/www.apache.org/dyn/'
local CLOSER_PG = DYN_DIR .. 'closer.html' -- default template
local ARCHIVE_PG = DYN_DIR .. 'archive.html' -- template used when the target is not on downloads.a.o
local STATS_DIR = DYN_DIR .. 'stats/' -- per-project request logs are appended here
local LOOKUP_LOG = STATS_DIR .. 'AAAA' -- file written by log_lookup() and dumped by action=catlog
local dist_hit = false -- not referenced elsewhere in the code shown here
local arch_hit = false -- not referenced elsewhere in the code shown here
local mirror_stamp = 0 -- when mirror_file was last processed
local mirror_map = {} -- map of all recent mirrors: mirror_map[cc|backup][ftp|http|rsync] = { url, ... }
local mirror_map_v6 = {} -- mirror_map for ipv6-enabled mirrors (backup mirrors are always included)
local mirror_templates = {} -- cache of unprocessed mirror templates
local mirror_templates_generated = {} -- cache of generated templates
local mymap -- map of mirrors for the current request (based on the country code)
--- (Re)load the mirror list from `mirror_file` into `mirror_map` and `mirror_map_v6`.
--
-- Each data line has the form `<type> <country> <url> <timestamp> [ipversion]`.
-- A mirror is kept when its timestamp is at most MAXAGE seconds older than the
-- list itself (taken from the `# date :` header), or older than "now" when the
-- header is missing. Backup mirrors are always kept, are forced to https, and
-- are also listed in the IPv6 map.
--
-- If the list cannot be opened, the previously loaded maps are left in place and
-- `mirror_stamp` is not advanced, so the next request tries again.
--
-- @return the (possibly unchanged) `mirror_map`
function get_mirrors()
    local now = os.time()
    local f = io.open(mirror_file, "r")
    if not f then
        -- Stale data beats no data: keep what we already have.
        return mirror_map
    end
    local mirrord = f:read("*a") or ""
    f:close()

    -- The line pattern below needs a terminating newline; make sure the last
    -- line has one so it is not silently dropped.
    if mirrord ~= "" and mirrord:sub(-1) ~= "\n" then
        mirrord = mirrord .. "\n"
    end

    -- Check the age of the mirrors relative to the mirror list, rather than now. (As was done by mirrors.cgi)
    -- This allows the system to still work even if the list is a bit stale.
    -- Lua does not have a standard API to get a file date;
    -- however, the timestamp when the information was collected is more useful anyway.
    -- Parse the file header: # date : Wed Sep 2 09:49:53 2015 [UTC]
    local atleast = now - MAXAGE
    local mon, day, hh, mm, ss, yy = mirrord:match("# date : %w+ (%w+) +(%d+) (%d%d):(%d%d):(%d%d) (%d%d%d%d) %[UTC%]")
    local MON = {Jan=1,Feb=2,Mar=3,Apr=4,May=5,Jun=6,Jul=7,Aug=8,Sep=9,Oct=10,Nov=11,Dec=12}
    if mon and MON[mon] then
        -- use isdst = false as the timestamp is UTC
        -- NOTE(review): os.time() interprets the fields in the server's local time zone — confirm the host runs in UTC.
        local filetime = os.time({year = yy, month = MON[mon], day = day, hour = hh, min = mm, sec = ss, isdst=false})
        atleast = filetime - MAXAGE
    end

    -- Build the new maps aside and publish them once parsing is complete.
    local map, map_v6 = {}, {}
    for t, c, url, timestamp, ipversion in mirrord:gmatch("([a-zA-Z]+)%s+([a-zA-Z]+)%s+(%S+)%s+(%d+)(.-)\r?\n") do
        c = c:lower()
        local is_backup = (c == 'backup')
        -- if backup, force http -> https
        if is_backup then
            url = (url:gsub('http://', 'https://'))
        end
        -- Don't check the timestamp for backup mirrors
        if is_backup or tonumber(timestamp) >= atleast then
            map[c] = map[c] or {}
            map[c][t] = map[c][t] or {}
            table.insert(map[c][t], url)
            -- An absent or unparseable trailing field means plain IPv4.
            ipversion = ipversion:match("^%s*(%S+)$") or 'ipv4'
            if ipversion == 'ipv6' or is_backup then
                map_v6[c] = map_v6[c] or {}
                map_v6[c][t] = map_v6[c][t] or {}
                table.insert(map_v6[c][t], url)
            end
        end
    end

    mirror_map = map
    mirror_map_v6 = map_v6
    mirror_stamp = now
    return mirror_map
end
--- Append one line describing an archive lookup to LOOKUP_LOG.
--
-- Line format: `<date>/<time> [<pid>] look=<bool> hit=<inarch> <path>`.
-- `look` is true when no cache stamp was available (`cs` is nil/false) and
-- false when the answer was accompanied by a cache stamp.
-- Nothing is written (and no error raised) if the log cannot be opened.
--
-- @param inarch lookup result as returned by is_in_arch (true/false/nil)
-- @param cs     cache stamp returned by is_in_arch, or nil
-- @param path   the path that was looked up
function log_lookup(inarch, cs, path)
    local f = io.open(LOOKUP_LOG, 'a')
    if not f then
        return
    end
    local pid = posix.getpid().pid or 'pid'
    -- `not cs` replaces the former `cs and false or true`, which always
    -- evaluated to true regardless of cs.
    f:write(os.date('%Y-%m-%d/%H:%M:%S')
        .. " [" .. tostring(pid) .. ']'
        .. " look=" .. tostring(not cs)
        .. " hit=" .. tostring(inarch)
        .. ' ' .. tostring(path)
        .. "\n"
    )
    f:close()
end
--- Time elapsed since `t`, in milliseconds.
-- @param t a start time previously taken from SOCK.gettime()
function interval(t)
    local delta = SOCK.gettime() - t
    return 1000 * delta
end
--- Format the time elapsed since `t` for the action=info report.
-- The trailing quote and newline close the "elapsed : '" prefix the caller prints.
function elapsed(t)
    local ms = interval(t)
    return ("%.3f ms'\n"):format(ms)
end
--- True when posix.stat() returns something for `file`.
function file_exists(file)
    local info = posix.stat(file)
    return info ~= nil
end
--- True when ATTIC_DIR contains a `<proj>.html` page.
function is_in_attic(proj)
    local attic_page = ATTIC_DIR .. proj .. '.html'
    return file_exists(attic_page)
end
--- URI of `path` below ARCH_URI.
function arch_uri(path)
    local uri = ARCH_URI .. path
    return uri
end
--- URI of `path` below DOWN_URI.
function dl_uri(path)
    local uri = DOWN_URI .. path
    return uri
end
--- HTML anchor pointing at `path` on the archive; the URI doubles as link text.
-- The path is inserted as-is, so callers must pass already-escaped text.
function archive_url(path)
    local href = arch_uri(path)
    return table.concat({ '<a href="', href, '" rel="nofollow">', href, '</a>' })
end
--- Cached lookup of `path` on archive.a.o (files and dirs).
--
-- A cached answer is used while it is younger than CACHE_TIMEOUT seconds.
-- Otherwise a HEAD request is sent and the outcome stored through
-- r:ivm_set(); any status code other than 404 is considered a hit.
--
-- NOTE(review): per the LuaSocket documentation a failed request returns nil
-- plus an error message, so `code` is then a (truthy) string and the path is
-- reported and cached as existing. This behaviour is left as it was — confirm
-- whether failing open is intended.
--
-- @param r    mod_lua request object (provides ivm_get / ivm_set)
-- @param path path below the archive root
-- @return exists           truthy when the path is considered present
-- @return cache_hit_stamp  time of the previous cache entry, or nil if none
function is_in_arch(r, path)
    local result_key = "archive_ao_cache_result_" .. path
    local stamp_key = "archive_ao_cache_stamp_" .. path
    local cache_hit_result = r:ivm_get(result_key)
    local cache_hit_stamp = r:ivm_get(stamp_key)

    if cache_hit_stamp and ( os.time() - cache_hit_stamp ) < CACHE_TIMEOUT then
        return (cache_hit_result == 1), cache_hit_stamp
    end

    -- A HEAD response carries no body, so no sink is passed. The previous code
    -- built one from the undeclared globals `ltn12` and `resp`.
    local _, code = HTTPS.request { method = "HEAD", url = arch_uri(path), protocol = "tlsv1_2" }
    local exists = ( code and code ~= 404 )
    r:ivm_set(result_key, exists and 1 or 0)
    r:ivm_set(stamp_key, os.time())
    return exists, cache_hit_stamp
end
--- Cached lookup of `path` on downloads.a.o (files and dirs).
--
-- A cached answer is used while it is younger than CACHE_TIMEOUT seconds.
-- Otherwise a HEAD request is sent and the outcome stored through
-- r:ivm_set(); any status code other than 404 is considered a hit.
--
-- NOTE(review): per the LuaSocket documentation a failed request returns nil
-- plus an error message, so `code` is then a (truthy) string and the path is
-- reported and cached as existing. This behaviour is left as it was — confirm
-- whether failing open is intended.
--
-- @param r    mod_lua request object (provides ivm_get / ivm_set)
-- @param path path below the downloads root
-- @return exists           truthy when the path is considered present
-- @return cache_hit_stamp  time of the previous cache entry, or nil if none
function is_on_downloads_ao(r, path)
    local result_key = "downloads_ao_cache_result_" .. path
    local stamp_key = "downloads_ao_cache_stamp_" .. path
    local cache_hit_result = r:ivm_get(result_key)
    local cache_hit_stamp = r:ivm_get(stamp_key)

    if cache_hit_stamp and ( os.time() - cache_hit_stamp ) < CACHE_TIMEOUT then
        return (cache_hit_result == 1), cache_hit_stamp
    end

    -- A HEAD response carries no body, so no sink is passed. The previous code
    -- built one from the undeclared globals `ltn12` and `resp`.
    local _, code = HTTPS.request { method = "HEAD", url = dl_uri(path), protocol = "tlsv1_2" }
    local exists = ( code and code ~= 404 )
    r:ivm_set(result_key, exists and 1 or 0)
    r:ivm_set(stamp_key, os.time())
    return exists, cache_hit_stamp
end
--- Fetch the raw template stored at `url` (a file path), via the in-memory cache.
-- Entries are reused until they are older than 2*CACHE_TIMEOUT seconds.
-- An unreadable file yields the text "No such page", which is cached as well.
-- @return table with fields `data` (template text) and `timestamp` (load time)
function get_page(url)
    local cached = mirror_templates[url]
    if cached and cached.timestamp >= (os.time() - 2*CACHE_TIMEOUT) then
        return cached
    end

    local fh = io.open(url, "r")
    local contents = fh and fh:read("*a") or "No such page"
    local entry = { data = contents, timestamp = os.time() }
    mirror_templates[url] = entry
    if fh then
        fh:close()
    end
    return mirror_templates[url]
end
--- Render template `page` with the given definitions, reusing a cached rendering when possible.
--
-- Renderings are cached for CACHE_TIMEOUT seconds. The cache key covers every
-- request-dependent value that handle() places in `defs`: preferred mirror,
-- path, country code, country name, IPv6 flag and archive lookup text.
-- Keying on page/preferred/path alone allowed a rendering produced for one
-- country to be served to a client from another.
--
-- NOTE(review): the cache is never pruned and path_info is client-supplied, so
-- it can grow without bound — consider eviction.
--
-- @param page     template file path
-- @param defs     string definitions (see handle)
-- @param r        mod_lua request object, passed through to recurse()
-- @param ezt_defs EZT definitions, passed through to recurse()
-- @return table with fields `data` (rendered text) and `timestamp`
function get_output_cached(page, defs, r, ezt_defs)
    local cacheKey = table.concat({
        page,
        defs.preferred or "",
        defs.path_info or "",
        tostring(defs.cca2 or ""),
        tostring(defs.country or ""),
        tostring(defs.ipv6 or false),
        tostring(defs.lookup or ""),
    }, ":")

    local cached = mirror_templates_generated[cacheKey]
    if not cached or cached.timestamp < (os.time() - CACHE_TIMEOUT) then
        local template = get_page(page)
        local tdata = recurse(defs, template.data, r, ezt_defs)
        mirror_templates_generated[cacheKey] = {
            data = tdata,
            timestamp = os.time()
        }
    end
    return mirror_templates_generated[cacheKey]
end
--- Expand SSI-style includes in `tdata`, then parse and render it with EZT.
--
-- Each `<!--#include virtual="X" -->` directive is replaced with the contents
-- of the file `defs.filepath .. X` (duplicate slashes collapsed). A file that
-- does not exist or cannot be opened or read expands to the empty string.
--
-- @param defs     definitions; only `defs.filepath` is read here
-- @param tdata    template text
-- @param r        mod_lua request object (provides stat)
-- @param ezt_defs definitions handed to ezt:construct
-- @return the rendered text, or the error reported by ezt:import when parsing fails
function recurse(defs, tdata, r, ezt_defs)
    -- SSI emulation
    tdata = tdata:gsub("<!%-%-%s*#include virtual=\"(.-)\"%s*%-%->",
        function(inc)
            local filepath = (defs.filepath .. inc):gsub("[/]+", "/")
            if not r:stat(filepath) then
                return ""
            end
            -- stat() succeeding does not guarantee the file is readable.
            local f = io.open(filepath, "r")
            if not f then
                return ""
            end
            local d = f:read("*a")
            f:close()
            return d or ""
        end
    )
    -- Parse EZT (named `err` so the builtin error() is not shadowed)
    local structure, err = ezt:import("[ezt]"..tdata.."[end]")
    -- Render output
    if structure then
        return ezt:construct(structure, ezt_defs)
    end
    return err
end
-- true if the string (s) ends with (e); every string ends with the empty string
function endsWith(s, e)
    -- s:sub(-0) is s:sub(0), i.e. the whole string, so an empty suffix needs its own case
    if e == "" then
        return true
    end
    return e == s:sub(-e:len())
end
-- true if the string (s) begins with (b)
function beginsWith(s, b)
    local prefix_len = b:len()
    local head = s:sub(1, prefix_len)
    return head == b
end
-- return nil if the string is empty (or nil); otherwise return the string unchanged
function nonEmpty(s)
    -- compare against the nil literal, not an undefined global named `null`
    if s == nil or s == '' then
        return nil
    end
    return s
end
-- Temporary fix to extract the missing path_info for dyn/closer.cgi redirects only.
-- Returns the part of (s) after the original CGI script name, including its
-- leading "/", or nil when (s) does not start with that script name.
function get_path_info(s)
    local CGI_SCRIPT = "/dyn/closer.cgi/" -- original CGI script name
    local script_len = CGI_SCRIPT:len()
    if s:sub(1, script_len) ~= CGI_SCRIPT then
        return nil
    end
    -- start at the final "/" of the script name so the suffix keeps its leading slash
    return s:sub(script_len)
end
--- mod_lua request handler: choose mirrors for the client and produce the response.
--
-- The request parameter has the data structures and functions as described here:
-- http://httpd.apache.org/docs/trunk/mod/mod_lua.html#datastructures
-- http://httpd.apache.org/docs/trunk/mod/mod_lua.html#functions
--
-- Query arguments read here: cca2, preferred/Preferred, as_json/asjson, action
-- (download | info | catlog), filename, f, path, archive_aware, debug.
--
-- Outline:
--   1. refresh the mirror map if it is more than an hour old;
--   2. pick the mirror map for the client's country (IPv6 map for IPv6 clients);
--   3. resolve and validate the template page;
--   4. answer special requests (preferred=true, action=..., JSON);
--   5. redirect attic projects, switch to the archive template when the target
--      is not on downloads.a.o, append a stats line, and render the template.
--
-- @param r mod_lua request object
-- @return apache2.OK on every path
function handle(r)
    r.headers_out['Cache-Control'] = 'private' -- Invalidate any cache
    local get = r:parseargs()
    if get.archive_aware and not ( get.archive_aware == '0' ) then
        ARCHIVE_AWARE = true
    end
    local now = os.time()
    if mirror_stamp < (now - 3600) then
        get_mirrors()
    end
    local country = r.notes['GEOIP_COUNTRY_NAME'] or r.subprocess_env['GEOIP_COUNTRY_NAME'] or "Unknown"
    local cca2 = (get.cca2 or r.notes['GEOIP_COUNTRY_CODE'] or r.subprocess_env['GEOIP_COUNTRY_CODE'] or r.subprocess_env['GEOIP_COUNTRY_CODE_V6'] or 'Backup'):lower()
    if cca2 == 'gb' then
        cca2 = 'uk'
    end
    local client_is_ipv6 = r.useragent_ip:match("(:[a-f0-9]+):?:?$") and true or false
    local occa2 = cca2 -- country code before any fallback to 'backup'
    if not mirror_map[cca2] then
        cca2 = 'backup'
    end
    mymap = mirror_map[cca2] or mirror_map['backup']
    if client_is_ipv6 then
        mymap = mirror_map_v6[cca2] or mirror_map['backup']
    end
    local bmap = mirror_map['backup']
    mymap['backup'] = bmap['http']
    -- Pick one random mirror per protocol, falling back to the backup mirrors
    local URL = {}
    for _, t in pairs({'http','ftp'}) do
        URL[t] = (mymap[t] and mymap[t][math.random(1, #mymap[t])]) or (bmap[t] and bmap[t][math.random(1, #bmap[t])])
    end
    local page = r.filename
    local got_f = get.f -- work on a copy of the parameter
    if got_f then
        -- path normalization: We get all sorts of /var/www, /www/ (or nothing!) etc thrown at us,
        -- due to legacy puppet cruft and EU/US alternate hostnames. We want to normalize that.
        local hname = r.hostname:gsub("www%.", "")
        -- The host name is matched literally below, so escape its punctuation
        -- ('.' and '-' are magic in Lua patterns).
        local hname_pat = hname:gsub("%p", "%%%0")
        got_f = got_f:gsub("^/var/www/html/", "/var/www/")
        got_f = got_f:gsub(hname_pat, ""):gsub("/var/www//var/www/", "/var/www/")
        got_f = got_f:gsub("^/var/www//?www/", "/var/www/")
        -- The extra parentheses drop gsub's second result (the match count),
        -- which would otherwise be handed to r:stat as its `wanted` argument.
        if r:stat(got_f) or r:stat((got_f:gsub("%.cgi", ".html"))) then
            page = got_f
        else
            page = got_f:gsub("/www/", "/www/" .. hname:gsub("%.[eu][us]%.", ".") .. "/"):gsub("[/]+", "/")
        end
    end
    -- Rewrite foo.cgi or foo.lua to foo.html
    page = page:gsub("%.cgi", ".html"):gsub("%.lua", ".html")
    -- Ensure the target template exists, or fall back to default template
    if not r:stat(page) or not (page:match("^/var/www/") or page:match("^/www/")) then
        page = CLOSER_PG
    end
    -- Final sanity check: page variable must match this path to be a valid template file
    -- If not, default to our standard template
    -- TODO: Weed out the /var/www later on and always only have /www/foo.a.o/bar.html as valid
    -- Do not allow '.' in path segments apart from the last (the file name)
    if not r:regex(page, [[^(/var/www|/www)/([-a-z0-9]+\.apache\.org)/([-_a-zA-Z0-9/]+/)?[-_a-zA-Z0-9.]+\.html?$]]) then
        page = CLOSER_PG
    end
    local defs = {}
    local ezt_defs = {
        strings = {},
        arrays = {}
    }
    defs.filepath = page:gsub("[^/]+$", "")
    defs.debug = get.debug and true or false
    defs.preferred = r:escape_html(get.preferred or get.Preferred or URL['http'] or "")
    defs.path_info = r:escape_html(get.path -- command-line override
        or nonEmpty(r.path_info) -- if path provided by server
        or get_path_info(r.uri) -- temporary fix to extract it from r.uri for dyn/closer.cgi calls
        -- Disable for now; it was previously effectively disabled because r.path_info was never false
        -- or r.unparsed_uri:gsub("^.+%.cgi/*", ""):gsub("^.+%.lua/*", "") -- not sure what this is trying to do
        -- TODO in any case seems wrong to use the unparsed URI as that will include the query string
        or "/") -- default
        :gsub("^/","",1) -- trim leading "/" as per Python version
    defs.country = country
    defs.cca2 = cca2
    defs.ipv6 = client_is_ipv6
    -- proj is the first path component of defs.path_info
    local proj = defs.path_info
    if proj and proj:find('/') then
        proj = proj:sub(1,proj:find('/')-1)
    end
    defs.project = proj
    ezt_defs.strings = defs
    ezt_defs.arrays = {
        http = mymap['http'] or bmap['http'],
        ftp = mymap['ftp'] or bmap['ftp'],
        backup = bmap['http'],
    }
    -- Check that preferred http/ftp exists, otherwise default to none
    local prefIsOkay = false
    for _,b in ipairs({'http', 'ftp', 'backup'}) do
        for _, v in pairs(ezt_defs.arrays[b] or {}) do -- arrays[b] may not exist
            if r:escape_html(v) == defs.preferred then
                prefIsOkay = true
                break
            end
        end
        if prefIsOkay then
            break
        end
    end
    if not prefIsOkay then
        ezt_defs.preferred = ""
        defs.preferred = URL['http']
    end
    -- string only repr of preferred URL
    if get.preferred and get.preferred == "true" then
        r.content_type = "text/plain"
        r:puts(defs.preferred)
        return apache2.OK
    end
    local do_json = false
    if (get.as_json and not (get.as_json == "0")) or (get.asjson and not (get.asjson == "0")) then
        do_json = true
    end
    if get.action then
        local d_uri = get.filename or nonEmpty(defs.path_info)
        if get.action == 'download' and nonEmpty(d_uri) then
            if is_on_downloads_ao(r, d_uri) then
                r.headers_out['Location'] = defs.preferred .. d_uri
                r.status = 302
                return apache2.OK
            elseif is_in_arch(r, d_uri) then
                r.headers_out['Location'] = ARCH_URI .. d_uri
                r.status = 302
                return apache2.OK
            else
                r.content_type = "text/plain"
                r.status = 404
                r:puts("The requested file does not exist in our mirror system or in our archives.")
                return apache2.OK
            end
        elseif get.action == 'info' then
            r.content_type = "text/plain"
            r:puts(string.format("%s\ncloser revision: %s\nlibezt revision: %s\n",
                _VERSION, -- LUA
                version(), -- closer
                ezt:version())) -- libezt
            -- Show any arguments
            for k, v in pairs( get ) do
                r:puts( string.format("arg %s: %s\n", k, v) )
            end
            local t0 = SOCK.gettime() ;
            local URI = r.subprocess_env['SCRIPT_URI'] or "nil"
            -- Request parameters
            r:puts("r.hostname : '",r.hostname or "nil", "'\n")
            r:puts("r.document_root : '",r.document_root or "nil", "'\n")
            r:puts("r.uri : '",r.uri or "nil", "'\n")
            -- r:puts("r.the_request: '",r.the_request or "nil", "'\n")
            -- r:puts("r.unparsed_uri: '",r.unparsed_uri or "nil", "'\n")
            r:puts("r.path_info : '",r.path_info or "nil","'\n")
            r:puts("env[SCRIPT_URI] : '",URI,"'\n")
            -- calculated values
            r:puts("defs.path_info : '",defs.path_info or "nil","'\n")
            r:puts("defs.filepath : '",defs.filepath or "nil","'\n")
            r:puts("occa2 : '",occa2,"'\n")
            r:puts("proj : '",proj,"'\n")
            r:puts("proj in attic : '",tostring(is_in_attic(proj)),"'\n")
            r:puts("elapsed : '",elapsed(t0))
            r:puts("... dist lookup ...\n")
            local on_downloads_ao, cs = is_on_downloads_ao(r, defs.path_info)
            r:puts("exists on downloads.a.o? : '",tostring(on_downloads_ao), "'\n")
            r:puts("cache stamp : '",tostring(cs), "'\n")
            r:puts("dist uri : '",dl_uri(defs.path_info),"'\n")
            r:puts("elapsed : '",elapsed(t0))
            r:puts("archive aware : '",tostring(ARCHIVE_AWARE),"'\n")
            -- on_downloads_ao is a boolean/nil, never the string 'false'
            if not on_downloads_ao then
                r:puts("archive uri : '",arch_uri(defs.path_info),"'\n")
            end
            if ARCHIVE_AWARE then
                r:puts("... archive lookup ...\n")
                r:puts("process PID : '",tostring(posix.getpid().pid),"'\n")
                local in_arch, cs = is_in_arch(r, defs.path_info)
                r:puts("archive uri : '",arch_uri(defs.path_info),"'\n")
                r:puts("path in arch? : '",tostring(in_arch),"'\n")
                r:puts("arch stamp : '",tostring(cs),"'\n")
                r:puts("elapsed : '",elapsed(t0))
            end
            return apache2.OK
        elseif get.action == 'catlog' then
            r.content_type = "text/plain"
            local f = io.open(LOOKUP_LOG)
            if f then
                while true do
                    local line = f:read()
                    if line == nil then break end
                    r:puts(line,"\n")
                end
                f:close()
            else
                r:puts("can't open " .. LOOKUP_LOG .. "\n")
            end
            return apache2.OK
        else
            r.content_type = "text/plain"
            r:puts("unknown action [" .. get.action .. "]\n")
            return apache2.OK
        end
    end
    if do_json then
        r.content_type = "application/json"
        r:puts(JSON:encode_pretty({
            path_info = defs.path_info,
            preferred = defs.preferred,
            http = mymap['http'] or bmap['http'],
            ftp = mymap['ftp'] or bmap['ftp'],
            backup = bmap['http'],
            ipv6 = defs.ipv6,
            in_dist = is_on_downloads_ao(r, defs.path_info),
            in_attic = is_in_attic(proj),
            cca2 = occa2
        }))
        return apache2.OK
    end
    if is_in_attic(proj) then
        r.headers_out['Location'] = ATTIC_URI .. proj .. ".html"
        r.status = 302
        return apache2.OK
    end
    if not is_on_downloads_ao(r, defs.path_info) then
        local arch_home = archive_url('') ;
        local arch_path = archive_url(defs.path_info)
        local lookup = '' ;
        if ARCHIVE_AWARE
        then
            local inarch, cs = is_in_arch(r, defs.path_info)
            if inarch == nil then
                if HTTP then
                    lookup = 'A lookup on ' .. arch_home .. ' failed.'
                else
                    lookup = "Can't do lookups on " .. arch_home
                end
                lookup = lookup .. "<br>Try " .. arch_path
            elseif inarch then
                lookup = 'The object is in our archive : ' .. arch_path
            else
                lookup = 'The object is not in our archive ' .. arch_home
            end
            -- 0 is truthy in Lua, so the "off" value has to be tested explicitly
            if LOG_LOOKUPS and LOG_LOOKUPS ~= 0 then log_lookup(inarch, cs, defs.path_info) end
        else -- not ARCHIVE_AWARE
            lookup = "It may be in our archive : " .. arch_path
        end
        defs.lookup = lookup
        page = ARCHIVE_PG
    end
    -- Per-project statistics: the project is the first path component, or the
    -- second one for incubator/<podling>/ paths
    local rootpath = defs.path_info:match("^([-a-z0-9]+)/")
    if rootpath and rootpath == "incubator" then
        rootpath = defs.path_info:match("^incubator/([-a-z0-9]+)/")
    end
    if rootpath then
        local f = io.open(STATS_DIR .. rootpath .. ".log", "a")
        if f then
            -- get a bit of the IP to identify multiple unique request with same TS/CCA2
            local ipbit = r.useragent_ip:match("([a-f0-9]+):?:?$") or r.useragent_ip:match("^([a-f0-9]+)") or "000"
            f:write(os.time() .. " " .. ipbit .. " " .. occa2 .. " " .. defs.path_info .. "\n")
            f:close()
        end
    end
    local tdata = get_output_cached(page, defs, r, ezt_defs)
    -- check for special content-type based on file name
    if endsWith(page,"--xml.html") then
        r.content_type = "text/xml"
    else
        r.content_type = "text/html"
    end
    r:puts(tdata.data)
    if r.hostname == 'www.apache.org' then
        r:puts("<!-- " .. occa2 .. " -->")
    end
    return apache2.OK
end