blob: 52838f77f7e9a486cff86c66ce5203a2ee67e7f5 [file] [log] [blame]
#!/usr/bin/env ruby
# Gather simple statistics from whimsy server logs
# TODO security check ASF::Auth.decode before reading log files
require 'whimsy/asf'
require 'json'
require 'stringio'
require 'zlib'
require 'time'
# Utility methods to turn server logs into hashes of interesting data
module LogParser
extend self
ERROR_LOG_DIR = '/srv/whimsy/www/members/log' # generally points to /var/log/apache2
# Constants and ignored regex for whimsy_access logs
WHIMSY_APPS = {
'roster' => 'Roster tool',
'board/agenda' => 'Board agenda tool',
'board/minutes' => 'Board public minutes',
'public' => 'Public JSON files',
'secretary' => 'Secretary Workbench',
'site.cgi' => 'TLP Site Checker',
'pods.cgi' => 'Podling Site Checker',
'foundation/orgchart' => 'Public OrgChart',
'status' => 'Server Status',
'committers' => 'Other Committer Private Tools',
'members' => 'Other Member Private Tools'
}
RUSER = 'remote_user'
REFERER = 'referer'
REMAINDER = 'remainder'
HITTOTAL = 'total'
URIHIT = 'uri'
IGNORED_URIS = [
/\A\/whimsy.svg/,
/\A\/favicon.ico/,
/\A\/robots.txt/,
/\A\/assets/,
/\A\/fonts/,
/\A\/icons/,
/\.js\z/,
/\.css\z/,
/\.action\z/,
/\.zip\z/, # Below here are all from site scanners
/\..?ar\z/,
/\.tar\..{1}z.?\z/,
/\.bak\z/,
/\.sql\z/,
/\.7z\z/,
/\.asp\.?\z/i,
/\.txt\z/,
/\.php\z/,
/\.woff2/
]
# Related to timestamps in error log output
TRUNCATE = 6 # Ensure consistency in keys
TIME_OFFSET = 10_000_000.0 # Offset milliseconds slightly for array entries
# Ignore error lines from other tools with long tracebacks
IGNORE_TRACEBACKS = ["rack.rb", "asf/themes", "phusion_passenger"]
# Read a text or .gz file
# @param f filename: .log or .log.gz
# @return File.read(f)
def read_logz(f)
if f.end_with? '.gz'
reader = Zlib::GzipReader.open(f)
logfile = reader.read
reader.close
else
logfile = File.read(f)
end
return logfile
end
# Parse whimsy_access and return interesting entries
# @param f filename of whimsy_access.log or .gz
# @return array of reduced, scrubbed entries as hashes
def parse_whimsy_access(f)
access = read_logz(f).scan(/<%JSON:apache_access%> (\{.*\})/).flatten
logs = JSON.parse('[' + access.join(',') + ']').reject do |i|
(i['useragent'] =~ /Ping My Box/) || (i['uri'] =~ Regexp.union(IGNORED_URIS)) || (i['status'] == 304)
end
logs.each do |i|
%w(geo_country geo_long geo_lat geo_coords geo_city geo_combo duration request
bytes vhost document request_method clientip query_string).each do |g|
i.delete(g)
end
end
return logs
end
# Collate/partition whimsy_access entries by app areas
# @param logs full set of items to scan
# @return apps categorized by apphash, with REMAINDER entry all others not captured
def collate_whimsy_access(logs, apphash = WHIMSY_APPS)
remainder = logs
apps = {}
apphash.each_key do |a|
apps[a] = Hash.new {|h, k| h[k] = [] }
apps[a][RUSER] = Hash.new {|h, k| h[k] = 0 }
apps[a][REFERER] = Hash.new {|h, k| h[k] = 0 }
apps[a][URIHIT] = Hash.new {|h, k| h[k] = 0 }
end
apps.each do |app, data|
items, remainder = remainder.partition{ |l| l['uri'] =~ /\A\/#{app}/ }
items.each do |l|
data[RUSER][l[RUSER]] += 1
data[REFERER][l[REFERER]] += 1
data[URIHIT][l[URIHIT]] += 1
end
end
apps[REMAINDER] = Hash.new {|h, k| h[k] = [] }
apps[REMAINDER][RUSER] = Hash.new {|h, k| h[k] = 0 }
apps[REMAINDER][REFERER] = Hash.new {|h, k| h[k] = 0 }
apps[REMAINDER][URIHIT] = Hash.new {|h, k| h[k] = 0 }
apps[REMAINDER]['useragent'] = Hash.new {|h, k| h[k] = 0 }
remainder.each do |l|
apps[REMAINDER][RUSER][l[RUSER]] += 1
apps[REMAINDER][REFERER][l[REFERER]] += 1
apps[REMAINDER][URIHIT][l[URIHIT]] += 1
apps[REMAINDER]['useragent'][l['useragent']] += 1
end
return apps
end
# Get a simplistic hash report of access entries
# @param f filepath to whimsy_access.log
# @return app_report, misses_data
def get_access_reports(f = File.join(ERROR_LOG_DIR, 'whimsy_access.log'))
access = parse_whimsy_access(f)
hits, miss = access.partition { |l| l['status'] == 200 }
apps = collate_whimsy_access(hits)
return apps, miss
end
# Parse error.log and return interesting entries
# @param f filename of error.log or .gz
# @param logs hash to append to (created if nil)
# @return hash of string|array of interesting entries
# "timestamp" => "Passenger restarts and messages",
# "timestamp" => ['_ERROR msg', '_WARN msg'... ]
def parse_error_log(f, logs = {})
last_time = 'uninitialized_time' # Cheap marker
ignored = Regexp.union(IGNORE_TRACEBACKS)
read_logz(f).lines.each do |l|
begin
# Emit each interesting item in order we read it
# Include good-enough timestamping, even for un-timestamped items
# (Date.today.to_time + 4/100000.0).iso8601(TRUNCATE)
if l =~ /\[ . (.{24}) .+\]: (.+)/
last_time = $1
capture = $2
if capture =~ /Passenger/
logs[Time.parse(last_time).iso8601(TRUNCATE)] = capture
end
elsif (l =~ /(_ERROR|_WARN (.+)whimsy)/) && l !~ ignored
# Offset our time so it doesn't overwrite any Passenger entries
(logs[(Time.parse(last_time) + 1 / TIME_OFFSET).iso8601(TRUNCATE)] ||= []) << l
end
rescue StandardError => e
puts e
end
end
return logs
end
# Parse error.log* files in dir and return interesting entries
# @param d directory to scan for error.log*
# @return hash of arrays of interesting entries
def parse_error_logs(d = ERROR_LOG_DIR, logs = {})
Dir[File.join(d, 'error?lo*')].each do |f|
parse_error_log(f, logs)
end
return logs
end
# Parse whimsy_error.log and return interesting entries
# @param f filename of error.log or .gz
# @return hash of string of interesting entries
# "timestamp" => "AH01215: undefined method `map' for #<String:0x0000000240e1e0> (NoMethodError): /x1/srv/whimsy/www/status/errors.cgi"
# [..date..] [cgi:error] [pid ...] [client ...] End of script output before headers: site.cgi
# [..date..] [cgi:error] [pid ...] [client ...] AH01215: /var/lib/g...
# [..date..] [proxy:error] [pid ...] [client ...] AH00898: Error during SSL Handshake with remote server returned by /board/agenda/websocket/
# [..date..] [proxy:error] [pid ...] (20014)Internal error (specific information not available): [client ...] AH01084: pass request body failed to 127.0.0.1:34234 (localhost)
def parse_whimsy_error(f, logs = {})
r = Regexp.new('\[(?<errdate>[^\]]*)\] \[[\w_]+:error\] \[.+?\] (.+: )?\[.+?\] (?<errline>.+)')
ignored = Regexp.union(IGNORE_TRACEBACKS)
read_logz(f).lines.each do |l|
r.match(l) do |m|
unless ignored =~ m[2]
begin
logs[Time.parse(m[1]).iso8601(6)] = m[2]
rescue StandardError
# Fallback to merely using the string representation
logs[m[1]] = m[2]
end
end
end
end
return logs
end
# Parse whimsy_error.log* files in dir and return interesting entries
# @param d directory to scan for whimsy_error.log*
# @return hash of arrays of interesting entries
def parse_whimsy_errors(d = ERROR_LOG_DIR, logs = {})
Dir[File.join(d, 'whimsy_error.lo*')].each do |f|
parse_whimsy_error(f, logs)
end
return logs
end
# get the most recently modified matching file
# Note that Dir may return files in any order
def latest(path)
Dir.glob(path).max_by {|f| File.mtime(f) }
end
# Get a list of all current|available error logs interesting entries
# @param current - only scan current day? or scan all week's logs
# @param d directory to scan for *error.log*
# @return hash of arrays of interesting entries
def get_errors(current, dir: ERROR_LOG_DIR)
if current
whimsy_log = latest(File.join(dir, 'whimsy_error.log*'))
logs = LogParser.parse_whimsy_error(whimsy_log)
error_log = latest(File.join(dir, 'error?log*'))
LogParser.parse_error_log(error_log, logs) if error_log
else
logs = LogParser.parse_whimsy_errors(dir)
LogParser.parse_error_logs(dir, logs)
end
return logs.sort.to_h # Sort by time order
end
end