| #!/usr/bin/env perl |
| |
| # Licensed to the Apache Software Foundation (ASF) under one |
| # or more contributor license agreements. See the NOTICE file |
| # distributed with this work for additional information |
| # regarding copyright ownership. The ASF licenses this file |
| # to you under the Apache License, Version 2.0 (the |
| # "License"); you may not use this file except in compliance |
| # with the License. You may obtain a copy of the License at |
| # |
| # http://www.apache.org/licenses/LICENSE-2.0 |
| # |
| # Unless required by applicable law or agreed to in writing, |
| # software distributed under the License is distributed on an |
| # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| # KIND, either express or implied. See the License for the |
| # specific language governing permissions and limitations |
| # under the License. |
| |
| use strict; |
| use warnings; |
| use Cwd qw/realpath/; |
| use POSIX qw/:sys_wait_h mkfifo setsid/; |
| use Fcntl qw/:DEFAULT :flock/; |
| use Getopt::Long qw/:config require_order gnu_compat/; |
| use FindBin; |
| use File::Spec; |
| use File::Copy; |
| |
# Abort with a timestamped message of the form "[<local time>] <msg>\n".
# A trailing newline on the message is removed first, so the text always ends
# in exactly one newline (which also suppresses Perl's "at FILE line N" suffix).
sub logdie($)
{
  my $text = shift;
  chomp $text;
  my $stamp = localtime();
  die sprintf("[%s] %s\n", $stamp, $text);
}
| |
# Emit a timestamped message of the form "[<local time>] <msg>\n" via warn.
# A trailing newline on the message is removed first, so each call produces
# exactly one line.
sub logit($)
{
  my $text = shift;
  chomp $text;
  warn join('', '[', scalar(localtime()), '] ', $text, "\n");
}
| |
# Print the command-line synopsis and terminate (via die).
sub usage
{
  my $synopsis = '-c <conf file> [-d <var dir>] [-t <kill timeout>] [--svlogd <optional conf file>]';
  die "usage: $0 $synopsis\n";
}
| |
# Parse a supervise configuration file.
#
# Blank lines and lines whose first non-blank character is '#' are skipped.
# Every other line must have one of these forms (leading whitespace and a
# trailing carriage return are tolerated):
#
#   :verify <shell command>         collected into the 'verify' list
#   :kill-timeout <seconds>         non-negative integer
#   [!p<order>] <name> <command>    a command record; <order> defaults to 50
#
# <name> may be any run of non-blank characters that does not start with ':'.
#
# Returns a hashref with keys:
#   commands      arrayref of command records (hashrefs), in file order
#   verify        arrayref of verification command strings, in file order
#   kill-timeout  integer, or undef if the file does not set it
#
# Dies if the file cannot be opened, if a line matches none of the forms
# above, if a command name is repeated, or if :kill-timeout is not a
# non-negative integer.
sub read_config_file
{
  my ($config_file) = @_;

  open my $config_fh, "<", $config_file
    or die "open $config_file: $!";

  my @commands;
  my @verify;
  my $kill_timeout;
  my %seen_name;

  while (my $line = <$config_fh>) {
    chomp $line;
    # Files saved with CRLF endings would otherwise leave "\r" glued to the command.
    $line =~ s/\r\z//;
    next if $line =~ /^(\s*\#.*|\s*)$/;

    # [^:\s]\S* (rather than [^:]\S+) lets one-character names through and
    # keeps leading whitespace out of the name.
    if ($line =~ /^\s*(:verify|:kill-timeout|(?:\!p[0-9]+\s+)?[^:\s]\S*)\s+(.+)$/) {
      my $name = $1;
      my $order = 50;
      my $command = $2;

      # Split an optional "!p<order>" prefix off the name.
      if ($name =~ /^(?:\!p([0-9]+)\s+)(.*)$/) {
        $order = $1;
        $name = $2;
      }

      if ($name eq ':verify') {
        push @verify, $command;
      } elsif ($name eq ':kill-timeout') {
        # int() on a non-numeric string yields 0, which would turn a typo into
        # "SIGKILL immediately on shutdown"; reject it instead.
        $command =~ /^\s*([0-9]+)\s*$/
          or die "Invalid :kill-timeout (expected a non-negative integer): $line\n";
        $kill_timeout = int($1);
      } else {
        die "Duplicate command: $line\n" if $seen_name{$name}++;
        push @commands, {
          name       => $name,
          command    => $command,
          order      => $order,   # Stop order for this command
          pid        => 0,        # Current pid, or 0 if not running
          down       => 0,        # Time the proc should be down until
          killed     => 0,        # Signal we sent to this process
          restarting => 0,        # True if this command is currently restarting
        };
      }
    } else {
      die "Syntax error: $line\n";
    }
  }

  close $config_fh;
  return { commands => \@commands, verify => \@verify, 'kill-timeout' => $kill_timeout };
}
| |
# Render a wait status (as found in $?) as human-readable text.
#
# The low seven bits are reported as "signal = N" when nonzero; otherwise the
# bits above the low byte are reported as "exited = N". If bit 128 is set,
# ", dumped core" is appended.
sub stringify_exit_status
{
  my ($status) = @_;

  my $signo = $status & 127;
  my @parts = $signo
    ? ("signal = $signo")
    : ("exited = " . ($status >> 8));

  push @parts, "dumped core" if $status & 128;

  return join(", ", @parts);
}
| |
# (Re)create the control fifo "<svdir>/.ctrl" and return a handle to it.
#
# Anything already present at that path is removed first. The fifo is created
# with mode 0700 and opened read-write and non-blocking. Dies if the old path
# cannot be removed or the fifo cannot be created or opened.
sub open_control_fifo
{
  my ($svdir) = @_;
  my $fifofile = "$svdir/.ctrl";

  if (-e $fifofile && !unlink($fifofile)) {
    die "Cannot remove fifo: $fifofile\n";
  }

  die "Cannot create fifo: $fifofile\n" unless mkfifo($fifofile, 0700);

  my $mode = O_NONBLOCK | O_RDWR;
  sysopen(my $fifofh, $fifofile, $mode)
    or die "Cannot open fifo for reading: $fifofile\n";

  return $fifofh;
}
| |
# Decorate $text with ANSI escape sequences when STDERR is a terminal.
#
# $color 'bold' wraps the text in bold; 'red' wraps it in bold red. Any other
# $color, or a STDERR that is not a terminal, returns $text unchanged.
sub pretty
{
  my ($text, $color) = @_;

  return $text unless -t STDERR;

  return "\x1b[1m$text\x1b[0m"         if $color eq 'bold';
  return "\x1b[31m\x1b[1m$text\x1b[0m" if $color eq 'red';
  return $text;
}
| |
# Command records being supervised; filled from the config file below
my @commands;

# If nonzero we should be exiting. -1 means exit without signal, >0 means exit with signal
my $killed = 0;

# If >0 then kill -9 all procs at this time
my $killkill = 0;

# Current proc order we're stopping. Ignored unless $killed is nonzero
my $stopping = 100;

# We'll do our own reaping
$SIG{CHLD} = sub {};

# Redirect stderr to stdout
open STDERR, ">&STDOUT" or die;

# Parse arguments. Defaults are resolved relative to the directory holding this script.
my %opt = (
  'chdir' => realpath("$FindBin::Bin/.."),
  'vardir' => realpath("$FindBin::Bin/../var"),
  'kill-timeout' => 360,
);

usage() unless GetOptions(
  \%opt,
  'conf|c=s',
  'vardir|d=s',
  'kill-timeout|t=i',
  'chdir=s',
  'svlogd:s'
);

usage() unless $opt{'conf'} && $opt{'vardir'};

# Read config file
my $config = read_config_file($opt{'conf'});
@commands = @{$config->{commands}};

if (!@commands) {
  die "Nothing to run.\n";
}

# Potentially override --kill-timeout (a value in the config file wins)
if (defined $config->{'kill-timeout'}) {
  $opt{'kill-timeout'} = $config->{'kill-timeout'};
}

# Remember where vardir, logdir, svdir are after chdiring
my $vardir = File::Spec->rel2abs($opt{vardir});
my $logdir = File::Spec->rel2abs(realpath($ENV{'DRUID_LOG_DIR'} || "$FindBin::Bin/../log"));
my $svdir = "$vardir/sv";

# chdir to the root of the distribution (or wherever)
chdir($opt{chdir}) or die "chdir[$opt{chdir}] failed: $!\n";

# Create vardir with tmp/
if (! -e "$vardir/tmp") {
  system("mkdir -p \Q$vardir\E/tmp") == 0 or die "mkdir $vardir/tmp failed: $!\n";
}

# Create svdir
if (! -e $svdir) {
  system("mkdir -p \Q$svdir\E") == 0 or die "mkdir $svdir failed: $!\n";
}

# Create logdir, if needed (with --svlogd, output goes under svdir instead)
if (!defined $opt{svlogd} && ! -e "$logdir") {
  system("mkdir -p \Q$logdir\E") == 0 or die "mkdir $logdir failed: $!\n";
}

# Lock svdir and keep it locked until we exit
my $lockfile = "$svdir/.lock";
open my $lockfh, ">", $lockfile or die "Cannot write to svdir, please check permissions: $svdir\n";
flock($lockfh, LOCK_EX | LOCK_NB) or die "Cannot lock svdir, maybe another 'supervise' is running: $svdir\n";

# Create control fifo in svdir
my $fifofh = open_control_fifo($svdir);

# Run verification commands; give up on the first one that fails
for my $verify_cmd (@{$config->{verify}}) {
  system($verify_cmd) == 0 or exit 1;
}

# Catch killy signals and do an orderly shutdown. Only the first signal counts:
# it records which signal to exit with and when to escalate to kill -9.
$SIG{HUP} = sub { if (!$killed) { $killed = 1; $killkill = time + $opt{'kill-timeout'}; } };
$SIG{INT} = sub { if (!$killed) { $killed = 2; $killkill = time + $opt{'kill-timeout'}; } };
$SIG{TERM} = sub { if (!$killed) { $killed = 15; $killkill = time + $opt{'kill-timeout'}; } };

# Build up control fifo command over multiple sysreads, potentially
my $fifobuffer = '';

if (defined $opt{svlogd}) {
  logit "Starting services with log directory [$svdir].";
} else {
  logit "Starting services with log directory [$logdir].";
}
| |
# Main supervise loop. Once per second: start any command that is not running,
# reap exited children, progress an orderly shutdown if one was requested, and
# process control commands arriving on the fifo.
while (1) {
  # Spawn new procs
  if (!$killed) {
    for my $command (grep { !$_->{pid} } @commands) {
      # Respect the back-off set when the command last exited
      next unless $command->{down} < time;

      my $pid = fork;

      if (!defined $pid) {
        # fork failed, so this is still the supervisor. It must not fall into
        # the child path below (that would exec the command in place of the
        # supervisor). Log it and retry after the usual back-off.
        logit "WARN: Could not fork for command[" . pretty($command->{name}, 'bold') . "]: $!";
        $command->{down} = time + 2;
        next;
      }

      if ($pid) {
        # Parent: remember the child
        $command->{pid} = $pid;
        next;
      }

      # Child from here on; it either execs the command or dies.
      setsid;

      if (defined $opt{'svlogd'}) {
        # If using svlogd, program output goes into the service directory. We do not use $logdir here.
        my $logfile = "$svdir/$command->{name}";
        logit "Running command[" . pretty($command->{name}, 'bold') . "]: $command->{command}";

        if (! -e $logfile) {
          system("mkdir -p \Q$logfile\E") == 0 or logdie "mkdir $logfile failed: $!\n";
        }

        if ($opt{'svlogd'}) {
          # --svlogd was given a file: use it as the svlogd config
          copy($opt{'svlogd'}, "$logfile/config") or logdie "Failed copying $opt{'svlogd'} to $logfile/config: $!";
        } else {
          # --svlogd without a value: write a default svlogd config
          open my $configfh, ">", "$logfile/config" or logdie "Cannot write svlogd config, please check permissions: $logfile/config\n";
          print $configfh "s100000000\nn10\nN5\nt604800";
          close $configfh;
        }

        # List form: the directory is passed to svlogd as a single argument,
        # without going through a shell.
        open STDOUT, "|-", "svlogd", $logfile or logdie "pipe to svlogd $logfile failed: $!\n";
      } else {
        # If not using svlogd, program output goes to $logdir. In the default configuration, this will be a small
        # amount of logging from the JVM itself, because all of the Druid and ZooKeeper logs are written into
        # separate files by log4j2.
        logit "Running command[" . pretty($command->{name}, 'bold') . "]: $command->{command}";
        my $logfile = "$logdir/$command->{name}.stdout.log";
        open STDOUT, ">>", $logfile or logdie "open $logfile failed: $!\n";
      }

      open STDERR, ">&STDOUT" or logdie "redirecting stderr failed: $!\n";
      exec('sh', '-c', "exec $command->{command}") or logdie "exec [$command->{command}] failed: $!";
    }
  }

  # Reap dead procs
  my $pid;
  while (($pid = waitpid(-1, WNOHANG)) > 0) {
    my $status = $?;
    my ($command) = (grep { $_->{pid} == $pid } @commands);
    if ($command) {
      $command->{pid} = 0;
      $command->{down} = time + 2;
      logit "Command[" . pretty($command->{name}, 'bold') . "] exited (pid = $pid, " . stringify_exit_status($status) . ")";
      if ($status && !$killed && !$command->{restarting}) {
        # Unexpected exit
        logit "Command[" . pretty($command->{name}, 'bold') . "] " . pretty("failed", "red") . ", see its logfile for more details";
      }
      $command->{restarting} = 0;
    } else {
      logit "ERR: Reaped unknown command (pid = $pid, " . stringify_exit_status($status) . ")";
    }
  }

  # Kill procs, maybe
  if ($killed) {
    my $should_killkill = time > $killkill;

    # Update stopping position, maybe
    if ($should_killkill) {
      # Timeout expired: everything still running becomes eligible
      $stopping = 0;
    } else {
      # Otherwise only lower the threshold to the highest order still running,
      # so commands are stopped in descending order.
      my $maxorder = 0;
      for my $command (grep { $_->{pid} } @commands) {
        if ($command->{order} > $maxorder) {
          $maxorder = $command->{order};
        }
      }

      if ($maxorder < $stopping) {
        $stopping = $maxorder;
      }
    }

    for my $command (grep { $_->{pid} && $_->{order} >= $stopping } @commands) {
      my $want_signal;
      if ($command->{killed} == 9 || $should_killkill) {
        $want_signal = 9;
      } else {
        $want_signal = 15;
      }

      # Send each signal at most once per command
      if ($command->{killed} != $want_signal) {
        if ($want_signal != 9) {
          my $kt = $opt{'kill-timeout'};
          logit "Sending signal[$want_signal] to command[" . pretty($command->{name}, 'bold') . "] (timeout ${kt}s).";
        } else {
          logit "Sending signal[$want_signal] to command[" . pretty($command->{name}, 'bold') . "].";
        }
        kill $want_signal, $command->{pid} or logit "WARN: Could not signal pid: $command->{pid}";
        $command->{killed} = $want_signal;
      }
    }
  }

  # Kill ourselves, maybe
  if ($killed && ! grep { $_->{pid} } @commands) {
    logit "Exiting.";
    $SIG{HUP} = $SIG{INT} = $SIG{TERM} = 'DEFAULT';
    if ($killed > 0) {
      # Re-raise the signal we were asked to stop with, now that the default
      # handlers are back; exit 1 is the fallback if that does not end us.
      kill $killed, $$;
      exit 1;
    } else {
      # Normal exit
      exit 0;
    }
  }

  # Be controlled, maybe
  my $fifostr = "";
  if (sysread $fifofh, $fifostr, 4096) {
    $fifobuffer .= $fifostr;

    # Consume complete (newline-terminated) commands; keep any partial tail buffered
    while ($fifobuffer =~ /^([^\n]*)\n(.*)/s) {
      my $fifocmd = $1;
      $fifobuffer = $2;
      if ($fifocmd =~ /^k (.+)$/ && !$killed) {
        # "k <name>": restart a running command by sending it TERM
        my $name = $1;
        my ($command) = grep { $_->{name} eq $name && $_->{pid} } @commands;
        if ($command) {
          logit "Restarting command[" . pretty($name, "bold") . "].";
          if (kill TERM => $command->{pid}) {
            $command->{restarting} = 1;
          } else {
            logit "WARN: Could not signal pid: $command->{pid}";
          }
        } else {
          logit "Asked to restart unknown command[" . pretty($name, "bold") . "], ignoring.";
        }
      } elsif ($fifocmd eq 'd') {
        # "d": shut everything down. -1 means exit without signal
        $killed = -1;
        $killkill = time + $opt{'kill-timeout'};
      } else {
        logit "Received unknown control command, ignoring.";
      }
    }
  }

  sleep 1;
}

exit 0;