supervise-daemon: add health checks
Health checks are a way to monitor a service and make sure it stays healthy. If a service is not healthy, it will be automatically restarted after running the unhealthy() function to clean up.
This commit is contained in:
		
							
								
								
									
										4
									
								
								NEWS.md
									
									
									
									
									
								
							
							
						
						
									
										4
									
								
								NEWS.md
									
									
									
									
									
								
							@@ -22,6 +22,10 @@ This version adds timed shutdown and cancelation of shutdown to
 | 
			
		||||
openrc-shutdown. Shutdowns can now be delayed for a certain amount of
 | 
			
		||||
time or scheduled for an exact time.
 | 
			
		||||
 | 
			
		||||
supervise-daemon supports health checks, which are a periodic way to make sure a
 | 
			
		||||
service is healthy. For more information on setting this up, please see
 | 
			
		||||
supervise-daemon-guide.md.
 | 
			
		||||
 | 
			
		||||
## OpenRC 0.37
 | 
			
		||||
 | 
			
		||||
start-stop-daemon now supports logging stdout and stderr of daemons to
 | 
			
		||||
 
 | 
			
		||||
@@ -16,6 +16,10 @@
 | 
			
		||||
.Nd starts a daemon and restarts it if it crashes
 | 
			
		||||
.Sh SYNOPSIS
 | 
			
		||||
.Nm
 | 
			
		||||
.Fl a , -healthcheck-timer
 | 
			
		||||
.Ar seconds
 | 
			
		||||
.Fl A , -healthcheck-delay
 | 
			
		||||
.Ar seconds
 | 
			
		||||
.Fl D , -respawn-delay
 | 
			
		||||
.Ar seconds
 | 
			
		||||
.Fl d , -chdir
 | 
			
		||||
@@ -90,6 +94,11 @@ Print the action(s) that are taken just before doing them.
 | 
			
		||||
.Pp
 | 
			
		||||
The options are as follows:
 | 
			
		||||
.Bl -tag -width indent
 | 
			
		||||
.It Fl a , -healthcheck-timer Ar seconds
 | 
			
		||||
Run the healthcheck() command, possibly followed by the unhealthy()
 | 
			
		||||
command every time this number of seconds passes.
 | 
			
		||||
.It Fl A , -healthcheck-delay Ar seconds
 | 
			
		||||
Wait this long before the first health check.
 | 
			
		||||
.It Fl D , -respawn-delay Ar seconds
 | 
			
		||||
Wait this number of seconds before restarting a daemon after it crashes.
 | 
			
		||||
The default is 0.
 | 
			
		||||
 
 | 
			
		||||
@@ -10,6 +10,8 @@
 | 
			
		||||
# This file may not be copied, modified, propagated, or distributed
 | 
			
		||||
#    except according to the terms contained in the LICENSE file.
 | 
			
		||||
 | 
			
		||||
extra_commands="healthcheck unhealthy ${extra_commands}"
 | 
			
		||||
 | 
			
		||||
supervise_start()
 | 
			
		||||
{
 | 
			
		||||
	if [ -z "$command" ]; then
 | 
			
		||||
@@ -32,6 +34,8 @@ supervise_start()
 | 
			
		||||
		${respawn_delay:+--respawn-delay} $respawn_delay \
 | 
			
		||||
		${respawn_max:+--respawn-max} $respawn_max \
 | 
			
		||||
		${respawn_period:+--respawn-period} $respawn_period \
 | 
			
		||||
		${healthcheck_delay:+--healthcheck-delay} $healthcheck_delay \
 | 
			
		||||
		${healthcheck_timer:+--healthcheck-timer} $healthcheck_timer \
 | 
			
		||||
		${command_user+--user} $command_user \
 | 
			
		||||
		${umask+--umask} $umask \
 | 
			
		||||
		${supervise_daemon_args:-${start_stop_daemon_args}} \
 | 
			
		||||
@@ -98,3 +102,13 @@ supervise_status()
 | 
			
		||||
		return 3
 | 
			
		||||
	fi
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
# Default health check: always reports the service as healthy by
# returning 0. The supervise-daemon guide asks service scripts to write
# their own healthcheck() that returns zero when the service is healthy
# and non-zero otherwise (presumably replacing this default -- confirm).
healthcheck()
 | 
			
		||||
{
 | 
			
		||||
	return 0
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
# Default unhealthy() hook: does nothing and returns 0. The supervisor
# runs the "unhealthy" command after a failed health check and then
# stops the daemon, so a service script's own unhealthy() only needs to
# perform cleanup and should not restart the service itself.
unhealthy()
 | 
			
		||||
{
 | 
			
		||||
	return 0
 | 
			
		||||
}
 | 
			
		||||
 
 | 
			
		||||
@@ -161,7 +161,7 @@ rc-update: rc-update.o _usage.o rc-misc.o
 | 
			
		||||
start-stop-daemon: start-stop-daemon.o _usage.o rc-misc.o rc-pipes.o rc-schedules.o
 | 
			
		||||
	${CC} ${LOCAL_CFLAGS} ${LOCAL_LDFLAGS} ${CFLAGS} ${LDFLAGS} -o $@ $^ ${LDADD}
 | 
			
		||||
 | 
			
		||||
supervise-daemon: supervise-daemon.o _usage.o rc-misc.o rc-schedules.o
 | 
			
		||||
supervise-daemon: supervise-daemon.o _usage.o rc-misc.o rc-plugin.o rc-schedules.o
 | 
			
		||||
	${CC} ${LOCAL_CFLAGS} ${LOCAL_LDFLAGS} ${CFLAGS} ${LDFLAGS} -o $@ $^ ${LDADD}
 | 
			
		||||
 | 
			
		||||
service_get_value service_set_value get_options save_options: do_value.o rc-misc.o
 | 
			
		||||
 
 | 
			
		||||
@@ -61,15 +61,18 @@ static struct pam_conv conv = { NULL, NULL};
 | 
			
		||||
#include "queue.h"
 | 
			
		||||
#include "rc.h"
 | 
			
		||||
#include "rc-misc.h"
 | 
			
		||||
#include "rc-plugin.h"
 | 
			
		||||
#include "rc-schedules.h"
 | 
			
		||||
#include "_usage.h"
 | 
			
		||||
#include "helpers.h"
 | 
			
		||||
 | 
			
		||||
const char *applet = NULL;
 | 
			
		||||
const char *extraopts = NULL;
 | 
			
		||||
const char *getoptstring = "D:d:e:g:I:Kk:m:N:p:R:r:Su:1:2:3" \
 | 
			
		||||
const char *getoptstring = "A:a:D:d:e:g:H:I:Kk:m:N:p:R:r:Su:1:2:3" \
 | 
			
		||||
	getoptstring_COMMON;
 | 
			
		||||
const struct option longopts[] = {
 | 
			
		||||
	{ "healthcheck-timer",        1, NULL, 'a'},
 | 
			
		||||
	{ "healthcheck-delay",        1, NULL, 'A'},
 | 
			
		||||
	{ "respawn-delay",        1, NULL, 'D'},
 | 
			
		||||
	{ "chdir",        1, NULL, 'd'},
 | 
			
		||||
	{ "env",          1, NULL, 'e'},
 | 
			
		||||
@@ -91,6 +94,8 @@ const struct option longopts[] = {
 | 
			
		||||
	longopts_COMMON
 | 
			
		||||
};
 | 
			
		||||
const char * const longopts_help[] = {
 | 
			
		||||
	"set an initial health check delay",
 | 
			
		||||
	"set a health check timer",
 | 
			
		||||
	"Set a respawn delay",
 | 
			
		||||
	"Change the PWD",
 | 
			
		||||
	"Set an environment string",
 | 
			
		||||
@@ -113,6 +118,9 @@ const char * const longopts_help[] = {
 | 
			
		||||
};
 | 
			
		||||
const char *usagestring = NULL;
 | 
			
		||||
 | 
			
		||||
static int healthcheckdelay = 0;
 | 
			
		||||
static int healthchecktimer = 0;
 | 
			
		||||
static volatile sig_atomic_t do_healthcheck = 0;
 | 
			
		||||
static int nicelevel = 0;
 | 
			
		||||
static int ionicec = -1;
 | 
			
		||||
static int ioniced = 0;
 | 
			
		||||
@@ -183,6 +191,12 @@ static void handle_signal(int sig)
 | 
			
		||||
		re_exec_supervisor();
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
/*
 * Signal handler installed for SIGALRM: records that a health check is
 * due by setting the do_healthcheck flag (a volatile sig_atomic_t).
 * Any other signal number is ignored. The supervisor loop tests and
 * clears the flag; no other work is done in signal context.
 */
static void healthcheck(int sig)
 | 
			
		||||
{
 | 
			
		||||
	if (sig == SIGALRM)
 | 
			
		||||
		do_healthcheck = 1;
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
static char * expand_home(const char *home, const char *path)
 | 
			
		||||
{
 | 
			
		||||
	char *opath, *ppath, *p, *nh;
 | 
			
		||||
@@ -423,11 +437,14 @@ static void child_process(char *exec, char **argv)
 | 
			
		||||
static void supervisor(char *exec, char **argv)
 | 
			
		||||
{
 | 
			
		||||
	FILE *fp;
 | 
			
		||||
	pid_t wait_pid;
 | 
			
		||||
	int i;
 | 
			
		||||
	int nkilled;
 | 
			
		||||
	struct timespec ts;
 | 
			
		||||
	time_t respawn_now= 0;
 | 
			
		||||
	time_t first_spawn= 0;
 | 
			
		||||
	pid_t health_pid;
 | 
			
		||||
	int health_status;
 | 
			
		||||
 | 
			
		||||
#ifndef RC_DEBUG
 | 
			
		||||
	signal_setup_restart(SIGHUP, handle_signal);
 | 
			
		||||
@@ -488,46 +505,88 @@ static void supervisor(char *exec, char **argv)
 | 
			
		||||
	 * Supervisor main loop
 | 
			
		||||
	 */
 | 
			
		||||
	i = 0;
 | 
			
		||||
	if (healthcheckdelay) {
 | 
			
		||||
		signal_setup(SIGALRM, healthcheck);
 | 
			
		||||
		alarm(healthcheckdelay);
 | 
			
		||||
	} else if (healthchecktimer) {
 | 
			
		||||
		signal_setup(SIGALRM, healthcheck);
 | 
			
		||||
		alarm(healthchecktimer);
 | 
			
		||||
	}
 | 
			
		||||
	while (!exiting) {
 | 
			
		||||
		wait(&i);
 | 
			
		||||
		if (exiting) {
 | 
			
		||||
			signal_setup(SIGCHLD, SIG_IGN);
 | 
			
		||||
			syslog(LOG_INFO, "stopping %s, pid %d", exec, child_pid);
 | 
			
		||||
			nkilled = run_stop_schedule(applet, exec, NULL, child_pid, 0,
 | 
			
		||||
					false, false, true);
 | 
			
		||||
			if (nkilled > 0)
 | 
			
		||||
				syslog(LOG_INFO, "killed %d processes", nkilled);
 | 
			
		||||
		} else {
 | 
			
		||||
			ts.tv_sec = respawn_delay;
 | 
			
		||||
			ts.tv_nsec = 0;
 | 
			
		||||
			nanosleep(&ts, NULL);
 | 
			
		||||
			if (respawn_max > 0 && respawn_period > 0) {
 | 
			
		||||
				respawn_now = time(NULL);
 | 
			
		||||
				if (first_spawn == 0)
 | 
			
		||||
					first_spawn = respawn_now;
 | 
			
		||||
				if (respawn_now - first_spawn > respawn_period) {
 | 
			
		||||
					respawn_count = 0;
 | 
			
		||||
					first_spawn = 0;
 | 
			
		||||
				} else
 | 
			
		||||
					respawn_count++;
 | 
			
		||||
				if (respawn_count > respawn_max) {
 | 
			
		||||
					syslog(LOG_WARNING,
 | 
			
		||||
							"respawned \"%s\" too many times, exiting", exec);
 | 
			
		||||
					exiting = true;
 | 
			
		||||
		wait_pid = wait(&i);
 | 
			
		||||
		if (wait_pid == -1) {
 | 
			
		||||
			if (do_healthcheck) {
 | 
			
		||||
				do_healthcheck = 0;
 | 
			
		||||
				alarm(0);
 | 
			
		||||
				syslog(LOG_DEBUG, "running health check for %s", svcname);
 | 
			
		||||
				health_pid = exec_service(svcname, "healthcheck");
 | 
			
		||||
				health_status = rc_waitpid(health_pid);
 | 
			
		||||
				if (WIFEXITED(health_status) && !WEXITSTATUS(health_status)) {
 | 
			
		||||
					alarm(healthchecktimer);
 | 
			
		||||
					continue;
 | 
			
		||||
				} else {
 | 
			
		||||
					syslog(LOG_WARNING, "health check for %s failed", svcname);
 | 
			
		||||
					health_pid = exec_service(svcname, "unhealthy");
 | 
			
		||||
					rc_waitpid(health_pid);
 | 
			
		||||
					syslog(LOG_INFO, "stopping %s, pid %d", exec, child_pid);
 | 
			
		||||
					nkilled = run_stop_schedule(applet, NULL, NULL, child_pid, 0,
 | 
			
		||||
							false, false, true);
 | 
			
		||||
					if (nkilled > 0)
 | 
			
		||||
						syslog(LOG_INFO, "killed %d processes", nkilled);
 | 
			
		||||
					else if (errno != 0)
 | 
			
		||||
						syslog(LOG_INFO, "Unable to kill %d: %s",
 | 
			
		||||
								child_pid, strerror(errno));
 | 
			
		||||
				}
 | 
			
		||||
			} else if (exiting ) {
 | 
			
		||||
				alarm(0);
 | 
			
		||||
				syslog(LOG_INFO, "stopping %s, pid %d", exec, child_pid);
 | 
			
		||||
				nkilled = run_stop_schedule(applet, exec, NULL, child_pid, 0,
 | 
			
		||||
						false, false, true);
 | 
			
		||||
				if (nkilled > 0)
 | 
			
		||||
					syslog(LOG_INFO, "killed %d processes", nkilled);
 | 
			
		||||
				continue;
 | 
			
		||||
			}
 | 
			
		||||
		} else if (wait_pid == child_pid) {
 | 
			
		||||
			if (WIFEXITED(i))
 | 
			
		||||
				syslog(LOG_WARNING, "%s, pid %d, exited with return code %d",
 | 
			
		||||
						exec, child_pid, WEXITSTATUS(i));
 | 
			
		||||
			else if (WIFSIGNALED(i))
 | 
			
		||||
				syslog(LOG_WARNING, "%s, pid %d, terminated by signal %d",
 | 
			
		||||
						exec, child_pid, WTERMSIG(i));
 | 
			
		||||
			child_pid = fork();
 | 
			
		||||
			if (child_pid == -1)
 | 
			
		||||
				eerrorx("%s: fork: %s", applet, strerror(errno));
 | 
			
		||||
			if (child_pid == 0)
 | 
			
		||||
				child_process(exec, argv);
 | 
			
		||||
		} else
 | 
			
		||||
			continue;
 | 
			
		||||
 | 
			
		||||
		ts.tv_sec = respawn_delay;
 | 
			
		||||
		ts.tv_nsec = 0;
 | 
			
		||||
		nanosleep(&ts, NULL);
 | 
			
		||||
		if (respawn_max > 0 && respawn_period > 0) {
 | 
			
		||||
			respawn_now = time(NULL);
 | 
			
		||||
			if (first_spawn == 0)
 | 
			
		||||
				first_spawn = respawn_now;
 | 
			
		||||
			if (respawn_now - first_spawn > respawn_period) {
 | 
			
		||||
				respawn_count = 0;
 | 
			
		||||
				first_spawn = 0;
 | 
			
		||||
			} else
 | 
			
		||||
				respawn_count++;
 | 
			
		||||
			if (respawn_count > respawn_max) {
 | 
			
		||||
				syslog(LOG_WARNING,
 | 
			
		||||
						"respawned \"%s\" too many times, exiting", exec);
 | 
			
		||||
				exiting = true;
 | 
			
		||||
				continue;
 | 
			
		||||
			}
 | 
			
		||||
		}
 | 
			
		||||
		alarm(0);
 | 
			
		||||
		child_pid = fork();
 | 
			
		||||
		if (child_pid == -1)
 | 
			
		||||
			eerrorx("%s: fork: %s", applet, strerror(errno));
 | 
			
		||||
		if (child_pid == 0)
 | 
			
		||||
			child_process(exec, argv);
 | 
			
		||||
		if (healthcheckdelay) {
 | 
			
		||||
			signal_setup(SIGALRM, healthcheck);
 | 
			
		||||
			alarm(healthcheckdelay);
 | 
			
		||||
		} else if (healthchecktimer) {
 | 
			
		||||
			signal_setup(SIGALRM, healthcheck);
 | 
			
		||||
			alarm(healthchecktimer);
 | 
			
		||||
		}
 | 
			
		||||
	}
 | 
			
		||||
 | 
			
		||||
@@ -612,6 +671,16 @@ int main(int argc, char **argv)
 | 
			
		||||
	while ((opt = getopt_long(argc, argv, getoptstring, longopts,
 | 
			
		||||
		    (int *) 0)) != -1)
 | 
			
		||||
		switch (opt) {
 | 
			
		||||
		case 'a':  /* --healthcheck-timer <time> */
 | 
			
		||||
			if (sscanf(optarg, "%d", &healthchecktimer) != 1 || healthchecktimer < 1)
 | 
			
		||||
				eerrorx("%s: invalid health check timer %s", applet, optarg);
 | 
			
		||||
			break;
 | 
			
		||||
 | 
			
		||||
		case 'A':  /* --healthcheck-delay <time> */
 | 
			
		||||
			if (sscanf(optarg, "%d", &healthcheckdelay) != 1 || healthcheckdelay < 1)
 | 
			
		||||
				eerrorx("%s: invalid health check delay %s", applet, optarg);
 | 
			
		||||
			break;
 | 
			
		||||
 | 
			
		||||
		case 'D':  /* --respawn-delay time */
 | 
			
		||||
			n = sscanf(optarg, "%d", &respawn_delay);
 | 
			
		||||
			if (n	!= 1 || respawn_delay < 1)
 | 
			
		||||
@@ -668,6 +737,11 @@ int main(int argc, char **argv)
 | 
			
		||||
			gid = gr->gr_gid;
 | 
			
		||||
			break;
 | 
			
		||||
 | 
			
		||||
		case 'H':  /* --healthcheck-timer <minutes> */
 | 
			
		||||
			if (sscanf(optarg, "%d", &healthchecktimer) != 1 || healthchecktimer < 1)
 | 
			
		||||
				eerrorx("%s: invalid health check timer %s", applet, optarg);
 | 
			
		||||
			break;
 | 
			
		||||
 | 
			
		||||
		case 'k':
 | 
			
		||||
			if (parse_mode(&numask, optarg))
 | 
			
		||||
				eerrorx("%s: invalid mode `%s'",
 | 
			
		||||
 
 | 
			
		||||
@@ -22,6 +22,28 @@ The following is a brief guide on using this capability.
 | 
			
		||||
  instructs it not to fork to the command_args_foreground variable shown
 | 
			
		||||
  below.
 | 
			
		||||
 | 
			
		||||
# Health Checks
 | 
			
		||||
 | 
			
		||||
Health checks are a way to make sure a service monitored by
 | 
			
		||||
supervise-daemon stays healthy. To configure a health check for a
 | 
			
		||||
service, you need to write a healthcheck() function, and optionally an
 | 
			
		||||
unhealthy() function in the service script. Also, you will need to set
 | 
			
		||||
the healthcheck_timer and optionally healthcheck_delay variables.
 | 
			
		||||
 | 
			
		||||
## healthcheck() function
 | 
			
		||||
 | 
			
		||||
The healthcheck() function is run repeatedly based on the settings of
 | 
			
		||||
the healthcheck_* variables. This function should return zero if the
 | 
			
		||||
service is currently healthy or non-zero otherwise.
 | 
			
		||||
 | 
			
		||||
## unhealthy() function
 | 
			
		||||
 | 
			
		||||
If the healthcheck() function returns non-zero, the unhealthy() function
 | 
			
		||||
is run, then the service is restarted. Since the service will be
 | 
			
		||||
restarted by the supervisor, the unhealthy function should not try to
 | 
			
		||||
restart it; the purpose of the function is to allow any cleanup tasks
 | 
			
		||||
other than restarting the service to be run.
 | 
			
		||||
 | 
			
		||||
# Variable Settings
 | 
			
		||||
 | 
			
		||||
The most important setting is the supervisor variable. At the top of
 | 
			
		||||
@@ -52,6 +74,20 @@ This 	should be used if the daemon you want to monitor
 | 
			
		||||
forks and goes to the background by default. This should be set to the
 | 
			
		||||
command line option that instructs the daemon to stay in the foreground.
 | 
			
		||||
 | 
			
		||||
``` sh
 | 
			
		||||
healthcheck_delay=seconds
 | 
			
		||||
```
 | 
			
		||||
 | 
			
		||||
This is the delay, in seconds, before the first health check is run.
 | 
			
		||||
If it is not set, we use the value of healthcheck_timer.
 | 
			
		||||
 | 
			
		||||
``` sh
 | 
			
		||||
healthcheck_timer=seconds
 | 
			
		||||
```
 | 
			
		||||
 | 
			
		||||
This is the number of seconds between health checks. If it is not set,
 | 
			
		||||
no health checks will be run.
 | 
			
		||||
 | 
			
		||||
``` sh
 | 
			
		||||
respawn_delay
 | 
			
		||||
```
 | 
			
		||||
 
 | 
			
		||||
		Reference in New Issue
	
	Block a user