supervise-daemon: add health checks
Health checks are a way to monitor a service and make sure it stays healthy. If a service is not healthy, it will be automatically restarted after running the unhealthy() function to clean up.
This commit is contained in:
parent
7a75bfb00c
commit
c1e582586d
4
NEWS.md
4
NEWS.md
@ -22,6 +22,10 @@ This version adds timed shutdown and cancelation of shutdown to
|
|||||||
openrc-shutdown. Shutdowns can now be delayed for a certain amount of
|
openrc-shutdown. Shutdowns can now be delayed for a certain amount of
|
||||||
time or scheduled for an exact time.
|
time or scheduled for an exact time.
|
||||||
|
|
||||||
|
supervise-daemon supports health checks, which are a periodic way to make sure a
|
||||||
|
service is healthy. For more information on setting this up, please see
|
||||||
|
supervise-daemon-guide.md.
|
||||||
|
|
||||||
## OpenRC 0.37
|
## OpenRC 0.37
|
||||||
|
|
||||||
start-stop-daemon now supports logging stdout and stderr of daemons to
|
start-stop-daemon now supports logging stdout and stderr of daemons to
|
||||||
|
@ -16,6 +16,10 @@
|
|||||||
.Nd starts a daemon and restarts it if it crashes
|
.Nd starts a daemon and restarts it if it crashes
|
||||||
.Sh SYNOPSIS
|
.Sh SYNOPSIS
|
||||||
.Nm
|
.Nm
|
||||||
|
.Fl a , -healthcheck-timer
|
||||||
|
.Ar seconds
|
||||||
|
.Fl A , -healthcheck-delay
|
||||||
|
.Ar seconds
|
||||||
.Fl D , -respawn-delay
|
.Fl D , -respawn-delay
|
||||||
.Ar seconds
|
.Ar seconds
|
||||||
.Fl d , -chdir
|
.Fl d , -chdir
|
||||||
@ -90,6 +94,11 @@ Print the action(s) that are taken just before doing them.
|
|||||||
.Pp
|
.Pp
|
||||||
The options are as follows:
|
The options are as follows:
|
||||||
.Bl -tag -width indent
|
.Bl -tag -width indent
|
||||||
|
.It Fl a , -healthcheck-timer Ar seconds
|
||||||
|
Run the healthcheck() command, possibly followed by the unhealthy()
|
||||||
|
command every time this number of seconds passes.
|
||||||
|
.It Fl A , -healthcheck-delay Ar seconds
|
||||||
|
Wait this long before the first health check.
|
||||||
.It Fl D , -respawn-delay Ar seconds
|
.It Fl D , -respawn-delay Ar seconds
|
||||||
wait this number of seconds before restarting a daemon after it crashes.
|
wait this number of seconds before restarting a daemon after it crashes.
|
||||||
The default is 0.
|
The default is 0.
|
||||||
|
@ -10,6 +10,8 @@
|
|||||||
# This file may not be copied, modified, propagated, or distributed
|
# This file may not be copied, modified, propagated, or distributed
|
||||||
# except according to the terms contained in the LICENSE file.
|
# except according to the terms contained in the LICENSE file.
|
||||||
|
|
||||||
|
extra_commands="healthcheck unhealthy ${extra_commands}"
|
||||||
|
|
||||||
supervise_start()
|
supervise_start()
|
||||||
{
|
{
|
||||||
if [ -z "$command" ]; then
|
if [ -z "$command" ]; then
|
||||||
@ -32,6 +34,8 @@ supervise_start()
|
|||||||
${respawn_delay:+--respawn-delay} $respawn_delay \
|
${respawn_delay:+--respawn-delay} $respawn_delay \
|
||||||
${respawn_max:+--respawn-max} $respawn_max \
|
${respawn_max:+--respawn-max} $respawn_max \
|
||||||
${respawn_period:+--respawn-period} $respawn_period \
|
${respawn_period:+--respawn-period} $respawn_period \
|
||||||
|
${healthcheck_delay:+--healthcheck-delay} $healthcheck_delay \
|
||||||
|
${healthcheck_timer:+--healthcheck-timer} $healthcheck_timer \
|
||||||
${command_user+--user} $command_user \
|
${command_user+--user} $command_user \
|
||||||
${umask+--umask} $umask \
|
${umask+--umask} $umask \
|
||||||
${supervise_daemon_args:-${start_stop_daemon_args}} \
|
${supervise_daemon_args:-${start_stop_daemon_args}} \
|
||||||
@ -98,3 +102,13 @@ supervise_status()
|
|||||||
return 3
|
return 3
|
||||||
fi
|
fi
|
||||||
}
|
}
|
||||||
|
|
||||||
|
# Default health check stub: always reports success (exit status 0).
# A service script that wants real health checking defines its own
# healthcheck() that returns non-zero when the service is unhealthy.
healthcheck() {
	return 0
}
|
||||||
|
|
||||||
|
# Default cleanup stub run after a failed health check: does nothing and
# returns success. A service script may define its own unhealthy() to
# perform cleanup before the supervisor restarts the service.
unhealthy() {
	return 0
}
|
||||||
|
@ -161,7 +161,7 @@ rc-update: rc-update.o _usage.o rc-misc.o
|
|||||||
start-stop-daemon: start-stop-daemon.o _usage.o rc-misc.o rc-pipes.o rc-schedules.o
|
start-stop-daemon: start-stop-daemon.o _usage.o rc-misc.o rc-pipes.o rc-schedules.o
|
||||||
${CC} ${LOCAL_CFLAGS} ${LOCAL_LDFLAGS} ${CFLAGS} ${LDFLAGS} -o $@ $^ ${LDADD}
|
${CC} ${LOCAL_CFLAGS} ${LOCAL_LDFLAGS} ${CFLAGS} ${LDFLAGS} -o $@ $^ ${LDADD}
|
||||||
|
|
||||||
supervise-daemon: supervise-daemon.o _usage.o rc-misc.o rc-schedules.o
|
supervise-daemon: supervise-daemon.o _usage.o rc-misc.o rc-plugin.o rc-schedules.o
|
||||||
${CC} ${LOCAL_CFLAGS} ${LOCAL_LDFLAGS} ${CFLAGS} ${LDFLAGS} -o $@ $^ ${LDADD}
|
${CC} ${LOCAL_CFLAGS} ${LOCAL_LDFLAGS} ${CFLAGS} ${LDFLAGS} -o $@ $^ ${LDADD}
|
||||||
|
|
||||||
service_get_value service_set_value get_options save_options: do_value.o rc-misc.o
|
service_get_value service_set_value get_options save_options: do_value.o rc-misc.o
|
||||||
|
@ -61,15 +61,18 @@ static struct pam_conv conv = { NULL, NULL};
|
|||||||
#include "queue.h"
|
#include "queue.h"
|
||||||
#include "rc.h"
|
#include "rc.h"
|
||||||
#include "rc-misc.h"
|
#include "rc-misc.h"
|
||||||
|
#include "rc-plugin.h"
|
||||||
#include "rc-schedules.h"
|
#include "rc-schedules.h"
|
||||||
#include "_usage.h"
|
#include "_usage.h"
|
||||||
#include "helpers.h"
|
#include "helpers.h"
|
||||||
|
|
||||||
const char *applet = NULL;
|
const char *applet = NULL;
|
||||||
const char *extraopts = NULL;
|
const char *extraopts = NULL;
|
||||||
const char *getoptstring = "D:d:e:g:I:Kk:m:N:p:R:r:Su:1:2:3" \
|
const char *getoptstring = "A:a:D:d:e:g:H:I:Kk:m:N:p:R:r:Su:1:2:3" \
|
||||||
getoptstring_COMMON;
|
getoptstring_COMMON;
|
||||||
const struct option longopts[] = {
|
const struct option longopts[] = {
|
||||||
|
{ "healthcheck-timer", 1, NULL, 'a'},
|
||||||
|
{ "healthcheck-delay", 1, NULL, 'A'},
|
||||||
{ "respawn-delay", 1, NULL, 'D'},
|
{ "respawn-delay", 1, NULL, 'D'},
|
||||||
{ "chdir", 1, NULL, 'd'},
|
{ "chdir", 1, NULL, 'd'},
|
||||||
{ "env", 1, NULL, 'e'},
|
{ "env", 1, NULL, 'e'},
|
||||||
@ -91,6 +94,8 @@ const struct option longopts[] = {
|
|||||||
longopts_COMMON
|
longopts_COMMON
|
||||||
};
|
};
|
||||||
const char * const longopts_help[] = {
|
const char * const longopts_help[] = {
|
||||||
|
"set an initial health check delay",
|
||||||
|
"set a health check timer",
|
||||||
"Set a respawn delay",
|
"Set a respawn delay",
|
||||||
"Change the PWD",
|
"Change the PWD",
|
||||||
"Set an environment string",
|
"Set an environment string",
|
||||||
@ -113,6 +118,9 @@ const char * const longopts_help[] = {
|
|||||||
};
|
};
|
||||||
const char *usagestring = NULL;
|
const char *usagestring = NULL;
|
||||||
|
|
||||||
|
static int healthcheckdelay = 0;
|
||||||
|
static int healthchecktimer = 0;
|
||||||
|
static volatile sig_atomic_t do_healthcheck = 0;
|
||||||
static int nicelevel = 0;
|
static int nicelevel = 0;
|
||||||
static int ionicec = -1;
|
static int ionicec = -1;
|
||||||
static int ioniced = 0;
|
static int ioniced = 0;
|
||||||
@ -183,6 +191,12 @@ static void handle_signal(int sig)
|
|||||||
re_exec_supervisor();
|
re_exec_supervisor();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static void healthcheck(int sig)
|
||||||
|
{
|
||||||
|
if (sig == SIGALRM)
|
||||||
|
do_healthcheck = 1;
|
||||||
|
}
|
||||||
|
|
||||||
static char * expand_home(const char *home, const char *path)
|
static char * expand_home(const char *home, const char *path)
|
||||||
{
|
{
|
||||||
char *opath, *ppath, *p, *nh;
|
char *opath, *ppath, *p, *nh;
|
||||||
@ -423,11 +437,14 @@ static void child_process(char *exec, char **argv)
|
|||||||
static void supervisor(char *exec, char **argv)
|
static void supervisor(char *exec, char **argv)
|
||||||
{
|
{
|
||||||
FILE *fp;
|
FILE *fp;
|
||||||
|
pid_t wait_pid;
|
||||||
int i;
|
int i;
|
||||||
int nkilled;
|
int nkilled;
|
||||||
struct timespec ts;
|
struct timespec ts;
|
||||||
time_t respawn_now= 0;
|
time_t respawn_now= 0;
|
||||||
time_t first_spawn= 0;
|
time_t first_spawn= 0;
|
||||||
|
pid_t health_pid;
|
||||||
|
int health_status;
|
||||||
|
|
||||||
#ifndef RC_DEBUG
|
#ifndef RC_DEBUG
|
||||||
signal_setup_restart(SIGHUP, handle_signal);
|
signal_setup_restart(SIGHUP, handle_signal);
|
||||||
@ -488,46 +505,88 @@ static void supervisor(char *exec, char **argv)
|
|||||||
* Supervisor main loop
|
* Supervisor main loop
|
||||||
*/
|
*/
|
||||||
i = 0;
|
i = 0;
|
||||||
|
if (healthcheckdelay) {
|
||||||
|
signal_setup(SIGALRM, healthcheck);
|
||||||
|
alarm(healthcheckdelay);
|
||||||
|
} else if (healthchecktimer) {
|
||||||
|
signal_setup(SIGALRM, healthcheck);
|
||||||
|
alarm(healthchecktimer);
|
||||||
|
}
|
||||||
while (!exiting) {
|
while (!exiting) {
|
||||||
wait(&i);
|
wait_pid = wait(&i);
|
||||||
if (exiting) {
|
if (wait_pid == -1) {
|
||||||
signal_setup(SIGCHLD, SIG_IGN);
|
if (do_healthcheck) {
|
||||||
syslog(LOG_INFO, "stopping %s, pid %d", exec, child_pid);
|
do_healthcheck = 0;
|
||||||
nkilled = run_stop_schedule(applet, exec, NULL, child_pid, 0,
|
alarm(0);
|
||||||
false, false, true);
|
syslog(LOG_DEBUG, "running health check for %s", svcname);
|
||||||
if (nkilled > 0)
|
health_pid = exec_service(svcname, "healthcheck");
|
||||||
syslog(LOG_INFO, "killed %d processes", nkilled);
|
health_status = rc_waitpid(health_pid);
|
||||||
} else {
|
if (WIFEXITED(health_status) && !WEXITSTATUS(health_status)) {
|
||||||
ts.tv_sec = respawn_delay;
|
alarm(healthchecktimer);
|
||||||
ts.tv_nsec = 0;
|
|
||||||
nanosleep(&ts, NULL);
|
|
||||||
if (respawn_max > 0 && respawn_period > 0) {
|
|
||||||
respawn_now = time(NULL);
|
|
||||||
if (first_spawn == 0)
|
|
||||||
first_spawn = respawn_now;
|
|
||||||
if (respawn_now - first_spawn > respawn_period) {
|
|
||||||
respawn_count = 0;
|
|
||||||
first_spawn = 0;
|
|
||||||
} else
|
|
||||||
respawn_count++;
|
|
||||||
if (respawn_count > respawn_max) {
|
|
||||||
syslog(LOG_WARNING,
|
|
||||||
"respawned \"%s\" too many times, exiting", exec);
|
|
||||||
exiting = true;
|
|
||||||
continue;
|
continue;
|
||||||
|
} else {
|
||||||
|
syslog(LOG_WARNING, "health check for %s failed", svcname);
|
||||||
|
health_pid = exec_service(svcname, "unhealthy");
|
||||||
|
rc_waitpid(health_pid);
|
||||||
|
syslog(LOG_INFO, "stopping %s, pid %d", exec, child_pid);
|
||||||
|
nkilled = run_stop_schedule(applet, NULL, NULL, child_pid, 0,
|
||||||
|
false, false, true);
|
||||||
|
if (nkilled > 0)
|
||||||
|
syslog(LOG_INFO, "killed %d processes", nkilled);
|
||||||
|
else if (errno != 0)
|
||||||
|
syslog(LOG_INFO, "Unable to kill %d: %s",
|
||||||
|
child_pid, strerror(errno));
|
||||||
}
|
}
|
||||||
|
} else if (exiting ) {
|
||||||
|
alarm(0);
|
||||||
|
syslog(LOG_INFO, "stopping %s, pid %d", exec, child_pid);
|
||||||
|
nkilled = run_stop_schedule(applet, exec, NULL, child_pid, 0,
|
||||||
|
false, false, true);
|
||||||
|
if (nkilled > 0)
|
||||||
|
syslog(LOG_INFO, "killed %d processes", nkilled);
|
||||||
|
continue;
|
||||||
}
|
}
|
||||||
|
} else if (wait_pid == child_pid) {
|
||||||
if (WIFEXITED(i))
|
if (WIFEXITED(i))
|
||||||
syslog(LOG_WARNING, "%s, pid %d, exited with return code %d",
|
syslog(LOG_WARNING, "%s, pid %d, exited with return code %d",
|
||||||
exec, child_pid, WEXITSTATUS(i));
|
exec, child_pid, WEXITSTATUS(i));
|
||||||
else if (WIFSIGNALED(i))
|
else if (WIFSIGNALED(i))
|
||||||
syslog(LOG_WARNING, "%s, pid %d, terminated by signal %d",
|
syslog(LOG_WARNING, "%s, pid %d, terminated by signal %d",
|
||||||
exec, child_pid, WTERMSIG(i));
|
exec, child_pid, WTERMSIG(i));
|
||||||
child_pid = fork();
|
} else
|
||||||
if (child_pid == -1)
|
continue;
|
||||||
eerrorx("%s: fork: %s", applet, strerror(errno));
|
|
||||||
if (child_pid == 0)
|
ts.tv_sec = respawn_delay;
|
||||||
child_process(exec, argv);
|
ts.tv_nsec = 0;
|
||||||
|
nanosleep(&ts, NULL);
|
||||||
|
if (respawn_max > 0 && respawn_period > 0) {
|
||||||
|
respawn_now = time(NULL);
|
||||||
|
if (first_spawn == 0)
|
||||||
|
first_spawn = respawn_now;
|
||||||
|
if (respawn_now - first_spawn > respawn_period) {
|
||||||
|
respawn_count = 0;
|
||||||
|
first_spawn = 0;
|
||||||
|
} else
|
||||||
|
respawn_count++;
|
||||||
|
if (respawn_count > respawn_max) {
|
||||||
|
syslog(LOG_WARNING,
|
||||||
|
"respawned \"%s\" too many times, exiting", exec);
|
||||||
|
exiting = true;
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
alarm(0);
|
||||||
|
child_pid = fork();
|
||||||
|
if (child_pid == -1)
|
||||||
|
eerrorx("%s: fork: %s", applet, strerror(errno));
|
||||||
|
if (child_pid == 0)
|
||||||
|
child_process(exec, argv);
|
||||||
|
if (healthcheckdelay) {
|
||||||
|
signal_setup(SIGALRM, healthcheck);
|
||||||
|
alarm(healthcheckdelay);
|
||||||
|
} else if (healthchecktimer) {
|
||||||
|
signal_setup(SIGALRM, healthcheck);
|
||||||
|
alarm(healthchecktimer);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -612,6 +671,16 @@ int main(int argc, char **argv)
|
|||||||
while ((opt = getopt_long(argc, argv, getoptstring, longopts,
|
while ((opt = getopt_long(argc, argv, getoptstring, longopts,
|
||||||
(int *) 0)) != -1)
|
(int *) 0)) != -1)
|
||||||
switch (opt) {
|
switch (opt) {
|
||||||
|
case 'a': /* --healthcheck-timer <time> */
|
||||||
|
if (sscanf(optarg, "%d", &healthchecktimer) != 1 || healthchecktimer < 1)
|
||||||
|
eerrorx("%s: invalid health check timer %s", applet, optarg);
|
||||||
|
break;
|
||||||
|
|
||||||
|
case 'A': /* --healthcheck-delay <time> */
|
||||||
|
if (sscanf(optarg, "%d", &healthcheckdelay) != 1 || healthcheckdelay < 1)
|
||||||
|
eerrorx("%s: invalid health check delay %s", applet, optarg);
|
||||||
|
break;
|
||||||
|
|
||||||
case 'D': /* --respawn-delay time */
|
case 'D': /* --respawn-delay time */
|
||||||
n = sscanf(optarg, "%d", &respawn_delay);
|
n = sscanf(optarg, "%d", &respawn_delay);
|
||||||
if (n != 1 || respawn_delay < 1)
|
if (n != 1 || respawn_delay < 1)
|
||||||
@ -668,6 +737,11 @@ int main(int argc, char **argv)
|
|||||||
gid = gr->gr_gid;
|
gid = gr->gr_gid;
|
||||||
break;
|
break;
|
||||||
|
|
||||||
|
case 'H': /* --healthcheck-timer <minutes> */
|
||||||
|
if (sscanf(optarg, "%d", &healthchecktimer) != 1 || healthchecktimer < 1)
|
||||||
|
eerrorx("%s: invalid health check timer %s", applet, optarg);
|
||||||
|
break;
|
||||||
|
|
||||||
case 'k':
|
case 'k':
|
||||||
if (parse_mode(&numask, optarg))
|
if (parse_mode(&numask, optarg))
|
||||||
eerrorx("%s: invalid mode `%s'",
|
eerrorx("%s: invalid mode `%s'",
|
||||||
|
@ -22,6 +22,28 @@ The following is a brief guide on using this capability.
|
|||||||
instructs it not to fork to the command_args_foreground variable shown
|
instructs it not to fork to the command_args_foreground variable shown
|
||||||
below.
|
below.
|
||||||
|
|
||||||
|
# Health Checks
|
||||||
|
|
||||||
|
Health checks are a way to make sure a service monitored by
|
||||||
|
supervise-daemon stays healthy. To configure a health check for a
|
||||||
|
service, you need to write a healthcheck() function, and optionally an
|
||||||
|
unhealthy() function in the service script. Also, you will need to set
|
||||||
|
the healthcheck_timer and optionally healthcheck_delay variables.
|
||||||
|
|
||||||
|
## healthcheck() function
|
||||||
|
|
||||||
|
The healthcheck() function is run repeatedly based on the settings of
|
||||||
|
the healthcheck_* variables. This function should return zero if the
|
||||||
|
service is currently healthy or non-zero otherwise.
|
||||||
|
|
||||||
|
## unhealthy() function
|
||||||
|
|
||||||
|
If the healthcheck() function returns non-zero, the unhealthy() function
|
||||||
|
is run, then the service is restarted. Since the service will be
|
||||||
|
restarted by the supervisor, the unhealthy function should not try to
|
||||||
|
restart it; the purpose of the function is to allow any cleanup tasks
|
||||||
|
other than restarting the service to be run.
|
||||||
|
|
||||||
# Variable Settings
|
# Variable Settings
|
||||||
|
|
||||||
The most important setting is the supervisor variable. At the top of
|
The most important setting is the supervisor variable. At the top of
|
||||||
@ -52,6 +74,20 @@ This should be used if the daemon you want to monitor
|
|||||||
forks and goes to the background by default. This should be set to the
|
forks and goes to the background by default. This should be set to the
|
||||||
command line option that instructs the daemon to stay in the foreground.
|
command line option that instructs the daemon to stay in the foreground.
|
||||||
|
|
||||||
|
``` sh
|
||||||
|
healthcheck_delay=seconds
|
||||||
|
```
|
||||||
|
|
||||||
|
This is the delay, in seconds, before the first health check is run.
|
||||||
|
If it is not set, we use the value of healthcheck_timer.
|
||||||
|
|
||||||
|
``` sh
|
||||||
|
healthcheck_timer=seconds
|
||||||
|
```
|
||||||
|
|
||||||
|
This is the number of seconds between health checks. If it is not set,
|
||||||
|
no health checks will be run.
|
||||||
|
|
||||||
``` sh
|
``` sh
|
||||||
respawn_delay
|
respawn_delay
|
||||||
```
|
```
|
||||||
|
Loading…
Reference in New Issue
Block a user