From f218ec229cd0ed4df0b19d9f6f92e082d89d1b46 Mon Sep 17 00:00:00 2001 From: Donald Buczek Date: Wed, 15 Sep 2021 21:44:34 +0200 Subject: [PATCH] mxqd: Verify names of reaper processes on restart When the daemon restarts, it has to figure out which of the jobs that the database shows as running on the server are in fact still running and which are gone. Currently we only check whether the process with the pid from the database still exists. However, this can give wrong results if the pid of a job is reused after a system reboot or after a pid wrap. mxqd might regard an unrelated process as one of its jobs and nanny and kill it. Update code to only regard a process as a running mxqd job if its "comm"-value (/proc/PID/comm) is "mxqd reaper". --- mxqd.c | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/mxqd.c b/mxqd.c index e6b78f54..bff62e82 100644 --- a/mxqd.c +++ b/mxqd.c @@ -1187,6 +1187,16 @@ int user_process(struct mxq_group_list *glist, struct mxq_job *job) static const char REAPER_PNAME[] = "mxqd reaper"; +static int is_reaper(pid_t pid) { + char comm[16]; + if (mx_proc_get_comm(pid, comm) == NULL) + return 0; + if (strcmp(comm, REAPER_PNAME) == 0) + return 1; + else + return 0; +} + int reaper_process(struct mxq_server *server,struct mxq_group_list *glist, struct mxq_job *job) { pid_t pid; struct rusage rusage; @@ -2285,8 +2295,6 @@ static int lost_scan_one(struct mxq_server *server) struct mxq_job *job; - int res; - for (ulist = server->users; ulist; ulist = ulist->next) { for (glist = ulist->groups; glist; glist = glist->next) { for (jlist = glist->jobs; jlist; jlist = jlist->next) { @@ -2304,13 +2312,9 @@ static int lost_scan_one(struct mxq_server *server) continue; } - res = kill(job->host_pid, 0); - if (res >= 0) + if (is_reaper(job->host_pid)) continue; - if (errno != ESRCH) - return -errno; - if (!fspool_file_exists(server, job->job_id)) { mx_log_warning("pid %u: process is gone. 
setting job status of job %lu to unknown.", jlist->job.host_pid,