From 89aca1f2a06140bf556d5460963be0d213a58657 Mon Sep 17 00:00:00 2001 From: Donald Buczek Date: Wed, 15 Sep 2021 21:41:52 +0200 Subject: [PATCH 1/3] mx_proc: Add mx_proc_get_comm Add function to get "comm"-value of a process. --- mx_proc.c | 24 ++++++++++++++++++++++++ mx_proc.h | 1 + 2 files changed, 25 insertions(+) diff --git a/mx_proc.c b/mx_proc.c index 129d7216..4edc2b2a 100644 --- a/mx_proc.c +++ b/mx_proc.c @@ -484,3 +484,27 @@ pid_t mx_proc_get_parent(pid_t pid) { return -1; return parent; } + +/* + * get "comm" value of a process. + * buf MUST point to char[16] array + * return NULL on any error, otherwise pointer to buf + */ +char *mx_proc_get_comm(pid_t pid, char *buf) { + char commfilename[32]; // "/proc/2147483647/comm" = 22 + FILE *commfile; + char *p; + + snprintf(commfilename, sizeof(commfilename), "/proc/%d/comm", pid); + commfile = fopen(commfilename, "r"); + if (commfile == NULL) + return NULL; + p = fgets(buf, 16, commfile); + fclose(commfile); + if (p == NULL) + return NULL; + p = strrchr(buf, '\n'); + if (p != NULL) + *p = '\0'; + return (char *)buf; +} diff --git a/mx_proc.h b/mx_proc.h index 1fe07f6f..d4abcb35 100644 --- a/mx_proc.h +++ b/mx_proc.h @@ -86,4 +86,5 @@ int mx_proc_tree_free(struct mx_proc_tree **tree); struct mx_proc_info *mx_proc_tree_proc_info(struct mx_proc_tree *tree, pid_t pid); pid_t mx_proc_get_parent(pid_t pid); +char *mx_proc_get_comm(pid_t pid, char *buf); #endif From f218ec229cd0ed4df0b19d9f6f92e082d89d1b46 Mon Sep 17 00:00:00 2001 From: Donald Buczek Date: Wed, 15 Sep 2021 21:44:34 +0200 Subject: [PATCH 2/3] mxqd: Verify names of reaper processes no restart When the daemon restarts, it has to figure out which of the jobs, the database shows as running on the sever, are in fact still running and which are gone. Currently we only check, whether the process with the pid from the database still exists. However, this can give wrong results if the pid of a job is reused after a system reboot or after a pid wrap. mxqd might regard an unrelated process as one of its jobs and nanny and kill it. Update code to only regard a proces as a running mxqd job if its "comm"-value (/proc/PID/comm) is "mxqd reaper". --- mxqd.c | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/mxqd.c b/mxqd.c index e6b78f54..bff62e82 100644 --- a/mxqd.c +++ b/mxqd.c @@ -1187,6 +1187,16 @@ int user_process(struct mxq_group_list *glist, struct mxq_job *job) static const char REAPER_PNAME[] = "mxqd reaper"; +static int is_reaper(pid_t pid) { + char comm[16]; + if (mx_proc_get_comm(pid, comm) == NULL) + return 0; + if (strcmp(comm, REAPER_PNAME) == 0) + return 1; + else + return 0; +} + int reaper_process(struct mxq_server *server,struct mxq_group_list *glist, struct mxq_job *job) { pid_t pid; struct rusage rusage; @@ -2285,8 +2295,6 @@ static int lost_scan_one(struct mxq_server *server) struct mxq_job *job; - int res; - for (ulist = server->users; ulist; ulist = ulist->next) { for (glist = ulist->groups; glist; glist = glist->next) { for (jlist = glist->jobs; jlist; jlist = jlist->next) { @@ -2304,13 +2312,9 @@ static int lost_scan_one(struct mxq_server *server) continue; } - res = kill(job->host_pid, 0); - if (res >= 0) + if (is_reaper(job->host_pid)) continue; - if (errno != ESRCH) - return -errno; - if (!fspool_file_exists(server, job->job_id)) { mx_log_warning("pid %u: process is gone. setting job status of job %lu to unknown.", jlist->job.host_pid, From 035059c06cbe85e630331ade4fe139ff6de1b8be Mon Sep 17 00:00:00 2001 From: Donald Buczek Date: Thu, 16 Sep 2021 09:33:52 +0200 Subject: [PATCH 3/3] mxqd: Release mxqd lock in reaper mxqd hold a flock lock on /dev/shm/mxqd.HOST.DAEMON.lck to avoid being run multiple times. The open file and the lock is inherited by forked children. It is lost when the child does an execve(), because the file is opened O_CLOEXEC. However, the reaper is a long running forked child which doesen't do execve(), so it holds the lock as well. Usually this isn't a problem, because if mxqd terminates via one of the signals defined for that purpose, it will unlink the lock file before terminating. When it is restarted, a new file is generated and the lock on the new file is not in conflict with locks on the unlinked file. Only if mqxd is terminated by SIGKILL (which can't be ignored or handled) it does not clean up the lock file. In that case, trying to restart the daemon can fail because of the locks held by jobs of the previous daemon. Unlock the lock in the reaper after it has been forked. --- mxqd.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/mxqd.c b/mxqd.c index bff62e82..b960426a 100644 --- a/mxqd.c +++ b/mxqd.c @@ -1390,6 +1390,8 @@ unsigned long start_job(struct mxq_group_list *glist) job->host_pid = getpid(); mx_log_debug("starting reaper process."); + mx_funlock(server->flock); + server->flock = NULL; mx_mysql_finish(&server->mysql); res = reaper_process(server, glist, job);