diff --git a/mx_proc.c b/mx_proc.c
index 129d7216..4edc2b2a 100644
--- a/mx_proc.c
+++ b/mx_proc.c
@@ -484,3 +484,27 @@ pid_t mx_proc_get_parent(pid_t pid) {
         return -1;
     return parent;
 }
+
+/*
+ * Read the "comm" value of process `pid` from /proc/<pid>/comm into buf.
+ * buf MUST point to a char[16] array; a trailing newline, if one was read, is replaced by NUL.
+ * Returns NULL if the file cannot be opened or nothing can be read, otherwise a pointer to buf.
+ */
+char *mx_proc_get_comm(pid_t pid, char *buf) {
+    char commfilename[32]; // longest case "/proc/2147483647/comm" needs 22 bytes incl. NUL
+    FILE *commfile;
+    char *p;
+
+    snprintf(commfilename, sizeof(commfilename), "/proc/%d/comm", pid);
+    commfile = fopen(commfilename, "r");
+    if (commfile == NULL)
+        return NULL;
+    p = fgets(buf, 16, commfile);
+    fclose(commfile);
+    if (p == NULL)
+        return NULL;
+    p = strrchr(buf, '\n');
+    if (p != NULL)
+        *p = '\0';
+    return buf;
+}
diff --git a/mx_proc.h b/mx_proc.h
index 1fe07f6f..d4abcb35 100644
--- a/mx_proc.h
+++ b/mx_proc.h
@@ -86,4 +86,5 @@ int mx_proc_tree_free(struct mx_proc_tree **tree);
 
 struct mx_proc_info *mx_proc_tree_proc_info(struct mx_proc_tree *tree, pid_t pid);
 pid_t mx_proc_get_parent(pid_t pid);
+char *mx_proc_get_comm(pid_t pid, char *buf);
 #endif
diff --git a/mxqd.c b/mxqd.c
index e6b78f54..b960426a 100644
--- a/mxqd.c
+++ b/mxqd.c
@@ -1187,6 +1187,16 @@ int user_process(struct mxq_group_list *glist, struct mxq_job *job)
 
 static const char REAPER_PNAME[] = "mxqd reaper";
 
+static int is_reaper(pid_t pid) { /* 1 if the comm of pid equals REAPER_PNAME, otherwise 0 */
+    char comm[16]; /* size required by mx_proc_get_comm() */
+    if (mx_proc_get_comm(pid, comm) == NULL) /* comm could not be read */
+        return 0;
+    if (strcmp(comm, REAPER_PNAME) == 0)
+        return 1;
+    else
+        return 0;
+}
+
 int reaper_process(struct mxq_server *server,struct mxq_group_list *glist, struct mxq_job *job) {
     pid_t pid;
     struct rusage rusage;
@@ -1380,6 +1390,8 @@ unsigned long start_job(struct mxq_group_list *glist)
         job->host_pid = getpid();
 
         mx_log_debug("starting reaper process.");
+        mx_funlock(server->flock);
+        server->flock = NULL;
         mx_mysql_finish(&server->mysql);
 
         res = reaper_process(server, glist, job);
@@ -2285,8 +2297,6 @@ static int lost_scan_one(struct mxq_server
*server) struct mxq_job *job; - int res; - for (ulist = server->users; ulist; ulist = ulist->next) { for (glist = ulist->groups; glist; glist = glist->next) { for (jlist = glist->jobs; jlist; jlist = jlist->next) { @@ -2304,13 +2314,9 @@ static int lost_scan_one(struct mxq_server *server) continue; } - res = kill(job->host_pid, 0); - if (res >= 0) + if (is_reaper(job->host_pid)) continue; - if (errno != ESRCH) - return -errno; - if (!fspool_file_exists(server, job->job_id)) { mx_log_warning("pid %u: process is gone. setting job status of job %lu to unknown.", jlist->job.host_pid,