Skip to content

Verify job pids after restart #110

Merged
merged 3 commits into from
Sep 17, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
24 changes: 24 additions & 0 deletions mx_proc.c
Original file line number Diff line number Diff line change
Expand Up @@ -484,3 +484,27 @@ pid_t mx_proc_get_parent(pid_t pid) {
return -1;
return parent;
}

/*
 * Read the "comm" value of process `pid` from /proc/<pid>/comm into buf.
 *
 * buf MUST point to a char[16] array: at most 15 characters plus the
 * terminating NUL are stored, and a trailing newline is stripped.
 *
 * Returns buf on success, NULL if the file cannot be opened or read.
 */
char *mx_proc_get_comm(pid_t pid, char *buf) {
    char path[32]; // "/proc/2147483647/comm" = 22
    FILE *fp;
    char *line;

    snprintf(path, sizeof(path), "/proc/%d/comm", pid);

    fp = fopen(path, "r");
    if (!fp)
        return NULL;

    line = fgets(buf, 16, fp);
    fclose(fp);
    if (!line)
        return NULL;

    // fgets stops right after the first newline, so cutting at the first
    // '\n' (or at the existing terminator when there is none) removes it.
    buf[strcspn(buf, "\n")] = '\0';
    return buf;
}
1 change: 1 addition & 0 deletions mx_proc.h
Original file line number Diff line number Diff line change
Expand Up @@ -86,4 +86,5 @@ int mx_proc_tree_free(struct mx_proc_tree **tree);

struct mx_proc_info *mx_proc_tree_proc_info(struct mx_proc_tree *tree, pid_t pid);
pid_t mx_proc_get_parent(pid_t pid);
char *mx_proc_get_comm(pid_t pid, char *buf);
#endif
20 changes: 13 additions & 7 deletions mxqd.c
Original file line number Diff line number Diff line change
Expand Up @@ -1187,6 +1187,16 @@ int user_process(struct mxq_group_list *glist, struct mxq_job *job)

static const char REAPER_PNAME[] = "mxqd reaper";

/*
 * Check whether process `pid` carries the reaper's comm name.
 * Returns 1 when the comm value read for pid equals REAPER_PNAME,
 * 0 when it differs or cannot be read.
 */
static int is_reaper(pid_t pid) {
    char name[16]; // size required by mx_proc_get_comm()

    if (!mx_proc_get_comm(pid, name))
        return 0;
    return strcmp(name, REAPER_PNAME) == 0;
}

int reaper_process(struct mxq_server *server,struct mxq_group_list *glist, struct mxq_job *job) {
pid_t pid;
struct rusage rusage;
Expand Down Expand Up @@ -1380,6 +1390,8 @@ unsigned long start_job(struct mxq_group_list *glist)
job->host_pid = getpid();

mx_log_debug("starting reaper process.");
mx_funlock(server->flock);
server->flock = NULL;
mx_mysql_finish(&server->mysql);

res = reaper_process(server, glist, job);
Expand Down Expand Up @@ -2285,8 +2297,6 @@ static int lost_scan_one(struct mxq_server *server)

struct mxq_job *job;

int res;

for (ulist = server->users; ulist; ulist = ulist->next) {
for (glist = ulist->groups; glist; glist = glist->next) {
for (jlist = glist->jobs; jlist; jlist = jlist->next) {
Expand All @@ -2304,13 +2314,9 @@ static int lost_scan_one(struct mxq_server *server)
continue;
}

res = kill(job->host_pid, 0);
if (res >= 0)
if (is_reaper(job->host_pid))
continue;

if (errno != ESRCH)
return -errno;

if (!fspool_file_exists(server, job->job_id)) {
mx_log_warning("pid %u: process is gone. setting job status of job %lu to unknown.",
jlist->job.host_pid,
Expand Down