Skip to content

Commit

Permalink
Merge pull request #110 from mariux64/verify-job-pids-after-restart
Browse files Browse the repository at this point in the history
Verify job pids after restart
  • Loading branch information
donald authored Sep 17, 2021
2 parents 16a3711 + 035059c commit d417de8
Show file tree
Hide file tree
Showing 3 changed files with 38 additions and 7 deletions.
24 changes: 24 additions & 0 deletions mx_proc.c
Original file line number Diff line number Diff line change
Expand Up @@ -484,3 +484,27 @@ pid_t mx_proc_get_parent(pid_t pid) {
return -1;
return parent;
}

/*
 * Fetch the "comm" value of process `pid` by reading /proc/<pid>/comm.
 *
 * buf MUST point to a char[16] array. At most 15 characters plus the
 * terminating NUL are stored in it; a trailing newline is stripped.
 *
 * Returns buf on success, or NULL if the file could not be opened or
 * nothing could be read from it.
 */
char *mx_proc_get_comm(pid_t pid, char *buf) {
    char path[32]; /* longest case "/proc/2147483647/comm" needs 22 bytes */
    FILE *fp;
    char *line;

    snprintf(path, sizeof path, "/proc/%d/comm", pid);

    fp = fopen(path, "r");
    if (!fp)
        return NULL;

    line = fgets(buf, 16, fp);
    fclose(fp);
    if (!line)
        return NULL;

    /* fgets stops after the first newline, so at most one exists and it is last */
    buf[strcspn(buf, "\n")] = '\0';
    return buf;
}
1 change: 1 addition & 0 deletions mx_proc.h
Original file line number Diff line number Diff line change
Expand Up @@ -86,4 +86,5 @@ int mx_proc_tree_free(struct mx_proc_tree **tree);

struct mx_proc_info *mx_proc_tree_proc_info(struct mx_proc_tree *tree, pid_t pid);
pid_t mx_proc_get_parent(pid_t pid);
/* Store the "comm" value of pid in buf (MUST be a char[16]); returns buf, or NULL on any error. */
char *mx_proc_get_comm(pid_t pid, char *buf);
#endif
20 changes: 13 additions & 7 deletions mxqd.c
Original file line number Diff line number Diff line change
Expand Up @@ -1187,6 +1187,16 @@ int user_process(struct mxq_group_list *glist, struct mxq_job *job)

static const char REAPER_PNAME[] = "mxqd reaper";

/*
 * Tell whether process `pid` is a reaper, judged by its "comm" value.
 *
 * Returns 1 when the comm value equals REAPER_PNAME, and 0 when it
 * differs or could not be read (mx_proc_get_comm() returned NULL).
 */
static int is_reaper(pid_t pid) {
    char name[16]; /* buffer size demanded by mx_proc_get_comm() */

    if (mx_proc_get_comm(pid, name) != NULL && strcmp(name, REAPER_PNAME) == 0)
        return 1;
    return 0;
}

int reaper_process(struct mxq_server *server,struct mxq_group_list *glist, struct mxq_job *job) {
pid_t pid;
struct rusage rusage;
Expand Down Expand Up @@ -1380,6 +1390,8 @@ unsigned long start_job(struct mxq_group_list *glist)
job->host_pid = getpid();

mx_log_debug("starting reaper process.");
mx_funlock(server->flock);
server->flock = NULL;
mx_mysql_finish(&server->mysql);

res = reaper_process(server, glist, job);
Expand Down Expand Up @@ -2285,8 +2297,6 @@ static int lost_scan_one(struct mxq_server *server)

struct mxq_job *job;

int res;

for (ulist = server->users; ulist; ulist = ulist->next) {
for (glist = ulist->groups; glist; glist = glist->next) {
for (jlist = glist->jobs; jlist; jlist = jlist->next) {
Expand All @@ -2304,13 +2314,9 @@ static int lost_scan_one(struct mxq_server *server)
continue;
}

res = kill(job->host_pid, 0);
if (res >= 0)
if (is_reaper(job->host_pid))
continue;

if (errno != ESRCH)
return -errno;

if (!fspool_file_exists(server, job->job_id)) {
mx_log_warning("pid %u: process is gone. setting job status of job %lu to unknown.",
jlist->job.host_pid,
Expand Down

0 comments on commit d417de8

Please sign in to comment.