From 6bdd79d47a68ef3b077886d049926995a6087c33 Mon Sep 17 00:00:00 2001 From: Donald Buczek Date: Sun, 22 Aug 2021 16:59:40 +0200 Subject: [PATCH 1/8] mxqd: Remove "received sigchld" log message There is no value in this message. Remove it. --- mxqd.c | 1 - 1 file changed, 1 deletion(-) diff --git a/mxqd.c b/mxqd.c index 3941912f..29b64043 100644 --- a/mxqd.c +++ b/mxqd.c @@ -2585,7 +2585,6 @@ static void process_signal(struct mxq_server *server,int sig,int extra) } break; case SIGCHLD: - mx_log_info("received sigchld"); break; default: mx_log_warning("received signal %d (unexpected!)",sig); From 4a0b7c9732b2b7bec93a79698a65ef93d8243208 Mon Sep 17 00:00:00 2001 From: Donald Buczek Date: Sun, 22 Aug 2021 16:59:40 +0200 Subject: [PATCH 2/8] mxqd: Remove job startup chatter --- mxqd.c | 16 ++-------------- 1 file changed, 2 insertions(+), 14 deletions(-) diff --git a/mxqd.c b/mxqd.c index 29b64043..a0c45ea4 100644 --- a/mxqd.c +++ b/mxqd.c @@ -1325,8 +1325,6 @@ unsigned long start_job(struct mxq_group_list *glist) if (!res) { return 0; } - mx_log_info(" job=%s(%d):%lu:%lu :: new job loaded.", - group->user_name, group->user_uid, group->group_id, job->job_id); if (group->job_tmpdir_size > 0) { mx_mysql_disconnect(server->mysql); @@ -1361,8 +1359,6 @@ unsigned long start_job(struct mxq_group_list *glist) mx_free_null(job->host_cpu_set_str); job->host_cpu_set_str = mx_cpuset_to_str(&job->host_cpu_set); - mx_log_info("job assigned cpus: [%s]", job->host_cpu_set_str); - mx_mysql_disconnect(server->mysql); pid = fork(); @@ -1374,14 +1370,6 @@ unsigned long start_job(struct mxq_group_list *glist) } else if (pid == 0) { job->host_pid = getpid(); - mx_log_info(" job=%s(%d):%lu:%lu host_pid=%d pgrp=%d :: new child process forked.", - group->user_name, - group->user_uid, - group->group_id, - job->job_id, - job->host_pid, - getpgrp()); - mx_log_debug("starting reaper process."); mx_mysql_finish(&server->mysql); @@ -1416,8 +1404,8 @@ unsigned long start_job(struct mxq_group_list 
*glist) if (res < 0) mx_log_err("start_job: failed to update daemon instance statistics: %m"); - mx_log_info(" job=%s(%d):%lu:%lu :: added running job to watch queue.", - group->user_name, group->user_uid, group->group_id, job->job_id); + mx_log_info(" job=%s(%d):%lu:%lu :: started. pid=%d", + group->user_name, group->user_uid, group->group_id, job->job_id, pid); return 1; } From f8c7531212962b59df69af0cfe6f2c50446dd9a0 Mon Sep 17 00:00:00 2001 From: Donald Buczek Date: Sun, 22 Aug 2021 16:59:40 +0200 Subject: [PATCH 3/8] mxqd: Fix "Main Loop freed N slots" message Count released slots in fspool_scan so that the "Main Loop freed N slots" message correctly shows the number of freed slots and not the number of finished jobs. Also shorten the message slots_returned=N :: Main Loop freed N slots. to just Main loop freed N slots. which pairs with the message Main loop started N slots. --- mxqd.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/mxqd.c b/mxqd.c index a0c45ea4..71a7472d 100644 --- a/mxqd.c +++ b/mxqd.c @@ -2022,6 +2022,7 @@ static int fspool_process_file(struct mxq_server *server,char *filename, uint64_ struct mxq_job_list *jlist; struct mxq_job *job; struct mxq_group *group; + int slots_returned = 0; in=fopen(filename,"r"); if (!in) { @@ -2088,12 +2089,12 @@ static int fspool_process_file(struct mxq_server *server,char *filename, uint64_ job->stats_status = status; job->stats_rusage = rusage; - job_has_finished(server, group, jlist); + slots_returned = job_has_finished(server, group, jlist); unlink(filename); res = server_update_daemon_statistics(server); if (res < 0) mx_log_err("recover: failed to update daemon instance statistics: %m"); - return(0); + return(slots_returned); } static int fspool_is_valid_name_parse(const char *name, unsigned long long int *job_id) { @@ -2127,6 +2128,7 @@ static int fspool_scan(struct mxq_server *server) { int res; unsigned long long int job_id; char *filename; + int slots_returned = 0; 
entries=scandir(server->finished_jobsdir,&namelist,&fspool_is_valid_name,&alphasort); @@ -2139,15 +2141,15 @@ static int fspool_scan(struct mxq_server *server) { mx_asprintf_forever(&filename,"%s/%s",server->finished_jobsdir,namelist[i]->d_name); fspool_is_valid_name_parse(namelist[i]->d_name,&job_id); res=fspool_process_file(server,filename,job_id); - if (res==0) { - cnt++; + if (res>0) { + slots_returned += res; } free(namelist[i]); free(filename); } free(namelist); - return cnt; + return slots_returned; } static int file_exists(char *name) { @@ -2697,7 +2699,7 @@ int main(int argc, char *argv[]) slots_returned += fspool_scan(server); if (slots_returned) - mx_log_info("slots_returned=%lu :: Main Loop freed %lu slots.", slots_returned, slots_returned); + mx_log_info("Main loop freed %lu slots.", slots_returned); group_cnt = load_running_groups(server); if (group_cnt) From caa8dce40021ee68aa1a9a998bd478145f23776b Mon Sep 17 00:00:00 2001 From: Donald Buczek Date: Sun, 22 Aug 2021 16:59:40 +0200 Subject: [PATCH 4/8] mxq_control: Remove "Remove orphaned ..." messages Remove the "Removing orphaned group" and "Removed orphaned user" messages. --- mxqd_control.c | 7 ------- 1 file changed, 7 deletions(-) diff --git a/mxqd_control.c b/mxqd_control.c index 138963c8..a06029f9 100644 --- a/mxqd_control.c +++ b/mxqd_control.c @@ -530,11 +530,6 @@ int server_remove_orphaned_groups(struct mxq_server *server) ulist->groups = gnext; } - mx_log_info("group=%s(%d):%lu : Removing orphaned group.", - group->user_name, - group->user_uid, - group->group_id); - ulist->group_cnt--; ulist->global_slots_running -= glist->global_slots_running; ulist->global_threads_running -= glist->global_threads_running; @@ -560,8 +555,6 @@ int server_remove_orphaned_groups(struct mxq_server *server) server->user_cnt--; mx_free_null(ulist); - - mx_log_info("Removed orphaned user. 
%lu users left.", server->user_cnt); } return cnt; } From 40258b95b206862f06c4de9157988827fa2ce639 Mon Sep 17 00:00:00 2001 From: Donald Buczek Date: Sun, 22 Aug 2021 16:59:40 +0200 Subject: [PATCH 5/8] Avoid sporadic "No matching job found" warning We currently get a "No matching job found - maybe another server was a bit faster. ;)" warning when we just started the last job of a group. The reason is that the INQ count going to zero is tracked in the database by the sql triggers but is not yet updated in the in-memory copy. Decrement group_jobs_inq after we loaded a job. --- mxqd.c | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/mxqd.c b/mxqd.c index 71a7472d..940bf306 100644 --- a/mxqd.c +++ b/mxqd.c @@ -1407,6 +1407,17 @@ unsigned long start_job(struct mxq_group_list *glist) mx_log_info(" job=%s(%d):%lu:%lu :: started. pid=%d", group->user_name, group->user_uid, group->group_id, job->job_id, pid); + /* The group counts in the database were updated by the sql triggers when + * we set the job from ASSIGNED to LOADED. We would pick that up in the + * next round of the main loop. Update the in-memory counts right now so + * that we don't try to start a new job when there are no INQ jobs left. + * This avoids a "No matching job found - maybe another server was a bit + * faster" warning when we started the last INQ jobs from a group. + */ + + group->group_jobs_inq--; + group->group_jobs_running++; + return 1; } From f7ba91801480e7c6628d0e9365981aa442d55aa4 Mon Sep 17 00:00:00 2001 From: Donald Buczek Date: Sun, 22 Aug 2021 12:52:24 +0200 Subject: [PATCH 6/8] mxqd: Give reaper thread fixed name Give reaper thread a fixed name "mxqd reaper". This can be used by a restarting mxqd to verify that a running job's pid in the database still identifies a reaper process and hasn't been reused after a system reboot or a pid wrap.
The checks can be implemented only after the rolling upgrade adding this commit is completed and all jobs running during the upgrade have finished. --- mxqd.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/mxqd.c b/mxqd.c index 3941912f..ce3ce642 100644 --- a/mxqd.c +++ b/mxqd.c @@ -1184,6 +1184,8 @@ int user_process(struct mxq_group_list *glist, struct mxq_job *job) return res; } +static const char REAPER_PNAME[] = "mxqd reaper"; + int reaper_process(struct mxq_server *server,struct mxq_group_list *glist, struct mxq_job *job) { pid_t pid; struct rusage rusage; @@ -1201,6 +1203,12 @@ int reaper_process(struct mxq_server *server,struct mxq_group_list *glist, struc group = &glist->group; + res = prctl(PR_SET_NAME, REAPER_PNAME, NULL, NULL, NULL); + if (res < 0) { + mx_log_err("reaper_process set name: %m"); + return res; + } + res = setsid(); if (res < 0) { mx_log_warning("reaper_process setsid: %m"); From 5bd7baff62a880225de613476531b89e01951e52 Mon Sep 17 00:00:00 2001 From: Donald Buczek Date: Mon, 23 Aug 2021 16:22:08 +0200 Subject: [PATCH 7/8] mxqsub: Remove unused defines --- mxqsub.c | 5 ----- 1 file changed, 5 deletions(-) diff --git a/mxqsub.c b/mxqsub.c index c1c8997e..595a98c7 100644 --- a/mxqsub.c +++ b/mxqsub.c @@ -36,11 +36,6 @@ #include "parser.tab.h" #include "mxq.h" -#define MXQ_TASK_JOB_FORCE_APPEND (1<<0) -#define MXQ_TASK_JOB_FORCE_NEW (1<<1) - -#define MXQ_JOB_STATUS_ACTIVE (1) - #define UINT64_UNSET (uint64_t)(-1) static void print_usage(void) From 907c92c91dfb566b0897e698830ce9f4023d3412 Mon Sep 17 00:00:00 2001 From: Donald Buczek Date: Tue, 24 Aug 2021 13:43:40 +0200 Subject: [PATCH 8/8] mxqd: Change severity of failing setsid to error --- mxqd.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mxqd.c b/mxqd.c index 3941912f..29dd38c7 100644 --- a/mxqd.c +++ b/mxqd.c @@ -1203,7 +1203,7 @@ int reaper_process(struct mxq_server *server,struct mxq_group_list *glist, struc res = setsid(); if (res < 0) { - 
mx_log_warning("reaper_process setsid: %m"); + mx_log_err("reaper_process setsid: %m"); return res; }