From 9fd65807f2271cc7a57607a0b7ba2bf9f947bd83 Mon Sep 17 00:00:00 2001 From: Marius Tolzmann Date: Mon, 24 Aug 2015 12:39:30 +0200 Subject: [PATCH] mxqd: kill_over_time send SIGXCPU, SIGTERM, SIGHUP and SIGKILL job time - >100.0% and <101.6% send main process SIGXCPU - >101.6% send process group SIGXCPU - >105.0% send main process SIGTERM and process group SIGHUP - >106.6% send process group SIGTERM - >110.0% send main process and process group SIGKILL --- mxqd.c | 58 +++++++++++++++++++++++++++++++++++++++++++++------------- 1 file changed, 45 insertions(+), 13 deletions(-) diff --git a/mxqd.c b/mxqd.c index 1c4a8a5..aed75f9 100644 --- a/mxqd.c +++ b/mxqd.c @@ -1349,7 +1349,7 @@ int killall(struct mxq_server *server, int sig, unsigned int pgrp) return 0; } -int killall_over_time(struct mxq_server *server, int sig, unsigned int pgrp) +int killall_over_time(struct mxq_server *server) { struct mxq_user_list *user; struct mxq_group_list *group; @@ -1369,17 +1369,50 @@ int killall_over_time(struct mxq_server *server, int sig, unsigned int pgrp) for (job=group->jobs; job; job=job->next) { timersub(&now, &job->job.stats_starttime, &delta); - if (delta.tv_sec <= group->group.job_time*63) + if (delta.tv_sec <= group->group.job_time*60) continue; pid = job->job.host_pid; - if (pgrp) - pid = -pid; - mx_log_info("killall_over_time(): Sending signal=%d to job=%s(%d):%lu:%lu %s=%d", - sig, - group->group.user_name, group->group.user_uid, group->group.group_id, job->job.job_id, - pgrp?"pgrp":"pid", pid); - kill(pid, sig); + + if (delta.tv_sec <= group->group.job_time*61) { + mx_log_debug("killall_over_time(): Sending signal=XCPU to job=%s(%d):%lu:%lu pid=%d", + group->group.user_name, group->group.user_uid, group->group.group_id, job->job.job_id, pid); + kill(pid, SIGXCPU); + continue; + } + + mx_log_debug("killall_over_time(): Sending signal=XCPU to job=%s(%d):%lu:%lu pgrp=%d", + group->group.user_name, group->group.user_uid, group->group.group_id, job->job.job_id, pid); + kill(-pid, SIGXCPU); + + if (delta.tv_sec <= group->group.job_time*63) + continue; + + mx_log_info("killall_over_time(): Sending signal=TERM to job=%s(%d):%lu:%lu pid=%d", + group->group.user_name, group->group.user_uid, group->group.group_id, job->job.job_id, pid); + kill(pid, SIGTERM); + + mx_log_info("killall_over_time(): Sending signal=HUP to job=%s(%d):%lu:%lu pgrp=%d", + group->group.user_name, group->group.user_uid, group->group.group_id, job->job.job_id, pid); + kill(-pid, SIGHUP); + + if (delta.tv_sec <= group->group.job_time*64) + continue; + + mx_log_info("killall_over_time(): Sending signal=TERM to job=%s(%d):%lu:%lu pgrp=%d", + group->group.user_name, group->group.user_uid, group->group.group_id, job->job.job_id, pid); + kill(-pid, SIGTERM); + + if (delta.tv_sec <= group->group.job_time*66) + continue; + + mx_log_info("killall_over_time(): Sending signal=KILL to job=%s(%d):%lu:%lu pid=%d", + group->group.user_name, group->group.user_uid, group->group.group_id, job->job.job_id, pid); + kill(pid, SIGKILL); + + mx_log_info("killall_over_time(): Sending signal=KILL to job=%s(%d):%lu:%lu pgrp=%d", + group->group.user_name, group->group.user_uid, group->group.group_id, job->job.job_id, pid); + kill(-pid, SIGKILL); } } } @@ -1672,8 +1705,7 @@ int main(int argc, char *argv[]) killallcancelled(&server, SIGTERM, 0); killallcancelled(&server, SIGINT, 0); - killall_over_time(&server, SIGTERM, 0); - killall_over_time(&server, SIGINT, 0); + killall_over_time(&server); if (!server.group_cnt) { assert(!server.jobs_running); @@ -1720,8 +1752,8 @@ int main(int argc, char *argv[]) killallcancelled(&server, SIGTERM, 0); killallcancelled(&server, SIGINT, 0); - killall_over_time(&server, SIGTERM, 0); - killall_over_time(&server, SIGINT, 0); + killall_over_time(&server); + killall_over_time(&server); mx_log_info("jobs_running=%lu global_sigint_cnt=%d global_sigterm_cnt=%d : Exiting. Wating for jobs to finish. Sleeping for a while.", server.jobs_running, global_sigint_cnt, global_sigterm_cnt);