Skip to content

Commit

Permalink
mxqd: Kill jobs over memory at most every 60 seconds
Browse files Browse the repository at this point in the history
this resolves mariux64#16
  • Loading branch information
mariux committed Oct 23, 2015
1 parent bdd2d5a commit 924db86
Showing 1 changed file with 62 additions and 0 deletions.
62 changes: 62 additions & 0 deletions mxqd.c
Original file line number Diff line number Diff line change
Expand Up @@ -1574,6 +1574,66 @@ int killall_over_time(struct mxq_server *server)
return 0;
}

/*
 * killall_over_memory() - SIGKILL the process group of every job whose
 * process tree uses more resident memory than its group requested.
 *
 * Walks server->users -> groups -> jobs, looks up each job's host_pid in a
 * freshly read process tree and compares the summed RSS (converted from
 * pages to MiB) against group.job_memory.
 * NOTE(review): this assumes job_memory is stored in MiB and sum_rss in
 * pages - confirm against the struct definitions.
 *
 * Returns 0 when nothing had to be done or the scan completed, and the
 * negative result of mx_proc_tree() when the process tree could not be read.
 */
int killall_over_memory(struct mxq_server *server)
{
    struct mxq_user_list  *user;
    struct mxq_group_list *group;
    struct mxq_job_list   *job;
    struct mx_proc_tree   *pt = NULL;
    struct mx_proc_info   *pinfo;
    long pagesize;
    pid_t pid;
    unsigned long long int memory;
    int res;

    assert(server);

    if (!server->jobs_running)
        return 0;

    /* Rate limit: run the (expensive) scan at most once per 60 seconds.
     * The macro is expected to return 0 from this function when called
     * more often than that. */
    mx_within_rate_limit_or_return(1*60, 0);

    /* sysconf() signals failure with -1, not 0, so reject every
     * non-positive value before it is used as a multiplier. */
    pagesize = sysconf(_SC_PAGESIZE);
    if (pagesize <= 0) {
        mx_log_warning("killall_over_memory(): Can't get _SC_PAGESIZE. Assuming 4096.");
        pagesize = 4096;
    }

    res = mx_proc_tree(&pt);
    if (res < 0) {
        mx_log_err("killall_over_memory(): Reading process tree failed: %m");
        return res;
    }

    for (user = server->users; user; user = user->next) {
        for (group = user->groups; group; group = group->next) {
            for (job = group->jobs; job; job = job->next) {
                pid = job->job.host_pid;

                pinfo = mx_proc_tree_proc_info(pt, pid);
                if (!pinfo) {
                    /* pid_t is a signed int type; print it as such. */
                    mx_log_warning("killall_over_memory(): Can't find process with pid %d in process tree", (int)pid);
                    continue;
                }

                /* pages -> bytes -> MiB, computed in unsigned long long so
                 * the intermediate product cannot overflow a narrower type. */
                memory = (unsigned long long)pinfo->sum_rss
                       * (unsigned long long)pagesize / 1024 / 1024;

                if (memory <= group->group.job_memory)
                    continue;

                mx_log_info("killall_over_memory(): used(%llu) > requested(%llu): Sending signal=KILL to job=%s(%d):%lu:%lu pgrp=%d",
                        memory, (unsigned long long)group->group.job_memory,
                        group->group.user_name, group->group.user_uid, group->group.group_id, job->job.job_id, pid);

                /* A negative pid addresses the whole process group. */
                if (kill(-pid, SIGKILL) < 0)
                    mx_log_warning("killall_over_memory(): Sending signal=KILL to pgrp=%d failed: %m", (int)pid);
            }
        }
    }

    mx_proc_tree_free(&pt);
    return 0;
}

int killallcancelled(struct mxq_server *server, int sig, unsigned int pgrp)
{
struct mxq_user_list *user;
Expand Down Expand Up @@ -1867,6 +1927,7 @@ int main(int argc, char *argv[])
killallcancelled(&server, SIGTERM, 0);
killallcancelled(&server, SIGINT, 0);
killall_over_time(&server);
killall_over_memory(&server);

if (!server.group_cnt) {
assert(!server.jobs_running);
Expand Down Expand Up @@ -1914,6 +1975,7 @@ int main(int argc, char *argv[])
killallcancelled(&server, SIGTERM, 0);
killallcancelled(&server, SIGINT, 0);
killall_over_time(&server);
killall_over_memory(&server);

mx_log_info("jobs_running=%lu global_sigint_cnt=%d global_sigterm_cnt=%d : Exiting. Wating for jobs to finish. Sleeping for a while.",
server.jobs_running, global_sigint_cnt, global_sigterm_cnt);
Expand Down

0 comments on commit 924db86

Please sign in to comment.