Skip to content

Commit

Permalink
Merge branch 'issues/issue16'
Browse files Browse the repository at this point in the history
* issues/issue16:
  mxqd: Send TERM to process when over memory instead of KILL to pgrp
  mxqd: Store and use max_sumrss
  mxq_job: Minor cleanup
  • Loading branch information
mariux committed Oct 26, 2015
2 parents 05bd9d9 + df2c998 commit 0bf7518
Show file tree
Hide file tree
Showing 10 changed files with 67 additions and 43 deletions.
4 changes: 3 additions & 1 deletion mxq_group.c
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
#include "mx_util.h"
#include "mx_mysql.h"

#define GROUP_FIELDS_CNT 30
#define GROUP_FIELDS_CNT 31
#define GROUP_FIELDS \
" group_id," \
" group_name," \
Expand All @@ -37,6 +37,7 @@
" group_jobs_unknown," \
" group_jobs_restarted," \
" group_slots_running," \
" stats_max_sumrss," \
" stats_max_maxrss," \
" stats_max_utime_sec," \
" stats_max_stime_sec," \
Expand Down Expand Up @@ -85,6 +86,7 @@ static int bind_result_group_fields(struct mx_mysql_bind *result, struct mxq_gro

res += mx_mysql_bind_var(result, idx++, uint64, &(g->group_slots_running));

res += mx_mysql_bind_var(result, idx++, uint64, &(g->stats_max_sumrss));
res += mx_mysql_bind_var(result, idx++, uint64, &(g->stats_max_maxrss));
res += mx_mysql_bind_var(result, idx++, int64, &(g->stats_max_utime.tv_sec));
res += mx_mysql_bind_var(result, idx++, int64, &(g->stats_max_stime.tv_sec));
Expand Down
1 change: 1 addition & 0 deletions mxq_group.h
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@ struct mxq_group {

uint64_t group_slots_running;

uint64_t stats_max_sumrss;
uint64_t stats_max_maxrss;

struct timeval stats_max_utime;
Expand Down
62 changes: 28 additions & 34 deletions mxq_job.c
Original file line number Diff line number Diff line change
Expand Up @@ -16,44 +16,39 @@
#include "mxq_group.h"
#include "mxq_job.h"

#define JOB_FIELDS_CNT 35
#define JOB_FIELDS_CNT 36
#define JOB_FIELDS \
" job_id, " \
" job_status, " \
" job_flags, " \
" job_priority, " \
" group_id, " \
\
" job_workdir, " \
" job_argc, " \
" job_argv, " \
" job_stdout, " \
" job_stderr, " \
\
" job_umask, " \
" host_submit, " \
" host_id, " \
" server_id, " \
" host_hostname, " \
\
" host_pid, " \
" host_slots, " \
" UNIX_TIMESTAMP(date_submit) as date_submit, " \
" UNIX_TIMESTAMP(date_start) as date_start, " \
" UNIX_TIMESTAMP(date_end) as date_end, " \
\
" stats_max_sumrss, " \
" stats_status, " \
" stats_utime_sec, " \
" stats_utime_usec, " \
" stats_stime_sec, " \
" stats_stime_usec, " \
\
" stats_real_sec, " \
" stats_real_usec, " \
" stats_maxrss, " \
" stats_minflt, " \
" stats_majflt, " \
\
" stats_nswap, " \
" stats_inblock, " \
" stats_oublock, " \
Expand All @@ -73,37 +68,32 @@ static int bind_result_job_fields(struct mx_mysql_bind *result, struct mxq_job *
res += mx_mysql_bind_var(result, idx++, uint64, &(j->job_flags));
res += mx_mysql_bind_var(result, idx++, uint16, &(j->job_priority));
res += mx_mysql_bind_var(result, idx++, uint64, &(j->group_id));

res += mx_mysql_bind_var(result, idx++, string, &(j->job_workdir));
res += mx_mysql_bind_var(result, idx++, uint16, &(j->job_argc));
res += mx_mysql_bind_var(result, idx++, string, &(j->job_argv_str));
res += mx_mysql_bind_var(result, idx++, string, &(j->job_stdout));
res += mx_mysql_bind_var(result, idx++, string, &(j->job_stderr));

res += mx_mysql_bind_var(result, idx++, uint32, &(j->job_umask));
res += mx_mysql_bind_var(result, idx++, string, &(j->host_submit));
res += mx_mysql_bind_var(result, idx++, string, &(j->host_id));
res += mx_mysql_bind_var(result, idx++, string, &(j->server_id));
res += mx_mysql_bind_var(result, idx++, string, &(j->host_hostname));

res += mx_mysql_bind_var(result, idx++, uint32, &(j->host_pid));
res += mx_mysql_bind_var(result, idx++, uint32, &(j->host_slots));
res += mx_mysql_bind_var(result, idx++, int64, &(j->date_submit));
res += mx_mysql_bind_var(result, idx++, int64, &(j->date_start));
res += mx_mysql_bind_var(result, idx++, int64, &(j->date_end));

res += mx_mysql_bind_var(result, idx++, uint64, &(j->stats_max_sumrss));
res += mx_mysql_bind_var(result, idx++, int32, &(j->stats_status));
res += mx_mysql_bind_var(result, idx++, int64, &(j->stats_rusage.ru_utime.tv_sec));
res += mx_mysql_bind_var(result, idx++, int64, &(j->stats_rusage.ru_utime.tv_usec));
res += mx_mysql_bind_var(result, idx++, int64, &(j->stats_rusage.ru_stime.tv_sec));
res += mx_mysql_bind_var(result, idx++, int64, &(j->stats_rusage.ru_stime.tv_usec));

res += mx_mysql_bind_var(result, idx++, int64, &(j->stats_realtime.tv_sec));
res += mx_mysql_bind_var(result, idx++, int64, &(j->stats_realtime.tv_usec));
res += mx_mysql_bind_var(result, idx++, int64, &(j->stats_rusage.ru_maxrss));
res += mx_mysql_bind_var(result, idx++, int64, &(j->stats_rusage.ru_minflt));
res += mx_mysql_bind_var(result, idx++, int64, &(j->stats_rusage.ru_majflt));

res += mx_mysql_bind_var(result, idx++, int64, &(j->stats_rusage.ru_nswap));
res += mx_mysql_bind_var(result, idx++, int64, &(j->stats_rusage.ru_inblock));
res += mx_mysql_bind_var(result, idx++, int64, &(j->stats_rusage.ru_oublock));
Expand Down Expand Up @@ -457,6 +447,7 @@ int mxq_set_job_status_running(struct mx_mysql *mysql, struct mxq_job *job)
int mxq_set_job_status_exited(struct mx_mysql *mysql, struct mxq_job *job)
{
int res;
int idx;
uint16_t newstatus;
struct mx_mysql_bind param = {0};

Expand Down Expand Up @@ -485,6 +476,7 @@ int mxq_set_job_status_exited(struct mx_mysql *mysql, struct mxq_job *job)
"UPDATE mxq_job SET"
" job_status = ?,"
" date_end = NULL,"
" stats_max_sumrss = ?, "
" stats_status = ?, "
" stats_utime_sec = ?, "
" stats_utime_usec = ?, "
Expand All @@ -506,30 +498,32 @@ int mxq_set_job_status_exited(struct mx_mysql *mysql, struct mxq_job *job)
" AND server_id = ?"
" AND host_pid = ?";

res = mx_mysql_bind_init_param(&param, 20);
res = mx_mysql_bind_init_param(&param, 21);
assert(res == 0);

idx = 0;
res = 0;
res += mx_mysql_bind_var(&param, 0, uint16, &(newstatus));
res += mx_mysql_bind_var(&param, 1, int32, &(job->stats_status));
res += mx_mysql_bind_var(&param, 2, int64, &(job->stats_rusage.ru_utime.tv_sec));
res += mx_mysql_bind_var(&param, 3, int64, &(job->stats_rusage.ru_utime.tv_usec));
res += mx_mysql_bind_var(&param, 4, int64, &(job->stats_rusage.ru_stime.tv_sec));
res += mx_mysql_bind_var(&param, 5, int64, &(job->stats_rusage.ru_stime.tv_usec));
res += mx_mysql_bind_var(&param, 6, int64, &(job->stats_realtime.tv_sec));
res += mx_mysql_bind_var(&param, 7, int64, &(job->stats_realtime.tv_usec));
res += mx_mysql_bind_var(&param, 8, int64, &(job->stats_rusage.ru_maxrss));
res += mx_mysql_bind_var(&param, 9, int64, &(job->stats_rusage.ru_minflt));
res += mx_mysql_bind_var(&param, 10, int64, &(job->stats_rusage.ru_majflt));
res += mx_mysql_bind_var(&param, 11, int64, &(job->stats_rusage.ru_nswap));
res += mx_mysql_bind_var(&param, 12, int64, &(job->stats_rusage.ru_inblock));
res += mx_mysql_bind_var(&param, 13, int64, &(job->stats_rusage.ru_oublock));
res += mx_mysql_bind_var(&param, 14, int64, &(job->stats_rusage.ru_nvcsw));
res += mx_mysql_bind_var(&param, 15, int64, &(job->stats_rusage.ru_nivcsw));
res += mx_mysql_bind_var(&param, 16, uint64, &(job->job_id));
res += mx_mysql_bind_var(&param, 17, string, &(job->host_hostname));
res += mx_mysql_bind_var(&param, 18, string, &(job->server_id));
res += mx_mysql_bind_var(&param, 19, uint32, &(job->host_pid));
res += mx_mysql_bind_var(&param, idx++, uint16, &(newstatus));
res += mx_mysql_bind_var(&param, idx++, uint64, &(job->stats_max_sumrss));
res += mx_mysql_bind_var(&param, idx++, int32, &(job->stats_status));
res += mx_mysql_bind_var(&param, idx++, int64, &(job->stats_rusage.ru_utime.tv_sec));
res += mx_mysql_bind_var(&param, idx++, int64, &(job->stats_rusage.ru_utime.tv_usec));
res += mx_mysql_bind_var(&param, idx++, int64, &(job->stats_rusage.ru_stime.tv_sec));
res += mx_mysql_bind_var(&param, idx++, int64, &(job->stats_rusage.ru_stime.tv_usec));
res += mx_mysql_bind_var(&param, idx++, int64, &(job->stats_realtime.tv_sec));
res += mx_mysql_bind_var(&param, idx++, int64, &(job->stats_realtime.tv_usec));
res += mx_mysql_bind_var(&param, idx++, int64, &(job->stats_rusage.ru_maxrss));
res += mx_mysql_bind_var(&param, idx++, int64, &(job->stats_rusage.ru_minflt));
res += mx_mysql_bind_var(&param, idx++, int64, &(job->stats_rusage.ru_majflt));
res += mx_mysql_bind_var(&param, idx++, int64, &(job->stats_rusage.ru_nswap));
res += mx_mysql_bind_var(&param, idx++, int64, &(job->stats_rusage.ru_inblock));
res += mx_mysql_bind_var(&param, idx++, int64, &(job->stats_rusage.ru_oublock));
res += mx_mysql_bind_var(&param, idx++, int64, &(job->stats_rusage.ru_nvcsw));
res += mx_mysql_bind_var(&param, idx++, int64, &(job->stats_rusage.ru_nivcsw));
res += mx_mysql_bind_var(&param, idx++, uint64, &(job->job_id));
res += mx_mysql_bind_var(&param, idx++, string, &(job->host_hostname));
res += mx_mysql_bind_var(&param, idx++, string, &(job->server_id));
res += mx_mysql_bind_var(&param, idx++, uint32, &(job->host_pid));
assert(res == 0);

res = mx_mysql_do_statement_noresult_retry_on_fail(mysql, query, &param);
Expand Down
2 changes: 2 additions & 0 deletions mxq_job.h
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,8 @@ struct mxq_job {

struct timeval stats_starttime;

uint64_t stats_max_sumrss;

int32_t stats_status;
struct timeval stats_realtime;
struct rusage stats_rusage;
Expand Down
12 changes: 6 additions & 6 deletions mxqd.c
Original file line number Diff line number Diff line change
Expand Up @@ -1638,19 +1638,19 @@ int killall_over_memory(struct mxq_server *server)
continue;
}

memory = pinfo->sum_rss * pagesize / 1024 / 1024;
memory = pinfo->sum_rss * pagesize / 1024;

if (job->max_sum_rss < memory)
job->max_sum_rss = memory;

if (memory <= group->group.job_memory)
if (memory/1024 <= group->group.job_memory)
continue;

mx_log_info("killall_over_memory(): used(%llu) > requested(%llu): Sending signal=KILL to job=%s(%d):%lu:%lu pgrp=%d",
memory, group->group.job_memory,
mx_log_info("killall_over_memory(): used(%lluMiB) > requested(%lluMiB): Sending signal=TERM to job=%s(%d):%lu:%lu pid=%d",
memory/1024, group->group.job_memory,
group->group.user_name, group->group.user_uid, group->group.group_id, job->job.job_id, pid);

kill(-pid, SIGKILL);
kill(pid, SIGTERM);
}
}
}
Expand Down Expand Up @@ -1762,7 +1762,7 @@ int catchall(struct mxq_server *server) {
g = &job->group->group;

timersub(&now, &j->stats_starttime, &j->stats_realtime);

j->stats_max_sumrss = job->max_sum_rss;
j->stats_status = status;
j->stats_rusage = rusage;

Expand Down
8 changes: 6 additions & 2 deletions mxqdump.c
Original file line number Diff line number Diff line change
Expand Up @@ -128,6 +128,7 @@ static int print_group(struct mxq_group *g)
" max_utime=%lu"
" max_real=%lu"
" max_memory=%lukiB"
" max_rss=%lukiB"
" wait_sec=%lu"
" run_sec=%lu"
" idle_sec=%lu"
Expand All @@ -150,10 +151,11 @@ static int print_group(struct mxq_group *g)
g->job_threads,
g->job_memory*1024,
g->job_time*60,
(100UL*(uint64_t)g->stats_max_maxrss/1024UL/g->job_memory),
(100UL*(uint64_t)g->stats_max_sumrss/1024UL/g->job_memory),
(100UL*(uint64_t)g->stats_max_real.tv_sec/60UL/g->job_time),
g->stats_max_utime.tv_sec,
g->stats_max_real.tv_sec,
g->stats_max_sumrss,
g->stats_max_maxrss,
g->stats_wait_sec,
g->stats_run_sec,
Expand Down Expand Up @@ -219,6 +221,7 @@ static int print_job(struct mxq_group *g, struct mxq_job *j)
" runtime_requested=%us"
" time_load=%lu%%"
" memory_requested=%lukiB"
" max_memory=%lukiB"
" max_rss=%lukiB"
" memory_load=%lu%%"
" threads=%d"
Expand All @@ -244,8 +247,9 @@ static int print_job(struct mxq_group *g, struct mxq_job *j)
g->job_time*60,
(100UL*(run_sec)/60UL/g->job_time),
g->job_memory*1024,
j->stats_max_sumrss,
j->stats_rusage.ru_maxrss,
(100UL*j->stats_rusage.ru_maxrss/1024UL/g->job_memory),
(100UL*j->stats_max_sumrss/1024UL/g->job_memory),
g->job_threads,
j->host_slots,
mxq_job_status_to_name(j->job_status),
Expand Down
11 changes: 11 additions & 0 deletions mysql/alter_tables_0.17.0.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
-- Schema migration: add the stats_max_sumrss column to mxq_group and mxq_job.
-- The column is NOT NULL with DEFAULT 0, so rows that already exist receive 0.

-- mxq_group: the new column is placed directly after group_date_end.
ALTER TABLE mxq_group
    ADD COLUMN stats_max_sumrss INT8 UNSIGNED NOT NULL DEFAULT 0
    AFTER group_date_end;

-- mxq_job: the new column is placed directly after job_id_first.
ALTER TABLE mxq_job
    ADD COLUMN stats_max_sumrss INT8 UNSIGNED NOT NULL DEFAULT 0
    AFTER job_id_first;
4 changes: 4 additions & 0 deletions mysql/create_tables.sql
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,8 @@ CREATE TABLE IF NOT EXISTS mxq_group (

group_date_end TIMESTAMP NOT NULL DEFAULT 0,

stats_max_sumrss INT8 UNSIGNED NOT NULL DEFAULT 0,

stats_max_maxrss INT8 UNSIGNED NOT NULL DEFAULT 0,
stats_max_utime_sec INT8 UNSIGNED NOT NULL DEFAULT 0,
stats_max_stime_sec INT8 UNSIGNED NOT NULL DEFAULT 0,
Expand Down Expand Up @@ -94,6 +96,8 @@ CREATE TABLE IF NOT EXISTS mxq_job (
job_id_old INT8 UNSIGNED NULL DEFAULT NULL,
job_id_first INT8 UNSIGNED NULL DEFAULT NULL,

stats_max_sumrss INT8 UNSIGNED NOT NULL DEFAULT 0,

stats_status INT4 UNSIGNED NOT NULL DEFAULT 0,

stats_utime_sec INT8 UNSIGNED NOT NULL DEFAULT 0,
Expand Down
3 changes: 3 additions & 0 deletions mysql/create_trigger.sql
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,7 @@ CREATE TRIGGER mxq_update_job BEFORE UPDATE ON mxq_job
group_slots_running=group_slots_running-NEW.host_slots,
group_jobs_running=group_jobs_running-1,
group_jobs_failed=group_jobs_failed+1,
stats_max_sumrss=GREATEST(stats_max_sumrss, NEW.stats_max_sumrss),
stats_max_maxrss=GREATEST(stats_max_maxrss, NEW.stats_maxrss),
stats_max_utime_sec=GREATEST(stats_max_utime_sec, NEW.stats_utime_sec),
stats_max_stime_sec=GREATEST(stats_max_stime_sec, NEW.stats_stime_sec),
Expand Down Expand Up @@ -99,6 +100,7 @@ CREATE TRIGGER mxq_update_job BEFORE UPDATE ON mxq_job
group_slots_running=group_slots_running-NEW.host_slots,
group_jobs_running=group_jobs_running-1,
group_jobs_finished=group_jobs_finished+1,
stats_max_sumrss=GREATEST(stats_max_sumrss, NEW.stats_max_sumrss),
stats_max_maxrss=GREATEST(stats_max_maxrss, NEW.stats_maxrss),
stats_max_utime_sec=GREATEST(stats_max_utime_sec, NEW.stats_utime_sec),
stats_max_stime_sec=GREATEST(stats_max_stime_sec, NEW.stats_stime_sec),
Expand All @@ -113,6 +115,7 @@ CREATE TRIGGER mxq_update_job BEFORE UPDATE ON mxq_job
WHERE group_id=NEW.group_id;
ELSEIF NEW.job_status NOT IN (399, 755, 989, 990) THEN
UPDATE mxq_group SET
stats_max_sumrss=GREATEST(stats_max_sumrss, NEW.stats_max_sumrss),
stats_max_maxrss=GREATEST(stats_max_maxrss, NEW.stats_maxrss),
stats_max_utime_sec=GREATEST(stats_max_utime_sec, NEW.stats_utime_sec),
stats_max_stime_sec=GREATEST(stats_max_stime_sec, NEW.stats_stime_sec),
Expand Down
3 changes: 3 additions & 0 deletions web/pages/mxq/mxq.in
Original file line number Diff line number Diff line change
Expand Up @@ -269,6 +269,7 @@ group_mtime : $o{group_mtime}
group_date_end : $o{group_date_end}
stats_max_sumrss : $o{stats_max_sumrss} kiB
stats_max_maxrss : $o{stats_max_maxrss}
stats_max_utime_sec : $o{stats_max_utime_sec}
stats_max_stime_sec : $o{stats_max_stime_sec}
Expand Down Expand Up @@ -364,6 +365,8 @@ job_id_new : $o{job_id_new}
job_id_old : $o{job_id_old}
job_id_first : $o{job_id_first}
stats_max_sumrss : $o{stats_max_sumrss} kiB
stats_status : $o{stats_status}
stats_utime_sec : $o{stats_utime_sec}
Expand Down

0 comments on commit 0bf7518

Please sign in to comment.