Skip to content

Commit

Permalink
Merge branch 'mxqkill'
Browse files Browse the repository at this point in the history
* mxqkill:
  mxq_job: Add new intermediate status KILLING(399)
  mxqkill: Add option -J to cancel single job
  mxq_job: Add new intermediate status CANCELLING(989)
  mxqkill: Fix usage typo
  mxq_job: Name temporary intermediate status 755 => UNKNOWN_PRE
  • Loading branch information
mariux committed Aug 13, 2015
2 parents e90f09f + 737278b commit 50b8e21
Show file tree
Hide file tree
Showing 5 changed files with 172 additions and 8 deletions.
6 changes: 6 additions & 0 deletions mxq_job.c
Original file line number Diff line number Diff line change
Expand Up @@ -130,12 +130,18 @@ char *mxq_job_status_to_name(uint64_t status)
return "stopped";
case MXQ_JOB_STATUS_EXIT:
return "exited";
case MXQ_JOB_STATUS_KILLING:
return "killing";
case MXQ_JOB_STATUS_KILLED:
return "killed";
case MXQ_JOB_STATUS_FAILED:
return "failed";
case MXQ_JOB_STATUS_UNKNOWN_PRE:
return "unknownpre";
case MXQ_JOB_STATUS_CANCELLED:
return "cancelled";
case MXQ_JOB_STATUS_CANCELLING:
return "cancelling";
case MXQ_JOB_STATUS_UNKNOWN:
return "unknown";
case MXQ_JOB_STATUS_FINISHED:
Expand Down
9 changes: 8 additions & 1 deletion mxq_job.h
Original file line number Diff line number Diff line change
Expand Up @@ -69,13 +69,20 @@ struct mxq_job {
/*
 * Job status codes (excerpt) followed by the job flag bits.
 *
 * KILLING (399) and CANCELLING (989) are intermediate states placed
 * numerically right below their final counterparts KILLED (400) and
 * CANCELLED (990).
 */
#define MXQ_JOB_STATUS_EXTRUNNING 300
#define MXQ_JOB_STATUS_STOPPED 350

/*
 * NOTE(review): MXQ_JOB_STATUS_EXIT is defined a second time further down
 * with the same value; presumably this occurrence is the pre-move line of
 * the change and only one definition should remain -- confirm.
 */
#define MXQ_JOB_STATUS_EXIT 1024
/* Intermediate status preceding KILLED. */
#define MXQ_JOB_STATUS_KILLING 399

#define MXQ_JOB_STATUS_KILLED 400
#define MXQ_JOB_STATUS_FAILED 750
/* Temporary intermediate status. */
#define MXQ_JOB_STATUS_UNKNOWN_PRE 755

/* Intermediate status preceding CANCELLED. */
#define MXQ_JOB_STATUS_CANCELLING 989

#define MXQ_JOB_STATUS_CANCELLED 990
#define MXQ_JOB_STATUS_UNKNOWN 999
#define MXQ_JOB_STATUS_FINISHED 1000

#define MXQ_JOB_STATUS_EXIT 1024

/* Job flag bits; each flag occupies its own bit so they can be OR-ed. */
#define MXQ_JOB_FLAGS_RESTART_ON_HOSTFAIL (1<<0)
#define MXQ_JOB_FLAGS_REQUEUE_ON_HOSTFAIL (1<<1)

Expand Down
152 changes: 150 additions & 2 deletions mxqkill.c
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,8 @@ static void print_usage(void)
"\n"
"options:\n"
"\n"
" -g | --group-id=GROUPID cancel/kill group <group-id>\n"
" -g | --group-id=GROUPID cancel/kill group <GROUPID>\n"
" -J | --job-id=JOBID cancel job <JOBID>\n"
" -u | --user=NAME|UID cancel group for user. (root only)\n"
"\n"
" -v | --verbose be more verbose\n"
Expand Down Expand Up @@ -159,6 +160,96 @@ static int update_job_status_cancelled_by_group(struct mx_mysql *mysql, struct m
return (int)num_rows;
}

/*
 * Move a single queued job from INQ to the intermediate CANCELLING status.
 *
 * The UPDATE only matches when job_id and user_uid both match (mxq_group is
 * joined in for that), the job is still in status INQ and it carries no
 * host assignment (empty host_hostname and server_id, host_pid 0).
 *
 * Returns the row count reported by mx_mysql_statement_execute() on
 * success, or a negative errno value on failure.
 */
static int update_job_status_cancelling_by_job_id_for_user(struct mx_mysql *mysql, uint64_t job_id, uint64_t user_uid)
{
    struct mx_mysql_stmt *stmt = NULL;
    unsigned long long num_rows = 0;
    int res;

    assert(job_id);

    /*
     * mx_mysql_statement_prepare() takes the connection and returns its own
     * statement handle, so no separate mx_mysql_statement_init() is done
     * here: a handle created beforehand would be overwritten (and leaked)
     * by this assignment.
     */
    stmt = mx_mysql_statement_prepare(mysql,
            "UPDATE mxq_job AS j"
            " LEFT JOIN mxq_group AS g"
            " ON j.group_id = g.group_id"
            " SET"
            " job_status = " status_str(MXQ_JOB_STATUS_CANCELLING)
            " WHERE job_id = ?"
            " AND user_uid = ?"
            " AND job_status = " status_str(MXQ_JOB_STATUS_INQ)
            " AND host_hostname = ''"
            " AND server_id = ''"
            " AND host_pid = 0"
            );
    /*
     * Failure is signalled through the returned pointer, not through res.
     * NOTE(review): assumes prepare returns NULL and sets errno on failure
     * (the "%m" in the log message relies on errno as well) -- confirm.
     */
    if (!stmt) {
        res = -errno; /* capture before logging can overwrite errno */
        mx_log_err("mx_mysql_statement_prepare(): %m");
        return res;
    }

    res  = mx_mysql_statement_param_bind(stmt, 0, uint64, &(job_id));
    res += mx_mysql_statement_param_bind(stmt, 1, uint64, &(user_uid));
    assert(res == 0);

    res = mx_mysql_statement_execute(stmt, &num_rows);

    if (res < 0)
        mx_log_err("mx_mysql_statement_execute(): %m");

    /* Always release the statement, whether execute succeeded or not. */
    mx_mysql_statement_close(&stmt);

    if (res < 0)
        return -(errno=-res);

    return (int)num_rows;
}

/*
 * Move a single job from the intermediate CANCELLING status to CANCELLED.
 *
 * The UPDATE only matches when the job is currently in status CANCELLING
 * and carries no host assignment (empty host_hostname and server_id,
 * host_pid 0). No user check is done here.
 *
 * Returns the row count reported by mx_mysql_statement_execute() on
 * success, or a negative errno value on failure.
 */
static int update_job_status_cancelled_by_job_id(struct mx_mysql *mysql, uint64_t job_id)
{
    struct mx_mysql_stmt *stmt = NULL;
    unsigned long long num_rows = 0;
    int res;

    assert(job_id);

    /*
     * mx_mysql_statement_prepare() takes the connection and returns its own
     * statement handle, so no separate mx_mysql_statement_init() is done
     * here: a handle created beforehand would be overwritten (and leaked)
     * by this assignment.
     */
    stmt = mx_mysql_statement_prepare(mysql,
            "UPDATE mxq_job SET"
            " job_status = " status_str(MXQ_JOB_STATUS_CANCELLED)
            " WHERE job_id = ?"
            " AND job_status = " status_str(MXQ_JOB_STATUS_CANCELLING)
            " AND host_hostname = ''"
            " AND server_id = ''"
            " AND host_pid = 0"
            );
    /*
     * Failure is signalled through the returned pointer, not through res.
     * NOTE(review): assumes prepare returns NULL and sets errno on failure
     * (the "%m" in the log message relies on errno as well) -- confirm.
     */
    if (!stmt) {
        res = -errno; /* capture before logging can overwrite errno */
        mx_log_err("mx_mysql_statement_prepare(): %m");
        return res;
    }

    res = mx_mysql_statement_param_bind(stmt, 0, uint64, &(job_id));
    assert(res == 0);

    res = mx_mysql_statement_execute(stmt, &num_rows);

    if (res < 0)
        mx_log_err("mx_mysql_statement_execute(): %m");

    /* Always release the statement, whether execute succeeded or not. */
    mx_mysql_statement_close(&stmt);

    if (res < 0)
        return -(errno=-res);

    return (int)num_rows;
}

int main(int argc, char *argv[])
{
Expand All @@ -171,6 +262,7 @@ int main(int argc, char *argv[])
int res;

uint64_t arg_group_id;
uint64_t arg_job_id;
char arg_debug;
uint64_t arg_uid;

Expand All @@ -189,6 +281,7 @@ int main(int argc, char *argv[])

MX_OPTION_REQUIRED_ARG("user", 'u'),
MX_OPTION_REQUIRED_ARG("group-id", 'g'),
MX_OPTION_REQUIRED_ARG("job-id", 'J'),

MX_OPTION_OPTIONAL_ARG("mysql-default-file", 'M'),
MX_OPTION_OPTIONAL_ARG("mysql-default-group", 'S'),
Expand All @@ -204,6 +297,7 @@ int main(int argc, char *argv[])
arg_mysql_default_file = MXQ_MYSQL_DEFAULT_FILE;

arg_group_id = 0;
arg_job_id = 0;
arg_debug = 0;
arg_uid = UINT64_UNSET;

Expand Down Expand Up @@ -278,6 +372,15 @@ int main(int argc, char *argv[])
}
break;

case 'J':
if (mx_strtou64(optctl.optarg, &arg_job_id) < 0 || !arg_job_id) {
if (!arg_job_id)
errno = ERANGE;
mx_log_err("Invalid argument for --job-id '%s': %m", optctl.optarg);
exit(1);
}
break;

case 'M':
arg_mysql_default_file = optctl.optarg;
break;
Expand All @@ -290,7 +393,7 @@ int main(int argc, char *argv[])

MX_GETOPT_FINISH(optctl, argc, argv);

if (!arg_group_id) {
if (!arg_group_id && !arg_job_id) {
print_usage();
exit(EX_USAGE);
}
Expand Down Expand Up @@ -328,6 +431,51 @@ int main(int argc, char *argv[])

mx_log_info("MySQL: Connection to database established.");

/*
 * Single-job cancel (-J): two-step update.
 *   step 1: INQ -> CANCELLING, restricted to the calling user
 *   step 2: CANCELLING -> CANCELLED
 * res1/res2 hold the matched row count of each step, or a negative errno.
 */
if (arg_job_id) {
    int res1, res2;

    res1 = update_job_status_cancelling_by_job_id_for_user(mysql, arg_job_id, passwd->pw_uid);
    res2 = update_job_status_cancelled_by_job_id(mysql, arg_job_id);

    mx_mysql_finish(&mysql);
    mx_log_info("MySQL: Connection to database closed.");

    /* -ENOENT is treated the same as "no row matched", for each step separately. */
    if (res1 == -ENOENT)
        res1 = 0;

    if (res2 == -ENOENT)
        res2 = 0;

    if (res1 < 0)
        mx_log_err("setting status of job %lu to CANCELLING failed: %s", arg_job_id, strerror(-res1));

    /* Report the error of the second step, not the first one. */
    if (res2 < 0)
        mx_log_err("setting status of job %lu to CANCELLED failed: %s", arg_job_id, strerror(-res2));

    /* Second step matched a row: the job is cancelled. */
    if (res2 > 0) {
        mx_log_notice("Job %lu cancelled!", arg_job_id);
        return 0;
    }

    /* First step succeeded but the job never reached CANCELLED. */
    if (res1 > 0) {
        mx_log_notice("Updated status of job %lu to CANCELLING.", arg_job_id);
        if (res2 == 0) {
            mx_log_warning("Updating status of job %lu to CANCELLED failed. Job vanished. Please retry.", arg_job_id);
            return 2;
        }
        return 1;
    }

    /* Neither step matched a row: no such queued job for this user. */
    if (res1 == 0 && res2 == 0) {
        mx_log_notice("No queued job with job_id=%lu for user %s(%d) found in q.", arg_job_id, passwd->pw_name, passwd->pw_uid);
        mx_log_warning("Killing a single job is not implemented yet.");
        mx_log_warning("See https://github.molgen.mpg.de/mariux64/mxq/issues/4 for more details.");
        return 0;
    }

    return 1;
}

if (arg_group_id) {
memset(&group, 0, sizeof(group));

Expand Down
10 changes: 5 additions & 5 deletions mysql/create_tables.sql
Original file line number Diff line number Diff line change
Expand Up @@ -200,7 +200,7 @@ CREATE TRIGGER mxq_update_job BEFORE UPDATE ON mxq_job
group_slots_running=group_slots_running-OLD.host_slots+NEW.host_slots,
group_mtime=NULL
WHERE group_id=NEW.group_id;
ELSEIF NEW.job_status IN (400, 750) AND OLD.job_status IN (150, 200, 250, 300, 350) THEN
ELSEIF NEW.job_status IN (400, 750) AND OLD.job_status IN (150, 200, 250, 300, 350, 399) THEN
UPDATE mxq_group SET
group_slots_running=group_slots_running-NEW.host_slots,
group_jobs_running=group_jobs_running-1,
Expand All @@ -214,13 +214,13 @@ CREATE TRIGGER mxq_update_job BEFORE UPDATE ON mxq_job
stats_total_real_sec=stats_total_real_sec+NEW.stats_real_sec,
group_mtime=NULL
WHERE group_id=NEW.group_id;
ELSEIF NEW.job_status = 990 AND OLD.job_status = 0 THEN
ELSEIF NEW.job_status = 990 AND OLD.job_status IN (0, 989) THEN
UPDATE mxq_group SET
group_jobs_inq=group_jobs_inq-1,
group_jobs_cancelled=group_jobs_cancelled+1,
group_mtime=NULL
WHERE group_id=NEW.group_id;
ELSEIF NEW.job_status = 999 AND OLD.job_status IN (150, 200, 250) THEN
ELSEIF NEW.job_status = 999 AND OLD.job_status IN (150, 200, 250, 399) THEN
UPDATE mxq_group SET
group_slots_running=group_slots_running-NEW.host_slots,
group_jobs_running=group_jobs_running-1,
Expand All @@ -233,7 +233,7 @@ CREATE TRIGGER mxq_update_job BEFORE UPDATE ON mxq_job
group_jobs_unknown=group_jobs_unknown+1,
group_mtime=NULL
WHERE group_id=NEW.group_id;
ELSEIF NEW.job_status = 1000 AND OLD.job_status IN (150, 200, 250, 300, 350) THEN
ELSEIF NEW.job_status = 1000 AND OLD.job_status IN (150, 200, 250, 300, 350, 399) THEN
UPDATE mxq_group SET
group_slots_running=group_slots_running-NEW.host_slots,
group_jobs_running=group_jobs_running-1,
Expand All @@ -250,7 +250,7 @@ CREATE TRIGGER mxq_update_job BEFORE UPDATE ON mxq_job
stats_total_real_sec_finished=stats_total_real_sec_finished+NEW.stats_real_sec,
group_mtime=NULL
WHERE group_id=NEW.group_id;
ELSEIF NEW.job_status != 990 AND NEW.job_status != 755 THEN
ELSEIF NEW.job_status NOT IN (399, 755, 989, 990) THEN
UPDATE mxq_group SET
stats_max_maxrss=GREATEST(stats_max_maxrss, NEW.stats_maxrss),
stats_max_utime_sec=GREATEST(stats_max_utime_sec, NEW.stats_utime_sec),
Expand Down
3 changes: 3 additions & 0 deletions web/pages/mxq/mxq.in
Original file line number Diff line number Diff line change
Expand Up @@ -82,8 +82,11 @@ sub db_init {
EXTRUNNING => 300,
STOPPED => 350,
EXIT => 1024,
KILLING => 399,
KILLED => 400,
FAILED => 750,
UNKNOWN_PRE => 755,
CANCELLING => 989,
CANCELLED => 990,
UNKNOWN => 999,
FINISHED => 1000,
Expand Down

0 comments on commit 50b8e21

Please sign in to comment.