From 811fa1bd20396ff7c0a9f8cadb6af195f0b3c652 Mon Sep 17 00:00:00 2001 From: Marius Tolzmann Date: Thu, 13 Aug 2015 11:35:36 +0200 Subject: [PATCH 1/5] mxq_job: Name temporary intermediate status 755 => UNKNOWN_PRE --- mxq_job.c | 2 ++ mxq_job.h | 2 ++ web/pages/mxq/mxq.in | 1 + 3 files changed, 5 insertions(+) diff --git a/mxq_job.c b/mxq_job.c index 21ff955..a3f7cd0 100644 --- a/mxq_job.c +++ b/mxq_job.c @@ -134,6 +134,8 @@ char *mxq_job_status_to_name(uint64_t status) return "killed"; case MXQ_JOB_STATUS_FAILED: return "failed"; + case MXQ_JOB_STATUS_UNKNOWN_PRE: + return "unknownpre"; case MXQ_JOB_STATUS_CANCELLED: return "cancelled"; case MXQ_JOB_STATUS_UNKNOWN: diff --git a/mxq_job.h b/mxq_job.h index 32e9f32..fa2ff66 100644 --- a/mxq_job.h +++ b/mxq_job.h @@ -72,6 +72,8 @@ struct mxq_job { #define MXQ_JOB_STATUS_EXIT 1024 #define MXQ_JOB_STATUS_KILLED 400 #define MXQ_JOB_STATUS_FAILED 750 +#define MXQ_JOB_STATUS_UNKNOWN_PRE 755 + #define MXQ_JOB_STATUS_CANCELLED 990 #define MXQ_JOB_STATUS_UNKNOWN 999 #define MXQ_JOB_STATUS_FINISHED 1000 diff --git a/web/pages/mxq/mxq.in b/web/pages/mxq/mxq.in index f1aa1ed..c96f602 100755 --- a/web/pages/mxq/mxq.in +++ b/web/pages/mxq/mxq.in @@ -84,6 +84,7 @@ sub db_init { EXIT => 1024, KILLED => 400, FAILED => 750, + UNKNOWN_PRE => 755, CANCELLED => 990, UNKNOWN => 999, FINISHED => 1000, From 58fb8ce7ec0d77e7d7a2f280a61a4333b8c9e554 Mon Sep 17 00:00:00 2001 From: Marius Tolzmann Date: Thu, 13 Aug 2015 10:52:59 +0200 Subject: [PATCH 2/5] mxqkill: Fix usage typo --- mxqkill.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mxqkill.c b/mxqkill.c index 54c03a5..06074e0 100644 --- a/mxqkill.c +++ b/mxqkill.c @@ -48,7 +48,7 @@ static void print_usage(void) "\n" "options:\n" "\n" - " -g | --group-id=GROUPID cancel/kill group \n" + " -g | --group-id=GROUPID cancel/kill group \n" " -u | --user=NAME|UID cancel group for user. 
(root only)\n" "\n" " -v | --verbose be more verbose\n" From f055cb584a3162236defec9e9166c5616c67e250 Mon Sep 17 00:00:00 2001 From: Marius Tolzmann Date: Thu, 13 Aug 2015 11:37:06 +0200 Subject: [PATCH 3/5] mxq_job: Add new intermediate status CANCELLING(989) This state is needed to cancel a single job while joining mysql tables mxq_job and mxq_group to not activate triggers on mxq_group. valid status changes are: INQ -> CANCELLING -> CANCELLED INQ -> CANCELLED MySQL will block those requests to prevent loops and deadlocks. error would be: ERROR 1442 (HY000): Can't update table 'mxq_group' in stored function/trigger because it is already used by statement which invoked this stored function/trigger. --- mxq_job.c | 2 ++ mxq_job.h | 1 + mysql/create_tables.sql | 4 ++-- web/pages/mxq/mxq.in | 1 + 4 files changed, 6 insertions(+), 2 deletions(-) diff --git a/mxq_job.c b/mxq_job.c index a3f7cd0..e77f7e3 100644 --- a/mxq_job.c +++ b/mxq_job.c @@ -138,6 +138,8 @@ char *mxq_job_status_to_name(uint64_t status) return "unknownpre"; case MXQ_JOB_STATUS_CANCELLED: return "cancelled"; + case MXQ_JOB_STATUS_CANCELLING: + return "cancelling"; case MXQ_JOB_STATUS_UNKNOWN: return "unknown"; case MXQ_JOB_STATUS_FINISHED: diff --git a/mxq_job.h b/mxq_job.h index fa2ff66..b52cd43 100644 --- a/mxq_job.h +++ b/mxq_job.h @@ -74,6 +74,7 @@ struct mxq_job { #define MXQ_JOB_STATUS_FAILED 750 #define MXQ_JOB_STATUS_UNKNOWN_PRE 755 +#define MXQ_JOB_STATUS_CANCELLING 989 #define MXQ_JOB_STATUS_CANCELLED 990 #define MXQ_JOB_STATUS_UNKNOWN 999 #define MXQ_JOB_STATUS_FINISHED 1000 diff --git a/mysql/create_tables.sql b/mysql/create_tables.sql index 8b6b99a..c318497 100644 --- a/mysql/create_tables.sql +++ b/mysql/create_tables.sql @@ -214,7 +214,7 @@ CREATE TRIGGER mxq_update_job BEFORE UPDATE ON mxq_job stats_total_real_sec=stats_total_real_sec+NEW.stats_real_sec, group_mtime=NULL WHERE group_id=NEW.group_id; - ELSEIF NEW.job_status = 990 AND OLD.job_status = 0 THEN + ELSEIF NEW.job_status = 
990 AND OLD.job_status IN (0, 989) THEN UPDATE mxq_group SET group_jobs_inq=group_jobs_inq-1, group_jobs_cancelled=group_jobs_cancelled+1, @@ -250,7 +250,7 @@ CREATE TRIGGER mxq_update_job BEFORE UPDATE ON mxq_job stats_total_real_sec_finished=stats_total_real_sec_finished+NEW.stats_real_sec, group_mtime=NULL WHERE group_id=NEW.group_id; - ELSEIF NEW.job_status != 990 AND NEW.job_status != 755 THEN + ELSEIF NEW.job_status NOT IN (755, 989, 990) THEN UPDATE mxq_group SET stats_max_maxrss=GREATEST(stats_max_maxrss, NEW.stats_maxrss), stats_max_utime_sec=GREATEST(stats_max_utime_sec, NEW.stats_utime_sec), diff --git a/web/pages/mxq/mxq.in b/web/pages/mxq/mxq.in index c96f602..21a3bdc 100755 --- a/web/pages/mxq/mxq.in +++ b/web/pages/mxq/mxq.in @@ -85,6 +85,7 @@ sub db_init { KILLED => 400, FAILED => 750, UNKNOWN_PRE => 755, + CANCELLING => 989, CANCELLED => 990, UNKNOWN => 999, FINISHED => 1000, From 69ce84593589a14bc44015d7763878d9952753c7 Mon Sep 17 00:00:00 2001 From: Marius Tolzmann Date: Thu, 13 Aug 2015 11:01:03 +0200 Subject: [PATCH 4/5] mxqkill: Add option -J to cancel single job fixes part 1 of issue https://github.molgen.mpg.de/mariux64/mxq/issues/4 --- mxqkill.c | 150 +++++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 149 insertions(+), 1 deletion(-) diff --git a/mxqkill.c b/mxqkill.c index 06074e0..2c4eff3 100644 --- a/mxqkill.c +++ b/mxqkill.c @@ -49,6 +49,7 @@ static void print_usage(void) "options:\n" "\n" " -g | --group-id=GROUPID cancel/kill group \n" + " -J | --job-id=JOBID cancel job \n" " -u | --user=NAME|UID cancel group for user. 
(root only)\n" "\n" " -v | --verbose be more verbose\n" @@ -159,6 +160,96 @@ static int update_job_status_cancelled_by_group(struct mx_mysql *mysql, struct m return (int)num_rows; } +static int update_job_status_cancelling_by_job_id_for_user(struct mx_mysql *mysql, uint64_t job_id, uint64_t user_uid) +{ + struct mx_mysql_stmt *stmt = NULL; + unsigned long long num_rows = 0; + int res; + /* INQ -> CANCELLING for one job of user_uid with empty host/server fields; returns the row count from execute or a negative errno. */ + assert(job_id); + + /* mx_mysql_statement_prepare() hands back its own statement, so a prior + * mx_mysql_statement_init() result would just be overwritten and lost. */ + res = 0; + + stmt = mx_mysql_statement_prepare(mysql, + "UPDATE mxq_job AS j" + " LEFT JOIN mxq_group AS g" + " ON j.group_id = g.group_id" + " SET" + " job_status = " status_str(MXQ_JOB_STATUS_CANCELLING) + " WHERE job_id = ?" + " AND user_uid = ?" + " AND job_status = " status_str(MXQ_JOB_STATUS_INQ) + " AND host_hostname = ''" + " AND server_id = ''" + " AND host_pid = 0" + ); + if (!stmt) { /* NOTE(review): assumes prepare() returns NULL and sets errno on failure - confirm */ + res = -errno; /* save before logging */ + mx_log_err("mx_mysql_statement_prepare(): %m"); + return res; + } + + res += mx_mysql_statement_param_bind(stmt, 0, uint64, &(job_id)); + res += mx_mysql_statement_param_bind(stmt, 1, uint64, &(user_uid)); + assert(res == 0); + + res = mx_mysql_statement_execute(stmt, &num_rows); + + if (res < 0) + mx_log_err("mx_mysql_statement_execute(): %m"); + + mx_mysql_statement_close(&stmt); + + if (res < 0) + return -(errno=-res); + + return (int)num_rows; +} + +static int update_job_status_cancelled_by_job_id(struct mx_mysql *mysql, uint64_t job_id) +{ + struct mx_mysql_stmt *stmt = NULL; + unsigned long long num_rows = 0; + int res; + /* CANCELLING -> CANCELLED for one job with empty host/server fields; returns the row count from execute or a negative errno. */ + assert(job_id); + + /* mx_mysql_statement_prepare() hands back its own statement, so a prior + * mx_mysql_statement_init() result would just be overwritten and lost. */ + res = 0; + + stmt = mx_mysql_statement_prepare(mysql, + "UPDATE mxq_job SET" + " job_status = " status_str(MXQ_JOB_STATUS_CANCELLED) + " WHERE job_id = ?" 
+ " AND job_status = " status_str(MXQ_JOB_STATUS_CANCELLING) + " AND host_hostname = ''" + " AND server_id = ''" + " AND host_pid = 0" + ); + if (!stmt) { /* NOTE(review): assumes prepare() returns NULL and sets errno on failure - confirm */ + res = -errno; /* save before logging */ + mx_log_err("mx_mysql_statement_prepare(): %m"); + return res; + } + + res += mx_mysql_statement_param_bind(stmt, 0, uint64, &(job_id)); + assert(res == 0); + + res = mx_mysql_statement_execute(stmt, &num_rows); + + if (res < 0) + mx_log_err("mx_mysql_statement_execute(): %m"); + + mx_mysql_statement_close(&stmt); + + if (res < 0) + return -(errno=-res); + + return (int)num_rows; +} int main(int argc, char *argv[]) { @@ -171,6 +262,7 @@ int main(int argc, char *argv[]) int res; uint64_t arg_group_id; + uint64_t arg_job_id; char arg_debug; uint64_t arg_uid; @@ -189,6 +281,7 @@ int main(int argc, char *argv[]) MX_OPTION_REQUIRED_ARG("user", 'u'), MX_OPTION_REQUIRED_ARG("group-id", 'g'), + MX_OPTION_REQUIRED_ARG("job-id", 'J'), MX_OPTION_OPTIONAL_ARG("mysql-default-file", 'M'), MX_OPTION_OPTIONAL_ARG("mysql-default-group", 'S'), @@ -204,6 +297,7 @@ int main(int argc, char *argv[]) arg_mysql_default_file = MXQ_MYSQL_DEFAULT_FILE; arg_group_id = 0; + arg_job_id = 0; arg_debug = 0; arg_uid = UINT64_UNSET; @@ -278,6 +372,15 @@ int main(int argc, char *argv[]) } break; + case 'J': + if (mx_strtou64(optctl.optarg, &arg_job_id) < 0 || !arg_job_id) { + if (!arg_job_id) + errno = ERANGE; + mx_log_err("Invalid argument for --job-id '%s': %m", optctl.optarg); + exit(1); + } + break; + case 'M': arg_mysql_default_file = optctl.optarg; break; @@ -290,7 +393,7 @@ int main(int argc, char *argv[]) MX_GETOPT_FINISH(optctl, argc, argv); - if (!arg_group_id) { + if (!arg_group_id && !arg_job_id) { print_usage(); exit(EX_USAGE); } @@ -328,6 +431,51 @@ int main(int argc, char *argv[]) mx_log_info("MySQL: Connection to database established."); + if (arg_job_id) { + int res1, res2; + + res1 = update_job_status_cancelling_by_job_id_for_user(mysql, arg_job_id, passwd->pw_uid); + res2 = 
update_job_status_cancelled_by_job_id(mysql, arg_job_id); + + mx_mysql_finish(&mysql); + mx_log_info("MySQL: Connection to database closed."); + + if (res1 == -ENOENT) + res1=0; + + if (res2 == -ENOENT) + res2=0; /* treat -ENOENT like "0 rows", same as res1 above; res1 must keep its own result */ + + if (res1 < 0) + mx_log_err("setting status of job %lu to CANCELLING failed: %s", arg_job_id, strerror(-res1)); + + if (res2 < 0) + mx_log_err("setting status of job %lu to CANCELLED failed: %s", arg_job_id, strerror(-res2)); /* report the error of the step that failed */ + + if (res2 > 0) { + mx_log_notice("Job %lu cancelled!", arg_job_id); + return 0; + } + + if (res1 > 0) { + mx_log_notice("Updated status of job %lu to CANCELLING.", arg_job_id); + if (res2 == 0) { + mx_log_warning("Updating status of job %lu to CANCELLED failed. Job vanished. Please retry.", arg_job_id); + return 2; + } + return 1; + } + + if (res1 == 0 && res2 == 0) { + mx_log_notice("No queued job with job_id=%lu for user %s(%d) found in q.", arg_job_id, passwd->pw_name, passwd->pw_uid); + mx_log_warning("Killing a single job is not implemented yet."); + mx_log_warning("See https://github.molgen.mpg.de/mariux64/mxq/issues/4 for more details."); + return 0; + } + + return 1; + } + if (arg_group_id) { memset(&group, 0, sizeof(group)); From 737278b8928a5352012f9363f24b1d57ed778d86 Mon Sep 17 00:00:00 2001 From: Marius Tolzmann Date: Thu, 13 Aug 2015 12:44:17 +0200 Subject: [PATCH 5/5] mxq_job: Add new intermediate status KILLING(399) RUNNING -> KILLING -> (RUNNING -> *) LOADED -> KILLING -> (RUNNING -> *) --- mxq_job.c | 2 ++ mxq_job.h | 6 +++++- mysql/create_tables.sql | 8 ++++---- web/pages/mxq/mxq.in | 1 + 4 files changed, 12 insertions(+), 5 deletions(-) diff --git a/mxq_job.c b/mxq_job.c index e77f7e3..1f909de 100644 --- a/mxq_job.c +++ b/mxq_job.c @@ -130,6 +130,8 @@ char *mxq_job_status_to_name(uint64_t status) return "stopped"; case MXQ_JOB_STATUS_EXIT: return "exited"; + case MXQ_JOB_STATUS_KILLING: + return "killing"; case MXQ_JOB_STATUS_KILLED: return "killed"; case MXQ_JOB_STATUS_FAILED: diff --git a/mxq_job.h 
b/mxq_job.h index b52cd43..5d78ba1 100644 --- a/mxq_job.h +++ b/mxq_job.h @@ -69,16 +69,20 @@ struct mxq_job { #define MXQ_JOB_STATUS_EXTRUNNING 300 #define MXQ_JOB_STATUS_STOPPED 350 -#define MXQ_JOB_STATUS_EXIT 1024 +#define MXQ_JOB_STATUS_KILLING 399 + #define MXQ_JOB_STATUS_KILLED 400 #define MXQ_JOB_STATUS_FAILED 750 #define MXQ_JOB_STATUS_UNKNOWN_PRE 755 #define MXQ_JOB_STATUS_CANCELLING 989 + #define MXQ_JOB_STATUS_CANCELLED 990 #define MXQ_JOB_STATUS_UNKNOWN 999 #define MXQ_JOB_STATUS_FINISHED 1000 +#define MXQ_JOB_STATUS_EXIT 1024 + #define MXQ_JOB_FLAGS_RESTART_ON_HOSTFAIL (1<<0) #define MXQ_JOB_FLAGS_REQUEUE_ON_HOSTFAIL (1<<1) diff --git a/mysql/create_tables.sql b/mysql/create_tables.sql index c318497..deeb890 100644 --- a/mysql/create_tables.sql +++ b/mysql/create_tables.sql @@ -200,7 +200,7 @@ CREATE TRIGGER mxq_update_job BEFORE UPDATE ON mxq_job group_slots_running=group_slots_running-OLD.host_slots+NEW.host_slots, group_mtime=NULL WHERE group_id=NEW.group_id; - ELSEIF NEW.job_status IN (400, 750) AND OLD.job_status IN (150, 200, 250, 300, 350) THEN + ELSEIF NEW.job_status IN (400, 750) AND OLD.job_status IN (150, 200, 250, 300, 350, 399) THEN UPDATE mxq_group SET group_slots_running=group_slots_running-NEW.host_slots, group_jobs_running=group_jobs_running-1, @@ -220,7 +220,7 @@ CREATE TRIGGER mxq_update_job BEFORE UPDATE ON mxq_job group_jobs_cancelled=group_jobs_cancelled+1, group_mtime=NULL WHERE group_id=NEW.group_id; - ELSEIF NEW.job_status = 999 AND OLD.job_status IN (150, 200, 250) THEN + ELSEIF NEW.job_status = 999 AND OLD.job_status IN (150, 200, 250, 399) THEN UPDATE mxq_group SET group_slots_running=group_slots_running-NEW.host_slots, group_jobs_running=group_jobs_running-1, @@ -233,7 +233,7 @@ CREATE TRIGGER mxq_update_job BEFORE UPDATE ON mxq_job group_jobs_unknown=group_jobs_unknown+1, group_mtime=NULL WHERE group_id=NEW.group_id; - ELSEIF NEW.job_status = 1000 AND OLD.job_status IN (150, 200, 250, 300, 350) THEN + ELSEIF 
NEW.job_status = 1000 AND OLD.job_status IN (150, 200, 250, 300, 350, 399) THEN UPDATE mxq_group SET group_slots_running=group_slots_running-NEW.host_slots, group_jobs_running=group_jobs_running-1, @@ -250,7 +250,7 @@ CREATE TRIGGER mxq_update_job BEFORE UPDATE ON mxq_job stats_total_real_sec_finished=stats_total_real_sec_finished+NEW.stats_real_sec, group_mtime=NULL WHERE group_id=NEW.group_id; - ELSEIF NEW.job_status NOT IN (755, 989, 990) THEN + ELSEIF NEW.job_status NOT IN (399, 755, 989, 990) THEN UPDATE mxq_group SET stats_max_maxrss=GREATEST(stats_max_maxrss, NEW.stats_maxrss), stats_max_utime_sec=GREATEST(stats_max_utime_sec, NEW.stats_utime_sec), diff --git a/web/pages/mxq/mxq.in b/web/pages/mxq/mxq.in index 21a3bdc..8e7cfd0 100755 --- a/web/pages/mxq/mxq.in +++ b/web/pages/mxq/mxq.in @@ -82,6 +82,7 @@ sub db_init { EXTRUNNING => 300, STOPPED => 350, EXIT => 1024, + KILLING => 399, KILLED => 400, FAILED => 750, UNKNOWN_PRE => 755,