Patch summary (free text ahead of the first "diff --git" header)

What this patch does:
  * Makefile: adds MXQ_MYSQL_DEFAULT_GROUP (mxqclient) and passes it to
    mxqsub.o, drops every dependency on mxq.h, and stops linking
    mxq_mysql.o into mxqsub.
  * mxq.h: deleted (struct mxq_job_full, its list types and the
    MXQ_MYSQL_DEFAULT_FILE fallback).
  * mxq_job.h: adds the MXQ_JOB_FLAGS_* bit definitions.
  * mxq_util.c / mxq_util.h: removes mxq_free_job().
  * mxqsub.c: adds the -r option, stores the chosen flags in the new
    job_flags parameter of the INSERT, sums and asserts the results of
    the parameter binds, closes the statement on the add_job() error
    paths, and adds info-level log messages.
  * mysql/create_tables: adds the job_id_new column.

Review fixes applied to lines added by this patch:
  * MXQ_JOB_FLAGS_* are built from 1ULL instead of 1.  Shifting an int
    by 62 or 63 exceeds its width where int is 32 bits, which is
    undefined behaviour; job_flags is bound as a 64-bit value.
  * arg_jobflags is unsigned long long instead of char, so it can hold
    every MXQ_JOB_FLAGS_* bit before it is copied into job.job_flags.
  * assert(res ==0) is spelled assert(res == 0), as in load_group_id().

Open points, not changed here:
  * NOTE(review): the option table registers "restartable" while the
    usage text and the error message say --restart; confirm that
    mx_getopt accepts the abbreviation.
  * NOTE(review): the new log messages print ids with %lu; confirm this
    matches the id types on every supported platform.

Reconstruction caveats:
  * The targets of the system #include lines were already missing in
    the text this patch was rebuilt from; they are left bare.
  * Indentation, column alignment and the position of blank context
    lines were rebuilt to match the hunk header counts.  Verify against
    the repository before applying.  The index lines still name the
    original blobs.

diff --git a/Makefile b/Makefile
index e0c3fc5..60fc6b9 100644
--- a/Makefile
+++ b/Makefile
@@ -46,11 +46,14 @@ endif
 
 ##############################################################################
 
-MXQ_MYSQL_DEFAULT_FILE = ${SYSCONFDIR}/mxq/mysql.cnf
+MXQ_MYSQL_DEFAULT_FILE  = ${SYSCONFDIR}/mxq/mysql.cnf
+MXQ_MYSQL_DEFAULT_GROUP = mxqclient
+
 MXQ_INITIAL_PATH = /sbin:/bin:/usr/sbin:/usr/bin:/usr/local/sbin:/usr/local/bin
 
-CFLAGS_MXQ_MYSQL_DEFAULT_FILE = -DMXQ_MYSQL_DEFAULT_FILE=\"$(MXQ_MYSQL_DEFAULT_FILE)\"
-CFLAGS_MXQ_INITIAL_PATH       = -DMXQ_INITIAL_PATH=\"$(MXQ_INITIAL_PATH)\"
+CFLAGS_MXQ_MYSQL_DEFAULT_FILE  = -DMXQ_MYSQL_DEFAULT_FILE=\"$(MXQ_MYSQL_DEFAULT_FILE)\"
+CFLAGS_MXQ_MYSQL_DEFAULT_GROUP = -DMXQ_MYSQL_DEFAULT_GROUP=\"$(MXQ_MYSQL_DEFAULT_GROUP)\"
+CFLAGS_MXQ_INITIAL_PATH        = -DMXQ_INITIAL_PATH=\"$(MXQ_INITIAL_PATH)\"
 
 MYSQL_CONFIG = mysql_config
 
@@ -177,12 +180,6 @@ mx_flock.h += mx_flock.h
 mx_mysql.h += mx_mysql.h
 mx_mysql.h += $(mx_util.h)
 
-### mxq.h --------------------------------------------------------------
-
-mxq.h += mxq.h
-mxq.h += $(mxq_group.h)
-mxq.h += $(mxq_job.h)
-
 ### mxq_mysql.h --------------------------------------------------------
 
 mxq_mysql.h += mxq_mysql.h
@@ -192,7 +189,6 @@ mxq_mysql.h += $(mxq_util.h)
 
 mxq_util.h += mxq_util.h
 mxq_util.h += $(mx_log.h)
-mxq_util.h += $(mxq.h)
 
 ### mxq_group.h --------------------------------------------------------
 
@@ -314,7 +310,6 @@ clean: CLEAN += mxq_group.o
 
 mxq_job.o: $(mx_util.h)
 mxq_job.o: $(mx_log.h)
-mxq_job.o: $(mxq.h)
 mxq_job.o: $(mxq_job.h)
 mxq_job.o: $(mxq_group.h)
 mxq_job.o: $(mxq_mysql.h)
@@ -342,13 +337,15 @@ clean: CLEAN += mxqd.o
 ### mxqsub.o -------------------------------------------------------
 
 mxqsub.o: $(mx_getopt.h)
-mxqsub.o: $(mxq_util.h)
 mxqsub.o: $(mx_util.h)
 mxqsub.o: $(mx_log.h)
-mxqsub.o: $(mxq_mysql.h)
 mxqsub.o: $(mx_mysql.h)
+mxqsub.o: $(mxq_group.h)
+mxqsub.o: $(mxq_job.h)
+mxqsub.o: $(mxq_util.h)
 mxqsub.o: CFLAGS += $(CFLAGS_MYSQL)
 mxqsub.o: CFLAGS += $(CFLAGS_MXQ_MYSQL_DEFAULT_FILE)
+mxqsub.o: CFLAGS += $(CFLAGS_MXQ_MYSQL_DEFAULT_GROUP)
 
 clean: CLEAN += mxqsub.o
 
@@ -377,7 +374,6 @@ install:: mxqd
 ### mxqsub ------------------------------------------------------------
 
 mxqsub: mx_getopt.o
-mxqsub: mxq_mysql.o
 mxqsub: mxq_util.o
 mxqsub: mx_util.o
 mxqsub: mx_log.o
diff --git a/mxq.h b/mxq.h
deleted file mode 100644
index a2c0246..0000000
--- a/mxq.h
+++ /dev/null
@@ -1,78 +0,0 @@
-#ifndef __MXQ_H__
-#define __MXQ_H__ 1
-
-#include
-#include
-#include
-
-#include "mxq_group.h"
-#include "mxq_job.h"
-
-struct mxq_job_full {
-    u_int64_t job_id;
-    u_int8_t job_status;
-    u_int16_t job_priority;
-
-    char group_id[512];
-    u_int8_t group_status;
-    u_int16_t group_priority;
-
-    uid_t user_uid;
-    char user_name[256];
-    gid_t user_gid;
-    char user_group[256];
-
-    u_int16_t job_threads;
-    u_int64_t job_memory;
-    u_int32_t job_time;
-
-    char job_workdir[4096];
-    char job_command[4096];
-    u_int16_t job_argc;
-    char job_argv[40960];
-
-    char job_stdout[4096];
-    char job_stderr[4096];
-
-    char tmp_stdout[4096];
-    char tmp_stderr[4096];
-
-    mode_t job_umask;
-
-    char host_submit[1024];
-
-    char server_id[1024];
-
-    char host_hostname[1014];
-    pid_t host_pid;
-
-    char date_submit[256];
-    char date_start[256];
-    char date_end[256];
-
-    struct timeval stats_starttime;
-
-    int stats_status;
-    struct timeval stats_realtime;
-    struct rusage stats_rusage;
-};
-
-#ifndef MXQ_MYSQL_DEFAULT_FILE
-#define MXQ_MYSQL_DEFAULT_FILE NULL
-#endif
-
-struct mxq_job_full_list {
-    struct mxq_job_full_list_item *first;
-    struct mxq_job_full_list_item *last;
-
-    int count;
-};
-
-struct mxq_job_full_list_item {
-    struct mxq_job_full_list_item *next;
-    struct mxq_job_full_list_item *prev;
-
-    struct mxq_job_full *job;
-};
-
-#endif
diff --git a/mxq_group.c b/mxq_group.c
index bb4c7ba..46007fd 100644
--- a/mxq_group.c
+++ b/mxq_group.c
@@ -8,6 +8,7 @@
 #include "mx_log.h"
 
 #include "mxq_group.h"
+#include "mxq_job.h"
 #include "mxq_mysql.h"
 
 #define MXQ_GROUP_FIELDS "group_id," \
diff --git a/mxq_job.c b/mxq_job.c
index b8ee941..6c73846 100644
--- a/mxq_job.c
+++ b/mxq_job.c
@@ -16,7 +16,6 @@
 #include "mxq_group.h"
 #include "mxq_job.h"
 #include "mxq_mysql.h"
-#include "mxq.h"
 
 #define MXQ_JOB_FIELDS "job_id, " \
                 "job_status, " \
diff --git a/mxq_job.h b/mxq_job.h
index ff48a9b..80a5a5f 100644
--- a/mxq_job.h
+++ b/mxq_job.h
@@ -4,6 +4,9 @@
 #include
 #include
 
+#include
+#include
+
 #include "mxq_group.h"
 
 struct mxq_job {
@@ -74,6 +77,13 @@ struct mxq_job {
 #define MXQ_JOB_STATUS_UNKNOWN 999
 #define MXQ_JOB_STATUS_FINISHED 1000
 
+#define MXQ_JOB_FLAGS_RESTART_ON_HOSTFAIL (1ULL<<0)
+#define MXQ_JOB_FLAGS_REQUEUE_ON_HOSTFAIL (1ULL<<1)
+
+#define MXQ_JOB_FLAGS_AUTORESTART (1ULL<<62)
+#define MXQ_JOB_FLAGS_HOSTFAIL (1ULL<<63)
+
+
 #define _to_string(s) #s
 #define status_str(x) _to_string(x)
 
diff --git a/mxq_job_dump.c b/mxq_job_dump.c
index 352816a..604d077 100644
--- a/mxq_job_dump.c
+++ b/mxq_job_dump.c
@@ -12,6 +12,9 @@
 #include "mxq_util.h"
 #include "mxq_mysql.h"
 
+#include "mxq_group.h"
+#include "mxq_job.h"
+
 struct result {
     struct mxq_job job;
 
diff --git a/mxq_util.c b/mxq_util.c
index 055921f..a44335a 100644
--- a/mxq_util.c
+++ b/mxq_util.c
@@ -94,14 +94,6 @@ void *realloc_forever(void *ptr, size_t size)
     } while (1);
 }
 
-void mxq_free_job(struct mxq_job_full *job)
-{
-    if (!job)
-        return;
-
-    free(job);
-}
-
 char **strvec_new(void)
 {
     char **strvec;
diff --git a/mxq_util.h b/mxq_util.h
index 91a5b38..a0cb2a3 100644
--- a/mxq_util.h
+++ b/mxq_util.h
@@ -6,12 +6,8 @@
 #include
 #include
 
-#include "mxq.h"
-
 char *mxq_hostname(void);
 
-void mxq_free_job(struct mxq_job_full *job);
-
 char **stringtostringvec(int argc, char *s);
 char *stringvectostring(int argc, char *argv[]);
 int chrcnt(char *s, char c);
diff --git a/mxqd.c b/mxqd.c
index f6985dc..de2b64d 100644
--- a/mxqd.c
+++ b/mxqd.c
@@ -30,7 +30,6 @@
 #include "mx_util.h"
 #include "mx_log.h"
 
-#include "mxq.h"
 #include "mxq_group.h"
 #include "mxq_job.h"
 #include "mxq_mysql.h"
diff --git a/mxqdump.c b/mxqdump.c
index ae54a61..4d1fb9f 100644
--- a/mxqdump.c
+++ b/mxqdump.c
@@ -14,6 +14,8 @@
 #include "mxq_mysql.h"
 #include "mx_getopt.h"
 
+#include "mxq_group.h"
+
 #ifndef MXQ_VERSION
 #define MXQ_VERSION "0.00"
 #endif
diff --git a/mxqkill.c b/mxqkill.c
index 8093644..27db9c1 100644
--- a/mxqkill.c
+++ b/mxqkill.c
@@ -22,6 +22,9 @@
 #include "mxq_mysql.h"
 #include "mx_getopt.h"
 
+#include "mxq_group.h"
+#include "mxq_job.h"
+
 #ifndef MXQ_VERSION
 #define MXQ_VERSION "0.00"
 #endif
diff --git a/mxqsub.c b/mxqsub.c
index 8b33eb0..0361a6a 100644
--- a/mxqsub.c
+++ b/mxqsub.c
@@ -21,8 +21,17 @@
 
 #include
 
+#include
+#include
+
+#include
+
 #include "mxq_util.h"
 
+#include "mxq_group.h"
+#include "mxq_job.h"
+
+#include "mx_log.h"
 #include "mx_util.h"
 #include "mx_getopt.h"
 #include "mx_mysql.h"
@@ -45,8 +54,18 @@
 #define MXQ_VERSIONDATE "2015"
 #endif
 
-#define MYSQL_DEFAULT_FILE MXQ_MYSQL_DEFAULT_FILE
-#define MYSQL_DEFAULT_GROUP "mxqsub"
+#ifndef MXQ_MYSQL_DEFAULT_FILE
+# define MXQ_MYSQL_DEFAULT_FILE NULL
+# define MXQ_MYSQL_DEFAULT_FILE_STR "\"MySQL defaults\""
+#else
+# define MXQ_MYSQL_DEFAULT_FILE_STR MXQ_MYSQL_DEFAULT_FILE
+#endif
+
+#ifndef MXQ_MYSQL_DEFAULT_GROUP
+# define MXQ_MYSQL_DEFAULT_GROUP program_invocation_short_name
+#endif
+#define MXQ_MYSQL_DEFAULT_GROUP_STR MXQ_MYSQL_DEFAULT_GROUP
+
 
 static void print_version(void)
 {
@@ -63,7 +82,7 @@ static void print_usage(void)
     printf(
     "\n"
     "Usage:\n"
-    " mxqsub [mxqsub-options] [command options and arguments ..]\n"
+    " %s [mxqsub-options] [command options and arguments ..]\n"
     "\n"
     "Synopsis:\n"
     " queue a job to be executed on a cluster node.\n"
@@ -87,6 +106,15 @@ static void print_usage(void)
     " -m | --memory set amount of memory in MiB (default: 2048)\n"
     " -t | --time set runtime in minutes (default: 15)\n"
     "\n"
+    "Job handling:\n"
+    " Define what to do if something bad happens.\n"
+    "\n"
+    " -r | --restart [restartmode] restart job on system failure (default: 'never')\n"
+    "\n"
+    " available [restartmode]s:\n"
+    " 'samehost' only restart if running on the same host.\n"
+    " 'always' always restart or requeue. (default)\n"
+    "\n"
     "Job grouping:\n"
     " Grouping is done by default based on the jobs resource\n"
     " and priority information, so that jobs using the same\n"
@@ -106,13 +134,16 @@ static void print_usage(void)
     "\n"
     "Change how to connect to the mysql server:\n"
     "\n"
-    " -M | --mysql-default-file [mysql-file] (default: " MYSQL_DEFAULT_FILE ")\n"
-    " -S | --mysql-default-group [mysql-group] (default: " MYSQL_DEFAULT_GROUP ")\n"
+    " -M | --mysql-default-file [mysql-file] (default: %s)\n"
+    " -S | --mysql-default-group [mysql-group] (default: %s)\n"
     "\n"
     "Environment:\n"
     " MXQ_MYSQL_DEFAULT_FILE change default for [mysql-file]\n"
     " MXQ_MYSQL_DEFAULT_GROUP change default for [mysql-group]\n"
-    "\n"
+    "\n",
+    program_invocation_short_name,
+    MXQ_MYSQL_DEFAULT_FILE_STR,
+    MXQ_MYSQL_DEFAULT_GROUP_STR
     );
 }
 
@@ -159,19 +190,17 @@ static int load_group_id(struct mx_mysql *mysql, struct mxq_group *g)
         return res;
     }
 
-    assert(mx_mysql_statement_field_count(stmt) == 1);
-    assert(mx_mysql_statement_param_count(stmt) == 10);
-
-    mx_mysql_statement_param_bind(stmt, 0, string, &(g->group_name));
-    mx_mysql_statement_param_bind(stmt, 1, uint32, &(g->user_uid));
-    mx_mysql_statement_param_bind(stmt, 2, string, &(g->user_name));
-    mx_mysql_statement_param_bind(stmt, 3, uint32, &(g->user_gid));
-    mx_mysql_statement_param_bind(stmt, 4, string, &(g->user_group));
-    mx_mysql_statement_param_bind(stmt, 5, string, &(g->job_command));
-    mx_mysql_statement_param_bind(stmt, 6, uint16, &(g->job_threads));
-    mx_mysql_statement_param_bind(stmt, 7, uint64, &(g->job_memory));
-    mx_mysql_statement_param_bind(stmt, 8, uint32, &(g->job_time));
-    mx_mysql_statement_param_bind(stmt, 9, uint16, &(g->group_priority));
+    res = mx_mysql_statement_param_bind(stmt, 0, string, &(g->group_name));
+    res += mx_mysql_statement_param_bind(stmt, 1, uint32, &(g->user_uid));
+    res += mx_mysql_statement_param_bind(stmt, 2, string, &(g->user_name));
+    res += mx_mysql_statement_param_bind(stmt, 3, uint32, &(g->user_gid));
+    res += mx_mysql_statement_param_bind(stmt, 4, string, &(g->user_group));
+    res += mx_mysql_statement_param_bind(stmt, 5, string, &(g->job_command));
+    res += mx_mysql_statement_param_bind(stmt, 6, uint16, &(g->job_threads));
+    res += mx_mysql_statement_param_bind(stmt, 7, uint64, &(g->job_memory));
+    res += mx_mysql_statement_param_bind(stmt, 8, uint32, &(g->job_time));
+    res += mx_mysql_statement_param_bind(stmt, 9, uint16, &(g->group_priority));
+    assert(res == 0);
 
     res = mx_mysql_statement_execute(stmt, &num_rows);
     if (res < 0) {
@@ -233,19 +262,17 @@ static int add_group(struct mx_mysql *mysql, struct mxq_group *g)
         return res;
     }
 
-    assert(mx_mysql_statement_field_count(stmt) == 0);
-    assert(mx_mysql_statement_param_count(stmt) == 10);
-
-    mx_mysql_statement_param_bind(stmt, 0, string, &(g->group_name));
-    mx_mysql_statement_param_bind(stmt, 1, uint32, &(g->user_uid));
-    mx_mysql_statement_param_bind(stmt, 2, string, &(g->user_name));
-    mx_mysql_statement_param_bind(stmt, 3, uint32, &(g->user_gid));
-    mx_mysql_statement_param_bind(stmt, 4, string, &(g->user_group));
-    mx_mysql_statement_param_bind(stmt, 5, string, &(g->job_command));
-    mx_mysql_statement_param_bind(stmt, 6, uint16, &(g->job_threads));
-    mx_mysql_statement_param_bind(stmt, 7, uint64, &(g->job_memory));
-    mx_mysql_statement_param_bind(stmt, 8, uint32, &(g->job_time));
-    mx_mysql_statement_param_bind(stmt, 9, uint16, &(g->group_priority));
+    res = mx_mysql_statement_param_bind(stmt, 0, string, &(g->group_name));
+    res += mx_mysql_statement_param_bind(stmt, 1, uint32, &(g->user_uid));
+    res += mx_mysql_statement_param_bind(stmt, 2, string, &(g->user_name));
+    res += mx_mysql_statement_param_bind(stmt, 3, uint32, &(g->user_gid));
+    res += mx_mysql_statement_param_bind(stmt, 4, string, &(g->user_group));
+    res += mx_mysql_statement_param_bind(stmt, 5, string, &(g->job_command));
+    res += mx_mysql_statement_param_bind(stmt, 6, uint16, &(g->job_threads));
+    res += mx_mysql_statement_param_bind(stmt, 7, uint64, &(g->job_memory));
+    res += mx_mysql_statement_param_bind(stmt, 8, uint32, &(g->job_time));
+    res += mx_mysql_statement_param_bind(stmt, 9, uint16, &(g->group_priority));
+    assert(res == 0);
 
     res = mx_mysql_statement_execute(stmt, &num_rows);
     if (res < 0) {
@@ -300,28 +327,32 @@ static int add_job(struct mx_mysql *mysql, struct mxq_job *j)
 
                 " job_umask = ?,"
 
-                " host_submit = ?");
+                " host_submit = ?,"
+
+                " job_flags = ?"
+                );
     if (res < 0) {
         mx_log_err("mx_mysql_statement_prepare(): %m");
+        mx_mysql_statement_close(&stmt);
         return res;
     }
 
-    assert(mx_mysql_statement_field_count(stmt) == 0);
-    assert(mx_mysql_statement_param_count(stmt) == 9);
-
-    mx_mysql_statement_param_bind(stmt, 0, uint16, &(j->job_priority));
-    mx_mysql_statement_param_bind(stmt, 1, uint64, &(j->group_id));
-    mx_mysql_statement_param_bind(stmt, 2, string, &(j->job_workdir));
-    mx_mysql_statement_param_bind(stmt, 3, uint16, &(j->job_argc));
-    mx_mysql_statement_param_bind(stmt, 4, string, &(j->job_argv_str));
-    mx_mysql_statement_param_bind(stmt, 5, string, &(j->job_stdout));
-    mx_mysql_statement_param_bind(stmt, 6, string, &(j->job_stderr));
-    mx_mysql_statement_param_bind(stmt, 7, uint32, &(j->job_umask));
-    mx_mysql_statement_param_bind(stmt, 8, string, &(j->host_submit));
+    res = mx_mysql_statement_param_bind(stmt, 0, uint16, &(j->job_priority));
+    res += mx_mysql_statement_param_bind(stmt, 1, uint64, &(j->group_id));
+    res += mx_mysql_statement_param_bind(stmt, 2, string, &(j->job_workdir));
+    res += mx_mysql_statement_param_bind(stmt, 3, uint16, &(j->job_argc));
+    res += mx_mysql_statement_param_bind(stmt, 4, string, &(j->job_argv_str));
+    res += mx_mysql_statement_param_bind(stmt, 5, string, &(j->job_stdout));
+    res += mx_mysql_statement_param_bind(stmt, 6, string, &(j->job_stderr));
+    res += mx_mysql_statement_param_bind(stmt, 7, uint32, &(j->job_umask));
+    res += mx_mysql_statement_param_bind(stmt, 8, string, &(j->host_submit));
+    res += mx_mysql_statement_param_bind(stmt, 9, uint64, &(j->job_flags));
+    assert(res == 0);
 
     res = mx_mysql_statement_execute(stmt, &num_rows);
     if (res < 0) {
         mx_log_err("mx_mysql_statement_execute(): %m");
+        mx_mysql_statement_close(&stmt);
         return res;
     }
 
@@ -356,6 +387,11 @@ static int mxq_submit_task(struct mx_mysql *mysql, struct mxq_job *j, int flags)
             mx_log_err("Failed to add new group.");
             return -(errno=EIO);
         }
+
+        mx_log_info("The new job will be added to new group with group_id=%lu", g->group_id);
+
+    } else {
+        mx_log_info("The new job will be attached to existing group with group_id=%lu", g->group_id);
     }
 
     assert(g->group_id);
@@ -371,6 +407,8 @@ static int mxq_submit_task(struct mx_mysql *mysql, struct mxq_job *j, int flags)
         return -(errno=EIO);
     }
 
+    mx_log_info("The new job has been queued successfully with job_id=%lu in group with group_id=%lu", j->job_id, g->group_id);
+
     assert(j->job_id);
 
     return res;
@@ -397,6 +435,7 @@ int main(int argc, char *argv[])
     char *arg_mysql_default_file;
     char *arg_mysql_default_group;
     char arg_debug;
+    unsigned long long arg_jobflags;
 
     _mx_cleanup_free_ char *current_workdir = NULL;
     _mx_cleanup_free_ char *arg_stdout_absolute = NULL;
@@ -428,6 +467,8 @@ int main(int argc, char *argv[])
                 MX_OPTION_NO_ARG("debug", 5),
                 MX_OPTION_NO_ARG("verbose", 'v'),
 
+                MX_OPTION_OPTIONAL_ARG("restartable", 'r'),
+
                 MX_OPTION_REQUIRED_ARG("group-name", 'N'),
                 MX_OPTION_REQUIRED_ARG("group-priority", 'P'),
 
@@ -469,14 +510,15 @@ int main(int argc, char *argv[])
     arg_stderr = "stdout";
     arg_umask = getumask();
     arg_debug = 0;
+    arg_jobflags = 0;
 
     arg_mysql_default_group = getenv("MXQ_MYSQL_DEFAULT_GROUP");
     if (!arg_mysql_default_group)
-        arg_mysql_default_group = MYSQL_DEFAULT_GROUP;
+        arg_mysql_default_group = MXQ_MYSQL_DEFAULT_GROUP;
 
     arg_mysql_default_file = getenv("MXQ_MYSQL_DEFAULT_FILE");
     if (!arg_mysql_default_file)
-        arg_mysql_default_file = MYSQL_DEFAULT_FILE;
+        arg_mysql_default_file = MXQ_MYSQL_DEFAULT_FILE;
 
     /******************************************************************/
 
@@ -507,6 +549,20 @@ int main(int argc, char *argv[])
                 mx_log_level_set(MX_LOG_INFO);
                 break;
 
+            case 'r':
+                if (!optctl.optarg || streq(optctl.optarg, "always")) {
+                    arg_jobflags |= MXQ_JOB_FLAGS_RESTART_ON_HOSTFAIL;
+                    arg_jobflags |= MXQ_JOB_FLAGS_REQUEUE_ON_HOSTFAIL;
+                } else if (streq(optctl.optarg, "samehost")) {
+                    arg_jobflags |= MXQ_JOB_FLAGS_RESTART_ON_HOSTFAIL;
+                } else if (streq(optctl.optarg, "never")) {
+                    arg_jobflags &= ~(MXQ_JOB_FLAGS_RESTART_ON_HOSTFAIL|MXQ_JOB_FLAGS_REQUEUE_ON_HOSTFAIL);
+                } else {
+                    mx_log_crit("--restart '%s': restartmode unknown.", optctl.optarg);
+                    exit(EX_CONFIG);
+                }
+                break;
+
             case 'p':
                 if (mx_strtou16(optctl.optarg, &arg_priority) < 0) {
                     mx_log_crit("--priority '%s': %m", optctl.optarg);
@@ -671,6 +727,7 @@ int main(int argc, char *argv[])
     group.job_memory = arg_memory;
     group.job_time = arg_time;
 
+    job.job_flags = arg_jobflags;
     job.job_priority = arg_priority;
     job.job_workdir = arg_workdir;
     job.job_stdout = arg_stdout;
@@ -716,10 +773,14 @@ int main(int argc, char *argv[])
     res = mx_mysql_connect_forever(&mysql);
     assert(res == 0);
 
+    mx_log_info("MySQL: Connection to database established.");
+
     res = mxq_submit_task(mysql, &job, flags);
 
     mx_mysql_finish(&mysql);
 
+    mx_log_info("MySQL: Connection to database closed.");
+
     if (res < 0) {
         mx_log_err("Job submission failed: %m");
         return 1;
diff --git a/mysql/create_tables b/mysql/create_tables
index c187420..3c38705 100644
--- a/mysql/create_tables
+++ b/mysql/create_tables
@@ -1,4 +1,8 @@
 
+ALTER TABLE mxq_job
+    ADD COLUMN job_id_new INT8 UNSIGNED NULL DEFAULT NULL
+    AFTER date_end;
+
 ALTER TABLE mxq_job
     ADD COLUMN job_flags INT8 UNSIGNED NOT NULL DEFAULT 0
     AFTER job_status;
@@ -82,6 +86,8 @@ CREATE TABLE IF NOT EXISTS mxq_job (
     date_start TIMESTAMP NOT NULL DEFAULT 0,
     date_end TIMESTAMP NOT NULL DEFAULT 0,
 
+    job_id_new INT8 UNSIGNED NULL DEFAULT NULL,
+
     stats_status INT4 UNSIGNED NOT NULL DEFAULT 0,
 
     stats_utime_sec INT4 UNSIGNED NOT NULL DEFAULT 0,