Skip to content

Commit

Permalink
mxqd: implement time constraint
Browse files Browse the repository at this point in the history
This allows mxqd to be started with "-t <minutes>" to set a job time
constraint for the server. Jobs which are submitted with a longer maximum
runtime are not started on this server.

This is an emergency implementation and might need some
cleanup/rethinking in the future. But we need the feature now.
  • Loading branch information
donald committed Jul 13, 2016
1 parent 19b1545 commit 0a4a773
Show file tree
Hide file tree
Showing 2 changed files with 22 additions and 23 deletions.
44 changes: 21 additions & 23 deletions mxqd.c
Original file line number Diff line number Diff line change
Expand Up @@ -340,6 +340,7 @@ int server_init(struct mxq_server *server, int argc, char *argv[])
unsigned long arg_memory_total = 2048;
unsigned long arg_memory_limit_slot_soft = 0;
unsigned long arg_memory_limit_slot_hard = 0;
unsigned long arg_maxtime = 0;
int i;
struct mxq_daemon *daemon = &server->daemon;

Expand Down Expand Up @@ -369,6 +370,7 @@ int server_init(struct mxq_server *server, int argc, char *argv[])
MX_OPTION_REQUIRED_ARG("hostname", 6),
MX_OPTION_OPTIONAL_ARG("mysql-default-file", 'M'),
MX_OPTION_OPTIONAL_ARG("mysql-default-group", 'S'),
MX_OPTION_OPTIONAL_ARG("max-time", 't'),
MX_OPTION_END
};

Expand Down Expand Up @@ -515,6 +517,13 @@ int server_init(struct mxq_server *server, int argc, char *argv[])
case 'S':
arg_mysql_default_group = optctl.optarg;
break;

case 't':
if (mx_strtoul(optctl.optarg, &arg_maxtime) < 0) {
mx_log_err("Invalid argument supplied for option --max-time '%s': %m", optctl.optarg);
return -EX_USAGE;
}
break;
}
}

Expand Down Expand Up @@ -643,6 +652,9 @@ int server_init(struct mxq_server *server, int argc, char *argv[])
mx_log_err("MAIN: cpuset_init() failed. exiting.");
return -EX_OSERR;
}

server->maxtime = arg_maxtime;

server->memory_total = arg_memory_total;

server->memory_avg_per_slot = (long double)server->memory_total / (long double)server->slots;
Expand Down Expand Up @@ -1233,7 +1245,7 @@ unsigned long start_job(struct mxq_group_list *glist)

/**********************************************************************/

unsigned long start_user(struct mxq_user_list *ulist, int job_limit, long slots_to_start)
unsigned long start_user(struct mxq_user_list *ulist, int job_limit, long slots_to_start, int *need_more_slots)
{
struct mxq_server *server;
struct mxq_group_list *glist;
Expand Down Expand Up @@ -1277,10 +1289,13 @@ unsigned long start_user(struct mxq_user_list *ulist, int job_limit, long slots_
if (mxq_group_jobs_inq(group) == 0) {
goto start_user_continue;
}
if (server->maxtime && glist->group.job_time>server->maxtime) {
goto start_user_continue;
}
if (glist->slots_per_job > slots_to_start) {
*need_more_slots=1;
goto start_user_continue;
}

if (group->group_priority < prio) {
if (started) {
goto start_user_rewind;
Expand Down Expand Up @@ -1315,12 +1330,10 @@ unsigned long start_user(struct mxq_user_list *ulist, int job_limit, long slots_
long start_user_with_least_running_global_slot_count(struct mxq_server *server)
{
struct mxq_user_list *ulist;
struct mxq_group_list *glist;
struct mxq_group *group;
unsigned long slots_started = 0;
unsigned long slots_free;
unsigned long global_slots_per_user;
int waiting = 0;
int need_more_slots=0;

assert(server);

Expand All @@ -1336,25 +1349,10 @@ long start_user_with_least_running_global_slot_count(struct mxq_server *server)
global_slots_per_user = server->global_slots_running / server->user_cnt;

for (ulist = server->users; ulist; ulist = ulist->next) {
/* if other users are waiting and this user is already using
* more slots then avg user in cluster do not start anything
* (next users are using even more atm because list is sorted) */
if (waiting && ulist->global_slots_running > global_slots_per_user)
return -1;

slots_started = start_user(ulist, 1, slots_free);
if (slots_started)
need_more_slots=0;
slots_started = start_user(ulist, 1, slots_free, &need_more_slots);
if (slots_started || need_more_slots) {
return slots_started;

if (waiting)
continue;

for (glist = ulist->groups; glist; glist = glist->next) {
group = &glist->group;
if (glist->jobs_max > glist->jobs_running && group->group_jobs_inq) {

This comment has been minimized.

Copy link
@mariux

mariux Jul 16, 2016

Contributor

this is really really badly implemented... since there is no pull to comment on i'll do it here.. ;)

to add constraints add them to the calculation of jobs_max ... which should go equal to zero when job_time > maxtime in this case... much less implementation to do even for a quick-fix ;)

also, with this code removed the server will not balance globally anymore... which is needed even more with many maxtime servers running..

This comment has been minimized.

Copy link
@donald

donald Jul 17, 2016

Author Contributor
  • agreed, the server->maxtime constraint could go into jobs_max computation as well. Would it make a functional difference or is this more a question of style?
  • I don't see why global balance wouldn't work with this logic. Can you give an example?

This comment has been minimized.

Copy link
@mariux

mariux Jul 18, 2016

Contributor

to a) i think it's both.. style and functional.. if you calculate a jobs_max it should be used everywhere and not silently reduced to zero in some subroutines..(style) (hard to debug).. jobs_max should actually be zero here for this group because calculations should be able to depend on it.. (functional)

This comment has been minimized.

Copy link
@mariux

mariux Jul 18, 2016

Contributor

to b) because you stop starting new jobs for users even if they might be able to start jobs based on global balance. The old version continues to start more jobs as long as users are not running enough jobs (based on global_slots_per_user). In other words: if the first user in the list needs more slots to run a job, no jobs of other users are started anymore - no matter what the current state of global balance is.

This comment has been minimized.

Copy link
@donald

donald Jul 18, 2016

Author Contributor

Ok, now I understand. The previous implementation allows users even with more running jobs to get additional jobs as long as they are not over the global jobs/user ratio.

If you find time, please have a look at branch https://github.molgen.mpg.de/mariux64/mxq/commits/servertimelimit2 which follows your suggestions.

waiting = 1;
break;
}
}
}
return 0;
Expand Down
1 change: 1 addition & 0 deletions mxqd.h
Original file line number Diff line number Diff line change
Expand Up @@ -86,6 +86,7 @@ struct mxq_server {
long double memory_avg_per_slot;
unsigned long memory_limit_slot_soft;
unsigned long memory_limit_slot_hard;
unsigned long maxtime;
cpu_set_t cpu_set_available;

struct mx_mysql *mysql;
Expand Down

0 comments on commit 0a4a773

Please sign in to comment.