From 35e69d784af0aed6f317537e484a52b7cfdf63f2 Mon Sep 17 00:00:00 2001 From: Donald Buczek Date: Thu, 24 Feb 2022 16:33:27 +0100 Subject: [PATCH 1/9] mxqd: Clean up usage message --- mxqd.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/mxqd.c b/mxqd.c index a3ba9adf..f9ac629f 100644 --- a/mxqd.c +++ b/mxqd.c @@ -81,9 +81,7 @@ static void print_usage(void) " -t, --max-time default: 0 (unlimited)\n" " --prerequisites default: ''\n" " -x, --max-memory-per-slot-soft \n" - " root user: default: /\n" - " non-root user: default: \n" - " --gpu\n" + " --gpu\n" "\n" " -X, --max-memory-per-slot-hard \n" " default: \n" From 646d71caa804ba1056f454f4bc8339b17a27470c Mon Sep 17 00:00:00 2001 From: Donald Buczek Date: Thu, 24 Feb 2022 16:49:25 +0100 Subject: [PATCH 2/9] web: Remove unused job columns Don't display the job columns we are going to remove in the next commit. --- web/pages/mxq/mxq.in | 4 ---- 1 file changed, 4 deletions(-) diff --git a/web/pages/mxq/mxq.in b/web/pages/mxq/mxq.in index 544497d7..f91d6918 100755 --- a/web/pages/mxq/mxq.in +++ b/web/pages/mxq/mxq.in @@ -623,10 +623,6 @@ date_submit : $o{date_submit} date_start : $o{date_start} $ago date_end : $o{date_end} $rt -job_id_new : $o{job_id_new} -job_id_old : $o{job_id_old} -job_id_first : $o{job_id_first} - stats_max_sumrss : $o{stats_max_sumrss} kiB stats_status : $o{stats_status} From 38260a5df1557e9973316d57a6e7a89d376c0a09 Mon Sep 17 00:00:00 2001 From: Donald Buczek Date: Thu, 24 Feb 2022 16:47:27 +0100 Subject: [PATCH 3/9] mysql: Remove unused job attributes The columns were included to support job restarts. However, that has never been implemented. So remove the unused columns. Note: This change reduced the size of our mxq_job.ibd with 35824930 rows from 32GB to 24GB which is more than I've expected. --- mysql/create_tables.sql | 4 ---- mysql/migrate_014_remove_job_id_fields.sql | 4 ++++ 2 files changed, 4 insertions(+), 4 deletions(-) create mode 100644 mysql/migrate_014_remove_job_id_fields.sql diff --git a/mysql/create_tables.sql b/mysql/create_tables.sql index 3ff573f8..549b55a7 100644 --- a/mysql/create_tables.sql +++ b/mysql/create_tables.sql @@ -102,10 +102,6 @@ CREATE TABLE IF NOT EXISTS mxq_job ( date_start TIMESTAMP NOT NULL DEFAULT 0, date_end TIMESTAMP NOT NULL DEFAULT 0, - job_id_new INT8 UNSIGNED NULL DEFAULT NULL, - job_id_old INT8 UNSIGNED NULL DEFAULT NULL, - job_id_first INT8 UNSIGNED NULL DEFAULT NULL, - stats_max_sumrss INT8 UNSIGNED NOT NULL DEFAULT 0, stats_status INT4 UNSIGNED NOT NULL DEFAULT 0, diff --git a/mysql/migrate_014_remove_job_id_fields.sql b/mysql/migrate_014_remove_job_id_fields.sql new file mode 100644 index 00000000..e01a96bd --- /dev/null +++ b/mysql/migrate_014_remove_job_id_fields.sql @@ -0,0 +1,4 @@ +ALTER TABLE mxq_job + DROP COLUMN job_id_new, + DROP COLUMN job_id_old, + DROP COLUMN job_id_first; From 074977dc6b53cf47f230f2c898de38acb0535187 Mon Sep 17 00:00:00 2001 From: Donald Buczek Date: Mon, 28 Feb 2022 08:37:51 +0100 Subject: [PATCH 4/9] Makefile: Add ppidcache.h dependencies --- Makefile | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/Makefile b/Makefile index d898b93d..628def0e 100644 --- a/Makefile +++ b/Makefile @@ -347,6 +347,10 @@ mx_getopt.h += mx_getopt.h keywordset.h += keywordset.h +### ppidchache.h ------------------------------------------------------- + +ppidcache.h += ppidcache.h + ######################################################################## ### mx_getopt.o -------------------------------------------------------- @@ -489,6 +493,7 @@ mxqd.o: $(mxq_group.h) mxqd.o: $(mxq_job.h) mxqd.o: $(mx_mysql.h) mxqd.o: $(keywordset.h) +mxqd.o: $(ppidcache.h) mxqd.o: CFLAGS += $(CFLAGS_MYSQL) mxqd.o: CFLAGS += $(CFLAGS_MXQ_INITIAL_PATH) mxqd.o: CFLAGS += $(CFLAGS_MXQ_INITIAL_TMPDIR) @@ -522,6 +527,7 @@ clean: CLEAN += keywordset.o ### ppidcache.o ------------------------------------------------------- +ppidcache.o: $(ppidcache.h) ppidcache.o: $(mx_util.h) ppidcache.o: $(mx_proc.h) From 5b7e60b822239d8f08209f49ad1b7ba57c2239de Mon Sep 17 00:00:00 2001 From: Donald Buczek Date: Mon, 28 Feb 2022 08:49:48 +0100 Subject: [PATCH 5/9] tree: Declare empty argument lists MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Declare functions w/o arguments explicitly to satisfy my personal aesthetics. A function declaration `void foo()` has as unspecified argument list while a function declared as `void foo(void)` has explicitly no arguments. On x86_64 this makes a slight difference on how this functions needs to be called. Because a function with an unspecified argument list might be a variadic function, the ABI requires that the caller sets %al to the number of floating point registers which might have been used to pass in floating point parameters: "For calls that may call functions that use varargs or stdargs (prototype-less calls or calls to functions containing ellipsis (...) in the declaration) %al is used as hidden argument to specify the number of vector registers used. The contents of %al do not need to match exactly the number of registers, but must be an upper bound on the number of vector registers used and is in the range 0–8 inclusive" [1] If the callee is a variadic function, the usual routine in the function prologue is to save the registers, which might have been used to pass parameters, to memory so that they can be access one-by-one in the function body via pointers or indices. To give the callee an uppper bound of the number of floating point registers used to pass parameters opens an opportunity to avoid saving unused registers. The call of foo in void foo(); void bar(void) { foo(); } might compile to something like movl $0, %eax call foo The `movl $0, %eax` instruction would not be needed if foo was defined as `void foo(void)`. Note 1: gcc avoids the unneeded instructions to set up %al with -O1 or higher O1` if the function is known to be non-variadic because it is defined in the compilation uniti where it is used. Note 2: gcc function prologue for variadic functions just checks whether %al is zero or not and either saves none or all 8 floating point registers potentially used for arguments. Note 3: An instruction to clear %al (e.g. `31 c0 : xor %eax, %eax`) might have zero runtime cost on a modern processors. This change here is not a relevant optimization. I said, this is more about aesthetics. [1]: https://refspecs.linuxfoundation.org/elf/x86_64-abi-0.99.pdf --- mx_util.c | 2 +- mx_util.h | 2 +- mxqd.c | 2 +- ppidcache.c | 2 +- ppidcache.h | 2 +- test_mx_util.c | 8 ++++---- xmalloc.h | 2 +- 7 files changed, 10 insertions(+), 10 deletions(-) diff --git a/mx_util.c b/mx_util.c index e8eef56e..8b675159 100644 --- a/mx_util.c +++ b/mx_util.c @@ -1306,7 +1306,7 @@ unsigned long mx_df(const char *path) { return s.f_bavail*s.f_frsize; } -time_t mx_clock_boottime() { +time_t mx_clock_boottime(void) { struct timespec ts; int res = clock_gettime(CLOCK_BOOTTIME, &ts); if (res != 0) { diff --git a/mx_util.h b/mx_util.h index 701eb850..e7971261 100644 --- a/mx_util.h +++ b/mx_util.h @@ -179,7 +179,7 @@ void _mx_sort_linked_list(void **list, int (*cmp)(void *o1,void *o2), void ** (* #define mx_sort_linked_list(list,cmp,getnextptr) _mx_sort_linked_list((void **)(list),(int (*)(void *,void *))(cmp),(void ** (*)(void *))(getnextptr)) unsigned long mx_df(const char *path); -time_t mx_clock_boottime(); +time_t mx_clock_boottime(void); char *mx_call_external(char *args, ...); #endif diff --git a/mxqd.c b/mxqd.c index f9ac629f..32be11c0 100644 --- a/mxqd.c +++ b/mxqd.c @@ -334,7 +334,7 @@ static void read_hostconfig_retry(struct keywordset *kws) { static char gpu_setup_script[] = LIBEXECDIR "/mxq/gpu-setup"; -static int get_gpus() { +static int get_gpus(void) { char *line = mx_call_external(gpu_setup_script, "init", NULL); if (!line) { mx_log_err("gpu-setup init: %m"); diff --git a/ppidcache.c b/ppidcache.c index 11d95204..35c1de3d 100644 --- a/ppidcache.c +++ b/ppidcache.c @@ -16,7 +16,7 @@ struct ppidcache { struct entry *entries; }; -struct ppidcache *ppidcache_new() { +struct ppidcache *ppidcache_new(void) { struct ppidcache *ppidcache = mx_malloc_forever(sizeof(struct ppidcache)); ppidcache->count = 0; ppidcache->alloc = 500; diff --git a/ppidcache.h b/ppidcache.h index ca4a7e37..99e4bb22 100644 --- a/ppidcache.h +++ b/ppidcache.h @@ -3,7 +3,7 @@ #include -struct ppidcache *ppidcache_new(); +struct ppidcache *ppidcache_new(void); void ppidcache_free (struct ppidcache *ppidcache); pid_t ppidcache_get_ppid(struct ppidcache *ppidcache, pid_t pid); int ppidcache_is_descendant(struct ppidcache *ppidcache, pid_t ancestor, pid_t candidate); diff --git a/test_mx_util.c b/test_mx_util.c index 028a9ebb..6f473093 100644 --- a/test_mx_util.c +++ b/test_mx_util.c @@ -347,7 +347,7 @@ static void test_mx_strscan(void) mx_proc_pid_stat_free_content(pps); } -static void test_mx_strvec() { +static void test_mx_strvec(void) { char **strvec; char *str; @@ -395,7 +395,7 @@ static void test_mx_strvec() { free(strvec); } -static void test_mx_strcat() { +static void test_mx_strcat(void) { char *str; char *str2; @@ -517,11 +517,11 @@ static void test_listsort(void) assert(o[9].next==NULL); } -static void test_mx_df() { +static void test_mx_df(void) { assert(mx_df("/") > 0); } -static void test_mx_call_external() { +static void test_mx_call_external(void) { char *line; errno = 999; diff --git a/xmalloc.h b/xmalloc.h index 23705bdd..0e90e1e8 100644 --- a/xmalloc.h +++ b/xmalloc.h @@ -4,7 +4,7 @@ #include #include -__attribute__ ((noreturn, unused)) static void out_of_memory() { +__attribute__ ((noreturn, unused)) static void out_of_memory(void) { fprintf(stderr,"out of memory\n"); abort(); } From f03d998412018318f66d226a561e205f14185129 Mon Sep 17 00:00:00 2001 From: Donald Buczek Date: Sun, 6 Mar 2022 13:59:24 +0100 Subject: [PATCH 6/9] gpu-setup: Remove unused variable --- helper/gpu-setup | 1 - 1 file changed, 1 deletion(-) diff --git a/helper/gpu-setup b/helper/gpu-setup index da19fd76..937d9cf0 100755 --- a/helper/gpu-setup +++ b/helper/gpu-setup @@ -63,7 +63,6 @@ policy_a100_split_7() { umask 022 mkdir -p /dev/nvidia-caps for i in $(seq 0 6);do - instance_id=$(($i+7)) nvidia-smi mig --id $gpu_uuid --create-gpu-instance 1g.5gb:$i --default-compute-instance done From c003bf5d800d7ec337ce59567b0820aa83590d07 Mon Sep 17 00:00:00 2001 From: Donald Buczek Date: Sun, 6 Mar 2022 14:00:05 +0100 Subject: [PATCH 7/9] gpu_setup: Allow unpredictable gpu idents --- helper/gpu-setup | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/helper/gpu-setup b/helper/gpu-setup index 937d9cf0..72be537a 100755 --- a/helper/gpu-setup +++ b/helper/gpu-setup @@ -27,20 +27,17 @@ policy_a100_split_7() { # # The ID of the 1g.5gb profile is 19 (see `nvidia-smi mig -lgip`). # The placements for profile 19 are 0..6 (see `nvidia-smi mig -lgipp`). - # We know that these placements create GPU instance IDs from 7 to 13 from - # the output of the `nvidia-smi mig --create-gpu-instance`commands. - # We know how the instance IDs map to minor device numbers from /proc/driver/nvidia-caps/mig-minors - # Note, that the minor numbers are only valid for gpu0, this is why this code is - # only suited for machines with a single GPU. # - # PROFILE PLACEMENT GPU-INSTANCE-ID MINOR MINOR-COMPUTE-0 - # 1g.5gb 0 7 66 67 - # 1g.5gb 1 8 75 76 - # 1g.5gb 2 9 84 85 - # 1g.5gb 3 10 93 94 - # 1g.5gb 4 11 102 103 - # 1g.5gb 5 12 111 112 - # 1g.5gb 6 13 120 121 + # Strangely, the gpu instance idents created seem to vary from hardware to hardware. + # With `for p in $(seq 0 6); do nvidia-smi mig -cgi 1g.5gb:$p;done` we've seen these gpu instances created: + # + # theredqueen: 7, 8 , 9, 10, 11, 12, 13 + # whiterabbit: 7, 8 , 9, 10, 11, 12, 13 + # bandersnatch: 7, 8 , 9, 10, 11, 12, 13 + # jabberwocky: 11, 12, 13, 14, 7, 8 , 9 + # + # So we walk through /proc/driver/nvidia/capabilities/gpu0/mig/gi* after we've + # created the gpu instances to find the gpu idents. # Note: Nvidia regards persistence-mode as legacy.... @@ -70,8 +67,10 @@ policy_a100_split_7() { # for the gpu instances and the compute instances in /dev/nvidia-caps nvidia-smi --list-gpus - for i in $(seq 0 6);do - instance_id=$(($i+7)) + i=0 + for name in $(ls /proc/driver/nvidia/capabilities/gpu0/mig/|sort -k1.3n); do + [[ $name =~ ^gi([0-9]+)$ ]] || continue + instance_id=${BASH_REMATCH[1]} caps_minor=$(grep "^gpu0/gi$instance_id/access " /proc/driver/nvidia-caps/mig-minors|cut -f 2 -d ' ') caps_minor_compute=$(grep "^gpu0/gi$instance_id/ci0/access " /proc/driver/nvidia-caps/mig-minors|cut -f 2 -d ' ') d=$(printf "/dev/shm/mxqd/gpu_devs/%03d" $i) @@ -83,9 +82,10 @@ policy_a100_split_7() { chown root:root $f chmod go= $f done + i=$((i+1)) done - echo 7 >&20 + echo $i >&20 } policy_phys_gpus() { From cdecd0211ede60fb3ec001bb1345c0921d793a73 Mon Sep 17 00:00:00 2001 From: Donald Buczek Date: Sun, 6 Mar 2022 14:23:59 +0100 Subject: [PATCH 8/9] gpu-setup: Refactor Before we add more A100 policies, factor out common code into two separate shell functions. --- helper/gpu-setup | 65 ++++++++++++++++++++++++++++-------------------- 1 file changed, 38 insertions(+), 27 deletions(-) diff --git a/helper/gpu-setup b/helper/gpu-setup index 72be537a..d36b5c88 100755 --- a/helper/gpu-setup +++ b/helper/gpu-setup @@ -20,34 +20,12 @@ die() { exit 1 } -policy_a100_split_7() { - - # This policy splits a single A100 with 40 GB into 7 1g.5gb gpu instances each with a - # single default compute instance. - # - # The ID of the 1g.5gb profile is 19 (see `nvidia-smi mig -lgip`). - # The placements for profile 19 are 0..6 (see `nvidia-smi mig -lgipp`). - # - # Strangely, the gpu instance idents created seem to vary from hardware to hardware. - # With `for p in $(seq 0 6); do nvidia-smi mig -cgi 1g.5gb:$p;done` we've seen these gpu instances created: - # - # theredqueen: 7, 8 , 9, 10, 11, 12, 13 - # whiterabbit: 7, 8 , 9, 10, 11, 12, 13 - # bandersnatch: 7, 8 , 9, 10, 11, 12, 13 - # jabberwocky: 11, 12, 13, 14, 7, 8 , 9 - # - # So we walk through /proc/driver/nvidia/capabilities/gpu0/mig/gi* after we've - # created the gpu instances to find the gpu idents. +common_setup_a100() { + gpu_uuid=$1 # Note: Nvidia regards persistence-mode as legacy.... - nvidia-smi --persistence-mode=1 - gpu_uuid=$(nvidia-smi --query-gpu=uuid --format=csv,noheader,nounits) - caps_major=$(grep ' nvidia-caps$' /proc/devices|cut -f 1 -d' ') - - test "$gpu_uuid" || die "GPU not found!" - nvidia-smi mig --destroy-compute-instance --id $gpu_uuid >/dev/null || true nvidia-smi mig --destroy-gpu-instance --id $gpu_uuid >/dev/null || true @@ -59,9 +37,11 @@ policy_a100_split_7() { umask 022 mkdir -p /dev/nvidia-caps - for i in $(seq 0 6);do - nvidia-smi mig --id $gpu_uuid --create-gpu-instance 1g.5gb:$i --default-compute-instance - done +} + +common_setup_a100_complete() { + gpu_uuid=$1 + caps_major=$(grep ' nvidia-caps$' /proc/devices|cut -f 1 -d' ') # calling `nvidia-smi --list-gpus` as root has the side effect of creating the cap files # for the gpu instances and the compute instances in /dev/nvidia-caps @@ -88,6 +68,37 @@ policy_a100_split_7() { echo $i >&20 } +policy_a100_split_7() { + + # This policy splits a single A100 with 40 GB into 7 1g.5gb gpu instances each with a + # single default compute instance. + # + # The ID of the 1g.5gb profile is 19 (see `nvidia-smi mig -lgip`). + # The placements for profile 19 are 0..6 (see `nvidia-smi mig -lgipp`). + # + # Strangely, the gpu instance idents created seem to vary from hardware to hardware. + # With `for p in $(seq 0 6); do nvidia-smi mig -cgi 1g.5gb:$p;done` we've seen these gpu instances created: + # + # theredqueen: 7, 8 , 9, 10, 11, 12, 13 + # whiterabbit: 7, 8 , 9, 10, 11, 12, 13 + # bandersnatch: 7, 8 , 9, 10, 11, 12, 13 + # jabberwocky: 11, 12, 13, 14, 7, 8 , 9 + # + # So we walk through /proc/driver/nvidia/capabilities/gpu0/mig/gi* after we've + # created the gpu instances to find the gpu idents. + + gpu_uuid=$(nvidia-smi --query-gpu=uuid --format=csv,noheader,nounits) + test "$gpu_uuid" || die "GPU not found!" + + common_setup_a100 $gpu_uuid + + for i in $(seq 0 6);do + nvidia-smi mig --id $gpu_uuid --create-gpu-instance 1g.5gb:$i --default-compute-instance + done + + common_setup_a100_complete $gpu_uuid +} + policy_phys_gpus() { # This policy just enumrate the physical GPUs and reserved them From ebaeac6e6a5e69afbde06af80ed969c59fb65639 Mon Sep 17 00:00:00 2001 From: Donald Buczek Date: Sun, 6 Mar 2022 14:31:34 +0100 Subject: [PATCH 9/9] gpu-setup: Add policy a100_split_3 Add policy to create thee 2g.10gb instances on a A100. --- helper/gpu-setup | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/helper/gpu-setup b/helper/gpu-setup index d36b5c88..d1ef1881 100755 --- a/helper/gpu-setup +++ b/helper/gpu-setup @@ -99,6 +99,26 @@ policy_a100_split_7() { common_setup_a100_complete $gpu_uuid } +policy_a100_split_3() { + + # This policy splits a single A100 with 40 GB into 3 2g.10gb gpu instances each with a + # single default compute instance. + # + # The ID of the 2g.10gb profile is 14 (see `nvidia-smi mig -lgip`). + # The placements for profile 14 are 0,2,4 (see `nvidia-smi mig -lgipp`). + + gpu_uuid=$(nvidia-smi --query-gpu=uuid --format=csv,noheader,nounits) + test "$gpu_uuid" || die "GPU not found!" + + common_setup_a100 $gpu_uuid + + for i in $(seq 0 2 4);do + nvidia-smi mig --id $gpu_uuid --create-gpu-instance 2g.10gb:$i --default-compute-instance + done + + common_setup_a100_complete $gpu_uuid +} + policy_phys_gpus() { # This policy just enumrate the physical GPUs and reserved them @@ -149,6 +169,9 @@ init() { a100-split-7) policy_a100_split_7 ;; + a100-split-3) + policy_a100_split_3 + ;; phys-gpus) policy_phys_gpus ;;