diff --git a/helper/gpu-setup b/helper/gpu-setup index 937d9cf0..72be537a 100755 --- a/helper/gpu-setup +++ b/helper/gpu-setup @@ -27,20 +27,17 @@ policy_a100_split_7() { # # The ID of the 1g.5gb profile is 19 (see `nvidia-smi mig -lgip`). # The placements for profile 19 are 0..6 (see `nvidia-smi mig -lgipp`). - # We know that these placements create GPU instance IDs from 7 to 13 from - # the output of the `nvidia-smi mig --create-gpu-instance`commands. - # We know how the instance IDs map to minor device numbers from /proc/driver/nvidia-caps/mig-minors - # Note, that the minor numbers are only valid for gpu0, this is why this code is - # only suited for machines with a single GPU. # - # PROFILE PLACEMENT GPU-INSTANCE-ID MINOR MINOR-COMPUTE-0 - # 1g.5gb 0 7 66 67 - # 1g.5gb 1 8 75 76 - # 1g.5gb 2 9 84 85 - # 1g.5gb 3 10 93 94 - # 1g.5gb 4 11 102 103 - # 1g.5gb 5 12 111 112 - # 1g.5gb 6 13 120 121 + # Strangely, the gpu instance idents created seem to vary from hardware to hardware. + # With `for p in $(seq 0 6); do nvidia-smi mig -cgi 1g.5gb:$p;done` we've seen these gpu instances created: + # + # theredqueen: 7, 8 , 9, 10, 11, 12, 13 + # whiterabbit: 7, 8 , 9, 10, 11, 12, 13 + # bandersnatch: 7, 8 , 9, 10, 11, 12, 13 + # jabberwocky: 11, 12, 13, 14, 7, 8 , 9 + # + # So we walk through /proc/driver/nvidia/capabilities/gpu0/mig/gi* after we've + # created the gpu instances to find the gpu idents. # Note: Nvidia regards persistence-mode as legacy.... @@ -70,8 +67,10 @@ policy_a100_split_7() { # for the gpu instances and the compute instances in /dev/nvidia-caps nvidia-smi --list-gpus - for i in $(seq 0 6);do - instance_id=$(($i+7)) + i=0 + for name in $(ls /proc/driver/nvidia/capabilities/gpu0/mig/|sort -k1.3n); do + [[ $name =~ ^gi([0-9]+)$ ]] || continue + instance_id=${BASH_REMATCH[1]} caps_minor=$(grep "^gpu0/gi$instance_id/access " /proc/driver/nvidia-caps/mig-minors|cut -f 2 -d ' ') caps_minor_compute=$(grep "^gpu0/gi$instance_id/ci0/access " /proc/driver/nvidia-caps/mig-minors|cut -f 2 -d ' ') d=$(printf "/dev/shm/mxqd/gpu_devs/%03d" $i) @@ -83,9 +82,10 @@ policy_a100_split_7() { chown root:root $f chmod go= $f done + i=$((i+1)) done - echo 7 >&20 + echo $i >&20 } policy_phys_gpus() {