diff --git a/helper/gpu-setup b/helper/gpu-setup index 72be537a..d36b5c88 100755 --- a/helper/gpu-setup +++ b/helper/gpu-setup @@ -20,34 +20,12 @@ die() { exit 1 } -policy_a100_split_7() { - - # This policy splits a single A100 with 40 GB into 7 1g.5gb gpu instances each with a - # single default compute instance. - # - # The ID of the 1g.5gb profile is 19 (see `nvidia-smi mig -lgip`). - # The placements for profile 19 are 0..6 (see `nvidia-smi mig -lgipp`). - # - # Strangely, the gpu instance idents created seem to vary from hardware to hardware. - # With `for p in $(seq 0 6); do nvidia-smi mig -cgi 1g.5gb:$p;done` we've seen these gpu instances created: - # - # theredqueen: 7, 8 , 9, 10, 11, 12, 13 - # whiterabbit: 7, 8 , 9, 10, 11, 12, 13 - # bandersnatch: 7, 8 , 9, 10, 11, 12, 13 - # jabberwocky: 11, 12, 13, 14, 7, 8 , 9 - # - # So we walk through /proc/driver/nvidia/capabilities/gpu0/mig/gi* after we've - # created the gpu instances to find the gpu idents. +common_setup_a100() { + gpu_uuid=$1 # Note: Nvidia regards persistence-mode as legacy.... - nvidia-smi --persistence-mode=1 - gpu_uuid=$(nvidia-smi --query-gpu=uuid --format=csv,noheader,nounits) - caps_major=$(grep ' nvidia-caps$' /proc/devices|cut -f 1 -d' ') - - test "$gpu_uuid" || die "GPU not found!" - nvidia-smi mig --destroy-compute-instance --id $gpu_uuid >/dev/null || true nvidia-smi mig --destroy-gpu-instance --id $gpu_uuid >/dev/null || true @@ -59,9 +37,11 @@ policy_a100_split_7() { umask 022 mkdir -p /dev/nvidia-caps - for i in $(seq 0 6);do - nvidia-smi mig --id $gpu_uuid --create-gpu-instance 1g.5gb:$i --default-compute-instance - done +} + +common_setup_a100_complete() { + gpu_uuid=$1 + caps_major=$(grep ' nvidia-caps$' /proc/devices|cut -f 1 -d' ') # calling `nvidia-smi --list-gpus` as root has the side effect of creating the cap files # for the gpu instances and the compute instances in /dev/nvidia-caps @@ -88,6 +68,37 @@ policy_a100_split_7() { echo $i >&20 } +policy_a100_split_7() { + + # This policy splits a single A100 with 40 GB into 7 1g.5gb gpu instances each with a + # single default compute instance. + # + # The ID of the 1g.5gb profile is 19 (see `nvidia-smi mig -lgip`). + # The placements for profile 19 are 0..6 (see `nvidia-smi mig -lgipp`). + # + # Strangely, the gpu instance idents created seem to vary from hardware to hardware. + # With `for p in $(seq 0 6); do nvidia-smi mig -cgi 1g.5gb:$p;done` we've seen these gpu instances created: + # + # theredqueen: 7, 8 , 9, 10, 11, 12, 13 + # whiterabbit: 7, 8 , 9, 10, 11, 12, 13 + # bandersnatch: 7, 8 , 9, 10, 11, 12, 13 + # jabberwocky: 11, 12, 13, 14, 7, 8 , 9 + # + # So we walk through /proc/driver/nvidia/capabilities/gpu0/mig/gi* after we've + # created the gpu instances to find the gpu idents. + + gpu_uuid=$(nvidia-smi --query-gpu=uuid --format=csv,noheader,nounits) + test "$gpu_uuid" || die "GPU not found!" + + common_setup_a100 $gpu_uuid + + for i in $(seq 0 6);do + nvidia-smi mig --id $gpu_uuid --create-gpu-instance 1g.5gb:$i --default-compute-instance + done + + common_setup_a100_complete $gpu_uuid +} + policy_phys_gpus() { # This policy just enumrate the physical GPUs and reserved them