Skip to content

Commit

Permalink
gpu-setup: Refactor
Browse files Browse the repository at this point in the history
Before we add more A100 policies, factor out common code into two
separate shell functions.
  • Loading branch information
donald committed Mar 6, 2022
1 parent c003bf5 commit cdecd02
Showing 1 changed file with 38 additions and 27 deletions.
65 changes: 38 additions & 27 deletions helper/gpu-setup
Original file line number Diff line number Diff line change
Expand Up @@ -20,34 +20,12 @@ die() {
exit 1
}

policy_a100_split_7() {

# This policy splits a single A100 with 40 GB into 7 1g.5gb gpu instances each with a
# single default compute instance.
#
# The ID of the 1g.5gb profile is 19 (see `nvidia-smi mig -lgip`).
# The placements for profile 19 are 0..6 (see `nvidia-smi mig -lgipp`).
#
# Strangely, the gpu instance idents created seem to vary from hardware to hardware.
# With `for p in $(seq 0 6); do nvidia-smi mig -cgi 1g.5gb:$p;done` we've seen these gpu instances created:
#
# theredqueen: 7, 8 , 9, 10, 11, 12, 13
# whiterabbit: 7, 8 , 9, 10, 11, 12, 13
# bandersnatch: 7, 8 , 9, 10, 11, 12, 13
# jabberwocky: 11, 12, 13, 14, 7, 8 , 9
#
# So we walk through /proc/driver/nvidia/capabilities/gpu0/mig/gi* after we've
# created the gpu instances to find the gpu idents.
common_setup_a100() {
gpu_uuid=$1

# Note: Nvidia regards persistence-mode as legacy....

nvidia-smi --persistence-mode=1

gpu_uuid=$(nvidia-smi --query-gpu=uuid --format=csv,noheader,nounits)
caps_major=$(grep ' nvidia-caps$' /proc/devices|cut -f 1 -d' ')

test "$gpu_uuid" || die "GPU not found!"

nvidia-smi mig --destroy-compute-instance --id $gpu_uuid >/dev/null || true
nvidia-smi mig --destroy-gpu-instance --id $gpu_uuid >/dev/null || true

Expand All @@ -59,9 +37,11 @@ policy_a100_split_7() {

umask 022
mkdir -p /dev/nvidia-caps
for i in $(seq 0 6);do
nvidia-smi mig --id $gpu_uuid --create-gpu-instance 1g.5gb:$i --default-compute-instance
done
}

common_setup_a100_complete() {
gpu_uuid=$1
caps_major=$(grep ' nvidia-caps$' /proc/devices|cut -f 1 -d' ')

# calling `nvidia-smi --list-gpus` as root has the side effect of creating the cap files
# for the gpu instances and the compute instances in /dev/nvidia-caps
Expand All @@ -88,6 +68,37 @@ policy_a100_split_7() {
echo $i >&20
}

policy_a100_split_7() {

    # This policy splits a single A100 with 40 GB into 7 1g.5gb gpu instances, each
    # with a single default compute instance.
    #
    # The ID of the 1g.5gb profile is 19 (see `nvidia-smi mig -lgip`).
    # The placements for profile 19 are 0..6 (see `nvidia-smi mig -lgipp`).
    #
    # Strangely, the gpu instance idents created seem to vary from hardware to hardware.
    # With `for p in $(seq 0 6); do nvidia-smi mig -cgi 1g.5gb:$p;done` we've seen
    # these gpu instances created:
    #
    #     theredqueen:   7,  8,  9, 10, 11, 12, 13
    #     whiterabbit:   7,  8,  9, 10, 11, 12, 13
    #     bandersnatch:  7,  8,  9, 10, 11, 12, 13
    #     jabberwocky:  11, 12, 13, 14,  7,  8,  9
    #
    # So we walk through /proc/driver/nvidia/capabilities/gpu0/mig/gi* after we've
    # created the gpu instances to find the gpu idents.
    #
    # Arguments: none.
    # Globals:   writes gpu_uuid and i. They are intentionally not declared local,
    #            because the original code set them globally as well.
    #            NOTE(review): confirm no other part of the script reads them
    #            before turning them into locals.
    # Exits:     via die() when nvidia-smi reports no GPU uuid.

    gpu_uuid=$(nvidia-smi --query-gpu=uuid --format=csv,noheader,nounits)
    test "$gpu_uuid" || die "GPU not found!"

    # Quote every expansion of the uuid so it always reaches the helpers and
    # nvidia-smi as exactly one argument (no word splitting, no globbing).
    common_setup_a100 "$gpu_uuid"

    # One gpu instance per placement 0..6, each with its default compute instance.
    for i in $(seq 0 6); do
        nvidia-smi mig --id "$gpu_uuid" --create-gpu-instance "1g.5gb:$i" --default-compute-instance
    done

    common_setup_a100_complete "$gpu_uuid"
}

policy_phys_gpus() {

# This policy just enumerates the physical GPUs and reserves them
Expand Down

0 comments on commit cdecd02

Please sign in to comment.