Skip to content

Commit

Permalink
gpu_setup: Allow unpredictable gpu idents
Browse files Browse the repository at this point in the history
  • Loading branch information
donald committed Mar 6, 2022
1 parent f03d998 commit c003bf5
Showing 1 changed file with 16 additions and 16 deletions.
32 changes: 16 additions & 16 deletions helper/gpu-setup
Original file line number Diff line number Diff line change
Expand Up @@ -27,20 +27,17 @@ policy_a100_split_7() {
#
# The ID of the 1g.5gb profile is 19 (see `nvidia-smi mig -lgip`).
# The placements for profile 19 are 0..6 (see `nvidia-smi mig -lgipp`).
# We know that these placements create GPU instance IDs from 7 to 13 from
# the output of the `nvidia-smi mig --create-gpu-instance` commands.
# We know how the instance IDs map to minor device numbers from /proc/driver/nvidia-caps/mig-minors
# Note that the minor numbers are only valid for gpu0, which is why this code is
# only suited for machines with a single GPU.
#
# PROFILE PLACEMENT GPU-INSTANCE-ID MINOR MINOR-COMPUTE-0
# 1g.5gb 0 7 66 67
# 1g.5gb 1 8 75 76
# 1g.5gb 2 9 84 85
# 1g.5gb 3 10 93 94
# 1g.5gb 4 11 102 103
# 1g.5gb 5 12 111 112
# 1g.5gb 6 13 120 121
# Strangely, the gpu instance idents created seem to vary from hardware to hardware.
# With `for p in $(seq 0 6); do nvidia-smi mig -cgi 1g.5gb:$p;done` we've seen these gpu instances created:
#
# theredqueen: 7, 8, 9, 10, 11, 12, 13
# whiterabbit: 7, 8, 9, 10, 11, 12, 13
# bandersnatch: 7, 8, 9, 10, 11, 12, 13
# jabberwocky: 11, 12, 13, 14, 7, 8, 9
#
# So we walk through /proc/driver/nvidia/capabilities/gpu0/mig/gi* after we've
# created the gpu instances to find the gpu idents.

# Note: Nvidia regards persistence-mode as legacy....

Expand Down Expand Up @@ -70,8 +67,10 @@ policy_a100_split_7() {
# for the gpu instances and the compute instances in /dev/nvidia-caps
nvidia-smi --list-gpus

for i in $(seq 0 6);do
instance_id=$(($i+7))
i=0
for name in $(ls /proc/driver/nvidia/capabilities/gpu0/mig/|sort -k1.3n); do
[[ $name =~ ^gi([0-9]+)$ ]] || continue
instance_id=${BASH_REMATCH[1]}
caps_minor=$(grep "^gpu0/gi$instance_id/access " /proc/driver/nvidia-caps/mig-minors|cut -f 2 -d ' ')
caps_minor_compute=$(grep "^gpu0/gi$instance_id/ci0/access " /proc/driver/nvidia-caps/mig-minors|cut -f 2 -d ' ')
d=$(printf "/dev/shm/mxqd/gpu_devs/%03d" $i)
Expand All @@ -83,9 +82,10 @@ policy_a100_split_7() {
chown root:root $f
chmod go= $f
done
i=$((i+1))
done

echo 7 >&20
echo $i >&20
}

policy_phys_gpus() {
Expand Down

0 comments on commit c003bf5

Please sign in to comment.