Permalink
Cannot retrieve contributors at this time
Name already in use
A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
mxq/helper/gpu-setup
Go to fileThis commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
executable file
333 lines (281 sloc)
9.42 KB
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#! /bin/bash | |
usage() {
    # Print command-line help to stderr and exit with failure.
    cat <<EOF >&2
usage: $0 cmd
    $0 show             # Show state
    $0 release          # Release all GPUs, restore public access. Don't do this while mxqd or a job is running
    $0 init             # Called by mxqd: Setup hardware as needed, send no. of GPUs to stdout
    $0 job-init PID UID # Called by mxqd: Find a free gpu for job and user, send UUID to stdout
    $0 job-release PID  # Called by mxqd: Release GPU of this job
EOF
    exit 1
}
die() {
    # Emit an error message on stderr and terminate with failure.
    printf '%s\n' "$*" >&2
    exit 1
}
common_setup_a100() {
    # Prepare a single A100 for MIG partitioning: enable persistence mode,
    # tear down any leftover MIG instances, enable multi-instance-gpu and
    # reset the card so the mode change takes effect.
    # $1 - UUID of the GPU to set up
    gpu_uuid=$1
    # Note: Nvidia regards persistence-mode as legacy....
    nvidia-smi --persistence-mode=1
    # Leftover instances may or may not exist; teardown failure is fine.
    nvidia-smi mig --destroy-compute-instance --id "$gpu_uuid" >/dev/null || true
    nvidia-smi mig --destroy-gpu-instance --id "$gpu_uuid" >/dev/null || true
    # this is stored on the card nvram. We set it anyway
    # switching requires card reset
    nvidia-smi --id="$gpu_uuid" --multi-instance-gpu=1
    nvidia-smi --id="$gpu_uuid" --gpu-reset
    # Cap files created under /dev/nvidia-caps must be world-readable by default
    umask 022
    mkdir -p /dev/nvidia-caps
}
common_setup_a100_complete() {
    # Finish MIG setup after the gpu instances have been created: register
    # each gpu instance as /dev/shm/mxqd/gpu_devs/NNN (containing its MIG
    # uuid and the list of cap files that control access to it) and lock
    # those cap files down to root. Reports the number of instances on
    # fd 20 (the original stdout saved by init).
    # $1 - UUID of the physical GPU
    gpu_uuid=$1
    # NOTE(review): caps_major appears unused below — verify before removing
    caps_major=$(grep ' nvidia-caps$' /proc/devices|cut -f 1 -d' ')
    # calling `nvidia-smi --list-gpus` as root has the side effect of creating the cap files
    # for the gpu instances and the compute instances in /dev/nvidia-caps
    nvidia-smi --list-gpus
    i=0
    # Walk the giNN entries in numeric order of NN (sort on the digits
    # starting at column 3).
    for name in $(ls /proc/driver/nvidia/capabilities/gpu0/mig/|sort -k1.3n); do
        [[ $name =~ ^gi([0-9]+)$ ]] || continue
        instance_id=${BASH_REMATCH[1]}
        # Minor numbers of the access cap files for the gpu instance and
        # its first compute instance, as published by the driver.
        caps_minor=$(grep "^gpu0/gi$instance_id/access " /proc/driver/nvidia-caps/mig-minors|cut -f 2 -d ' ')
        caps_minor_compute=$(grep "^gpu0/gi$instance_id/ci0/access " /proc/driver/nvidia-caps/mig-minors|cut -f 2 -d ' ')
        d=$(printf "/dev/shm/mxqd/gpu_devs/%03d" $i)
        mkdir -p $d
        echo MIG-$gpu_uuid/$instance_id > $d/uuid
        access_files="/dev/nvidia-caps/nvidia-cap$caps_minor /dev/nvidia-caps/nvidia-cap$caps_minor_compute"
        echo $access_files > $d/access-files
        # Revoke public access; job-init hands the files to the job's uid later.
        for f in $access_files; do
            chown root:root $f
            chmod go= $f
        done
        i=$((i+1))
    done
    # Device count goes to the saved original stdout (see init)
    echo $i >&20
}
policy_a100_split_7() {
    # Split a single 40 GB A100 into 7 1g.5gb gpu instances, each with a
    # single default compute instance.
    #
    # The 1g.5gb profile has ID 19 (see `nvidia-smi mig -lgip`) and its
    # placements are 0..6 (see `nvidia-smi mig -lgipp`).
    #
    # The gpu instance idents the driver assigns vary from hardware to
    # hardware; with `for p in $(seq 0 6); do nvidia-smi mig -cgi 1g.5gb:$p;done`
    # we've seen:
    #
    #   theredqueen:  7, 8 , 9, 10, 11, 12, 13
    #   whiterabbit:  7, 8 , 9, 10, 11, 12, 13
    #   bandersnatch: 7, 8 , 9, 10, 11, 12, 13
    #   jabberwocky:  11, 12, 13, 14, 7, 8 , 9
    #
    # So common_setup_a100_complete walks
    # /proc/driver/nvidia/capabilities/gpu0/mig/gi* after creation to find
    # the idents actually in use.
    gpu_uuid=$(nvidia-smi --query-gpu=uuid --format=csv,noheader,nounits)
    test "$gpu_uuid" || die "GPU not found!"
    common_setup_a100 "$gpu_uuid"
    for placement in {0..6}; do
        nvidia-smi mig --id "$gpu_uuid" --create-gpu-instance 1g.5gb:$placement --default-compute-instance
    done
    common_setup_a100_complete "$gpu_uuid"
}
policy_a100_split_3() {
    # Split a single 40 GB A100 into 3 2g.10gb gpu instances, each with a
    # single default compute instance.
    #
    # The 2g.10gb profile has ID 14 (see `nvidia-smi mig -lgip`) and its
    # placements are 0, 2 and 4 (see `nvidia-smi mig -lgipp`).
    gpu_uuid=$(nvidia-smi --query-gpu=uuid --format=csv,noheader,nounits)
    test "$gpu_uuid" || die "GPU not found!"
    common_setup_a100 "$gpu_uuid"
    for placement in 0 2 4; do
        nvidia-smi mig --id "$gpu_uuid" --create-gpu-instance 2g.10gb:$placement --default-compute-instance
    done
    common_setup_a100_complete "$gpu_uuid"
}
policy_a100_split_2() {
    # This policy splits a single A100 with 40 GB into 2 3g.20gb gpu instances each with a
    # single default compute instance.
    #
    # The ID of the 3g.20gb profile is 9 (see `nvidia-smi mig -lgip`).
    # The placements for profile 9 are 0,4 (see `nvidia-smi mig -lgipp`).
    gpu_uuid=$(nvidia-smi --query-gpu=uuid --format=csv,noheader,nounits)
    test "$gpu_uuid" || die "GPU not found!"
    common_setup_a100 $gpu_uuid
    for i in 0 4;do
        nvidia-smi mig --id $gpu_uuid --create-gpu-instance 3g.20gb:$i --default-compute-instance
    done
    common_setup_a100_complete $gpu_uuid
}
policy_phys_gpus() {
    # This policy just enumerates the physical GPUs and reserves them
    # all for mxqd: one /dev/shm/mxqd/gpu_devs/NNN entry per card, with
    # the device node chowned to root and public access revoked.
    # Reports the number of GPUs on fd 20 (the original stdout saved by init).
    #
    # Note: We assume, that we are getting the cards from nvidia-smi in the order
    # corresponding to the /dev/nvidiaNNN files enumeration.
    #
    # Note: Nvidia regards persistence-mode as legacy....
    nvidia-smi --persistence-mode=1
    # Create the device files
    nvidia-smi --list-gpus
    i=0
    for gpu_uuid in $(nvidia-smi --query-gpu=uuid --format=csv,noheader,nounits); do
        d=$(printf "/dev/shm/mxqd/gpu_devs/%03d" $i)
        mkdir -p $d
        echo $gpu_uuid > $d/uuid
        echo /dev/nvidia$i > $d/access-files
        # Revoke public access; job-init hands the node to the job's uid later.
        chown root:root /dev/nvidia$i
        chmod go= /dev/nvidia$i
        i=$((i+1))
    done
    # GPU count goes to the saved original stdout (see init)
    echo $i >&20
}
init() {
    # Called by mxqd at startup: partition/reserve the GPUs according to
    # this host's policy and print the number of usable GPU devices on
    # stdout. Idempotent — if the state directory already exists, just
    # report the current device count.
    set -e
    if [ -d /dev/shm/mxqd/gpu_devs/ ]; then
        # assumes the directory is non-empty when it exists; an empty
        # directory would make the unexpanded glob count as 1
        echo /dev/shm/mxqd/gpu_devs/??? | wc -w
        exit
    fi
    # The policy helper prints the policy name for this host
    policy="$(/etc/mxq/gpu-policy)"
    if [ -z "$policy" ]; then
        die "$0: no policy returned for $(hostname) from /etc/mxq/gpu-policy"
    fi
    # we can't trust nvidia-smi not to write to stdout: keep the real
    # stdout on fd 20 for the device count and divert everything else to stderr
    exec 20>&1
    exec >&2
    case "$policy" in
        a100-split-7)
            policy_a100_split_7
            ;;
        a100-split-3)
            policy_a100_split_3
            ;;
        a100-split-2)
            policy_a100_split_2
            ;;
        phys-gpus)
            policy_phys_gpus
            ;;
        *)
            die "$0: policy returned by /etc/mxq/gpu-policy not known"
            ;;
    esac
}
release() {
    # Return all GPUs to general use: restore world-accessible permissions
    # on the device/cap files and remove the mxqd state directory.
    # GPUs still held by a live job are skipped with a warning.
    # Don't call this while mxqd or a job is running.
    set -e
    # Consistent with job_init/show: skip the loop when nothing is registered
    shopt -s nullglob
    for d in /dev/shm/mxqd/gpu_devs/???; do
        if [ -e "$d/pid" ] && kill -0 "$(cat "$d/pid")" 2>/dev/null; then
            echo "WARNING: job $(cat "$d/pid") still active. GPU $(cat "$d/uuid") not returned"
        else
            if [ -e "$d/access-files" ]; then
                # access-files is a space-separated list; word-splitting intended
                for f in $(cat "$d/access-files"); do
                    case $f in
                        /dev/nvidia-caps/nvidia-cap*)
                            # MIG cap files: restore world-readable
                            chown root:root "$f"
                            chmod 0444 "$f"
                            ;;
                        /dev/nvidia*)
                            # physical device nodes: restore world read/write
                            chown root:root "$f"
                            chmod 0666 "$f"
                            ;;
                    esac
                done
            fi
            rm -rf "$d"
        fi
    done
    # Fails (silently) if a GPU was kept above — state dir must survive then
    rmdir /dev/shm/mxqd/gpu_devs 2>/dev/null || true
}
job_init() {
    # Called by mxqd when a job starts: atomically claim a free GPU for the
    # job, give the job's uid exclusive access to its device files and
    # print the GPU uuid on stdout. Dies if no GPU is free (mxqd's
    # bookkeeping should prevent that).
    # $1 - PID of the job
    # $2 - UID of the job's owner
    (( $# == 2 )) || usage
    pid=$1
    uid=$2
    test -d /dev/shm/mxqd/gpu_devs || die "$0: Not initialized (no dir /dev/shm/mxqd/gpu_devs)"
    shopt -s nullglob
    for d in /dev/shm/mxqd/gpu_devs/???; do
        # Atomic claim: under an exclusive flock on $d/pid, fail if the file
        # is non-empty (GPU already taken), otherwise write our pid into it.
        # pid and f are handed to flock's sub-shell via the environment.
        if pid=$pid f=$d/pid flock $d/pid -c 'test -s $f && exit 1; echo $pid>$f'; then
            # access-files is a space-separated list; word-splitting intended
            for f in $(cat $d/access-files); do
                case $f in
                    /dev/nvidia-caps/nvidia-cap*)
                        # MIG cap file: read access is all a job needs
                        chown $uid:root $f
                        chmod 0400 $f
                        ;;
                    /dev/nvidia*)
                        # physical device node: the job needs read/write
                        chown $uid:root $f
                        chmod 0600 $f
                        ;;
                esac
            done
            cat $d/uuid
            exit
        fi
    done
    die "$0: logic error: no free GPUs!"
}
job_release() {
    # Called by mxqd when a job ends: find the GPU locked by PID, mark it
    # free and return its device files to root. Files stay inaccessible to
    # non-root (0400/0600) — unlike release(), the GPU remains reserved
    # for mxqd until the next job-init or a full release.
    # $1 - PID of the job
    (( $# == 1 )) || usage
    pid=$1
    test -d /dev/shm/mxqd/gpu_devs || die "$0: Not initialized (no dir /dev/shm/mxqd/gpu_devs)"
    for d in /dev/shm/mxqd/gpu_devs/???; do
        if [ -e $d/pid ]; then
            test_pid="$(cat $d/pid 2>/dev/null)"
            if [ "$pid" = "$test_pid" ]; then
                # Removing the pid file marks the GPU as free again
                rm $d/pid
                # access-files is a space-separated list; word-splitting intended
                for f in $(cat $d/access-files); do
                    case $f in
                        /dev/nvidia-caps/nvidia-cap*)
                            chown root:root $f
                            chmod 0400 $f
                            ;;
                        /dev/nvidia*)
                            chown root:root $f
                            chmod 0600 $f
                            ;;
                    esac
                done
                exit 0
            fi
        fi
    done
    die "$0: job_release: job with $pid has no GPU locked"
}
show() {
    # Print one line per registered GPU device: index, owning job pid
    # (or "avail" if free), uuid, and the access files guarding it.
    if [ ! -d /dev/shm/mxqd/gpu_devs ]; then
        echo Not initialized
        exit
    fi
    devices=0
    shopt -s nullglob
    for d in /dev/shm/mxqd/gpu_devs/???; do
        job_id="avail"
        if [ -e $d/pid ]; then
            job_id=$(cat $d/pid)
        fi
        printf "%2d: %10s %s %s\n" $devices "$job_id" "$(cat $d/uuid)" "$(cat $d/access-files)"
        devices=$((devices+1))
    done
}
# Command dispatch: the first argument selects the operation, the
# remaining arguments are passed through to the handler.
(( $# )) || usage
command=$1
shift
case "$command" in
    show)        show ;;
    release)     release ;;
    init)        init ;;
    job-init)    job_init "$@" ;;
    job-release) job_release "$@" ;;
    *)           usage ;;
esac