#! /bin/bash
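# Manage GPU access for mxqd: set up the GPUs according to the host policy
# returned by /etc/mxq/gpu-policy (optionally splitting A100s into MIG
# instances), track per-GPU state under /dev/shm/mxqd/gpu_devs and hand
# individual GPUs to jobs by PID/UID via device file permissions.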
usage() {
    cat <<EOF >&2
usage: $0 cmd
$0 show                # Show state
$0 release             # Release all GPUs, restore public access. Don't do this while mxqd or a job is running
$0 init                # Called by mxqd: Set up hardware as needed, write the number of GPUs to stdout
$0 job-init PID UID    # Called by mxqd: Find a free GPU for the job and user, write its UUID to stdout
$0 job-release PID     # Called by mxqd: Release the GPU of this job
EOF
    exit 1
}
die() {
    echo "$@" >&2
    exit 1
}
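# common_setup_a100 GPU_UUID
# Prepare an A100 for MIG partitioning: enable persistence mode, tear down any
# existing compute/GPU instances, enable MIG mode (which requires a GPU reset)
# and make sure /dev/nvidia-caps exists.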
common_setup_a100() {
    gpu_uuid=$1
    # Note: Nvidia regards persistence-mode as legacy....
    nvidia-smi --persistence-mode=1
    nvidia-smi mig --destroy-compute-instance --id $gpu_uuid >/dev/null || true
    nvidia-smi mig --destroy-gpu-instance --id $gpu_uuid >/dev/null || true
    # The MIG setting is stored in the card's NVRAM. We set it anyway.
    # Switching requires a card reset.
    nvidia-smi --id=$gpu_uuid --multi-instance-gpu=1
    nvidia-smi --id=$gpu_uuid --gpu-reset
    umask 022
    mkdir -p /dev/nvidia-caps
}
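# common_setup_a100_complete GPU_UUID
# After the GPU instances have been created: walk the instances under
# /proc/driver/nvidia/capabilities/gpu0/mig/, record each instance's MIG UUID
# and cap device files in a numbered directory below /dev/shm/mxqd/gpu_devs,
# restrict those files to root and report the instance count on FD 20.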
common_setup_a100_complete() {
    gpu_uuid=$1
    caps_major=$(grep ' nvidia-caps$' /proc/devices | cut -f 1 -d' ')
    # calling `nvidia-smi --list-gpus` as root has the side effect of creating the cap files
    # for the gpu instances and the compute instances in /dev/nvidia-caps
    nvidia-smi --list-gpus
    i=0
    for name in $(ls /proc/driver/nvidia/capabilities/gpu0/mig/ | sort -k1.3n); do
        [[ $name =~ ^gi([0-9]+)$ ]] || continue
        instance_id=${BASH_REMATCH[1]}
        caps_minor=$(grep "^gpu0/gi$instance_id/access " /proc/driver/nvidia-caps/mig-minors | cut -f 2 -d ' ')
        caps_minor_compute=$(grep "^gpu0/gi$instance_id/ci0/access " /proc/driver/nvidia-caps/mig-minors | cut -f 2 -d ' ')
        d=$(printf "/dev/shm/mxqd/gpu_devs/%03d" $i)
        mkdir -p $d
        echo MIG-$gpu_uuid/$instance_id > $d/uuid
        access_files="/dev/nvidia-caps/nvidia-cap$caps_minor /dev/nvidia-caps/nvidia-cap$caps_minor_compute"
        echo $access_files > $d/access-files
        for f in $access_files; do
            chown root:root $f
            chmod go= $f
        done
        i=$((i+1))
    done
    echo $i >&20
}
policy_a100_split_7() {
    # This policy splits a single A100 with 40 GB into 7 1g.5gb gpu instances, each with a
    # single default compute instance.
    #
    # The ID of the 1g.5gb profile is 19 (see `nvidia-smi mig -lgip`).
    # The placements for profile 19 are 0..6 (see `nvidia-smi mig -lgipp`).
    #
    # Strangely, the gpu instance IDs created seem to vary from hardware to hardware.
    # With `for p in $(seq 0 6); do nvidia-smi mig -cgi 1g.5gb:$p; done` we've seen these gpu instances created:
    #
    # theredqueen:  7, 8, 9, 10, 11, 12, 13
    # whiterabbit:  7, 8, 9, 10, 11, 12, 13
    # bandersnatch: 7, 8, 9, 10, 11, 12, 13
    # jabberwocky:  11, 12, 13, 14, 7, 8, 9
    #
    # So we walk through /proc/driver/nvidia/capabilities/gpu0/mig/gi* after we've
    # created the gpu instances to find the instance IDs.
    gpu_uuid=$(nvidia-smi --query-gpu=uuid --format=csv,noheader,nounits)
    test "$gpu_uuid" || die "GPU not found!"
    common_setup_a100 $gpu_uuid
    for i in $(seq 0 6); do
        nvidia-smi mig --id $gpu_uuid --create-gpu-instance 1g.5gb:$i --default-compute-instance
    done
    common_setup_a100_complete $gpu_uuid
}
policy_a100_split_3() {
    # This policy splits a single A100 with 40 GB into 3 2g.10gb gpu instances, each with a
    # single default compute instance.
    #
    # The ID of the 2g.10gb profile is 14 (see `nvidia-smi mig -lgip`).
    # The placements for profile 14 are 0,2,4 (see `nvidia-smi mig -lgipp`).
    gpu_uuid=$(nvidia-smi --query-gpu=uuid --format=csv,noheader,nounits)
    test "$gpu_uuid" || die "GPU not found!"
    common_setup_a100 $gpu_uuid
    for i in $(seq 0 2 4); do
        nvidia-smi mig --id $gpu_uuid --create-gpu-instance 2g.10gb:$i --default-compute-instance
    done
    common_setup_a100_complete $gpu_uuid
}
policy_a100_split_2() {
    # This policy splits a single A100 with 40 GB into 2 3g.20gb gpu instances, each with a
    # single default compute instance.
    #
    # The ID of the 3g.20gb profile is 9 (see `nvidia-smi mig -lgip`).
    # The placements for profile 9 are 0,4 (see `nvidia-smi mig -lgipp`).
    gpu_uuid=$(nvidia-smi --query-gpu=uuid --format=csv,noheader,nounits)
    test "$gpu_uuid" || die "GPU not found!"
    common_setup_a100 $gpu_uuid
    for i in 0 4; do
        nvidia-smi mig --id $gpu_uuid --create-gpu-instance 3g.20gb:$i --default-compute-instance
    done
    common_setup_a100_complete $gpu_uuid
}
policy_phys_gpus() {
    # This policy just enumerates the physical GPUs and reserves them
    # all for mxqd.
    #
    # Note: We assume that we are getting the cards from nvidia-smi in the order
    # corresponding to the /dev/nvidiaNNN device file enumeration.
    #
    # Note: Nvidia regards persistence-mode as legacy....
    nvidia-smi --persistence-mode=1
    # Create the device files
    nvidia-smi --list-gpus
    i=0
    for gpu_uuid in $(nvidia-smi --query-gpu=uuid --format=csv,noheader,nounits); do
        d=$(printf "/dev/shm/mxqd/gpu_devs/%03d" $i)
        mkdir -p $d
        echo $gpu_uuid > $d/uuid
        echo /dev/nvidia$i > $d/access-files
        chown root:root /dev/nvidia$i
        chmod go= /dev/nvidia$i
        i=$((i+1))
    done
    echo $i >&20
}
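# init
# Called by mxqd at startup. Idempotent: if /dev/shm/mxqd/gpu_devs already
# exists, just report the number of device directories. Otherwise run the
# policy selected by /etc/mxq/gpu-policy for this host. The policies report
# the GPU count on FD 20 (a duplicate of the original stdout); everything else
# is redirected to stderr so stray nvidia-smi output can't corrupt the count.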
init() {
    set -e
    if [ -d /dev/shm/mxqd/gpu_devs/ ]; then
        echo /dev/shm/mxqd/gpu_devs/??? | wc -w
        exit
    fi
    policy="$(/etc/mxq/gpu-policy)"
    if [ -z "$policy" ]; then
        die "$0: no policy returned for $(hostname) from /etc/mxq/gpu-policy"
    fi
    # we can't trust nvidia-smi not to write to stdout
    exec 20>&1
    exec >&2
    case "$policy" in
        a100-split-7)
            policy_a100_split_7
            ;;
        a100-split-3)
            policy_a100_split_3
            ;;
        a100-split-2)
            policy_a100_split_2
            ;;
        phys-gpus)
            policy_phys_gpus
            ;;
        *)
            die "$0: policy returned by /etc/mxq/gpu-policy not known"
            ;;
    esac
}
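# release
# Return all GPUs to public access and remove the per-GPU state directories.
# Devices whose recorded job PID is still alive are left untouched (with a warning).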
release() {
    set -e
    for d in /dev/shm/mxqd/gpu_devs/???; do
        if [ -e $d/pid ] && kill -0 $(cat $d/pid) 2>/dev/null; then
            echo "WARNING: job $(cat $d/pid) still active. GPU $(cat $d/uuid) not returned"
        else
            if [ -e $d/access-files ]; then
                for f in $(cat $d/access-files); do
                    case $f in
                        /dev/nvidia-caps/nvidia-cap*)
                            chown root:root $f
                            chmod 0444 $f
                            ;;
                        /dev/nvidia*)
                            chown root:root $f
                            chmod 0666 $f
                            ;;
                    esac
                done
            fi
            rm -rf $d
        fi
    done
    rmdir /dev/shm/mxqd/gpu_devs 2>/dev/null || true
}
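# job-init PID UID
# Claim a free GPU for a job: atomically write the job PID into the pid file
# of the first unclaimed device directory (serialized via flock), make the
# device/cap files accessible to the job's UID only and print the GPU (or MIG)
# UUID on stdout.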
job_init() {
    (( $# == 2 )) || usage
    pid=$1
    uid=$2
    test -d /dev/shm/mxqd/gpu_devs || die "$0: Not initialized (no dir /dev/shm/mxqd/gpu_devs)"
    shopt -s nullglob
    for d in /dev/shm/mxqd/gpu_devs/???; do
        if pid=$pid f=$d/pid flock $d/pid -c 'test -s $f && exit 1; echo $pid>$f'; then
            for f in $(cat $d/access-files); do
                case $f in
                    /dev/nvidia-caps/nvidia-cap*)
                        chown $uid:root $f
                        chmod 0400 $f
                        ;;
                    /dev/nvidia*)
                        chown $uid:root $f
                        chmod 0600 $f
                        ;;
                esac
            done
            cat $d/uuid
            exit
        fi
    done
    die "$0: logic error: no free GPUs!"
}
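# job-release PID
# Release the GPU claimed by the job with the given PID: clear the pid file and
# return the device/cap files to root. Public access is only restored by `release`.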
job_release() {
    (( $# == 1 )) || usage
    pid=$1
    test -d /dev/shm/mxqd/gpu_devs || die "$0: Not initialized (no dir /dev/shm/mxqd/gpu_devs)"
    for d in /dev/shm/mxqd/gpu_devs/???; do
        if [ -e $d/pid ]; then
            test_pid="$(cat $d/pid 2>/dev/null)"
            if [ "$pid" = "$test_pid" ]; then
                rm $d/pid
                for f in $(cat $d/access-files); do
                    case $f in
                        /dev/nvidia-caps/nvidia-cap*)
                            chown root:root $f
                            chmod 0400 $f
                            ;;
                        /dev/nvidia*)
                            chown root:root $f
                            chmod 0600 $f
                            ;;
                    esac
                done
                exit 0
            fi
        fi
    done
    die "$0: job_release: job with pid $pid has no GPU locked"
}
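# show
# Print one line per managed GPU slot: index, owning job PID (or "avail"),
# UUID and the access files that are locked down for it.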
show() {
    if [ ! -d /dev/shm/mxqd/gpu_devs ]; then
        echo Not initialized
        exit
    fi
    devices=0
    shopt -s nullglob
    for d in /dev/shm/mxqd/gpu_devs/???; do
        uuid=$(cat $d/uuid)
        if [ -e $d/pid ]; then
            job_id=$(cat $d/pid)
        else
            job_id="avail"
        fi
        access_files=$(cat $d/access-files)
        printf "%2d: %10s %s %s\n" $devices $job_id $uuid "$access_files"
        devices=$((devices+1))
    done
}
(( $# > 0 )) || usage
cmd="$1"
shift
case "$cmd" in
    init)
        init
        ;;
    release)
        release
        ;;
    job-init)
        job_init "$@"
        ;;
    job-release)
        job_release "$@"
        ;;
    show)
        show
        ;;
    *)
        usage
        ;;
esac