From 009304f03b38d57b2c26c24e53b4f9c86121c738 Mon Sep 17 00:00:00 2001 From: Donald Buczek Date: Sat, 30 Dec 2023 13:27:46 +0100 Subject: [PATCH] gpu-setup: Don't unlock too early during release Currently, the gpu lock file `pid` is released (removed) too early, so that there is a small race condition with a new GPU allocation: ``` MXQ job1 job2 * fork job1 * other initialization * reserve gpu: * * find slot without pid * * change access to UID * run user program * exit * fork job2 * other initialization * cleanup job 1: * * rm .../pid * reserve gpu: * * find slot without pid * * change access to UID * * change access to root ``` On release, keep the `pid` file until after the access mode has been changed back to root. --- helper/gpu-setup | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/helper/gpu-setup b/helper/gpu-setup index e06d5b11..a0413fb3 100755 --- a/helper/gpu-setup +++ b/helper/gpu-setup @@ -274,7 +274,6 @@ job_release() { test_pid="$(cat $d/pid 2>/dev/null)" if [ "$pid" = "$test_pid" ]; then echo "XXX $$ job_release $pid: found my pid in $d, releasing" >&2 - rm $d/pid for f in $(cat $d/access-files); do case $f in /dev/nvidia-caps/nvidia-cap*) @@ -287,6 +286,7 @@ job_release() { ;; esac done + rm $d/pid exit 0 fi fi