new srun randomizer
This commit is contained in:
46
anduril/srun-random
Executable file
46
anduril/srun-random
Executable file
@@ -0,0 +1,46 @@
|
||||
#!/bin/bash
|
||||
|
||||
# Populate the global NODELIST array with the candidate nodes.
# Globals:
#   ANDURIL_NODELIST (read)  - optional whitespace-separated node names
#   NODELIST         (write) - resulting array of node names
# If ANDURIL_NODELIST is unset, empty, or contains only whitespace, the
# built-in default list is used, so NODELIST is never left empty.
load_nodelist() {
  NODELIST=()
  if [[ -n "${ANDURIL_NODELIST:-}" ]]; then
    # -r keeps backslashes literal; -a splits the line on IFS into the array.
    read -r -a NODELIST <<< "$ANDURIL_NODELIST"
  fi
  if (( ${#NODELIST[@]} == 0 )); then
    NODELIST=( vm3 vm4 vm5 vm6 vm7 vm8 vm9 )
  fi
}
load_nodelist
|
||||
|
||||
# Print the usage text, including the node list currently in effect.
# Globals: NODELIST (read)
# Outputs: help text on stdout
print_help() {
  # printf with %s keeps node names literal: no globbing, no word splitting
  # and no backslash interpretation of the list contents.
  printf '\n'
  printf '%s\n' 'This tool picks a random node with free CPUs for a slurm run'
  printf '%s\n' 'export ANDURIL_NODELIST="vm3 vm4 vm5"'
  printf '%s\n' '^ to have your own preferred list of nodes'
  printf 'Current nodelist: "%s"( %s nodes)\n' "${NODELIST[*]}" "${#NODELIST[@]}"
}

# "-h" as the first argument shows the help and stops before any node probing.
if [[ "${1:-}" == "-h" ]]; then
  print_help
  exit 0
fi
|
||||
|
||||
# Choose a node for the job: repeatedly draw a random entry of NODELIST and
# accept the first one that is not reported as State=DOWN and still has
# unallocated CPUs (CPUTot - CPUAlloc > 0). After every fourth failed probe
# the loop pauses for a random 0-9 seconds before probing again.
# Reads:  NODELIST
# Writes: INDEX (index of the chosen node), NODERUNS (its free CPU count),
#         TRY, NODEINFO, CPUALLOC, WAIT
# All progress messages go to stderr so the job's own stdout stays clean.
if (( ${#NODELIST[@]} == 0 )); then
  # Without this guard the modulo below would divide by zero.
  echo "srun: node list is empty" >&2
  exit 1
fi

TRY=0
while true; do
  INDEX=$(( RANDOM % ${#NODELIST[@]} ))

  # Query the node once and reuse the output for both checks below.
  NODEINFO=$(scontrol -o show node "${NODELIST[INDEX]}")

  # Free CPUs = CPUTot - CPUAlloc. If either field is missing (for example
  # because the query failed) the node counts as having nothing free.
  NODERUNS=0
  if [[ "$NODEINFO" =~ CPUAlloc=([0-9]+) ]]; then
    CPUALLOC=${BASH_REMATCH[1]}
    if [[ "$NODEINFO" =~ CPUTot=([0-9]+) ]]; then
      NODERUNS=$(( BASH_REMATCH[1] - CPUALLOC ))
    fi
  fi

  # A node whose state starts with DOWN is never eligible.
  if [[ "$NODEINFO" == *State=DOWN* ]]; then
    NODERUNS=-1
  fi

  echo "srun: Node ${NODELIST[INDEX]}, Free sockets: $NODERUNS" >&2

  if (( NODERUNS > 0 )); then
    break
  fi

  TRY=$(( TRY + 1 ))
  echo "try again $TRY" >&2
  if (( TRY > 3 )); then
    TRY=0
    # Deliberately not named SECONDS: that is bash's built-in elapsed-time
    # counter and assigning to it resets the shell's timer.
    WAIT=$(( RANDOM % 10 ))
    echo "waiting for free sockets for $WAIT s." >&2
    sleep "$WAIT"
  fi
done
|
||||
|
||||
|
||||
# Launch the job on the selected node. All arguments given to this wrapper
# are passed through to srun unchanged; the node name is quoted so it is
# never word-split or glob-expanded.
srun --nodelist="${NODELIST[INDEX]}" "$@"
|
||||
|
||||
|
||||
Reference in New Issue
Block a user