#!/bin/bash
#
# Selects a random node for a slurm run and submits the given command to it.
# Intended for use with anduril: --exec-mode prefix --prefix <this script>
#
# Usage:
#   <this script> COMMAND [ARGS...]
#   <this script> -h
#
# Environment:
#   ANDURIL_NODELIST  optional space-separated list of node names that
#                     replaces the built-in list used for randomization.

# A command to run is mandatory; without one there is nothing to submit.
if [[ -z "$1" ]]; then
  echo provide the script to run >&2
  exit 1
fi

# Fill NODELIST with the candidate node names.
# Globals:
#   ANDURIL_NODELIST (read)  - space-separated node names; optional
#   NODELIST (written)       - array of candidate node names
# The built-in list is used when ANDURIL_NODELIST is unset, empty, or holds
# only whitespace, so NODELIST is never left empty.
load_nodelist() {
  NODELIST=()
  if [[ -n "${ANDURIL_NODELIST}" ]]; then
    read -r -a NODELIST <<< "$ANDURIL_NODELIST"
  fi
  if (( ${#NODELIST[@]} == 0 )); then
    NODELIST=( vm3 vm4 vm5 vm6 vm7 vm8 vm9 )
  fi
}

load_nodelist

# Directory under which one sub-directory per submitted job is created.
JOBROOT="$HOME/.srun"

# Print the help text, including the node list currently in effect.
# Globals: NODELIST (read)
# Outputs: help text on stdout
print_help() {
  # Unquoted here-doc delimiter: the command substitution and the array
  # expansions are evaluated, the double quotes are printed literally.
  cat <<EOF

This tool selects a random node for slurm run.
Use it with anduril: --exec-mode prefix --prefix $( basename "$0" )
To change the list of nodes for randomization:
export ANDURIL_NODELIST="vm3 vm4 vm5"

Current nodelist: "${NODELIST[*]}"( ${#NODELIST[@]} nodes)
EOF
}

if [[ "$1" == "-h" ]]; then
  print_help
  exit
fi

# Pick a random node from NODELIST until one reports unallocated CPUs.
#
# Reads:  NODELIST  - candidate node names
# Writes: INDEX     - position in NODELIST of the node that was chosen
#         NODERUNS  - CPUTot minus CPUAlloc of the last probed node,
#                     -1 when its state contains State=DOWN
#         TRY       - failed probes since the last back-off pause
if (( ${#NODELIST[@]} == 0 )); then
  # RANDOM % 0 below would be a division by zero.
  echo "srun: node list is empty, cannot select a node" >&2
  exit 1
fi

# Kept in a variable so the regex is not subject to quoting inside [[ =~ ]].
cpu_re='CPUAlloc=([0-9]+).*CPUTot=([0-9]+)'
TRY=0
while :; do
  INDEX=$(( RANDOM % ${#NODELIST[@]} ))

  # Query the node once and derive both the CPU figures and the DOWN check
  # from the same snapshot.
  node_info=$( scontrol -o show node "${NODELIST[$INDEX]}" )

  if [[ "$node_info" =~ $cpu_re ]]; then
    # 10# forces base 10 so a zero-padded count is not read as octal.
    NODERUNS=$(( 10#${BASH_REMATCH[2]} - 10#${BASH_REMATCH[1]} ))
  else
    # No output, or output without the CPU fields: treat the node as having
    # no room instead of feeding arbitrary text into arithmetic expansion.
    NODERUNS=0
  fi
  if [[ "$node_info" == *State=DOWN* ]]; then
    NODERUNS=-1
  fi

  echo "srun: Node ${NODELIST[$INDEX]}, Free sockets: $NODERUNS" >&2

  if (( NODERUNS > 0 )); then
    break
  fi

  TRY=$(( TRY + 1 ))
  echo "try again $TRY"
  if (( TRY > 3 )); then
    TRY=0
    # Deliberately not named SECONDS: that is bash's own elapsed-time
    # counter and keeps ticking after being assigned.
    delay=$(( RANDOM % 10 ))
    echo "waiting for free sockets for $delay s."
    sleep "$delay"
  fi
done

# Create a fresh, uniquely named directory for this job under JOBROOT.
#
# Reads:  JOBROOT, the positional parameters (hashed into the job name)
# Writes: JOBNAME, JOBPATH, JOBFILE, STATFILE, STRMFILE, EXECPATH
if ! mkdir -p "$JOBROOT"; then
  echo "srun: cannot create job root $JOBROOT" >&2
  exit 1
fi

# The hash only depends on the arguments, so compute it once.
args_hash=$( printf '%s\n' "$*" | md5sum | cut -f1 -d" " )

while :; do
  JOBNAME=job_$( date +"%y%m%d_%H%M%S" )_${args_hash}
  JOBPATH="$JOBROOT/$JOBNAME"

  # Plain mkdir (no -p) fails if the directory already exists, so creating
  # it doubles as the uniqueness check and two concurrent runs with the same
  # arguments cannot end up sharing one directory.
  if mkdir "$JOBPATH" 2>/dev/null; then
    break
  fi
  if [[ ! -d "$JOBPATH" ]]; then
    # mkdir failed for a reason other than a name clash; retrying is futile.
    echo "srun: cannot create job directory $JOBPATH" >&2
    exit 1
  fi

  echo Jobpath "$JOBPATH" exists
  # The name has one-second resolution; wait for the timestamp to change
  # instead of spinning.
  sleep 1
done

JOBFILE="$JOBPATH/job"
STATFILE="$JOBPATH/statistics"
STRMFILE="$JOBPATH/stream"
# Not referenced elsewhere in the visible script; kept in case it is used
# by something not shown here.
EXECPATH=$( pwd )

# create the jobfile
printf '%s\n' '#!/bin/bash' > "$JOBFILE"
chmod 755 "$JOBFILE"

# Find _command file
# For every argument that ends in /_command, add a loop to the job file that
# waits for that file to exist, giving up after more than 15 one-second
# retries.
for arg in "$@"; do
  if [[ "$arg" == */_command ]]; then
    # %q shell-quotes the path so spaces or metacharacters in it survive
    # being re-parsed when the generated job file runs.
    printf -v quoted_cmd '%q' "$arg"
    {
      printf 'retrys=0\n'
      printf 'while [ ! -f %s ]\n' "$quoted_cmd"
      printf 'do\n'
      printf '  echo Waiting for _command file %s\n' "$quoted_cmd"
      printf '  retrys=$(( retrys + 1 ))\n'
      printf '  sleep 1\n'
      printf '  [ "$retrys" -gt 15 ] && break\n'
      printf 'done\n'
    } >> "$JOBFILE"
  fi
done

# Append the payload to the job file: record node, working directory and
# start time in STATFILE, run the command under /usr/bin/time with the
# captured stdin, record the stop time and exit with the command's status.
#
# Reads: JOBFILE, STATFILE, STRMFILE, the positional parameters

# Shell-quote everything that is spliced into the generated script, so that
# arguments containing spaces, quotes, $ or backticks reach the command
# unchanged instead of being re-interpreted when the job file runs.
printf -v stat_q '%q' "$STATFILE"
printf -v strm_q '%q' "$STRMFILE"
cmd_q=""
if (( $# > 0 )); then
  # Each argument is followed by one space, hence no separator before "<".
  printf -v cmd_q '%q ' "$@"
fi

#Catch the input stream (for R launcher)
cat - >> "$STRMFILE"

{
  # $HOSTNAME, $? and $EC are meant to be expanded by the job, not here.
  printf 'echo Node: $HOSTNAME >> %s\n' "$stat_q"
  printf 'pwd >> %s\n' "$stat_q"
  printf "date +'Start: %%s' >> %s\n" "$stat_q"
  printf '/usr/bin/time -o %s --append %s< %s\n' "$stat_q" "$cmd_q" "$strm_q"
  printf 'EC=$?\n'
  printf "date +'Stop: %%s' >> %s\n" "$stat_q"
  printf 'exit $EC\n'
} >> "$JOBFILE"

# clear out old job definitions (semirandomly)
# Runs in a background subshell. The .lastdel timestamp file limits the
# actual cleanup to invocations where the previous one is more than 30
# minutes old.
(
  stamp="$JOBROOT/.lastdel"
  [[ -e "$stamp" ]] || touch "$stamp"
  if [[ -n "$( find "$stamp" -mmin +30 )" ]]; then
    touch "$stamp"
    # Files directly inside a job directory, last modified over 20 days ago.
    find "$JOBROOT" -mindepth 2 -maxdepth 2 -type f -mtime +20 -delete 2>/dev/null
    # Then the directories left empty. Options precede tests, and
    # -mindepth 1 keeps JOBROOT itself out of the candidates.
    find "$JOBROOT" -mindepth 1 -depth -type d -empty -delete 2>/dev/null
  fi
) &

echo "The job file is in $JOBFILE"

# send the job
# Runs on the node at NODELIST[INDEX]. NOTE(review): slurm-cake is given the
# job file path twice; its argument convention is not visible here - confirm.
srun --nodelist="${NODELIST[$INDEX]}" slurm-cake "$JOBFILE" "$JOBFILE"
EC=$?

# Let any background children finish before exiting, then report the status
# of srun rather than that of wait.
wait
exit "$EC"