#!/bin/bash
# Author: Aleksander, 2010-2011
#
# joblets - aggregate several mpiexec instances ("joblets") into one job.
#
# Usage:
#   NUM_JOBLETS=<n> NP_PER_JOBLET=<p> joblets <arguments passed to mpiexec>
#
# Required environment:
#   NUM_JOBLETS    number of joblets to start (at least 2)
#   NP_PER_JOBLET  number of MPI processes per joblet
#   PBS_NODEFILE   node file of the batch job (must describe a single node)
#   PBS_JOBID      batch job id; the part before the first '.' names outputs
# Optional environment:
#   PBS_ARRAYID    appended to the output-file id when set
#
# Exported for the joblets:
#   UNIQ_ID, I_MPI_PIN_DOMAIN, I_MPI_PIN_OFFSET, JOBLET_ID
#
# Output: stdout/stderr of joblet <k> go to
#   joblets.o<UNIQ_ID>-<k> and joblets.e<UNIQ_ID>-<k> in the current directory.
#
# Exit status: 255 on any usage or validation error (the status bash reports
# for the historical `exit -1`); otherwise the status of the final mpdallexit.

VERSION="1.04"
NAME="joblets"

# Status used for every error exit; identical to what `exit -1` yields in bash.
EXIT_FAILURE=255

# Print the help text to stderr.
function printUsage {
  printf '%s\n' \
    "$NAME $VERSION" \
    "" \
    "Usage: $NAME [options]" \
    "" \
    "Program for aggregating multiple mpiexec instances (joblets) " \
    "into a single job. Supports IntelMPI 4.0.1, Torque 2.3.0 and " \
    "Nehalem/Westmere CPU." \
    "" \
    "Mandatory environment:" \
    "NUM_JOBLETS: number of joblets" \
    "NP_PER_JOBLET: number of processes per joblet" \
    "" \
    "The program will define a unique JOBLET_ID for each joblet." \
    "Processes will be uniformly scattered between CPU sockets." >&2
}

# Print "<script> error: <message>" to stderr and abort.
# Arguments: $* - message text
die() {
  echo "$0 error: $*" >&2
  exit "$EXIT_FAILURE"
}

# Succeed when $1 is a decimal integer >= 1 without leading zeros
# (leading zeros would be read as octal by shell arithmetic).
is_positive_int() {
  [[ "$1" =~ ^[1-9][0-9]*$ ]]
}

# Print the trailing integer of the first line of $CPUINFO_OUTPUT that starts
# with the given label; print nothing when no such line can be parsed.
# Arguments: $1 - line prefix (used as a basic regular expression)
cpuinfo_field() {
  sed -n "s/^$1.* \([0-9][0-9]*\)[[:space:]]*\$/\1/p" <<<"$CPUINFO_OUTPUT" \
    | head -n 1
}

# ---------------------------------------------------------------------------
# Command line
# ---------------------------------------------------------------------------

if [[ "$#" -eq 0 ]]; then
  printUsage
  exit "$EXIT_FAILURE"
fi

if [[ "$#" -eq 1 ]] && [[ "$1" == "--help" || "$1" == "-h" ]]; then
  printUsage
  exit "$EXIT_FAILURE"
fi

# ---------------------------------------------------------------------------
# Environment validation
# ---------------------------------------------------------------------------

[[ -n "$PBS_NODEFILE" ]] || die "PBS_NODEFILE undefined"

[[ -n "$NUM_JOBLETS" ]] || die "NUM_JOBLETS undefined"
is_positive_int "$NUM_JOBLETS" || die "NUM_JOBLETS must be a positive integer"
[[ "$NUM_JOBLETS" -gt 1 ]] || die "at least two joblets required"

[[ -n "$NP_PER_JOBLET" ]] || die "NP_PER_JOBLET undefined"
# A zero or non-numeric value would cause a division by zero further down.
is_positive_int "$NP_PER_JOBLET" \
  || die "NP_PER_JOBLET must be a positive integer"

[[ -n "$PBS_JOBID" ]] || die "PBS_JOBID undefined"

# ---------------------------------------------------------------------------
# Single-node check: the node file has one line per slot, so all NP lines must
# belong to the first (and only) host reported by `uniq -c`.
# ---------------------------------------------------------------------------

NP=$(wc -l < "$PBS_NODEFILE")
PPN=$(uniq -c "$PBS_NODEFILE" | awk 'NR == 1 { print $1 }')
if [[ -z "$NP" ]] || [[ -z "$PPN" ]] || [[ "$NP" -ne "$PPN" ]]; then
  die "aggregation of joblets supported only on a single node"
fi

# ---------------------------------------------------------------------------
# Hardware topology, taken from a single run of `cpuinfo`
# ---------------------------------------------------------------------------

CPUINFO_OUTPUT=$(cpuinfo)

THREADSPERCORE=$(cpuinfo_field 'Threads per core')
[[ -n "$THREADSPERCORE" ]] \
  || die "could not retrieve the number of hyperthreads per core"
[[ "$THREADSPERCORE" -eq 1 ]] || die "only 1 hyperthread per core supported"

ALLCORES=$(cpuinfo_field 'Processors(CPUs)')
[[ -n "$ALLCORES" ]] || die "could not retrieve the total number of cores"
if [[ "$ALLCORES" -ne 12 ]] && [[ "$ALLCORES" -ne 8 ]]; then
  die "only 12-core or 8-core nodes supported"
fi

CORES_PER_SOCK=$(cpuinfo_field 'Cores per package')
[[ -n "$CORES_PER_SOCK" ]] \
  || die "could not retrieve the number of cores per package"
# Guard the divisor first so a reported 0 cannot abort the arithmetic.
if [[ "$CORES_PER_SOCK" -le 0 ]] \
  || [[ "$((ALLCORES / CORES_PER_SOCK))" -ne 2 ]]; then
  die "only servers with two sockets are supported"
fi

# ---------------------------------------------------------------------------
# Identifier used in the output file names: job id up to the first '.',
# optionally followed by the array index.
# ---------------------------------------------------------------------------

JOBID_PREFIX=${PBS_JOBID%%.*}
if [[ -z "$PBS_ARRAYID" ]]; then
  export UNIQ_ID="$JOBID_PREFIX"
else
  export UNIQ_ID="$JOBID_PREFIX-$PBS_ARRAYID"
fi

# ---------------------------------------------------------------------------
# Pinning layout
# ---------------------------------------------------------------------------

ALLPROCESSES="$((NP_PER_JOBLET * NUM_JOBLETS))"
# More processes than cores would make the domain size 0 and the division
# below fail, so refuse to oversubscribe the node.
if [[ "$ALLPROCESSES" -gt "$ALLCORES" ]]; then
  die "$ALLPROCESSES processes requested but only $ALLCORES cores available"
fi

# Cores per process (integer division).
export I_MPI_PIN_DOMAIN="$((ALLCORES / ALLPROCESSES))"

# The second socket receives floor(NUM_JOBLETS / 2) joblets, the first one
# receives the remainder.
NUM_JOBLETS_ON_2ND_SOCK="$((NUM_JOBLETS / 2))"
NUM_JOBLETS_ON_1ST_SOCK="$((NUM_JOBLETS - NUM_JOBLETS_ON_2ND_SOCK))"

PROCESSES_ON_1ST_SOCK="$((NUM_JOBLETS_ON_1ST_SOCK * NP_PER_JOBLET))"
USED_CORES_ON_1ST_SOCK="$((PROCESSES_ON_1ST_SOCK * I_MPI_PIN_DOMAIN))"
UNUSED_CORES_ON_1ST_SOCK="$((CORES_PER_SOCK - USED_CORES_ON_1ST_SOCK))"
if [[ "$UNUSED_CORES_ON_1ST_SOCK" -lt 0 ]]; then
  UNUSED_CORES_ON_1ST_SOCK=0
fi
# Number of whole pin domains left idle on the first socket; joblets placed
# on the second socket skip over them.
UNUSED_DOMAINS_ON_1ST_SOCK="$((UNUSED_CORES_ON_1ST_SOCK / I_MPI_PIN_DOMAIN))"

# ---------------------------------------------------------------------------
# Launch
# ---------------------------------------------------------------------------

mpdboot --totalnum="1"

export JOBLET_ID
for ((JOBLET_ID = 1; JOBLET_ID <= NUM_JOBLETS; JOBLET_ID++)); do
  JOBLET_ID_MINUS1="$((JOBLET_ID - 1))"
  export I_MPI_PIN_OFFSET="$((JOBLET_ID_MINUS1 * NP_PER_JOBLET))"
  if [[ "$JOBLET_ID" -gt "$NUM_JOBLETS_ON_1ST_SOCK" ]]; then
    I_MPI_PIN_OFFSET=$((I_MPI_PIN_OFFSET + UNUSED_DOMAINS_ON_1ST_SOCK))
  fi

  # NOTE(review): eval re-parses the user's arguments on every iteration, so
  # a single-quoted '$JOBLET_ID' on the command line expands per joblet.
  # Presumably intentional; kept so existing job scripts keep working. The
  # arguments must therefore come from a trusted job script.
  eval mpiexec -tune -n "$NP_PER_JOBLET" "$@" \
    > "joblets.o$UNIQ_ID-$JOBLET_ID" 2> "joblets.e$UNIQ_ID-$JOBLET_ID" &
done

# Barrier: let every joblet finish before shutting the MPD ring down.
wait
mpdallexit