Changeset b562aa for util/src


Ignore:
Timestamp:
Aug 29, 2008, 10:41:35 AM (17 years ago)
Author:
Frederik Heber <heber@…>
Children:
181f9e
Parents:
830971
Message:

rewritten MultiRunSim to allow for a minimum of rsh calls

Previously, MultiRunSim went through the fragments one after the other and called mpirun to commit each one to a node. As the calculations with MPQC are so fast (roughly a second), this led to a lot of rsh calls. The inetd of the job-distributing node subsequently hung after a short while. This was especially a problem in the BOSSANOVA scheme.
Now we split the full set of fragments into as many packets as there are processor groups and commit each packet with a single rsh call, with all jobs in the packet concatenated with ";". Hence, we have far fewer rsh calls and no hangup of inetd. Note, however, that for small molecules this may still lead to the behaviour described above, i.e. if only one or two jobs remain per packet. This cannot be overcome by any means other than lowering the number of processor groups.

File:
1 edited

Legend:

Unmodified
Added
Removed
  • util/src/dynamicANOVA.sh.in

    r830971 rb562aa  
    1010JOINER="@bindir@/joiner"
    1111CRUNCHER="/mount/bespin/heber/build/mpqc-2.3.0/bin/mpqc"
    12 CONVERTER="/mount/bespin/heber/tmp/mpqc/espack2mpqc.sh"
     12CONVERTER="/mount/bespin/heber/tmp/mpqc/espack2mpqc.py"
    1313PREPARER="/mount/bespin/heber/tmp/mpqc/convertresults.sh"
    1414
     
    3232}
    3333
    34 function RunSim {
    35         # 1 is the config file
    36         # 2 is the number of nodes
    37         # 3 further command line option
    38         # 4 and argument
    39         # set the maximum number of nodes
    40         MaxNodes=`cat $2 | awk 'END{print NR}'`
    41         gamma=`grep ProcPEGamma $1 | awk -F"\t" {'print $2'}`
    42         psi=`grep ProcPEPsi $1 | awk -F"\t" {'print $2'}`
    43         let nodes=$gamma*$psi
    44         if [ $nodes -gt $MaxNodes ]; then
    45                 echo "Process $1 needs too many nodes! Breaking." | tee -a dynamic.log
    46                 exit 1
    47         fi
    48         ${MPIRUN} -machinefile $2 -np $nodes ${CRUNCHER} $3 $4 $1 2>/dev/stdout 1>${1/in/out}
    49         check | tee -a dynamic.log
    50 }
     34#function MultiRunSim {
     35#       # 1 is config file dir (with all files)
     36#       # 2 is the machine file
     37#
     38#       ${JOBRUNNER} --mpqc ${CRUNCHER} -nprocpernode 2 -nprocperjob 1 -nthreadperproc 2 --threadgrp=posix --messagegrp=proc --memorygrp=proc --nodefile $2 --readdir $1 --inputprefix=${1}/ --outputprefix=${1}/ --autoout --verbose --rerun 2>/dev/stdout     | tee -a dynamic.log
     39#}
    5140
    5241function MultiRunSim {
    53         # 1 is config file
    54         # 2 is the number of groups
     42        # 1 is the number of groups
     43        # 2 is the directory
     44        # 3, ... are config files
    5545       
    5646        # find the next free proc group
    57         DIR=`dirname $1`
     47        divisor=$1
     48        shift
     49        DIR=$1
     50        shift
    5851        started=0
     52        pwd=`pwd`
    5953        while [ $started -eq 0 ]; do
    6054                groupnr=1
    61                 while [ $groupnr -le $2 ]; do
     55                while [ $groupnr -le $divisor ]; do
    6256                        if [ ! -e "${DIR}/ProcRuns${groupnr}" ]; then
    63                                 MaxNodes=`cat ${DIR}/ProcGroup${groupnr} | awk 'END{print NR}'`
    64                                 gamma=`grep ProcPEGamma $1 | awk -F"\t" {'print $2'}`
    65                                 psi=`grep ProcPEPsi $1 | awk -F"\t" {'print $2'}`
    66                                 let nodes=$gamma*$psi
    67                                 if [ $nodes -gt $MaxNodes ]; then
    68                                         echo "Process $1 needs too many nodes! Breaking." | tee -a dynamic.log
    69                                         exit 1
     57                                #MaxNodes=`cat ${DIR}/ProcGroup${groupnr} | awk 'END{print NR}'`
     58                                #gamma=`grep ProcPEGamma $1 | awk -F"\t" {'print $2'}`
     59                                #psi=`grep ProcPEPsi $1 | awk -F"\t" {'print $2'}`
     60                                #let nodes=$gamma*$psi
     61                                #if [ $nodes -gt $MaxNodes ]; then
     62                                #       echo "Process $1 needs too many nodes! Breaking." | tee -a dynamic.log
     63                                #       exit 1
     64                                #fi
     65                                nodes=1
     66                                echo "touch ${DIR}/ProcRuns${groupnr}" >"${DIR}/ProcBatch${groupnr}"
     67                                if [ ! -z $1 ]; then
     68                                        echo -n "rsh `cat <${DIR}/ProcGroup${groupnr}` 'cd ${pwd}/${DIR}" >>"${DIR}/ProcBatch${groupnr}"
    7069                                fi
    71                                 echo "touch ${DIR}/ProcRuns${groupnr}" >"${DIR}/ProcBatch${groupnr}"
    72                                 echo "#${MPIRUN} -machinefile ${DIR}/ProcGroup${groupnr} -np $nodes" >>"${DIR}/ProcBatch${groupnr}"
    73                                 echo "${CRUNCHER} ${1/conf/in} 2>/dev/stdout 1>${1/conf/out}" >>"${DIR}/ProcBatch${groupnr}"
     70                                while [ ! -z $1 ]; do   # add all config files as single lines
     71                                        #echo -n "${MPIRUN} -machinefile ${DIR}/ProcGroup${groupnr} -np $nodes " >>"${DIR}/ProcBatch${groupnr}"
     72                                        echo -n "; ${CRUNCHER} -o ${1/conf/out} ${1/conf/in}" >>"${DIR}/ProcBatch${groupnr}"
     73                                        shift
     74                                done
     75                                echo "'" >>"${DIR}/ProcBatch${groupnr}"
    7476                                echo "rm -f ${DIR}/ProcRuns${groupnr}" >>"${DIR}/ProcBatch${groupnr}"
    7577                                /bin/sh "${DIR}/ProcBatch${groupnr}" &
    7678                                started=1
    77                                 let groupnr=${2}+1
     79                                let groupnr=${divisor}+1
    7880                        else
    7981                                let groupnr=$groupnr+1
     
    8183                done
    8284                # wait a few seconds
    83                 if [ $2 -gt 1 ]; then
    84                         sleep 2
    85                 fi
     85                #if [ $2 -gt 1 ]; then
     86                #       sleep 2
     87                #fi
    8688        done
    8789}
    8890
    8991# get command line options
    90 if [ -z $4 ]; then
     92if [ -z $3 ]; then
    9193        echo "Usage: $0 <config file> <Order> <max. bond distance> <MaxNodes> [MaxMDsteps]"
    9294        echo -e "\t<config file> the pcp config file of the total molecule"
    9395        echo -e "\t<Order> the highest bond order (i.e. the cutoff number in ANOVA series expansion)"
    9496        echo -e "\t<max. bond distance> maximum distance to look for bonds (bonds are associated by element covalent radii criterion)"
    95         echo -e "\t<MaxNodes> number of nodes to use"
    9697        echo -e "\t[MaxMDSteps] overrides given MaxOuterStep in config file"
    9798        exit 1;
     
    101102        order=$2
    102103        distance=$3
    103         MaxNodes=$4
    104         if [ -z $5 ]; then
     104        if [ -z $4 ]; then
    105105                MaxSteps=`grep MaxOuterStep $arg | awk -F"\t" {'print $2'}`
    106106        else
    107                 MaxSteps=$5
     107                MaxSteps=$4
    108108        fi
    109         echo "Going to run for a total of $MaxSteps steps, bond order $order and maximum distance $distance of config file $arg with a total of $MaxNodes nodes." | tee -a dynamic.log
     109        echo "Going to run for a total of $MaxSteps steps, bond order $order and maximum distance $distance of config file $arg." | tee -a dynamic.log
    110110fi
    111111
     
    131131
    132132# put nodes into groups
     133MaxNodes=0
     134for node in `cat <$PBS_NODEFILE`; do
     135        let MaxNodes=$MaxNodes+1
     136done
    133137gamma=`grep ProcPEGamma $arg | awk -F"\t" {'print $2'}`
    134138psi=`grep ProcPEPsi $arg | awk -F"\t" {'print $2'}`
     
    187191        echo "There are $frag fragments." | tee -a dynamic.log
    188192
     193
    189194# evaluate each fragment
     195#  j=0
     196#  while [ $j -lt $frag ]; do
     197#               number=`printf "%0${digits}d" $j`
     198#               # convert all configs
     199#               echo -n "Converting ${DIR}/BondFragment${number}.conf ..." | tee -a dynamic.log
     200#               sh $CONVERTER ${DIR}/BondFragment${number}.conf
     201#               check | tee -a dynamic.log
     202#    let j=$j+1
     203#       done
     204#
     205#       MultiRunSim ${DIR} $PBS_NODEFILE
     206#
     207#  j=0
     208#       while [ $j -lt $frag ]; do
     209#               number=`printf "%0${digits}d" $j`
     210#               # rename output files
     211#               echo -n "Renaming `ls ${DIR}/BondFragment${number}.out.001.02.02` ..." | tee -a dynamic.log
     212#               mv ${DIR}/BondFragment${number}.out.001.02.02 ${DIR}/BondFragment${number}.out
     213#               check | tee -a dynamic.log
     214#    let j=$j+1
     215#       done
     216
     217        # reset command arrays
     218  grp=0;
     219  while [ $grp -lt $divisor ]; do
     220                command[$grp]=""
     221    let grp=$grp+1
     222  done
     223
     224        # distribute the jobs among the groups
    190225  j=0;
    191226  while [ $j -lt $frag ]; do
    192227                number=`printf "%0${digits}d" $j`
    193228                # convert all configs
    194                 echo -n "Converting ${DIR}/BondFragment${number}.conf ..."
    195                 sh $CONVERTER ${DIR}/BondFragment${number}.conf
    196                 check | tee -a dynamic.log
    197                 # and evaluate
    198                 echo -n  "Starting calculation of Fragment $number at step $i ... " | tee -a dynamic.log
    199                 MultiRunSim ${DIR}/BondFragment${number}.conf $divisor
    200                 echo "done." | tee -a dynamic.log
     229                #echo -n "Converting ${DIR}/BondFragment${number}.conf ..." | tee -a dynamic.log
     230                #sh $CONVERTER ${DIR}/BondFragment${number}.conf
     231                #check | tee -a dynamic.log
     232                # and distribute
     233                let grp=${j}%${divisor}
     234                #echo "BondFragment${number}.conf is evaluated by group $grp."
     235                command[$grp]="${command[$grp]}BondFragment${number}.conf "
    201236    let j=$j+1
    202237  done
    203238
     239        # go through all groups and run the job
     240  grp=0;
     241  while [ $grp -lt $divisor ]; do
     242                number=`printf "%0${digits}d" $j`
     243                echo -n  "Starting calculation of group $grp with fragments \"${command[$grp]}\" at step $i ... " | tee -a dynamic.log
     244                MultiRunSim $divisor ${DIR} ${command[$grp]}
     245                echo "done." | tee -a dynamic.log
     246    let grp=$grp+1
     247        done
     248
    204249# wait till all ProcRuns files are gone
    205         if [ $divisor -gt 1 ]; then
     250#       if [ $divisor -gt 1 ]; then
    206251                echo "Waiting for all running jobs at step $i to end ... " | tee -a dynamic.log
    207                 while [ ! -z "${DIR}/ProcRuns*" ]; do
    208                         sleep 3
     252                while [ ! -z "`find ${DIR} -name 'ProcRuns*'`" ]; do
     253                        #if [ ! -z "`find ${DIR} -name 'ProcRuns*'`" ]; then
     254                        #       echo "still `ls ${DIR}/ProcRuns*` present"
     255                        #fi
     256                        sleep 1
    209257                done
    210258                echo "done." | tee -a dynamic.log
    211         fi
     259#       fi
     260
    212261
    213262# convert results
    214263        sleep 1         # necessary for result files to close
    215         echo -n "Converting all results ... "
     264        echo -n "Converting all results ... " | tee -a dynamic.log
    216265        sh $PREPARER $DIR
    217266        check | tee -a dynamic.log
    218267
    219268# join the resulting forces into a single file
    220 #       cp ${DIR}/pcp.full.energy.all ${DIR}/pcp.energy.all
    221 #       cp ${DIR}/pcp.full.forces.all ${DIR}/pcp.forces.all
    222269        echo -n "Joining fragment energies ... " | tee -a dynamic.log
    223270  ${JOINER} ${DIR}/ $mainname >/dev/null 2>/dev/null
     
    239286done
    240287
    241 # draw densities of each step
    242 #sed -e "s#DoOutVis.*\##DoOutVis\t2\t\##" ${arg}.MD.MD >${arg}.MD
    243 #echo -n "Calling simulation to draw final densities of all steps ... " | tee -a dynamic.log
    244 #RunSim ${arg}.MD $PBS_NODEFILE
    245 #echo "done." | tee -a dynamic.log
    246 
    247288exit 0
Note: See TracChangeset for help on using the changeset viewer.