#!/bin/csh
#
# gridpoll - poll the gLite WMS for every submitted gridjob and clean up.
#
# For each gridjob*.jobid file found next to this script:
#   - query the job status with glite-wms-job-status
#   - Done    : fetch the output, validate it, deliver the result tarball to
#               the path stored in the .dir file and remove the job files
#   - Done but result missing, empty or with a large error file,
#     Aborted, or Cleared : remove the .jobid file so that the job is
#               ready for resubmission
# Afterwards every gridjob*.process file under /home/haddock/grid/ older
# than 600 minutes is checked, and jobs that are neither Done nor Running
# are cancelled and made ready for resubmission.
#
# Problems are appended to /home/haddock/grid/gridpoll.err.
#
# NOTE(review): comments in this file avoid quote characters on purpose,
# csh can trip over unmatched quotes in comments inside skipped blocks.
#
set running=`ps -ef | grep gridpoll |grep -v grep |grep -v rosetta | wc -l`
#
# Only do work when at most 3 matching process lines exist.
# NOTE(review): presumably a guard against overlapping runs - confirm.
#
if ($running <= 3) then
  # Directory part of the script path with a trailing slash, or ./ when
  # the script was invoked without any slash in its name.
  set rundir = `echo $0 | awk -v d=$0:h"/" '{i = index($0,"/") } {if (i == 0) $0 = "./"; else $0 = d } {print $0}'`
  set ncount = `ls -1 $rundir |grep jobid |grep -v csr | wc -l`
  set errlog = /home/haddock/grid/gridpoll.err
  if ($ncount > 0) then
    #
    # CHECK THE STATUS OF THE RUNNING JOBS
    #
    foreach i (`ls -1 -tr {$rundir}gridjob*.jobid`)
      # NOTE(review): jobroot keeps the rundir prefix and is also used as
      # a sub-path below, so this appears to assume rundir is ./ - confirm.
      set jobroot = $i:r
      # Keep only the first two lines of the jobid file.
      set nline=`wc -l $i |awk '{print $1}'`
      if ($nline > 2) then
        head -2 $i >tmpp
        \mv tmpp $i
      endif
      glite-wms-job-status -i $i >$jobroot.status
      # Each flag is initialised inside its own awk program so that it is
      # always printed as 0 or 1. An empty value would make the csh if
      # expressions below fail with an expression syntax error.
      set ok = `awk 'BEGIN{ok=0} $1 == "Current" && $2 == "Status:" && $3 == "Done" {ok = 1} END{print ok}' $jobroot.status`
      set aborted = `awk 'BEGIN{aborted=0} $1 == "Current" && $2 == "Status:" && $3 == "Aborted" {aborted = 1} END{print aborted}' $jobroot.status`
      set cleared = `awk 'BEGIN{cleared=0} $1 == "Current" && $2 == "Status:" && $3 == "Cleared" {cleared = 1} END{print cleared}' $jobroot.status`
      #
      # JOB FINISHED SUCCESSFULLY
      #
      if ($ok == 1) then
        glite-wms-job-output --nosubdir -i $i --dir `pwd`/$jobroot >&/dev/null
        #
        # CHECK IF RESULT FILE IS PRESENT
        #
        if (-e $jobroot/${jobroot}-result.tar.gz) then
          # Lines in the error file, ignoring grid-env and AAvocachedir noise.
          set errsize = `grep -v grid-env $jobroot/${jobroot}.err |grep -v AAvocachedir | wc -l |awk '{print $1}'`
          set tarsize = `du -ks $jobroot/${jobroot}-result.tar.gz | awk '{print $1}'`
          #
          # CHECK IF THE RESULT FILE IS NOT EMPTY AND ERROR FILE IS NOT TOO LARGE
          #
          if ($errsize < 500 && $tarsize > 0) then
            echo '==========================='
            echo `date` "GRID job finished successfully"
            echo 'JOBID ' $jobroot
            grep 'Destination' $jobroot.status
            grep 'https' $jobroot.jobid
            echo '==========================='
            \rm $i
            \rm $jobroot.jdl
            \rm $jobroot.sh
            # The .dir file holds the destination path for the result tarball.
            set outname=`cat $jobroot.dir`
            \mv $jobroot/${jobroot}-result.tar.gz $outname
            chmod g+rw $outname
            \rm $jobroot.dir
            \rm -rf $jobroot
            \rm $jobroot.tar.gz
            \rm $jobroot.process $jobroot.status >&/dev/null
          #
          # PROBLEM DETECTED - MAKE JOB READY FOR RESUBMISSION
          #
          else
            echo '==========================='
            echo `date` "Error detected... resubmitting"
            if ($tarsize == 0) then
              echo "Empty result file"
            endif
            grep HOSTNAME $jobroot/${jobroot}.out
            echo 'JOBID ' $jobroot
            grep 'Destination' $jobroot.status
            grep 'https' $jobroot.jobid
            echo '==========================='
            echo '===========================' >> $errlog
            echo `date` "Error detected... resubmitting" >> $errlog
            grep HOSTNAME $jobroot/${jobroot}.out >> $errlog
            echo 'JOBID ' $jobroot >> $errlog
            grep 'Destination' $jobroot.status >> $errlog
            grep 'https' $jobroot.jobid >> $errlog
            cat $jobroot/${jobroot}.err >> $errlog
            echo '===========================' >> $errlog
            \rm -rf $jobroot
            \rm $jobroot.jobid
            \rm $jobroot.process $jobroot.status >&/dev/null
          endif
        #
        # PROBLEM DETECTED - MISSING RESULT FILE - MAKE JOB READY FOR RESUBMISSION
        #
        else
          echo '==========================='
          echo `date` "Missing result file... resubmitting"
          echo 'JOBID ' $jobroot
          grep 'Destination' $jobroot.status
          grep 'https' $jobroot.jobid
          echo '==========================='
          echo '===========================' >> $errlog
          echo `date` "Missing result file... resubmitting" >> $errlog
          echo 'JOBID ' $jobroot >> $errlog
          grep 'Destination' $jobroot.status >> $errlog
          grep 'https' $jobroot.jobid >> $errlog
          echo '===========================' >> $errlog
          \rm -rf $jobroot
          \rm $jobroot.jobid
          \rm $jobroot.process $jobroot.status >&/dev/null
        endif
      endif
      #
      # JOB WAS ABORTED - MAKE JOB READY FOR RESUBMISSION
      #
      if ($aborted == 1) then
        echo '==========================='
        echo `date` "Aborted job... resubmitting"
        echo 'JOBID ' $jobroot
        grep 'Destination' $jobroot.status
        grep 'https' $jobroot.jobid
        echo '==========================='
        \rm $jobroot.jobid
        \rm $jobroot.process $jobroot.status >&/dev/null
      endif
      #
      # JOB WAS CLEARED - MAKE JOB READY FOR RESUBMISSION
      #
      if ($cleared == 1) then
        echo '==========================='
        echo `date` "Cleared job... resubmitting"
        echo 'JOBID ' $jobroot
        grep 'Destination' $jobroot.status
        grep 'https' $jobroot.jobid
        echo '==========================='
        \rm $jobroot.jobid
        \rm $jobroot.process $jobroot.status >&/dev/null
      endif
    end
    #
    # CHECK NOW FOR JOBS THAT HAVE BEEN SUBMITTED A LONG TIME AGO - FOR HADDOCK > 10 HOURS
    #
    foreach j (`/usr/bin/find /home/haddock/grid/ -mmin +600 -name gridjob\*.process -prune`)
      # Root name of the stale job, derived from the .process file itself
      # rather than from whatever job the previous loop handled last.
      set staleroot = $j:r
      # NOTE(review): status is queried with the .process file while the
      # cancel below uses the .jobid file - confirm both hold the job id.
      glite-wms-job-status -i $j >$staleroot.status
      set ok = `awk 'BEGIN{ok=0} $1 == "Current" && $2 == "Status:" && ($3 == "Done" || $3 == "Running") {ok = 1} END{print ok}' $staleroot.status`
      #
      # IF NOT FINISHED (OK) THEN CANCEL AND MAKE READY FOR RESUBMISSION
      #
      if ($ok != 1) then
        # Answer the confirmation prompt of the cancel command with y.
        # The here-document body and terminator must stay in column 0.
        glite-wms-job-cancel -i $staleroot.jobid <<_Eod_
y
_Eod_
        echo '==========================='
        echo 'JOBID ' $staleroot
        echo `date` "Waited for more than 10 hours... resubmitting"
        grep 'Destination' $staleroot.status
        grep 'https' $staleroot.jobid
        echo '==========================='
        \rm -rf $staleroot.jobid
        \rm -rf $j $staleroot.status >&/dev/null
      endif
    end
  endif
endif
# Label kept in case anything jumps here with goto.
exit: