-
Notifications
You must be signed in to change notification settings - Fork 107
/
submit.sh
205 lines (179 loc) · 7.33 KB
/
submit.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
#!/bin/bash
# Snapshot the complete shell state at job start. Cleaning the environment
# with 'env -i' is known to cause problems on some sites, so instead every
# entry reported by 'set' is written out prefixed with 'export', producing a
# file that can be loaded later whenever the startup environment is needed.
# Caveat: the dump also contains read-only variables (BASHOPTS,
# BASH_VERSINFO, EUID, PPID, SHELLOPTS, UID, etc.).
set | sed -e 's/^/export /' > startup_environment.sh
# EXIT trap handler: reports the job runtime and exit codes and, when the job
# ended too quickly, sleeps until a minimum runtime has elapsed.
# Reads the globals START_TIME, jobrc and WMA_MIN_JOB_RUNTIMESECS.
# The minimum runtime is chosen as follows:
#   1) bootstrap exit code != 0: forced to 300 seconds
#   2) bootstrap exit code == 0 and job exit code == 0: 0 seconds (no sleep)
#   3) otherwise: WMA_MIN_JOB_RUNTIMESECS is left unchanged
finish() {
    # Must stay the very first statement, before any command overwrites $?
    exitCode=$?
    echo "======== WMAgent final job runtime checks STARTING at $(TZ=GMT date) ========"
    END_TIME=$(date +%s)
    DIFF_TIME=$((END_TIME - START_TIME))
    echo "$(TZ=GMT date): Job Runtime in seconds:  $DIFF_TIME"
    echo "$(TZ=GMT date): Job bootstrap script exited:  $exitCode"
    # jobrc is deliberately passed as a separate, unquoted word: it may be
    # unset when the bootstrap exits before the job has run
    echo "$(TZ=GMT date): Job execution exited: " $jobrc
    case $exitCode in
        0)
            if [ $jobrc -eq 0 ]; then
                WMA_MIN_JOB_RUNTIMESECS=0
            fi
            ;;
        *)
            WMA_MIN_JOB_RUNTIMESECS=300
            ;;
    esac
    if [ $DIFF_TIME -lt $WMA_MIN_JOB_RUNTIMESECS ]; then
        SLEEP_TIME=$((WMA_MIN_JOB_RUNTIMESECS - DIFF_TIME))
        echo "$(TZ=GMT date): Job runtime is less than $WMA_MIN_JOB_RUNTIMESECS seconds. Sleeping  $SLEEP_TIME"
        sleep $SLEEP_TIME
    fi
    printf '======== WMAgent final job runtime checks FINISHED at %s ========\n\n' "$(TZ=GMT date)"
}
# Run the finish function on every exit path of this script
trap finish EXIT
# Banner describing who, when and where this job is running
printf '======== WMAgent bootstrap STARTING at %s ========\n' "$(TZ=GMT date)"
printf 'User id: %s\n' "$(id)"
printf 'Local time: %s\n' "$(date)"
printf 'Hostname: %s\n' "$(hostname -f)"
printf 'System: %s\n' "$(uname -a)"
printf 'Arguments: %s\n' "$*"
# Version of the "future" library, required for Python2/Python3 compatibility
PY_FUTURE_VERSION=0.18.2
# Default minimum job runtime (in seconds) and the job start timestamp; the
# end time is taken when the job finishes
WMA_MIN_JOB_RUNTIMESECS=300
START_TIME=$(date +%s)
# Positional arguments: sandbox, job index and retry number
SANDBOX=$1
INDEX=$2
RETRY_NUM=$3
JOBSTARTDIR=$PWD
export JOBSTARTDIR
# Print the site name and the sorted content of an HTCondor job ad file.
# Arguments: $1 - path to the job ad file
# Globals:   WMA_SiteName (written) - last field of the
#            MachineAttrGLIDEIN_CMSSite0 line, with double quotes removed
# Outputs:   site name, then every job ad line prefixed by a space and sorted
print_condor_job_ad() {
    local jobAdFile=$1
    local adLine
    WMA_SiteName=$(grep '^MachineAttrGLIDEIN_CMSSite0 =' "$jobAdFile" | tr -d '"' | awk '{print $NF;}')
    echo "Site name: $WMA_SiteName"
    echo "======== HTCondor jobAds start at $(TZ=GMT date) ========"
    # 'read -r' keeps backslashes found in attribute values intact; the extra
    # test keeps a final line that has no trailing newline
    while read -r adLine || [ -n "$adLine" ]; do
        printf ' %s\n' "$adLine"
    done < "$jobAdFile" | sort
    echo -e "======== HTCondor jobAds finished at $(TZ=GMT date) ========\n"
}
# Only dump the job ad when _CONDOR_JOB_AD is set to a non-empty value
if [ -n "$_CONDOR_JOB_AD" ]; then
    print_condor_job_ad "$_CONDOR_JOB_AD"
fi
# Create the expected output file in advance, so that it exists even if some
# problem happens during the job bootstrap
outputFile="Report.$RETRY_NUM.pkl"
# Quoted so that an unusual retry number cannot split or glob the file name
touch "$outputFile"
echo "======== WMAgent validate arguments starting at $(TZ=GMT date) ========"
# NOTE: the parent process only sees the low 8 bits of an exit status, so
# 11001 and 11002 are reported as 249 and 250 respectively
if [ -z "$1" ]; then
    echo "Error during job bootstrap: A sandbox must be specified" >&2
    exit 11001
fi
if [ -z "$2" ]; then
    echo "Error during job bootstrap: A job index must be specified" >&2
    exit 11002
fi
echo -e "======== WMAgent validate arguments finished at $(TZ=GMT date) ========\n"
echo "======== WMAgent CMS environment load starting at $(TZ=GMT date) ========"
# Source the first cmsset_default.sh found, probing in this order:
# VO_CMS_SW_DIR, OSG_APP, CVMFS and finally the hard-coded /cvmfs/cms.cern.ch.
# The sourced paths are quoted exactly like the paths being tested, so the
# file that is loaded is always the file that was checked.
if [ -f "$VO_CMS_SW_DIR"/cmsset_default.sh ]; then
    # LCG style --
    echo "WN with a LCG style environment, thus using VO_CMS_SW_DIR=$VO_CMS_SW_DIR"
    . "$VO_CMS_SW_DIR"/cmsset_default.sh
elif [ -f "$OSG_APP"/cmssoft/cms/cmsset_default.sh ]; then
    # OSG style --
    echo "WN with an OSG style environment, thus using OSG_APP=$OSG_APP"
    . "$OSG_APP"/cmssoft/cms/cmsset_default.sh CMSSW_3_3_2
elif [ -f "$CVMFS"/cms.cern.ch/cmsset_default.sh ]; then
    # NOTE(review): this branch expects CVMFS to be the directory that
    # contains cms.cern.ch, while the fallback below sets CVMFS to
    # /cvmfs/cms.cern.ch itself - confirm which form later users of $CVMFS
    # expect.
    echo "WN with CVMFS environment, thus using CVMFS=$CVMFS"
    . "$CVMFS"/cms.cern.ch/cmsset_default.sh
elif [ -f /cvmfs/cms.cern.ch/cmsset_default.sh ]; then
    # ok, lets call it CVMFS then
    export CVMFS=/cvmfs/cms.cern.ch
    echo "WN missing VO_CMS_SW_DIR/OSG_APP/CVMFS environment variable, forcing it to CVMFS=$CVMFS"
    . "$CVMFS"/cmsset_default.sh
else
    echo "Error during job bootstrap: VO_CMS_SW_DIR, OSG_APP, CVMFS or /cvmfs were not found." >&2
    echo " Because of this, we can't load CMSSW. Not good." >&2
    exit 11003
fi
echo "WMAgent bootstrap: WMAgent thinks it found the correct CMSSW setup script"
echo -e "======== WMAgent CMS environment load finished at $(TZ=GMT date) ========\n"
printf '======== WMAgent COMP Python bootstrap starting at %s ========\n' "$(TZ=GMT date)"
# Choose the COMP ScramArch from the machine architecture and the required OS.
# 'uname -m' output is used as is (e.g. `ppc64le` on PowerPC), except that
# x86_64 is renamed to amd64.
THIS_ARCH=$(uname -m)
case "$THIS_ARCH" in
    x86_64) THIS_ARCH="amd64" ;;
esac
# rhel7 gets the slc7 ScramArch, anything else falls back to slc6
case "$REQUIRED_OS" in
    rhel7) WMA_SCRAM_ARCH=slc7_${THIS_ARCH}_gcc630 ;;
    *) WMA_SCRAM_ARCH=slc6_${THIS_ARCH}_gcc700 ;;
esac
printf '%s\n' "Job requires OS: $REQUIRED_OS, thus setting ScramArch to: $WMA_SCRAM_ARCH"
# Path of the init script, relative to a package installation directory
suffix=etc/profile.d/init.sh
# Look for the COMP python externals under each known software area, in
# priority order: VO_CMS_SW_DIR, then OSG_APP, then CVMFS. The first existing
# directory becomes $prefix; if none exists the job cannot continue.
prefix=""
for compArea in "$VO_CMS_SW_DIR"/COMP "$OSG_APP"/cmssoft/cms/COMP "$CVMFS"/COMP; do
    if [ -d "$compArea/$WMA_SCRAM_ARCH/external/python" ]; then
        prefix="$compArea/$WMA_SCRAM_ARCH/external/python"
        break
    fi
done
if [ -z "$prefix" ]; then
    echo "Failed to find a COMP python installation in the worker node setup." >&2
    echo " Without a known python, there is nothing else we can do with this job. Quiting!" >&2
    exit 11004
fi
# The COMP externals directory is $prefix without its trailing "/python"
# component. Only the suffix is stripped, so a "/python" appearing earlier in
# the path is left alone.
compPythonPath=${prefix%/python}
echo "WMAgent bootstrap: COMP Python path is: $compPythonPath"
# 'ls -t' sorts by modification time, so the release picked is the one whose
# init script was modified most recently
latestPythonVersion=$(ls -t "$prefix"/*/"$suffix" | head -n1 | sed 's|.*/external/python/||' | cut -d '/' -f1)
# Major version is everything before the first dot of the release name
pythonMajorVersion=${latestPythonVersion%%.*}
pythonCommand="python${pythonMajorVersion}"
echo "WMAgent bootstrap: latest python release is: $latestPythonVersion"
# Load the python release and the matching "future" library into this shell
source "$prefix/$latestPythonVersion/$suffix"
source "$compPythonPath/py2-future/$PY_FUTURE_VERSION/$suffix"
# 'command -v' is a shell builtin, so the check does not depend on 'which'
# being installed on the worker node
if ! command -v "$pythonCommand" > /dev/null; then
    echo "Error during job bootstrap: python isn't available on the worker node." >&2
    echo " WMCore/WMAgent REQUIRES at least python2" >&2
    exit 11005
fi
echo "WMAgent bootstrap: found $pythonCommand at.."
command -v "$pythonCommand"
echo -e "======== WMAgent Python bootstrap finished at $(TZ=GMT date) ========\n"
echo "======== WMAgent Unpack the job starting at $(TZ=GMT date) ========"
# Should be ready to unpack and run this
$pythonCommand Unpacker.py --sandbox="$SANDBOX" --package=JobPackage.pkl --index="$INDEX"
# Everything below runs relative to the 'job' directory (presumably created by
# Unpacker.py). Stop here if it cannot be entered, instead of running the job
# and copying reports relative to the wrong directory.
if ! cd job; then
    echo "Error during job bootstrap: could not enter the unpacked 'job' directory." >&2
    exit 1
fi
export WMAGENTJOBDIR=$PWD
export PYTHONPATH=$PYTHONPATH:$WMAGENTJOBDIR/WMCore.zip:$WMAGENTJOBDIR
echo -e "======== WMAgent Unpack the job finished at $(TZ=GMT date) ========\n"
# Print every line of the 'env' output prefixed by a single space.
# The output is streamed line by line, so values containing whitespace stay on
# one line and glob characters in values are never expanded into file names.
dump_environment() {
    env | sed 's/^/ /'
}
echo "======== Current environment dump starting ========"
dump_environment
echo -e "======== Current environment dump finished ========\n"
printf '======== WMAgent Run the job starting at %s ========\n' "$(TZ=GMT date)"
# Run the job wrapper and remember its exit code; it becomes the exit code of
# this script
$pythonCommand Startup.py
jobrc=$?
printf '======== WMAgent Run the job FINISH at %s ========\n\n' "$(TZ=GMT date)"
printf '%s\n' "WMAgent bootstrap: WMAgent finished the job, it's copying the pickled report"
# Trace the copy and the directory listings into the job log
set -x
cp WMTaskSpace/Report*.pkl ../
ls -l WMTaskSpace
ls -l WMTaskSpace/*
set +x
printf '======== WMAgent bootstrap FINISH at %s ========\n\n' "$(TZ=GMT date)"
exit $jobrc