Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add GNU and Cheyenne Support to Automated RT #444

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
19 commits
Select commit Hold shift + click to select a range
b8f7fd8
Add capability for GNU and Add support for Cheyenne
BrianCurtis-NOAA Feb 26, 2021
c9f213b
Auto: Added Updated RT Log file: tests/RegressionTests_hera.gnu.log s…
BrianCurtis-NOAA Feb 26, 2021
8da6a6e
Bug Fixes
BrianCurtis-NOAA Feb 26, 2021
3f6d3b9
Merge
BrianCurtis-NOAA Feb 26, 2021
59466fb
Auto: Added Updated RT Log file: tests/RegressionTests_hera.intel.log…
BrianCurtis-NOAA Feb 26, 2021
03b1e69
Getting read for PR
BrianCurtis-NOAA Feb 26, 2021
cc56609
Update rt_auto.sh
BrianCurtis-NOAA Mar 1, 2021
2e394bb
Fix for Cheyenne string match being incorrect.
BrianCurtis-NOAA Mar 1, 2021
25ff215
Add permission check for 600 in accesstoken.sh
BrianCurtis-NOAA Mar 1, 2021
b031d6d
Merge branch 'feature/rt-auto-gnu' of https://github.com/BrianCurtis-…
BrianCurtis-NOAA Mar 1, 2021
79da37c
Make labels more logical, check for accesstoken.sh permissions, comme…
BrianCurtis-NOAA Mar 1, 2021
06d93c9
Remove unneccesary logs
BrianCurtis-NOAA Mar 3, 2021
c396c00
Changed cheyenne hfe07 to match with chadmin* in rt_auto.sh and detec…
BrianCurtis-NOAA Mar 3, 2021
bcd4254
Auto: Add RT Log file: tests/RegressionTests_cheyenne.gnu.log skip-ci
climbfuji Mar 3, 2021
204190d
Auto: Add RT Log file: tests/RegressionTests_cheyenne.intel.log skip-ci
climbfuji Mar 3, 2021
2be827f
Bugfixes in rt_auto.sh and rt_auto.py
climbfuji Mar 3, 2021
8fc1b48
Merge pull request #3 from climbfuji/rt_auto_bugfixes_cheyenne
BrianCurtis-NOAA Mar 3, 2021
6f57b7d
Revert logs to upstream
BrianCurtis-NOAA Mar 3, 2021
2a483f3
Merge branch 'develop' of https://github.com/ufs-community/ufs-weathe…
BrianCurtis-NOAA Mar 3, 2021
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
119 changes: 68 additions & 51 deletions tests/auto/rt_auto.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,8 +42,8 @@ def parse_args_in():
parser = argparse.ArgumentParser()

# Setup Input Arguments
choices = ['hera.intel', 'orion.intel', 'gaea.intel', 'jet.intel', 'wcoss_dell_p3']
parser.add_argument('-m', '--machine', help='Machine and Compiler combination', required=True, choices=choices, type=str)
choices = ['cheyenne', 'hera', 'orion', 'gaea', 'jet', 'wcoss_dell_p3']
parser.add_argument('-m', '--machine', help='Machine name', required=True, choices=choices, type=str)
parser.add_argument('-w', '--workdir', help='Working directory', required=True, type=str)

# Get Arguments
Expand Down Expand Up @@ -73,15 +73,19 @@ def input_data(args):

def match_label_with_action(machine, actions, label):
''' Match the label that initiates a job with an action in the dict'''
# <machine>-<compiler>-<test> i.e. hera-gnu-RT
# RT = full regression test suite
logger = logging.getLogger('MATCH_LABEL_WITH_ACTIONS')
split_label = label.name.split('-')

if len(split_label) != 3: return False
if not re.match(split_label[0], 'Auto'): return False
if not re.match(split_label[2], machine['name'].split('.')[0]): return False
action_match = next((action for action in actions if re.match(action['name'], split_label[1])), False)

return action_match
if len(split_label) != 3: return False, False #Make sure it has three parts
if not re.match(split_label[0], machine['name']): return False, False #First check machine name matches
compiler = split_label[1]
if not str(compiler) in ["intel", "gnu"]: return False, False
action_match = next((action for action in actions if re.match(action['name'], split_label[2])), False)
action_match["command"] = f'export RT_COMPILER="{compiler}" && {action_match["command"]}'
if split_label[2] == "RT" and compiler == "gnu":
action_match["command"] = f'{action_match["command"]} -l rt_gnu.conf'
return compiler, action_match


def get_preqs_with_actions(repos, machine, ghinterface_obj, actions):
Expand All @@ -92,9 +96,10 @@ def get_preqs_with_actions(repos, machine, ghinterface_obj, actions):
preq_labels = [{'preq': pr, 'label': label} for pr in each_pr for label in pr.get_labels()]

for i, pr_label in enumerate(preq_labels):
match = match_label_with_action(machine, actions, pr_label['label'])
compiler, match = match_label_with_action(machine, actions, pr_label['label'])
if match:
preq_labels[i]['action'] = match
preq_labels[i]['compiler'] = compiler
else:
preq_labels[i] = False

Expand Down Expand Up @@ -130,8 +135,20 @@ def remove_pr_label(self):
self.logger.info(f'Removing Label: {self.preq_dict["label"]}')
self.preq_dict['preq'].remove_from_labels(self.preq_dict['label'])

def send_log_name_as_comment(self):
def check_label_before_job_start(self):
    ''' Check that the label which queued this job is still on the pull request

    With multiple jobs queued, the label may have been removed before this
    job gets its turn, so the labels are fetched again just before starting.

    Returns:
        The label object whose name is
        ``<machine name>-<compiler>-<action name>``, or False when the pull
        request no longer carries such a label.
    '''
    label_to_check = f'{self.machine["name"]}-{self.preq_dict["compiler"]}-{self.preq_dict["action"]["name"]}'
    labels = self.preq_dict['preq'].get_labels()
    # Compare names literally. Using each label name as a regex pattern makes
    # unrelated labels such as "c++" raise re.error, and lets a prefix label
    # such as "hera" count as a match.
    label_match = next((label for label in labels if label.name == label_to_check), False)

    return label_match


def send_log_name_as_comment(self, log_filename):
logger = logging.getLogger('JOB/SEND_LOG_NAME_AS_COMMENT')

#Remove LAST MONTHS LOGS
logger.info('Removing last months logs (if any)')
last_month = datetime.date.today().replace(day=1) - datetime.timedelta(days=1)
rm_command = [[f'rm rt_auto_*_{last_month.strftime("%Y%m")}*.log', os.getcwd()]]
Expand All @@ -141,24 +158,16 @@ def send_log_name_as_comment(self):
except Exception as e:
logger.warning(f'"{rm_command}" failed with error:{e}')

new_log_name = f'rt_auto_{self.machine["name"]}_'\
f'{datetime.datetime.now().strftime("%Y%m%d%H%M%S")}.log'
cp_command = [[f'cp rt_auto.log {new_log_name}', os.getcwd()]]
logger.info(f'Running "{cp_command}"')
# Add log information to PR.
comment_text = f'Log Name:{log_filename}\n'\
f'Log Location:{os.getcwd()}\n'\
'Logs are kept for one month'
try:
self.run_commands(cp_command)
self.preq_dict['preq'].create_issue_comment(comment_text)
except Exception as e:
logger.warning('Renaming rt_auto failed')
logger.warning('Creating comment with log location failed with:{e}')
else:
comment_text = f'Log Name:{new_log_name}\n'\
f'Log Location:{os.getcwd()}\n'\
'Logs are kept for one month'
try:
self.preq_dict['preq'].create_issue_comment(comment_text)
except Exception as e:
logger.warning('Creating comment with log location failed with:{e}')
else:
logger.info(f'{comment_text}')
logger.info(f'{comment_text}')

def run_commands(self, commands_with_cwd):
logger = logging.getLogger('JOB/RUN_COMMANDS')
Expand Down Expand Up @@ -212,9 +221,12 @@ def clone_pr_repo(self):
def run_function(self):
''' Run the command associted with the label used to initiate this job '''
logger = logging.getLogger('JOB/RUN_FUNCTION')
compiler = self.preq_dict['compiler']
logger.info(f'Compiler being used for command is {compiler}')
command = self.preq_dict["action"]["command"]
try:
logger.info(f'Running: "{self.preq_dict["action"]["command"]}" in "{self.pr_repo_loc}"')
output = subprocess.Popen(self.preq_dict['action']['command'], cwd=self.pr_repo_loc, shell=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
logger.info(f'Running: "{command}" in "{self.pr_repo_loc}"')
output = subprocess.Popen(command, cwd=self.pr_repo_loc, shell=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
out,err = output.communicate()
out = [] if not out else out.decode('utf8').split('\n')
err = [] if not err else err.decode('utf8').split('\n')
Expand All @@ -225,7 +237,13 @@ def run_function(self):
assert(e)
else:
if output.returncode != 0:
logger.critical(f'{self.preq_dict["action"]["command"]} Failed')
comment_text = f'rt.sh failed \n'\
f'machine: {self.machine["name"]} \n'\
f'compiler: {self.preq_dict["compiler"]}\n'\
f'STDOUT: {out} \n'\
f'STDERR: {err}'
self.preq_dict['preq'].create_issue_comment(comment_text)
logger.critical(f'{command} Failed')
[logger.critical(f'stdout: {item}') for item in out if not None]
[logger.critical(f'stderr: {eitem}') for eitem in err if not None]
else:
Expand All @@ -245,28 +263,27 @@ def run_function(self):
def move_rt_logs(self):
    ''' This is the callback function associated with the "RT" command

    Adds the regression test log for this machine and compiler to the cloned
    pull request repository, commits it and pushes it to ``self.branch``
    through ``self.run_commands``.

    Raises:
        FileNotFoundError: the expected log file is missing from the clone.
    '''
    logger = logging.getLogger('JOB/MOVE_RT_LOGS')
    rt_log = f'tests/RegressionTests_{self.machine["name"]}.{self.preq_dict["compiler"]}.log'
    filepath = f'{self.pr_repo_loc}/{rt_log}'
    if not os.path.exists(filepath):
        # Report the actual path: the log may belong to either compiler, so
        # a fixed "Intel" message is misleading for gnu runs.
        message = f'Could not find RT log: {filepath}'
        logger.critical(message)
        raise FileNotFoundError(message)

    move_rt_commands = [
        [f'git add {rt_log}', self.pr_repo_loc],
        [f'git pull --no-edit origin {self.branch}', self.pr_repo_loc],
        # "skip-ci" is part of the commit message produced by this job.
        [f'git commit -m "Auto: Add RT Log file: {rt_log} skip-ci"', self.pr_repo_loc],
        ['sleep 10', self.pr_repo_loc],
        [f'git push origin {self.branch}', self.pr_repo_loc]
    ]
    self.run_commands(move_rt_commands)

def main():

# handle logging
log_path = os.getcwd()
log_filename = 'rt_auto.log'
log_filename = f'rt_auto_{datetime.datetime.now().strftime("%Y%m%d%H%M%S")}.log'
# Please don't run the following on cron with level=logging.DEBUG
# as it exposes the GH API Token
# Only set it to DEBUG while debugging
Expand All @@ -288,26 +305,26 @@ def main():
# get all pull requests from the GitHub object
logger.info('Getting all pull requests, labels and actions applicable to this machine.')
preq_dict = get_preqs_with_actions(repos, machine, ghinterface_obj, actions)

# add Job objects and run them
logger.info('Adding all jobs to an object list and running them.')
jobs = [Job(pullreq, ghinterface_obj, machine) for pullreq in preq_dict]
for job in jobs:
logger.info(f'Starting Job: {job}')
try:
logger.info('Calling remove_pr_label')
job.remove_pr_label()
logger.info('Calling clone_pr_repo')
job.clone_pr_repo()
logger.info('Calling run_function')
job.run_function()
logger.info('Calling remove_pr_dir')
job.remove_pr_dir()
logger.info('Calling send_log_name_as_comment')
job.send_log_name_as_comment()
except Exception as e:
logger.critical(e)
assert(e)
if job.check_label_before_job_start():
try:
logger.info('Calling remove_pr_label')
job.remove_pr_label()
logger.info('Calling clone_pr_repo')
job.clone_pr_repo()
logger.info('Calling run_function')
job.run_function()
logger.info('Calling remove_pr_dir')
# job.remove_pr_dir()
# logger.info('Calling send_log_name_as_comment')
job.send_log_name_as_comment(log_filename)
except Exception as e:
logger.critical(e)
assert(e)

logger.info('Script Finished')

Expand Down
36 changes: 22 additions & 14 deletions tests/auto/rt_auto.sh
Original file line number Diff line number Diff line change
@@ -1,45 +1,53 @@
#!/bin/bash --login
# Driver for the automated regression tests: loads the GitHub API token,
# picks the per-machine settings from the login host name and starts
# rt_auto.py with the machine name and working directory.
set -eux

# accesstoken.sh holds the GitHub API token, so it must be readable and
# writable by its owner only (mode 600) before it is used.
if [[ -f "accesstoken.sh" ]]; then
  if [[ "$(stat -L -c '%a' accesstoken.sh)" == "600" ]]; then
    echo "Sourcing accesstoken.sh"
    # Turn command tracing off while the token is read; with xtrace on, the
    # "export ghapitoken=..." line would be written to the trace output.
    set +x
    source ./accesstoken.sh
    set -x
  else
    echo "accesstoken.sh permissions NEED to be set to 600 before starting"
    exit 1
  fi
else
  # Plain echo does not expand "\n", so print the blank line explicitly.
  echo "Please create accesstoken.sh (600) with the following content"
  echo
  echo "export ghapitoken=<GitHub API Token Here>"
  exit 1
fi

# Select machine-specific settings from the host name of the login node.
if [[ $HOSTNAME == hfe* ]]; then
  MACHINE_NAME=hera
  WORKDIR=/scratch1/NCEPDEV/nems/Brian.Curtis/test
  export PATH=/scratch1/NCEPDEV/nems/emc.nemspara/soft/miniconda3/bin:$PATH
  export PYTHONPATH=/scratch1/NCEPDEV/nems/emc.nemspara/soft/miniconda3/lib/python3.8/site-packages
elif [[ $HOSTNAME == Orion-login-* ]]; then
  MACHINE_NAME=orion
  WORKDIR=/work/noaa/nems/bcurtis/test
  export PATH=/work/noaa/nems/emc.nemspara/soft/miniconda3/bin:$PATH
  export PYTHONPATH=/work/noaa/nems/emc.nemspara/soft/miniconda3/lib/python3.8/site-packages
elif [[ $HOSTNAME == fe* ]]; then
  MACHINE_NAME=jet
  WORKDIR=/lfs4/HFIP/h-nems/Brian.Curtis/test
  export ACCNR="h-nems"
  export PATH=/lfs4/HFIP/hfv3gfs/software/miniconda3/4.8.3/envs/ufs-weather-model/bin:/lfs4/HFIP/hfv3gfs/software/miniconda3/4.8.3/bin:$PATH
  export PYTHONPATH=/lfs4/HFIP/hfv3gfs/software/miniconda3/4.8.3/envs/ufs-weather-model/lib/python3.8/site-packages:/lfs4/HFIP/hfv3gfs/software/miniconda3/4.8.3/lib/python3.8/site-packages
elif [[ $HOSTNAME == gaea* ]]; then
  MACHINE_NAME=gaea
  WORKDIR=/lustre/f2/pdata/ncep/Brian.Curtis/test
  export LOADEDMODULES=$LOADEDMODULES
  export ACCNR="nggps_emc" # This applies to Brian.Curtis, may need change later
  export PATH=/lustre/f2/pdata/esrl/gsd/contrib/miniconda3/4.8.3/envs/ufs-weather-model/bin:$PATH
  export PYTHONPATH=/lustre/f2/pdata/esrl/gsd/contrib/miniconda3/4.8.3/lib/python3.8/site-packages
elif [[ $HOSTNAME == *.cheyenne.ucar.edu ]]; then
  MACHINE_NAME=cheyenne
  WORKDIR=/glade/work/heinzell/fv3/ufs-weather-model/auto-rt
  export ACCNR="P48503002"
  export PATH=/glade/p/ral/jntp/tools/miniconda3/4.8.3/envs/ufs-weather-model/bin:/glade/p/ral/jntp/tools/miniconda3/4.8.3/bin:$PATH
  export PYTHONPATH=/glade/p/ral/jntp/tools/miniconda3/4.8.3/envs/ufs-weather-model/lib/python3.8/site-packages:/glade/p/ral/jntp/tools/miniconda3/4.8.3/lib/python3.8/site-packages
else
  echo "No Python Path for this machine. automated RT not starting"
  exit 1
fi

python rt_auto.py -m "$MACHINE_NAME" -w "$WORKDIR"

exit 0
6 changes: 6 additions & 0 deletions tests/detect_machine.sh
Original file line number Diff line number Diff line change
Expand Up @@ -88,6 +88,12 @@ case $(hostname -f) in
cheyenne4.ib0.cheyenne.ucar.edu) MACHINE_ID=cheyenne ;; ### cheyenne4
cheyenne5.ib0.cheyenne.ucar.edu) MACHINE_ID=cheyenne ;; ### cheyenne5
cheyenne6.ib0.cheyenne.ucar.edu) MACHINE_ID=cheyenne ;; ### cheyenne6
chadmin1.ib0.cheyenne.ucar.edu) MACHINE_ID=cheyenne ;; ### cheyenne1
chadmin2.ib0.cheyenne.ucar.edu) MACHINE_ID=cheyenne ;; ### cheyenne1
chadmin3.ib0.cheyenne.ucar.edu) MACHINE_ID=cheyenne ;; ### cheyenne1
chadmin4.ib0.cheyenne.ucar.edu) MACHINE_ID=cheyenne ;; ### cheyenne1
chadmin5.ib0.cheyenne.ucar.edu) MACHINE_ID=cheyenne ;; ### cheyenne1
chadmin6.ib0.cheyenne.ucar.edu) MACHINE_ID=cheyenne ;; ### cheyenne1

login1.stampede2.tacc.utexas.edu) MACHINE_ID=stampede ;; ### stampede1
login2.stampede2.tacc.utexas.edu) MACHINE_ID=stampede ;; ### stampede2
Expand Down