Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

null spec updates (for paper) #270

Merged
merged 34 commits into from
Aug 5, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
34 commits
Select commit Hold shift + click to select a range
e903be1
made samples generation actually null; now uses touch, once only
ben-bay May 11, 2020
d8d22b6
improving samples
ben-bay May 12, 2020
fcdaedc
updated job launcher
ben-bay May 12, 2020
d43077d
modifications
ben-bay May 14, 2020
ec13092
added echo
ben-bay May 14, 2020
a093c68
added echo
ben-bay May 14, 2020
870954b
added times
ben-bay May 15, 2020
9413285
fixed slurm command logic
ben-bay May 15, 2020
3b0535c
script improvements to accommodate multiple node logs
ben-bay May 15, 2020
2d0a04d
read output script updates
ben-bay May 15, 2020
11d2db5
read output script fixes
ben-bay May 15, 2020
f285402
updated times
ben-bay May 15, 2020
edbb444
Merge branch 'develop' of github.com:LLNL/merlin into null_spec
ben-bay May 19, 2020
c0a8bec
updated times
ben-bay May 19, 2020
a376c17
script updates
ben-bay May 19, 2020
1ac9a0b
Merge branch 'develop' of github.com:LLNL/merlin into null_spec
ben-bay May 21, 2020
bcd2b5f
script changes
ben-bay May 26, 2020
81cf1c4
launcher script improvement: added run id to queue names
ben-bay May 26, 2020
3d3e787
tweak
ben-bay May 26, 2020
496a569
added account logic for quartz and pascal
ben-bay May 26, 2020
c6903fc
path
ben-bay May 27, 2020
6cf325e
fixed bug that mixed queues at w=32 and w=64
ben-bay May 27, 2020
f6da765
bugfix
ben-bay May 27, 2020
92a4535
tweak
ben-bay May 27, 2020
b9fff4b
added new launcher specs
ben-bay May 28, 2020
3063a79
added 24 hour cap
ben-bay May 28, 2020
dffafdd
fixed sample vals
ben-bay May 28, 2020
85442c4
script update
ben-bay May 29, 2020
fb4a0e2
Merge branch 'develop' of github.com:LLNL/merlin into null_spec
ben-bay Jun 11, 2020
7029a56
launcher changes
ben-bay Jun 11, 2020
ce30d60
Merge branch 'develop' of github.com:LLNL/merlin into null_spec
ben-bay Jun 11, 2020
047eeee
updated merlin monitor call
ben-bay Jun 11, 2020
1409bb2
Merge branch 'develop' of github.com:LLNL/merlin into null_spec
ben-bay Aug 5, 2020
8a21176
updates
ben-bay Aug 5, 2020
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 12 additions & 5 deletions merlin/examples/workflows/null_spec/null_spec.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -2,11 +2,18 @@ description:
name: null_spec
description: run N_SAMPLES null steps at CONC concurrency. May be used to measure overhead in merlin.

batch:
type: slurm

env:
variables:
OUTPUT_PATH: ./
N_SAMPLES: 10
CONC: 1
RUN_ID: 0
N_WORK: 0
QUEUE: queue_s$(N_SAMPLES)_c$(N_WORK)_r$(RUN_ID)
WORKER: worker_s$(N_SAMPLES)_c$(N_WORK)_r$(RUN_ID)

study:
- name: null_step
Expand All @@ -15,22 +22,22 @@ study:
cmd: |
#echo $(SAMPLE)
exit $(MERLIN_SUCCESS)
task_queue: queue_s$(N_SAMPLES)_c$(CONC)
task_queue: $(QUEUE)

- name: verify
description: echo done
run:
cmd: echo "Done"
depends: [null_step_*]
task_queue: queue_s$(N_SAMPLES)_c$(CONC)
task_queue: $(QUEUE)

merlin:
resources:
workers:
worker_s$(N_SAMPLES)_c$(CONC):
$(WORKER):
args: -O fair --prefetch-multiplier 1 -E -l info --concurrency $(CONC) --logfile=%%p.log
samples:
generate:
cmd: for i in {1..$(N_SAMPLES)} ; do echo $i ; done >> $(MERLIN_INFO)/samples.csv
file: $(MERLIN_INFO)/samples.csv
cmd: python3 $(SPECROOT)/scripts/make_samples.py --number $(N_SAMPLES) --filepath $(MERLIN_INFO)/samples_file.npy
file: $(MERLIN_INFO)/samples_file.npy
column_labels: [SAMPLE]
10 changes: 7 additions & 3 deletions merlin/examples/workflows/null_spec/scripts/aggregate_output.sh
Original file line number Diff line number Diff line change
@@ -1,11 +1,14 @@

runs=( 4 5 6 7 )
# $1 run_id
# $2 read_path

runs=( $1 )
concurrencies=(1 2 4 8 16 32 64)
samples=(1 10 100 1000 10000)

for run in "${runs[@]}"
do
read_path="/p/lustre1/bay1/null_results/run_${run}"
read_path="$2/run_${run}"
DATA=my_data${run}.yaml
touch ${DATA}

Expand All @@ -14,7 +17,8 @@ for run in "${runs[@]}"
for s in "${samples[@]}"
do
echo "c${c}_s${s} : " >> ${DATA}
python3 read_output.py ${read_path}/c_$c/s_$s/*.log ${read_path}/c_$c/s_$s/*.err $c $s >> ${DATA}
wf_path="${read_path}/c_$c/s_$s"
python3 read_output.py ${wf_path}/ $c $s >> ${DATA}
done
done
perl -pi -e 's/ : \n/ : /g' ${DATA}
Expand Down
53 changes: 44 additions & 9 deletions merlin/examples/workflows/null_spec/scripts/launch_jobs.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import argparse
import os
import socket
import shutil
import subprocess

Expand All @@ -8,29 +9,63 @@
parser.add_argument("run_id", type=int, help="The ID of this run")
parser.add_argument("output_path", type=str, help="the output path")
parser.add_argument("spec_path", type=str, help="path to the spec to run")
parser.add_argument("script_path", type=str, help="path to the make samples script")
args = parser.parse_args()

machine = socket.gethostbyaddr(socket.gethostname())[0]
if "quartz" in machine:
machine = "quartz"
elif "pascal" in machine:
machine = "pascal"

# launch 35 merlin workflow jobs
script_path = os.path.abspath(os.path.dirname(os.path.abspath(__file__)))
concurrencies = [1, 2, 4, 8, 16, 32, 64]
nodes = [1, 1, 1, 1, 1, 1, 2]
samples = [1, 10, 100, 1000, 10000]
submit_path = os.path.abspath(os.path.dirname(os.path.abspath(__file__)))
# concurrencies = [2**4, 2**5, 2**6, 2**7]
# nodes = [1, 1, 2, 4]
# samples = [10**1, 10**2, 10**3, 10**4, 10**5, 10**6]
concurrencies = [2**7]
nodes = [4]
samples = [10**6]
output_path = os.path.join(args.output_path, f"run_{args.run_id}")
os.makedirs(output_path, exist_ok=True)
for i, concurrency in enumerate(concurrencies):
c_name = os.path.join(output_path, f"c_{concurrency}")
if not os.path.isdir(c_name):
os.mkdir(c_name)
os.chdir(c_name)
for sample in samples:
for j, sample in enumerate(samples):
s_name = os.path.join(c_name, f"s_{sample}")
if not os.path.isdir(s_name):
os.mkdir(s_name)
os.chdir(s_name)
submit = f"submit_{nodes[i]}_node.sbatch"
command = f"sbatch {submit} {sample} {int(concurrency/nodes[i])}"
shutil.copyfile(os.path.join(script_path, submit), submit)
shutil.copyfile(args.spec_path, os.path.basename(args.spec_path))
os.mkdir("scripts")
samp_per_worker = float(sample) / float(concurrency)
#if (samp_per_worker / 60) > times[j]:
# print(f"c{concurrency}_s{sample} : {round(samp_per_worker / 60, 0)}m.\ttime: {times[j]}m.\tdiff: {round((samp_per_worker / 60) - times[j], 0)}m")
if (samp_per_worker / 60) < 1.0:
real_time = 4
elif (samp_per_worker / 60) < 3.0:
real_time = 10
else:
real_time = samp_per_worker / 60
real_time *= 1.5
real_time = int(round(real_time, 0))
# print(f"c{concurrency}_s{sample} : {real_time}")
if machine == "quartz":
account = "lbpm"
partition = "pdebug"
elif machine == "pascal":
account = "wbronze"
partition = "pvis"
if real_time > 60:
partition = "pbatch"
if real_time > 1440:
real_time = 1440
submit = "submit.sbatch"
command = f"sbatch -J c{concurrency}s{sample}r{args.run_id} --time {real_time} -N {nodes[i]} -p {partition} -A {account} {submit} {sample} {int(concurrency/nodes[i])} {args.run_id} {concurrency}"
shutil.copyfile(os.path.join(submit_path, submit), submit)
shutil.copyfile(args.spec_path, "spec.yaml")
shutil.copyfile(args.script_path, os.path.join("scripts", "make_samples.py"))
lines = subprocess.check_output(command, shell=True).decode("ascii")
os.chdir(f"..")
os.chdir(f"..")
19 changes: 19 additions & 0 deletions merlin/examples/workflows/null_spec/scripts/make_samples.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
import argparse

import numpy as np


# argument parsing
# NOTE: this is a flat script; arguments are parsed and the output file is
# written as soon as the module is executed.
parser = argparse.ArgumentParser(
    description="Make some samples (uniform random floats)."
)
parser.add_argument(
    "--number",
    type=int,
    action="store",
    # Without a value args.number is None, which is not a valid array
    # dimension; fail early with a clear argparse usage error instead.
    required=True,
    help="the number of samples you want to make",
)
parser.add_argument(
    "--filepath", type=str, default="samples_file.npy", help="output file"
)
args = parser.parse_args()

# sample making: a (number, 1) array, i.e. one column with one row per sample,
# drawn from the half-open interval [0.0, 1.0)
result = np.random.random((args.number, 1))

# persist the samples in NumPy's binary .npy format
np.save(args.filepath, result)
133 changes: 82 additions & 51 deletions merlin/examples/workflows/null_spec/scripts/read_output.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,83 +2,114 @@
import datetime
import re
import subprocess
import glob
import os
import sys


# argument parsing
parser = argparse.ArgumentParser(description="Make some samples (names of people).")
parser.add_argument("logfile", type=str, help="celery log file")
parser.add_argument("errfile", type=str, help="batch err file")
parser.add_argument("path", type=str, help="path to spec output")
parser.add_argument("c", type=int, help="concurrency")
parser.add_argument("s", type=int, help="n of samples")
args = parser.parse_args()

args.logfile = glob.glob(os.path.join(args.path, "*.log"))
args.errfile = glob.glob(os.path.join(args.path, "*.err"))

def single_task_times():
pre_lines = subprocess.check_output(
f'grep " succeeded in " {args.logfile}', shell=True
).decode("ascii")

pre_list = pre_lines.strip().split("\n")

task_durations = []
for line in pre_list:
matches = re.search(r"\d+.\d+s:", line)
if matches:
match = matches.group(0)
match = float(match.strip("s:"))
task_durations.append(match)
for log in args.logfile:
try:
pre_lines = subprocess.check_output(
f'grep " succeeded in " {log}', shell=True
).decode("ascii")

pre_list = pre_lines.strip().split("\n")

for line in pre_list:
matches = re.search(r"\d+.\d+s:", line)
if matches:
match = matches.group(0)
match = float(match.strip("s:"))
task_durations.append(match)
except:
continue

print(str(task_durations))


def merlin_run_time():
pre_line = subprocess.check_output(
f'grep "real" {args.errfile}', shell=True
).decode("ascii")
pre_line = pre_line.strip()
matches = re.search(r"\d\.\d\d\d", pre_line)
match = matches[0]
result = float(match)
print(f"c{args.c}_s{args.s} merlin run : " + str(result))


def start_verify_time():
try:
total = 0
for err in args.errfile:
pre_line = subprocess.check_output(
f'grep -m1 "verify" {args.logfile}', shell=True
f'grep "real" {err}', shell=True
).decode("ascii")
pre_line = pre_line.strip()
matches = re.search(r"\d\d\d\d-\d\d-\d\d \d\d:\d\d:\d\d,\d\d\d", pre_line)
matches = re.search(r"\d\.\d\d\d", pre_line)
match = matches[0]
element = datetime.datetime.strptime(match, "%Y-%m-%d %H:%M:%S,%f")
timestamp = datetime.datetime.timestamp(element)
print(f"c{args.c}_s{args.s} start verify : " + str(timestamp))
except BaseException:
print(f"c{args.c}_s{args.s} start verify : ERROR")
result = float(match)
total += result
print(f"c{args.c}_s{args.s} merlin run : " + str(result))


def start_verify_time():
all_timestamps = []
for log in args.logfile:
try:
pre_line = subprocess.check_output(
f'grep -m1 "verify" {log}', shell=True
).decode("ascii")
pre_line = pre_line.strip()
matches = re.search(r"\d\d\d\d-\d\d-\d\d \d\d:\d\d:\d\d,\d\d\d", pre_line)
match = matches[0]
element = datetime.datetime.strptime(match, "%Y-%m-%d %H:%M:%S,%f")
timestamp = datetime.datetime.timestamp(element)
all_timestamps.append(timestamp)
except:
continue
try:
print(f"c{args.c}_s{args.s} start verify : " + str(all_timestamps[0]))
except BaseException:
print(f"c{args.c}_s{args.s} start verify : ERROR")


def start_run_workers_time():
pre_line = subprocess.check_output(
f'grep -m1 "" {args.logfile}', shell=True
).decode("ascii")
pre_line = pre_line.strip()
matches = re.search(r"\d\d\d\d-\d\d-\d\d \d\d:\d\d:\d\d,\d\d\d", pre_line)
match = matches[0]
element = datetime.datetime.strptime(match, "%Y-%m-%d %H:%M:%S,%f")
timestamp = datetime.datetime.timestamp(element)
print(f"c{args.c}_s{args.s} start run-workers : " + str(timestamp))
all_timestamps = []
for log in args.logfile:
try:
pre_line = subprocess.check_output(
f'grep -m1 "" {log}', shell=True
).decode("ascii")
pre_line = pre_line.strip()
matches = re.search(r"\d\d\d\d-\d\d-\d\d \d\d:\d\d:\d\d,\d\d\d", pre_line)
match = matches[0]
element = datetime.datetime.strptime(match, "%Y-%m-%d %H:%M:%S,%f")
timestamp = datetime.datetime.timestamp(element)
all_timestamps.append(timestamp)
except:
continue
earliest = min(all_timestamps)
print(f"c{args.c}_s{args.s} start run-workers : " + str(earliest))


def start_sample1_time():
pre_line = subprocess.check_output(
f"grep -m1 \"Executing step 'null_step'\" {args.logfile}", shell=True
).decode("ascii")
pre_line = pre_line.strip()
matches = re.search(r"\d\d\d\d-\d\d-\d\d \d\d:\d\d:\d\d,\d\d\d", pre_line)
match = matches[0]
element = datetime.datetime.strptime(match, "%Y-%m-%d %H:%M:%S,%f")
timestamp = datetime.datetime.timestamp(element)
print(f"c{args.c}_s{args.s} start samp1 : " + str(timestamp))
all_timestamps = []
for log in args.logfile:
try:
pre_line = subprocess.check_output(
f"grep -m1 \"Executing step 'null_step'\" {log}", shell=True
).decode("ascii")
pre_line = pre_line.strip()
matches = re.search(r"\d\d\d\d-\d\d-\d\d \d\d:\d\d:\d\d,\d\d\d", pre_line)
match = matches[0]
element = datetime.datetime.strptime(match, "%Y-%m-%d %H:%M:%S,%f")
timestamp = datetime.datetime.timestamp(element)
all_timestamps.append(timestamp)
except:
continue
earliest = min(all_timestamps)
print(f"c{args.c}_s{args.s} start samp1 : " + str(earliest))


def main():
Expand Down
24 changes: 24 additions & 0 deletions merlin/examples/workflows/null_spec/scripts/submit.sbatch
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
#!/bin/sh

#SBATCH -N 1
#SBATCH -J MerlinBaseline
#SBATCH -t 10
#SBATCH -p pdebug
#SBATCH -A lbpm
#SBATCH --output=merlin%j.out
#SBATCH --error=merlin%j.err
#SBATCH --exclusive
#SBATCH --mpibind=off

# Batch script for one null_spec benchmark run.
#
# The #SBATCH values above act as defaults: launch_jobs.py passes
# -J, --time, -N, -p and -A on the sbatch command line
# (NOTE(review): command-line options are expected to take precedence over
# the directives in this file -- confirm against the sbatch documentation).
#
# Positional arguments, in the order launch_jobs.py supplies them:
#   $1  number of samples                  -> N_SAMPLES
#   $2  concurrency divided by node count  -> CONC
#   $3  run id                             -> RUN_ID
#   $4  total concurrency                  -> N_WORK

# The spec is copied next to this script under this fixed name by launch_jobs.py.
YAML_FILE=spec.yaml

# print message that a new run is starting
echo "Starting new run: $SLURM_JOBID"
date

# run program
# `time` reports to stderr, i.e. the merlin%j.err file; read_output.py greps
# that file for the "real" line, so keep this invocation and its output intact.
time merlin run ${YAML_FILE} --vars N_SAMPLES=$1 CONC=$2 RUN_ID=$3 N_WORK=$4
# First call uses --echo (presumably prints the worker launch command for the
# log without starting workers -- TODO confirm); the second call is the same
# command without --echo.
merlin run-workers ${YAML_FILE} --echo --vars N_SAMPLES=$1 CONC=$2 RUN_ID=$3 N_WORK=$4
merlin run-workers ${YAML_FILE} --vars N_SAMPLES=$1 CONC=$2 RUN_ID=$3 N_WORK=$4

# NOTE(review): presumably blocks so the allocation stays alive while the
# workflow still has work queued -- confirm against the merlin monitor docs.
merlin monitor ${YAML_FILE} --vars N_SAMPLES=$1 CONC=$2 RUN_ID=$3 N_WORK=$4
23 changes: 0 additions & 23 deletions merlin/examples/workflows/null_spec/scripts/submit_1_node.sbatch

This file was deleted.

Loading