You signed in with another tab or window. Reload to refresh your session.You signed out in another tab or window. Reload to refresh your session.You switched accounts on another tab or window. Reload to refresh your session.Dismiss alert
When using Ray, there is a hard exit where we always see a stack trace printed because sys.exit is called on the worker nodes. Is there a way to exit more gracefully in these situations?
(pid=42763) experiment_impact_tracker.compute_tracker.ImpactTracker - ERROR - Encountered exception within power monitor thread!
(pid=42763) ERROR:Encountered exception within power monitor thread!
(pid=42763) INFO:Done - Logging final info.
(pid=42763) /u/nlp/anaconda/main/anaconda3/envs/<anon>/lib/python3.7/site-packages/experiment_impact_tracker/data_utils
.py:30: FutureWarning: pandas.io.json.json_normalize is deprecated, use pandas.json_normalize instead
(pid=42763) return json_normalize(json_array, max_level=max_level), json_array
(pid=42763) experiment_impact_tracker.compute_tracker.ImpactTracker - ERROR - File
"/u/nlp/anaconda/main/anaconda3/envs/<anon>/lib/python3.7/site-packages/experiment_impact_tracker/compute_tracker.py",
line 161, in launch_power_monitor
(pid=42763) _sample_and_log_power(log_dir, initial_info, logger=logger)
(pid=42763) File "/u/nlp/anaconda/main/anaconda3/envs/<anon>/lib/python3
.7/site-packages/experiment_impact_tracker/compute_tracker.py", line 112, in _sample_and_log_power
(pid=42763) log_dir=log_dir,
(pid=42763) File "/u/nlp/anaconda/main/anaconda3/envs/<anon>/lib/python3
.7/site-packages/experiment_impact_tracker/gpu/nvidia.py", line 127, in get_nvidia_gpu_power
(pid=42763) out_str = sp.communicate()
(pid=42763) File "/u/nlp/anaconda/main/anaconda3/envs/<anon>/lib/python3.7/subprocess.py", line 964, in communicate
(pid=42763) stdout, stderr = self._communicate(input, endtime, timeout)
(pid=42763) File "/u/nlp/anaconda/main/anaconda3/envs/<anon>/lib/python3.7/subprocess.py", line 1715, in _communicate
(pid=42763) ready = selector.select(timeout)
(pid=42763) File "/u/nlp/anaconda/main/anaconda3/envs/<anon>/lib/python3.7/selectors.py", line 415, in select
(pid=42763) fd_event_list = self._selector.poll(timeout)
(pid=42763) File "/u/nlp/anaconda/main/anaconda3/envs/<anon>/lib/python3.7/site-packages/ray/worker.py", line 392, in
sigterm_handler
(pid=42763) sys.exit(1)
(pid=42763)
(pid=42763) ERROR: File "/u/nlp/anaconda/main/anaconda3/envs/<anon>/lib/python3
.7/site-packages/experiment_impact_tracker/compute_tracker.py", line 161, in launch_power_monitor
(pid=42763) _sample_and_log_power(log_dir, initial_info, logger=logger)
(pid=42763) File "/u/nlp/anaconda/main/anaconda3/envs/<anon>/lib/python3
.7/site-packages/experiment_impact_tracker/compute_tracker.py", line 112, in _sample_and_log_power
(pid=42763) log_dir=log_dir,
(pid=42763) File "/u/nlp/anaconda/main/anaconda3/envs/<anon>/lib/python3
.7/site-packages/experiment_impact_tracker/gpu/nvidia.py", line 127, in get_nvidia_gpu_power
(pid=42763) out_str = sp.communicate()
(pid=42763) File "/u/nlp/anaconda/main/anaconda3/envs/<anon>/lib/python3.7/subprocess.py", line 964, in communicate
(pid=42763) stdout, stderr = self._communicate(input, endtime, timeout)
(pid=42763) File "/u/nlp/anaconda/main/anaconda3/envs/<anon>/lib/python3.7/subprocess.py", line 1715, in _communicate
(pid=42763) ready = selector.select(timeout)
(pid=42763) File "/u/nlp/anaconda/main/anaconda3/envs/<anon>/lib/python3.7/selectors.py", line 415, in select
(pid=42763) fd_event_list = self._selector.poll(timeout)
(pid=42763) File "/u/nlp/anaconda/main/anaconda3/envs/<anon>/lib/python3.7/site-packages/ray/worker.py", line 392, in
sigterm_handler
(pid=42763) sys.exit(1)
(pid=29659) experiment_impact_tracker.compute_tracker.ImpactTracker - ERROR - Encountered exception within power monitor thread!
(pid=29659) ERROR:Encountered exception within power monitor thread!
INFO:time to complete: 0:01:39.574842
(pid=29659) experiment_impact_tracker.compute_tracker.ImpactTracker - ERROR - File
"/u/nlp/anaconda/main/anaconda3/envs/anon/lib/python3.7/site-packages/experiment_impact_tracker/compute_tracker.py",
line
161, in launch_power_monitor
(pid=29659) _sample_and_log_power(log_dir, initial_info, logger=logger)
(pid=29659) File "/u/nlp/anaconda/main/anaconda3/envs/anon/lib/python3
.7/site-packages/experiment_impact_tracker/compute_tracker.py", line 93, in _sample_and_log_power
(pid=29659) required_headers = _get_compatible_data_headers(get_current_region_info_cached()[0])
(pid=29659) File "/u/nlp/anaconda/main/anaconda3/envs/anon/lib/python3
.7/site-packages/experiment_impact_tracker/compute_tracker.py", line 182, in _get_compatible_data_headers
(pid=29659) if not compatability_fn(region=region):
(pid=29659) File "/u/nlp/anaconda/main/anaconda3/envs/anon/lib/python3
.7/site-packages/experiment_impact_tracker/cpu/common.py", line 32, in is_cpu_freq_compatible
(pid=29659) test = [x._asdict() for x in psutil.cpu_freq(percpu=True)]
(pid=29659) File "/u/nlp/anaconda/main/anaconda3/envs/anon/lib/python3
.7/site-packages/ray/thirdparty_files/psutil/__init__.py", line 1859, in cpu_freq
(pid=29659) ret = _psplatform.cpu_freq()
(pid=29659) File "/u/nlp/anaconda/main/anaconda3/envs/anon/lib/python3
.7/site-packages/ray/thirdparty_files/psutil/_pslinux.py", line 742, in cpu_freq
(pid=29659) curr = cat(pjoin(path, "scaling_cur_freq"), fallback=None)
(pid=29659) File "/u/nlp/anaconda/main/anaconda3/envs/anon/lib/python3
.7/site-packages/ray/thirdparty_files/psutil/_pslinux.py", line 293, in cat
(pid=29659) return f.read().strip()
(pid=29659) File "/u/nlp/anaconda/main/anaconda3/envs/anon/lib/python3.7/site-packages/ray/worker.py", line 392, in
sigterm_handler
(pid=29659) sys.exit(1)
(pid=29659)
(pid=29659) ERROR: File "/u/nlp/anaconda/main/anaconda3/envs/anon/lib/python3
.7/site-packages/experiment_impact_tracker/compute_tracker.py", line 161, in launch_power_monitor
(pid=29659) _sample_and_log_power(log_dir, initial_info, logger=logger)
(pid=29659) File "/u/nlp/anaconda/main/anaconda3/envs/anon/lib/python3
.7/site-packages/experiment_impact_tracker/compute_tracker.py", line 93, in _sample_and_log_power
(pid=29659) required_headers = _get_compatible_data_headers(get_current_region_info_cached()[0])
(pid=29659) File "/u/nlp/anaconda/main/anaconda3/envs/anon/lib/python3
.7/site-packages/experiment_impact_tracker/compute_tracker.py", line 182, in _get_compatible_data_headers
(pid=29659) if not compatability_fn(region=region):
(pid=29659) File "/u/nlp/anaconda/main/anaconda3/envs/anon/lib/python3
.7/site-packages/experiment_impact_tracker/cpu/common.py", line 32, in is_cpu_freq_compatible
(pid=29659) test = [x._asdict() for x in psutil.cpu_freq(percpu=True)]
(pid=29659) File "/u/nlp/anaconda/main/anaconda3/envs/anon/lib/python3
.7/site-packages/ray/thirdparty_files/psutil/__init__.py", line 1859, in cpu_freq
(pid=29659) ret = _psplatform.cpu_freq()
(pid=29659) File "/u/nlp/anaconda/main/anaconda3/envs/anon/lib/python3
.7/site-packages/ray/thirdparty_files/psutil/_pslinux.py", line 742, in cpu_freq
(pid=29659) curr = cat(pjoin(path, "scaling_cur_freq"), fallback=None)
(pid=29659) File "/u/nlp/anaconda/main/anaconda3/envs/anon/lib/python3
.7/site-packages/ray/thirdparty_files/psutil/_pslinux.py", line 293, in cat
(pid=29659) return f.read().strip()
(pid=29659) File "/u/nlp/anaconda/main/anaconda3/envs/anon/lib/python3.7/site-packages/ray/worker.py", line 392, in
sigterm_handler
(pid=29659) sys.exit(1)
Code:
# Wrap TestClass with ray.remote, requesting one CPU and `num_gpus` GPUs,
# then instantiate it remotely.
actor_cls = ray.remote(num_cpus=1, num_gpus=num_gpus)(TestClass)
remote_class = actor_cls.remote()

# Invoke run() on the remote instance; `output` holds whatever
# `.remote()` returns (presumably a Ray object reference to be resolved
# later -- confirm against the calling code).
output = remote_class.run.remote(
    model_path=model_path,
    dataset_path=data_path,
    train_batch_size=train_batch_size,
    run_stats=run_stats,
)
class TestClass(object):
    """Runs a training job under an ImpactTracker and reports its energy metrics."""

    # NOTE(review): the receiver is named `cls` although no @classmethod
    # decorator is visible; the name is kept unchanged here.
    def run(cls, model_path: str, dataset_path, train_batch_size, run_stats):
        """
        Computes energy metrics for one training epoch

        The model directory is copied into a ``temp_model`` directory under
        the experiment's logging path, loaded from there, and trained inside
        an ``ImpactTracker`` context. The metrics are then read back through
        ``DataInterface`` and the temporary copy is removed.

        Args:
            model_path: Directory of the saved model; copied with
                ``shutil.copytree``.
            dataset_path: Passed to ``model.train`` as ``dataset``.
            train_batch_size: Accepted for interface compatibility; not used
                by this method.
            run_stats: Nested mapping;
                ``run_stats["hyperopt_results"]["experiment_id"]`` names the
                logging sub-directory under ``ENERGY_LOGGING_DIR``.

        Returns:
            dict with the keys ``"kg_carbon"``, ``"total_power"``, ``"PUE"``
            and ``"duration_of_train_step"``, taken from the corresponding
            ``DataInterface`` attributes.

        Raises:
            FileExistsError: from ``shutil.copytree`` if the ``temp_model``
                directory already exists.
        """
        # First copy model_path to temp directory
        logging_path = os.path.join(
            ENERGY_LOGGING_DIR, run_stats["hyperopt_results"]["experiment_id"]
        )
        tempdir = os.path.join(logging_path, "temp_model")
        shutil.copytree(model_path, tempdir)

        # Everything after the copy runs under try/finally so the temporary
        # model directory is removed even when loading or training fails or
        # the worker is interrupted (e.g. by SystemExit). A leftover
        # directory would make the next copytree for this experiment fail.
        try:
            model = AnonModel.load(tempdir)

            with ImpactTracker(logging_path):
                # The three-way unpack is kept: it requires train() to
                # return exactly three values, all of which are discarded.
                # NOTE(review): `training_set_metadat` looks like a typo for
                # `training_set_metadata`; confirm against the signature of
                # AnonModel.train before changing it.
                (
                    _,
                    _,
                    _,
                ) = model.train(
                    dataset=dataset_path,
                    training_set_metadat=os.path.join(
                        tempdir, "training_set_metadata.json"
                    ),
                )

            # NOTE(review): assumes the metrics are read after the tracker
            # context has exited -- confirm against the original layout.
            data_interface = DataInterface([logging_path])
            carbon_output = {
                "kg_carbon": data_interface.kg_carbon,
                "total_power": data_interface.total_power,
                "PUE": data_interface.PUE,
                "duration_of_train_step": data_interface.exp_len_hours,
            }
        finally:
            shutil.rmtree(tempdir)
        return carbon_output
The text was updated successfully, but these errors were encountered:
When using Ray, there is a hard exit where we always see a stack trace printed because sys.exit is called on the worker nodes. Is there a way to exit more gracefully in these situations?
Code:
The text was updated successfully, but these errors were encountered: